#!/usr/bin/env python
#
# Copyright 2010 Open Source Beef Computing. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#    Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#    Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE OPEN SOURCE BEEF COMPUTING PROJECT ``AS IS'' AND ANY EXPRESS
# OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL THE FREEBSD PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

import sys
import os

__doc__ = """
The text file ROW MANipulation utility.
Version 0.0.5

    -f name of file to process
    -o name of output file
    -u Parse a file an show only unique rows
        *line breaks are counted, so two identical lines
         but one having a line break and the other not
         will show a two unique lines
    -d Parse a file and show only duplicate rows
    -x Parse file removing duplicates.
        Duplicates will appear as one line in output
    -c if -d is specified -c will give the count of the number of time the row appears
"""


def _usage_error(message):
    """Write *message* and the help text to stderr, then exit with status 2."""
    sys.stderr.write("error: %s\n" % message)
    sys.stderr.write(__doc__)
    sys.exit(2)


def _option_value(argv, idx):
    """Return the argument following the option at ``argv[idx]``.

    Exits through ``_usage_error`` when the option is the last argument.
    """
    if idx + 1 >= len(argv):
        _usage_error("option %s requires a value" % argv[idx])
    return argv[idx + 1]


def _count_lines(file_name):
    """Read *file_name* and tally its lines.

    Returns a ``(order, counts)`` pair: ``order`` lists each distinct line
    once, in order of first appearance; ``counts`` maps each line to the
    number of times it occurs.  Lines keep their trailing line break, so a
    final line without one is distinct from an otherwise identical line.
    """
    order = []
    counts = {}
    with open(file_name) as fh:
        for line in fh:
            if line not in counts:
                order.append(line)
                counts[line] = 0
            counts[line] += 1
    return order, counts


def main():
    """Run the row manipulation utility using the options in ``sys.argv``.

    Options:
        -f FILE   input file (required)
        -o FILE   write the result to FILE instead of standard output
        -u        keep only rows that occur exactly once
        -d        keep only rows that occur more than once
        -x        keep one copy of every distinct row
        -c        with -d, prefix each row with its number of occurrences
        -h, --help   print the help text and exit with status 0

    When several of -u/-d/-x are given, the last one wins.  Unrecognised
    arguments are ignored.  Rows are emitted in order of first appearance.

    Raises:
        SystemExit: status 0 after -h/--help; status 2 when -f, a mode
            option, or the value of -f/-o is missing.
        IOError/OSError: if the input or output file cannot be opened.
    """
    file_name = ''
    out_file = None
    mode = None
    count = False

    argv = sys.argv
    idx = 1
    while idx < len(argv):
        arg = argv[idx]
        if arg == "-f":
            file_name = _option_value(argv, idx)
            idx += 1  # skip the value so it is not parsed as an option
        elif arg == "-o":
            out_file = _option_value(argv, idx)
            idx += 1
        elif arg in ("-u", "-d", "-x"):
            mode = arg
        elif arg == "-c":
            count = True
        elif arg in ("-h", "--help"):
            print(__doc__)
            sys.exit(0)
        idx += 1

    if not file_name:
        _usage_error("no input file given (-f)")
    if mode is None:
        _usage_error("one of -u, -d or -x is required")

    order, counts = _count_lines(file_name)

    if mode == "-u":
        rows = [line for line in order if counts[line] == 1]
    elif mode == "-d":
        rows = [line for line in order if counts[line] > 1]
        if count:
            # The count is the total number of occurrences of the row.
            rows = ["%s : %s" % (counts[line], line) for line in rows]
    else:
        rows = order

    if out_file is not None:
        with open(out_file, 'w') as fo:
            fo.writelines(rows)
    else:
        for row in rows:
            # Rows still carry their own line break and print() appends
            # another; this keeps the utility's existing console layout.
            print(row)
    print('Done')


if __name__ == "__main__":
    main()