The assignment was to find all the duplicate images in the directories given as command-line arguments.
dup_images can find duplicate files of any type, not only images.
#!/usr/bin/env python
"""Find duplicate files under the directories given on the command line.

Every regular file below each directory is hashed with MD5; files whose
digests are equal are reported together as one numbered group.  The
comparison is purely content-based, so it works for any file type, not
only images.
"""
import hashlib
import os
import sys

data = {}  # Maps each scanned file path to the MD5 hex digest of its content.
arranged_data = {}  # Maps each MD5 hex digest to the list of paths sharing it.

# Files are hashed in chunks of this many bytes so that large files never
# have to be held in memory all at once.
_CHUNK_SIZE = 64 * 1024


def _md5_of_file(path_to_file):
    """Return the MD5 hex digest of the content of 'path_to_file'.

    The file is opened in binary mode: text mode would translate line
    endings on some platforms (and decode to str on Python 3), producing
    wrong digests or errors for binary data such as images.
    """
    digest = hashlib.md5()
    with open(path_to_file, 'rb') as handle:
        for chunk in iter(lambda: handle.read(_CHUNK_SIZE), b''):
            digest.update(chunk)
    return digest.hexdigest()


def hash_it(directory):
    """Hash every regular file below 'directory' and record it in 'data'.

    The directory tree is walked recursively.  For each file found, its
    path is stored in the module-level 'data' dictionary as key with the
    MD5 hex digest of its content as value.  Entries that are not regular
    files (broken symlinks, FIFOs, sockets, ...) are skipped, and files
    that cannot be read are reported on stderr and skipped, so a single
    bad entry does not abort the whole scan.
    """
    for path, sub_directories, files in os.walk(directory):
        for filename in files:
            path_to_file = os.path.join(path, filename)
            # Reading a FIFO would block forever and a dangling symlink
            # cannot be opened, so only regular files are hashed.
            if not os.path.isfile(path_to_file):
                continue
            try:
                data[path_to_file] = _md5_of_file(path_to_file)
            except (IOError, OSError) as error:
                sys.stderr.write("Skipping unreadable file: %s (%s)\n"
                                 % (path_to_file, error))


def find_duplicates():
    """Group the files in 'data' by digest and print the duplicate groups.

    'arranged_data' is rebuilt from scratch on every call (digest as key,
    list of paths as value), so calling this function repeatedly does not
    list the same path more than once.  Each group of two or more files
    with the same digest is printed as a numbered block, one path per line.

    Returns the list of duplicate groups, each group being a list of paths.
    """
    arranged_data.clear()
    for path_to_file, digest in data.items():
        arranged_data.setdefault(digest, []).append(path_to_file)

    # Only digests shared by more than one file are duplicates.
    dup_list = [files for files in arranged_data.values() if len(files) > 1]

    for count, files in enumerate(dup_list, 1):
        print(str(count) + '.')
        for dup_file in files:
            print(dup_file)
    return dup_list


if __name__ == '__main__':
    for argument in sys.argv[1:]:
        if os.path.exists(argument):
            hash_it(argument)
        else:
            print("Wrong path: %s" % (argument,))
    find_duplicates()
    sys.exit(0)
You can find it here.
Run the above script like:
$ ./dup_images <PATH> <PATH> <PATH>
Example output is given below:
sudip@sudip-mint dup_images $ pwd /home/sudip/tmp/code/dup_images sudip@sudip-mint dup_images $ ls -R .: dup_images new1 new2 new3 ./new1: fb.jpg fdgdg.txt ./new2: ds.bmp newsub1 wfdss.jpg ./new2/newsub1: fb.jpg newsub2 ./new2/newsub1/newsub2: dgg.bmp viewer.py ./new3: cs.jpg sdf.png sudip@sudip-mint dup_images $ python dup_images /home/sudip/tmp/code/dup_images /asshddk/ahdjd/ddjf Wrong path: /asshddk/ahdjd/ddjf 1. /home/sudip/tmp/code/dup_images/new2/newsub1/newsub2/dgg.bmp /home/sudip/tmp/code/dup_images/new2/ds.bmp 2. /home/sudip/tmp/code/dup_images/new1/fdgdg.txt /home/sudip/tmp/code/dup_images/new2/newsub1/newsub2/viewer.py 3. /home/sudip/tmp/code/dup_images/new2/wfdss.jpg /home/sudip/tmp/code/dup_images/new3/cs.jpg /home/sudip/tmp/code/dup_images/new1/fb.jpg /home/sudip/tmp/code/dup_images/new2/newsub1/fb.jpg