"""
.. link: http://dgplug.org/summertraining/2013/posts/iowabeakster-image_dup-20130731.html
.. description:
.. tags:
.. date: 2013/07/31 13:25:12
.. title: iowabeakster image_dup 20130731
.. slug: iowabeakster-image_dup-20130731

image_dup
=========

basic functions of script
-------------------------

- find duplicate images in specified directories
- take multiple directories as directory paths in args
- import necessary modules
- make magic happen (search out for duplicate images)

link
----

`Github Link `_

script code
-----------

.. code:: python
"""

import hashlib
import os
import sys


def file_reader(file_name, size=1024):
    """Yield successive chunks read from an open file object.

    Despite its name, ``file_name`` is an already-open file object (anything
    with a ``read(size)`` method), not a path.

    :param file_name: open file object, normally opened in binary mode.
    :param size: maximum number of bytes per chunk. The default of 1024 is
        1 KiB per read (not 1 MB).
    :returns: a generator producing non-empty chunks; it stops as soon as
        ``read`` returns an empty result.
    """
    while True:
        file_piece = file_name.read(size)
        if not file_piece:
            return
        yield file_piece


def _file_id(full_path, hash):
    """Return the ``(digest, size)`` pair used to identify a file's content."""
    hash_return = hash()
    # The with-block closes the handle even if reading fails part-way, so a
    # scan over many files does not accumulate open descriptors.
    with open(full_path, 'rb') as handle:
        for file_piece in file_reader(handle):
            hash_return.update(file_piece)
    return (hash_return.digest(), os.path.getsize(full_path))


def check_for_duplicates(paths, hash=hashlib.sha1):
    """Walk ``paths`` and print every file whose content was already seen.

    Each file found under the given paths is hashed. The first file seen
    with a given ``(digest, size)`` pair is remembered in a dictionary; any
    later file with the same pair is printed as a duplicate of that first
    file.

    :param paths: iterable of directory paths to walk recursively.
    :param hash: zero-argument callable returning a hashlib-style object
        with ``update`` and ``digest`` methods. The name shadows the
        built-in ``hash`` but is kept so keyword callers keep working.
    :returns: None. Duplicates are reported on standard output; files that
        cannot be read are reported on standard error and skipped.
    """
    # Maps (digest, size) -> path of the first file seen with that content.
    hashes = {}
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                # Identify each file by its whole pathname.
                full_path = os.path.join(dirpath, filename)
                try:
                    file_id = _file_id(full_path, hash)
                except (IOError, OSError) as err:
                    # A broken symlink or permission problem should not
                    # abort the whole scan.
                    sys.stderr.write(
                        "Skipping unreadable file %s: %s\n" % (full_path, err)
                    )
                    continue
                duplicate = hashes.get(file_id)
                # If a matching entry exists, print the pair; otherwise
                # record this file as the first one with that content.
                if duplicate is not None:
                    # Single-argument print(...) gives the same output on
                    # Python 2 and Python 3.
                    print("Duplicate found: %s and %s" % (full_path, duplicate))
                else:
                    hashes[file_id] = full_path


if __name__ == "__main__":
    if sys.argv[1:]:
        check_for_duplicates(sys.argv[1:])
    else:
        print("Please pass the paths to check as parameters to the script")