.. link: http://dgplug.org/summertraining/2013/posts/amreshkumar-dup_images-assignment-20130730.html
.. description:
.. tags:
.. date: 2013/07/30 15:02:29
.. title: amreshkumar dup_images assignment 20130730
.. slug: amreshkumar-dup_images-assignment-20130730

dup_images
---------------

Command-line tool to find duplicate images in given directory(s).

Assignment:
-----------

To write a program, invoked as ./dup_images.py [dir_path]...[dir_path], that finds all duplicate images in the given directory(s).

Solution:
---------

usage::

    $ ./dup_images.py /home/user /home/user2 .
    searches for all duplicate images in directory '/home/user' and '/home/user2' and '.'

    $ ./dup_images.py -r /home/user /home/user2
    searches for all duplicate images in directory '/home/user' and '/home/user2' and all their sub-directories

Code:
#####

`Link to code at Github `_

install with pip::

    $ pip install PIL

if the installation fails, first try installing python-devel::

    $ pip install python-devel

Code snippet:

.. code:: python

    #!/usr/bin/env python
    """
    dup_images.py

    Python program to find duplicate images in given directory(s)

    v 0.1

    Issues :
        Occasional False-positive result observed in (deep)recursive search
        in absence of Exif data.
    """
    from __future__ import print_function

    import argparse
    import os
    import sys
    import types
    from hashlib import md5

    from PIL import Image
    from PIL.ExifTags import TAGS

    # File-name suffixes (lower-cased) that are treated as JPEG images.
    JPEG_EXTENSIONS = ('.jpg', '.jpeg')


    def _is_jpeg(name):
        """Return True if the file name has a JPEG extension (any letter case)."""
        return name.lower().endswith(JPEG_EXTENSIONS)


    def _report_os_error(e):
        """Print the path and reason carried by the OSError `e`."""
        print('\n%s \n%s  -> %s' % ('*' * 30, e.filename, e.strerror))


    def imread(IMG):
        """Return a comparison signature for the image file IMG.

        Returns the Exif dictionary when the image carries Exif data,
        otherwise the MD5 hex digest of the image's pixel data.
        """
        img = Image.open(IMG)
        # Not every image class exposes _getexif(); treat those as "no Exif".
        getexif = getattr(img, '_getexif', None)
        exif = getexif() if getexif is not None else None
        if exif is not None:
            return exif
        # Newer Pillow provides tobytes(); old PIL only has tostring().
        to_bytes = getattr(img, 'tobytes', None) or img.tostring
        return md5(to_bytes()).hexdigest()


    def recursive_search(dir_name):
        """Return the paths of all JPEG files under dir_name and its sub-directories.

        os.walk() ignores directory errors unless an `onerror` callback is
        given, so each unreadable or missing directory is reported through
        _report_os_error and the walk continues with the rest of the tree.
        """
        file_list = []
        for root, dirs, files in os.walk(dir_name, topdown=True,
                                         onerror=_report_os_error):
            for name in files:
                if _is_jpeg(name):
                    file_list.append(os.path.join(root, name))
        return file_list


    def simple_search(dir_name):
        """Return the paths of the JPEG files directly inside dir_name.

        If dir_name cannot be listed, the error is printed and the program
        exits with status 1.
        """
        try:
            names = os.listdir(dir_name)
        except OSError as e:
            _report_os_error(e)
            sys.exit(1)
        paths = (os.path.join(dir_name, name) for name in names if _is_jpeg(name))
        # Keep regular files only: a directory named "x.jpg" cannot be opened
        # as an image.
        return [path for path in paths if os.path.isfile(path)]


    def compare(file_name, list_to_search):
        """
        Input: file_name, list_to_search
        Process: Match Exif/md5 data of 'file_name' with all files in 'list_to_search'
        Output: Displays message for each MATCH FOUND.
        """
        try:
            dict_info_file = imread(file_name)
            for test_file in list_to_search:
                if dict_info_file == imread(test_file):
                    print('File Match : %s <---> %s' % (file_name, test_file))
        except KeyboardInterrupt:
            print('\n\n:( you killed me.\n')
            sys.exit()


    def _report_duplicates(file_list):
        """Compare every file in file_list with the files that follow it.

        The list is sliced instead of being mutated while it is iterated;
        removing items from a list inside a loop over that same list makes
        the loop skip every second element.
        """
        for index, file_name in enumerate(file_list):
            compare(file_name, file_list[index + 1:])


    def main():
        """Parse arguments and do a simple/recursive search and comparison of jpg files."""
        parser = argparse.ArgumentParser(
            description='Finds duplicate images in DIRECTORY (current directory is default). '
                        'Displays list of duplicates.',
            epilog='Dependency: PIL Library')
        parser.add_argument('-r', action='store_true',
                            help='find images moving recursively through the child directories (default : search in parent directory only)')
        # nargs='*' (not '+') lets the default apply when no directory is
        # given; the default is a list so iterating it yields one directory.
        parser.add_argument('dir_list', nargs='*', default=[os.getcwd()],
                            help='list of directories to search')
        args = parser.parse_args()

        # Pick the search strategy once; each directory is checked on its own.
        search = recursive_search if args.r else simple_search
        for dir_name in args.dir_list:
            _report_duplicates(search(dir_name))


    if __name__ == "__main__":
        main()