Command-line tool to find duplicate images in given directory(s).
The program is invoked as ./dup_images.py [dir_path]...[dir_path] and finds all duplicate images in the given directory(s).
usage:
$ ./dup_images.py /home/user /home/user2 .
searches for all duplicate images in the directories '/home/user', '/home/user2' and '.'
$ ./dup_images.py -r /home/user /home/user2
searches for all duplicate images in directory '/home/user' and '/home/user2' and all their sub-directories
install with pip:
$ pip install PIL
if the installation fails, first try installing python-devel:
$ pip install python-devel
Code snippet:
#!/usr/bin/env python """ dup_images.py Python program to find duplicate images in given directory(s) v 0.1 Issues : Occasional False-positive result observed in (deep)recursive search in absence of Exif data. """ import sys,os,argparse import types from PIL import Image from PIL.ExifTags import TAGS from md5 import md5 def imread(IMG): """Reads exif data from image IMG and returns dictionary object containing Exif Info. If Exif data is not present, MD5 Hash is returned.""" img = Image.open(IMG) if isinstance(img._getexif(), types.NoneType): return md5(img.tostring()).hexdigest() else: return img._getexif() def recursive_search(dir_name): """ Creates a list of 'jpg' files in the given directory and all sub-directories """ file_list = [] try: for root, dirs, files in os.walk(dir_name, topdown=True): for name in files: if name.endswith(".jpg") or name.endswith(".jpeg") or name.endswith(".JPG") or name.endswith(".JPEG"): file_list.append(os.path.join(root,name)) return file_list except OSError as e: print '\n','*'*30,'\n',e.filename,' ->', e.strerror exit() def simple_search(dir_name): """ Creates a list of 'jpg' files in the given directory """ file_list = [] try: for name in os.listdir(dir_name): if name.endswith(".jpg") or name.endswith(".jpeg") or name.endswith(".JPG") or name.endswith(".JPEG"): file_list.append(os.path.join(dir_name,name)) return file_list except OSError as e: print '\n','*'*30,'\n',e.filename,' ->', e.strerror exit() def compare(file_name,list_to_search): """ Input: file_name, list_to_search Process: Match Exif/md5 data of 'file_name' with all files in 'list_to_search' Output: Dislays message for each MATCH FOUND. 
""" try: dict_info_file = imread(file_name) for test_file in list_to_search: dict_info_test_file = imread(test_file) if dict_info_file == dict_info_test_file: print 'File Match : %s <---> %s' % (file_name,test_file) except KeyboardInterrupt: print '\n\n:( you killed me.\n' sys.exit() if __name__ == "__main__": """ Parses argument and does a simple/recursive search and comparision of jpg files """ parser = argparse.ArgumentParser(description='Finds duplicate images in DIRECTORY (current directory is default). Displays list of duplicates.',epilog='Dependency: PIL Library') parser.add_argument('-r',action='store_true', help='find images moving recursively through the child directories (default : search in parent directory only)') parser.add_argument('dir_list',nargs='+',default = os.getcwd(),help='list of directories to search') args = parser.parse_args() #print args # debugging args if args.r: # if recursive switch is ON for dir_name in args.dir_list: file_list = recursive_search(dir_name) # do recursive search for jpg files for file_name in file_list: file_list.remove(file_name) # remove current file from list compare(file_name,file_list) # compare file_name with remaining files in list else: # if recursive swith is OFF for dir_name in args.dir_list: file_list = simple_search(dir_name) # do simple search for jpg files for file_name in file_list: file_list.remove(file_name) # remove current file from list compare(file_name,file_list)# compare file_name with remaining files in list