amreshkumar dup_images assignment 20130730


Posted:

dup_images

Command-line tool to find duplicate images in given directory(s).

Assignment:

Write a program, invoked as ./dup_images.py [dir_path]...[dir_path], that finds all duplicate images in the given directory(s).

Solution:

usage:

$ ./dup_images.py /home/user /home/user2 .

Searches for all duplicate images in the directories '/home/user', '/home/user2' and '.'.

$ ./dup_images.py -r /home/user /home/user2

Searches for all duplicate images in the directories '/home/user' and '/home/user2' and in all of their sub-directories.

Code:

Link to code at Github

install with pip:

$ pip install PIL

if the installation fails, first try installing python-devel:

$ pip install python-devel

Code snippet:

#!/usr/bin/env python

"""
dup_images.py

Python program to find duplicate images in given directory(s)
v 0.1

Known issue: occasional false-positive results have been observed in (deep) recursive searches when Exif data is absent.

"""
import sys,os,argparse
import types

from PIL import Image
from PIL.ExifTags import TAGS
from md5 import md5

def imread(IMG):
        """Return a comparison fingerprint for the image IMG.

        IMG is passed straight to ``Image.open``.  If the opened image
        provides non-empty Exif data, the object returned by its
        ``_getexif()`` method is returned.  Otherwise the hexadecimal MD5
        digest of the image's raw pixel bytes is returned.

        Errors raised by ``Image.open`` (e.g. for an unreadable file) are not
        caught here.
        """
        import hashlib  # local import; replaces the Python-2-only ``md5`` module

        img = Image.open(IMG)

        # Not every PIL image class provides _getexif(); look it up defensively
        # and call it once instead of twice.
        getexif = getattr(img, '_getexif', None)
        exif = getexif() if getexif is not None else None
        if exif:
                return exif

        # No Exif data, or an empty Exif dictionary: hash the pixels instead.
        # An empty dictionary must not serve as a fingerprint, because it
        # would compare equal for every image that carries no tags.
        # Pillow renamed tostring() to tobytes(); prefer the new name and fall
        # back to the old one for legacy PIL installations.
        to_bytes = getattr(img, 'tobytes', None) or img.tostring
        return hashlib.md5(to_bytes()).hexdigest()


def recursive_search(dir_name):
        """Return the 'jpg' files found in dir_name and all its sub-directories.

        A file is selected when its name ends in '.jpg' or '.jpeg', in any
        letter case.  Every entry of the returned list is the directory the
        file was found in joined with the file name.

        If dir_name itself cannot be listed, the error is printed and the
        program exits with status 1 (raises SystemExit).  Sub-directories that
        cannot be listed are skipped, which is os.walk's default behaviour.
        """
        # os.walk() ignores listing errors unless an onerror callback is
        # given, so a missing or unreadable dir_name would otherwise look like
        # an empty directory.  Probe it explicitly first.
        try:
                os.listdir(dir_name)
        except OSError as e:
                sys.stdout.write('\n%s \n%s  -> %s\n' % ('*' * 30, e.filename, e.strerror))
                sys.exit(1)

        file_list = []
        for root, dirs, files in os.walk(dir_name, topdown=True):
                for name in files:
                        # Lower-casing first also accepts mixed case such as '.Jpg'.
                        if name.lower().endswith(('.jpg', '.jpeg')):
                                file_list.append(os.path.join(root, name))
        return file_list

def simple_search(dir_name):
        """Return the 'jpg' files located directly in dir_name.

        A file is selected when its name ends in '.jpg' or '.jpeg', in any
        letter case.  Every entry of the returned list is dir_name joined with
        the file name.  Sub-directories are not searched, and a directory
        whose name happens to end in '.jpg' is not reported as an image.

        If dir_name cannot be listed, the error is printed and the program
        exits with status 1 (raises SystemExit).
        """
        # Keep the try body down to the one call that can raise OSError.
        try:
                names = os.listdir(dir_name)
        except OSError as e:
                sys.stdout.write('\n%s \n%s  -> %s\n' % ('*' * 30, e.filename, e.strerror))
                sys.exit(1)

        file_list = []
        for name in names:
                # Lower-casing first also accepts mixed case such as '.Jpg'.
                if not name.lower().endswith(('.jpg', '.jpeg')):
                        continue
                path = os.path.join(dir_name, name)
                # os.listdir() returns directories too; keep regular files only.
                if os.path.isfile(path):
                        file_list.append(path)
        return file_list


def compare(file_name,list_to_search):
        """
            Input:   file_name, list_to_search
            Process: Match the Exif/md5 data (as returned by imread) of
                     'file_name' with that of every file in 'list_to_search'.
            Output:  Displays a message for each MATCH FOUND.  An image that
                     cannot be opened is reported and skipped instead of
                     aborting the whole run.  On Ctrl-C a farewell message is
                     printed and the program exits.

        """
        try:
                try:
                        dict_info_file = imread(file_name)
                except (IOError, OSError) as e:
                        # Without the reference image there is nothing to compare.
                        print('Skipping unreadable image : %s (%s)' % (file_name, e))
                        return

                for test_file in list_to_search:
                        try:
                                dict_info_test_file = imread(test_file)
                        except (IOError, OSError) as e:
                                # One bad candidate must not stop the remaining comparisons.
                                print('Skipping unreadable image : %s (%s)' % (test_file, e))
                                continue
                        if dict_info_file == dict_info_test_file:
                                print('File Match : %s <---> %s' % (file_name, test_file))
        except KeyboardInterrupt:
                print('\n\n:( you killed me.\n')
                sys.exit()


if __name__ == "__main__":
        # Parses the arguments, then does a simple/recursive search and a
        # pairwise comparison of the jpg files of every requested directory.

        parser = argparse.ArgumentParser(description='Finds duplicate images in DIRECTORY (current directory is default). Displays list of duplicates.',epilog='Dependency: PIL Library')
        parser.add_argument('-r',action='store_true', help='find images moving recursively through the child directories (default : search in parent directory only)')
        # nargs='*' (rather than '+') lets the documented default -- the
        # current directory -- actually apply when no directory is given.
        # The default must be a list because dir_list is iterated below.
        parser.add_argument('dir_list',nargs='*',default=[os.getcwd()],help='list of directories to search')
        args = parser.parse_args()

        # Both modes differ only in how the file list is collected.
        search = recursive_search if args.r else simple_search

        # NOTE(review): every directory is scanned on its own, so duplicates
        # that live in two different directories are not reported -- confirm
        # whether that is intended.
        for dir_name in args.dir_list:
                file_list = search(dir_name)
                # Compare each file with the files that follow it, so every
                # pair is checked exactly once.  Slicing replaces the former
                # file_list.remove() call, which mutated the list while it was
                # being iterated and therefore skipped every other file.
                for index, file_name in enumerate(file_list):
                        compare(file_name, file_list[index + 1:])
Contents © 2013 dgplug - Powered by Nikola
Share
UA-42392315-1