JCaselles dup_images 20130801


Posted:

Dup_images

Assignment:

Write a command-line tool that finds duplicate images, where the images are JPEG files carrying exif information.

Solution:

Use Pillow (formerly PIL) to read the exif information of every image, and compare the exif data to find duplicates across the given directories.

Dependencies:

  • pillow module

Code:

Link to GitHub

Snippet:

#!/usr/bin/env python

"""
Finds duplicate images in the specified directories

"""
from sys import argv, exit
from PIL import Image
from os import listdir, walk
from os.path import join
from argparse import ArgumentParser


def print_duplicates(im_dict):
    """
    Print every group of duplicate images found in im_dict.

    im_dict maps an exif-signature string to the list of paths that
    share it; any key with more than one path is a group of duplicates.
    Prints a notice instead when im_dict is empty or holds no groups.
    """

    if not im_dict:
        print("\n\nNo images in these directories!\n\n")
        return

    print("\n\nDuplicated images:\n------------------\n\n")

    theres_any = False

    for paths in im_dict.values():

        if len(paths) > 1:
            theres_any = True
            print("Duplicated instances:")

            # Iterate the paths directly rather than indexing range(len(...)).
            for path in paths:
                print(" - %s" % path)

            print("\n")

    if not theres_any:
        print("No duplicated images found. Hurra!!\n\n")


def get_exif(dirs):
    """
    Return a dict mapping an exif signature to the image paths sharing it.

    Scans every directory in dirs (duplicates removed via set()) for
    *.jpg / *.jpeg files, reads each file's exif block with Pillow and
    concatenates its values into one signature string.  Result format:

        {"exif_values": [path1, path2, ...]}

    setdefault() groups paths under their signature, so any key ending
    up with more than one path marks a set of duplicate images.  Files
    that cannot be opened, or carry no exif block, are reported and
    skipped rather than aborting the scan.
    """

    exifpath_dict = {}  # signature string -> list of matching paths

    for directory in set(dirs):

        for filename in listdir(directory):

            # Tuple form of endswith() covers all accepted extensions.
            if not filename.endswith((".jpg", ".jpeg", ".JPG", ".JPEG")):
                continue

            path = join(directory, filename)

            try:
                image = Image.open(path)
                try:
                    exif_info = image._getexif()
                finally:
                    image.close()  # don't leak the file handle

            except IOError as e:
                print("**ERROR** Error opening %s: %s" % (filename, e))
                continue

            if exif_info is None:
                # _getexif() returns None when the file has no exif block.
                print("**WARNING** %s doesn't have exif "
                      "info or is corrupted, skipping it" % filename)
                continue

            # Sort the tag ids so the signature is deterministic and does
            # not depend on dict iteration order.
            exif_string = "".join(str(exif_info[tag])
                                  for tag in sorted(exif_info))

            exifpath_dict.setdefault(exif_string, []).append(path)

    return exifpath_dict



if __name__ == "__main__":

    # Command-line interface: one or more directories to scan.
    parser = ArgumentParser(description="Find duplicate jpeg images by "
                                        "comparing their exif information")

    parser.add_argument("Directories",
                        help="Directories to search for duplicates",
                        nargs="+")

    args = parser.parse_args()

    print("")
    print_duplicates(get_exif(args.Directories))
Contents © 2013 dgplug - Powered by Nikola
Share
UA-42392315-1