Christina-B dup_images 20130731 Posted: 2013-07-31 17:46

Problem: Write a program in python to find duplicate images in given directories.

Requirements: Python Image Library(PIL)

Input: Directories in which images are to be searched.

Description

Collect the images present in the directories mentioned.
Compare the images using ImageChops.difference(imag1, imag2).getbbox(), where imag1 and imag2 are the two opened images.
If the function returns None then the images are identical

Click the link to view the program

link

Execution

$ python duplicatImages.py /home/user/Downloads /home/user/Documents

Comments

iowabeakster image_dup 20130731 Posted: 2013-07-31 13:25

basic functions of script

find duplicate images in specified directories
take multiple directories as directory paths in args
import necessary modules
make magic happen (search out for duplicate images)

link

Github Link

script code

import sys
import os
import hashlib

# generator that hands back an open file's contents piece by piece
def file_reader(file_name, size=1024):
    """Yield successive pieces of an already-open file until it is exhausted.

    file_name -- an open file object (despite the name, not a path)
    size      -- maximum length of each piece passed to read();
                 the default of 1024 means 1 KB pieces
    """
    file_piece = file_name.read(size)
    # an empty read result marks the end of the file
    while file_piece:
        yield file_piece
        file_piece = file_name.read(size)

# Walk the given paths, fingerprint every file, and report each file whose
# fingerprint was already recorded for an earlier file.

def check_for_duplicates(paths, hash=hashlib.sha1):
    """Print every file under *paths* whose content duplicates an earlier file.

    paths -- iterable of directory paths; each one is walked recursively
    hash  -- hashlib constructor used to digest the file contents

    A file is identified by the pair (digest, size in bytes). The first
    path seen for a given pair is remembered; every later path with the
    same pair is printed together with that first path.
    """
    # maps (digest, size) -> first path seen with that content
    hashes = {}
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                # identify each file by its whole pathname
                full_path = os.path.join(dirpath, filename)
                hash_return = hash()
                # the with-block guarantees the handle is closed, so a large
                # tree cannot exhaust the process's file descriptors
                with open(full_path, 'rb') as file_obj:
                    for file_piece in file_reader(file_obj):
                        hash_return.update(file_piece)
                file_id = (hash_return.digest(), os.path.getsize(full_path))
                duplicate = hashes.get(file_id)
                if duplicate:
                    print("Duplicate found: %s and %s" % (full_path, duplicate))
                else:
                    hashes[file_id] = full_path

# Every command-line argument is a path to scan; with none given, explain
# the usage instead of scanning.
if not sys.argv[1:]:
    print("Please pass the paths to check as parameters to the script")
else:
    check_for_duplicates(sys.argv[1:])

Comments

kaviskhandelwal dup_images 20130731 Posted: 2013-07-31 09:00

The assignment was to find all the duplicate images in the directories given as command-line arguments.

duplicate_image finds the duplicate image files in a given directory.

Code

#!/usr/bin/env python
""" duplicate_image.py is a python program to search duplicate images in a given directory """

import os , sys
import hashlib
from PIL import Image
import types  # package used to check for NoneType EXIF information

# [directory, filename] entries of the images to compare
file_list = []

def MD5(location, file):
        """Return the hex MD5 digest of the file named *file* inside *location*.

        location -- directory that contains the file
        file     -- file name relative to *location*
        """
        # Join the path instead of calling os.chdir(): changing the
        # process-wide working directory makes later relative paths
        # resolve against the wrong place.
        file_path = os.path.join(location, file)
        md5 = hashlib.md5()
        # binary mode: image bytes must not pass through text decoding or
        # newline translation
        with open(file_path, "rb") as picture:
                md5.update(picture.read())
        return md5.hexdigest()

def exif(location, file):
        """Return the EXIF information of an image, or its MD5 digest when it has none.

        location -- directory that contains the image
        file     -- image file name relative to *location*
        """
        # Open through the joined path (no os.chdir(), so relative
        # directories keep resolving) and read the EXIF data only once.
        res = Image.open(os.path.join(location, file))._getexif()
        if res is None:
                # no EXIF information: fall back to a digest of the file
                return MD5(location, file)
        return res

def search(path):
        """Return a [directory, filename] entry for every JPEG file under *path*.

        The tree below *path* is walked recursively. A file qualifies when
        its name ends in .jpg or .jpeg, in any letter case.
        """
        filelist = []
        for r, d, f in os.walk(path):
                for files in f:
                        # lower() makes the suffix test case-insensitive;
                        # endswith() accepts a tuple of alternatives
                        if files.lower().endswith((".jpg", ".jpeg")):
                                filelist.append([r, files])
        return filelist



def check_for_duplicate():
        """Print every pair of entries in the global file_list whose fingerprints match.

        The fingerprint of an entry is whatever exif() returns for it.
        Each matching pair is reported twice, once in each direction.
        """
        # Fingerprint every file once up front; computing it inside the
        # nested loop re-opened every image for every single comparison.
        infos = [exif(entry[0], entry[1]) for entry in file_list]

        for n, info in zip(file_list, infos):
                for x, info2 in zip(file_list, infos):
                        # n != x skips an entry being compared with itself
                        if info == info2 and n != x:
                                print("%s path -> %s  is duplicate of %s path -> %s " % (n[1], n[0], x[1], x[0]))


if __name__=='__main__':
        # The directory to scan is the first command-line argument; stop
        # with a usage message instead of an IndexError when it is missing.
        if len(sys.argv) < 2:
                sys.exit("usage: duplicate_image.py <directory>")
        file_list = search(sys.argv[1])  # entries for the image files found
        check_for_duplicate()

RUN

$ ./duplicate_image.py /home/user

Searches for all duplicate image files in the directory '/home/user'.

Comments

iamsudip dup_images 20130730 Posted: 2013-07-30 22:34

The assignment was to find all the duplicate images in the directories given as command-line arguments.

dup_images can find any type of duplicate files not only the images.

Code

 1 #!/usr/bin/env python
 2 import os
 3 import sys
 4 import hashlib
 5 
 6 data = {}  # A dictionary to keep all data.
 7 arranged_data = {}  # A dictionary to keep data in an arranged way.
 8 
 9 def hash_it(directory):
10     """Function to generate md5 hash
11     """
12     for path, sub_directories, files in os.walk(directory):
13         # Picks each file as 'filename' and generating hash.
14         for filename in files:
15             path_to_file = os.path.join(path, filename)
16             # Appends file as key and generated md5 hash as value to 'data'.
17             data[str(path_to_file)] = hashlib.md5(open(str(path_to_file)).read()).hexdigest()
18 
19 def find_duplicates():
20     """Function to find duplicate files
21     """
22     # Iterates through each key and value at a time.
23     for pair in data.items():
24         if pair[1] not in arranged_data.keys():
25             # If new hash value found appends it to 'arranged_data' as key.
26             arranged_data[pair[1]] = []
27         # Keeps all the files as a list in value to the same hash(key).
28         arranged_data[pair[1]].append(pair[0])
29 
30     # Iterates through the new dictionary, the files which are duplicates keeps them as a list.
31     dup_list = [files for hashes, files in arranged_data.items() if len(files) > 1]
32 
33     # A simple counter to count no. of duplicate files.
34     count = 1
35     # Iterates through each list and prints them.
36     for files in dup_list:
37         print str(count) + '.'
38         for dup_file in files:
39             print dup_file
40         count += 1
41 
42 if __name__ == '__main__':
43     for i in range(1,len(sys.argv)):
44         if os.path.exists(sys.argv[i]):
45             hash_it(sys.argv[i])
46         else:
47             print "Wrong path: %s" %(sys.argv[i])
48     find_duplicates()
49     sys.exit(0)

Link to code

You can find it here.

How to execute code

Run the above script like:

$ ./dup_images <PATH> <PATH> <PATH>

Example output

Here example output is given below:

sudip@sudip-mint dup_images $  pwd
/home/sudip/tmp/code/dup_images
sudip@sudip-mint dup_images $  ls -R
.:
dup_images  new1  new2  new3

./new1:
fb.jpg  fdgdg.txt

./new2:
ds.bmp  newsub1  wfdss.jpg

./new2/newsub1:
fb.jpg  newsub2

./new2/newsub1/newsub2:
dgg.bmp  viewer.py

./new3:
cs.jpg  sdf.png
sudip@sudip-mint dup_images $  python dup_images /home/sudip/tmp/code/dup_images /asshddk/ahdjd/ddjf
Wrong path: /asshddk/ahdjd/ddjf
1.
/home/sudip/tmp/code/dup_images/new2/newsub1/newsub2/dgg.bmp
/home/sudip/tmp/code/dup_images/new2/ds.bmp
2.
/home/sudip/tmp/code/dup_images/new1/fdgdg.txt
/home/sudip/tmp/code/dup_images/new2/newsub1/newsub2/viewer.py
3.
/home/sudip/tmp/code/dup_images/new2/wfdss.jpg
/home/sudip/tmp/code/dup_images/new3/cs.jpg
/home/sudip/tmp/code/dup_images/new1/fb.jpg
/home/sudip/tmp/code/dup_images/new2/newsub1/fb.jpg

Comments

amreshkumar dup_images assignment 20130730 Posted: 2013-07-30 15:02

dup_images

Command-line tool to find duplicate images in given directory(s).

Assignment:

To write a program such as ./dup_images.py [dir_path]...[dir_path] finds all duplicate images in the given directory(s).

Solution:

usage:

$ ./dup_images.py /home/user /home/user2 .

searches for all duplicate images in directory '/home/user' and '/home/user2' and '.'

$ ./dup_images.py -r /home/user /home/user2

searches for all duplicate images in directory '/home/user' and '/home/user2' and all their sub-directories

Code:

Link to code at Github

install with pip:

$ pip install PIL

if the installation fails, first try installing python-devel:

$ pip install python-devel

Code snippet:

#!/usr/bin/env python

"""
dup_images.py

Python program to find duplicate images in given directory(s)
v 0.1

Issues : Occasional False-positive result observed in (deep)recursive search in absence of Exif data.

"""
import sys,os,argparse
import types

from PIL import Image
from PIL.ExifTags import TAGS
from md5 import md5

def imread(IMG):
        """Reads exif data from image IMG and returns the dictionary object containing the Exif info.

        If Exif data is not present, the MD5 hex digest of img.tostring() is returned instead.
        """
        img = Image.open(IMG)
        # Fetch the Exif data once and reuse it, rather than asking the
        # image for it a second time on the return path.
        exif_info = img._getexif()
        if exif_info is None:
                return md5(img.tostring()).hexdigest()
        return exif_info


def recursive_search(dir_name):
        """ Creates a list of JPEG files in the given directory and all sub-directories.

        A file qualifies when its name ends in .jpg or .jpeg, in any letter
        case. Returns the list of full paths.
        """
        file_list = []
        try:
                for root, dirs, files in os.walk(dir_name, topdown=True):
                        for name in files:
                                # one case-insensitive test instead of a chain of
                                # exact-case endswith() calls
                                if name.lower().endswith(('.jpg', '.jpeg')):
                                        file_list.append(os.path.join(root, name))
                return file_list
        except OSError as e:
                # NOTE(review): os.walk() ignores directory-listing errors unless
                # an onerror callback is supplied, so this handler is rarely hit.
                print('\n' + '*' * 30 + ' \n' + str(e.filename) + '  -> ' + str(e.strerror))
                # sys.exit() rather than the site-provided exit(); a non-zero
                # status tells the caller the search failed
                sys.exit(1)

def simple_search(dir_name):
        """ Creates a list of JPEG files found directly in the given directory.

        Sub-directories are not searched. A file qualifies when its name
        ends in .jpg or .jpeg, in any letter case. If the directory cannot
        be listed, the error is printed and the program exits with status 1.
        """
        try:
                return [os.path.join(dir_name, name)
                        for name in os.listdir(dir_name)
                        if name.lower().endswith(('.jpg', '.jpeg'))]
        except OSError as e:
                print('\n' + '*' * 30 + ' \n' + str(e.filename) + '  -> ' + str(e.strerror))
                # sys.exit() rather than the site-provided exit(); a non-zero
                # status tells the caller the search failed
                sys.exit(1)


def compare(file_name,list_to_search):
        """
            Input:   file_name, list_to_search
            Process: Read the Exif/md5 data of 'file_name' once, then read the
                     data of each file in 'list_to_search' and test it for equality
            Output:  Prints one 'File Match' line per equal pair. Returns nothing.
                     Ctrl-C during the scan prints a farewell and exits.

        """
        try:
                reference_info = imread(file_name)
                for candidate in list_to_search:
                        if reference_info == imread(candidate):
                                print('File Match : %s <---> %s' % (file_name, candidate))
        except KeyboardInterrupt:
                print('\n\n:( you killed me.\n')
                sys.exit()


if __name__ == "__main__":
        # Parses the arguments, then does a simple or recursive search and
        # a comparison of the jpg files of each directory.

        parser = argparse.ArgumentParser(description='Finds duplicate images in DIRECTORY (current directory is default). Displays list of duplicates.',epilog='Dependency: PIL Library')
        parser.add_argument('-r',action='store_true', help='find images moving recursively through the child directories (default : search in parent directory only)')
        # nargs='*' (not '+') so that omitting the directories really falls
        # back to the current directory, as the description promises
        parser.add_argument('dir_list',nargs='*',default=[os.getcwd()],help='list of directories to search')
        args = parser.parse_args()

        # choose the search strategy once instead of duplicating the loop
        if args.r:
                search_function = recursive_search
        else:
                search_function = simple_search

        for dir_name in args.dir_list:
                file_list = search_function(dir_name)
                # Compare each file with the files that follow it. Slicing is
                # used because removing items from a list while iterating
                # over it silently skips every other element.
                for position, file_name in enumerate(file_list):
                        compare(file_name, file_list[position + 1:])

Comments

m0rin09ma3 dup_images 20130730 Posted: 2013-07-30 13:49

Prerequisite

I installed the PIL module for this assignment in my 'virt1' environment. I also prepared image files with and without EXIF data for testing.

(virt1) $ pip list
PIL (1.1.7)

(virt1) $ python dup_images.py -h
usage: dup_images.py [-h] D [D ...]

find duplicated image files.

positional arguments:
  D           directory to be searched

  optional arguments:
    -h, --help  show this help message and exit

A sample output. Current directory has exif.jpg and nonexif.jpg. /tmp/dir_1/ has a.jpg which is copied from ./exif.jpg and renamed as a.jpg. /tmp/dir_2/ has exif.jpg which is copied from ./exif.jpg.

(virt1) $ python dup_images.py /home/ska /usr/bin ./ abc /tmp/dir_1 /tmp/dir_2/
Warning: "/home/ska" is not a directory.
Warning: "abc" is not a directory.
Warning: "./nonexif.jpg" has no EXIF data.
Found a match: "/tmp/dir_2/exif.jpg"    "./exif.jpg"
Found a match: "/tmp/dir_2/exif.jpg"    "/tmp/dir_1/a.jpg"
Found a match: "/tmp/dir_1/a.jpg"       "./exif.jpg"

A link to the source code.

Explanation

This program finds duplicated JPEG image files in the directories you specify as command-line arguments. Please note that it skips invalid directories and does not look for matches among files that have no EXIF data.

First of all, my main function

def main():
    """
    Program entry point.

    0. get command line arguments
    1. find image files from directory
    2. find a match by comparing exif info

    Returns 1 when no image file was found, otherwise 0.
    """
    list_directory = parse_command_line()
    list_image_file = find_image_files(list_directory)

    if list_image_file:
        find_a_match(list_image_file)
        return 0

    print('no image file found in the directory.')
    return 1

# When run as a script, call main() and pass its return value to exit().
if __name__ == '__main__':
    exit(main())

Parsing command line arguments. Imported argparse module.

def parse_command_line():
    """ Parse the command line; the user must give at least one directory.

    Returns the argparse namespace, whose 'directory' attribute holds the
    list of directory arguments.
    """
    arg_parser = argparse.ArgumentParser(
        description='find duplicated image files.')
    arg_parser.add_argument(
        'directory',
        metavar='D',
        type=str,
        nargs='+',
        help='directory to be searched',
    )
    return arg_parser.parse_args()

Find JPEG image file from valid directory and return it. Imported os, glob modules.

def find_image_files(list_directory):
    """ Find JPEG image files only from valid directories (not recursive).

    list_directory -- parsed arguments; its 'directory' attribute lists
                      the paths to look in

    Returns the list of matching file paths. A warning is printed for
    every path that is not a directory.
    """
    # the character classes make the glob match .jpg / .jpeg in any letter case
    patterns = ('*.[jJ][pP][gG]', '*.[jJ][pP][eE][gG]')
    image_files = []
    for directory in list_directory.directory:
        if not os.path.isdir(directory):
            print('Warning: "%s" is not a directory.' % directory)
            continue
        for pattern in patterns:
            image_files.extend(glob.glob(os.path.join(directory, pattern)))

    return image_files

Finally, a somewhat longer, more confusing and more fragile piece of code to find the duplicates. I use two lists. The first holds the image files that have EXIF data (I created a separate function to read the EXIF data of an image file). The second holds the EXIF details. The two lists are related by position, so whoever maintains this code must keep the order of their elements in step.

def find_a_match(list_image_file):
    """ Find duplicates only from files with EXIF data.

    list_image_file -- paths of the image files to compare

    Prints a warning for every file without EXIF data and a
    'Found a match' line for every pair with equal EXIF data. Returns 0.
    """
    # Two parallel lists: element k of one belongs to element k of the other.
    list_image_file_with_exif = []
    list_exif_data = []
    for image_file in list_image_file:
        dict_exif_data = get_exif_data(image_file)
        if not dict_exif_data:
            print('Warning: "%s" has no EXIF data.' % image_file)
            continue
        list_image_file_with_exif.append(image_file)
        list_exif_data.append(dict_exif_data)

    total = len(list_image_file_with_exif)
    # Visit every unordered pair exactly once, highest index first.
    for i in range(total-1, 0, -1):
        for j in range(i):
            # plain equality: the cmp() builtin no longer exists in Python 3
            if list_exif_data[i] == list_exif_data[j]:
                print('Found a match: "%s"\t"%s"' % (
                                      list_image_file_with_exif[i],
                                      list_image_file_with_exif[j]))

    return 0

def get_exif_data(fname):
    """ Get the embedded EXIF data of an image file as a dictionary.

    Each numeric tag is replaced by its name from TAGS when one is known.
    An empty dictionary is returned when the file cannot be opened (an
    error line is printed), when the image object has no _getexif
    method, or when it carries no EXIF data.
    """
    try:
        img = Image.open(fname)
    except IOError:
        print('Error: IOError ' + fname)
        return {}

    if not hasattr(img, '_getexif'):
        return {}

    raw_exif = img._getexif()
    if raw_exif is None:
        return {}

    return dict((TAGS.get(tag, tag), value) for tag, value in raw_exif.items())

Comments

elitalobo Dummy project v1 20130730 Posted: 2013-07-30 13:16

TASK

Write a shell script that installs the Myshell project. A few changes had to be made to the setup.py file so that the required dependencies are installed.

Code

Code for setup.py file

#!/usr/bin/env python2

from setuptools import find_packages, setup


# Distribution metadata for the "elitashell" package.
setup(
    name='elitashell',
    version="0.3.2",
    description="elita's Shell",
    long_description="Dummy Project",
    platforms=["Linux"],
    author="Elita",
    author_email="loboelita@gmail.com",
    url="https://github.com/elitalobo/HomeTask1",
    license="MIT",
    # libraries that must be installed together with the package
    install_requires=["requests", "cmd2"],
    packages=find_packages(),
    package_data={
        '': ['elitashell/*.rst'],
    },
    # installs an "eshell" command that calls elitashell.main
    entry_points={
        'console_scripts': [
            'eshell = elitashell:main',
        ],
    },
)

link link

link to the project

link1

Comments

elitalobo Myshell 20130730 Posted: 2013-07-30 06:43

task

Write code that creates a cmd shell which greets the user when given the command greet, and prints the share value of a company when given the command stock <NASDAQ_SYMBOL>.

CODE This is the code of Myshell.py. It imports Stock function from ShareValue module.

from cmd2 import Cmd
__version__ = '0.1'
from getpass import getuser
import requests
import sys
from ShareValue import Stock #imports Stock function from Sharevalue module
# this code creates a cmd shell
class Application(Cmd):
    """
    The main Application class: a cmd2 shell offering the commands
    hello, sayit, greet and stock.

    """

    def __init__(self):
        Cmd.__init__(self)

    def do_hello(self, line):
        # prints hello <line> on giving the command hello <line>
        print("Hello: %s" % line)

    def do_sayit(self, line):
        # prints Python Rocks! on giving the command sayit <line>
        print("Python Rocks!")

    def do_greet(self, line):
        # greets the user on giving the command greet; the username comes
        # from getpass.getuser()
        print("Hi! %s" % (getuser()))

    def do_stock(self, line):
        # prints stock value on giving the command stock <NASDAQ_SYMBOL>
        Stock(line)


# main() lives at module level so the script's entry-point call can resolve
# it; nested inside the class it was only reachable as a method.
def main():
    # creates the Application shell and runs its command loop
    app = Application()
    app.cmdloop()


# Script entry point.
# NOTE(review): this call needs a module-level main(); confirm that main is
# not defined inside the Application class.
if __name__ == '__main__':
        main() #calls main()

code for ShareValue.py

import requests
#imports request library

def Stock(line1):
    """Print the share value for the symbol given as the first word of line1.

    The value is requested from the Yahoo quotes CSV URL below. An empty
    symbol, an empty reply or a reply of 0.00 is reported as an invalid
    symbol. Nothing is returned.
    """
    symbol = line1.split(" ")[0]  # first word of the input line
    if not symbol:
        # nothing to look up, so skip the network request
        print("invalid nasdaq symbol")
        return

    url = 'http://download.finance.yahoo.com/d/quotes.csv?s=%s&f=l1'% symbol
    sharevalue = requests.get(url)  # uses the requests library to open the url
    try:
        # Keep only the first line and strip the trailing "\r\n" so it is
        # neither compared nor printed.
        lines = sharevalue.text.splitlines()
        value = lines[0].strip() if lines else ""
        if not value or value == "0.00":
            print("invalid nasdaq symbol")
        else:
            print("Sharevalue of the company with nasdaq symbol %s is %s" % (symbol, value))
    finally:
        # release the connection even when reading the reply fails
        sharevalue.close()

link link

Comments

elitalobo Dummy Project 20130730 Posted: 2013-07-30 06:42

TASK

Write a shell script that installs the Myshell project. A few changes had to be made to the setup.py file so that the required dependencies are installed.

Code

Code for setup.py file

#!/usr/bin/env python2

from setuptools import find_packages, setup


# Distribution metadata for the "elitashell" package.
setup(
    name='elitashell',
    version="0.3.2",
    description="elita's Shell",
    long_description="Dummy Project",
    platforms=["Linux"],
    author="Elita",
    author_email="loboelita@gmail.com",
    url="https://github.com/elitalobo/HomeTask1",
    license="MIT",
    # libraries that must be installed together with the package
    install_requires=["requests", "cmd2"],
    packages=find_packages(),
    package_data={
        '': ['elitashell/*.rst'],
    },
    # installs an "eshell" command that calls elitashell.main
    entry_points={
        'console_scripts': [
            'eshell = elitashell:main',
        ],
    },
)

link link

link to the project

link1

Comments

JCaselles tweetirup 20130729 Posted: 2013-07-29 12:58

Tweetir'up!

Command-line tool to tweet an image with description.

Assignment:

To write a command such as tweetup -f <path> -d <description> that successfully tweets an image with an appended description.

Solution:

Problematic:

In this assignment we face some issues

First, we have to choose which module to use from the list available. We were advised to use user/password authentication, but that authentication method is deprecated and is only supported by the tweepy module. Since Twitter clearly states that only OAuth authentication is supported, I decided to use OAuth. My chosen module is therefore Twython, because it supports update_status_with_media.

Second, how to store the user credentials. As I'm using OAuth, authentication is handled by the user granting access on the web and by the resulting authorization tokens. On the first run, the app asks the user (by opening a web browser tab) to allow it to use their Twitter account, and to copy the PIN that is shown so that the app can obtain the tokens. Once this is done, the tokens are stored in ~/.tweetiruprc for later runs.

Code:

Link to code at Github

install with pip:

$ pip install twython
$ pip install -i https://testpypi.python.org/pypi tweetirup

Code snippet:

#!/usr/bin/env python

"""
Command-line twitter updater tool.

"""
from sys import exit
from os import path
from twython import Twython, TwythonError
from webbrowser import open as webopen
from argparse import ArgumentParser

# OAuth consumer credentials of this application; they are passed to every
# Twython() constructor call in this script.
# NOTE(review): the secret is committed in plain text — consider loading it
# from the environment or a configuration file instead.
CONSUMER_KEY = "cGZNRDAMsJIDqDtpasgg"
CONSUMER_SECRET = "tIpyNSjr32wiCSPJeIdD8qtxMNOyGRohcbX9nMtNg"

def auth_control ():
    """
    Controls the authentication process. If ~/.tweetiruprc can be opened, the
    two tokens are read from its first two lines. Otherwise new tokens are
    requested through get_auth_tokens() and written to that file for
    further runs.

    Returns: Twython instance built from the consumer credentials and the
    two tokens.

    """

    rc_path = path.expanduser("~/.tweetiruprc")

    try:
        tokens_f = open(rc_path, "r")
        # TODO: evaluate the use of ConfigParse

    except IOError:
        token, secret = get_auth_tokens() # Get new tokens, see below

        try:
            # The with-block closes the file only if it was actually opened.
            # A separate "finally: close()" raised an unbound-name error
            # whenever open() itself failed, hiding the real problem.
            with open(rc_path, "w") as tokens_f:
                tokens_f.write("OAUTH_TOKEN: %s\n" % token)
                tokens_f.write("OAUTH_TOKEN_SECRET: %s\n" % secret)

        except IOError:
            exit("Unexpected error")

    else:
        # each line looks like "NAME: value"; keep the value part
        with tokens_f:
            token = tokens_f.readline().split(" ")[1].strip()
            secret = tokens_f.readline().split(" ")[1].strip()

    return Twython(CONSUMER_KEY, CONSUMER_SECRET, token, secret)



def get_auth_tokens():
    """
    Gets authentication tokens using the Twython procedure.
    It opens a browser tab on the authorization URL, where the user is
    asked to allow this app, and then prompts for the pin shown there.

    Returns: tokens "OAUTH_TOKEN" and "OAUTH_TOKEN_SECRET" in a tuple

    """

    first_step = Twython(CONSUMER_KEY, CONSUMER_SECRET)
    mid_step = first_step.get_authentication_tokens()
    webopen(mid_step["auth_url"], 2)

    # The adjacent literals are joined into one prompt; the space before
    # "here" keeps the last two words from running together.
    auth_pin = raw_input("This is your first time using this app, you have to"
                        " authorize it.\nA new tab in your browser has been open"
                        ", where you can authorize this app. Remember to copy"
                        " the pin number given to you.\n\nEnter the pin number"
                        " here: ")

    # a second client, built from the temporary tokens, trades the pin for
    # the final tokens
    twy = Twython(CONSUMER_KEY, CONSUMER_SECRET,
                  mid_step["oauth_token"], mid_step["oauth_token_secret"])

    final_tokens = twy.get_authorized_tokens(auth_pin)

    return final_tokens["oauth_token"], final_tokens["oauth_token_secret"]



def tweet_image (image_path, status):
    """
    Tweets an image with its corresponding description.
    Uses Twython's update_status_with_media method.

    param image_path: the path to the image
    param status: the description (tweet, status) annexed to this image

    Exits with an error message when the image cannot be opened or when
    Twython reports an error.

    """

    try:
        pic = open(image_path, "rb")

    # "as" spelling of the handler: valid on Python 2.6+ and on Python 3
    except IOError as e:
        exit("\n*Tweetir'up!* Error opening image: %s" % e)

    # the with-block closes the image on success and on every error path
    with pic:

        try:
            auth_control().update_status_with_media(media = pic, status = status)

        except TwythonError:
            exit("\n*Tweetir'up!* Unespected server error")

        else:
            print("\n*Tweetir'up!* Successfully twitted image")




if __name__ == "__main__":
    # Parsing options:
    #     -f --file: (required) path to image
    #     -d --description: (optional) description to append to the image.
    #                       If not provided, a default string is used
    parser = ArgumentParser()
    parser.add_argument("-f", "--file", help = "Add path to image", required = True)
    parser.add_argument("-d", "--description",
                        help = "Add optional description of the image")
    args = parser.parse_args()

    # a missing or empty description falls back to the default text
    descript = args.description or "Twitted with Tweetir'up!"

    tweet_image(args.file, descript)

Comments