Problem: Write a program in python to find duplicate images in given directories.
Requirements: Python Imaging Library (PIL)
Input: Directories in which images are to be searched.
import sys
import os
import hashlib


def file_reader(file_name, size=1024):
    """Yield successive chunks read from an open file object.

    file_name -- despite the name, an already-open file object
    size      -- number of bytes requested per read (default 1024, i.e. 1 KB)

    The generator stops at the first empty read (end of file).
    """
    while True:
        file_piece = file_name.read(size)
        if not file_piece:
            return
        yield file_piece


def check_for_duplicates(paths, hash=hashlib.sha1):
    """Walk every directory in paths and report files with identical content.

    paths -- iterable of directory paths to walk recursively
    hash  -- hashlib-style constructor used to digest file contents

    A file is identified by the pair (content digest, size in bytes). The
    first file seen with a given identity is remembered; every later file
    with the same identity is printed and collected as a duplicate of it.

    Returns a list of (duplicate_path, first_seen_path) tuples.
    """
    hashes = {}
    duplicates = []
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                # identify each file using its whole pathname
                full_path = os.path.join(dirpath, filename)
                hash_return = hash()
                try:
                    # 'with' guarantees the handle is closed after hashing
                    with open(full_path, 'rb') as handle:
                        for file_piece in file_reader(handle):
                            hash_return.update(file_piece)
                    file_id = (hash_return.digest(),
                               os.path.getsize(full_path))
                except (IOError, OSError) as error:
                    # e.g. broken symlink or permission denied: skip the
                    # file instead of aborting the whole scan
                    sys.stderr.write("Skipping %s: %s\n" % (full_path, error))
                    continue
                duplicate = hashes.get(file_id)
                if duplicate is not None:
                    print("Duplicate found: %s and %s" % (full_path, duplicate))
                    duplicates.append((full_path, duplicate))
                else:
                    hashes[file_id] = full_path
    return duplicates


if __name__ == '__main__':
    if sys.argv[1:]:
        check_for_duplicates(sys.argv[1:])
    else:
        print("Please pass the paths to check as parameters to the script")
The assignment was to find all duplicate images in the directories given as command-line arguments.
duplicate_image finds the duplicate image files in a given directory.
#!/usr/bin/env python
"""
duplicate_image.py is a python program to search duplicate images
in the given directories
"""
import os
import sys
import hashlib
import types  # kept from the original script; no longer used below

try:
    from PIL import Image
except ImportError:
    # Without PIL there is no EXIF access; exif() then always falls back
    # to the MD5 of the file contents.
    Image = None

# [directory, file name] pairs collected by search(); read by
# check_for_duplicate().
file_list = []


def MD5(location, file):
    """Return the hex MD5 digest of the contents of location/file.

    Used as the identity of images that carry no EXIF information.
    The file is read in binary mode, in chunks, without changing the
    current working directory.
    """
    md5 = hashlib.md5()
    with open(os.path.join(location, file), "rb") as picture:
        for chunk in iter(lambda: picture.read(65536), b""):
            md5.update(chunk)
    return md5.hexdigest()


def exif(location, file):
    """Return the identity used to compare the image location/file.

    Returns the EXIF dictionary reported by PIL when the image has one.
    Otherwise (no EXIF data, PIL not installed, or PIL cannot read the
    file) returns the MD5 hex digest of the file contents.
    """
    info = None
    if Image is not None:
        try:
            with open(os.path.join(location, file), "rb") as picture:
                info = Image.open(picture)._getexif()
        except (IOError, OSError, AttributeError):
            # Not a readable image, or an image type without _getexif().
            info = None
    if not info:
        return MD5(location, file)
    return info


def search(path):
    """Return [directory, file name] for every JPEG file below path.

    A file counts as JPEG when its name ends in '.jpg' or '.jpeg',
    compared case-insensitively.
    """
    filelist = []
    for directory, _subdirs, names in os.walk(path):
        for name in names:
            if name.lower().endswith((".jpg", ".jpeg")):
                filelist.append([directory, name])
    return filelist


def check_for_duplicate():
    """Print every pair of entries in file_list that share an identity.

    The identity of each file (see exif()) is computed once. Each
    unordered pair is reported once, as
    "<name> path -> <dir> is duplicate of <name> path -> <dir> ".

    Returns the list of reported (first_entry, second_entry) pairs.
    """
    identities = [exif(entry[0], entry[1]) for entry in file_list]
    pairs = []
    for index, first in enumerate(file_list):
        for offset, second in enumerate(file_list[index + 1:], index + 1):
            if identities[index] == identities[offset]:
                print("%s path -> %s is duplicate of %s path -> %s "
                      % (first[1], first[0], second[1], second[0]))
                pairs.append((first, second))
    return pairs


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit("usage: duplicate_image.py DIRECTORY [DIRECTORY ...]")
    # Collect the JPEG files of every directory given on the command line.
    for directory in sys.argv[1:]:
        file_list.extend(search(directory))
    check_for_duplicate()
$ ./duplicate_image.py /home/user
Searches for all duplicate image files in the directory '/home/user'.
The assignment was to find all duplicate images in the directories given as command-line arguments.
dup_images can find duplicate files of any type, not only images.
#!/usr/bin/env python
import os
import sys
import hashlib

data = {}  # Maps each file path to the md5 hex digest of its contents.
arranged_data = {}  # Maps each md5 hex digest to the list of paths having it.


def hash_it(directory):
    """Record the md5 digest of every file below directory in 'data'.

    Files are read in binary mode and in chunks, so large files are not
    loaded into memory at once. A file that cannot be read is reported on
    stderr and skipped.
    """
    for path, sub_directories, files in os.walk(directory):
        for filename in files:
            path_to_file = os.path.join(path, filename)
            digest = hashlib.md5()
            try:
                with open(path_to_file, 'rb') as handle:
                    for chunk in iter(lambda: handle.read(65536), b''):
                        digest.update(chunk)
            except (IOError, OSError) as error:
                sys.stderr.write("Skipping %s: %s\n" % (path_to_file, error))
                continue
            data[path_to_file] = digest.hexdigest()


def find_duplicates():
    """Group the paths in 'data' by digest and print each duplicate group.

    'arranged_data' is rebuilt from scratch on every call, so calling the
    function repeatedly does not accumulate repeated entries. Groups and
    the paths inside them are sorted so the output is deterministic.

    Returns the list of duplicate groups (each a list of paths).
    """
    arranged_data.clear()
    for path_to_file, digest in data.items():
        arranged_data.setdefault(digest, []).append(path_to_file)

    # Only digests shared by more than one file are duplicates.
    dup_list = sorted(
        sorted(files) for files in arranged_data.values() if len(files) > 1
    )

    for count, files in enumerate(dup_list, 1):
        print(str(count) + '.')
        for dup_file in files:
            print(dup_file)
    return dup_list


if __name__ == '__main__':
    for candidate in sys.argv[1:]:
        if os.path.exists(candidate):
            hash_it(candidate)
        else:
            print("Wrong path: %s" % (candidate))
    find_duplicates()
    sys.exit(0)
You can find it here.
Run the above script like:
$ ./dup_images <PATH> <PATH> <PATH>
Here example output is given below:
sudip@sudip-mint dup_images $ pwd /home/sudip/tmp/code/dup_images sudip@sudip-mint dup_images $ ls -R .: dup_images new1 new2 new3 ./new1: fb.jpg fdgdg.txt ./new2: ds.bmp newsub1 wfdss.jpg ./new2/newsub1: fb.jpg newsub2 ./new2/newsub1/newsub2: dgg.bmp viewer.py ./new3: cs.jpg sdf.png sudip@sudip-mint dup_images $ python dup_images /home/sudip/tmp/code/dup_images /asshddk/ahdjd/ddjf Wrong path: /asshddk/ahdjd/ddjf 1. /home/sudip/tmp/code/dup_images/new2/newsub1/newsub2/dgg.bmp /home/sudip/tmp/code/dup_images/new2/ds.bmp 2. /home/sudip/tmp/code/dup_images/new1/fdgdg.txt /home/sudip/tmp/code/dup_images/new2/newsub1/newsub2/viewer.py 3. /home/sudip/tmp/code/dup_images/new2/wfdss.jpg /home/sudip/tmp/code/dup_images/new3/cs.jpg /home/sudip/tmp/code/dup_images/new1/fb.jpg /home/sudip/tmp/code/dup_images/new2/newsub1/fb.jpg
Command-line tool to find duplicate images in given directory(s).
The task: write a program, invoked as ./dup_images.py [dir_path]...[dir_path], that finds all duplicate images in the given directory(s).
usage:
$ ./dup_images.py /home/user /home/user2 .
searches for all duplicate images in directory '/home/user' and '/home/user2' and '.'
$ ./dup_images.py -r /home/user /home/user2
searches for all duplicate images in directory '/home/user' and '/home/user2' and all their sub-directories
install with pip:
$ pip install PIL
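if the installation fails, first install the Python development headers. python-devel is a system package, not a pip package, so install it with your distribution's package manager, for example:
$ sudo yum install python-devel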
Code snippet:
#!/usr/bin/env python
"""
dup_images.py

Python program to find duplicate images in given directory(s)
v 0.1

Issues : Occasional False-positive result observed in (deep)recursive
search in absence of Exif data.
"""
import sys
import os
import argparse
import hashlib
import types  # kept from the original script; no longer used below

try:
    from PIL import Image
    from PIL.ExifTags import TAGS
except ImportError:
    # Let the module import without PIL; imread() reports the problem
    # when image data is actually needed.
    Image = None
    TAGS = {}

JPEG_SUFFIXES = (".jpg", ".jpeg")


def _is_jpeg(name):
    """Return True when name ends in a JPEG suffix (case-insensitive)."""
    return name.lower().endswith(JPEG_SUFFIXES)


def _report_os_error(error):
    """Write the file name and reason of an OSError to stderr."""
    sys.stderr.write('\n%s\n%s -> %s\n'
                     % ('*' * 30, error.filename, error.strerror))


def imread(IMG):
    """Return the identity used to compare the image file IMG.

    Returns the Exif dictionary when the image has Exif data, otherwise
    the MD5 hex digest of the decoded pixel data.
    Raises ImportError when PIL is not installed.
    """
    if Image is None:
        raise ImportError("PIL is required to read image data")
    img = Image.open(IMG)
    get_exif = getattr(img, "_getexif", None)
    exif = get_exif() if get_exif is not None else None
    if exif is not None:
        return exif
    # tostring() was renamed tobytes() in Pillow; support both.
    raw = img.tobytes() if hasattr(img, "tobytes") else img.tostring()
    return hashlib.md5(raw).hexdigest()


def recursive_search(dir_name):
    """Return the JPEG files in dir_name and all of its sub-directories.

    Directories that cannot be listed are reported on stderr and skipped.
    """
    file_list = []
    for root, dirs, files in os.walk(dir_name, topdown=True,
                                     onerror=_report_os_error):
        for name in files:
            if _is_jpeg(name):
                file_list.append(os.path.join(root, name))
    return file_list


def simple_search(dir_name):
    """Return the JPEG files located directly in dir_name.

    Exits the program when dir_name cannot be listed.
    """
    try:
        names = os.listdir(dir_name)
    except OSError as e:
        _report_os_error(e)
        sys.exit(1)
    return [os.path.join(dir_name, name) for name in names if _is_jpeg(name)]


def compare(file_name, list_to_search):
    """Print a message for each file in list_to_search matching file_name.

    Input:   file_name, list_to_search
    Process: match Exif/md5 data of 'file_name' with all files in
             'list_to_search'
    Output:  displays a message for each MATCH FOUND.
    """
    try:
        dict_info_file = imread(file_name)
        for test_file in list_to_search:
            if dict_info_file == imread(test_file):
                print('File Match : %s <---> %s' % (file_name, test_file))
    except KeyboardInterrupt:
        print('\n\n:( you killed me.\n')
        sys.exit()


def find_duplicates(file_list, signature=imread):
    """Print and return every pair of files in file_list with equal identity.

    file_list -- sequence of file paths; it is not modified
    signature -- callable mapping a path to its identity (default imread)

    Each file's identity is computed once, and every unordered pair is
    compared exactly once. The original loop removed items from the list
    it was iterating over, which silently skipped every other file.

    Returns a list of (first_path, second_path) tuples.
    """
    tagged = [(name, signature(name)) for name in file_list]
    matches = []
    for index, (name, identity) in enumerate(tagged):
        for other_name, other_identity in tagged[index + 1:]:
            if identity == other_identity:
                print('File Match : %s <---> %s' % (name, other_name))
                matches.append((name, other_name))
    return matches


if __name__ == "__main__":
    # Parse the arguments, then do a simple/recursive search and comparison
    # of jpg files, one given directory at a time.
    parser = argparse.ArgumentParser(
        description='Finds duplicate images in DIRECTORY (current directory '
                    'is default). Displays list of duplicates.',
        epilog='Dependency: PIL Library')
    parser.add_argument(
        '-r', action='store_true',
        help='find images moving recursively through the child directories '
             '(default : search in parent directory only)')
    # nargs='*' lets the documented default (current directory) take effect;
    # with nargs='+' at least one directory was always required.
    parser.add_argument('dir_list', nargs='*', default=[os.getcwd()],
                        help='list of directories to search')
    args = parser.parse_args()

    search = recursive_search if args.r else simple_search
    try:
        for dir_name in args.dir_list:
            find_duplicates(search(dir_name))
    except KeyboardInterrupt:
        print('\n\n:( you killed me.\n')
        sys.exit()
I installed the PIL module for this assignment in my 'virt1' environment. I also prepared image files with and without EXIF data for testing.
(virt1) $ pip list PIL (1.1.7)
(virt1) $ python dup_images.py -h usage: dup_images.py [-h] D [D ...] find duplicated image files. positional arguments: D directory to be searched optional arguments: -h, --help show this help message and exit
A sample output. Current directory has exif.jpg and nonexif.jpg. /tmp/dir_1/ has a.jpg which is copied from ./exif.jpg and renamed as a.jpg. /tmp/dir_2/ has exif.jpg which is copied from ./exif.jpg.
(virt1) $ python dup_images.py /home/ska /usr/bin ./ abc /tmp/dir_1 /tmp/dir_2/ Warning: "/home/ska" is not a directory. Warning: "abc" is not a directory. Warning: "./nonexif.jpg" has no EXIF data. Found a match: "/tmp/dir_2/exif.jpg" "./exif.jpg" Found a match: "/tmp/dir_2/exif.jpg" "/tmp/dir_1/a.jpg" Found a match: "/tmp/dir_1/a.jpg" "./exif.jpg"
A link to the source code.
This program finds duplicated JPEG image files in the directories you specify as command line arguments. Please note that it does not look for matches among files in invalid directories or among files that have no EXIF data.
First of all, my main function
def main():
    """Run the duplicate finder and return a process exit status.

    Steps:
    0. get command line arguments
    1. find image files from directory
    2. find a match by comparing exif info

    Returns 1 when no image file was found, 0 otherwise.
    """
    directories = parse_command_line()
    images = find_image_files(directories)
    if images:
        find_a_match(images)
        return 0
    print('no image file found in the directory.')
    return 1


if __name__ == '__main__':
    exit(main())
Parsing command line arguments. Imported argparse module.
def parse_command_line():
    """Parse the command line and return the resulting namespace.

    The user must name at least one directory; the values are available
    as the 'directory' attribute (a list of strings) of the result.
    """
    directory_option = {
        'metavar': 'D',
        'type': str,
        'nargs': '+',
        'help': 'directory to be searched',
    }
    cli = argparse.ArgumentParser(description='find duplicated image files.')
    cli.add_argument('directory', **directory_option)
    return cli.parse_args()
Find JPEG image file from valid directory and return it. Imported os, glob modules.
def find_image_files(list_directory):
    """Return the JPEG files found directly inside the valid directories.

    list_directory -- object whose 'directory' attribute is the list of
                      directory paths to inspect (the parsed command line)

    A name counts as JPEG when it ends in '.jpg' or '.jpeg', compared
    case-insensitively (the original pattern matched lowercase '.jpg'
    only). Paths that are not directories are reported with a warning and
    skipped. Files are returned sorted by name within each directory.
    """
    image_files = []
    for directory in list_directory.directory:
        if not os.path.isdir(directory):
            print('Warning: "%s" is not a directory.' % directory)
            continue
        for name in sorted(os.listdir(directory)):
            # The glob pattern '*.jpg' never matched dotfiles; keep
            # skipping them.
            if name.startswith('.'):
                continue
            if name.lower().endswith(('.jpg', '.jpeg')):
                image_files.append(os.path.join(directory, name))
    return image_files
Finally, a somewhat longer, more confusing and more fragile piece of code to find duplicates. I use two lists. The first holds the image files that have EXIF data (I moved the EXIF check into a separate helper function). The second holds the EXIF details of those files. The two lists are related by position: element i of one list belongs to element i of the other. Therefore, whoever maintains the code must keep the order of the elements in these two lists consistent.
def find_a_match(list_image_file):
    """Print every pair of image files whose EXIF data are equal.

    list_image_file -- list of image file paths

    Files without EXIF data are reported with a warning and left out of
    the comparison. Each file is kept together with its EXIF dictionary
    as one (path, exif) pair, so the two can never get out of step.
    Pairs are reported in the same order as before: the later file in the
    list first, compared against every earlier file.

    Always returns 0.
    """
    images_with_exif = []
    for image_file in list_image_file:
        dict_exif_data = get_exif_data(image_file)
        if not dict_exif_data:
            print('Warning: "%s" has no EXIF data.' % image_file)
        else:
            images_with_exif.append((image_file, dict_exif_data))

    total = len(images_with_exif)
    for i in range(total - 1, 0, -1):
        later_file, later_exif = images_with_exif[i]
        for earlier_file, earlier_exif in images_with_exif[:i]:
            # '==' replaces 'not cmp(...)'; cmp() no longer exists in
            # Python 3.
            if later_exif == earlier_exif:
                print('Found a match: "%s"\t"%s"' % (later_file, earlier_file))
    return 0


def get_exif_data(fname):
    """Return the EXIF data embedded in the image file fname.

    The result maps decoded tag names (or the numeric tag when TAGS has
    no name for it) to their values. An empty dictionary is returned when
    the file cannot be opened as an image (an error line is printed), or
    when the image has no EXIF data.
    """
    exif_data = {}
    try:
        img = Image.open(fname)
    except IOError:
        print('Error: IOError ' + fname)
        return exif_data
    if hasattr(img, '_getexif'):
        exif = img._getexif()
        if exif is not None:
            for tag, value in exif.items():
                exif_data[TAGS.get(tag, tag)] = value
    return exif_data
TASK
Write a shell script that installs the Myshell project. A few changes had to be made to the setup.py file so that the required dependencies are installed.
Code
Code for setup.py file
#!/usr/bin/env python2
"""Packaging script for the elitashell project."""
from setuptools import find_packages, setup

# Third-party packages installed together with the project.
REQUIREMENTS = ["requests", "cmd2"]

# Non-code files shipped with the packages.
# NOTE(review): package_data patterns are resolved relative to each package
# directory, so this looks for elitashell/*.rst *inside* every package;
# confirm against the project layout.
DATA_FILES = {
    '': ['elitashell/*.rst'],
}

# Installs the 'eshell' command, which calls main() of the elitashell package.
CONSOLE_SCRIPTS = [
    'eshell = elitashell:main',
]

setup(
    name='elitashell',
    version="0.3.2",
    description="elita's Shell",
    long_description="Dummy Project",
    platforms=["Linux"],
    author="Elita",
    author_email="loboelita@gmail.com",
    url="https://github.com/elitalobo/HomeTask1",
    license="MIT",
    install_requires=REQUIREMENTS,
    packages=find_packages(),
    package_data=DATA_FILES,
    entry_points={'console_scripts': CONSOLE_SCRIPTS},
)
link link
link to the project
task
Write code that creates a cmd shell which greets the user when given the command greet, and which prints the share value of a company when given the command stock <NASDAQ_SYMBOL>.
CODE: This is the code of Myshell.py. It imports the Stock function from the ShareValue module.
from cmd2 import Cmd
from getpass import getuser
import requests
import sys
from ShareValue import Stock  # Stock prints the share value for a symbol

__version__ = '0.1'


class Application(Cmd):
    """Interactive cmd shell with the hello, sayit, greet and stock commands."""

    def do_hello(self, line):
        """hello <text>: print "Hello:" followed by <text>."""
        print("Hello: %s" % line)

    def do_sayit(self, line):
        """sayit: print "Python Rocks!"."""
        print("Python Rocks!")

    def do_greet(self, line):
        """greet: greet the current user by login name."""
        # The login name comes from getpass.getuser(), not from requests.
        print("Hi! %s" % (getuser()))

    def do_stock(self, line):
        """stock <NASDAQ_SYMBOL>: print the share value of the company."""
        symbol = line.strip()
        if not symbol:
            # Without a symbol there is nothing to look up.
            print("usage: stock <NASDAQ_SYMBOL>")
            return
        Stock(symbol)


def main():
    """Create the shell and run its command loop until the user quits."""
    app = Application()
    app.cmdloop()


if __name__ == '__main__':
    main()
code for ShareValue.py
import requests

# NOTE(review): this Yahoo CSV quote endpoint appears to have been retired;
# confirm that it still answers before relying on this command.
QUOTE_URL = 'http://download.finance.yahoo.com/d/quotes.csv'


def Stock(line1):
    """Print the last share value for the symbol named in line1.

    line1 -- command argument text; its first whitespace-separated word
             is used as the NASDAQ symbol

    Prints "invalid nasdaq symbol" when no symbol is given or the service
    answers with an empty value or 0.00, and an error line when the
    request fails. Returns None.
    """
    words = line1.split()
    if not words:
        print("invalid nasdaq symbol")
        return
    symbol = words[0]

    try:
        # 'params' URL-encodes the symbol; the timeout keeps the shell from
        # hanging on an unresponsive server.
        response = requests.get(QUOTE_URL,
                                params={'s': symbol, 'f': 'l1'},
                                timeout=10)
        response.raise_for_status()
    except requests.RequestException as error:
        print("could not fetch share value for %s: %s" % (symbol, error))
        return

    try:
        lines = response.text.splitlines()
    finally:
        response.close()

    # Only the first line carries the value; strip the line ending so the
    # message is printed on a single line.
    price = lines[0].strip() if lines else ""
    if not price or price == "0.00":
        print("invalid nasdaq symbol")
    else:
        print("Sharevalue of the company with nasdaq symbol %s is %s"
              % (symbol, price))
link link
TASK
Write a shell script that installs the Myshell project. A few changes had to be made to the setup.py file so that the required dependencies are installed.
Code
Code for setup.py file
#!/usr/bin/env python2
"""Packaging script for the elitashell project."""
from setuptools import find_packages, setup

# Third-party packages installed together with the project.
REQUIREMENTS = ["requests", "cmd2"]

# Non-code files shipped with the packages.
# NOTE(review): package_data patterns are resolved relative to each package
# directory, so this looks for elitashell/*.rst *inside* every package;
# confirm against the project layout.
DATA_FILES = {
    '': ['elitashell/*.rst'],
}

# Installs the 'eshell' command, which calls main() of the elitashell package.
CONSOLE_SCRIPTS = [
    'eshell = elitashell:main',
]

setup(
    name='elitashell',
    version="0.3.2",
    description="elita's Shell",
    long_description="Dummy Project",
    platforms=["Linux"],
    author="Elita",
    author_email="loboelita@gmail.com",
    url="https://github.com/elitalobo/HomeTask1",
    license="MIT",
    install_requires=REQUIREMENTS,
    packages=find_packages(),
    package_data=DATA_FILES,
    entry_points={'console_scripts': CONSOLE_SCRIPTS},
)
link link
link to the project
Command-line tool to tweet an image with description.
The task: write a command, such as tweetup -f <path> -d <description>, that successfully tweets an image with an appended description.
In this assignment we face some issues
First, we have to choose which module to use from the list of available ones. We were told that user/password authentication would be preferable, but this authentication method is deprecated and is only supported by the tweepy module. Since Twitter clearly states that only OAuth authentication is supported, I decided to use OAuth. Therefore, my chosen module is Twython, because it supports update_status_with_media.
Second, how to store the user credentials. As I'm using OAuth, authentication is handled by the user granting access on the web and by the resulting authorization tokens. On the first run, this app opens a web browser tab asking the user to allow the app to use their Twitter account, and asks the user to copy the PIN provided so that the app can obtain the tokens. Once this is done, the tokens are stored in ~/.tweetiruprc for later runs.
install with pip:
$ pip install twython $ pip install -i https://testpypi.python.org/pypi tweetirup
Code snippet:
#!/usr/bin/env python
"""
Command-line twitter updater tool.
"""
import os
from sys import exit
from os import path
from twython import Twython, TwythonError
from webbrowser import open as webopen
from argparse import ArgumentParser

try:
    read_input = raw_input  # Python 2
except NameError:
    read_input = input  # Python 3

# NOTE(review): the application credentials are committed in the source;
# anyone with the file can act as this app. Consider loading them from the
# environment or from a file kept out of version control.
CONSUMER_KEY = "cGZNRDAMsJIDqDtpasgg"
CONSUMER_SECRET = "tIpyNSjr32wiCSPJeIdD8qtxMNOyGRohcbX9nMtNg"

# Where the user's OAuth tokens are kept between runs.
TOKENS_FILE = "~/.tweetiruprc"


def _load_tokens(rc_path):
    """Return (token, secret) read from rc_path, or None.

    None is returned when the file cannot be opened or when either of its
    first two lines lacks a value after the "NAME: " prefix.
    """
    try:
        with open(rc_path, "r") as tokens_f:
            lines = [tokens_f.readline(), tokens_f.readline()]
    except IOError:
        return None
    values = []
    for line in lines:
        fields = line.split(" ")
        if len(fields) < 2 or not fields[1].strip():
            return None
        values.append(fields[1].strip())
    return values[0], values[1]


def _store_tokens(rc_path, token, secret):
    """Write the tokens to rc_path, readable by the owner only."""
    # 0o600: the file holds account credentials.
    descriptor = os.open(rc_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(descriptor, "w") as tokens_f:
        tokens_f.write("OAUTH_TOKEN: %s\n" % token)
        tokens_f.write("OAUTH_TOKEN_SECRET: %s\n" % secret)


def auth_control():
    """
    Controls the authentication process.

    At the very first run (or when the stored tokens are unusable), it asks
    the user to authorize the app in the browser and to enter the pin
    generated. The resulting tokens are stored for further runs.

    Exits with "Unexpected error" when the tokens cannot be stored.

    Returns: Twython instance built with the consumer and user tokens.
    """
    rc_path = path.expanduser(TOKENS_FILE)
    tokens = _load_tokens(rc_path)
    if tokens is None:
        tokens = get_auth_tokens()
        try:
            _store_tokens(rc_path, tokens[0], tokens[1])
        except (IOError, OSError):
            exit("Unexpected error")
    token, secret = tokens
    return Twython(CONSUMER_KEY, CONSUMER_SECRET, token, secret)


def get_auth_tokens():
    """
    Gets authentication tokens using the Twython procedure.

    Opens a browser tab asking the user to allow this app, then prompts
    for the pin shown there.

    Returns: tokens "OAUTH_TOKEN" and "OAUTH_TOKEN_SECRET" in a tuple
    """
    first_step = Twython(CONSUMER_KEY, CONSUMER_SECRET)
    mid_step = first_step.get_authentication_tokens()
    webopen(mid_step["auth_url"], 2)
    auth_pin = read_input(
        "This is your first time using this app, you have to"
        " authorize it.\nA new tab in your browser has been open"
        ", where you can authorize this app. Remember to copy"
        " the pin number given to you.\n\nEnter the pin number"
        " here: ")
    twy = Twython(CONSUMER_KEY, CONSUMER_SECRET,
                  mid_step["oauth_token"], mid_step["oauth_token_secret"])
    final_tokens = twy.get_authorized_tokens(auth_pin)
    return final_tokens["oauth_token"], final_tokens["oauth_token_secret"]


def tweet_image(image_path, status):
    """
    Tweets an image with its corresponding description.
    Uses Twython's update_status_with_media method.

    param image_path: the path to the image
    param status: the description (tweet, status) annexed to this image

    Exits with an error message when the image cannot be opened or the
    upload fails.
    """
    try:
        pic = open(image_path, "rb")
    except IOError as e:
        exit("\n*Tweetir'up!* Error opening image: %s" % e)
    # 'with' closes the image on every path, including exit().
    with pic:
        try:
            # NOTE(review): update_status_with_media looks deprecated in
            # Twython; confirm it is still supported.
            auth_control().update_status_with_media(media=pic, status=status)
        except TwythonError:
            exit("\n*Tweetir'up!* Unespected server error")
        print("\n*Tweetir'up!* Successfully twitted image")


if __name__ == "__main__":
    # Parsing options:
    #   -f --file:        (required) path to image
    #   -d --description: (optional) description to append to the image.
    #                     If not provided, adds default string
    parser = ArgumentParser()
    parser.add_argument("-f", "--file", help="Add path to image",
                        required=True)
    parser.add_argument("-d", "--description",
                        help="Add optional description of the image")
    args = parser.parse_args()
    descript = args.description or "Twitted with Tweetir'up!"
    tweet_image(args.file, descript)