Most Python "duplicate file finder" scripts I found do a brute-force of calculating the hashes of all files under a directory. So, I wrote my own -- hopefully faster -- script to kind of do things more intelligently.
Basically, it first searches for files of exact same size, then it compares only N bytes at the head and tail of the files, and finally it compares the files' full hashes.
My #1 concern of course would be correctness, followed closely by maintainability.
__author__ = 'pepoluan'

import os
import hashlib
from glob import glob
from itertools import chain


# Global variables
class G:
    OutFormat = 'list'  # Possible values: 'list', 'csv'
    OutFile = None
    StartPaths = [
        'D:\\DATA_2',
        'D:\\DATA_1',
        'D:\\'
    ]
    PartialCheckSize = 8192
    FullFileHash = True
    MinSize = 16 * 1024 * 1024
    ProgPeriod = 1000
    FullBlockSize = 1024 * 1024
    Quiet = False
    HashFunc = hashlib.md5


def get_walker_generator(at_path):
    # Walk the tree and glob every file in each directory, escaping the
    # glob metacharacters '[' and ']' that may appear in directory names.
    return (
        chain.from_iterable(
            glob(
                os.path.join(
                    x[0].replace('[', '[[]').replace(']', '[]]'),
                    '*.*'
                )
            ) for x in os.walk(at_path)
        )
    )


def dict_filter_by_len(rawdict, minlen=2):
    # Keep only the groups that can still contain duplicates.
    assert isinstance(rawdict, dict)
    return {k: v for k, v in rawdict.items() if len(v) >= minlen}


def qprint(*args, **kwargs):
    if not G.Quiet:
        print(*args, **kwargs)


def get_dupes_by_size(path_list):
    qprint('===== Recursively stat-ing {0}'.format(path_list))
    processed = set()
    size_dict = {}
    for statpath in path_list:
        c = 0
        uniq_in_path = 0
        qprint('{0}...'.format(statpath), end='')
        for fname in get_walker_generator(statpath):
            try:
                if c >= G.ProgPeriod:
                    qprint('.', end='', flush=True)
                    c = 0
                if fname not in processed:
                    c += 1
                    uniq_in_path += 1
                    fstat = os.stat(fname)
                    fsize = fstat.st_size
                    flist = size_dict.get(fsize, set())
                    flist.add(fname)
                    size_dict[fsize] = flist
                    processed.add(fname)
            except:
                print('\nException on ', fname)
                raise
        qprint(uniq_in_path)
    qprint('\nTotal files: ', len(processed))
    # The (None, size) key keeps "size" in the last slot, so every later
    # pass can fetch it uniformly via selector[-1].
    dupe_sizes = {
        (None, sz): list(fset)
        for sz, fset in size_dict.items()
        if sz >= G.MinSize and len(fset) > 1
    }
    qprint('Dupes: ', len(dupe_sizes))
    return dupe_sizes


def refine_dupes_by_partial_hash(dupes_dict, partial_check_size=G.PartialCheckSize, hashfunc=G.HashFunc):
    assert isinstance(dupes_dict, dict)
    qprint('===== Checking hash of first and last {0} bytes ====='.format(partial_check_size))
    qprint('Processing...', end='', flush=True)
    size_and_hashes = {}
    for selector, flist in dupes_dict.items():
        fsize = selector[-1]
        for fname in flist:
            with open(fname, 'rb') as fin:
                hash_front = hashfunc(fin.read(partial_check_size)).hexdigest()
                seek_targ = fsize - partial_check_size
                if seek_targ > 0:
                    fin.seek(seek_targ)
                    hash_rear = hashfunc(fin.read(partial_check_size)).hexdigest()
                else:
                    hash_rear = hash_front
            # "size" at rear, so a simple print will still result in a
            # nicely-aligned table
            selector = (hash_front, hash_rear, fsize)
            flist = size_and_hashes.get(selector, [])
            flist.append(fname)
            size_and_hashes[selector] = flist
            qprint('.', end='', flush=True)
    dupe_exact = dict_filter_by_len(size_and_hashes)
    qprint('\nDupes: ', len(dupe_exact))
    return dupe_exact


def refine_dupes_by_full_hash(dupes_dict, block_size=G.FullBlockSize, hashfunc=G.HashFunc):
    assert isinstance(dupes_dict, dict)
    qprint('===== Checking full hashes of Dupes')
    qprint('Processing...', end='', flush=True)
    fullhashes = {}
    for selector, flist in dupes_dict.items():
        sz = selector[-1]  # Save size so we can still inform the user of the size
        for fname in flist:
            hasher = hashfunc()
            with open(fname, 'rb') as fin:
                while True:
                    buf = fin.read(block_size)
                    if not buf:
                        break
                    hasher.update(buf)
            # "size" at rear, so a simple print will still result in a
            # nicely-aligned table
            slct = (hasher.hexdigest(), sz)
            flist = fullhashes.get(slct, [])
            flist.append(fname)
            fullhashes[slct] = flist
            qprint('.', end='', flush=True)
    dupe_exact = dict_filter_by_len(fullhashes)
    qprint('\nDupes: ', len(dupe_exact))
    return dupe_exact


def output_results(dupes_dict, out_format=G.OutFormat, out_file=G.OutFile):
    assert isinstance(dupes_dict, dict)
    kiys = [k for k in dupes_dict]
    kiys.sort(key=lambda x: x[-1])  # Sort groups by size (always the last element)
    if out_file is not None:
        qprint('Writing result in "{0}" format to file: {1} ...'.format(out_format, out_file), end='')
    else:
        qprint()
    if out_format == 'list':
        for kiy in kiys:
            flist = dupes_dict[kiy]
            print('-- {0}:'.format(kiy), file=out_file)
            flist.sort()
            for fname in flist:
                print(' {0}'.format(fname), file=out_file)
    elif out_format == 'csv':
        print('"Ord","Selector","FullPath"', file=out_file)
        order = 1
        for kiy in kiys:
            flist = dupes_dict[kiy]
            flist.sort()
            for fname in flist:
                print('"{0}","{1}","{2}"'.format(order, kiy, fname), file=out_file)
                order += 1
    if out_file is not None:
        qprint('done.')


def _main():
    dupes = get_dupes_by_size(G.StartPaths)
    dupes = refine_dupes_by_partial_hash(dupes)
    if G.FullFileHash:
        dupes = refine_dupes_by_full_hash(dupes)
    output_results(dupes, out_format=G.OutFormat, out_file=G.OutFile)


if __name__ == '__main__':
    _main()
-
  Dan (Sep 14, 2015 at 7:58): Hi, please don't edit your question's code after answers. I appreciate you're happy you got the answer you wanted, but simply saying so is enough for us :-) I'm not sure whether adding code after the fact is allowed to demonstrate your finalised solution, but you certainly should not modify the original code in the question, as this invalidates answers. I've rolled back your code to what it was before you edited.
-
  pepoluan (Sep 14, 2015 at 8:03): @ARedHerring Ah, okay, sorry about that. Is it okay, though, to upload the end result to a pastebin-like site and edit the original question? (Also, to the best of my recollection, I did not edit the original code in the question.)
-
  Dan (Sep 14, 2015 at 8:04): Regarding pastebin, I don't know (nor am I convinced you would gain much from it, as your question is already marked as answered, but it is your question and your choice). About the edit: you definitely did edit the original code in the question :-) You can see in the revision log that your edit replaced the question almost entirely, including the original excerpt of code you provided, which I rolled back.
-
  pepoluan (Sep 14, 2015 at 8:23): Well, there's that line in pale blue that says "182 identical lines skipped", so I think I remembered correctly that I didn't overwrite/edit the original question ;-) ... It's okay. Just in case someone wants to see the Really Final version, it's shared here: bitbucket.org/snippets/pepoluan/z9Gkj
-
  Dan (Sep 14, 2015 at 8:51): Correct you are, I apologise... I would not have known that unless I clicked side-by-side, and neither would the other person who rolled back. Very confusing.
2 Answers
Don't Repeat Yourself
The refine_dupes_by_partial_hash function is almost identical to refine_dupes_by_full_hash; the only difference is what is done with each file. Factor the difference out into a hashing_strategy callable:

    def refine_dupes(dupes_dict, hashing_strategy, block_size, hashfunc):
        hashes = {}
        for selector, flist in dupes_dict.items():
            fsize = selector[-1]
            for fname in flist:
                with open(fname, 'rb') as fin:
                    result = hashing_strategy(fin, fsize, block_size, hashfunc)
                flist = hashes.get(result, [])
                flist.append(fname)
                hashes[result] = flist
        dupe_exact = dict_filter_by_len(hashes)
        return dupe_exact

In fact, I'd go one step further and make hashing_strategy a class to encapsulate block_size and hashfunc.
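A minimal sketch of what that class-based strategy might look like; the class names and the __call__ signature here are my illustration, not something from the thread. With this, refine_dupes would shrink to refine_dupes(dupes_dict, hashing_strategy) and the call inside it to hashing_strategy(fin, fsize):

    import hashlib

    class FullHashStrategy:
        """Hash the whole file in block_size chunks."""

        def __init__(self, hashfunc=hashlib.md5, block_size=1024 * 1024):
            self.hashfunc = hashfunc
            self.block_size = block_size

        def __call__(self, fin, fsize):
            hasher = self.hashfunc()
            while True:
                buf = fin.read(self.block_size)
                if not buf:
                    break
                hasher.update(buf)
            return hasher.hexdigest(), fsize  # size last, as in the original selectors

    class PartialHashStrategy:
        """Hash only the first and last check_size bytes of the file."""

        def __init__(self, hashfunc=hashlib.md5, check_size=8192):
            self.hashfunc = hashfunc
            self.check_size = check_size

        def __call__(self, fin, fsize):
            hash_front = self.hashfunc(fin.read(self.check_size)).hexdigest()
            if fsize > self.check_size:
                fin.seek(fsize - self.check_size)
                hash_rear = self.hashfunc(fin.read(self.check_size)).hexdigest()
            else:
                hash_rear = hash_front
            return hash_front, hash_rear, fsize

Each refinement pass then differs only in which strategy object is passed in.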
Use builtins where possible:

    for order, fname in enumerate(flist, 1):

instead of manually incrementing order.

Use sys.argv instead of hardcoding paths.
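A minimal sketch of that change, reusing the question's names and assuming the start paths are simply listed on the command line, with the hardcoded list as a fallback:

    import sys

    def _main():
        # Use paths given on the command line, else the hardcoded defaults
        start_paths = sys.argv[1:] if len(sys.argv) > 1 else G.StartPaths
        dupes = get_dupes_by_size(start_paths)
        dupes = refine_dupes_by_partial_hash(dupes)
        if G.FullFileHash:
            dupes = refine_dupes_by_full_hash(dupes)
        output_results(dupes, out_format=G.OutFormat, out_file=G.OutFile)

argparse would be the natural next step once options such as the output format or minimum size should be configurable too.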
-
  pepoluan (Sep 14, 2015 at 3:38): Agree with the use of sys.argv instead of hardcoding. In fact, that's next on the drawing board :-) I just hardcoded the paths for now to focus on correctness. Anyway, nice tips! I'll certainly consider the 'class-ification' of hashing_strategy.
-
  pepoluan (Sep 14, 2015 at 3:51): Oh, about incrementing order manually: I did that instead of using enumerate() because flist is iterated over a dict's values, and the lengths of the dict's values are not guaranteed to be the same. E.g.: {1: ['a', 'b'], 2: ['c', 'd', 'e'], 3: ['f', 'g', 'h', 'i'], 4: ['j', 'k']}
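The two points above can be reconciled: flatten the (selector, filename) pairs across all of the dict's value lists into one generator, and let enumerate() supply the running counter. A sketch of the csv branch rewritten that way, as a hypothetical write_csv helper (my illustration, not from the thread):

    def write_csv(dupes_dict, out_file=None):
        print('"Ord","Selector","FullPath"', file=out_file)
        # Sort groups by size (last element of the selector), then flatten
        # (selector, filename) pairs so enumerate() can number every row.
        kiys = sorted(dupes_dict, key=lambda k: k[-1])
        pairs = ((kiy, fname) for kiy in kiys for fname in sorted(dupes_dict[kiy]))
        for order, (kiy, fname) in enumerate(pairs, 1):
            print('"{0}","{1}","{2}"'.format(order, kiy, fname), file=out_file)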
To improve maintainability, the Python style guide is key: PEP 8.
In particular, your global variables should all be UPPER_CASE to signify that they're constants. PascalCase is for classes, so it makes these variables look like something they're not.
Also, you don't need the G class. Python's scoping means that if you just declare those variables at module level, outside of any class or function, they'll be global for this file. And if you import this file as a module, you can still reference them as module_name.CONSTANT.
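For instance, a few of the settings rewritten in the suggested style (a sketch; only some of the values are shown):

    import hashlib

    # Module-level constants, named per PEP 8
    OUT_FORMAT = 'list'          # possible values: 'list', 'csv'
    PARTIAL_CHECK_SIZE = 8192    # bytes hashed at each end of a file
    MIN_SIZE = 16 * 1024 * 1024  # ignore files smaller than this
    HASH_FUNC = hashlib.md5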
Lastly, docstrings are a must for maintainability. You don't explain what your functions do at all, making it harder for others to use or change them. Python has another PEP, PEP 257, that gives a lot of advice on how to write good docstrings.
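As an illustration (the wording is mine, not the answerer's), a docstring for dict_filter_by_len might read:

    def dict_filter_by_len(rawdict, minlen=2):
        """Return the items of rawdict whose values have at least minlen elements.

        Used to drop candidate groups that can no longer contain duplicates.
        """
        assert isinstance(rawdict, dict)
        return {k: v for k, v in rawdict.items() if len(v) >= minlen}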
-
  pepoluan (Sep 14, 2015 at 3:40): I encapsulated the global vars into a G class to allow modification of those vars without having to pepper the code with global statements. Just a habit from other programs that I also applied here, even though these functions don't actually modify any of the global vars.
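To illustrate the habit described in the comment above (my example): rebinding a module-level name inside a function requires a global statement, whereas an attribute on a holder class can be rebound directly:

    QUIET = False

    def set_quiet_module():
        global QUIET     # required to rebind the module-level name
        QUIET = True

    class G:
        Quiet = False

    def set_quiet_class():
        G.Quiet = True   # no 'global' needed; we rebind an attribute of G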
-
  pepoluan (Sep 14, 2015 at 3:41): And about the docstrings, I agree. I actually stripped them out to make the listing here shorter. In hindsight, not a good idea xP
-
  SuperBiasedMan (Sep 14, 2015 at 4:03): @pepoluan Ah, the idea of avoiding globals hadn't occurred to me. Smart! I'd just note that somewhere... Maybe you had a docstring for it. XD