Most Python "duplicate file finder" scripts I found do a brute-force of calculating the hashes of all files under a directory. So, I wrote my own -- hopefully faster -- script to kind of do things more intelligently.
Basically, it first searches for files of exact same size, then it compares only N bytes at the head and tail of the files, and finally it compares the files' full hashes.
My #1 concern of course would be correctness, followed closely by maintainability.
__author__ = 'pepoluan'

import os
import hashlib
from glob import glob
from itertools import chain


# Global variables
class G:
    OutFormat = 'list'  # Possible values: 'list', 'csv'
    OutFile = None
    StartPaths = [
        'D:\\DATA_2',
        'D:\\DATA_1',
        'D:\\'
    ]
    PartialCheckSize = 8192
    FullFileHash = True
    MinSize = 16 * 1024 * 1024
    ProgPeriod = 1000
    FullBlockSize = 1024 * 1024
    Quiet = False
    HashFunc = hashlib.md5


def get_walker_generator(at_path):
    # Walk the tree and glob every file in each directory, escaping the
    # glob metacharacters '[' and ']' that may appear in directory names.
    return (
        chain.from_iterable(
            glob(
                os.path.join(
                    x[0].replace('[', '[[]').replace(']', '[]]'),
                    '*.*'
                )
            ) for x in os.walk(at_path)
        )
    )


def dict_filter_by_len(rawdict, minlen=2):
    # Keep only the groups that can still contain duplicates.
    assert isinstance(rawdict, dict)
    return {k: v for k, v in rawdict.items() if len(v) >= minlen}


def qprint(*args, **kwargs):
    if not G.Quiet:
        print(*args, **kwargs)


def get_dupes_by_size(path_list):
    qprint('===== Recursively stat-ing {0}'.format(path_list))
    processed = set()
    size_dict = {}
    for statpath in path_list:
        c = 0
        uniq_in_path = 0
        qprint('{0}...'.format(statpath), end='')
        for fname in get_walker_generator(statpath):
            try:
                if c >= G.ProgPeriod:
                    qprint('.', end='', flush=True)
                    c = 0
                if fname not in processed:
                    c += 1
                    uniq_in_path += 1
                    fstat = os.stat(fname)
                    fsize = fstat.st_size
                    flist = size_dict.get(fsize, set())
                    flist.add(fname)
                    size_dict[fsize] = flist
                    processed.add(fname)
            except:
                print('\nException on ', fname)
                raise
        qprint(uniq_in_path)
    qprint('\nTotal files: ', len(processed))
    # The (None, size) key keeps "size" in the last slot, so every later
    # pass can fetch it uniformly via selector[-1].
    dupe_sizes = {
        (None, sz): list(fset)
        for sz, fset in size_dict.items()
        if sz >= G.MinSize and len(fset) > 1
    }
    qprint('Dupes: ', len(dupe_sizes))
    return dupe_sizes


def refine_dupes_by_partial_hash(dupes_dict, partial_check_size=G.PartialCheckSize, hashfunc=G.HashFunc):
    assert isinstance(dupes_dict, dict)
    qprint('===== Checking hash of first and last {0} bytes ====='.format(partial_check_size))
    qprint('Processing...', end='', flush=True)
    size_and_hashes = {}
    for selector, flist in dupes_dict.items():
        fsize = selector[-1]
        for fname in flist:
            with open(fname, 'rb') as fin:
                hash_front = hashfunc(fin.read(partial_check_size)).hexdigest()
                seek_targ = fsize - partial_check_size
                if seek_targ > 0:
                    fin.seek(seek_targ)
                    hash_rear = hashfunc(fin.read(partial_check_size)).hexdigest()
                else:
                    hash_rear = hash_front
            # "size" at rear, so a simple print will still result in a
            # nicely-aligned table
            selector = (hash_front, hash_rear, fsize)
            flist = size_and_hashes.get(selector, [])
            flist.append(fname)
            size_and_hashes[selector] = flist
            qprint('.', end='', flush=True)
    dupe_exact = dict_filter_by_len(size_and_hashes)
    qprint('\nDupes: ', len(dupe_exact))
    return dupe_exact


def refine_dupes_by_full_hash(dupes_dict, block_size=G.FullBlockSize, hashfunc=G.HashFunc):
    assert isinstance(dupes_dict, dict)
    qprint('===== Checking full hashes of Dupes')
    qprint('Processing...', end='', flush=True)
    fullhashes = {}
    for selector, flist in dupes_dict.items():
        sz = selector[-1]  # Save size so we can still inform the user of the size
        for fname in flist:
            hasher = hashfunc()
            with open(fname, 'rb') as fin:
                while True:
                    buf = fin.read(block_size)
                    if not buf:
                        break
                    hasher.update(buf)
            # "size" at rear, so a simple print will still result in a
            # nicely-aligned table
            slct = (hasher.hexdigest(), sz)
            flist = fullhashes.get(slct, [])
            flist.append(fname)
            fullhashes[slct] = flist
            qprint('.', end='', flush=True)
    dupe_exact = dict_filter_by_len(fullhashes)
    qprint('\nDupes: ', len(dupe_exact))
    return dupe_exact


def output_results(dupes_dict, out_format=G.OutFormat, out_file=G.OutFile):
    assert isinstance(dupes_dict, dict)
    kiys = [k for k in dupes_dict]
    kiys.sort(key=lambda x: x[-1])  # Sort groups by size (always the last element)
    if out_file is not None:
        qprint('Writing result in "{0}" format to file: {1} ...'.format(out_format, out_file), end='')
    else:
        qprint()
    if out_format == 'list':
        for kiy in kiys:
            flist = dupes_dict[kiy]
            print('-- {0}:'.format(kiy), file=out_file)
            flist.sort()
            for fname in flist:
                print(' {0}'.format(fname), file=out_file)
    elif out_format == 'csv':
        print('"Ord","Selector","FullPath"', file=out_file)
        order = 1
        for kiy in kiys:
            flist = dupes_dict[kiy]
            flist.sort()
            for fname in flist:
                print('"{0}","{1}","{2}"'.format(order, kiy, fname), file=out_file)
                order += 1
    if out_file is not None:
        qprint('done.')


def _main():
    dupes = get_dupes_by_size(G.StartPaths)
    dupes = refine_dupes_by_partial_hash(dupes)
    if G.FullFileHash:
        dupes = refine_dupes_by_full_hash(dupes)
    output_results(dupes, out_format=G.OutFormat, out_file=G.OutFile)


if __name__ == '__main__':
    _main()
-
  Dan (Sep 14, 2015 at 7:58): Hi, please don't edit your question's code after answers. I appreciate you're happy you got the answer you wanted, but simply saying so is enough for us :-) I'm not sure whether adding code after the fact is allowed to demonstrate your finalised solution, but you certainly should not modify the original code in the question, as this invalidates answers. I've rolled back your code to what it was before you edited.
-
  pepoluan (Sep 14, 2015 at 8:03): @ARedHerring Ah, okay, sorry about that. Is it okay, though, to upload the end result to a pastebin-like site and edit the original question? (Also, to the best of my recollection, I did not edit the original code in the question.)
-
  Dan (Sep 14, 2015 at 8:04): Regarding pastebin, I don't know (nor am I convinced you would gain much from it, as your question is already marked as answered, but it is your question and your choice). About the edit: you definitely did edit the original code in the question :-) You can see in the revision log that your edit replaced the question almost entirely, including the original excerpt of code you provided, which I rolled back.
-
  pepoluan (Sep 14, 2015 at 8:23): Well, there's that line in pale blue that says "182 identical lines skipped", so I think I remembered correctly that I didn't overwrite/edit the original question ;-) ... It's okay. Just in case someone wants to see the Really Final version, it's shared here: bitbucket.org/snippets/pepoluan/z9Gkj
-
  Dan (Sep 14, 2015 at 8:51): Correct you are, I apologise... I would not have known that unless I clicked side-by-side, and neither would the other person who rolled back. Very confusing.
2 Answers
Don't Repeat Yourself
The refine_dupes_by_partial_hash function is almost identical to refine_dupes_by_full_hash; the only difference is what is done with each file. Factor the difference out into a hashing_strategy callable:

    def refine_dupes(dupes_dict, hashing_strategy, block_size, hashfunc):
        hashes = {}
        for selector, flist in dupes_dict.items():
            fsize = selector[-1]
            for fname in flist:
                with open(fname, 'rb') as fin:
                    result = hashing_strategy(fin, fsize, block_size, hashfunc)
                flist = hashes.get(result, [])
                flist.append(fname)
                hashes[result] = flist
        dupe_exact = dict_filter_by_len(hashes)
        return dupe_exact

In fact, I'd go one step further and make hashing_strategy a class to encapsulate block_size and hashfunc.
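A minimal sketch of what that class-based strategy might look like; the class names and the __call__ signature here are my illustration, not something from the thread. With this, refine_dupes would shrink to refine_dupes(dupes_dict, hashing_strategy) and the call inside it to hashing_strategy(fin, fsize):

    import hashlib

    class FullHashStrategy:
        """Hash the whole file in block_size chunks."""

        def __init__(self, hashfunc=hashlib.md5, block_size=1024 * 1024):
            self.hashfunc = hashfunc
            self.block_size = block_size

        def __call__(self, fin, fsize):
            hasher = self.hashfunc()
            while True:
                buf = fin.read(self.block_size)
                if not buf:
                    break
                hasher.update(buf)
            return hasher.hexdigest(), fsize  # size last, as in the original selectors

    class PartialHashStrategy:
        """Hash only the first and last check_size bytes of the file."""

        def __init__(self, hashfunc=hashlib.md5, check_size=8192):
            self.hashfunc = hashfunc
            self.check_size = check_size

        def __call__(self, fin, fsize):
            hash_front = self.hashfunc(fin.read(self.check_size)).hexdigest()
            if fsize > self.check_size:
                fin.seek(fsize - self.check_size)
                hash_rear = self.hashfunc(fin.read(self.check_size)).hexdigest()
            else:
                hash_rear = hash_front
            return hash_front, hash_rear, fsize

Each refinement pass then differs only in which strategy object is passed in.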
Use builtins where possible:

    for order, fname in enumerate(flist, 1):

instead of manually incrementing order.

Use sys.argv instead of hardcoding paths.
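A minimal sketch of that change, reusing the question's names and assuming the start paths are simply listed on the command line, with the hardcoded list as a fallback:

    import sys

    def _main():
        # Use paths given on the command line, else the hardcoded defaults
        start_paths = sys.argv[1:] if len(sys.argv) > 1 else G.StartPaths
        dupes = get_dupes_by_size(start_paths)
        dupes = refine_dupes_by_partial_hash(dupes)
        if G.FullFileHash:
            dupes = refine_dupes_by_full_hash(dupes)
        output_results(dupes, out_format=G.OutFormat, out_file=G.OutFile)

argparse would be the natural next step once options such as the output format or minimum size should be configurable too.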
-
  pepoluan (Sep 14, 2015 at 3:38): Agree with the use of sys.argv instead of hardcoding. In fact, that's next on the drawing board :-) I just hardcoded the paths for now to focus on correctness. Anyway, nice tips! I'll certainly consider the 'class-ification' of hashing_strategy.
-
  pepoluan (Sep 14, 2015 at 3:51): Oh, about incrementing order manually: I did that instead of using enumerate() because flist is iterated over a dict's values, and the lengths of the dict's values are not guaranteed to be the same. E.g.: {1: ['a', 'b'], 2: ['c', 'd', 'e'], 3: ['f', 'g', 'h', 'i'], 4: ['j', 'k']}
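The two points above can be reconciled: flatten the (selector, filename) pairs across all of the dict's value lists into one generator, and let enumerate() supply the running counter. A sketch of the csv branch rewritten that way, as a hypothetical write_csv helper (my illustration, not from the thread):

    def write_csv(dupes_dict, out_file=None):
        print('"Ord","Selector","FullPath"', file=out_file)
        # Sort groups by size (last element of the selector), then flatten
        # (selector, filename) pairs so enumerate() can number every row.
        kiys = sorted(dupes_dict, key=lambda k: k[-1])
        pairs = ((kiy, fname) for kiy in kiys for fname in sorted(dupes_dict[kiy]))
        for order, (kiy, fname) in enumerate(pairs, 1):
            print('"{0}","{1}","{2}"'.format(order, kiy, fname), file=out_file)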
To improve maintainability, the Python style guide is key: PEP 8.
In particular, your global variables should all be UPPER_CASE to signify that they're constants. PascalCase is for classes, so it makes these variables look like something they're not.
Also, you don't need the G class. Python's scoping means that if you just declare those variables at module level, outside of any class or function, they'll be global for this file. And if you import this file as a module, you can still reference them as module_name.CONSTANT.
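For instance, a few of the settings rewritten in the suggested style (a sketch; only some of the values are shown):

    import hashlib

    # Module-level constants, named per PEP 8
    OUT_FORMAT = 'list'          # possible values: 'list', 'csv'
    PARTIAL_CHECK_SIZE = 8192    # bytes hashed at each end of a file
    MIN_SIZE = 16 * 1024 * 1024  # ignore files smaller than this
    HASH_FUNC = hashlib.md5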
Lastly, docstrings are a must for maintainability. You don't explain what your functions do at all, making it harder for others to use or change them. Python has another PEP, PEP 257, that gives a lot of advice on how to write good docstrings.
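As an illustration (the wording is mine, not the answerer's), a docstring for dict_filter_by_len might read:

    def dict_filter_by_len(rawdict, minlen=2):
        """Return the items of rawdict whose values have at least minlen elements.

        Used to drop candidate groups that can no longer contain duplicates.
        """
        assert isinstance(rawdict, dict)
        return {k: v for k, v in rawdict.items() if len(v) >= minlen}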
-
  pepoluan (Sep 14, 2015 at 3:40): I encapsulated the global vars into a G class to allow modification of those vars without having to pepper the code with global statements. Just a habit from other programs that I also applied here, even though these functions don't actually modify any of the global vars.
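To illustrate the habit described in the comment above (my example): rebinding a module-level name inside a function requires a global statement, whereas an attribute on a holder class can be rebound directly:

    QUIET = False

    def set_quiet_module():
        global QUIET     # required to rebind the module-level name
        QUIET = True

    class G:
        Quiet = False

    def set_quiet_class():
        G.Quiet = True   # no 'global' needed; we rebind an attribute of G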
-
  pepoluan (Sep 14, 2015 at 3:41): And about the docstrings, I agree. I actually stripped them out to make the listing here shorter. In hindsight, not a good idea xP
-
  SuperBiasedMan (Sep 14, 2015 at 4:03): @pepoluan Ah, the idea of avoiding globals hadn't occurred to me. Smart! I'd just note that somewhere... Maybe you had a docstring for it. XD