I tried to follow the generator pipeline style (see David Beazley's famous presentation) for finding duplicate files (similar to answers here). It seems pretty straightforward with MapReduce, so I thought it should also be possible to produce clean, simple code with a generator pipeline. I tried both lambdas and named functions, but couldn't find how.
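To make the MapReduce framing concrete, the shape I have in mind is roughly the following (just an illustrative sketch with placeholder names, not the functions in my actual code):

import collections
import hashlib

def file_digest(path, buffer_size=2 ** 20):
    # "map" step: one path -> one SHA-256 digest
    hash_obj = hashlib.sha256()
    with open(path, mode='rb') as file:
        for chunk in iter(lambda: file.read(buffer_size), b''):
            hash_obj.update(chunk)
    return hash_obj.digest()

def group_duplicates(paths):
    # "reduce" step: group paths by digest, keep groups of two or more
    groups = collections.defaultdict(list)
    for path in paths:
        groups[file_digest(path)].append(path)
    return [group for group in groups.values() if len(group) >= 2]

And here is the actual code: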
# python 3.5 but nothing important lost if I port it to python 2.7
import os
import glob
import collections
import hashlib
import functools

BUFFER_SIZE = 2 ** 20

def get_files(filepaths):
    for filepath in filepaths:
        yield open(filepath, mode='rb'), filepath

def read_files(files):
    for file, filepath in files:
        for data in iter(functools.partial(file.read, BUFFER_SIZE), b''):
            yield data, filepath

def get_digests(data_iter):
    current_filepath = None
    hash_obj = hashlib.sha256()
    for data, filepath in data_iter:
        if filepath != current_filepath:
            if current_filepath is not None:
                yield hash_obj.digest(), current_filepath
            current_filepath = filepath
        hash_obj.update(data)
    yield hash_obj.digest(), current_filepath

def find_duplicates(root_folder):
    '''
    Args:
        root_folder: folder to start searching from
    Returns:
        a list of lists of paths that correspond to duplicate files
    '''
    # combine generators into a pipeline
    paths = glob.iglob(os.path.join(root_folder, '**'), recursive=True)
    filepaths = filter(os.path.isfile, paths)
    files = get_files(filepaths)
    data_iter = read_files(files)
    digests = get_digests(data_iter)
    # collect data into a dictionary, then list
    # I feel this part is ok
    duplicates = collections.defaultdict(list)
    for digest, filepath in digests:
        duplicates[digest].append(filepath)
    return [v for v in duplicates.values() if len(v) >= 2]

# print duplicate files found in the current folder or below
duplicates = find_duplicates('.')
for group in duplicates:
    print('the following files are duplicates:')
    for filename in group:
        print(filename)
    print('\n')
Update:
Here's the slightly modified code from @ferada's answer (which fixed a bug in my code and made it much cleaner). Per @ferada's suggestion, I made get_digest deal only with digest calculation, and factored out the grouping code.
import pprint, os, hashlib, functools, itertools, sys, operator

BUFFER_SIZE = 2 ** 20

def read_files(filepaths):
    for filepath in filepaths:
        with open(filepath, mode='rb') as file:
            for data in iter(functools.partial(file.read, BUFFER_SIZE), b''):
                yield data, filepath

def get_digest(hash_obj, iterator):
    for data in iterator:
        hash_obj.update(data)
    return hash_obj.digest()

def get_digests(data_iter):
    # read_files yields all chunks of a file consecutively, so grouping by
    # filepath needs no sorting
    for filepath, group in itertools.groupby(data_iter, key=lambda x: x[1]):
        yield get_digest(hashlib.sha256(), map(operator.itemgetter(0), group)), filepath

def scantree(path):
    """Recursively yield DirEntry objects for given directory.

    From https://stackoverflow.com/a/33135143/336527
    """
    with os.scandir(path) as it:
        for entry in it:
            if entry.is_dir(follow_symlinks=False):
                yield from scantree(entry.path)
            else:
                yield entry

def find_files(root_folder):
    '''Yields full paths of all files starting with root_folder, recursively'''
    for entry in scantree(root_folder):
        if entry.is_file():
            yield entry.path

def find_duplicates(root_folder):
    '''
    Args:
        root_folder: folder to start searching from
    Yields:
        Tuples of paths that correspond to duplicate files
    '''
    filepaths = find_files(root_folder)
    data_iter = read_files(filepaths)
    digests = get_digests(data_iter)
    # groupby only groups adjacent equal keys, so sort by digest first
    for _, group in itertools.groupby(sorted(digests), key=lambda x: x[0]):
        _, filepaths = zip(*group)
        if len(filepaths) >= 2:
            yield filepaths

def main():
    folder = sys.argv[1]
    for dup in find_duplicates(folder):
        pprint.pprint(dup)

if __name__ == '__main__':
    main()
Passing filepath around as the second element of every yielded tuple remains an annoyance to be fixed.
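One direction I'm considering (just a sketch with made-up names, not part of the code above): have the reader yield one (filepath, chunk iterator) pair per file, so that only the digest stage ever has to pair a digest with its path.

import functools
import hashlib

BUFFER_SIZE = 2 ** 20

def chunked_reader(filepaths):
    # yields one (filepath, chunk iterator) pair per file; the file stays open
    # while this generator is suspended at the yield
    for filepath in filepaths:
        with open(filepath, mode='rb') as file:
            yield filepath, iter(functools.partial(file.read, BUFFER_SIZE), b'')

def digests_by_path(path_chunk_pairs):
    # the only stage that pairs a digest with its path
    for filepath, chunks in path_chunk_pairs:
        hash_obj = hashlib.sha256()
        for chunk in chunks:
            hash_obj.update(chunk)
        yield hash_obj.digest(), filepath

# e.g. digests = digests_by_path(chunked_reader(find_files(root_folder)))

The caveat is that each chunk iterator has to be fully consumed before the outer generator is advanced, because the file is closed as soon as chunked_reader resumes; digests_by_path consumes it eagerly, so the pipeline order is preserved.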