I tried to follow the generator pipeline style (see David Beazley's famous presentation) for finding duplicate files (similar to answers here). It seems pretty straightforward with MapReduce, so I thought it should also be possible to produce clean, simple code with a generator pipeline. I tried both lambdas and named functions, but couldn't find how.
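To make the MapReduce framing concrete, the shape I have in mind is roughly the following (just an illustrative sketch with placeholder names, not the functions in my actual code):

import collections
import hashlib

def file_digest(path, buffer_size=2 ** 20):
    # "map" step: one path -> one SHA-256 digest
    hash_obj = hashlib.sha256()
    with open(path, mode='rb') as file:
        for chunk in iter(lambda: file.read(buffer_size), b''):
            hash_obj.update(chunk)
    return hash_obj.digest()

def group_duplicates(paths):
    # "reduce" step: group paths by digest, keep groups of two or more
    groups = collections.defaultdict(list)
    for path in paths:
        groups[file_digest(path)].append(path)
    return [group for group in groups.values() if len(group) >= 2]

And here is the actual code: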
# python 3.5 but nothing important lost if I port it to python 2.7
import os
import glob
import collections
import hashlib
import functools

BUFFER_SIZE = 2 ** 20

def get_files(filepaths):
    for filepath in filepaths:
        yield open(filepath, mode='rb'), filepath

def read_files(files):
    for file, filepath in files:
        for data in iter(functools.partial(file.read, BUFFER_SIZE), b''):
            yield data, filepath

def get_digests(data_iter):
    current_filepath = None
    hash_obj = hashlib.sha256()
    for data, filepath in data_iter:
        if filepath != current_filepath:
            if current_filepath is not None:
                yield hash_obj.digest(), current_filepath
            current_filepath = filepath
        hash_obj.update(data)
    yield hash_obj.digest(), current_filepath

def find_duplicates(root_folder):
    '''
    Args:
        root_folder: folder to start searching from
    Returns:
        a list of lists of paths that correspond to duplicate files
    '''
    # combine generators into a pipeline
    paths = glob.iglob(os.path.join(root_folder, '**'), recursive=True)
    filepaths = filter(os.path.isfile, paths)
    files = get_files(filepaths)
    data_iter = read_files(files)
    digests = get_digests(data_iter)
    # collect data into a dictionary, then list
    # I feel this part is ok
    duplicates = collections.defaultdict(list)
    for digest, filepath in digests:
        duplicates[digest].append(filepath)
    return [v for v in duplicates.values() if len(v) >= 2]

# print duplicate files found in the current folder or below
duplicates = find_duplicates('.')
for group in duplicates:
    print('the following files are duplicates:')
    for filename in group:
        print(filename)
    print('\n')
Update:
Here's the slightly modified code from @ferada's answer (which fixed a bug in my code and made it much cleaner). Per @ferada's suggestion, I made get_digest deal only with digest calculation, and factored out the grouping code.
import pprint, os, hashlib, functools, itertools, sys, operator

BUFFER_SIZE = 2 ** 20

def read_files(filepaths):
    for filepath in filepaths:
        with open(filepath, mode='rb') as file:
            for data in iter(functools.partial(file.read, BUFFER_SIZE), b''):
                yield data, filepath

def get_digest(hash_obj, iterator):
    for data in iterator:
        hash_obj.update(data)
    return hash_obj.digest()

def get_digests(data_iter):
    # read_files yields all chunks of a file consecutively, so grouping by
    # filepath needs no sorting
    for filepath, group in itertools.groupby(data_iter, key=lambda x: x[1]):
        yield get_digest(hashlib.sha256(), map(operator.itemgetter(0), group)), filepath

def scantree(path):
    """Recursively yield DirEntry objects for given directory.

    From https://stackoverflow.com/a/33135143/336527
    """
    with os.scandir(path) as it:
        for entry in it:
            if entry.is_dir(follow_symlinks=False):
                yield from scantree(entry.path)
            else:
                yield entry

def find_files(root_folder):
    '''Yields full paths of all files starting with root_folder, recursively'''
    for entry in scantree(root_folder):
        if entry.is_file():
            yield entry.path

def find_duplicates(root_folder):
    '''
    Args:
        root_folder: folder to start searching from
    Yields:
        Tuples of paths that correspond to duplicate files
    '''
    filepaths = find_files(root_folder)
    data_iter = read_files(filepaths)
    digests = get_digests(data_iter)
    # groupby only groups adjacent equal keys, so sort by digest first
    for _, group in itertools.groupby(sorted(digests), key=lambda x: x[0]):
        _, filepaths = zip(*group)
        if len(filepaths) >= 2:
            yield filepaths

def main():
    folder = sys.argv[1]
    for dup in find_duplicates(folder):
        pprint.pprint(dup)

if __name__ == '__main__':
    main()
Passing filepath around as the second element of every yielded tuple remains an annoyance to be fixed.
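One direction I'm considering (just a sketch with made-up names, not part of the code above): have the reader yield one (filepath, chunk iterator) pair per file, so that only the digest stage ever has to pair a digest with its path.

import functools
import hashlib

BUFFER_SIZE = 2 ** 20

def chunked_reader(filepaths):
    # yields one (filepath, chunk iterator) pair per file; the file stays open
    # while this generator is suspended at the yield
    for filepath in filepaths:
        with open(filepath, mode='rb') as file:
            yield filepath, iter(functools.partial(file.read, BUFFER_SIZE), b'')

def digests_by_path(path_chunk_pairs):
    # the only stage that pairs a digest with its path
    for filepath, chunks in path_chunk_pairs:
        hash_obj = hashlib.sha256()
        for chunk in chunks:
            hash_obj.update(chunk)
        yield hash_obj.digest(), filepath

# e.g. digests = digests_by_path(chunked_reader(find_files(root_folder)))

The caveat is that each chunk iterator has to be fully consumed before the outer generator is advanced, because the file is closed as soon as chunked_reader resumes; digests_by_path consumes it eagerly, so the pipeline order is preserved.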