
These lines of code...

command = 'md5sum ' + filepath

and

command = 'diff %s %s' % (filename1, filename2)

... should make you very suspicious. In general, you should use caution anytime you compose a string (whether by concatenation or interpolation) that will be interpreted by another computer system.

@Simon mentioned the possibility of failure if a filename happens to contain a space. The bug is actually far more insidious than that: you have an arbitrary command execution vulnerability. If a file has a hostile name that contains shell metacharacters, it can cause your program to read any file (subject to file permissions), write to any file, pipe input to/from a command... anything could happen!
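
To make that concrete, here is a sketch of how a hostile name exploits the first line above (the filename is hypothetical, and the danger assumes the composed string eventually reaches a shell, e.g. via os.system or shell=True):

filepath = 'innocent.pdf; rm -rf ~'     # a perfectly legal filename
command = 'md5sum ' + filepath
# The shell parses this as two commands:
#     md5sum innocent.pdf
#     rm -rf ~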

The best way to avoid arbitrary command execution vulnerabilities would be to avoid executing external commands altogether. It's not difficult to use hashlib or to read and compare two files in Python (sketched below). The next best strategy, though, would be to use subprocess.Popen(args), where args is a list.

from subprocess import Popen, run, DEVNULL, PIPE

def are_identical(path1, path2):
    """Determine whether two files have identical contents
    using the diff Unix command."""
    return 0 == run(['diff', path1, path2], stdout=DEVNULL).returncode

def md5_checksum(path):
    """Obtain the MD5 checksum of a file (as a string of hex digits)
    using the md5sum Unix command."""
    with Popen(['md5sum', path], stdout=PIPE) as proc:
        for line in proc.stdout:
            return line.split()[0].decode('ASCII')
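
And for the no-external-commands route, a pure-Python version of the same two helpers is short. Here is a sketch using the standard hashlib and filecmp modules (the 64 KiB chunk size is an arbitrary choice):

import filecmp
import hashlib

def are_identical(path1, path2):
    """Determine whether two files have identical contents,
    comparing byte by byte without spawning a subprocess."""
    return filecmp.cmp(path1, path2, shallow=False)

def md5_checksum(path):
    """Obtain the MD5 checksum of a file (as a string of hex digits),
    reading it in chunks so large files don't exhaust memory."""
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 16), b''):
            md5.update(chunk)
    return md5.hexdigest()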

The md5_checksum_table function is pretty good. You could get greater flexibility for free with two tweaks:

  • Make the checksum algorithm a parameter.
  • Pass a glob pattern instead of a suffix.
import fnmatch
import os

def checksum_table(checksum_algorithm, dirname, pattern):
    table = {}
    for root, sub, files in os.walk(dirname):
        for file in fnmatch.filter(files, pattern):
            path = os.path.join(root, file)
            checksum = checksum_algorithm(path)
            table.setdefault(checksum, []).append(path)
    return table
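
With the algorithm as a parameter and a glob pattern, varying the search is a one-line change at the call site. For example (sha256_checksum is a hypothetical helper — the hashlib sketch above with md5 replaced by sha256 — and the second path is made up):

table = checksum_table(md5_checksum, '/media/sf_Shared/', '*.pdf')
table = checksum_table(sha256_checksum, '/home/user/books/', '*.epub')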

The are_identical() function isn't quite right. Suppose you call are_identical([a, b, c]), where files a and b are identical, but c is different — what happens?

The problem is that the interface is logically flawed. You can't take more than two files and summarize the situation as a single boolean. diff can only work on pairs of files. (Well, GNU diff can do three-way diffs, but that won't help you here.)
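
To make the flaw concrete (hypothetical files where a and b match but c differs):

are_identical(a, b)        # True
are_identical(b, c)        # False
are_identical([a, b, c])   # True? False? One boolean can't describe this.

What you want instead is a function that partitions the candidates into groups: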

def duplicates(paths):
    """Partition paths into sets whose contents are identical. Files that
    are not identical with any other file are omitted."""
    prototypes = []
    for path in paths:
        for proto in prototypes:
            if are_identical(proto[0], path):
                proto.append(path)
                break
        else:
            # No break: path matched no existing group, so it starts a new one.
            prototypes.append([path])
    for dup_set in prototypes:
        if len(dup_set) > 1:
            yield set(dup_set)

def main():
    table = checksum_table(md5_checksum, '/media/sf_Shared/', '*.pdf')
    for md5, dup_candidates in table.items():
        for dup_files in duplicates(dup_candidates):
            print("The following files are identical, with MD5 {}:".format(md5))
            for path in sorted(dup_files):
                print('    ' + path)
