I created a small Python utility that analyzes multiple same-size files and reports the differences between them byte by byte (i.e. Hamming distance, not Levenshtein distance). I am planning to extend its capabilities to allow a GUI display. I'm mostly interested in scalability and usability, with efficiency still being a high priority. I've already examined "Python code that compares two files byte by byte" and other similar questions, but none of them seem to involve the same type of analysis for the purpose of reporting on differences.
Specifically, I'm interested in two details:
Is there anything else I should consider in my Difference container structure and its handling?
Are there any downsides to my frequent use of generators? I tried to let it cycle through each difference via a generator to minimize memory usage, but I think that also has a few performance implications.
import os
import os.path
from enum import IntEnum
from collections import defaultdict
def compare_iterator_elements(lst, f=lambda x: x):
    '''Return True if every element of lst maps to the same value under f.

    The elements are compared for equality with each other; their truth
    value is irrelevant. An empty input is considered uniform.

    lst -- any iterable (an iterator, generator, list, dict view, ...).
           Iterators are consumed up to the first mismatch.
    f   -- key function applied to each element before comparing;
           defaults to the identity.
    '''
    # iter() lets callers pass plain containers as well as iterators.
    iterator = iter(lst)
    try:
        first = f(next(iterator))
    except StopIteration:
        return True
    return not any(first != f(item) for item in iterator)
# Source: https://stackoverflow.com/a/312464
def chunks(l, n):
    '''Lazily split the sequence l into consecutive slices of length n.

    Every slice has n items except possibly the final one, which holds the
    len(l) % n leftover items when len(l) is not a multiple of n.
    '''
    yield from (l[begin:begin + n] for begin in range(0, len(l), n))
class Settings:
    '''Container class for difference formatting options.

    The options are class-level defaults; assign to an instance attribute to
    override one for a particular run.
    '''

    class Separation(IntEnum):
        '''Level of separation applied when displaying differences.'''
        # No separation: each difference is printed as one run.
        CLUMPED = 0
        # A line break is inserted after every block of block_width bytes.
        WIDTH_BROKEN = 1
        # Every block is reported as its own comparison.
        BLOCK_SEPARATED = 2

    separation = Separation.BLOCK_SEPARATED
    # Number of bytes displayed per block.
    block_width = 0x10
    # Width to which row labels are right-justified. Declared here so that a
    # default Settings() can be used for output without raising
    # AttributeError; 0 means no padding.
    name_width = 0
    # Planned future settings:
    # byte_group = 1
    # big_endian_groups = True
    # align_offset = 0
class Difference():
    '''Dictionary wrapper representing the difference at a specific point in a file.

    difference -- mapping of file name to a list of cells, one cell per byte
                  position; all lists are expected to have the same length.
    start      -- offset of the first cell.
    '''

    # Placeholder cell for a position that lies between two merged
    # differences. Two characters wide so that it occupies the same room as
    # a two-digit hex byte cell and joined rows stay column-aligned.
    BLANK = '  '

    def __init__(self, difference, start):
        self.difference = difference
        self.start = start

    def extend(self, other):
        '''Append a later Difference in place, padding the gap with BLANK cells.

        Raises ValueError if other starts before this difference or overlaps it.
        '''
        if other.start < self.start:
            raise ValueError(f'Differences can only be extended to differences with larger start values ({other.start} < {self.start}).')
        blank_width = other.start - (self.start + len(self))
        if blank_width < 0:
            raise ValueError('Extending to an overlapping Difference is not allowed.')
        for file, v in self.difference.items():
            v += [self.BLANK] * blank_width + other.difference[file]

    def __len__(self):
        '''Number of byte positions covered (0 when no files are recorded).'''
        return len(next(iter(self.difference.values()), ()))

    def __getitem__(self, value):
        '''Return a new Difference for an index or slice of this one.

        The start of the result is the offset of its first cell. Negative
        indices and slice bounds are resolved against the current length.
        An integer index yields a one-cell Difference, so the result still
        maps each file to a list of cells.
        '''
        if isinstance(value, slice):
            first = value.indices(len(self))[0]
            difference = {k: v[value] for k, v in self.difference.items()}
        else:
            # range() normalizes negative indices and raises IndexError
            # for out-of-range ones.
            first = range(len(self))[value]
            difference = {k: [v[first]] for k, v in self.difference.items()}
        return Difference(difference, self.start + first)
def _cli_difference(differences, start, settings):
    '''Outputs a difference in the command line.

    differences -- mapping of file name to a list of printable cells.
    start       -- offset of the first cell.
    settings    -- formatting options; name_width, separation and
                   block_width are read.

    With any separation above CLUMPED a column header is printed and each
    file's cells are laid out in rows of block_width cells, the first row
    being shifted right so that cells sit under their column. Otherwise all
    cells of a file are printed on a single line.
    '''
    name_width = settings.name_width
    blocked = settings.separation > settings.Separation.CLUMPED
    if blocked:
        # The last hex digit is replaced by 'X' because the header row
        # enumerates it. NOTE(review): this matches the columns only when
        # block_width is 0x10.
        start_display = hex(start)[:-1] + 'X'
        header = ' ' + ''.join(f' {i:x}' for i in range(settings.block_width))
    else:
        start_display = hex(start)
        header = ''
    print(f'{start_display.rjust(name_width)}:{header}')

    offset = start % settings.block_width
    first_row_width = settings.block_width - offset
    # Continuation rows are indented past the '<name>: ' label.
    row_break = '\n' + ' ' * (name_width + 2)
    for file_name, b in differences.items():
        label = f'{file_name.rjust(name_width)}: '
        if blocked:
            # Each header column is two characters wide, so every skipped
            # position needs two spaces of padding to keep cells aligned.
            rows = ['  ' * offset + ''.join(b[:first_row_width])]
            rows.extend(''.join(line)
                        for line in chunks(b[first_row_width:], settings.block_width))
            print(label + row_break.join(rows))
        else:
            print(label + ''.join(b))
    print()
def _separate_line_by_line(differences, settings):
    '''Generator function that separates and combines differences into fixed width blocks.

    differences -- iterable of difference objects ordered by increasing
                   start; each must support len(), slicing, .start and
                   .extend().
    settings    -- formatting options; only block_width is read.

    A difference that crosses a block boundary is split at the boundary, and
    consecutive differences that start inside the same block are merged (the
    earlier object is extended in place). Yields nothing for empty input.
    '''
    differences = iter(differences)
    try:
        current = next(differences)
    except StopIteration:
        # Nothing to report. Returning here matters: a StopIteration that
        # escapes a generator body is turned into RuntimeError (PEP 479).
        return

    block_width = settings.block_width
    while True:
        offset = current.start % block_width
        if offset + len(current) > block_width:
            # Emit the part that fits in this block, keep the remainder.
            room = block_width - offset
            yield current[:room]
            current = current[room:]
            continue
        try:
            next_diff = next(differences)
        except StopIteration:
            break
        block_start = current.start - offset
        if block_start <= next_diff.start < block_start + block_width:
            current.extend(next_diff)
        else:
            yield current
            current = next_diff
    yield current
def report_cli_differences(differences, settings):
    '''Print every difference of an iterable to the command line.

    The report is framed by a heading and a closing line. When the
    separation setting is BLOCK_SEPARATED the differences are first regrouped
    into fixed-width blocks; otherwise they are printed as supplied.
    '''
    print('Differences:\n')
    regroup = settings.separation == Settings.Separation.BLOCK_SEPARATED
    stream = _separate_line_by_line(differences, settings) if regroup else differences
    for entry in stream:
        _cli_difference(entry.difference, entry.start, settings)
    print('End of differences.')
def compare_files(file_args, difference_mask):
    '''Generator function that yields a Difference for each run of differing bytes in same-size files.

    file_args       -- file names to compare; all files must have the same size.
    difference_mask -- file names whose bytes must agree with each other for
                       a position to be reported; positions where these
                       files differ among themselves are ignored.

    Each yielded Difference maps every file name to the two-digit hex cells
    of the run, with start set to the offset of the run's first byte.

    Raises ValueError (on first iteration) if the file sizes differ. All
    files are closed when the generator finishes, is closed, or fails.
    '''
    if not compare_iterator_elements(iter(file_args), os.path.getsize):
        raise ValueError('The file sizes must be equal.')

    files = []
    try:
        # Opened one at a time inside the try so that a failing open() still
        # closes the files opened before it.
        for file_name in file_args:
            files.append(open(file_name, 'rb'))
        if not files:
            return

        def read_bytes():
            return {f.name: f.read(1) for f in files}

        def bytes_different(comparison):
            return (not compare_iterator_elements(iter(comparison.values()))
                    and compare_iterator_elements(
                        comparison[file_name] for file_name in difference_mask))

        while True:
            comparison = read_bytes()
            if not next(iter(comparison.values())):
                break  # end of file
            if not bytes_different(comparison):
                continue
            # The differing byte has already been read, so the file position
            # is one past it.
            start = files[0].tell() - 1
            different_bytes = defaultdict(list)
            while True:
                for file_name, b in comparison.items():
                    different_bytes[file_name].append(f'{ord(b):0>2x}')
                comparison = read_bytes()
                # At end of file every read is empty, hence equal, which
                # also ends the run.
                if not bytes_different(comparison):
                    break
            yield Difference(different_bytes, start)
    finally:
        for f in files:
            f.close()
if __name__ == '__main__':
    # Command-line entry point: parse the arguments, build the display
    # settings and print the report.
    import argparse

    arg_parser = argparse.ArgumentParser(description='Compares two or more same-size files for differences.')
    # nargs='*' (not '+') so that the default is actually used when no file
    # names are given; with '+' argparse demands at least one name.
    arg_parser.add_argument('files', metavar='files', nargs='*', default=[f for f in os.listdir() if os.path.isfile(f)], help='file names to compare')
    # choices turns an unsupported level into a usage error instead of a
    # ValueError traceback from the enum constructor.
    arg_parser.add_argument('--separation', default=2, type=int, choices=[level.value for level in Settings.Separation], help='Specifies the level of difference separation. 0 is no separation, 1 is carriage returns after every block, and 2 separates every new block into a new comparison.')
    arg_parser.add_argument('--ignored_diff', metavar='FILES', nargs='+', help='For the purpose of reporting on differences, ignore the positions where differences occur between the given files.', default=[])
    args = arg_parser.parse_args()

    file_args = args.files
    if not file_args:
        arg_parser.error('no files to compare')

    settings = Settings()
    settings.separation = Settings.Separation(args.separation)
    ignore_differences = args.ignored_diff
    # Labels are either a file name or a hex offset, so size the label column
    # for the longest of those (offsets are printed with hex(), not decimal).
    settings.name_width = max(len(hex(os.path.getsize(file_args[0]))),
                              *(len(file_name) for file_name in file_args))

    report_cli_differences(compare_files(file_args, ignore_differences), settings)