5
\$\begingroup\$

I created a small Python utility to analyze multiple same-size files and report on their differences byte by byte (i.e., Hamming distance, not Levenshtein distance). I am planning to extend its capabilities to allow a GUI display. Mostly, I'm interested in scalability and usability, with efficiency still being a high priority. I've already examined "Python code that compares two files byte by byte" and other similar questions, but none seem to involve the same type of analysis for the purpose of reporting on differences.

Specifically, I'm interested in two details:

  1. Is there anything else I should consider in my Difference container structure and its handling?

  2. Are there any downsides to my frequent use of generators? I tried to let it cycle through each difference via generator, minimizing memory usage, but I think that also has a few performance implications.


import os
import os.path
from enum import IntEnum
from collections import defaultdict
def compare_iterator_elements(lst, f=lambda x: x):
    '''Return True if every element of lst compares equal after applying f.

    lst may be any iterable (a list, a generator, an iterator, ...); it is
    consumed up to the first mismatch.  f is a key function applied to each
    element before comparison and defaults to the identity.

    An empty iterable has nothing to disagree with, so it returns True, as
    does an iterable with a single element.
    '''
    iterator = iter(lst)  # lets callers pass plain iterables, not only iterators
    try:
        first = f(next(iterator))
    except StopIteration:
        # No elements at all: vacuously "all equal".
        return True
    for item in iterator:
        if first != f(item):
            return False
    return True
# Source: https://stackoverflow.com/a/312464
def chunks(l, n):
    '''Lazily split the sequence l into consecutive slices of length n.

    When len(l) is not an exact multiple of n, the final slice holds the
    remaining len(l) % n items.
    '''
    total = len(l)
    yield from (l[begin:begin + n] for begin in range(0, total, n))
class Settings:
    '''Container class for difference formatting options.

    The attributes below are class-level defaults; assign to an instance to
    override them for one run.
    '''

    class Separation(IntEnum):
        '''How reported differences are laid out.'''
        CLUMPED = 0  # no separation
        WIDTH_BROKEN = 1  # line break after every block
        BLOCK_SEPARATED = 2  # every new block becomes its own comparison

    separation = Separation.BLOCK_SEPARATED
    # Number of bytes shown per block/row.
    block_width = 0x10
    # Width to which row labels are right-justified.  Declared here so that a
    # Settings object is usable by the reporting code even when nothing has
    # assigned a width yet; 0 means "no padding".
    name_width = 0

    # Planned future settings:
    # byte_group = 1
    # big_endian_groups = True
    # align_offset = 0
class Difference():
    '''Dictionary wrapper representing the difference at a specific point in a file.

    difference maps each file name to a list with one entry per differing
    byte; all lists are expected to have the same length.  start is the
    offset of the first entry.
    '''

    # Filler stored for every byte position in a gap bridged by extend().
    # It is two characters wide so that it occupies the same space as the
    # two-digit hex strings the comparison code stores per byte.
    _BLANK = '  '

    def __init__(self, difference, start):
        self.difference = difference
        self.start = start

    def extend(self, other):
        '''Append other to this difference in place, padding any gap with blanks.

        Raises ValueError if other starts before this difference or overlaps it.
        '''
        if other.start < self.start:
            raise ValueError(f'Differences can only be extended to differences with larger start values ({other.start} < {self.start}).')
        blank_width = other.start - (self.start + len(self))
        if blank_width < 0:
            raise ValueError('Extending to an overlapping Difference is not allowed.')
        for file, v in self.difference.items():
            v += [self._BLANK] * blank_width + other.difference[file]

    def __len__(self):
        '''Return the number of byte positions covered (0 if no files are held).'''
        return len(next(iter(self.difference.values()), ()))

    def __getitem__(self, value):
        '''Return a new Difference for an index or slice, with start adjusted to match.'''
        if isinstance(value, slice):
            # slice.indices() resolves omitted and negative bounds to a real
            # position, so the new start is right for d[-2:] as well as d[2:].
            first = value.indices(len(self))[0]
            difference = {k: v[value] for k, v in self.difference.items()}
        else:
            first = value + len(self) if value < 0 else value
            # Keep the "list of per-byte entries" shape for a single index so
            # len() and extend() keep working on the result.
            difference = {k: [v[value]] for k, v in self.difference.items()}
        return Difference(difference, self.start + first)
def _cli_difference(differences, start, settings):
    '''Outputs a difference in the command line.

    differences maps each file name to its list of per-byte strings, start
    is the offset of the first entry, and settings supplies name_width,
    separation and block_width.

    With any separation above CLUMPED a column header is printed and each
    file's bytes are laid out in rows of block_width columns; otherwise each
    file's bytes are printed on one unbroken line.  A blank line follows the
    whole difference.
    '''
    name_width = settings.name_width
    block_width = settings.block_width
    blocked = settings.separation > settings.Separation.CLUMPED

    # In the blocked layouts the address label drops its last hex digit in
    # favour of 'X'; the column header printed after it carries the per-column
    # digits.
    start_display = hex(start)[:-1] + 'X' if blocked else hex(start)
    header = f'{start_display.rjust(name_width)}:'
    if blocked:
        header += ' ' + ''.join(f' {i:x}' for i in range(block_width))
    print(header)

    offset = start % block_width
    first_row = block_width - offset  # entries that still fit on the first row
    for file_name, b in differences.items():
        print(f'{file_name.rjust(name_width)}: ', end='')
        if blocked:
            # Every column is two characters wide (one header label, one
            # two-digit byte), so skipping `offset` columns takes two spaces
            # per column.
            print('  ' * offset, ''.join(b[:first_row]), sep='', end='')
            for line in chunks(b[first_row:], block_width):
                print('\n', ' ' * (name_width + 2), *line, sep='', end='')
            print()
        else:
            print(''.join(b))
    print()
def _separate_line_by_line(differences, settings):
    '''Generator function that separates and combines differences into fixed width blocks.

    differences is an iterable of difference objects in ascending start
    order.  A difference that crosses a block boundary is split at the
    boundary; consecutive differences that begin inside the same block are
    merged (via extend, which mutates the earlier object).  Blocks are
    settings.block_width wide.

    Yields nothing when differences is empty.
    '''
    differences = iter(differences)
    try:
        current = next(differences)
    except StopIteration:
        # No differences at all.  Letting StopIteration escape a generator
        # body is turned into RuntimeError (PEP 479), so end cleanly instead.
        return

    width = settings.block_width
    while True:
        offset = current.start % width
        if offset + len(current) > width:
            # Spills past the end of its block: emit the part that fits and
            # keep working on the remainder.
            next_block = width - offset
            yield current[:next_block]
            current = current[next_block:]
            continue

        try:
            next_diff = next(differences)
        except StopIteration:
            yield current
            return

        block_start = current.start - offset
        if block_start <= next_diff.start < block_start + width:
            current.extend(next_diff)
        else:
            yield current
            current = next_diff
def report_cli_differences(differences, settings):
    '''Print every difference from an iterable to the command line.

    A heading is printed first and a closing line last.  When the separation
    setting is BLOCK_SEPARATED the differences are first regrouped into fixed
    width blocks; otherwise they are printed as supplied.
    '''
    print('Differences:\n')
    block_mode = settings.separation == Settings.Separation.BLOCK_SEPARATED
    stream = _separate_line_by_line(differences, settings) if block_mode else differences
    for entry in stream:
        _cli_difference(entry.difference, entry.start, settings)
    print('End of differences.')
def compare_files(file_args, difference_mask):
    '''Generator function that returns Difference objects for each aligned difference in same size files.

    file_args is a sequence of file names to compare byte by byte.
    difference_mask is a collection of file names drawn from file_args:
    positions where the masked files disagree with each other are not
    reported.  An empty mask reports every position where the files differ.

    Each yielded Difference maps file name -> list of two-digit hex strings
    for one run of consecutive differing bytes; its start is the zero-based
    offset of the first byte of the run.

    Raises ValueError (on first iteration) if the files differ in size.
    All files are closed when the generator finishes, is closed, or fails.
    '''
    if not compare_iterator_elements(iter(file_args), os.path.getsize):
        raise ValueError('The file sizes must be equal.')

    def bytes_different(comparison):
        # A position counts when the files are not all equal there AND the
        # masked files agree with each other.
        return (not compare_iterator_elements(iter(comparison.values()))
                and compare_iterator_elements(comparison[file_name] for file_name in difference_mask))

    files = []
    try:
        # Opened one at a time inside the try so that a failure part-way
        # through still closes the files opened so far.
        for file_name in file_args:
            files.append(open(file_name, 'rb'))
        if not files:
            return
        while True:
            comparison = {f.name: f.read(1) for f in files}
            if not next(iter(comparison.values())):
                break  # end of file (all files have the same size)
            if bytes_different(comparison):
                # tell() is already one past the byte just read.
                start = files[0].tell() - 1
                different_bytes = defaultdict(list)
                while True:
                    for file_name, b in comparison.items():
                        different_bytes[file_name].append(f'{ord(b):0>2x}')
                    comparison = {f.name: f.read(1) for f in files}
                    if not bytes_different(comparison):
                        break
                yield Difference(different_bytes, start)
    finally:
        for f in files:
            f.close()
if __name__ == '__main__':
    # Command-line entry point: parse arguments, build the display settings
    # and print the differences between the given files.
    import argparse

    arg_parser = argparse.ArgumentParser(description='Compares two files for differences.')
    # nargs='*' rather than '+': with '+' at least one name is mandatory, so
    # the "every file in the current directory" default could never apply.
    arg_parser.add_argument('files', metavar='files', nargs='*', default=[f for f in os.listdir() if os.path.isfile(f)], help='file names to compare')
    # choices makes an out-of-range level a normal usage error instead of a
    # ValueError traceback from the enum constructor.
    arg_parser.add_argument('--separation', default=2, type=int, choices=[int(level) for level in Settings.Separation], help='Specifies the level of difference separation. 0 is no separation, 1 is carriage returns after every block, and 2 separates every new block into a new comparison.')
    arg_parser.add_argument('--ignored_diff', metavar='FILES', nargs='+', help='For the purpose of reporting on differences, ignore the positions where differences occur between the given files.', default=[])
    args = arg_parser.parse_args()

    file_args = args.files
    if not file_args:
        arg_parser.error('no files to compare')

    settings = Settings()
    settings.separation = Settings.Separation(args.separation)
    ignore_differences = args.ignored_diff
    # Row labels are either a file name or a hex offset, so size the label
    # column for the longest file name and for the widest possible hex offset.
    settings.name_width = max(len(hex(os.path.getsize(file_args[0]))),
                              *(len(file_name) for file_name in file_args))
    report_cli_differences(compare_files(file_args, ignore_differences), settings)
asked Aug 3, 2018 at 3:03
\$\endgroup\$

0

Know someone who can answer? Share a link to this question via email, Twitter, or Facebook.

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.