7
\$\begingroup\$

Not long ago I posted a small hexdump generator function from a program I've been writing, and I have since applied what a reviewer suggested. The goal is to lazily hexdump bytes objects (byte strings, binary files) without committing to I/O code.

Here is the relevant code (minus docstrings/tests/script), with some description below:

import re
from itertools import islice
class HexdumpGenerator:
    """Lazily render an iterable of byte values as hexdump text rows.

    Each row is ``fmt`` filled with an address column, a column of
    formatted byte values and a column of printable characters.  The first
    row is left-padded so that rows are aligned on multiples of ``step``
    relative to ``base_addr + start``.
    """

    def __init__(self, iterable, base_addr=0, start=0, stop=None, step=16, sep='\b'):
        # Only the requested window of the input is ever consumed.
        self.iterable = islice(iterable, start, stop)
        self.base_addr = base_addr
        self.start = start
        self.stop = stop
        self.step = step
        # Format specs for the address column and for each byte cell.
        self.col0 = '08X'
        self.col1 = '02X'
        self.fmt = '{} {} {}'
        # One-element list so it can be multiplied into padding cells.
        self.placeholder = [' ']
        self._sep = sep[0]
        # Number of padding cells in front of the first row ...
        self._mod = (base_addr + start) % self.step
        # ... and the input position at which the first row ends.
        self._next = start + self.step - self._mod

    def __iter__(self):
        while True:
            chunk = bytearray(islice(self.iterable, self._next - self.start))
            if len(chunk) == 0:
                return
            leading = self._mod
            address = format(self.base_addr + self.start - leading, self.col0)
            cells = leading * self.placeholder + [
                format(value, self.col1) for value in chunk
            ]
            text = leading * ' ' + ''.join(
                char if char.isprintable() else '.' for char in map(chr, chunk)
            )
            # From here on _mod holds the trailing padding of this row; it is
            # zero for every full row, so the next address is unaffected.
            self._mod = self.step - len(cells)
            cells += self._mod * self.placeholder
            text += self._mod * ' '
            cells.insert(self.step // 2, self._sep)
            yield self.fmt.format(address, ' '.join(cells), text)
            self.start = self._next
            self._next += self.step
class CompressHexdumpGenerator(HexdumpGenerator):
    """Hexdump rows with runs of identical rows collapsed into a ``*`` line.

    After a run that reaches the end of the input, one extra row is emitted
    carrying the address of the last repeated row.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.row = ''          # last row that was emitted verbatim
        self.delimiter = ' '   # text that terminates the address column
        self.duplicates = 0    # repeats of self.row seen since it was emitted

    def _compress(self):
        """Return ``self.row`` with its address advanced past the repeats."""
        cut = self.row.index(self.delimiter)
        address = int(self.row[:cut], 16) + self.duplicates * self.step
        return format(address, self.col0) + self.row[cut:]

    def __iter__(self):
        for line in super().__iter__():
            # Rows are compared without their address column.
            if self.row.split()[1:] != line.split()[1:]:
                yield line
                self.row = line
                self.duplicates = 0
                continue
            if self.duplicates == 0:
                yield '*'
            self.duplicates += 1
        if self.duplicates:
            yield self._compress()
class FromHexdumpGenerator(CompressHexdumpGenerator):
    """Turn hexdump text rows back into ``bytearray`` rows.

    The wrapped iterable is expected to yield text rows, including the
    ``*`` lines that stand for repeated rows.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.base = 16   # radix of the byte cells
        self.len = '3'   # minimum whitespace run (as text) that ends the byte cells

    def get_repr(self, _row):
        """Convert the byte cells of a split row into a ``bytearray``.

        ``_row`` is a row split with its whitespace runs kept, so the byte
        cells start at index 2.  Parsing stops at the first whitespace run
        of at least ``self.len`` characters.
        """
        # Raw literal: '\s' in a normal string is an invalid escape sequence.
        gap = re.compile(r'\s{' + self.len + ',}')
        row = bytearray()
        for i in _row[2:self.step * 2 + 1]:
            if i.isalnum():
                row.append(int(i, self.base))
            elif gap.match(i):
                break
        return row

    def decompress_gen(self, row0, row1):
        """Yield the rows hidden by a ``*`` between ``row0`` and ``row1``.

        One copy of ``row0``'s bytes is produced for every ``step``-sized
        address strictly between the two rows' addresses.
        """
        i = int(row0[0].rstrip(self.delimiter), 16) + self.step
        j = int(row1[0].rstrip(self.delimiter), 16)
        while i < j:
            row = format(i, self.col0) + self.delimiter
            row = [row.rstrip(' ')] + row0[1:]
            yield self.get_repr(row)
            i += self.step

    def __iter__(self):
        i = j = ''
        while True:
            # A row read ahead while handling '*' is processed before
            # pulling a new one.
            row = j if j else next(self.iterable, None)
            if row is None:
                break
            elif row == '*' or row == '*\n':
                # A bare next() raising StopIteration inside a generator is
                # turned into RuntimeError (PEP 479); end cleanly instead
                # when the dump stops right after '*'.
                j = next(self.iterable, None)
                if j is None:
                    break
                yield from self.decompress_gen(i, j.split())
            else:
                # Drop the first separator only if present; slicing around
                # find() == -1 would duplicate most of the row.
                i = re.split(r'(\s+)', row.replace(self._sep, '', 1))
                j = ''
                yield self.get_repr(i)

Utility functions:

from itertools import chain
def read_binary_gen(file, chunk_size=65536):
    """Yield the bytes of ``file`` one integer at a time.

    The file is read in blocks of ``chunk_size`` bytes.  Iterating a binary
    file object directly splits on ``b'\\n'``, so a file without newline
    bytes would be pulled into memory as a single "line".
    """
    with open(file, 'rb') as f:
        for block in iter(lambda: f.read(chunk_size), b''):
            yield from block
def write(file, gen):
    """Write every string from ``gen`` to ``file``, one per line."""
    with open(file, 'w') as f:
        f.writelines(line + '\n' for line in gen)
def read_gen(file):
    """Yield the lines of the text file ``file`` lazily, newlines included."""
    with open(file, 'r') as f:
        for line in f:
            yield line
def write_binary(file, gen):
    """Write every bytes-like item from ``gen`` to ``file`` in order."""
    with open(file, 'wb') as f:
        f.writelines(gen)

read_binary_gen() is meant to be passed to the first two generator classes, and read_gen() to the last one, so that a file is never read into memory all at once.

I've tested the code with different formats (03o, 03d): if specifying 03d, then the placeholder attribute must be assigned a list with a single string composed of 3 spaces. Using FromHexdumpGenerator to undo the hexdump would then require that base be assigned the integer 10, and len the number '4' (3 + 1). The col0 attribute must remain hex (as it is an address).

If fmt's first column ends with a colon (as I've seen other programs use), the delimiter must be set to that value.

I struggled with different encodings before figuring out this was a case for bytearray, so if a bytes object can be dumped, it can be undumped.

EDIT

@Peilonrayz I attempted to fix the issues you pointed out and came up with the following (this is the whole module, with some additional functionality I'd been working on):

from itertools import islice, takewhile, tee, chain
from re import match, split
from copy import copy
from colorama import init, Style
tee = tee
init()
COL0 = '08X'
COL1 = '02X'
PAD = ' '
FMT = '{} {} {}'
DLM = ' '
BASE = 16
def change_format(col0, col1, fmt=''):
    """Reconfigure the module-level formatting settings.

    ``col0`` and ``col1`` are format specs for the address column and the
    byte cells.  ``col1`` must be a width followed by one presentation type
    out of ``b``, ``o``, ``d``, ``x`` or ``X``; the padding cell and the
    radix used for parsing are derived from it.  A non-empty ``fmt``
    replaces the row template, and the character right after its first
    ``}`` becomes the address delimiter.
    """
    global COL0, COL1, PAD, FMT, DLM, BASE
    radix_by_type = {'b': 2, 'o': 8, 'd': 10, 'x': 16, 'X': 16}
    COL0, COL1 = col0, col1
    # A padding cell is as wide as one formatted byte.
    PAD = int(col1[:-1]) * ' '
    if fmt:
        FMT = fmt
        DLM = fmt[fmt.index('}') + 1]
    BASE = radix_by_type[col1[-1]]
def fix(it, offset, start, stop, step):
    """Prepare an input for row-wise dumping.

    Returns a tuple of: ``it`` restricted to ``start``..``stop``, the number
    of padding cells needed in front of the first row, and the input
    position at which the first row ends.
    """
    lead = (offset + start) % step
    window = islice(it, start, stop)
    first_row_end = start + step - lead
    return window, lead, first_row_end
def mk_row(it, nxt, start):
    """Consume up to ``nxt - start`` byte values from ``it`` as a bytearray."""
    width = nxt - start
    return bytearray(islice(it, width))
def to_hex(i):
    """Format one byte value with the module-level ``COL1`` format spec."""
    return '{:{}}'.format(i, COL1)
def to_chr(i):
    """Return the character for code point ``i``, or ``'.'`` if unprintable."""
    char = chr(i)
    if char.isprintable():
        return char
    return '.'
def pad_gen(fn, pad, it, n, step):
    """Yield one row's cells: ``n`` pads, ``fn(item)`` per item, then pads.

    Trailing pads are added until ``step - n`` items-or-pads have followed
    the leading padding; none are added if ``it`` already has that many.
    """
    for _ in range(n):
        yield pad
    produced = 0
    for produced, item in enumerate(it, 1):
        yield fn(item)
    # range() of a non-positive count is empty, so an overlong row gets no pads.
    for _ in range(step - n - produced):
        yield pad
def hexdump_gen(it, offset=0, start=0, stop=None, step=16, sep='\b'):
    """Lazily yield hexdump text rows for the byte values in ``it``.

    Only the ``start``..``stop`` window of ``it`` is dumped.  Addresses are
    relative to ``offset``, each row covers ``step`` cells, and ``sep`` is
    inserted as an extra cell in the middle of the byte column.
    """
    it, lead, row_end = fix(it, offset, start, stop, step)
    row = mk_row(it, row_end, start)
    while row:
        address = format(offset + start - lead, COL0)
        cells = list(pad_gen(to_hex, PAD, row, lead, step))
        text = ''.join(pad_gen(to_chr, ' ', row, lead, step))
        cells.insert(step // 2, sep)
        yield FMT.format(address, ' '.join(cells), text)
        # Only the first row can need leading padding.
        start, row_end, lead = row_end, row_end + step, 0
        row = mk_row(it, row_end, start)
def compress_hexdump_gen(*args, **kwargs):
    """Yield ``hexdump_gen`` rows with runs of identical rows collapsed.

    All arguments are forwarded to ``hexdump_gen``.  The first repeat of a
    row is replaced by a ``*`` line and further repeats are dropped.  When
    the input ends inside a run of two or more repeats, a closing row with
    the address of the last repeated row is emitted.
    """
    # step may be given by keyword or as the fifth positional argument of
    # hexdump_gen; looking only at kwargs would mis-address the closing row.
    if 'step' in kwargs:
        step = kwargs['step']
    elif len(args) > 4:
        step = args[4]
    else:
        step = 16
    row = ''
    duplicates = 0
    for i in hexdump_gen(*args, **kwargs):
        # Rows are compared without their address column.
        if row.split()[1:] == i.split()[1:]:
            if not duplicates:
                yield '*'
            duplicates += 1
        else:
            yield i
            row = i
            duplicates = 0
    # A single trailing repeat needs no closing row: from_hexdump_gen falls
    # back to the previous row when '*' is the last line.
    if duplicates > 1:
        index = row.index(DLM)
        col0 = int(row[:index], 16)
        col0 += duplicates * step
        yield format(col0, COL0) + row[index:]
def predicate(i):
    """Return True unless ``i`` starts with three or more whitespace characters.

    Used with ``takewhile`` to stop at the wide gap that follows the byte
    cells of a row.
    """
    # Raw literal: '\s' in a normal string is an invalid escape sequence.
    return not match(r'\s{3,}', i)
def highlight(row, sep, ba):
    """Wrap selected byte cells of ``row`` in bright styling, in place.

    Cells are examined until ``predicate`` rejects one.  A cell is
    highlighted when its value is in ``ba``; when ``ba`` is empty, every
    cell whose value is a printable character is highlighted instead.
    """
    # Iterate over a copy because cells of row are replaced during the loop.
    for pos, cell in enumerate(takewhile(predicate, copy(row))):
        if ' ' in cell or cell == sep:
            continue
        value = int(cell, BASE)
        if value in ba or (not ba and chr(value).isprintable()):
            row[pos] = Style.BRIGHT + cell + Style.RESET_ALL
def highlight_hexdump_gen(it, step=16, sep='\b', ba=b''):
    """Yield the hexdump rows of ``it`` with selected byte cells highlighted.

    ``ba`` lists the byte values to highlight; which cells are chosen is
    decided by ``highlight``.  Everything outside the byte column is
    passed through unchanged.
    """
    ba = bytearray(ba)
    # End of the byte column in the split row; a non-empty separator adds a
    # cell plus its whitespace run.
    index = step * 2 + (1 if not sep else 3)
    for i in it:
        # Raw literal: '\s' in a normal string is an invalid escape
        # sequence.  The capture group keeps the whitespace runs so the row
        # can be re-joined exactly.
        row0 = split(r'(\s+)', i)
        row1 = row0[2:index]
        highlight(row1, sep, ba)
        yield ''.join(row0[:2] + row1 + row0[index:])
def to_bytes(row, step):
    """Convert the byte cells of a split hexdump row into a ``bytearray``.

    ``row`` is a row split with its whitespace runs kept, so the byte cells
    start at index 2.  Highlight styling is stripped from each cell, and
    parsing stops at the first cell that ``predicate`` rejects.
    """
    cells = takewhile(predicate, row[2:step * 2 + 1])
    plain = (
        cell.replace(Style.BRIGHT, '').replace(Style.RESET_ALL, '')
        for cell in cells
    )
    return bytearray(int(cell, BASE) for cell in plain if cell.isalnum())
def decompress_gen(row0, row1, step):
    """Yield the byte rows that a ``*`` line stands for.

    ``row0`` is the split row before the ``*`` and ``row1`` the split row
    after it.  One copy of ``row0``'s bytes is produced for every
    ``step``-sized address strictly between the two rows' addresses.
    """
    address = int(row0[0].rstrip(DLM), 16) + step
    limit = int(row1[0].rstrip(DLM), 16)
    cells = row0[1:]
    while address < limit:
        label = (format(address, COL0) + DLM).rstrip(' ')
        yield to_bytes([label] + cells, step)
        address += step
def from_hexdump_gen(it, step=16, sep='\b'):
    """Yield ``bytearray`` rows reconstructed from hexdump text rows.

    ``it`` must be an iterator of text rows, as ``next()`` is called on it.
    A ``*`` row is expanded into the repeated rows it stands for, using the
    rows before and after it.
    """
    i = j = ''
    while True:
        # A row read ahead while handling '*' is processed before pulling a
        # new one.
        row = j if j else next(it, None)
        if row is None:
            break
        elif row == '*' or row == '*\n':
            # If '*' is the last line, fall back to the previous row: it
            # then stands for exactly one repeat.
            j = next(it, ''.join(i))
            yield from decompress_gen(i, j.split(), step)
        else:
            # Drop the first separator only if there is one.  Slicing
            # around find() loses the first character for an empty sep and
            # duplicates most of the row when sep is absent (index -1).
            stripped = row.replace(sep, '', 1) if sep else row
            # Raw literal: '\s' in a normal string is an invalid escape.
            i = split(r'(\s+)', stripped)
            j = ''
            yield to_bytes(i, step)
def test(it0, it1, offset=0, start=0, stop=None, step=16, sep='\b'):
    """Check that the hexdump rows in ``it0`` decode to the bytes in ``it1``.

    ``it1`` is windowed and chunked the same way ``hexdump_gen`` does it
    for the given ``offset``, ``start``, ``stop`` and ``step``.  Returns
    True when every decoded row equals the matching chunk and no bytes of
    ``it1`` are left over, otherwise False.
    """
    it1, _, nxt = fix(it1, offset, start, stop, step)
    for decoded in from_hexdump_gen(it0, step=step, sep=sep):
        if mk_row(it1, nxt, start) != decoded:
            # Explicit False instead of falling off the end with None.
            return False
        start = nxt
        nxt += step
    # A dump that covers only a prefix of the data is not a match.
    return next(it1, None) is None
def read_binary_gen(file, chunk_size=65536):
    """Yield the bytes of ``file`` one integer at a time.

    The file is read in blocks of ``chunk_size`` bytes.  Iterating a binary
    file object directly splits on ``b'\\n'``, so a file without newline
    bytes would be pulled into memory as a single "line".
    """
    with open(file, 'rb') as f:
        while True:
            block = f.read(chunk_size)
            if not block:
                break
            yield from block
def write(file, gen):
    """Write every string from ``gen`` to ``file``, each followed by a newline."""
    with open(file, 'w') as f:
        emit = f.write
        for line in gen:
            emit(line + '\n')
def read_gen(file):
    """Yield the lines of the text file ``file`` lazily, newlines included."""
    with open(file, 'r') as f:
        # readline() returns '' only at end of file.
        yield from iter(f.readline, '')
def write_binary(file, gen):
    """Write every bytes-like item from ``gen`` to ``file`` in order."""
    with open(file, 'wb') as f:
        emit = f.write
        for block in gen:
            emit(block)
asked Jul 5, 2017 at 1:10
\$\endgroup\$

1 Answer 1

4
\$\begingroup\$
  • Don't use a class when you don't need to. A generator is good enough for this, and makes the code much clearer.
  • Make functions/classes responsible for one thing. HexdumpGenerator changes the iterable and formats.
  • Use better variable names, self._mod is beyond cryptic. As in this took me a long while to understand, as it's not really modulo, and I can't think of anything else that shortens to mod. How about padding.
  • Rather than having to care about base_addr and start in your __iter__, you could just change the beginning of the iterable to None, and make it print empty information on None.
  • Rather than adding variables to your class try to keep them only in function scope. This makes them much easier to work with, and reason with. Plus the only variables you use in child classes are step, col0, delimiter, iterable and _sep. Which don't change after __iter__.
  • I don't think CompressHexdumpGenerator works correctly:

    First, I checked what it outputted:

    >>> print('\n'.join(list(CompressHexdumpGenerator(map(ord, '123456781234567812345678'), step=8))))
    self._compress
    00000000 31 32 33 34 35 36 37 38 12345678
    *
    00000010 31 32 33 34 35 36 37 38 12345678
    >>> print('\n'.join(list(CompressHexdumpGenerator(map(ord, '123456781234567812345679'), step=8))))
    00000000 31 32 33 34 35 36 37 38 12345678
    *
    00000010 31 32 33 34 35 36 37 39 12345679
    >>> print('\n'.join(list(CompressHexdumpGenerator(map(ord, '123456781234567912345679'), step=8))))
    00000000 31 32 33 34 35 36 37 38 12345678
    00000008 31 32 33 34 35 36 37 39 12345679
    *
    00000010 31 32 33 34 35 36 37 39 12345679
    

    The first and second kind of make sense. I don't really get why you'd care about the very last, but none of the ones in-between. Maybe so you have the last index, IDK. But the last example from above makes no sense, why should it output *? It's just a waste of a line and makes me go 'huh, nothing's there'.

  • If I'm reading things correctly, you don't need _compress if you move self.row = i out of the if else statement, since all you change is the index to that of the last.

And so ignoring the possible CompressHexdumpGenerator bug, I'd change your code to heavily use itertools, rather than use some odd mutation stuff:

import itertools
def byte_gen(iterable, chunk=1024):
    """Yield the byte values of ``iterable``, buffered ``chunk`` at a time.

    Each buffer is built with ``bytearray``, so every item must be an
    integer in ``range(256)``.
    """
    # Take one iterator up front: islice() on a re-iterable such as a list
    # or bytes object restarts from the beginning on every pass, which
    # would never terminate.
    iterator = iter(iterable)
    while True:
        ret = bytearray(itertools.islice(iterator, chunk))
        if not ret:
            break
        yield from ret
def grouper(iterable, n, fillvalue=None):
    """Split ``iterable`` into tuples of length ``n``.

    The last tuple is padded with ``fillvalue``, e.g.
    grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    """
    # The same iterator n times over, so each tuple takes n consecutive items.
    shared = iter(iterable)
    return itertools.zip_longest(*(shared for _ in range(n)), fillvalue=fillvalue)
def hex_iter_changer(iterable, base_addr=0, start=0, stop=None, chunk=1024):
    """Return the byte stream to dump, prefixed with ``None`` placeholders.

    ``iterable`` is restricted to ``start``..``stop`` and buffered by
    ``byte_gen`` in pieces of ``chunk`` items.  ``base_addr + start``
    ``None`` values are put in front of it.
    """
    sliced = itertools.islice(iterable, start, stop)
    # Forward chunk; it was previously accepted but silently ignored.
    byte_slice = byte_gen(sliced, chunk)
    empty_prefix = itertools.repeat(None, base_addr + start)
    return itertools.chain(empty_prefix, byte_slice)
def _hexdump_generator(iterable, base_addr=0, start=0, stop=None, width=16, sep='\b'):
    """Yield ``(address, hex_cells, chars)`` for every row of the dump.

    ``hex_cells`` and ``chars`` are lists with ``width`` entries each; an
    entry is ``None`` where the row has no byte.  ``sep`` is accepted but
    not used here.
    """
    padded = hex_iter_changer(iterable, base_addr, start, stop)
    rows = grouper(padded, width)
    for address, row in zip(itertools.count(0, width), rows):
        hex_cells = [None if byte is None else format(byte, '02X') for byte in row]
        chars = [None if byte is None else chr(byte) for byte in row]
        yield address, hex_cells, chars
def hexdump_generator(iterable, base_addr=0, start=0, stop=None, width=16, sep='\b'):
    """Yield formatted hexdump text rows for the byte values in ``iterable``.

    Each row holds an address, ``width`` byte cells with ``sep`` inserted
    in the middle, and the bytes as text.  Unprintable characters are shown
    as ``.`` so that a byte such as a newline cannot break the row layout.
    """
    middle = width // 2
    for index, hex_values, values in _hexdump_generator(iterable, base_addr, start, stop, width, sep):
        hex_values = [' ' if byte is None else byte for byte in hex_values]
        hex_values.insert(middle, sep)
        values = [
            ' ' if char is None else (char if char.isprintable() else '.')
            for char in values
        ]
        yield '{:08X} {} {}'.format(index, ' '.join(hex_values), ''.join(values))
def compress_hexdump_generator(iterable, base_addr=0, start=0, stop=None, width=16, sep='\b'):
    """Yield ``hexdump_generator`` rows with repeated rows collapsed.

    The first repeat of a row is replaced by a ``*`` line and further
    repeats are dropped.  If the input ends inside a run of repeats, the
    last row is emitted once more.
    """
    prev = ''
    duplicates = 0
    rows = hexdump_generator(iterable, base_addr, start, stop, width, sep)
    for row in rows:
        # Rows are compared without their address column.
        repeated = prev.split()[1:] == row.split()[1:]
        if not repeated:
            yield row
            duplicates = 0
        else:
            if duplicates == 0:
                yield '*'
            duplicates += 1
        prev = row
    if duplicates:
        yield prev
answered Jul 5, 2017 at 9:50
\$\endgroup\$
1
  • \$\begingroup\$ You are right, and your end result looks pretty good. I should've looked into _compress more recently. I wanted the last row for undoing. Without it, I could not restore a binary file. Thanks for pointing it out. \$\endgroup\$ Commented Jul 5, 2017 at 10:27

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.