Not long ago I posted a small hexdump generator function from a program I've been writing, and I have since applied what a reviewer suggested. The goal is to lazily hexdump bytes objects (byte strings or binary files) without committing to I/O code.
Here is the relevant code (minus docstrings/tests/script), with some description below:
import re
from itertools import islice
class HexdumpGenerator:
    """Lazily render an iterable of byte values as hexdump rows.

    Every yielded row is ``fmt`` filled with three columns: the row
    address (formatted with ``col0``), the byte values (each formatted
    with ``col1``, with the separator inserted at the midpoint) and the
    character view of the same bytes, where non-printable characters are
    shown as ``'.'``.  When ``base_addr + start`` is not a multiple of
    ``step`` the first row is padded on the left so that the following
    rows begin on a ``step`` boundary.

    Args:
        iterable: source of integers accepted by ``bytearray``.
        base_addr: address reported for element 0 of ``iterable``.
        start: index of the first element to dump.
        stop: index at which dumping ends, or ``None`` for no limit.
        step: number of byte cells per row.
        sep: separator placed in the middle of the byte column.  Only
            its first character is used; an empty string disables the
            separator instead of raising ``IndexError``.
    """

    def __init__(self, iterable, base_addr=0, start=0, stop=None, step=16, sep='\b'):
        self.iterable = islice(iterable, start, stop)
        self.base_addr = base_addr
        self.start = start
        self.stop = stop
        self.step = step
        self.col0 = '08X'
        self.col1 = '02X'
        self.fmt = '{} {} {}'
        self.placeholder = [' ']
        # sep[:1] keeps the "first character only" rule but tolerates ''.
        self._sep = sep[:1]
        # Number of empty cells in front of the first row.
        self._mod = (base_addr + start) % self.step
        # Index (in the unsliced input) at which the current row ends.
        self._next = start + self.step - self._mod

    def __iter__(self):
        """Yield one formatted row per ``step``-aligned slice of the input."""
        while True:
            row = bytearray(islice(self.iterable, self._next - self.start))
            if not row:
                break
            lead = self._mod
            address = format(self.base_addr + self.start - lead, self.col0)
            cells = lead * self.placeholder
            cells += [format(byte, self.col1) for byte in row]
            chars = [lead * ' ']
            for byte in row:
                ch = chr(byte)
                chars.append(ch if ch.isprintable() else '.')
            # Cells missing at the end of a short row; 0 for a full row,
            # which also means later rows get no leading padding.
            self._mod = self.step - len(cells)
            cells += self._mod * self.placeholder
            chars.append(self._mod * ' ')
            if self._sep:
                cells.insert(self.step // 2, self._sep)
            yield self.fmt.format(address, ' '.join(cells), ''.join(chars))
            self.start = self._next
            self._next += self.step
class CompressHexdumpGenerator(HexdumpGenerator):
    """Hexdump generator that collapses runs of identical rows.

    Rows are compared with the address column ignored.  The first row of
    a run is yielded as-is, the first repeat is replaced by a single
    ``'*'`` and further repeats are dropped.  If the input ends inside a
    run, the repeated row is yielded once more with its address advanced
    to that of the last repeat.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.row = ''          # last row that was yielded verbatim
        self.delimiter = ' '   # text that ends the address column
        self.duplicates = 0    # repeats of self.row seen so far

    def _compress(self):
        """Return ``self.row`` re-addressed to the last repeated row."""
        cut = self.row.index(self.delimiter)
        address = int(self.row[:cut], 16) + self.duplicates * self.step
        return format(address, self.col0) + self.row[cut:]

    def __iter__(self):
        for line in super().__iter__():
            if line.split()[1:] != self.row.split()[1:]:
                # A new payload starts a fresh run.
                yield line
                self.row = line
                self.duplicates = 0
                continue
            if not self.duplicates:
                yield '*'
            self.duplicates += 1
        if self.duplicates:
            yield self._compress()
class FromHexdumpGenerator(CompressHexdumpGenerator):
    """Turn hexdump rows back into the bytes they describe.

    The iterable handed to the constructor is expected to yield text
    rows (optionally newline-terminated).  Iterating yields one
    ``bytearray`` per dumped row, expanding ``'*'`` markers by repeating
    the previous row until the address of the row that follows.

    Attributes:
        base: numeric base of the byte column (16 by default).
        len: minimum whitespace run, as a string, that ends the byte
            column of a row.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.base = 16
        self.len = '3'

    def get_repr(self, _row):
        """Return the bytes encoded in ``_row``.

        ``_row`` is a row split on whitespace with the whitespace runs
        kept, so index 0 is the address and the byte cells start at
        index 2.
        """
        # Compiled once per row rather than once per token.
        gap = re.compile(r'(\s{' + self.len + r',})')
        row = bytearray()
        for token in _row[2:self.step * 2 + 1]:
            if token.isalnum():
                row.append(int(token, self.base))
            elif gap.match(token):
                break
        return row

    def decompress_gen(self, row0, row1):
        """Yield the rows elided between ``row0`` and ``row1``.

        ``row0`` is the split row preceding a ``'*'`` marker and ``row1``
        the whitespace-split row following it; one copy of ``row0``'s
        bytes is produced for every ``step`` between their addresses.
        """
        address = int(row0[0].rstrip(self.delimiter), 16) + self.step
        limit = int(row1[0].rstrip(self.delimiter), 16)
        while address < limit:
            # get_repr never reads the address cell, so row0 can be
            # reused unchanged for every elided row.
            yield self.get_repr(row0)
            address += self.step

    def __iter__(self):
        """Yield one ``bytearray`` per (possibly elided) dump row.

        Raises:
            ValueError: if the input ends directly after a ``'*'``
                marker, because the number of elided rows is unknown.
        """
        previous = pending = ''
        while True:
            row = pending if pending else next(self.iterable, None)
            if row is None:
                break
            if row in ('*', '*\n'):
                # A bare next() here would surface as RuntimeError
                # (PEP 479) once the input is exhausted.
                pending = next(self.iterable, None)
                if pending is None:
                    raise ValueError("hexdump ends after a '*' marker")
                yield from self.decompress_gen(previous, pending.split())
            else:
                # str.find() returns -1 when the separator is absent,
                # which made the old slice arithmetic duplicate the row;
                # replace() removes the first occurrence only if present.
                if self._sep:
                    row = row.replace(self._sep, '', 1)
                previous = re.split(r'(\s+)', row)
                pending = ''
                yield self.get_repr(previous)
Utility functions:
from itertools import chain
def read_binary_gen(file, chunk_size=65536):
    """Lazily yield the bytes of ``file`` as integers.

    The file is read in blocks of ``chunk_size`` bytes.  Iterating a
    binary file object directly splits on ``b'\\n'``, so a file without
    newlines would be loaded into memory as one "line".

    Args:
        file: path of the file to read.
        chunk_size: number of bytes requested per read.

    Yields:
        Each byte of the file, in order, as an ``int``.
    """
    with open(file, 'rb') as f:
        for block in iter(lambda: f.read(chunk_size), b''):
            yield from block
def write(file, gen):
    """Write every string produced by ``gen`` to ``file``, one per line."""
    with open(file, 'w') as f:
        f.writelines(row + '\n' for row in gen)
def read_gen(file):
    """Yield the lines of the text file ``file`` one at a time."""
    with open(file, 'r') as f:
        for line in f:
            yield line
def write_binary(file, gen):
    """Write every bytes-like item produced by ``gen`` to ``file``."""
    with open(file, 'wb') as f:
        f.writelines(gen)
read_binary_gen()
is meant to be passed to the first two generator classes, while read_gen
is meant for the last one, so that a file is never read into memory in full.
I've tested the code with different formats (03o
, 03d
): if specifying 03d
, then the placeholder
attribute must be assigned a list with a single string composed of 3 spaces. Using FromHexdumpGenerator
to undo the hexdump would then require that base
be assigned the integer 10, and len
the number '4'
(3 + 1). The col0
attribute must remain hex (as it is an address).
If fmt
's first column ends with a colon (as I've seen other programs use), the delimiter
must be set to that value.
I struggled with different encodings before figuring out this was a case for bytearray
, so if a bytes object can be dumped, it can be undumped.
EDIT
@Peilonrayz I attempted to fix the issues you pointed out and came up with the following (this is the whole module, with some additional functionality I'd been working on):
from itertools import islice, takewhile, tee, chain
from re import match, split
from copy import copy
from colorama import init, Style
# Self-assignment of the imported name; it has no runtime effect.
# NOTE(review): presumably kept so `tee` counts as used/re-exported - confirm.
tee = tee
# colorama setup, run once at import time (module-level side effect).
init()
# Format spec for the address column of a row.
COL0 = '08X'
# Format spec for a single byte cell.
COL1 = '02X'
# Text standing in for a byte cell that has no data.
PAD = ' '
# Row template: address, byte column, character column.
FMT = '{} {} {}'
# Character that follows the address column; used to split it off a row.
DLM = ' '
# Numeric base used when parsing byte cells back into integers.
BASE = 16
def change_format(col0, col1, fmt=''):
    """Switch the module-wide dump format.

    Args:
        col0: format spec for the address column.
        col1: format spec for a byte cell; everything but its last
            character must be the integer cell width and the last
            character one of ``b``, ``o``, ``d``, ``x`` or ``X``.
        fmt: optional row template.  When given, the character right
            after its first ``}`` becomes the address delimiter.

    Raises:
        ValueError: if the width part of ``col1`` is not an integer, or
            ``fmt`` contains no ``}``.
        KeyError: if the type character of ``col1`` is not supported.
        IndexError: if nothing follows the first ``}`` in ``fmt``.
    """
    global COL0, COL1, PAD, FMT, DLM, BASE
    # Derive every value before touching a global, so that a bad
    # argument cannot leave the module half-reconfigured.
    pad = ' ' * int(col1[:-1])
    base = {'b': 2, 'o': 8, 'd': 10, 'x': 16, 'X': 16}[col1[-1]]
    delimiter = fmt[fmt.index('}') + 1] if fmt else None
    COL0 = col0
    COL1 = col1
    PAD = pad
    if fmt:
        FMT = fmt
        DLM = delimiter
    BASE = base
def fix(it, offset, start, stop, step):
    """Slice ``it`` and work out the geometry of the first row.

    Returns:
        A tuple ``(sliced, lead, first_end)``: ``it`` limited to
        ``start:stop``, the number of empty cells in front of the first
        row, and the input index at which the first row ends.
    """
    lead = (offset + start) % step
    window = islice(it, start, stop)
    first_end = start + step - lead
    return window, lead, first_end
def mk_row(it, nxt, start):
    """Consume up to ``nxt - start`` items of ``it`` into a bytearray."""
    wanted = nxt - start
    taken = islice(it, wanted)
    return bytearray(taken)
def to_hex(i):
    """Format the byte value ``i`` with the module's ``COL1`` spec."""
    return '{:{}}'.format(i, COL1)
def to_chr(i):
    """Return the character for code point ``i``, or '.' if unprintable."""
    glyph = chr(i)
    if glyph.isprintable():
        return glyph
    return '.'
def pad_gen(fn, pad, it, n, step):
    """Yield ``fn(x)`` for each ``x`` in ``it``, padded to ``step`` cells.

    ``n`` copies of ``pad`` come first; after the converted items more
    copies of ``pad`` follow until ``step - n`` cells have been produced
    after the leading padding.
    """
    yield from [pad] * n
    emitted = 0
    for emitted, item in enumerate(it, 1):
        yield fn(item)
    yield from [pad] * (step - n - emitted)
def hexdump_gen(it, offset=0, start=0, stop=None, step=16, sep='\b'):
    """Yield formatted hexdump rows for the byte values in ``it``.

    Args:
        it: iterable of byte values.
        offset: address reported for element 0 of ``it``.
        start: index of the first element to dump.
        stop: index at which dumping ends, or ``None`` for no limit.
        step: number of byte cells per row.
        sep: token inserted in the middle of the byte column.
    """
    data, lead, row_end = fix(it, offset, start, stop, step)
    row_start = start
    chunk = mk_row(data, row_end, row_start)
    while chunk:
        address = format(offset + row_start - lead, COL0)
        hex_cells = list(pad_gen(to_hex, PAD, chunk, lead, step))
        text = ''.join(pad_gen(to_chr, ' ', chunk, lead, step))
        hex_cells.insert(step // 2, sep)
        yield FMT.format(address, ' '.join(hex_cells), text)
        # Only the first row can carry leading padding.
        row_start, row_end, lead = row_end, row_end + step, 0
        chunk = mk_row(data, row_end, row_start)
def compress_hexdump_gen(*args, **kwargs):
    """Like ``hexdump_gen`` but with runs of identical rows collapsed.

    Accepts exactly the arguments of ``hexdump_gen``.  Rows are compared
    with the address column ignored; the first repeat of a row is
    replaced by ``'*'`` and further repeats are dropped.  If the input
    ends inside a run of more than one repeat, the repeated row is
    yielded once more with the address of the last repeat.  A single
    trailing repeat is left as a bare ``'*'``.
    """
    # step is hexdump_gen's fifth positional parameter; honour it however
    # it was passed instead of assuming the default for positional calls.
    step = args[4] if len(args) > 4 else kwargs.get('step', 16)
    row = ''
    duplicates = 0
    for i in hexdump_gen(*args, **kwargs):
        if row.split()[1:] == i.split()[1:]:
            if not duplicates:
                yield '*'
            duplicates += 1
        else:
            yield i
            row = i
            duplicates = 0
    if duplicates > 1:
        index = row.index(DLM)
        col0 = int(row[:index], 16) + duplicates * step
        yield format(col0, COL0) + row[index:]
def predicate(i):
    """Return False once ``i`` starts with three or more whitespace characters.

    Such a run is treated as the end of a row's byte cells by the
    ``takewhile`` calls that use this predicate.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return not match(r'(\s{3,})', i)
def highlight(row, sep, ba):
    """Wrap selected byte cells of ``row`` in bright-style codes, in place.

    Cells are examined until ``predicate`` rejects one.  A cell is
    highlighted when its value is in ``ba`` or, if ``ba`` is empty, when
    its character is printable.  Cells containing a space and the
    separator cell are skipped.
    """
    for pos, cell in enumerate(takewhile(predicate, copy(row))):
        if ' ' in cell or cell == sep:
            continue
        value = int(cell, BASE)
        chosen = value in ba
        if not chosen and not ba:
            chosen = chr(value).isprintable()
        if chosen:
            row[pos] = Style.BRIGHT + cell + Style.RESET_ALL
def highlight_hexdump_gen(it, step=16, sep='\b', ba=b''):
    """Yield the rows of ``it`` with selected byte cells highlighted.

    Args:
        it: iterable of hexdump rows.
        step: number of byte cells per row.
        sep: separator token used in the rows; pass an empty value if
            the rows have none.
        ba: byte values to highlight, converted with ``bytearray``.
    """
    ba = bytearray(ba)
    # Exclusive end of the byte cells in the split row; a separator
    # contributes one extra token plus one extra whitespace token.
    index = step * 2 + (1 if not sep else 3)
    for i in it:
        # Raw string: '\s' in a plain literal is an invalid escape.
        row0 = split(r'(\s+)', i)
        row1 = row0[2:index]
        highlight(row1, sep, ba)
        yield ''.join(row0[:2] + row1 + row0[index:])
def to_bytes(row, step):
    """Return the bytes encoded in the split hexdump row ``row``.

    Byte cells are read from index 2 onwards until ``predicate`` rejects
    a token; highlight codes are stripped and only alphanumeric tokens
    are converted.
    """
    cells = takewhile(predicate, row[2:step * 2 + 1])
    plain = (
        cell.replace(Style.BRIGHT, '').replace(Style.RESET_ALL, '')
        for cell in cells
    )
    return bytearray(int(cell, BASE) for cell in plain if cell.isalnum())
def decompress_gen(row0, row1, step):
    """Yield the bytes of every row elided between ``row0`` and ``row1``.

    ``row0`` is the split row before a ``'*'`` marker and ``row1`` the
    split row after it; one copy of ``row0``'s bytes is produced for
    each ``step`` between their addresses.
    """
    address = int(row0[0].rstrip(DLM), 16) + step
    limit = int(row1[0].rstrip(DLM), 16)
    while address < limit:
        label = (format(address, COL0) + DLM).rstrip(' ')
        yield to_bytes([label] + row0[1:], step)
        address += step
def from_hexdump_gen(it, step=16, sep='\b'):
    """Yield one ``bytearray`` per row described by the hexdump ``it``.

    Args:
        it: iterator of hexdump rows (optionally newline-terminated).
        step: number of byte cells per row.
        sep: separator token used in the rows; may be empty.

    A ``'*'`` row is expanded by repeating the previous row up to the
    address of the row that follows.  If the input ends right after a
    ``'*'``, the previous row is repeated once.
    """
    previous = pending = ''
    while True:
        row = pending if pending else next(it, None)
        if row is None:
            break
        if row in ('*', '*\n'):
            pending = next(it, ''.join(previous))
            yield from decompress_gen(previous, pending.split(), step)
        else:
            # str.find() returns -1 when the separator is missing (and 0
            # for an empty one), which made the old slicing mangle the
            # row; replace() drops the first occurrence only if present.
            stripped = row.replace(sep, '', 1) if sep else row
            previous = split(r'(\s+)', stripped)
            pending = ''
            yield to_bytes(previous, step)
def test(it0, it1, offset=0, start=0, stop=None, step=16, sep='\b'):
    """Check that the hexdump ``it0`` decodes to the bytes in ``it1``.

    Args:
        it0: iterator of hexdump rows.
        it1: iterable of the original byte values.
        offset, start, stop, step, sep: the settings the dump was
            produced with.

    Returns:
        True if every decoded row equals the matching slice of ``it1``,
        False at the first mismatch.  Data left in ``it1`` after the
        dump ends is not examined.
    """
    it1, _, nxt = fix(it1, offset, start, stop, step)
    for decoded in from_hexdump_gen(it0, step=step, sep=sep):
        if mk_row(it1, nxt, start) != decoded:
            # Explicit False instead of falling off the end with None.
            return False
        start = nxt
        nxt += step
    return True
def read_binary_gen(file, chunk_size=65536):
    """Lazily yield the bytes of ``file`` as integers.

    The file is read in blocks of ``chunk_size`` bytes.  Iterating a
    binary file object directly splits on ``b'\\n'``, so a file without
    newlines would be loaded into memory as one "line".

    Args:
        file: path of the file to read.
        chunk_size: number of bytes requested per read.

    Yields:
        Each byte of the file, in order, as an ``int``.
    """
    with open(file, 'rb') as f:
        for block in iter(lambda: f.read(chunk_size), b''):
            yield from block
def write(file, gen):
    """Write every string produced by ``gen`` to ``file``, one per line."""
    with open(file, 'w') as f:
        f.writelines(row + '\n' for row in gen)
def read_gen(file):
    """Yield the lines of the text file ``file`` one at a time."""
    with open(file, 'r') as f:
        for line in f:
            yield line
def write_binary(file, gen):
    """Write every bytes-like item produced by ``gen`` to ``file``."""
    with open(file, 'wb') as f:
        f.writelines(gen)
1 Answer 1
- Don't use a class when you don't need to. A generator is good enough for this, and makes the code much clearer.
- Make functions/classes responsible for one thing.
HexdumpGenerator
changes the iterable and formats. - Use better variable names,
self._mod
is beyond cryptic: it took me a long while to understand, as it's not really a modulo, and I can't think of anything else that shortens to mod. How about padding
. - Rather than having to care about
base_addr
and start
in your __iter__
, you could just change the beginning of the iterable to None
, and make it print empty information on None
. - Rather than adding variables to your class try to keep them only in function scope. This makes them much easier to work with, and reason with. Plus the only variables you use in child classes are
step
,col0
,delimiter
,iterable
and _sep
, which don't change after __iter__
. I don't think
CompressHexdumpGenerator
works correctly. First, I checked what it output:
>>> print('\n'.join(list(CompressHexdumpGenerator(map(ord, '123456781234567812345678'), step=8))))
self._compress
00000000 31 32 33 34 35 36 37 38 12345678
*
00000010 31 32 33 34 35 36 37 38 12345678
>>> print('\n'.join(list(CompressHexdumpGenerator(map(ord, '123456781234567812345679'), step=8))))
00000000 31 32 33 34 35 36 37 38 12345678
*
00000010 31 32 33 34 35 36 37 39 12345679
>>> print('\n'.join(list(CompressHexdumpGenerator(map(ord, '123456781234567912345679'), step=8))))
00000000 31 32 33 34 35 36 37 38 12345678
00000008 31 32 33 34 35 36 37 39 12345679
*
00000010 31 32 33 34 35 36 37 39 12345679
The first and second kind of make sense. I don't really get why you'd care about the very last, but none of the ones in-between. Maybe so you have the last index, IDK. But the last example from above makes no sense, why should it output
*
? It's just a waste of a line and makes me go 'huh, nothing's there'. If I'm reading things correctly, you don't need
_compress
if you move self.row = i
out of theif else
statement, since all you change is the index to that of the last.
And so ignoring the possible CompressHexdumpGenerator
bug, I'd change your code to heavily use itertools
, rather than use some odd mutation stuff:
import itertools
def byte_gen(iterable, chunk=1024):
    """Yield the byte values of ``iterable``, buffered ``chunk`` at a time.

    Args:
        iterable: iterable of integers accepted by ``bytearray``.
        chunk: number of items pulled from ``iterable`` per buffer.

    Yields:
        Each item of ``iterable``, in order, as an ``int``.
    """
    # Take one iterator up front: islice() on a list, bytes or other
    # re-iterable would otherwise restart from the beginning on every
    # pass and never terminate.
    source = iter(iterable)
    while True:
        ret = bytearray(itertools.islice(source, chunk))
        if not ret:
            break
        yield from ret
def grouper(iterable, n, fillvalue=None):
    """Split ``iterable`` into tuples of length ``n``.

    The last tuple is padded with ``fillvalue`` when the input runs out,
    e.g. ``grouper('ABCDEFG', 3, 'x')`` gives ABC DEF Gxx.
    """
    shared = iter(iterable)
    # Every column reads from the same iterator, so consecutive items
    # end up in consecutive slots of one tuple.
    columns = tuple(shared for _ in range(n))
    return itertools.zip_longest(*columns, fillvalue=fillvalue)
def hex_iter_changer(iterable, base_addr=0, start=0, stop=None, chunk=1024):
    """Return the sliced bytes of ``iterable`` preceded by ``None`` fillers.

    Args:
        iterable: iterable of byte values.
        base_addr: address of element 0 of ``iterable``.
        start: index of the first element to keep.
        stop: index at which to stop, or ``None`` for no limit.
        chunk: buffer size handed to ``byte_gen``.

    Returns:
        An iterator producing ``base_addr + start`` ``None`` values
        followed by the selected bytes.
    """
    sliced = itertools.islice(iterable, start, stop)
    # chunk used to be accepted but silently ignored.
    byte_slice = byte_gen(sliced, chunk)
    empty_prefix = itertools.repeat(None, base_addr + start)
    return itertools.chain(empty_prefix, byte_slice)
def _hexdump_generator(iterable, base_addr=0, start=0, stop=None, width=16, sep='\b'):
    """Yield ``(address, hex_cells, chars)`` for every row of the dump.

    ``hex_cells`` and ``chars`` each hold one entry per cell of the row;
    a cell without data is ``None`` in both lists.  ``sep`` is accepted
    but not used here.
    """
    stream = hex_iter_changer(iterable, base_addr, start, stop)
    for row_no, cells in enumerate(grouper(stream, width)):
        hex_cells = []
        chars = []
        for cell in cells:
            if cell is None:
                hex_cells.append(None)
                chars.append(None)
            else:
                hex_cells.append(format(cell, '02X'))
                chars.append(chr(cell))
        yield row_no * width, hex_cells, chars
def hexdump_generator(iterable, base_addr=0, start=0, stop=None, width=16, sep='\b'):
    """Yield formatted hexdump rows for the byte values in ``iterable``.

    Args:
        iterable: iterable of byte values.
        base_addr: address of element 0 of ``iterable``.
        start: index of the first element to dump.
        stop: index at which dumping ends, or ``None`` for no limit.
        width: number of byte cells per row.
        sep: token inserted in the middle of the byte column.
    """
    middle = width // 2
    for index, hex_values, values in _hexdump_generator(iterable, base_addr, start, stop, width, sep):
        hex_values = [' ' if byte is None else byte for byte in hex_values]
        hex_values.insert(middle, sep)
        # Non-printable characters (newlines, control codes) would break
        # the one-row-per-line layout, so show them as '.'.
        values = [
            ' ' if char is None else (char if char.isprintable() else '.')
            for char in values
        ]
        yield '{:08X} {} {}'.format(index, ' '.join(hex_values), ''.join(values))
def compress_hexdump_generator(iterable, base_addr=0, start=0, stop=None, width=16, sep='\b'):
    """Like ``hexdump_generator`` but with runs of identical rows collapsed.

    Rows are compared with the address column ignored.  The first repeat
    of a row is replaced by ``'*'`` and further repeats are dropped; if
    the input ends inside a run, the last repeated row is yielded so its
    address is known.
    """
    last_row = ''
    last_payload = []
    repeats = 0
    for line in hexdump_generator(iterable, base_addr, start, stop, width, sep):
        payload = line.split()[1:]
        if payload == last_payload:
            if repeats == 0:
                yield '*'
            repeats += 1
        else:
            yield line
            repeats = 0
        last_row, last_payload = line, payload
    if repeats:
        yield last_row
-
\$\begingroup\$ You are right, and your end result looks pretty good. I should've looked into
_compress
more recently. I wanted the last row for undoing. Without it, I could not restore a binary file. Thanks for pointing it out. \$\endgroup\$user133955– user1339552017年07月05日 10:27:27 +00:00Commented Jul 5, 2017 at 10:27