I have a plain text file whose lines contain the fields below, in one of these layouts:
<str_addr_a> <hex_int_addr>
<str_addr_a> <hex_int_addr> <dec_int_addr>
<str_addr_a> <str_addr_b>
<str_addr_a> <str_addr_b> <hex_int_addr>
<str_addr_a> <str_addr_b> <hex_int_addr> <dec_int_addr>
For example, the content of the file could be the following:
# <str_addr_a> <hex_int_addr>
dkfi A18A
# <str_addr_a> <str_addr_b>
kloe uuep
# <str_addr_a> <str_addr_b> <hex_int_addr> <dec_int_addr>
ctff yaaq BBF2 19
# <str_addr_a> <str_addr_b> <hex_int_addr>
fkii hhyf E118
# <str_addr_a> <str_addr_b> <hex_int_addr> <dec_int_addr>
ctkj yuuq BBF0 12
I'm reading the file with the function below:
def read_data(file):
    """Read address records from the text file at path ``file``.

    Blank lines and lines that start with ``#`` are skipped.  Every other
    line is tried against five whitespace-separated layouts, in this order:

    1. ``<str_addr_a> <hex_int_addr>``
    2. ``<str_addr_a> <hex_int_addr> <dec_int_addr>``
    3. ``<str_addr_a> <str_addr_b>``
    4. ``<str_addr_a> <str_addr_b> <hex_int_addr>``
    5. ``<str_addr_a> <str_addr_b> <hex_int_addr> <dec_int_addr>``

    The first layout that completes without a ``ValueError`` is used.  A line
    that fits none of them is reported through ``logging.error`` and skipped.

    Returns a list of ``(str_addr_a, str_addr_b, hex_int_addr, dec_int_addr)``
    tuples in file order; fields not present in the matched layout are
    ``None``.  ``hex_int_addr`` holds the ``.compress`` attribute of
    ``hex_addr(<field>)`` and ``dec_int_addr`` holds ``int(<field>)``.

    NOTE(review): ``is_valid_str_addr`` and ``hex_addr`` are defined outside
    this snippet; ``hex_addr`` presumably raises ``ValueError`` on a malformed
    hex field, which is what lets layout 1 fall through to layout 3 -- confirm.
    """
    with open(file, 'r', encoding='utf-8') as f:
        addresses = []
        for line_nr, line in enumerate(f, start=1):
            line = line.strip()
            if line and not line.startswith('#'):
                # Start every data line with all four fields unset.
                str_addr_a = str_addr_b = hex_int_addr = dec_int_addr = None
                try:
                    # '<str_addr_a> <hex_int_addr>'
                    # Unpacking raises ValueError when the token count is
                    # not exactly two.
                    str_addr_a, hex_int_addr = line.split()
                    if not is_valid_str_addr(str_addr_a):
                        raise ValueError from None
                    hex_int_addr = hex_addr(hex_int_addr).compress
                except ValueError:
                    try:
                        # '<str_addr_a> <hex_int_addr> <dec_int_addr>'
                        # Drop whatever the failed attempt above assigned.
                        str_addr_a = hex_int_addr = None
                        str_addr_a, hex_int_addr, dec_int_addr = line.split()
                        if not is_valid_str_addr(str_addr_a):
                            raise ValueError from None
                        hex_int_addr = hex_addr(hex_int_addr).compress
                        dec_int_addr = int(dec_int_addr)
                    except ValueError:
                        try:
                            # '<str_addr_a> <str_addr_b>'
                            # Drop whatever the failed attempt above assigned.
                            str_addr_a = hex_int_addr = dec_int_addr = None
                            str_addr_a, str_addr_b = line.split()
                            if not is_valid_str_addr(str_addr_a):
                                raise ValueError from None
                            if not is_valid_str_addr(str_addr_b):
                                raise ValueError from None
                        except ValueError:
                            try:
                                # '<str_addr_a> <str_addr_b> <hex_int_addr>'
                                # Drop whatever the failed attempt above
                                # assigned.
                                str_addr_a = str_addr_b = None
                                str_addr_a, str_addr_b, hex_int_addr = line.split()
                                if not is_valid_str_addr(str_addr_a):
                                    raise ValueError from None
                                if not is_valid_str_addr(str_addr_b):
                                    raise ValueError from None
                                hex_int_addr = hex_addr(hex_int_addr).compress
                            except ValueError:
                                try:
                                    # '<str_addr_a> <str_addr_b> <hex_int_addr>
                                    # <dec_int_addr>'
                                    # Drop whatever the failed attempt above
                                    # assigned.
                                    str_addr_a = str_addr_b = hex_int_addr = None
                                    str_addr_a, str_addr_b, hex_int_addr, \
                                        dec_int_addr = line.split()
                                    if not is_valid_str_addr(str_addr_a):
                                        raise ValueError from None
                                    if not is_valid_str_addr(str_addr_b):
                                        raise ValueError from None
                                    hex_int_addr = hex_addr(hex_int_addr).compress
                                    dec_int_addr = int(dec_int_addr)
                                except ValueError:
                                    # No layout fits: report the line and
                                    # skip it without appending anything.
                                    logging.error(f'{file}: error on line number {line_nr}')
                                    continue
                # Reached only when one of the layouts above succeeded.
                addresses.append((str_addr_a, str_addr_b, hex_int_addr, dec_int_addr))
    return(addresses)
As seen above, each try/except
block tests one of the possible line formats. Such code introduces quite a lot of branches and isn't very flat. Is there a more elegant way to process such data file? Or perhaps the way I'm doing this is fine?
1 Answer 1
I think you would benefit from re-interpreting this input as having "only one format", with one mandatory field and three other optional fields. A regex can capture this better than your nested try blocks.
You need to separate individual record line parsing from parsing of the file.
Add unit tests.
Suggested
File handling and `#` comment handling are not shown.
import re
from typing import NamedTuple, Optional
# Matches one data line.  str_addr_a is mandatory; str_addr_b and hex_int_addr
# are each optional.  dec_int_addr is nested inside the optional hex_int_addr
# group, so a decimal field is accepted only directly after a hex field: none
# of the documented line layouts carries a decimal without a hex address, and
# an independent optional group would wrongly accept e.g. 'dkfi 19'.
# Capture-group order stays (str_addr_a, str_addr_b, hex_int_addr,
# dec_int_addr) because Record.from_line unpacks ``.groups()`` positionally.
# NOTE: a line holding only str_addr_a still matches this pattern.
RECORD_PAT = re.compile(
    r'''(?x)
    ^
    (?P<str_addr_a>
        [a-z]{4}
    )
    (?:
        \s+
        (?P<str_addr_b>
            [a-z]{4}
        )
    )?
    (?:
        \s+
        (?P<hex_int_addr>
            [A-F0-9]{4}
        )
        (?:
            \s+
            (?P<dec_int_addr>
                \d+
            )
        )?
    )?
    $
    '''
)
class Record(NamedTuple):
    """One parsed data line.

    ``str_addr_a`` is always present; the other three fields are ``None``
    when the line does not carry them.  ``dec_int_addr`` is stored as an
    ``int``; the remaining fields keep the text captured by ``RECORD_PAT``.
    """

    str_addr_a: str
    str_addr_b: Optional[str]
    hex_int_addr: Optional[str]
    dec_int_addr: Optional[int]

    @classmethod
    def from_line(cls, line: str) -> 'Record':
        """Build a Record from a single data line.

        Raises:
            ValueError: if ``line`` does not match ``RECORD_PAT``.
        """
        match = RECORD_PAT.match(line)
        if match is None:
            # Without this guard a bad line surfaces as an opaque
            # AttributeError on None instead of a parse error.
            raise ValueError(f'unrecognised record line: {line!r}')
        # Groups arrive in field order; only the last one needs converting.
        *first, dec_int_addr = match.groups()
        if dec_int_addr is not None:
            dec_int_addr = int(dec_int_addr)
        return cls(*first, dec_int_addr)
def test() -> None:
    """Check Record.from_line against the five sample lines from the question."""
    samples = (
        ('dkfi A18A',
         Record(str_addr_a='dkfi', str_addr_b=None,
                hex_int_addr='A18A', dec_int_addr=None)),
        ('kloe uuep',
         Record(str_addr_a='kloe', str_addr_b='uuep',
                hex_int_addr=None, dec_int_addr=None)),
        ('ctff yaaq BBF2 19',
         Record(str_addr_a='ctff', str_addr_b='yaaq',
                hex_int_addr='BBF2', dec_int_addr=19)),
        ('fkii hhyf E118',
         Record(str_addr_a='fkii', str_addr_b='hhyf',
                hex_int_addr='E118', dec_int_addr=None)),
        ('ctkj yuuq BBF0 12',
         Record(str_addr_a='ctkj', str_addr_b='yuuq',
                hex_int_addr='BBF0', dec_int_addr=12)),
    )
    # Each sample pairs an input line with the record it must parse to.
    for text, wanted in samples:
        parsed = Record.from_line(text)
        assert parsed == wanted
# Run the self-checks only when this file is executed as a script.
if __name__ == '__main__':
    test()
-
3 It looks like a `dec_int_addr` can only be present if there is a `hex_int_addr`. This could be included in the regex pattern by nesting the `dec_int_addr` pattern inside the `hex_int_addr` pattern. – RootTwo, commented Sep 5, 2022 at 0:52
`is_valid_str_addr` and `hex_addr`.
`hex_addr().compress`?