Simple text parser using regexes

Question 1

I'm trying to write a simple parser using regexes. This is what I currently have, but it looks really messy. Any tips on what I can change?

from re import compile
from typing import Dict, Iterator, List, NamedTuple, TextIO
# PATTERNS
# Register values are matched as exactly sixteen lowercase hex digits.
_HEX16 = r'[0-9a-f]{16}'


def _register_line(*names, indent=''):
    """Return the regex source for one line of ``name=value`` registers.

    Each register gets its own capture group holding the whole
    ``name=value`` text.  Groups are separated by a single space and the
    line must end with a newline.

    Args:
        *names: Register names, in the order they appear on the line.
        indent: Literal text expected before the first register.

    Returns:
        The regex source string for the line.
    """
    return indent + ' '.join(f'({name}={_HEX16})' for name in names) + r'\n'


# One compiled pattern per register line, in the order the lines are parsed.
registers_pattern = [
    compile(_register_line('rax', 'rbx', 'rcx')),
    compile(_register_line('rdx', 'rsi', 'rdi')),
    compile(_register_line('rip', 'rsp', 'rbp')),
    # The r8 line is expected to start with a single space.
    compile(_register_line('r8', 'r9', 'r10', indent=' ')),
    compile(_register_line('r11', 'r12', 'r13')),
    compile(_register_line('r14', 'r15')),
]
# "iopl=<hex>" followed by exactly eight two-letter codes.
flags_pattern = compile(r'iopl=[0-9a-f]+(?:\s+[a-z]{2}){8}\n')
# Six "xx=<4 hex digits>" fields followed by "efl=<8 hex digits>".
segments_pattern = compile(r'(?:[a-z]{2}=[0-9a-f]{4}\s+){6}efl=[0-9a-f]{8}\n')
# Word characters and '+' followed by a colon.
label_pattern = compile(r'[\w\+]+:\n')
# "<8 hex>`<8 hex>", the hex encoding, then the instruction text; the trailing
# newline is optional so that a final line without one still matches.
instruction_pattern = compile(
    r'[0-9a-f]{8}`[0-9a-f]{8}\s+(?P<ENCODING>[0-9a-f]+)\s+(?P<INSTRUCTION>.+?)\n?'
)
class Instruction(NamedTuple):
    """One parsed trace record: an instruction plus its register values."""

    # Text of the instruction.
    instruction: str
    # Encoding of the instruction, kept as a string.
    encoding: str
    # Mapping of register name to register value, both kept as strings.
    registers: Dict[str, str]
def _read_line(numbered_lines, previous_index: int):
    """Return the next ``(index, line)`` pair from *numbered_lines*.

    Args:
        numbered_lines: Iterator yielding ``(index, line)`` pairs.
        previous_index: Number of the line consumed last; used only in the
            error message.

    Raises:
        RuntimeError: If the iterator is exhausted, i.e. the input ended in
            the middle of a record.  Raising explicitly keeps a
            ``StopIteration`` from escaping the calling generator, which
            Python would report as an uninformative ``RuntimeError``.
    """
    entry = next(numbered_lines, None)
    if entry is None:
        raise RuntimeError(f'Unexpected end of input after line: {previous_index}')
    return entry


def _match_line(pattern, index: int, line: str):
    """Return the full match of *pattern* on *line*.

    Raises:
        RuntimeError: If *line* does not match; the message names *index*.
    """
    match = pattern.fullmatch(line)
    if match is None:
        raise RuntimeError(f'Failed to parse line: {index}')
    return match


def parse_trace(stream: TextIO) -> Iterator[Instruction]:
    """Parse a trace into ``Instruction`` records.

    The stream is read as repeated records made of the register lines
    described by ``registers_pattern``, followed by one flags line, one
    segments line, one label line and one instruction line.

    Args:
        stream: Text stream to read the trace from, line by line.

    Yields:
        One ``Instruction`` per record, built from the instruction text, its
        encoding and the register values parsed from that record.

    Raises:
        RuntimeError: If a line does not match the pattern expected at its
            position (the message carries the 1-based number of that line),
            or if the stream ends in the middle of a record.
    """
    numbered_lines = enumerate(stream, start=1)
    for index, line in numbered_lines:
        # Parse general-purpose registers
        registers: Dict[str, str] = {}
        for pattern in registers_pattern:
            match = _match_line(pattern, index, line)
            # Every group is "name=value"; store it and step to the next line.
            registers.update(group.split('=') for group in match.groups())
            index, line = _read_line(numbered_lines, index)

        # `line` now holds the line that follows the last register line.
        _match_line(flags_pattern, index, line)

        # Keep `index` in step with every line read so that errors report
        # the line that actually failed.
        index, line = _read_line(numbered_lines, index)
        # TODO: here will be something
        _match_line(segments_pattern, index, line)

        index, line = _read_line(numbered_lines, index)
        _match_line(label_pattern, index, line)

        index, line = _read_line(numbered_lines, index)
        match = _match_line(instruction_pattern, index, line)

        yield Instruction(match.group('INSTRUCTION'), match.group('ENCODING'), registers)
# Example of usage:
from io import StringIO

# Two identical sample records; the very last line has no trailing newline.
trace = StringIO("""rax=0000000000000000 rbx=0000000000000000 rcx=0000000000000000
rdx=0000000000000000 rsi=0000000000000000 rdi=0000000000000000
rip=000000000040100a rsp=0000000000000000 rbp=0000000000000000
 r8=0000000000000000 r9=0000000000000000 r10=0000000000000000
r11=0000000000000000 r12=0000000000000000 r13=0000000000000000
r14=0000000000000000 r15=0000000000000000
iopl=0 nv up ei pl zr na po nc
cs=0033 ss=002b ds=002b es=002b fs=0053 gs=002b efl=00000246
lol+0x1000:
00000000`00401000 48bba47868302035e80c mov rbx,0CE83520306878A4h
rax=0000000000000000 rbx=0000000000000000 rcx=0000000000000000
rdx=0000000000000000 rsi=0000000000000000 rdi=0000000000000000
rip=000000000040100a rsp=0000000000000000 rbp=0000000000000000
 r8=0000000000000000 r9=0000000000000000 r10=0000000000000000
r11=0000000000000000 r12=0000000000000000 r13=0000000000000000
r14=0000000000000000 r15=0000000000000000
iopl=0 nv up ei pl zr na po nc
cs=0033 ss=002b ds=002b es=002b fs=0053 gs=002b efl=00000246
lol+0x1000:
00000000`00401000 48bba47868302035e80c mov rbx,0CE83520306878A4h""")
# Print the instruction text of every record parsed from the sample.
for each in parse_trace(trace):
    print(each.instruction)
```

Question 2

Is your goal to create the parser using regexes, or do you just want to parse the input into Instruction objects as cleanly as possible? Will the input be coming from a file, or must it accept any TextIO object? If it is being read from a file, and this instruction parser can open the file, it could use fileinput.filelineno to keep track of line numbers.

Question 3

Instead of trying to match whole lines, it would be simpler to match smaller chunks of the input and then combine the smaller regexes. For example, one regex could match any of the registers, a different one could match the segment registers, etc. The order of the patterns may be important if one regex is a prefix of another, but that doesn't seem to apply in this case.

Use capture groups to get the important information, and MatchObject.lastgroup to see which part of the regex matched.

The flags pattern probably needs to have other codes added.

The (?x) in the pattern is for verbose mode, which lets you add whitespace and comments to the regex pattern.

# `re` is imported here because the script calls `re.compile` below.
import re

# One alternative per kind of token; the named groups identify what matched.
pattern = r"""(?x)
 (?P<REG>r\w{1,2})=(?P<RVAL>[0-9a-f]{16}) # registers
 |iopl=(?P<IOPL>[0-9a-f]+) # iopl
 |(?P<FLAGS>(\s+(?:nv|up|ei|pl|zr|na|po|nc))+) # flags
 |(?P<SREG>[csdefg]s|efl)=(?P<SVAL>[0-9a-f]+) # segments
 |(?P<LABEL>^[\w\+]+:) # label
 |[0-9a-f]{8}`[0-9a-f]{8}\s+(?P<ENCODING>[0-9a-f]+)\s+(?P<INSTRUCTION>.+)
 |(?P<NL>\n)
 """
regex = re.compile(pattern)

registers = {}
lineno = 1
for line in trace:
    print(f"line = {lineno}")

    for mo in regex.finditer(line):
        # `lastgroup` names the group that identifies the matched alternative.
        group_name = mo.lastgroup

        if group_name == 'NL':
            # A newline token ends the current input line.
            lineno += 1

        elif group_name == "RVAL":
            registers[mo['REG']] = mo['RVAL']
            print(f" {mo['REG']} = {mo['RVAL']}")

        elif group_name == "IOPL":
            print(f" iopl = {mo['IOPL']}")

        elif group_name == "FLAGS":
            print(f" FLAGS = {mo['FLAGS'].strip().split()}")

        elif group_name == "SVAL":
            print(f" {mo['SREG']} = {mo['SVAL']}")

        elif group_name == "LABEL":
            print(f" LABEL = {mo['LABEL']}")

        elif group_name == "INSTRUCTION":
            print(f" {mo['INSTRUCTION']} = {mo['ENCODING']}")

For the sample input, the code outputs:

line = 1
 rax = 0000000000000000
 rbx = 0000000000000000
 rcx = 0000000000000000
line = 2
 rdx = 0000000000000000
 rsi = 0000000000000000
 rdi = 0000000000000000
line = 3
 rip = 000000000040100a
 rsp = 0000000000000000
 rbp = 0000000000000000
line = 4
 r8 = 0000000000000000
 r9 = 0000000000000000
 r10 = 0000000000000000
line = 5
 r11 = 0000000000000000
 r12 = 0000000000000000
 r13 = 0000000000000000
line = 6
 r14 = 0000000000000000
 r15 = 0000000000000000
line = 7
 iopl = 0
 FLAGS = ['nv', 'up', 'ei', 'pl', 'zr', 'na', 'po', 'nc']
line = 8
 cs = 0033
 ss = 002b
 ds = 002b
 es = 002b
 fs = 0053
 gs = 002b
 efl = 00000246
line = 9
 LABEL = lol+0x1000:
line = 10
 mov rbx,0CE83520306878A4h = 48bba47868302035e80c
line = 11
 rax = 0000000000000000
 rbx = 0000000000000000
 rcx = 0000000000000000
line = 12
 rdx = 0000000000000000
 rsi = 0000000000000000
 rdi = 0000000000000000
line = 13
 rip = 000000000040100a
 rsp = 0000000000000000
 rbp = 0000000000000000
line = 14
 r8 = 0000000000000000
 r9 = 0000000000000000
 r10 = 0000000000000000
line = 15
 r11 = 0000000000000000
 r12 = 0000000000000000
 r13 = 0000000000000000
line = 16
 r14 = 0000000000000000
 r15 = 0000000000000000
line = 17
 iopl = 0
 FLAGS = ['nv', 'up', 'ei', 'pl', 'zr', 'na', 'po', 'nc']
line = 18
 cs = 0033
 ss = 002b
 ds = 002b
 es = 002b
 fs = 0053
 gs = 002b
 efl = 00000246
line = 19
 LABEL = lol+0x1000:
line = 20
 mov rbx,0CE83520306878A4h = 48bba47868302035e80c

Obviously, do something useful instead of just printing the information.

Question 4

This line is repeated many times

raise RuntimeError(f'Failed to parse line: {index}')

I would create a function for it like so:

def parseError(index):
    """Raise a ``RuntimeError`` saying that line *index* failed to parse."""
    message = f'Failed to parse line: {index}'
    raise RuntimeError(message)

and by calling this function you avoid repeating the format string over and over.

Also this pattern =[0-9a-f]{16} is repeated 17 times in your definition. You could define it once as a variable, and then build those regex strings with some combination of format strings, list, dict, and/or functions to reduce the repetition.

for example

# The value part shared by every register: '=' and sixteen lowercase hex digits.
p = r'=[0-9a-f]{16}'


def pattern(prefixes):
    """Build the regex source for one line of registers.

    Args:
        prefixes: Register names, in the order they appear on the line.

    Returns:
        A regex source string with one capture group per register (the name
        followed by the shared value pattern), groups separated by a single
        space, ending with a newline.
    """
    # build the format string from the prefixes
    return ' '.join(f'({prefix}{p})' for prefix in prefixes) + r'\n'


registers_pattern = list(map(compile, [
    pattern(['rax', 'rbx', 'rcx']),
    pattern(['rdx', 'rsi', 'rdi']),
    pattern(['rip', 'rsp', 'rbp']),
    ' ' + pattern(['r8', 'r9', 'r10']),  # this line starts with a space
    pattern(['r11', 'r12', 'r13']),
    pattern(['r14', 'r15']),
]))

RootTwo RootTwo 10.6k1 gold badge14 silver badges30 bronze badges · Accepted Answer · 2020-09-03 22:19:34Z

Instead of trying to match whole lines, it would be simpler to match smaller chunks of the input and then combine the smaller regexes. For example, one regex could match any of the registers, a different one could match the segment registers, etc. The order of the patterns may be important if one regex is a prefix of another, but that doesn't seem to apply in this case.

Use capture groups to get the important information, and MatchObject.lastgroup to see which part of the regex matched.

The flags pattern probably needs to have other codes added.

The (?x) in the pattern is for verbose mode, which lets you add whitespace and comments to the regex pattern.

# `re` is imported here because the script calls `re.compile` below.
import re

# One alternative per kind of token; the named groups identify what matched.
pattern = r"""(?x)
 (?P<REG>r\w{1,2})=(?P<RVAL>[0-9a-f]{16}) # registers
 |iopl=(?P<IOPL>[0-9a-f]+) # iopl
 |(?P<FLAGS>(\s+(?:nv|up|ei|pl|zr|na|po|nc))+) # flags
 |(?P<SREG>[csdefg]s|efl)=(?P<SVAL>[0-9a-f]+) # segments
 |(?P<LABEL>^[\w\+]+:) # label
 |[0-9a-f]{8}`[0-9a-f]{8}\s+(?P<ENCODING>[0-9a-f]+)\s+(?P<INSTRUCTION>.+)
 |(?P<NL>\n)
 """
regex = re.compile(pattern)

registers = {}
lineno = 1
for line in trace:
    print(f"line = {lineno}")

    for mo in regex.finditer(line):
        # `lastgroup` names the group that identifies the matched alternative.
        group_name = mo.lastgroup

        if group_name == 'NL':
            # A newline token ends the current input line.
            lineno += 1

        elif group_name == "RVAL":
            registers[mo['REG']] = mo['RVAL']
            print(f" {mo['REG']} = {mo['RVAL']}")

        elif group_name == "IOPL":
            print(f" iopl = {mo['IOPL']}")

        elif group_name == "FLAGS":
            print(f" FLAGS = {mo['FLAGS'].strip().split()}")

        elif group_name == "SVAL":
            print(f" {mo['SREG']} = {mo['SVAL']}")

        elif group_name == "LABEL":
            print(f" LABEL = {mo['LABEL']}")

        elif group_name == "INSTRUCTION":
            print(f" {mo['INSTRUCTION']} = {mo['ENCODING']}")

For the sample input, the code outputs:

line = 1
 rax = 0000000000000000
 rbx = 0000000000000000
 rcx = 0000000000000000
line = 2
 rdx = 0000000000000000
 rsi = 0000000000000000
 rdi = 0000000000000000
line = 3
 rip = 000000000040100a
 rsp = 0000000000000000
 rbp = 0000000000000000
line = 4
 r8 = 0000000000000000
 r9 = 0000000000000000
 r10 = 0000000000000000
line = 5
 r11 = 0000000000000000
 r12 = 0000000000000000
 r13 = 0000000000000000
line = 6
 r14 = 0000000000000000
 r15 = 0000000000000000
line = 7
 iopl = 0
 FLAGS = ['nv', 'up', 'ei', 'pl', 'zr', 'na', 'po', 'nc']
line = 8
 cs = 0033
 ss = 002b
 ds = 002b
 es = 002b
 fs = 0053
 gs = 002b
 efl = 00000246
line = 9
 LABEL = lol+0x1000:
line = 10
 mov rbx,0CE83520306878A4h = 48bba47868302035e80c
line = 11
 rax = 0000000000000000
 rbx = 0000000000000000
 rcx = 0000000000000000
line = 12
 rdx = 0000000000000000
 rsi = 0000000000000000
 rdi = 0000000000000000
line = 13
 rip = 000000000040100a
 rsp = 0000000000000000
 rbp = 0000000000000000
line = 14
 r8 = 0000000000000000
 r9 = 0000000000000000
 r10 = 0000000000000000
line = 15
 r11 = 0000000000000000
 r12 = 0000000000000000
 r13 = 0000000000000000
line = 16
 r14 = 0000000000000000
 r15 = 0000000000000000
line = 17
 iopl = 0
 FLAGS = ['nv', 'up', 'ei', 'pl', 'zr', 'na', 'po', 'nc']
line = 18
 cs = 0033
 ss = 002b
 ds = 002b
 es = 002b
 fs = 0053
 gs = 002b
 efl = 00000246
line = 19
 LABEL = lol+0x1000:
line = 20
 mov rbx,0CE83520306878A4h = 48bba47868302035e80c

Obviously, do something useful instead of just printing the information.

Stack Exchange Network

Simple text parser using regexes

2 Answers 2

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Simple text parser using regexes

2 Answers 2

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions