I've been working on a general lexer for a programming language for a couple of days now. I don't know whether the code I've written is overcomplicated, whether there is a better way to split the code into tokens, or whether my code could be made simpler with a library.
Code:
from enum import Enum
from typing import Any
OPERATORS = list("+~-*/^&<>()[]{}=!?|,")
# TODO: add double and triple-char operator support
#DOUBLECHAROPERATORS = ["|>", "<|", "!=", "==", "<<", ">>"]
#TRIPLECHAROPERATORS = [""]
NAMECHARS = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ._')
STRCHARS = [chr(i) for i in range(0,256)]
NUMS = list('1234567890.')
LINEFEEDS = list(';\n')
CMTLINEFEEDS = ['\n']
SLICOMMENTS = ['#', '//']
MLICOMMENTS = [['\\*', '*/']]
STRINGCHARS = list('\'\"')
[STRCHARS.remove(i) for i in STRINGCHARS + ['\n']]
class TokenTypes(Enum):
SPECIAL = -1
OP = 0
NAME = 1
COMMENT = 2
LINEFEED = 3
STRING = 4
NUMBER = 5
class Token:
def __init__(self, type: int, data: Any | str):
self.type = type
self.data = data
def __repr__(self):
return f"Token({self.type}, {repr(self.data)})"
class Modes:
class ModeBool():
boolean = False
def __init__(self, val: bool = False):
self.boolean = val
def __neg__(self):
self.boolean = False
def __pos__(self):
self.boolean = True
def __inv__(self):
self.boolean = not self.boolean
def __bool__(self):
return self.boolean
def __repr__(self):
return str(self.boolean)
REGULAR = ModeBool(True)
STRING = ModeBool(False)
COMMENT = ModeBool(False)
COMMENTML = ModeBool(False)
NAME = ModeBool(False)
NUM = ModeBool(False)
def __repr__(self):
return str({i:getattr(self, i) for i in dir(self) if not i.startswith('_') and i != 'ModeBool'})
class Lexer:
def __init__(self, src: str) -> None:
self.position = 0
self.src = src
self.tokens = []
self.modes = Modes()
self.currdata = {}
def parse(self) -> None:
self._main_parse_loop()
def _main_parse_loop(self) -> None:
while True:
if self.position == len(self.src):
return
char = self.src[self.position]
if not len(self.src) == self.position + 1:
nxtchar = self.src[self.position + 1]
if self.modes.REGULAR:
print(char, nxtchar)
if char == '/' and nxtchar == '*':
+self.modes.COMMENTML
-self.modes.REGULAR
self.position+=1
elif char in OPERATORS:
self.tokens.append(Token(TokenTypes.OP, char))
elif char in LINEFEEDS:
self.tokens.append(Token(TokenTypes.LINEFEED, char))
elif char in SLICOMMENTS:
+self.modes.COMMENT
-self.modes.REGULAR
self.currdata = {}
self.position-=1
elif char in NAMECHARS:
-self.modes.REGULAR
+self.modes.NAME
self.position-=1
elif char in STRINGCHARS:
-self.modes.REGULAR
+self.modes.STRING
self.position-=1
elif char in NUMS:
-self.modes.REGULAR
+self.modes.NUM
self.position-=1
elif self.modes.COMMENT:
if not self.currdata.get('notfirst'):
self.currdata['notfirst'] = 1
self.currdata['str'] = ''
if char in LINEFEEDS:
self.tokens.append(Token(TokenTypes.COMMENT, self.currdata['str']))
self.currdata = {}
-self.modes.COMMENT
+self.modes.REGULAR
self.position-=1
else:
self.currdata['str'] += char
elif self.modes.COMMENTML:
if not self.currdata.get('notfirst'):
self.currdata['notfirst'] = 1
self.currdata['str'] = '/*'
if char == '*' and nxtchar == '/':
self.currdata['str'] += '*/'
self.tokens.append(Token(TokenTypes.COMMENT, self.currdata['str']))
self.currdata = {}
-self.modes.COMMENTML
+self.modes.REGULAR
self.position+=1
else:
self.currdata['str'] += char
elif self.modes.NAME:
if not self.currdata.get('notfirst'):
self.currdata['notfirst'] = 1
self.currdata['str'] = ''
if char in NAMECHARS:
self.currdata['str'] += char
if char in LINEFEEDS or char in OPERATORS or char == ' ':
print('a') if self.currdata['str'] == '' else None
self.tokens.append(Token(TokenTypes.NAME, self.currdata['str']))
self.currdata = {}
-self.modes.NAME
+self.modes.REGULAR
if not char == ' ':
self.position-=1
elif self.modes.STRING:
if not self.currdata.get('notfirst'):
self.currdata['notfirst'] = 1
self.currdata['str'] = char
elif char in STRCHARS or char == '\n':
self.currdata['str'] += char
elif char in STRINGCHARS:
self.currdata['str'] += char
self.tokens.append(Token(TokenTypes.STRING, self.currdata['str']))
self.currdata = {}
-self.modes.STRING
+self.modes.REGULAR
#self.position-=1
elif self.modes.NUM:
if not self.currdata.get('notfirst'):
self.currdata['notfirst'] = 1
self.currdata['str'] = ''
if char in NUMS:
self.currdata['str'] += char
if char in LINEFEEDS or char in OPERATORS:
print('a') if self.currdata['str'] == '' else None
self.tokens.append(Token(TokenTypes.NUMBER, self.currdata['str']))
self.currdata = {}
-self.modes.NUM
+self.modes.REGULAR
self.position-=1
self.position += 1
print(self.position, self.modes, char)
if __name__ == '__main__':
code = """
import github.octocat.gopher.libgopher as go
if go.enabled != true {exit()} # Very much a test ----+-
compile_hello = () -> {go.compile('Hello, World!')}
print(compile_hello())
compile_hello() |> print
"VeryWell1000000" |> print
for i in range(0,5) {
print(i)
}
/*
this
should
not
be parsed *-*
*/
20 & 30 | 50
a = "
never
gonna
give
you
up
"
b = "never gonna let you down"; rick = a + b
/*
regex = r'.*'
*/
if not b + 1 == 2 {print('a')}
"""
parser: Lexer = Lexer(code)
parser.parse()
[print(i) for i in parser.tokens]
print(' '.join([i.data for i in parser.tokens]))
print(code)
4 Answers
Don't Use List Comprehensions Solely for Their Side Effects
In several places you have code similar to:
[print(i) for i in parser.tokens]
You are creating a list whose elements will be the return values from calling the print function. And what are you doing with this list? Nothing, of course. Then why create the list? This style of coding is non-Pythonic (not to mention inefficient). Just do:
for i in parser.tokens:
print(i)
Use Logging for Outputting Debugging Information
Your code has several print calls that appear to be for debugging purposes. For example:
...
if self.modes.REGULAR:
print(char, nxtchar)
...
If so, it would be preferable to use the logging API. For example:
import logging
# Set the level to logging.DEBUG to output debugging info:
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.CRITICAL)
logger = logging.getLogger()
...
if self.modes.REGULAR:
logger.debug("%s %s", char, nxtchar)
...
Better Naming
You have:
...
parser: Lexer = Lexer(code)
parser.parse()
...
There is a huge difference between a lexical analyzer, which generally recognizes tokens that can be defined by using a regular language (or equivalently, a regular expression), and a parser, which uses those tokens to build, for example, an abstract syntax tree based on a grammar that, in general, is not regular. To make a long story short, a better name for a Lexer instance would be lexer. Likewise, I suggest renaming method parse to lex (something other than parse).
Use Generator Expressions Instead of Comprehensions When a List Is Not Required
You have:
...
print(' '.join([i.data for i in parser.tokens]))
...
We don't really need to create a list for use with join:
...
print(' '.join(token.data for token in lexer.tokens))
...
Note also that I have renamed variable i to the more meaningful token (and parser to lexer).
How Will These Tokens Be Used?
Are we just generating tokens and never using them in a parser? Probably not. You generally start out defining a grammar for your language that will be parsed by your parser. Let's say that our parser will be based on LALR(1) grammars. An LALR(1) parser is driven by a two-dimensional table indexed by "the current state number" and "the current lexical token", both of which are integers. Your definition of a Token is (in part):
class Token:
def __init__(self, type: int, data: Any | str):
self.type = type
self.data = data
Where you claim that the type attribute is an int. But when you instantiate a token, your code is:
Token(TokenTypes.NUMBER, self.currdata['str'])
But TokenTypes is a subclass of enum.Enum:
class TokenTypes(Enum):
SPECIAL = -1
OP = 0
NAME = 1
COMMENT = 2
LINEFEED = 3
STRING = 4
NUMBER = 5
Consequently TokenTypes.NUMBER is not an int. This is not a big deal; it's just something I happened to catch. Anyway, you can derive an int value with the expression TokenTypes.NUMBER.value, which is what is important, and so this definition of TokenTypes is perfectly fine. But one normally starts with a grammar, from which the tokens can be derived. The person coding the grammar and the person coding the lexer may not be the same person. So one of the outputs from running a parser generator is usually a module that can be imported defining the tokens. Typically, it might be a file looking like:
from enum import Enum
class TokenTypes(Enum):
SPECIAL = -1
OP = 0
NAME = 1
COMMENT = 2
LINEFEED = 3
STRING = 4
NUMBER = 5
So in real life your token definitions would probably be within a module you import.
The parser generally does not request all of the tokens at once but rather calls a method such as lexer.next_token() to get the next token. It would be nice if you provided this. This could be implemented simply as:
class Lexer:
def parse(self) -> None:
self._main_parse_loop()
self._n_tokens = len(self.tokens)
self._token_index = 0
...
def next_token(self):
"""Return the next token."""
if self._token_index < self._n_tokens:
token = self.tokens[self._token_index]
self._token_index += 1
else:
token = Token(TokenTypes.EOF, '<EOF>')
return token
In the above function we have defined an additional token, which we might call EOF, representing the end of input:
class TokenTypes(Enum):
...
EOF = 6
The purpose for this is to make sure there are no extraneous tokens following a stream of tokens that can be parsed into a sentence of the grammar. For example, if we had the grammar:
expression -> number + number;
Valid input might be: "1 + 3" but not "1 + 3 + 4". Just because we were able to reduce the tokens "1", "+" and "3" to an expression does not mean the input was valid. So we normally use an "augmented" grammar:
expression_prime -> expression EOF;
expression -> number + number;
We now have a new goal symbol, expression_prime, which will reject "1 + 3 + 4 EOF" but accept "1 + 3 EOF".
What if, in doing your lexical analysis, you come upon an ill-formed token (e.g. a string that is not properly terminated)? The lexer and parser should have an additional token type called ERROR, which the lexer returns when it finds an illegal token. The grammar will generally include special productions to handle errors discovered by the lexer or parser that will include the ERROR token.
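As a rough sketch (not part of the original code), the end of the main loop could emit such a token when a string mode never terminates; ERROR here is an assumed extra member of TokenTypes:
# Sketch only: assumes TokenTypes has gained an ERROR member, e.g. ERROR = 7.
# Run after the main loop has consumed all input:
if self.modes.STRING and self.position >= len(self.src):
    # An opening quote was seen but the string was never closed.
    self.tokens.append(Token(TokenTypes.ERROR, self.currdata.get('str', '')))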
A Better Way? Could Regular Expressions Be Used to Your Advantage?
I honestly did not spend too much time trying to fully understand the code; it just seemed too complicated. I can't speak for others, but I would have a difficult time maintaining it. Here is an example of how I might code a lexer for a desk calculator that allows arithmetic expressions and assignments:
File tokens.py
"""
This module defines class LexValues, which a lexer (lexical analyzer) class
should mix in and the definition of a Token class.
"""
import collections
Token = collections.namedtuple('Token', ['token_number', 'value'])
class LexValues:
def __init__(self):
super().__init__()
self.EOFSYM = 0
self.ERROR = 1
self.LPAREN = 2
self.RPAREN = 3
self.TIMES = 4
self.NUMBER = 5
self.ID = 6
self.ASSIGNMENT = 7
self.ADDOP = 8
self.DIVOP = 9
File lexer.py
"""
This is a lexer for a desk calculator supporting assignments and numeric
expressions.
"""
import re
import decimal
from tokens import *
class Lexer(LexValues):
def __init__(self, txt):
super().__init__()
tokens = (
('WHITESPACE', -1, r'\s+'),
('LPAREN', self.LPAREN, r'\('),
('RPAREN', self.RPAREN, r'\)'),
('TIMES', self.TIMES, r'\*'),
('ADDOP', self.ADDOP, r'[+-]'),
('DIVOP', self.DIVOP, r'[/%]'),
('NUMBER', self.NUMBER, r'\d+\.?\d*|\.\d+'),
('ID', self.ID, r'[A-Za-z][A-Za-z0-9_]*'),
('ASSIGNMENT', self.ASSIGNMENT, r'='),
# This must be the last entry and matches anything that the previous
# expressions do not match:
('ERROR', self.ERROR, r'.')
)
self.regex = re.compile('|'.join('(?P<%s>%s)' % (token[0], token[2]) for token in tokens))
self.scanner = self.regex.scanner(txt)
self.token_numbers = {token[0]: token[1] for token in tokens}
def next_token(self):
"""Return Token consisting of a token number and token value."""
while (m := self.scanner.match()) and m.lastgroup == 'WHITESPACE':
pass
if not m:
return Token(self.EOFSYM, '<EOF>')
token_type = m.lastgroup
token_number = self.token_numbers[m.lastgroup]
token_value = m.group()
if token_number == self.NUMBER:
# We will be using Decimal math!
token_value = decimal.Decimal(token_value)
return Token(token_number, token_value)
if __name__ == '__main__':
lexer = Lexer('x = 17.0 + 19.3 / 6.1')
while True:
token = lexer.next_token()
print(f'token number = {token.token_number}, value = {token.value!r}')
if token.token_number == lexer.EOFSYM:
break
Prints:
token number = 6, value = 'x'
token number = 7, value = '='
token number = 5, value = Decimal('17.0')
token number = 8, value = '+'
token number = 5, value = Decimal('19.3')
token number = 9, value = '/'
token number = 5, value = Decimal('6.1')
token number = 0, value = '<EOF>'
- "We don't really need to create a list for use with join" - that's the function where such a construct may be reasonable. I doubt it makes any measurable difference at this scale, but ''.join(['abcde' for _ in range(100000)]) is faster than the same using a generator! I always advocate for "drop that listcomp unless necessary", but str.join is the most famous counterexample :) stackoverflow.com/questions/34822676/… – STerliakov, Jan 4, 2025
- for token in self.tokens: yield token is conventionally spelled as yield from self.tokens. – STerliakov, Jan 4, 2025
- @STerliakov Thank you and thank you. However, as it turns out, what is needed for next_token is not a generator, since the function is meant to be called repeatedly in a loop with token = lexer.next_token(), so I have updated the function definition. – Booboo, Jan 5, 2025
- I can't say I am a fan of prescribing regular expressions for a lexer (or parser) in general. Tellingly, your example does NOT consider string parsing, which, in the presence of escapes or raw string modes, can get fairly non-regular... (though regex extensions may still allow getting something that works). – Matthieu M., Jan 6, 2025
- @MatthieuM. How about this? lex, that classic tool for performing lexical analysis that is integrated with yacc, is based on regular expressions. Regular expressions also make it clear (at least clearer) what, for example, a number is. The FSA required for recognizing tokens is now implemented by the re module. Also, most programming languages are not regular, and you could not therefore use regular expressions to build a parser. – Booboo, Jan 6, 2025
Thank you for sharing this lexer. There's a lot of good code in it.
EBNF
The biggest thing missing from it is a Backus-Naur grammar, as the _main_parse_loop is definitely not self-documenting.
The first thing any user of your language will want
to know is, "what can I write?"
That is, what is the set of valid source code inputs?
Decades of experience shows that describing this through
code, or through English prose, will not adequately serve
this need.
So you will need a grammar (or perhaps the equivalent railroad diagram) to show to the user. You could attempt to maintain an evolving grammar alongside an evolving parse loop, but that sounds very error prone. Much better to have code consume the grammar and follow its instructions.
Consider adopting Lark, or perhaps one of the PEG parsers.
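For instance, here is a rough sketch of what a Lark-driven approach could look like (the grammar below is a toy example, not the OP's actual syntax):
# Illustrative only: a tiny Lark grammar, not the OP's real language.
from lark import Lark

grammar = r"""
    start: stmt+
    stmt: NAME "=" expr _NL
    expr: expr "+" atom   -> add
        | atom
    atom: NUMBER | NAME | ESCAPED_STRING

    _NL: /(\r?\n)+/
    %import common.CNAME -> NAME
    %import common.NUMBER
    %import common.ESCAPED_STRING
    %import common.WS_INLINE
    %ignore WS_INLINE
"""

parser = Lark(grammar)                  # Earley parser by default
tree = parser.parse('x = 1 + "hi"\n')
print(tree.pretty())                    # tokens and structure both come from the grammar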
POLA
+self.modes.COMMENTML
-self.modes.REGULAR
self.position += 1
Those first two lines are not good.
A C programmer might write --regular,
but this is python code.
Much better to express that intent with ... -= 1,
or simply assign ... = False.
The OP code requires a maintenance engineer to go hunting through the code for these surprising definitions, which have non-traditional semantics:
def __neg__(self):
self.boolean = False
def __pos__(self):
self.boolean = True
Nesting ModeBool within Modes seems slightly clumsy,
as revealed by the ... and i != "ModeBool" conjunct.
Prefer to organize these in a separate mode.py module,
with both up at module level.
annotation
I thank you kindly for optional type annotations -- they help me to better understand your intent.
But it's clear you've not asked the machine to read those annotations. Always use a type checker if you go to the trouble of writing them, to verify they make sense. It's an aid to the author, allowing you to avoid silly errors before even running any unit tests.
If you've written an >>> expression comment in a docstring,
definitely verify with $ python -m doctest *.py.
If you've written annotations, verify them with
pyright or mypy --strict.
On which topic: adding docstrings to some of the methods and all of the classes would benefit this project.
extra annotation
parser: Lexer = Lexer(code)
I would expect a type checker to infer the type from a simple parser = Lexer(code) assignment.
meaningless annotation
def __init__( ..., data: Any | str):
This looks like a # XXX restrict to just string TODO comment,
the sort of thing we try to clean up prior to submitting
a Pull Request code review.
Write it as just Any.
Change it when your codebase expects only string input.
Consider making Token a @dataclass, to slightly simplify the constructor.
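A minimal sketch of that, reusing the TokenTypes enum already defined in the OP's code:
from dataclasses import dataclass
from typing import Any

@dataclass
class Token:
    """__init__, __repr__ and __eq__ are generated for us."""
    type: TokenTypes   # the enum from the OP's module
    data: Any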
generator
The while True: ... print ...
approach to serving up
parsed results won't scale beyond your initial PoC testing,
and the .tokens
result list is very batch oriented,
not well suited to online immediate error reporting.
The caller should be requesting tokens from an iterable:
lexer = Lexer(code)
for token in lexer.parse():
print(token)
The parse loop will yield each token.
That sets you up for raising a diagnostic exception
at the point where a parse failure occurs.
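A sketch of that shape; _scan_one_token is a hypothetical helper wrapping one pass of the existing loop body, not something the OP already has:
from typing import Iterator

class Lexer:
    ...
    def parse(self) -> Iterator[Token]:
        """Yield tokens one at a time instead of batching them in self.tokens."""
        while self.position < len(self.src):
            token = self._scan_one_token()   # hypothetical: one step of the current loop
            if token is not None:
                yield token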
logging
There is a great deal of print() debugging output,
which can be fine during initial development.
But do yourself a favor, and import logging.
Then you can hang onto INFO level debug statements
in the codebase, while typically running at WARN level.
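A minimal sketch of that setup (one of several reasonable configurations):
import logging

logging.basicConfig(level=logging.WARNING)   # day-to-day level; lower to INFO/DEBUG when digging
log = logging.getLogger(__name__)

# inside the parse loop, instead of print(...):
# log.info("pos=%d mode=%r char=%r", self.position, self.modes, char)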
- The print statements are for debugging. I'll use loguru in the final code. – Xandaaah, Jan 5, 2025
I mostly agree with the existing answers; let's hope I don't add too many duplicated notes.
__inv__ magic method does not exist
You probably wanted to define __invert__ instead. Fortunately, you never use bitwise inversion later, so this went unnoticed. But this shouldn't matter, as...
Modes is probably misdesigned
Using +x, -x and ~x for in-place modification is a bad idea; see @J_H's answer. However, let's venture one step further: your modes are all mutually exclusive, that is, one and only one mode is active at any time (counting -x; +y as a single "switch mode to" operation). Well, then it isn't Modes, it's just a class Mode(Enum), and the lexer should keep a reference to the current mode. This prevents accidentally having multiple modes turned on (or none). Spell "switch to mode X" as self.mode = Mode.X.
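A sketch of that shape, with member names mirroring the existing ModeBool attributes:
from enum import Enum, auto

class Mode(Enum):
    REGULAR = auto()
    STRING = auto()
    COMMENT = auto()
    COMMENTML = auto()
    NAME = auto()
    NUM = auto()

# in the lexer:
#   self.mode = Mode.REGULAR
# and, e.g., when a multi-line comment starts:
#   self.mode = Mode.COMMENTML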
If that was a "future-proof solution", then explain that. You should have a pretty solid understanding of what is coming (at least something like "here are two modes I'll introduce next month - X and Y - and they must be able to coexist"). Even then it might be better to add more such mutually exclusive enums, but that depends on the use case. Don't add premature complexity - YAGNI.
Using an Enum will also provide a built-in solution similar to your __repr__ (print(Mode._member_map_)).
Dict-typed values
self.currdata uses a dictionary (a data structure mapping arbitrary hashable keys to arbitrary values) to represent storage for a few fixed attributes. That's a problem: once you make a typo somewhere, it'll be difficult to spot.
I only see two such attributes (notfirst and str). They can just become the lexer's own attributes. If you need more, you can always extract a helper @dataclass to store such state in a structured fashion. It might then be even more desirable to replace self.currdata = {} with something like self.state.clear().
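For instance, a sketch of such a helper (the attribute names are illustrative, not prescribed):
from dataclasses import dataclass

@dataclass
class ScanState:
    """Structured replacement for the currdata dict."""
    started: bool = False
    text: str = ''

    def clear(self) -> None:
        self.started = False
        self.text = ''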
Format, lint, test
I'm pleasantly surprised to see this code almost passing ruff's default auto-formatter, modulo quotes and line length in one place. If you formatted by hand, consider relying on black or ruff format instead; otherwise - great, please keep using such tools!
Consider also running flake8 with plugins or ruff on your code to catch some common problems. That would have prevented some (or even all) of my style comments at the end of this answer.
Finally, there's no way to ensure that your code does what is intended. Its output takes more than 5 screens; I'm physically unable to check that in any way other than freezing the output, making changes and diffing the files. If any diff is found, I'm left trying to figure out what could have caused it. Instead, prefer a testing framework (built-in unittest or the most popular pytest) and smaller tests, exercising all corner cases of the intended behaviour. This will make everyone much more confident when refactoring - I won't provide a "here's how I would write this" snippet in this answer, because I can't verify its correctness.
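For illustration, here is the shape such a test could take; the expected tokens are my reading of the current lexer's behaviour and would need to be checked against it:
from lexer import Lexer, TokenTypes   # assuming the code lives in lexer.py

def test_simple_expression():
    lexer = Lexer("a + b\n")
    lexer.parse()
    assert [(t.type, t.data) for t in lexer.tokens] == [
        (TokenTypes.NAME, "a"),
        (TokenTypes.OP, "+"),
        (TokenTypes.NAME, "b"),
        (TokenTypes.LINEFEED, "\n"),
    ]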
Enum - auto or unique?
Yes, your values are now unique, great. Consider asserting that with @enum.unique or dropping numeric values altogether and using enum.auto().
from enum import Enum, unique
@unique
class TokenTypes(Enum):
SPECIAL = -1
OP = 0
NAME = 1
COMMENT = 2
LINEFEED = 3
STRING = 4
NUMBER = 5
or
from enum import Enum, auto
class TokenTypes(Enum):
SPECIAL = auto()
OP = auto()
NAME = auto()
COMMENT = auto()
LINEFEED = auto()
STRING = auto()
NUMBER = auto()
Naming
A lot was said about naming, but nothing about this bit. Constants STRCHARS and STRINGCHARS look like aliases, but in fact they have nothing in common. Please rename one to QUOTE_CHARS, for example.
Minor comments
- if not len(self.src) == self.position + 1 is better spelled if len(self.src) != self.position + 1
- repr in an f-string can be shorthanded: f'Token({self.type}, {self.data!r})'
- if self.currdata["str"] == "" doesn't need comparison, just if not self.currdata["str"]. But...
- Please remove all "clever one-liners" such as side-effect-only ternaries and listcomps. That doesn't make your code any better.
- It feels a bit weird to see parens, brackets and braces as OPERATORS characters. Usually they're used as block delimiters, so { and * likely shouldn't belong to the same semantic group. Next steps of parsing will thank you for less ambiguity here.
Overall, the code looks good. The first suggestion I have is ...
Documentation
Start off by adding documentation to the top of the code to summarize its purpose. The most important things to describe are:
- What type of input source are you parsing
- What form does the output take
- What is the user expected to do with the output
The users of your code, and especially the maintainers of your code, will appreciate a clear high-level description of what it does. Six months after you started writing the code, you will even thank yourself for putting in the effort now.
The PEP 8 style guide recommends using docstrings:
"""
Lexer and tokenizer for FILL-IN-THE-BLANK
Input file or text is of format...
Output is of format...
"""
The PEP-8 guide also recommends docstrings for classes and functions.
Lexer is a fine name, but it could use some elaboration. For example:
class Lexer:
""" Lexing the FILL-IN-THE-BLANK format """
For the _main_parse_loop function, it would be helpful to document that it is printing output:
def _main_parse_loop(self) -> None:
""" Parse input and print to stdout """
Comments
To reduce clutter, delete all commented-out code and notes to yourself about future features:
# TODO: add double and triple-char operator support
#DOUBLECHAROPERATORS = ["|>", "<|", "!=", "==", "<<", ">>"]
#TRIPLECHAROPERATORS = [""]
#self.position-=1
You can keep track of future enhancements in a separate file in your version control system.
Naming
The style guide recommends separating multi-word constant names with an underscore. For example:
NAMECHARS
SLICOMMENTS
become:
NAME_CHARS
SLI_COMMENTS
This makes the code easier to read.
I recommend adding a comment for abbreviations like:
SLI
MLI
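For example (the expanded names and the '/*' opener are my reading of the abbreviations, not taken from the original code):
SLI_COMMENTS = ['#', '//']      # SLI = single-line comment introducers
MLI_COMMENTS = [['/*', '*/']]   # MLI = multi-line comment delimiter pairs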
Layout
It would be nice to align these lines to the =
operator:
REGULAR = ModeBool(True)
STRING = ModeBool(False)
COMMENT = ModeBool(False)
COMMENTML = ModeBool(False)
NAME = ModeBool(False)
NUM = ModeBool(False)
Also, it would be nice to add some blank lines between sections of code:
return str(self.boolean)
REGULAR = ModeBool(True)
STRING = ModeBool(False)
COMMENT = ModeBool(False)
COMMENTML = ModeBool(False)
NAME = ModeBool(False)
NUM = ModeBool(False)
def __repr__(self):
Simpler
The range function:
range(0,256)
is simpler without the optional 0:
range(256)
string
Long strings like this are error-prone:
abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
The string.ascii_letters function provides this for you.
string.digits also gives you 1234567890.
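For example, a sketch of the constants built from the string module, keeping the extra characters the lexer currently allows:
import string

NAME_CHARS = list(string.ascii_letters + '._')
NUMS = list(string.digits + '.')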
- "nice to align these lines to the = operator" I respectfully disagree. I mean sure, I see your point about visually scanning to notice the similarities, I have worked with and authored plenty of code like that, especially in C. But for this codebase, I found it not very legible until I ran $ black *.py, and I would much prefer the OP author do that routinely than attempt something at cross purposes like aligning equal signs. // If we want to emphasize that "most of these are False", then IDK maybe define a false_mode_bool temp var and make assignments with that? – J_H, Jan 4, 2025
- @J_H: Yes, I agree that auto-formatting would be better. – toolic, Jan 4, 2025
- Yep, please don't "align by equal sign", that's even explicitly discouraged by PEP 8 (link to the relevant section, last example) and will be fixed by ruff or black immediately. Please use a formatter and linter instead. – STerliakov, Jan 4, 2025
- @STerliakov: Thanks for the link. I didn't realize it was buried in PEP. – toolic, Jan 5, 2025