I've been working on a general lexer for a programming language for a couple of days now. I don't know whether the code I've written is overcomplicated, whether there is a better way to split the code into tokens, or whether my code could be made simpler with a library.
Code:
from enum import Enum
from typing import Any
OPERATORS = list("+~-*/^&<>()[]{}=!?|,")
# TODO: add double and triple-char operator support
#DOUBLECHAROPERATORS = ["|>", "<|", "!=", "==", "<<", ">>"]
#TRIPLECHAROPERATORS = [""]
NAMECHARS = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ._')
STRCHARS = [chr(i) for i in range(0,256)]
NUMS = list('1234567890.')
LINEFEEDS = list(';\n')
CMTLINEFEEDS = ['\n']
SLICOMMENTS = ['#', '//']
MLICOMMENTS = [['\\*', '*/']]
STRINGCHARS = list('\'\"')
[STRCHARS.remove(i) for i in STRINGCHARS + ['\n']]
class TokenTypes(Enum):
SPECIAL = -1
OP = 0
NAME = 1
COMMENT = 2
LINEFEED = 3
STRING = 4
NUMBER = 5
class Token:
def __init__(self, type: int, data: Any | str):
self.type = type
self.data = data
def __repr__(self):
return f"Token({self.type}, {repr(self.data)})"
class Modes:
class ModeBool():
boolean = False
def __init__(self, val: bool = False):
self.boolean = val
def __neg__(self):
self.boolean = False
def __pos__(self):
self.boolean = True
def __inv__(self):
self.boolean = not self.boolean
def __bool__(self):
return self.boolean
def __repr__(self):
return str(self.boolean)
REGULAR = ModeBool(True)
STRING = ModeBool(False)
COMMENT = ModeBool(False)
COMMENTML = ModeBool(False)
NAME = ModeBool(False)
NUM = ModeBool(False)
def __repr__(self):
return str({i:getattr(self, i) for i in dir(self) if not i.startswith('_') and i != 'ModeBool'})
class Lexer:
def __init__(self, src: str) -> None:
self.position = 0
self.src = src
self.tokens = []
self.modes = Modes()
self.currdata = {}
def parse(self) -> None:
self._main_parse_loop()
def _main_parse_loop(self) -> None:
while True:
if self.position == len(self.src):
return
char = self.src[self.position]
if not len(self.src) == self.position + 1:
nxtchar = self.src[self.position + 1]
if self.modes.REGULAR:
print(char, nxtchar)
if char == '/' and nxtchar == '*':
+self.modes.COMMENTML
-self.modes.REGULAR
self.position+=1
elif char in OPERATORS:
self.tokens.append(Token(TokenTypes.OP, char))
elif char in LINEFEEDS:
self.tokens.append(Token(TokenTypes.LINEFEED, char))
elif char in SLICOMMENTS:
+self.modes.COMMENT
-self.modes.REGULAR
self.currdata = {}
self.position-=1
elif char in NAMECHARS:
-self.modes.REGULAR
+self.modes.NAME
self.position-=1
elif char in STRINGCHARS:
-self.modes.REGULAR
+self.modes.STRING
self.position-=1
elif char in NUMS:
-self.modes.REGULAR
+self.modes.NUM
self.position-=1
elif self.modes.COMMENT:
if not self.currdata.get('notfirst'):
self.currdata['notfirst'] = 1
self.currdata['str'] = ''
if char in LINEFEEDS:
self.tokens.append(Token(TokenTypes.COMMENT, self.currdata['str']))
self.currdata = {}
-self.modes.COMMENT
+self.modes.REGULAR
self.position-=1
else:
self.currdata['str'] += char
elif self.modes.COMMENTML:
if not self.currdata.get('notfirst'):
self.currdata['notfirst'] = 1
self.currdata['str'] = '/*'
if char == '*' and nxtchar == '/':
self.currdata['str'] += '*/'
self.tokens.append(Token(TokenTypes.COMMENT, self.currdata['str']))
self.currdata = {}
-self.modes.COMMENTML
+self.modes.REGULAR
self.position+=1
else:
self.currdata['str'] += char
elif self.modes.NAME:
if not self.currdata.get('notfirst'):
self.currdata['notfirst'] = 1
self.currdata['str'] = ''
if char in NAMECHARS:
self.currdata['str'] += char
if char in LINEFEEDS or char in OPERATORS or char == ' ':
print('a') if self.currdata['str'] == '' else None
self.tokens.append(Token(TokenTypes.NAME, self.currdata['str']))
self.currdata = {}
-self.modes.NAME
+self.modes.REGULAR
if not char == ' ':
self.position-=1
elif self.modes.STRING:
if not self.currdata.get('notfirst'):
self.currdata['notfirst'] = 1
self.currdata['str'] = char
elif char in STRCHARS or char == '\n':
self.currdata['str'] += char
elif char in STRINGCHARS:
self.currdata['str'] += char
self.tokens.append(Token(TokenTypes.STRING, self.currdata['str']))
self.currdata = {}
-self.modes.STRING
+self.modes.REGULAR
#self.position-=1
elif self.modes.NUM:
if not self.currdata.get('notfirst'):
self.currdata['notfirst'] = 1
self.currdata['str'] = ''
if char in NUMS:
self.currdata['str'] += char
if char in LINEFEEDS or char in OPERATORS:
print('a') if self.currdata['str'] == '' else None
self.tokens.append(Token(TokenTypes.NUMBER, self.currdata['str']))
self.currdata = {}
-self.modes.NUM
+self.modes.REGULAR
self.position-=1
self.position += 1
print(self.position, self.modes, char)
if __name__ == '__main__':
code = """
import github.octocat.gopher.libgopher as go
if go.enabled != true {exit()} # Very much a test ----+-
compile_hello = () -> {go.compile('Hello, World!')}
print(compile_hello())
compile_hello() |> print
"VeryWell1000000" |> print
for i in range(0,5) {
print(i)
}
/*
this
should
not
be parsed *-*
*/
20 & 30 | 50
a = "
never
gonna
give
you
up
"
b = "never gonna let you down"; rick = a + b
/*
regex = r'.*'
*/
if not b + 1 == 2 {print('a')}
"""
parser: Lexer = Lexer(code)
parser.parse()
[print(i) for i in parser.tokens]
print(' '.join([i.data for i in parser.tokens]))
print(code)
4 Answers
Don't Use List Comprehensions Solely for Their Side Effects
In several places you have code similar to:
[print(i) for i in parser.tokens]
You are creating a list whose elements will be the return values from calling the print function. And what are you doing with this list? Nothing, of course. Then why create the list? This style of coding is non-Pythonic (not to mention inefficient). Just do:
for i in parser.tokens:
print(i)
Use Logging for Outputting Debugging Information
Your code has several print calls that appear to be for debugging purposes. For example:
...
if self.modes.REGULAR:
print(char, nxtchar)
...
If so, it would be preferable to use the logging API. For example:
import logging
# Set the level to logging.DEBUG to output debugging info:
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.CRITICAL)
logger = logging.getLogger()
...
if self.modes.REGULAR:
logger.debug("%s %s", char, nxtchar)
...
Better Naming
You have:
...
parser: Lexer = Lexer(code)
parser.parse()
...
There is a huge difference between a lexical analyzer, which generally recognizes tokens that can be defined by using a regular language (or equivalently, a regular expression), and a parser, which uses those tokens to build, for example, an abstract syntax tree based on a grammar that, in general, is not regular. To make a long story short, a better name for a Lexer instance would be lexer. Likewise, I suggest renaming method parse to lex (something other than parse).
Use Generator Expressions Instead of Comprehensions When a List Is Not Required
You have:
...
print(' '.join([i.data for i in parser.tokens]))
...
We don't really need to create a list for use with join:
...
print(' '.join(token.data for token in lexer.tokens))
...
Note also that I have renamed variable i to the more meaningful token (and parser to lexer).
How Will These Tokens Be Used?
Are we just generating tokens and never using them in a parser? Probably not. You generally start out defining a grammar for your language that will be parsed by your parser. Let's say that our parser will be based on LALR(1) grammars. An LALR(1) parser is driven by a two-dimensional table indexed by "the current state number" and "the current lexical token", both of which are integers. Your definition of a Token is (in part):
class Token:
def __init__(self, type: int, data: Any | str):
self.type = type
self.data = data
Where you claim that the type attribute is an int. But when you instantiate a token, your code is:
Token(TokenTypes.NUMBER, self.currdata['str'])
But TokenTypes is a subclass of enum.Enum:
class TokenTypes(Enum):
SPECIAL = -1
OP = 0
NAME = 1
COMMENT = 2
LINEFEED = 3
STRING = 4
NUMBER = 5
Consequently TokenTypes.NUMBER is not an int. This is not a big deal; it's just something I happened to catch. Anyway, you can derive an int value with the expression TokenTypes.NUMBER.value, which is what is important, and so this definition of TokenTypes is perfectly fine. But one normally starts with a grammar, from which the tokens can be derived. The person coding the grammar and the person coding the lexer may not be the same person. So one of the outputs from running a parser generator is usually a module that can be imported defining the tokens. Typically, it might be a file looking like:
from enum import Enum
class TokenTypes(Enum):
SPECIAL = -1
OP = 0
NAME = 1
COMMENT = 2
LINEFEED = 3
STRING = 4
NUMBER = 5
So in real life your token definitions would probably be within a module you import.
The parser generally does not request all of the tokens at once but rather calls a method such as lexer.next_token() to get the next token. It would be nice if you provided this. This could be implemented simply as:
class Lexer:
def parse(self) -> None:
self._main_parse_loop()
self._n_tokens = len(self.tokens)
self._token_index = 0
...
def next_token(self):
"""Return the next token."""
if self._token_index < self._n_tokens:
token = self.tokens[self._token_index]
self._token_index += 1
else:
token = Token(TokenTypes.EOF, '<EOF>')
return token
In the above function we have defined an additional token, which we might call EOF, representing the end of input:
class TokenTypes(Enum):
...
EOF = 6
The purpose for this is to make sure there are no extraneous tokens following a stream of tokens that can be parsed into a sentence of the grammar. For example, if we had the grammar:
expression -> number + number;
Valid input might be: "1 + 3" but not "1 + 3 + 4". Just because we were able to reduce the tokens "1", "+" and "3" to an expression does not mean the input was valid. So we normally use an "augmented" grammar:
expression_prime -> expression EOF;
expression -> number + number;
We now have a new goal symbol, expression_prime, which will reject "1 + 3 + 4 EOF" but accept "1 + 3 EOF".
What if, in doing your lexical analysis, you come upon an ill-formed token (e.g. a string that is not properly terminated)? The lexer and parser should have an additional token type called ERROR, which the lexer returns when it finds an illegal token. The grammar will generally include special productions to handle errors discovered by the lexer or parser that will include the ERROR token.
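As a rough sketch (not part of the original code), the end of the main loop could emit such a token when a string mode never terminates; ERROR here is an assumed extra member of TokenTypes:
# Sketch only: assumes TokenTypes has gained an ERROR member, e.g. ERROR = 7.
# Run after the main loop has consumed all input:
if self.modes.STRING and self.position >= len(self.src):
    # An opening quote was seen but the string was never closed.
    self.tokens.append(Token(TokenTypes.ERROR, self.currdata.get('str', '')))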
A Better Way? Could Regular Expressions Be Used to Your Advantage?
I honestly did not spend too much time trying to fully understand the code; it just seemed too complicated. I can't speak for others, but I would have a difficult time maintaining it. Here is an example of how I might code a lexer for a desk calculator that allows arithmetic expressions and assignments:
File tokens.py
"""
This module defines class LexValues, which a lexer (lexical analyzer) class
should mix in and the definition of a Token class.
"""
import collections
Token = collections.namedtuple('Token', ['token_number', 'value'])
class LexValues:
def __init__(self):
super().__init__()
self.EOFSYM = 0
self.ERROR = 1
self.LPAREN = 2
self.RPAREN = 3
self.TIMES = 4
self.NUMBER = 5
self.ID = 6
self.ASSIGNMENT = 7
self.ADDOP = 8
self.DIVOP = 9
File lexer.py
"""
This is a lexer for a desk calculator supporting assignments and numeric
expressions.
"""
import re
import decimal
from tokens import *
class Lexer(LexValues):
def __init__(self, txt):
super().__init__()
tokens = (
('WHITESPACE', -1, r'\s+'),
('LPAREN', self.LPAREN, r'\('),
('RPAREN', self.RPAREN, r'\)'),
('TIMES', self.TIMES, r'\*'),
('ADDOP', self.ADDOP, r'[+-]'),
('DIVOP', self.DIVOP, r'[/%]'),
('NUMBER', self.NUMBER, r'\d+\.?\d*|\.\d+'),
('ID', self.ID, r'[A-Za-z][A-Za-z0-9_]*'),
('ASSIGNMENT', self.ASSIGNMENT, r'='),
# This must be the last entry and matches anything that the previous
# expressions do not match:
('ERROR', self.ERROR, r'.')
)
self.regex = re.compile('|'.join('(?P<%s>%s)' % (token[0], token[2]) for token in tokens))
self.scanner = self.regex.scanner(txt)
self.token_numbers = {token[0]: token[1] for token in tokens}
def next_token(self):
"""Return Token consisting of a token number and token value."""
while (m := self.scanner.match()) and m.lastgroup == 'WHITESPACE':
pass
if not m:
return Token(self.EOFSYM, '<EOF>')
token_type = m.lastgroup
token_number = self.token_numbers[m.lastgroup]
token_value = m.group()
if token_number == self.NUMBER:
# We will be using Decimal math!
token_value = decimal.Decimal(token_value)
return Token(token_number, token_value)
if __name__ == '__main__':
lexer = Lexer('x = 17.0 + 19.3 / 6.1')
while True:
token = lexer.next_token()
print(f'token number = {token.token_number}, value = {token.value!r}')
if token.token_number == lexer.EOFSYM:
break
Prints:
token number = 6, value = 'x'
token number = 7, value = '='
token number = 5, value = Decimal('17.0')
token number = 8, value = '+'
token number = 5, value = Decimal('19.3')
token number = 9, value = '/'
token number = 5, value = Decimal('6.1')
token number = 0, value = '<EOF>'
- "We don't really need to create a list for use with join" - that's the function where such a construct may be reasonable. I doubt it makes any measurable difference at this scale, but ''.join(['abcde' for _ in range(100000)]) is faster than the same using a generator! I always advocate for "drop that listcomp unless necessary", but str.join is the most famous counterexample :) stackoverflow.com/questions/34822676/… – STerliakov, Jan 4, 2025
- for token in self.tokens: yield token is conventionally spelled as yield from self.tokens. – STerliakov, Jan 4, 2025
- @STerliakov Thank you and thank you. However, as it turns out, what is needed for next_token is not a generator, since the function is meant to be called repeatedly in a loop with token = lexer.next_token(), so I have updated the function definition. – Booboo, Jan 5, 2025
- I can't say I am a fan of prescribing regular expressions for a lexer (or parser) in general. Tellingly, your example does NOT consider string parsing, which, in the presence of escapes or raw string modes, can get fairly non-regular... (though regex extensions may still allow getting something that works). – Matthieu M., Jan 6, 2025
- @MatthieuM. How about this? lex, that classic tool for performing lexical analysis that is integrated with yacc, is based on regular expressions. Regular expressions also make it clear (at least clearer) what, for example, a number is. The FSA required for recognizing tokens is now implemented by the re module. Also, most programming languages are not regular, and you could not therefore use regular expressions to build a parser. – Booboo, Jan 6, 2025
Thank you for sharing this lexer. There's a lot of good code in it.
EBNF
The biggest thing missing from it is a Backus-Naur grammar, as the _main_parse_loop is definitely not self-documenting.
The first thing any user of your language will want
to know is, "what can I write?"
That is, what is the set of valid source code inputs?
Decades of experience shows that describing this through
code, or through English prose, will not adequately serve
this need.
So you will need a grammar (or perhaps the equivalent railroad diagram) to show to the user. You could attempt to maintain an evolving grammar alongside an evolving parse loop, but that sounds very error prone. Much better to have code consume the grammar and follow its instructions.
Consider adopting Lark, or perhaps one of the PEG parsers.
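For instance, here is a rough sketch of what a Lark-driven approach could look like (the grammar below is a toy example, not the OP's actual syntax):
# Illustrative only: a tiny Lark grammar, not the OP's real language.
from lark import Lark

grammar = r"""
    start: stmt+
    stmt: NAME "=" expr _NL
    expr: expr "+" atom   -> add
        | atom
    atom: NUMBER | NAME | ESCAPED_STRING

    _NL: /(\r?\n)+/
    %import common.CNAME -> NAME
    %import common.NUMBER
    %import common.ESCAPED_STRING
    %import common.WS_INLINE
    %ignore WS_INLINE
"""

parser = Lark(grammar)                  # Earley parser by default
tree = parser.parse('x = 1 + "hi"\n')
print(tree.pretty())                    # tokens and structure both come from the grammar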
POLA
+self.modes.COMMENTML
-self.modes.REGULAR
self.position += 1
Those first two lines are not good.
A C programmer might write --regular,
but this is python code.
Much better to express that intent with ... -= 1,
or simply assign ... = False.
The OP code requires a maintenance engineer to go hunting through the code for these surprising definitions, which have non-traditional semantics:
def __neg__(self):
self.boolean = False
def __pos__(self):
self.boolean = True
Nesting ModeBool within Modes seems slightly clumsy,
as revealed by the ... and i != "ModeBool" conjunct.
Prefer to organize these in a separate mode.py module,
with both up at module level.
annotation
I thank you kindly for optional type annotations -- they help me to better understand your intent.
But it's clear you've not asked the machine to read those annotations. Always use a type checker if you go to the trouble of writing them, to verify they make sense. It's an aid to the author, allowing you to avoid silly errors before even running any unit tests.
If you've written an >>> expression comment in a docstring,
definitely verify with $ python -m doctest *.py.
If you've written annotations, verify them with
pyright or mypy --strict.
On which topic: adding docstrings to some of the methods and all of the classes would benefit this project.
extra annotation
parser: Lexer = Lexer(code)
I would expect a type checker to infer the type from a simple parser = Lexer(code) assignment.
meaningless annotation
def __init__( ..., data: Any | str):
This looks like a # XXX restrict to just string TODO comment,
the sort of thing we try to clean up prior to submitting
a Pull Request code review.
Write it as just Any.
Change it when your codebase expects only string input.
Consider making Token a @dataclass, to slightly simplify the constructor.
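A minimal sketch of that, reusing the TokenTypes enum already defined in the OP's code:
from dataclasses import dataclass
from typing import Any

@dataclass
class Token:
    """__init__, __repr__ and __eq__ are generated for us."""
    type: TokenTypes   # the enum from the OP's module
    data: Any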
generator
The while True: ... print ...
approach to serving up
parsed results won't scale beyond your initial PoC testing,
and the .tokens
result list is very batch oriented,
not well suited to online immediate error reporting.
The caller should be requesting tokens from an iterable:
lexer = Lexer(code)
for token in lexer.parse():
print(token)
The parse loop will yield each token.
That sets you up for raising a diagnostic exception
at the point where a parse failure occurs.
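A sketch of that shape; _scan_one_token is a hypothetical helper wrapping one pass of the existing loop body, not something the OP already has:
from typing import Iterator

class Lexer:
    ...
    def parse(self) -> Iterator[Token]:
        """Yield tokens one at a time instead of batching them in self.tokens."""
        while self.position < len(self.src):
            token = self._scan_one_token()   # hypothetical: one step of the current loop
            if token is not None:
                yield token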
logging
There is a great deal of print() debugging output,
which can be fine during initial development.
But do yourself a favor, and import logging.
Then you can hang onto INFO level debug statements
in the codebase, while typically running at WARN level.
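A minimal sketch of that setup (one of several reasonable configurations):
import logging

logging.basicConfig(level=logging.WARNING)   # day-to-day level; lower to INFO/DEBUG when digging
log = logging.getLogger(__name__)

# inside the parse loop, instead of print(...):
# log.info("pos=%d mode=%r char=%r", self.position, self.modes, char)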
- The print statements are for debugging. I'll use loguru in the final code. – Xandaaah, Jan 5, 2025
I mostly agree with the existing answers; let's hope I don't add too many duplicated notes.
__inv__ magic method does not exist
You probably wanted to define __invert__ instead. Fortunately, you never use bitwise inversion later, so this went unnoticed. But this shouldn't matter, as...
Modes is probably misdesigned
Using +x, -x and ~x for in-place modification is a bad idea; see @J_H's answer. However, let's venture one step further: your modes are all mutually exclusive, that is, one and only one mode is active at any time (counting -x; +y as a single "switch mode to" operation). Well, then it isn't Modes, it's just a class Mode(Enum), and the lexer should keep a reference to the current mode. This prevents accidentally having multiple modes turned on (or none). Spell "switch to mode X" as self.mode = Mode.X.
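A sketch of that shape, with member names mirroring the existing ModeBool attributes:
from enum import Enum, auto

class Mode(Enum):
    REGULAR = auto()
    STRING = auto()
    COMMENT = auto()
    COMMENTML = auto()
    NAME = auto()
    NUM = auto()

# in the lexer:
#   self.mode = Mode.REGULAR
# and, e.g., when a multi-line comment starts:
#   self.mode = Mode.COMMENTML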
If that was a "future-proof solution", then explain that. You should have a pretty solid understanding of what is coming (at least something like "here are two modes I'll introduce next month - X and Y - and they must be able to coexist"). Even then it might be better to add more such mutually exclusive enums, but that depends on the use case. Don't add premature complexity - YAGNI.
Using an Enum will also provide a built-in solution similar to your __repr__ (print(Mode._member_map_)).
Dict-typed values
self.currdata uses a dictionary (a data structure mapping arbitrary hashable keys to arbitrary values) to represent storage for a few fixed attributes. That's a problem: once you make a typo somewhere, it'll be difficult to spot.
I only see two such attributes (notfirst and str). They can just become the lexer's own attributes. If you need more, you can always extract a helper @dataclass to store such state in a structured fashion. It might then be even more desirable to replace self.currdata = {} with something like self.state.clear().
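For instance, a sketch of such a helper (the attribute names are illustrative, not prescribed):
from dataclasses import dataclass

@dataclass
class ScanState:
    """Structured replacement for the currdata dict."""
    started: bool = False
    text: str = ''

    def clear(self) -> None:
        self.started = False
        self.text = ''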
Format, lint, test
I'm pleasantly surprised to see this code almost passing ruff's default auto-formatter, modulo quotes and line length in one place. If you formatted by hand, consider relying on black or ruff format instead; otherwise - great, please keep using such tools!
Consider also running flake8 with plugins or ruff on your code to catch some common problems. That would have prevented some (or even all) of my style comments at the end of this answer.
Finally, there's no way to ensure that your code does what is intended. Its output takes more than 5 screens; I'm physically unable to check that in any way other than freezing the output, making changes and diffing the files. If any diff is found, I'm left trying to figure out what could have caused it. Instead, prefer a testing framework (built-in unittest or the most popular pytest) and smaller tests, exercising all corner cases of the intended behaviour. This will make everyone much more confident when refactoring - I won't provide a "here's how I would write this" snippet in this answer, because I can't verify its correctness.
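For illustration, here is the shape such a test could take; the expected tokens are my reading of the current lexer's behaviour and would need to be checked against it:
from lexer import Lexer, TokenTypes   # assuming the code lives in lexer.py

def test_simple_expression():
    lexer = Lexer("a + b\n")
    lexer.parse()
    assert [(t.type, t.data) for t in lexer.tokens] == [
        (TokenTypes.NAME, "a"),
        (TokenTypes.OP, "+"),
        (TokenTypes.NAME, "b"),
        (TokenTypes.LINEFEED, "\n"),
    ]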
Enum - auto or unique?
Yes, your values are now unique, great. Consider asserting that with @enum.unique or dropping numeric values altogether and using enum.auto().
from enum import Enum, unique
@unique
class TokenTypes(Enum):
SPECIAL = -1
OP = 0
NAME = 1
COMMENT = 2
LINEFEED = 3
STRING = 4
NUMBER = 5
or
from enum import Enum, auto
class TokenTypes(Enum):
SPECIAL = auto()
OP = auto()
NAME = auto()
COMMENT = auto()
LINEFEED = auto()
STRING = auto()
NUMBER = auto()
Naming
A lot was said about naming, but nothing about this bit. Constants STRCHARS and STRINGCHARS look like aliases, but in fact they have nothing in common. Please rename one to QUOTE_CHARS, for example.
Minor comments
- if not len(self.src) == self.position + 1 is better spelled if len(self.src) != self.position + 1
- repr in an f-string can be shorthanded: f'Token({self.type}, {self.data!r})'
- if self.currdata["str"] == "" doesn't need comparison, just if not self.currdata["str"]. But...
- Please remove all "clever one-liners" such as side-effect-only ternaries and listcomps. That doesn't make your code any better.
- It feels a bit weird to see parens, brackets and braces as OPERATORS characters. Usually they're used as block delimiters, so { and * likely shouldn't belong to the same semantic group. Next steps of parsing will thank you for less ambiguity here.
Overall, the code looks good. The first suggestion I have is ...
Documentation
Start off by adding documentation to the top of the code to summarize its purpose. The most important things to describe are:
- What type of input source are you parsing
- What form does the output take
- What is the user expected to do with the output
The users of your code, and especially the maintainers of your code, will appreciate a clear high-level description of what it does. Six months after you started writing the code, you will even thank yourself for putting in the effort now.
The PEP 8 style guide recommends using docstrings:
"""
Lexer and tokenizer for FILL-IN-THE-BLANK
Input file or text is of format...
Output is of format...
"""
The PEP-8 guide also recommends docstrings for classes and functions.
Lexer is a fine name, but it could use some elaboration. For example:
class Lexer:
""" Lexing the FILL-IN-THE-BLANK format """
For the _main_parse_loop function, it would be helpful to document that it is printing output:
def _main_parse_loop(self) -> None:
""" Parse input and print to stdout """
Comments
To reduce clutter, delete all commented-out code and notes to yourself about future features:
# TODO: add double and triple-char operator support
#DOUBLECHAROPERATORS = ["|>", "<|", "!=", "==", "<<", ">>"]
#TRIPLECHAROPERATORS = [""]
#self.position-=1
You can keep track of future enhancements in a separate file in your version control system.
Naming
The style guide recommends separating multi-word constant names with an underscore. For example:
NAMECHARS
SLICOMMENTS
become:
NAME_CHARS
SLI_COMMENTS
This makes the code easier to read.
I recommend adding a comment for abbreviations like:
SLI
MLI
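For example (the expanded names and the '/*' opener are my reading of the abbreviations, not taken from the original code):
SLI_COMMENTS = ['#', '//']      # SLI = single-line comment introducers
MLI_COMMENTS = [['/*', '*/']]   # MLI = multi-line comment delimiter pairs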
Layout
It would be nice to align these lines to the =
operator:
REGULAR = ModeBool(True)
STRING = ModeBool(False)
COMMENT = ModeBool(False)
COMMENTML = ModeBool(False)
NAME = ModeBool(False)
NUM = ModeBool(False)
Also, it would be nice to add some blank lines between sections of code:
return str(self.boolean)
REGULAR = ModeBool(True)
STRING = ModeBool(False)
COMMENT = ModeBool(False)
COMMENTML = ModeBool(False)
NAME = ModeBool(False)
NUM = ModeBool(False)
def __repr__(self):
Simpler
The range function:
range(0,256)
is simpler without the optional 0:
range(256)
string
Long strings like this are error-prone:
abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
The string.ascii_letters function provides this for you.
string.digits also gives you 1234567890.
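For example, a sketch of the constants built from the string module, keeping the extra characters the lexer currently allows:
import string

NAME_CHARS = list(string.ascii_letters + '._')
NUMS = list(string.digits + '.')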
- "nice to align these lines to the = operator" I respectfully disagree. I mean sure, I see your point about visually scanning to notice the similarities, I have worked with and authored plenty of code like that, especially in C. But for this codebase, I found it not very legible until I ran $ black *.py, and I would much prefer the OP author do that routinely than attempt something at cross purposes like aligning equal signs. // If we want to emphasize that "most of these are False", then IDK maybe define a false_mode_bool temp var and make assignments with that? – J_H, Jan 4, 2025
- @J_H: Yes, I agree that auto-formatting would be better. – toolic, Jan 4, 2025
- Yep, please don't "align by equal sign", that's even explicitly discouraged by PEP 8 (link to the relevant section, last example) and will be fixed by ruff or black immediately. Please use a formatter and linter instead. – STerliakov, Jan 4, 2025
- @STerliakov: Thanks for the link. I didn't realize it was buried in PEP. – toolic, Jan 5, 2025