Code:
# Read the program source from the file named by the first argument.
# If no argument was given (IndexError) or the file cannot be opened
# (OSError), fall back to "File.txt" and append that name to argv so
# later argument lookups (the "|" command) still index something sane.
import sys, os
try:
    with open(sys.argv[1]) as f:
        data = f.read()
except (IndexError, OSError):
    with open("File.txt") as f:
        data = f.read()
    sys.argv.append("File.txt")
# Interpreter state.
# ``dct`` maps one-character variable names to their values: "@" holds the
# program text itself and "E" is the EOF/error flag variable.
dct = {"@": data, "E": "0"}
iteration = 0  # counts passes over the current program text

# Command-mode flags: a command character arms exactly one of these, and
# the following character(s) are then consumed as its operands.
RESET = define = printf = replace = False
inp = join = Reverse = getascii = False
getfirstchar = removeallexceptcertainchar = readfile = comment = False
multilinecomment = takeagrument = definebutvar = False
getfilesandfolders = haltifeof = False
################################################################################
# Operand scratch registers shared by all commands (4-6 are reserved).
variable = variable2 = variable3 = ""
variable4 = variable5 = variable6 = ""
string = ""     # accumulator for the ">" (define) command
# Sub-state while collecting operands.
escape = isready = isset = isgo = False
def erroring(exc_type, exc_value, exc_tb):
    """Replacement sys.excepthook: emit only the exception value.

    Suppresses Python's usual traceback; writes the stringified exception
    to stderr with no trailing newline.
    """
    sys.stderr.write(str(exc_value))
sys.excepthook = erroring
def dctremoveallexceptcertain(string, variable):
    """Return *string* with every character not present in *variable* removed.

    string   -- the text to be filtered
    variable -- the collection of characters to keep

    A generator expression with ``str.join`` replaces the original loop,
    which built the result via quadratic ``+=`` and leaked it through a
    needless ``global newstring``.
    """
    return "".join(ch for ch in string if ch in variable)
# Main interpreter loop.  Each pass of the ``while`` scans ``data`` one
# character at a time; a command character arms its flag and the following
# character(s) are consumed as operands.  Writing to the "@" variable
# restarts execution with the new program text (RESET).
while True:
    # Note you should use try and except to catch errors, not if x in y:
    for i in data:
        if printf:
            # print the variable of dct variable
            if i in dct:
                print(dct[i], end = "")
                printf = False
            else:
                raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                break
        elif define:
            if isready:
                if escape:
                    # check if i is quote or backslash add to string else add backslash and i
                    if i == "\"" or i == "\\":
                        string += i
                        escape = False
                    else:
                        string += "\\" + i
                        escape = False
                elif i == "\"":
                    dct[variable] = string
                    string = ""
                    isready = False
                    define = False
                    # check if the variable is "@" change the data
                    if variable == "@":
                        RESET = True
                        break
                elif i == "\\":
                    escape = True
                    continue
                else:
                    string += i
            else:
                variable = i
                isready = True
        elif replace:
            # get 3 variables
            if isgo:
                try:
                    dct[variable] = dct[variable2].replace(variable3, i)
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                    break
                replace = isset = isready = isgo = False
                # check if the variable is "@" change the data
                if variable == "@":
                    RESET = True
                    break
            elif isset:
                variable3 = i
                isgo = True
            elif isready:
                variable2 = i
                isset = True
            else:
                variable = i
                isready = True
        elif inp:
            try:
                dct[i] = input()
                dct["E"] = "0"
            except EOFError:
                dct["E"] = "1"
            inp = False
            if i == "@":
                RESET = True
                break
        elif join:
            if isset:
                try:
                    dct[variable] = dct[variable2] + dct[i]
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                    break
                join = isset = isready = isgo = False
                # check if the variable is "@" change the data
                if variable == "@":
                    RESET = True
                    break
            elif isready:
                variable2 = i
                isset = True
            else:
                variable = i
                isready = True
        elif Reverse:
            # this is equivalent to pseudocode var = var2.reverse()
            if isready:
                try:
                    dct[variable] = dct[i][::-1]
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                    break
                isready = False
                # FIX: was ``Revert = False`` (typo) -- the Reverse flag was
                # never cleared, so every later character was consumed as a
                # reverse operand.
                Reverse = False
                if variable == "@":
                    RESET = True
                    break
            else:
                variable = i
                isready = True
        elif getfirstchar:
            # gets the first character of the variable
            if isready:
                try:
                    dct[variable] = dct[i][0]
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                isready = False
                getfirstchar = False
                if variable == "@":
                    RESET = True
                    break
            else:
                variable = i
                isready = True
        elif getascii:
            # gets ascii value of the variable
            # NOTE(review): the wiki documents "." as a = chr(b), but this
            # stores ord(dct[i]) -- an int, not a str -- confirm which is
            # intended.  Also the inner "INVALID NUMBER" exception is caught
            # by the outer except and re-raised as "VARIABLE NOT FOUND".
            if isready:
                try:
                    dct[i]
                    try:
                        dct[variable] = ord(dct[i])
                    except:
                        raise Exception("ERROR IN FILE", sys.argv[1], "INVALID NUMBER IN ITERATION", iteration)
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                getascii = False
                isready = False
                if variable == "@":
                    RESET = True
                    break
            else:
                variable = i
                isready = True
        elif removeallexceptcertainchar:
            # removes all except certain character
            if isready:
                try:
                    dct[variable] = dctremoveallexceptcertain(dct[variable], dct[i])
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                isready = False
                removeallexceptcertainchar = False
                if variable == "@":
                    RESET = True
                    break
            else:
                variable = i
                isready = True
        elif readfile:
            if isready:
                try:
                    dct[i]
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                try:
                    dct[variable] = open(dct[i], "r").read()
                    dct["E"] = "0"
                except:
                    dct["E"] = "1"
                isready = False
                readfile = False
                if variable == "@":
                    RESET = True
                    break
            else:
                variable = i
                isready = True
        elif comment:
            # ignore until a newline is found
            if i == "\n":
                comment = False
        elif multilinecomment:
            if i == "}":
                multilinecomment = False
        elif takeagrument:
            # checks if the variable is defined and the variable is a number
            if isready:
                try:
                    dct[i]
                    try:
                        int(dct[i])
                        try:
                            dct[variable] = sys.argv[int(dct[i])]
                            dct["E"] = "0"
                        except:
                            dct["E"] = "1"
                    except:
                        raise Exception("ERROR IN FILE", sys.argv[1], "INVALID NUMBER", iteration)
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                isready = False
                takeagrument = False
                if variable == "@":
                    RESET = True
                    # FIX: every other command restarts the scan immediately
                    # when "@" is written; this branch was missing the break.
                    break
            else:
                variable = i
                isready = True
        elif definebutvar:
            # like > but it defines the variable
            if isready:
                try:
                    dct[variable] = dct[i]
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                isready = False
                definebutvar = False
                if variable == "@":
                    RESET = True
                    break
            else:
                variable = i
                isready = True
        elif getfilesandfolders:
            # gets all files and folders in a variable if folder not found the variable "E" is equal to "1" else "0"
            # NOTE(review): this stores a list, although the wiki documents
            # "%" as a = str(getfilesandfolders(b)) -- confirm which is intended.
            if isready:
                # check if variable is defined
                try:
                    dct[i]
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                try:
                    files = os.listdir(dct[i])
                    dct[variable] = files
                    dct["E"] = "0"
                except:
                    dct["E"] = "1"
                isready = False
                getfilesandfolders = False
                if variable == "@":
                    RESET = True
                    break
            else:
                variable = i
                isready = True
        elif haltifeof:
            if dct["E"] == "1":break
            haltifeof = False
        elif i == "@": #############################################################################################################
            RESET = True
            break
        elif i == "H":
            break
        elif i == ">":
            define = True
        elif i == "*":
            printf = True
        elif i == "$":
            replace = True
        elif i == "&":
            inp = True
        elif i == "+":
            join = True
        elif i == "/":
            Reverse = True
        elif i == "?":
            getfirstchar = True
        elif i == ".":
            getascii = True
        elif i == "!":
            removeallexceptcertainchar = True
        elif i == ";":
            readfile = True
        elif i == ":":
            comment = True
        elif i == "{":
            multilinecomment = True
        elif i == "|":
            takeagrument = True
        elif i == "=":
            definebutvar = True
        elif i == "%":
            getfilesandfolders = True
        elif i == "_":
            haltifeof = True
        # its not necessary to raise error invalid command
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    iteration += 1
    # check if string still contains something put it in dct
    if string != "":
        dct[variable] = string
        string = ""
    define = False
    isready = False
    isset = False
    isgo = False
    printf = False
    replace = False
    inp = False
    join, Reverse = False, False
    getfirstchar = False
    getascii = False
    removeallexceptcertainchar = False
    readfile = False
    comment = False
    multilinecomment = False
    takeagrument = False
    definebutvar = False
    getfilesandfolders = False
    # check if dct is "@" change the data
    if variable == "@":
        RESET = True
    if RESET:
        RESET = False
        data = dct["@"]
        iteration = 0
        continue
    break
# If the program defined the "#" variable, print its value at exit.
try:
    print(dct["#"], end = "")
except KeyError:
    # "#" was never defined -- nothing to print.  (Narrowed from a bare
    # ``except:`` which would also have hidden e.g. a broken stdout.)
    pass
This is a programming-language interpreter. Below is the wiki.
Commands:
(a, b, c, d means variables next command)
"@" goto first char
"H" halt (must be uppercase, not lowercase)
">" define var Syntax: >~Hello"
Escape is backslash (the quote is optional)
"*" print the next char variable
"$" a = b.replace(c, d)
"&" Input the next char variable ("E" var is 1 if EOF)
"+" a = b.join(c)
"/" a = b.reverse()
"?" a = b.getfirstchar()
"." a = chr(b)
"!" a = a but non-b chars removed
";" a = readfile(b) ("E" = 1 if the file doesn't exist)
":" comment
"{" start multiline comment
"}" end multiline comment
"|" a = takearg(int(b))
"=" a = b
"%" a = str(getfilesandfolders(b))
"_" halt if EOF
Variables:
"@" if the variable is edited, the program changes
"#" print when program ends.
"E" EOF Var
Examples:
>#Hello, World!
Hello, World!
@
Infinite loop
&#*#
read a line and output without newline twice
>HHello, World!"
NOP
>H\""*H
print double quotes
Can you find bugs?
-
1\$\begingroup\$ Do you have a name for your programming language, and a sample program in that language? \$\endgroup\$200_success– 200_success2021年11月23日 18:30:37 +00:00Commented Nov 23, 2021 at 18:30
-
\$\begingroup\$ My programming language name is NLRNIS \$\endgroup\$Fmbalbuena– Fmbalbuena2021年11月23日 18:41:16 +00:00Commented Nov 23, 2021 at 18:41
-
\$\begingroup\$ And I will show samples later. \$\endgroup\$Fmbalbuena– Fmbalbuena2021年11月23日 18:43:22 +00:00Commented Nov 23, 2021 at 18:43
-
1\$\begingroup\$ Are you asking us to comment on your samples as well? It's a bit hard to comment on the code without some samples to see what you're expecting to happen. (And use some functions, please.) \$\endgroup\$Teepeemm– Teepeemm2021年11月23日 19:20:56 +00:00Commented Nov 23, 2021 at 19:20
-
1\$\begingroup\$ I named NLRNIS because No Loop, Recursion neither if statements. \$\endgroup\$Fmbalbuena– Fmbalbuena2021年11月27日 00:23:21 +00:00Commented Nov 27, 2021 at 0:23
1 Answer 1
To build a language with good quality code requires understanding the various approaches to building operators, evaluators and tokenizers. Lets go through the concepts from simplest to complicated, to build a calculator.
Operators
Operators is concerned with evaluating each operator.
Normally an operator function can be passed arguments and return the result, like add(1, 2)
.
Inline Operators
The simplest approach is to define all the operators inline in a 'switch'.
Since Python doesn't have a dedicated switch
statement (until 3.10), we can instead use if
elif
else
statements.
if op == "+":
result = lhs + rhs
elif op == "-":
result = lhs - rhs
elif op == "*":
result = lhs * rhs
elif op == "/":
result = lhs / rhs
Dictionary Operators
We can store the operators in an object. Giving us the ability to pass Operators to the Evaluator and/or the Tokenizer. Since the operators are strings we can build a dictionary mapping operator strings to the corresponding operator function.
When building the dictionary we can use lambda
functions as we can define the operator inside the dictionary.
However, lambda
functions can only contain one expression in Python.
If we need more than one expression then we will have to use 'full-fat' functions.
I have made addition a 'full-fat' function, so we have an example of both approaches.
def add(lhs, rhs):
return lhs + rhs
operators = {
"+": add,
"-": lambda lhs, rhs: lhs - rhs,
"*": lambda lhs, rhs: lhs * rhs,
"/": lambda lhs, rhs: lhs / rhs,
}
Class Operators
Rather than defining the operators in a dictionary we can define the operators on a class.
We can then build the operators
dictionary somewhat automatically from the class.
We do however have to use valid Python names for the operators.
class CalculatorOperators:
@staticmethod
def add(lhs, rhs):
return lhs + rhs
@staticmethod
def sub(lhs, rhs):
return lhs - rhs
@staticmethod
def mul(lhs, rhs):
return lhs * rhs
@staticmethod
def div(lhs, rhs):
return lhs / rhs
We can then iterate through the class by iterating through __dict__
.
To only get our functions we can filter out any dunder values.
def get_operators(obj):
return {
name: getattr(obj, name)
for name in obj.__dict__
if not (name.startswith("__") and name.endswith("__"))
}
print(get_operators(CalculatorOperators))
{'add': <function CalculatorOperators.add at 0x7ff171f61f30>, ...}
Metaclass Operators
We can automatically call get_operators
by having a base class with an __init_subclass__
method or a custom metaclass.
Either can run when the class is defined and transparently build operators
.
class Operators:
def __init_subclass__(cls, /, **kwargs):
cls.OPERATORS = get_operators(cls)
class CalculatorOperators(Operators):
@staticmethod
def add(lhs, rhs):
return lhs + rhs
...
print(CalculatorOperators.OPERATORS)
{'add': <function CalculatorOperators.add at 0x7ff171f623b0>, ...}
Decorated Class Operators
We can customize the key to use by defining a decorator function (somewhat comprehensive guide by me).
The decorator function can assign the desired name to an attribute on the function.
We'd need to make a little change to how we build operators
.
We'd need to use the function's attribute value if available.
def with_name(name=None):
def inner(fn):
fn.name = name
return fn
return inner
class CalculatorEvaluator:
@with_name("+")
@staticmethod
def add(lhs, rhs):
return lhs + rhs
...
operators = {
getattr(fn, "name", name): fn
for name, fn in CalculatorEvaluator.__dict__.items()
if not (name.startswith("__") and name.endswith("__"))
}
print(operators)
{'+': <staticmethod(<function CalculatorEvaluator.add at 0x7ff171f62560>)>, ...}
Comparison
Inline Operators
The simplest approach. However with larger operator sets or complicated logic for the operators the code quickly becomes unmaintainable.The approach is good when building a very simple evaluator for BrainFuck.
Dictionary Operators
A very common approach. Googling "Python switch" is bound to result in results saying to use a dictionary. The approach has a complexity improvement over Inline Operators, going from \$O(n)\$ to \$O(1)\$ operator lookup. I, however, don't find the approach much more maintainable than Inline Operators, as it is prone to code fragmentation and to the functions in operators
getting out of sync. The approach is good when:
- all the operators can be defined as
lambda
s and documenting the operators isn't needed. Or, - the operators must be fragmented and can't be contained within a single class/namespace. For example a menu screen, as the user can enter the same view from two different ways.
- all the operators can be defined as
Class Operators
The most advanced approach of the three, requiring an understanding of__dict__
(and potentially__slots__
). After learning the little bit of metaprogramming required to understand the approach I find the approach to be the simplest. We can throw all our operators in one class, document the class like any other class and have the approach best for our use case.Whilst we currently need to use Decorated Class Operators, we can normally use the simpler Class Operators approach when using the best approach to defining our own language.
The approach seems to be the standard approach ranging from single operator evaluators like
cmd
to parser libraries like TatSu.
For the rest of the answer operators
is the following:
operators = {
"+": lambda lhs, rhs: lhs + rhs,
"-": lambda lhs, rhs: lhs - rhs,
"*": lambda lhs, rhs: lhs * rhs,
"/": lambda lhs, rhs: lhs / rhs,
}
Evaluator
Now that we've gone through the operators we can move on to the evaluators. The evaluators are generally pretty simple, like the operators.
Single Operator Evaluator
We need to evaluate an operator and some values.
To do so we just get the operator from operators
and just pass the values.
print(operators["+"](1, 2))
3
Stream Evaluator
Stream evaluators typically use a stack, and so work best on postfix input.
In Python the stack can just be a list
.
Then, whenever we have a non-operator token; we add the token to the stack.
Whenever we have a operator token; we pop the operands from the stack, and pass the operands to the function.
We then add the result of the operator to the stack.
After processing all tokens we pop the top value of the stack.
def evaluate__stream(tokens, operators):
stack = []
for token in tokens:
if token in operators:
b = stack.pop()
a = stack.pop()
token = operators[token](a, b)
stack.append(token)
return stack.pop()
evaluate__stream([1, 2, 3, "*", "+"], operators)
7
Tree Evaluator
Tree evaluators interact with a tree.
To have a tree the input must be nodes of some sort.
We can build a simple Node
class to store the operator and children.
class Node:
def __init__(self, operator, *children):
self.operator = operator
self.children = children
def __repr__(self):
values = [self.operator, *self.children]
return f"Node({', '.join(map(repr, values))})"
def __str__(self): # Only supports binary operators
op = self.operator
lhs = self.children[0]
rhs = self.children[1]
return f"({lhs} {op} {rhs})"
ast = Node("+", 1, Node("*", 2, 3))
print(repr(ast))
print(ast)
Node('+', 1, Node('*', 2, 3))
(1 + (2 * 3))
We then perform a depth first walk of the tree.
Otherwise we will perform the following operation 1 + Node("*", 2, 3)
, resulting in an error.
We can build a simple tree evaluator using recursion.
def evaluate__tree(node, operators):
if not isinstance(node, Node):
return node
children = [
evaluate__tree(child, operators)
for child in node.children
]
return operators[node.operator](*children)
print(evaluate__tree(ast, operators))
7
Comparison
Single Operator Evaluator
Very simple and allows simple control flow.The approach is good for menus or terminal commands, like with
cmd
.Stream Evaluator
Good at implementing postfix evaluators. However, most human languages aren't postfix only;- binary operators typically are infix. And,
- functions are typically prefix.
The approach is good for postfix languages, or languages easily converted to postfix.
Tree Evaluator
Trees are typically easier to understand than a postfix stream. As such languages typically build and evaluate an Abstract Syntax Tree (AST). An AST is also easier to mutate as we don't need to partially evaluate the stream to find the children.The approach is good for most languages.
The Evaluator to pick is largely dictated by how the tokens are made. Using a Tree Evaluator with the Single Operator Tokenizer doesn't make sense. As the Evaluator is completely overkill for the Tokenizer. Additionally the Single Operator Evaluator can only work with the Single Operator Tokenizer.
Tokenizer
Now we've got the easy stuff out of the way. Lets focus on the hard part, converting a string into a bunch of tokens.
Single Operator Tokenizer
The simplest approach is to get the user to provide the individual tokens.
We can do so by just calling input
a bunch, effectively sidestepping building a tokenizer.
lhs = int(input("lhs> "))
op = input("op > ")
rhs = int(input("rhs> "))
print(operators[op](lhs, rhs))
lhs> 1
op > +
rhs> 2
3
Simple Tokenizer (postfix)
This section is based on another one of my answers.
Tokenizers can be pretty complicated to build correctly as a beginner. In the past I've gotten caught up trying to implement something, but I forget I need to make the tokenizer do everything. So my code became complicated as I missed the forest for the trees. Lets use TDD to build the tokenizer:
We'll handle only numbers.
However only one character, and we'll convert to an int.def tokenize__simple(text): for char in text: if char in "0123456789": yield int(char)
>>> list(tokenize__simple("123")) [1, 2, 3]
We'll handle one number of any size.
We'll build up a 'token' stack. We'll use a list to ensure \$O(1)\$ time when adding new characters. As Python's strings guarantee a minimum \$O(n)\$ time when adding.def tokenize__simple(text): token = [] for char in text: if char in "0123456789": token.append(char) if token: yield int("".join(token))
>>> list(tokenize__simple("1 2 3")) [123] >>> list(tokenize__simple("")) []
We'll handle spaces.
We just need to copy theif token
code. And resettoken
after yielding one.def tokenize__simple(text): token = [] for char in text: if char in "0123456789": token.append(char) elif char in " ": if token: yield int("".join(token)) token = [] if token: yield int("".join(token))
>>> list(tokenize__simple("1 2 3")) [1, 2, 3]
We'll handle operators.
Basically the same as spaces but we also yield the operator.def tokenize__simple(text): token = [] for char in text: if char in "0123456789": token.append(char) elif char in " +-*/": if token: yield int("".join(token)) token = [] if char != " ": yield char if token: yield int("".join(token))
>>> list(tokenize__simple("1+ 2*3")) [1, "+", 2, "*", 3]
From here we can tokenize numbers in postfix notation and pass the values to the Stream Evaluator we wrote earlier.
evaluate__stream(tokenize__simple("1 2 3 * +"), operators)
7
Simple Tokenizer + Transformation (Infix Stream)
Performing transformations on the tokens afterwards is fairly common. For example, we can use Dijkstra's Shunting-Yard algorithm to convert from infix to postfix. We can then pass the tokens to the Stream Evaluator for evaluation.
def to_postfix(tokens, precedences):
operators = list()
for token in tokens:
if isinstance(token, int):
yield token
else:
try:
precedence = precedences[token]
while precedence <= precedences[operators[-1]]:
yield operators.pop()
except LookupError:
pass
operators.append(token)
yield from operators[::-1]
precedences = {"+": 0, "-": 0, "*": 1, "/": 1}
print(evaluate__stream(to_postfix(tokenize__simple("1 + 2 * 3"), precedences), operators))
7
Multiple Tokenizers (Infix Graph)
To build an infix graph we can build specialized parsers for each part of the grammar. The approach allows us to build more complex grammars. Whilst completely overkill for a calculator with 4 operators the idea is transferable to more complex systems.
First we'll get whitespace working.
We don't actually have a need for the whitespace, so we'll throw any runs away.
The for
loop will advance stream
leading to the consecutive whitespace being removed.
WHITESPACE = " "
def whitespace(char, stream):
if char in WHITESPACE:
for char in stream:
if char not in WHITESPACE:
break
return char, None
>>> it = iter(" 12")
>>> whitespace(next(it), it)
('1', None)
>>> next(it)
'2'
Next we want to get the numbers as integers.
The solution is a lot like whitespace
.
However we'll store the integer characters in chars
.
And we'll return chars
as an integer.
NUMBERS = "0123456789"
def number(char, stream):
chars = []
if char in NUMBERS:
chars.append(char)
for char in stream:
if char not in NUMBERS:
break
chars.append(char)
return char, int("".join(chars))
Next we'll say numbers can be surrounded by whitespace. We'll discard whatever the whitespace is. Since a number must be either side of an operator, we'll be able to put whitespace anywhere.
def spaced_number(char, stream):
char, _ = whitespace(char, stream)
char, num = number(char, stream)
char, _ = whitespace(char, stream)
return char, num
Next we'll handle multiplication and division.
By handling multiplication and division 'closer' to number
allows us to implement BODMAS.
To get the code to work we get a spaced_number
then we build a tree (more like a linked list) whilst we encounter multiplication or division.
def mul_div(char, stream):
char, number = spaced_number(char, stream)
node = number
while char in "*/":
op = char
char, number = spaced_number(next(stream, None), stream)
node = Node(op, node, number)
return char, node
Then we do the same we did for mul and div to add and sub.
However we call mul_div
rather than spaced_number
.
def add_sub(char, stream):
char, number = mul_div(char, stream)
node = number
while char in "+-":
op = char
char, number = mul_div(next(stream, None), stream)
node = Node(op, node, number)
return char, node
Finally we hide the char
and stream
implementation details away by making the tokenize__tree
function.
def tokenize__tree(text):
it = iter(text)
_, value = add_sub(next(it, None), it)
return value
print(evaluate__tree(tokenize__tree("1 + 2 * 3"), operators))
7
Comparison
Single Operator Tokenizer
Very simple and allows simple control flow without needing a complicated tokenizer.The approach is good for menus or terminal commands, like with
cmd
.Simple Tokenizer
Good at implementing postfix evaluators and simple grammars. The approach can also work with infix or prefix grammars. If you can think up an algorithm like Shunting-Yard for your use-case.The approach is good for simple grammars where using *BNF is not an option.
Multiple Tokenizers
The approach is very powerful, but also very verbose.The approach is good for complex grammars where using *BNF is not an option. Like writing a library to parse *BNF.
If a tokenizer isn't needed then use the Single Operator Tokenizer. Otherwise using *BNF is the best approach. If *BNF isn't an option then use Multiple Tokenizers. Most grammars seem simple as a beginner, so picking the option for complex grammars will allow the code room to grow if the grammar is more complex than anticipated.
*BNF
BNF
BNF is a very simple language for parsing strings. We can write the complex Multiple Tokenizers in a couple lines of BNF. Whilst BNF is a standard structure, many libraries use a bespoke flavour of the language.
First lets define the two constants. To define
NUMBERS
we need to allow any of the number characters, so we'll use|
to say either the character on the left hand side or right hand side.<WHITESPACE> ::= " " <NUMBER> ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
Next lets define
whitespace
. We'll need to use the and operator;"a" "b"
matchesab
. Then we'll need to use recursion to allow any amount of whitespace.<whitespace> ::= <WHITESPACE> <whitespace> | ""
number
is defined almost exactly the same way aswhitespace
. The difference is the minimum match (the right hand side) must be<NUMBER>
not""
to match at least one number.<number> ::= <NUMBER> <number> | <NUMBER>
From here the rest is effectively the same.
<spaced_number> ::= <whitespace> <number> <whitespace> <mul_div> ::= <spaced_number> ("*" | "/") <mul_div> | <spaced_number> <add_sub> ::= <mul_div> ("+" | "-") <add_sub> | <mul_div>
BNF, the notoriously ugly and verbose flavour, is far easier to read than the Multiple Tokenizers code we wrote.
Use a Parser Library
I've used TatSu once or twice before and found the library to be easy to use. We can write the equivalent of the entire Tokenizer, Evaluator and Operators in not much code.
Lets convert the BNF to TatSu's flavour.
TatSu supports regex, so we can define
spaced_number
in one line.spaced_number = / *\d+ */;
TatSu also seems to ignore whitespace so both
/ */
are not needed.number = /\d+/;
From here the rest can be effectively the same.
mul_div = number ("*" | "/") mul_div | number; add_sub = mul_div ("+" | "-") add_sub | mul_div;
TatSu also provides a regex like
{}*
syntax, like regex's*
, so we can avoid the recursive definition.mul_div = number {("*" | "/") number}*; add_sub = mul_div {("+" | "-") mul_div}*;
However, rather than focusing on brevity we should focus on defining clear tokens we can interact with, with the Operators class we'll provide to TatSu. So we need to make addition, subtraction, multiplication and division stand alone rules.
mul = number "*" mul_div; div = number "/" mul_div; add = mul_div "+" add_sub; sub = mul_div "-" add_sub;
We can then bind the values to names by using
name:exp
syntax.mul = lhs:number "*" rhs:mul_div; div = lhs:number "/" rhs:mul_div; add = lhs:mul_div "+" rhs:add_sub; sub = lhs:mul_div "-" rhs:add_sub;
Now we can write the entire calculator in a very small amount of code.
import tatsu
GRAMMAR = """
root = add_sub $;
number = /\d+/;
mul_div = mul | div | number;
mul = lhs:number "*" rhs:mul_div;
div = lhs:number "/" rhs:mul_div;
add_sub = add | sub | mul_div;
add = lhs:mul_div "+" rhs:add_sub;
sub = lhs:mul_div "-" rhs:add_sub;
"""
class CalculatorOperators:
def number(self, ast):
return int(ast)
def mul(self, ast):
return ast.lhs * ast.rhs
def div(self, ast):
return ast.lhs / ast.rhs
def add(self, ast):
return ast.lhs + ast.rhs
def sub(self, ast):
return ast.lhs - ast.rhs
def evaluate(text):
parser = tatsu.compile(GRAMMAR)
return parser.parse(text, semantics=CalculatorOperators())
print(evaluate("1 + 2 * 3"))
7
The code is now ridiculously maintainable. We've got the power of Multiple Tokenizers by writing a Class Operators and a very simple grammar.
We saw how much code and effort went into writing even the Simple Tokenizer. With fairly complex code and algorithms to get infix working. Vanilla Python isn't the right tool for writing a language. A library with a *BNF parser will normally be far better.
We can also easily change the code to support brackets/parenthesis.
The change would be to add a stage between number
and mul_div
.
We wouldn't even need to add a new operator.
Op's Code
Design
You've made the same mistake I have in the past. By making a Simple Tokenizer you seem to have missed the forest for the trees. We can see you've gone "how can I get x to work" but you've not looked back over the entire code and gone; "Oh, the code is the same!"
Look here:
elif join:
if isset:
try:
dct[variable] = dct[variable2] + dct[i]
except:
raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
break
join = isset = isready = isgo = False
# check if the variable is "@" change the data
if variable == "@":
RESET = True
break
elif isready:
variable2 = i
isset = True
else:
variable = i
isready = True
elif Reverse:
# this is equivalent to pseudocode var = var2.reverse()
if isready:
try:
dct[variable] = dct[i][::-1]
except:
raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
break
isready = False
Revert = False
if variable == "@":
RESET = True
break
else:
variable = i
isready = True
# ...
elif i == "+":
join = True
elif i == "/":
Reverse = True
You've written effectively the same code twice. If we split the Tokenizer away from the Evaluator and Operators then we could drastically simplify the code.
Whilst we can probably get away with a Stream Evaluator we'd be duplicating code in the Tokenizer and the Evaluator.
Since your language is a prefix language to tokenize correctly we'd have to know the amount of operands to consume from the input.
Then to evaluate correctly we'd need to know the correct amount of operands to consume from the input.
Where instead using the Tree Evaluator would remove the duplication by storing the operands in the children
variable.
I'm using an iterator, like in Multiple Tokenizers, to make the code to get the variables simpler.
By advancing the iterator _text
we don't need any complicated join
, Reverse
, isset
or isready
variables.
import itertools
ARGS = {
"+": 3,
"/": 2,
}
def tokenize(text):
_text = iter(text)
nodes = []
for char in _text:
if char in ARGS:
nodes.append(Node(char, list(itertools.islice(_text, ARGS[char]))))
# handle other tokens
return Node("root", nodes)
However I'd still strongly recommend using *BNF if at all possible. Even if you can't, using TatSu to make a mostly working version of your language would probably help you figure out how to build Multiple Tokenizers.
I really think you need to start over with TatSu. Post another question and then see where to go from there.
Correctness
Can you find bugs?
Honestly your code needs a rewrite. Looking for bugs at the moment would be a waste of time.
Additionally languages are incredibly fragile code bases. Asking us to hunt bugs when you have a very lackluster coverage is also a waste. You may fix the bug, but without testing infrastructure in place when you fix another bug you may just reintroduce the bug. You need tests to prevent regressions.
I'd recommend installing PyTest and using TDD to rebuild your code.
-
1\$\begingroup\$ @Peilonrayz fantastic job as always! I've learned so much from this. Just in awe, keep trucking. \$\endgroup\$N3buchadnezzar– N3buchadnezzar2021年12月16日 02:51:48 +00:00Commented Dec 16, 2021 at 2:51
-
\$\begingroup\$ Hello, sorry for late question, but how does
@staticmethod
work? I know how classes and decorators work, but I don't know what@staticmethod
does do. \$\endgroup\$Fmbalbuena– Fmbalbuena2024年02月08日 16:32:38 +00:00Commented Feb 8, 2024 at 16:32 -
\$\begingroup\$ @Fmbalbuena
staticmethod
removes theself
from methods when working with instances. So bothCalculatorOperators.add(1, 2)
andCalculatorOperators().add(1, 2)
work as intended. \$\endgroup\$2024年02月10日 17:54:19 +00:00Commented Feb 10, 2024 at 17:54