
Code:

# read file of first argument
import sys, os

try:
    data = open(sys.argv[1]).read()
except:
    data = open("File.txt").read()
    sys.argv.append("File.txt")

dct = {"@": data, "E": "0"}
iteration = 0
RESET = False
define = False
printf = False
replace = False
inp = False
join = False
Reverse = False
getascii = False
getfirstchar = False
removeallexceptcertainchar = False
readfile = False
comment = False
multilinecomment = False
takeagrument = False
definebutvar = False
getfilesandfolders = False
haltifeof = False
################################################################################
variable = ""
variable2 = ""
variable3 = ""
variable4 = ""
variable5 = ""
variable6 = ""
string = ""
escape = False
isready = False
isset = False
isgo = False


def erroring(errtype, errreason, tb):
    print(errreason, end="", file=sys.stderr)


sys.excepthook = erroring


def dctremoveallexceptcertain(string, variable):
    # removes all except certain characters
    # string = string to be modified
    # variable = characters to be kept
    # returns modified string
    global newstring
    newstring = ""
    for i in string:
        if i in variable:
            newstring += i
    return newstring
while True:
    # Note you should use try and except to catch errors, not if x in y:
    for i in data:
        if printf:
            # print the variable of dct variable
            if i in dct:
                print(dct[i], end="")
                printf = False
            else:
                raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                break
        elif define:
            if isready:
                if escape:
                    # check if i is quote or backslash add to string else add backslash and i
                    if i == "\"" or i == "\\":
                        string += i
                        escape = False
                    else:
                        string += "\\" + i
                        escape = False
                elif i == "\"":
                    dct[variable] = string
                    string = ""
                    isready = False
                    define = False
                    # check if the variable is "@" change the data
                    if variable == "@":
                        RESET = True
                        break
                elif i == "\\":
                    escape = True
                    continue
                else:
                    string += i
            else:
                variable = i
                isready = True
        elif replace:
            # get 3 variables
            if isgo:
                try:
                    dct[variable] = dct[variable2].replace(variable3, i)
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                    break
                replace = isset = isready = isgo = False
                # check if the variable is "@" change the data
                if variable == "@":
                    RESET = True
                    break
            elif isset:
                variable3 = i
                isgo = True
            elif isready:
                variable2 = i
                isset = True
            else:
                variable = i
                isready = True
        elif inp:
            try:
                dct[i] = input()
                dct["E"] = "0"
            except EOFError:
                dct["E"] = "1"
            inp = False
            if i == "@":
                RESET = True
                break
        elif join:
            if isset:
                try:
                    dct[variable] = dct[variable2] + dct[i]
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                    break
                join = isset = isready = isgo = False
                # check if the variable is "@" change the data
                if variable == "@":
                    RESET = True
                    break
            elif isready:
                variable2 = i
                isset = True
            else:
                variable = i
                isready = True
        elif Reverse:
            # this is equivalent to pseudocode var = var2.reverse()
            if isready:
                try:
                    dct[variable] = dct[i][::-1]
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                    break
                isready = False
                Revert = False
                if variable == "@":
                    RESET = True
                    break
            else:
                variable = i
                isready = True
        elif getfirstchar:
            # gets the first character of the variable
            if isready:
                try:
                    dct[variable] = dct[i][0]
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                isready = False
                getfirstchar = False
                if variable == "@":
                    RESET = True
                    break
            else:
                variable = i
                isready = True
        elif getascii:
            # gets ascii value of the variable
            if isready:
                try:
                    dct[i]
                    try:
                        dct[variable] = ord(dct[i])
                    except:
                        raise Exception("ERROR IN FILE", sys.argv[1], "INVALID NUMBER IN ITERATION", iteration)
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                getascii = False
                isready = False
                if variable == "@":
                    RESET = True
                    break
            else:
                variable = i
                isready = True
        elif removeallexceptcertainchar:
            # removes all except certain character
            if isready:
                try:
                    dct[variable] = dctremoveallexceptcertain(dct[variable], dct[i])
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                isready = False
                removeallexceptcertainchar = False
                if variable == "@":
                    RESET = True
                    break
            else:
                variable = i
                isready = True
        elif readfile:
            if isready:
                try:
                    dct[i]
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                try:
                    dct[variable] = open(dct[i], "r").read()
                    dct["E"] = "0"
                except:
                    dct["E"] = "1"
                isready = False
                readfile = False
                if variable == "@":
                    RESET = True
                    break
            else:
                variable = i
                isready = True
        elif comment:
            # ignore until a newline is found
            if i == "\n":
                comment = False
        elif multilinecomment:
            if i == "}":
                multilinecomment = False
        elif takeagrument:
            # checks if the variable is defined and the variable is a number
            if isready:
                try:
                    dct[i]
                    try:
                        int(dct[i])
                        try:
                            dct[variable] = sys.argv[int(dct[i])]
                            dct["E"] = "0"
                        except:
                            dct["E"] = "1"
                    except:
                        raise Exception("ERROR IN FILE", sys.argv[1], "INVALID NUMBER", iteration)
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                isready = False
                takeagrument = False
                if variable == "@":
                    RESET = True
            else:
                variable = i
                isready = True
        elif definebutvar:
            # like > but it defines the variable
            if isready:
                try:
                    dct[variable] = dct[i]
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                isready = False
                definebutvar = False
                if variable == "@":
                    RESET = True
                    break
            else:
                variable = i
                isready = True
        elif getfilesandfolders:
            # gets all files and folders in a variable if folder not found the variable "E" is equal to "1" else "0"
            if isready:
                # check if variable is defined
                try:
                    dct[i]
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                try:
                    files = os.listdir(dct[i])
                    dct[variable] = files
                    dct["E"] = "0"
                except:
                    dct["E"] = "1"
                isready = False
                getfilesandfolders = False
                if variable == "@":
                    RESET = True
                    break
            else:
                variable = i
                isready = True
        elif haltifeof:
            if dct["E"] == "1": break
            haltifeof = False
        elif i == "@":  ############################################################
            RESET = True
            break
        elif i == "H":
            break
        elif i == ">":
            define = True
        elif i == "*":
            printf = True
        elif i == "$":
            replace = True
        elif i == "&":
            inp = True
        elif i == "+":
            join = True
        elif i == "/":
            Reverse = True
        elif i == "?":
            getfirstchar = True
        elif i == ".":
            getascii = True
        elif i == "!":
            removeallexceptcertainchar = True
        elif i == ";":
            readfile = True
        elif i == ":":
            comment = True
        elif i == "{":
            multilinecomment = True
        elif i == "|":
            takeagrument = True
        elif i == "=":
            definebutvar = True
        elif i == "%":
            getfilesandfolders = True
        elif i == "_":
            haltifeof = True
        # its not necessary to raise error invalid command
        # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
        iteration += 1
    # check if string still contains something put it in dct
    if string != "":
        dct[variable] = string
        string = ""
    define = False
    isready = False
    isset = False
    isgo = False
    printf = False
    replace = False
    inp = False
    join, Reverse = False, False
    getfirstchar = False
    getascii = False
    removeallexceptcertainchar = False
    readfile = False
    comment = False
    multilinecomment = False
    takeagrument = False
    definebutvar = False
    getfilesandfolders = False
    # check if dct is "@" change the data
    if variable == "@":
        RESET = True
    if RESET:
        RESET = False
        data = dct["@"]
        iteration = 0
        continue
    break
try:
    print(dct["#"], end="")
except:
    pass

This is a programming language interpreter. I will show the wiki below.

Commands:

(a, b, c, d stand for the variable characters that follow the command)

"@" goto first char

"H" halt (must be uppercase, not lowercase)

">" define var Syntax: >~Hello" Escape is backslash (the quote is optional)

"*" print the next char variable

"$" a = b.replace(c, d)

"&" Input the next char variable ("E" var is 1 if EOF)

"+" a = b.join(c)

"/" a = b.reverse()

"?" a = b.getfirstchar()

"." a = chr(b)

"!" a = a but non-b chars removed

";" a = readfile(b) ("E" = 1 if the file doesn't exist)

":" comment

"{" start multiline comment

"}" end multiline comment

"|" a = takearg(int(b))

"=" a = b

"%" a = str(getfilesandfolders(b))

"_" halt if EOF

Variables:

"@" if the variable is edited, the program changes

"#" print when program ends.

"E" EOF Var


Examples:

>#Hello, World!

Hello, World!

@

Infinite loop

&#*#

reads a line and outputs it twice, without a newline

>HHello, World!"

NOP

>H\""*H

print double quotes
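
One more small example (it should behave like this given the interpreter above):

&A/#A

reads a line and prints it reversed when the program ends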


Can you find bugs?

asked Nov 22, 2021 at 22:52
Comments:

  • Do you have a name for your programming language, and a sample program in that language? Commented Nov 23, 2021 at 18:30
  • My programming language's name is NLRNIS. Commented Nov 23, 2021 at 18:41
  • And I will show samples later. Commented Nov 23, 2021 at 18:43
  • Are you asking us to comment on your samples as well? It's a bit hard to comment on the code without some samples to see what you're expecting to happen. (And use some functions, please.) Commented Nov 23, 2021 at 19:20
  • I named it NLRNIS because there are No Loops, Recursion, nor if statements. Commented Nov 27, 2021 at 0:23

1 Answer


Building a language with good-quality code requires understanding the various approaches to building operators, evaluators and tokenizers. Let's go through the concepts from simplest to most complicated by building a calculator.

Operators

The operators layer is concerned with evaluating each individual operator. Normally an operator function is passed arguments and returns the result, like add(1, 2).

Inline Operators

The simplest approach is to define all the operators inline in a 'switch'. Since Python doesn't have a dedicated switch statement (until match in 3.10), we can instead use if/elif/else statements.

if op == "+":
    result = lhs + rhs
elif op == "-":
    result = lhs - rhs
elif op == "*":
    result = lhs * rhs
elif op == "/":
    result = lhs / rhs

Dictionary Operators

We can store the operators in an object, giving us the ability to pass the operators to the evaluator and/or the tokenizer. Since the operators are strings, we can build a dictionary mapping each operator string to the corresponding operator function.

When building the dictionary we can use lambda functions, letting us define each operator inside the dictionary itself. However, lambda functions can only contain one expression in Python; if we need more than one expression we will have to use 'full-fat' functions.
I have made addition a 'full-fat' function, so we have an example of both approaches.

def add(lhs, rhs):
    return lhs + rhs


operators = {
    "+": add,
    "-": lambda lhs, rhs: lhs - rhs,
    "*": lambda lhs, rhs: lhs * rhs,
    "/": lambda lhs, rhs: lhs / rhs,
}

Class Operators

Rather than defining the operators in a dictionary we can define the operators on a class. We can then build the operators dictionary somewhat automatically from the class. We do however have to use valid Python names for the operators.

class CalculatorOperators:
    @staticmethod
    def add(lhs, rhs):
        return lhs + rhs

    @staticmethod
    def sub(lhs, rhs):
        return lhs - rhs

    @staticmethod
    def mul(lhs, rhs):
        return lhs * rhs

    @staticmethod
    def div(lhs, rhs):
        return lhs / rhs
We can then iterate through the class by iterating through __dict__. To only get our functions we can filter out any dunder values.

def get_operators(obj):
    return {
        name: getattr(obj, name)
        for name in obj.__dict__
        if not (name.startswith("__") and name.endswith("__"))
    }


print(get_operators(CalculatorOperators))
{'add': <function CalculatorOperators.add at 0x7ff171f61f30>, ...}

Metaclass Operators

We can automatically call get_operators by having a base class with an __init_subclass__ method or a custom metaclass. Either runs when the class is defined and transparently builds the operators dictionary. (A sketch of the metaclass version follows the example below.)

class Operators:
    def __init_subclass__(cls, /, **kwargs):
        cls.OPERATORS = get_operators(cls)


class CalculatorOperators(Operators):
    @staticmethod
    def add(lhs, rhs):
        return lhs + rhs
    ...


print(CalculatorOperators.OPERATORS)
{'add': <function CalculatorOperators.add at 0x7ff171f623b0>, ...}
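
For completeness, here is a rough sketch (untested) of the custom-metaclass alternative mentioned above. It builds OPERATORS at class-creation time, just like __init_subclass__ does; OperatorsMeta is my name for it.

class OperatorsMeta(type):
    def __new__(mcls, name, bases, namespace):
        # create the class as normal, then attach the operator table
        cls = super().__new__(mcls, name, bases, namespace)
        cls.OPERATORS = get_operators(cls)
        return cls


class CalculatorOperators(metaclass=OperatorsMeta):
    @staticmethod
    def add(lhs, rhs):
        return lhs + rhs
    ...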

Decorated Class Operators

We can customize the key to use by defining a decorator function (somewhat comprehensive guide by me). The decorator function can assign the desired name to an attribute on the function. We'd need to make a little change to how we build operators. We'd need to use the function's attribute value if available.

def with_name(name=None):
    def inner(fn):
        fn.name = name
        return fn
    return inner


class CalculatorEvaluator:
    @with_name("+")
    @staticmethod
    def add(lhs, rhs):
        return lhs + rhs
    ...


operators = {
    getattr(fn, "name", name): fn
    for name, fn in CalculatorEvaluator.__dict__.items()
    if not (name.startswith("__") and name.endswith("__"))
}
print(operators)
{'+': <staticmethod(<function CalculatorEvaluator.add at 0x7ff171f62560>)>, ...}

Comparison

  • Inline Operators
    The simplest approach. However with larger operator sets or complicated logic for the operators the code quickly becomes unmaintainable.

    The approach is good when building a very simple evaluator for BrainFuck.

  • Dictionary Operators
    A very common approach; Googling "Python switch" is bound to produce results saying to use a dictionary. The approach has a complexity improvement over Inline Operators, going from \$O(n)\$ to \$O(1)\$ operator lookup. I, however, don't find the approach much more maintainable than Inline Operators, as it is prone to code fragmentation and to the functions and the operators dictionary getting out of sync.

    The approach is good when:

    • all the operators can be defined as lambdas and documenting the operators isn't needed. Or,
    • the operators must be fragmented and can't be contained within a single class/namespace. For example a menu screen, where the user can reach the same view in two different ways.
  • Class Operators
    The most advanced approach of the three, requiring an understanding of __dict__ (and potentially __slots__). After learning the little bit of metaprogramming required to understand the approach, though, I find it the simplest: we can throw all our operators into one class, document the class like any other class, and adapt the approach to our use case.

    Whilst we currently need to use Decorated Class Operators (because "+" is not a valid Python name), we can normally use the simpler Class Operators approach when using the best approach to defining our own language: a *BNF parser library, covered later.

    The approach seems to be the standard approach, ranging from single operator evaluators like cmd (sketched briefly below) to parser libraries like TatSu.
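
As an aside, here is roughly what the cmd version of the calculator could look like (a sketch; CalculatorShell and the do_* names are mine). Each do_* method is an operator on the class, and cmd builds the dispatch for us.

import cmd


class CalculatorShell(cmd.Cmd):
    prompt = "calc> "

    def do_add(self, arg):
        """add LHS RHS -- print the sum of two numbers."""
        lhs, rhs = arg.split()
        print(int(lhs) + int(rhs))

    def do_quit(self, arg):
        """quit -- leave the shell."""
        return True  # a truthy return value stops cmdloop


# CalculatorShell().cmdloop()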

For the rest of the answer operators is the following:

operators = {
    "+": lambda lhs, rhs: lhs + rhs,
    "-": lambda lhs, rhs: lhs - rhs,
    "*": lambda lhs, rhs: lhs * rhs,
    "/": lambda lhs, rhs: lhs / rhs,
}

Evaluator

Now we've gone through the operators we can move on to the evaluators. Evaluators are generally pretty simple, like the operators.

Single Operator Evaluator

We need to evaluate an operator and some values. To do so we just get the operator function from operators and pass it the values.

print(operators["+"](1, 2))
3

Stream Evaluator

Stream evaluators typically use a stack, and so work best on postfix input. In Python the stack can just be a list. Whenever we have a non-operator token, we push it onto the stack. Whenever we have an operator token, we pop the operands from the stack, pass them to the operator function, and push the result back onto the stack. After processing all the tokens we pop the top value off the stack.

def evaluate__stream(tokens, operators):
    stack = []
    for token in tokens:
        if token in operators:
            b = stack.pop()
            a = stack.pop()
            token = operators[token](a, b)
        stack.append(token)
    return stack.pop()


print(evaluate__stream([1, 2, 3, "*", "+"], operators))
7

Tree Evaluator

Tree evaluators interact with a tree. To have a tree the input must be nodes of some sort. We can build a simple Node class to store the operator and children.

class Node:
    def __init__(self, operator, *children):
        self.operator = operator
        self.children = children

    def __repr__(self):
        values = [self.operator, *self.children]
        return f"Node({', '.join(map(repr, values))})"

    def __str__(self):  # Only supports binary operators
        op = self.operator
        lhs = self.children[0]
        rhs = self.children[1]
        return f"({lhs} {op} {rhs})"


ast = Node("+", 1, Node("*", 2, 3))
print(repr(ast))
print(ast)
Node('+', 1, Node('*', 2, 3))
(1 + (2 * 3))

We then perform a depth-first walk of the tree; otherwise we would end up performing the operation 1 + Node("*", 2, 3), resulting in an error. We can build a simple tree evaluator using recursion.

def evaluate__tree(node, operators):
    if not isinstance(node, Node):
        return node
    children = [
        evaluate__tree(child, operators)
        for child in node.children
    ]
    return operators[node.operator](*children)


print(evaluate__tree(ast, operators))
7

Comparison

  • Single Operator Evaluator
    Very simple and allows simple control flow.

    The approach is good for menus or terminal commands, like with cmd.

  • Stream Evaluator
    Good at implementing postfix evaluators. However, most human-readable languages aren't purely postfix:

    • binary operators are typically infix. And,
    • functions are typically prefix.

    The approach is good for postfix languages, or languages easily converted to postfix.

  • Tree Evaluator
    Trees are typically easier to understand than a postfix stream. As such languages typically build and evaluate an Abstract Syntax Tree (AST). An AST is also easier to mutate as we don't need to partially evaluate the stream to find the children.

    The approach is good for most languages.

The Evaluator to pick is largely dictated by how the tokens are made. Using a Tree Evaluator with the Single Operator Tokenizer doesn't make sense, as the evaluator is complete overkill for that tokenizer. Additionally, the Single Operator Evaluator can only work with the Single Operator Tokenizer.

Tokenizer

Now we've got the easy stuff out of the way, let's focus on the hard part: converting a string into a stream of tokens.

Single Operator Tokenizer

The simplest approach is to get the user to provide the individual tokens. We can do so by just calling input a bunch, effectively sidestepping building a tokenizer.

lhs = int(input("lhs> "))
op = input("op > ")
rhs = int(input("rhs> "))
print(operators[op](lhs, rhs))
lhs> 1
op > +
rhs> 2
3

Simple Tokenizer (postfix)

This section is based on another one of my answers.

Tokenizers can be pretty complicated to build correctly as a beginner. In the past I've gotten caught up trying to implement one thing, forgetting that I need the tokenizer to do everything, so my code became complicated as I missed the forest for the trees. Let's use TDD to build the tokenizer:

  1. We'll handle only numbers.
    However, only one character at a time, and we'll convert it to an int.

    def tokenize__simple(text):
        for char in text:
            if char in "0123456789":
                yield int(char)

    >>> list(tokenize__simple("123"))
    [1, 2, 3]
    
  2. We'll handle one number of any size.
    We'll build up a 'token' list, using a list rather than a string to get \$O(1)\$ time when adding new characters; appending to a Python string takes at least \$O(n)\$ time.

    def tokenize__simple(text):
        token = []
        for char in text:
            if char in "0123456789":
                token.append(char)
        if token:
            yield int("".join(token))

    >>> list(tokenize__simple("1 2 3"))
    [123]
    >>> list(tokenize__simple(""))
    []
    
  3. We'll handle spaces.
    We just need to copy the if token code, and reset token after yielding a number.

    def tokenize__simple(text):
        token = []
        for char in text:
            if char in "0123456789":
                token.append(char)
            elif char in " ":
                if token:
                    yield int("".join(token))
                    token = []
        if token:
            yield int("".join(token))

    >>> list(tokenize__simple("1 2 3"))
    [1, 2, 3]
    
  4. We'll handle operators.
    Basically the same as spaces but we also yield the operator.

    def tokenize__simple(text):
        token = []
        for char in text:
            if char in "0123456789":
                token.append(char)
            elif char in " +-*/":
                if token:
                    yield int("".join(token))
                    token = []
                if char != " ":
                    yield char
        if token:
            yield int("".join(token))

    >>> list(tokenize__simple("1+ 2*3"))
    [1, "+", 2, "*", 3]
    

From here we can tokenize numbers in postfix notation and pass the values to the Stream Evaluator we wrote earlier.

print(evaluate__stream(tokenize__simple("1 2 3 * +"), operators))
7

Simple Tokenizer + Transformation (Infix Stream)

Performing transformations on the tokens afterwards is fairly common. For example, we can use Dijkstra's Shunting-Yard algorithm to convert from infix to postfix. We can then pass the tokens to the Stream Evaluator for evaluation.

def to_postfix(tokens, precedences):
    operators = list()
    for token in tokens:
        if isinstance(token, int):
            yield token
        else:
            try:
                precedence = precedences[token]
                while precedence <= precedences[operators[-1]]:
                    yield operators.pop()
            except LookupError:
                pass
            operators.append(token)
    yield from operators[::-1]


precedences = {"+": 0, "-": 0, "*": 1, "/": 1}
print(evaluate__stream(to_postfix(tokenize__simple("1 + 2 * 3"), precedences), operators))
7

Multiple Tokenizers (Infix Graph)

To build an infix graph we can build specialized parsers for each part of the grammar. The approach allows us to build more complex grammars. Whilst completely overkill for a calculator with 4 operators, the idea transfers to more complex systems.

First we'll get whitespace working. We don't actually need the whitespace, so we'll throw runs of it away. The for loop advances stream, so consecutive whitespace gets consumed.

WHITESPACE = " "


def whitespace(char, stream):
    if char in WHITESPACE:
        for char in stream:
            if char not in WHITESPACE:
                break
    return char, None

>>> it = iter(" 12")
>>> whitespace(next(it), it)
('1', None)
>>> next(it)
'2'

Next we want to get the numbers as integers. The solution is a lot like whitespace; however, we'll store the digit characters in chars and return chars converted to an integer.

NUMBERS = "0123456789"


def number(char, stream):
    chars = []
    if char in NUMBERS:
        chars.append(char)
        for char in stream:
            if char not in NUMBERS:
                break
            chars.append(char)
    return char, int("".join(chars))

Next we'll say numbers can be surrounded by whitespace, discarding whatever the whitespace is. Since a number must appear on either side of an operator, this lets us put whitespace anywhere.

def spaced_number(char, stream):
    char, _ = whitespace(char, stream)
    char, num = number(char, stream)
    char, _ = whitespace(char, stream)
    return char, num

Next we'll handle multiplication and division. Handling multiplication and division 'closer' to number is what implements BODMAS. To get the code to work we get a spaced_number, then build a tree (more like a linked list) for as long as we keep encountering multiplication or division.

def mul_div(char, stream):
    char, number = spaced_number(char, stream)
    node = number
    while char in "*/":
        op = char
        char, number = spaced_number(next(stream, None), stream)
        node = Node(op, node, number)
    return char, node

Then we do the same for add and sub as we did for mul and div; however, we call mul_div rather than spaced_number.

def add_sub(char, stream):
    char, number = mul_div(char, stream)
    node = number
    while char in "+-":
        op = char
        char, number = mul_div(next(stream, None), stream)
        node = Node(op, node, number)
    return char, node

Finally we hide the char and stream implementation details away by making the tokenize__tree function.

def tokenize__tree(text):
    it = iter(text)
    _, value = add_sub(next(it, None), it)
    return value


print(evaluate__tree(tokenize__tree("1 + 2 * 3"), operators))
7

Comparison

  • Single Operator Tokenizer
    Very simple and allows simple control flow without needing a complicated tokenizer.

    The approach is good for menus or terminal commands, like with cmd.

  • Simple Tokenizer
    Good at implementing postfix evaluators and simple grammars. The approach can also work with infix or prefix grammars, if you can think up an algorithm like Shunting-Yard for your use case.

    The approach is good for simple grammars where using *BNF is not an option.

  • Multiple Tokenizers
    The approach is very powerful, but also very verbose.

    The approach is good for complex grammars where using *BNF is not an option. Like writing a library to parse *BNF.

If a tokenizer isn't needed then use the Single Operator Tokenizer. Otherwise using *BNF is the best approach; if *BNF isn't an option then use Multiple Tokenizers. Most grammars seem simple when you're a beginner, so picking the option for complex grammars gives the code room to grow if the grammar turns out to be more complex than anticipated.

*BNF

BNF

BNF is a very simple language for describing how to parse strings. We can express the complex Multiple Tokenizers in a couple of lines of BNF. Whilst BNF has a standard structure, many libraries use a bespoke flavour of the language.

  • First let's define the two constants. To define <NUMBER> we need to allow any of the digit characters, so we'll use | to say either the value on the left-hand side or the right-hand side.

    <WHITESPACE> ::= " "
    <NUMBER> ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
    
  • Next let's define whitespace. We'll need to use concatenation (the 'and' operator): "a" "b" matches ab. Then we'll use recursion to allow any amount of whitespace.

    <whitespace> ::= <WHITESPACE> <whitespace> | ""
    
  • number is defined almost exactly the same way as whitespace. The difference is the minimum match (the right hand side) must be <NUMBER> not "" to match at least one number.

    <number> ::= <NUMBER> <number> | <NUMBER>
    
  • From here the rest is effectively the same.

    <spaced_number> ::= <whitespace> <number> <whitespace>
    <mul_div> ::= <spaced_number> ("*" | "/") <mul_div> | <spaced_number>
    <add_sub> ::= <mul_div> ("+" | "-") <add_sub> | <mul_div>
    

BNF, the notoriously ugly and verbose flavour, is far easier to read than the Multiple Tokenizers code we wrote.

Use a Parser Library

I've used TatSu once or twice before and found the library to be easy to use. We can write the equivalent of the entire Tokenizer, Evaluator and Operators in not much code.

Let's convert the BNF to TatSu's flavour.

  • TatSu supports regex, so we can define spaced_number in one line.

    spaced_number = / *\d+ */;
    

    TatSu also seems to ignore whitespace, so neither / */ is needed.

    number = /\d+/;
    
  • From here the rest can be effectively the same.

    mul_div = number ("*" | "/") mul_div | number;
    add_sub = mul_div ("+" | "-") add_sub | mul_div;
    

    TatSu also provides a repetition syntax, {}*, much like regex's *, so we can avoid the recursive definitions.

    mul_div = number {("*" | "/") number}*;
    add_sub = mul_div {("+" | "-") mul_div}*;
    
  • However, rather than focusing on brevity we should focus on defining clear rules we can hook into from the Operators class we'll provide to TatSu. So we need to make addition, subtraction, multiplication and division stand-alone rules.

    mul = number "*" mul_div;
    div = number "/" mul_div;
    add = mul_div "+" add_sub;
    sub = mul_div "-" add_sub;
    

    We can then bind the values to names by using name:exp syntax.

    mul = lhs:number "*" rhs:mul_div;
    div = lhs:number "/" rhs:mul_div;
    add = lhs:mul_div "+" rhs:add_sub;
    sub = lhs:mul_div "-" rhs:add_sub;
    

Now we can write the entire calculator in a very small amount of code.

import tatsu

GRAMMAR = """
root = add_sub $;
number = /\d+/;
mul_div = mul | div | number;
mul = lhs:number "*" rhs:mul_div;
div = lhs:number "/" rhs:mul_div;
add_sub = add | sub | mul_div;
add = lhs:mul_div "+" rhs:add_sub;
sub = lhs:mul_div "-" rhs:add_sub;
"""


class CalculatorOperators:
    def number(self, ast):
        return int(ast)

    def mul(self, ast):
        return ast.lhs * ast.rhs

    def div(self, ast):
        return ast.lhs / ast.rhs

    def add(self, ast):
        return ast.lhs + ast.rhs

    def sub(self, ast):
        return ast.lhs - ast.rhs


def evaluate(text):
    parser = tatsu.compile(GRAMMAR)
    return parser.parse(text, semantics=CalculatorOperators())


print(evaluate("1 + 2 * 3"))
7

The code is now ridiculously maintainable. We've got the power of Multiple Tokenizers by writing a Class Operators-style semantics class and a very simple grammar.

We saw how much code and effort went into writing even the Simple Tokenizer, and fairly complex code and algorithms were needed to get infix working. Vanilla Python isn't the right tool for writing a language; a library with a *BNF parser will normally be far better.

We can also easily change the code to support brackets/parentheses. The change would be to add a stage between number and mul_div, and we wouldn't even need to add a new operator, as sketched below.
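
For example, a rough sketch of that change (untested; group is my name for the new stage, and @: is TatSu's syntax for returning the inner element, so no new semantics method should be needed):

GRAMMAR_WITH_PARENS = r"""
root = add_sub $;
number = /\d+/;
group = "(" @:add_sub ")" | number;
mul_div = mul | div | group;
mul = lhs:group "*" rhs:mul_div;
div = lhs:group "/" rhs:mul_div;
add_sub = add | sub | mul_div;
add = lhs:mul_div "+" rhs:add_sub;
sub = lhs:mul_div "-" rhs:add_sub;
"""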

OP's Code

Design

You've made the same mistake I have made in the past. By making a Simple Tokenizer you seem to have missed the forest for the trees. We can see you've gone "how can I get x to work?" but you've not looked back over the entire code and gone, "Oh, this code is all the same!"

Look here:

        elif join:
            if isset:
                try:
                    dct[variable] = dct[variable2] + dct[i]
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                    break
                join = isset = isready = isgo = False
                # check if the variable is "@" change the data
                if variable == "@":
                    RESET = True
                    break
            elif isready:
                variable2 = i
                isset = True
            else:
                variable = i
                isready = True
        elif Reverse:
            # this is equivalent to pseudocode var = var2.reverse()
            if isready:
                try:
                    dct[variable] = dct[i][::-1]
                except:
                    raise Exception("ERROR IN FILE", sys.argv[1], "VARIABLE NOT FOUND IN ITERATION", iteration)
                    break
                isready = False
                Revert = False
                if variable == "@":
                    RESET = True
                    break
            else:
                variable = i
                isready = True
# ...
        elif i == "+":
            join = True
        elif i == "/":
            Reverse = True

You've written effectively the same code twice. If we split the Tokenizer away from the Evaluator and Operators then we could drastically simplify the code.

Whilst we could probably get away with a Stream Evaluator, we'd be duplicating code between the Tokenizer and the Evaluator: since your language is a prefix language, to tokenize correctly we'd have to know how many operands to consume from the input, and then to evaluate correctly we'd need to know that same operand count again. Using the Tree Evaluator instead removes the duplication by storing the operands in the node's children.

I'm using an iterator, like in Multiple Tokenizers, to make the code that reads the variables simpler. By advancing the iterator _text we don't need any of the complicated join, Reverse, isset or isready flags.

import itertools

ARGS = {
    "+": 3,
    "/": 2,
}


def tokenize(text):
    _text = iter(text)
    nodes = []
    for char in _text:
        if char in ARGS:
            # consume this operator's operands directly from the iterator
            nodes.append(Node(char, *itertools.islice(_text, ARGS[char])))
        # handle other tokens
    return Node("root", *nodes)
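
To show how the two halves could meet, here is a minimal sketch (untested) of the matching Tree Evaluator side. It assumes the tokenize above and an environment dictionary like your dct; the operator functions receive variable names rather than values, because every command reads and writes named variables.

def op_join(env, target, lhs, rhs):      # "+" : target = lhs + rhs
    env[target] = env[lhs] + env[rhs]


def op_reverse(env, target, source):     # "/" : target = source reversed
    env[target] = env[source][::-1]


OPERATORS = {"+": op_join, "/": op_reverse}


def evaluate(root, env):
    # the root node holds one child per command, in program order
    for node in root.children:
        OPERATORS[node.operator](env, *node.children)


env = {"@": "", "E": "0", "b": "Hello", "c": ", World!"}
evaluate(tokenize("+abc/xa"), env)
# env["a"] == "Hello, World!" and env["x"] == "!dlroW ,olleH"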

However I'd still strongly recommend using *BNF if at all possible. Even if you can't, using TatSu to make a mostly working version of your language would probably help you figure out how to build Multiple Tokenizers.

I really think you need to start over with TatSu. Post another question and then see where to go from there.
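
To give a flavour of what that starting point might look like, here is a very rough, untested sketch of a few commands in TatSu's notation (the rule names are mine, most commands are missing, and you would need to check TatSu's whitespace handling for a character-oriented language like yours):

NLRNIS_GRAMMAR = r"""
program = { command } $;
command = define | printvar | halt;
define = ">" name:/./ value:/[^"]*/ '"';
printvar = "*" name:/./;
halt = "H";
"""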

Correctness

Can you find bugs?

Honestly your code needs a rewrite. Looking for bugs at the moment would be a waste of time.

Additionally, languages are incredibly fragile code bases. Asking us to hunt bugs when you have very lacklustre test coverage is also a waste: you may fix one bug, but without testing infrastructure in place, fixing another bug may just reintroduce the first. You need tests to prevent regressions.

I'd recommend installing PyTest and using TDD to rebuild your code.
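
For example, the first couple of tests might look something like this (run_program is a hypothetical entry point, standing in for whatever the rewritten interpreter exposes, and the second test assumes it returns the variable environment):

def test_define_and_print_at_end(capsys):
    # >#Hello, World!" defines "#", which is printed when the program ends
    run_program('>#Hello, World!"')
    assert capsys.readouterr().out == "Hello, World!"


def test_reverse():
    # >AHello" defines A, then /#A sets # to A reversed
    env = run_program('>AHello"/#A')
    assert env["#"] == "olleH"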

answered Dec 16, 2021 at 0:14
Comments:

  • @Peilonrayz fantastic job as always! I've learned so much from this. Just in awe, keep trucking. Commented Dec 16, 2021 at 2:51
  • Hello, sorry for the late question, but how does @staticmethod work? I know how classes and decorators work, but I don't know what @staticmethod actually does. Commented Feb 8, 2024 at 16:32
  • @Fmbalbuena staticmethod removes the self from methods when working with instances, so both CalculatorOperators.add(1, 2) and CalculatorOperators().add(1, 2) work as intended. Commented Feb 10, 2024 at 17:54
