Let me start off by saying that I have several years of experience in Java, but now I need to learn Python, so I decided to make a calculator, which also happens to be a community challenge.
Please review the code looking for beginner mistakes; I do intend the code to look professional. This is my first real program written in Python, and two days ago I knew nothing about the language.
My calculator currently has the following abilities:
- It evaluates expressions, so you cannot interact with it.
- The supported operators are +, -, * and /.
- Functions are not supported.
- It uses the Shunting-yard algorithm.
- It uses Reverse Polish Notation.
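For example, the infix expression 2 * 4 + 3 is first converted to the RPN sequence 2 4 * 3 + and then evaluated with a stack.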
calculator/tokens.py
from decimal import Decimal

__author__ = 'Frank van Heeswijk'


class Token:
    pass


class ValueToken(Token):
    def __init__(self, value: Decimal):
        self.value = value

    def __repr__(self):
        return "VT(" + str(self.value) + ")"

    def __hash__(self):
        return hash(self.value)

    def __eq__(self, other):
        if type(self) != type(other):
            return False
        return self.value == other.value

    def __ne__(self, other):
        return not self == other


class OperatorToken(Token):
    def __init__(self, operator: str):
        self.operator = operator

    def __repr__(self):
        return "OT(" + self.operator + ")"

    def __hash__(self):
        return hash(str)

    def __eq__(self, other):
        if type(self) != type(other):
            return False
        return self.operator == other.operator

    def __ne__(self, other):
        return not self == other


class LeftParenthesesToken(Token):
    def __repr__(self):
        return "LPT"

    def __hash__(self):
        return 0

    def __eq__(self, other):
        if type(self) != type(other):
            return False
        return True

    def __ne__(self, other):
        return not self == other


class RightParenthesesToken(Token):
    def __repr__(self):
        return "RPT"

    def __hash__(self):
        return 0

    def __eq__(self, other):
        if type(self) != type(other):
            return False
        return True

    def __ne__(self, other):
        return not self == other
calculator/calculator.py
from decimal import Decimal
from enum import Enum
import inspect
import re

from calculator.tokens import OperatorToken, ValueToken, LeftParenthesesToken, RightParenthesesToken

__author__ = 'Frank van Heeswijk'


class Associativity(Enum):
    left = 1
    right = 2


class Calculator:
    __operators = {
        # reference: http://en.wikipedia.org/wiki/Operators_in_C_and_C%2B%2B#Operator_precedence
        # operator: (precedence, associativity, function)
        "u+": (-3, Associativity.right, lambda op: op),
        "u-": (-3, Associativity.right, lambda op: -op),
        "*": (-5, Associativity.left, lambda op1, op2: op1 * op2),
        "/": (-5, Associativity.left, lambda op1, op2: op1 / op2),
        "+": (-6, Associativity.left, lambda op1, op2: op1 + op2),
        "-": (-6, Associativity.left, lambda op1, op2: op1 - op2)
    }

    def __init__(self):
        self.operators = Calculator.__operators

    def evaluate(self, expression: str) -> Decimal:
        """
        Evaluates an expression and returns its result.

        :param expression: The input expression
        :return: The output of evaluating the expression
        """
        tokens = self.to_rpn(self.tokenize(expression))
        stack = []
        for token in tokens:
            if isinstance(token, ValueToken):
                stack.append(token.value)
            elif isinstance(token, OperatorToken):
                function = self.operators[token.operator][2]
                argspec = inspect.getargspec(function)
                argument_count = len(argspec.args)
                if len(stack) < argument_count:
                    raise RuntimeError("not enough tokens for: " + str(token) + ", expected: " + str(argument_count) + ", actual: " + str(len(tokens)))
                values = [stack.pop() for x in range(argument_count)]
                values.reverse()
                result = function(*values)
                stack.append(result)
            else:
                raise RuntimeError("unexpected token: " + token)
        return stack.pop()

    def tokenize(self, expression: str) -> list:
        """
        Tokenizes an expression and produces an output list of tokens.

        :rtype: list of [Token]
        :param expression: The input expression
        """
        tokens = []
        stripped_expression = expression.replace(' ', '')

        value_regex = re.compile(r"\d+(\.\d+)?")
        operator_regex = re.compile(r"[^\d\.\(\)]")
        left_parentheses_regex = re.compile(r"\(")
        right_parentheses_regex = re.compile(r"\)")
        regexps = [value_regex, operator_regex, left_parentheses_regex, right_parentheses_regex]
        raw_patterns = "|".join(map(lambda regex: regex.pattern, regexps))
        capture_regex = re.compile("(" + raw_patterns + ")")

        for raw_token, something_else in capture_regex.findall(stripped_expression):
            if value_regex.match(raw_token):
                tokens.append(ValueToken(Decimal(raw_token)))
            elif operator_regex.match(raw_token):
                if raw_token not in self.__operators:
                    raise RuntimeError("unsupported operator: " + raw_token)
                tokens.append(OperatorToken(raw_token))
            elif left_parentheses_regex.match(raw_token):
                tokens.append(LeftParenthesesToken())
            elif right_parentheses_regex.match(raw_token):
                tokens.append(RightParenthesesToken())
            else:
                raise RuntimeError("token " + raw_token + " does not match any regex")

        # resolve unary plus and minus operators
        for index, token in enumerate(tokens):
            if isinstance(token, OperatorToken) and token.operator == '-':
                if index == 0\
                        or isinstance(tokens[index - 1], LeftParenthesesToken)\
                        or isinstance(tokens[index - 1], OperatorToken):
                    tokens[index] = OperatorToken('u-')
            elif isinstance(token, OperatorToken) and token.operator == '+':
                if index == 0\
                        or isinstance(tokens[index - 1], LeftParenthesesToken)\
                        or isinstance(tokens[index - 1], OperatorToken):
                    tokens[index] = OperatorToken('u+')

        return tokens

    def to_rpn(self, tokens: list) -> list:
        """
        Converts a list of tokens to an output list in Reverse Polish Notation form.

        :rtype: list of [Token]
        :type tokens: list of [Token]
        :param tokens: The input tokens
        :raise RuntimeError: If the parentheses are mismatched
        """
        output_queue = []
        stack = []
        for token in tokens:
            if isinstance(token, ValueToken):
                output_queue.append(token)
            elif isinstance(token, LeftParenthesesToken):
                stack.append(token)
            elif isinstance(token, RightParenthesesToken):
                while len(stack) > 0:
                    pop_token = stack.pop()
                    if isinstance(pop_token, LeftParenthesesToken):
                        break
                    output_queue.append(pop_token)
                    # todo implement function support
                else:
                    raise RuntimeError("mismatched parentheses")
            elif isinstance(token, OperatorToken):
                while len(stack) > 0:
                    pop_token = stack.pop()
                    if isinstance(pop_token, OperatorToken) and self.__has_lower_precedence(token, pop_token):
                        output_queue.append(pop_token)
                    else:
                        stack.append(pop_token)
                        break
                stack.append(token)
            else:
                raise RuntimeError("unexpected token: " + token)

        while len(stack) > 0:
            pop_token = stack.pop()
            if isinstance(pop_token, LeftParenthesesToken):
                raise RuntimeError("mismatched parentheses")
            output_queue.append(pop_token)

        return output_queue

    def __has_lower_precedence(self, operatortoken1: OperatorToken, operatortoken2: OperatorToken) -> bool:
        operator1 = operatortoken1.operator
        operator2 = operatortoken2.operator
        if operator1 not in self.operators:
            raise RuntimeError("Unsupported operator token: " + operator1)
        if operator2 not in self.operators:
            raise RuntimeError("Unsupported operator token: " + operator2)
        operator1_tuple = self.operators[operator1]
        operator2_tuple = self.operators[operator2]
        return (operator1_tuple[1] == Associativity.left and operator1_tuple[0] <= operator2_tuple[0]) \
            or (operator1_tuple[1] == Associativity.right and operator1_tuple[0] < operator2_tuple[0])
calculator_application.py
import sys

from calculator.calculator import Calculator

__author__ = 'Frank van Heeswijk'

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: calculator_application \"<expression>\"")
        sys.exit(1)
    calculator = Calculator()
    expression = " ".join(sys.argv[1:])
    result = calculator.evaluate(expression)
    print(result)
A small sample of the unit tests, to give you a feel for its usage:
def test_evaluate(self):
    calculator = Calculator()
    self.assertEqual(Decimal(4), calculator.evaluate("4"))
    self.assertEqual(Decimal(21), calculator.evaluate("7 * 3"))
    self.assertEqual(Decimal(11), calculator.evaluate("2 * 4 + 3"))
    self.assertEqual(Decimal(45), calculator.evaluate("(3 * (2 + 5)) + 6 * (4)"))
    self.assertEqual(Decimal("25.92"), calculator.evaluate("2.7 * (3.2 + 6.4)"))
    self.assertEqual(Decimal(1), calculator.evaluate("-2 * -4 + -7"))

def test_evaluate_operators(self):
    calculator = Calculator()
    self.assertEqual(Decimal(3), calculator.evaluate("+3"))
    self.assertEqual(Decimal(-3), calculator.evaluate("-3"))
    self.assertEqual(Decimal(6), calculator.evaluate("2 * 3"))
    self.assertEqual(Decimal(2), calculator.evaluate("6 / 3"))
    self.assertEqual(Decimal(5), calculator.evaluate("2 + 3"))
    self.assertEqual(Decimal(3), calculator.evaluate("7 - 4"))

def test_evaluate_operator_precedences(self):
    calculator = Calculator()
    self.assertEqual(Decimal(-14), calculator.evaluate("-3 * 5 + +1"))
    self.assertEqual(Decimal("6.5"), calculator.evaluate("8 / -16 - -7"))
    self.assertEqual(Decimal(30), calculator.evaluate("5 * 3 * 8 / 4 / 2 * 6 / 3"))
    self.assertEqual(Decimal(-3), calculator.evaluate("2 + 3 + 4 - 5 - 8 + 6 + 4 - 9"))
If you are interested in the other unit tests, the full project can be found in my GitHub repository.
This looks pretty nice! I have only a few minor nitpicks and practical tips for you.
Returning boolean values directly
I'm a bit surprised by this:
if type(self) != type(other):
    return False
return True
I'm wondering if you have a particular reason for not writing simply:
return type(self) == type(other)
The same goes for all the __eq__ methods, which can be simplified to a single return statement.
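For example, ValueToken.__eq__() then collapses to something like:

def __eq__(self, other):
    return type(self) == type(other) and self.value == other.value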
Compiling regexes
I'm a bit surprised by this:
def tokenize(self, expression: str) -> list:
    # ...
    value_regex = re.compile(r"\d+(\.\d+)?")
    operator_regex = re.compile(r"[^\d\.\(\)]")
    left_parentheses_regex = re.compile(r"\(")
    right_parentheses_regex = re.compile(r"\)")
Compiling the regexes every time this method gets called? Usually I put pre-compiled regexes at the top of the file as global constants. But looking at the docs, I'm wondering if compiling a few regexes is necessary at all, as it seems there is a built-in caching mechanism.
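If you do want to pre-compile, a module-level sketch could look like this (the constant names are mine, not from the original code):

import re

VALUE_REGEX = re.compile(r"\d+(\.\d+)?")
OPERATOR_REGEX = re.compile(r"[^\d\.\(\)]")
LEFT_PARENTHESES_REGEX = re.compile(r"\(")
RIGHT_PARENTHESES_REGEX = re.compile(r"\)")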
Taming complex conditions
This ain't pretty:
if isinstance(token, OperatorToken) and token.operator == '-':
    if index == 0\
            or isinstance(tokens[index - 1], LeftParenthesesToken)\
            or isinstance(tokens[index - 1], OperatorToken):
        tokens[index] = OperatorToken('u-')
Using \ to break long lines tends to be a bit ugly. (This might be subjective.)
Another alternative is to enclose the complex expression within parens:
if (index == 0
    or isinstance(tokens[index - 1], LeftParenthesesToken)
    or isinstance(tokens[index - 1], OperatorToken)):
    tokens[index] = OperatorToken('u-')
But this also isn't too good:
- PEP8 complains: E129 visually indented line with same indent as next logical line
- If I increase the indent of tokens[index] = ..., same complaint (to be honest, I don't really understand that)
- I can play with the indents further to get something that passes PEP8 but doesn't actually look reasonable
The bottom line is, the best solution for a complex boolean expression is to extract it into a helper function. The function can be defined anywhere, even right before you use it. It will have the added benefit of having a name.
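A sketch of that idea (the helper name is mine, and it assumes the token classes from calculator/tokens.py are in scope):

def is_unary_position(tokens, index):
    # A + or - is unary at the start of the expression, or right after an
    # opening parenthesis or another operator.
    return (index == 0
            or isinstance(tokens[index - 1], (LeftParenthesesToken, OperatorToken)))

The loop body then shrinks to something like if is_unary_position(tokens, index): tokens[index] = OperatorToken('u' + token.operator), which also collapses the duplicated '+' and '-' branches.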
Or maybe none of this is worth the trouble and your original is best. Just some food for thought.
The Pythonic way of checking empty things
Instead of:
while len(stack) > 0:
    pop_token = stack.pop()
The Pythonic way is to simply:
while stack:
    pop_token = stack.pop()
Bugs
Evaluating 1 2 3 results in 123, due to the indiscriminate use of stripped_expression = expression.replace(' ', '') in Calculator.tokenize().

Evaluating 12(3) results in 3. The infix expression is transformed into the RPN expression [VT(12), VT(3)]. The value at the top of the stack, 3, is returned as the result without checking whether any garbage remains in the stack. 12(3) should either be an error or be interpreted as implicit multiplication.
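To reproduce (a quick sketch, assuming the package layout from the question):

from calculator.calculator import Calculator

calculator = Calculator()
print(calculator.evaluate("1 2 3"))   # prints 123 instead of raising an error
print(calculator.evaluate("12(3)"))   # prints 3 instead of raising an error (or computing 36)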
Tokenizing
There is too much code in tokens.py. Notably,

- An empty Token base class is unusual in Python, since duck typing removes the need for elaborate class hierarchies.
- That said, there is a lot of commonality among the subclasses, so you should have a common base class that actually contains useful code.
- The abbreviated representations LPT, VT(n), etc. are a luxury — they are just a debugging aid. In any case, repr() is supposed to return a string that looks like a valid Python expression that could be used to recreate the object.
from decimal import Decimal


class Token:
    def __init__(self, text: str):
        self._text = text

    def __repr__(self):
        return "{0}('{1}')".format(type(self).__name__, self._text)

    def __hash__(self):
        return hash(self._text)

    def __eq__(self, other):
        return type(self) == type(other) and self._text == other._text

    def __ne__(self, other):
        return not self == other


class ValueToken(Token):
    @property
    def value(self):
        return Decimal(self._text)


class OperatorToken(Token):
    @property
    def operator(self):
        return self._text


class LeftParenthesesToken(Token):
    pass


class RightParenthesesToken(Token):
    pass
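With this base class, the representations read as constructor calls, for example:

>>> ValueToken('3.2')
ValueToken('3.2')
>>> ValueToken('3.2').value
Decimal('3.2')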
The Calculator.tokenize() function could also use improvement. In particular,

- expression.replace(' ', '') needs to be eliminated, as mentioned above.
- The operator_regex is just accepting any non-space character that wasn't accepted as a value or parenthesis. There is a less-redundant way to express that — just put it as the last part of a regex alternation.
- Neither of the raise RuntimeError() calls is needed. The OperatorToken() constructor should be responsible for checking whether the symbol is supported, and even if it didn't, you would get the same RuntimeError when you eventually call __has_lower_precedence() anyway. (I'll discuss the OperatorToken class further below.) As for the "does not match any regex" error, I'm not convinced that it's possible, since the operator_regex matches everything that is rejected by the other regexes.
- Calling capture_regex.findall(), then matching against the constituent regexes again is wasteful. Instead, you should make use of capturing groups (in particular, capturing groups named after their respective token types). Once you take care of that, there's no longer any point in compiling each constituent regex individually.
- The code to handle unary ± has a lot of redundancy. I'd do it altogether differently by keeping track of the previous token.
- Prefer generators to functions that return lists. It's one of the major themes of the Python 2→3 transition.
- ValueToken won't accept .5 unless it has an explicit 0. A more lenient regex can fix that.
def tokenize(self, expression: str):
    """
    Generates tokens from an expression.

    :rtype: generator of Token
    :param expression: The input expression
    """
    capture_regex = re.compile('''\s*(?:
        (?P<ValueToken>\d*\.?\d+) |
        (?P<LeftParenthesesToken>\() |
        (?P<RightParenthesesToken>\)) |
        (?P<OperatorToken>[^\s])
    )''', re.VERBOSE)
    token = None
    for match in capture_regex.finditer(expression):
        kind, value = eval(match.lastgroup), match.group(match.lastgroup)
        if kind == OperatorToken and value in ('+', '-') and \
                type(token) not in (RightParenthesesToken, ValueToken):
            # Unary + or unary -
            token = OperatorToken('u' + value)
        else:
            token = kind(value)
        yield token
Evaluating
As with tokenize(), to_rpn() should be a generator. That is, you can eliminate output_queue by changing every output_queue.append(token) to yield token.
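A sketch of the generator version, reusing your method body (only the appends change, and I've used the while stack: form from above):

def to_rpn(self, tokens):
    """Generates tokens in Reverse Polish Notation order."""
    stack = []
    for token in tokens:
        if isinstance(token, ValueToken):
            yield token
        elif isinstance(token, LeftParenthesesToken):
            stack.append(token)
        elif isinstance(token, RightParenthesesToken):
            while stack:
                pop_token = stack.pop()
                if isinstance(pop_token, LeftParenthesesToken):
                    break
                yield pop_token
            else:
                raise RuntimeError("mismatched parentheses")
        elif isinstance(token, OperatorToken):
            while stack:
                pop_token = stack.pop()
                if isinstance(pop_token, OperatorToken) and self.__has_lower_precedence(token, pop_token):
                    yield pop_token
                else:
                    stack.append(pop_token)
                    break
            stack.append(token)
        else:
            raise RuntimeError("unexpected token: " + str(token))
    while stack:
        pop_token = stack.pop()
        if isinstance(pop_token, LeftParenthesesToken):
            raise RuntimeError("mismatched parentheses")
        yield pop_token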
I feel like your OperatorToken class is underdeveloped. You can unburden the more complex Calculator class by shifting code into OperatorToken.
- The Calculator.operators dictionary should move into OperatorToken. That also allows the OperatorToken constructor to do validation. That's better than having to validate within Calculator.__has_lower_precedence().
- Calculator.__has_lower_precedence() should move over as well. (I've chosen to override < such that op1 < op2 compares operators by precedence.)
- operator[0], operator[1], and operator[2] are a cryptic way to get the precedence, associativity, and function, respectively. Use a namedtuple instead. (The multiple inheritance and the overridden __new__() method that I've used below are admittedly unorthodox.)
from collections import namedtuple
from decimal import Decimal
from enum import Enum


class OperatorToken(Token, namedtuple('Operator', ['precedence', 'associativity', 'function'])):
    class Associativity(Enum):
        left = 1
        right = 2

    __operators = {
        # reference: http://en.wikipedia.org/wiki/Operators_in_C_and_C%2B%2B#Operator_precedence
        # operator: (precedence, associativity, function)
        "u+": (-3, Associativity.right, lambda s: s.append(s.pop())),
        "u-": (-3, Associativity.right, lambda s: s.append(-s.pop())),
        "*": (-5, Associativity.left, lambda s: s.append(s.pop() * s.pop())),
        "/": (-5, Associativity.left, lambda s: s.append(1 / s.pop() * s.pop())),
        "+": (-6, Associativity.left, lambda s: s.append(s.pop() + s.pop())),
        "-": (-6, Associativity.left, lambda s: s.append(-s.pop() + s.pop())),
    }

    def __new__(cls, text):
        try:
            return tuple.__new__(cls, OperatorToken.__operators[text])
        except KeyError:
            raise RuntimeError("Unsupported operator token: " + text) from None

    def __lt__(self, other) -> bool:
        """
        Test if this operator has lower precedence than the other, accounting
        for associativity.
        """
        return self.precedence < other.precedence or (
            self.associativity == OperatorToken.Associativity.left and
            self.precedence == other.precedence)

    def __gt__(self, other) -> bool:
        """
        Test if this operator has higher precedence than the other, accounting
        for associativity.
        """
        return other < self

    @property
    def operator(self):
        return self._text
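A quick interactive check of the resulting behaviour (assuming the classes above and Decimal are imported):

>>> stack = [Decimal(2), Decimal(3)]
>>> OperatorToken('*').function(stack)
>>> stack
[Decimal('6')]
>>> OperatorToken('+') < OperatorToken('*')
True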
Calculator.evaluate() could use one simplification and some minor changes:
- Using reflection to determine how many operands to pop seems complicated. RPN calculators are a lot simpler when the operators manipulate the stack directly (examples 1, 2).
- As mentioned above, you should check that there is no garbage remaining in the stack when finished.
- The error message "not enough tokens..." is using programmer jargon. End users think of operands, not tokens.
def evaluate(self, expression: str) -> Decimal:
    """
    Evaluates an expression and returns its result.

    :param expression: The input expression
    :return: The output of evaluating the expression
    """
    stack = []
    for token in self.to_rpn(self.tokenize(expression)):
        if isinstance(token, ValueToken):
            stack.append(token.value)
        else:
            assert isinstance(token, OperatorToken)
            try:
                token.function(stack)
            except IndexError:
                raise RuntimeError("not enough operands for: " + str(token)) from None
    if len(stack) != 1:
        raise RuntimeError("invalid expression: " + expression)
    return stack.pop()
With all of the changes above, I see a reduction of lines of code in calculator.py by about 40%, while tokens.py remains nearly the same.