Let me start off by saying that I have several years of experience in Java, but now I need to learn Python, so I decided to make a calculator, which also happens to be a community challenge.
Please review the code looking for beginner mistakes; I do intend the code to look professional. This is my first real program written in Python, and two days ago I knew nothing about the language.
My calculator currently has the following abilities:
- It evaluates expressions, so you cannot interact with it.
- The supported operators are +, -, * and /.
- Functions are not supported.
- It uses the Shunting-yard algorithm.
- It uses Reverse Polish Notation.
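For example, the infix expression 2 * 4 + 3 is first converted to the RPN sequence 2 4 * 3 + and then evaluated with a stack.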
calculator/tokens.py
from decimal import Decimal

__author__ = 'Frank van Heeswijk'


class Token:
    pass


class ValueToken(Token):
    def __init__(self, value: Decimal):
        self.value = value

    def __repr__(self):
        return "VT(" + str(self.value) + ")"

    def __hash__(self):
        return hash(self.value)

    def __eq__(self, other):
        if type(self) != type(other):
            return False
        return self.value == other.value

    def __ne__(self, other):
        return not self == other


class OperatorToken(Token):
    def __init__(self, operator: str):
        self.operator = operator

    def __repr__(self):
        return "OT(" + self.operator + ")"

    def __hash__(self):
        return hash(str)

    def __eq__(self, other):
        if type(self) != type(other):
            return False
        return self.operator == other.operator

    def __ne__(self, other):
        return not self == other


class LeftParenthesesToken(Token):
    def __repr__(self):
        return "LPT"

    def __hash__(self):
        return 0

    def __eq__(self, other):
        if type(self) != type(other):
            return False
        return True

    def __ne__(self, other):
        return not self == other


class RightParenthesesToken(Token):
    def __repr__(self):
        return "RPT"

    def __hash__(self):
        return 0

    def __eq__(self, other):
        if type(self) != type(other):
            return False
        return True

    def __ne__(self, other):
        return not self == other
calculator/calculator.py
from decimal import Decimal
from enum import Enum
import inspect
import re

from calculator.tokens import OperatorToken, ValueToken, LeftParenthesesToken, RightParenthesesToken

__author__ = 'Frank van Heeswijk'


class Associativity(Enum):
    left = 1
    right = 2


class Calculator:
    __operators = {
        # reference: http://en.wikipedia.org/wiki/Operators_in_C_and_C%2B%2B#Operator_precedence
        # operator: (precedence, associativity, function)
        "u+": (-3, Associativity.right, lambda op: op),
        "u-": (-3, Associativity.right, lambda op: -op),
        "*": (-5, Associativity.left, lambda op1, op2: op1 * op2),
        "/": (-5, Associativity.left, lambda op1, op2: op1 / op2),
        "+": (-6, Associativity.left, lambda op1, op2: op1 + op2),
        "-": (-6, Associativity.left, lambda op1, op2: op1 - op2)
    }

    def __init__(self):
        self.operators = Calculator.__operators

    def evaluate(self, expression: str) -> Decimal:
        """
        Evaluates an expression and returns its result.

        :param expression: The input expression
        :return: The output of evaluating the expression
        """
        tokens = self.to_rpn(self.tokenize(expression))
        stack = []
        for token in tokens:
            if isinstance(token, ValueToken):
                stack.append(token.value)
            elif isinstance(token, OperatorToken):
                function = self.operators[token.operator][2]
                argspec = inspect.getargspec(function)
                argument_count = len(argspec.args)
                if len(stack) < argument_count:
                    raise RuntimeError("not enough tokens for: " + str(token) + ", expected: " + str(argument_count) + ", actual: " + str(len(tokens)))
                values = [stack.pop() for x in range(argument_count)]
                values.reverse()
                result = function(*values)
                stack.append(result)
            else:
                raise RuntimeError("unexpected token: " + token)
        return stack.pop()

    def tokenize(self, expression: str) -> list:
        """
        Tokenizes an expression and produces an output list of tokens.

        :rtype: list of [Token]
        :param expression: The input expression
        """
        tokens = []
        stripped_expression = expression.replace(' ', '')

        value_regex = re.compile(r"\d+(\.\d+)?")
        operator_regex = re.compile(r"[^\d\.\(\)]")
        left_parentheses_regex = re.compile(r"\(")
        right_parentheses_regex = re.compile(r"\)")
        regexps = [value_regex, operator_regex, left_parentheses_regex, right_parentheses_regex]
        raw_patterns = "|".join(map(lambda regex: regex.pattern, regexps))
        capture_regex = re.compile("(" + raw_patterns + ")")

        for raw_token, something_else in capture_regex.findall(stripped_expression):
            if value_regex.match(raw_token):
                tokens.append(ValueToken(Decimal(raw_token)))
            elif operator_regex.match(raw_token):
                if raw_token not in self.__operators:
                    raise RuntimeError("unsupported operator: " + raw_token)
                tokens.append(OperatorToken(raw_token))
            elif left_parentheses_regex.match(raw_token):
                tokens.append(LeftParenthesesToken())
            elif right_parentheses_regex.match(raw_token):
                tokens.append(RightParenthesesToken())
            else:
                raise RuntimeError("token " + raw_token + " does not match any regex")

        # resolve unary plus and minus operators
        for index, token in enumerate(tokens):
            if isinstance(token, OperatorToken) and token.operator == '-':
                if index == 0\
                        or isinstance(tokens[index - 1], LeftParenthesesToken)\
                        or isinstance(tokens[index - 1], OperatorToken):
                    tokens[index] = OperatorToken('u-')
            elif isinstance(token, OperatorToken) and token.operator == '+':
                if index == 0\
                        or isinstance(tokens[index - 1], LeftParenthesesToken)\
                        or isinstance(tokens[index - 1], OperatorToken):
                    tokens[index] = OperatorToken('u+')

        return tokens

    def to_rpn(self, tokens: list) -> list:
        """
        Converts a list of tokens to an output list in Reverse Polish Notation form.

        :rtype: list of [Token]
        :type tokens: list of [Token]
        :param tokens: The input tokens
        :raise RuntimeError: If the parentheses are mismatched
        """
        output_queue = []
        stack = []
        for token in tokens:
            if isinstance(token, ValueToken):
                output_queue.append(token)
            elif isinstance(token, LeftParenthesesToken):
                stack.append(token)
            elif isinstance(token, RightParenthesesToken):
                while len(stack) > 0:
                    pop_token = stack.pop()
                    if isinstance(pop_token, LeftParenthesesToken):
                        break
                    output_queue.append(pop_token)
                    # todo implement function support
                else:
                    raise RuntimeError("mismatched parentheses")
            elif isinstance(token, OperatorToken):
                while len(stack) > 0:
                    pop_token = stack.pop()
                    if isinstance(pop_token, OperatorToken) and self.__has_lower_precedence(token, pop_token):
                        output_queue.append(pop_token)
                    else:
                        stack.append(pop_token)
                        break
                stack.append(token)
            else:
                raise RuntimeError("unexpected token: " + token)

        while len(stack) > 0:
            pop_token = stack.pop()
            if isinstance(pop_token, LeftParenthesesToken):
                raise RuntimeError("mismatched parentheses")
            output_queue.append(pop_token)

        return output_queue

    def __has_lower_precedence(self, operatortoken1: OperatorToken, operatortoken2: OperatorToken) -> bool:
        operator1 = operatortoken1.operator
        operator2 = operatortoken2.operator
        if operator1 not in self.operators:
            raise RuntimeError("Unsupported operator token: " + operator1)
        if operator2 not in self.operators:
            raise RuntimeError("Unsupported operator token: " + operator2)
        operator1_tuple = self.operators[operator1]
        operator2_tuple = self.operators[operator2]
        return (operator1_tuple[1] == Associativity.left and operator1_tuple[0] <= operator2_tuple[0]) \
            or (operator1_tuple[1] == Associativity.right and operator1_tuple[0] < operator2_tuple[0])
calculator_application.py
import sys

from calculator.calculator import Calculator

__author__ = 'Frank van Heeswijk'

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: calculator_application \"<expression>\"")
        sys.exit(1)
    calculator = Calculator()
    expression = " ".join(sys.argv[1:])
    result = calculator.evaluate(expression)
    print(result)
A small sample of the unit tests, to give you a feel for its usage:
def test_evaluate(self):
    calculator = Calculator()
    self.assertEqual(Decimal(4), calculator.evaluate("4"))
    self.assertEqual(Decimal(21), calculator.evaluate("7 * 3"))
    self.assertEqual(Decimal(11), calculator.evaluate("2 * 4 + 3"))
    self.assertEqual(Decimal(45), calculator.evaluate("(3 * (2 + 5)) + 6 * (4)"))
    self.assertEqual(Decimal("25.92"), calculator.evaluate("2.7 * (3.2 + 6.4)"))
    self.assertEqual(Decimal(1), calculator.evaluate("-2 * -4 + -7"))

def test_evaluate_operators(self):
    calculator = Calculator()
    self.assertEqual(Decimal(3), calculator.evaluate("+3"))
    self.assertEqual(Decimal(-3), calculator.evaluate("-3"))
    self.assertEqual(Decimal(6), calculator.evaluate("2 * 3"))
    self.assertEqual(Decimal(2), calculator.evaluate("6 / 3"))
    self.assertEqual(Decimal(5), calculator.evaluate("2 + 3"))
    self.assertEqual(Decimal(3), calculator.evaluate("7 - 4"))

def test_evaluate_operator_precedences(self):
    calculator = Calculator()
    self.assertEqual(Decimal(-14), calculator.evaluate("-3 * 5 + +1"))
    self.assertEqual(Decimal("6.5"), calculator.evaluate("8 / -16 - -7"))
    self.assertEqual(Decimal(30), calculator.evaluate("5 * 3 * 8 / 4 / 2 * 6 / 3"))
    self.assertEqual(Decimal(-3), calculator.evaluate("2 + 3 + 4 - 5 - 8 + 6 + 4 - 9"))
If you are interested in the other unit tests, the full project can be found in my GitHub repository.
This looks pretty nice! I have only a few minor nitpicks and practical tips for you.
Returning boolean values directly
I'm a bit surprised by this:
if type(self) != type(other):
    return False
return True
I'm wondering if you have a particular reason for not writing simply:
return type(self) == type(other)
The same goes for all the __eq__ methods, which can be simplified to a single return statement.
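For example, ValueToken.__eq__() then collapses to something like:

def __eq__(self, other):
    return type(self) == type(other) and self.value == other.value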
Compiling regexes
I'm a bit surprised by this:
def tokenize(self, expression: str) -> list:
    # ...
    value_regex = re.compile(r"\d+(\.\d+)?")
    operator_regex = re.compile(r"[^\d\.\(\)]")
    left_parentheses_regex = re.compile(r"\(")
    right_parentheses_regex = re.compile(r"\)")
Compiling the regexes every time this method gets called? Usually I put pre-compiled regexes at the top of the file as global constants. But looking at the docs, I'm wondering if compiling a few regexes is necessary at all, as it seems there is a built-in caching mechanism.
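If you do want to pre-compile, a module-level sketch could look like this (the constant names are mine, not from the original code):

import re

VALUE_REGEX = re.compile(r"\d+(\.\d+)?")
OPERATOR_REGEX = re.compile(r"[^\d\.\(\)]")
LEFT_PARENTHESES_REGEX = re.compile(r"\(")
RIGHT_PARENTHESES_REGEX = re.compile(r"\)")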
Taming complex conditions
This ain't pretty:
if isinstance(token, OperatorToken) and token.operator == '-':
    if index == 0\
            or isinstance(tokens[index - 1], LeftParenthesesToken)\
            or isinstance(tokens[index - 1], OperatorToken):
        tokens[index] = OperatorToken('u-')
Using \ to break long lines tends to be a bit ugly. (This might be subjective.)
Another alternative is to enclose the complex expression within parens:
if (index == 0
    or isinstance(tokens[index - 1], LeftParenthesesToken)
    or isinstance(tokens[index - 1], OperatorToken)):
    tokens[index] = OperatorToken('u-')
But this also isn't too good:
- PEP8 complains: E129 visually indented line with same indent as next logical line
- If I increase the indent of tokens[index] = ..., same complaint (to be honest, I don't really understand that)
- I can play with the indents further to get something that passes PEP8 but doesn't actually look reasonable
The bottom line is, the best solution for a complex boolean expression is to extract it into a helper function. The function can be defined anywhere, even right before you use it. It will have the added benefit of having a name.
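A sketch of that idea (the helper name is mine, and it assumes the token classes from calculator/tokens.py are in scope):

def is_unary_position(tokens, index):
    # A + or - is unary at the start of the expression, or right after an
    # opening parenthesis or another operator.
    return (index == 0
            or isinstance(tokens[index - 1], (LeftParenthesesToken, OperatorToken)))

The loop body then shrinks to something like if is_unary_position(tokens, index): tokens[index] = OperatorToken('u' + token.operator), which also collapses the duplicated '+' and '-' branches.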
Or maybe none of this is worth the trouble and your original is best. Just some food for thought.
The Pythonic way of checking empty things
Instead of:
while len(stack) > 0:
    pop_token = stack.pop()
The Pythonic way is to simply:
while stack:
    pop_token = stack.pop()
Bugs
Evaluating 1 2 3 results in 123, due to the indiscriminate use of stripped_expression = expression.replace(' ', '') in Calculator.tokenize().

Evaluating 12(3) results in 3. The infix expression is transformed into the RPN expression [VT(12), VT(3)]. The value at the top of the stack, 3, is returned as the result without checking whether any garbage remains in the stack. 12(3) should either be an error or be interpreted as implicit multiplication.
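To reproduce (a quick sketch, assuming the package layout from the question):

from calculator.calculator import Calculator

calculator = Calculator()
print(calculator.evaluate("1 2 3"))   # prints 123 instead of raising an error
print(calculator.evaluate("12(3)"))   # prints 3 instead of raising an error (or computing 36)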
Tokenizing
There is too much code in tokens.py. Notably,

- An empty Token base class is unusual in Python, since duck typing removes the need for elaborate class hierarchies.
- That said, there is a lot of commonality among the subclasses, so you should have a common base class that actually contains useful code.
- The abbreviated representations LPT, VT(n), etc. are a luxury — they are just a debugging aid. In any case, repr() is supposed to return a string that looks like a valid Python expression that could be used to recreate the object.
from decimal import Decimal


class Token:
    def __init__(self, text: str):
        self._text = text

    def __repr__(self):
        return "{0}('{1}')".format(type(self).__name__, self._text)

    def __hash__(self):
        return hash(self._text)

    def __eq__(self, other):
        return type(self) == type(other) and self._text == other._text

    def __ne__(self, other):
        return not self == other


class ValueToken(Token):
    @property
    def value(self):
        return Decimal(self._text)


class OperatorToken(Token):
    @property
    def operator(self):
        return self._text


class LeftParenthesesToken(Token):
    pass


class RightParenthesesToken(Token):
    pass
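With this base class, the representations read as constructor calls, for example:

>>> ValueToken('3.2')
ValueToken('3.2')
>>> ValueToken('3.2').value
Decimal('3.2')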
The Calculator.tokenize() function could also use improvement. In particular,

- expression.replace(' ', '') needs to be eliminated, as mentioned above.
- The operator_regex is just accepting any non-space character that wasn't accepted as a value or parenthesis. There is a less-redundant way to express that — just put it as the last part of a regex alternation.
- Neither of the raise RuntimeError() calls is needed. The OperatorToken() constructor should be responsible for checking whether the symbol is supported, and even if it didn't, you would get the same RuntimeError when you eventually call __has_lower_precedence() anyway. (I'll discuss the OperatorToken class further below.) As for the "does not match any regex" error, I'm not convinced that it's possible, since the operator_regex matches everything that is rejected by the other regexes.
- Calling capture_regex.findall(), then matching against the constituent regexes again is wasteful. Instead, you should make use of capturing groups (in particular, capturing groups named after their respective token types). Once you take care of that, there's no longer any point in compiling each constituent regex individually.
- The code to handle unary ± has a lot of redundancy. I'd do it altogether differently by keeping track of the previous token.
- Prefer generators to functions that return lists. It's one of the major themes of the Python 2→3 transition.
- ValueToken won't accept .5 unless it has an explicit 0. A more lenient regex can fix that.
def tokenize(self, expression: str):
    """
    Generates tokens from an expression.

    :rtype: generator of Token
    :param expression: The input expression
    """
    capture_regex = re.compile('''\s*(?:
        (?P<ValueToken>\d*\.?\d+) |
        (?P<LeftParenthesesToken>\() |
        (?P<RightParenthesesToken>\)) |
        (?P<OperatorToken>[^\s])
    )''', re.VERBOSE)
    token = None
    for match in capture_regex.finditer(expression):
        kind, value = eval(match.lastgroup), match.group(match.lastgroup)
        if kind == OperatorToken and value in ('+', '-') and \
                type(token) not in (RightParenthesesToken, ValueToken):
            # Unary + or unary -
            token = OperatorToken('u' + value)
        else:
            token = kind(value)
        yield token
Evaluating
As with tokenize(), to_rpn() should be a generator. That is, you can eliminate output_queue by changing every output_queue.append(token) to yield token.
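A sketch of the generator version, reusing your method body (only the appends change, and I've used the while stack: form from above):

def to_rpn(self, tokens):
    """Generates tokens in Reverse Polish Notation order."""
    stack = []
    for token in tokens:
        if isinstance(token, ValueToken):
            yield token
        elif isinstance(token, LeftParenthesesToken):
            stack.append(token)
        elif isinstance(token, RightParenthesesToken):
            while stack:
                pop_token = stack.pop()
                if isinstance(pop_token, LeftParenthesesToken):
                    break
                yield pop_token
            else:
                raise RuntimeError("mismatched parentheses")
        elif isinstance(token, OperatorToken):
            while stack:
                pop_token = stack.pop()
                if isinstance(pop_token, OperatorToken) and self.__has_lower_precedence(token, pop_token):
                    yield pop_token
                else:
                    stack.append(pop_token)
                    break
            stack.append(token)
        else:
            raise RuntimeError("unexpected token: " + str(token))
    while stack:
        pop_token = stack.pop()
        if isinstance(pop_token, LeftParenthesesToken):
            raise RuntimeError("mismatched parentheses")
        yield pop_token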
I feel like your OperatorToken class is underdeveloped. You can unburden the more complex Calculator class by shifting code into OperatorToken.
- The Calculator.operators dictionary should move into OperatorToken. That also allows the OperatorToken constructor to do validation. That's better than having to validate within Calculator.__has_lower_precedence().
- Calculator.__has_lower_precedence() should move over as well. (I've chosen to override < such that op1 < op2 compares operators by precedence.)
- operator[0], operator[1], and operator[2] are a cryptic way to get the precedence, associativity, and function, respectively. Use a namedtuple instead. (The multiple inheritance and the overridden __new__() method that I've used below are admittedly unorthodox.)
from collections import namedtuple
from decimal import Decimal
from enum import Enum


class OperatorToken(Token, namedtuple('Operator', ['precedence', 'associativity', 'function'])):
    class Associativity(Enum):
        left = 1
        right = 2

    __operators = {
        # reference: http://en.wikipedia.org/wiki/Operators_in_C_and_C%2B%2B#Operator_precedence
        # operator: (precedence, associativity, function)
        "u+": (-3, Associativity.right, lambda s: s.append(s.pop())),
        "u-": (-3, Associativity.right, lambda s: s.append(-s.pop())),
        "*": (-5, Associativity.left, lambda s: s.append(s.pop() * s.pop())),
        "/": (-5, Associativity.left, lambda s: s.append(1 / s.pop() * s.pop())),
        "+": (-6, Associativity.left, lambda s: s.append(s.pop() + s.pop())),
        "-": (-6, Associativity.left, lambda s: s.append(-s.pop() + s.pop())),
    }

    def __new__(cls, text):
        try:
            return tuple.__new__(cls, OperatorToken.__operators[text])
        except KeyError:
            raise RuntimeError("Unsupported operator token: " + text) from None

    def __lt__(self, other) -> bool:
        """
        Test if this operator has lower precedence than the other, accounting
        for associativity.
        """
        return self.precedence < other.precedence or (
            self.associativity == OperatorToken.Associativity.left and
            self.precedence == other.precedence)

    def __gt__(self, other) -> bool:
        """
        Test if this operator has higher precedence than the other, accounting
        for associativity.
        """
        return other < self

    @property
    def operator(self):
        return self._text
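A quick interactive check of the resulting behaviour (assuming the classes above and Decimal are imported):

>>> stack = [Decimal(2), Decimal(3)]
>>> OperatorToken('*').function(stack)
>>> stack
[Decimal('6')]
>>> OperatorToken('+') < OperatorToken('*')
True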
Calculator.evaluate() could use one simplification and some minor changes:
- Using reflection to determine how many operands to pop seems complicated. RPN calculators are a lot simpler when the operators manipulate the stack directly (examples 1, 2).
- As mentioned above, you should check that there is no garbage remaining in the stack when finished.
- The error message "not enough tokens..." is using programmer jargon. End users think of operands, not tokens.
def evaluate(self, expression: str) -> Decimal:
    """
    Evaluates an expression and returns its result.

    :param expression: The input expression
    :return: The output of evaluating the expression
    """
    stack = []
    for token in self.to_rpn(self.tokenize(expression)):
        if isinstance(token, ValueToken):
            stack.append(token.value)
        else:
            assert isinstance(token, OperatorToken)
            try:
                token.function(stack)
            except IndexError:
                raise RuntimeError("not enough operands for: " + str(token)) from None
    if len(stack) != 1:
        raise RuntimeError("invalid expression: " + expression)
    return stack.pop()
With all of the changes above, I see a reduction of lines of code in calculator.py by about 40%, while tokens.py remains nearly the same.