I created the following program which takes a molecular formula as an input, eg CH3COOH
and returns the molar mass of the compound:
#! /usr/bin/env python3
element_weights = {
'H': 1.00794,
'He': 4.002602,
'C': 12.011,
'O': 15.999,
'Ts': 294.0,
'Og': 294.0,
}
def tokenize(string):
position = 0
tokens = []
striter = iter(string)
character = next(striter)
while True:
try:
token = character
if character in "()":
character = next(striter)
elif character.isnumeric():
while (character := next(striter)).isnumeric():
token += character
elif character.isupper():
while (character := next(striter)).islower():
token += character
else:
raise ValueError("Can't parse")
tokens.append(token)
except StopIteration:
tokens.append(token)
tokens.append("EOF")
return tokens
def get_composition(tokens_list):
composition = {}
tokens = iter(tokens_list)
token = next(tokens)
while True:
if(token == "EOF"):
break
num_parens = 0
if token == "(":
num_parens = 1
substr_tokens = []
while num_parens > 0:
token = next(tokens)
if token == "EOF":
raise ValueError(f"Unbalanced Parens, tokens: {tokens_list}")
elif token == "(":
num_parens += 1
elif token == ")":
num_parens -= 1
if (num_parens > 0):
substr_tokens.append(token)
substr_tokens.append("EOF")
substr_composition = get_composition(substr_tokens)
if (token := next(tokens)).isnumeric():
substr_composition = {k: int(token) * v for k,v in substr_composition.items()}
for k,v in substr_composition.items():
if k in composition:
composition[k] += v
else:
composition[k] = v
break
if token == ")":
raise ValueError(f"Unbalanced Parens, tokens: {tokens_list}")
if token not in element_weights:
raise ValueError(f"Can't find element {token}, tokens {tokens_list}")
element = token
if (token := next(tokens)).isnumeric():
element_count = int(token)
token = next(tokens)
else:
element_count = 1
if element in composition:
composition[element] += element_count
else:
composition[element] = element_count
return composition
def convertToAMU(element_count):
return sum(element_weights[k] * v for k,v in element_count.items())
if __name__ == "__main__":
import sys
if(len(sys.argv) > 1):
print(convertToAMU(get_composition(tokenize(sys.argv[1]))))
else:
print(f"Usage: {sys.argv[0]} [chemical_formula]")
1 Answer 1
For such a simple parsing job, I'd use a regular expression and re.findall()` to split the string into a list of tokens. It's just one line instead of over 20 lines. The tokens are an element (an upper case letter possibly with a lower case letter), a repeat count (1 or more digits), or a bracket. The "{}[]" matches any of the brackets.
tokens = re.findall(r"[A-Z][a-z]|[0-9]+|[](){}[]", string)
collections.Counter()
is useful for counting the elements. Lists used as simple stacks help keep track of bracketed sub-compounds.
composition = Counter()
match = []
stack = []
Using a list of tokens may be less optimal for large parsing tasks, but is sufficient for this command line calculator. Plus, it lets you "peek" at the next token to see if its a number, greatly simplifying the handling of the numbers in a formula. (I used an if-expression in the final code)
if tokens[0].isalpha():
element = tokens.pop(0)
if tokens[0].isdigit(): # <-- peek at next token to see if
repeat = int(tokens.pop(0)) # there is a count
else:
repeat = 1
compound[element] += repeat
When an opening bracket is encountered, save the current state (the Counter) on a stack and start a new state. Save the matching bracket on a stack too.
elif tokens[0] in ('(', '[', '{'):
match.append(MATCH[tokens.pop(0)])
stack.append(composition)
composition = Counter()
When a closing bracket is encountered, see if it matches the one on top of the bracket stack. Pop the saved state from the stack and combine it from the one for the bracketed sub-compound.
elif tokens[0] == match[-1]:
tokens.pop(0)
match.pop()
repeat = int(tokens.pop(0)) if tokens and tokens[0].isdigit() else 1
for element in composition.keys():
composition[element] *= repeat
composition.update(stack.pop())
Any other token is an error.
else:
if token in (')', ']', '}'):
raise ValueError(f"Error, mismatched bracket: "
f"expected {match[-1]} got {tokens[0]}.")
else:
raise ValueError(f"Error, unrecognized token "
f"in formula: '{tokens[0]}'.")
If there are any brackets left on the stack at the end, then there are mismatched brackets.
if match:
brackets = ', '.join(f"'{b}'" for b in match[::-1])
raise ValueError(f"Error, missing bracket(s): {brackets}.")
The full parser:
import re
from collections import Counter
MATCH = {'(':')', '[':']', '{':'}'}
def parse(molecule):
tokens = re.findall(r"[A-Z][a-z]?|[0-9]+|[]{}()[]", molecule)
composition = Counter()
match = []
stack = []
while tokens:
# element with optional count
if tokens[0].isalpha():
element = tokens.pop(0)
count = int(tokens.pop(0)) if tokens and tokens[0].isdigit() else 1
composition[element] += count
# start a sub-compound
elif tokens[0] in ('(', '[', '{'):
match.append(MATCH[tokens.pop(0)])
stack.append(composition)
composition = Counter()
# matching close bracket ends a sub-compound with an optional count
elif tokens[0] == match[-1]:
tokens.pop(0)
match.pop()
repeat = int(tokens.pop(0)) if tokens and tokens[0].isdigit() else 1
for element in composition.keys():
composition[element] *= repeat
composition.update(stack.pop())
# "syntax" error in the formula
else:
if token[0] in (')', ']', '}'):
raise ValueError(f"Error, mismatched bracket: "
f"expected '{match[-1]}' got '{token[0]}'.")
else:
raise ValueError(f"Error, unrecognized token in "
f"formula: '{token[0]}'.")
# left over, unmatched brackets
if match:
brackets = ', '.join(f"'{b}'" for b in match[::-1])
raise ValueError(f"Error, missing bracket(s): {brackets}.")
return dict(composition)
CH3COOH
doesn't work becauseC
is not included in the weights. \$\endgroup\$