Python molar mass calculator

Question 1

I created the following program which takes a molecular formula as an input, eg CH3COOH and returns the molar mass of the compound:

#! /usr/bin/env python3
element_weights = {
 'H': 1.00794,
 'He': 4.002602,
 'C': 12.011,
 'O': 15.999,
 'Ts': 294.0,
 'Og': 294.0,
}
def tokenize(string):
 position = 0
 tokens = []
 striter = iter(string)
 character = next(striter)
 while True:
 try:
 token = character
 if character in "()":
 character = next(striter)
 elif character.isnumeric():
 while (character := next(striter)).isnumeric():
 token += character
 elif character.isupper():
 while (character := next(striter)).islower():
 token += character
 else:
 raise ValueError("Can't parse")
 tokens.append(token)
 except StopIteration:
 tokens.append(token)
 tokens.append("EOF")
 return tokens
def get_composition(tokens_list):
 composition = {}
 tokens = iter(tokens_list)
 token = next(tokens)
 while True:
 if(token == "EOF"):
 break
 num_parens = 0
 if token == "(":
 num_parens = 1
 substr_tokens = []
 while num_parens > 0:
 token = next(tokens)
 if token == "EOF":
 raise ValueError(f"Unbalanced Parens, tokens: {tokens_list}")
 elif token == "(":
 num_parens += 1
 elif token == ")":
 num_parens -= 1
 if (num_parens > 0):
 substr_tokens.append(token)
 substr_tokens.append("EOF")
 substr_composition = get_composition(substr_tokens)
 if (token := next(tokens)).isnumeric():
 substr_composition = {k: int(token) * v for k,v in substr_composition.items()}
 for k,v in substr_composition.items():
 if k in composition:
 composition[k] += v
 else:
 composition[k] = v
 break
 if token == ")":
 raise ValueError(f"Unbalanced Parens, tokens: {tokens_list}")
 if token not in element_weights:
 raise ValueError(f"Can't find element {token}, tokens {tokens_list}")
 element = token
 if (token := next(tokens)).isnumeric():
 element_count = int(token)
 token = next(tokens)
 else:
 element_count = 1
 if element in composition:
 composition[element] += element_count
 else:
 composition[element] = element_count
 return composition
 
def convertToAMU(element_count):
 return sum(element_weights[k] * v for k,v in element_count.items())
if __name__ == "__main__":
 import sys
 if(len(sys.argv) > 1):
 print(convertToAMU(get_composition(tokenize(sys.argv[1]))))
 else:
 print(f"Usage: {sys.argv[0]} [chemical_formula]")

Question 2

the most inefficient code looks to be how you're tokenizing the molecule. take a look at, for eg. codereview.stackexchange.com/q/232630/12240

Question 3

Can you provide sample input that will work? Currently your CH3COOH doesn't work because C is not included in the weights.

Question 4

Can you explain your code a little more in the body?

Question 5

For such a simple parsing job, I'd use a regular expression and re.findall()` to split the string into a list of tokens. It's just one line instead of over 20 lines. The tokens are an element (an upper case letter possibly with a lower case letter), a repeat count (1 or more digits), or a bracket. The "{}[]" matches any of the brackets.

 tokens = re.findall(r"[A-Z][a-z]|[0-9]+|[](){}[]", string)

collections.Counter() is useful for counting the elements. Lists used as simple stacks help keep track of bracketed sub-compounds.

 composition = Counter()
 match = []
 stack = []

Using a list of tokens may be less optimal for large parsing tasks, but is sufficient for this command line calculator. Plus, it lets you "peek" at the next token to see if its a number, greatly simplifying the handling of the numbers in a formula. (I used an if-expression in the final code)

 if tokens[0].isalpha():
 element = tokens.pop(0)
 if tokens[0].isdigit(): # <-- peek at next token to see if
 repeat = int(tokens.pop(0)) # there is a count
 else:
 repeat = 1
 compound[element] += repeat

When an opening bracket is encountered, save the current state (the Counter) on a stack and start a new state. Save the matching bracket on a stack too.

 elif tokens[0] in ('(', '[', '{'):
 match.append(MATCH[tokens.pop(0)])
 stack.append(composition)
 composition = Counter()

When a closing bracket is encountered, see if it matches the one on top of the bracket stack. Pop the saved state from the stack and combine it from the one for the bracketed sub-compound.

 elif tokens[0] == match[-1]:
 tokens.pop(0)
 match.pop()
 
 repeat = int(tokens.pop(0)) if tokens and tokens[0].isdigit() else 1
 for element in composition.keys():
 composition[element] *= repeat
 
 composition.update(stack.pop())

Any other token is an error.

 else:
 if token in (')', ']', '}'):
 raise ValueError(f"Error, mismatched bracket: "
 f"expected {match[-1]} got {tokens[0]}.")
 
 else:
 raise ValueError(f"Error, unrecognized token "
 f"in formula: '{tokens[0]}'.")

If there are any brackets left on the stack at the end, then there are mismatched brackets.

 if match:
 brackets = ', '.join(f"'{b}'" for b in match[::-1])
 raise ValueError(f"Error, missing bracket(s): {brackets}.")

The full parser:

import re
from collections import Counter
MATCH = {'(':')', '[':']', '{':'}'}
def parse(molecule):
 tokens = re.findall(r"[A-Z][a-z]?|[0-9]+|[]{}()[]", molecule)
 
 composition = Counter()
 match = []
 stack = []
 
 while tokens:
 # element with optional count
 if tokens[0].isalpha():
 element = tokens.pop(0)
 count = int(tokens.pop(0)) if tokens and tokens[0].isdigit() else 1
 composition[element] += count
 # start a sub-compound
 elif tokens[0] in ('(', '[', '{'):
 match.append(MATCH[tokens.pop(0)])
 stack.append(composition)
 composition = Counter()
 
 # matching close bracket ends a sub-compound with an optional count
 elif tokens[0] == match[-1]:
 tokens.pop(0)
 match.pop()
 
 repeat = int(tokens.pop(0)) if tokens and tokens[0].isdigit() else 1
 for element in composition.keys():
 composition[element] *= repeat
 
 composition.update(stack.pop())
 # "syntax" error in the formula
 else:
 if token[0] in (')', ']', '}'):
 raise ValueError(f"Error, mismatched bracket: "
 f"expected '{match[-1]}' got '{token[0]}'.")
 
 else:
 raise ValueError(f"Error, unrecognized token in "
 f"formula: '{token[0]}'.")
 # left over, unmatched brackets
 if match:
 brackets = ', '.join(f"'{b}'" for b in match[::-1])
 raise ValueError(f"Error, missing bracket(s): {brackets}.")
 
 return dict(composition)

RootTwo RootTwo 10.6k1 gold badge14 silver badges30 bronze badges · Answer 1 · 2020-10-14 07:36:58Z

For such a simple parsing job, I'd use a regular expression and re.findall()` to split the string into a list of tokens. It's just one line instead of over 20 lines. The tokens are an element (an upper case letter possibly with a lower case letter), a repeat count (1 or more digits), or a bracket. The "{}[]" matches any of the brackets.

 tokens = re.findall(r"[A-Z][a-z]|[0-9]+|[](){}[]", string)

collections.Counter() is useful for counting the elements. Lists used as simple stacks help keep track of bracketed sub-compounds.

 composition = Counter()
 match = []
 stack = []

Using a list of tokens may be less optimal for large parsing tasks, but is sufficient for this command line calculator. Plus, it lets you "peek" at the next token to see if its a number, greatly simplifying the handling of the numbers in a formula. (I used an if-expression in the final code)

 if tokens[0].isalpha():
 element = tokens.pop(0)
 if tokens[0].isdigit(): # <-- peek at next token to see if
 repeat = int(tokens.pop(0)) # there is a count
 else:
 repeat = 1
 compound[element] += repeat

When an opening bracket is encountered, save the current state (the Counter) on a stack and start a new state. Save the matching bracket on a stack too.

 elif tokens[0] in ('(', '[', '{'):
 match.append(MATCH[tokens.pop(0)])
 stack.append(composition)
 composition = Counter()

When a closing bracket is encountered, see if it matches the one on top of the bracket stack. Pop the saved state from the stack and combine it from the one for the bracketed sub-compound.

 elif tokens[0] == match[-1]:
 tokens.pop(0)
 match.pop()
 
 repeat = int(tokens.pop(0)) if tokens and tokens[0].isdigit() else 1
 for element in composition.keys():
 composition[element] *= repeat
 
 composition.update(stack.pop())

Any other token is an error.

 else:
 if token in (')', ']', '}'):
 raise ValueError(f"Error, mismatched bracket: "
 f"expected {match[-1]} got {tokens[0]}.")
 
 else:
 raise ValueError(f"Error, unrecognized token "
 f"in formula: '{tokens[0]}'.")

If there are any brackets left on the stack at the end, then there are mismatched brackets.

 if match:
 brackets = ', '.join(f"'{b}'" for b in match[::-1])
 raise ValueError(f"Error, missing bracket(s): {brackets}.")

The full parser:

import re
from collections import Counter
MATCH = {'(':')', '[':']', '{':'}'}
def parse(molecule):
 tokens = re.findall(r"[A-Z][a-z]?|[0-9]+|[]{}()[]", molecule)
 
 composition = Counter()
 match = []
 stack = []
 
 while tokens:
 # element with optional count
 if tokens[0].isalpha():
 element = tokens.pop(0)
 count = int(tokens.pop(0)) if tokens and tokens[0].isdigit() else 1
 composition[element] += count
 # start a sub-compound
 elif tokens[0] in ('(', '[', '{'):
 match.append(MATCH[tokens.pop(0)])
 stack.append(composition)
 composition = Counter()
 
 # matching close bracket ends a sub-compound with an optional count
 elif tokens[0] == match[-1]:
 tokens.pop(0)
 match.pop()
 
 repeat = int(tokens.pop(0)) if tokens and tokens[0].isdigit() else 1
 for element in composition.keys():
 composition[element] *= repeat
 
 composition.update(stack.pop())
 # "syntax" error in the formula
 else:
 if token[0] in (')', ']', '}'):
 raise ValueError(f"Error, mismatched bracket: "
 f"expected '{match[-1]}' got '{token[0]}'.")
 
 else:
 raise ValueError(f"Error, unrecognized token in "
 f"formula: '{token[0]}'.")
 # left over, unmatched brackets
 if match:
 brackets = ', '.join(f"'{b}'" for b in match[::-1])
 raise ValueError(f"Error, missing bracket(s): {brackets}.")
 
 return dict(composition)

Stack Exchange Network

Python molar mass calculator

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Hot Network Questions

Python molar mass calculator

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Related

Hot Network Questions