
I'm working on a feature for the HamlPy (Haml for Django) project:

About Haml

For those who don't know, Haml is an indentation-based markup language which compiles to HTML:

%ul#atheletes
  - for athelete in athelete_list
    %li.athelete{'id': 'athelete_{{ athelete.pk }}'}= athelete.name

compiles to

<ul id='atheletes'>
  {% for athelete in athelete_list %}
    <li class='athelete' id='athelete_{{ athelete.pk }}'>{{ athelete.name }}</li>
  {% endfor %}
</ul>

The code

{'id': 'athelete_{{ athelete.pk }}'} is referred to as the 'attribute dictionary'. It is an (almost) valid Python dictionary, and it is currently parsed with some very ugly regular expressions and an eval(). However, I would like to add features that would make it no longer valid Python, e.g. using Haml within the attributes:

%a.link{
    'class':
        - if forloop.first
            link-first
        - else
            - if forloop.last
                link-last
    'href':
        - url some_view
    }

among other things.
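
For context, the current eval()-based step boils down to something like this (a simplified illustration, not the actual HamlPy source):

# Simplified sketch of the status quo (illustrative, not the actual
# HamlPy source): the attribute dictionary is extracted with regular
# expressions and handed to eval(), so it must remain valid Python.
attrs = eval("{'id': 'athelete_{{ athelete.pk }}', 'class': 'athelete'}")
print(attrs['class'])  # athelete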

I began by writing a class which I could swap out for the eval and would pass all of the current tests:

import re

# Valid characters for dictionary key
re_key = re.compile(r'[a-zA-Z0-9-_]+')
re_nums = re.compile(r'[0-9\.]+')

class AttributeParser:
    """Parses comma-separated HamlPy attribute values"""

    def __init__(self, data, terminator):
        self.terminator = terminator
        self.s = data.lstrip()
        # Index of current character being read
        self.ptr = 1

    def consume_whitespace(self, include_newlines=False):
        """Moves the pointer to the next non-whitespace character"""
        whitespace = (' ', '\t', '\r', '\n') if include_newlines else (' ', '\t')
        while self.ptr < len(self.s) and self.s[self.ptr] in whitespace:
            self.ptr += 1
        return self.ptr

    def consume_end_of_value(self):
        # End of value: comma or end of string
        self.ptr = self.consume_whitespace()
        if self.s[self.ptr] != self.terminator:
            if self.s[self.ptr] == ',':
                self.ptr += 1
            else:
                raise Exception("Expected comma for end of value (after ...%s), but got '%s' instead" % (self.s[max(self.ptr - 10, 0):self.ptr], self.s[self.ptr]))

    def read_until_unescaped_character(self, closing, pos=0):
        """
        Reads the dictionary string from position *pos*
        until a *closing* character not preceded by a backslash is found.
        Returns a tuple containing the string which was read (without any
        preceding backslashes) and the number of characters which were read.
        """
        initial_pos = pos
        while pos < len(self.s):
            if self.s[pos] == closing and (pos == initial_pos or self.s[pos - 1] != '\\'):
                break
            pos += 1
        return (self.s[initial_pos:pos].replace('\\' + closing, closing), pos - initial_pos + 1)

    def parse_value(self):
        self.ptr = self.consume_whitespace()
        # Invalid initial value
        val = False
        if self.s[self.ptr] == self.terminator:
            return val
        # String
        if self.s[self.ptr] in ("'", '"'):
            quote = self.s[self.ptr]
            self.ptr += 1
            val, characters_read = self.read_until_unescaped_character(quote, pos=self.ptr)
            self.ptr += characters_read
        # Django variable
        elif self.s[self.ptr:self.ptr + 2] == '={':
            self.ptr += 2
            val, characters_read = self.read_until_unescaped_character('}', pos=self.ptr)
            self.ptr += characters_read
            val = "{{ %s }}" % val
        # Django tag
        elif self.s[self.ptr:self.ptr + 2] in ['-{', '#{']:
            self.ptr += 2
            val, characters_read = self.read_until_unescaped_character('}', pos=self.ptr)
            self.ptr += characters_read
            val = "{%% %s %%}" % val
        # Boolean attributes
        elif self.s[self.ptr:self.ptr + 4] in ['none', 'None']:
            val = None
            self.ptr += 4
        # Integers and floats
        else:
            match = re_nums.match(self.s[self.ptr:])
            if match:
                val = match.group(0)
                self.ptr += len(val)
        if val is False:
            raise Exception("Failed to parse dictionary value beginning at: %s" % self.s[self.ptr:])
        self.consume_end_of_value()
        return val

class AttributeDictParser(AttributeParser):
    """
    Parses a Haml element's attribute dictionary string and
    provides a Python dictionary of the element attributes
    """

    def __init__(self, s):
        AttributeParser.__init__(self, s, '}')
        self.dict = {}

    def parse(self):
        while self.ptr < len(self.s) - 1:
            key = self.__parse_key()
            # Tuple/list parsing
            self.ptr = self.consume_whitespace()
            if self.s[self.ptr] in ('(', '['):
                tl_parser = AttributeTupleAndListParser(self.s[self.ptr:])
                val = tl_parser.parse()
                self.ptr += tl_parser.ptr
                self.consume_end_of_value()
            else:
                val = self.parse_value()
            self.dict[key] = val
        return self.dict

    def __parse_key(self):
        '''Parse key variable and consume up to the colon'''
        self.ptr = self.consume_whitespace(include_newlines=True)
        # Consume opening quote
        quote = None
        if self.s[self.ptr] in ("'", '"'):
            quote = self.s[self.ptr]
            self.ptr += 1
        # Extract key
        if quote:
            key, characters_read = self.read_until_unescaped_character(quote, pos=self.ptr)
            self.ptr += characters_read
        else:
            key_match = re_key.match(self.s[self.ptr:])
            if key_match is None:
                raise Exception("Invalid key beginning at: %s" % self.s[self.ptr:])
            key = key_match.group(0)
            self.ptr += len(key)
        # Consume colon
        self.ptr = self.consume_whitespace()
        if self.s[self.ptr] == ':':
            self.ptr += 1
        else:
            raise Exception("Expected colon for end of key (after ...%s), but got '%s' instead" % (self.s[max(self.ptr - 10, 0):self.ptr], self.s[self.ptr]))
        return key

    def render_attributes(self):
        attributes = []
        for k, v in self.dict.items():
            if k != 'id' and k != 'class':
                # Boolean attributes
                if v is None:
                    attributes.append("%s" % (k,))
                else:
                    attributes.append("%s='%s'" % (k, v))
        return ' '.join(attributes)

class AttributeTupleAndListParser(AttributeParser):
    def __init__(self, s):
        if s[0] == '(':
            terminator = ')'
        elif s[0] == '[':
            terminator = ']'
        AttributeParser.__init__(self, s, terminator)

    def parse(self):
        lst = []
        # Todo: must be an easier way...
        val = True
        while val is not False:
            val = self.parse_value()
            if val is not False:
                lst.append(val)
        self.ptr += 1
        if self.terminator == ')':
            return tuple(lst)
        else:
            return lst

The class can be used stand-alone as follows:

>>> from attribute_dict_parser import AttributeDictParser
>>> a=AttributeDictParser("{'id': 'a', 'class': 'b'}")
>>> d=a.parse()
>>> d
{'id': 'a', 'class': 'b'}
>>> type(d)
<type 'dict'>

AttributeDictParser iterates through the characters of s (the attribute dictionary string) and uses the variable ptr to track its location (to avoid unnecessary string slicing). The method __parse_key parses the keys ('id': and 'class':), and parse_value parses the values ('a' and 'b'). parse_value also handles data types other than strings. It returns False when it reaches the end of the attribute dictionary, because None is a valid value to return.
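
For example, None parses as a value in its own right:

>>> from attribute_dict_parser import AttributeDictParser
>>> AttributeDictParser("{'disabled': None}").parse()
{'disabled': None}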

AttributeTupleAndListParser parses list and tuple values, as these are valid values too (e.g. {'id': ['a', 'b', 'c']}).
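
For example:

>>> AttributeDictParser("{'id': ['a', 'b', 'c']}").parse()
{'id': ['a', 'b', 'c']}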

Both of these classes inherit from AttributeParser because they share the same way of parsing values.

Questions:

  1. Is this a sensible approach? Am I insane to think that I can move from eval()ing the code as a Python dictionary to a custom parser without causing issues for users, just because it passes the tests?

  2. I'm worried that the performance hit of a parser written in an interpreted language will be too great compared to the eval(). I've written similar things before for parsing JSON expressions, and was dismayed that, for all my optimisations, the two-line regular expression won on the benchmarks. I will do some profiling once I've tidied up a few things (a minimal timing harness is sketched after this list). Are there any notable inefficiencies in my approach?

  3. There are some things in the old parser that have not been ported to the new one (e.g. supporting the Ruby Haml => syntax). However, this feature has never been documented, and I doubt anybody knows it's there. What is a good rule of thumb for breaking an undocumented feature in an open source project?

  4. I would welcome any feedback on my coding style, as I don't get to be around other developers much.
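
The timing comparison I have in mind for question 2 looks roughly like this (a sketch, assuming the attribute_dict_parser module from the example above):

# Sketch of the benchmark for question 2 (assumes the module name
# attribute_dict_parser used in the stand-alone example above).
import timeit

s = "{'id': 'a', 'class': 'b'}"
setup = "from attribute_dict_parser import AttributeDictParser; s = %r" % s
print(timeit.timeit("eval(s)", setup=setup, number=100000))
print(timeit.timeit("AttributeDictParser(s).parse()", setup=setup, number=100000))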


1 Answer

1. Answers to your questions

  1. If the goal of the project is to be able to include Haml in attribute values, then you've got no choice but to switch to your own parser. I haven't looked at the set of test cases, but it seems plausible that you will introduce incompatibilities because of the complexity of Python's own parser: you are going to find users who relied on the oddities of Python's string syntax (r-strings, \u-escapes and all).

    The way to manage the transition from the old parser to the new one is to start out by shipping both, with the old parser selected by default but the new parser selectable with an option (a sketch follows this list). This gives your users time to discover the incompatibilities and fix them (or submit bug reports). Then, in a later release, make the new parser the default, with the old parser still available but deprecated. Finally, remove the old parser.

  2. Correctness and simplicity first, speed later. You can always port the parser to C if nothing else will do.

  3. My answer to question 1 applies here too.

  4. See below.
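
The selectable-parser arrangement from point 1 might look something like this (function and option names invented for illustration; not HamlPy's actual API):

# Hypothetical sketch of the transition switch (names invented for
# illustration, not HamlPy's actual API): ship both parsers, with the
# old eval()-based one remaining the default at first.
from attribute_dict_parser import AttributeDictParser

def parse_attribute_dict(s, use_new_parser=False):
    if use_new_parser:
        return AttributeDictParser(s).parse()  # new hand-written parser
    return eval(s)                             # legacy eval(), default for now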

2. Designing a parser

Now, let's look at the code. I thought about making a series of comments on the various misfeatures, but that seems less than helpful, given that the whole design of the parser isn't quite right:

  1. There's no separation between the lexer and the parser.

  2. You have different classes for different productions in your syntax, so that each time you need to parse a tuple/list, you construct a new AttributeTupleAndListParser object, construct a string for it to parse (by copying the tail of the original string), and then throw away the parser object when done.

  3. Some of your parsing methods don't seem well-matched to the syntax of the language, making it difficult to understand what they do. consume_end_of_value is a good example: it doesn't seem to correspond to anything natural in the syntax.

Computer science is by no means a discipline with all the answers, but one thing that we know how to do is write a parser! You don't have to have read the dragon book from cover to cover to know that it's conventional to develop a formal grammar for your language (often written down in a variation on Backus–Naur form), and then split your code into a lexical analyzer (which transforms source code into tokens using a finite state machine or something similar) and a parser which takes a stream of tokens and constructs a syntax tree or some other form of output based on the syntax of the input.

Sticking to this convention has a bunch of advantages: the existence of a formal grammar makes it easier to build compatible implementations; you can modify and test the lexical analyzer independently from the parser and vice versa; and other programmers will find it easier to understand and modify your code.

3. Rewriting your parser conventionally

Here's how I might start rewriting your parser to use the conventional approach. This implements a deliberately incomplete subset of the HamlPy attribute language, in order to keep the code short and to get it finished in a reasonable amount of time.

First, a class whose instances represent source tokens. The original string and position of each token is recorded in that token so that we can easily produce an error message related to that token. I've used the built-in exception SyntaxError here so that the error messages match those from other Python libraries. (You might later want to extend this so that the class can represent tokens from files as well as tokens from strings.)

class Token(object):
    """
    An object representing a token in a HamlPy document. Construct it
    using `Token(type, value, source, start, end)` where:

    `type` is the token type (`Token.DELIMITER`, `Token.STRING`, etc);
    `value` is the token value;
    `source` is the string from which the token was taken;
    `start` is the character position in `source` where the token starts;
    `end` is the character position in `source` where the token finishes.
    """
    # Enumeration of token types.
    DELIMITER = 1
    STRING = 2
    END = 3
    ERROR = 4

    def __init__(self, type, value, source, start, end):
        self.type = type
        self.value = value
        self.source = source
        self.start = start
        self.end = end

    def __repr__(self):
        type_name = 'UNKNOWN'
        for attr in dir(self):
            if getattr(self, attr) == self.type:
                type_name = attr
                break
        return ('Token(Token.{0}, {1}, {2}, {3}, {4})'
                .format(type_name, repr(self.value), repr(self.source),
                        self.start, self.end))

    def matches(self, type, value):
        """
        Return True iff this token matches the given `type` and `value`.
        """
        return self.type == type and self.value == value

    def error(self, msg):
        """
        Return a `SyntaxError` object describing a problem with this
        token. The argument `msg` is the error message; the token's
        line number and position are also reported.
        """
        line_start = 1 + self.source.rfind('\n', 0, self.start)
        line_end = self.source.find('\n', self.end)
        if line_end == -1:
            line_end = len(self.source)
        e = SyntaxError(msg)
        e.lineno = 1 + self.source.count('\n', 0, self.start)
        e.text = self.source[line_start:line_end]
        e.offset = self.start - line_start + 1
        return e
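
A quick check of the error reporting (a usage sketch, assuming the class above is in scope):

>>> t = Token(Token.ERROR, '!', '{"a": !}', 6, 7)
>>> e = t.error('Unexpected character')
>>> e.lineno, e.offset, e.text
(1, 7, '{"a": !}')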

Second, the lexical analyzer, using Python's iterator protocol.

import re

class Tokenizer(object):
    """
    Tokenizer for a subset of HamlPy. Instances of this class support
    the iterator protocol, and yield tokens from the string `s` as
    Token objects. When the string `s` runs out, yield an END token.

    >>> from pprint import pprint
    >>> pprint(list(Tokenizer('{"a":"b"}')))
    [Token(Token.DELIMITER, '{', '{"a":"b"}', 0, 1),
     Token(Token.STRING, 'a', '{"a":"b"}', 2, 3),
     Token(Token.DELIMITER, ':', '{"a":"b"}', 4, 5),
     Token(Token.STRING, 'b', '{"a":"b"}', 6, 7),
     Token(Token.DELIMITER, '}', '{"a":"b"}', 8, 9),
     Token(Token.END, '', '{"a":"b"}', 9, 9)]
    """
    def __init__(self, s):
        self.iter = self.tokenize(s)

    def __iter__(self):
        return self

    def next(self):
        return next(self.iter)

    __next__ = next  # Python 3 iterator protocol

    # Regular expression matching a source token.
    token_re = re.compile(r'''
        \s*                          # Ignore initial whitespace
        (?:([][{},:])                # 1. Delimiter
        |'([^\\']*(?:\\.[^\\']*)*)'  # 2. Single-quoted string
        |"([^\\"]*(?:\\.[^\\"]*)*)"  # 3. Double-quoted string
        |(\S)                        # 4. Something else
        )''', re.X)

    # Regular expression matching a backslash and following character.
    backslash_re = re.compile(r'\\(.)')

    def tokenize(self, s):
        for m in self.token_re.finditer(s):
            if m.group(1) is not None:
                yield Token(Token.DELIMITER, m.group(1),
                            s, m.start(1), m.end(1))
            elif m.group(2) is not None:
                yield Token(Token.STRING,
                            self.backslash_re.sub(r'\1', m.group(2)),
                            s, m.start(2), m.end(2))
            elif m.group(3) is not None:
                yield Token(Token.STRING,
                            self.backslash_re.sub(r'\1', m.group(3)),
                            s, m.start(3), m.end(3))
            else:
                t = Token(Token.ERROR, m.group(4), s, m.start(4), m.end(4))
                raise t.error('Unexpected character')
        yield Token(Token.END, '', s, len(s), len(s))
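
The ERROR branch surfaces bad input as a SyntaxError carrying the position information from the token, for example:

>>> list(Tokenizer('{a: "b"}'))
Traceback (most recent call last):
  ...
SyntaxError: Unexpected character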

And third, the recursive descent parser, with the formal grammar given in the docstring for the class. The parser needs one token of lookahead.

class Parser(object):
    """
    Parser for the subset of HamlPy with the following grammar:

        attribute-dict ::= '{' [attribute-list] '}'
        attribute-list ::= attribute (',' attribute)*
        attribute      ::= string ':' value
        value          ::= string | '[' [value-list] ']'
        value-list     ::= value (',' value)*
    """
    def __init__(self, s):
        self.tokenizer = Tokenizer(s)
        self.lookahead = None  # The lookahead token.
        self.next_token()      # Lookahead one token.

    def next_token(self):
        """
        Return the next token from the lexer and update the lookahead
        token.
        """
        t = self.lookahead
        self.lookahead = next(self.tokenizer)
        return t

    # Regular expression matching an allowable key.
    key_re = re.compile(r'[a-zA-Z_0-9-]+$')

    def parse_value(self):
        t = self.next_token()
        if t.type == Token.STRING:
            return t.value
        elif t.matches(Token.DELIMITER, '['):
            return list(self.parse_value_list())
        else:
            raise t.error('Expected a value')

    def parse_value_list(self):
        if self.lookahead.matches(Token.DELIMITER, ']'):
            self.next_token()
            return
        while True:
            yield self.parse_value()
            t = self.next_token()
            if t.matches(Token.DELIMITER, ']'):
                return
            elif not t.matches(Token.DELIMITER, ','):
                raise t.error('Expected "," or "]"')

    def parse_attribute(self):
        t = self.next_token()
        if t.type != Token.STRING:
            raise t.error('Expected a string')
        key = t.value
        if not self.key_re.match(key):
            raise t.error('Invalid key')
        t = self.next_token()
        if not t.matches(Token.DELIMITER, ':'):
            raise t.error('Expected ":"')
        value = self.parse_value()
        return key, value

    def parse_attribute_list(self):
        if self.lookahead.matches(Token.DELIMITER, '}'):
            self.next_token()
            return
        while True:
            yield self.parse_attribute()
            t = self.next_token()
            if t.matches(Token.DELIMITER, '}'):
                return
            elif not t.matches(Token.DELIMITER, ','):
                raise t.error('Expected "," or "}"')

    def parse_attribute_dict(self):
        t = self.next_token()
        if not t.matches(Token.DELIMITER, '{'):
            raise t.error('Expected "{"')
        return dict(self.parse_attribute_list())
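
Putting the three pieces together (a usage sketch, assuming the classes above live in one module):

>>> d = Parser('{"id": "a", "class": ["b", "c"]}').parse_attribute_dict()
>>> d == {'id': 'a', 'class': ['b', 'c']}
True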

You'll probably want to know how to handle Haml's significant whitespace. The way to do this is to modify the tokenizer to emit NEWLINE, INDENT and DEDENT tokens, and then modify next_token to take an include_newlines optional parameter, and discard or return these extra tokens as appropriate.
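
One possible shape for that next_token extension (illustrative only; it assumes the Token class has grown NEWLINE, INDENT and DEDENT constants and that the tokenizer now emits them):

# Inside the Parser class; illustrative sketch only. Assumes Token has
# NEWLINE, INDENT and DEDENT type constants and the tokenizer emits them.
def next_token(self, include_newlines=False):
    layout = (Token.NEWLINE, Token.INDENT, Token.DEDENT)
    while True:
        t = self.lookahead
        self.lookahead = next(self.tokenizer)
        if include_newlines or t.type not in layout:
            return t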

