I have written a string parser that is designed to split a string by spaces, excluding spaces wrapped in strings.
Here is some example inputs and outputs:
| Rule | Input | Output |
|------|-------------------------------|-----------------------------------|
| 1 | 'foo' | ['foo'] |
| 1 | 'foo bar' | ['foo', 'bar'] |
| 1 | 'foo bar \'abc xyz\'' | ['foo', 'bar', "'abc xyz'"] |
| 1 | 'foo bar "abc xyz"' | ['foo', 'bar', '"abc xyz"'] |
| 3 | 'foo bar "abc xyz"\\' | ['foo', 'bar', '"abc xyz"'] |
| 2 | 'foo bar "abc \\"def\\" xyz"' | ['foo', 'bar', '"abc "def" xyz"'] |
And the rules my parser follows:
The input string is split by spaces, counting any text between quotes (either single or double) as a single "unit" of text (i.e. not split by spaces).
Any text wrapped in double quotes can be escaped using a backslash. For example:
'hello "\\"world\\""'becomes['hello', '""world""'].Any trailing backslashes are stripped.
Here is the code I have written to parse the strings:
import re as _re
import enum as _enum
def is_space(text):
"""Returns whether or not the provided string is a single space."""
return _re.match(r'^\s*$', text) is not None
class State(_enum.Enum):
"""A state for the parser.
space: The last character was a space.
word: The last character was part of a word.
quote: The last character was part of a quoted string.
"""
space = 0
word = 1
quote = 2
class ParserState(object):
"""A string parser that splits a string into words.
This parser splits a string into words, counting quoted strings as a single
word.
Example:
input: 'hello world "inner string"'
output: ['hello', 'world', '"inner string"']
"""
def __init__(self, text):
"""Constructs a new parser for the provided text."""
self._text = text
self._index = 0
self._state = State.space
self._quote = ''
self._word = ''
self._words = []
@property
def text(self):
"""The source text that the parser is parsing."""
return self._text
@property
def index(self):
"""The current index in the source text of the parser."""
return self._index
@index.setter
def index(self, value):
"""The current index in the source text of the parser.
If the value is not an integer then the index is not set.
"""
if isinstance(value, int):
self._index = value
@property
def character(self):
"""The current character in the source text of the parser.
This is the same as text[index]
"""
return self._text[self.index]
@property
def state(self):
"""The current state of the parser."""
return self._state
@state.setter
def state(self, value):
"""The current state of the parser.
If the value is not a valid state then the state is not set.
"""
if value in State:
self._state = value
@property
def quote(self):
"""The current quoting used by the parser.
This is set to a quote (either ' or ") to denote the closing quote for
the current section.
"""
return self._quote
@quote.setter
def quote(self, value):
"""The current quoting used by the parser.
If the provided value is not ' or " then the quote is not set.
"""
if value in ('\'', '"'):
self._quote = value
@property
def word(self):
"""The current word that the parser is building."""
return self._word
@word.setter
def word(self, value):
"""The current word that the parser is building.
If the value is not a string then the word is not set.
"""
if isinstance(value, basestring):
self._word = value
@property
def words(self):
"""All of the words found by the parser."""
return self._words
def push_word(self, allow_empty=False, clear_word=False):
"""Push the current word onto the words array.
If the word is empty and allow_empty is False then the word will not
be added to the words array.
If clear_word is True then word is emptied after pushing it.
"""
if allow_empty or len(self.word) > 0:
self.words.append(self.word)
if clear_word:
self.word = ''
def push_character(self, increment_index=0):
"""Append the current character onto the word.
If increment_index is not 0 then index is incremented by it's value.
"""
self.word += self.character
if increment_index != 0:
self.index += increment_index
def parse(text):
"""Splits the provided string into words.
This method counts quoted strings as a single word.
Example:
input: 'hello world "inner string"'
output: ['hello', 'world', '"inner string"']
"""
state = ParserState(text)
# Iterate each character in the provided string.
while state.index < len(state.text):
if is_space(state.character):
# If we're currently in a word then we push it the words array and
# clear it's value.
if state.state == State.word:
state.push_word(clear_word=True)
# If we're not in a quoted string then skip the space. By doing
# this we ignore all consecutive spaces.
if state.state != State.quote:
state.index += 1
continue
elif state.state == State.space:
state.state = State.word
if state.state == State.word:
# If the current character is a quote then store the type of quote
# and change the state to quote.
if state.character in ('\'', '"'):
state.quote = state.character
state.state = State.quote
# If the current character is a backslash then push it onto the
# current word and increment the index. By doing this, we can now
# access the character that is being escaped.
if state.character == '\\':
# If the current character is the last character of the string
# then increment the index and skip the rest of the iteration.
# This skips the backslash character, effectively removing it
# from the end of the string.
if state.index + 1 == len(state.text):
state.index += 1
continue
state.push_character(increment_index=1)
# Push the current character onto the current word and skip the
# rest of the iteration.
state.push_character(increment_index=1)
continue
if state.state == State.quote:
# If the current character is the same as the opening quote then
# change the state to word.
if state.character == state.quote:
state.state = State.word
# If the current character is a backslash and we're not in a single
# quoted string.
if state.character == '\\' and state.quote != '\'':
# If the current character is the last character in the string
# then increment the index and skip the rest of the iteration.
# This skips the backslash character, effectively removing it
# from the end of the string.
if state.index + 1 == len(state.text):
state.index += 1
continue
# Increment the index, effectively skipping the backslash.
state.index += 1
state.push_character()
state.index += 1
state.push_word()
return state.words
The main issue I have with my code is that I feel it's a bit long and complex for what it is accomplishing; is there a simpler way to do it?
I'm not sure how relevant this is but this code is part of a Dockerfile parser. You can find the Go implementation here.
I know the Go implementation is much shorter, however I tried implementing their design and it ended up rather large and "non-pythonic". I am actually using the original logic from the Go parser but with a custom state enum and parser state object.
1 Answer 1
What benefit is your ParserState class?
The only property that I think, ok that's ok, is the character one, all the others are just noise and can go.
You also want to reset the word whenever you use push_word and when you append or push_character you just want to push the character.
And so if you had to keep it I'd use the following:
class ParserState(object):
def __init__(self, text):
self.text = text
self.index = 0
self.state = State.space
self.quote = ''
self.word = ''
self.words = []
@property
def character(self):
return self._text[self.index]
def push_word(self, allow_empty=False):
if allow_empty or self.word:
self.words.append(self.word)
self.word = ''
def append(self):
self.word += self.character
However this has no advantage over merging it with parse and keeping it a single function. In fact it hinders the readability. And so I'd merge it together to get:
def parse(text):
text = text
index = 0
state = State.space
quote = ''
word = ''
words = []
while index < len(text):
character = text[index]
if is_space(character):
if state == State.word:
if clear_word or word:
words.append(word)
if state != State.quote:
index += 1
continue
elif state == State.space:
state = State.word
if state == State.word:
if character in ('\'', '"'):
quote = character
state = State.quote
if character == '\\':
if index + 1 == len(text):
index += 1
continue
word += character
index += 1
word += character
index += 1
continue
if state == State.quote:
if character == quote:
state = State.word
if character == '\\' and quote != '\'':
if index + 1 == len(text):
index += 1
continue
index += 1
word += character
index += 1
words.append(word)
return words
For you enum I'd use a method that actually works in all 2.7 versions, rather than one that only works in the latest. I used one of the methods in the top answer, but it shouldn't matter too much. Just keep in mind that I use uppercase variables, as they are constants.
I'd also use char in string.whitespace rather than using a regex, this is as it removes a function and is a simple in.
And so for all bar the function I use:
from string import whitespace as space
def enum(*sequential, **named):
enums = dict(zip(sequential, range(len(sequential))), **named)
return type('Enum', (), enums)
State = enum('SPACE', 'WORD', 'QUOTE')
TOKEN_ESCAPE = '\\'
Your function can still be improved. The changes you made from the original are:
- Checks if whitespace first.
- Moved the addition of the final word out of the loop but removed it's checks.
The latter is half a good idea, but neither is too great. The first moves the code out of the state sections, for no good reason. The second removes the checks.
And so I'd move back to a 1:1 translation of the Go code. However rather than using the while loop method, I'd use the iterator way.
indexes = iter(range(len(text)))
for index in indexes:
if (some test):
continue
if (some other test):
index = next(indexes)
Rather than your current way of:
index = 0
while index < len(text):
if (some test):
index += 1
continue
if (some other test):
index += 1
index += 1
The former is much more succinct. And so you'd want to re-try from the conversion of the Go code in this form:
def parse(text):
words = []
word = []
state = State.SPACE
quote = ''
allow_blank = False
indexes = iter(range(len(text) + 1))
for index in indexes:
if index != len(text):
char = text[index]
if state is State.SPACE:
if index == len(rest):
break
if char in space:
continue
state = State.WORD
if (state is State.WORD or state is State.QUOTE) and index == len(text):
if allow_blank or word:
words.append(''.join(word))
break
if state is State.WORD:
if char in space:
state = State.SPACE
if allow_blank or word:
words.append(''.join(word))
word = []
allow_blank = False
continue
if char in '\'"':
quote = char
allow_blank = True
state = State.QUOTE
if char == TOKEN_ESCAPE:
if pos + 1 == len(text):
continue
word.append(char)
index += 1
char = text[index]
word.append(char)
continue
if state is State.QUOTE:
if char == quote:
state = State.WORD
if char == TOKEN_ESCAPE and quote != '\'':
if pos + 1 == len(text):
continue
word.append(char)
index += 1
char = text[index]
word.append(char)
return words
As we don't do code reviews of other peoples code, I'm not going to review this code. However I did change it the following ways:
- Move the final append out of the loop. But kept the checks. All of them.
- Removed obsolete index checks.
- Removed index from the code.
- Merged the
TOKEN_ESCAPEcode.
Which resulted in:
from string import whitespace as space
def enum(*sequential, **named):
enums = dict(zip(sequential, range(len(sequential))), **named)
return type('Enum', (), enums)
State = enum('SPACE', 'WORD', 'QUOTE')
TOKEN_ESCAPE = '\\'
def parse(text):
words = []
word = []
state = State.SPACE
quote = ''
allow_blank = False
text_ = iter(text)
for char in text_:
if state is State.SPACE:
if char in space:
continue
state = State.WORD
if state is State.WORD:
if char in space:
state = State.SPACE
if allow_blank or word:
words.append(''.join(word))
word = []
allow_blank = False
continue
if char in '\'"':
quote = char
allow_blank = True
state = State.QUOTE
elif state is State.QUOTE:
if char == quote:
state = State.WORD
if char == TOKEN_ESCAPE and ((state is State.WORD) or
(state is State.QUOTE and quote != '\'')):
new_char = next(text_, StopIteration)
if new_char is StopIteration:
break
word.append(char)
char = new_char
word.append(char)
if (state is State.WORD or state is State.QUOTE):
if allow_blank or word:
words.append(''.join(word))
return words
print(parse('foo'))
print(parse('foo bar'))
print(parse('foo bar \'abc xyz\''))
print(parse('foo bar "abc xyz"'))
print(parse('foo bar "abc xyz"\\'))
print(parse('foo bar "abc \\"def\\" xyz"'))
-
\$\begingroup\$ Can I ask what the purpose of
allow_blankis? I didn't need it in my original code, as I just used the state instead? \$\endgroup\$Jack Wilsdon– Jack Wilsdon2016年06月13日 14:41:34 +00:00Commented Jun 13, 2016 at 14:41 -
1\$\begingroup\$ @JackWilsdon It's from the Go code you linked to. From memory, and I've not looked at this for a week, your code may have a/some edge-cases if you remove it. \$\endgroup\$2016年06月13日 14:52:15 +00:00Commented Jun 13, 2016 at 14:52
-
\$\begingroup\$ It seems that the logic for the TOKEN_ESCAPE section is slightly wrong. We should be appending the new char to the word and then continuing (i.e. skipping the current iteration) instead of appending the old char and then the new char, as this ends up giving us the escaping backslashes too. I just replaced
char = new_charwithcontinuewhich seems to have fixed the issue. \$\endgroup\$Jack Wilsdon– Jack Wilsdon2016年06月13日 14:58:32 +00:00Commented Jun 13, 2016 at 14:58
shlex.split? \$\endgroup\$shlex.split, as it seems to follow pretty much the same rules my code does (except it removes the outer quotes from strings). \$\endgroup\$