Quoted string parser

Question 1

I have written a string parser that is designed to split a string by spaces, excluding spaces wrapped in strings.

Here are some example inputs and outputs:

| Rule | Input | Output |
|------|-------------------------------|-----------------------------------|
| 1 | 'foo' | ['foo'] |
| 1 | 'foo bar' | ['foo', 'bar'] |
| 1 | 'foo bar \'abc xyz\'' | ['foo', 'bar', "'abc xyz'"] |
| 1 | 'foo bar "abc xyz"' | ['foo', 'bar', '"abc xyz"'] |
| 3 | 'foo bar "abc xyz"\\' | ['foo', 'bar', '"abc xyz"'] |
| 2 | 'foo bar "abc \\"def\\" xyz"' | ['foo', 'bar', '"abc "def" xyz"'] |

And the rules my parser follows:

1. The input string is split by spaces, counting any text between quotes (either single or double) as a single "unit" of text (i.e. not split by spaces).
2. Any text wrapped in double quotes can be escaped using a backslash. For example: 'hello "\\"world\\""' becomes ['hello', '""world""'].
3. Any trailing backslashes are stripped.

Here is the code I have written to parse the strings:

import re as _re
import enum as _enum
def is_space(text):
    """Tell whether ``text`` is made up of nothing but whitespace.

    The pattern also matches the empty string, so ``is_space('')`` is True.
    """
    blank = _re.match(r'^\s*$', text)
    return blank is not None
class State(_enum.Enum):
    """The phases of the parser, each named after the previous character."""

    # The last character was a space.
    space = 0
    # The last character was part of a word.
    word = 1
    # The last character was part of a quoted string.
    quote = 2
class ParserState(object):
    """Mutable bookkeeping used while splitting a string into words.

    Holds the source text, the read position, the current ``State``, the
    quote character that closes the section being read, the word under
    construction and the list of finished words.  Quoted strings are meant
    to be collected as a single word.

    Example:
        input: 'hello world "inner string"'
        output: ['hello', 'world', '"inner string"']

    Every setter silently ignores a value of the wrong kind instead of
    raising, so an invalid assignment leaves the previous value in place.
    """

    # ``basestring`` only exists on Python 2.  Fall back to ``str`` so the
    # ``word`` setter does not raise NameError on Python 3.
    try:
        _string_types = (basestring,)  # noqa: F821
    except NameError:
        _string_types = (str,)

    def __init__(self, text):
        """Construct a new parser state for the provided text."""
        self._text = text
        self._index = 0
        self._state = State.space
        self._quote = ''
        self._word = ''
        self._words = []

    @property
    def text(self):
        """The source text that the parser is parsing."""
        return self._text

    @property
    def index(self):
        """The current index in the source text of the parser."""
        return self._index

    @index.setter
    def index(self, value):
        """Set the current index; non-integer values are ignored."""
        if isinstance(value, int):
            self._index = value

    @property
    def character(self):
        """The current character, i.e. ``text[index]``.

        Raises IndexError when ``index`` is outside the text.
        """
        return self._text[self.index]

    @property
    def state(self):
        """The current state of the parser."""
        return self._state

    @state.setter
    def state(self, value):
        """Set the current state; anything that is not a ``State`` member
        is ignored.
        """
        # isinstance() is used rather than ``value in State`` because the
        # containment test raises TypeError for non-members on some Python 3
        # releases and matches raw values such as 0 on others.
        if isinstance(value, State):
            self._state = value

    @property
    def quote(self):
        """The quote character (' or ") that closes the current section."""
        return self._quote

    @quote.setter
    def quote(self, value):
        """Set the closing quote; values other than ' or " are ignored."""
        if value in ('\'', '"'):
            self._quote = value

    @property
    def word(self):
        """The current word that the parser is building."""
        return self._word

    @word.setter
    def word(self, value):
        """Set the word being built; non-string values are ignored."""
        if isinstance(value, self._string_types):
            self._word = value

    @property
    def words(self):
        """All of the words found by the parser."""
        return self._words

    def push_word(self, allow_empty=False, clear_word=False):
        """Append the current word to ``words``.

        An empty word is only appended when ``allow_empty`` is True.  When
        ``clear_word`` is True the current word is emptied afterwards,
        whether or not it was appended.
        """
        if allow_empty or self.word:
            self.words.append(self.word)
        if clear_word:
            self.word = ''

    def push_character(self, increment_index=0):
        """Append the current character to the word.

        If ``increment_index`` is not 0, ``index`` is advanced by its value
        after the character has been appended.
        """
        self.word += self.character
        if increment_index != 0:
            self.index += increment_index
def parse(text):
    """Split ``text`` into words, treating a quoted string as one word.

    Example:
        input: 'hello world "inner string"'
        output: ['hello', 'world', '"inner string"']
    """
    parser = ParserState(text)
    total = len(parser.text)
    quote_chars = ('\'', '"')

    # Walk the text one character at a time; the branches below decide how
    # far the index moves on each pass.
    while parser.index < total:
        if is_space(parser.character):
            # Whitespace ends the word being built (if any).
            if parser.state == State.word:
                parser.push_word(clear_word=True)
            # Outside of quotes the whitespace itself is dropped, which
            # also collapses runs of consecutive spaces.
            if parser.state != State.quote:
                parser.index += 1
                continue
        elif parser.state == State.space:
            parser.state = State.word

        if parser.state == State.word:
            # An opening quote is remembered so its partner can close the
            # quoted section later; the quote itself stays in the word.
            if parser.character in quote_chars:
                parser.quote = parser.character
                parser.state = State.quote
            if parser.character == '\\':
                # A backslash that ends the text is skipped entirely.
                if parser.index + 1 == total:
                    parser.index += 1
                    continue
                # Otherwise keep the backslash and step onto the character
                # it escapes, which is appended just below.
                parser.push_character(increment_index=1)
            parser.push_character(increment_index=1)
            continue

        if parser.state == State.quote:
            current = parser.character
            if current == parser.quote:
                # The matching quote closes the section.
                parser.state = State.word
            elif current == '\\' and parser.quote != '\'':
                # Backslashes are only special outside single quotes.  A
                # backslash that ends the text is skipped entirely.
                if parser.index + 1 == total:
                    parser.index += 1
                    continue
                # Step over the backslash so only the escaped character is
                # appended.
                parser.index += 1
            parser.push_character()

        parser.index += 1

    parser.push_word()
    return parser.words

The main issue I have with my code is that I feel it's a bit long and complex for what it is accomplishing; is there a simpler way to do it?

I'm not sure how relevant this is but this code is part of a Dockerfile parser. You can find the Go implementation here.

I know the Go implementation is much shorter, however I tried implementing their design and it ended up rather large and "non-pythonic". I am actually using the original logic from the Go parser but with a custom state enum and parser state object.

Question 2

Hi, I'm quite new to this stack exchange so I would like to know a few things: 1. Do you mind using built-in Python modules? 2. Do you want a class or a function in the end?

Question 3

Did you consider using shlex.split?

Question 4

@MathiasEttinger: Sadly a few of the tests fail (the main issue being the fifth one, as I get an exception relating to the lack of a character being escaped), however I may end up pre-parsing the trailing backslashes before just using shlex.split, as it seems to follow pretty much the same rules my code does (except it removes the outer quotes from strings).

Question 5

What benefit is your ParserState class?

The only property I think is worth keeping is character; all the others are just noise and can go. You also want to reset the word whenever you call push_word, and when you append (push_character) you just want to push the character. So if you had to keep the class, I'd use the following:

class ParserState(object):
    """Plain-attribute holder for the parser's working state."""

    def __init__(self, text):
        """Start at index 0 of ``text`` with no word and no quote."""
        self.text = text
        self.index = 0
        self.state = State.space
        self.quote = ''
        self.word = ''
        self.words = []

    @property
    def character(self):
        """The character at the current index, i.e. ``text[index]``."""
        # Read ``self.text``: ``__init__`` never sets a ``_text`` attribute,
        # so the previous ``self._text`` lookup raised AttributeError.
        return self.text[self.index]

    def push_word(self, allow_empty=False):
        """Append the current word to ``words`` and reset it.

        An empty word is only appended when ``allow_empty`` is True.
        """
        if allow_empty or self.word:
            self.words.append(self.word)
        self.word = ''

    def append(self):
        """Append the current character to the word being built."""
        self.word += self.character

However this has no advantage over merging it with parse and keeping it a single function. In fact it hinders the readability. And so I'd merge it together to get:

def parse(text):
    """Split ``text`` into words, treating a quoted string as one word.

    This is the class-based parser from the question with the state object
    inlined into local variables.  It relies on the question's ``is_space``
    helper and its ``State`` enum (``space``, ``word``, ``quote``).

    Returns the list of words; quote characters are kept in the words, a
    backslash inside double quotes is dropped in favour of the character it
    escapes, and a backslash that ends the text is discarded.
    """
    index = 0
    state = State.space
    quote = ''
    word = ''
    words = []
    length = len(text)
    while index < length:
        character = text[index]
        if is_space(character):
            # Whitespace ends the current word; push it and start afresh.
            if state == State.word and word:
                words.append(word)
                word = ''
            # Outside of quotes the whitespace itself is skipped.
            if state != State.quote:
                index += 1
                continue
        elif state == State.space:
            state = State.word
        if state == State.word:
            if character in ('\'', '"'):
                quote = character
                state = State.quote
            if character == '\\':
                # A backslash that ends the text is skipped entirely.
                if index + 1 == length:
                    index += 1
                    continue
                # Keep the backslash, then move on to the escaped character
                # so that it (not the backslash again) is appended below.
                word += character
                index += 1
                character = text[index]
            word += character
            index += 1
            continue
        if state == State.quote:
            if character == quote:
                state = State.word
            if character == '\\' and quote != '\'':
                # A backslash that ends the text is skipped entirely.
                if index + 1 == length:
                    index += 1
                    continue
                # Drop the backslash and append the escaped character.
                index += 1
                character = text[index]
            word += character
        index += 1
    # Only push the final word when there is one, so '' gives [].
    if word:
        words.append(word)
    return words

For your enum I'd use a method that actually works in all 2.7 versions, rather than one that only works in the latest. I used one of the methods in the top answer, but it shouldn't matter too much. Just keep in mind that I use uppercase variables, as they are constants.

I'd also use char in string.whitespace rather than using a regex, this is as it removes a function and is a simple in.

And so for all bar the function I use:

from string import whitespace as space
def enum(*sequential, **named):
    """Build a lightweight enumeration class.

    Each positional name is bound to its position (0, 1, 2, ...).  Keyword
    arguments supply explicit name/value pairs and win over a positional
    name spelled the same way.  The result is a new class named ``Enum``
    whose class attributes are those members.
    """
    members = {name: position for position, name in enumerate(sequential)}
    members.update(named)
    return type('Enum', (), members)


# Parser phases; the values are the positions 0, 1 and 2.
State = enum('SPACE', 'WORD', 'QUOTE')
# The escape character: a single backslash.
TOKEN_ESCAPE = '\\'

Your function can still be improved. The changes you made from the original are:

Checks if whitespace first.
Moved the addition of the final word out of the loop but removed its checks.

The latter is half a good idea, but neither is too great. The first moves the code out of the state sections, for no good reason. The second removes the checks.

And so I'd move back to a 1:1 translation of the Go code. However rather than using the while loop method, I'd use the iterator way.

indexes = iter(range(len(text)))
for index in indexes:
 if (some test):
 continue
 if (some other test):
 index = next(indexes)

Rather than your current way of:

index = 0
while index < len(text):
 if (some test):
 index += 1
 continue
 if (some other test):
 index += 1
 index += 1

The former is much more succinct. And so you'd want to re-try from the conversion of the Go code in this form:

def parse(text):
    """Split ``text`` into words, following the Go loop index by index.

    Relies on the surrounding listing's ``State`` (``SPACE``, ``WORD``,
    ``QUOTE``), ``space`` (the whitespace characters) and ``TOKEN_ESCAPE``.

    As in this translation's original form, an escape token is kept in the
    word together with the character it escapes, except inside single
    quotes where it is an ordinary character.  An escape token that ends
    the text is dropped.
    """
    words = []
    word = []
    state = State.SPACE
    quote = ''
    allow_blank = False
    end = len(text)
    # The extra index (== end) gives the loop one pass after the last
    # character, which is where the final word is flushed.
    indexes = iter(range(end + 1))
    for index in indexes:
        if index != end:
            char = text[index]
        if state is State.SPACE:
            if index == end:
                break
            if char in space:
                continue
            state = State.WORD
        if (state is State.WORD or state is State.QUOTE) and index == end:
            if allow_blank or word:
                words.append(''.join(word))
            break
        if state is State.WORD:
            if char in space:
                state = State.SPACE
                if allow_blank or word:
                    words.append(''.join(word))
                word = []
                allow_blank = False
                continue
            if char in '\'"':
                quote = char
                allow_blank = True
                state = State.QUOTE
            if char == TOKEN_ESCAPE:
                if index + 1 == end:
                    continue
                word.append(char)
                # Pull the next index from the iterator so the escaped
                # character is consumed here and not visited again.
                index = next(indexes)
                char = text[index]
            word.append(char)
            continue
        if state is State.QUOTE:
            if char == quote:
                state = State.WORD
            if char == TOKEN_ESCAPE and quote != '\'':
                if index + 1 == end:
                    continue
                word.append(char)
                # Consume the escaped character (see the WORD branch).
                index = next(indexes)
                char = text[index]
            word.append(char)
    return words

As we don't do code reviews of other people's code, I'm not going to review this code. However, I did change it in the following ways:

Move the final append out of the loop. But kept the checks. All of them.
Removed obsolete index checks.
Removed index from the code.
Merged the TOKEN_ESCAPE code.

Which resulted in:

from string import whitespace as space
def enum(*sequential, **named):
    """Build a lightweight enumeration class.

    Positional names are numbered 0, 1, 2, ... in order; keyword arguments
    supply explicit name/value pairs.  Returns a new class named ``Enum``
    with those members as class attributes.
    """
    enums = dict(zip(sequential, range(len(sequential))), **named)
    return type('Enum', (), enums)


# Parser phases; the values are the positions 0, 1 and 2.
State = enum('SPACE', 'WORD', 'QUOTE')
# The escape character: a single backslash.
TOKEN_ESCAPE = '\\'


def parse(text):
    """Split ``text`` on whitespace, keeping quoted sections together.

    Rules, matching the table in the question:

    * Text between matching single or double quotes is one unit; the quote
      characters stay in the resulting word.
    * Inside double quotes a backslash escapes the next character: the
      backslash is dropped and the character is taken literally.
    * Inside single quotes a backslash is an ordinary character.
    * Outside quotes a backslash and the character after it are both kept.
    * A backslash that ends the text is discarded.

    Returns the list of words.
    """
    words = []
    word = []
    state = State.SPACE
    quote = ''
    allow_blank = False
    chars = iter(text)
    for char in chars:
        if state is State.SPACE:
            if char in space:
                continue
            state = State.WORD

        if state is State.WORD:
            if char in space:
                state = State.SPACE
                if allow_blank or word:
                    words.append(''.join(word))
                word = []
                allow_blank = False
                continue
            if char in '\'"':
                quote = char
                allow_blank = True
                state = State.QUOTE
            elif char == TOKEN_ESCAPE:
                escaped = next(chars, None)
                if escaped is None:
                    # Trailing escape token: drop it.
                    break
                # Unquoted: keep the escape token and the escaped character.
                word.append(char)
                char = escaped
        elif state is State.QUOTE:
            if char == quote:
                state = State.WORD
            elif char == TOKEN_ESCAPE and quote != '\'':
                escaped = next(chars, None)
                if escaped is None:
                    # Trailing escape token: drop it.
                    break
                # Double-quoted: the escape token is consumed and only the
                # escaped character is kept, so \" becomes ".
                char = escaped
        word.append(char)

    if state is not State.SPACE and (allow_blank or word):
        words.append(''.join(word))
    return words
# Demo: run the parser on the six example inputs from the question's table.
print(parse('foo'))
print(parse('foo bar'))
print(parse('foo bar \'abc xyz\''))
print(parse('foo bar "abc xyz"'))
print(parse('foo bar "abc xyz"\\'))
print(parse('foo bar "abc \\"def\\" xyz"'))

Question 6

Can I ask what the purpose of allow_blank is? I didn't need it in my original code, as I just used the state instead?

Question 7

@JackWilsdon It's from the Go code you linked to. From memory, and I've not looked at this for a week, your code may have a/some edge-cases if you remove it.

Question 8

It seems that the logic for the TOKEN_ESCAPE section is slightly wrong. We should be appending the new char to the word and then continuing (i.e. skipping the current iteration) instead of appending the old char and then the new char, as this ends up giving us the escaping backslashes too. I just replaced char = new_char with continue which seems to have fixed the issue.

Peilonrayz ♦ 44.6k7 gold badges80 silver badges158 bronze badges · Answer 1 · 2016-06-06 13:12:14Z

What benefit is your ParserState class?

The only property I think is worth keeping is character; all the others are just noise and can go. You also want to reset the word whenever you call push_word, and when you append (push_character) you just want to push the character. So if you had to keep the class, I'd use the following:

class ParserState(object):
    """Plain-attribute holder for the parser's working state."""

    def __init__(self, text):
        """Start at index 0 of ``text`` with no word and no quote."""
        self.text = text
        self.index = 0
        self.state = State.space
        self.quote = ''
        self.word = ''
        self.words = []

    @property
    def character(self):
        """The character at the current index, i.e. ``text[index]``."""
        # Read ``self.text``: ``__init__`` never sets a ``_text`` attribute,
        # so the previous ``self._text`` lookup raised AttributeError.
        return self.text[self.index]

    def push_word(self, allow_empty=False):
        """Append the current word to ``words`` and reset it.

        An empty word is only appended when ``allow_empty`` is True.
        """
        if allow_empty or self.word:
            self.words.append(self.word)
        self.word = ''

    def append(self):
        """Append the current character to the word being built."""
        self.word += self.character

However this has no advantage over merging it with parse and keeping it a single function. In fact it hinders the readability. And so I'd merge it together to get:

def parse(text):
    """Split ``text`` into words, treating a quoted string as one word.

    This is the class-based parser from the question with the state object
    inlined into local variables.  It relies on the question's ``is_space``
    helper and its ``State`` enum (``space``, ``word``, ``quote``).

    Returns the list of words; quote characters are kept in the words, a
    backslash inside double quotes is dropped in favour of the character it
    escapes, and a backslash that ends the text is discarded.
    """
    index = 0
    state = State.space
    quote = ''
    word = ''
    words = []
    length = len(text)
    while index < length:
        character = text[index]
        if is_space(character):
            # Whitespace ends the current word; push it and start afresh.
            if state == State.word and word:
                words.append(word)
                word = ''
            # Outside of quotes the whitespace itself is skipped.
            if state != State.quote:
                index += 1
                continue
        elif state == State.space:
            state = State.word
        if state == State.word:
            if character in ('\'', '"'):
                quote = character
                state = State.quote
            if character == '\\':
                # A backslash that ends the text is skipped entirely.
                if index + 1 == length:
                    index += 1
                    continue
                # Keep the backslash, then move on to the escaped character
                # so that it (not the backslash again) is appended below.
                word += character
                index += 1
                character = text[index]
            word += character
            index += 1
            continue
        if state == State.quote:
            if character == quote:
                state = State.word
            if character == '\\' and quote != '\'':
                # A backslash that ends the text is skipped entirely.
                if index + 1 == length:
                    index += 1
                    continue
                # Drop the backslash and append the escaped character.
                index += 1
                character = text[index]
            word += character
        index += 1
    # Only push the final word when there is one, so '' gives [].
    if word:
        words.append(word)
    return words

For your enum I'd use a method that actually works in all 2.7 versions, rather than one that only works in the latest. I used one of the methods in the top answer, but it shouldn't matter too much. Just keep in mind that I use uppercase variables, as they are constants.

I'd also use char in string.whitespace rather than using a regex, this is as it removes a function and is a simple in.

And so for all bar the function I use:

from string import whitespace as space
def enum(*sequential, **named):
    """Build a lightweight enumeration class.

    Each positional name is bound to its position (0, 1, 2, ...).  Keyword
    arguments supply explicit name/value pairs and win over a positional
    name spelled the same way.  The result is a new class named ``Enum``
    whose class attributes are those members.
    """
    members = {name: position for position, name in enumerate(sequential)}
    members.update(named)
    return type('Enum', (), members)


# Parser phases; the values are the positions 0, 1 and 2.
State = enum('SPACE', 'WORD', 'QUOTE')
# The escape character: a single backslash.
TOKEN_ESCAPE = '\\'

Your function can still be improved. The changes you made from the original are:

Checks if whitespace first.
Moved the addition of the final word out of the loop but removed its checks.

The latter is half a good idea, but neither is too great. The first moves the code out of the state sections, for no good reason. The second removes the checks.

And so I'd move back to a 1:1 translation of the Go code. However rather than using the while loop method, I'd use the iterator way.

indexes = iter(range(len(text)))
for index in indexes:
 if (some test):
 continue
 if (some other test):
 index = next(indexes)

Rather than your current way of:

index = 0
while index < len(text):
 if (some test):
 index += 1
 continue
 if (some other test):
 index += 1
 index += 1

The former is much more succinct. And so you'd want to re-try from the conversion of the Go code in this form:

def parse(text):
    """Split ``text`` into words, following the Go loop index by index.

    Relies on the surrounding listing's ``State`` (``SPACE``, ``WORD``,
    ``QUOTE``), ``space`` (the whitespace characters) and ``TOKEN_ESCAPE``.

    As in this translation's original form, an escape token is kept in the
    word together with the character it escapes, except inside single
    quotes where it is an ordinary character.  An escape token that ends
    the text is dropped.
    """
    words = []
    word = []
    state = State.SPACE
    quote = ''
    allow_blank = False
    end = len(text)
    # The extra index (== end) gives the loop one pass after the last
    # character, which is where the final word is flushed.
    indexes = iter(range(end + 1))
    for index in indexes:
        if index != end:
            char = text[index]
        if state is State.SPACE:
            if index == end:
                break
            if char in space:
                continue
            state = State.WORD
        if (state is State.WORD or state is State.QUOTE) and index == end:
            if allow_blank or word:
                words.append(''.join(word))
            break
        if state is State.WORD:
            if char in space:
                state = State.SPACE
                if allow_blank or word:
                    words.append(''.join(word))
                word = []
                allow_blank = False
                continue
            if char in '\'"':
                quote = char
                allow_blank = True
                state = State.QUOTE
            if char == TOKEN_ESCAPE:
                if index + 1 == end:
                    continue
                word.append(char)
                # Pull the next index from the iterator so the escaped
                # character is consumed here and not visited again.
                index = next(indexes)
                char = text[index]
            word.append(char)
            continue
        if state is State.QUOTE:
            if char == quote:
                state = State.WORD
            if char == TOKEN_ESCAPE and quote != '\'':
                if index + 1 == end:
                    continue
                word.append(char)
                # Consume the escaped character (see the WORD branch).
                index = next(indexes)
                char = text[index]
            word.append(char)
    return words

As we don't do code reviews of other people's code, I'm not going to review this code. However, I did change it in the following ways:

Move the final append out of the loop. But kept the checks. All of them.
Removed obsolete index checks.
Removed index from the code.
Merged the TOKEN_ESCAPE code.

Which resulted in:

from string import whitespace as space
def enum(*sequential, **named):
    """Build a lightweight enumeration class.

    Positional names are numbered 0, 1, 2, ... in order; keyword arguments
    supply explicit name/value pairs.  Returns a new class named ``Enum``
    with those members as class attributes.
    """
    enums = dict(zip(sequential, range(len(sequential))), **named)
    return type('Enum', (), enums)


# Parser phases; the values are the positions 0, 1 and 2.
State = enum('SPACE', 'WORD', 'QUOTE')
# The escape character: a single backslash.
TOKEN_ESCAPE = '\\'


def parse(text):
    """Split ``text`` on whitespace, keeping quoted sections together.

    Rules, matching the table in the question:

    * Text between matching single or double quotes is one unit; the quote
      characters stay in the resulting word.
    * Inside double quotes a backslash escapes the next character: the
      backslash is dropped and the character is taken literally.
    * Inside single quotes a backslash is an ordinary character.
    * Outside quotes a backslash and the character after it are both kept.
    * A backslash that ends the text is discarded.

    Returns the list of words.
    """
    words = []
    word = []
    state = State.SPACE
    quote = ''
    allow_blank = False
    chars = iter(text)
    for char in chars:
        if state is State.SPACE:
            if char in space:
                continue
            state = State.WORD

        if state is State.WORD:
            if char in space:
                state = State.SPACE
                if allow_blank or word:
                    words.append(''.join(word))
                word = []
                allow_blank = False
                continue
            if char in '\'"':
                quote = char
                allow_blank = True
                state = State.QUOTE
            elif char == TOKEN_ESCAPE:
                escaped = next(chars, None)
                if escaped is None:
                    # Trailing escape token: drop it.
                    break
                # Unquoted: keep the escape token and the escaped character.
                word.append(char)
                char = escaped
        elif state is State.QUOTE:
            if char == quote:
                state = State.WORD
            elif char == TOKEN_ESCAPE and quote != '\'':
                escaped = next(chars, None)
                if escaped is None:
                    # Trailing escape token: drop it.
                    break
                # Double-quoted: the escape token is consumed and only the
                # escaped character is kept, so \" becomes ".
                char = escaped
        word.append(char)

    if state is not State.SPACE and (allow_blank or word):
        words.append(''.join(word))
    return words
# Demo: run the parser on the six example inputs from the question's table.
print(parse('foo'))
print(parse('foo bar'))
print(parse('foo bar \'abc xyz\''))
print(parse('foo bar "abc xyz"'))
print(parse('foo bar "abc xyz"\\'))
print(parse('foo bar "abc \\"def\\" xyz"'))

Can I ask what the purpose of allow_blank is? I didn't need it in my original code, as I just used the state instead?
@JackWilsdon It's from the Go code you linked to. From memory, and I've not looked at this for a week, your code may have a/some edge-cases if you remove it.
It seems that the logic for the TOKEN_ESCAPE section is slightly wrong. We should be appending the new char to the word and then continuing (i.e. skipping the current iteration) instead of appending the old char and then the new char, as this ends up giving us the escaping backslashes too. I just replaced char = new_char with continue which seems to have fixed the issue.

Stack Exchange Network

Quoted string parser

1 Answer 1

You must log in to answer this question.

Hot Network Questions

Quoted string parser

1 Answer 1

You must log in to answer this question.

Related

Hot Network Questions