[Python-checkins] cpython: Issue #19380: Optimized parsing of regular expressions.

Fri Oct 10 10:16:17 CEST 2014

https://hg.python.org/cpython/rev/1adeac2a8714
changeset: 92911:1adeac2a8714
user: Serhiy Storchaka <storchaka at gmail.com>
date: Fri Oct 10 11:14:49 2014 +0300
summary:
 Issue #19380: Optimized parsing of regular expressions.
files:
 Lib/sre_parse.py | 270 +++++++++++++++-------------------
 Misc/NEWS | 4 +-
 2 files changed, 123 insertions(+), 151 deletions(-)

diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -18,12 +18,15 @@
 SPECIAL_CHARS = ".\\[{()*+?^$|"
 REPEAT_CHARS = "*+?{"
 
-DIGITS = set("0123456789")
+DIGITS = frozenset("0123456789")
 
-OCTDIGITS = set("01234567")
-HEXDIGITS = set("0123456789abcdefABCDEF")
+OCTDIGITS = frozenset("01234567")
+HEXDIGITS = frozenset("0123456789abcdefABCDEF")
 
-WHITESPACE = set(" \t\n\r\v\f")
+WHITESPACE = frozenset(" \t\n\r\v\f")
+
+_REPEATCODES = frozenset((MIN_REPEAT, MAX_REPEAT))
+_UNITCODES = frozenset((ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY))
 
 ESCAPES = {
 r"\a": (LITERAL, ord("\a")),
@@ -153,11 +156,9 @@
 self.data.append(code)
 def getwidth(self):
 # determine the width (min, max) for this subpattern
- if self.width:
+ if self.width is not None:
 return self.width
 lo = hi = 0
- UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
- REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
 for op, av in self.data:
 if op is BRANCH:
 i = MAXREPEAT - 1
@@ -176,11 +177,11 @@
 i, j = av[1].getwidth()
 lo = lo + i
 hi = hi + j
- elif op in REPEATCODES:
+ elif op in _REPEATCODES:
 i, j = av[2].getwidth()
 lo = lo + i * av[0]
 hi = hi + j * av[1]
- elif op in UNITCODES:
+ elif op in _UNITCODES:
 lo = lo + 1
 hi = hi + 1
 elif op == SUCCESS:
@@ -191,34 +192,31 @@
 class Tokenizer:
 def __init__(self, string):
 self.istext = isinstance(string, str)
+ if not self.istext:
+ string = str(string, 'latin1')
 self.string = string
 self.index = 0
 self.__next()
 def __next(self):
- if self.index >= len(self.string):
+ index = self.index
+ try:
+ char = self.string[index]
+ except IndexError:
 self.next = None
 return
- char = self.string[self.index:self.index+1]
- # Special case for the str8, since indexing returns a integer
- # XXX This is only needed for test_bug_926075 in test_re.py
- if char and not self.istext:
- char = chr(char[0])
 if char == "\\":
+ index += 1
 try:
- c = self.string[self.index + 1]
+ char += self.string[index]
 except IndexError:
 raise error("bogus escape (end of line)")
- if not self.istext:
- c = chr(c)
- char = char + c
- self.index = self.index + len(char)
+ self.index = index + 1
 self.next = char
- def match(self, char, skip=1):
+ def match(self, char):
 if char == self.next:
- if skip:
- self.__next()
- return 1
- return 0
+ self.__next()
+ return True
+ return False
 def get(self):
 this = self.next
 self.__next()
@@ -232,6 +230,17 @@
 result += c
 self.__next()
 return result
+ def getuntil(self, terminator):
+ result = ''
+ while True:
+ c = self.next
+ self.__next()
+ if c is None:
+ raise error("unterminated name")
+ if c == terminator:
+ break
+ result += c
+ return result
 def tell(self):
 return self.index, self.next
 def seek(self, index):
@@ -270,7 +279,7 @@
 if code:
 return code
 code = CATEGORIES.get(escape)
- if code and code[0] == IN:
+ if code and code[0] is IN:
 return code
 try:
 c = escape[1:2]
@@ -279,7 +288,7 @@
 escape += source.getwhile(2, HEXDIGITS)
 if len(escape) != 4:
 raise ValueError
- return LITERAL, int(escape[2:], 16) & 0xff
+ return LITERAL, int(escape[2:], 16)
 elif c == "u" and source.istext:
 # unicode escape (exactly four digits)
 escape += source.getwhile(4, HEXDIGITS)
@@ -325,7 +334,7 @@
 escape += source.getwhile(2, HEXDIGITS)
 if len(escape) != 4:
 raise ValueError
- return LITERAL, int(escape[2:], 16) & 0xff
+ return LITERAL, int(escape[2:], 16)
 elif c == "u" and source.istext:
 # unicode escape (exactly four digits)
 escape += source.getwhile(4, HEXDIGITS)
@@ -347,11 +356,11 @@
 elif c in DIGITS:
 # octal escape *or* decimal group reference (sigh)
 if source.next in DIGITS:
- escape = escape + source.get()
+ escape += source.get()
 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
 source.next in OCTDIGITS):
 # got three octal digits; this is an octal escape
- escape = escape + source.get()
+ escape += source.get()
 c = int(escape[1:], 8)
 if c > 0o377:
 raise error('octal escape value %r outside of '
@@ -370,22 +379,18 @@
 pass
 raise error("bogus escape: %s" % repr(escape))
 
-def _parse_sub(source, state, nested=1):
+def _parse_sub(source, state, nested=True):
 # parse an alternation: a|b|c
 
 items = []
 itemsappend = items.append
 sourcematch = source.match
- while 1:
+ while True:
 itemsappend(_parse(source, state))
- if sourcematch("|"):
- continue
- if not nested:
+ if not sourcematch("|"):
 break
- if not source.next or sourcematch(")", 0):
- break
- else:
- raise error("pattern not properly closed")
+ if nested and source.next is not None and source.next != ")":
+ raise error("pattern not properly closed")
 
 if len(items) == 1:
 return items[0]
@@ -394,7 +399,7 @@
 subpatternappend = subpattern.append
 
 # check if all items share a common prefix
- while 1:
+ while True:
 prefix = None
 for item in items:
 if not item:
@@ -414,16 +419,12 @@
 
 # check if the branch can be replaced by a character set
 for item in items:
- if len(item) != 1 or item[0][0] != LITERAL:
+ if len(item) != 1 or item[0][0] is not LITERAL:
 break
 else:
 # we can store this as a character set instead of a
 # branch (the compiler may optimize this even more)
- set = []
- setappend = set.append
- for item in items:
- setappend(item[0])
- subpatternappend((IN, set))
+ subpatternappend((IN, [item[0] for item in items]))
 return subpattern
 
 subpattern.append((BRANCH, (None, items)))
@@ -433,21 +434,16 @@
 item_yes = _parse(source, state)
 if source.match("|"):
 item_no = _parse(source, state)
- if source.match("|"):
+ if source.next == "|":
 raise error("conditional backref with more than two branches")
 else:
 item_no = None
- if source.next and not source.match(")", 0):
+ if source.next is not None and source.next != ")":
 raise error("pattern not properly closed")
 subpattern = SubPattern(state)
 subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
 return subpattern
 
-_PATTERNENDERS = set("|)")
-_ASSERTCHARS = set("=!<")
-_LOOKBEHINDASSERTCHARS = set("=!")
-_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])
-
 def _parse(source, state):
 # parse a simple pattern
 subpattern = SubPattern(state)
@@ -457,32 +453,35 @@
 sourceget = source.get
 sourcematch = source.match
 _len = len
- PATTERNENDERS = _PATTERNENDERS
- ASSERTCHARS = _ASSERTCHARS
- LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
- REPEATCODES = _REPEATCODES
+ _ord = ord
+ verbose = state.flags & SRE_FLAG_VERBOSE
 
- while 1:
+ while True:
 
- if source.next in PATTERNENDERS:
- break # end of subpattern
- this = sourceget()
+ this = source.next
 if this is None:
 break # end of pattern
+ if this in "|)":
+ break # end of subpattern
+ sourceget()
 
- if state.flags & SRE_FLAG_VERBOSE:
+ if verbose:
 # skip whitespace and comments
 if this in WHITESPACE:
 continue
 if this == "#":
- while 1:
+ while True:
 this = sourceget()
- if this in (None, "\n"):
+ if this is None or this == "\n":
 break
 continue
 
- if this and this[0] not in SPECIAL_CHARS:
- subpatternappend((LITERAL, ord(this)))
+ if this[0] == "\\":
+ code = _escape(source, this, state)
+ subpatternappend(code)
+
+ elif this not in SPECIAL_CHARS:
+ subpatternappend((LITERAL, _ord(this)))
 
 elif this == "[":
 # character set
@@ -494,39 +493,38 @@
 setappend((NEGATE, None))
 # check remaining characters
 start = set[:]
- while 1:
+ while True:
 this = sourceget()
+ if this is None:
+ raise error("unexpected end of regular expression")
 if this == "]" and set != start:
 break
- elif this and this[0] == "\\":
+ elif this[0] == "\\":
 code1 = _class_escape(source, this)
- elif this:
- code1 = LITERAL, ord(this)
 else:
- raise error("unexpected end of regular expression")
+ code1 = LITERAL, _ord(this)
 if sourcematch("-"):
 # potential range
 this = sourceget()
+ if this is None:
+ raise error("unexpected end of regular expression")
 if this == "]":
 if code1[0] is IN:
 code1 = code1[1][0]
 setappend(code1)
- setappend((LITERAL, ord("-")))
+ setappend((LITERAL, _ord("-")))
 break
- elif this:
- if this[0] == "\\":
- code2 = _class_escape(source, this)
- else:
- code2 = LITERAL, ord(this)
- if code1[0] != LITERAL or code2[0] != LITERAL:
- raise error("bad character range")
- lo = code1[1]
- hi = code2[1]
- if hi < lo:
- raise error("bad character range")
- setappend((RANGE, (lo, hi)))
+ if this[0] == "\\":
+ code2 = _class_escape(source, this)
 else:
- raise error("unexpected end of regular expression")
+ code2 = LITERAL, _ord(this)
+ if code1[0] != LITERAL or code2[0] != LITERAL:
+ raise error("bad character range")
+ lo = code1[1]
+ hi = code2[1]
+ if hi < lo:
+ raise error("bad character range")
+ setappend((RANGE, (lo, hi)))
 else:
 if code1[0] is IN:
 code1 = code1[1][0]
@@ -541,7 +539,7 @@
 # XXX: <fl> should add charmap optimization here
 subpatternappend((IN, set))
 
- elif this and this[0] in REPEAT_CHARS:
+ elif this in REPEAT_CHARS:
 # repeat previous item
 if this == "?":
 min, max = 0, 1
@@ -552,20 +550,20 @@
 min, max = 1, MAXREPEAT
 elif this == "{":
 if source.next == "}":
- subpatternappend((LITERAL, ord(this)))
+ subpatternappend((LITERAL, _ord(this)))
 continue
 here = source.tell()
 min, max = 0, MAXREPEAT
 lo = hi = ""
 while source.next in DIGITS:
- lo = lo + source.get()
+ lo += sourceget()
 if sourcematch(","):
 while source.next in DIGITS:
- hi = hi + sourceget()
+ hi += sourceget()
 else:
 hi = lo
 if not sourcematch("}"):
- subpatternappend((LITERAL, ord(this)))
+ subpatternappend((LITERAL, _ord(this)))
 source.seek(here)
 continue
 if lo:
@@ -587,7 +585,7 @@
 item = None
 if not item or (_len(item) == 1 and item[0][0] == AT):
 raise error("nothing to repeat")
- if item[0][0] in REPEATCODES:
+ if item[0][0] in _REPEATCODES:
 raise error("multiple repeat")
 if sourcematch("?"):
 subpattern[-1] = (MIN_REPEAT, (min, max, item))
@@ -604,18 +602,14 @@
 if sourcematch("?"):
 group = 0
 # options
- if sourcematch("P"):
+ char = sourceget()
+ if char is None:
+ raise error("unexpected end of pattern")
+ if char == "P":
 # python extensions
 if sourcematch("<"):
 # named group: skip forward to end of name
- name = ""
- while 1:
- char = sourceget()
- if char is None:
- raise error("unterminated name")
- if char == ">":
- break
- name = name + char
+ name = source.getuntil(">")
 group = 1
 if not name:
 raise error("missing group name")
@@ -623,14 +617,7 @@
 raise error("bad character in group name %r" % name)
 elif sourcematch("="):
 # named backreference
- name = ""
- while 1:
- char = sourceget()
- if char is None:
- raise error("unterminated name")
- if char == ")":
- break
- name = name + char
+ name = source.getuntil(")")
 if not name:
 raise error("missing group name")
 if not name.isidentifier():
@@ -647,27 +634,25 @@
 if char is None:
 raise error("unexpected end of pattern")
 raise error("unknown specifier: ?P%s" % char)
- elif sourcematch(":"):
+ elif char == ":":
 # non-capturing group
 group = 2
- elif sourcematch("#"):
+ elif char == "#":
 # comment
- while 1:
- if source.next is None or source.next == ")":
+ while True:
+ if source.next is None:
+ raise error("unbalanced parenthesis")
+ if sourceget() == ")":
 break
- sourceget()
- if not sourcematch(")"):
- raise error("unbalanced parenthesis")
 continue
- elif source.next in ASSERTCHARS:
+ elif char in "=!<":
 # lookahead assertions
- char = sourceget()
 dir = 1
 if char == "<":
- if source.next not in LOOKBEHINDASSERTCHARS:
+ char = sourceget()
+ if char is None or char not in "=!":
 raise error("syntax error")
 dir = -1 # lookbehind
- char = sourceget()
 p = _parse_sub(source, state)
 if not sourcematch(")"):
 raise error("unbalanced parenthesis")
@@ -676,16 +661,9 @@
 else:
 subpatternappend((ASSERT_NOT, (dir, p)))
 continue
- elif sourcematch("("):
+ elif char == "(":
 # conditional backreference group
- condname = ""
- while 1:
- char = sourceget()
- if char is None:
- raise error("unterminated name")
- if char == ")":
- break
- condname = condname + char
+ condname = source.getuntil(")")
 group = 2
 if not condname:
 raise error("missing group name")
@@ -705,12 +683,14 @@
 raise error("bad group number")
 if condgroup >= MAXGROUPS:
 raise error("the group number is too large")
+ elif char in FLAGS:
+ # flags
+ state.flags |= FLAGS[char]
+ while source.next in FLAGS:
+ state.flags |= FLAGS[sourceget()]
+ verbose = state.flags & SRE_FLAG_VERBOSE
 else:
- # flags
- if not source.next in FLAGS:
- raise error("unexpected end of pattern")
- while source.next in FLAGS:
- state.flags = state.flags | FLAGS[sourceget()]
+ raise error("unexpected end of pattern " + char)
 if group:
 # parse group contents
 if group == 2:
@@ -728,7 +708,7 @@
 state.closegroup(group)
 subpatternappend((SUBPATTERN, (group, p)))
 else:
- while 1:
+ while True:
 char = sourceget()
 if char is None:
 raise error("unexpected end of pattern")
@@ -742,10 +722,6 @@
 elif this == "$":
 subpattern.append((AT, AT_END))
 
- elif this and this[0] == "\\":
- code = _escape(source, this, state)
- subpatternappend(code)
-
 else:
 raise error("parser error")
 
@@ -776,11 +752,11 @@
 p = _parse_sub(source, pattern, 0)
 p.pattern.flags = fix_flags(str, p.pattern.flags)
 
- tail = source.get()
- if tail == ")":
- raise error("unbalanced parenthesis")
- elif tail:
- raise error("bogus characters at end of regular expression")
+ if source.next is not None:
+ if source.next == ")":
+ raise error("unbalanced parenthesis")
+ else:
+ raise error("bogus characters at end of regular expression")
 
 if flags & SRE_FLAG_DEBUG:
 p.dump()
@@ -817,13 +793,7 @@
 if c == "g":
 name = ""
 if s.match("<"):
- while True:
- char = sget()
- if char is None:
- raise error("unterminated group name")
- if char == ">":
- break
- name += char
+ name = s.getuntil(">")
 if not name:
 raise error("missing group name")
 try:
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -166,7 +166,9 @@
 Library
 -------
 
-- Issue 1519638: Now unmatched groups are replaced with empty strings in re.sub()
+- Issue #19380: Optimized parsing of regular expressions.
+
+- Issue #1519638: Now unmatched groups are replaced with empty strings in re.sub()
 and re.subn().
 
 - Issue #18615: sndhdr.what/whathdr now return a namedtuple.
-- 
Repository URL: https://hg.python.org/cpython