[Python-checkins] cpython: Issue #24426: Fast searching optimization in regular expressions now works

serhiy.storchaka python-checkins at python.org
Sun Jun 21 13:07:18 CEST 2015


https://hg.python.org/cpython/rev/7e46a503dd16
changeset: 96622:7e46a503dd16
user: Serhiy Storchaka <storchaka at gmail.com>
date: Sun Jun 21 14:06:55 2015 +0300
summary:
 Issue #24426: Fast searching optimization in regular expressions now works
for patterns that starts with capturing groups. Fast searching optimization
now can't be disabled at compile time.
files:
 Lib/sre_compile.py | 117 +++++++++++++++++++-------------
 Misc/NEWS | 4 +
 Modules/_sre.c | 3 -
 Modules/sre_lib.h | 53 +++++++-------
 4 files changed, 99 insertions(+), 78 deletions(-)
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -409,57 +409,39 @@
 table[i] = idx + 1
 return table
 
-def _compile_info(code, pattern, flags):
- # internal: compile an info block. in the current version,
- # this contains min/max pattern width, and an optional literal
- # prefix or a character map
- lo, hi = pattern.getwidth()
- if hi > MAXCODE:
- hi = MAXCODE
- if lo == 0:
- code.extend([INFO, 4, 0, lo, hi])
- return
- # look for a literal prefix
+def _get_literal_prefix(pattern):
+ # look for literal prefix
 prefix = []
 prefixappend = prefix.append
- prefix_skip = 0
+ prefix_skip = None
+ got_all = True
+ for op, av in pattern.data:
+ if op is LITERAL:
+ prefixappend(av)
+ elif op is SUBPATTERN:
+ prefix1, prefix_skip1, got_all = _get_literal_prefix(av[1])
+ if prefix_skip is None:
+ if av[0] is not None:
+ prefix_skip = len(prefix)
+ elif prefix_skip1 is not None:
+ prefix_skip = len(prefix) + prefix_skip1
+ prefix.extend(prefix1)
+ if not got_all:
+ break
+ else:
+ got_all = False
+ break
+ return prefix, prefix_skip, got_all
+
+def _get_charset_prefix(pattern):
 charset = [] # not used
 charsetappend = charset.append
- if not (flags & SRE_FLAG_IGNORECASE):
- # look for literal prefix
- for op, av in pattern.data:
+ if pattern.data:
+ op, av = pattern.data[0]
+ if op is SUBPATTERN and av[1]:
+ op, av = av[1][0]
 if op is LITERAL:
- if len(prefix) == prefix_skip:
- prefix_skip = prefix_skip + 1
- prefixappend(av)
- elif op is SUBPATTERN and len(av[1]) == 1:
- op, av = av[1][0]
- if op is LITERAL:
- prefixappend(av)
- else:
- break
- else:
- break
- # if no prefix, look for charset prefix
- if not prefix and pattern.data:
- op, av = pattern.data[0]
- if op is SUBPATTERN and av[1]:
- op, av = av[1][0]
- if op is LITERAL:
- charsetappend((op, av))
- elif op is BRANCH:
- c = []
- cappend = c.append
- for p in av[1]:
- if not p:
- break
- op, av = p[0]
- if op is LITERAL:
- cappend((op, av))
- else:
- break
- else:
- charset = c
+ charsetappend((op, av))
 elif op is BRANCH:
 c = []
 cappend = c.append
@@ -473,8 +455,43 @@
 break
 else:
 charset = c
- elif op is IN:
- charset = av
+ elif op is BRANCH:
+ c = []
+ cappend = c.append
+ for p in av[1]:
+ if not p:
+ break
+ op, av = p[0]
+ if op is LITERAL:
+ cappend((op, av))
+ else:
+ break
+ else:
+ charset = c
+ elif op is IN:
+ charset = av
+ return charset
+
+def _compile_info(code, pattern, flags):
+ # internal: compile an info block. in the current version,
+ # this contains min/max pattern width, and an optional literal
+ # prefix or a character map
+ lo, hi = pattern.getwidth()
+ if hi > MAXCODE:
+ hi = MAXCODE
+ if lo == 0:
+ code.extend([INFO, 4, 0, lo, hi])
+ return
+ # look for a literal prefix
+ prefix = []
+ prefix_skip = 0
+ charset = [] # not used
+ if not (flags & SRE_FLAG_IGNORECASE):
+ # look for literal prefix
+ prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
+ # if no prefix, look for charset prefix
+ if not prefix:
+ charset = _get_charset_prefix(pattern)
 ## if prefix:
 ## print("*** PREFIX", prefix, prefix_skip)
 ## if charset:
@@ -487,7 +504,7 @@
 mask = 0
 if prefix:
 mask = SRE_INFO_PREFIX
- if len(prefix) == prefix_skip == len(pattern.data):
+ if prefix_skip is None and got_all:
 mask = mask | SRE_INFO_LITERAL
 elif charset:
 mask = mask | SRE_INFO_CHARSET
@@ -502,6 +519,8 @@
 # add literal prefix
 if prefix:
 emit(len(prefix)) # length
+ if prefix_skip is None:
+ prefix_skip = len(prefix)
 emit(prefix_skip) # skip
 code.extend(prefix)
 # generate overlap table
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -13,6 +13,10 @@
 Library
 -------
 
+- Issue #24426: Fast searching optimization in regular expressions now works
+ for patterns that starts with capturing groups. Fast searching optimization
+ now can't be disabled at compile time.
+
 Documentation
 -------------
 
diff --git a/Modules/_sre.c b/Modules/_sre.c
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -62,9 +62,6 @@
 /* -------------------------------------------------------------------- */
 /* optional features */
 
-/* enables fast searching */
-#define USE_FAST_SEARCH
-
 /* enables copy/deepcopy handling (work in progress) */
 #undef USE_BUILTIN_COPY
 
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -1248,7 +1248,32 @@
 prefix, prefix_len, prefix_skip));
 TRACE(("charset = %p\n", charset));
 
-#if defined(USE_FAST_SEARCH)
+ if (prefix_len == 1) {
+ /* pattern starts with a literal character */
+ SRE_CHAR c = (SRE_CHAR) prefix[0];
+#if SIZEOF_SRE_CHAR < 4
+ if ((SRE_CODE) c != prefix[0])
+ return 0; /* literal can't match: doesn't fit in char width */
+#endif
+ end = (SRE_CHAR *)state->end;
+ while (ptr < end) {
+ while (*ptr != c) {
+ if (++ptr >= end)
+ return 0;
+ }
+ TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
+ state->start = ptr;
+ state->ptr = ptr + prefix_skip;
+ if (flags & SRE_INFO_LITERAL)
+ return 1; /* we got all of it */
+ status = SRE(match)(state, pattern + 2*prefix_skip, 0);
+ if (status != 0)
+ return status;
+ ++ptr;
+ }
+ return 0;
+ }
+
 if (prefix_len > 1) {
 /* pattern starts with a known prefix. use the overlap
 table to skip forward as fast as we possibly can */
@@ -1297,32 +1322,8 @@
 }
 return 0;
 }
-#endif
 
- if (pattern[0] == SRE_OP_LITERAL) {
- /* pattern starts with a literal character. this is used
- for short prefixes, and if fast search is disabled */
- SRE_CHAR c = (SRE_CHAR) pattern[1];
-#if SIZEOF_SRE_CHAR < 4
- if ((SRE_CODE) c != pattern[1])
- return 0; /* literal can't match: doesn't fit in char width */
-#endif
- end = (SRE_CHAR *)state->end;
- while (ptr < end) {
- while (*ptr != c) {
- if (++ptr >= end)
- return 0;
- }
- TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
- state->start = ptr;
- state->ptr = ++ptr;
- if (flags & SRE_INFO_LITERAL)
- return 1; /* we got all of it */
- status = SRE(match)(state, pattern + 2, 0);
- if (status != 0)
- break;
- }
- } else if (charset) {
+ if (charset) {
 /* pattern starts with a character from a known set */
 end = (SRE_CHAR *)state->end;
 for (;;) {
-- 
Repository URL: https://hg.python.org/cpython


More information about the Python-checkins mailing list

AltStyle によって変換されたページ (->オリジナル) /