3 * This file is #included by regexec.c.
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 * src/backend/regex/rege_dfa.c
36 * longest - longest-preferred matching engine
38 * On success, returns match endpoint address. Returns NULL on no match.
39 * Internal errors also return NULL, with v->err set.
44 chr *
start,
/* where the match should start */
45 chr *stop,
/* match must end at or before here */
46 int *hitstopp)
/* record whether hit v->stop, if non-NULL */
49 chr *realstop = (stop == v->
stop) ? stop : stop + 1;
57 /* prevent "uninitialized variable" warnings */
61 /* if this is a backref to a known string, just match against that */
68 if (cp ==
v->
stop && stop ==
v->
stop && hitstopp != NULL)
74 /* fast path for matchall NFAs */
77 size_t nchr = stop -
start;
84 if (stop ==
v->
stop && hitstopp != NULL)
89 if (stop ==
v->
stop && nchr <= maxmatchall + 1 && hitstopp != NULL)
91 if (nchr > maxmatchall)
92 return start + maxmatchall;
104 FDEBUG((
"+++ startup +++\n"));
108 FDEBUG((
"color %ld\n", (
long) co));
113 FDEBUG((
"char %c, color %ld\n", (
char) *(cp - 1), (
long) co));
121 * This is the main text-scanning loop. It seems worth having two copies
122 * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
123 * builds, when you're not actively tracing.
128 while (cp < realstop)
130 FDEBUG((
"+++ at c%d +++\n", (
int) (css - d->
ssets)));
132 FDEBUG((
"char %c, color %ld\n", (
char) *cp, (
long) co));
138 break;
/* NOTE BREAK OUT */
148 while (cp < realstop)
156 break;
/* NOTE BREAK OUT */
168 FDEBUG((
"+++ shutdown at c%d +++\n", (
int) (css - d->
ssets)));
171 if (hitstopp != NULL)
174 FDEBUG((
"color %ld\n", (
long) co));
178 /* special case: match ended at eol? */
185 /* find last match, if any */
189 (post == NULL || post < ss->lastseen))
191 if (post != NULL)
/* found one */
198 * shortest - shortest-preferred matching engine
200 * On success, returns match endpoint address. Returns NULL on no match.
201 * Internal errors also return NULL, with v->err set.
206 chr *
start,
/* where the match should start */
207 chr *min,
/* match must end at or after here */
208 chr *
max,
/* match must end at or before here */
209 chr **coldp,
/* store coldstart pointer here, if non-NULL */
210 int *hitstopp)
/* record whether hit v->stop, if non-NULL */
213 chr *realmin = (min ==
v->
stop) ? min : min + 1;
220 /* prevent "uninitialized variable" warnings */
223 if (hitstopp != NULL)
226 /* if this is a backref to a known string, just match against that */
233 if (cp != NULL && coldp != NULL)
235 /* there is no case where we should set *hitstopp */
240 /* fast path for matchall NFAs */
243 size_t nchr = min -
start;
254 /* there is no case where we should set *hitstopp */
265 FDEBUG((
"--- startup ---\n"));
269 FDEBUG((
"color %ld\n", (
long) co));
274 FDEBUG((
"char %c, color %ld\n", (
char) *(cp - 1), (
long) co));
283 * This is the main text-scanning loop. It seems worth having two copies
284 * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
285 * builds, when you're not actively tracing.
292 FDEBUG((
"--- at c%d ---\n", (
int) (css - d->
ssets)));
294 FDEBUG((
"char %c, color %ld\n", (
char) *cp, (
long) co));
300 break;
/* NOTE BREAK OUT */
306 break;
/* NOTE BREAK OUT */
320 break;
/* NOTE BREAK OUT */
326 break;
/* NOTE BREAK OUT */
333 if (coldp != NULL)
/* report last no-progress state set, if any */
344 FDEBUG((
"color %ld\n", (
long) co));
346 /* match might have ended at eol */
358 * matchuntil - incremental matching engine
360 * This is meant for use with a search-style NFA (that is, the pattern is
361 * known to act as though it had a leading .*). We determine whether a
362 * match exists starting at v->start and ending at probe. Multiple calls
363 * require only O(N) time not O(N^2) so long as the probe values are
364 * nondecreasing. *lastcss and *lastcp must be initialized to NULL before
365 * starting a series of calls.
367 * Returns 1 if a match exists, 0 if not.
368 * Internal errors also return 0, with v->err set.
373 chr *probe,
/* we want to know if a match ends here */
374 struct sset **lastcss,
/* state storage across calls */
375 chr **lastcp)
/* state storage across calls */
379 struct sset *css = *lastcss;
383 /* fast path for matchall NFAs */
386 size_t nchr = probe -
v->
start;
390 /* maxmatchall will always be infinity, cf. makesearch() */
395 /* initialize and startup, or restart, if necessary */
396 if (cp == NULL || cp > probe)
403 FDEBUG((
">>> startup >>>\n"));
405 FDEBUG((
"color %ld\n", (
long) co));
412 else if (css == NULL)
414 /* we previously found that no match is possible beyond *lastcp */
420 * This is the main text-scanning loop. It seems worth having two copies
421 * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
422 * builds, when you're not actively tracing.
429 FDEBUG((
">>> at c%d >>>\n", (
int) (css - d->
ssets)));
431 FDEBUG((
"char %c, color %ld\n", (
char) *cp, (
long) co));
437 break;
/* NOTE BREAK OUT */
455 break;
/* NOTE BREAK OUT */
467 return 0;
/* impossible match, or internal error */
469 /* We need to process one more chr, or the EOS symbol, to check match */
472 FDEBUG((
">>> at c%d >>>\n", (
int) (css - d->
ssets)));
474 FDEBUG((
"char %c, color %ld\n", (
char) *cp, (
long) co));
483 FDEBUG((
"color %ld\n", (
long) co));
494 * dfa_backref - find best match length for a known backref string
496 * When the backref's referent is already available, we can deliver an exact
497 * answer with considerably less work than running the backref node's NFA.
499 * Return match endpoint for longest or shortest valid repeated match,
500 * or NULL if there is no valid match.
502 * Should be in sync with cbrdissect(), although that has the different task
503 * of checking a match to a predetermined section of the string.
508 chr *
start,
/* where the match should start */
509 chr *min,
/* match must end at or after here */
510 chr *
max,
/* match must end at or before here */
523 /* get the backreferenced string (caller should have checked this) */
529 /* special-case zero-length backreference to avoid divide by zero */
533 * matches only a zero-length string, but any number of repetitions
534 * can be considered to be present
536 if (min ==
start && backmin <= backmax)
542 * convert min and max into numbers of possible repetitions of the backref
543 * string, rounding appropriately
548 minreps = (min -
start - 1) / brlen + 1;
551 /* apply bounds, then see if there is any allowed match length */
552 if (minreps < backmin)
554 if (backmax !=
DUPINF && maxreps > backmax)
556 if (maxreps < minreps)
559 /* quick exit if zero-repetitions match is valid and preferred */
563 /* okay, compare the actual string contents */
566 while (numreps < maxreps)
568 if ((*
v->
g->compare) (brstring, p, brlen) != 0)
576 if (numreps >= minreps)
582 * lastcold - determine last point at which no progress had been made
584static chr *
/* endpoint, or NULL */
602 * newdfa - set up a fresh DFA
604 * Returns NULL (and sets v->err) on failure.
610 struct smalldfa *sml)
/* preallocated space, may be NULL */
654 sizeof(
struct sset *));
656 sizeof(
struct arcp));
659 /* now freedfa() will behave sanely */
679 d->
backno = -1;
/* may be set by caller */
682 /* initialization of sset fields is done as needed */
688 * freedfa - free a DFA
695 if (d->
ssets != NULL)
710 * hash - construct a hash code for a bitvector
712 * There are probably better ways, but they're more expensive.
722 for (
i = 0;
i < n;
i++)
728 * initialize - hand-craft a cache entry for startup, otherwise get ready
738 /* is previous one still there? */
742 {
/* no, must (re)build it */
752 /* lastseen dealt with below */
764 * miss - handle a stateset cache miss
766 * css is the current stateset, co is the color of the current input character,
767 * cp points to the character after that (which is where we may need to test
768 * LACONs). start does not affect matching behavior but is needed for pickss'
769 * heuristics about which stateset cache entry to replace.
771 * Ordinarily, returns the address of the next stateset (the one that is
772 * valid after consuming the input character). Returns NULL if no valid
773 * NFA states remain, ie we have a certain match failure.
774 * Internal errors also return NULL, with v->err set.
781 chr *cp,
/* next chr */
782 chr *
start)
/* where the attempt got started */
796 /* for convenience, we can be called even if it might not be a miss */
797 if (css->
outs[co] != NULL)
800 return css->
outs[co];
805 * Checking for operation cancel in the inner text search loop seems
806 * unduly expensive. As a compromise, check during cache misses.
811 * What set of states would we end up in after consuming the co character?
812 * We first consider PLAIN arcs that consume the character, and then look
813 * to see what LACON arcs could be traversed after consuming it.
816 d->
work[
i] = 0;
/* build new stateset bitmap in d->work */
836 return NULL;
/* character cannot reach any new state */
839 /* outer loop handles transitive closure of reachable-by-LACON states */
848 continue;
/* not a LACON arc */
850 continue;
/* arc would be a no-op anyway */
851 sawlacons = 1;
/* this LACON affects our result */
856 continue;
/* LACON arc cannot be traversed */
871 /* Is this stateset already in the cache? */
876 break;
/* NOTE BREAK OUT */
879 {
/* nope, need a new cache entry */
890 /* lastseen to be dealt with by caller */
894 * Link new stateset to old, unless a LACON affected the result, in which
895 * case we don't create the link. That forces future transitions across
896 * this same arc (same prior stateset and character color) to come through
897 * miss() again, so that we can recheck the LACON(s), which might or might
898 * not pass since context will be different.
903 (
int) (css - d->
ssets), co, (
int) (p - d->
ssets)));
913 * lacon - lookaround-constraint checker for miss()
915static int /* predicate: constraint satisfied? */
917 struct cnfa *pcnfa,
/* parent cnfa */
919 color co)
/* "color" of the lookaround constraint */
927 /* Since this is recursive, it could be driven to stack overflow */
936 FDEBUG((
"=== testing lacon %d\n", n));
943 /* used to use longest() here, but shortest() could be much cheaper */
945 (
chr **) NULL, (
int *) NULL);
951 * To avoid doing O(N^2) work when repeatedly testing a lookbehind
952 * constraint in an N-character string, we use matchuntil() which can
953 * cache the DFA state across calls. We only need to restart if the
954 * probe point decreases, which is not common. The NFA we're using is
955 * a search NFA, so it doesn't mind scanning over stuff before the
960 satisfied = !satisfied;
962 FDEBUG((
"=== lacon %d satisfied %d\n", n, satisfied));
967 * getvacant - get a vacant state set
969 * This routine clears out the inarcs and outarcs, but does not otherwise
970 * clear the innards of the state set -- that's up to the caller.
989 /* clear out its inarcs, including self-referential ones */
991 while ((p = ap.
ss) != NULL)
994 FDEBUG((
"zapping c%d's %ld outarc\n", (
int) (p - d->
ssets), (
long)
co));
1001 /* take it off the inarc chains of the ssets reached by its outarcs */
1005 assert(p !=
ss);
/* not self-referential */
1007 continue;
/* NOTE CONTINUE */
1008 FDEBUG((
"del outarc %d from c%d's in chn\n",
i, (
int) (p - d->
ssets)));
1013 struct arcp lastap = {NULL, 0};
1016 for (ap = p->
ins; ap.
ss != NULL &&
1027 /* if ss was a success state, may need to remember location */
1032 /* likewise for a no-progress state */
1041 * pickss - pick the next stateset to be used
1054 /* shortcut for cases where cache isn't full */
1061 /* set up innards */
1076 /* look for oldest, or old enough anyway */
1077 if (cp -
start > d->
nssets * 2 / 3)
/* oldest 33% are expendable */
1078 ancient = cp - d->
nssets * 2 / 3;
1086 FDEBUG((
"replacing c%d\n", (
int) (ss - d->
ssets)));
1089 for (ss = d->
ssets, end = d->
search; ss < end; ss++)
1094 FDEBUG((
"replacing c%d\n", (
int) (ss - d->
ssets)));
1098 /* nobody's old enough?!? -- something's really wrong */
1099 FDEBUG((
"cannot find victim to replace!\n"));
#define HASH(sign, val, siglen)
static void freedfa(struct dfa *d)
static chr * shortest(struct vars *v, struct dfa *d, chr *start, chr *min, chr *max, chr **coldp, int *hitstopp)
static chr * dfa_backref(struct vars *v, struct dfa *d, chr *start, chr *min, chr *max, bool shortest)
static int lacon(struct vars *v, struct cnfa *pcnfa, chr *cp, color co)
static unsigned hash(unsigned *uv, int n)
static chr * lastcold(struct vars *v, struct dfa *d)
static chr * longest(struct vars *v, struct dfa *d, chr *start, chr *stop, int *hitstopp)
static struct sset * getvacant(struct vars *v, struct dfa *d, chr *cp, chr *start)
static struct sset * miss(struct vars *v, struct dfa *d, struct sset *css, color co, chr *cp, chr *start)
static struct dfa * newdfa(struct vars *v, struct cnfa *cnfa, struct colormap *cm, struct smalldfa *sml)
static struct sset * initialize(struct vars *v, struct dfa *d, chr *start)
static struct sset * pickss(struct vars *v, struct dfa *d, chr *cp, chr *start)
static int matchuntil(struct vars *v, struct dfa *d, chr *probe, struct sset **lastcss, chr **lastcp)
static struct dfa * getladfa(struct vars *v, int n)
#define HIT(h, bv, ss, nw)
#define STACK_TOO_DEEP(re)
#define LATYPE_IS_AHEAD(la)
#define LATYPE_IS_POS(la)
unsigned statesarea[FEWSTATES *2+WORK]
struct arcp incarea[FEWSTATES *2 *FEWCOLORS]
struct sset * outsarea[FEWSTATES *2 *FEWCOLORS]
struct sset ssets[FEWSTATES *2]