2 * Internal interface definitions, etc., for the reg package
4 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 * Development of this software was funded, in part, by Cray Research Inc.,
7 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
8 * Corporation, none of whom are responsible for the results. The author
11 * Redistribution and use in source and binary forms -- with or without
12 * modification -- are permitted for any purpose, provided that
13 * redistributions in source form retain this entire copyright notice and
14 * indicate the origin and nature of any modifications.
16 * I'd appreciate being given credit for this package in the documentation
17 * of software which uses it, but that is not a requirement.
19 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
20 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
21 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
22 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * src/include/regex/regguts.h
36 * Environmental customization. It should not (I hope) be necessary to
37 * alter the file you are now reading -- regcustom.h should handle it all,
38 * given care here and elsewhere.
45 * Things that regcustom.h might override.
51#define NDEBUG /* no assertions */
/*
 * Defaults for things regcustom.h might override.  Each default is only
 * installed when the name is not already defined, so an earlier override
 * wins instead of triggering a macro redefinition.
 */

/* for throwing values away */
#ifndef DISCARD
#define DISCARD void
#endif

/* cast something to generic ptr */
#ifndef VS
#define VS(x) ((void *)(x))
#endif

/*
 * function-pointer declarator: "name" is the identifier being declared,
 * "args" is the complete parenthesized parameter list.
 */
#ifndef FUNCPTR
#define FUNCPTR(name, args) (*(name)) args
#endif

/* memory allocation */
#ifndef MALLOC
#define MALLOC(n) malloc(n)
#endif
#ifndef REALLOC
#define REALLOC(p, n) realloc(VS(p), n)
#endif
#ifndef FREE
#define FREE(p) free(VS(p))
#endif
/* want size of a char in bits, and max value in bounded quantifiers */
#ifndef _POSIX2_RE_DUP_MAX
#define _POSIX2_RE_DUP_MAX 255 /* normally from <limits.h> */
#endif

/* largest value accepted in a bounded quantifier */
#define DUPMAX _POSIX2_RE_DUP_MAX
/* one more than DUPMAX, so it can never collide with a legal bound */
#define DUPINF (DUPMAX+1)

#define REMAGIC 0xfed7 /* magic number for main struct */
/*
 * Type codes for lookaround constraints.
 *
 * The codes are a two-bit field: bit 01 is set for the positive variants
 * and bit 02 is set for the lookahead variants, which is what the two
 * test macros below rely on.
 */
#define LATYPE_AHEAD_POS 03 /* positive lookahead */
#define LATYPE_AHEAD_NEG 02 /* negative lookahead */
#define LATYPE_BEHIND_POS 01 /* positive lookbehind */
#define LATYPE_BEHIND_NEG 00 /* negative lookbehind */
/* nonzero if "la" is one of the positive codes */
#define LATYPE_IS_POS(la) ((la) & 01)
/* nonzero if "la" is one of the lookahead codes */
#define LATYPE_IS_AHEAD(la) ((la) & 02)
113 * debugging facilities
/*
 * Tracing variants of the debug macros.  "arglist" is a complete
 * parenthesized printf argument list, e.g. FDEBUG(("state %d\n", n)).
 * Both macros read v->eflags, so a suitable "v" must be in scope at the
 * point of use; arglist is evaluated only when the trace bit is set.
 */
/* FDEBUG does finite-state tracing */
#define FDEBUG(arglist) { if (v->eflags&REG_FTRACE) printf arglist; }
/* MDEBUG does higher-level tracing */
#define MDEBUG(arglist) { if (v->eflags&REG_MTRACE) printf arglist; }
121 #define FDEBUG(arglist) {}
122 #define MDEBUG(arglist) {}
128 * bitmap manipulation
/*
 * Bitmap manipulation on arrays of unsigned: bit "sn" lives in word
 * sn / UBITS, at position sn % UBITS within that word.
 */
/* number of bits in one bitmap word */
#define UBITS (CHAR_BIT * sizeof(unsigned))
/* set bit sn of bitmap uv */
#define BSET(uv, sn) ((uv)[(sn)/UBITS] |= (unsigned)1 << ((sn)%UBITS))
/* nonzero if bit sn of bitmap uv is set */
#define ISBSET(uv, sn) ((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS)))
136 * known character classes
/* number of known character classes */
#define NUM_CCLASSES 14
148 * As soon as possible, we map chrs into equivalence classes -- "colors" --
149 * which are of much more manageable number.
151 * To further reduce the number of arcs in NFAs and DFAs, we also have a
152 * special RAINBOW "color" that can be assigned to an arc. This is not a
153 * real color, in that it has no entry in color maps.
typedef short color; /* colors of characters */

#define MAX_COLOR 32767 /* max color (must fit in 'color' datatype) */
#define COLORLESS (-1) /* impossible color */
#define RAINBOW (-2) /* represents all colors except pseudocolors */
#define WHITE 0 /* default color, parent of all others */
/* Note: various places in the code know that WHITE is zero */
165 * Per-color data structure for the compile-time color machinery
167 * If "sub" is not NOSUB then it is the number of the color's current
168 * subcolor, i.e. we are in process of dividing this color (character
169 * equivalence class) into two colors. See src/backend/regex/README for
170 * discussion of subcolors.
172 * Currently-unused colors have the FREECOL bit set and are linked into a
173 * freelist using their "sub" fields, but only if their color numbers are
174 * less than colormap.max. Any array entries beyond "max" are just garbage.
178 int nschrs;
/* number of simple chars of this color */
179 int nuchrs;
/* number of upper map entries of this color */
180 color sub;
/* open subcolor, if any; or free-chain ptr */
181 #define NOSUB COLORLESS /* value of "sub" when no open subcolor */
182 struct arc *
arcs;
/* chain of all arcs of this color */
184 int flags;
/* bitmask of the following flags: */
185 #define FREECOL 01 /* currently free */
186 #define PSEUDO 02 /* pseudocolor, no real chars */
187 #define COLMARK 04 /* temporary marker used in some functions */
190 #define UNUSEDCOLOR(cd) ((cd)->flags & FREECOL)
193 * The color map itself
195 * This struct holds both data used only at compile time, and the chr to
196 * color mapping information, used at both compile and run time. The latter
197 * is the bulk of the space, so it's not really worth separating out the
198 * compile-only portion.
200 * Ideally, the mapping data would just be an array of colors indexed by
201 * chr codes; but for large character sets that's impractical. Fortunately,
202 * common characters have smaller codes, so we can use a simple array for chr
203 * codes up to MAX_SIMPLE_CHR, and do something more complex for codes above
204 * that, without much loss of performance. The "something more complex" is a
205 * 2-D array of color entries, where row indexes correspond to individual chrs
206 * or chr ranges that have been mentioned in the regex (with row zero
207 * representing all other chrs), and column indexes correspond to different
208 * sets of locale-dependent character classes such as "isalpha". The
209 * classbits[k] entry is zero if we do not care about the k'th character class
210 * in this regex, and otherwise it is the bit to be OR'd into the column index
211 * if the character in question is a member of that class. We find the color
212 * of a high-valued chr by identifying which colormaprange it is in to get
213 * the row index (use row zero if it's in none of them), identifying which of
214 * the interesting cclasses it's in to get the column index, and then indexing
215 * into the 2-D hicolormap array.
217 * The colormapranges are required to be nonempty, nonoverlapping, and to
218 * appear in increasing chr-value order.
223 chr cmin;
/* range represents cmin..cmax inclusive */
225 int rownum;
/* row index in hicolormap array (>= 1) */
231 #define CMMAGIC 0x876
232 struct vars *
v;
/* for compile error reporting */
233 size_t ncds;
/* allocated length of colordescs array */
234 size_t max;
/* highest color number currently in use */
237 #define CDEND(cm) (&(cm)->cd[(cm)->max + 1])
239 /* mapping data for chrs <= MAX_SIMPLE_CHR: */
242 /* mapping data for chrs > MAX_SIMPLE_CHR: */
251 /* If we need up to NINLINECDS, we store them here to save a malloc */
252 #define NINLINECDS ((size_t) 10)
/*
 * fetch color for chr; beware of multiple evaluation of c argument
 *
 * chrs up to MAX_SIMPLE_CHR are looked up directly in cm->locolormap
 * (indexed from CHR_MIN); anything above that is handed to
 * pg_reg_getcolor().
 */
#define GETCOLOR(cm, c) \
	((c) <= MAX_SIMPLE_CHR ? (cm)->locolormap[(c) - CHR_MIN] : pg_reg_getcolor(cm, c))
262 * Interface definitions for locale-interface functions in regc_locale.c.
266 * Representation of a set of characters. chrs[] represents individual
267 * code points, ranges[] represents ranges in the form min..max inclusive.
269 * If the cvec represents a locale-specific character class, eg [[:alpha:]],
270 * then the chrs[] and ranges[] arrays contain only members of that class
271 * up to MAX_SIMPLE_CHR (inclusive). cclasscode is set to regc_locale.c's
272 * code for the class, rather than being -1 as it is in an ordinary cvec.
274 * Note that in cvecs gotten from newcvec() and intended to be freed by
275 * freecvec(), both arrays of chrs are after the end of the struct, not
276 * separately malloc'd; so chrspace and rangespace are effectively immutable.
281 int chrspace;
/* number of chrs allocated in chrs[] */
283 int nranges;
/* number of ranges (chr pairs) */
291 * definitions for NFA internal representation
297 int type;
/* 0 if free, else an NFA arc type code */
298 color co;
/* color the arc matches (possibly RAINBOW) */
301 struct arc *
outchain;
/* link in *from's outs chain or free chain */
303 #define freechain outchain /* we do not maintain "freechainRev" */
306 /* these fields are not used when co == RAINBOW: */
312{
/* for bulk allocation of arcs */
314 size_t narcs;
/* number of arcs allocated in this arcbatch */
/* bytes needed for a struct arcbatch whose trailing a[] array holds n arcs */
#define ARCBATCHSIZE(n) ((n) * sizeof(struct arc) + offsetof(struct arcbatch, a))
/* first batch will have FIRSTABSIZE arcs; then double it until MAXABSIZE */
#define FIRSTABSIZE 64
#define MAXABSIZE 1024
324 int no;
/* state number, zero and up; or FREESTATE */
325 #define FREESTATE (-1)
326 char flag;
/* marks special states */
327 int nins;
/* number of inarcs */
331 struct state *
tmp;
/* temporary for traversal algorithms */
332 struct state *
next;
/* chain for traversing all live states */
333 /* the "next" field is also used to chain free states together */
334 struct state *
prev;
/* back-link in chain of all live states */
338{
/* for bulk allocation of states */
340 size_t nstates;
/* number of states allocated in this batch */
/* bytes needed for a struct statebatch whose trailing s[] array holds n states */
#define STATEBATCHSIZE(n) ((n) * sizeof(struct state) + offsetof(struct statebatch, s))
/* first batch will have FIRSTSBSIZE states; then double it until MAXSBSIZE */
#define FIRSTSBSIZE 32
#define MAXSBSIZE 1024
352 struct state *
final;
/* final state */
361 size_t lastsbused;
/* number of states consumed from *lastsb */
362 size_t lastabused;
/* number of arcs consumed from *lastab */
364 color bos[2];
/* colors, if any, assigned to BOS and BOL */
365 color eos[2];
/* colors, if any, assigned to EOS and EOL */
366 int flags;
/* flags to pass forward to cNFA */
369 struct vars *
v;
/* simplifies compile error reporting */
376 * definitions for compacted NFA
378 * The main space savings in a compacted NFA is from making the arcs as small
379 * as possible. We store only the transition color and next-state number for
380 * each arc. The list of out arcs for each state is an array beginning at
381 * cnfa.states[statenumber], and terminated by a dummy carc struct with co == COLORLESS.
384 * The non-dummy carc structs are of two types: plain arcs and LACON arcs.
385 * Plain arcs just store the transition color number as "co". LACON arcs
386 * store the lookaround constraint number plus cnfa.ncolors as "co". LACON
387 * arcs can be distinguished from plain by testing for co >= cnfa.ncolors.
389 * Note that in a plain arc, "co" can be RAINBOW; since that's negative,
390 * it doesn't break the rule about how to recognize LACON arcs.
392 * We have special markings for "trivial" NFAs that can match any string
393 * (possibly with limits on the number of characters therein). In such a
394 * case, flags & MATCHALL is set (and HASLACONS can't be set). Then the
395 * fields minmatchall and maxmatchall give the minimum and maximum numbers
396 * of characters to match. For example, ".*" produces minmatchall = 0
397 * and maxmatchall = DUPINF, while ".+" produces minmatchall = 1 and
398 * maxmatchall = DUPINF.
403 int to;
/* next-state number */
409 int ncolors;
/* number of colors (max color in use + 1) */
410 int flags;
/* bitmask of the following flags: */
411 #define HASLACONS 01 /* uses lookaround constraints */
412 #define MATCHALL 02 /* matches all strings of a range of lengths */
413 #define HASCANTMATCH 04 /* contains CANTMATCH arcs */
414 /* Note: HASCANTMATCH appears in nfa structs' flags, but never in cnfas */
415 int pre;
/* setup state number */
416 int post;
/* teardown state number */
417 color bos[2];
/* colors, if any, assigned to BOS and BOL */
418 color eos[2];
/* colors, if any, assigned to EOS and EOL */
419 char *
stflags;
/* vector of per-state flags bytes */
420 #define CNFA_NOPROGRESS 01 /* flag bit for a no-progress state */
421 struct carc **
states;
/* vector of pointers to outarc lists */
422 /* states[n] are pointers into a single malloc'd array of arcs */
424 /* these fields are used only in a MATCHALL NFA (else they're -1): */
430 * When debugging, it's helpful if an un-filled CNFA is all-zeroes.
431 * In production, though, we only require nstates to be zero.
434#define ZAPCNFA(cnfa) memset(&(cnfa), 0, sizeof(cnfa))
436 #define ZAPCNFA(cnfa) ((cnfa).nstates = 0)
/* true if cnfa is un-filled, i.e. its nstates field is zero */
#define NULLCNFA(cnfa) ((cnfa).nstates == 0)
441 * This symbol limits the transient heap space used by the regex compiler,
442 * and thereby also the maximum complexity of NFAs that we'll deal with.
443 * Currently we only count NFA states and arcs against this; the other
444 * transient data is generally not large enough to notice compared to those.
445 * Note that we do not charge anything for the final output data structures
446 * (the compacted NFA and the colormap).
447 * The scaling here is based on an empirical measurement that very large
448 * NFAs tend to have about 4 arcs/state.
/*
 * Default compile-space limit, in bytes: room for 500000 states at an
 * assumed 4 arcs per state.  Define REG_MAX_COMPILE_SPACE beforehand to
 * override it.
 */
#ifndef REG_MAX_COMPILE_SPACE
#define REG_MAX_COMPILE_SPACE \
	(500000 * (sizeof(struct state) + 4 * sizeof(struct arc)))
#endif
459 * '=' plain regex without interesting substructure (implemented as DFA)
460 * 'b' back-reference (has no substructure either)
461 * '(' no-op capture node: captures the match of its single child
462 * '.' concatenation: matches a match for first child, then second child
463 * '|' alternation: matches a match for any of its children
464 * '*' iteration: matches some number of matches of its single child
466 * An alternation node can have any number of children (but at least two),
467 * linked through their sibling fields.
469 * A concatenation node must have exactly two children. It might be useful
470 * to support more, but that would complicate the executor. Note that it is
471 * the first child's greediness that determines the node's preference for
472 * where to split a match.
474 * Note: when a backref is directly quantified, we stick the min/max counts
475 * into the backref rather than plastering an iteration node on top. This is
476 * for efficiency: there is no need to search for possible division points.
480 char op;
/* see type codes above */
/* flag bits */
#define LONGER 01 /* prefers longer match */
#define SHORTER 02 /* prefers shorter match */
#define MIXED 04 /* mixed preference below */
#define CAP 010 /* capturing parens here or below */
#define BACKR 020 /* back reference here or below */
#define BRUSE 040 /* is referenced by a back reference */
#define INUSE 0100 /* in use in final tree */
#define UPPROP (MIXED|CAP|BACKR) /* flags which should propagate up */
/* shift the LONGER / SHORTER bit into the MIXED bit position */
#define LMIX(f) ((f)<<2) /* LONGER -> MIXED */
#define SMIX(f) ((f)<<1) /* SHORTER -> MIXED */
/* flags to propagate upward; MIXED is added when f has both LONGER and SHORTER */
#define UP(f) (((f)&UPPROP) | (LMIX(f) & SMIX(f) & MIXED))
/* nonzero if any of MIXED, CAP, BACKR is set */
#define MESSY(f) ((f)&(MIXED|CAP|BACKR))
/* just the LONGER/SHORTER preference bits of f */
#define PREF(f) ((f)&(LONGER|SHORTER))
/* preference of f1 if it has one, else preference of f2 */
#define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
/* propagated flags of the union, plus the preference chosen by PREF2 */
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
497 char latype;
/* LATYPE code, if lookaround constraint */
498 int id;
/* ID of subre (1..ntree-1) */
499 int capno;
/* if capture node, subno to capture into */
500 int backno;
/* if backref node, subno it refers to */
501 short min;
/* min repetitions for iteration or backref */
502 short max;
/* max repetitions for iteration or backref */
503 struct subre *
child;
/* first child, if any (also freelist chain) */
514 * table of function pointers for generic manipulation functions
515 * A regex_t's re_fns points to one of these.
/*
 * Call the stack_too_deep function through re's function table:
 * re->re_fns is cast to struct fns * and its stack_too_deep member is
 * invoked with no arguments.
 */
#define STACK_TOO_DEEP(re) \
	((*((struct fns *) (re)->re_fns)->stack_too_deep) ())
528 * the insides of a regex_t, hidden behind a void *
533 #define GUTSMAGIC 0xfed9
535 long info;
/* copy of re_info */
536 size_t nsub;
/* copy of re_nsub */
539 int ntree;
/* number of subre's, plus one */
543 int nlacons;
/* size of lacons[]; note that only slots
 * numbered 1 .. nlacons-1 are used */
548/* prototypes for functions that are exported from regcomp.c to regexec.c */
#define FLEXIBLE_ARRAY_MEMBER
static int compare(const void *arg1, const void *arg2)
void pg_set_regex_collation(Oid collation)
struct colormaprange colormaprange
color pg_reg_getcolor(struct colormap *cm, chr c)
struct arc * colorchainRev
struct arc a[FLEXIBLE_ARRAY_MEMBER]
int classbits[NUM_CCLASSES]
struct colordesc cdspace[NINLINECDS]
int FUNCPTR(stack_too_deep,(void))
void FUNCPTR(free,(regex_t *))
int FUNCPTR(compare,(const chr *, const chr *, size_t))
struct statebatch * lastsb
struct state * freestates
struct state s[FLEXIBLE_ARRAY_MEMBER]