3 * This file is #included by regcomp.c.
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 * src/backend/regex/regc_lex.c
35/* scanning macros (know about v) */
/*
 * Every macro in this group expects a variable named v to be in scope.
 * They read v->now, v->stop and v->lasttype, and write v->nexttype and
 * v->nextvalue.  CHR() and ERR() are used here but defined elsewhere.
 */
/* true once the scan position v->now has reached (or passed) v->stop */
36 #define ATEOS() (v->now >= v->stop)
/* true if at least n items remain between v->now and v->stop */
37 #define HAVE(n) (v->stop - v->now >= (n))
/*
 * NEXT1/NEXT2/NEXT3: do the next one, two or three items at v->now equal the
 * given characters (each wrapped in CHR())?  The remaining-input check comes
 * first in each && chain, so v->now is not dereferenced past v->stop.
 */
38 #define NEXT1(c) (!ATEOS() && *v->now == CHR(c))
39 #define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
40 #define NEXT3(a,b,c) (HAVE(3) && *v->now == CHR(a) && \
41 *(v->now+1) == CHR(b) && \
42 *(v->now+2) == CHR(c))
/* SET stores c into v->nexttype; SETV also stores n into v->nextvalue */
43 #define SET(c) (v->nexttype = (c))
44 #define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n))
/*
 * RET and RETV expand to a complete return statement: they do the SET/SETV
 * assignment and then return 1 from the enclosing function, so they can only
 * be used inside a function whose return type accepts an int.
 */
45 #define RET(c) return (SET(c), 1)
46 #define RETV(c, n) return (SETV(c, n), 1)
/* FAILW also expands to a return statement: it invokes ERR(e), then returns 0 */
47 #define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */
/* true if v->lasttype equals t */
48 #define LASTTYPE(t) (v->lasttype == (t))
/*
 * Lexical context codes, one per kind of text the lexer can be inside.
 * NOTE(review): these appear to be the values meant for v->lexcon (the
 * case labels later in this file use them) -- confirm against the callers.
 */
51 #define L_ERE 1 /* mainline ERE/ARE */
52 #define L_BRE 2 /* mainline BRE */
53 #define L_Q 3 /* REG_QUOTE */
54 #define L_EBND 4 /* ERE/ARE bound */
55 #define L_BBND 5 /* BRE bound */
56 #define L_BRACK 6 /* brackets */
57 #define L_CEL 7 /* collating element */
58 #define L_ECL 8 /* equivalence class */
59 #define L_CCL 9 /* character class */
/* INTOCON(c) stores c into v->lexcon; needs a variable named v in scope */
60 #define INTOCON(c) (v->lexcon = (c))
/* INCON(con) is true when v->lexcon currently equals con */
61 #define INCON(con) (v->lexcon == (con))
63/* construct pointer past end of chr array */
/*
 * The element count is computed as sizeof(array)/sizeof(chr), so the argument
 * must be an actual array whose elements are chr-sized.  Passing a pointer
 * (including an array parameter, which is a pointer) gives a wrong result,
 * because sizeof then measures the pointer rather than the array.
 */
64 #define ENDOF(array) ((array) + sizeof(array)/sizeof(chr))
67 * lexstart - set up lexical stuff, scan leading options
72 prefixes(v);
/* may turn on new type bits etc. */
92 next(v);
/* set up the first token */
96 * prefixes - implement various special prefixes
101 /* literal string doesn't get any of this stuff */
105 /* initial "***" gets special things */
107 switch (*(v->
now + 3))
109 case CHR(
'?'):
/* "***?" error, msg shows version */
111 return;
/* proceed no further */
113 case CHR(
'='):
/* "***=" shifts to literal string */
118 return;
/* and there can be no more prefixes */
120 case CHR(
':'):
/* "***:" shifts to AREs */
125 default:
/* otherwise *** is just an error */
131 /* BREs and EREs don't get embedded options */
135 /* embedded options (AREs only) */
143 case CHR(
'b'):
/* BREs (but why???) */
146 case CHR(
'c'):
/* case sensitive */
149 case CHR(
'e'):
/* plain EREs */
153 case CHR(
'i'):
/* case insensitive */
156 case CHR(
'm'):
/* Perloid synonym for n */
157 case CHR(
'n'):
/* \n affects ^ $ . [^ */
160 case CHR(
'p'):
/* ~Perl, \n affects . [^ */
164 case CHR(
'q'):
/* literal string */
166 v->
cflags &= ~REG_ADVANCED;
168 case CHR(
's'):
/* single line, \n ordinary */
169 v->
cflags &= ~REG_NEWLINE;
171 case CHR(
't'):
/* tight syntax */
172 v->
cflags &= ~REG_EXPANDED;
174 case CHR(
'w'):
/* weird, \n affects ^ $ only */
178 case CHR(
'x'):
/* expanded syntax */
197 * next - get next token
199static int /* 1 normal, 0 failure */
204next_restart:
/* loop here after eating a comment */
206 /* errors yield an infinite sequence of failures */
208 return 0;
/* the error has set nexttype to EOS */
210 /* remember flavor of last token */
216 /* at start of a REG_BOSONLY RE */
220 /* skip white space etc. if appropriate (not in literal or []) */
232 /* handle EOS, depending on context */
256 /* okay, time to actually get a character */
259 /* deal with the easy contexts, punt EREs to code below */
262 case L_BRE:
/* punt BREs to separate function */
265 case L_ERE:
/* see below */
267 case L_Q:
/* literal strings are easy */
270 case L_BBND:
/* bounds are fairly simple */
289 case CHR(
'}'):
/* ERE bound ends with } */
304 case CHR(
'\\'):
/* BRE bound ends with \} */
320 case L_BRACK:
/* brackets are not too hard */
343 {
/* not all escapes okay here */
350 /* not one of the acceptable escapes */
366 /* might or might not be locale-specific */
392 case L_CEL:
/* collating elements are easy */
402 case L_ECL:
/* ditto equivalence classes */
412 case L_CCL:
/* ditto character classes */
427 /* that got rid of everything except EREs and AREs */
430 /* deal with EREs and AREs, except for backslashes */
463 case CHR(
'{'):
/* bounds start or plain character */
480 case CHR(
'('):
/* parenthesis, or advanced extension */
489 case CHR(
':'):
/* non-capturing paren */
492 case CHR(
'#'):
/* comment */
499 case CHR(
'='):
/* positive lookahead */
503 case CHR(
'!'):
/* negative lookahead */
512 case CHR(
'='):
/* positive lookbehind */
516 case CHR(
'!'):
/* negative lookbehind */
539 case CHR(
'['):
/* easy except for [[:<:]] and [[:>:]] */
541 *(v->
now + 1) ==
CHR(
':') &&
542 (*(v->
now + 2) ==
CHR(
'<') ||
543 *(v->
now + 2) ==
CHR(
'>')) &&
544 *(v->
now + 3) ==
CHR(
':') &&
545 *(v->
now + 4) ==
CHR(
']') &&
546 *(v->
now + 5) ==
CHR(
']'))
551 RET((
c ==
CHR(
'<')) ?
'<' :
'>');
570 case CHR(
'\\'):
/* mostly punt backslashes to code below */
574 default:
/* ordinary character */
579 /* ERE/ARE backslash handling; backslash already eaten */
582 {
/* only AREs have non-trivial escapes */
594 * lexescape - parse an ARE backslash escape (backslash already eaten)
596 * This is used for ARE backslashes both normally and inside bracket
597 * expressions. In the latter case, not all escape types are allowed,
598 * but the caller must reject unwanted ones after we return.
604 static const chr alert[] = {
607 static const chr esc[] = {
617 /* if it's not alphanumeric ASCII, treat it as a plain character */
618 if (!(
'a' <=
c &&
c <=
'z') &&
619 !(
'A' <=
c &&
c <=
'Z') &&
620 !(
'0' <=
c &&
c <=
'9'))
707 c =
lexdigits(v, 16, 1, 255);
/* REs >255 long outside spec */
733 v->
now--;
/* put first digit back */
734 c =
lexdigits(v, 10, 1, 255);
/* REs >255 long outside spec */
737 /* ugly heuristic (first test is "exactly 1 digit?") */
738 if (v->
now == save || ((
int)
c > 0 && (
int)
c <= v->nsubexp))
743 /* oops, doesn't look like it's a backref after all... */
745 /* and fall through into octal number */
749 v->
now--;
/* put first digit back */
755 /* out of range, so we handled one digit too much */
764 * Throw an error for unrecognized ASCII alpha escape sequences,
765 * which reserves them for future use if needed.
774 * lexdigits - slurp up digits and return chr value
776 * This does not account for overflow; callers should range-check the result
777 * if maxlen is large enough to make that possible.
779static chr /* chr value; errors signalled via ERR */
785 uchr n;
/* unsigned to avoid overflow misbehavior */
834 v->
now--;
/* oops, not a digit at all */
840 {
/* not a plausible digit */
845 break;
/* NOTE BREAK OUT */
846 n = n * ub + (
uchr) d;
855 * brenext - get next BRE token
857 * This is much like EREs except for all the stupid backslashes and the
858 * context-dependency of some things.
860static int /* 1 normal, 0 failure */
873 *(v->
now + 1) ==
CHR(
':') &&
874 (*(v->
now + 2) ==
CHR(
'<') ||
875 *(v->
now + 2) ==
CHR(
'>')) &&
876 *(v->
now + 3) ==
CHR(
':') &&
877 *(v->
now + 4) ==
CHR(
']') &&
878 *(v->
now + 5) ==
CHR(
']'))
883 RET((
c ==
CHR(
'<')) ?
'<' :
'>');
911 if (
NEXT2(
'\\',
')'))
919 break;
/* see below */
979 * skip - skip white space and comments in expanded form
993 break;
/* NOTE BREAK OUT */
997 /* leave the newline to be picked up by the iscspace loop */
1005 * newline - return the chr for a newline
1007 * This helps confine use of CHR to this source file.
1016 * chrnamed - return the chr known by a given (chr string) name
1018 * The code is a bit clumsy, but this routine gets only such specialized
1019 * use that it hardly matters.
1023 const chr *startp,
/* start of name */
1024 const chr *endp,
/* just past end of name */
1025 chr lastresort)
/* what to return if name lookup fails */
#define errsave(context,...)
static int lexescape(struct vars *v)
static void skip(struct vars *v)
static chr lexdigits(struct vars *v, int base, int minlen, int maxlen)
static int brenext(struct vars *v, chr c)
static void lexstart(struct vars *v)
static void prefixes(struct vars *v)
static chr chrnamed(struct vars *v, const chr *startp, const chr *endp, chr lastresort)
static int next(struct vars *v)
static struct cvec * range(struct vars *v, chr a, chr b, int cases)
static chr element(struct vars *v, const chr *startp, const chr *endp)
#define CHR_IS_IN_RANGE(c)
#define LATYPE_BEHIND_POS
#define LATYPE_BEHIND_NEG