00001 /*************************************************
00002 * Perl-Compatible Regular Expressions *
00003 *************************************************/
00004
00005 /* **** This is an ALTERED VERSION of PCRE **** */
00006
00007 /*
00008 This is a library of functions to support regular expressions whose syntax
00009 and semantics are as close as possible to those of the Perl 5 language. See
00010 the file Tech.Notes for some information on the internals.
00011
00012 Written by: Philip Hazel <ph10@cam.ac.uk>
00013
00014 Copyright (c) 1997-1999 University of Cambridge
00015
00016 -----------------------------------------------------------------------------
00017 Permission is granted to anyone to use this software for any purpose on any
00018 computer system, and to redistribute it freely, subject to the following
00019 restrictions:
00020
00021 1. This software is distributed in the hope that it will be useful,
00022 but WITHOUT ANY WARRANTY; without even the implied warranty of
00023 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
00024
00025 2. The origin of this software must not be misrepresented, either by
00026 explicit claim or by omission.
00027
00028 3. Altered versions must be plainly marked as such, and must not be
00029 misrepresented as being the original software.
00030
00031 4. If PCRE is embedded in any software that is released under the GNU
00032 General Purpose Licence (GPL), then the terms of that licence shall
00033 supersede any condition above with which it is incompatible.
00034 -----------------------------------------------------------------------------
00035 */
00036
00037
00038 #ifdef __cplusplus
00039 extern "C" {
00040 #endif
00041
00042
00043
00044
00045 /* Define DEBUG to get debugging output on stdout. */
00046
00047 /* #define DEBUG */
00048
/* Debug printing goes through a macro so that call sites need no inline
conditional compilation (some compilers still reject indented pre-processor
statements). The argument must be a complete, parenthesized printf argument
list, for example DPRINTF(("count=%d\n", n)). When DEBUG is not defined the
call expands to nothing, so the argument is not evaluated. */

#ifndef DEBUG
#define DPRINTF(p) /*nothing*/
#else
#define DPRINTF(p) printf p
#endif
00058
00059 /* Include the internals header, which itself includes Standard C headers plus
00060 the external pcre header. */
00061
00062 #include "pcreinternal.h"
00063
00064
/* "class" is used as an ordinary identifier in this file but is a keyword in
C++, so rename it whenever the file is compiled as C++ source. */

#if defined(__cplusplus)
#define class pcre_class
#endif
00070
00071
/* Capacity of the nested bracket stacks used at compile time. This should
not be set greater than 200. */

#define BRASTACK_SIZE 200


/* Minimum and maximum counts for the common repeats; a maximum of 0 stands
for "no upper limit". NOTE(review): the six entries appear to follow the
opcode order *, *?, +, +?, ?, ?? - confirm against the OP_ definitions. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
00082
00083 /* Text forms of OP_ values and things, for debugging (not all used) */
00084
00085 #ifdef DEBUG
static const char *OP_names[] = {
  /* End marker, then the escapes, option setting, anchors and basic items */
  "End",
  "\\A", "\\B", "\\b", "\\D", "\\d", "\\S",
  "\\s", "\\W", "\\w", "\\Z", "\\z",
  "Opt", "^", "$", "Any", "chars", "not",

  /* Repeat operators: three groups of nine names, then a group of eight */
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{",

  /* Classes and back references */
  "class", "Ref",

  /* Alternation, closing kets, assertions and the remaining group items */
  "Alt", "Ket", "KetRmax", "KetRmin",
  "Assert", "Assert not", "AssertB", "AssertB not",
  "Reverse", "Once", "Cond", "Cref",
  "Brazero", "Braminzero", "Bra"
};
00099 #endif
00100
00101 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
00102 are simple data values; negative values are for special things like \d and so
00103 on. Zero means further processing is needed (for things like \x), or the escape
00104 is invalid. */
00105
00106 static const short int escapes[] = {
00107 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
00108 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
00109 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
00110 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
00111 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
00112 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
00113 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
00114 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
00115 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
00116 0, 0, -ESC_z /* x - z */
00117 };
00118
00119 /* Definition to allow mutual recursion */
00120
00121 static BOOL
00122 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
00123 BOOL, int, compile_data *);
00124
00125
00126
/*************************************************
*               Global variables                 *
*************************************************/

/* PCRE is thread-clean and has no global variables in the normal sense. The
one exception is the pair of function pointers below, through which it calls
the memory allocation and free functions. They can be changed by the caller,
but they are shared between all threads. */

void *(*vmdpcre_malloc)(size_t) = &malloc;
void (*vmdpcre_free)(void *) = &free;
00138
00139
00140
00141
00142 /*************************************************
00143 * Default character tables *
00144 *************************************************/
00145
00146 /* A default set of character tables is included in the PCRE binary. Its source
00147 is built by the maketables auxiliary program, which uses the default C ctypes
00148 functions, and put in the file chartables.c. These tables are used by PCRE
00149 whenever the caller of pcre_compile() does not provide an alternate set of
00150 tables. */
00151
00152 #include "pcretables.h"
00153
00154
00155
00156 /*************************************************
00157 * Return version string *
00158 *************************************************/
00159
00160 const char *
00161 vmdpcre_version(void)
00162 {
00163 return PCRE_VERSION;
00164 }
00165
00166
00167
00168
00169 /*************************************************
00170 * Return info about a compiled pattern *
00171 *************************************************/
00172
00173 /* This function picks potentially useful data out of the private
00174 structure.
00175
00176 Arguments:
00177 external_re points to compiled code
00178 optptr where to pass back the options
00179 first_char where to pass back the first character,
00180 or -1 if multiline and all branches start ^,
00181 or -2 otherwise
00182
00183 Returns: number of identifying extraction brackets
00184 or negative values on error
00185 */
00186
00187 int
00188 vmdpcre_info(const pcre *external_re, int *optptr, int *first_char)
00189 {
00190 const real_pcre *re = (const real_pcre *)external_re;
00191 if (re == NULL) return PCRE_ERROR_NULL;
00192 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
00193 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
00194 if (first_char != NULL)
00195 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
00196 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
00197 return re->top_bracket;
00198 }
00199
00200
00201
00202
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested. Printable characters are written as themselves; all
others are written as \x followed by two hex digits.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
  int remaining = length;

  /* When printing from the subject, never run past its end. */

  if (is_subject && remaining > md->end_subject - p)
    remaining = md->end_subject - p;

  for (; remaining > 0; remaining--)
    {
    int c = *p++;
    if (isprint(c))
      printf("%c", c);
    else
      printf("\\x%02x", c);
    }
}
#endif
00229
00230
00231
00232
00233 /*************************************************
00234 * Handle escapes *
00235 *************************************************/
00236
00237 /* This function is called when a \ has been encountered. It either returns a
00238 positive value for a simple escape such as \n, or a negative value which
00239 encodes one of the more complicated things such as \d. On entry, ptr is
00240 pointing at the \. On exit, it is on the final character of the escape
00241 sequence.
00242
00243 Arguments:
00244 ptrptr points to the pattern position pointer
00245 errorptr points to the pointer to the error message
00246 bracount number of previous extracting brackets
00247 options the options bits
00248 isclass TRUE if inside a character class
00249 cd pointer to char tables block
00250
00251 Returns: zero or positive => a data character
00252 negative => a special escape sequence
00253 on error, errorptr is set
00254 */
00255
00256 static int
00257 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
00258 int options, BOOL isclass, compile_data *cd)
00259 {
00260 const uschar *ptr = *ptrptr;
00261 int c = *(ptr+1) & 255; /* Ensure > 0 on signed-char systems */
00262 int i;
00263 ++ptr;
00264
00265 if (c == 0) *errorptr = ERR1;
00266
00267 /* Digits or letters may have special meaning; all others are literals. */
00268
00269 else if (c < '0' || c > 'z') {}
00270
00271 /* Do an initial lookup in a table. A non-zero result is something that can be
00272 returned immediately. Otherwise further processing may be required. */
00273
00274 else if ((i = escapes[c - '0']) != 0) c = i;
00275
00276 /* Escapes that need further processing, or are illegal. */
00277
00278 else
00279 {
00280 const uschar *oldptr;
00281 switch (c)
00282 {
00283 /* The handling of escape sequences consisting of a string of digits
00284 starting with one that is not zero is not straightforward. By experiment,
00285 the way Perl works seems to be as follows:
00286
00287 Outside a character class, the digits are read as a decimal number. If the
00288 number is less than 10, or if there are that many previous extracting
00289 left brackets, then it is a back reference. Otherwise, up to three octal
00290 digits are read to form an escaped byte. Thus 123円 is likely to be octal
00291 123 (cf 0123,円 which is octal 012 followed by the literal 3). If the octal
00292 value is greater than 377, the least significant 8 bits are taken. Inside a
00293 character class, \ followed by a digit is always an octal number. */
00294
00295 case '1': case '2': case '3': case '4': case '5':
00296 case '6': case '7': case '8': case '9':
00297
00298 if (!isclass)
00299 {
00300 oldptr = ptr;
00301 c -= '0';
00302 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
00303 c = c * 10 + *(++ptr) - '0';
00304 if (c < 10 || c <= bracount)
00305 {
00306 c = -(ESC_REF + c);
00307 break;
00308 }
00309 ptr = oldptr; /* Put the pointer back and fall through */
00310 }
00311
00312 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
00313 generates a binary zero byte and treats the digit as a following literal.
00314 Thus we have to pull back the pointer by one. */
00315
00316 if ((c = *ptr) >= '8')
00317 {
00318 ptr--;
00319 c = 0;
00320 break;
00321 }
00322
00323 /* 0円 always starts an octal number, but we may drop through to here with a
00324 larger first octal digit */
00325
00326 case '0':
00327 c -= '0';
00328 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
00329 ptr[1] != '8' && ptr[1] != '9')
00330 c = c * 8 + *(++ptr) - '0';
00331 break;
00332
00333 /* Special escapes not starting with a digit are straightforward */
00334
00335 case 'x':
00336 c = 0;
00337 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
00338 {
00339 ptr++;
00340 c = c * 16 + cd->lcc[*ptr] -
00341 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
00342 }
00343 break;
00344
00345 case 'c':
00346 c = *(++ptr);
00347 if (c == 0)
00348 {
00349 *errorptr = ERR2;
00350 return 0;
00351 }
00352
00353 /* A letter is upper-cased; then the 0x40 bit is flipped */
00354
00355 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
00356 c ^= 0x40;
00357 break;
00358
00359 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
00360 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
00361 for Perl compatibility, it is a literal. This code looks a bit odd, but
00362 there used to be some cases other than the default, and there may be again
00363 in future, so I haven't "optimized" it. */
00364
00365 default:
00366 if ((options & PCRE_EXTRA) != 0) switch(c)
00367 {
00368 default:
00369 *errorptr = ERR3;
00370 break;
00371 }
00372 break;
00373 }
00374 }
00375
00376 *ptrptr = ptr;
00377 return c;
00378 }
00379
00380
00381
00382 /*************************************************
00383 * Check for counted repeat *
00384 *************************************************/
00385
00386 /* This function is called when a '{' is encountered in a place where it might
00387 start a quantifier. It looks ahead to see if it really is a quantifier or not.
00388 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
00389 where the ddds are digits.
00390
00391 Arguments:
00392 p pointer to the first char after '{'
00393 cd pointer to char tables block
00394
00395 Returns: TRUE or FALSE
00396 */
00397
00398 static BOOL
00399 is_counted_repeat(const uschar *p, compile_data *cd)
00400 {
00401 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
00402 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
00403 if (*p == '}') return TRUE;
00404
00405 if (*p++ != ',') return FALSE;
00406 if (*p == '}') return TRUE;
00407
00408 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
00409 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
00410 return (*p == '}');
00411 }
00412
00413
00414
00415 /*************************************************
00416 * Read repeat counts *
00417 *************************************************/
00418
00419 /* Read an item of the form {n,m} and return the values. This is called only
00420 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
00421 so the syntax is guaranteed to be correct, but we need to check the values.
00422
00423 Arguments:
00424 p pointer to first char after '{'
00425 minp pointer to int for min
00426 maxp pointer to int for max
00427 returned as -1 if no max
00428 errorptr points to pointer to error message
00429 cd pointer to character tables clock
00430
00431 Returns: pointer to '}' on success;
00432 current ptr on error, with errorptr set
00433 */
00434
00435 static const uschar *
00436 read_repeat_counts(const uschar *p, int *minp, int *maxp,
00437 const char **errorptr, compile_data *cd)
00438 {
00439 int min = 0;
00440 int max = -1;
00441
00442 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
00443
00444 if (*p == '}') max = min; else
00445 {
00446 if (*(++p) != '}')
00447 {
00448 max = 0;
00449 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
00450 if (max < min)
00451 {
00452 *errorptr = ERR4;
00453 return p;
00454 }
00455 }
00456 }
00457
00458 /* Do paranoid checks, then fill in the required variables, and pass back the
00459 pointer to the terminating '}'. */
00460
00461 if (min > 65535 || max > 65535)
00462 *errorptr = ERR5;
00463 else
00464 {
00465 *minp = min;
00466 *maxp = max;
00467 }
00468 return p;
00469 }
00470
00471
00472
00473 /*************************************************
00474 * Find the fixed length of a pattern *
00475 *************************************************/
00476
00477 /* Scan a pattern and compute the fixed length of subject that will match it,
00478 if the length is fixed. This is needed for dealing with backward assertions.
00479
00480 Arguments:
00481 code points to the start of the pattern (the bracket)
00482
00483 Returns: the fixed length, or -1 if there is no fixed length
00484 */
00485
00486 static int
00487 find_fixedlength(uschar *code)
00488 {
00489 int length = -1;
00490
00491 register int branchlength = 0;
00492 register uschar *cc = code + 3;
00493
00494 /* Scan along the opcodes for this branch. If we get to the end of the
00495 branch, check the length against that of the other branches. */
00496
00497 for (;;)
00498 {
00499 int d;
00500 register int op = *cc;
00501 if (op >= OP_BRA) op = OP_BRA;
00502
00503 switch (op)
00504 {
00505 case OP_BRA:
00506 case OP_ONCE:
00507 case OP_COND:
00508 d = find_fixedlength(cc);
00509 if (d < 0) return -1;
00510 branchlength += d;
00511 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
00512 cc += 3;
00513 break;
00514
00515 /* Reached end of a branch; if it's a ket it is the end of a nested
00516 call. If it's ALT it is an alternation in a nested call. If it is
00517 END it's the end of the outer call. All can be handled by the same code. */
00518
00519 case OP_ALT:
00520 case OP_KET:
00521 case OP_KETRMAX:
00522 case OP_KETRMIN:
00523 case OP_END:
00524 if (length < 0) length = branchlength;
00525 else if (length != branchlength) return -1;
00526 if (*cc != OP_ALT) return length;
00527 cc += 3;
00528 branchlength = 0;
00529 break;
00530
00531 /* Skip over assertive subpatterns */
00532
00533 case OP_ASSERT:
00534 case OP_ASSERT_NOT:
00535 case OP_ASSERTBACK:
00536 case OP_ASSERTBACK_NOT:
00537 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
00538 cc += 3;
00539 break;
00540
00541 /* Skip over things that don't match chars */
00542
00543 case OP_REVERSE:
00544 cc++;
00545
00546 case OP_CREF:
00547 case OP_OPT:
00548 cc++;
00549 /* Fall through */
00550
00551 case OP_SOD:
00552 case OP_EOD:
00553 case OP_EODN:
00554 case OP_CIRC:
00555 case OP_DOLL:
00556 case OP_NOT_WORD_BOUNDARY:
00557 case OP_WORD_BOUNDARY:
00558 cc++;
00559 break;
00560
00561 /* Handle char strings */
00562
00563 case OP_CHARS:
00564 branchlength += *(++cc);
00565 cc += *cc + 1;
00566 break;
00567
00568 /* Handle exact repetitions */
00569
00570 case OP_EXACT:
00571 case OP_TYPEEXACT:
00572 branchlength += (cc[1] << 8) + cc[2];
00573 cc += 4;
00574 break;
00575
00576 /* Handle single-char matchers */
00577
00578 case OP_NOT_DIGIT:
00579 case OP_DIGIT:
00580 case OP_NOT_WHITESPACE:
00581 case OP_WHITESPACE:
00582 case OP_NOT_WORDCHAR:
00583 case OP_WORDCHAR:
00584 case OP_ANY:
00585 branchlength++;
00586 cc++;
00587 break;
00588
00589
00590 /* Check a class for variable quantification */
00591
00592 case OP_CLASS:
00593 cc += (*cc == OP_REF)? 2 : 33;
00594
00595 switch (*cc)
00596 {
00597 case OP_CRSTAR:
00598 case OP_CRMINSTAR:
00599 case OP_CRQUERY:
00600 case OP_CRMINQUERY:
00601 return -1;
00602
00603 case OP_CRRANGE:
00604 case OP_CRMINRANGE:
00605 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
00606 branchlength += (cc[1] << 8) + cc[2];
00607 cc += 5;
00608 break;
00609
00610 default:
00611 branchlength++;
00612 }
00613 break;
00614
00615 /* Anything else is variable length */
00616
00617 default:
00618 return -1;
00619 }
00620 }
00621 /* Control never gets here */
00622 }
00623
00624
00625
00626
00627 /*************************************************
00628 * Compile one branch *
00629 *************************************************/
00630
00631 /* Scan the pattern, compiling it into the code vector.
00632
00633 Arguments:
00634 options the option bits
00635 brackets points to number of brackets used
00636 codeptr points to the pointer to the current code point
00637 ptrptr points to the current pattern pointer
00638 errorptr points to pointer to error message
00639 optchanged set to the value of the last OP_OPT item compiled
00640 cd contains pointers to tables
00641
00642 Returns: TRUE on success
00643 FALSE, with *errorptr set on error
00644 */
00645
00646 static BOOL
00647 compile_branch(int options, int *brackets, uschar **codeptr,
00648 const uschar **ptrptr, const char **errorptr, int *optchanged,
00649 compile_data *cd)
00650 {
00651 int repeat_type, op_type;
00652 int repeat_min, repeat_max;
00653 int bravalue, length;
00654 int greedy_default, greedy_non_default;
00655 register int c;
00656 register uschar *code = *codeptr;
00657 uschar *tempcode;
00658 const uschar *ptr = *ptrptr;
00659 const uschar *tempptr;
00660 uschar *previous = NULL;
00661 uschar class[32];
00662
00663 /* Set up the default and non-default settings for greediness */
00664
00665 greedy_default = ((options & PCRE_UNGREEDY) != 0);
00666 greedy_non_default = greedy_default ^ 1;
00667
00668 /* Switch on next character until the end of the branch */
00669
00670 for (;; ptr++)
00671 {
00672 BOOL negate_class;
00673 int class_charcount;
00674 int class_lastchar;
00675 int newoptions;
00676 int condref;
00677
00678 c = *ptr;
00679 if ((options & PCRE_EXTENDED) != 0)
00680 {
00681 if ((cd->ctypes[c] & ctype_space) != 0) continue;
00682 if (c == '#')
00683 {
00684 while ((c = *(++ptr)) != 0 && c != '\n');
00685 continue;
00686 }
00687 }
00688
00689 switch(c)
00690 {
00691 /* The branch terminates at end of string, |, or ). */
00692
00693 case 0:
00694 case '|':
00695 case ')':
00696 *codeptr = code;
00697 *ptrptr = ptr;
00698 return TRUE;
00699
00700 /* Handle single-character metacharacters */
00701
00702 case '^':
00703 previous = NULL;
00704 *code++ = OP_CIRC;
00705 break;
00706
00707 case '$':
00708 previous = NULL;
00709 *code++ = OP_DOLL;
00710 break;
00711
00712 case '.':
00713 previous = code;
00714 *code++ = OP_ANY;
00715 break;
00716
00717 /* Character classes. These always build a 32-byte bitmap of the permitted
00718 characters, except in the special case where there is only one character.
00719 For negated classes, we build the map as usual, then invert it at the end.
00720 */
00721
00722 case '[':
00723 previous = code;
00724 *code++ = OP_CLASS;
00725
00726 /* If the first character is '^', set the negation flag and skip it. */
00727
00728 if ((c = *(++ptr)) == '^')
00729 {
00730 negate_class = TRUE;
00731 c = *(++ptr);
00732 }
00733 else negate_class = FALSE;
00734
00735 /* Keep a count of chars so that we can optimize the case of just a single
00736 character. */
00737
00738 class_charcount = 0;
00739 class_lastchar = -1;
00740
00741 /* Initialize the 32-char bit map to all zeros. We have to build the
00742 map in a temporary bit of store, in case the class contains only 1
00743 character, because in that case the compiled code doesn't use the
00744 bit map. */
00745
00746 memset(class, 0, 32 * sizeof(uschar));
00747
00748 /* Process characters until ] is reached. By writing this as a "do" it
00749 means that an initial ] is taken as a data character. */
00750
00751 do
00752 {
00753 if (c == 0)
00754 {
00755 *errorptr = ERR6;
00756 goto FAILED;
00757 }
00758
00759 /* Backslash may introduce a single character, or it may introduce one
00760 of the specials, which just set a flag. Escaped items are checked for
00761 validity in the pre-compiling pass. The sequence \b is a special case.
00762 Inside a class (and only there) it is treated as backspace. Elsewhere
00763 it marks a word boundary. Other escapes have preset maps ready to
00764 or into the one we are building. We assume they have more than one
00765 character in them, so set class_charcount bigger than one. */
00766
00767 if (c == '\\')
00768 {
00769 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
00770 if (-c == ESC_b) c = '\b';
00771 else if (c < 0)
00772 {
00773 register const uschar *cbits = cd->cbits;
00774 class_charcount = 10;
00775 switch (-c)
00776 {
00777 case ESC_d:
00778 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
00779 continue;
00780
00781 case ESC_D:
00782 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
00783 continue;
00784
00785 case ESC_w:
00786 for (c = 0; c < 32; c++)
00787 class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
00788 continue;
00789
00790 case ESC_W:
00791 for (c = 0; c < 32; c++)
00792 class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
00793 continue;
00794
00795 case ESC_s:
00796 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
00797 continue;
00798
00799 case ESC_S:
00800 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
00801 continue;
00802
00803 default:
00804 *errorptr = ERR7;
00805 goto FAILED;
00806 }
00807 }
00808 /* Fall through if single character */
00809 }
00810
00811 /* A single character may be followed by '-' to form a range. However,
00812 Perl does not permit ']' to be the end of the range. A '-' character
00813 here is treated as a literal. */
00814
00815 if (ptr[1] == '-' && ptr[2] != ']')
00816 {
00817 int d;
00818 ptr += 2;
00819 d = *ptr;
00820
00821 if (d == 0)
00822 {
00823 *errorptr = ERR6;
00824 goto FAILED;
00825 }
00826
00827 /* The second part of a range can be a single-character escape, but
00828 not any of the other escapes. */
00829
00830 if (d == '\\')
00831 {
00832 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
00833 if (d < 0)
00834 {
00835 if (d == -ESC_b) d = '\b'; else
00836 {
00837 *errorptr = ERR7;
00838 goto FAILED;
00839 }
00840 }
00841 }
00842
00843 if (d < c)
00844 {
00845 *errorptr = ERR8;
00846 goto FAILED;
00847 }
00848
00849 for (; c <= d; c++)
00850 {
00851 class[c/8] |= (1 << (c&7));
00852 if ((options & PCRE_CASELESS) != 0)
00853 {
00854 int uc = cd->fcc[c]; /* flip case */
00855 class[uc/8] |= (1 << (uc&7));
00856 }
00857 class_charcount++; /* in case a one-char range */
00858 class_lastchar = c;
00859 }
00860 continue; /* Go get the next char in the class */
00861 }
00862
00863 /* Handle a lone single character - we can get here for a normal
00864 non-escape char, or after \ that introduces a single character. */
00865
00866 class [c/8] |= (1 << (c&7));
00867 if ((options & PCRE_CASELESS) != 0)
00868 {
00869 c = cd->fcc[c]; /* flip case */
00870 class[c/8] |= (1 << (c&7));
00871 }
00872 class_charcount++;
00873 class_lastchar = c;
00874 }
00875
00876 /* Loop until ']' reached; the check for end of string happens inside the
00877 loop. This "while" is the end of the "do" above. */
00878
00879 while ((c = *(++ptr)) != ']');
00880
00881 /* If class_charcount is 1 and class_lastchar is not negative, we saw
00882 precisely one character. This doesn't need the whole 32-byte bit map.
00883 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
00884 it's negative. */
00885
00886 if (class_charcount == 1 && class_lastchar >= 0)
00887 {
00888 if (negate_class)
00889 {
00890 code[-1] = OP_NOT;
00891 }
00892 else
00893 {
00894 code[-1] = OP_CHARS;
00895 *code++ = 1;
00896 }
00897 *code++ = class_lastchar;
00898 }
00899
00900 /* Otherwise, negate the 32-byte map if necessary, and copy it into
00901 the code vector. */
00902
00903 else
00904 {
00905 if (negate_class)
00906 for (c = 0; c < 32; c++) code[c] = ~class[c];
00907 else
00908 memcpy(code, class, 32);
00909 code += 32;
00910 }
00911 break;
00912
00913 /* Various kinds of repeat */
00914
00915 case '{':
00916 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
00917 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
00918 if (*errorptr != NULL) goto FAILED;
00919 goto REPEAT;
00920
00921 case '*':
00922 repeat_min = 0;
00923 repeat_max = -1;
00924 goto REPEAT;
00925
00926 case '+':
00927 repeat_min = 1;
00928 repeat_max = -1;
00929 goto REPEAT;
00930
00931 case '?':
00932 repeat_min = 0;
00933 repeat_max = 1;
00934
00935 REPEAT:
00936 if (previous == NULL)
00937 {
00938 *errorptr = ERR9;
00939 goto FAILED;
00940 }
00941
00942 /* If the next character is '?' this is a minimizing repeat, by default,
00943 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
00944 next character. */
00945
00946 if (ptr[1] == '?')
00947 { repeat_type = greedy_non_default; ptr++; }
00948 else repeat_type = greedy_default;
00949
00950 /* If the maximum is zero then the minimum must also be zero; Perl allows
00951 this case, so we do too - by simply omitting the item altogether. */
00952
00953 if (repeat_max == 0) code = previous;
00954
00955 /* If previous was a string of characters, chop off the last one and use it
00956 as the subject of the repeat. If there was only one character, we can
00957 abolish the previous item altogether. */
00958
00959 else if (*previous == OP_CHARS)
00960 {
00961 int len = previous[1];
00962 if (len == 1)
00963 {
00964 c = previous[2];
00965 code = previous;
00966 }
00967 else
00968 {
00969 c = previous[len+1];
00970 previous[1]--;
00971 code--;
00972 }
00973 op_type = 0; /* Use single-char op codes */
00974 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
00975 }
00976
00977 /* If previous was a single negated character ([^a] or similar), we use
00978 one of the special opcodes, replacing it. The code is shared with single-
00979 character repeats by adding a suitable offset into repeat_type. */
00980
00981 else if ((int)*previous == OP_NOT)
00982 {
00983 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
00984 c = previous[1];
00985 code = previous;
00986 goto OUTPUT_SINGLE_REPEAT;
00987 }
00988
00989 /* If previous was a character type match (\d or similar), abolish it and
00990 create a suitable repeat item. The code is shared with single-character
00991 repeats by adding a suitable offset into repeat_type. */
00992
00993 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
00994 {
00995 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
00996 c = *previous;
00997 code = previous;
00998
00999 OUTPUT_SINGLE_REPEAT:
01000 repeat_type += op_type; /* Combine both values for many cases */
01001
01002 /* A minimum of zero is handled either as the special case * or ?, or as
01003 an UPTO, with the maximum given. */
01004
01005 if (repeat_min == 0)
01006 {
01007 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
01008 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
01009 else
01010 {
01011 *code++ = OP_UPTO + repeat_type;
01012 *code++ = repeat_max >> 8;
01013 *code++ = (repeat_max & 255);
01014 }
01015 }
01016
01017 /* The case {1,} is handled as the special case + */
01018
01019 else if (repeat_min == 1 && repeat_max == -1)
01020 *code++ = OP_PLUS + repeat_type;
01021
01022 /* The case {n,n} is just an EXACT, while the general case {n,m} is
01023 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
01024
01025 else
01026 {
01027 if (repeat_min != 1)
01028 {
01029 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
01030 *code++ = repeat_min >> 8;
01031 *code++ = (repeat_min & 255);
01032 }
01033
01034 /* If the minimum is 1 and the previous item was a character string,
01035 we either have to put back the item that got cancelled if the string
01036 length was 1, or add the character back onto the end of a longer
01037 string. For a character type nothing need be done; it will just get
01038 put back naturally. Note that the final character is always going to
01039 get added below. */
01040
01041 else if (*previous == OP_CHARS)
01042 {
01043 if (code == previous) code += 2; else previous[1]++;
01044 }
01045
01046 /* For a single negated character we also have to put back the
01047 item that got cancelled. */
01048
01049 else if (*previous == OP_NOT) code++;
01050
01051 /* If the maximum is unlimited, insert an OP_STAR. */
01052
01053 if (repeat_max < 0)
01054 {
01055 *code++ = c;
01056 *code++ = OP_STAR + repeat_type;
01057 }
01058
01059 /* Else insert an UPTO if the max is greater than the min. */
01060
01061 else if (repeat_max != repeat_min)
01062 {
01063 *code++ = c;
01064 repeat_max -= repeat_min;
01065 *code++ = OP_UPTO + repeat_type;
01066 *code++ = repeat_max >> 8;
01067 *code++ = (repeat_max & 255);
01068 }
01069 }
01070
01071 /* The character or character type itself comes last in all cases. */
01072
01073 *code++ = c;
01074 }
01075
01076 /* If previous was a character class or a back reference, we put the repeat
01077 stuff after it. */
01078
01079 else if (*previous == OP_CLASS || *previous == OP_REF)
01080 {
01081 if (repeat_min == 0 && repeat_max == -1)
01082 *code++ = OP_CRSTAR + repeat_type;
01083 else if (repeat_min == 1 && repeat_max == -1)
01084 *code++ = OP_CRPLUS + repeat_type;
01085 else if (repeat_min == 0 && repeat_max == 1)
01086 *code++ = OP_CRQUERY + repeat_type;
01087 else
01088 {
01089 *code++ = OP_CRRANGE + repeat_type;
01090 *code++ = repeat_min >> 8;
01091 *code++ = repeat_min & 255;
01092 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
01093 *code++ = repeat_max >> 8;
01094 *code++ = repeat_max & 255;
01095 }
01096 }
01097
01098 /* If previous was a bracket group, we may have to replicate it in certain
01099 cases. */
01100
01101 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
01102 (int)*previous == OP_COND)
01103 {
01104 register int i;
01105 int ketoffset = 0;
01106 int len = code - previous;
01107 uschar *bralink = NULL;
01108
01109 /* If the maximum repeat count is unlimited, find the end of the bracket
01110 by scanning through from the start, and compute the offset back to it
01111 from the current code pointer. There may be an OP_OPT setting following
01112 the final KET, so we can't find the end just by going back from the code
01113 pointer. */
01114
01115 if (repeat_max == -1)
01116 {
01117 register uschar *ket = previous;
01118 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
01119 ketoffset = code - ket;
01120 }
01121
01122 /* The case of a zero minimum is special because of the need to stick
01123 OP_BRAZERO in front of it, and because the group appears once in the
01124 data, whereas in other cases it appears the minimum number of times. For
01125 this reason, it is simplest to treat this case separately, as otherwise
01126 the code gets far too messy. There are several special subcases when the
01127 minimum is zero. */
01128
01129 if (repeat_min == 0)
01130 {
01131 /* If the maximum is also zero, we just omit the group from the output
01132 altogether. */
01133
01134 if (repeat_max == 0)
01135 {
01136 code = previous;
01137 previous = NULL;
01138 break;
01139 }
01140
01141 /* If the maximum is 1 or unlimited, we just have to stick in the
01142 BRAZERO and do no more at this point. */
01143
01144 if (repeat_max <= 1)
01145 {
01146 memmove(previous+1, previous, len);
01147 code++;
01148 *previous++ = OP_BRAZERO + repeat_type;
01149 }
01150
01151 /* If the maximum is greater than 1 and limited, we have to replicate
01152 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
01153 The first one has to be handled carefully because it's the original
01154 copy, which has to be moved up. The remainder can be handled by code
01155 that is common with the non-zero minimum case below. We just have to
01156 adjust the value of repeat_max, since one less copy is required. */
01157
01158 else
01159 {
01160 int offset;
01161 memmove(previous+4, previous, len);
01162 code += 4;
01163 *previous++ = OP_BRAZERO + repeat_type;
01164 *previous++ = OP_BRA;
01165
01166 /* We chain together the bracket offset fields that have to be
01167 filled in later when the ends of the brackets are reached. */
01168
01169 offset = (bralink == NULL)? 0 : previous - bralink;
01170 bralink = previous;
01171 *previous++ = offset >> 8;
01172 *previous++ = offset & 255;
01173 }
01174
01175 repeat_max--;
01176 }
01177
01178 /* If the minimum is greater than zero, replicate the group as many
01179 times as necessary, and adjust the maximum to the number of subsequent
01180 copies that we need. */
01181
01182 else
01183 {
01184 for (i = 1; i < repeat_min; i++)
01185 {
01186 memcpy(code, previous, len);
01187 code += len;
01188 }
01189 if (repeat_max > 0) repeat_max -= repeat_min;
01190 }
01191
01192 /* This code is common to both the zero and non-zero minimum cases. If
01193 the maximum is limited, it replicates the group in a nested fashion,
01194 remembering the bracket starts on a stack. In the case of a zero minimum,
01195 the first one was set up above. In all cases the repeat_max now specifies
01196 the number of additional copies needed. */
01197
01198 if (repeat_max >= 0)
01199 {
01200 for (i = repeat_max - 1; i >= 0; i--)
01201 {
01202 *code++ = OP_BRAZERO + repeat_type;
01203
01204 /* All but the final copy start a new nesting, maintaining the
01205 chain of brackets outstanding. */
01206
01207 if (i != 0)
01208 {
01209 int offset;
01210 *code++ = OP_BRA;
01211 offset = (bralink == NULL)? 0 : code - bralink;
01212 bralink = code;
01213 *code++ = offset >> 8;
01214 *code++ = offset & 255;
01215 }
01216
01217 memcpy(code, previous, len);
01218 code += len;
01219 }
01220
01221 /* Now chain through the pending brackets, and fill in their length
01222 fields (which are holding the chain links pro tem). */
01223
01224 while (bralink != NULL)
01225 {
01226 int oldlinkoffset;
01227 int offset = code - bralink + 1;
01228 uschar *bra = code - offset;
01229 oldlinkoffset = (bra[1] << 8) + bra[2];
01230 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
01231 *code++ = OP_KET;
01232 *code++ = bra[1] = offset >> 8;
01233 *code++ = bra[2] = (offset & 255);
01234 }
01235 }
01236
01237 /* If the maximum is unlimited, set a repeater in the final copy. We
01238 can't just offset backwards from the current code point, because we
01239 don't know if there's been an options resetting after the ket. The
01240 correct offset was computed above. */
01241
01242 else code[-ketoffset] = OP_KETRMAX + repeat_type;
01243
01244
01245 #ifdef NEVER
01246 /* If the minimum is greater than zero, and the maximum is unlimited or
01247 equal to the minimum, the first copy remains where it is, and is
01248 replicated up to the minimum number of times. This case includes the +
01249 repeat, but of course no replication is needed in that case. */
01250
01251 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
01252 {
01253 for (i = 1; i < repeat_min; i++)
01254 {
01255 memcpy(code, previous, len);
01256 code += len;
01257 }
01258 }
01259
01260 /* If the minimum is zero, stick BRAZERO in front of the first copy.
01261 Then, if there is a fixed upper limit, replicated up to that many times,
01262 sticking BRAZERO in front of all the optional ones. */
01263
01264 else
01265 {
01266 if (repeat_min == 0)
01267 {
01268 memmove(previous+1, previous, len);
01269 code++;
01270 *previous++ = OP_BRAZERO + repeat_type;
01271 }
01272
01273 for (i = 1; i < repeat_min; i++)
01274 {
01275 memcpy(code, previous, len);
01276 code += len;
01277 }
01278
01279 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
01280 {
01281 *code++ = OP_BRAZERO + repeat_type;
01282 memcpy(code, previous, len);
01283 code += len;
01284 }
01285 }
01286
01287 /* If the maximum is unlimited, set a repeater in the final copy. We
01288 can't just offset backwards from the current code point, because we
01289 don't know if there's been an options resetting after the ket. The
01290 correct offset was computed above. */
01291
01292 if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;
01293 #endif
01294
01295
01296 }
01297
01298 /* Else there's some kind of shambles */
01299
01300 else
01301 {
01302 *errorptr = ERR11;
01303 goto FAILED;
01304 }
01305
01306 /* In all cases we no longer have a previous item. */
01307
01308 previous = NULL;
01309 break;
01310
01311
01312 /* Start of nested bracket sub-expression, or comment or lookahead or
01313 lookbehind or option setting or condition. First deal with special things
01314 that can come after a bracket; all are introduced by ?, and the appearance
01315 of any of them means that this is not a referencing group. They were
01316 checked for validity in the first pass over the string, so we don't have to
01317 check for syntax errors here. */
01318
01319 case '(':
01320 newoptions = options;
01321 condref = -1;
01322
01323 if (*(++ptr) == '?')
01324 {
01325 int set, unset;
01326 int *optset;
01327
01328 switch (*(++ptr))
01329 {
01330 case '#': /* Comment; skip to ket */
01331 ptr++;
01332 while (*ptr != ')') ptr++;
01333 continue;
01334
01335 case ':': /* Non-extracting bracket */
01336 bravalue = OP_BRA;
01337 ptr++;
01338 break;
01339
01340 case '(':
01341 bravalue = OP_COND; /* Conditional group */
01342 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
01343 {
01344 condref = *ptr - '0';
01345 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
01346 ptr++;
01347 }
01348 else ptr--;
01349 break;
01350
01351 case '=': /* Positive lookahead */
01352 bravalue = OP_ASSERT;
01353 ptr++;
01354 break;
01355
01356 case '!': /* Negative lookahead */
01357 bravalue = OP_ASSERT_NOT;
01358 ptr++;
01359 break;
01360
01361 case '<': /* Lookbehinds */
01362 switch (*(++ptr))
01363 {
01364 case '=': /* Positive lookbehind */
01365 bravalue = OP_ASSERTBACK;
01366 ptr++;
01367 break;
01368
01369 case '!': /* Negative lookbehind */
01370 bravalue = OP_ASSERTBACK_NOT;
01371 ptr++;
01372 break;
01373
01374 default: /* Syntax error */
01375 *errorptr = ERR24;
01376 goto FAILED;
01377 }
01378 break;
01379
01380 case '>': /* One-time brackets */
01381 bravalue = OP_ONCE;
01382 ptr++;
01383 break;
01384
01385 default: /* Option setting */
01386 set = unset = 0;
01387 optset = &set;
01388
01389 while (*ptr != ')' && *ptr != ':')
01390 {
01391 switch (*ptr++)
01392 {
01393 case '-': optset = &unset; break;
01394
01395 case 'i': *optset |= PCRE_CASELESS; break;
01396 case 'm': *optset |= PCRE_MULTILINE; break;
01397 case 's': *optset |= PCRE_DOTALL; break;
01398 case 'x': *optset |= PCRE_EXTENDED; break;
01399 case 'U': *optset |= PCRE_UNGREEDY; break;
01400 case 'X': *optset |= PCRE_EXTRA; break;
01401
01402 default:
01403 *errorptr = ERR12;
01404 goto FAILED;
01405 }
01406 }
01407
01408 /* Set up the changed option bits, but don't change anything yet. */
01409
01410 newoptions = (options | set) & (~unset);
01411
01412 /* If the options ended with ')' this is not the start of a nested
01413 group with option changes, so the options change at this level. At top
01414 level there is nothing else to be done (the options will in fact have
01415 been set from the start of compiling as a result of the first pass) but
01416 at an inner level we must compile code to change the ims options if
01417 necessary, and pass the new setting back so that it can be put at the
01418 start of any following branches, and when this group ends, a resetting
01419 item can be compiled. */
01420
01421 if (*ptr == ')')
01422 {
01423 if ((options & PCRE_INGROUP) != 0 &&
01424 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
01425 {
01426 *code++ = OP_OPT;
01427 *code++ = *optchanged = newoptions & PCRE_IMS;
01428 }
01429 options = newoptions; /* Change options at this level */
01430 previous = NULL; /* This item can't be repeated */
01431 continue; /* It is complete */
01432 }
01433
01434 /* If the options ended with ':' we are heading into a nested group
01435 with possible change of options. Such groups are non-capturing and are
01436 not assertions of any kind. All we need to do is skip over the ':';
01437 the newoptions value is handled below. */
01438
01439 bravalue = OP_BRA;
01440 ptr++;
01441 }
01442 }
01443
01444 /* Else we have a referencing group; adjust the opcode. */
01445
01446 else
01447 {
01448 if (++(*brackets) > EXTRACT_MAX)
01449 {
01450 *errorptr = ERR13;
01451 goto FAILED;
01452 }
01453 bravalue = OP_BRA + *brackets;
01454 }
01455
01456 /* Process nested bracketed re. Assertions may not be repeated, but other
01457 kinds can be. We copy code into a non-register variable in order to be able
01458 to pass its address because some compilers complain otherwise. Pass in a
01459 new setting for the ims options if they have changed. */
01460
01461 previous = (bravalue >= OP_ONCE)? code : NULL;
01462 *code = bravalue;
01463 tempcode = code;
01464
01465 if (!compile_regex(
01466 options | PCRE_INGROUP, /* Set for all nested groups */
01467 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
01468 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
01469 brackets, /* Bracket level */
01470 &tempcode, /* Where to put code (updated) */
01471 &ptr, /* Input pointer (updated) */
01472 errorptr, /* Where to put an error message */
01473 (bravalue == OP_ASSERTBACK ||
01474 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
01475 condref, /* Condition reference number */
01476 cd)) /* Tables block */
01477 goto FAILED;
01478
01479 /* At the end of compiling, code is still pointing to the start of the
01480 group, while tempcode has been updated to point past the end of the group
01481 and any option resetting that may follow it. The pattern pointer (ptr)
01482 is on the bracket. */
01483
01484 /* If this is a conditional bracket, check that there are no more than
01485 two branches in the group. */
01486
01487 if (bravalue == OP_COND)
01488 {
01489 int branchcount = 0;
01490 uschar *tc = code;
01491
01492 do {
01493 branchcount++;
01494 tc += (tc[1] << 8) | tc[2];
01495 }
01496 while (*tc != OP_KET);
01497
01498 if (branchcount > 2)
01499 {
01500 *errorptr = ERR27;
01501 goto FAILED;
01502 }
01503 }
01504
01505 /* Now update the main code pointer to the end of the group. */
01506
01507 code = tempcode;
01508
01509 /* Error if hit end of pattern */
01510
01511 if (*ptr != ')')
01512 {
01513 *errorptr = ERR14;
01514 goto FAILED;
01515 }
01516 break;
01517
01518 /* Check \ for being a real metacharacter; if not, fall through and handle
01519 it as a data character at the start of a string. Escape items are checked
01520 for validity in the pre-compiling pass. */
01521
01522 case '\\':
01523 tempptr = ptr;
01524 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
01525
01526 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
01527 are arranged to be the negation of the corresponding OP_values. For the
01528 back references, the values are ESC_REF plus the reference number. Only
01529 back references and those types that consume a character may be repeated.
01530 We can test for values between ESC_b and ESC_Z for the latter; this may
01531 have to change if any new ones are ever created. */
01532
01533 if (c < 0)
01534 {
01535 if (-c >= ESC_REF)
01536 {
01537 previous = code;
01538 *code++ = OP_REF;
01539 *code++ = -c - ESC_REF;
01540 }
01541 else
01542 {
01543 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
01544 *code++ = -c;
01545 }
01546 continue;
01547 }
01548
01549 /* Data character: reset and fall through */
01550
01551 ptr = tempptr;
01552 c = '\\';
01553
01554 /* Handle a run of data characters until a metacharacter is encountered.
01555 The first character is guaranteed not to be whitespace or # when the
01556 extended flag is set. */
01557
01558 NORMAL_CHAR:
01559 default:
01560 previous = code;
01561 *code = OP_CHARS;
01562 code += 2;
01563 length = 0;
01564
01565 do
01566 {
01567 if ((options & PCRE_EXTENDED) != 0)
01568 {
01569 if ((cd->ctypes[c] & ctype_space) != 0) continue;
01570 if (c == '#')
01571 {
01572 while ((c = *(++ptr)) != 0 && c != '\n');
01573 if (c == 0) break;
01574 continue;
01575 }
01576 }
01577
01578 /* Backslash may introduce a data char or a metacharacter. Escaped items
01579 are checked for validity in the pre-compiling pass. Stop the string
01580 before a metaitem. */
01581
01582 if (c == '\\')
01583 {
01584 tempptr = ptr;
01585 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
01586 if (c < 0) { ptr = tempptr; break; }
01587 }
01588
01589 /* Ordinary character or single-char escape */
01590
01591 *code++ = c;
01592 length++;
01593 }
01594
01595 /* This "while" is the end of the "do" above. */
01596
01597 while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
01598
01599 /* Compute the length and set it in the data vector, and advance to
01600 the next state. */
01601
01602 previous[1] = length;
01603 if (length < 255) ptr--;
01604 break;
01605 }
01606 } /* end of big loop */
01607
01608 /* Control never reaches here by falling through, only by a goto for all the
01609 error states. Pass back the position in the pattern so that it can be displayed
01610 to the user for diagnosing the error. */
01611
01612 FAILED:
01613 *ptrptr = ptr;
01614 return FALSE;
01615 }
01616
01617
01618
01619
01620 /*************************************************
01621 * Compile sequence of alternatives *
01622 *************************************************/
01623
01624 /* On entry, ptr is pointing past the bracket character, but on return
01625 it points to the closing bracket, or vertical bar, or end of string.
01626 The code variable is pointing at the byte into which the BRA operator has been
01627 stored. If the ims options are changed at the start (for a (?ims: group) or
01628 during any branch, we need to insert an OP_OPT item at the start of every
01629 following branch to ensure they get set correctly at run time, and also pass
01630 the new options into every subsequent branch compile.
01631
01632 Arguments:
01633 options the option bits
01634 optchanged new ims options to set as if (?ims) were at the start, or -1
01635 for no change
01636 brackets -> int containing the number of extracting brackets used
01637 codeptr -> the address of the current code pointer
01638 ptrptr -> the address of the current pattern pointer
01639 errorptr -> pointer to error message
01640 lookbehind TRUE if this is a lookbehind assertion
01641 condref > 0 for OP_CREF setting at start of conditional group
01642 cd points to the data block with tables pointers
01643
01644 Returns: TRUE on success
01645 */
01646
01647 static BOOL
01648 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
01649 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
01650 compile_data *cd)
01651 {
01652 const uschar *ptr = *ptrptr;
01653 uschar *code = *codeptr;
01654 uschar *last_branch = code;
01655 uschar *start_bracket = code;
01656 uschar *reverse_count = NULL;
01657 int oldoptions = options & PCRE_IMS;
01658
01659 code += 3;
01660
01661 /* At the start of a reference-based conditional group, insert the reference
01662 number as an OP_CREF item. */
01663
01664 if (condref > 0)
01665 {
01666 *code++ = OP_CREF;
01667 *code++ = condref;
01668 }
01669
01670 /* Loop for each alternative branch */
01671
01672 for (;;)
01673 {
01674 int length;
01675
01676 /* Handle change of options */
01677
01678 if (optchanged >= 0)
01679 {
01680 *code++ = OP_OPT;
01681 *code++ = optchanged;
01682 options = (options & ~PCRE_IMS) | optchanged;
01683 }
01684
01685 /* Set up dummy OP_REVERSE if lookbehind assertion */
01686
01687 if (lookbehind)
01688 {
01689 *code++ = OP_REVERSE;
01690 reverse_count = code;
01691 *code++ = 0;
01692 *code++ = 0;
01693 }
01694
01695 /* Now compile the branch */
01696
01697 if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))
01698 {
01699 *ptrptr = ptr;
01700 return FALSE;
01701 }
01702
01703 /* Fill in the length of the last branch */
01704
01705 length = code - last_branch;
01706 last_branch[1] = length >> 8;
01707 last_branch[2] = length & 255;
01708
01709 /* If lookbehind, check that this branch matches a fixed-length string,
01710 and put the length into the OP_REVERSE item. Temporarily mark the end of
01711 the branch with OP_END. */
01712
01713 if (lookbehind)
01714 {
01715 *code = OP_END;
01716 length = find_fixedlength(last_branch);
01717 DPRINTF(("fixed length = %d\n", length));
01718 if (length < 0)
01719 {
01720 *errorptr = ERR25;
01721 *ptrptr = ptr;
01722 return FALSE;
01723 }
01724 reverse_count[0] = (length >> 8);
01725 reverse_count[1] = length & 255;
01726 }
01727
01728 /* Reached end of expression, either ')' or end of pattern. Insert a
01729 terminating ket and the length of the whole bracketed item, and return,
01730 leaving the pointer at the terminating char. If any of the ims options
01731 were changed inside the group, compile a resetting op-code following. */
01732
01733 if (*ptr != '|')
01734 {
01735 length = code - start_bracket;
01736 *code++ = OP_KET;
01737 *code++ = length >> 8;
01738 *code++ = length & 255;
01739 if (optchanged >= 0)
01740 {
01741 *code++ = OP_OPT;
01742 *code++ = oldoptions;
01743 }
01744 *codeptr = code;
01745 *ptrptr = ptr;
01746 return TRUE;
01747 }
01748
01749 /* Another branch follows; insert an "or" node and advance the pointer. */
01750
01751 *code = OP_ALT;
01752 last_branch = code;
01753 code += 3;
01754 ptr++;
01755 }
01756 /* Control never reaches here */
01757 }
01758
01759
01760
01761
01762 /*************************************************
01763 * Find first significant op code *
01764 *************************************************/
01765
01766 /* This is called by several functions that scan a compiled expression looking
01767 for a fixed first character, or an anchoring op code etc. It skips over things
01768 that do not influence this. For one application, a change of caseless option is
01769 important.
01770
01771 Arguments:
01772 code pointer to the start of the group
01773 options pointer to external options
01774 optbit the option bit whose changing is significant, or
01775 zero if none are
01776 optstop TRUE to return on option change, otherwise change the options
01777 value and continue
01778
01779 Returns: pointer to the first significant opcode
01780 */
01781
01782 static const uschar*
01783 first_significant_code(const uschar *code, int *options, int optbit,
01784 BOOL optstop)
01785 {
01786 for (;;)
01787 {
01788 switch ((int)*code)
01789 {
01790 case OP_OPT:
01791 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
01792 {
01793 if (optstop) return code;
01794 *options = (int)code[1];
01795 }
01796 code += 2;
01797 break;
01798
01799 case OP_CREF:
01800 code += 2;
01801 break;
01802
01803 case OP_WORD_BOUNDARY:
01804 case OP_NOT_WORD_BOUNDARY:
01805 code++;
01806 break;
01807
01808 case OP_ASSERT_NOT:
01809 case OP_ASSERTBACK:
01810 case OP_ASSERTBACK_NOT:
01811 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
01812 code += 3;
01813 break;
01814
01815 default:
01816 return code;
01817 }
01818 }
01819 /* Control never reaches here */
01820 }
01821
01822
01823
01824
01825 /*************************************************
01826 * Check for anchored expression *
01827 *************************************************/
01828
01829 /* Try to find out if this is an anchored regular expression. Consider each
01830 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
01831 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
01832 it's anchored. However, if this is a multiline pattern, then only OP_SOD
01833 counts, since OP_CIRC can match in the middle.
01834
01835 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
01836 because that will try the rest of the pattern at all possible matching points,
01837 so there is no point trying them again.
01838
01839 Arguments:
01840 code points to start of expression (the bracket)
01841 options points to the options setting
01842
01843 Returns: TRUE or FALSE
01844 */
01845
01846 static BOOL
01847 is_anchored(register const uschar *code, int *options)
01848 {
01849 do {
01850 const uschar *scode = first_significant_code(code + 3, options,
01851 PCRE_MULTILINE, FALSE);
01852 register int op = *scode;
01853 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
01854 { if (!is_anchored(scode, options)) return FALSE; }
01855 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
01856 (*options & PCRE_DOTALL) != 0)
01857 { if (scode[1] != OP_ANY) return FALSE; }
01858 else if (op != OP_SOD &&
01859 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
01860 return FALSE;
01861 code += (code[1] << 8) + code[2];
01862 }
01863 while (*code == OP_ALT);
01864 return TRUE;
01865 }
01866
01867
01868
01869 /*************************************************
01870 * Check for starting with ^ or .* *
01871 *************************************************/
01872
01873 /* This is called to find out if every branch starts with ^ or .* so that
01874 "first char" processing can be done to speed things up in multiline
01875 matching and for non-DOTALL patterns that start with .* (which must start at
01876 the beginning or after \n).
01877
01878 Argument: points to start of expression (the bracket)
01879 Returns: TRUE or FALSE
01880 */
01881
01882 static BOOL
01883 is_startline(const uschar *code)
01884 {
01885 do {
01886 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
01887 register int op = *scode;
01888 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
01889 { if (!is_startline(scode)) return FALSE; }
01890 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
01891 { if (scode[1] != OP_ANY) return FALSE; }
01892 else if (op != OP_CIRC) return FALSE;
01893 code += (code[1] << 8) + code[2];
01894 }
01895 while (*code == OP_ALT);
01896 return TRUE;
01897 }
01898
01899
01900
01901 /*************************************************
01902 * Check for fixed first char *
01903 *************************************************/
01904
01905 /* Try to find out if there is a fixed first character. This is called for
01906 unanchored expressions, as it speeds up their processing quite considerably.
01907 Consider each alternative branch. If they all start with the same char, or with
01908 a bracket all of whose alternatives start with the same char (recurse ad lib),
01909 then we return that char, otherwise -1.
01910
01911 Arguments:
01912 code points to start of expression (the bracket)
01913 options pointer to the options (used to check casing changes)
01914
01915 Returns: -1 or the fixed first char
01916 */
01917
01918 static int
01919 find_firstchar(const uschar *code, int *options)
01920 {
01921 register int c = -1;
01922 do {
01923 int d;
01924 const uschar *scode = first_significant_code(code + 3, options,
01925 PCRE_CASELESS, TRUE);
01926 register int op = *scode;
01927
01928 if (op >= OP_BRA) op = OP_BRA;
01929
01930 switch(op)
01931 {
01932 default:
01933 return -1;
01934
01935 case OP_BRA:
01936 case OP_ASSERT:
01937 case OP_ONCE:
01938 case OP_COND:
01939 if ((d = find_firstchar(scode, options)) < 0) return -1;
01940 if (c < 0) c = d; else if (c != d) return -1;
01941 break;
01942
01943 case OP_EXACT: /* Fall through */
01944 scode++;
01945
01946 case OP_CHARS: /* Fall through */
01947 scode++;
01948
01949 case OP_PLUS:
01950 case OP_MINPLUS:
01951 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
01952 break;
01953 }
01954
01955 code += (code[1] << 8) + code[2];
01956 }
01957 while (*code == OP_ALT);
01958 return c;
01959 }
01960
01961
01962
01963
01964
01965 /*************************************************
01966 * Compile a Regular Expression *
01967 *************************************************/
01968
01969 /* This function takes a string and returns a pointer to a block of store
01970 holding a compiled version of the expression.
01971
01972 Arguments:
01973 pattern the regular expression
01974 options various option bits
01975 errorptr pointer to pointer to error text
01976 erroroffset ptr offset in pattern where error was detected
01977 tables pointer to character tables or NULL
01978
01979 Returns: pointer to compiled data block, or NULL on error,
01980 with errorptr and erroroffset set
01981 */
01982
01983 pcre *
01984 vmdpcre_compile(const char *pattern, int options, const char **errorptr,
01985 int *erroroffset, const unsigned char *tables)
01986 {
01987 real_pcre *re;
01988 int length = 3; /* For initial BRA plus length */
01989 int runlength;
01990 int c, size;
01991 int bracount = 0;
01992 int top_backref = 0;
01993 int branch_extra = 0;
01994 int branch_newextra;
01995 unsigned int brastackptr = 0;
01996 uschar *code;
01997 const uschar *ptr;
01998 compile_data compile_block;
01999 int brastack[BRASTACK_SIZE];
02000 uschar bralenstack[BRASTACK_SIZE];
02001
02002 #ifdef DEBUG
02003 uschar *code_base, *code_end;
02004 #endif
02005
02006 /* We can't pass back an error message if errorptr is NULL; I guess the best we
02007 can do is just return NULL. */
02008
02009 if (errorptr == NULL) return NULL;
02010 *errorptr = NULL;
02011
02012 /* However, we can give a message for this error */
02013
02014 if (erroroffset == NULL)
02015 {
02016 *errorptr = ERR16;
02017 return NULL;
02018 }
02019 *erroroffset = 0;
02020
02021 if ((options & ~PUBLIC_OPTIONS) != 0)
02022 {
02023 *errorptr = ERR17;
02024 return NULL;
02025 }
02026
02027 /* Set up pointers to the individual character tables */
02028
02029 if (tables == NULL) tables = pcre_default_tables;
02030 compile_block.lcc = tables + lcc_offset;
02031 compile_block.fcc = tables + fcc_offset;
02032 compile_block.cbits = tables + cbits_offset;
02033 compile_block.ctypes = tables + ctypes_offset;
02034
02035 /* Reflect pattern for debugging output */
02036
02037 DPRINTF(("------------------------------------------------------------------\n"));
02038 DPRINTF(("%s\n", pattern));
02039
02040 /* The first thing to do is to make a pass over the pattern to compute the
02041 amount of store required to hold the compiled code. This does not have to be
02042 perfect as long as errors are overestimates. At the same time we can detect any
02043 internal flag settings. Make an attempt to correct for any counted white space
02044 if an "extended" flag setting appears late in the pattern. We can't be so
02045 clever for #-comments. */
02046
02047 ptr = (const uschar *)(pattern - 1);
02048 while ((c = *(++ptr)) != 0)
02049 {
02050 int min, max;
02051 int class_charcount;
02052
02053 if ((options & PCRE_EXTENDED) != 0)
02054 {
02055 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
02056 if (c == '#')
02057 {
02058 while ((c = *(++ptr)) != 0 && c != '\n');
02059 continue;
02060 }
02061 }
02062
02063 switch(c)
02064 {
02065 /* A backslashed item may be an escaped "normal" character or a
02066 character type. For a "normal" character, put the pointers and
02067 character back so that tests for whitespace etc. in the input
02068 are done correctly. */
02069
02070 case '\\':
02071 {
02072 const uschar *save_ptr = ptr;
02073 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
02074 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
02075 if (c >= 0)
02076 {
02077 ptr = save_ptr;
02078 c = '\\';
02079 goto NORMAL_CHAR;
02080 }
02081 }
02082 length++;
02083
02084 /* A back reference needs an additional char, plus either one or 5
02085 bytes for a repeat. We also need to keep the value of the highest
02086 back reference. */
02087
02088 if (c <= -ESC_REF)
02089 {
02090 int refnum = -c - ESC_REF;
02091 if (refnum > top_backref) top_backref = refnum;
02092 length++; /* For single back reference */
02093 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
02094 {
02095 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
02096 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
02097 if ((min == 0 && (max == 1 || max == -1)) ||
02098 (min == 1 && max == -1))
02099 length++;
02100 else length += 5;
02101 if (ptr[1] == '?') ptr++;
02102 }
02103 }
02104 continue;
02105
02106 case '^':
02107 case '.':
02108 case '$':
02109 case '*': /* These repeats won't be after brackets; */
02110 case '+': /* those are handled separately */
02111 case '?':
02112 length++;
02113 continue;
02114
02115 /* This covers the cases of repeats after a single char, metachar, class,
02116 or back reference. */
02117
02118 case '{':
02119 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
02120 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
02121 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
02122 if ((min == 0 && (max == 1 || max == -1)) ||
02123 (min == 1 && max == -1))
02124 length++;
02125 else
02126 {
02127 length--; /* Uncount the original char or metachar */
02128 if (min == 1) length++; else if (min > 0) length += 4;
02129 if (max > 0) length += 4; else length += 2;
02130 }
02131 if (ptr[1] == '?') ptr++;
02132 continue;
02133
02134 /* An alternation contains an offset to the next branch or ket. If any ims
02135 options changed in the previous branch(es), and/or if we are in a
02136 lookbehind assertion, extra space will be needed at the start of the
02137 branch. This is handled by branch_extra. */
02138
02139 case '|':
02140 length += 3 + branch_extra;
02141 continue;
02142
02143 /* A character class uses 33 characters. Don't worry about character types
02144 that aren't allowed in classes - they'll get picked up during the compile.
02145 A character class that contains only one character uses 2 or 3 bytes,
02146 depending on whether it is negated or not. Notice this where we can. */
02147
02148 case '[':
02149 class_charcount = 0;
02150 if (*(++ptr) == '^') ptr++;
02151 do
02152 {
02153 if (*ptr == '\\')
02154 {
02155 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
02156 &compile_block);
02157 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
02158 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
02159 }
02160 else class_charcount++;
02161 ptr++;
02162 }
02163 while (*ptr != 0 && *ptr != ']');
02164
02165 /* Repeats for negated single chars are handled by the general code */
02166
02167 if (class_charcount == 1) length += 3; else
02168 {
02169 length += 33;
02170
02171 /* A repeat needs either 1 or 5 bytes. */
02172
02173 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
02174 {
02175 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
02176 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
02177 if ((min == 0 && (max == 1 || max == -1)) ||
02178 (min == 1 && max == -1))
02179 length++;
02180 else length += 5;
02181 if (ptr[1] == '?') ptr++;
02182 }
02183 }
02184 continue;
02185
02186 /* Brackets may be genuine groups or special things */
02187
02188 case '(':
02189 branch_newextra = 0;
02190
02191 /* Handle special forms of bracket, which all start (? */
02192
02193 if (ptr[1] == '?')
02194 {
02195 int set, unset;
02196 int *optset;
02197
02198 switch (c = ptr[2])
02199 {
02200 /* Skip over comments entirely */
02201 case '#':
02202 ptr += 3;
02203 while (*ptr != 0 && *ptr != ')') ptr++;
02204 if (*ptr == 0)
02205 {
02206 *errorptr = ERR18;
02207 goto PCRE_ERROR_RETURN;
02208 }
02209 continue;
02210
02211 /* Non-referencing groups and lookaheads just move the pointer on, and
02212 then behave like a non-special bracket, except that they don't increment
02213 the count of extracting brackets. Ditto for the "once only" bracket,
02214 which is in Perl from version 5.005. */
02215
02216 case ':':
02217 case '=':
02218 case '!':
02219 case '>':
02220 ptr += 2;
02221 break;
02222
02223 /* Lookbehinds are in Perl from version 5.005 */
02224
02225 case '<':
02226 if (ptr[3] == '=' || ptr[3] == '!')
02227 {
02228 ptr += 3;
02229 branch_newextra = 3;
02230 length += 3; /* For the first branch */
02231 break;
02232 }
02233 *errorptr = ERR24;
02234 goto PCRE_ERROR_RETURN;
02235
02236 /* Conditionals are in Perl from version 5.005. The bracket must either
02237 be followed by a number (for bracket reference) or by an assertion
02238 group. */
02239
02240 case '(':
02241 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
02242 {
02243 ptr += 4;
02244 length += 2;
02245 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
02246 if (*ptr != ')')
02247 {
02248 *errorptr = ERR26;
02249 goto PCRE_ERROR_RETURN;
02250 }
02251 }
02252 else /* An assertion must follow */
02253 {
02254 ptr++; /* Can treat like ':' as far as spacing is concerned */
02255
02256 if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
02257 {
02258 ptr += 2; /* To get right offset in message */
02259 *errorptr = ERR28;
02260 goto PCRE_ERROR_RETURN;
02261 }
02262 }
02263 break;
02264
02265 /* Else loop checking valid options until ) is met. Anything else is an
02266 error. If we are without any brackets, i.e. at top level, the settings
02267 act as if specified in the options, so massage the options immediately.
02268 This is for backward compatibility with Perl 5.004. */
02269
02270 default:
02271 set = unset = 0;
02272 optset = &set;
02273 ptr += 2;
02274
02275 for (;; ptr++)
02276 {
02277 c = *ptr;
02278 switch (c)
02279 {
02280 case 'i':
02281 *optset |= PCRE_CASELESS;
02282 continue;
02283
02284 case 'm':
02285 *optset |= PCRE_MULTILINE;
02286 continue;
02287
02288 case 's':
02289 *optset |= PCRE_DOTALL;
02290 continue;
02291
02292 case 'x':
02293 *optset |= PCRE_EXTENDED;
02294 continue;
02295
02296 case 'X':
02297 *optset |= PCRE_EXTRA;
02298 continue;
02299
02300 case 'U':
02301 *optset |= PCRE_UNGREEDY;
02302 continue;
02303
02304 case '-':
02305 optset = &unset;
02306 continue;
02307
02308 /* A termination by ')' indicates an options-setting-only item;
02309 this is global at top level; otherwise nothing is done here and
02310 it is handled during the compiling process on a per-bracket-group
02311 basis. */
02312
02313 case ')':
02314 if (brastackptr == 0)
02315 {
02316 options = (options | set) & (~unset);
02317 set = unset = 0; /* To save length */
02318 }
02319 /* Fall through */
02320
02321 /* A termination by ':' indicates the start of a nested group with
02322 the given options set. This is again handled at compile time, but
02323 we must allow for compiled space if any of the ims options are
02324 set. We also have to allow for resetting space at the end of
02325 the group, which is why 4 is added to the length and not just 2.
02326 If there are several changes of options within the same group, this
02327 will lead to an over-estimate on the length, but this shouldn't
02328 matter very much. We also have to allow for resetting options at
02329 the start of any alternations, which we do by setting
02330 branch_newextra to 2. */
02331
02332 case ':':
02333 if (((set|unset) & PCRE_IMS) != 0)
02334 {
02335 length += 4;
02336 branch_newextra = 2;
02337 }
02338 goto END_OPTIONS;
02339
02340 /* Unrecognized option character */
02341
02342 default:
02343 *errorptr = ERR12;
02344 goto PCRE_ERROR_RETURN;
02345 }
02346 }
02347
02348 /* If we hit a closing bracket, that's it - this is a freestanding
02349 option-setting. We need to ensure that branch_extra is updated if
02350 necessary. The only values branch_newextra can have here are 0 or 2.
02351 If the value is 2, then branch_extra must either be 2 or 5, depending
02352 on whether this is a lookbehind group or not. */
02353
02354 END_OPTIONS:
02355 if (c == ')')
02356 {
02357 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
02358 branch_extra += branch_newextra;
02359 continue;
02360 }
02361
02362 /* If options were terminated by ':' control comes here. Fall through
02363 to handle the group below. */
02364 }
02365 }
02366
02367 /* Extracting brackets must be counted so we can process escapes in a
02368 Perlish way. */
02369
02370 else bracount++;
02371
02372 /* Non-special forms of bracket. Save length for computing whole length
02373 at end if there's a repeat that requires duplication of the group. Also
02374 save the current value of branch_extra, and start the new group with
02375 the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
02376 for a lookbehind assertion. */
02377
02378 if (brastackptr >= sizeof(brastack)/sizeof(int))
02379 {
02380 *errorptr = ERR19;
02381 goto PCRE_ERROR_RETURN;
02382 }
02383
02384 bralenstack[brastackptr] = branch_extra;
02385 branch_extra = branch_newextra;
02386
02387 brastack[brastackptr++] = length;
02388 length += 3;
02389 continue;
02390
02391 /* Handle ket. Look for subsequent max/min; for certain sets of values we
02392 have to replicate this bracket up to that many times. If brastackptr is
02393 0 this is an unmatched bracket which will generate an error, but take care
02394 not to try to access brastack[-1] when computing the length and restoring
02395 the branch_extra value. */
02396
02397 case ')':
02398 length += 3;
02399 {
02400 int minval = 1;
02401 int maxval = 1;
02402 int duplength;
02403
02404 if (brastackptr > 0)
02405 {
02406 duplength = length - brastack[--brastackptr];
02407 branch_extra = bralenstack[brastackptr];
02408 }
02409 else duplength = 0;
02410
02411 /* Leave ptr at the final char; for read_repeat_counts this happens
02412 automatically; for the others we need an increment. */
02413
02414 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
02415 {
02416 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
02417 &compile_block);
02418 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
02419 }
02420 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
02421 else if (c == '+') { maxval = -1; ptr++; }
02422 else if (c == '?') { minval = 0; ptr++; }
02423
02424 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
02425 group, and if the maximum is greater than zero, we have to replicate
02426 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
02427 bracket set - hence the 7. */
02428
02429 if (minval == 0)
02430 {
02431 length++;
02432 if (maxval > 0) length += (maxval - 1) * (duplength + 7);
02433 }
02434
02435 /* When the minimum is greater than zero, we have to replicate up to
02436 minval-1 times, with no additions required in the copies. Then, if
02437 there is a limited maximum we have to replicate up to maxval-1 times
02438 allowing for a BRAZERO item before each optional copy and nesting
02439 brackets for all but one of the optional copies. */
02440
02441 else
02442 {
02443 length += (minval - 1) * duplength;
02444 if (maxval > minval) /* Need this test as maxval=-1 means no limit */
02445 length += (maxval - minval) * (duplength + 7) - 6;
02446 }
02447 }
02448 continue;
02449
02450 /* Non-special character. For a run of such characters the length required
02451 is the number of characters + 2, except that the maximum run length is 255.
02452 We won't get a skipped space or a non-data escape or the start of a #
02453 comment as the first character, so the length can't be zero. */
02454
02455 NORMAL_CHAR:
02456 default:
02457 length += 2;
02458 runlength = 0;
02459 do
02460 {
02461 if ((options & PCRE_EXTENDED) != 0)
02462 {
02463 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
02464 if (c == '#')
02465 {
02466 while ((c = *(++ptr)) != 0 && c != '\n');
02467 continue;
02468 }
02469 }
02470
02471 /* Backslash may introduce a data char or a metacharacter; stop the
02472 string before the latter. */
02473
02474 if (c == '\\')
02475 {
02476 const uschar *saveptr = ptr;
02477 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
02478 &compile_block);
02479 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
02480 if (c < 0) { ptr = saveptr; break; }
02481 }
02482
02483 /* Ordinary character or single-char escape */
02484
02485 runlength++;
02486 }
02487
02488 /* This "while" is the end of the "do" above. */
02489
02490 while (runlength < 255 &&
02491 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
02492
02493 ptr--;
02494 length += runlength;
02495 continue;
02496 }
02497 }
02498
02499 length += 4; /* For final KET and END */
02500
02501 if (length > 65539)
02502 {
02503 *errorptr = ERR20;
02504 return NULL;
02505 }
02506
02507 /* Compute the size of data block needed and get it, either from malloc or
02508 externally provided function. We specify "code[0]" in the offsetof() expression
02509 rather than just "code", because it has been reported that one broken compiler
02510 fails on "code" because it is also an independent variable. It should make no
02511 difference to the value of the offsetof(). */
02512
02513 size = length + offsetof(real_pcre, code[0]);
02514 re = (real_pcre *)(vmdpcre_malloc)(size);
02515
02516 if (re == NULL)
02517 {
02518 *errorptr = ERR21;
02519 return NULL;
02520 }
02521
02522 /* Put in the magic number and the options. */
02523
02524 re->magic_number = MAGIC_NUMBER;
02525 re->options = options;
02526 re->tables = tables;
02527
02528 /* Set up a starting, non-extracting bracket, then compile the expression. On
02529 error, *errorptr will be set non-NULL, so we don't need to look at the result
02530 of the function here. */
02531
02532 ptr = (const uschar *)pattern;
02533 code = re->code;
02534 *code = OP_BRA;
02535 bracount = 0;
02536 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
02537 &compile_block);
02538 re->top_bracket = bracount;
02539 re->top_backref = top_backref;
02540
02541 /* If not reached end of pattern on success, there's an excess bracket. */
02542
02543 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
02544
02545 /* Fill in the terminating state and check for disastrous overflow, but
02546 if debugging, leave the test till after things are printed out. */
02547
02548 *code++ = OP_END;
02549
02550 #ifndef DEBUG
02551 if (code - re->code > length) *errorptr = ERR23;
02552 #endif
02553
02554 /* Give an error if there's back reference to a non-existent capturing
02555 subpattern. */
02556
02557 if (top_backref > re->top_bracket) *errorptr = ERR15;
02558
02559 /* Failed to compile */
02560
02561 if (*errorptr != NULL)
02562 {
02563 (vmdpcre_free)(re);
02564 PCRE_ERROR_RETURN:
02565 *erroroffset = ptr - (const uschar *)pattern;
02566 return NULL;
02567 }
02568
02569 /* If the anchored option was not passed, set flag if we can determine that the
02570 pattern is anchored by virtue of ^ characters or \A or anything else (such as
02571 starting with .* when DOTALL is set).
02572
02573 Otherwise, see if we can determine what the first character has to be, because
02574 that speeds up unanchored matches no end. If not, see if we can set the
02575 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
02576 start with ^. and also when all branches start with .* for non-DOTALL matches.
02577 */
02578
02579 if ((options & PCRE_ANCHORED) == 0)
02580 {
02581 int temp_options = options;
02582 if (is_anchored(re->code, &temp_options))
02583 re->options |= PCRE_ANCHORED;
02584 else
02585 {
02586 int ch = find_firstchar(re->code, &temp_options);
02587 if (ch >= 0)
02588 {
02589 re->first_char = ch;
02590 re->options |= PCRE_FIRSTSET;
02591 }
02592 else if (is_startline(re->code))
02593 re->options |= PCRE_STARTLINE;
02594 }
02595 }
02596
02597 /* Print out the compiled data for debugging */
02598
02599 #ifdef DEBUG
02600
02601 printf("Length = %d top_bracket = %d top_backref = %d\n",
02602 length, re->top_bracket, re->top_backref);
02603
02604 if (re->options != 0)
02605 {
02606 printf("%s%s%s%s%s%s%s%s\n",
02607 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
02608 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
02609 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
02610 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
02611 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
02612 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
02613 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
02614 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
02615 }
02616
02617 if ((re->options & PCRE_FIRSTSET) != 0)
02618 {
02619 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
02620 else printf("First char = \\x%02x\n", re->first_char);
02621 }
02622
02623 code_end = code;
02624 code_base = code = re->code;
02625
02626 while (code < code_end)
02627 {
02628 int charlength;
02629
02630 printf("%3d ", code - code_base);
02631
02632 if (*code >= OP_BRA)
02633 {
02634 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
02635 code += 2;
02636 }
02637
02638 else switch(*code)
02639 {
02640 case OP_OPT:
02641 printf(" %.2x %s", code[1], OP_names[*code]);
02642 code++;
02643 break;
02644
02645 case OP_COND:
02646 printf("%3d Cond", (code[1] << 8) + code[2]);
02647 code += 2;
02648 break;
02649
02650 case OP_CREF:
02651 printf(" %.2d %s", code[1], OP_names[*code]);
02652 code++;
02653 break;
02654
02655 case OP_CHARS:
02656 charlength = *(++code);
02657 printf("%3d ", charlength);
02658 while (charlength-- > 0)
02659 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
02660 break;
02661
02662 case OP_KETRMAX:
02663 case OP_KETRMIN:
02664 case OP_ALT:
02665 case OP_KET:
02666 case OP_ASSERT:
02667 case OP_ASSERT_NOT:
02668 case OP_ASSERTBACK:
02669 case OP_ASSERTBACK_NOT:
02670 case OP_ONCE:
02671 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
02672 code += 2;
02673 break;
02674
02675 case OP_REVERSE:
02676 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
02677 code += 2;
02678 break;
02679
02680 case OP_STAR:
02681 case OP_MINSTAR:
02682 case OP_PLUS:
02683 case OP_MINPLUS:
02684 case OP_QUERY:
02685 case OP_MINQUERY:
02686 case OP_TYPESTAR:
02687 case OP_TYPEMINSTAR:
02688 case OP_TYPEPLUS:
02689 case OP_TYPEMINPLUS:
02690 case OP_TYPEQUERY:
02691 case OP_TYPEMINQUERY:
02692 if (*code >= OP_TYPESTAR)
02693 printf(" %s", OP_names[code[1]]);
02694 else if (isprint(c = code[1])) printf(" %c", c);
02695 else printf(" \\x%02x", c);
02696 printf("%s", OP_names[*code++]);
02697 break;
02698
02699 case OP_EXACT:
02700 case OP_UPTO:
02701 case OP_MINUPTO:
02702 if (isprint(c = code[3])) printf(" %c{", c);
02703 else printf(" \\x%02x{", c);
02704 if (*code != OP_EXACT) printf("0,");
02705 printf("%d}", (code[1] << 8) + code[2]);
02706 if (*code == OP_MINUPTO) printf("?");
02707 code += 3;
02708 break;
02709
02710 case OP_TYPEEXACT:
02711 case OP_TYPEUPTO:
02712 case OP_TYPEMINUPTO:
02713 printf(" %s{", OP_names[code[3]]);
02714 if (*code != OP_TYPEEXACT) printf(",");
02715 printf("%d}", (code[1] << 8) + code[2]);
02716 if (*code == OP_TYPEMINUPTO) printf("?");
02717 code += 3;
02718 break;
02719
02720 case OP_NOT:
02721 if (isprint(c = *(++code))) printf(" [^%c]", c);
02722 else printf(" [^\\x%02x]", c);
02723 break;
02724
02725 case OP_NOTSTAR:
02726 case OP_NOTMINSTAR:
02727 case OP_NOTPLUS:
02728 case OP_NOTMINPLUS:
02729 case OP_NOTQUERY:
02730 case OP_NOTMINQUERY:
02731 if (isprint(c = code[1])) printf(" [^%c]", c);
02732 else printf(" [^\\x%02x]", c);
02733 printf("%s", OP_names[*code++]);
02734 break;
02735
02736 case OP_NOTEXACT:
02737 case OP_NOTUPTO:
02738 case OP_NOTMINUPTO:
02739 if (isprint(c = code[3])) printf(" [^%c]{", c);
02740 else printf(" [^\\x%02x]{", c);
02741 if (*code != OP_NOTEXACT) printf(",");
02742 printf("%d}", (code[1] << 8) + code[2]);
02743 if (*code == OP_NOTMINUPTO) printf("?");
02744 code += 3;
02745 break;
02746
02747 case OP_REF:
02748 printf(" \\%d", *(++code));
02749 code ++;
02750 goto CLASS_REF_REPEAT;
02751
02752 case OP_CLASS:
02753 {
02754 int i, min, max;
02755 code++;
02756 printf(" [");
02757
02758 for (i = 0; i < 256; i++)
02759 {
02760 if ((code[i/8] & (1 << (i&7))) != 0)
02761 {
02762 int j;
02763 for (j = i+1; j < 256; j++)
02764 if ((code[j/8] & (1 << (j&7))) == 0) break;
02765 if (i == '-' || i == ']') printf("\\");
02766 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
02767 if (--j > i)
02768 {
02769 printf("-");
02770 if (j == '-' || j == ']') printf("\\");
02771 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
02772 }
02773 i = j;
02774 }
02775 }
02776 printf("]");
02777 code += 32;
02778
02779 CLASS_REF_REPEAT:
02780
02781 switch(*code)
02782 {
02783 case OP_CRSTAR:
02784 case OP_CRMINSTAR:
02785 case OP_CRPLUS:
02786 case OP_CRMINPLUS:
02787 case OP_CRQUERY:
02788 case OP_CRMINQUERY:
02789 printf("%s", OP_names[*code]);
02790 break;
02791
02792 case OP_CRRANGE:
02793 case OP_CRMINRANGE:
02794 min = (code[1] << 8) + code[2];
02795 max = (code[3] << 8) + code[4];
02796 if (max == 0) printf("{%d,}", min);
02797 else printf("{%d,%d}", min, max);
02798 if (*code == OP_CRMINRANGE) printf("?");
02799 code += 4;
02800 break;
02801
02802 default:
02803 code--;
02804 }
02805 }
02806 break;
02807
02808 /* Anything else is just a one-node item */
02809
02810 default:
02811 printf(" %s", OP_names[*code]);
02812 break;
02813 }
02814
02815 code++;
02816 printf("\n");
02817 }
02818 printf("------------------------------------------------------------------\n");
02819
02820 /* This check is done here in the debugging case so that the code that
02821 was compiled can be seen. */
02822
02823 if (code - re->code > length)
02824 {
02825 *errorptr = ERR23;
02826 (vmdpcre_free)(re);
02827 *erroroffset = ptr - (uschar *)pattern;
02828 return NULL;
02829 }
02830 #endif
02831
02832 return (pcre *)re;
02833 }
02834
02835
02836
02837 /*************************************************
02838 * Match a back-reference *
02839 *************************************************/
02840
02841 /* If a back reference hasn't been set, the length that is passed is greater
02842 than the number of characters left in the string, so the match fails.
02843
02844 Arguments:
02845 offset index into the offset vector
02846 eptr points into the subject
02847 length length to be matched
02848 md points to match data block
02849 ims the ims flags
02850
02851 Returns: TRUE if matched
02852 */
02853
02854 static BOOL
02855 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
02856 int ims)
02857 {
02858 const uschar *p = md->start_subject + md->offset_vector[offset];
02859
02860 #ifdef DEBUG
02861 if (eptr >= md->end_subject)
02862 printf("matching subject <null>");
02863 else
02864 {
02865 printf("matching subject ");
02866 pchars(eptr, length, TRUE, md);
02867 }
02868 printf(" against backref ");
02869 pchars(p, length, FALSE, md);
02870 printf("\n");
02871 #endif
02872
02873 /* Always fail if not enough characters left */
02874
02875 if (length > md->end_subject - eptr) return FALSE;
02876
02877 /* Separate the caselesss case for speed */
02878
02879 if ((ims & PCRE_CASELESS) != 0)
02880 {
02881 while (length-- > 0)
02882 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
02883 }
02884 else
02885 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
02886
02887 return TRUE;
02888 }
02889
02890
02891
02892 /*************************************************
02893 * Match from current position *
02894 *************************************************/
02895
02896 /* On entry ecode points to the first opcode, and eptr to the first character
02897 in the subject string, while eptrb holds the value of eptr at the start of the
02898 last bracketed group - used for breaking infinite loops matching zero-length
02899 strings.
02900
02901 Arguments:
02902 eptr pointer in subject
02903 ecode position in code
02904 offset_top current top pointer
02905 md pointer to "static" info for the match
02906 ims current /i, /m, and /s options
02907 condassert TRUE if called to check a condition assertion
02908 eptrb eptr at start of last bracket
02909
02910 Returns: TRUE if matched
02911 */
02912
02913 static BOOL
02914 match(register const uschar *eptr, register const uschar *ecode,
02915 int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)
02916 {
02917 int original_ims = ims; /* Save for resetting on ')' */
02918
02919 for (;;)
02920 {
02921 int op = (int)*ecode;
02922 int min, max, ctype;
02923 register int i;
02924 register int c;
02925 BOOL minimize = FALSE;
02926
02927 /* Opening capturing bracket. If there is space in the offset vector, save
02928 the current subject position in the working slot at the top of the vector. We
02929 mustn't change the current values of the data slot, because they may be set
02930 from a previous iteration of this group, and be referred to by a reference
02931 inside the group.
02932
02933 If the bracket fails to match, we need to restore this value and also the
02934 values of the final offsets, in case they were set by a previous iteration of
02935 the same bracket.
02936
02937 If there isn't enough space in the offset vector, treat this as if it were a
02938 non-capturing bracket. Don't worry about setting the flag for the error case
02939 here; that is handled in the code for KET. */
02940
02941 if (op > OP_BRA)
02942 {
02943 int number = op - OP_BRA;
02944 int offset = number << 1;
02945
02946 #ifdef DEBUG
02947 printf("start bracket %d subject=", number);
02948 pchars(eptr, 16, TRUE, md);
02949 printf("\n");
02950 #endif
02951
02952 if (offset < md->offset_max)
02953 {
02954 int save_offset1 = md->offset_vector[offset];
02955 int save_offset2 = md->offset_vector[offset+1];
02956 int save_offset3 = md->offset_vector[md->offset_end - number];
02957
02958 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
02959 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
02960
02961 do
02962 {
02963 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
02964 ecode += (ecode[1] << 8) + ecode[2];
02965 }
02966 while (*ecode == OP_ALT);
02967
02968 DPRINTF(("bracket %d failed\n", number));
02969
02970 md->offset_vector[offset] = save_offset1;
02971 md->offset_vector[offset+1] = save_offset2;
02972 md->offset_vector[md->offset_end - number] = save_offset3;
02973 return FALSE;
02974 }
02975
02976 /* Insufficient room for saving captured contents */
02977
02978 else op = OP_BRA;
02979 }
02980
02981 /* Other types of node can be handled by a switch */
02982
02983 switch(op)
02984 {
02985 case OP_BRA: /* Non-capturing bracket: optimized */
02986 DPRINTF(("start bracket 0\n"));
02987 do
02988 {
02989 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
02990 ecode += (ecode[1] << 8) + ecode[2];
02991 }
02992 while (*ecode == OP_ALT);
02993 DPRINTF(("bracket 0 failed\n"));
02994 return FALSE;
02995
02996 /* Conditional group: compilation checked that there are no more than
02997 two branches. If the condition is false, skipping the first branch takes us
02998 past the end if there is only one branch, but that's OK because that is
02999 exactly what going to the ket would do. */
03000
03001 case OP_COND:
03002 if (ecode[3] == OP_CREF) /* Condition is extraction test */
03003 {
03004 int offset = ecode[4] << 1; /* Doubled reference number */
03005 return match(eptr,
03006 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
03007 5 : 3 + (ecode[1] << 8) + ecode[2]),
03008 offset_top, md, ims, FALSE, eptr);
03009 }
03010
03011 /* The condition is an assertion. Call match() to evaluate it - setting
03012 the final argument TRUE causes it to stop at the end of an assertion. */
03013
03014 else
03015 {
03016 if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
03017 {
03018 ecode += 3 + (ecode[4] << 8) + ecode[5];
03019 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
03020 }
03021 else ecode += (ecode[1] << 8) + ecode[2];
03022 return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
03023 }
03024 /* Control never reaches here */
03025
03026 /* Skip over conditional reference data if encountered (should not be) */
03027
03028 case OP_CREF:
03029 ecode += 2;
03030 break;
03031
03032 /* End of the pattern */
03033
03034 case OP_END:
03035 md->end_match_ptr = eptr; /* Record where we ended */
03036 md->end_offset_top = offset_top; /* and how many extracts were taken */
03037 return TRUE;
03038
03039 /* Change option settings */
03040
03041 case OP_OPT:
03042 ims = ecode[1];
03043 ecode += 2;
03044 DPRINTF(("ims set to %02x\n", ims));
03045 break;
03046
03047 /* Assertion brackets. Check the alternative branches in turn - the
03048 matching won't pass the KET for an assertion. If any one branch matches,
03049 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
03050 start of each branch to move the current point backwards, so the code at
03051 this level is identical to the lookahead case. */
03052
03053 case OP_ASSERT:
03054 case OP_ASSERTBACK:
03055 do
03056 {
03057 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
03058 ecode += (ecode[1] << 8) + ecode[2];
03059 }
03060 while (*ecode == OP_ALT);
03061 if (*ecode == OP_KET) return FALSE;
03062
03063 /* If checking an assertion for a condition, return TRUE. */
03064
03065 if (condassert) return TRUE;
03066
03067 /* Continue from after the assertion, updating the offsets high water
03068 mark, since extracts may have been taken during the assertion. */
03069
03070 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
03071 ecode += 3;
03072 offset_top = md->end_offset_top;
03073 continue;
03074
03075 /* Negative assertion: all branches must fail to match */
03076
03077 case OP_ASSERT_NOT:
03078 case OP_ASSERTBACK_NOT:
03079 do
03080 {
03081 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
03082 ecode += (ecode[1] << 8) + ecode[2];
03083 }
03084 while (*ecode == OP_ALT);
03085
03086 if (condassert) return TRUE;
03087 ecode += 3;
03088 continue;
03089
03090 /* Move the subject pointer back. This occurs only at the start of
03091 each branch of a lookbehind assertion. If we are too close to the start to
03092 move back, this match function fails. */
03093
03094 case OP_REVERSE:
03095 eptr -= (ecode[1] << 8) + ecode[2];
03096 if (eptr < md->start_subject) return FALSE;
03097 ecode += 3;
03098 break;
03099
03100
03101 /* "Once" brackets are like assertion brackets except that after a match,
03102 the point in the subject string is not moved back. Thus there can never be
03103 a move back into the brackets. Check the alternative branches in turn - the
03104 matching won't pass the KET for this kind of subpattern. If any one branch
03105 matches, we carry on as at the end of a normal bracket, leaving the subject
03106 pointer. */
03107
03108 case OP_ONCE:
03109 {
03110 const uschar *prev = ecode;
03111
03112 do
03113 {
03114 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
03115 ecode += (ecode[1] << 8) + ecode[2];
03116 }
03117 while (*ecode == OP_ALT);
03118
03119 /* If hit the end of the group (which could be repeated), fail */
03120
03121 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
03122
03123 /* Continue as from after the assertion, updating the offsets high water
03124 mark, since extracts may have been taken. */
03125
03126 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
03127
03128 offset_top = md->end_offset_top;
03129 eptr = md->end_match_ptr;
03130
03131 /* For a non-repeating ket, just continue at this level. This also
03132 happens for a repeating ket if no characters were matched in the group.
03133 This is the forcible breaking of infinite loops as implemented in Perl
03134 5.005. If there is an options reset, it will get obeyed in the normal
03135 course of events. */
03136
03137 if (*ecode == OP_KET || eptr == eptrb)
03138 {
03139 ecode += 3;
03140 break;
03141 }
03142
03143 /* The repeating kets try the rest of the pattern or restart from the
03144 preceding bracket, in the appropriate order. We need to reset any options
03145 that changed within the bracket before re-running it, so check the next
03146 opcode. */
03147
03148 if (ecode[3] == OP_OPT)
03149 {
03150 ims = (ims & ~PCRE_IMS) | ecode[4];
03151 DPRINTF(("ims set to %02x at group repeat\n", ims));
03152 }
03153
03154 if (*ecode == OP_KETRMIN)
03155 {
03156 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
03157 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
03158 }
03159 else /* OP_KETRMAX */
03160 {
03161 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
03162 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
03163 }
03164 }
03165 return FALSE;
03166
03167 /* An alternation is the end of a branch; scan along to find the end of the
03168 bracketed group and go to there. */
03169
03170 case OP_ALT:
03171 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
03172 break;
03173
03174 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
03175 that it may occur zero times. It may repeat infinitely, or not at all -
03176 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
03177 repeat limits are compiled as a number of copies, with the optional ones
03178 preceded by BRAZERO or BRAMINZERO. */
03179
03180 case OP_BRAZERO:
03181 {
03182 const uschar *next = ecode+1;
03183 if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
03184 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
03185 ecode = next + 3;
03186 }
03187 break;
03188
03189 case OP_BRAMINZERO:
03190 {
03191 const uschar *next = ecode+1;
03192 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
03193 if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
03194 ecode++;
03195 }
03196 break;
03197
03198 /* End of a group, repeated or non-repeating. If we are at the end of
03199 an assertion "group", stop matching and return TRUE, but record the
03200 current high water mark for use by positive assertions. Do this also
03201 for the "once" (not-backing-up) groups. */
03202
03203 case OP_KET:
03204 case OP_KETRMIN:
03205 case OP_KETRMAX:
03206 {
03207 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
03208
03209 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
03210 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
03211 *prev == OP_ONCE)
03212 {
03213 md->end_match_ptr = eptr; /* For ONCE */
03214 md->end_offset_top = offset_top;
03215 return TRUE;
03216 }
03217
03218 /* In all other cases except a conditional group we have to check the
03219 group number back at the start and if necessary complete handling an
03220 extraction by setting the offsets and bumping the high water mark. */
03221
03222 if (*prev != OP_COND)
03223 {
03224 int number = *prev - OP_BRA;
03225 int offset = number << 1;
03226
03227 DPRINTF(("end bracket %d\n", number));
03228
03229 if (number > 0)
03230 {
03231 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
03232 {
03233 md->offset_vector[offset] =
03234 md->offset_vector[md->offset_end - number];
03235 md->offset_vector[offset+1] = eptr - md->start_subject;
03236 if (offset_top <= offset) offset_top = offset + 2;
03237 }
03238 }
03239 }
03240
03241 /* Reset the value of the ims flags, in case they got changed during
03242 the group. */
03243
03244 ims = original_ims;
03245 DPRINTF(("ims reset to %02x\n", ims));
03246
03247 /* For a non-repeating ket, just continue at this level. This also
03248 happens for a repeating ket if no characters were matched in the group.
03249 This is the forcible breaking of infinite loops as implemented in Perl
03250 5.005. If there is an options reset, it will get obeyed in the normal
03251 course of events. */
03252
03253 if (*ecode == OP_KET || eptr == eptrb)
03254 {
03255 ecode += 3;
03256 break;
03257 }
03258
03259 /* The repeating kets try the rest of the pattern or restart from the
03260 preceding bracket, in the appropriate order. */
03261
03262 if (*ecode == OP_KETRMIN)
03263 {
03264 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
03265 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
03266 }
03267 else /* OP_KETRMAX */
03268 {
03269 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
03270 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
03271 }
03272 }
03273 return FALSE;
03274
03275 /* Start of subject unless notbol, or after internal newline if multiline */
03276
03277 case OP_CIRC:
03278 if (md->notbol && eptr == md->start_subject) return FALSE;
03279 if ((ims & PCRE_MULTILINE) != 0)
03280 {
03281 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
03282 ecode++;
03283 break;
03284 }
03285 /* ... else fall through */
03286
03287 /* Start of subject assertion */
03288
03289 case OP_SOD:
03290 if (eptr != md->start_subject) return FALSE;
03291 ecode++;
03292 break;
03293
03294 /* Assert before internal newline if multiline, or before a terminating
03295 newline unless endonly is set, else end of subject unless noteol is set. */
03296
03297 case OP_DOLL:
03298 if ((ims & PCRE_MULTILINE) != 0)
03299 {
03300 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
03301 else { if (md->noteol) return FALSE; }
03302 ecode++;
03303 break;
03304 }
03305 else
03306 {
03307 if (md->noteol) return FALSE;
03308 if (!md->endonly)
03309 {
03310 if (eptr < md->end_subject - 1 ||
03311 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
03312
03313 ecode++;
03314 break;
03315 }
03316 }
03317 /* ... else fall through */
03318
03319 /* End of subject assertion (\z) */
03320
03321 case OP_EOD:
03322 if (eptr < md->end_subject) return FALSE;
03323 ecode++;
03324 break;
03325
03326 /* End of subject or ending \n assertion (\Z) */
03327
03328 case OP_EODN:
03329 if (eptr < md->end_subject - 1 ||
03330 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
03331 ecode++;
03332 break;
03333
03334 /* Word boundary assertions */
03335
03336 case OP_NOT_WORD_BOUNDARY:
03337 case OP_WORD_BOUNDARY:
03338 {
03339 BOOL prev_is_word = (eptr != md->start_subject) &&
03340 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
03341 BOOL cur_is_word = (eptr < md->end_subject) &&
03342 ((md->ctypes[*eptr] & ctype_word) != 0);
03343 if ((*ecode++ == OP_WORD_BOUNDARY)?
03344 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
03345 return FALSE;
03346 }
03347 break;
03348
03349 /* Match a single character type; inline for speed */
03350
03351 case OP_ANY:
03352 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
03353 return FALSE;
03354 if (eptr++ >= md->end_subject) return FALSE;
03355 ecode++;
03356 break;
03357
03358 case OP_NOT_DIGIT:
03359 if (eptr >= md->end_subject ||
03360 (md->ctypes[*eptr++] & ctype_digit) != 0)
03361 return FALSE;
03362 ecode++;
03363 break;
03364
03365 case OP_DIGIT:
03366 if (eptr >= md->end_subject ||
03367 (md->ctypes[*eptr++] & ctype_digit) == 0)
03368 return FALSE;
03369 ecode++;
03370 break;
03371
03372 case OP_NOT_WHITESPACE:
03373 if (eptr >= md->end_subject ||
03374 (md->ctypes[*eptr++] & ctype_space) != 0)
03375 return FALSE;
03376 ecode++;
03377 break;
03378
03379 case OP_WHITESPACE:
03380 if (eptr >= md->end_subject ||
03381 (md->ctypes[*eptr++] & ctype_space) == 0)
03382 return FALSE;
03383 ecode++;
03384 break;
03385
03386 case OP_NOT_WORDCHAR:
03387 if (eptr >= md->end_subject ||
03388 (md->ctypes[*eptr++] & ctype_word) != 0)
03389 return FALSE;
03390 ecode++;
03391 break;
03392
03393 case OP_WORDCHAR:
03394 if (eptr >= md->end_subject ||
03395 (md->ctypes[*eptr++] & ctype_word) == 0)
03396 return FALSE;
03397 ecode++;
03398 break;
03399
03400 /* Match a back reference, possibly repeatedly. Look past the end of the
03401 item to see if there is repeat information following. The code is similar
03402 to that for character classes, but repeated for efficiency. Then obey
03403 similar code to character type repeats - written out again for speed.
03404 However, if the referenced string is the empty string, always treat
03405 it as matched, any number of times (otherwise there could be infinite
03406 loops). */
03407
03408 case OP_REF:
03409 {
03410 int length;
03411 int offset = ecode[1] << 1; /* Doubled reference number */
03412 ecode += 2; /* Advance past the item */
03413
03414 /* If the reference is unset, set the length to be longer than the amount
03415 of subject left; this ensures that every attempt at a match fails. We
03416 can't just fail here, because of the possibility of quantifiers with zero
03417 minima. */
03418
03419 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
03420 md->end_subject - eptr + 1 :
03421 md->offset_vector[offset+1] - md->offset_vector[offset];
03422
03423 /* Set up for repetition, or handle the non-repeated case */
03424
03425 switch (*ecode)
03426 {
03427 case OP_CRSTAR:
03428 case OP_CRMINSTAR:
03429 case OP_CRPLUS:
03430 case OP_CRMINPLUS:
03431 case OP_CRQUERY:
03432 case OP_CRMINQUERY:
03433 c = *ecode++ - OP_CRSTAR;
03434 minimize = (c & 1) != 0;
03435 min = rep_min[c]; /* Pick up values from tables; */
03436 max = rep_max[c]; /* zero for max => infinity */
03437 if (max == 0) max = INT_MAX;
03438 break;
03439
03440 case OP_CRRANGE:
03441 case OP_CRMINRANGE:
03442 minimize = (*ecode == OP_CRMINRANGE);
03443 min = (ecode[1] << 8) + ecode[2];
03444 max = (ecode[3] << 8) + ecode[4];
03445 if (max == 0) max = INT_MAX;
03446 ecode += 5;
03447 break;
03448
03449 default: /* No repeat follows */
03450 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
03451 eptr += length;
03452 continue; /* With the main loop */
03453 }
03454
03455 /* If the length of the reference is zero, just continue with the
03456 main loop. */
03457
03458 if (length == 0) continue;
03459
03460 /* First, ensure the minimum number of matches are present. We get back
03461 the length of the reference string explicitly rather than passing the
03462 address of eptr, so that eptr can be a register variable. */
03463
03464 for (i = 1; i <= min; i++)
03465 {
03466 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
03467 eptr += length;
03468 }
03469
03470 /* If min = max, continue at the same level without recursion.
03471 They are not both allowed to be zero. */
03472
03473 if (min == max) continue;
03474
03475 /* If minimizing, keep trying and advancing the pointer */
03476
03477 if (minimize)
03478 {
03479 for (i = min;; i++)
03480 {
03481 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
03482 return TRUE;
03483 if (i >= max || !match_ref(offset, eptr, length, md, ims))
03484 return FALSE;
03485 eptr += length;
03486 }
03487 /* Control never gets here */
03488 }
03489
03490 /* If maximizing, find the longest string and work backwards */
03491
03492 else
03493 {
03494 const uschar *pp = eptr;
03495 for (i = min; i < max; i++)
03496 {
03497 if (!match_ref(offset, eptr, length, md, ims)) break;
03498 eptr += length;
03499 }
03500 while (eptr >= pp)
03501 {
03502 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
03503 return TRUE;
03504 eptr -= length;
03505 }
03506 return FALSE;
03507 }
03508 }
03509 /* Control never gets here */
03510
03511
03512
03513 /* Match a character class, possibly repeatedly. Look past the end of the
03514 item to see if there is repeat information following. Then obey similar
03515 code to character type repeats - written out again for speed. */
03516
03517 case OP_CLASS:
03518 {
03519 const uschar *data = ecode + 1; /* Save for matching */
03520 ecode += 33; /* Advance past the item */
03521
03522 switch (*ecode)
03523 {
03524 case OP_CRSTAR:
03525 case OP_CRMINSTAR:
03526 case OP_CRPLUS:
03527 case OP_CRMINPLUS:
03528 case OP_CRQUERY:
03529 case OP_CRMINQUERY:
03530 c = *ecode++ - OP_CRSTAR;
03531 minimize = (c & 1) != 0;
03532 min = rep_min[c]; /* Pick up values from tables; */
03533 max = rep_max[c]; /* zero for max => infinity */
03534 if (max == 0) max = INT_MAX;
03535 break;
03536
03537 case OP_CRRANGE:
03538 case OP_CRMINRANGE:
03539 minimize = (*ecode == OP_CRMINRANGE);
03540 min = (ecode[1] << 8) + ecode[2];
03541 max = (ecode[3] << 8) + ecode[4];
03542 if (max == 0) max = INT_MAX;
03543 ecode += 5;
03544 break;
03545
03546 default: /* No repeat follows */
03547 min = max = 1;
03548 break;
03549 }
03550
03551 /* First, ensure the minimum number of matches are present. */
03552
03553 for (i = 1; i <= min; i++)
03554 {
03555 if (eptr >= md->end_subject) return FALSE;
03556 c = *eptr++;
03557 if ((data[c/8] & (1 << (c&7))) != 0) continue;
03558 return FALSE;
03559 }
03560
03561 /* If max == min we can continue with the main loop without the
03562 need to recurse. */
03563
03564 if (min == max) continue;
03565
03566 /* If minimizing, keep testing the rest of the expression and advancing
03567 the pointer while it matches the class. */
03568
03569 if (minimize)
03570 {
03571 for (i = min;; i++)
03572 {
03573 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
03574 return TRUE;
03575 if (i >= max || eptr >= md->end_subject) return FALSE;
03576 c = *eptr++;
03577 if ((data[c/8] & (1 << (c&7))) != 0) continue;
03578 return FALSE;
03579 }
03580 /* Control never gets here */
03581 }
03582
03583 /* If maximizing, find the longest possible run, then work backwards. */
03584
03585 else
03586 {
03587 const uschar *pp = eptr;
03588 for (i = min; i < max; eptr++, i++)
03589 {
03590 if (eptr >= md->end_subject) break;
03591 c = *eptr;
03592 if ((data[c/8] & (1 << (c&7))) != 0) continue;
03593 break;
03594 }
03595
03596 while (eptr >= pp)
03597 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
03598 return TRUE;
03599 return FALSE;
03600 }
03601 }
03602 /* Control never gets here */
03603
03604 /* Match a run of characters */
03605
03606 case OP_CHARS:
03607 {
03608 register int length = ecode[1];
03609 ecode += 2;
03610
03611 #ifdef DEBUG /* Sigh. Some compilers never learn. */
03612 if (eptr >= md->end_subject)
03613 printf("matching subject <null> against pattern ");
03614 else
03615 {
03616 printf("matching subject ");
03617 pchars(eptr, length, TRUE, md);
03618 printf(" against pattern ");
03619 }
03620 pchars(ecode, length, FALSE, md);
03621 printf("\n");
03622 #endif
03623
03624 if (length > md->end_subject - eptr) return FALSE;
03625 if ((ims & PCRE_CASELESS) != 0)
03626 {
03627 while (length-- > 0)
03628 if (md->lcc[*ecode++] != md->lcc[*eptr++])
03629 return FALSE;
03630 }
03631 else
03632 {
03633 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
03634 }
03635 }
03636 break;
03637
03638 /* Match a single character repeatedly; different opcodes share code. */
03639
03640 case OP_EXACT:
03641 min = max = (ecode[1] << 8) + ecode[2];
03642 ecode += 3;
03643 goto REPEATCHAR;
03644
03645 case OP_UPTO:
03646 case OP_MINUPTO:
03647 min = 0;
03648 max = (ecode[1] << 8) + ecode[2];
03649 minimize = *ecode == OP_MINUPTO;
03650 ecode += 3;
03651 goto REPEATCHAR;
03652
03653 case OP_STAR:
03654 case OP_MINSTAR:
03655 case OP_PLUS:
03656 case OP_MINPLUS:
03657 case OP_QUERY:
03658 case OP_MINQUERY:
03659 c = *ecode++ - OP_STAR;
03660 minimize = (c & 1) != 0;
03661 min = rep_min[c]; /* Pick up values from tables; */
03662 max = rep_max[c]; /* zero for max => infinity */
03663 if (max == 0) max = INT_MAX;
03664
03665 /* Common code for all repeated single-character matches. We can give
03666 up quickly if there are fewer than the minimum number of characters left in
03667 the subject. */
03668
03669 REPEATCHAR:
03670 if (min > md->end_subject - eptr) return FALSE;
03671 c = *ecode++;
03672
03673 /* The code is duplicated for the caseless and caseful cases, for speed,
03674 since matching characters is likely to be quite common. First, ensure the
03675 minimum number of matches are present. If min = max, continue at the same
03676 level without recursing. Otherwise, if minimizing, keep trying the rest of
03677 the expression and advancing one matching character if failing, up to the
03678 maximum. Alternatively, if maximizing, find the maximum number of
03679 characters and work backwards. */
03680
03681 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
03682 max, eptr));
03683
03684 if ((ims & PCRE_CASELESS) != 0)
03685 {
03686 c = md->lcc[c];
03687 for (i = 1; i <= min; i++)
03688 if (c != md->lcc[*eptr++]) return FALSE;
03689 if (min == max) continue;
03690 if (minimize)
03691 {
03692 for (i = min;; i++)
03693 {
03694 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
03695 return TRUE;
03696 if (i >= max || eptr >= md->end_subject ||
03697 c != md->lcc[*eptr++])
03698 return FALSE;
03699 }
03700 /* Control never gets here */
03701 }
03702 else
03703 {
03704 const uschar *pp = eptr;
03705 for (i = min; i < max; i++)
03706 {
03707 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
03708 eptr++;
03709 }
03710 while (eptr >= pp)
03711 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
03712 return TRUE;
03713 return FALSE;
03714 }
03715 /* Control never gets here */
03716 }
03717
03718 /* Caseful comparisons */
03719
03720 else
03721 {
03722 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
03723 if (min == max) continue;
03724 if (minimize)
03725 {
03726 for (i = min;; i++)
03727 {
03728 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
03729 return TRUE;
03730 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
03731 }
03732 /* Control never gets here */
03733 }
03734 else
03735 {
03736 const uschar *pp = eptr;
03737 for (i = min; i < max; i++)
03738 {
03739 if (eptr >= md->end_subject || c != *eptr) break;
03740 eptr++;
03741 }
03742 while (eptr >= pp)
03743 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
03744 return TRUE;
03745 return FALSE;
03746 }
03747 }
03748 /* Control never gets here */
03749
03750 /* Match a negated single character */
03751
03752 case OP_NOT:
03753 if (eptr >= md->end_subject) return FALSE;
03754 ecode++;
03755 if ((ims & PCRE_CASELESS) != 0)
03756 {
03757 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
03758 }
03759 else
03760 {
03761 if (*ecode++ == *eptr++) return FALSE;
03762 }
03763 break;
03764
03765 /* Match a negated single character repeatedly. This is almost a repeat of
03766 the code for a repeated single character, but I haven't found a nice way of
03767 commoning these up that doesn't require a test of the positive/negative
03768 option for each character match. Maybe that wouldn't add very much to the
03769 time taken, but character matching *is* what this is all about... */
03770
03771 case OP_NOTEXACT:
03772 min = max = (ecode[1] << 8) + ecode[2];
03773 ecode += 3;
03774 goto REPEATNOTCHAR;
03775
03776 case OP_NOTUPTO:
03777 case OP_NOTMINUPTO:
03778 min = 0;
03779 max = (ecode[1] << 8) + ecode[2];
03780 minimize = *ecode == OP_NOTMINUPTO;
03781 ecode += 3;
03782 goto REPEATNOTCHAR;
03783
03784 case OP_NOTSTAR:
03785 case OP_NOTMINSTAR:
03786 case OP_NOTPLUS:
03787 case OP_NOTMINPLUS:
03788 case OP_NOTQUERY:
03789 case OP_NOTMINQUERY:
03790 c = *ecode++ - OP_NOTSTAR;
03791 minimize = (c & 1) != 0;
03792 min = rep_min[c]; /* Pick up values from tables; */
03793 max = rep_max[c]; /* zero for max => infinity */
03794 if (max == 0) max = INT_MAX;
03795
03796 /* Common code for all repeated negated single-character matches. We can
03797 give up quickly if there are fewer than the minimum number of characters
03798 left in the subject. */
03799
03800 REPEATNOTCHAR:
03801 if (min > md->end_subject - eptr) return FALSE;
03802 c = *ecode++;
03803
03804 /* The code is duplicated for the caseless and caseful cases, for speed,
03805 since matching characters is likely to be quite common. First, ensure the
03806 minimum number of matches are present. If min = max, continue at the same
03807 level without recursing. Otherwise, if minimizing, keep trying the rest of
03808 the expression and advancing one matching character if failing, up to the
03809 maximum. Alternatively, if maximizing, find the maximum number of
03810 characters and work backwards. */
03811
03812 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
03813 max, eptr));
03814
03815 if ((ims & PCRE_CASELESS) != 0)
03816 {
03817 c = md->lcc[c];
03818 for (i = 1; i <= min; i++)
03819 if (c == md->lcc[*eptr++]) return FALSE;
03820 if (min == max) continue;
03821 if (minimize)
03822 {
03823 for (i = min;; i++)
03824 {
03825 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
03826 return TRUE;
03827 if (i >= max || eptr >= md->end_subject ||
03828 c == md->lcc[*eptr++])
03829 return FALSE;
03830 }
03831 /* Control never gets here */
03832 }
03833 else
03834 {
03835 const uschar *pp = eptr;
03836 for (i = min; i < max; i++)
03837 {
03838 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
03839 eptr++;
03840 }
03841 while (eptr >= pp)
03842 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
03843 return TRUE;
03844 return FALSE;
03845 }
03846 /* Control never gets here */
03847 }
03848
03849 /* Caseful comparisons */
03850
03851 else
03852 {
03853 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
03854 if (min == max) continue;
03855 if (minimize)
03856 {
03857 for (i = min;; i++)
03858 {
03859 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
03860 return TRUE;
03861 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
03862 }
03863 /* Control never gets here */
03864 }
03865 else
03866 {
03867 const uschar *pp = eptr;
03868 for (i = min; i < max; i++)
03869 {
03870 if (eptr >= md->end_subject || c == *eptr) break;
03871 eptr++;
03872 }
03873 while (eptr >= pp)
03874 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
03875 return TRUE;
03876 return FALSE;
03877 }
03878 }
03879 /* Control never gets here */
03880
03881 /* Match a single character type repeatedly; several different opcodes
03882 share code. This is very similar to the code for single characters, but we
03883 repeat it in the interests of efficiency. */
03884
03885 case OP_TYPEEXACT:
03886 min = max = (ecode[1] << 8) + ecode[2];
03887 minimize = TRUE;
03888 ecode += 3;
03889 goto REPEATTYPE;
03890
03891 case OP_TYPEUPTO:
03892 case OP_TYPEMINUPTO:
03893 min = 0;
03894 max = (ecode[1] << 8) + ecode[2];
03895 minimize = *ecode == OP_TYPEMINUPTO;
03896 ecode += 3;
03897 goto REPEATTYPE;
03898
03899 case OP_TYPESTAR:
03900 case OP_TYPEMINSTAR:
03901 case OP_TYPEPLUS:
03902 case OP_TYPEMINPLUS:
03903 case OP_TYPEQUERY:
03904 case OP_TYPEMINQUERY:
03905 c = *ecode++ - OP_TYPESTAR;
03906 minimize = (c & 1) != 0;
03907 min = rep_min[c]; /* Pick up values from tables; */
03908 max = rep_max[c]; /* zero for max => infinity */
03909 if (max == 0) max = INT_MAX;
03910
03911 /* Common code for all repeated single character type matches */
03912
03913 REPEATTYPE:
03914 ctype = *ecode++; /* Code for the character type */
03915
03916 /* First, ensure the minimum number of matches are present. Use inline
03917 code for maximizing the speed, and do the type test once at the start
03918 (i.e. keep it out of the loop). Also test that there are at least the
03919 minimum number of characters before we start. */
03920
03921 if (min > md->end_subject - eptr) return FALSE;
03922 if (min > 0) switch(ctype)
03923 {
03924 case OP_ANY:
03925 if ((ims & PCRE_DOTALL) == 0)
03926 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
03927 else eptr += min;
03928 break;
03929
03930 case OP_NOT_DIGIT:
03931 for (i = 1; i <= min; i++)
03932 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
03933 break;
03934
03935 case OP_DIGIT:
03936 for (i = 1; i <= min; i++)
03937 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
03938 break;
03939
03940 case OP_NOT_WHITESPACE:
03941 for (i = 1; i <= min; i++)
03942 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
03943 break;
03944
03945 case OP_WHITESPACE:
03946 for (i = 1; i <= min; i++)
03947 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
03948 break;
03949
03950 case OP_NOT_WORDCHAR:
03951 for (i = 1; i <= min; i++)
03952 if ((md->ctypes[*eptr++] & ctype_word) != 0)
03953 return FALSE;
03954 break;
03955
03956 case OP_WORDCHAR:
03957 for (i = 1; i <= min; i++)
03958 if ((md->ctypes[*eptr++] & ctype_word) == 0)
03959 return FALSE;
03960 break;
03961 }
03962
03963 /* If min = max, continue at the same level without recursing */
03964
03965 if (min == max) continue;
03966
03967 /* If minimizing, we have to test the rest of the pattern before each
03968 subsequent match. */
03969
03970 if (minimize)
03971 {
03972 for (i = min;; i++)
03973 {
03974 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
03975 if (i >= max || eptr >= md->end_subject) return FALSE;
03976
03977 c = *eptr++;
03978 switch(ctype)
03979 {
03980 case OP_ANY:
03981 if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
03982 break;
03983
03984 case OP_NOT_DIGIT:
03985 if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
03986 break;
03987
03988 case OP_DIGIT:
03989 if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
03990 break;
03991
03992 case OP_NOT_WHITESPACE:
03993 if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
03994 break;
03995
03996 case OP_WHITESPACE:
03997 if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
03998 break;
03999
04000 case OP_NOT_WORDCHAR:
04001 if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
04002 break;
04003
04004 case OP_WORDCHAR:
04005 if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
04006 break;
04007 }
04008 }
04009 /* Control never gets here */
04010 }
04011
04012 /* If maximizing it is worth using inline code for speed, doing the type
04013 test once at the start (i.e. keep it out of the loop). */
04014
04015 else
04016 {
04017 const uschar *pp = eptr;
04018 switch(ctype)
04019 {
04020 case OP_ANY:
04021 if ((ims & PCRE_DOTALL) == 0)
04022 {
04023 for (i = min; i < max; i++)
04024 {
04025 if (eptr >= md->end_subject || *eptr == '\n') break;
04026 eptr++;
04027 }
04028 }
04029 else
04030 {
04031 c = max - min;
04032 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
04033 eptr += c;
04034 }
04035 break;
04036
04037 case OP_NOT_DIGIT:
04038 for (i = min; i < max; i++)
04039 {
04040 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
04041 break;
04042 eptr++;
04043 }
04044 break;
04045
04046 case OP_DIGIT:
04047 for (i = min; i < max; i++)
04048 {
04049 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
04050 break;
04051 eptr++;
04052 }
04053 break;
04054
04055 case OP_NOT_WHITESPACE:
04056 for (i = min; i < max; i++)
04057 {
04058 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
04059 break;
04060 eptr++;
04061 }
04062 break;
04063
04064 case OP_WHITESPACE:
04065 for (i = min; i < max; i++)
04066 {
04067 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
04068 break;
04069 eptr++;
04070 }
04071 break;
04072
04073 case OP_NOT_WORDCHAR:
04074 for (i = min; i < max; i++)
04075 {
04076 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
04077 break;
04078 eptr++;
04079 }
04080 break;
04081
04082 case OP_WORDCHAR:
04083 for (i = min; i < max; i++)
04084 {
04085 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
04086 break;
04087 eptr++;
04088 }
04089 break;
04090 }
04091
04092 while (eptr >= pp)
04093 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
04094 return TRUE;
04095 return FALSE;
04096 }
04097 /* Control never gets here */
04098
04099 /* There's been some horrible disaster. */
04100
04101 default:
04102 DPRINTF(("Unknown opcode %d\n", *ecode));
04103 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
04104 return FALSE;
04105 }
04106
04107 /* Do not stick any code in here without much thought; it is assumed
04108 that "continue" in the code above comes out to here to repeat the main
04109 loop. */
04110
04111 } /* End of main loop */
04112 /* Control never reaches here */
04113 }
04114
04115
04116
04117
04118 /*************************************************
04119 * Execute a Regular Expression *
04120 *************************************************/
04121
04122 /* This function applies a compiled re to a subject string and picks out
04123 portions of the string if it matches. Two elements in the vector are set for
04124 each substring: the offsets to the start and end of the substring.
04125
04126 Arguments:
04127 external_re points to the compiled expression
04128 external_extra points to "hints" from pcre_study() or is NULL
04129 subject points to the subject string
04130 length length of subject string (may contain binary zeros)
04131 start_offset where to start in the subject string
04132 options option bits
04133 offsets points to a vector of ints to be filled in with offsets
04134 offsetcount the number of elements in the vector
04135
04136 Returns: > 0 => success; value is the number of elements filled in
04137 = 0 => success, but offsets is not big enough
04138 -1 => failed to match
04139 < -1 => some kind of unexpected problem
04140 */
04141
04142 int
04143 vmdpcre_exec(const pcre *external_re, const pcre_extra *external_extra,
04144 const char *subject, int length, int start_offset, int options, int *offsets,
04145 int offsetcount)
04146 {
04147 int resetcount, ocount;
04148 int first_char = -1;
04149 int ims = 0;
04150 match_data match_block;
04151 const uschar *start_bits = NULL;
04152 const uschar *start_match = (const uschar *)subject + start_offset;
04153 const uschar *end_subject;
04154 const real_pcre *re = (const real_pcre *)external_re;
04155 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
04156 BOOL using_temporary_offsets = FALSE;
04157 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
04158 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
04159
04160 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
04161
04162 if (re == NULL || subject == NULL ||
04163 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
04164 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
04165
04166 match_block.start_subject = (const uschar *)subject;
04167 match_block.end_subject = match_block.start_subject + length;
04168 end_subject = match_block.end_subject;
04169
04170 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
04171
04172 match_block.notbol = (options & PCRE_NOTBOL) != 0;
04173 match_block.noteol = (options & PCRE_NOTEOL) != 0;
04174
04175 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
04176
04177 match_block.lcc = re->tables + lcc_offset;
04178 match_block.ctypes = re->tables + ctypes_offset;
04179
04180 /* The ims options can vary during the matching as a result of the presence
04181 of (?ims) items in the pattern. They are kept in a local variable so that
04182 restoring at the exit of a group is easy. */
04183
04184 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
04185
04186 /* If the expression has got more back references than the offsets supplied can
04187 hold, we get a temporary bit of working store to use during the matching.
04188 Otherwise, we can use the vector supplied, rounding down its size to a multiple
04189 of 3. */
04190
04191 ocount = offsetcount - (offsetcount % 3);
04192
04193 if (re->top_backref > 0 && re->top_backref >= ocount/3)
04194 {
04195 ocount = re->top_backref * 3 + 3;
04196 match_block.offset_vector = (int *)(vmdpcre_malloc)(ocount * sizeof(int));
04197 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
04198 using_temporary_offsets = TRUE;
04199 DPRINTF(("Got memory to hold back references\n"));
04200 }
04201 else match_block.offset_vector = offsets;
04202
04203 match_block.offset_end = ocount;
04204 match_block.offset_max = (2*ocount)/3;
04205 match_block.offset_overflow = FALSE;
04206
04207 /* Compute the minimum number of offsets that we need to reset each time. Doing
04208 this makes a huge difference to execution time when there aren't many brackets
04209 in the pattern. */
04210
04211 resetcount = 2 + re->top_bracket * 2;
04212 if (resetcount > offsetcount) resetcount = ocount;
04213
04214 /* Reset the working variable associated with each extraction. These should
04215 never be used unless previously set, but they get saved and restored, and so we
04216 initialize them to avoid reading uninitialized locations. */
04217
04218 if (match_block.offset_vector != NULL)
04219 {
04220 register int *iptr = match_block.offset_vector + ocount;
04221 register int *iend = iptr - resetcount/2 + 1;
04222 while (--iptr >= iend) *iptr = -1;
04223 }
04224
04225 /* Set up the first character to match, if available. The first_char value is
04226 never set for an anchored regular expression, but the anchoring may be forced
04227 at run time, so we have to test for anchoring. The first char may be unset for
04228 an unanchored pattern, of course. If there's no first char and the pattern was
04229 studied, there may be a bitmap of possible first characters. */
04230
04231 if (!anchored)
04232 {
04233 if ((re->options & PCRE_FIRSTSET) != 0)
04234 {
04235 first_char = re->first_char;
04236 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
04237 }
04238 else
04239 if (!startline && extra != NULL &&
04240 (extra->options & PCRE_STUDY_MAPPED) != 0)
04241 start_bits = extra->start_bits;
04242 }
04243
04244 /* Loop for unanchored matches; for anchored regexes the loop runs just once. */
04245
04246 do
04247 {
04248 int rc;
04249 register int *iptr = match_block.offset_vector;
04250 register int *iend = iptr + resetcount;
04251
04252 /* Reset to -1 the offset slots for the maximum number of extractions we might see. */
04253
04254 while (iptr < iend) *iptr++ = -1;
04255
04256 /* Advance to a unique first char if possible */
04257
04258 if (first_char >= 0)
04259 {
04260 if ((ims & PCRE_CASELESS) != 0)
04261 while (start_match < end_subject &&
04262 match_block.lcc[*start_match] != first_char)
04263 start_match++;
04264 else
04265 while (start_match < end_subject && *start_match != first_char)
04266 start_match++;
04267 }
04268
04269 /* Or to just after \n for a multiline match if possible */
04270
04271 else if (startline)
04272 {
04273 if (start_match > match_block.start_subject)
04274 {
04275 while (start_match < end_subject && start_match[-1] != '\n')
04276 start_match++;
04277 }
04278 }
04279
04280 /* Or to a non-unique first char */
04281
04282 else if (start_bits != NULL)
04283 {
04284 while (start_match < end_subject)
04285 {
04286 register int c = *start_match;
04287 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
04288 }
04289 }
04290
04291 #ifdef DEBUG /* Sigh. Some compilers never learn. */
04292 printf(">>>> Match against: ");
04293 pchars(start_match, end_subject - start_match, TRUE, &match_block);
04294 printf("\n");
04295 #endif
04296
04297 /* When a match occurs, substrings will be set for all internal extractions;
04298 we just need to set up the whole thing as substring 0 before returning. If
04299 there were too many extractions, set the return code to zero. In the case
04300 where we had to get some local store to hold offsets for backreferences, copy
04301 those back references that we can. In this case there need not be overflow
04302 if certain parts of the pattern were not used. */
04303
04304 if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
04305 continue;
04306
04307 /* Copy the offset information from temporary store if necessary */
04308
04309 if (using_temporary_offsets)
04310 {
04311 if (offsetcount >= 4)
04312 {
04313 memcpy(offsets + 2, match_block.offset_vector + 2,
04314 (offsetcount - 2) * sizeof(int));
04315 DPRINTF(("Copied offsets from temporary memory\n"));
04316 }
04317 if (match_block.end_offset_top > offsetcount)
04318 match_block.offset_overflow = TRUE;
04319
04320 DPRINTF(("Freeing temporary memory\n"));
04321 (vmdpcre_free)(match_block.offset_vector);
04322 }
04323
04324 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
04325
04326 if (match_block.offset_end < 2) rc = 0; else
04327 {
04328 offsets[0] = start_match - match_block.start_subject;
04329 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
04330 }
04331
04332 DPRINTF((">>>> returning %d\n", rc));
04333 return rc;
04334 }
04335
04336 /* This "while" is the end of the "do" above */
04337
04338 while (!anchored &&
04339 match_block.errorcode == PCRE_ERROR_NOMATCH &&
04340 start_match++ < end_subject);
04341
04342 if (using_temporary_offsets)
04343 {
04344 DPRINTF(("Freeing temporary memory\n"));
04345 (vmdpcre_free)(match_block.offset_vector);
04346 }
04347
04348 DPRINTF((">>>> returning %d\n", match_block.errorcode));
04349
04350 return match_block.errorcode;
04351 }
04352
04353 #ifdef __cplusplus
04354 }
04355 #endif
04356
04357 /* End of pcre.c */