[フレーム]

regc_lex.c

Go to the documentation of this file.

1/*

2 * lexical analyzer

3 * This file is #included by regcomp.c.

4 *

5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.

6 *

7 * Development of this software was funded, in part, by Cray Research Inc.,

8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics

9 * Corporation, none of whom are responsible for the results. The author

10 * thanks all of them.

11 *

12 * Redistribution and use in source and binary forms -- with or without

13 * modification -- are permitted for any purpose, provided that

14 * redistributions in source form retain this entire copyright notice and

15 * indicate the origin and nature of any modifications.

16 *

17 * I'd appreciate being given credit for this package in the documentation

18 * of software which uses it, but that is not a requirement.

19 *

20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,

21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY

22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL

23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;

26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,

27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR

28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF

29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

30 *

31 * src/backend/regex/regc_lex.c

32 *

33 */

34

35/* scanning macros (know about v) */

/*
 * Every macro in this group expands to code that refers to a variable
 * named v, so it can only be used where such a variable is in scope.
 */

/* ATEOS: true once the scan pointer v->now has reached v->stop */

36 #define ATEOS() (v->now >= v->stop)

/* HAVE(n): true if at least n chrs remain between v->now and v->stop */

37 #define HAVE(n) (v->stop - v->now >= (n))

/*
 * NEXT1/NEXT2/NEXT3: true if the next one/two/three chrs equal the given
 * characters; each checks the remaining length first, so none of them
 * reads at or beyond v->stop.
 */

38 #define NEXT1(c) (!ATEOS() && *v->now == CHR(c))

39 #define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))

40 #define NEXT3(a,b,c) (HAVE(3) && *v->now == CHR(a) && \

41 *(v->now+1) == CHR(b) && \

42 *(v->now+2) == CHR(c))

/* SET/SETV: record the token type (and, for SETV, its value) in v */

43 #define SET(c) (v->nexttype = (c))

44 #define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n))

/*
 * RET/RETV: record the token and return 1 from the ENCLOSING function.
 * These are return statements, not expressions.
 */

45 #define RET(c) return (SET(c), 1)

46 #define RETV(c, n) return (SETV(c, n), 1)

/* FAILW: report error e through ERR() and return 0 from the enclosing function */

47 #define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */

/* LASTTYPE(t): true if the previously recorded token type (v->lasttype) is t */

48 #define LASTTYPE(t) (v->lasttype == (t))

49

50/* lexical contexts */

/* These are the values stored in v->lexcon by INTOCON and tested by INCON. */

51 #define L_ERE 1 /* mainline ERE/ARE */

52 #define L_BRE 2 /* mainline BRE */

53 #define L_Q 3 /* REG_QUOTE */

54 #define L_EBND 4 /* ERE/ARE bound */

55 #define L_BBND 5 /* BRE bound */

56 #define L_BRACK 6 /* brackets */

57 #define L_CEL 7 /* collating element */

58 #define L_ECL 8 /* equivalence class */

59 #define L_CCL 9 /* character class */

/* INTOCON(c): switch to lexical context c; INCON(con): test the current one */

60 #define INTOCON(c) (v->lexcon = (c))

61 #define INCON(con) (v->lexcon == (con))

62

63/* construct pointer past end of chr array */

/*
 * The element count comes from sizeof, so the argument must be an actual
 * array of chr, not a pointer to one.
 */

64 #define ENDOF(array) ((array) + sizeof(array)/sizeof(chr))

65

66/*

67 * lexstart - set up lexical stuff, scan leading options

68 */

69static void

70 lexstart(struct vars *v)

71{

72 prefixes(v); /* may turn on new type bits etc. */

73 NOERR();

74

75 if (v->cflags & REG_QUOTE)

76 {

77 assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)));

78 INTOCON(L_Q);

79 }

80 else if (v->cflags & REG_EXTENDED)

81 {

82 assert(!(v->cflags & REG_QUOTE));

83 INTOCON(L_ERE);

84 }

85 else

86 {

87 assert(!(v->cflags & (REG_QUOTE | REG_ADVF)));

88 INTOCON(L_BRE);

89 }

90

91 v->nexttype = EMPTY; /* remember we were at the start */

92 next(v); /* set up the first token */

93}

94

95/*

96 * prefixes - implement various special prefixes

97 */

98static void

99 prefixes(struct vars *v)

100{

101 /* literal string doesn't get any of this stuff */

102 if (v->cflags & REG_QUOTE)

103 return;

104

105 /* initial "***" gets special things */

106 if (HAVE(4) && NEXT3('*', '*', '*'))

107 switch (*(v->now + 3))

108 {

109 case CHR('?'): /* "***?" error, msg shows version */

110 ERR(REG_BADPAT);

111 return; /* proceed no further */

112 break;

113 case CHR('='): /* "***=" shifts to literal string */

114 NOTE(REG_UNONPOSIX);

115 v->cflags |= REG_QUOTE;

116 v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE);

117 v->now += 4;

118 return; /* and there can be no more prefixes */

119 break;

120 case CHR(':'): /* "***:" shifts to AREs */

121 NOTE(REG_UNONPOSIX);

122 v->cflags |= REG_ADVANCED;

123 v->now += 4;

124 break;

125 default: /* otherwise *** is just an error */

126 ERR(REG_BADRPT);

127 return;

128 break;

129 }

130

131 /* BREs and EREs don't get embedded options */

132 if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)

133 return;

134

135 /* embedded options (AREs only) */

136 if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2)))

137 {

138 NOTE(REG_UNONPOSIX);

139 v->now += 2;

140 for (; !ATEOS() && iscalpha(*v->now); v->now++)

141 switch (*v->now)

142 {

143 case CHR('b'): /* BREs (but why???) */

144 v->cflags &= ~(REG_ADVANCED | REG_QUOTE);

145 break;

146 case CHR('c'): /* case sensitive */

147 v->cflags &= ~REG_ICASE;

148 break;

149 case CHR('e'): /* plain EREs */

150 v->cflags |= REG_EXTENDED;

151 v->cflags &= ~(REG_ADVF | REG_QUOTE);

152 break;

153 case CHR('i'): /* case insensitive */

154 v->cflags |= REG_ICASE;

155 break;

156 case CHR('m'): /* Perloid synonym for n */

157 case CHR('n'): /* \n affects ^ $ . [^ */

158 v->cflags |= REG_NEWLINE;

159 break;

160 case CHR('p'): /* ~Perl, \n affects . [^ */

161 v->cflags |= REG_NLSTOP;

162 v->cflags &= ~REG_NLANCH;

163 break;

164 case CHR('q'): /* literal string */

165 v->cflags |= REG_QUOTE;

166 v->cflags &= ~REG_ADVANCED;

167 break;

168 case CHR('s'): /* single line, \n ordinary */

169 v->cflags &= ~REG_NEWLINE;

170 break;

171 case CHR('t'): /* tight syntax */

172 v->cflags &= ~REG_EXPANDED;

173 break;

174 case CHR('w'): /* weird, \n affects ^ $ only */

175 v->cflags &= ~REG_NLSTOP;

176 v->cflags |= REG_NLANCH;

177 break;

178 case CHR('x'): /* expanded syntax */

179 v->cflags |= REG_EXPANDED;

180 break;

181 default:

182 ERR(REG_BADOPT);

183 return;

184 }

185 if (!NEXT1(')'))

186 {

187 ERR(REG_BADOPT);

188 return;

189 }

190 v->now++;

191 if (v->cflags & REG_QUOTE)

192 v->cflags &= ~(REG_EXPANDED | REG_NEWLINE);

193 }

194}

195

196/*
 * next - get next token
 *
 * Scans one token starting at v->now, interpreting it according to the
 * current lexical context (v->lexcon), and records it through the
 * SET()/SETV() macros, i.e. in v->nexttype and, for valued tokens,
 * v->nextvalue.  Before scanning, the previous token type is copied into
 * v->lasttype, because several decisions below depend on it.
 *
 * The contexts other than L_ERE are handled completely inside the first
 * big switch (BREs are delegated to brenext()); only ERE/ARE scanning
 * continues past it.
 *
 * Returns 1 when a token was produced and 0 on failure.  Once ISERR() is
 * true, every call returns 0 without scanning anything.
 */

199static int /* 1 normal, 0 failure */

200 next(struct vars *v)

201{

202 chr c;

203

204next_restart: /* loop here after eating a comment */

205

206 /* errors yield an infinite sequence of failures */

207 if (ISERR())

208 return 0; /* the error has set nexttype to EOS */

209

210 /* remember flavor of last token */

211 v->lasttype = v->nexttype;

212

213 /* REG_BOSONLY */

214 if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))

215 {

216 /* at start of a REG_BOSONLY RE */

217 RETV(SBEGIN, 0); /* same as \A */

218 }

219

220 /* skip white space etc. if appropriate (not in literal or []) */

221 if (v->cflags & REG_EXPANDED)

222 switch (v->lexcon)

223 {

224 case L_ERE:

225 case L_BRE:

226 case L_EBND:

227 case L_BBND:

228 skip(v);

229 break;

230 }

231

232 /* handle EOS, depending on context */

/* every case below returns, so the assert after the switch is never reached */

233 if (ATEOS())

234 {

235 switch (v->lexcon)

236 {

237 case L_ERE:

238 case L_BRE:

239 case L_Q:

240 RET(EOS);

241 break;

242 case L_EBND:

243 case L_BBND:

244 FAILW(REG_EBRACE);

245 break;

246 case L_BRACK:

247 case L_CEL:

248 case L_ECL:

249 case L_CCL:

250 FAILW(REG_EBRACK);

251 break;

252 }

253 assert(NOTREACHED);

254 }

255

256 /* okay, time to actually get a character */

257 c = *v->now++;

258

259 /* deal with the easy contexts, punt EREs to code below */

260 switch (v->lexcon)

261 {

262 case L_BRE: /* punt BREs to separate function */

263 return brenext(v, c);

264 break;

265 case L_ERE: /* see below */

266 break;

267 case L_Q: /* literal strings are easy */

268 RETV(PLAIN, c);

269 break;

270 case L_BBND: /* bounds are fairly simple */

271 case L_EBND:

272 switch (c)

273 {

274 case CHR('0'):

275 case CHR('1'):

276 case CHR('2'):

277 case CHR('3'):

278 case CHR('4'):

279 case CHR('5'):

280 case CHR('6'):

281 case CHR('7'):

282 case CHR('8'):

283 case CHR('9'):

284 RETV(DIGIT, (chr) DIGITVAL(c));

285 break;

286 case CHR(','):

287 RET(',');

288 break;

289 case CHR('}'): /* ERE bound ends with } */

/*
 * The '}' token carries value 0 when an ARE bound is immediately
 * followed by '?', and 1 otherwise.
 * NOTE(review): presumably 0 = non-greedy, 1 = greedy -- confirm in the parser.
 */

290 if (INCON(L_EBND))

291 {

292 INTOCON(L_ERE);

293 if ((v->cflags & REG_ADVF) && NEXT1('?'))

294 {

295 v->now++;

296 NOTE(REG_UNONPOSIX);

297 RETV('}', 0);

298 }

299 RETV('}', 1);

300 }

301 else

302 FAILW(REG_BADBR);

303 break;

304 case CHR('\\'): /* BRE bound ends with \} */

305 if (INCON(L_BBND) && NEXT1('}'))

306 {

307 v->now++;

308 INTOCON(L_BRE);

309 RETV('}', 1);

310 }

311 else

312 FAILW(REG_BADBR);

313 break;

314 default:

315 FAILW(REG_BADBR);

316 break;

317 }

318 assert(NOTREACHED);

319 break;

320 case L_BRACK: /* brackets are not too hard */

321 switch (c)

322 {

323 case CHR(']'):

/* a ']' directly after the '[' token is an ordinary member, not the terminator */

324 if (LASTTYPE('['))

325 RETV(PLAIN, c);

326 else

327 {

328 INTOCON((v->cflags & REG_EXTENDED) ?

329 L_ERE : L_BRE);

330 RET(']');

331 }

332 break;

333 case CHR('\\'):

334 NOTE(REG_UBBS);

335 if (!(v->cflags & REG_ADVF))

336 RETV(PLAIN, c);

337 NOTE(REG_UNONPOSIX);

338 if (ATEOS())

339 FAILW(REG_EESCAPE);

340 if (!lexescape(v))

341 return 0;

342 switch (v->nexttype)

343 { /* not all escapes okay here */

344 case PLAIN:

345 case CCLASSS:

346 case CCLASSC:

347 return 1;

348 break;

349 }

350 /* not one of the acceptable escapes */

351 FAILW(REG_EESCAPE);

352 break;

353 case CHR('-'):

/* '-' is literal right after '[' or right before ']'; otherwise it is a RANGE token */

354 if (LASTTYPE('[') || NEXT1(']'))

355 RETV(PLAIN, c);

356 else

357 RETV(RANGE, c);

358 break;

359 case CHR('['):

360 if (ATEOS())

361 FAILW(REG_EBRACK);

/* "[." "[=" "[:" open a nested context; any other chr is put back and '[' is plain */

362 switch (*v->now++)

363 {

364 case CHR('.'):

365 INTOCON(L_CEL);

366 /* might or might not be locale-specific */

367 RET(COLLEL);

368 break;

369 case CHR('='):

370 INTOCON(L_ECL);

371 NOTE(REG_ULOCALE);

372 RET(ECLASS);

373 break;

374 case CHR(':'):

375 INTOCON(L_CCL);

376 NOTE(REG_ULOCALE);

377 RET(CCLASS);

378 break;

379 default: /* oops */

380 v->now--;

381 RETV(PLAIN, c);

382 break;

383 }

384 assert(NOTREACHED);

385 break;

386 default:

387 RETV(PLAIN, c);

388 break;

389 }

390 assert(NOTREACHED);

391 break;

392 case L_CEL: /* collating elements are easy */

/* ".]" closes the element and returns to bracket context; anything else is plain */

393 if (c == CHR('.') && NEXT1(']'))

394 {

395 v->now++;

396 INTOCON(L_BRACK);

397 RETV(END, '.');

398 }

399 else

400 RETV(PLAIN, c);

401 break;

402 case L_ECL: /* ditto equivalence classes */

403 if (c == CHR('=') && NEXT1(']'))

404 {

405 v->now++;

406 INTOCON(L_BRACK);

407 RETV(END, '=');

408 }

409 else

410 RETV(PLAIN, c);

411 break;

412 case L_CCL: /* ditto character classes */

413 if (c == CHR(':') && NEXT1(']'))

414 {

415 v->now++;

416 INTOCON(L_BRACK);

417 RETV(END, ':');

418 }

419 else

420 RETV(PLAIN, c);

421 break;

422 default:

423 assert(NOTREACHED);

424 break;

425 }

426

427 /* that got rid of everything except EREs and AREs */

428 assert(INCON(L_ERE));

429

430 /* deal with EREs and AREs, except for backslashes */

/*
 * For the quantifiers below, the token value is 0 when REG_ADVF is set and
 * a '?' follows (which is consumed), and 1 otherwise.
 */

431 switch (c)

432 {

433 case CHR('|'):

434 RET('|');

435 break;

436 case CHR('*'):

437 if ((v->cflags & REG_ADVF) && NEXT1('?'))

438 {

439 v->now++;

440 NOTE(REG_UNONPOSIX);

441 RETV('*', 0);

442 }

443 RETV('*', 1);

444 break;

445 case CHR('+'):

446 if ((v->cflags & REG_ADVF) && NEXT1('?'))

447 {

448 v->now++;

449 NOTE(REG_UNONPOSIX);

450 RETV('+', 0);

451 }

452 RETV('+', 1);

453 break;

454 case CHR('?'):

455 if ((v->cflags & REG_ADVF) && NEXT1('?'))

456 {

457 v->now++;

458 NOTE(REG_UNONPOSIX);

459 RETV('?', 0);

460 }

461 RETV('?', 1);

462 break;

463 case CHR('{'): /* bounds start or plain character */

/* '{' starts a bound only if a digit follows (after optional expanded-mode skipping) */

464 if (v->cflags & REG_EXPANDED)

465 skip(v);

466 if (ATEOS() || !iscdigit(*v->now))

467 {

468 NOTE(REG_UBRACES);

469 NOTE(REG_UUNSPEC);

470 RETV(PLAIN, c);

471 }

472 else

473 {

474 NOTE(REG_UBOUNDS);

475 INTOCON(L_EBND);

476 RET('{');

477 }

478 assert(NOTREACHED);

479 break;

480 case CHR('('): /* parenthesis, or advanced extension */

/* plain '(' yields value 1; the ARE form "(?:" yields value 0 */

481 if ((v->cflags & REG_ADVF) && NEXT1('?'))

482 {

483 NOTE(REG_UNONPOSIX);

484 v->now++;

485 if (ATEOS())

486 FAILW(REG_BADRPT);

487 switch (*v->now++)

488 {

489 case CHR(':'): /* non-capturing paren */

490 RETV('(', 0);

491 break;

492 case CHR('#'): /* comment */

/*
 * Skip to just past the closing ')' (or to end of input) and rescan.
 * nexttype has not been touched, so the lasttype assignment at
 * next_restart leaves lasttype unchanged.
 */

493 while (!ATEOS() && *v->now != CHR(')'))

494 v->now++;

495 if (!ATEOS())

496 v->now++;

497 assert(v->nexttype == v->lasttype);

498 goto next_restart;

499 case CHR('='): /* positive lookahead */

500 NOTE(REG_ULOOKAROUND);

501 RETV(LACON, LATYPE_AHEAD_POS);

502 break;

503 case CHR('!'): /* negative lookahead */

504 NOTE(REG_ULOOKAROUND);

505 RETV(LACON, LATYPE_AHEAD_NEG);

506 break;

507 case CHR('<'):

508 if (ATEOS())

509 FAILW(REG_BADRPT);

510 switch (*v->now++)

511 {

512 case CHR('='): /* positive lookbehind */

513 NOTE(REG_ULOOKAROUND);

514 RETV(LACON, LATYPE_BEHIND_POS);

515 break;

516 case CHR('!'): /* negative lookbehind */

517 NOTE(REG_ULOOKAROUND);

518 RETV(LACON, LATYPE_BEHIND_NEG);

519 break;

520 default:

521 FAILW(REG_BADRPT);

522 break;

523 }

524 assert(NOTREACHED);

525 break;

526 default:

527 FAILW(REG_BADRPT);

528 break;

529 }

530 assert(NOTREACHED);

531 }

532 RETV('(', 1);

533 break;

534 case CHR(')'):

535 if (LASTTYPE('('))

536 NOTE(REG_UUNSPEC);

537 RETV(')', c);

538 break;

539 case CHR('['): /* easy except for [[:<:]] and [[:>:]] */

/*
 * The leading '[' is already consumed, so the six chrs "[:<:]]" or
 * "[:>:]]" are matched here and returned as a '<' or '>' token.
 */

540 if (HAVE(6) && *(v->now + 0) == CHR('[') &&

541 *(v->now + 1) == CHR(':') &&

542 (*(v->now + 2) == CHR('<') ||

543 *(v->now + 2) == CHR('>')) &&

544 *(v->now + 3) == CHR(':') &&

545 *(v->now + 4) == CHR(']') &&

546 *(v->now + 5) == CHR(']'))

547 {

548 c = *(v->now + 2);

549 v->now += 6;

550 NOTE(REG_UNONPOSIX);

551 RET((c == CHR('<')) ? '<' : '>');

552 }

/* ordinary bracket expression: value 0 if it starts with '^', else 1 */

553 INTOCON(L_BRACK);

554 if (NEXT1('^'))

555 {

556 v->now++;

557 RETV('[', 0);

558 }

559 RETV('[', 1);

560 break;

561 case CHR('.'):

562 RET('.');

563 break;

564 case CHR('^'):

565 RET('^');

566 break;

567 case CHR('$'):

568 RET('$');

569 break;

570 case CHR('\\'): /* mostly punt backslashes to code below */

571 if (ATEOS())

572 FAILW(REG_EESCAPE);

573 break;

574 default: /* ordinary character */

575 RETV(PLAIN, c);

576 break;

577 }

578

579 /* ERE/ARE backslash handling; backslash already eaten */

580 assert(!ATEOS());

581 if (!(v->cflags & REG_ADVF))

582 { /* only AREs have non-trivial escapes */

583 if (iscalnum(*v->now))

584 {

585 NOTE(REG_UBSALNUM);

586 NOTE(REG_UUNSPEC);

587 }

588 RETV(PLAIN, *v->now++);

589 }

590 return lexescape(v);

591}

592

593/*

594 * lexescape - parse an ARE backslash escape (backslash already eaten)

595 *

596 * This is used for ARE backslashes both normally and inside bracket

597 * expressions. In the latter case, not all escape types are allowed,

598 * but the caller must reject unwanted ones after we return.

599 */

600static int

601 lexescape(struct vars *v)

602{

603 chr c;

604 static const chr alert[] = {

605 CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')

606 };

607 static const chr esc[] = {

608 CHR('E'), CHR('S'), CHR('C')

609 };

610 const chr *save;

611

612 assert(v->cflags & REG_ADVF);

613

614 assert(!ATEOS());

615 c = *v->now++;

616

617 /* if it's not alphanumeric ASCII, treat it as a plain character */

618 if (!('a' <= c && c <= 'z') &&

619 !('A' <= c && c <= 'Z') &&

620 !('0' <= c && c <= '9'))

621 RETV(PLAIN, c);

622

623 NOTE(REG_UNONPOSIX);

624 switch (c)

625 {

626 case CHR('a'):

627 RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('007円')));

628 break;

629 case CHR('A'):

630 RETV(SBEGIN, 0);

631 break;

632 case CHR('b'):

633 RETV(PLAIN, CHR('\b'));

634 break;

635 case CHR('B'):

636 RETV(PLAIN, CHR('\\'));

637 break;

638 case CHR('c'):

639 NOTE(REG_UUNPORT);

640 if (ATEOS())

641 FAILW(REG_EESCAPE);

642 RETV(PLAIN, (chr) (*v->now++ & 037));

643 break;

644 case CHR('d'):

645 NOTE(REG_ULOCALE);

646 RETV(CCLASSS, CC_DIGIT);

647 break;

648 case CHR('D'):

649 NOTE(REG_ULOCALE);

650 RETV(CCLASSC, CC_DIGIT);

651 break;

652 case CHR('e'):

653 NOTE(REG_UUNPORT);

654 RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('033円')));

655 break;

656 case CHR('f'):

657 RETV(PLAIN, CHR('\f'));

658 break;

659 case CHR('m'):

660 RET('<');

661 break;

662 case CHR('M'):

663 RET('>');

664 break;

665 case CHR('n'):

666 RETV(PLAIN, CHR('\n'));

667 break;

668 case CHR('r'):

669 RETV(PLAIN, CHR('\r'));

670 break;

671 case CHR('s'):

672 NOTE(REG_ULOCALE);

673 RETV(CCLASSS, CC_SPACE);

674 break;

675 case CHR('S'):

676 NOTE(REG_ULOCALE);

677 RETV(CCLASSC, CC_SPACE);

678 break;

679 case CHR('t'):

680 RETV(PLAIN, CHR('\t'));

681 break;

682 case CHR('u'):

683 c = lexdigits(v, 16, 4, 4);

684 if (ISERR() || !CHR_IS_IN_RANGE(c))

685 FAILW(REG_EESCAPE);

686 RETV(PLAIN, c);

687 break;

688 case CHR('U'):

689 c = lexdigits(v, 16, 8, 8);

690 if (ISERR() || !CHR_IS_IN_RANGE(c))

691 FAILW(REG_EESCAPE);

692 RETV(PLAIN, c);

693 break;

694 case CHR('v'):

695 RETV(PLAIN, CHR('\v'));

696 break;

697 case CHR('w'):

698 NOTE(REG_ULOCALE);

699 RETV(CCLASSS, CC_WORD);

700 break;

701 case CHR('W'):

702 NOTE(REG_ULOCALE);

703 RETV(CCLASSC, CC_WORD);

704 break;

705 case CHR('x'):

706 NOTE(REG_UUNPORT);

707 c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */

708 if (ISERR() || !CHR_IS_IN_RANGE(c))

709 FAILW(REG_EESCAPE);

710 RETV(PLAIN, c);

711 break;

712 case CHR('y'):

713 NOTE(REG_ULOCALE);

714 RETV(WBDRY, 0);

715 break;

716 case CHR('Y'):

717 NOTE(REG_ULOCALE);

718 RETV(NWBDRY, 0);

719 break;

720 case CHR('Z'):

721 RETV(SEND, 0);

722 break;

723 case CHR('1'):

724 case CHR('2'):

725 case CHR('3'):

726 case CHR('4'):

727 case CHR('5'):

728 case CHR('6'):

729 case CHR('7'):

730 case CHR('8'):

731 case CHR('9'):

732 save = v->now;

733 v->now--; /* put first digit back */

734 c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */

735 if (ISERR())

736 FAILW(REG_EESCAPE);

737 /* ugly heuristic (first test is "exactly 1 digit?") */

738 if (v->now == save || ((int) c > 0 && (int) c <= v->nsubexp))

739 {

740 NOTE(REG_UBACKREF);

741 RETV(BACKREF, c);

742 }

743 /* oops, doesn't look like it's a backref after all... */

744 v->now = save;

745 /* and fall through into octal number */

746 /* FALLTHROUGH */

747 case CHR('0'):

748 NOTE(REG_UUNPORT);

749 v->now--; /* put first digit back */

750 c = lexdigits(v, 8, 1, 3);

751 if (ISERR())

752 FAILW(REG_EESCAPE);

753 if (c > 0xff)

754 {

755 /* out of range, so we handled one digit too much */

756 v->now--;

757 c >>= 3;

758 }

759 RETV(PLAIN, c);

760 break;

761 default:

762

763 /*

764 * Throw an error for unrecognized ASCII alpha escape sequences,

765 * which reserves them for future use if needed.

766 */

767 FAILW(REG_EESCAPE);

768 break;

769 }

770 assert(NOTREACHED);

771}

772

773/*

774 * lexdigits - slurp up digits and return chr value

775 *

776 * This does not account for overflow; callers should range-check the result

777 * if maxlen is large enough to make that possible.

778 */

779static chr /* chr value; errors signalled via ERR */

780 lexdigits(struct vars *v,

781 int base,

782 int minlen,

783 int maxlen)

784{

785 uchr n; /* unsigned to avoid overflow misbehavior */

786 int len;

787 chr c;

788 int d;

789 const uchr ub = (uchr) base;

790

791 n = 0;

792 for (len = 0; len < maxlen && !ATEOS(); len++)

793 {

794 c = *v->now++;

795 switch (c)

796 {

797 case CHR('0'):

798 case CHR('1'):

799 case CHR('2'):

800 case CHR('3'):

801 case CHR('4'):

802 case CHR('5'):

803 case CHR('6'):

804 case CHR('7'):

805 case CHR('8'):

806 case CHR('9'):

807 d = DIGITVAL(c);

808 break;

809 case CHR('a'):

810 case CHR('A'):

811 d = 10;

812 break;

813 case CHR('b'):

814 case CHR('B'):

815 d = 11;

816 break;

817 case CHR('c'):

818 case CHR('C'):

819 d = 12;

820 break;

821 case CHR('d'):

822 case CHR('D'):

823 d = 13;

824 break;

825 case CHR('e'):

826 case CHR('E'):

827 d = 14;

828 break;

829 case CHR('f'):

830 case CHR('F'):

831 d = 15;

832 break;

833 default:

834 v->now--; /* oops, not a digit at all */

835 d = -1;

836 break;

837 }

838

839 if (d >= base)

840 { /* not a plausible digit */

841 v->now--;

842 d = -1;

843 }

844 if (d < 0)

845 break; /* NOTE BREAK OUT */

846 n = n * ub + (uchr) d;

847 }

848 if (len < minlen)

849 ERR(REG_EESCAPE);

850

851 return (chr) n;

852}

853

854/*

855 * brenext - get next BRE token

856 *

857 * This is much like EREs except for all the stupid backslashes and the

858 * context-dependency of some things.

859 */

860static int /* 1 normal, 0 failure */

861 brenext(struct vars *v,

862 chr c)

863{

864 switch (c)

865 {

866 case CHR('*'):

867 if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))

868 RETV(PLAIN, c);

869 RETV('*', 1);

870 break;

871 case CHR('['):

872 if (HAVE(6) && *(v->now + 0) == CHR('[') &&

873 *(v->now + 1) == CHR(':') &&

874 (*(v->now + 2) == CHR('<') ||

875 *(v->now + 2) == CHR('>')) &&

876 *(v->now + 3) == CHR(':') &&

877 *(v->now + 4) == CHR(']') &&

878 *(v->now + 5) == CHR(']'))

879 {

880 c = *(v->now + 2);

881 v->now += 6;

882 NOTE(REG_UNONPOSIX);

883 RET((c == CHR('<')) ? '<' : '>');

884 }

885 INTOCON(L_BRACK);

886 if (NEXT1('^'))

887 {

888 v->now++;

889 RETV('[', 0);

890 }

891 RETV('[', 1);

892 break;

893 case CHR('.'):

894 RET('.');

895 break;

896 case CHR('^'):

897 if (LASTTYPE(EMPTY))

898 RET('^');

899 if (LASTTYPE('('))

900 {

901 NOTE(REG_UUNSPEC);

902 RET('^');

903 }

904 RETV(PLAIN, c);

905 break;

906 case CHR('$'):

907 if (v->cflags & REG_EXPANDED)

908 skip(v);

909 if (ATEOS())

910 RET('$');

911 if (NEXT2('\\', ')'))

912 {

913 NOTE(REG_UUNSPEC);

914 RET('$');

915 }

916 RETV(PLAIN, c);

917 break;

918 case CHR('\\'):

919 break; /* see below */

920 default:

921 RETV(PLAIN, c);

922 break;

923 }

924

925 assert(c == CHR('\\'));

926

927 if (ATEOS())

928 FAILW(REG_EESCAPE);

929

930 c = *v->now++;

931 switch (c)

932 {

933 case CHR('{'):

934 INTOCON(L_BBND);

935 NOTE(REG_UBOUNDS);

936 RET('{');

937 break;

938 case CHR('('):

939 RETV('(', 1);

940 break;

941 case CHR(')'):

942 RETV(')', c);

943 break;

944 case CHR('<'):

945 NOTE(REG_UNONPOSIX);

946 RET('<');

947 break;

948 case CHR('>'):

949 NOTE(REG_UNONPOSIX);

950 RET('>');

951 break;

952 case CHR('1'):

953 case CHR('2'):

954 case CHR('3'):

955 case CHR('4'):

956 case CHR('5'):

957 case CHR('6'):

958 case CHR('7'):

959 case CHR('8'):

960 case CHR('9'):

961 NOTE(REG_UBACKREF);

962 RETV(BACKREF, (chr) DIGITVAL(c));

963 break;

964 default:

965 if (iscalnum(c))

966 {

967 NOTE(REG_UBSALNUM);

968 NOTE(REG_UUNSPEC);

969 }

970 RETV(PLAIN, c);

971 break;

972 }

973

974 assert(NOTREACHED);

975 return 0;

976}

977

978/*

979 * skip - skip white space and comments in expanded form

980 */

981static void

982 skip(struct vars *v)

983{

984 const chr *start = v->now;

985

986 assert(v->cflags & REG_EXPANDED);

987

988 for (;;)

989 {

990 while (!ATEOS() && iscspace(*v->now))

991 v->now++;

992 if (ATEOS() || *v->now != CHR('#'))

993 break; /* NOTE BREAK OUT */

994 assert(NEXT1('#'));

995 while (!ATEOS() && *v->now != CHR('\n'))

996 v->now++;

997 /* leave the newline to be picked up by the iscspace loop */

998 }

999

1000 if (v->now != start)

1001 NOTE(REG_UNONPOSIX);

1002}

1003

1004/*

1005 * newline - return the chr for a newline

1006 *

1007 * This helps confine use of CHR to this source file.

1008 */

1009static chr

1010 newline(void)

1011{

1012 return CHR('\n');

1013}

1014

1015/*

1016 * chrnamed - return the chr known by a given (chr string) name

1017 *

1018 * The code is a bit clumsy, but this routine gets only such specialized

1019 * use that it hardly matters.

1020 */

1021static chr

1022 chrnamed(struct vars *v,

1023 const chr *startp, /* start of name */

1024 const chr *endp, /* just past end of name */

1025 chr lastresort) /* what to return if name lookup fails */

1026{

1027 chr c;

1028 int errsave;

1029 int e;

1030 struct cvec *cv;

1031

1032 errsave = v->err;

1033 v->err = 0;

1034 c = element(v, startp, endp);

1035 e = v->err;

1036 v->err = errsave;

1037

1038 if (e != 0)

1039 return lastresort;

1040

1041 cv = range(v, c, c, 0);

1042 if (cv->nchrs == 0)

1043 return lastresort;

1044 return cv->chrs[0];

1045}

END

#define END

Definition: _int.h:160

ERR

#define ERR

Definition: _int.h:161

errsave

#define errsave(context,...)

Definition: elog.h:262

start

return str start

Definition: hashfn_unstable.h:282

len

const void size_t len

Definition: pg_crc32c_sse42.c:28

NOTE

@ NOTE

Definition: pg_regress.c:88

c

char * c

Definition: preproc-cursor.c:31

e

Definition: preproc-init.c:82

L_ERE

#define L_ERE

Definition: regc_lex.c:51

NEXT2

#define NEXT2(a, b)

Definition: regc_lex.c:39

RET

#define RET(c)

Definition: regc_lex.c:45

INTOCON

#define INTOCON(c)

Definition: regc_lex.c:60

INCON

#define INCON(con)

Definition: regc_lex.c:61

lexescape

static int lexescape(struct vars *v)

Definition: regc_lex.c:601

L_BBND

#define L_BBND

Definition: regc_lex.c:55

ATEOS

#define ATEOS()

Definition: regc_lex.c:36

L_Q

#define L_Q

Definition: regc_lex.c:53

skip

static void skip(struct vars *v)

Definition: regc_lex.c:982

lexdigits

static chr lexdigits(struct vars *v, int base, int minlen, int maxlen)

Definition: regc_lex.c:780

HAVE

#define HAVE(n)

Definition: regc_lex.c:37

LASTTYPE

#define LASTTYPE(t)

Definition: regc_lex.c:48

RETV

#define RETV(c, n)

Definition: regc_lex.c:46

newline

static chr newline(void)

Definition: regc_lex.c:1010

L_CEL

#define L_CEL

Definition: regc_lex.c:57

FAILW

#define FAILW(e)

Definition: regc_lex.c:47

L_EBND

#define L_EBND

Definition: regc_lex.c:54

L_ECL

#define L_ECL

Definition: regc_lex.c:58

brenext

static int brenext(struct vars *v, chr c)

Definition: regc_lex.c:861

lexstart

static void lexstart(struct vars *v)

Definition: regc_lex.c:70

prefixes

static void prefixes(struct vars *v)

Definition: regc_lex.c:99

ENDOF

#define ENDOF(array)

Definition: regc_lex.c:64

L_CCL

#define L_CCL

Definition: regc_lex.c:59

NEXT3

#define NEXT3(a, b, c)

Definition: regc_lex.c:40

L_BRACK

#define L_BRACK

Definition: regc_lex.c:56

chrnamed

static chr chrnamed(struct vars *v, const chr *startp, const chr *endp, chr lastresort)

Definition: regc_lex.c:1022

NEXT1

#define NEXT1(c)

Definition: regc_lex.c:38

L_BRE

#define L_BRE

Definition: regc_lex.c:52

Definition: regc_lex.c:200

range

static struct cvec * range(struct vars *v, chr a, chr b, int cases)

Definition: regc_locale.c:412

element

static chr element(struct vars *v, const chr *startp, const chr *endp)

Definition: regc_locale.c:376

COLLEL

#define COLLEL

Definition: regcomp.c:334

NWBDRY

#define NWBDRY

Definition: regcomp.c:345

NOERR

#define NOERR()

Definition: regcomp.c:321

EMPTY

#define EMPTY

Definition: regcomp.c:329

SBEGIN

#define SBEGIN

Definition: regcomp.c:347

ISERR

#define ISERR()

Definition: regcomp.c:317

CCLASSS

#define CCLASSS

Definition: regcomp.c:338

CCLASS

#define CCLASS

Definition: regcomp.c:336

WBDRY

#define WBDRY

Definition: regcomp.c:344

DIGIT

#define DIGIT

Definition: regcomp.c:332

CCLASSC

#define CCLASSC

Definition: regcomp.c:339

ECLASS

#define ECLASS

Definition: regcomp.c:335

BACKREF

#define BACKREF

Definition: regcomp.c:333

LACON

#define LACON

Definition: regcomp.c:341

EOS

#define EOS

Definition: regcomp.c:330

PLAIN

#define PLAIN

Definition: regcomp.c:331

SEND

#define SEND

Definition: regcomp.c:348

RANGE

#define RANGE

Definition: regcomp.c:340

uchr

unsigned uchr

Definition: regcustom.h:60

DIGITVAL

#define DIGITVAL(c)

Definition: regcustom.h:63

iscalnum

#define iscalnum(x)

Definition: regcustom.h:90

iscdigit

#define iscdigit(x)

Definition: regcustom.h:92

CHR_IS_IN_RANGE

#define CHR_IS_IN_RANGE(c)

Definition: regcustom.h:77

chr

pg_wchar chr

Definition: regcustom.h:59

CHR

#define CHR(c)

Definition: regcustom.h:62

iscspace

#define iscspace(x)

Definition: regcustom.h:93

iscalpha

#define iscalpha(x)

Definition: regcustom.h:91

assert

#define assert(x)

Definition: regcustom.h:56

REG_BADOPT

#define REG_BADOPT

Definition: regex.h:232

REG_ICASE

#define REG_ICASE

Definition: regex.h:184

REG_EBRACK

#define REG_EBRACK

Definition: regex.h:222

REG_UBOUNDS

#define REG_UBOUNDS

Definition: regex.h:140

REG_BADRPT

#define REG_BADRPT

Definition: regex.h:228

REG_EESCAPE

#define REG_EESCAPE

Definition: regex.h:220

REG_ULOOKAROUND

#define REG_ULOOKAROUND

Definition: regex.h:139

REG_UBBS

#define REG_UBBS

Definition: regex.h:144

REG_ADVANCED

#define REG_ADVANCED

Definition: regex.h:181

REG_EXPANDED

#define REG_EXPANDED

Definition: regex.h:186

REG_NLANCH

#define REG_NLANCH

Definition: regex.h:188

REG_EXTENDED

#define REG_EXTENDED

Definition: regex.h:179

REG_NLSTOP

#define REG_NLSTOP

Definition: regex.h:187

REG_ADVF

#define REG_ADVF

Definition: regex.h:180

REG_UUNSPEC

#define REG_UUNSPEC

Definition: regex.h:146

REG_UNONPOSIX

#define REG_UNONPOSIX

Definition: regex.h:145

REG_BADBR

#define REG_BADBR

Definition: regex.h:225

REG_NEWLINE

#define REG_NEWLINE

Definition: regex.h:189

REG_UBSALNUM

#define REG_UBSALNUM

Definition: regex.h:142

REG_ULOCALE

#define REG_ULOCALE

Definition: regex.h:148

REG_UUNPORT

#define REG_UUNPORT

Definition: regex.h:147

REG_EBRACE

#define REG_EBRACE

Definition: regex.h:224

REG_BADPAT

#define REG_BADPAT

Definition: regex.h:217

REG_BOSONLY

#define REG_BOSONLY

Definition: regex.h:192

REG_UBRACES

#define REG_UBRACES

Definition: regex.h:141

REG_UBACKREF

#define REG_UBACKREF

Definition: regex.h:138

REG_QUOTE

#define REG_QUOTE

Definition: regex.h:182

LATYPE_AHEAD_NEG

#define LATYPE_AHEAD_NEG

Definition: regguts.h:105

LATYPE_BEHIND_POS

#define LATYPE_BEHIND_POS

Definition: regguts.h:106

NOTREACHED

#define NOTREACHED

Definition: regguts.h:96

LATYPE_BEHIND_NEG

#define LATYPE_BEHIND_NEG

Definition: regguts.h:107

CC_WORD

@ CC_WORD

Definition: regguts.h:141

CC_SPACE

@ CC_SPACE

Definition: regguts.h:141

CC_DIGIT

@ CC_DIGIT

Definition: regguts.h:140

LATYPE_AHEAD_POS

#define LATYPE_AHEAD_POS

Definition: regguts.h:104

cvec

Definition: regguts.h:279

cvec::nchrs

int nchrs

Definition: regguts.h:280

cvec::chrs

chr * chrs

Definition: regguts.h:282

vars

Definition: regcomp.c:282

vars::now

const chr * now

Definition: regcomp.c:284

vars::err

int err

Definition: regcomp.c:286

vars::cflags

int cflags

Definition: regcomp.c:287

vars::lexcon

int lexcon

Definition: regcomp.c:291

vars::nexttype

int nexttype

Definition: regcomp.c:289

vars::lasttype

int lasttype

Definition: regcomp.c:288

PostgreSQL Source Code: src/backend/regex/regc_lex.c Source File