[フレーム]

parser.c

Go to the documentation of this file.

/*-------------------------------------------------------------------------
 *
 * parser.c
 *		Main entry point/driver for PostgreSQL grammar
 *
 * Note that the grammar is not allowed to perform any table access
 * (since we need to be able to do basic parsing even while inside an
 * aborted transaction).  Therefore, the data structures returned by
 * the grammar are "raw" parsetrees that still need to be analyzed by
 * analyze.c and related files.
 *
 *
 *
 * IDENTIFICATION
 *	  src/backend/parser/parser.c
 *
 *-------------------------------------------------------------------------
 */

21

22#include "postgres.h"

23

24#include "gramparse.h"

25#include "mb/pg_wchar.h"

26#include "parser/parser.h"

27#include "parser/scansup.h"

28

/* Forward declarations of file-local helpers that are defined further down. */
29static bool check_uescapechar(unsigned char escape);

30static char *str_udeescape(const char *str, char escape,

31 int position, core_yyscan_t yyscanner);

32

33

34/*

35 * raw_parser

36 * Given a query in string form, do lexical and grammatical analysis.

37 *

38 * Returns a list of raw (un-analyzed) parse trees. The contents of the

39 * list have the form required by the specified RawParseMode.

40 */

41List *

42 raw_parser(const char *str, RawParseMode mode)

43{

44 core_yyscan_t yyscanner;

45 base_yy_extra_type yyextra;

46 int yyresult;

47

48 /* initialize the flex scanner */

49 yyscanner = scanner_init(str, &yyextra.core_yy_extra,

50 &ScanKeywords, ScanKeywordTokens);

51

52 /* base_yylex() only needs us to initialize the lookahead token, if any */

53 if (mode == RAW_PARSE_DEFAULT)

54 yyextra.have_lookahead = false;

55 else

56 {

57 /* this array is indexed by RawParseMode enum */

58 static const int mode_token[] = {

59 [RAW_PARSE_DEFAULT] = 0,

60 [RAW_PARSE_TYPE_NAME] = MODE_TYPE_NAME,

61 [RAW_PARSE_PLPGSQL_EXPR] = MODE_PLPGSQL_EXPR,

62 [RAW_PARSE_PLPGSQL_ASSIGN1] = MODE_PLPGSQL_ASSIGN1,

63 [RAW_PARSE_PLPGSQL_ASSIGN2] = MODE_PLPGSQL_ASSIGN2,

64 [RAW_PARSE_PLPGSQL_ASSIGN3] = MODE_PLPGSQL_ASSIGN3,

65 };

66

67 yyextra.have_lookahead = true;

68 yyextra.lookahead_token = mode_token[mode];

69 yyextra.lookahead_yylloc = 0;

70 yyextra.lookahead_end = NULL;

71 }

72

73 /* initialize the bison parser */

74 parser_init(&yyextra);

75

76 /* Parse! */

77 yyresult = base_yyparse(yyscanner);

78

79 /* Clean up (release memory) */

80 scanner_finish(yyscanner);

81

82 if (yyresult) /* error */

83 return NIL;

84

85 return yyextra.parsetree;

86}

87

88

89/*

90 * Intermediate filter between parser and core lexer (core_yylex in scan.l).

91 *

92 * This filter is needed because in some cases the standard SQL grammar

93 * requires more than one token lookahead. We reduce these cases to one-token

94 * lookahead by replacing tokens here, in order to keep the grammar LALR(1).

95 *

96 * Using a filter is simpler than trying to recognize multiword tokens

97 * directly in scan.l, because we'd have to allow for comments between the

98 * words. Furthermore it's not clear how to do that without re-introducing

99 * scanner backtrack, which would cost more performance than this filter

100 * layer does.

101 *

102 * We also use this filter to convert UIDENT and USCONST sequences into

103 * plain IDENT and SCONST tokens. While that could be handled by additional

104 * productions in the main grammar, it's more efficient to do it like this.

105 *

106 * The filter also provides a convenient place to translate between

107 * the core_YYSTYPE and YYSTYPE representations (which are really the

108 * same thing anyway, but notationally they're different).

109 */

110int

111 base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)

112{

113 base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);

114 int cur_token;

115 int next_token;

116 int cur_token_length;

117 YYLTYPE cur_yylloc;

118

119 /* Get next token --- we might already have it */

120 if (yyextra->have_lookahead)

121 {

122 cur_token = yyextra->lookahead_token;

123 lvalp->core_yystype = yyextra->lookahead_yylval;

124 *llocp = yyextra->lookahead_yylloc;

125 if (yyextra->lookahead_end)

126 *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;

127 yyextra->have_lookahead = false;

128 }

129 else

130 cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);

131

132 /*

133 * If this token isn't one that requires lookahead, just return it. If it

134 * does, determine the token length. (We could get that via strlen(), but

135 * since we have such a small set of possibilities, hardwiring seems

136 * feasible and more efficient --- at least for the fixed-length cases.)

137 */

138 switch (cur_token)

139 {

140 case FORMAT:

141 cur_token_length = 6;

142 break;

143 case NOT:

144 cur_token_length = 3;

145 break;

146 case NULLS_P:

147 cur_token_length = 5;

148 break;

149 case WITH:

150 cur_token_length = 4;

151 break;

152 case UIDENT:

153 case USCONST:

154 cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);

155 break;

156 case WITHOUT:

157 cur_token_length = 7;

158 break;

159 default:

160 return cur_token;

161 }

162

163 /*

164 * Identify end+1 of current token. core_yylex() has temporarily stored a

165 * '0円' here, and will undo that when we call it again. We need to redo

166 * it to fully revert the lookahead call for error reporting purposes.

167 */

168 yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +

169 *llocp + cur_token_length;

170 Assert(*(yyextra->lookahead_end) == '0円');

171

172 /*

173 * Save and restore *llocp around the call. It might look like we could

174 * avoid this by just passing &lookahead_yylloc to core_yylex(), but that

175 * does not work because flex actually holds onto the last-passed pointer

176 * internally, and will use that for error reporting. We need any error

177 * reports to point to the current token, not the next one.

178 */

179 cur_yylloc = *llocp;

180

181 /* Get next token, saving outputs into lookahead variables */

182 next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);

183 yyextra->lookahead_token = next_token;

184 yyextra->lookahead_yylloc = *llocp;

185

186 *llocp = cur_yylloc;

187

188 /* Now revert the un-truncation of the current token */

189 yyextra->lookahead_hold_char = *(yyextra->lookahead_end);

190 *(yyextra->lookahead_end) = '0円';

191

192 yyextra->have_lookahead = true;

193

194 /* Replace cur_token if needed, based on lookahead */

195 switch (cur_token)

196 {

197 case FORMAT:

198 /* Replace FORMAT by FORMAT_LA if it's followed by JSON */

199 switch (next_token)

200 {

201 case JSON:

202 cur_token = FORMAT_LA;

203 break;

204 }

205 break;

206

207 case NOT:

208 /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */

209 switch (next_token)

210 {

211 case BETWEEN:

212 case IN_P:

213 case LIKE:

214 case ILIKE:

215 case SIMILAR:

216 cur_token = NOT_LA;

217 break;

218 }

219 break;

220

221 case NULLS_P:

222 /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */

223 switch (next_token)

224 {

225 case FIRST_P:

226 case LAST_P:

227 cur_token = NULLS_LA;

228 break;

229 }

230 break;

231

232 case WITH:

233 /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */

234 switch (next_token)

235 {

236 case TIME:

237 case ORDINALITY:

238 cur_token = WITH_LA;

239 break;

240 }

241 break;

242

243 case WITHOUT:

244 /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */

245 switch (next_token)

246 {

247 case TIME:

248 cur_token = WITHOUT_LA;

249 break;

250 }

251 break;

252

253 case UIDENT:

254 case USCONST:

255 /* Look ahead for UESCAPE */

256 if (next_token == UESCAPE)

257 {

258 /* Yup, so get third token, which had better be SCONST */

259 const char *escstr;

260

261 /* Again save and restore *llocp */

262 cur_yylloc = *llocp;

263

264 /* Un-truncate current token so errors point to third token */

265 *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;

266

267 /* Get third token */

268 next_token = core_yylex(&(yyextra->lookahead_yylval),

269 llocp, yyscanner);

270

271 /* If we throw error here, it will point to third token */

272 if (next_token != SCONST)

273 scanner_yyerror("UESCAPE must be followed by a simple string literal",

274 yyscanner);

275

276 escstr = yyextra->lookahead_yylval.str;

277 if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))

278 scanner_yyerror("invalid Unicode escape character",

279 yyscanner);

280

281 /* Now restore *llocp; errors will point to first token */

282 *llocp = cur_yylloc;

283

284 /* Apply Unicode conversion */

285 lvalp->core_yystype.str =

286 str_udeescape(lvalp->core_yystype.str,

287 escstr[0],

288 *llocp,

289 yyscanner);

290

291 /*

292 * We don't need to revert the un-truncation of UESCAPE. What

293 * we do want to do is clear have_lookahead, thereby consuming

294 * all three tokens.

295 */

296 yyextra->have_lookahead = false;

297 }

298 else

299 {

300 /* No UESCAPE, so convert using default escape character */

301 lvalp->core_yystype.str =

302 str_udeescape(lvalp->core_yystype.str,

303 '\\',

304 *llocp,

305 yyscanner);

306 }

307

308 if (cur_token == UIDENT)

309 {

310 /* It's an identifier, so truncate as appropriate */

311 truncate_identifier(lvalp->core_yystype.str,

312 strlen(lvalp->core_yystype.str),

313 true);

314 cur_token = IDENT;

315 }

316 else if (cur_token == USCONST)

317 {

318 cur_token = SCONST;

319 }

320 break;

321 }

322

323 return cur_token;

324}

325

326/* convert hex digit (caller should have verified that) to value */

327static unsigned int

328 hexval(unsigned char c)

329{

330 if (c >= '0' && c <= '9')

331 return c - '0';

332 if (c >= 'a' && c <= 'f')

333 return c - 'a' + 0xA;

334 if (c >= 'A' && c <= 'F')

335 return c - 'A' + 0xA;

336 elog(ERROR, "invalid hexadecimal digit");

337 return 0; /* not reached */

338}

339

340/* is Unicode code point acceptable? */

341static void

342 check_unicode_value(pg_wchar c)

343{

344 if (!is_valid_unicode_codepoint(c))

345 ereport(ERROR,

346 (errcode(ERRCODE_SYNTAX_ERROR),

347 errmsg("invalid Unicode escape value")));

348}

349

/*
 * check_uescapechar
 *		Decide whether 'escape' may serve as the Unicode escape character
 *		(UESCAPE syntax).
 *
 * Returns false for hexadecimal digits, '+', single and double quotes, and
 * anything scanner_isspace() treats as whitespace; true for everything else.
 */
static bool
check_uescapechar(unsigned char escape)
{
	return !(isxdigit(escape)
			 || escape == '+'
			 || escape == '\''
			 || escape == '"'
			 || scanner_isspace(escape));
}

363

364/*

365 * Process Unicode escapes in "str", producing a palloc'd plain string

366 *

367 * escape: the escape character to use

368 * position: start position of U&'' or U&"" string token

369 * yyscanner: context information needed for error reports

370 */

371static char *

372 str_udeescape(const char *str, char escape,

373 int position, core_yyscan_t yyscanner)

374{

375 const char *in;

376 char *new,

377 *out;

378 size_t new_len;

379 pg_wchar pair_first = 0;

380 ScannerCallbackState scbstate;

381

382 /*

383 * Guesstimate that result will be no longer than input, but allow enough

384 * padding for Unicode conversion.

385 */

386 new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;

387 new = palloc(new_len);

388

389 in = str;

390 out = new;

391 while (*in)

392 {

393 /* Enlarge string if needed */

394 size_t out_dist = out - new;

395

396 if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))

397 {

398 new_len *= 2;

399 new = repalloc(new, new_len);

400 out = new + out_dist;

401 }

402

403 if (in[0] == escape)

404 {

405 /*

406 * Any errors reported while processing this escape sequence will

407 * have an error cursor pointing at the escape.

408 */

409 setup_scanner_errposition_callback(&scbstate, yyscanner,

410 in - str + position + 3); /* 3 for U&" */

411 if (in[1] == escape)

412 {

413 if (pair_first)

414 goto invalid_pair;

415 *out++ = escape;

416 in += 2;

417 }

418 else if (isxdigit((unsigned char) in[1]) &&

419 isxdigit((unsigned char) in[2]) &&

420 isxdigit((unsigned char) in[3]) &&

421 isxdigit((unsigned char) in[4]))

422 {

423 pg_wchar unicode;

424

425 unicode = (hexval(in[1]) << 12) +

426 (hexval(in[2]) << 8) +

427 (hexval(in[3]) << 4) +

428 hexval(in[4]);

429 check_unicode_value(unicode);

430 if (pair_first)

431 {

432 if (is_utf16_surrogate_second(unicode))

433 {

434 unicode = surrogate_pair_to_codepoint(pair_first, unicode);

435 pair_first = 0;

436 }

437 else

438 goto invalid_pair;

439 }

440 else if (is_utf16_surrogate_second(unicode))

441 goto invalid_pair;

442

443 if (is_utf16_surrogate_first(unicode))

444 pair_first = unicode;

445 else

446 {

447 pg_unicode_to_server(unicode, (unsigned char *) out);

448 out += strlen(out);

449 }

450 in += 5;

451 }

452 else if (in[1] == '+' &&

453 isxdigit((unsigned char) in[2]) &&

454 isxdigit((unsigned char) in[3]) &&

455 isxdigit((unsigned char) in[4]) &&

456 isxdigit((unsigned char) in[5]) &&

457 isxdigit((unsigned char) in[6]) &&

458 isxdigit((unsigned char) in[7]))

459 {

460 pg_wchar unicode;

461

462 unicode = (hexval(in[2]) << 20) +

463 (hexval(in[3]) << 16) +

464 (hexval(in[4]) << 12) +

465 (hexval(in[5]) << 8) +

466 (hexval(in[6]) << 4) +

467 hexval(in[7]);

468 check_unicode_value(unicode);

469 if (pair_first)

470 {

471 if (is_utf16_surrogate_second(unicode))

472 {

473 unicode = surrogate_pair_to_codepoint(pair_first, unicode);

474 pair_first = 0;

475 }

476 else

477 goto invalid_pair;

478 }

479 else if (is_utf16_surrogate_second(unicode))

480 goto invalid_pair;

481

482 if (is_utf16_surrogate_first(unicode))

483 pair_first = unicode;

484 else

485 {

486 pg_unicode_to_server(unicode, (unsigned char *) out);

487 out += strlen(out);

488 }

489 in += 8;

490 }

491 else

492 ereport(ERROR,

493 (errcode(ERRCODE_SYNTAX_ERROR),

494 errmsg("invalid Unicode escape"),

495 errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));

496

497 cancel_scanner_errposition_callback(&scbstate);

498 }

499 else

500 {

501 if (pair_first)

502 goto invalid_pair;

503

504 *out++ = *in++;

505 }

506 }

507

508 /* unfinished surrogate pair? */

509 if (pair_first)

510 goto invalid_pair;

511

512 *out = '0円';

513 return new;

514

515 /*

516 * We might get here with the error callback active, or not. Call

517 * scanner_errposition to make sure an error cursor appears; if the

518 * callback is active, this is duplicative but harmless.

519 */

520invalid_pair:

521 ereport(ERROR,

522 (errcode(ERRCODE_SYNTAX_ERROR),

523 errmsg("invalid Unicode surrogate pair"),

524 scanner_errposition(in - str + position + 3, /* 3 for U&" */

525 yyscanner)));

526 return NULL; /* keep compiler quiet */

527}

check_unicode_value

static void check_unicode_value(pg_wchar c)

Definition: parser.c:342

raw_parser

List * raw_parser(const char *str, RawParseMode mode)

Definition: parser.c:42

hexval

static unsigned int hexval(unsigned char c)

Definition: parser.c:328

str_udeescape

static char * str_udeescape(const char *str, char escape, int position, core_yyscan_t yyscanner)

Definition: parser.c:372

base_yylex

int base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)

Definition: parser.c:111

check_uescapechar

static bool check_uescapechar(unsigned char escape)

Definition: parser.c:352

errhint

int errhint(const char *fmt,...)

Definition: elog.c:1321

errcode

int errcode(int sqlerrcode)

Definition: elog.c:854

errmsg

int errmsg(const char *fmt,...)

Definition: elog.c:1071

ERROR

#define ERROR

Definition: elog.h:39

elog

#define elog(elevel,...)

Definition: elog.h:226

ereport

#define ereport(elevel,...)

Definition: elog.h:150

gramparse.h

pg_yyget_extra

#define pg_yyget_extra(yyscanner)

Definition: gramparse.h:64

parser_init

void parser_init(base_yy_extra_type *yyext)

base_yyparse

int base_yyparse(core_yyscan_t yyscanner)

Assert

Assert(PointerIsAligned(start, uint64))

str

const char * str

Definition: hashfn_unstable.h:254

next_token

static bool next_token(char **lineptr, StringInfo buf, bool *initial_quote, bool *terminating_comma)

Definition: hba.c:187

ScanKeywords

PGDLLIMPORT const ScanKeywordList ScanKeywords

pg_wchar

unsigned int pg_wchar

Definition: mbprint.c:31

pg_unicode_to_server

void pg_unicode_to_server(pg_wchar c, unsigned char *s)

Definition: mbutils.c:865

repalloc

void * repalloc(void *pointer, Size size)

Definition: mcxt.c:1610

palloc

void * palloc(Size size)

Definition: mcxt.c:1365

parser.h

RawParseMode

Definition: parser.h:38

RAW_PARSE_PLPGSQL_EXPR

@ RAW_PARSE_PLPGSQL_EXPR

Definition: parser.h:41

RAW_PARSE_PLPGSQL_ASSIGN2

@ RAW_PARSE_PLPGSQL_ASSIGN2

Definition: parser.h:43

RAW_PARSE_PLPGSQL_ASSIGN1

@ RAW_PARSE_PLPGSQL_ASSIGN1

Definition: parser.h:42

RAW_PARSE_TYPE_NAME

@ RAW_PARSE_TYPE_NAME

Definition: parser.h:40

RAW_PARSE_PLPGSQL_ASSIGN3

@ RAW_PARSE_PLPGSQL_ASSIGN3

Definition: parser.h:44

RAW_PARSE_DEFAULT

@ RAW_PARSE_DEFAULT

Definition: parser.h:39

mode

static PgChecksumMode mode

Definition: pg_checksums.c:55

NIL

#define NIL

Definition: pg_list.h:68

pg_wchar.h

MAX_UNICODE_EQUIVALENT_STRING

#define MAX_UNICODE_EQUIVALENT_STRING

Definition: pg_wchar.h:329

is_valid_unicode_codepoint

static bool is_valid_unicode_codepoint(pg_wchar c)

Definition: pg_wchar.h:519

surrogate_pair_to_codepoint

static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)

Definition: pg_wchar.h:537

is_utf16_surrogate_first

static bool is_utf16_surrogate_first(pg_wchar c)

Definition: pg_wchar.h:525

is_utf16_surrogate_second

static bool is_utf16_surrogate_second(pg_wchar c)

Definition: pg_wchar.h:531

postgres.h

c

char * c

Definition: preproc-cursor.c:31

YYLTYPE

const char * YYLTYPE

Definition: preproc_extern.h:20

YYSTYPE

int YYSTYPE

Definition: psqlscanslash.l:39

scanner_errposition

int scanner_errposition(int location, core_yyscan_t yyscanner)

Definition: scan.l:1140

scanner_init

core_yyscan_t scanner_init(const char *str, core_yy_extra_type *yyext, const ScanKeywordList *keywordlist, const uint16 *keyword_tokens)

Definition: scan.l:1249

setup_scanner_errposition_callback

void setup_scanner_errposition_callback(ScannerCallbackState *scbstate, core_yyscan_t yyscanner, int location)

Definition: scan.l:1186

scanner_finish

void scanner_finish(core_yyscan_t yyscanner)

Definition: scan.l:1291

cancel_scanner_errposition_callback

void cancel_scanner_errposition_callback(ScannerCallbackState *scbstate)

Definition: scan.l:1203

yyextra

#define yyextra

Definition: scan.l:1118

ScanKeywordTokens

const uint16 ScanKeywordTokens[]

Definition: scan.l:81

scanner_yyerror

void scanner_yyerror(const char *message, core_yyscan_t yyscanner)

Definition: scan.l:1222

core_yyscan_t

void * core_yyscan_t

Definition: scanner.h:121

core_yylex

int core_yylex(core_YYSTYPE *yylval_param, YYLTYPE *yylloc_param, core_yyscan_t yyscanner)

truncate_identifier

void truncate_identifier(char *ident, int len, bool warn)

Definition: scansup.c:93

scanner_isspace

bool scanner_isspace(char ch)

Definition: scansup.c:117

scansup.h

List

Definition: pg_list.h:54

ScannerCallbackState

Definition: scanner.h:125

base_yy_extra_type

Definition: gramparse.h:36

PostgreSQL Source Code: src/backend/parser/parser.c Source File