1/*-------------------------------------------------------------------------
4 * Main entry point/driver for PostgreSQL grammar
6 * Note that the grammar is not allowed to perform any table access
7 * (since we need to be able to do basic parsing even while inside an
8 * aborted transaction). Therefore, the data structures returned by
9 * the grammar are "raw" parsetrees that still need to be analyzed by
10 * analyze.c and related files.
13 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
14 * Portions Copyright (c) 1994, Regents of the University of California
17 * src/backend/parser/parser.c
19 *-------------------------------------------------------------------------
36 * Given a query in string form, do lexical and grammatical analysis.
38 * Returns a list of raw (un-analyzed) parse trees. The contents of the
39 * list have the form required by the specified RawParseMode.
48 /* initialize the flex scanner */
52 /* base_yylex() only needs us to initialize the lookahead token, if any */
57 /* this array is indexed by RawParseMode enum */
58 static const int mode_token[] = {
73 /* initialize the bison parser */
79 /* Clean up (release memory) */
82 if (yyresult)
/* error */
90 * Intermediate filter between parser and core lexer (core_yylex in scan.l).
92 * This filter is needed because in some cases the standard SQL grammar
93 * requires more than one token lookahead. We reduce these cases to one-token
94 * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
96 * Using a filter is simpler than trying to recognize multiword tokens
97 * directly in scan.l, because we'd have to allow for comments between the
98 * words. Furthermore it's not clear how to do that without re-introducing
99 * scanner backtrack, which would cost more performance than this filter layer does.
102 * We also use this filter to convert UIDENT and USCONST sequences into
103 * plain IDENT and SCONST tokens. While that could be handled by additional
104 * productions in the main grammar, it's more efficient to do it like this.
106 * The filter also provides a convenient place to translate between
107 * the core_YYSTYPE and YYSTYPE representations (which are really the
108 * same thing anyway, but notationally they're different).
116 int cur_token_length;
119 /* Get next token --- we might already have it */
122 cur_token =
yyextra->lookahead_token;
123 lvalp->core_yystype =
yyextra->lookahead_yylval;
124 *llocp =
yyextra->lookahead_yylloc;
127 yyextra->have_lookahead =
false;
130 cur_token =
core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
133 * If this token isn't one that requires lookahead, just return it. If it
134 * does, determine the token length. (We could get that via strlen(), but
135 * since we have such a small set of possibilities, hardwiring seems
136 * feasible and more efficient --- at least for the fixed-length cases.)
141 cur_token_length = 6;
144 cur_token_length = 3;
147 cur_token_length = 5;
150 cur_token_length = 4;
154 cur_token_length = strlen(
yyextra->core_yy_extra.scanbuf + *llocp);
157 cur_token_length = 7;
164 * Identify end+1 of current token. core_yylex() has temporarily stored a
165 * '\0' here, and will undo that when we call it again. We need to redo
166 * it to fully revert the lookahead call for error reporting purposes.
169 *llocp + cur_token_length;
173 * Save and restore *llocp around the call. It might look like we could
174 * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
175 * does not work because flex actually holds onto the last-passed pointer
176 * internally, and will use that for error reporting. We need any error
177 * reports to point to the current token, not the next one.
181 /* Get next token, saving outputs into lookahead variables */
184 yyextra->lookahead_yylloc = *llocp;
188 /* Now revert the un-truncation of the current token */
190 *(
yyextra->lookahead_end) =
'\0';
192 yyextra->have_lookahead =
true;
194 /* Replace cur_token if needed, based on lookahead */
198 /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
202 cur_token = FORMAT_LA;
208 /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
222 /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
227 cur_token = NULLS_LA;
233 /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
244 /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
248 cur_token = WITHOUT_LA;
255 /* Look ahead for UESCAPE */
258 /* Yup, so get third token, which had better be SCONST */
261 /* Again save and restore *llocp */
264 /* Un-truncate current token so errors point to third token */
267 /* Get third token */
271 /* If we throw error here, it will point to third token */
276 escstr =
yyextra->lookahead_yylval.str;
281 /* Now restore *llocp; errors will point to first token */
284 /* Apply Unicode conversion */
285 lvalp->core_yystype.str =
292 * We don't need to revert the un-truncation of UESCAPE. What
293 * we do want to do is clear have_lookahead, thereby consuming all three tokens.
296 yyextra->have_lookahead =
false;
300 /* No UESCAPE, so convert using default escape character */
301 lvalp->core_yystype.str =
308 if (cur_token == UIDENT)
310 /* It's an identifier, so truncate as appropriate */
312 strlen(lvalp->core_yystype.str),
316 else if (cur_token == USCONST)
326/* convert hex digit (caller should have verified that) to value */
330 if (
c >=
'0' &&
c <=
'9')
332 if (
c >=
'a' &&
c <=
'f')
333 return c -
'a' + 0xA;
334 if (
c >=
'A' &&
c <=
'F')
335 return c -
'A' + 0xA;
336 elog(
ERROR,
"invalid hexadecimal digit");
337 return 0;
/* not reached */
340/* is Unicode code point acceptable? */
346 (
errcode(ERRCODE_SYNTAX_ERROR),
347 errmsg(
"invalid Unicode escape value")));
350/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
365 * Process Unicode escapes in "str", producing a palloc'd plain string
367 * escape: the escape character to use
368 * position: start position of U&'' or U&"" string token
369 * yyscanner: context information needed for error reports
383 * Guesstimate that result will be no longer than input, but allow enough
384 * padding for Unicode conversion.
393 /* Enlarge string if needed */
394 size_t out_dist = out -
new;
400 out =
new + out_dist;
406 * Any errors reported while processing this escape sequence will
407 * have an error cursor pointing at the escape.
410 in -
str + position + 3);
/* 3 for U&" */
418 else if (isxdigit((
unsigned char) in[1]) &&
419 isxdigit((
unsigned char) in[2]) &&
420 isxdigit((
unsigned char) in[3]) &&
421 isxdigit((
unsigned char) in[4]))
425 unicode = (
hexval(in[1]) << 12) +
444 pair_first = unicode;
452 else if (in[1] ==
'+' &&
453 isxdigit((
unsigned char) in[2]) &&
454 isxdigit((
unsigned char) in[3]) &&
455 isxdigit((
unsigned char) in[4]) &&
456 isxdigit((
unsigned char) in[5]) &&
457 isxdigit((
unsigned char) in[6]) &&
458 isxdigit((
unsigned char) in[7]))
462 unicode = (
hexval(in[2]) << 20) +
483 pair_first = unicode;
493 (
errcode(ERRCODE_SYNTAX_ERROR),
494 errmsg(
"invalid Unicode escape"),
495 errhint(
"Unicode escapes must be \\XXXX or \\+XXXXXX.")));
508 /* unfinished surrogate pair? */
516 * We might get here with the error callback active, or not. Call
517 * scanner_errposition to make sure an error cursor appears; if the
518 * callback is active, this is duplicative but harmless.
522 (
errcode(ERRCODE_SYNTAX_ERROR),
523 errmsg(
"invalid Unicode surrogate pair"),
526 return NULL;
/* keep compiler quiet */
static void check_unicode_value(pg_wchar c)
List * raw_parser(const char *str, RawParseMode mode)
static unsigned int hexval(unsigned char c)
static char * str_udeescape(const char *str, char escape, int position, core_yyscan_t yyscanner)
int base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
static bool check_uescapechar(unsigned char escape)
int errhint(const char *fmt,...)
int errcode(int sqlerrcode)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
#define pg_yyget_extra(yyscanner)
void parser_init(base_yy_extra_type *yyext)
int base_yyparse(core_yyscan_t yyscanner)
Assert(PointerIsAligned(start, uint64))
static bool next_token(char **lineptr, StringInfo buf, bool *initial_quote, bool *terminating_comma)
PGDLLIMPORT const ScanKeywordList ScanKeywords
void pg_unicode_to_server(pg_wchar c, unsigned char *s)
void * repalloc(void *pointer, Size size)
@ RAW_PARSE_PLPGSQL_ASSIGN2
@ RAW_PARSE_PLPGSQL_ASSIGN1
@ RAW_PARSE_PLPGSQL_ASSIGN3
static PgChecksumMode mode
#define MAX_UNICODE_EQUIVALENT_STRING
static bool is_valid_unicode_codepoint(pg_wchar c)
static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
static bool is_utf16_surrogate_first(pg_wchar c)
static bool is_utf16_surrogate_second(pg_wchar c)
int scanner_errposition(int location, core_yyscan_t yyscanner)
core_yyscan_t scanner_init(const char *str, core_yy_extra_type *yyext, const ScanKeywordList *keywordlist, const uint16 *keyword_tokens)
void setup_scanner_errposition_callback(ScannerCallbackState *scbstate, core_yyscan_t yyscanner, int location)
void scanner_finish(core_yyscan_t yyscanner)
void cancel_scanner_errposition_callback(ScannerCallbackState *scbstate)
const uint16 ScanKeywordTokens[]
void scanner_yyerror(const char *message, core_yyscan_t yyscanner)
int core_yylex(core_YYSTYPE *yylval_param, YYLTYPE *yylloc_param, core_yyscan_t yyscanner)
void truncate_identifier(char *ident, int len, bool warn)
bool scanner_isspace(char ch)