1/*-------------------------------------------------------------------------
4 * Postgres' interface to the regular expression package.
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/utils/adt/regexp.c
13 * Alistair Crooks added the code for the regex caching
14 * agc - cached the regular expressions used - there's a good chance
15 * that we'll get a hit, so this saves a compile step for every
16 * attempted match. I haven't actually measured the speed improvement,
17 * but it `looks' a lot quicker visually when watching regression test output.
20 * agc - incorporated Keith Bostic's Berkeley regex code into
21 * the tree for all ports. To distinguish this regex code from any that
22 * is existent on a platform, I've prepended the string "pg_" to
23 * the functions regcomp, regerror, regexec and regfree.
24 * Fixed a bug that was originally a typo by me, where `i' was used
25 * instead of `oldest' when compiling regular expressions - benign
26 * results mostly, although occasionally it bit you...
28 *-------------------------------------------------------------------------
40 #define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
41 (PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL)
44/* all the options of interest for regex functions */
47 int cflags;
/* compile flags for Spencer's regex code */
48 bool glob;
/* do it globally (for each occurrence) */
51/* cross-call state for regexp_match and regexp_split functions */
55 int nmatches;
/* number of places where pattern matched */
56 int npatterns;
/* number of capturing subpatterns */
57 /* We store start char index and end+1 char index for each match */
58 /* so the number of entries in match_locs is nmatches * npatterns * 2 */
60 int next_match;
/* 0-based index of next match to process */
61 /* workspace for build_regexp_match_result() */
63 bool *
nulls;
/* has npatterns elements */
65 char *
conv_buf;
/* conversion buffer, if needed */
70 * We cache precompiled regular expressions using a "self organizing list"
71 * structure, in which recently-used items tend to be near the front.
72 * Whenever we use an entry, it's moved up to the front of the list.
73 * Over time, an item's average position corresponds to its frequency of use.
75 * When we first create an entry, it's inserted at the front of
76 * the array, dropping the entry at the end of the array if necessary to
77 * make room. (This might seem to be weighting the new entry too heavily,
78 * but if we insert new entries further back, we'll be unable to adjust to
79 * a sudden shift in the query mix where we are presented with MAX_CACHED_RES
80 * never-before-seen items used circularly. We ought to be able to handle
81 * that case, so we have to insert at the front.)
83 * Knuth mentions a variant strategy in which a used item is moved up just
84 * one place in the list. Although he says this uses fewer comparisons on
85 * average, it seems not to adapt very well to the situation where you have
86 * both some reusable patterns and a steady stream of non-reusable patterns.
87 * A reusable pattern that isn't used at least as often as non-reusable
88 * patterns are seen will "fail to keep up" and will drop off the end of the
89 * cache. With move-to-front, a reusable pattern is guaranteed to stay in
90 * the cache as long as it's used at least once in every MAX_CACHED_RES uses.
93/* this is the maximum number of cached regular expressions */
95 #define MAX_CACHED_RES 32
98/* A parent memory context for regular expressions. */
101/* this structure describes one cached regular expression */
105 char *
cre_pat;
/* original RE (not null terminated!) */
112 static int num_res = 0;
/* # of cached re's */
121 bool use_subpatterns,
122 bool ignore_degenerate,
123 bool fetching_unmatched);
129 * RE_compile_and_cache - compile a RE, caching if possible
133 * text_re --- the pattern, expressed as a TEXT object
134 * cflags --- compile options for the pattern
135 * collation --- collation to use for LC_CTYPE-dependent behavior
137 * Pattern is given in the database encoding. We internally convert to
138 * an array of pg_wchar, which is what Spencer's regex package wants.
154 * Look for a match among previously compiled REs. Since the data
155 * structure is self-organizing with most-used entries at the front, our
156 * search strategy can just be to scan from the front.
160 if (
re_array[
i].cre_pat_len == text_re_len &&
162 re_array[
i].cre_collation == collation &&
163 memcmp(
re_array[
i].cre_pat, text_re_val, text_re_len) == 0)
166 * Found a match; move it to front if not there already.
179 /* Set up the cache memory on first go through. */
183 "RegexpCacheMemoryContext",
187 * Couldn't find it, so try to compile the new RE. To avoid leaking
188 * resources on failure, we build into the re_temp local.
191 /* Convert pattern string to wide characters */
198 * Make a memory context for this compiled regexp. This is initially a
199 * child of the current memory context, so it will be cleaned up
200 * automatically if compilation is interrupted and throws an ERROR. We'll
201 * re-parent it under the longer lived cache context if we make it to the
202 * bottom of this function.
205 "RegexpMemoryContext",
219 /* re didn't compile (no need for pg_regfree, if so) */
222 (
errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
223 errmsg(
"invalid regular expression: %s", errMsg)));
226 /* Copy the pattern into the per-regexp memory context. */
228 memcpy(re_temp.
cre_pat, text_re_val, text_re_len);
231 * NUL-terminate it only for the benefit of the identifier used for the
232 * memory context, visible in the pg_backend_memory_contexts view.
234 re_temp.
cre_pat[text_re_len] = 0;
242 * Okay, we have a valid new item in re_temp; insert it into the storage
243 * array. Discard last entry if needed.
249 /* Delete the memory context holding the regexp and pattern. */
253 /* Re-parent the memory context to our long-lived cache context. */
268 * RE_wchar_execute - execute a RE on pg_wchar data
270 * Returns true on match, false on no match
272 * re --- the compiled pattern as returned by RE_compile_and_cache
273 * data --- the data to match against (need not be null-terminated)
274 * data_len --- the length of the data string
275 * start_search -- the offset in the data to start searching
276 * nmatch, pmatch --- optional return area for match details
278 * Data is given as array of pg_wchar which is what Spencer's regex package wants.
283 int start_search,
int nmatch,
regmatch_t *pmatch)
288 /* Perform RE match and return result */
293 NULL,
/* no details */
301 pg_regerror(regexec_result, re, errMsg,
sizeof(errMsg));
303 (
errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
304 errmsg(
"regular expression failed: %s", errMsg)));
307 return (regexec_result ==
REG_OKAY);
311 * RE_execute - execute a RE
313 * Returns true on match, false on no match
315 * re --- the compiled pattern as returned by RE_compile_and_cache
316 * dat --- the data to match against (need not be null-terminated)
317 * dat_len --- the length of the data string
318 * nmatch, pmatch --- optional return area for match details
320 * Data is given in the database encoding. We internally
321 * convert to array of pg_wchar which is what Spencer's regex package wants.
331 /* Convert data string to wide characters */
335 /* Perform RE match and return result */
343 * RE_compile_and_execute - compile and execute a RE
345 * Returns true on match, false on no match
347 * text_re --- the pattern, expressed as a TEXT object
348 * dat --- the data to match against (need not be null-terminated)
349 * dat_len --- the length of the data string
350 * cflags --- compile options for the pattern
351 * collation --- collation to use for LC_CTYPE-dependent behavior
352 * nmatch, pmatch --- optional return area for match details
354 * Both pattern and data are given in the database encoding. We internally
355 * convert to array of pg_wchar which is what Spencer's regex package wants.
359 int cflags,
Oid collation,
364 /* Use REG_NOSUB if caller does not want sub-match details */
371 return RE_execute(re, dat, dat_len, nmatch, pmatch);
376 * parse_re_flags - parse the options argument of regexp_match and friends
378 * flags --- output argument, filled with desired options
379 * opts --- TEXT object, or NULL for defaults
381 * This accepts all the options allowed by any of the callers; callers that
382 * don't want some have to reject them after the fact.
387 /* regex flavor is always folded into the compile flags */
397 for (
i = 0;
i < opt_len;
i++)
404 case 'b':
/* BREs (but why???) */
407 case 'c':
/* case sensitive */
408 flags->
cflags &= ~REG_ICASE;
410 case 'e':
/* plain EREs */
414 case 'i':
/* case insensitive */
417 case 'm':
/* Perloid synonym for n */
418 case 'n':
/* \n affects ^ $ . [^ */
421 case 'p':
/* ~Perl, \n affects . [^ */
423 flags->
cflags &= ~REG_NLANCH;
425 case 'q':
/* literal string */
429 case 's':
/* single line, \n ordinary */
430 flags->
cflags &= ~REG_NEWLINE;
432 case 't':
/* tight syntax */
433 flags->
cflags &= ~REG_EXPANDED;
435 case 'w':
/* weird, \n affects ^ $ only */
436 flags->
cflags &= ~REG_NLSTOP;
439 case 'x':
/* expanded syntax */
444 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
445 errmsg(
"invalid regular expression option: \"%.*s\"",
455 * interface routines called by the function manager
516 * routines that use the regexp stuff, but ignore the case.
517 * for this, we use the REG_ICASE flag to pg_regcomp
580 * Return a substring matched by a regular expression.
596 * We pass two regmatch_t structs to get info about the overall match and
597 * the match for the first parenthesized subexpression (if any). If there
598 * is a parenthesized subexpression, we return what it matched; else
599 * return what the whole regexp matched.
608 /* has parenthesized subexpressions, use the first one */
609 so = pmatch[1].rm_so;
610 eo = pmatch[1].rm_eo;
614 /* no parenthesized subexpression, use whole match */
615 so = pmatch[0].rm_so;
616 eo = pmatch[0].rm_eo;
620 * It is possible to have a match to the whole pattern but no match for a
621 * subexpression; for example 'foo(bar)?' is considered to match 'foo' but
622 * there is no subexpression match. So this extra test for match failure is not redundant.
625 if (so < 0 || eo < 0)
635 * textregexreplace_noopt()
636 * Return a string matched by a regular expression, with replacement.
638 * This version doesn't have an option argument: we default to case
639 * sensitive match, replace the first instance only.
655 * Return a string matched by a regular expression, with replacement.
667 * regexp_replace() with four arguments will be preferentially resolved as
668 * this form when the fourth argument is of type UNKNOWN. However, the
669 * user might have intended to call textregexreplace_extended_no_n. If we
670 * see flags that look like an integer, emit the same error that
671 * parse_re_flags would, but add a HINT about how to fix it.
677 if (*opt_p >=
'0' && *opt_p <=
'9')
679 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
680 errmsg(
"invalid regular expression option: \"%.*s\"",
682 errhint(
"If you meant to use regexp_replace() with a start parameter, cast the fourth argument to integer explicitly.")));
689 0, flags.
glob ? 0 : 1));
693 * textregexreplace_extended()
694 * Return a string matched by a regular expression, with replacement.
695 * Extends textregexreplace by allowing a start position and the
696 * choice of the occurrence to replace (0 means all occurrences).
709 /* Collect optional parameters */
715 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
716 errmsg(
"invalid value for parameter \"%s\": %d",
724 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
725 errmsg(
"invalid value for parameter \"%s\": %d",
729 /* Determine options */
732 /* If N was not specified, deduce it from the 'g' flag */
734 n = re_flags.
glob ? 0 : 1;
736 /* Do the replacement(s) */
742/* This is separate to keep the opr_sanity regression test from complaining */
749/* This is separate to keep the opr_sanity regression test from complaining */
757 * similar_to_escape(), similar_escape()
759 * Convert a SQL "SIMILAR TO" regexp pattern to POSIX style, so it can be
760 * used by our regexp engine.
762 * similar_escape_internal() is the common workhorse for three SQL-exposed
763 * functions. esc_text can be passed as NULL to select the default escape
764 * (which is '\'), or as an empty string to select no escape character.
775 bool afterescape =
false;
777 int bracket_depth = 0;
/* square bracket nesting level */
778 int charclass_pos = 0;
/* position inside a character class */
782 if (esc_text == NULL)
784 /* No ESCAPE clause provided; default to backslash as escape */
793 e = NULL;
/* no escape character */
798 if (escape_mblen > 1)
800 (
errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
801 errmsg(
"invalid escape string"),
802 errhint(
"Escape string must be empty or one character.")));
807 * We surround the transformed input string with "^(?:...)$",
809 * which requires some explanation. We need "^" and "$" to force
810 * the pattern to match the entire input string as per the SQL spec.
811 * The "(?:" and ")" are a non-capturing set of parens; we have to have
812 * parens in case the string contains "|", else the "^" and "$" will
813 * be bound into the first and last alternatives which is not what we
814 * want, and the parens must be non-capturing because we don't want them
815 * to count when selecting output for SUBSTRING.
817 * When the pattern is divided into three parts by escape-double-quotes, what we emit is
819 * ^(?:part1){1,1}?(part2){1,1}(?:part3)$
820 * which requires even more explanation. The "{1,1}?" on part1 makes it
821 * non-greedy so that it will match the smallest possible amount of text
822 * not the largest, as required by SQL. The plain parens around part2
823 * are capturing parens so that that part is what controls the result of
824 * SUBSTRING. The "{1,1}" forces part2 to be greedy, so that it matches
825 * the largest possible amount of text; hence part3 must match the
826 * smallest amount of text, as required by SQL. We don't need an explicit
827 * greediness marker on part3. Note that this also confines the effects
828 * of any "|" characters to the respective part, which is what we want.
830 * The SQL spec says that SUBSTRING's pattern must contain exactly two
831 * escape-double-quotes, but we only complain if there's more than two.
832 * With none, we act as though part1 and part3 are empty; with one, we
833 * act as though part3 is empty. Both behaviors fall out of omitting
834 * the relevant part separators in the above expansion. If the result
835 * of this function is used in a plain regexp match (SIMILAR TO), the
836 * escape-double-quotes have no effect on the match behavior.
838 * While we don't fully validate character classes (bracket expressions),
839 * we do need to parse them well enough to know where they end.
840 * "charclass_pos" tracks where we are in a character class.
841 * Its value is uninteresting when bracket_depth is 0.
842 * But when bracket_depth > 0, it will be
843 * 1: right after the opening '[' (a following '^' will negate
844 * the class, while ']' is a literal character)
845 * 2: right after a '^' after the opening '[' (']' is still a literal character)
847 * 3 or more: further inside the character class (']' ends the class)
852 * We need room for the prefix/postfix and part separators, plus as many
853 * as 3 output bytes per input byte; since the input is at most 1GB this
854 * can't overflow size_t.
869 * If both the escape character and the current character from the
870 * pattern are multi-byte, we need to take the slow path.
872 * But if one of them is single-byte, we can process the pattern one
873 * byte at a time, ignoring multi-byte characters. (This works
874 * because all server-encodings have the property that a valid
875 * multi-byte character representation cannot contain the
876 * representation of a valid single-byte character.)
885 /* slow, multi-byte path */
893 else if (
e && elen == mblen && memcmp(
e, p, mblen) == 0)
895 /* SQL escape character; do not send to output */
901 * We know it's a multi-byte character, so we don't need
902 * to do all the comparisons to single-byte characters
919 if (pchar ==
'"' && bracket_depth < 1)
/* escape-double-quote? */
921 /* emit appropriate part separator, per notes above */
933 else if (nquotes == 1)
947 (
errcode(ERRCODE_INVALID_USE_OF_ESCAPE_CHARACTER),
948 errmsg(
"SQL regular expression may not contain more than two escape-double-quote separators")));
954 * We allow any character at all to be escaped; notably, this
955 * allows access to POSIX character-class escapes such as
956 * "\d". The SQL spec is considerably more restrictive.
962 * If we encounter an escaped character in a character class,
963 * we are no longer at the beginning.
969 else if (
e && pchar == *
e)
971 /* SQL escape character; do not send to output */
974 else if (bracket_depth > 0)
976 /* inside a character class */
980 * If we're here, backslash is not the SQL escape character,
981 * so treat it as a literal class element, which requires
982 * doubling it. (This matches our behavior for backslashes
983 * outside character classes.)
989 /* parse the character class well enough to identify ending ']' */
990 if (pchar ==
']' && charclass_pos > 2)
992 /* found the real end of a bracket pair */
994 /* don't reset charclass_pos, this may be an inner bracket */
996 else if (pchar ==
'[')
998 /* start of a nested bracket pair */
1002 * We are no longer at the beginning of a character class.
1003 * (The nested bracket pair is a collating element, not a
1004 * character class in its own right.)
1008 else if (pchar ==
'^')
1011 * A caret right after the opening bracket negates the
1012 * character class. In that case, the following will
1013 * increment charclass_pos from 1 to 2, so that a following
1014 * ']' is still a literal character and does not end the
1015 * character class. If we are further inside a character
1016 * class, charclass_pos might get incremented past 3, which is fine.
1024 * Anything else (including a backslash or leading ']') is an
1025 * element of the character class, so we are no longer at the
1026 * beginning of the class.
1031 else if (pchar ==
'[')
1033 /* start of a character class */
1038 else if (pchar ==
'%')
1043 else if (pchar ==
'_')
1045 else if (pchar ==
'(')
1047 /* convert to non-capturing parenthesis */
1052 else if (pchar ==
'\\' || pchar ==
'.' ||
1053 pchar ==
'^' || pchar ==
'$')
1072 * similar_to_escape(pattern, escape)
1087 * similar_to_escape(pattern)
1088 * Inserts a default escape character.
1102 * similar_escape(pattern, escape)
1104 * Legacy function for compatibility with views stored using the
1105 * pre-v13 expansion of SIMILAR TO. Unlike the above functions, this
1106 * is non-strict, which leads to not-per-spec handling of "ESCAPE NULL".
1115 /* This function is not strict, so must test explicitly */
1121 esc_text = NULL;
/* use default escape character */
1132 * Return the number of matches of a pattern within a string.
1144 /* Collect optional parameters */
1150 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1151 errmsg(
"invalid value for parameter \"%s\": %d",
1155 /* Determine options */
1157 /* User mustn't specify 'g' */
1160 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1161 /* translator: %s is a SQL function name */
1162 errmsg(
"%s does not support the \"global\" option",
1163 "regexp_count()")));
1164 /* But we find all the matches anyway */
1165 re_flags.
glob =
true;
1167 /* Do the matching */
1170 false,
/* can ignore subexprs */
1176/* This is separate to keep the opr_sanity regression test from complaining */
1183/* This is separate to keep the opr_sanity regression test from complaining */
1192 * Return the match's position within the string
1208 /* Collect optional parameters */
1214 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1215 errmsg(
"invalid value for parameter \"%s\": %d",
1223 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1224 errmsg(
"invalid value for parameter \"%s\": %d",
1230 if (endoption != 0 && endoption != 1)
1232 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1233 errmsg(
"invalid value for parameter \"%s\": %d",
1234 "endoption", endoption)));
1241 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1242 errmsg(
"invalid value for parameter \"%s\": %d",
1243 "subexpr", subexpr)));
1246 /* Determine options */
1248 /* User mustn't specify 'g' */
1251 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1252 /* translator: %s is a SQL function name */
1253 errmsg(
"%s does not support the \"global\" option",
1254 "regexp_instr()")));
1255 /* But we find all the matches anyway */
1256 re_flags.
glob =
true;
1258 /* Do the matching */
1261 (subexpr > 0),
/* need submatches? */
1264 /* When n exceeds matches return 0 (includes case of no matches) */
1268 /* When subexpr exceeds number of subexpressions return 0 */
1272 /* Select the appropriate match position to return */
1286/* This is separate to keep the opr_sanity regression test from complaining */
1293/* This is separate to keep the opr_sanity regression test from complaining */
1300/* This is separate to keep the opr_sanity regression test from complaining */
1307/* This is separate to keep the opr_sanity regression test from complaining */
1314/* This is separate to keep the opr_sanity regression test from complaining */
1323 * Test for a pattern match within a string.
1333 /* Determine options */
1335 /* User mustn't specify 'g' */
1338 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1339 /* translator: %s is a SQL function name */
1340 errmsg(
"%s does not support the \"global\" option",
1343 /* Otherwise it's like textregexeq/texticregexeq */
1352/* This is separate to keep the opr_sanity regression test from complaining */
1361 * Return the first substring(s) matching a pattern within a string.
1372 /* Determine options */
1374 /* User mustn't specify 'g' */
1377 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1378 /* translator: %s is a SQL function name */
1379 errmsg(
"%s does not support the \"global\" option",
1381 errhint(
"Use the regexp_matches function instead.")));
1391 /* Create workspace that build_regexp_match_result needs */
1398/* This is separate to keep the opr_sanity regression test from complaining */
1407 * Return a table of all matches of a pattern within a string.
1425 /* Determine options */
1428 /* be sure to copy the input string into the multi-call ctx */
1432 true,
false,
false);
1434 /* Pre-create workspace that build_regexp_match_result needs */
1457/* This is separate to keep the opr_sanity regression test from complaining */
1465 * setup_regexp_matches --- do the initial matching for regexp_match,
1466 * regexp_split, and related functions
1468 * To avoid having to re-find the compiled pattern on each call, we do
1469 * all the matching in one swoop. The returned regexp_matches_ctx contains
1470 * the locations of all the substrings matching the pattern.
1472 * start_search: the character (not byte) offset in orig_str at which to
1473 * begin the search. Returned positions are relative to orig_str anyway.
1474 * use_subpatterns: collect data about matches to parenthesized subexpressions.
1475 * ignore_degenerate: ignore zero-length matches.
1476 * fetching_unmatched: caller wants to fetch unmatched substrings.
1478 * We don't currently assume that fetching_unmatched is exclusive of fetching
1479 * the matched text too; if it's set, the conversion buffer is large enough to
1480 * fetch any single matched or unmatched string, but not any larger
1481 * substring. (In practice, when splitting the matches are usually small
1482 * anyway, and it didn't seem worth complicating the code further.)
1488 bool use_subpatterns,
1489 bool ignore_degenerate,
1490 bool fetching_unmatched)
1504 int prev_valid_match_end;
1505 int maxlen = 0;
/* largest fetch length in characters */
1507 /* save original string --- we'll extract result substrings from it */
1510 /* convert string to pg_wchar form for matching */
1515 /* set up the compiled pattern */
1516 cflags = re_flags->
cflags;
1517 if (!use_subpatterns)
1521 /* do we want to remember subpatterns? */
1522 if (use_subpatterns && cpattern->re_nsub > 0)
1524 matchctx->
npatterns = cpattern->re_nsub;
1525 pmatch_len = cpattern->re_nsub + 1;
1529 use_subpatterns =
false;
1534 /* temporary output space for RE package */
1538 * the real output space (grown dynamically if needed)
1540 * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather than at 2^27
1543 array_len = re_flags->
glob ? 255 : 31;
1547 /* search for the pattern, perhaps repeatedly */
1549 prev_valid_match_end = 0;
1551 pmatch_len, pmatch))
1554 * If requested, ignore degenerate matches, which are zero-length
1555 * matches occurring at the start or end of a string or just after a previous match.
1558 if (!ignore_degenerate ||
1559 (pmatch[0].rm_so < wide_len &&
1560 pmatch[0].rm_eo > prev_match_end))
1562 /* enlarge output space if needed */
1563 while (array_idx + matchctx->
npatterns * 2 + 1 > array_len)
1565 array_len += array_len + 1;
/* 2^n-1 => 2^(n+1)-1 */
1568 (
errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1569 errmsg(
"too many regular expression matches")));
1571 sizeof(
int) * array_len);
1574 /* save this match's locations */
1575 if (use_subpatterns)
1581 int so = pmatch[
i].rm_so;
1582 int eo = pmatch[
i].rm_eo;
1586 if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
1592 int so = pmatch[0].rm_so;
1593 int eo = pmatch[0].rm_eo;
1597 if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
1603 * check length of unmatched portion between end of previous valid
1604 * (nondegenerate, or degenerate but not ignored) match and start of this one
1607 if (fetching_unmatched &&
1608 pmatch[0].rm_so >= 0 &&
1609 (pmatch[0].rm_so - prev_valid_match_end) > maxlen)
1610 maxlen = (pmatch[0].rm_so - prev_valid_match_end);
1611 prev_valid_match_end = pmatch[0].rm_eo;
1613 prev_match_end = pmatch[0].rm_eo;
1615 /* if not glob, stop after one match */
1616 if (!re_flags->
glob)
1620 * Advance search position. Normally we start the next search at the
1621 * end of the previous match; but if the match was of zero length, we
1622 * have to advance by one character, or we'd just find the same match again.
1625 start_search = prev_match_end;
1626 if (pmatch[0].rm_so == pmatch[0].rm_eo)
1628 if (start_search > wide_len)
1633 * check length of unmatched portion between end of last match and end of input string
1636 if (fetching_unmatched &&
1637 (wide_len - prev_valid_match_end) > maxlen)
1638 maxlen = (wide_len - prev_valid_match_end);
1641 * Keep a note of the end position of the string for the benefit of splitting code.
1652 * Make the conversion buffer large enough for any substring of interest.
1655 * Worst case: assume we need the maximum size (maxlen*eml), but take
1656 * advantage of the fact that the original string length in bytes is
1657 * an upper bound on the byte length of any fetched substring (and we
1658 * know that len+1 is safe to allocate because the varlena header is
1659 * longer than 1 byte).
1661 if (maxsiz > orig_len)
1662 conv_bufsiz = orig_len + 1;
1664 conv_bufsiz = maxsiz + 1;
/* safe since maxsiz < 2^30 */
1672 /* No need to keep the wide string if we're in a single-byte charset. */
1679 /* Clean up temp storage */
1686 * build_regexp_match_result - build output array for current match
1693 bool *nulls = matchctx->
nulls;
1699 /* Extract matching substrings from the original string */
1706 if (so < 0 || eo < 0)
1717 Assert(len < matchctx->conv_bufsiz);
1731 /* And form an array */
1734 /* XXX: this hardcodes assumptions about the text type */
1736 TEXTOID, -1,
false, TYPALIGN_INT);
1740 * regexp_split_to_table()
1741 * Split the string at matches of the pattern, returning the
1742 * split-out substrings as a table.
1760 /* Determine options */
1762 /* User mustn't specify 'g' */
1765 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1766 /* translator: %s is a SQL function name */
1767 errmsg(
"%s does not support the \"global\" option",
1768 "regexp_split_to_table()")));
1769 /* But we find all the matches anyway */
1770 re_flags.
glob =
true;
1772 /* be sure to copy the input string into the multi-call ctx */
1796/* This is separate to keep the opr_sanity regression test from complaining */
1804 * regexp_split_to_array()
1805 * Split the string at matches of the pattern, returning the
1806 * split-out substrings as an array.
1815 /* Determine options */
1817 /* User mustn't specify 'g' */
1820 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1821 /* translator: %s is a SQL function name */
1822 errmsg(
"%s does not support the \"global\" option",
1823 "regexp_split_to_array()")));
1824 /* But we find all the matches anyway */
1825 re_flags.
glob =
true;
1846/* This is separate to keep the opr_sanity regression test from complaining */
1854 * build_regexp_split_result - build output string for current match
1856 * We return the string between the current match and the previous one,
1857 * or the string after the last match when next_match == nmatches.
1871 elog(
ERROR,
"invalid match ending position");
1875 elog(
ERROR,
"invalid match starting position");
1884 Assert(len < splitctx->conv_bufsiz);
1898 * Return the substring that matches a regular expression pattern
1915 /* Collect optional parameters */
1921 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1922 errmsg(
"invalid value for parameter \"%s\": %d",
1930 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1931 errmsg(
"invalid value for parameter \"%s\": %d",
1939 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1940 errmsg(
"invalid value for parameter \"%s\": %d",
1941 "subexpr", subexpr)));
1944 /* Determine options */
1946 /* User mustn't specify 'g' */
1949 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1950 /* translator: %s is a SQL function name */
1951 errmsg(
"%s does not support the \"global\" option",
1952 "regexp_substr()")));
1953 /* But we find all the matches anyway */
1954 re_flags.
glob =
true;
1956 /* Do the matching */
1959 (subexpr > 0),
/* need submatches? */
1962 /* When n exceeds matches return NULL (includes case of no matches) */
1966 /* When subexpr exceeds number of subexpressions return NULL */
1970 /* Select the appropriate match position to return */
1978 if (so < 0 || eo < 0)
1987/* This is separate to keep the opr_sanity regression test from complaining */
1994/* This is separate to keep the opr_sanity regression test from complaining */
2001/* This is separate to keep the opr_sanity regression test from complaining */
2008/* This is separate to keep the opr_sanity regression test from complaining */
2016 * regexp_fixed_prefix - extract fixed prefix, if any, for a regexp
2018 * The result is NULL if there is no fixed prefix, else a palloc'd string.
2019 * If it is an exact match, not just a prefix, *exact is returned as true.
2034 *exact =
false;
/* default result */
2038 if (case_insensitive)
2043 /* Examine it to see if there's a fixed prefix */
2052 /* continue with wchar conversion */
2057 /* continue with wchar conversion */
2062 pg_regerror(re_result, re, errMsg,
sizeof(errMsg));
2064 (
errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
2065 errmsg(
"regular expression failed: %s", errMsg)));
2069 /* Convert pg_wchar result back to database encoding */
2071 result = (
char *)
palloc(maxlen);
ArrayBuildState * accumArrayResult(ArrayBuildState *astate, Datum dvalue, bool disnull, Oid element_type, MemoryContext rcontext)
ArrayType * construct_md_array(Datum *elems, bool *nulls, int ndims, int *dims, int *lbs, Oid elmtype, int elmlen, bool elmbyval, char elmalign)
Datum makeArrayResult(ArrayBuildState *astate, MemoryContext rcontext)
int errhint(const char *fmt,...)
int errcode(int sqlerrcode)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
#define PG_GETARG_TEXT_PP(n)
#define PG_GETARG_NAME(n)
#define PG_RETURN_TEXT_P(x)
#define PG_RETURN_INT32(x)
#define PG_GETARG_INT32(n)
#define PG_RETURN_DATUM(x)
#define DirectFunctionCall3(func, arg1, arg2, arg3)
#define PG_GET_COLLATION()
#define PG_GETARG_TEXT_P_COPY(n)
#define PG_RETURN_BOOL(x)
#define SRF_IS_FIRSTCALL()
#define SRF_PERCALL_SETUP()
#define SRF_RETURN_NEXT(_funcctx, _result)
#define SRF_FIRSTCALL_INIT()
#define SRF_RETURN_DONE(_funcctx)
Assert(PointerIsAligned(start, uint64))
if(TABLE==NULL||TABLE_index==NULL)
int pg_mbstrlen_with_len(const char *mbstr, int limit)
int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
int pg_database_encoding_max_length(void)
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
int pg_mblen(const char *mbstr)
void MemoryContextSetParent(MemoryContext context, MemoryContext new_parent)
void * repalloc(void *pointer, Size size)
void pfree(void *pointer)
void * palloc0(Size size)
MemoryContext TopMemoryContext
MemoryContext CurrentMemoryContext
void MemoryContextDelete(MemoryContext context)
void MemoryContextSetIdentifier(MemoryContext context, const char *id)
#define AllocSetContextCreate
#define ALLOCSET_SMALL_SIZES
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
static AmcheckOptions opts
static XLogRecPtr startpos
static Datum PointerGetDatum(const void *X)
static Datum Int32GetDatum(int32 X)
int pg_regcomp(regex_t *re, const chr *string, size_t len, int flags, Oid collation)
size_t pg_regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
int pg_regexec(regex_t *re, const chr *string, size_t len, size_t search_start, rm_detail_t *details, size_t nmatch, regmatch_t pmatch[], int flags)
struct regexp_matches_ctx regexp_matches_ctx
static MemoryContext RegexpCacheMemoryContext
regex_t * RE_compile_and_cache(text *text_re, int cflags, Oid collation)
Datum regexp_match_no_flags(PG_FUNCTION_ARGS)
Datum textregexreplace(PG_FUNCTION_ARGS)
Datum texticregexne(PG_FUNCTION_ARGS)
Datum regexp_substr_no_start(PG_FUNCTION_ARGS)
struct pg_re_flags pg_re_flags
Datum regexp_split_to_array(PG_FUNCTION_ARGS)
Datum texticregexeq(PG_FUNCTION_ARGS)
Datum regexp_substr_no_n(PG_FUNCTION_ARGS)
Datum regexp_instr_no_subexpr(PG_FUNCTION_ARGS)
Datum similar_to_escape_2(PG_FUNCTION_ARGS)
bool RE_compile_and_execute(text *text_re, char *dat, int dat_len, int cflags, Oid collation, int nmatch, regmatch_t *pmatch)
char * regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation, bool *exact)
static bool RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len, int start_search, int nmatch, regmatch_t *pmatch)
Datum regexp_substr(PG_FUNCTION_ARGS)
Datum nameicregexne(PG_FUNCTION_ARGS)
Datum textregexsubstr(PG_FUNCTION_ARGS)
static Datum build_regexp_split_result(regexp_matches_ctx *splitctx)
Datum regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
Datum textregexreplace_extended_no_n(PG_FUNCTION_ARGS)
static regexp_matches_ctx * setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, int start_search, Oid collation, bool use_subpatterns, bool ignore_degenerate, bool fetching_unmatched)
Datum nameregexne(PG_FUNCTION_ARGS)
Datum regexp_instr(PG_FUNCTION_ARGS)
static ArrayType * build_regexp_match_result(regexp_matches_ctx *matchctx)
Datum similar_to_escape_1(PG_FUNCTION_ARGS)
Datum regexp_substr_no_flags(PG_FUNCTION_ARGS)
Datum regexp_matches(PG_FUNCTION_ARGS)
#define PG_GETARG_TEXT_PP_IF_EXISTS(_n)
Datum nameicregexeq(PG_FUNCTION_ARGS)
Datum regexp_matches_no_flags(PG_FUNCTION_ARGS)
Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS)
Datum regexp_match(PG_FUNCTION_ARGS)
Datum textregexreplace_extended(PG_FUNCTION_ARGS)
Datum nameregexeq(PG_FUNCTION_ARGS)
Datum regexp_instr_no_n(PG_FUNCTION_ARGS)
Datum regexp_count_no_start(PG_FUNCTION_ARGS)
struct cached_re_str cached_re_str
static cached_re_str re_array[MAX_CACHED_RES]
static bool RE_execute(regex_t *re, char *dat, int dat_len, int nmatch, regmatch_t *pmatch)
static void parse_re_flags(pg_re_flags *flags, text *opts)
Datum regexp_split_to_table(PG_FUNCTION_ARGS)
Datum textregexreplace_noopt(PG_FUNCTION_ARGS)
Datum regexp_like_no_flags(PG_FUNCTION_ARGS)
Datum regexp_instr_no_flags(PG_FUNCTION_ARGS)
Datum textregexeq(PG_FUNCTION_ARGS)
Datum textregexne(PG_FUNCTION_ARGS)
Datum regexp_count_no_flags(PG_FUNCTION_ARGS)
Datum similar_escape(PG_FUNCTION_ARGS)
Datum regexp_instr_no_start(PG_FUNCTION_ARGS)
Datum regexp_instr_no_endoption(PG_FUNCTION_ARGS)
Datum textregexreplace_extended_no_flags(PG_FUNCTION_ARGS)
Datum regexp_like(PG_FUNCTION_ARGS)
Datum regexp_substr_no_subexpr(PG_FUNCTION_ARGS)
static text * similar_escape_internal(text *pat_text, text *esc_text)
Datum regexp_count(PG_FUNCTION_ARGS)
int pg_regprefix(regex_t *re, chr **string, size_t *slength)
MemoryContext multi_call_memory_ctx
MemoryContext cre_context
static Size VARSIZE_ANY_EXHDR(const void *PTR)
static char * VARDATA(const void *PTR)
static char * VARDATA_ANY(const void *PTR)
static void SET_VARSIZE(void *PTR, Size len)
Datum text_substr(PG_FUNCTION_ARGS)
text * cstring_to_text_with_len(const char *s, int len)
text * replace_text_regexp(text *src_text, text *pattern_text, text *replace_text, int cflags, Oid collation, int search_start, int n)