1/*-------------------------------------------------------------------------
4 * Regular expression matching using trigrams.
6 * The general idea of trigram index support for a regular expression (regex)
7 * search is to transform the regex into a logical expression on trigrams.
10 * (ab|cd)efg => ((abe & bef) | (cde & def)) & efg
12 * If a string matches the regex, then it must match the logical expression on
13 * trigrams. The opposite is not necessarily true, however: a string that
14 * matches the logical expression might not match the original regex. Such
15 * false positives are removed via recheck, by running the regular regex match
16 * operator on the retrieved heap tuple.
18 * Since the trigram expression involves both AND and OR operators, we can't
19 * expect the core index machinery to evaluate it completely. Instead, the
20 * result of regex analysis is a list of trigrams to be sought in the index,
21 * plus a simplified graph that is used by trigramsMatchGraph() to determine
22 * whether a particular indexed value matches the expression.
24 * Converting a regex to a trigram expression is based on analysis of an
25 * automaton corresponding to the regex. The algorithm consists of four
28 * 1) Compile the regexp to NFA form. This is handled by the PostgreSQL
29 * regexp library, which provides accessors for its opaque regex_t struct
30 * to expose the NFA state graph and the "colors" (sets of equivalent
31 * characters) used as state transition labels.
33 * 2) Transform the original NFA into an expanded graph, where arcs
34 * are labeled with trigrams that must be present in order to move from
35 * one state to another via the arcs. The trigrams used in this stage
36 * consist of colors, not characters, as in the original NFA.
38 * 3) Expand the color trigrams into regular trigrams consisting of
39 * characters. If too many distinct trigrams are produced, trigrams are
40 * eliminated and the graph is simplified until it's simple enough.
42 * 4) Finally, the resulting graph is packed into a TrgmPackedGraph struct,
43 * and returned to the caller.
45 * 1) Compile the regexp to NFA form
46 * ---------------------------------
47 * The automaton returned by the regexp compiler is a graph where vertices
48 * are "states" and arcs are labeled with colors. Each color represents
49 * a set of characters, so that all characters assigned to the same color
50 * are interchangeable, so far as matching the regexp is concerned. There
51 * are two special states: "initial" and "final". A state can have multiple
52 * outgoing arcs labeled with the same color, which makes the automaton
53 * non-deterministic, because it can be in many states simultaneously.
55 * Note that this NFA is already lossy compared to the original regexp,
56 * since it ignores some regex features such as lookahead constraints and
57 * backref matching. This is OK for our purposes since it's still the case
58 * that only strings matching the NFA can possibly satisfy the regexp.
60 * 2) Transform the original NFA into an expanded graph
61 * ----------------------------------------------------
62 * In the 2nd stage, the automaton is transformed into a graph based on the
63 * original NFA. Each state in the expanded graph represents a state from
64 * the original NFA, plus a prefix identifying the last two characters
65 * (colors, to be precise) seen before entering the state. There can be
66 * multiple states in the expanded graph for each state in the original NFA,
67 * depending on what characters can precede it. A prefix position can be
68 * "unknown" if it's uncertain what the preceding character was, or "blank"
69 * if the character was a non-word character (we don't need to distinguish
70 * which non-word character it was, so just think of all of them as blanks).
72 * For convenience in description, call an expanded-state identifier
73 * (two prefix colors plus a state number from the original NFA) an
76 * Each arc of the expanded graph is labeled with a trigram that must be
77 * present in the string to match. We can construct this from an out-arc of
78 * the underlying NFA state by combining the expanded state's prefix with the
79 * color label of the underlying out-arc, if neither prefix position is
80 * "unknown". But note that some of the colors in the trigram might be
81 * "blank". This is OK since we want to generate word-boundary trigrams as
82 * the regular trigram machinery would, if we know that some word characters
83 * must be adjacent to a word boundary in all strings matching the NFA.
85 * The expanded graph can also have fewer states than the original NFA,
86 * because we don't bother to make a separate state entry unless the state
87 * is reachable by a valid arc. When an enter key is reachable from a state
88 * of the expanded graph, but we do not know a complete trigram associated
89 * with that transition, we cannot make a valid arc; instead we insert the
90 * enter key into the enterKeys list of the source state. This effectively
91 * means that the two expanded states are not reliably distinguishable based
92 * on examining trigrams.
94 * So the expanded graph resembles the original NFA, but the arcs are
95 * labeled with trigrams instead of individual characters, and there may be
96 * more or fewer states. It is a lossy representation of the original NFA:
97 * any string that matches the original regexp must match the expanded graph,
98 * but the reverse is not true.
100 * We build the expanded graph through a breadth-first traversal of states
101 * reachable from the initial state. At each reachable state, we identify the
102 * states reachable from it without traversing a predictable trigram, and add
103 * those states' enter keys to the current state. Then we generate all
104 * out-arcs leading out of this collection of states that have predictable
105 * trigrams, adding their target states to the queue of states to examine.
107 * When building the graph, if the number of states or arcs exceed pre-defined
108 * limits, we give up and simply mark any states not yet processed as final
109 * states. Roughly speaking, that means that we make use of some portion from
110 * the beginning of the regexp. Also, any colors that have too many member
111 * characters are treated as "unknown", so that we can't derive trigrams
114 * 3) Expand the color trigrams into regular trigrams
115 * --------------------------------------------------
116 * The trigrams in the expanded graph are "color trigrams", consisting
117 * of three consecutive colors that must be present in the string. But for
118 * search, we need regular trigrams consisting of characters. In the 3rd
119 * stage, the color trigrams are expanded into regular trigrams. Since each
120 * color can represent many characters, the total number of regular trigrams
121 * after expansion could be very large. Because searching the index for
122 * thousands of trigrams would be slow, and would likely produce so many
123 * false positives that we would have to traverse a large fraction of the
124 * index, the graph is simplified further in a lossy fashion by removing
125 * color trigrams. When a color trigram is removed, the states connected by
126 * any arcs labeled with that trigram are merged.
128 * Trigrams do not all have equivalent value for searching: some of them are
129 * more frequent and some of them are less frequent. Ideally, we would like
130 * to know the distribution of trigrams, but we don't. But because of padding
131 * we know for sure that the empty character is more frequent than others,
132 * so we can penalize trigrams according to presence of whitespace. The
133 * penalty assigned to each color trigram is the number of simple trigrams
134 * it would produce, times the penalties[] multiplier associated with its
135 * whitespace content. (The penalties[] constants were calculated by analysis
136 * of some real-life text.) We eliminate color trigrams starting with the
137 * highest-penalty one, until we get to a total penalty of no more than
138 * WISH_TRGM_PENALTY. However, we cannot remove a color trigram if that would
139 * lead to merging the initial and final states, so we may not be able to
140 * reach WISH_TRGM_PENALTY. It's still okay so long as we have no more than
141 * MAX_TRGM_COUNT simple trigrams in total, otherwise we fail.
143 * 4) Pack the graph into a compact representation
144 * -----------------------------------------------
145 * The 2nd and 3rd stages might have eliminated or merged many of the states
146 * and trigrams created earlier, so in this final stage, the graph is
147 * compacted and packed into a simpler struct that contains only the
148 * information needed to evaluate it.
152 * Consider the example regex "ab[cd]". This regex is transformed into the
153 * following NFA (for simplicity we show colors as their single members):
163 * We use * to mark initial state and # to mark final state. It's not depicted,
164 * but states 1, 4, 5 have self-referencing arcs for all possible characters,
165 * because this pattern can match to any part of a string.
167 * As the result of stage 2 we will have the following graph:
170 * 2# <---- 1* ----> 3#
172 * The process for generating this graph is:
173 * 1) Create state 1 with enter key (UNKNOWN, UNKNOWN, 1).
174 * 2) Add key (UNKNOWN, "a", 2) to state 1.
175 * 3) Add key ("a", "b", 3) to state 1.
176 * 4) Create new state 2 with enter key ("b", "c", 4). Add an arc
177 * from state 1 to state 2 with label trigram "abc".
178 * 5) Mark state 2 final because state 4 of source NFA is marked as final.
179 * 6) Create new state 3 with enter key ("b", "d", 5). Add an arc
180 * from state 1 to state 3 with label trigram "abd".
181 * 7) Mark state 3 final because state 5 of source NFA is marked as final.
184 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
185 * Portions Copyright (c) 1994, Regents of the University of California
188 * contrib/pg_trgm/trgm_regexp.c
190 *-------------------------------------------------------------------------
194#include "catalog/pg_collation_d.h"
204 * Uncomment (or use -DTRGM_REGEXP_DEBUG) to print debug info,
205 * for exploring and debugging the algorithm implementation.
206 * This produces three graph files in /tmp, in Graphviz .gv format.
207 * Some progress information is also printed to postmaster stderr.
209/* #define TRGM_REGEXP_DEBUG */
212 * These parameters are used to limit the amount of work done.
213 * Otherwise regex processing could be too slow and memory-consuming.
215 * MAX_EXPANDED_STATES - How many states we allow in expanded graph
216 * MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph
217 * MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted
218 * WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties
219 * COLOR_COUNT_LIMIT - Maximum number of characters per color
221 #define MAX_EXPANDED_STATES 128
222 #define MAX_EXPANDED_ARCS 1024
223 #define MAX_TRGM_COUNT 256
224 #define WISH_TRGM_PENALTY 16
225 #define COLOR_COUNT_LIMIT 256
228 * Penalty multipliers for trigram counts depending on whitespace contents.
229 * Numbers based on analysis of real-life texts.
234 0.0f,
/* "a a" (impossible) */
235 0.0f,
/* "a " (impossible) */
239 0.0f
/* " " (impossible) */
242/* Struct representing a single pg_wchar, converted back to multibyte form */
249 * Attributes of NFA colors:
251 * expandable - we know the character expansion of this color
252 * containsNonWord - color contains non-word characters
253 * (which will not be extracted into trigrams)
254 * wordCharsCount - count of word characters in color
255 * wordChars - array of this color's word characters
256 * (which can be extracted into trigrams)
258 * When expandable is false, the other attributes don't matter; we just
259 * assume this color represents unknown character(s).
270 * A "prefix" is information about the colors of the last two characters read
271 * before reaching a specific NFA state. These colors can have special values
272 * COLOR_UNKNOWN and COLOR_BLANK. COLOR_UNKNOWN means that we have no
273 * information, for example because we read some character of an unexpandable
274 * color. COLOR_BLANK means that we read a non-word character.
276 * We call a prefix ambiguous if at least one of its colors is unknown. It's
277 * fully ambiguous if both are unknown, partially ambiguous if only the first
278 * is unknown. (The case of first color known, second unknown is not valid.)
280 * Wholly- or partly-blank prefixes are mostly handled the same as regular
281 * color prefixes. This allows us to generate appropriate partly-blank
282 * trigrams when the NFA requires word character(s) to appear adjacent to
283 * non-word character(s).
287/* We assume that colors returned by the regexp engine cannot be these: */
288 #define COLOR_UNKNOWN (-3)
289 #define COLOR_BLANK (-4)
297 * Color-trigram data type. Note that some elements of the trigram can be
298 * COLOR_BLANK, but we don't allow COLOR_UNKNOWN.
306 * Key identifying a state of our expanded graph: color prefix, and number
307 * of the corresponding state in the underlying regex NFA. The color prefix
308 * shows how we reached the regex state (to the extent that we know it).
317 * One state of the expanded graph.
319 * stateKey - ID of this state
320 * arcs - outgoing arcs of this state (List of TrgmArc)
321 * enterKeys - enter keys reachable from this state without reading any
322 * predictable trigram (List of TrgmStateKey)
324 * snumber - number of this state (initially assigned as -1, -2, etc,
325 * for debugging purposes only; then at the packaging stage,
326 * surviving states are renumbered with positive numbers)
327 * parent - parent state, if this state has been merged into another
328 * tentFlags - flags this state would acquire via planned merges
329 * tentParent - planned parent state, if considering a merge
331 #define TSTATE_INIT 0x01 /* flag indicating this state is initial */
332 #define TSTATE_FIN 0x02 /* flag indicating this state is final */
347 * One arc in the expanded graph.
356 * Information about arc of specific color trigram (used in stage 3)
358 * Contains pointers to the source and target states.
367 * Information about color trigram (used in stage 3)
369 * ctrgm - trigram itself
370 * cnumber - number of this trigram (used in the packaging stage)
371 * count - number of simple trigrams created from this color trigram
372 * expanded - indicates this color trigram is expanded into simple trigrams
373 * arcs - list of all arcs labeled with this color trigram.
386 * Data structure representing all the data we need during regex processing.
388 * regex - compiled regex
389 * colorInfo - extracted information about regex's colors
390 * ncolors - number of colors in colorInfo[]
391 * states - hashtable of TrgmStates (states of expanded graph)
392 * initState - pointer to initial state of expanded graph
393 * queue - queue of to-be-processed TrgmStates
394 * keysQueue - queue of to-be-processed TrgmStateKeys
395 * arcsCount - total number of arcs of expanded graph (for resource
397 * overflowed - we have exceeded resource limit for transformation
398 * colorTrgms - array of all color trigrams present in graph
399 * colorTrgmsCount - count of those color trigrams
400 * totalTrgmCount - total count of extracted simple trigrams
404 /* Source regexp, and color information extracted from it (stage 1) */
409 /* Expanded graph (stage 2) */
414 /* Workspace for stage 2 */
420 /* Information about distinct color trigrams in the graph (stage 3) */
427 * Final, compact representation of expanded graph.
432 int colorTrgm;
/* index of color trigram for transition */
441/* "typedef struct TrgmPackedGraph TrgmPackedGraph" appears in trgm.h */
445 * colorTrigramsCount and colorTrigramGroups contain information about how
446 * trigrams are grouped into color trigrams. "colorTrigramsCount" is the
447 * count of color trigrams and "colorTrigramGroups" contains number of
448 * simple trigrams for each color trigram. The array of simple trigrams
449 * (stored separately from this struct) is ordered so that the simple
450 * trigrams for each color trigram are consecutive, and they're in order
451 * by color trigram number.
457 * The states of the simplified NFA. State number 0 is always initial
458 * state and state number 1 is always final state.
463 /* Temporary work space for trigramsMatchGraph() */
470 * Temporary structure for representing an arc during packaging.
480/* prototypes for private functions */
484 int cflags,
Oid collation);
506#ifdef TRGM_REGEXP_DEBUG
508static void printTrgmNFA(
TrgmNFA *trgmNFA);
515 * Main entry point to process a regular expression.
517 * Returns an array of trigrams required by the regular expression, or NULL if
518 * the regular expression was too complex to analyze. In addition, a packed
519 * graph representation of the regex is returned into *graph. The results
520 * must be allocated in rcontext (which might or might not be the current
533 * This processing generates a great deal of cruft, which we'd like to
534 * clean up before returning (since this function may be called in a
535 * query-lifespan memory context). Make a temp context we can work in so
536 * that cleanup is easy.
539 "createTrgmNFA temporary context",
544 * Stage 1: Compile the regexp into a NFA, using the regexp library.
556 /* Clean up all the cruft we created (including regex) */
564 * Body of createTrgmNFA, exclusive of regex compilation/freeing.
573 trgmNFA.
regex = regex;
575 /* Collect color information from the regex */
578#ifdef TRGM_REGEXP_DEBUG
583 * Stage 2: Create an expanded graph from the source NFA.
587#ifdef TRGM_REGEXP_DEBUG
588 printTrgmNFA(&trgmNFA);
592 * Fail if we were unable to make a nontrivial graph, ie it is possible to
593 * get from the initial state to the final state without reading any
594 * predictable trigram.
600 * Stage 3: Select color trigrams to expand. Fail if too many trigrams.
606 * Stage 4: Expand color trigrams and pack graph into final
613#ifdef TRGM_REGEXP_DEBUG
614 printTrgmPackedGraph(*graph, trg);
621 * Main entry point for evaluating a graph during index scanning.
623 * The check[] array is indexed by trigram number (in the array of simple
624 * trigrams returned by createTrgmNFA), and holds true for those trigrams
625 * that are present in the index entry being checked.
637 * Reset temporary working areas.
644 * Check which color trigrams were matched. A match for any simple
645 * trigram associated with a color trigram counts as a match of the color
653 for (k =
j; k <
j + cnt; k++)
658 * Found one matched trigram in the group. Can skip the rest
659 * of them and go to the next group.
669 * Initialize the statesQueue to hold just the initial state. Note:
670 * statesQueue has room for statesCount entries, which is certainly enough
671 * since no state will be put in the queue more than once. The
672 * statesActive array marks which states have been queued.
679 /* Process queued states as long as there are any. */
680 while (queueIn < queueOut)
684 int cnt =
state->arcsCount;
686 /* Loop over state's out-arcs */
687 for (
i = 0;
i < cnt;
i++)
692 * If corresponding color trigram is present then activate the
693 * corresponding state. We're done if that's the final state,
694 * otherwise queue the state if it's not been queued already.
698 int nextstate =
arc->targetState;
701 return true;
/* success: final state is reachable */
712 /* Queue is empty, so match fails. */
717 * Compile regex string into struct at *regex.
718 * NB: pg_regfree must be applied to regex if this completes successfully.
730 /* Convert pattern string to wide characters */
747 /* re didn't compile (no need for pg_regfree, if so) */
748 pg_regerror(regcomp_result, regex, errMsg,
sizeof(errMsg));
750 (
errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
751 errmsg(
"invalid regular expression: %s", errMsg)));
756/*---------------------
757 * Subroutines for pre-processing the color map (stage 1).
758 *---------------------
762 * Fill TrgmColorInfo structure for each color using regex export functions.
770 trgmNFA->
ncolors = colorsCount;
775 * Loop over colors, filling TrgmColorInfo about each. Note we include
776 * WHITE (0) even though we know it'll be reported as non-expandable.
778 for (
i = 0;
i < colorsCount;
i++)
787 /* Non expandable, or too large to work with */
798 /* Extract all the chars in this color */
803 * Convert characters back to multibyte form, and save only those that
804 * are word characters. Set "containsNonWord" if any non-word
805 * character. (Note: it'd probably be nicer to keep the chars in
806 * pg_wchar format for now, but ISWORDCHR wants to see multibyte.)
808 for (
j = 0;
j < charsCount;
j++)
813 continue;
/* ok to ignore it altogether */
825 * Convert pg_wchar to multibyte format.
826 * Returns false if the character should be ignored completely.
831 /* "s" has enough space for a multibyte character and a trailing NUL */
835 * We can ignore the NUL character, since it can never appear in a PG text
836 * string. This avoids the need for various special cases when
837 * reconstructing trigrams.
842 /* Do the conversion, making sure the result is NUL-terminated */
843 memset(s, 0,
sizeof(s));
847 * In IGNORECASE mode, we can ignore uppercase characters. We assume that
848 * the regex engine generated both uppercase and lowercase equivalents
849 * within each color, since we used the REG_ICASE option; so there's no
850 * need to process the uppercase version.
852 * XXX this code is dependent on the assumption that str_tolower() works
853 * the same as the regex engine's internal case folding machinery. Might
854 * be wiser to expose pg_wc_tolower and test whether c ==
855 * pg_wc_tolower(c). On the other hand, the trigrams in the index were
856 * created using str_tolower(), so we're probably screwed if there's any
857 * incompatibility anyway.
861 char *lowerCased =
str_tolower(s, strlen(s), DEFAULT_COLLATION_OID);
863 if (strcmp(lowerCased, s) != 0)
872 /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */
878/*---------------------
879 * Subroutines for expanding original NFA graph into a trigram graph (stage 2).
880 *---------------------
884 * Transform the graph, given a regex and extracted color information.
886 * We create and process a queue of expanded-graph states until all the states
889 * This algorithm may be stopped due to resource limitation. In this case we
890 * force every unprocessed branch to immediately finish with matching (this
891 * can give us false positives but no false negatives) by marking all
892 * unprocessed states as final.
902 /* Initialize this stage's workspace in trgmNFA struct */
908 /* Create hashtable for states */
918 /* Create initial state: ambiguous prefix, NFA's initial state */
919 MemSet(&initkey, 0,
sizeof(initkey));
924 initstate =
getState(trgmNFA, &initkey);
929 * Recursively build the expanded graph by processing queue of states
930 * (breadth-first search). getState already put initstate in the queue.
931 * Note that getState will append new states to the queue within the loop,
932 * too; this works as long as we don't do repeat fetches using the "lc"
935 foreach(lc, trgmNFA->
queue)
940 * If we overflowed then just mark state as final. Otherwise do
948 /* Did we overflow? */
956 * Process one state: add enter keys and then add outgoing arcs.
963 /* keysQueue should be NIL already, but make sure */
967 * Add state's own key, and then process all keys added to keysQueue until
968 * queue is finished. But we can quit if the state gets marked final.
980 /* Release keysQueue to clean up for next cycle */
985 * Add outgoing arcs only if state isn't final (we have no interest in
986 * outgoing arcs if we already match)
993 * Add the given enter key into the state's enterKeys list, and determine
994 * whether this should result in any further enter keys being added.
995 * If so, add those keys to keysQueue so that processState will handle them.
997 * If the enter key is for the NFA's final state, mark state as TSTATE_FIN.
998 * This situation means that we can reach the final state from this expanded
999 * state without reading any predictable trigram, so we must consider this
1000 * state as an accepting one.
1002 * The given key could be a duplicate of one already in enterKeys, or be
1003 * redundant with some enterKeys. So we check that before doing anything.
1005 * Note that we don't generate any actual arcs here. addArcs will do that
1006 * later, after we have identified all the enter keys for this state.
1018 * Ensure any pad bytes in destKey are zero, since it may get used as a
1019 * hashtable key by getState.
1021 MemSet(&destKey, 0,
sizeof(destKey));
1024 * Compare key to each existing enter key of the state to check for
1025 * redundancy. We can drop either old key(s) or the new key if we find
1028 foreach(cell,
state->enterKeys)
1036 /* This old key already covers the new key. Nothing to do */
1042 * The new key covers this old key. Remove the old key, it's
1043 * no longer needed once we add this key to the list.
1051 /* No redundancy, so add this key to the state's list */
1054 /* If state is now known final, mark it and we're done */
1062 * Loop through all outgoing arcs of the corresponding state in the
1069 for (
i = 0;
i < arcsCount;
i++)
1076 * Start of line/string (^). Trigram extraction treats start of
1077 * line same as start of word: double space prefix is added.
1078 * Hence, make an enter key showing we can reach the arc
1079 * destination with all-blank prefix.
1085 /* Add enter key to this state */
1091 * End of line/string ($). We must consider this arc as a
1092 * transition that doesn't read anything. The reason for adding
1093 * this enter key to the state is that if the arc leads to the
1094 * NFA's final state, we must mark this expanded state as final.
1100 /* Add enter key to this state */
1103 else if (
arc->
co >= 0)
1105 /* Regular color (including WHITE) */
1114 * We can reach the arc destination after reading a
1115 * non-word character, but the prefix is not something
1116 * that addArc will accept with COLOR_BLANK, so no trigram
1117 * arc can get made for this transition. We must make an
1118 * enter key to show that the arc destination is
1119 * reachable. Set it up with an all-blank prefix, since
1120 * that corresponds to what the trigram extraction code
1121 * will do at a word starting boundary.
1133 * We can reach the arc destination after reading a word
1134 * character, but the prefix is not something that addArc
1135 * will accept, so no trigram arc can get made for this
1136 * transition. We must make an enter key to show that the
1137 * arc destination is reachable. The prefix for the enter
1138 * key should reflect the info we have for this arc.
1149 * Unexpandable color. Add enter key with ambiguous prefix,
1150 * showing we can reach the destination from this state, but
1151 * the preceding colors will be uncertain. (We do not set the
1152 * first prefix color to key->prefix.colors[1], because a
1153 * prefix of known followed by unknown is invalid.)
1163 /* RAINBOW: treat as unexpandable color */
1175 * Add copy of given key to keysQueue for later processing.
1187 * Add outgoing arcs from given state, whose enter keys are all now known.
1199 * Ensure any pad bytes in destKey are zero, since it may get used as a
1200 * hashtable key by getState.
1202 MemSet(&destKey, 0,
sizeof(destKey));
1205 * Iterate over enter keys associated with this expanded-graph state. This
1206 * includes both the state's own stateKey, and any enter keys we added to
1207 * it during addKey (which represent expanded-graph states that are not
1208 * distinguishable from this one by means of trigrams). For each such
1209 * enter key, examine all the out-arcs of the key's underlying NFA state,
1210 * and try to make a trigram arc leading to where the out-arc leads.
1211 * (addArc will deal with whether the arc is valid or not.)
1213 foreach(cell,
state->enterKeys)
1221 for (
i = 0;
i < arcsCount;
i++)
1227 * Ignore non-expandable colors; addKey already handled the case.
1229 * We need no special check for WHITE or begin/end pseudocolors
1230 * here. We don't need to do any processing for them, and they
1231 * will be marked non-expandable since the regex engine will have
1232 * reported them that way. We do have to watch out for RAINBOW,
1233 * which has a negative color number.
1246 * Color includes non-word character(s).
1248 * Generate an arc, treating this transition as occurring on
1249 * BLANK. This allows word-ending trigrams to be manufactured
1262 * Color includes word character(s).
1264 * Generate an arc. Color is pushed into prefix of target
1280 * Generate an out-arc of the expanded graph, if it's valid and not redundant.
1282 * state: expanded-graph state we want to add an out-arc to
1283 * key: provides prefix colors (key->nstate is not used)
1284 * co: transition color
1285 * destKey: identifier for destination state of expanded graph
1294 /* Do nothing if this wouldn't be a valid arc label trigram */
1299 * Check if we are going to reach key which is covered by a key which is
1300 * already listed in this state. If so arc is useless: the NFA can bypass
1301 * it through a path that doesn't require any predictable trigram, so
1302 * whether the arc's trigram is present or not doesn't really matter.
1304 foreach(cell,
state->enterKeys)
1313 /* Checks were successful, add new arc */
1316 arc->ctrgm.colors[0] =
key->prefix.colors[0];
1317 arc->ctrgm.colors[1] =
key->prefix.colors[1];
1318 arc->ctrgm.colors[2] = co;
1325 * Can we make a valid trigram arc label from the given prefix and arc color?
1327 * This is split out so that tests in addKey and addArc will stay in sync.
1333 * We have to know full trigram in order to add outgoing arc. So we can't
1334 * do it if prefix is ambiguous.
1339 /* If key->prefix.colors[0] isn't unknown, its second color isn't either */
1341 /* And we should not be called with an unknown arc color anytime */
1345 * We don't bother with making arcs representing three non-word
1346 * characters, since that's useless for trigram extraction.
1354 * We also reject nonblank-blank-anything. The nonblank-blank-nonblank
1355 * case doesn't correspond to any trigram the trigram extraction code
1356 * would make. The nonblank-blank-blank case is also not possible with
1357 * RPADDING = 1. (Note that in many cases we'd fail to generate such a
1358 * trigram even if it were valid, for example processing "foo bar" will
1359 * not result in considering the trigram "o ". So if you want to support
1360 * RPADDING = 2, there's more to do than just twiddle this test.)
1367 * Other combinations involving blank are valid, in particular we assume
1368 * blank-blank-nonblank is valid, which presumes that LPADDING is 2.
1370 * Note: Using again the example "foo bar", we will not consider the
1371 * trigram " b", though this trigram would be found by the trigram
1372 * extraction code. Since we will find " ba", it doesn't seem worth
1373 * trying to hack the algorithm to generate the additional trigram.
1376 /* arc label is valid */
1381 * Get state of expanded graph for given state key,
1382 * and queue the state for processing if it didn't already exist.
1394 /* New state: initialize and queue it */
1398 /* states are initially given negative numbers */
1400 state->parent = NULL;
1401 state->tentFlags = 0;
1402 state->tentParent = NULL;
1410 * Check if prefix1 "contains" prefix2.
1412 * "contains" means that any exact prefix (with no ambiguity) that satisfies
1413 * prefix2 also satisfies prefix1.
1420 /* Fully ambiguous prefix contains everything */
1426 * Prefix with only first unknown color contains every prefix with
1427 * same second color.
1436 /* Exact prefix contains only the exact same prefix */
1446/*---------------------
1447 * Subroutines for expanding color trigrams into regular trigrams (stage 3).
1448 *---------------------
1452 * Get vector of all color trigrams in graph and select which of them
1453 * to expand into simple trigrams.
1455 * Returns true if OK, false if exhausted resource limits.
1465 int64 totalTrgmCount;
1469 /* Collect color trigrams from all arcs */
1479 foreach(cell,
state->arcs)
1489 /* count and penalty will be set below */
1497 /* Remove duplicates, merging their arcs lists */
1503 /* Sort trigrams to ease duplicate detection */
1506 /* p1 is probe point, p2 is last known non-duplicate. */
1508 for (p1 = colorTrgms + 1; p1 < colorTrgms + arcsCount; p1++)
1528 * Count number of simple trigrams generated by each color trigram, and
1529 * also compute a penalty value, which is the number of simple trigrams
1530 * times a multiplier that depends on its whitespace content.
1532 * Note: per-color-trigram counts cannot overflow an int so long as
1533 * COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about
1534 * 1290. However, the grand total totalTrgmCount might conceivably
1535 * overflow an int, so we use int64 for that within this routine. Also,
1536 * penalties are calculated in float4 arithmetic to avoid any overflow
1540 totalTrgmPenalty = 0.0f;
1548 for (
j = 0;
j < 3;
j++)
1558 trgmInfo->
count = count;
1559 totalTrgmCount += count;
1561 totalTrgmPenalty += trgmInfo->
penalty;
1564 /* Sort color trigrams in descending order of their penalties */
1569 * Remove color trigrams from the graph so long as total penalty of color
1570 * trigrams exceeds WISH_TRGM_PENALTY. (If we fail to get down to
1571 * WISH_TRGM_PENALTY, it's OK so long as total count is no more than
1572 * MAX_TRGM_COUNT.) We prefer to remove color trigrams with higher
1573 * penalty, since those are the most promising for reducing the total
1574 * penalty. When removing a color trigram we have to merge states
1575 * connected by arcs labeled with that trigram. It's necessary to not
1576 * merge initial and final states, because our graph becomes useless if
1577 * that happens; so we cannot always remove the trigram we'd prefer to.
1582 bool canRemove =
true;
1585 /* Done if we've reached the target */
1589#ifdef TRGM_REGEXP_DEBUG
1590 fprintf(stderr,
"considering ctrgm %d %d %d, penalty %f, %d arcs\n",
1599 * Does any arc of this color trigram connect initial and final
1600 * states? If so we can't remove it.
1602 foreach(cell, trgmInfo->
arcs)
1606 *target = arcInfo->
target;
1610#ifdef TRGM_REGEXP_DEBUG
1611 fprintf(stderr,
"examining arc to s%d (%x) from s%d (%x)\n",
1612 -target->snumber, target->flags,
1616 /* examine parent states, if any merging has already happened */
1619 while (target->parent)
1620 target = target->parent;
1622#ifdef TRGM_REGEXP_DEBUG
1623 fprintf(stderr,
" ... after completed merges: to s%d (%x) from s%d (%x)\n",
1624 -target->snumber, target->flags,
1628 /* we must also consider merges we are planning right now */
1630 while (
source->tentParent)
1635 target_flags = target->flags | target->tentFlags;
1636 while (target->tentParent)
1638 target = target->tentParent;
1639 target_flags |= target->flags | target->tentFlags;
1642#ifdef TRGM_REGEXP_DEBUG
1643 fprintf(stderr,
" ... after tentative merges: to s%d (%x) from s%d (%x)\n",
1644 -target->snumber, target_flags,
1645 -
source->snumber, source_flags);
1648 /* would fully-merged state have both INIT and FIN set? */
1656 /* ok so far, so remember planned merge */
1659#ifdef TRGM_REGEXP_DEBUG
1660 fprintf(stderr,
" ... tentatively merging s%d into s%d\n",
1661 -target->snumber, -
source->snumber);
1663 target->tentParent =
source;
1664 source->tentFlags |= target_flags;
1669 * We must reset all the tentFlags/tentParent fields before
1670 * continuing. tentFlags could only have become set in states that
1671 * are the source or parent or tentative parent of one of the current
1672 * arcs; likewise tentParent could only have become set in states that
1673 * are the target or parent or tentative parent of one of the current
1674 * arcs. There might be some overlap between those sets, but if we
1675 * clear tentFlags in target states as well as source states, we
1676 * should be okay even if we visit a state as target before visiting
1679 foreach(cell, trgmInfo->
arcs)
1683 *target = arcInfo->
target;
1686 /* no need to touch previously-merged states */
1689 while (target->parent)
1690 target = target->parent;
1698 while ((ttarget = target->
tentParent) != NULL)
1700 target->tentParent = NULL;
1701 target->tentFlags = 0;
/* in case it was also a source */
1706 /* Now, move on if we can't drop this trigram */
1709#ifdef TRGM_REGEXP_DEBUG
1710 fprintf(stderr,
" ... not ok to merge\n");
1715 /* OK, merge states linked by each arc labeled by the trigram */
1716 foreach(cell, trgmInfo->
arcs)
1720 *target = arcInfo->
target;
1724 while (target->parent)
1725 target = target->parent;
1728#ifdef TRGM_REGEXP_DEBUG
1729 fprintf(stderr,
"merging s%d into s%d\n",
1730 -target->snumber, -
source->snumber);
1733 /* Assert we didn't merge initial and final states */
1739 /* Mark trigram unexpanded, and update totals */
1741 totalTrgmCount -= trgmInfo->
count;
1742 totalTrgmPenalty -= trgmInfo->
penalty;
1745 /* Did we succeed in fitting into MAX_TRGM_COUNT? */
1752 * Sort color trigrams by colors (will be useful for bsearch in packGraph)
1753 * and enumerate the color trigrams that are expanded.
1760 if (colorTrgms[
i].expanded)
1771 * Expand selected color trigrams into regular trigrams.
1773 * Returns the TRGM array to be passed to the index machinery.
1774 * The array must be allocated in rcontext.
1785 /* Set up "blank" color structure containing a single zero character */
1786 memset(blankChar.
bytes, 0,
sizeof(blankChar.
bytes));
1790 /* Construct the trgm array */
1808 /* Ignore any unexpanded trigrams ... */
1812 /* Get colors, substituting the dummy struct for COLOR_BLANK */
1813 for (
j = 0;
j < 3;
j++)
1821 /* Iterate over all possible combinations of colors' characters */
1822 for (i1 = 0; i1 <
c[0]->wordCharsCount; i1++)
1824 s[0] =
c[0]->wordChars[i1];
1825 for (i2 = 0; i2 <
c[1]->wordCharsCount; i2++)
1827 s[1] =
c[1]->wordChars[i2];
1828 for (i3 = 0; i3 <
c[2]->wordCharsCount; i3++)
1830 s[2] =
c[2]->wordChars[i3];
1842 * Convert trigram into trgm datatype.
1852 /* Write multibyte string into "str" (we don't need null termination) */
1855 for (
i = 0;
i < 3;
i++)
1857 if (s[
i].bytes[0] != 0)
1860 *p++ = s[
i].bytes[
j];
1864 /* Emit a space in place of COLOR_BLANK */
1869 /* Convert "str" to a standard trigram (possibly hashing it) */
1874 * Merge two states of graph.
1879 Assert(state1 != state2);
1883 /* state1 absorbs state2's flags */
1886 /* state2, and indirectly all its children, become children of state1 */
1891 * Compare function for sorting of color trigrams by their colors.
1903 * Compare function for sorting color trigrams in descending order of
1904 * their penalty fields.
1912 if (penalty1 < penalty2)
1914 else if (penalty1 == penalty2)
1921/*---------------------
1922 * Subroutines for packing the graph into final representation (stage 4).
1923 *---------------------
1927 * Pack expanded graph into final representation.
1929 * The result data must be allocated in rcontext.
1945 /* Enumerate surviving states, giving init and fin reserved numbers */
1949 while (
state->parent)
1952 if (
state->snumber < 0)
1960 state->snumber = snumber;
1966 /* Collect array of all arcs */
1979 foreach(cell,
state->arcs)
2007 /* Sort arcs to ease duplicate detection */
2010 /* We could have duplicates because states were merged. Remove them. */
2013 /* p1 is probe point, p2 is last known non-duplicate. */
2018 for (p1 = arcs + 1; p1 < arcs + arcIndex; p1++)
2026 arcsCount = (p2 - arcs) + 1;
2029 arcsCount = arcIndex;
2031 /* Create packed representation */
2035 /* Pack color trigrams information */
2054 /* Pack states and arcs information */
2061 for (
i = 0;
i < snumber;
i++)
2066 while (
j < arcsCount && arcs[
j].sourceState ==
i)
2076 /* Allocate working memory for trigramsMatchGraph() */
2088 * Comparison function for sorting TrgmPackArcInfos.
2090 * Compares arcs in following order: sourceState, colorTrgm, targetState.
2114/*---------------------
2115 * Debugging functions
2117 * These are designed to emit GraphViz files.
2118 *---------------------
2121#ifdef TRGM_REGEXP_DEBUG
2124 * Print initial NFA, in regexp library's representation
2153 for (
i = 0;
i < arcsCount;
i++)
2156 state, arcs[
i].to, arcs[
i].co);
2170 for (
i = 0;
i < ncolors;
i++)
2176 if (
color->expandable)
2178 for (
j = 0;
j <
color->wordCharsCount;
j++)
2197 /* dot -Tpng -o /tmp/source.png < /tmp/source.gv */
2198 FILE *fp = fopen(
"/tmp/source.gv",
"w");
2208 * Print expanded graph.
2235 foreach(cell,
state->arcs)
2240 -
state->snumber, -
arc->target->snumber);
2241 printTrgmColor(&
buf,
arc->ctrgm.colors[0]);
2243 printTrgmColor(&
buf,
arc->ctrgm.colors[1]);
2245 printTrgmColor(&
buf,
arc->ctrgm.colors[2]);
2259 /* dot -Tpng -o /tmp/transformed.png < /tmp/transformed.gv */
2260 FILE *fp = fopen(
"/tmp/transformed.gv",
"w");
2270 * Print a TrgmColor readably.
2284 * Print final packed representation of trigram-based expanded graph.
2308 for (
j = 0;
j <
state->arcsCount;
j++)
2313 i,
arc->targetState,
arc->colorTrgm);
2320 /* Print trigrams */
2332 for (
j = 0;
j < count;
j++)
2338 * XXX This representation is nice only for all-ASCII trigrams.
2350 /* dot -Tpng -o /tmp/packed.png < /tmp/packed.gv */
2351 FILE *fp = fopen(
"/tmp/packed.gv",
"w");
2360#endif /* TRGM_REGEXP_DEBUG */
#define MemSet(start, val, len)
#define fprintf(file, fmt, msg)
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
HTAB * hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
void * hash_seq_search(HASH_SEQ_STATUS *status)
int64 hash_get_num_entries(HTAB *hashp)
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
int errcode(int sqlerrcode)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
Assert(PointerIsAligned(start, uint64))
static const FormData_pg_attribute a1
static const FormData_pg_attribute a2
#define CALCGTSIZE(flag, siglen)
List * lappend(List *list, void *datum)
List * list_concat(List *list1, const List *list2)
void list_free(List *list)
int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
void * MemoryContextAlloc(MemoryContext context, Size size)
void * MemoryContextAllocZero(MemoryContext context, Size size)
void pfree(void *pointer)
void * palloc0(Size size)
MemoryContext CurrentMemoryContext
void MemoryContextDelete(MemoryContext context)
#define AllocSetContextCreate
#define ALLOCSET_DEFAULT_SIZES
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
static int list_length(const List *l)
#define foreach_delete_current(lst, var_or_cell)
static rewind_source * source
#define MAX_MULTIBYTE_CHAR_LEN
#define qsort(a, b, c, d)
int pg_regcomp(regex_t *re, const chr *string, size_t len, int flags, Oid collation)
size_t pg_regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
int pg_reg_getnumoutarcs(const regex_t *regex, int st)
int pg_reg_getnumcolors(const regex_t *regex)
int pg_reg_getnumstates(const regex_t *regex)
int pg_reg_getfinalstate(const regex_t *regex)
int pg_reg_getinitialstate(const regex_t *regex)
void pg_reg_getoutarcs(const regex_t *regex, int st, regex_arc_t *arcs, int arcs_len)
int pg_reg_colorisend(const regex_t *regex, int co)
void pg_reg_getcharacters(const regex_t *regex, int co, pg_wchar *chars, int chars_len)
int pg_reg_getnumcharacters(const regex_t *regex, int co)
int pg_reg_colorisbegin(const regex_t *regex, int co)
void appendStringInfo(StringInfo str, const char *fmt,...)
void appendStringInfoString(StringInfo str, const char *s)
void appendStringInfoChar(StringInfo str, char ch)
void initStringInfo(StringInfo str)
ColorTrgmInfo * colorTrgms
TrgmColorInfo * colorInfo
bool * colorTrigramsActive
struct TrgmState * parent
struct TrgmState * tentParent
char bytes[MAX_MULTIBYTE_CHAR_LEN]
void compact_trigram(trgm *tptr, char *str, int bytelen)
static bool convertPgWchar(pg_wchar c, trgm_mb_char *result)
static bool validArcLabel(TrgmStateKey *key, TrgmColor co)
static TrgmState * getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
static TRGM * createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph, MemoryContext rcontext)
static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key, TrgmColor co, TrgmStateKey *destKey)
#define MAX_EXPANDED_STATES
static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
#define MAX_EXPANDED_ARCS
struct TrgmState TrgmState
static bool selectColorTrigrams(TrgmNFA *trgmNFA)
static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3])
static TRGM * expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext)
TRGM * createTrgmNFA(text *text_re, Oid collation, TrgmPackedGraph **graph, MemoryContext rcontext)
static void transformGraph(TrgmNFA *trgmNFA)
bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check)
static const float4 penalties[8]
static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key)
static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
static TrgmPackedGraph * packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
static int packArcInfoCmp(const void *a1, const void *a2)
static void processState(TrgmNFA *trgmNFA, TrgmState *state)
#define COLOR_COUNT_LIMIT
#define WISH_TRGM_PENALTY
static void mergeStates(TrgmState *state1, TrgmState *state2)
static int colorTrgmInfoCmp(const void *p1, const void *p2)
static void RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)
static void addArcs(TrgmNFA *trgmNFA, TrgmState *state)
static bool prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2)
static Size VARSIZE_ANY_EXHDR(const void *PTR)
static char * VARDATA_ANY(const void *PTR)
static void SET_VARSIZE(void *PTR, Size len)
static char chars[TZ_MAX_CHARS]