1/*-------------------------------------------------------------------------
3 * Normalize a Unicode string
5 * This implements Unicode normalization, per the documentation at
6 * https://www.unicode.org/reports/tr15/.
8 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
11 * src/common/unicode_norm.c
13 *-------------------------------------------------------------------------
31 #define ALLOC(size) palloc(size)
32 #define FREE(size) pfree(size)
34#define ALLOC(size) malloc(size)
35#define FREE(size) free(size)
38/* Constants for calculations with Hangul characters */
39 #define SBASE 0xAC00 /* U+AC00 */
40 #define LBASE 0x1100 /* U+1100 */
41 #define VBASE 0x1161 /* U+1161 */
42 #define TBASE 0x11A7 /* U+11A7 */
46 #define NCOUNT VCOUNT * TCOUNT
47 #define SCOUNT LCOUNT * NCOUNT
50/* comparison routine for bsearch() of decomposition lookup table. */
52conv_compare(
const void *p1,
const void *p2)
59 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
67 * Get the entry corresponding to code in the decomposition lookup table.
68 * The backend version of this code uses a perfect hash function for the
69 * lookup, while the frontend version uses a binary search.
80 * Compute the hash function. The hash key is the codepoint with the bytes
84 h = decompinfo.
hash(&hashkey);
86 /* An out-of-range result implies no match */
91 * Since it's a perfect hash, we need only match to the specific codepoint
100 return bsearch(&(code),
109 * Get the combining class of the given codepoint.
117 * If no entries are found, the character used is either an Hangul
118 * character or a character with a class of 0 and no decompositions.
127 * Given a decomposition entry looked up earlier, get the decomposed
130 * Note: the returned pointer can point to statically allocated buffer, and
131 * is only valid until next call to this function!
153 * Calculate how many characters a given character will decompose to.
155 * This needs to recurse, if the character decomposes into characters that
156 * are, in turn, decomposable.
168 * Fast path for Hangul characters not stored in tables to save memory as
169 * decomposition is algorithmic. See
170 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
178 sindex = code -
SBASE;
189 * Just count current code if no other decompositions. A NULL entry is
190 * equivalent to a character with class 0 and no decompositions.
197 * If this entry has other decomposition codes look at them as well. First
198 * get its decomposition in the list of tables available.
201 for (
i = 0;
i < dec_size;
i++)
212 * Recompose a set of characters. For hangul characters, the calculation
213 * is algorithmic. For others, an inverse lookup at the decomposition
214 * table is necessary. Returns true if a recomposition can be done, and
221 * Handle Hangul characters algorithmically, per the Unicode spec.
223 * Check if two current characters are L and V.
228 /* make syllable of form LV */
235 /* Check if two current characters are LV and T */
240 /* make syllable of form LVT */
243 *result =
start + tindex;
251 * Do an inverse lookup of the decomposition tables to see if anything
252 * matches. The comparison just needs to be a perfect match on the
253 * sub-table of size two, because the start character has already been
254 * recomposed partially. This lookup uses a perfect hash function for
265 * Compute the hash function. The hash key is formed by concatenating
266 * bytes of the two codepoints in network order. See also
267 * src/common/unicode/generate-unicode_norm_table.pl.
270 h = recompinfo.
hash(&hashkey);
272 /* An out-of-range result implies no match */
307#endif /* !FRONTEND */
314 * Decompose the given code into the array given by caller. The
315 * decomposition begins at the position given by caller, saving one
316 * lookup on the decomposition table. The current position needs to be
317 * updated here to let the caller know from where to continue filling
318 * in the array result.
329 * Fast path for Hangul characters not stored in tables to save memory as
330 * decomposition is algorithmic. See
331 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
342 sindex = code -
SBASE;
354 res[*current] =
TBASE + tindex;
364 * Just fill in with the current decomposition if there are no
365 * decomposition codes to recurse to. A NULL entry is equivalent to a
366 * character with class 0 and no decompositions, so just leave also in
374 res[*current] = code;
380 * If this entry has other decomposition codes look at them as well.
383 for (
i = 0;
i < dec_size;
i++)
387 /* Leave if no more decompositions */
393 * unicode_normalize - Normalize a Unicode string to the specified form.
395 * The input is a 0-terminated array of codepoints.
397 * In frontend, returns a 0-terminated array of codepoints, allocated with
398 * malloc. Or NULL if we run out of memory. In backend, the returned
399 * string is palloc'd instead, and OOM is reported with ereport().
413 /* variables for recomposition */
419 /* First, do character decomposition */
422 * Calculate how many characters long the decomposed version will be.
425 for (p =
input; *p; p++)
429 if (decomp_chars == NULL)
433 * Now fill in each entry recursively. This needs a second pass on the
434 * decomposition table.
437 for (p =
input; *p; p++)
439 decomp_chars[decomp_size] =
'0円';
442 /* Leave if there is nothing to decompose */
443 if (decomp_size == 0)
447 * Now apply canonical ordering.
449 for (count = 1; count < decomp_size; count++)
451 pg_wchar prev = decomp_chars[count - 1];
458 * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
459 * annex 4, a sequence of two adjacent characters in a string is an
460 * exchangeable pair if the combining class (from the Unicode
461 * Character Database) for the first character is greater than the
462 * combining class for the second, and the second is not a starter. A
463 * character is a starter if its combining class is 0.
465 if (prevClass == 0 || nextClass == 0)
468 if (prevClass <= nextClass)
471 /* exchange can happen */
472 tmp = decomp_chars[count - 1];
473 decomp_chars[count - 1] = decomp_chars[count];
474 decomp_chars[count] = tmp;
476 /* backtrack to check again */
485 * The last phase of NFC and NFKC is the recomposition of the reordered
486 * Unicode string using combining classes. The recomposed string cannot be
487 * longer than the decomposed one, so make the allocation of the output
488 * string based on that assumption.
497 last_class = -1;
/* this eliminates a special check */
500 starter_ch = recomp_chars[0] = decomp_chars[0];
502 for (count = 1; count < decomp_size; count++)
508 if (last_class < ch_class &&
511 recomp_chars[starter_pos] = composite;
512 starter_ch = composite;
514 else if (ch_class == 0)
516 starter_pos = target_pos;
519 recomp_chars[target_pos++] = ch;
523 last_class = ch_class;
524 recomp_chars[target_pos++] = ch;
527 recomp_chars[target_pos] = (
pg_wchar)
'0円';
535 * Normalization "quick check" algorithm; see
536 * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
539/* We only need this in the backend. */
549 * Compute the hash function. The hash key is the codepoint with the bytes
553 h = norminfo->
hash(&hashkey);
555 /* An out-of-range result implies no match */
560 * Since it's a perfect hash, we need only match to the specific codepoint
571 * Look up the normalization quick check character property
600 uint8 lastCanonicalClass = 0;
604 * For the "D" forms, we don't run the quickcheck. We don't include the
605 * lookup tables for those because they are huge, checking for these
606 * particular forms is less common, and running the slow path is faster
607 * for the "D" forms than the "C" forms because you don't need to
608 * recompose, which is slow.
616 uint8 canonicalClass;
620 if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
629 lastCanonicalClass = canonicalClass;
634#endif /* !FRONTEND */
Assert(PointerIsAligned(start, uint64))
static int64 current_size
const pg_unicode_decomposition * decomps
const pg_unicode_normprops * normprops
const uint16 * inverse_lookup
static const pg_unicode_decomposition * get_code_entry(pg_wchar code)
static void decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
static const pg_wchar * get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
static uint8 get_canonical_class(pg_wchar code)
static UnicodeNormalizationQC qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
pg_wchar * unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
static const pg_unicode_normprops * qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
static bool recompose_code(uint32 start, uint32 code, uint32 *result)
static int get_decomposed_size(pg_wchar code, bool compat)
static const pg_unicode_decompinfo UnicodeDecompInfo
static const pg_unicode_recompinfo UnicodeRecompInfo
#define DECOMPOSITION_NO_COMPOSE(x)
static const uint32 UnicodeDecomp_codepoints[5138]
#define DECOMPOSITION_IS_INLINE(x)
static const pg_unicode_decomposition UnicodeDecompMain[6843]
#define DECOMPOSITION_IS_COMPAT(x)
#define DECOMPOSITION_SIZE(x)
static const pg_unicode_norminfo UnicodeNormInfo_NFKC_QC
static const pg_unicode_norminfo UnicodeNormInfo_NFC_QC