1/*-------------------------------------------------------------------------
3 * Unicode case mapping and case conversion.
5 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
8 * src/common/unicode_case.c
10 *-------------------------------------------------------------------------
31 * Map for each case kind.
42static size_t convert_case(
char *dst,
size_t dstsize,
const char *src, ssize_t srclen,
46 const char *src,
size_t srclen,
size_t srcoff,
54 return cp != 0 ? cp : code;
62 return cp != 0 ? cp : code;
70 return cp != 0 ? cp : code;
78 return cp != 0 ? cp : code;
84 * Convert src to lowercase, and return the result length (not including
 * terminating NUL).
87 * String src must be encoded in UTF-8. If srclen < 0, src must be
 * NUL-terminated.
90 * Result string is stored in dst, truncating if larger than dstsize. If
91 * dstsize is greater than the result length, dst will be NUL-terminated;
 * otherwise not.
94 * If dstsize is zero, dst may be NULL. This is useful for calculating the
95 * required buffer size before allocating.
97 * If full is true, use special case mappings if available and if the
98 * conditions are satisfied.
111 * Convert src to titlecase, and return the result length (not including
 * terminating NUL).
114 * String src must be encoded in UTF-8. If srclen < 0, src must be
 * NUL-terminated.
117 * Result string is stored in dst, truncating if larger than dstsize. If
118 * dstsize is greater than the result length, dst will be NUL-terminated;
 * otherwise not.
121 * If dstsize is zero, dst may be NULL. This is useful for calculating the
122 * required buffer size before allocating.
124 * If full is true, use special case mappings if available and if the
125 * conditions are satisfied. Otherwise, use only simple mappings and use
126 * uppercase instead of titlecase.
128 * Titlecasing requires knowledge about word boundaries, which is provided by
129 * the callback wbnext. A word boundary is the offset of the start of a word
130 * or the offset of the character immediately following a word.
132 * The caller is expected to initialize and free the callback state
133 * wbstate. The callback should first return offset 0 for the first boundary;
134 * then the offset of each subsequent word boundary; then the total length of
135 * the string to indicate the final boundary.
148 * Convert src to uppercase, and return the result length (not including
 * terminating NUL).
151 * String src must be encoded in UTF-8. If srclen < 0, src must be
 * NUL-terminated.
154 * Result string is stored in dst, truncating if larger than dstsize. If
155 * dstsize is greater than the result length, dst will be NUL-terminated;
 * otherwise not.
158 * If dstsize is zero, dst may be NULL. This is useful for calculating the
159 * required buffer size before allocating.
161 * If full is true, use special case mappings if available and if the
162 * conditions are satisfied.
175 * Case fold src, and return the result length (not including terminating
 * NUL).
178 * String src must be encoded in UTF-8. If srclen < 0, src must be
 * NUL-terminated.
181 * Result string is stored in dst, truncating if larger than dstsize. If
182 * dstsize is greater than the result length, dst will be NUL-terminated;
 * otherwise not.
185 * If dstsize is zero, dst may be NULL. This is useful for calculating the
186 * required buffer size before allocating.
197 * Implement Unicode Default Case Conversion algorithm.
199 * If str_casekind is CaseLower or CaseUpper, map each character in the string
200 * for which a mapping is available.
202 * If str_casekind is CaseTitle, map characters found on a word boundary to
203 * titlecase (or uppercase if full is false) and other characters to
204 * lowercase. NB: does not currently implement the Unicode behavior in which
205 * the word boundary is adjusted to the next Cased character. That behavior
206 * could be implemented as an option, but it doesn't match the default
207 * behavior of ICU, nor does it match the documented behavior of INITCAP().
209 * If full is true, use special mappings for relevant characters, which can
210 * map a single codepoint to multiple codepoints, or depend on conditions.
213 convert_case(
char *dst,
size_t dstsize,
const char *src, ssize_t srclen,
217 /* character CaseKind varies while titlecasing */
218 CaseKind chr_casekind = str_casekind;
220 size_t result_len = 0;
224 (str_casekind !=
CaseTitle && !wbnext && !wbstate));
228 boundary = wbnext(wbstate);
229 Assert(boundary == 0);
/* start of text is always a boundary */
232 while ((srclen < 0 || srcoff < srclen) && src[srcoff] !=
'\0')
242 if (srcoff == boundary)
245 boundary = wbnext(wbstate);
251 casemap_result =
casemap(u1, chr_casekind, full, src, srclen, srcoff,
254 switch (casemap_result)
257 /* no mapping; copy bytes from src */
260 if (result_len + u1len <= dstsize)
261 memcpy(dst + result_len, src + srcoff, u1len);
267 /* replace with single character */
272 if (result_len + u2len <= dstsize)
279 /* replace with up to MAX_CASE_EXPANSION characters */
286 if (result_len + u2len <= dstsize)
297 if (result_len < dstsize)
298 dst[result_len] =
'\0';
304 * Check that the condition matches Final_Sigma, described in Unicode Table
305 * 3-17. The character at the given offset must be directly preceded by a
306 * Cased character, and must not be directly followed by a Cased character.
308 * Case_Ignorable characters are ignored. NB: some characters may be both
309 * Cased and Case_Ignorable, in which case they are ignored.
314 /* the start of the string is not preceded by a Cased character */
318 /* iterate backwards, looking for Cased character */
319 for (
int i = offset - 1;
i >= 0;
i--)
321 if ((
str[
i] & 0x80) == 0 || (
str[
i] & 0xC0) == 0xC0)
332 else if ((
str[
i] & 0xC0) == 0x80)
335 Assert(
false);
/* invalid UTF-8 */
338 /* end of string is not followed by a Cased character */
342 /* iterate forwards, looking for Cased character */
343 for (
int i = offset + 1;
i <
len &&
str[
i] !=
'\0';
i++)
345 if ((
str[
i] & 0x80) == 0 || (
str[
i] & 0xC0) == 0xC0)
356 else if ((
str[
i] & 0xC0) == 0x80)
359 Assert(
false);
/* invalid UTF-8 */
366 * Unicode allows for special casing to be applied only under certain
367 * circumstances. The only currently-supported condition is Final_Sigma.
378 /* no other conditions supported */
384 * Map the given character to the requested case.
386 * If full is true, and a special case mapping is found and the conditions are
387 * met, 'special' is set to the mapping result (which is an array of up to
388 * MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned.
390 * Otherwise, search for a simple mapping, and if found, set 'simple' to the
391 * result and return CASEMAP_SIMPLE.
393 * If no mapping is found, return CASEMAP_SELF, and the caller should copy the
394 * character without modification.
398 const char *src,
size_t srclen,
size_t srcoff,
403 /* Fast path for codepoints < 0x80 */
407 * The first elements in all tables are reserved as 0 (as NULL). The
408 * data starts at index 1, not 0.
422 src, srclen, srcoff))
434 * Find entry in simple case map.
435 * If the entry does not exist, 0 will be returned.
440 /* Fast path for codepoints < 0x80 */
442 /* The first elements in all tables are reserved as 0 (as NULL). */
Datum idx(PG_FUNCTION_ARGS)
Assert(PointerIsAligned(start, uint64))
static pg_wchar utf8_to_unicode(const unsigned char *c)
static unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
static int unicode_utf8len(pg_wchar c)
pg_wchar map[NCaseKind][MAX_CASE_EXPANSION]
pg_wchar unicode_uppercase_simple(pg_wchar code)
pg_wchar unicode_titlecase_simple(pg_wchar code)
size_t unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full, const char *src, size_t srclen, size_t srcoff, pg_wchar *simple, const pg_wchar **special)
size_t unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, CaseKind str_casekind, bool full, WordBoundaryNext wbnext, void *wbstate)
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full, WordBoundaryNext wbnext, void *wbstate)
static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map)
static bool check_special_conditions(int conditions, const char *str, size_t len, size_t offset)
static const pg_wchar *const casekind_map[NCaseKind]
pg_wchar unicode_lowercase_simple(pg_wchar code)
size_t unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
pg_wchar unicode_casefold_simple(pg_wchar code)
static bool check_final_sigma(const unsigned char *str, size_t len, size_t offset)
size_t(* WordBoundaryNext)(void *wbstate)
static const uint8 case_map_special[1704]
#define MAX_CASE_EXPANSION
static const pg_special_case special_case[106]
static const pg_wchar case_map_title[1704]
static const pg_wchar case_map_upper[1704]
static const pg_wchar case_map_fold[1704]
static const pg_wchar case_map_lower[1704]
static uint16 case_index(pg_wchar cp)
bool pg_u_prop_cased(pg_wchar code)
bool pg_u_prop_case_ignorable(pg_wchar code)