1/*-----------------------------------------------------------------------
3 * PostgreSQL locale utilities for libc
5 * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
7 * src/backend/utils/adt/pg_locale_libc.c
9 *-----------------------------------------------------------------------
29#include <gnu/libc-version.h>
37 * For the libc provider, to provide as much functionality as possible on a
38 * variety of platforms without going so far as to implement everything from
39 * scratch, we use several implementation strategies depending on the
42 * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
43 * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
44 * collations don't give a fig about multibyte characters.
46 * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
47 * This assumes that every platform uses Unicode codepoints directly
48 * as the wchar_t representation of Unicode. (XXX: ICU makes this assumption
49 * even for non-UTF8 encodings, which may be a problem.) On some platforms
50 * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
52 * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
53 * values up to 255, and punt for values above that. This is 100% correct
54 * only in single-byte encodings such as LATINn. However, non-Unicode
55 * multibyte encodings are mostly Far Eastern character sets for which the
56 * properties being tested here aren't very relevant for higher code values
57 * anyway. The difficulty with using the <wctype.h> functions with
58 * non-Unicode multibyte encodings is that we can have no certainty that
59 * the platform's wchar_t representation matches what we do in pg_wchar
62 * As a special case, in the "default" collation, (2) and (3) force ASCII
63 * letters to follow ASCII upcase/downcase rules, while in a non-default
64 * collation we just let the library functions do what they will. The case
65 * where this matters is treatment of I/i in Turkish, and the behavior is
66 * meant to match the upper()/lower() SQL functions.
68 * We store the active collation setting in static variables. In principle
69 * it could be passed down to here via the regex library's "struct vars" data
70 * structure; but that would require somewhat invasive changes in the regex
71 * library, and right now there's no real benefit to be gained from that.
73 * NB: the coding here assumes pg_wchar is an unsigned type.
77 * Size of stack buffer to use for string transformations, used to avoid heap
78 * allocations in typical cases. This should be large enough that most strings
79 * will fit, but small enough that we feel comfortable putting it on the
82 #define TEXTBUFLEN 1024
87 const char *arg2, ssize_t len2,
90 const char *src, ssize_t srclen,
97static int strncoll_libc_win32_utf8(
const char *arg1, ssize_t len1,
98 const char *arg2, ssize_t len2,
103 const char *src, ssize_t srclen,
106 const char *src, ssize_t srclen,
109 const char *src, ssize_t srclen,
112 const char *src, ssize_t srclen,
115 const char *src, ssize_t srclen,
118 const char *src, ssize_t srclen,
252 /* force C behavior for ASCII characters, per comments above */
266 /* force C behavior for ASCII characters, per comments above */
269 if (
sizeof(
wchar_t) >= 4 || wc <= (
pg_wchar) 0xFFFF)
280 /* force C behavior for ASCII characters, per comments above */
294 /* force C behavior for ASCII characters, per comments above */
297 if (
sizeof(
wchar_t) >= 4 || wc <= (
pg_wchar) 0xFFFF)
320 .max_chr = UCHAR_MAX,
324 * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but
325 * single-byte semantics for pattern matching.
344 .max_chr = UCHAR_MAX,
369 .strnxfrm_prefix = NULL,
372 * Unfortunately, it seems that strxfrm() for non-C collations is broken
373 * on many common platforms; testing of multiple versions of glibc reveals
374 * that, for many locales, strcoll() and strxfrm() do not return
375 * consistent results. While no libc other than glibc and Cygwin has so far
376 * been shown to have a problem, we take the conservative course of action
377 * for right now and disable this categorically. (Users who are certain
378 * this isn't a problem on their system can define TRUST_STRXFRM.)
381 .strxfrm_is_safe =
true,
383 .strxfrm_is_safe =
false,
389 .
strncoll = strncoll_libc_win32_utf8,
391 .strnxfrm_prefix = NULL,
393 .strxfrm_is_safe =
true,
395 .strxfrm_is_safe =
false,
405 srclen = strlen(src);
407 if (srclen + 1 <= destsize)
412 if (srclen + 1 > destsize)
415 memcpy(
dest, src, srclen);
419 * Note: we assume that tolower_l() will not be so broken as to need
420 * an isupper_l() guard test. When using the default collation, we
421 * apply the traditional Postgres behavior that forces ASCII-style
422 * treatment of I/i, but in non-default collations you get exactly
423 * what the collation says.
425 for (p =
dest; *p; p++)
449 srclen = strlen(src);
451 /* Overflow paranoia */
452 if ((srclen + 1) > (INT_MAX /
sizeof(
wchar_t)))
454 (
errcode(ERRCODE_OUT_OF_MEMORY),
455 errmsg(
"out of memory")));
457 /* Output workspace cannot have more codes than input bytes */
458 workspace = (
wchar_t *)
palloc((srclen + 1) *
sizeof(wchar_t));
460 char2wchar(workspace, srclen + 1, src, srclen, loc);
462 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
463 workspace[curr_char] =
towlower_l(workspace[curr_char], loc);
466 * Make result large enough; case change might change number of bytes
469 result =
palloc(max_size + 1);
471 result_size =
wchar2char(result, workspace, max_size + 1, loc);
473 if (result_size + 1 > destsize)
476 memcpy(
dest, result, result_size);
477 dest[result_size] =
'0円';
490 srclen = strlen(src);
492 if (srclen + 1 <= destsize)
495 int wasalnum =
false;
498 memcpy(
dest, src, srclen);
502 * Note: we assume that toupper_l()/tolower_l() will not be so broken
503 * as to need guard tests. When using the default collation, we apply
504 * the traditional Postgres behavior that forces ASCII-style treatment
505 * of I/i, but in non-default collations you get exactly what the
508 for (p =
dest; *p; p++)
524 wasalnum =
isalnum_l((
unsigned char) *p, loc);
536 int wasalnum =
false;
544 srclen = strlen(src);
546 /* Overflow paranoia */
547 if ((srclen + 1) > (INT_MAX /
sizeof(
wchar_t)))
549 (
errcode(ERRCODE_OUT_OF_MEMORY),
550 errmsg(
"out of memory")));
552 /* Output workspace cannot have more codes than input bytes */
553 workspace = (
wchar_t *)
palloc((srclen + 1) *
sizeof(wchar_t));
555 char2wchar(workspace, srclen + 1, src, srclen, loc);
557 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
560 workspace[curr_char] =
towlower_l(workspace[curr_char], loc);
562 workspace[curr_char] =
towupper_l(workspace[curr_char], loc);
563 wasalnum =
iswalnum_l(workspace[curr_char], loc);
567 * Make result large enough; case change might change number of bytes
570 result =
palloc(max_size + 1);
572 result_size =
wchar2char(result, workspace, max_size + 1, loc);
574 if (result_size + 1 > destsize)
577 memcpy(
dest, result, result_size);
578 dest[result_size] =
'0円';
591 srclen = strlen(src);
593 if (srclen + 1 <= destsize)
598 memcpy(
dest, src, srclen);
602 * Note: we assume that toupper_l() will not be so broken as to need
603 * an islower_l() guard test. When using the default collation, we
604 * apply the traditional Postgres behavior that forces ASCII-style
605 * treatment of I/i, but in non-default collations you get exactly
606 * what the collation says.
608 for (p =
dest; *p; p++)
632 srclen = strlen(src);
634 /* Overflow paranoia */
635 if ((srclen + 1) > (INT_MAX /
sizeof(
wchar_t)))
637 (
errcode(ERRCODE_OUT_OF_MEMORY),
638 errmsg(
"out of memory")));
640 /* Output workspace cannot have more codes than input bytes */
641 workspace = (
wchar_t *)
palloc((srclen + 1) *
sizeof(wchar_t));
643 char2wchar(workspace, srclen + 1, src, srclen, loc);
645 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
646 workspace[curr_char] =
towupper_l(workspace[curr_char], loc);
649 * Make result large enough; case change might change number of bytes
652 result =
palloc(max_size + 1);
654 result_size =
wchar2char(result, workspace, max_size + 1, loc);
656 if (result_size + 1 > destsize)
659 memcpy(
dest, result, result_size);
660 dest[result_size] =
'0円';
676 if (
collid == DEFAULT_COLLATION_OID)
685 Anum_pg_database_datcollate);
688 Anum_pg_database_datctype);
703 Anum_pg_collation_collcollate);
706 Anum_pg_collation_collctype);
718 (strcmp(collate,
"POSIX") == 0);
719 result->
ctype_is_c = (strcmp(ctype,
"C") == 0) ||
720 (strcmp(ctype,
"POSIX") == 0);
726 result->
collate = &collate_methods_libc_win32_utf8;
745 * Create a locale_t with the given collation and ctype.
747 * The "C" and "POSIX" locales are not actually handled by libc, so return
750 * Ensure that no path leaks a locale_t.
757 if (strcmp(collate, ctype) == 0)
759 if (strcmp(ctype,
"C") != 0 && strcmp(ctype,
"POSIX") != 0)
761 /* Normal case where they're the same */
764 loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
767 loc = _create_locale(LC_ALL, collate);
776 /* We need two newlocale() steps */
779 if (strcmp(collate,
"C") != 0 && strcmp(collate,
"POSIX") != 0)
782 loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
787 if (strcmp(ctype,
"C") != 0 && strcmp(ctype,
"POSIX") != 0)
790 loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
803 * XXX The _create_locale() API doesn't appear to support this. Could
804 * perhaps be worked around by changing pg_locale_t to contain two
808 (
errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
809 errmsg(
"collations with different collate and ctype values are not supported on this platform")));
819 * NUL-terminate arguments, if necessary, and pass to strcoll_l().
821 * An input string length of -1 means that it's already NUL-terminated.
824 strncoll_libc(
const char *arg1, ssize_t len1,
const char *arg2, ssize_t len2,
829 size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
830 size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
838 /* nul-terminate arguments if necessary */
847 memcpy(buf1, arg1, len1);
858 char *buf2 =
buf + bufsize1;
860 memcpy(buf2, arg2, len2);
876 * NUL-terminate src, if necessary, and pass to strxfrm_l().
878 * A source length of -1 means that it's already NUL-terminated.
895 /* nul-terminate argument */
896 memcpy(
buf, src, srclen);
904 /* if dest is defined, it should be nul-terminated */
905 Assert(result >= destsize ||
dest[result] ==
'0円');
913 char *collversion = NULL;
919#if defined(__GLIBC__)
920 /* Use the glibc version because we don't have anything better. */
921 collversion =
pstrdup(gnu_get_libc_version());
922#elif defined(LC_VERSION_MASK)
925 /* Look up FreeBSD collation version. */
926 loc = newlocale(LC_COLLATE_MASK, collcollate, NULL);
930 pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, loc));
935 (
errmsg(
"could not load locale \"%s\"", collcollate)));
938 * If we are targeting Windows Vista and above, we can ask for a version
939 * given a collation name (earlier Windows versions required a location code
940 * that we don't have).
942 NLSVERSIONINFOEX version = {
sizeof(NLSVERSIONINFOEX)};
943 WCHAR wide_collcollate[LOCALE_NAME_MAX_LENGTH];
945 MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
946 LOCALE_NAME_MAX_LENGTH);
947 if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
950 * GetNLSVersionEx() wants a language tag such as "en-US", not a
951 * locale name like "English_United States.1252". Until those
952 * values can be prevented from entering the system, or 100%
953 * reliably converted to the more useful tag format, tolerate the
954 * resulting error and report that we have no version data.
956 if (GetLastError() == ERROR_INVALID_PARAMETER)
960 (
errmsg(
"could not get collation version for locale \"%s\": error code %lu",
964 collversion =
psprintf(
"%lu.%lu,%lu.%lu",
965 (version.dwNLSVersion >> 8) & 0xFFFF,
966 version.dwNLSVersion & 0xFF,
967 (version.dwDefinedVersion >> 8) & 0xFFFF,
968 version.dwDefinedVersion & 0xFF);
976 * strncoll_libc_win32_utf8
978 * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
979 * invoke wcscoll_l().
981 * An input string length of -1 means that it's NUL-terminated.
985strncoll_libc_win32_utf8(
const char *arg1, ssize_t len1,
const char *arg2,
1000 len1 = strlen(arg1);
1002 len2 = strlen(arg2);
1004 a1len = len1 * 2 + 2;
1005 a2len = len2 * 2 + 2;
1013 /* API does not work for zero-length input */
1018 r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1019 (LPWSTR) a1p, a1len / 2);
1022 (
errmsg(
"could not convert string to UTF-16: error code %lu",
1025 ((LPWSTR) a1p)[r] = 0;
1031 r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1032 (LPWSTR) a2p, a2len / 2);
1035 (
errmsg(
"could not convert string to UTF-16: error code %lu",
1038 ((LPWSTR) a2p)[r] = 0;
1042 if (result == 2147483647)
/* _NLSCMPERROR; missing from mingw headers */
1044 (
errmsg(
"could not compare Unicode strings: %m")));
1053/* simple subroutine for reporting errors from newlocale() */
1060 * Windows doesn't provide any useful error indication from
1061 * _create_locale(), and BSD-derived platforms don't seem to feel they
1062 * need to set errno either (even though POSIX is pretty clear that
1063 * newlocale should do so). So, if errno hasn't been set, assume ENOENT
1064 * is what to report.
1070 * ENOENT means "no such locale", not "no such file", so clarify that
1071 * errno with an errdetail message.
1073 save_errno = errno;
/* auxiliary funcs might change errno */
1075 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1076 errmsg(
"could not create locale \"%s\": %m",
1078 (save_errno == ENOENT ?
1079 errdetail(
"The operating system could not find any locale data for the locale name \"%s\".",
1084 * POSIX doesn't define _l-variants of these functions, but several systems
1085 * have them. We provide our own replacements here.
1087#ifndef HAVE_MBSTOWCS_L
1092 return _mbstowcs_l(
dest, src, n, loc);
1095 locale_t save_locale = uselocale(loc);
1097 result = mbstowcs(
dest, src, n);
1098 uselocale(save_locale);
1103#ifndef HAVE_WCSTOMBS_L
1108 return _wcstombs_l(
dest, src, n, loc);
1111 locale_t save_locale = uselocale(loc);
1113 result = wcstombs(
dest, src, n);
1114 uselocale(save_locale);
1121 * These functions convert from/to libc's wchar_t, *not* pg_wchar.
1122 * Therefore we keep them here rather than with the mbutils code.
1126 * wchar2char --- convert wide characters to multibyte format
1128 * This has the same API as the standard wcstombs_l() function; in particular,
1129 * tolen is the maximum number of bytes to store at *to, and *from must be
1130 * zero-terminated. The output will be zero-terminated iff there is room.
1143 * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
1144 * for some reason mbstowcs and wcstombs won't do this for us, so we use
1145 * MultiByteToWideChar() and WideCharToMultiByte().
1149 result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
1151 /* A zero return is failure */
1157 /* Microsoft counts the zero terminator in the result */
1165 /* Use wcstombs directly for the default locale */
1166 result = wcstombs(to, from, tolen);
1170 /* Use wcstombs_l for nondefault locales */
1178 * char2wchar --- convert multibyte characters to wide characters
1180 * This has almost the API of mbstowcs_l(), except that *from need not be
1181 * null-terminated; instead, the number of input bytes is specified as
1182 * fromlen. Also, we ereport() rather than returning -1 for invalid
1183 * input encoding. tolen is the maximum number of wchar_t's to store at *to.
1184 * The output will be zero-terminated iff there is room.
1187 char2wchar(
wchar_t *to,
size_t tolen,
const char *from,
size_t fromlen,
1196 /* See WIN32 "Unicode" comment above */
1199 /* Win32 API does not work for zero-length input */
1204 result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
1205 /* A zero return is failure */
1213 /* Append trailing null wchar (MultiByteToWideChar() does not) */
1220 /* mbstowcs requires ending '\0' */
1225 /* Use mbstowcs directly for the default locale */
1226 result = mbstowcs(to,
str, tolen);
1230 /* Use mbstowcs_l for nondefault locales */
1240 * Invalid multibyte character encountered. We try to give a useful
1241 * error message by letting pg_verifymbstr check the string. But it's
1242 * possible that the string is OK to us, and not OK to mbstowcs ---
1243 * this suggests that the LC_CTYPE locale is different from the
1244 * database encoding. Give a generic error message if pg_verifymbstr
1245 * can't find anything wrong.
1248 /* but if it does ... */
1250 (
errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1251 errmsg(
"invalid multibyte character for locale"),
1252 errhint(
"The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
#define TextDatumGetCString(d)
#define IS_HIGHBIT_SET(ch)
int errdetail(const char *fmt,...)
int errhint(const char *fmt,...)
int errcode(int sqlerrcode)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
Assert(PointerIsAligned(start, uint64))
#define HeapTupleIsValid(tuple)
if(TABLE==NULL||TABLE_index==NULL)
int GetDatabaseEncoding(void)
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
int pg_database_encoding_max_length(void)
void * MemoryContextAllocZero(MemoryContext context, Size size)
char * pstrdup(const char *in)
void pfree(void *pointer)
char * pnstrdup(const char *in, Size len)
static bool wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale)
static const struct ctype_methods ctype_methods_libc_other_mb
static const struct ctype_methods ctype_methods_libc_utf8
static pg_wchar toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context)
size_t wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
static bool wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, locale_t loc)
static bool wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
static pg_wchar toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale)
char * get_collation_actual_version_libc(const char *collcollate)
static bool wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale)
static locale_t make_libc_collator(const char *collate, const char *ctype)
static bool wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale)
static pg_wchar tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
static size_t wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
static const struct collate_methods collate_methods_libc
static bool wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static int strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
static size_t strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
static const struct ctype_methods ctype_methods_libc_sb
static size_t strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
void report_newlocale_failure(const char *localename)
static pg_wchar tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
static char char_tolower_libc(unsigned char ch, pg_locale_t locale)
static bool char_is_cased_libc(char ch, pg_locale_t locale)
static bool wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static size_t mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
static size_t strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
int pg_strcasecmp(const char *s1, const char *s2)
unsigned char pg_toupper(unsigned char ch)
unsigned char pg_tolower(unsigned char ch)
unsigned char pg_ascii_tolower(unsigned char ch)
unsigned char pg_ascii_toupper(unsigned char ch)
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
static Datum ObjectIdGetDatum(Oid X)
char * psprintf(const char *fmt,...)
int(* strncoll)(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
size_t(* strlower)(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
const struct ctype_methods * ctype
const struct collate_methods * collate
void ReleaseSysCache(HeapTuple tuple)
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Datum SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, AttrNumber attributeNumber)