1/*-------------------------------------------------------------------------
3 * Utility functions for conversion procs.
5 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
6 * Portions Copyright (c) 1994, Regents of the University of California
9 * src/backend/utils/mb/conv.c
11 *-------------------------------------------------------------------------
18 * local2local: a generic single byte charset encoding
19 * conversion between two ASCII-superset encodings.
21 * l points to the source string of length len
22 * p is the output area (must be large enough!)
23 * src_encoding is the PG identifier for the source encoding
24 * dest_encoding is the PG identifier for the target encoding
25 * tab holds conversion entries for the source charset
26 * starting from 128 (0x80). Each entry in the table holds the corresponding
27 * code point for the target charset, or 0 if there is no equivalent code.
29 * Returns the number of input bytes consumed. If noError is true, this can be less than 'len'.
38 const unsigned char *tab,
41 const unsigned char *
start = l;
66 (
const char *) l,
len);
78 * LATINn ---> MIC when the charset's local codes map directly to MIC
80 * l points to the source string of length len
81 * p is the output area (must be large enough!)
82 * lc is the mule character set id for the local encoding
83 * encoding is the PG identifier for the local encoding
85 * Returns the number of input bytes consumed. If noError is true, this can be less than 'len'.
92 const unsigned char *
start = l;
116 * MIC ---> LATINn when the charset's local codes map directly to MIC
118 * mic points to the source string of length len
119 * p is the output area (must be large enough!)
120 * lc is the mule character set id for the local encoding
121 * encoding is the PG identifier for the local encoding
123 * Returns the number of input bytes consumed. If noError is true, this can
124 * be less than 'len'.
130 const unsigned char *
start = mic;
165 (
const char *) mic,
len);
179 * latin2mic_with_table: a generic single byte charset encoding
180 * conversion from a local charset to the mule internal code.
182 * l points to the source string of length len
183 * p is the output area (must be large enough!)
184 * lc is the mule character set id for the local encoding
185 * encoding is the PG identifier for the local encoding
186 * tab holds conversion entries for the local charset
187 * starting from 128 (0x80). Each entry in the table holds the corresponding
188 * code point for the mule encoding, or 0 if there is no equivalent code.
190 * Returns the number of input bytes consumed. If noError is true, this can
191 * be less than 'len'.
199 const unsigned char *tab,
202 const unsigned char *
start = l;
230 (
const char *) l,
len);
242 * mic2latin_with_table: a generic single byte charset encoding
243 * conversion from the mule internal code to a local charset.
245 * mic points to the source string of length len
246 * p is the output area (must be large enough!)
247 * lc is the mule character set id for the local encoding
248 * encoding is the PG identifier for the local encoding
249 * tab holds conversion entries for the mule internal code's second byte,
250 * starting from 128 (0x80). Each entry in the table holds the corresponding
251 * code point for the local charset, or 0 if there is no equivalent code.
253 * Returns the number of input bytes consumed. If noError is true, this can
254 * be less than 'len'.
262 const unsigned char *tab,
265 const unsigned char *
start = mic;
297 (c2 = tab[mic[1] -
HIGHBIT]) == 0)
302 (
const char *) mic,
len);
303 break;
/* keep compiler quiet */
316 * comparison routine for bsearch()
317 * this routine is intended for combined UTF8 -> local code
331 return (
s1 > d1 || (
s1 == d1 &&
s2 > d2)) ? 1 : ((
s1 == d1 &&
s2 == d2) ? 0 : -1);
335 * comparison routine for bsearch()
336 * this routine is intended for local code -> combined UTF8
344 v1 = *(
const uint32 *) p1;
346 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
350 * store a 32-bit character representation into a multibyte stream
352static inline unsigned char *
355 if (code & 0xff000000)
356 *
dest++ = code >> 24;
357 if (code & 0x00ff0000)
358 *
dest++ = code >> 16;
359 if (code & 0x0000ff00)
361 if (code & 0x000000ff)
367 * Convert a character using a conversion radix tree.
369 * 'l' is the length of the input character in bytes, and b1-b4 are
370 * the input character's bytes.
384 /* check code validity */
385 if (b1 < rt->b4_1_lower || b1 > rt->
b4_1_upper ||
415 /* check code validity */
416 if (b2 < rt->b3_1_lower || b2 > rt->
b3_1_upper ||
443 /* check code validity - first byte */
444 if (b3 < rt->b2_1_lower || b3 > rt->
b2_1_upper ||
468 /* check code validity - first byte */
469 if (b4 < rt->b1_lower || b4 > rt->
b1_upper)
478 return 0;
/* shouldn't happen */
482 * UTF8 ---> local code
484 * utf: input string in UTF8 encoding (need not be null-terminated)
485 * len: length of input string (in bytes)
486 * iso: pointer to the output area (must be large enough!)
487 * (output string will be null-terminated)
488 * map: conversion map for single characters
489 * cmap: conversion map for combined characters
490 * (optional, pass NULL if none)
491 * cmapsize: number of entries in the conversion map for combined characters
492 * (optional, pass 0 if none)
493 * conv_func: algorithmic encoding conversion function
494 * (optional, pass NULL if none)
495 * encoding: PG identifier for the local encoding
497 * For each character, the cmap (if provided) is consulted first; if no match,
498 * the map is consulted next; if still no match, the conv_func (if provided)
499 * is applied. An error is raised if no match is found.
501 * See pg_wchar.h for more details about the data structures used here.
503 * Returns the number of input bytes consumed. If noError is true, this can
504 * be less than 'len'.
517 const unsigned char *
start = utf;
521 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
526 unsigned char b1 = 0;
527 unsigned char b2 = 0;
528 unsigned char b3 = 0;
529 unsigned char b4 = 0;
531 /* "break" cases all represent errors */
544 /* ASCII case is easy, assume it's one-to-one conversion */
549 /* collect coded char of length l */
570 elog(
ERROR,
"unsupported character length %d", l);
571 iutf = 0;
/* keep compiler quiet */
573 iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
575 /* First, try with combined map if possible */
578 const unsigned char *utf_save = utf;
582 /* collect next character, same as above */
588 /* need more data to decide if this is a combined char */
601 /* We assume ASCII character cannot be in combined map */
614 iutf2 = *utf++ << 16;
615 iutf2 |= *utf++ << 8;
620 iutf2 = *utf++ << 24;
621 iutf2 |= *utf++ << 16;
622 iutf2 |= *utf++ << 8;
627 elog(
ERROR,
"unsupported character length %d", l);
628 iutf2 = 0;
/* keep compiler quiet */
634 cp = bsearch(cutf, cmap, cmapsize,
644 /* fail, so back up to reprocess second character next time */
650 /* Now check ordinary map */
662 /* if there's a conversion function, try that */
665 uint32 converted = (*conv_func) (iutf);
674 /* failed to translate this character */
679 (
const char *) utf,
len);
682 /* if we broke out of loop early, must be invalid input */
683 if (
len > 0 && !noError)
692 * local code ---> UTF8
694 * iso: input string in local encoding (need not be null-terminated)
695 * len: length of input string (in bytes)
696 * utf: pointer to the output area (must be large enough!)
697 * (output string will be null-terminated)
698 * map: conversion map for single characters
699 * cmap: conversion map for combined characters
700 * (optional, pass NULL if none)
701 * cmapsize: number of entries in the conversion map for combined characters
702 * (optional, pass 0 if none)
703 * conv_func: algorithmic encoding conversion function
704 * (optional, pass NULL if none)
705 * encoding: PG identifier for the local encoding
707 * For each character, the map is consulted first; if no match, the cmap
708 * (if provided) is consulted next; if still no match, the conv_func
709 * (if provided) is applied. An error is raised if no match is found.
711 * See pg_wchar.h for more details about the data structures used here.
713 * Returns the number of input bytes consumed. If noError is true, this can
714 * be less than 'len'.
728 const unsigned char *
start = iso;
732 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
737 unsigned char b1 = 0;
738 unsigned char b2 = 0;
739 unsigned char b3 = 0;
740 unsigned char b4 = 0;
742 /* "break" cases all represent errors */
748 /* ASCII case is easy, assume it's one-to-one conversion */
758 /* collect coded char of length l */
781 elog(
ERROR,
"unsupported character length %d", l);
782 iiso = 0;
/* keep compiler quiet */
784 iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
796 /* If there's a combined character map, try that */
799 cp = bsearch(&iiso, cmap, cmapsize,
811 /* if there's a conversion function, try that */
814 uint32 converted = (*conv_func) (iiso);
823 /* failed to translate this character */
828 (
const char *) iso,
len);
831 /* if we broke out of loop early, must be invalid input */
832 if (
len > 0 && !noError)
Datum idx(PG_FUNCTION_ARGS)
#define IS_HIGHBIT_SET(ch)
int mic2latin_with_table(const unsigned char *mic, unsigned char *p, int len, int lc, int encoding, const unsigned char *tab, bool noError)
int UtfToLocal(const unsigned char *utf, int len, unsigned char *iso, const pg_mb_radix_tree *map, const pg_utf_to_local_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding, bool noError)
static int compare3(const void *p1, const void *p2)
int latin2mic_with_table(const unsigned char *l, unsigned char *p, int len, int lc, int encoding, const unsigned char *tab, bool noError)
static unsigned char * store_coded_char(unsigned char *dest, uint32 code)
int mic2latin(const unsigned char *mic, unsigned char *p, int len, int lc, int encoding, bool noError)
static uint32 pg_mb_radix_conv(const pg_mb_radix_tree *rt, int l, unsigned char b1, unsigned char b2, unsigned char b3, unsigned char b4)
int LocalToUtf(const unsigned char *iso, int len, unsigned char *utf, const pg_mb_radix_tree *map, const pg_local_to_utf_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding, bool noError)
static int compare4(const void *p1, const void *p2)
int local2local(const unsigned char *l, unsigned char *p, int len, int src_encoding, int dest_encoding, const unsigned char *tab, bool noError)
int latin2mic(const unsigned char *l, unsigned char *p, int len, int lc, int encoding, bool noError)
int errcode(int sqlerrcode)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len)
void report_invalid_encoding(int encoding, const char *mbstr, int len)
uint32(* utf_local_conversion_func)(uint32 code)
#define PG_VALID_ENCODING(_enc)
bool pg_utf8_islegal(const unsigned char *source, int length)
int pg_mule_mblen(const unsigned char *s)
int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)