1/*-------------------------------------------------------------------------
3 * Multibyte character printing support for frontend code
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
9 * src/fe_utils/mbprint.c
11 *-------------------------------------------------------------------------
21 * To avoid version-skew problems, this file must not use declarations
22 * from pg_wchar.h: the encoding IDs we are dealing with are determined
23 * by the libpq.so we are linked with, and that might not match the
24 * numbers we see at compile time. (If this file were inside libpq,
25 * the problem would go away...)
27 * Hence, we have our own definition of pg_wchar, and we get the values
28 * of any needed encoding IDs on-the-fly.
36 static int utf8_id = -1;
43 #define PG_UTF8 pg_get_utf8_id()
47 * Convert a UTF-8 character to a Unicode code point.
48 * This is a one-character version of pg_utf2wchar_with_len.
50 * No error checks here, c must point to a long-enough string.
57 else if ((*
c & 0xe0) == 0xc0)
60 else if ((*
c & 0xf0) == 0xe0)
61 return (
pg_wchar) (((
c[0] & 0x0f) << 12) |
62 ((
c[1] & 0x3f) << 6) |
64 else if ((*
c & 0xf8) == 0xf0)
65 return (
pg_wchar) (((
c[0] & 0x07) << 18) |
66 ((
c[1] & 0x3f) << 12) |
67 ((
c[2] & 0x3f) << 6) |
70 /* that is an invalid code on purpose */
76 * Unicode 3.1 compliant validation: for each category, it checks the
77 * combination of each byte to make sure it maps to a valid range. It also
78 * returns -1 for the following UCS values: ucs > 0x10ffff; (ucs & 0xfffe) ==
79 * 0xfffe; 0xfdd0 < ucs < 0xfdef; (ucs & 0xdb00) == 0xd800 (surrogates).
86 else if ((*
c & 0xe0) == 0xc0)
89 if (((
c[1] & 0xc0) == 0x80) && ((
c[0] & 0x1f) > 0x01))
93 else if ((*
c & 0xf0) == 0xe0)
96 if (((
c[1] & 0xc0) == 0x80) &&
97 (((
c[0] & 0x0f) != 0x00) || ((
c[1] & 0x20) == 0x20)) &&
98 ((
c[2] & 0xc0) == 0x80))
101 int yx = ((
c[1] & 0x3f) << 6) | (
c[0] & 0x3f);
104 /* check 0xfffe/0xffff, 0xfdd0..0xfdef range, surrogates */
106 (((yx & 0xffe) == 0xffe) ||
107 (((yx & 0xf80) == 0xd80) && (lx >= 0x30) && (lx <= 0x4f)))) ||
108 ((z == 0x0d) && ((yx & 0xb00) == 0x800)))
114 else if ((*
c & 0xf8) == 0xf0)
116 int u = ((
c[0] & 0x07) << 2) | ((
c[1] & 0x30) >> 4);
119 if (((
c[1] & 0xc0) == 0x80) &&
120 (u > 0x00) && (u <= 0x10) &&
121 ((
c[2] & 0xc0) == 0x80) && ((
c[3] & 0xc0) == 0x80))
123 /* test for 0xzzzzfffe/0xzzzzffff */
124 if (((
c[1] & 0x0f) == 0x0f) && ((
c[2] & 0x3f) == 0x3f) &&
125 ((
c[3] & 0x3e) == 0x3e))
138 unsigned char *p = pwcs;
160 /* we skip the char */
168 * public functions : wcswidth and mbvalidate
172 * pg_wcswidth is the dumb display-width function.
173 * It assumes that everything will appear on one line.
174 * OTOH it is easier to use than pg_wcssize if this applies to you.
187 if (
len < (
size_t) chlen)
188 break;
/* Invalid string */
201 * pg_wcssize takes the given string in the given encoding and returns three values:
203 * result_width: Width in display characters of the longest line in string
204 * result_height: Number of lines in display output
205 * result_format_size: Number of bytes required to store formatted
206 * representation of string
208 * This MUST be kept in sync with pg_wcsformat!
212 int *result_width,
int *result_height,
int *result_format_size)
221 for (; *pwcs &&
len > 0; pwcs += chlen)
224 if (
len < (
size_t) chlen)
228 if (chlen == 1)
/* single-byte char */
230 if (*pwcs ==
'\n')
/* Newline */
232 if (linewidth > width)
236 format_size += 1;
/* For NUL char */
238 else if (*pwcs ==
'\r')
/* Carriage return */
243 else if (*pwcs ==
'\t')
/* Tab */
249 }
while (linewidth % 8 != 0);
251 else if (w < 0)
/* Other control char */
256 else /* Output it as-is */
262 else if (w < 0)
/* Non-ascii control char */
264 linewidth += 6;
/* \u0000 */
267 else /* All other chars */
270 format_size += chlen;
274 if (linewidth > width)
276 format_size += 1;
/* For NUL char */
280 *result_width = width;
282 *result_height = height;
283 if (result_format_size)
284 *result_format_size = format_size;
288 * Format a string into one or more "struct lineptr" lines.
289 * lines[i].ptr == NULL indicates the end of the array.
291 * This MUST be kept in sync with pg_wcssize!
295 struct lineptr *lines,
int count)
300 unsigned char *ptr = lines->
ptr;
/* Pointer to data area */
302 for (; *pwcs &&
len > 0; pwcs += chlen)
305 if (
len < (
size_t) chlen)
309 if (chlen == 1)
/* single-byte char */
311 if (*pwcs ==
'\n')
/* Newline */
314 lines->
width = linewidth;
319 exit(1);
/* Screwup */
321 /* make next line point to remaining memory */
324 else if (*pwcs ==
'\r')
/* Carriage return */
326 strcpy((
char *) ptr,
"\\r");
330 else if (*pwcs ==
'\t')
/* Tab */
336 }
while (linewidth % 8 != 0);
338 else if (w < 0)
/* Other control char */
340 sprintf((
char *) ptr,
"\\x%02X", *pwcs);
344 else /* Output it as-is */
350 else if (w < 0)
/* Non-ascii control char */
357 * This case cannot happen in the current code because only
358 * UTF-8 signals multibyte control characters. But we may need
359 * to support it at some stage
361 sprintf((
char *) ptr,
"\\u????");
366 else /* All other chars */
370 for (
i = 0;
i < chlen;
i++)
376 lines->
width = linewidth;
377 *ptr++ =
'\0';
/* Terminate formatted string */
380 exit(1);
/* Screwup */
382 (lines + 1)->ptr = NULL;
/* terminate line array */
387 * Encoding validation: delete any unvalidatable characters from the string
389 * This seems redundant with existing functionality elsewhere?
399 * other encodings needing validation should add their own routines
int PQmblen(const char *s, int encoding)
int PQdsplen(const char *s, int encoding)
static void mb_utf_validate(unsigned char *pwcs)
void pg_wcssize(const unsigned char *pwcs, size_t len, int encoding, int *result_width, int *result_height, int *result_format_size)
int pg_wcswidth(const char *pwcs, size_t len, int encoding)
void pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding, struct lineptr *lines, int count)
static int utf_charcheck(const unsigned char *c)
static int pg_get_utf8_id(void)
static pg_wchar utf8_to_unicode(const unsigned char *c)
unsigned char * mbvalidate(unsigned char *pwcs, int encoding)
#define pg_char_to_encoding