PostgreSQL Source Code: src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c Source File

PostgreSQL Source Code git master
utf8_and_gb18030.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * GB18030 <--> UTF8
4 *
5 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
6 * Portions Copyright (c) 1994, Regents of the University of California
7 *
8 * IDENTIFICATION
9 * src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
10 *
11 *-------------------------------------------------------------------------
12 */
13
14#include "postgres.h"
15#include "fmgr.h"
16#include "mb/pg_wchar.h"
17#include "../../Unicode/gb18030_to_utf8.map"
18#include "../../Unicode/utf8_to_gb18030.map"
19
20 PG_MODULE_MAGIC_EXT(
21 .name = "utf8_and_gb18030",
22 .version = PG_VERSION
23);
24
25 PG_FUNCTION_INFO_V1(gb18030_to_utf8);
26 PG_FUNCTION_INFO_V1(utf8_to_gb18030);
27
28/*
29 * Convert 4-byte GB18030 characters to and from a linear code space
30 *
31 * The first and third bytes can range from 0x81 to 0xfe (126 values),
32 * while the second and fourth bytes can range from 0x30 to 0x39 (10 values).
33 */
/*
 * gb_linear
 *
 * Map a 4-byte GB18030 code, packed into a uint32 with the first byte in
 * the most significant position, to its offset in the linear code space.
 *
 * The four bytes are weighted 12600, 1260, 10 and 1 respectively, and the
 * weighted value of the lowest code (0x81 0x30 0x81 0x30) is subtracted so
 * that this code maps to zero.  The arithmetic is unsigned, so inputs below
 * that origin wrap around rather than going negative.
 */
static inline uint32
gb_linear(uint32 gb)
{
	const uint32 origin = 0x81 * 12600 + 0x30 * 1260 + 0x81 * 10 + 0x30;
	const uint32 first = (gb >> 24) & 0xff;
	const uint32 second = (gb >> 16) & 0xff;
	const uint32 third = (gb >> 8) & 0xff;
	const uint32 fourth = gb & 0xff;

	/* Horner form of first * 12600 + second * 1260 + third * 10 + fourth */
	return ((first * 10 + second) * 126 + third) * 10 + fourth - origin;
}
45
/*
 * gb_unlinear
 *
 * Turn a linear code-space offset back into a packed 4-byte GB18030 code
 * (first byte in the most significant position).
 *
 * The offset is decomposed from the least significant digit upwards:
 * radix 10 for the fourth byte, 126 for the third, 10 for the second, and
 * whatever remains goes into the first byte.  Each digit is then biased by
 * the lowest value of its byte (0x30 or 0x81).
 */
static inline uint32
gb_unlinear(uint32 lin)
{
	uint32		rest = lin;
	uint32		first;
	uint32		second;
	uint32		third;
	uint32		fourth;

	fourth = 0x30 + rest % 10;
	rest /= 10;
	third = 0x81 + rest % 126;
	rest /= 126;
	second = 0x30 + rest % 10;
	rest /= 10;
	first = 0x81 + rest;

	return (first << 24) | (second << 16) | (third << 8) | fourth;
}
56
57/*
58 * Convert word-formatted UTF8 to and from Unicode code points
59 *
60 * Probably this should be somewhere else ...
61 */
/*
 * unicode_to_utf8word
 *
 * Encode a Unicode code point as UTF8 and return the encoded bytes packed
 * into a single uint32, first byte in the highest occupied position and
 * last byte in the lowest.  The sequence length (1 to 4 bytes) is chosen
 * purely from the magnitude of "c"; no validity check is made.
 */
static inline uint32
unicode_to_utf8word(uint32 c)
{
	/* Continuation bytes carrying bits 0-5, 6-11 and 12-17 of the input */
	const uint32 cont0 = 0x80 | (c & 0x3F);
	const uint32 cont1 = 0x80 | ((c >> 6) & 0x3F);
	const uint32 cont2 = 0x80 | ((c >> 12) & 0x3F);

	/* One byte: the value is its own encoding */
	if (c <= 0x7F)
	{
		return c;
	}

	/* Two bytes: 110xxxxx 10xxxxxx */
	if (c <= 0x7FF)
	{
		return ((0xC0 | ((c >> 6) & 0x1F)) << 8) | cont0;
	}

	/* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
	if (c <= 0xFFFF)
	{
		return ((0xE0 | ((c >> 12) & 0x0F)) << 16) | (cont1 << 8) | cont0;
	}

	/* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	return ((0xF0 | ((c >> 18) & 0x07)) << 24) |
		(cont2 << 16) | (cont1 << 8) | cont0;
}
92
/*
 * utf8word_to_unicode
 *
 * Decode a UTF8 sequence that has been packed into a uint32 (first byte in
 * the highest occupied position) into a Unicode code point.  The sequence
 * length is inferred from the magnitude of the word: values up to 0x7F are
 * one byte, up to 0xFFFF two bytes, up to 0xFFFFFF three bytes, anything
 * larger four bytes.  Lead and continuation marker bits are masked off
 * without being checked.
 */
static inline uint32
utf8word_to_unicode(uint32 c)
{
	/* Payload of the final byte; used by every multi-byte form */
	const uint32 low6 = c & 0x3F;

	if (c <= 0x7F)
	{
		return c;
	}

	if (c <= 0xFFFF)
	{
		return (((c >> 8) & 0x1F) << 6) | low6;
	}

	if (c <= 0xFFFFFF)
	{
		return (((c >> 16) & 0x0F) << 12) |
			(((c >> 8) & 0x3F) << 6) |
			low6;
	}

	return (((c >> 24) & 0x07) << 18) |
		(((c >> 16) & 0x3F) << 12) |
		(((c >> 8) & 0x3F) << 6) |
		low6;
}
123
/*
 * Perform mapping of GB18030 ranges to UTF8
 *
 * General description, and the range we need to convert for U+10000 and up:
 * https://htmlpreview.github.io/?https://github.com/unicode-org/icu-data/blob/main/charset/source/gb18030/gb18030.html
 *
 * Ranges up to U+FFFF:
 * https://github.com/unicode-org/icu-data/blob/main/charset/source/gb18030/ranges.txt
 *
 * All are ranges of 4-byte GB18030 codes.
 *
 * "code" is a GB18030 character packed into a uint32.  If it falls inside
 * one of the ranges below, the result is the corresponding Unicode code
 * point encoded by unicode_to_utf8word(); otherwise the result is 0.
 */
static uint32
conv_18030_to_utf8(uint32 code)
{
	/*
	 * conv18030(minunicode, mincode, maxcode): if "code" lies within
	 * [mincode, maxcode], return the UTF8 word for the code point at the same
	 * linear distance from minunicode as "code" is from mincode.
	 *
	 * The macro deliberately captures the local variable "code" and returns
	 * from the enclosing function.  It is wrapped in do/while so that each
	 * use is a single statement, and it is undefined again below so that it
	 * does not leak into the rest of the file.
	 */
#define conv18030(minunicode, mincode, maxcode) \
	do { \
		if (code >= (mincode) && code <= (maxcode)) \
			return unicode_to_utf8word(gb_linear(code) - gb_linear(mincode) + (minunicode)); \
	} while (0)

	conv18030(0x0452, 0x8130D330, 0x8136A531);
	conv18030(0x2643, 0x8137A839, 0x8138FD38);
	conv18030(0x361B, 0x8230A633, 0x8230F237);
	conv18030(0x3CE1, 0x8231D438, 0x8232AF32);
	conv18030(0x4160, 0x8232C937, 0x8232F837);
	conv18030(0x44D7, 0x8233A339, 0x8233C931);
	conv18030(0x478E, 0x8233E838, 0x82349638);
	conv18030(0x49B8, 0x8234A131, 0x8234E733);
	conv18030(0x9FA6, 0x82358F33, 0x8336C738);
	conv18030(0xE865, 0x8336D030, 0x84308534);
	conv18030(0xFA2A, 0x84309C38, 0x84318537);
	conv18030(0xFFE6, 0x8431A234, 0x8431A439);
	conv18030(0x10000, 0x90308130, 0xE3329A35);

#undef conv18030

	/* No mapping exists */
	return 0;
}
158
/*
 * Perform mapping of UTF8 ranges to GB18030
 *
 * "code" is a UTF8 character packed into a uint32; it is first decoded to a
 * Unicode code point with utf8word_to_unicode().  If that code point falls
 * inside one of the ranges below, the result is the corresponding 4-byte
 * GB18030 code packed into a uint32; otherwise the result is 0.
 */
static uint32
conv_utf8_to_18030(uint32 code)
{
	uint32		ucs = utf8word_to_unicode(code);

	/*
	 * convutf8(minunicode, maxunicode, mincode): if "ucs" lies within
	 * [minunicode, maxunicode], return the GB18030 code at the same linear
	 * distance from mincode as "ucs" is from minunicode.
	 *
	 * The macro deliberately captures the local variable "ucs" and returns
	 * from the enclosing function.  It is wrapped in do/while so that each
	 * use is a single statement, and it is undefined again below so that it
	 * does not leak into the rest of the file.
	 */
#define convutf8(minunicode, maxunicode, mincode) \
	do { \
		if (ucs >= (minunicode) && ucs <= (maxunicode)) \
			return gb_unlinear(ucs - (minunicode) + gb_linear(mincode)); \
	} while (0)

	convutf8(0x0452, 0x200F, 0x8130D330);
	convutf8(0x2643, 0x2E80, 0x8137A839);
	convutf8(0x361B, 0x3917, 0x8230A633);
	convutf8(0x3CE1, 0x4055, 0x8231D438);
	convutf8(0x4160, 0x4336, 0x8232C937);
	convutf8(0x44D7, 0x464B, 0x8233A339);
	convutf8(0x478E, 0x4946, 0x8233E838);
	convutf8(0x49B8, 0x4C76, 0x8234A131);
	convutf8(0x9FA6, 0xD7FF, 0x82358F33);
	convutf8(0xE865, 0xF92B, 0x8336D030);
	convutf8(0xFA2A, 0xFE2F, 0x84309C38);
	convutf8(0xFFE6, 0xFFFF, 0x8431A234);
	convutf8(0x10000, 0x10FFFF, 0x90308130);

#undef convutf8

	/* No mapping exists */
	return 0;
}
187
188/* ----------
189 * conv_proc(
190 * INTEGER, -- source encoding id
191 * INTEGER, -- destination encoding id
192 * CSTRING, -- source string (null terminated C string)
193 * CSTRING, -- destination string (null terminated C string)
194 * INTEGER, -- source string length
195 * BOOL -- if true, don't throw an error if conversion fails
196 * ) returns INTEGER;
197 *
198 * Returns the number of bytes successfully converted.
199 * ----------
200 */
201Datum
202 gb18030_to_utf8(PG_FUNCTION_ARGS)
203{
204 unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
205 unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
206 int len = PG_GETARG_INT32(4);
207 bool noError = PG_GETARG_BOOL(5);
208 int converted;
209
210 CHECK_ENCODING_CONVERSION_ARGS(PG_GB18030, PG_UTF8);
211
212 converted = LocalToUtf(src, len, dest,
213 &gb18030_to_unicode_tree,
214 NULL, 0,
215 conv_18030_to_utf8,
216 PG_GB18030,
217 noError);
218
219 PG_RETURN_INT32(converted);
220}
221
222Datum
223 utf8_to_gb18030(PG_FUNCTION_ARGS)
224{
225 unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
226 unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
227 int len = PG_GETARG_INT32(4);
228 bool noError = PG_GETARG_BOOL(5);
229 int converted;
230
231 CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_GB18030);
232
233 converted = UtfToLocal(src, len, dest,
234 &gb18030_from_unicode_tree,
235 NULL, 0,
236 conv_utf8_to_18030,
237 PG_GB18030,
238 noError);
239
240 PG_RETURN_INT32(converted);
241}
uint32_t uint32
Definition: c.h:538
int UtfToLocal(const unsigned char *utf, int len, unsigned char *iso, const pg_mb_radix_tree *map, const pg_utf_to_local_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding, bool noError)
Definition: conv.c:507
int LocalToUtf(const unsigned char *iso, int len, unsigned char *utf, const pg_mb_radix_tree *map, const pg_local_to_utf_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding, bool noError)
Definition: conv.c:717
#define PG_GETARG_CSTRING(n)
Definition: fmgr.h:277
#define PG_RETURN_INT32(x)
Definition: fmgr.h:354
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_GETARG_BOOL(n)
Definition: fmgr.h:274
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
const void size_t len
@ PG_GB18030
Definition: pg_wchar.h:268
@ PG_UTF8
Definition: pg_wchar.h:232
#define CHECK_ENCODING_CONVERSION_ARGS(srcencoding, destencoding)
Definition: pg_wchar.h:507
uint64_t Datum
Definition: postgres.h:70
c
char * c
Definition: preproc-cursor.c:31
static void word(struct vars *v, int dir, struct state *lp, struct state *rp)
Definition: regcomp.c:1476
static uint32 gb_unlinear(uint32 lin)
static uint32 utf8word_to_unicode(uint32 c)
static uint32 conv_18030_to_utf8(uint32 code)
static uint32 unicode_to_utf8word(uint32 c)
Datum gb18030_to_utf8(PG_FUNCTION_ARGS)
PG_MODULE_MAGIC_EXT(.name="utf8_and_gb18030",.version=PG_VERSION)
PG_FUNCTION_INFO_V1(gb18030_to_utf8)
static uint32 conv_utf8_to_18030(uint32 code)
Datum utf8_to_gb18030(PG_FUNCTION_ARGS)
#define conv18030(minunicode, mincode, maxcode)
#define convutf8(minunicode, maxunicode, mincode)
static uint32 gb_linear(uint32 gb)
const char * name

AltStyle によって変換されたページ (->オリジナル) /