[フレーム]

mbprint.c

Go to the documentation of this file.

1/*-------------------------------------------------------------------------

2 *

3 * Multibyte character printing support for frontend code

4 *

5 *

8 *

9 * src/fe_utils/mbprint.c

10 *

11 *-------------------------------------------------------------------------

12 */

13#include "postgres_fe.h"

14

15#include "fe_utils/mbprint.h"

16

17#include "libpq-fe.h"

18

19

20/*

21 * To avoid version-skew problems, this file must not use declarations

22 * from pg_wchar.h: the encoding IDs we are dealing with are determined

23 * by the libpq.so we are linked with, and that might not match the

24 * numbers we see at compile time. (If this file were inside libpq,

25 * the problem would go away...)

26 *

27 * Hence, we have our own definition of pg_wchar, and we get the values

28 * of any needed encoding IDs on-the-fly.

29 */

30

/* Local stand-in for a wide character; pg_wchar.h is deliberately not used. */
typedef unsigned int pg_wchar;

/*
 * Return the encoding ID for "utf8" as reported by pg_char_to_encoding().
 *
 * The value is cached in a function-local static once a non-negative ID has
 * been obtained.  A negative result is not treated as cached, so the lookup
 * is repeated on the next call.
 */
static int
pg_get_utf8_id(void)
{
	static int	cached_id = -1;

	if (cached_id >= 0)
		return cached_id;

	cached_id = pg_char_to_encoding("utf8");
	return cached_id;
}

/* Expands to a function call, not a compile-time constant. */
#define PG_UTF8		pg_get_utf8_id()

44

45

46/*

47 * Convert a UTF-8 character to a Unicode code point.

48 * This is a one-character version of pg_utf2wchar_with_len.

49 *

50 * No error checks here, c must point to a long-enough string.

51 */

52static pg_wchar

53 utf8_to_unicode(const unsigned char *c)

54{

55 if ((*c & 0x80) == 0)

56 return (pg_wchar) c[0];

57 else if ((*c & 0xe0) == 0xc0)

58 return (pg_wchar) (((c[0] & 0x1f) << 6) |

59 (c[1] & 0x3f));

60 else if ((*c & 0xf0) == 0xe0)

61 return (pg_wchar) (((c[0] & 0x0f) << 12) |

62 ((c[1] & 0x3f) << 6) |

63 (c[2] & 0x3f));

64 else if ((*c & 0xf8) == 0xf0)

65 return (pg_wchar) (((c[0] & 0x07) << 18) |

66 ((c[1] & 0x3f) << 12) |

67 ((c[2] & 0x3f) << 6) |

68 (c[3] & 0x3f));

69 else

70 /* that is an invalid code on purpose */

71 return 0xffffffff;

72}

73

74

/*
 * Validate the UTF-8 sequence starting at c.
 *
 * Returns the length in bytes (1..4) of a valid sequence, or -1 if the
 * sequence is not acceptable.  Besides structural checks (continuation
 * bytes, overlong forms), the following code points are rejected:
 *	 - anything above U+10FFFF
 *	 - U+xxFFFE and U+xxFFFF in every plane
 *	 - the non-character block U+FDD0..U+FDEF
 *	 - the surrogate range U+D800..U+DFFF
 *
 * Trailing bytes are examined only after the preceding ones passed their
 * continuation-byte test, so on a NUL-terminated string no byte beyond the
 * terminator is read.
 */
static int
utf_charcheck(const unsigned char *c)
{
	if ((*c & 0x80) == 0)
		return 1;

	if ((*c & 0xe0) == 0xc0)
	{
		/* two-byte char; lead payload 0 or 1 would be an overlong form */
		if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01))
			return 2;
		return -1;
	}

	if ((*c & 0xf0) == 0xe0)
	{
		/*
		 * three-byte char; a zero lead payload requires bit 0x20 in the
		 * second byte, otherwise the value would fit in two bytes
		 */
		if (((c[1] & 0xc0) == 0x80) &&
			(((c[0] & 0x0f) != 0x00) || ((c[1] & 0x20) == 0x20)) &&
			((c[2] & 0xc0) == 0x80))
		{
			/* code point is 0xZYYX: z is the top nibble, yx the low 12 bits */
			int			z = c[0] & 0x0f;
			int			yx = ((c[1] & 0x3f) << 6) | (c[2] & 0x3f);

			if (z == 0x0f)
			{
				/* U+FFFE and U+FFFF */
				if ((yx & 0xffe) == 0xffe)
					return -1;
				/* non-characters U+FDD0..U+FDEF */
				if (yx >= 0xdd0 && yx <= 0xdef)
					return -1;
			}
			else if (z == 0x0d)
			{
				/* surrogates U+D800..U+DFFF */
				if ((yx & 0x800) != 0)
					return -1;
			}
			return 3;
		}
		return -1;
	}

	if ((*c & 0xf8) == 0xf0)
	{
		/* u is the plane number: must be 1..16 (0 would be overlong) */
		int			u = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4);

		/* four-byte char */
		if (((c[1] & 0xc0) == 0x80) &&
			(u > 0x00) && (u <= 0x10) &&
			((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80))
		{
			/* test for 0xzzzzfffe/0xzzzzffff */
			if (((c[1] & 0x0f) == 0x0f) && ((c[2] & 0x3f) == 0x3f) &&
				((c[3] & 0x3e) == 0x3e))
				return -1;
			return 4;
		}
		return -1;
	}

	/* stray continuation byte or a lead byte for a 5+ byte form */
	return -1;
}

133

134

/*
 * Strip, in place, every byte of the NUL-terminated string pwcs that
 * utf_charcheck() does not accept as part of a valid sequence.
 *
 * Valid sequences are compacted toward the start of the buffer; when a
 * check fails exactly one byte is skipped and checking resumes at the next
 * byte.  The string is re-terminated only if something was removed.
 */
static void
mb_utf_validate(unsigned char *pwcs)
{
	unsigned char *p = pwcs;	/* write position; never ahead of pwcs */

	while (*pwcs)
	{
		int			len = utf_charcheck(pwcs);

		if (len <= 0)
		{
			/* we skip the char */
			pwcs++;
			continue;
		}

		if (p != pwcs)
		{
			/* something was dropped earlier: shift this sequence down */
			int			i;

			for (i = 0; i < len; i++)
				*p++ = *pwcs++;
		}
		else
		{
			/* nothing dropped so far: the bytes are already in place */
			pwcs += len;
			p += len;
		}
	}

	/* pwcs now points at the original terminator; p at the new end */
	if (p != pwcs)
		*p = '\0';
}

166

167/*

168 * public functions : wcswidth and mbvalidate

169 */

170

/*
 * pg_wcswidth is the simple display-width function: it sums the display
 * widths of the characters in the first len bytes of pwcs, as if all of
 * them appeared on a single line.  Easier to use than pg_wcssize when that
 * assumption holds.
 *
 * Characters for which PQdsplen() reports a non-positive width add nothing.
 * Scanning stops early if a character claims more bytes than remain.
 */
int
pg_wcswidth(const char *pwcs, size_t len, int encoding)
{
	int			total = 0;

	while (len > 0)
	{
		int			nbytes = PQmblen(pwcs, encoding);
		int			cols;

		if ((size_t) nbytes > len)
			break;				/* Invalid string */

		cols = PQdsplen(pwcs, encoding);
		if (cols > 0)
			total += cols;

		len -= nbytes;
		pwcs += nbytes;
	}

	return total;
}

199

/*
 * pg_wcssize measures the given string in the given encoding and reports
 * up to three values (any of the result pointers may be NULL):
 *	  result_width: display width of the longest line in the string
 *	  result_height: number of lines in the display output
 *	  result_format_size: number of bytes needed to hold the formatted
 *		representation of the string, including one terminator per line
 *
 * Scanning stops at a NUL byte, after len bytes, or when a character claims
 * more bytes than remain.
 *
 * This MUST be kept in sync with pg_wcsformat!
 */
void
pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
		   int *result_width, int *result_height, int *result_format_size)
{
	int			cur_cols = 0;	/* width of the line being scanned */
	int			max_cols = 0;	/* widest completed line so far */
	int			nlines = 1;
	int			out_bytes = 0;

	while (*pwcs && len > 0)
	{
		int			chlen = PQmblen((const char *) pwcs, encoding);
		int			w;

		if (len < (size_t) chlen)
			break;
		w = PQdsplen((const char *) pwcs, encoding);

		if (chlen != 1)
		{
			if (w < 0)
			{
				/* Non-ascii control char, shown as \u0000 */
				cur_cols += 6;
				out_bytes += 6;
			}
			else
			{
				/* All other multibyte chars are copied verbatim */
				cur_cols += w;
				out_bytes += chlen;
			}
		}
		else
		{
			/* single-byte char */
			switch (*pwcs)
			{
				case '\n':		/* Newline: close the current line */
					if (cur_cols > max_cols)
						max_cols = cur_cols;
					cur_cols = 0;
					nlines += 1;
					out_bytes += 1; /* For NUL char */
					break;

				case '\r':		/* Carriage return, shown as two chars */
					cur_cols += 2;
					out_bytes += 2;
					break;

				case '\t':		/* Tab: pad to next multiple of 8 */
					do
					{
						cur_cols++;
						out_bytes++;
					} while (cur_cols % 8 != 0);
					break;

				default:
					if (w < 0)
					{
						/* Other control char, shown as four chars */
						cur_cols += 4;
						out_bytes += 4;
					}
					else
					{
						/* Output it as-is */
						cur_cols += w;
						out_bytes += 1;
					}
					break;
			}
		}

		pwcs += chlen;
		len -= chlen;
	}

	/* account for the last (unterminated) line */
	if (cur_cols > max_cols)
		max_cols = cur_cols;
	out_bytes += 1;				/* For NUL char */

	/* Set results */
	if (result_width)
		*result_width = max_cols;
	if (result_height)
		*result_height = nlines;
	if (result_format_size)
		*result_format_size = out_bytes;
}

286

287/*

288 * Format a string into one or more "struct lineptr" lines.

289 * lines[i].ptr == NULL indicates the end of the array.

290 *

291 * This MUST be kept in sync with pg_wcssize!

292 */

293void

294 pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding,

295 struct lineptr *lines, int count)

296{

297 int w,

298 chlen = 0;

299 int linewidth = 0;

300 unsigned char *ptr = lines->ptr; /* Pointer to data area */

301

302 for (; *pwcs && len > 0; pwcs += chlen)

303 {

304 chlen = PQmblen((const char *) pwcs, encoding);

305 if (len < (size_t) chlen)

306 break;

307 w = PQdsplen((const char *) pwcs, encoding);

308

309 if (chlen == 1) /* single-byte char */

310 {

311 if (*pwcs == '\n') /* Newline */

312 {

313 *ptr++ = '0円';

314 lines->width = linewidth;

315 linewidth = 0;

316 lines++;

317 count--;

318 if (count <= 0)

319 exit(1); /* Screwup */

320

321 /* make next line point to remaining memory */

322 lines->ptr = ptr;

323 }

324 else if (*pwcs == '\r') /* Linefeed */

325 {

326 strcpy((char *) ptr, "\\r");

327 linewidth += 2;

328 ptr += 2;

329 }

330 else if (*pwcs == '\t') /* Tab */

331 {

332 do

333 {

334 *ptr++ = ' ';

335 linewidth++;

336 } while (linewidth % 8 != 0);

337 }

338 else if (w < 0) /* Other control char */

339 {

340 sprintf((char *) ptr, "\\x%02X", *pwcs);

341 linewidth += 4;

342 ptr += 4;

343 }

344 else /* Output it as-is */

345 {

346 linewidth += w;

347 *ptr++ = *pwcs;

348 }

349 }

350 else if (w < 0) /* Non-ascii control char */

351 {

352 if (encoding == PG_UTF8)

353 sprintf((char *) ptr, "\\u%04X", utf8_to_unicode(pwcs));

354 else

355 {

356 /*

357 * This case cannot happen in the current code because only

358 * UTF-8 signals multibyte control characters. But we may need

359 * to support it at some stage

360 */

361 sprintf((char *) ptr, "\\u????");

362 }

363 ptr += 6;

364 linewidth += 6;

365 }

366 else /* All other chars */

367 {

368 int i;

369

370 for (i = 0; i < chlen; i++)

371 *ptr++ = pwcs[i];

372 linewidth += w;

373 }

374 len -= chlen;

375 }

376 lines->width = linewidth;

377 *ptr++ = '0円'; /* Terminate formatted string */

378

379 if (count <= 0)

380 exit(1); /* Screwup */

381

382 (lines + 1)->ptr = NULL; /* terminate line array */

383}

384

385

386/*

387 * Encoding validation: delete any unvalidatable characters from the string

388 *

389 * This seems redundant with existing functionality elsewhere?

390 */

391unsigned char *

392 mbvalidate(unsigned char *pwcs, int encoding)

393{

394 if (encoding == PG_UTF8)

395 mb_utf_validate(pwcs);

396 else

397 {

398 /*

399 * other encodings needing validation should add their own routines

400 * here

401 */

402 }

403

404 return pwcs;

405}

PQmblen

int PQmblen(const char *s, int encoding)

Definition: fe-misc.c:1255

PQdsplen

int PQdsplen(const char *s, int encoding)

Definition: fe-misc.c:1276

i

int i

Definition: isn.c:77

libpq-fe.h

PG_UTF8

#define PG_UTF8

Definition: mbprint.c:43

mb_utf_validate

static void mb_utf_validate(unsigned char *pwcs)

Definition: mbprint.c:136

pg_wcssize

void pg_wcssize(const unsigned char *pwcs, size_t len, int encoding, int *result_width, int *result_height, int *result_format_size)

Definition: mbprint.c:211

pg_wcswidth

int pg_wcswidth(const char *pwcs, size_t len, int encoding)

Definition: mbprint.c:177

pg_wcsformat

void pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding, struct lineptr *lines, int count)

Definition: mbprint.c:294

utf_charcheck

static int utf_charcheck(const unsigned char *c)

Definition: mbprint.c:82

pg_get_utf8_id

static int pg_get_utf8_id(void)

Definition: mbprint.c:34

utf8_to_unicode

static pg_wchar utf8_to_unicode(const unsigned char *c)

Definition: mbprint.c:53

pg_wchar

unsigned int pg_wchar

Definition: mbprint.c:31

mbvalidate

unsigned char * mbvalidate(unsigned char *pwcs, int encoding)

Definition: mbprint.c:392

mbprint.h

len

const void size_t len

Definition: pg_crc32c_sse42.c:28

encoding

int32 encoding

Definition: pg_database.h:41

pg_char_to_encoding

#define pg_char_to_encoding

Definition: pg_wchar.h:629

sprintf

#define sprintf

Definition: port.h:241

postgres_fe.h

c

char * c

Definition: preproc-cursor.c:31

lineptr

Definition: mbprint.h:17

lineptr::width

int width

Definition: mbprint.h:19

lineptr::ptr

unsigned char * ptr

Definition: mbprint.h:18

PostgreSQL Source Code: src/fe_utils/mbprint.c Source File