[フレーム]

unicode_case.c

Go to the documentation of this file.

/*-------------------------------------------------------------------------
 * unicode_case.c
 *	  Unicode case mapping and case conversion.
 *
 *
 * IDENTIFICATION
 *	  src/common/unicode_case.c
 *
 *-------------------------------------------------------------------------
 */

12#ifndef FRONTEND

13#include "postgres.h"

14#else

15#include "postgres_fe.h"

16#endif

17

18#include "common/unicode_case.h"

19#include "common/unicode_case_table.h"

20#include "common/unicode_category.h"

21#include "mb/pg_wchar.h"

22

/*
 * Outcome of a casemap() lookup; tells the caller which of casemap()'s
 * output arguments, if any, was filled in.
 */
enum CaseMapResult
{
	CASEMAP_SELF = 0,			/* no mapping found; copy the character as is */
	CASEMAP_SIMPLE,				/* 'simple' holds one replacement code point */
	CASEMAP_SPECIAL,			/* 'special' points to an array of up to
								 * MAX_CASE_EXPANSION code points */
};

29

30/*

31 * Map for each case kind.

32 */

33 static const pg_wchar *const casekind_map[NCaseKind] =

34{

35 [CaseLower] = case_map_lower,

36 [CaseTitle] = case_map_title,

37 [CaseUpper] = case_map_upper,

38 [CaseFold] = case_map_fold,

39};

40

41static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map);

42static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,

43 CaseKind str_casekind, bool full, WordBoundaryNext wbnext,

44 void *wbstate);

45static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,

46 const char *src, size_t srclen, size_t srcoff,

47 pg_wchar *simple, const pg_wchar **special);

48

49pg_wchar

50 unicode_lowercase_simple(pg_wchar code)

51{

52 pg_wchar cp = find_case_map(code, case_map_lower);

53

54 return cp != 0 ? cp : code;

55}

56

57pg_wchar

58 unicode_titlecase_simple(pg_wchar code)

59{

60 pg_wchar cp = find_case_map(code, case_map_title);

61

62 return cp != 0 ? cp : code;

63}

64

65pg_wchar

66 unicode_uppercase_simple(pg_wchar code)

67{

68 pg_wchar cp = find_case_map(code, case_map_upper);

69

70 return cp != 0 ? cp : code;

71}

72

73pg_wchar

74 unicode_casefold_simple(pg_wchar code)

75{

76 pg_wchar cp = find_case_map(code, case_map_fold);

77

78 return cp != 0 ? cp : code;

79}

80

81/*

82 * unicode_strlower()

83 *

84 * Convert src to lowercase, and return the result length (not including

85 * terminating NUL).

86 *

87 * String src must be encoded in UTF-8. If srclen < 0, src must be

88 * NUL-terminated.

89 *

90 * Result string is stored in dst, truncating if larger than dstsize. If

91 * dstsize is greater than the result length, dst will be NUL-terminated;

92 * otherwise not.

93 *

94 * If dstsize is zero, dst may be NULL. This is useful for calculating the

95 * required buffer size before allocating.

96 *

97 * If full is true, use special case mappings if available and if the

98 * conditions are satisfied.

99 */

100size_t

101 unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,

102 bool full)

103{

104 return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,

105 NULL);

106}

107

108/*

109 * unicode_strtitle()

110 *

111 * Convert src to titlecase, and return the result length (not including

112 * terminating NUL).

113 *

114 * String src must be encoded in UTF-8. If srclen < 0, src must be

115 * NUL-terminated.

116 *

117 * Result string is stored in dst, truncating if larger than dstsize. If

118 * dstsize is greater than the result length, dst will be NUL-terminated;

119 * otherwise not.

120 *

121 * If dstsize is zero, dst may be NULL. This is useful for calculating the

122 * required buffer size before allocating.

123 *

124 * If full is true, use special case mappings if available and if the

125 * conditions are satisfied. Otherwise, use only simple mappings and use

126 * uppercase instead of titlecase.

127 *

128 * Titlecasing requires knowledge about word boundaries, which is provided by

129 * the callback wbnext. A word boundary is the offset of the start of a word

130 * or the offset of the character immediately following a word.

131 *

132 * The caller is expected to initialize and free the callback state

133 * wbstate. The callback should first return offset 0 for the first boundary;

134 * then the offset of each subsequent word boundary; then the total length of

135 * the string to indicate the final boundary.

136 */

137size_t

138 unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,

139 bool full, WordBoundaryNext wbnext, void *wbstate)

140{

141 return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,

142 wbstate);

143}

144

145/*

146 * unicode_strupper()

147 *

148 * Convert src to uppercase, and return the result length (not including

149 * terminating NUL).

150 *

151 * String src must be encoded in UTF-8. If srclen < 0, src must be

152 * NUL-terminated.

153 *

154 * Result string is stored in dst, truncating if larger than dstsize. If

155 * dstsize is greater than the result length, dst will be NUL-terminated;

156 * otherwise not.

157 *

158 * If dstsize is zero, dst may be NULL. This is useful for calculating the

159 * required buffer size before allocating.

160 *

161 * If full is true, use special case mappings if available and if the

162 * conditions are satisfied.

163 */

164size_t

165 unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,

166 bool full)

167{

168 return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,

169 NULL);

170}

171

172/*

173 * unicode_strfold()

174 *

175 * Case fold src, and return the result length (not including terminating

176 * NUL).

177 *

178 * String src must be encoded in UTF-8. If srclen < 0, src must be

179 * NUL-terminated.

180 *

181 * Result string is stored in dst, truncating if larger than dstsize. If

182 * dstsize is greater than the result length, dst will be NUL-terminated;

183 * otherwise not.

184 *

185 * If dstsize is zero, dst may be NULL. This is useful for calculating the

186 * required buffer size before allocating.

187 */

188size_t

189 unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,

190 bool full)

191{

192 return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,

193 NULL);

194}

195

196/*

197 * Implement Unicode Default Case Conversion algorithm.

198 *

199 * If str_casekind is CaseLower or CaseUpper, map each character in the string

200 * for which a mapping is available.

201 *

202 * If str_casekind is CaseTitle, maps characters found on a word boundary to

203 * titlecase (or uppercase if full is false) and other characters to

204 * lowercase. NB: does not currently implement the Unicode behavior in which

205 * the word boundary is adjusted to the next Cased character. That behavior

206 * could be implemented as an option, but it doesn't match the default

207 * behavior of ICU, nor does it match the documented behavior of INITCAP().

208 *

209 * If full is true, use special mappings for relevant characters, which can

210 * map a single codepoint to multiple codepoints, or depend on conditions.

211 */

212static size_t

213 convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,

214 CaseKind str_casekind, bool full, WordBoundaryNext wbnext,

215 void *wbstate)

216{

217 /* character CaseKind varies while titlecasing */

218 CaseKind chr_casekind = str_casekind;

219 size_t srcoff = 0;

220 size_t result_len = 0;

221 size_t boundary = 0;

222

223 Assert((str_casekind == CaseTitle && wbnext && wbstate) ||

224 (str_casekind != CaseTitle && !wbnext && !wbstate));

225

226 if (str_casekind == CaseTitle)

227 {

228 boundary = wbnext(wbstate);

229 Assert(boundary == 0); /* start of text is always a boundary */

230 }

231

232 while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '0円')

233 {

234 pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);

235 int u1len = unicode_utf8len(u1);

236 pg_wchar simple = 0;

237 const pg_wchar *special = NULL;

238 enum CaseMapResult casemap_result;

239

240 if (str_casekind == CaseTitle)

241 {

242 if (srcoff == boundary)

243 {

244 chr_casekind = full ? CaseTitle : CaseUpper;

245 boundary = wbnext(wbstate);

246 }

247 else

248 chr_casekind = CaseLower;

249 }

250

251 casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff,

252 &simple, &special);

253

254 switch (casemap_result)

255 {

256 case CASEMAP_SELF:

257 /* no mapping; copy bytes from src */

258 Assert(simple == 0);

259 Assert(special == NULL);

260 if (result_len + u1len <= dstsize)

261 memcpy(dst + result_len, src + srcoff, u1len);

262

263 result_len += u1len;

264 break;

265 case CASEMAP_SIMPLE:

266 {

267 /* replace with single character */

268 pg_wchar u2 = simple;

269 pg_wchar u2len = unicode_utf8len(u2);

270

271 Assert(special == NULL);

272 if (result_len + u2len <= dstsize)

273 unicode_to_utf8(u2, (unsigned char *) dst + result_len);

274

275 result_len += u2len;

276 }

277 break;

278 case CASEMAP_SPECIAL:

279 /* replace with up to MAX_CASE_EXPANSION characters */

280 Assert(simple == 0);

281 for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)

282 {

283 pg_wchar u2 = special[i];

284 size_t u2len = unicode_utf8len(u2);

285

286 if (result_len + u2len <= dstsize)

287 unicode_to_utf8(u2, (unsigned char *) dst + result_len);

288

289 result_len += u2len;

290 }

291 break;

292 }

293

294 srcoff += u1len;

295 }

296

297 if (result_len < dstsize)

298 dst[result_len] = '0円';

299

300 return result_len;

301}

302

303/*

304 * Check that the condition matches Final_Sigma, described in Unicode Table

305 * 3-17. The character at the given offset must be directly preceded by a

306 * Cased character, and must not be directly followed by a Cased character.

307 *

308 * Case_Ignorable characters are ignored. NB: some characters may be both

309 * Cased and Case_Ignorable, in which case they are ignored.

310 */

311static bool

312 check_final_sigma(const unsigned char *str, size_t len, size_t offset)

313{

314 /* the start of the string is not preceded by a Cased character */

315 if (offset == 0)

316 return false;

317

318 /* iterate backwards, looking for Cased character */

319 for (int i = offset - 1; i >= 0; i--)

320 {

321 if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)

322 {

323 pg_wchar curr = utf8_to_unicode(str + i);

324

325 if (pg_u_prop_case_ignorable(curr))

326 continue;

327 else if (pg_u_prop_cased(curr))

328 break;

329 else

330 return false;

331 }

332 else if ((str[i] & 0xC0) == 0x80)

333 continue;

334

335 Assert(false); /* invalid UTF-8 */

336 }

337

338 /* end of string is not followed by a Cased character */

339 if (offset == len)

340 return true;

341

342 /* iterate forwards, looking for Cased character */

343 for (int i = offset + 1; i < len && str[i] != '0円'; i++)

344 {

345 if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)

346 {

347 pg_wchar curr = utf8_to_unicode(str + i);

348

349 if (pg_u_prop_case_ignorable(curr))

350 continue;

351 else if (pg_u_prop_cased(curr))

352 return false;

353 else

354 break;

355 }

356 else if ((str[i] & 0xC0) == 0x80)

357 continue;

358

359 Assert(false); /* invalid UTF-8 */

360 }

361

362 return true;

363}

364

365/*

366 * Unicode allows for special casing to be applied only under certain

367 * circumstances. The only currently-supported condition is Final_Sigma.

368 */

369static bool

370 check_special_conditions(int conditions, const char *str, size_t len,

371 size_t offset)

372{

373 if (conditions == 0)

374 return true;

375 else if (conditions == PG_U_FINAL_SIGMA)

376 return check_final_sigma((unsigned char *) str, len, offset);

377

378 /* no other conditions supported */

379 Assert(false);

380 return false;

381}

382

383/*

384 * Map the given character to the requested case.

385 *

386 * If full is true, and a special case mapping is found and the conditions are

387 * met, 'special' is set to the mapping result (which is an array of up to

388 * MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned.

389 *

390 * Otherwise, search for a simple mapping, and if found, set 'simple' to the

391 * result and return CASEMAP_SIMPLE.

392 *

393 * If no mapping is found, return CASEMAP_SELF, and the caller should copy the

394 * character without modification.

395 */

396static enum CaseMapResult

397 casemap(pg_wchar u1, CaseKind casekind, bool full,

398 const char *src, size_t srclen, size_t srcoff,

399 pg_wchar *simple, const pg_wchar **special)

400{

401 uint16 idx;

402

403 /* Fast path for codepoints < 0x80 */

404 if (u1 < 0x80)

405 {

406 /*

407 * The first elements in all tables are reserved as 0 (as NULL). The

408 * data starts at index 1, not 0.

409 */

410 *simple = casekind_map[casekind][u1 + 1];

411

412 return CASEMAP_SIMPLE;

413 }

414

415 idx = case_index(u1);

416

417 if (idx == 0)

418 return CASEMAP_SELF;

419

420 if (full && case_map_special[idx] &&

421 check_special_conditions(special_case[case_map_special[idx]].conditions,

422 src, srclen, srcoff))

423 {

424 *special = special_case[case_map_special[idx]].map[casekind];

425 return CASEMAP_SPECIAL;

426 }

427

428 *simple = casekind_map[casekind][idx];

429

430 return CASEMAP_SIMPLE;

431}

432

433/*

434 * Find entry in simple case map.

435 * If the entry does not exist, 0 will be returned.

436 */

437static pg_wchar

438 find_case_map(pg_wchar ucs, const pg_wchar *map)

439{

440 /* Fast path for codepoints < 0x80 */

441 if (ucs < 0x80)

442 /* The first elements in all tables are reserved as 0 (as NULL). */

443 return map[ucs + 1];

444 return map[case_index(ucs)];

445}

idx

Datum idx(PG_FUNCTION_ARGS)

Definition: _int_op.c:262

uint16

uint16_t uint16

Definition: c.h:537

Assert

Assert(PointerIsAligned(start, uint64))

str

const char * str

Definition: hashfn_unstable.h:254

i

int i

Definition: isn.c:77

utf8_to_unicode

static pg_wchar utf8_to_unicode(const unsigned char *c)

Definition: mbprint.c:53

pg_wchar

unsigned int pg_wchar

Definition: mbprint.c:31

len

const void size_t len

Definition: pg_crc32c_sse42.c:28

pg_wchar.h

unicode_to_utf8

static unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)

Definition: pg_wchar.h:575

unicode_utf8len

static int unicode_utf8len(pg_wchar c)

Definition: pg_wchar.h:607

postgres.h

postgres_fe.h

pg_special_case::map

pg_wchar map[NCaseKind][MAX_CASE_EXPANSION]

Definition: unicode_case_table.h:48

unicode_uppercase_simple

pg_wchar unicode_uppercase_simple(pg_wchar code)

Definition: unicode_case.c:66

unicode_titlecase_simple

pg_wchar unicode_titlecase_simple(pg_wchar code)

Definition: unicode_case.c:58

unicode_strupper

size_t unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)

Definition: unicode_case.c:165

casemap

static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full, const char *src, size_t srclen, size_t srcoff, pg_wchar *simple, const pg_wchar **special)

Definition: unicode_case.c:397

unicode_strlower

size_t unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)

Definition: unicode_case.c:101

convert_case

static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, CaseKind str_casekind, bool full, WordBoundaryNext wbnext, void *wbstate)

Definition: unicode_case.c:213

unicode_strtitle

size_t unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full, WordBoundaryNext wbnext, void *wbstate)

Definition: unicode_case.c:138

find_case_map

static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map)

Definition: unicode_case.c:438

check_special_conditions

static bool check_special_conditions(int conditions, const char *str, size_t len, size_t offset)

Definition: unicode_case.c:370

casekind_map

static const pg_wchar *const casekind_map[NCaseKind]