[フレーム]

unaccent.c

Go to the documentation of this file.

1/*-------------------------------------------------------------------------

2 *

3 * unaccent.c

4 * Text search unaccent dictionary

5 *

7 *

8 * IDENTIFICATION

9 * contrib/unaccent/unaccent.c

10 *

11 *-------------------------------------------------------------------------

12 */

13

14#include "postgres.h"

15

16#include "catalog/pg_ts_dict.h"

17#include "commands/defrem.h"

18#include "lib/stringinfo.h"

19#include "tsearch/ts_cache.h"

20#include "tsearch/ts_locale.h"

21#include "tsearch/ts_public.h"

22#include "utils/builtins.h"

23#include "utils/lsyscache.h"

24#include "utils/syscache.h"

25

26 PG_MODULE_MAGIC_EXT(

27 .name = "unaccent",

28 .version = PG_VERSION

29);

30

/*
 * Trie used by the unaccent dictionary to locate replacement strings.
 *
 * A trie node is an array of 256 TrieChar entries, indexed by the value of
 * the next input byte.  An entry may hold a replacement string (applicable
 * when the source string ends at this byte), a pointer to a deeper node
 * (followed when more source bytes remain), or both.
 *
 * Lookups work strictly byte by byte and ignore multibyte character
 * boundaries.  That is safe provided both the rules loaded into the trie and
 * the strings being looked up are validly encoded, since then no match can
 * end in the middle of a character.
 */
typedef struct TrieChar TrieChar;

struct TrieChar
{
	TrieChar   *nextChar;		/* deeper node (256 entries), or NULL */
	char	   *replaceTo;		/* replacement bytes, or NULL if no rule ends
								 * at this byte */
	int			replacelen;		/* number of bytes stored in replaceTo */
};

49

50/*

51 * placeChar - put str into trie's structure, byte by byte.

52 *

53 * If node is NULL, we need to make a new node, which will be returned;

54 * otherwise the return value is the same as node.

55 */

56static TrieChar *

57 placeChar(TrieChar *node, const unsigned char *str, int lenstr,

58 const char *replaceTo, int replacelen)

59{

60 TrieChar *curnode;

61

62 if (!node)

63 node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);

64

65 Assert(lenstr > 0); /* else str[0] doesn't exist */

66

67 curnode = node + *str;

68

69 if (lenstr <= 1)

70 {

71 if (curnode->replaceTo)

72 ereport(WARNING,

73 (errcode(ERRCODE_CONFIG_FILE_ERROR),

74 errmsg("duplicate source strings, first one will be used")));

75 else

76 {

77 curnode->replacelen = replacelen;

78 curnode->replaceTo = (char *) palloc(replacelen);

79 memcpy(curnode->replaceTo, replaceTo, replacelen);

80 }

81 }

82 else

83 {

84 curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,

85 replaceTo, replacelen);

86 }

87

88 return node;

89}

90

91/*

92 * initTrie - create trie from file.

93 *

94 * Function converts UTF8-encoded file into current encoding.

95 */

96static TrieChar *

97 initTrie(const char *filename)

98{

99 TrieChar *volatile rootTrie = NULL;

100 MemoryContext ccxt = CurrentMemoryContext;

101 tsearch_readline_state trst;

102 volatile bool skip;

103

104 filename = get_tsearch_config_filename(filename, "rules");

105 if (!tsearch_readline_begin(&trst, filename))

106 ereport(ERROR,

107 (errcode(ERRCODE_CONFIG_FILE_ERROR),

108 errmsg("could not open unaccent file \"%s\": %m",

109 filename)));

110

111 do

112 {

113 /*

114 * pg_do_encoding_conversion() (called by tsearch_readline()) will

115 * emit exception if it finds untranslatable characters in current

116 * locale. We just skip such lines, continuing with the next.

117 */

118 skip = true;

119

120 PG_TRY();

121 {

122 char *line;

123

124 while ((line = tsearch_readline(&trst)) != NULL)

125 {

126 /*----------

127 * The format of each line must be "src" or "src trg", where

128 * src and trg are sequences of one or more non-whitespace

129 * characters, separated by whitespace. Whitespace at start

130 * or end of line is ignored. If trg is omitted, an empty

131 * string is used as the replacement. trg can be optionally

132 * quoted, in which case whitespaces are included in it.

133 *

134 * We use a simple state machine, with states

135 * 0 initial (before src)

136 * 1 in src

137 * 2 in whitespace after src

138 * 3 in trg (non-quoted)

139 * 4 in trg (quoted)

140 * 5 in whitespace after trg

141 * -1 syntax error detected (two strings)

142 * -2 syntax error detected (unfinished quoted string)

143 *----------

144 */

145 int state;

146 char *ptr;

147 char *src = NULL;

148 char *trg = NULL;

149 char *trgstore = NULL;

150 int ptrlen;

151 int srclen = 0;

152 int trglen = 0;

153 int trgstorelen = 0;

154 bool trgquoted = false;

155

156 state = 0;

157 for (ptr = line; *ptr; ptr += ptrlen)

158 {

159 ptrlen = pg_mblen(ptr);

160 /* ignore whitespace, but end src or trg */

161 if (isspace((unsigned char) *ptr))

162 {

163 if (state == 1)

164 state = 2;

165 else if (state == 3)

166 state = 5;

167 /* whitespaces are OK in quoted area */

168 if (state != 4)

169 continue;

170 }

171 switch (state)

172 {

173 case 0:

174 /* start of src */

175 src = ptr;

176 srclen = ptrlen;

177 state = 1;

178 break;

179 case 1:

180 /* continue src */

181 srclen += ptrlen;

182 break;

183 case 2:

184 /* start of trg */

185 if (*ptr == '"')

186 {

187 trgquoted = true;

188 state = 4;

189 }

190 else

191 state = 3;

192

193 trg = ptr;

194 trglen = ptrlen;

195 break;

196 case 3:

197 /* continue non-quoted trg */

198 trglen += ptrlen;

199 break;

200 case 4:

201 /* continue quoted trg */

202 trglen += ptrlen;

203

204 /*

205 * If this is a quote, consider it as the end of

206 * trg except if the follow-up character is itself

207 * a quote.

208 */

209 if (*ptr == '"')

210 {

211 if (*(ptr + 1) == '"')

212 {

213 ptr++;

214 trglen += 1;

215 }

216 else

217 state = 5;

218 }

219 break;

220 default:

221 /* bogus line format */

222 state = -1;

223 break;

224 }

225 }

226

227 if (state == 1 || state == 2)

228 {

229 /* trg was omitted, so use "" */

230 trg = "";

231 trglen = 0;

232 }

233

234 /* If still in a quoted area, fallback to an error */

235 if (state == 4)

236 state = -2;

237

238 /* If trg was quoted, remove its quotes and unescape it */

239 if (trgquoted && state > 0)

240 {

241 /* Ignore first and end quotes */

242 trgstore = (char *) palloc(sizeof(char) * (trglen - 2));

243 trgstorelen = 0;

244 for (int i = 1; i < trglen - 1; i++)

245 {

246 trgstore[trgstorelen] = trg[i];

247 trgstorelen++;

248 /* skip second double quotes */

249 if (trg[i] == '"' && trg[i + 1] == '"')

250 i++;

251 }

252 }

253 else

254 {

255 trgstore = (char *) palloc(sizeof(char) * trglen);

256 trgstorelen = trglen;

257 memcpy(trgstore, trg, trgstorelen);

258 }

259

260 if (state > 0)

261 rootTrie = placeChar(rootTrie,

262 (unsigned char *) src, srclen,

263 trgstore, trgstorelen);

264 else if (state == -1)

265 ereport(WARNING,

266 (errcode(ERRCODE_CONFIG_FILE_ERROR),

267 errmsg("invalid syntax: more than two strings in unaccent rule")));

268 else if (state == -2)

269 ereport(WARNING,

270 (errcode(ERRCODE_CONFIG_FILE_ERROR),

271 errmsg("invalid syntax: unfinished quoted string in unaccent rule")));

272

273 pfree(trgstore);

274 pfree(line);

275 }

276 skip = false;

277 }

278 PG_CATCH();

279 {

280 ErrorData *errdata;

281 MemoryContext ecxt;

282

283 ecxt = MemoryContextSwitchTo(ccxt);

284 errdata = CopyErrorData();

285 if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)

286 {

287 FlushErrorState();

288 }

289 else

290 {

291 MemoryContextSwitchTo(ecxt);

292 PG_RE_THROW();

293 }

294 }

295 PG_END_TRY();

296 }

297 while (skip);

298

299 tsearch_readline_end(&trst);

300

301 return rootTrie;

302}

303

304/*

305 * findReplaceTo - find longest possible match in trie

306 *

307 * On success, returns pointer to ending subnode, plus length of matched

308 * source string in *p_matchlen. On failure, returns NULL.

309 */

310static TrieChar *

311 findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,

312 int *p_matchlen)

313{

314 TrieChar *result = NULL;

315 int matchlen = 0;

316

317 *p_matchlen = 0; /* prevent uninitialized-variable warnings */

318

319 while (node && matchlen < srclen)

320 {

321 node = node + src[matchlen];

322 matchlen++;

323

324 if (node->replaceTo)

325 {

326 result = node;

327 *p_matchlen = matchlen;

328 }

329

330 node = node->nextChar;

331 }

332

333 return result;

334}

335

336 PG_FUNCTION_INFO_V1(unaccent_init);

337Datum

338 unaccent_init(PG_FUNCTION_ARGS)

339{

340 List *dictoptions = (List *) PG_GETARG_POINTER(0);

341 TrieChar *rootTrie = NULL;

342 bool fileloaded = false;

343 ListCell *l;

344

345 foreach(l, dictoptions)

346 {

347 DefElem *defel = (DefElem *) lfirst(l);

348

349 if (strcmp(defel->defname, "rules") == 0)

350 {

351 if (fileloaded)

352 ereport(ERROR,

353 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),

354 errmsg("multiple Rules parameters")));

355 rootTrie = initTrie(defGetString(defel));

356 fileloaded = true;

357 }

358 else

359 {

360 ereport(ERROR,

361 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),

362 errmsg("unrecognized Unaccent parameter: \"%s\"",

363 defel->defname)));

364 }

365 }

366

367 if (!fileloaded)

368 {

369 ereport(ERROR,

370 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),

371 errmsg("missing Rules parameter")));

372 }

373

374 PG_RETURN_POINTER(rootTrie);

375}

376

377 PG_FUNCTION_INFO_V1(unaccent_lexize);

378Datum

379 unaccent_lexize(PG_FUNCTION_ARGS)

380{

381 TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);

382 char *srcchar = (char *) PG_GETARG_POINTER(1);

383 int32 len = PG_GETARG_INT32(2);

384 char *srcstart = srcchar;

385 TSLexeme *res;

386 StringInfoData buf;

387

388 /* we allocate storage for the buffer only if needed */

389 buf.data = NULL;

390

391 while (len > 0)

392 {

393 TrieChar *node;

394 int matchlen;

395

396 node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,

397 &matchlen);

398 if (node && node->replaceTo)

399 {

400 if (buf.data == NULL)

401 {

402 /* initialize buffer */

403 initStringInfo(&buf);

404 /* insert any data we already skipped over */

405 if (srcchar != srcstart)

406 appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);

407 }

408 appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);

409 }

410 else

411 {

412 matchlen = pg_mblen(srcchar);

413 if (buf.data != NULL)

414 appendBinaryStringInfo(&buf, srcchar, matchlen);

415 }

416

417 srcchar += matchlen;

418 len -= matchlen;

419 }

420

421 /* return a result only if we made at least one substitution */

422 if (buf.data != NULL)

423 {

424 res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);

425 res->lexeme = buf.data;

426 res->flags = TSL_FILTER;

427 }

428 else

429 res = NULL;

430

431 PG_RETURN_POINTER(res);

432}

433

434/*

435 * Function-like wrapper for dictionary

436 */

437 PG_FUNCTION_INFO_V1(unaccent_dict);

438Datum

439 unaccent_dict(PG_FUNCTION_ARGS)

440{

441 text *str;

442 int strArg;

443 Oid dictOid;

444 TSDictionaryCacheEntry *dict;

445 TSLexeme *res;

446

447 if (PG_NARGS() == 1)

448 {

449 /*

450 * Use the "unaccent" dictionary that is in the same schema that this

451 * function is in.

452 */

453 Oid procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);

454 const char *dictname = "unaccent";

455

456 dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,

457 PointerGetDatum(dictname),

458 ObjectIdGetDatum(procnspid));

459 if (!OidIsValid(dictOid))

460 ereport(ERROR,

461 (errcode(ERRCODE_UNDEFINED_OBJECT),

462 errmsg("text search dictionary \"%s.%s\" does not exist",

463 get_namespace_name(procnspid), dictname)));

464 strArg = 0;

465 }

466 else

467 {

468 dictOid = PG_GETARG_OID(0);

469 strArg = 1;

470 }

471 str = PG_GETARG_TEXT_PP(strArg);

472

473 dict = lookup_ts_dictionary_cache(dictOid);

474

475 res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),

476 PointerGetDatum(dict->dictData),

477 PointerGetDatum(VARDATA_ANY(str)),

478 Int32GetDatum(VARSIZE_ANY_EXHDR(str)),

479 PointerGetDatum(NULL)));

480

481 PG_FREE_IF_COPY(str, strArg);

482

483 if (res == NULL)

484 {

485 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));

486 }

487 else if (res->lexeme == NULL)

488 {

489 pfree(res);

490 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));

491 }

492 else

493 {

494 text *txt = cstring_to_text(res->lexeme);

495

496 pfree(res->lexeme);

497 pfree(res);

498

499 PG_RETURN_TEXT_P(txt);

500 }

501}

builtins.h

int32

int32_t int32

Definition: c.h:534

OidIsValid

#define OidIsValid(objectId)

Definition: c.h:774

defGetString

char * defGetString(DefElem *def)

Definition: define.c:35

defrem.h

CopyErrorData

ErrorData * CopyErrorData(void)

Definition: elog.c:1754

FlushErrorState

void FlushErrorState(void)

Definition: elog.c:1875

errcode

int errcode(int sqlerrcode)

Definition: elog.c:854

errmsg

int errmsg(const char *fmt,...)

Definition: elog.c:1071

PG_RE_THROW

#define PG_RE_THROW()

Definition: elog.h:405

PG_TRY

#define PG_TRY(...)

Definition: elog.h:372

WARNING

#define WARNING

Definition: elog.h:36

PG_END_TRY

#define PG_END_TRY(...)

Definition: elog.h:397

ERROR

#define ERROR

Definition: elog.h:39

PG_CATCH

#define PG_CATCH(...)

Definition: elog.h:382

ereport

#define ereport(elevel,...)

Definition: elog.h:150

PG_FREE_IF_COPY

#define PG_FREE_IF_COPY(ptr, n)

Definition: fmgr.h:260

PG_GETARG_OID

#define PG_GETARG_OID(n)

Definition: fmgr.h:275

PG_GETARG_TEXT_PP

#define PG_GETARG_TEXT_PP(n)

Definition: fmgr.h:309

PG_GETARG_POINTER

#define PG_GETARG_POINTER(n)

Definition: fmgr.h:276

PG_NARGS

#define PG_NARGS()

Definition: fmgr.h:203

FunctionCall4

#define FunctionCall4(flinfo, arg1, arg2, arg3, arg4)

Definition: fmgr.h:706

PG_RETURN_TEXT_P

#define PG_RETURN_TEXT_P(x)

Definition: fmgr.h:372

PG_GETARG_INT32

#define PG_GETARG_INT32(n)

Definition: fmgr.h:269

PG_RETURN_POINTER

#define PG_RETURN_POINTER(x)

Definition: fmgr.h:361

PG_GETARG_TEXT_P_COPY

#define PG_GETARG_TEXT_P_COPY(n)

Definition: fmgr.h:315

PG_FUNCTION_ARGS

#define PG_FUNCTION_ARGS

Definition: fmgr.h:193

Assert

Assert(PointerIsAligned(start, uint64))

str

const char * str

Definition: hashfn_unstable.h:254

i

int i

Definition: isn.c:77

get_func_namespace

Oid get_func_namespace(Oid funcid)

Definition: lsyscache.c:1799

get_namespace_name

char * get_namespace_name(Oid nspid)

Definition: lsyscache.c:3533

lsyscache.h

pg_mblen

int pg_mblen(const char *mbstr)

Definition: mbutils.c:1024

pfree

void pfree(void *pointer)

Definition: mcxt.c:1594

palloc0

void * palloc0(Size size)

Definition: mcxt.c:1395

palloc

void * palloc(Size size)

Definition: mcxt.c:1365

CurrentMemoryContext

MemoryContext CurrentMemoryContext

Definition: mcxt.c:160

MemoryContextSwitchTo

static MemoryContext MemoryContextSwitchTo(MemoryContext context)

Definition: palloc.h:124

skip

static const struct exclude_list_item skip[]

Definition: pg_checksums.c:107

len

const void size_t len

Definition: pg_crc32c_sse42.c:28

filename

static char * filename

Definition: pg_dumpall.c:120

lfirst

#define lfirst(lc)

Definition: pg_list.h:172

buf

static char * buf

Definition: pg_test_fsync.c:72

pg_ts_dict.h

postgres.h

PointerGetDatum

static Datum PointerGetDatum(const void *X)

Definition: postgres.h:332

ObjectIdGetDatum

static Datum ObjectIdGetDatum(Oid X)

Definition: postgres.h:262

Datum

uint64_t Datum

Definition: postgres.h:70

DatumGetPointer

static Pointer DatumGetPointer(Datum X)

Definition: postgres.h:322

Int32GetDatum

static Datum Int32GetDatum(int32 X)

Definition: postgres.h:222

Oid

unsigned int Oid

Definition: postgres_ext.h:32

appendBinaryStringInfo

void appendBinaryStringInfo(StringInfo str, const void *data, int datalen)

Definition: stringinfo.c:281

initStringInfo

void initStringInfo(StringInfo str)

Definition: stringinfo.c:97

stringinfo.h

DefElem

Definition: parsenodes.h:840

DefElem::defname

char * defname

Definition: parsenodes.h:843

ErrorData

Definition: elog.h:420

ErrorData::sqlerrcode

int sqlerrcode

Definition: elog.h:431

List

Definition: pg_list.h:54

MemoryContextData

Definition: memnodes.h:118

StringInfoData

Definition: stringinfo.h:47

TSDictionaryCacheEntry

Definition: ts_cache.h:52

TSDictionaryCacheEntry::dictData

void * dictData

Definition: ts_cache.h:62

TSDictionaryCacheEntry::lexize

FmgrInfo lexize

Definition: ts_cache.h:59

TSLexeme

Definition: ts_public.h:116

TSLexeme::lexeme

char * lexeme

Definition: ts_public.h:138

TSLexeme::flags

uint16 flags

Definition: ts_public.h:136

TrieChar

Definition: unaccent.c:44

TrieChar::nextChar

struct TrieChar * nextChar

Definition: unaccent.c:45

TrieChar::replacelen

int replacelen

Definition: unaccent.c:47

TrieChar::replaceTo

char * replaceTo

Definition: unaccent.c:46

state

Definition: regguts.h:323

tsearch_readline_state

Definition: ts_locale.h:25

varlena

Definition: c.h:692

syscache.h

GetSysCacheOid2

#define GetSysCacheOid2(cacheId, oidcol, key1, key2)

Definition: syscache.h:111

lookup_ts_dictionary_cache

TSDictionaryCacheEntry * lookup_ts_dictionary_cache(Oid dictId)

Definition: ts_cache.c:208

ts_cache.h

tsearch_readline_begin

bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)

Definition: ts_locale.c:89

tsearch_readline

char * tsearch_readline(tsearch_readline_state *stp)

Definition: ts_locale.c:112

tsearch_readline_end

void tsearch_readline_end(tsearch_readline_state *stp)

Definition: ts_locale.c:157

ts_locale.h

ts_public.h

TSL_FILTER

#define TSL_FILTER

Definition: ts_public.h:144

get_tsearch_config_filename

char * get_tsearch_config_filename(const char *basename, const char *extension)

Definition: ts_utils.c:34

initTrie

static TrieChar * initTrie(const char *filename)

Definition: unaccent.c:97

unaccent_init

Datum unaccent_init(PG_FUNCTION_ARGS)

Definition: unaccent.c:338

PG_FUNCTION_INFO_V1

PG_FUNCTION_INFO_V1(unaccent_init)

TrieChar

struct TrieChar TrieChar

unaccent_lexize

Datum unaccent_lexize(PG_FUNCTION_ARGS)

Definition: unaccent.c:379

placeChar

static TrieChar * placeChar(TrieChar *node, const unsigned char *str, int lenstr, const char *replaceTo, int replacelen)

Definition: unaccent.c:57

unaccent_dict

Datum unaccent_dict(PG_FUNCTION_ARGS)

Definition: unaccent.c:439

findReplaceTo

static TrieChar * findReplaceTo(TrieChar *node, const unsigned char *src, int srclen, int *p_matchlen)

Definition: unaccent.c:311

PG_MODULE_MAGIC_EXT

PG_MODULE_MAGIC_EXT(.name="unaccent",.version=PG_VERSION)

ListCell

Definition: pg_list.h:46

VARSIZE_ANY_EXHDR

static Size VARSIZE_ANY_EXHDR(const void *PTR)

Definition: varatt.h:472

VARDATA_ANY

static char * VARDATA_ANY(const void *PTR)

Definition: varatt.h:486

cstring_to_text

text * cstring_to_text(const char *s)

Definition: varlena.c:181

name

const char * name

Definition: wait_event_funcs.c:28

PostgreSQL Source Code: contrib/unaccent/unaccent.c Source File