1/*-------------------------------------------------------------------------
4 * Text search unaccent dictionary
6 * Copyright (c) 2009-2025, PostgreSQL Global Development Group
9 * contrib/unaccent/unaccent.c
11 *-------------------------------------------------------------------------
32 * An unaccent dictionary uses a trie to find a string to replace. Each node
33 * of the trie is an array of 256 TrieChar structs; the N-th element of the
34 * array corresponds to next byte value N. That element can contain both a
35 * replacement string (to be used if the source string ends with this byte)
36 * and a link to another trie node (to be followed if there are more bytes).
38 * Note that the trie search logic pays no attention to multibyte character
39 * boundaries. This is OK as long as both the data entered into the trie and
40 * the data we're trying to look up are validly encoded; no partial-character
51 * placeChar - put str into trie's structure, byte by byte.
53 * If node is NULL, we need to make a new node, which will be returned;
54 * otherwise the return value is the same as node.
65 Assert(lenstr > 0);
/* else str[0] doesn't exist */
67 curnode = node + *
str;
73 (
errcode(ERRCODE_CONFIG_FILE_ERROR),
74 errmsg(
"duplicate source strings, first one will be used")));
92 * initTrie - create trie from file.
94 * Function converts UTF8-encoded file into current encoding.
107 (
errcode(ERRCODE_CONFIG_FILE_ERROR),
108 errmsg(
"could not open unaccent file \"%s\": %m",
114 * pg_do_encoding_conversion() (called by tsearch_readline()) will
115 * emit an exception if it finds characters that are untranslatable
116 * into the current encoding. We just skip such lines, continuing with the next.
127 * The format of each line must be "src" or "src trg", where
128 * src and trg are sequences of one or more non-whitespace
129 * characters, separated by whitespace. Whitespace at start
130 * or end of line is ignored. If trg is omitted, an empty
131 * string is used as the replacement. trg can be optionally
132 * quoted, in which case whitespaces are included in it.
134 * We use a simple state machine, with states
135 * 0 initial (before src)
137 * 2 in whitespace after src
138 * 3 in trg (non-quoted)
140 * 5 in whitespace after trg
141 * -1 syntax error detected (more than two strings)
142 * -2 syntax error detected (unfinished quoted string)
149 char *trgstore = NULL;
154 bool trgquoted =
false;
157 for (ptr = line; *ptr; ptr += ptrlen)
160 /* ignore whitespace, but end src or trg */
161 if (isspace((
unsigned char) *ptr))
167 /* whitespace is OK in a quoted area */
197 /* continue non-quoted trg */
201 /* continue quoted trg */
205 * If this is a quote, consider it as the end of
206 * trg except if the follow-up character is itself
211 if (*(ptr + 1) ==
'"')
221 /* bogus line format */
229 /* trg was omitted, so use "" */
234 /* If still in a quoted area, fall back to an error */
238 /* If trg was quoted, remove its quotes and unescape it */
239 if (trgquoted &&
state > 0)
241 /* Ignore first and end quotes */
242 trgstore = (
char *)
palloc(
sizeof(
char) * (trglen - 2));
244 for (
int i = 1;
i < trglen - 1;
i++)
246 trgstore[trgstorelen] = trg[
i];
248 /* skip second double quotes */
249 if (trg[
i] ==
'"' && trg[
i + 1] ==
'"')
255 trgstore = (
char *)
palloc(
sizeof(
char) * trglen);
256 trgstorelen = trglen;
257 memcpy(trgstore, trg, trgstorelen);
262 (
unsigned char *) src, srclen,
263 trgstore, trgstorelen);
264 else if (
state == -1)
266 (
errcode(ERRCODE_CONFIG_FILE_ERROR),
267 errmsg(
"invalid syntax: more than two strings in unaccent rule")));
268 else if (
state == -2)
270 (
errcode(ERRCODE_CONFIG_FILE_ERROR),
271 errmsg(
"invalid syntax: unfinished quoted string in unaccent rule")));
285 if (errdata->
sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
305 * findReplaceTo - find longest possible match in trie
307 * On success, returns pointer to ending subnode, plus length of matched
308 * source string in *p_matchlen. On failure, returns NULL.
317 *p_matchlen = 0;
/* prevent uninitialized-variable warnings */
319 while (node && matchlen < srclen)
321 node = node + src[matchlen];
327 *p_matchlen = matchlen;
342 bool fileloaded =
false;
345 foreach(l, dictoptions)
349 if (strcmp(defel->
defname,
"rules") == 0)
353 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
354 errmsg(
"multiple Rules parameters")));
361 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
362 errmsg(
"unrecognized Unaccent parameter: \"%s\"",
370 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
371 errmsg(
"missing Rules parameter")));
384 char *srcstart = srcchar;
388 /* we allocate storage for the buffer only if needed */
400 if (
buf.data == NULL)
402 /* initialize buffer */
404 /* insert any data we already skipped over */
405 if (srcchar != srcstart)
413 if (
buf.data != NULL)
421 /* return a result only if we made at least one substitution */
422 if (
buf.data != NULL)
435 * Function-like wrapper for dictionary
450 * Use the "unaccent" dictionary that is in the same schema that this
454 const char *dictname =
"unaccent";
461 (
errcode(ERRCODE_UNDEFINED_OBJECT),
462 errmsg(
"text search dictionary \"%s.%s\" does not exist",
487 else if (res->
lexeme == NULL)
#define OidIsValid(objectId)
char * defGetString(DefElem *def)
ErrorData * CopyErrorData(void)
void FlushErrorState(void)
int errcode(int sqlerrcode)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
#define PG_FREE_IF_COPY(ptr, n)
#define PG_GETARG_TEXT_PP(n)
#define PG_GETARG_POINTER(n)
#define FunctionCall4(flinfo, arg1, arg2, arg3, arg4)
#define PG_RETURN_TEXT_P(x)
#define PG_GETARG_INT32(n)
#define PG_RETURN_POINTER(x)
#define PG_GETARG_TEXT_P_COPY(n)
Assert(PointerIsAligned(start, uint64))
Oid get_func_namespace(Oid funcid)
char * get_namespace_name(Oid nspid)
int pg_mblen(const char *mbstr)
void pfree(void *pointer)
void * palloc0(Size size)
MemoryContext CurrentMemoryContext
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
static const struct exclude_list_item skip[]
static Datum PointerGetDatum(const void *X)
static Datum ObjectIdGetDatum(Oid X)
static Pointer DatumGetPointer(Datum X)
static Datum Int32GetDatum(int32 X)
void appendBinaryStringInfo(StringInfo str, const void *data, int datalen)
void initStringInfo(StringInfo str)
struct TrieChar * nextChar
#define GetSysCacheOid2(cacheId, oidcol, key1, key2)
TSDictionaryCacheEntry * lookup_ts_dictionary_cache(Oid dictId)
bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)
char * tsearch_readline(tsearch_readline_state *stp)
void tsearch_readline_end(tsearch_readline_state *stp)
char * get_tsearch_config_filename(const char *basename, const char *extension)
static TrieChar * initTrie(const char *filename)
Datum unaccent_init(PG_FUNCTION_ARGS)
PG_FUNCTION_INFO_V1(unaccent_init)
Datum unaccent_lexize(PG_FUNCTION_ARGS)
static TrieChar * placeChar(TrieChar *node, const unsigned char *str, int lenstr, const char *replaceTo, int replacelen)
Datum unaccent_dict(PG_FUNCTION_ARGS)
static TrieChar * findReplaceTo(TrieChar *node, const unsigned char *src, int srclen, int *p_matchlen)
PG_MODULE_MAGIC_EXT(.name="unaccent",.version=PG_VERSION)
static Size VARSIZE_ANY_EXHDR(const void *PTR)
static char * VARDATA_ANY(const void *PTR)
text * cstring_to_text(const char *s)