PostgreSQL Source Code: src/backend/snowball/dict_snowball.c Source File

PostgreSQL Source Code git master
dict_snowball.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * dict_snowball.c
4 * Snowball dictionary
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/backend/snowball/dict_snowball.c
10 *
11 *-------------------------------------------------------------------------
12 */
13#include "postgres.h"
14
15#include "catalog/pg_collation_d.h"
16#include "commands/defrem.h"
17#include "mb/pg_wchar.h"
18#include "tsearch/ts_public.h"
19#include "utils/formatting.h"
20
21/* Some platforms define MAXINT and/or MININT, causing conflicts */
22#ifdef MAXINT
23#undef MAXINT
24#endif
25#ifdef MININT
26#undef MININT
27#endif
28
29/* Now we can include the original Snowball header.h */
30#include "snowball/libstemmer/header.h"
31#include "snowball/libstemmer/stem_ISO_8859_1_basque.h"
32#include "snowball/libstemmer/stem_ISO_8859_1_catalan.h"
33#include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
34#include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
35#include "snowball/libstemmer/stem_ISO_8859_1_english.h"
36#include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
37#include "snowball/libstemmer/stem_ISO_8859_1_french.h"
38#include "snowball/libstemmer/stem_ISO_8859_1_german.h"
39#include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
40#include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
41#include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
42#include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
43#include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
44#include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
45#include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
46#include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
47#include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
48#include "snowball/libstemmer/stem_KOI8_R_russian.h"
49#include "snowball/libstemmer/stem_UTF_8_arabic.h"
50#include "snowball/libstemmer/stem_UTF_8_armenian.h"
51#include "snowball/libstemmer/stem_UTF_8_basque.h"
52#include "snowball/libstemmer/stem_UTF_8_catalan.h"
53#include "snowball/libstemmer/stem_UTF_8_danish.h"
54#include "snowball/libstemmer/stem_UTF_8_dutch.h"
55#include "snowball/libstemmer/stem_UTF_8_english.h"
56#include "snowball/libstemmer/stem_UTF_8_estonian.h"
57#include "snowball/libstemmer/stem_UTF_8_finnish.h"
58#include "snowball/libstemmer/stem_UTF_8_french.h"
59#include "snowball/libstemmer/stem_UTF_8_german.h"
60#include "snowball/libstemmer/stem_UTF_8_greek.h"
61#include "snowball/libstemmer/stem_UTF_8_hindi.h"
62#include "snowball/libstemmer/stem_UTF_8_hungarian.h"
63#include "snowball/libstemmer/stem_UTF_8_indonesian.h"
64#include "snowball/libstemmer/stem_UTF_8_irish.h"
65#include "snowball/libstemmer/stem_UTF_8_italian.h"
66#include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
67#include "snowball/libstemmer/stem_UTF_8_nepali.h"
68#include "snowball/libstemmer/stem_UTF_8_norwegian.h"
69#include "snowball/libstemmer/stem_UTF_8_porter.h"
70#include "snowball/libstemmer/stem_UTF_8_portuguese.h"
71#include "snowball/libstemmer/stem_UTF_8_romanian.h"
72#include "snowball/libstemmer/stem_UTF_8_russian.h"
73#include "snowball/libstemmer/stem_UTF_8_serbian.h"
74#include "snowball/libstemmer/stem_UTF_8_spanish.h"
75#include "snowball/libstemmer/stem_UTF_8_swedish.h"
76#include "snowball/libstemmer/stem_UTF_8_tamil.h"
77#include "snowball/libstemmer/stem_UTF_8_turkish.h"
78#include "snowball/libstemmer/stem_UTF_8_yiddish.h"
79
80 PG_MODULE_MAGIC_EXT(
81 .name = "dict_snowball",
82 .version = PG_VERSION
83);
84
85 PG_FUNCTION_INFO_V1(dsnowball_init);
86
87 PG_FUNCTION_INFO_V1(dsnowball_lexize);
88
89/* List of supported modules */
90 typedef struct stemmer_module
91{
92 const char *name;
93 pg_enc enc;
94 struct SN_env *(*create) (void);
95 void (*close) (struct SN_env *);
96 int (*stem) (struct SN_env *);
97 } stemmer_module;
98
/*
 * Expand to the initializer for one stemmer_module entry.
 *
 *	lang  - stemmer name; stringified to become the entry's name
 *	pgenc - PostgreSQL code for the encoding, stored in the entry as-is
 *	senc  - Snowball's name for the encoding; pasted with lang to form the
 *			lang_senc_create_env, lang_senc_close_env and lang_senc_stem
 *			symbol names
 */
#define STEMMER_MODULE(lang, pgenc, senc) \
	{ \
		.name = #lang, \
		.enc = pgenc, \
		.create = lang##_##senc##_create_env, \
		.close = lang##_##senc##_close_env, \
		.stem = lang##_##senc##_stem \
	}
102
103 static const stemmer_module stemmer_modules[] =
104{
105 /*
106 * Stemmers list from Snowball distribution
107 */
108 STEMMER_MODULE(basque, PG_LATIN1, ISO_8859_1),
109 STEMMER_MODULE(catalan, PG_LATIN1, ISO_8859_1),
110 STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
111 STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
112 STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
113 STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
114 STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
115 STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
116 STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
117 STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
118 STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
119 STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
120 STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
121 STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
122 STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
123 STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
124 STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
125 STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
126 STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
127 STEMMER_MODULE(armenian, PG_UTF8, UTF_8),
128 STEMMER_MODULE(basque, PG_UTF8, UTF_8),
129 STEMMER_MODULE(catalan, PG_UTF8, UTF_8),
130 STEMMER_MODULE(danish, PG_UTF8, UTF_8),
131 STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
132 STEMMER_MODULE(english, PG_UTF8, UTF_8),
133 STEMMER_MODULE(estonian, PG_UTF8, UTF_8),
134 STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
135 STEMMER_MODULE(french, PG_UTF8, UTF_8),
136 STEMMER_MODULE(german, PG_UTF8, UTF_8),
137 STEMMER_MODULE(greek, PG_UTF8, UTF_8),
138 STEMMER_MODULE(hindi, PG_UTF8, UTF_8),
139 STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
140 STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
141 STEMMER_MODULE(irish, PG_UTF8, UTF_8),
142 STEMMER_MODULE(italian, PG_UTF8, UTF_8),
143 STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
144 STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
145 STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
146 STEMMER_MODULE(porter, PG_UTF8, UTF_8),
147 STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
148 STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
149 STEMMER_MODULE(russian, PG_UTF8, UTF_8),
150 STEMMER_MODULE(serbian, PG_UTF8, UTF_8),
151 STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
152 STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
153 STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
154 STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
155 STEMMER_MODULE(yiddish, PG_UTF8, UTF_8),
156
157 /*
158 * Stemmer with PG_SQL_ASCII encoding should be valid for any server
159 * encoding
160 */
161 STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),
162
163 {NULL, 0, NULL, NULL, NULL} /* list end marker */
164};
165
166
167 typedef struct DictSnowball
168{
169 struct SN_env *z;
170 StopList stoplist;
171 bool needrecode; /* needs recoding before/after call stem */
172 int (*stem) (struct SN_env *z);
173
174 /*
175 * snowball saves alloced memory between calls, so we should run it in our
176 * private memory context. Note, init function is executed in long lived
177 * context, so we just remember CurrentMemoryContext
178 */
179 MemoryContext dictCtx;
180 } DictSnowball;
181
182
183static void
184 locate_stem_module(DictSnowball *d, const char *lang)
185{
186 const stemmer_module *m;
187
188 /*
189 * First, try to find exact match of stemmer module. Stemmer with
190 * PG_SQL_ASCII encoding is treated as working with any server encoding
191 */
192 for (m = stemmer_modules; m->name; m++)
193 {
194 if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
195 pg_strcasecmp(m->name, lang) == 0)
196 {
197 d->stem = m->stem;
198 d->z = m->create();
199 d->needrecode = false;
200 return;
201 }
202 }
203
204 /*
205 * Second, try to find stemmer for needed language for UTF8 encoding.
206 */
207 for (m = stemmer_modules; m->name; m++)
208 {
209 if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
210 {
211 d->stem = m->stem;
212 d->z = m->create();
213 d->needrecode = true;
214 return;
215 }
216 }
217
218 ereport(ERROR,
219 (errcode(ERRCODE_UNDEFINED_OBJECT),
220 errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
221 lang, GetDatabaseEncodingName())));
222}
223
224Datum
225 dsnowball_init(PG_FUNCTION_ARGS)
226{
227 List *dictoptions = (List *) PG_GETARG_POINTER(0);
228 DictSnowball *d;
229 bool stoploaded = false;
230 ListCell *l;
231
232 d = (DictSnowball *) palloc0(sizeof(DictSnowball));
233
234 foreach(l, dictoptions)
235 {
236 DefElem *defel = (DefElem *) lfirst(l);
237
238 if (strcmp(defel->defname, "stopwords") == 0)
239 {
240 if (stoploaded)
241 ereport(ERROR,
242 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
243 errmsg("multiple StopWords parameters")));
244 readstoplist(defGetString(defel), &d->stoplist, str_tolower);
245 stoploaded = true;
246 }
247 else if (strcmp(defel->defname, "language") == 0)
248 {
249 if (d->stem)
250 ereport(ERROR,
251 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
252 errmsg("multiple Language parameters")));
253 locate_stem_module(d, defGetString(defel));
254 }
255 else
256 {
257 ereport(ERROR,
258 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
259 errmsg("unrecognized Snowball parameter: \"%s\"",
260 defel->defname)));
261 }
262 }
263
264 if (!d->stem)
265 ereport(ERROR,
266 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
267 errmsg("missing Language parameter")));
268
269 d->dictCtx = CurrentMemoryContext;
270
271 PG_RETURN_POINTER(d);
272}
273
274Datum
275 dsnowball_lexize(PG_FUNCTION_ARGS)
276{
277 DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
278 char *in = (char *) PG_GETARG_POINTER(1);
279 int32 len = PG_GETARG_INT32(2);
280 char *txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
281 TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
282
283 /*
284 * Do not pass strings exceeding 1000 bytes to the stemmer, as they're
285 * surely not words in any human language. This restriction avoids
286 * wasting cycles on stuff like base64-encoded data, and it protects us
287 * against possible inefficiency or misbehavior in the stemmer. (For
288 * example, the Turkish stemmer has an indefinite recursion, so it can
289 * crash on long-enough strings.) However, Snowball dictionaries are
290 * defined to recognize all strings, so we can't reject the string as an
291 * unknown word.
292 */
293 if (len > 1000)
294 {
295 /* return the lexeme lowercased, but otherwise unmodified */
296 res->lexeme = txt;
297 }
298 else if (*txt == '0円' || searchstoplist(&(d->stoplist), txt))
299 {
300 /* empty or stopword, so report as stopword */
301 pfree(txt);
302 }
303 else
304 {
305 MemoryContext saveCtx;
306
307 /*
308 * recode to utf8 if stemmer is utf8 and doesn't match server encoding
309 */
310 if (d->needrecode)
311 {
312 char *recoded;
313
314 recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
315 if (recoded != txt)
316 {
317 pfree(txt);
318 txt = recoded;
319 }
320 }
321
322 /* see comment about d->dictCtx */
323 saveCtx = MemoryContextSwitchTo(d->dictCtx);
324 SN_set_current(d->z, strlen(txt), (symbol *) txt);
325 d->stem(d->z);
326 MemoryContextSwitchTo(saveCtx);
327
328 if (d->z->p && d->z->l)
329 {
330 txt = repalloc(txt, d->z->l + 1);
331 memcpy(txt, d->z->p, d->z->l);
332 txt[d->z->l] = '0円';
333 }
334
335 /* back recode if needed */
336 if (d->needrecode)
337 {
338 char *recoded;
339
340 recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
341 if (recoded != txt)
342 {
343 pfree(txt);
344 txt = recoded;
345 }
346 }
347
348 res->lexeme = txt;
349 }
350
351 PG_RETURN_POINTER(res);
352}
int SN_set_current(struct SN_env *z, int size, const symbol *s)
Definition: api.c:51
unsigned char symbol
Definition: api.h:2
int32_t int32
Definition: c.h:534
char * defGetString(DefElem *def)
Definition: define.c:35
struct stemmer_module stemmer_module
Datum dsnowball_lexize(PG_FUNCTION_ARGS)
Definition: dict_snowball.c:275
static const stemmer_module stemmer_modules[]
Definition: dict_snowball.c:103
static void locate_stem_module(DictSnowball *d, const char *lang)
Definition: dict_snowball.c:184
#define STEMMER_MODULE(name, enc, senc)
Definition: dict_snowball.c:100
PG_MODULE_MAGIC_EXT(.name="dict_snowball",.version=PG_VERSION)
PG_FUNCTION_INFO_V1(dsnowball_init)
Datum dsnowball_init(PG_FUNCTION_ARGS)
Definition: dict_snowball.c:225
struct DictSnowball DictSnowball
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:150
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
char * str_tolower(const char *buff, size_t nbytes, Oid collid)
Definition: formatting.c:1639
int GetDatabaseEncoding(void)
Definition: mbutils.c:1262
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:677
const char * GetDatabaseEncodingName(void)
Definition: mbutils.c:1268
char * pg_server_to_any(const char *s, int len, int encoding)
Definition: mbutils.c:750
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1610
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc0(Size size)
Definition: mcxt.c:1395
MemoryContext CurrentMemoryContext
Definition: mcxt.c:160
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
const void size_t len
#define lfirst(lc)
Definition: pg_list.h:172
pg_enc
Definition: pg_wchar.h:225
@ PG_SQL_ASCII
Definition: pg_wchar.h:226
@ PG_KOI8R
Definition: pg_wchar.h:248
@ PG_LATIN2
Definition: pg_wchar.h:235
@ PG_LATIN1
Definition: pg_wchar.h:234
@ PG_UTF8
Definition: pg_wchar.h:232
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
uint64_t Datum
Definition: postgres.h:70
char * defname
Definition: parsenodes.h:843
MemoryContext dictCtx
Definition: dict_snowball.c:179
bool needrecode
Definition: dict_snowball.c:171
StopList stoplist
Definition: dict_snowball.c:170
struct SN_env * z
Definition: dict_snowball.c:169
int(* stem)(struct SN_env *z)
Definition: dict_snowball.c:172
Definition: pg_list.h:54
Definition: api.h:14
symbol * p
Definition: api.h:15
int l
Definition: api.h:16
char * lexeme
Definition: ts_public.h:138
struct SN_env *(* create)(void)
Definition: dict_snowball.c:94
const char * name
Definition: dict_snowball.c:92
void(* close)(struct SN_env *)
Definition: dict_snowball.c:95
int(* stem)(struct SN_env *)
Definition: dict_snowball.c:96
void readstoplist(const char *fname, StopList *s, char *(*wordop)(const char *, size_t, Oid))
Definition: ts_utils.c:69
bool searchstoplist(StopList *s, char *key)
Definition: ts_utils.c:141
Definition: pg_list.h:46
const char * name

AltStyle によって変換されたページ (->オリジナル) /