1/*-------------------------------------------------------------------------
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
9 * src/backend/snowball/dict_snowball.c
11 *-------------------------------------------------------------------------
15#include "catalog/pg_collation_d.h"
21/* Some platforms define MAXINT and/or MININT, causing conflicts */
29/* Now we can include the original Snowball header.h */
81 .
name =
"dict_snowball",
89/* List of supported modules */
94 struct SN_env *(*create) (void);
99/* Args: stemmer name, PG code for encoding, Snowball's name for encoding */
100 #define STEMMER_MODULE(name,enc,senc) \
101 {#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}
106 * Stemmers list from Snowball distribution
158 * Stemmer with PG_SQL_ASCII encoding should be valid for any server
163 {NULL, 0, NULL, NULL, NULL}
/* list end marker */
175 * Snowball saves allocated memory between calls, so we should run it in our
176 * private memory context. Note that the init function is executed in a
177 * long-lived context, so we just remember CurrentMemoryContext
189 * First, try to find an exact match of the stemmer module. A stemmer with
190 * PG_SQL_ASCII encoding is treated as working with any server encoding
205 * Second, try to find a stemmer for the needed language in UTF8 encoding.
219 (
errcode(ERRCODE_UNDEFINED_OBJECT),
220 errmsg(
"no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
229 bool stoploaded =
false;
234 foreach(
l, dictoptions)
238 if (strcmp(defel->
defname,
"stopwords") == 0)
242 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
243 errmsg(
"multiple StopWords parameters")));
247 else if (strcmp(defel->
defname,
"language") == 0)
251 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
252 errmsg(
"multiple Language parameters")));
258 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
259 errmsg(
"unrecognized Snowball parameter: \"%s\"",
266 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
267 errmsg(
"missing Language parameter")));
284 * Do not pass strings exceeding 1000 bytes to the stemmer, as they're
285 * surely not words in any human language. This restriction avoids
286 * wasting cycles on stuff like base64-encoded data, and it protects us
287 * against possible inefficiency or misbehavior in the stemmer. (For
288 * example, the Turkish stemmer has an indefinite recursion, so it can
289 * crash on long-enough strings.) However, Snowball dictionaries are
290 * defined to recognize all strings, so we can't reject the string as an
295 /* return the lexeme lowercased, but otherwise unmodified */
300 /* empty or stopword, so report as stopword */
308 * recode to UTF8 if the stemmer is UTF8 and that doesn't match the server encoding
322 /* see comment about d->dictCtx */
328 if (d->
z->
p && d->
z->
l)
331 memcpy(txt, d->
z->
p, d->
z->
l);
335 /* recode back if needed */
int SN_set_current(struct SN_env *z, int size, const symbol *s)
char * defGetString(DefElem *def)
struct stemmer_module stemmer_module
Datum dsnowball_lexize(PG_FUNCTION_ARGS)
static const stemmer_module stemmer_modules[]
static void locate_stem_module(DictSnowball *d, const char *lang)
#define STEMMER_MODULE(name, enc, senc)
PG_MODULE_MAGIC_EXT(.name="dict_snowball",.version=PG_VERSION)
PG_FUNCTION_INFO_V1(dsnowball_init)
Datum dsnowball_init(PG_FUNCTION_ARGS)
struct DictSnowball DictSnowball
int errcode(int sqlerrcode)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
#define PG_GETARG_POINTER(n)
#define PG_GETARG_INT32(n)
#define PG_RETURN_POINTER(x)
int GetDatabaseEncoding(void)
char * pg_any_to_server(const char *s, int len, int encoding)
const char * GetDatabaseEncodingName(void)
char * pg_server_to_any(const char *s, int len, int encoding)
void * repalloc(void *pointer, Size size)
void pfree(void *pointer)
void * palloc0(Size size)
MemoryContext CurrentMemoryContext
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
int pg_strcasecmp(const char *s1, const char *s2)
int(* stem)(struct SN_env *z)
struct SN_env *(* create)(void)
void(* close)(struct SN_env *)
int(* stem)(struct SN_env *)
void readstoplist(const char *fname, StopList *s, char *(*wordop)(const char *, size_t, Oid))
bool searchstoplist(StopList *s, char *key)