3
* New dictionaries are included in dict.h. For languages which
4
* use the Latin charset it may be necessary to modify the mapdict table.
5
* Teodor Sigaev <teodor@stack.net>
11
#include "utils/builtins.h"
17
* Struct for calling dictionaries
18
* All of these methods are optional, but
19
* if all methods are NULL, then dictionary does nothing :)
20
* Return value of lemmatize must be palloc'ed or be the same pointer as its input.
21
* Return value of init must be malloc'ed; otherwise
22
* it will be freed at the end of the transaction!
26
char localename[NAMEDATALEN];
29
/* close dictionary */
30
void (*close) (void *);
31
/* find in dictionary */
32
char *(*lemmatize) (void *, char *, int *);
33
int (*is_stoplemm) (void *, char *, int);
34
int (*is_stemstoplemm) (void *, char *, int);
37
/* insert all dictionaries */
42
/* fill dictionary's structure */
46
"C", NULL, NULL, NULL, NULL, NULL /* fake dictionary */
53
/* array for storing dictionary's objects (if needed) */
63
/*
 * One map entry: MAXNDICT int2 slots, each holding either an index into
 * the dictionary array or one of the marker values used in the mapdict
 * table (NODICT, BYLOCALE, STOPLEXEM).
 */
typedef int2 MAPDICT[MAXNDICT];
65
/*
 * GETDICT(x, i): the i-th int2 slot of the map entry that x points to.
 * Expands to an lvalue, so it can be read as well as assigned.  The whole
 * expansion is parenthesized so that operators at the use site (postfix
 * ++, [], etc.) apply to the slot rather than to the pointer expression.
 */
#define GETDICT(x,i) (*( ((int2*)(x)) + (i) ))
67
/* map dictionaries for lexem type */
68
static MAPDICT mapdict[] = {
69
{NODICT, NODICT}, /* not used */
70
{DEFAULTDICT, NODICT}, /* LATWORD */
71
{BYLOCALE, NODICT}, /* NONLATINWORD */
72
{BYLOCALE, DEFAULTDICT}, /* UWORD */
73
{NODICT, NODICT}, /* EMAIL */
74
{NODICT, NODICT}, /* FURL */
75
{NODICT, NODICT}, /* HOST */
76
{NODICT, NODICT}, /* SCIENTIFIC */
77
{NODICT, NODICT}, /* VERSIONNUMBER */
78
{BYLOCALE, DEFAULTDICT}, /* PARTHYPHENWORD */
79
{BYLOCALE, NODICT}, /* CYRPARTHYPHENWORD */
80
{DEFAULTDICT, NODICT}, /* LATPARTHYPHENWORD */
81
{STOPLEXEM, NODICT}, /* SPACE */
82
{STOPLEXEM, NODICT}, /* TAG */
83
{STOPLEXEM, NODICT}, /* HTTP */
84
{BYLOCALE, DEFAULTDICT}, /* HYPHENWORD */
85
{DEFAULTDICT, NODICT}, /* LATHYPHENWORD */
86
{BYLOCALE, NODICT}, /* CYRHYPHENWORD */
87
{NODICT, NODICT}, /* URI */
88
{NODICT, NODICT}, /* FILEPATH */
89
{NODICT, NODICT}, /* DECIMAL */
90
{NODICT, NODICT}, /* SIGNEDINT */
91
{NODICT, NODICT}, /* UNSIGNEDINT */
92
{STOPLEXEM, NODICT} /* HTMLENTITY */
95
static bool inited = false;
104
bool needinit[lengthof(dicts)];
105
const char *curlocale;
106
int bylocaledict = NODICT;
110
for (i = 1; i < lengthof(dicts); i++)
113
curlocale = setlocale(LC_CTYPE, NULL);
116
for (i = 1; i < lengthof(dicts); i++)
117
if (strcmp(dicts[i].localename, curlocale) == 0)
124
for (i = 1; i < lengthof(mapdict); i++)
128
for (j = 0; j < MAXNDICT; j++)
130
GETDICT(md, k) = GETDICT(md, j);
131
if (GETDICT(md, k) == NODICT)
133
else if (GETDICT(md, k) == BYLOCALE)
135
if (bylocaledict == NODICT)
137
GETDICT(md, k) = bylocaledict;
139
if (GETDICT(md, k) >= (int2) lengthof(dicts))
141
needinit[GETDICT(md, k)] = true;
144
for (; k < MAXNDICT; k++)
145
if (GETDICT(md, k) != STOPLEXEM)
146
GETDICT(md, k) = NODICT;
149
for (i = 1; i < lengthof(dicts); i++)
150
if (needinit[i] && dicts[i].init)
151
dictobjs[i] = (*(dicts[i].init)) ();
158
lemmatize(char *word, int *len, int type)
164
for (i = 0; i < MAXNDICT; i++)
166
nd = GETDICT(&mapdict[type], i);
169
/* there is no dictionary */
172
else if (nd == STOPLEXEM)
174
/* word is stopword */
177
else if (nd == BYLOCALE)
179
continue; /* no dict for current locale */
184
if (dict->is_stoplemm && (*(dict->is_stoplemm)) (dictobjs[nd], word, *len))
189
char *newword = (*(dict->lemmatize)) (dictobjs[nd], word, len);
191
/* word is recognized by dictionary */
192
if (newword != word || *len != oldlen)
194
if (dict->is_stemstoplemm &&
195
(*(dict->is_stemstoplemm)) (dictobjs[nd], word, *len))
197
if (newword != word && newword)
211
is_stoptype(int type)
213
return (GETDICT(&mapdict[type], 0) == STOPLEXEM) ? true : false;