~vcs-imports/mammoth-replicator/trunk

« back to all changes in this revision

Viewing changes to contrib/tsearch/morph.c

  • Committer: alvherre
  • Date: 2005-12-16 21:24:52 UTC
  • Revision ID: svn-v4:db760fc0-0f08-0410-9d63-cc6633f64896:trunk:1
Initial import of the REL8_0_3 sources from the Pgsql CVS repository.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 * morphology module
 * New dictionaries are included via dict.h. For languages which
 * use the Latin charset it may be necessary to modify the mapdict table.
 * Teodor Sigaev <teodor@stack.net>
 */
 
7
#include "postgres.h"
 
8
 
 
9
#include <locale.h>
 
10
 
 
11
#include "utils/builtins.h"
 
12
 
 
13
#include "morph.h"
 
14
#include "deflex.h"
 
15
 
 
16
/*
 
17
 * Struct for calling dictionaries
 
18
 * All of this methods are optional, but
 
19
 * if all methods are NULL, then dictionary does nothing :)
 
20
 * Return value of lemmatize must be palloced or the same.
 
21
 * Return value of init must be malloced in other case
 
22
 * it will be free in end of transaction!
 
23
 */
 
24
typedef struct
 
25
{
 
26
        char            localename[NAMEDATALEN];
 
27
        /* init dictionary */
 
28
        void       *(*init) (void);
 
29
        /* close dictionary */
 
30
        void            (*close) (void *);
 
31
        /* find in dictionary */
 
32
        char       *(*lemmatize) (void *, char *, int *);
 
33
        int                     (*is_stoplemm) (void *, char *, int);
 
34
        int                     (*is_stemstoplemm) (void *, char *, int);
 
35
}       DICT;
 
36
 
 
37
/* insert all dictionaries */
 
38
#define DICT_BODY
 
39
#include "dict.h"
 
40
#undef  DICT_BODY
 
41
 
 
42
/* fill dictionary's structure */
 
43
#define DICT_TABLE
 
44
DICT            dicts[] = {
 
45
        {
 
46
                "C", NULL, NULL, NULL, NULL, NULL               /* fake dictionary */
 
47
        }
 
48
#include "dict.h"
 
49
};
 
50
 
 
51
#undef DICT_TABLE
 
52
 
 
53
/* array for storing dictionary's objects (if needed) */
 
54
void       *dictobjs[
 
55
                                         lengthof(dicts)];
 
56
 
 
57
#define STOPLEXEM       -2
 
58
#define BYLOCALE        -1
 
59
#define NODICT          0
 
60
#define DEFAULTDICT 1
 
61
 
 
62
#define MAXNDICT        2
 
63
typedef int2 MAPDICT[MAXNDICT];
 
64
 
 
65
#define GETDICT(x,i)    *( ((int2*)(x)) + (i) )
 
66
 
 
67
/* map dictionaries for lexem type */
 
68
static MAPDICT mapdict[] = {
 
69
        {NODICT, NODICT},                       /* not used                     */
 
70
        {DEFAULTDICT, NODICT},          /* LATWORD              */
 
71
        {BYLOCALE, NODICT},                     /* NONLATINWORD         */
 
72
        {BYLOCALE, DEFAULTDICT},        /* UWORD                */
 
73
        {NODICT, NODICT},                       /* EMAIL                */
 
74
        {NODICT, NODICT},                       /* FURL                 */
 
75
        {NODICT, NODICT},                       /* HOST                 */
 
76
        {NODICT, NODICT},                       /* SCIENTIFIC           */
 
77
        {NODICT, NODICT},                       /* VERSIONNUMBER                */
 
78
        {BYLOCALE, DEFAULTDICT},        /* PARTHYPHENWORD               */
 
79
        {BYLOCALE, NODICT},                     /* CYRPARTHYPHENWORD */
 
80
        {DEFAULTDICT, NODICT},          /* LATPARTHYPHENWORD            */
 
81
        {STOPLEXEM, NODICT},            /* SPACE                */
 
82
        {STOPLEXEM, NODICT},            /* TAG          */
 
83
        {STOPLEXEM, NODICT},            /* HTTP                 */
 
84
        {BYLOCALE, DEFAULTDICT},        /* HYPHENWORD           */
 
85
        {DEFAULTDICT, NODICT},          /* LATHYPHENWORD                */
 
86
        {BYLOCALE, NODICT},                     /* CYRHYPHENWORD        */
 
87
        {NODICT, NODICT},                       /* URI                  */
 
88
        {NODICT, NODICT},                       /* FILEPATH             */
 
89
        {NODICT, NODICT},                       /* DECIMAL              */
 
90
        {NODICT, NODICT},                       /* SIGNEDINT            */
 
91
        {NODICT, NODICT},                       /* UNSIGNEDINT          */
 
92
        {STOPLEXEM, NODICT}                     /* HTMLENTITY           */
 
93
};
 
94
 
 
95
static bool inited = false;
 
96
 
 
97
void
 
98
initmorph(void)
 
99
{
 
100
        int                     i,
 
101
                                j,
 
102
                                k;
 
103
        MAPDICT    *md;
 
104
        bool            needinit[lengthof(dicts)];
 
105
        const char *curlocale;
 
106
        int                     bylocaledict = NODICT;
 
107
 
 
108
        if (inited)
 
109
                return;
 
110
        for (i = 1; i < lengthof(dicts); i++)
 
111
                needinit[i] = false;
 
112
 
 
113
        curlocale = setlocale(LC_CTYPE, NULL);
 
114
        if (curlocale)
 
115
        {
 
116
                for (i = 1; i < lengthof(dicts); i++)
 
117
                        if (strcmp(dicts[i].localename, curlocale) == 0)
 
118
                        {
 
119
                                bylocaledict = i;
 
120
                                break;
 
121
                        }
 
122
        }
 
123
 
 
124
        for (i = 1; i < lengthof(mapdict); i++)
 
125
        {
 
126
                k = 0;
 
127
                md = &mapdict[i];
 
128
                for (j = 0; j < MAXNDICT; j++)
 
129
                {
 
130
                        GETDICT(md, k) = GETDICT(md, j);
 
131
                        if (GETDICT(md, k) == NODICT)
 
132
                                break;
 
133
                        else if (GETDICT(md, k) == BYLOCALE)
 
134
                        {
 
135
                                if (bylocaledict == NODICT)
 
136
                                        continue;
 
137
                                GETDICT(md, k) = bylocaledict;
 
138
                        }
 
139
                        if (GETDICT(md, k) >= (int2) lengthof(dicts))
 
140
                                continue;
 
141
                        needinit[GETDICT(md, k)] = true;
 
142
                        k++;
 
143
                }
 
144
                for (; k < MAXNDICT; k++)
 
145
                        if (GETDICT(md, k) != STOPLEXEM)
 
146
                                GETDICT(md, k) = NODICT;
 
147
        }
 
148
 
 
149
        for (i = 1; i < lengthof(dicts); i++)
 
150
                if (needinit[i] && dicts[i].init)
 
151
                        dictobjs[i] = (*(dicts[i].init)) ();
 
152
 
 
153
        inited = true;
 
154
        return;
 
155
}
 
156
 
 
157
char *
 
158
lemmatize(char *word, int *len, int type)
 
159
{
 
160
        int2            nd;
 
161
        int                     i;
 
162
        DICT       *dict;
 
163
 
 
164
        for (i = 0; i < MAXNDICT; i++)
 
165
        {
 
166
                nd = GETDICT(&mapdict[type], i);
 
167
                if (nd == NODICT)
 
168
                {
 
169
                        /* there is no dictionary */
 
170
                        return word;
 
171
                }
 
172
                else if (nd == STOPLEXEM)
 
173
                {
 
174
                        /* word is stopword */
 
175
                        return NULL;
 
176
                }
 
177
                else if (nd == BYLOCALE)
 
178
                {
 
179
                        continue;                       /* no dict for current locale */
 
180
                }
 
181
                else
 
182
                {
 
183
                        dict = &dicts[nd];
 
184
                        if (dict->is_stoplemm && (*(dict->is_stoplemm)) (dictobjs[nd], word, *len))
 
185
                                return NULL;
 
186
                        if (dict->lemmatize)
 
187
                        {
 
188
                                int                     oldlen = *len;
 
189
                                char       *newword = (*(dict->lemmatize)) (dictobjs[nd], word, len);
 
190
 
 
191
                                /* word is recognized by dictionary */
 
192
                                if (newword != word || *len != oldlen)
 
193
                                {
 
194
                                        if (dict->is_stemstoplemm &&
 
195
                                        (*(dict->is_stemstoplemm)) (dictobjs[nd], word, *len))
 
196
                                        {
 
197
                                                if (newword != word && newword)
 
198
                                                        pfree(newword);
 
199
                                                return NULL;
 
200
                                        }
 
201
                                        return newword;
 
202
                                }
 
203
                        }
 
204
                }
 
205
        }
 
206
 
 
207
        return word;
 
208
}
 
209
 
 
210
bool
 
211
is_stoptype(int type)
 
212
{
 
213
        return (GETDICT(&mapdict[type], 0) == STOPLEXEM) ? true : false;
 
214
}