~ubuntu-branches/ubuntu/oneiric/postgresql-9.1/oneiric-security

« back to all changes in this revision

Viewing changes to src/backend/utils/adt/tsvector_parser.c

  • Committer: Bazaar Package Importer
  • Author(s): Martin Pitt
  • Date: 2011-05-11 10:41:53 UTC
  • Revision ID: james.westby@ubuntu.com-20110511104153-psbh2o58553fv1m0
Tags: upstream-9.1~beta1
Import upstream version 9.1~beta1

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*-------------------------------------------------------------------------
 
2
 *
 
3
 * tsvector_parser.c
 
4
 *        Parser for tsvector
 
5
 *
 
6
 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 
7
 *
 
8
 *
 
9
 * IDENTIFICATION
 
10
 *        src/backend/utils/adt/tsvector_parser.c
 
11
 *
 
12
 *-------------------------------------------------------------------------
 
13
 */
 
14
 
 
15
#include "postgres.h"
 
16
 
 
17
#include "libpq/pqformat.h"
 
18
#include "tsearch/ts_type.h"
 
19
#include "tsearch/ts_locale.h"
 
20
#include "tsearch/ts_utils.h"
 
21
#include "utils/memutils.h"
 
22
 
 
23
 
 
24
/*
 * Private state of tsvector parser.  Note that tsquery also uses this code to
 * parse its input, hence the boolean flags.  The two flags are both true or
 * both false in current usage, but we keep them separate for clarity.
 * is_tsquery affects *only* the content of error messages.
 *
 * 'prsbuf' advances through the input as tokens are consumed, while
 * 'bufstart' keeps pointing at the string given to init_tsvector_parser so
 * that error messages can quote it.  'word' is a growable scratch buffer
 * (currently 'len' bytes) into which the text of the current token is copied.
 */
struct TSVectorParseStateData
{
        char       *prsbuf;             /* next input character */
        char       *bufstart;           /* whole string (used only for errors) */
        char       *word;               /* buffer to hold the current word */
        int         len;                /* size in bytes allocated for 'word' */
        int         eml;                /* max bytes per character */
        bool        oprisdelim;         /* treat ! | & ( ) as delimiters? */
        bool        is_tsquery;         /* say "tsquery" not "tsvector" in errors? */
};
 
40
 
 
41
 
 
42
/*
 
43
 * Initializes parser for the input string. If oprisdelim is set, the
 
44
 * following characters are treated as delimiters in addition to whitespace:
 
45
 * ! | & ( )
 
46
 */
 
47
TSVectorParseState
 
48
init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
 
49
{
 
50
        TSVectorParseState state;
 
51
 
 
52
        state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
 
53
        state->prsbuf = input;
 
54
        state->bufstart = input;
 
55
        state->len = 32;
 
56
        state->word = (char *) palloc(state->len);
 
57
        state->eml = pg_database_encoding_max_length();
 
58
        state->oprisdelim = oprisdelim;
 
59
        state->is_tsquery = is_tsquery;
 
60
 
 
61
        return state;
 
62
}
 
63
 
 
64
/*
 
65
 * Reinitializes parser to parse 'input', instead of previous input.
 
66
 */
 
67
void
 
68
reset_tsvector_parser(TSVectorParseState state, char *input)
 
69
{
 
70
        state->prsbuf = input;
 
71
}
 
72
 
 
73
/*
 
74
 * Shuts down a tsvector parser.
 
75
 */
 
76
void
 
77
close_tsvector_parser(TSVectorParseState state)
 
78
{
 
79
        pfree(state->word);
 
80
        pfree(state);
 
81
}
 
82
 
 
83
/*
 * Make sure 'word' can take one more character (up to state->eml bytes)
 * after 'curpos'.  If not, the buffer is doubled and 'curpos' is re-pointed
 * into the new allocation at the same offset.
 *
 * Expects 'state' and 'curpos' to be in scope at the expansion site.
 */
#define RESIZEPRSBUF \
do { \
        int nused = curpos - state->word; \
        if ( state->len <= nused + state->eml ) \
        { \
                state->len *= 2; \
                state->word = (char *) repalloc(state->word, state->len); \
                curpos = state->word + nused; \
        } \
} while (0)
 
94
 
 
95
/* True if x points at a single-byte character that is one of ! & | ( ) */
#define ISOPERATOR(x) \
        ( pg_mblen(x) == 1 && \
          ( *(x) == '!' || *(x) == '&' || *(x) == '|' || \
            *(x) == '(' || *(x) == ')' ) )
 
96
 
 
97
/*
 * Fills gettoken_tsvector's output parameters and returns true from it.
 *
 * The position array is handed over to the caller when pos_ptr is supplied
 * (poslen is written unconditionally in that case, so it must be valid too);
 * otherwise any array that was collected is freed here.  Each of the
 * remaining outputs is stored only if its pointer is non-NULL.
 */
#define RETURN_TOKEN \
do { \
        if (pos_ptr != NULL) \
        { \
                *pos_ptr = pos; \
                *poslen = npos; \
        } \
        else if (pos != NULL) \
                pfree(pos); \
        if (endptr != NULL) \
                *endptr = state->prsbuf; \
        if (strval != NULL) \
                *strval = state->word; \
        if (lenval != NULL) \
                *lenval = curpos - state->word; \
        return true; \
} while(0)
 
116
 
 
117
 
 
118
/* State codes used in gettoken_tsvector */
enum
{
        WAITWORD = 1,           /* skipping whitespace, looking for a token start */
        WAITENDWORD = 2,        /* inside an unquoted word */
        WAITNEXTCHAR = 3,       /* saw a backslash; next character is taken literally */
        WAITENDCMPLX = 4,       /* inside a single-quoted word */
        WAITPOSINFO = 5,        /* quoted word ended; ':' may start position info */
        INPOSINFO = 6,          /* expecting the first digit of a position */
        WAITPOSDELIM = 7,       /* after a position: digits, weight, ',' or end */
        WAITCHARCMPLX = 8       /* saw a quote inside a quoted word */
};
 
127
 
 
128
#define PRSSYNTAXERROR prssyntaxerror(state)
 
129
 
 
130
static void
 
131
prssyntaxerror(TSVectorParseState state)
 
132
{
 
133
        ereport(ERROR,
 
134
                        (errcode(ERRCODE_SYNTAX_ERROR),
 
135
                         state->is_tsquery ?
 
136
                         errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
 
137
                         errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
 
138
}
 
139
 
 
140
 
 
141
/*
 
142
 * Get next token from string being parsed. Returns true if successful,
 
143
 * false if end of input string is reached.  On success, these output
 
144
 * parameters are filled in:
 
145
 *
 
146
 * *strval              pointer to token
 
147
 * *lenval              length of *strval
 
148
 * *pos_ptr             pointer to a palloc'd array of positions and weights
 
149
 *                              associated with the token. If the caller is not interested
 
150
 *                              in the information, NULL can be supplied. Otherwise
 
151
 *                              the caller is responsible for pfreeing the array.
 
152
 * *poslen              number of elements in *pos_ptr
 
153
 * *endptr              scan resumption point
 
154
 *
 
155
 * Pass NULL for unwanted output parameters.
 
156
 */
 
157
bool
 
158
gettoken_tsvector(TSVectorParseState state,
 
159
                                  char **strval, int *lenval,
 
160
                                  WordEntryPos **pos_ptr, int *poslen,
 
161
                                  char **endptr)
 
162
{
 
163
        int                     oldstate = 0;
 
164
        char       *curpos = state->word;
 
165
        int                     statecode = WAITWORD;
 
166
 
 
167
        /*
 
168
         * pos is for collecting the comma delimited list of positions followed by
 
169
         * the actual token.
 
170
         */
 
171
        WordEntryPos *pos = NULL;
 
172
        int                     npos = 0;               /* elements of pos used */
 
173
        int                     posalen = 0;    /* allocated size of pos */
 
174
 
 
175
        while (1)
 
176
        {
 
177
                if (statecode == WAITWORD)
 
178
                {
 
179
                        if (*(state->prsbuf) == '\0')
 
180
                                return false;
 
181
                        else if (t_iseq(state->prsbuf, '\''))
 
182
                                statecode = WAITENDCMPLX;
 
183
                        else if (t_iseq(state->prsbuf, '\\'))
 
184
                        {
 
185
                                statecode = WAITNEXTCHAR;
 
186
                                oldstate = WAITENDWORD;
 
187
                        }
 
188
                        else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
 
189
                                PRSSYNTAXERROR;
 
190
                        else if (!t_isspace(state->prsbuf))
 
191
                        {
 
192
                                COPYCHAR(curpos, state->prsbuf);
 
193
                                curpos += pg_mblen(state->prsbuf);
 
194
                                statecode = WAITENDWORD;
 
195
                        }
 
196
                }
 
197
                else if (statecode == WAITNEXTCHAR)
 
198
                {
 
199
                        if (*(state->prsbuf) == '\0')
 
200
                                ereport(ERROR,
 
201
                                                (errcode(ERRCODE_SYNTAX_ERROR),
 
202
                                                 errmsg("there is no escaped character: \"%s\"",
 
203
                                                                state->bufstart)));
 
204
                        else
 
205
                        {
 
206
                                RESIZEPRSBUF;
 
207
                                COPYCHAR(curpos, state->prsbuf);
 
208
                                curpos += pg_mblen(state->prsbuf);
 
209
                                Assert(oldstate != 0);
 
210
                                statecode = oldstate;
 
211
                        }
 
212
                }
 
213
                else if (statecode == WAITENDWORD)
 
214
                {
 
215
                        if (t_iseq(state->prsbuf, '\\'))
 
216
                        {
 
217
                                statecode = WAITNEXTCHAR;
 
218
                                oldstate = WAITENDWORD;
 
219
                        }
 
220
                        else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
 
221
                                         (state->oprisdelim && ISOPERATOR(state->prsbuf)))
 
222
                        {
 
223
                                RESIZEPRSBUF;
 
224
                                if (curpos == state->word)
 
225
                                        PRSSYNTAXERROR;
 
226
                                *(curpos) = '\0';
 
227
                                RETURN_TOKEN;
 
228
                        }
 
229
                        else if (t_iseq(state->prsbuf, ':'))
 
230
                        {
 
231
                                if (curpos == state->word)
 
232
                                        PRSSYNTAXERROR;
 
233
                                *(curpos) = '\0';
 
234
                                if (state->oprisdelim)
 
235
                                        RETURN_TOKEN;
 
236
                                else
 
237
                                        statecode = INPOSINFO;
 
238
                        }
 
239
                        else
 
240
                        {
 
241
                                RESIZEPRSBUF;
 
242
                                COPYCHAR(curpos, state->prsbuf);
 
243
                                curpos += pg_mblen(state->prsbuf);
 
244
                        }
 
245
                }
 
246
                else if (statecode == WAITENDCMPLX)
 
247
                {
 
248
                        if (t_iseq(state->prsbuf, '\''))
 
249
                        {
 
250
                                statecode = WAITCHARCMPLX;
 
251
                        }
 
252
                        else if (t_iseq(state->prsbuf, '\\'))
 
253
                        {
 
254
                                statecode = WAITNEXTCHAR;
 
255
                                oldstate = WAITENDCMPLX;
 
256
                        }
 
257
                        else if (*(state->prsbuf) == '\0')
 
258
                                PRSSYNTAXERROR;
 
259
                        else
 
260
                        {
 
261
                                RESIZEPRSBUF;
 
262
                                COPYCHAR(curpos, state->prsbuf);
 
263
                                curpos += pg_mblen(state->prsbuf);
 
264
                        }
 
265
                }
 
266
                else if (statecode == WAITCHARCMPLX)
 
267
                {
 
268
                        if (t_iseq(state->prsbuf, '\''))
 
269
                        {
 
270
                                RESIZEPRSBUF;
 
271
                                COPYCHAR(curpos, state->prsbuf);
 
272
                                curpos += pg_mblen(state->prsbuf);
 
273
                                statecode = WAITENDCMPLX;
 
274
                        }
 
275
                        else
 
276
                        {
 
277
                                RESIZEPRSBUF;
 
278
                                *(curpos) = '\0';
 
279
                                if (curpos == state->word)
 
280
                                        PRSSYNTAXERROR;
 
281
                                if (state->oprisdelim)
 
282
                                {
 
283
                                        /* state->prsbuf+=pg_mblen(state->prsbuf); */
 
284
                                        RETURN_TOKEN;
 
285
                                }
 
286
                                else
 
287
                                        statecode = WAITPOSINFO;
 
288
                                continue;               /* recheck current character */
 
289
                        }
 
290
                }
 
291
                else if (statecode == WAITPOSINFO)
 
292
                {
 
293
                        if (t_iseq(state->prsbuf, ':'))
 
294
                                statecode = INPOSINFO;
 
295
                        else
 
296
                                RETURN_TOKEN;
 
297
                }
 
298
                else if (statecode == INPOSINFO)
 
299
                {
 
300
                        if (t_isdigit(state->prsbuf))
 
301
                        {
 
302
                                if (posalen == 0)
 
303
                                {
 
304
                                        posalen = 4;
 
305
                                        pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
 
306
                                        npos = 0;
 
307
                                }
 
308
                                else if (npos + 1 >= posalen)
 
309
                                {
 
310
                                        posalen *= 2;
 
311
                                        pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
 
312
                                }
 
313
                                npos++;
 
314
                                WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
 
315
                                /* we cannot get here in tsquery, so no need for 2 errmsgs */
 
316
                                if (WEP_GETPOS(pos[npos - 1]) == 0)
 
317
                                        ereport(ERROR,
 
318
                                                        (errcode(ERRCODE_SYNTAX_ERROR),
 
319
                                                         errmsg("wrong position info in tsvector: \"%s\"",
 
320
                                                                        state->bufstart)));
 
321
                                WEP_SETWEIGHT(pos[npos - 1], 0);
 
322
                                statecode = WAITPOSDELIM;
 
323
                        }
 
324
                        else
 
325
                                PRSSYNTAXERROR;
 
326
                }
 
327
                else if (statecode == WAITPOSDELIM)
 
328
                {
 
329
                        if (t_iseq(state->prsbuf, ','))
 
330
                                statecode = INPOSINFO;
 
331
                        else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
 
332
                        {
 
333
                                if (WEP_GETWEIGHT(pos[npos - 1]))
 
334
                                        PRSSYNTAXERROR;
 
335
                                WEP_SETWEIGHT(pos[npos - 1], 3);
 
336
                        }
 
337
                        else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
 
338
                        {
 
339
                                if (WEP_GETWEIGHT(pos[npos - 1]))
 
340
                                        PRSSYNTAXERROR;
 
341
                                WEP_SETWEIGHT(pos[npos - 1], 2);
 
342
                        }
 
343
                        else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
 
344
                        {
 
345
                                if (WEP_GETWEIGHT(pos[npos - 1]))
 
346
                                        PRSSYNTAXERROR;
 
347
                                WEP_SETWEIGHT(pos[npos - 1], 1);
 
348
                        }
 
349
                        else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
 
350
                        {
 
351
                                if (WEP_GETWEIGHT(pos[npos - 1]))
 
352
                                        PRSSYNTAXERROR;
 
353
                                WEP_SETWEIGHT(pos[npos - 1], 0);
 
354
                        }
 
355
                        else if (t_isspace(state->prsbuf) ||
 
356
                                         *(state->prsbuf) == '\0')
 
357
                                RETURN_TOKEN;
 
358
                        else if (!t_isdigit(state->prsbuf))
 
359
                                PRSSYNTAXERROR;
 
360
                }
 
361
                else    /* internal error */
 
362
                        elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
 
363
                                 statecode);
 
364
 
 
365
                /* get next char */
 
366
                state->prsbuf += pg_mblen(state->prsbuf);
 
367
        }
 
368
 
 
369
        return false;
 
370
}