1
/*-------------------------------------------------------------------------
6
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
10
* src/backend/utils/adt/tsvector_parser.c
12
*-------------------------------------------------------------------------
17
#include "libpq/pqformat.h"
18
#include "tsearch/ts_type.h"
19
#include "tsearch/ts_locale.h"
20
#include "tsearch/ts_utils.h"
21
#include "utils/memutils.h"
25
* Private state of tsvector parser. Note that tsquery also uses this code to
26
* parse its input, hence the boolean flags. The two flags are both true or
27
* both false in current usage, but we keep them separate for clarity.
28
* is_tsquery affects *only* the content of error messages.
30
struct TSVectorParseStateData
32
char *prsbuf; /* next input character to be scanned */
33
char *bufstart; /* whole string (used only for error messages) */
34
char *word; /* buffer to hold the current word */
35
int len; /* size in bytes allocated for 'word' */
36
int eml; /* max bytes per character in the database encoding */
37
bool oprisdelim; /* treat ! & | ( ) as delimiters? (see ISOPERATOR) */
38
bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */
43
* Initializes parser for the input string. If oprisdelim is set, the
44
* following characters are treated as delimiters in addition to whitespace:
48
/*
 * Builds a new parser state for 'input'.
 *
 * input       string to parse; it is referenced, not copied
 * oprisdelim  stored in the state; consulted together with ISOPERATOR when
 *             deciding where a word ends
 * is_tsquery  stored in the state; see the field comment in
 *             struct TSVectorParseStateData
 *
 * The state and its 'word' buffer are obtained with palloc.
 */
init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
50
TSVectorParseState state;
52
state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
53
/* scanning starts at the first byte of the input */
state->prsbuf = input;
54
/* start of the string, quoted by prssyntaxerror() in its messages */
state->bufstart = input;
56
/* word buffer of state->len bytes; RESIZEPRSBUF reallocates it on demand */
state->word = (char *) palloc(state->len);
57
/* upper bound on bytes per character; RESIZEPRSBUF uses it as headroom */
state->eml = pg_database_encoding_max_length();
58
state->oprisdelim = oprisdelim;
59
state->is_tsquery = is_tsquery;
65
* Reinitializes parser to parse 'input', instead of previous input.
68
/*
 * Only the scan pointer is redirected to 'input'.  bufstart is not
 * reassigned here, so error messages built from it keep quoting the string
 * it already pointed at.  The word buffer and the flags are also left as
 * they were.
 */
reset_tsvector_parser(TSVectorParseState state, char *input)
70
state->prsbuf = input;
74
* Shuts down a tsvector parser.
77
close_tsvector_parser(TSVectorParseState state)
83
/*
 * Increase the size of 'word' if needed to hold one more character.
 *
 * Relies on two names being in scope where the macro is expanded: 'state'
 * (the parser state) and 'curpos' (the write position inside state->word).
 * clen is the number of bytes already written.  The buffer is considered
 * too small once clen plus the maximum character width (state->eml) reaches
 * state->len.  After repalloc, curpos is recomputed from the saved offset
 * because state->word has been reassigned.
 */
84
#define RESIZEPRSBUF \
86
int clen = curpos - state->word; \
87
if ( clen + state->eml >= state->len ) \
90
state->word = (char *) repalloc(state->word, state->len); \
91
curpos = state->word + clen; \
95
/*
 * True when the character at x occupies a single byte (pg_mblen(x) == 1)
 * and is one of  !  &  |  (  ).
 * The argument is evaluated several times, so it must be free of side
 * effects.
 */
#define ISOPERATOR(x) ( pg_mblen(x)==1 && ( *(x)=='!' || *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) )
97
/*
 * Fills gettoken_tsvector's output parameters, and returns true.
 *
 * Every output pointer is tested against NULL before it is written.
 * For the outputs whose assignments appear below:
 *   *strval  receives state->word (the parser's own buffer, not a copy)
 *   *lenval  receives the number of bytes written to the word so far
 *   *endptr  receives state->prsbuf, the current scan position
 * Uses the names state, curpos, pos, pos_ptr, strval, lenval and endptr
 * from the scope in which the macro is expanded.
 */
98
#define RETURN_TOKEN \
100
if (pos_ptr != NULL) \
105
else if (pos != NULL) \
108
if (strval != NULL) \
109
*strval = state->word; \
110
if (lenval != NULL) \
111
*lenval = curpos - state->word; \
112
if (endptr != NULL) \
113
*endptr = state->prsbuf; \
118
/*
 * State codes used in gettoken_tsvector.  The per-state notes below
 * summarize the branches of gettoken_tsvector that test each code.
 */
120
#define WAITENDWORD 2 /* inside an unquoted word; space, end of input, an operator delimiter or ':' ends it */
121
#define WAITNEXTCHAR 3 /* just saw a backslash: copy the next character, then go back to the saved state */
122
#define WAITENDCMPLX 4 /* inside a single-quoted word */
123
#define WAITPOSINFO 5 /* quoted word finished; a ':' here starts position info */
125
#define WAITPOSDELIM 7 /* after a position number: ',' or a weight letter (a-d, A-D, or '*') may follow */
126
#define WAITCHARCMPLX 8 /* saw a quote inside a quoted word; a second quote is copied as a literal quote */
128
/* Shorthand for prssyntaxerror(); needs a variable named 'state' in scope at the point of use. */
#define PRSSYNTAXERROR prssyntaxerror(state)
131
/*
 * Reports a syntax error with code ERRCODE_SYNTAX_ERROR.  The message
 * quotes state->bufstart and names either "tsquery" or "tsvector".
 * NOTE(review): the condition choosing between the two messages is not
 * visible here; presumably it is state->is_tsquery -- confirm.
 */
prssyntaxerror(TSVectorParseState state)
134
(errcode(ERRCODE_SYNTAX_ERROR),
136
errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
137
errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
142
* Get next token from string being parsed. Returns true if successful,
143
* false if end of input string is reached. On success, these output
144
* parameters are filled in:
146
* *strval pointer to token
147
* *lenval length of *strval
148
* *pos_ptr pointer to a palloc'd array of positions and weights
149
* associated with the token. If the caller is not interested
150
* in the information, NULL can be supplied. Otherwise
151
* the caller is responsible for pfreeing the array.
152
* *poslen number of elements in *pos_ptr
153
* *endptr scan resumption point
155
* Pass NULL for unwanted output parameters.
158
gettoken_tsvector(TSVectorParseState state,
159
char **strval, int *lenval,
160
WordEntryPos **pos_ptr, int *poslen,
164
char *curpos = state->word;
165
int statecode = WAITWORD;
168
* pos is for collecting the comma delimited list of positions followed by
171
WordEntryPos *pos = NULL;
172
int npos = 0; /* elements of pos used */
173
int posalen = 0; /* allocated size of pos */
177
if (statecode == WAITWORD)
179
if (*(state->prsbuf) == '\0')
181
else if (t_iseq(state->prsbuf, '\''))
182
statecode = WAITENDCMPLX;
183
else if (t_iseq(state->prsbuf, '\\'))
185
statecode = WAITNEXTCHAR;
186
oldstate = WAITENDWORD;
188
else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
190
else if (!t_isspace(state->prsbuf))
192
COPYCHAR(curpos, state->prsbuf);
193
curpos += pg_mblen(state->prsbuf);
194
statecode = WAITENDWORD;
197
else if (statecode == WAITNEXTCHAR)
199
if (*(state->prsbuf) == '\0')
201
(errcode(ERRCODE_SYNTAX_ERROR),
202
errmsg("there is no escaped character: \"%s\"",
207
COPYCHAR(curpos, state->prsbuf);
208
curpos += pg_mblen(state->prsbuf);
209
Assert(oldstate != 0);
210
statecode = oldstate;
213
else if (statecode == WAITENDWORD)
215
if (t_iseq(state->prsbuf, '\\'))
217
statecode = WAITNEXTCHAR;
218
oldstate = WAITENDWORD;
220
else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
221
(state->oprisdelim && ISOPERATOR(state->prsbuf)))
224
if (curpos == state->word)
229
else if (t_iseq(state->prsbuf, ':'))
231
if (curpos == state->word)
234
if (state->oprisdelim)
237
statecode = INPOSINFO;
242
COPYCHAR(curpos, state->prsbuf);
243
curpos += pg_mblen(state->prsbuf);
246
else if (statecode == WAITENDCMPLX)
248
if (t_iseq(state->prsbuf, '\''))
250
statecode = WAITCHARCMPLX;
252
else if (t_iseq(state->prsbuf, '\\'))
254
statecode = WAITNEXTCHAR;
255
oldstate = WAITENDCMPLX;
257
else if (*(state->prsbuf) == '\0')
262
COPYCHAR(curpos, state->prsbuf);
263
curpos += pg_mblen(state->prsbuf);
266
else if (statecode == WAITCHARCMPLX)
268
if (t_iseq(state->prsbuf, '\''))
271
COPYCHAR(curpos, state->prsbuf);
272
curpos += pg_mblen(state->prsbuf);
273
statecode = WAITENDCMPLX;
279
if (curpos == state->word)
281
if (state->oprisdelim)
283
/* state->prsbuf+=pg_mblen(state->prsbuf); */
287
statecode = WAITPOSINFO;
288
continue; /* recheck current character */
291
else if (statecode == WAITPOSINFO)
293
if (t_iseq(state->prsbuf, ':'))
294
statecode = INPOSINFO;
298
else if (statecode == INPOSINFO)
300
if (t_isdigit(state->prsbuf))
305
pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
308
else if (npos + 1 >= posalen)
311
pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
314
WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
315
/* we cannot get here in tsquery, so no need for 2 errmsgs */
316
if (WEP_GETPOS(pos[npos - 1]) == 0)
318
(errcode(ERRCODE_SYNTAX_ERROR),
319
errmsg("wrong position info in tsvector: \"%s\"",
321
WEP_SETWEIGHT(pos[npos - 1], 0);
322
statecode = WAITPOSDELIM;
327
else if (statecode == WAITPOSDELIM)
329
if (t_iseq(state->prsbuf, ','))
330
statecode = INPOSINFO;
331
else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
333
if (WEP_GETWEIGHT(pos[npos - 1]))
335
WEP_SETWEIGHT(pos[npos - 1], 3);
337
else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
339
if (WEP_GETWEIGHT(pos[npos - 1]))
341
WEP_SETWEIGHT(pos[npos - 1], 2);
343
else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
345
if (WEP_GETWEIGHT(pos[npos - 1]))
347
WEP_SETWEIGHT(pos[npos - 1], 1);
349
else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
351
if (WEP_GETWEIGHT(pos[npos - 1]))
353
WEP_SETWEIGHT(pos[npos - 1], 0);
355
else if (t_isspace(state->prsbuf) ||
356
*(state->prsbuf) == '\0')
358
else if (!t_isdigit(state->prsbuf))
361
else /* internal error */
362
elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
366
state->prsbuf += pg_mblen(state->prsbuf);