~vcs-imports/mammoth-replicator/trunk

« back to all changes in this revision

Viewing changes to contrib/fulltextindex/fti.c

  • Committer: alvherre
  • Date: 2005-12-16 21:24:52 UTC
  • Revision ID: svn-v4:db760fc0-0f08-0410-9d63-cc6633f64896:trunk:1
Initial import of the REL8_0_3 sources from the Pgsql CVS repository.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#include "postgres.h"
 
2
 
 
3
#include <ctype.h>
 
4
 
 
5
#include "executor/spi.h"
 
6
#include "commands/trigger.h"
 
7
 
 
8
/*
 
9
 *      Trigger function accepts variable number of arguments:
 
10
 *
 
11
 *              1. relation in which to store the substrings
 
12
 *              2. fields to extract substrings from
 
13
 *
 
14
 *      The relation in which to insert *must* have the following layout:
 
15
 *
 
16
 *              string          varchar(#)
 
17
 *              id                      oid
 
18
 *
 
19
 *       where # is the largest size of the varchar columns being indexed
 
20
 *
 
21
 *      Example:
 
22
 *
 
23
 *      -- Create the SQL function based on the compiled shared object
 
24
 *      create function fti() returns trigger as
 
25
 *        '/usr/local/pgsql/lib/contrib/fti.so' language 'C';
 
26
 *
 
27
 *      -- Create the FTI table
 
28
 *      create table product_fti (string varchar(255), id oid) without oids;
 
29
 *
 
30
 *      -- Create an index to assist string matches
 
31
 *      create index product_fti_string_idx on product_fti (string);
 
32
 *
 
33
 *      -- Create an index to assist trigger'd deletes
 
34
 *      create index product_fti_id_idx on product_fti (id);
 
35
 *
 
36
 *      -- Create an index on the product oid column to assist joins
 
37
 *      -- between the fti table and the product table
 
38
 *      create index product_oid_idx on product (oid);
 
39
 *
 
40
 *      -- Create the trigger to perform incremental changes to the full text index.
 
41
 *      create trigger product_fti_trig after update or insert or delete on product
 
42
 *      for each row execute procedure fti(product_fti, title, artist);
 
43
 *                                                                         ^^^^^^^^^^^
 
44
 *                                                                         table where full text index is stored
 
45
 *                                                                                                      ^^^^^^^^^^^^^
 
46
 *                                                                                                      columns to index in the base table
 
47
 *
 
48
 *      After populating 'product', try something like:
 
49
 *
 
50
 *      SELECT DISTINCT(p.*) FROM product p, product_fti f1, product_fti f2 WHERE
 
51
 *      f1.string ~ '^slippery' AND f2.string ~ '^wet' AND p.oid=f1.id AND p.oid=f2.id;
 
52
 *
 
53
 *      To check that your indexes are being used correctly, make sure you
 
54
 *      EXPLAIN SELECT ... your test query above.
 
55
 *
 
56
 * CHANGELOG
 
57
 * ---------
 
58
 *
 
59
 *      august 3 2001
 
60
 *                               Extended fti function to accept more than one column as a
 
61
 *                               parameter and all specified columns are indexed.  Changed
 
62
 *                               all uses of sprintf to snprintf.  Made error messages more
 
63
 *                               consistent.
 
64
 *
 
65
 *      march 4 1998 Changed breakup() to return less substrings. Only breakup
 
66
 *                               in word parts which are in turn shortened from the start
 
67
 *                               of the word (ie. word, ord, rd)
 
68
 *                               Did allocation of substring buffer outside of breakup()
 
69
 *
 
70
 *      oct. 5 1997, fixed a bug in string breakup (where there are more nonalpha
 
71
 *                               characters between words than 1).
 
72
 *
 
73
 *      oct 4-5 1997 implemented the thing, at least the basic functionality
 
74
 *                               of it all....
 
75
 *
 
76
 * TODO
 
77
 * ----
 
78
 *
 
79
 *       prevent generating duplicate words for an oid in the fti table
 
80
 *       save a plan for deletes
 
81
 *       create a function that will make the index *after* we have populated
 
82
 *       the main table (probably first delete all contents to be sure there's
 
83
 *       nothing in it, then re-populate the fti-table)
 
84
 *
 
85
 *       can we do something with operator overloading or a separate function
 
86
 *       that can build the final query automagically?
 
87
 */
 
88
 
 
89
#define MAX_FTI_QUERY_LENGTH 8192
 
90
 
 
91
extern Datum fti(PG_FUNCTION_ARGS);
 
92
static char *breakup(char *, char *);
 
93
static bool is_stopword(char *);
 
94
 
 
95
static bool new_tuple = false;
 
96
 
 
97
 
 
98
#ifdef USE_STOP_WORDS
 
99
 
 
100
/* THIS LIST MUST BE IN SORTED ORDER, A BINARY SEARCH IS USED!!!! */
 
101
char       *StopWords[] = {             /* list of words to skip in indexing */
 
102
        "no",
 
103
        "the",
 
104
        "yes"
 
105
};
 
106
#endif   /* USE_STOP_WORDS */
 
107
 
 
108
/* stuff for caching query-plans, stolen from contrib/spi/\*.c */
 
109
/*
 * EPlan: one entry of the saved-plan cache.  Entries live in the
 * InsertPlans / DeletePlans arrays and are looked up by find_plan(),
 * which compares 'ident' with strcmp().
 */
typedef struct
 
110
{
 
111
        char       *ident;              /* cache key; malloc'd copy made by find_plan() */
 
112
        int                     nplans;         /* 0 until a plan is saved, then set to 1 by fti() */
 
113
        void      **splan;              /* malloc'd array holding the SPI_saveplan() result */
 
114
}       EPlan;
 
115
 
 
116
static EPlan *InsertPlans = NULL;
 
117
static EPlan *DeletePlans = NULL;
 
118
static int      nInsertPlans = 0;
 
119
static int      nDeletePlans = 0;
 
120
 
 
121
static EPlan *find_plan(char *ident, EPlan ** eplan, int *nplans);
 
122
 
 
123
/***********************************************************************/
 
124
PG_FUNCTION_INFO_V1(fti);
 
125
 
 
126
Datum
 
127
fti(PG_FUNCTION_ARGS)
 
128
{
 
129
        TriggerData *trigdata;
 
130
        Trigger    *trigger;            /* to get trigger name */
 
131
        int                     nargs;                  /* # of arguments */
 
132
        char      **args;                       /* arguments */
 
133
        char       *relname;            /* triggered relation name */
 
134
        Relation        rel;                    /* triggered relation */
 
135
        char       *indexname;          /* name of table for substrings */
 
136
        HeapTuple       rettuple = NULL;
 
137
        TupleDesc       tupdesc;                /* tuple description */
 
138
        bool            isinsert = false;
 
139
        bool            isdelete = false;
 
140
        int                     ret;
 
141
        char            query[MAX_FTI_QUERY_LENGTH];
 
142
        Oid                     oid;
 
143
 
 
144
        /*
 
145
         * FILE          *debug;
 
146
         */
 
147
 
 
148
        /*
 
149
         * debug = fopen("/dev/xconsole", "w"); fprintf(debug, "FTI: entered
 
150
         * function\n"); fflush(debug);
 
151
         */
 
152
 
 
153
        if (!CALLED_AS_TRIGGER(fcinfo))
 
154
                /* internal error */
 
155
                elog(ERROR, "not fired by trigger manager");
 
156
 
 
157
        /* It's safe to cast now that we've checked */
 
158
        trigdata = (TriggerData *) fcinfo->context;
 
159
 
 
160
        if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event))
 
161
                ereport(ERROR,
 
162
                                (errcode(ERRCODE_TRIGGERED_ACTION_EXCEPTION),
 
163
                                 errmsg("can't process STATEMENT events")));
 
164
 
 
165
        if (TRIGGER_FIRED_BEFORE(trigdata->tg_event))
 
166
                ereport(ERROR,
 
167
                                (errcode(ERRCODE_TRIGGERED_ACTION_EXCEPTION),
 
168
                                 errmsg("must be fired AFTER event")));
 
169
 
 
170
        if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
 
171
                isinsert = true;
 
172
        if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event))
 
173
        {
 
174
                isdelete = true;
 
175
                isinsert = true;
 
176
        }
 
177
        if (TRIGGER_FIRED_BY_DELETE(trigdata->tg_event))
 
178
                isdelete = true;
 
179
 
 
180
        trigger = trigdata->tg_trigger;
 
181
        rel = trigdata->tg_relation;
 
182
        relname = SPI_getrelname(rel);
 
183
        rettuple = trigdata->tg_trigtuple;
 
184
        if (isdelete && isinsert)       /* is an UPDATE */
 
185
                rettuple = trigdata->tg_newtuple;
 
186
 
 
187
        if ((ret = SPI_connect()) < 0)
 
188
                /* internal error */
 
189
                elog(ERROR, "SPI_connect failed, returned %d", ret);
 
190
 
 
191
        nargs = trigger->tgnargs;
 
192
        if (nargs < 2)
 
193
                ereport(ERROR,
 
194
                                (errcode(ERRCODE_TRIGGERED_ACTION_EXCEPTION),
 
195
                                 errmsg("fti trigger must have at least 2 arguments")));
 
196
 
 
197
        args = trigger->tgargs;
 
198
        indexname = args[0];
 
199
        tupdesc = rel->rd_att;          /* what the tuple looks like (?) */
 
200
 
 
201
        /* get oid of current tuple, needed by all, so place here */
 
202
        oid = HeapTupleGetOid(rettuple);
 
203
        if (!OidIsValid(oid))
 
204
                ereport(ERROR,
 
205
                                (errcode(ERRCODE_UNDEFINED_COLUMN),
 
206
                                 errmsg("OID is not present"),
 
207
                                 errhint("Full Text Index requires indexed tables be created WITH OIDS.")));
 
208
 
 
209
        if (isdelete)
 
210
        {
 
211
                void       *pplan;
 
212
                Oid                *argtypes;
 
213
                Datum           values[1];
 
214
                EPlan      *plan;
 
215
                int                     i;
 
216
 
 
217
                snprintf(query, MAX_FTI_QUERY_LENGTH, "D%s", indexname);
 
218
                for (i = 1; i < nargs; i++)
 
219
                        snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]);
 
220
 
 
221
                plan = find_plan(query, &DeletePlans, &nDeletePlans);
 
222
                if (plan->nplans <= 0)
 
223
                {
 
224
                        argtypes = (Oid *) palloc(sizeof(Oid));
 
225
 
 
226
                        argtypes[0] = OIDOID;
 
227
 
 
228
                        snprintf(query, MAX_FTI_QUERY_LENGTH, "DELETE FROM %s WHERE id = $1", indexname);
 
229
                        pplan = SPI_prepare(query, 1, argtypes);
 
230
                        if (!pplan)
 
231
                                /* internal error */
 
232
                                elog(ERROR, "SPI_prepare returned NULL in delete");
 
233
                        pplan = SPI_saveplan(pplan);
 
234
                        if (pplan == NULL)
 
235
                                /* internal error */
 
236
                                elog(ERROR, "SPI_saveplan returned NULL in delete");
 
237
 
 
238
                        plan->splan = (void **) malloc(sizeof(void *));
 
239
                        *(plan->splan) = pplan;
 
240
                        plan->nplans = 1;
 
241
                }
 
242
 
 
243
                values[0] = oid;
 
244
 
 
245
                ret = SPI_execp(*(plan->splan), values, NULL, 0);
 
246
                if (ret != SPI_OK_DELETE)
 
247
                        ereport(ERROR,
 
248
                                        (errcode(ERRCODE_TRIGGERED_ACTION_EXCEPTION),
 
249
                                         errmsg("error executing delete")));
 
250
        }
 
251
 
 
252
        if (isinsert)
 
253
        {
 
254
                char       *substring;
 
255
                char       *column;
 
256
                void       *pplan;
 
257
                Oid                *argtypes;
 
258
                Datum           values[2];
 
259
                int                     colnum;
 
260
                struct varlena *data;
 
261
                EPlan      *plan;
 
262
                int                     i;
 
263
                char       *buff;
 
264
                char       *string;
 
265
 
 
266
                snprintf(query, MAX_FTI_QUERY_LENGTH, "I%s", indexname);
 
267
                for (i = 1; i < nargs; i++)
 
268
                        snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]);
 
269
 
 
270
                plan = find_plan(query, &InsertPlans, &nInsertPlans);
 
271
 
 
272
                /* no plan yet, so allocate mem for argtypes */
 
273
                if (plan->nplans <= 0)
 
274
                {
 
275
                        argtypes = (Oid *) palloc(2 * sizeof(Oid));
 
276
 
 
277
                        argtypes[0] = VARCHAROID;       /* create table t_name (string
 
278
                                                                                 * varchar, */
 
279
                        argtypes[1] = OIDOID;           /* id     oid);    */
 
280
 
 
281
                        /* prepare plan to gain speed */
 
282
                        snprintf(query, MAX_FTI_QUERY_LENGTH, "INSERT INTO %s (string, id) VALUES ($1, $2)",
 
283
                                         indexname);
 
284
                        pplan = SPI_prepare(query, 2, argtypes);
 
285
                        if (!pplan)
 
286
                                /* internal error */
 
287
                                elog(ERROR, "SPI_prepare returned NULL in insert");
 
288
 
 
289
                        pplan = SPI_saveplan(pplan);
 
290
                        if (pplan == NULL)
 
291
                                /* internal error */
 
292
                                elog(ERROR, "SPI_saveplan returned NULL in insert");
 
293
 
 
294
                        plan->splan = (void **) malloc(sizeof(void *));
 
295
                        *(plan->splan) = pplan;
 
296
                        plan->nplans = 1;
 
297
                }
 
298
 
 
299
                /* prepare plan for query */
 
300
                for (i = 0; i < nargs - 1; i++)
 
301
                {
 
302
                        colnum = SPI_fnumber(tupdesc, args[i + 1]);
 
303
                        if (colnum == SPI_ERROR_NOATTRIBUTE)
 
304
                                ereport(ERROR,
 
305
                                                (errcode(ERRCODE_UNDEFINED_COLUMN),
 
306
                                                 errmsg("column \"%s\" of \"%s\" does not exist",
 
307
                                                                args[i + 1], indexname)));
 
308
 
 
309
                        /* Get the char* representation of the column */
 
310
                        column = SPI_getvalue(rettuple, tupdesc, colnum);
 
311
 
 
312
                        /* make sure we don't try to index NULL's */
 
313
                        if (column)
 
314
                        {
 
315
                                string = column;
 
316
                                while (*string != '\0')
 
317
                                {
 
318
                                        *string = tolower((unsigned char) *string);
 
319
                                        string++;
 
320
                                }
 
321
 
 
322
                                data = (struct varlena *) palloc(sizeof(int32) + strlen(column) +1);
 
323
                                buff = palloc(strlen(column) + 1);
 
324
                                /* saves lots of calls in while-loop and in breakup() */
 
325
 
 
326
                                new_tuple = true;
 
327
 
 
328
                                while ((substring = breakup(column, buff)))
 
329
                                {
 
330
                                        int                     l;
 
331
 
 
332
                                        l = strlen(substring);
 
333
 
 
334
                                        data->vl_len = l + sizeof(int32);
 
335
                                        memcpy(VARDATA(data), substring, l);
 
336
                                        values[0] = PointerGetDatum(data);
 
337
                                        values[1] = oid;
 
338
 
 
339
                                        ret = SPI_execp(*(plan->splan), values, NULL, 0);
 
340
                                        if (ret != SPI_OK_INSERT)
 
341
                                                ereport(ERROR,
 
342
                                                        (errcode(ERRCODE_TRIGGERED_ACTION_EXCEPTION),
 
343
                                                         errmsg("error executing insert")));
 
344
                                }
 
345
                                pfree(buff);
 
346
                                pfree(data);
 
347
                        }
 
348
                }
 
349
        }
 
350
 
 
351
        SPI_finish();
 
352
        return PointerGetDatum(rettuple);
 
353
}
 
354
 
 
355
static char *
 
356
breakup(char *string, char *substring)
 
357
{
 
358
        static char *last_start;
 
359
        static char *cur_pos;
 
360
 
 
361
        if (new_tuple)
 
362
        {
 
363
                cur_pos = last_start = &string[strlen(string) - 1];
 
364
                new_tuple = false;              /* don't initialize this next time */
 
365
        }
 
366
 
 
367
        while (cur_pos > string)        /* don't read before start of 'string' */
 
368
        {
 
369
                /*
 
370
                 * skip pieces at the end of a string that are not alfa-numeric
 
371
                 * (ie. 'string$%^&', last_start first points to '&', and after
 
372
                 * this to 'g'
 
373
                 */
 
374
                if (!isalnum((unsigned char) *last_start))
 
375
                {
 
376
                        while (!isalnum((unsigned char) *last_start) &&
 
377
                                   last_start > string)
 
378
                                last_start--;
 
379
                        cur_pos = last_start;
 
380
                }
 
381
 
 
382
                cur_pos--;                              /* substrings are at minimum 2 characters
 
383
                                                                 * long */
 
384
 
 
385
                if (isalnum((unsigned char) *cur_pos))
 
386
                {
 
387
                        /* Houston, we have a substring! :) */
 
388
                        memcpy(substring, cur_pos, last_start - cur_pos + 1);
 
389
                        substring[last_start - cur_pos + 1] = '\0';
 
390
                        if (!is_stopword(substring))
 
391
                                return substring;
 
392
                }
 
393
                else
 
394
                {
 
395
                        last_start = cur_pos - 1;
 
396
                        cur_pos = last_start;
 
397
                }
 
398
        }
 
399
 
 
400
        return NULL;                            /* we've processed all of 'string' */
 
401
}
 
402
 
 
403
/* copied from src/backend/parser/keywords.c and adjusted for our situation*/
 
404
/*
 * is_stopword
 *
 * Report whether 'text' is one of the words in StopWords.  The table is
 * searched with a binary search, so it must be kept sorted.  When
 * USE_STOP_WORDS is not defined, nothing is ever a stop word.
 */
static bool
is_stopword(char *text)
{
#ifdef USE_STOP_WORDS
	char	  **lo = &StopWords[0];		/* first candidate */
	char	  **hi = endof(StopWords);	/* one past the last candidate */

	while (lo < hi)
	{
		char	  **mid = lo + (hi - lo) / 2;
		int			cmp = strcmp(*mid, text);

		if (cmp > 0)
			hi = mid;			/* table entry sorts after text */
		else if (cmp < 0)
			lo = mid + 1;		/* table entry sorts before text */
		else
			return (true);
	}
#endif   /* USE_STOP_WORDS */

	return (false);
}
 
433
 
 
434
/* for caching of query plans, stolen from contrib/spi/\*.c */
 
435
/*
 * find_plan
 *
 * Look up the cache entry whose ident equals 'ident' in the array *eplan
 * holding *nplans entries.  If there is none, grow the array by one and
 * append a fresh entry (nplans = 0, splan = NULL) carrying a private copy
 * of 'ident'.
 *
 * ident:  key to search for (compared with strcmp).
 * eplan:  in/out, address of the entry array; updated when it is
 *         (re)allocated.
 * nplans: in/out, number of entries in *eplan; incremented when an entry
 *         is added.
 *
 * Returns a pointer to the found or newly created entry.  The array and
 * the ident copies are allocated with malloc/realloc and are not freed
 * here.  Raises an out-of-memory ERROR if an allocation fails; in that
 * case *eplan and *nplans are left unchanged.
 */
static EPlan *
find_plan(char *ident, EPlan ** eplan, int *nplans)
{
	EPlan	   *newp;
	EPlan	   *grown;
	char	   *identcopy;
	size_t		identlen;
	int			count;
	int			i;

	/* a non-positive count means "no cache yet", as in the original */
	count = (*nplans > 0) ? *nplans : 0;

	for (i = 0; i < count; i++)
	{
		if (strcmp((*eplan)[i].ident, ident) == 0)
			return (*eplan + i);
	}

	/* not cached: allocate everything before touching the caller's state */
	identlen = strlen(ident) + 1;
	identcopy = (char *) malloc(identlen);
	if (identcopy == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));

	if (count > 0)
		grown = (EPlan *) realloc(*eplan, (count + 1) * sizeof(EPlan));
	else
		grown = (EPlan *) malloc(sizeof(EPlan));

	if (grown == NULL)
	{
		/* on realloc failure the old array is still valid and still owned */
		free(identcopy);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	}

	*eplan = grown;
	newp = grown + count;

	memcpy(identcopy, ident, identlen);
	newp->ident = identcopy;
	newp->nplans = 0;
	newp->splan = NULL;
	*nplans = count + 1;

	return (newp);
}