1
/*-------------------------------------------------------------------------
4
* Selectivity estimation functions for text search operators.
6
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
10
* src/backend/tsearch/ts_selfuncs.c
12
*-------------------------------------------------------------------------
16
#include "catalog/pg_statistic.h"
17
#include "catalog/pg_type.h"
18
#include "miscadmin.h"
19
#include "nodes/nodes.h"
20
#include "tsearch/ts_type.h"
21
#include "utils/lsyscache.h"
22
#include "utils/selfuncs.h"
23
#include "utils/syscache.h"
27
* The default text search selectivity is chosen to be small enough to
28
* encourage indexscans for typical table densities. See selfuncs.h and
29
* DEFAULT_EQ_SEL for details.
31
#define DEFAULT_TS_MATCH_SEL 0.005
33
/* lookup table type for binary searching through MCELEMs */
40
/* type of keys for bsearch'ing through an array of TextFreqs */
47
static Selectivity tsquerysel(VariableStatData *vardata, Datum constval);
48
static Selectivity mcelem_tsquery_selec(TSQuery query,
49
Datum *mcelem, int nmcelem,
50
float4 *numbers, int nnumbers);
51
static Selectivity tsquery_opr_selec(QueryItem *item, char *operand,
52
TextFreq *lookup, int length, float4 minfreq);
53
static int compare_lexeme_textfreq(const void *e1, const void *e2);
55
#define tsquery_opr_selec_no_stats(query) \
56
tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), NULL, 0, 0)
60
* tsmatchsel -- Selectivity of "@@"
62
* restriction selectivity function for tsvector @@ tsquery and
66
tsmatchsel(PG_FUNCTION_ARGS)
68
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
71
Oid operator = PG_GETARG_OID(1);
73
List *args = (List *) PG_GETARG_POINTER(2);
74
int varRelid = PG_GETARG_INT32(3);
75
VariableStatData vardata;
81
* If expression is not variable = something or something = variable, then
82
* punt and return a default estimate.
84
if (!get_restriction_variable(root, args, varRelid,
85
&vardata, &other, &varonleft))
86
PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
89
* Can't do anything useful if the something is not a constant, either.
91
if (!IsA(other, Const))
93
ReleaseVariableStats(vardata);
94
PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
98
* The "@@" operator is strict, so we can cope with NULL right away
100
if (((Const *) other)->constisnull)
102
ReleaseVariableStats(vardata);
103
PG_RETURN_FLOAT8(0.0);
107
* OK, there's a Var and a Const we're dealing with here. We need the
108
* Const to be a TSQuery, else we can't do anything useful. We have to
109
* check this because the Var might be the TSQuery not the TSVector.
111
if (((Const *) other)->consttype == TSQUERYOID)
113
/* tsvector @@ tsquery or the other way around */
114
Assert(vardata.vartype == TSVECTOROID);
116
selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
120
/* If we can't see the query structure, must punt */
121
selec = DEFAULT_TS_MATCH_SEL;
124
ReleaseVariableStats(vardata);
126
CLAMP_PROBABILITY(selec);
128
PG_RETURN_FLOAT8((float8) selec);
133
* tsmatchjoinsel -- join selectivity of "@@"
135
* join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
138
tsmatchjoinsel(PG_FUNCTION_ARGS)
140
/* for the moment we just punt */
141
PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
146
* @@ selectivity for tsvector var vs tsquery constant
149
tsquerysel(VariableStatData *vardata, Datum constval)
154
/* The caller made sure the const is a TSQuery, so get it now */
155
query = DatumGetTSQuery(constval);
157
/* Empty query matches nothing */
158
if (query->size == 0)
159
return (Selectivity) 0.0;
161
if (HeapTupleIsValid(vardata->statsTuple))
163
Form_pg_statistic stats;
169
stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
171
/* MCELEM will be an array of TEXT elements for a tsvector column */
172
if (get_attstatsslot(vardata->statsTuple,
174
STATISTIC_KIND_MCELEM, InvalidOid,
177
&numbers, &nnumbers))
180
* There is a most-common-elements slot for the tsvector Var, so
183
selec = mcelem_tsquery_selec(query, values, nvalues,
185
free_attstatsslot(TEXTOID, values, nvalues, numbers, nnumbers);
189
/* No most-common-elements info, so do without */
190
selec = tsquery_opr_selec_no_stats(query);
194
* MCE stats count only non-null rows, so adjust for null rows.
196
selec *= (1.0 - stats->stanullfrac);
200
/* No stats at all, so do without */
201
selec = tsquery_opr_selec_no_stats(query);
202
/* we assume no nulls here, so no stanullfrac correction */
209
* Extract data from the pg_statistic arrays into useful format.
212
mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
213
float4 *numbers, int nnumbers)
221
* There should be two more Numbers than Values, because the last two
222
* cells are taken for minimal and maximal frequency. Punt if not.
224
if (nnumbers != nmcelem + 2)
225
return tsquery_opr_selec_no_stats(query);
228
* Transpose the data into a single array so we can use bsearch().
230
lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
231
for (i = 0; i < nmcelem; i++)
234
* The text Datums came from an array, so it cannot be compressed or
235
* stored out-of-line -- it's safe to use VARSIZE_ANY*.
237
Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i]));
238
lookup[i].element = (text *) DatumGetPointer(mcelem[i]);
239
lookup[i].frequency = numbers[i];
243
* Grab the lowest frequency. compute_tsvector_stats() stored it for us in
244
* the one before the last cell of the Numbers array. See ts_typanalyze.c
246
minfreq = numbers[nnumbers - 2];
248
selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
257
* Traverse the tsquery in preorder, calculating selectivity as:
259
* selec(left_oper) * selec(right_oper) in AND nodes,
261
* selec(left_oper) + selec(right_oper) -
262
* selec(left_oper) * selec(right_oper) in OR nodes,
264
* 1 - select(oper) in NOT nodes
266
* histogram-based estimation in prefix VAL nodes
268
* freq[val] in exact VAL nodes, if the value is in MCELEM
269
* min(freq[MCELEM]) / 2 in VAL nodes, if it is not
271
* The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
272
* binary search for determining freq[MCELEM].
274
* If we don't have stats for the tsvector, we still use this logic,
275
* except we use default estimates for VAL nodes. This case is signaled
279
tsquery_opr_selec(QueryItem *item, char *operand,
280
TextFreq *lookup, int length, float4 minfreq)
284
/* since this function recurses, it could be driven to stack overflow */
287
if (item->type == QI_VAL)
289
QueryOperand *oper = (QueryOperand *) item;
293
* Prepare the key for bsearch().
295
key.lexeme = operand + oper->distance;
296
key.length = oper->length;
300
/* Prefix match, ie the query item is lexeme:* */
306
* Our strategy is to scan through the MCV list and add up the
307
* frequencies of the ones that match the prefix, thereby assuming
308
* that the MCVs are representative of the whole lexeme population
309
* in this respect. Compare histogram_selectivity().
311
* This is only a good plan if we have a pretty fair number of
312
* MCVs available; we set the threshold at 100. If no stats or
313
* insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
315
if (lookup == NULL || length < 100)
316
return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
318
matched = allmcvs = 0;
319
for (i = 0; i < length; i++)
321
TextFreq *t = lookup + i;
322
int tlen = VARSIZE_ANY_EXHDR(t->element);
324
if (tlen >= key.length &&
325
strncmp(key.lexeme, VARDATA_ANY(t->element),
327
matched += t->frequency;
328
allmcvs += t->frequency;
331
if (allmcvs > 0) /* paranoia about zero divide */
332
selec = matched / allmcvs;
334
selec = (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
337
* In any case, never believe that a prefix match has selectivity
338
* less than DEFAULT_TS_MATCH_SEL.
340
selec = Max(DEFAULT_TS_MATCH_SEL, selec);
344
/* Regular exact lexeme match */
347
/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
349
return (Selectivity) DEFAULT_TS_MATCH_SEL;
351
searchres = (TextFreq *) bsearch(&key, lookup, length,
353
compare_lexeme_textfreq);
358
* The element is in MCELEM. Return precise selectivity (or
359
* at least as precise as ANALYZE could find out).
361
selec = searchres->frequency;
366
* The element is not in MCELEM. Punt, but assume that the
367
* selectivity cannot be more than minfreq / 2.
369
selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
375
/* Current TSQuery node is an operator */
379
switch (item->qoperator.oper)
382
selec = 1.0 - tsquery_opr_selec(item + 1, operand,
383
lookup, length, minfreq);
387
s1 = tsquery_opr_selec(item + 1, operand,
388
lookup, length, minfreq);
389
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
390
lookup, length, minfreq);
395
s1 = tsquery_opr_selec(item + 1, operand,
396
lookup, length, minfreq);
397
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
398
lookup, length, minfreq);
399
selec = s1 + s2 - s1 * s2;
403
elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
404
selec = 0; /* keep compiler quiet */
409
/* Clamp intermediate results to stay sane despite roundoff error */
410
CLAMP_PROBABILITY(selec);
416
* bsearch() comparator for a lexeme (non-NULL terminated string with length)
417
* and a TextFreq. Use length, then byte-for-byte comparison, because that's
418
* how ANALYZE code sorted data before storing it in a statistic tuple.
419
* See ts_typanalyze.c for details.
422
compare_lexeme_textfreq(const void *e1, const void *e2)
424
const LexemeKey *key = (const LexemeKey *) e1;
425
const TextFreq *t = (const TextFreq *) e2;
430
len2 = VARSIZE_ANY_EXHDR(t->element);
432
/* Compare lengths first, possibly avoiding a strncmp call */
435
else if (len1 < len2)
438
/* Fall back on byte-for-byte comparison */
439
return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);