~ubuntu-branches/ubuntu/oneiric/postgresql-9.1/oneiric-security

« back to all changes in this revision

Viewing changes to src/backend/utils/adt/selfuncs.c

  • Committer: Bazaar Package Importer
  • Author(s): Martin Pitt
  • Date: 2011-05-11 10:41:53 UTC
  • Revision ID: james.westby@ubuntu.com-20110511104153-psbh2o58553fv1m0
Tags: upstream-9.1~beta1
Import upstream version 9.1~beta1

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*-------------------------------------------------------------------------
 
2
 *
 
3
 * selfuncs.c
 
4
 *        Selectivity functions and index cost estimation functions for
 
5
 *        standard operators and index access methods.
 
6
 *
 
7
 *        Selectivity routines are registered in the pg_operator catalog
 
8
 *        in the "oprrest" and "oprjoin" attributes.
 
9
 *
 
10
 *        Index cost functions are registered in the pg_am catalog
 
11
 *        in the "amcostestimate" attribute.
 
12
 *
 
13
 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 
14
 * Portions Copyright (c) 1994, Regents of the University of California
 
15
 *
 
16
 *
 
17
 * IDENTIFICATION
 
18
 *        src/backend/utils/adt/selfuncs.c
 
19
 *
 
20
 *-------------------------------------------------------------------------
 
21
 */
 
22
 
 
23
/*----------
 
24
 * Operator selectivity estimation functions are called to estimate the
 
25
 * selectivity of WHERE clauses whose top-level operator is their operator.
 
26
 * We divide the problem into two cases:
 
27
 *              Restriction clause estimation: the clause involves vars of just
 
28
 *                      one relation.
 
29
 *              Join clause estimation: the clause involves vars of multiple rels.
 
30
 * Join selectivity estimation is far more difficult and usually less accurate
 
31
 * than restriction estimation.
 
32
 *
 
33
 * When dealing with the inner scan of a nestloop join, we consider the
 
34
 * join's joinclauses as restriction clauses for the inner relation, and
 
35
 * treat vars of the outer relation as parameters (a/k/a constants of unknown
 
36
 * values).  So, restriction estimators need to be able to accept an argument
 
37
 * telling which relation is to be treated as the variable.
 
38
 *
 
39
 * The call convention for a restriction estimator (oprrest function) is
 
40
 *
 
41
 *              Selectivity oprrest (PlannerInfo *root,
 
42
 *                                                       Oid operator,
 
43
 *                                                       List *args,
 
44
 *                                                       int varRelid);
 
45
 *
 
46
 * root: general information about the query (rtable and RelOptInfo lists
 
47
 * are particularly important for the estimator).
 
48
 * operator: OID of the specific operator in question.
 
49
 * args: argument list from the operator clause.
 
50
 * varRelid: if not zero, the relid (rtable index) of the relation to
 
51
 * be treated as the variable relation.  May be zero if the args list
 
52
 * is known to contain vars of only one relation.
 
53
 *
 
54
 * This is represented at the SQL level (in pg_proc) as
 
55
 *
 
56
 *              float8 oprrest (internal, oid, internal, int4);
 
57
 *
 
58
 * The result is a selectivity, that is, a fraction (0 to 1) of the rows
 
59
 * of the relation that are expected to produce a TRUE result for the
 
60
 * given operator.
 
61
 *
 
62
 * The call convention for a join estimator (oprjoin function) is similar
 
63
 * except that varRelid is not needed, and instead join information is
 
64
 * supplied:
 
65
 *
 
66
 *              Selectivity oprjoin (PlannerInfo *root,
 
67
 *                                                       Oid operator,
 
68
 *                                                       List *args,
 
69
 *                                                       JoinType jointype,
 
70
 *                                                       SpecialJoinInfo *sjinfo);
 
71
 *
 
72
 *              float8 oprjoin (internal, oid, internal, int2, internal);
 
73
 *
 
74
 * (Before Postgres 8.4, join estimators had only the first four of these
 
75
 * parameters.  That signature is still allowed, but deprecated.)  The
 
76
 * relationship between jointype and sjinfo is explained in the comments for
 
77
 * clause_selectivity() --- the short version is that jointype is usually
 
78
 * best ignored in favor of examining sjinfo.
 
79
 *
 
80
 * Join selectivity for regular inner and outer joins is defined as the
 
81
 * fraction (0 to 1) of the cross product of the relations that is expected
 
82
 * to produce a TRUE result for the given operator.  For both semi and anti
 
83
 * joins, however, the selectivity is defined as the fraction of the left-hand
 
84
 * side relation's rows that are expected to have a match (ie, at least one
 
85
 * row with a TRUE result) in the right-hand side.
 
86
 *----------
 
87
 */
 
88
 
 
89
#include "postgres.h"
 
90
 
 
91
#include <ctype.h>
 
92
#include <math.h>
 
93
 
 
94
#include "access/gin.h"
 
95
#include "access/sysattr.h"
 
96
#include "catalog/index.h"
 
97
#include "catalog/pg_collation.h"
 
98
#include "catalog/pg_opfamily.h"
 
99
#include "catalog/pg_statistic.h"
 
100
#include "catalog/pg_type.h"
 
101
#include "executor/executor.h"
 
102
#include "mb/pg_wchar.h"
 
103
#include "nodes/makefuncs.h"
 
104
#include "nodes/nodeFuncs.h"
 
105
#include "optimizer/clauses.h"
 
106
#include "optimizer/cost.h"
 
107
#include "optimizer/pathnode.h"
 
108
#include "optimizer/paths.h"
 
109
#include "optimizer/plancat.h"
 
110
#include "optimizer/predtest.h"
 
111
#include "optimizer/restrictinfo.h"
 
112
#include "optimizer/var.h"
 
113
#include "parser/parse_coerce.h"
 
114
#include "parser/parsetree.h"
 
115
#include "utils/builtins.h"
 
116
#include "utils/bytea.h"
 
117
#include "utils/date.h"
 
118
#include "utils/datum.h"
 
119
#include "utils/fmgroids.h"
 
120
#include "utils/lsyscache.h"
 
121
#include "utils/nabstime.h"
 
122
#include "utils/pg_locale.h"
 
123
#include "utils/selfuncs.h"
 
124
#include "utils/spccache.h"
 
125
#include "utils/syscache.h"
 
126
#include "utils/tqual.h"
 
127
 
 
128
 
 
129
/* Hooks for plugins to get control when we ask for stats */
 
130
get_relation_stats_hook_type get_relation_stats_hook = NULL;
 
131
get_index_stats_hook_type get_index_stats_hook = NULL;
 
132
 
 
133
static double var_eq_const(VariableStatData *vardata, Oid operator,
 
134
                         Datum constval, bool constisnull,
 
135
                         bool varonleft);
 
136
static double var_eq_non_const(VariableStatData *vardata, Oid operator,
 
137
                                 Node *other,
 
138
                                 bool varonleft);
 
139
static double ineq_histogram_selectivity(PlannerInfo *root,
 
140
                                                   VariableStatData *vardata,
 
141
                                                   FmgrInfo *opproc, bool isgt,
 
142
                                                   Datum constval, Oid consttype);
 
143
static double eqjoinsel_inner(Oid operator,
 
144
                                VariableStatData *vardata1, VariableStatData *vardata2);
 
145
static double eqjoinsel_semi(Oid operator,
 
146
                           VariableStatData *vardata1, VariableStatData *vardata2);
 
147
static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
 
148
                                  Datum lobound, Datum hibound, Oid boundstypid,
 
149
                                  double *scaledlobound, double *scaledhibound);
 
150
static double convert_numeric_to_scalar(Datum value, Oid typid);
 
151
static void convert_string_to_scalar(char *value,
 
152
                                                 double *scaledvalue,
 
153
                                                 char *lobound,
 
154
                                                 double *scaledlobound,
 
155
                                                 char *hibound,
 
156
                                                 double *scaledhibound);
 
157
static void convert_bytea_to_scalar(Datum value,
 
158
                                                double *scaledvalue,
 
159
                                                Datum lobound,
 
160
                                                double *scaledlobound,
 
161
                                                Datum hibound,
 
162
                                                double *scaledhibound);
 
163
static double convert_one_string_to_scalar(char *value,
 
164
                                                         int rangelo, int rangehi);
 
165
static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
 
166
                                                        int rangelo, int rangehi);
 
167
static char *convert_string_datum(Datum value, Oid typid);
 
168
static double convert_timevalue_to_scalar(Datum value, Oid typid);
 
169
static bool get_variable_range(PlannerInfo *root, VariableStatData *vardata,
 
170
                                   Oid sortop, Datum *min, Datum *max);
 
171
static bool get_actual_variable_range(PlannerInfo *root,
 
172
                                                  VariableStatData *vardata,
 
173
                                                  Oid sortop,
 
174
                                                  Datum *min, Datum *max);
 
175
static Selectivity prefix_selectivity(PlannerInfo *root,
 
176
                                   VariableStatData *vardata,
 
177
                                   Oid vartype, Oid opfamily, Const *prefixcon);
 
178
static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype);
 
179
static Datum string_to_datum(const char *str, Oid datatype);
 
180
static Const *string_to_const(const char *str, Oid datatype);
 
181
static Const *string_to_bytea_const(const char *str, size_t str_len);
 
182
 
 
183
 
 
184
/*
 
185
 *              eqsel                   - Selectivity of "=" for any data types.
 
186
 *
 
187
 * Note: this routine is also used to estimate selectivity for some
 
188
 * operators that are not "=" but have comparable selectivity behavior,
 
189
 * such as "~=" (geometric approximate-match).  Even for "=", we must
 
190
 * keep in mind that the left and right datatypes may differ.
 
191
 */
 
192
Datum
 
193
eqsel(PG_FUNCTION_ARGS)
 
194
{
 
195
        PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 
196
        Oid                     operator = PG_GETARG_OID(1);
 
197
        List       *args = (List *) PG_GETARG_POINTER(2);
 
198
        int                     varRelid = PG_GETARG_INT32(3);
 
199
        VariableStatData vardata;
 
200
        Node       *other;
 
201
        bool            varonleft;
 
202
        double          selec;
 
203
 
 
204
        /*
 
205
         * If expression is not variable = something or something = variable, then
 
206
         * punt and return a default estimate.
 
207
         */
 
208
        if (!get_restriction_variable(root, args, varRelid,
 
209
                                                                  &vardata, &other, &varonleft))
 
210
                PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
 
211
 
 
212
        /*
 
213
         * We can do a lot better if the something is a constant.  (Note: the
 
214
         * Const might result from estimation rather than being a simple constant
 
215
         * in the query.)
 
216
         */
 
217
        if (IsA(other, Const))
 
218
                selec = var_eq_const(&vardata, operator,
 
219
                                                         ((Const *) other)->constvalue,
 
220
                                                         ((Const *) other)->constisnull,
 
221
                                                         varonleft);
 
222
        else
 
223
                selec = var_eq_non_const(&vardata, operator, other,
 
224
                                                                 varonleft);
 
225
 
 
226
        ReleaseVariableStats(vardata);
 
227
 
 
228
        PG_RETURN_FLOAT8((float8) selec);
 
229
}
 
230
 
 
231
/*
 
232
 * var_eq_const --- eqsel for var = const case
 
233
 *
 
234
 * This is split out so that some other estimation functions can use it.
 
235
 */
 
236
static double
 
237
var_eq_const(VariableStatData *vardata, Oid operator,
 
238
                         Datum constval, bool constisnull,
 
239
                         bool varonleft)
 
240
{
 
241
        double          selec;
 
242
 
 
243
        /*
 
244
         * If the constant is NULL, assume operator is strict and return zero, ie,
 
245
         * operator will never return TRUE.
 
246
         */
 
247
        if (constisnull)
 
248
                return 0.0;
 
249
 
 
250
        /*
 
251
         * If we matched the var to a unique index, assume there is exactly one
 
252
         * match regardless of anything else.  (This is slightly bogus, since the
 
253
         * index's equality operator might be different from ours, but it's more
 
254
         * likely to be right than ignoring the information.)
 
255
         */
 
256
        if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0)
 
257
                return 1.0 / vardata->rel->tuples;
 
258
 
 
259
        if (HeapTupleIsValid(vardata->statsTuple))
 
260
        {
 
261
                Form_pg_statistic stats;
 
262
                Datum      *values;
 
263
                int                     nvalues;
 
264
                float4     *numbers;
 
265
                int                     nnumbers;
 
266
                bool            match = false;
 
267
                int                     i;
 
268
 
 
269
                stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
 
270
 
 
271
                /*
 
272
                 * Is the constant "=" to any of the column's most common values?
 
273
                 * (Although the given operator may not really be "=", we will assume
 
274
                 * that seeing whether it returns TRUE is an appropriate test.  If you
 
275
                 * don't like this, maybe you shouldn't be using eqsel for your
 
276
                 * operator...)
 
277
                 */
 
278
                if (get_attstatsslot(vardata->statsTuple,
 
279
                                                         vardata->atttype, vardata->atttypmod,
 
280
                                                         STATISTIC_KIND_MCV, InvalidOid,
 
281
                                                         NULL,
 
282
                                                         &values, &nvalues,
 
283
                                                         &numbers, &nnumbers))
 
284
                {
 
285
                        FmgrInfo        eqproc;
 
286
 
 
287
                        fmgr_info(get_opcode(operator), &eqproc);
 
288
 
 
289
                        for (i = 0; i < nvalues; i++)
 
290
                        {
 
291
                                /* be careful to apply operator right way 'round */
 
292
                                if (varonleft)
 
293
                                        match = DatumGetBool(FunctionCall2Coll(&eqproc,
 
294
                                                                                                                   DEFAULT_COLLATION_OID,
 
295
                                                                                                                   values[i],
 
296
                                                                                                                   constval));
 
297
                                else
 
298
                                        match = DatumGetBool(FunctionCall2Coll(&eqproc,
 
299
                                                                                                                   DEFAULT_COLLATION_OID,
 
300
                                                                                                                   constval,
 
301
                                                                                                                   values[i]));
 
302
                                if (match)
 
303
                                        break;
 
304
                        }
 
305
                }
 
306
                else
 
307
                {
 
308
                        /* no most-common-value info available */
 
309
                        values = NULL;
 
310
                        numbers = NULL;
 
311
                        i = nvalues = nnumbers = 0;
 
312
                }
 
313
 
 
314
                if (match)
 
315
                {
 
316
                        /*
 
317
                         * Constant is "=" to this common value.  We know selectivity
 
318
                         * exactly (or as exactly as ANALYZE could calculate it, anyway).
 
319
                         */
 
320
                        selec = numbers[i];
 
321
                }
 
322
                else
 
323
                {
 
324
                        /*
 
325
                         * Comparison is against a constant that is neither NULL nor any
 
326
                         * of the common values.  Its selectivity cannot be more than
 
327
                         * this:
 
328
                         */
 
329
                        double          sumcommon = 0.0;
 
330
                        double          otherdistinct;
 
331
 
 
332
                        for (i = 0; i < nnumbers; i++)
 
333
                                sumcommon += numbers[i];
 
334
                        selec = 1.0 - sumcommon - stats->stanullfrac;
 
335
                        CLAMP_PROBABILITY(selec);
 
336
 
 
337
                        /*
 
338
                         * and in fact it's probably a good deal less. We approximate that
 
339
                         * all the not-common values share this remaining fraction
 
340
                         * equally, so we divide by the number of other distinct values.
 
341
                         */
 
342
                        otherdistinct = get_variable_numdistinct(vardata) - nnumbers;
 
343
                        if (otherdistinct > 1)
 
344
                                selec /= otherdistinct;
 
345
 
 
346
                        /*
 
347
                         * Another cross-check: selectivity shouldn't be estimated as more
 
348
                         * than the least common "most common value".
 
349
                         */
 
350
                        if (nnumbers > 0 && selec > numbers[nnumbers - 1])
 
351
                                selec = numbers[nnumbers - 1];
 
352
                }
 
353
 
 
354
                free_attstatsslot(vardata->atttype, values, nvalues,
 
355
                                                  numbers, nnumbers);
 
356
        }
 
357
        else
 
358
        {
 
359
                /*
 
360
                 * No ANALYZE stats available, so make a guess using estimated number
 
361
                 * of distinct values and assuming they are equally common. (The guess
 
362
                 * is unlikely to be very good, but we do know a few special cases.)
 
363
                 */
 
364
                selec = 1.0 / get_variable_numdistinct(vardata);
 
365
        }
 
366
 
 
367
        /* result should be in range, but make sure... */
 
368
        CLAMP_PROBABILITY(selec);
 
369
 
 
370
        return selec;
 
371
}
 
372
 
 
373
/*
 
374
 * var_eq_non_const --- eqsel for var = something-other-than-const case
 
375
 */
 
376
static double
 
377
var_eq_non_const(VariableStatData *vardata, Oid operator,
 
378
                                 Node *other,
 
379
                                 bool varonleft)
 
380
{
 
381
        double          selec;
 
382
 
 
383
        /*
 
384
         * If we matched the var to a unique index, assume there is exactly one
 
385
         * match regardless of anything else.  (This is slightly bogus, since the
 
386
         * index's equality operator might be different from ours, but it's more
 
387
         * likely to be right than ignoring the information.)
 
388
         */
 
389
        if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0)
 
390
                return 1.0 / vardata->rel->tuples;
 
391
 
 
392
        if (HeapTupleIsValid(vardata->statsTuple))
 
393
        {
 
394
                Form_pg_statistic stats;
 
395
                double          ndistinct;
 
396
                float4     *numbers;
 
397
                int                     nnumbers;
 
398
 
 
399
                stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
 
400
 
 
401
                /*
 
402
                 * Search is for a value that we do not know a priori, but we will
 
403
                 * assume it is not NULL.  Estimate the selectivity as non-null
 
404
                 * fraction divided by number of distinct values, so that we get a
 
405
                 * result averaged over all possible values whether common or
 
406
                 * uncommon.  (Essentially, we are assuming that the not-yet-known
 
407
                 * comparison value is equally likely to be any of the possible
 
408
                 * values, regardless of their frequency in the table.  Is that a good
 
409
                 * idea?)
 
410
                 */
 
411
                selec = 1.0 - stats->stanullfrac;
 
412
                ndistinct = get_variable_numdistinct(vardata);
 
413
                if (ndistinct > 1)
 
414
                        selec /= ndistinct;
 
415
 
 
416
                /*
 
417
                 * Cross-check: selectivity should never be estimated as more than the
 
418
                 * most common value's.
 
419
                 */
 
420
                if (get_attstatsslot(vardata->statsTuple,
 
421
                                                         vardata->atttype, vardata->atttypmod,
 
422
                                                         STATISTIC_KIND_MCV, InvalidOid,
 
423
                                                         NULL,
 
424
                                                         NULL, NULL,
 
425
                                                         &numbers, &nnumbers))
 
426
                {
 
427
                        if (nnumbers > 0 && selec > numbers[0])
 
428
                                selec = numbers[0];
 
429
                        free_attstatsslot(vardata->atttype, NULL, 0, numbers, nnumbers);
 
430
                }
 
431
        }
 
432
        else
 
433
        {
 
434
                /*
 
435
                 * No ANALYZE stats available, so make a guess using estimated number
 
436
                 * of distinct values and assuming they are equally common. (The guess
 
437
                 * is unlikely to be very good, but we do know a few special cases.)
 
438
                 */
 
439
                selec = 1.0 / get_variable_numdistinct(vardata);
 
440
        }
 
441
 
 
442
        /* result should be in range, but make sure... */
 
443
        CLAMP_PROBABILITY(selec);
 
444
 
 
445
        return selec;
 
446
}
 
447
 
 
448
/*
 
449
 *              neqsel                  - Selectivity of "!=" for any data types.
 
450
 *
 
451
 * This routine is also used for some operators that are not "!="
 
452
 * but have comparable selectivity behavior.  See above comments
 
453
 * for eqsel().
 
454
 */
 
455
Datum
 
456
neqsel(PG_FUNCTION_ARGS)
 
457
{
 
458
        PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 
459
        Oid                     operator = PG_GETARG_OID(1);
 
460
        List       *args = (List *) PG_GETARG_POINTER(2);
 
461
        int                     varRelid = PG_GETARG_INT32(3);
 
462
        Oid                     eqop;
 
463
        float8          result;
 
464
 
 
465
        /*
 
466
         * We want 1 - eqsel() where the equality operator is the one associated
 
467
         * with this != operator, that is, its negator.
 
468
         */
 
469
        eqop = get_negator(operator);
 
470
        if (eqop)
 
471
        {
 
472
                result = DatumGetFloat8(DirectFunctionCall4(eqsel,
 
473
                                                                                                        PointerGetDatum(root),
 
474
                                                                                                        ObjectIdGetDatum(eqop),
 
475
                                                                                                        PointerGetDatum(args),
 
476
                                                                                                        Int32GetDatum(varRelid)));
 
477
        }
 
478
        else
 
479
        {
 
480
                /* Use default selectivity (should we raise an error instead?) */
 
481
                result = DEFAULT_EQ_SEL;
 
482
        }
 
483
        result = 1.0 - result;
 
484
        PG_RETURN_FLOAT8(result);
 
485
}
 
486
 
 
487
/*
 
488
 *      scalarineqsel           - Selectivity of "<", "<=", ">", ">=" for scalars.
 
489
 *
 
490
 * This is the guts of both scalarltsel and scalargtsel.  The caller has
 
491
 * commuted the clause, if necessary, so that we can treat the variable as
 
492
 * being on the left.  The caller must also make sure that the other side
 
493
 * of the clause is a non-null Const, and dissect same into a value and
 
494
 * datatype.
 
495
 *
 
496
 * This routine works for any datatype (or pair of datatypes) known to
 
497
 * convert_to_scalar().  If it is applied to some other datatype,
 
498
 * it will return a default estimate.
 
499
 */
 
500
static double
 
501
scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
 
502
                          VariableStatData *vardata, Datum constval, Oid consttype)
 
503
{
 
504
        Form_pg_statistic stats;
 
505
        FmgrInfo        opproc;
 
506
        double          mcv_selec,
 
507
                                hist_selec,
 
508
                                sumcommon;
 
509
        double          selec;
 
510
 
 
511
        if (!HeapTupleIsValid(vardata->statsTuple))
 
512
        {
 
513
                /* no stats available, so default result */
 
514
                return DEFAULT_INEQ_SEL;
 
515
        }
 
516
        stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
 
517
 
 
518
        fmgr_info(get_opcode(operator), &opproc);
 
519
 
 
520
        /*
 
521
         * If we have most-common-values info, add up the fractions of the MCV
 
522
         * entries that satisfy MCV OP CONST.  These fractions contribute directly
 
523
         * to the result selectivity.  Also add up the total fraction represented
 
524
         * by MCV entries.
 
525
         */
 
526
        mcv_selec = mcv_selectivity(vardata, &opproc, constval, true,
 
527
                                                                &sumcommon);
 
528
 
 
529
        /*
 
530
         * If there is a histogram, determine which bin the constant falls in, and
 
531
         * compute the resulting contribution to selectivity.
 
532
         */
 
533
        hist_selec = ineq_histogram_selectivity(root, vardata, &opproc, isgt,
 
534
                                                                                        constval, consttype);
 
535
 
 
536
        /*
 
537
         * Now merge the results from the MCV and histogram calculations,
 
538
         * realizing that the histogram covers only the non-null values that are
 
539
         * not listed in MCV.
 
540
         */
 
541
        selec = 1.0 - stats->stanullfrac - sumcommon;
 
542
 
 
543
        if (hist_selec >= 0.0)
 
544
                selec *= hist_selec;
 
545
        else
 
546
        {
 
547
                /*
 
548
                 * If no histogram but there are values not accounted for by MCV,
 
549
                 * arbitrarily assume half of them will match.
 
550
                 */
 
551
                selec *= 0.5;
 
552
        }
 
553
 
 
554
        selec += mcv_selec;
 
555
 
 
556
        /* result should be in range, but make sure... */
 
557
        CLAMP_PROBABILITY(selec);
 
558
 
 
559
        return selec;
 
560
}
 
561
 
 
562
/*
 
563
 *      mcv_selectivity                 - Examine the MCV list for selectivity estimates
 
564
 *
 
565
 * Determine the fraction of the variable's MCV population that satisfies
 
566
 * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft.  Also
 
567
 * compute the fraction of the total column population represented by the MCV
 
568
 * list.  This code will work for any boolean-returning predicate operator.
 
569
 *
 
570
 * The function result is the MCV selectivity, and the fraction of the
 
571
 * total population is returned into *sumcommonp.  Zeroes are returned
 
572
 * if there is no MCV list.
 
573
 */
 
574
double
 
575
mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 
576
                                Datum constval, bool varonleft,
 
577
                                double *sumcommonp)
 
578
{
 
579
        double          mcv_selec,
 
580
                                sumcommon;
 
581
        Datum      *values;
 
582
        int                     nvalues;
 
583
        float4     *numbers;
 
584
        int                     nnumbers;
 
585
        int                     i;
 
586
 
 
587
        mcv_selec = 0.0;
 
588
        sumcommon = 0.0;
 
589
 
 
590
        if (HeapTupleIsValid(vardata->statsTuple) &&
 
591
                get_attstatsslot(vardata->statsTuple,
 
592
                                                 vardata->atttype, vardata->atttypmod,
 
593
                                                 STATISTIC_KIND_MCV, InvalidOid,
 
594
                                                 NULL,
 
595
                                                 &values, &nvalues,
 
596
                                                 &numbers, &nnumbers))
 
597
        {
 
598
                for (i = 0; i < nvalues; i++)
 
599
                {
 
600
                        if (varonleft ?
 
601
                                DatumGetBool(FunctionCall2Coll(opproc,
 
602
                                                                                           DEFAULT_COLLATION_OID,
 
603
                                                                                           values[i],
 
604
                                                                                           constval)) :
 
605
                                DatumGetBool(FunctionCall2Coll(opproc,
 
606
                                                                                           DEFAULT_COLLATION_OID,
 
607
                                                                                           constval,
 
608
                                                                                           values[i])))
 
609
                                mcv_selec += numbers[i];
 
610
                        sumcommon += numbers[i];
 
611
                }
 
612
                free_attstatsslot(vardata->atttype, values, nvalues,
 
613
                                                  numbers, nnumbers);
 
614
        }
 
615
 
 
616
        *sumcommonp = sumcommon;
 
617
        return mcv_selec;
 
618
}
 
619
 
 
620
/*
 
621
 *      histogram_selectivity   - Examine the histogram for selectivity estimates
 
622
 *
 
623
 * Determine the fraction of the variable's histogram entries that satisfy
 
624
 * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft.
 
625
 *
 
626
 * This code will work for any boolean-returning predicate operator, whether
 
627
 * or not it has anything to do with the histogram sort operator.  We are
 
628
 * essentially using the histogram just as a representative sample.  However,
 
629
 * small histograms are unlikely to be all that representative, so the caller
 
630
 * should be prepared to fall back on some other estimation approach when the
 
631
 * histogram is missing or very small.  It may also be prudent to combine this
 
632
 * approach with another one when the histogram is small.
 
633
 *
 
634
 * If the actual histogram size is not at least min_hist_size, we won't bother
 
635
 * to do the calculation at all.  Also, if the n_skip parameter is > 0, we
 
636
 * ignore the first and last n_skip histogram elements, on the grounds that
 
637
 * they are outliers and hence not very representative.  Typical values for
 
638
 * these parameters are 10 and 1.
 
639
 *
 
640
 * The function result is the selectivity, or -1 if there is no histogram
 
641
 * or it's smaller than min_hist_size.
 
642
 *
 
643
 * The output parameter *hist_size receives the actual histogram size,
 
644
 * or zero if no histogram.  Callers may use this number to decide how
 
645
 * much faith to put in the function result.
 
646
 *
 
647
 * Note that the result disregards both the most-common-values (if any) and
 
648
 * null entries.  The caller is expected to combine this result with
 
649
 * statistics for those portions of the column population.      It may also be
 
650
 * prudent to clamp the result range, ie, disbelieve exact 0 or 1 outputs.
 
651
 */
 
652
double
 
653
histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 
654
                                          Datum constval, bool varonleft,
 
655
                                          int min_hist_size, int n_skip,
 
656
                                          int *hist_size)
 
657
{
 
658
        double          result;
 
659
        Datum      *values;
 
660
        int                     nvalues;
 
661
 
 
662
        /* check sanity of parameters */
 
663
        Assert(n_skip >= 0);
 
664
        Assert(min_hist_size > 2 * n_skip);
 
665
 
 
666
        if (HeapTupleIsValid(vardata->statsTuple) &&
 
667
                get_attstatsslot(vardata->statsTuple,
 
668
                                                 vardata->atttype, vardata->atttypmod,
 
669
                                                 STATISTIC_KIND_HISTOGRAM, InvalidOid,
 
670
                                                 NULL,
 
671
                                                 &values, &nvalues,
 
672
                                                 NULL, NULL))
 
673
        {
 
674
                *hist_size = nvalues;
 
675
                if (nvalues >= min_hist_size)
 
676
                {
 
677
                        int                     nmatch = 0;
 
678
                        int                     i;
 
679
 
 
680
                        for (i = n_skip; i < nvalues - n_skip; i++)
 
681
                        {
 
682
                                if (varonleft ?
 
683
                                        DatumGetBool(FunctionCall2Coll(opproc,
 
684
                                                                                                   DEFAULT_COLLATION_OID,
 
685
                                                                                                   values[i],
 
686
                                                                                                   constval)) :
 
687
                                        DatumGetBool(FunctionCall2Coll(opproc,
 
688
                                                                                                   DEFAULT_COLLATION_OID,
 
689
                                                                                                   constval,
 
690
                                                                                                   values[i])))
 
691
                                        nmatch++;
 
692
                        }
 
693
                        result = ((double) nmatch) / ((double) (nvalues - 2 * n_skip));
 
694
                }
 
695
                else
 
696
                        result = -1;
 
697
                free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
 
698
        }
 
699
        else
 
700
        {
 
701
                *hist_size = 0;
 
702
                result = -1;
 
703
        }
 
704
 
 
705
        return result;
 
706
}
 
707
 
 
708
/*
 
709
 *      ineq_histogram_selectivity      - Examine the histogram for scalarineqsel
 
710
 *
 
711
 * Determine the fraction of the variable's histogram population that
 
712
 * satisfies the inequality condition, ie, VAR < CONST or VAR > CONST.
 
713
 *
 
714
 * Returns -1 if there is no histogram (valid results will always be >= 0).
 
715
 *
 
716
 * Note that the result disregards both the most-common-values (if any) and
 
717
 * null entries.  The caller is expected to combine this result with
 
718
 * statistics for those portions of the column population.
 
719
 */
 
720
static double
 
721
ineq_histogram_selectivity(PlannerInfo *root,
 
722
                                                   VariableStatData *vardata,
 
723
                                                   FmgrInfo *opproc, bool isgt,
 
724
                                                   Datum constval, Oid consttype)
 
725
{
 
726
        double          hist_selec;
 
727
        Oid                     hist_op;
 
728
        Datum      *values;
 
729
        int                     nvalues;
 
730
 
 
731
        hist_selec = -1.0;
 
732
 
 
733
        /*
 
734
         * Someday, ANALYZE might store more than one histogram per rel/att,
 
735
         * corresponding to more than one possible sort ordering defined for the
 
736
         * column type.  However, to make that work we will need to figure out
 
737
         * which staop to search for --- it's not necessarily the one we have at
 
738
         * hand!  (For example, we might have a '<=' operator rather than the '<'
 
739
         * operator that will appear in staop.)  For now, assume that whatever
 
740
         * appears in pg_statistic is sorted the same way our operator sorts, or
 
741
         * the reverse way if isgt is TRUE.
 
742
         */
 
743
        if (HeapTupleIsValid(vardata->statsTuple) &&
 
744
                get_attstatsslot(vardata->statsTuple,
 
745
                                                 vardata->atttype, vardata->atttypmod,
 
746
                                                 STATISTIC_KIND_HISTOGRAM, InvalidOid,
 
747
                                                 &hist_op,
 
748
                                                 &values, &nvalues,
 
749
                                                 NULL, NULL))
 
750
        {
 
751
                if (nvalues > 1)
 
752
                {
 
753
                        /*
 
754
                         * Use binary search to find proper location, ie, the first slot
 
755
                         * at which the comparison fails.  (If the given operator isn't
 
756
                         * actually sort-compatible with the histogram, you'll get garbage
 
757
                         * results ... but probably not any more garbage-y than you would
 
758
                         * from the old linear search.)
 
759
                         *
 
760
                         * If the binary search accesses the first or last histogram
 
761
                         * entry, we try to replace that endpoint with the true column min
 
762
                         * or max as found by get_actual_variable_range().      This
 
763
                         * ameliorates misestimates when the min or max is moving as a
 
764
                         * result of changes since the last ANALYZE.  Note that this could
 
765
                         * result in effectively including MCVs into the histogram that
 
766
                         * weren't there before, but we don't try to correct for that.
 
767
                         */
 
768
                        double          histfrac;
 
769
                        int                     lobound = 0;    /* first possible slot to search */
 
770
                        int                     hibound = nvalues;              /* last+1 slot to search */
 
771
                        bool            have_end = false;
 
772
 
 
773
                        /*
 
774
                         * If there are only two histogram entries, we'll want up-to-date
 
775
                         * values for both.  (If there are more than two, we need at most
 
776
                         * one of them to be updated, so we deal with that within the
 
777
                         * loop.)
 
778
                         */
 
779
                        if (nvalues == 2)
 
780
                                have_end = get_actual_variable_range(root,
 
781
                                                                                                         vardata,
 
782
                                                                                                         hist_op,
 
783
                                                                                                         &values[0],
 
784
                                                                                                         &values[1]);
 
785
 
 
786
                        while (lobound < hibound)
 
787
                        {
 
788
                                int                     probe = (lobound + hibound) / 2;
 
789
                                bool            ltcmp;
 
790
 
 
791
                                /*
 
792
                                 * If we find ourselves about to compare to the first or last
 
793
                                 * histogram entry, first try to replace it with the actual
 
794
                                 * current min or max (unless we already did so above).
 
795
                                 */
 
796
                                if (probe == 0 && nvalues > 2)
 
797
                                        have_end = get_actual_variable_range(root,
 
798
                                                                                                                 vardata,
 
799
                                                                                                                 hist_op,
 
800
                                                                                                                 &values[0],
 
801
                                                                                                                 NULL);
 
802
                                else if (probe == nvalues - 1 && nvalues > 2)
 
803
                                        have_end = get_actual_variable_range(root,
 
804
                                                                                                                 vardata,
 
805
                                                                                                                 hist_op,
 
806
                                                                                                                 NULL,
 
807
                                                                                                                 &values[probe]);
 
808
 
 
809
                                ltcmp = DatumGetBool(FunctionCall2Coll(opproc,
 
810
                                                                                                           DEFAULT_COLLATION_OID,
 
811
                                                                                                           values[probe],
 
812
                                                                                                           constval));
 
813
                                if (isgt)
 
814
                                        ltcmp = !ltcmp;
 
815
                                if (ltcmp)
 
816
                                        lobound = probe + 1;
 
817
                                else
 
818
                                        hibound = probe;
 
819
                        }
 
820
 
 
821
                        if (lobound <= 0)
 
822
                        {
 
823
                                /* Constant is below lower histogram boundary. */
 
824
                                histfrac = 0.0;
 
825
                        }
 
826
                        else if (lobound >= nvalues)
 
827
                        {
 
828
                                /* Constant is above upper histogram boundary. */
 
829
                                histfrac = 1.0;
 
830
                        }
 
831
                        else
 
832
                        {
 
833
                                int                     i = lobound;
 
834
                                double          val,
 
835
                                                        high,
 
836
                                                        low;
 
837
                                double          binfrac;
 
838
 
 
839
                                /*
 
840
                                 * We have values[i-1] <= constant <= values[i].
 
841
                                 *
 
842
                                 * Convert the constant and the two nearest bin boundary
 
843
                                 * values to a uniform comparison scale, and do a linear
 
844
                                 * interpolation within this bin.
 
845
                                 */
 
846
                                if (convert_to_scalar(constval, consttype, &val,
 
847
                                                                          values[i - 1], values[i],
 
848
                                                                          vardata->vartype,
 
849
                                                                          &low, &high))
 
850
                                {
 
851
                                        if (high <= low)
 
852
                                        {
 
853
                                                /* cope if bin boundaries appear identical */
 
854
                                                binfrac = 0.5;
 
855
                                        }
 
856
                                        else if (val <= low)
 
857
                                                binfrac = 0.0;
 
858
                                        else if (val >= high)
 
859
                                                binfrac = 1.0;
 
860
                                        else
 
861
                                        {
 
862
                                                binfrac = (val - low) / (high - low);
 
863
 
 
864
                                                /*
 
865
                                                 * Watch out for the possibility that we got a NaN or
 
866
                                                 * Infinity from the division.  This can happen
 
867
                                                 * despite the previous checks, if for example "low"
 
868
                                                 * is -Infinity.
 
869
                                                 */
 
870
                                                if (isnan(binfrac) ||
 
871
                                                        binfrac < 0.0 || binfrac > 1.0)
 
872
                                                        binfrac = 0.5;
 
873
                                        }
 
874
                                }
 
875
                                else
 
876
                                {
 
877
                                        /*
 
878
                                         * Ideally we'd produce an error here, on the grounds that
 
879
                                         * the given operator shouldn't have scalarXXsel
 
880
                                         * registered as its selectivity func unless we can deal
 
881
                                         * with its operand types.      But currently, all manner of
 
882
                                         * stuff is invoking scalarXXsel, so give a default
 
883
                                         * estimate until that can be fixed.
 
884
                                         */
 
885
                                        binfrac = 0.5;
 
886
                                }
 
887
 
 
888
                                /*
 
889
                                 * Now, compute the overall selectivity across the values
 
890
                                 * represented by the histogram.  We have i-1 full bins and
 
891
                                 * binfrac partial bin below the constant.
 
892
                                 */
 
893
                                histfrac = (double) (i - 1) + binfrac;
 
894
                                histfrac /= (double) (nvalues - 1);
 
895
                        }
 
896
 
 
897
                        /*
 
898
                         * Now histfrac = fraction of histogram entries below the
 
899
                         * constant.
 
900
                         *
 
901
                         * Account for "<" vs ">"
 
902
                         */
 
903
                        hist_selec = isgt ? (1.0 - histfrac) : histfrac;
 
904
 
 
905
                        /*
 
906
                         * The histogram boundaries are only approximate to begin with,
 
907
                         * and may well be out of date anyway.  Therefore, don't believe
 
908
                         * extremely small or large selectivity estimates --- unless we
 
909
                         * got actual current endpoint values from the table.
 
910
                         */
 
911
                        if (have_end)
 
912
                                CLAMP_PROBABILITY(hist_selec);
 
913
                        else
 
914
                        {
 
915
                                if (hist_selec < 0.0001)
 
916
                                        hist_selec = 0.0001;
 
917
                                else if (hist_selec > 0.9999)
 
918
                                        hist_selec = 0.9999;
 
919
                        }
 
920
                }
 
921
 
 
922
                free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
 
923
        }
 
924
 
 
925
        return hist_selec;
 
926
}
 
927
 
 
928
/*
 
929
 *              scalarltsel             - Selectivity of "<" (also "<=") for scalars.
 
930
 */
 
931
Datum
 
932
scalarltsel(PG_FUNCTION_ARGS)
 
933
{
 
934
        PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 
935
        Oid                     operator = PG_GETARG_OID(1);
 
936
        List       *args = (List *) PG_GETARG_POINTER(2);
 
937
        int                     varRelid = PG_GETARG_INT32(3);
 
938
        VariableStatData vardata;
 
939
        Node       *other;
 
940
        bool            varonleft;
 
941
        Datum           constval;
 
942
        Oid                     consttype;
 
943
        bool            isgt;
 
944
        double          selec;
 
945
 
 
946
        /*
 
947
         * If expression is not variable op something or something op variable,
 
948
         * then punt and return a default estimate.
 
949
         */
 
950
        if (!get_restriction_variable(root, args, varRelid,
 
951
                                                                  &vardata, &other, &varonleft))
 
952
                PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 
953
 
 
954
        /*
 
955
         * Can't do anything useful if the something is not a constant, either.
 
956
         */
 
957
        if (!IsA(other, Const))
 
958
        {
 
959
                ReleaseVariableStats(vardata);
 
960
                PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 
961
        }
 
962
 
 
963
        /*
 
964
         * If the constant is NULL, assume operator is strict and return zero, ie,
 
965
         * operator will never return TRUE.
 
966
         */
 
967
        if (((Const *) other)->constisnull)
 
968
        {
 
969
                ReleaseVariableStats(vardata);
 
970
                PG_RETURN_FLOAT8(0.0);
 
971
        }
 
972
        constval = ((Const *) other)->constvalue;
 
973
        consttype = ((Const *) other)->consttype;
 
974
 
 
975
        /*
 
976
         * Force the var to be on the left to simplify logic in scalarineqsel.
 
977
         */
 
978
        if (varonleft)
 
979
        {
 
980
                /* we have var < other */
 
981
                isgt = false;
 
982
        }
 
983
        else
 
984
        {
 
985
                /* we have other < var, commute to make var > other */
 
986
                operator = get_commutator(operator);
 
987
                if (!operator)
 
988
                {
 
989
                        /* Use default selectivity (should we raise an error instead?) */
 
990
                        ReleaseVariableStats(vardata);
 
991
                        PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 
992
                }
 
993
                isgt = true;
 
994
        }
 
995
 
 
996
        selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
 
997
 
 
998
        ReleaseVariableStats(vardata);
 
999
 
 
1000
        PG_RETURN_FLOAT8((float8) selec);
 
1001
}
 
1002
 
 
1003
/*
 
1004
 *              scalargtsel             - Selectivity of ">" (also ">=") for integers.
 
1005
 */
 
1006
Datum
 
1007
scalargtsel(PG_FUNCTION_ARGS)
 
1008
{
 
1009
        PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 
1010
        Oid                     operator = PG_GETARG_OID(1);
 
1011
        List       *args = (List *) PG_GETARG_POINTER(2);
 
1012
        int                     varRelid = PG_GETARG_INT32(3);
 
1013
        VariableStatData vardata;
 
1014
        Node       *other;
 
1015
        bool            varonleft;
 
1016
        Datum           constval;
 
1017
        Oid                     consttype;
 
1018
        bool            isgt;
 
1019
        double          selec;
 
1020
 
 
1021
        /*
 
1022
         * If expression is not variable op something or something op variable,
 
1023
         * then punt and return a default estimate.
 
1024
         */
 
1025
        if (!get_restriction_variable(root, args, varRelid,
 
1026
                                                                  &vardata, &other, &varonleft))
 
1027
                PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 
1028
 
 
1029
        /*
 
1030
         * Can't do anything useful if the something is not a constant, either.
 
1031
         */
 
1032
        if (!IsA(other, Const))
 
1033
        {
 
1034
                ReleaseVariableStats(vardata);
 
1035
                PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 
1036
        }
 
1037
 
 
1038
        /*
 
1039
         * If the constant is NULL, assume operator is strict and return zero, ie,
 
1040
         * operator will never return TRUE.
 
1041
         */
 
1042
        if (((Const *) other)->constisnull)
 
1043
        {
 
1044
                ReleaseVariableStats(vardata);
 
1045
                PG_RETURN_FLOAT8(0.0);
 
1046
        }
 
1047
        constval = ((Const *) other)->constvalue;
 
1048
        consttype = ((Const *) other)->consttype;
 
1049
 
 
1050
        /*
 
1051
         * Force the var to be on the left to simplify logic in scalarineqsel.
 
1052
         */
 
1053
        if (varonleft)
 
1054
        {
 
1055
                /* we have var > other */
 
1056
                isgt = true;
 
1057
        }
 
1058
        else
 
1059
        {
 
1060
                /* we have other > var, commute to make var < other */
 
1061
                operator = get_commutator(operator);
 
1062
                if (!operator)
 
1063
                {
 
1064
                        /* Use default selectivity (should we raise an error instead?) */
 
1065
                        ReleaseVariableStats(vardata);
 
1066
                        PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 
1067
                }
 
1068
                isgt = false;
 
1069
        }
 
1070
 
 
1071
        selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
 
1072
 
 
1073
        ReleaseVariableStats(vardata);
 
1074
 
 
1075
        PG_RETURN_FLOAT8((float8) selec);
 
1076
}
 
1077
 
 
1078
/*
 
1079
 * patternsel                   - Generic code for pattern-match selectivity.
 
1080
 */
 
1081
static double
 
1082
patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
 
1083
{
 
1084
        PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 
1085
        Oid                     operator = PG_GETARG_OID(1);
 
1086
        List       *args = (List *) PG_GETARG_POINTER(2);
 
1087
        int                     varRelid = PG_GETARG_INT32(3);
 
1088
        VariableStatData vardata;
 
1089
        Node       *other;
 
1090
        bool            varonleft;
 
1091
        Datum           constval;
 
1092
        Oid                     consttype;
 
1093
        Oid                     vartype;
 
1094
        Oid                     opfamily;
 
1095
        Pattern_Prefix_Status pstatus;
 
1096
        Const      *patt = NULL;
 
1097
        Const      *prefix = NULL;
 
1098
        Const      *rest = NULL;
 
1099
        double          result;
 
1100
 
 
1101
        /*
 
1102
         * If this is for a NOT LIKE or similar operator, get the corresponding
 
1103
         * positive-match operator and work with that.  Set result to the correct
 
1104
         * default estimate, too.
 
1105
         */
 
1106
        if (negate)
 
1107
        {
 
1108
                operator = get_negator(operator);
 
1109
                if (!OidIsValid(operator))
 
1110
                        elog(ERROR, "patternsel called for operator without a negator");
 
1111
                result = 1.0 - DEFAULT_MATCH_SEL;
 
1112
        }
 
1113
        else
 
1114
        {
 
1115
                result = DEFAULT_MATCH_SEL;
 
1116
        }
 
1117
 
 
1118
        /*
 
1119
         * If expression is not variable op constant, then punt and return a
 
1120
         * default estimate.
 
1121
         */
 
1122
        if (!get_restriction_variable(root, args, varRelid,
 
1123
                                                                  &vardata, &other, &varonleft))
 
1124
                return result;
 
1125
        if (!varonleft || !IsA(other, Const))
 
1126
        {
 
1127
                ReleaseVariableStats(vardata);
 
1128
                return result;
 
1129
        }
 
1130
 
 
1131
        /*
 
1132
         * If the constant is NULL, assume operator is strict and return zero, ie,
 
1133
         * operator will never return TRUE.  (It's zero even for a negator op.)
 
1134
         */
 
1135
        if (((Const *) other)->constisnull)
 
1136
        {
 
1137
                ReleaseVariableStats(vardata);
 
1138
                return 0.0;
 
1139
        }
 
1140
        constval = ((Const *) other)->constvalue;
 
1141
        consttype = ((Const *) other)->consttype;
 
1142
 
 
1143
        /*
 
1144
         * The right-hand const is type text or bytea for all supported operators.
 
1145
         * We do not expect to see binary-compatible types here, since
 
1146
         * const-folding should have relabeled the const to exactly match the
 
1147
         * operator's declared type.
 
1148
         */
 
1149
        if (consttype != TEXTOID && consttype != BYTEAOID)
 
1150
        {
 
1151
                ReleaseVariableStats(vardata);
 
1152
                return result;
 
1153
        }
 
1154
 
 
1155
        /*
 
1156
         * Similarly, the exposed type of the left-hand side should be one of
 
1157
         * those we know.  (Do not look at vardata.atttype, which might be
 
1158
         * something binary-compatible but different.)  We can use it to choose
 
1159
         * the index opfamily from which we must draw the comparison operators.
 
1160
         *
 
1161
         * NOTE: It would be more correct to use the PATTERN opfamilies than the
 
1162
         * simple ones, but at the moment ANALYZE will not generate statistics for
 
1163
         * the PATTERN operators.  But our results are so approximate anyway that
 
1164
         * it probably hardly matters.
 
1165
         */
 
1166
        vartype = vardata.vartype;
 
1167
 
 
1168
        switch (vartype)
 
1169
        {
 
1170
                case TEXTOID:
 
1171
                        opfamily = TEXT_BTREE_FAM_OID;
 
1172
                        break;
 
1173
                case BPCHAROID:
 
1174
                        opfamily = BPCHAR_BTREE_FAM_OID;
 
1175
                        break;
 
1176
                case NAMEOID:
 
1177
                        opfamily = NAME_BTREE_FAM_OID;
 
1178
                        break;
 
1179
                case BYTEAOID:
 
1180
                        opfamily = BYTEA_BTREE_FAM_OID;
 
1181
                        break;
 
1182
                default:
 
1183
                        ReleaseVariableStats(vardata);
 
1184
                        return result;
 
1185
        }
 
1186
 
 
1187
        /*
 
1188
         * Divide pattern into fixed prefix and remainder.  XXX we have to assume
 
1189
         * default collation here, because we don't have access to the actual
 
1190
         * input collation for the operator.  FIXME ...
 
1191
         */
 
1192
        patt = (Const *) other;
 
1193
        pstatus = pattern_fixed_prefix(patt, ptype, DEFAULT_COLLATION_OID,
 
1194
                                                                   &prefix, &rest);
 
1195
 
 
1196
        /*
 
1197
         * If necessary, coerce the prefix constant to the right type. (The "rest"
 
1198
         * constant need not be changed.)
 
1199
         */
 
1200
        if (prefix && prefix->consttype != vartype)
 
1201
        {
 
1202
                char       *prefixstr;
 
1203
 
 
1204
                switch (prefix->consttype)
 
1205
                {
 
1206
                        case TEXTOID:
 
1207
                                prefixstr = TextDatumGetCString(prefix->constvalue);
 
1208
                                break;
 
1209
                        case BYTEAOID:
 
1210
                                prefixstr = DatumGetCString(DirectFunctionCall1(byteaout,
 
1211
                                                                                                                prefix->constvalue));
 
1212
                                break;
 
1213
                        default:
 
1214
                                elog(ERROR, "unrecognized consttype: %u",
 
1215
                                         prefix->consttype);
 
1216
                                ReleaseVariableStats(vardata);
 
1217
                                return result;
 
1218
                }
 
1219
                prefix = string_to_const(prefixstr, vartype);
 
1220
                pfree(prefixstr);
 
1221
        }
 
1222
 
 
1223
        if (pstatus == Pattern_Prefix_Exact)
 
1224
        {
 
1225
                /*
 
1226
                 * Pattern specifies an exact match, so pretend operator is '='
 
1227
                 */
 
1228
                Oid                     eqopr = get_opfamily_member(opfamily, vartype, vartype,
 
1229
                                                                                                BTEqualStrategyNumber);
 
1230
 
 
1231
                if (eqopr == InvalidOid)
 
1232
                        elog(ERROR, "no = operator for opfamily %u", opfamily);
 
1233
                result = var_eq_const(&vardata, eqopr, prefix->constvalue,
 
1234
                                                          false, true);
 
1235
        }
 
1236
        else
 
1237
        {
 
1238
                /*
 
1239
                 * Not exact-match pattern.  If we have a sufficiently large
 
1240
                 * histogram, estimate selectivity for the histogram part of the
 
1241
                 * population by counting matches in the histogram.  If not, estimate
 
1242
                 * selectivity of the fixed prefix and remainder of pattern
 
1243
                 * separately, then combine the two to get an estimate of the
 
1244
                 * selectivity for the part of the column population represented by
 
1245
                 * the histogram.  (For small histograms, we combine these
 
1246
                 * approaches.)
 
1247
                 *
 
1248
                 * We then add up data for any most-common-values values; these are
 
1249
                 * not in the histogram population, and we can get exact answers for
 
1250
                 * them by applying the pattern operator, so there's no reason to
 
1251
                 * approximate.  (If the MCVs cover a significant part of the total
 
1252
                 * population, this gives us a big leg up in accuracy.)
 
1253
                 */
 
1254
                Selectivity selec;
 
1255
                int                     hist_size;
 
1256
                FmgrInfo        opproc;
 
1257
                double          nullfrac,
 
1258
                                        mcv_selec,
 
1259
                                        sumcommon;
 
1260
 
 
1261
                /* Try to use the histogram entries to get selectivity */
 
1262
                fmgr_info(get_opcode(operator), &opproc);
 
1263
 
 
1264
                selec = histogram_selectivity(&vardata, &opproc, constval, true,
 
1265
                                                                          10, 1, &hist_size);
 
1266
 
 
1267
                /* If not at least 100 entries, use the heuristic method */
 
1268
                if (hist_size < 100)
 
1269
                {
 
1270
                        Selectivity heursel;
 
1271
                        Selectivity prefixsel;
 
1272
                        Selectivity restsel;
 
1273
 
 
1274
                        if (pstatus == Pattern_Prefix_Partial)
 
1275
                                prefixsel = prefix_selectivity(root, &vardata, vartype,
 
1276
                                                                                           opfamily, prefix);
 
1277
                        else
 
1278
                                prefixsel = 1.0;
 
1279
                        restsel = pattern_selectivity(rest, ptype);
 
1280
                        heursel = prefixsel * restsel;
 
1281
 
 
1282
                        if (selec < 0)          /* fewer than 10 histogram entries? */
 
1283
                                selec = heursel;
 
1284
                        else
 
1285
                        {
 
1286
                                /*
 
1287
                                 * For histogram sizes from 10 to 100, we combine the
 
1288
                                 * histogram and heuristic selectivities, putting increasingly
 
1289
                                 * more trust in the histogram for larger sizes.
 
1290
                                 */
 
1291
                                double          hist_weight = hist_size / 100.0;
 
1292
 
 
1293
                                selec = selec * hist_weight + heursel * (1.0 - hist_weight);
 
1294
                        }
 
1295
                }
 
1296
 
 
1297
                /* In any case, don't believe extremely small or large estimates. */
 
1298
                if (selec < 0.0001)
 
1299
                        selec = 0.0001;
 
1300
                else if (selec > 0.9999)
 
1301
                        selec = 0.9999;
 
1302
 
 
1303
                /*
 
1304
                 * If we have most-common-values info, add up the fractions of the MCV
 
1305
                 * entries that satisfy MCV OP PATTERN.  These fractions contribute
 
1306
                 * directly to the result selectivity.  Also add up the total fraction
 
1307
                 * represented by MCV entries.
 
1308
                 */
 
1309
                mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true,
 
1310
                                                                        &sumcommon);
 
1311
 
 
1312
                if (HeapTupleIsValid(vardata.statsTuple))
 
1313
                        nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
 
1314
                else
 
1315
                        nullfrac = 0.0;
 
1316
 
 
1317
                /*
 
1318
                 * Now merge the results from the MCV and histogram calculations,
 
1319
                 * realizing that the histogram covers only the non-null values that
 
1320
                 * are not listed in MCV.
 
1321
                 */
 
1322
                selec *= 1.0 - nullfrac - sumcommon;
 
1323
                selec += mcv_selec;
 
1324
 
 
1325
                /* result should be in range, but make sure... */
 
1326
                CLAMP_PROBABILITY(selec);
 
1327
                result = selec;
 
1328
        }
 
1329
 
 
1330
        if (prefix)
 
1331
        {
 
1332
                pfree(DatumGetPointer(prefix->constvalue));
 
1333
                pfree(prefix);
 
1334
        }
 
1335
 
 
1336
        ReleaseVariableStats(vardata);
 
1337
 
 
1338
        return negate ? (1.0 - result) : result;
 
1339
}
 
1340
 
 
1341
/*
 
1342
 *              regexeqsel              - Selectivity of regular-expression pattern match.
 
1343
 */
 
1344
Datum
 
1345
regexeqsel(PG_FUNCTION_ARGS)
 
1346
{
 
1347
        PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, false));
 
1348
}
 
1349
 
 
1350
/*
 
1351
 *              icregexeqsel    - Selectivity of case-insensitive regex match.
 
1352
 */
 
1353
Datum
 
1354
icregexeqsel(PG_FUNCTION_ARGS)
 
1355
{
 
1356
        PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, false));
 
1357
}
 
1358
 
 
1359
/*
 
1360
 *              likesel                 - Selectivity of LIKE pattern match.
 
1361
 */
 
1362
Datum
 
1363
likesel(PG_FUNCTION_ARGS)
 
1364
{
 
1365
        PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, false));
 
1366
}
 
1367
 
 
1368
/*
 
1369
 *              iclikesel                       - Selectivity of ILIKE pattern match.
 
1370
 */
 
1371
Datum
 
1372
iclikesel(PG_FUNCTION_ARGS)
 
1373
{
 
1374
        PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, false));
 
1375
}
 
1376
 
 
1377
/*
 
1378
 *              regexnesel              - Selectivity of regular-expression pattern non-match.
 
1379
 */
 
1380
Datum
 
1381
regexnesel(PG_FUNCTION_ARGS)
 
1382
{
 
1383
        PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, true));
 
1384
}
 
1385
 
 
1386
/*
 
1387
 *              icregexnesel    - Selectivity of case-insensitive regex non-match.
 
1388
 */
 
1389
Datum
 
1390
icregexnesel(PG_FUNCTION_ARGS)
 
1391
{
 
1392
        PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, true));
 
1393
}
 
1394
 
 
1395
/*
 
1396
 *              nlikesel                - Selectivity of LIKE pattern non-match.
 
1397
 */
 
1398
Datum
 
1399
nlikesel(PG_FUNCTION_ARGS)
 
1400
{
 
1401
        PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, true));
 
1402
}
 
1403
 
 
1404
/*
 
1405
 *              icnlikesel              - Selectivity of ILIKE pattern non-match.
 
1406
 */
 
1407
Datum
 
1408
icnlikesel(PG_FUNCTION_ARGS)
 
1409
{
 
1410
        PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, true));
 
1411
}
 
1412
 
 
1413
/*
 
1414
 *              booltestsel             - Selectivity of BooleanTest Node.
 
1415
 */
 
1416
Selectivity
 
1417
booltestsel(PlannerInfo *root, BoolTestType booltesttype, Node *arg,
 
1418
                        int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo)
 
1419
{
 
1420
        VariableStatData vardata;
 
1421
        double          selec;
 
1422
 
 
1423
        examine_variable(root, arg, varRelid, &vardata);
 
1424
 
 
1425
        if (HeapTupleIsValid(vardata.statsTuple))
 
1426
        {
 
1427
                Form_pg_statistic stats;
 
1428
                double          freq_null;
 
1429
                Datum      *values;
 
1430
                int                     nvalues;
 
1431
                float4     *numbers;
 
1432
                int                     nnumbers;
 
1433
 
 
1434
                stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
 
1435
                freq_null = stats->stanullfrac;
 
1436
 
 
1437
                if (get_attstatsslot(vardata.statsTuple,
 
1438
                                                         vardata.atttype, vardata.atttypmod,
 
1439
                                                         STATISTIC_KIND_MCV, InvalidOid,
 
1440
                                                         NULL,
 
1441
                                                         &values, &nvalues,
 
1442
                                                         &numbers, &nnumbers)
 
1443
                        && nnumbers > 0)
 
1444
                {
 
1445
                        double          freq_true;
 
1446
                        double          freq_false;
 
1447
 
 
1448
                        /*
 
1449
                         * Get first MCV frequency and derive frequency for true.
 
1450
                         */
 
1451
                        if (DatumGetBool(values[0]))
 
1452
                                freq_true = numbers[0];
 
1453
                        else
 
1454
                                freq_true = 1.0 - numbers[0] - freq_null;
 
1455
 
 
1456
                        /*
 
1457
                         * Next derive frequency for false. Then use these as appropriate
 
1458
                         * to derive frequency for each case.
 
1459
                         */
 
1460
                        freq_false = 1.0 - freq_true - freq_null;
 
1461
 
 
1462
                        switch (booltesttype)
 
1463
                        {
 
1464
                                case IS_UNKNOWN:
 
1465
                                        /* select only NULL values */
 
1466
                                        selec = freq_null;
 
1467
                                        break;
 
1468
                                case IS_NOT_UNKNOWN:
 
1469
                                        /* select non-NULL values */
 
1470
                                        selec = 1.0 - freq_null;
 
1471
                                        break;
 
1472
                                case IS_TRUE:
 
1473
                                        /* select only TRUE values */
 
1474
                                        selec = freq_true;
 
1475
                                        break;
 
1476
                                case IS_NOT_TRUE:
 
1477
                                        /* select non-TRUE values */
 
1478
                                        selec = 1.0 - freq_true;
 
1479
                                        break;
 
1480
                                case IS_FALSE:
 
1481
                                        /* select only FALSE values */
 
1482
                                        selec = freq_false;
 
1483
                                        break;
 
1484
                                case IS_NOT_FALSE:
 
1485
                                        /* select non-FALSE values */
 
1486
                                        selec = 1.0 - freq_false;
 
1487
                                        break;
 
1488
                                default:
 
1489
                                        elog(ERROR, "unrecognized booltesttype: %d",
 
1490
                                                 (int) booltesttype);
 
1491
                                        selec = 0.0;    /* Keep compiler quiet */
 
1492
                                        break;
 
1493
                        }
 
1494
 
 
1495
                        free_attstatsslot(vardata.atttype, values, nvalues,
 
1496
                                                          numbers, nnumbers);
 
1497
                }
 
1498
                else
 
1499
                {
 
1500
                        /*
 
1501
                         * No most-common-value info available. Still have null fraction
 
1502
                         * information, so use it for IS [NOT] UNKNOWN. Otherwise adjust
 
1503
                         * for null fraction and assume an even split for boolean tests.
 
1504
                         */
 
1505
                        switch (booltesttype)
 
1506
                        {
 
1507
                                case IS_UNKNOWN:
 
1508
 
 
1509
                                        /*
 
1510
                                         * Use freq_null directly.
 
1511
                                         */
 
1512
                                        selec = freq_null;
 
1513
                                        break;
 
1514
                                case IS_NOT_UNKNOWN:
 
1515
 
 
1516
                                        /*
 
1517
                                         * Select not unknown (not null) values. Calculate from
 
1518
                                         * freq_null.
 
1519
                                         */
 
1520
                                        selec = 1.0 - freq_null;
 
1521
                                        break;
 
1522
                                case IS_TRUE:
 
1523
                                case IS_NOT_TRUE:
 
1524
                                case IS_FALSE:
 
1525
                                case IS_NOT_FALSE:
 
1526
                                        selec = (1.0 - freq_null) / 2.0;
 
1527
                                        break;
 
1528
                                default:
 
1529
                                        elog(ERROR, "unrecognized booltesttype: %d",
 
1530
                                                 (int) booltesttype);
 
1531
                                        selec = 0.0;    /* Keep compiler quiet */
 
1532
                                        break;
 
1533
                        }
 
1534
                }
 
1535
        }
 
1536
        else
 
1537
        {
 
1538
                /*
 
1539
                 * If we can't get variable statistics for the argument, perhaps
 
1540
                 * clause_selectivity can do something with it.  We ignore the
 
1541
                 * possibility of a NULL value when using clause_selectivity, and just
 
1542
                 * assume the value is either TRUE or FALSE.
 
1543
                 */
 
1544
                switch (booltesttype)
 
1545
                {
 
1546
                        case IS_UNKNOWN:
 
1547
                                selec = DEFAULT_UNK_SEL;
 
1548
                                break;
 
1549
                        case IS_NOT_UNKNOWN:
 
1550
                                selec = DEFAULT_NOT_UNK_SEL;
 
1551
                                break;
 
1552
                        case IS_TRUE:
 
1553
                        case IS_NOT_FALSE:
 
1554
                                selec = (double) clause_selectivity(root, arg,
 
1555
                                                                                                        varRelid,
 
1556
                                                                                                        jointype, sjinfo);
 
1557
                                break;
 
1558
                        case IS_FALSE:
 
1559
                        case IS_NOT_TRUE:
 
1560
                                selec = 1.0 - (double) clause_selectivity(root, arg,
 
1561
                                                                                                                  varRelid,
 
1562
                                                                                                                  jointype, sjinfo);
 
1563
                                break;
 
1564
                        default:
 
1565
                                elog(ERROR, "unrecognized booltesttype: %d",
 
1566
                                         (int) booltesttype);
 
1567
                                selec = 0.0;    /* Keep compiler quiet */
 
1568
                                break;
 
1569
                }
 
1570
        }
 
1571
 
 
1572
        ReleaseVariableStats(vardata);
 
1573
 
 
1574
        /* result should be in range, but make sure... */
 
1575
        CLAMP_PROBABILITY(selec);
 
1576
 
 
1577
        return (Selectivity) selec;
 
1578
}
 
1579
 
 
1580
/*
 
1581
 *              nulltestsel             - Selectivity of NullTest Node.
 
1582
 */
 
1583
Selectivity
 
1584
nulltestsel(PlannerInfo *root, NullTestType nulltesttype, Node *arg,
 
1585
                        int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo)
 
1586
{
 
1587
        VariableStatData vardata;
 
1588
        double          selec;
 
1589
 
 
1590
        examine_variable(root, arg, varRelid, &vardata);
 
1591
 
 
1592
        if (HeapTupleIsValid(vardata.statsTuple))
 
1593
        {
 
1594
                Form_pg_statistic stats;
 
1595
                double          freq_null;
 
1596
 
 
1597
                stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
 
1598
                freq_null = stats->stanullfrac;
 
1599
 
 
1600
                switch (nulltesttype)
 
1601
                {
 
1602
                        case IS_NULL:
 
1603
 
 
1604
                                /*
 
1605
                                 * Use freq_null directly.
 
1606
                                 */
 
1607
                                selec = freq_null;
 
1608
                                break;
 
1609
                        case IS_NOT_NULL:
 
1610
 
 
1611
                                /*
 
1612
                                 * Select not unknown (not null) values. Calculate from
 
1613
                                 * freq_null.
 
1614
                                 */
 
1615
                                selec = 1.0 - freq_null;
 
1616
                                break;
 
1617
                        default:
 
1618
                                elog(ERROR, "unrecognized nulltesttype: %d",
 
1619
                                         (int) nulltesttype);
 
1620
                                return (Selectivity) 0; /* keep compiler quiet */
 
1621
                }
 
1622
        }
 
1623
        else
 
1624
        {
 
1625
                /*
 
1626
                 * No ANALYZE stats available, so make a guess
 
1627
                 */
 
1628
                switch (nulltesttype)
 
1629
                {
 
1630
                        case IS_NULL:
 
1631
                                selec = DEFAULT_UNK_SEL;
 
1632
                                break;
 
1633
                        case IS_NOT_NULL:
 
1634
                                selec = DEFAULT_NOT_UNK_SEL;
 
1635
                                break;
 
1636
                        default:
 
1637
                                elog(ERROR, "unrecognized nulltesttype: %d",
 
1638
                                         (int) nulltesttype);
 
1639
                                return (Selectivity) 0; /* keep compiler quiet */
 
1640
                }
 
1641
        }
 
1642
 
 
1643
        ReleaseVariableStats(vardata);
 
1644
 
 
1645
        /* result should be in range, but make sure... */
 
1646
        CLAMP_PROBABILITY(selec);
 
1647
 
 
1648
        return (Selectivity) selec;
 
1649
}
 
1650
 
 
1651
/*
 
1652
 * strip_array_coercion - strip binary-compatible relabeling from an array expr
 
1653
 *
 
1654
 * For array values, the parser normally generates ArrayCoerceExpr conversions,
 
1655
 * but it seems possible that RelabelType might show up.  Also, the planner
 
1656
 * is not currently tense about collapsing stacked ArrayCoerceExpr nodes,
 
1657
 * so we need to be ready to deal with more than one level.
 
1658
 */
 
1659
static Node *
 
1660
strip_array_coercion(Node *node)
 
1661
{
 
1662
        for (;;)
 
1663
        {
 
1664
                if (node && IsA(node, ArrayCoerceExpr) &&
 
1665
                        ((ArrayCoerceExpr *) node)->elemfuncid == InvalidOid)
 
1666
                {
 
1667
                        node = (Node *) ((ArrayCoerceExpr *) node)->arg;
 
1668
                }
 
1669
                else if (node && IsA(node, RelabelType))
 
1670
                {
 
1671
                        /* We don't really expect this case, but may as well cope */
 
1672
                        node = (Node *) ((RelabelType *) node)->arg;
 
1673
                }
 
1674
                else
 
1675
                        break;
 
1676
        }
 
1677
        return node;
 
1678
}
 
1679
 
 
1680
/*
 
1681
 *              scalararraysel          - Selectivity of ScalarArrayOpExpr Node.
 
1682
 */
 
1683
Selectivity
 
1684
scalararraysel(PlannerInfo *root,
 
1685
                           ScalarArrayOpExpr *clause,
 
1686
                           bool is_join_clause,
 
1687
                           int varRelid,
 
1688
                           JoinType jointype,
 
1689
                           SpecialJoinInfo *sjinfo)
 
1690
{
 
1691
        Oid                     operator = clause->opno;
 
1692
        bool            useOr = clause->useOr;
 
1693
        Node       *leftop;
 
1694
        Node       *rightop;
 
1695
        Oid                     nominal_element_type;
 
1696
        Oid                     nominal_element_collation;
 
1697
        RegProcedure oprsel;
 
1698
        FmgrInfo        oprselproc;
 
1699
        Selectivity s1;
 
1700
 
 
1701
        /*
 
1702
         * First, look up the underlying operator's selectivity estimator. Punt if
 
1703
         * it hasn't got one.
 
1704
         */
 
1705
        if (is_join_clause)
 
1706
                oprsel = get_oprjoin(operator);
 
1707
        else
 
1708
                oprsel = get_oprrest(operator);
 
1709
        if (!oprsel)
 
1710
                return (Selectivity) 0.5;
 
1711
        fmgr_info(oprsel, &oprselproc);
 
1712
 
 
1713
        /* deconstruct the expression */
 
1714
        Assert(list_length(clause->args) == 2);
 
1715
        leftop = (Node *) linitial(clause->args);
 
1716
        rightop = (Node *) lsecond(clause->args);
 
1717
 
 
1718
        /* get nominal (after relabeling) element type of rightop */
 
1719
        nominal_element_type = get_base_element_type(exprType(rightop));
 
1720
        if (!OidIsValid(nominal_element_type))
 
1721
                return (Selectivity) 0.5;               /* probably shouldn't happen */
 
1722
        /* get nominal collation, too, for generating constants */
 
1723
        nominal_element_collation = exprCollation(rightop);
 
1724
 
 
1725
        /* look through any binary-compatible relabeling of rightop */
 
1726
        rightop = strip_array_coercion(rightop);
 
1727
 
 
1728
        /*
 
1729
         * We consider three cases:
 
1730
         *
 
1731
         * 1. rightop is an Array constant: deconstruct the array, apply the
 
1732
         * operator's selectivity function for each array element, and merge the
 
1733
         * results in the same way that clausesel.c does for AND/OR combinations.
 
1734
         *
 
1735
         * 2. rightop is an ARRAY[] construct: apply the operator's selectivity
 
1736
         * function for each element of the ARRAY[] construct, and merge.
 
1737
         *
 
1738
         * 3. otherwise, make a guess ...
 
1739
         */
 
1740
        if (rightop && IsA(rightop, Const))
 
1741
        {
 
1742
                Datum           arraydatum = ((Const *) rightop)->constvalue;
 
1743
                bool            arrayisnull = ((Const *) rightop)->constisnull;
 
1744
                ArrayType  *arrayval;
 
1745
                int16           elmlen;
 
1746
                bool            elmbyval;
 
1747
                char            elmalign;
 
1748
                int                     num_elems;
 
1749
                Datum      *elem_values;
 
1750
                bool       *elem_nulls;
 
1751
                int                     i;
 
1752
 
 
1753
                if (arrayisnull)                /* qual can't succeed if null array */
 
1754
                        return (Selectivity) 0.0;
 
1755
                arrayval = DatumGetArrayTypeP(arraydatum);
 
1756
                get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
 
1757
                                                         &elmlen, &elmbyval, &elmalign);
 
1758
                deconstruct_array(arrayval,
 
1759
                                                  ARR_ELEMTYPE(arrayval),
 
1760
                                                  elmlen, elmbyval, elmalign,
 
1761
                                                  &elem_values, &elem_nulls, &num_elems);
 
1762
                s1 = useOr ? 0.0 : 1.0;
 
1763
                for (i = 0; i < num_elems; i++)
 
1764
                {
 
1765
                        List       *args;
 
1766
                        Selectivity s2;
 
1767
 
 
1768
                        args = list_make2(leftop,
 
1769
                                                          makeConst(nominal_element_type,
 
1770
                                                                                -1,
 
1771
                                                                                nominal_element_collation,
 
1772
                                                                                elmlen,
 
1773
                                                                                elem_values[i],
 
1774
                                                                                elem_nulls[i],
 
1775
                                                                                elmbyval));
 
1776
                        if (is_join_clause)
 
1777
                                s2 = DatumGetFloat8(FunctionCall5(&oprselproc,
 
1778
                                                                                                  PointerGetDatum(root),
 
1779
                                                                                                  ObjectIdGetDatum(operator),
 
1780
                                                                                                  PointerGetDatum(args),
 
1781
                                                                                                  Int16GetDatum(jointype),
 
1782
                                                                                                  PointerGetDatum(sjinfo)));
 
1783
                        else
 
1784
                                s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
 
1785
                                                                                                  PointerGetDatum(root),
 
1786
                                                                                                  ObjectIdGetDatum(operator),
 
1787
                                                                                                  PointerGetDatum(args),
 
1788
                                                                                                  Int32GetDatum(varRelid)));
 
1789
                        if (useOr)
 
1790
                                s1 = s1 + s2 - s1 * s2;
 
1791
                        else
 
1792
                                s1 = s1 * s2;
 
1793
                }
 
1794
        }
 
1795
        else if (rightop && IsA(rightop, ArrayExpr) &&
 
1796
                         !((ArrayExpr *) rightop)->multidims)
 
1797
        {
 
1798
                ArrayExpr  *arrayexpr = (ArrayExpr *) rightop;
 
1799
                int16           elmlen;
 
1800
                bool            elmbyval;
 
1801
                ListCell   *l;
 
1802
 
 
1803
                get_typlenbyval(arrayexpr->element_typeid,
 
1804
                                                &elmlen, &elmbyval);
 
1805
                s1 = useOr ? 0.0 : 1.0;
 
1806
                foreach(l, arrayexpr->elements)
 
1807
                {
 
1808
                        Node       *elem = (Node *) lfirst(l);
 
1809
                        List       *args;
 
1810
                        Selectivity s2;
 
1811
 
 
1812
                        /*
 
1813
                         * Theoretically, if elem isn't of nominal_element_type we should
 
1814
                         * insert a RelabelType, but it seems unlikely that any operator
 
1815
                         * estimation function would really care ...
 
1816
                         */
 
1817
                        args = list_make2(leftop, elem);
 
1818
                        if (is_join_clause)
 
1819
                                s2 = DatumGetFloat8(FunctionCall5(&oprselproc,
 
1820
                                                                                                  PointerGetDatum(root),
 
1821
                                                                                                  ObjectIdGetDatum(operator),
 
1822
                                                                                                  PointerGetDatum(args),
 
1823
                                                                                                  Int16GetDatum(jointype),
 
1824
                                                                                                  PointerGetDatum(sjinfo)));
 
1825
                        else
 
1826
                                s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
 
1827
                                                                                                  PointerGetDatum(root),
 
1828
                                                                                                  ObjectIdGetDatum(operator),
 
1829
                                                                                                  PointerGetDatum(args),
 
1830
                                                                                                  Int32GetDatum(varRelid)));
 
1831
                        if (useOr)
 
1832
                                s1 = s1 + s2 - s1 * s2;
 
1833
                        else
 
1834
                                s1 = s1 * s2;
 
1835
                }
 
1836
        }
 
1837
        else
 
1838
        {
 
1839
                CaseTestExpr *dummyexpr;
 
1840
                List       *args;
 
1841
                Selectivity s2;
 
1842
                int                     i;
 
1843
 
 
1844
                /*
 
1845
                 * We need a dummy rightop to pass to the operator selectivity
 
1846
                 * routine.  It can be pretty much anything that doesn't look like a
 
1847
                 * constant; CaseTestExpr is a convenient choice.
 
1848
                 */
 
1849
                dummyexpr = makeNode(CaseTestExpr);
 
1850
                dummyexpr->typeId = nominal_element_type;
 
1851
                dummyexpr->typeMod = -1;
 
1852
                dummyexpr->collation = clause->inputcollid;
 
1853
                args = list_make2(leftop, dummyexpr);
 
1854
                if (is_join_clause)
 
1855
                        s2 = DatumGetFloat8(FunctionCall5(&oprselproc,
 
1856
                                                                                          PointerGetDatum(root),
 
1857
                                                                                          ObjectIdGetDatum(operator),
 
1858
                                                                                          PointerGetDatum(args),
 
1859
                                                                                          Int16GetDatum(jointype),
 
1860
                                                                                          PointerGetDatum(sjinfo)));
 
1861
                else
 
1862
                        s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
 
1863
                                                                                          PointerGetDatum(root),
 
1864
                                                                                          ObjectIdGetDatum(operator),
 
1865
                                                                                          PointerGetDatum(args),
 
1866
                                                                                          Int32GetDatum(varRelid)));
 
1867
                s1 = useOr ? 0.0 : 1.0;
 
1868
 
 
1869
                /*
 
1870
                 * Arbitrarily assume 10 elements in the eventual array value (see
 
1871
                 * also estimate_array_length)
 
1872
                 */
 
1873
                for (i = 0; i < 10; i++)
 
1874
                {
 
1875
                        if (useOr)
 
1876
                                s1 = s1 + s2 - s1 * s2;
 
1877
                        else
 
1878
                                s1 = s1 * s2;
 
1879
                }
 
1880
        }
 
1881
 
 
1882
        /* result should be in range, but make sure... */
 
1883
        CLAMP_PROBABILITY(s1);
 
1884
 
 
1885
        return s1;
 
1886
}
 
1887
 
 
1888
/*
 
1889
 * Estimate number of elements in the array yielded by an expression.
 
1890
 *
 
1891
 * It's important that this agree with scalararraysel.
 
1892
 */
 
1893
int
 
1894
estimate_array_length(Node *arrayexpr)
 
1895
{
 
1896
        /* look through any binary-compatible relabeling of arrayexpr */
 
1897
        arrayexpr = strip_array_coercion(arrayexpr);
 
1898
 
 
1899
        if (arrayexpr && IsA(arrayexpr, Const))
 
1900
        {
 
1901
                Datum           arraydatum = ((Const *) arrayexpr)->constvalue;
 
1902
                bool            arrayisnull = ((Const *) arrayexpr)->constisnull;
 
1903
                ArrayType  *arrayval;
 
1904
 
 
1905
                if (arrayisnull)
 
1906
                        return 0;
 
1907
                arrayval = DatumGetArrayTypeP(arraydatum);
 
1908
                return ArrayGetNItems(ARR_NDIM(arrayval), ARR_DIMS(arrayval));
 
1909
        }
 
1910
        else if (arrayexpr && IsA(arrayexpr, ArrayExpr) &&
 
1911
                         !((ArrayExpr *) arrayexpr)->multidims)
 
1912
        {
 
1913
                return list_length(((ArrayExpr *) arrayexpr)->elements);
 
1914
        }
 
1915
        else
 
1916
        {
 
1917
                /* default guess --- see also scalararraysel */
 
1918
                return 10;
 
1919
        }
 
1920
}
 
1921
 
 
1922
/*
 
1923
 *              rowcomparesel           - Selectivity of RowCompareExpr Node.
 
1924
 *
 
1925
 * We estimate RowCompare selectivity by considering just the first (high
 
1926
 * order) columns, which makes it equivalent to an ordinary OpExpr.  While
 
1927
 * this estimate could be refined by considering additional columns, it
 
1928
 * seems unlikely that we could do a lot better without multi-column
 
1929
 * statistics.
 
1930
 */
 
1931
Selectivity
 
1932
rowcomparesel(PlannerInfo *root,
 
1933
                          RowCompareExpr *clause,
 
1934
                          int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo)
 
1935
{
 
1936
        Selectivity s1;
 
1937
        Oid                     opno = linitial_oid(clause->opnos);
 
1938
        List       *opargs;
 
1939
        bool            is_join_clause;
 
1940
 
 
1941
        /* Build equivalent arg list for single operator */
 
1942
        opargs = list_make2(linitial(clause->largs), linitial(clause->rargs));
 
1943
 
 
1944
        /*
 
1945
         * Decide if it's a join clause.  This should match clausesel.c's
 
1946
         * treat_as_join_clause(), except that we intentionally consider only the
 
1947
         * leading columns and not the rest of the clause.
 
1948
         */
 
1949
        if (varRelid != 0)
 
1950
        {
 
1951
                /*
 
1952
                 * Caller is forcing restriction mode (eg, because we are examining an
 
1953
                 * inner indexscan qual).
 
1954
                 */
 
1955
                is_join_clause = false;
 
1956
        }
 
1957
        else if (sjinfo == NULL)
 
1958
        {
 
1959
                /*
 
1960
                 * It must be a restriction clause, since it's being evaluated at a
 
1961
                 * scan node.
 
1962
                 */
 
1963
                is_join_clause = false;
 
1964
        }
 
1965
        else
 
1966
        {
 
1967
                /*
 
1968
                 * Otherwise, it's a join if there's more than one relation used.
 
1969
                 */
 
1970
                is_join_clause = (NumRelids((Node *) opargs) > 1);
 
1971
        }
 
1972
 
 
1973
        if (is_join_clause)
 
1974
        {
 
1975
                /* Estimate selectivity for a join clause. */
 
1976
                s1 = join_selectivity(root, opno,
 
1977
                                                          opargs,
 
1978
                                                          jointype,
 
1979
                                                          sjinfo);
 
1980
        }
 
1981
        else
 
1982
        {
 
1983
                /* Estimate selectivity for a restriction clause. */
 
1984
                s1 = restriction_selectivity(root, opno,
 
1985
                                                                         opargs,
 
1986
                                                                         varRelid);
 
1987
        }
 
1988
 
 
1989
        return s1;
 
1990
}
 
1991
 
 
1992
/*
 
1993
 *              eqjoinsel               - Join selectivity of "="
 
1994
 */
 
1995
Datum
 
1996
eqjoinsel(PG_FUNCTION_ARGS)
 
1997
{
 
1998
        PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 
1999
        Oid                     operator = PG_GETARG_OID(1);
 
2000
        List       *args = (List *) PG_GETARG_POINTER(2);
 
2001
 
 
2002
#ifdef NOT_USED
 
2003
        JoinType        jointype = (JoinType) PG_GETARG_INT16(3);
 
2004
#endif
 
2005
        SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
 
2006
        double          selec;
 
2007
        VariableStatData vardata1;
 
2008
        VariableStatData vardata2;
 
2009
        bool            join_is_reversed;
 
2010
 
 
2011
        get_join_variables(root, args, sjinfo,
 
2012
                                           &vardata1, &vardata2, &join_is_reversed);
 
2013
 
 
2014
        switch (sjinfo->jointype)
 
2015
        {
 
2016
                case JOIN_INNER:
 
2017
                case JOIN_LEFT:
 
2018
                case JOIN_FULL:
 
2019
                        selec = eqjoinsel_inner(operator, &vardata1, &vardata2);
 
2020
                        break;
 
2021
                case JOIN_SEMI:
 
2022
                case JOIN_ANTI:
 
2023
                        if (!join_is_reversed)
 
2024
                                selec = eqjoinsel_semi(operator, &vardata1, &vardata2);
 
2025
                        else
 
2026
                                selec = eqjoinsel_semi(get_commutator(operator),
 
2027
                                                                           &vardata2, &vardata1);
 
2028
                        break;
 
2029
                default:
 
2030
                        /* other values not expected here */
 
2031
                        elog(ERROR, "unrecognized join type: %d",
 
2032
                                 (int) sjinfo->jointype);
 
2033
                        selec = 0;                      /* keep compiler quiet */
 
2034
                        break;
 
2035
        }
 
2036
 
 
2037
        ReleaseVariableStats(vardata1);
 
2038
        ReleaseVariableStats(vardata2);
 
2039
 
 
2040
        CLAMP_PROBABILITY(selec);
 
2041
 
 
2042
        PG_RETURN_FLOAT8((float8) selec);
 
2043
}
 
2044
 
 
2045
/*
 
2046
 * eqjoinsel_inner --- eqjoinsel for normal inner join
 
2047
 *
 
2048
 * We also use this for LEFT/FULL outer joins; it's not presently clear
 
2049
 * that it's worth trying to distinguish them here.
 
2050
 */
 
2051
static double
 
2052
eqjoinsel_inner(Oid operator,
 
2053
                                VariableStatData *vardata1, VariableStatData *vardata2)
 
2054
{
 
2055
        double          selec;
 
2056
        double          nd1;
 
2057
        double          nd2;
 
2058
        Form_pg_statistic stats1 = NULL;
 
2059
        Form_pg_statistic stats2 = NULL;
 
2060
        bool            have_mcvs1 = false;
 
2061
        Datum      *values1 = NULL;
 
2062
        int                     nvalues1 = 0;
 
2063
        float4     *numbers1 = NULL;
 
2064
        int                     nnumbers1 = 0;
 
2065
        bool            have_mcvs2 = false;
 
2066
        Datum      *values2 = NULL;
 
2067
        int                     nvalues2 = 0;
 
2068
        float4     *numbers2 = NULL;
 
2069
        int                     nnumbers2 = 0;
 
2070
 
 
2071
        nd1 = get_variable_numdistinct(vardata1);
 
2072
        nd2 = get_variable_numdistinct(vardata2);
 
2073
 
 
2074
        if (HeapTupleIsValid(vardata1->statsTuple))
 
2075
        {
 
2076
                stats1 = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
 
2077
                have_mcvs1 = get_attstatsslot(vardata1->statsTuple,
 
2078
                                                                          vardata1->atttype,
 
2079
                                                                          vardata1->atttypmod,
 
2080
                                                                          STATISTIC_KIND_MCV,
 
2081
                                                                          InvalidOid,
 
2082
                                                                          NULL,
 
2083
                                                                          &values1, &nvalues1,
 
2084
                                                                          &numbers1, &nnumbers1);
 
2085
        }
 
2086
 
 
2087
        if (HeapTupleIsValid(vardata2->statsTuple))
 
2088
        {
 
2089
                stats2 = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
 
2090
                have_mcvs2 = get_attstatsslot(vardata2->statsTuple,
 
2091
                                                                          vardata2->atttype,
 
2092
                                                                          vardata2->atttypmod,
 
2093
                                                                          STATISTIC_KIND_MCV,
 
2094
                                                                          InvalidOid,
 
2095
                                                                          NULL,
 
2096
                                                                          &values2, &nvalues2,
 
2097
                                                                          &numbers2, &nnumbers2);
 
2098
        }
 
2099
 
 
2100
        if (have_mcvs1 && have_mcvs2)
 
2101
        {
 
2102
                /*
 
2103
                 * We have most-common-value lists for both relations.  Run through
 
2104
                 * the lists to see which MCVs actually join to each other with the
 
2105
                 * given operator.      This allows us to determine the exact join
 
2106
                 * selectivity for the portion of the relations represented by the MCV
 
2107
                 * lists.  We still have to estimate for the remaining population, but
 
2108
                 * in a skewed distribution this gives us a big leg up in accuracy.
 
2109
                 * For motivation see the analysis in Y. Ioannidis and S.
 
2110
                 * Christodoulakis, "On the propagation of errors in the size of join
 
2111
                 * results", Technical Report 1018, Computer Science Dept., University
 
2112
                 * of Wisconsin, Madison, March 1991 (available from ftp.cs.wisc.edu).
 
2113
                 */
 
2114
                FmgrInfo        eqproc;
 
2115
                bool       *hasmatch1;
 
2116
                bool       *hasmatch2;
 
2117
                double          nullfrac1 = stats1->stanullfrac;
 
2118
                double          nullfrac2 = stats2->stanullfrac;
 
2119
                double          matchprodfreq,
 
2120
                                        matchfreq1,
 
2121
                                        matchfreq2,
 
2122
                                        unmatchfreq1,
 
2123
                                        unmatchfreq2,
 
2124
                                        otherfreq1,
 
2125
                                        otherfreq2,
 
2126
                                        totalsel1,
 
2127
                                        totalsel2;
 
2128
                int                     i,
 
2129
                                        nmatches;
 
2130
 
 
2131
                fmgr_info(get_opcode(operator), &eqproc);
 
2132
                hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
 
2133
                hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
 
2134
 
 
2135
                /*
 
2136
                 * Note we assume that each MCV will match at most one member of the
 
2137
                 * other MCV list.      If the operator isn't really equality, there could
 
2138
                 * be multiple matches --- but we don't look for them, both for speed
 
2139
                 * and because the math wouldn't add up...
 
2140
                 */
 
2141
                matchprodfreq = 0.0;
 
2142
                nmatches = 0;
 
2143
                for (i = 0; i < nvalues1; i++)
 
2144
                {
 
2145
                        int                     j;
 
2146
 
 
2147
                        for (j = 0; j < nvalues2; j++)
 
2148
                        {
 
2149
                                if (hasmatch2[j])
 
2150
                                        continue;
 
2151
                                if (DatumGetBool(FunctionCall2Coll(&eqproc,
 
2152
                                                                                                   DEFAULT_COLLATION_OID,
 
2153
                                                                                                   values1[i],
 
2154
                                                                                                   values2[j])))
 
2155
                                {
 
2156
                                        hasmatch1[i] = hasmatch2[j] = true;
 
2157
                                        matchprodfreq += numbers1[i] * numbers2[j];
 
2158
                                        nmatches++;
 
2159
                                        break;
 
2160
                                }
 
2161
                        }
 
2162
                }
 
2163
                CLAMP_PROBABILITY(matchprodfreq);
 
2164
                /* Sum up frequencies of matched and unmatched MCVs */
 
2165
                matchfreq1 = unmatchfreq1 = 0.0;
 
2166
                for (i = 0; i < nvalues1; i++)
 
2167
                {
 
2168
                        if (hasmatch1[i])
 
2169
                                matchfreq1 += numbers1[i];
 
2170
                        else
 
2171
                                unmatchfreq1 += numbers1[i];
 
2172
                }
 
2173
                CLAMP_PROBABILITY(matchfreq1);
 
2174
                CLAMP_PROBABILITY(unmatchfreq1);
 
2175
                matchfreq2 = unmatchfreq2 = 0.0;
 
2176
                for (i = 0; i < nvalues2; i++)
 
2177
                {
 
2178
                        if (hasmatch2[i])
 
2179
                                matchfreq2 += numbers2[i];
 
2180
                        else
 
2181
                                unmatchfreq2 += numbers2[i];
 
2182
                }
 
2183
                CLAMP_PROBABILITY(matchfreq2);
 
2184
                CLAMP_PROBABILITY(unmatchfreq2);
 
2185
                pfree(hasmatch1);
 
2186
                pfree(hasmatch2);
 
2187
 
 
2188
                /*
 
2189
                 * Compute total frequency of non-null values that are not in the MCV
 
2190
                 * lists.
 
2191
                 */
 
2192
                otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1;
 
2193
                otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2;
 
2194
                CLAMP_PROBABILITY(otherfreq1);
 
2195
                CLAMP_PROBABILITY(otherfreq2);
 
2196
 
 
2197
                /*
 
2198
                 * We can estimate the total selectivity from the point of view of
 
2199
                 * relation 1 as: the known selectivity for matched MCVs, plus
 
2200
                 * unmatched MCVs that are assumed to match against random members of
 
2201
                 * relation 2's non-MCV population, plus non-MCV values that are
 
2202
                 * assumed to match against random members of relation 2's unmatched
 
2203
                 * MCVs plus non-MCV values.
 
2204
                 */
 
2205
                totalsel1 = matchprodfreq;
 
2206
                if (nd2 > nvalues2)
 
2207
                        totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
 
2208
                if (nd2 > nmatches)
 
2209
                        totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
 
2210
                                (nd2 - nmatches);
 
2211
                /* Same estimate from the point of view of relation 2. */
 
2212
                totalsel2 = matchprodfreq;
 
2213
                if (nd1 > nvalues1)
 
2214
                        totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
 
2215
                if (nd1 > nmatches)
 
2216
                        totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
 
2217
                                (nd1 - nmatches);
 
2218
 
 
2219
                /*
 
2220
                 * Use the smaller of the two estimates.  This can be justified in
 
2221
                 * essentially the same terms as given below for the no-stats case: to
 
2222
                 * a first approximation, we are estimating from the point of view of
 
2223
                 * the relation with smaller nd.
 
2224
                 */
 
2225
                selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2;
 
2226
        }
 
2227
        else
 
2228
        {
 
2229
                /*
 
2230
                 * We do not have MCV lists for both sides.  Estimate the join
 
2231
                 * selectivity as MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2). This
 
2232
                 * is plausible if we assume that the join operator is strict and the
 
2233
                 * non-null values are about equally distributed: a given non-null
 
2234
                 * tuple of rel1 will join to either zero or N2*(1-nullfrac2)/nd2 rows
 
2235
                 * of rel2, so total join rows are at most
 
2236
                 * N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join selectivity of
 
2237
                 * not more than (1-nullfrac1)*(1-nullfrac2)/nd2. By the same logic it
 
2238
                 * is not more than (1-nullfrac1)*(1-nullfrac2)/nd1, so the expression
 
2239
                 * with MIN() is an upper bound.  Using the MIN() means we estimate
 
2240
                 * from the point of view of the relation with smaller nd (since the
 
2241
                 * larger nd is determining the MIN).  It is reasonable to assume that
 
2242
                 * most tuples in this rel will have join partners, so the bound is
 
2243
                 * probably reasonably tight and should be taken as-is.
 
2244
                 *
 
2245
                 * XXX Can we be smarter if we have an MCV list for just one side? It
 
2246
                 * seems that if we assume equal distribution for the other side, we
 
2247
                 * end up with the same answer anyway.
 
2248
                 *
 
2249
                 * An additional hack we use here is to clamp the nd1 and nd2 values
 
2250
                 * to not more than what we are estimating the input relation sizes to
 
2251
                 * be, providing a crude correction for the selectivity of restriction
 
2252
                 * clauses on those relations.  (We don't do that in the other path
 
2253
                 * since there we are comparing the nd values to stats for the whole
 
2254
                 * relations.)
 
2255
                 */
 
2256
                double          nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
 
2257
                double          nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
 
2258
 
 
2259
                if (vardata1->rel)
 
2260
                        nd1 = Min(nd1, vardata1->rel->rows);
 
2261
                if (vardata2->rel)
 
2262
                        nd2 = Min(nd2, vardata2->rel->rows);
 
2263
 
 
2264
                selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
 
2265
                if (nd1 > nd2)
 
2266
                        selec /= nd1;
 
2267
                else
 
2268
                        selec /= nd2;
 
2269
        }
 
2270
 
 
2271
        if (have_mcvs1)
 
2272
                free_attstatsslot(vardata1->atttype, values1, nvalues1,
 
2273
                                                  numbers1, nnumbers1);
 
2274
        if (have_mcvs2)
 
2275
                free_attstatsslot(vardata2->atttype, values2, nvalues2,
 
2276
                                                  numbers2, nnumbers2);
 
2277
 
 
2278
        return selec;
 
2279
}
 
2280
 
 
2281
/*
 
2282
 * eqjoinsel_semi --- eqjoinsel for semi join
 
2283
 *
 
2284
 * (Also used for anti join, which we are supposed to estimate the same way.)
 
2285
 * Caller has ensured that vardata1 is the LHS variable.
 
2286
 */
 
2287
static double
 
2288
eqjoinsel_semi(Oid operator,
 
2289
                           VariableStatData *vardata1, VariableStatData *vardata2)
 
2290
{
 
2291
        double          selec;
 
2292
        double          nd1;
 
2293
        double          nd2;
 
2294
        Form_pg_statistic stats1 = NULL;
 
2295
        bool            have_mcvs1 = false;
 
2296
        Datum      *values1 = NULL;
 
2297
        int                     nvalues1 = 0;
 
2298
        float4     *numbers1 = NULL;
 
2299
        int                     nnumbers1 = 0;
 
2300
        bool            have_mcvs2 = false;
 
2301
        Datum      *values2 = NULL;
 
2302
        int                     nvalues2 = 0;
 
2303
        float4     *numbers2 = NULL;
 
2304
        int                     nnumbers2 = 0;
 
2305
 
 
2306
        nd1 = get_variable_numdistinct(vardata1);
 
2307
        nd2 = get_variable_numdistinct(vardata2);
 
2308
 
 
2309
        if (HeapTupleIsValid(vardata1->statsTuple))
 
2310
        {
 
2311
                stats1 = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
 
2312
                have_mcvs1 = get_attstatsslot(vardata1->statsTuple,
 
2313
                                                                          vardata1->atttype,
 
2314
                                                                          vardata1->atttypmod,
 
2315
                                                                          STATISTIC_KIND_MCV,
 
2316
                                                                          InvalidOid,
 
2317
                                                                          NULL,
 
2318
                                                                          &values1, &nvalues1,
 
2319
                                                                          &numbers1, &nnumbers1);
 
2320
        }
 
2321
 
 
2322
        if (HeapTupleIsValid(vardata2->statsTuple))
 
2323
        {
 
2324
                have_mcvs2 = get_attstatsslot(vardata2->statsTuple,
 
2325
                                                                          vardata2->atttype,
 
2326
                                                                          vardata2->atttypmod,
 
2327
                                                                          STATISTIC_KIND_MCV,
 
2328
                                                                          InvalidOid,
 
2329
                                                                          NULL,
 
2330
                                                                          &values2, &nvalues2,
 
2331
                                                                          &numbers2, &nnumbers2);
 
2332
        }
 
2333
 
 
2334
        if (have_mcvs1 && have_mcvs2 && OidIsValid(operator))
 
2335
        {
 
2336
                /*
 
2337
                 * We have most-common-value lists for both relations.  Run through
 
2338
                 * the lists to see which MCVs actually join to each other with the
 
2339
                 * given operator.      This allows us to determine the exact join
 
2340
                 * selectivity for the portion of the relations represented by the MCV
 
2341
                 * lists.  We still have to estimate for the remaining population, but
 
2342
                 * in a skewed distribution this gives us a big leg up in accuracy.
 
2343
                 */
 
2344
                FmgrInfo        eqproc;
 
2345
                bool       *hasmatch1;
 
2346
                bool       *hasmatch2;
 
2347
                double          nullfrac1 = stats1->stanullfrac;
 
2348
                double          matchfreq1,
 
2349
                                        uncertainfrac,
 
2350
                                        uncertain;
 
2351
                int                     i,
 
2352
                                        nmatches;
 
2353
 
 
2354
                fmgr_info(get_opcode(operator), &eqproc);
 
2355
                hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
 
2356
                hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
 
2357
 
 
2358
                /*
 
2359
                 * Note we assume that each MCV will match at most one member of the
 
2360
                 * other MCV list.      If the operator isn't really equality, there could
 
2361
                 * be multiple matches --- but we don't look for them, both for speed
 
2362
                 * and because the math wouldn't add up...
 
2363
                 */
 
2364
                nmatches = 0;
 
2365
                for (i = 0; i < nvalues1; i++)
 
2366
                {
 
2367
                        int                     j;
 
2368
 
 
2369
                        for (j = 0; j < nvalues2; j++)
 
2370
                        {
 
2371
                                if (hasmatch2[j])
 
2372
                                        continue;
 
2373
                                if (DatumGetBool(FunctionCall2Coll(&eqproc,
 
2374
                                                                                                   DEFAULT_COLLATION_OID,
 
2375
                                                                                                   values1[i],
 
2376
                                                                                                   values2[j])))
 
2377
                                {
 
2378
                                        hasmatch1[i] = hasmatch2[j] = true;
 
2379
                                        nmatches++;
 
2380
                                        break;
 
2381
                                }
 
2382
                        }
 
2383
                }
 
2384
                /* Sum up frequencies of matched MCVs */
 
2385
                matchfreq1 = 0.0;
 
2386
                for (i = 0; i < nvalues1; i++)
 
2387
                {
 
2388
                        if (hasmatch1[i])
 
2389
                                matchfreq1 += numbers1[i];
 
2390
                }
 
2391
                CLAMP_PROBABILITY(matchfreq1);
 
2392
                pfree(hasmatch1);
 
2393
                pfree(hasmatch2);
 
2394
 
 
2395
                /*
 
2396
                 * Now we need to estimate the fraction of relation 1 that has at
 
2397
                 * least one join partner.      We know for certain that the matched MCVs
 
2398
                 * do, so that gives us a lower bound, but we're really in the dark
 
2399
                 * about everything else.  Our crude approach is: if nd1 <= nd2 then
 
2400
                 * assume all non-null rel1 rows have join partners, else assume for
 
2401
                 * the uncertain rows that a fraction nd2/nd1 have join partners. We
 
2402
                 * can discount the known-matched MCVs from the distinct-values counts
 
2403
                 * before doing the division.
 
2404
                 *
 
2405
                 * Crude as the above is, it's completely useless if we don't have
 
2406
                 * reliable ndistinct values for both sides.  Hence, if either nd1
 
2407
                 * or nd2 is default, punt and assume half of the uncertain rows
 
2408
                 * have join partners.
 
2409
                 */
 
2410
                if (nd1 != DEFAULT_NUM_DISTINCT && nd2 != DEFAULT_NUM_DISTINCT)
 
2411
                {
 
2412
                        nd1 -= nmatches;
 
2413
                        nd2 -= nmatches;
 
2414
                        if (nd1 <= nd2 || nd2 <= 0)
 
2415
                                uncertainfrac = 1.0;
 
2416
                        else
 
2417
                                uncertainfrac = nd2 / nd1;
 
2418
                }
 
2419
                else
 
2420
                        uncertainfrac = 0.5;
 
2421
                uncertain = 1.0 - matchfreq1 - nullfrac1;
 
2422
                CLAMP_PROBABILITY(uncertain);
 
2423
                selec = matchfreq1 + uncertainfrac * uncertain;
 
2424
        }
 
2425
        else
 
2426
        {
 
2427
                /*
 
2428
                 * Without MCV lists for both sides, we can only use the heuristic
 
2429
                 * about nd1 vs nd2.
 
2430
                 */
 
2431
                double          nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
 
2432
 
 
2433
                if (nd1 != DEFAULT_NUM_DISTINCT && nd2 != DEFAULT_NUM_DISTINCT)
 
2434
                {
 
2435
                        if (vardata1->rel)
 
2436
                                nd1 = Min(nd1, vardata1->rel->rows);
 
2437
                        if (vardata2->rel)
 
2438
                                nd2 = Min(nd2, vardata2->rel->rows);
 
2439
 
 
2440
                        if (nd1 <= nd2 || nd2 <= 0)
 
2441
                                selec = 1.0 - nullfrac1;
 
2442
                        else
 
2443
                                selec = (nd2 / nd1) * (1.0 - nullfrac1);
 
2444
                }
 
2445
                else
 
2446
                        selec = 0.5 * (1.0 - nullfrac1);
 
2447
        }
 
2448
 
 
2449
        if (have_mcvs1)
 
2450
                free_attstatsslot(vardata1->atttype, values1, nvalues1,
 
2451
                                                  numbers1, nnumbers1);
 
2452
        if (have_mcvs2)
 
2453
                free_attstatsslot(vardata2->atttype, values2, nvalues2,
 
2454
                                                  numbers2, nnumbers2);
 
2455
 
 
2456
        return selec;
 
2457
}
 
2458
 
 
2459
/*
 
2460
 *              neqjoinsel              - Join selectivity of "!="
 
2461
 */
 
2462
Datum
 
2463
neqjoinsel(PG_FUNCTION_ARGS)
 
2464
{
 
2465
        PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 
2466
        Oid                     operator = PG_GETARG_OID(1);
 
2467
        List       *args = (List *) PG_GETARG_POINTER(2);
 
2468
        JoinType        jointype = (JoinType) PG_GETARG_INT16(3);
 
2469
        SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
 
2470
        Oid                     eqop;
 
2471
        float8          result;
 
2472
 
 
2473
        /*
 
2474
         * We want 1 - eqjoinsel() where the equality operator is the one
 
2475
         * associated with this != operator, that is, its negator.
 
2476
         */
 
2477
        eqop = get_negator(operator);
 
2478
        if (eqop)
 
2479
        {
 
2480
                result = DatumGetFloat8(DirectFunctionCall5(eqjoinsel,
 
2481
                                                                                                        PointerGetDatum(root),
 
2482
                                                                                                        ObjectIdGetDatum(eqop),
 
2483
                                                                                                        PointerGetDatum(args),
 
2484
                                                                                                        Int16GetDatum(jointype),
 
2485
                                                                                                        PointerGetDatum(sjinfo)));
 
2486
        }
 
2487
        else
 
2488
        {
 
2489
                /* Use default selectivity (should we raise an error instead?) */
 
2490
                result = DEFAULT_EQ_SEL;
 
2491
        }
 
2492
        result = 1.0 - result;
 
2493
        PG_RETURN_FLOAT8(result);
 
2494
}
 
2495
 
 
2496
/*
 
2497
 *              scalarltjoinsel - Join selectivity of "<" and "<=" for scalars
 
2498
 */
 
2499
Datum
 
2500
scalarltjoinsel(PG_FUNCTION_ARGS)
 
2501
{
 
2502
        PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 
2503
}
 
2504
 
 
2505
/*
 
2506
 *              scalargtjoinsel - Join selectivity of ">" and ">=" for scalars
 
2507
 */
 
2508
Datum
 
2509
scalargtjoinsel(PG_FUNCTION_ARGS)
 
2510
{
 
2511
        PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 
2512
}
 
2513
 
 
2514
/*
 
2515
 * patternjoinsel               - Generic code for pattern-match join selectivity.
 
2516
 */
 
2517
static double
 
2518
patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
 
2519
{
 
2520
        /* For the moment we just punt. */
 
2521
        return negate ? (1.0 - DEFAULT_MATCH_SEL) : DEFAULT_MATCH_SEL;
 
2522
}
 
2523
 
 
2524
/*
 
2525
 *              regexeqjoinsel  - Join selectivity of regular-expression pattern match.
 
2526
 */
 
2527
Datum
 
2528
regexeqjoinsel(PG_FUNCTION_ARGS)
 
2529
{
 
2530
        PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, false));
 
2531
}
 
2532
 
 
2533
/*
 
2534
 *              icregexeqjoinsel        - Join selectivity of case-insensitive regex match.
 
2535
 */
 
2536
Datum
 
2537
icregexeqjoinsel(PG_FUNCTION_ARGS)
 
2538
{
 
2539
        PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, false));
 
2540
}
 
2541
 
 
2542
/*
 
2543
 *              likejoinsel                     - Join selectivity of LIKE pattern match.
 
2544
 */
 
2545
Datum
 
2546
likejoinsel(PG_FUNCTION_ARGS)
 
2547
{
 
2548
        PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, false));
 
2549
}
 
2550
 
 
2551
/*
 
2552
 *              iclikejoinsel                   - Join selectivity of ILIKE pattern match.
 
2553
 */
 
2554
Datum
 
2555
iclikejoinsel(PG_FUNCTION_ARGS)
 
2556
{
 
2557
        PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, false));
 
2558
}
 
2559
 
 
2560
/*
 
2561
 *              regexnejoinsel  - Join selectivity of regex non-match.
 
2562
 */
 
2563
Datum
 
2564
regexnejoinsel(PG_FUNCTION_ARGS)
 
2565
{
 
2566
        PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, true));
 
2567
}
 
2568
 
 
2569
/*
 
2570
 *              icregexnejoinsel        - Join selectivity of case-insensitive regex non-match.
 
2571
 */
 
2572
Datum
 
2573
icregexnejoinsel(PG_FUNCTION_ARGS)
 
2574
{
 
2575
        PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, true));
 
2576
}
 
2577
 
 
2578
/*
 
2579
 *              nlikejoinsel            - Join selectivity of LIKE pattern non-match.
 
2580
 */
 
2581
Datum
 
2582
nlikejoinsel(PG_FUNCTION_ARGS)
 
2583
{
 
2584
        PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, true));
 
2585
}
 
2586
 
 
2587
/*
 
2588
 *              icnlikejoinsel          - Join selectivity of ILIKE pattern non-match.
 
2589
 */
 
2590
Datum
 
2591
icnlikejoinsel(PG_FUNCTION_ARGS)
 
2592
{
 
2593
        PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, true));
 
2594
}
 
2595
 
 
2596
/*
 
2597
 * mergejoinscansel                     - Scan selectivity of merge join.
 
2598
 *
 
2599
 * A merge join will stop as soon as it exhausts either input stream.
 
2600
 * Therefore, if we can estimate the ranges of both input variables,
 
2601
 * we can estimate how much of the input will actually be read.  This
 
2602
 * can have a considerable impact on the cost when using indexscans.
 
2603
 *
 
2604
 * Also, we can estimate how much of each input has to be read before the
 
2605
 * first join pair is found, which will affect the join's startup time.
 
2606
 *
 
2607
 * clause should be a clause already known to be mergejoinable.  opfamily,
 
2608
 * strategy, and nulls_first specify the sort ordering being used.
 
2609
 *
 
2610
 * The outputs are:
 
2611
 *              *leftstart is set to the fraction of the left-hand variable expected
 
2612
 *               to be scanned before the first join pair is found (0 to 1).
 
2613
 *              *leftend is set to the fraction of the left-hand variable expected
 
2614
 *               to be scanned before the join terminates (0 to 1).
 
2615
 *              *rightstart, *rightend similarly for the right-hand variable.
 
2616
 */
 
2617
void
 
2618
mergejoinscansel(PlannerInfo *root, Node *clause,
 
2619
                                 Oid opfamily, int strategy, bool nulls_first,
 
2620
                                 Selectivity *leftstart, Selectivity *leftend,
 
2621
                                 Selectivity *rightstart, Selectivity *rightend)
 
2622
{
 
2623
        Node       *left,
 
2624
                           *right;
 
2625
        VariableStatData leftvar,
 
2626
                                rightvar;
 
2627
        int                     op_strategy;
 
2628
        Oid                     op_lefttype;
 
2629
        Oid                     op_righttype;
 
2630
        Oid                     opno,
 
2631
                                lsortop,
 
2632
                                rsortop,
 
2633
                                lstatop,
 
2634
                                rstatop,
 
2635
                                ltop,
 
2636
                                leop,
 
2637
                                revltop,
 
2638
                                revleop;
 
2639
        bool            isgt;
 
2640
        Datum           leftmin,
 
2641
                                leftmax,
 
2642
                                rightmin,
 
2643
                                rightmax;
 
2644
        double          selec;
 
2645
 
 
2646
        /* Set default results if we can't figure anything out. */
 
2647
        /* XXX should default "start" fraction be a bit more than 0? */
 
2648
        *leftstart = *rightstart = 0.0;
 
2649
        *leftend = *rightend = 1.0;
 
2650
 
 
2651
        /* Deconstruct the merge clause */
 
2652
        if (!is_opclause(clause))
 
2653
                return;                                 /* shouldn't happen */
 
2654
        opno = ((OpExpr *) clause)->opno;
 
2655
        left = get_leftop((Expr *) clause);
 
2656
        right = get_rightop((Expr *) clause);
 
2657
        if (!right)
 
2658
                return;                                 /* shouldn't happen */
 
2659
 
 
2660
        /* Look for stats for the inputs */
 
2661
        examine_variable(root, left, 0, &leftvar);
 
2662
        examine_variable(root, right, 0, &rightvar);
 
2663
 
 
2664
        /* Extract the operator's declared left/right datatypes */
 
2665
        get_op_opfamily_properties(opno, opfamily, false,
 
2666
                                                           &op_strategy,
 
2667
                                                           &op_lefttype,
 
2668
                                                           &op_righttype);
 
2669
        Assert(op_strategy == BTEqualStrategyNumber);
 
2670
 
 
2671
        /*
 
2672
         * Look up the various operators we need.  If we don't find them all, it
 
2673
         * probably means the opfamily is broken, but we just fail silently.
 
2674
         *
 
2675
         * Note: we expect that pg_statistic histograms will be sorted by the '<'
 
2676
         * operator, regardless of which sort direction we are considering.
 
2677
         */
 
2678
        switch (strategy)
 
2679
        {
 
2680
                case BTLessStrategyNumber:
 
2681
                        isgt = false;
 
2682
                        if (op_lefttype == op_righttype)
 
2683
                        {
 
2684
                                /* easy case */
 
2685
                                ltop = get_opfamily_member(opfamily,
 
2686
                                                                                   op_lefttype, op_righttype,
 
2687
                                                                                   BTLessStrategyNumber);
 
2688
                                leop = get_opfamily_member(opfamily,
 
2689
                                                                                   op_lefttype, op_righttype,
 
2690
                                                                                   BTLessEqualStrategyNumber);
 
2691
                                lsortop = ltop;
 
2692
                                rsortop = ltop;
 
2693
                                lstatop = lsortop;
 
2694
                                rstatop = rsortop;
 
2695
                                revltop = ltop;
 
2696
                                revleop = leop;
 
2697
                        }
 
2698
                        else
 
2699
                        {
 
2700
                                ltop = get_opfamily_member(opfamily,
 
2701
                                                                                   op_lefttype, op_righttype,
 
2702
                                                                                   BTLessStrategyNumber);
 
2703
                                leop = get_opfamily_member(opfamily,
 
2704
                                                                                   op_lefttype, op_righttype,
 
2705
                                                                                   BTLessEqualStrategyNumber);
 
2706
                                lsortop = get_opfamily_member(opfamily,
 
2707
                                                                                          op_lefttype, op_lefttype,
 
2708
                                                                                          BTLessStrategyNumber);
 
2709
                                rsortop = get_opfamily_member(opfamily,
 
2710
                                                                                          op_righttype, op_righttype,
 
2711
                                                                                          BTLessStrategyNumber);
 
2712
                                lstatop = lsortop;
 
2713
                                rstatop = rsortop;
 
2714
                                revltop = get_opfamily_member(opfamily,
 
2715
                                                                                          op_righttype, op_lefttype,
 
2716
                                                                                          BTLessStrategyNumber);
 
2717
                                revleop = get_opfamily_member(opfamily,
 
2718
                                                                                          op_righttype, op_lefttype,
 
2719
                                                                                          BTLessEqualStrategyNumber);
 
2720
                        }
 
2721
                        break;
 
2722
                case BTGreaterStrategyNumber:
 
2723
                        /* descending-order case */
 
2724
                        isgt = true;
 
2725
                        if (op_lefttype == op_righttype)
 
2726
                        {
 
2727
                                /* easy case */
 
2728
                                ltop = get_opfamily_member(opfamily,
 
2729
                                                                                   op_lefttype, op_righttype,
 
2730
                                                                                   BTGreaterStrategyNumber);
 
2731
                                leop = get_opfamily_member(opfamily,
 
2732
                                                                                   op_lefttype, op_righttype,
 
2733
                                                                                   BTGreaterEqualStrategyNumber);
 
2734
                                lsortop = ltop;
 
2735
                                rsortop = ltop;
 
2736
                                lstatop = get_opfamily_member(opfamily,
 
2737
                                                                                          op_lefttype, op_lefttype,
 
2738
                                                                                          BTLessStrategyNumber);
 
2739
                                rstatop = lstatop;
 
2740
                                revltop = ltop;
 
2741
                                revleop = leop;
 
2742
                        }
 
2743
                        else
 
2744
                        {
 
2745
                                ltop = get_opfamily_member(opfamily,
 
2746
                                                                                   op_lefttype, op_righttype,
 
2747
                                                                                   BTGreaterStrategyNumber);
 
2748
                                leop = get_opfamily_member(opfamily,
 
2749
                                                                                   op_lefttype, op_righttype,
 
2750
                                                                                   BTGreaterEqualStrategyNumber);
 
2751
                                lsortop = get_opfamily_member(opfamily,
 
2752
                                                                                          op_lefttype, op_lefttype,
 
2753
                                                                                          BTGreaterStrategyNumber);
 
2754
                                rsortop = get_opfamily_member(opfamily,
 
2755
                                                                                          op_righttype, op_righttype,
 
2756
                                                                                          BTGreaterStrategyNumber);
 
2757
                                lstatop = get_opfamily_member(opfamily,
 
2758
                                                                                          op_lefttype, op_lefttype,
 
2759
                                                                                          BTLessStrategyNumber);
 
2760
                                rstatop = get_opfamily_member(opfamily,
 
2761
                                                                                          op_righttype, op_righttype,
 
2762
                                                                                          BTLessStrategyNumber);
 
2763
                                revltop = get_opfamily_member(opfamily,
 
2764
                                                                                          op_righttype, op_lefttype,
 
2765
                                                                                          BTGreaterStrategyNumber);
 
2766
                                revleop = get_opfamily_member(opfamily,
 
2767
                                                                                          op_righttype, op_lefttype,
 
2768
                                                                                          BTGreaterEqualStrategyNumber);
 
2769
                        }
 
2770
                        break;
 
2771
                default:
 
2772
                        goto fail;                      /* shouldn't get here */
 
2773
        }
 
2774
 
 
2775
        if (!OidIsValid(lsortop) ||
 
2776
                !OidIsValid(rsortop) ||
 
2777
                !OidIsValid(lstatop) ||
 
2778
                !OidIsValid(rstatop) ||
 
2779
                !OidIsValid(ltop) ||
 
2780
                !OidIsValid(leop) ||
 
2781
                !OidIsValid(revltop) ||
 
2782
                !OidIsValid(revleop))
 
2783
                goto fail;                              /* insufficient info in catalogs */
 
2784
 
 
2785
        /* Try to get ranges of both inputs */
 
2786
        if (!isgt)
 
2787
        {
 
2788
                if (!get_variable_range(root, &leftvar, lstatop,
 
2789
                                                                &leftmin, &leftmax))
 
2790
                        goto fail;                      /* no range available from stats */
 
2791
                if (!get_variable_range(root, &rightvar, rstatop,
 
2792
                                                                &rightmin, &rightmax))
 
2793
                        goto fail;                      /* no range available from stats */
 
2794
        }
 
2795
        else
 
2796
        {
 
2797
                /* need to swap the max and min */
 
2798
                if (!get_variable_range(root, &leftvar, lstatop,
 
2799
                                                                &leftmax, &leftmin))
 
2800
                        goto fail;                      /* no range available from stats */
 
2801
                if (!get_variable_range(root, &rightvar, rstatop,
 
2802
                                                                &rightmax, &rightmin))
 
2803
                        goto fail;                      /* no range available from stats */
 
2804
        }
 
2805
 
 
2806
        /*
 
2807
         * Now, the fraction of the left variable that will be scanned is the
 
2808
         * fraction that's <= the right-side maximum value.  But only believe
 
2809
         * non-default estimates, else stick with our 1.0.
 
2810
         */
 
2811
        selec = scalarineqsel(root, leop, isgt, &leftvar,
 
2812
                                                  rightmax, op_righttype);
 
2813
        if (selec != DEFAULT_INEQ_SEL)
 
2814
                *leftend = selec;
 
2815
 
 
2816
        /* And similarly for the right variable. */
 
2817
        selec = scalarineqsel(root, revleop, isgt, &rightvar,
 
2818
                                                  leftmax, op_lefttype);
 
2819
        if (selec != DEFAULT_INEQ_SEL)
 
2820
                *rightend = selec;
 
2821
 
 
2822
        /*
 
2823
         * Only one of the two "end" fractions can really be less than 1.0;
 
2824
         * believe the smaller estimate and reset the other one to exactly 1.0. If
 
2825
         * we get exactly equal estimates (as can easily happen with self-joins),
 
2826
         * believe neither.
 
2827
         */
 
2828
        if (*leftend > *rightend)
 
2829
                *leftend = 1.0;
 
2830
        else if (*leftend < *rightend)
 
2831
                *rightend = 1.0;
 
2832
        else
 
2833
                *leftend = *rightend = 1.0;
 
2834
 
 
2835
        /*
 
2836
         * Also, the fraction of the left variable that will be scanned before the
 
2837
         * first join pair is found is the fraction that's < the right-side
 
2838
         * minimum value.  But only believe non-default estimates, else stick with
 
2839
         * our own default.
 
2840
         */
 
2841
        selec = scalarineqsel(root, ltop, isgt, &leftvar,
 
2842
                                                  rightmin, op_righttype);
 
2843
        if (selec != DEFAULT_INEQ_SEL)
 
2844
                *leftstart = selec;
 
2845
 
 
2846
        /* And similarly for the right variable. */
 
2847
        selec = scalarineqsel(root, revltop, isgt, &rightvar,
 
2848
                                                  leftmin, op_lefttype);
 
2849
        if (selec != DEFAULT_INEQ_SEL)
 
2850
                *rightstart = selec;
 
2851
 
 
2852
        /*
 
2853
         * Only one of the two "start" fractions can really be more than zero;
 
2854
         * believe the larger estimate and reset the other one to exactly 0.0. If
 
2855
         * we get exactly equal estimates (as can easily happen with self-joins),
 
2856
         * believe neither.
 
2857
         */
 
2858
        if (*leftstart < *rightstart)
 
2859
                *leftstart = 0.0;
 
2860
        else if (*leftstart > *rightstart)
 
2861
                *rightstart = 0.0;
 
2862
        else
 
2863
                *leftstart = *rightstart = 0.0;
 
2864
 
 
2865
        /*
 
2866
         * If the sort order is nulls-first, we're going to have to skip over any
 
2867
         * nulls too.  These would not have been counted by scalarineqsel, and we
 
2868
         * can safely add in this fraction regardless of whether we believe
 
2869
         * scalarineqsel's results or not.  But be sure to clamp the sum to 1.0!
 
2870
         */
 
2871
        if (nulls_first)
 
2872
        {
 
2873
                Form_pg_statistic stats;
 
2874
 
 
2875
                if (HeapTupleIsValid(leftvar.statsTuple))
 
2876
                {
 
2877
                        stats = (Form_pg_statistic) GETSTRUCT(leftvar.statsTuple);
 
2878
                        *leftstart += stats->stanullfrac;
 
2879
                        CLAMP_PROBABILITY(*leftstart);
 
2880
                        *leftend += stats->stanullfrac;
 
2881
                        CLAMP_PROBABILITY(*leftend);
 
2882
                }
 
2883
                if (HeapTupleIsValid(rightvar.statsTuple))
 
2884
                {
 
2885
                        stats = (Form_pg_statistic) GETSTRUCT(rightvar.statsTuple);
 
2886
                        *rightstart += stats->stanullfrac;
 
2887
                        CLAMP_PROBABILITY(*rightstart);
 
2888
                        *rightend += stats->stanullfrac;
 
2889
                        CLAMP_PROBABILITY(*rightend);
 
2890
                }
 
2891
        }
 
2892
 
 
2893
        /* Disbelieve start >= end, just in case that can happen */
 
2894
        if (*leftstart >= *leftend)
 
2895
        {
 
2896
                *leftstart = 0.0;
 
2897
                *leftend = 1.0;
 
2898
        }
 
2899
        if (*rightstart >= *rightend)
 
2900
        {
 
2901
                *rightstart = 0.0;
 
2902
                *rightend = 1.0;
 
2903
        }
 
2904
 
 
2905
fail:
 
2906
        ReleaseVariableStats(leftvar);
 
2907
        ReleaseVariableStats(rightvar);
 
2908
}
 
2909
 
 
2910
 
 
2911
/*
 
2912
 * Helper routine for estimate_num_groups: add an item to a list of
 
2913
 * GroupVarInfos, but only if it's not known equal to any of the existing
 
2914
 * entries.
 
2915
 */
 
2916
typedef struct
 
2917
{
 
2918
        Node       *var;                        /* might be an expression, not just a Var */
 
2919
        RelOptInfo *rel;                        /* relation it belongs to */
 
2920
        double          ndistinct;              /* # distinct values */
 
2921
} GroupVarInfo;
 
2922
 
 
2923
static List *
 
2924
add_unique_group_var(PlannerInfo *root, List *varinfos,
 
2925
                                         Node *var, VariableStatData *vardata)
 
2926
{
 
2927
        GroupVarInfo *varinfo;
 
2928
        double          ndistinct;
 
2929
        ListCell   *lc;
 
2930
 
 
2931
        ndistinct = get_variable_numdistinct(vardata);
 
2932
 
 
2933
        /* cannot use foreach here because of possible list_delete */
 
2934
        lc = list_head(varinfos);
 
2935
        while (lc)
 
2936
        {
 
2937
                varinfo = (GroupVarInfo *) lfirst(lc);
 
2938
 
 
2939
                /* must advance lc before list_delete possibly pfree's it */
 
2940
                lc = lnext(lc);
 
2941
 
 
2942
                /* Drop exact duplicates */
 
2943
                if (equal(var, varinfo->var))
 
2944
                        return varinfos;
 
2945
 
 
2946
                /*
 
2947
                 * Drop known-equal vars, but only if they belong to different
 
2948
                 * relations (see comments for estimate_num_groups)
 
2949
                 */
 
2950
                if (vardata->rel != varinfo->rel &&
 
2951
                        exprs_known_equal(root, var, varinfo->var))
 
2952
                {
 
2953
                        if (varinfo->ndistinct <= ndistinct)
 
2954
                        {
 
2955
                                /* Keep older item, forget new one */
 
2956
                                return varinfos;
 
2957
                        }
 
2958
                        else
 
2959
                        {
 
2960
                                /* Delete the older item */
 
2961
                                varinfos = list_delete_ptr(varinfos, varinfo);
 
2962
                        }
 
2963
                }
 
2964
        }
 
2965
 
 
2966
        varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
 
2967
 
 
2968
        varinfo->var = var;
 
2969
        varinfo->rel = vardata->rel;
 
2970
        varinfo->ndistinct = ndistinct;
 
2971
        varinfos = lappend(varinfos, varinfo);
 
2972
        return varinfos;
 
2973
}
 
2974
 
 
2975
/*
 
2976
 * estimate_num_groups          - Estimate number of groups in a grouped query
 
2977
 *
 
2978
 * Given a query having a GROUP BY clause, estimate how many groups there
 
2979
 * will be --- ie, the number of distinct combinations of the GROUP BY
 
2980
 * expressions.
 
2981
 *
 
2982
 * This routine is also used to estimate the number of rows emitted by
 
2983
 * a DISTINCT filtering step; that is an isomorphic problem.  (Note:
 
2984
 * actually, we only use it for DISTINCT when there's no grouping or
 
2985
 * aggregation ahead of the DISTINCT.)
 
2986
 *
 
2987
 * Inputs:
 
2988
 *      root - the query
 
2989
 *      groupExprs - list of expressions being grouped by
 
2990
 *      input_rows - number of rows estimated to arrive at the group/unique
 
2991
 *              filter step
 
2992
 *
 
2993
 * Given the lack of any cross-correlation statistics in the system, it's
 
2994
 * impossible to do anything really trustworthy with GROUP BY conditions
 
2995
 * involving multiple Vars.  We should however avoid assuming the worst
 
2996
 * case (all possible cross-product terms actually appear as groups) since
 
2997
 * very often the grouped-by Vars are highly correlated.  Our current approach
 
2998
 * is as follows:
 
2999
 *      1.      Expressions yielding boolean are assumed to contribute two groups,
 
3000
 *              independently of their content, and are ignored in the subsequent
 
3001
 *              steps.  This is mainly because tests like "col IS NULL" break the
 
3002
 *              heuristic used in step 2 especially badly.
 
3003
 *      2.      Reduce the given expressions to a list of unique Vars used.  For
 
3004
 *              example, GROUP BY a, a + b is treated the same as GROUP BY a, b.
 
3005
 *              It is clearly correct not to count the same Var more than once.
 
3006
 *              It is also reasonable to treat f(x) the same as x: f() cannot
 
3007
 *              increase the number of distinct values (unless it is volatile,
 
3008
 *              which we consider unlikely for grouping), but it probably won't
 
3009
 *              reduce the number of distinct values much either.
 
3010
 *              As a special case, if a GROUP BY expression can be matched to an
 
3011
 *              expressional index for which we have statistics, then we treat the
 
3012
 *              whole expression as though it were just a Var.
 
3013
 *      3.      If the list contains Vars of different relations that are known equal
 
3014
 *              due to equivalence classes, then drop all but one of the Vars from each
 
3015
 *              known-equal set, keeping the one with smallest estimated # of values
 
3016
 *              (since the extra values of the others can't appear in joined rows).
 
3017
 *              Note the reason we only consider Vars of different relations is that
 
3018
 *              if we considered ones of the same rel, we'd be double-counting the
 
3019
 *              restriction selectivity of the equality in the next step.
 
3020
 *      4.      For Vars within a single source rel, we multiply together the numbers
 
3021
 *              of values, clamp to the number of rows in the rel (divided by 10 if
 
3022
 *              more than one Var), and then multiply by the selectivity of the
 
3023
 *              restriction clauses for that rel.  When there's more than one Var,
 
3024
 *              the initial product is probably too high (it's the worst case) but
 
3025
 *              clamping to a fraction of the rel's rows seems to be a helpful
 
3026
 *              heuristic for not letting the estimate get out of hand.  (The factor
 
3027
 *              of 10 is derived from pre-Postgres-7.4 practice.)  Multiplying
 
3028
 *              by the restriction selectivity is effectively assuming that the
 
3029
 *              restriction clauses are independent of the grouping, which is a crummy
 
3030
 *              assumption, but it's hard to do better.
 
3031
 *      5.      If there are Vars from multiple rels, we repeat step 4 for each such
 
3032
 *              rel, and multiply the results together.
 
3033
 * Note that rels not containing grouped Vars are ignored completely, as are
 
3034
 * join clauses.  Such rels cannot increase the number of groups, and we
 
3035
 * assume such clauses do not reduce the number either (somewhat bogus,
 
3036
 * but we don't have the info to do better).
 
3037
 */
 
3038
double
 
3039
estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
 
3040
{
 
3041
        List       *varinfos = NIL;
 
3042
        double          numdistinct;
 
3043
        ListCell   *l;
 
3044
 
 
3045
        /* We should not be called unless query has GROUP BY (or DISTINCT) */
 
3046
        Assert(groupExprs != NIL);
 
3047
 
 
3048
        /*
 
3049
         * Count groups derived from boolean grouping expressions.      For other
 
3050
         * expressions, find the unique Vars used, treating an expression as a Var
 
3051
         * if we can find stats for it.  For each one, record the statistical
 
3052
         * estimate of number of distinct values (total in its table, without
 
3053
         * regard for filtering).
 
3054
         */
 
3055
        numdistinct = 1.0;
 
3056
 
 
3057
        foreach(l, groupExprs)
 
3058
        {
 
3059
                Node       *groupexpr = (Node *) lfirst(l);
 
3060
                VariableStatData vardata;
 
3061
                List       *varshere;
 
3062
                ListCell   *l2;
 
3063
 
 
3064
                /* Short-circuit for expressions returning boolean */
 
3065
                if (exprType(groupexpr) == BOOLOID)
 
3066
                {
 
3067
                        numdistinct *= 2.0;
 
3068
                        continue;
 
3069
                }
 
3070
 
 
3071
                /*
 
3072
                 * If examine_variable is able to deduce anything about the GROUP BY
 
3073
                 * expression, treat it as a single variable even if it's really more
 
3074
                 * complicated.
 
3075
                 */
 
3076
                examine_variable(root, groupexpr, 0, &vardata);
 
3077
                if (HeapTupleIsValid(vardata.statsTuple) || vardata.isunique)
 
3078
                {
 
3079
                        varinfos = add_unique_group_var(root, varinfos,
 
3080
                                                                                        groupexpr, &vardata);
 
3081
                        ReleaseVariableStats(vardata);
 
3082
                        continue;
 
3083
                }
 
3084
                ReleaseVariableStats(vardata);
 
3085
 
 
3086
                /*
 
3087
                 * Else pull out the component Vars.  Handle PlaceHolderVars by
 
3088
                 * recursing into their arguments (effectively assuming that the
 
3089
                 * PlaceHolderVar doesn't change the number of groups, which boils
 
3090
                 * down to ignoring the possible addition of nulls to the result set).
 
3091
                 */
 
3092
                varshere = pull_var_clause(groupexpr, PVC_RECURSE_PLACEHOLDERS);
 
3093
 
 
3094
                /*
 
3095
                 * If we find any variable-free GROUP BY item, then either it is a
 
3096
                 * constant (and we can ignore it) or it contains a volatile function;
 
3097
                 * in the latter case we punt and assume that each input row will
 
3098
                 * yield a distinct group.
 
3099
                 */
 
3100
                if (varshere == NIL)
 
3101
                {
 
3102
                        if (contain_volatile_functions(groupexpr))
 
3103
                                return input_rows;
 
3104
                        continue;
 
3105
                }
 
3106
 
 
3107
                /*
 
3108
                 * Else add variables to varinfos list
 
3109
                 */
 
3110
                foreach(l2, varshere)
 
3111
                {
 
3112
                        Node       *var = (Node *) lfirst(l2);
 
3113
 
 
3114
                        examine_variable(root, var, 0, &vardata);
 
3115
                        varinfos = add_unique_group_var(root, varinfos, var, &vardata);
 
3116
                        ReleaseVariableStats(vardata);
 
3117
                }
 
3118
        }
 
3119
 
 
3120
        /*
 
3121
         * If now no Vars, we must have an all-constant or all-boolean GROUP BY
 
3122
         * list.
 
3123
         */
 
3124
        if (varinfos == NIL)
 
3125
        {
 
3126
                /* Guard against out-of-range answers */
 
3127
                if (numdistinct > input_rows)
 
3128
                        numdistinct = input_rows;
 
3129
                return numdistinct;
 
3130
        }
 
3131
 
 
3132
        /*
 
3133
         * Group Vars by relation and estimate total numdistinct.
 
3134
         *
 
3135
         * For each iteration of the outer loop, we process the frontmost Var in
 
3136
         * varinfos, plus all other Vars in the same relation.  We remove these
 
3137
         * Vars from the newvarinfos list for the next iteration. This is the
 
3138
         * easiest way to group Vars of same rel together.
 
3139
         */
 
3140
        do
 
3141
        {
 
3142
                GroupVarInfo *varinfo1 = (GroupVarInfo *) linitial(varinfos);
 
3143
                RelOptInfo *rel = varinfo1->rel;
 
3144
                double          reldistinct = varinfo1->ndistinct;
 
3145
                double          relmaxndistinct = reldistinct;
 
3146
                int                     relvarcount = 1;
 
3147
                List       *newvarinfos = NIL;
 
3148
 
 
3149
                /*
 
3150
                 * Get the product of numdistinct estimates of the Vars for this rel.
 
3151
                 * Also, construct new varinfos list of remaining Vars.
 
3152
                 */
 
3153
                for_each_cell(l, lnext(list_head(varinfos)))
 
3154
                {
 
3155
                        GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l);
 
3156
 
 
3157
                        if (varinfo2->rel == varinfo1->rel)
 
3158
                        {
 
3159
                                reldistinct *= varinfo2->ndistinct;
 
3160
                                if (relmaxndistinct < varinfo2->ndistinct)
 
3161
                                        relmaxndistinct = varinfo2->ndistinct;
 
3162
                                relvarcount++;
 
3163
                        }
 
3164
                        else
 
3165
                        {
 
3166
                                /* not time to process varinfo2 yet */
 
3167
                                newvarinfos = lcons(varinfo2, newvarinfos);
 
3168
                        }
 
3169
                }
 
3170
 
 
3171
                /*
 
3172
                 * Sanity check --- don't divide by zero if empty relation.
 
3173
                 */
 
3174
                Assert(rel->reloptkind == RELOPT_BASEREL);
 
3175
                if (rel->tuples > 0)
 
3176
                {
 
3177
                        /*
 
3178
                         * Clamp to size of rel, or size of rel / 10 if multiple Vars. The
 
3179
                         * fudge factor is because the Vars are probably correlated but we
 
3180
                         * don't know by how much.  We should never clamp to less than the
 
3181
                         * largest ndistinct value for any of the Vars, though, since
 
3182
                         * there will surely be at least that many groups.
 
3183
                         */
 
3184
                        double          clamp = rel->tuples;
 
3185
 
 
3186
                        if (relvarcount > 1)
 
3187
                        {
 
3188
                                clamp *= 0.1;
 
3189
                                if (clamp < relmaxndistinct)
 
3190
                                {
 
3191
                                        clamp = relmaxndistinct;
 
3192
                                        /* for sanity in case some ndistinct is too large: */
 
3193
                                        if (clamp > rel->tuples)
 
3194
                                                clamp = rel->tuples;
 
3195
                                }
 
3196
                        }
 
3197
                        if (reldistinct > clamp)
 
3198
                                reldistinct = clamp;
 
3199
 
 
3200
                        /*
 
3201
                         * Multiply by restriction selectivity.
 
3202
                         */
 
3203
                        reldistinct *= rel->rows / rel->tuples;
 
3204
 
 
3205
                        /*
 
3206
                         * Update estimate of total distinct groups.
 
3207
                         */
 
3208
                        numdistinct *= reldistinct;
 
3209
                }
 
3210
 
 
3211
                varinfos = newvarinfos;
 
3212
        } while (varinfos != NIL);
 
3213
 
 
3214
        numdistinct = ceil(numdistinct);
 
3215
 
 
3216
        /* Guard against out-of-range answers */
 
3217
        if (numdistinct > input_rows)
 
3218
                numdistinct = input_rows;
 
3219
        if (numdistinct < 1.0)
 
3220
                numdistinct = 1.0;
 
3221
 
 
3222
        return numdistinct;
 
3223
}
 
3224
 
 
3225
/*
 
3226
 * Estimate hash bucketsize fraction (ie, number of entries in a bucket
 
3227
 * divided by total tuples in relation) if the specified expression is used
 
3228
 * as a hash key.
 
3229
 *
 
3230
 * XXX This is really pretty bogus since we're effectively assuming that the
 
3231
 * distribution of hash keys will be the same after applying restriction
 
3232
 * clauses as it was in the underlying relation.  However, we are not nearly
 
3233
 * smart enough to figure out how the restrict clauses might change the
 
3234
 * distribution, so this will have to do for now.
 
3235
 *
 
3236
 * We are passed the number of buckets the executor will use for the given
 
3237
 * input relation.      If the data were perfectly distributed, with the same
 
3238
 * number of tuples going into each available bucket, then the bucketsize
 
3239
 * fraction would be 1/nbuckets.  But this happy state of affairs will occur
 
3240
 * only if (a) there are at least nbuckets distinct data values, and (b)
 
3241
 * we have a not-too-skewed data distribution.  Otherwise the buckets will
 
3242
 * be nonuniformly occupied.  If the other relation in the join has a key
 
3243
 * distribution similar to this one's, then the most-loaded buckets are
 
3244
 * exactly those that will be probed most often.  Therefore, the "average"
 
3245
 * bucket size for costing purposes should really be taken as something close
 
3246
 * to the "worst case" bucket size.  We try to estimate this by adjusting the
 
3247
 * fraction if there are too few distinct data values, and then scaling up
 
3248
 * by the ratio of the most common value's frequency to the average frequency.
 
3249
 *
 
3250
 * If no statistics are available, use a default estimate of 0.1.  This will
 
3251
 * discourage use of a hash rather strongly if the inner relation is large,
 
3252
 * which is what we want.  We do not want to hash unless we know that the
 
3253
 * inner rel is well-dispersed (or the alternatives seem much worse).
 
3254
 */
 
3255
Selectivity
 
3256
estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
 
3257
{
 
3258
        VariableStatData vardata;
 
3259
        double          estfract,
 
3260
                                ndistinct,
 
3261
                                stanullfrac,
 
3262
                                mcvfreq,
 
3263
                                avgfreq;
 
3264
        float4     *numbers;
 
3265
        int                     nnumbers;
 
3266
 
 
3267
        examine_variable(root, hashkey, 0, &vardata);
 
3268
 
 
3269
        /* Get number of distinct values and fraction that are null */
 
3270
        ndistinct = get_variable_numdistinct(&vardata);
 
3271
 
 
3272
        if (HeapTupleIsValid(vardata.statsTuple))
 
3273
        {
 
3274
                Form_pg_statistic stats;
 
3275
 
 
3276
                stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
 
3277
                stanullfrac = stats->stanullfrac;
 
3278
        }
 
3279
        else
 
3280
        {
 
3281
                /*
 
3282
                 * Believe a default ndistinct only if it came from stats. Otherwise
 
3283
                 * punt and return 0.1, per comments above.
 
3284
                 */
 
3285
                if (ndistinct == DEFAULT_NUM_DISTINCT)
 
3286
                {
 
3287
                        ReleaseVariableStats(vardata);
 
3288
                        return (Selectivity) 0.1;
 
3289
                }
 
3290
 
 
3291
                stanullfrac = 0.0;
 
3292
        }
 
3293
 
 
3294
        /* Compute avg freq of all distinct data values in raw relation */
 
3295
        avgfreq = (1.0 - stanullfrac) / ndistinct;
 
3296
 
 
3297
        /*
 
3298
         * Adjust ndistinct to account for restriction clauses.  Observe we are
 
3299
         * assuming that the data distribution is affected uniformly by the
 
3300
         * restriction clauses!
 
3301
         *
 
3302
         * XXX Possibly better way, but much more expensive: multiply by
 
3303
         * selectivity of rel's restriction clauses that mention the target Var.
 
3304
         */
 
3305
        if (vardata.rel)
 
3306
                ndistinct *= vardata.rel->rows / vardata.rel->tuples;
 
3307
 
 
3308
        /*
 
3309
         * Initial estimate of bucketsize fraction is 1/nbuckets as long as the
 
3310
         * number of buckets is less than the expected number of distinct values;
 
3311
         * otherwise it is 1/ndistinct.
 
3312
         */
 
3313
        if (ndistinct > nbuckets)
 
3314
                estfract = 1.0 / nbuckets;
 
3315
        else
 
3316
                estfract = 1.0 / ndistinct;
 
3317
 
 
3318
        /*
 
3319
         * Look up the frequency of the most common value, if available.
 
3320
         */
 
3321
        mcvfreq = 0.0;
 
3322
 
 
3323
        if (HeapTupleIsValid(vardata.statsTuple))
 
3324
        {
 
3325
                if (get_attstatsslot(vardata.statsTuple,
 
3326
                                                         vardata.atttype, vardata.atttypmod,
 
3327
                                                         STATISTIC_KIND_MCV, InvalidOid,
 
3328
                                                         NULL,
 
3329
                                                         NULL, NULL,
 
3330
                                                         &numbers, &nnumbers))
 
3331
                {
 
3332
                        /*
 
3333
                         * The first MCV stat is for the most common value.
 
3334
                         */
 
3335
                        if (nnumbers > 0)
 
3336
                                mcvfreq = numbers[0];
 
3337
                        free_attstatsslot(vardata.atttype, NULL, 0,
 
3338
                                                          numbers, nnumbers);
 
3339
                }
 
3340
        }
 
3341
 
 
3342
        /*
 
3343
         * Adjust estimated bucketsize upward to account for skewed distribution.
 
3344
         */
 
3345
        if (avgfreq > 0.0 && mcvfreq > avgfreq)
 
3346
                estfract *= mcvfreq / avgfreq;
 
3347
 
 
3348
        /*
 
3349
         * Clamp bucketsize to sane range (the above adjustment could easily
 
3350
         * produce an out-of-range result).  We set the lower bound a little above
 
3351
         * zero, since zero isn't a very sane result.
 
3352
         */
 
3353
        if (estfract < 1.0e-6)
 
3354
                estfract = 1.0e-6;
 
3355
        else if (estfract > 1.0)
 
3356
                estfract = 1.0;
 
3357
 
 
3358
        ReleaseVariableStats(vardata);
 
3359
 
 
3360
        return (Selectivity) estfract;
 
3361
}
 
3362
 
 
3363
 
 
3364
/*-------------------------------------------------------------------------
 
3365
 *
 
3366
 * Support routines
 
3367
 *
 
3368
 *-------------------------------------------------------------------------
 
3369
 */
 
3370
 
 
3371
/*
 
3372
 * convert_to_scalar
 
3373
 *        Convert non-NULL values of the indicated types to the comparison
 
3374
 *        scale needed by scalarineqsel().
 
3375
 *        Returns "true" if successful.
 
3376
 *
 
3377
 * XXX this routine is a hack: ideally we should look up the conversion
 
3378
 * subroutines in pg_type.
 
3379
 *
 
3380
 * All numeric datatypes are simply converted to their equivalent
 
3381
 * "double" values.  (NUMERIC values that are outside the range of "double"
 
3382
 * are clamped to +/- HUGE_VAL.)
 
3383
 *
 
3384
 * String datatypes are converted by convert_string_to_scalar(),
 
3385
 * which is explained below.  The reason why this routine deals with
 
3386
 * three values at a time, not just one, is that we need it for strings.
 
3387
 *
 
3388
 * The bytea datatype is just enough different from strings that it has
 
3389
 * to be treated separately.
 
3390
 *
 
3391
 * The several datatypes representing absolute times are all converted
 
3392
 * to Timestamp, which is actually a double, and then we just use that
 
3393
 * double value.  Note this will give correct results even for the "special"
 
3394
 * values of Timestamp, since those are chosen to compare correctly;
 
3395
 * see timestamp_cmp.
 
3396
 *
 
3397
 * The several datatypes representing relative times (intervals) are all
 
3398
 * converted to measurements expressed in seconds.
 
3399
 */
 
3400
static bool
 
3401
convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
 
3402
                                  Datum lobound, Datum hibound, Oid boundstypid,
 
3403
                                  double *scaledlobound, double *scaledhibound)
 
3404
{
 
3405
        /*
 
3406
         * Both the valuetypid and the boundstypid should exactly match the
 
3407
         * declared input type(s) of the operator we are invoked for, so we just
 
3408
         * error out if either is not recognized.
 
3409
         *
 
3410
         * XXX The histogram we are interpolating between points of could belong
 
3411
         * to a column that's only binary-compatible with the declared type. In
 
3412
         * essence we are assuming that the semantics of binary-compatible types
 
3413
         * are enough alike that we can use a histogram generated with one type's
 
3414
         * operators to estimate selectivity for the other's.  This is outright
 
3415
         * wrong in some cases --- in particular signed versus unsigned
 
3416
         * interpretation could trip us up.  But it's useful enough in the
 
3417
         * majority of cases that we do it anyway.      Should think about more
 
3418
         * rigorous ways to do it.
 
3419
         */
 
3420
        switch (valuetypid)
 
3421
        {
 
3422
                        /*
 
3423
                         * Built-in numeric types
 
3424
                         */
 
3425
                case BOOLOID:
 
3426
                case INT2OID:
 
3427
                case INT4OID:
 
3428
                case INT8OID:
 
3429
                case FLOAT4OID:
 
3430
                case FLOAT8OID:
 
3431
                case NUMERICOID:
 
3432
                case OIDOID:
 
3433
                case REGPROCOID:
 
3434
                case REGPROCEDUREOID:
 
3435
                case REGOPEROID:
 
3436
                case REGOPERATOROID:
 
3437
                case REGCLASSOID:
 
3438
                case REGTYPEOID:
 
3439
                case REGCONFIGOID:
 
3440
                case REGDICTIONARYOID:
 
3441
                        *scaledvalue = convert_numeric_to_scalar(value, valuetypid);
 
3442
                        *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid);
 
3443
                        *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid);
 
3444
                        return true;
 
3445
 
 
3446
                        /*
 
3447
                         * Built-in string types
 
3448
                         */
 
3449
                case CHAROID:
 
3450
                case BPCHAROID:
 
3451
                case VARCHAROID:
 
3452
                case TEXTOID:
 
3453
                case NAMEOID:
 
3454
                        {
 
3455
                                char       *valstr = convert_string_datum(value, valuetypid);
 
3456
                                char       *lostr = convert_string_datum(lobound, boundstypid);
 
3457
                                char       *histr = convert_string_datum(hibound, boundstypid);
 
3458
 
 
3459
                                convert_string_to_scalar(valstr, scaledvalue,
 
3460
                                                                                 lostr, scaledlobound,
 
3461
                                                                                 histr, scaledhibound);
 
3462
                                pfree(valstr);
 
3463
                                pfree(lostr);
 
3464
                                pfree(histr);
 
3465
                                return true;
 
3466
                        }
 
3467
 
 
3468
                        /*
 
3469
                         * Built-in bytea type
 
3470
                         */
 
3471
                case BYTEAOID:
 
3472
                        {
 
3473
                                convert_bytea_to_scalar(value, scaledvalue,
 
3474
                                                                                lobound, scaledlobound,
 
3475
                                                                                hibound, scaledhibound);
 
3476
                                return true;
 
3477
                        }
 
3478
 
 
3479
                        /*
 
3480
                         * Built-in time types
 
3481
                         */
 
3482
                case TIMESTAMPOID:
 
3483
                case TIMESTAMPTZOID:
 
3484
                case ABSTIMEOID:
 
3485
                case DATEOID:
 
3486
                case INTERVALOID:
 
3487
                case RELTIMEOID:
 
3488
                case TINTERVALOID:
 
3489
                case TIMEOID:
 
3490
                case TIMETZOID:
 
3491
                        *scaledvalue = convert_timevalue_to_scalar(value, valuetypid);
 
3492
                        *scaledlobound = convert_timevalue_to_scalar(lobound, boundstypid);
 
3493
                        *scaledhibound = convert_timevalue_to_scalar(hibound, boundstypid);
 
3494
                        return true;
 
3495
 
 
3496
                        /*
 
3497
                         * Built-in network types
 
3498
                         */
 
3499
                case INETOID:
 
3500
                case CIDROID:
 
3501
                case MACADDROID:
 
3502
                        *scaledvalue = convert_network_to_scalar(value, valuetypid);
 
3503
                        *scaledlobound = convert_network_to_scalar(lobound, boundstypid);
 
3504
                        *scaledhibound = convert_network_to_scalar(hibound, boundstypid);
 
3505
                        return true;
 
3506
        }
 
3507
        /* Don't know how to convert */
 
3508
        *scaledvalue = *scaledlobound = *scaledhibound = 0;
 
3509
        return false;
 
3510
}
 
3511
 
 
3512
/*
 
3513
 * Do convert_to_scalar()'s work for any numeric data type.
 
3514
 */
 
3515
static double
 
3516
convert_numeric_to_scalar(Datum value, Oid typid)
 
3517
{
 
3518
        switch (typid)
 
3519
        {
 
3520
                case BOOLOID:
 
3521
                        return (double) DatumGetBool(value);
 
3522
                case INT2OID:
 
3523
                        return (double) DatumGetInt16(value);
 
3524
                case INT4OID:
 
3525
                        return (double) DatumGetInt32(value);
 
3526
                case INT8OID:
 
3527
                        return (double) DatumGetInt64(value);
 
3528
                case FLOAT4OID:
 
3529
                        return (double) DatumGetFloat4(value);
 
3530
                case FLOAT8OID:
 
3531
                        return (double) DatumGetFloat8(value);
 
3532
                case NUMERICOID:
 
3533
                        /* Note: out-of-range values will be clamped to +-HUGE_VAL */
 
3534
                        return (double)
 
3535
                                DatumGetFloat8(DirectFunctionCall1(numeric_float8_no_overflow,
 
3536
                                                                                                   value));
 
3537
                case OIDOID:
 
3538
                case REGPROCOID:
 
3539
                case REGPROCEDUREOID:
 
3540
                case REGOPEROID:
 
3541
                case REGOPERATOROID:
 
3542
                case REGCLASSOID:
 
3543
                case REGTYPEOID:
 
3544
                case REGCONFIGOID:
 
3545
                case REGDICTIONARYOID:
 
3546
                        /* we can treat OIDs as integers... */
 
3547
                        return (double) DatumGetObjectId(value);
 
3548
        }
 
3549
 
 
3550
        /*
 
3551
         * Can't get here unless someone tries to use scalarltsel/scalargtsel on
 
3552
         * an operator with one numeric and one non-numeric operand.
 
3553
         */
 
3554
        elog(ERROR, "unsupported type: %u", typid);
 
3555
        return 0;
 
3556
}
 
3557
 
 
3558
/*
 
3559
 * Do convert_to_scalar()'s work for any character-string data type.
 
3560
 *
 
3561
 * String datatypes are converted to a scale that ranges from 0 to 1,
 
3562
 * where we visualize the bytes of the string as fractional digits.
 
3563
 *
 
3564
 * We do not want the base to be 256, however, since that tends to
 
3565
 * generate inflated selectivity estimates; few databases will have
 
3566
 * occurrences of all 256 possible byte values at each position.
 
3567
 * Instead, use the smallest and largest byte values seen in the bounds
 
3568
 * as the estimated range for each byte, after some fudging to deal with
 
3569
 * the fact that we probably aren't going to see the full range that way.
 
3570
 *
 
3571
 * An additional refinement is that we discard any common prefix of the
 
3572
 * three strings before computing the scaled values.  This allows us to
 
3573
 * "zoom in" when we encounter a narrow data range.  An example is a phone
 
3574
 * number database where all the values begin with the same area code.
 
3575
 * (Actually, the bounds will be adjacent histogram-bin-boundary values,
 
3576
 * so this is more likely to happen than you might think.)
 
3577
 */
 
3578
static void
convert_string_to_scalar(char *value,
						 double *scaledvalue,
						 char *lobound,
						 double *scaledlobound,
						 char *hibound,
						 double *scaledhibound)
{
	/* ASCII classes that are widened to their full extent, in this order */
	static const int charclass[3][2] = {
		{'A', 'Z'},				/* upper-case letters */
		{'a', 'z'},				/* lower-case letters */
		{'0', '9'}				/* digits */
	};
	int			minbyte,
				maxbyte;
	char	   *bounds[2];
	int			i;

	/*
	 * Find the smallest and largest byte value occurring in either bound,
	 * starting from the first byte of hibound.
	 */
	bounds[0] = lobound;
	bounds[1] = hibound;
	minbyte = maxbyte = (unsigned char) hibound[0];
	for (i = 0; i < 2; i++)
	{
		const unsigned char *p;

		for (p = (const unsigned char *) bounds[i]; *p != '\0'; p++)
		{
			if (minbyte > *p)
				minbyte = *p;
			if (maxbyte < *p)
				maxbyte = *p;
		}
	}

	/* If the range touches any part of a class, make it cover the class */
	for (i = 0; i < 3; i++)
	{
		int			first = charclass[i][0];
		int			last = charclass[i][1];

		if (minbyte <= last && maxbyte >= first)
		{
			if (minbyte > first)
				minbyte = first;
			if (maxbyte < last)
				maxbyte = last;
		}
	}

	/*
	 * A range of fewer than 10 byte values is taken to mean we have not seen
	 * enough data; fall back to the regular ASCII set.
	 */
	if (maxbyte - minbyte < 9)
	{
		minbyte = ' ';
		maxbyte = 127;
	}

	/*
	 * Skip over any prefix shared by all three strings, stopping at the end
	 * of lobound or at the first position where the strings differ.
	 */
	for (; *lobound != '\0'; lobound++, hibound++, value++)
	{
		if (*lobound != *hibound || *lobound != *value)
			break;
	}

	/* Convert what remains of each string using the chosen byte range */
	*scaledvalue = convert_one_string_to_scalar(value, minbyte, maxbyte);
	*scaledlobound = convert_one_string_to_scalar(lobound, minbyte, maxbyte);
	*scaledhibound = convert_one_string_to_scalar(hibound, minbyte, maxbyte);
}
 
3657
 
 
3658
static double
convert_one_string_to_scalar(char *value, int rangelo, int rangehi)
{
	int			nchars = strlen(value);
	double		radix;			/* number of byte values in the range */
	double		place;			/* divisor for the current position */
	double		fraction;
	int			i;

	/* The empty string maps to zero */
	if (nchars <= 0)
		return 0.0;

	/*
	 * Only the first 20 characters are examined; later positions are
	 * ignored.
	 */
	if (nchars > 20)
		nchars = 20;

	/*
	 * Read the leading characters as successive fractional digits in base
	 * "radix".  A byte below the range is treated as one less than rangelo,
	 * and a byte above it as one more than rangehi.
	 */
	radix = rangehi - rangelo + 1;
	place = radix;
	fraction = 0.0;
	for (i = 0; i < nchars; i++)
	{
		int			digit = (unsigned char) value[i];

		if (digit < rangelo)
			digit = rangelo - 1;
		else if (digit > rangehi)
			digit = rangehi + 1;
		fraction += ((double) (digit - rangelo)) / place;
		place *= radix;
	}

	return fraction;
}
 
3693
 
 
3694
/*
 
3695
 * Convert a string-type Datum into a palloc'd, null-terminated string.
 
3696
 *
 
3697
 * When using a non-C locale, we must pass the string through strxfrm()
 
3698
 * before continuing, so as to generate correct locale-specific results.
 
3699
 */
 
3700
static char *
 
3701
convert_string_datum(Datum value, Oid typid)
 
3702
{
 
3703
        char       *val;
 
3704
 
 
3705
        switch (typid)
 
3706
        {
 
3707
                case CHAROID:
 
3708
                        val = (char *) palloc(2);
 
3709
                        val[0] = DatumGetChar(value);
 
3710
                        val[1] = '\0';
 
3711
                        break;
 
3712
                case BPCHAROID:
 
3713
                case VARCHAROID:
 
3714
                case TEXTOID:
 
3715
                        val = TextDatumGetCString(value);
 
3716
                        break;
 
3717
                case NAMEOID:
 
3718
                        {
 
3719
                                NameData   *nm = (NameData *) DatumGetPointer(value);
 
3720
 
 
3721
                                val = pstrdup(NameStr(*nm));
 
3722
                                break;
 
3723
                        }
 
3724
                default:
 
3725
 
 
3726
                        /*
 
3727
                         * Can't get here unless someone tries to use scalarltsel on an
 
3728
                         * operator with one string and one non-string operand.
 
3729
                         */
 
3730
                        elog(ERROR, "unsupported type: %u", typid);
 
3731
                        return NULL;
 
3732
        }
 
3733
 
 
3734
        if (!lc_collate_is_c(DEFAULT_COLLATION_OID))
 
3735
        {
 
3736
                char       *xfrmstr;
 
3737
                size_t          xfrmlen;
 
3738
                size_t          xfrmlen2;
 
3739
 
 
3740
                /*
 
3741
                 * Note: originally we guessed at a suitable output buffer size, and
 
3742
                 * only needed to call strxfrm twice if our guess was too small.
 
3743
                 * However, it seems that some versions of Solaris have buggy strxfrm
 
3744
                 * that can write past the specified buffer length in that scenario.
 
3745
                 * So, do it the dumb way for portability.
 
3746
                 *
 
3747
                 * Yet other systems (e.g., glibc) sometimes return a smaller value
 
3748
                 * from the second call than the first; thus the Assert must be <= not
 
3749
                 * == as you'd expect.  Can't any of these people program their way
 
3750
                 * out of a paper bag?
 
3751
                 *
 
3752
                 * XXX: strxfrm doesn't support UTF-8 encoding on Win32, it can return
 
3753
                 * bogus data or set an error. This is not really a problem unless it
 
3754
                 * crashes since it will only give an estimation error and nothing
 
3755
                 * fatal.
 
3756
                 */
 
3757
#if _MSC_VER == 1400                    /* VS.Net 2005 */
 
3758
 
 
3759
                /*
 
3760
                 *
 
3761
                 * http://connect.microsoft.com/VisualStudio/feedback/ViewFeedback.aspx?
 
3762
                 * FeedbackID=99694 */
 
3763
                {
 
3764
                        char            x[1];
 
3765
 
 
3766
                        xfrmlen = strxfrm(x, val, 0);
 
3767
                }
 
3768
#else
 
3769
                xfrmlen = strxfrm(NULL, val, 0);
 
3770
#endif
 
3771
#ifdef WIN32
 
3772
 
 
3773
                /*
 
3774
                 * On Windows, strxfrm returns INT_MAX when an error occurs. Instead
 
3775
                 * of trying to allocate this much memory (and fail), just return the
 
3776
                 * original string unmodified as if we were in the C locale.
 
3777
                 */
 
3778
                if (xfrmlen == INT_MAX)
 
3779
                        return val;
 
3780
#endif
 
3781
                xfrmstr = (char *) palloc(xfrmlen + 1);
 
3782
                xfrmlen2 = strxfrm(xfrmstr, val, xfrmlen + 1);
 
3783
                Assert(xfrmlen2 <= xfrmlen);
 
3784
                pfree(val);
 
3785
                val = xfrmstr;
 
3786
        }
 
3787
 
 
3788
        return val;
 
3789
}
 
3790
 
 
3791
/*
 
3792
 * Do convert_to_scalar()'s work for any bytea data type.
 
3793
 *
 
3794
 * Very similar to convert_string_to_scalar except we can't assume
 
3795
 * null-termination and therefore pass explicit lengths around.
 
3796
 *
 
3797
 * Also, assumptions about likely "normal" ranges of characters have been
 
3798
 * removed - a data range of 0..255 is always used, for now.  (Perhaps
 
3799
 * someday we will add information about actual byte data range to
 
3800
 * pg_statistic.)
 
3801
 */
 
3802
static void
 
3803
convert_bytea_to_scalar(Datum value,
 
3804
                                                double *scaledvalue,
 
3805
                                                Datum lobound,
 
3806
                                                double *scaledlobound,
 
3807
                                                Datum hibound,
 
3808
                                                double *scaledhibound)
 
3809
{
 
3810
        int                     rangelo,
 
3811
                                rangehi,
 
3812
                                valuelen = VARSIZE(DatumGetPointer(value)) - VARHDRSZ,
 
3813
                                loboundlen = VARSIZE(DatumGetPointer(lobound)) - VARHDRSZ,
 
3814
                                hiboundlen = VARSIZE(DatumGetPointer(hibound)) - VARHDRSZ,
 
3815
                                i,
 
3816
                                minlen;
 
3817
        unsigned char *valstr = (unsigned char *) VARDATA(DatumGetPointer(value)),
 
3818
                           *lostr = (unsigned char *) VARDATA(DatumGetPointer(lobound)),
 
3819
                           *histr = (unsigned char *) VARDATA(DatumGetPointer(hibound));
 
3820
 
 
3821
        /*
 
3822
         * Assume bytea data is uniformly distributed across all byte values.
 
3823
         */
 
3824
        rangelo = 0;
 
3825
        rangehi = 255;
 
3826
 
 
3827
        /*
 
3828
         * Now strip any common prefix of the three strings.
 
3829
         */
 
3830
        minlen = Min(Min(valuelen, loboundlen), hiboundlen);
 
3831
        for (i = 0; i < minlen; i++)
 
3832
        {
 
3833
                if (*lostr != *histr || *lostr != *valstr)
 
3834
                        break;
 
3835
                lostr++, histr++, valstr++;
 
3836
                loboundlen--, hiboundlen--, valuelen--;
 
3837
        }
 
3838
 
 
3839
        /*
 
3840
         * Now we can do the conversions.
 
3841
         */
 
3842
        *scaledvalue = convert_one_bytea_to_scalar(valstr, valuelen, rangelo, rangehi);
 
3843
        *scaledlobound = convert_one_bytea_to_scalar(lostr, loboundlen, rangelo, rangehi);
 
3844
        *scaledhibound = convert_one_bytea_to_scalar(histr, hiboundlen, rangelo, rangehi);
 
3845
}
 
3846
 
 
3847
static double
convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
							int rangelo, int rangehi)
{
	/*
	 * Interpret the leading bytes of "value" as the digits of a fraction
	 * written in base (rangehi - rangelo + 1).  Bytes outside the range
	 * rangelo..rangehi are clamped to one step beyond the nearer end.
	 */
	double		radix = rangehi - rangelo + 1;
	double		fraction = 0.0;
	double		place = radix;
	int			ndigits;
	int			k;

	/* An empty string maps to scalar value 0 */
	if (valuelen <= 0)
		return 0.0;

	/*
	 * With a base of 256 there is no point in looking at more than about
	 * 10 bytes (even that many seems like overkill).
	 */
	ndigits = (valuelen > 10) ? 10 : valuelen;

	for (k = 0; k < ndigits; k++)
	{
		int			digit = value[k];

		if (digit < rangelo)
			digit = rangelo - 1;
		else if (digit > rangehi)
			digit = rangehi + 1;

		fraction += ((double) (digit - rangelo)) / place;
		place *= radix;
	}

	return fraction;
}
 
3883
 
 
3884
/*
 
3885
 * Do convert_to_scalar()'s work for any timevalue data type.
 
3886
 */
 
3887
static double
 
3888
convert_timevalue_to_scalar(Datum value, Oid typid)
 
3889
{
 
3890
        switch (typid)
 
3891
        {
 
3892
                case TIMESTAMPOID:
 
3893
                        return DatumGetTimestamp(value);
 
3894
                case TIMESTAMPTZOID:
 
3895
                        return DatumGetTimestampTz(value);
 
3896
                case ABSTIMEOID:
 
3897
                        return DatumGetTimestamp(DirectFunctionCall1(abstime_timestamp,
 
3898
                                                                                                                 value));
 
3899
                case DATEOID:
 
3900
                        return date2timestamp_no_overflow(DatumGetDateADT(value));
 
3901
                case INTERVALOID:
 
3902
                        {
 
3903
                                Interval   *interval = DatumGetIntervalP(value);
 
3904
 
 
3905
                                /*
 
3906
                                 * Convert the month part of Interval to days using assumed
 
3907
                                 * average month length of 365.25/12.0 days.  Not too
 
3908
                                 * accurate, but plenty good enough for our purposes.
 
3909
                                 */
 
3910
#ifdef HAVE_INT64_TIMESTAMP
 
3911
                                return interval->time + interval->day * (double) USECS_PER_DAY +
 
3912
                                        interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * USECS_PER_DAY);
 
3913
#else
 
3914
                                return interval->time + interval->day * SECS_PER_DAY +
 
3915
                                        interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * (double) SECS_PER_DAY);
 
3916
#endif
 
3917
                        }
 
3918
                case RELTIMEOID:
 
3919
#ifdef HAVE_INT64_TIMESTAMP
 
3920
                        return (DatumGetRelativeTime(value) * 1000000.0);
 
3921
#else
 
3922
                        return DatumGetRelativeTime(value);
 
3923
#endif
 
3924
                case TINTERVALOID:
 
3925
                        {
 
3926
                                TimeInterval tinterval = DatumGetTimeInterval(value);
 
3927
 
 
3928
#ifdef HAVE_INT64_TIMESTAMP
 
3929
                                if (tinterval->status != 0)
 
3930
                                        return ((tinterval->data[1] - tinterval->data[0]) * 1000000.0);
 
3931
#else
 
3932
                                if (tinterval->status != 0)
 
3933
                                        return tinterval->data[1] - tinterval->data[0];
 
3934
#endif
 
3935
                                return 0;               /* for lack of a better idea */
 
3936
                        }
 
3937
                case TIMEOID:
 
3938
                        return DatumGetTimeADT(value);
 
3939
                case TIMETZOID:
 
3940
                        {
 
3941
                                TimeTzADT  *timetz = DatumGetTimeTzADTP(value);
 
3942
 
 
3943
                                /* use GMT-equivalent time */
 
3944
#ifdef HAVE_INT64_TIMESTAMP
 
3945
                                return (double) (timetz->time + (timetz->zone * 1000000.0));
 
3946
#else
 
3947
                                return (double) (timetz->time + timetz->zone);
 
3948
#endif
 
3949
                        }
 
3950
        }
 
3951
 
 
3952
        /*
 
3953
         * Can't get here unless someone tries to use scalarltsel/scalargtsel on
 
3954
         * an operator with one timevalue and one non-timevalue operand.
 
3955
         */
 
3956
        elog(ERROR, "unsupported type: %u", typid);
 
3957
        return 0;
 
3958
}
 
3959
 
 
3960
 
 
3961
/*
 
3962
 * get_restriction_variable
 
3963
 *              Examine the args of a restriction clause to see if it's of the
 
3964
 *              form (variable op pseudoconstant) or (pseudoconstant op variable),
 
3965
 *              where "variable" could be either a Var or an expression in vars of a
 
3966
 *              single relation.  If so, extract information about the variable,
 
3967
 *              and also indicate which side it was on and the other argument.
 
3968
 *
 
3969
 * Inputs:
 
3970
 *      root: the planner info
 
3971
 *      args: clause argument list
 
3972
 *      varRelid: see specs for restriction selectivity functions
 
3973
 *
 
3974
 * Outputs: (these are valid only if TRUE is returned)
 
3975
 *      *vardata: gets information about variable (see examine_variable)
 
3976
 *      *other: gets other clause argument, aggressively reduced to a constant
 
3977
 *      *varonleft: set TRUE if variable is on the left, FALSE if on the right
 
3978
 *
 
3979
 * Returns TRUE if a variable is identified, otherwise FALSE.
 
3980
 *
 
3981
 * Note: if there are Vars on both sides of the clause, we must fail, because
 
3982
 * callers are expecting that the other side will act like a pseudoconstant.
 
3983
 */
 
3984
bool
 
3985
get_restriction_variable(PlannerInfo *root, List *args, int varRelid,
 
3986
                                                 VariableStatData *vardata, Node **other,
 
3987
                                                 bool *varonleft)
 
3988
{
 
3989
        Node       *left,
 
3990
                           *right;
 
3991
        VariableStatData rdata;
 
3992
 
 
3993
        /* Fail if not a binary opclause (probably shouldn't happen) */
 
3994
        if (list_length(args) != 2)
 
3995
                return false;
 
3996
 
 
3997
        left = (Node *) linitial(args);
 
3998
        right = (Node *) lsecond(args);
 
3999
 
 
4000
        /*
 
4001
         * Examine both sides.  Note that when varRelid is nonzero, Vars of other
 
4002
         * relations will be treated as pseudoconstants.
 
4003
         */
 
4004
        examine_variable(root, left, varRelid, vardata);
 
4005
        examine_variable(root, right, varRelid, &rdata);
 
4006
 
 
4007
        /*
 
4008
         * If one side is a variable and the other not, we win.
 
4009
         */
 
4010
        if (vardata->rel && rdata.rel == NULL)
 
4011
        {
 
4012
                *varonleft = true;
 
4013
                *other = estimate_expression_value(root, rdata.var);
 
4014
                /* Assume we need no ReleaseVariableStats(rdata) here */
 
4015
                return true;
 
4016
        }
 
4017
 
 
4018
        if (vardata->rel == NULL && rdata.rel)
 
4019
        {
 
4020
                *varonleft = false;
 
4021
                *other = estimate_expression_value(root, vardata->var);
 
4022
                /* Assume we need no ReleaseVariableStats(*vardata) here */
 
4023
                *vardata = rdata;
 
4024
                return true;
 
4025
        }
 
4026
 
 
4027
        /* Ooops, clause has wrong structure (probably var op var) */
 
4028
        ReleaseVariableStats(*vardata);
 
4029
        ReleaseVariableStats(rdata);
 
4030
 
 
4031
        return false;
 
4032
}
 
4033
 
 
4034
/*
 
4035
 * get_join_variables
 
4036
 *              Apply examine_variable() to each side of a join clause.
 
4037
 *              Also, attempt to identify whether the join clause has the same
 
4038
 *              or reversed sense compared to the SpecialJoinInfo.
 
4039
 *
 
4040
 * We consider the join clause "normal" if it is "lhs_var OP rhs_var",
 
4041
 * or "reversed" if it is "rhs_var OP lhs_var".  In complicated cases
 
4042
 * where we can't tell for sure, we default to assuming it's normal.
 
4043
 */
 
4044
void
 
4045
get_join_variables(PlannerInfo *root, List *args, SpecialJoinInfo *sjinfo,
 
4046
                                   VariableStatData *vardata1, VariableStatData *vardata2,
 
4047
                                   bool *join_is_reversed)
 
4048
{
 
4049
        Node       *left,
 
4050
                           *right;
 
4051
 
 
4052
        if (list_length(args) != 2)
 
4053
                elog(ERROR, "join operator should take two arguments");
 
4054
 
 
4055
        left = (Node *) linitial(args);
 
4056
        right = (Node *) lsecond(args);
 
4057
 
 
4058
        examine_variable(root, left, 0, vardata1);
 
4059
        examine_variable(root, right, 0, vardata2);
 
4060
 
 
4061
        if (vardata1->rel &&
 
4062
                bms_is_subset(vardata1->rel->relids, sjinfo->syn_righthand))
 
4063
                *join_is_reversed = true;               /* var1 is on RHS */
 
4064
        else if (vardata2->rel &&
 
4065
                         bms_is_subset(vardata2->rel->relids, sjinfo->syn_lefthand))
 
4066
                *join_is_reversed = true;               /* var2 is on LHS */
 
4067
        else
 
4068
                *join_is_reversed = false;
 
4069
}
 
4070
 
 
4071
/*
 
4072
 * examine_variable
 
4073
 *              Try to look up statistical data about an expression.
 
4074
 *              Fill in a VariableStatData struct to describe the expression.
 
4075
 *
 
4076
 * Inputs:
 
4077
 *      root: the planner info
 
4078
 *      node: the expression tree to examine
 
4079
 *      varRelid: see specs for restriction selectivity functions
 
4080
 *
 
4081
 * Outputs: *vardata is filled as follows:
 
4082
 *      var: the input expression (with any binary relabeling stripped, if
 
4083
 *              it is or contains a variable; but otherwise the type is preserved)
 
4084
 *      rel: RelOptInfo for relation containing variable; NULL if expression
 
4085
 *              contains no Vars (NOTE this could point to a RelOptInfo of a
 
4086
 *              subquery, not one in the current query).
 
4087
 *      statsTuple: the pg_statistic entry for the variable, if one exists;
 
4088
 *              otherwise NULL.
 
4089
 *      freefunc: pointer to a function to release statsTuple with.
 
4090
 *      vartype: exposed type of the expression; this should always match
 
4091
 *              the declared input type of the operator we are estimating for.
 
4092
 *      atttype, atttypmod: type data to pass to get_attstatsslot().  This is
 
4093
 *              commonly the same as the exposed type of the variable argument,
 
4094
 *              but can be different in binary-compatible-type cases.
 
4095
 *      isunique: TRUE if we were able to match the var to a unique index,
 
4096
 *              implying its values are unique for this query.
 
4097
 *
 
4098
 * Caller is responsible for doing ReleaseVariableStats() before exiting.
 
4099
 */
 
4100
void
 
4101
examine_variable(PlannerInfo *root, Node *node, int varRelid,
 
4102
                                 VariableStatData *vardata)
 
4103
{
 
4104
        Node       *basenode;
 
4105
        Relids          varnos;
 
4106
        RelOptInfo *onerel;
 
4107
 
 
4108
        /* Make sure we don't return dangling pointers in vardata */
 
4109
        MemSet(vardata, 0, sizeof(VariableStatData));
 
4110
 
 
4111
        /* Save the exposed type of the expression */
 
4112
        vardata->vartype = exprType(node);
 
4113
 
 
4114
        /* Look inside any binary-compatible relabeling */
 
4115
 
 
4116
        if (IsA(node, RelabelType))
 
4117
                basenode = (Node *) ((RelabelType *) node)->arg;
 
4118
        else
 
4119
                basenode = node;
 
4120
 
 
4121
        /* Fast path for a simple Var */
 
4122
 
 
4123
        if (IsA(basenode, Var) &&
 
4124
                (varRelid == 0 || varRelid == ((Var *) basenode)->varno))
 
4125
        {
 
4126
                Var                *var = (Var *) basenode;
 
4127
                RangeTblEntry *rte;
 
4128
 
 
4129
                vardata->var = basenode;        /* return Var without relabeling */
 
4130
                vardata->rel = find_base_rel(root, var->varno);
 
4131
                vardata->atttype = var->vartype;
 
4132
                vardata->atttypmod = var->vartypmod;
 
4133
                vardata->isunique = has_unique_index(vardata->rel, var->varattno);
 
4134
 
 
4135
                rte = root->simple_rte_array[var->varno];
 
4136
 
 
4137
                if (get_relation_stats_hook &&
 
4138
                        (*get_relation_stats_hook) (root, rte, var->varattno, vardata))
 
4139
                {
 
4140
                        /*
 
4141
                         * The hook took control of acquiring a stats tuple.  If it did
 
4142
                         * supply a tuple, it'd better have supplied a freefunc.
 
4143
                         */
 
4144
                        if (HeapTupleIsValid(vardata->statsTuple) &&
 
4145
                                !vardata->freefunc)
 
4146
                                elog(ERROR, "no function provided to release variable stats with");
 
4147
                }
 
4148
                else if (rte->rtekind == RTE_RELATION)
 
4149
                {
 
4150
                        vardata->statsTuple = SearchSysCache3(STATRELATTINH,
 
4151
                                                                                                ObjectIdGetDatum(rte->relid),
 
4152
                                                                                                Int16GetDatum(var->varattno),
 
4153
                                                                                                  BoolGetDatum(rte->inh));
 
4154
                        vardata->freefunc = ReleaseSysCache;
 
4155
                }
 
4156
                else
 
4157
                {
 
4158
                        /*
 
4159
                         * XXX This means the Var comes from a JOIN or sub-SELECT. Later
 
4160
                         * add code to dig down into the join etc and see if we can trace
 
4161
                         * the variable to something with stats.  (But beware of
 
4162
                         * sub-SELECTs with DISTINCT/GROUP BY/etc.      Perhaps there are no
 
4163
                         * cases where this would really be useful, because we'd have
 
4164
                         * flattened the subselect if it is??)
 
4165
                         */
 
4166
                }
 
4167
 
 
4168
                return;
 
4169
        }
 
4170
 
 
4171
        /*
 
4172
         * Okay, it's a more complicated expression.  Determine variable
 
4173
         * membership.  Note that when varRelid isn't zero, only vars of that
 
4174
         * relation are considered "real" vars.
 
4175
         */
 
4176
        varnos = pull_varnos(basenode);
 
4177
 
 
4178
        onerel = NULL;
 
4179
 
 
4180
        switch (bms_membership(varnos))
 
4181
        {
 
4182
                case BMS_EMPTY_SET:
 
4183
                        /* No Vars at all ... must be pseudo-constant clause */
 
4184
                        break;
 
4185
                case BMS_SINGLETON:
 
4186
                        if (varRelid == 0 || bms_is_member(varRelid, varnos))
 
4187
                        {
 
4188
                                onerel = find_base_rel(root,
 
4189
                                           (varRelid ? varRelid : bms_singleton_member(varnos)));
 
4190
                                vardata->rel = onerel;
 
4191
                                node = basenode;        /* strip any relabeling */
 
4192
                        }
 
4193
                        /* else treat it as a constant */
 
4194
                        break;
 
4195
                case BMS_MULTIPLE:
 
4196
                        if (varRelid == 0)
 
4197
                        {
 
4198
                                /* treat it as a variable of a join relation */
 
4199
                                vardata->rel = find_join_rel(root, varnos);
 
4200
                                node = basenode;        /* strip any relabeling */
 
4201
                        }
 
4202
                        else if (bms_is_member(varRelid, varnos))
 
4203
                        {
 
4204
                                /* ignore the vars belonging to other relations */
 
4205
                                vardata->rel = find_base_rel(root, varRelid);
 
4206
                                node = basenode;        /* strip any relabeling */
 
4207
                                /* note: no point in expressional-index search here */
 
4208
                        }
 
4209
                        /* else treat it as a constant */
 
4210
                        break;
 
4211
        }
 
4212
 
 
4213
        bms_free(varnos);
 
4214
 
 
4215
        vardata->var = node;
 
4216
        vardata->atttype = exprType(node);
 
4217
        vardata->atttypmod = exprTypmod(node);
 
4218
 
 
4219
        if (onerel)
 
4220
        {
 
4221
                /*
 
4222
                 * We have an expression in vars of a single relation.  Try to match
 
4223
                 * it to expressional index columns, in hopes of finding some
 
4224
                 * statistics.
 
4225
                 *
 
4226
                 * XXX it's conceivable that there are multiple matches with different
 
4227
                 * index opfamilies; if so, we need to pick one that matches the
 
4228
                 * operator we are estimating for.      FIXME later.
 
4229
                 */
 
4230
                ListCell   *ilist;
 
4231
 
 
4232
                foreach(ilist, onerel->indexlist)
 
4233
                {
 
4234
                        IndexOptInfo *index = (IndexOptInfo *) lfirst(ilist);
 
4235
                        ListCell   *indexpr_item;
 
4236
                        int                     pos;
 
4237
 
 
4238
                        indexpr_item = list_head(index->indexprs);
 
4239
                        if (indexpr_item == NULL)
 
4240
                                continue;               /* no expressions here... */
 
4241
 
 
4242
                        for (pos = 0; pos < index->ncolumns; pos++)
 
4243
                        {
 
4244
                                if (index->indexkeys[pos] == 0)
 
4245
                                {
 
4246
                                        Node       *indexkey;
 
4247
 
 
4248
                                        if (indexpr_item == NULL)
 
4249
                                                elog(ERROR, "too few entries in indexprs list");
 
4250
                                        indexkey = (Node *) lfirst(indexpr_item);
 
4251
                                        if (indexkey && IsA(indexkey, RelabelType))
 
4252
                                                indexkey = (Node *) ((RelabelType *) indexkey)->arg;
 
4253
                                        if (equal(node, indexkey))
 
4254
                                        {
 
4255
                                                /*
 
4256
                                                 * Found a match ... is it a unique index? Tests here
 
4257
                                                 * should match has_unique_index().
 
4258
                                                 */
 
4259
                                                if (index->unique &&
 
4260
                                                        index->ncolumns == 1 &&
 
4261
                                                        (index->indpred == NIL || index->predOK))
 
4262
                                                        vardata->isunique = true;
 
4263
 
 
4264
                                                /*
 
4265
                                                 * Has it got stats?  We only consider stats for
 
4266
                                                 * non-partial indexes, since partial indexes probably
 
4267
                                                 * don't reflect whole-relation statistics; the above
 
4268
                                                 * check for uniqueness is the only info we take from
 
4269
                                                 * a partial index.
 
4270
                                                 *
 
4271
                                                 * An index stats hook, however, must make its own
 
4272
                                                 * decisions about what to do with partial indexes.
 
4273
                                                 */
 
4274
                                                if (get_index_stats_hook &&
 
4275
                                                        (*get_index_stats_hook) (root, index->indexoid,
 
4276
                                                                                                         pos + 1, vardata))
 
4277
                                                {
 
4278
                                                        /*
 
4279
                                                         * The hook took control of acquiring a stats
 
4280
                                                         * tuple.  If it did supply a tuple, it'd better
 
4281
                                                         * have supplied a freefunc.
 
4282
                                                         */
 
4283
                                                        if (HeapTupleIsValid(vardata->statsTuple) &&
 
4284
                                                                !vardata->freefunc)
 
4285
                                                                elog(ERROR, "no function provided to release variable stats with");
 
4286
                                                }
 
4287
                                                else if (index->indpred == NIL)
 
4288
                                                {
 
4289
                                                        vardata->statsTuple =
 
4290
                                                                SearchSysCache3(STATRELATTINH,
 
4291
                                                                                   ObjectIdGetDatum(index->indexoid),
 
4292
                                                                                                Int16GetDatum(pos + 1),
 
4293
                                                                                                BoolGetDatum(false));
 
4294
                                                        vardata->freefunc = ReleaseSysCache;
 
4295
                                                }
 
4296
                                                if (vardata->statsTuple)
 
4297
                                                        break;
 
4298
                                        }
 
4299
                                        indexpr_item = lnext(indexpr_item);
 
4300
                                }
 
4301
                        }
 
4302
                        if (vardata->statsTuple)
 
4303
                                break;
 
4304
                }
 
4305
        }
 
4306
}
 
4307
 
 
4308
/*
 
4309
 * get_variable_numdistinct
 
4310
 *        Estimate the number of distinct values of a variable.
 
4311
 *
 
4312
 * vardata: results of examine_variable
 
4313
 *
 
4314
 * NB: be careful to produce an integral result, since callers may compare
 
4315
 * the result to exact integer counts.
 
4316
 */
 
4317
double
 
4318
get_variable_numdistinct(VariableStatData *vardata)
 
4319
{
 
4320
        double          stadistinct;
 
4321
        double          ntuples;
 
4322
 
 
4323
        /*
 
4324
         * Determine the stadistinct value to use.      There are cases where we can
 
4325
         * get an estimate even without a pg_statistic entry, or can get a better
 
4326
         * value than is in pg_statistic.
 
4327
         */
 
4328
        if (HeapTupleIsValid(vardata->statsTuple))
 
4329
        {
 
4330
                /* Use the pg_statistic entry */
 
4331
                Form_pg_statistic stats;
 
4332
 
 
4333
                stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
 
4334
                stadistinct = stats->stadistinct;
 
4335
        }
 
4336
        else if (vardata->vartype == BOOLOID)
 
4337
        {
 
4338
                /*
 
4339
                 * Special-case boolean columns: presumably, two distinct values.
 
4340
                 *
 
4341
                 * Are there any other datatypes we should wire in special estimates
 
4342
                 * for?
 
4343
                 */
 
4344
                stadistinct = 2.0;
 
4345
        }
 
4346
        else
 
4347
        {
 
4348
                /*
 
4349
                 * We don't keep statistics for system columns, but in some cases we
 
4350
                 * can infer distinctness anyway.
 
4351
                 */
 
4352
                if (vardata->var && IsA(vardata->var, Var))
 
4353
                {
 
4354
                        switch (((Var *) vardata->var)->varattno)
 
4355
                        {
 
4356
                                case ObjectIdAttributeNumber:
 
4357
                                case SelfItemPointerAttributeNumber:
 
4358
                                        stadistinct = -1.0; /* unique */
 
4359
                                        break;
 
4360
                                case TableOidAttributeNumber:
 
4361
                                        stadistinct = 1.0;      /* only 1 value */
 
4362
                                        break;
 
4363
                                default:
 
4364
                                        stadistinct = 0.0;      /* means "unknown" */
 
4365
                                        break;
 
4366
                        }
 
4367
                }
 
4368
                else
 
4369
                        stadistinct = 0.0;      /* means "unknown" */
 
4370
 
 
4371
                /*
 
4372
                 * XXX consider using estimate_num_groups on expressions?
 
4373
                 */
 
4374
        }
 
4375
 
 
4376
        /*
 
4377
         * If there is a unique index for the variable, assume it is unique no
 
4378
         * matter what pg_statistic says; the statistics could be out of date, or
 
4379
         * we might have found a partial unique index that proves the var is
 
4380
         * unique for this query.
 
4381
         */
 
4382
        if (vardata->isunique)
 
4383
                stadistinct = -1.0;
 
4384
 
 
4385
        /*
 
4386
         * If we had an absolute estimate, use that.
 
4387
         */
 
4388
        if (stadistinct > 0.0)
 
4389
                return stadistinct;
 
4390
 
 
4391
        /*
 
4392
         * Otherwise we need to get the relation size; punt if not available.
 
4393
         */
 
4394
        if (vardata->rel == NULL)
 
4395
                return DEFAULT_NUM_DISTINCT;
 
4396
        ntuples = vardata->rel->tuples;
 
4397
        if (ntuples <= 0.0)
 
4398
                return DEFAULT_NUM_DISTINCT;
 
4399
 
 
4400
        /*
 
4401
         * If we had a relative estimate, use that.
 
4402
         */
 
4403
        if (stadistinct < 0.0)
 
4404
                return floor((-stadistinct * ntuples) + 0.5);
 
4405
 
 
4406
        /*
 
4407
         * With no data, estimate ndistinct = ntuples if the table is small, else
 
4408
         * use default.
 
4409
         */
 
4410
        if (ntuples < DEFAULT_NUM_DISTINCT)
 
4411
                return ntuples;
 
4412
 
 
4413
        return DEFAULT_NUM_DISTINCT;
 
4414
}
 
4415
 
 
4416
/*
 
4417
 * get_variable_range
 
4418
 *              Estimate the minimum and maximum value of the specified variable.
 
4419
 *              If successful, store values in *min and *max, and return TRUE.
 
4420
 *              If no data available, return FALSE.
 
4421
 *
 
4422
 * sortop is the "<" comparison operator to use.  This should generally
 
4423
 * be "<" not ">", as only the former is likely to be found in pg_statistic.
 
4424
 */
 
4425
static bool
 
4426
get_variable_range(PlannerInfo *root, VariableStatData *vardata, Oid sortop,
 
4427
                                   Datum *min, Datum *max)
 
4428
{
 
4429
        Datum           tmin = 0;
 
4430
        Datum           tmax = 0;
 
4431
        bool            have_data = false;
 
4432
        int16           typLen;
 
4433
        bool            typByVal;
 
4434
        Datum      *values;
 
4435
        int                     nvalues;
 
4436
        int                     i;
 
4437
 
 
4438
        /*
 
4439
         * XXX It's very tempting to try to use the actual column min and max, if
 
4440
         * we can get them relatively-cheaply with an index probe.      However, since
 
4441
         * this function is called many times during join planning, that could
 
4442
         * have unpleasant effects on planning speed.  Need more investigation
 
4443
         * before enabling this.
 
4444
         */
 
4445
#ifdef NOT_USED
 
4446
        if (get_actual_variable_range(root, vardata, sortop, min, max))
 
4447
                return true;
 
4448
#endif
 
4449
 
 
4450
        if (!HeapTupleIsValid(vardata->statsTuple))
 
4451
        {
 
4452
                /* no stats available, so default result */
 
4453
                return false;
 
4454
        }
 
4455
 
 
4456
        get_typlenbyval(vardata->atttype, &typLen, &typByVal);
 
4457
 
 
4458
        /*
 
4459
         * If there is a histogram, grab the first and last values.
 
4460
         *
 
4461
         * If there is a histogram that is sorted with some other operator than
 
4462
         * the one we want, fail --- this suggests that there is data we can't
 
4463
         * use.
 
4464
         */
 
4465
        if (get_attstatsslot(vardata->statsTuple,
 
4466
                                                 vardata->atttype, vardata->atttypmod,
 
4467
                                                 STATISTIC_KIND_HISTOGRAM, sortop,
 
4468
                                                 NULL,
 
4469
                                                 &values, &nvalues,
 
4470
                                                 NULL, NULL))
 
4471
        {
 
4472
                if (nvalues > 0)
 
4473
                {
 
4474
                        tmin = datumCopy(values[0], typByVal, typLen);
 
4475
                        tmax = datumCopy(values[nvalues - 1], typByVal, typLen);
 
4476
                        have_data = true;
 
4477
                }
 
4478
                free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
 
4479
        }
 
4480
        else if (get_attstatsslot(vardata->statsTuple,
 
4481
                                                          vardata->atttype, vardata->atttypmod,
 
4482
                                                          STATISTIC_KIND_HISTOGRAM, InvalidOid,
 
4483
                                                          NULL,
 
4484
                                                          &values, &nvalues,
 
4485
                                                          NULL, NULL))
 
4486
        {
 
4487
                free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
 
4488
                return false;
 
4489
        }
 
4490
 
 
4491
        /*
 
4492
         * If we have most-common-values info, look for extreme MCVs.  This is
 
4493
         * needed even if we also have a histogram, since the histogram excludes
 
4494
         * the MCVs.  However, usually the MCVs will not be the extreme values, so
 
4495
         * avoid unnecessary data copying.
 
4496
         */
 
4497
        if (get_attstatsslot(vardata->statsTuple,
 
4498
                                                 vardata->atttype, vardata->atttypmod,
 
4499
                                                 STATISTIC_KIND_MCV, InvalidOid,
 
4500
                                                 NULL,
 
4501
                                                 &values, &nvalues,
 
4502
                                                 NULL, NULL))
 
4503
        {
 
4504
                bool            tmin_is_mcv = false;
 
4505
                bool            tmax_is_mcv = false;
 
4506
                FmgrInfo        opproc;
 
4507
 
 
4508
                fmgr_info(get_opcode(sortop), &opproc);
 
4509
 
 
4510
                for (i = 0; i < nvalues; i++)
 
4511
                {
 
4512
                        if (!have_data)
 
4513
                        {
 
4514
                                tmin = tmax = values[i];
 
4515
                                tmin_is_mcv = tmax_is_mcv = have_data = true;
 
4516
                                continue;
 
4517
                        }
 
4518
                        if (DatumGetBool(FunctionCall2Coll(&opproc,
 
4519
                                                                                           DEFAULT_COLLATION_OID,
 
4520
                                                                                           values[i], tmin)))
 
4521
                        {
 
4522
                                tmin = values[i];
 
4523
                                tmin_is_mcv = true;
 
4524
                        }
 
4525
                        if (DatumGetBool(FunctionCall2Coll(&opproc,
 
4526
                                                                                           DEFAULT_COLLATION_OID,
 
4527
                                                                                           tmax, values[i])))
 
4528
                        {
 
4529
                                tmax = values[i];
 
4530
                                tmax_is_mcv = true;
 
4531
                        }
 
4532
                }
 
4533
                if (tmin_is_mcv)
 
4534
                        tmin = datumCopy(tmin, typByVal, typLen);
 
4535
                if (tmax_is_mcv)
 
4536
                        tmax = datumCopy(tmax, typByVal, typLen);
 
4537
                free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
 
4538
        }
 
4539
 
 
4540
        *min = tmin;
 
4541
        *max = tmax;
 
4542
        return have_data;
 
4543
}
 
4544
 
 
4545
 
 
4546
/*
 
4547
 * get_actual_variable_range
 
4548
 *              Attempt to identify the current *actual* minimum and/or maximum
 
4549
 *              of the specified variable, by looking for a suitable btree index
 
4550
 *              and fetching its low and/or high values.
 
4551
 *              If successful, store values in *min and *max, and return TRUE.
 
4552
 *              (Either pointer can be NULL if that endpoint isn't needed.)
 
4553
 *              If no data available, return FALSE.
 
4554
 *
 
4555
 * sortop is the "<" comparison operator to use.
 
4556
 */
 
4557
static bool
 
4558
get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
 
4559
                                                  Oid sortop,
 
4560
                                                  Datum *min, Datum *max)
 
4561
{
 
4562
        bool            have_data = false;
 
4563
        RelOptInfo *rel = vardata->rel;
 
4564
        RangeTblEntry *rte;
 
4565
        ListCell   *lc;
 
4566
 
 
4567
        /* No hope if no relation or it doesn't have indexes */
 
4568
        if (rel == NULL || rel->indexlist == NIL)
 
4569
                return false;
 
4570
        /* If it has indexes it must be a plain relation */
 
4571
        rte = root->simple_rte_array[rel->relid];
 
4572
        Assert(rte->rtekind == RTE_RELATION);
 
4573
 
 
4574
        /* Search through the indexes to see if any match our problem */
 
4575
        foreach(lc, rel->indexlist)
 
4576
        {
 
4577
                IndexOptInfo *index = (IndexOptInfo *) lfirst(lc);
 
4578
                ScanDirection indexscandir;
 
4579
 
 
4580
                /* Ignore non-btree indexes */
 
4581
                if (index->relam != BTREE_AM_OID)
 
4582
                        continue;
 
4583
 
 
4584
                /*
 
4585
                 * Ignore partial indexes --- we only want stats that cover the entire
 
4586
                 * relation.
 
4587
                 */
 
4588
                if (index->indpred != NIL)
 
4589
                        continue;
 
4590
 
 
4591
                /*
 
4592
                 * The index list might include hypothetical indexes inserted by a
 
4593
                 * get_relation_info hook --- don't try to access them.
 
4594
                 */
 
4595
                if (index->hypothetical)
 
4596
                        continue;
 
4597
 
 
4598
                /*
 
4599
                 * The first index column must match the desired variable and sort
 
4600
                 * operator --- but we can use a descending-order index.
 
4601
                 */
 
4602
                if (!match_index_to_operand(vardata->var, 0, index))
 
4603
                        continue;
 
4604
                switch (get_op_opfamily_strategy(sortop, index->sortopfamily[0]))
 
4605
                {
 
4606
                        case BTLessStrategyNumber:
 
4607
                                if (index->reverse_sort[0])
 
4608
                                        indexscandir = BackwardScanDirection;
 
4609
                                else
 
4610
                                        indexscandir = ForwardScanDirection;
 
4611
                                break;
 
4612
                        case BTGreaterStrategyNumber:
 
4613
                                if (index->reverse_sort[0])
 
4614
                                        indexscandir = ForwardScanDirection;
 
4615
                                else
 
4616
                                        indexscandir = BackwardScanDirection;
 
4617
                                break;
 
4618
                        default:
 
4619
                                /* index doesn't match the sortop */
 
4620
                                continue;
 
4621
                }
 
4622
 
 
4623
                /*
 
4624
                 * Found a suitable index to extract data from.  We'll need an EState
 
4625
                 * and a bunch of other infrastructure.
 
4626
                 */
 
4627
                {
 
4628
                        EState     *estate;
 
4629
                        ExprContext *econtext;
 
4630
                        MemoryContext tmpcontext;
 
4631
                        MemoryContext oldcontext;
 
4632
                        Relation        heapRel;
 
4633
                        Relation        indexRel;
 
4634
                        IndexInfo  *indexInfo;
 
4635
                        TupleTableSlot *slot;
 
4636
                        int16           typLen;
 
4637
                        bool            typByVal;
 
4638
                        ScanKeyData scankeys[1];
 
4639
                        IndexScanDesc index_scan;
 
4640
                        HeapTuple       tup;
 
4641
                        Datum           values[INDEX_MAX_KEYS];
 
4642
                        bool            isnull[INDEX_MAX_KEYS];
 
4643
 
 
4644
                        estate = CreateExecutorState();
 
4645
                        econtext = GetPerTupleExprContext(estate);
 
4646
                        /* Make sure any cruft is generated in the econtext's memory */
 
4647
                        tmpcontext = econtext->ecxt_per_tuple_memory;
 
4648
                        oldcontext = MemoryContextSwitchTo(tmpcontext);
 
4649
 
 
4650
                        /*
 
4651
                         * Open the table and index so we can read from them.  We should
 
4652
                         * already have at least AccessShareLock on the table, but not
 
4653
                         * necessarily on the index.
 
4654
                         */
 
4655
                        heapRel = heap_open(rte->relid, NoLock);
 
4656
                        indexRel = index_open(index->indexoid, AccessShareLock);
 
4657
 
 
4658
                        /* extract index key information from the index's pg_index info */
 
4659
                        indexInfo = BuildIndexInfo(indexRel);
 
4660
 
 
4661
                        /* some other stuff */
 
4662
                        slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRel));
 
4663
                        econtext->ecxt_scantuple = slot;
 
4664
                        get_typlenbyval(vardata->atttype, &typLen, &typByVal);
 
4665
 
 
4666
                        /* set up an IS NOT NULL scan key so that we ignore nulls */
 
4667
                        ScanKeyEntryInitialize(&scankeys[0],
 
4668
                                                                   SK_ISNULL | SK_SEARCHNOTNULL,
 
4669
                                                                   1,   /* index col to scan */
 
4670
                                                                   InvalidStrategy,             /* no strategy */
 
4671
                                                                   InvalidOid,  /* no strategy subtype */
 
4672
                                                                   InvalidOid,  /* no collation */
 
4673
                                                                   InvalidOid,  /* no reg proc for this */
 
4674
                                                                   (Datum) 0);  /* constant */
 
4675
 
 
4676
                        have_data = true;
 
4677
 
 
4678
                        /* If min is requested ... */
 
4679
                        if (min)
 
4680
                        {
 
4681
                                index_scan = index_beginscan(heapRel, indexRel, SnapshotNow,
 
4682
                                                                                         1, 0);
 
4683
                                index_rescan(index_scan, scankeys, 1, NULL, 0);
 
4684
 
 
4685
                                /* Fetch first tuple in sortop's direction */
 
4686
                                if ((tup = index_getnext(index_scan,
 
4687
                                                                                 indexscandir)) != NULL)
 
4688
                                {
 
4689
                                        /* Extract the index column values from the heap tuple */
 
4690
                                        ExecStoreTuple(tup, slot, InvalidBuffer, false);
 
4691
                                        FormIndexDatum(indexInfo, slot, estate,
 
4692
                                                                   values, isnull);
 
4693
 
 
4694
                                        /* Shouldn't have got a null, but be careful */
 
4695
                                        if (isnull[0])
 
4696
                                                elog(ERROR, "found unexpected null value in index \"%s\"",
 
4697
                                                         RelationGetRelationName(indexRel));
 
4698
 
 
4699
                                        /* Copy the index column value out to caller's context */
 
4700
                                        MemoryContextSwitchTo(oldcontext);
 
4701
                                        *min = datumCopy(values[0], typByVal, typLen);
 
4702
                                        MemoryContextSwitchTo(tmpcontext);
 
4703
                                }
 
4704
                                else
 
4705
                                        have_data = false;
 
4706
 
 
4707
                                index_endscan(index_scan);
 
4708
                        }
 
4709
 
 
4710
                        /* If max is requested, and we didn't find the index is empty */
 
4711
                        if (max && have_data)
 
4712
                        {
 
4713
                                index_scan = index_beginscan(heapRel, indexRel, SnapshotNow,
 
4714
                                                                                         1, 0);
 
4715
                                index_rescan(index_scan, scankeys, 1, NULL, 0);
 
4716
 
 
4717
                                /* Fetch first tuple in reverse direction */
 
4718
                                if ((tup = index_getnext(index_scan,
 
4719
                                                                                 -indexscandir)) != NULL)
 
4720
                                {
 
4721
                                        /* Extract the index column values from the heap tuple */
 
4722
                                        ExecStoreTuple(tup, slot, InvalidBuffer, false);
 
4723
                                        FormIndexDatum(indexInfo, slot, estate,
 
4724
                                                                   values, isnull);
 
4725
 
 
4726
                                        /* Shouldn't have got a null, but be careful */
 
4727
                                        if (isnull[0])
 
4728
                                                elog(ERROR, "found unexpected null value in index \"%s\"",
 
4729
                                                         RelationGetRelationName(indexRel));
 
4730
 
 
4731
                                        /* Copy the index column value out to caller's context */
 
4732
                                        MemoryContextSwitchTo(oldcontext);
 
4733
                                        *max = datumCopy(values[0], typByVal, typLen);
 
4734
                                        MemoryContextSwitchTo(tmpcontext);
 
4735
                                }
 
4736
                                else
 
4737
                                        have_data = false;
 
4738
 
 
4739
                                index_endscan(index_scan);
 
4740
                        }
 
4741
 
 
4742
                        /* Clean everything up */
 
4743
                        ExecDropSingleTupleTableSlot(slot);
 
4744
 
 
4745
                        index_close(indexRel, AccessShareLock);
 
4746
                        heap_close(heapRel, NoLock);
 
4747
 
 
4748
                        MemoryContextSwitchTo(oldcontext);
 
4749
                        FreeExecutorState(estate);
 
4750
 
 
4751
                        /* And we're done */
 
4752
                        break;
 
4753
                }
 
4754
        }
 
4755
 
 
4756
        return have_data;
 
4757
}
 
4758
 
 
4759
 
 
4760
/*-------------------------------------------------------------------------
 *
 * Pattern analysis functions
 *
 * These routines support analysis of LIKE and regular-expression patterns
 * by the planner/optimizer.  It's important that they agree with the
 * regular-expression code in backend/regex/ and the LIKE code in
 * backend/utils/adt/like.c.  Also, the computation of the fixed prefix
 * must be conservative: if we report a string longer than the true fixed
 * prefix, the query may produce actually wrong answers, rather than just
 * getting a bad selectivity estimate!
 *
 * Note that the prefix-analysis functions are called from
 * backend/optimizer/path/indxpath.c as well as from routines in this file.
 *
 *-------------------------------------------------------------------------
 */
 
4777
 
 
4778
/*
 
4779
 * Check whether char is a letter (and, hence, subject to case-folding)
 
4780
 *
 
4781
 * In multibyte character sets, we can't use isalpha, and it does not seem
 
4782
 * worth trying to convert to wchar_t to use iswalpha.  Instead, just assume
 
4783
 * any multibyte char is potentially case-varying.
 
4784
 */
 
4785
static int
 
4786
pattern_char_isalpha(char c, bool is_multibyte,
 
4787
                                         pg_locale_t locale, bool locale_is_c)
 
4788
{
 
4789
        if (locale_is_c)
 
4790
                return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
 
4791
        else if (is_multibyte && IS_HIGHBIT_SET(c))
 
4792
                return true;
 
4793
#ifdef HAVE_LOCALE_T
 
4794
        else if (locale)
 
4795
                return isalpha_l((unsigned char) c, locale);
 
4796
#endif
 
4797
        else
 
4798
                return isalpha((unsigned char) c);
 
4799
}
 
4800
 
 
4801
/*
 
4802
 * Extract the fixed prefix, if any, for a pattern.
 
4803
 *
 
4804
 * *prefix is set to a palloc'd prefix string (in the form of a Const node),
 
4805
 *      or to NULL if no fixed prefix exists for the pattern.
 
4806
 * *rest is set to a palloc'd Const representing the remainder of the pattern
 
4807
 *      after the portion describing the fixed prefix.
 
4808
 * Each of these has the same type (TEXT or BYTEA) as the given pattern Const.
 
4809
 *
 
4810
 * The return value distinguishes no fixed prefix, a partial prefix,
 
4811
 * or an exact-match-only pattern.
 
4812
 */
 
4813
 
 
4814
static Pattern_Prefix_Status
 
4815
like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
 
4816
                                  Const **prefix_const, Const **rest_const)
 
4817
{
 
4818
        char       *match;
 
4819
        char       *patt;
 
4820
        int                     pattlen;
 
4821
        char       *rest;
 
4822
        Oid                     typeid = patt_const->consttype;
 
4823
        int                     pos,
 
4824
                                match_pos;
 
4825
        bool            is_multibyte = (pg_database_encoding_max_length() > 1);
 
4826
        pg_locale_t     locale = 0;
 
4827
        bool            locale_is_c = false;
 
4828
 
 
4829
        /* the right-hand const is type text or bytea */
 
4830
        Assert(typeid == BYTEAOID || typeid == TEXTOID);
 
4831
 
 
4832
        if (case_insensitive)
 
4833
        {
 
4834
                if (typeid == BYTEAOID)
 
4835
                        ereport(ERROR,
 
4836
                                        (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 
4837
                   errmsg("case insensitive matching not supported on type bytea")));
 
4838
 
 
4839
                /* If case-insensitive, we need locale info */
 
4840
                if (lc_ctype_is_c(collation))
 
4841
                        locale_is_c = true;
 
4842
                else if (collation != DEFAULT_COLLATION_OID)
 
4843
                {
 
4844
                        if (!OidIsValid(collation))
 
4845
                        {
 
4846
                                /*
 
4847
                                 * This typically means that the parser could not resolve a
 
4848
                                 * conflict of implicit collations, so report it that way.
 
4849
                                 */
 
4850
                                ereport(ERROR,
 
4851
                                                (errcode(ERRCODE_INDETERMINATE_COLLATION),
 
4852
                                                 errmsg("could not determine which collation to use for ILIKE"),
 
4853
                                                 errhint("Use the COLLATE clause to set the collation explicitly.")));
 
4854
                        }
 
4855
                        locale = pg_newlocale_from_collation(collation);
 
4856
                }
 
4857
        }
 
4858
 
 
4859
        if (typeid != BYTEAOID)
 
4860
        {
 
4861
                patt = TextDatumGetCString(patt_const->constvalue);
 
4862
                pattlen = strlen(patt);
 
4863
        }
 
4864
        else
 
4865
        {
 
4866
                bytea      *bstr = DatumGetByteaP(patt_const->constvalue);
 
4867
 
 
4868
                pattlen = VARSIZE(bstr) - VARHDRSZ;
 
4869
                patt = (char *) palloc(pattlen);
 
4870
                memcpy(patt, VARDATA(bstr), pattlen);
 
4871
                if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
 
4872
                        pfree(bstr);
 
4873
        }
 
4874
 
 
4875
        match = palloc(pattlen + 1);
 
4876
        match_pos = 0;
 
4877
        for (pos = 0; pos < pattlen; pos++)
 
4878
        {
 
4879
                /* % and _ are wildcard characters in LIKE */
 
4880
                if (patt[pos] == '%' ||
 
4881
                        patt[pos] == '_')
 
4882
                        break;
 
4883
 
 
4884
                /* Backslash escapes the next character */
 
4885
                if (patt[pos] == '\\')
 
4886
                {
 
4887
                        pos++;
 
4888
                        if (pos >= pattlen)
 
4889
                                break;
 
4890
                }
 
4891
 
 
4892
                /* Stop if case-varying character (it's sort of a wildcard) */
 
4893
                if (case_insensitive &&
 
4894
                        pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
 
4895
                        break;
 
4896
 
 
4897
                match[match_pos++] = patt[pos];
 
4898
        }
 
4899
 
 
4900
        match[match_pos] = '\0';
 
4901
        rest = &patt[pos];
 
4902
 
 
4903
        if (typeid != BYTEAOID)
 
4904
        {
 
4905
                *prefix_const = string_to_const(match, typeid);
 
4906
                *rest_const = string_to_const(rest, typeid);
 
4907
        }
 
4908
        else
 
4909
        {
 
4910
                *prefix_const = string_to_bytea_const(match, match_pos);
 
4911
                *rest_const = string_to_bytea_const(rest, pattlen - pos);
 
4912
        }
 
4913
 
 
4914
        pfree(patt);
 
4915
        pfree(match);
 
4916
 
 
4917
        /* in LIKE, an empty pattern is an exact match! */
 
4918
        if (pos == pattlen)
 
4919
                return Pattern_Prefix_Exact;    /* reached end of pattern, so exact */
 
4920
 
 
4921
        if (match_pos > 0)
 
4922
                return Pattern_Prefix_Partial;
 
4923
 
 
4924
        return Pattern_Prefix_None;
 
4925
}
 
4926
 
 
4927
static Pattern_Prefix_Status
 
4928
regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
 
4929
                                   Const **prefix_const, Const **rest_const)
 
4930
{
 
4931
        char       *match;
 
4932
        int                     pos,
 
4933
                                match_pos,
 
4934
                                prev_pos,
 
4935
                                prev_match_pos;
 
4936
        bool            have_leading_paren;
 
4937
        char       *patt;
 
4938
        char       *rest;
 
4939
        Oid                     typeid = patt_const->consttype;
 
4940
        bool            is_multibyte = (pg_database_encoding_max_length() > 1);
 
4941
        pg_locale_t     locale = 0;
 
4942
        bool            locale_is_c = false;
 
4943
 
 
4944
        /*
 
4945
         * Should be unnecessary, there are no bytea regex operators defined. As
 
4946
         * such, it should be noted that the rest of this function has *not* been
 
4947
         * made safe for binary (possibly NULL containing) strings.
 
4948
         */
 
4949
        if (typeid == BYTEAOID)
 
4950
                ereport(ERROR,
 
4951
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 
4952
                 errmsg("regular-expression matching not supported on type bytea")));
 
4953
 
 
4954
        if (case_insensitive)
 
4955
        {
 
4956
                /* If case-insensitive, we need locale info */
 
4957
                if (lc_ctype_is_c(collation))
 
4958
                        locale_is_c = true;
 
4959
                else if (collation != DEFAULT_COLLATION_OID)
 
4960
                {
 
4961
                        if (!OidIsValid(collation))
 
4962
                        {
 
4963
                                /*
 
4964
                                 * This typically means that the parser could not resolve a
 
4965
                                 * conflict of implicit collations, so report it that way.
 
4966
                                 */
 
4967
                                ereport(ERROR,
 
4968
                                                (errcode(ERRCODE_INDETERMINATE_COLLATION),
 
4969
                                                 errmsg("could not determine which collation to use for regular expression"),
 
4970
                                                 errhint("Use the COLLATE clause to set the collation explicitly.")));
 
4971
                        }
 
4972
                        locale = pg_newlocale_from_collation(collation);
 
4973
                }
 
4974
        }
 
4975
 
 
4976
        /* the right-hand const is type text for all of these */
 
4977
        patt = TextDatumGetCString(patt_const->constvalue);
 
4978
 
 
4979
        /*
 
4980
         * Check for ARE director prefix.  It's worth our trouble to recognize
 
4981
         * this because similar_escape() used to use it, and some other code might
 
4982
         * still use it, to force ARE mode.
 
4983
         */
 
4984
        pos = 0;
 
4985
        if (strncmp(patt, "***:", 4) == 0)
 
4986
                pos = 4;
 
4987
 
 
4988
        /* Pattern must be anchored left */
 
4989
        if (patt[pos] != '^')
 
4990
        {
 
4991
                rest = patt;
 
4992
 
 
4993
                *prefix_const = NULL;
 
4994
                *rest_const = string_to_const(rest, typeid);
 
4995
 
 
4996
                return Pattern_Prefix_None;
 
4997
        }
 
4998
        pos++;
 
4999
 
 
5000
        /*
 
5001
         * If '|' is present in pattern, then there may be multiple alternatives
 
5002
         * for the start of the string.  (There are cases where this isn't so, for
 
5003
         * instance if the '|' is inside parens, but detecting that reliably is
 
5004
         * too hard.)
 
5005
         */
 
5006
        if (strchr(patt + pos, '|') != NULL)
 
5007
        {
 
5008
                rest = patt;
 
5009
 
 
5010
                *prefix_const = NULL;
 
5011
                *rest_const = string_to_const(rest, typeid);
 
5012
 
 
5013
                return Pattern_Prefix_None;
 
5014
        }
 
5015
 
 
5016
        /* OK, allocate space for pattern */
 
5017
        match = palloc(strlen(patt) + 1);
 
5018
        prev_match_pos = match_pos = 0;
 
5019
 
 
5020
        /*
 
5021
         * We special-case the syntax '^(...)$' because psql uses it.  But beware:
 
5022
         * sequences beginning "(?" are not what they seem, unless they're "(?:".
 
5023
         * (We must recognize that because of similar_escape().)
 
5024
         */
 
5025
        have_leading_paren = false;
 
5026
        if (patt[pos] == '(' &&
 
5027
                (patt[pos + 1] != '?' || patt[pos + 2] == ':'))
 
5028
        {
 
5029
                have_leading_paren = true;
 
5030
                pos += (patt[pos + 1] != '?' ? 1 : 3);
 
5031
        }
 
5032
 
 
5033
        /* Scan remainder of pattern */
 
5034
        prev_pos = pos;
 
5035
        while (patt[pos])
 
5036
        {
 
5037
                int                     len;
 
5038
 
 
5039
                /*
 
5040
                 * Check for characters that indicate multiple possible matches here.
 
5041
                 * Also, drop out at ')' or '$' so the termination test works right.
 
5042
                 */
 
5043
                if (patt[pos] == '.' ||
 
5044
                        patt[pos] == '(' ||
 
5045
                        patt[pos] == ')' ||
 
5046
                        patt[pos] == '[' ||
 
5047
                        patt[pos] == '^' ||
 
5048
                        patt[pos] == '$')
 
5049
                        break;
 
5050
 
 
5051
                /* Stop if case-varying character (it's sort of a wildcard) */
 
5052
                if (case_insensitive &&
 
5053
                        pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
 
5054
                        break;
 
5055
 
 
5056
                /*
 
5057
                 * Check for quantifiers.  Except for +, this means the preceding
 
5058
                 * character is optional, so we must remove it from the prefix too!
 
5059
                 */
 
5060
                if (patt[pos] == '*' ||
 
5061
                        patt[pos] == '?' ||
 
5062
                        patt[pos] == '{')
 
5063
                {
 
5064
                        match_pos = prev_match_pos;
 
5065
                        pos = prev_pos;
 
5066
                        break;
 
5067
                }
 
5068
                if (patt[pos] == '+')
 
5069
                {
 
5070
                        pos = prev_pos;
 
5071
                        break;
 
5072
                }
 
5073
 
 
5074
                /*
 
5075
                 * Normally, backslash quotes the next character.  But in AREs,
 
5076
                 * backslash followed by alphanumeric is an escape, not a quoted
 
5077
                 * character.  Must treat it as having multiple possible matches.
 
5078
                 * Note: since only ASCII alphanumerics are escapes, we don't have to
 
5079
                 * be paranoid about multibyte or collations here.
 
5080
                 */
 
5081
                if (patt[pos] == '\\')
 
5082
                {
 
5083
                        if (isalnum((unsigned char) patt[pos + 1]))
 
5084
                                break;
 
5085
                        pos++;
 
5086
                        if (patt[pos] == '\0')
 
5087
                                break;
 
5088
                }
 
5089
                /* save position in case we need to back up on next loop cycle */
 
5090
                prev_match_pos = match_pos;
 
5091
                prev_pos = pos;
 
5092
                /* must use encoding-aware processing here */
 
5093
                len = pg_mblen(&patt[pos]);
 
5094
                memcpy(&match[match_pos], &patt[pos], len);
 
5095
                match_pos += len;
 
5096
                pos += len;
 
5097
        }
 
5098
 
 
5099
        match[match_pos] = '\0';
 
5100
        rest = &patt[pos];
 
5101
 
 
5102
        if (have_leading_paren && patt[pos] == ')')
 
5103
                pos++;
 
5104
 
 
5105
        if (patt[pos] == '$' && patt[pos + 1] == '\0')
 
5106
        {
 
5107
                rest = &patt[pos + 1];
 
5108
 
 
5109
                *prefix_const = string_to_const(match, typeid);
 
5110
                *rest_const = string_to_const(rest, typeid);
 
5111
 
 
5112
                pfree(patt);
 
5113
                pfree(match);
 
5114
 
 
5115
                return Pattern_Prefix_Exact;    /* pattern specifies exact match */
 
5116
        }
 
5117
 
 
5118
        *prefix_const = string_to_const(match, typeid);
 
5119
        *rest_const = string_to_const(rest, typeid);
 
5120
 
 
5121
        pfree(patt);
 
5122
        pfree(match);
 
5123
 
 
5124
        if (match_pos > 0)
 
5125
                return Pattern_Prefix_Partial;
 
5126
 
 
5127
        return Pattern_Prefix_None;
 
5128
}
 
5129
 
 
5130
Pattern_Prefix_Status
 
5131
pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
 
5132
                                         Const **prefix, Const **rest)
 
5133
{
 
5134
        Pattern_Prefix_Status result;
 
5135
 
 
5136
        switch (ptype)
 
5137
        {
 
5138
                case Pattern_Type_Like:
 
5139
                        result = like_fixed_prefix(patt, false, collation, prefix, rest);
 
5140
                        break;
 
5141
                case Pattern_Type_Like_IC:
 
5142
                        result = like_fixed_prefix(patt, true, collation, prefix, rest);
 
5143
                        break;
 
5144
                case Pattern_Type_Regex:
 
5145
                        result = regex_fixed_prefix(patt, false, collation, prefix, rest);
 
5146
                        break;
 
5147
                case Pattern_Type_Regex_IC:
 
5148
                        result = regex_fixed_prefix(patt, true, collation, prefix, rest);
 
5149
                        break;
 
5150
                default:
 
5151
                        elog(ERROR, "unrecognized ptype: %d", (int) ptype);
 
5152
                        result = Pattern_Prefix_None;           /* keep compiler quiet */
 
5153
                        break;
 
5154
        }
 
5155
        return result;
 
5156
}
 
5157
 
 
5158
/*
 
5159
 * Estimate the selectivity of a fixed prefix for a pattern match.
 
5160
 *
 
5161
 * A fixed prefix "foo" is estimated as the selectivity of the expression
 
5162
 * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c).
 
5163
 *
 
5164
 * The selectivity estimate is with respect to the portion of the column
 
5165
 * population represented by the histogram --- the caller must fold this
 
5166
 * together with info about MCVs and NULLs.
 
5167
 *
 
5168
 * We use the >= and < operators from the specified btree opfamily to do the
 
5169
 * estimation.  The given variable and Const must be of the associated
 
5170
 * datatype.
 
5171
 *
 
5172
 * XXX Note: we make use of the upper bound to estimate operator selectivity
 
5173
 * even if the locale is such that we cannot rely on the upper-bound string.
 
5174
 * The selectivity only needs to be approximately right anyway, so it seems
 
5175
 * more useful to use the upper-bound code than not.
 
5176
 */
 
5177
static Selectivity
 
5178
prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
 
5179
                                   Oid vartype, Oid opfamily, Const *prefixcon)
 
5180
{
 
5181
        Selectivity prefixsel;
 
5182
        Oid                     cmpopr;
 
5183
        FmgrInfo        opproc;
 
5184
        Const      *greaterstrcon;
 
5185
        Selectivity eq_sel;
 
5186
 
 
5187
        cmpopr = get_opfamily_member(opfamily, vartype, vartype,
 
5188
                                                                 BTGreaterEqualStrategyNumber);
 
5189
        if (cmpopr == InvalidOid)
 
5190
                elog(ERROR, "no >= operator for opfamily %u", opfamily);
 
5191
        fmgr_info(get_opcode(cmpopr), &opproc);
 
5192
 
 
5193
        prefixsel = ineq_histogram_selectivity(root, vardata, &opproc, true,
 
5194
                                                                                   prefixcon->constvalue,
 
5195
                                                                                   prefixcon->consttype);
 
5196
 
 
5197
        if (prefixsel < 0.0)
 
5198
        {
 
5199
                /* No histogram is present ... return a suitable default estimate */
 
5200
                return DEFAULT_MATCH_SEL;
 
5201
        }
 
5202
 
 
5203
        /*-------
 
5204
         * If we can create a string larger than the prefix, say
 
5205
         *      "x < greaterstr".
 
5206
         *-------
 
5207
         */
 
5208
        cmpopr = get_opfamily_member(opfamily, vartype, vartype,
 
5209
                                                                 BTLessStrategyNumber);
 
5210
        if (cmpopr == InvalidOid)
 
5211
                elog(ERROR, "no < operator for opfamily %u", opfamily);
 
5212
        fmgr_info(get_opcode(cmpopr), &opproc);
 
5213
        greaterstrcon = make_greater_string(prefixcon, &opproc,
 
5214
                                                                                DEFAULT_COLLATION_OID);
 
5215
        if (greaterstrcon)
 
5216
        {
 
5217
                Selectivity topsel;
 
5218
 
 
5219
                topsel = ineq_histogram_selectivity(root, vardata, &opproc, false,
 
5220
                                                                                        greaterstrcon->constvalue,
 
5221
                                                                                        greaterstrcon->consttype);
 
5222
 
 
5223
                /* ineq_histogram_selectivity worked before, it shouldn't fail now */
 
5224
                Assert(topsel >= 0.0);
 
5225
 
 
5226
                /*
 
5227
                 * Merge the two selectivities in the same way as for a range query
 
5228
                 * (see clauselist_selectivity()).      Note that we don't need to worry
 
5229
                 * about double-exclusion of nulls, since ineq_histogram_selectivity
 
5230
                 * doesn't count those anyway.
 
5231
                 */
 
5232
                prefixsel = topsel + prefixsel - 1.0;
 
5233
        }
 
5234
 
 
5235
        /*
 
5236
         * If the prefix is long then the two bounding values might be too close
 
5237
         * together for the histogram to distinguish them usefully, resulting in a
 
5238
         * zero estimate (plus or minus roundoff error). To avoid returning a
 
5239
         * ridiculously small estimate, compute the estimated selectivity for
 
5240
         * "variable = 'foo'", and clamp to that. (Obviously, the resultant
 
5241
         * estimate should be at least that.)
 
5242
         *
 
5243
         * We apply this even if we couldn't make a greater string.  That case
 
5244
         * suggests that the prefix is near the maximum possible, and thus
 
5245
         * probably off the end of the histogram, and thus we probably got a very
 
5246
         * small estimate from the >= condition; so we still need to clamp.
 
5247
         */
 
5248
        cmpopr = get_opfamily_member(opfamily, vartype, vartype,
 
5249
                                                                 BTEqualStrategyNumber);
 
5250
        if (cmpopr == InvalidOid)
 
5251
                elog(ERROR, "no = operator for opfamily %u", opfamily);
 
5252
        eq_sel = var_eq_const(vardata, cmpopr, prefixcon->constvalue,
 
5253
                                                  false, true);
 
5254
 
 
5255
        prefixsel = Max(prefixsel, eq_sel);
 
5256
 
 
5257
        return prefixsel;
 
5258
}
 
5259
 
 
5260
 
 
5261
/*
 * Estimate the selectivity of a pattern of the specified type.
 * Note that any fixed prefix of the pattern will have been removed already.
 *
 * For now, we use a very simplistic approach: fixed characters reduce the
 * selectivity a good deal, character ranges reduce it a little,
 * wildcards (such as % for LIKE or .* for regex) increase it.
 */
 
5269
 
 
5270
/*
 * Per-element multipliers used by like_selectivity() and
 * regex_selectivity_sub()/regex_selectivity().  Values above 1 widen the
 * estimate; the callers clamp the final product to 1.0.
 */
#define FIXED_CHAR_SEL	0.20	/* about 1/5 */
#define CHAR_RANGE_SEL	0.25	/* regex [...]; a negated class uses 1 - this */
#define ANY_CHAR_SEL	0.9		/* not 1, since it won't match end-of-string */
#define FULL_WILDCARD_SEL 5.0	/* LIKE %, or a regex without a trailing $ */
#define PARTIAL_WILDCARD_SEL 2.0	/* regex quantifiers: * ? + and {...} */
 
5275
 
 
5276
static Selectivity
 
5277
like_selectivity(Const *patt_const, bool case_insensitive)
 
5278
{
 
5279
        Selectivity sel = 1.0;
 
5280
        int                     pos;
 
5281
        Oid                     typeid = patt_const->consttype;
 
5282
        char       *patt;
 
5283
        int                     pattlen;
 
5284
 
 
5285
        /* the right-hand const is type text or bytea */
 
5286
        Assert(typeid == BYTEAOID || typeid == TEXTOID);
 
5287
 
 
5288
        if (typeid == BYTEAOID && case_insensitive)
 
5289
                ereport(ERROR,
 
5290
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 
5291
                   errmsg("case insensitive matching not supported on type bytea")));
 
5292
 
 
5293
        if (typeid != BYTEAOID)
 
5294
        {
 
5295
                patt = TextDatumGetCString(patt_const->constvalue);
 
5296
                pattlen = strlen(patt);
 
5297
        }
 
5298
        else
 
5299
        {
 
5300
                bytea      *bstr = DatumGetByteaP(patt_const->constvalue);
 
5301
 
 
5302
                pattlen = VARSIZE(bstr) - VARHDRSZ;
 
5303
                patt = (char *) palloc(pattlen);
 
5304
                memcpy(patt, VARDATA(bstr), pattlen);
 
5305
                if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
 
5306
                        pfree(bstr);
 
5307
        }
 
5308
 
 
5309
        /* Skip any leading wildcard; it's already factored into initial sel */
 
5310
        for (pos = 0; pos < pattlen; pos++)
 
5311
        {
 
5312
                if (patt[pos] != '%' && patt[pos] != '_')
 
5313
                        break;
 
5314
        }
 
5315
 
 
5316
        for (; pos < pattlen; pos++)
 
5317
        {
 
5318
                /* % and _ are wildcard characters in LIKE */
 
5319
                if (patt[pos] == '%')
 
5320
                        sel *= FULL_WILDCARD_SEL;
 
5321
                else if (patt[pos] == '_')
 
5322
                        sel *= ANY_CHAR_SEL;
 
5323
                else if (patt[pos] == '\\')
 
5324
                {
 
5325
                        /* Backslash quotes the next character */
 
5326
                        pos++;
 
5327
                        if (pos >= pattlen)
 
5328
                                break;
 
5329
                        sel *= FIXED_CHAR_SEL;
 
5330
                }
 
5331
                else
 
5332
                        sel *= FIXED_CHAR_SEL;
 
5333
        }
 
5334
        /* Could get sel > 1 if multiple wildcards */
 
5335
        if (sel > 1.0)
 
5336
                sel = 1.0;
 
5337
 
 
5338
        pfree(patt);
 
5339
        return sel;
 
5340
}
 
5341
 
 
5342
static Selectivity
 
5343
regex_selectivity_sub(char *patt, int pattlen, bool case_insensitive)
 
5344
{
 
5345
        Selectivity sel = 1.0;
 
5346
        int                     paren_depth = 0;
 
5347
        int                     paren_pos = 0;  /* dummy init to keep compiler quiet */
 
5348
        int                     pos;
 
5349
 
 
5350
        for (pos = 0; pos < pattlen; pos++)
 
5351
        {
 
5352
                if (patt[pos] == '(')
 
5353
                {
 
5354
                        if (paren_depth == 0)
 
5355
                                paren_pos = pos;        /* remember start of parenthesized item */
 
5356
                        paren_depth++;
 
5357
                }
 
5358
                else if (patt[pos] == ')' && paren_depth > 0)
 
5359
                {
 
5360
                        paren_depth--;
 
5361
                        if (paren_depth == 0)
 
5362
                                sel *= regex_selectivity_sub(patt + (paren_pos + 1),
 
5363
                                                                                         pos - (paren_pos + 1),
 
5364
                                                                                         case_insensitive);
 
5365
                }
 
5366
                else if (patt[pos] == '|' && paren_depth == 0)
 
5367
                {
 
5368
                        /*
 
5369
                         * If unquoted | is present at paren level 0 in pattern, we have
 
5370
                         * multiple alternatives; sum their probabilities.
 
5371
                         */
 
5372
                        sel += regex_selectivity_sub(patt + (pos + 1),
 
5373
                                                                                 pattlen - (pos + 1),
 
5374
                                                                                 case_insensitive);
 
5375
                        break;                          /* rest of pattern is now processed */
 
5376
                }
 
5377
                else if (patt[pos] == '[')
 
5378
                {
 
5379
                        bool            negclass = false;
 
5380
 
 
5381
                        if (patt[++pos] == '^')
 
5382
                        {
 
5383
                                negclass = true;
 
5384
                                pos++;
 
5385
                        }
 
5386
                        if (patt[pos] == ']')           /* ']' at start of class is not
 
5387
                                                                                 * special */
 
5388
                                pos++;
 
5389
                        while (pos < pattlen && patt[pos] != ']')
 
5390
                                pos++;
 
5391
                        if (paren_depth == 0)
 
5392
                                sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
 
5393
                }
 
5394
                else if (patt[pos] == '.')
 
5395
                {
 
5396
                        if (paren_depth == 0)
 
5397
                                sel *= ANY_CHAR_SEL;
 
5398
                }
 
5399
                else if (patt[pos] == '*' ||
 
5400
                                 patt[pos] == '?' ||
 
5401
                                 patt[pos] == '+')
 
5402
                {
 
5403
                        /* Ought to be smarter about quantifiers... */
 
5404
                        if (paren_depth == 0)
 
5405
                                sel *= PARTIAL_WILDCARD_SEL;
 
5406
                }
 
5407
                else if (patt[pos] == '{')
 
5408
                {
 
5409
                        while (pos < pattlen && patt[pos] != '}')
 
5410
                                pos++;
 
5411
                        if (paren_depth == 0)
 
5412
                                sel *= PARTIAL_WILDCARD_SEL;
 
5413
                }
 
5414
                else if (patt[pos] == '\\')
 
5415
                {
 
5416
                        /* backslash quotes the next character */
 
5417
                        pos++;
 
5418
                        if (pos >= pattlen)
 
5419
                                break;
 
5420
                        if (paren_depth == 0)
 
5421
                                sel *= FIXED_CHAR_SEL;
 
5422
                }
 
5423
                else
 
5424
                {
 
5425
                        if (paren_depth == 0)
 
5426
                                sel *= FIXED_CHAR_SEL;
 
5427
                }
 
5428
        }
 
5429
        /* Could get sel > 1 if multiple wildcards */
 
5430
        if (sel > 1.0)
 
5431
                sel = 1.0;
 
5432
        return sel;
 
5433
}
 
5434
 
 
5435
static Selectivity
 
5436
regex_selectivity(Const *patt_const, bool case_insensitive)
 
5437
{
 
5438
        Selectivity sel;
 
5439
        char       *patt;
 
5440
        int                     pattlen;
 
5441
        Oid                     typeid = patt_const->consttype;
 
5442
 
 
5443
        /*
 
5444
         * Should be unnecessary, there are no bytea regex operators defined. As
 
5445
         * such, it should be noted that the rest of this function has *not* been
 
5446
         * made safe for binary (possibly NULL containing) strings.
 
5447
         */
 
5448
        if (typeid == BYTEAOID)
 
5449
                ereport(ERROR,
 
5450
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 
5451
                 errmsg("regular-expression matching not supported on type bytea")));
 
5452
 
 
5453
        /* the right-hand const is type text for all of these */
 
5454
        patt = TextDatumGetCString(patt_const->constvalue);
 
5455
        pattlen = strlen(patt);
 
5456
 
 
5457
        /* If patt doesn't end with $, consider it to have a trailing wildcard */
 
5458
        if (pattlen > 0 && patt[pattlen - 1] == '$' &&
 
5459
                (pattlen == 1 || patt[pattlen - 2] != '\\'))
 
5460
        {
 
5461
                /* has trailing $ */
 
5462
                sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
 
5463
        }
 
5464
        else
 
5465
        {
 
5466
                /* no trailing $ */
 
5467
                sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
 
5468
                sel *= FULL_WILDCARD_SEL;
 
5469
                if (sel > 1.0)
 
5470
                        sel = 1.0;
 
5471
        }
 
5472
        return sel;
 
5473
}
 
5474
 
 
5475
static Selectivity
 
5476
pattern_selectivity(Const *patt, Pattern_Type ptype)
 
5477
{
 
5478
        Selectivity result;
 
5479
 
 
5480
        switch (ptype)
 
5481
        {
 
5482
                case Pattern_Type_Like:
 
5483
                        result = like_selectivity(patt, false);
 
5484
                        break;
 
5485
                case Pattern_Type_Like_IC:
 
5486
                        result = like_selectivity(patt, true);
 
5487
                        break;
 
5488
                case Pattern_Type_Regex:
 
5489
                        result = regex_selectivity(patt, false);
 
5490
                        break;
 
5491
                case Pattern_Type_Regex_IC:
 
5492
                        result = regex_selectivity(patt, true);
 
5493
                        break;
 
5494
                default:
 
5495
                        elog(ERROR, "unrecognized ptype: %d", (int) ptype);
 
5496
                        result = 1.0;           /* keep compiler quiet */
 
5497
                        break;
 
5498
        }
 
5499
        return result;
 
5500
}
 
5501
 
 
5502
 
 
5503
/*
 
5504
 * Try to generate a string greater than the given string or any
 
5505
 * string it is a prefix of.  If successful, return a palloc'd string
 
5506
 * in the form of a Const node; else return NULL.
 
5507
 *
 
5508
 * The caller must provide the appropriate "less than" comparison function
 
5509
 * for testing the strings, along with the collation to use.
 
5510
 *
 
5511
 * The key requirement here is that given a prefix string, say "foo",
 
5512
 * we must be able to generate another string "fop" that is greater than
 
5513
 * all strings "foobar" starting with "foo".  We can test that we have
 
5514
 * generated a string greater than the prefix string, but in non-C collations
 
5515
 * that is not a bulletproof guarantee that an extension of the string might
 
5516
 * not sort after it; an example is that "foo " is less than "foo!", but it
 
5517
 * is not clear that a "dictionary" sort ordering will consider "foo!" less
 
5518
 * than "foo bar".      CAUTION: Therefore, this function should be used only for
 
5519
 * estimation purposes when working in a non-C collation.
 
5520
 *
 
5521
 * To try to catch most cases where an extended string might otherwise sort
 
5522
 * before the result value, we determine which of the strings "Z", "z", "y",
 
5523
 * and "9" is seen as largest by the collation, and append that to the given
 
5524
 * prefix before trying to find a string that compares as larger.
 
5525
 *
 
5526
 * If we max out the righthand byte, truncate off the last character
 
5527
 * and start incrementing the next.  For example, if "z" were the last
 
5528
 * character in the sort order, then we could produce "foo" as a
 
5529
 * string greater than "fonz".
 
5530
 *
 
5531
 * This could be rather slow in the worst case, but in most cases we
 
5532
 * won't have to try more than one or two strings before succeeding.
 
5533
 */
 
5534
Const *
 
5535
make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
 
5536
{
 
5537
        Oid                     datatype = str_const->consttype;
 
5538
        char       *workstr;
 
5539
        int                     len;
 
5540
        Datum           cmpstr;
 
5541
        text       *cmptxt = NULL;
 
5542
 
 
5543
        /*
 
5544
         * Get a modifiable copy of the prefix string in C-string format, and set
 
5545
         * up the string we will compare to as a Datum.  In C locale this can just
 
5546
         * be the given prefix string, otherwise we need to add a suffix.  Types
 
5547
         * NAME and BYTEA sort bytewise so they don't need a suffix either.
 
5548
         */
 
5549
        if (datatype == NAMEOID)
 
5550
        {
 
5551
                workstr = DatumGetCString(DirectFunctionCall1(nameout,
 
5552
                                                                                                          str_const->constvalue));
 
5553
                len = strlen(workstr);
 
5554
                cmpstr = str_const->constvalue;
 
5555
        }
 
5556
        else if (datatype == BYTEAOID)
 
5557
        {
 
5558
                bytea      *bstr = DatumGetByteaP(str_const->constvalue);
 
5559
 
 
5560
                len = VARSIZE(bstr) - VARHDRSZ;
 
5561
                workstr = (char *) palloc(len);
 
5562
                memcpy(workstr, VARDATA(bstr), len);
 
5563
                if ((Pointer) bstr != DatumGetPointer(str_const->constvalue))
 
5564
                        pfree(bstr);
 
5565
                cmpstr = str_const->constvalue;
 
5566
        }
 
5567
        else
 
5568
        {
 
5569
                workstr = TextDatumGetCString(str_const->constvalue);
 
5570
                len = strlen(workstr);
 
5571
                if (lc_collate_is_c(collation) || len == 0)
 
5572
                        cmpstr = str_const->constvalue;
 
5573
                else
 
5574
                {
 
5575
                        /* If first time through, determine the suffix to use */
 
5576
                        static char suffixchar = 0;
 
5577
                        static Oid      suffixcollation = 0;
 
5578
 
 
5579
                        if (!suffixchar || suffixcollation != collation)
 
5580
                        {
 
5581
                                char       *best;
 
5582
 
 
5583
                                best = "Z";
 
5584
                                if (varstr_cmp(best, 1, "z", 1, collation) < 0)
 
5585
                                        best = "z";
 
5586
                                if (varstr_cmp(best, 1, "y", 1, collation) < 0)
 
5587
                                        best = "y";
 
5588
                                if (varstr_cmp(best, 1, "9", 1, collation) < 0)
 
5589
                                        best = "9";
 
5590
                                suffixchar = *best;
 
5591
                                suffixcollation = collation;
 
5592
                        }
 
5593
 
 
5594
                        /* And build the string to compare to */
 
5595
                        cmptxt = (text *) palloc(VARHDRSZ + len + 1);
 
5596
                        SET_VARSIZE(cmptxt, VARHDRSZ + len + 1);
 
5597
                        memcpy(VARDATA(cmptxt), workstr, len);
 
5598
                        *(VARDATA(cmptxt) + len) = suffixchar;
 
5599
                        cmpstr = PointerGetDatum(cmptxt);
 
5600
                }
 
5601
        }
 
5602
 
 
5603
        while (len > 0)
 
5604
        {
 
5605
                unsigned char *lastchar = (unsigned char *) (workstr + len - 1);
 
5606
                unsigned char savelastchar = *lastchar;
 
5607
 
 
5608
                /*
 
5609
                 * Try to generate a larger string by incrementing the last byte.
 
5610
                 */
 
5611
                while (*lastchar < (unsigned char) 255)
 
5612
                {
 
5613
                        Const      *workstr_const;
 
5614
 
 
5615
                        (*lastchar)++;
 
5616
 
 
5617
                        if (datatype != BYTEAOID)
 
5618
                        {
 
5619
                                /* do not generate invalid encoding sequences */
 
5620
                                if (!pg_verifymbstr(workstr, len, true))
 
5621
                                        continue;
 
5622
                                workstr_const = string_to_const(workstr, datatype);
 
5623
                        }
 
5624
                        else
 
5625
                                workstr_const = string_to_bytea_const(workstr, len);
 
5626
 
 
5627
                        if (DatumGetBool(FunctionCall2Coll(ltproc,
 
5628
                                                                                           collation,
 
5629
                                                                                           cmpstr,
 
5630
                                                                                           workstr_const->constvalue)))
 
5631
                        {
 
5632
                                /* Successfully made a string larger than cmpstr */
 
5633
                                if (cmptxt)
 
5634
                                        pfree(cmptxt);
 
5635
                                pfree(workstr);
 
5636
                                return workstr_const;
 
5637
                        }
 
5638
 
 
5639
                        /* No good, release unusable value and try again */
 
5640
                        pfree(DatumGetPointer(workstr_const->constvalue));
 
5641
                        pfree(workstr_const);
 
5642
                }
 
5643
 
 
5644
                /* restore last byte so we don't confuse pg_mbcliplen */
 
5645
                *lastchar = savelastchar;
 
5646
 
 
5647
                /*
 
5648
                 * Truncate off the last character, which might be more than 1 byte,
 
5649
                 * depending on the character encoding.
 
5650
                 */
 
5651
                if (datatype != BYTEAOID && pg_database_encoding_max_length() > 1)
 
5652
                        len = pg_mbcliplen(workstr, len, len - 1);
 
5653
                else
 
5654
                        len -= 1;
 
5655
 
 
5656
                if (datatype != BYTEAOID)
 
5657
                        workstr[len] = '\0';
 
5658
        }
 
5659
 
 
5660
        /* Failed... */
 
5661
        if (cmptxt)
 
5662
                pfree(cmptxt);
 
5663
        pfree(workstr);
 
5664
 
 
5665
        return NULL;
 
5666
}
 
5667
 
 
5668
/*
 
5669
 * Generate a Datum of the appropriate type from a C string.
 
5670
 * Note that all of the supported types are pass-by-ref, so the
 
5671
 * returned value should be pfree'd if no longer needed.
 
5672
 */
 
5673
static Datum
 
5674
string_to_datum(const char *str, Oid datatype)
 
5675
{
 
5676
        Assert(str != NULL);
 
5677
 
 
5678
        /*
 
5679
         * We cheat a little by assuming that CStringGetTextDatum() will do for
 
5680
         * bpchar and varchar constants too...
 
5681
         */
 
5682
        if (datatype == NAMEOID)
 
5683
                return DirectFunctionCall1(namein, CStringGetDatum(str));
 
5684
        else if (datatype == BYTEAOID)
 
5685
                return DirectFunctionCall1(byteain, CStringGetDatum(str));
 
5686
        else
 
5687
                return CStringGetTextDatum(str);
 
5688
}
 
5689
 
 
5690
/*
 
5691
 * Generate a Const node of the appropriate type from a C string.
 
5692
 */
 
5693
static Const *
 
5694
string_to_const(const char *str, Oid datatype)
 
5695
{
 
5696
        Datum           conval = string_to_datum(str, datatype);
 
5697
        Oid                     collation;
 
5698
        int                     constlen;
 
5699
 
 
5700
        /*
 
5701
         * We only need to support a few datatypes here, so hard-wire properties
 
5702
         * instead of incurring the expense of catalog lookups.
 
5703
         */
 
5704
        switch (datatype)
 
5705
        {
 
5706
                case TEXTOID:
 
5707
                case VARCHAROID:
 
5708
                case BPCHAROID:
 
5709
                        collation = DEFAULT_COLLATION_OID;
 
5710
                        constlen = -1;
 
5711
                        break;
 
5712
 
 
5713
                case NAMEOID:
 
5714
                        collation = InvalidOid;
 
5715
                        constlen = NAMEDATALEN;
 
5716
                        break;
 
5717
 
 
5718
                case BYTEAOID:
 
5719
                        collation = InvalidOid;
 
5720
                        constlen = -1;
 
5721
                        break;
 
5722
 
 
5723
                default:
 
5724
                        elog(ERROR, "unexpected datatype in string_to_const: %u",
 
5725
                                 datatype);
 
5726
                        return NULL;
 
5727
        }
 
5728
 
 
5729
        return makeConst(datatype, -1, collation, constlen,
 
5730
                                         conval, false, false);
 
5731
}
 
5732
 
 
5733
/*
 
5734
 * Generate a Const node of bytea type from a binary C string and a length.
 
5735
 */
 
5736
static Const *
 
5737
string_to_bytea_const(const char *str, size_t str_len)
 
5738
{
 
5739
        bytea      *bstr = palloc(VARHDRSZ + str_len);
 
5740
        Datum           conval;
 
5741
 
 
5742
        memcpy(VARDATA(bstr), str, str_len);
 
5743
        SET_VARSIZE(bstr, VARHDRSZ + str_len);
 
5744
        conval = PointerGetDatum(bstr);
 
5745
 
 
5746
        return makeConst(BYTEAOID, -1, InvalidOid, -1, conval, false, false);
 
5747
}
 
5748
 
 
5749
/*-------------------------------------------------------------------------
 *
 * Index cost estimation functions
 *
 * genericcostestimate is a general-purpose estimator for use when we
 * don't have any better idea about how to estimate.  Index-type-specific
 * knowledge can be incorporated in the type-specific routines.
 *
 * One bit of index-type-specific knowledge we can relatively easily use
 * in genericcostestimate is the estimate of the number of index tuples
 * visited.  If numIndexTuples is not 0 then it is used as the estimate,
 * otherwise we compute a generic estimate.
 *
 *-------------------------------------------------------------------------
 */
 
5764
 
 
5765
static void
 
5766
genericcostestimate(PlannerInfo *root,
 
5767
                                        IndexOptInfo *index,
 
5768
                                        List *indexQuals,
 
5769
                                        List *indexOrderBys,
 
5770
                                        RelOptInfo *outer_rel,
 
5771
                                        double numIndexTuples,
 
5772
                                        Cost *indexStartupCost,
 
5773
                                        Cost *indexTotalCost,
 
5774
                                        Selectivity *indexSelectivity,
 
5775
                                        double *indexCorrelation)
 
5776
{
 
5777
        double          numIndexPages;
 
5778
        double          num_sa_scans;
 
5779
        double          num_outer_scans;
 
5780
        double          num_scans;
 
5781
        QualCost        index_qual_cost;
 
5782
        double          qual_op_cost;
 
5783
        double          qual_arg_cost;
 
5784
        double          spc_random_page_cost;
 
5785
        List       *selectivityQuals;
 
5786
        ListCell   *l;
 
5787
 
 
5788
        /*----------
 
5789
         * If the index is partial, AND the index predicate with the explicitly
 
5790
         * given indexquals to produce a more accurate idea of the index
 
5791
         * selectivity.  However, we need to be careful not to insert redundant
 
5792
         * clauses, because clauselist_selectivity() is easily fooled into
 
5793
         * computing a too-low selectivity estimate.  Our approach is to add
 
5794
         * only the index predicate clause(s) that cannot be proven to be implied
 
5795
         * by the given indexquals.  This successfully handles cases such as a
 
5796
         * qual "x = 42" used with a partial index "WHERE x >= 40 AND x < 50".
 
5797
         * There are many other cases where we won't detect redundancy, leading
 
5798
         * to a too-low selectivity estimate, which will bias the system in favor
 
5799
         * of using partial indexes where possible.  That is not necessarily bad
 
5800
         * though.
 
5801
         *
 
5802
         * Note that indexQuals contains RestrictInfo nodes while the indpred
 
5803
         * does not.  This is OK for both predicate_implied_by() and
 
5804
         * clauselist_selectivity().
 
5805
         *----------
 
5806
         */
 
5807
        if (index->indpred != NIL)
 
5808
        {
 
5809
                List       *predExtraQuals = NIL;
 
5810
 
 
5811
                foreach(l, index->indpred)
 
5812
                {
 
5813
                        Node       *predQual = (Node *) lfirst(l);
 
5814
                        List       *oneQual = list_make1(predQual);
 
5815
 
 
5816
                        if (!predicate_implied_by(oneQual, indexQuals))
 
5817
                                predExtraQuals = list_concat(predExtraQuals, oneQual);
 
5818
                }
 
5819
                /* list_concat avoids modifying the passed-in indexQuals list */
 
5820
                selectivityQuals = list_concat(predExtraQuals, indexQuals);
 
5821
        }
 
5822
        else
 
5823
                selectivityQuals = indexQuals;
 
5824
 
 
5825
        /*
 
5826
         * Check for ScalarArrayOpExpr index quals, and estimate the number of
 
5827
         * index scans that will be performed.
 
5828
         */
 
5829
        num_sa_scans = 1;
 
5830
        foreach(l, indexQuals)
 
5831
        {
 
5832
                RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
 
5833
 
 
5834
                if (IsA(rinfo->clause, ScalarArrayOpExpr))
 
5835
                {
 
5836
                        ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause;
 
5837
                        int                     alength = estimate_array_length(lsecond(saop->args));
 
5838
 
 
5839
                        if (alength > 1)
 
5840
                                num_sa_scans *= alength;
 
5841
                }
 
5842
        }
 
5843
 
 
5844
        /* Estimate the fraction of main-table tuples that will be visited */
 
5845
        *indexSelectivity = clauselist_selectivity(root, selectivityQuals,
 
5846
                                                                                           index->rel->relid,
 
5847
                                                                                           JOIN_INNER,
 
5848
                                                                                           NULL);
 
5849
 
 
5850
        /*
 
5851
         * If caller didn't give us an estimate, estimate the number of index
 
5852
         * tuples that will be visited.  We do it in this rather peculiar-looking
 
5853
         * way in order to get the right answer for partial indexes.
 
5854
         */
 
5855
        if (numIndexTuples <= 0.0)
 
5856
        {
 
5857
                numIndexTuples = *indexSelectivity * index->rel->tuples;
 
5858
 
 
5859
                /*
 
5860
                 * The above calculation counts all the tuples visited across all
 
5861
                 * scans induced by ScalarArrayOpExpr nodes.  We want to consider the
 
5862
                 * average per-indexscan number, so adjust.  This is a handy place to
 
5863
                 * round to integer, too.  (If caller supplied tuple estimate, it's
 
5864
                 * responsible for handling these considerations.)
 
5865
                 */
 
5866
                numIndexTuples = rint(numIndexTuples / num_sa_scans);
 
5867
        }
 
5868
 
 
5869
        /*
 
5870
         * We can bound the number of tuples by the index size in any case. Also,
 
5871
         * always estimate at least one tuple is touched, even when
 
5872
         * indexSelectivity estimate is tiny.
 
5873
         */
 
5874
        if (numIndexTuples > index->tuples)
 
5875
                numIndexTuples = index->tuples;
 
5876
        if (numIndexTuples < 1.0)
 
5877
                numIndexTuples = 1.0;
 
5878
 
 
5879
        /*
 
5880
         * Estimate the number of index pages that will be retrieved.
 
5881
         *
 
5882
         * We use the simplistic method of taking a pro-rata fraction of the total
 
5883
         * number of index pages.  In effect, this counts only leaf pages and not
 
5884
         * any overhead such as index metapage or upper tree levels. In practice
 
5885
         * this seems a better approximation than charging for access to the upper
 
5886
         * levels, perhaps because those tend to stay in cache under load.
 
5887
         */
 
5888
        if (index->pages > 1 && index->tuples > 1)
 
5889
                numIndexPages = ceil(numIndexTuples * index->pages / index->tuples);
 
5890
        else
 
5891
                numIndexPages = 1.0;
 
5892
 
 
5893
        /* fetch estimated page cost for schema containing index */
 
5894
        get_tablespace_page_costs(index->reltablespace,
 
5895
                                                          &spc_random_page_cost,
 
5896
                                                          NULL);
 
5897
 
 
5898
        /*
 
5899
         * Now compute the disk access costs.
 
5900
         *
 
5901
         * The above calculations are all per-index-scan.  However, if we are in a
 
5902
         * nestloop inner scan, we can expect the scan to be repeated (with
 
5903
         * different search keys) for each row of the outer relation.  Likewise,
 
5904
         * ScalarArrayOpExpr quals result in multiple index scans.      This creates
 
5905
         * the potential for cache effects to reduce the number of disk page
 
5906
         * fetches needed.      We want to estimate the average per-scan I/O cost in
 
5907
         * the presence of caching.
 
5908
         *
 
5909
         * We use the Mackert-Lohman formula (see costsize.c for details) to
 
5910
         * estimate the total number of page fetches that occur.  While this
 
5911
         * wasn't what it was designed for, it seems a reasonable model anyway.
 
5912
         * Note that we are counting pages not tuples anymore, so we take N = T =
 
5913
         * index size, as if there were one "tuple" per page.
 
5914
         */
 
5915
        if (outer_rel != NULL && outer_rel->rows > 1)
 
5916
        {
 
5917
                num_outer_scans = outer_rel->rows;
 
5918
                num_scans = num_sa_scans * num_outer_scans;
 
5919
        }
 
5920
        else
 
5921
        {
 
5922
                num_outer_scans = 1;
 
5923
                num_scans = num_sa_scans;
 
5924
        }
 
5925
 
 
5926
        if (num_scans > 1)
 
5927
        {
 
5928
                double          pages_fetched;
 
5929
 
 
5930
                /* total page fetches ignoring cache effects */
 
5931
                pages_fetched = numIndexPages * num_scans;
 
5932
 
 
5933
                /* use Mackert and Lohman formula to adjust for cache effects */
 
5934
                pages_fetched = index_pages_fetched(pages_fetched,
 
5935
                                                                                        index->pages,
 
5936
                                                                                        (double) index->pages,
 
5937
                                                                                        root);
 
5938
 
 
5939
                /*
 
5940
                 * Now compute the total disk access cost, and then report a pro-rated
 
5941
                 * share for each outer scan.  (Don't pro-rate for ScalarArrayOpExpr,
 
5942
                 * since that's internal to the indexscan.)
 
5943
                 */
 
5944
                *indexTotalCost = (pages_fetched * spc_random_page_cost)
 
5945
                        / num_outer_scans;
 
5946
        }
 
5947
        else
 
5948
        {
 
5949
                /*
 
5950
                 * For a single index scan, we just charge spc_random_page_cost per
 
5951
                 * page touched.
 
5952
                 */
 
5953
                *indexTotalCost = numIndexPages * spc_random_page_cost;
 
5954
        }
 
5955
 
 
5956
        /*
 
5957
         * A difficulty with the leaf-pages-only cost approach is that for small
 
5958
         * selectivities (eg, single index tuple fetched) all indexes will look
 
5959
         * equally attractive because we will estimate exactly 1 leaf page to be
 
5960
         * fetched.  All else being equal, we should prefer physically smaller
 
5961
         * indexes over larger ones.  (An index might be smaller because it is
 
5962
         * partial or because it contains fewer columns; presumably the other
 
5963
         * columns in the larger index aren't useful to the query, or the larger
 
5964
         * index would have better selectivity.)
 
5965
         *
 
5966
         * We can deal with this by adding a very small "fudge factor" that
 
5967
         * depends on the index size.  The fudge factor used here is one
 
5968
         * spc_random_page_cost per 100000 index pages, which should be small
 
5969
         * enough to not alter index-vs-seqscan decisions, but will prevent
 
5970
         * indexes of different sizes from looking exactly equally attractive.
 
5971
         */
 
5972
        *indexTotalCost += index->pages * spc_random_page_cost / 100000.0;
 
5973
 
 
5974
        /*
 
5975
         * CPU cost: any complex expressions in the indexquals will need to be
 
5976
         * evaluated once at the start of the scan to reduce them to runtime keys
 
5977
         * to pass to the index AM (see nodeIndexscan.c).  We model the per-tuple
 
5978
         * CPU costs as cpu_index_tuple_cost plus one cpu_operator_cost per
 
5979
         * indexqual operator.  Because we have numIndexTuples as a per-scan
 
5980
         * number, we have to multiply by num_sa_scans to get the correct result
 
5981
         * for ScalarArrayOpExpr cases.  Similarly add in costs for any index
 
5982
         * ORDER BY expressions.
 
5983
         *
 
5984
         * Note: this neglects the possible costs of rechecking lossy operators
 
5985
         * and OR-clause expressions.  Detecting that that might be needed seems
 
5986
         * more expensive than it's worth, though, considering all the other
 
5987
         * inaccuracies here ...
 
5988
         */
 
5989
        cost_qual_eval(&index_qual_cost, indexQuals, root);
 
5990
        qual_arg_cost = index_qual_cost.startup + index_qual_cost.per_tuple;
 
5991
        cost_qual_eval(&index_qual_cost, indexOrderBys, root);
 
5992
        qual_arg_cost += index_qual_cost.startup + index_qual_cost.per_tuple;
 
5993
        qual_op_cost = cpu_operator_cost *
 
5994
                (list_length(indexQuals) + list_length(indexOrderBys));
 
5995
        qual_arg_cost -= qual_op_cost;
 
5996
        if (qual_arg_cost < 0)          /* just in case... */
 
5997
                qual_arg_cost = 0;
 
5998
 
 
5999
        *indexStartupCost = qual_arg_cost;
 
6000
        *indexTotalCost += qual_arg_cost;
 
6001
        *indexTotalCost += numIndexTuples * num_sa_scans * (cpu_index_tuple_cost + qual_op_cost);
 
6002
 
 
6003
        /*
 
6004
         * We also add a CPU-cost component to represent the general costs of
 
6005
         * starting an indexscan, such as analysis of btree index keys and initial
 
6006
         * tree descent.  This is estimated at 100x cpu_operator_cost, which is a
 
6007
         * bit arbitrary but seems the right order of magnitude. (As noted above,
 
6008
         * we don't charge any I/O for touching upper tree levels, but charging
 
6009
         * nothing at all has been found too optimistic.)
 
6010
         *
 
6011
         * Although this is startup cost with respect to any one scan, we add it
 
6012
         * to the "total" cost component because it's only very interesting in the
 
6013
         * many-ScalarArrayOpExpr-scan case, and there it will be paid over the
 
6014
         * life of the scan node.
 
6015
         */
 
6016
        *indexTotalCost += num_sa_scans * 100.0 * cpu_operator_cost;
 
6017
 
 
6018
        /*
 
6019
         * Generic assumption about index correlation: there isn't any.
 
6020
         */
 
6021
        *indexCorrelation = 0.0;
 
6022
}
 
6023
 
 
6024
 
 
6025
Datum
 
6026
btcostestimate(PG_FUNCTION_ARGS)
 
6027
{
 
6028
        PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 
6029
        IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
 
6030
        List       *indexQuals = (List *) PG_GETARG_POINTER(2);
 
6031
        List       *indexOrderBys = (List *) PG_GETARG_POINTER(3);
 
6032
        RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(4);
 
6033
        Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(5);
 
6034
        Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(6);
 
6035
        Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(7);
 
6036
        double     *indexCorrelation = (double *) PG_GETARG_POINTER(8);
 
6037
        Oid                     relid;
 
6038
        AttrNumber      colnum;
 
6039
        VariableStatData vardata;
 
6040
        double          numIndexTuples;
 
6041
        List       *indexBoundQuals;
 
6042
        int                     indexcol;
 
6043
        bool            eqQualHere;
 
6044
        bool            found_saop;
 
6045
        bool            found_is_null_op;
 
6046
        double          num_sa_scans;
 
6047
        ListCell   *l;
 
6048
 
 
6049
        /*
 
6050
         * For a btree scan, only leading '=' quals plus inequality quals for the
 
6051
         * immediately next attribute contribute to index selectivity (these are
 
6052
         * the "boundary quals" that determine the starting and stopping points of
 
6053
         * the index scan).  Additional quals can suppress visits to the heap, so
 
6054
         * it's OK to count them in indexSelectivity, but they should not count
 
6055
         * for estimating numIndexTuples.  So we must examine the given indexQuals
 
6056
         * to find out which ones count as boundary quals.      We rely on the
 
6057
         * knowledge that they are given in index column order.
 
6058
         *
 
6059
         * For a RowCompareExpr, we consider only the first column, just as
 
6060
         * rowcomparesel() does.
 
6061
         *
 
6062
         * If there's a ScalarArrayOpExpr in the quals, we'll actually perform N
 
6063
         * index scans not one, but the ScalarArrayOpExpr's operator can be
 
6064
         * considered to act the same as it normally does.
 
6065
         */
 
6066
        indexBoundQuals = NIL;
 
6067
        indexcol = 0;
 
6068
        eqQualHere = false;
 
6069
        found_saop = false;
 
6070
        found_is_null_op = false;
 
6071
        num_sa_scans = 1;
 
6072
        foreach(l, indexQuals)
 
6073
        {
 
6074
                RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
 
6075
                Expr       *clause;
 
6076
                Node       *leftop,
 
6077
                                   *rightop;
 
6078
                Oid                     clause_op;
 
6079
                int                     op_strategy;
 
6080
                bool            is_null_op = false;
 
6081
 
 
6082
                Assert(IsA(rinfo, RestrictInfo));
 
6083
                clause = rinfo->clause;
 
6084
                if (IsA(clause, OpExpr))
 
6085
                {
 
6086
                        leftop = get_leftop(clause);
 
6087
                        rightop = get_rightop(clause);
 
6088
                        clause_op = ((OpExpr *) clause)->opno;
 
6089
                }
 
6090
                else if (IsA(clause, RowCompareExpr))
 
6091
                {
 
6092
                        RowCompareExpr *rc = (RowCompareExpr *) clause;
 
6093
 
 
6094
                        leftop = (Node *) linitial(rc->largs);
 
6095
                        rightop = (Node *) linitial(rc->rargs);
 
6096
                        clause_op = linitial_oid(rc->opnos);
 
6097
                }
 
6098
                else if (IsA(clause, ScalarArrayOpExpr))
 
6099
                {
 
6100
                        ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
 
6101
 
 
6102
                        leftop = (Node *) linitial(saop->args);
 
6103
                        rightop = (Node *) lsecond(saop->args);
 
6104
                        clause_op = saop->opno;
 
6105
                        found_saop = true;
 
6106
                }
 
6107
                else if (IsA(clause, NullTest))
 
6108
                {
 
6109
                        NullTest   *nt = (NullTest *) clause;
 
6110
 
 
6111
                        leftop = (Node *) nt->arg;
 
6112
                        rightop = NULL;
 
6113
                        clause_op = InvalidOid;
 
6114
                        if (nt->nulltesttype == IS_NULL)
 
6115
                        {
 
6116
                                found_is_null_op = true;
 
6117
                                is_null_op = true;
 
6118
                        }
 
6119
                }
 
6120
                else
 
6121
                {
 
6122
                        elog(ERROR, "unsupported indexqual type: %d",
 
6123
                                 (int) nodeTag(clause));
 
6124
                        continue;                       /* keep compiler quiet */
 
6125
                }
 
6126
                if (match_index_to_operand(leftop, indexcol, index))
 
6127
                {
 
6128
                        /* clause_op is correct */
 
6129
                }
 
6130
                else if (match_index_to_operand(rightop, indexcol, index))
 
6131
                {
 
6132
                        /* Must flip operator to get the opfamily member */
 
6133
                        clause_op = get_commutator(clause_op);
 
6134
                }
 
6135
                else
 
6136
                {
 
6137
                        /* Must be past the end of quals for indexcol, try next */
 
6138
                        if (!eqQualHere)
 
6139
                                break;                  /* done if no '=' qual for indexcol */
 
6140
                        indexcol++;
 
6141
                        eqQualHere = false;
 
6142
                        if (match_index_to_operand(leftop, indexcol, index))
 
6143
                        {
 
6144
                                /* clause_op is correct */
 
6145
                        }
 
6146
                        else if (match_index_to_operand(rightop, indexcol, index))
 
6147
                        {
 
6148
                                /* Must flip operator to get the opfamily member */
 
6149
                                clause_op = get_commutator(clause_op);
 
6150
                        }
 
6151
                        else
 
6152
                        {
 
6153
                                /* No quals for new indexcol, so we are done */
 
6154
                                break;
 
6155
                        }
 
6156
                }
 
6157
                /* check for equality operator */
 
6158
                if (OidIsValid(clause_op))
 
6159
                {
 
6160
                        op_strategy = get_op_opfamily_strategy(clause_op,
 
6161
                                                                                                   index->opfamily[indexcol]);
 
6162
                        Assert(op_strategy != 0);       /* not a member of opfamily?? */
 
6163
                        if (op_strategy == BTEqualStrategyNumber)
 
6164
                                eqQualHere = true;
 
6165
                }
 
6166
                else if (is_null_op)
 
6167
                {
 
6168
                        /* IS NULL is like = for purposes of selectivity determination */
 
6169
                        eqQualHere = true;
 
6170
                }
 
6171
                /* count up number of SA scans induced by indexBoundQuals only */
 
6172
                if (IsA(clause, ScalarArrayOpExpr))
 
6173
                {
 
6174
                        ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
 
6175
                        int                     alength = estimate_array_length(lsecond(saop->args));
 
6176
 
 
6177
                        if (alength > 1)
 
6178
                                num_sa_scans *= alength;
 
6179
                }
 
6180
                indexBoundQuals = lappend(indexBoundQuals, rinfo);
 
6181
        }
 
6182
 
 
6183
        /*
 
6184
         * If index is unique and we found an '=' clause for each column, we can
 
6185
         * just assume numIndexTuples = 1 and skip the expensive
 
6186
         * clauselist_selectivity calculations.  However, a ScalarArrayOp or
 
6187
         * NullTest invalidates that theory, even though it sets eqQualHere.
 
6188
         */
 
6189
        if (index->unique &&
 
6190
                indexcol == index->ncolumns - 1 &&
 
6191
                eqQualHere &&
 
6192
                !found_saop &&
 
6193
                !found_is_null_op)
 
6194
                numIndexTuples = 1.0;
 
6195
        else
 
6196
        {
 
6197
                Selectivity btreeSelectivity;
 
6198
 
 
6199
                btreeSelectivity = clauselist_selectivity(root, indexBoundQuals,
 
6200
                                                                                                  index->rel->relid,
 
6201
                                                                                                  JOIN_INNER,
 
6202
                                                                                                  NULL);
 
6203
                numIndexTuples = btreeSelectivity * index->rel->tuples;
 
6204
 
 
6205
                /*
 
6206
                 * As in genericcostestimate(), we have to adjust for any
 
6207
                 * ScalarArrayOpExpr quals included in indexBoundQuals, and then round
 
6208
                 * to integer.
 
6209
                 */
 
6210
                numIndexTuples = rint(numIndexTuples / num_sa_scans);
 
6211
        }
 
6212
 
 
6213
        genericcostestimate(root, index, indexQuals, indexOrderBys,
 
6214
                                                outer_rel, numIndexTuples,
 
6215
                                                indexStartupCost, indexTotalCost,
 
6216
                                                indexSelectivity, indexCorrelation);
 
6217
 
 
6218
        /*
 
6219
         * If we can get an estimate of the first column's ordering correlation C
 
6220
         * from pg_statistic, estimate the index correlation as C for a
 
6221
         * single-column index, or C * 0.75 for multiple columns. (The idea here
 
6222
         * is that multiple columns dilute the importance of the first column's
 
6223
         * ordering, but don't negate it entirely.  Before 8.0 we divided the
 
6224
         * correlation by the number of columns, but that seems too strong.)
 
6225
         *
 
6226
         * We can skip all this if we found a ScalarArrayOpExpr, because then the
 
6227
         * call must be for a bitmap index scan, and the caller isn't going to
 
6228
         * care what the index correlation is.
 
6229
         */
 
6230
        if (found_saop)
 
6231
                PG_RETURN_VOID();
 
6232
 
 
6233
        MemSet(&vardata, 0, sizeof(vardata));
 
6234
 
 
6235
        if (index->indexkeys[0] != 0)
 
6236
        {
 
6237
                /* Simple variable --- look to stats for the underlying table */
 
6238
                RangeTblEntry *rte = planner_rt_fetch(index->rel->relid, root);
 
6239
 
 
6240
                Assert(rte->rtekind == RTE_RELATION);
 
6241
                relid = rte->relid;
 
6242
                Assert(relid != InvalidOid);
 
6243
                colnum = index->indexkeys[0];
 
6244
 
 
6245
                if (get_relation_stats_hook &&
 
6246
                        (*get_relation_stats_hook) (root, rte, colnum, &vardata))
 
6247
                {
 
6248
                        /*
 
6249
                         * The hook took control of acquiring a stats tuple.  If it did
 
6250
                         * supply a tuple, it'd better have supplied a freefunc.
 
6251
                         */
 
6252
                        if (HeapTupleIsValid(vardata.statsTuple) &&
 
6253
                                !vardata.freefunc)
 
6254
                                elog(ERROR, "no function provided to release variable stats with");
 
6255
                }
 
6256
                else
 
6257
                {
 
6258
                        vardata.statsTuple = SearchSysCache3(STATRELATTINH,
 
6259
                                                                                                 ObjectIdGetDatum(relid),
 
6260
                                                                                                 Int16GetDatum(colnum),
 
6261
                                                                                                 BoolGetDatum(rte->inh));
 
6262
                        vardata.freefunc = ReleaseSysCache;
 
6263
                }
 
6264
        }
 
6265
        else
 
6266
        {
 
6267
                /* Expression --- maybe there are stats for the index itself */
 
6268
                relid = index->indexoid;
 
6269
                colnum = 1;
 
6270
 
 
6271
                if (get_index_stats_hook &&
 
6272
                        (*get_index_stats_hook) (root, relid, colnum, &vardata))
 
6273
                {
 
6274
                        /*
 
6275
                         * The hook took control of acquiring a stats tuple.  If it did
 
6276
                         * supply a tuple, it'd better have supplied a freefunc.
 
6277
                         */
 
6278
                        if (HeapTupleIsValid(vardata.statsTuple) &&
 
6279
                                !vardata.freefunc)
 
6280
                                elog(ERROR, "no function provided to release variable stats with");
 
6281
                }
 
6282
                else
 
6283
                {
 
6284
                        vardata.statsTuple = SearchSysCache3(STATRELATTINH,
 
6285
                                                                                                 ObjectIdGetDatum(relid),
 
6286
                                                                                                 Int16GetDatum(colnum),
 
6287
                                                                                                 BoolGetDatum(false));
 
6288
                        vardata.freefunc = ReleaseSysCache;
 
6289
                }
 
6290
        }
 
6291
 
 
6292
        if (HeapTupleIsValid(vardata.statsTuple))
 
6293
        {
 
6294
                Oid                     sortop;
 
6295
                float4     *numbers;
 
6296
                int                     nnumbers;
 
6297
 
 
6298
                sortop = get_opfamily_member(index->opfamily[0],
 
6299
                                                                         index->opcintype[0],
 
6300
                                                                         index->opcintype[0],
 
6301
                                                                         BTLessStrategyNumber);
 
6302
                if (OidIsValid(sortop) &&
 
6303
                        get_attstatsslot(vardata.statsTuple, InvalidOid, 0,
 
6304
                                                         STATISTIC_KIND_CORRELATION,
 
6305
                                                         sortop,
 
6306
                                                         NULL,
 
6307
                                                         NULL, NULL,
 
6308
                                                         &numbers, &nnumbers))
 
6309
                {
 
6310
                        double          varCorrelation;
 
6311
 
 
6312
                        Assert(nnumbers == 1);
 
6313
                        varCorrelation = numbers[0];
 
6314
 
 
6315
                        if (index->reverse_sort[0])
 
6316
                                varCorrelation = -varCorrelation;
 
6317
 
 
6318
                        if (index->ncolumns > 1)
 
6319
                                *indexCorrelation = varCorrelation * 0.75;
 
6320
                        else
 
6321
                                *indexCorrelation = varCorrelation;
 
6322
 
 
6323
                        free_attstatsslot(InvalidOid, NULL, 0, numbers, nnumbers);
 
6324
                }
 
6325
        }
 
6326
 
 
6327
        ReleaseVariableStats(vardata);
 
6328
 
 
6329
        PG_RETURN_VOID();
 
6330
}
 
6331
 
 
6332
Datum
 
6333
hashcostestimate(PG_FUNCTION_ARGS)
 
6334
{
 
6335
        PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 
6336
        IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
 
6337
        List       *indexQuals = (List *) PG_GETARG_POINTER(2);
 
6338
        List       *indexOrderBys = (List *) PG_GETARG_POINTER(3);
 
6339
        RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(4);
 
6340
        Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(5);
 
6341
        Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(6);
 
6342
        Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(7);
 
6343
        double     *indexCorrelation = (double *) PG_GETARG_POINTER(8);
 
6344
 
 
6345
        genericcostestimate(root, index, indexQuals, indexOrderBys, outer_rel, 0.0,
 
6346
                                                indexStartupCost, indexTotalCost,
 
6347
                                                indexSelectivity, indexCorrelation);
 
6348
 
 
6349
        PG_RETURN_VOID();
 
6350
}
 
6351
 
 
6352
Datum
 
6353
gistcostestimate(PG_FUNCTION_ARGS)
 
6354
{
 
6355
        PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 
6356
        IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
 
6357
        List       *indexQuals = (List *) PG_GETARG_POINTER(2);
 
6358
        List       *indexOrderBys = (List *) PG_GETARG_POINTER(3);
 
6359
        RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(4);
 
6360
        Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(5);
 
6361
        Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(6);
 
6362
        Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(7);
 
6363
        double     *indexCorrelation = (double *) PG_GETARG_POINTER(8);
 
6364
 
 
6365
        genericcostestimate(root, index, indexQuals, indexOrderBys, outer_rel, 0.0,
 
6366
                                                indexStartupCost, indexTotalCost,
 
6367
                                                indexSelectivity, indexCorrelation);
 
6368
 
 
6369
        PG_RETURN_VOID();
 
6370
}
 
6371
 
 
6372
/* Find the index column matching "op"; return its index, or -1 if no match */
 
6373
static int
 
6374
find_index_column(Node *op, IndexOptInfo *index)
 
6375
{
 
6376
        int                     i;
 
6377
 
 
6378
        for (i = 0; i < index->ncolumns; i++)
 
6379
        {
 
6380
                if (match_index_to_operand(op, i, index))
 
6381
                        return i;
 
6382
        }
 
6383
 
 
6384
        return -1;
 
6385
}
 
6386
 
 
6387
/*
 
6388
 * GIN has search behavior completely different from other index types
 
6389
 */
 
6390
Datum
 
6391
gincostestimate(PG_FUNCTION_ARGS)
 
6392
{
 
6393
        PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 
6394
        IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
 
6395
        List       *indexQuals = (List *) PG_GETARG_POINTER(2);
 
6396
        List       *indexOrderBys = (List *) PG_GETARG_POINTER(3);
 
6397
        RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(4);
 
6398
        Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(5);
 
6399
        Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(6);
 
6400
        Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(7);
 
6401
        double     *indexCorrelation = (double *) PG_GETARG_POINTER(8);
 
6402
        ListCell   *l;
 
6403
        List       *selectivityQuals;
 
6404
        double          numPages = index->pages,
 
6405
                                numTuples = index->tuples;
 
6406
        double          numEntryPages,
 
6407
                                numDataPages,
 
6408
                                numPendingPages,
 
6409
                                numEntries;
 
6410
        bool            haveFullScan = false;
 
6411
        double          partialEntriesInQuals = 0.0;
 
6412
        double          searchEntriesInQuals = 0.0;
 
6413
        double          exactEntriesInQuals = 0.0;
 
6414
        double          entryPagesFetched,
 
6415
                                dataPagesFetched,
 
6416
                                dataPagesFetchedBySel;
 
6417
        double          qual_op_cost,
 
6418
                                qual_arg_cost,
 
6419
                                spc_random_page_cost,
 
6420
                                num_scans;
 
6421
        QualCost        index_qual_cost;
 
6422
        Relation        indexRel;
 
6423
        GinStatsData ginStats;
 
6424
 
 
6425
        /*
 
6426
         * Obtain statistic information from the meta page
 
6427
         */
 
6428
        indexRel = index_open(index->indexoid, AccessShareLock);
 
6429
        ginGetStats(indexRel, &ginStats);
 
6430
        index_close(indexRel, AccessShareLock);
 
6431
 
 
6432
        numEntryPages = ginStats.nEntryPages;
 
6433
        numDataPages = ginStats.nDataPages;
 
6434
        numPendingPages = ginStats.nPendingPages;
 
6435
        numEntries = ginStats.nEntries;
 
6436
 
 
6437
        /*
 
6438
         * nPendingPages can be trusted, but the other fields are as of the last
 
6439
         * VACUUM.      Scale them by the ratio numPages / nTotalPages to account for
 
6440
         * growth since then.  If the fields are zero (implying no VACUUM at all,
 
6441
         * and an index created pre-9.1), assume all pages are entry pages.
 
6442
         */
 
6443
        if (ginStats.nTotalPages == 0 || ginStats.nEntryPages == 0)
 
6444
        {
 
6445
                numEntryPages = numPages;
 
6446
                numDataPages = 0;
 
6447
                numEntries = numTuples; /* bogus, but no other info available */
 
6448
        }
 
6449
        else
 
6450
        {
 
6451
                double          scale = numPages / ginStats.nTotalPages;
 
6452
 
 
6453
                numEntryPages = ceil(numEntryPages * scale);
 
6454
                numDataPages = ceil(numDataPages * scale);
 
6455
                numEntries = ceil(numEntries * scale);
 
6456
                /* ensure we didn't round up too much */
 
6457
                numEntryPages = Min(numEntryPages, numPages);
 
6458
                numDataPages = Min(numDataPages, numPages - numEntryPages);
 
6459
        }
 
6460
 
 
6461
        /* In an empty index, numEntries could be zero.  Avoid divide-by-zero */
 
6462
        if (numEntries < 1)
 
6463
                numEntries = 1;
 
6464
 
 
6465
        /*
 
6466
         * Include predicate in selectivityQuals (should match
 
6467
         * genericcostestimate)
 
6468
         */
 
6469
        if (index->indpred != NIL)
 
6470
        {
 
6471
                List       *predExtraQuals = NIL;
 
6472
 
 
6473
                foreach(l, index->indpred)
 
6474
                {
 
6475
                        Node       *predQual = (Node *) lfirst(l);
 
6476
                        List       *oneQual = list_make1(predQual);
 
6477
 
 
6478
                        if (!predicate_implied_by(oneQual, indexQuals))
 
6479
                                predExtraQuals = list_concat(predExtraQuals, oneQual);
 
6480
                }
 
6481
                /* list_concat avoids modifying the passed-in indexQuals list */
 
6482
                selectivityQuals = list_concat(predExtraQuals, indexQuals);
 
6483
        }
 
6484
        else
 
6485
                selectivityQuals = indexQuals;
 
6486
 
 
6487
        /* Estimate the fraction of main-table tuples that will be visited */
 
6488
        *indexSelectivity = clauselist_selectivity(root, selectivityQuals,
 
6489
                                                                                           index->rel->relid,
 
6490
                                                                                           JOIN_INNER,
 
6491
                                                                                           NULL);
 
6492
 
 
6493
        /* fetch estimated page cost for schema containing index */
 
6494
        get_tablespace_page_costs(index->reltablespace,
 
6495
                                                          &spc_random_page_cost,
 
6496
                                                          NULL);
 
6497
 
 
6498
        /*
 
6499
         * Generic assumption about index correlation: there isn't any.
 
6500
         */
 
6501
        *indexCorrelation = 0.0;
 
6502
 
 
6503
        /*
 
6504
         * Examine quals to estimate number of search entries & partial matches
 
6505
         */
 
6506
        foreach(l, indexQuals)
 
6507
        {
 
6508
                RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
 
6509
                Expr       *clause;
 
6510
                Node       *leftop,
 
6511
                                   *rightop,
 
6512
                                   *operand;
 
6513
                Oid                     extractProcOid;
 
6514
                Oid                     clause_op;
 
6515
                int                     strategy_op;
 
6516
                Oid                     lefttype,
 
6517
                                        righttype;
 
6518
                int32           nentries = 0;
 
6519
                bool       *partial_matches = NULL;
 
6520
                Pointer    *extra_data = NULL;
 
6521
                bool       *nullFlags = NULL;
 
6522
                int32           searchMode = GIN_SEARCH_MODE_DEFAULT;
 
6523
                int                     indexcol;
 
6524
 
 
6525
                Assert(IsA(rinfo, RestrictInfo));
 
6526
                clause = rinfo->clause;
 
6527
                Assert(IsA(clause, OpExpr));
 
6528
                leftop = get_leftop(clause);
 
6529
                rightop = get_rightop(clause);
 
6530
                clause_op = ((OpExpr *) clause)->opno;
 
6531
 
 
6532
                if ((indexcol = find_index_column(leftop, index)) >= 0)
 
6533
                {
 
6534
                        operand = rightop;
 
6535
                }
 
6536
                else if ((indexcol = find_index_column(rightop, index)) >= 0)
 
6537
                {
 
6538
                        operand = leftop;
 
6539
                        clause_op = get_commutator(clause_op);
 
6540
                }
 
6541
                else
 
6542
                {
 
6543
                        elog(ERROR, "could not match index to operand");
 
6544
                        operand = NULL;         /* keep compiler quiet */
 
6545
                }
 
6546
 
 
6547
                if (IsA(operand, RelabelType))
 
6548
                        operand = (Node *) ((RelabelType *) operand)->arg;
 
6549
 
 
6550
                /*
 
6551
                 * It's impossible to call extractQuery method for unknown operand. So
 
6552
                 * unless operand is a Const we can't do much; just assume there will
 
6553
                 * be one ordinary search entry from the operand at runtime.
 
6554
                 */
 
6555
                if (!IsA(operand, Const))
 
6556
                {
 
6557
                        searchEntriesInQuals++;
 
6558
                        continue;
 
6559
                }
 
6560
 
 
6561
                /* If Const is null, there can be no matches */
 
6562
                if (((Const *) operand)->constisnull)
 
6563
                {
 
6564
                        *indexStartupCost = 0;
 
6565
                        *indexTotalCost = 0;
 
6566
                        *indexSelectivity = 0;
 
6567
                        PG_RETURN_VOID();
 
6568
                }
 
6569
 
 
6570
                /*
 
6571
                 * Get the operator's strategy number and declared input data types
 
6572
                 * within the index opfamily.  (We don't need the latter, but we use
 
6573
                 * get_op_opfamily_properties because it will throw error if it fails
 
6574
                 * to find a matching pg_amop entry.)
 
6575
                 */
 
6576
                get_op_opfamily_properties(clause_op, index->opfamily[indexcol], false,
 
6577
                                                                   &strategy_op, &lefttype, &righttype);
 
6578
 
 
6579
                /*
 
6580
                 * GIN always uses the "default" support functions, which are those
 
6581
                 * with lefttype == righttype == the opclass' opcintype (see
 
6582
                 * IndexSupportInitialize in relcache.c).
 
6583
                 */
 
6584
                extractProcOid = get_opfamily_proc(index->opfamily[indexcol],
 
6585
                                                                                   index->opcintype[indexcol],
 
6586
                                                                                   index->opcintype[indexcol],
 
6587
                                                                                   GIN_EXTRACTQUERY_PROC);
 
6588
 
 
6589
                if (!OidIsValid(extractProcOid))
 
6590
                {
 
6591
                        /* should not happen; throw same error as index_getprocinfo */
 
6592
                        elog(ERROR, "missing support function %d for attribute %d of index \"%s\"",
 
6593
                                 GIN_EXTRACTQUERY_PROC, indexcol + 1,
 
6594
                                 get_rel_name(index->indexoid));
 
6595
                }
 
6596
 
 
6597
                OidFunctionCall7(extractProcOid,
 
6598
                                                 ((Const *) operand)->constvalue,
 
6599
                                                 PointerGetDatum(&nentries),
 
6600
                                                 UInt16GetDatum(strategy_op),
 
6601
                                                 PointerGetDatum(&partial_matches),
 
6602
                                                 PointerGetDatum(&extra_data),
 
6603
                                                 PointerGetDatum(&nullFlags),
 
6604
                                                 PointerGetDatum(&searchMode));
 
6605
 
 
6606
                if (nentries <= 0 && searchMode == GIN_SEARCH_MODE_DEFAULT)
 
6607
                {
 
6608
                        /* No match is possible */
 
6609
                        *indexStartupCost = 0;
 
6610
                        *indexTotalCost = 0;
 
6611
                        *indexSelectivity = 0;
 
6612
                        PG_RETURN_VOID();
 
6613
                }
 
6614
                else
 
6615
                {
 
6616
                        int32           i;
 
6617
 
 
6618
                        for (i = 0; i < nentries; i++)
 
6619
                        {
 
6620
                                /*
 
6621
                                 * For partial match we haven't any information to estimate
 
6622
                                 * number of matched entries in index, so, we just estimate it
 
6623
                                 * as 100
 
6624
                                 */
 
6625
                                if (partial_matches && partial_matches[i])
 
6626
                                        partialEntriesInQuals += 100;
 
6627
                                else
 
6628
                                        exactEntriesInQuals++;
 
6629
 
 
6630
                                searchEntriesInQuals++;
 
6631
                        }
 
6632
                }
 
6633
 
 
6634
                if (searchMode == GIN_SEARCH_MODE_INCLUDE_EMPTY)
 
6635
                {
 
6636
                        /* Treat "include empty" like an exact-match item */
 
6637
                        exactEntriesInQuals++;
 
6638
                        searchEntriesInQuals++;
 
6639
                }
 
6640
                else if (searchMode != GIN_SEARCH_MODE_DEFAULT)
 
6641
                {
 
6642
                        /* It's GIN_SEARCH_MODE_ALL */
 
6643
                        haveFullScan = true;
 
6644
                }
 
6645
        }
 
6646
 
 
6647
        if (haveFullScan || indexQuals == NIL)
 
6648
        {
 
6649
                /*
 
6650
                 * Full index scan will be required.  We treat this as if every key in
 
6651
                 * the index had been listed in the query; is that reasonable?
 
6652
                 */
 
6653
                searchEntriesInQuals = numEntries;
 
6654
        }
 
6655
 
 
6656
        /* Will we have more than one iteration of a nestloop scan? */
 
6657
        if (outer_rel != NULL && outer_rel->rows > 1)
 
6658
                num_scans = outer_rel->rows;
 
6659
        else
 
6660
                num_scans = 1;
 
6661
 
 
6662
        /*
 
6663
         * cost to begin scan, first of all, pay attention to pending list.
 
6664
         */
 
6665
        entryPagesFetched = numPendingPages;
 
6666
 
 
6667
        /*
 
6668
         * Estimate number of entry pages read.  We need to do
 
6669
         * searchEntriesInQuals searches.  Use a power function as it should be,
 
6670
         * but tuples on leaf pages usually is much greater. Here we include all
 
6671
         * searches in entry tree, including search of first entry in partial
 
6672
         * match algorithm
 
6673
         */
 
6674
        entryPagesFetched += ceil(searchEntriesInQuals * rint(pow(numEntryPages, 0.15)));
 
6675
 
 
6676
        /*
 
6677
         * Add an estimate of entry pages read by partial match algorithm. It's a
 
6678
         * scan over leaf pages in entry tree.  We haven't any useful stats here,
 
6679
         * so estimate it as proportion.
 
6680
         */
 
6681
        entryPagesFetched += ceil(numEntryPages * partialEntriesInQuals / numEntries);
 
6682
 
 
6683
        /*
 
6684
         * Partial match algorithm reads all data pages before doing actual scan,
 
6685
         * so it's a startup cost. Again, we havn't any useful stats here, so,
 
6686
         * estimate it as proportion
 
6687
         */
 
6688
        dataPagesFetched = ceil(numDataPages * partialEntriesInQuals / numEntries);
 
6689
 
 
6690
        /* calculate cache effects */
 
6691
        if (num_scans > 1 || searchEntriesInQuals > 1)
 
6692
        {
 
6693
                entryPagesFetched = index_pages_fetched(entryPagesFetched,
 
6694
                                                                                                (BlockNumber) numEntryPages,
 
6695
                                                                                                numEntryPages, root);
 
6696
                dataPagesFetched = index_pages_fetched(dataPagesFetched,
 
6697
                                                                                           (BlockNumber) numDataPages,
 
6698
                                                                                           numDataPages, root);
 
6699
        }
 
6700
 
 
6701
        /*
 
6702
         * Here we use random page cost because logically-close pages could be far
 
6703
         * apart on disk.
 
6704
         */
 
6705
        *indexStartupCost = (entryPagesFetched + dataPagesFetched) * spc_random_page_cost;
 
6706
 
 
6707
        /* cost to scan data pages for each exact (non-partial) matched entry */
 
6708
        dataPagesFetched = ceil(numDataPages * exactEntriesInQuals / numEntries);
 
6709
 
 
6710
        /*
 
6711
         * Estimate number of data pages read, using selectivity estimation and
 
6712
         * capacity of data page.
 
6713
         */
 
6714
        dataPagesFetchedBySel = ceil(*indexSelectivity *
 
6715
                                                                 (numTuples / (BLCKSZ / SizeOfIptrData)));
 
6716
 
 
6717
        if (dataPagesFetchedBySel > dataPagesFetched)
 
6718
        {
 
6719
                /*
 
6720
                 * At least one of entries is very frequent and, unfortunately, we
 
6721
                 * couldn't get statistic about entries (only tsvector has such
 
6722
                 * statistics). So, we obviously have too small estimation of pages
 
6723
                 * fetched from data tree. Re-estimate it from known capacity of data
 
6724
                 * pages
 
6725
                 */
 
6726
                dataPagesFetched = dataPagesFetchedBySel;
 
6727
        }
 
6728
 
 
6729
        if (num_scans > 1)
 
6730
                dataPagesFetched = index_pages_fetched(dataPagesFetched,
 
6731
                                                                                           (BlockNumber) numDataPages,
 
6732
                                                                                           numDataPages, root);
 
6733
        *indexTotalCost = *indexStartupCost +
 
6734
                dataPagesFetched * spc_random_page_cost;
 
6735
 
 
6736
        /*
 
6737
         * Add on index qual eval costs, much as in genericcostestimate
 
6738
         */
 
6739
        cost_qual_eval(&index_qual_cost, indexQuals, root);
 
6740
        qual_arg_cost = index_qual_cost.startup + index_qual_cost.per_tuple;
 
6741
        cost_qual_eval(&index_qual_cost, indexOrderBys, root);
 
6742
        qual_arg_cost += index_qual_cost.startup + index_qual_cost.per_tuple;
 
6743
        qual_op_cost = cpu_operator_cost *
 
6744
                (list_length(indexQuals) + list_length(indexOrderBys));
 
6745
        qual_arg_cost -= qual_op_cost;
 
6746
        if (qual_arg_cost < 0)          /* just in case... */
 
6747
                qual_arg_cost = 0;
 
6748
 
 
6749
        *indexStartupCost += qual_arg_cost;
 
6750
        *indexTotalCost += qual_arg_cost;
 
6751
        *indexTotalCost += (numTuples * *indexSelectivity) * (cpu_index_tuple_cost + qual_op_cost);
 
6752
 
 
6753
        PG_RETURN_VOID();
 
6754
}