18
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.260 2009/04/19 19:46:33 tgl Exp $
18
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.261 2009/06/11 14:49:04 momjian Exp $
20
20
*-------------------------------------------------------------------------
72
72
* float8 oprjoin (internal, oid, internal, int2, internal);
74
74
* (Before Postgres 8.4, join estimators had only the first four of these
75
* parameters. That signature is still allowed, but deprecated.) The
75
* parameters. That signature is still allowed, but deprecated.) The
76
76
* relationship between jointype and sjinfo is explained in the comments for
77
77
* clause_selectivity() --- the short version is that jointype is usually
78
78
* best ignored in favor of examining sjinfo.
135
135
static double eqjoinsel_inner(Oid operator,
136
136
VariableStatData *vardata1, VariableStatData *vardata2);
137
137
static double eqjoinsel_semi(Oid operator,
138
VariableStatData *vardata1, VariableStatData *vardata2);
138
VariableStatData *vardata1, VariableStatData *vardata2);
139
139
static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
140
140
Datum lobound, Datum hibound, Oid boundstypid,
141
141
double *scaledlobound, double *scaledhibound);
159
159
static char *convert_string_datum(Datum value, Oid typid);
160
160
static double convert_timevalue_to_scalar(Datum value, Oid typid);
161
161
static bool get_variable_range(PlannerInfo *root, VariableStatData *vardata,
162
Oid sortop, Datum *min, Datum *max);
162
Oid sortop, Datum *min, Datum *max);
163
163
static Selectivity prefix_selectivity(VariableStatData *vardata,
164
164
Oid vartype, Oid opfamily, Const *prefixcon);
165
165
static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype);
231
* If the constant is NULL, assume operator is strict and
232
* return zero, ie, operator will never return TRUE.
231
* If the constant is NULL, assume operator is strict and return zero, ie,
232
* operator will never return TRUE.
238
238
* If we matched the var to a unique index, assume there is exactly one
239
* match regardless of anything else. (This is slightly bogus, since
240
* the index's equality operator might be different from ours, but it's
241
* more likely to be right than ignoring the information.)
239
* match regardless of anything else. (This is slightly bogus, since the
240
* index's equality operator might be different from ours, but it's more
241
* likely to be right than ignoring the information.)
243
243
if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0)
244
244
return 1.0 / vardata->rel->tuples;
259
259
* Is the constant "=" to any of the column's most common values?
260
* (Although the given operator may not really be "=", we will
261
* assume that seeing whether it returns TRUE is an appropriate
262
* test. If you don't like this, maybe you shouldn't be using
263
* eqsel for your operator...)
260
* (Although the given operator may not really be "=", we will assume
261
* that seeing whether it returns TRUE is an appropriate test. If you
262
* don't like this, maybe you shouldn't be using eqsel for your
265
265
if (get_attstatsslot(vardata->statsTuple,
266
266
vardata->atttype, vardata->atttypmod,
301
301
* Constant is "=" to this common value. We know selectivity
302
* exactly (or as exactly as ANALYZE could calculate it,
302
* exactly (or as exactly as ANALYZE could calculate it, anyway).
305
304
selec = numbers[i];
310
* Comparison is against a constant that is neither NULL nor
311
* any of the common values. Its selectivity cannot be more
309
* Comparison is against a constant that is neither NULL nor any
310
* of the common values. Its selectivity cannot be more than
314
313
double sumcommon = 0.0;
315
314
double otherdistinct;
320
319
CLAMP_PROBABILITY(selec);
323
* and in fact it's probably a good deal less. We approximate
324
* that all the not-common values share this remaining
325
* fraction equally, so we divide by the number of other
322
* and in fact it's probably a good deal less. We approximate that
323
* all the not-common values share this remaining fraction
324
* equally, so we divide by the number of other distinct values.
328
326
otherdistinct = get_variable_numdistinct(vardata) - nnumbers;
329
327
if (otherdistinct > 1)
330
328
selec /= otherdistinct;
333
* Another cross-check: selectivity shouldn't be estimated as
334
* more than the least common "most common value".
331
* Another cross-check: selectivity shouldn't be estimated as more
332
* than the least common "most common value".
336
334
if (nnumbers > 0 && selec > numbers[nnumbers - 1])
337
335
selec = numbers[nnumbers - 1];
370
368
* If we matched the var to a unique index, assume there is exactly one
371
* match regardless of anything else. (This is slightly bogus, since
372
* the index's equality operator might be different from ours, but it's
373
* more likely to be right than ignoring the information.)
369
* match regardless of anything else. (This is slightly bogus, since the
370
* index's equality operator might be different from ours, but it's more
371
* likely to be right than ignoring the information.)
375
373
if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0)
376
374
return 1.0 / vardata->rel->tuples;
391
389
* result averaged over all possible values whether common or
392
390
* uncommon. (Essentially, we are assuming that the not-yet-known
393
391
* comparison value is equally likely to be any of the possible
394
* values, regardless of their frequency in the table. Is that a
392
* values, regardless of their frequency in the table. Is that a good
397
395
selec = 1.0 - stats->stanullfrac;
398
396
ndistinct = get_variable_numdistinct(vardata);
400
398
selec /= ndistinct;
403
* Cross-check: selectivity should never be estimated as more than
404
* the most common value's.
401
* Cross-check: selectivity should never be estimated as more than the
402
* most common value's.
406
404
if (get_attstatsslot(vardata->statsTuple,
407
405
vardata->atttype, vardata->atttypmod,
610
608
* essentially using the histogram just as a representative sample. However,
611
609
* small histograms are unlikely to be all that representative, so the caller
612
610
* should be prepared to fall back on some other estimation approach when the
613
* histogram is missing or very small. It may also be prudent to combine this
611
* histogram is missing or very small. It may also be prudent to combine this
614
612
* approach with another one when the histogram is small.
616
614
* If the actual histogram size is not at least min_hist_size, we won't bother
1169
1167
* selectivity of the fixed prefix and remainder of pattern
1170
1168
* separately, then combine the two to get an estimate of the
1171
1169
* selectivity for the part of the column population represented by
1172
* the histogram. (For small histograms, we combine these approaches.)
1170
* the histogram. (For small histograms, we combine these
1174
1173
* We then add up data for any most-common-values values; these are
1175
1174
* not in the histogram population, and we can get exact answers for
1205
1204
restsel = pattern_selectivity(rest, ptype);
1206
1205
heursel = prefixsel * restsel;
1208
if (selec < 0) /* fewer than 10 histogram entries? */
1207
if (selec < 0) /* fewer than 10 histogram entries? */
1209
1208
selec = heursel;
1214
1213
* histogram and heuristic selectivities, putting increasingly
1215
1214
* more trust in the histogram for larger sizes.
1217
double hist_weight = hist_size / 100.0;
1216
double hist_weight = hist_size / 100.0;
1219
1218
selec = selec * hist_weight + heursel * (1.0 - hist_weight);
1865
1864
* Decide if it's a join clause. This should match clausesel.c's
1866
* treat_as_join_clause(), except that we intentionally consider only
1867
* the leading columns and not the rest of the clause.
1865
* treat_as_join_clause(), except that we intentionally consider only the
1866
* leading columns and not the rest of the clause.
1869
1868
if (varRelid != 0)
1872
* Caller is forcing restriction mode (eg, because we are examining
1873
* an inner indexscan qual).
1871
* Caller is forcing restriction mode (eg, because we are examining an
1872
* inner indexscan qual).
1875
1874
is_join_clause = false;
1877
1876
else if (sjinfo == NULL)
1880
* It must be a restriction clause, since it's being evaluated at
1879
* It must be a restriction clause, since it's being evaluated at a
1883
1882
is_join_clause = false;
2163
2163
* end up with the same answer anyway.
2165
2165
* An additional hack we use here is to clamp the nd1 and nd2 values
2166
* to not more than what we are estimating the input relation sizes
2167
* to be, providing a crude correction for the selectivity of
2168
* restriction clauses on those relations. (We don't do that in the
2169
* other path since there we are comparing the nd values to stats for
2170
* the whole relations.)
2166
* to not more than what we are estimating the input relation sizes to
2167
* be, providing a crude correction for the selectivity of restriction
2168
* clauses on those relations. (We don't do that in the other path
2169
* since there we are comparing the nd values to stats for the whole
2172
2172
double nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
2173
2173
double nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
2309
2309
* Now we need to estimate the fraction of relation 1 that has at
2310
* least one join partner. We know for certain that the matched
2311
* MCVs do, so that gives us a lower bound, but we're really in the
2312
* dark about everything else. Our crude approach is: if nd1 <= nd2
2313
* then assume all non-null rel1 rows have join partners, else assume
2314
* for the uncertain rows that a fraction nd2/nd1 have join partners.
2315
* We can discount the known-matched MCVs from the distinct-values
2316
* counts before doing the division.
2310
* least one join partner. We know for certain that the matched MCVs
2311
* do, so that gives us a lower bound, but we're really in the dark
2312
* about everything else. Our crude approach is: if nd1 <= nd2 then
2313
* assume all non-null rel1 rows have join partners, else assume for
2314
* the uncertain rows that a fraction nd2/nd1 have join partners. We
2315
* can discount the known-matched MCVs from the distinct-values counts
2316
* before doing the division.
2318
2318
nd1 -= nmatches;
2319
2319
nd2 -= nmatches;
2321
2321
selec = Max(matchfreq1, 1.0 - nullfrac1);
2324
double uncertain = 1.0 - matchfreq1 - nullfrac1;
2324
double uncertain = 1.0 - matchfreq1 - nullfrac1;
2326
2326
CLAMP_PROBABILITY(uncertain);
2327
selec = matchfreq1 + (nd2/nd1) * uncertain;
2327
selec = matchfreq1 + (nd2 / nd1) * uncertain;
2572
2572
* Look up the various operators we need. If we don't find them all, it
2573
2573
* probably means the opfamily is broken, but we just fail silently.
2575
* Note: we expect that pg_statistic histograms will be sorted by the
2576
* '<' operator, regardless of which sort direction we are considering.
2575
* Note: we expect that pg_statistic histograms will be sorted by the '<'
2576
* operator, regardless of which sort direction we are considering.
2578
2578
switch (strategy)
2723
2723
* Only one of the two "end" fractions can really be less than 1.0;
2724
* believe the smaller estimate and reset the other one to exactly 1.0.
2725
* If we get exactly equal estimates (as can easily happen with
2726
* self-joins), believe neither.
2724
* believe the smaller estimate and reset the other one to exactly 1.0. If
2725
* we get exactly equal estimates (as can easily happen with self-joins),
2728
2728
if (*leftend > *rightend)
2729
2729
*leftend = 1.0;
2733
2733
*leftend = *rightend = 1.0;
2736
* Also, the fraction of the left variable that will be scanned before
2737
* the first join pair is found is the fraction that's < the right-side
2736
* Also, the fraction of the left variable that will be scanned before the
2737
* first join pair is found is the fraction that's < the right-side
2738
2738
* minimum value. But only believe non-default estimates, else stick with
2739
2739
* our own default.
2753
2753
* Only one of the two "start" fractions can really be more than zero;
2754
* believe the larger estimate and reset the other one to exactly 0.0.
2755
* If we get exactly equal estimates (as can easily happen with
2756
* self-joins), believe neither.
2754
* believe the larger estimate and reset the other one to exactly 0.0. If
2755
* we get exactly equal estimates (as can easily happen with self-joins),
2758
2758
if (*leftstart < *rightstart)
2759
2759
*leftstart = 0.0;
2766
2766
* If the sort order is nulls-first, we're going to have to skip over any
2767
* nulls too. These would not have been counted by scalarineqsel, and
2768
* we can safely add in this fraction regardless of whether we believe
2767
* nulls too. These would not have been counted by scalarineqsel, and we
2768
* can safely add in this fraction regardless of whether we believe
2769
2769
* scalarineqsel's results or not. But be sure to clamp the sum to 1.0!
2771
2771
if (nulls_first)
2898
2898
* is as follows:
2899
2899
* 1. Expressions yielding boolean are assumed to contribute two groups,
2900
2900
* independently of their content, and are ignored in the subsequent
2901
* steps. This is mainly because tests like "col IS NULL" break the
2901
* steps. This is mainly because tests like "col IS NULL" break the
2902
2902
* heuristic used in step 2 especially badly.
2903
2903
* 2. Reduce the given expressions to a list of unique Vars used. For
2904
2904
* example, GROUP BY a, a + b is treated the same as GROUP BY a, b.
2946
2946
Assert(groupExprs != NIL);
2949
* Count groups derived from boolean grouping expressions. For other
2949
* Count groups derived from boolean grouping expressions. For other
2950
2950
* expressions, find the unique Vars used, treating an expression as a Var
2951
2951
* if we can find stats for it. For each one, record the statistical
2952
2952
* estimate of number of distinct values (total in its table, without
3959
3961
if (vardata1->rel &&
3960
3962
bms_is_subset(vardata1->rel->relids, sjinfo->syn_righthand))
3961
*join_is_reversed = true; /* var1 is on RHS */
3963
*join_is_reversed = true; /* var1 is on RHS */
3962
3964
else if (vardata2->rel &&
3963
3965
bms_is_subset(vardata2->rel->relids, sjinfo->syn_lefthand))
3964
*join_is_reversed = true; /* var2 is on LHS */
3966
*join_is_reversed = true; /* var2 is on LHS */
3966
3968
*join_is_reversed = false;
4036
4038
(*get_relation_stats_hook) (root, rte, var->varattno, vardata))
4039
* The hook took control of acquiring a stats tuple. If it
4040
* did supply a tuple, it'd better have supplied a freefunc.
4041
* The hook took control of acquiring a stats tuple. If it did
4042
* supply a tuple, it'd better have supplied a freefunc.
4042
4044
if (HeapTupleIsValid(vardata->statsTuple) &&
4043
4045
!vardata->freefunc)
4171
4173
* Has it got stats? We only consider stats for
4172
* non-partial indexes, since partial indexes
4173
* probably don't reflect whole-relation statistics;
4174
* the above check for uniqueness is the only info
4175
* we take from a partial index.
4174
* non-partial indexes, since partial indexes probably
4175
* don't reflect whole-relation statistics; the above
4176
* check for uniqueness is the only info we take from
4177
4179
* An index stats hook, however, must make its own
4178
4180
* decisions about what to do with partial indexes.
4195
4197
vardata->statsTuple =
4196
4198
SearchSysCache(STATRELATT,
4197
ObjectIdGetDatum(index->indexoid),
4199
ObjectIdGetDatum(index->indexoid),
4198
4200
Int16GetDatum(pos + 1),
4200
4202
vardata->freefunc = ReleaseSysCache;
4283
4285
* If there is a unique index for the variable, assume it is unique no
4284
* matter what pg_statistic says; the statistics could be out of date,
4285
* or we might have found a partial unique index that proves the var
4286
* is unique for this query.
4286
* matter what pg_statistic says; the statistics could be out of date, or
4287
* we might have found a partial unique index that proves the var is
4288
* unique for this query.
4288
4290
if (vardata->isunique)
4289
4291
stadistinct = -1.0;
4871
* If the prefix is long then the two bounding values might be too
4872
* close together for the histogram to distinguish them usefully,
4873
* resulting in a zero estimate (plus or minus roundoff error).
4874
* To avoid returning a ridiculously small estimate, compute the
4875
* estimated selectivity for "variable = 'foo'", and clamp to that.
4876
* (Obviously, the resultant estimate should be at least that.)
4873
* If the prefix is long then the two bounding values might be too close
4874
* together for the histogram to distinguish them usefully, resulting in a
4875
* zero estimate (plus or minus roundoff error). To avoid returning a
4876
* ridiculously small estimate, compute the estimated selectivity for
4877
* "variable = 'foo'", and clamp to that. (Obviously, the resultant
4878
* estimate should be at least that.)
4878
4880
* We apply this even if we couldn't make a greater string. That case
4879
4881
* suggests that the prefix is near the maximum possible, and thus
4880
* probably off the end of the histogram, and thus we probably got a
4881
* very small estimate from the >= condition; so we still need to clamp.
4882
* probably off the end of the histogram, and thus we probably got a very
4883
* small estimate from the >= condition; so we still need to clamp.
4883
4885
cmpopr = get_opfamily_member(opfamily, vartype, vartype,
4884
4886
BTEqualStrategyNumber);