1
package org.apache.lucene.facet.search.sampling;
3
import java.io.IOException;
5
import org.apache.lucene.index.IndexReader;
7
import org.apache.lucene.facet.search.FacetArrays;
8
import org.apache.lucene.facet.search.ScoredDocIDs;
9
import org.apache.lucene.facet.search.aggregator.Aggregator;
10
import org.apache.lucene.facet.search.params.FacetRequest;
11
import org.apache.lucene.facet.search.params.FacetSearchParams;
12
import org.apache.lucene.facet.search.results.FacetResult;
13
import org.apache.lucene.facet.search.results.FacetResultNode;
14
import org.apache.lucene.facet.search.results.MutableFacetResultNode;
15
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Sampling definition for facets accumulation
 * <p>
 * The Sampler uses TAKMI style counting to provide a 'best guess' top-K result
 * set of the facets accumulated.
 * <p>
 * Note: Sampling accumulation (Accumulation over a sampled-set of the results),
 * does not guarantee accurate values for
 * {@link FacetResult#getNumValidDescendants()} &amp;
 * {@link FacetResultNode#getResidue()}.
 *
 * @lucene.experimental
 */
public abstract class Sampler {
49
  /** The sampling params in effect; validated by the constructor. */
  protected final SamplingParams samplingParams;
52
* Construct with {@link SamplingParams}
55
this(new SamplingParams());
59
* Construct with certain {@link SamplingParams}
60
* @param params sampling params in effect
61
* @throws IllegalArgumentException if the provided SamplingParams are not valid
63
public Sampler(SamplingParams params) throws IllegalArgumentException {
64
if (!params.validate()) {
65
throw new IllegalArgumentException("The provided SamplingParams are not valid!!");
67
this.samplingParams = params;
71
* Check if this sampler would complement for the input docIds
73
public boolean shouldSample(ScoredDocIDs docIds) {
74
return docIds.size() > samplingParams.getSamplingThreshold();
78
* Compute a sample set out of the input set, based on the {@link SamplingParams#getSampleRatio()}
79
* in effect. Sub classes can override to alter how the sample set is
82
* If the input set is of size smaller than {@link SamplingParams#getMinSampleSize()},
83
* the input set is returned (no sampling takes place).
85
* Other than that, the returned set size will not be larger than {@link SamplingParams#getMaxSampleSize()}
86
* nor smaller than {@link SamplingParams#getMinSampleSize()}.
88
* full set of matching documents out of which a sample is needed.
90
public SampleResult getSampleSet(ScoredDocIDs docids) throws IOException {
91
if (!shouldSample(docids)) {
92
return new SampleResult(docids, 1d);
95
int actualSize = docids.size();
96
int sampleSetSize = (int) (actualSize * samplingParams.getSampleRatio());
97
sampleSetSize = Math.max(sampleSetSize, samplingParams.getMinSampleSize());
98
sampleSetSize = Math.min(sampleSetSize, samplingParams.getMaxSampleSize());
100
return createSample(docids, actualSize, sampleSetSize);
104
  /**
   * Create and return a sample of the input set.
   *
   * @param docids input set out of which a sample is to be created
   * @param actualSize original size of set, prior to sampling
   * @param sampleSetSize required size of sample set
   * @return sample of the input set in the required size
   */
  protected abstract SampleResult createSample(ScoredDocIDs docids, int actualSize,
      int sampleSetSize) throws IOException;
114
* Get a fixer of sample facet accumulation results. Default implementation
115
* returns a <code>TakmiSampleFixer</code> which is adequate only for
116
* counting. For any other accumulator, provide a different fixer.
118
public SampleFixer getSampleFixer(
119
IndexReader indexReader, TaxonomyReader taxonomyReader,
120
FacetSearchParams searchParams) {
121
return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams);
125
* Result of sample computation
127
public final static class SampleResult {
128
public final ScoredDocIDs docids;
129
public final double actualSampleRatio;
130
protected SampleResult(ScoredDocIDs docids, double actualSampleRatio) {
131
this.docids = docids;
132
this.actualSampleRatio = actualSampleRatio;
137
* Return the sampling params in effect
139
public final SamplingParams getSamplingParams() {
140
return samplingParams;
144
* Trim the input facet result.<br>
145
* Note: It is only valid to call this method with result obtained for a
146
* facet request created through {@link #overSampledSearchParams(FacetSearchParams)}.
148
* @throws IllegalArgumentException
149
* if called with results not obtained for requests created
150
* through {@link #overSampledSearchParams(FacetSearchParams)}
152
public FacetResult trimResult(FacetResult facetResult) throws IllegalArgumentException {
153
double overSampleFactor = getSamplingParams().getOversampleFactor();
154
if (overSampleFactor <= 1) { // no factoring done?
158
OverSampledFacetRequest sampledFreq = null;
161
sampledFreq = (OverSampledFacetRequest)facetResult.getFacetRequest();
162
} catch (ClassCastException e) {
163
throw new IllegalArgumentException(
164
"It is only valid to call this method with result obtained for a" +
165
"facet request created through sampler.overSamlpingSearchParams()",
169
FacetRequest origFrq = sampledFreq.orig;
171
MutableFacetResultNode trimmedRootNode = MutableFacetResultNode.toImpl(facetResult.getFacetResultNode());
172
trimmedRootNode.trimSubResults(origFrq.getNumResults());
174
return new FacetResult(origFrq, trimmedRootNode, facetResult.getNumValidDescendants());
178
* Over-sampled search params, wrapping each request with an over-sampled one.
180
public FacetSearchParams overSampledSearchParams(FacetSearchParams original) {
181
FacetSearchParams res = original;
182
// So now we can sample -> altering the searchParams to accommodate for the statistical error for the sampling
183
double overSampleFactor = getSamplingParams().getOversampleFactor();
184
if (overSampleFactor > 1) { // any factoring to do?
185
res = new FacetSearchParams(original.getFacetIndexingParams());
186
for (FacetRequest frq: original.getFacetRequests()) {
187
int overSampledNumResults = (int) Math.ceil(frq.getNumResults() * overSampleFactor);
188
res.addFacetRequest(new OverSampledFacetRequest(frq, overSampledNumResults));
195
* Wrapping a facet request for over sampling.
196
* Implementation detail: even if the original request is a count request, no
197
* statistics will be computed for it as the wrapping is not a count request.
198
* This is ok, as the sampling accumulator is later computing the statistics
199
* over the original requests.
201
private static class OverSampledFacetRequest extends FacetRequest {
202
final FacetRequest orig;
203
public OverSampledFacetRequest(FacetRequest orig, int num) {
204
super(orig.getCategoryPath(), num);
209
public Aggregator createAggregator(boolean useComplements,
210
FacetArrays arrays, IndexReader indexReader,
211
TaxonomyReader taxonomy) throws IOException {
212
return orig.createAggregator(useComplements, arrays, indexReader,
217
public double getValueOf(FacetArrays arrays, int idx) {
218
return orig.getValueOf(arrays, idx);
222
public boolean requireDocumentScore() {
223
return orig.requireDocumentScore();