1
package org.apache.lucene.facet.search.sampling;
3
import java.io.IOException;
4
import java.util.Arrays;
5
import java.util.logging.Level;
6
import java.util.logging.Logger;
8
import org.apache.lucene.util.PriorityQueue;
10
import org.apache.lucene.facet.search.ScoredDocIDs;
11
import org.apache.lucene.facet.search.ScoredDocIDsIterator;
12
import org.apache.lucene.facet.util.ScoredDocIdsUtils;
15
* Licensed to the Apache Software Foundation (ASF) under one or more
16
* contributor license agreements. See the NOTICE file distributed with
17
* this work for additional information regarding copyright ownership.
18
* The ASF licenses this file to You under the Apache License, Version 2.0
19
* (the "License"); you may not use this file except in compliance with
20
* the License. You may obtain a copy of the License at
22
* http://www.apache.org/licenses/LICENSE-2.0
24
* Unless required by applicable law or agreed to in writing, software
25
* distributed under the License is distributed on an "AS IS" BASIS,
26
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
27
* See the License for the specific language governing permissions and
28
* limitations under the License.
32
* Take random samples of large collections.
33
* @lucene.experimental
35
public class RepeatableSampler extends Sampler {
37
private static final Logger logger = Logger.getLogger(RepeatableSampler.class.getName());
39
public RepeatableSampler(SamplingParams params) {
44
protected SampleResult createSample(ScoredDocIDs docids, int actualSize,
45
int sampleSetSize) throws IOException {
46
int[] sampleSet = null;
48
sampleSet = repeatableSample(docids, actualSize,
50
} catch (IOException e) {
51
if (logger.isLoggable(Level.WARNING)) {
52
logger.log(Level.WARNING, "sampling failed: "+e.getMessage()+" - falling back to no sampling!", e);
54
return new SampleResult(docids, 1d);
57
ScoredDocIDs sampled = ScoredDocIdsUtils.createScoredDocIDsSubset(docids,
59
if (logger.isLoggable(Level.FINEST)) {
60
logger.finest("******************** " + sampled.size());
62
return new SampleResult(sampled, sampled.size()/(double)docids.size());
66
* Returns <code>sampleSize</code> values from the first <code>collectionSize</code>
67
* locations of <code>collection</code>, chosen using
68
* the <code>TRAVERSAL</code> algorithm. The sample values are not sorted.
69
* @param collection The values from which a sample is wanted.
70
* @param collectionSize The number of values (from the first) from which to draw the sample.
71
* @param sampleSize The number of values to return.
72
* @return An array of values chosen from the collection.
73
* @see Algorithm#TRAVERSAL
75
private static int[] repeatableSample(ScoredDocIDs collection,
76
int collectionSize, int sampleSize)
78
return repeatableSample(collection, collectionSize,
79
sampleSize, Algorithm.HASHING, Sorted.NO);
83
* Returns <code>sampleSize</code> values from the first <code>collectionSize</code>
84
* locations of <code>collection</code>, chosen using <code>algorithm</code>.
85
* @param collection The values from which a sample is wanted.
86
* @param collectionSize The number of values (from the first) from which to draw the sample.
87
* @param sampleSize The number of values to return.
88
* @param algorithm Which algorithm to use.
89
* @param sorted Sorted.YES to sort the sample values in ascending order before returning;
90
* Sorted.NO to return them in essentially random order.
91
* @return An array of values chosen from the collection.
93
private static int[] repeatableSample(ScoredDocIDs collection,
94
int collectionSize, int sampleSize,
95
Algorithm algorithm, Sorted sorted)
97
if (collection == null) {
98
throw new IOException("docIdSet is null");
100
if (sampleSize < 1) {
101
throw new IOException("sampleSize < 1 (" + sampleSize + ")");
103
if (collectionSize < sampleSize) {
104
throw new IOException("collectionSize (" + collectionSize + ") less than sampleSize (" + sampleSize + ")");
106
int[] sample = new int[sampleSize];
107
long[] times = new long[4];
108
if (algorithm == Algorithm.TRAVERSAL) {
109
sample1(collection, collectionSize, sample, times);
110
} else if (algorithm == Algorithm.HASHING) {
111
sample2(collection, collectionSize, sample, times);
113
throw new IllegalArgumentException("Invalid algorithm selection");
115
if (sorted == Sorted.YES) {
119
times[3] = System.currentTimeMillis();
120
if (logger.isLoggable(Level.FINEST)) {
121
logger.finest("Times: " + (times[1] - times[0]) + "ms, "
122
+ (times[2] - times[1]) + "ms, " + (times[3] - times[2])+"ms");
129
* Returns <code>sample</code>.length values chosen from the first <code>collectionSize</code>
130
* locations of <code>collection</code>, using the TRAVERSAL algorithm. The sample is
131
* pseudorandom: no subset of the original collection
132
* is in principle more likely to occur than any other, but for a given collection
133
* and sample size, the same sample will always be returned. This algorithm walks the
134
* original collection in a methodical way that is guaranteed not to visit any location
135
* more than once, which makes sampling without replacement faster because removals don't
136
* have to be tracked, and the number of operations is proportional to the sample size,
137
* not the collection size.
138
* Times for performance measurement
139
* are returned in <code>times</code>, which must be an array of at least three longs, containing
140
* nanosecond event times. The first
141
* is set when the algorithm starts; the second, when the step size has been calculated;
142
* and the third when the sample has been taken.
143
* @param collection The set to be sampled.
144
* @param collectionSize The number of values to use (starting from first).
145
* @param sample The array in which to return the sample.
146
* @param times The times of three events, for measuring performance.
148
private static void sample1(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times)
150
ScoredDocIDsIterator it = collection.iterator();
152
times[0] = System.currentTimeMillis();
154
int sampleSize = sample.length;
155
int prime = findGoodStepSize(collectionSize, sampleSize);
156
int mod = prime % collectionSize;
158
times[1] = System.currentTimeMillis();
162
for (; sampleCount < sampleSize;) {
163
if (index + mod < collectionSize) {
164
for (int i = 0; i < mod; i++, index++) {
168
index = index + mod - collectionSize;
169
it = collection.iterator();
170
for (int i = 0; i < index; i++) {
174
sample[sampleCount++] = it.getDocID();
177
times[2] = System.currentTimeMillis();
182
* Returns a value which will allow the caller to walk
183
* a collection of <code>collectionSize</code> values, without repeating or missing
184
* any, and spanning the collection from beginning to end at least once with
185
* <code>sampleSize</code> visited locations. Choosing a value
186
* that is relatively prime to the collection size ensures that stepping by that size (modulo
187
* the collection size) will hit all locations without repeating, eliminating the need to
188
* track previously visited locations for a "without replacement" sample. Starting with the
189
* square root of the collection size ensures that either the first or second prime tried will
190
* work (they can't both divide the collection size). It also has the property that N steps of
191
* size N will span a collection of N**2 elements once. If the sample is bigger than N, it will
192
* wrap multiple times (without repeating). If the sample is smaller, a step size is chosen
193
* that will result in at least one spanning of the collection.
195
* @param collectionSize The number of values in the collection to be sampled.
196
* @param sampleSize The number of values wanted in the sample.
197
* @return A good increment value for walking the collection.
199
private static int findGoodStepSize(int collectionSize, int sampleSize) {
200
int i = (int) Math.sqrt(collectionSize);
201
if (sampleSize < i) {
202
i = collectionSize / sampleSize;
205
i = findNextPrimeAfter(i);
206
} while (collectionSize % i == 0);
211
* Returns the first prime number that is larger than <code>n</code>.
212
* @param n A number less than the prime to be returned.
213
* @return The smallest prime larger than <code>n</code>.
215
private static int findNextPrimeAfter(int n) {
216
n += (n % 2 == 0) ? 1 : 2; // next odd
217
foundFactor: for (;; n += 2) { //TODO labels??!!
218
int sri = (int) (Math.sqrt(n));
219
for (int primeIndex = 0; primeIndex < N_PRIMES; primeIndex++) {
220
int p = primes[primeIndex];
225
continue foundFactor;
228
for (int p = primes[N_PRIMES - 1] + 2;; p += 2) {
233
continue foundFactor;
240
* The first N_PRIMES primes, after 2.
242
private static final int N_PRIMES = 4000;
243
private static int[] primes = new int[N_PRIMES];
246
for (int count = 1; count < N_PRIMES; count++) {
247
primes[count] = findNextPrimeAfter(primes[count - 1]);
252
* Returns <code>sample</code>.length values chosen from the first <code>collectionSize</code>
253
* locations of <code>collection</code>, using the HASHING algorithm. Performance measurements
254
* are returned in <code>times</code>, which must be an array of at least three longs. The first
255
* will be set when the algorithm starts; the second, when a hash key has been calculated and
256
* inserted into the priority queue for every element in the collection; and the third when the
257
* original elements associated with the keys remaining in the PQ have been stored in the sample
260
* This algorithm slows as the sample size becomes a significant fraction of the collection
261
* size, because the PQ is as large as the sample set, and will not do early rejection of values
262
* below the minimum until it fills up, and a larger PQ contains more small values to be purged,
263
* resulting in less early rejection and more logN insertions.
265
* @param collection The set to be sampled.
266
* @param collectionSize The number of values to use (starting from first).
267
* @param sample The array in which to return the sample.
268
* @param times The times of three events, for measuring performance.
270
private static void sample2(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times)
273
times[0] = System.currentTimeMillis();
275
int sampleSize = sample.length;
276
IntPriorityQueue pq = new IntPriorityQueue(sampleSize);
278
* Convert every value in the collection to a hashed "weight" value, and insert
279
* into a bounded PQ (retains only sampleSize highest weights).
281
ScoredDocIDsIterator it = collection.iterator();
283
pq.insertWithReuse((int)(it.getDocID() * PHI_32) & 0x7FFFFFFF);
286
times[1] = System.currentTimeMillis();
289
* Extract heap, convert weights back to original values, and return as integers.
291
Object[] heap = pq.getHeap();
292
for (int si = 0; si < sampleSize; si++) {
293
sample[si] = (int)(((IntPriorityQueue.MI)(heap[si+1])).value * PHI_32I) & 0x7FFFFFFF;
296
times[2] = System.currentTimeMillis();
301
* A bounded priority queue for Integers, to retain a specified number of
302
* the highest-weighted values for return as a random sample.
304
private static class IntPriorityQueue extends PriorityQueue<Object> {
307
* Creates a bounded PQ of size <code>size</code>.
308
* @param size The number of elements to retain.
310
public IntPriorityQueue(int size) {
315
* Inserts an integer with overflow and object reuse.
317
public void insertWithReuse(int intval) {
318
if (this.mi == null) {
321
this.mi.value = intval;
322
this.mi = (MI)this.insertWithOverflow(this.mi);
326
* Returns the underlying data structure for faster access. Extracting elements
327
* one at a time would require N logN time, and since we want the elements sorted
328
* in ascending order by value (not weight), the array is useful as-is.
329
* @return The underlying heap array.
331
public Object[] getHeap() {
332
return getHeapArray();
336
* Returns true if <code>o1<code>'s weight is less than that of <code>o2</code>, for
337
* ordering in the PQ.
338
* @return True if <code>o1</code> weighs less than <code>o2</code>.
341
public boolean lessThan(Object o1, Object o2) {
342
return ((MI)o1).value < ((MI)o2).value;
346
* A mutable integer that lets queue objects be reused once they start overflowing.
348
private static class MI {
354
* The mutable integer instance for reuse after first overflow.
361
* For specifying which sampling algorithm to use.
363
private enum Algorithm {
366
* Specifies a methodical traversal algorithm, which is guaranteed to span the collection
367
* at least once, and never to return duplicates. Faster than the hashing algorithm and
368
* uses much less space, but the randomness of the sample may be affected by systematic
369
* variations in the collection. Requires only an array for the sample, and visits only
370
* the number of elements in the sample set, not the full set.
372
// TODO (Facet): This one produces a bimodal distribution (very flat around
373
// each peak!) for collection size 10M and sample sizes 10k and 10544.
378
* Specifies a Fibonacci-style hash algorithm (see Knuth, S&S), which generates a less
379
* systematically distributed subset of the sampled collection than the traversal method,
380
* but requires a bounded priority queue the size of the sample, and creates an object
381
* containing a sampled value and its hash, for every element in the full set.
387
* For specifying whether to sort the sample.
389
private enum Sorted {
392
* Sort resulting sample before returning.
397
*Do not sort the resulting sample.
403
* Magic number 1: prime closest to phi, in 32 bits.
405
private static final long PHI_32 = 2654435769L;
408
* Magic number 2: multiplicative inverse of PHI_32, modulo 2**32.
410
private static final long PHI_32I = 340573321L;
413
* Switch to cause methods to return timings.
415
private static boolean returnTimings = false;