2
* Licensed to the Apache Software Foundation (ASF) under one or more
3
* contributor license agreements. See the NOTICE file distributed with
4
* this work for additional information regarding copyright ownership.
5
* The ASF licenses this file to You under the Apache License, Version 2.0
6
* (the "License"); you may not use this file except in compliance with
7
* the License. You may obtain a copy of the License at
9
* http://www.apache.org/licenses/LICENSE-2.0
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
18
package org.apache.lucene.misc;
20
import org.apache.lucene.search.DefaultSimilarity;
21
import org.apache.lucene.index.FieldInvertState;
24
import java.util.HashMap;
27
* A similarity with a lengthNorm that provides for a "plateau" of
28
* equally good lengths, and tf helper functions.
31
* For lengthNorm, a global min/max can be specified to define the
32
* plateau of lengths that should all have a norm of 1.0.
33
* Below the min, and above the max the lengthNorm drops off in a
37
* A per field min/max can be specified if different fields have
38
* different sweet spots.
42
* For tf, baselineTf and hyperbolicTf functions are provided, which
43
* subclasses can choose between.
47
public class SweetSpotSimilarity extends DefaultSimilarity {
49
private int ln_min = 1;
50
private int ln_max = 1;
51
private float ln_steep = 0.5f;
53
private Map<String,Number> ln_maxs = new HashMap<String,Number>(7);
54
private Map<String,Number> ln_mins = new HashMap<String,Number>(7);
55
private Map<String,Float> ln_steeps = new HashMap<String,Float>(7);
56
private Map<String,Boolean> ln_overlaps = new HashMap<String,Boolean>(7);
58
private float tf_base = 0.0f;
59
private float tf_min = 0.0f;
61
private float tf_hyper_min = 0.0f;
62
private float tf_hyper_max = 2.0f;
63
private double tf_hyper_base = 1.3d;
64
private float tf_hyper_xoffset = 10.0f;
66
public SweetSpotSimilarity() {
71
* Sets the baseline and minimum function variables for baselineTf
75
public void setBaselineTfFactors(float base, float min) {
81
* Sets the function variables for the hyperbolicTf functions
83
* @param min the minimum tf value to ever be returned (default: 0.0)
84
* @param max the maximum tf value to ever be returned (default: 2.0)
85
* @param base the base value to be used in the exponential for the hyperbolic function (default: 1.3)
86
* @param xoffset the midpoint of the hyperbolic function (default: 10.0)
89
public void setHyperbolicTfFactors(float min, float max,
90
double base, float xoffset) {
94
tf_hyper_xoffset = xoffset;
98
* Sets the default function variables used by lengthNorm when no field
99
* specific variables have been set.
103
public void setLengthNormFactors(int min, int max, float steepness) {
106
this.ln_steep = steepness;
110
* Sets the function variables used by lengthNorm for a specific named field.
112
* @param field field name
113
* @param min minimum value
114
* @param max maximum value
115
* @param steepness steepness of the curve
116
* @param discountOverlaps if true, <code>numOverlapTokens</code> will be
117
* subtracted from <code>numTokens</code>; if false then
118
* <code>numOverlapTokens</code> will be assumed to be 0 (see
119
* {@link DefaultSimilarity#computeNorm(String, FieldInvertState)} for details).
123
public void setLengthNormFactors(String field, int min, int max,
124
float steepness, boolean discountOverlaps) {
125
ln_mins.put(field, Integer.valueOf(min));
126
ln_maxs.put(field, Integer.valueOf(max));
127
ln_steeps.put(field, Float.valueOf(steepness));
128
ln_overlaps.put(field, new Boolean(discountOverlaps));
132
* Implemented as <code> state.getBoost() *
133
* computeLengthNorm(fieldName, numTokens) </code> where
134
* numTokens does not count overlap tokens if
135
* discountOverlaps is true by default or true for this
138
public float computeNorm(String fieldName, FieldInvertState state) {
140
boolean overlaps = discountOverlaps;
141
if (ln_overlaps.containsKey(fieldName)) {
142
overlaps = ln_overlaps.get(fieldName).booleanValue();
145
numTokens = state.getLength() - state.getNumOverlap();
147
numTokens = state.getLength();
149
return state.getBoost() * computeLengthNorm(fieldName, numTokens);
155
* 1/sqrt( steepness * (abs(x-min) + abs(x-max) - (max-min)) + 1 )
159
* This degrades to <code>1/sqrt(x)</code> when min and max are both 1 and
164
* :TODO: potential optimization is to just flat out return 1.0f if numTerms
165
* is between min and max.
168
* @see #setLengthNormFactors
170
public float computeLengthNorm(String fieldName, int numTerms) {
175
if (ln_mins.containsKey(fieldName)) {
176
l = ln_mins.get(fieldName).intValue();
178
if (ln_maxs.containsKey(fieldName)) {
179
h = ln_maxs.get(fieldName).intValue();
181
if (ln_steeps.containsKey(fieldName)) {
182
s = ln_steeps.get(fieldName).floatValue();
191
(float)(Math.abs(numTerms - l) + Math.abs(numTerms - h) - (h-l))
199
* Delegates to baselineTf
204
public float tf(int freq) {
205
return baselineTf(freq);
211
* (x <= min) ? base : sqrt(x+(base**2)-min)
213
* ...but with a special case check for 0.
215
* This degrades to <code>sqrt(x)</code> when min and base are both 0
218
* @see #setBaselineTfFactors
220
public float baselineTf(float freq) {
222
if (0.0f == freq) return 0.0f;
224
return (freq <= tf_min)
226
: (float)Math.sqrt(freq + (tf_base * tf_base) - tf_min);
230
* Uses a hyperbolic tangent function that allows for a hard max...
233
* tf(x)=min+(max-min)/2*(((base**(x-xoffset)-base**-(x-xoffset))/(base**(x-xoffset)+base**-(x-xoffset)))+1)
237
* This code is provided as a convenience for subclasses that want
238
* to use a hyperbolic tf function.
241
* @see #setHyperbolicTfFactors
243
public float hyperbolicTf(float freq) {
244
if (0.0f == freq) return 0.0f;
246
final float min = tf_hyper_min;
247
final float max = tf_hyper_max;
248
final double base = tf_hyper_base;
249
final float xoffset = tf_hyper_xoffset;
250
final double x = (double)(freq - xoffset);
252
final float result = min +
257
( ( Math.pow(base,x) - Math.pow(base,-x) )
258
/ ( Math.pow(base,x) + Math.pow(base,-x) )
264
return Float.isNaN(result) ? max : result;