4
* Copyright (C) 2012 10gen Inc.
6
* This program is free software: you can redistribute it and/or modify
7
* it under the terms of the GNU Affero General Public License, version 3,
8
* as published by the Free Software Foundation.
10
* This program is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU Affero General Public License for more details.
15
* You should have received a copy of the GNU Affero General Public License
16
* along with this program. If not, see <http://www.gnu.org/licenses/>.
19
#include "mongo/pch.h"
21
#include "mongo/db/fts/fts_spec.h"
22
#include "mongo/db/fts/fts_util.h"
23
#include "mongo/util/mongoutils/str.h"
29
// Brings the mongoutils namespace into scope (presumably the source of the
// str:: helpers used later in this file -- confirm against str.h).
using namespace mongoutils;
31
// Ceiling on the accumulated score of a single term within one document;
// _scoreString verify()s that a term's score never exceeds it.
const double MAX_WEIGHT = 1000000000.0;
32
// Exclusive ceiling for a single field weight: both the FTSSpec constructor and
// fixSpec require 0 < weight < MAX_WORD_WEIGHT.
const double MAX_WORD_WEIGHT = MAX_WEIGHT / 10000;
34
// Builds an FTSSpec from the descriptor document of a text index.
//
// @param indexInfo index descriptor; this constructor reads its "weights",
//        "default_language", "language_override" and "key" fields.
//
// NOTE(review): this listing is incomplete -- the bare numbers are residue of
// the original line numbering and several short lines (loop headers, else
// branches, closing braces) are not visible. Control flow described below is
// inferred where marked; confirm against the full file.
FTSSpec::FTSSpec( const BSONObj& indexInfo ) {
35
// "weights" must be a sub-object, otherwise assertion 16739 fires.
massert( 16739, "found invalid spec for text index",
36
indexInfo["weights"].isABSONObj() );
38
_defaultLanguage = indexInfo["default_language"].valuestrsafe();
39
_languageOverrideField = indexInfo["language_override"].valuestrsafe();
41
// Empty option values fall back to the built-in defaults.
if ( _defaultLanguage.size() == 0 )
42
_defaultLanguage = "english";
43
if ( _languageOverrideField.size() == 0 )
44
_languageOverrideField = "language";
48
// in this block we fill in the _weights map
50
BSONObjIterator i( indexInfo["weights"].Obj() );
52
BSONElement e = i.next();
53
// Every entry of "weights" has to be numeric.
verify( e.isNumber() );
55
// NOTE(review): the body of the WILDCARD branch is not visible here;
// presumably it sets _wildcard -- confirm.
if ( WILDCARD == e.fieldName() ) {
59
// Named field: record its weight, which must lie in (0, MAX_WORD_WEIGHT).
double num = e.number();
60
_weights[ e.fieldName() ] = num;
61
verify( num > 0 && num < MAX_WORD_WEIGHT );
64
// The spec must cover something: the wildcard or at least one weighted field.
verify( _wildcard || _weights.size() );
69
// Key pattern: must have at least two fields. The "_fts" / "_ftsx" entries
// are recognised by name; other field names are collected into
// _extraBefore / _extraAfter.
BSONObj keyPattern = indexInfo["key"].Obj();
70
verify( keyPattern.nFields() >= 2 );
71
BSONObjIterator i( keyPattern );
73
bool passedFTS = false;
76
BSONElement e = i.next();
77
if ( str::equals( e.fieldName(), "_fts" ) ||
78
str::equals( e.fieldName(), "_ftsx" ) ) {
84
// NOTE(review): the condition choosing between the two push_back calls is
// not visible; presumably passedFTS selects _extraAfter -- confirm.
_extraAfter.push_back( e.fieldName() );
86
_extraBefore.push_back( e.fieldName() );
92
// Looks up the configured weight of `field` in _weights.
//
// @param field name of the field to look up
// @param out   destination for the weight
//
// NOTE(review): the statements following the lookup are not visible in this
// listing; presumably the function returns false when the field is absent and
// otherwise stores the weight in *out and returns true -- confirm.
bool FTSSpec::weight( const StringData& field, double* out ) const {
93
Weights::const_iterator i = _weights.find( field.toString() );
94
if ( i == _weights.end() )
100
// Chooses the language used to process `userDoc`.
//
// Reads the document's language-override field (_languageOverrideField) and
// falls back to _defaultLanguage.
//
// @param userDoc document being indexed
// @return the language name to use
string FTSSpec::getLanguageToUse( const BSONObj& userDoc ) const {
101
BSONElement e = userDoc[_languageOverrideField];
102
// Only a string-typed override is considered.
if ( e.type() == String ) {
103
const char * x = e.valuestrsafe();
104
// NOTE(review): the statement guarded by this test is not visible here;
// presumably a non-empty override is returned as the language -- confirm.
if ( strlen( x ) > 0 )
107
return _defaultLanguage;
112
* Calculates the score for all terms in a document of a collection
113
* @param obj, the document in the collection being parsed
114
* @param term_freqs, map<string,double> to fill up
116
// Scores the terms of one document into *term_freqs.
//
// Sets up the language tools (stemmer, stop words) for the document's language,
// then scores either every field recursively or only the weighted fields.
//
// NOTE(review): this listing is incomplete -- the bare numbers are residue of
// the original line numbering and several short lines (the wildcard test, loop
// headers, the first `if` of the type chain, closing braces) are not visible.
void FTSSpec::scoreDocument( const BSONObj& obj, TermFrequencyMap* term_freqs ) const {
118
string language = getLanguageToUse( obj );
120
// `stemmer` is a local; `tools` holds a pointer to it for the duration of
// this call.
Stemmer stemmer(language);
121
Tools tools(language);
122
tools.stemmer = &stemmer;
123
tools.stopwords = StopWords::getStopWords( language );
126
// if * is specified for weight, we can recurse over all fields.
127
// NOTE(review): the condition guarding this call (and the early exit after
// it) is not visible here -- confirm.
_scoreRecurse(tools, obj, term_freqs);
131
// otherwise, we need to remember the different weights for each field
132
// and act accordingly (in other words, call _scoreString)
133
for ( Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++ ) {
134
const char * leftOverName = i->first.c_str();
136
// NOTE(review): presumably getFieldDottedOrArray advances leftOverName past
// the part of the path it consumed when it stops at an array; the
// leftOverName[0] test below appears to rely on that -- confirm.
BSONElement e = obj.getFieldDottedOrArray(leftOverName);
137
// weight associated to name of field
138
double weight = i->second;
143
// Array value: score each string element; for object elements, first
// resolve the remaining path inside the element.
else if ( e.type() == Array ) {
144
BSONObjIterator j( e.Obj() );
146
BSONElement x = j.next();
147
if ( leftOverName[0] && x.isABSONObj() )
148
x = x.Obj().getFieldDotted( leftOverName );
149
if ( x.type() == String )
150
_scoreString( tools, x.valuestr(), term_freqs, weight );
153
// Plain string value: score it directly with the field's weight.
else if ( e.type() == String ) {
154
_scoreString( tools, e.valuestr(), term_freqs, weight );
162
* Recurses over all fields of an obj (document in collection)
163
* and fills term,score map term_freqs
164
* @param tools, language-specific tools (stemmer, stop words) passed on to _scoreString
165
* @param obj, object being parsed
166
* @param term_freqs, map <term,score> to be filled up
168
// Walks every element of `obj`, scoring string values with _scoreString and
// descending into embedded objects.
//
// NOTE(review): this listing is incomplete -- the bare numbers are residue of
// the original line numbering; the `obj` parameter line, the loop header, the
// declaration of `w` and the closing braces are not visible.
void FTSSpec::_scoreRecurse(const Tools& tools,
170
TermFrequencyMap* term_freqs ) const {
171
BSONObjIterator j( obj );
173
BSONElement x = j.next();
175
// NOTE(review): the statement guarded by this test is not visible;
// presumably the language-override field itself is skipped -- confirm.
if ( languageOverrideField() == x.fieldName() )
178
if (x.type() == String) {
180
// The result of weight() is ignored, so `w` keeps whatever value it was
// initialised with when the field has no configured weight.
weight( x.fieldName(), &w );
181
_scoreString(tools, x.valuestr(), term_freqs, w);
183
// Embedded objects (and presumably arrays, if isABSONObj covers them --
// confirm) are scored recursively.
else if ( x.isABSONObj() ) {
184
_scoreRecurse( tools, x.Obj(), term_freqs);
191
// Per-term accumulator used by _scoreString while scanning one string.
// The members freq, count and exp all start at 0.
// NOTE(review): the member declarations and the constructor's name line are
// not visible in this listing, so the member types cannot be stated here.
struct ScoreHelperStruct {
193
: freq(0), count(0), exp(0){
199
// Maps a term to its accumulator.
typedef unordered_map<string,ScoreHelperStruct> ScoreHelperMap;
202
// Tokenizes `raw`, accumulates per-term statistics, and adds each term's
// weighted score into *docScores.
//
// @param tools     language tools: tokenizer language, stop words, stemmer
// @param raw       the raw field text to score
// @param docScores per-document term -> score map; scores are added to it
// @param weight    weight of the field `raw` came from
//
// NOTE(review): this listing is incomplete -- the bare numbers are residue of
// the original line numbering; the tokenizer loop header, the declaration of
// `t`, the `continue` statements, the updates of data.exp / data.count /
// numTokens and the closing braces are not visible.
void FTSSpec::_scoreString( const Tools& tools,
203
const StringData& raw,
204
TermFrequencyMap* docScores,
205
double weight ) const {
207
ScoreHelperMap terms;
209
unsigned numTokens = 0;
211
Tokenizer i( tools.language, raw );
214
// Tokens that are not TEXT and stop words are tested for here; presumably
// both are skipped (the guarded statements are not visible -- confirm).
if ( t.type != Token::TEXT )
217
string term = t.data.toString();
219
if ( tools.stopwords->isStopWord( term ) )
221
// Terms are keyed by their stemmed form.
term = tools.stemmer->stem( term );
223
ScoreHelperStruct& data = terms[term];
230
// NOTE(review): exp is zero-initialised by ScoreHelperStruct; the code that
// updates it before this division is not visible -- confirm it is non-zero
// here.
data.freq += ( 1 / data.exp );
235
// Second pass: fold each term's statistics into the document-wide scores.
for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) {
237
const string& term = i->first;
238
const ScoreHelperStruct& data = i->second;
240
// in order to adjust weights as a function of term count as it
241
// relates to total field length. ie. is this the only word or
242
// a frequently occurring term? or does it only show up once in
243
// a long block of text?
245
// coeff = 0.5 + half of the term's share of all counted tokens.
double coeff = ( 0.5 * data.count / numTokens ) + 0.5;
247
// if term is identical to the raw form of the
248
// field (untokenized) give it a small boost.
249
double adjustment = 1;
250
// NOTE(review): the statement that raises `adjustment` is not visible here.
if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) )
253
// operator[] creates the entry if the term has not been scored yet.
double& score = (*docScores)[term];
254
score += ( weight * data.freq * coeff * adjustment );
255
// The accumulated score of a term must stay within MAX_WEIGHT.
verify( score <= MAX_WEIGHT );
259
// Checks `query` against the index fields that precede the text key
// (the "extra before" fields) and reports a BadValue Status when a required
// equality filter is missing.
//
// @param query the query document
// @param out   output object (its use is not visible in this listing)
// @return a Status; BadValue with "need have an equality filter on: ..." on
//         the failure paths shown below
//
// NOTE(review): this listing is incomplete -- the bare numbers are residue of
// the original line numbering; the body of the first `if`, the first failure
// condition, the remainder of both messages and the success path are not
// visible.
Status FTSSpec::getIndexPrefix( const BSONObj& query, BSONObj* out ) const {
260
// No prefix fields: nothing to check (the guarded statements are not
// visible -- confirm).
if ( numExtraBefore() == 0 ) {
266
for ( unsigned i = 0; i < numExtraBefore(); i++ ) {
267
BSONElement e = query.getFieldDotted(extraBefore(i));
269
// NOTE(review): the condition guarding this return is not visible;
// presumably it fires when the query has no value for the field -- confirm.
return Status( ErrorCodes::BadValue,
271
<< "need have an equality filter on: "
274
// An object value whose first element is recognised by getGtLtOp is
// rejected (presumably an operator expression rather than an equality
// match -- confirm).
if ( e.isABSONObj() && e.Obj().firstElement().getGtLtOp( -1 ) != -1 )
275
return Status( ErrorCodes::BadValue,
277
<< "need have an equality filter on: "
286
// Appends the two internal text-index key fields to a key pattern under
// construction: "_fts" (valued INDEX_NAME) followed by "_ftsx" (valued 1).
//
// @param b builder receiving the two fields
void _addFTSStuff( BSONObjBuilder* b ) {
287
b->append( "_fts", INDEX_NAME );
288
b->append( "_ftsx", 1 );
291
// Normalises a user-supplied text index spec: rewrites the key pattern,
// collects and validates the field weights, fills in default language
// options, and emits "v" and "textIndexVersion".
//
// @param spec the index spec as supplied
// @return the rewritten spec (the return statement is not visible here)
//
// NOTE(review): this listing is incomplete -- the bare numbers are residue of
// the original line numbering; declarations of `m`, `b`, `keyPattern`,
// `weights` and `version`, the loop headers, several branch bodies and the
// closing braces are not visible. Descriptions marked "presumably" are
// inferred.
BSONObj FTSSpec::fixSpec( const BSONObj& spec ) {
297
// --- Step 1: rebuild the key pattern ---
bool addedFtsStuff = false;
299
BSONObjIterator i( spec["key"].Obj() );
301
BSONElement e = i.next();
302
// Keys already named "_fts" / "_ftsx" are recognised by name.
if ( str::equals( e.fieldName(), "_fts" ) ||
303
str::equals( e.fieldName(), "_ftsx" ) ) {
306
// A key whose value is the string "fts" or "text" marks a text-indexed
// field.
else if ( e.type() == String &&
307
( str::equals( "fts", e.valuestr() ) ||
308
str::equals( "text", e.valuestr() ) ) ) {
310
// addedFtsStuff ensures the internal fields are added only once
// (presumably via _addFTSStuff -- the call is not visible).
if ( !addedFtsStuff ) {
312
addedFtsStuff = true;
315
// Text-indexed fields start with weight 1.
m[e.fieldName()] = 1;
322
if ( !addedFtsStuff )
325
keyPattern = b.obj();
328
// --- Step 2: merge explicit weights ---
if ( spec["weights"].isABSONObj() ) {
329
BSONObjIterator i( spec["weights"].Obj() );
331
BSONElement e = i.next();
332
// NOTE(review): weights are read with numberInt() into what the loop
// below shows to be a map<string,int>, whereas the FTSSpec constructor
// reads weights as double; a fractional weight would presumably be
// truncated here -- confirm whether that is intended.
m[e.fieldName()] = e.numberInt();
335
// A string-valued "weights" equal to WILDCARD is handled separately
// (branch body not visible).
else if ( spec["weights"].str() == WILDCARD ) {
342
// Every collected weight must lie in (0, MAX_WORD_WEIGHT); otherwise user
// assertion 16674 is raised.
for ( map<string,int>::iterator i = m.begin(); i != m.end(); ++i ) {
343
uassert( 16674, "score for word too high",
344
i->second > 0 && i->second < MAX_WORD_WEIGHT );
345
b.append( i->first, i->second );
350
// --- Step 3: language options, defaulting when absent or empty ---
string default_language(spec.getStringField("default_language"));
351
if ( default_language.empty() )
352
default_language = "english";
354
string language_override(spec.getStringField("language_override"));
355
if ( language_override.empty() )
356
language_override = "language";
359
int textIndexVersion = 1;
362
// --- Step 4: re-emit the spec field by field, substituting the
// normalised values for the fields handled above ---
BSONObjIterator i( spec );
364
BSONElement e = i.next();
365
if ( str::equals( e.fieldName(), "key" ) ) {
366
b.append( "key", keyPattern );
368
else if ( str::equals( e.fieldName(), "weights" ) ) {
369
b.append( "weights", weights );
372
else if ( str::equals( e.fieldName(), "default_language" ) ) {
373
b.append( "default_language", default_language);
374
// Cleared so the trailing "append if not empty" step does not emit it twice.
default_language = "";
376
else if ( str::equals( e.fieldName(), "language_override" ) ) {
377
b.append( "language_override", language_override);
378
// Cleared for the same reason as default_language above.
language_override = "";
380
else if ( str::equals( e.fieldName(), "v" ) ) {
381
version = e.numberInt();
383
else if ( str::equals( e.fieldName(), "textIndexVersion" ) ) {
384
textIndexVersion = e.numberInt();
386
// Only textIndexVersion 1 is accepted (the assertion call's opening line
// is not visible here).
str::stream() << "bad textIndexVersion: " << textIndexVersion,
387
textIndexVersion == 1 );
394
// --- Step 5: append whatever the input spec did not already carry ---
if ( !weights.isEmpty() )
395
b.append( "weights", weights );
396
if ( !default_language.empty() )
397
b.append( "default_language", default_language);
398
if ( !language_override.empty() )
399
b.append( "language_override", language_override);
402
b.append( "v", version );
404
b.append( "textIndexVersion", textIndexVersion );