/*
 * Copyright (C) 2012 Canonical, Ltd.
 *
 * Authors:
 *    Jussi Pakkanen <jussi.pakkanen@canonical.com>
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of version 3 of the GNU Lesser General Public License as published
 * by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "Matcher.hh"
#include "Corpus.hh"
#include "LevenshteinIndex.hh"
#include "Word.hh"
#include "Document.hh"
#include "WordList.hh"
#include "IndexMatches.hh"
#include "MatchResults.hh"
#include "ErrorValues.hh"
#include "ColumbusHelpers.hh"
#include "IndexWeights.hh"
#include "MatcherStatistics.hh"
#include "WordStore.hh"
#include "ResultFilter.hh"
#include "SearchParameters.hh"
#include <cassert>
#include <stdexcept>
#include <map>
#include <vector>

#ifdef HAS_SPARSE_HASH
#include <google/sparse_hash_map>
#include <google/sparse_hash_set>
using google::sparse_hash_map;
using google::sparse_hash_set;
#define hashmap sparse_hash_map
#define hashset sparse_hash_set
#else
#include <unordered_map>
#include <unordered_set>
#define hashmap unordered_map
#define hashset unordered_set
#endif

COL_NAMESPACE_START
using namespace std;

struct idhasher {
    size_t operator() (const pair<WordID, WordID> &p) const {
        size_t w1 = p.first;
        size_t w2 = p.second;
        if(sizeof(size_t) >= 2*sizeof(WordID)) {
            return (w1 << (8*sizeof(WordID))) | w2; // Lossless combination, e.g. 64 bit size_t with 32 bit WordIDs.
        } else {
            return w1 ^ w2;
        }
    }
};
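
/*
 * A minimal alternative sketch: a boost::hash_combine style mixer makes no
 * assumptions about the relative widths of size_t and WordID. Shown for
 * illustration only; it is not used by the matcher.
 */
#if 0
struct idhasherCombine {
    size_t operator() (const pair<WordID, WordID> &p) const {
        size_t seed = std::hash<WordID>()(p.first);
        // The constant and shifts follow the well-known hash_combine recipe.
        seed ^= std::hash<WordID>()(p.second) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
        return seed;
    }
};
#endif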

typedef hashset<DocumentID> DocumentSet;
typedef hashmap<WordID, LevenshteinIndex*> IndexMap;
typedef hashmap<std::pair<WordID, WordID>, DocumentSet, idhasher> ReverseIndexData; // Key is (index ID, word ID), value is the documents containing that word in that index.

typedef IndexMap::iterator IndIterator;
typedef ReverseIndexData::iterator RevIndIterator;

typedef map<WordID, int> MatchErrorMap;

typedef map<WordID, MatchErrorMap> BestIndexMatches;

typedef BestIndexMatches::iterator MatchIndIterator;
typedef MatchErrorMap::iterator MatchIterator;


class ReverseIndex {
private:
    ReverseIndexData reverseIndex;
public:

    void add(const WordID wordID, const WordID indexID, const DocumentID id);
    bool documentHasTerm(const WordID wordID, const WordID indexID, DocumentID id);
    void findDocuments(const WordID wordID, const WordID indexID, std::vector<DocumentID> &result);
};

struct MatcherPrivate {
    IndexMap indexes;
    ReverseIndex reverseIndex;
    ErrorValues e;
    IndexWeights weights;
    MatcherStatistics stats;
    WordStore store;
};

void ReverseIndex::add(const WordID wordID, const WordID indexID, const DocumentID id) {
    pair<WordID, WordID> p(indexID, wordID);
    // operator[] default-constructs an empty document set on first use.
    reverseIndex[p].insert(id);
}

bool ReverseIndex::documentHasTerm(const WordID wordID, const WordID indexID, DocumentID id) {
    pair<WordID, WordID> p(indexID, wordID);
    auto revIt = reverseIndex.find(p);
    if(revIt == reverseIndex.end())
        return false;
    return revIt->second.find(id) != revIt->second.end();
}

void ReverseIndex::findDocuments(const WordID wordID, const WordID indexID, std::vector<DocumentID> &result) {
    pair<WordID, WordID> p(indexID, wordID);
    auto revIt = reverseIndex.find(p);
    if(revIt == reverseIndex.end())
        return;
    const DocumentSet &docSet = revIt->second;
    result.insert(result.end(), docSet.begin(), docSet.end());
}

/*
 * These are helper functions for Matcher. They are not member functions to avoid polluting the header
 * with STL includes.
 */

static void addMatches(MatcherPrivate */*p*/, BestIndexMatches &bestIndexMatches, const Word &/*queryWord*/, const WordID indexID, IndexMatches &matches) {
    // operator[] creates the per-index match map on first use.
    map<WordID, int> &indexMatches = bestIndexMatches[indexID];
    for(size_t i=0; i < matches.size(); i++) {
        const WordID matchWordID = matches.getMatch(i);
        const int matchError = matches.getMatchError(i);
        MatchIterator mIt = indexMatches.find(matchWordID);
        // Keep only the smallest error seen for each matched word.
        if(mIt == indexMatches.end() || mIt->second > matchError) {
            indexMatches[matchWordID] = matchError;
        }
    }
}

/*
 * A simple relevancy calculator for a matched word. Better ranking functions exist and should be examined:
 * http://en.wikipedia.org/wiki/TF_IDF
 * http://en.wikipedia.org/wiki/Okapi_BM25
 */
static double calculateRelevancy(MatcherPrivate *p, const WordID wID, const WordID indexID, int error) {
    const LevenshteinIndex * const ind = p->indexes[indexID];
    double errorMultiplier = 100.0/(100.0+error); // Should be adjusted for maxError or word length.
    size_t indexCount = ind->wordCount(wID);
    size_t indexMaxCount = ind->maxCount();
    assert(indexCount > 0);
    assert(indexMaxCount > 0);
    double frequencyMultiplier = 1.0 - double(indexCount)/(indexMaxCount+1);
    double indexWeightMultiplier = p->weights.getWeight(p->store.getWord(indexID));
    return errorMultiplier*frequencyMultiplier*indexWeightMultiplier;
}
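
/*
 * A hedged sketch of what an Okapi BM25 style replacement for the heuristic
 * above could look like. It is not used anywhere: the inputs termDocCount,
 * totalDocs, termFrequency, docLength and averageDocLength are hypothetical
 * statistics that this matcher does not currently track, and k1/b are the
 * customary BM25 tuning constants.
 */
#if 0
static double bm25Score(size_t termDocCount, size_t totalDocs,
        double termFrequency, double docLength, double averageDocLength) {
    const double k1 = 1.2;  // Term frequency saturation.
    const double b = 0.75;  // Document length normalization.
    // Inverse document frequency: rare terms score higher. The +1 inside the
    // logarithm keeps the score non-negative for very common terms.
    double idf = log((totalDocs - termDocCount + 0.5) / (termDocCount + 0.5) + 1.0);
    double norm = termFrequency * (k1 + 1.0)
            / (termFrequency + k1 * (1.0 - b + b * docLength / averageDocLength));
    return idf * norm;
}
#endif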


static void matchIndexes(MatcherPrivate *p, const WordList &query, const SearchParameters &params, const int extraError, BestIndexMatches &bestIndexMatches) {
    for(size_t i=0; i<query.size(); i++) {
        const Word &w = query[i];
        int maxError;
        if(params.isDynamic())
            maxError = params.getDynamicError(w);
        else
            maxError = 2*LevenshteinIndex::getDefaultError();
        maxError += extraError;

        for(IndIterator it = p->indexes.begin(); it != p->indexes.end(); it++) {
            if(params.isNonsearchingField(p->store.getWord(it->first))) {
                continue;
            }
            IndexMatches m;
            it->second->findWords(w, p->e, maxError, m);
            addMatches(p, bestIndexMatches, w, it->first, m);
            debugMessage("Matched word %s in index %s with error %d and got %lu matches.\n",
                    w.asUtf8().c_str(), p->store.getWord(it->first).asUtf8().c_str(), maxError, (unsigned long) m.size());
        }
    }
}

static void gatherMatchedDocuments(MatcherPrivate *p, BestIndexMatches &bestIndexMatches, map<DocumentID, double> &matchedDocuments) {
    for(MatchIndIterator it = bestIndexMatches.begin(); it != bestIndexMatches.end(); it++) {
        for(MatchIterator mIt = it->second.begin(); mIt != it->second.end(); mIt++) {
            vector<DocumentID> tmp;
            p->reverseIndex.findDocuments(mIt->first, it->first, tmp);
            debugMessage("Exact searched \"%s\" in field \"%s\", which was found in %lu documents.\n",
                    p->store.getWord(mIt->first).asUtf8().c_str(),
                    p->store.getWord(it->first).asUtf8().c_str(), (unsigned long)tmp.size());
            for(size_t i=0; i<tmp.size(); i++) {
                DocumentID curDoc = tmp[i];
                // At this point we know the matched word, and which index and field
                // it matched in. Now we can just increment the relevancy of said document.
                double relevancy = calculateRelevancy(p, mIt->first, it->first, mIt->second);
                auto doc = matchedDocuments.find(curDoc);
                if(doc == matchedDocuments.end())
                    matchedDocuments[curDoc] = relevancy;
                else
                    matchedDocuments[curDoc] += relevancy;
            }
        }
    }
}

static void expandQuery(const WordList &query, WordList &expandedQuery) {
    for(size_t i=0; i<query.size(); i++)
        expandedQuery.addWord(query[i]);

    // Add joined adjacent word pairs so a query like "fire fox" can also match "firefox".
    // Writing the bound as i+1 < size() avoids size_t underflow on an empty query.
    for(size_t i=0; i+1<query.size(); i++) {
        expandedQuery.addWord(query[i].join(query[i+1]));
    }
}
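
/*
 * For illustration, assuming Word::join() concatenates its two arguments,
 * a three word query expands like this:
 *
 *   {"open", "office", "writer"}
 *       -> {"open", "office", "writer", "openoffice", "officewriter"}
 */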

static bool subtermsMatch(MatcherPrivate *p, const ResultFilter &filter, size_t term, DocumentID id) {
    for(size_t subTerm=0; subTerm < filter.numSubTerms(term); subTerm++) {
        const Word &filterName = filter.getField(term, subTerm);
        const Word &value = filter.getWord(term, subTerm);
        bool termFound = p->reverseIndex.documentHasTerm(
                p->store.getID(value), p->store.getID(filterName), id);
        if(!termFound) {
            return false;
        }
    }
    return true;
}
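
/*
 * Illustrative note on filter semantics as used in Matcher::match() below:
 * the terms of a ResultFilter are ORed together and the subterms within one
 * term are ANDed, so a filter meaning
 *
 *   (type = "application" AND category = "editor") OR (type = "game")
 *
 * accepts a document as soon as subtermsMatch() is true for any single term.
 */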

Matcher::Matcher() {
    p = new MatcherPrivate();
}

void Matcher::index(const Corpus &c) {
    double buildStart, buildEnd;
    buildStart = hiresTimestamp();
    buildIndexes(c);
    buildEnd = hiresTimestamp();
    debugMessage("Added %lu documents to matcher. It now has %lu indexes. Index population took %.2f seconds.\n",
            (unsigned long) c.size(), (unsigned long) p->indexes.size(), buildEnd - buildStart);
    for(IndIterator it = p->indexes.begin(); it != p->indexes.end(); it++) {
        debugMessage("Index \"%s\" has %lu words and %lu nodes.\n",
                p->store.getWord(it->first).asUtf8().c_str(), (unsigned long) it->second->numWords(),
                (unsigned long) it->second->numNodes());
    }
}

Matcher::~Matcher() {
    for(IndIterator it = p->indexes.begin(); it != p->indexes.end(); it++) {
        delete it->second;
    }
    delete p;
}

void Matcher::buildIndexes(const Corpus &c) {
    for(size_t ci = 0; ci < c.size(); ci++) {
        const Document &d = c.getDocument(ci);
        WordList textNames;
        d.getFieldNames(textNames);
        for(size_t ti=0; ti < textNames.size(); ti++) {
            const Word &fieldName = textNames[ti];
            const WordID fieldID = p->store.getID(fieldName);
            const WordList &text = d.getText(fieldName);
            for(size_t wi=0; wi<text.size(); wi++) {
                const Word &word = text[wi];
                const WordID wordID = p->store.getID(word);
                p->stats.wordProcessed(wordID);
                addToIndex(word, wordID, fieldID);
                p->stats.addedWordToIndex(wordID, fieldName);
                p->reverseIndex.add(wordID, fieldID, d.getID());
            }
        }
    }
}

void Matcher::addToIndex(const Word &word, const WordID wordID, const WordID indexID) {
    LevenshteinIndex *target;
    IndIterator it = p->indexes.find(indexID);
    if(it == p->indexes.end()) {
        target = new LevenshteinIndex();
        p->indexes[indexID] = target;
    } else {
        target = it->second;
    }
    target->insertWord(word, wordID);
}


void Matcher::matchWithRelevancy(const WordList &query, const SearchParameters &params, const int extraError, MatchResults &matchedDocuments) {
    map<DocumentID, double> docs;
    BestIndexMatches bestIndexMatches;
    double start, indexMatchEnd, gatherEnd, finish;

    start = hiresTimestamp();
    matchIndexes(p, query, params, extraError, bestIndexMatches);
    indexMatchEnd = hiresTimestamp();
    // Now we know all matched words in all indexes. Gather up the corresponding documents.
    gatherMatchedDocuments(p, bestIndexMatches, docs);
    gatherEnd = hiresTimestamp();
    for(auto it=docs.begin(); it != docs.end(); it++) {
        matchedDocuments.addResult(it->first, it->second);
    }
    debugMessage("Found a total of %lu documents.\n", (unsigned long) matchedDocuments.size());
    finish = hiresTimestamp();
    debugMessage("Query finished. Index lookups took %.2fs, result gathering %.2fs, result building %.2fs.\n",
            indexMatchEnd - start, gatherEnd - indexMatchEnd, finish - gatherEnd);
}

MatchResults Matcher::match(const WordList &query, const SearchParameters &params) {
    MatchResults matchedDocuments;
    const int maxIterations = 1;
    const int increment = LevenshteinIndex::getDefaultError();
    const size_t minMatches = 10;
    WordList expandedQuery;
    MatchResults allMatches;

    if(query.size() == 0)
        return matchedDocuments;
    expandQuery(query, expandedQuery);
    // Try to search with ever-growing error until we find enough matches.
    // Note that with maxIterations currently set to 1 only the first pass
    // runs, so the error never actually grows.
    for(int i=0; i<maxIterations; i++) {
        MatchResults matches;
        matchWithRelevancy(expandedQuery, params, i*increment, matches);
        if(matches.size() >= minMatches || i == maxIterations-1) {
            allMatches.addResults(matches);
            break;
        }
    }

    /* Filter results into final set. */
    auto &filter = params.getResultFilter();
    for(size_t i=0; i<allMatches.size(); i++) {
        DocumentID id = allMatches.getDocumentID(i);
        for(size_t term=0; term < filter.numTerms(); term++) {
            if(subtermsMatch(p, filter, term, id)) {
                matchedDocuments.copyResult(allMatches, i);
                break;
            }
        }
    }
    return matchedDocuments;
}

MatchResults Matcher::match(const char *queryAsUtf8) {
    return match(splitToWords(queryAsUtf8));
}

MatchResults Matcher::match(const std::string &queryAsUtf8) {
    return match(queryAsUtf8.c_str());
}


MatchResults Matcher::match(const WordList &query) {
    SearchParameters defaults;
    return match(query, defaults);
}

ErrorValues& Matcher::getErrorValues() {
    return p->e;
}

MatchResults Matcher::match(const char *queryAsUtf8, const SearchParameters &params) {
    return match(splitToWords(queryAsUtf8), params);
}
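
/*
 * A minimal usage sketch, shown for illustration only. The corpus population
 * step is elided, and getRelevancy() is assumed to exist on MatchResults
 * alongside getDocumentID(), which is used above.
 */
#if 0
static void exampleUsage(const Corpus &c) {
    Matcher m;
    m.index(c);
    MatchResults results = m.match("quick brwn fox"); // Typos are tolerated.
    for(size_t i=0; i<results.size(); i++)
        printf("doc %u, relevancy %f\n", (unsigned) results.getDocumentID(i),
                results.getRelevancy(i));
}
#endif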

IndexWeights& Matcher::getIndexWeights() {
    return p->weights;
}

COL_NAMESPACE_END