~jpakkane/libcolumbus/hud-rework

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
/*
 * Copyright (C) 2012 Canonical, Ltd.
 *
 * Authors:
 *    Jussi Pakkanen <jussi.pakkanen@canonical.com>
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of version 3 of the GNU Lesser General Public License as published
 * by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * This executable measures the speed and memory consumption
 * of index building.
 */

#include "columbus.hh" // This app should only need public API from Columbus.
#include <cerrno>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <string>

using namespace std;
using namespace Columbus;

/*
 * Builds a Matcher from a text file, one document per non-empty line.
 *
 * Each non-empty line becomes a Document constructed with a running counter
 * (first document gets 1) and with the line stored as the text of its "name"
 * field. Documents are collected into a Corpus that is handed to
 * Matcher::index() and replaced with a fresh one every batchSize documents,
 * so only one batch of documents is held in a Corpus at a time.
 *
 * dataFile: path of the text file to read.
 * maxLines: maximum number of non-empty lines to turn into documents. A value
 *           of zero or less reads no documents at all.
 *
 * Returns a heap-allocated Matcher; the caller owns it and must delete it.
 * Prints an error and terminates the process with exit status 1 if the file
 * cannot be opened. If anything throws while the index is being built, the
 * partially built Matcher and the current Corpus are released before the
 * exception is passed on to the caller.
 */
Matcher* build_matcher(const char *dataFile, int maxLines) {
    const int batchSize = 100000;
    Word field("name");
    int i = 0;
    size_t totalDocs = 0;

    std::ifstream ifile(dataFile);
    if(ifile.fail()) {
        printf("Could not open file %s.\n", dataFile);
        exit(1);
    }
    std::string line;

    Matcher *m = 0;
    Corpus *c = 0;
    try {
        m = new Matcher();
        c = new Corpus();

        // Build Corpus. Timing starts only after the objects exist, so it
        // covers reading and indexing but not their construction.
        const double dataReadStart = hiresTimestamp();
        // The limit is tested before reading so that maxLines <= 0 consumes
        // nothing instead of slipping one document through.
        while(i < maxLines && std::getline(ifile, line)) {
            if(line.empty())
                continue;
            totalDocs++;
            Document d(totalDocs);
            d.addText(field, line.c_str());
            c->addDocument(d);
            i++;
            if(i % batchSize == 0) {
                m->index(*c);
                delete c;
                // Clear the pointer first: if the allocation below throws,
                // the handler must not delete the old Corpus a second time.
                c = 0;
                c = new Corpus();
            }
        }
        // Index whatever is left in the last, possibly partial, batch.
        m->index(*c);
        delete c;
        c = 0;
        const double dataReadEnd = hiresTimestamp();
        printf("Read in %lu documents in %.2f seconds.\n", (unsigned long)totalDocs, dataReadEnd - dataReadStart);
    } catch(...) {
        delete c;
        delete m;
        throw;
    }
    return m;
}

/*
 * Entry point: builds an index from the given data file and discards it.
 *
 * Usage: <program> datafile.txt [num of lines]
 *
 * The optional second argument caps how many lines are indexed; without it
 * the limit is INT_MAX. The argument must be a decimal integer between 1 and
 * INT_MAX; anything else is reported on stderr and rejected.
 *
 * Returns 0 on success, 1 on a usage or argument error, and 666 if building
 * the matcher throws a std::exception.
 * NOTE(review): on POSIX systems the parent only sees the low 8 bits of the
 * exit status, so 666 would be observed as 154 -- confirm whether any script
 * depends on that value before changing it.
 */
int main(int argc, char **argv) {
    int maxLines = INT_MAX;
    if(argc < 2) {
        printf("%s datafile.txt [num of lines]\n", argc > 0 ? argv[0] : "program");
        return 1;
    }
    if(argc > 2) {
        // strtol instead of atoi: atoi cannot report garbage or overflow.
        char *end = 0;
        errno = 0;
        const long requested = strtol(argv[2], &end, 10);
        const bool parsedFully = (end != argv[2]) && (*end == '\0');
        if(!parsedFully || errno == ERANGE || requested < 1 || requested > INT_MAX) {
            fprintf(stderr, "Invalid number of lines: %s\n", argv[2]);
            return 1;
        }
        maxLines = (int)requested;
    }
    try {
        Matcher *m = build_matcher(argv[1], maxLines);
        delete m;
    } catch(const std::exception &e) {
        fprintf(stderr, "Fail: %s\n", e.what());
        return 666;
    }
    return 0;
}