198
199
parms.get(Doc::keyfmt, doc.fmtime);
199
200
parms.get(Doc::keydmt, doc.dmtime);
200
201
parms.get(Doc::keyoc, doc.origcharset);
201
parms.get(cstr_keycap, doc.meta[Doc::keytt]);
202
parms.get(cstr_caption, doc.meta[Doc::keytt]);
202
203
parms.get(Doc::keykw, doc.meta[Doc::keykw]);
203
204
parms.get(Doc::keyabs, doc.meta[Doc::keyabs]);
204
205
// Possibly remove synthetic abstract indicator (if it's there, we
209
210
doc.syntabs = true;
211
212
parms.get(Doc::keyipt, doc.ipath);
213
parms.get(Doc::keypcs, doc.pcbytes);
212
214
parms.get(Doc::keyfs, doc.fbytes);
213
215
parms.get(Doc::keyds, doc.dbytes);
214
216
parms.get(Doc::keysig, doc.sig);
859
867
// The splitter breaks text into words and adds postings to the Xapian
860
868
// document. We use a single object to split all of the document
861
869
// fields and position jumps to separate fields
862
class TextSplitDb : public TextSplit {
870
class TextSplitDb : public TextSplitP {
864
Xapian::WritableDatabase db;
865
872
Xapian::Document &doc; // Xapian document
866
873
// Base for document section. Gets large increment when we change
867
874
// sections, to avoid cross-section proximity matches.
874
881
// to compute the first position of the next section.
875
882
Xapian::termpos curpos;
878
TextSplitDb(Xapian::WritableDatabase idb,
879
Xapian::Document &d, StopList &_stops)
880
: db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1)
884
TextSplitDb(Xapian::Document &d, TermProc *prc)
886
doc(d), basepos(1), curpos(0), wdfinc(1)
882
888
// Reimplement text_to_words to add start and end special terms
883
889
virtual bool text_to_words(const string &in);
884
bool takeword(const std::string &term, int pos, int, int);
885
890
// Set the term prefix. When it is non-empty, each indexed term also gets
// a second posting under prefix + term.
void setprefix(const string& pref)
{
    prefix = pref;
}
886
891
// Set the increment value handed to add_posting() as its third argument
// for the terms indexed from now on.
void setwdfinc(int i)
{
    wdfinc = i;
}
893
friend class TermProcIdx;
889
896
// If prefix is set, we also add a posting for the prefixed terms
890
897
// (i.e., for titles, add postings for both "term" and "Sterm")
922
928
} XCATCHERROR(ermsg);
923
929
if (!ermsg.empty()) {
924
930
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
925
basepos += curpos + 100;
928
937
basepos += curpos + 100;
932
// Get one term from the doc, remove accents and lowercase, then add posting
933
bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
935
LOGDEB2(("TextSplitDb::takeword: [%s]\n", _term.c_str()));
938
if (!unacmaybefold(_term, term, "UTF-8", true)) {
939
LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n",
942
// We don't generate a fatal error because of a bad term
946
if (stops.isStop(term)) {
947
LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
951
// Compute absolute position (pos is relative to current segment),
952
// and remember relative.
957
// Index without prefix, using the field-specific weighting
958
doc.add_posting(term, pos, wdfinc);
941
class TermProcIdx : public TermProc {
943
// Build the processor with a 0 argument for the TermProc base and no
// TextSplitDb attached yet; setTSD() must supply one before use.
TermProcIdx()
    : TermProc(0), m_ts(0)
{
}
944
// Attach the TextSplitDb whose document, base position, prefix and
// wdf increment are used by takeword(). The pointer is stored as-is.
void setTSD(TextSplitDb *ts)
{
    m_ts = ts;
}
946
bool takeword(const std::string &term, int pos, int, int)
948
// Compute absolute position (pos is relative to current segment),
949
// and remember relative.
951
pos += m_ts->basepos;
954
// Index without prefix, using the field-specific weighting
955
LOGDEB1(("Emitting term at %d : [%s]\n", pos, term.c_str()));
956
m_ts->doc.add_posting(term, pos, m_ts->wdfinc);
959
957
#ifdef TESTING_XAPIAN_SPELL
960
if (Db::isSpellingCandidate(term)) {
961
db.add_spelling(term);
958
if (Db::isSpellingCandidate(term)) {
959
m_ts->db.add_spelling(term);
964
// Index the prefixed term.
965
if (!prefix.empty()) {
966
doc.add_posting(prefix + term, pos, wdfinc);
969
} XCATCHERROR(ermsg);
970
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
962
// Index the prefixed term.
963
if (!m_ts->prefix.empty()) {
964
m_ts->doc.add_posting(m_ts->prefix + term, pos, m_ts->wdfinc);
967
} XCATCHERROR(ermsg);
968
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
974
977
#ifdef TESTING_XAPIAN_SPELL
975
978
string Db::getSpellingSuggestion(const string& word)
1006
1009
// Append one "NM=VAL\n" line to the record string R.
// Caveats:
//  - The expansion is a bare brace block, so a trailing ';' at the call
//    site yields an extra empty statement. Do not use it unbraced as the
//    body of an 'if' that is followed by an 'else'.
//  - Arguments are not parenthesized and NM + "=" is evaluated first, so
//    at least one of those operands must be a string object for the
//    concatenation to compile.
#define RECORD_APPEND(R, NM, VAL) {R += NM + "=" + VAL + "\n";}
1012
void *DbUpdWorker(void* vdbp)
1014
Db *dbp = (Db *)vdbp;
1015
WorkQueue<DbUpdTask*> *tqp = &(dbp->m_ndb->m_wqueue);
1019
if (!tqp->take(&tsk)) {
1023
LOGDEB(("DbUpdWorker: got task, ql %d\n", int(tqp->size())));
1025
const char *fnc = tsk->udi.c_str();
1028
// Add db entry or update existing entry:
1031
dbp->m_ndb->xwdb.replace_document(tsk->uniterm,
1033
if (did < dbp->updated.size()) {
1034
dbp->updated[did] = true;
1035
LOGINFO(("Db::add: docid %d updated [%s]\n", did, fnc));
1037
LOGINFO(("Db::add: docid %d added [%s]\n", did, fnc));
1039
} XCATCHERROR(ermsg);
1041
if (!ermsg.empty()) {
1042
LOGERR(("Db::add: replace_document failed: %s\n", ermsg.c_str()));
1044
// FIXME: is this ever actually needed?
1046
dbp->m_ndb->xwdb.add_document(tsk->doc);
1047
LOGDEB(("Db::add: %s added (failed re-seek for duplicate)\n",
1049
} XCATCHERROR(ermsg);
1050
if (!ermsg.empty()) {
1051
LOGERR(("Db::add: add_document failed: %s\n", ermsg.c_str()));
1056
dbp->maybeflush(tsk->txtlen);
1061
#endif // IDX_THREADS
1008
1063
// Add document in internal form to the database: index the terms in
1009
1064
// the title abstract and body and add special terms for file name,
1010
1065
// date, mime type, etc., create the document data record (more
1011
1066
// metadata), and update database
1012
1067
bool Db::addOrUpdate(const string &udi, const string &parent_udi,
1015
1070
LOGDEB(("Db::add: udi [%s] parent [%s]\n",
1016
1071
udi.c_str(), parent_udi.c_str()));
1030
1085
m_occtxtsz = m_curtxtsz;
1035
1088
Xapian::Document newdocument;
1036
TextSplitDb splitter(m_ndb->xwdb, newdocument, m_stops);
1090
// The term processing pipeline:
1092
TermProc *nxt = &tpidx;
1093
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
1094
// TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
1095
TermProcPrep tpprep(nxt); nxt = &tpprep;
1097
TextSplitDb splitter(newdocument, nxt);
1098
tpidx.setTSD(&splitter);
1038
1100
// Split and index file name as document term(s)
1039
1101
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
1194
1256
RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
1196
if (!doc.fbytes.empty())
1258
if (doc.fbytes.empty())
1259
doc.fbytes = doc.pcbytes;
1261
if (!doc.fbytes.empty()) {
1197
1262
RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
1263
leftzeropad(doc.fbytes, 12);
1264
newdocument.add_value(VALUE_SIZE, doc.fbytes);
1267
if (!doc.pcbytes.empty())
1268
RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
1270
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
1271
RECORD_APPEND(record, Doc::keyds, sizebuf);
1198
1273
// Note that we add the signature both as a value and in the data record
1199
1274
if (!doc.sig.empty())
1200
1275
RECORD_APPEND(record, Doc::keysig, doc.sig);
1201
1276
newdocument.add_value(VALUE_SIG, doc.sig);
1204
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
1205
RECORD_APPEND(record, Doc::keyds, sizebuf);
1207
1278
if (!doc.ipath.empty())
1208
1279
RECORD_APPEND(record, Doc::keyipt, doc.ipath);
1210
if (doc.meta[Doc::keytt].empty())
1211
doc.meta[Doc::keytt] = doc.utf8fn;
1212
1281
doc.meta[Doc::keytt] =
1213
1282
neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
1214
1283
if (!doc.meta[Doc::keytt].empty())
1215
RECORD_APPEND(record, cstr_keycap, doc.meta[Doc::keytt]);
1284
RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]);
1217
1286
trimstring(doc.meta[Doc::keykw], " \t\r\n");
1218
1287
doc.meta[Doc::keykw] =
1264
1333
LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
1265
1334
newdocument.set_data(record);
1337
DbUpdTask *tp = new DbUpdTask(udi, uniterm, newdocument, doc.text.length());
1338
if (!m_ndb->m_wqueue.put(tp)) {
1339
LOGERR(("Db::addOrUpdate:Cant queue task\n"));
1267
1343
const char *fnc = udi.c_str();
1365
1442
// Set the uptodate flag for doc / pseudo doc
1366
updated[*docid] = true;
1443
if (m_mode != DbRO) {
1444
updated[*docid] = true;
1368
// Set the existence flag for all the subdocs (if any)
1369
vector<Xapian::docid> docids;
1370
if (!m_ndb->subDocs(udi, docids)) {
1371
LOGERR(("Rcl::Db::needUpdate: can't get subdocs list\n"));
1374
for (vector<Xapian::docid>::iterator it = docids.begin();
1375
it != docids.end(); it++) {
1376
if (*it < updated.size()) {
1377
LOGDEB2(("Db::needUpdate: set flag for docid %d\n", *it));
1378
updated[*it] = true;
1446
// Set the existence flag for all the subdocs (if any)
1447
vector<Xapian::docid> docids;
1448
if (!m_ndb->subDocs(udi, docids)) {
1449
LOGERR(("Rcl::Db::needUpdate: can't get subdocs list\n"));
1452
for (vector<Xapian::docid>::iterator it = docids.begin();
1453
it != docids.end(); it++) {
1454
if (*it < updated.size()) {
1455
LOGDEB2(("Db::needUpdate: set flag for docid %d\n", *it));
1456
updated[*it] = true;
1559
1647
// each end: match any substring
1560
1648
if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') {
1561
1649
pattern = pattern.substr(1, pattern.size() -2);
1562
} else if (pattern.find_first_of("*?[") == string::npos &&
1650
} else if (pattern.find_first_of(cstr_minwilds) == string::npos &&
1563
1651
!unaciscapital(pattern)) {
1564
1652
pattern = "*" + pattern + "*";
1565
1653
} // else let it be