72
71
LOGDEB0(("SearchData::~SearchData\n"));
73
72
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
77
// Expand categories and mime type wild card exps. Categories are
78
// expanded against the configuration, mimetypes against the index
80
bool SearchData::expandFileTypes(Db &db, vector<string>& tps)
82
const RclConfig *cfg = db.getConf();
84
LOGFATAL(("Db::expandFileTypes: null configuration!!\n"));
87
vector<string> exptps;
89
for (vector<string>::iterator it = tps.begin(); it != tps.end(); it++) {
90
if (cfg->isMimeCategory(*it)) {
92
cfg->getMimeCatTypes(*it, tps);
93
exptps.insert(exptps.end(), tps.begin(), tps.end());
96
string mt = stringtolower((const string&)*it);
97
// We set casesens|diacsens to get an equivalent of ixTermMatch()
98
db.termMatch(Db::ET_WILD|Db::ET_CASESENS|Db::ET_DIACSENS, string(),
99
mt, res, -1, "mtype");
100
if (res.entries.empty()) {
101
exptps.push_back(it->c_str());
103
for (vector<TermMatchEntry>::const_iterator rit =
104
res.entries.begin(); rit != res.entries.end(); rit++) {
105
exptps.push_back(strip_prefix(rit->term));
110
sort(exptps.begin(), exptps.end());
111
exptps.erase(unique(exptps.begin(), exptps.end()), exptps.end());
117
static const char *maxXapClauseMsg =
118
"Maximum Xapian query size exceeded. Increase maxXapianClauses "
119
"in the configuration. ";
120
static const char *maxXapClauseCaseDiacMsg =
121
"Or try to use case (C) or diacritics (D) sensitivity qualifiers, or less "
125
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
126
vector<SearchDataClause*>& query,
127
string& reason, void *d)
130
for (qlist_it_t it = query.begin(); it != query.end(); it++) {
132
if (!(*it)->toNativeQuery(db, &nq)) {
133
LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n",
134
(*it)->getReason().c_str()));
135
reason += (*it)->getReason() + " ";
139
LOGDEB(("SearchData::clausesToQuery: skipping empty clause\n"));
142
// If this structure is an AND list, must use AND_NOT for excl clauses.
143
// Else this is an OR list, and there can't be excl clauses (checked by
145
Xapian::Query::op op;
146
if (tp == SCLT_AND) {
147
if ((*it)->getexclude()) {
148
op = Xapian::Query::OP_AND_NOT;
150
op = Xapian::Query::OP_AND;
153
op = Xapian::Query::OP_OR;
156
if (op == Xapian::Query::OP_AND_NOT)
157
xq = Xapian::Query(op, Xapian::Query::MatchAll, nq);
161
xq = Xapian::Query(op, xq, nq);
163
if (int(xq.get_length()) >= getMaxCl()) {
164
LOGERR(("%s\n", maxXapClauseMsg));
165
m_reason += maxXapClauseMsg;
166
if (!o_index_stripchars)
167
m_reason += maxXapClauseCaseDiacMsg;
172
LOGDEB0(("SearchData::clausesToQuery: got %d clauses\n", xq.get_length()));
175
xq = Xapian::Query::MatchAll;
177
*((Xapian::Query *)d) = xq;
181
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
183
LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
186
db.getConf()->getConfParam("maxTermExpand", &m_maxexp);
187
db.getConf()->getConfParam("maxXapianClauses", &m_maxcl);
189
// Walk the clause list translating each in turn and building the
192
if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
193
LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n",
199
// If one of the extremities is unset, compute db extremas
200
if (m_dates.y1 == 0 || m_dates.y2 == 0) {
201
int minyear = 1970, maxyear = 2100;
202
if (!db.maxYearSpan(&minyear, &maxyear)) {
203
LOGERR(("Can't retrieve index min/max dates\n"));
207
if (m_dates.y1 == 0) {
208
m_dates.y1 = minyear;
212
if (m_dates.y2 == 0) {
213
m_dates.y2 = maxyear;
218
LOGDEB(("Db::toNativeQuery: date interval: %d-%d-%d/%d-%d-%d\n",
219
m_dates.y1, m_dates.m1, m_dates.d1,
220
m_dates.y2, m_dates.m2, m_dates.d2));
221
Xapian::Query dq = date_range_filter(m_dates.y1, m_dates.m1, m_dates.d1,
222
m_dates.y2, m_dates.m2, m_dates.d2);
224
LOGINFO(("Db::toNativeQuery: date filter is empty\n"));
226
// If no probabilistic query is provided then promote the daterange
227
// filter to be THE query instead of filtering an empty query.
229
LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
232
xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, dq);
237
if (m_minSize != size_t(-1) || m_maxSize != size_t(-1)) {
239
char min[50], max[50];
240
sprintf(min, "%lld", (long long)m_minSize);
241
sprintf(max, "%lld", (long long)m_maxSize);
242
if (m_minSize == size_t(-1)) {
244
leftzeropad(value, 12);
245
sq = Xapian::Query(Xapian::Query::OP_VALUE_LE, VALUE_SIZE, value);
246
} else if (m_maxSize == size_t(-1)) {
248
leftzeropad(value, 12);
249
sq = Xapian::Query(Xapian::Query::OP_VALUE_GE, VALUE_SIZE, value);
251
string minvalue(min);
252
leftzeropad(minvalue, 12);
253
string maxvalue(max);
254
leftzeropad(maxvalue, 12);
255
sq = Xapian::Query(Xapian::Query::OP_VALUE_RANGE, VALUE_SIZE,
259
// If no probabilistic query is provided then promote the
260
// filter to be THE query instead of filtering an empty query.
262
LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
265
xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, sq);
269
// Add the autophrase if any
270
if (m_autophrase.isNotNull()) {
272
if (m_autophrase->toNativeQuery(db, &apq)) {
273
xq = xq.empty() ? apq :
274
Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, apq);
278
// Add the file type filtering clause if any
279
if (!m_filetypes.empty()) {
280
expandFileTypes(db, m_filetypes);
283
for (vector<string>::iterator it = m_filetypes.begin();
284
it != m_filetypes.end(); it++) {
285
string term = wrap_prefix(mimetype_prefix) + *it;
286
LOGDEB0(("Adding file type term: [%s]\n", term.c_str()));
287
tq = tq.empty() ? Xapian::Query(term) :
288
Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
290
xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq);
293
// Add the neg file type filtering clause if any
294
if (!m_nfiletypes.empty()) {
295
expandFileTypes(db, m_nfiletypes);
298
for (vector<string>::iterator it = m_nfiletypes.begin();
299
it != m_nfiletypes.end(); it++) {
300
string term = wrap_prefix(mimetype_prefix) + *it;
301
LOGDEB0(("Adding negative file type term: [%s]\n", term.c_str()));
302
tq = tq.empty() ? Xapian::Query(term) :
303
Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
305
xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_AND_NOT, xq, tq);
308
*((Xapian::Query *)d) = xq;
312
76
// This is called by the GUI simple search if the option is set: add
424
266
void SearchData::getTerms(HighlightData &hld) const
426
268
for (qlist_cit_t it = m_query.begin(); it != m_query.end(); it++)
427
(*it)->getTerms(hld);
269
(*it)->getTerms(hld);
431
// Splitter callback for breaking a user string into simple terms and
432
// phrases. This is for parts of the user entry which would appear as
433
// a single word because there is no white space inside, but are
434
// actually multiple terms to rcldb (ie term1,term2)
435
class TextSplitQ : public TextSplitP {
437
TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
438
: TextSplitP(prc, flags),
439
curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
442
bool takeword(const std::string &term, int pos, int bs, int be)
444
// Check if the first letter is a majuscule in which
445
// case we do not want to do stem expansion. Need to do this
446
// before unac of course...
447
curnostemexp = unaciscapital(term);
449
return TextSplitP::takeword(term, pos, bs, be);
453
vector<string> terms;
454
vector<bool> nostemexps;
455
const StopList &stops;
456
// Count of terms including stopwords: this is for adjusting
462
class TermProcQ : public TermProc {
464
TermProcQ() : TermProc(0), m_ts(0) {}
465
void setTSQ(TextSplitQ *ts) {m_ts = ts;}
467
bool takeword(const std::string &term, int pos, int bs, int be)
469
m_ts->alltermcount++;
470
if (m_ts->lastpos < pos)
472
bool noexpand = be ? m_ts->curnostemexp : true;
473
LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
474
term.c_str(), pos, noexpand));
475
if (m_terms[pos].size() < term.size()) {
477
m_nste[pos] = noexpand;
483
for (map<int, string>::const_iterator it = m_terms.begin();
484
it != m_terms.end(); it++) {
485
m_ts->terms.push_back(it->second);
486
m_ts->nostemexps.push_back(m_nste[it->first]);
492
map<int, string> m_terms;
493
map<int, bool> m_nste;
498
static void listVector(const string& what, const vector<string>&l)
501
for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
504
LOGDEB0(("%s: %s\n", what.c_str(), a.c_str()));
508
/** Expand term into term list, using appropriate mode: stem, wildcards,
511
* @param mods stem expansion, case and diacritics sensitivity control.
512
* @param term input single word
513
* @param oexp output expansion list
514
* @param sterm output original input term if there were no wildcards
515
* @param prefix field prefix in index. We could recompute it, but the caller
516
* has it already. Used in the simple case where there is nothing to expand,
517
* and we just return the prefixed term (else Db::termMatch deals with it).
519
bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
520
string& ermsg, int mods,
522
vector<string>& oexp, string &sterm,
523
const string& prefix)
525
LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
526
mods, m_field.c_str(), term.c_str(), getStemLang().c_str()));
532
bool maxexpissoft = false;
533
int maxexpand = getSoftMaxExp();
534
if (maxexpand != -1) {
537
maxexpand = getMaxExp();
540
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
542
// If there are no wildcards, add term to the list of user-entered terms
544
m_hldata.uterms.insert(term);
547
// No stem expansion if there are wildcards or if prevented by caller
548
bool nostemexp = (mods & SDCM_NOSTEMMING) != 0;
549
if (haswild || getStemLang().empty()) {
550
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
554
// noexpansion can be modified further down by possible case/diac expansion
555
bool noexpansion = nostemexp && !haswild;
557
int termmatchsens = 0;
559
bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
560
bool case_sensitive = (mods & SDCM_CASESENS) != 0;
562
if (o_index_stripchars) {
563
diac_sensitive = case_sensitive = false;
565
// If we are working with a raw index, apply the rules for case and
566
// diacritics sensitivity.
568
// If any character has a diacritic, we become
569
// diacritic-sensitive. Note that the way that the test is
570
// performed (conversion+comparison) will automatically ignore
571
// accented characters which are actually a separate letter
572
if (getAutoDiac() && unachasaccents(term)) {
573
LOGDEB0(("expandTerm: term has accents -> diac-sensitive\n"));
574
diac_sensitive = true;
577
// If any character apart from the first is uppercase, we become
578
// case-sensitive. The first character is reserved for
579
// turning off stemming. You need to use a query language
580
// modifier to search for Floor in a case-sensitive way.
583
if (getAutoCase() && unachasuppercase(term.substr(it.getBpos()))) {
584
LOGDEB0(("expandTerm: term has uppercase -> case-sensitive\n"));
585
case_sensitive = true;
588
// If we are sensitive to case or diacritics turn stemming off
589
if (diac_sensitive || case_sensitive) {
590
LOGDEB0(("expandTerm: diac or case sens set -> stemexpand off\n"));
594
if (!case_sensitive || !diac_sensitive)
599
termmatchsens |= Db::ET_CASESENS;
601
termmatchsens |= Db::ET_DIACSENS;
604
oexp.push_back(prefix + term);
605
m_hldata.terms[term] = term;
606
LOGDEB(("ExpandTerm: noexpansion: final: %s\n", stringsToString(oexp).c_str()));
610
Db::MatchType mtyp = haswild ? Db::ET_WILD :
611
nostemexp ? Db::ET_NONE : Db::ET_STEM;
613
if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand,
618
// Term match entries to vector of terms
619
if (int(res.entries.size()) >= maxexpand && !maxexpissoft) {
620
ermsg = "Maximum term expansion size exceeded."
621
" Maybe use case/diacritics sensitivity or increase maxTermExpand.";
624
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
625
it != res.entries.end(); it++) {
626
oexp.push_back(it->term);
628
// If the term does not exist at all in the db, the return from
629
// termMatch() is going to be empty, which is not what we want (we
630
// would then compute an empty Xapian query)
632
oexp.push_back(prefix + term);
634
// Remember the uterm-to-expansion links
635
for (vector<string>::const_iterator it = oexp.begin();
636
it != oexp.end(); it++) {
637
m_hldata.terms[strip_prefix(*it)] = term;
639
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
643
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
644
void multiply_groups(vector<vector<string> >::const_iterator vvit,
645
vector<vector<string> >::const_iterator vvend,
646
vector<string>& comb,
647
vector<vector<string> >&allcombs)
649
// Remember my string vector and compute next, for recursive calls.
650
vector<vector<string> >::const_iterator myvit = vvit++;
652
// Walk the string vector I'm called upon and, for each string,
653
// add it to current result, and call myself recursively on the
654
// next string vector. The last call (last element of the vector of
655
// vectors), adds the elementary result to the output
657
// Walk my string vector
658
for (vector<string>::const_iterator strit = (*myvit).begin();
659
strit != (*myvit).end(); strit++) {
661
// Add my current value to the string vector we're building
662
comb.push_back(*strit);
665
// Last call: store current result
666
allcombs.push_back(comb);
668
// Call recursively on next string vector
669
multiply_groups(vvit, vvend, comb, allcombs);
671
// Pop the value I just added (make room for the next element in my
677
void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
681
vector<Xapian::Query>& pqueries(*(vector<Xapian::Query>*)pq);
682
LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
683
span.c_str(), (unsigned int)mods));
685
string sterm; // dumb version of user term
688
const FieldTraits *ftp;
689
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
690
prefix = wrap_prefix(ftp->pfx);
693
if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix))
696
// Set up the highlight data. No prefix should go in there
697
for (vector<string>::const_iterator it = exp.begin();
698
it != exp.end(); it++) {
699
m_hldata.groups.push_back(vector<string>(1, it->substr(prefix.size())));
700
m_hldata.slacks.push_back(0);
701
m_hldata.grpsugidx.push_back(m_hldata.ugroups.size() - 1);
704
// Push either term or OR of stem-expanded set
705
Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
706
m_curcl += exp.size();
708
// If sterm (simplified original user term) is not null, give it a
709
// relevance boost. We do this even if no expansion occurred (else
710
// the non-expanded terms in a term list would end-up with even
711
// less wqf). This does not happen if there are wildcards anywhere
713
// We normally boost the original term in the stem expansion list. Don't
714
// do it if there are wildcards anywhere, this would skew the results.
715
bool doBoostUserTerm =
716
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
717
(m_parentSearch == 0 && !m_haveWildCards);
718
if (doBoostUserTerm && !sterm.empty()) {
719
xq = Xapian::Query(Xapian::Query::OP_OR, xq,
720
Xapian::Query(prefix+sterm,
721
original_term_wqf_booster));
723
pqueries.push_back(xq);
726
// User entry element had several terms: transform into a PHRASE or
727
// NEAR xapian query, the elements of which can themselves be OR
728
// queries if the terms get expanded by stemming or wildcards (we
729
// don't do stemming for PHRASE though)
730
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
731
TextSplitQ *splitData,
733
bool useNear, int slack)
735
vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
736
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
737
Xapian::Query::OP_PHRASE;
738
vector<Xapian::Query> orqueries;
739
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
740
bool hadmultiple = false;
742
vector<vector<string> >groups;
745
const FieldTraits *ftp;
746
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
747
prefix = wrap_prefix(ftp->pfx);
750
if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
751
orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
755
// Go through the list and perform stem/wildcard expansion for each element
756
vector<bool>::iterator nxit = splitData->nostemexps.begin();
757
for (vector<string>::iterator it = splitData->terms.begin();
758
it != splitData->terms.end(); it++, nxit++) {
759
LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
760
// Adjust when we do stem expansion. Not if disabled by
761
// caller, not inside phrases, and some versions of xapian
762
// will accept only one OR clause inside NEAR.
763
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE)
764
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
766
#endif // single OR inside NEAR
770
lmods |= SearchDataClause::SDCM_NOSTEMMING;
773
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
775
LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
777
// groups is used for highlighting, we don't want prefixes in there.
778
vector<string> noprefs;
779
for (vector<string>::const_iterator it = exp.begin();
780
it != exp.end(); it++) {
781
noprefs.push_back(it->substr(prefix.size()));
783
groups.push_back(noprefs);
784
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
785
exp.begin(), exp.end()));
786
m_curcl += exp.size();
787
if (m_curcl >= getMaxCl())
789
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
795
if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
796
orqueries.push_back(Xapian::Query(prefix + end_of_field_term));
800
// Generate an appropriate PHRASE/NEAR query with adjusted slack
801
// For phrases, give a relevance boost like we do for original terms
802
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
803
splitData->alltermcount, splitData->lastpos));
804
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
805
splitData->lastpos + 1 + slack);
806
if (op == Xapian::Query::OP_PHRASE)
807
xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
808
original_term_wqf_booster);
809
pqueries.push_back(xq);
811
// Add all combinations of NEAR/PHRASE groups to the highlighting data.
812
vector<vector<string> > allcombs;
814
multiply_groups(groups.begin(), groups.end(), comb, allcombs);
816
// Insert the search groups and slacks in the highlight data, with
817
// a reference to the user entry that generated them:
818
m_hldata.groups.insert(m_hldata.groups.end(),
819
allcombs.begin(), allcombs.end());
820
m_hldata.slacks.insert(m_hldata.slacks.end(), allcombs.size(), slack);
821
m_hldata.grpsugidx.insert(m_hldata.grpsugidx.end(), allcombs.size(),
822
m_hldata.ugroups.size() - 1);
825
// Trim string beginning with ^ or ending with $ and convert to flags
826
static int stringToMods(string& s)
829
// Check for an anchored search
831
if (s.length() > 0 && s[0] == '^') {
832
mods |= Rcl::SearchDataClause::SDCM_ANCHORSTART;
835
if (s.length() > 0 && s[s.length()-1] == '$') {
836
mods |= Rcl::SearchDataClause::SDCM_ANCHOREND;
837
s.erase(s.length()-1);
843
* Turn user entry string (NOT query language) into a list of xapian queries.
844
* We just separate words and phrases, and do wildcard and stem expansion,
846
* This is used to process data entered into an OR/AND/NEAR/PHRASE field of
847
* the GUI (in the case of NEAR/PHRASE, clausedist adds dquotes to the user
850
* This appears awful, and it would seem that the split into
851
* terms/phrases should be performed in the upper layer so that we
852
* only receive pure term or near/phrase pure elements here, but in
853
* fact there are things that would appear like terms to naive code,
854
* and which may actually be turned into phrases (ie: tom:jerry),
855
* in a manner which intimately depends on the index implementation,
856
* so that it makes sense to process this here.
858
* The final list contains one query for each term or phrase
859
* - Elements corresponding to a stem-expanded part are an OP_OR
860
* composition of the stem-expanded terms (or a single term query).
861
* - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
862
* composition of the phrase terms (no stem expansion in this case)
863
* @return the subquery count (either or'd stem-expanded terms or phrase word
866
bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq,
867
string &ermsg, void *pq,
868
int slack, bool useNear)
870
vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
871
int mods = m_modifiers;
873
LOGDEB(("StringToXapianQ:pUS:: qstr [%s] fld [%s] mods 0x%x "
874
"slack %d near %d\n",
875
iq.c_str(), m_field.c_str(), mods, slack, useNear));
878
const StopList stops = db.getStopList();
880
// Simple whitespace-split input into user-level words and
881
// double-quoted phrases: word1 word2 "this is a phrase".
883
// The text splitter may further still decide that the resulting
884
// "words" are really phrases, this depends on separators:
885
// [paul@dom.net] would still be a word (span), but [about:me]
886
// will probably be handled as a phrase.
887
vector<string> phrases;
888
TextSplit::stringToStrings(iq, phrases);
890
// Process each element: textsplit into terms, handle stem/wildcard
891
// expansion and transform into an appropriate Xapian::Query
893
for (vector<string>::iterator it = phrases.begin();
894
it != phrases.end(); it++) {
895
LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
896
// Anchoring modifiers
897
int amods = stringToMods(*it);
898
int terminc = amods != 0 ? 1 : 0;
900
// If there are multiple spans in this element, including
901
// at least one composite, we have to increase the slack
902
// else a phrase query including a span would fail.
903
// Ex: "term0@term1 term2" is onlyspans-split as:
904
// 0 term0@term1 0 12
906
// The position of term2 is 2, not 1, so a phrase search
908
// We used to do word split, searching for
909
// "term0 term1 term2" instead, which may have worse
910
// performance, but will succeed.
911
// We now adjust the phrase/near slack by comparing the term count
912
// and the last position
914
// The term processing pipeline:
916
TermProc *nxt = &tpq;
917
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
918
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
919
//tpcommon.onlygrams(true);
920
TermProcPrep tpprep(nxt);
921
if (o_index_stripchars)
924
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
925
TextSplit::TXTS_KEEPWILD),
927
tpq.setTSQ(&splitter);
928
splitter.text_to_words(*it);
930
slack += splitter.lastpos - splitter.terms.size() + 1;
932
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
933
switch (splitter.terms.size() + terminc) {
938
if (splitter.nostemexps.front())
939
lmods |= SearchDataClause::SDCM_NOSTEMMING;
940
m_hldata.ugroups.push_back(splitter.terms);
941
processSimpleSpan(db, ermsg, splitter.terms.front(),
946
m_hldata.ugroups.push_back(splitter.terms);
947
processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries,
950
if (m_curcl >= getMaxCl()) {
951
ermsg = maxXapClauseMsg;
952
if (!o_index_stripchars)
953
ermsg += maxXapClauseCaseDiacMsg;
957
} catch (const Xapian::Error &e) {
959
} catch (const string &s) {
961
} catch (const char *s) {
964
ermsg = "Caught unknown exception";
966
if (!ermsg.empty()) {
967
LOGERR(("stringToXapianQueries: %s\n", ermsg.c_str()));
973
// Translate a simple OR or AND search clause.
974
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
976
LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
977
getStemLang().c_str()));
979
Xapian::Query *qp = (Xapian::Query *)p;
980
*qp = Xapian::Query();
982
Xapian::Query::op op;
984
case SCLT_AND: op = Xapian::Query::OP_AND; break;
985
case SCLT_OR: op = Xapian::Query::OP_OR; break;
987
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
991
vector<Xapian::Query> pqueries;
992
if (!processUserString(db, m_text, m_reason, &pqueries))
994
if (pqueries.empty()) {
995
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
999
*qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
1000
if (m_weight != 1.0) {
1001
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
1006
// Translate a FILENAME search clause. This always comes
1007
// from a "filename" search from the gui or recollq. A query language
1008
// "filename:"-prefixed field will not go through here, but through
1009
// the generic field-processing code.
1011
// We do not split the entry any more (used to do some crazy thing
1012
// about expanding multiple fragments in the past). We just take the
1013
// value blanks and all and expand this against the indexed unsplit
1015
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
1017
Xapian::Query *qp = (Xapian::Query *)p;
1018
*qp = Xapian::Query();
1020
int maxexp = getSoftMaxExp();
1022
maxexp = getMaxExp();
1024
vector<string> names;
1025
db.filenameWildExp(m_text, names, maxexp);
1026
*qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
1028
if (m_weight != 1.0) {
1029
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
1034
// Translate a dir: path filtering clause. See comments in .h
1035
bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p)
1037
LOGDEB(("SearchDataClausePath::toNativeQuery: [%s]\n", m_text.c_str()));
1038
Xapian::Query *qp = (Xapian::Query *)p;
1039
*qp = Xapian::Query();
1041
if (m_text.empty()) {
1042
LOGERR(("SearchDataClausePath: empty path??\n"));
1043
m_reason = "Empty path ?";
1047
vector<Xapian::Query> orqueries;
1049
if (m_text[0] == '/')
1050
orqueries.push_back(Xapian::Query(wrap_prefix(pathelt_prefix)));
273
void SearchData::dump(ostream& o) const
275
o << "SearchData: " << " qs " << int(m_query.size()) <<
276
" ft " << m_filetypes.size() << " nft " << m_nfiletypes.size() <<
277
" hd " << m_haveDates << " maxs " << int(m_maxSize) << " mins " <<
278
int(m_minSize) << " wc " << m_haveWildCards << "\n";
279
for (std::vector<SearchDataClause*>::const_iterator it =
280
m_query.begin(); it != m_query.end(); it++) {
287
void SearchDataClause::dump(ostream& o) const
289
o << "SearchDataClause??";
292
void SearchDataClauseSimple::dump(ostream& o) const
294
o << "ClauseSimple: ";
298
if (!m_field.empty())
299
o << m_field << " : ";
303
void SearchDataClauseFilename::dump(ostream& o) const
308
o << "[" << m_text << "]";
311
void SearchDataClausePath::dump(ostream& o) const
316
o << "[" << m_text << "]";
319
void SearchDataClauseDist::dump(ostream& o) const
321
if (m_tp == SCLT_NEAR)
322
o << "ClauseDist: NEAR: ";
1052
m_text = path_tildexpand(m_text);
1054
vector<string> vpath;
1055
stringToTokens(m_text, vpath, "/");
1057
for (vector<string>::const_iterator pit = vpath.begin();
1058
pit != vpath.end(); pit++){
1062
if (!expandTerm(db, m_reason,
1063
SDCM_NOSTEMMING|SDCM_CASESENS|SDCM_DIACSENS,
1064
*pit, exp, sterm, wrap_prefix(pathelt_prefix))) {
1067
LOGDEB0(("SDataPath::toNative: exp size %d\n", exp.size()));
1068
listVector("", exp);
1069
if (exp.size() == 1)
1070
orqueries.push_back(Xapian::Query(exp[0]));
1072
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
1073
exp.begin(), exp.end()));
1074
m_curcl += exp.size();
1075
if (m_curcl >= getMaxCl())
1079
*qp = Xapian::Query(Xapian::Query::OP_PHRASE,
1080
orqueries.begin(), orqueries.end());
1082
if (m_weight != 1.0) {
1083
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
324
o << "ClauseDist: PHRA: ";
329
if (!m_field.empty())
330
o << m_field << " : ";
1088
// Translate NEAR or PHRASE clause.
1089
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
334
void SearchDataClauseSub::dump(ostream& o) const
1091
LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
1093
Xapian::Query *qp = (Xapian::Query *)p;
1094
*qp = Xapian::Query();
1096
vector<Xapian::Query> pqueries;
1099
// We produce a single phrase out of the user entry then use
1100
// stringToXapianQueries() to lowercase and simplify the phrase
1101
// terms etc. This will result in a single (complex)
1103
if (m_text.find('\"') != string::npos) {
1104
m_text = neutchars(m_text, "\"");
1106
string s = cstr_dquote + m_text + cstr_dquote;
1107
bool useNear = (m_tp == SCLT_NEAR);
1108
if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear))
1110
if (pqueries.empty()) {
1111
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
1115
*qp = *pqueries.begin();
1116
if (m_weight != 1.0) {
1117
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
336
o << "ClauseSub {\n";
337
m_sub.getconstptr()->dump(o);
1122
341
} // Namespace Rcl