3
# Copyright (C) 2007 Lemur Consulting Ltd
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation; either version 2 of the License, or
8
# (at your option) any later version.
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
# GNU General Public License for more details.
15
# You should have received a copy of the GNU General Public License along
16
# with this program; if not, write to the Free Software Foundation, Inc.,
17
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18
r"""searchconnection.py: A connection to the search engine for searching.
21
__docformat__ = "restructuredtext en"
25
import math
import re as _re
import cPickle as _cPickle

import xapian as _xapian

import _checkxapian
from datastructures import *
from fieldactions import *
import errors as _errors
import fieldmappings as _fieldmappings
import highlight as _highlight
import indexerconnection as _indexerconnection
from replaylog import log as _log
38
class SearchResult(ProcessedDocument):
39
"""A result from a search.
41
As well as being a ProcessedDocument representing the document in the
42
database, the result has several members which may be used to get
43
information about how well the document matches the search:
45
- `rank`: The rank of the document in the search results, starting at 0
46
(ie, 0 is the "top" result, 1 is the second result, etc).
48
- `weight`: A floating point number indicating the weight of the result
49
document. The value is only meaningful relative to other results for a
50
given search - a different search, or the same search with a different
51
database, may give an entirely different scale to the weights. This
52
should not usually be displayed to users, but may be useful if trying to
53
perform advanced reweighting operations on search results.
55
- `percent`: A percentage value for the weight of a document. This is
56
just a rescaled form of the `weight` member. It doesn't represent any
57
kind of probability value; the only real meaning of the numbers is that,
58
within a single set of results, a document with a higher percentage
59
corresponds to a better match. Because the percentage doesn't really
60
represent a probability, or a confidence value, it is probably unhelpful
61
to display it to most users, since they tend to place an over emphasis
62
on its meaning. However, it is included because it may be useful
66
def __init__(self, msetitem, results):
67
ProcessedDocument.__init__(self, results._fieldmappings, msetitem.document)
68
self.rank = msetitem.rank
69
self.weight = msetitem.weight
70
self.percent = msetitem.percent
71
self._results = results
73
def _get_language(self, field):
74
"""Get the language that should be used for a given field.
76
Raises a KeyError if the field is not known.
79
actions = self._results._conn._field_actions[field]._actions
80
for action, kwargslist in actions.iteritems():
81
if action == FieldActions.INDEX_FREETEXT:
82
for kwargs in kwargslist:
84
return kwargs['language']
89
def summarise(self, field, maxlen=600, hl=('<b>', '</b>'), query=None):
90
"""Return a summarised version of the field specified.
92
This will return a summary of the contents of the field stored in the
93
search result, with words which match the query highlighted.
95
The maximum length of the summary (in characters) may be set using the
98
The return value will be a string holding the summary, with
99
highlighting applied. If there are multiple instances of the field in
100
the document, the instances will be joined with a newline character.
102
To turn off highlighting, set hl to None. Each highlight will consist
103
of the first entry in the `hl` list being placed before the word, and
104
the second entry in the `hl` list being placed after the word.
106
Any XML or HTML style markup tags in the field will be stripped before
107
the summarisation algorithm is applied.
109
If `query` is supplied, it should contain a Query object, as returned
110
from SearchConnection.query_parse() or related methods, which will be
111
used as the basis of the summarisation and highlighting rather than the
112
query which was used for the search.
114
Raises KeyError if the field is not known.
117
highlighter = _highlight.Highlighter(language_code=self._get_language(field))
118
field = self.data[field]
120
text = '\n'.join(field)
122
query = self._results._query
123
return highlighter.makeSample(text, query, maxlen, hl)
125
def highlight(self, field, hl=('<b>', '</b>'), strip_tags=False, query=None):
126
"""Return a highlighted version of the field specified.
128
This will return all the contents of the field stored in the search
129
result, with words which match the query highlighted.
131
The return value will be a list of strings (corresponding to the list
132
of strings which is the raw field data).
134
Each highlight will consist of the first entry in the `hl` list being
135
placed before the word, and the second entry in the `hl` list being
136
placed after the word.
138
If `strip_tags` is True, any XML or HTML style markup tags in the field
139
will be stripped before highlighting is applied.
141
If `query` is supplied, it should contain a Query object, as returned
142
from SearchConnection.query_parse() or related methods, which will be
143
used as the basis of the summarisation and highlighting rather than the
144
query which was used for the search.
146
Raises KeyError if the field is not known.
149
highlighter = _highlight.Highlighter(language_code=self._get_language(field))
150
field = self.data[field]
153
query = self._results._query
155
results.append(highlighter.highlight(text, query, hl, strip_tags))
159
return ('<SearchResult(rank=%d, id=%r, data=%r)>' %
160
(self.rank, self.id, self.data))
163
class SearchResultIter(object):
    """An iterator over a set of results from a search.

    """
    def __init__(self, results, order):
        """Create an iterator over `results`.

        `order` is either None (iterate the MSet in rank order) or a
        sequence of ranks giving an explicit iteration order.

        """
        self._results = results
        self._order = order
        if self._order is None:
            self._iter = iter(results._mset)
        else:
            self._iter = iter(self._order)

    def next(self):
        if self._order is None:
            msetitem = self._iter.next()
        else:
            # The order is a list of ranks: translate each back to an MSet hit.
            index = self._iter.next()
            msetitem = self._results._mset.get_hit(index)
        return SearchResult(msetitem, self._results)
184
def _get_significant_digits(value, lower, upper):
185
"""Get the significant digits of value which are constrained by the
186
(inclusive) lower and upper bounds.
188
If there are no significant digits which are definitely within the
189
bounds, exactly one significant digit will be returned in the result.
191
>>> _get_significant_digits(15,15,15)
193
>>> _get_significant_digits(15,15,17)
195
>>> _get_significant_digits(4777,208,6000)
197
>>> _get_significant_digits(4777,4755,4790)
199
>>> _get_significant_digits(4707,4695,4710)
201
>>> _get_significant_digits(4719,4717,4727)
203
>>> _get_significant_digits(0,0,0)
205
>>> _get_significant_digits(9,9,10)
207
>>> _get_significant_digits(9,9,100)
211
assert(lower <= value)
212
assert(value <= upper)
215
# Get the first power of 10 greater than the difference.
216
# This corresponds to the magnitude of the smallest significant digit.
220
pos_pow_10 = int(10 ** math.ceil(math.log10(diff)))
222
# Special case for situation where we don't have any significant digits:
223
# get the magnitude of the most significant digit in value.
224
if pos_pow_10 > value:
228
pos_pow_10 = int(10 ** math.floor(math.log10(value)))
230
# Return the value, rounded to the nearest multiple of pos_pow_10
231
return ((value + pos_pow_10 // 2) // pos_pow_10) * pos_pow_10
233
class SearchResults(object):
234
"""A set of results of a search.
237
def __init__(self, conn, enq, query, mset, fieldmappings, tagspy,
238
tagfields, facetspy, facetfields, facethierarchy,
244
self._mset_order = None
245
self._fieldmappings = fieldmappings
246
self._tagspy = tagspy
247
if tagfields is None:
248
self._tagfields = None
250
self._tagfields = set(tagfields)
251
self._facetspy = facetspy
252
self._facetfields = facetfields
253
self._facethierarchy = facethierarchy
254
self._facetassocs = facetassocs
255
self._numeric_ranges_built = {}
257
def _cluster(self, num_clusters, maxdocs, fields=None):
258
"""Cluster results based on similarity.
260
Note: this method is experimental, and will probably disappear or
261
change in the future.
263
The number of clusters is specified by num_clusters: unless there are
264
too few results, there will be exaclty this number of clusters in the
268
clusterer = _xapian.ClusterSingleLink()
269
xapclusters = _xapian.ClusterAssignments()
270
docsim = _xapian.DocSimCosine()
271
source = _xapian.MSetDocumentSource(self._mset, maxdocs)
274
clusterer.cluster(self._conn._index, xapclusters, docsim, source, num_clusters)
276
decider = self._make_expand_decider(fields)
277
clusterer.cluster(self._conn._index, xapclusters, docsim, source, decider, num_clusters)
282
for item in self._mset:
284
clusterid = xapclusters.cluster(docid)
285
if clusterid not in idmap:
286
idmap[clusterid] = newid
288
clusterid = idmap[clusterid]
289
if clusterid not in clusters:
290
clusters[clusterid] = []
291
clusters[clusterid].append(item.rank)
294
def _reorder_by_clusters(self, clusters):
295
"""Reorder the mset based on some clusters.
298
if self.startrank != 0:
299
raise _errors.SearchError("startrank must be zero to reorder by clusters")
304
clusterstarts = dict(((c[0], None) for c in clusters.itervalues()))
305
for i in xrange(self.endrank):
306
if i in clusterstarts:
310
self._mset_order = tophits
311
self._mset_order.extend(nottophits)
313
def _make_expand_decider(self, fields):
314
"""Make an expand decider which accepts only terms in the specified
319
if isinstance(fields, basestring):
323
actions = self._conn._field_actions[field]._actions
326
for action, kwargslist in actions.iteritems():
327
if action == FieldActions.INDEX_FREETEXT:
328
prefix = self._conn._field_mappings.get_prefix(field)
329
prefixes[prefix] = None
330
prefixes['Z' + prefix] = None
331
if action in (FieldActions.INDEX_EXACT,
333
FieldActions.FACET,):
334
prefix = self._conn._field_mappings.get_prefix(field)
335
prefixes[prefix] = None
336
prefix_re = _re.compile('|'.join([_re.escape(x) + '[^A-Z]' for x in prefixes.keys()]))
337
class decider(_xapian.ExpandDecider):
338
def __call__(self, term):
339
return prefix_re.match(term) is not None
342
def _reorder_by_similarity(self, count, maxcount, max_similarity,
344
"""Reorder results based on similarity.
346
The top `count` documents will be chosen such that they are relatively
347
dissimilar. `maxcount` documents will be considered for moving around,
348
and `max_similarity` is a value between 0 and 1 indicating the maximum
349
similarity to the previous document before a document is moved down the
352
Note: this method is experimental, and will probably disappear or
353
change in the future.
356
if self.startrank != 0:
357
raise _errors.SearchError("startrank must be zero to reorder by similiarity")
358
ds = _xapian.DocSimCosine()
359
ds.set_termfreqsource(_xapian.DatabaseTermFreqSource(self._conn._index))
361
if fields is not None:
362
ds.set_expand_decider(self._make_expand_decider(fields))
371
end = min(self.endrank, maxcount)
372
for i in xrange(end):
376
hit = self._mset.get_hit(i)
377
if len(tophits) == 0:
381
# Compare each incoming hit to tophits
383
for tophit in tophits[-1:]:
385
sim = ds.similarity(hit.document, tophit.document)
389
# If it's not similar to an existing hit, add to tophits.
390
if maxsim < max_similarity:
393
nottophits.append(hit)
396
# If we're full of hits, append to the end.
397
if len(tophits) >= count:
399
new_order.append(hit.rank)
400
for hit in nottophits:
401
new_order.append(hit.rank)
405
new_order.append(hit.rank)
406
for hit in nottophits:
407
new_order.append(hit.rank)
408
if end != self.endrank:
409
new_order.extend(range(end, self.endrank))
410
assert len(new_order) == self.endrank
412
self._mset_order = new_order
414
assert new_order == range(self.endrank)
417
return ("<SearchResults(startrank=%d, "
420
"matches_lower_bound=%d, "
421
"matches_upper_bound=%d, "
422
"matches_estimated=%d, "
423
"estimate_is_exact=%s)>" %
428
self.matches_lower_bound,
429
self.matches_upper_bound,
430
self.matches_estimated,
431
self.estimate_is_exact,
434
def _get_more_matches(self):
435
# This check relies on us having asked for at least one more result
436
# than retrieved to be checked.
437
return (self.matches_lower_bound > self.endrank)
438
more_matches = property(_get_more_matches, doc=
439
"""Check whether there are further matches after those in this result set.
443
def _get_startrank(self):
444
return self._mset.get_firstitem()
445
startrank = property(_get_startrank, doc=
446
"""Get the rank of the first item in the search results.
448
This corresponds to the "startrank" parameter passed to the search() method.
452
def _get_endrank(self):
453
return self._mset.get_firstitem() + len(self._mset)
454
endrank = property(_get_endrank, doc=
455
"""Get the rank of the item after the end of the search results.
457
If there are sufficient results in the index, this corresponds to the
458
"endrank" parameter passed to the search() method.
462
def _get_lower_bound(self):
463
return self._mset.get_matches_lower_bound()
464
matches_lower_bound = property(_get_lower_bound, doc=
465
"""Get a lower bound on the total number of matching documents.
469
def _get_upper_bound(self):
470
return self._mset.get_matches_upper_bound()
471
matches_upper_bound = property(_get_upper_bound, doc=
472
"""Get an upper bound on the total number of matching documents.
476
def _get_human_readable_estimate(self):
477
lower = self._mset.get_matches_lower_bound()
478
upper = self._mset.get_matches_upper_bound()
479
est = self._mset.get_matches_estimated()
480
return _get_significant_digits(est, lower, upper)
481
matches_human_readable_estimate = property(_get_human_readable_estimate,
483
"""Get a human readable estimate of the number of matching documents.
485
This consists of the value returned by the "matches_estimated" property,
486
rounded to an appropriate number of significant digits (as determined by
487
the values of the "matches_lower_bound" and "matches_upper_bound"
492
def _get_estimated(self):
493
return self._mset.get_matches_estimated()
494
matches_estimated = property(_get_estimated, doc=
495
"""Get an estimate for the total number of matching documents.
499
def _estimate_is_exact(self):
500
return self._mset.get_matches_lower_bound() == \
501
self._mset.get_matches_upper_bound()
502
estimate_is_exact = property(_estimate_is_exact, doc=
503
"""Check whether the estimated number of matching documents is exact.
505
If this returns true, the estimate given by the `matches_estimated`
506
property is guaranteed to be correct.
508
If this returns false, it is possible that the actual number of matching
509
documents is different from the number given by the `matches_estimated`
514
def get_hit(self, index):
515
"""Get the hit with a given index.
518
if self._mset_order is None:
519
msetitem = self._mset.get_hit(index)
521
msetitem = self._mset.get_hit(self._mset_order[index])
522
return SearchResult(msetitem, self)
523
__getitem__ = get_hit
526
"""Get an iterator over the hits in the search result.
528
The iterator returns the results in increasing order of rank.
531
return SearchResultIter(self, self._mset_order)
534
"""Get the number of hits in the search result.
536
Note that this is not (usually) the number of matching documents for
537
the search. If startrank is non-zero, it's not even the rank of the
538
last document in the search result. It's simply the number of hits
539
stored in the search result.
541
It is, however, the number of items returned by the iterator produced
542
by calling iter() on this SearchResults object.
545
return len(self._mset)
547
def get_top_tags(self, field, maxtags):
548
"""Get the most frequent tags in a given field.
550
- `field` - the field to get tags for. This must have been specified
551
in the "gettags" argument of the search() call.
552
- `maxtags` - the maximum number of tags to return.
554
Returns a sequence of 2-item tuples, in which the first item in the
555
tuple is the tag, and the second is the frequency of the tag in the
556
matches seen (as an integer).
559
if 'tags' in _checkxapian.missing_features:
560
raise errors.SearchError("Tags unsupported with this release of xapian")
561
if self._tagspy is None or field not in self._tagfields:
562
raise _errors.SearchError("Field %r was not specified for getting tags" % field)
563
prefix = self._conn._field_mappings.get_prefix(field)
564
return self._tagspy.get_top_terms(prefix, maxtags)
566
def get_suggested_facets(self, maxfacets=5, desired_num_of_categories=7,
567
required_facets=None):
568
"""Get a suggested set of facets, to present to the user.
570
This returns a list, in descending order of the usefulness of the
571
facet, in which each item is a tuple holding:
573
- fieldname of facet.
574
- sequence of 2-tuples holding the suggested values or ranges for that
577
For facets of type 'string', the first item in the 2-tuple will
578
simply be the string supplied when the facet value was added to its
579
document. For facets of type 'float', it will be a 2-tuple, holding
580
floats giving the start and end of the suggested value range.
582
The second item in the 2-tuple will be the frequency of the facet
583
value or range in the result set.
585
If required_facets is not None, it must be a field name, or a sequence
586
of field names. Any field names mentioned in required_facets will be
587
returned if there are any facet values at all in the search results for
588
that field. The facet will only be omitted if there are no facet
589
values at all for the field.
591
The value of maxfacets will be respected as far as possible; the
592
exception is that if there are too many fields listed in
593
required_facets with at least one value in the search results, extra
594
facets will be returned (ie, obeying the required_facets parameter is
595
considered more important than the maxfacets parameter).
597
If facet_hierarchy was indicated when search() was called, and the
598
query included facets, then only subfacets of those query facets and
599
top-level facets will be included in the returned list. Furthermore
600
top-level facets will only be returned if there are remaining places
601
in the list after it has been filled with subfacets. Note that
602
required_facets is still respected regardless of the facet hierarchy.
604
If a query type was specified when search() was called, and the query
605
included facets, then facets with an association of Never to the
606
query type are never returned, even if mentioned in required_facets.
607
Facets with an association of Preferred are listed before others in
611
if 'facets' in _checkxapian.missing_features:
612
raise errors.SearchError("Facets unsupported with this release of xapian")
613
if self._facetspy is None:
614
raise _errors.SearchError("Facet selection wasn't enabled when the search was run")
615
if isinstance(required_facets, basestring):
616
required_facets = [required_facets]
619
for field, slot, kwargslist in self._facetfields:
621
for kwargs in kwargslist:
622
type = kwargs.get('type', None)
623
if type is not None: break
624
if type is None: type = 'string'
627
if field not in self._numeric_ranges_built:
628
self._facetspy.build_numeric_ranges(slot, desired_num_of_categories)
629
self._numeric_ranges_built[field] = None
630
facettypes[field] = type
631
score = self._facetspy.score_categorisation(slot, desired_num_of_categories)
632
scores.append((score, field, slot))
634
# Sort on whether facet is top-level ahead of score (use subfacets first),
635
# and on whether facet is preferred for the query type ahead of anything else
636
if self._facethierarchy:
637
# Note, tuple[-2] is the value of 'field' in a scores tuple
638
scores = [(tuple[-2] not in self._facethierarchy,) + tuple for tuple in scores]
639
if self._facetassocs:
640
preferred = _indexerconnection.IndexerConnection.FacetQueryType_Preferred
641
scores = [(self._facetassocs.get(tuple[-2]) != preferred,) + tuple for tuple in scores]
643
if self._facethierarchy:
647
if self._facetassocs:
650
scores = [tuple[index:] for tuple in scores]
653
required_results = []
654
for score, field, slot in scores:
655
# Check if the facet is required
657
if required_facets is not None:
658
required = field in required_facets
660
# If we've got enough facets, and the field isn't required, skip it
661
if not required and len(results) + len(required_results) >= maxfacets:
665
values = self._facetspy.get_values_as_dict(slot)
666
if field in self._numeric_ranges_built:
670
# Required facets must occur at least once, other facets must occur
680
if facettypes[field] == 'float':
681
# Convert numbers to python numbers, and number ranges to a
682
# python tuple of two numbers.
683
for value, frequency in values.iteritems():
685
value1 = _log(_xapian.sortable_unserialise, value)
688
value1 = _log(_xapian.sortable_unserialise, value[:9])
689
value2 = _log(_xapian.sortable_unserialise, value[9:])
690
newvalues.append(((value1, value2), frequency))
692
for value, frequency in values.iteritems():
693
newvalues.append((value, frequency))
697
required_results.append((score, field, newvalues))
699
results.append((score, field, newvalues))
701
# Throw away any excess results if we have more required_results to
703
maxfacets = maxfacets - len(required_results)
705
results = required_results
707
results = results[:maxfacets]
708
results.extend(required_results)
711
# Throw away the scores because they're not meaningful outside this
713
results = [(field, newvalues) for (score, field, newvalues) in results]
717
class SearchConnection(object):
718
"""A connection to the search engine for searching.
720
The connection will access a view of the database.
723
_qp_flags_base = _xapian.QueryParser.FLAG_LOVEHATE
724
_qp_flags_phrase = _xapian.QueryParser.FLAG_PHRASE
725
_qp_flags_synonym = (_xapian.QueryParser.FLAG_AUTO_SYNONYMS |
726
_xapian.QueryParser.FLAG_AUTO_MULTIWORD_SYNONYMS)
727
_qp_flags_bool = _xapian.QueryParser.FLAG_BOOLEAN
731
def __init__(self, indexpath):
732
"""Create a new connection to the index for searching.
734
There may only an arbitrary number of search connections for a
735
particular database open at a given time (regardless of whether there
736
is a connection for indexing open as well).
738
If the database doesn't exist, an exception will be raised.
741
self._index = _log(_xapian.Database, indexpath)
742
self._indexpath = indexpath
747
self._close_handlers = []
752
def append_close_handler(self, handler, userdata=None):
753
"""Append a callback to the list of close handlers.
755
These will be called when the SearchConnection is closed. This happens
756
when the close() method is called, or when the SearchConnection object
757
is deleted. The callback will be passed two arguments: the path to the
758
SearchConnection object, and the userdata supplied to this method.
760
The handlers will be called in the order in which they were added.
762
The handlers will be called after the connection has been closed, so
763
cannot prevent it closing: their return value will be ignored. In
764
addition, they should not raise any exceptions.
767
self._close_handlers.append((handler, userdata))
769
def _get_sort_type(self, field):
770
"""Get the sort type that should be used for a given field.
774
actions = self._field_actions[field]._actions
777
for action, kwargslist in actions.iteritems():
778
if action == FieldActions.SORT_AND_COLLAPSE:
779
for kwargs in kwargslist:
780
return kwargs['type']
782
def _load_config(self):
783
"""Load the configuration for the database.
786
# Note: this code is basically duplicated in the IndexerConnection
787
# class. Move it to a shared location.
788
assert self._index is not None
790
config_str = _log(self._index.get_metadata, '_xappy_config')
791
if len(config_str) == 0:
792
self._field_actions = {}
793
self._field_mappings = _fieldmappings.FieldMappings()
794
self._facet_hierarchy = {}
795
self._facet_query_table = {}
799
(self._field_actions, mappings, self._facet_hierarchy, self._facet_query_table, self._next_docid) = _cPickle.loads(config_str)
801
# Backwards compatibility - configuration used to lack _facet_hierarchy and _facet_query_table
802
(self._field_actions, mappings, self._next_docid) = _cPickle.loads(config_str)
803
self._facet_hierarchy = {}
804
self._facet_query_table = {}
805
self._field_mappings = _fieldmappings.FieldMappings(mappings)
808
"""Reopen the connection.
810
This updates the revision of the index which the connection references
811
to the latest flushed revision.
814
if self._index is None:
815
raise _errors.SearchError("SearchConnection has been closed")
817
# Re-read the actions.
821
"""Close the connection to the database.
823
It is important to call this method before allowing the class to be
824
garbage collected to ensure that the connection is cleaned up promptly.
826
No other methods may be called on the connection after this has been
827
called. (It is permissible to call close() multiple times, but
828
only the first call will have any effect.)
830
If an exception occurs, the database will be closed, but changes since
831
the last call to flush may be lost.
834
if self._index is None:
837
# Remember the index path
838
indexpath = self._indexpath
840
# There is currently no "close()" method for xapian databases, so
841
# we have to rely on the garbage collector. Since we never copy
842
# the _index property out of this class, there should be no cycles,
843
# so the standard python implementation should garbage collect
844
# _index straight away. A close() method is planned to be added to
845
# xapian at some point - when it is, we should call it here to make
846
# the code more robust.
848
self._indexpath = None
849
self._field_actions = None
850
self._field_mappings = None
852
# Call the close handlers.
853
for handler, userdata in self._close_handlers:
855
handler(indexpath, userdata)
857
import sys, traceback
858
print >>sys.stderr, "WARNING: unhandled exception in handler called by SearchConnection.close(): %s" % traceback.format_exception_only(type(e), e)
860
def get_doccount(self):
861
"""Count the number of documents in the database.
863
This count will include documents which have been added or removed but
867
if self._index is None:
868
raise _errors.SearchError("SearchConnection has been closed")
869
return self._index.get_doccount()
871
OP_AND = _xapian.Query.OP_AND
872
OP_OR = _xapian.Query.OP_OR
873
def query_composite(self, operator, queries):
874
"""Build a composite query from a list of queries.
876
The queries are combined with the supplied operator, which is either
877
SearchConnection.OP_AND or SearchConnection.OP_OR.
880
if self._index is None:
881
raise _errors.SearchError("SearchConnection has been closed")
882
return _log(_xapian.Query, operator, list(queries))
884
def query_multweight(self, query, multiplier):
885
"""Build a query which modifies the weights of a subquery.
887
This produces a query which returns the same documents as the subquery,
888
and in the same order, but with the weights assigned to each document
889
multiplied by the value of "multiplier". "multiplier" may be any floating
890
point value, but negative values will be clipped to 0, since Xapian
891
doesn't support negative weights.
893
This can be useful when producing queries to be combined with
894
query_composite, because it allows the relative importance of parts of
895
the query to be adjusted.
898
return _log(_xapian.Query, _xapian.Query.OP_SCALE_WEIGHT, query, multiplier)
900
def query_filter(self, query, filter, exclude=False):
901
"""Filter a query with another query.
903
If exclude is False (or not specified), documents will only match the
904
resulting query if they match the both the first and second query: the
905
results of the first query are "filtered" to only include those which
906
also match the second query.
908
If exclude is True, documents will only match the resulting query if
909
they match the first query, but not the second query: the results of
910
the first query are "filtered" to only include those which do not match
913
Documents will always be weighted according to only the first query.
915
- `query`: The query to filter.
916
- `filter`: The filter to apply to the query.
917
- `exclude`: If True, the sense of the filter is reversed - only
918
documents which do not match the second query will be returned.
921
if self._index is None:
922
raise _errors.SearchError("SearchConnection has been closed")
923
if not isinstance(filter, _xapian.Query):
924
raise _errors.SearchError("Filter must be a Xapian Query object")
926
return _log(_xapian.Query, _xapian.Query.OP_AND_NOT, query, filter)
928
return _log(_xapian.Query, _xapian.Query.OP_FILTER, query, filter)
930
def query_adjust(self, primary, secondary):
931
"""Adjust the weights of one query with a secondary query.
933
Documents will be returned from the resulting query if and only if they
934
match the primary query (specified by the "primary" parameter).
935
However, the weights (and hence, the relevance rankings) of the
936
documents will be adjusted by adding weights from the secondary query
937
(specified by the "secondary" parameter).
940
if self._index is None:
941
raise _errors.SearchError("SearchConnection has been closed")
942
return _log(_xapian.Query, _xapian.Query.OP_AND_MAYBE, primary, secondary)
944
def query_range(self, field, begin, end):
945
"""Create a query for a range search.
947
This creates a query which matches only those documents which have a
948
field value in the specified range.
950
Begin and end must be appropriate values for the field, according to
951
the 'type' parameter supplied to the SORTABLE action for the field.
953
The begin and end values are both inclusive - any documents with a
954
value equal to begin or end will be returned (unless end is less than
955
begin, in which case no documents will be returned).
957
Begin or end may be set to None in order to create an open-ended
958
range. (They may also both be set to None, which will generate a query
959
which matches all documents containing any value for the field.)
962
if self._index is None:
963
raise _errors.SearchError("SearchConnection has been closed")
965
if begin is None and end is None:
966
# Return a "match everything" query
967
return _log(_xapian.Query, '')
970
slot = self._field_mappings.get_slot(field, 'collsort')
972
# Return a "match nothing" query
973
return _log(_xapian.Query)
975
sorttype = self._get_sort_type(field)
976
marshaller = SortableMarshaller(False)
977
fn = marshaller.get_marshall_function(field, sorttype)
979
if begin is not None:
980
begin = fn(field, begin)
985
return _log(_xapian.Query, _xapian.Query.OP_VALUE_LE, slot, end)
988
return _log(_xapian.Query, _xapian.Query.OP_VALUE_GE, slot, begin)
990
return _log(_xapian.Query, _xapian.Query.OP_VALUE_RANGE, slot, begin, end)
992
def query_facet(self, field, val):
    """Create a query for a facet value.

    This creates a query which matches only those documents which have a
    facet value in the specified range.

    For a numeric range facet, val should be a tuple holding the start and
    end of the range, or a comma separated string holding two floating
    point values.  For other facets, val should be the value to look
    for.

    The start and end values are both inclusive - any documents with a
    value equal to start or end will be returned (unless end is less than
    start, in which case no documents will be returned).

    """
    if self._index is None:
        raise _errors.SearchError("SearchConnection has been closed")
    if 'facets' in _checkxapian.missing_features:
        # BUGFIX: was `errors.SearchError` - the module is imported as
        # `_errors`, so the bare name raised a NameError at runtime.
        raise _errors.SearchError("Facets unsupported with this release of xapian")

    try:
        actions = self._field_actions[field]._actions
    except KeyError:
        actions = {}

    # Find the facet type ('float' or 'string') declared for this field,
    # if any.
    facettype = None
    for action, kwargslist in actions.iteritems():
        if action == FieldActions.FACET:
            for kwargs in kwargslist:
                facettype = kwargs.get('type', None)
                if facettype is not None:
                    break
        if facettype is not None:
            break

    if facettype == 'float':
        if isinstance(val, basestring):
            val = [float(v) for v in val.split(',', 2)]
        assert(len(val) == 2)
        try:
            slot = self._field_mappings.get_slot(field, 'facet')
        except KeyError:
            # Field has no facet slot: return a "match nothing" query.
            return _log(_xapian.Query)
        # FIXME - check that sorttype == self._get_sort_type(field)
        sorttype = 'float'
        marshaller = SortableMarshaller(False)
        fn = marshaller.get_marshall_function(field, sorttype)
        begin = fn(field, val[0])
        end = fn(field, val[1])
        return _log(_xapian.Query, _xapian.Query.OP_VALUE_RANGE, slot, begin, end)

    # String facets are stored as lowercased prefixed terms.
    assert(facettype == 'string' or facettype is None)
    prefix = self._field_mappings.get_prefix(field)
    return _log(_xapian.Query, prefix + val.lower())
1048
def _prepare_queryparser(self, allow, deny, default_op, default_allow,
                         default_deny):
    """Prepare (and return) a query parser using the specified fields and
    operators.

    - `allow`/`deny`: mutually exclusive lists (or single strings)
      selecting the fields usable with a "field:" specifier.
    - `default_op`: operator used to combine parsed terms.
    - `default_allow`/`default_deny`: mutually exclusive lists (or single
      strings) selecting the fields searched for unprefixed terms.

    """
    if self._index is None:
        raise _errors.SearchError("SearchConnection has been closed")

    # Normalise the string/empty forms of the field lists to tuples/None.
    if isinstance(allow, basestring):
        allow = (allow, )
    if isinstance(deny, basestring):
        deny = (deny, )
    if allow is not None and len(allow) == 0:
        allow = None
    if deny is not None and len(deny) == 0:
        deny = None
    if allow is not None and deny is not None:
        raise _errors.SearchError("Cannot specify both `allow` and `deny` "
                                  "(got %r and %r)" % (allow, deny))

    if isinstance(default_allow, basestring):
        default_allow = (default_allow, )
    if isinstance(default_deny, basestring):
        default_deny = (default_deny, )
    if default_allow is not None and len(default_allow) == 0:
        default_allow = None
    if default_deny is not None and len(default_deny) == 0:
        default_deny = None
    if default_allow is not None and default_deny is not None:
        raise _errors.SearchError("Cannot specify both `default_allow` and `default_deny` "
                                  "(got %r and %r)" % (default_allow, default_deny))

    qp = _log(_xapian.QueryParser)
    qp.set_database(self._index)
    qp.set_default_op(default_op)

    if allow is None:
        allow = [key for key in self._field_actions]
        if deny is not None:
            allow = [key for key in allow if key not in deny]

    for field in allow:
        try:
            actions = self._field_actions[field]._actions
        except KeyError:
            # Field not in the database configuration: ignore it.
            continue
        for action, kwargslist in actions.iteritems():
            if action == FieldActions.INDEX_EXACT:
                # FIXME - need patched version of xapian to add exact prefixes
                #qp.add_exact_prefix(field, self._field_mappings.get_prefix(field))
                qp.add_prefix(field, self._field_mappings.get_prefix(field))
            if action == FieldActions.INDEX_FREETEXT:
                # NOTE(review): initialised to True and combined with `or`,
                # so this is always True and the `continue` below is
                # unreachable - probably intended `and`; behaviour
                # preserved as-is.
                allow_field_specific = True
                for kwargs in kwargslist:
                    allow_field_specific = allow_field_specific or kwargs.get('allow_field_specific', True)
                if not allow_field_specific:
                    continue
                qp.add_prefix(field, self._field_mappings.get_prefix(field))
                for kwargs in kwargslist:
                    try:
                        lang = kwargs['language']
                        my_stemmer = _log(_xapian.Stem, lang)
                        # Keep a Python-side reference so the stemmer lives
                        # as long as the query parser does.
                        qp.my_stemmer = my_stemmer
                        qp.set_stemmer(my_stemmer)
                        qp.set_stemming_strategy(qp.STEM_SOME)
                    except KeyError:
                        pass

    if default_allow is not None or default_deny is not None:
        if default_allow is None:
            default_allow = [key for key in self._field_actions]
        if default_deny is not None:
            default_allow = [key for key in default_allow if key not in default_deny]
        for field in default_allow:
            try:
                actions = self._field_actions[field]._actions
            except KeyError:
                # Field not in the database configuration: ignore it.
                continue
            for action, kwargslist in actions.iteritems():
                if action == FieldActions.INDEX_FREETEXT:
                    # Map unprefixed terms onto this field's prefix.
                    qp.add_prefix('', self._field_mappings.get_prefix(field))
                    # FIXME - set stemming options for the default prefix

    return qp
1134
def _query_parse_with_prefix(self, qp, string, flags, prefix):
1135
"""Parse a query, with an optional prefix.
1139
return qp.parse_query(string, flags)
1141
return qp.parse_query(string, flags, prefix)
1143
def _query_parse_with_fallback(self, qp, string, prefix=None):
1144
"""Parse a query with various flags.
1146
If the initial boolean pass fails, fall back to not using boolean
1151
q1 = self._query_parse_with_prefix(qp, string,
1152
self._qp_flags_base |
1153
self._qp_flags_phrase |
1154
self._qp_flags_synonym |
1155
self._qp_flags_bool,
1157
except _xapian.QueryParserError, e:
1158
# If we got a parse error, retry without boolean operators (since
1159
# these are the usual cause of the parse error).
1160
q1 = self._query_parse_with_prefix(qp, string,
1161
self._qp_flags_base |
1162
self._qp_flags_phrase |
1163
self._qp_flags_synonym,
1166
qp.set_stemming_strategy(qp.STEM_NONE)
1168
q2 = self._query_parse_with_prefix(qp, string,
1169
self._qp_flags_base |
1170
self._qp_flags_bool,
1172
except _xapian.QueryParserError, e:
1173
# If we got a parse error, retry without boolean operators (since
1174
# these are the usual cause of the parse error).
1175
q2 = self._query_parse_with_prefix(qp, string,
1176
self._qp_flags_base,
1179
return _log(_xapian.Query, _xapian.Query.OP_AND_MAYBE, q1, q2)
1181
def query_parse(self, string, allow=None, deny=None, default_op=OP_AND,
                default_allow=None, default_deny=None):
    """Parse a query string.

    This is intended for parsing queries entered by a user.  If you wish to
    combine structured queries, it is generally better to use the other
    query building methods, such as `query_composite` (though you may wish
    to create parts of the query to combine with such methods with this
    method).

    The string passed to this method can have various operators in it.  In
    particular, it may contain field specifiers (ie, field names, followed
    by a colon, followed by some text to search for in that field).  For
    example, if "author" is a field in the database, the search string
    could contain "author:richard", and this would be interpreted as
    "search for richard in the author field".  By default, any fields in
    the database which are indexed with INDEX_EXACT or INDEX_FREETEXT will
    be available for field specific searching in this way - however, this
    can be modified using the "allow" or "deny" parameters, and also by the
    allow_field_specific tag on INDEX_FREETEXT fields.

    Any text which isn't prefixed by a field specifier is used to search
    the "default set" of fields.  By default, this is the full set of
    fields in the database which are indexed with INDEX_FREETEXT and for
    which the search_by_default flag set (ie, if the text is found in any
    of those fields, the query will match).  However, this may be modified
    with the "default_allow" and "default_deny" parameters.  (Note that
    fields which are indexed with INDEX_EXACT aren't allowed to be used in
    the default list of fields.)

    - `string`: The string to parse.
    - `allow`: A list of fields to allow in the query.
    - `deny`: A list of fields not to allow in the query.
    - `default_op`: The default operator to combine query terms with.
    - `default_allow`: A list of fields to search for by default.
    - `default_deny`: A list of fields not to search for by default.

    Only one of `allow` and `deny` may be specified.

    Only one of `default_allow` and `default_deny` may be specified.

    If any of the entries in `allow` are not present in the configuration
    for the database, or are not specified for indexing (either as
    INDEX_EXACT or INDEX_FREETEXT), they will be ignored.  If any of the
    entries in `deny` are not present in the configuration for the
    database, they will be ignored.

    Returns a Query object, which may be passed to the search() method, or
    combined with other queries.

    """
    qp = self._prepare_queryparser(allow, deny, default_op, default_allow,
                                   default_deny)
    return self._query_parse_with_fallback(qp, string)
1236
def query_field(self, field, value, default_op=OP_AND):
    """A query for a single field.

    For exact/tag/facet fields the value is looked up as a single prefixed
    term; for freetext fields the value is parsed as a query restricted to
    the field's prefix.

    """
    if self._index is None:
        raise _errors.SearchError("SearchConnection has been closed")
    try:
        actions = self._field_actions[field]._actions
    except KeyError:
        # Unknown field: fall through to the "match nothing" query below.
        actions = {}

    # need to check on field type, and stem / split as appropriate
    for action, kwargslist in actions.iteritems():
        if action in (FieldActions.INDEX_EXACT,
                      FieldActions.TAG,
                      FieldActions.FACET,):
            prefix = self._field_mappings.get_prefix(field)
            if len(value) > 0:
                # If the first character of the value is uppercase, the
                # term must be separated from the prefix with a colon, per
                # xapian term-prefix conventions.
                chval = ord(value[0])
                if chval >= ord('A') and chval <= ord('Z'):
                    prefix = prefix + ':'
            return _log(_xapian.Query, prefix + value)
        if action == FieldActions.INDEX_FREETEXT:
            qp = _log(_xapian.QueryParser)
            qp.set_default_op(default_op)
            prefix = self._field_mappings.get_prefix(field)
            for kwargs in kwargslist:
                try:
                    lang = kwargs['language']
                    qp.set_stemmer(_log(_xapian.Stem, lang))
                    qp.set_stemming_strategy(qp.STEM_SOME)
                except KeyError:
                    pass
            return self._query_parse_with_fallback(qp, value, prefix)

    # Field is not searchable: return a "match nothing" query.
    return _log(_xapian.Query)
1273
def query_similar(self, ids, allow=None, deny=None, simterms=10):
    """Get a query which returns documents which are similar to others.

    The list of document IDs to base the similarity search on is given in
    `ids`.  This should be an iterable, holding a list of strings.  If
    any of the supplied IDs cannot be found in the database, they will be
    ignored.  (If no IDs can be found in the database, the resulting query
    will not match any documents.)

    By default, all fields which have been indexed for freetext searching
    will be used for the similarity calculation.  The list of fields used
    for this can be customised using the `allow` and `deny` parameters
    (only one of which may be specified):

    - `allow`: A list of fields to base the similarity calculation on.
    - `deny`: A list of fields not to base the similarity calculation on.
    - `simterms`: Number of terms to use for the similarity calculation.

    For convenience, any of `ids`, `allow`, or `deny` may be strings, which
    will be treated the same as a list of length 1.

    Regardless of the setting of `allow` and `deny`, only fields which have
    been indexed for freetext searching will be used for the similarity
    measure - all other fields will always be ignored for this purpose.

    """
    eterms, prefixes = self._get_eterms(ids, allow, deny, simterms)

    # Use the "elite set" operator, which chooses the terms with the
    # highest query weight to use.
    q = _log(_xapian.Query, _xapian.Query.OP_ELITE_SET, eterms, simterms)
    return q
1306
def significant_terms(self, ids, maxterms=10, allow=None, deny=None):
    """Get a set of "significant" terms for a document, or documents.

    This has a similar interface to query_similar(): it takes a list of
    ids, and an optional specification of a set of fields to consider.
    Instead of returning a query, it returns a list of terms from the
    document (or documents), which appear "significant".  Roughly,
    in this situation significant means that the terms occur more
    frequently in the specified document than in the rest of the corpus.

    The list is in decreasing order of "significance".

    By default, all terms related to fields which have been indexed for
    freetext searching will be considered for the list of significant
    terms.  The list of fields used for this can be customised using the
    `allow` and `deny` parameters (only one of which may be specified):

    - `allow`: A list of fields to consider.
    - `deny`: A list of fields not to consider.

    For convenience, any of `ids`, `allow`, or `deny` may be strings, which
    will be treated the same as a list of length 1.

    Regardless of the setting of `allow` and `deny`, only fields which have
    been indexed for freetext searching will be considered - all other
    fields will always be ignored for this purpose.

    The maximum number of terms to return may be specified by the maxterms
    parameter.

    """
    eterms, prefixes = self._get_eterms(ids, allow, deny, maxterms)
    terms = []
    for term in eterms:
        # Split each term into its (uppercase) prefix and the raw value,
        # then map the prefix back to its fieldname.
        pos = 0
        for char in term:
            if not char.isupper():
                break
            pos += 1
        field = prefixes[term[:pos]]
        value = term[pos:]
        terms.append((field, value))
    return terms
1350
def _get_eterms(self, ids, allow, deny, simterms):
1351
"""Get a set of terms for an expand
1354
if self._index is None:
1355
raise _errors.SearchError("SearchConnection has been closed")
1356
if allow is not None and deny is not None:
1357
raise _errors.SearchError("Cannot specify both `allow` and `deny`")
1359
if isinstance(ids, basestring):
1361
if isinstance(allow, basestring):
1363
if isinstance(deny, basestring):
1366
# Set "allow" to contain a list of all the fields to use.
1368
allow = [key for key in self._field_actions]
1369
if deny is not None:
1370
allow = [key for key in allow if key not in deny]
1372
# Set "prefixes" to contain a list of all the prefixes to use.
1376
actions = self._field_actions[field]._actions
1379
for action, kwargslist in actions.iteritems():
1380
if action == FieldActions.INDEX_FREETEXT:
1381
prefixes[self._field_mappings.get_prefix(field)] = field
1383
# Repeat the expand until we don't get a DatabaseModifiedError
1386
eterms = self._perform_expand(ids, prefixes, simterms)
1388
except _xapian.DatabaseModifiedError, e:
1390
return eterms, prefixes
1392
class ExpandDecider(_xapian.ExpandDecider):
    """Expand decider which accepts only terms whose prefix is in a
    given set of prefixes.

    Used to restrict similarity expands to terms from the wanted fields.
    """
    def __init__(self, prefixes):
        _xapian.ExpandDecider.__init__(self)
        self._prefixes = prefixes

    def __call__(self, term):
        """Return True iff the term's (uppercase) prefix is accepted."""
        pos = 0
        for char in term:
            if not char.isupper():
                break
            pos += 1
        if term[:pos] in self._prefixes:
            return True
        return False
1407
def _perform_expand(self, ids, prefixes, simterms):
    """Perform an expand operation to get the terms for a similarity
    search, given a set of ids (and a set of prefixes to restrict the
    similarity operation to).

    """
    # Set idquery to be a query which returns the documents listed in
    # "ids".
    idquery = _log(_xapian.Query, _xapian.Query.OP_OR, ['Q' + id for id in ids])

    enq = _log(_xapian.Enquire, self._index)
    enq.set_query(idquery)
    rset = _log(_xapian.RSet)
    for id in ids:
        pl = self._index.postlist('Q' + id)
        try:
            xapid = pl.next()
            rset.add_document(xapid.docid)
        except StopIteration:
            # ID not present in the database: ignore it.
            pass

    expanddecider = _log(self.ExpandDecider, prefixes)
    eset = enq.get_eset(simterms, rset, 0, 1.0, expanddecider)
    return [term.term for term in eset]
1432
def query_all(self):
    """A query which matches all the documents in the database.

    """
    # An empty-string query is xapian's "match everything" query.
    return _log(_xapian.Query, '')
1438
def query_none(self):
    """A query which matches no documents in the database.

    This may be useful as a placeholder in various situations.

    """
    # A default-constructed query is xapian's "match nothing" query.
    return _log(_xapian.Query)
1446
def spell_correct(self, querystr, allow=None, deny=None, default_op=OP_AND,
                  default_allow=None, default_deny=None):
    """Correct a query spelling.

    This returns a version of the query string with any misspelt words
    corrected.

    - `allow`: A list of fields to allow in the query.
    - `deny`: A list of fields not to allow in the query.
    - `default_op`: The default operator to combine query terms with.
    - `default_allow`: A list of fields to search for by default.
    - `default_deny`: A list of fields not to search for by default.

    Only one of `allow` and `deny` may be specified.

    Only one of `default_allow` and `default_deny` may be specified.

    If any of the entries in `allow` are not present in the configuration
    for the database, or are not specified for indexing (either as
    INDEX_EXACT or INDEX_FREETEXT), they will be ignored.  If any of the
    entries in `deny` are not present in the configuration for the
    database, they will be ignored.

    Note that it is possible that the resulting spell-corrected query will
    still match no documents - the user should usually check that some
    documents are matched by the corrected query before suggesting it to
    users.

    """
    qp = self._prepare_queryparser(allow, deny, default_op, default_allow,
                                   default_deny)
    try:
        qp.parse_query(querystr,
                       self._qp_flags_base |
                       self._qp_flags_phrase |
                       self._qp_flags_synonym |
                       self._qp_flags_bool |
                       qp.FLAG_SPELLING_CORRECTION)
    except _xapian.QueryParserError:
        # Retry without boolean operators (the usual cause of parse
        # errors); we only need the side effect of spelling correction.
        qp.parse_query(querystr,
                       self._qp_flags_base |
                       self._qp_flags_phrase |
                       self._qp_flags_synonym |
                       qp.FLAG_SPELLING_CORRECTION)
    corrected = qp.get_corrected_query_string()
    if len(corrected) == 0:
        # No correction was needed; return the input unchanged.
        if isinstance(querystr, unicode):
            # Encode as UTF-8 for consistency - this happens automatically
            # to values passed to Xapian.
            return querystr.encode('utf-8')
        return querystr
    return corrected
1499
def can_collapse_on(self, field):
    """Check if this database supports collapsing on a specified field.

    Returns True if the field was indexed for collapsing/sorting,
    False otherwise.

    """
    if self._index is None:
        raise _errors.SearchError("SearchConnection has been closed")
    try:
        self._field_mappings.get_slot(field, 'collsort')
    except KeyError:
        return False
    return True
1511
def can_sort_on(self, field):
    """Check if this database supports sorting on a specified field.

    Returns True if the field was indexed for collapsing/sorting,
    False otherwise.

    """
    if self._index is None:
        raise _errors.SearchError("SearchConnection has been closed")
    try:
        self._field_mappings.get_slot(field, 'collsort')
    except KeyError:
        return False
    return True
1523
def _get_prefix_from_term(self, term):
1524
"""Get the prefix of a term.
1526
Prefixes are any initial capital letters, with the exception that R always
1527
ends a prefix, even if followed by capital letters.
1530
for p in xrange(len(term)):
1531
if term[p].islower():
1533
elif term[p] == 'R':
1537
def _facet_query_never(self, facet, query_type):
1538
"""Check if a facet must never be returned by a particular query type.
1540
Returns True if the facet must never be returned.
1542
Returns False if the facet may be returned - either becuase there is no
1543
entry for the query type, or because the entry is not
1544
FacetQueryType_Never.
1547
if query_type is None:
1549
if query_type not in self._facet_query_table:
1551
if facet not in self._facet_query_table[query_type]:
1553
return self._facet_query_table[query_type][facet] == _indexerconnection.IndexerConnection.FacetQueryType_Never
1555
def search(self, query, startrank, endrank,
1556
checkatleast=0, sortby=None, collapse=None,
1558
getfacets=None, allowfacets=None, denyfacets=None, usesubfacets=None,
1559
percentcutoff=None, weightcutoff=None,
1561
"""Perform a search, for documents matching a query.
1563
- `query` is the query to perform.
1564
- `startrank` is the rank of the start of the range of matching
1565
documents to return (ie, the result with this rank will be returned).
1566
ranks start at 0, which represents the "best" matching document.
1567
- `endrank` is the rank at the end of the range of matching documents
1568
to return. This is exclusive, so the result with this rank will not
1570
- `checkatleast` is the minimum number of results to check for: the
1571
estimate of the total number of matches will always be exact if
1572
the number of matches is less than `checkatleast`. A value of ``-1``
1573
can be specified for the checkatleast parameter - this has the
1574
special meaning of "check all matches", and is equivalent to passing
1575
the result of get_doccount().
1576
- `sortby` is the name of a field to sort by. It may be preceded by a
1577
'+' or a '-' to indicate ascending or descending order
1578
(respectively). If the first character is neither '+' or '-', the
1579
sort will be in ascending order.
1580
- `collapse` is the name of a field to collapse the result documents
1581
on. If this is specified, there will be at most one result in the
1582
result set for each value of the field.
1583
- `gettags` is the name of a field to count tag occurrences in, or a
1584
list of fields to do so.
1585
- `getfacets` is a boolean - if True, the matching documents will be
1586
examined to build up a list of the facet values contained in them.
1587
- `allowfacets` is a list of the fieldnames of facets to consider.
1588
- `denyfacets` is a list of fieldnames of facets which will not be
1590
- `usesubfacets` is a boolean - if True, only top-level facets and
1591
subfacets of facets appearing in the query are considered (taking
1592
precedence over `allowfacets` and `denyfacets`).
1593
- `percentcutoff` is the minimum percentage a result must have to be
1595
- `weightcutoff` is the minimum weight a result must have to be
1597
- `query_type` is a value indicating the type of query being
1598
performed. If not None, the value is used to influence which facets
1599
are be returned by the get_suggested_facets() function. If the
1600
value of `getfacets` is False, it has no effect.
1602
If neither 'allowfacets' or 'denyfacets' is specified, all fields
1603
holding facets will be considered (but see 'usesubfacets').
1606
if self._index is None:
1607
raise _errors.SearchError("SearchConnection has been closed")
1608
if 'facets' in _checkxapian.missing_features:
1609
if getfacets is not None or \
1610
allowfacets is not None or \
1611
denyfacets is not None or \
1612
usesubfacets is not None or \
1613
query_type is not None:
1614
raise errors.SearchError("Facets unsupported with this release of xapian")
1615
if 'tags' in _checkxapian.missing_features:
1616
if gettags is not None:
1617
raise errors.SearchError("Tags unsupported with this release of xapian")
1618
if checkatleast == -1:
1619
checkatleast = self._index.get_doccount()
1621
enq = _log(_xapian.Enquire, self._index)
1622
enq.set_query(query)
1624
if sortby is not None:
1626
if sortby[0] == '-':
1629
elif sortby[0] == '+':
1633
slotnum = self._field_mappings.get_slot(sortby, 'collsort')
1635
raise _errors.SearchError("Field %r was not indexed for sorting" % sortby)
1637
# Note: we invert the "asc" parameter, because xapian treats
1638
# "ascending" as meaning "higher values are better"; in other
1639
# words, it considers "ascending" to mean return results in
1641
enq.set_sort_by_value_then_relevance(slotnum, not asc)
1643
if collapse is not None:
1645
slotnum = self._field_mappings.get_slot(collapse, 'collsort')
1647
raise _errors.SearchError("Field %r was not indexed for collapsing" % collapse)
1648
enq.set_collapse_key(slotnum)
1650
maxitems = max(endrank - startrank, 0)
1651
# Always check for at least one more result, so we can report whether
1652
# there are more matches.
1653
checkatleast = max(checkatleast, endrank + 1)
1655
# Build the matchspy.
1658
# First, add a matchspy for any gettags fields
1659
if isinstance(gettags, basestring):
1660
if len(gettags) != 0:
1663
if gettags is not None and len(gettags) != 0:
1664
tagspy = _log(_xapian.TermCountMatchSpy)
1665
for field in gettags:
1667
prefix = self._field_mappings.get_prefix(field)
1668
tagspy.add_prefix(prefix)
1670
raise _errors.SearchError("Field %r was not indexed for tagging" % field)
1671
matchspies.append(tagspy)
1674
# add a matchspy for facet selection here.
1678
if allowfacets is not None and denyfacets is not None:
1679
raise _errors.SearchError("Cannot specify both `allowfacets` and `denyfacets`")
1680
if allowfacets is None:
1681
allowfacets = [key for key in self._field_actions]
1682
if denyfacets is not None:
1683
allowfacets = [key for key in allowfacets if key not in denyfacets]
1685
# include None in queryfacets so a top-level facet will
1686
# satisfy self._facet_hierarchy.get(field) in queryfacets
1687
# (i.e. always include top-level facets)
1688
queryfacets = set([None])
1690
# add facets used in the query to queryfacets
1691
termsiter = query.get_terms_begin()
1692
termsend = query.get_terms_end()
1693
while termsiter != termsend:
1694
prefix = self._get_prefix_from_term(termsiter.get_term())
1695
field = self._field_mappings.get_fieldname_from_prefix(prefix)
1696
if field and FieldActions.FACET in self._field_actions[field]._actions:
1697
queryfacets.add(field)
1700
for field in allowfacets:
1702
actions = self._field_actions[field]._actions
1705
for action, kwargslist in actions.iteritems():
1706
if action == FieldActions.FACET:
1707
# filter out non-top-level facets that aren't subfacets
1708
# of a facet in the query
1709
if usesubfacets and self._facet_hierarchy.get(field) not in queryfacets:
1711
# filter out facets that should never be returned for the query type
1712
if self._facet_query_never(field, query_type):
1714
slot = self._field_mappings.get_slot(field, 'facet')
1715
if facetspy is None:
1716
facetspy = _log(_xapian.CategorySelectMatchSpy)
1718
for kwargs in kwargslist:
1719
facettype = kwargs.get('type', None)
1720
if facettype is not None:
1722
if facettype is None or facettype == 'string':
1723
facetspy.add_slot(slot, True)
1725
facetspy.add_slot(slot)
1726
facetfields.append((field, slot, kwargslist))
1728
if facetspy is None:
1729
# Set facetspy to False, to distinguish from no facet
1730
# calculation being performed. (This will prevent an
1731
# error being thrown when the list of suggested facets is
1732
# requested - instead, an empty list will be returned.)
1735
matchspies.append(facetspy)
1738
# Finally, build a single matchspy to pass to get_mset().
1739
if len(matchspies) == 0:
1741
elif len(matchspies) == 1:
1742
matchspy = matchspies[0]
1744
matchspy = _log(_xapian.MultipleMatchDecider)
1745
for spy in matchspies:
1746
matchspy.append(spy)
1748
enq.set_docid_order(enq.DONT_CARE)
1750
# Set percentage and weight cutoffs
1751
if percentcutoff is not None or weightcutoff is not None:
1752
if percentcutoff is None:
1754
if weightcutoff is None:
1756
enq.set_cutoff(percentcutoff, weightcutoff)
1758
# Repeat the search until we don't get a DatabaseModifiedError
1761
if matchspy is None:
1762
mset = enq.get_mset(startrank, maxitems, checkatleast)
1764
mset = enq.get_mset(startrank, maxitems, checkatleast,
1765
None, None, matchspy)
1767
except _xapian.DatabaseModifiedError, e:
1769
facet_hierarchy = None
1771
facet_hierarchy = self._facet_hierarchy
1773
return SearchResults(self, enq, query, mset, self._field_mappings,
1774
tagspy, gettags, facetspy, facetfields,
1776
self._facet_query_table.get(query_type))
1779
"""Get an iterator which returns all the ids in the database.
1781
The unqiue_ids are currently returned in binary lexicographical sort
1782
order, but this should not be relied on.
1784
Note that the iterator returned by this method may raise a
1785
xapian.DatabaseModifiedError exception if modifications are committed
1786
to the database while the iteration is in progress. If this happens,
1787
the search connection must be reopened (by calling reopen) and the
1788
iteration restarted.
1791
if self._index is None:
1792
raise _errors.SearchError("SearchConnection has been closed")
1793
return _indexerconnection.PrefixedTermIter('Q', self._index.allterms())
1795
def get_document(self, id):
1796
"""Get the document with the specified unique ID.
1798
Raises a KeyError if there is no such document. Otherwise, it returns
1799
a ProcessedDocument.
1802
if self._index is None:
1803
raise _errors.SearchError("SearchConnection has been closed")
1806
postlist = self._index.postlist('Q' + id)
1808
plitem = postlist.next()
1809
except StopIteration:
1810
# Unique ID not found
1811
raise KeyError('Unique ID %r not found' % id)
1814
raise _errors.IndexerError("Multiple documents " #pragma: no cover
1815
"found with same unique ID")
1816
except StopIteration:
1817
# Only one instance of the unique ID found, as it should be.
1820
result = ProcessedDocument(self._field_mappings)
1822
result._doc = self._index.get_document(plitem.docid)
1824
except _xapian.DatabaseModifiedError, e:
1827
def iter_synonyms(self, prefix=""):
1828
"""Get an iterator over the synonyms.
1830
- `prefix`: if specified, only synonym keys with this prefix will be
1833
The iterator returns 2-tuples, in which the first item is the key (ie,
1834
a 2-tuple holding the term or terms which will be synonym expanded,
1835
followed by the fieldname specified (or None if no fieldname)), and the
1836
second item is a tuple of strings holding the synonyms for the first
1839
These return values are suitable for the dict() builtin, so you can
1842
>>> conn = _indexerconnection.IndexerConnection('foo')
1843
>>> conn.add_synonym('foo', 'bar')
1844
>>> conn.add_synonym('foo bar', 'baz')
1845
>>> conn.add_synonym('foo bar', 'foo baz')
1847
>>> conn = SearchConnection('foo')
1848
>>> dict(conn.iter_synonyms())
1849
{('foo', None): ('bar',), ('foo bar', None): ('baz', 'foo baz')}
1852
if self._index is None:
1853
raise _errors.SearchError("SearchConnection has been closed")
1854
return _indexerconnection.SynonymIter(self._index, self._field_mappings, prefix)
1856
def get_metadata(self, key):
    """Get an item of metadata stored in the connection.

    This returns a value stored by a previous call to
    IndexerConnection.set_metadata.

    If the value is not found, this will return the empty string.

    """
    if self._index is None:
        raise _errors.IndexerError("SearchConnection has been closed")
    if not hasattr(self._index, 'get_metadata'):
        # Metadata support was added to xapian relatively late; guard
        # against older releases.
        raise _errors.IndexerError("Version of xapian in use does not support metadata")
    return _log(self._index.get_metadata, key)
1871
if __name__ == '__main__':
    # Run the module's doctests (e.g. the iter_synonyms example) when
    # executed directly.
    import doctest
    doctest.testmod(sys.modules[__name__])