1
# Copyright (c) 2005 Divmod Inc. See LICENSE file for details.
3
Xapwrap provides an improved interface to the Xapian text indexing
4
library (see http://www.xapian.org/ for more information on
5
Xapian). Xapwrap provides a layered approach offering ample
6
opportunities for customization.
12
from xapwrap import SmartIndex, Document, TextField, SortKey
13
from datetime import date
15
idx = SmartIndex('/tmp/index', True)
16
d1 = Document(TextField('hi there bob'),
17
sortFields = [SortKey('date', date(2004, 1, 1)),
18
SortKey('author', 'Bob'),
19
SortKey('size', 450)])
23
idx = SmartIndex('/tmp/index')
24
print idx.search('there', 'date', sortAscending = True)
31
Important methods for C{ReadOnlyIndex}:
32
__init__(self, *pathnames)
34
configure(self, prefixMap = None, indexValueMap = None)
36
search(self, query, sortKey = None,
37
startingIndex = 0, batchSize = MAX_DOCS_TO_RETURN,
38
sortIndex = None, sortAscending = True,
39
sortByRelevence = False)
41
checkIndex(self, maxID)
42
get_doccount(self)
44
Important methods for C{Index}:
45
(all methods in ReadOnlyIndex)
46
__init__(self, pathname, create)
48
add_document(self, doc)
49
replace_document(self, uid, doc)
50
delete_document(self, uid)
52
C{SmartIndex} and C{SmartReadOnlyIndex} define the same methods as their
55
The primary way to interact with a Xapian index is to use either the
56
C{Index} or C{ReadOnlyIndex} class. In addition to offering read only
57
access without the inconvenience of lock files, C{ReadOnlyIndex} offers
58
the ability to merge several xapian indices into one super index with
59
only a small performance impediment.
61
In addition to C{Index} and C{ReadOnlyIndex}, Xapwrap also offers
62
C{SmartIndex} and C{SmartReadOnlyIndex} classes. These classes
63
automatically store and manage the index value map and the prefix map in
64
the index. There are two caveats to using them however. First, one
65
cannot index documents that have a xapian ID of 1. Secondly, when using
66
C{SmartReadOnlyIndex} to combine multiple indices together, the indices
67
must have consistent value index maps. Indices where all documents have
68
the same index value map are always consistent. The problem only emerges
69
when indices can have different types of documents with different sets
70
of sort keys. More specifically, the problem can only emerge if one
71
indexes documents in such a way that sort keys are added to different
72
indices in different orders.
78
In order to add new data to an index, one asks a C{Index} or
79
C{SmartIndex} instance to index a C{Document} instance. Documents take a
80
sequence of text fields, a sequence of sort keys and a sequence of
81
keywords as constructor arguments. They also take optional universal
82
identifiers and an arbitrary serializable object. The first three
83
sequences can be created using the C{TextField}, C{SortKey}, and
84
C{Keyword} classes defined below. C{TextField} instances contain a chunk
85
of text and an optional name as well as a boolean indicating whether the
86
field is to be prefixed. Prefixed fields are effectively indexed twice:
87
after being indexed normally, each token is indexed again with the field
88
name. This allows the user to perform fielded searches and is primarily
89
useful for small text fields, such as the subject of an email or a list
90
of author names. C{Keyword} instances denote individual prefixed tokens
91
that are indexed with no positional information. C{SortKey} instances
92
denote arbitrary fields that are used for sorting documents. They
93
include a sort field name and the sort key value. Since Xapian only
94
accepts strings as sort keys, sort key values must be flattened into
95
strings before entering the index.
97
Xapwrap defines flattener functions that automatically flatten integer,
98
date, time, and datetime instances into strings that sort properly. You
99
can define your own flatteners for custom data types by using the
100
C{registerFlattener} class method of the C{Document} class.
105
Internal Xapian error conditions should generate normal python
106
exceptions defined in this file that inherit from xapwrap.XapianError.
111
Xapwrap will use twisted's logging facilities if available. In any
112
event, a custom logging function can be supplied by setting xapwrap.log.
117
Xapwrap currently does not support stemming or stop words, although a
124
from sets import Set as set
126
import cPickle, glob, os
128
from document import makePairForWrite, StandardAnalyzer, Document, SortKey, Keyword
129
from document import UNICODE_ENCODING, UNICODE_ERROR_POLICY
132
from atop.tpython import FilesystemLock
135
from os import symlink, readlink, remove as rmlink
139
class FilesystemLock:
142
A real mutex this time. See the non-win32 version for details.
148
def __init__(self, name):
149
#Mutex name cannot contain backslash
150
name = name.replace('\\', '/')
152
self._mutex = win32event.CreateMutex(None, False, name)
154
raise RuntimeError("Failed to create a named mutex")
157
res = win32event.WaitForSingleObject(self._mutex, 0)
158
self.locked = (res != win32event.WAIT_TIMEOUT)
162
#C API ReleaseMutex version is supposed to return something to
163
#tell whether the lock was correctly released or not. The binding
165
win32event.ReleaseMutex(self._mutex)
171
class FilesystemLock:
174
This relies on the filesystem property that creating
175
a symlink is an atomic operation and that it will
176
fail if the symlink already exists. Deleting the
177
symlink will release the lock.
179
@ivar name: The name of the file associated with this lock.
180
@ivar clean: Indicates whether this lock was released cleanly by its
181
last owner. Only meaningful after C{lock} has been called and returns
188
def __init__(self, name):
192
"""Acquire this lock.
195
@return: True if the lock is acquired, false otherwise.
197
@raise: Any exception os.symlink() may raise, other than
201
pid = readlink(self.name)
202
except (OSError, IOError), e:
203
if e.errno != errno.ENOENT:
207
if not hasattr(os, 'kill'):
211
except (OSError, IOError), e:
212
if e.errno != errno.ESRCH:
219
symlink(str(os.getpid()), self.name)
224
"""Release this lock.
226
This deletes the symlink with the given name.
228
@raise: Any exception os.readlink() may raise, or
229
ValueError if the lock is not owned by this process.
231
pid = readlink(self.name)
232
if int(pid) != os.getpid():
233
raise ValueError("Lock %r not owned by this process" % (self.name,))
238
from twisted.python.log import msg as log
244
# Upper bound on the number of bytes indexed between two flushes; the
# running total is compared against it after each indexed document.
# Keeping it bounded limits memory consumption.
MAX_DATA_INDEXED_BETWEEN_FLUSHES = 200000

# Default search batch size, also used as the mset size when counting.
MAX_DOCS_TO_RETURN = 1000000

# Lock file name checked for (and removed when stale) in the index directory.
XAPIAN_LOCK_FILENAME = 'db_lock'
# Name of xapwrap's own lock inside the index directory.
XAPWRAP_LOCK_FILENAME = 'xapian_lock'
253
# Xapian error handling is somewhat weak: all errors trigger either an
254
# IOError, a RuntimeError, or a ValueError. The exception's args
255
# attribute is a singleton tuple containing an explanation
256
# string. Possible errors include 'DatabaseCorruptError: Quartz metafile
257
# /tmp/foo/meta is invalid: magic string not found.' and
258
# 'DatabaseLockError: Unable to acquire database write lock
259
# /tmp/foo/db_lock'. Instead of looking inside exception error strings
260
# everywhere, I made a wrapper for xapian database operations that
261
# catches exceptions and translates them into the more meaningful
262
# exceptions shown below.
264
class XapianError(StandardError):
    """Base class of every exception defined by xapwrap."""


class XapianRuntimeError(XapianError):
    """Translation of Xapian's RuntimeError; base of the runtime errors."""


class XapianLogicError(XapianError):
    """Translation of Xapian's LogicError; base of the logic errors."""


class XapianDatabaseError(XapianError):
    """Translation of Xapian's DatabaseError; base of the database errors."""


class XapianAssertionError(XapianLogicError):
    """Translation of Xapian's AssertionError."""


class InvalidOperationError(XapianLogicError):
    """Translation of Xapian's InvalidOperationError."""


class InvalidArgumentError(XapianLogicError):
    """Translation of Xapian's InvalidArgumentError.

    Also raised by SmartIndex.index for a document whose UID is 1.
    """


class UnimplementedError(XapianLogicError):
    """Translation of Xapian's UnimplementedError."""


class DocNotFoundError(XapianRuntimeError):
    """Translation of Xapian's DocNotFoundError.

    Also raised when a UID lookup matches no document.
    """


class RangeError(XapianRuntimeError):
    """Translation of Xapian's RangeError."""


class InternalError(XapianRuntimeError):
    """Translation of Xapian's InternalError."""


class FeatureUnavalableError(XapianRuntimeError):
    """Feature-unavailable error.

    The spelling of the class name is part of the public interface.
    """


class XapianNetworkError(XapianRuntimeError):
    """Translation of Xapian's NetworkError."""


class NetworkTimeoutError(XapianNetworkError):
    """Translation of Xapian's NetworkTimeoutError."""


class DatabaseCorruptionError(XapianDatabaseError):
    """Translation of Xapian's DatabaseCorruptError."""


class DatabaseCreationError(XapianDatabaseError):
    """Translation of Xapian's DatabaseCreateError."""


class DatabaseOpeningError(XapianDatabaseError):
    """Translation of Xapian's DatabaseOpeningError."""


class DatabaseLockError(XapianDatabaseError):
    """Translation of Xapian's DatabaseLockError.

    Also raised when xapwrap's own lock file cannot be acquired.
    """


class DatabaseModifiedError(XapianDatabaseError):
    """Translation of Xapian's DatabaseModifiedError."""


# these exceptions are not Xapian errors
class UnknownDatabaseError(XapianError):
    """Raised when an error message matches no known Xapian error name."""


class NoIndexValueFound(XapianError):
    """Raised when a requested value name is not in the index value map."""


class InconsistantIndex(XapianError):
    """Raised when several documents carry the same UID term."""


class InconsistantIndexCombination(XapianError):
    """Raised when the states of combined indices cannot be reconciled."""
321
def makeTranslatedMethod(methodName):
322
def translatedMethod(self, *args, **kwargs):
324
return getattr(self.db, methodName)(*args, **kwargs)
325
except (IOError, RuntimeError, ValueError), e:
327
for subString, exceptionClass in self.exceptionStrMap.iteritems():
328
if subString in errorMsg:
329
raise exceptionClass(e)
331
raise UnknownDatabaseError(e)
334
return translatedMethod
336
class ExceptionTranslater:
337
def __init__(self, db):
340
def openIndex(klass, readOnly, *args, **kwargs):
343
assert len(kwargs) == 0
344
# assume all args are db paths
345
db = xapian.Database(args[0])
346
for path in args[1:]:
347
db.add_database(xapian.Database(path))
351
return klass(xapian.WritableDatabase(*args, **kwargs)) # for xapian 1.0+
352
except AttributeError:
353
return klass(xapian.open(*args, **kwargs)) # for xapian 0.9.x
354
except (IOError, RuntimeError, ValueError), e:
356
for subString, exceptionClass in klass.exceptionStrMap.iteritems():
357
if subString in errorMsg:
358
raise exceptionClass(e)
360
raise UnknownDatabaseError(e)
362
raise UnknownDatabaseError(e)
364
openIndex = classmethod(openIndex)
366
# Maps a Xapian error name, as it appears inside the message of the
# IOError/RuntimeError/ValueError raised by the bindings, to the xapwrap
# exception class that should be raised in its place. Lookup is done by
# substring match against the error message.
#
# possible exceptions are taken from the list at
# http://www.xapian.org/docs/apidoc/html/errortypes_8h.html
exceptionStrMap = {
    # exceptions whose names differ between xapwrap and Xapian
    'DatabaseCorruptError': DatabaseCorruptionError,
    'AssertionError': XapianAssertionError,
    'DatabaseCreateError': DatabaseCreationError,
    # Xapian spells this error name "FeatureUnavailableError"; the key
    # must use Xapian's spelling or the substring match can never hit.
    'FeatureUnavailableError': FeatureUnavalableError,

    # exceptions translated with the same name
    'DatabaseLockError': DatabaseLockError,
    'DatabaseOpeningError': DatabaseOpeningError,
    'DatabaseModifiedError': DatabaseModifiedError,
    'DocNotFoundError': DocNotFoundError,
    'InvalidOperationError': InvalidOperationError,
    'InvalidArgumentError': InvalidArgumentError,
    'UnimplementedError': UnimplementedError,
    'NetworkError': XapianNetworkError,
    'NetworkTimeoutError': NetworkTimeoutError,
    'DatabaseError': XapianDatabaseError,
    'InternalError': InternalError,
    'RangeError': RangeError,
    'RuntimeError': XapianRuntimeError,
    'LogicError': XapianLogicError,
}
392
get_doccount = makeTranslatedMethod('get_doccount')
393
add_document = makeTranslatedMethod('add_document')
394
replace_document = makeTranslatedMethod('replace_document')
395
delete_document = makeTranslatedMethod('delete_document')
396
flush = makeTranslatedMethod('flush')
397
term_exists = makeTranslatedMethod('term_exists')
398
reopen = makeTranslatedMethod('reopen')
399
begin_transaction = makeTranslatedMethod('begin_transaction')
400
commit_transaction = makeTranslatedMethod('commit_transaction')
401
cancel_transaction = makeTranslatedMethod('cancel_transaction')
402
get_lastdocid = makeTranslatedMethod('get_lastdocid')
403
get_avlength = makeTranslatedMethod('get_avlength')
404
get_termfreq = makeTranslatedMethod('get_termfreq')
405
get_collection_freq = makeTranslatedMethod('get_collection_freq')
406
get_doclength = makeTranslatedMethod('get_doclength')
407
get_document = makeTranslatedMethod('get_document')
409
postlist_begin = makeTranslatedMethod('postlist_begin')
410
postlist_end = makeTranslatedMethod('postlist_end')
411
termlist_begin = makeTranslatedMethod('termlist_begin')
412
termlist_end = makeTranslatedMethod('termlist_end')
413
positionlist_begin = makeTranslatedMethod('positionlist_begin')
414
positionlist_end = makeTranslatedMethod('positionlist_end')
415
allterms_begin = makeTranslatedMethod('allterms_begin')
416
allterms_end = makeTranslatedMethod('allterms_end')
419
def makeProtectedDBMethod(method, setupDB = True):
420
def protectedMethod(self, *args, **kwargs):
424
return method(self, *args, **kwargs)
425
## # test that this works and doesn't recurse infinitely
426
## except DatabaseModifiedError:
428
## return protectedMethod(self, *args, **kwargs)
429
except XapianError, e:
430
#log("error encountered while performing xapian index operation %s: %s"
431
# % (method.__name__, e))
434
return protectedMethod
437
# there are lots of places below where we write code like:
440
# enq = self.enquire(foo)
441
# mset = enq.get_mset(0, 10)
442
# return mset[0][flimflam]
447
# the purpose of this code is to ensure that no references to enquire
448
# objects or msets will outlive the function call. msets and enquire
449
# objects hold a reference to the xapian db, and thus prevent it from
450
# being properly gc'd. if we fail to delete enq and mset on exception,
451
# then they can be kept around for arbitrarily long periods of time as
452
# part of the exception state
455
# be extremely careful about keeping a db object in local scope;
456
# once it's there, an unhandled exception could create a traceback
457
# containing a frame object that holds a copy of the locals dict,
458
# including the db object. if that frame/traceback object is kept
459
# around forever (which parts of twisted/quotient seem to do,
460
# especially deferreds), then the db object will never be deleted
461
# and the indexer lock will never go away.
463
# in order to prevent that from happening, we maintain two invariants:
465
# 1. the db is only accessed as an instance attribute and is never
466
# copied into a local variable. i.e., we always say self.db and
467
# never ever say db = self.db. this keeps the db object from ever
468
# getting captured by a frame/traceback.
470
# 2. the db is only accessed from within an exception handler that
471
# calls self.close() in the event of *any* failure. this ensures
472
# that the instance loses all references to the db on failure, so,
473
# even if the instance object is captured by a frame object (or
474
# something else), the db will already have been freed.
479
I represent a Xapian index that is read only by wrapping the
480
xapian.Database class. Because I provide read only access, I can be
481
used to combine several Xapian indices into one index with
482
performance only slightly lower than when using only one index.
484
@cvar DEFAULT_QUERY_COMBINER_OP: the operation used by the query parser to combine query terms
486
@cvar STEMMING_LANGUAGE: the language used by the query parser for
487
stemming. this is of little use since Xapwrap does not yet support
488
stemming when indexing.
490
@ivar names: a sequence of file names representing paths to Xapian
493
Please use the configure method to modify C{prefixMap} and C{indexValueMap}
495
@ivar prefixMap: a map of prefixes used by named fields in the index
496
and the name they should be referred to by the query parser
498
@ivar indexValueMap: a map from sort field names to value integer
500
@ivar amountIndexedSinceLastFlush: the number of bytes indexed since
503
The following instance attributes should never be modified or
506
@ivar db: the xapian index object
507
@ivar qp: the xapian query parser object
508
@ivar _searchSessions: a map from query description string to
509
(enquire, lastIndexSortedBy)
512
DEFAULT_QUERY_COMBINER_OP = xapian.Query.OP_AND
513
STEMMING_LANGUAGE = 'none'
515
def __init__(self, *names):
517
raise ValueError("No index directory supplied to Index constructor")
521
self._searchSessions = {}
523
self.indexValueMap = {}
524
self.amountIndexedSinceLastFlush = 0
527
# we hide the db so that methods always access it only through
528
# this method since db objects can be silently reaped when not
529
# in use. db objects consume 5 file descriptors.
534
#self.qp = xapian.QueryParser()
535
# this is vital: these options specify no language for
536
# stemming (""), disable stemming (False), and specify an
537
# empty stop word object (None). we need this because by
538
# default, xapian's query parser does english stemming
539
#s = xapian.Stem(self.STEMMING_LANGUAGE)
540
#self.qp.set_stemmer(s)
542
# we want query terms to be ANDed together by default
543
#self.qp.set_default_op(self.DEFAULT_QUERY_COMBINER_OP)
546
log("Index %s contains %s documents" %
547
(self.names, self.get_doccount()))
550
self.db = ExceptionTranslater.openIndex(True, *self.names)
553
log("closing xapian index %s" % self.names)
554
for query in self._searchSessions.keys():
555
del self._searchSessions[query]
559
def _configure(self):
560
if 'uid' not in self.indexValueMap:
561
# this is a gross hack...
562
self.indexValueMap['uid'] = 0
563
self.indexValueMap['uidREV'] = 1
564
if self.qp is not None:
565
for k, v in self.prefixMap.iteritems():
566
# check for unicode encoding?
571
self.qp.add_prefix(k, V)
573
def configure(self, prefixMap = None, indexValueMap = None):
574
if prefixMap is not None:
575
self.prefixMap = prefixMap
576
if indexValueMap is not None:
577
self.indexValueMap = indexValueMap
580
def get_doccount(self):
    """Return the document count reported by the underlying db object."""
    docCount = self.db.get_doccount()
    return docCount
get_doccount = makeProtectedDBMethod(get_doccount)
584
def enquire(self, query):
587
searchSession = xapian.Enquire(self.db.db)
588
searchSession.set_query(query)
591
del query, searchSession
593
enquire = makeProtectedDBMethod(enquire)
596
if self.db is not None:
598
self.amountIndexedSinceLastFlush = 0
599
flush = makeProtectedDBMethod(flush)
601
def search(self, query,
604
batchSize = MAX_DOCS_TO_RETURN,
605
sortIndex = None, sortAscending = True,
606
sortByRelevence = False,
612
@ivar valuesWanted: a list of Values that will be returned as part
613
of the result dictionary.
616
# TODO - allow a simple way to get Keywords out
618
if isinstance(query, (str, unicode)):
619
query = ParsedQuery(query)
620
elif not(isinstance(query, Query)):
621
raise ValueError("query %s must be either a string or a "
622
"subclass of xapwrap.Query" % query)
624
q = query.prepare(self.qp)
625
# uggg. this mess is due to the fact that xapian Query objects
626
# don't hash in a sane way.
628
qString = q.get_description() # deprecated since xapian 1.0, removal in 1.1
629
except AttributeError:
632
# the only thing we use sortKey for is to set sort index
633
if sortKey is not None:
634
sortIndex = self.indexValueMap[sortKey]
635
if collapseKey is not None:
636
collapseKey = self.indexValueMap[collapseKey]
638
# once you call set_sorting on an Enquire instance, there is no
639
# way to resort it by relevence, so we have to open a new
642
# ignore sortAscending since there's no easy way to implement
643
# ascending relevancy sorts and it's tough to imagine a case
644
# where you'd want to see the worst results. in any event, the
645
# user can always sort by relevancy and go to the last page of
649
if qString not in self._searchSessions:
650
self._searchSessions[qString] = (self.enquire(q), None)
652
enq, lastIndexSortedBy = self._searchSessions[qString]
654
# if we don't set sortIndex, the results will be returned
655
# sorted by relevance, assuming that we have never called
656
# set_sorting on this session
657
if sortByRelevence and lastIndexSortedBy is not None:
658
sortIndex = sortKey = None
659
if lastIndexSortedBy is not None:
660
del self._searchSessions[qString]
661
self._searchSessions[qString] = (self.enquire(q), None)
662
enq, lastIndexSortedBy = self._searchSessions[qString]
663
if sortByRelevence is not None and sortIndex is not None:
664
enq.set_sort_by_relevance_then_value(sortIndex, not sortAscending)
665
elif sortIndex is not None:
666
# It seems that we have the opposite definition of sort ascending
667
# than Xapian so we invert the ascending flag!
668
enq.set_sort_by_value(sortIndex, not sortAscending)
670
if collapseKey is not None:
671
enq.set_collapse_key(collapseKey)
673
self._searchSessions[qString] = (enq, sortIndex)
675
mset = enq.get_mset(startingIndex, batchSize)
679
thisResult['uid'] = m[xapian.MSET_DID]
680
thisResult['score'] = m[xapian.MSET_PERCENT]
684
for valName in valuesWanted:
685
valueIndex = self.indexValueMap.get(valName, None)
686
if valueIndex is None:
687
raise NoIndexValueFound(valName, self.indexValueMap)
688
valRes[valName] = xapDoc.get_value(valueIndex)
689
thisResult['values'] = valRes
690
results.append(thisResult)
691
return enq, mset, results
695
search = makeProtectedDBMethod(search)
697
def count(self, query):
700
enq = self.enquire(query)
701
# get_matches_estimated does not return accurate results if
702
# given a small ending number like 0 or 1
703
mset = enq.get_mset(0, MAX_DOCS_TO_RETURN)
704
sizeEstimate = mset.get_matches_estimated()
705
return sizeEstimate, self.get_doccount()
709
count = makeProtectedDBMethod(count)
711
def checkIndex(self, maxID):
712
"""Compute a list of all UIDs less than or equal to maxID that
715
# I had originally suspected that the performance hit of
716
# returning a huge list in the case of empty indexes would be
717
# substantial, but testing with a 120,000 msg index indicates
718
# that performance is fine and that the space overhead is quite
719
# reasonable. If that were not the case, this could be optimized
720
# by calculating the maximum document ID in the index and only
721
# scanning up to the minimum of maxID and the max ID in the
722
# index, assuming that we're using the same document IDs in the
726
for uid in xrange(maxID + 1):
727
term = makePairForWrite('UID', str(uid))
728
if not self.db.term_exists(term):
729
missingUIDs.append(uid)
731
checkIndex = makeProtectedDBMethod(checkIndex)
733
def get_documents(self, uid):
    """Return a list of remapped UIDs corresponding to the actual UID given.

    Builds the 'UID' term for C{uid}, searches for it and returns the
    integer document id of every match.

    @param uid: the actual UID to look up; it is converted with C{str}.
    @return: a list of ints, one per matching document.
    """
    docTerm = makePairForWrite('UID', str(uid))
    found = self.search(RawQuery(docTerm))
    if isinstance(found, tuple):
        # search() returns (enquire, mset, results). Only the result
        # dicts are wanted; rebinding drops this frame's references to
        # the enquire and mset objects, which hold on to the db.
        found = found[-1]
    return [int(hit['uid']) for hit in found]
740
def get_document(self, uid):
    """Return the one document whose UID term matches C{uid}.

    @raise DocNotFoundError: if no document carries that UID term.
    @raise InconsistantIndex: if more than one document carries it.
    """
    # db.get_document cannot be used directly, because doc ids get
    # remapped when multiple databases are combined
    matches = self.get_documents(uid)
    matchCount = len(matches)
    if matchCount == 1:
        return self._get_document(matches[0])
    if matchCount == 0:
        raise DocNotFoundError(uid)
    raise InconsistantIndex(
        "Something has gone horribly wrong. I tried "
        "retrieving document id %s but found %i documents "
        "with that document ID term" % (uid, matchCount))
754
def _get_document(self, uid):
    """Fetch the document stored under the integer document id C{uid}."""
    assert isinstance(uid, int)
    document = self.db.get_document(uid)
    return document
_get_document = makeProtectedDBMethod(_get_document)
759
def term_exists(self, term):
    """Ask the db whether C{term} (a byte string) occurs in the index."""
    assert isinstance(term, str)
    present = self.db.term_exists(term)
    return present
term_exists = makeProtectedDBMethod(term_exists)
764
def get_lastdocid(self):
    """Return the last document id as reported by the underlying db."""
    lastID = self.db.get_lastdocid()
    return lastID
get_lastdocid = makeProtectedDBMethod(get_lastdocid)
768
# XXX FIXME: we should consider deleting all searchSessions whenever we
769
# add a document, or we should reopen the db
772
class Index(ReadOnlyIndex):
774
def __init__(self, name, create = False, analyzer = None):
775
# XXX FIXME: we should really try opening the db here, so that
776
# any errors are caught immediately rather than waiting for the
777
# first time we try to do something...
778
ReadOnlyIndex.__init__(self, name)
781
self.flags = xapian.DB_CREATE_OR_OPEN
783
self.flags = xapian.DB_OPEN
784
self.analyzer = analyzer or StandardAnalyzer()
785
self.lockFile = FilesystemLock(
786
os.path.join(self.name, XAPWRAP_LOCK_FILENAME))
789
""" really get a xapian database object """
791
# xapian expects directories! self.name should refer to a
792
# directory. if it doesn't exist, we'll make one.
793
if not os.path.exists(self.name):
796
# try to acquire a lock file
797
if not self.lockFile.lock():
798
owningPid = os.readlink(self.lockFile.name)
799
errorMsg = ("cannot acquire lock file for xapian index %s"
800
"because it is owned by process %s" %
801
(self.name, owningPid))
803
raise DatabaseLockError(errorMsg)
804
xapLockFilePath = os.path.join(self.name, XAPIAN_LOCK_FILENAME)
805
if os.path.exists(xapLockFilePath):
806
log("Stale database lock found in %s. Deleting it now." % xapLockFilePath)
807
os.remove(xapLockFilePath)
809
# actually try to open a xapian DB
812
self.db = ExceptionTranslater.openIndex(False, self.name, self.flags)
813
except DatabaseCorruptionError, e:
814
# the index is trashed, so there's no harm in blowing it
815
# away and starting from scratch
816
log("Xapian index at %s is corrupted and will be destroyed"
818
if self.lockFile.locked:
819
self.lockFile.unlock()
820
for idxFname in glob.glob(os.path.join(self.name, '*')):
822
self.db = ExceptionTranslater.openIndex(False, self.name, self.flags)
824
if self.db is None and self.lockFile.locked:
825
self.lockFile.unlock()
831
# this is important! the only way to get xapian to release the
832
# db lock is to call the db object's destructor. that won't
833
# happen until nobody is holding a reference to the db
834
# object. unfortunately, the query parser holds a reference to
835
# it, so the query parser must also go away. do not hold
836
# references to these objects anywhere but here.
838
# enquire objects and mset objects hold a reference to the db,
839
# so if any of them are left alive, the db will not be reclaimed
841
if self.db is not None:
842
ReadOnlyIndex.close(self)
843
# the islink test is needed in case the index directory has
844
# been deleted before close was called.
845
if self.lockFile.locked and os.path.islink(self.lockFile.name):
846
self.lockFile.unlock()
847
# there is no point in checking if the lock file is still
848
# around right here: it will only be deleted when xapian's
849
# destructor runs, but python defers running destructors
850
# until after exception handling is complete. since this
851
# code will often get called from an exception handler, we
852
# have to assume that the lock file's removal will be
853
# delayed at least until after this method exits
855
def get_document(self, uid):
    """Fetch the document for C{uid} by delegating to C{_get_document}."""
    document = self._get_document(uid)
    return document
858
# methods that modify db state
860
def index(self, doc):
862
if hasattr(doc, 'uid') and doc.uid:
864
doc.sortFields.append(SortKey('uid', uid))
865
doc.keywords.append(Keyword('uid', str(uid)))
866
xapDoc = doc.toXapianDocument(self.indexValueMap, self.prefixMap)
867
self.replace_document(uid, xapDoc)
869
# We need to know the uid of the doc we're going to add
870
# before we add it so we can setup appropriate uid sorting
871
# values. But, another thread could potentially insert a
872
# document at that uid after we determine the last uid, but
873
# before we manage the insertion. Yay race conditions! So we
874
# try to add the document and then check that it ended up at
875
# the right uid. If it did not, we update it with the
876
# correct uid sort values.
877
uid = self.get_lastdocid() + 1
878
doc.sortFields.append(SortKey('uid', uid))
879
doc.keywords.append(Keyword('uid', str(uid)))
880
xapDoc = doc.toXapianDocument(self.indexValueMap, self.prefixMap)
881
newUID = self.add_document(xapDoc)
883
doc.sortFields.append(SortKey('uid', newUID))
884
doc.keywords.append(Keyword('uid', str(newUID)))
885
xapDoc = doc.toXapianDocument(self.indexValueMap, self.prefixMap)
886
self.replace_document(newUID, xapDoc)
888
# a simpler alternative would be to add an empty document
889
# and then replace it. the problem with that strategy is
890
# that it kills performance since xapian performs an
891
# implicit flush when you replace a document that was added
892
# but not yet committed to disk.
894
self.amountIndexedSinceLastFlush += len(doc)
895
if self.amountIndexedSinceLastFlush > MAX_DATA_INDEXED_BETWEEN_FLUSHES:
899
def add_document(self, doc):
    """Add C{doc} to the db and return whatever the db call returns."""
    newID = self.db.add_document(doc)
    return newID
add_document = makeProtectedDBMethod(add_document)
903
def replace_document(self, uid, doc):
    """Replace the document stored under C{uid} with C{doc}."""
    outcome = self.db.replace_document(uid, doc)
    return outcome
replace_document = makeProtectedDBMethod(replace_document)
907
def delete_document(self, docID):
    """Delete the document stored under C{docID} from the db."""
    outcome = self.db.delete_document(docID)
    return outcome
delete_document = makeProtectedDBMethod(delete_document)
914
class ParsedQuery(Query):
    """A query given as text and turned into a xapian query by a parser."""

    def __init__(self, queryString):
        """Store C{queryString}, encoding it first if it is unicode."""
        text = queryString
        if isinstance(text, unicode):
            text = text.encode(UNICODE_ENCODING, UNICODE_ERROR_POLICY)
        # as of xapian 0.9.5 the query parser makes trouble with utf-8. but
        # it also doesnt work with iso-8859-15, so we just live with
        # ascii-only search for now... - a utf8 fix seems to be planned for
        # the near future!
        self.queryString = text

    def prepare(self, queryParser):
        """Return the result of parsing the stored string with C{queryParser}."""
        parsed = queryParser.parse_query(self.queryString)
        return parsed
926
class RawQuery(Query):
    """A query made of one raw term that bypasses the query parser."""

    def __init__(self, queryString):
        """Store C{queryString}; unicode input is encoded as utf-8 first."""
        term = queryString
        if isinstance(term, unicode):
            term = term.encode('utf-8')

        assert isinstance(term, str)
        self.queryString = term

    def prepare(self, queryParser):
        """Build a xapian.Query from the stored term; C{queryParser} is unused."""
        rawQuery = xapian.Query(self.queryString)
        return rawQuery
937
class QObjQuery(Query):
938
def __init__(self, query):
939
assert isinstance(query, xapian.Query)
942
def prepare(self, queryParser):
945
class SmartIndex(Index):
946
documentFactory = Document
948
def __init__(self, *args, **kwargs):
949
Index.__init__(self, *args, **kwargs)
954
state = {'indexValueMap': self.indexValueMap,
955
'prefixMap': self.prefixMap}
956
d = self.documentFactory(uid = 1, data = state)
957
self.index(d, checkID = False)
960
def fetchState(self):
962
if self.get_doccount() == 0:
963
# Don't rely on the try:except: for this case
966
doc = self.get_document(1)
967
except DocNotFoundError:
968
newState = {'indexValueMap': {}, 'prefixMap': {}}
971
dataStr = doc.get_data()
972
newState = cPickle.loads(dataStr)
973
self.indexValueMap.update(newState['indexValueMap'])
974
self.prefixMap.update(newState['prefixMap'])
976
def index(self, doc, checkID = True):
977
if hasattr(doc, 'uid') and doc.uid == 1 and checkID:
978
raise InvalidArgumentError(
979
"document UIDs must be greater than one when using SmartIndex")
981
docSortKeys = set([sk.name for sk in doc.sortFields if sk.name is not None])
982
indexSortKeys = set(self.indexValueMap.keys())
983
if not docSortKeys.issubset(indexSortKeys):
984
nextValueIndex = 1 + max(self.indexValueMap.itervalues())
985
# we sort the sortKeys in order to improve the odds that two
986
# indices that are indexed with the same documents in the
987
# same order will always end up with the same
988
# indexValueMaps, even if different versions of python are
989
# used with different hash functions
990
sortKeys = list(docSortKeys)
992
for sortKey in sortKeys:
993
if sortKey not in self.indexValueMap:
994
assert nextValueIndex % 2 == 0
995
self.indexValueMap[sortKey] = nextValueIndex
996
self.indexValueMap[sortKey + 'REV'] = nextValueIndex + 1
1000
docKeywords = set([tf.name for tf in doc.textFields if tf.prefix] +
1001
[kw.name for kw in doc.keywords])
1002
indexKeyWords = set(self.prefixMap.keys())
1003
if not docKeywords.issubset(indexKeyWords):
1004
for k in docKeywords - indexKeyWords:
1005
self.prefixMap[k] = k.upper()
1008
return Index.index(self, doc)
1011
class SmartReadOnlyIndex(ReadOnlyIndex):
1013
def __init__(self, *args, **kwargs):
1014
ReadOnlyIndex.__init__(self, *args, **kwargs)
1017
def fetchState(self):
1018
stateDocIDs = self.get_documents(1)
1019
stateDocs = map(self._get_document, stateDocIDs)
1020
states = [cPickle.loads(s.get_data()) for s in stateDocs]
1022
# should we issue a warning when the number of states that we
1023
# retrieve is less than the number of indices we opened? the
1024
# only problem is that some indices may be empty, but there's no
1025
# easy way to check how many documents are in a subindex without
1026
# opening it explicitly using xapian.Database and that seems
1027
# rather expensive for this code path.
1029
# merge all the states into a master state
1030
master = {'prefixMap': self.prefixMap,
1031
'indexValueMap': self.indexValueMap}
1032
# note that if there are conflicts, there is no guarantee on who
1033
# will win, but it doesn't matter since we'll die on conflicts
1036
for substate in ('prefixMap', 'indexValueMap'):
1037
sub = s.get(substate, {})
1038
mSub = master[substate]
1039
for k, v in sub.iteritems():
1042
# ensure that states are compatible (check for conflicts)
1045
for substate in ('prefixMap', 'indexValueMap'):
1046
sub = s.get(substate, {})
1047
mSub = master[substate]
1048
for k, v in sub.iteritems():
1049
if k in mSub and mSub[k] != v:
1050
# we defer error reporting so that the user sees
1051
# as much info on the error as possible
1052
conflicts.append((substate, k, v, mSub[k]))
1054
# the only way states can be incompatible is if two states have
1055
# different values for the same keys in the same substate
1058
raise InconsistantIndexCombination(
1059
"The SmartReadOnlyIndex opened on %s cannot recconcile "
1060
"the following conflicts in the subindices' states:\n%s"
1062
'\n'.join(["%s[%r] is %r in one index but %r in another"
1063
% c for c in conflicts])))
1065
self.prefixMap = master['prefixMap']
1066
self.indexValueMap = master['indexValueMap']
1068
def search(self, query, sortKey = None,
1070
batchSize = MAX_DOCS_TO_RETURN,
1071
sortIndex = None, sortAscending = True,
1072
sortByRelevence = False):
1073
# if the appropriate index value string is not in
1074
# self.indexValueMap, fetchState() before calling
1075
# ReadOnlyIndex.search. if it still isn't there, let
1076
# ReadOnlyIndex.search take care of throwing an error
1077
if sortKey is not None and sortKey not in self.indexValueMap:
1079
return ReadOnlyIndex.search(self, query, sortKey,
1080
startingIndex, batchSize,
1081
sortIndex, sortAscending,