2
xapwrap.document - Pythonic wrapper around Xapian's Document API
9
MAX_KEY_LEN = 240 # this comes from xapian's btree.h, Btree::max_key_len
10
# NOTE: xapian's btree.h file says that its actually 252, but due to
11
# xapian's implementation details, the actual limit is closer to 245
12
# bytes. See http://thread.gmane.org/gmane.comp.search.xapian.cvs/329
13
# for more info, especially the second message.
15
# The limit described above only holds true assuming keys that do not
16
# contain any NULL bytes. Since xapian internally escapes \0 bytes,
17
# xapian sees the key length as (2*N + 2) where N is the number of
18
# embedded NULL characters.
20
INTER_FIELD_POSITION_GAP = 100
22
UNICODE_ENCODING = "UTF-8" # XXX this should not be hardcoded on module level
23
UNICODE_ERROR_POLICY = "replace"
25
class StandardAnalyzer:
26
WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U)
28
def tokenize(self, unknownText):
29
originalText = cleanInputText(unknownText, True)
30
# we want to perform lower() and the re search using a unicode
31
# object. if we try to perform those operations on regular
32
# string object that happens to represent unicode text encoded
33
# with UTF-8, we'll get garbage, or at least an
34
# OS/libc/$LC_CTYPE dependant result
35
text = originalText.lower()
36
for match in self.WORD_RE.finditer(text):
37
# we yield unicode ONLY
41
class TextField(object):
42
__slots__ = ('name', 'text', 'prefix')
44
def __init__(self, name, text = '', prefix = False):
46
assert not prefix # it makes no sense to use a prefixed
47
# field without a name
58
class SortKey(object):
59
__slots__ = ('name', 'value', 'index', 'flattener')
61
def __init__(self, name, value, index = None, flattener = None):
65
assert (name is None) ^ (index is None)
66
self.flattener = flattener
74
def __init__(self, value):
78
return len(self.value)
80
class Keyword(object):
81
__slots__ = ('name', 'value')
83
def __init__(self, name, value):
88
return len(self.value)
93
@ivar keywords: sequence of Keyword objects
94
@ivar sortFields: sequence of SortKey objects
95
@ivar textFields: sequence of TextField objects
97
@cvar analyzerFactory: factory object for constructing analyzers
98
@cvar _picklerProtocol: protocol used in pickling data attributes
99
@cvar _noObject: dummy object used to indicate that there is no
101
@cvar source: this is an optional argument to point at the
102
original text/object that this document represents
105
_picklerProtocol = -1
106
analyzerFactory = StandardAnalyzer
108
# XXX TODO: add a fromXapianDoc classmethod that can be used by
109
# indices when returning documents from the db
111
def __init__(self, textFields = (), sortFields = (), keywords = (),
112
terms = (), values = (), uid = None, data = _noObject, source = None):
114
sortFields and values are really the same thing as far as
115
xapian is concerned. We differentiate them in the hope of
116
making the API easier to understand.
118
for fields in ('textFields', 'sortFields', 'keywords', 'terms', 'values'):
120
if not isinstance(arg, (list, tuple)):
122
setattr(self, fields, list(arg))
123
# copy the list so we can modify without affecting the original
127
# sortFields and values are really the same thing as far as xapian is concerned
128
self.sortFields += self.values
132
for fieldList in (self.textFields, self.keywords):
133
length += sum(map(len, fieldList))
135
if self.data != self._noObject:
136
length += len(cPickle.dumps(self.data, self._picklerProtocol))
140
def toXapianDocument(self, indexValueMap, prefixMap=None):
141
d = xapian.Document()
143
analyzer = self.analyzerFactory()
146
for field in self.textFields:
147
# XXX: terms textFields won't get numbered
148
# after each other, needed for titles
150
for token in analyzer.tokenize(field.text):
151
if isinstance(token, tuple):
152
token, position = token
155
# the xapian swig bindings don't like unicode objects, so we
156
# decode terms to UTF-8 before indexing. this is fine as
157
# long as all data that goes into the db (whether for
158
# indexing or search) is converted to UTF-8 string and all
159
# data coming from the db (.get_value(), .get_data()) is
161
token = token.encode(UNICODE_ENCODING, UNICODE_ERROR_POLICY)
162
# the tokenizer cannot guarantee that token length is
163
# below MAX_KEY_LEN since the regexp is done with
164
# unicode and the result is later converted to UTF-8. In
165
# the process, the string length could expand, so we
166
# need to check here as well.
167
d.add_posting(checkKeyLen(token), position)
168
#position += INTER_FIELD_POSITION_GAP
172
for token in analyzer.tokenize(field.text):
173
if isinstance(token, tuple):
174
token, position = token
177
# token is unicode, but gets converted to UTF-8
178
# by makePairForWrite:
179
term = makePairForWrite(prefix, token, prefixMap)
180
d.add_posting(term, position)
181
#position += INTER_FIELD_POSITION_GAP
184
for field in self.keywords:
185
term = makePairForWrite(field.name, field.value, prefixMap)
188
# add non positional terms
189
for term in self.terms:
190
d.add_term(term.value)
193
for field in self.sortFields:
194
self.addSortField(d, field, indexValueMap)
196
# serialize and add the data object if present
197
if self.data is not self._noObject:
198
dataStr = cPickle.dumps(self.data, self._picklerProtocol)
203
def addSortField(self, doc, field, indexValueMap):
    """Store one SortKey on the xapian document as a sort value.

    The value slot is field.index when given; otherwise it is looked up
    as indexValueMap[field.name], and NoIndexValueFound is raised when
    the name has no mapped slot.
    """
    if field.index is not None:
        valueIndex = field.index
    else:
        valueIndex = indexValueMap.get(field.name, None)
        if valueIndex is None:
            # local import — presumably avoids a circular import with
            # the index module; confirm against the full file
            from index import NoIndexValueFound
            raise NoIndexValueFound(field.name, indexValueMap)
    assert isinstance(valueIndex, int)

    # a per-field flattener takes precedence over the type-based
    # flatteners registered on the class
    if field.flattener is not None:
        flatValue = field.flattener(field.value)
    else:
        flatValue = self.flatten(field.value)
    # xapian has no limit on value length
    cleanValue = cleanInputText(flatValue)
    doc.add_value(valueIndex, cleanValue)
223
def flatten(self, value):
    """Convert a sort value into a plain string.

    Strings pass through untouched; other types are converted by the
    flattener registered for their exact type (no subclass lookup).
    Raises ValueError when no flattener is registered for the type.
    """
    t = type(value)
    if t is str:
        return value
    elif t in self._flatteners:
        flattener = self._flatteners[t]
        flatVal = flattener(value)
        return flatVal
    else:
        raise ValueError("Cannot flatten %r into a string. Perhaps you "
                         "should register a flattener for type %r."
                         % (value, type(value)))
236
def registerFlattener(klass, typeToFlatten, flattener):
    """Register *flattener* to convert values of *typeToFlatten* into
    strings for use as xapian sort values.

    Raises ValueError if a flattener for that type is already
    registered; unregister the old one first to replace it.
    """
    if typeToFlatten in klass._flatteners:
        # NOTE: the original implicit string concatenation was missing
        # separating spaces ("alreadybeen", "toregister"); fixed here.
        raise ValueError("A sort field flattener for type %s has already "
                         "been registered (%s) but you are attempting to "
                         "register a new flattener: %s"
                         % (typeToFlatten, klass._flatteners[typeToFlatten],
                            flattener))
    assert callable(flattener)
    klass._flatteners[typeToFlatten] = flattener
registerFlattener = classmethod(registerFlattener)
247
def unregisterFlattener(klass, typeToFlatten):
    """Remove the flattener registered for *typeToFlatten*, if any.

    A no-op when no flattener is registered for that type.
    """
    # pop with a default matches the original membership-test-then-del:
    # silently does nothing when the type was never registered
    klass._flatteners.pop(typeToFlatten, None)
unregisterFlattener = classmethod(unregisterFlattener)
254
def flattenNumeric(value, numDigits = 10):
    """Render an integer right-aligned in a *numDigits*-wide field so
    that non-negative values compare correctly as xapian sort keys."""
    # builds e.g. '%10.d' — identical to the original ''.join-based
    # construction of the format string
    fmt = '%' + str(numDigits) + '.d'
    return fmt % value
257
# register the default flattener for ints on the Document class
Document.registerFlattener(int, flattenNumeric)
259
def flattenLong(value):
    """Flatten a long integer, using a wider (20-char) field than the
    10-char default used for ints."""
    return flattenNumeric(value, numDigits=20)
262
# register the default flattener for longs (Python 2 `long` type)
Document.registerFlattener(long, flattenLong)
264
def flattenDate(value):
    """Flatten date/time/datetime values to their ISO-8601 text form,
    which sorts chronologically as a string."""
    return value.isoformat()
267
# date, time and datetime all share the ISO-format flattener
for dt in (datetime.date, datetime.time, datetime.datetime):
    Document.registerFlattener(dt, flattenDate)
270
def flattenUnicode(value):
    """Encode a unicode sort value using the module-wide encoding
    (UTF-8); note this uses strict error handling, unlike the
    UNICODE_ERROR_POLICY used elsewhere in this module."""
    return value.encode(UNICODE_ENCODING)
273
# register the default flattener for unicode (Python 2 `unicode` type)
Document.registerFlattener(unicode, flattenUnicode)
276
def cleanInputText(unknownText, returnUnicode = False):
    """Normalize indexable text.

    Accepts a byte string (assumed UTF-8) or a unicode object; anything
    else raises ValueError.  Returns a unicode object when
    *returnUnicode* is true, otherwise a UTF-8 encoded byte string.
    """
    if isinstance(unknownText, unicode):
        originalText = unknownText
    elif isinstance(unknownText, str):
        # XXX hardcoded UTF-8, make param
        originalText = unknownText.decode(UNICODE_ENCODING, UNICODE_ERROR_POLICY)
    else:
        raise ValueError("Only strings and unicode objects can be indexed.")
    # deliberately NOT lowercased here: the API exposed to higher
    # levels sends prefixed searches such as INDEXERVERSION:4 as
    # regular queries, and lowercasing them here would break those;
    # case handling is left to the callers (e.g. StandardAnalyzer)
    if returnUnicode:
        return originalText
    return originalText.encode(UNICODE_ENCODING, UNICODE_ERROR_POLICY)
294
def makePairForWrite(prefix, token, prefixMap=None):
    """Build a prefixed xapian term (e.g. 'TITLEtoken' or 'XFOO:token'),
    UTF-8 encoded and length-checked via checkKeyLen."""
    # work in unicode throughout: uppercasing a UTF-8 byte string would
    # corrupt non-ASCII characters, so decode str inputs first
    if isinstance(prefix, str):
        prefix = prefix.decode(UNICODE_ENCODING, UNICODE_ERROR_POLICY)
    if isinstance(token, str):
        # XXX hardcoded UTF-8, make param
        token = token.decode(UNICODE_ENCODING, UNICODE_ERROR_POLICY)

    if prefixMap is None:
        # xapian prefixes must be uppercase
        prefix = prefix.upper()
    else:
        # translate via the map first (e.g. 'title' -> 'S'), falling
        # back to plain uppercasing for unmapped names
        prefix = prefixMap.get(prefix, prefix.upper())

    # user-defined ('X...') prefixes take a ':' separator before the token
    if prefix[0] == 'X':
        separator = ':'
    else:
        separator = ''
    result = '%s%s%s' % (prefix, separator, token)
    # the term goes into the db, so it must be UTF-8 encoded
    result = result.encode(UNICODE_ENCODING, UNICODE_ERROR_POLICY)
    return checkKeyLen(result)
316
numNullBytes = s.count('\0') + 1
317
xapianLen = numNullBytes + len(s) + 1 # that last one is for the
319
if xapianLen < MAX_KEY_LEN:
322
# doing nothing seems preferable to mangling an overly large
323
# token that we don't know how to handle. we use a space
324
# instead of an empty string because xapian doesn't like
325
# getting empty strings added as terms