"""
xapwrap.document - Pythonic wrapper around Xapian's Document API
"""
import datetime
import re
import cPickle
import xapian
MAX_KEY_LEN = 240 # this comes from xapian's btree.h, Btree::max_key_len
# NOTE: xapian's btree.h file says that it's actually 252, but due to
# xapian's implementation details, the actual limit is closer to 245
# bytes. See http://thread.gmane.org/gmane.comp.search.xapian.cvs/329
# for more info, especially the second message.
# The limit described above only holds true assuming keys that do not
# contain any NULL bytes. Since xapian internally escapes \0 bytes,
# xapian sees the key length as (2*N + 2) where N is the number of
# embedded NULL characters.
INTER_FIELD_POSITION_GAP = 100
UNICODE_ENCODING = "UTF-8" # XXX this should not be hardcoded on module level
UNICODE_ERROR_POLICY = "replace"
class StandardAnalyzer:
    WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U)

    def tokenize(self, unknownText):
        originalText = cleanInputText(unknownText, True)
        # we want to perform lower() and the re search using a unicode
        # object. if we try to perform those operations on a regular
        # string object that happens to represent unicode text encoded
        # with UTF-8, we'll get garbage, or at least an
        # OS/libc/$LC_CTYPE dependent result
        text = originalText.lower()
        for match in self.WORD_RE.finditer(text):
            # we yield unicode ONLY
            yield match.group()
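# Illustrative sketch of what the analyzer produces: it lowercases the
# input and yields unicode word tokens (via \w with re.U), each at most
# MAX_KEY_LEN characters long.
#
#   analyzer = StandardAnalyzer()
#   list(analyzer.tokenize('Hello, World!'))    # -> [u'hello', u'world']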
class TextField(object):
    __slots__ = ('name', 'text', 'prefix')

    def __init__(self, name, text = '', prefix = False):
        if name and not text:
            assert not prefix # it makes no sense to use a prefixed
                              # field without a name
            self.text = name
            self.name = ''
        else:
            self.name = name
            self.text = text
        self.prefix = prefix

    def __len__(self):
        return len(self.text)
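# Illustrative examples: a TextField given only one argument treats it
# as anonymous body text; a named field with prefix=True will also be
# indexed under its name (see toXapianDocument below).
#
#   body = TextField('some free-form text')              # name='', text=...
#   title = TextField('title', 'My Title', prefix=True)  # prefixed field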
class SortKey(object):
    __slots__ = ('name', 'value', 'index', 'flattener')

    def __init__(self, name, value, index = None, flattener = None):
        self.name = name
        self.value = value
        self.index = index
        assert (name is None) ^ (index is None)
        self.flattener = flattener


class Value(SortKey):
    pass
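# Illustrative examples: a sort key is addressed either by name (looked
# up in indexValueMap when the document is converted) or by an explicit
# value slot index, but never both.
#
#   SortKey('pubdate', datetime.date(2005, 1, 1))   # resolved by name
#   SortKey(None, 42, index=3)                      # explicit value slot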
class Term(object):
    __slots__ = ('value',)

    def __init__(self, value):
        self.value = value

    def __len__(self):
        return len(self.value)
class Keyword(object):
    __slots__ = ('name', 'value')

    def __init__(self, name, value):
        self.name = name
        self.value = value

    def __len__(self):
        return len(self.value)
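# Illustrative examples: a Term is added verbatim with no positional
# information, while a Keyword becomes a prefixed term that supports
# fieldname:value style matching.
#
#   Term('already-normalized-token')
#   Keyword('author', 'john doe')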
class Document:
    """
    @ivar keywords: sequence of Keyword objects
    @ivar sortFields: sequence of SortKey objects
    @ivar textFields: sequence of TextField objects
    @cvar analyzerFactory: factory object for constructing analyzers
    @cvar _picklerProtocol: protocol used in pickling data attributes
    @cvar _noObject: dummy object used to indicate that there is no
                     data attribute
    @cvar source: this is an optional argument to point at the
                  original text/object that this document represents
    """
    _noObject = object()
    _picklerProtocol = -1
    analyzerFactory = StandardAnalyzer

    # XXX TODO: add a fromXapianDoc classmethod that can be used by
    # indices when returning documents from the db
    def __init__(self, textFields = (), sortFields = (), keywords = (),
                 terms = (), values = (), uid = None, data = _noObject, source = None):
        """
        sortFields and values are really the same thing as far as
        xapian is concerned. We differentiate them in the hope of
        making the API easier to understand.
        """
        for fields in ('textFields', 'sortFields', 'keywords', 'terms', 'values'):
            arg = vars()[fields]
            if not isinstance(arg, (list, tuple)):
                arg = (arg,)
            # copy the list so we can modify without affecting the original
            setattr(self, fields, list(arg))
        self.uid = uid
        self.data = data
        self.source = source
        # sortFields and values are really the same thing as far as xapian is concerned
        self.sortFields += self.values
    def __len__(self):
        length = 0
        for fieldList in (self.textFields, self.keywords):
            length += sum(map(len, fieldList))
        if self.data is not self._noObject:
            length += len(cPickle.dumps(self.data, self._picklerProtocol))
        return length
    def toXapianDocument(self, indexValueMap, prefixMap=None):
        d = xapian.Document()
        position = 0
        analyzer = self.analyzerFactory()

        # add text fields
        for field in self.textFields:
            # XXX: terms from successive textFields do not get consecutive
            # position numbers; that matters for titles
            position = 0
            for token in analyzer.tokenize(field.text):
                if isinstance(token, tuple):
                    token, position = token
                else:
                    position += 1
                # the xapian swig bindings don't like unicode objects, so we
                # encode terms as UTF-8 before indexing. this is fine as
                # long as all data that goes into the db (whether for
                # indexing or search) is converted to a UTF-8 string and all
                # data coming from the db (.get_value(), .get_data()) is
                # decoded as UTF-8.
                token = token.encode(UNICODE_ENCODING, UNICODE_ERROR_POLICY)
                # the tokenizer cannot guarantee that the token length is
                # below MAX_KEY_LEN since the regexp is applied to a
                # unicode string and the result is later converted to
                # UTF-8. In the process, the string length could expand,
                # so we need to check here as well.
                d.add_posting(checkKeyLen(token), position)
            #position += INTER_FIELD_POSITION_GAP

            if field.prefix:
                prefix = field.name
                for token in analyzer.tokenize(field.text):
                    if isinstance(token, tuple):
                        token, position = token
                    else:
                        position += 1
                    # token is unicode, but gets converted to UTF-8
                    # by makePairForWrite:
                    term = makePairForWrite(prefix, token, prefixMap)
                    d.add_posting(term, position)
                #position += INTER_FIELD_POSITION_GAP

        # add keyword fields
        for field in self.keywords:
            term = makePairForWrite(field.name, field.value, prefixMap)
            d.add_term(term)

        # add non-positional terms
        for term in self.terms:
            d.add_term(term.value)

        # add sort keys
        for field in self.sortFields:
            self.addSortField(d, field, indexValueMap)

        # serialize and add the data object if present
        if self.data is not self._noObject:
            dataStr = cPickle.dumps(self.data, self._picklerProtocol)
            d.set_data(dataStr)

        return d
    def addSortField(self, doc, field, indexValueMap):
        if field.index is None:
            valueIndex = indexValueMap.get(field.name, None)
            if valueIndex is None:
                from index import NoIndexValueFound
                raise NoIndexValueFound(field.name, indexValueMap)
        else:
            valueIndex = field.index
        assert isinstance(valueIndex, int)

        if field.flattener:
            flatValue = field.flattener(field.value)
        else:
            flatValue = self.flatten(field.value)
        # xapian has no limit on value length
        cleanValue = cleanInputText(flatValue)
        doc.add_value(valueIndex, cleanValue)
    _flatteners = {}

    def flatten(self, value):
        t = type(value)
        if t == str:
            return value
        elif t in self._flatteners:
            flattener = self._flatteners[t]
            flatVal = flattener(value)
            return flatVal
        else:
            raise ValueError("Cannot flatten %r into a string. Perhaps you "
                             "should register a flattener for type %r."
                             % (value, type(value)))
    def registerFlattener(klass, typeToFlatten, flattener):
        if typeToFlatten in klass._flatteners:
            raise ValueError("A sort field flattener for type %s has already "
                             "been registered (%s) but you are attempting to "
                             "register a new flattener: %s"
                             % (typeToFlatten, klass._flatteners[typeToFlatten],
                                flattener))
        assert callable(flattener)
        klass._flatteners[typeToFlatten] = flattener
    registerFlattener = classmethod(registerFlattener)
    def unregisterFlattener(klass, typeToFlatten):
        if typeToFlatten in klass._flatteners:
            del klass._flatteners[typeToFlatten]
    unregisterFlattener = classmethod(unregisterFlattener)
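# Illustrative sketch of building and converting a document. The
# indexValueMap below is a made-up mapping from sort field names to
# xapian value slots; in practice the index layer maintains it.
#
#   doc = Document(textFields=TextField('title', 'My Title', prefix=True),
#                  keywords=Keyword('author', 'john doe'),
#                  sortFields=SortKey('pubdate', datetime.date(2005, 1, 1)),
#                  uid=1, data={'title': 'My Title'})
#   xapDoc = doc.toXapianDocument(indexValueMap={'pubdate': 0})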
# common flatteners:

def flattenNumeric(value, numDigits = 10):
    return ''.join(('%', str(numDigits), '.d')) % value

Document.registerFlattener(int, flattenNumeric)

def flattenLong(value):
    return flattenNumeric(value, numDigits=20)

Document.registerFlattener(long, flattenLong)

def flattenDate(value):
    return value.isoformat()

for dt in (datetime.date, datetime.time, datetime.datetime):
    Document.registerFlattener(dt, flattenDate)

def flattenUnicode(value):
    return value.encode(UNICODE_ENCODING)

Document.registerFlattener(unicode, flattenUnicode)
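# Illustrative sketch of registering a flattener for an application
# type. The Money class is hypothetical; any callable that returns a
# sortable byte string will do.
#
#   class Money(object):
#       def __init__(self, cents):
#           self.cents = cents
#
#   Document.registerFlattener(Money, lambda m: flattenNumeric(m.cents, 12))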
def cleanInputText(unknownText, returnUnicode = False):
    if isinstance(unknownText, str):
        originalText = unknownText.decode(UNICODE_ENCODING, UNICODE_ERROR_POLICY) # XXX hardcoded UTF-8, make param XXX
    elif isinstance(unknownText, unicode):
        originalText = unknownText
    else:
        raise ValueError("Only strings and unicode objects can be indexed.")
    # be very careful about lowercasing the text here: since the API we
    # expose to higher levels doesn't allow searchup.py to call
    # findInField directly, searches for INDEXERVERSION:4 have to be
    # sent as regular queries. lowercasing all queries here will break
    # keyword searches.
    if returnUnicode:
        return originalText
    else:
        return originalText.encode(UNICODE_ENCODING, UNICODE_ERROR_POLICY)
def makePairForWrite(prefix, token, prefixMap=None):
    # prefixes must be uppercase; if the prefix given to us is a str
    # that happens to be UTF-8 encoded, bad things will happen when we
    # uppercase it, so we convert everything to unicode first
    if isinstance(prefix, str):
        prefix = prefix.decode(UNICODE_ENCODING, UNICODE_ERROR_POLICY)
    if isinstance(token, str):
        token = token.decode(UNICODE_ENCODING, UNICODE_ERROR_POLICY) # XXX hardcoded UTF-8, make param

    if prefixMap is None:
        prefix = prefix.upper()
    else: # we have a map, so first translate the prefix using the map (e.g. 'title' -> 'S')
        prefix = prefixMap.get(prefix, prefix.upper())
    result = '%s%s%s' % (prefix, prefix[0] == 'X' and ':' or '', token)
    # since the return value is going into the db, it must be encoded as UTF-8
    result = result.encode(UNICODE_ENCODING, UNICODE_ERROR_POLICY)
    return checkKeyLen(result)
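# Illustrative examples: without a prefixMap the field name is simply
# uppercased; with a map it is translated first. Prefixes starting with
# 'X' (xapian's convention for user-defined prefixes) get a ':'
# separator before the token.
#
#   makePairForWrite('author', u'doe')                   # -> 'AUTHORdoe'
#   makePairForWrite('title', u'war', {'title': 'S'})    # -> 'Swar'
#   makePairForWrite('foo', u'bar', {'foo': 'XFOO'})     # -> 'XFOO:bar'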
def checkKeyLen(s):
    if not s:
        return ' '
    numNullBytes = s.count('\0') + 1
    xapianLen = numNullBytes + len(s) + 1 # that last one is for the
                                          # terminating \0
    if xapianLen < MAX_KEY_LEN:
        return s
    else:
        # doing nothing seems preferable to mangling an overly large
        # token that we don't know how to handle. we use a space
        # instead of an empty string because xapian doesn't like
        # getting empty strings added as terms
        return ' '
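# Illustrative examples of checkKeyLen: reasonable terms pass through
# untouched, while empty or oversized terms are replaced by a single
# space because xapian rejects empty terms and overly long keys.
#
#   checkKeyLen('hello')       # -> 'hello'
#   checkKeyLen('')            # -> ' '
#   checkKeyLen('x' * 1000)    # -> ' '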