3
# Copyright (C) 2007 Lemur Consulting Ltd
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation; either version 2 of the License, or
8
# (at your option) any later version.
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
# GNU General Public License for more details.
15
# You should have received a copy of the GNU General Public License along
16
# with this program; if not, write to the Free Software Foundation, Inc.,
17
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18
r"""fieldactions.py: Definitions and implementations of field actions.
21
__docformat__ = "restructuredtext en"
26
from replaylog import log
30
def _act_store_content(fieldname, doc, value, context):
31
"""Perform the STORE_CONTENT action.
35
fielddata = doc.data[fieldname]
38
doc.data[fieldname] = fielddata
39
fielddata.append(value)
41
def _act_index_exact(fieldname, doc, value, context):
42
"""Perform the INDEX_EXACT action.
45
doc.add_term(fieldname, value, 0)
47
def _act_tag(fieldname, doc, value, context):
48
"""Perform the TAG action.
51
doc.add_term(fieldname, value.lower(), 0)
53
def _act_facet(fieldname, doc, value, context, type=None):
    """Perform the FACET action.

    - `fieldname` is the field the facet belongs to.
    - `doc` is the document being built.
    - `value` is the facet value.
    - `context` is unused; it is accepted so that every action function can
      be called with the same leading arguments.
    - `type` is the facet type.  When it is None or 'string' the value is
      added as a term, and is also appended to the string list held in the
      field's 'facet' value.  For any other type the value is converted by
      the function returned from
      `SortableMarshaller.get_marshall_function()` and stored as the field's
      'facet' value.

    """
    if type is None or type == 'string':
        doc.add_term(fieldname, value, 0)
        # Re-open whatever is already stored in the 'facet' value so that
        # this value is appended to it rather than replacing it.
        serialiser = log(xapian.StringListSerialiser,
                         doc.get_value(fieldname, 'facet'))
        serialiser.append(value)
        doc.add_value(fieldname, serialiser.get(), 'facet')
    else:
        # Only one of the two branches may write the 'facet' value; running
        # both would store the value twice in different formats.
        marshaller = SortableMarshaller()
        fn = marshaller.get_marshall_function(fieldname, type)
        doc.add_value(fieldname, fn(fieldname, value), 'facet')
69
def _act_index_freetext(fieldname, doc, value, context, weight=1,
                        language=None, stop=None, spell=False,
                        allow_field_specific=True,
                        search_by_default=True,
                        nopos=False):
    """Perform the INDEX_FREETEXT action.

    Splits `value` into terms with a xapian TermGenerator and adds them to
    the document.

    - `fieldname` is the field being indexed.
    - `doc` is the document being built; its `_doc` and `_fieldmappings`
      attributes are used.
    - `value` is the text to index.
    - `context` supplies `current_position` (read and updated here) and,
      when `spell` is true, `index`.
    - `weight` is passed to the term generator as the weight of the text.
    - `language`, if not None, selects the stemmer to use.
    - `stop`, if not None, is an iterable of stopwords.
    - `spell`, if true, makes the term generator record spelling data in
      `context.index`.
    - `allow_field_specific`, if true, indexes a copy of the text under the
      field's prefix, for field-specific searches.
    - `search_by_default`, if true, indexes a copy of the text with no
      prefix, for non-field-specific searches.
    - `nopos`, if true, indexes the text without positional information.

    """
    termgen = log(xapian.TermGenerator)
    if language is not None:
        termgen.set_stemmer(log(xapian.Stem, language))

    if stop is not None:
        stopper = log(xapian.SimpleStopper)
        for term in stop:
            stopper.add(term)
        termgen.set_stopper(stopper)

    if spell:
        termgen.set_database(context.index)
        termgen.set_flags(termgen.FLAG_SPELLING)

    termgen.set_document(doc._doc)

    if search_by_default:
        termgen.set_termpos(context.current_position)
        # Store a copy of the field without a prefix, for non-field-specific
        # searches.
        if nopos:
            termgen.index_text_without_positions(value, weight, '')
        else:
            termgen.index_text(value, weight, '')

    if allow_field_specific:
        # Store a second copy of the term with a prefix, for field-specific
        # searches.
        prefix = doc._fieldmappings.get_prefix(fieldname)
        # Rewind so that both copies are given the same positions.
        termgen.set_termpos(context.current_position)
        if nopos:
            termgen.index_text_without_positions(value, weight, prefix)
        else:
            termgen.index_text(value, weight, prefix)

    # Add a gap between each field instance, so that phrase searches don't
    # match across instances.
    termgen.increase_termpos(10)
    context.current_position = termgen.get_termpos()
118
class SortableMarshaller(object):
    """Implementation of marshalling for sortable values.

    Each `marshall_*` method takes a field name and a value and returns the
    representation of the value which is to be stored.  Invalid values are
    reported by raising the exception class chosen in the constructor.

    """
    def __init__(self, indexing=True):
        """Create a marshaller.

        - `indexing` selects the exception class raised for bad input:
          `errors.IndexerError` if true, `errors.SearchError` otherwise.

        """
        if indexing:
            self._err = errors.IndexerError
        else:
            self._err = errors.SearchError

    def marshall_string(self, fieldname, value):
        """Marshall a value for sorting in lexicographical order.

        This returns the input as the output, since strings already sort in
        lexicographical order.

        """
        return value

    def marshall_float(self, fieldname, value):
        """Marshall a value for sorting as a floating point value.

        Raises the marshaller's error class if `value` cannot be converted
        with float().

        """
        # convert the value to a float
        try:
            value = float(value)
        except (TypeError, ValueError):
            raise self._err("Value supplied to field %r must be a "
                            "valid floating point number: was %r" %
                            (fieldname, value))
        return marshall.float_to_string(value)

    def marshall_date(self, fieldname, value):
        """Marshall a value for sorting as a date.

        Raises the marshaller's error class if
        `parsedate.date_from_string()` rejects `value` with a ValueError.

        """
        try:
            value = parsedate.date_from_string(value)
        except ValueError as e:
            raise self._err("Value supplied to field %r must be a "
                            "valid date: was %r: error is '%s'" %
                            (fieldname, value, str(e)))
        return marshall.date_to_string(value)

    def get_marshall_function(self, fieldname, sorttype):
        """Get a function used to marshall values of a given sorttype.

        - `fieldname` is used only in the error message.
        - `sorttype` is None, 'string', 'float' or 'date'.

        Returns the matching bound `marshall_*` method.  Raises the
        marshaller's error class for any other `sorttype`.

        """
        try:
            return {
                None: self.marshall_string,
                'string': self.marshall_string,
                'float': self.marshall_float,
                'date': self.marshall_date,
            }[sorttype]
        except KeyError:
            raise self._err("Unknown sort type %r for field %r" %
                            (sorttype, fieldname))
178
def _act_sort_and_collapse(fieldname, doc, value, context, type=None):
    """Perform the SORT_AND_COLLAPSE action.

    Marshalls `value` according to `type` and stores the result as the
    field's 'collsort' value.

    - `fieldname` is the field being processed.
    - `doc` is the document being built; its `add_value` method is called.
    - `value` is the field content to marshall.
    - `context` is unused; it is accepted so that every action function can
      be called with the same leading arguments.
    - `type` is the sort type, as accepted by
      `SortableMarshaller.get_marshall_function()`.

    """
    marshaller = SortableMarshaller()
    fn = marshaller.get_marshall_function(fieldname, type)
    value = fn(fieldname, value)
    doc.add_value(fieldname, value, 'collsort')
187
class ActionContext(object):
    """The context in which an action is performed.

    This is just used to pass term generators, word positions, and the like
    around.

    """
    def __init__(self, index):
        """Create a context.

        - `index` is kept as `self.index`; the free text action passes it to
          the term generator when spelling data is requested.

        """
        self.current_language = None
        # Term position at which the next free text field instance starts.
        self.current_position = 0
        self.index = index
199
class FieldActions(object):
200
"""An object describing the actions to be performed on a field.
202
The supported actions are:
204
- `STORE_CONTENT`: store the unprocessed content of the field in the search
205
engine database. All fields which need to be displayed or used when
206
displaying the search results need to be given this action.
208
- `INDEX_EXACT`: index the exact content of the field as a single search
209
term. Fields whose contents need to be searchable as an "exact match"
210
need to be given this action.
212
- `INDEX_FREETEXT`: index the content of this field as text. The content
213
will be split into terms, allowing free text searching of the field. Seven
214
optional parameters may be supplied:
216
- 'weight' is a multiplier to apply to the importance of the field. This
217
must be an integer, and the default value is 1.
218
- 'language' is the language to use when processing the field. This can
219
be expressed as an ISO 2-letter language code. The supported languages
220
are those supported by the xapian core in use.
221
- 'stop' is an iterable of stopwords to filter out of the generated
222
terms. Note that due to Xapian design, only non-positional terms are
223
affected, so this is of limited use.
224
- 'spell' is a boolean flag - if true, the contents of the field will be
225
used for spelling correction.
226
- 'nopos' is a boolean flag - if true, positional information is not
228
- 'allow_field_specific' is a boolean flag - if False, prevents terms with the field
229
prefix being generated. This means that searches specific to this
230
field will not work, and thus should only be used when only non-field
231
specific searches are desired. Defaults to True.
232
- 'search_by_default' is a boolean flag - if False, the field will not be
233
searched by non-field specific searches. If True, or omitted, the
234
field will be included in searches for non field-specific searches.
236
- `SORTABLE`: index the content of the field such that it can be used to
237
sort result sets. It also allows result sets to be restricted to those
238
documents with field values in a given range. One optional parameter
241
- 'type' is a value indicating how to sort the field. It has several
244
- 'string' - sort in lexicographic (ie, alphabetical) order.
245
This is the default, used if no type is set.
246
- 'float' - treat the values as (decimal representations of) floating
247
point numbers, and sort in numerical order. The values in the field
248
must be valid floating point numbers (according to Python's float()
250
- 'date' - sort in date order. The values must be valid dates (either
251
Python datetime.date objects, or ISO 8601 format (ie, YYYYMMDD or
254
- `COLLAPSE`: index the content of the field such that it can be used to
255
"collapse" result sets, such that only the highest result with each value
256
of the field will be returned.
258
- `TAG`: the field contains tags; these are strings, which will be matched
259
in a case insensitive way, but otherwise must be exact matches. Tag
260
fields can be searched for by making an explicit query (ie, using
261
query_field(), but not with query_parse()). A list of the most frequent
262
tags in a result set can also be accessed easily.
264
- `FACET`: the field represents a classification facet; these are strings
265
which will be matched exactly, but a list of all the facets present in
266
the result set can also be accessed easily - in addition, a suitable
267
subset of the facets, and a selection of the facet values, present in the
268
result set can be calculated. One optional parameter may be supplied:
270
- 'type' is a value indicating the type of facet contained in the field:
272
- 'string' - the facet values are exact binary strings.
273
- 'float' - the facet values are floating point numbers.
277
# See the class docstring for the meanings of the following constants.
286
# Sorting and collapsing store the data in a value, but the format depends
287
# on the sort type. Easiest way to implement is to treat them as the same
289
SORT_AND_COLLAPSE = -1
291
_unsupported_actions = []
293
if 'tags' in _checkxapian.missing_features:
294
_unsupported_actions.append(TAG)
295
if 'facets' in _checkxapian.missing_features:
296
_unsupported_actions.append(FACET)
298
def __init__(self, fieldname):
299
# Dictionary of actions, keyed by type.
301
self._fieldname = fieldname
303
def add(self, field_mappings, action, **kwargs):
304
"""Add an action to perform on a field.
307
if action in self._unsupported_actions:
308
raise errors.IndexerError("Action unsupported with this release of xapian")
310
if action not in (FieldActions.STORE_CONTENT,
311
FieldActions.INDEX_EXACT,
312
FieldActions.INDEX_FREETEXT,
313
FieldActions.SORTABLE,
314
FieldActions.COLLAPSE,
318
raise errors.IndexerError("Unknown field action: %r" % action)
320
info = self._action_info[action]
322
# Check parameter names
323
for key in kwargs.keys():
324
if key not in info[1]:
325
raise errors.IndexerError("Unknown parameter name for action %r: %r" % (info[0], key))
327
# Fields cannot be indexed both with "EXACT" and "FREETEXT": whilst we
328
# could implement this, the query parser wouldn't know what to do with
330
if action == FieldActions.INDEX_EXACT:
331
if FieldActions.INDEX_FREETEXT in self._actions:
332
raise errors.IndexerError("Field %r is already marked for indexing "
333
"as free text: cannot mark for indexing "
334
"as exact text as well" % self._fieldname)
335
if action == FieldActions.INDEX_FREETEXT:
336
if FieldActions.INDEX_EXACT in self._actions:
337
raise errors.IndexerError("Field %r is already marked for indexing "
338
"as exact text: cannot mark for indexing "
339
"as free text as well" % self._fieldname)
341
# Fields cannot be indexed as more than one type for "SORTABLE": to
342
# implement this, we'd need to use a different prefix for each sortable
343
# type, but even then the search end wouldn't know what to sort on when
344
# searching. Also, if they're indexed as "COLLAPSE", the value must be
345
# stored in the right format for the type "SORTABLE".
346
if action == FieldActions.SORTABLE or action == FieldActions.COLLAPSE:
347
if action == FieldActions.COLLAPSE:
351
sorttype = kwargs['type']
354
kwargs['type'] = sorttype
355
action = FieldActions.SORT_AND_COLLAPSE
358
oldsortactions = self._actions[FieldActions.SORT_AND_COLLAPSE]
362
if len(oldsortactions) > 0:
363
for oldsortaction in oldsortactions:
364
oldsorttype = oldsortaction['type']
366
if sorttype == oldsorttype or oldsorttype is None:
368
self._actions[action] = []
369
elif sorttype is None:
373
raise errors.IndexerError("Field %r is already marked for "
374
"sorting, with a different "
375
"sort type" % self._fieldname)
377
if 'prefix' in info[3]:
378
field_mappings.add_prefix(self._fieldname)
379
if 'slot' in info[3]:
380
purposes = info[3]['slot']
381
if isinstance(purposes, basestring):
382
field_mappings.add_slot(self._fieldname, purposes)
385
for purpose in purposes:
386
slotnum = field_mappings.get_slot(self._fieldname, purpose)
387
if slotnum is not None:
389
for purpose in purposes:
390
field_mappings.add_slot(self._fieldname, purpose, slotnum=slotnum)
392
# Make an entry for the action
393
if action not in self._actions:
394
self._actions[action] = []
396
# Check for repetitions of actions
397
for old_action in self._actions[action]:
398
if old_action == kwargs:
401
# Append the action to the list of actions
402
self._actions[action].append(kwargs)
404
def perform(self, doc, value, context):
405
"""Perform the actions on the field.
407
- `doc` is a ProcessedDocument to store the result of the actions in.
408
- `value` is a string holding the value of the field.
409
- `context` is an ActionContext object used to keep state in.
412
for type, actionlist in self._actions.iteritems():
413
info = self._action_info[type]
414
for kwargs in actionlist:
415
info[2](self._fieldname, doc, value, context, **kwargs)
418
STORE_CONTENT: ('STORE_CONTENT', (), _act_store_content, {}, ),
419
INDEX_EXACT: ('INDEX_EXACT', (), _act_index_exact, {'prefix': True}, ),
420
INDEX_FREETEXT: ('INDEX_FREETEXT', ('weight', 'language', 'stop', 'spell', 'nopos', 'allow_field_specific', 'search_by_default', ),
421
_act_index_freetext, {'prefix': True, }, ),
422
SORTABLE: ('SORTABLE', ('type', ), None, {'slot': 'collsort',}, ),
423
COLLAPSE: ('COLLAPSE', (), None, {'slot': 'collsort',}, ),
424
TAG: ('TAG', (), _act_tag, {'prefix': True,}, ),
425
FACET: ('FACET', ('type', ), _act_facet, {'prefix': True, 'slot': 'facet',}, ),
427
SORT_AND_COLLAPSE: ('SORT_AND_COLLAPSE', ('type', ), _act_sort_and_collapse, {'slot': 'collsort',}, ),
430
if __name__ == '__main__':
    # Run the module's doctests when executed as a script.  The imports are
    # local because they are only needed on this path.
    import doctest
    import sys
    doctest.testmod(sys.modules[__name__])