# -*- coding: iso-8859-1 -*-
"""
    MoinMoin - xapian search engine

    @copyright: 2006-2008 MoinMoin:ThomasWaldmann,
                2006 MoinMoin:FranzPletz
    @license: GNU GPL, see COPYING for details.
"""
import os
import re

import xapian
from xapian import Query

from MoinMoin import log
logging = log.getLogger(__name__)

from MoinMoin.support.xapwrap import document as xapdoc
from MoinMoin.support.xapwrap import index as xapidx
from MoinMoin.parser.text_moin_wiki import Parser as WikiParser

from MoinMoin.Page import Page
from MoinMoin import config, wikiutil
from MoinMoin.search.builtin import BaseIndex
27
class UnicodeQuery(Query):
    """ Xapian query object which automatically encodes unicode strings """

    def __init__(self, *args, **kwargs):
        """
        @keyword encoding: specify the encoding manually (default: value of config.charset)
        """
        self.encoding = kwargs.get('encoding', config.charset)

        # encode every term (or list/tuple of terms) before handing it
        # to the C++ xapian Query, which wants byte strings
        nargs = []
        for term in args:
            if isinstance(term, unicode):
                term = term.encode(self.encoding)
            elif isinstance(term, list) or isinstance(term, tuple):
                term = [t.encode(self.encoding) for t in term]
            nargs.append(term)

        Query.__init__(self, *nargs, **kwargs)
46
##############################################################################
### Tokenizer
##############################################################################
50
def getWikiAnalyzerFactory(request=None, language='en'):
    """ Returns a WikiAnalyzer instance

    @keyword request: current request object
    @keyword language: stemming language iso code, defaults to 'en'
    """
    # a zero-argument factory, as expected by xapwrap's analyzerFactory hook
    return (lambda: WikiAnalyzer(request, language))
59
""" A text analyzer for wiki syntax
61
The purpose of this class is to anaylze texts/pages in wiki syntax
62
and yield yielding single terms for xapwrap to feed into the xapian
66
singleword = r"[%(u)s][%(l)s]+" % {
67
'u': config.chars_upper,
68
'l': config.chars_lower,
71
singleword_re = re.compile(singleword, re.U)
72
wikiword_re = re.compile(WikiParser.word_rule, re.UNICODE|re.VERBOSE)
74
token_re = re.compile(
75
r"(?P<company>\w+[&@]\w+)|" + # company names like AT&T and Excite@Home.
76
r"(?P<email>\w+([.-]\w+)*@\w+([.-]\w+)*)|" + # email addresses
77
r"(?P<hostname>\w+(\.\w+)+)|" + # hostnames
78
r"(?P<acronym>(\w\.)+)|" + # acronyms: U.S.A., I.B.M., etc.
79
r"(?P<word>\w+)", # words (including WikiWords)
82
dot_re = re.compile(r"[-_/,.]")
83
mail_re = re.compile(r"[-_/,.]|(@)")
84
alpha_num_re = re.compile(r"\d+|\D+")
86
# XXX limit stuff above to xapdoc.MAX_KEY_LEN
87
# WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U)
89
def __init__(self, request=None, language=None):
91
@param request: current request
92
@param language: if given, the language in which to stem words
95
if request and request.cfg.xapian_stemming and language:
97
stemmer = xapian.Stem(language)
98
# we need this wrapper because the stemmer returns a utf-8
99
# encoded string even when it gets fed with unicode objects:
100
self.stemmer = lambda word: stemmer(word).decode('utf-8')
101
except xapian.InvalidArgumentError:
102
# lang is not stemmable or not available
105
def raw_tokenize_word(self, word, pos):
106
""" try to further tokenize some word starting at pos """
107
if self.wikiword_re.match(word):
109
# if it is a CamelCaseWord, we additionally try to tokenize Camel, Case and Word
110
for m in re.finditer(self.singleword_re, word):
111
for w, p in self.raw_tokenize_word(m.group(), pos + m.start()):
114
# if we have Foo42, yield Foo and 42
115
for m in re.finditer(self.alpha_num_re, word):
116
yield (m.group(), pos + m.start())
118
def raw_tokenize(self, value):
119
""" Yield a stream of words from a string.
121
@param value: string to split, must be an unicode object or a list of
124
if isinstance(value, list): # used for page links
128
tokenstream = re.finditer(self.token_re, value)
129
for m in tokenstream:
130
if m.group("acronym"):
131
yield (m.group("acronym").replace('.', ''), m.start())
132
elif m.group("company"):
133
yield (m.group("company"), m.start())
134
elif m.group("email"):
136
for word in self.mail_re.split(m.group("email")):
138
yield (word, m.start() + displ)
139
displ += len(word) + 1
140
elif m.group("hostname"):
142
for word in self.dot_re.split(m.group("hostname")):
143
yield (word, m.start() + displ)
144
displ += len(word) + 1
145
elif m.group("word"):
146
for word, pos in self.raw_tokenize_word(m.group("word"), m.start()):
149
def tokenize(self, value, flat_stemming=True):
150
""" Yield a stream of lower cased raw and stemmed words from a string.
152
@param value: string to split, must be an unicode object or a list of
154
@keyword flat_stemming: whether to yield stemmed terms automatically
155
with the natural forms (True) or
156
yield both at once as a tuple (False)
158
for word, pos in self.raw_tokenize(value):
159
word = word.lower() # transform it into what xapian wants
163
yield (self.stemmer(word), pos)
165
yield (word, self.stemmer(word), pos)
168
#############################################################################
### Indexing
#############################################################################
172
class Index(BaseIndex):
    """ A Xapian index """

    # mapping the value names we can easily fetch from the index to
    # integers required by xapian. 0 and 1 are reserved by xapwrap!
    indexValueMap = {
        'pagename': 2,
        'attachment': 3,
        'mtime': 4,
        'wikiname': 5,
        'revision': 6,
    }

    # term prefixes, following the omega conventions, see
    # http://svn.xapian.org/*checkout*/trunk/xapian-applications/omega/docs/termprefixes.txt
    # NOTE(review): the A/H/K/T entries were reconstructed from those
    # conventions - verify against the upstream file
    prefixMap = {
        'author': 'A',
        'date': 'D', # numeric format: YYYYMMDD or "latest" - e.g. D20050224 or Dlatest
                     #G   newsGroup (or similar entity - e.g. a web forum name)
        'hostname': 'H',
        'keyword': 'K',
        'lang': 'L', # ISO Language code
                     #M   Month (numeric format: YYYYMM)
                     #N   ISO couNtry code (or domaiN name)
        'raw': 'R', # Raw (i.e. unstemmed) term
        'title': 'S', # Subject (or title)
        'mimetype': 'T',
        'url': 'U', # full URL of indexed document - if the resulting term would be > 240
                    # characters, a hashing scheme is used to prevent overflowing
                    # the Xapian term length limit (see omindex for how to do this).
                    #W   "weak" (approximately 10 day intervals, taken as YYYYMMD from
                    #    the D term, and changing the last digit to a '2' if it's a '3')
                    #X   longer prefix for user-defined use
        'linkto': 'XLINKTO', # this document links to that document
        'stem_lang': 'XSTEMLANG', # ISO Language code this document was stemmed in
        'category': 'XCAT', # category this document belongs to
        'fulltitle': 'XFT', # full title
        'domain': 'XDOMAIN', # standard or underlay
        'revision': 'XREV', # revision of page
                    #Y   year (four digits)
    }
213
def __init__(self, request):
214
self._check_version()
215
BaseIndex.__init__(self, request)
217
def _check_version(self):
218
""" Checks if the correct version of Xapian is installed """
219
# every version greater than or equal to XAPIAN_MIN_VERSION is allowed
220
XAPIAN_MIN_VERSION = (1, 0, 0)
221
major, minor, revision = xapian.major_version(), xapian.minor_version(), xapian.revision()
222
if (major, minor, revision) >= XAPIAN_MIN_VERSION:
225
from MoinMoin.error import ConfigurationError
226
raise ConfigurationError(('MoinMoin needs at least Xapian version '
227
'%d.%d.%d to work correctly. Either disable Xapian '
228
'completetly in your wikiconfig or upgrade your Xapian %d.%d.%d '
229
'installation!') % (XAPIAN_MIN_VERSION + (major, minor, revision)))
232
""" Get the directory of the xapian index """
233
if self.request.cfg.xapian_index_dir:
234
return os.path.join(self.request.cfg.xapian_index_dir,
235
self.request.cfg.siteid)
237
return os.path.join(self.request.cfg.cache_dir, 'xapian')
240
""" Check if the Xapian index exists """
241
return BaseIndex.exists(self) and os.listdir(self.dir)
243
def _search(self, query, sort='weight', historysearch=0):
244
""" Perform the search using xapian (read-lock acquired)
246
@param query: the search query objects
247
@keyword sort: the sorting of the results (default: 'weight')
248
@keyword historysearch: whether to search in all page revisions (default: 0) TODO: use/implement this
252
searcher, timestamp = self.request.cfg.xapian_searchers.pop()
253
if timestamp != self.mtime():
258
searcher = xapidx.ReadOnlyIndex(self.dir)
259
searcher.configure(self.prefixMap, self.indexValueMap)
260
timestamp = self.mtime()
265
# XXX: we need real weight here, like _moinSearch
266
# (TradWeight in xapian)
267
kw['sortByRelevence'] = True
268
kw['sortKey'] = 'revision'
269
if sort == 'page_name':
270
kw['sortKey'] = 'pagename'
272
hits = searcher.search(query, valuesWanted=['pagename',
273
'attachment', 'mtime', 'wikiname', 'revision'], **kw)
274
self.request.cfg.xapian_searchers.append((searcher, timestamp))
277
def _do_queued_updates(self, request, amount=5):
278
""" Assumes that the write lock is acquired """
280
writer = xapidx.Index(self.dir, True)
281
writer.configure(self.prefixMap, self.indexValueMap)
283
# do all page updates
284
pages = self.update_queue.pages()[:amount]
286
p = Page(request, name)
287
if request.cfg.xapian_index_history:
288
for rev in p.getRevList():
289
self._index_page(writer, Page(request, name, rev=rev), mode='update')
291
self._index_page(writer, p, mode='update')
292
self.update_queue.remove([name])
294
# do page/attachment removals
295
items = self.remove_queue.pages()[:amount]
297
_item = item.split('//')
298
p = Page(request, _item[0])
299
self._remove_item(writer, p, _item[1])
300
self.remove_queue.remove([item])
305
""" Fetches all terms in the Xapian index """
306
db = xapidx.ExceptionTranslater.openIndex(True, self.dir)
307
i = db.allterms_begin()
308
while i != db.allterms_end():
312
def termpositions(self, uid, term):
313
""" Fetches all positions of a term in a document
315
@param uid: document id of the item in the xapian index
316
@param term: the term as a string
318
db = xapidx.ExceptionTranslater.openIndex(True, self.dir)
319
pos = db.positionlist_begin(uid, term)
320
while pos != db.positionlist_end(uid, term):
321
yield pos.get_termpos()
324
def _index_file(self, request, writer, filename, mode='update'):
325
""" index a file as it were a page named pagename
326
Assumes that the write lock is acquired
328
fs_rootpage = 'FS' # XXX FS hardcoded
331
wikiname = request.cfg.interwikiname or 'Self'
332
itemid = "%s:%s" % (wikiname, os.path.join(fs_rootpage, filename))
333
mtime = os.path.getmtime(filename)
334
mtime = wikiutil.timestamp2version(mtime)
336
query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid))
337
enq, mset, docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ])
339
doc = docs[0] # there should be only one
341
docmtime = long(doc['values']['mtime'])
342
updated = mtime > docmtime
343
logging.debug("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated))
349
logging.debug("%s %r" % (filename, updated))
351
xitemid = xapdoc.Keyword('itemid', itemid)
352
mimetype, file_content = self.contentfilter(filename)
353
xwname = xapdoc.SortKey('wikiname', request.cfg.interwikiname or "Self")
354
xpname = xapdoc.SortKey('pagename', fs_rootpage)
355
xattachment = xapdoc.SortKey('attachment', filename) # XXX we should treat files like real pages, not attachments
356
xmtime = xapdoc.SortKey('mtime', mtime)
357
xrev = xapdoc.SortKey('revision', '0')
358
title = " ".join(os.path.join(fs_rootpage, filename).split("/"))
359
xtitle = xapdoc.Keyword('title', title)
360
xmimetypes = [xapdoc.Keyword('mimetype', mt) for mt in [mimetype, ] + mimetype.split('/')]
361
xcontent = xapdoc.TextField('content', file_content)
362
doc = xapdoc.Document(textFields=(xcontent, ),
363
keywords=xmimetypes + [xtitle, xitemid, ],
364
sortFields=(xpname, xattachment,
365
xmtime, xwname, xrev, ),
367
doc.analyzerFactory = getWikiAnalyzerFactory()
369
logging.debug("%s (replace %r)" % (filename, uid))
371
id = writer.index(doc)
373
logging.debug("%s (add)" % (filename, ))
374
id = writer.index(doc)
375
except (OSError, IOError):
378
def _get_languages(self, page):
379
""" Get language of a page and the language to stem it in
381
@param page: the page instance
384
default_lang = page.request.cfg.language_default
386
# if we should stem, we check if we have stemmer for the language available
387
if page.request.cfg.xapian_stemming:
388
lang = page.pi['language']
391
# if there is no exception, lang is stemmable
393
except xapian.InvalidArgumentError:
394
# lang is not stemmable
398
# no lang found at all.. fallback to default language
401
# return actual lang and lang to stem in
402
return (lang, default_lang)
404
def _get_categories(self, page):
405
""" Get all categories the page belongs to through the old
408
@param page: the page instance
410
body = page.get_raw_body()
417
prev, next = next, re.search(r'----*\r?\n', body[pos:])
419
if not prev or prev == 1:
421
# for CategoryFoo, group 'all' matched CategoryFoo, group 'key' matched just Foo
422
return [m.group('all').lower() for m in self.request.cfg.cache.page_category_regex.finditer(body[pos:])]
424
def _get_domains(self, page):
425
""" Returns a generator with all the domains the page belongs to
429
if page.isUnderlayPage():
431
if page.isStandardPage():
433
if wikiutil.isSystemPage(self.request, page.page_name):
436
def _index_page(self, writer, page, mode='update'):
437
""" Index a page - assumes that the write lock is acquired
439
@arg writer: the index writer object
440
@arg page: a page object
441
@arg mode: 'add' = just add, no checks
442
'update' = check if already in index and update if needed (mtime)
444
request = page.request
445
wikiname = request.cfg.interwikiname or "Self"
446
pagename = page.page_name
447
mtime = page.mtime_usecs()
448
revision = str(page.get_real_rev())
449
itemid = "%s:%s:%s" % (wikiname, pagename, revision)
450
author = page.edit_info().get('editor', '?')
451
# XXX: Hack until we get proper metadata
452
language, stem_language = self._get_languages(page)
453
categories = self._get_categories(page)
454
domains = tuple(self._get_domains(page))
458
# from #xapian: if you generate a special "unique id" term,
459
# you can just call database.replace_document(uid_term, doc)
460
# -> done in xapwrap.index.Index.index()
461
query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid))
462
enq, mset, docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ])
464
doc = docs[0] # there should be only one
466
docmtime = long(doc['values']['mtime'])
467
updated = mtime > docmtime
468
logging.debug("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated))
474
logging.debug("%s %r" % (pagename, updated))
476
xwname = xapdoc.SortKey('wikiname', wikiname)
477
xpname = xapdoc.SortKey('pagename', pagename)
478
xattachment = xapdoc.SortKey('attachment', '') # this is a real page, not an attachment
479
xmtime = xapdoc.SortKey('mtime', str(mtime))
480
xrev = xapdoc.SortKey('revision', revision)
481
xtitle = xapdoc.TextField('title', pagename, True) # prefixed
482
mimetype = 'text/%s' % page.pi['format'] # XXX improve this
483
xkeywords = [xapdoc.Keyword('itemid', itemid),
484
xapdoc.Keyword('lang', language),
485
xapdoc.Keyword('stem_lang', stem_language),
486
xapdoc.Keyword('fulltitle', pagename),
487
xapdoc.Keyword('revision', revision),
488
xapdoc.Keyword('author', author),
490
[xapdoc.Keyword('mimetype', mt) for mt in [mimetype, ] + mimetype.split('/')]
492
for pagelink in page.getPageLinks(request):
493
xkeywords.append(xapdoc.Keyword('linkto', pagelink))
494
for category in categories:
495
xkeywords.append(xapdoc.Keyword('category', category))
496
for domain in domains:
497
xkeywords.append(xapdoc.Keyword('domain', domain))
498
xcontent = xapdoc.TextField('content', page.get_raw_body())
499
doc = xapdoc.Document(textFields=(xcontent, xtitle),
501
sortFields=(xpname, xattachment,
502
xmtime, xwname, xrev),
504
doc.analyzerFactory = getWikiAnalyzerFactory(request,
508
logging.debug("%s (replace %r)" % (pagename, uid))
510
id = writer.index(doc)
512
logging.debug("%s (add)" % (pagename, ))
513
id = writer.index(doc)
515
from MoinMoin.action import AttachFile
517
attachments = AttachFile._get_files(request, pagename)
518
for att in attachments:
519
filename = AttachFile.getFilename(request, pagename, att)
520
att_itemid = "%s:%s//%s" % (wikiname, pagename, att)
521
mtime = wikiutil.timestamp2version(os.path.getmtime(filename))
523
query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', att_itemid))
524
enq, mset, docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', ])
525
logging.debug("##%r %r" % (filename, docs))
527
doc = docs[0] # there should be only one
529
docmtime = long(doc['values']['mtime'])
530
updated = mtime > docmtime
531
logging.debug("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated))
537
logging.debug("%s %s %r" % (pagename, att, updated))
539
xatt_itemid = xapdoc.Keyword('itemid', att_itemid)
540
xpname = xapdoc.SortKey('pagename', pagename)
541
xwname = xapdoc.SortKey('wikiname', request.cfg.interwikiname or "Self")
542
xattachment = xapdoc.SortKey('attachment', att) # this is an attachment, store its filename
543
xmtime = xapdoc.SortKey('mtime', mtime)
544
xrev = xapdoc.SortKey('revision', '0')
545
xtitle = xapdoc.Keyword('title', '%s/%s' % (pagename, att))
546
xlanguage = xapdoc.Keyword('lang', language)
547
xstem_language = xapdoc.Keyword('stem_lang', stem_language)
548
mimetype, att_content = self.contentfilter(filename)
549
xmimetypes = [xapdoc.Keyword('mimetype', mt) for mt in [mimetype, ] + mimetype.split('/')]
550
xcontent = xapdoc.TextField('content', att_content)
551
xtitle_txt = xapdoc.TextField('title',
552
'%s/%s' % (pagename, att), True)
553
xfulltitle = xapdoc.Keyword('fulltitle', pagename)
554
xdomains = [xapdoc.Keyword('domain', domain)
555
for domain in domains]
556
doc = xapdoc.Document(textFields=(xcontent, xtitle_txt),
557
keywords=xdomains + xmimetypes + [xatt_itemid,
558
xtitle, xlanguage, xstem_language,
560
sortFields=(xpname, xattachment, xmtime,
563
doc.analyzerFactory = getWikiAnalyzerFactory(request,
566
logging.debug("%s (replace %r)" % (pagename, uid))
568
id = writer.index(doc)
570
logging.debug("%s (add)" % (pagename, ))
571
id = writer.index(doc)
574
def _remove_item(self, writer, page, attachment=None):
575
request = page.request
576
wikiname = request.cfg.interwikiname or 'Self'
577
pagename = page.page_name
580
# Remove all revisions and attachments from the index
581
query = xapidx.RawQuery(xapidx.makePairForWrite(
582
self.prefixMap['fulltitle'], pagename))
583
enq, mset, docs = writer.search(query, valuesWanted=['pagename',
586
writer.delete_document(doc['uid'])
587
logging.debug('%s removed from xapian index' %
588
doc['values']['pagename'])
590
# Only remove a single attachment
591
query = xapidx.RawQuery(xapidx.makePairForWrite('itemid',
592
"%s:%s//%s" % (wikiname, pagename, attachment)))
593
enq, mset, docs = writer.search(query, valuesWanted=['pagename',
596
doc = docs[0] # there should be only one
597
writer.delete_document(doc['uid'])
598
logging.debug('attachment %s from %s removed from index' %
599
(doc['values']['attachment'], doc['values']['pagename']))
601
def _index_pages(self, request, files=None, mode='update'):
602
""" Index all pages (and all given files)
604
This should be called from indexPages or indexPagesInNewThread only!
606
This may take some time, depending on the size of the wiki and speed
609
When called in a new thread, lock is acquired before the call,
610
and this method must release it when it finishes or fails.
612
@param request: the current request
613
@keyword files: an optional list of files to index
614
@keyword mode: how to index the files, either 'add', 'update' or
618
# rebuilding the DB: delete it and add everything
619
if mode == 'rebuild':
620
for f in os.listdir(self.dir):
621
os.unlink(os.path.join(self.dir, f))
626
writer = xapidx.Index(self.dir, True)
627
writer.configure(self.prefixMap, self.indexValueMap)
628
pages = request.rootpage.getPageList(user='', exists=1)
629
logging.debug("indexing all (%d) pages..." % len(pages))
630
for pagename in pages:
631
p = Page(request, pagename)
633
if request.cfg.xapian_index_history:
634
for rev in p.getRevList():
635
self._index_page(writer,
636
Page(request, pagename, rev=rev),
639
self._index_page(writer, p, mode)
641
logging.debug("indexing all files...")
643
fname = fname.strip()
644
self._index_file(request, writer, fname, mode)