# -*- coding: iso-8859-1 -*-
"""
    MoinMoin - search engine internals

    @copyright: 2005 MoinMoin:FlorianFesti,
                2005 MoinMoin:NirSoffer,
                2005 MoinMoin:AlexanderSchremmer,
                2006-2008 MoinMoin:ThomasWaldmann,
                2006 MoinMoin:FranzPletz
    @license: GNU GPL, see COPYING for details
"""
import sys, os, time, errno, codecs

from MoinMoin import log
logging = log.getLogger(__name__)

from MoinMoin import wikiutil, config
from MoinMoin.Page import Page
from MoinMoin.util import lock, filesys
from MoinMoin.search.results import getSearchResults
from MoinMoin.search.queryparser import Match, TextMatch, TitleMatch
24
##############################################################################
# Search Engine Abstraction
##############################################################################
29
class UpdateQueue:
    """ Represents a locked page queue on the disk

        XXX: check whether we just can use the caching module
    """

    def __init__(self, f, lock_dir):
        """ Create the queue.

        @param f: file to write to
        @param lock_dir: directory to save the lock files
        """
        self.file = f
        self.writeLock = lock.WriteLock(lock_dir, timeout=10.0)
        self.readLock = lock.ReadLock(lock_dir, timeout=10.0)

    def exists(self):
        """ Checks if the queue exists on the filesystem """
        return os.path.exists(self.file)

    def append(self, pagename):
        """ Append a page to queue

        @param pagename: string to save
        """
        if not self.writeLock.acquire(60.0):
            logging.warning("can't add %r to xapian update queue: can't lock queue" % pagename)
            return
        try:
            f = codecs.open(self.file, 'a', config.charset)
            try:
                f.write(pagename + "\n")
            finally:
                f.close()
        finally:
            self.writeLock.release()

    def pages(self):
        """ Return list of pages in the queue """
        if self.readLock.acquire(1.0):
            try:
                return self._decode(self._read())
            finally:
                self.readLock.release()
        return []

    def remove(self, pages):
        """ Remove pages from the queue

        When the queue is empty, the queue file is removed, so exists()
        can tell if there is something waiting in the queue.

        @param pages: list of pagenames to remove
        """
        if self.writeLock.acquire(30.0):
            try:
                queue = self._decode(self._read())
                for page in pages:
                    # best-effort: a page may already be gone from the queue
                    try:
                        queue.remove(page)
                    except ValueError:
                        pass
                if queue:
                    self._write(queue)
                else:
                    self._removeFile()
                return True
            finally:
                self.writeLock.release()
        return False

    # Private -------------------------------------------------------

    def _decode(self, data):
        """ Decode queue data

        @param data: the data to decode
        """
        pages = data.splitlines()
        return self._filterDuplicates(pages)

    def _filterDuplicates(self, pages):
        """ Filter duplicates in page list, keeping the order

        @param pages: list of pages to filter
        """
        unique = []
        seen = {}
        for name in pages:
            if name not in seen:
                unique.append(name)
                seen[name] = 1
        return unique

    def _read(self):
        """ Read and return queue data

        This does not do anything with the data so we can release the
        lock as soon as possible, enabling others to update the queue.
        """
        try:
            f = codecs.open(self.file, 'r', config.charset)
            try:
                return f.read()
            finally:
                f.close()
        except (OSError, IOError) as err:
            if err.errno != errno.ENOENT:
                raise
            # a missing queue file simply means an empty queue
            return ''

    def _write(self, pages):
        """ Write pages to queue file

        Requires queue write locking.

        @param pages: list of pages to write
        """
        # XXX use tmpfile/move for atomic replace on real operating systems
        data = '\n'.join(pages) + '\n'
        f = codecs.open(self.file, 'w', config.charset)
        try:
            f.write(data)
        finally:
            f.close()

    def _removeFile(self):
        """ Remove queue file

        Requires queue write locking.
        """
        try:
            os.remove(self.file)
        except OSError as err:
            if err.errno != errno.ENOENT:
                raise
166
class BaseIndex:
    """ Represents a search engine index """

    class LockedException(Exception):
        # raised by subclasses when the index is locked by another process
        pass

    def __init__(self, request):
        """
        @param request: current request
        """
        self.request = request
        main_dir = self._main_dir()
        self.dir = os.path.join(main_dir, 'index')
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)
        # sig file marks a complete index build, see exists()/_sign()/_unsign()
        self.sig_file = os.path.join(main_dir, 'complete')
        lock_dir = os.path.join(main_dir, 'index-lock')
        self.lock = lock.WriteLock(lock_dir,
                                   timeout=3600.0, readlocktimeout=60.0)
        #self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0)
        self.update_queue = UpdateQueue(os.path.join(main_dir, 'update-queue'),
                                        os.path.join(main_dir, 'update-queue-lock'))
        self.remove_queue = UpdateQueue(os.path.join(main_dir, 'remove-queue'),
                                        os.path.join(main_dir, 'remove-queue-lock'))

        # Disabled until we have a sane way to build the index with a
        # queue in small steps.
        ## if not self.exists():
        ##    self.indexPagesInNewThread(request)

    def _main_dir(self):
        """ Return the main index directory (implemented by subclasses). """
        # was "raise NotImplemented('...')" - NotImplemented is not an
        # exception and raising it is a TypeError; use NotImplementedError
        raise NotImplementedError

    def exists(self):
        """ Check if index exists """
        return os.path.exists(self.sig_file)

    def mtime(self):
        """ Modification time of the index """
        return os.path.getmtime(self.dir)

    def touch(self):
        """ Touch the index """
        filesys.touch(self.dir)

    def _search(self, query):
        """ Actually perfom the search (read-lock acquired)

        @param query: the search query objects tree
        """
        raise NotImplementedError

    def search(self, query, **kw):
        """ Search for items in the index

        @param query: the search query objects to pass to the index
        """
        #if not self.read_lock.acquire(1.0):
        #    raise self.LockedException
        #try:
        hits = self._search(query, **kw)
        #finally:
        #    self.read_lock.release()
        return hits

    def update_page(self, pagename, now=1):
        """ Update a single page in the index

        @param pagename: the name of the page to update

        @keyword now: do all updates now (default: 1)
        """
        self.update_queue.append(pagename)
        if now:
            self._do_queued_updates_InNewThread()

    def remove_item(self, pagename, attachment=None, now=1):
        """ Removes a page and all its revisions or a single attachment

        @param pagename: name of the page to be removed
        @keyword attachment: optional, only remove this attachment of the page
        @keyword now: do all updates now (default: 1)
        """
        self.remove_queue.append('%s//%s' % (pagename, attachment or ''))
        if now:
            self._do_queued_updates_InNewThread()

    def indexPages(self, files=None, mode='update'):
        """ Index all pages (and files, if given)

        Can be called only from a script. To index pages during a user
        request, use indexPagesInNewThread.
        @keyword files: iterator or list of files to index additionally
        @keyword mode: set the mode of indexing the pages, either 'update', 'add' or 'rebuild'
        """
        if not self.lock.acquire(1.0):
            logging.warning("can't index: can't acquire lock")
            return
        try:
            self._unsign()
            start = time.time()
            request = self._indexingRequest(self.request)
            self._index_pages(request, files, mode)
            logging.info("indexing completed successfully in %0.2f seconds." %
                         (time.time() - start))
            self._sign()
        finally:
            self.lock.release()

    def indexPagesInNewThread(self, files=None, mode='update'):
        """ Index all pages in a new thread

        Should be called from a user request. From a script, use indexPages.
        """
        # Prevent rebuilding the index just after it was finished
        if self.exists():
            return

        from threading import Thread
        indexThread = Thread(target=self._index_pages, args=(files, mode))
        indexThread.setDaemon(True)

        # Join the index thread after current request finish, prevent
        # Apache CGI from killing the process.
        def joinDecorator(finish):
            def func():
                finish()
                indexThread.join()
            return func

        self.request.finish = joinDecorator(self.request.finish)
        indexThread.start()

    def _index_pages(self, request, files=None, mode='update'):
        """ Index all pages (and all given files)

        This should be called from indexPages or indexPagesInNewThread only!

        This may take some time, depending on the size of the wiki and speed
        of the machine.

        When called in a new thread, lock is acquired before the call,
        and this method must release it when it finishes or fails.

        @param request: current request
        @keyword files: iterator or list of files to index additionally
        @keyword mode: set the mode of indexing the pages, either 'update',
        'add' or 'rebuild'
        """
        raise NotImplementedError

    def _remove_item(self, writer, page, attachment=None):
        """ Remove a page and all its revisions from the index or just
            an attachment of that page

        @param page: page to remove
        @keyword attachment: optionally, just remove this attachment
        """
        raise NotImplementedError

    def _do_queued_updates_InNewThread(self):
        """ do queued index updates in a new thread

        Should be called from a user request. From a script, use indexPages.
        """
        if not self.lock.acquire(1.0):
            logging.warning("can't index: can't acquire lock")
            return
        try:
            def lockedDecorator(f):
                # run f, always releasing the index lock afterwards
                def func(*args, **kwargs):
                    try:
                        return f(*args, **kwargs)
                    finally:
                        self.lock.release()
                return func

            from threading import Thread
            indexThread = Thread(
                target=lockedDecorator(self._do_queued_updates),
                args=(self._indexingRequest(self.request), ))
            indexThread.setDaemon(True)

            # Join the index thread after current request finish, prevent
            # Apache CGI from killing the process.
            def joinDecorator(finish):
                def func():
                    finish()
                    indexThread.join()
                return func

            self.request.finish = joinDecorator(self.request.finish)
            indexThread.start()
        except:
            # thread was not started, so the decorator will not release
            self.lock.release()
            raise

    def _do_queued_updates(self, request, amount=5):
        """ Perform updates in the queues (read-lock acquired)

        @param request: the current request
        @keyword amount: how many updates to perform at once (default: 5)
        """
        raise NotImplementedError

    def optimize(self):
        """ Optimize the index if possible """
        raise NotImplementedError

    def contentfilter(self, filename):
        """ Get a filter for content of filename and return unicode content.

        @param filename: name of the file
        """
        request = self.request
        mt = wikiutil.MimeType(filename=filename)
        for modulename in mt.module_name():
            try:
                execute = wikiutil.importPlugin(request.cfg, 'filter', modulename)
                break
            except wikiutil.PluginMissingError:
                pass
        else:
            logging.info("Cannot load filter for mimetype %s" % modulename)
        try:
            data = execute(self, filename)
            logging.debug("Filter %s returned %d characters for file %s" % (modulename, len(data), filename))
        except (OSError, IOError) as err:
            data = ''
            logging.warning("Filter %s threw error '%s' for file %s" % (modulename, str(err), filename))
        return mt.mime_type(), data

    def _indexingRequest(self, request):
        """ Return a new request that can be used for index building.

        This request uses a security policy that lets the current user
        read any page. Without this policy some pages will not render,
        which will create broken pagelinks index.

        @param request: current request
        """
        from MoinMoin.request.request_cli import Request
        from MoinMoin.security import Permissions
        request = Request(request.url)
        class SecurityPolicy(Permissions):
            def read(self, *args, **kw):
                return True
        request.user.may = SecurityPolicy(request.user)
        return request

    def _unsign(self):
        """ Remove sig file - assume write lock acquired """
        try:
            os.remove(self.sig_file)
        except OSError as err:
            if err.errno != errno.ENOENT:
                raise

    def _sign(self):
        """ Add sig file - assume write lock acquired """
        f = open(self.sig_file, 'w')
        try:
            f.write('')
        finally:
            f.close()
430
##############################################################################
### Searching
##############################################################################
437
class Search:
    """ A search run """

    def __init__(self, request, query, sort='weight', mtime=None,
            historysearch=0):
        """
        @param request: current request
        @param query: search query objects tree
        @keyword sort: the sorting of the results (default: 'weight')
        @keyword mtime: only show items newer than this timestamp (default: None)
        @keyword historysearch: whether to show old revisions of a page (default: 0)
        """
        self.request = request
        self.query = query
        self.sort = sort
        self.mtime = mtime
        self.historysearch = historysearch
        self.filtered = False
        self.fs_rootpage = "FS" # XXX FS hardcoded

    def run(self):
        """ Perform search and return results object """
        start = time.time()
        if self.request.cfg.xapian_search:
            hits = self._xapianSearch()
            logging.debug("_xapianSearch found %d hits" % len(hits))
        else:
            hits = self._moinSearch()
            logging.debug("_moinSearch found %d hits" % len(hits))

        # important - filter deleted pages or pages the user may not read!
        if not self.filtered:
            hits = self._filter(hits)
            logging.debug("after filtering: %d hits" % len(hits))

        # when xapian was used, we can estimate the number of matches
        # Note: hits can't be estimated by xapian with historysearch enabled
        if not self.request.cfg.xapian_index_history and hasattr(self, '_xapianMset'):
            _ = self.request.getText
            mset = self._xapianMset
            m_lower = mset.get_matches_lower_bound()
            m_estimated = mset.get_matches_estimated()
            m_upper = mset.get_matches_upper_bound()
            estimated_hits = (m_estimated == m_upper and m_estimated == m_lower
                              and '' or _('about'), m_estimated)
        else:
            estimated_hits = None

        return getSearchResults(self.request, self.query, hits, start,
                self.sort, estimated_hits)

    # ----------------------------------------------------------------
    # Private!

    def _xapianIndex(request):
        """ Get the xapian index if possible

        @param request: current request
        """
        try:
            from MoinMoin.search.Xapian import Index
            index = Index(request)
        except ImportError:
            return None

        if index.exists():
            return index
    _xapianIndex = staticmethod(_xapianIndex)

    def _xapianSearch(self):
        """ Search using Xapian

        Get a list of pages using fast xapian search and
        return moin search in those pages if needed.
        """
        clock = self.request.clock
        pages = None
        index = self._xapianIndex(self.request)

        if index and self.query.xapian_wanted():
            clock.start('_xapianSearch')
            try:
                from MoinMoin.support import xapwrap

                clock.start('_xapianQuery')
                query = self.query.xapian_term(self.request, index.allterms)
                description = str(query)
                logging.debug("_xapianSearch: query = %r" % description)
                query = xapwrap.index.QObjQuery(query)
                enq, mset, hits = index.search(query, sort=self.sort,
                        historysearch=self.historysearch)
                clock.stop('_xapianQuery')

                logging.debug("_xapianSearch: finds: %r" % hits)
                def dict_decode(d):
                    """ decode dict values to unicode """
                    for key in d:
                        d[key] = d[key].decode(config.charset)
                    return d
                pages = [dict_decode(hit['values']) for hit in hits]
                logging.debug("_xapianSearch: finds pages: %r" % pages)

                self._xapianEnquire = enq
                self._xapianMset = mset
                self._xapianIndex = index
            except BaseIndex.LockedException:
                pass
            #except AttributeError:
            #    pages = []

            try:
                # xapian handled the full query
                if not self.query.xapian_need_postproc():
                    clock.start('_xapianProcess')
                    try:
                        return self._getHits(hits, self._xapianMatch)
                    finally:
                        clock.stop('_xapianProcess')
            finally:
                clock.stop('_xapianSearch')
        elif not index:
            # we didn't use xapian in this request because we have no index,
            # so we can just disable it until admin builds an index and
            # restarts moin processes
            self.request.cfg.xapian_search = 0

        # some postprocessing by _moinSearch is required
        return self._moinSearch(pages)

    def _xapianMatchDecider(self, term, pos):
        """ Returns correct Match object for a Xapian match

        @param term: the term as string
        @param pos: starting position of the match
        """
        if term[0] == 'S': # TitleMatch
            return TitleMatch(start=pos, end=pos+len(term)-1)
        else: # TextMatch (incl. headers)
            return TextMatch(start=pos, end=pos+len(term))

    def _xapianMatch(self, uid, page=None):
        """ Get all relevant Xapian matches per document id

        @param uid: the id of the document in the xapian index
        """
        positions = {}
        term = self._xapianEnquire.get_matching_terms_begin(uid)
        while term != self._xapianEnquire.get_matching_terms_end(uid):
            term_name = term.get_term()
            for pos in self._xapianIndex.termpositions(uid, term.get_term()):
                # prefer the longest term starting at a given position
                if pos not in positions or \
                        len(positions[pos]) < len(term_name):
                    positions[pos] = term_name
            term.next()
        matches = [self._xapianMatchDecider(term, pos) for pos, term
                   in positions.iteritems()]

        if not matches:
            return [Match()] # dummy for metadata, we got a match!

        return matches

    def _moinSearch(self, pages=None):
        """ Search pages using moin's built-in full text search

        Return list of tuples (page, match). The list may contain
        deleted pages or pages the user may not read.

        @keyword pages: optional list of pages to search in
        """
        self.request.clock.start('_moinSearch')
        if pages is None:
            # if we are not called from _xapianSearch, we make a full pagelist,
            # but don't search attachments (thus attachment name = '')
            pages = [{'pagename': p, 'attachment': '', 'wikiname': 'Self', } for p in self._getPageList()]
        hits = self._getHits(pages, self._moinMatch)
        self.request.clock.stop('_moinSearch')
        return hits

    def _moinMatch(self, page, uid=None):
        """ Get all matches from regular moinSearch

        @param page: the current page instance
        @keyword uid: unused, present for signature compatibility with _xapianMatch
        """
        return self.query.search(page)

    def _getHits(self, pages, matchSearchFunction):
        """ Get the hit tuples in pages through matchSearchFunction """
        logging.debug("_getHits searching in %d pages ..." % len(pages))
        hits = []
        revisionCache = {}
        fs_rootpage = self.fs_rootpage
        for hit in pages:
            if 'values' in hit:
                valuedict = hit['values']
                uid = hit['uid']
            else:
                valuedict = hit
                uid = None

            wikiname = valuedict['wikiname']
            pagename = valuedict['pagename']
            attachment = valuedict['attachment']
            logging.debug("_getHits processing %r %r %r" % (wikiname, pagename, attachment))

            if 'revision' in valuedict and valuedict['revision']:
                revision = int(valuedict['revision'])
            else:
                revision = 0

            if wikiname in (self.request.cfg.interwikiname, 'Self'): # THIS wiki
                page = Page(self.request, pagename, rev=revision)
                if not self.historysearch and revision:
                    revlist = page.getRevList()
                    # revlist can be empty if page was nuked/renamed since it was included in xapian index
                    if not revlist or revlist[0] != revision:
                        # nothing there at all or not the current revision
                        continue
                if attachment:
                    if pagename == fs_rootpage: # not really an attachment
                        page = Page(self.request, "%s/%s" % (fs_rootpage, attachment))
                        hits.append((wikiname, page, None, None))
                    else:
                        matches = matchSearchFunction(page=None, uid=uid)
                        hits.append((wikiname, page, attachment, matches))
                else:
                    matches = matchSearchFunction(page=page, uid=uid)
                    logging.debug("matchSearchFunction %r returned %r" % (matchSearchFunction, matches))
                    if matches:
                        # keep only the newest matching revision of a page
                        if not self.historysearch and \
                                pagename in revisionCache and \
                                revisionCache[pagename][0] < revision:
                            hits.remove(revisionCache[pagename][1])
                            del revisionCache[pagename]
                        hits.append((wikiname, page, attachment, matches))
                        revisionCache[pagename] = (revision, hits[-1])
            else: # other wiki
                hits.append((wikiname, pagename, attachment, None, revision))
        return hits

    def _getPageList(self):
        """ Get list of pages to search in

        If the query has a page filter, use it to filter pages before
        searching. If not, get a unfiltered page list. The filtering
        will happen later on the hits, which is faster with current
        slow storage.
        """
        filter_ = self.query.pageFilter()
        if filter_:
            # There is no need to filter the results again.
            self.filtered = True
            return self.request.rootpage.getPageList(filter=filter_)
        else:
            return self.request.rootpage.getPageList(user='', exists=0)
692
def _filter(self, hits):
693
""" Filter out deleted or acl protected pages
695
@param hits: list of hits
697
userMayRead = self.request.user.may.read
698
fs_rootpage = self.fs_rootpage + "/"
699
thiswiki = (self.request.cfg.interwikiname, 'Self')
700
filtered = [(wikiname, page, attachment, match)
701
for wikiname, page, attachment, match in hits
702
if (not wikiname in thiswiki or
703
page.exists() and userMayRead(page.page_name) or
704
page.page_name.startswith(fs_rootpage)) and
705
(not self.mtime or self.mtime <= page.mtime_usecs()/1000000)]