3
3
MoinMoin - lupy indexing search engine
5
@copyright: 2005 by Florian Festi, Nir Soffer
5
@copyright: 2005 by Florian Festi, Nir Soffer, Thomas Waldmann
6
6
@license: GNU GPL, see COPYING for details.
9
9
import os, re, codecs, errno, time
11
11
from MoinMoin.Page import Page
12
from MoinMoin import config
12
from MoinMoin import config, wikiutil
13
13
from MoinMoin.util import filesys, lock
14
14
from MoinMoin.support.lupy.index.term import Term
15
15
from MoinMoin.support.lupy import document
16
16
from MoinMoin.support.lupy.index.indexwriter import IndexWriter
17
17
from MoinMoin.support.lupy.search.indexsearcher import IndexSearcher
19
from MoinMoin.support.lupy.index.term import Term
20
from MoinMoin.support.lupy.search.term import TermQuery
21
from MoinMoin.support.lupy.search.boolean import BooleanQuery
19
23
##############################################################################
21
25
##############################################################################
23
word_re = re.compile(r"\w+", re.U)
24
wikiword_re = re.compile(r"^([%(u)s][%(l)s]+)+$" % {'u': config.chars_upper,
25
'l': config.chars_lower}, re.U)
26
singleword_re = re.compile(r"[%(u)s][%(l)s]+" % {'u': config.chars_upper,
27
'l': config.chars_lower}, re.U)
27
singleword = r"[%(u)s][%(l)s]+" % {
28
'u': config.chars_upper,
29
'l': config.chars_lower,
32
singleword_re = re.compile(singleword, re.U)
33
wikiword_re = re.compile(r"^(%s){2,}$" % singleword, re.U)
29
35
token_re = re.compile(
30
r"(?P<company>\w+[&@]\w+)|" + #company names like AT&T and Excite@Home.
36
r"(?P<company>\w+[&@]\w+)|" + # company names like AT&T and Excite@Home.
31
37
r"(?P<email>\w+([.-]\w+)*@\w+([.-]\w+)*)|" + # email addresses
32
38
r"(?P<hostname>\w+(\.\w+)+)|" + # hostnames
33
39
r"(?P<num>(\w+[-/.,])*\w*\d\w*([-/.,]\w+)*)|" + # version numbers
202
201
def __init__(self, request):
203
202
self.request = request
204
203
cache_dir = request.cfg.cache_dir
205
self.dir = os.path.join(cache_dir, 'lupy_index')
204
self.main_dir = os.path.join(cache_dir, 'lupy')
205
self.dir = os.path.join(self.main_dir, 'index')
206
206
filesys.makeDirs(self.dir)
207
self.sig_file = os.path.join(self.dir, '__complete__')
207
self.sig_file = os.path.join(self.main_dir, 'complete')
208
208
self.segments_file = os.path.join(self.dir, 'segments')
209
lock_dir = os.path.join(cache_dir, 'lupy_index_lock')
209
lock_dir = os.path.join(self.main_dir, 'index-lock')
210
210
self.lock = lock.WriteLock(lock_dir,
211
211
timeout=3600.0, readlocktimeout=60.0)
212
212
self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0)
213
self.queue = UpdateQueue(os.path.join(self.dir, "__update_queue__"),
214
os.path.join(cache_dir, 'lupy_queue_lock'))
213
self.queue = UpdateQueue(os.path.join(self.main_dir, "update-queue"),
214
os.path.join(self.main_dir, 'update-queue-lock'))
216
216
# Disabled until we have a sane way to build the index with a
217
217
# queue in small steps.
226
226
return os.path.getmtime(self.segments_file)
228
def _search(self, query):
229
""" read lock must be acquired """
232
searcher, timestamp = self.request.cfg.lupy_searchers.pop()
233
if timestamp != self.mtime():
238
searcher = IndexSearcher(self.dir)
239
timestamp = self.mtime()
242
hits = list(searcher.search(query))
243
self.request.cfg.lupy_searchers.append((searcher, timestamp))
228
246
def search(self, query):
229
247
if not self.read_lock.acquire(1.0):
230
248
raise self.LockedException
234
searcher, timestamp = self.request.cfg.lupy_searchers.pop()
235
if timestamp!=self.mtime():
240
searcher = IndexSearcher(self.dir)
241
timestamp = self.mtime()
244
hits = list(searcher.search(query))
245
self.request.cfg.lupy_searchers.append((searcher, timestamp))
250
hits = self._search(query)
247
252
self.read_lock.release()
250
255
def update_page(self, page):
256
self.queue.append(page.page_name)
257
self._do_queued_updates_InNewThread()
259
def _do_queued_updates_InNewThread(self):
260
""" do queued index updates in a new thread
262
Should be called from a user request. From a script, use indexPages.
251
264
if not self.lock.acquire(1.0):
252
self.queue.append(page.page_name)
265
self.request.log("can't index: can't acquire lock")
254
self.request.clock.start('update_page')
256
self._do_queued_updates()
257
self._update_page(page)
268
from threading import Thread
269
indexThread = Thread(target=self._do_queued_updates,
270
args=(self._indexingRequest(self.request), self.lock))
271
indexThread.setDaemon(True)
273
# Join the index thread after the current request finishes, to prevent
274
# Apache CGI from killing the process.
275
def joinDecorator(finish):
281
self.request.finish = joinDecorator(self.request.finish)
259
284
self.lock.release()
260
self.request.clock.stop('update_page')
262
def indexPages(self):
287
def indexPages(self, files=None, update=True):
288
""" Index all pages (and files, if given)
265
290
Can be called only from a script. To index pages during a user
266
request, use indexPagesInNewThread.
268
TODO: tune the acquire timeout
291
request, use indexPagesInNewThread.
292
@arg files: iterator or list of files to index additionally
293
@arg update: True = update an existing index, False = reindex everything
270
295
if not self.lock.acquire(1.0):
271
296
self.request.log("can't index: can't acquire lock")
274
self._index_pages(self._indexingRequest(self.request))
299
request = self._indexingRequest(self.request)
300
self._index_pages(request, None, files, update)
276
302
self.lock.release()
278
def indexPagesInNewThread(self):
304
def indexPagesInNewThread(self, files=None, update=True):
279
305
""" Index all pages in a new thread
281
Should be called from a user request. From a script, use
284
TODO: tune the acquire timeout
307
Should be called from a user request. From a script, use indexPages.
286
309
if not self.lock.acquire(1.0):
287
310
self.request.log("can't index: can't acquire lock")
347
375
if page.exists():
348
376
writer = IndexWriter(self.dir, False, tokenizer)
349
self._index_page(writer, page)
377
self._index_page(writer, page, False) # we don't need to check whether it is updated
352
def _index_page(self, writer, page):
353
""" Assumes that the write lock is acquired """
354
d = document.Document()
355
d.add(document.Keyword('pagename', page.page_name))
356
d.add(document.Text('title', page.page_name, store=False))
357
d.add(document.Text('text', page.get_raw_body(), store=False))
359
links = page.getPageLinks(page.request)
360
t = document.Text('links', '', store=False)
363
d.add(document.Text('link_text', ' '.join(links), store=False))
365
writer.addDocument(d)
367
def _index_pages(self, request, lock=None):
370
This should be called from indexPages or indexPagesInNewThread
373
This may take a few minutes up to a few hours, depending on the
380
def contentfilter(self, filename):
381
""" Get a filter for content of filename and return unicode content. """
383
from MoinMoin import wikiutil
384
request = self.request
385
mimetype, encoding = mimetypes.guess_type(filename)
387
mimetype = 'application/octet-stream'
388
def mt2mn(mt): # mimetype to modulename
389
return mt.replace("/", "_").replace("-","_").replace(".", "_")
391
_filter = mt2mn(mimetype)
392
execute = wikiutil.importPlugin(request.cfg, 'filter', _filter)
393
except wikiutil.PluginMissingError:
395
_filter = mt2mn(mimetype.split("/", 1)[0])
396
execute = wikiutil.importPlugin(request.cfg, 'filter', _filter)
397
except wikiutil.PluginMissingError:
399
_filter = mt2mn('application/octet-stream')
400
execute = wikiutil.importPlugin(request.cfg, 'filter', _filter)
401
except wikiutil.PluginMissingError:
402
# all fallbacks failed - report the last filter module name that was tried
raise ImportError("Cannot load filter %s" % _filter)
404
data = execute(self, filename)
405
request.log("Filter %s returned %d characters for file %s" % (_filter, len(data), filename))
406
except (OSError, IOError), err:
408
request.log("Filter %s threw error '%s' for file %s" % (_filter, str(err), filename))
411
def test(self, request):
412
query = BooleanQuery()
413
query.add(TermQuery(Term("text", 'suchmich')), True, False)
414
docs = self._search(query)
416
request.log("%r %r %r" % (d, d.get('attachment'), d.get('pagename')))
418
def _index_file(self, request, writer, filename, update):
419
""" index a file as it were a page named pagename
420
Assumes that the write lock is acquired
422
fs_rootpage = 'FS' # XXX FS hardcoded
424
mtime = os.path.getmtime(filename)
425
mtime = wikiutil.timestamp2version(mtime)
427
query = BooleanQuery()
428
query.add(TermQuery(Term("pagename", fs_rootpage)), True, False)
429
query.add(TermQuery(Term("attachment", filename)), True, False)
430
docs = self._search(query)
431
updated = len(docs) == 0 or mtime > int(docs[0].get('mtime'))
434
request.log("%s %r" % (filename, updated))
436
file_content = self.contentfilter(filename)
437
d = document.Document()
438
d.add(document.Keyword('pagename', fs_rootpage))
439
d.add(document.Keyword('mtime', str(mtime)))
440
d.add(document.Keyword('attachment', filename)) # XXX we should treat files like real pages, not attachments
441
pagename = " ".join(os.path.join(fs_rootpage, filename).split("/"))
442
d.add(document.Text('title', pagename, store=False))
443
d.add(document.Text('text', file_content, store=False))
444
writer.addDocument(d)
445
except (OSError, IOError), err:
448
def _index_page(self, writer, page, update):
449
""" Index a page - assumes that the write lock is acquired
450
@arg writer: the index writer object
451
@arg page: a page object
452
@arg update: False = index in any case, True = index only when changed
454
pagename = page.page_name
455
request = page.request
456
mtime = page.mtime_usecs()
458
query = BooleanQuery()
459
query.add(TermQuery(Term("pagename", pagename)), True, False)
460
query.add(TermQuery(Term("attachment", "")), True, False)
461
docs = self._search(query)
462
updated = len(docs) == 0 or mtime > int(docs[0].get('mtime'))
465
request.log("%s %r" % (pagename, updated))
467
d = document.Document()
468
d.add(document.Keyword('pagename', pagename))
469
d.add(document.Keyword('mtime', str(mtime)))
470
d.add(document.Keyword('attachment', '')) # this is a real page, not an attachment
471
d.add(document.Text('title', pagename, store=False))
472
d.add(document.Text('text', page.get_raw_body(), store=False))
474
links = page.getPageLinks(request)
475
t = document.Text('links', '', store=False)
478
d.add(document.Text('link_text', ' '.join(links), store=False))
480
writer.addDocument(d)
482
from MoinMoin.action import AttachFile
484
attachments = AttachFile._get_files(request, pagename)
485
for att in attachments:
486
filename = AttachFile.getFilename(request, pagename, att)
487
mtime = wikiutil.timestamp2version(os.path.getmtime(filename))
489
query = BooleanQuery()
490
query.add(TermQuery(Term("pagename", pagename)), True, False)
491
query.add(TermQuery(Term("attachment", att)), True, False)
492
docs = self._search(query)
493
updated = len(docs) == 0 or mtime > int(docs[0].get('mtime'))
496
request.log("%s %s %r" % (pagename, att, updated))
498
att_content = self.contentfilter(filename)
499
d = document.Document()
500
d.add(document.Keyword('pagename', pagename))
501
d.add(document.Keyword('mtime', str(mtime)))
502
d.add(document.Keyword('attachment', att)) # this is an attachment, store its filename
503
d.add(document.Text('title', att, store=False)) # the filename is the "title" of an attachment
504
d.add(document.Text('text', att_content, store=False))
505
writer.addDocument(d)
508
def _index_pages(self, request, lock=None, files=None, update=True):
509
""" Index all pages (and all given files)
511
This should be called from indexPages or indexPagesInNewThread only!
513
This may take a few minutes up to a few hours, depending on the size of
376
516
When called in a new thread, lock is acquired before the call,
377
517
and this method must release it when it finishes or fails.