# -*- coding: iso-8859-1 -*-
"""
    MoinMoin - search query expressions

    @copyright: 2005 MoinMoin:FlorianFesti,
                2005 MoinMoin:NirSoffer,
                2005 MoinMoin:AlexanderSchremmer,
                2006-2008 MoinMoin:ThomasWaldmann,
                2006 MoinMoin:FranzPletz,
                2009 MoinMoin:DmitrijsMilajevs
    @license: GNU GPL, see COPYING for details
"""
16
from MoinMoin import log
17
logging = log.getLogger(__name__)
19
from MoinMoin import config, wikiutil
20
from MoinMoin.search.results import Match, TitleMatch, TextMatch
23
from MoinMoin.search import Xapian
24
from MoinMoin.search.Xapian import Query
28
OP_AND_NOT = Query.OP_AND_NOT
34
class BaseExpression(object):
35
""" Base class for all search terms """
37
# costs is estimated time to calculate this term.
38
# Number is relative to other terms and has no real unit.
39
# It allows the fast searches to be done first.
43
def __init__(self, pattern, use_re=False, case=False):
44
""" Init a text search
46
@param pattern: pattern to search for, ascii string or unicode
47
@param use_re: treat pattern as re or plain text, bool
48
@param case: do case sensitive search, bool
50
self._pattern = unicode(pattern)
60
self.pattern, self.search_re = self._build_re(self._pattern, use_re=use_re, case=case)
63
return unicode(self).encode(config.charset, 'replace')
66
""" Negate the result of this term """
70
""" Return a page filtering function
72
This function is used to filter page list before we search
73
it. Return a function that gets a page name and returns a bool.
75
The default expression does not have any filter function and
76
returns None. Subclasses may define custom filter functions.
80
def _get_matches(self, page):
81
raise NotImplementedError
83
def search(self, page):
86
Returns a list of Match objects or None if term didn't find
87
anything (vice versa if negate() was called). Terms containing
88
other terms must call this method to aggregate the results.
89
This Base class returns True (Match()) if not negated.
91
logging.debug("%s searching page %r for (negated = %r) %r" % (self.__class__, page.page_name, self.negated, self._pattern))
93
matches = self._get_matches(page)
95
# Decide what to do with the results.
100
result = [Match()] # represents "matched" (but as it was a negative match, we have nothing to show)
106
logging.debug("%s returning %r" % (self.__class__, result))
109
def highlight_re(self):
110
""" Return a regular expression of what the term searches for
112
Used to display the needle in the page.
116
def _build_re(self, pattern, use_re=False, case=False, stemmed=False):
117
""" Make a regular expression out of a text pattern """
118
flags = case and re.U or (re.I | re.U)
121
search_re = re.compile(pattern, flags)
123
pattern = re.escape(pattern)
124
search_re = re.compile(pattern, flags)
126
return pattern, search_re
128
def _get_query_for_search_re(self, connection, field_to_check=None):
130
Return a query which satisfies self.search_re for field values.
131
If field_to_check is given check values only for that field.
135
documents = connection.get_all_documents()
136
for document in documents:
139
# Check only field with given name
140
if field_to_check in data:
141
for term in data[field_to_check]:
142
if self.search_re.match(term):
143
queries.append(connection.query_field(field_to_check, term))
146
for field, terms in data.iteritems():
148
if self.search_re.match(term):
149
queries.append(connection.query_field(field_to_check, term))
151
return Query(OP_OR, queries)
153
def xapian_need_postproc(self):
156
def __unicode__(self):
157
neg = self.negated and '-' or ''
158
return u'%s%s"%s"' % (neg, self._tag, unicode(self._pattern))
161
class AndExpression(BaseExpression):
162
""" A term connecting several sub terms with a logical AND """
166
def __init__(self, *terms):
167
self._subterms = list(terms)
170
def append(self, expression):
    """ Append another term

    @param expression: the term to add at the end of self._subterms
    """
    self._subterms.append(expression)
175
return self._subterms
179
return sum([t.costs for t in self._subterms])
181
def __unicode__(self):
183
for t in self._subterms:
184
result += self.operator + unicode(t)
185
return u'[' + result[len(self.operator):] + u']'
187
def _filter(self, terms, name):
188
""" A function that returns True if all terms filter name """
191
_filter = term.pageFilter()
198
logging.debug("pageFilter AND returns %r" % result)
201
def pageFilter(self):
202
""" Return a page filtering function
204
This function is used to filter page list before we search it.
206
Return a function that gets a page name, and return bool, or None.
208
# Sort terms by cost, then get all title searches
210
terms = [term for term in self._subterms if isinstance(term, TitleSearch)]
212
return lambda name: self._filter(terms, name)
214
def sortByCost(self):
    """ Sort self._subterms in place, lowest costs attribute first """
    self._subterms.sort(key=lambda term: term.costs)
217
def search(self, page):
218
""" Search for each term, cheap searches first """
221
for term in self._subterms:
222
result = term.search(page)
225
matches.extend(result)
228
def highlight_re(self):
230
for s in self._subterms:
231
highlight_re = s.highlight_re()
233
result.append(highlight_re)
235
return u'|'.join(result)
237
def xapian_need_postproc(self):
238
for term in self._subterms:
239
if term.xapian_need_postproc():
243
def xapian_term(self, request, connection):
248
for term in self._subterms:
250
terms.append(term.xapian_term(request, connection))
252
not_terms.append(term.xapian_term(request, connection))
254
# prepare query for not negated terms
256
query = Query(OP_AND, terms)
258
query = Query('') # MatchAll
260
# prepare query for negated terms
262
query_negated = Query(OP_OR, not_terms)
264
query_negated = Query()
266
return Query(OP_AND_NOT, query, query_negated)
269
class OrExpression(AndExpression):
270
""" A term connecting several sub terms with a logical OR """
274
def _filter(self, terms, name):
275
""" A function that returns True if any term filters name """
278
_filter = term.pageFilter()
285
logging.debug("pageFilter OR returns %r" % result)
288
def search(self, page):
289
""" Search page with terms
291
@param page: the page instance
294
# XXX Do we have any reason to sort here? we are not breaking out
295
# of the search in any case.
298
for term in self._subterms:
299
result = term.search(page)
301
matches.extend(result)
304
def xapian_term(self, request, connection):
    """ Build the xapian query for this term

    @param request: handed on to xapian_term() of every sub term
    @param connection: handed on to xapian_term() of every sub term
    @return: Query combining the queries of all sub terms with OP_OR
    """
    # XXX: negated terms managed by _moinSearch?
    subqueries = [term.xapian_term(request, connection) for term in self._subterms]
    return Query(OP_OR, subqueries)
309
class BaseTextFieldSearch(BaseExpression):
311
_field_to_search = None
313
def xapian_term(self, request, connection):
315
queries = [self._get_query_for_search_re(connection, self._field_to_search)]
319
analyzer = Xapian.WikiAnalyzer(request=request, language=request.cfg.language_default)
321
for term in self._pattern.split():
322
query_term = connection.query_field(self._field_to_search, term)
323
tokens = analyzer.tokenize(term)
325
if request.cfg.xapian_stemming:
327
for token, stemmed_ in tokens:
328
if token != term.lower():
330
query_token.append(Query(OP_OR,
331
[connection.query_field(self._field_to_search, token),
332
connection.query_field(self._field_to_search, stemmed_)]))
333
# stemmed.append('(%s|%s)' % (token, stemmed_))
335
query_token.append(connection.query_field(self._field_to_search, token))
336
# stemmed.append(token)
337
query_tokens = Query(OP_AND, query_token)
339
query_tokens = Query(OP_AND, [connection.query_field(self._field_to_search, token) for token, stemmed_ in tokens if token != term.lower()])
341
queries.append(Query(OP_OR, [query_term, query_tokens]))
343
# XXX broken wrong regexp is built!
344
if not self.case and stemmed:
345
new_pat = ' '.join(stemmed)
346
self._pattern = new_pat
347
self.pattern, self.search_re = self._build_re(new_pat, use_re=False, case=self.case, stemmed=True)
349
return Query(OP_AND, queries)
352
class TextSearch(BaseTextFieldSearch):
353
""" A term that does a normal text search
355
Both page content and the page title are searched, using an
356
additional TitleSearch term.
360
_field_to_search = 'content'
362
def highlight_re(self):
    """ Return self.pattern wrapped in a regular expression group

    @rtype: unicode
    """
    return u"(%s)" % self.pattern
365
def _get_matches(self, page):
368
# Search in page name
369
results = TitleSearch(self._pattern, use_re=self.use_re, case=self.case)._get_matches(page)
371
matches.extend(results)
373
# Search in page body
374
body = page.get_raw_body()
375
for match in self.search_re.finditer(body):
376
matches.append(TextMatch(re_match=match))
380
def xapian_term(self, request, connection):
382
# if regex search is wanted, we need to match all documents, because
383
# we do not have full content stored and need post processing to do
384
# the regex searching.
385
return Query('') # MatchAll
387
content_query = super(TextSearch, self).xapian_term(request, connection)
388
title_query = TitleSearch(self._pattern, use_re=self.use_re, case=self.case).xapian_term(request, connection)
389
return Query(OP_OR, [title_query, content_query])
391
def xapian_need_postproc(self):
    """ Tell whether results of the xapian query need post processing

    @return: a true value for case-sensitive or regex searches
    """
    # case-sensitive: xapian is case-insensitive, therefore we need postproc
    # regex: xapian can't do regex search. also we don't have full content
    #        stored (and we don't want to do that anyway), so regex search
    #        needs postproc also.
    return self.case or self.use_re
399
class TitleSearch(BaseTextFieldSearch):
400
""" Term searches in pattern in page title only """
404
_field_to_search = 'title'
406
def pageFilter(self):
407
""" Page filter function for single title search """
410
match = self.search_re.search(name)
411
result = bool(self.negated) ^ bool(match)
412
logging.debug("pageFilter title returns %r (%r)" % (result, self.pattern))
416
def _get_matches(self, page):
417
""" Get matches in page name """
420
for match in self.search_re.finditer(page.page_name):
421
matches.append(TitleMatch(re_match=match))
426
class BaseFieldSearch(BaseExpression):
428
_field_to_search = None
430
def xapian_term(self, request, connection):
432
return self._get_query_for_search_re(connection, self._field_to_search)
434
return connection.query_field(self._field_to_search, self._pattern)
437
class LinkSearch(BaseFieldSearch):
438
""" Search the term in the pagelinks """
441
_field_to_search = 'linkto'
442
costs = 5000 # cheaper than a TextSearch
444
def __init__(self, pattern, use_re=False, case=True):
    """ Init a link search

    Besides the base class setup, a TextSearch for the link text is
    prepared and kept in self.textsearch.

    @param pattern: pattern to search for, ascii string or unicode
    @param use_re: treat pattern as re or plain text, bool
    @param case: do case sensitive search, bool
    """
    super(LinkSearch, self).__init__(pattern, use_re, case)

    # used for search in text: every '/' of the pattern becomes a regex
    # alternation, the whole thing is wrapped in a group
    self._textpattern = '(' + pattern.replace('/', '|') + ')'
    # the text pattern is always handed over as a regular expression
    self.textsearch = TextSearch(self._textpattern, use_re=True, case=case)
457
def highlight_re(self):
    """ Return self._textpattern wrapped in a regular expression group

    @rtype: unicode
    """
    return u"(%s)" % self._textpattern
460
def _get_matches(self, page):
461
# Get matches in page links
464
# XXX in python 2.5 any() may be used.
466
for link in page.getPageLinks(page.request):
467
if self.search_re.match(link):
472
# Search in page text
473
results = self.textsearch.search(page)
475
matches.extend(results)
476
else: # This happens e.g. for pages that use navigation macros
477
matches.append(TextMatch(0, 0))
482
class LanguageSearch(BaseFieldSearch):
483
""" Search the pages written in a language """
486
_field_to_search = 'lang'
487
costs = 5000 # cheaper than a TextSearch
489
def __init__(self, pattern, use_re=False, case=False):
    """ Init a language search

    @param pattern: pattern to search for, ascii string or unicode
    @param use_re: treat pattern as re or plain text, bool
    @param case: accepted for a uniform signature, but not used: the
                 search is always set up as case-insensitive
    """
    # iso language code, always lowercase and not case-sensitive
    super(LanguageSearch, self).__init__(pattern.lower(), use_re, case=False)
499
def _get_matches(self, page):
501
if self.pattern == page.pi['language']:
507
class CategorySearch(BaseFieldSearch):
508
""" Search the pages belonging to a category """
511
_field_to_search = 'category'
512
costs = 5000 # cheaper than a TextSearch
514
def _get_matches(self, page):
515
""" match categories like this:
516
... some page text ...
518
## optionally some comments, e.g. about possible categories:
520
CategoryTheRealAndOnly
522
Note: there might be multiple comment lines, but all real categories
523
must be on a single line either directly below the ---- or
524
directly below some comment lines.
528
pattern = r'(?m)(^-----*\s*\r?\n)(^##.*\r?\n)*^(?!##)(.*)\b%s\b' % self.pattern
529
search_re = self._build_re(pattern, use_re=self.use_re, case=self.case)[1] # we need only a regexp, but not a pattern
531
body = page.get_raw_body()
532
for match in search_re.finditer(body):
533
matches.append(TextMatch(re_match=match))
537
def highlight_re(self):
    """ Return self._pattern between word boundaries, as a regex group

    @rtype: unicode
    """
    return u'(\\b%s\\b)' % self._pattern
540
def xapian_term(self, request, connection):
541
# XXX Probably, it is a good idea to inherit this class from
542
# BaseFieldSearch and get rid of this definition
544
return self._get_query_for_search_re(connection, 'category')
546
pattern = self._pattern
547
# XXX UnicodeQuery was used
548
return connection.query_field('category', pattern)
551
class MimetypeSearch(BaseFieldSearch):
552
""" Search for files belonging to a specific mimetype """
555
_field_to_search = 'mimetype'
556
costs = 5000 # cheaper than a TextSearch
558
def __init__(self, pattern, use_re=False, case=False):
    """ Init a mimetype search

    @param pattern: pattern to search for, ascii string or unicode
    @param use_re: treat pattern as re or plain text, bool
    @param case: accepted for a uniform signature, but not used: the
                 search is always set up as case-insensitive
    """
    # always lowercase and not case-sensitive
    super(MimetypeSearch, self).__init__(pattern.lower(), use_re, case=False)
568
def _get_matches(self, page):
570
page_mimetype = u'text/%s' % page.pi['format']
572
if self.search_re.search(page_mimetype):
578
class DomainSearch(BaseFieldSearch):
579
""" Search for pages belonging to a specific domain """
582
_field_to_search = 'domain'
583
costs = 5000 # cheaper than a TextSearch
585
def __init__(self, pattern, use_re=False, case=False):
    """ Init a domain search

    @param pattern: pattern to search for, ascii string or unicode
    @param use_re: treat pattern as re or plain text, bool
    @param case: accepted for a uniform signature, but not used: the
                 search is always set up as case-insensitive
    """
    # always lowercase and not case-sensitive
    super(DomainSearch, self).__init__(pattern.lower(), use_re, case=False)
595
def _get_matches(self, page):
596
checks = {'underlay': page.isUnderlayPage,
597
'standard': page.isStandardPage,
598
'system': lambda page=page: wikiutil.isSystemPage(page.request, page.page_name),
602
match = checks[self.pattern]()