# -*- coding: iso-8859-1 -*-
"""
    MoinMoin - A text analyzer for wiki syntax

    @copyright: 2006-2008 MoinMoin:ThomasWaldmann,
                2006 MoinMoin:FranzPletz
    @license: GNU GPL, see COPYING for details.
"""
import re

import xapian

from MoinMoin.parser.text_moin_wiki import Parser as WikiParser
from MoinMoin import config
class WikiAnalyzer(object):
    """ A text analyzer for wiki syntax

        The purpose of this class is to analyze texts/pages in wiki syntax
        and yield single terms to feed into the xapian database.
    """

    # pattern for one capitalized word-part of a CamelCase word,
    # e.g. "Camel" / "Case" / "Word"
    singleword = r"[%(u)s][%(l)s]+" % {
        'u': config.chars_upper,
        'l': config.chars_lower,
    }

    singleword_re = re.compile(singleword, re.U)
    wikiword_re = re.compile(WikiParser.word_rule, re.UNICODE|re.VERBOSE)

    token_re = re.compile(
        r"(?P<company>\w+[&@]\w+)|" + # company names like AT&T and Excite@Home.
        r"(?P<email>\w+([.-]\w+)*@\w+([.-]\w+)*)|" + # email addresses
        r"(?P<acronym>(\w\.)+)|" + # acronyms: U.S.A., I.B.M., etc.
        r"(?P<word>\w+)", # words (including WikiWords)
        re.U) # Allow unicode chars

    dot_re = re.compile(r"[-_/,.]")
    mail_re = re.compile(r"[-_/,.]|(@)")
    # splits a word like "Foo42" into alternating alpha / numeric runs
    alpha_num_re = re.compile(r"\d+|\D+")

    def __init__(self, request=None, language=None):
        """
        @param request: current request
        @param language: if given, the language in which to stem words
        """
        # self.stemmer stays None when stemming is disabled, not configured,
        # or the language is not supported by xapian.
        self.stemmer = None
        if request and request.cfg.xapian_stemming and language:
            try:
                stemmer = xapian.Stem(language)
                # we need this wrapper because the stemmer returns a utf-8
                # encoded string even when it gets fed with unicode objects:
                self.stemmer = lambda word: stemmer(word).decode('utf-8')
            except xapian.InvalidArgumentError:
                # lang is not stemmable or not available
                pass

    def raw_tokenize_word(self, word, pos):
        """ try to further tokenize some word starting at pos

            Yields (subword, position) pairs, always including the whole
            word itself first.

            @param word: the word to tokenize (unicode)
            @param pos: byte/char position of word in the source text
        """
        yield (word, pos)
        if self.wikiword_re.match(word):
            # if it is a CamelCaseWord, we additionally try to tokenize Camel, Case and Word
            for m in re.finditer(self.singleword_re, word):
                mw, mp = m.group(), pos + m.start()
                for w, p in self.raw_tokenize_word(mw, mp):
                    yield (w, p)
        else:
            # if we have Foo42, yield Foo and 42
            for m in re.finditer(self.alpha_num_re, word):
                mw, mp = m.group(), pos + m.start()
                # avoid infinite recursion when the word is a single run
                if mw != word:
                    for w, p in self.raw_tokenize_word(mw, mp):
                        yield (w, p)

    def raw_tokenize(self, value):
        """ Yield a stream of words from a string.

            Yields (word, position) pairs.

            @param value: string to split, must be an unicode object or a list of
                          unicode objects
        """
        if isinstance(value, list): # used for page links
            for v in value:
                yield (v, 0)
        else:
            tokenstream = re.finditer(self.token_re, value)
            for m in tokenstream:
                if m.group("acronym"):
                    # "U.S.A." is indexed as "USA"
                    yield (m.group("acronym").replace('.', ''), m.start())
                elif m.group("company"):
                    yield (m.group("company"), m.start())
                elif m.group("email"):
                    displ = 0
                    # yield the single parts of the mail address, tracking
                    # the offset of each part within the whole match
                    for word in self.mail_re.split(m.group("email")):
                        if word:
                            yield (word, m.start() + displ)
                            displ += len(word) + 1
                elif m.group("word"):
                    for word, pos in self.raw_tokenize_word(m.group("word"), m.start()):
                        yield word, pos

    def tokenize(self, value):
        """
        Yield a stream of raw lower cased and stemmed words from a string.

        Yields (word, stemmed) pairs; stemmed is u'' when no stemmer is
        configured or stemming did not change the word.

        @param value: string to split, must be an unicode object or a list of
                      unicode objects
        """
        if self.stemmer:
            def stemmer(value):
                stemmed = self.stemmer(value)
                if stemmed != value:
                    return stemmed
                else:
                    return ''
        else:
            stemmer = lambda v: ''

        for word, pos in self.raw_tokenize(value):
            # Xapian stemmer expects lowercase input
            word = word.lower()
            yield word, stemmer(word)