# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.analysis.acore import Composable
from whoosh.analysis.filters import LowercaseFilter
from whoosh.analysis.filters import StopFilter, STOP_WORDS
from whoosh.analysis.morph import StemFilter
from whoosh.analysis.intraword import IntraWordFilter
from whoosh.analysis.tokenizers import default_pattern
from whoosh.analysis.tokenizers import CommaSeparatedTokenizer
from whoosh.analysis.tokenizers import IDTokenizer
from whoosh.analysis.tokenizers import RegexTokenizer
from whoosh.analysis.tokenizers import SpaceSeparatedTokenizer
from whoosh.lang.porter import stem


# Analyzers

class Analyzer(Composable):
    """Abstract base class for analyzers.
    """

    def __repr__(self):
        return "%s()" % self.__class__.__name__

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.__dict__ == other.__dict__)

    def __call__(self, value, **kwargs):
        raise NotImplementedError

    def clean(self):
        pass


class CompositeAnalyzer(Analyzer):
    """Analyzer made up of a tokenizer followed by zero or more filters.
    """

    def __init__(self, *composables):
        self.items = []
        for comp in composables:
            # Flatten nested composites so the chain stays a single list
            if isinstance(comp, CompositeAnalyzer):
                self.items.extend(comp.items)
            else:
                self.items.append(comp)

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__,
                           ", ".join(repr(item) for item in self.items))

    def __call__(self, value, no_morph=False, **kwargs):
        items = self.items
        # Start with the tokenizer
        gen = items[0](value, **kwargs)
        # Run the token stream through the filters
        for item in items[1:]:
            if not (no_morph and hasattr(item, "is_morph") and item.is_morph):
                gen = item(gen)
        return gen

    def __getitem__(self, item):
        return self.items.__getitem__(item)

    def __len__(self):
        return len(self.items)

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.items == other.items)

    def clean(self):
        for item in self.items:
            if hasattr(item, "clean"):
                item.clean()

    def has_morph(self):
        return any(item.is_morph for item in self.items)
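
# Composing a tokenizer with filters via the | operator (provided by
# Composable) is what produces a CompositeAnalyzer in practice. An
# illustrative sketch (output assumes "the" is in the default STOP_WORDS):
#
#   >>> my_ana = RegexTokenizer() | LowercaseFilter() | StopFilter()
#   >>> [t.text for t in my_ana("The quick Brown fox")]
#   ["quick", "brown", "fox"]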


# Functions that return composed analyzers

def IDAnalyzer(lowercase=False):
    """Deprecated: just use an IDTokenizer directly, with a LowercaseFilter if
    desired.
    """

    tokenizer = IDTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer
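
# Illustrative sketch: IDTokenizer yields the entire input as a single
# token, so this analyzer suits fields that are matched as exact units:
#
#   >>> ana = IDAnalyzer(lowercase=True)
#   >>> [t.text for t in ana("Top Secret")]
#   ["top secret"]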


def KeywordAnalyzer(lowercase=False, commas=False):
    """Parses whitespace- or comma-separated tokens.

    >>> ana = KeywordAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["Hello", "there,", "this", "is", "a", "TEST"]

    :param lowercase: Whether to lowercase the tokens.
    :param commas: If True, items are separated by commas rather than
        whitespace.
    """

    if commas:
        tokenizer = CommaSeparatedTokenizer()
    else:
        tokenizer = SpaceSeparatedTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer
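
# Illustrative sketch: with commas=True, the CommaSeparatedTokenizer splits
# on commas and strips the whitespace around each item:
#
#   >>> ana = KeywordAnalyzer(commas=True, lowercase=True)
#   >>> [t.text for t in ana("Big Data, Search, NLP")]
#   ["big data", "search", "nlp"]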


def RegexAnalyzer(expression=r"\w+(\.?\w+)*", gaps=False):
    """Deprecated: just use a RegexTokenizer directly.
    """

    return RegexTokenizer(expression=expression, gaps=gaps)
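
# Illustrative sketch: the default expression keeps dotted runs of word
# characters together, and case is preserved since no filter is applied:
#
#   >>> ana = RegexAnalyzer()
#   >>> [t.text for t in ana("Whoosh version 2.7")]
#   ["Whoosh", "version", "2.7"]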


def SimpleAnalyzer(expression=default_pattern, gaps=False):
    """Composes a RegexTokenizer with a LowercaseFilter.

    >>> ana = SimpleAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["hello", "there", "this", "is", "a", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    return RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()
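
# Illustrative sketch: with gaps=True the expression describes the
# separators rather than the tokens, so this splits on punctuation:
#
#   >>> ana = SimpleAnalyzer(expression=r"[,;]\s*", gaps=True)
#   >>> [t.text for t in ana("One, Two; THREE")]
#   ["one", "two", "three"]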


def StandardAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                     minsize=2, maxsize=None, gaps=False):
    """Composes a RegexTokenizer with a LowercaseFilter and optional
    StopFilter.

    >>> ana = StandardAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["testing", "testing", "testing"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain
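
# Illustrative sketch with a hypothetical stoplist: any collection of words
# is accepted, and minsize controls the shortest token kept:
#
#   >>> ana = StandardAnalyzer(stoplist=["foo", "bar"], minsize=1)
#   >>> [t.text for t in ana("Foo a bar Baz")]
#   ["a", "baz"]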


def StemmingAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                     minsize=2, maxsize=None, gaps=False, stemfn=stem,
                     ignore=None, cachesize=50000):
    """Composes a RegexTokenizer with a lower case filter, an optional stop
    filter, and a stemming filter.

    >>> ana = StemmingAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param ignore: A set of words to not stem.
    :param cachesize: The maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use. Use None for no cache, or -1 for an unbounded cache.
    """

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore,
                              cachesize=cachesize)
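
# Illustrative sketch: words in the ignore set pass through the StemFilter
# unchanged (compare with the docstring example above):
#
#   >>> ana = StemmingAnalyzer(ignore=["testing"])
#   >>> [t.text for t in ana("Testing is testing and testing")]
#   ["testing", "testing", "testing"]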


def FancyAnalyzer(expression=r"\s+", stoplist=STOP_WORDS, minsize=2,
                  maxsize=None, gaps=True, splitwords=True, splitnums=True,
                  mergewords=False, mergenums=False):
    """Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and
    StopFilter.

    >>> ana = FancyAnalyzer()
    >>> [token.text for token in ana("Should I call getInt or get_real?")]
    ["should", "call", "getInt", "get", "int", "get_real", "get", "real"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    return (RegexTokenizer(expression=expression, gaps=gaps)
            | IntraWordFilter(splitwords=splitwords, splitnums=splitnums,
                              mergewords=mergewords, mergenums=mergenums)
            | LowercaseFilter()
            | StopFilter(stoplist=stoplist, minsize=minsize,
                         maxsize=maxsize)
            )


def LanguageAnalyzer(lang, expression=default_pattern, gaps=False,
                     cachesize=50000):
    """Configures a simple analyzer for the given language, with a
    LowercaseFilter, StopFilter, and StemFilter.

    >>> ana = LanguageAnalyzer("es")
    >>> [token.text for token in ana("Por el mar corren las liebres")]
    ['mar', 'corr', 'liebr']

    :param expression: The regular expression pattern to use to extract tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param cachesize: The maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use.
    """

    from whoosh.lang import NoStemmer, NoStopWords
    from whoosh.lang import stopwords_for_language

    # Make the start of the chain
    chain = (RegexTokenizer(expression=expression, gaps=gaps)
             | LowercaseFilter())

    # Add a stop word filter
    try:
        stoplist = stopwords_for_language(lang)
        chain = chain | StopFilter(stoplist=stoplist)
    except NoStopWords:
        pass

    # Add a stemming filter
    try:
        chain = chain | StemFilter(lang=lang, cachesize=cachesize)
    except NoStemmer:
        pass

    return chain
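
# Illustrative sketch: only the stop words and stemmer vary by language, and
# either stage is skipped silently if whoosh.lang has none for that language
# (the stemmed output shown assumes the English stemmer's behavior):
#
#   >>> ana = LanguageAnalyzer("en")
#   >>> [t.text for t in ana("Falling apples")]
#   ["fall", "appl"]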