# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.analysis.acore import Composable
from whoosh.analysis.filters import LowercaseFilter
from whoosh.analysis.filters import StopFilter, STOP_WORDS
from whoosh.analysis.morph import StemFilter
from whoosh.analysis.intraword import IntraWordFilter
from whoosh.analysis.tokenizers import default_pattern
from whoosh.analysis.tokenizers import CommaSeparatedTokenizer
from whoosh.analysis.tokenizers import IDTokenizer
from whoosh.analysis.tokenizers import RegexTokenizer
from whoosh.analysis.tokenizers import SpaceSeparatedTokenizer
from whoosh.lang.porter import stem


# Analyzers

class Analyzer(Composable):
    """Abstract base class for analyzers.
    """

    def __repr__(self):
        return "%s()" % self.__class__.__name__

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.__dict__ == other.__dict__)

    def __call__(self, value, **kwargs):
        raise NotImplementedError

    def clean(self):
        pass


class CompositeAnalyzer(Analyzer):
    """Analyzer made up of a tokenizer followed by zero or more filters.
    """

    def __init__(self, *composables):
        self.items = []
        for comp in composables:
            # Flatten nested composites so the chain stays a single list
            if isinstance(comp, CompositeAnalyzer):
                self.items.extend(comp.items)
            else:
                self.items.append(comp)

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__,
                           ", ".join(repr(item) for item in self.items))

    def __call__(self, value, no_morph=False, **kwargs):
        items = self.items
        # Start with the tokenizer
        gen = items[0](value, **kwargs)
        # Run the token stream through the filters
        for item in items[1:]:
            if not (no_morph and hasattr(item, "is_morph") and item.is_morph):
                gen = item(gen)
        return gen

    def __getitem__(self, item):
        return self.items.__getitem__(item)

    def __len__(self):
        return len(self.items)

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.items == other.items)

    def clean(self):
        for item in self.items:
            if hasattr(item, "clean"):
                item.clean()

    def has_morph(self):
        return any(item.is_morph for item in self.items)
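
# Composing a tokenizer with filters via the | operator (provided by
# Composable) is what produces a CompositeAnalyzer in practice. An
# illustrative sketch (output assumes "the" is in the default STOP_WORDS):
#
#   >>> my_ana = RegexTokenizer() | LowercaseFilter() | StopFilter()
#   >>> [t.text for t in my_ana("The quick Brown fox")]
#   ["quick", "brown", "fox"]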


# Functions that return composed analyzers

def IDAnalyzer(lowercase=False):
    """Deprecated: just use an IDTokenizer directly, with a LowercaseFilter if
    desired.
    """

    tokenizer = IDTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer
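
# Illustrative sketch: IDTokenizer yields the entire input as a single
# token, so this analyzer suits fields that are matched as exact units:
#
#   >>> ana = IDAnalyzer(lowercase=True)
#   >>> [t.text for t in ana("Top Secret")]
#   ["top secret"]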


def KeywordAnalyzer(lowercase=False, commas=False):
    """Parses whitespace- or comma-separated tokens.

    >>> ana = KeywordAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["Hello", "there,", "this", "is", "a", "TEST"]

    :param lowercase: Whether to lowercase the tokens.
    :param commas: If True, items are separated by commas rather than
        whitespace.
    """

    if commas:
        tokenizer = CommaSeparatedTokenizer()
    else:
        tokenizer = SpaceSeparatedTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer
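
# Illustrative sketch: with commas=True, the CommaSeparatedTokenizer splits
# on commas and strips the whitespace around each item:
#
#   >>> ana = KeywordAnalyzer(commas=True, lowercase=True)
#   >>> [t.text for t in ana("Big Data, Search, NLP")]
#   ["big data", "search", "nlp"]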


def RegexAnalyzer(expression=r"\w+(\.?\w+)*", gaps=False):
    """Deprecated: just use a RegexTokenizer directly.
    """

    return RegexTokenizer(expression=expression, gaps=gaps)
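
# Illustrative sketch: the default expression keeps dotted runs of word
# characters together, and case is preserved since no filter is applied:
#
#   >>> ana = RegexAnalyzer()
#   >>> [t.text for t in ana("Whoosh version 2.7")]
#   ["Whoosh", "version", "2.7"]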


def SimpleAnalyzer(expression=default_pattern, gaps=False):
    """Composes a RegexTokenizer with a LowercaseFilter.

    >>> ana = SimpleAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["hello", "there", "this", "is", "a", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    return RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()
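
# Illustrative sketch: with gaps=True the expression describes the
# separators rather than the tokens, so this splits on punctuation:
#
#   >>> ana = SimpleAnalyzer(expression=r"[,;]\s*", gaps=True)
#   >>> [t.text for t in ana("One, Two; THREE")]
#   ["one", "two", "three"]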


def StandardAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                     minsize=2, maxsize=None, gaps=False):
    """Composes a RegexTokenizer with a LowercaseFilter and optional
    StopFilter.

    >>> ana = StandardAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["testing", "testing", "testing"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain
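
# Illustrative sketch with a hypothetical stoplist: any collection of words
# is accepted, and minsize controls the shortest token kept:
#
#   >>> ana = StandardAnalyzer(stoplist=["foo", "bar"], minsize=1)
#   >>> [t.text for t in ana("Foo a bar Baz")]
#   ["a", "baz"]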


def StemmingAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                     minsize=2, maxsize=None, gaps=False, stemfn=stem,
                     ignore=None, cachesize=50000):
    """Composes a RegexTokenizer with a lower case filter, an optional stop
    filter, and a stemming filter.

    >>> ana = StemmingAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param ignore: A set of words to not stem.
    :param cachesize: The maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use. Use None for no cache, or -1 for an unbounded cache.
    """

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore,
                              cachesize=cachesize)
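
# Illustrative sketch: words in the ignore set pass through the StemFilter
# unchanged (compare with the docstring example above):
#
#   >>> ana = StemmingAnalyzer(ignore=["testing"])
#   >>> [t.text for t in ana("Testing is testing and testing")]
#   ["testing", "testing", "testing"]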


def FancyAnalyzer(expression=r"\s+", stoplist=STOP_WORDS, minsize=2,
                  maxsize=None, gaps=True, splitwords=True, splitnums=True,
                  mergewords=False, mergenums=False):
    """Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and
    StopFilter.

    >>> ana = FancyAnalyzer()
    >>> [token.text for token in ana("Should I call getInt or get_real?")]
    ["should", "call", "getInt", "get", "int", "get_real", "get", "real"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    return (RegexTokenizer(expression=expression, gaps=gaps)
            | IntraWordFilter(splitwords=splitwords, splitnums=splitnums,
                              mergewords=mergewords, mergenums=mergenums)
            | LowercaseFilter()
            | StopFilter(stoplist=stoplist, minsize=minsize,
                         maxsize=maxsize)
            )


def LanguageAnalyzer(lang, expression=default_pattern, gaps=False,
                     cachesize=50000):
    """Configures a simple analyzer for the given language, with a
    LowercaseFilter, StopFilter, and StemFilter.

    >>> ana = LanguageAnalyzer("es")
    >>> [token.text for token in ana("Por el mar corren las liebres")]
    ['mar', 'corr', 'liebr']

    :param expression: The regular expression pattern to use to extract tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param cachesize: The maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use.
    """

    from whoosh.lang import NoStemmer, NoStopWords
    from whoosh.lang import stopwords_for_language

    # Make the start of the chain
    chain = (RegexTokenizer(expression=expression, gaps=gaps)
             | LowercaseFilter())

    # Add a stop word filter
    try:
        stoplist = stopwords_for_language(lang)
        chain = chain | StopFilter(stoplist=stoplist)
    except NoStopWords:
        pass

    # Add a stemming filter
    try:
        chain = chain | StemFilter(lang=lang, cachesize=cachesize)
    except NoStemmer:
        pass

    return chain
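
# Illustrative sketch: only the stop words and stemmer vary by language, and
# either stage is skipped silently if whoosh.lang has none for that language
# (the stemmed output shown assumes the English stemmer's behavior):
#
#   >>> ana = LanguageAnalyzer("en")
#   >>> [t.text for t in ana("Falling apples")]
#   ["fall", "appl"]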