/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
7
#ifndef _lucene_analysis_Analyzers_
8
#define _lucene_analysis_Analyzers_
10
#include "CLucene/util/VoidList.h"
11
#include "CLucene/util/VoidMap.h"
12
#include "CLucene/util/CLStreams.h"
13
#include "AnalysisHeader.h"
17
/** An abstract base class for simple, character-oriented tokenizers.*/
18
class CLUCENE_EXPORT CharTokenizer:public Tokenizer {
20
// Scan state: current offset into the source, write index into `buffer`,
// and the number of characters currently available in `ioBuffer`.
int32_t offset, bufferIndex, dataLen;
21
// Accumulates the characters of the token currently being built
// (at most LUCENE_MAX_WORD_LEN characters plus terminator).
TCHAR buffer[LUCENE_MAX_WORD_LEN+1];
22
// Window of characters read from the input reader.
const TCHAR* ioBuffer;
25
/** Returns true iff a character should be included in a token. This
26
* tokenizer generates as tokens adjacent sequences of characters which
27
* satisfy this predicate. Characters for which this is false are used to
28
* define token boundaries and are not included in tokens. */
29
virtual bool isTokenChar(const TCHAR c) const = 0;
31
/** Called on each token character to normalize it before it is added to the
32
* token. The default implementation does nothing. Subclasses may use this
33
* to, e.g., lowercase tokens. */
34
virtual TCHAR normalize(const TCHAR c) const;
37
/** Constructs a tokenizer reading from the given Reader. */
CharTokenizer(CL_NS(util)::Reader* in);
38
/** Returns the next token from the input (see Tokenizer/TokenStream contract). */
Token* next(Token* token);
39
/** Re-initialises the tokenizer to read from a new input Reader. */
void reset(CL_NS(util)::Reader* input);
41
virtual ~CharTokenizer();
45
/** A LetterTokenizer is a tokenizer that divides text at non-letters. That's
46
to say, it defines tokens as maximal strings of adjacent letters, as defined
47
by java.lang.Character.isLetter() predicate.
49
Note: this does a decent job for most European languages, but does a terrible
50
job for some Asian languages, where words are not separated by spaces. */
51
class CLUCENE_EXPORT LetterTokenizer:public CharTokenizer {
53
/** Constructs a new LetterTokenizer reading from the given Reader. */
54
LetterTokenizer(CL_NS(util)::Reader* in);
55
virtual ~LetterTokenizer();
57
/** Collects only characters which satisfy _istalpha.
* Overrides the pure-virtual CharTokenizer::isTokenChar. */
58
bool isTokenChar(const TCHAR c) const;
64
/**
* LowerCaseTokenizer performs the function of LetterTokenizer
65
* and LowerCaseFilter together. It divides text at non-letters and converts
66
* them to lower case. While it is functionally equivalent to the combination
67
* of LetterTokenizer and LowerCaseFilter, there is a performance advantage
68
* to doing the two tasks at once, hence this (redundant) implementation.
70
* Note: this does a decent job for most European languages, but does a terrible
71
* job for some Asian languages, where words are not separated by spaces.
*/
73
class CLUCENE_EXPORT LowerCaseTokenizer:public LetterTokenizer {
75
/** Construct a new LowerCaseTokenizer. */
76
LowerCaseTokenizer(CL_NS(util)::Reader* in);
77
virtual ~LowerCaseTokenizer();
79
/** Collects only characters which satisfy _totlower.
* Overrides CharTokenizer::normalize to lowercase each token character. */
80
TCHAR normalize(const TCHAR chr) const;
84
/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
85
* Adjacent sequences of non-Whitespace characters form tokens. */
86
class CLUCENE_EXPORT WhitespaceTokenizer: public CharTokenizer {
88
/** Construct a new WhitespaceTokenizer. */
89
WhitespaceTokenizer(CL_NS(util)::Reader* in);
90
virtual ~WhitespaceTokenizer();
92
/** Collects only characters which do not satisfy _istspace.
* Overrides the pure-virtual CharTokenizer::isTokenChar. */
93
bool isTokenChar(const TCHAR c) const;
97
/** An Analyzer that uses WhitespaceTokenizer. */
98
class CLUCENE_EXPORT WhitespaceAnalyzer: public Analyzer {
100
WhitespaceAnalyzer();
101
/** Creates a token stream that splits the reader's text at whitespace. */
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
102
/** Like tokenStream, but may reuse a previously created stream (see Analyzer). */
TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
103
virtual ~WhitespaceAnalyzer();
106
/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
107
class CLUCENE_EXPORT SimpleAnalyzer: public Analyzer {
110
/** Creates a lowercasing, letter-splitting token stream over the reader. */
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
111
/** Like tokenStream, but may reuse a previously created stream (see Analyzer). */
TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
113
virtual ~SimpleAnalyzer();
119
/**
* Normalizes token text to lower case.
*/
121
class CLUCENE_EXPORT LowerCaseFilter: public TokenFilter {
123
// deleteTokenStream presumably means this filter takes ownership of `in`
// and destroys it on destruction — TODO(review): confirm in implementation.
LowerCaseFilter(TokenStream* in, bool deleteTokenStream);
124
virtual ~LowerCaseFilter();
125
/** Returns the next token from the wrapped stream with its text lowercased. */
Token* next(Token* token);
130
/**
* Removes stop words from a token stream.
*/
132
class CLUCENE_EXPORT StopFilter: public TokenFilter {
134
//bvk: i found this to work faster with a non-hash table. the number of items
135
//in the stop table is not likely to make it worth having hashing.
136
//ish: implement a radix/patricia tree for this?
137
// Set of words to drop from the stream.
CLTCSetList* stopWords;
138
// If true, `stopWords` is destroyed with this filter.
bool deleteStopTable;
140
// When true, a stopped token increments the next token's position increment.
bool enablePositionIncrements;
141
// Whether stop-word matching ignores case (fixed at construction).
const bool ignoreCase;
143
// Process-wide default for `enablePositionIncrements` of new StopFilters.
static bool ENABLE_POSITION_INCREMENTS_DEFAULT;
145
// Constructs a filter which removes words from the input
146
// TokenStream that are named in the array of words.
147
StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** _stopWords, const bool _ignoreCase = false);
149
virtual ~StopFilter();
151
/** Constructs a filter which removes words from the input
152
* TokenStream that are named in the CLSetList.
*/
154
StopFilter(TokenStream* in, bool deleteTokenStream, CLTCSetList* stopTable, bool _deleteStopTable=false);
157
/**
* Builds a Hashtable from an array of stop words, appropriate for passing
158
* into the StopFilter constructor. This permits this table construction to
159
* be cached once when an Analyzer is constructed.
160
* Note: the stopWords list must be a static list because the strings are not copied
*/
162
static void fillStopTable(CLTCSetList* stopTable,
163
const TCHAR** stopWords, const bool _ignoreCase = false);
166
/**
* Returns the next input Token whose termText() is not a stop word.
*/
168
Token* next(Token* token);
172
/**
* @see #setEnablePositionIncrementsDefault(boolean).
*/
174
static bool getEnablePositionIncrementsDefault();
177
/**
* Set the default position increments behavior of every StopFilter created from now on.
179
* Note: behavior of a single StopFilter instance can be modified
180
* with {@link #setEnablePositionIncrements(boolean)}.
181
* This static method allows control over behavior of classes using StopFilters internally,
182
* for example {@link lucene::analysis::standard::StandardAnalyzer StandardAnalyzer}.
185
* @see #setEnablePositionIncrements(boolean).
*/
187
static void setEnablePositionIncrementsDefault(const bool defaultValue);
190
/**
* @see #setEnablePositionIncrements(boolean).
*/
192
bool getEnablePositionIncrements() const;
195
/**
* Set to <code>true</code> to make <b>this</b> StopFilter enable position increments to result tokens.
197
* When set, when a token is stopped (omitted), the position increment of
198
* the following token is incremented.
200
* Default: see {@link #setEnablePositionIncrementsDefault(boolean)}.
*/
202
void setEnablePositionIncrements(const bool enable);
207
/**
* Loader for text files that represent a list of stopwords.
*/
210
class CLUCENE_EXPORT WordlistLoader {
213
/**
* Loads a text file and adds every line as an entry to a HashSet (omitting
214
* leading and trailing whitespace). Every line of the file should contain only
215
* one word. The words need to be in lowercase if you make use of an
216
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
218
* @param wordfile File containing the wordlist
219
* @return A HashSet with the file's words
*/
221
static CLTCSetList* getWordSet(const char* wordfilePath, const char* enc = NULL, CLTCSetList* stopTable = NULL);
224
/**
* Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
225
* leading and trailing whitespace). Every line of the Reader should contain only
226
* one word. The words need to be in lowercase if you make use of an
227
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
229
* @param reader Reader containing the wordlist
230
* @return A HashSet with the reader's words
*/
232
static CLTCSetList* getWordSet(CL_NS(util)::Reader* reader, CLTCSetList* stopTable = NULL, const bool bDeleteReader = false);
236
/** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
237
class CLUCENE_EXPORT StopAnalyzer: public Analyzer {
238
// Stop-word set used by the StopFilter this analyzer creates.
CLTCSetList* stopTable;
242
/** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
// NOTE(review): the default-constructor declaration `StopAnalyzer();` that this
// comment describes appears to have been lost from this chunk — verify against
// the full header before relying on this view.
244
virtual ~StopAnalyzer();
246
/** Builds an analyzer which removes words in the provided array. */
247
StopAnalyzer( const TCHAR** stopWords );
249
/** Builds an analyzer with the stop words from the given file.
250
* @see WordlistLoader#getWordSet(File)
*/
252
StopAnalyzer(const char* stopwordsFile, const char* enc = NULL);
254
/** Builds an analyzer with the stop words from the given reader.
255
* @see WordlistLoader#getWordSet(Reader)
*/
257
StopAnalyzer(CL_NS(util)::Reader* stopwordsReader, const bool _bDeleteReader = false);
259
/** Filters LowerCaseTokenizer with StopFilter. */
260
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
261
/** Like tokenStream, but may reuse a previously created stream (see Analyzer). */
TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
263
/** An array containing some common English words that are not usually useful
* for searching. */
265
static const TCHAR* ENGLISH_STOP_WORDS[];
271
/**
* This analyzer is used to facilitate scenarios where different
272
* fields require different analysis techniques. Use {@link #addAnalyzer}
273
* to add a non-default analyzer on a field name basis.
278
* PerFieldAnalyzerWrapper* aWrapper =
279
* new PerFieldAnalyzerWrapper(new StandardAnalyzer());
280
* aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
281
* aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
284
* <p>In this example, StandardAnalyzer will be used for all fields except "firstname"
285
* and "lastname", for which KeywordAnalyzer will be used.
287
* <p>A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing
* and query parsing.
*/
290
class CLUCENE_EXPORT PerFieldAnalyzerWrapper : public Analyzer {
292
// Analyzer used for any field without a specific mapping.
Analyzer* defaultAnalyzer;
294
// Maps field name -> analyzer; owns both keys and values (see Deletor types).
typedef CL_NS(util)::CLHashMap<TCHAR*, Analyzer*, CL_NS(util)::Compare::TChar,
295
CL_NS(util)::Equals::TChar, CL_NS(util)::Deletor::tcArray,CL_NS(util)::Deletor::Void<Analyzer> > AnalyzerMapType;
296
AnalyzerMapType* analyzerMap;
299
/**
* Constructs with default analyzer.
301
* @param defaultAnalyzer Any fields not specifically
302
* defined to use a different analyzer will use the one provided here.
*/
304
PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer);
305
virtual ~PerFieldAnalyzerWrapper();
308
/**
* Defines an analyzer to use for the specified field.
310
* @param fieldName field name requiring a non-default analyzer
311
* @param analyzer non-default analyzer to use for field
*/
313
void addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer);
314
/** Delegates to the analyzer mapped to fieldName, or the default analyzer. */
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
316
/** Like tokenStream, but may reuse a previously created stream (see Analyzer). */
TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
318
/** Return the positionIncrementGap from the analyzer assigned to fieldName */
319
int32_t getPositionIncrementGap(const TCHAR* fieldName);
324
/**
* A filter that replaces accented characters in the ISO Latin 1 character set
325
* (ISO-8859-1) by their unaccented equivalent. The case will not be altered.
327
* For instance, 'à' will be replaced by 'a'.
*/
330
class CLUCENE_EXPORT ISOLatin1AccentFilter: public TokenFilter {
332
// deleteTs presumably means this filter takes ownership of `input`
// and destroys it on destruction — TODO(review): confirm in implementation.
ISOLatin1AccentFilter(TokenStream* input, bool deleteTs);
335
/**
* To replace accented characters in a String by unaccented equivalents.
*/
337
Token* next(Token* token);
338
virtual ~ISOLatin1AccentFilter();
343
/**
* Emits the entire input as a single token.
*/
345
class CLUCENE_EXPORT KeywordTokenizer: public Tokenizer {
347
// Default size of the internal read buffer, in characters.
LUCENE_STATIC_CONSTANT(int, DEFAULT_BUFFER_SIZE = 256);
351
// bufferSize < 0 presumably selects DEFAULT_BUFFER_SIZE — TODO(review): confirm.
KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize=-1);
352
/** Returns the whole input as one token (see class comment). */
Token* next(Token* token);
353
/** Re-initialises the tokenizer to read from a new input Reader. */
void reset(CL_NS(util)::Reader* input);
355
virtual ~KeywordTokenizer();
359
/**
* "Tokenizes" the entire stream as a single token. This is useful
360
* for data like zip codes, ids, and some product names.
*/
362
class CLUCENE_EXPORT KeywordAnalyzer: public Analyzer {
365
virtual ~KeywordAnalyzer();
366
/** Creates a KeywordTokenizer over the given reader. */
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
367
/** Like tokenStream, but may reuse a previously created stream (see Analyzer). */
TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
372
/**
* Removes words that are too long and too short from the stream.
*/
375
class CLUCENE_EXPORT LengthFilter: public TokenFilter {
381
/**
* Build a filter that removes words that are too long or too
382
* short from the text.
*/
384
LengthFilter(TokenStream* in, const size_t _min, const size_t _max);
387
/**
* Returns the next input Token whose termText() is the right len
*/
389
Token* next(Token* token);