1
package org.apache.lucene.analysis.compound;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
21
import java.io.Reader;
24
import org.apache.lucene.analysis.TokenFilter; // for javadocs
25
import org.apache.lucene.analysis.TokenStream;
26
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
27
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
28
import org.apache.lucene.util.Version;
29
import org.xml.sax.InputSource;
32
* A {@link TokenFilter} that decomposes compound words found in many Germanic languages.
34
* "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
35
* "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation
36
* grammar and a word dictionary to achieve this.
38
* You must specify the required {@link Version} compatibility when creating
39
* CompoundWordTokenFilterBase:
41
* <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
42
* supplementary characters in strings and char arrays provided as compound word
45
* <p>If you pass in a {@link org.apache.lucene.analysis.CharArraySet} as dictionary,
46
* it should be case-insensitive unless it contains only lowercased entries and you
47
* have {@link org.apache.lucene.analysis.LowerCaseFilter} before this filter in your analysis chain.
48
* For optimal performance (as this filter does lots of lookups to the dictionary),
* you should use the latter analysis chain/CharArraySet. Be aware: If you supply arbitrary
50
* {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
51
* transformed to case-insensitive!
53
public class HyphenationCompoundWordTokenFilter extends
54
CompoundWordTokenFilterBase {
55
private HyphenationTree hyphenator;
58
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance from an
 * array-based dictionary.
 * <p>
 * The array is converted with {@code makeDictionary(matchVersion, dictionary)}
 * and the result is passed on, together with all other arguments, to the
 * constructor taking a {@link Set}.
 *
 * @param matchVersion
 *          Lucene version to enable correct Unicode 4.0 behavior in the
 *          dictionaries if Version &gt; 3.0. See
 *          {@link CompoundWordTokenFilterBase} for details.
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param dictionary
 *          the word dictionary to match against
 * @param minWordSize
 *          only words longer than this get processed
 * @param minSubwordSize
 *          only subwords longer than this get to the output stream
 * @param maxSubwordSize
 *          only subwords shorter than this get to the output stream
 * @param onlyLongestMatch
 *          Add only the longest matching subword to the stream
 * @deprecated Use the constructors taking {@link Set}
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
    HyphenationTree hyphenator, String[] dictionary, int minWordSize,
    int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
  this(matchVersion, input, hyphenator, makeDictionary(matchVersion, dictionary),
      minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
90
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance from an
 * array-based dictionary, using {@code DEFAULT_MIN_WORD_SIZE},
 * {@code DEFAULT_MIN_SUBWORD_SIZE} and {@code DEFAULT_MAX_SUBWORD_SIZE} and
 * with {@code onlyLongestMatch} switched off.
 *
 * @param matchVersion
 *          Lucene version to enable correct Unicode 4.0 behavior in the
 *          dictionaries if Version &gt; 3.0. See
 *          {@link CompoundWordTokenFilterBase} for details.
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param dictionary
 *          the word dictionary to match against
 * @deprecated Use the constructors taking {@link Set}
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
    HyphenationTree hyphenator, String[] dictionary) {
  // Pass the caller's version through, both when building the dictionary and
  // when delegating. Hard-coding Version.LUCENE_30 here would silently
  // discard the matchVersion argument this constructor exists to accept.
  this(matchVersion, input, hyphenator, makeDictionary(matchVersion, dictionary),
      DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
113
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance using
 * {@code DEFAULT_MIN_WORD_SIZE}, {@code DEFAULT_MIN_SUBWORD_SIZE} and
 * {@code DEFAULT_MAX_SUBWORD_SIZE}, with {@code onlyLongestMatch} switched off.
 *
 * @param matchVersion
 *          Lucene version to enable correct Unicode 4.0 behavior in the
 *          dictionaries if Version &gt; 3.0. See
 *          {@link CompoundWordTokenFilterBase} for details.
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param dictionary
 *          the word dictionary to match against.
 */
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
    HyphenationTree hyphenator, Set<?> dictionary) {
  // Delegate to the Version-aware constructor. The version-less overload is
  // deprecated and pins the filter to Version.LUCENE_30, which would ignore
  // the matchVersion supplied by the caller.
  this(matchVersion, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
      DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
134
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
 * <p>
 * Everything except the hyphenator is forwarded unchanged to the
 * {@link CompoundWordTokenFilterBase} constructor; the hyphenator is kept by
 * this filter itself.
 *
 * @param matchVersion
 *          Lucene version to enable correct Unicode 4.0 behavior in the
 *          dictionaries if Version &gt; 3.0. See
 *          {@link CompoundWordTokenFilterBase} for details.
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param dictionary
 *          the word dictionary to match against.
 * @param minWordSize
 *          only words longer than this get processed
 * @param minSubwordSize
 *          only subwords longer than this get to the output stream
 * @param maxSubwordSize
 *          only subwords shorter than this get to the output stream
 * @param onlyLongestMatch
 *          Add only the longest matching subword to the stream
 */
public HyphenationCompoundWordTokenFilter(
    Version matchVersion,
    TokenStream input,
    HyphenationTree hyphenator,
    Set<?> dictionary,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize,
    boolean onlyLongestMatch) {
  super(matchVersion, input, dictionary,
      minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);

  this.hyphenator = hyphenator;
}
166
/**
 * Create a HyphenationCompoundWordTokenFilter with no dictionary.
 * <p>
 * Delegates to
 * {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)}
 * with a {@code null} dictionary and {@code onlyLongestMatch} set to
 * {@code false}.
 *
 * @param matchVersion
 *          Lucene version, forwarded unchanged to the delegate constructor
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param minWordSize
 *          only words longer than this get processed
 * @param minSubwordSize
 *          only subwords longer than this get to the output stream
 * @param maxSubwordSize
 *          only subwords shorter than this get to the output stream
 */
public HyphenationCompoundWordTokenFilter(
    Version matchVersion,
    TokenStream input,
    HyphenationTree hyphenator,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize) {
  // The cast picks the Set overload; a bare null would be ambiguous with String[].
  this(matchVersion, input, hyphenator, (Set<?>) null,
      minWordSize, minSubwordSize, maxSubwordSize, false);
}
180
/**
 * Create a HyphenationCompoundWordTokenFilter with no dictionary.
 * <p>
 * Delegates to
 * {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, int, int, int)}
 * with {@code DEFAULT_MIN_WORD_SIZE}, {@code DEFAULT_MIN_SUBWORD_SIZE} and
 * {@code DEFAULT_MAX_SUBWORD_SIZE}.
 *
 * @param matchVersion
 *          Lucene version, forwarded unchanged to the delegate constructor
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 */
public HyphenationCompoundWordTokenFilter(
    Version matchVersion, TokenStream input, HyphenationTree hyphenator) {
  this(matchVersion, input, hyphenator,
      DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE);
}
193
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance with
 * {@link Version#LUCENE_30} behavior.
 * <p>
 * The array is converted with
 * {@code makeDictionary(Version.LUCENE_30, dictionary)} before delegating.
 *
 * @param input the {@link TokenStream} to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 * @param dictionary the word dictionary to match against
 * @param minWordSize only words longer than this get processed
 * @param minSubwordSize only subwords longer than this get to the output
 *        stream
 * @param maxSubwordSize only subwords shorter than this get to the output
 *        stream
 * @param onlyLongestMatch Add only the longest matching subword to the stream
 * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, String[], int, int, int, boolean)} instead.
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(TokenStream input,
    HyphenationTree hyphenator, String[] dictionary, int minWordSize,
    int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
  this(Version.LUCENE_30, input, hyphenator,
      makeDictionary(Version.LUCENE_30, dictionary),
      minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
215
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance with
 * {@link Version#LUCENE_30} behavior, using {@code DEFAULT_MIN_WORD_SIZE},
 * {@code DEFAULT_MIN_SUBWORD_SIZE} and {@code DEFAULT_MAX_SUBWORD_SIZE} and
 * with {@code onlyLongestMatch} switched off.
 *
 * @param input the {@link TokenStream} to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 * @param dictionary the word dictionary to match against
 * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, String[])} instead.
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(TokenStream input,
    HyphenationTree hyphenator, String[] dictionary) {
  this(Version.LUCENE_30, input, hyphenator,
      makeDictionary(Version.LUCENE_30, dictionary),
      DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
230
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance with
 * {@link Version#LUCENE_30} behavior, using {@code DEFAULT_MIN_WORD_SIZE},
 * {@code DEFAULT_MIN_SUBWORD_SIZE} and {@code DEFAULT_MAX_SUBWORD_SIZE} and
 * with {@code onlyLongestMatch} switched off.
 *
 * @param input the {@link TokenStream} to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 * @param dictionary the word dictionary to match against. If this is a
 *        {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must
 *        have set ignoreCase=false and only contain lower case strings.
 * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set)} instead.
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(TokenStream input,
    HyphenationTree hyphenator, Set<?> dictionary) {
  this(Version.LUCENE_30, input, hyphenator, dictionary,
      DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
246
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance with
 * {@link Version#LUCENE_30} behavior.
 *
 * @param input the {@link TokenStream} to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 * @param dictionary the word dictionary to match against. If this is a
 *        {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must
 *        have set ignoreCase=false and only contain lower case strings.
 * @param minWordSize only words longer than this get processed
 * @param minSubwordSize only subwords longer than this get to the output
 *        stream
 * @param maxSubwordSize only subwords shorter than this get to the output
 *        stream
 * @param onlyLongestMatch Add only the longest matching subword to the stream
 * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)} instead.
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(TokenStream input,
    HyphenationTree hyphenator, Set<?> dictionary, int minWordSize,
    int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
  // Same effect as calling super(Version.LUCENE_30, ...) and storing the
  // hyphenator here, but without duplicating the Version-aware constructor.
  this(Version.LUCENE_30, input, hyphenator, dictionary,
      minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
271
* Create a hyphenator tree
273
* @param hyphenationFilename the filename of the XML grammar to load
274
* @return An object representing the hyphenation patterns
277
public static HyphenationTree getHyphenationTree(String hyphenationFilename)
279
return getHyphenationTree(new InputSource(hyphenationFilename));
283
* Create a hyphenator tree
285
* @param hyphenationFile the file of the XML grammar to load
286
* @return An object representing the hyphenation patterns
289
public static HyphenationTree getHyphenationTree(File hyphenationFile)
291
return getHyphenationTree(new InputSource(hyphenationFile.toURL().toExternalForm()));
295
* Create a hyphenator tree
297
* @param hyphenationReader the reader of the XML grammar to load from
298
* @return An object representing the hyphenation patterns
300
* @deprecated Don't use Readers with fixed charset to load XML files, unless programatically created.
301
* Use {@link #getHyphenationTree(InputSource)} instead, where you can supply default charset and input
302
* stream, if you like.
305
public static HyphenationTree getHyphenationTree(Reader hyphenationReader)
307
final InputSource is = new InputSource(hyphenationReader);
308
// we need this to load the DTD in very old parsers (like the one in JDK 1.4).
309
// The DTD itsself is provided via EntityResolver, so it should always load, but
310
// some parsers still want to have a base URL (Crimson).
311
is.setSystemId("urn:java:" + HyphenationTree.class.getName());
312
return getHyphenationTree(is);
316
* Create a hyphenator tree
318
* @param hyphenationSource the InputSource pointing to the XML grammar
319
* @return An object representing the hyphenation patterns
322
public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
324
HyphenationTree tree = new HyphenationTree();
325
tree.loadPatterns(hyphenationSource);
330
protected void decompose() {
331
// get the hyphenation points
332
Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
333
// No hyphen points found -> exit
334
if (hyphens == null) {
338
final int[] hyp = hyphens.getHyphenationPoints();
340
for (int i = 0; i < hyp.length; ++i) {
341
int remaining = hyp.length - i;
343
CompoundToken longestMatchToken = null;
344
for (int j = 1; j < remaining; j++) {
345
int partLength = hyp[i + j] - start;
347
// if the part is longer than maxSubwordSize we
348
// are done with this round
349
if (partLength > this.maxSubwordSize) {
353
// we only put subwords to the token stream
354
// that are longer than minPartSize
355
if (partLength < this.minSubwordSize) {
359
// check the dictionary
360
if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
361
if (this.onlyLongestMatch) {
362
if (longestMatchToken != null) {
363
if (longestMatchToken.txt.length() < partLength) {
364
longestMatchToken = new CompoundToken(start, partLength);
367
longestMatchToken = new CompoundToken(start, partLength);
370
tokens.add(new CompoundToken(start, partLength));
372
} else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
373
// check the dictionary again with a word that is one character
375
// to avoid problems with genitive 's characters and other binding
377
if (this.onlyLongestMatch) {
378
if (longestMatchToken != null) {
379
if (longestMatchToken.txt.length() < partLength - 1) {
380
longestMatchToken = new CompoundToken(start, partLength - 1);
383
longestMatchToken = new CompoundToken(start, partLength - 1);
386
tokens.add(new CompoundToken(start, partLength - 1));
390
if (this.onlyLongestMatch && longestMatchToken!=null) {
391
tokens.add(longestMatchToken);