2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
18
package org.apache.solr.analysis;
20
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.solr.analysis.BaseTokenFilterFactory;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.SolrException;
import org.apache.solr.util.plugin.ResourceLoaderAware;

import java.io.InputStream;
import java.util.Map;
import org.xml.sax.InputSource;
36
* Factory for {@link HyphenationCompoundWordTokenFilter}.
38
* This factory accepts the following parameters:
40
* <li><code>hyphenator</code> (mandatory): path to the FOP xml hyphenation pattern.
41
* See <a href="http://offo.sourceforge.net/hyphenation/">http://offo.sourceforge.net/hyphenation/</a>.
42
* <li><code>encoding</code> (optional): encoding of the xml hyphenation file. defaults to UTF-8.
43
* <li><code>dictionary</code> (optional): dictionary of words. defaults to no dictionary.
44
* <li><code>minWordSize</code> (optional): minimal word length that gets decomposed. defaults to 5.
45
* <li><code>minSubwordSize</code> (optional): minimum length of subwords. defaults to 2.
46
* <li><code>maxSubwordSize</code> (optional): maximum length of subwords. defaults to 15.
47
* <li><code>onlyLongestMatch</code> (optional): if true, adds only the longest matching subword
48
* to the stream. defaults to false.
51
* <pre class="prettyprint" >
52
* <fieldType name="text_hyphncomp" class="solr.TextField" positionIncrementGap="100">
54
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
55
* <filter class="solr.HyphenationCompoundWordTokenFilterFactory" hyphenator="hyphenator.xml" encoding="UTF-8"
56
* dictionary="dictionary.txt" minWordSize="5" minSubwordSize="2" maxSubwordSize="15" onlyLongestMatch="false"/>
58
* </fieldType></pre>
60
* @see HyphenationCompoundWordTokenFilter
62
public class HyphenationCompoundWordTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
63
private CharArraySet dictionary;
64
private HyphenationTree hyphenator;
65
private String dictFile;
66
private String hypFile;
67
private String encoding;
68
private int minWordSize;
69
private int minSubwordSize;
70
private int maxSubwordSize;
71
private boolean onlyLongestMatch;
74
public void init(Map<String, String> args) {
77
dictFile = args.get("dictionary");
78
if (args.containsKey("encoding"))
79
encoding = args.get("encoding");
80
hypFile = args.get("hyphenator");
81
if (null == hypFile) {
82
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
83
"Missing required parameter: hyphenator");
86
minWordSize = getInt("minWordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
87
minSubwordSize = getInt("minSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
88
maxSubwordSize = getInt("maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
89
onlyLongestMatch = getBoolean("onlyLongestMatch", false);
92
public void inform(ResourceLoader loader) {
93
InputStream stream = null;
95
if (dictFile != null) // the dictionary can be empty.
96
dictionary = getWordSet(loader, dictFile, false);
97
// TODO: Broken, because we cannot resolve real system id
98
// ResourceLoader should also supply method like ClassLoader to get resource URL
99
stream = loader.openResource(hypFile);
100
final InputSource is = new InputSource(stream);
101
is.setEncoding(encoding); // if it's null let xml parser decide
102
is.setSystemId(hypFile);
103
hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
104
} catch (Exception e) { // TODO: getHyphenationTree really shouldn't throw "Exception"
105
throw new RuntimeException(e);
107
IOUtils.closeQuietly(stream);
111
public HyphenationCompoundWordTokenFilter create(TokenStream input) {
112
return new HyphenationCompoundWordTokenFilter(luceneMatchVersion, input, hyphenator, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);