1
package org.apache.solr.analysis;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
21
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.text.ParseException;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.util.Version;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
48
* @deprecated (3.4) Use {@link SynonymFilterFactory} instead. This is only a backwards compatibility
49
* mechanism that will be removed in Lucene 5.0
51
// NOTE: rename this to "SynonymFilterFactory" and nuke that delegator in Lucene 5.0!
53
final class FSTSynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
54
private SynonymMap map;
55
private boolean ignoreCase;
57
public TokenStream create(TokenStream input) {
58
// if the fst is null, it means there's actually no synonyms... just return the original stream
59
// as there is nothing to do here.
60
return map.fst == null ? input : new SynonymFilter(input, map, ignoreCase);
63
public void inform(ResourceLoader loader) {
64
final boolean ignoreCase = getBoolean("ignoreCase", false);
65
this.ignoreCase = ignoreCase;
67
String tf = args.get("tokenizerFactory");
69
final TokenizerFactory factory = tf == null ? null : loadTokenizerFactory(loader, tf, args);
71
Analyzer analyzer = new ReusableAnalyzerBase() {
73
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
74
Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_31, reader) : factory.create(reader);
75
TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_31, tokenizer) : tokenizer;
76
return new TokenStreamComponents(tokenizer, stream);
80
String format = args.get("format");
82
if (format == null || format.equals("solr")) {
83
// TODO: expose dedup as a parameter?
84
map = loadSolrSynonyms(loader, true, analyzer);
85
} else if (format.equals("wordnet")) {
86
map = loadWordnetSynonyms(loader, true, analyzer);
88
// TODO: somehow make this more pluggable
89
throw new RuntimeException("Unrecognized synonyms format: " + format);
91
} catch (Exception e) {
92
throw new RuntimeException(e);
95
if (map.fst == null) {
96
log.warn("Synonyms loaded with " + args + " has empty rule set!");
101
* Load synonyms from the solr format, "format=solr".
103
private SynonymMap loadSolrSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
104
final boolean expand = getBoolean("expand", true);
105
String synonyms = args.get("synonyms");
106
if (synonyms == null)
107
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
109
CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
110
.onMalformedInput(CodingErrorAction.REPORT)
111
.onUnmappableCharacter(CodingErrorAction.REPORT);
113
SolrSynonymParser parser = new SolrSynonymParser(dedup, expand, analyzer);
114
File synonymFile = new File(synonyms);
115
if (synonymFile.exists()) {
117
parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
119
List<String> files = StrUtils.splitFileNames(synonyms);
120
for (String file : files) {
122
parser.add(new InputStreamReader(loader.openResource(file), decoder));
125
return parser.build();
129
* Load synonyms from the wordnet format, "format=wordnet".
131
private SynonymMap loadWordnetSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
132
final boolean expand = getBoolean("expand", true);
133
String synonyms = args.get("synonyms");
134
if (synonyms == null)
135
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
137
CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
138
.onMalformedInput(CodingErrorAction.REPORT)
139
.onUnmappableCharacter(CodingErrorAction.REPORT);
141
WordnetSynonymParser parser = new WordnetSynonymParser(dedup, expand, analyzer);
142
File synonymFile = new File(synonyms);
143
if (synonymFile.exists()) {
145
parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
147
List<String> files = StrUtils.splitFileNames(synonyms);
148
for (String file : files) {
150
parser.add(new InputStreamReader(loader.openResource(file), decoder));
153
return parser.build();
156
private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String,String> args){
157
TokenizerFactory tokFactory = (TokenizerFactory) loader.newInstance(cname);
158
tokFactory.init(args);