1
package org.apache.solr.analysis;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
20
import java.io.IOException;
22
import org.apache.lucene.analysis.*;
23
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
24
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
27
* When plain text is extracted from documents, many words are often left hyphenated and split across
28
* two lines. This is often the case with documents where narrow text columns are used, such as newsletters.
29
* To increase search efficiency, this filter rejoins hyphenated words that were split across two lines.
30
* This filter should be used at indexing time only.
31
* Example field definition in schema.xml:
33
* <fieldtype name="text" class="solr.TextField" positionIncrementGap="100">
34
* <analyzer type="index">
35
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
36
* <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
37
* <filter class="solr.StopFilterFactory" ignoreCase="true"/>
38
* <filter class="solr.HyphenatedWordsFilterFactory"/>
39
* <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
40
* <filter class="solr.LowerCaseFilterFactory"/>
41
* <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
43
* <analyzer type="query">
44
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
45
* <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
46
* <filter class="solr.StopFilterFactory" ignoreCase="true"/>
47
* <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
48
* <filter class="solr.LowerCaseFilterFactory"/>
49
* <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
55
public final class HyphenatedWordsFilter extends TokenFilter {
57
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
58
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
60
private final StringBuilder hyphenated = new StringBuilder();
61
private State savedState;
62
private boolean exhausted = false;
65
* Creates a new HyphenatedWordsFilter
67
* @param in TokenStream that will be filtered
69
public HyphenatedWordsFilter(TokenStream in) {
77
public boolean incrementToken() throws IOException {
78
while (!exhausted && input.incrementToken()) {
79
char[] term = termAttribute.buffer();
80
int termLength = termAttribute.length();
82
if (termLength > 0 && term[termLength - 1] == '-') {
84
// capture the state of the first token only
85
if (savedState == null) {
86
savedState = captureState();
88
hyphenated.append(term, 0, termLength - 1);
89
} else if (savedState == null) {
90
// not part of a hyphenated word.
93
// the final portion of a hyphenated word
94
hyphenated.append(term, 0, termLength);
102
if (savedState != null) {
103
// the final term ends with a hyphen
104
// add back the hyphen, for backwards compatibility.
105
hyphenated.append('-');
117
public void reset() throws IOException {
119
hyphenated.setLength(0);
124
// ================================================= Helper Methods ================================================
127
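* Writes the joined, unhyphenated term over the saved state of the first fragment,
* keeping that fragment's start offset and using the end offset of the last token read.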
129
private void unhyphenate() {
130
int endOffset = offsetAttribute.endOffset();
132
restoreState(savedState);
135
char term[] = termAttribute.buffer();
136
int length = hyphenated.length();
137
if (length > termAttribute.length()) {
138
term = termAttribute.resizeBuffer(length);
141
hyphenated.getChars(0, length, term, 0);
142
termAttribute.setLength(length);
143
offsetAttribute.setOffset(offsetAttribute.startOffset(), endOffset);
144
hyphenated.setLength(0);