1
package org.apache.lucene.analysis.pt;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
20
import java.io.IOException;
21
import java.io.InputStream;
22
import java.io.InputStreamReader;
23
import java.io.LineNumberReader;
24
import java.util.ArrayList;
25
import java.util.Arrays;
26
import java.util.HashMap;
27
import java.util.List;
29
import java.util.regex.Matcher;
30
import java.util.regex.Pattern;
32
import org.apache.lucene.analysis.CharArraySet;
33
import org.apache.lucene.util.Version;
35
import static org.apache.lucene.analysis.util.StemmerUtil.*;
38
* Base class for stemmers that use a set of RSLP-like stemming steps.
40
* RSLP (Removedor de Sufixos da Lingua Portuguesa) is an algorithm designed
41
* originally for stemming the Portuguese language, described in the paper
42
* <i>A Stemming Algorithm for the Portuguese Language</i>, Orengo et. al.
44
* Since this time a plural-only modification (RSLP-S) as well as a modification
45
* for the Galician language have been implemented. This class parses a configuration
46
* file that describes {@link Step}s, where each Step contains a set of {@link Rule}s.
48
* The general rule format is:
49
* <blockquote>{ "suffix", N, "replacement", { "exception1", "exception2", ...}}</blockquote>
52
* <li><code>suffix</code> is the suffix to be removed (such as "inho").
53
* <li><code>N</code> is the min stem size, where stem is defined as the candidate stem
54
* after removing the suffix (but before appending the replacement!)
55
* <li><code>replacement</code> is an optimal string to append after removing the suffix.
56
* This can be the empty string.
57
* <li><code>exceptions</code> is an optional list of exceptions, patterns that should
58
* not be stemmed. These patterns can be specified as whole word or suffix (ends-with)
59
* patterns, depending upon the exceptions format flag in the step header.
62
* A step is an ordered list of rules, with a structure in this format:
63
* <blockquote>{ "name", N, B, { "cond1", "cond2", ... }
68
* <li><code>name</code> is a name for the step (such as "Plural").
69
* <li><code>N</code> is the min word size. Words that are less than this length bypass
70
* the step completely, as an optimization. Note: N can be zero, in this case this
71
* implementation will automatically calculate the appropriate value from the underlying
73
* <li><code>B</code> is a "boolean" flag specifying how exceptions in the rules are matched.
74
* A value of 1 indicates whole-word pattern matching, a value of 0 indicates that
75
* exceptions are actually suffixes and should be matched with ends-with.
76
* <li><code>conds</code> are an optional list of conditions to enter the step at all. If
77
* the list is non-empty, then a word must end with one of these conditions or it will
78
* bypass the step completely as an optimization.
81
* @see <a href="http://www.inf.ufrgs.br/~viviane/rslp/index.htm">RSLP description</a>
84
public abstract class RSLPStemmerBase {
87
* A basic rule, with no exceptions.
89
protected static class Rule {
90
protected final char suffix[];
91
protected final char replacement[];
92
protected final int min;
96
* @param suffix suffix to remove
97
* @param min minimum stem length
98
* @param replacement replacement string
100
public Rule(String suffix, int min, String replacement) {
101
this.suffix = suffix.toCharArray();
102
this.replacement = replacement.toCharArray();
107
* @return true if the word matches this rule.
109
public boolean matches(char s[], int len) {
110
return (len - suffix.length >= min && endsWith(s, len, suffix));
114
* @return new valid length of the string after firing this rule.
116
public int replace(char s[], int len) {
117
if (replacement.length > 0) {
118
System.arraycopy(replacement, 0, s, len - suffix.length, replacement.length);
120
return len - suffix.length + replacement.length;
125
* A rule with a set of whole-word exceptions.
127
protected static class RuleWithSetExceptions extends Rule {
128
protected final CharArraySet exceptions;
130
public RuleWithSetExceptions(String suffix, int min, String replacement,
131
String[] exceptions) {
132
super(suffix, min, replacement);
133
for (int i = 0; i < exceptions.length; i++) {
134
if (!exceptions[i].endsWith(suffix))
135
System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
137
this.exceptions = new CharArraySet(Version.LUCENE_31,
138
Arrays.asList(exceptions), false);
142
public boolean matches(char s[], int len) {
143
return super.matches(s, len) && !exceptions.contains(s, 0, len);
148
* A rule with a set of exceptional suffixes.
150
protected static class RuleWithSuffixExceptions extends Rule {
151
// TODO: use a more efficient datastructure: automaton?
152
protected final char[][] exceptions;
154
public RuleWithSuffixExceptions(String suffix, int min, String replacement,
155
String[] exceptions) {
156
super(suffix, min, replacement);
157
for (int i = 0; i < exceptions.length; i++) {
158
if (!exceptions[i].endsWith(suffix))
159
System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
161
this.exceptions = new char[exceptions.length][];
162
for (int i = 0; i < exceptions.length; i++)
163
this.exceptions[i] = exceptions[i].toCharArray();
167
public boolean matches(char s[], int len) {
168
if (!super.matches(s, len))
171
for (int i = 0; i < exceptions.length; i++)
172
if (endsWith(s, len, exceptions[i]))
180
* A step containing a list of rules.
182
protected static class Step {
183
protected final String name;
184
protected final Rule rules[];
185
protected final int min;
186
protected final char[][] suffixes;
190
* @param name Step's name.
191
* @param rules an ordered list of rules.
192
* @param min minimum word size. if this is 0 it is automatically calculated.
193
* @param suffixes optional list of conditional suffixes. may be null.
195
public Step(String name, Rule rules[], int min, String suffixes[]) {
199
min = Integer.MAX_VALUE;
201
min = Math.min(min, r.min + r.suffix.length);
205
if (suffixes == null || suffixes.length == 0) {
206
this.suffixes = null;
208
this.suffixes = new char[suffixes.length][];
209
for (int i = 0; i < suffixes.length; i++)
210
this.suffixes[i] = suffixes[i].toCharArray();
215
* @return new valid length of the string after applying the entire step.
217
public int apply(char s[], int len) {
221
if (suffixes != null) {
222
boolean found = false;
224
for (int i = 0; i < suffixes.length; i++)
225
if (endsWith(s, len, suffixes[i])) {
230
if (!found) return len;
233
for (int i = 0; i < rules.length; i++) {
234
if (rules[i].matches(s, len))
235
return rules[i].replace(s, len);
243
* Parse a resource file into an RSLP stemmer description.
244
* @return a Map containing the named Steps in this description.
246
protected static Map<String,Step> parse(Class<? extends RSLPStemmerBase> clazz, String resource) {
247
// TODO: this parser is ugly, but works. use a jflex grammar instead.
249
InputStream is = clazz.getResourceAsStream(resource);
250
LineNumberReader r = new LineNumberReader(new InputStreamReader(is, "UTF-8"));
251
Map<String,Step> steps = new HashMap<String,Step>();
253
while ((step = readLine(r)) != null) {
254
Step s = parseStep(r, step);
255
steps.put(s.name, s);
259
} catch (IOException e) {
260
throw new RuntimeException(e);
264
private static final Pattern headerPattern =
265
Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*(0|1),\\s*\\{(.*)\\},\\s*$");
266
private static final Pattern stripPattern =
267
Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+)\\s*\\}\\s*(,|(\\}\\s*;))$");
268
private static final Pattern repPattern =
269
Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\"\\}\\s*(,|(\\}\\s*;))$");
270
private static final Pattern excPattern =
271
Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\",\\s*\\{(.*)\\}\\s*\\}\\s*(,|(\\}\\s*;))$");
273
private static Step parseStep(LineNumberReader r, String header) throws IOException {
274
Matcher matcher = headerPattern.matcher(header);
275
if (!matcher.find()) {
276
throw new RuntimeException("Illegal Step header specified at line " + r.getLineNumber());
278
assert matcher.groupCount() == 4;
279
String name = matcher.group(1);
280
int min = Integer.parseInt(matcher.group(2));
281
int type = Integer.parseInt(matcher.group(3));
282
String suffixes[] = parseList(matcher.group(4));
283
Rule rules[] = parseRules(r, type);
284
return new Step(name, rules, min, suffixes);
287
private static Rule[] parseRules(LineNumberReader r, int type) throws IOException {
288
List<Rule> rules = new ArrayList<Rule>();
290
while ((line = readLine(r)) != null) {
291
Matcher matcher = stripPattern.matcher(line);
292
if (matcher.matches()) {
293
rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), ""));
295
matcher = repPattern.matcher(line);
296
if (matcher.matches()) {
297
rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), matcher.group(3)));
299
matcher = excPattern.matcher(line);
300
if (matcher.matches()) {
302
rules.add(new RuleWithSuffixExceptions(matcher.group(1),
303
Integer.parseInt(matcher.group(2)),
305
parseList(matcher.group(4))));
307
rules.add(new RuleWithSetExceptions(matcher.group(1),
308
Integer.parseInt(matcher.group(2)),
310
parseList(matcher.group(4))));
313
throw new RuntimeException("Illegal Step rule specified at line " + r.getLineNumber());
317
if (line.endsWith(";"))
318
return rules.toArray(new Rule[rules.size()]);
323
private static String[] parseList(String s) {
326
String list[] = s.split(",");
327
for (int i = 0; i < list.length; i++)
328
list[i] = parseString(list[i].trim());
332
private static String parseString(String s) {
333
return s.substring(1, s.length()-1);
336
private static String readLine(LineNumberReader r) throws IOException {
338
while ((line = r.readLine()) != null) {
340
if (line.length() > 0 && line.charAt(0) != '#')