2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 2 of the License, or
5
* (at your option) any later version.
7
* This program is distributed in the hope that it will be useful,
8
* but WITHOUT ANY WARRANTY; without even the implied warranty of
9
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
* GNU General Public License for more details.
12
* You should have received a copy of the GNU General Public License
13
* along with this program; if not, write to the Free Software
14
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
* Copyright (C) 2001 University of Waikato, Hamilton, New Zealand
23
package weka.core.stemmers;
25
import weka.core.TechnicalInformation;
26
import weka.core.TechnicalInformation.Type;
27
import weka.core.TechnicalInformation.Field;
28
import weka.core.TechnicalInformationHandler;
30
import java.util.HashMap;
33
<!-- globalinfo-start -->
34
* A stemmer based on the Lovins stemmer, described here:<br/>
36
* Julie Beth Lovins (1968). Development of a stemming algorithm. Mechanical Translation and Computational Linguistics. 11:22-31.
38
<!-- globalinfo-end -->
40
<!-- technical-bibtex-start -->
43
* @article{Lovins1968,
44
* author = {Julie Beth Lovins},
45
* journal = {Mechanical Translation and Computational Linguistics},
47
* title = {Development of a stemming algorithm},
53
<!-- technical-bibtex-end -->
55
* @author Eibe Frank (eibe at cs dot waikato dot ac dot nz)
56
* @version $Revision: 1.6 $
58
public class LovinsStemmer
59
implements Stemmer, TechnicalInformationHandler {
61
/** for serialization */
static final long serialVersionUID = -6113024782588197L;

/**
 * Enters C version compatibility mode if set to true (emulates
 * features of the original C implementation that are inconsistent
 * with the algorithm as described in Lovins's paper).
 */
private static boolean m_CompMode = false;

/**
 * The hash tables containing the list of endings. Table m_lN holds the
 * endings that are N characters long; every ending maps to a one-letter
 * condition code (stored as a String) that removeEnding() looks up.
 */
private static HashMap m_l11 = null;
private static HashMap m_l10 = null;
private static HashMap m_l9 = null;
private static HashMap m_l8 = null;
private static HashMap m_l7 = null;
private static HashMap m_l6 = null;
private static HashMap m_l5 = null;
private static HashMap m_l4 = null;
private static HashMap m_l3 = null;
private static HashMap m_l2 = null;
private static HashMap m_l1 = null;

// Fills the ending tables once, when the class is initialised.
// NOTE(review): the text this block was recovered from had lost the
// "static {" wrapper and had bare listing numbers between the statements;
// both are repaired here. Where the listing numbers skipped source lines a
// NOTE(review) marks the spot - compare with the upstream file.
static {

  // Endings of length 11
  m_l11 = new HashMap();
  m_l11.put("alistically", "B");
  m_l11.put("arizability", "A");
  m_l11.put("izationally", "B");

  // Endings of length 10
  m_l10 = new HashMap();
  m_l10.put("antialness", "A");
  m_l10.put("arisations", "A");
  m_l10.put("arizations", "A");
  m_l10.put("entialness", "A");

  // Endings of length 9
  // The allocation below was absent (listing line 93); without it the first
  // put() dereferences null and class initialisation fails.
  m_l9 = new HashMap();
  m_l9.put("allically", "C");
  m_l9.put("antaneous", "A");
  m_l9.put("antiality", "A");
  m_l9.put("arisation", "A");
  m_l9.put("arization", "A");
  m_l9.put("ationally", "B");
  m_l9.put("ativeness", "A");
  m_l9.put("eableness", "E");
  m_l9.put("entations", "A");
  m_l9.put("entiality", "A");
  m_l9.put("entialize", "A");
  m_l9.put("entiation", "A");
  m_l9.put("ionalness", "A");
  m_l9.put("istically", "A");
  m_l9.put("itousness", "A");
  m_l9.put("izability", "A");
  m_l9.put("izational", "A");

  // Endings of length 8
  m_l8 = new HashMap();
  m_l8.put("ableness", "A");
  m_l8.put("arizable", "A");
  m_l8.put("entation", "A");
  m_l8.put("entially", "A");
  m_l8.put("eousness", "A");
  m_l8.put("ibleness", "A");
  m_l8.put("icalness", "A");
  m_l8.put("ionalism", "A");
  m_l8.put("ionality", "A");
  m_l8.put("ionalize", "A");
  m_l8.put("iousness", "A");
  m_l8.put("izations", "A");
  m_l8.put("lessness", "A");

  // Endings of length 7
  m_l7 = new HashMap();
  m_l7.put("ability", "A");
  m_l7.put("aically", "A");
  m_l7.put("alistic", "B");
  m_l7.put("alities", "A");
  m_l7.put("ariness", "E");
  m_l7.put("aristic", "A");
  m_l7.put("arizing", "A");
  m_l7.put("ateness", "A");
  m_l7.put("atingly", "A");
  m_l7.put("ational", "B");
  m_l7.put("atively", "A");
  m_l7.put("ativism", "A");
  m_l7.put("elihood", "E");
  m_l7.put("encible", "A");
  m_l7.put("entally", "A");
  m_l7.put("entials", "A");
  m_l7.put("entiate", "A");
  m_l7.put("entness", "A");
  m_l7.put("fulness", "A");
  m_l7.put("ibility", "A");
  m_l7.put("icalism", "A");
  m_l7.put("icalist", "A");
  m_l7.put("icality", "A");
  m_l7.put("icalize", "A");
  m_l7.put("ication", "G");
  m_l7.put("icianry", "A");
  m_l7.put("ination", "A");
  m_l7.put("ingness", "A");
  m_l7.put("ionally", "A");
  m_l7.put("isation", "A");
  m_l7.put("ishness", "A");
  m_l7.put("istical", "A");
  m_l7.put("iteness", "A");
  m_l7.put("iveness", "A");
  m_l7.put("ivistic", "A");
  m_l7.put("ivities", "A");
  m_l7.put("ization", "F");
  m_l7.put("izement", "A");
  m_l7.put("oidally", "A");
  m_l7.put("ousness", "A");

  // Endings of length 6
  m_l6 = new HashMap();
  m_l6.put("aceous", "A");
  m_l6.put("acious", "B");
  m_l6.put("action", "G");
  m_l6.put("alness", "A");
  m_l6.put("ancial", "A");
  m_l6.put("ancies", "A");
  m_l6.put("ancing", "B");
  m_l6.put("ariser", "A");
  m_l6.put("arized", "A");
  m_l6.put("arizer", "A");
  m_l6.put("atable", "A");
  m_l6.put("ations", "B");
  m_l6.put("atives", "A");
  m_l6.put("eature", "Z");
  m_l6.put("efully", "A");
  m_l6.put("encies", "A");
  m_l6.put("encing", "A");
  m_l6.put("ential", "A");
  m_l6.put("enting", "C");
  m_l6.put("entist", "A");
  m_l6.put("eously", "A");
  m_l6.put("ialist", "A");
  m_l6.put("iality", "A");
  m_l6.put("ialize", "A");
  m_l6.put("ically", "A");
  m_l6.put("icance", "A");
  m_l6.put("icians", "A");
  m_l6.put("icists", "A");
  m_l6.put("ifully", "A");
  m_l6.put("ionals", "A");
  m_l6.put("ionate", "D");
  m_l6.put("ioning", "A");
  m_l6.put("ionist", "A");
  m_l6.put("iously", "A");
  m_l6.put("istics", "A");
  m_l6.put("izable", "E");
  m_l6.put("lessly", "A");
  m_l6.put("nesses", "A");
  m_l6.put("oidism", "A");

  // Endings of length 5
  m_l5 = new HashMap();
  m_l5.put("acies", "A");
  m_l5.put("acity", "A");
  m_l5.put("aging", "B");
  m_l5.put("aical", "A");
  // NOTE(review): listing lines 211 and 213 (either side of "alist") were
  // absent - confirm whether this entry was guarded by a condition.
  m_l5.put("alist", "A");
  m_l5.put("alism", "B");
  m_l5.put("ality", "A");
  m_l5.put("alize", "A");
  m_l5.put("allic", "b");
  m_l5.put("anced", "B");
  m_l5.put("ances", "B");
  m_l5.put("antic", "C");
  m_l5.put("arial", "A");
  m_l5.put("aries", "A");
  m_l5.put("arily", "A");
  m_l5.put("arity", "B");
  m_l5.put("arize", "A");
  m_l5.put("aroid", "A");
  m_l5.put("ately", "A");
  m_l5.put("ating", "I");
  m_l5.put("ation", "B");
  m_l5.put("ative", "A");
  m_l5.put("ators", "A");
  m_l5.put("atory", "A");
  m_l5.put("ature", "E");
  m_l5.put("early", "Y");
  m_l5.put("ehood", "A");
  m_l5.put("eless", "A");
  // NOTE(review): listing lines 237, 239 and 241 (around "elily" and
  // "elity") were absent - confirm whether these entries were conditional.
  m_l5.put("elily", "A");
  m_l5.put("elity", "A");
  m_l5.put("ement", "A");
  m_l5.put("enced", "A");
  m_l5.put("ences", "A");
  m_l5.put("eness", "E");
  m_l5.put("ening", "E");
  m_l5.put("ental", "A");
  m_l5.put("ented", "C");
  m_l5.put("ently", "A");
  m_l5.put("fully", "A");
  m_l5.put("ially", "A");
  m_l5.put("icant", "A");
  m_l5.put("ician", "A");
  m_l5.put("icide", "A");
  m_l5.put("icism", "A");
  m_l5.put("icist", "A");
  m_l5.put("icity", "A");
  m_l5.put("idine", "I");
  m_l5.put("iedly", "A");
  m_l5.put("ihood", "A");
  m_l5.put("inate", "A");
  m_l5.put("iness", "A");
  m_l5.put("ingly", "B");
  m_l5.put("inism", "J");
  m_l5.put("inity", "c");
  m_l5.put("ional", "A");
  m_l5.put("ioned", "A");
  m_l5.put("ished", "A");
  m_l5.put("istic", "A");
  m_l5.put("ities", "A");
  m_l5.put("itous", "A");
  m_l5.put("ively", "A");
  m_l5.put("ivity", "A");
  m_l5.put("izers", "F");
  m_l5.put("izing", "F");
  m_l5.put("oidal", "A");
  m_l5.put("oides", "A");
  m_l5.put("otide", "A");
  m_l5.put("ously", "A");

  // Endings of length 4
  m_l4 = new HashMap();
  m_l4.put("able", "A");
  m_l4.put("ably", "A");
  m_l4.put("ages", "B");
  m_l4.put("ally", "B");
  m_l4.put("ance", "B");
  m_l4.put("ancy", "B");
  m_l4.put("ants", "B");
  m_l4.put("aric", "A");
  m_l4.put("arly", "K");
  m_l4.put("ated", "I");
  m_l4.put("ates", "A");
  m_l4.put("atic", "B");
  m_l4.put("ator", "A");
  m_l4.put("ealy", "Y");
  m_l4.put("edly", "E");
  m_l4.put("eful", "A");
  m_l4.put("eity", "A");
  m_l4.put("ence", "A");
  m_l4.put("ency", "A");
  m_l4.put("ened", "E");
  m_l4.put("enly", "E");
  m_l4.put("eous", "A");
  m_l4.put("hood", "A");
  m_l4.put("ials", "A");
  m_l4.put("ians", "A");
  m_l4.put("ible", "A");
  m_l4.put("ibly", "A");
  m_l4.put("ical", "A");
  m_l4.put("ides", "L");
  m_l4.put("iers", "A");
  m_l4.put("iful", "A");
  m_l4.put("ines", "M");
  m_l4.put("ings", "N");
  m_l4.put("ions", "B");
  m_l4.put("ious", "A");
  m_l4.put("isms", "B");
  m_l4.put("ists", "A");
  m_l4.put("itic", "H");
  m_l4.put("ized", "F");
  m_l4.put("izer", "F");
  m_l4.put("less", "A");
  m_l4.put("lily", "A");
  m_l4.put("ness", "A");
  m_l4.put("ogen", "A");
  m_l4.put("ward", "A");
  m_l4.put("wise", "A");
  m_l4.put("ying", "B");
  m_l4.put("yish", "A");

  // Endings of length 3
  m_l3 = new HashMap();
  m_l3.put("acy", "A");
  m_l3.put("age", "B");
  m_l3.put("aic", "A");
  m_l3.put("als", "b");
  m_l3.put("ant", "B");
  m_l3.put("ars", "O");
  m_l3.put("ary", "F");
  m_l3.put("ata", "A");
  m_l3.put("ate", "A");
  m_l3.put("eal", "Y");
  m_l3.put("ear", "Y");
  m_l3.put("ely", "E");
  m_l3.put("ene", "E");
  m_l3.put("ent", "C");
  m_l3.put("ery", "E");
  m_l3.put("ese", "A");
  m_l3.put("ful", "A");
  m_l3.put("ial", "A");
  m_l3.put("ian", "A");
  m_l3.put("ics", "A");
  m_l3.put("ide", "L");
  m_l3.put("ied", "A");
  m_l3.put("ier", "A");
  m_l3.put("ies", "P");
  m_l3.put("ily", "A");
  m_l3.put("ine", "M");
  m_l3.put("ing", "N");
  m_l3.put("ion", "Q");
  m_l3.put("ish", "C");
  m_l3.put("ism", "B");
  m_l3.put("ist", "A");
  m_l3.put("ite", "a");
  m_l3.put("ity", "A");
  m_l3.put("ium", "A");
  m_l3.put("ive", "A");
  m_l3.put("ize", "F");
  m_l3.put("oid", "A");
  m_l3.put("one", "R");
  m_l3.put("ous", "A");

  // Endings of length 2
  // NOTE(review): listing lines 370-385 were absent, so further two-letter
  // endings may be missing here - compare with upstream.
  m_l2 = new HashMap();
  m_l2.put("s\'", "A");
  m_l2.put("\'s", "A");

  // Endings of length 1
  // NOTE(review): listing lines 389-397 were absent; no one-letter endings
  // are visible, so this table is left empty - compare with upstream.
  m_l1 = new HashMap();
}
* Returns a string describing the stemmer
399
* @return a description suitable for
400
* displaying in the explorer/experimenter gui
402
public String globalInfo() {
404
"A stemmer based on the Lovins stemmer, described here:\n\n"
405
+ getTechnicalInformation().toString();
409
* Returns an instance of a TechnicalInformation object, containing
410
* detailed information about the technical background of this class,
411
* e.g., paper reference or book this class is based on.
413
* @return the technical information about this class
415
public TechnicalInformation getTechnicalInformation() {
416
TechnicalInformation result;
418
result = new TechnicalInformation(Type.ARTICLE);
419
result.setValue(Field.AUTHOR, "Julie Beth Lovins");
420
result.setValue(Field.YEAR, "1968");
421
result.setValue(Field.TITLE, "Development of a stemming algorithm");
422
result.setValue(Field.JOURNAL, "Mechanical Translation and Computational Linguistics");
423
result.setValue(Field.VOLUME, "11");
424
result.setValue(Field.PAGES, "22-31");
430
* Finds and removes ending from given word.
432
* @param word the word to work on
433
* @return the processed word
435
// Strips a table ending from the word when the condition code stored for
// that ending is satisfied; every satisfied condition returns the word
// without its last `el` characters.
// NOTE(review): this copy of the method is incomplete - the declaration of
// `el`, the loop/switch header around the `case 11` .. `case 1` lookups, the
// `case` labels of the condition-code switch and every `break` are not
// visible (the interleaved listing numbers skip them). The comments below
// describe only the tests that are visible; confirm against upstream.
private String removeEnding(String word) {
437
int length = word.length();
441
// Only consider an ending that leaves more than one character of stem.
if (length - el > 1) {
442
String ending = word.substring(length - el);
443
String conditionCode = null;
445
// Look the candidate ending up in the table for its length.
// NOTE(review): no `break` is visible between these cases - confirm.
case 11: conditionCode = (String)m_l11.get(ending);
447
case 10: conditionCode = (String)m_l10.get(ending);
449
case 9: conditionCode = (String)m_l9.get(ending);
451
case 8: conditionCode = (String)m_l8.get(ending);
453
case 7: conditionCode = (String)m_l7.get(ending);
455
case 6: conditionCode = (String)m_l6.get(ending);
457
case 5: conditionCode = (String)m_l5.get(ending);
459
case 4: conditionCode = (String)m_l4.get(ending);
461
case 3: conditionCode = (String)m_l3.get(ending);
463
case 2: conditionCode = (String)m_l2.get(ending);
465
case 1: conditionCode = (String)m_l1.get(ending);
469
// A null code means the ending is not in the table.
if (conditionCode != null) {
470
// Dispatch on the first character of the code; the case labels that
// select each test below are not visible in this copy.
switch (conditionCode.charAt(0)) {
472
// No restriction on the stem.
return word.substring(0, length - el);
474
// Stem must be longer than 2 characters.
if (length - el > 2) {
475
return word.substring(0, length - el);
479
// Stem must be longer than 3 characters.
if (length - el > 3) {
480
return word.substring(0, length - el);
484
// Stem must be longer than 4 characters.
if (length - el > 4) {
485
return word.substring(0, length - el);
489
// Ending must not follow 'e'.
if (word.charAt(length - el - 1) != 'e') {
490
return word.substring(0, length - el);
494
// Stem longer than 2 and ending not after 'e'.
if ((length - el > 2) &&
495
(word.charAt(length - el - 1) != 'e')) {
496
return word.substring(0, length - el);
500
// Stem longer than 2 and ending directly after 'f'.
if ((length - el > 2) &&
501
(word.charAt(length - el - 1) == 'f')) {
502
return word.substring(0, length - el);
506
// Ending after 't' or after "ll".
if ((word.charAt(length - el - 1) == 't') ||
507
((word.charAt(length - el - 1) == 'l') &&
508
(word.charAt(length - el - 2) == 'l'))) {
509
return word.substring(0, length - el);
513
// Ending not after 'o' or 'e'.
if ((word.charAt(length - el - 1) != 'o') &&
514
(word.charAt(length - el - 1) != 'e')) {
515
return word.substring(0, length - el);
519
// Ending not after 'a' or 'e'.
if ((word.charAt(length - el - 1) != 'a') &&
520
(word.charAt(length - el - 1) != 'e')) {
521
return word.substring(0, length - el);
525
// Stem longer than 2 and ending after 'l', 'i', or 'e' with 'u' two
// characters before that 'e'.
if ((length - el > 2) &&
526
((word.charAt(length - el - 1) == 'l') ||
527
(word.charAt(length - el - 1) == 'i') ||
528
((word.charAt(length - el - 1) == 'e') &&
529
(word.charAt(length - el - 3) == 'u')))) {
530
return word.substring(0, length - el);
534
// Ending not after 'u' or 'x', and not after 's' unless that 's'
// itself follows 'o'.
if ((word.charAt(length - el - 1) != 'u') &&
535
(word.charAt(length - el - 1) != 'x') &&
536
((word.charAt(length - el - 1) != 's') ||
537
(word.charAt(length - el - 2) == 'o'))) {
538
return word.substring(0, length - el);
542
// Ending not after 'a', 'c', 'e' or 'm'.
if ((word.charAt(length - el - 1) != 'a') &&
543
(word.charAt(length - el - 1) != 'c') &&
544
(word.charAt(length - el - 1) != 'e') &&
545
(word.charAt(length - el - 1) != 'm')) {
546
return word.substring(0, length - el);
550
// Stem longer than 3, or exactly 3 characters whose first is not 's'.
if ((length - el > 3) ||
551
((length - el == 3) &&
552
((word.charAt(length - el - 3) != 's')))) {
553
return word.substring(0, length - el);
557
// Ending after 'l' or 'i'.
if ((word.charAt(length - el - 1) == 'l') ||
558
(word.charAt(length - el - 1) == 'i')) {
559
return word.substring(0, length - el);
563
// Ending not after 'c'.
if (word.charAt(length - el - 1) != 'c') {
564
return word.substring(0, length - el);
568
// Stem longer than 2 and ending not after 'l' or 'n'.
if ((length - el > 2) &&
569
(word.charAt(length - el - 1) != 'l') &&
570
(word.charAt(length - el - 1) != 'n')) {
571
return word.substring(0, length - el);
575
// Ending after 'n' or 'r'.
if ((word.charAt(length - el - 1) == 'n') ||
576
(word.charAt(length - el - 1) == 'r')) {
577
return word.substring(0, length - el);
581
// Ending after "dr", or after 't' that does not itself follow 't'.
if (((word.charAt(length - el - 1) == 'r') &&
582
(word.charAt(length - el - 2) == 'd')) ||
583
((word.charAt(length - el - 1) == 't') &&
584
(word.charAt(length - el - 2) != 't'))) {
585
return word.substring(0, length - el);
589
// Ending after 's', or after 't' that does not itself follow 'o'.
if ((word.charAt(length - el - 1) == 's') ||
590
((word.charAt(length - el - 1) == 't') &&
591
(word.charAt(length - el - 2) != 'o'))) {
592
return word.substring(0, length - el);
596
// Ending after 'l', 'm', 'n' or 'r'.
if ((word.charAt(length - el - 1) == 'l') ||
597
(word.charAt(length - el - 1) == 'm') ||
598
(word.charAt(length - el - 1) == 'n') ||
599
(word.charAt(length - el - 1) == 'r')) {
600
return word.substring(0, length - el);
604
// Ending after 'c'.
if (word.charAt(length - el - 1) == 'c') {
605
return word.substring(0, length - el);
609
// Ending not after 's' or 'u'.
if ((word.charAt(length - el - 1) != 's') &&
610
(word.charAt(length - el - 1) != 'u')) {
611
return word.substring(0, length - el);
615
// Ending after 'l', 'i', or (stem longer than 2) 'e' with 'u' two
// characters before that 'e'.
if ((word.charAt(length - el - 1) == 'l') ||
616
(word.charAt(length - el - 1) == 'i') ||
617
((length - el > 2) &&
618
(word.charAt(length - el - 1) == 'e') &&
619
(word.charAt(length - el - 3) == 'u'))) {
620
return word.substring(0, length - el);
624
// Ending after "in".
if ((word.charAt(length - el - 1) == 'n') &&
625
(word.charAt(length - el - 2) == 'i')) {
626
return word.substring(0, length - el);
630
// Ending not after 'f'.
if (word.charAt(length - el - 1) != 'f') {
631
return word.substring(0, length - el);
635
// Ending after 'd', 'f', "ph", "th", 'l', "er", "or", "es" or 't'.
if ((word.charAt(length - el - 1) == 'd') ||
636
(word.charAt(length - el - 1) == 'f') ||
637
(((word.charAt(length - el - 1) == 'h') &&
638
(word.charAt(length - el - 2) == 'p'))) ||
639
(((word.charAt(length - el - 1) == 'h') &&
640
(word.charAt(length - el - 2) == 't'))) ||
641
(word.charAt(length - el - 1) == 'l') ||
642
(((word.charAt(length - el - 1) == 'r') &&
643
(word.charAt(length - el - 2) == 'e'))) ||
644
(((word.charAt(length - el - 1) == 'r') &&
645
(word.charAt(length - el - 2) == 'o'))) ||
646
(((word.charAt(length - el - 1) == 's') &&
647
(word.charAt(length - el - 2) == 'e'))) ||
648
(word.charAt(length - el - 1) == 't')) {
649
return word.substring(0, length - el);
654
// Stem of exactly 3 characters that is not "met", or stem longer than 3
// that does not end in "ryst".
// NOTE(review): this test and the next one look like two variants of the
// same rule; whatever selects between them is not visible - confirm.
if (((length - el == 3 ) &&
655
(!((word.charAt(length - el - 1) == 't') &&
656
(word.charAt(length - el - 2) == 'e') &&
657
(word.charAt(length - el - 3) == 'm')))) ||
658
((length - el > 3) &&
659
(!((word.charAt(length - el - 1) == 't') &&
660
(word.charAt(length - el - 2) == 's') &&
661
(word.charAt(length - el - 3) == 'y') &&
662
(word.charAt(length - el - 4) == 'r'))))) {
663
return word.substring(0, length - el);
666
// Stem longer than 2 that does not end in "met" and, when it has at
// least 4 characters, does not end in "ryst".
if ((length - el > 2) &&
667
(!((word.charAt(length - el - 1) == 't') &&
668
(word.charAt(length - el - 2) == 'e') &&
669
(word.charAt(length - el - 3) == 'm'))) &&
670
((length - el < 4) ||
671
(!((word.charAt(length - el - 1) == 't') &&
672
(word.charAt(length - el - 2) == 's') &&
673
(word.charAt(length - el - 3) == 'y') &&
674
(word.charAt(length - el - 4) == 'r'))))) {
675
return word.substring(0, length - el);
680
// Ending after 'l'.
if (word.charAt(length - el - 1) == 'l') {
681
return word.substring(0, length - el);
685
// Presumably the switch default for a code with no matching case - confirm.
throw new IllegalArgumentException("Fatal error.");
695
* Recodes ending of given word.
697
* @param word the word to work on
698
* @return the processed word
700
private String recodeEnding(String word) {
702
int lastPos = word.length() - 1;
705
if (word.endsWith("bb") ||
706
word.endsWith("dd") ||
707
word.endsWith("gg") ||
708
word.endsWith("ll") ||
709
word.endsWith("mm") ||
710
word.endsWith("nn") ||
711
word.endsWith("pp") ||
712
word.endsWith("rr") ||
713
word.endsWith("ss") ||
714
word.endsWith("tt")) {
715
word = word.substring(0, lastPos);
720
if (word.endsWith("iev")) {
721
word = word.substring(0, lastPos - 2).concat("ief");
725
if (word.endsWith("uct")) {
726
word = word.substring(0, lastPos - 2).concat("uc");
731
if (word.endsWith("umpt")) {
732
word = word.substring(0, lastPos - 3).concat("um");
737
if (word.endsWith("rpt")) {
738
word = word.substring(0, lastPos - 2).concat("rb");
743
if (word.endsWith("urs")) {
744
word = word.substring(0, lastPos - 2).concat("ur");
749
if (word.endsWith("istr")) {
750
word = word.substring(0, lastPos - 3).concat("ister");
755
if (word.endsWith("metr")) {
756
word = word.substring(0, lastPos - 3).concat("meter");
761
if (word.endsWith("olv")) {
762
word = word.substring(0, lastPos - 2).concat("olut");
767
if (word.endsWith("ul")) {
768
if ((lastPos - 2 < 0) ||
769
((word.charAt(lastPos - 2) != 'a') &&
770
(word.charAt(lastPos - 2) != 'i') &&
771
(word.charAt(lastPos - 2) != 'o'))) {
772
word = word.substring(0, lastPos - 1).concat("l");
778
if (word.endsWith("bex")) {
779
word = word.substring(0, lastPos - 2).concat("bic");
783
if (word.endsWith("dex")) {
784
word = word.substring(0, lastPos - 2).concat("dic");
788
if (word.endsWith("pex")) {
789
word = word.substring(0, lastPos - 2).concat("pic");
793
if (word.endsWith("tex")) {
794
word = word.substring(0, lastPos - 2).concat("tic");
798
if (word.endsWith("ax")) {
799
word = word.substring(0, lastPos - 1).concat("ac");
803
if (word.endsWith("ex")) {
804
word = word.substring(0, lastPos - 1).concat("ec");
808
if (word.endsWith("ix")) {
809
word = word.substring(0, lastPos - 1).concat("ic");
813
if (word.endsWith("lux")) {
814
word = word.substring(0, lastPos - 2).concat("luc");
818
if (word.endsWith("uad")) {
819
word = word.substring(0, lastPos - 2).concat("uas");
823
if (word.endsWith("vad")) {
824
word = word.substring(0, lastPos - 2).concat("vas");
828
if (word.endsWith("cid")) {
829
word = word.substring(0, lastPos - 2).concat("cis");
833
if (word.endsWith("lid")) {
834
word = word.substring(0, lastPos - 2).concat("lis");
838
if (word.endsWith("erid")) {
839
word = word.substring(0, lastPos - 3).concat("eris");
843
if (word.endsWith("pand")) {
844
word = word.substring(0, lastPos - 3).concat("pans");
848
if (word.endsWith("end")) {
849
if ((lastPos - 3 < 0) ||
850
(word.charAt(lastPos - 3) != 's')) {
851
word = word.substring(0, lastPos - 2).concat("ens");
856
if (word.endsWith("ond")) {
857
word = word.substring(0, lastPos - 2).concat("ons");
861
if (word.endsWith("lud")) {
862
word = word.substring(0, lastPos - 2).concat("lus");
866
if (word.endsWith("rud")) {
867
word = word.substring(0, lastPos - 2).concat("rus");
871
if (word.endsWith("her")) {
872
if ((lastPos - 3 < 0) ||
873
((word.charAt(lastPos - 3) != 'p') &&
874
(word.charAt(lastPos - 3) != 't'))) {
875
word = word.substring(0, lastPos - 2).concat("hes");
880
if (word.endsWith("mit")) {
881
word = word.substring(0, lastPos - 2).concat("mis");
885
if (word.endsWith("end")) {
886
if ((lastPos - 3 < 0) ||
887
(word.charAt(lastPos - 3) != 'm')) {
888
word = word.substring(0, lastPos - 2).concat("ens");
893
if (word.endsWith("ert")) {
894
word = word.substring(0, lastPos - 2).concat("ers");
898
if (word.endsWith("et")) {
899
if ((lastPos - 2 < 0) ||
900
(word.charAt(lastPos - 2) != 'n')) {
901
word = word.substring(0, lastPos - 1).concat("es");
906
if (word.endsWith("yt")) {
907
word = word.substring(0, lastPos - 1).concat("ys");
911
if (word.endsWith("yz")) {
912
word = word.substring(0, lastPos - 1).concat("ys");
919
* Returns the stemmed version of the given word.
920
* Word is converted to lower case before stemming.
922
* @param word a string consisting of a single word
923
* @return the stemmed word
925
public String stem(String word) {
927
if (word.length() > 2) {
928
return recodeEnding(removeEnding(word.toLowerCase()));
930
return word.toLowerCase();
935
* Stems everything in the given string. String
936
* is converted to lower case before stemming.
938
* @param str the string to stem
939
* @return the processed string
941
// Walks str character by character, passes substrings of it to stem() and
// appends the results to a buffer whose contents are returned.
// NOTE(review): `start` is read below, but its declaration, its updates and
// the bodies of several branches are not visible in this copy (the
// interleaved listing numbers skip 944, 948-950, 952-956 and 958-963) -
// confirm against upstream before changing this method.
public String stemString(String str) {
943
StringBuffer result = new StringBuffer();
945
for (int j = 0; j < str.length(); j++) {
946
char c = str.charAt(j);
947
// Letters and digits are handled separately from an apostrophe and from
// every other character; the branch bodies are not visible here.
if (Character.isLetterOrDigit(c)) {
951
} else if (c == '\'') {
957
// Stem the substring from `start` up to, but not including, position j.
result.append(stem(str.substring(start, j)));
964
// Stem the substring from `start` to the end of the input.
result.append(stem(str.substring(start, str.length())));
966
return result.toString();
970
* returns a string representation of the stemmer
972
* @return a string representation of the stemmer
974
public String toString() {
975
return getClass().getName();
979
* Runs the stemmer with the given options
981
* @param args the options
983
public static void main(String[] args) {
985
Stemming.useStemmer(new LovinsStemmer(), args);
987
catch (Exception e) {