2
* Licensed to the Apache Software Foundation (ASF) under one or more
3
* contributor license agreements. See the NOTICE file distributed with
4
* this work for additional information regarding copyright ownership.
5
* The ASF licenses this file to You under the Apache License, Version 2.0
6
* (the "License"); you may not use this file except in compliance with
7
* the License. You may obtain a copy of the License at
9
* http://www.apache.org/licenses/LICENSE-2.0
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
19
This file was partially derived from the
20
original CIIR University of Massachusetts Amherst version of KStemmer.java (license for
21
the original shown below)
26
Center for Intelligent Information Retrieval,
27
University of Massachusetts, Amherst.
30
Redistribution and use in source and binary forms, with or without modification,
31
are permitted provided that the following conditions are met:
33
1. Redistributions of source code must retain the above copyright notice, this
34
list of conditions and the following disclaimer.
36
2. Redistributions in binary form must reproduce the above copyright notice,
37
this list of conditions and the following disclaimer in the documentation
38
and/or other materials provided with the distribution.
40
3. The names "Center for Intelligent Information Retrieval" and
41
"University of Massachusetts" must not be used to endorse or promote products
42
derived from this software without prior written permission. To obtain
43
permission, contact info@ciir.cs.umass.edu.
45
THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS
46
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
47
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
49
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
50
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
51
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57
package org.apache.lucene.analysis.en;
59
import org.apache.lucene.analysis.CharArrayMap;
60
import org.apache.lucene.analysis.util.OpenStringBuilder;
62
* <p>Title: Kstemmer</p>
63
* <p>Description: This is a java version of Bob Krovetz' kstem stemmer</p>
64
* <p>Copyright: Copyright 2008, Luicid Imagination, Inc. </p>
65
* <p>Copyright: Copyright 2003, CIIR University of Massachusetts Amherst (http://ciir.cs.umass.edu) </p>
67
import org.apache.lucene.util.Version;
70
* This class implements the Kstem algorithm
72
public class KStemmer {
73
static private final int MaxWordLen = 50;
75
static private final String[] exceptionWords = {"aide", "bathe", "caste",
76
"cute", "dame", "dime", "doge", "done", "dune", "envelope", "gage",
77
"grille", "grippe", "lobe", "mane", "mare", "nape", "node", "pane",
78
"pate", "plane", "pope", "programme", "quite", "ripe", "rote", "rune",
79
"sage", "severe", "shoppe", "sine", "slime", "snipe", "steppe", "suite",
80
"swinge", "tare", "tine", "tope", "tripe", "twine"};
82
static private final String[][] directConflations = { {"aging", "age"},
83
{"going", "go"}, {"goes", "go"}, {"lying", "lie"}, {"using", "use"},
84
{"owing", "owe"}, {"suing", "sue"}, {"dying", "die"}, {"tying", "tie"},
85
{"vying", "vie"}, {"aged", "age"}, {"used", "use"}, {"vied", "vie"},
86
{"cued", "cue"}, {"died", "die"}, {"eyed", "eye"}, {"hued", "hue"},
87
{"iced", "ice"}, {"lied", "lie"}, {"owed", "owe"}, {"sued", "sue"},
88
{"toed", "toe"}, {"tied", "tie"}, {"does", "do"}, {"doing", "do"},
89
{"aeronautical", "aeronautics"}, {"mathematical", "mathematics"},
90
{"political", "politics"}, {"metaphysical", "metaphysics"},
91
{"cylindrical", "cylinder"}, {"nazism", "nazi"},
92
{"ambiguity", "ambiguous"}, {"barbarity", "barbarous"},
93
{"credulity", "credulous"}, {"generosity", "generous"},
94
{"spontaneity", "spontaneous"}, {"unanimity", "unanimous"},
95
{"voracity", "voracious"}, {"fled", "flee"}, {"miscarriage", "miscarry"}};
97
static private final String[][] countryNationality = {
98
{"afghan", "afghanistan"}, {"african", "africa"},
99
{"albanian", "albania"}, {"algerian", "algeria"},
100
{"american", "america"}, {"andorran", "andorra"}, {"angolan", "angola"},
101
{"arabian", "arabia"}, {"argentine", "argentina"},
102
{"armenian", "armenia"}, {"asian", "asia"}, {"australian", "australia"},
103
{"austrian", "austria"}, {"azerbaijani", "azerbaijan"},
104
{"azeri", "azerbaijan"}, {"bangladeshi", "bangladesh"},
105
{"belgian", "belgium"}, {"bermudan", "bermuda"}, {"bolivian", "bolivia"},
106
{"bosnian", "bosnia"}, {"botswanan", "botswana"},
107
{"brazilian", "brazil"}, {"british", "britain"},
108
{"bulgarian", "bulgaria"}, {"burmese", "burma"},
109
{"californian", "california"}, {"cambodian", "cambodia"},
110
{"canadian", "canada"}, {"chadian", "chad"}, {"chilean", "chile"},
111
{"chinese", "china"}, {"colombian", "colombia"}, {"croat", "croatia"},
112
{"croatian", "croatia"}, {"cuban", "cuba"}, {"cypriot", "cyprus"},
113
{"czechoslovakian", "czechoslovakia"}, {"danish", "denmark"},
114
{"egyptian", "egypt"}, {"equadorian", "equador"},
115
{"eritrean", "eritrea"}, {"estonian", "estonia"},
116
{"ethiopian", "ethiopia"}, {"european", "europe"}, {"fijian", "fiji"},
117
{"filipino", "philippines"}, {"finnish", "finland"},
118
{"french", "france"}, {"gambian", "gambia"}, {"georgian", "georgia"},
119
{"german", "germany"}, {"ghanian", "ghana"}, {"greek", "greece"},
120
{"grenadan", "grenada"}, {"guamian", "guam"},
121
{"guatemalan", "guatemala"}, {"guinean", "guinea"},
122
{"guyanan", "guyana"}, {"haitian", "haiti"}, {"hawaiian", "hawaii"},
123
{"holland", "dutch"}, {"honduran", "honduras"}, {"hungarian", "hungary"},
124
{"icelandic", "iceland"}, {"indonesian", "indonesia"},
125
{"iranian", "iran"}, {"iraqi", "iraq"}, {"iraqui", "iraq"},
126
{"irish", "ireland"}, {"israeli", "israel"},
127
{"italian", "italy"},
128
{"jamaican", "jamaica"},
129
{"japanese", "japan"},
130
{"jordanian", "jordan"},
131
{"kampuchean", "cambodia"},
134
{"kuwaiti", "kuwait"},
137
{"latvian", "latvia"},
138
{"lebanese", "lebanon"},
139
{"liberian", "liberia"},
141
{"lithuanian", "lithuania"},
142
{"macedonian", "macedonia"},
143
{"madagascan", "madagascar"},
144
{"malaysian", "malaysia"},
145
{"maltese", "malta"},
146
{"mauritanian", "mauritania"},
147
{"mexican", "mexico"},
148
{"micronesian", "micronesia"},
149
{"moldovan", "moldova"},
150
{"monacan", "monaco"},
151
{"mongolian", "mongolia"},
152
{"montenegran", "montenegro"},
153
{"moroccan", "morocco"},
154
{"myanmar", "burma"},
155
{"namibian", "namibia"},
156
{"nepalese", "nepal"},
157
// {"netherlands", "dutch"},
158
{"nicaraguan", "nicaragua"}, {"nigerian", "nigeria"},
159
{"norwegian", "norway"}, {"omani", "oman"}, {"pakistani", "pakistan"},
160
{"panamanian", "panama"}, {"papuan", "papua"},
161
{"paraguayan", "paraguay"}, {"peruvian", "peru"},
162
{"portuguese", "portugal"}, {"romanian", "romania"},
163
{"rumania", "romania"}, {"rumanian", "romania"}, {"russian", "russia"},
164
{"rwandan", "rwanda"}, {"samoan", "samoa"}, {"scottish", "scotland"},
165
{"serb", "serbia"}, {"serbian", "serbia"}, {"siam", "thailand"},
166
{"siamese", "thailand"}, {"slovakia", "slovak"}, {"slovakian", "slovak"},
167
{"slovenian", "slovenia"}, {"somali", "somalia"},
168
{"somalian", "somalia"}, {"spanish", "spain"}, {"swedish", "sweden"},
169
{"swiss", "switzerland"}, {"syrian", "syria"}, {"taiwanese", "taiwan"},
170
{"tanzanian", "tanzania"}, {"texan", "texas"}, {"thai", "thailand"},
171
{"tunisian", "tunisia"}, {"turkish", "turkey"}, {"ugandan", "uganda"},
172
{"ukrainian", "ukraine"}, {"uruguayan", "uruguay"},
173
{"uzbek", "uzbekistan"}, {"venezuelan", "venezuela"},
174
{"vietnamese", "viet"}, {"virginian", "virginia"}, {"yemeni", "yemen"},
175
{"yugoslav", "yugoslavia"}, {"yugoslavian", "yugoslavia"},
176
{"zambian", "zambia"}, {"zealander", "zealand"},
177
{"zimbabwean", "zimbabwe"}};
179
static private final String[] supplementDict = {"aids", "applicator",
180
"capacitor", "digitize", "electromagnet", "ellipsoid", "exosphere",
181
"extensible", "ferromagnet", "graphics", "hydromagnet", "polygraph",
182
"toroid", "superconduct", "backscatter", "connectionism"};
184
static private final String[] properNouns = {"abrams", "achilles",
185
"acropolis", "adams", "agnes", "aires", "alexander", "alexis", "alfred",
186
"algiers", "alps", "amadeus", "ames", "amos", "andes", "angeles",
187
"annapolis", "antilles", "aquarius", "archimedes", "arkansas", "asher",
188
"ashly", "athens", "atkins", "atlantis", "avis", "bahamas", "bangor",
189
"barbados", "barger", "bering", "brahms", "brandeis", "brussels",
190
"bruxelles", "cairns", "camoros", "camus", "carlos", "celts", "chalker",
191
"charles", "cheops", "ching", "christmas", "cocos", "collins",
192
"columbus", "confucius", "conners", "connolly", "copernicus", "cramer",
193
"cyclops", "cygnus", "cyprus", "dallas", "damascus", "daniels", "davies",
194
"davis", "decker", "denning", "dennis", "descartes", "dickens", "doris",
195
"douglas", "downs", "dreyfus", "dukakis", "dulles", "dumfries",
196
"ecclesiastes", "edwards", "emily", "erasmus", "euphrates", "evans",
197
"everglades", "fairbanks", "federales", "fisher", "fitzsimmons",
198
"fleming", "forbes", "fowler", "france", "francis", "goering",
199
"goodling", "goths", "grenadines", "guiness", "hades", "harding",
200
"harris", "hastings", "hawkes", "hawking", "hayes", "heights",
201
"hercules", "himalayas", "hippocrates", "hobbs", "holmes", "honduras",
202
"hopkins", "hughes", "humphreys", "illinois", "indianapolis",
203
"inverness", "iris", "iroquois", "irving", "isaacs", "italy", "james",
204
"jarvis", "jeffreys", "jesus", "jones", "josephus", "judas", "julius",
205
"kansas", "keynes", "kipling", "kiwanis", "lansing", "laos", "leeds",
206
"levis", "leviticus", "lewis", "louis", "maccabees", "madras",
207
"maimonides", "maldive", "massachusetts", "matthews", "mauritius",
208
"memphis", "mercedes", "midas", "mingus", "minneapolis", "mohammed",
209
"moines", "morris", "moses", "myers", "myknos", "nablus", "nanjing",
210
"nantes", "naples", "neal", "netherlands", "nevis", "nostradamus",
211
"oedipus", "olympus", "orleans", "orly", "papas", "paris", "parker",
212
"pauling", "peking", "pershing", "peter", "peters", "philippines",
213
"phineas", "pisces", "pryor", "pythagoras", "queens", "rabelais",
214
"ramses", "reynolds", "rhesus", "rhodes", "richards", "robins",
215
"rodgers", "rogers", "rubens", "sagittarius", "seychelles", "socrates",
216
"texas", "thames", "thomas", "tiberias", "tunis", "venus", "vilnius",
217
"wales", "warner", "wilkins", "williams", "wyoming", "xmas", "yonkers",
218
"zeus", "frances", "aarhus", "adonis", "andrews", "angus", "antares",
219
"aquinas", "arcturus", "ares", "artemis", "augustus", "ayers",
220
"barnabas", "barnes", "becker", "bejing", "biggs", "billings", "boeing",
221
"boris", "borroughs", "briggs", "buenos", "calais", "caracas", "cassius",
222
"cerberus", "ceres", "cervantes", "chantilly", "chartres", "chester",
223
"connally", "conner", "coors", "cummings", "curtis", "daedalus",
224
"dionysus", "dobbs", "dolores", "edmonds"};
226
static class DictEntry {
230
DictEntry(String root, boolean isException) {
232
this.exception = isException;
236
private static final CharArrayMap<DictEntry> dict_ht = initializeDictHash();
239
* caching off private int maxCacheSize; private CharArrayMap<String> cache =
240
* null; private static final String SAME = "SAME"; // use if stemmed form is
244
private final OpenStringBuilder word = new OpenStringBuilder();
245
private int j; /* index of final letter in stem (within word) */
247
* INDEX of final letter in word. You must add 1 to k to get
248
* the current length of word. When you want the length of
249
* word, use the method wordLength, which returns (k+1).
253
* private void initializeStemHash() { if (maxCacheSize > 0) cache = new
254
* CharArrayMap<String>(maxCacheSize,false); }
257
private char finalChar() {
258
return word.charAt(k);
261
private char penultChar() {
262
return word.charAt(k - 1);
265
private boolean isVowel(int index) {
266
return !isCons(index);
269
private boolean isCons(int index) {
272
ch = word.charAt(index);
274
if ((ch == 'a') || (ch == 'e') || (ch == 'i') || (ch == 'o') || (ch == 'u')) return false;
275
if ((ch != 'y') || (index == 0)) return true;
276
else return (!isCons(index - 1));
279
private static CharArrayMap<DictEntry> initializeDictHash() {
280
DictEntry defaultEntry;
283
CharArrayMap<DictEntry> d = new CharArrayMap<DictEntry>(
284
Version.LUCENE_31, 1000, false);
286
d = new CharArrayMap<DictEntry>(Version.LUCENE_31, 1000, false);
287
for (int i = 0; i < exceptionWords.length; i++) {
288
if (!d.containsKey(exceptionWords[i])) {
289
entry = new DictEntry(exceptionWords[i], true);
290
d.put(exceptionWords[i], entry);
292
System.out.println("Warning: Entry [" + exceptionWords[i]
293
+ "] already in dictionary 1");
297
for (int i = 0; i < directConflations.length; i++) {
298
if (!d.containsKey(directConflations[i][0])) {
299
entry = new DictEntry(directConflations[i][1], false);
300
d.put(directConflations[i][0], entry);
302
System.out.println("Warning: Entry [" + directConflations[i][0]
303
+ "] already in dictionary 2");
307
for (int i = 0; i < countryNationality.length; i++) {
308
if (!d.containsKey(countryNationality[i][0])) {
309
entry = new DictEntry(countryNationality[i][1], false);
310
d.put(countryNationality[i][0], entry);
312
System.out.println("Warning: Entry [" + countryNationality[i][0]
313
+ "] already in dictionary 3");
317
defaultEntry = new DictEntry(null, false);
320
array = KStemData1.data;
322
for (int i = 0; i < array.length; i++) {
323
if (!d.containsKey(array[i])) {
324
d.put(array[i], defaultEntry);
326
System.out.println("Warning: Entry [" + array[i]
327
+ "] already in dictionary 4");
331
array = KStemData2.data;
332
for (int i = 0; i < array.length; i++) {
333
if (!d.containsKey(array[i])) {
334
d.put(array[i], defaultEntry);
336
System.out.println("Warning: Entry [" + array[i]
337
+ "] already in dictionary 4");
341
array = KStemData3.data;
342
for (int i = 0; i < array.length; i++) {
343
if (!d.containsKey(array[i])) {
344
d.put(array[i], defaultEntry);
346
System.out.println("Warning: Entry [" + array[i]
347
+ "] already in dictionary 4");
351
array = KStemData4.data;
352
for (int i = 0; i < array.length; i++) {
353
if (!d.containsKey(array[i])) {
354
d.put(array[i], defaultEntry);
356
System.out.println("Warning: Entry [" + array[i]
357
+ "] already in dictionary 4");
361
array = KStemData5.data;
362
for (int i = 0; i < array.length; i++) {
363
if (!d.containsKey(array[i])) {
364
d.put(array[i], defaultEntry);
366
System.out.println("Warning: Entry [" + array[i]
367
+ "] already in dictionary 4");
371
array = KStemData6.data;
372
for (int i = 0; i < array.length; i++) {
373
if (!d.containsKey(array[i])) {
374
d.put(array[i], defaultEntry);
376
System.out.println("Warning: Entry [" + array[i]
377
+ "] already in dictionary 4");
381
array = KStemData7.data;
382
for (int i = 0; i < array.length; i++) {
383
if (!d.containsKey(array[i])) {
384
d.put(array[i], defaultEntry);
386
System.out.println("Warning: Entry [" + array[i]
387
+ "] already in dictionary 4");
391
for (int i = 0; i < KStemData8.data.length; i++) {
392
if (!d.containsKey(KStemData8.data[i])) {
393
d.put(KStemData8.data[i], defaultEntry);
395
System.out.println("Warning: Entry [" + KStemData8.data[i]
396
+ "] already in dictionary 4");
400
for (int i = 0; i < supplementDict.length; i++) {
401
if (!d.containsKey(supplementDict[i])) {
402
d.put(supplementDict[i], defaultEntry);
404
System.out.println("Warning: Entry [" + supplementDict[i]
405
+ "] already in dictionary 5");
409
for (int i = 0; i < properNouns.length; i++) {
410
if (!d.containsKey(properNouns[i])) {
411
d.put(properNouns[i], defaultEntry);
413
System.out.println("Warning: Entry [" + properNouns[i]
414
+ "] already in dictionary 6");
421
private boolean isAlpha(char ch) {
422
return ch >= 'a' && ch <= 'z'; // terms must be lowercased already
425
/* length of stem within word */
426
private int stemLength() {
430
private boolean endsIn(char[] s) {
431
if (s.length > k) return false;
433
int r = word.length() - s.length; /* length of word before this suffix */
435
for (int r1 = r, i = 0; i < s.length; i++, r1++) {
436
if (s[i] != word.charAt(r1)) return false;
438
j = r - 1; /* index of the character BEFORE the posfix */
442
private boolean endsIn(char a, char b) {
443
if (2 > k) return false;
444
// check left to right since the endings have often already matched
445
if (word.charAt(k - 1) == a && word.charAt(k) == b) {
452
private boolean endsIn(char a, char b, char c) {
453
if (3 > k) return false;
454
if (word.charAt(k - 2) == a && word.charAt(k - 1) == b
455
&& word.charAt(k) == c) {
462
private boolean endsIn(char a, char b, char c, char d) {
463
if (4 > k) return false;
464
if (word.charAt(k - 3) == a && word.charAt(k - 2) == b
465
&& word.charAt(k - 1) == c && word.charAt(k) == d) {
472
private DictEntry wordInDict() {
474
* if (matchedEntry != null) { if (dict_ht.get(word.getArray(), 0,
475
* word.size()) != matchedEntry) {
476
* System.out.println("Uh oh... cached entry doesn't match"); } return
479
if (matchedEntry != null) return matchedEntry;
480
DictEntry e = dict_ht.get(word.getArray(), 0, word.length());
481
if (e != null && !e.exception) {
482
matchedEntry = e; // only cache if it's not an exception.
484
// lookups.add(word.toString());
488
/* Convert plurals to singular form, and '-ies' to 'y' */
489
private void plural() {
490
if (word.charAt(k) == 's') {
491
if (endsIn('i', 'e', 's')) {
492
word.setLength(j + 3);
494
if (lookup()) /* ensure calories -> calorie */
497
word.unsafeWrite('s');
500
} else if (endsIn('e', 's')) {
501
/* try just removing the "s" */
502
word.setLength(j + 2);
506
* note: don't check for exceptions here. So, `aides' -> `aide', but
507
* `aided' -> `aid'. The exception for double s is used to prevent
508
* crosses -> crosse. This is actually correct if crosses is a plural
509
* noun (a type of racket used in lacrosse), but the verb is much more
514
* YCS: this was the one place where lookup was not followed by return.
515
* So restructure it. if ((j>0)&&(lookup(word.toString())) &&
516
* !((word.charAt(j) == 's') && (word.charAt(j-1) == 's'))) return;
519
&& !((word.charAt(j) == 's') && (word.charAt(j - 1) == 's'));
520
if (tryE && lookup()) return;
522
/* try removing the "es" */
524
word.setLength(j + 1);
526
if (lookup()) return;
528
/* the default is to retain the "e" */
529
word.unsafeWrite('e');
532
if (!tryE) lookup(); // if we didn't try the "e" ending before
535
if (word.length() > 3 && penultChar() != 's' && !endsIn('o', 'u', 's')) {
536
/* unless the word ends in "ous" or a double "s", remove the final "s" */
546
private void setSuffix(String s) {
547
setSuff(s, s.length());
550
/* replace old suffix with s */
551
private void setSuff(String s, int len) {
552
word.setLength(j + 1);
553
for (int l = 0; l < len; l++) {
554
word.unsafeWrite(s.charAt(l));
559
/* Returns true if the word is found in the dictionary */
560
// almost all uses of lookup() return immediately and are
561
// followed by another lookup in the dict. Store the match
562
// to avoid this double lookup.
563
DictEntry matchedEntry = null;
565
private boolean lookup() {
567
* debugging code String thisLookup = word.toString(); boolean added =
568
* lookups.add(thisLookup); if (!added) {
569
* System.out.println("######extra lookup:" + thisLookup); // occaasional
570
* extra lookups aren't necessarily errors... could happen by diff
571
* manipulations // throw new RuntimeException("######extra lookup:" +
572
* thisLookup); } else { // System.out.println("new lookup:" + thisLookup);
576
matchedEntry = dict_ht.get(word.getArray(), 0, word.size());
577
return matchedEntry != null;
580
// Set<String> lookups = new HashSet<String>();
582
/* convert past tense (-ed) to present, and `-ied' to `y' */
583
private void pastTense() {
585
* Handle words less than 5 letters with a direct mapping This prevents
588
if (word.length() <= 4) return;
590
if (endsIn('i', 'e', 'd')) {
591
word.setLength(j + 3);
593
if (lookup()) /* we almost always want to convert -ied to -y, but */
594
return; /* this isn't true for short words (died->die) */
595
k++; /* I don't know any long words that this applies to, */
596
word.unsafeWrite('d'); /* but just in case... */
602
/* the vowelInStem() is necessary so we don't stem acronyms */
603
if (endsIn('e', 'd') && vowelInStem()) {
604
/* see if the root ends in `e' */
605
word.setLength(j + 2);
608
DictEntry entry = wordInDict();
609
if (entry != null) if (!entry.exception) /*
610
* if it's in the dictionary and
615
/* try removing the "ed" */
616
word.setLength(j + 1);
618
if (lookup()) return;
621
* try removing a doubled consonant. if the root isn't found in the
622
* dictionary, the default is to leave it doubled. This will correctly
623
* capture `backfilled' -> `backfill' instead of `backfill' ->
624
* `backfille', and seems correct most of the time
630
if (lookup()) return;
631
word.unsafeWrite(word.charAt(k));
637
/* if we have a `un-' prefix, then leave the word alone */
638
/* (this will sometimes screw up with `under-', but we */
639
/* will take care of that later) */
641
if ((word.charAt(0) == 'u') && (word.charAt(1) == 'n')) {
642
word.unsafeWrite('e');
643
word.unsafeWrite('d');
650
* it wasn't found by just removing the `d' or the `ed', so prefer to end
651
* with an `e' (e.g., `microcoded' -> `microcode').
654
word.setLength(j + 1);
655
word.unsafeWrite('e');
657
// nolookup() - we already tried the "e" ending
662
/* return TRUE if word ends with a double consonant */
663
private boolean doubleC(int i) {
664
if (i < 1) return false;
666
if (word.charAt(i) != word.charAt(i - 1)) return false;
670
private boolean vowelInStem() {
671
for (int i = 0; i < stemLength(); i++) {
672
if (isVowel(i)) return true;
677
/* handle `-ing' endings */
678
private void aspect() {
680
* handle short words (aging -> age) via a direct mapping. This prevents
681
* (thing -> the) in the version of this routine that ignores inflectional
682
* variants that are mentioned in the dictionary (when the root is also
686
if (word.length() <= 5) return;
688
/* the vowelinstem() is necessary so we don't stem acronyms */
689
if (endsIn('i', 'n', 'g') && vowelInStem()) {
691
/* try adding an `e' to the stem and check against the dictionary */
692
word.setCharAt(j + 1, 'e');
693
word.setLength(j + 2);
696
DictEntry entry = wordInDict();
698
if (!entry.exception) /* if it's in the dictionary and not an exception */
702
/* adding on the `e' didn't work, so remove it */
704
k--; /* note that `ing' has also been removed */
706
if (lookup()) return;
708
/* if I can remove a doubled consonant and get a word, then do so */
711
word.setLength(k + 1);
712
if (lookup()) return;
713
word.unsafeWrite(word.charAt(k)); /* restore the doubled consonant */
715
/* the default is to leave the consonant doubled */
716
/* (e.g.,`fingerspelling' -> `fingerspell'). Unfortunately */
717
/* `bookselling' -> `booksell' and `mislabelling' -> `mislabell'). */
718
/* Without making the algorithm significantly more complicated, this */
719
/* is the best I can do */
726
* the word wasn't in the dictionary after removing the stem, and then
727
* checking with and without a final `e'. The default is to add an `e'
728
* unless the word ends in two consonants, so `microcoding' ->
729
* `microcode'. The two consonants restriction wouldn't normally be
730
* necessary, but is needed because we don't try to deal with prefixes and
731
* compounds, and most of the time it is correct (e.g., footstamping ->
732
* footstamp, not footstampe; however, decoupled -> decoupl). We can
733
* prevent almost all of the incorrect stems if we try to do some prefix
737
if ((j > 0) && isCons(j) && isCons(j - 1)) {
739
word.setLength(k + 1);
740
// nolookup() because we already did according to the comment
744
word.setLength(j + 1);
745
word.unsafeWrite('e');
747
// nolookup(); we already tried an 'e' ending
753
* this routine deals with -ity endings. It accepts -ability, -ibility, and
754
* -ality, even without checking the dictionary because they are so
755
* productive. The first two are mapped to -ble, and the -ity is remove for
758
private void ityEndings() {
761
if (endsIn('i', 't', 'y')) {
762
word.setLength(j + 1); /* try just removing -ity */
764
if (lookup()) return;
765
word.unsafeWrite('e'); /* try removing -ity and adding -e */
767
if (lookup()) return;
768
word.setCharAt(j + 1, 'i');
772
* the -ability and -ibility endings are highly productive, so just accept
775
if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'l')) {
776
word.setLength(j - 1);
777
word.append("le"); /* convert to -ble */
783
/* ditto for -ivity */
784
if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'v')) {
785
word.setLength(j + 1);
786
word.unsafeWrite('e'); /* convert to -ive */
791
/* ditto for -ality */
792
if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l')) {
793
word.setLength(j + 1);
800
* if the root isn't in the dictionary, and the variant *is* there, then
801
* use the variant. This allows `immunity'->`immune', but prevents
802
* `capacity'->`capac'. If neither the variant nor the root form are in
803
* the dictionary, then remove the ending as a default
806
if (lookup()) return;
808
/* the default is to remove -ity altogether */
809
word.setLength(j + 1);
811
// nolookup(), we already did it.
816
/* handle -ence and -ance */
817
private void nceEndings() {
821
if (endsIn('n', 'c', 'e')) {
822
word_char = word.charAt(j);
823
if (!((word_char == 'e') || (word_char == 'a'))) return;
825
word.unsafeWrite('e'); /* try converting -e/ance to -e (adherance/adhere) */
827
if (lookup()) return;
828
word.setLength(j); /*
829
* try removing -e/ance altogether
830
* (disappearance/disappear)
833
if (lookup()) return;
834
word.unsafeWrite(word_char); /* restore the original ending */
837
// nolookup() because we restored the original ending
843
private void nessEndings() {
844
if (endsIn('n', 'e', 's', 's')) { /*
845
* this is a very productive endings, so
848
word.setLength(j + 1);
850
if (word.charAt(j) == 'i') word.setCharAt(j, 'y');
857
private void ismEndings() {
858
if (endsIn('i', 's', 'm')) { /*
859
* this is a very productive ending, so just
862
word.setLength(j + 1);
869
/* this routine deals with -ment endings. */
870
private void mentEndings() {
873
if (endsIn('m', 'e', 'n', 't')) {
874
word.setLength(j + 1);
876
if (lookup()) return;
884
/* this routine deals with -ize endings. */
885
private void izeEndings() {
888
if (endsIn('i', 'z', 'e')) {
889
word.setLength(j + 1); /* try removing -ize entirely */
891
if (lookup()) return;
892
word.unsafeWrite('i');
894
if (doubleC(j)) { /* allow for a doubled consonant */
897
if (lookup()) return;
898
word.unsafeWrite(word.charAt(j - 1));
901
word.setLength(j + 1);
902
word.unsafeWrite('e'); /* try removing -ize and adding -e */
904
if (lookup()) return;
905
word.setLength(j + 1);
913
/* handle -ency and -ancy */
914
private void ncyEndings() {
915
if (endsIn('n', 'c', 'y')) {
916
if (!((word.charAt(j) == 'e') || (word.charAt(j) == 'a'))) return;
917
word.setCharAt(j + 2, 't'); /* try converting -ncy to -nt */
918
word.setLength(j + 3);
921
if (lookup()) return;
923
word.setCharAt(j + 2, 'c'); /* the default is to convert it to -nce */
924
word.unsafeWrite('e');
931
/* handle -able and -ible */
932
private void bleEndings() {
936
if (endsIn('b', 'l', 'e')) {
937
if (!((word.charAt(j) == 'a') || (word.charAt(j) == 'i'))) return;
938
word_char = word.charAt(j);
939
word.setLength(j); /* try just removing the ending */
941
if (lookup()) return;
942
if (doubleC(k)) { /* allow for a doubled consonant */
945
if (lookup()) return;
947
word.unsafeWrite(word.charAt(k - 1));
950
word.unsafeWrite('e'); /* try removing -a/ible and adding -e */
952
if (lookup()) return;
954
word.append("ate"); /* try removing -able and adding -ate */
955
/* (e.g., compensable/compensate) */
957
if (lookup()) return;
959
word.unsafeWrite(word_char); /* restore the original values */
968
* handle -ic endings. This is fairly straightforward, but this is also the
969
* only place we try *expanding* an ending, -ic -> -ical. This is to handle
970
* cases like `canonic' -> `canonical'
972
private void icEndings() {
973
if (endsIn('i', 'c')) {
974
word.setLength(j + 3);
975
word.append("al"); /* try converting -ic to -ical */
977
if (lookup()) return;
979
word.setCharAt(j + 1, 'y'); /* try converting -ic to -y */
980
word.setLength(j + 2);
982
if (lookup()) return;
984
word.setCharAt(j + 1, 'e'); /* try converting -ic to -e */
985
if (lookup()) return;
987
word.setLength(j + 1); /* try removing -ic altogether */
989
if (lookup()) return;
990
word.append("ic"); /* restore the original ending */
997
private static char[] ization = "ization".toCharArray();
998
private static char[] ition = "ition".toCharArray();
999
private static char[] ation = "ation".toCharArray();
1000
private static char[] ication = "ication".toCharArray();
1002
/* handle some derivational endings */
1004
* this routine deals with -ion, -ition, -ation, -ization, and -ication. The
1005
* -ization ending is always converted to -ize
1007
private void ionEndings() {
1009
if (!endsIn('i', 'o', 'n')) {
1013
if (endsIn(ization)) { /*
1014
* the -ize ending is very productive, so simply
1015
* accept it as the root
1017
word.setLength(j + 3);
1018
word.unsafeWrite('e');
1024
if (endsIn(ition)) {
1025
word.setLength(j + 1);
1026
word.unsafeWrite('e');
1029
* remove -ition and add `e', and check against the
1032
return; /* (e.g., definition->define, opposition->oppose) */
1034
/* restore original values */
1035
word.setLength(j + 1);
1036
word.append("ition");
1039
} else if (endsIn(ation)) {
1040
word.setLength(j + 3);
1041
word.unsafeWrite('e');
1043
if (lookup()) /* remove -ion and add `e', and check against the dictionary */
1044
return; /* (elmination -> eliminate) */
1046
word.setLength(j + 1);
1047
word.unsafeWrite('e'); /*
1048
* remove -ation and add `e', and check against the
1052
if (lookup()) return;
1054
word.setLength(j + 1);/*
1055
* just remove -ation (resignation->resign) and
1059
if (lookup()) return;
1061
/* restore original values */
1062
word.setLength(j + 1);
1063
word.append("ation");
1070
* test -ication after -ation is attempted (e.g., `complication->complicate'
1071
* rather than `complication->comply')
1074
if (endsIn(ication)) {
1075
word.setLength(j + 1);
1076
word.unsafeWrite('y');
1079
* remove -ication and add `y', and check against the
1082
return; /* (e.g., amplification -> amplify) */
1084
/* restore original values */
1085
word.setLength(j + 1);
1086
word.append("ication");
1091
// if (endsIn(ion)) {
1092
if (true) { // we checked for this earlier... just need to set "j"
1095
word.setLength(j + 1);
1096
word.unsafeWrite('e');
1098
if (lookup()) /* remove -ion and add `e', and check against the dictionary */
1101
word.setLength(j + 1);
1103
if (lookup()) /* remove -ion, and if it's found, treat that as the root */
1106
/* restore original values */
1107
word.setLength(j + 1);
1113
// nolookup(); all of the other paths restored original values
1118
* this routine deals with -er, -or, -ier, and -eer. The -izer ending is
1119
* always converted to -ize
1121
private void erAndOrEndings() {
1124
if (word.charAt(k) != 'r') return; // YCS
1126
char word_char; /* so we can remember if it was -er or -or */
1128
if (endsIn('i', 'z', 'e', 'r')) { /*
1129
* -ize is very productive, so accept it
1132
word.setLength(j + 4);
1138
if (endsIn('e', 'r') || endsIn('o', 'r')) {
1139
word_char = word.charAt(j + 1);
1143
if (lookup()) return;
1144
word.unsafeWrite(word.charAt(j - 1)); /* restore the doubled consonant */
1147
if (word.charAt(j) == 'i') { /* do we have a -ier ending? */
1148
word.setCharAt(j, 'y');
1149
word.setLength(j + 1);
1151
if (lookup()) /* yes, so check against the dictionary */
1153
word.setCharAt(j, 'i'); /* restore the endings */
1154
word.unsafeWrite('e');
1157
if (word.charAt(j) == 'e') { /* handle -eer */
1160
if (lookup()) return;
1161
word.unsafeWrite('e');
1164
word.setLength(j + 2); /* remove the -r ending */
1166
if (lookup()) return;
1167
word.setLength(j + 1); /* try removing -er/-or */
1169
if (lookup()) return;
1170
word.unsafeWrite('e'); /* try removing -or and adding -e */
1172
if (lookup()) return;
1173
word.setLength(j + 1);
1174
word.unsafeWrite(word_char);
1175
word.unsafeWrite('r'); /* restore the word to the way it was */
1183
* this routine deals with -ly endings. The -ally ending is always converted
1184
* to -al Sometimes this will temporarily leave us with a non-word (e.g.,
1185
* heuristically maps to heuristical), but then the -al is removed in the next
1188
private void lyEndings() {
1191
if (endsIn('l', 'y')) {
1193
word.setCharAt(j + 2, 'e'); /* try converting -ly to -le */
1195
if (lookup()) return;
1196
word.setCharAt(j + 2, 'y');
1198
word.setLength(j + 1); /* try just removing the -ly */
1201
if (lookup()) return;
1203
if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l')) /*
1216
if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'b')) { /*
1225
word.setCharAt(j + 2, 'e');
1230
if (word.charAt(j) == 'i') { /* e.g., militarily -> military */
1232
word.unsafeWrite('y');
1234
if (lookup()) return;
1240
word.setLength(j + 1); /* the default is to remove -ly */
1243
// nolookup()... we already tried removing the "ly" variant
1249
* this routine deals with -al endings. Some of the endings from the previous
1250
* routine are finished up here.
1252
private void alEndings() {
1255
if (word.length() < 4) return;
1256
if (endsIn('a', 'l')) {
1257
word.setLength(j + 1);
1259
if (lookup()) /* try just removing the -al */
1262
if (doubleC(j)) { /* allow for a doubled consonant */
1265
if (lookup()) return;
1266
word.unsafeWrite(word.charAt(j - 1));
1269
word.setLength(j + 1);
1270
word.unsafeWrite('e'); /* try removing the -al and adding -e */
1272
if (lookup()) return;
1274
word.setLength(j + 1);
1275
word.append("um"); /* try converting -al to -um */
1276
/* (e.g., optimal - > optimum ) */
1278
if (lookup()) return;
1280
word.setLength(j + 1);
1281
word.append("al"); /* restore the ending to the way it was */
1284
if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'c')) {
1285
word.setLength(j - 1); /* try removing -ical */
1287
if (lookup()) return;
1289
word.setLength(j - 1);
1290
word.unsafeWrite('y');/* try turning -ical to -y (e.g., bibliographical) */
1292
if (lookup()) return;
1294
word.setLength(j - 1);
1295
word.append("ic"); /* the default is to convert -ical to -ic */
1297
// nolookup() ... converting ical to ic means removing "al" which we
1304
if (word.charAt(j) == 'i') { /* sometimes -ial endings should be removed */
1305
word.setLength(j); /* (sometimes it gets turned into -y, but we */
1306
k = j - 1; /* aren't dealing with that case for now) */
1307
if (lookup()) return;
1318
* this routine deals with -ive endings. It normalizes some of the -ative
1319
* endings directly, and also maps some -ive endings to -ion.
1321
private void iveEndings() {
1324
if (endsIn('i', 'v', 'e')) {
1325
word.setLength(j + 1); /* try removing -ive entirely */
1327
if (lookup()) return;
1329
word.unsafeWrite('e'); /* try removing -ive and adding -e */
1331
if (lookup()) return;
1332
word.setLength(j + 1);
1334
if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 't')) {
1335
word.setCharAt(j - 1, 'e'); /* try removing -ative and adding -e */
1336
word.setLength(j); /* (e.g., determinative -> determine) */
1338
if (lookup()) return;
1339
word.setLength(j - 1); /* try just removing -ative */
1340
if (lookup()) return;
1342
word.append("ative");
1346
/* try mapping -ive to -ion (e.g., injunctive/injunction) */
1347
word.setCharAt(j + 2, 'o');
1348
word.setCharAt(j + 3, 'n');
1349
if (lookup()) return;
1351
word.setCharAt(j + 2, 'v'); /* restore the original values */
1352
word.setCharAt(j + 3, 'e');
1361
String stem(String term) {
1362
boolean changed = stem(term.toCharArray(), term.length());
1363
if (!changed) return term;
1368
* Returns the result of the stem (assuming the word was changed) as a String.
1371
String s = getString();
1372
if (s != null) return s;
1373
return word.toString();
1376
CharSequence asCharSequence() {
1377
return result != null ? result : word;
1380
String getString() {
1385
return word.getArray();
1389
return word.length();
1394
private boolean matched() {
1396
* if (!lookups.contains(word.toString())) { throw new
1397
* RuntimeException("didn't look up "+word.toString()+" prev="+prevLookup);
1401
return matchedEntry != null;
1405
* Stems the text in the token. Returns true if changed.
1407
boolean stem(char[] term, int len) {
1412
if ((k <= 1) || (k >= MaxWordLen - 1)) {
1413
return false; // don't stem
1416
// first check the stemmer dictionaries, and avoid using the
1417
// cache if it's in there.
1418
DictEntry entry = dict_ht.get(term, 0, len);
1419
if (entry != null) {
1420
if (entry.root != null) {
1421
result = entry.root;
1428
* caching off is normally faster if (cache == null) initializeStemHash();
1430
* // now check the cache, before we copy chars to "word" if (cache != null)
1431
* { String val = cache.get(term, 0, len); if (val != null) { if (val !=
1432
* SAME) { result = val; return true; } return false; } }
1436
// allocate enough space so that an expansion is never needed
1437
word.reserve(len + 10);
1438
for (int i = 0; i < len; i++) {
1440
if (!isAlpha(ch)) return false; // don't stem
1441
// don't lowercase... it's a requirement that lowercase filter be
1442
// used before this stemmer.
1443
word.unsafeWrite(ch);
1446
matchedEntry = null;
1448
* lookups.clear(); lookups.add(word.toString());
1452
* This while loop will never be executed more than one time; it is here
1453
* only to allow the break statement to be used to escape as soon as a word
1457
// YCS: extra lookup()s were inserted so we don't need to
1458
// do an extra wordInDict() here.
1460
if (matched()) break;
1462
if (matched()) break;
1464
if (matched()) break;
1466
if (matched()) break;
1468
if (matched()) break;
1470
if (matched()) break;
1472
if (matched()) break;
1474
if (matched()) break;
1476
if (matched()) break;
1477
entry = wordInDict();
1479
if (matched()) break;
1481
if (matched()) break;
1483
if (matched()) break;
1485
if (matched()) break;
1487
if (matched()) break;
1489
if (matched()) break;
1491
if (matched()) break;
1498
* try for a direct mapping (allows for cases like `Italian'->`Italy' and
1499
* `Italians'->`Italy')
1501
entry = matchedEntry;
1502
if (entry != null) {
1503
result = entry.root; // may be null, which means that "word" is the stem
1507
* caching off is normally faster if (cache != null && cache.size() <
1508
* maxCacheSize) { char[] key = new char[len]; System.arraycopy(term, 0,
1509
* key, 0, len); if (result != null) { cache.put(key, result); } else {
1510
* cache.put(key, word.toString()); } }
1514
* if (entry == null) { if (!word.toString().equals(new String(term,0,len)))
1515
* { System.out.println("CASE:" + word.toString() + "," + new
1516
* String(term,0,len));
1521
// no entry matched means result is "word"