1
package org.apache.lucene.analysis.ru;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
21
* Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
22
* @deprecated Use {@link org.tartarus.snowball.ext.RussianStemmer} instead,
23
* which has the same functionality. This filter will be removed in Lucene 4.0
28
// Positions of the RV and R2 regions within the word being stemmed.
// R1 is commented out of the declaration below, so only RV and R2 are stored.
29
private int RV, /*R1,*/ R2;
31
// Named char constants for the letters used by the ending tables below.
// The code points run from U+0430 to U+044F; each constant name is a Latin
// transliteration of the letter it holds.
// Letters that no table currently references are kept as commented-out
// declarations so the alphabet stays complete and in order.
// letters (currently unused letters are commented out)
32
private final static char A = '\u0430';
33
//private final static char B = '\u0431';
34
private final static char V = '\u0432';
35
private final static char G = '\u0433';
36
//private final static char D = '\u0434';
37
private final static char E = '\u0435';
38
//private final static char ZH = '\u0436';
39
//private final static char Z = '\u0437';
40
private final static char I = '\u0438';
41
private final static char I_ = '\u0439';
42
//private final static char K = '\u043A';
43
private final static char L = '\u043B';
44
private final static char M = '\u043C';
45
private final static char N = '\u043D';
46
private final static char O = '\u043E';
47
//private final static char P = '\u043F';
48
//private final static char R = '\u0440';
49
private final static char S = '\u0441';
50
private final static char T = '\u0442';
51
private final static char U = '\u0443';
52
//private final static char F = '\u0444';
53
private final static char X = '\u0445';
54
//private final static char TS = '\u0446';
55
//private final static char CH = '\u0447';
56
private final static char SH = '\u0448';
57
private final static char SHCH = '\u0449';
58
//private final static char HARD = '\u044A';
59
private final static char Y = '\u044B';
60
private final static char SOFT = '\u044C';
61
private final static char AE = '\u044D';
62
private final static char IU = '\u044E';
63
private final static char IA = '\u044F';
66
// The letters that isVowel(char) treats as vowels; it scans this array linearly.
// NOTE(review): the array is static but not final, so it could be reassigned.
private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
68
// Ending tables: each entry is one ending spelled as a char[] of the letter
// constants above. findEnding() walks a table from its last entry to its first.
// NOTE(review): most initializer bodies are not visible in this excerpt.

// Perfective gerund endings that are only removed when preceded by one of
// perfectiveGerund1Predessors (see perfectiveGerund()).
private static char[][] perfectiveGerundEndings1 = {
74
// Predecessors required in front of a perfectiveGerundEndings1 ending.
private static char[][] perfectiveGerund1Predessors = {
79
// Perfective gerund endings removed without a predecessor check.
private static char[][] perfectiveGerundEndings2 = { { I, V }, {
83
I, V, SH, I, S, SOFT }, {
84
Y, V, SH, I, S, SOFT }
87
// Adjective endings, used by adjectival().
private static char[][] adjectiveEndings = {
116
// Participle endings that require a predecessor from participle1Predessors.
private static char[][] participleEndings1 = {
124
// Participle endings removed without a predecessor check.
private static char[][] participleEndings2 = {
130
// Predecessors required in front of a participleEndings1 ending.
private static char[][] participle1Predessors = {
135
// Reflexive endings, used by reflexive().
private static char[][] reflexiveEndings = {
140
// Verb endings; presumably paired with verb1Predessors inside verb(), whose
// first call's arguments are not visible here - TODO confirm.
private static char[][] verbEndings1 = {
160
// Verb endings removed without a predecessor check (second call in verb()).
private static char[][] verbEndings2 = {
192
// Presumably the predecessors required in front of a verbEndings1 ending - TODO confirm.
private static char[][] verb1Predessors = {
197
// Noun endings, used by noun().
private static char[][] nounEndings = {
236
// Superlative endings, used by superlative().
private static char[][] superlativeEndings = {
241
// Derivational endings, used by derivational().
private static char[][] derivationalEndings = {
247
* Creates a new RussianStemmer.
249
// Public no-argument constructor. NOTE(review): its body is not visible in this excerpt.
public RussianStemmer()
255
* Adjectival ending is an adjective ending,
256
* optionally preceded by participle ending.
257
* Creation date: (17/03/2002 12:14:58 AM)
258
* @param stemmingZone java.lang.StringBuilder
260
// Strips an adjectival ending from stemmingZone in place: first an adjective
// ending, then (only if one was removed) an optional participle ending.
// NOTE(review): the return statements are not visible in this excerpt.
private boolean adjectival(StringBuilder stemmingZone)
262
// look for adjective ending in a stemming zone
263
if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
265
// if adjective ending was found, try for participle ending.
// Class-1 participle endings need a matching predecessor; class-2 endings
// are only tried when no class-1 ending was removed.
266
if (!findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors))
267
findAndRemoveEnding(stemmingZone, participleEndings2);
272
* Derivational endings
273
* Creation date: (17/03/2002 12:14:58 AM)
274
* @param stemmingZone java.lang.StringBuilder
276
// Looks for a derivational ending and truncates stemmingZone by its length,
// but only when the ending starts at or after the R2 position.
// NOTE(review): the return statements are not visible in this excerpt.
private boolean derivational(StringBuilder stemmingZone)
278
int endingLength = findEnding(stemmingZone, derivationalEndings);
279
if (endingLength == 0)
280
// no derivational ending found
284
// Ensure that the ending is located in R2.
// stem() builds stemmingZone from input.substring(RV), so (R2 - RV) is the
// R2 position expressed as an index into the zone; the ending starts at
// (length - endingLength).
285
if (R2 - RV <= stemmingZone.length() - endingLength)
287
stemmingZone.setLength(stemmingZone.length() - endingLength);
298
* Finds an ending from the given ending class and returns the length of the ending found (0 if none is found).
299
* Creation date: (17/03/2002 8:18:34 PM)
301
// Searches theEndingClass for an ending that matches the characters of
// stemmingZone ending at index startIndex (inclusive), and returns the length
// of the matching ending.
// NOTE(review): the mismatch handling and the not-found return are not
// visible in this excerpt.
private int findEnding(StringBuilder stemmingZone, int startIndex, char[][] theEndingClass)
303
boolean match = false;
304
// Candidates are tried from the last table entry down to the first.
for (int i = theEndingClass.length - 1; i >= 0; i--)
306
char[] theEnding = theEndingClass[i];
307
// check if the ending is bigger than stemming zone
// (startIndex + 1 characters are available up to and including startIndex)
308
if (startIndex < theEnding.length - 1)
314
int stemmingIndex = startIndex;
315
// Compare right to left: the ending's last char against charAt(startIndex), and so on.
for (int j = theEnding.length - 1; j >= 0; j--)
317
if (stemmingZone.charAt(stemmingIndex--) != theEnding[j])
323
// check if ending was found
326
return theEndingClass[i].length; // cut ending
332
// Convenience overload: matches endings against the very end of stemmingZone
// by delegating with startIndex set to the last character index.
private int findEnding(StringBuilder stemmingZone, char[][] theEndingClass)
334
return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
338
* Finds the ending among the given class of endings and removes it from stemming zone.
339
* Creation date: (17/03/2002 8:18:34 PM)
341
// Looks up an ending from theEndingClass at the end of stemmingZone and, when
// one is found, truncates stemmingZone by that ending's length.
// NOTE(review): the return statements are not visible in this excerpt.
private boolean findAndRemoveEnding(StringBuilder stemmingZone, char[][] theEndingClass)
343
int endingLength = findEnding(stemmingZone, theEndingClass);
344
// findEnding returns a length, so 0 is the "nothing matched" case.
if (endingLength == 0)
348
stemmingZone.setLength(stemmingZone.length() - endingLength);
349
// cut the ending found
355
* Finds the ending among the given class of endings, then checks if this ending was
356
* preceded by any of given predecessors, and if so, removes it from stemming zone.
357
* Creation date: (17/03/2002 8:18:34 PM)
359
// Variant with a predecessor check: finds an ending from theEndingClass, then
// looks for a predecessor immediately before that ending, and truncates
// stemmingZone by the ending's length.
// NOTE(review): the last argument of the predecessor lookup and the return
// statements are not visible in this excerpt.
private boolean findAndRemoveEnding(StringBuilder stemmingZone,
360
char[][] theEndingClass, char[][] thePredessors)
362
int endingLength = findEnding(stemmingZone, theEndingClass);
363
if (endingLength == 0)
368
int predessorLength =
369
findEnding(stemmingZone,
370
// index of the last character in front of the ending that was found
stemmingZone.length() - endingLength - 1,
372
if (predessorLength == 0)
375
// Only the ending is cut off; the length subtracted does not include predessorLength.
stemmingZone.setLength(stemmingZone.length() - endingLength);
376
// cut the ending found
384
* Marks positions of RV, R1 and R2 in a given word.
385
* Creation date: (16/03/2002 3:40:11 PM)
387
// Walks through word with a single cursor i, alternately skipping runs of
// non-vowels and vowels, and returns early as soon as a region would be empty.
// NOTE(review): the declaration of i and the assignments to RV and R2 are
// not visible in this excerpt.
private void markPositions(String word)
394
// skip leading non-vowels
while (word.length() > i && !isVowel(word.charAt(i)))
398
// ++i advances the cursor by one; if that moves it past the last index, nothing is left
if (word.length() - 1 < ++i)
399
return; // RV zone is empty
402
// skip a run of vowels
while (word.length() > i && isVowel(word.charAt(i)))
406
if (word.length() - 1 < ++i)
407
return; // R1 zone is empty
410
// skip a run of non-vowels
while (word.length() > i && !isVowel(word.charAt(i)))
414
if (word.length() - 1 < ++i)
415
return; // R2 zone is empty
416
// skip a run of vowels
while (word.length() > i && isVowel(word.charAt(i)))
420
if (word.length() - 1 < ++i)
421
return; // R2 zone is empty
426
* Checks if character is a vowel.
427
* Creation date: (16/03/2002 10:47:03 PM)
431
// Tests letter against every entry of the vowels array with a linear scan.
// NOTE(review): the return statements are not visible in this excerpt.
private boolean isVowel(char letter)
433
for (int i = 0; i < vowels.length; i++)
435
if (letter == vowels[i])
443
* Creation date: (17/03/2002 12:14:58 AM)
444
* @param stemmingZone java.lang.StringBuilder
446
// Removes a noun ending from the end of stemmingZone, if one matches;
// returns the result of findAndRemoveEnding.
private boolean noun(StringBuilder stemmingZone)
448
return findAndRemoveEnding(stemmingZone, nounEndings);
452
* Perfective gerund endings.
453
* Creation date: (17/03/2002 12:14:58 AM)
454
* @param stemmingZone java.lang.StringBuilder
456
// Removes a perfective gerund ending. Class-1 endings are tried first with
// their predecessor table; class-2 endings are only tried when the first
// call returns false, because || short-circuits.
// NOTE(review): the first argument of the first call is not visible in this excerpt.
private boolean perfectiveGerund(StringBuilder stemmingZone)
458
return findAndRemoveEnding(
460
perfectiveGerundEndings1,
461
perfectiveGerund1Predessors)
462
|| findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
467
* Creation date: (17/03/2002 12:14:58 AM)
468
* @param stemmingZone java.lang.StringBuilder
470
// Removes a reflexive ending from the end of stemmingZone, if one matches;
// returns the result of findAndRemoveEnding.
private boolean reflexive(StringBuilder stemmingZone)
472
return findAndRemoveEnding(stemmingZone, reflexiveEndings);
476
* Removes a trailing {@code I} letter from the stemming zone, if there is one.
477
* Creation date: (17/03/2002 12:14:58 AM)
478
* @param stemmingZone java.lang.StringBuilder
480
// Drops the last character of stemmingZone when that character is the letter I.
// NOTE(review): the return statements are not visible in this excerpt.
private boolean removeI(StringBuilder stemmingZone)
482
// the length check keeps charAt(length - 1) from being called on an empty zone
if (stemmingZone.length() > 0
483
&& stemmingZone.charAt(stemmingZone.length() - 1) == I)
485
stemmingZone.setLength(stemmingZone.length() - 1);
495
* Removes a trailing soft sign ({@code SOFT}) from the stemming zone, if there is one.
496
* Creation date: (17/03/2002 12:14:58 AM)
497
* @param stemmingZone java.lang.StringBuilder
499
// Drops the last character of stemmingZone when that character is SOFT.
// NOTE(review): the return statements are not visible in this excerpt.
private boolean removeSoft(StringBuilder stemmingZone)
501
// the length check keeps charAt(length - 1) from being called on an empty zone
if (stemmingZone.length() > 0
502
&& stemmingZone.charAt(stemmingZone.length() - 1) == SOFT)
504
stemmingZone.setLength(stemmingZone.length() - 1);
514
* Finds the stem for given Russian word.
515
* Creation date: (16/03/2002 3:36:48 PM)
516
* @return java.lang.String
517
* @param input java.lang.String
519
// Entry point: stems a single Russian word.
// It marks the region positions, copies the part of input from RV onward into
// a mutable stemming zone, applies the ending-removal steps to that zone, and
// returns the untouched prefix input.substring(0, RV) plus what is left.
// NOTE(review): the condition guarding the early return and part of the
// branch structure are not visible in this excerpt.
public String stem(String input)
521
// sets the RV and R2 fields for this word
markPositions(input);
523
return input; //RV wasn't detected, nothing to stem
524
StringBuilder stemmingZone = new StringBuilder(input.substring(RV));
525
// stemming goes on in RV
528
// the steps below are attempted when no perfective gerund ending was removed
if (!perfectiveGerund(stemmingZone))
530
reflexive(stemmingZone);
531
// adjectival endings are tried before verb endings
if (!adjectival(stemmingZone))
532
if (!verb(stemmingZone))
536
removeI(stemmingZone);
538
derivational(stemmingZone);
540
superlative(stemmingZone);
541
undoubleN(stemmingZone);
542
removeSoft(stemmingZone);
544
// re-attach the prefix that lies in front of RV and was never modified
return input.substring(0, RV) + stemmingZone.toString();
548
* Superlative endings.
549
* Creation date: (17/03/2002 12:14:58 AM)
550
* @param stemmingZone java.lang.StringBuilder
552
// Removes a superlative ending from the end of stemmingZone, if one matches;
// returns the result of findAndRemoveEnding.
private boolean superlative(StringBuilder stemmingZone)
554
return findAndRemoveEnding(stemmingZone, superlativeEndings);
559
* Creation date: (17/03/2002 12:14:58 AM)
560
* @param stemmingZone java.lang.StringBuilder
562
// Shortens stemmingZone by exactly one character when it ends with an entry
// of doubleN; the full matched length is deliberately not removed.
// NOTE(review): the declaration of doubleN and the return statements are not
// visible in this excerpt; presumably doubleN holds a doubled N - TODO confirm.
private boolean undoubleN(StringBuilder stemmingZone)
567
if (findEnding(stemmingZone, doubleN) != 0)
569
stemmingZone.setLength(stemmingZone.length() - 1);
580
* Creation date: (17/03/2002 12:14:58 AM)
581
* @param stemmingZone java.lang.StringBuilder
583
// Removes a verb ending. A first findAndRemoveEnding call is tried, and the
// verbEndings2 table is only tried when that call returns false, because ||
// short-circuits.
// NOTE(review): the arguments of the first call are not visible in this
// excerpt; presumably verbEndings1 with verb1Predessors - TODO confirm.
private boolean verb(StringBuilder stemmingZone)
585
return findAndRemoveEnding(
589
|| findAndRemoveEnding(stemmingZone, verbEndings2);
593
* Static method for stemming.
595
// Static convenience wrapper: creates a fresh RussianStemmer for each call
// and returns the result of its stem(theWord).
public static String stemWord(String theWord)
597
RussianStemmer stemmer = new RussianStemmer();
598
return stemmer.stem(theWord);