~ubuntu-branches/ubuntu/trusty/pylucene/trusty

« back to all changes in this revision

Viewing changes to lucene-java-2.3.1/contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java

  • Committer: Package Import Robot
  • Author(s): Dmitry Nezhevenko
  • Date: 2012-04-23 16:43:55 UTC
  • mfrom: (1.1.1)
  • Revision ID: package-import@ubuntu.com-20120423164355-grqtepnwtecdjfk2
Tags: 3.5.0-1
* New maintainer (closes: 670179)
* New upstream release
* Switch to dpkg-source 3.0 (quilt) format
* Switch to machine-readable debian/copyright
* Bump debian/compat to 8, drop debian/pycompat
* Switch from cdbs to dh
* Add watch file
* Build for all supported versions of python2 (closes: 581198, 632240)
* Rename binary package to python-lucene (closes: 581197)
* Add -dbg package

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
package org.apache.lucene.analysis.fr;
2
 
 
3
 
/**
4
 
 * Licensed to the Apache Software Foundation (ASF) under one or more
5
 
 * contributor license agreements.  See the NOTICE file distributed with
6
 
 * this work for additional information regarding copyright ownership.
7
 
 * The ASF licenses this file to You under the Apache License, Version 2.0
8
 
 * (the "License"); you may not use this file except in compliance with
9
 
 * the License.  You may obtain a copy of the License at
10
 
 *
11
 
 *     http://www.apache.org/licenses/LICENSE-2.0
12
 
 *
13
 
 * Unless required by applicable law or agreed to in writing, software
14
 
 * distributed under the License is distributed on an "AS IS" BASIS,
15
 
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
 
 * See the License for the specific language governing permissions and
17
 
 * limitations under the License.
18
 
 */
19
 
 
20
 
/**
21
 
 * A stemmer for French words. The algorithm is based on the work of
22
 
 * Dr Martin Porter on his snowball project<br>
23
 
 * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
24
 
 * (French stemming algorithm) for details
25
 
 *
26
 
 * @author    Patrick Talbot
27
 
 */
28
 
 
29
 
public class FrenchStemmer {
30
 
 
31
 
    /**
32
 
     * Buffer for the terms while stemming them.
33
 
     */
34
 
    private StringBuffer sb = new StringBuffer();
35
 
 
36
 
    /**
37
 
     * A temporary buffer, used to reconstruct R2
38
 
     */
39
 
     private StringBuffer tb = new StringBuffer();
40
 
 
41
 
        /**
42
 
         * Region R0 is equal to the whole buffer
43
 
         */
44
 
        private String R0;
45
 
 
46
 
        /**
47
 
         * Region RV
48
 
         * "If the word begins with two vowels, RV is the region after the third letter,
49
 
         * otherwise the region after the first vowel not at the beginning of the word,
50
 
         * or the end of the word if these positions cannot be found."
51
 
         */
52
 
    private String RV;
53
 
 
54
 
        /**
55
 
         * Region R1
56
 
         * "R1 is the region after the first non-vowel following a vowel
57
 
         * or is the null region at the end of the word if there is no such non-vowel"
58
 
         */
59
 
    private String R1;
60
 
 
61
 
        /**
62
 
         * Region R2
63
 
         * "R2 is the region after the first non-vowel in R1 following a vowel
64
 
         * or is the null region at the end of the word if there is no such non-vowel"
65
 
         */
66
 
    private String R2;
67
 
 
68
 
 
69
 
        /**
70
 
         * Set to true if we need to perform step 2
71
 
         */
72
 
    private boolean suite;
73
 
 
74
 
        /**
75
 
         * Set to true if the buffer was modified
76
 
         */
77
 
    private boolean modified;
78
 
 
79
 
 
80
 
    /**
81
 
     * Stems the given term to a unique <tt>discriminator</tt>.
82
 
     *
83
 
     * @param term  java.lang.String The term that should be stemmed
84
 
     * @return java.lang.String  Discriminator for <tt>term</tt>
85
 
     */
86
 
    protected String stem( String term ) {
87
 
                if ( !isStemmable( term ) ) {
88
 
                        return term;
89
 
                }
90
 
 
91
 
                // Use lowercase for medium stemming.
92
 
                term = term.toLowerCase();
93
 
 
94
 
                // Reset the StringBuffer.
95
 
                sb.delete( 0, sb.length() );
96
 
                sb.insert( 0, term );
97
 
 
98
 
                // reset the booleans
99
 
                modified = false;
100
 
                suite = false;
101
 
 
102
 
                sb = treatVowels( sb );
103
 
 
104
 
                setStrings();
105
 
 
106
 
                step1();
107
 
 
108
 
                if (!modified || suite)
109
 
                {
110
 
                        if (RV != null)
111
 
                        {
112
 
                                suite = step2a();
113
 
                                if (!suite)
114
 
                                        step2b();
115
 
                        }
116
 
                }
117
 
 
118
 
                if (modified || suite)
119
 
                        step3();
120
 
                else
121
 
                        step4();
122
 
 
123
 
                step5();
124
 
 
125
 
                step6();
126
 
 
127
 
                return sb.toString();
128
 
    }
129
 
 
130
 
        /**
131
 
         * Sets the search region Strings<br>
132
 
         * it needs to be done each time the buffer was modified
133
 
         */
134
 
        private void setStrings() {
135
 
                // set the strings
136
 
                R0 = sb.toString();
137
 
                RV = retrieveRV( sb );
138
 
                R1 = retrieveR( sb );
139
 
                if ( R1 != null )
140
 
                {
141
 
                        tb.delete( 0, tb.length() );
142
 
                        tb.insert( 0, R1 );
143
 
                        R2 = retrieveR( tb );
144
 
                }
145
 
                else
146
 
                        R2 = null;
147
 
        }
148
 
 
149
 
        /**
150
 
         * First step of the Porter Algorithm<br>
151
 
         * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
152
 
         */
153
 
        private void step1( ) {
154
 
                String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
155
 
                deleteFrom( R2, suffix );
156
 
 
157
 
                replaceFrom( R2, new String[] { "logies", "logie" }, "log" );
158
 
                replaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" );
159
 
                replaceFrom( R2, new String[] { "ences", "ence" }, "ent" );
160
 
 
161
 
                String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"};
162
 
                deleteButSuffixFromElseReplace( R2, search, "ic",  true, R0, "iqU" );
163
 
 
164
 
                deleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" );
165
 
                deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false );
166
 
                deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false );
167
 
                deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false );
168
 
                deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false );
169
 
 
170
 
                deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 );
171
 
                deleteFrom( RV, new String[] { "ements", "ement" } );
172
 
 
173
 
                deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0, "abl" );
174
 
                deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU" );
175
 
                deleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true );
176
 
 
177
 
                String[] autre = { "ifs", "ives", "if", "ive" };
178
 
                deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" );
179
 
                deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" );
180
 
 
181
 
                replaceFrom( R0, new String[] { "eaux" }, "eau" );
182
 
 
183
 
                replaceFrom( R1, new String[] { "aux" }, "al" );
184
 
 
185
 
                deleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" );
186
 
 
187
 
                deleteFrom( R2, new String[] { "eux" } );
188
 
 
189
 
                // if one of the next steps is performed, we will need to perform step2a
190
 
                boolean temp = false;
191
 
                temp = replaceFrom( RV, new String[] { "amment" }, "ant" );
192
 
                if (temp == true)
193
 
                        suite = true;
194
 
                temp = replaceFrom( RV, new String[] { "emment" }, "ent" );
195
 
                if (temp == true)
196
 
                        suite = true;
197
 
                temp = deleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV );
198
 
                if (temp == true)
199
 
                        suite = true;
200
 
 
201
 
        }
202
 
 
203
 
        /**
204
 
         * Second step (A) of the Porter Algorithm<br>
205
 
         * Will be performed if nothing changed from the first step
206
 
         * or changes were made to the amment, emment, ments or ment suffixes<br>
207
 
         * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
208
 
         *
209
 
         * @return boolean - true if something changed in the StringBuffer
210
 
         */
211
 
        private boolean step2a() {
212
 
                String[] search = { "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira",
213
 
                                                        "irent", "iriez", "irez", "irions", "irons", "iront",
214
 
                                                        "issaIent", "issais", "issantes", "issante", "issants", "issant",
215
 
                                                        "issait", "issais", "issions", "issons", "issiez", "issez", "issent",
216
 
                                                        "isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" };
217
 
                return deleteFromIfTestVowelBeforeIn( RV, search, false, RV );
218
 
        }
219
 
 
220
 
        /**
221
 
         * Second step (B) of the Porter Algorithm<br>
222
 
         * Will be performed if step 2 A was performed unsuccessfully<br>
223
 
         * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
224
 
         */
225
 
        private void step2b() {
226
 
                String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
227
 
                                                        "erons", "eront","erez", "èrent", "era", "ées", "iez",
228
 
                                                        "ée", "és", "er", "ez", "é" };
229
 
                deleteFrom( RV, suffix );
230
 
 
231
 
                String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent",
232
 
                                                        "antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant",
233
 
                                                        "ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" };
234
 
                deleteButSuffixFrom( RV, search, "e", true );
235
 
 
236
 
                deleteFrom( R2, new String[] { "ions" } );
237
 
        }
238
 
 
239
 
        /**
240
 
         * Third step of the Porter Algorithm<br>
241
 
         * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
242
 
         */
243
 
        private void step3() {
244
 
                if (sb.length()>0)
245
 
                {
246
 
                        char ch = sb.charAt( sb.length()-1 );
247
 
                        if (ch == 'Y')
248
 
                        {
249
 
                                sb.setCharAt( sb.length()-1, 'i' );
250
 
                                setStrings();
251
 
                        }
252
 
                        else if (ch == 'ç')
253
 
                        {
254
 
                                sb.setCharAt( sb.length()-1, 'c' );
255
 
                                setStrings();
256
 
                        }
257
 
                }
258
 
        }
259
 
 
260
 
        /**
261
 
         * Fourth step of the Porter Algorithm<br>
262
 
         * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
263
 
         */
264
 
        private void step4() {
265
 
                if (sb.length() > 1)
266
 
                {
267
 
                        char ch = sb.charAt( sb.length()-1 );
268
 
                        if (ch == 's')
269
 
                        {
270
 
                                char b = sb.charAt( sb.length()-2 );
271
 
                                if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's')
272
 
                                {
273
 
                                        sb.delete( sb.length() - 1, sb.length());
274
 
                                        setStrings();
275
 
                                }
276
 
                        }
277
 
                }
278
 
                boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" );
279
 
                if (!found)
280
 
                found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" );
281
 
 
282
 
                replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" );
283
 
                deleteFrom( RV, new String[] { "e" } );
284
 
                deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" );
285
 
        }
286
 
 
287
 
        /**
288
 
         * Fifth step of the Porter Algorithm<br>
289
 
         * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
290
 
         */
291
 
        private void step5() {
292
 
                if (R0 != null)
293
 
                {
294
 
                        if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill"))
295
 
                        {
296
 
                                sb.delete( sb.length() - 1, sb.length() );
297
 
                                setStrings();
298
 
                        }
299
 
                }
300
 
        }
301
 
 
302
 
        /**
303
 
         * Sixth (and last!) step of the Porter Algorithm<br>
304
 
         * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
305
 
         */
306
 
        private void step6() {
307
 
                if (R0!=null && R0.length()>0)
308
 
                {
309
 
                        boolean seenVowel = false;
310
 
                        boolean seenConson = false;
311
 
                        int pos = -1;
312
 
                        for (int i = R0.length()-1; i > -1; i--)
313
 
                        {
314
 
                                char ch = R0.charAt(i);
315
 
                                if (isVowel(ch))
316
 
                                {
317
 
                                        if (!seenVowel)
318
 
                                        {
319
 
                                                if (ch == 'é' || ch == 'è')
320
 
                                                {
321
 
                                                        pos = i;
322
 
                                                        break;
323
 
                                                }
324
 
                                        }
325
 
                                        seenVowel = true;
326
 
                                }
327
 
                                else
328
 
                                {
329
 
                                        if (seenVowel)
330
 
                                                break;
331
 
                                        else
332
 
                                                seenConson = true;
333
 
                                }
334
 
                        }
335
 
                        if (pos > -1 && seenConson && !seenVowel)
336
 
                                sb.setCharAt(pos, 'e');
337
 
                }
338
 
        }
339
 
 
340
 
        /**
341
 
         * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
342
 
         *
343
 
         * @param source java.lang.String - the primary source zone for search
344
 
         * @param search java.lang.String[] - the strings to search for suppression
345
 
         * @param from java.lang.String - the secondary source zone for search
346
 
         * @param prefix java.lang.String - the prefix to add to the search string to test
347
 
         * @return boolean - true if modified
348
 
         */
349
 
        private boolean deleteFromIfPrecededIn( String source, String[] search, String from, String prefix ) {
350
 
                boolean found = false;
351
 
                if (source!=null )
352
 
                {
353
 
                        for (int i = 0; i < search.length; i++) {
354
 
                                if ( source.endsWith( search[i] ))
355
 
                                {
356
 
                                        if (from!=null && from.endsWith( prefix + search[i] ))
357
 
                                        {
358
 
                                                sb.delete( sb.length() - search[i].length(), sb.length());
359
 
                                                found = true;
360
 
                                                setStrings();
361
 
                                                break;
362
 
                                        }
363
 
                                }
364
 
                        }
365
 
                }
366
 
                return found;
367
 
        }
368
 
 
369
 
        /**
370
 
         * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
371
 
         *
372
 
         * @param source java.lang.String - the primary source zone for search
373
 
         * @param search java.lang.String[] - the strings to search for suppression
374
 
         * @param vowel boolean - true if we need a vowel before the search string
375
 
         * @param from java.lang.String - the secondary source zone for search (where vowel could be)
376
 
         * @return boolean - true if modified
377
 
         */
378
 
        private boolean deleteFromIfTestVowelBeforeIn( String source, String[] search, boolean vowel, String from ) {
379
 
                boolean found = false;
380
 
                if (source!=null && from!=null)
381
 
                {
382
 
                        for (int i = 0; i < search.length; i++) {
383
 
                                if ( source.endsWith( search[i] ))
384
 
                                {
385
 
                                        if ((search[i].length() + 1) <= from.length())
386
 
                                        {
387
 
                                                boolean test = isVowel(sb.charAt(sb.length()-(search[i].length()+1)));
388
 
                                                if (test == vowel)
389
 
                                                {
390
 
                                                        sb.delete( sb.length() - search[i].length(), sb.length());
391
 
                                                        modified = true;
392
 
                                                        found = true;
393
 
                                                        setStrings();
394
 
                                                        break;
395
 
                                                }
396
 
                                        }
397
 
                                }
398
 
                        }
399
 
                }
400
 
                return found;
401
 
        }
402
 
 
403
 
        /**
404
 
         * Delete a suffix searched in zone "source" if preceded by the prefix
405
 
         *
406
 
         * @param source java.lang.String - the primary source zone for search
407
 
         * @param search java.lang.String[] - the strings to search for suppression
408
 
         * @param prefix java.lang.String - the prefix to add to the search string to test
409
 
         * @param without boolean - true if it will be deleted even without prefix found
410
 
         */
411
 
        private void deleteButSuffixFrom( String source, String[] search, String prefix, boolean without ) {
412
 
                if (source!=null)
413
 
                {
414
 
                        for (int i = 0; i < search.length; i++) {
415
 
                                if ( source.endsWith( prefix + search[i] ))
416
 
                                {
417
 
                                        sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
418
 
                                        modified = true;
419
 
                                        setStrings();
420
 
                                        break;
421
 
                                }
422
 
                                else if ( without && source.endsWith( search[i] ))
423
 
                                {
424
 
                                        sb.delete( sb.length() - search[i].length(), sb.length() );
425
 
                                        modified = true;
426
 
                                        setStrings();
427
 
                                        break;
428
 
                                }
429
 
                        }
430
 
                }
431
 
        }
432
 
 
433
 
        /**
434
 
         * Delete a suffix searched in zone "source" if preceded by prefix<br>
435
 
         * or replace it with the replace string if preceded by the prefix in the zone "from"<br>
436
 
         * or delete the suffix if specified
437
 
         *
438
 
         * @param source java.lang.String - the primary source zone for search
439
 
         * @param search java.lang.String[] - the strings to search for suppression
440
 
         * @param prefix java.lang.String - the prefix to add to the search string to test
441
 
         * @param without boolean - true if it will be deleted even without prefix found
442
 
         */
443
 
        private void deleteButSuffixFromElseReplace( String source, String[] search, String prefix, boolean without, String from, String replace ) {
444
 
                if (source!=null)
445
 
                {
446
 
                        for (int i = 0; i < search.length; i++) {
447
 
                                if ( source.endsWith( prefix + search[i] ))
448
 
                                {
449
 
                                        sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
450
 
                                        modified = true;
451
 
                                        setStrings();
452
 
                                        break;
453
 
                                }
454
 
                                else if ( from!=null && from.endsWith( prefix + search[i] ))
455
 
                                {
456
 
                                        sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace );
457
 
                                        modified = true;
458
 
                                        setStrings();
459
 
                                        break;
460
 
                                }
461
 
                                else if ( without && source.endsWith( search[i] ))
462
 
                                {
463
 
                                        sb.delete( sb.length() - search[i].length(), sb.length() );
464
 
                                        modified = true;
465
 
                                        setStrings();
466
 
                                        break;
467
 
                                }
468
 
                        }
469
 
                }
470
 
        }
471
 
 
472
 
        /**
473
 
         * Replace a search string with another within the source zone
474
 
         *
475
 
         * @param source java.lang.String - the source zone for search
476
 
         * @param search java.lang.String[] - the strings to search for replacement
477
 
         * @param replace java.lang.String - the replacement string
478
 
         */
479
 
        private boolean replaceFrom( String source, String[] search, String replace ) {
480
 
                boolean found = false;
481
 
                if (source!=null)
482
 
                {
483
 
                        for (int i = 0; i < search.length; i++) {
484
 
                                if ( source.endsWith( search[i] ))
485
 
                                {
486
 
                                        sb.replace( sb.length() - search[i].length(), sb.length(), replace );
487
 
                                        modified = true;
488
 
                                        found = true;
489
 
                                        setStrings();
490
 
                                        break;
491
 
                                }
492
 
                        }
493
 
                }
494
 
                return found;
495
 
        }
496
 
 
497
 
        /**
498
 
         * Delete a search string within the source zone
499
 
         *
500
 
         * @param source the source zone for search
501
 
         * @param suffix the strings to search for suppression
502
 
         */
503
 
        private void deleteFrom(String source, String[] suffix ) {
504
 
                if (source!=null)
505
 
                {
506
 
                        for (int i = 0; i < suffix.length; i++) {
507
 
                                if (source.endsWith( suffix[i] ))
508
 
                                {
509
 
                                        sb.delete( sb.length() - suffix[i].length(), sb.length());
510
 
                                        modified = true;
511
 
                                        setStrings();
512
 
                                        break;
513
 
                                }
514
 
                        }
515
 
                }
516
 
        }
517
 
 
518
 
        /**
519
 
         * Test if a char is a French vowel, including accented ones
520
 
         *
521
 
         * @param ch the char to test
522
 
         * @return boolean - true if the char is a vowel
523
 
         */
524
 
        private boolean isVowel(char ch) {
525
 
                switch (ch)
526
 
                {
527
 
                        case 'a':
528
 
                        case 'e':
529
 
                        case 'i':
530
 
                        case 'o':
531
 
                        case 'u':
532
 
                        case 'y':
533
 
                        case 'â':
534
 
                        case 'à':
535
 
                        case 'ë':
536
 
                        case 'é':
537
 
                        case 'ê':
538
 
                        case 'è':
539
 
                        case 'ï':
540
 
                        case 'î':
541
 
                        case 'ô':
542
 
                        case 'ü':
543
 
                        case 'ù':
544
 
                        case 'û':
545
 
                                return true;
546
 
                        default:
547
 
                                return false;
548
 
                }
549
 
        }
550
 
 
551
 
        /**
552
 
         * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br>
553
 
         * "R is the region after the first non-vowel following a vowel
554
 
         * or is the null region at the end of the word if there is no such non-vowel"<br>
555
 
         * @param buffer java.lang.StringBuffer - the in buffer
556
 
         * @return java.lang.String - the resulting string
557
 
         */
558
 
        private String retrieveR( StringBuffer buffer ) {
559
 
                int len = buffer.length();
560
 
                int pos = -1;
561
 
                for (int c = 0; c < len; c++) {
562
 
                        if (isVowel( buffer.charAt( c )))
563
 
                        {
564
 
                                pos = c;
565
 
                                break;
566
 
                        }
567
 
                }
568
 
                if (pos > -1)
569
 
                {
570
 
                        int consonne = -1;
571
 
                        for (int c = pos; c < len; c++) {
572
 
                                if (!isVowel(buffer.charAt( c )))
573
 
                                {
574
 
                                        consonne = c;
575
 
                                        break;
576
 
                                }
577
 
                        }
578
 
                        if (consonne > -1 && (consonne+1) < len)
579
 
                                return buffer.substring( consonne+1, len );
580
 
                        else
581
 
                                return null;
582
 
                }
583
 
                else
584
 
                        return null;
585
 
        }
586
 
 
587
 
        /**
588
 
         * Retrieve the "RV zone" from a buffer and return the corresponding string<br>
589
 
         * "If the word begins with two vowels, RV is the region after the third letter,
590
 
         * otherwise the region after the first vowel not at the beginning of the word,
591
 
         * or the end of the word if these positions cannot be found."<br>
592
 
         * @param buffer java.lang.StringBuffer - the in buffer
593
 
         * @return java.lang.String - the resulting string
594
 
         */
595
 
        private String retrieveRV( StringBuffer buffer ) {
596
 
                int len = buffer.length();
597
 
                if ( buffer.length() > 3)
598
 
                {
599
 
                        if ( isVowel(buffer.charAt( 0 )) && isVowel(buffer.charAt( 1 ))) {
600
 
                                return buffer.substring(3,len);
601
 
                        }
602
 
                        else
603
 
                        {
604
 
                                int pos = 0;
605
 
                                for (int c = 1; c < len; c++) {
606
 
                                        if (isVowel( buffer.charAt( c )))
607
 
                                        {
608
 
                                                pos = c;
609
 
                                                break;
610
 
                                        }
611
 
                                }
612
 
                                if ( pos+1 < len )
613
 
                                        return buffer.substring( pos+1, len );
614
 
                                else
615
 
                                        return null;
616
 
                        }
617
 
                }
618
 
                else
619
 
                        return null;
620
 
        }
621
 
 
622
 
 
623
 
 
624
 
    /**
625
 
         * Turns u and i preceded AND followed by a vowel to UpperCase<br>
626
 
         * Turns y preceded OR followed by a vowel to UpperCase<br>
627
 
         * Turns u preceded by q to UpperCase<br>
628
 
     *
629
 
     * @param buffer java.lang.StringBuffer - the buffer to treat
630
 
     * @return java.lang.StringBuffer - the treated buffer
631
 
     */
632
 
    private StringBuffer treatVowels( StringBuffer buffer ) {
633
 
                for ( int c = 0; c < buffer.length(); c++ ) {
634
 
                        char ch = buffer.charAt( c );
635
 
 
636
 
                        if (c == 0) // first char
637
 
                        {
638
 
                                if (buffer.length()>1)
639
 
                                {
640
 
                                        if (ch == 'y' && isVowel(buffer.charAt( c + 1 )))
641
 
                                                buffer.setCharAt( c, 'Y' );
642
 
                                }
643
 
                        }
644
 
                        else if (c == buffer.length()-1) // last char
645
 
                        {
646
 
                                if (ch == 'u' && buffer.charAt( c - 1 ) == 'q')
647
 
                                        buffer.setCharAt( c, 'U' );
648
 
                                if (ch == 'y' && isVowel(buffer.charAt( c - 1 )))
649
 
                                        buffer.setCharAt( c, 'Y' );
650
 
                        }
651
 
                        else // other cases
652
 
                        {
653
 
                                if (ch == 'u')
654
 
                                {
655
 
                                        if (buffer.charAt( c - 1) == 'q')
656
 
                                                buffer.setCharAt( c, 'U' );
657
 
                                        else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
658
 
                                                buffer.setCharAt( c, 'U' );
659
 
                                }
660
 
                                if (ch == 'i')
661
 
                                {
662
 
                                        if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
663
 
                                                buffer.setCharAt( c, 'I' );
664
 
                                }
665
 
                                if (ch == 'y')
666
 
                                {
667
 
                                        if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 )))
668
 
                                                buffer.setCharAt( c, 'Y' );
669
 
                                }
670
 
                        }
671
 
                }
672
 
 
673
 
                return buffer;
674
 
    }
675
 
 
676
 
    /**
677
 
     * Checks a term if it can be processed correctly.
678
 
     *
679
 
     * @return boolean - true if, and only if, the given term consists in letters.
680
 
     */
681
 
    private boolean isStemmable( String term ) {
682
 
                boolean upper = false;
683
 
                int first = -1;
684
 
                for ( int c = 0; c < term.length(); c++ ) {
685
 
                        // Discard terms that contain non-letter characters.
686
 
                        if ( !Character.isLetter( term.charAt( c ) ) ) {
687
 
                                return false;
688
 
                        }
689
 
                        // Discard terms that contain multiple uppercase letters.
690
 
                        if ( Character.isUpperCase( term.charAt( c ) ) ) {
691
 
                                if ( upper ) {
692
 
                                        return false;
693
 
                                }
694
 
                        // First encountered uppercase letter, set flag and save
695
 
                        // position.
696
 
                                else {
697
 
                                        first = c;
698
 
                                        upper = true;
699
 
                                }
700
 
                        }
701
 
                }
702
 
                // Discard the term if it contains a single uppercase letter that
703
 
                // is not starting the term.
704
 
                if ( first > 0 ) {
705
 
                        return false;
706
 
                }
707
 
                return true;
708
 
    }
709
 
}