1
package org.apache.lucene.analysis.fr;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
21
* A stemmer for French words. The algorithm is based on the work of
22
* Dr Martin Porter on his snowball project<br>
23
* refer to http://snowball.sourceforge.net/french/stemmer.html<br>
24
* (French stemming algorithm) for details
26
* @author Patrick Talbot
29
public class FrenchStemmer {
32
* Buffer for the terms while stemming them.
34
private StringBuffer sb = new StringBuffer();
37
* A temporary buffer, used to reconstruct R2
39
private StringBuffer tb = new StringBuffer();
42
* Region R0 is equal to the whole buffer
48
* "If the word begins with two vowels, RV is the region after the third letter,
49
* otherwise the region after the first vowel not at the beginning of the word,
50
* or the end of the word if these positions cannot be found."
56
* "R1 is the region after the first non-vowel following a vowel
57
* or is the null region at the end of the word if there is no such non-vowel"
63
* "R2 is the region after the first non-vowel in R1 following a vowel
64
* or is the null region at the end of the word if there is no such non-vowel"
70
* Set to true if we need to perform step 2
72
private boolean suite;
75
* Set to true if the buffer was modified
77
private boolean modified;
81
* Stems the given term to a unique <tt>discriminator</tt>.
83
* @param term java.lang.String The term that should be stemmed
84
* @return java.lang.String Discriminator for <tt>term</tt>
86
protected String stem( String term ) {
87
if ( !isStemmable( term ) ) {
91
// Use lowercase for medium stemming.
92
term = term.toLowerCase();
94
// Reset the StringBuffer.
95
sb.delete( 0, sb.length() );
102
sb = treatVowels( sb );
108
if (!modified || suite)
118
if (modified || suite)
127
return sb.toString();
131
* Sets the search region Strings<br>
132
* it needs to be done each time the buffer was modified
134
private void setStrings() {
137
RV = retrieveRV( sb );
138
R1 = retrieveR( sb );
141
tb.delete( 0, tb.length() );
143
R2 = retrieveR( tb );
150
* First step of the Porter Algorithm<br>
151
* refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
153
private void step1( ) {
154
String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
155
deleteFrom( R2, suffix );
157
replaceFrom( R2, new String[] { "logies", "logie" }, "log" );
158
replaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" );
159
replaceFrom( R2, new String[] { "ences", "ence" }, "ent" );
161
String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"};
162
deleteButSuffixFromElseReplace( R2, search, "ic", true, R0, "iqU" );
164
deleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" );
165
deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false );
166
deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false );
167
deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false );
168
deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false );
170
deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 );
171
deleteFrom( RV, new String[] { "ements", "ement" } );
173
deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0, "abl" );
174
deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU" );
175
deleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true );
177
String[] autre = { "ifs", "ives", "if", "ive" };
178
deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" );
179
deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" );
181
replaceFrom( R0, new String[] { "eaux" }, "eau" );
183
replaceFrom( R1, new String[] { "aux" }, "al" );
185
deleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" );
187
deleteFrom( R2, new String[] { "eux" } );
189
// if one of the next steps is performed, we will need to perform step2a
190
boolean temp = false;
191
temp = replaceFrom( RV, new String[] { "amment" }, "ant" );
194
temp = replaceFrom( RV, new String[] { "emment" }, "ent" );
197
temp = deleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV );
204
* Second step (A) of the Porter Algorithm<br>
205
* Will be performed if nothing changed from the first step
206
* or if changes were made to the amment, emment, ments or ment suffixes<br>
207
* refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
209
* @return boolean - true if something changed in the StringBuffer
211
private boolean step2a() {
212
String[] search = { "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira",
213
"irent", "iriez", "irez", "irions", "irons", "iront",
214
"issaIent", "issais", "issantes", "issante", "issants", "issant",
215
"issait", "issais", "issions", "issons", "issiez", "issez", "issent",
216
"isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" };
217
return deleteFromIfTestVowelBeforeIn( RV, search, false, RV );
221
* Second step (B) of the Porter Algorithm<br>
222
* Will be performed if step 2 A was performed unsuccessfully<br>
223
* refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
225
private void step2b() {
226
String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
227
"erons", "eront","erez", "èrent", "era", "ées", "iez",
228
"ée", "és", "er", "ez", "é" };
229
deleteFrom( RV, suffix );
231
String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent",
232
"antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant",
233
"ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" };
234
deleteButSuffixFrom( RV, search, "e", true );
236
deleteFrom( R2, new String[] { "ions" } );
240
* Third step of the Porter Algorithm<br>
241
* refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
243
private void step3() {
246
char ch = sb.charAt( sb.length()-1 );
249
sb.setCharAt( sb.length()-1, 'i' );
254
sb.setCharAt( sb.length()-1, 'c' );
261
* Fourth step of the Porter Algorithm<br>
262
* refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
264
private void step4() {
267
char ch = sb.charAt( sb.length()-1 );
270
char b = sb.charAt( sb.length()-2 );
271
if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's')
273
sb.delete( sb.length() - 1, sb.length());
278
boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" );
280
found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" );
282
replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" );
283
deleteFrom( RV, new String[] { "e" } );
284
deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" );
288
* Fifth step of the Porter Algorithm<br>
289
* refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
291
private void step5() {
294
if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill"))
296
sb.delete( sb.length() - 1, sb.length() );
303
* Sixth (and last!) step of the Porter Algorithm<br>
304
* refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
306
private void step6() {
307
if (R0!=null && R0.length()>0)
309
boolean seenVowel = false;
310
boolean seenConson = false;
312
for (int i = R0.length()-1; i > -1; i--)
314
char ch = R0.charAt(i);
319
if (ch == 'é' || ch == 'è')
335
if (pos > -1 && seenConson && !seenVowel)
336
sb.setCharAt(pos, 'e');
341
* Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
343
* @param source java.lang.String - the primary source zone for search
344
* @param search java.lang.String[] - the strings to search for suppression
345
* @param from java.lang.String - the secondary source zone for search
346
* @param prefix java.lang.String - the prefix to add to the search string to test
347
* @return boolean - true if modified
349
private boolean deleteFromIfPrecededIn( String source, String[] search, String from, String prefix ) {
350
boolean found = false;
353
for (int i = 0; i < search.length; i++) {
354
if ( source.endsWith( search[i] ))
356
if (from!=null && from.endsWith( prefix + search[i] ))
358
sb.delete( sb.length() - search[i].length(), sb.length());
370
* Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
372
* @param source java.lang.String - the primary source zone for search
373
* @param search java.lang.String[] - the strings to search for suppression
374
* @param vowel boolean - true if we need a vowel before the search string
375
* @param from java.lang.String - the secondary source zone for search (where vowel could be)
376
* @return boolean - true if modified
378
private boolean deleteFromIfTestVowelBeforeIn( String source, String[] search, boolean vowel, String from ) {
379
boolean found = false;
380
if (source!=null && from!=null)
382
for (int i = 0; i < search.length; i++) {
383
if ( source.endsWith( search[i] ))
385
if ((search[i].length() + 1) <= from.length())
387
boolean test = isVowel(sb.charAt(sb.length()-(search[i].length()+1)));
390
sb.delete( sb.length() - search[i].length(), sb.length());
404
* Delete a suffix searched in zone "source" if preceded by the prefix
406
* @param source java.lang.String - the primary source zone for search
407
* @param search java.lang.String[] - the strings to search for suppression
408
* @param prefix java.lang.String - the prefix to add to the search string to test
409
* @param without boolean - true if it will be deleted even without prefix found
411
private void deleteButSuffixFrom( String source, String[] search, String prefix, boolean without ) {
414
for (int i = 0; i < search.length; i++) {
415
if ( source.endsWith( prefix + search[i] ))
417
sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
422
else if ( without && source.endsWith( search[i] ))
424
sb.delete( sb.length() - search[i].length(), sb.length() );
434
* Delete a suffix searched in zone "source" if preceded by prefix<br>
435
* or replace it with the replace string if preceded by the prefix in the zone "from"<br>
436
* or delete the suffix if specified
438
* @param source java.lang.String - the primary source zone for search
439
* @param search java.lang.String[] - the strings to search for suppression
440
* @param prefix java.lang.String - the prefix to add to the search string to test
441
* @param without boolean - true if it will be deleted even without prefix found
443
private void deleteButSuffixFromElseReplace( String source, String[] search, String prefix, boolean without, String from, String replace ) {
446
for (int i = 0; i < search.length; i++) {
447
if ( source.endsWith( prefix + search[i] ))
449
sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
454
else if ( from!=null && from.endsWith( prefix + search[i] ))
456
sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace );
461
else if ( without && source.endsWith( search[i] ))
463
sb.delete( sb.length() - search[i].length(), sb.length() );
473
* Replace a search string with another within the source zone
475
* @param source java.lang.String - the source zone for search
476
* @param search java.lang.String[] - the strings to search for replacement
477
* @param replace java.lang.String - the replacement string
479
private boolean replaceFrom( String source, String[] search, String replace ) {
480
boolean found = false;
483
for (int i = 0; i < search.length; i++) {
484
if ( source.endsWith( search[i] ))
486
sb.replace( sb.length() - search[i].length(), sb.length(), replace );
498
* Delete a search string within the source zone
500
* @param source the source zone for search
501
* @param suffix the strings to search for suppression
503
private void deleteFrom(String source, String[] suffix ) {
506
for (int i = 0; i < suffix.length; i++) {
507
if (source.endsWith( suffix[i] ))
509
sb.delete( sb.length() - suffix[i].length(), sb.length());
519
* Test if a char is a French vowel, including accented ones
521
* @param ch the char to test
522
* @return boolean - true if the char is a vowel
524
private boolean isVowel(char ch) {
552
* Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br>
553
* "R is the region after the first non-vowel following a vowel
554
* or is the null region at the end of the word if there is no such non-vowel"<br>
555
* @param buffer java.lang.StringBuffer - the in buffer
556
* @return java.lang.String - the resulting string
558
private String retrieveR( StringBuffer buffer ) {
559
int len = buffer.length();
561
for (int c = 0; c < len; c++) {
562
if (isVowel( buffer.charAt( c )))
571
for (int c = pos; c < len; c++) {
572
if (!isVowel(buffer.charAt( c )))
578
if (consonne > -1 && (consonne+1) < len)
579
return buffer.substring( consonne+1, len );
588
* Retrieve the "RV zone" from a buffer and return the corresponding string<br>
589
* "If the word begins with two vowels, RV is the region after the third letter,
590
* otherwise the region after the first vowel not at the beginning of the word,
591
* or the end of the word if these positions cannot be found."<br>
592
* @param buffer java.lang.StringBuffer - the in buffer
593
* @return java.lang.String - the resulting string
595
private String retrieveRV( StringBuffer buffer ) {
596
int len = buffer.length();
597
if ( buffer.length() > 3)
599
if ( isVowel(buffer.charAt( 0 )) && isVowel(buffer.charAt( 1 ))) {
600
return buffer.substring(3,len);
605
for (int c = 1; c < len; c++) {
606
if (isVowel( buffer.charAt( c )))
613
return buffer.substring( pos+1, len );
625
* Turns u and i preceded AND followed by a vowel to UpperCase<br>
626
* Turns y preceded OR followed by a vowel to UpperCase<br>
627
* Turns u preceded by q to UpperCase<br>
629
* @param buffer java.lang.StringBuffer - the buffer to treat
630
* @return java.lang.StringBuffer - the treated buffer
632
private StringBuffer treatVowels( StringBuffer buffer ) {
633
for ( int c = 0; c < buffer.length(); c++ ) {
634
char ch = buffer.charAt( c );
636
if (c == 0) // first char
638
if (buffer.length()>1)
640
if (ch == 'y' && isVowel(buffer.charAt( c + 1 )))
641
buffer.setCharAt( c, 'Y' );
644
else if (c == buffer.length()-1) // last char
646
if (ch == 'u' && buffer.charAt( c - 1 ) == 'q')
647
buffer.setCharAt( c, 'U' );
648
if (ch == 'y' && isVowel(buffer.charAt( c - 1 )))
649
buffer.setCharAt( c, 'Y' );
655
if (buffer.charAt( c - 1) == 'q')
656
buffer.setCharAt( c, 'U' );
657
else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
658
buffer.setCharAt( c, 'U' );
662
if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
663
buffer.setCharAt( c, 'I' );
667
if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 )))
668
buffer.setCharAt( c, 'Y' );
677
* Checks a term if it can be processed correctly.
679
* @return boolean - true if, and only if, the given term consists of letters.
681
private boolean isStemmable( String term ) {
682
boolean upper = false;
684
for ( int c = 0; c < term.length(); c++ ) {
685
// Discard terms that contain non-letter characters.
686
if ( !Character.isLetter( term.charAt( c ) ) ) {
689
// Discard terms that contain multiple uppercase letters.
690
if ( Character.isUpperCase( term.charAt( c ) ) ) {
694
// First encountered uppercase letter, set flag and save
702
// Discard the term if it contains a single uppercase letter that
703
// is not starting the term.