2
* ----START-LICENCE----
3
* Copyright 1999,2000 BrightStation PLC
5
* This program is free software; you can redistribute it and/or
6
* modify it under the terms of the GNU General Public License as
7
* published by the Free Software Foundation; either version 2 of the
8
* License, or (at your option) any later version.
10
* This program is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU General Public License for more details.
15
* You should have received a copy of the GNU General Public License
16
* along with this program; if not, write to the Free Software
17
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
19
* -----END-LICENCE-----
21
/* Version 1: see http://open.muscat.com/ for further information */
25
#include <ctype.h> /* tolower */
27
static void * setup_english_stemmer(void);
29
static const char * english_stem(void * z, const char * q, int i0, int i1);
31
static void closedown_english_stemmer(void * z);
34
/* To set up the english stemming process:
36
void * z = setup_stemmer();
40
char * p = stem(z, q, i0, i1);
42
The word to be stemmed is in byte address q offsets i0 to i1
43
inclusive (i.e. from q[i0] to q[i1]). The stemmed result is the
44
C string at address p.
46
To close down the stemming process:
52
/* The English stemming algorithm is essentially the Porter stemming
53
* algorithm, and has been coded up by its author. It follows the algorithm
56
* Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
59
* only differing from it at the points marked -DEPARTURE- and -NEW-
62
* For a more faithful version of the Porter algorithm, see
64
* http://www.muscat.com/~martin/stem.html
72
The 'l' of the 'logi' -> 'log' rule is put with the stem, so that
73
short stems like 'geo' 'theo' etc work like 'archaeo' 'philo' etc.
75
This follows a suggestion of Barry Wilkins, reasearch student at
81
the cvc test for not dropping final -e now looks after vc at the
82
beginning of a word, so are, eve, ice, ore, use keep final -e. In this
83
test c is any consonant, including w, x and y. This extension was
84
suggested by Chris Emerson.
86
-fully -> -ful treated like -fulness -> -ful, and
87
-tionally -> -tion treated like -tional -> -tion
89
both in Step 2. These were suggested by Hiranmay Ghosh, of New Delhi.
91
Invariants proceed, succeed, exceed. Also suggested by Hiranmay Ghosh.
102
struct pool_entry * entries;
106
/* This is used as a library to resolve exceptions in the various
107
stemming algorithms. Typical use is,
109
struct pool * p = create_pool(t);
110
char * s_translated = search_pool(p, strlen(s), s);
114
t is an array of strings, e.g.
116
static char * t[] = {
127
if s is "sky", "skies", "dying" etc., translated_s is becomes "sky",
130
The code includes a sort/merge capability which may be turned into
131
(or replaced by) something more general later on.
135
/* merge(n, p, q, r, l, k, f) repeatedly merges n-byte sequences of items of
136
size k from addresses p and q into r. f is the comparison routine and
137
l is the limit point for q.
140
static void merge(int n, char * p, char * q, char * r, char * l, int k,
141
int (*f)(char *, char *))
143
if (q0 > l) { memmove(r, p, l-p); return; }
149
{ if (p >= pl) { memmove(r, q, ql-q); r += ql-q; q = ql; break; }
150
if (q >= ql) { memmove(r, p, pl-p); r += pl-p; p = pl; break; }
151
if (f(p, q)) { memmove(r, p, k); p += k; }
152
else { memmove(r, q, k); q += k; }
159
/* In sort(p, c, k, f), p+c is a byte address at which begin a sequence of
160
items of size k to be sorted. p+l is the address of the byte after the
161
last of these items, so l - c is divisible by k. f is a comparison function
162
for a pair of these items: f(p+i, q+j) is true if the item at p+i is before
163
the item at q+j, false if it is equal to or after it.
166
static void sort(char * p, int c, int l, int k,
167
int (*f)(char *, char *))
169
char * q = malloc(l-c); /* temporary work space */
174
for (cycle = 1; cycle <= 2; cycle++)
175
{ int h = (w+j-1) / j / 2 * j; /* half way */
176
if (cycle == 1) merge(j, p+c, p+c+h, q, p+l, k, f);
177
else merge(j, q, q+h, p+c, q+w, k, f);
186
const char * translation;
187
const char * pointer;
192
static void print_entry(struct pool_entry * p)
194
{ int j; for (j=0;j<p->length;j++) fprintf(stderr, "%c", (p->pointer)[j]); }
195
fprintf(stderr, " --> %s\n", p->translation);
199
static void print_pool(struct pool * p)
202
struct pool_entry * q = p->entries;
203
fprintf(stderr, "\nPool:\n");
204
for (i = 0; i < size; i++) print_entry(q+i);
208
/* compare(p, q) is our comparison function, used for f above
211
static int compare(char * char_p, char * char_q)
212
{ struct pool_entry * p = (struct pool_entry *) char_p;
213
struct pool_entry * q = (struct pool_entry *) char_q;
214
if (p->length == q->length) return memcmp(p->pointer, q->pointer, p->length) < 0;
215
return p->length < q->length;
218
static int count_slashes(const char * s[])
219
{ int slash_count = 0;
221
for (i = 1; s[i] != 0; i += 2)
222
{ const char * p = s[i];
224
while (p[j] != 0) if (p[j++] == '/') slash_count++;
229
static struct pool * create_pool(const char * s[])
230
{ int size = count_slashes(s);
231
struct pool_entry * z = (struct pool_entry *) malloc(size * sizeof(struct pool_entry));
232
struct pool_entry * q = z;
234
for (i = 1; s[i] != 0; i += 2)
235
{ const char * p = s[i];
240
{ if (j0 != j) { fprintf(stderr, "%s lacks final '/'\n", p); exit(1); }
245
q->translation = s[i-1];
246
q->pointer = p+j0; q->length = j-j0;
253
sort((char *) z, 0, size * sizeof(struct pool_entry), sizeof(struct pool_entry), compare);
255
/* now validate the contents */
257
for (i = 1; i < size; i++)
258
{ struct pool_entry * p = z+i;
259
struct pool_entry * q = z+i-1;
260
if (p->length == q->length && memcmp(p->pointer, q->pointer, p->length) == 0)
261
{ fprintf(stderr, "warning: "); print_entry(p);
262
fprintf(stderr, " and "); print_entry(q);
266
{ struct pool * p = (struct pool *) malloc(sizeof(struct pool));
273
static int compare_to_pool(int length, const char * s, int length_p, const char * s_p)
274
{ if (length != length_p) return length-length_p;
275
return memcmp(s, s_p, length);
278
static const char * search_pool(struct pool * p, int length, char * s)
281
struct pool_entry * q = p->entries;
282
if (j == 0) return 0; /* empty pool */
283
if (compare_to_pool(length, s, q->length, q->pointer) < 0) return 0;
287
int diff = compare_to_pool(length, s, (q+h)->length, (q+h)->pointer);
288
if (diff == 0) return (q+h)->translation;
289
if (j-i <= 1) return 0;
290
if (diff < 0) j = h; else i = h;
294
static void free_pool(struct pool * p)
299
struct english_stemmer
305
struct pool * irregulars;
308
/* The main part of the stemming algorithm starts here. z->p is a buffer
309
holding a word to be stemmed. The letters are in z->p[0], z->p[1] ...
310
ending at z->p[z->k]. z->k is readjusted downwards as the stemming
311
progresses. Zero termination is not in fact used in the algorithm.
313
Note that only lower case sequences are stemmed. Forcing to lower case
314
should be done before english_stem(...) is called.
316
We will write p, k etc in place of z->p, z->k in the comments.
319
/* cons(z, i) is true <=> p[i] is a consonant.
322
static int cons(struct english_stemmer * z, int i)
324
{ case 'a': case 'e': case 'i': case 'o': case 'u':
327
return (i==0) ? true : !cons(z, i - 1);
328
default: return true;
332
/* m(z) measures the number of consonant sequences between 0 and j. if c is
333
a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
343
static int m(struct english_stemmer * z)
347
{ if (i > z->j) return n;
348
if (! cons(z, i)) break; i++;
353
{ if (i > z->j) return n;
354
if (cons(z, i)) break;
360
{ if (i > z->j) return n;
361
if (! cons(z, i)) break;
368
/* vowelinstem(z) is true p[0], ... p[j] contains a vowel
371
static int vowelinstem(struct english_stemmer * z)
373
for (i = 0; i <= z->j; i++) if (! cons(z, i)) return true;
377
/* doublec(z, i) is true <=> p[i], p[i - 1] contain a double consonant.
380
static int doublec(struct english_stemmer * z, int i)
381
{ if (i < 1) return false;
382
if (z->p[i] != z->p[i - 1]) return false;
386
/* cvc(z, i) is true <=>
388
a) ( -NEW- ) i == 1, and p[0] p[1] is vowel consonant, or
390
b) p[i - 2], p[i - 1], p[i] has the form consonant -
391
vowel - consonant and also if the second c is not w, x or y. this is used
392
when trying to restore an e at the end of a short word. e.g.
394
cav(e), lov(e), hop(e), crim(e), but
399
static int cvc(struct english_stemmer * z, int i)
401
if (i == 0) return false; /* i == 0 never happens perhaps */
403
if (i == 1) return !cons(z, 0) && cons(z, 1);
405
if (!cons(z, i) || cons(z, i - 1) || !cons(z, i - 2)) return false;
407
if (ch == 'w' || ch == 'x' || ch == 'y') return false;
412
/* ends(z, s, length) is true <=> p[0], ... p[k] ends with the string s.
415
static int ends(struct english_stemmer * z, const char * s, int length)
417
if (length > z->k + 1) return false;
418
if (memcmp(z->p + z->k - length + 1, s, length) != 0) return false;
419
z->j = z->k - length;
423
/* setto(z, s, length) sets p[j + 1] ... to the characters in the string s,
427
static void setto(struct english_stemmer * z, const char * s, int length)
429
memmove(z->p + z->j + 1, s, length);
430
z->k = z->j + length;
433
/* r(z, s, length) is used further down. */
435
static void r(struct english_stemmer * z, const char * s, int length)
437
if (m(z) > 0) setto(z, s, length);
440
/* step_1ab(z) gets rid of plurals and -ed or -ing. e.g.
445
tie -> tie (-NEW-: see below)
463
static void step_1ab(struct english_stemmer * z)
464
{ if (z->p[z->k] == 's')
465
{ if (ends(z, "sses", 4)) z->k -= 2; else
466
if (ends(z, "ies", 3))
467
if (z->j == 0) z->k--; else z->k -= 2;
469
/* this line extends the original algorithm, so that 'flies'->'fli' but
473
if (z->p[z->k - 1] != 's') z->k--;
476
if (ends(z, "ied", 3)) { if (z->j == 0) z->k--; else z->k -= 2; } else
478
/* this line extends the original algorithm, so that 'spied'->'spi' but
481
if (ends(z, "eed", 3)) { if (m(z) > 0) z->k--; } else
482
if ((ends(z, "ed", 2) || ends(z, "ing", 3)) && vowelinstem(z))
484
if (ends(z, "at", 2)) setto(z, "ate", 3); else
485
if (ends(z, "bl", 2)) setto(z, "ble", 3); else
486
if (ends(z, "iz", 2)) setto(z, "ize", 3); else
487
if (doublec(z, z->k))
489
{ int ch = z->p[z->k];
490
if (ch == 'l' || ch == 's' || ch == 'z') z->k++;
493
else if (m(z) == 1 && cvc(z, z->k)) setto(z, "e", 1);
497
/* step_1c(z) turns terminal y to i when there is another vowel in the stem.
499
-NEW-: This has been modified from the original Porter algorithm so that y->i
500
is only done when y is preceded by a consonant, but not if the stem
501
is only a single consonant, i.e.
503
(*c and not c) Y -> I
505
So 'happy' -> 'happi', but
506
'enjoy' -> 'enjoy' etc
508
This is a much better rule. Formerly 'enjoy'->'enjoi' and 'enjoyment'->
509
'enjoy'. Step 1c is perhaps done too soon; but with this modification that
510
no longer really matters.
512
Also, the removal of the vowelinstem(z) condition means that 'spy', 'fly',
513
'try' ... stem to 'spi', 'fli', 'tri' and conflate with 'spied', 'tried',
518
static void step_1c(struct english_stemmer * z)
520
if (ends(z, "y", 1) && z->j > 0 && cons(z, z->k - 1)) z->p[z->k] = 'i';
524
/* step_2(z) maps double suffices to single ones. so -ization ( = -ize plus
525
-ation) maps to -ize etc. Note that the string before the suffix must give
529
static void step_2(struct english_stemmer * z)
530
{ switch (z->p[z->k - 1])
533
if (ends(z, "ational", 7)) { r(z, "ate", 3); break; }
534
if (ends(z, "tional", 6)) { r(z, "tion", 4); break; }
537
if (ends(z, "enci", 4)) { r(z, "ence", 4); break; }
538
if (ends(z, "anci", 4)) { r(z, "ance", 4); break; }
541
if (ends(z, "izer", 4)) { r(z, "ize", 3); break; }
544
if (ends(z, "bli", 3)) { r(z, "ble", 3); break; } /*-DEPARTURE-*/
546
/* To match the published algorithm, replace this line with
548
if (ends(z, "abli", 4)) { r(z, "able", 4); break; }
550
if (ends(z, "alli", 4))
552
if (m(z) > 0) { setto(z, "al", 2); step_2(z); } /*-NEW-*/
556
if (ends(z, "fulli", 5)) { r(z, "ful", 3); break; } /*-NEW-*/
557
if (ends(z, "entli", 5)) { r(z, "ent", 3); break; }
558
if (ends(z, "eli", 3)) { r(z, "e", 1); break; }
559
if (ends(z, "ousli", 5)) { r(z, "ous", 3); break; }
562
if (ends(z, "ization", 7)) { r(z, "ize", 3); break; }
563
if (ends(z, "ation", 5)) { r(z, "ate", 3); break; }
564
if (ends(z, "ator", 4)) { r(z, "ate", 3); break; }
567
if (ends(z, "alism", 5)) { r(z, "al", 2); break; }
568
if (ends(z, "iveness", 7)) { r(z, "ive", 3); break; }
569
if (ends(z, "fulness", 7)) { r(z, "ful", 3); break; }
570
if (ends(z, "ousness", 7)) { r(z, "ous", 3); break; }
573
if (ends(z, "aliti", 5)) { r(z, "al", 2); break; }
574
if (ends(z, "iviti", 5)) { r(z, "ive", 3); break; }
575
if (ends(z, "biliti", 6)) { r(z, "ble", 3); break; }
578
if (ends(z, "logi", 4))
579
{ z->j++; /*-NEW-*/ /*(Barry Wilkins)*/
580
r(z, "og", 2); break;
583
/* To match the published algorithm, delete this line */
588
/* step_3(z) deals with -ic-, -full, -ness etc. Similar strategy to step_2.
591
static void step_3(struct english_stemmer * z)
592
{ switch (z->p[z->k])
595
if (ends(z, "icate", 5)) { r(z, "ic", 2); break; }
596
if (ends(z, "ative", 5)) { r(z, "", 0); break; }
597
if (ends(z, "alize", 5)) { r(z, "al", 2); break; }
600
if (ends(z, "iciti", 5)) { r(z, "ic", 2); break; }
603
if (ends(z, "ical", 4)) { r(z, "ic", 2); break; }
604
if (ends(z, "ful", 3)) { r(z, "", 0); break; }
607
if (ends(z, "ness", 4)) { r(z, "", 0); break; }
612
/* step_4() takes off -ant, -ence etc., in context <c>vcvc<v>.
615
static void step_4(struct english_stemmer * z)
616
{ switch (z->p[z->k - 1])
618
if (ends(z, "al", 2)) break; return;
620
if (ends(z, "ance", 4)) break;
621
if (ends(z, "ence", 4)) break; return;
623
if (ends(z, "er", 2)) break; return;
625
if (ends(z, "ic", 2)) break; return;
627
if (ends(z, "able", 4)) break;
628
if (ends(z, "ible", 4)) break; return;
630
if (ends(z, "ant", 3)) break;
631
if (ends(z, "ement", 5)) break;
632
if (ends(z, "ment", 4)) break;
633
if (ends(z, "ent", 3)) break; return;
635
if (ends(z, "ion", 3) && (z->p[z->j] == 's' ||
636
z->p[z->j] == 't')) break;
637
if (ends(z, "ou", 2)) break; return;
638
/* takes care of -ous */
640
if (ends(z, "ism", 3)) break; return;
642
if (ends(z, "ate", 3)) break;
643
if (ends(z, "iti", 3)) break; return;
645
if (ends(z, "ous", 3)) break; return;
647
if (ends(z, "ive", 3)) break; return;
649
if (ends(z, "ize", 3)) break; return;
653
if (m(z) > 1) z->k = z->j;
656
/* step_5(z) removes a final -e if m(z) > 1, and changes -ll to -l if
660
static void step_5(struct english_stemmer * z)
662
if (z->p[z->k] == 'e')
664
if (a > 1 || (a == 1 && !cvc(z, z->k - 1))) z->k--;
666
if (z->p[z->k] == 'l' && doublec(z, z->k) && m(z) > 1) z->k--;
669
static const char * english_stem(void * z_, const char * q, int i0, int i1)
671
struct english_stemmer * z = (struct english_stemmer *) z_;
672
int p_size = z->p_size;
674
if (i1 - i0 + 50 > p_size)
676
p_size = i1 - i0 + 75; /* ample */ z->p_size = p_size;
677
z->p = (char *) malloc(p_size);
680
memmove(z->p, q + i0, i1 - i0 + 1);
685
{ const char * t = search_pool(z->irregulars, z->k + 1, z->p);
687
z->k = strlen(t) - 1;
692
if (z->k > 1) /*-DEPARTURE-*/
694
/* With this line, strings of length 1 or 2 don't go through the
695
stemming process, although no mention is made of this in the
696
published algorithm. Remove the line to match the published
699
{ step_1ab(z); step_1c(z);
706
z->p[z->k + 1] = 0; /* C string form for now */
711
This is a table of irregular forms. It is quite short, but still
712
reflects the errors actually drawn to Martin Porter's attention over
715
Extend it as necessary.
717
The form of the table is:
719
"p1" "s11/s12/s13/ ... /"
720
"p2" "s21/s22/s23/ ... /"
722
"pn" "sn1/sn2/sn3/ ... /"
725
String sij is mapped to paradigm form pi, and the main stemming
726
process is then bypassed.
729
static const char * irregular_forms[] = {
736
"inning", "innings/inning/",
737
"outing", "outings/outing/",
738
"canning", "cannings/canning/",
742
"proceed", "proceed/",
744
"succeed", "succeed/", /* Hiranmay Ghosh */
746
0, 0 /* terminator */
762
/* is exists left tree ? */
764
/* finish word flag */
766
#define ISLEFT(x) (((ESWNODE*)x)->flag & L)
767
#define ISFINISH(x) (((ESWNODE*)x)->flag & F)
770
static ESWNODE engstoptree[] = {
1204
find_english_stopword( unsigned char *buf, int len ) {
1205
ESWNODE *ptr = engstoptree;
1207
unsigned char *cur = buf;
1209
while( cur - buf < len ) {
1210
if ( ptr->val == *cur ) {
1212
if ( ISFINISH(ptr) ) result = cur - buf;
1213
if ( ! ptr->child ) break;
1215
} else if ( ptr->val > *cur ) {
1236
is_stopengword(void* obj,char* word,int len)
1238
return ( len == find_english_stopword((unsigned char*)word, len) ) ? 1 : 0;
1242
setup_english_stemmer(void)
1244
struct english_stemmer * z = (struct english_stemmer *) malloc(sizeof(struct english_stemmer));
1245
z->p = 0; z->p_size = 0;
1246
z->irregulars = create_pool(irregular_forms);
1251
closedown_english_stemmer(void * z_)
1253
struct english_stemmer * z = (struct english_stemmer *) z_;
1254
free_pool(z->irregulars);
1260
engstemming(void* obj, char *word, int *len)
1262
struct english_stemmer * z = (struct english_stemmer *) obj;
1263
const char* stemmed_word;
1264
char *result = word;
1266
while(result-word < *len) {
1267
*result = tolower((unsigned char) *result);
1270
stemmed_word = english_stem(obj, word, 0, *len-1);
1273
result = (char*)palloc( *len );
1274
memcpy((void*)result, (void*)stemmed_word, *len);
1277
#endif /* DICT_BODY */
1282
setup_english_stemmer,
1283
closedown_english_stemmer,