1
#include "license.readme"
10
#include "affentry.hxx"
14
extern char * mystrdup(const char * s);
15
extern char * myrevstrdup(const char * s);
17
PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
19
// register affix manager
22
// set up its initial values
23
achar = dp->achar; // char flag
24
strip = dp->strip; // string to strip
25
appnd = dp->appnd; // string to append
26
stripl = dp->stripl; // length of strip string
27
appndl = dp->appndl; // length of append string
28
numconds = dp->numconds; // number of conditions to match
29
xpflg = dp->xpflg; // cross product flag
30
// then copy over all of the conditions
31
memcpy(&conds[0],&dp->conds[0],SETSIZE*sizeof(conds[0]));
41
if (appnd) free(appnd);
42
if (strip)free(strip);
50
// add prefix to this word assuming conditions hold
51
char * PfxEntry::add(const char * word, int len)
54
char tword[MAXWORDLEN+1];
56
/* make sure all conditions match */
57
if ((len > stripl) && (len >= numconds)) {
58
unsigned char * cp = (unsigned char *) word;
59
for (cond = 0; cond < numconds; cond++) {
60
if ((conds[*cp++] & (1 << cond)) == 0)
63
if (cond >= numconds) {
64
/* we have a match so add prefix */
70
char * pp = tword + tlen;
71
strcpy(pp, (word + stripl));
72
return mystrdup(tword);
81
// check if this prefix entry matches
82
struct hentry * PfxEntry::check(const char * word, int len)
84
int cond; // condition number being examined
85
int tmpl; // length of tmpword
86
struct hentry * he; // hash entry of root word or NULL
88
char tmpword[MAXWORDLEN+1];
91
// on entry prefix is 0 length or already matches the beginning of the word.
92
// So if the remaining root word has positive length
93
// and if there are enough chars in root word and added back strip chars
94
// to meet the number of characters conditions, then test it
98
if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
100
// generate new root word by removing prefix and adding
101
// back any characters that would have been stripped
103
if (stripl) strcpy (tmpword, strip);
104
strcpy ((tmpword + stripl), (word + appndl));
106
// now make sure all of the conditions on characters
107
// are met. Please see the appendix at the end of
108
// this file for more info on exactly what is being
111
cp = (unsigned char *)tmpword;
112
for (cond = 0; cond < numconds; cond++) {
113
if ((conds[*cp++] & (1 << cond)) == 0) break;
116
// if all conditions are met then check if resulting
117
// root word is in the dictionary
119
if (cond >= numconds) {
121
if ((he = pmyMgr->lookup(tmpword)) != NULL) {
122
if (TESTAFF(he->astr, achar, he->alen)) return he;
125
// prefix matched but no root word was found
126
// if XPRODUCT is allowed, try again but now
127
// cross checked combined with a suffix
129
if (xpflg & XPRODUCT) {
130
he = pmyMgr->suffix_check(tmpword, tmpl, XPRODUCT, (AffEntry *)this);
140
SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
142
// register affix manager
145
// set up its initial values
146
achar = dp->achar; // char flag
147
strip = dp->strip; // string to strip
148
appnd = dp->appnd; // string to append
149
stripl = dp->stripl; // length of strip string
150
appndl = dp->appndl; // length of append string
151
numconds = dp->numconds; // number of conditions to match
152
xpflg = dp->xpflg; // cross product flag
154
// then copy over all of the conditions
155
memcpy(&conds[0],&dp->conds[0],SETSIZE*sizeof(conds[0]));
157
rappnd = myrevstrdup(appnd);
161
SfxEntry::~SfxEntry()
164
if (appnd) free(appnd);
165
if (rappnd) free(rappnd);
166
if (strip) free(strip);
174
// add suffix to this word assuming conditions hold
175
char * SfxEntry::add(const char * word, int len)
178
char tword[MAXWORDLEN+1];
180
/* make sure all conditions match */
181
if ((len > stripl) && (len >= numconds)) {
182
unsigned char * cp = (unsigned char *) (word + len);
183
for (cond = numconds; --cond >=0; ) {
184
if ((conds[*--cp] & (1 << cond)) == 0)
188
/* we have a match so add suffix */
194
char * pp = (tword + tlen);
199
return mystrdup(tword);
207
// see if this suffix is present in the word
208
struct hentry * SfxEntry::check(const char * word, int len, int optflags, AffEntry* ppfx)
210
int tmpl; // length of tmpword
211
int cond; // condition being examined
212
struct hentry * he; // hash entry pointer
214
char tmpword[MAXWORDLEN+1];
215
PfxEntry* ep = (PfxEntry *) ppfx;
218
// if this suffix is being cross checked with a prefix
219
// but it does not support cross products skip it
221
if ((optflags & XPRODUCT) != 0 && (xpflg & XPRODUCT) == 0)
224
// upon entry suffix is 0 length or already matches the end of the word.
225
// So if the remaining root word has positive length
226
// and if there are enough chars in root word and added back strip chars
227
// to meet the number of characters conditions, then test it
231
if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
233
// generate new root word by removing suffix and adding
234
// back any characters that would have been stripped or
235
// null terminating the shorter string
237
strcpy (tmpword, word);
238
cp = (unsigned char *)(tmpword + tmpl);
240
strcpy ((char *)cp, strip);
242
cp = (unsigned char *)(tmpword + tmpl);
245
// now make sure all of the conditions on characters
246
// are met. Please see the appendix at the end of
247
// this file for more info on exactly what is being
250
for (cond = numconds; --cond >= 0; ) {
251
if ((conds[*--cp] & (1 << cond)) == 0) break;
254
// if all conditions are met then check if resulting
255
// root word is in the dictionary
258
if ((he = pmyMgr->lookup(tmpword)) != NULL) {
259
if (TESTAFF(he->astr, achar , he->alen) &&
260
((optflags & XPRODUCT) == 0 ||
261
TESTAFF(he->astr, ep->getFlag(), he->alen))) return he;
273
Appendix: Understanding Affix Code
276
An affix is either a prefix or a suffix attached to root words to make
279
Basically a Prefix or a Suffix is a set of AffEntry objects
280
which store information about the prefix or suffix along
281
with supporting routines to check if a word has a particular
282
prefix or suffix or a combination.
284
The structure affentry is defined as follows:
288
unsigned char achar; // char used to represent the affix
289
char * strip; // string to strip before adding affix
290
char * appnd; // the affix string to add
291
short stripl; // length of the strip string
292
short appndl; // length of the affix string
293
short numconds; // the number of conditions that must be met
294
short xpflg; // flag: XPRODUCT- combine both prefix and suffix
295
char conds[SETSIZE]; // array which encodes the conditions to be met
299
Here is a suffix borrowed from the en_US.aff file. This file
300
is whitespace delimited.
304
SFX D y ied [^aeiou]y
308
This information can be interpreted as follows:
310
The first line has 4 fields
314
1 SFX - indicates this is a suffix
315
2 D - is the name of the character flag which represents this suffix
316
3 Y - indicates it can be combined with prefixes (cross product)
317
4 4 - indicates that sequence of 4 affentry structures are needed to
318
properly store the affix information
320
The remaining lines describe the unique information for the 4 SfxEntry
321
objects that make up this affix. Each line can be interpreted
322
as follows: (note fields 1 and 2 are as a check against line 1 info)
326
1 SFX - indicates this is a suffix
327
2 D - is the name of the character flag for this affix
328
3 y - the string of chars to strip off before adding affix
329
(a 0 here indicates the NULL string)
330
4 ied - the string of affix characters to add
331
5 [^aeiou]y - the conditions which must be met before the affix
334
Field 5 is interesting. Since this is a suffix, field 5 tells us that
335
there are 2 conditions that must be met. The first condition is that
336
the next to the last character in the word must *NOT* be any of the
337
following "a", "e", "i", "o" or "u". The second condition is that
338
the last character of the word must be "y".
340
So how can we encode this information concisely and be able to
341
test for both conditions in a fast manner? The answer is found
342
by studying the wonderful ispell code of Geoff Kuenning, et al.
343
(now available under a normal BSD license).
345
If we set up a conds array of 256 bytes indexed (0 to 255) and access it
346
using a character (cast to an unsigned char) of a string, we have 8 bits
347
of information we can store about that character. Specifically we
348
could use each bit to say if that character is allowed in any of the
349
last (or first for prefixes) 8 characters of the word.
351
Basically, each character at one end of the word (up to the number
352
of conditions) is used to index into the conds array and the resulting
353
value found there says whether that character is valid for a
354
specific character position in the word.
356
For prefixes, it does this by setting bit 0 if that char is valid
357
in the first position, bit 1 if valid in the second position, and so on.
359
If a bit is not set, then that char is not valid for that position in the
362
If working with suffixes bit 0 is used for the character closest
363
to the front, bit 1 for the next character towards the end, ...,
364
with bit numconds-1 representing the last char at the end of the string.
366
Note: since entries in the conds[] are 8 bits, only 8 conditions
367
(read that only 8 character positions) can be examined at one
368
end of a word (the beginning for prefixes and the end for suffixes).
370
So to make this clearer, let's encode the conds array values for the
371
first two affentries for the suffix D described earlier.
374
For the first affentry:
375
numconds = 1 (only examine the last character)
377
conds['e'] = (1 << 0) (the word must end in an E)
380
For the second affentry:
381
numconds = 2 (only examine the last two characters)
383
conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)
384
where X is all characters *but* a, e, i, o, or u
387
conds['y'] = (1 << 1) (the last char must be a y)
388
all other bits for all other entries in the conds array are zero