1
#include "license.hunspell"
2
#include "license.myspell"
9
#include "affentry.hxx"
12
// Build a prefix entry from the affix description `dp`.
// Every visible statement is a member-by-member copy of dp's fields; the
// strip/append strings, morphcode and contclass are taken over as pointers,
// not duplicated.
// NOTE(review): nothing in the visible lines uses `pmgr`; the statement the
// "register affix manager" comment refers to is not shown in this extract.
PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
14
// register affix manager
17
// set up its initial values
19
aflag = dp->aflag; // flag identifying this affix
20
strip = dp->strip; // string to strip (pointer copy)
21
appnd = dp->appnd; // string to append (pointer copy)
22
stripl = dp->stripl; // length of strip string
23
appndl = dp->appndl; // length of append string
24
numconds = dp->numconds; // length of the condition
25
opts = dp->opts; // option bits (tested in this file: aeXPRODUCT, aeLONGCOND, aeUTF8, aeALIASM, aeALIASF)
26
// then copy over all of the conditions
27
if (opts & aeLONGCOND) {
28
// long condition: the first MAXCONDLEN_1 bytes are copied inline and the
// pointer to the second part is taken over from dp
memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
29
c.l.conds2 = dp->c.l.conds2;
30
// short condition: copy the whole MAXCONDLEN-byte buffer
} else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
34
morphcode = dp->morphcode;
35
contclass = dp->contclass;
36
contclasslen = dp->contclasslen;
43
// Release the heap strings held by this entry.
// NOTE(review): the enclosing function header is not visible in this extract;
// by analogy with SfxEntry::~SfxEntry further down this is presumably the body
// of PfxEntry::~PfxEntry -- confirm.
if (appnd) free(appnd);
44
if (strip) free(strip);
48
// the constructor only assigns c.l.conds2 when aeLONGCOND is set
if (opts & aeLONGCOND) free(c.l.conds2);
49
// morphcode is not freed when the aeALIASM option bit is set
// (presumably the string is then owned elsewhere -- confirm)
if (morphcode && !(opts & aeALIASM)) free(morphcode);
50
// contclass is not freed when the aeALIASF option bit is set
// (presumably the array is then owned elsewhere -- confirm)
if (contclass && !(opts & aeALIASF)) free(contclass);
53
// Add this prefix to `word` (of length `len`), provided all preconditions hold.
// On a match the derived word built in tword is returned as a mystrdup() copy.
// NOTE(review): the statements that fill the start of tword and set `pp`, and
// the return for the non-matching case, are not visible in this extract.
54
char * PfxEntry::add(const char * word, int len)
56
char tword[MAXWORDUTF8LEN + 4];
58
// the word must be longer than the strip string, or be empty while the
// manager's get_fullstrip() is set ...
if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
59
// ... it must have at least numconds characters and satisfy the conditions ...
(len >= numconds) && test_condition(word) &&
60
// ... it must begin with the strip string (when there is one) ...
(!stripl || (strncmp(word, strip, stripl) == 0)) &&
61
// ... and the derived word (len + appndl - stripl) must fit into tword
((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
62
/* we have a match so add prefix */
68
// append the part of the word that follows the stripped characters
strcpy(pp, (word + stripl));
69
return mystrdup(tword);
74
// Helper used while walking the condition pattern.
// With aeLONGCOND: once p reaches c.conds + MAXCONDLEN_1 the walk continues at
// c.l.conds2. Without it: NULL is returned once p reaches c.conds + MAXCONDLEN.
// NOTE(review): the remaining statements of this function (including its
// fall-through return) are not visible in this extract.
inline char * PfxEntry::nextchar(char * p) {
77
if (opts & aeLONGCOND) {
78
// jump to the 2nd part of the condition
79
if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;
80
// end of the MAXCONDLEN length condition
81
} else if (p == c.conds + MAXCONDLEN) return NULL;
87
// Test the beginning of `st` against this entry's condition pattern.
// Returns 1 right away when the entry has no conditions; the mismatch paths
// visible here return 0.
// As far as visible here the pattern supports bracket groups terminated by
// ']', '^' to negate a group and '.' for any character outside a group.
// NOTE(review): the declaration of `p`, the switch/loop heads and several
// branches are not visible in this extract.
inline int PfxEntry::test_condition(const char * st)
89
const char * pos = NULL; // non-NULL while inside a group: the input position of the group
90
bool neg = false; // set by '^': the current group is negated
91
bool ingroup = false; // character in the group
92
if (numconds == 0) return 1;
103
case '^': { p = nextchar(p); neg = true; break; }
105
// fail if a negated group contained the character, or a plain group did not
if ((neg && ingroup) || (!neg && !ingroup)) return 0;
108
// skip the next character
109
// (with aeUTF8 also step over continuation bytes of the form 10xxxxxx)
if (!ingroup) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
110
if (*st == '\0' && p) return 0; // word <= condition
113
case '.': if (!pos) { // dots are not metacharacters in groups: [.]
115
// skip the next character
116
// (with aeUTF8 also step over continuation bytes of the form 10xxxxxx)
for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
117
if (*st == '\0' && p) return 0; // word <= condition
124
if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
125
while (p && (*p & 0xc0) == 0x80) { // character
134
if (pos && st != pos) {
136
// advance p to the closing ']' of the group (or to the end of the pattern)
while (p && *p != ']' && (p = nextchar(p)));
140
// advance p to the closing ']' of the group (or to the end of the pattern)
while (p && *p != ']' && (p = nextchar(p)));
142
} else if (pos) { // group
151
// Check whether `word` can be derived from a dictionary root with this prefix.
// The candidate root is rebuilt in tmpword (strip string followed by the part
// of `word` after its first appndl bytes), tested against the conditions and
// looked up through the affix manager. When opts has aeXPRODUCT the candidate
// is additionally passed to the manager's suffix_check().
// NOTE(review): the computation of tmpl, the return statements and several
// closing braces are not visible in this extract.
// NOTE(review): both strcpy() calls write into the fixed-size tmpword and no
// length check is visible here -- confirm that callers bound `len`.
152
struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
154
int tmpl; // length of tmpword
155
struct hentry * he; // hash entry of root word or NULL
156
char tmpword[MAXWORDUTF8LEN + 4];
158
// on entry prefix is 0 length or already matches the beginning of the word.
159
// So if the remaining root word has positive length
160
// and if there are enough chars in root word and added back strip chars
161
// to meet the number of characters conditions, then test it
165
if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
167
// generate new root word by removing prefix and adding
168
// back any characters that would have been stripped
170
if (stripl) strcpy (tmpword, strip);
171
strcpy ((tmpword + stripl), (word + appndl));
173
// now make sure all of the conditions on characters
174
// are met. Please see the appendix at the end of
175
// this file for more info on exactly what is being tested
178
// if all conditions are met then check if resulting
179
// root word is in the dictionary
181
if (test_condition(tmpword)) {
183
if ((he = pmyMgr->lookup(tmpword)) != NULL) {
185
// the root must carry this affix flag ...
if (TESTAFF(he->astr, aflag, he->alen) &&
186
// forbid single prefixes with needaffix flag
187
! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
189
// ... and needflag, when given, must be on the root or in contclass
((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
190
(contclass && TESTAFF(contclass, needflag, contclasslen))))
192
he = he->next_homonym; // check homonyms
196
// prefix matched but no root word was found
197
// if aeXPRODUCT is allowed, try again but now
198
// cross checked combined with a suffix
200
//if ((opts & aeXPRODUCT) && in_compound) {
201
if ((opts & aeXPRODUCT)) {
202
he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL,
203
0, NULL, FLAG_NULL, needflag, in_compound);
211
// Check this prefix in combination with two-level suffixes.
// The candidate root is rebuilt in tmpword exactly as in checkword(); when the
// conditions match, opts has aeXPRODUCT and in_compound is not IN_CPD_BEGIN,
// the candidate is handed to the manager's suffix_check_twosfx().
// NOTE(review): the computation of tmpl, the return statements and several
// closing braces are not visible in this extract.
// NOTE(review): both strcpy() calls write into the fixed-size tmpword and no
// length check is visible here -- confirm that callers bound `len`.
212
struct hentry * PfxEntry::check_twosfx(const char * word, int len,
213
char in_compound, const FLAG needflag)
215
int tmpl; // length of tmpword
216
struct hentry * he; // hash entry of root word or NULL
217
char tmpword[MAXWORDUTF8LEN + 4];
219
// on entry prefix is 0 length or already matches the beginning of the word.
220
// So if the remaining root word has positive length
221
// and if there are enough chars in root word and added back strip chars
222
// to meet the number of characters conditions, then test it
226
if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
227
(tmpl + stripl >= numconds)) {
229
// generate new root word by removing prefix and adding
230
// back any characters that would have been stripped
232
if (stripl) strcpy (tmpword, strip);
233
strcpy ((tmpword + stripl), (word + appndl));
235
// now make sure all of the conditions on characters
236
// are met. Please see the appendix at the end of
237
// this file for more info on exactly what is being tested
240
// if all conditions are met then check if resulting
241
// root word is in the dictionary
243
if (test_condition(tmpword)) {
246
// prefix matched but no root word was found
247
// if aeXPRODUCT is allowed, try again but now
248
// cross checked combined with a suffix
250
if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
251
he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this, needflag);
259
// Variant of check_twosfx() that returns a string instead of a hash entry.
// The candidate root is rebuilt in tmpword; when the conditions match, opts
// has aeXPRODUCT and in_compound is not IN_CPD_BEGIN, the result of the
// manager's suffix_check_twosfx_morph() is returned directly.
// NOTE(review): the computation of tmpl, the return for the non-matching case
// and several closing braces are not visible in this extract.
// NOTE(review): both strcpy() calls write into the fixed-size tmpword and no
// length check is visible here -- confirm that callers bound `len`.
260
char * PfxEntry::check_twosfx_morph(const char * word, int len,
261
char in_compound, const FLAG needflag)
263
int tmpl; // length of tmpword
264
char tmpword[MAXWORDUTF8LEN + 4];
266
// on entry prefix is 0 length or already matches the beginning of the word.
267
// So if the remaining root word has positive length
268
// and if there are enough chars in root word and added back strip chars
269
// to meet the number of characters conditions, then test it
273
if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
274
(tmpl + stripl >= numconds)) {
276
// generate new root word by removing prefix and adding
277
// back any characters that would have been stripped
279
if (stripl) strcpy (tmpword, strip);
280
strcpy ((tmpword + stripl), (word + appndl));
282
// now make sure all of the conditions on characters
283
// are met. Please see the appendix at the end of
284
// this file for more info on exactly what is being tested
287
// if all conditions are met then check if resulting
288
// root word is in the dictionary
290
if (test_condition(tmpword)) {
293
// prefix matched but no root word was found
294
// if aeXPRODUCT is allowed, try again but now
295
// cross checked combined with a suffix
297
if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
298
return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
299
aeXPRODUCT, this, needflag);
306
// Variant of checkword() that collects textual output in `result`.
// For each dictionary entry found for the candidate root that passes the flag
// tests, pieces are appended to `result` with mystrcat() (each call bounded by
// MAXLNLEN), ending with a newline. When opts has aeXPRODUCT and in_compound
// is not IN_CPD_BEGIN, the output of the manager's suffix_check_morph() is
// appended as well. A non-empty result is returned as a mystrdup() copy.
// NOTE(review): the computation of tmpl, the initialisation of `result`, the
// declaration of `st`, the return for an empty result and several braces are
// not visible in this extract.
// NOTE(review): both strcpy() calls write into the fixed-size tmpword and no
// length check is visible here -- confirm that callers bound `len`.
307
char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
309
int tmpl; // length of tmpword
310
struct hentry * he; // hash entry of root word or NULL
311
char tmpword[MAXWORDUTF8LEN + 4];
312
char result[MAXLNLEN];
317
// on entry prefix is 0 length or already matches the beginning of the word.
318
// So if the remaining root word has positive length
319
// and if there are enough chars in root word and added back strip chars
320
// to meet the number of characters conditions, then test it
324
if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
325
(tmpl + stripl >= numconds)) {
327
// generate new root word by removing prefix and adding
328
// back any characters that would have been stripped
330
if (stripl) strcpy (tmpword, strip);
331
strcpy ((tmpword + stripl), (word + appndl));
333
// now make sure all of the conditions on characters
334
// are met. Please see the appendix at the end of
335
// this file for more info on exactly what is being tested
338
// if all conditions are met then check if resulting
339
// root word is in the dictionary
341
if (test_condition(tmpword)) {
343
if ((he = pmyMgr->lookup(tmpword)) != NULL) {
345
// the root must carry this affix flag ...
if (TESTAFF(he->astr, aflag, he->alen) &&
346
// forbid single prefixes with needaffix flag
347
! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
349
// ... and needflag, when given, must be on the root or in contclass
((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
350
(contclass && TESTAFF(contclass, needflag, contclasslen)))) {
352
mystrcat(result, " ", MAXLNLEN);
353
mystrcat(result, morphcode, MAXLNLEN);
354
} else mystrcat(result,getKey(), MAXLNLEN);
355
// no MORPH_STEM field found for the entry: append one built from the entry's word
if (!HENTRY_FIND(he, MORPH_STEM)) {
356
mystrcat(result, " ", MAXLNLEN);
357
mystrcat(result, MORPH_STEM, MAXLNLEN);
358
mystrcat(result, HENTRY_WORD(he), MAXLNLEN);
360
// append the entry's HENTRY_DATA2() string when HENTRY_DATA() is non-null
361
if (HENTRY_DATA(he)) {
362
mystrcat(result, " ", MAXLNLEN);
363
mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);
365
// return with debug information
366
char * flag = pmyMgr->encode_flag(getFlag());
367
mystrcat(result, " ", MAXLNLEN);
368
mystrcat(result, MORPH_FLAG, MAXLNLEN);
369
mystrcat(result, flag, MAXLNLEN);
372
mystrcat(result, "\n", MAXLNLEN);
374
he = he->next_homonym;
378
// prefix matched but no root word was found
379
// if aeXPRODUCT is allowed, try again but now
380
// cross checked combined with a suffix
382
if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
383
st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this,
384
FLAG_NULL, needflag);
386
mystrcat(result, st, MAXLNLEN);
393
// hand back a heap copy only when something was collected
if (*result) return mystrdup(result);
397
// Build a suffix entry from the affix description `dp`.
// Every visible statement copies one of dp's fields; the strip/append strings,
// morphcode and contclass are taken over as pointers. In addition rappnd is
// created from appnd with myrevstrdup().
// NOTE(review): nothing in the visible lines uses `pmgr`; the statement the
// "register affix manager" comment refers to is not shown in this extract.
SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
399
// register affix manager
402
// set up its initial values
403
aflag = dp->aflag; // flag identifying this affix
404
strip = dp->strip; // string to strip (pointer copy)
405
appnd = dp->appnd; // string to append (pointer copy)
406
stripl = dp->stripl; // length of strip string
407
appndl = dp->appndl; // length of append string
408
numconds = dp->numconds; // length of the condition
409
opts = dp->opts; // option bits (tested in this file: aeXPRODUCT, aeLONGCOND, aeUTF8, aeALIASM, aeALIASF)
411
// then copy over all of the conditions
412
if (opts & aeLONGCOND) {
413
// long condition: the first MAXCONDLEN_1 bytes are copied inline and the
// pointer to the second part is taken over from dp
memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
414
c.l.conds2 = dp->c.l.conds2;
415
// short condition: copy the whole MAXCONDLEN-byte buffer
} else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
417
// separately allocated string derived from appnd (freed in the destructor);
// presumably the reversed append string, judging by the helper's name -- confirm
rappnd = myrevstrdup(appnd);
418
morphcode = dp->morphcode;
419
contclass = dp->contclass;
420
contclasslen = dp->contclasslen;
424
// Release the heap strings held by this entry: the append string, the string
// created by myrevstrdup() in the constructor, the strip string, the second
// part of a long condition, and (unless the alias option bits are set)
// morphcode and contclass.
SfxEntry::~SfxEntry()
427
if (appnd) free(appnd);
428
if (rappnd) free(rappnd);
429
if (strip) free(strip);
433
// the constructor only assigns c.l.conds2 when aeLONGCOND is set
if (opts & aeLONGCOND) free(c.l.conds2);
434
// morphcode is not freed when the aeALIASM option bit is set
// (presumably the string is then owned elsewhere -- confirm)
if (morphcode && !(opts & aeALIASM)) free(morphcode);
435
// contclass is not freed when the aeALIASF option bit is set
// (presumably the array is then owned elsewhere -- confirm)
if (contclass && !(opts & aeALIASF)) free(contclass);
438
// Add this suffix to `word` (of length `len`), provided all preconditions hold.
// On a match the derived word built in tword is returned as a mystrdup() copy.
// NOTE(review): the statement that copies `word` into tword, the branch that
// chooses between the two writes at tword + len - stripl, and the return for
// the non-matching case are not visible in this extract.
439
char * SfxEntry::add(const char * word, int len)
441
char tword[MAXWORDUTF8LEN + 4];
443
/* make sure all conditions match */
444
// the word must be longer than the strip string, or be empty while the
// manager's get_fullstrip() is set ...
if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
445
// ... it must have at least numconds characters and satisfy the conditions
// (the test is given the end of the word and its beginning) ...
(len >= numconds) && test_condition(word + len, word) &&
446
// ... it must end with the strip string (when there is one) ...
(!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
447
// ... and the derived word (len + appndl - stripl) must fit into tword
((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
448
/* we have a match so add suffix */
451
// overwrite the stripped tail with the append string
strcpy(tword + len - stripl, appnd);
453
// cut the word at the point where the strip string began
*(tword + len - stripl) = '\0';
455
return mystrdup(tword);
460
// Helper used while walking the condition pattern.
// With aeLONGCOND: once p reaches c.l.conds1 + MAXCONDLEN_1 the walk continues
// at c.l.conds2. Without it: NULL is returned once p reaches
// c.conds + MAXCONDLEN. Otherwise p is returned, or NULL when it points at a
// NUL byte.
// NOTE(review): some statements of this function (and its closing braces) are
// not visible in this extract.
inline char * SfxEntry::nextchar(char * p) {
463
if (opts & aeLONGCOND) {
464
// jump to the 2nd part of the condition
465
if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;
466
// end of the MAXCONDLEN length condition
467
} else if (p == c.conds + MAXCONDLEN) return NULL;
468
// a NUL byte marks the end of the pattern
return *p ? p : NULL;
473
// Test the end of a word against this entry's condition pattern.
// `st` is moved backwards (st--) through the word and `beg` is the lower bound
// it is compared against. Returns 1 right away when the entry has no
// conditions; the visible paths return 0 on a mismatch and 1 when the pattern
// is exhausted or all numconds conditions have been matched.
// The pattern supports '[' ... ']' groups, '^' to negate a group and '.' for
// any character outside a group.
// NOTE(review): the declarations of `p` and `i`, the switch/loop heads and
// several branches are not visible in this extract.
inline int SfxEntry::test_condition(const char * st, const char * beg)
475
const char * pos = NULL; // non-NULL while inside a group: the input position of the group
476
bool neg = false; // set by '^': the current group is negated
477
bool ingroup = false; // character in the group
478
if (numconds == 0) return 1;
485
// start of a group: remember the input position it applies to
case '[': { p = nextchar(p); pos = st; break; }
486
case '^': { p = nextchar(p); neg = true; break; }
487
// end of a plain (non-negated) group that did not contain the character: mismatch
case ']': { if (!neg && !ingroup) return 0;
489
// skip the next character
491
// (with aeUTF8 step back over continuation bytes of the form 10xxxxxx)
for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
498
if (st < beg && p) return 0; // word <= condition
501
case '.': if (!pos) { // dots are not metacharacters in groups: [.]
503
// skip the next character
504
// (with aeUTF8 also step back over continuation bytes of the form 10xxxxxx)
for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
505
if (st < beg) { // word <= condition
506
// ran out of word: match only if the pattern is exhausted too
if (p) return 0; else return 1;
508
if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
510
if (st < beg) { // word <= condition
511
// ran out of word: match only if the pattern is exhausted too
if (p) return 0; else return 1;
519
if ((opts & aeUTF8) && (*st & 0x80)) {
521
while (p && (st >= beg)) {
527
// first byte of the UTF-8 multibyte character
528
if ((*p & 0xc0) != 0x80) break;
532
if (pos && st != pos) {
534
else if (i == numconds) return 1;
536
// advance p to the closing ']' of the group (or to the end of the pattern)
while (p && *p != ']' && (p = nextchar(p)));
539
if (p && *p != ']') p = nextchar(p);
542
else if (i == numconds) return 1;
544
// advance p to the closing ']' of the group (or to the end of the pattern)
while (p && *p != ']' && (p = nextchar(p)));
545
// if (p && *p != ']') p = nextchar(p);
552
if (st < beg && p && *p != ']') return 0; // word <= condition
553
} else if (pos) { // group
562
// Check whether `word` can be derived from a dictionary root with this suffix.
// The candidate root is rebuilt in tmpword (a copy of `word` with the strip
// string written at offset tmpl), tested against the conditions and looked up
// through the affix manager; each homonym is then run through the flag tests.
// When no entry is found and `wlst` is given, the candidate root is stored in
// wlst (at most maxSug entries, counted by *ns).
// NOTE(review): the rest of the parameter list, the declarations of `cp`, `ep`
// and `cwrd`, the computation of tmpl, the operators joining the flag tests,
// the return statements and the #endif are not visible in this extract.
// NOTE(review): strcpy() copies `word` into the fixed-size tmpword and no
// length check is visible here -- confirm that callers bound `len`.
563
struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
564
PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
567
int tmpl; // length of tmpword
568
struct hentry * he; // hash entry pointer
570
char tmpword[MAXWORDUTF8LEN + 4];
573
// if this suffix is being cross checked with a prefix
574
// but it does not support cross products skip it
576
if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
579
// upon entry suffix is 0 length or already matches the end of the word.
580
// So if the remaining root word has positive length
581
// and if there are enough chars in root word and added back strip chars
582
// to meet the number of characters conditions, then test it
585
// the second condition is not enough for UTF-8 strings;
586
// it is checked in test_condition()
588
if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
589
(tmpl + stripl >= numconds)) {
591
// generate new root word by removing suffix and adding
592
// back any characters that would have been stripped
593
// or null terminating the shorter string
595
strcpy (tmpword, word);
596
cp = (unsigned char *)(tmpword + tmpl);
598
strcpy ((char *)cp, strip);
600
cp = (unsigned char *)(tmpword + tmpl);
603
// now make sure all of the conditions on characters
604
// are met. Please see the appendix at the end of
605
// this file for more info on exactly what is being tested
608
// if all conditions are met then check if resulting
609
// root word is in the dictionary
611
if (test_condition((char *) cp, (char *) tmpword)) {
613
// optional trace output of each candidate root
#ifdef SZOSZABLYA_POSSIBLE_ROOTS
614
fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
616
if ((he = pmyMgr->lookup(tmpword)) != NULL) {
618
// check conditional suffix (enabled by prefix)
619
if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
620
TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
621
(((optflags & aeXPRODUCT) == 0) ||
622
(ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
624
((contclass) && (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))
626
// handle cont. class
628
((contclass) && TESTAFF(contclass, cclass, contclasslen))
630
// check only in compound homonyms (bad flags)
631
(!badflag || !TESTAFF(he->astr, badflag, he->alen)
633
// handle required flag
635
(TESTAFF(he->astr, needflag, he->alen) ||
636
((contclass) && TESTAFF(contclass, needflag, contclasslen)))
639
he = he->next_homonym; // check homonyms
642
// obsolete stemming code (used only by the
643
// experimental SuffixMgr:suggest_pos_stems)
644
// store resulting root in wlst
645
} else if (wlst && (*ns < maxSug)) {
647
// clear cwrd when the root is already present in wlst
for (int k=0; k < *ns; k++)
648
if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;
650
wlst[*ns] = mystrdup(tmpword);
651
// the copy failed: release every entry stored so far
if (wlst[*ns] == NULL) {
652
for (int j=0; j<*ns; j++) free(wlst[j]);
664
// Check whether `word` ends in this suffix applied on top of another suffix.
// The candidate (a copy of `word` with the strip string written at offset
// tmpl) is rebuilt in tmpword and tested against the conditions; on success
// the manager's suffix_check() is called on the candidate with this entry's
// aflag passed as the flag argument.
// NOTE(review): the declarations of `cp` and `ep`, the computation of tmpl,
// the if/else structure choosing between the three suffix_check() calls and
// the return statements are not visible in this extract.
// NOTE(review): strcpy() copies `word` into the fixed-size tmpword and no
// length check is visible here -- confirm that callers bound `len`.
665
struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
666
PfxEntry* ppfx, const FLAG needflag)
668
int tmpl; // length of tmpword
669
struct hentry * he; // hash entry pointer
671
char tmpword[MAXWORDUTF8LEN + 4];
675
// if this suffix is being cross checked with a prefix
676
// but it does not support cross products skip it
678
if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
681
// upon entry suffix is 0 length or already matches the end of the word.
682
// So if the remaining root word has positive length
683
// and if there are enough chars in root word and added back strip chars
684
// to meet the number of characters conditions, then test it
688
if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
689
(tmpl + stripl >= numconds)) {
691
// generate new root word by removing suffix and adding
692
// back any characters that would have been stripped
693
// or null terminating the shorter string
695
strcpy (tmpword, word);
696
cp = (unsigned char *)(tmpword + tmpl);
698
strcpy ((char *)cp, strip);
700
cp = (unsigned char *)(tmpword + tmpl);
703
// now make sure all of the conditions on characters
704
// are met. Please see the appendix at the end of
705
// this file for more info on exactly what is being tested
708
// if all conditions are met then recall suffix_check
710
if (test_condition((char *) cp, (char *) tmpword)) {
712
// handle conditional suffix
713
// this suffix's contclass contains the prefix's flag: check without the prefix
if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
714
he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
716
// pass the caller's optflags and prefix through
he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
718
// check without a prefix
he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
726
// Variant of check_twosfx() that collects textual output in `result`.
// The candidate is rebuilt in tmpword and tested against the conditions; the
// output of the manager's suffix_check_morph() is appended to `result` with
// mystrcat() (each call bounded by MAXLNLEN), preceded in the conditional
// suffix case by the prefix's getMorph() string and a space. A non-empty
// result is returned as a mystrdup() copy.
// NOTE(review): the declarations of `cp`, `ep` and `st`, the computation of
// tmpl, the initialisation of `result`, the if/else structure choosing
// between the three suffix_check_morph() calls and the return for an empty
// result are not visible in this extract.
// NOTE(review): strcpy() copies `word` into the fixed-size tmpword and no
// length check is visible here -- confirm that callers bound `len`.
727
char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
728
PfxEntry* ppfx, const FLAG needflag)
730
int tmpl; // length of tmpword
732
char tmpword[MAXWORDUTF8LEN + 4];
736
char result[MAXLNLEN];
740
// if this suffix is being cross checked with a prefix
741
// but it does not support cross products skip it
743
if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
746
// upon entry suffix is 0 length or already matches the end of the word.
747
// So if the remaining root word has positive length
748
// and if there are enough chars in root word and added back strip chars
749
// to meet the number of characters conditions, then test it
753
if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
754
(tmpl + stripl >= numconds)) {
756
// generate new root word by removing suffix and adding
757
// back any characters that would have been stripped
758
// or null terminating the shorter string
760
strcpy (tmpword, word);
761
cp = (unsigned char *)(tmpword + tmpl);
763
strcpy ((char *)cp, strip);
765
cp = (unsigned char *)(tmpword + tmpl);
768
// now make sure all of the conditions on characters
769
// are met. Please see the appendix at the end of
770
// this file for more info on exactly what is being tested
773
// if all conditions are met then recall suffix_check
775
if (test_condition((char *) cp, (char *) tmpword)) {
777
// handle conditional suffix
778
// this suffix's contclass contains the prefix's flag: check without the prefix
if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
779
st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
781
// put the prefix's own morph string, when it has one, in front of st
if (ppfx->getMorph()) {
782
mystrcat(result, ppfx->getMorph(), MAXLNLEN);
783
mystrcat(result, " ", MAXLNLEN);
785
mystrcat(result,st, MAXLNLEN);
790
// pass the caller's optflags and prefix through
st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
792
mystrcat(result, st, MAXLNLEN);
798
// check without a prefix
st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
800
mystrcat(result, st, MAXLNLEN);
805
// hand back a heap copy only when something was collected
if (*result) return mystrdup(result);
811
// Walk the next_homonym chain starting after `he` and test each following
// entry against the same flag conditions used in checkword(): this suffix's
// flag (on the entry or in the prefix's continuation class), the cross-product
// requirement, the continuation class `cclass` and the required flag
// `needflag`.
// NOTE(review): the declaration of `ep`, the operators joining the individual
// tests and the return statements are not visible in this extract.
812
struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, PfxEntry* ppfx,
813
const FLAG cclass, const FLAG needflag)
816
// flag of the prefix being cross-checked, or FLAG_NULL when there is none
FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
818
while (he->next_homonym) {
819
he = he->next_homonym;
820
if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
821
((optflags & aeXPRODUCT) == 0 ||
822
TESTAFF(he->astr, eFlag, he->alen) ||
823
// handle conditional suffix
824
((contclass) && TESTAFF(contclass, eFlag, contclasslen))
826
// handle cont. class
828
((contclass) && TESTAFF(contclass, cclass, contclasslen))
830
// handle required flag
832
(TESTAFF(he->astr, needflag, he->alen) ||
833
((contclass) && TESTAFF(contclass, needflag, contclasslen)))
843
Appendix: Understanding Affix Code
846
An affix is either a prefix or a suffix attached to root words to make
849
Basically a Prefix or a Suffix is a set of AffEntry objects
850
which store information about the prefix or suffix along
851
with supporting routines to check if a word has a particular
852
prefix or suffix or a combination.
854
The structure affentry is defined as follows:
858
unsigned short aflag; // ID used to represent the affix
859
char * strip; // string to strip before adding affix
860
char * appnd; // the affix string to add
861
unsigned char stripl; // length of the strip string
862
unsigned char appndl; // length of the affix string
863
char numconds; // the number of conditions that must be met
864
char opts; // flag: aeXPRODUCT- combine both prefix and suffix
865
char conds[SETSIZE]; // array which encodes the conditions to be met
869
Here is a suffix borrowed from the en_US.aff file. This file
870
is whitespace delimited.
874
SFX D y ied [^aeiou]y
878
This information can be interpreted as follows:
880
The first line has 4 fields
884
1 SFX - indicates this is a suffix
885
2 D - is the name of the character flag which represents this suffix
886
3 Y - indicates it can be combined with prefixes (cross product)
887
4 4 - indicates that sequence of 4 affentry structures are needed to
888
properly store the affix information
890
The remaining lines describe the unique information for the 4 SfxEntry
891
objects that make up this affix. Each line can be interpreted
892
as follows: (note: fields 1 and 2 serve as a check against the line 1 info)
896
1 SFX - indicates this is a suffix
897
2 D - is the name of the character flag for this affix
898
3 y - the string of chars to strip off before adding affix
899
(a 0 here indicates the NULL string)
900
4 ied - the string of affix characters to add
901
5 [^aeiou]y - the conditions which must be met before the affix
904
Field 5 is interesting. Since this is a suffix, field 5 tells us that
905
there are 2 conditions that must be met. The first condition is that
906
the next to the last character in the word must *NOT* be any of the
907
following "a", "e", "i", "o" or "u". The second condition is that
908
the last character of the word must be "y".
910
So how can we encode this information concisely and be able to
911
test for both conditions in a fast manner? The answer is found
912
by studying the wonderful ispell code of Geoff Kuenning, et al.
913
(now available under a normal BSD license).
915
If we set up a conds array of 256 bytes indexed (0 to 255) and access it
916
using a character (cast to an unsigned char) of a string, we have 8 bits
917
of information we can store about that character. Specifically we
918
could use each bit to say if that character is allowed in any of the
919
last (or first for prefixes) 8 characters of the word.
921
Basically, each character at one end of the word (up to the number
922
of conditions) is used to index into the conds array and the resulting
923
value found there says whether that character is valid for a
924
specific character position in the word.
926
For prefixes, it does this by setting bit 0 if that char is valid
927
in the first position, bit 1 if valid in the second position, and so on.
929
If a bit is not set, then that char is not valid for that position in the
932
If working with suffixes bit 0 is used for the character closest
933
to the front, bit 1 for the next character towards the end, ...,
934
with bit numconds-1 representing the last char at the end of the string.
936
Note: since entries in the conds[] are 8 bits, only 8 conditions
937
(read that only 8 character positions) can be examined at one
938
end of a word (the beginning for prefixes and the end for suffixes).
940
So to make this clearer, let's encode the conds array values for the
941
first two affentries for the suffix D described earlier.
944
For the first affentry:
945
numconds = 1 (only examine the last character)
947
conds['e'] = (1 << 0) (the word must end in an E)
950
For the second affentry:
951
numconds = 2 (only examine the last two characters)
953
conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)
954
where X is all characters *but* a, e, i, o, or u
957
conds['y'] = (1 << 1) (the last char must be a y)
958
all other bits for all other entries in the conds array are zero