2
This file is part of RoboJournal.
3
Copyright (c) 2013 by Will Kraft <pwizard@gmail.com>.
6
RoboJournal is free software: you can redistribute it and/or modify
7
it under the terms of the GNU General Public License as published by
8
the Free Software Foundation, either version 3 of the License, or
9
(at your option) any later version.
11
RoboJournal is distributed in the hope that it will be useful,
12
but WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
GNU General Public License for more details.
16
You should have received a copy of the GNU General Public License
17
along with RoboJournal. If not, see <http://www.gnu.org/licenses/>.
20
/* ***** BEGIN LICENSE BLOCK *****
21
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
23
* The contents of this file are subject to the Mozilla Public License Version
24
* 1.1 (the "License"); you may not use this file except in compliance with
25
* the License. You may obtain a copy of the License at
26
* http://www.mozilla.org/MPL/
28
* Software distributed under the License is distributed on an "AS IS" basis,
29
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
30
* for the specific language governing rights and limitations under the
33
* The Original Code is Hunspell, based on MySpell.
35
* The Initial Developers of the Original Code are
36
* Kevin Hendricks (MySpell) and Laszlo Nemeth (Hunspell).
37
* Portions created by the Initial Developers are Copyright (C) 2002-2005
38
* the Initial Developers. All Rights Reserved.
64
* Alternatively, the contents of this file may be used under the terms of
65
* either the GNU General Public License Version 2 or later (the "GPL"), or
66
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
67
* in which case the provisions of the GPL or the LGPL are applicable instead
68
* of those above. If you wish to allow use of your version of this file only
69
* under the terms of either the GPL or the LGPL, and not to allow others to
70
* use your version of this file under the terms of the MPL, indicate your
71
* decision by deleting the provisions above and replace them with the notice
72
* and other provisions required by the GPL or the LGPL. If you do not delete
73
* the provisions above, a recipient may use your version of this file under
74
* the terms of any one of the MPL, the GPL or the LGPL.
76
* ***** END LICENSE BLOCK ***** */
78
#include "license.hunspell"
79
#include "license.myspell"
81
#ifndef MOZILLA_CLIENT
93
#include "ui/hunspell/affentry.hxx"
94
#include "ui/hunspell/csutil.hxx"
96
#ifndef MOZILLA_CLIENT
103
PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
105
// register affix manager
108
// set up its initial values
110
aflag = dp->aflag; // flag
111
strip = dp->strip; // string to strip
112
appnd = dp->appnd; // string to append
113
stripl = dp->stripl; // length of strip string
114
appndl = dp->appndl; // length of append string
115
numconds = dp->numconds; // length of the condition
116
opts = dp->opts; // cross product flag
117
// then copy over all of the conditions
118
if (opts & aeLONGCOND) {
119
memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
120
c.l.conds2 = dp->c.l.conds2;
121
} else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
125
morphcode = dp->morphcode;
126
contclass = dp->contclass;
127
contclasslen = dp->contclasslen;
131
PfxEntry::~PfxEntry()
134
if (appnd) free(appnd);
135
if (strip) free(strip);
139
if (opts & aeLONGCOND) free(c.l.conds2);
140
if (morphcode && !(opts & aeALIASM)) free(morphcode);
141
if (contclass && !(opts & aeALIASF)) free(contclass);
144
// add prefix to this word assuming conditions hold
145
char * PfxEntry::add(const char * word, int len)
147
char tword[MAXWORDUTF8LEN + 4];
149
if ((len > stripl) && (len >= numconds) && test_condition(word) &&
150
(!stripl || (strncmp(word, strip, stripl) == 0)) &&
151
((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
152
/* we have a match so add prefix */
158
strcpy(pp, (word + stripl));
159
return mystrdup(tword);
164
inline char * PfxEntry::nextchar(char * p) {
167
if (opts & aeLONGCOND) {
168
// jump to the 2nd part of the condition
169
if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;
170
// end of the MAXCONDLEN length condition
171
} else if (p == c.conds + MAXCONDLEN) return NULL;
176
inline int PfxEntry::test_condition(const char * st)
178
const char * pos = NULL; // group with pos input position
179
bool neg = false; // complementer
180
bool ingroup = false; // character in the group
181
if (numconds == 0) return 1;
192
case '^': { p = nextchar(p); neg = true; break; }
194
if ((neg && ingroup) || (!neg && !ingroup)) return 0;
197
// skip the next character
198
if (!ingroup) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
199
if (*st == '\0' && p && *p != '\0') return 0; // word <= condition
202
case '.': if (!pos) { // dots are not metacharacters in groups: [.]
204
// skip the next character
205
for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
206
if (*st == '\0') return 0; // word <= condition
213
if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
214
while (p && (*p & 0xc0) == 0x80) { // character
223
if (pos && st != pos) {
225
while (p && *p != ']' && (p = nextchar(p)));
229
while (p && *p != ']' && (p = nextchar(p)));
231
} else if (pos) { // group
240
// check if this prefix entry matches
241
struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
243
int tmpl; // length of tmpword
244
struct hentry * he; // hash entry of root word or NULL
245
char tmpword[MAXWORDUTF8LEN + 4];
247
// on entry prefix is 0 length or already matches the beginning of the word.
248
// So if the remaining root word has positive length
249
// and if there are enough chars in root word and added back strip chars
250
// to meet the number of characters conditions, then test it
256
// generate new root word by removing prefix and adding
257
// back any characters that would have been stripped
259
if (stripl) strcpy (tmpword, strip);
260
strcpy ((tmpword + stripl), (word + appndl));
262
// now make sure all of the conditions on characters
263
// are met. Please see the appendix at the end of
264
// this file for more info on exactly what is being
267
// if all conditions are met then check if resulting
268
// root word in the dictionary
270
if (test_condition(tmpword)) {
272
if ((he = pmyMgr->lookup(tmpword)) != NULL) {
274
if (TESTAFF(he->astr, aflag, he->alen) &&
275
// forbid single prefixes with needaffix flag
276
! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
278
((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
279
(contclass && TESTAFF(contclass, needflag, contclasslen))))
281
he = he->next_homonym; // check homonyms
285
// prefix matched but no root word was found
286
// if aeXPRODUCT is allowed, try again but now
287
// cross checked combined with a suffix
289
//if ((opts & aeXPRODUCT) && in_compound) {
290
if ((opts & aeXPRODUCT)) {
291
he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, NULL,
292
0, NULL, FLAG_NULL, needflag, in_compound);
300
// check if this prefix entry matches
301
struct hentry * PfxEntry::check_twosfx(const char * word, int len,
302
char in_compound, const FLAG needflag)
304
int tmpl; // length of tmpword
305
struct hentry * he; // hash entry of root word or NULL
306
char tmpword[MAXWORDUTF8LEN + 4];
308
// on entry prefix is 0 length or already matches the beginning of the word.
309
// So if the remaining root word has positive length
310
// and if there are enough chars in root word and added back strip chars
311
// to meet the number of characters conditions, then test it
315
if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
317
// generate new root word by removing prefix and adding
318
// back any characters that would have been stripped
320
if (stripl) strcpy (tmpword, strip);
321
strcpy ((tmpword + stripl), (word + appndl));
323
// now make sure all of the conditions on characters
324
// are met. Please see the appendix at the end of
325
// this file for more info on exactly what is being
328
// if all conditions are met then check if resulting
329
// root word in the dictionary
331
if (test_condition(tmpword)) {
334
// prefix matched but no root word was found
335
// if aeXPRODUCT is allowed, try again but now
336
// cross checked combined with a suffix
338
if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
339
he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, needflag);
347
// check if this prefix entry matches
348
char * PfxEntry::check_twosfx_morph(const char * word, int len,
349
char in_compound, const FLAG needflag)
351
int tmpl; // length of tmpword
352
char tmpword[MAXWORDUTF8LEN + 4];
354
// on entry prefix is 0 length or already matches the beginning of the word.
355
// So if the remaining root word has positive length
356
// and if there are enough chars in root word and added back strip chars
357
// to meet the number of characters conditions, then test it
361
if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
363
// generate new root word by removing prefix and adding
364
// back any characters that would have been stripped
366
if (stripl) strcpy (tmpword, strip);
367
strcpy ((tmpword + stripl), (word + appndl));
369
// now make sure all of the conditions on characters
370
// are met. Please see the appendix at the end of
371
// this file for more info on exactly what is being
374
// if all conditions are met then check if resulting
375
// root word in the dictionary
377
if (test_condition(tmpword)) {
380
// prefix matched but no root word was found
381
// if aeXPRODUCT is allowed, try again but now
382
// cross checked combined with a suffix
384
if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
385
return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
386
aeXPRODUCT, (AffEntry *)this, needflag);
393
// check if this prefix entry matches
394
char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
396
int tmpl; // length of tmpword
397
struct hentry * he; // hash entry of root word or NULL
398
char tmpword[MAXWORDUTF8LEN + 4];
399
char result[MAXLNLEN];
404
// on entry prefix is 0 length or already matches the beginning of the word.
405
// So if the remaining root word has positive length
406
// and if there are enough chars in root word and added back strip chars
407
// to meet the number of characters conditions, then test it
411
if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
413
// generate new root word by removing prefix and adding
414
// back any characters that would have been stripped
416
if (stripl) strcpy (tmpword, strip);
417
strcpy ((tmpword + stripl), (word + appndl));
419
// now make sure all of the conditions on characters
420
// are met. Please see the appendix at the end of
421
// this file for more info on exactly what is being
424
// if all conditions are met then check if resulting
425
// root word in the dictionary
427
if (test_condition(tmpword)) {
429
if ((he = pmyMgr->lookup(tmpword)) != NULL) {
431
if (TESTAFF(he->astr, aflag, he->alen) &&
432
// forbid single prefixes with needaffix flag
433
! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
435
((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
436
(contclass && TESTAFF(contclass, needflag, contclasslen)))) {
439
strcat(result, morphcode);
440
} else strcat(result,getKey());
441
if (!HENTRY_FIND(he, MORPH_STEM)) {
443
strcat(result, MORPH_STEM);
444
strcat(result, HENTRY_WORD(he));
446
// store the pointer of the hash entry
447
if (HENTRY_DATA(he)) {
449
strcat(result, HENTRY_DATA2(he));
451
// return with debug information
452
char * flag = pmyMgr->encode_flag(getFlag());
454
strcat(result, MORPH_FLAG);
455
strcat(result, flag);
458
strcat(result, "\n");
460
he = he->next_homonym;
464
// prefix matched but no root word was found
465
// if aeXPRODUCT is allowed, try again but now
466
// cross checked combined with a suffix
468
if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
469
st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this,
470
FLAG_NULL, needflag);
479
if (*result) return mystrdup(result);
483
SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
485
// register affix manager
488
// set up its initial values
489
aflag = dp->aflag; // char flag
490
strip = dp->strip; // string to strip
491
appnd = dp->appnd; // string to append
492
stripl = dp->stripl; // length of strip string
493
appndl = dp->appndl; // length of append string
494
numconds = dp->numconds; // length of the condition
495
opts = dp->opts; // cross product flag
497
// then copy over all of the conditions
498
if (opts & aeLONGCOND) {
499
memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
500
c.l.conds2 = dp->c.l.conds2;
501
} else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
503
rappnd = myrevstrdup(appnd);
504
morphcode = dp->morphcode;
505
contclass = dp->contclass;
506
contclasslen = dp->contclasslen;
510
SfxEntry::~SfxEntry()
513
if (appnd) free(appnd);
514
if (rappnd) free(rappnd);
515
if (strip) free(strip);
519
if (opts & aeLONGCOND) free(c.l.conds2);
520
if (morphcode && !(opts & aeALIASM)) free(morphcode);
521
if (contclass && !(opts & aeALIASF)) free(contclass);
524
// add suffix to this word assuming conditions hold
525
char * SfxEntry::add(const char * word, int len)
527
char tword[MAXWORDUTF8LEN + 4];
529
/* make sure all conditions match */
530
if ((len > stripl) && (len >= numconds) && test_condition(word + len, word) &&
531
(!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
532
((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
533
/* we have a match so add suffix */
536
strcpy(tword + len - stripl, appnd);
538
*(tword + len - stripl) = '\0';
540
return mystrdup(tword);
545
inline char * SfxEntry::nextchar(char * p) {
547
if (opts & aeLONGCOND) {
548
// jump to the 2nd part of the condition
549
if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;
550
// end of the MAXCONDLEN length condition
551
} else if (p == c.conds + MAXCONDLEN) return NULL;
555
inline int SfxEntry::test_condition(const char * st, const char * beg)
557
const char * pos = NULL; // group with pos input position
558
bool neg = false; // complementer
559
bool ingroup = false; // character in the group
560
if (numconds == 0) return 1;
567
case '[': { p = nextchar(p); pos = st; break; }
568
case '^': { p = nextchar(p); neg = true; break; }
569
case ']': { if (!neg && !ingroup) return 0;
571
// skip the next character
573
for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
580
if (st < beg && p && *p != '\0') return 0; // word <= condition
583
case '.': if (!pos) { // dots are not metacharacters in groups: [.]
585
// skip the next character
586
for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
587
if (st < beg) return 0; // word <= condition
588
if (*st & 0x80) { // head of the UTF-8 character
590
if (st < beg) return 0; // word <= condition
597
if ((opts & aeUTF8) && (*st & 0x80)) {
599
while (p && (st >= beg)) {
605
// first byte of the UTF-8 multibyte character
606
if ((*p & 0xc0) != 0x80) break;
610
if (pos && st != pos) {
612
else if (i == numconds) return 1;
614
while (p && *p != ']' && (p = nextchar(p)));
616
if (p && *p != '\0') p = nextchar(p);
619
else if (i == numconds) return 1;
626
if (st < beg && p && *p != '\0') return 0; // word <= condition
628
} else if (pos) { // group
637
// see if this suffix is present in the word
638
struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
639
AffEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
642
int tmpl; // length of tmpword
643
struct hentry * he; // hash entry pointer
645
char tmpword[MAXWORDUTF8LEN + 4];
646
PfxEntry* ep = (PfxEntry *) ppfx;
648
// if this suffix is being cross checked with a prefix
649
// but it does not support cross products skip it
651
if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
654
// upon entry suffix is 0 length or already matches the end of the word.
655
// So if the remaining root word has positive length
656
// and if there are enough chars in root word and added back strip chars
657
// to meet the number of characters conditions, then test it
660
// the second condition is not enough for UTF-8 strings
661
// it is checked in test_condition()
663
if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
665
// generate new root word by removing suffix and adding
666
// back any characters that would have been stripped or
667
// null terminating the shorter string
669
strcpy (tmpword, word);
670
cp = (unsigned char *)(tmpword + tmpl);
672
strcpy ((char *)cp, strip);
674
cp = (unsigned char *)(tmpword + tmpl);
677
// now make sure all of the conditions on characters
678
// are met. Please see the appendix at the end of
679
// this file for more info on exactly what is being
682
// if all conditions are met then check if resulting
683
// root word in the dictionary
685
if (test_condition((char *) cp, (char *) tmpword)) {
687
#ifdef SZOSZABLYA_POSSIBLE_ROOTS
688
fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
690
if ((he = pmyMgr->lookup(tmpword)) != NULL) {
692
// check conditional suffix (enabled by prefix)
693
if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
694
TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
695
(((optflags & aeXPRODUCT) == 0) ||
696
TESTAFF(he->astr, ep->getFlag(), he->alen) ||
698
((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
700
// handle cont. class
702
((contclass) && TESTAFF(contclass, cclass, contclasslen))
704
// check only in compound homonyms (bad flags)
705
(!badflag || !TESTAFF(he->astr, badflag, he->alen)
707
// handle required flag
709
(TESTAFF(he->astr, needflag, he->alen) ||
710
((contclass) && TESTAFF(contclass, needflag, contclasslen)))
713
he = he->next_homonym; // check homonyms
716
// obsolete stemming code (used only by the
717
// experimental SuffixMgr:suggest_pos_stems)
718
// store resulting root in wlst
719
} else if (wlst && (*ns < maxSug)) {
721
for (int k=0; k < *ns; k++)
722
if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;
724
wlst[*ns] = mystrdup(tmpword);
725
if (wlst[*ns] == NULL) {
726
for (int j=0; j<*ns; j++) free(wlst[j]);
738
// see if two-level suffix is present in the word
739
struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
740
AffEntry* ppfx, const FLAG needflag)
742
int tmpl; // length of tmpword
743
struct hentry * he; // hash entry pointer
745
char tmpword[MAXWORDUTF8LEN + 4];
746
PfxEntry* ep = (PfxEntry *) ppfx;
749
// if this suffix is being cross checked with a prefix
750
// but it does not support cross products skip it
752
if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
755
// upon entry suffix is 0 length or already matches the end of the word.
756
// So if the remaining root word has positive length
757
// and if there are enough chars in root word and added back strip chars
758
// to meet the number of characters conditions, then test it
762
if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
764
// generate new root word by removing suffix and adding
765
// back any characters that would have been stripped or
766
// null terminating the shorter string
768
strcpy (tmpword, word);
769
cp = (unsigned char *)(tmpword + tmpl);
771
strcpy ((char *)cp, strip);
773
cp = (unsigned char *)(tmpword + tmpl);
776
// now make sure all of the conditions on characters
777
// are met. Please see the appendix at the end of
778
// this file for more info on exactly what is being
781
// if all conditions are met then recall suffix_check
783
if (test_condition((char *) cp, (char *) tmpword)) {
785
// handle conditional suffix
786
if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
787
he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
789
he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
791
he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
799
// see if two-level suffix is present in the word
800
char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
801
AffEntry* ppfx, const FLAG needflag)
803
int tmpl; // length of tmpword
805
char tmpword[MAXWORDUTF8LEN + 4];
806
PfxEntry* ep = (PfxEntry *) ppfx;
809
char result[MAXLNLEN];
813
// if this suffix is being cross checked with a prefix
814
// but it does not support cross products skip it
816
if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
819
// upon entry suffix is 0 length or already matches the end of the word.
820
// So if the remaining root word has positive length
821
// and if there are enough chars in root word and added back strip chars
822
// to meet the number of characters conditions, then test it
826
if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
828
// generate new root word by removing suffix and adding
829
// back any characters that would have been stripped or
830
// null terminating the shorter string
832
strcpy (tmpword, word);
833
cp = (unsigned char *)(tmpword + tmpl);
835
strcpy ((char *)cp, strip);
837
cp = (unsigned char *)(tmpword + tmpl);
840
// now make sure all of the conditions on characters
841
// are met. Please see the appendix at the end of
842
// this file for more info on exactly what is being
845
// if all conditions are met then recall suffix_check
847
if (test_condition((char *) cp, (char *) tmpword)) {
849
// handle conditional suffix
850
if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
851
st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
853
if (((PfxEntry *) ppfx)->getMorph()) {
854
strcat(result, ((PfxEntry *) ppfx)->getMorph());
862
st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
870
st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
877
if (*result) return mystrdup(result);
883
// get next homonym with same affix
884
struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx,
885
const FLAG cclass, const FLAG needflag)
887
PfxEntry* ep = (PfxEntry *) ppfx;
888
FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
890
while (he->next_homonym) {
891
he = he->next_homonym;
892
if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
893
((optflags & aeXPRODUCT) == 0 ||
894
TESTAFF(he->astr, eFlag, he->alen) ||
895
// handle conditional suffix
896
((contclass) && TESTAFF(contclass, eFlag, contclasslen))
898
// handle cont. class
900
((contclass) && TESTAFF(contclass, cclass, contclasslen))
902
// handle required flag
904
(TESTAFF(he->astr, needflag, he->alen) ||
905
((contclass) && TESTAFF(contclass, needflag, contclasslen)))
915
Appendix: Understanding Affix Code
918
An affix is either a prefix or a suffix attached to root words to make
921
Basically a Prefix or a Suffix is a set of AffEntry objects
922
which store information about the prefix or suffix along
923
with supporting routines to check if a word has a particular
924
prefix or suffix or a combination.
926
The structure affentry is defined as follows:
930
unsigned short aflag; // ID used to represent the affix
931
char * strip; // string to strip before adding affix
932
char * appnd; // the affix string to add
933
unsigned char stripl; // length of the strip string
934
unsigned char appndl; // length of the affix string
935
char numconds; // the number of conditions that must be met
936
char opts; // flag: aeXPRODUCT- combine both prefix and suffix
937
char conds[SETSIZE]; // array which encodes the conditions to be met
941
Here is a suffix borrowed from the en_US.aff file. This file
942
is whitespace delimited.
946
SFX D y ied [^aeiou]y
950
This information can be interpreted as follows:
952
The first line has 4 fields
956
1 SFX - indicates this is a suffix
957
2 D - is the name of the character flag which represents this suffix
958
3 Y - indicates it can be combined with prefixes (cross product)
959
4 4 - indicates that a sequence of 4 affentry structures is needed to
960
properly store the affix information
962
The remaining lines describe the unique information for the 4 SfxEntry
963
objects that make up this affix. Each line can be interpreted
964
as follows: (note: fields 1 and 2 serve as a check against the line 1 info)
968
1 SFX - indicates this is a suffix
969
2 D - is the name of the character flag for this affix
970
3 y - the string of chars to strip off before adding affix
971
(a 0 here indicates the NULL string)
972
4 ied - the string of affix characters to add
973
5 [^aeiou]y - the conditions which must be met before the affix
976
Field 5 is interesting. Since this is a suffix, field 5 tells us that
977
there are 2 conditions that must be met. The first condition is that
978
the next to the last character in the word must *NOT* be any of the
979
following "a", "e", "i", "o" or "u". The second condition is that
980
the last character of the word must be "y".
982
So how can we encode this information concisely and be able to
983
test for both conditions in a fast manner? The answer is found
984
by studying the wonderful ispell code of Geoff Kuenning, et al.
985
(now available under a normal BSD license).
987
If we set up a conds array of 256 bytes indexed (0 to 255) and access it
988
using a character (cast to an unsigned char) of a string, we have 8 bits
989
of information we can store about that character. Specifically we
990
could use each bit to say if that character is allowed in any of the
991
last (or first for prefixes) 8 characters of the word.
993
Basically, each character at one end of the word (up to the number
994
of conditions) is used to index into the conds array and the resulting
995
value found there says whether that character is valid for a
996
specific character position in the word.
998
For prefixes, it does this by setting bit 0 if that char is valid
999
in the first position, bit 1 if valid in the second position, and so on.
1001
If a bit is not set, then that char is not valid for that position in the
1004
If working with suffixes bit 0 is used for the character closest
1005
to the front, bit 1 for the next character towards the end, ...,
1006
with bit numconds-1 representing the last char at the end of the string.
1008
Note: since entries in the conds[] are 8 bits, only 8 conditions
1009
(read that only 8 character positions) can be examined at one
1010
end of a word (the beginning for prefixes and the end for suffixes).
1012
So to make this clearer, let's encode the conds array values for the
1013
first two affentries for the suffix D described earlier.
1016
For the first affentry:
1017
numconds = 1 (only examine the last character)
1019
conds['e'] = (1 << 0) (the word must end in an E)
1020
all others are all 0
1022
For the second affentry:
1023
numconds = 2 (only examine the last two characters)
1025
conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)
1026
where X is all characters *but* a, e, i, o, or u
1029
conds['y'] = (1 << 1) (the last char must be a y)
1030
all other bits for all other entries in the conds array are zero