1
/* Munch a word list and generate a smaller root word list with affixes*/
19
/*
 * Program entry point for the munch tool.
 *
 * From the lines visible here the flow is:
 *   1. copy argv[1] (word list path) and argv[2] (affix file path);
 *   2. open the affix file and parse it with parse_aff_file();
 *   3. open the word list and hash it with load_tables();
 *   4. for every hashed word call aff_chk() to collect candidate roots,
 *      expand each candidate with expand_rootword() and count how many
 *      of the generated words lookup() finds;
 *   5. print the number of entries with keep > 0, then word lines
 *      ("word/affstr" when an affix string is attached, else "word").
 *
 * NOTE(review): this listing has bare integers between statements and is
 * missing braces, declarations and several conditions -- it looks like a
 * lossy extract with source line numbers left in.  Confirm the control
 * flow against the real file before relying on the summary above.
 */
int main(int argc, char** argv)
29
char *nword, *wf, *af;
30
/* affix-flag string handed to expand_rootword(); presumably written via ap */
char as[(MAX_PREFIXES + MAX_SUFFIXES)];
38
/* first parse the command line options */
39
/* arg1 - wordlist, arg2 - affix file */
42
wf = mystrdup(argv[1]);
44
/* usage text on stderr (the test guarding it is not visible here) */
fprintf(stderr,"correct syntax is:\n");
45
fprintf(stderr,"munch word_list_file affix_file\n");
49
af = mystrdup(argv[2]);
51
fprintf(stderr,"correct syntax is:\n");
52
fprintf(stderr,"munch word_list_file affix_file\n");
56
/* open the affix file */
57
afflst = fopen(af,"r");
59
fprintf(stderr,"Error - could not open affix description file\n");
63
/* step one is to parse the affix file building up the internal
64
affix data structures */
69
parse_aff_file(afflst);
72
fprintf(stderr,"parsed in %d prefixes and %d suffixes\n",numpfx,numsfx);
74
/* affix file is now parsed so create hash table of wordlist on the fly */
76
/* open the wordlist */
77
wrdlst = fopen(wf,"r");
79
fprintf(stderr,"Error - could not open word list file\n");
83
/* load_tables() returns non-zero on failure */
if (load_tables(wrdlst)) {
84
fprintf(stderr,"Error building hash tables\n");
89
/* walk every bucket of the hash table and every entry chained from it */
for (i=0; i< tablesize; i++) {
91
/* nothing stored in this entry: move on to the next bucket */
if (ep->word == NULL) continue;
92
for ( ; ep != NULL; ep = ep->next) {
94
/* collect candidate roots for this word (recorded in roots[]) */
aff_chk(ep->word,strlen(ep->word));
96
/* now there might be a number of combinations */
97
/* of prefixes and suffixes that might match this */
98
/* word. So how to choose? As a first shot look */
99
/* for the shortest remaining root word to */
100
/* maximize the combinatorial power */
102
/* but be careful, do not REQUIRE a specific combination */
103
/* of a prefix and a suffix to generate the word since */
104
/* that violates the rule that the root word with just */
105
/* the prefix or just the suffix must also exist in the */
106
/* wordlist as well */
108
/* in fact because of the cross product issue, this is not a */
109
/* simple choice since some combinations of previous */
110
/* prefixes and new suffixes may not be valid. */
111
/* The only way to know is to simply try them all */
116
for (j = 0; j < numroots; j++){
118
/* first collect the root word info and build up */
119
/* the potential new affix string */
120
nword = (roots[j].hashent)->word;
125
/* append the flag characters of the matched prefix and suffix, if any */
if (roots[j].prefix) *ap++ = (roots[j].prefix)->achar;
126
if (roots[j].suffix) *ap++ = (roots[j].suffix)->achar;
127
/* then the flags the root entry already carries */
if ((roots[j].hashent)->affstr) {
128
strcpy(ap,(roots[j].hashent)->affstr);
134
/* now expand the potential affix string to generate */
135
/* all legal words and make sure they all exist in the */
138
/* store a copy of the root word itself in wlist */
wlist[numwords].word = mystrdup(nword);
139
wlist[numwords].pallow = 0;
143
expand_rootword(nword,nwl,as,al);
144
/* count how many generated words are present in the hash table */
for (k=0; k<numwords; k++) {
145
if (lookup(wlist[k].word)) n++;
147
wlist[k].word = NULL;
151
/* if all exist in word list then okay */
160
/* attach the prefix/suffix flag characters of roots[p] to its hash entry */
ep1 = roots[p].hashent;
161
pfxp = roots[p].prefix;
162
sfxp = roots[p].suffix;
164
if (pfxp != NULL) add_affix_char(ep1,pfxp->achar);
165
if (sfxp != NULL) add_affix_char(ep1,sfxp->achar);
175
/* now output only the words to keep along with affixes info */
176
/* first count how many words that is */
178
for (i=0; i< tablesize; i++) {
180
if (ep->word == NULL) continue;
181
for ( ; ep != NULL; ep = ep->next) {
182
if (ep->keep > 0) k++;
185
/* first output line is the count */
fprintf(stdout,"%d\n",k);
187
for (i=0; i< tablesize; i++) {
189
if (ep->word == NULL) continue;
190
for ( ; ep != NULL; ep = ep->next) {
192
/* "word/flags" when flags were attached, otherwise just the word */
if (ep->affstr != NULL) {
193
fprintf(stdout,"%s/%s\n",ep->word,ep->affstr);
195
fprintf(stdout,"%s\n",ep->word);
204
/*
 * Read the affix description from afflst and fill ptable/stable.
 *
 * Each line is read with fgets().  A line starting with "PFX" or "SFX"
 * selects the table type ('P' or 'S').  Its space-separated fields give
 * the flag character, the cross-product flag ('Y' sets XPRODUCT) and the
 * number of entries, for which an array of struct affent is malloc'ed.
 * The following numents lines are then read, one per entry: the strip
 * string, the append string (a literal "0" is replaced by an empty
 * string) and the condition string, which is handed to encodeit().
 * The finished array and its length are stored in ptable[numpfx] or
 * stable[numsfx].
 *
 * NOTE(review): no check of the malloc() results or of the inner fgets()
 * return value is visible in this listing -- confirm in the full file.
 */
void parse_aff_file(FILE * afflst)
211
struct affent * ptr= NULL;
212
struct affent * nptr= NULL;
213
/* line buffer reused for every fgets() call below */
char * line = malloc(MAX_LN_LEN);
215
while (fgets(line,MAX_LN_LEN,afflst)) {
218
fprintf(stderr,"parsing line: %s\n",line);
219
if (strncmp(line,"PFX",3) == 0) ft = 'P';
220
if (strncmp(line,"SFX",3) == 0) ft = 'S';
226
/* split the header line on single spaces; empty pieces are skipped */
while ((piece=mystrsep(&tp,' '))) {
227
if (*piece != '\0') {
230
/* field 1: affix flag character */
case 1: { achar = *piece; break; }
231
/* field 2: 'Y' enables cross product */
case 2: { if (*piece == 'Y') ff = XPRODUCT; break; }
232
/* field 3: number of entries that follow */
case 3: { numents = atoi(piece);
233
ptr = malloc(numents * sizeof(struct affent));
236
fprintf(stderr,"parsing %c entries %d\n",achar,numents);
245
/* now parse all of the sub entries*/
247
for (j=0; j < numents; j++) {
248
fgets(line,MAX_LN_LEN,afflst);
252
while ((piece=mystrsep(&tp,' '))) {
253
if (*piece != '\0') {
255
/* entries after the first inherit flag char and xproduct flag from ptr */
case 0: { if (nptr != ptr) {
256
nptr->achar = ptr->achar;
257
nptr->xpflg = ptr->xpflg;
262
/* characters to strip; "0" means strip nothing */
case 2: { nptr->strip = mystrdup(piece);
263
nptr->stripl = strlen(nptr->strip);
264
if (strcmp(nptr->strip,"0") == 0) {
266
nptr->strip=mystrdup("");
271
/* characters to append; "0" means append nothing */
case 3: { nptr->appnd = mystrdup(piece);
272
nptr->appndl = strlen(nptr->appnd);
273
if (strcmp(nptr->appnd,"0") == 0) {
275
nptr->appnd=mystrdup("");
280
/* condition string is compiled into nptr->conds by encodeit() */
case 4: { encodeit(nptr,piece);}
281
fprintf(stderr, " affix: %s %d, strip: %s %d\n",nptr->appnd,
282
nptr->appndl,nptr->strip,nptr->stripl);
292
/* publish the entry array in the prefix table ... */
ptable[numpfx].aep = ptr;
293
ptable[numpfx].num = numents;
294
fprintf(stderr,"ptable %d num is %d\n",numpfx,ptable[numpfx].num);
297
/* ... or in the suffix table (the selecting test is not visible here) */
stable[numsfx].aep = ptr;
298
stable[numsfx].num = numents;
299
fprintf(stderr,"stable %d num is %d\n",numsfx,stable[numsfx].num);
312
/*
 * Compile the condition string cs into ptr->conds.
 *
 * conds is indexed by character value; bit n of conds[c] is set when
 * character c is acceptable at condition position n.  The array is first
 * cleared for all SET_SIZE entries, then cs is scanned one character at
 * a time.  Group members are collected in mbr; for a group either bit n
 * is set for each member, or bit n is set for every character and then
 * cleared for each member (presumably the negated '^' case -- the test
 * choosing between the two is not visible in this listing).  Outside a
 * group a wild card sets bit n for every character, otherwise only the
 * bit for the character itself is set.
 *
 * NOTE(review): conds elements are cleared with an (unsigned char) cast,
 * so it looks like only 8 condition positions fit in (1 << n) -- confirm
 * how n is bounded.
 */
void encodeit(struct affent * ptr, char * cs)
322
/* members of the group currently being parsed */
unsigned char mbr[MAX_WD_LEN];
324
/* now clear the conditions array */
325
for (i=0;i<SET_SIZE;i++) ptr->conds[i] = (unsigned char) 0;
327
/* now parse the string to create the conds array */
329
neg = 0; /* complement indicator */
330
grp = 0; /* group indicator */
331
n = 0; /* number of conditions */
332
ec = 0; /* end condition indicator */
333
nm = 0; /* number of members in group */
335
/* a condition string of just "." is special-cased (body not visible here) */
if (strcmp(cs,".")==0) {
340
/* read the byte as unsigned so it can index conds */
c = *((unsigned char *)(cs + i));
345
/* '^' while inside a group */
if ((grp == 1) && (c == '^')) {
353
/* any other non-NUL character while inside a group */
if ((grp == 1) && (c != 0)) {
365
/* set bit n for a group member */
k = (unsigned int) mbr[j];
366
ptr->conds[k] = ptr->conds[k] | (1 << n);
369
/* set bit n for every character ... */
for (j=0;j<SET_SIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n);
371
/* ... then clear it again for a group member */
k = (unsigned int) mbr[j];
372
ptr->conds[k] = ptr->conds[k] & ~(1 << n);
379
/* not a group so just set the proper bit for this char */
380
/* but first handle special case of . inside condition */
382
/* wild card character so set them all */
383
for (j=0;j<SET_SIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n);
385
ptr->conds[(unsigned int) c] = ptr->conds[(unsigned int)c] | (1 << n);
399
/*
 * Search for a prefix: try each of the num entries starting at ep
 * against word (length len).
 *
 * An entry is a candidate when word begins with its append string (or
 * the append string is empty), something remains after removing it, and
 * the remainder plus the strip string is at least numconds long.  The
 * candidate root is built in tword as strip string + rest of word, its
 * leading characters are tested against the entry's conds bits, and if
 * lookup() finds the root it is recorded in roots[numroots] with this
 * entry as prefix and no suffix (only while numroots < MAX_ROOTS).
 *
 * NOTE(review): tword is a fixed MAX_WD_LEN buffer filled with strcpy();
 * no length check is visible in this listing.
 */
400
void pfx_chk (const char * word, int len, struct affent* ep, int num)
402
struct affent * aent;
405
struct hentry * hent;
408
char tword[MAX_WD_LEN];
410
for (aent = ep, i = num; i > 0; aent++, i--) {
412
/* length of word once the prefix text is removed */
tlen = len - aent->appndl;
414
if (tlen > 0 && (aent->appndl == 0 ||
415
strncmp(aent->appnd, word, aent->appndl) == 0)
416
&& tlen + aent->stripl >= aent->numconds) {
418
/* rebuild the root: stripped characters first, then the rest of word */
if (aent->stripl) strcpy (tword, aent->strip);
419
strcpy((tword + aent->stripl), (word + aent->appndl));
421
/* now go through the conds and make sure they all match */
422
cp = (unsigned char *) tword;
423
for (cond = 0; cond < aent->numconds; cond++) {
424
if ((aent->conds[*cp++] & (1 << cond)) == 0)
428
/* cond reached numconds, i.e. the loop above ran to completion */
if (cond >= aent->numconds) {
429
tlen += aent->stripl;
430
if ((hent = lookup(tword)) != NULL) {
431
if (numroots < MAX_ROOTS) {
432
roots[numroots].hashent = hent;
433
roots[numroots].prefix = aent;
434
roots[numroots].suffix = NULL;
445
/*
 * Search for a suffix: try each of the num entries starting at ep
 * against word (length len).
 *
 * pfxent is the prefix entry to record alongside a match (NULL when
 * none) and cpflag carries XPRODUCT when the caller is combining with a
 * prefix; in that case entries without XPRODUCT are tested for at the top
 * of the loop (the action taken is not visible here, presumably a skip).
 * An entry is a candidate when word ends with its append string (or the
 * append string is empty).  The candidate root is built in tword by
 * copying word and overwriting the suffix position with the strip
 * string; conditions are tested backwards from the end of the root.  A
 * root found by lookup() is recorded in roots[numroots] with prefix =
 * pfxent and suffix = this entry (only while numroots < MAX_ROOTS).
 *
 * NOTE(review): tword is a fixed MAX_WD_LEN buffer filled with strcpy();
 * no length check is visible in this listing.
 */
void suf_chk (const char * word, int len, struct affent * ep,
446
int num, struct affent * pfxent, int cpflag)
448
struct affent * aent;
451
struct hentry * hent;
454
char tword[MAX_WD_LEN];
456
for (aent = ep, i = num; i > 0; aent++, i--) {
458
if ((cpflag & XPRODUCT) != 0 && (aent->xpflg & XPRODUCT) == 0)
461
/* length of word once the suffix text is removed */
tlen = len - aent->appndl;
462
if (tlen > 0 && (aent->appndl == 0 ||
463
strcmp(aent->appnd, (word + tlen)) == 0)
464
&& tlen + aent->stripl >= aent->numconds) {
466
/* copy the word, then replace the suffix with the strip string */
strcpy (tword, word);
467
cp = (unsigned char *) (tword + tlen);
469
strcpy ((char *)cp, aent->strip);
470
tlen += aent->stripl;
471
/* cp now points at the end of the rebuilt root */
cp = (unsigned char *)(tword + tlen);
474
/* test conditions from the last one backwards over the root's tail */
for (cond = aent->numconds; --cond >= 0; ) {
475
if ((aent->conds[*--cp] & (1 << cond)) == 0) break;
478
if ((hent = lookup(tword)) != NULL) {
479
if (numroots < MAX_ROOTS) {
480
roots[numroots].hashent = hent;
481
roots[numroots].prefix = pfxent;
482
roots[numroots].suffix = aent;
493
/*
 * Collect every candidate root for word (length len) into roots[].
 *
 * Visible steps: run pfx_chk() over all numpfx prefix tables; for a root
 * whose prefix entry has XPRODUCT set, duplicate that root word and run
 * suf_chk() over all suffix tables with the prefix and XPRODUCT passed
 * along; finally run suf_chk() over all suffix tables on the original
 * word with no prefix.
 *
 * NOTE(review): nword comes from mystrdup(); its release is not visible
 * in this listing -- confirm it is freed.
 */
void aff_chk (const char * word, int len)
503
for (i=0; i < numpfx; i++) {
504
pfx_chk(word, len, ptable[i].aep, ptable[i].num);
511
/* prefix allows cross product: also try suffixes on the prefix-stripped root */
if (roots[j].prefix->xpflg & XPRODUCT) {
512
nword = mystrdup((roots[j].hashent)->word);
514
for (i=0; i < numsfx; i++) {
515
suf_chk(nword,nwl,stable[i].aep, stable[i].num, roots[j].prefix, XPRODUCT);
521
/* suffix-only candidates on the original word */
for (i=0; i < numsfx; i++) {
522
suf_chk(word, len, stable[i].aep, stable[i].num, NULL, 0);
528
/*
 * Look up a root word in the hash table.
 *
 * Starts at the bucket tableptr[hash(word)].  Returns NULL when that
 * bucket holds no word; otherwise follows the next links and returns the
 * first entry whose word compares equal to the argument.  The result
 * when the chain is exhausted is not visible in this listing
 * (presumably NULL).
 */
530
struct hentry * lookup(const char *word)
533
dp = &tableptr[hash(word)];
534
if (dp->word == NULL) return NULL;
535
for ( ; dp != NULL; dp = dp->next) {
536
if (strcmp(word,dp->word) == 0) return dp;
543
/*
 * Add a word to the hash table.
 *
 * A struct hentry is malloc'ed up front.  The visible code then
 * distinguishes a bucket whose head holds no word from an occupied one,
 * in which case the chain is walked to its last entry (presumably so the
 * new entry can be linked there -- the linking itself, the use of hp and
 * the return value are not visible in this listing).
 *
 * NOTE(review): no check of the malloc() result is visible here.
 */
545
int add_word(char * word)
549
struct hentry * hp = (struct hentry *) malloc (sizeof(struct hentry));
559
/* bucket head is still unused */
if (dp->word == NULL) {
563
/* otherwise advance to the tail of the collision chain */
while (dp->next != NULL) dp=dp->next;
571
/*
 * Load a word list and build a hash table on the fly.
 *
 * The first line of wdlst is parsed with atoi() as the word count; the
 * table size is that count plus 5, bumped to the next odd number, and
 * the table is allocated zero-filled with calloc().  Remaining lines are
 * then read one at a time (their handling is not visible in this
 * listing).
 *
 * Returns 2 when the first line cannot be read and 3 when the table
 * cannot be allocated; other return values are not visible here.
 */
573
int load_tables(FILE * wdlst)
578
/* first read the first line of file to get hash table size */
579
if (! fgets(ts, MAX_LN_LEN-1,wdlst)) return 2;
581
tablesize = atoi(ts);
582
tablesize = tablesize + 5;
583
if ((tablesize %2) == 0) tablesize++;
585
/* allocate the hash table */
586
tableptr = (struct hentry *) calloc(tablesize, sizeof(struct hentry));
587
if (! tableptr) return 3;
589
/* loop through all words on munch list and add to hash
590
* table and store away word and affix strings in tmpfile
593
while (fgets(ts,MAX_LN_LEN-1,wdlst)) {
603
/* the hash function is a simple load and rotate
607
/*
 * Hash a word to a bucket index.
 *
 * Up to the first four characters of word are packed into hv, eight bits
 * per character; ROTATE(hv, ROTATE_LEN) is then applied (the loop around
 * it, presumably over the remaining characters, is not visible in this
 * listing).  The result is hv reduced modulo tablesize.
 *
 * NOTE(review): *word is a plain char, so bytes above 0x7f may be
 * sign-extended before the OR on platforms where char is signed --
 * confirm whether that matters for this table.
 */
int hash(const char * word)
611
for (i=0; i < 4 && *word != 0; i++)
612
hv = (hv << 8) | (*word++);
614
ROTATE(hv,ROTATE_LEN);
617
return (unsigned long) hv % tablesize;
621
/*
 * Add flag character ac to the affix string of hash entry ep.
 *
 * When ep has no affix string yet, a two-byte string is allocated and
 * terminated at index 1 (storing ac at index 0 is not visible in this
 * listing but is presumably what happens).  Otherwise the function
 * returns early if ac is already present; if not, a zero-filled buffer
 * one byte longer than the current string is allocated and the existing
 * string, including its terminator, is copied into it.  Appending ac and
 * swapping the buffer in are not visible here.
 *
 * NOTE(review): no check of the malloc()/calloc() results is visible.
 */
void add_affix_char(struct hentry * ep, char ac)
626
if (ep->affstr == NULL) {
627
ep->affstr = (char *) malloc(2*sizeof(char));
629
*((ep->affstr)+1) = '\0';
632
al = strlen(ep->affstr);
633
/* nothing to do when the flag is already recorded */
for (i=0; i< al; i++)
634
if (ac == (ep->affstr)[i]) return;
635
/* room for the old string, one new flag and the terminator */
tmp = calloc((al+2),sizeof(char));
636
memcpy(tmp,ep->affstr,(al+1));
645
/*
 * Add a prefix to word: for each of the num entries starting at ep whose
 * conditions hold on word (length len), generate the prefixed form and
 * store a copy in wlist.
 *
 * An entry is tried only when word is longer than its strip string and
 * at least numconds long.  Conditions are tested on the leading
 * characters of word.  On a match the new word is built in tword as the
 * entry's append string followed by word with its first stripl
 * characters dropped, and is stored in wlist[numwords] with pallow = 0
 * (only while numwords < MAX_WORDS).
 *
 * NOTE(review): tword is a fixed MAX_WD_LEN buffer filled with strcpy();
 * no length check is visible in this listing.
 */
646
void pfx_add (const char * word, int len, struct affent* ep, int num)
648
struct affent * aent;
654
char tword[MAX_WD_LEN];
657
for (aent = ep, i = num; i > 0; aent++, i--) {
659
/* now make sure all conditions match */
660
if ((len > aent->stripl) && (len >= aent->numconds)) {
662
cp = (unsigned char *) word;
663
for (cond = 0; cond < aent->numconds; cond++) {
664
if ((aent->conds[*cp++] & (1 << cond)) == 0)
667
/* cond reached numconds, i.e. the loop above ran to completion */
if (cond >= aent->numconds) {
669
/* we have a match so add prefix */
672
strcpy(tword,aent->appnd);
673
tlen += aent->appndl;
676
/* pp presumably points just past the prefix inside tword */
strcpy(pp, (word + aent->stripl));
677
tlen = tlen + len - aent->stripl;
679
if (numwords < MAX_WORDS) {
680
wlist[numwords].word = mystrdup(tword);
681
wlist[numwords].pallow = 0;
690
/*
 * Add a suffix to a word: for each of the num entries starting at ep
 * whose conditions hold on word (length len), generate the suffixed form
 * and store a copy in wlist.
 *
 * An entry is tried only when word is longer than its strip string and
 * at least numconds long.  Conditions are tested backwards from the end
 * of word.  On a match the strip length is removed from the working
 * length, the entry's append string is copied in at pp, and the result
 * is stored in wlist[numwords]; pallow records whether the entry has
 * XPRODUCT set (only while numwords < MAX_WORDS).
 *
 * NOTE(review): tword is a fixed MAX_WD_LEN buffer; no length check is
 * visible in this listing.
 */
691
void suf_add (const char * word, int len, struct affent * ep, int num)
693
struct affent * aent;
698
char tword[MAX_WD_LEN];
701
for (aent = ep, i = num; i > 0; aent++, i--) {
703
/* if conditions hold on root word
704
* then strip off strip string and add suffix
707
if ((len > aent->stripl) && (len >= aent->numconds)) {
708
cp = (unsigned char *) (word + len);
709
for (cond = aent->numconds; --cond >= 0; ) {
710
if ((aent->conds[*--cp] & (1 << cond)) == 0) break;
713
/* we have a matching condition */
717
tlen -= aent->stripl;
721
strcpy (pp, aent->appnd);
722
/* NOTE(review): stripl is added back after copying appnd; appndl may be
 * what was intended here -- confirm. */
tlen += aent->stripl;
725
if (numwords < MAX_WORDS) {
726
wlist[numwords].word = mystrdup(tword);
727
/* non-zero when this suffix may be combined with a prefix */
wlist[numwords].pallow = (aent->xpflg & XPRODUCT);
737
/*
 * Generate the words that root ts (length wl) yields under the affix
 * flags in ap, adding them to wlist through suf_add()/pfx_add().
 *
 * Visible steps:
 *   - for every suffix table whose flag character occurs in ap, apply it
 *     to the root with suf_add();
 *   - for every wlist entry with pallow set, apply every prefix table
 *     whose flag occurs in ap and whose first entry has XPRODUCT set to
 *     that entry's word with pfx_add();
 *   - for every prefix table whose flag occurs in ap, apply it to the
 *     root itself with pfx_add().
 *
 * al is received but its use, and the return value, are not visible in
 * this listing.
 */
int expand_rootword(const char * ts, int wl, const char * ap, int al)
744
for (i=0; i < numsfx; i++) {
745
/* the table's flag is taken from its first entry */
if (strchr(ap,(stable[i].aep)->achar)) {
746
suf_add(ts, wl, stable[i].aep, stable[i].num);
754
/* cross products: prefix the suffixed forms that allow it */
if (wlist[j].pallow) {
755
for (i=0; i < numpfx; i++) {
756
if (strchr(ap,(ptable[i].aep)->achar)) {
757
if ((ptable[i].aep)->xpflg & XPRODUCT) {
758
nwl = strlen(wlist[j].word);
759
pfx_add(wlist[j].word, nwl, ptable[i].aep, ptable[i].num);
767
/* plain prefixes on the root word */
for (i=0; i < numpfx; i++) {
768
if (strchr(ap,(ptable[i].aep)->achar)) {
769
pfx_add(ts, wl, ptable[i].aep, ptable[i].num);
777
/* split a string into tokens based on a single-char delimiter
778
* acts like strsep() but only uses a delim char and not
781
/*
 * Return the next delim-separated token of *stringp as a newly
 * malloc'ed string.
 *
 * The delimiter is searched for with memchr() over the first n bytes of
 * the current position (n is presumably the remaining string length; its
 * computation is not visible in this listing).  When a delimiter is
 * found the token length is the distance to it and nc+1 bytes are
 * allocated; otherwise n+1 bytes are allocated for the rest of the
 * string.  Copying, advancing *stringp and the end-of-input result are
 * not visible here.
 *
 * NOTE(review): callers appear to own the returned buffer; confirm who
 * frees it.  No check of the malloc() results is visible.
 */
char * mystrsep(char ** stringp, const char delim)
784
char * mp = *stringp;
787
char * dp = (char *)memchr(mp,(int)((unsigned char)delim),n);
791
/* byte distance from the token start to the delimiter */
nc = (int)((unsigned long)dp - (unsigned long)mp);
792
rv = (char *) malloc(nc+1);
797
/* no delimiter: the token is everything that is left */
rv = (char *) malloc(n+1);
808
/*
 * Duplicate the string s into freshly malloc'ed storage.
 *
 * sl+1 bytes are allocated and, when the allocation succeeds, sl+1 bytes
 * are copied from s (sl is presumably strlen(s), so the terminator is
 * included; its computation is not visible in this listing).  The copy
 * is skipped when malloc() fails; the return statement is not visible
 * here.
 */
char * mystrdup(const char * s)
813
d = (char *) malloc(((sl+1) * sizeof(char)));
814
if (d) memcpy(d,s,((sl+1)*sizeof(char)));
820
/*
 * Trim the line ending from s in place.
 *
 * With k presumably being strlen(s) (its computation is not visible in
 * this listing): the last character is overwritten with a terminator
 * when k > 0, and if the character before it was a carriage return that
 * one is overwritten as well.
 *
 * NOTE(review): the last character is removed unconditionally, whether
 * or not it is a newline -- confirm callers only pass fgets() lines.
 */
void mychomp(char * s)
823
if (k > 0) *(s+k-1) = '\0';
824
if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';