1
#include "license.hunspell"
2
#include "license.myspell"
9
#include "suggestmgr.hxx"
13
const w_char W_VLINE = { '\0', '|' };
15
SuggestMgr::SuggestMgr(const char * tryme, int maxn,
19
// register affix manager and check in string of chars to
20
// try when building candidate suggestions
39
maxngramsugs = MAXNGRAMSUGS;
42
char * enc = pAMgr->get_encoding();
43
csconv = get_current_cs(enc);
45
langnum = pAMgr->get_langnum();
46
ckey = pAMgr->get_key_string();
47
nosplitsugs = pAMgr->get_nosplitsugs();
48
if (pAMgr->get_maxngramsugs() >= 0) maxngramsugs = pAMgr->get_maxngramsugs();
49
utf8 = pAMgr->get_utf8();
50
complexprefixes = pAMgr->get_complexprefixes();
56
ckeyl = u8_u16(t, MAXSWL, ckey);
57
ckey_utf = (w_char *) malloc(ckeyl * sizeof(w_char));
58
if (ckey_utf) memcpy(ckey_utf, t, ckeyl * sizeof(w_char));
66
ctry = mystrdup(tryme);
67
if (ctry) ctryl = strlen(ctry);
70
ctryl = u8_u16(t, MAXSWL, tryme);
71
ctry_utf = (w_char *) malloc(ctryl * sizeof(w_char));
72
if (ctry_utf) memcpy(ctry_utf, t, ctryl * sizeof(w_char));
79
SuggestMgr::~SuggestMgr()
84
if (ckey_utf) free(ckey_utf);
89
if (ctry_utf) free(ctry_utf);
98
int SuggestMgr::testsug(char** wlst, const char * candidate, int wl, int ns, int cpdsuggest,
99
int * timer, clock_t * timelimit) {
101
if (ns == maxSug) return maxSug;
102
for (int k=0; k < ns; k++) {
103
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
105
if ((cwrd) && checkword(candidate, wl, cpdsuggest, timer, timelimit)) {
106
wlst[ns] = mystrdup(candidate);
107
if (wlst[ns] == NULL) {
108
for (int j=0; j<ns; j++) free(wlst[j]);
116
// generate suggestions for a misspelled word
117
// pass in address of array of char * pointers
118
// onlycompoundsug: probably bad suggestions (need for ngram sugs, too)
120
int SuggestMgr::suggest(char*** slst, const char * w, int nsug,
121
int * onlycompoundsug)
123
int nocompoundtwowords = 0;
125
w_char word_utf[MAXSWL];
128
char w2[MAXWORDUTF8LEN];
129
const char * word = w;
131
// word reversing wrapper for complex prefixes
132
if (complexprefixes) {
134
if (utf8) reverseword_utf(w2); else reverseword(w2);
141
wlst = (char **) malloc(maxSug * sizeof(char *));
142
if (wlst == NULL) return -1;
143
for (int i = 0; i < maxSug; i++) {
149
wl = u8_u16(word_utf, MAXSWL, word);
152
for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest++) {
154
// suggestions for an uppercase word (html -> HTML)
155
if ((nsug < maxSug) && (nsug > -1)) {
156
nsug = (utf8) ? capchars_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
157
capchars(wlst, word, nsug, cpdsuggest);
160
// perhaps we made a typical fault of spelling
161
if ((nsug < maxSug) && (nsug > -1))
162
nsug = replchars(wlst, word, nsug, cpdsuggest);
164
// perhaps we chose the wrong char from a related set
165
if ((nsug < maxSug) && (nsug > -1)) {
166
nsug = mapchars(wlst, word, nsug, cpdsuggest);
169
// did we swap the order of chars by mistake
170
if ((nsug < maxSug) && (nsug > -1)) {
171
nsug = (utf8) ? swapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
172
swapchar(wlst, word, nsug, cpdsuggest);
175
// did we swap the order of non adjacent chars by mistake
176
if ((nsug < maxSug) && (nsug > -1)) {
177
nsug = (utf8) ? longswapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
178
longswapchar(wlst, word, nsug, cpdsuggest);
181
// did we just hit the wrong key in place of a good char (case and keyboard)
182
if ((nsug < maxSug) && (nsug > -1)) {
183
nsug = (utf8) ? badcharkey_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
184
badcharkey(wlst, word, nsug, cpdsuggest);
187
// only suggest compound words when no other suggestion
188
if ((cpdsuggest == 0) && (nsug > 0)) nocompoundtwowords=1;
190
// did we add a char that should not be there
191
if ((nsug < maxSug) && (nsug > -1)) {
192
nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
193
extrachar(wlst, word, nsug, cpdsuggest);
197
// did we forget a char
198
if ((nsug < maxSug) && (nsug > -1)) {
199
nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
200
forgotchar(wlst, word, nsug, cpdsuggest);
203
// did we move a char
204
if ((nsug < maxSug) && (nsug > -1)) {
205
nsug = (utf8) ? movechar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
206
movechar(wlst, word, nsug, cpdsuggest);
209
// did we just hit the wrong key in place of a good char
210
if ((nsug < maxSug) && (nsug > -1)) {
211
nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
212
badchar(wlst, word, nsug, cpdsuggest);
215
// did we double two characters
216
if ((nsug < maxSug) && (nsug > -1)) {
217
nsug = (utf8) ? doubletwochars_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
218
doubletwochars(wlst, word, nsug, cpdsuggest);
221
// perhaps we forgot to hit space and two words ran together
222
if ((!nosplitsugs) && (nsug < maxSug) && (nsug > -1)) {
223
nsug = twowords(wlst, word, nsug, cpdsuggest);
226
} // repeating ``for'' statement compounding support
229
// we ran out of memory - we should free up as much as possible
230
for (int i = 0; i < maxSug; i++)
231
if (wlst[i] != NULL) free(wlst[i]);
236
if (!nocompoundtwowords && (nsug > 0) && onlycompoundsug) *onlycompoundsug = 1;
242
// generate suggestions for a word with typical mistake
243
// pass in address of array of char * pointers
244
#ifdef HUNSPELL_EXPERIMENTAL
245
int SuggestMgr::suggest_auto(char*** slst, const char * w, int nsug)
247
int nocompoundtwowords = 0;
250
char w2[MAXWORDUTF8LEN];
251
const char * word = w;
253
// word reversing wrapper for complex prefixes
254
if (complexprefixes) {
256
if (utf8) reverseword_utf(w2); else reverseword(w2);
263
wlst = (char **) malloc(maxSug * sizeof(char *));
264
if (wlst == NULL) return -1;
267
for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest++) {
269
// perhaps we made a typical fault of spelling
270
if ((nsug < maxSug) && (nsug > -1))
271
nsug = replchars(wlst, word, nsug, cpdsuggest);
273
// perhaps we chose the wrong char from a related set
274
if ((nsug < maxSug) && (nsug > -1))
275
nsug = mapchars(wlst, word, nsug, cpdsuggest);
277
if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1;
279
// perhaps we forgot to hit space and two words ran together
281
if ((nsug < maxSug) && (nsug > -1) && check_forbidden(word, strlen(word))) {
282
nsug = twowords(wlst, word, nsug, cpdsuggest);
285
} // repeating ``for'' statement compounding support
288
for (int i=0;i<maxSug; i++)
289
if (wlst[i] != NULL) free(wlst[i]);
297
#endif // END OF HUNSPELL_EXPERIMENTAL CODE
299
// suggestions for an uppercase word (html -> HTML)
300
int SuggestMgr::capchars_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
302
char candidate[MAXSWUTF8L];
303
w_char candidate_utf[MAXSWL];
304
memcpy(candidate_utf, word, wl * sizeof(w_char));
305
mkallcap_utf(candidate_utf, wl, langnum);
306
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
307
return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
310
// suggestions for an uppercase word (html -> HTML)
311
int SuggestMgr::capchars(char** wlst, const char * word, int ns, int cpdsuggest)
313
char candidate[MAXSWUTF8L];
314
strcpy(candidate, word);
315
mkallcap(candidate, csconv);
316
return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
319
// suggestions for when chose the wrong char out of a related set
320
int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest)
322
char candidate[MAXSWUTF8L];
327
int wl = strlen(word);
328
if (wl < 2 || ! pAMgr) return ns;
330
int nummap = pAMgr->get_nummap();
331
struct mapentry* maptable = pAMgr->get_maptable();
332
if (maptable==NULL) return ns;
336
return map_related(word, (char *) &candidate, 0, 0, wlst, cpdsuggest, ns, maptable, nummap, &timer, &timelimit);
339
int SuggestMgr::map_related(const char * word, char * candidate, int wn, int cn,
340
char** wlst, int cpdsuggest, int ns,
341
const mapentry* maptable, int nummap, int * timer, clock_t * timelimit)
343
if (*(word + wn) == '\0') {
345
*(candidate + cn) = '\0';
346
int wl = strlen(candidate);
347
for (int m=0; m < ns; m++)
348
if (strcmp(candidate, wlst[m]) == 0) cwrd = 0;
349
if ((cwrd) && checkword(candidate, wl, cpdsuggest, timer, timelimit)) {
351
wlst[ns] = mystrdup(candidate);
352
if (wlst[ns] == NULL) return -1;
359
for (int j = 0; j < nummap; j++) {
360
for (int k = 0; k < maptable[j].len; k++) {
361
int len = strlen(maptable[j].set[k]);
362
if (strncmp(maptable[j].set[k], word + wn, len) == 0) {
364
for (int l = 0; l < maptable[j].len; l++) {
365
strcpy(candidate + cn, maptable[j].set[l]);
366
ns = map_related(word, candidate, wn + len, strlen(candidate), wlst,
367
cpdsuggest, ns, maptable, nummap, timer, timelimit);
368
if (!(*timer)) return ns;
374
*(candidate + cn) = *(word + wn);
375
ns = map_related(word, candidate, wn + 1, cn + 1, wlst, cpdsuggest,
376
ns, maptable, nummap, timer, timelimit);
381
// suggestions for a typical fault of spelling, that
382
// differs by more than 1 letter from the right form.
383
int SuggestMgr::replchars(char** wlst, const char * word, int ns, int cpdsuggest)
385
char candidate[MAXSWUTF8L];
388
int wl = strlen(word);
389
if (wl < 2 || ! pAMgr) return ns;
390
int numrep = pAMgr->get_numrep();
391
struct replentry* reptable = pAMgr->get_reptable();
392
if (reptable==NULL) return ns;
393
for (int i=0; i < numrep; i++ ) {
395
lenr = strlen(reptable[i].pattern2);
396
lenp = strlen(reptable[i].pattern);
397
// search every occurrence of the pattern in the word
398
while ((r=strstr(r, reptable[i].pattern)) != NULL) {
399
strcpy(candidate, word);
400
if (r-word + lenr + strlen(r+lenp) >= MAXSWUTF8L) break;
401
strcpy(candidate+(r-word),reptable[i].pattern2);
402
strcpy(candidate+(r-word)+lenr, r+lenp);
403
ns = testsug(wlst, candidate, wl-lenp+lenr, ns, cpdsuggest, NULL, NULL);
404
if (ns == -1) return -1;
405
// check REP suggestions with space
406
char * sp = strchr(candidate, ' ');
409
if (checkword(candidate, strlen(candidate), 0, NULL, NULL)) {
412
ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL, NULL);
413
if (ns == -1) return -1;
416
wlst[ns - 1] = mystrdup(candidate);
417
if (!wlst[ns - 1]) return -1;
422
r++; // search for the next letter
428
// perhaps we doubled two characters (pattern aba -> ababa, for example vacation -> vacacation)
429
int SuggestMgr::doubletwochars(char** wlst, const char * word, int ns, int cpdsuggest)
431
char candidate[MAXSWUTF8L];
433
int wl = strlen(word);
434
if (wl < 5 || ! pAMgr) return ns;
435
for (int i=2; i < wl; i++ ) {
436
if (word[i]==word[i-2]) {
439
strcpy(candidate,word);
440
strcpy(candidate+i-1,word+i+1);
441
ns = testsug(wlst, candidate, wl-2, ns, cpdsuggest, NULL, NULL);
442
if (ns == -1) return -1;
452
// perhaps we doubled two characters (pattern aba -> ababa, for example vacation -> vacacation)
453
int SuggestMgr::doubletwochars_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
455
w_char candidate_utf[MAXSWL];
456
char candidate[MAXSWUTF8L];
458
if (wl < 5 || ! pAMgr) return ns;
459
for (int i=2; i < wl; i++) {
460
if (w_char_eq(word[i], word[i-2])) {
463
memcpy(candidate_utf, word, (i - 1) * sizeof(w_char));
464
memcpy(candidate_utf+i-1, word+i+1, (wl-i-1) * sizeof(w_char));
465
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl-2);
466
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
467
if (ns == -1) return -1;
477
// error is wrong char in place of correct one (case and keyboard related version)
478
int SuggestMgr::badcharkey(char ** wlst, const char * word, int ns, int cpdsuggest)
481
char candidate[MAXSWUTF8L];
482
int wl = strlen(word);
483
strcpy(candidate, word);
484
// swap out each char one by one and try uppercase and neighbor
485
// keyboard chars in its place to see if that makes a good word
487
for (int i=0; i < wl; i++) {
489
// check with uppercase letters
490
candidate[i] = csconv[((unsigned char)tmpc)].cupper;
491
if (tmpc != candidate[i]) {
492
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
493
if (ns == -1) return -1;
496
// check neighbor characters in keyboard string
498
char * loc = strchr(ckey, tmpc);
500
if ((loc > ckey) && (*(loc - 1) != '|')) {
501
candidate[i] = *(loc - 1);
502
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
503
if (ns == -1) return -1;
505
if ((*(loc + 1) != '|') && (*(loc + 1) != '\0')) {
506
candidate[i] = *(loc + 1);
507
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
508
if (ns == -1) return -1;
510
loc = strchr(loc + 1, tmpc);
517
// error is wrong char in place of correct one (case and keyboard related version)
518
int SuggestMgr::badcharkey_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
521
w_char candidate_utf[MAXSWL];
522
char candidate[MAXSWUTF8L];
523
memcpy(candidate_utf, word, wl * sizeof(w_char));
524
// swap out each char one by one and try uppercase and neighbor
525
// keyboard chars in its place to see if that makes a good word
526
for (int i=0; i < wl; i++) {
527
tmpc = candidate_utf[i];
528
// check with uppercase letters
529
mkallcap_utf(candidate_utf + i, 1, langnum);
530
if (!w_char_eq(tmpc, candidate_utf[i])) {
531
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
532
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
533
if (ns == -1) return -1;
534
candidate_utf[i] = tmpc;
536
// check neighbor characters in keyboard string
538
w_char * loc = ckey_utf;
539
while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)) loc++;
540
while (loc < (ckey_utf + ckeyl)) {
541
if ((loc > ckey_utf) && !w_char_eq(*(loc - 1), W_VLINE)) {
542
candidate_utf[i] = *(loc - 1);
543
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
544
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
545
if (ns == -1) return -1;
547
if (((loc + 1) < (ckey_utf + ckeyl)) && !w_char_eq(*(loc + 1), W_VLINE)) {
548
candidate_utf[i] = *(loc + 1);
549
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
550
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
551
if (ns == -1) return -1;
553
do { loc++; } while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc));
555
candidate_utf[i] = tmpc;
560
// error is wrong char in place of correct one
561
int SuggestMgr::badchar(char ** wlst, const char * word, int ns, int cpdsuggest)
564
char candidate[MAXSWUTF8L];
565
clock_t timelimit = clock();
566
int timer = MINTIMER;
567
int wl = strlen(word);
568
strcpy(candidate, word);
569
// swap out each char one by one and try all the tryme
570
// chars in its place to see if that makes a good word
571
for (int j=0; j < ctryl; j++) {
572
for (int i=wl-1; i >= 0; i--) {
574
if (ctry[j] == tmpc) continue;
575
candidate[i] = ctry[j];
576
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, &timer, &timelimit);
577
if (ns == -1) return -1;
578
if (!timer) return ns;
585
// error is wrong char in place of correct one
586
int SuggestMgr::badchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
589
w_char candidate_utf[MAXSWL];
590
char candidate[MAXSWUTF8L];
591
clock_t timelimit = clock();
592
int timer = MINTIMER;
593
memcpy(candidate_utf, word, wl * sizeof(w_char));
594
// swap out each char one by one and try all the tryme
595
// chars in its place to see if that makes a good word
596
for (int j=0; j < ctryl; j++) {
597
for (int i=wl-1; i >= 0; i--) {
598
tmpc = candidate_utf[i];
599
if (w_char_eq(tmpc, ctry_utf[j])) continue;
600
candidate_utf[i] = ctry_utf[j];
601
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
602
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit);
603
if (ns == -1) return -1;
604
if (!timer) return ns;
605
candidate_utf[i] = tmpc;
611
// error is word has an extra letter it does not need
612
int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
614
char candidate[MAXSWUTF8L];
615
w_char candidate_utf[MAXSWL];
617
w_char tmpc = W_VLINE; // not used value, only for VCC warning message
618
if (wl < 2) return ns;
619
// try omitting one char of word at a time
620
memcpy(candidate_utf, word, wl * sizeof(w_char));
621
for (p = candidate_utf + wl - 1; p >= candidate_utf; p--) {
623
if (p < candidate_utf + wl - 1) *p = tmpc;
624
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1);
625
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
626
if (ns == -1) return -1;
632
// error is word has an extra letter it does not need
633
int SuggestMgr::extrachar(char** wlst, const char * word, int ns, int cpdsuggest)
636
char candidate[MAXSWUTF8L];
638
int wl = strlen(word);
639
if (wl < 2) return ns;
640
// try omitting one char of word at a time
641
strcpy (candidate, word);
642
for (p = candidate + wl - 1; p >=candidate; p--) {
645
ns = testsug(wlst, candidate, wl-1, ns, cpdsuggest, NULL, NULL);
646
if (ns == -1) return -1;
652
// error is missing a letter it needs
653
int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsuggest)
655
char candidate[MAXSWUTF8L];
657
clock_t timelimit = clock();
658
int timer = MINTIMER;
659
int wl = strlen(word);
660
// try inserting a tryme character before every letter (and the null terminator)
661
for (int i = 0; i < ctryl; i++) {
662
strcpy(candidate, word);
663
for (p = candidate + wl; p >= candidate; p--) {
666
ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, &timer, &timelimit);
667
if (ns == -1) return -1;
668
if (!timer) return ns;
674
// error is missing a letter it needs
675
int SuggestMgr::forgotchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
677
w_char candidate_utf[MAXSWL];
678
char candidate[MAXSWUTF8L];
680
clock_t timelimit = clock();
681
int timer = MINTIMER;
682
// try inserting a tryme character at the end of the word and before every letter
683
for (int i = 0; i < ctryl; i++) {
684
memcpy (candidate_utf, word, wl * sizeof(w_char));
685
for (p = candidate_utf + wl; p >= candidate_utf; p--) {
688
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);
689
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit);
690
if (ns == -1) return -1;
691
if (!timer) return ns;
698
/* error is should have been two words */
699
int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest)
701
char candidate[MAXSWUTF8L];
708
if (wl < 3) return ns;
710
if (langnum == LANG_hu) forbidden = check_forbidden(word, wl);
712
strcpy(candidate + 1, word);
713
// split the string into two pieces after every char
714
// if both pieces are good words make them a suggestion
715
for (p = candidate + 1; p[1] != '\0'; p++) {
717
// go to end of the UTF-8 character
718
while (utf8 && ((p[1] & 0xc0) == 0x80)) {
722
if (utf8 && p[1] == '\0') break; // last UTF-8 character
724
c1 = checkword(candidate,strlen(candidate), cpdsuggest, NULL, NULL);
726
c2 = checkword((p+1),strlen(p+1), cpdsuggest, NULL, NULL);
730
// spec. Hungarian code (need a better compound word support)
731
if ((langnum == LANG_hu) && !forbidden &&
732
// if 3 repeating letters, use - instead of space
733
(((p[-1] == p[1]) && (((p>candidate+1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||
734
// or multiple compounding, with more than 6 syllables
735
((c1 == 3) && (c2 >= 2)))) *p = '-';
738
for (int k=0; k < ns; k++)
739
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
742
wlst[ns] = mystrdup(candidate);
743
if (wlst[ns] == NULL) return -1;
747
// add two word suggestion with dash, if TRY string contains
749
// NOTE: cwrd is not modified for REP two-word suggestions.
750
if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) &&
751
mystrlen(p + 1) > 1 &&
752
mystrlen(candidate) - mystrlen(p) > 1) {
754
for (int k=0; k < ns; k++)
755
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
758
wlst[ns] = mystrdup(candidate);
759
if (wlst[ns] == NULL) return -1;
771
// error is adjacent letters were swapped
772
int SuggestMgr::swapchar(char ** wlst, const char * word, int ns, int cpdsuggest)
774
char candidate[MAXSWUTF8L];
778
// try swapping adjacent chars one by one
779
strcpy(candidate, word);
780
for (p = candidate; p[1] != 0; p++) {
784
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
785
if (ns == -1) return -1;
789
// try double swaps for short words
790
// ahev -> have, owudl -> would
791
if (wl == 4 || wl == 5) {
792
candidate[0] = word[1];
793
candidate[1] = word[0];
794
candidate[2] = word[2];
795
candidate[wl - 2] = word[wl - 1];
796
candidate[wl - 1] = word[wl - 2];
797
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
798
if (ns == -1) return -1;
800
candidate[0] = word[0];
801
candidate[1] = word[2];
802
candidate[2] = word[1];
803
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
804
if (ns == -1) return -1;
810
// error is adjacent letters were swapped
811
int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
813
w_char candidate_utf[MAXSWL];
814
char candidate[MAXSWUTF8L];
818
// try swapping adjacent chars one by one
819
memcpy (candidate_utf, word, wl * sizeof(w_char));
820
for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) {
824
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
825
if (len == 0) len = strlen(candidate);
826
ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
827
if (ns == -1) return -1;
831
// try double swaps for short words
832
// ahev -> have, owudl -> would, suodn -> sound
833
if (wl == 4 || wl == 5) {
834
candidate_utf[0] = word[1];
835
candidate_utf[1] = word[0];
836
candidate_utf[2] = word[2];
837
candidate_utf[wl - 2] = word[wl - 1];
838
candidate_utf[wl - 1] = word[wl - 2];
839
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
840
ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
841
if (ns == -1) return -1;
843
candidate_utf[0] = word[0];
844
candidate_utf[1] = word[2];
845
candidate_utf[2] = word[1];
846
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
847
ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
848
if (ns == -1) return -1;
854
// error is non-adjacent letters were swapped
855
int SuggestMgr::longswapchar(char ** wlst, const char * word, int ns, int cpdsuggest)
857
char candidate[MAXSWUTF8L];
862
// try swapping not adjacent chars one by one
863
strcpy(candidate, word);
864
for (p = candidate; *p != 0; p++) {
865
for (q = candidate; *q != 0; q++) {
866
if (abs((int)(p-q)) > 1) {
870
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
871
if (ns == -1) return -1;
881
// error is non-adjacent letters were swapped
882
int SuggestMgr::longswapchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
884
w_char candidate_utf[MAXSWL];
885
char candidate[MAXSWUTF8L];
889
// try swapping not adjacent chars
890
memcpy (candidate_utf, word, wl * sizeof(w_char));
891
for (p = candidate_utf; p < (candidate_utf + wl); p++) {
892
for (q = candidate_utf; q < (candidate_utf + wl); q++) {
893
if (abs((int)(p-q)) > 1) {
897
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
898
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
899
if (ns == -1) return -1;
908
// error is a letter was moved
909
int SuggestMgr::movechar(char ** wlst, const char * word, int ns, int cpdsuggest)
911
char candidate[MAXSWUTF8L];
918
strcpy(candidate, word);
919
for (p = candidate; *p != 0; p++) {
920
for (q = p + 1; (*q != 0) && ((q - p) < 10); q++) {
924
if ((q-p) < 2) continue; // omit swap char
925
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
926
if (ns == -1) return -1;
928
strcpy(candidate, word);
930
for (p = candidate + wl - 1; p > candidate; p--) {
931
for (q = p - 1; (q >= candidate) && ((p - q) < 10); q--) {
935
if ((p-q) < 2) continue; // omit swap char
936
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
937
if (ns == -1) return -1;
939
strcpy(candidate, word);
944
// error is a letter was moved
945
int SuggestMgr::movechar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
947
w_char candidate_utf[MAXSWL];
948
char candidate[MAXSWUTF8L];
953
memcpy (candidate_utf, word, wl * sizeof(w_char));
954
for (p = candidate_utf; p < (candidate_utf + wl); p++) {
955
for (q = p + 1; (q < (candidate_utf + wl)) && ((q - p) < 10); q++) {
959
if ((q-p) < 2) continue; // omit swap char
960
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
961
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
962
if (ns == -1) return -1;
964
memcpy (candidate_utf, word, wl * sizeof(w_char));
966
for (p = candidate_utf + wl - 1; p > candidate_utf; p--) {
967
for (q = p - 1; (q >= candidate_utf) && ((p - q) < 10); q--) {
971
if ((p-q) < 2) continue; // omit swap char
972
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
973
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
974
if (ns == -1) return -1;
976
memcpy (candidate_utf, word, wl * sizeof(w_char));
981
// generate a set of suggestions for very poorly spelled words
982
int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md)
991
// exhaustively search through all root words
992
// keeping track of the MAX_ROOTS most similar root words
993
struct hentry * roots[MAX_ROOTS];
994
char * rootsphon[MAX_ROOTS];
995
int scores[MAX_ROOTS];
996
int scoresphon[MAX_ROOTS];
997
for (i = 0; i < MAX_ROOTS; i++) {
999
scores[i] = -100 * i;
1000
rootsphon[i] = NULL;
1001
scoresphon[i] = -100 * i;
1004
lpphon = MAX_ROOTS - 1;
1005
scphon = scoresphon[MAX_ROOTS-1];
1007
char w2[MAXWORDUTF8LEN];
1011
// word reversing wrapper for complex prefixes
1012
if (complexprefixes) {
1014
if (utf8) reverseword_utf(w2); else reverseword(w2);
1018
char mw[MAXSWUTF8L];
1020
int nc = strlen(word);
1021
int n = (utf8) ? u8_u16(u8, MAXSWL, word) : nc;
1023
// set character based ngram suggestion for words with non-BMP Unicode characters
1030
struct hentry* hp = NULL;
1032
phonetable * ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;
1033
char target[MAXSWUTF8L];
1034
char candidate[MAXSWUTF8L];
1036
strcpy(candidate, word);
1037
mkallcap(candidate, csconv);
1038
phonet(candidate, target, n, *ph);
1041
for (i = 0; i < md; i++) {
1042
while (0 != (hp = (pHMgr[i])->walk_hashtable(col, hp))) {
1043
if ((hp->astr) && (pAMgr) &&
1044
(TESTAFF(hp->astr, pAMgr->get_forbiddenword(), hp->alen) ||
1045
TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) ||
1046
TESTAFF(hp->astr, pAMgr->get_nosuggest(), hp->alen) ||
1047
TESTAFF(hp->astr, pAMgr->get_onlyincompound(), hp->alen))) continue;
1049
sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + NGRAM_LOWERING) +
1050
leftcommonsubstring(word, HENTRY_WORD(hp));
1052
// check special pronunciation
1053
if ((hp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {
1054
int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + NGRAM_LOWERING) +
1055
leftcommonsubstring(word, f);
1056
if (sc2 > sc) sc = sc2;
1059
if (ph && (sc > 2) && (abs(n - (int) hp->clen) <= 3)) {
1060
char target2[MAXSWUTF8L];
1061
strcpy(candidate, HENTRY_WORD(hp));
1062
mkallcap(candidate, csconv);
1063
phonet(candidate, target2, -1, *ph);
1064
scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE);
1067
if (sc > scores[lp]) {
1071
for (j=0; j < MAX_ROOTS; j++)
1072
if (scores[j] < lval) {
1078
if (scphon > scoresphon[lpphon]) {
1079
scoresphon[lpphon] = scphon;
1080
rootsphon[lpphon] = HENTRY_WORD(hp);
1082
for (j=0; j < MAX_ROOTS; j++)
1083
if (scoresphon[j] < lval) {
1085
lval = scoresphon[j];
1090
// find minimum threshold for a passable suggestion
1091
// mangle original word three different ways
1092
// and score them to generate a minimum acceptable score
1094
for (int sp = 1; sp < 4; sp++) {
1096
for (int k=sp; k < n; k+=4) *((unsigned short *) u8 + k) = '*';
1097
u16_u8(mw, MAXSWUTF8L, u8, n);
1098
thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + NGRAM_LOWERING);
1101
for (int k=sp; k < n; k+=4) *(mw + k) = '*';
1102
thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + NGRAM_LOWERING);
1105
thresh = thresh / 3;
1108
// now expand affixes on each of these root words and
1109
// and use length adjusted ngram scores to select
1110
// possible suggestions
1111
char * guess[MAX_GUESS];
1112
char * guessorig[MAX_GUESS];
1113
int gscore[MAX_GUESS];
1114
for(i=0;i<MAX_GUESS;i++) {
1116
guessorig[i] = NULL;
1117
gscore[i] = -100 * i;
1122
struct guessword * glst;
1123
glst = (struct guessword *) calloc(MAX_WORDS,sizeof(struct guessword));
1125
if (nonbmp) utf8 = 1;
1129
for (i = 0; i < MAX_ROOTS; i++) {
1131
struct hentry * rp = roots[i];
1132
int nw = pAMgr->expand_rootword(glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen,
1133
rp->astr, rp->alen, word, nc,
1134
((rp->var & H_OPT_PHON) ? copy_field(f, HENTRY_DATA(rp), MORPH_PHON) : NULL));
1136
for (int k = 0; k < nw ; k++) {
1137
sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + NGRAM_LOWERING) +
1138
leftcommonsubstring(word, glst[k].word);
1140
if ((sc > thresh)) {
1141
if (sc > gscore[lp]) {
1144
if (guessorig[lp]) {
1145
free(guessorig[lp]);
1146
guessorig[lp] = NULL;
1150
guess[lp] = glst[k].word;
1151
guessorig[lp] = glst[k].orig;
1153
for (j=0; j < MAX_GUESS; j++)
1154
if (gscore[j] < lval) {
1160
if (glst[k].orig) free(glst[k].orig);
1164
if (glst[k].orig) free(glst[k].orig);
1171
// now we are done generating guesses
1172
// sort in order of decreasing score
1175
bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);
1176
if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);
1178
// weight suggestions with a similarity index, based on
1179
// the longest common subsequence algorithm, and re-sort
1182
for (i=0; i < MAX_GUESS; i++) {
1184
// lowering guess[i]
1185
char gl[MAXSWUTF8L];
1189
len = u8_u16(_w, MAXSWL, guess[i]);
1190
mkallsmall_utf(_w, len, langnum);
1191
u16_u8(gl, MAXSWUTF8L, _w, len);
1193
strcpy(gl, guess[i]);
1194
mkallsmall(gl, csconv);
1195
len = strlen(guess[i]);
1198
int _lcs = lcslen(word, gl);
1200
// same characters with different casing
1201
if ((n == len) && (n == _lcs)) {
1206
// heuristic weighting of ngram scores
1208
// length of longest common subsequence minus length difference
1209
2 * _lcs - abs((int) (n - len)) +
1210
// weight length of the left common substring
1211
leftcommonsubstring(word, gl) +
1212
// weight equal character positions
1213
((_lcs == commoncharacterpositions(word, gl, &is_swap)) ? 1: 0) +
1214
// swap character (not neighboring)
1215
((is_swap) ? 1000 : 0);
1219
bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);
1222
if (ph) for (i=0; i < MAX_ROOTS; i++) {
1224
// lowering rootsphon[i]
1225
char gl[MAXSWUTF8L];
1229
len = u8_u16(_w, MAXSWL, rootsphon[i]);
1230
mkallsmall_utf(_w, len, langnum);
1231
u16_u8(gl, MAXSWUTF8L, _w, len);
1233
strcpy(gl, rootsphon[i]);
1234
mkallsmall(gl, csconv);
1235
len = strlen(rootsphon[i]);
1238
// heuristic weighting of ngram scores
1239
scoresphon[i] += 2 * lcslen(word, gl) - abs((int) (n - len)) +
1240
// weight length of the left common substring
1241
leftcommonsubstring(word, gl);
1245
if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);
1251
for (i=0; i < MAX_GUESS; i++) {
1253
if ((ns < oldns + maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) {
1255
// leave only excellent suggestions, if exists
1256
if (gscore[i] > 1000) same = 1;
1257
for (j = 0; j < ns; j++) {
1258
// don't suggest previous suggestions or a previous suggestion with prefixes or affixes
1259
if ((!guessorig[i] && strstr(guess[i], wlst[j])) ||
1260
(guessorig[i] && strstr(guessorig[i], wlst[j])) ||
1261
// check forbidden words
1262
!checkword(guess[i], strlen(guess[i]), 0, NULL, NULL)) unique = 0;
1265
wlst[ns++] = guess[i];
1268
wlst[ns-1] = guessorig[i];
1272
if (guessorig[i]) free(guessorig[i]);
1276
if (guessorig[i]) free(guessorig[i]);
1282
if (ph) for (i=0; i < MAX_ROOTS; i++) {
1284
if ((ns < oldns + MAXPHONSUGS) && (ns < maxSug)) {
1286
for (j = 0; j < ns; j++) {
1287
// don't suggest previous suggestions or a previous suggestion with prefixes or affixes
1288
if (strstr(rootsphon[i], wlst[j]) ||
1289
// check forbidden words
1290
!checkword(rootsphon[i], strlen(rootsphon[i]), 0, NULL, NULL)) unique = 0;
1293
wlst[ns++] = mystrdup(rootsphon[i]);
1294
if (!wlst[ns - 1]) return ns - 1;
1300
if (nonbmp) utf8 = 1;
1305
// see if a candidate suggestion is spelled correctly
1306
// needs to check both root words and words with affixes
1308
// obsolete MySpell-HU modifications:
1309
// return value 2 and 3 marks compounding with hyphen (-)
1310
// `3' marks roots without suffix
//
// Parameters (as used by the visible statements):
//   word, len   - candidate and its length, forwarded to the affix manager
//   cpdsuggest  - when equal to 1 the compound check path is taken
//   timer       - countdown; when it reads 0 and timelimit is set, the elapsed
//                 clock() time is compared against TIMELIMIT
//   timelimit   - start time used for the TIMELIMIT comparison
// Returns 0 when the time budget is exceeded or the matching entry carries a
// forbidden/nosuggest/only-upcase/only-in-compound flag, 3 on a compound hit,
// and 2 + nosuffix when the entry carries the compound flag.
1311
int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * timer, clock_t * timelimit)
1313
struct hentry * rv=NULL;
1319
// NOTE(review): *timer is dereferenced here, yet callers in this file pass
// NULL for timer; a guard presumably sits on a line not visible here - confirm.
if (!(*timer) && timelimit) {
1320
// give up once more than TIMELIMIT clock ticks have elapsed
if ((clock() - *timelimit) > TIMELIMIT) return 0;
1321
// otherwise re-arm the countdown
*timer = MAXPLUSTIMER;
1326
if (cpdsuggest==1) {
1327
if (pAMgr->get_compound()) {
1328
rv = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 1); //EXT
1329
if (rv) return 3; // XXX obsolete categorisation
1334
// plain dictionary lookup of the candidate
rv = pAMgr->lookup(word);
1337
// NOTE(review): rv is dereferenced right after lookup(); the null check is
// presumably on a line not visible here - confirm.
// entries flagged forbidden or nosuggest are never offered
if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)
1338
|| TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0;
1340
// skip homonyms that need an affix, are upper-case only, or are only valid
// inside compounds
if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) ||
1341
TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1342
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {
1343
rv = rv->next_homonym;
1346
} else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix + suffix XXX
1351
rv = pAMgr->suffix_check(word, len, 0, NULL, NULL, 0, NULL); // only suffix
1354
// fall back to two-level suffix checks when continuation classes exist
if (!rv && pAMgr->have_contclass()) {
1355
rv = pAMgr->suffix_check_twosfx(word, len, 0, NULL, FLAG_NULL);
1356
if (!rv) rv = pAMgr->prefix_check_twosfx(word, len, 1, FLAG_NULL);
1359
// check forbidden words
1360
if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) ||
1361
TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1362
TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) ||
1363
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0;
1365
if (rv) { // XXX obsolete
1366
// nosuffix is declared on a line not visible here
if ((pAMgr->get_compoundflag()) &&
1367
TESTAFF(rv->astr, pAMgr->get_compoundflag(), rv->alen)) return 2 + nosuffix;
1374
// Reports whether word resolves to a dictionary entry carrying the
// forbidden-word flag. Returns 1 in that case; the fall-through return value
// is on a line not visible here.
int SuggestMgr::check_forbidden(const char * word, int len)
1376
struct hentry * rv = NULL;
1379
rv = pAMgr->lookup(word);
1380
// a root that needs an affix or is only valid in compounds does not count
// as a direct hit
if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) ||
1381
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL;
1382
// when no prefixed form matches, rv is replaced by the suffix_check result
if (!(pAMgr->prefix_check(word,len,1)))
1383
rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+suffix, suffix
1384
// check forbidden words
1385
if ((rv) && (rv->astr) && TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)) return 1;
1390
#ifdef HUNSPELL_EXPERIMENTAL
1391
// suggest possible stems
// Collects stems for w into a calloc'ed list of maxSug entries by running
// suffix_check with the list as an output argument, then strips a trailing
// '-' from each collected entry. Returns -1 when the list cannot be allocated.
1392
int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug)
1396
struct hentry * rv = NULL;
1398
char w2[MAXSWUTF8L];
1399
const char * word = w;
1401
// word reversing wrapper for complex prefixes
1402
if (complexprefixes) {
1404
if (utf8) reverseword_utf(w2); else reverseword(w2);
1408
int wl = strlen(word);
1414
wlst = (char **) calloc(maxSug, sizeof(char *));
1415
if (wlst == NULL) return -1;
1418
// suffix_check receives wlst, maxSug and &nsug
rv = pAMgr->suffix_check(word, wl, 0, NULL, wlst, maxSug, &nsug);
1420
// delete dash from end of word
1422
for (int j=0; j < nsug; j++) {
1423
// NOTE(review): for an empty wlst[j], strlen(...) - 1 indexes before the
// buffer; confirm suffix_check never stores empty strings.
if (wlst[j][strlen(wlst[j]) - 1] == '-') wlst[j][strlen(wlst[j]) - 1] = '\0';
1430
#endif // END OF HUNSPELL_EXPERIMENTAL CODE
1433
// Builds a morphological description of w in a local MAXLNLEN buffer from
// three sources: matching dictionary homonyms, affix_check_morph, and - only
// when nothing was produced and compounding is enabled - compound_check_morph.
// Returns a mystrdup'ed copy of the de-duplicated text (line_uniq with
// MSEP_REC), or NULL when the affix manager is missing or the buffer is empty.
char * SuggestMgr::suggest_morph(const char * w)
1435
char result[MAXLNLEN];
1436
// r aliases result so it can be handed to compound_check_morph by address
char * r = (char *) result;
1439
struct hentry * rv = NULL;
1443
if (! pAMgr) return NULL;
1445
char w2[MAXSWUTF8L];
1446
const char * word = w;
1448
// word reversing wrapper for complex prefixes
1449
if (complexprefixes) {
1451
if (utf8) reverseword_utf(w2); else reverseword(w2);
1455
rv = pAMgr->lookup(word);
1458
// NOTE(review): rv is dereferenced here; the loop/null guard is presumably
// on a line not visible here - confirm.
// only entries without forbidden / need-affix / only-in-compound flags
// contribute
if ((!rv->astr) || !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) ||
1459
TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) ||
1460
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {
1461
// no explicit stem field on the entry: emit the word itself as stem
if (!HENTRY_FIND(rv, MORPH_STEM)) {
1462
mystrcat(result, " ", MAXLNLEN);
1463
mystrcat(result, MORPH_STEM, MAXLNLEN);
1464
mystrcat(result, word, MAXLNLEN);
1466
// append the entry's own morphological data, if any
if (HENTRY_DATA(rv)) {
1467
mystrcat(result, " ", MAXLNLEN);
1468
mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
1470
mystrcat(result, "\n", MAXLNLEN);
1472
rv = rv->next_homonym;
1475
// analyses obtained through affix stripping
st = pAMgr->affix_check_morph(word,strlen(word));
1477
mystrcat(result, st, MAXLNLEN);
1481
// compound analysis is attempted only when nothing else matched
if (pAMgr->get_compound() && (*result == '\0'))
1482
pAMgr->compound_check_morph(word, strlen(word),
1483
0, 0, 100, 0,NULL, 0, &r, NULL);
1485
return (*result) ? mystrdup(line_uniq(result, MSEP_REC)) : NULL;
1488
#ifdef HUNSPELL_EXPERIMENTAL
1489
// Runs suggest() on a misspelled word with a list whose first maxSug - 1
// slots are pre-filled, then passes the entry in the last slot to
// suggest_morph(). That entry and the list are freed here.
char * SuggestMgr::suggest_morph_for_spelling_error(const char * word)
1492
char ** wlst = (char **) calloc(maxSug, sizeof(char *));
1493
// NOTE(review): this dereferences wlst twice instead of null-checking it.
// calloc zero-fills, so *wlst is a null pointer even on success and **wlst
// reads through it; the intended test is probably `if (!wlst)` - confirm.
if (!**wlst) return NULL;
1494
// we will use only the first suggestion
1495
// slots 0 .. maxSug-2 point at a string literal, so suggest() starts with
// maxSug - 1 existing entries
for (int i = 0; i < maxSug - 1; i++) wlst[i] = "";
1496
int ns = suggest(&wlst, word, maxSug - 1, NULL);
1498
p = suggest_morph(wlst[maxSug - 1]);
1499
// only the last slot is freed; the other slots hold string literals
free(wlst[maxSug - 1]);
1501
if (wlst) free(wlst);
1504
#endif // END OF HUNSPELL_EXPERIMENTAL CODE
1507
// Generates word forms for dictionary entry rv that match the morphological
// pattern, first from the entry itself and then from every allomorph listed
// in its morphological data. Each generated form is appended to a local
// MAXLNLEN buffer followed by "\n". Returns a mystrdup'ed copy of the buffer,
// or NULL when the entry has more suffixes than the pattern or nothing was
// generated.
char * SuggestMgr::suggest_hentry_gen(hentry * rv, char * pattern)
1509
char result[MAXLNLEN];
1511
int sfxcount = get_sfxcount(pattern);
1513
// NOTE(review): HENTRY_DATA(rv) is handed to get_sfxcount before the null
// test on it just below; confirm get_sfxcount tolerates NULL.
if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) return NULL;
1515
if (HENTRY_DATA(rv)) {
1516
char * aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->alen,
1517
HENTRY_DATA(rv), pattern, 0);
1519
mystrcat(result, aff, MAXLNLEN);
1520
mystrcat(result, "\n", MAXLNLEN);
1525
// check all allomorphs
1526
char allomorph[MAXLNLEN];
1528
if (HENTRY_DATA(rv)) p = (char *) strstr(HENTRY_DATA2(rv), MORPH_ALLOMORPH);
1530
struct hentry * rv2 = NULL;
1532
int plen = fieldlen(p);
1533
// NOTE(review): plen is not compared against MAXLNLEN before the copy -
// confirm fieldlen() is bounded by the morphological-data line length.
strncpy(allomorph, p, plen);
1534
// strncpy does not terminate when the source is longer than plen
allomorph[plen] = '\0';
1535
rv2 = pAMgr->lookup(allomorph);
1537
// if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <= sfxcount) {
1538
if (HENTRY_DATA(rv2)) {
1539
char * st = (char *) strstr(HENTRY_DATA2(rv2), MORPH_STEM);
1540
// accept the allomorph only if its stem field names this entry's word
if (st && (strncmp(st + MORPH_TAG_LEN,
1541
HENTRY_WORD(rv), fieldlen(st + MORPH_TAG_LEN)) == 0)) {
1542
char * aff = pAMgr->morphgen(HENTRY_WORD(rv2), rv2->blen, rv2->astr, rv2->alen,
1543
HENTRY_DATA(rv2), pattern, 0);
1545
mystrcat(result, aff, MAXLNLEN);
1546
mystrcat(result, "\n", MAXLNLEN);
1551
rv2 = rv2->next_homonym;
1553
// advance to the next allomorph tag after the field just handled
p = strstr(p + plen, MORPH_ALLOMORPH);
1556
return (*result) ? mystrdup(result) : NULL;
1559
// Generates word forms from n morphological descriptions (desc) according to
// pattern. For each description the compound parts are copied into result,
// the stem is looked up, and suggest_hentry_gen() is tried first with the
// description's alternative prepended to pattern and then with pattern alone.
// Generated forms are collected in result2, separated by MSEP_REC. When
// nothing was generated and pattern contains a derivational suffix tag, a
// modified copy of pattern is prepared with that tag overwritten by the
// terminal suffix tag. Returns a mystrdup'ed copy of result2, or NULL/0 when
// n is 0, the affix manager is missing, or nothing was generated.
char * SuggestMgr::suggest_gen(char ** desc, int n, char * pattern) {
1560
char result[MAXLNLEN];
1561
char result2[MAXLNLEN];
1562
char newpattern[MAXLNLEN];
1564
if (n == 0) return 0;
1566
struct hentry * rv = NULL;
1567
if (!pAMgr) return NULL;
1569
// search affixed forms with and without derivational suffixes
1572
for (int k = 0; k < n; k++) {
1574
// add compound word parts (except the last one)
1575
char * s = (char *) desc[k];
1576
char * part = strstr(s, MORPH_PART);
1578
char * nextpart = strstr(part + 1, MORPH_PART);
1580
copy_field(result + strlen(result), part, MORPH_PART);
1582
nextpart = strstr(part + 1, MORPH_PART);
1590
char * alt = strstr(tok, " | ");
1593
alt = strstr(alt, " | ");
1595
// split the description into its alternatives
int pln = line_tok(tok, &pl, MSEP_ALT);
1596
for (int i = 0; i < pln; i++) {
1597
// remove inflectional and terminal suffixes
1598
char * is = strstr(pl[i], MORPH_INFL_SFX);
1600
char * ts = strstr(pl[i], MORPH_TERM_SFX);
1603
ts = strstr(pl[i], MORPH_TERM_SFX);
1605
char * st = strstr(s, MORPH_STEM);
1607
// tok is reused to hold the stem field that is looked up next
copy_field(tok, st, MORPH_STEM);
1608
rv = pAMgr->lookup(tok);
1610
char newpat[MAXLNLEN];
1611
// NOTE(review): strcpy/strcat are unbounded; confirm that pl[i] plus pattern
// always fits in MAXLNLEN.
strcpy(newpat, pl[i]);
1612
strcat(newpat, pattern);
1613
char * sg = suggest_hentry_gen(rv, newpat);
1614
// retry with the bare pattern when the combined pattern yields nothing
if (!sg) sg = suggest_hentry_gen(rv, pattern);
1617
int genl = line_tok(sg, &gen, MSEP_REC);
1620
for (int j = 0; j < genl; j++) {
1621
if (strstr(pl[i], MORPH_SURF_PFX)) {
1622
// append: separator, compound parts, surface prefix field, generated form
// NOTE(review): the writes through result2 + offset here are not
// length-checked against MAXLNLEN - confirm.
int r2l = strlen(result2);
1623
result2[r2l] = MSEP_REC;
1624
strcpy(result2 + r2l + 1, result);
1625
copy_field(result2 + strlen(result2), pl[i], MORPH_SURF_PFX);
1626
mystrcat(result2, gen[j], MAXLNLEN);
1628
// NOTE(review): sprintf is unbounded as well - confirm.
sprintf(result2 + strlen(result2), "%c%s%s",
1629
MSEP_REC, result, gen[j]);
1632
freelist(&gen, genl);
1634
rv = rv->next_homonym;
1641
// stop once something was generated or no derivational suffix remains
if (*result2 || !strstr(pattern, MORPH_DERI_SFX)) break;
1642
// work on a private copy so the caller's pattern is not modified
strcpy(newpattern, pattern);
1643
pattern = newpattern;
1644
char * ds = strstr(pattern, MORPH_DERI_SFX);
1646
// overwrite the derivational tag in place with the terminal-suffix tag
strncpy(ds, MORPH_TERM_SFX, MORPH_TAG_LEN);
1647
ds = strstr(pattern, MORPH_DERI_SFX);
1650
return (*result2 ? mystrdup(result2) : NULL);
1654
// generate an n-gram score comparing s1 and s2
// For each gram length j from 1 to n, a per-length count ns is accumulated
// into nscore. Two code paths are visible: one compares w_char buffers filled
// by u8_u16, the other uses strstr on a copy of s2. With NGRAM_LOWERING set in
// opt, s2 (or its copy) is lower-cased first. With NGRAM_LONGER_WORSE or
// NGRAM_ANY_MISMATCH set, a length-difference penalty is subtracted from
// nscore when it is positive.
1655
int SuggestMgr::ngram(int n, char * s1, const char * s2, int opt)
1665
l1 = u8_u16(su1, MAXSWL, s1);
1666
l2 = u8_u16(su2, MAXSWL, s2);
1667
// nothing to score if s2 is empty or either conversion reports an error
if ((l2 <= 0) || (l1 == -1)) return 0;
1668
// lowering dictionary word
1669
if (opt & NGRAM_LOWERING) mkallsmall_utf(su2, l2, langnum);
1670
for (int j = 1; j <= n; j++) {
1672
for (int i = 0; i <= (l1-j); i++) {
1673
for (int l = 0; l <= (l2-j); l++) {
1675
// compare the j-gram of su1 at i with the j-gram of su2 at l
for (k = 0; (k < j); k++) {
1676
w_char * c1 = su1 + i + k;
1677
w_char * c2 = su2 + l + k;
1678
if ((c1->l != c2->l) || (c1->h != c2->h)) break;
1686
nscore = nscore + ns;
1691
if (l2 == 0) return 0;
1693
// NOTE(review): t is used below with no visible null check or free();
// confirm both exist on lines not shown here.
char *t = mystrdup(s2);
1694
if (opt & NGRAM_LOWERING) mkallsmall(t, csconv);
1695
for (int j = 1; j <= n; j++) {
1697
for (int i = 0; i <= (l1-j); i++) {
1698
// temporarily terminate s1 after j characters so that strstr searches for
// exactly that j-gram; c keeps the overwritten character
char c = *(s1 + i + j);
1699
*(s1 + i + j) = '\0';
1700
if (strstr(t,(s1+i))) ns++;
1703
nscore = nscore + ns;
1710
// penalty: length difference beyond 2 (signed for LONGER_WORSE, absolute
// for ANY_MISMATCH)
if (opt & NGRAM_LONGER_WORSE) ns = (l2-l1)-2;
1711
if (opt & NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;
1712
// only a positive penalty is subtracted
ns = (nscore - ((ns > 0) ? ns : 0));
1716
// length of the left common substring of s1 and (decapitalised) s2
// With complexprefixes set, the visible code instead compares the last
// characters and returns 1 on a match. Otherwise the first character of s1
// must equal the first character of s2 or its lower-case form (else 0 is
// returned) and the following characters are compared exactly.
1717
int SuggestMgr::leftcommonsubstring(char * s1, const char * s2) {
1721
// decapitalize dictionary word
1722
if (complexprefixes) {
1723
int l1 = u8_u16(su1, MAXSWL, s1);
1724
int l2 = u8_u16(su2, MAXSWL, s2);
1725
// compare the final w_char of both strings as 16-bit values
// NOTE(review): l1 or l2 <= 0 would index before the buffer - confirm that
// callers never pass empty strings.
if (*((short *)su1+l1-1) == *((short *)su2+l2-1)) return 1;
1730
// NOTE(review): in the visible statement order su1/su2 are read here before
// the u8_u16 calls below fill them for this branch; confirm they are
// populated on a line not shown.
unsigned short idx = (su2->h << 8) + su2->l;
1731
unsigned short otheridx = (su1->h << 8) + su1->l;
1732
if (otheridx != idx &&
1733
(otheridx != unicodetolower(idx, langnum))) return 0;
1734
int l1 = u8_u16(su1, MAXSWL, s1);
1735
int l2 = u8_u16(su2, MAXSWL, s2);
1736
// advance i while the characters agree exactly (empty loop body)
for(i = 1; (i < l1) && (i < l2) &&
1737
(su1[i].l == su2[i].l) && (su1[i].h == su2[i].h); i++);
1741
if (complexprefixes) {
1742
int l1 = strlen(s1);
1743
int l2 = strlen(s2);
1744
// NOTE(review): both operands index s2, whereas the UTF-8 branch compares
// the last character of s1 with the last of s2. As written this reads
// s2[l1-1], which lies outside s2 when l1 > l2 + 1 - likely meant
// *(s1+l1-1); confirm.
if (*(s2+l1-1) == *(s2+l2-1)) return 1;
1747
// decapitalise dictionary word
1748
if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) return 0;
1751
} while ((*s1 == *s2) && (*s1 != '\0'));
1752
// distance advanced from the saved start pointer olds
return (int)(s1 - olds);
1758
// Compares s1 and s2 position by position. Positions that differ are tracked
// (the first two are remembered in diffpos), and *is_swap is set to 1 when
// there are exactly two differing positions, both strings end together, and
// the characters at those two positions are exchanged between the strings.
// The return value is produced on a line not visible here.
int SuggestMgr::commoncharacterpositions(char * s1, const char * s2, int * is_swap) {
1766
int l1 = u8_u16(su1, MAXSWL, s1);
1767
int l2 = u8_u16(su2, MAXSWL, s2);
1768
// decapitalize dictionary word
1769
if (complexprefixes) {
1770
// lower-case one character: the last one
mkallsmall_utf(su2+l2-1, 1, langnum);
1772
// lower-case one character: the first one
mkallsmall_utf(su2, 1, langnum);
1774
for (int i = 0; (i < l1) && (i < l2); i++) {
1775
// w_char pairs are compared as single 16-bit values
if (((short *) su1)[i] == ((short *) su2)[i]) {
1778
// remember at most the first two mismatch positions
if (diff < 2) diffpos[diff] = i;
1782
if ((diff == 2) && (l1 == l2) &&
1783
(((short *) su1)[diffpos[0]] == ((short *) su2)[diffpos[1]]) &&
1784
(((short *) su1)[diffpos[1]] == ((short *) su2)[diffpos[0]])) *is_swap = 1;
1789
// decapitalize dictionary word
1790
if (complexprefixes) {
1792
// t is declared on a line not visible here; presumably a writable copy
// of s2 - TODO confirm
*(t+l2-1) = csconv[((unsigned char)*(t+l2-1))].clower;
1794
// NOTE(review): the UTF-8 path above lower-cases a single character, while
// this call is given the whole buffer - confirm the asymmetry is intended.
mkallsmall(t, csconv);
1796
for (i = 0; (*(s1+i) != 0) && (*(t+i) != 0); i++) {
1797
if (*(s1+i) == *(t+i)) {
1800
if (diff < 2) diffpos[diff] = i;
1804
// both strings must have ended at the same index i
if ((diff == 2) && (*(s1+i) == 0) && (*(t+i) == 0) &&
1805
(*(s1+diffpos[0]) == *(t+diffpos[1])) &&
1806
(*(s1+diffpos[1]) == *(t+diffpos[0]))) *is_swap = 1;
1811
// Length of word: the value returned by u8_u16() in one branch, strlen(word)
// in the other. The branch condition is on a line not visible here
// (presumably the utf8 flag - TODO confirm).
int SuggestMgr::mystrlen(const char * word) {
1814
// w is a conversion buffer declared on a line not visible here
return u8_u16(w, MAXSWL, word);
1815
} else return strlen(word);
1818
// sort in decreasing order of score
// rsc holds the scores; rword and rword2 are parallel arrays whose elements
// are moved together with the scores. The loop headers and the remaining
// halves of the swaps are on lines not visible here.
1819
void SuggestMgr::bubblesort(char** rword, char** rword2, int* rsc, int n )
1825
// neighbouring pair is out of order when the earlier score is smaller
if (rsc[j-1] < rsc[j]) {
1826
int sctmp = rsc[j-1];
1827
char * wdtmp = rword[j-1];
1829
rword[j-1] = rword[j];
1833
// NOTE(review): a caller in this file passes NULL for rword2; the guard for
// that is presumably on a line not visible here - confirm.
wdtmp = rword2[j-1];
1834
rword2[j-1] = rword2[j];
1845
// longest common subsequence
// Fills two (m+1) x (n+1) tables stored row-major in malloc'ed char arrays:
// c holds subsequence lengths and b holds the direction taken for each cell
// (LCS_UPLEFT, LCS_UP or LCS_LEFT). Characters are compared as 16-bit w_char
// values when utf8 is set and as bytes of s / s2 otherwise. How the tables
// and lengths are handed back through l1, l2 and result is on lines not
// visible here.
1846
void SuggestMgr::lcs(const char * s, const char * s2, int * l1, int * l2, char ** result) {
1855
m = u8_u16(su, MAXSWL, s);
1856
n = u8_u16(su2, MAXSWL, s2);
1861
// NOTE(review): no null check on these allocations is visible, and the
// lengths are stored in char cells - confirm the checks exist and that the
// inputs are short enough for a char.
c = (char *) malloc((m + 1) * (n + 1));
1862
b = (char *) malloc((m + 1) * (n + 1));
1869
// first column and first row are zero
for (i = 1; i <= m; i++) c[i*(n+1)] = 0;
1870
for (j = 0; j <= n; j++) c[j] = 0;
1871
for (i = 1; i <= m; i++) {
1872
for (j = 1; j <= n; j++) {
1873
if ( ((utf8) && (*((short *) su+i-1) == *((short *)su2+j-1)))
1874
|| ((!utf8) && ((*(s+i-1)) == (*(s2+j-1))))) {
1875
// match: extend the diagonal neighbour's length by one
c[i*(n+1) + j] = c[(i-1)*(n+1) + j-1]+1;
1876
b[i*(n+1) + j] = LCS_UPLEFT;
1877
} else if (c[(i-1)*(n+1) + j] >= c[i*(n+1) + j-1]) {
1878
// no match: inherit the larger neighbour, preferring the cell above on ties
c[i*(n+1) + j] = c[(i-1)*(n+1) + j];
1879
b[i*(n+1) + j] = LCS_UP;
1881
c[i*(n+1) + j] = c[i*(n+1) + j-1];
1882
b[i*(n+1) + j] = LCS_LEFT;
1892
int SuggestMgr::lcslen(const char * s, const char* s2) {
1899
lcs(s, s2, &m, &n, &result);
1900
if (!result) return 0;
1903
while ((i != 0) && (j != 0)) {
1904
if (result[i*(n+1) + j] == LCS_UPLEFT) {
1908
} else if (result[i*(n+1) + j] == LCS_UP) {