1
/* ***** BEGIN LICENSE BLOCK *****
2
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
4
* The contents of this file are subject to the Mozilla Public License Version
5
* 1.1 (the "License"); you may not use this file except in compliance with
6
* the License. You may obtain a copy of the License at
7
* http://www.mozilla.org/MPL/
9
* Software distributed under the License is distributed on an "AS IS" basis,
10
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11
* for the specific language governing rights and limitations under the
14
* The Original Code is Hunspell, based on MySpell.
16
* The Initial Developers of the Original Code are
17
* Kevin Hendricks (MySpell) and Németh László (Hunspell).
18
* Portions created by the Initial Developers are Copyright (C) 2002-2005
19
* the Initial Developers. All Rights Reserved.
21
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
22
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
23
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
24
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
25
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
27
* Alternatively, the contents of this file may be used under the terms of
28
* either the GNU General Public License Version 2 or later (the "GPL"), or
29
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30
* in which case the provisions of the GPL or the LGPL are applicable instead
31
* of those above. If you wish to allow use of your version of this file only
32
* under the terms of either the GPL or the LGPL, and not to allow others to
33
* use your version of this file under the terms of the MPL, indicate your
34
* decision by deleting the provisions above and replace them with the notice
35
* and other provisions required by the GPL or the LGPL. If you do not delete
36
* the provisions above, a recipient may use your version of this file under
37
* the terms of any one of the MPL, the GPL or the LGPL.
39
* ***** END LICENSE BLOCK ***** */
41
* Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
42
* And Contributors. All rights reserved.
44
* Redistribution and use in source and binary forms, with or without
45
* modification, are permitted provided that the following conditions
48
* 1. Redistributions of source code must retain the above copyright
49
* notice, this list of conditions and the following disclaimer.
51
* 2. Redistributions in binary form must reproduce the above copyright
52
* notice, this list of conditions and the following disclaimer in the
53
* documentation and/or other materials provided with the distribution.
55
* 3. All modifications to the source code must be clearly marked as
56
* such. Binary redistributions based on modified source code
57
* must be clearly marked as modified versions in the documentation
58
* and/or other materials provided with the distribution.
60
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
61
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
62
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
63
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
64
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
65
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
66
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
67
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
68
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
69
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
70
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
79
#include "suggestmgr.hxx"
83
// w_char constant initialised from '\0' and '|'. badcharkey_utf() compares the
// neighbours found in the keyboard string against it before using them as
// replacement characters; extrachar_utf() uses it as a placeholder initial value.
const w_char W_VLINE = { '\0', '|' };
85
// Constructor: sets the n-gram and compound suggestion limits to their
// compile-time defaults, then reads the language number, keyboard string,
// encoding and the configured limits from the affix manager (pAMgr), and
// stores the keyboard string (ckey) and the try string (tryme) both as char
// strings and as w_char arrays.
// NOTE(review): the statements that choose between the w_char and the plain
// char branches are not visible in this listing.
SuggestMgr::SuggestMgr(const char * tryme, int maxn,
89
// register affix manager and check in string of chars to
90
// try when building candidate suggestions
109
maxngramsugs = MAXNGRAMSUGS;
110
maxcpdsugs = MAXCOMPOUNDSUGS;
113
langnum = pAMgr->get_langnum();
114
ckey = pAMgr->get_key_string();
115
nosplitsugs = pAMgr->get_nosplitsugs();
116
// a negative value from the affix manager keeps the default assigned above
if (pAMgr->get_maxngramsugs() >= 0)
117
maxngramsugs = pAMgr->get_maxngramsugs();
118
utf8 = pAMgr->get_utf8();
119
// same rule for the compound suggestion limit
if (pAMgr->get_maxcpdsugs() >= 0)
120
maxcpdsugs = pAMgr->get_maxcpdsugs();
123
// csconv is the case-conversion table looked up from the encoding name
char * enc = pAMgr->get_encoding();
124
csconv = get_current_cs(enc);
127
complexprefixes = pAMgr->get_complexprefixes();
133
// w_char form of the keyboard string: convert into t, then keep a heap copy
ckeyl = u8_u16(t, MAXSWL, ckey);
134
ckey_utf = (w_char *) malloc(ckeyl * sizeof(w_char));
135
if (ckey_utf) memcpy(ckey_utf, t, ckeyl * sizeof(w_char));
138
// char form: the length is simply the byte length of the keyboard string
ckeyl = strlen(ckey);
143
// private copy of the try string; ctryl is only set when the copy succeeded
ctry = mystrdup(tryme);
144
if (ctry) ctryl = strlen(ctry);
147
// w_char form of the try string, built the same way as ckey_utf
ctryl = u8_u16(t, MAXSWL, tryme);
148
ctry_utf = (w_char *) malloc(ctryl * sizeof(w_char));
149
if (ctry_utf) memcpy(ctry_utf, t, ctryl * sizeof(w_char));
156
// Destructor: releases the keyboard string and the try string, each in its
// char form (ckey, ctry) and its w_char form (ckey_utf, ctry_utf).
SuggestMgr::~SuggestMgr()
159
if (ckey) free(ckey);
161
if (ckey_utf) free(ckey_utf);
164
if (ctry) free(ctry);
166
if (ctry_utf) free(ctry_utf);
170
#ifdef MOZILLA_CLIENT
175
// Tests a single candidate and, if it is accepted, appends a copy to wlst.
//   wlst       - suggestion list being filled
//   candidate  - word to test
//   wl         - candidate length, forwarded to checkword()
//   ns         - number of suggestions currently in wlst
//   cpdsuggest, timer, timelimit - forwarded unchanged to checkword()
// Returns maxSug straight away when the list is already full. Callers treat a
// return value of -1 as an allocation failure.
int SuggestMgr::testsug(char** wlst, const char * candidate, int wl, int ns, int cpdsuggest,
176
int * timer, clock_t * timelimit) {
178
if (ns == maxSug) return maxSug;
179
// look for the candidate among the suggestions already collected
for (int k=0; k < ns; k++) {
180
if (strcmp(candidate,wlst[k]) == 0) {
185
// only candidates still flagged by cwrd are run through the dictionary check
if ((cwrd) && checkword(candidate, wl, cpdsuggest, timer, timelimit)) {
186
wlst[ns] = mystrdup(candidate);
187
if (wlst[ns] == NULL) {
188
// copy failed: free every suggestion stored so far
for (int j=0; j<ns; j++) free(wlst[j]);
196
// generate suggestions for a misspelled word
197
// pass in address of array of char * pointers
198
// onlycompoundsug: probably bad suggestions (need for ngram sugs, too)
200
// Main suggestion driver. Allocates (or receives via slst) a list of maxSug
// entries and runs the edit-based generators in a fixed order: capchars,
// replchars, mapchars, swapchar, longswapchar, badcharkey, extrachar,
// forgotchar, movechar, badchar, doubletwochars and finally twowords.
// The generator loop runs with cpdsuggest = 0 and then cpdsuggest = 1; the
// second pass is skipped once nocompoundtwowords has been set. During the
// second pass every step after capchars only runs while fewer than maxcpdsugs
// suggestions have been added since the pass began (oldSug).
//   slst            - address of the suggestion array
//   w               - misspelled word
//   nsug            - number of suggestions already present
//   onlycompoundsug - optional; set to 1 when suggestions exist and
//                     nocompoundtwowords is still 0
// Returns -1 when the list cannot be allocated.
int SuggestMgr::suggest(char*** slst, const char * w, int nsug,
201
int * onlycompoundsug)
203
int nocompoundtwowords = 0;
205
w_char word_utf[MAXSWL];
209
const char * word = w;
212
// word reversing wrapper for complex prefixes
213
if (complexprefixes) {
215
if (utf8) reverseword_utf(w2); else reverseword(w2);
222
wlst = (char **) malloc(maxSug * sizeof(char *));
223
if (wlst == NULL) return -1;
224
for (int i = 0; i < maxSug; i++) {
230
// w_char copy of the word for the *_utf generators
wl = u8_u16(word_utf, MAXSWL, word);
237
for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest++) {
239
// limit compound suggestion
240
if (cpdsuggest > 0) oldSug = nsug;
242
// suggestions for an uppercase word (html -> HTML)
243
if ((nsug < maxSug) && (nsug > -1)) {
244
nsug = (utf8) ? capchars_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
245
capchars(wlst, word, nsug, cpdsuggest);
248
// perhaps we made a typical spelling mistake
249
if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
250
nsug = replchars(wlst, word, nsug, cpdsuggest);
253
// perhaps we chose the wrong char from a related set
254
if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
255
nsug = mapchars(wlst, word, nsug, cpdsuggest);
258
// only suggest compound words when no other suggestion
259
if ((cpdsuggest == 0) && (nsug > nsugorig)) nocompoundtwowords=1;
261
// did we swap the order of chars by mistake
262
if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
263
nsug = (utf8) ? swapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
264
swapchar(wlst, word, nsug, cpdsuggest);
267
// did we swap the order of non adjacent chars by mistake
268
if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
269
nsug = (utf8) ? longswapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
270
longswapchar(wlst, word, nsug, cpdsuggest);
273
// did we just hit the wrong key in place of a good char (case and keyboard)
274
if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
275
nsug = (utf8) ? badcharkey_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
276
badcharkey(wlst, word, nsug, cpdsuggest);
279
// did we add a char that should not be there
280
if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
281
nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
282
extrachar(wlst, word, nsug, cpdsuggest);
286
// did we forget a char
287
if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
288
nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
289
forgotchar(wlst, word, nsug, cpdsuggest);
292
// did we move a char
293
if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
294
nsug = (utf8) ? movechar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
295
movechar(wlst, word, nsug, cpdsuggest);
298
// did we just hit the wrong key in place of a good char
299
if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
300
nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
301
badchar(wlst, word, nsug, cpdsuggest);
304
// did we double two characters
305
if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
306
nsug = (utf8) ? doubletwochars_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
307
doubletwochars(wlst, word, nsug, cpdsuggest);
310
// perhaps we forgot to hit space and two words ran together
// (skipped entirely when nosplitsugs is set)
311
if (!nosplitsugs && (nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
312
nsug = twowords(wlst, word, nsug, cpdsuggest);
315
} // repeating ``for'' statement compounding support
318
// we ran out of memory - we should free up as much as possible
319
for (int i = 0; i < maxSug; i++)
320
if (wlst[i] != NULL) free(wlst[i]);
325
// report that suggestions exist although nocompoundtwowords was never set
if (!nocompoundtwowords && (nsug > 0) && onlycompoundsug) *onlycompoundsug = 1;
331
// generate suggestions for a word with typical mistake
332
// pass in address of array of char * pointers
333
#ifdef HUNSPELL_EXPERIMENTAL
334
// Reduced variant of suggest(): only replchars, mapchars and twowords are
// tried. The generator loop runs with cpdsuggest = 0 and then 1; the second
// pass is skipped when the first pass left nsug above zero.
//   slst - address of the suggestion array
//   w    - misspelled word
//   nsug - number of suggestions already present
// Returns -1 when the list cannot be allocated.
int SuggestMgr::suggest_auto(char*** slst, const char * w, int nsug)
336
int nocompoundtwowords = 0;
340
char w2[MAXWORDUTF8LEN];
341
const char * word = w;
343
// word reversing wrapper for complex prefixes
344
if (complexprefixes) {
346
if (utf8) reverseword_utf(w2); else reverseword(w2);
353
wlst = (char **) malloc(maxSug * sizeof(char *));
354
if (wlst == NULL) return -1;
357
for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest++) {
359
// limit compound suggestion
360
if (cpdsuggest > 0) oldSug = nsug;
362
// perhaps we made a typical spelling mistake
363
if ((nsug < maxSug) && (nsug > -1))
364
nsug = replchars(wlst, word, nsug, cpdsuggest);
366
// perhaps we chose the wrong char from a related set
367
if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs)))
368
nsug = mapchars(wlst, word, nsug, cpdsuggest);
370
if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1;
372
// perhaps we forgot to hit space and two words ran together
// (tried only when check_forbidden() returns non-zero for the whole word)
374
if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs)) && check_forbidden(word, strlen(word))) {
375
nsug = twowords(wlst, word, nsug, cpdsuggest);
378
} // repeating ``for'' statement compounding support
381
// release every entry that is still allocated
for (int i=0;i<maxSug; i++)
382
if (wlst[i] != NULL) free(wlst[i]);
390
#endif // END OF HUNSPELL_EXPERIMENTAL CODE
392
// suggestions for an uppercase word (html -> HTML)
393
// w_char variant of capchars(): copies the wl characters of word, upper-cases
// the copy with mkallcap_utf(), converts it to a char string and offers that
// string to testsug(). Returns whatever testsug() returns.
int SuggestMgr::capchars_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
395
char candidate[MAXSWUTF8L];
396
w_char candidate_utf[MAXSWL];
397
memcpy(candidate_utf, word, wl * sizeof(w_char));
398
mkallcap_utf(candidate_utf, wl, langnum);
399
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
400
// the length is recomputed from the converted string rather than taken from wl
return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
403
// suggestions for an uppercase word (html -> HTML)
404
// Offers the all-capitals form of word as a suggestion: the word is copied
// into a std::string, upper-cased with mkallcap() using the csconv table, and
// passed to testsug(). Returns whatever testsug() returns.
int SuggestMgr::capchars(char** wlst, const char * word, int ns, int cpdsuggest)
406
std::string candidate(word);
407
mkallcap(candidate, csconv);
408
return testsug(wlst, candidate.data(), candidate.size(), ns, cpdsuggest, NULL, NULL);
411
// suggestions for when chose the wrong char out of a related set
412
// Entry point for the related-character (map table) suggestions. Returns ns
// unchanged for words shorter than two bytes, when there is no affix manager,
// or when the affix manager has no map table; otherwise starts the recursive
// search in map_related() at position 0 of both word and candidate.
int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest)
414
char candidate[MAXSWUTF8L];
419
int wl = strlen(word);
420
if (wl < 2 || ! pAMgr) return ns;
422
int nummap = pAMgr->get_nummap();
423
struct mapentry* maptable = pAMgr->get_maptable();
424
if (maptable==NULL) return ns;
428
// timer and timelimit are handed down so the recursion can stop early
return map_related(word, (char *) &candidate, 0, 0, wlst, cpdsuggest, ns, maptable, nummap, &timer, &timelimit);
431
// Recursive worker behind mapchars().
//   word / wn        - source word and the read position inside it
//   candidate / cn   - output buffer and the write position inside it
//   maptable, nummap - table of related-character sets and its size
//   timer, timelimit - forwarded to checkword(); the search stops once *timer is 0
// When the end of word is reached the candidate is terminated, compared with
// the existing suggestions, checked with checkword() and stored as a copy.
// Otherwise, for every set member that matches at word + wn, each member of
// that set is written at candidate + cn and the search recurses past the match.
// Returns the updated suggestion count, or -1 if copying the candidate fails.
int SuggestMgr::map_related(const char * word, char * candidate, int wn, int cn,
432
char** wlst, int cpdsuggest, int ns,
433
const mapentry* maptable, int nummap, int * timer, clock_t * timelimit)
435
if (*(word + wn) == '\0') {
437
*(candidate + cn) = '\0';
438
int wl = strlen(candidate);
439
for (int m=0; m < ns; m++) {
440
if (strcmp(candidate, wlst[m]) == 0) {
445
if ((cwrd) && checkword(candidate, wl, cpdsuggest, timer, timelimit)) {
447
wlst[ns] = mystrdup(candidate);
448
if (wlst[ns] == NULL) return -1;
455
// j walks the sets, k the members of set j that might match at word + wn
for (int j = 0; j < nummap; j++) {
456
for (int k = 0; k < maptable[j].len; k++) {
457
int len = strlen(maptable[j].set[k]);
458
if (strncmp(maptable[j].set[k], word + wn, len) == 0) {
460
// substitute every member of the matching set in turn
for (int l = 0; l < maptable[j].len; l++) {
461
strcpy(candidate + cn, maptable[j].set[l]);
462
ns = map_related(word, candidate, wn + len, strlen(candidate), wlst,
463
cpdsuggest, ns, maptable, nummap, timer, timelimit);
464
if (!(*timer)) return ns;
470
// copy the current character through unchanged and continue one position on
// NOTE(review): the condition guarding this step is not visible in this listing
*(candidate + cn) = *(word + wn);
471
ns = map_related(word, candidate, wn + 1, cn + 1, wlst, cpdsuggest,
472
ns, maptable, nummap, timer, timelimit);
477
// suggestions for a typical spelling mistake that
478
// differs from the right form by more than one letter.
479
// Applies the replacement table of the affix manager: every occurrence of
// reptable[i].pattern in word is replaced by reptable[i].pattern2 and the
// result is offered to testsug(). Entries flagged `end` only match when the
// rest of the word equals the pattern; entries flagged `start` only match at
// the beginning of the word. If the replacement contains spaces, the parts are
// checked separately as well.
// Returns ns unchanged for words shorter than two bytes or without a table,
// and -1 on allocation failure.
int SuggestMgr::replchars(char** wlst, const char * word, int ns, int cpdsuggest)
481
char candidate[MAXSWUTF8L];
484
int wl = strlen(word);
485
if (wl < 2 || ! pAMgr) return ns;
486
int numrep = pAMgr->get_numrep();
487
struct replentry* reptable = pAMgr->get_reptable();
488
if (reptable==NULL) return ns;
489
for (int i=0; i < numrep; i++ ) {
491
lenr = strlen(reptable[i].pattern2);
492
lenp = strlen(reptable[i].pattern);
493
// search every occurrence of the pattern in the word
494
while ((r=strstr(r, reptable[i].pattern)) != NULL && (!reptable[i].end || strlen(r) == strlen(reptable[i].pattern)) &&
495
(!reptable[i].start || r == word)) {
496
strcpy(candidate, word);
497
// give up on this pattern when the replaced word would not fit the buffer
if (r-word + lenr + strlen(r+lenp) >= MAXSWUTF8L) break;
498
// build: prefix of word + replacement + rest of word after the pattern
strcpy(candidate+(r-word),reptable[i].pattern2);
499
strcpy(candidate+(r-word)+lenr, r+lenp);
500
ns = testsug(wlst, candidate, wl-lenp+lenr, ns, cpdsuggest, NULL, NULL);
501
if (ns == -1) return -1;
502
// check REP suggestions with space
503
char * sp = strchr(candidate, ' ');
505
char * prev = candidate;
508
// the part before the space has to be a valid word ...
if (checkword(prev, strlen(prev), 0, NULL, NULL)) {
511
// ... and the part after the space goes through testsug()
ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL, NULL);
512
if (ns == -1) return -1;
515
// the last slot is overwritten with the complete multi-word candidate
wlst[ns - 1] = mystrdup(candidate);
516
if (!wlst[ns - 1]) return -1;
521
sp = strchr(prev, ' ');
524
r++; // search for the next letter
530
// perhaps we doubled two characters (pattern aba -> ababa, for example vacation -> vacacation)
531
// Looks for an accidentally doubled two-character sequence and offers the word
// with two characters removed. Positions where word[i] equals word[i-2] are
// examined; words shorter than five bytes are returned unchanged.
// NOTE(review): the bookkeeping that decides when a removal is actually tried
// is not visible in this listing.
int SuggestMgr::doubletwochars(char** wlst, const char * word, int ns, int cpdsuggest)
533
char candidate[MAXSWUTF8L];
535
int wl = strlen(word);
536
if (wl < 5 || ! pAMgr) return ns;
537
for (int i=2; i < wl; i++ ) {
538
if (word[i]==word[i-2]) {
541
// drop the two characters at positions i-1 and i
strcpy(candidate,word);
542
strcpy(candidate+i-1,word+i+1);
543
ns = testsug(wlst, candidate, wl-2, ns, cpdsuggest, NULL, NULL);
544
if (ns == -1) return -1;
554
// perhaps we doubled two characters (pattern aba -> ababa, for example vacation -> vacacation)
555
// w_char variant of doubletwochars(): positions where word[i] equals word[i-2]
// are examined and the word with the characters at i-1 and i removed is
// converted to a char string and offered to testsug(). Words shorter than five
// characters are returned unchanged.
// NOTE(review): the bookkeeping that decides when a removal is actually tried
// is not visible in this listing.
int SuggestMgr::doubletwochars_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
557
w_char candidate_utf[MAXSWL];
558
char candidate[MAXSWUTF8L];
560
if (wl < 5 || ! pAMgr) return ns;
561
for (int i=2; i < wl; i++) {
562
if (w_char_eq(word[i], word[i-2])) {
565
// keep word[0 .. i-2], then append everything after position i
memcpy(candidate_utf, word, (i - 1) * sizeof(w_char));
566
memcpy(candidate_utf+i-1, word+i+1, (wl-i-1) * sizeof(w_char));
567
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl-2);
568
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
569
if (ns == -1) return -1;
579
// error is wrong char in place of correct one (case and keyboard related version)
580
// Tries, for every position of the word, the upper-case form of the character
// (via the csconv table) and the characters that stand next to it in the
// keyboard string ckey. A '|' in ckey is never used as a neighbour.
// Returns the updated suggestion count, or -1 on allocation failure.
int SuggestMgr::badcharkey(char ** wlst, const char * word, int ns, int cpdsuggest)
583
char candidate[MAXSWUTF8L];
584
int wl = strlen(word);
585
strcpy(candidate, word);
586
// swap out each char one by one and try uppercase and neighbor
587
// keyboard chars in its place to see if that makes a good word
589
for (int i=0; i < wl; i++) {
591
// check with uppercase letters
592
candidate[i] = csconv[((unsigned char)tmpc)].cupper;
593
// only worth testing when upper-casing actually changed the character
if (tmpc != candidate[i]) {
594
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
595
if (ns == -1) return -1;
598
// check neighbor characters in keyboard string
600
char * loc = strchr(ckey, tmpc);
602
// left neighbour, unless loc is the first character or the neighbour is '|'
if ((loc > ckey) && (*(loc - 1) != '|')) {
603
candidate[i] = *(loc - 1);
604
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
605
if (ns == -1) return -1;
607
// right neighbour, unless it is '|' or the end of the keyboard string
if ((*(loc + 1) != '|') && (*(loc + 1) != '\0')) {
608
candidate[i] = *(loc + 1);
609
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
610
if (ns == -1) return -1;
612
// continue with the next occurrence of the same character in ckey
loc = strchr(loc + 1, tmpc);
619
// error is wrong char in place of correct one (case and keyboard related version)
620
// w_char variant of badcharkey(): for every position of the word, tries the
// upper-case form of the character (mkallcap_utf) and the characters next to
// it in the w_char keyboard string ckey_utf (ckeyl entries). Neighbours equal
// to W_VLINE are skipped. The original character is restored after each try.
// Returns the updated suggestion count, or -1 on allocation failure.
int SuggestMgr::badcharkey_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
623
w_char candidate_utf[MAXSWL];
624
char candidate[MAXSWUTF8L];
625
memcpy(candidate_utf, word, wl * sizeof(w_char));
626
// swap out each char one by one and try its uppercase form and its
627
// keyboard neighbors in its place to see if that makes a good word
628
for (int i=0; i < wl; i++) {
629
tmpc = candidate_utf[i];
630
// check with uppercase letters
631
mkallcap_utf(candidate_utf + i, 1, langnum);
632
if (!w_char_eq(tmpc, candidate_utf[i])) {
633
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
634
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
635
if (ns == -1) return -1;
636
candidate_utf[i] = tmpc;
638
// check neighbor characters in keyboard string
640
w_char * loc = ckey_utf;
641
// advance to the first occurrence of the character in the keyboard string
while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)) loc++;
642
while (loc < (ckey_utf + ckeyl)) {
643
// left neighbour, unless loc is the first entry or the neighbour is W_VLINE
if ((loc > ckey_utf) && !w_char_eq(*(loc - 1), W_VLINE)) {
644
candidate_utf[i] = *(loc - 1);
645
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
646
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
647
if (ns == -1) return -1;
649
// right neighbour, unless loc is the last entry or the neighbour is W_VLINE
if (((loc + 1) < (ckey_utf + ckeyl)) && !w_char_eq(*(loc + 1), W_VLINE)) {
650
candidate_utf[i] = *(loc + 1);
651
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
652
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
653
if (ns == -1) return -1;
655
// move on to the next occurrence of the character, if any
do { loc++; } while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc));
657
candidate_utf[i] = tmpc;
662
// error is wrong char in place of correct one
663
// Replaces each character of the word, from the last position to the first,
// with each character of the try string ctry (outer loop) and offers the
// result to testsug(). A timer (MINTIMER, clock()) is passed to testsug(); the
// search returns the current count as soon as the timer reaches zero.
// Returns -1 on allocation failure.
int SuggestMgr::badchar(char ** wlst, const char * word, int ns, int cpdsuggest)
666
char candidate[MAXSWUTF8L];
667
clock_t timelimit = clock();
668
int timer = MINTIMER;
669
int wl = strlen(word);
670
strcpy(candidate, word);
671
// swap out each char one by one and try all the tryme
672
// chars in its place to see if that makes a good word
673
for (int j=0; j < ctryl; j++) {
674
for (int i=wl-1; i >= 0; i--) {
676
// replacing a character with itself would just recreate the word
if (ctry[j] == tmpc) continue;
677
candidate[i] = ctry[j];
678
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, &timer, &timelimit);
679
if (ns == -1) return -1;
680
if (!timer) return ns;
687
// error is wrong char in place of correct one
688
// w_char variant of badchar(): replaces each character of the word, from the
// last position to the first, with each entry of ctry_utf, converts the result
// to a char string and offers it to testsug(). The original character is put
// back after each try. The search returns the current count as soon as the
// timer reaches zero, and -1 on allocation failure.
int SuggestMgr::badchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
691
w_char candidate_utf[MAXSWL];
692
char candidate[MAXSWUTF8L];
693
clock_t timelimit = clock();
694
int timer = MINTIMER;
695
memcpy(candidate_utf, word, wl * sizeof(w_char));
696
// swap out each char one by one and try all the tryme
697
// chars in its place to see if that makes a good word
698
for (int j=0; j < ctryl; j++) {
699
for (int i=wl-1; i >= 0; i--) {
700
tmpc = candidate_utf[i];
701
// replacing a character with itself would just recreate the word
if (w_char_eq(tmpc, ctry_utf[j])) continue;
702
candidate_utf[i] = ctry_utf[j];
703
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
704
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit);
705
if (ns == -1) return -1;
706
if (!timer) return ns;
707
candidate_utf[i] = tmpc;
713
// error is word has an extra letter it does not need
714
// w_char variant of extrachar(): offers every form of the word with one
// character left out. The scan runs from the last position to the first and
// each candidate is the first wl - 1 entries of candidate_utf converted to a
// char string. Words shorter than two characters are returned unchanged.
// NOTE(review): the statements that maintain tmpc between iterations are not
// visible in this listing.
int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
716
char candidate[MAXSWUTF8L];
717
w_char candidate_utf[MAXSWL];
719
w_char tmpc = W_VLINE; // not used value, only for VCC warning message
720
if (wl < 2) return ns;
721
// try omitting one char of word at a time
722
memcpy(candidate_utf, word, wl * sizeof(w_char));
723
for (p = candidate_utf + wl - 1; p >= candidate_utf; p--) {
725
// for every position except the last one, overwrite *p with tmpc
if (p < candidate_utf + wl - 1) *p = tmpc;
726
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1);
727
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
728
if (ns == -1) return -1;
734
// error is word has an extra letter it does not need
735
// Offers every form of the word with one character left out; each candidate
// is tested with length wl - 1. The scan runs from the last position to the
// first. Words shorter than two bytes are returned unchanged.
// NOTE(review): the statements that rearrange the buffer inside the loop are
// not visible in this listing.
int SuggestMgr::extrachar(char** wlst, const char * word, int ns, int cpdsuggest)
738
char candidate[MAXSWUTF8L];
740
int wl = strlen(word);
741
if (wl < 2) return ns;
742
// try omitting one char of word at a time
743
strcpy (candidate, word);
744
for (p = candidate + wl - 1; p >=candidate; p--) {
747
ns = testsug(wlst, candidate, wl-1, ns, cpdsuggest, NULL, NULL);
748
if (ns == -1) return -1;
754
// error is missing a letter it needs
755
// Offers the word with one extra character inserted. For every character of
// the try string (ctryl of them) the insertion position p runs from the end of
// the word (the terminator) back to the first character, and each candidate is
// tested with length wl + 1. The search returns the current count as soon as
// the timer reaches zero, and -1 on allocation failure.
// NOTE(review): the statements that perform the insertion are not visible in
// this listing.
int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsuggest)
757
// four bytes larger than the other candidate buffers, since candidates are longer than word
char candidate[MAXSWUTF8L + 4];
759
clock_t timelimit = clock();
760
int timer = MINTIMER;
761
int wl = strlen(word);
762
// try inserting a tryme character before every letter (and the null terminator)
763
for (int i = 0; i < ctryl; i++) {
764
strcpy(candidate, word);
765
for (p = candidate + wl; p >= candidate; p--) {
768
ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, &timer, &timelimit);
769
if (ns == -1) return -1;
770
if (!timer) return ns;
776
// error is missing a letter it needs
777
// w_char variant of forgotchar(): for every entry of the try string the
// insertion position p runs from the end of the word back to the first
// character; each candidate of wl + 1 characters is converted to a char string
// and offered to testsug(). The search returns the current count as soon as
// the timer reaches zero, and -1 on allocation failure.
// NOTE(review): the statements that perform the insertion are not visible in
// this listing.
int SuggestMgr::forgotchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
779
// one entry larger than MAXSWL because candidates hold wl + 1 characters
w_char candidate_utf[MAXSWL + 1];
780
char candidate[MAXSWUTF8L + 4];
782
clock_t timelimit = clock();
783
int timer = MINTIMER;
784
// try inserting a tryme character at the end of the word and before every letter
785
for (int i = 0; i < ctryl; i++) {
786
memcpy (candidate_utf, word, wl * sizeof(w_char));
787
for (p = candidate_utf + wl; p >= candidate_utf; p--) {
790
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);
791
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit);
792
if (ns == -1) return -1;
793
if (!timer) return ns;
800
/* error is should have been two words */
801
// Tries to split the word into two dictionary words. The word is copied to
// candidate + 1, so p[-1] is always inside the buffer while p walks the split
// positions. In UTF-8 mode a split is only made at the end of a multi-byte
// character. Both halves are checked with checkword(); for Hungarian
// (LANG_hu) the separator becomes '-' in the cases tested below. A second,
// dash-separated form is added when the try string contains 'a' or '-' and
// both halves are longer than one byte.
// Returns ns unchanged for words shorter than three bytes, and -1 on
// allocation failure.
// NOTE(review): the statements that move the separator through the buffer are
// not visible in this listing.
int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest)
803
char candidate[MAXSWUTF8L];
810
if (wl < 3) return ns;
812
if (langnum == LANG_hu) forbidden = check_forbidden(word, wl);
814
strcpy(candidate + 1, word);
815
// split the string into two pieces after every char
816
// if both pieces are good words make them a suggestion
817
for (p = candidate + 1; p[1] != '\0'; p++) {
819
// go to end of the UTF-8 character
820
while (utf8 && ((p[1] & 0xc0) == 0x80)) {
824
if (utf8 && p[1] == '\0') break; // last UTF-8 character
826
// c1: result for the first piece, c2: result for the second piece
c1 = checkword(candidate,strlen(candidate), cpdsuggest, NULL, NULL);
828
c2 = checkword((p+1),strlen(p+1), cpdsuggest, NULL, NULL);
832
// spec. Hungarian code (need a better compound word support)
833
if ((langnum == LANG_hu) && !forbidden &&
834
// if 3 repeating letter, use - instead of space
835
(((p[-1] == p[1]) && (((p>candidate+1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||
836
// or multiple compounding, with more, than 6 syllables
837
((c1 == 3) && (c2 >= 2)))) *p = '-';
840
// look for the candidate among the suggestions already collected
for (int k=0; k < ns; k++) {
841
if (strcmp(candidate,wlst[k]) == 0) {
848
wlst[ns] = mystrdup(candidate);
849
if (wlst[ns] == NULL) return -1;
853
// add two word suggestion with dash, if TRY string contains
855
// NOTE: cwrd isn't modified for REP twoword sugg.
856
if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) &&
857
mystrlen(p + 1) > 1 &&
858
mystrlen(candidate) - mystrlen(p) > 1) {
860
for (int k=0; k < ns; k++) {
861
if (strcmp(candidate,wlst[k]) == 0) {
868
wlst[ns] = mystrdup(candidate);
869
if (wlst[ns] == NULL) return -1;
881
// error is adjacent letter were swapped
882
// Offers the word with two adjacent characters exchanged, one pair at a time.
// For words of four or five bytes it additionally tries double swaps: first
// and last pair exchanged together, and then first character kept with the
// characters at positions 1 and 2 exchanged.
// Returns the updated suggestion count, or -1 on allocation failure.
// NOTE(review): the statements performing the single swaps, and any condition
// around the second double swap, are not visible in this listing.
int SuggestMgr::swapchar(char ** wlst, const char * word, int ns, int cpdsuggest)
884
char candidate[MAXSWUTF8L];
888
// try swapping adjacent chars one by one
889
strcpy(candidate, word);
890
for (p = candidate; p[1] != 0; p++) {
894
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
895
if (ns == -1) return -1;
899
// try double swaps for short words
900
// ahev -> have, owudl -> would
901
if (wl == 4 || wl == 5) {
902
// exchange the first two and the last two characters
candidate[0] = word[1];
903
candidate[1] = word[0];
904
candidate[2] = word[2];
905
candidate[wl - 2] = word[wl - 1];
906
candidate[wl - 1] = word[wl - 2];
907
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
908
if (ns == -1) return -1;
910
// restore position 0 and exchange positions 1 and 2 instead;
// the last two positions keep the exchange made above
candidate[0] = word[0];
911
candidate[1] = word[2];
912
candidate[2] = word[1];
913
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
914
if (ns == -1) return -1;
920
// error is adjacent letter were swapped
921
// w_char variant of swapchar(): offers the word with two adjacent characters
// exchanged, and for words of four or five characters also the double swaps.
// The byte length of the converted candidate is computed once (while len is
// still 0) and reused for all later testsug() calls.
// Returns the updated suggestion count, or -1 on allocation failure.
// NOTE(review): the statements performing the single swaps, and any condition
// around the second double swap, are not visible in this listing.
int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
923
w_char candidate_utf[MAXSWL];
924
char candidate[MAXSWUTF8L];
928
// try swapping adjacent chars one by one
929
memcpy (candidate_utf, word, wl * sizeof(w_char));
930
for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) {
934
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
935
if (len == 0) len = strlen(candidate);
936
ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
937
if (ns == -1) return -1;
941
// try double swaps for short words
942
// ahev -> have, owudl -> would, suodn -> sound
943
if (wl == 4 || wl == 5) {
944
// exchange the first two and the last two characters
candidate_utf[0] = word[1];
945
candidate_utf[1] = word[0];
946
candidate_utf[2] = word[2];
947
candidate_utf[wl - 2] = word[wl - 1];
948
candidate_utf[wl - 1] = word[wl - 2];
949
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
950
ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
951
if (ns == -1) return -1;
953
// restore position 0 and exchange positions 1 and 2 instead;
// the last two positions keep the exchange made above
candidate_utf[0] = word[0];
954
candidate_utf[1] = word[2];
955
candidate_utf[2] = word[1];
956
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
957
ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
958
if (ns == -1) return -1;
964
// error is not adjacent letter were swapped
965
// Offers the word with two non-adjacent characters exchanged: every pair of
// positions (p, q) more than one apart is considered and the resulting
// candidate is tested with the original length wl.
// Returns the updated suggestion count, or -1 on allocation failure.
// NOTE(review): the statements that exchange and restore the two characters
// are not visible in this listing.
int SuggestMgr::longswapchar(char ** wlst, const char * word, int ns, int cpdsuggest)
967
char candidate[MAXSWUTF8L];
972
// try swapping not adjacent chars one by one
973
strcpy(candidate, word);
974
for (p = candidate; *p != 0; p++) {
975
for (q = candidate; *q != 0; q++) {
976
// skip identical and neighbouring positions
if (abs((int)(p-q)) > 1) {
980
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
981
if (ns == -1) return -1;
991
// error is that two non-adjacent letters were swapped
992
// w_char variant of longswapchar(): every pair of positions (p, q) more than
// one apart is considered, and the candidate is converted to a char string
// and offered to testsug().
// Returns the updated suggestion count, or -1 on allocation failure.
// NOTE(review): the statements that exchange and restore the two characters
// are not visible in this listing.
int SuggestMgr::longswapchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
994
w_char candidate_utf[MAXSWL];
995
char candidate[MAXSWUTF8L];
999
// try swapping not adjacent chars
1000
memcpy (candidate_utf, word, wl * sizeof(w_char));
1001
for (p = candidate_utf; p < (candidate_utf + wl); p++) {
1002
for (q = candidate_utf; q < (candidate_utf + wl); q++) {
1003
// skip identical and neighbouring positions
if (abs((int)(p-q)) > 1) {
1007
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
1008
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
1009
if (ns == -1) return -1;
1018
// error is a letter was moved
1019
// Offers the word with one character moved to another position. The first
// pass walks q forward from p + 1, the second pass walks q backward from
// p - 1; both stop once the distance reaches 10. A distance below two is
// skipped because that case is an adjacent swap. The candidate is reset to
// the original word after each starting position.
// Returns the updated suggestion count, or -1 on allocation failure.
// NOTE(review): the statements that shift the characters are not visible in
// this listing.
int SuggestMgr::movechar(char ** wlst, const char * word, int ns, int cpdsuggest)
1021
char candidate[MAXSWUTF8L];
1026
int wl=strlen(word);
1027
// try moving a char
1028
strcpy(candidate, word);
1029
// forward pass
for (p = candidate; *p != 0; p++) {
1030
for (q = p + 1; (*q != 0) && ((q - p) < 10); q++) {
1034
if ((q-p) < 2) continue; // omit swap char
1035
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
1036
if (ns == -1) return -1;
1038
strcpy(candidate, word);
1040
// backward pass
for (p = candidate + wl - 1; p > candidate; p--) {
1041
for (q = p - 1; (q >= candidate) && ((p - q) < 10); q--) {
1045
if ((p-q) < 2) continue; // omit swap char
1046
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
1047
if (ns == -1) return -1;
1049
strcpy(candidate, word);
1054
// error is a letter was moved
1055
// w_char variant of movechar(): a forward pass (q from p + 1) and a backward
// pass (q from p - 1), both stopping once the distance reaches 10 and skipping
// distances below two. Each candidate is converted to a char string and
// offered to testsug(); candidate_utf is reset from word after each starting
// position.
// Returns the updated suggestion count, or -1 on allocation failure.
// NOTE(review): the statements that shift the characters are not visible in
// this listing.
int SuggestMgr::movechar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
1057
w_char candidate_utf[MAXSWL];
1058
char candidate[MAXSWUTF8L];
1062
// try moving a char
1063
memcpy (candidate_utf, word, wl * sizeof(w_char));
1064
// forward pass
for (p = candidate_utf; p < (candidate_utf + wl); p++) {
1065
for (q = p + 1; (q < (candidate_utf + wl)) && ((q - p) < 10); q++) {
1069
if ((q-p) < 2) continue; // omit swap char
1070
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
1071
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
1072
if (ns == -1) return -1;
1074
memcpy (candidate_utf, word, wl * sizeof(w_char));
1076
// backward pass
for (p = candidate_utf + wl - 1; p > candidate_utf; p--) {
1077
for (q = p - 1; (q >= candidate_utf) && ((p - q) < 10); q--) {
1081
if ((p-q) < 2) continue; // omit swap char
1082
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
1083
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
1084
if (ns == -1) return -1;
1086
memcpy (candidate_utf, word, wl * sizeof(w_char));
1091
// generate a set of suggestions for very poorly spelled words
1092
int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md)
1101
// exhaustively search through all root words
1102
// keeping track of the MAX_ROOTS most similar root words
1103
struct hentry * roots[MAX_ROOTS];
1104
char * rootsphon[MAX_ROOTS];
1105
int scores[MAX_ROOTS];
1106
int scoresphon[MAX_ROOTS];
1107
for (i = 0; i < MAX_ROOTS; i++) {
1109
scores[i] = -100 * i;
1110
rootsphon[i] = NULL;
1111
scoresphon[i] = -100 * i;
1114
lpphon = MAX_ROOTS - 1;
1115
int low = NGRAM_LOWERING;
1119
const char * word = w;
1121
// word reversing wrapper for complex prefixes
1122
if (complexprefixes){
1124
if (utf8) reverseword_utf(w2); else reverseword(w2);
1128
char mw[MAXSWUTF8L];
1130
int nc = strlen(word);
1131
int n = (utf8) ? u8_u16(u8, MAXSWL, word) : nc;
1133
// set character based ngram suggestion for words with non-BMP Unicode characters
1135
utf8 = 0; // XXX not state-free
1141
struct hentry* hp = NULL;
1143
phonetable * ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;
1144
char target[MAXSWUTF8L];
1145
std::string candidate;
1148
std::vector<w_char> _w;
1149
int _wl = u8_u16(_w, word);
1150
mkallcap_utf(_w, _wl, langnum);
1151
u16_u8(candidate, _w);
1153
candidate.assign(word);
1154
if (!nonbmp) mkallcap(candidate, csconv);
1156
phonet(candidate.c_str(), target, nc, *ph); // XXX phonet() is 8-bit (nc, not n)
1159
FLAG forbiddenword = pAMgr ? pAMgr->get_forbiddenword() : FLAG_NULL;
1160
FLAG nosuggest = pAMgr ? pAMgr->get_nosuggest() : FLAG_NULL;
1161
FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL;
1162
FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL;
1164
for (i = 0; i < md; i++) {
1165
while (0 != (hp = (pHMgr[i])->walk_hashtable(col, hp))) {
1166
if ((hp->astr) && (pAMgr) &&
1167
(TESTAFF(hp->astr, forbiddenword, hp->alen) ||
1168
TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) ||
1169
TESTAFF(hp->astr, nosuggest, hp->alen) ||
1170
TESTAFF(hp->astr, nongramsuggest, hp->alen) ||
1171
TESTAFF(hp->astr, onlyincompound, hp->alen))) continue;
1173
sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +
1174
leftcommonsubstring(word, HENTRY_WORD(hp));
1176
// check special pronunciation
1177
if ((hp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {
1178
int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +
1179
+ leftcommonsubstring(word, f);
1180
if (sc2 > sc) sc = sc2;
1183
int scphon = -20000;
1184
if (ph && (sc > 2) && (abs(n - (int) hp->clen) <= 3)) {
1185
char target2[MAXSWUTF8L];
1187
std::vector<w_char> _w;
1188
int _wl = u8_u16(_w, HENTRY_WORD(hp));
1189
mkallcap_utf(_w, _wl, langnum);
1190
u16_u8(candidate, _w);
1192
candidate.assign(HENTRY_WORD(hp));
1193
mkallcap(candidate, csconv);
1195
phonet(candidate.c_str(), target2, -1, *ph);
1196
scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE);
1199
if (sc > scores[lp]) {
1203
for (j=0; j < MAX_ROOTS; j++)
1204
if (scores[j] < lval) {
1211
if (scphon > scoresphon[lpphon]) {
1212
scoresphon[lpphon] = scphon;
1213
rootsphon[lpphon] = HENTRY_WORD(hp);
1215
for (j=0; j < MAX_ROOTS; j++)
1216
if (scoresphon[j] < lval) {
1218
lval = scoresphon[j];
1223
// find minimum threshold for a passable suggestion
1224
// mangle original word three different ways
1225
// and score them to generate a minimum acceptable score
1227
for (int sp = 1; sp < 4; sp++) {
1229
for (int k=sp; k < n; k+=4) *((unsigned short *) u8 + k) = '*';
1230
u16_u8(mw, MAXSWUTF8L, u8, n);
1231
thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
1234
for (int k=sp; k < n; k+=4) *(mw + k) = '*';
1235
thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
1238
thresh = thresh / 3;
1241
// now expand affixes on each of these root words and
1242
// and use length adjusted ngram scores to select
1243
// possible suggestions
1244
char * guess[MAX_GUESS];
1245
char * guessorig[MAX_GUESS];
1246
int gscore[MAX_GUESS];
1247
for(i=0;i<MAX_GUESS;i++) {
1249
guessorig[i] = NULL;
1250
gscore[i] = -100 * i;
1255
struct guessword * glst;
1256
glst = (struct guessword *) calloc(MAX_WORDS,sizeof(struct guessword));
1258
if (nonbmp) utf8 = 1;
1262
for (i = 0; i < MAX_ROOTS; i++) {
1264
struct hentry * rp = roots[i];
1265
int nw = pAMgr->expand_rootword(glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen,
1266
rp->astr, rp->alen, word, nc,
1267
((rp->var & H_OPT_PHON) ? copy_field(f, HENTRY_DATA(rp), MORPH_PHON) : NULL));
1269
for (int k = 0; k < nw ; k++) {
1270
sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) +
1271
leftcommonsubstring(word, glst[k].word);
1274
if (sc > gscore[lp]) {
1277
if (guessorig[lp]) {
1278
free(guessorig[lp]);
1279
guessorig[lp] = NULL;
1283
guess[lp] = glst[k].word;
1284
guessorig[lp] = glst[k].orig;
1286
for (j=0; j < MAX_GUESS; j++)
1287
if (gscore[j] < lval) {
1293
if (glst[k].orig) free(glst[k].orig);
1297
if (glst[k].orig) free(glst[k].orig);
1304
// now we are done generating guesses
1305
// sort in order of decreasing score
1308
bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);
1309
if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);
1311
// weight suggestions with a similarity index, based on
1312
// the longest common subsequence algorithm, and re-sort
1318
int maxd = pAMgr->get_maxdiff();
1319
if (maxd >= 0) fact = (10.0 - maxd)/5.0;
1322
for (i=0; i < MAX_GUESS; i++) {
1324
// lowering guess[i]
1328
std::vector<w_char> _w;
1329
len = u8_u16(_w, guess[i]);
1330
mkallsmall_utf(_w, len, langnum);
1333
gl.assign(guess[i]);
1334
if (!nonbmp) mkallsmall(gl, csconv);
1335
len = strlen(guess[i]);
1338
int _lcs = lcslen(word, gl.c_str());
1340
// same characters with different casing
1341
if ((n == len) && (n == _lcs)) {
1345
// using 2-gram instead of 3, and other weighting
1347
re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
1348
ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
1351
// length of longest common subsequence minus length difference
1352
2 * _lcs - abs((int) (n - len)) +
1353
// weight length of the left common substring
1354
leftcommonsubstring(word, gl.c_str()) +
1355
// weight equal character positions
1356
(!nonbmp && commoncharacterpositions(word, gl.c_str(), &is_swap) ? 1: 0) +
1357
// swap character (not neighboring)
1358
((is_swap) ? 10 : 0) +
1360
ngram(4, word, gl, NGRAM_ANY_MISMATCH + low) +
1363
// different limit for dictionaries with PHONE rules
1364
(ph ? (re < len * fact ? -1000 : 0) : (re < (n + len)*fact? -1000 : 0));
1368
bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);
1371
if (ph) for (i=0; i < MAX_ROOTS; i++) {
1373
// lowering rootphon[i]
1377
std::vector<w_char> _w;
1378
len = u8_u16(_w, rootsphon[i]);
1379
mkallsmall_utf(_w, len, langnum);
1382
gl.assign(rootsphon[i]);
1383
if (!nonbmp) mkallsmall(gl, csconv);
1384
len = strlen(rootsphon[i]);
1387
// heuristic weighting of ngram scores
1388
scoresphon[i] += 2 * lcslen(word, gl) - abs((int) (n - len)) +
1389
// weight length of the left common substring
1390
leftcommonsubstring(word, gl.c_str());
1394
if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);
1400
for (i=0; i < MAX_GUESS; i++) {
1402
if ((ns < oldns + maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) {
1404
// leave only excellent suggestions, if exists
1405
if (gscore[i] > 1000) same = 1; else if (gscore[i] < -100) {
1407
// keep the best ngram suggestions, unless in ONLYMAXDIFF mode
1408
if (ns > oldns || (pAMgr && pAMgr->get_onlymaxdiff())) {
1410
if (guessorig[i]) free(guessorig[i]);
1414
for (j = 0; j < ns; j++) {
1415
// don't suggest previous suggestions or a previous suggestion with prefixes or affixes
1416
if ((!guessorig[i] && strstr(guess[i], wlst[j])) ||
1417
(guessorig[i] && strstr(guessorig[i], wlst[j])) ||
1418
// check forbidden words
1419
!checkword(guess[i], strlen(guess[i]), 0, NULL, NULL)) {
1425
wlst[ns++] = guess[i];
1428
wlst[ns-1] = guessorig[i];
1432
if (guessorig[i]) free(guessorig[i]);
1436
if (guessorig[i]) free(guessorig[i]);
1442
if (ph) for (i=0; i < MAX_ROOTS; i++) {
1444
if ((ns < oldns + MAXPHONSUGS) && (ns < maxSug)) {
1446
for (j = 0; j < ns; j++) {
1447
// don't suggest previous suggestions or a previous suggestion with prefixes or affixes
1448
if (strstr(rootsphon[i], wlst[j]) ||
1449
// check forbidden words
1450
!checkword(rootsphon[i], strlen(rootsphon[i]), 0, NULL, NULL)) {
1456
wlst[ns++] = mystrdup(rootsphon[i]);
1457
if (!wlst[ns - 1]) return ns - 1;
1463
if (nonbmp) utf8 = 1;
1468
// see if a candidate suggestion is spelled correctly
1469
// needs to check both root words and words with affixes
1471
// obsolote MySpell-HU modifications:
1472
// return value 2 and 3 marks compounding with hyphen (-)
1473
// `3' marks roots without suffix
1474
int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * timer, clock_t * timelimit)
1476
struct hentry * rv=NULL;
1477
struct hentry * rv2=NULL;
1483
if (!(*timer) && timelimit) {
1484
if ((clock() - *timelimit) > TIMELIMIT) return 0;
1485
*timer = MAXPLUSTIMER;
1490
if (cpdsuggest==1) {
1491
if (pAMgr->get_compound()) {
1492
rv = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 1, 0); //EXT
1493
if (rv && (!(rv2 = pAMgr->lookup(word)) || !rv2->astr ||
1494
!(TESTAFF(rv2->astr,pAMgr->get_forbiddenword(),rv2->alen) ||
1495
TESTAFF(rv2->astr,pAMgr->get_nosuggest(),rv2->alen)))) return 3; // XXX obsolote categorisation + only ICONV needs affix flag check?
1500
rv = pAMgr->lookup(word);
1503
if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)
1504
|| TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0;
1506
if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) ||
1507
TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1508
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {
1509
rv = rv->next_homonym;
1512
} else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix + suffix XXX
1517
rv = pAMgr->suffix_check(word, len, 0, NULL, NULL, 0, NULL); // only suffix
1520
if (!rv && pAMgr->have_contclass()) {
1521
rv = pAMgr->suffix_check_twosfx(word, len, 0, NULL, FLAG_NULL);
1522
if (!rv) rv = pAMgr->prefix_check_twosfx(word, len, 1, FLAG_NULL);
1525
// check forbidden words
1526
if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) ||
1527
TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1528
TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) ||
1529
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0;
1531
if (rv) { // XXX obsolote
1532
if ((pAMgr->get_compoundflag()) &&
1533
TESTAFF(rv->astr, pAMgr->get_compoundflag(), rv->alen)) return 2 + nosuffix;
1540
int SuggestMgr::check_forbidden(const char * word, int len)
1542
struct hentry * rv = NULL;
1545
rv = pAMgr->lookup(word);
1546
if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) ||
1547
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL;
1548
if (!(pAMgr->prefix_check(word,len,1)))
1549
rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+suffix, suffix
1550
// check forbidden words
1551
if ((rv) && (rv->astr) && TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)) return 1;
1556
#ifdef HUNSPELL_EXPERIMENTAL
1557
// suggest possible stems
//
// Runs the suffix analysis of the affix manager on `w` and hands the
// collected strings back through the suggestion list.
// NOTE(review): the embedded line numbering of this listing has gaps, so some
// statements of this function (for example the declaration of wlst and the
// closing lines) are not visible here; the comments below describe only the
// visible statements.
1558
int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug)
1562
// result of the suffix analysis; assigned but not read in the visible lines
struct hentry * rv = NULL;
1564
// scratch buffer used for the reversed word
char w2[MAXSWUTF8L];
1565
// the word that is actually analysed (starts out as the caller's word)
const char * word = w;
1567
// word reversing wrapper for complex prefixes
1568
if (complexprefixes) {
1570
// the reversal routine depends on the encoding mode
if (utf8) reverseword_utf(w2); else reverseword(w2);
1574
// byte length of the analysed word
int wl = strlen(word);
1580
// zero-initialised list with room for maxSug entries
wlst = (char **) calloc(maxSug, sizeof(char *));
1581
// allocation failure is reported with -1
if (wlst == NULL) return -1;
1584
// wlst, maxSug and &nsug are passed so the analysis can record its results
// and update the count
rv = pAMgr->suffix_check(word, wl, 0, NULL, wlst, maxSug, &nsug);
1586
// delete dash from end of word
1588
for (int j=0; j < nsug; j++) {
1589
// NOTE(review): indexes strlen(wlst[j]) - 1, which presumes every entry is non-empty
if (wlst[j][strlen(wlst[j]) - 1] == '-') wlst[j][strlen(wlst[j]) - 1] = '\0';
1596
#endif // END OF HUNSPELL_EXPERIMENTAL CODE
1599
char * SuggestMgr::suggest_morph(const char * w)
1601
char result[MAXLNLEN];
1602
char * r = (char *) result;
1605
struct hentry * rv = NULL;
1609
if (! pAMgr) return NULL;
1612
const char * word = w;
1614
// word reversing wrapper for complex prefixes
1615
if (complexprefixes) {
1617
if (utf8) reverseword_utf(w2); else reverseword(w2);
1621
rv = pAMgr->lookup(word);
1624
if ((!rv->astr) || !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) ||
1625
TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) ||
1626
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {
1627
if (!HENTRY_FIND(rv, MORPH_STEM)) {
1628
mystrcat(result, " ", MAXLNLEN);
1629
mystrcat(result, MORPH_STEM, MAXLNLEN);
1630
mystrcat(result, word, MAXLNLEN);
1632
if (HENTRY_DATA(rv)) {
1633
mystrcat(result, " ", MAXLNLEN);
1634
mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
1636
mystrcat(result, "\n", MAXLNLEN);
1638
rv = rv->next_homonym;
1641
st = pAMgr->affix_check_morph(word,strlen(word));
1643
mystrcat(result, st, MAXLNLEN);
1647
if (pAMgr->get_compound() && (*result == '\0'))
1648
pAMgr->compound_check_morph(word, strlen(word),
1649
0, 0, 100, 0,NULL, 0, &r, NULL);
1651
return (*result) ? mystrdup(line_uniq(result, MSEP_REC)) : NULL;
1654
#ifdef HUNSPELL_EXPERIMENTAL
1655
char * SuggestMgr::suggest_morph_for_spelling_error(const char * word)
1658
char ** wlst = (char **) calloc(maxSug, sizeof(char *));
1659
if (!**wlst) return NULL;
1660
// we will use only the first suggestion
1661
for (int i = 0; i < maxSug - 1; i++) wlst[i] = "";
1662
int ns = suggest(&wlst, word, maxSug - 1, NULL);
1664
p = suggest_morph(wlst[maxSug - 1]);
1665
free(wlst[maxSug - 1]);
1667
if (wlst) free(wlst);
1670
#endif // END OF HUNSPELL_EXPERIMENTAL CODE
1673
// Generate word forms of the dictionary entry `rv` that match the
// morphological `pattern`.
//
// Forms come from morphgen() on the entry itself and on every allomorph
// listed in the entry's morphological data whose stem field names this
// entry. Each form is terminated with "\n". Returns a mystrdup() copy of
// the collected text, or NULL when nothing was generated or the entry has
// more suffix fields than the pattern.
char * SuggestMgr::suggest_hentry_gen(hentry * rv, const char * pattern)
{
    char forms[MAXLNLEN];
    forms[0] = '\0';
    const int sfxcount = get_sfxcount(pattern);

    if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) return NULL;

    if (HENTRY_DATA(rv)) {
        char * form = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->alen,
            HENTRY_DATA(rv), pattern, 0);
        if (form) {
            mystrcat(forms, form, MAXLNLEN);
            mystrcat(forms, "\n", MAXLNLEN);
            free(form);
        }
    }

    // check all allomorphs
    char allomorph[MAXLNLEN];
    char * field = NULL;
    if (HENTRY_DATA(rv)) field = (char *) strstr(HENTRY_DATA2(rv), MORPH_ALLOMORPH);

    while (field) {
        // step over the tag, then copy the field value
        field += MORPH_TAG_LEN;
        const int flen = fieldlen(field);
        strncpy(allomorph, field, flen);
        allomorph[flen] = '\0';

        for (struct hentry * alt = pAMgr->lookup(allomorph); alt; alt = alt->next_homonym) {
            // (a suffix-count filter on the allomorph entry is disabled upstream)
            if (!HENTRY_DATA(alt)) continue;

            // the allomorph must name this entry as its stem
            char * stem = (char *) strstr(HENTRY_DATA2(alt), MORPH_STEM);
            if (!stem) continue;
            if (strncmp(stem + MORPH_TAG_LEN,
                        HENTRY_WORD(rv), fieldlen(stem + MORPH_TAG_LEN)) != 0) continue;

            char * form = pAMgr->morphgen(HENTRY_WORD(alt), alt->blen, alt->astr, alt->alen,
                HENTRY_DATA(alt), pattern, 0);
            if (form) {
                mystrcat(forms, form, MAXLNLEN);
                mystrcat(forms, "\n", MAXLNLEN);
                free(form);
            }
        }

        field = strstr(field + flen, MORPH_ALLOMORPH);
    }

    return (forms[0]) ? mystrdup(forms) : NULL;
}
1725
char * SuggestMgr::suggest_gen(char ** desc, int n, const char * pattern) {
1726
if (n == 0 || !pAMgr) return NULL;
1728
char result[MAXLNLEN];
1729
char result2[MAXLNLEN];
1730
std::string newpattern;
1732
struct hentry * rv = NULL;
1734
// search affixed forms with and without derivational suffixes
1737
for (int k = 0; k < n; k++) {
1739
// add compound word parts (except the last one)
1740
char * s = (char *) desc[k];
1741
char * part = strstr(s, MORPH_PART);
1743
char * nextpart = strstr(part + 1, MORPH_PART);
1745
copy_field(result + strlen(result), part, MORPH_PART);
1747
nextpart = strstr(part + 1, MORPH_PART);
1754
size_t pos = tok.find(" | ");
1755
while (pos != std::string::npos)
1757
tok[pos+1] = MSEP_ALT;
1758
pos = tok.find(" | ", pos);
1760
int pln = line_tok(tok.c_str(), &pl, MSEP_ALT);
1761
for (int i = 0; i < pln; i++) {
1762
// remove inflectional and terminal suffixes
1763
char * is = strstr(pl[i], MORPH_INFL_SFX);
1765
char * ts = strstr(pl[i], MORPH_TERM_SFX);
1768
ts = strstr(pl[i], MORPH_TERM_SFX);
1770
char * st = strstr(s, MORPH_STEM);
1772
copy_field(tok, st, MORPH_STEM);
1773
rv = pAMgr->lookup(tok.c_str());
1775
std::string newpat(pl[i]);
1776
newpat.append(pattern);
1777
char * sg = suggest_hentry_gen(rv, newpat.c_str());
1778
if (!sg) sg = suggest_hentry_gen(rv, pattern);
1781
int genl = line_tok(sg, &gen, MSEP_REC);
1784
for (int j = 0; j < genl; j++) {
1785
if (strstr(pl[i], MORPH_SURF_PFX)) {
1786
int r2l = strlen(result2);
1787
result2[r2l] = MSEP_REC;
1788
strcpy(result2 + r2l + 1, result);
1789
copy_field(result2 + strlen(result2), pl[i], MORPH_SURF_PFX);
1790
mystrcat(result2, gen[j], MAXLNLEN);
1792
sprintf(result2 + strlen(result2), "%c%s%s",
1793
MSEP_REC, result, gen[j]);
1796
freelist(&gen, genl);
1798
rv = rv->next_homonym;
1805
if (*result2 || !strstr(pattern, MORPH_DERI_SFX)) break;
1807
newpattern.assign(pattern);
1808
mystrrep(newpattern, MORPH_DERI_SFX, MORPH_TERM_SFX);
1809
pattern = newpattern.c_str();
1811
return (*result2 ? mystrdup(result2) : NULL);
1814
// generate an n-gram score comparing s1 and s2
1815
int SuggestMgr::ngram(int n, const std::string& s1, const std::string& s2, int opt)
1824
std::vector<w_char> su1;
1825
std::vector<w_char> su2;
1826
l1 = u8_u16(su1, s1);
1827
l2 = u8_u16(su2, s2);
1828
if ((l2 <= 0) || (l1 == -1)) return 0;
1829
// lowering dictionary word
1830
if (opt & NGRAM_LOWERING) mkallsmall_utf(su2, l2, langnum);
1831
for (int j = 1; j <= n; j++) {
1833
for (int i = 0; i <= (l1-j); i++) {
1835
for (int l = 0; l <= (l2-j); l++) {
1836
for (k = 0; k < j; k++) {
1837
w_char& c1 = su1[i + k];
1838
w_char& c2 = su2[l + k];
1839
if ((c1.l != c2.l) || (c1.h != c2.h)) break;
1846
if (k != j && opt & NGRAM_WEIGHTED) {
1849
if (i == 0 || i == l1-j) ns--; // side weight
1852
nscore = nscore + ns;
1853
if (ns < 2 && !(opt & NGRAM_WEIGHTED)) break;
1857
if (l2 == 0) return 0;
1860
if (opt & NGRAM_LOWERING) mkallsmall(t, csconv);
1861
for (int j = 1; j <= n; j++) {
1863
for (int i = 0; i <= (l1-j); i++) {
1864
std::string temp(s1.substr(i, j));
1865
if (t.find(temp) != std::string::npos) {
1867
} else if (opt & NGRAM_WEIGHTED) {
1870
if (i == 0 || i == l1-j) ns--; // side weight
1873
nscore = nscore + ns;
1874
if (ns < 2 && !(opt & NGRAM_WEIGHTED)) break;
1879
if (opt & NGRAM_LONGER_WORSE) ns = (l2-l1)-2;
1880
if (opt & NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;
1881
ns = (nscore - ((ns > 0) ? ns : 0));
1885
// length of the left common substring of s1 and (decapitalised) s2
1886
int SuggestMgr::leftcommonsubstring(const char * s1, const char * s2) {
1890
su1[0].l = su2[0].l = su1[0].h = su2[0].h = 0;
1891
// decapitalize dictionary word
1892
if (complexprefixes) {
1893
int l1 = u8_u16(su1, MAXSWL, s1);
1894
int l2 = u8_u16(su2, MAXSWL, s2);
1895
if (*((short *)su1+l1-1) == *((short *)su2+l2-1)) return 1;
1900
unsigned short idx = (su2->h << 8) + su2->l;
1901
unsigned short otheridx = (su1->h << 8) + su1->l;
1902
if (otheridx != idx &&
1903
(otheridx != unicodetolower(idx, langnum))) return 0;
1904
int l1 = u8_u16(su1, MAXSWL, s1);
1905
int l2 = u8_u16(su2, MAXSWL, s2);
1906
for(i = 1; (i < l1) && (i < l2) &&
1907
(su1[i].l == su2[i].l) && (su1[i].h == su2[i].h); i++);
1911
if (complexprefixes) {
1912
int l1 = strlen(s1);
1913
int l2 = strlen(s2);
1914
if (*(s2+l1-1) == *(s2+l2-1)) return 1;
1916
const char * olds = s1;
1917
// decapitalise dictionary word
1918
if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) return 0;
1921
} while ((*s1 == *s2) && (*s1 != '\0'));
1922
return (int)(s1 - olds);
1928
int SuggestMgr::commoncharacterpositions(const char * s1, const char * s2, int * is_swap) {
1936
int l1 = u8_u16(su1, MAXSWL, s1);
1937
int l2 = u8_u16(su2, MAXSWL, s2);
1939
if (l1 <= 0 || l2 <= 0)
1942
// decapitalize dictionary word
1943
if (complexprefixes) {
1944
mkallsmall_utf(su2+l2-1, 1, langnum);
1946
mkallsmall_utf(su2, 1, langnum);
1948
for (int i = 0; (i < l1) && (i < l2); i++) {
1949
if (((short *) su1)[i] == ((short *) su2)[i]) {
1952
if (diff < 2) diffpos[diff] = i;
1956
if ((diff == 2) && (l1 == l2) &&
1957
(((short *) su1)[diffpos[0]] == ((short *) su2)[diffpos[1]]) &&
1958
(((short *) su1)[diffpos[1]] == ((short *) su2)[diffpos[0]])) *is_swap = 1;
1962
// decapitalize dictionary word
1963
if (complexprefixes) {
1964
size_t l2 = t.size();
1965
t[l2-1] = csconv[(unsigned char)t[l2-1]].clower;
1967
mkallsmall(t, csconv);
1969
for (i = 0; (*(s1+i) != 0) && i < t.size(); i++) {
1970
if (*(s1+i) == t[i]) {
1973
if (diff < 2) diffpos[diff] = i;
1977
if ((diff == 2) && (*(s1+i) == 0) && i == t.size() &&
1978
(*(s1+diffpos[0]) == t[diffpos[1]]) &&
1979
(*(s1+diffpos[1]) == t[diffpos[0]])) *is_swap = 1;
1984
int SuggestMgr::mystrlen(const char * word) {
1987
return u8_u16(w, MAXSWL, word);
1988
} else return strlen(word);
1991
// sort in decreasing order of score
1992
void SuggestMgr::bubblesort(char** rword, char** rword2, int* rsc, int n )
1998
if (rsc[j-1] < rsc[j]) {
1999
int sctmp = rsc[j-1];
2000
char * wdtmp = rword[j-1];
2002
rword[j-1] = rword[j];
2006
wdtmp = rword2[j-1];
2007
rword2[j-1] = rword2[j];
2018
// longest common subsequence
2019
void SuggestMgr::lcs(const char * s, const char * s2, int * l1, int * l2, char ** result) {
2028
m = u8_u16(su, MAXSWL, s);
2029
n = u8_u16(su2, MAXSWL, s2);
2034
c = (char *) malloc((m + 1) * (n + 1));
2035
b = (char *) malloc((m + 1) * (n + 1));
2042
for (i = 1; i <= m; i++) c[i*(n+1)] = 0;
2043
for (j = 0; j <= n; j++) c[j] = 0;
2044
for (i = 1; i <= m; i++) {
2045
for (j = 1; j <= n; j++) {
2046
if ( ((utf8) && (*((short *) su+i-1) == *((short *)su2+j-1)))
2047
|| ((!utf8) && ((*(s+i-1)) == (*(s2+j-1))))) {
2048
c[i*(n+1) + j] = c[(i-1)*(n+1) + j-1]+1;
2049
b[i*(n+1) + j] = LCS_UPLEFT;
2050
} else if (c[(i-1)*(n+1) + j] >= c[i*(n+1) + j-1]) {
2051
c[i*(n+1) + j] = c[(i-1)*(n+1) + j];
2052
b[i*(n+1) + j] = LCS_UP;
2054
c[i*(n+1) + j] = c[i*(n+1) + j-1];
2055
b[i*(n+1) + j] = LCS_LEFT;
2065
int SuggestMgr::lcslen(const char * s, const char* s2) {
2072
lcs(s, s2, &m, &n, &result);
2073
if (!result) return 0;
2076
while ((i != 0) && (j != 0)) {
2077
if (result[i*(n+1) + j] == LCS_UPLEFT) {
2081
} else if (result[i*(n+1) + j] == LCS_UP) {
2089
int SuggestMgr::lcslen(const std::string& s, const std::string& s2) {
2090
return lcslen(s.c_str(), s2.c_str());