243
243
signed char replindex;
244
244
signed char replcut;
245
245
int state_num = 0, last_state;
251
252
f = fopen (fn, "r");
256
// loading one or two dictionaries (separated by NEXTLEVEL keyword)
257
for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
255
258
hashtab = hnj_hash_new ();
257
260
global = hashtab;
259
262
hnj_hash_insert (hashtab, "", 0);
261
dict = hnj_malloc (sizeof(HyphenDict));
262
dict->num_states = 1;
263
dict->states = hnj_malloc (sizeof(HyphenState));
264
dict->states[0].match = NULL;
265
dict->states[0].repl = NULL;
266
dict->states[0].fallback_state = -1;
267
dict->states[0].num_trans = 0;
268
dict->states[0].trans = NULL;
263
dict[k] = hnj_malloc (sizeof(HyphenDict));
264
dict[k]->num_states = 1;
265
dict[k]->states = hnj_malloc (sizeof(HyphenState));
266
dict[k]->states[0].match = NULL;
267
dict[k]->states[0].repl = NULL;
268
dict[k]->states[0].fallback_state = -1;
269
dict[k]->states[0].num_trans = 0;
270
dict[k]->states[0].trans = NULL;
271
dict[k]->nextlevel = NULL;
270
277
/* read in character set info */
271
for (i=0;i<MAX_NAME;i++) dict->cset[i]= 0;
272
fgets(dict->cset, sizeof(dict->cset),f);
273
for (i=0;i<MAX_NAME;i++)
274
if ((dict->cset[i] == '\r') || (dict->cset[i] == '\n'))
276
dict->utf8 = (strcmp(dict->cset, "UTF-8") == 0);
279
for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
280
fgets(dict[k]->cset, sizeof(dict[k]->cset),f);
281
for (i=0;i<MAX_NAME;i++)
282
if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
283
dict[k]->cset[i] = 0;
284
dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
286
strcpy(dict[k]->cset, dict[0]->cset);
287
dict[k]->utf8 = dict[0]->utf8;
278
290
while (fgets (buf, sizeof(buf), f) != NULL)
280
292
if (buf[0] != '%')
294
if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
297
} else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
298
dict[k]->lhmin = atoi(buf + 13);
300
} else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
301
dict[k]->rhmin = atoi(buf + 14);
303
} else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
304
dict[k]->clhmin = atoi(buf + 21);
306
} else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
307
dict[k]->crhmin = atoi(buf + 22);
283
311
pattern[j] = '0';
284
312
repl = strchr(buf, '/');
346
374
printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl);
348
376
found = hnj_hash_lookup (hashtab, word);
349
state_num = hnj_get_state (dict, hashtab, word);
350
dict->states[state_num].match = hnj_strdup (pattern + i);
351
dict->states[state_num].repl = repl;
352
dict->states[state_num].replindex = replindex;
377
state_num = hnj_get_state (dict[k], hashtab, word);
378
dict[k]->states[state_num].match = hnj_strdup (pattern + i);
379
dict[k]->states[state_num].repl = repl;
380
dict[k]->states[state_num].replindex = replindex;
354
dict->states[state_num].replcut = strlen(word);
382
dict[k]->states[state_num].replcut = strlen(word);
356
dict->states[state_num].replcut = replcut;
384
dict[k]->states[state_num].replcut = replcut;
359
387
/* now, put in the prefix transitions */
400
428
/* KBH: FIXME state 0 fallback_state should always be -1? */
402
dict->states[e->val].fallback_state = state_num;
430
dict[k]->states[e->val].fallback_state = state_num;
405
433
for (i = 0; i < HASH_SIZE; i++)
406
434
for (e = hashtab->entries[i]; e; e = e->next)
408
436
printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
409
dict->states[e->val].fallback_state);
410
for (j = 0; j < dict->states[e->val].num_trans; j++)
411
printf (" %c->%d\n", dict->states[e->val].trans[j].ch,
412
dict->states[e->val].trans[j].new_state);
437
dict[k]->states[e->val].fallback_state);
438
for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
439
printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
440
dict[k]->states[e->val].trans[j].new_state);
417
445
hnj_hash_free (hashtab);
450
if (k == 2) dict[0]->nextlevel = dict[1];
423
454
void hnj_hyphen_free (HyphenDict *dict)
564
596
if (prep_word != prep_word_buf)
565
597
hnj_free (prep_word);
569
int hnj_hyphen_hyphenate2 (HyphenDict *dict,
570
const char *word, int word_size, char * hyphens,
571
char *hyphword, char *** rep, int ** pos, int ** cut)
602
/* character length of the first n bytes of the input word */
603
int hnj_hyphen_strnlen(const char * word, int n, int utf8)
607
while (j < n && word[j] != '\0') {
609
for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
614
int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
615
char *** rep, int ** pos, int ** cut, int lhmin)
618
for (i = 1, j = 0; i < lhmin && word[j] != '\0'; i++) do {
619
// check length of the non-standard part
620
if (*rep && *pos && *cut && (*rep)[j]) {
621
char * rh = strchr((*rep)[j], '=');
622
if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
623
hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
632
} while (utf8 && (word[j + 1] & 0xc0) == 0xc0);
636
int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
637
char *** rep, int ** pos, int ** cut, int rhmin)
640
int j = word_size - 2;
641
for (i = 1; i < rhmin && j > 0; j--) {
642
// check length of the non-standard part
643
if (*rep && *pos && *cut && (*rep)[j]) {
644
char * rh = strchr((*rep)[j], '=');
645
if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
646
hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
654
if (!utf8 || (word[j] & 0xc0) != 0xc0) i++;
659
// recursive function for compound level hyphenation
660
int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
661
char * hyphens, char *** rep, int ** pos, int ** cut,
662
int clhmin, int crhmin, int lend, int rend)
573
664
char prep_word_buf[MAX_WORD];
752
836
(*pos)[matchindex[i] - 1] = matchindex[i] - i;
753
837
(*cut)[matchindex[i] - 1] = matchlen[i];
755
if (hyphword) strcpy(hyphword + j, matchrepl[matchindex[i]]);
756
839
j += strlen(matchrepl[matchindex[i]]);
757
840
i += matchlen[i] - 1;
758
} else if (hyphword) {
759
hyphword[j++] = word[i];
760
if ((hyphens[i]&1) && !(isrepl && ((i+1) < word_size) &&
761
(matchindex[i+1] >= 0) && matchrepl[matchindex[i+1]])) hyphword[j++] = '=';
765
if (hyphword) hyphword[j] = '\0';
767
844
if (matchrepl != matchrepl_buf) {
768
845
hnj_free (matchrepl);
769
846
hnj_free (matchlen);
770
847
hnj_free (matchindex);
773
if (!(dict->utf8)) return 0;
850
// recursive hyphenation of the first (compound) level segments
851
if (dict->nextlevel) {
852
char * rep2_buf[MAX_WORD];
853
int pos2_buf[MAX_WORD];
854
int cut2_buf[MAX_WORD];
855
char hyphens2_buf[MAX_WORD];
861
if (word_size < MAX_CHARS) {
865
hyphens2 = hyphens2_buf;
867
rep2 = hnj_malloc (word_size * sizeof(char *));
868
pos2 = hnj_malloc (word_size * sizeof(int));
869
cut2 = hnj_malloc (word_size * sizeof(int));
870
hyphens2 = hnj_malloc (word_size);
872
for (i = 0; i < word_size; i++) rep2[i] = NULL;
873
for (i = 0; i < word_size; i++) if
874
(hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
877
prep_word[i + 2] = '\0';
878
/* non-standard hyphenation at compound boundary (Schiffahrt) */
879
if (*rep && *pos && *cut && (*rep)[i]) {
880
char * l = strchr((*rep)[i], '=');
881
strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]);
883
hyph = (l - (*rep)[i]) - (*pos)[i];
884
prep_word[2 + i + hyph] = '\0';
887
hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
888
hyphens2, &rep2, &pos2, &cut2, clhmin,
889
crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
890
for (j = 0; j < i - begin - 1; j++) {
891
hyphens[begin + j] = hyphens2[j];
892
if (rep2[j] && rep && pos && cut) {
893
if (!*rep && !*pos && !*cut) {
895
*rep = (char **) malloc(sizeof(char *) * word_size);
896
*pos = (int *) malloc(sizeof(int) * word_size);
897
*cut = (int *) malloc(sizeof(int) * word_size);
898
for (k = 0; k < word_size; k++) {
904
(*rep)[begin + j] = rep2[j];
905
(*pos)[begin + j] = pos2[j];
906
(*cut)[begin + j] = cut2[j];
909
prep_word[i + 2] = word[i + 1];
910
if (*rep && *pos && *cut && (*rep)[i]) {
911
strcpy(prep_word + 1, word);
915
for (j = 0; j < word_size; j++) rep2[j] = NULL;
920
hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
921
hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
922
if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
923
rep, pos, cut, clhmin);
924
if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
925
rep, pos, cut, crhmin);
928
if (rep2 != rep2_buf) {
936
if (prep_word != prep_word_buf) hnj_free (prep_word);
940
/* UTF-8 normalization of hyphen and non-standard positions */
941
int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
942
char *** rep, int ** pos, int ** cut)
775
944
if ((((unsigned char) word[0]) >> 6) == 2) {
776
945
fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
780
949
/* calculate UTF-8 character positions */
782
for (i = 0; i < word_size; i++) {
951
for (i = 0, j = -1; i < word_size; i++) {
783
952
/* beginning of a UTF-8 character (not '10' start bits) */
784
953
if ((((unsigned char) word[i]) >> 6) != 2) j++;
785
954
hyphens[j] = hyphens[i];
806
975
hyphens[j + 1] = '\0';
979
/* get the word with all possible hyphenations (output: hyphword) */
980
void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
981
char * hyphword, char *** rep, int ** pos, int ** cut)
984
for (i = 0, j = 0; i < l; i++, j++) {
986
hyphword[j] = word[i];
987
if (*rep && *pos && *cut && (*rep)[i]) {
988
strcpy(hyphword + j - (*pos)[i] + 1, (*rep)[i]);
989
j += strlen((*rep)[i]) - (*pos)[i];
990
i += (*cut)[i] - (*pos)[i];
991
} else hyphword[++j] = '=';
992
} else hyphword[j] = word[i];
998
/* main api function with default hyphenmin parameters */
999
int hnj_hyphen_hyphenate2 (HyphenDict *dict,
1000
const char *word, int word_size, char * hyphens,
1001
char *hyphword, char *** rep, int ** pos, int ** cut)
1003
hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1004
dict->clhmin, dict->crhmin, 1, 1);
1005
hnj_hyphen_lhmin(dict->utf8, word, word_size,
1006
hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
1007
hnj_hyphen_rhmin(dict->utf8, word, word_size,
1008
hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
1009
if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1010
if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
1014
/* previous main api function with hyphenmin parameters */
1015
int hnj_hyphen_hyphenate3 (HyphenDict *dict,
1016
const char *word, int word_size, char * hyphens,
1017
char *hyphword, char *** rep, int ** pos, int ** cut,
1018
int lhmin, int rhmin, int clhmin, int crhmin)
1020
lhmin = (lhmin > 0 ? lhmin : dict->lhmin);
1021
rhmin = (rhmin > 0 ? rhmin : dict->rhmin);
1022
hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1023
clhmin, crhmin, 1, 1);
1024
hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
1025
rep, pos, cut, (lhmin > 0 ? lhmin : 2));
1026
hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
1027
rep, pos, cut, (rhmin > 0 ? rhmin : 2));
1028
if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1029
if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);