495
660
hyphens[i] = '0';
496
661
hyphens[word_size] = '\0';
498
if (prep_word != prep_word_buf)
499
hnj_free (prep_word);
663
hnj_free (prep_word);
668
/* Unicode ligature length */
669
int hnj_ligature(unsigned char c) {
673
case 0x82: return LIG_xx; /* fl */
675
case 0x84: return LIG_xxx; /* ffl */
676
case 0x85: /* long st */
677
case 0x86: return LIG_xx; /* st */
682
/* character length of the first n bytes of the input word */
683
int hnj_hyphen_strnlen(const char * word, int n, int utf8)
687
while (j < n && word[j] != '\0') {
689
// Unicode ligature support
690
if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {
691
i += hnj_ligature(word[j + 2]);
693
for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
698
int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
699
char *** rep, int ** pos, int ** cut, int lhmin)
703
// Unicode ligature support
704
if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) {
705
i += hnj_ligature(word[2]);
709
for (j = 0; word[j] <= '9' && word[j] >= '0'; j++) i--;
711
for (j = 0; i < lhmin && word[j] != '\0'; i++) do {
712
// check length of the non-standard part
713
if (*rep && *pos && *cut && (*rep)[j]) {
714
char * rh = strchr((*rep)[j], '=');
715
if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
716
hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
726
// Unicode ligature support
727
if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {
728
i += hnj_ligature(word[j + 2]);
730
} while (utf8 && (word[j] & 0xc0) == 0x80);
734
int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
735
char *** rep, int ** pos, int ** cut, int rhmin)
741
for (j = word_size - 1; j > 0 && word[j] <= '9' && word[j] >= '0'; j--) i--;
743
for (j = word_size - 1; i < rhmin && j > 0; j--) {
744
// check length of the non-standard part
745
if (*rep && *pos && *cut && (*rep)[j]) {
746
char * rh = strchr((*rep)[j], '=');
747
if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
748
hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
756
if (!utf8 || (word[j] & 0xc0) == 0xc0 || (word[j] & 0x80) != 0x80) i++;
761
// recursive function for compound level hyphenation
762
int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
763
char * hyphens, char *** rep, int ** pos, int ** cut,
764
int clhmin, int crhmin, int lend, int rend)
773
signed char replindex;
782
size_t prep_word_size = word_size + 3;
783
prep_word = hnj_malloc (prep_word_size);
784
matchlen = hnj_malloc ((word_size + 3) * sizeof(int));
785
matchindex = hnj_malloc ((word_size + 3) * sizeof(int));
786
matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));
789
prep_word[j++] = '.';
791
for (i = 0; i < word_size; i++) {
792
if (word[i] <= '9' && word[i] >= '0') {
793
prep_word[j++] = '.';
795
prep_word[j++] = word[i];
801
prep_word[j++] = '.';
804
for (i = 0; i < j; i++)
808
printf ("prep_word = %s\n", prep_word);
811
/* now, run the finite state machine */
813
for (i = 0; i < j; i++)
821
/* KBH: FIXME shouldn't this be as follows? */
823
goto try_next_letter;
828
state_str = get_state_str (state, 1);
830
for (k = 0; k < i - strlen (state_str); k++)
832
printf ("%s", state_str);
835
hstate = &dict->states[state];
836
for (k = 0; k < hstate->num_trans; k++)
837
if (hstate->trans[k].ch == ch)
839
state = hstate->trans[k].new_state;
842
state = hstate->fallback_state;
844
printf (" falling back, fallback_state %d\n", state);
849
printf ("found state %d\n",state);
851
/* Additional optimization is possible here - especially,
852
elimination of trailing zeroes from the match. Leading zeroes
853
have already been optimized. */
854
match = dict->states[state].match;
855
repl = dict->states[state].repl;
856
replindex = dict->states[state].replindex;
857
replcut = dict->states[state].replcut;
858
/* replacing rules not handled by hyphen_hyphenate() */
861
offset = i + 1 - strlen (match);
863
for (k = 0; k < offset; k++)
865
printf ("%s (%s)\n", match, repl);
868
if (!isrepl) for(; isrepl < word_size; isrepl++) {
869
matchrepl[isrepl] = NULL;
870
matchindex[isrepl] = -1;
872
matchlen[offset + replindex] = replcut;
874
/* This is a linear search because I tried a binary search and
875
found it to be just a teeny bit slower. */
876
for (k = 0; match[k]; k++) {
877
if ((hyphens[offset + k] < match[k])) {
878
hyphens[offset + k] = match[k];
880
matchrepl[offset + k] = repl;
881
if (repl && (k >= replindex) && (k <= replindex + replcut)) {
882
matchindex[offset + replindex] = offset + k;
890
/* KBH: we need this to make sure we keep looking in a word */
891
/* for patterns even if the current character is not known in state 0 */
892
/* since patterns for hyphenation may occur anywhere in the word */
897
for (i = 0; i < j; i++)
898
putchar (hyphens[i]);
902
for (i = 0; i < j - 3; i++)
904
if (hyphens[i + 1] & 1)
907
hyphens[i] = hyphens[i + 1];
909
for (; i < word_size; i++)
911
hyphens[word_size] = '\0';
913
/* now create a new char string showing hyphenation positions */
914
/* count the hyphens and allocate space for the new hyphenated string */
916
for (i = 0; i < word_size; i++)
920
for (i = 0; i < word_size; i++) {
921
if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {
922
if (rep && pos && cut) {
924
*rep = (char **) calloc(word_size, sizeof(char *));
926
*pos = (int *) calloc(word_size, sizeof(int));
928
*cut = (int *) calloc(word_size, sizeof(int));
930
(*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
931
(*pos)[matchindex[i] - 1] = matchindex[i] - i;
932
(*cut)[matchindex[i] - 1] = matchlen[i];
934
j += strlen(matchrepl[matchindex[i]]);
935
i += matchlen[i] - 1;
939
hnj_free (matchrepl);
941
hnj_free (matchindex);
943
// recursive hyphenation of the first (compound) level segments
944
if (dict->nextlevel) {
951
rep2 = hnj_malloc (word_size * sizeof(char *));
952
pos2 = hnj_malloc (word_size * sizeof(int));
953
cut2 = hnj_malloc (word_size * sizeof(int));
954
hyphens2 = hnj_malloc (word_size + 3);
955
for (i = 0; i < word_size; i++) rep2[i] = NULL;
956
for (i = 0; i < word_size; i++) if
957
(hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
960
prep_word[i + 2] = '\0';
961
/* non-standard hyphenation at compound boundary (Schiffahrt) */
962
if (rep && *rep && *pos && *cut && (*rep)[i]) {
963
char * l = strchr((*rep)[i], '=');
964
size_t offset = 2 + i - (*pos)[i];
965
strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1);
966
prep_word[prep_word_size - 1] = '\0';
968
hyph = (l - (*rep)[i]) - (*pos)[i];
969
prep_word[2 + i + hyph] = '\0';
972
hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
973
hyphens2, &rep2, &pos2, &cut2, clhmin,
974
crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
975
for (j = 0; j < i - begin - 1; j++) {
976
hyphens[begin + j] = hyphens2[j];
977
if (rep2[j] && rep && pos && cut) {
978
if (!*rep && !*pos && !*cut) {
980
*rep = (char **) malloc(sizeof(char *) * word_size);
981
*pos = (int *) malloc(sizeof(int) * word_size);
982
*cut = (int *) malloc(sizeof(int) * word_size);
983
for (k = 0; k < word_size; k++) {
989
(*rep)[begin + j] = rep2[j];
990
(*pos)[begin + j] = pos2[j];
991
(*cut)[begin + j] = cut2[j];
994
prep_word[i + 2] = word[i + 1];
995
if (*rep && *pos && *cut && (*rep)[i]) {
997
strncpy(prep_word + offset, word, prep_word_size - offset - 1);
998
prep_word[prep_word_size - 1] = '\0';
1002
for (j = 0; j < word_size; j++) rep2[j] = NULL;
1007
hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
1008
hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
1009
if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
1010
rep, pos, cut, clhmin);
1011
if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
1012
rep, pos, cut, crhmin);
1021
hnj_free (prep_word);
1025
/* UTF-8 normalization of hyphen and non-standard positions */
1026
int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
1027
char *** rep, int ** pos, int ** cut)
1030
if ((((unsigned char) word[0]) >> 6) == 2) {
1031
fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
1035
/* calculate UTF-8 character positions */
1036
for (i = 0, j = -1; i < word_size; i++) {
1037
/* beginning of an UTF-8 character (not '10' start bits) */
1038
if ((((unsigned char) word[i]) >> 6) != 2) j++;
1039
hyphens[j] = hyphens[i];
1040
if (rep && pos && cut && *rep && *pos && *cut) {
1043
for (k = 0; k < l; k++) {
1044
if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
1049
for (; k < l; k++) {
1050
if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
1052
(*rep)[j] = (*rep)[i];
1060
hyphens[j + 1] = '\0';
1062
printf ("nums: %s\n", hyphens);
1067
/* get the word with all possible hyphenations (output: hyphword) */
1068
void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
1069
char * hyphword, char *** rep, int ** pos, int ** cut)
1071
int hyphenslen = l + 5;
1074
for (i = 0, j = 0; i < l; i++, j++) {
1076
hyphword[j] = word[i];
1077
if (*rep && *pos && *cut && (*rep)[i]) {
1078
size_t offset = j - (*pos)[i] + 1;
1079
strncpy(hyphword + offset, (*rep)[i], hyphenslen - offset - 1);
1080
hyphword[hyphenslen-1] = '\0';
1081
j += strlen((*rep)[i]) - (*pos)[i];
1082
i += (*cut)[i] - (*pos)[i];
1083
} else hyphword[++j] = '=';
1084
} else hyphword[j] = word[i];
1090
/* main API function with default hyphenmin parameters */
1091
int hnj_hyphen_hyphenate2 (HyphenDict *dict,
1092
const char *word, int word_size, char * hyphens,
1093
char *hyphword, char *** rep, int ** pos, int ** cut)
1095
hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1096
dict->clhmin, dict->crhmin, 1, 1);
1097
hnj_hyphen_lhmin(dict->utf8, word, word_size,
1098
hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
1099
hnj_hyphen_rhmin(dict->utf8, word, word_size,
1100
hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
1103
if (dict->nohyphen) {
1104
char * nh = dict->nohyphen;
1106
for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
1107
char * nhy = (char *) strstr(word, nh);
1109
hyphens[nhy - word + strlen(nh) - 1] = '0';
1110
if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = '0';
1111
nhy = (char *) strstr(nhy + 1, nh);
1113
nh = nh + strlen(nh) + 1;
1117
if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1118
if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
1120
printf ("nums: %s\n", hyphens);
1125
/* previous main API function with hyphenmin parameters */
1126
int hnj_hyphen_hyphenate3 (HyphenDict *dict,
1127
const char *word, int word_size, char * hyphens,
1128
char *hyphword, char *** rep, int ** pos, int ** cut,
1129
int lhmin, int rhmin, int clhmin, int crhmin)
1131
lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin;
1132
rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin;
1133
clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin;
1134
crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin;
1135
hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1136
clhmin, crhmin, 1, 1);
1137
hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
1138
rep, pos, cut, (lhmin > 0 ? lhmin : 2));
1139
hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
1140
rep, pos, cut, (rhmin > 0 ? rhmin : 2));
1141
if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1144
if (dict->nohyphen) {
1145
char * nh = dict->nohyphen;
1147
for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
1148
char * nhy = (char *) strstr(word, nh);
1150
hyphens[nhy - word + strlen(nh) - 1] = 0;
1151
if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = 0;
1152
nhy = (char *) strstr(nhy + 1, nh);
1154
nh = nh + strlen(nh) + 1;
1158
if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);