542
// NB: we 'lose' (leak) this memory, as it is never freed
543
static halfword begin = null;
544
static halfword end = null;
548
514
HyphenDict* hnj_hyphen_new() {
549
515
HyphenDict* dict = hnj_malloc (sizeof(HyphenDict));
551
if (begin==null) begin = insert_character(null,(int)'.');
552
if (end ==null) end = insert_character(null,(int)'.');
559
521
void hnj_hyphen_clear(
612
// hyphenation pattern:
614
// 0 indicates end (actually any negative number)
615
// : prio(1+),startpos,length,len1,[replace],len2,[replace]
616
// most basic example is:
618
// for a hyphenation point between characters
568
/* hyphenation pattern:
570
* 0 indicates end (actually any negative number)
571
* : prio(1+),startpos,length,len1,[replace],len2,[replace]
572
* most basic example is:
574
* for a hyphenation point between characters
622
578
void hnj_hyphen_load(
623
579
HyphenDict* dict,
624
580
const unsigned char *f
667
623
if (format[i]>='0'&&format[i]<='9') j++;
668
624
if (is_utf8_follow(format[i])) e++;
670
// l-e => number of _characters_ not _bytes_
671
// l-e-j => number of pattern characters
626
/* l-e => number of _characters_ not _bytes_*/
627
/* l-e-j => number of pattern characters*/
672
628
unsigned char *pat = (unsigned char*) malloc(1+l-j);
673
629
char *org = ( char*) malloc(2+l-e-j);
674
// remove hyphenation encoders (digits) from pat
630
/* remove hyphenation encoders (digits) from pat*/
676
632
for (i=0,j=0,e=0; i<l; i++) {
677
633
unsigned char c = format[i];
689
645
hyppat_insert(dict->patterns,pat,org);
691
dict->pat_length += (f-begin)+2; // 2 for spurious spaces
647
dict->pat_length += (f-begin)+2; /* 2 for spurious spaces*/
692
648
init_hash(&dict->merged);
693
649
v = new_HashIter(dict->patterns);
694
650
while (nextHash(v,&word)) {
695
651
int wordsize = strlen((char*)word);
697
653
for (l=1; l<=wordsize; l++) {
698
if (is_utf8_follow(word[l])) continue; // Do not clip an utf8 sequence
654
if (is_utf8_follow(word[l])) continue; /* Do not clip an utf8 sequence*/
699
655
for (j=1; j<=l; j++) {
701
if (is_utf8_follow(word[i])) continue; // Do not start halfway an utf8 sequence
657
if (is_utf8_follow(word[i])) continue; /* Do not start halfway an utf8 sequence*/
702
658
char *subpat_pat;
703
659
if ((subpat_pat = hyppat_lookup(dict->patterns,word+i,j))!=NULL) {
704
660
char* newpat_pat;
709
665
for (i=0; i<l; i++) if (is_utf8_follow(newword[i])) e++;
710
666
char *neworg = malloc(l+2-e);
711
sprintf(neworg,"%0*d",l+1-e,0); // fill with right amount of '0'
667
sprintf(neworg,"%0*d",l+1-e,0); /* fill with right amount of '0'*/
712
668
hyppat_insert(dict->merged,newword,combine(neworg,subpat_pat));
714
670
combine(newpat_pat,subpat_pat);
754
710
delete_HashIter(v);
755
711
clear_hyppat_hash(&dict->merged);
757
//***************************************
713
/***************************************/
759
715
/* put in the fallback states */
760
716
for (i = 0; i < HASH_SIZE; i++) {
761
717
for (e = dict->state_num->entries[i]; e; e = e->next) {
762
// do not do state==0 otherwise things get confused
718
/* do not do state==0 otherwise things get confused*/
763
719
if (e->u.state) {
764
720
for (j = 1; 1; j++) {
765
721
state_num = state_lookup(dict->state_num, e->key + j);
797
751
lang_variables *lan
799
// +2 for dots at each end, +1 for points /outside/ characters
753
/* +2 for dots at each end, +1 for points /outside/ characters*/
800
754
int ext_word_len = length+2;
801
755
int hyphen_len = ext_word_len+1;
802
//char *hyphens = hnj_malloc((hyphen_len*2)+1); // LATER
756
/*char *hyphens = hnj_malloc((hyphen_len*2)+1); */ /* LATER */
803
757
char *hyphens = hnj_malloc(hyphen_len+1);
805
// Add a '.' to beginning and end to facilitate matching
806
set_vlink(begin,first);
807
set_vlink(end,get_vlink(last));
759
/* Add a '.' to beginning and end to facilitate matching*/
760
set_vlink(begin_point,first);
761
set_vlink(end_point,get_vlink(last));
762
set_vlink(last,end_point);
811
765
for (char_num = 0; char_num < hyphen_len; char_num++) {
812
// hyphens[char_num*2] = '0'; // LATER
813
// hyphens[char_num*2+1] = '0'; // LATER
766
/* hyphens[char_num*2] = '0'; */ /* LATER */
767
/* hyphens[char_num*2+1] = '0'; */ /* LATER */
814
768
hyphens[char_num] = '0';
816
//hyphens[hyphen_len*2] = 0; // LATER
770
/*hyphens[hyphen_len*2] = 0; */ /* LATER */
817
771
hyphens[hyphen_len] = 0;
819
773
/* now, run the finite state machine */
822
for (char_num=0, here=begin; here!=end; here=get_vlink(here)) {
776
for (char_num=0, here=begin_point; here!=end_point; here=get_vlink(here)) {
824
778
int ch = get_character(here);
826
780
while (state!=-1) {
827
//+ printf("%*s%s%c",char_num-strlen(get_state_str(state)),"",get_state_str(state),(char)ch);
781
/* printf("%*s%s%c",char_num-strlen(get_state_str(state)),"",get_state_str(state),(char)ch);*/
828
782
HyphenState *hstate = &dict->states[state];
830
784
for (k = 0; k < hstate->num_trans; k++) {
831
785
if (hstate->trans[k].uni_ch == ch) {
832
786
state = hstate->trans[k].new_state;
833
//+ printf(" state %d\n",state);
787
/* printf(" state %d\n",state);*/
834
788
char *match = dict->states[state].match;
837
// 1 string length is one bigger than offset
838
// 1 hyphenation starts before first character
791
* 1 string length is one bigger than offset
792
* 1 hyphenation starts before first character
839
794
int offset = char_num + 2 - strlen (match);
840
//+ printf ("%*s%s\n", offset,"", match);
795
/* printf ("%*s%s\n", offset,"", match);*/
842
797
for (m = 0; match[m]; m++) {
843
798
if (hyphens[offset+m] < match[m]) hyphens[offset+m] = match[m];
849
804
state = hstate->fallback_state;
850
//+ printf (" back to %d\n", state);
805
/* printf (" back to %d\n", state);*/
852
// nothing worked, let's go to the next character
807
/* nothing worked, let's go to the next character*/
854
809
try_next_letter: ;
858
// restore the correct pointers
859
set_vlink(last,get_vlink(end));
813
/* restore the correct pointers*/
814
set_vlink(last,get_vlink(end_point));
861
// pattern is ^.^w^o^r^d^.^ word_len=4, ext_word_len=6, hyphens=7
862
// check ^ ^ ^ so drop first two and stop after word_len-1
816
/* pattern is ^.^w^o^r^d^.^ word_len=4, ext_word_len=6, hyphens=7
817
* check ^ ^ ^ so drop first two and stop after word_len-1
863
819
for (here=first,char_num=2; here!=left; here=get_vlink(here)) char_num++;
864
820
for (; here!=right; here=get_vlink(here)) {
865
821
if (hyphens[char_num] & 1)