~ubuntu-branches/ubuntu/oneiric/espeak/oneiric

« back to all changes in this revision

Viewing changes to src/translate.cpp

  • Committer: Bazaar Package Importer
  • Author(s): Luke Yelavich
  • Date: 2011-05-04 11:25:46 UTC
  • mfrom: (1.1.24 upstream) (5.1.10 sid)
  • Revision ID: james.westby@ubuntu.com-20110504112546-ykijzihgc7ybgzn2
Tags: 1.45.04-1ubuntu1
* Merge from debian unstable, remaining changes:
  - Add gbp.conf for use with git buildpackage
  - Update the explanation of the -b command-line flag in the espeak manpage

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
1
/***************************************************************************
2
 
 *   Copyright (C) 2005 to 2010 by Jonathan Duddington                     *
 
2
 *   Copyright (C) 2005 to 2011 by Jonathan Duddington                     *
3
3
 *   email: jonsd@users.sourceforge.net                                    *
4
4
 *                                                                         *
5
5
 *   This program is free software; you can redistribute it and/or modify  *
122
122
static const unsigned short breaks[] = {'_', 0};
123
123
 
124
124
// treat these characters as spaces, in addition to iswspace()
125
 
static const wchar_t chars_space[] = {0x2500,0};  // box drawing horiz
 
125
// static const wchar_t chars_space[] = {0x2500,0x2501,0};  // box drawing horiz
126
126
 
127
127
 
128
128
// Translate character codes 0xA0 to 0xFF into their unicode values
386
386
                        return(1);
387
387
                if(lookupwchar(extra_indic_alphas, c) != 0)
388
388
                        return(1);
 
389
                if((c >= 0xd7a) && (c <= 0xd7f))
 
390
                        return(1);   // Malayalam chillu characters
 
391
 
389
392
                return(0);
390
393
        }
391
394
 
395
398
        if((c >= 0x300) && (c <= 0x36f))
396
399
                return(1);   // combining accents
397
400
 
 
401
        if((c >= 0x780) && (c <= 0x7b1))
 
402
                return(1);   // Thaana/Divehi (Maldives)
 
403
 
398
404
        if((c >= 0x1100) && (c <= 0x11ff))
399
405
                return(1);  //Korean jamo
400
406
 
426
432
{//========================
427
433
        if(c == 0)
428
434
                return(0);
429
 
        if(wcschr(chars_space,c))
430
 
                return(1);
 
435
        if((c >= 0x2500) && (c < 0x25a0))
 
436
                return(1);  // box drawing characters
 
437
//      if(wcschr(chars_space,c))
 
438
//              return(1);
431
439
        return(iswspace(c));
432
440
}
433
441
 
574
582
}
575
583
 
576
584
 
 
585
int IsAllUpper(const char *word)
 
586
{//=============================
 
587
        int c;
 
588
        while((*word != 0) && !isspace2(*word))
 
589
        {
 
590
                word += utf8_in(&c, word);
 
591
                if(!iswupper(c))
 
592
                        return(0);
 
593
        }
 
594
        return(1);
 
595
}
 
596
 
 
597
 
577
598
static char *SpeakIndividualLetters(Translator *tr, char *word, char *phonemes, int spell_word)
578
599
{//============================================================================================
579
600
        int posn = 0;
760
781
 
761
782
 
762
783
 
763
 
int TranslateWord(Translator *tr, char *word1, int next_pause, WORD_TAB *wtab)
 
784
int TranslateWord(Translator *tr, char *word_start, int next_pause, WORD_TAB *wtab)
764
785
{//===========================================================================
765
786
// word1 is terminated by space (0x20) character
766
787
 
 
788
        char *word1;
767
789
        int word_length;
768
790
        int ix;
769
791
        char *p;
781
803
        char unpron_phonemes[N_WORD_PHONEMES];
782
804
        char end_phonemes[N_WORD_PHONEMES];
783
805
        char word_copy[N_WORD_BYTES];
784
 
        char prefix_chars[41];
 
806
        char word_copy2[N_WORD_BYTES];
 
807
        int word_copy_length;
 
808
        char prefix_chars[0x3f + 2];  
785
809
        int found=0;
786
810
   int end_flags;
787
811
        char c_temp;   // save a character byte while we temporarily replace it with space
795
819
        int emphasize_allcaps = 0;
796
820
        int wflags;
797
821
        int wmark;
 
822
        int was_unpronouncable = 0;
798
823
        WORD_TAB wtab_null[8];
799
824
 
800
825
        // translate these to get pronunciations of plural 's' suffix (different forms depending on
831
856
        }
832
857
 
833
858
        // count the length of the word
 
859
        word1 = word_start;
834
860
        if(*word1 == ' ') word1++;   // possibly a dot was replaced by space:  $dot
835
861
        wordx = word1;
836
862
 
842
868
                word_length++;
843
869
        }
844
870
 
 
871
        word_copy_length = wordx - word_start;
 
872
        if(word_copy_length >= N_WORD_BYTES)
 
873
                word_copy_length = N_WORD_BYTES-1;
 
874
        memcpy(word_copy2, word_start, word_copy_length);
 
875
 
845
876
        spell_word = 0;
846
877
 
847
878
        if(option_sayas == SAYAS_KEY)
958
989
 
959
990
                        if((tr->langopts.numbers & NUM_ROMAN) || ((tr->langopts.numbers & NUM_ROMAN_CAPITALS) && (wflags & FLAG_ALL_UPPER)))
960
991
                        {
961
 
                                if((found = TranslateRoman(tr, word1, phonemes, wtab)) != 0)
962
 
                                        dictionary_flags[0] |= FLAG_ABBREV;   // prevent emphasis if capitals
 
992
                                if((wflags & FLAG_LAST_WORD) || !(wtab[1].flags & FLAG_NOSPACE))
 
993
                                {
 
994
                                        // don't use Roman number if this word is not separated from the next word (eg. "XLTest")
 
995
                                        if((found = TranslateRoman(tr, word1, phonemes, wtab)) != 0)
 
996
                                                dictionary_flags[0] |= FLAG_ABBREV;   // prevent emphasis if capitals
 
997
                                }
963
998
                        }
964
999
                }
965
1000
 
1007
1042
                length = 999;
1008
1043
                wordx = word1;
1009
1044
 
1010
 
                while(((length < 3) && (length > 0))|| (word_length > 1 && Unpronouncable(tr,wordx)))
 
1045
                while(((length < 3) && (length > 0))|| (word_length > 1 && Unpronouncable(tr, wordx, posn)))
1011
1046
                {
1012
1047
                        // This word looks "unpronouncable", so speak letters individually until we
1013
1048
                        // find a remainder that we can pronounce.
 
1049
                        was_unpronouncable = FLAG_WAS_UNPRONOUNCABLE;
1014
1050
                        emphasize_allcaps = 0;
1015
1051
 
1016
1052
                        if(wordx[0] == '\'')
1084
1120
 
1085
1121
                        found = 0;
1086
1122
                        confirm_prefix = 1;
1087
 
                        while(end_type & SUFX_P)
 
1123
                        for (int loopcount = 0; (loopcount < 50) && (end_type & SUFX_P); loopcount++)
1088
1124
                        {
1089
1125
                                // Found a standard prefix, remove it and retranslate
1090
 
 
 
1126
                                // loopcount guards against an endless loop
1091
1127
                                if(confirm_prefix && !(end_type & SUFX_B))
1092
1128
                                {
1093
1129
                                        int end2;
1160
1196
 
1161
1197
                                if(prefix_type & SUFX_B)
1162
1198
                                {
1163
 
// SUFX_B is used for Turkish, tr_rules contains "(Pb£
1164
 
                                        // retranslate the prefix part
 
1199
// SUFX_B is used for Turkish, tr_rules contains " ' (Pb"
 
1200
                                        // examine the prefix part
1165
1201
                                        char *wordpf;
1166
1202
                                        char prefix_phonemes2[12];
1167
1203
 
1168
1204
                                        strncpy0(prefix_phonemes2,end_phonemes,sizeof(prefix_phonemes2));
1169
1205
                                        wordpf = &prefix_chars[1];
1170
 
                                        found = LookupDictList(tr, &wordpf, phonemes, dictionary_flags, SUFX_P, wtab);   // without prefix
1171
 
                                        if(found == 0)
1172
 
                                        {
1173
 
                                                end_type = TranslateRules(tr, wordpf, phonemes, N_WORD_PHONEMES, end_phonemes, 0, dictionary_flags);
1174
 
                                                sprintf(prefix_phonemes,"%s%s%s",phonemes,end_phonemes,prefix_phonemes2);
1175
 
                                        }
1176
 
                                        prefix_flags = 1;
 
1206
                                        strcpy(prefix_phonemes, phonemes);
 
1207
 
 
1208
                                        // look for stress marker or $abbrev
 
1209
                                        found = LookupDictList(tr, &wordpf, phonemes, dictionary_flags, 0, wtab);
 
1210
                                        if(found)
 
1211
                                        {
 
1212
                                                strcpy(prefix_phonemes, phonemes);
 
1213
                                        }
 
1214
                                        if(dictionary_flags[0] & FLAG_ABBREV)
 
1215
                                        {
 
1216
                                                prefix_phonemes[0] = 0;
 
1217
                                                SpeakIndividualLetters(tr, wordpf, prefix_phonemes, 1);
 
1218
                                        }
1177
1219
                                }
1178
1220
                                else
1179
1221
                                {
1220
1262
                                        wordx[-1] = c_temp;
1221
1263
                                        found = LookupDictList(tr, &word1, phonemes, dictionary_flags2, end_flags, wtab);  // include prefix, but not suffix
1222
1264
                                        wordx[-1] = ' ';
 
1265
                                        if(phonemes[0] == phonSWITCH)
 
1266
                                        {
 
1267
                                                // change to another language in order to translate this word
 
1268
                                                memcpy(wordx,word_copy,strlen(word_copy));
 
1269
                                                strcpy(word_phonemes,phonemes);
 
1270
                                                return(0);
 
1271
                                        }
1223
1272
                                        if(dictionary_flags[0]==0)
1224
1273
                                        {
1225
1274
                                                dictionary_flags[0] = dictionary_flags2[0];
1484
1533
                ApplySpecialAttribute2(tr,word_phonemes,dictionary_flags[0]);
1485
1534
        }
1486
1535
 
 
1536
        dictionary_flags[0] |= was_unpronouncable;
 
1537
        memcpy(word_start, word_copy2, word_copy_length);
1487
1538
        return(dictionary_flags[0]);
1488
1539
}  //  end of TranslateWord
1489
1540
 
1709
1760
                {
1710
1761
                        char *p2;
1711
1762
                        int ok = 1;
1712
 
                        int flags2 = 0;
 
1763
                        unsigned int flags2[2];
1713
1764
                        int c_word2;
1714
1765
                        char ph_buf[N_WORD_PHONEMES];
1715
1766
 
 
1767
                        flags2[0] = 0;
1716
1768
                        sylimit = tr->langopts.param[LOPT_COMBINE_WORDS];
1717
1769
 
1718
1770
                        // LANG=cs,sk
1729
1781
                        if(ok != 0)
1730
1782
                        {
1731
1783
                                strcpy(ph_buf,word_phonemes);
1732
 
                                flags2 = TranslateWord(translator, p2+1, 0, wtab+1);
 
1784
 
 
1785
                                flags2[0] = TranslateWord(translator, p2+1, 0, wtab+1);
 
1786
                                if(flags2[0] & FLAG_WAS_UNPRONOUNCABLE)
 
1787
                                        ok = 0;
1733
1788
 
1734
1789
                                if(sylimit & 0x100)
1735
1790
                                {
1736
1791
                                        // only if the second word has $alt attribute
1737
 
                                        if((flags2 & FLAG_ALT_TRANS) == 0)
 
1792
                                        if((flags2[0] & FLAG_ALT_TRANS) == 0)
1738
1793
                                        {
1739
1794
                                                ok = 0;
1740
1795
                                        }
1765
1820
                                else
1766
1821
                                {
1767
1822
                                        if(flags == 0)
1768
 
                                                flags = flags2;   // no flags for the combined word, so use flags from the second word eg. lang-hu "nem december 7-e"
 
1823
                                                flags = flags2[0];   // no flags for the combined word, so use flags from the second word eg. lang-hu "nem december 7-e"
1769
1824
                                        flags |= FLAG_SKIPWORDS;
1770
1825
                                        dictionary_skipwords = 1;
1771
1826
                                }
2288
2343
        int cc;
2289
2344
        unsigned int source_index=0;
2290
2345
        unsigned int prev_source_index=0;
 
2346
        int source_index_word=0;
2291
2347
        int prev_in;
2292
2348
        int prev_out=' ';
2293
2349
        int prev_out2;
2329
2385
        int tone;
2330
2386
        int tone2;
2331
2387
 
 
2388
        if(tr==NULL)
 
2389
        {
 
2390
                return(NULL);
 
2391
        }
 
2392
 
2332
2393
        p_textinput = (unsigned char *)vp_input;
2333
2394
        p_wchar_input = (wchar_t *)vp_input;
2334
2395
 
2431
2492
        words[0].flags = 0;
2432
2493
        finished = 0;
2433
2494
 
2434
 
        for(j=0; charix[j]==0; j++);
 
2495
        for(j=0; charix[j]<=0; j++);
2435
2496
        words[0].sourceix = charix[j];
2436
2497
        k = 0;
2437
2498
        while(charix[j] != 0)
2744
2805
                                        {
2745
2806
                                                // '-' between two letters is a hyphen, treat as a space
2746
2807
                                                word_flags |= FLAG_HYPHEN;
2747
 
                                                words[word_count-1].flags |= FLAG_HYPHEN_AFTER;
 
2808
                                                if(word_count > 0)
 
2809
                                                        words[word_count-1].flags |= FLAG_HYPHEN_AFTER;
2748
2810
                                                c = ' ';
2749
2811
                                        }
2750
2812
                                }
2774
2836
                        else
2775
2837
                        if(c == '.')
2776
2838
                        {
2777
 
                                if(!(words[word_count-1].flags & FLAG_NOSPACE) && IsAlpha(prev_in))
 
2839
                                if(prev_out == '.')
 
2840
                                {
 
2841
                                        // multiple dots, separate by spaces. Note: more than 3 dots have already been replaced by an ellipsis
 
2842
                                        c = ' ';
 
2843
                                        space_inserted = 1;
 
2844
                                }
 
2845
                                else
 
2846
                                if((word_count > 0) && !(words[word_count-1].flags & FLAG_NOSPACE) && IsAlpha(prev_in))
2778
2847
                                {
2779
2848
                                        // dot after a word, with space following, probably an abbreviation
2780
2849
                                        words[word_count-1].flags |= FLAG_HAS_DOT;
2781
2850
 
2782
 
                                        if(IsSpace(next_in))
2783
 
                                                c = ' ';   // remove the dot if it's followed by a space, so that it's not pronounced
 
2851
                                        if(IsSpace(next_in) || (next_in == '-'))
 
2852
                                                c = ' ';   // remove the dot if it's followed by a space or hyphen, so that it's not pronounced
2784
2853
                                }
2785
2854
                        }
2786
2855
                        else
2878
2947
 
2879
2948
                        if(space_inserted)
2880
2949
                        {
2881
 
                                words[word_count].length = source_index - words[word_count].sourceix;
 
2950
                                // count the number of characters since the start of the word
 
2951
                                j = 0;
 
2952
                                k = source_index - 1;
 
2953
                                while((k >= source_index_word) && (charix[k] != 0))
 
2954
                                {
 
2955
                                        if(charix[k] > 0)    // don't count initial bytes of multi-byte character
 
2956
                                                j++;
 
2957
                                        k--;
 
2958
                                }
 
2959
                                words[word_count].length = j;
2882
2960
                        }
2883
2961
 
 
2962
                        source_index_word = source_index;
 
2963
 
2884
2964
                        // end of 'word'
2885
2965
                        sbuf[ix++] = ' ';
2886
2966
 
2887
 
                        if((ix > words[word_count].start) && (word_count < N_CLAUSE_WORDS-1))
 
2967
                        if((word_count < N_CLAUSE_WORDS-1) && (ix > words[word_count].start))
2888
2968
                        {
2889
2969
                                if(embedded_count > 0)
2890
2970
                                {
2941
3021
                }
2942
3022
                else
2943
3023
                {
2944
 
                        ix += utf8_out(c,&sbuf[ix]);   // sbuf[ix++] = c;
 
3024
                        if((ix < (N_TR_SOURCE - 4)))
 
3025
                                ix += utf8_out(c,&sbuf[ix]);   // sbuf[ix++] = c;
2945
3026
                }
2946
3027
                if(pre_pause_add > pre_pause)
2947
3028
                        pre_pause = pre_pause_add;
3010
3091
                                        *pn++ = *pw++;
3011
3092
                                }
3012
3093
                                else
3013
 
                                if((*pw == tr->langopts.thousands_sep) && (pw[1] == ' ') && iswdigit(pw[2]))
 
3094
                                if((*pw == tr->langopts.thousands_sep) && (pw[1] == ' ')
 
3095
                                        && iswdigit(pw[2]) && (pw[3] != ' ') && (pw[4] != ' '))  // don't allow only 1 or 2 digits in the final part
3014
3096
                                {
3015
3097
                                        pw += 2;
3016
3098
                                        ix++;  // skip "word"