1
1
/***************************************************************************
2
* Copyright (C) 2005 to 2010 by Jonathan Duddington *
2
* Copyright (C) 2005 to 2011 by Jonathan Duddington *
3
3
* email: jonsd@users.sourceforge.net *
5
5
* This program is free software; you can redistribute it and/or modify *
122
122
// Zero-terminated table of character codes; its only entry is the underscore.
// NOTE(review): presumably the characters at which a word may be broken -
// confirm against the code that reads this table.
static const unsigned short breaks[] = {
	'_',	// underscore
	0	// end of list
};
124
124
// treat these characters as spaces, in addition to iswspace()
125
// Zero-terminated wide-character list, suitable for searching with wcschr().
static const wchar_t chars_space[] = {
	0x2500,	// box drawing horizontal line
	0	// end of list
};
125
// static const wchar_t chars_space[] = {0x2500,0x2501,0}; // box drawing horiz
128
128
// Translate character codes 0xA0 to 0xFF into their unicode values
395
398
if((c >= 0x300) && (c <= 0x36f))
396
399
return(1); // combining accents
401
if((c >= 0x780) && (c <= 0x7b1))
402
return(1); // taani/divehi (maldives)
398
404
if((c >= 0x1100) && (c <= 0x11ff))
399
405
return(1); //Korean jamo
426
432
{//========================
429
if(wcschr(chars_space,c))
435
if((c >= 0x2500) && (c < 0x25a0))
436
return(1); // box drawing characters
437
// if(wcschr(chars_space,c))
431
439
return(iswspace(c));
585
int IsAllUpper(const char *word)
586
{//=============================
588
while((*word != 0) && !isspace2(*word))
590
word += utf8_in(&c, word);
577
598
static char *SpeakIndividualLetters(Translator *tr, char *word, char *phonemes, int spell_word)
578
599
{//============================================================================================
763
int TranslateWord(Translator *tr, char *word1, int next_pause, WORD_TAB *wtab)
784
int TranslateWord(Translator *tr, char *word_start, int next_pause, WORD_TAB *wtab)
764
785
{//===========================================================================
765
786
// word1 is terminated by space (0x20) character
781
803
char unpron_phonemes[N_WORD_PHONEMES];
782
804
char end_phonemes[N_WORD_PHONEMES];
783
805
char word_copy[N_WORD_BYTES];
784
char prefix_chars[41];
806
char word_copy2[N_WORD_BYTES];
807
int word_copy_length;
808
char prefix_chars[0x3f + 2];
787
811
char c_temp; // save a character byte while we temporarily replace it with space
871
word_copy_length = wordx - word_start;
872
if(word_copy_length >= N_WORD_BYTES)
873
word_copy_length = N_WORD_BYTES-1;
874
memcpy(word_copy2, word_start, word_copy_length);
847
878
if(option_sayas == SAYAS_KEY)
959
990
if((tr->langopts.numbers & NUM_ROMAN) || ((tr->langopts.numbers & NUM_ROMAN_CAPITALS) && (wflags & FLAG_ALL_UPPER)))
961
if((found = TranslateRoman(tr, word1, phonemes, wtab)) != 0)
962
dictionary_flags[0] |= FLAG_ABBREV; // prevent emphasis if capitals
992
if((wflags & FLAG_LAST_WORD) || !(wtab[1].flags & FLAG_NOSPACE))
994
// don't use Roman number if this word is not separated from the next word (eg. "XLTest")
995
if((found = TranslateRoman(tr, word1, phonemes, wtab)) != 0)
996
dictionary_flags[0] |= FLAG_ABBREV; // prevent emphasis if capitals
1010
while(((length < 3) && (length > 0))|| (word_length > 1 && Unpronouncable(tr,wordx)))
1045
while(((length < 3) && (length > 0))|| (word_length > 1 && Unpronouncable(tr, wordx, posn)))
1012
1047
// This word looks "unpronouncable", so speak letters individually until we
1013
1048
// find a remainder that we can pronounce.
1049
was_unpronouncable = FLAG_WAS_UNPRONOUNCABLE;
1014
1050
emphasize_allcaps = 0;
1016
1052
if(wordx[0] == '\'')
1086
1122
confirm_prefix = 1;
1087
while(end_type & SUFX_P)
1123
for (int loopcount = 0; (loopcount < 50) && (end_type & SUFX_P); loopcount++)
1089
1125
// Found a standard prefix, remove it and retranslate
1126
// loopcount guards against an endless loop
1091
1127
if(confirm_prefix && !(end_type & SUFX_B))
1161
1197
if(prefix_type & SUFX_B)
1163
// SUFX_B is used for Turkish, tr_rules contains "(Pb£"
1164
// retranslate the prefix part
1199
// SUFX_B is used for Turkish, tr_rules contains " ' (Pb"
1200
// examine the prefix part
1166
1202
char prefix_phonemes2[12];
1168
1204
strncpy0(prefix_phonemes2,end_phonemes,sizeof(prefix_phonemes2));
1169
1205
wordpf = &prefix_chars[1];
1170
found = LookupDictList(tr, &wordpf, phonemes, dictionary_flags, SUFX_P, wtab); // without prefix
1173
end_type = TranslateRules(tr, wordpf, phonemes, N_WORD_PHONEMES, end_phonemes, 0, dictionary_flags);
1174
sprintf(prefix_phonemes,"%s%s%s",phonemes,end_phonemes,prefix_phonemes2);
1206
strcpy(prefix_phonemes, phonemes);
1208
// look for stress marker or $abbrev
1209
found = LookupDictList(tr, &wordpf, phonemes, dictionary_flags, 0, wtab);
1212
strcpy(prefix_phonemes, phonemes);
1214
if(dictionary_flags[0] & FLAG_ABBREV)
1216
prefix_phonemes[0] = 0;
1217
SpeakIndividualLetters(tr, wordpf, prefix_phonemes, 1);
1220
1262
wordx[-1] = c_temp;
1221
1263
found = LookupDictList(tr, &word1, phonemes, dictionary_flags2, end_flags, wtab); // include prefix, but not suffix
1222
1264
wordx[-1] = ' ';
1265
if(phonemes[0] == phonSWITCH)
1267
// change to another language in order to translate this word
1268
memcpy(wordx,word_copy,strlen(word_copy));
1269
strcpy(word_phonemes,phonemes);
1223
1272
if(dictionary_flags[0]==0)
1225
1274
dictionary_flags[0] = dictionary_flags2[0];
1484
1533
ApplySpecialAttribute2(tr,word_phonemes,dictionary_flags[0]);
1536
dictionary_flags[0] |= was_unpronouncable;
1537
memcpy(word_start, word_copy2, word_copy_length);
1487
1538
return(dictionary_flags[0]);
1488
1539
} // end of TranslateWord
1731
1783
strcpy(ph_buf,word_phonemes);
1732
flags2 = TranslateWord(translator, p2+1, 0, wtab+1);
1785
flags2[0] = TranslateWord(translator, p2+1, 0, wtab+1);
1786
if(flags2[0] & FLAG_WAS_UNPRONOUNCABLE)
1734
1789
if(sylimit & 0x100)
1736
1791
// only if the second word has $alt attribute
1737
if((flags2 & FLAG_ALT_TRANS) == 0)
1792
if((flags2[0] & FLAG_ALT_TRANS) == 0)
1768
flags = flags2; // no flags for the combined word, so use flags from the second word eg. lang-hu "nem december 7-e"
1823
flags = flags2[0]; // no flags for the combined word, so use flags from the second word eg. lang-hu "nem december 7-e"
1769
1824
flags |= FLAG_SKIPWORDS;
1770
1825
dictionary_skipwords = 1;
2745
2806
// '-' between two letters is a hyphen, treat as a space
2746
2807
word_flags |= FLAG_HYPHEN;
2747
words[word_count-1].flags |= FLAG_HYPHEN_AFTER;
2809
words[word_count-1].flags |= FLAG_HYPHEN_AFTER;
2777
if(!(words[word_count-1].flags & FLAG_NOSPACE) && IsAlpha(prev_in))
2841
// multiple dots, separate by spaces. Note >3 dots has been replaced by ellipsis
2846
if((word_count > 0) && !(words[word_count-1].flags & FLAG_NOSPACE) && IsAlpha(prev_in))
2779
2848
// dot after a word, with space following, probably an abbreviation
2780
2849
words[word_count-1].flags |= FLAG_HAS_DOT;
2782
if(IsSpace(next_in))
2783
c = ' '; // remove the dot if it's followed by a space, so that it's not pronounced
2851
if(IsSpace(next_in) || (next_in == '-'))
2852
c = ' '; // remove the dot if it's followed by a space or hyphen, so that it's not pronounced
2879
2948
if(space_inserted)
2881
words[word_count].length = source_index - words[word_count].sourceix;
2950
// count the number of characters since the start of the word
2952
k = source_index - 1;
2953
while((k >= source_index_word) && (charix[k] != 0))
2955
if(charix[k] > 0) // don't count initial bytes of multi-byte character
2959
words[word_count].length = j;
2962
source_index_word = source_index;
2884
2964
// end of 'word'
2885
2965
sbuf[ix++] = ' ';
2887
if((ix > words[word_count].start) && (word_count < N_CLAUSE_WORDS-1))
2967
if((word_count < N_CLAUSE_WORDS-1) && (ix > words[word_count].start))
2889
2969
if(embedded_count > 0)
2944
ix += utf8_out(c,&sbuf[ix]); // sbuf[ix++] = c;
3024
if((ix < (N_TR_SOURCE - 4)))
3025
ix += utf8_out(c,&sbuf[ix]); // sbuf[ix++] = c;
2946
3027
if(pre_pause_add > pre_pause)
2947
3028
pre_pause = pre_pause_add;