2
# -*- coding: utf-8 -*-
3
#A part of NonVisual Desktop Access (NVDA)
4
#Copyright (C) 2010-2011 Takuya Nishimoto (nishimotz.com)
5
#This file is covered by the GNU General Public License.
6
#See the file COPYING for more details.
8
# Japanese speech engine wrapper for Open JTalk
9
# http://ja.nishimotz.com/project:libopenjtalk
19
from nvdajptext.mecab import *
23
sys.path.append(os.getcwd())
24
from nvdajptext.mecab import *
26
############################################
28
# htsengineapi/include/HTS_engine.h
38
class HTS_ModelSet(Structure):
    """Opaque stand-in for the C ``HTS_ModelSet`` struct.

    No member is named on the Python side; the struct is declared as a
    56-byte blob so that it occupies the right amount of space.
    """
    _fields_ = [
        ("_dummy", c_byte * 56),
    ]
43
class HTS_Label(Structure):
    """Opaque stand-in for the C ``HTS_Label`` struct (76-byte blob)."""
    _fields_ = [
        ("_dummy", c_byte * 76),
    ]


# Pointer type used wherever a ``HTS_Label *`` is needed.
HTS_Label_ptr = POINTER(HTS_Label)
49
class HTS_SStreamSet(Structure):
    """Opaque stand-in for the C ``HTS_SStreamSet`` struct (24-byte blob)."""
    _fields_ = [
        ("_dummy", c_byte * 24),
    ]
54
class HTS_PStreamSet(Structure):
    """Opaque stand-in for the C ``HTS_PStreamSet`` struct (12-byte blob)."""
    _fields_ = [
        ("_dummy", c_byte * 12),
    ]
59
class HTS_GStream(Structure):
    """ctypes mirror of the C ``HTS_GStream`` struct (one generated parameter stream)."""
    _fields_ = [
        ("static_length", c_int),  # int static_length; /* static features length */
        ("par", c_double_p_p),     # double **par;      /* generated parameter */
    ]


# Pointer type used for the ``HTS_GStream *gstream`` member of HTS_GStreamSet.
HTS_GStream_ptr = POINTER(HTS_GStream)
67
# FIXME: engine.gss.total_nsample is always 0
class HTS_GStreamSet(Structure):
    """ctypes mirror of the C ``HTS_GStreamSet`` struct (generated streams and speech)."""
    _fields_ = [
        ("total_nsample", c_int),      # int total_nsample;   /* total sample */
        ("total_frame", c_int),        # int total_frame;     /* total frame */
        ("nstream", c_int),            # int nstream;         /* # of streams */
        ("gstream", HTS_GStream_ptr),  # HTS_GStream *gstream; /* generated parameter streams */
        ("gspeech", c_short_p),        # short *gspeech;      /* generated speech */
    ]


# Pointer type used by the HTS_GStreamSet_* prototypes.
HTS_GStreamSet_ptr = POINTER(HTS_GStreamSet)
78
class HTS_Global(Structure):
    """ctypes mirror of the C ``HTS_Global`` struct (global synthesis settings)."""
    _fields_ = [
        # NOTE(review): the C comment below talks about "stage"; the Python
        # member is named "state" -- confirm against HTS_engine.h before renaming.
        ("state", c_int),                # /* Gamma=-1/stage : if stage=0 then Gamma=0 */
        ("use_log_gain", c_int),         # HTS_Boolean (TRUE=1) /* log gain flag (for LSP) */
        ("sampling_rate", c_int),        # /* sampling rate */
        ("fperiod", c_int),              # /* frame period */
        ("alpha", c_double),             # /* all-pass constant */
        ("beta", c_double),              # /* postfiltering coefficient */
        ("audio_buff_size", c_int),      # /* audio buffer size (for audio device) */
        ("msd_threshold", c_double_p),   # /* MSD thresholds */
        ("duration_iw", c_double_p),     # /* weights for duration interpolation */
        ("parameter_iw", c_double_p_p),  # /* weights for parameter interpolation */
        ("gv_iw", c_double_p_p),         # /* weights for GV interpolation */
        ("gv_weight", c_double_p),       # /* GV weights */
    ]


# Pointer type for ``HTS_Global *``.
HTS_Global_ptr = POINTER(HTS_Global)
95
# ctypes description of the C HTS_Engine struct. An instance is created at
# module level (`engine`) and handed to the libjt HTS_Engine_* calls through
# HTS_Engine_ptr.
# NOTE(review): this listing shows the member tuples without the surrounding
# `_fields_ = [` / `]` lines, and original line 99 is absent. HTS_ModelSet is
# declared in this file but not referenced by any member shown here, so the
# missing line may be a model-set member -- confirm against HTS_engine.h
# before relying on the member offsets.
class HTS_Engine(Structure):
97
# "global" is a Python keyword, so this member can only be reached with
# getattr(engine, "global"), not with attribute syntax.
("global", HTS_Global),
98
# HTS_Audio audio (removed from HTS_Engine_API)
100
("label", HTS_Label),
101
("sss", HTS_SStreamSet),
102
("pss", HTS_PStreamSet),
103
("gss", HTS_GStreamSet),
104
# The two lf0 members are of type double; HTS_Engine_set_lf0_offset_amp is
# declared with two matching c_double arguments -- presumably it fills them.
("lf0_offset", c_double),
105
("lf0_amp", c_double),
107
# Pointer type used as the first argument of the HTS_Engine_* / jt_* prototypes.
HTS_Engine_ptr = POINTER(HTS_Engine)
109
############################################
111
class NJD(Structure):
    """Opaque stand-in for the C ``NJD`` struct (8-byte blob)."""
    _fields_ = [
        ("_dummy", c_byte * 8),
    ]


# Pointer type used by the NJD_* / njd_* prototypes.
NJD_ptr = POINTER(NJD)
117
class JPCommonNode(Structure):
    """Field-less declaration of the C ``JPCommonNode`` struct.

    No members are described; in the code visible here the type is only
    referred to through ``JPCommonNode_ptr``.
    """


# Pointer type for ``JPCommonNode *``.
JPCommonNode_ptr = POINTER(JPCommonNode)
121
class JPCommonLabel(Structure):
    """Field-less declaration of the C ``JPCommonLabel`` struct.

    No members are described; in the code visible here the type is only
    referred to through ``JPCommonLabel_ptr``.
    """


# Pointer type for ``JPCommonLabel *``.
JPCommonLabel_ptr = POINTER(JPCommonLabel)
125
class JPCommon(Structure):
    """ctypes mirror of the C ``JPCommon`` struct: three pointers (head, tail, label)."""
    _fields_ = [
        ("head", JPCommonNode_ptr),
        ("tail", JPCommonNode_ptr),
        ("label", JPCommonLabel_ptr),
    ]


# Pointer type used by the JPCommon_* prototypes.
JPCommon_ptr = POINTER(JPCommon)
134
# Report label strings through a logging callback.
#   feature   -- indexable object; each feature[i] is passed to string_at()
#   size      -- number of entries to visit (formatted with "%d", used by xrange)
#   logwrite_ -- callable that receives one string; when None nothing is done
# NOTE(review): this listing is missing original lines 138 and 142-147, so the
# statement after the "size: 0" message, the rest of the loop body and whatever
# finally emits `s2` are not visible here -- consult the full file.
def JPC_label_print(feature, size, logwrite_):
135
# No logger: nothing to do.
if logwrite_ == None: return
136
# Missing input is reported as an empty label list.
if feature == None or size == None:
137
logwrite_( "JPC_label_print size: 0" )
139
# Header line of the report text.
s2 = "JPC_label_print size: %d\n" % size
140
for i in xrange(0, size):
141
# Read the NUL-terminated C string stored at entry i.
s = string_at(feature[i])
148
#############################################
151
# Fixed-size C character buffer (FNLEN chars) used for file names handed to
# libjt, plus the pointer flavours the loader prototypes are declared with.
FILENAME = c_char * FNLEN
FILENAME_ptr = POINTER(FILENAME)
FILENAME_ptr_ptr = POINTER(FILENAME_ptr)
FILENAME_ptr_x3 = FILENAME_ptr * 3  # array of three file-name pointers
FILENAME_ptr_x3_ptr = POINTER(FILENAME_ptr_x3)

# Module-wide state objects shared by the libjt_* functions.
engine = HTS_Engine()
jpcommon = JPCommon()
164
# NOTE(review): the `def` line that owns these two statements is not visible
# in this listing (original lines 162-163 are absent).
# Yields the fixed text "libjt version none" while no library handle is set,
# otherwise the value returned by the DLL's jt_version().
if libjt == None: return "libjt version none"
165
return libjt.jt_version()
167
def libjt_initialize(JT_DLL, **args):
    """Load the libjt DLL (first call only), initialise the shared NJD /
    JPCommon / HTS engine objects and declare the ctypes prototypes of the
    libjt functions used by this module.

    JT_DLL -- path handed to ``cdll.LoadLibrary``.
    args   -- keyword settings; ``use_lpf``, ``samp_rate``, ``fperiod`` and
              ``alpha`` are required (a missing key raises KeyError).
    """
    global libjt, njd, jpcommon, engine, use_lpf
    use_lpf = args['use_lpf']

    if libjt is None:
        libjt = cdll.LoadLibrary(JT_DLL)
    libjt.jt_version.restype = c_char_p

    libjt.NJD_initialize.argtypes = [NJD_ptr]
    libjt.NJD_initialize(njd)

    libjt.JPCommon_initialize.argtypes = [JPCommon_ptr]
    libjt.JPCommon_initialize(jpcommon)

    # The engine gets a third stream (index 2, the "lpf" stream) only when
    # use_lpf is set; initialising it twice with both counts would be wrong.
    libjt.HTS_Engine_initialize.argtypes = [HTS_Engine_ptr, c_int]
    libjt.HTS_Engine_initialize(engine, 3 if use_lpf else 2)

    libjt.HTS_Engine_set_sampling_rate.argtypes = [HTS_Engine_ptr, c_int]
    libjt.HTS_Engine_set_sampling_rate(engine, args['samp_rate'])  # e.g. 16000

    # If the sampling rate is 16000, a frame period of 80 points is 5 ms.
    libjt.HTS_Engine_set_fperiod.argtypes = [HTS_Engine_ptr, c_int]
    libjt.HTS_Engine_set_fperiod(engine, args['fperiod'])

    libjt.HTS_Engine_set_alpha.argtypes = [HTS_Engine_ptr, c_double]
    libjt.HTS_Engine_set_alpha(engine, args['alpha'])  # e.g. 0.42

    libjt.HTS_Engine_set_gamma.argtypes = [HTS_Engine_ptr, c_int]
    libjt.HTS_Engine_set_gamma(engine, 0)

    libjt.HTS_Engine_set_log_gain.argtypes = [HTS_Engine_ptr, c_int]
    libjt.HTS_Engine_set_log_gain(engine, 0)

    libjt.HTS_Engine_set_beta.argtypes = [HTS_Engine_ptr, c_double]
    libjt.HTS_Engine_set_beta(engine, 0.0)

    libjt.HTS_Engine_set_audio_buff_size.argtypes = [HTS_Engine_ptr, c_int]
    libjt.HTS_Engine_set_audio_buff_size(engine, 1600)

    libjt.HTS_Engine_set_msd_threshold.argtypes = [HTS_Engine_ptr, c_int, c_double]
    libjt.HTS_Engine_set_msd_threshold(engine, 1, 0.5)

    libjt.HTS_Engine_set_gv_weight.argtypes = [HTS_Engine_ptr, c_int, c_double]
    libjt.HTS_Engine_set_gv_weight(engine, 0, 1.0)
    libjt.HTS_Engine_set_gv_weight(engine, 1, 0.7)
    if use_lpf:
        # Stream 2 exists only when the engine was initialised with 3 streams.
        libjt.HTS_Engine_set_gv_weight(engine, 2, 1.0)

    # ---- prototypes for libjt_synthesis() and the other libjt_* helpers ----
    libjt.mecab2njd.argtypes = [NJD_ptr, FEATURE_ptr_array_ptr, c_int]
    libjt.njd2jpcommon.argtypes = [JPCommon_ptr, NJD_ptr]

    # Functions taking a single NJD*.
    for fn_name in ("njd_set_pronunciation", "njd_set_digit",
                    "njd_set_accent_phrase", "njd_set_accent_type",
                    "njd_set_unvoiced_vowel", "njd_set_long_vowel",
                    "NJD_refresh", "NJD_print", "NJD_clear"):
        getattr(libjt, fn_name).argtypes = [NJD_ptr]

    # Functions taking a single JPCommon*.
    for fn_name in ("JPCommon_make_label", "JPCommon_get_label_size",
                    "JPCommon_get_label_feature", "JPCommon_refresh",
                    "JPCommon_print", "JPCommon_clear"):
        getattr(libjt, fn_name).argtypes = [JPCommon_ptr]
    libjt.JPCommon_get_label_feature.restype = c_char_p_p
    libjt.JPCommonLabel_print.argtypes = [JPCommonLabel_ptr]

    # Functions taking a single HTS_Engine*.
    for fn_name in ("HTS_Engine_create_sstream", "HTS_Engine_create_pstream",
                    "HTS_Engine_create_gstream", "HTS_Engine_refresh",
                    "HTS_Engine_clear", "jt_total_nsample", "jt_speech_ptr"):
        getattr(libjt, fn_name).argtypes = [HTS_Engine_ptr]
    libjt.jt_speech_ptr.restype = c_short_p

    libjt.HTS_Engine_load_label_from_string_list.argtypes = [
        HTS_Engine_ptr, c_char_p_p, c_int]
    libjt.HTS_Engine_set_lf0_offset_amp.argtypes = [HTS_Engine_ptr, c_double, c_double]

    libjt.HTS_GStreamSet_get_total_nsample.argtypes = [HTS_GStreamSet_ptr]
    libjt.HTS_GStreamSet_get_speech.argtypes = [HTS_GStreamSet_ptr, c_int]

    libjt.jt_save_logs.argtypes = [c_char_p, HTS_Engine_ptr, NJD_ptr]
    libjt.jt_save_riff.argtypes = [c_char_p, HTS_Engine_ptr]
    libjt.jt_speech_normalize.argtypes = [HTS_Engine_ptr, c_short]
    libjt.jt_trim_silence.argtypes = [HTS_Engine_ptr, c_short, c_short]
    libjt.jt_trim_silence.restype = c_int
262
def _jt_path_ptr(voice_dir, basename, keepalive):
    """Return a FILENAME_ptr to a new C string buffer holding voice_dir/basename."""
    buf = create_string_buffer(voice_dir + os.sep + basename)
    keepalive.append(buf)
    return cast(byref(buf), FILENAME_ptr)


def _jt_path_ptr_ptr(voice_dir, basename, keepalive):
    """Return a FILENAME_ptr_ptr addressing a single path pointer for voice_dir/basename."""
    ptr = _jt_path_ptr(voice_dir, basename, keepalive)
    keepalive.append(ptr)
    return cast(byref(ptr), FILENAME_ptr_ptr)


def _jt_window_ptrs(voice_dir, basenames, keepalive):
    """Return a FILENAME_ptr_x3_ptr for up to three window files; unused slots are NULL."""
    slots = [_jt_path_ptr(voice_dir, name, keepalive) for name in basenames]
    while len(slots) < 3:
        slots.append(cast(0, FILENAME_ptr))
    table = FILENAME_ptr_x3(*slots)
    keepalive.append(table)
    return cast(byref(table), FILENAME_ptr_x3_ptr)


def libjt_load(VOICE):
    """Load the voice files found in directory VOICE into the shared HTS engine.

    Files read (all relative to VOICE): dur.pdf / tree-dur.inf, the mgc and lf0
    model, tree and window files, the lpf files when ``use_lpf`` is set, the
    gv-mgc / gv-lf0 model and tree files, and gv-switch.inf.
    libjt_initialize() must have been called first so that ``libjt`` and
    ``use_lpf`` are set.
    """
    global libjt, engine, use_lpf
    # Owns every ctypes buffer/pointer built below so none is collected while
    # the DLL is still reading it.
    keepalive = []

    libjt.HTS_Engine_load_duration_from_fn.argtypes = [
        HTS_Engine_ptr, FILENAME_ptr_ptr, FILENAME_ptr_ptr, c_int]
    libjt.HTS_Engine_load_duration_from_fn(
        engine,
        _jt_path_ptr_ptr(VOICE, "dur.pdf", keepalive),
        _jt_path_ptr_ptr(VOICE, "tree-dur.inf", keepalive),
        1)

    libjt.HTS_Engine_load_parameter_from_fn.argtypes = [
        HTS_Engine_ptr, FILENAME_ptr_ptr, FILENAME_ptr_ptr,
        FILENAME_ptr_x3_ptr, c_int, c_int, c_int, c_int]
    # (file prefix, stream index, MSD flag, number of window files)
    # NOTE(review): only the lpf call's trailing arguments (2, 0, 1, 1) were
    # fully legible; the mgc / lf0 rows follow the same pattern, with the
    # stream indices used by the GV loading below and the MSD threshold that
    # is configured for stream 1. Confirm against the HTS_engine API.
    streams = [
        ("mgc", 0, 0, 3),
        ("lf0", 1, 1, 3),
    ]
    if use_lpf:
        # Stream 2 exists only when the engine was initialised with 3 streams.
        streams.append(("lpf", 2, 0, 1))
    for prefix, stream_index, msd_flag, window_count in streams:
        window_files = ["%s.win%d" % (prefix, k) for k in range(1, window_count + 1)]
        libjt.HTS_Engine_load_parameter_from_fn(
            engine,
            _jt_path_ptr_ptr(VOICE, prefix + ".pdf", keepalive),
            _jt_path_ptr_ptr(VOICE, "tree-" + prefix + ".inf", keepalive),
            _jt_window_ptrs(VOICE, window_files, keepalive),
            stream_index, msd_flag, window_count, 1)

    # NOTE(review): the two trailing c_int entries are inferred from the
    # five-argument calls below.
    libjt.HTS_Engine_load_gv_from_fn.argtypes = [
        HTS_Engine_ptr, FILENAME_ptr_ptr, FILENAME_ptr_ptr, c_int, c_int]
    for prefix, stream_index in (("mgc", 0), ("lf0", 1)):
        libjt.HTS_Engine_load_gv_from_fn(
            engine,
            _jt_path_ptr_ptr(VOICE, "gv-" + prefix + ".pdf", keepalive),
            _jt_path_ptr_ptr(VOICE, "tree-gv-" + prefix + ".inf", keepalive),
            stream_index, 1)

    libjt.HTS_Engine_load_gv_switch_from_fn.argtypes = [
        HTS_Engine_ptr, FILENAME_ptr]
    libjt.HTS_Engine_load_gv_switch_from_fn(
        engine, _jt_path_ptr(VOICE, "gv-switch.inf", keepalive))
361
# NOTE(review): the `def` lines that own these statements are not visible in
# this listing (original lines 359-360 and 364-366 are absent). Judging by the
# gap in numbering these are presumably the bodies of two separate helpers,
# one calling the *_refresh functions and one calling the *_clear functions;
# they are not part of libjt_load -- confirm against the full file.
libjt.HTS_Engine_refresh(engine)
362
libjt.JPCommon_refresh(jpcommon)
363
libjt.NJD_refresh(njd)
367
libjt.JPCommon_clear(jpcommon)
368
libjt.HTS_Engine_clear(engine)
370
# Run the libjt text-analysis and HTS synthesis calls on MeCab features and
# hand the resulting 16-bit samples to a callback.
#   feature           -- MeCab feature array passed to libjt.mecab2njd()
#   size              -- number of features (third argument of mecab2njd)
#   fperiod_          -- frame period given to HTS_Engine_set_fperiod()
#   feed_func_        -- optional callable receiving the raw sample bytes
#   is_speaking_func_ -- optional callable; it is polled twice during synthesis
#   thres_, thres2_   -- thresholds passed to jt_trim_silence()
#   level_            -- level passed to jt_speech_normalize()
#   logwrite_         -- optional callable receiving progress / error strings
#   lf0_offset_, lf0_amp_ -- values passed to HTS_Engine_set_lf0_offset_amp()
# Returns None when feature or size is None.
# NOTE(review): many original lines are absent from this listing (373, 380,
# 384, 386, 388, 393, 396-399, 401, 403-405, 411, 414-417, 419, 421, 428).
# The "error #N" log calls presumably sit in exception handlers and the two
# is_speaking_func_ checks presumably guard an early exit, but neither the
# handlers nor the guarded statements are visible -- consult the full file.
def libjt_synthesis(feature, size, fperiod_=80, feed_func_=None, is_speaking_func_=None, thres_=32, thres2_=32, level_=32767, logwrite_=None, lf0_offset_=0.0, lf0_amp_=1.0):
371
if feature == None or size == None: return None
372
if logwrite_ != None: logwrite_('libjt_synthesis start.')
374
# Per-utterance engine settings, then text analysis on the NJD object.
libjt.HTS_Engine_set_lf0_offset_amp(engine, lf0_offset_, lf0_amp_)
375
libjt.HTS_Engine_set_fperiod(engine, fperiod_) # 80(point=5ms) frame period
376
libjt.mecab2njd(njd, feature, size)
377
libjt.njd_set_pronunciation(njd)
378
libjt.njd_set_digit(njd)
379
libjt.njd_set_accent_phrase(njd)
381
if logwrite_ != None: logwrite_('libjt_synthesis error #1 ')
382
# exception: access violation reading 0x00000000
383
# https://github.com/nishimotz/libopenjtalk/commit/10d3abda6835e0547846fb5e12a36c1425561aaa#diff-66
385
libjt.njd_set_accent_type(njd)
387
if logwrite_ != None: logwrite_('libjt_synthesis njd_set_accent_type() error ')
389
libjt.njd_set_unvoiced_vowel(njd)
390
libjt.njd_set_long_vowel(njd)
391
# Convert the analysed text to JPCommon and build its labels.
libjt.njd2jpcommon(jpcommon, njd)
392
libjt.JPCommon_make_label(jpcommon)
394
if logwrite_ != None: logwrite_('libjt_synthesis error #2 ')
395
if is_speaking_func_ and not is_speaking_func_() :
400
s = libjt.JPCommon_get_label_size(jpcommon)
402
if logwrite_ != None: logwrite_('libjt_synthesis JPCommon_get_label_size() error ')
406
# Feed the label strings to the engine and create its three stream sets.
f = libjt.JPCommon_get_label_feature(jpcommon)
407
libjt.HTS_Engine_load_label_from_string_list(engine, f, s)
408
libjt.HTS_Engine_create_sstream(engine)
409
libjt.HTS_Engine_create_pstream(engine)
410
libjt.HTS_Engine_create_gstream(engine)
412
if logwrite_ != None: logwrite_('libjt_synthesis error #3 ')
413
if is_speaking_func_ and not is_speaking_func_() :
418
libjt.jt_speech_normalize(engine, level_)
420
if logwrite_ != None: logwrite_('libjt_synthesis error #4 ')
422
# jt_trim_silence() returns the sample count used to size the copy below.
total_nsample = libjt.jt_trim_silence(engine, thres_, thres2_)
423
speech_ptr = libjt.jt_speech_ptr(engine)
424
# Samples are C shorts, so the byte length is count * sizeof(c_short).
byte_count = total_nsample * sizeof(c_short)
425
# Copy the samples out of the DLL's buffer into a Python string.
buf = string_at(speech_ptr, byte_count)
426
if feed_func_: feed_func_(buf)
427
#libjt.jt_save_logs("_logfile", engine, njd)
429
if logwrite_ != None: logwrite_('libjt_synthesis error #5 ')
430
if logwrite_ != None: logwrite_('libjt_synthesis done.')