1
/* ----------------------------------------------------------------- */
2
/* The HMM-Based Speech Synthesis Engine "hts_engine API" */
3
/* developed by HTS Working Group */
4
/* http://hts-engine.sourceforge.net/ */
5
/* ----------------------------------------------------------------- */
7
/* Copyright (c) 2001-2011 Nagoya Institute of Technology */
8
/* Department of Computer Science */
10
/* 2001-2008 Tokyo Institute of Technology */
11
/* Interdisciplinary Graduate School of */
12
/* Science and Engineering */
14
/* All rights reserved. */
16
/* Redistribution and use in source and binary forms, with or */
17
/* without modification, are permitted provided that the following */
18
/* conditions are met: */
20
/* - Redistributions of source code must retain the above copyright */
21
/* notice, this list of conditions and the following disclaimer. */
22
/* - Redistributions in binary form must reproduce the above */
23
/* copyright notice, this list of conditions and the following */
24
/* disclaimer in the documentation and/or other materials provided */
25
/* with the distribution. */
26
/* - Neither the name of the HTS working group nor the names of its */
27
/* contributors may be used to endorse or promote products derived */
28
/* from this software without specific prior written permission. */
30
/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */
31
/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */
32
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
33
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
34
/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */
35
/* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, */
36
/* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED */
37
/* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */
38
/* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */
39
/* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, */
40
/* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY */
41
/* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
42
/* POSSIBILITY OF SUCH DAMAGE. */
43
/* ----------------------------------------------------------------- */
49
/* Linkage wrapper macros: the first pair expands to an extern "C" block
 * opener/closer, the second pair expands to nothing.
 * NOTE(review): the #ifdef/#else directives that choose between the two
 * pairs are not visible in this view. */
#define HTS_PSTREAM_C_START extern "C" {
50
#define HTS_PSTREAM_C_END }
52
#define HTS_PSTREAM_C_START
53
#define HTS_PSTREAM_C_END
54
#endif /* __CPLUSPLUS */
58
#include <math.h> /* for sqrt() */
60
/* hts_engine libraries */
61
#include "HTS_hidden.h"
63
/* HTS_finv: calculate 1.0/variance function.
 * x is tested against the INVINF2 threshold separately for the
 * non-negative and the negative case.
 * NOTE(review): the statements guarded by the two range tests and the
 * return statements are not visible in this view. */
64
static double HTS_finv(const double x)
70
/* non-negative x that does not exceed INVINF2 */
if (x <= INVINF2 && x >= 0)
72
/* negative x that is not below -INVINF2 */
if (x >= -INVINF2 && x < 0)
78
/* HTS_PStream_calc_wuw_and_wum: calculate W'U^{-1}W and W'U^{-1}M
 * pst: parameter stream whose sm.wuw rows are rebuilt for every frame
 * m:   index of the static feature dimension being processed
 * For each frame t the row pst->sm.wuw[t][0..width-1] is cleared and then
 * accumulated from the window coefficients and the inverse variances. */
79
static void HTS_PStream_calc_wuw_and_wum(HTS_PStream * pst, const int m)
84
for (t = 0; t < pst->length; t++) {
87
/* reset the band row for frame t */
for (i = 0; i < pst->width; i++)
88
pst->sm.wuw[t][i] = 0.0;
91
/* visit every window i and every offset j inside that window */
for (i = 0; i < pst->win_size; i++)
92
for (j = pst->win_l_width[i]; j <= pst->win_r_width[i]; j++)
93
/* skip offsets that fall outside the sequence or carry a zero coefficient */
if ((t + j >= 0) && (t + j < pst->length)
94
&& (pst->win_coefficient[i][-j] != 0.0)) {
95
/* wu = window coefficient times inverse variance of frame t + j */
wu = pst->win_coefficient[i][-j] *
96
pst->sm.ivar[t + j][i * pst->static_length + m];
98
/* NOTE(review): the left-hand side of this product is not visible in
 * this view; presumably it accumulates into pst->sm.wum[t] - confirm. */
wu * pst->sm.mean[t + j][i * pst->static_length + m];
99
/* accumulate the band entries of row t, staying inside the sequence */
for (k = 0; (k < pst->width) && (t + k < pst->length); k++)
100
if ((k - j <= pst->win_r_width[i])
101
&& (pst->win_coefficient[i][k - j] != 0.0))
102
pst->sm.wuw[t][k] += wu * pst->win_coefficient[i][k - j];
108
/* HTS_PStream_ldl_factorization: Factorize W'*U^{-1}*W to L*D*L' (L: lower triangular, D: diagonal)
 * Works in place on pst->sm.wuw: column 0 of each row is updated first,
 * then columns 1..width-1 are updated and divided by column 0. */
109
static void HTS_PStream_ldl_factorization(HTS_PStream * pst)
113
for (t = 0; t < pst->length; t++) {
114
/* update wuw[t][0] from the preceding rows (at most width - 1 of them) */
for (i = 1; (i < pst->width) && (t >= i); i++)
115
pst->sm.wuw[t][0] -= pst->sm.wuw[t - i][i] *
116
pst->sm.wuw[t - i][i] * pst->sm.wuw[t - i][0];
118
/* update the remaining columns of row t */
for (i = 1; i < pst->width; i++) {
119
for (j = 1; (i + j < pst->width) && (t >= j); j++)
120
pst->sm.wuw[t][i] -= pst->sm.wuw[t - j][j] *
121
pst->sm.wuw[t - j][i + j] * pst->sm.wuw[t - j][0];
122
/* scale by the entry in column 0 of the same row */
pst->sm.wuw[t][i] /= pst->sm.wuw[t][0];
127
/* HTS_PStream_forward_substitution: forward substitution for mlpg
 * Fills pst->sm.g in increasing frame order: g[t] starts as wum[t] and is
 * reduced by wuw[t - i][i] * g[t - i] for the preceding frames. */
128
static void HTS_PStream_forward_substitution(HTS_PStream * pst)
132
for (t = 0; t < pst->length; t++) {
133
pst->sm.g[t] = pst->sm.wum[t];
134
/* only frames t - i that exist (t >= i) contribute */
for (i = 1; (i < pst->width) && (t >= i); i++)
135
pst->sm.g[t] -= pst->sm.wuw[t - i][i] * pst->sm.g[t - i];
139
/* HTS_PStream_backward_substitution: backward substitution for mlpg
 * pst: parameter stream; reads sm.g and sm.wuw, writes par[t][m]
 * m:   index of the static feature dimension being generated
 * Frames are processed from the last one to the first. */
140
static void HTS_PStream_backward_substitution(HTS_PStream * pst, const int m)
144
for (t = pst->length - 1; t >= 0; t--) {
145
pst->par[t][m] = pst->sm.g[t] / pst->sm.wuw[t][0];
146
/* subtract contributions of the already computed following frames */
for (i = 1; (i < pst->width) && (t + i < pst->length); i++)
147
pst->par[t][m] -= pst->sm.wuw[t][i] * pst->par[t + i][m];
151
/* HTS_PStream_calc_gv: subfunction for mlpg using GV
 * Accumulates pst->par[t][m] into *mean and the squared deviation from
 * *mean into *vari, using only frames whose gv_switch entry is set; both
 * sums are divided by pst->gv_length.
 * NOTE(review): the rest of the parameter list and the initialisation of
 * *mean and *vari are not visible in this view. */
152
static void HTS_PStream_calc_gv(HTS_PStream * pst, const int m, double *mean,
158
/* first pass: sum of the selected frames */
for (t = 0; t < pst->length; t++)
159
if (pst->gv_switch[t])
160
*mean += pst->par[t][m];
161
*mean /= pst->gv_length;
163
/* second pass: sum of squared deviations from *mean */
for (t = 0; t < pst->length; t++)
164
if (pst->gv_switch[t])
165
*vari += (pst->par[t][m] - *mean) * (pst->par[t][m] - *mean);
166
*vari /= pst->gv_length;
169
/* HTS_PStream_conv_gv: subfunction for mlpg using GV
 * Rescales pst->par[t][m] around the mean returned by HTS_PStream_calc_gv
 * by ratio = sqrt(gv_mean[m] / vari), for frames whose gv_switch is set. */
170
static void HTS_PStream_conv_gv(HTS_PStream * pst, const int m)
177
HTS_PStream_calc_gv(pst, m, &mean, &vari);
178
/* NOTE(review): no guard on vari is visible before this division. */
ratio = sqrt(pst->gv_mean[m] / vari);
179
for (t = 0; t < pst->length; t++)
180
if (pst->gv_switch[t])
181
pst->par[t][m] = ratio * (pst->par[t][m] - mean) + mean;
184
/* HTS_PStream_calc_derivative: subfunction for mlpg using GV
 * pst: parameter stream; pst->sm.g is overwritten with the per-frame
 *      update values
 * m:   index of the static feature dimension
 * Returns -(hmmobj + gvobj), the negated sum of the two objective terms
 * accumulated below.
 * NOTE(review): local declarations and several statements of this function
 * are not visible in this view. */
185
static double HTS_PStream_calc_derivative(HTS_PStream * pst, const int m)
194
/* weight applied together with W1 to the hmmobj terms */
const double w = 1.0 / (pst->win_size * pst->length);
196
HTS_PStream_calc_gv(pst, m, &mean, &vari);
197
gvobj = -0.5 * W2 * vari * pst->gv_vari[m] * (vari - 2.0 * pst->gv_mean[m]);
198
dv = -2.0 * pst->gv_vari[m] * (vari - pst->gv_mean[m]) / pst->length;
200
/* g[t] = sum of band entries of wuw times the current par column m */
for (t = 0; t < pst->length; t++) {
201
pst->sm.g[t] = pst->sm.wuw[t][0] * pst->par[t][m];
202
for (i = 1; i < pst->width; i++) {
203
if (t + i < pst->length)
204
pst->sm.g[t] += pst->sm.wuw[t][i] * pst->par[t + i][m];
206
/* NOTE(review): the condition guarding this t - i access is not visible
 * in this view. */
pst->sm.g[t] += pst->sm.wuw[t - i][i] * pst->par[t - i][m];
210
for (t = 0, hmmobj = 0.0; t < pst->length; t++) {
211
hmmobj += W1 * w * pst->par[t][m] * (pst->sm.wum[t] - 0.5 * pst->sm.g[t]);
212
/* h uses wuw[t][1 - 1], i.e. column 0 of row t; the end of this
 * expression is not visible in this view */
h = -W1 * w * pst->sm.wuw[t][1 - 1]
213
- W2 * 2.0 / (pst->length * pst->length) *
214
((pst->length - 1) * pst->gv_vari[m] * (vari - pst->gv_mean[m])
215
+ 2.0 * pst->gv_vari[m] * (pst->par[t][m] - mean) * (pst->par[t][m] -
217
/* frames with gv_switch set include the W2 * dv term; the second
 * assignment below omits it */
if (pst->gv_switch[t])
219
1.0 / h * (W1 * w * (-pst->sm.g[t] + pst->sm.wum[t]) +
220
W2 * dv * (pst->par[t][m] - mean));
222
pst->sm.g[t] = 1.0 / h * (W1 * w * (-pst->sm.g[t] + pst->sm.wum[t]));
225
return (-(hmmobj + gvobj));
228
/* HTS_PStream_gv_parmgen: function for mlpg using GV
 * pst: parameter stream whose par[t][m] column is refined
 * m:   index of the static feature dimension
 * Calls HTS_PStream_conv_gv, then runs up to GV_MAX_ITERATION rounds in
 * which HTS_PStream_calc_derivative fills pst->sm.g and par[t][m] is moved
 * by step * g[t].
 * NOTE(review): the statement guarded by the gv_length test and the code
 * that uses obj, prev and step between iterations are not visible in this
 * view. */
229
static void HTS_PStream_gv_parmgen(HTS_PStream * pst, const int m)
232
double step = STEPINIT;
233
double prev = -LZERO;
236
if (pst->gv_length == 0)
239
HTS_PStream_conv_gv(pst, m);
240
if (GV_MAX_ITERATION > 0) {
241
/* rebuild wuw and wum before iterating */
HTS_PStream_calc_wuw_and_wum(pst, m);
242
for (i = 1; i <= GV_MAX_ITERATION; i++) {
243
obj = HTS_PStream_calc_derivative(pst, m);
248
/* apply the update stored in sm.g to every frame */
for (t = 0; t < pst->length; t++)
249
pst->par[t][m] += step * pst->sm.g[t];
255
/* HTS_PStream_mlpg: generate sequence of speech parameter vector maximizing its output probability for given pdf sequence
 * For every static dimension m the four solver steps below are run in
 * order; HTS_PStream_gv_parmgen is additionally called when
 * pst->gv_length > 0.
 * NOTE(review): the statement guarded by the length == 0 test is not
 * visible in this view. */
256
static void HTS_PStream_mlpg(HTS_PStream * pst)
260
if (pst->length == 0)
263
for (m = 0; m < pst->static_length; m++) {
264
HTS_PStream_calc_wuw_and_wum(pst, m);
265
HTS_PStream_ldl_factorization(pst); /* LDL factorization */
266
HTS_PStream_forward_substitution(pst); /* forward substitution */
267
HTS_PStream_backward_substitution(pst, m); /* backward substitution */
268
if (pst->gv_length > 0)
269
HTS_PStream_gv_parmgen(pst, m);
273
/* HTS_PStreamSet_initialize: initialize parameter stream set
 * Sets pss->total_frame to 0.
 * NOTE(review): any other field resets are not visible in this view. */
274
void HTS_PStreamSet_initialize(HTS_PStreamSet * pss)
278
pss->total_frame = 0;
281
/* HTS_PStreamSet_create: parameter generation using GV weight
 * pss:           parameter stream set to fill
 * sss:           state stream set queried through the HTS_SStreamSet_* calls
 * msd_threshold: per-stream threshold compared with HTS_SStreamSet_get_msd
 * gv_weight:     per-stream factor multiplied into the GV mean
 * For each stream the function sizes and allocates the working buffers,
 * copies the dynamic windows, optionally sets up GV data, copies the pdfs
 * (mean and inverse variance) and finally calls HTS_PStream_mlpg.
 * NOTE(review): many lines of this function (declarations, braces, else
 * branches, some assignment targets) are not visible in this view. */
282
void HTS_PStreamSet_create(HTS_PStreamSet * pss, HTS_SStreamSet * sss,
283
double *msd_threshold, double *gv_weight)
286
int frame, msd_frame, state;
289
HTS_Boolean not_bound;
292
/* NOTE(review): the condition that triggers this error is not visible. */
HTS_error(1, "HTS_PstreamSet_create: HTS_PStreamSet should be clear.\n");
295
pss->nstream = HTS_SStreamSet_get_nstream(sss);
296
pss->pstream = (HTS_PStream *) HTS_calloc(pss->nstream, sizeof(HTS_PStream));
297
pss->total_frame = HTS_SStreamSet_get_total_frame(sss);
300
for (i = 0; i < pss->nstream; i++) {
301
pst = &pss->pstream[i];
302
if (HTS_SStreamSet_is_msd(sss, i)) { /* for MSD */
304
/* length = total duration of states whose MSD value exceeds the threshold */
for (state = 0; state < HTS_SStreamSet_get_total_state(sss); state++)
305
if (HTS_SStreamSet_get_msd(sss, i, state) > msd_threshold[i])
306
pst->length += HTS_SStreamSet_get_duration(sss, state);
308
/* NOTE(review): the assignment target of this allocation is not visible;
 * presumably pst->msd_flag - confirm. */
(HTS_Boolean *) HTS_calloc(pss->total_frame, sizeof(HTS_Boolean));
309
/* mark each frame TRUE or FALSE according to its state's MSD value */
for (state = 0, frame = 0; state < HTS_SStreamSet_get_total_state(sss);
311
if (HTS_SStreamSet_get_msd(sss, i, state) > msd_threshold[i])
312
for (j = 0; j < HTS_SStreamSet_get_duration(sss, state); j++) {
313
pst->msd_flag[frame] = TRUE;
316
for (j = 0; j < HTS_SStreamSet_get_duration(sss, state); j++) {
317
pst->msd_flag[frame] = FALSE;
320
} else { /* for non MSD */
321
pst->length = pss->total_frame;
322
pst->msd_flag = NULL;
324
pst->vector_length = HTS_SStreamSet_get_vector_length(sss, i);
325
pst->width = HTS_SStreamSet_get_window_max_width(sss, i) * 2 + 1; /* band width of R */
326
pst->win_size = HTS_SStreamSet_get_window_size(sss, i);
327
/* the vector holds win_size groups of static_length values */
pst->static_length = pst->vector_length / pst->win_size;
328
pst->sm.mean = HTS_alloc_matrix(pst->length, pst->vector_length);
329
pst->sm.ivar = HTS_alloc_matrix(pst->length, pst->vector_length);
330
pst->sm.wum = (double *) HTS_calloc(pst->length, sizeof(double));
331
pst->sm.wuw = HTS_alloc_matrix(pst->length, pst->width);
332
pst->sm.g = (double *) HTS_calloc(pst->length, sizeof(double));
333
pst->par = HTS_alloc_matrix(pst->length, pst->static_length);
334
/* copy dynamic window */
335
pst->win_l_width = (int *) HTS_calloc(pst->win_size, sizeof(int));
336
pst->win_r_width = (int *) HTS_calloc(pst->win_size, sizeof(int));
337
/* NOTE(review): the element size is sizeof(double) although the result is
 * used as an array of double pointers - confirm this is intended. */
pst->win_coefficient =
338
(double **) HTS_calloc(pst->win_size, sizeof(double));
339
for (j = 0; j < pst->win_size; j++) {
340
pst->win_l_width[j] = HTS_SStreamSet_get_window_left_width(sss, i, j);
341
pst->win_r_width[j] = HTS_SStreamSet_get_window_right_width(sss, i, j);
342
/* NOTE(review): two allocation sizes are used depending on this test; the
 * else keyword between them is not visible in this view. */
if (pst->win_l_width[j] + pst->win_r_width[j] == 0)
343
pst->win_coefficient[j] = (double *)
344
HTS_calloc(-2 * pst->win_l_width[j] + 1, sizeof(double));
346
pst->win_coefficient[j] = (double *)
347
HTS_calloc(-2 * pst->win_l_width[j], sizeof(double));
348
/* shift the pointer so it can be indexed from win_l_width[j] to
 * win_r_width[j], as the copy loop below does */
pst->win_coefficient[j] -= pst->win_l_width[j];
349
for (k = pst->win_l_width[j]; k <= pst->win_r_width[j]; k++)
350
pst->win_coefficient[j][k] =
351
HTS_SStreamSet_get_window_coefficient(sss, i, j, k);
354
/* GV setup; NOTE(review): the targets of the next two allocations and of
 * the weighted GV mean are not visible in this view */
if (HTS_SStreamSet_use_gv(sss, i)) {
356
(double *) HTS_calloc(pst->static_length, sizeof(double));
358
(double *) HTS_calloc(pst->static_length, sizeof(double));
359
for (j = 0; j < pst->static_length; j++) {
361
HTS_SStreamSet_get_gv_mean(sss, i, j) * gv_weight[i];
362
pst->gv_vari[j] = HTS_SStreamSet_get_gv_vari(sss, i, j);
365
(HTS_Boolean *) HTS_calloc(pst->length, sizeof(HTS_Boolean));
366
if (HTS_SStreamSet_is_msd(sss, i)) { /* for MSD */
367
/* gv_switch is indexed by msd_frame, which advances only for flagged frames */
for (state = 0, frame = 0, msd_frame = 0;
368
state < HTS_SStreamSet_get_total_state(sss); state++)
369
for (j = 0; j < HTS_SStreamSet_get_duration(sss, state);
371
if (pst->msd_flag[frame])
372
pst->gv_switch[msd_frame++] =
373
HTS_SStreamSet_get_gv_switch(sss, i, state);
374
} else { /* for non MSD */
375
for (state = 0, frame = 0;
376
state < HTS_SStreamSet_get_total_state(sss); state++)
377
for (j = 0; j < HTS_SStreamSet_get_duration(sss, state); j++)
378
pst->gv_switch[frame++] =
379
HTS_SStreamSet_get_gv_switch(sss, i, state);
381
/* scan gv_switch with gv_length reset to 0; the statement executed for set
 * entries is not visible in this view */
for (j = 0, pst->gv_length = 0; j < pst->length; j++)
382
if (pst->gv_switch[j])
385
pst->gv_switch = NULL;
391
/* copy pdfs (mean and inverse variance) into the stream matrices */
if (HTS_SStreamSet_is_msd(sss, i)) { /* for MSD */
392
for (state = 0, frame = 0, msd_frame = 0;
393
state < HTS_SStreamSet_get_total_state(sss); state++)
394
for (j = 0; j < HTS_SStreamSet_get_duration(sss, state); j++) {
395
if (pst->msd_flag[frame]) {
396
/* check current frame is MSD boundary or not */
397
for (k = 0; k < pst->win_size; k++) {
399
for (l = pst->win_l_width[k]; l <= pst->win_r_width[k];
401
/* a window reaching outside the sequence or onto an unflagged frame */
if (frame + l < 0 || pss->total_frame <= frame + l
402
|| !pst->msd_flag[frame + l]) {
406
for (l = 0; l < pst->static_length; l++) {
407
m = pst->static_length * k + l;
408
pst->sm.mean[msd_frame][m] =
409
HTS_SStreamSet_get_mean(sss, i, state, m);
410
/* inverse variance via HTS_finv when not_bound is set or for window 0;
 * the 0.0 assignment below is presumably the else branch - confirm */
if (not_bound || k == 0)
411
pst->sm.ivar[msd_frame][m] =
412
HTS_finv(HTS_SStreamSet_get_vari
415
pst->sm.ivar[msd_frame][m] = 0.0;
422
} else { /* for non MSD */
423
for (state = 0, frame = 0;
424
state < HTS_SStreamSet_get_total_state(sss); state++) {
425
for (j = 0; j < HTS_SStreamSet_get_duration(sss, state); j++) {
426
for (k = 0; k < pst->win_size; k++) {
428
for (l = pst->win_l_width[k]; l <= pst->win_r_width[k]; l++)
429
/* a window reaching outside the sequence */
if (frame + l < 0 || pss->total_frame <= frame + l) {
433
for (l = 0; l < pst->static_length; l++) {
434
m = pst->static_length * k + l;
435
pst->sm.mean[frame][m] =
436
HTS_SStreamSet_get_mean(sss, i, state, m);
437
if (not_bound || k == 0)
438
pst->sm.ivar[frame][m] =
439
HTS_finv(HTS_SStreamSet_get_vari(sss, i, state, m));
441
pst->sm.ivar[frame][m] = 0.0;
448
/* parameter generation */
449
HTS_PStream_mlpg(pst);
453
/* HTS_PStreamSet_get_nstream: get number of stream
 * NOTE(review): the function body is not visible in this view. */
454
int HTS_PStreamSet_get_nstream(HTS_PStreamSet * pss)
459
/* HTS_PStreamSet_get_static_length: get static features length
 * Returns the static_length field of stream stream_index; the index is
 * used without a range check. */
460
int HTS_PStreamSet_get_static_length(HTS_PStreamSet * pss, int stream_index)
462
return pss->pstream[stream_index].static_length;
465
/* HTS_PStreamSet_get_total_frame: get total number of frame
 * Returns pss->total_frame. */
466
int HTS_PStreamSet_get_total_frame(HTS_PStreamSet * pss)
468
return pss->total_frame;
471
/* HTS_PStreamSet_get_parameter: get parameter
 * Returns par[frame_index][vector_index] of stream stream_index; no index
 * is range-checked here.
 * NOTE(review): the declaration of the vector_index parameter is not
 * visible in this view. */
472
double HTS_PStreamSet_get_parameter(HTS_PStreamSet * pss,
473
int stream_index, int frame_index,
476
return pss->pstream[stream_index].par[frame_index][vector_index];
479
/* HTS_PStreamSet_get_parameter_vector: get parameter vector
 * Returns the row pointer par[frame_index] of stream stream_index; the
 * pointer refers to the stream's own storage, not to a copy. */
480
double *HTS_PStreamSet_get_parameter_vector(HTS_PStreamSet * pss,
481
int stream_index, int frame_index)
483
return pss->pstream[stream_index].par[frame_index];
486
/* HTS_PStreamSet_get_msd_flag: get generated MSD flag per frame
 * Returns msd_flag[frame_index] of stream stream_index. msd_flag is
 * dereferenced without a NULL check, and HTS_PStreamSet_create sets it to
 * NULL for non-MSD streams. */
487
HTS_Boolean HTS_PStreamSet_get_msd_flag(HTS_PStreamSet * pss,
488
int stream_index, int frame_index)
490
return pss->pstream[stream_index].msd_flag[frame_index];
493
/* HTS_PStreamSet_is_msd: get MSD flag
 * Returns TRUE when the stream's msd_flag pointer is non-NULL, FALSE
 * otherwise. */
494
HTS_Boolean HTS_PStreamSet_is_msd(HTS_PStreamSet * pss, int stream_index)
496
return pss->pstream[stream_index].msd_flag ? TRUE : FALSE;
499
/* HTS_PStreamSet_clear: free parameter stream set
 * Releases the buffers of every stream, frees the stream array itself and
 * then calls HTS_PStreamSet_initialize on pss.
 * NOTE(review): some lines of this function (declarations, braces, any
 * guard around the loop) are not visible in this view. */
500
void HTS_PStreamSet_clear(HTS_PStreamSet * pss)
503
HTS_PStream *pstream;
506
for (i = 0; i < pss->nstream; i++) {
507
pstream = &pss->pstream[i];
508
HTS_free(pstream->sm.wum);
509
HTS_free(pstream->sm.g);
510
HTS_free_matrix(pstream->sm.wuw, pstream->length);
511
HTS_free_matrix(pstream->sm.ivar, pstream->length);
512
HTS_free_matrix(pstream->sm.mean, pstream->length);
513
HTS_free_matrix(pstream->par, pstream->length);
514
/* msd_flag is NULL for non-MSD streams */
if (pstream->msd_flag)
515
HTS_free(pstream->msd_flag);
516
for (j = pstream->win_size - 1; j >= 0; j--) {
517
/* move the pointer back by win_l_width[j] before freeing; it was shifted
 * by -= win_l_width[j] in HTS_PStreamSet_create */
pstream->win_coefficient[j] += pstream->win_l_width[j];
518
HTS_free(pstream->win_coefficient[j]);
520
if (pstream->gv_mean)
521
HTS_free(pstream->gv_mean);
522
if (pstream->gv_vari)
523
HTS_free(pstream->gv_vari);
524
HTS_free(pstream->win_coefficient);
525
HTS_free(pstream->win_l_width);
526
HTS_free(pstream->win_r_width);
527
if (pstream->gv_switch)
528
HTS_free(pstream->gv_switch);
530
HTS_free(pss->pstream);
532
/* reset pss through the initializer */
HTS_PStreamSet_initialize(pss);
537
#endif /* !HTS_PSTREAM_C */