This paper describes a speaker-independent/adaptive HMM-based speech synthesis system developed for the Blizzard Challenge 2007. The new system, named "HTS-2007", employs speaker adaptation (CSMAPLR+MAP), feature-space adaptive training, mixed-gender modeling, and full-covariance modeling using CSMAPLR transforms, in addition to several other techniques that have proved effective in our previous systems. Subjective evaluation results show that the new system generates significantly better quality synthetic speech than that of speaker-dependent approaches with realistic amounts of speech data, and that it bears comparison with speaker-dependent approaches even when large amounts of speech data are available.
%0 Conference Paper
%1 Yamagishi2008
%A Yamagishi, Junichi
%A Nose, Takashi
%A Zen, Heiga
%A Toda, Tomoki
%A Tokuda, Keiichi
%B Proceedings of the 2008 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)
%C Las Vegas, NV, USA
%D 2008
%K 2007;HTS-2007;feature-space Challenge Challenge;HMM;HTS;speaker HMM-based Markov adaptation;speaker-dependent adaptation;speech adaptive analysis;Speech analysis;hidden approaches;speaker-independent covariance data;Blizzard evaluation;speaker matrix;Hidden measurements;Context modeling;Covariance modeling;mixed-gender modeling;performance models;High models;speech speech superconductors;Loudspeakers;Robustness;Speech synthesis synthesis;Blizzard synthesis;Training system;Acoustic temperature training;full-covariance
%P 3957-3960
%R 10.1109/ICASSP.2008.4518520
%T Performance evaluation of the speaker-independent HMM-based speech synthesis system “HTS 2007” for the Blizzard Challenge 2007
%X This paper describes a speaker-independent/adaptive HMM-based speech synthesis system developed for the Blizzard Challenge 2007. The new system, named "HTS-2007", employs speaker adaptation (CSMAPLR+MAP), feature-space adaptive training, mixed-gender modeling, and full-covariance modeling using CSMAPLR transforms, in addition to several other techniques that have proved effective in our previous systems. Subjective evaluation results show that the new system generates significantly better quality synthetic speech than that of speaker-dependent approaches with realistic amounts of speech data, and that it bears comparison with speaker-dependent approaches even when large amounts of speech data are available.
@inproceedings{Yamagishi2008,
  abstract   = {This paper describes a speaker-independent/adaptive HMM-based speech synthesis system developed for the Blizzard Challenge 2007. The new system, named ``HTS-2007'', employs speaker adaptation (CSMAPLR+MAP), feature-space adaptive training, mixed-gender modeling, and full-covariance modeling using CSMAPLR transforms, in addition to several other techniques that have proved effective in our previous systems. Subjective evaluation results show that the new system generates significantly better quality synthetic speech than that of speaker-dependent approaches with realistic amounts of speech data, and that it bears comparison with speaker-dependent approaches even when large amounts of speech data are available.},
  added-at   = {2021-02-01T10:51:23.000+0100},
  address    = {Las Vegas, NV, USA},
  author     = {Yamagishi, Junichi and Nose, Takashi and Zen, Heiga and Toda, Tomoki and Tokuda, Keiichi},
  biburl     = {https://www.bibsonomy.org/bibtex/221165ec94a191d4cec628172129a94de/m-toman},
  booktitle  = {Proceedings of the 2008 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  doi        = {10.1109/ICASSP.2008.4518520},
  file       = {:pdfs/yamagishi_icassp_2008.pdf:PDF},
  interhash  = {a80455708d3830508f91c51e194880c4},
  intrahash  = {21165ec94a191d4cec628172129a94de},
  issn       = {1520-6149},
  keywords   = {2007;HTS-2007;feature-space Challenge Challenge;HMM;HTS;speaker HMM-based Markov adaptation;speaker-dependent adaptation;speech adaptive analysis;Speech analysis;hidden approaches;speaker-independent covariance data;Blizzard evaluation;speaker matrix;Hidden measurements;Context modeling;Covariance modeling;mixed-gender modeling;performance models;High models;speech speech superconductors;Loudspeakers;Robustness;Speech synthesis synthesis;Blizzard synthesis;Training system;Acoustic temperature training;full-covariance},
  month      = mar,
  owner      = {schabus},
  pages      = {3957--3960},
  timestamp  = {2021-02-01T10:51:23.000+0100},
  title      = {Performance evaluation of the speaker-independent {HMM}-based speech synthesis system {``HTS 2007''} for the {Blizzard Challenge} 2007},
  year       = {2008},
}