@article {saz16, title = {Acoustic Adaptation to Dynamic Background Conditions with Asynchronous Transformations}, journal = {Computer Speech and Language}, year = {2016}, doi = {10.1016/j.csl.2016.06.008}, author = {Oscar Saz and Thomas Hain} } @conference {380, title = {Automatic Genre and Show Identification of Broadcast Media}, booktitle = {Proceedings of the 17th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2016}, address = {San Francisco, California, USA}, author = {Mortaza Doulaty and Oscar Saz and Raymond W. M. Ng and Thomas Hain} } @conference {DeenaHDSH16, title = {Combining Feature and Model-Based Adaptation of RNNLMs for Multi-Genre Broadcast Speech Recognition}, booktitle = {Proceedings of the 17th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2016}, address = {San Francisco, California, USA}, author = {Salil Deena and Madina Hasan and Mortaza Doulaty and Oscar Saz and Thomas Hain} } @conference {ng_is16, title = {Combining weak tokenisers for phonotactic language recognition in a resource-constrained setting}, booktitle = {Proceedings of the 17th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2016}, address = {San Francisco, CA}, author = {Raymond W. M. Ng and Bhusan Chettri and Thomas Hain} } @conference {merritt2016hybrid, title = {Deep neural network-guided unit selection synthesis}, booktitle = {Proc. ICASSP}, year = {2016}, abstract = {Vocoding of speech is a standard part of statistical parametric speech synthesis systems. It imposes an upper bound of the naturalness that can possibly be achieved. Hybrid systems using parametric models to guide the selection of natural speech units can combine the benefits of robust statistical models with the high level of naturalness of waveform concatenation. Existing hybrid systems use Hidden Markov Models (HMMs) as the statistical model. This paper demonstrates that the superiority of Deep Neural Network (DNN) acoustic models over HMMs in conventional statistical parametric speech synthesis also carries over to hybrid synthesis. We compare various DNN and HMM hybrid configurations, guiding the selection of waveform units in either the vocoder parameter domain, or in the domain of embeddings (bottleneck features).}, url = {http://datashare.is.ed.ac.uk/bitstream/handle/10283/1889/Merritt_ICASSP2016.pdf}, author = {Merritt, Thomas and Clark, Robert A J and Wu, Zhizheng and Yamagishi, Junichi and King, Simon} } @article {7499870, title = {Differentiable Pooling for Unsupervised Acoustic Model Adaptation}, journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing}, volume = {PP}, number = {99}, year = {2016}, pages = {1-1}, issn = {2329-9290}, doi = {10.1109/TASLP.2016.2584700}, author = {P. Swietojanski and S. Renals} } @conference {Zhang:2015cd, title = {DNN Speaker Adaptation using Parameterised Sigmoid and ReLU Hidden Activation Functions}, booktitle = {Proc.
ICASSP{\textquoteright}16}, year = {2016}, address = {Shanghai, China}, url = {http://mi.eng.cam.ac.uk/~cz277/doc/Conference-ICASSP2016-ADAPT.pdf}, author = {Zhang, C. and Woodland, P.C.} } @conference {milner_is16, title = {DNN-based speaker clustering for speaker diarisation}, booktitle = {Proceedings of the 17th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2016}, address = {San Francisco, CA}, author = {Rosanna Milner and Thomas Hain} } @conference {olcoz_is16, title = {Error correction in lightly supervised alignment of broadcast subtitles}, booktitle = {Proceedings of the 17th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2016}, address = {San Francisco, CA}, author = {Julia Olcoz and Oscar Saz and Thomas Hain} } @conference {369, title = {Evaluating comprehension of natural and synthetic conversational speech}, booktitle = {Speech Prosody}, year = {2016}, month = {June}, publisher = {ISCA}, organization = {ISCA}, address = {Boston, MA}, abstract = {Current speech synthesis methods typically operate on isolated sentences and lack convincing prosody when generating longer segments of speech. Similarly, prevailing TTS evaluation paradigms, such as intelligibility (transcription word error rate) or MOS, only score sentences in isolation, even though overall comprehension is arguably more important for speech-based communication. In an effort to develop more ecologically-relevant evaluation techniques that go beyond isolated sentences, we investigated comprehension of natural and synthetic speech dialogues. Specifically, we tested listener comprehension on long segments of spontaneous and engaging conversational speech (three 10-minute radio interviews of comedians). Interviews were reproduced either as natural speech, synthesised from carefully prepared transcripts, or synthesised using durations from forced-alignment against the natural speech, all in a balanced design. Comprehension was measured using multiple choice questions. A significant difference was measured between the comprehension/retention of natural speech (74\% correct responses) and synthetic speech with forced-aligned durations (61\% correct responses). However, no significant difference was observed between natural and regular synthetic speech (70\% correct responses). Effective evaluation of comprehension remains elusive.}, keywords = {comprehension, conversational speech, evaluation, statistical parametric speech synthesis}, url = {http://www.isca-speech.org/archive/sp2016/pdfs_stamped/41.pdf}, author = {Wester, Mirjam and Watts, Oliver and Henter, Gustav Eje} } @conference {nicolao_lrec2016, title = {A framework for collecting realistic recordings of dysarthric speech - the homeService corpus}, booktitle = {The International Conference on Language Resources and Evaluation - LREC 2016}, year = {2016}, month = {05/2016}, publisher = {ELRA}, organization = {ELRA}, address = {Portoro{\v z}, SLO}, author = {Nicolao, Mauro and Christensen, Heidi and Cunningham, Stuart and Green, Phil and Hain, Thomas} } @conference {watts2016hmms, title = {From {HMM}s to {DNN}s: where do the improvements come from?}, booktitle = {Proc. 
ICASSP}, volume = {41}, year = {2016}, month = {March}, publisher = {IEEE}, organization = {IEEE}, address = {Shanghai, China}, abstract = {Deep neural networks (DNNs) have recently been the focus of much text-to-speech research as a replacement for decision trees and hidden Markov models (HMMs) in statistical parametric synthesis systems. Performance improvements have been reported; however, the configuration of systems evaluated makes it impossible to judge how much of the improvement is due to the new machine learning methods, and how much is due to other novel aspects of the systems. Specifically, whereas the decision trees in HMM-based systems typically operate at the state-level, and separate trees are used to handle separate acoustic streams, most DNN-based systems are trained to make predictions simultaneously for all streams at the level of the acoustic frame. This paper isolates the influence of three factors (machine learning method; state vs. frame predictions; separate vs. combined stream predictions) by building a continuum of systems along which only a single factor is varied at a time. We find that replacing decision trees with DNNs and moving from state-level to frame-level predictions both significantly improve listeners\&$\#$39; naturalness ratings of synthetic speech produced by the systems. No improvement is found to result from switching from separate-stream to combined-stream predictions.}, keywords = {decision tree, deep neural network, hidden Markov model, speech synthesis}, url = {http://homepages.inf.ed.ac.uk/ghenter/pubs/watts2016hmms.pdf}, author = {Watts, Oliver and Henter, Gustav Eje and Merritt, Thomas and Wu, Zhizheng and King, Simon} } @conference {ng_icassp16, title = {Groupwise learning for ASR k-best list reranking in spoken language translation}, booktitle = {Proceedings of the 2016 International Conference on Acoustic, Speech and Signal Processing (ICASSP)}, year = {2016}, address = {Shanghai, China}, author = {Raymond W. M. Ng and Kashif Shah and Lucia Specia and Thomas Hain} } @conference {371, title = {A hierarchical predictor of synthetic speech naturalness using neural networks}, booktitle = {Proc. Interspeech}, year = {2016}, month = {September}, publisher = {ISCA}, organization = {ISCA}, address = {San Francisco, CA}, abstract = {A problem when developing and tuning speech synthesis systems is that there is no well-established method of automatically rating the quality of the synthetic speech. This research attempts to obtain a new automated measure which is trained on the result of large-scale subjective evaluations employing many human listeners, i.e., the Blizzard Challenge. To exploit the data, we experiment with linear regression, feed-forward and convolutional neural network models, and combinations of them to regress from synthetic speech to the perceptual scores obtained from listeners. The biggest improvements were seen when combining stimulus- and system-level predictions.}, keywords = {Blizzard Challenge, naturalness, neural network, speech synthesis}, author = {Yoshimura, Takenori and Henter, Gustav Eje and Watts, Oliver and Wester, Mirjam and Yamagishi, Junichi and Tokuda, Keiichi} } @conference {wl519:2016ab, title = {Improved DNN-based Segmentation for Multi-genre Broadcast Audio}, booktitle = {Proc. ICASSP{\textquoteright}16}, year = {2016}, address = {Shanghai, China}, url = {http://mi.eng.cam.ac.uk/~cz277/doc/Conference-ICASSP2016-DNNSEG.pdf}, author = {Wang, L. and Zhang, C. and Woodland, P.C. and Gales, M.J.F. 
and Karanasou, P. and Lanchantin, P. and Liu, X. and Qian, Y.} } @conference {casanueva_is16, title = {Improving generalisation to new speakers in spoken dialogue state tracking}, booktitle = {Proceedings of the 17th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2016}, address = {San Francisco, CA}, author = {I{\~n}igo Casanueva and Thomas Hain and Phil Green} } @conference {qian16Integrated, title = {Integrated adaptation with multi-factor joint-learning for far-field speech recognition}, booktitle = {Proc. ICASSP{\textquoteright}16}, year = {2016}, address = {Shanghai, China}, author = {Yanmin Qian and Tian Tan and Dong Yu and Yu Zhang} } @conference {qian16An, title = {An investigation into using parallel data for far-field speech recognition}, booktitle = {Proc. ICASSP{\textquoteright}16}, year = {2016}, address = {Shanghai, China}, author = {Yanmin Qian and Tian Tan and Dong Yu} } @article {7462247, title = {Learning Hidden Unit Contributions for Unsupervised Acoustic Model Adaptation}, journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing}, volume = {24}, number = {8}, year = {2016}, month = {Aug}, pages = {1450-1463}, keywords = {acoustic signal processing, Acoustics, Adaptation, Adaptation models, AMI meetings, Aurora4, consistent word error rate reductions, deep neural networks (DNNs), DNN acoustic model, factorisation, feature extraction, Hidden Markov models, learning hidden unit contributions, learning hidden unit contributions (lHUC), LHUC, neural nets, neural network acoustic models, Neural networks, SAT, speaker adaptive training framework, speaker recognition, speaker-dependent manner, speaker-independent manner, speech recognition benchmarks, Switchboard, TED talks, Training, Transforms, unsupervised acoustic model adaptation, unsupervised learning}, issn = {2329-9290}, doi = {10.1109/TASLP.2016.2560534}, author = {P. Swietojanski and J. Li and S. Renals} } @conference {henter2016robust, title = {Robust {TTS} duration modelling using {DNN}s}, booktitle = {Proc. ICASSP}, volume = {41}, year = {2016}, month = {March}, publisher = {IEEE}, organization = {IEEE}, address = {Shanghai, China}, abstract = {Accurate modelling and prediction of speech-sound durations is an important component in generating more natural synthetic speech. Deep neural networks (DNNs) offer a powerful modelling paradigm, and large, found corpora of natural and expressive speech are easy to acquire for training them. Unfortunately, found datasets are seldom subject to the quality-control that traditional synthesis methods expect. Common issues likely to affect duration modelling include transcription errors, reductions, filled pauses, and forced-alignment inaccuracies. To combat this, we propose to improve modelling and prediction of speech durations using methods from robust statistics, which are able to disregard ill-fitting points in the training material. We describe a robust fitting criterion based on the density power divergence (the beta-divergence) and a robust generation heuristic using mixture density networks (MDNs).
Perceptual tests indicate that subjects prefer synthetic speech generated using robust models of duration over the baselines.}, keywords = {duration modelling, robust statistics, speech synthesis}, url = {http://homepages.inf.ed.ac.uk/ghenter/pubs/henter2016robust.pdf}, author = {Henter, Gustav Eje and Ronanki, Srikanth and Watts, Oliver and Wester, Mirjam and Wu, Zhizheng and King, Simon} } @conference {Swietojanski_ICASSP2016, title = {SAT-LHUC: Speaker Adaptive Training for Learning Hidden Unit Contributions}, booktitle = {Proc. IEEE ICASSP}, year = {2016}, month = {March}, address = {Shanghai, China}, abstract = {This paper extends learning hidden unit contributions (LHUC) unsupervised speaker adaptation with speaker adaptive training (SAT). Contrary to other SAT approaches, the proposed technique does not require speaker-dependent features, the generation of auxiliary generative models to estimate or extract speaker-dependent information, or any changes to the speaker-independent model structure. SAT-LHUC is directly integrated into the objective and jointly learns speaker-independent and speaker-dependent representations. We demonstrate that the SAT-LHUC technique can match feature-space regression transforms for matched narrow-band data and outperform it on wide-band data when the runtime distribution differs significantly from training one. We have obtained 6.5\%, 10\% and 18.5\% relative word error rate reductions compared to speaker-independent models on Switchboard, AMI meetings and TED lectures, respectively. This corresponds to relative gains of 2\%, 4\% and 6\% compared with non-SAT LHUC adaptation. SAT-LHUC was also found to be complementary to SAT with feature-space maximum likelihood linear regression transforms.}, url = {http://homepages.inf.ed.ac.uk/s1136550/data/Swietojanski_ICASSP2016.pdf}, author = {Swietojanski, P. and Renals, S.} } @conference {lu2016segmental, title = {Segmental Recurrent Neural Networks for End-to-end Speech Recognition}, booktitle = {Proc. INTERSPEECH}, year = {2016}, author = {Lu, Liang and Kong, Lingpeng and Dyer, Chris and Smith, Noah A and Renals, Steve} } @conference {milner_icassp16, title = {Segment-oriented evaluation of speaker diarisation performance}, booktitle = {Proceedings of the 2016 International Conference on Acoustic, Speech and Signal Processing (ICASSP)}, year = {2016}, address = {Shanghai, China}, author = {Rosanna Milner and Thomas Hain} } @conference {392, title = {Selection of Multi-genre Broadcast Data for the Training of Automatic Speech Recognition Systems}, booktitle = {Proc. ICASSP}, year = {2016}, author = {P. Lanchantin and M.J.F. Gales and P. Karanasou and X. Liu and Y. Qian and L. Wang and P.C. Woodland and C. Zhang} } @conference {ng_odyssey16, title = {{Sheffield {LRE} 2015 System Description}}, booktitle = {{Odyssey: The Speaker and Language Recognition Workshop (Submitted)}}, year = {2016}, month = {June}, author = {Raymond W. M. Ng and Mauro Nicolao and Oscar Saz and Madina Hasan and Bhusan Chettri and Mortaza Doulaty and Tan Lee and Thomas Hain} } @conference {Liu2016, title = {The Sheffield Wargame Corpus - Day Two and Day Three}, booktitle = {Proceedings of Interspeech 2016}, year = {2016}, month = {September}, address = {San Francisco, USA}, abstract = {Improving the performance of distant speech recognition is of considerable current interest, driven by a desire to bring speech recognition into people{\textquoteright}s homes. 
Standard approaches to this task aim to enhance the signal prior to recognition, typically using beamforming techniques on multiple channels. Only a few real-world recordings are available that allow experimentation with such techniques. This has become even more pertinent with recent works with deep neural networks aiming to learn beamforming from data. Such approaches require large multi-channel training sets, ideally with location annotation for moving speakers, which is scarce in existing corpora. This paper presents a freely available and new extended corpus of English speech recordings in a natural setting, with moving speakers. The data is recorded with diverse microphone arrays, and uniquely, with ground truth location tracking. It extends the 8.0 hour Sheffield Wargames Corpus released in Interspeech 2013, with a further 16.6 hours of fully annotated data, including 6.1 hours of female speech to improve gender bias. Additional blog-based language model data is provided alongside, as well as a Kaldi baseline system. Results are reported with a standard Kaldi configuration, and a baseline meeting recognition system. Index Terms: distant speech recognition, multi-channel speech recognition, natural speech corpora, deep neural network.}, pdf = {http://staffwww.dcs.shef.ac.uk/people/Y.Liu/publications/pdf/Liu2016.pdf}, author = {Yulan Liu and Charles Fox and Madina Hasan and Thomas Hain} } @conference {liang2016small, title = {Small-footprint Deep Neural Networks with Highway Connections for Speech Recognition}, booktitle = {Proc. INTERSPEECH}, year = {2016}, author = {Lu, Liang and Renals, Steve} } @conference {CassiaIS16, title = {Speech Enhancement for a Noise-Robust Text-to-Speech Synthesis System using Deep Recurrent Neural Networks}, booktitle = {Proc. Interspeech}, year = {2016}, month = {Sep.}, address = {San Francisco, US}, author = {Valentini-Botinhao, C. and Wang, X. and Takaki, S. and Yamagishi, J.} } @conference {jy308:2016ab, title = {System Combination with Log-linear Models}, booktitle = {Proc. ICASSP{\textquoteright}16}, year = {2016}, address = {Shanghai, China}, url = {http://mi.eng.cam.ac.uk/~cz277/doc/Conference-ICASSP2016-LONLIN.pdf}, author = {Yang, J. and Zhang, C. and Ragni, A. and Gales, M.J.F. and Woodland, P.C.} } @conference {370, title = {A template-based approach for speech synthesis intonation generation using LSTMs}, booktitle = {Proc. Interspeech}, year = {2016}, month = {September}, publisher = {ISCA}, organization = {ISCA}, address = {San Francisco, CA}, abstract = {The absence of convincing intonation makes current parametric speech synthesis systems sound dull and lifeless, even when trained on expressive speech data. Typically, these systems use regression techniques to predict the fundamental frequency (F0) frame-by-frame. This approach leads to overly-smooth pitch contours and fails to construct an appropriate prosodic structure across the full utterance. In order to capture and reproduce larger-scale pitch patterns, this paper proposes a template-based approach for automatic F0 generation, where per-syllable pitch-contour templates (from a small, automatically learned set) are predicted by a recurrent neural network (RNN). The use of syllable templates mitigates the over-smoothing problem and is able to reproduce pitch patterns observed in the data. The use of an RNN, paired with connectionist temporal classification (CTC), enables the prediction of structure in the pitch contour spanning the entire utterance.
This novel F0 prediction system is used alongside separate LSTMs for predicting phone durations and the other acoustic features, to construct a complete text-to-speech system. We report the results of objective and subjective tests on an expressive speech corpus of children{\textquoteright}s audiobooks, and include comparisons to a conventional baseline that predicts F0 directly at the frame level.}, keywords = {CTC, F0 templates, intonation modelling, LSTM, speech synthesis}, author = {Ronanki, Srikanth and Henter, Gustav Eje and Wu, Zhizheng and King, Simon} } @conference {dall2016testing, title = {Testing the consistency assumption: Pronunciation variant forced alignment in read and spontaneous speech synthesis}, booktitle = {Proc. ICASSP}, volume = {41}, year = {2016}, month = {March}, publisher = {IEEE}, organization = {IEEE}, address = {Shanghai, China}, abstract = {Forced alignment for speech synthesis traditionally aligns a phoneme sequence predetermined by the front-end text processing system. This sequence is not altered during alignment, i.e., it is forced, despite possibly being faulty. The consistency assumption is the assumption that these mistakes do not degrade models, as long as the mistakes are consistent across training and synthesis. We present evidence that in the alignment of both standard read prompts and spontaneous speech this phoneme sequence is often wrong, and that this is likely to have a negative impact on acoustic models. A lattice-based forced alignment system allowing for pronunciation variation is implemented, resulting in improved phoneme identity accuracy for both types of speech. A perceptual evaluation of HMM-based voices showed that spontaneous models trained on this improved alignment also improved standard synthesis, despite breaking the consistency assumption.}, keywords = {forced alignment, HMM, speech synthesis, TTS}, url = {http://homepages.inf.ed.ac.uk/ghenter/pubs/dall2016testing.pdf}, author = {Dall, Rasmus and Brognaux, Sandrine and Richmond, Korin and Valentini-Botinhao, Cassia and Henter, Gustav Eje and Hirschberg, Julia and Yamagishi, Junichi and King, Simon} } @conference {lu2016training, title = {On training the recurrent neural network encoder-decoder for large vocabulary end-to-end speech recognition}, booktitle = {Proc. ICASSP}, year = {2016}, publisher = {IEEE}, organization = {IEEE}, author = {Lu, Liang and Zhang, Xingxing and Renals, Steve} } @conference {hain_is16, title = {webASR 2 - Improved cloud based speech technology}, booktitle = {Proceedings of the 17th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2016}, address = {San Francisco, CA}, author = {Thomas Hain and Jeremy Christian and Oscar Saz and Salil Deena and Madina Hasan and Raymond W. M.
Ng and Rosanna Milner and Mortaza Doulaty and Yulan Liu} } @conference {milner_ASRU2015, title = {{The 2015 Sheffield System for Longitudinal Diarisation of Broadcast Media}}, booktitle = {{Proceedings of the 2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)}}, year = {2015}, address = {Scottsdale, AZ}, author = {Rosanna Milner and Oscar Saz and Salil Deena and Mortaza Doulaty and Raymond Ng and Thomas Hain} } @conference {Saz_ASRU, title = {{The 2015 Sheffield System for Transcription of Multi{\textendash}Genre Broadcast Media}}, booktitle = {{Proceedings of the 2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)}}, year = {2015}, address = {Scottsdale, AZ}, author = {Oscar Saz and Mortaza Doulaty and Salil Deena and Rosanna Milner and Raymond Ng and Madina Hasan and Yulan Liu and Thomas Hain} } @conference {wester:listeners:IS2015, title = {Are we using enough listeners? {No! An empirically-supported critique of Interspeech 2014 TTS evaluations}}, booktitle = {Proc. of Interspeech}, year = {2015}, month = {September}, address = {Dresden}, abstract = {Tallying the numbers of listeners that took part in subjective evaluations of synthetic speech at Interspeech 2014 showed that in more than 60\% of papers conclusions are based on listening tests with less than 20 listeners. Our analysis of Blizzard 2013 data shows that for a MOS test measuring naturalness a stable level of significance is only reached when more than 30 listeners are used. In this paper, we set out a list of guidelines, i.e., a checklist for carrying out meaningful subjective evaluations. We further illustrate the importance of sentence coverage and number of listeners by presenting changes to rank order and number of significant pairs by re-analysing data from the Blizzard Challenge 2013.}, url = {http://homepages.inf.ed.ac.uk/mwester/publications/wester:listeners:IS2015.pdf}, author = {Wester, Mirjam and Valentini-Botinhao, Cassia and Henter, Gustav Eje} } @conference {wester:artificial:IS2015, title = {Artificial Personality and Disfluency}, booktitle = {Proc. of Interspeech}, year = {2015}, month = {September}, address = {Dresden}, abstract = {The focus of this paper is artificial voices with different personalities. Previous studies have shown links between an individual\&$\#$39;s use of disfluencies in their speech and their perceived personality. Here, filled pauses (uh and um) and discourse markers (like, you know, I mean) have been included in synthetic speech as a way of creating an artificial voice with different personalities. We discuss the automatic insertion of filled pauses and discourse markers (i.e., fillers) into otherwise fluent texts. The automatic system is compared to a ground truth of human \"acted\" filler insertion. Perceived personality (as defined by the big five personality dimensions) of the synthetic speech is assessed by means of a standardised questionnaire. Synthesis without fillers is compared to synthesis with either spontaneous or synthetic fillers. 
Our findings explore how the inclusion of disfluencies influences the way in which subjects rate the perceived personality of an artificial voice.}, url = {http://homepages.inf.ed.ac.uk/mwester/publications/wester:artificial:IS2015.pdf}, author = {Wester, Mirjam and Aylett, Matthew and Tomalin, Marcus and Dall, Rasmus} } @conference {Merritt2015Attributing, title = {Attributing modelling errors in HMM synthesis by stepping gradually from natural to modelled speech}, booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, year = {2015}, address = {Brisbane}, abstract = {Even the best statistical parametric speech synthesis systems do not achieve the naturalness of good unit selection. We investigated possible causes of this. By constructing speech signals that lie inbetween natural speech and the output from a complete HMM synthesis system, we investigated various effects of modelling. We manipulated the temporal smoothness and the variance of the spectral parameters to create stimuli, then presented these to listeners alongside natural and vocoded speech, as well as output from a full HMM-based text-to-speech system and from an idealised {\textquoteleft}pseudo-HMM\&$\#$39;. All speech signals, except the natural waveform, were created using vocoders employing one of two popular spectral parameterisations: Mel-Cepstra or Mel-Line Spectral Pairs. Listeners made {\textquoteleft}same or different\&$\#$39; pairwise judgements, from which we generated a perceptual map using Multidimensional Scaling. We draw conclusions about which aspects of HMM synthesis are limiting the naturalness of the synthetic speech.}, author = {Merritt, Thomas and Latorre, Javier and King, Simon} } @conference {Woo15, title = {{C}ambridge {U}niversity {T}ranscription {S}ystems for the {M}ulti-{G}enre {B}roadcast {C}hallenge}, booktitle = {Proc. of ASRU}, year = {2015}, address = {Scottsdale, USA}, url = {http://mi.eng.cam.ac.uk/~xl207/publications/conferences/ASRU2015-mgbsys-asr.pdf}, author = {P.C. Woodland and X. Liu and Y. Qian and C. Zhang and M.J.F. Gales and P. Karanasou and P. Lanchantin and L. Wang} } @conference {bell15_complementary_task_mt, title = {Complementary tasks for context-dependent deep neural network acoustic models}, booktitle = {Proc. Interspeech}, year = {2015}, url = {http://www.cstr.ed.ac.uk/downloads/publications/2015/bell15_complementary_task_mt.pdf}, author = {Bell, Peter and Renals, Steve} } @conference {doulaty15, title = {Data-selective Transfer Learning for Multi-Domain Speech Recognition}, booktitle = {Proceedings of the 16th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2015}, address = {Dresden, Germany}, url = {http://www.isca-speech.org/archive/interspeech_2015/papers/i15_2897.pdf}, author = {M. Doulaty and Oscar Saz and Thomas Hain} } @article {7169536, title = {{A Deep Generative Architecture for Postfiltering in Statistical Parametric Speech Synthesis}}, journal = {Audio, Speech, and Language Processing, IEEE/ACM Transactions on}, volume = {23}, number = {11}, year = {2015}, pages = {2003-2014}, abstract = {The generated speech of hidden Markov model (HMM)-based statistical parametric speech synthesis still sounds muffled. One cause of this degradation in speech quality may be the loss of fine spectral structures. In this paper, we propose to use a deep generative architecture, a deep neural network (DNN) generatively trained, as a postfilter. 
The network models the conditional probability of the spectrum of natural speech given that of synthetic speech to compensate for such gap between synthetic and natural speech. The proposed probabilistic postfilter is generatively trained by cascading two restricted Boltzmann machines (RBMs) or deep belief networks (DBNs) with one bidirectional associative memory (BAM). We devised two types of DNN postfilters: one operating in the mel-cepstral domain and the other in the higher dimensional spectral domain. We compare these two new data-driven postfilters with other types of postfilters that are currently used in speech synthesis: a fixed mel-cepstral based postfilter, the global variance based parameter generation, and the modulation spectrum-based enhancement. Subjective evaluations using the synthetic voices of a male and female speaker confirmed that the proposed DNN-based postfilter in the spectral domain significantly improved the segmental quality of synthetic speech compared to that with conventional methods.}, keywords = {deep generative architecture, HMM, modulation spectrum, postfilter, segmental quality, speech synthesis}, issn = {2329-9290}, doi = {10.1109/TASLP.2015.2461448}, author = {Chen, Ling-Hui and Raitio, T. and Valentini-Botinhao, C. and Ling, Z. and Yamagishi, J.} } @conference {merritt2015richcontext, title = {Deep neural network context embeddings for model selection in rich-context HMM synthesis}, booktitle = {Proc. Interspeech}, year = {2015}, month = {September}, pages = {2207{\textendash}2211}, address = {Dresden, Germany}, abstract = {This paper introduces a novel form of parametric synthesis that uses context embeddings produced by the bottleneck layer of a deep neural network to guide the selection of models in a rich-context HMM-based synthesiser. Rich-context synthesis {\textendash} in which Gaussian distributions estimated from single linguistic contexts seen in the training data are used for synthesis, rather than more conventional decision tree-tied models {\textendash} was originally proposed to address over-smoothing due to averaging across contexts. Our previous investigations have confirmed experimentally that averaging across different contexts is indeed one of the largest factors contributing to the limited quality of statistical parametric speech synthesis. However, a possible weakness of the rich context approach as previously formulated is that a conventional tied model is still used to guide selection of Gaussians at synthesis time. Our proposed approach replaces this with context embeddings derived from a neural network.}, url = {http://www.research.ed.ac.uk/portal/files/19840195/Merritt_Interspeech2015.pdf}, author = {Merritt, Thomas and Yamagishi, Junichi and Wu, Zhizheng and Watts, Oliver and King, Simon} } @conference {WuICASSP15, title = {Deep neural networks employing multi-task learning and stacked bottleneck features for speech synthesis}, booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, year = {2015}, abstract = {Deep neural networks (DNNs) use a cascade of hidden representations to enable the learning of complex mappings from input to output features. They are able to learn the complex mapping from text-based linguistic features to speech acoustic features, and so perform text-to-speech synthesis. Recent results suggest that DNNs can produce more natural synthetic speech than conventional HMM-based statistical parametric systems. 
In this paper, we show that the hidden representation used within a DNN can be improved through the use of Multi-Task Learning, and that stacking multiple frames of hidden layer activations (stacked bottleneck features) also leads to improvements. Experimental results confirmed the effectiveness of the proposed methods, and in listening tests we find that stacked bottleneck features in particular offer a significant improvement over both a baseline DNN and a benchmark HMM system}, url = {http://www.zhizheng.org/papers/icassp2015_dnn_tts.pdf}, author = {Zhizheng Wu and Cassia Valentini-Botinhao and Oliver Watts and Simon King} } @conference {SwietojanskiICASSP15, title = {Differentiable Pooling for Unsupervised Speaker Adaptation}, booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, year = {2015}, abstract = {This paper proposes a differentiable pooling mechanism to perform model-based neural network speaker adaptation. The proposed technique learns a speaker-dependent combination of activations within pools of hidden units, was shown to work well unsupervised, and does not require speaker-adaptive training. We have conducted a set of experiments on the TED talks data, as used in the IWSLT evaluations. Our results indicate that the approach can reduce word error rates (WERs) on standard IWSLT test sets by about 5{\textendash}11\% relative compared to speaker-independent systems and was found complementary to the recently proposed learning hidden units contribution (LHUC) approach, reducing WER by 6{\textendash}13\% relative. Both methods were also found to work well when adapting with small amounts of unsupervised data {\textendash} 10 seconds is able to decrease the WER by 5\% relative compared to the baseline speaker independent system}, url = {http://homepages.inf.ed.ac.uk/s1136550/data/Swietojanski_ICASSP2015.pdf}, author = {Swietojanski, P. and Renals, S.} } @conference {dall:diss2015, title = {Disfluencies in change detection in natural, vocoded and synthetic speech}, booktitle = {Proc. of DiSS 2015}, year = {2015}, month = {August}, address = {Edinburgh}, abstract = {In this paper, we investigate the effect of filled pauses, a discourse marker and silent pauses in a change detection experiment in natural, vocoded and synthetic speech. In natural speech change detection has been found to increase in the presence of filled pauses, we extend this work by replicating earlier findings and explore the effect of a discourse marker, like, and silent pauses. Furthermore we report how the use of \"unnatural\" speech, namely synthetic and vocoded, affects change detection rates. It was found that the filled pauses, the discourse marker and silent pauses all increase change detection rates in natural speech, however in neither synthetic nor vocoded speech did this effect appear. Rather, change detection rates decreased in both types of \"unnatural\" speech compared to natural speech. The natural results suggests that while each type of pause increase detection rates, the type of pause may have a further effect. The \"unnatural\" results suggest that it is not the full pipeline of synthetic speech that causes the degradation, but rather that something in the pre-processing, i.e. 
vocoding, of the speech database limits the resulting synthesis.}, url = {http://homepages.inf.ed.ac.uk/mwester/publications/dall:diss:2015.pdf}, author = {Dall, Rasmus and Wester, Mirjam and Corley, Martin} } @book {Obi15, title = {Exploiting Alternatives for Text-To-Speech Synthesis: From Machine to Human}, series = {Speech Prosody in Speech Synthesis: Modeling and generation of prosody for high quality and flexible speech synthesis}, year = {2015}, month = {February}, pages = {189-202}, publisher = {Springer Verlag}, organization = {Springer Verlag}, chapter = {Control of Prosody in Speech Synthesis}, author = {N. Obin and C. Veaux and P. Lanchantin} } @conference {llu_is2015a, title = {Feature-space Speaker Adaptation for Probabilistic Linear Discriminant Analysis Acoustic Models}, booktitle = {Proc. INTERSPEECH}, year = {2015}, abstract = {Probabilistic linear discriminant analysis (PLDA) acoustic models extend Gaussian mixture models by factorizing the acoustic variability using state-dependent and observation-dependent variables. This enables the use of higher dimensional acoustic features, and the capture of intra-frame feature correlations. In this paper, we investigate the estimation of speaker adaptive feature-space (constrained) maximum likelihood linear regression transforms from PLDA-based acoustic models. This feature-space speaker transformation estimation approach is potentially very useful due to the ability of PLDA acoustic models to use different types of acoustic features, for example applying these transforms to deep neural network (DNN) acoustic models for cross adaptation. We evaluated the approach on the Switchboard corpus, and observe significant word error reduction by using both the mel-frequency cepstral coefficients and DNN bottleneck features.}, author = {Lu, Liang and Renals, Steve} } @conference {Zhang:2015ab, title = {A General Artificial Neural Network Extension for {HTK}}, booktitle = {Proc. Interspeech{\textquoteright}15}, year = {2015}, address = {Dresden, Germany}, url = {http://www.isca-speech.org/archive/interspeech_2015/papers/i15_3581.pdf}, author = {Zhang, C. and Woodland, P.C.} } @conference {wester:human:IS2015, title = {Human vs Machine Spoofing Detection on Wideband and Narrowband Data}, booktitle = {Proc. of Interspeech}, year = {2015}, month = {September}, address = {Dresden}, abstract = {How well do humans detect spoofing attacks directed at automatic speaker verification systems? This paper investigates the performance of humans at detecting spoofing attacks from speech synthesis and voice conversion systems. Two speaker verification tasks, in which the speakers were either humans or machines, were also conducted. The three tasks were carried out with two types of data: wideband (16kHz) and narrowband (8kHz) telephone line simulated data. Spoofing detection by humans was compared to automatic spoofing detection (ASD) algorithms. Listening tests were carefully constructed to ensure the human and automatic tasks were as similar as possible taking into consideration listener{\textquoteright}s constraints (e.g., fatigue and memory limitations). Results for human trials show the error rates on narrowband data double compared to on wideband data. The second verification task, which included only artificial speech, showed equal overall acceptance rates for both 8kHz and 16kHz. In the spoofing detection task, there was a drop in performance on most of the artificial trials as well as on human trials.
At 8kHz, 20\% of human trials were incorrectly classified as artificial, compared to 12\% at 16kHz. The ASD algorithms also showed a drop in performance on 8kHz data, but outperformed human listeners across the board.}, url = {http://homepages.inf.ed.ac.uk/mwester/publications/wester:human:IS2015.pdf}, author = {Wester, Mirjam and Wu, Zhizheng and Yamagishi, Junichi} } @conference {chen2015improving, title = {Improving the training and evaluation efficiency of recurrent neural network language models}, booktitle = {Proc. ICASSP}, year = {2015}, address = {Brisbane, Australia}, url = {http://mi.eng.cam.ac.uk/~xc257/papers/ICASSP2015-rnnlm-extend.pdf}, author = {Chen, Xie and Liu, Xunying and Gales, Mark and Woodland, Phil} } @article { martinez2015intelligibility, title = {Intelligibility Assessment and Speech Recognizer Word Accuracy Rate Prediction for Dysarthric Speakers in a Factor Analysis Subspace}, journal = {ACM Transactions on Accessible Computing (TACCESS)}, volume = {6}, number = {3}, year = {2015}, pages = {10}, publisher = {ACM}, doi = {10.1145/2746405}, url = {http://dl.acm.org/citation.cfm?id=2746405}, author = {Mart{\'\i}nez, David and Lleida, Eduardo and Green, Phil and Christensen, Heidi and Ortega, Alfonso and Miguel, Antonio} } @conference {Liu2015, title = {An Investigation Into Speaker Informed DNN Front-end for {LVCSR}}, booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, year = {2015}, month = {April}, abstract = {Deep Neural Network (DNN) has become a standard method in many ASR tasks. Recently there is considerable interest in \“informed training\” of DNNs, where DNN input is augmented with auxiliary codes, such as i-vectors, speaker codes, speaker separation bottleneck (SSBN) features, etc. This paper compares different speaker informed DNN training methods in LVCSR task. We discuss mathematical equivalence between speaker informed DNN training and \“bias adaptation\” which uses speaker dependent biases, and give detailed analysis on influential factors such as dimension, discrimination and stability of auxiliary codes. The analysis is supported by experiments on a meeting recognition task using bottleneck feature based system. Results show that i-vector based adaptation is also effective in bottleneck feature based system (not just hybrid systems). However all tested methods show poor generalisation to unseen speakers. We introduce a system based on speaker classification followed by speaker adaptation of biases, which yields equivalent performance to an i-vector based system with 10.4\% relative improvement over baseline on seen speakers. The new approach can serve as a fast alternative especially for short utterances. \ }, keywords = {bias adaptation, deep neural network, speaker adaptation, speaker informed training, Speech recognition}, url = {http://staffwww.dcs.shef.ac.uk/people/Y.Liu/publications/pdf/Liu2015.pdf}, author = {Yulan Liu and Penny Karanasou and Thomas Hain} } @conference {Karanasou_IS2015, title = {I-Vector Estimation Using Informative Priors for Adaptation of Deep Neural Networks}, booktitle = {Proc. of Interspeech}, year = {2015}, author = {Penny Karanasou and Mark Gales and Philip Woodland} } @conference {MGB, title = {Latent Dirichlet Allocation Based Organisation of Broadcast Media Archives for Deep Neural Network Adaptation}, booktitle = {Proc. of ASRU}, year = {2015}, address = {Arizona, USA}, author = {M. Doulaty and Oscar Saz and Raymond W. M. 
Ng and Thomas Hain} } @conference {tomalin:diss:2015, title = {A Lattice-based Approach to Automatic Filled Pause Insertion}, booktitle = {Proc. of DiSS 2015}, year = {2015}, month = {August}, address = {Edinburgh}, abstract = {This paper describes a novel method for automatically inserting filled pauses (e.g., UM) into fluent texts. Although filled pauses are known to serve a wide range of psychological and structural functions in conversational speech, they have not traditionally been modelled overtly by state-of-the-art speech synthesis systems. However, several recent systems have started to model disfluencies specifically, and so there is an increasing need to create disfluent speech synthesis input by automatically inserting filled pauses into otherwise fluent text. The approach presented here interpolates Ngrams and Full-Output Recurrent Neural Network Language Models (f-RNNLMs) in a lattice-rescoring framework. It is shown that the interpolated system outperforms separate Ngram and f-RNNLM systems, where performance is analysed using the Precision, Recall, and F-score metrics.}, url = {http://homepages.inf.ed.ac.uk/mwester/publications/tomalin:diss:2015.pdf}, author = {Tomalin, Marcus and Wester, Mirjam and Dall, Rasmus and Byrne, Bill and King, Simon} } @conference {Hu_ICASSP2015, title = {Methods for applying dynamic sinusoidal models to statistical parametric speech synthesis}, booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, year = {2015}, author = {Qiong Hu and Yannis Stylianou and Ranniery Maia and Korin Richmond and Junichi Yamagishi} } @conference {Bell_ASRU, title = {{The MGB Challenge: Evaluating Multi-genre Broadcast Media Recognition}}, booktitle = {{Proceedings of the 2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)}}, year = {2015}, address = {Scottsdale, AZ}, url = {http://homepages.inf.ed.ac.uk/pbell1/bell15_mgb_challenge.pdf}, author = {Peter Bell and Mark Gales and Thomas Hain and Jonathan Kilgour and Pierre Lanchantin and Andrew Liu and Andrew McParland and Steve Renals and Oscar Saz and Mirjam Wester and Phil Woodland} } @conference {UriaICASSP15, title = {{Modelling acoustic feature dependencies with artificial neural networks: Trajectory-RNADE}}, booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, year = {2015}, abstract = {Given a transcription, sampling from a good model of acoustic feature trajectories should result in plausible realizations of an utterance. However, samples from current probabilistic speech synthesis systems result in low quality synthetic speech. Henter et al. have demonstrated the need to capture the dependencies between acoustic features conditioned on the phonetic labels in order to obtain high quality synthetic speech. These dependencies are often ignored in neural network based acoustic models. 
We tackle this deficiency by introducing a probabilistic neural network model of acoustic trajectories, trajectory RNADE, able to capture these dependencies.}, url = {http://www.benignouria.com/en/research/rnade_synthesis/draft.pdf}, author = {Benigno Uria and Iain Murray and Steve Renals and Cassia Valentini-Botinhao} } @conference {301, title = {MULTI-BASIS ADAPTIVE NEURAL NETWORK FOR RAPID ADAPTATION IN SPEECH RECOGNITION}, booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on}, year = {2015}, publisher = {IEEE}, organization = {IEEE}, url = {https://www.wiki.ed.ac.uk/download/attachments/152778802/mbann-rev15.pdf?version=1\&modificationDate=1412870033000\&api=v2}, author = {Wu, Chunyang and Gales, Mark} } @conference {llu_icassp15, title = {Multi-frame factorisation for long-span acoustic modelling}, booktitle = {Proc. ICASSP}, year = {2015}, abstract = {Acoustic models based on Gaussian mixture models (GMMs) typically use short span acoustic feature inputs. This does not capture long-term temporal information from speech owing to the conditional independence assumption of hidden Markov models. In this paper, we present an implicit approach that approximates the joint distribution of long span features by product of factorized models, in contrast to deep neural networks (DNNs) that model feature correlations directly. The approach is applicable to a broad range of acoustic models. We present experiments using GMM and probabilistic linear discriminant analysis (PLDA) based models on Switchboard, observing consistent word error rate reductions.}, author = {Lu, Liang and Renals, Steve} } @conference {ali15_multi_wer_asr, title = {Multi-reference {WER} for evaluating {ASR} for languages with no orthographic rules}, booktitle = {Proc. ASRU}, year = {2015}, author = {Ali, Ahmed. and Magdy, Walid. and Bell, Peter. and Renals, Steve} } @conference {qian15multi, title = {Multi-task joint-learning of deep neural networks for robust speech recognition}, booktitle = {Proc. ASRU{\textquoteright}15}, year = {2015}, address = {Scottsdale, Arizona, USA}, author = {Yanmin Qian and Maofan Yin and Yongbin You and Kai Yu} } @conference {Zhang:2015cd, title = {Parameterised Sigmoid and {R}e{LU} Hidden Activation Functions for {DNN} Acoustic Modelling}, booktitle = {Proc. Interspeech{\textquoteright}15}, year = {2015}, address = {Dresden, Germany}, url = {http://mi.eng.cam.ac.uk/~cz277/doc/Conference-Interspeech2015-ACT.pdf}, author = {Zhang, C. and Woodland, P.C.} } @conference {andrewliu:2015:icassp, title = {Paraphrastic Recurrent Neural Network Language Models}, booktitle = {Proc. ICASSP}, year = {2015}, address = {Brisbane, Australia}, url = {http://mi.eng.cam.ac.uk/~xc257/papers/ICASSP2015-prnnlm.pdf}, author = {Liu, Xunying and Chen, Xie and Gales, Mark and Woodland, Phil} } @proceedings {329, title = {Reconstructing Voices Within the Multiple-Average-Voice-Model Framework}, journal = {Interspeech}, year = {2015}, url = {http://www.isca-speech.org/archive/interspeech_2015/i15_2232.html}, author = {P. Lanchantin and C. Veaux and M. J. F. Gales and S. King and J. Yamagishi} } @conference {chen2015recurrent, title = {Recurrent neural network language model training with noise contrastive estimation for speech recognition}, booktitle = {Proc. 
ICASSP}, year = {2015}, address = {Brisbane, Australia}, url = {http://mi.eng.cam.ac.uk/~xc257/papers/ICASSP2015-rnnlm-nce.pdf}, author = {Chen, Xie and Liu, Xunying and Gales, Mark and Woodland, Phil} } @conference {bell15_cd_multitask, title = {Regularization of context-dependent deep neural networks with context-independent multi-task training}, booktitle = {Proc. ICASSP}, year = {2015}, url = {http://www.cstr.ed.ac.uk/downloads/publications/2015/bell15_cd_multitask.pdf}, author = {Bell, Peter and Renals, Steve} } @article {Obi15, title = {{S}ymbolic {M}odelling of {S}peech {P}rosody: {F}rom {L}inguistics to {S}tatistics}, journal = {IEEE Transactions on Audio, Speech, and Language Processing}, volume = {23}, number = {3}, year = {2015}, month = {March}, author = {N. Obin and P. Lanchantin} } @conference {288, title = {SAS: A speaker verification spoofing database containing diverse attacks}, booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, year = {2015}, author = {Zhizheng Wu and Ali Khodabakhsh and Cenk Demiroglu and Junichi Yamagishi and Daisuke Saito and Tomoki Toda and Simon King} } @proceedings {KaranasouASRU2015, title = {Speaker Diarisation and Longitudinal Linking in Multi-Genre Broadcast Data}, journal = {Proc. of ASRU}, year = {2015}, url = {http://mi.eng.cam.ac.uk/\%7Exl207/publications/conferences/ASRU2015-mgbsys-diarisation.pdf}, author = {P. Karanasou and M. J. F. Gales and P. Lanchantin and X. Liu and Y. Qian and L. Wang and P. C. Woodland and C. Zhang} } @conference {Christensen:2015uv, title = {{Speech-Enabled Environmental Control in an AAL setting for people with Speech Disorders: a Case Study}}, booktitle = {IET International Conference on Technologies for Active and Assisted Living}, year = {2015}, address = {London, UK}, url = {http://staffwww.dcs.shef.ac.uk/people/M.Nicolao/site/publications_files/speech-enabled-environmental.pdf}, author = {Christensen, Heidi and Nicolao, Mauro and Cunningham, Stuart and Deena, Salil and Green, Phil and Hain, Thomas} } @conference {Swietojanski2015, title = {Structured Output Layer with Auxiliary Targets for Context-Dependent Acoustic Modelling}, booktitle = {Proc. Interspeech}, year = {2015}, month = {September}, address = {Dresden, Germany}, abstract = {In previous work we have introduced a multi-task training technique for neural network acoustic modelling, in which context-dependent and context-independent targets are jointly learned. In this paper, we extend the approach by structuring the output layer such that the context-dependent outputs are dependent on the context-independent outputs, thus using the context-independent predictions at run-time. We have also investigated the applicability of this idea to unsupervised speaker adaptation as an approach to overcome the data sparsity issues that come to the fore when estimating systems with a large number of context-dependent states, when data is limited. We have experimented with various amounts of training material (from 10 to 300 hours) and find the proposed techniques are particularly well suited to data-constrained conditions allowing to better utilise large context-dependent state-clustered trees. Experimental results are reported for large vocabulary speech recognition using the Switchboard and TED corpora.}, url = {http://homepages.inf.ed.ac.uk/s1136550/data/Swietojanski_Interspeech2015.pdf}, author = {Swietojanski, P. and Bell, P.
and Renals, S.} } @conference {llu_is2015b, title = {A Study of the Recurrent Neural Network Encoder-Decoder for Large Vocabulary Speech Recognition}, booktitle = {Proc. INTERSPEECH}, year = {2015}, abstract = {Deep neural networks have advanced the state-of-the-art in automatic speech recognition, when combined with hidden Markov models (HMMs). Recently there has been interest in using systems based on recurrent neural networks (RNNs) to perform sequence modelling directly, without the requirement of an HMM superstructure. In this paper, we study the RNN encoder-decoder approach for large vocabulary end-to-end speech recognition, whereby an encoder transforms a sequence of acoustic vectors into a sequence of feature representations, from which a decoder recovers a sequence of words. We investigated this approach on the Switchboard corpus using a training set of around 300 hours of transcribed audio data. Without the use of an explicit language model or pronunciation lexicon, we achieved promising recognition accuracy, demonstrating that this approach warrants further investigation.}, author = {Lu, Liang and Zhang, Xingxing and Cho, KyungHyun and Renals, Steve} } @conference {bell15_alignment, title = {A system for automatic alignment of broadcast media captions using weighted finite-state transducers}, booktitle = {Proc. ASRU}, year = {2015}, url = {http://homepages.inf.ed.ac.uk/pbell1/bell15_alignment.pdf}, author = {Bell, Peter and Renals, Steve} } @conference {bell15_news_summarisation, title = {A system for automatic broadcast news summarisation, geolocation and translation}, booktitle = {Proc. Interspeech (demo session)}, year = {2015}, address = {Dresden, Germany}, abstract = {An increasing amount of news content is produced in audio-video form every day. To effectively analyse and monitoring this multilingual data stream, we require methods to extract and present audio content in accessible ways. In this paper, we describe an end-to-end system for processing and browsing audio news data. This fully automated system brings together our recent research on audio scene analysis, speech recognition, summarisation, named entity detection, geolocation, and machine translation. The graphical interface allows users to visualise the distribution of news content by entity names and story location. Browsing of news events is facilitated through extractive summaries and the ability to view transcripts in multiple languages.}, url = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/bell15_news_summarisation.pdf}, author = {Bell, Peter and Lai, Catherine and Llewellyn, Clare and Birch, Alexandra and Sinclair, Mark} } @conference {Lan15c, title = {{T}he {D}evelopment of the {C}ambridge {U}niversity {A}lignment {S}ystems for the {M}ulti-{G}enre {B}roadcast {C}hallenge}, booktitle = {Proc. of ASRU}, year = {2015}, address = {Scottsdale, USA}, url = {http://mi.eng.cam.ac.uk/~xl207/publications/conferences/ASRU2015-mgbsys-align.pdf}, author = {P. Lanchantin and P. Karanasou and M.J.F. Gales and X. Liu and L. Wang and Y. Qian and P. C. Woodland and C. Zhang} } @conference {Wester:diss:2015, title = {The Temporal Delay Hypothesis: Natural, Vocoded and Synthetic Speech}, booktitle = {Proc. of DiSS 2015}, year = {2015}, month = {August}, address = {Edinburgh}, abstract = {Including disfluencies in synthetic speech is being explored as a way of making synthetic speech sound more natural and conversational. 
How to measure whether the resulting speech is actually more natural, however, is not straightforward. Conventional approaches to synthetic speech evaluation fall short as a listener is either primed to prefer stimuli with filled pauses or, when they aren{\textquoteright}t primed they prefer more fluent speech. Psycholinguistic reaction time experiments may circumvent this issue. In this paper, we revisit one such reaction time experiment. For natural speech, delays in word onset were found to facilitate word recognition regardless of the type of delay; be they a filled pause (um), silence or a tone. We expand these experiments by examining the effect of using vocoded and synthetic speech. Our results partially replicate previous findings. For natural and vocoded speech, if the delay is a silent pause, significant increases in the speed of word recognition are found. If the delay comprises a filled pause there is a significant increase in reaction time for vocoded speech but not for natural speech. For synthetic speech, no clear effects of delay on word recognition are found. We hypothesise this is because it takes longer (requires more cognitive resources) to process synthetic speech than natural or vocoded speech.}, url = {http://homepages.inf.ed.ac.uk/mwester/publications/wester:diss:2015.pdf}, author = {Wester, Mirjam and Corley, Martin and Dall, Rasmus} } @conference {cervone15_reported_speech_prosody, title = {Towards automatic detection of reported speech in dialogue using prosodic cues}, booktitle = {Proc. Interspeech}, year = {2015}, month = {sep}, address = {Dresden, Germany}, abstract = {The phenomenon of reported speech \– whereby we quote the words, thoughts and opinions of others, or recount past dialogue \– is widespread in conversational speech. Detecting such quotations automatically has numerous applications: for example, in enhancing automatic transcription or spoken language understanding applications. However, the task is challenging, not least because lexical cues of quotations are frequently ambiguous or not present in spoken language. The aim of this paper is to identify potential prosodic cues of reported speech which could be used, along with the lexical ones, to automatically detect quotations and ascribe them to their rightful source, that is reconstructing their Attribution Relations. In order to do so we analyze SARC, a small corpus of telephone conversations that we have annotated with Attribution Relations. The results of the statistical analysis performed on the data show how variations in pitch, intensity, and timing features can be exploited as cues of quotations. Furthermore, we build a SVM classifier which integrates lexical and prosodic cues to automatically detect quotations in speech that performs significantly better than chance.}, url = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/cervone15_reported_speech_prosody.pdf}, author = {Cervone, Alessandra and Lai, Catherine and Pareti, Silvia and Bell, Peter} } @conference {dnncost_IS15, title = {{Towards minimum perceptual error training for DNN-based speech synthesis}}, booktitle = {Proc. Interspeech}, year = {2015}, month = {Sept.}, address = {Dresden, Germany}, abstract = {We propose to use a perceptually-oriented domain to improve the quality of text-to-speech generated by deep neural networks (DNNs). We train a DNN that predicts the parameters required for speech reconstruction but whose cost function is calculated in another domain.
In this paper, to represent this perceptual domain we extract an approximated version of the Spectro-Temporal Excitation Pattern that was originally proposed as part of a model of hearing speech in noise. We train DNNs that predict band aperiodicity, fundamental frequency and Mel cepstral coefficients and compare generated speech when the spectral cost function is defined in the Mel cepstral, warped log spectrum or perceptual domains. Objective results indicate that the perceptual domain system achieves the highest quality.}, url = {http://www.isca-speech.org/archive/interspeech_2015/papers/i15_0869.pdf}, author = {Valentini-Botinhao, C. and Wu, Z. and King, S.} } @conference {doulaty15b, title = {Unsupervised Domain Discovery using Latent Dirichlet Allocation for Acoustic Modelling in Speech Recognition}, booktitle = {Proceedings of the 16th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2015}, address = {Dresden, Germany}, url = {http://www.isca-speech.org/archive/interspeech_2015/papers/i15_3640.pdf}, author = {M. Doulaty and Oscar Saz and Thomas Hain} } @conference {karanasou_is14, title = {Adaptation of Deep Neural Network Acoustic Models Using Factorised I-vectors}, booktitle = {Proceedings of Interspeech{\textquoteright}14}, year = {2014}, author = {Penny Karanasou and Yongqiang Wang and Mark Gales and Phil Woodland} } @conference {casanueva_is14, title = {Adaptive speech recognition and dialogue management for users with speech disorders}, booktitle = {Proceedings of Interspeech{\textquoteright}14}, year = {2014}, author = {I. Casanueva and H. Christensen and T. Hain and P. Green} } @conference {270, title = {Automatic Selection of Speakers for Improved Acoustic Modelling: Recognition of Disordered Speech with Sparse Data}, booktitle = {Spoken Language Technology Workshop, SLT{\textquoteright}14}, year = {2014}, address = {Lake Tahoe}, author = {H. Christensen and I. Casanueva and S. Cunningham and P. Green and T. Hain} } @conference {Saz14b, title = {Background-Tracking Acoustic Features for Genre Identification of Broadcast Shows}, booktitle = {Proceedings of the 2014 Spoken Language Technology (SLT) Workshop}, year = {2014}, pages = {118{\textendash}123}, address = {South Lake Tahoe, NV, USA}, url = {http://staffwww.dcs.shef.ac.uk/people/O.Saztorralba/data/2014\%20-\%20oskarsaz\%20-\%20SLT.pdf}, author = {Oscar Saz and M. Doulaty and Thomas Hain} } @article {Swietojanski:SPL14, title = {Convolutional Neural Networks for Distant Speech Recognition}, journal = {Signal Processing Letters, IEEE}, volume = {21}, number = {9}, year = {2014}, month = {September}, pages = {1120-1124}, abstract = {We investigate convolutional neural networks (CNNs) for large vocabulary distant speech recognition, trained using speech recorded from a single distant microphone (SDM) and multiple distant microphones (MDM). In the MDM case we explore a beamformed signal input representation compared with the direct use of multiple acoustic channels as a parallel input to the CNN.
We have explored different weight sharing approaches, and propose a channel-wise convolution with two-way pooling. Our experiments, using the AMI meeting corpus, found that CNNs improve the word error rate (WER) by 6.5\% relative compared to conventional deep neural network (DNN) models and 15.7\% over a discriminatively trained Gaussian mixture model (GMM) baseline. For cross-channel CNN training, the WER improves by 3.5\% relative over the comparable DNN structure. Compared with the best beamformed GMM system, cross-channel convolution reduces the WER by 9.7\% relative, and matches the accuracy of a beamformed DNN.}, issn = {1070-9908}, doi = {10.1109/LSP.2014.2325781}, author = {Swietojanski, P. and Ghoshal, A. and Renals, S.} } @conference {bell14_xling_mlan, title = {Cross-lingual adaptation with multi-task adaptive networks}, booktitle = {Proc. Interspeech}, year = {2014}, abstract = {Posterior-based or bottleneck features derived from neural networks trained on out-of-domain data may be successfully applied to improve speech recognition performance when data is scarce for the target domain or language. In this paper we combine this approach with the use of a hierarchical deep neural network (DNN) network structure \– which we term a multi-level adaptive network (MLAN) \– and the use of multitask learning. We have applied the technique to cross-lingual speech recognition experiments on recordings of TED talks and European Parliament sessions in English (source language) and German (target language). We demonstrate that the proposed method can lead to improvements over standard methods, even when the quantity of training data for the target language is relatively high. When the complete method is applied, we achieve relative WER reductions of around 13\% compared to a monolingual hybrid DNN baseline.}, url = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/bell_xling_mlan_is2014.pdf}, author = {Bell, Peter and Driesen, Joris and Renals, Steve} } @article {lu2013cross, title = {{Cross-lingual subspace Gaussian mixture model for low-resource speech recognition}}, journal = {IEEE Transactions on Audio, Speech and Language Processing}, year = {2014}, abstract = {This paper studies cross-lingual acoustic modelling in the context of subspace Gaussian mixture models (SGMMs). SGMMs factorize the acoustic model parameters into a set that is globally shared between all the states of a hidden Markov model (HMM) and another that is specific to the HMM states. We demonstrate that the SGMM global parameters are transferable between languages, particularly when the parameters are trained multilingually. As a result, acoustic models may be trained using limited amounts of transcribed audio by borrowing the SGMM global parameters from one or more source languages, and only training the state-specific parameters on the target language audio. Model regularization using $\ell_1$-norm penalty is shown to be particularly effective at avoiding overtraining and leading to lower word error rates. We investigate maximum a posteriori (MAP) adaptation of subspace parameters in order to reduce the mismatch between the SGMM global parameters of the source and target languages. In addition, monolingual and cross-lingual speaker adaptive training is used to reduce the model variance introduced by speakers. 
We have systematically evaluated these techniques by experiments on the GlobalPhone corpus.}, url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6600925}, author = {Lu, Liang and Ghoshal, Arnab and Renals, Steve} } @conference {dall_IS14, title = {The Effect of Filled Pauses and Speaking Rate on Speech Comprehension in Natural, Vocoded and Synthetic Speech}, booktitle = {Proceedings of Interspeech}, year = {2014}, url = {http://homepages.inf.ed.ac.uk/mwester/publications/Dall_Wester_Corley_IS14.pdf}, author = {Rasmus Dall and Mirjam Wester and Martin Corley} } @conference {chen2014efficient, title = {Efficient {GPU}-based training of recurrent neural network language models using spliced sentence bunch}, booktitle = {Proc. Interspeech}, year = {2014}, address = {Singapore}, url = {http://mi.eng.cam.ac.uk/~xc257/papers/RNNLMTrain_Interspeech2014.pdf}, author = {Chen, Xie and Wang, Yongqiang and Liu, Xunying and Gales, Mark and Woodland, Phil} } @conference {251, title = {EFFICIENT LATTICE RESCORING USING RECURRENT NEURAL NETWORK LANGUAGE MODELS}, booktitle = {IEEE ICASSP2014}, year = {2014}, month = {04/05/2014}, publisher = {IEEE ICASSP2014}, organization = {IEEE ICASSP2014}, address = {Florence, Italy}, abstract = {Recurrent neural network language models (RNNLM) have become an increasingly popular choice for state-of-the-art speech recognition systems due to their inherently strong generalization performance. As these models use a vector representation of complete history contexts, RNNLMs are normally used to rescore N-best lists. Motivated by their intrinsic characteristics, two novel lattice rescoring methods for RNNLMs are investigated in this paper. The first uses an $n$-gram style clustering of history contexts. The second approach directly exploits the distance measure between hidden history vectors. Both methods produced 1-best performance comparable with a 10k-best rescoring baseline RNNLM system on a large vocabulary conversational telephone speech recognition task. Significant lattice size compression of over 70\% and consistent improvements after confusion network (CN) decoding were also obtained over the N-best rescoring approach. \ }, keywords = {language model, recurrent neural network, Speech recognition}, author = {Xunying Liu and Yongqiang Wang and Xie Chen and Mark Gales and Phil Woodland} } @conference {aylett2014, title = {A Flexible Front-End for HTS}, booktitle = {Proc. Interspeech}, year = {2014}, address = {Singapore}, abstract = {Parametric speech synthesis techniques depend on full context acoustic models generated by language front-ends, which analyse linguistic and phonetic structure. HTS, the leading parametric synthesis system, can use a number of different front-ends to generate full context models for synthesis and training. In this paper we explore the use of a new text processing front-end that has been added to the speech recognition toolkit Kaldi as part of an ongoing project to produce a new parametric speech synthesis system, Idlak. The use of XML specification files, a modular design, and modern coding and testing approaches, make the Idlak front-end ideal for adding, altering and experimenting with the contexts used in full context acoustic models. The Idlak front-end was evaluated against the standard Festival front-end in the HTS system. 
Results from the Idlak front-end compare well with the more mature Festival front-end (Idlak - 2.83 MOS vs Festival - 2.85 MOS), although a slight reduction in naturalness perceived by non-native English speakers can be attributed to Festival\’s insertion of non-punctuated pauses.}, url = {http://www.isca-speech.org/archive/archive_papers/interspeech_2014/i14_1283.pdf}, author = {Aylett, Matthew P. and Dall, Rasmus and Ghoshal, Arnab and Henter, Gustav Eje and Merritt, Thomas} } @conference {Dall_Tomalin_IS14, title = {Investigating Automatic \& Human Filled Pause Insertion for Speech Synthesis}, booktitle = {Proceedings of Interspeech}, year = {2014}, url = {http://homepages.inf.ed.ac.uk/mwester/publications/Dall_Tomalin_Wester.pdf}, author = {Rasmus Dall and Marcus Tomalin and Mirjam Wester and William Byrne and Simon King} } @conference {merritt2014investigating, title = {Investigating source and filter contributions, and their interaction, to statistical parametric speech synthesis}, booktitle = {Proc. Interspeech}, year = {2014}, month = {September}, pages = {1509{\textendash}1513}, address = {Singapore}, abstract = {This paper presents an investigation of the separate perceptual degradations introduced by the modelling of source and filter features in statistical parametric speech synthesis. This is achieved using stimuli in which various permutations of natural, vocoded and modelled source and filter are combined, optionally with the addition of filter modifications (e.g. global variance or modulation spectrum scaling). We also examine the assumption of independence between source and filter parameters. Two complementary perceptual testing paradigms are adopted. In the first, we ask listeners to perform {\textquotedblleft}same or different quality{\textquotedblright} judgements between pairs of stimuli from different configurations. In the second, we ask listeners to give an opinion score for individual stimuli. Combining the findings from these tests, we draw some conclusions regarding the relative contributions of source and filter to the currently rather limited naturalness of statistical parametric synthetic speech, and test whether current independence assumptions are justified.}, url = {http://homepages.inf.ed.ac.uk/s1270339/Publications/merritt2014investigating.pdf}, author = {Merritt, Thomas and Raitio, Tuomo and King, Simon} } @conference {Swietojanski2014_lhuc, title = {Learning Hidden Unit Contributions for Unsupervised Speaker Adaptation of Neural Network Acoustic Models}, booktitle = {Proc. IEEE Workshop on Spoken Language Technology}, year = {2014}, month = {December}, address = {Lake Tahoe, USA}, abstract = {This paper proposes a simple yet effective model-based neural network speaker adaptation technique that learns speaker-specific hidden unit contributions given adaptation data, without requiring any form of speaker-adaptive training, or labelled adaptation data. An additional amplitude parameter is defined for each hidden unit; the amplitude parameters are tied for each speaker, and are learned using unsupervised adaptation. We conducted experiments on the TED talks data, as used in the International Workshop on Spoken Language Translation (IWSLT) evaluations. Our results indicate that the approach can reduce word error rates on standard IWSLT test sets by about 8{\textendash}15\% relative compared to unadapted systems, with a further reduction of 4{\textendash}6\% relative when combined with feature-space maximum likelihood linear regression (fMLLR).
The approach can be employed in most existing feed-forward neural network architectures, and we report results using various hidden unit activation functions: sigmoid, maxout, and rectifying linear units (ReLU).}, author = {Swietojanski, P and Renals, S} } @conference {279, title = {Measuring the perceptual effects of modelling assumptions in speech synthesis using stimuli constructed from repeated natural speech}, booktitle = {Proceedings of Interspeech}, year = {2014}, month = {09/2014}, address = {Singapore}, abstract = {Acoustic models used for statistical parametric speech synthesis typically incorporate many modelling assumptions. It is an open question to what extent these assumptions limit the naturalness of synthesised speech. To investigate this question, we recorded a speech corpus where each prompt was read aloud multiple times. By combining speech parameter trajectories extracted from different repetitions, we were able to quantify the perceptual effects of certain commonly used modelling assumptions. Subjective listening tests show that taking the source and filter parameters to be conditionally independent, or using diagonal covariance matrices, significantly limits the naturalness that can be achieved. Our experimental results also demonstrate the shortcomings of mean-based parameter generation.}, keywords = {acoustic modelling, diagonal covariance matrices, repeated speech, speech synthesis, stream independence}, url = {http://mi.eng.cam.ac.uk/~sms46/papers/henter2014measuring.pdf}, author = {Henter, Gustav Eje and Merritt, Thomas and Shannon, Matt and Mayo, Catherine and King, Simon} } @conference {Lan14, title = {Multiple-Average-Voice-based Speech Synthesis}, booktitle = {Proc. ICASSP}, year = {2014}, author = {P. Lanchantin and M. J. F. Gales and S. King and J. Yamagishi} } @conference {Renals:HSCMA14, title = {Neural Networks for Distant Speech Recognition}, booktitle = {The 4th Joint Workshop on Hands-free Speech Communication and Microphone Arrays (HSCMA)}, year = {2014}, abstract = {Distant conversational speech recognition is challenging owing to the presence of multiple, overlapping talkers, additional non-speech acoustic sources, and the effects of reverberation. In this paper we review work on distant speech recognition, with an emphasis on approaches which combine multichannel signal processing with acoustic modelling, and investigate the use of hybrid neural network / hidden Markov model acoustic models for distant speech recognition of meetings recorded using microphone arrays. In particular we investigate the use of convolutional and fully-connected neural networks with different activation functions (sigmoid, rectified linear, and maxout). We performed experiments on the AMI and ICSI meeting corpora, with results indicating that neural network models are capable of significant improvements in accuracy compared with discriminatively trained Gaussian mixture models.}, author = {Renals, S. 
and Swietojanski, P.} } @article {liu2014paraphrastic, title = {Paraphrastic language models}, journal = {Computer Speech \& Language}, volume = {28}, number = {6}, year = {2014}, pages = {1298{\textendash}1316}, publisher = {Elsevier}, url = {http://mi.eng.cam.ac.uk/~xl207/publications/journals/csl14-parlm-preprn.pdf}, author = {Liu, Xunying and Gales, Mark and Woodland, Phil} } @conference {252, title = {PARAPHRASTIC NEURAL NETWORK LANGUAGE MODELS}, booktitle = {IEEE ICASSP2014}, year = {2014}, month = {04/05/2014}, publisher = {IEEE ICASSP2014}, organization = {IEEE ICASSP2014}, address = {Florence, Italy}, abstract = {Expressive richness in natural languages presents a significant challenge for statistical language models (LM). As multiple word sequences can represent the same underlying meaning, only modelling the observed surface word sequence can lead to poor context coverage. To handle this issue, paraphrastic LMs were previously proposed to improve the generalization of back-off n-gram LMs. Paraphrastic neural network LMs (NNLM) are investigated in this paper. Using a paraphrastic multi-level feedforward NNLM modelling both word and phrase sequences, significant error rate reductions of 1.3\% absolute (8\% relative) and 0.9\% absolute (5.5\% relative) were obtained over the baseline n-gram and NNLM systems respectively on a state-of-the-art conversational telephone speech recognition system trained on 2000 hours of audio and 545 million words of texts.}, keywords = {neural network language model, paraphrase, Speech recognition}, author = {Xunying Liu and Mark Gales and Phil Woodland} } @article {lu2014a, title = {Probabilistic Linear Discriminant Analysis for Acoustic Modelling}, journal = {IEEE Signal Processing Letters}, volume = {21}, number = {6}, year = {2014}, pages = {702-706}, abstract = {In this letter, we propose a new acoustic modelling approach for automatic speech recognition based on probabilistic linear discriminant analysis (PLDA), which is used to model the state density function for the standard hidden Markov models (HMMs). Unlike the conventional Gaussian mixture models (GMMs) where the correlations are weakly modelled by using the diagonal covariance matrices, PLDA captures the correlations of feature vector in subspaces without vastly expanding the model. It also allows the usage of high dimensional feature input, and therefore is more flexible to make use of different type of acoustic features. We performed the preliminary experiments on the Switchboard corpus, and demonstrated the feasibility of this acoustic model.}, doi = {10.1109/LSP.2014.2313410}, author = {Lu, Liang and Renals, Steve} } @conference {llu_is2014, title = {Probabilistic linear discriminant analysis with bottleneck features for speech recognition}, booktitle = {Proc. INTERSPEECH}, year = {2014}, author = {Lu, Liang and Renals, Steve} } @conference {sinclairbell_interspeech14, title = {A semi-Markov model for speech segmentation with an utterance-break prior}, booktitle = {Proc. Interspeech}, year = {2014}, month = {September}, abstract = {Speech segmentation is the problem of finding the end points of a speech utterance for passing to an automatic speech recognition (ASR) system. The quality of this segmentation can have a large impact on the accuracy of the ASR system; in this paper we demonstrate that it can have an even larger impact on downstream natural language processing tasks \– in this case, machine translation.
We develop a novel semi-Markov model which allows the segmentation of audio streams into speech utterances which are optimised for the desired distribution of sentence lengths for the target domain. We compare this with existing state-of-the-art methods and show that it is able to achieve not only improved ASR performance, but also to yield significant benefits to a speech translation task.}, url = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/interspeech2014.pdf}, author = {Sinclair, Mark and Bell, Peter and Birch, Alexandra and McInnes, Fergus} } @conference {Zhang2014, title = {Semi-Supervised DNN Training in Meeting Recognition}, year = {2014}, month = {December}, address = {South Lake Tahoe, USA}, abstract = {Training acoustic models for ASR requires large amounts of labelled data which is costly to obtain. Hence it is desirable to make use of unlabelled data. While unsupervised training can give gains for standard HMM training, it is more difficult to make use of unlabelled data for discriminative models. This paper explores semi-supervised training of Deep Neural Networks (DNN) in a meeting recognition task. We first analyse the impact of imperfect transcription on the DNN and the ASR performance. As labelling error is the source of the problem, we investigate two options available to reduce that: selecting data with fewer errors, and changing the dependence on noise by reducing label precision. Both confidence based data selection and label resolution change are explored in the context of two scenarios of matched and unmatched unlabelled data. We introduce improved DNN based confidence score estimators and show their performance on data selection for both scenarios. Confidence score based data selection was found to yield up to 14.6\% relative WER reduction, while better balance between label resolution and recognition hypothesis accuracy allowed further WER reductions by 16.6\% relative in the mismatched scenario.}, url = {http://staffwww.dcs.shef.ac.uk/people/Y.Liu/publications/pdf/Zhang2014.pdf}, author = {Pengyuan Zhang and Yulan Liu and Thomas Hain} } @conference {Zhang:2014ab, title = {Standalone training of context-dependent deep neural network acoustic models}, booktitle = {IEEE ICASSP 2014}, year = {2014}, month = {05/2014}, address = {Florence, Italy}, abstract = {
Recently, context-dependent (CD) deep neural network (DNN) hidden Markov models (HMMs) have been widely used as acoustic models for speech recognition. However, the standard method to build such models requires target training labels from a system using HMMs with Gaussian mixture model output distributions (GMM-HMMs). In this paper, we introduce a method for training state-of-the-art CD-DNN-HMMs without relying on such a pre-existing system. We achieve this in two steps: build a context-independent (CI) DNN iteratively with word transcriptions, and then cluster the equivalent output distributions of the untied CD-DNN HMM states using the decision tree based state tying approach. Experiments have been performed on the Wall Street Journal corpus and the resulting system gave comparable word error rates (WER) to CD-DNNs built based on GMM-HMM alignments and state-clustering.
}, url = {http://mi.eng.cam.ac.uk/~cz277/doc/Conference-ICASSP2014.pdf}, author = {Zhang, C. and Woodland, P. C.} } @conference {Saz14, title = {{Using Contextual Information in Joint Factor Eigenspace MLLR for Speech Recognition in Diverse Scenarios}}, booktitle = {{Proceedings of the 2014 International Conference on Acoustics, Speech and Signal Processing (ICASSP)}}, year = {2014}, month = {May}, pages = {6314{\textendash}6318}, address = {Florence, Italy}, author = {Oscar Saz and Thomas Hain} } @conference {Valentini_IS14, title = {Using linguistic predictability and the Lombard effect to increase the intelligibility of synthetic speech in noise}, booktitle = {Proceedings of Interspeech}, year = {2014}, abstract = {In order to predict which words in a sentence are harder to understand in noise it is necessary to consider not only audibility but also semantic or linguistic information. This paper focuses on using linguistic predictability to inform an intelligibility enhancement method that uses Lombard-adapted synthetic speech to modify low predictable words in Speech Perception in Noise (SPIN) test sentences. Word intelligibility in the presence of speech-shaped noise was measured using plain, Lombard and a combination of the two synthetic voices. The findings show that the Lombard voice increases intelligibility in noise but the intelligibility gap between words in a high and low predictable context still remains. Using a Lombard voice when a word is unpredictable is a good strategy, but if a word is predictable from its context the Lombard benefit only occurs when other words in the sentence are also modified.}, url = {http://homepages.inf.ed.ac.uk/mwester/publications/Valentini_Wester_IS14.pdf}, author = {Cassia Valentini-Botinhao and Mirjam Wester} } @conference {Liu1405:Using, title = {Using neural network front-ends on far field multiple microphones based speech recognition}, booktitle = {ICASSP2014 - Speech and Language Processing (ICASSP2014 - SLTC)}, year = {2014}, address = {Florence, Italy}, abstract = {This paper presents an investigation of far field speech recognition using beamforming and channel concatenation in the context of Deep Neural Network (DNN) based feature extraction. While speech enhancement with beamforming is attractive, the algorithms are typically signal-based with no information about the special properties of speech. A simple alternative to beamforming is concatenating multiple channel features. Results presented in this paper indicate that channel concatenation gives similar or better results. On average the DNN front-end yields a 25\% relative reduction in Word Error Rate (WER). Further experiments aim at including relevant information in training adapted DNN features.
Augmenting the standard DNN input with the bottleneck feature from a Speaker Aware Deep Neural Network (SADNN) shows a general advantage over the standard DNN based recognition system, and yields additional improvements for far field speech recognition.}, keywords = {beamforming, deep neural networks, multiple microphone, Speech recognition}, url = {http://staffwww.dcs.shef.ac.uk/people/Y.Liu/publications/pdf/Liu2014.pdf}, author = {Yulan Liu and Pengyuan Zhang and Thomas Hain} } @conference {lu2013_pronunciation, title = {{Acoustic Data-driven Pronunciation Lexicon for Large Vocabulary Speech Recognition}}, booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)}, year = {2013}, publisher = {Proc. IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)}, organization = {Proc. IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)}, abstract = {Speech recognition systems normally use handcrafted pronunciation lexicons designed by linguistic experts. Building and maintaining such a lexicon is expensive and time consuming. This paper concerns automatically learning a pronunciation lexicon for speech recognition. We assume the availability of a small seed lexicon and then learn the pronunciations of new words directly from speech that is transcribed at word-level. We present two implementations for refining the putative pronunciations of new words based on acoustic evidence. The first one is an expectation maximization (EM) algorithm based on weighted finite state transducers (WFSTs) and the other is its Viterbi approximation. We carried out experiments on the Switchboard corpus of conversational telephone speech. The expert lexicon has a size of more than 30,000 words, from which we randomly selected 5,000 words to form the seed lexicon.
By using the proposed lexicon learning method, we have significantly improved the accuracy compared with a lexicon learned using a grapheme-to-phoneme transformation, and have obtained a word error rate that approaches that achieved using a fully handcrafted lexicon.}, url = {http://homepages.inf.ed.ac.uk/llu/pdf/liang_asru13.pdf}, author = {Lu, Liang and Ghoshal, Arnab and Renals, Steve} } @conference {Saz13, title = {{Asynchronous Factorisation of Speaker and Background with Feature Transforms in Speech Recognition}}, booktitle = {{Proceedings of the 14th Annual Conference of the International Speech Communication Association (Interspeech)}}, year = {2013}, month = {August}, pages = {1238{\textendash}1242}, address = {Lyon, France}, url = {http://staffwww.dcs.shef.ac.uk/people/O.Saztorralba/data/2013\%20-\%20oskarsaz\%20-\%20Interspeech13.pdf}, author = {Oscar Saz and Thomas Hain} } @conference {Saz13b, title = {{Automatic Transcription of Multi-Genre Media Archives}}, booktitle = {{Proceedings of the First Workshop on Speech, Language and Audio in Multimedia}}, year = {2013}, month = {August}, pages = {26{\textendash}31}, address = {Marseille, France}, url = {https://www.repository.cam.ac.uk/handle/1810/244726}, author = {P. Lanchantin and P.J. Bell and M.J.F. Gales and Thomas Hain and X. Liu and Y. Long and J. Quinnell and S. Renals and Oscar Saz and M.S. Seigel and P. Swietojanski and P.C. Woodland} } @article {shannon2013autoregressive, title = {{Autoregressive models for statistical parametric speech synthesis}}, journal = {IEEE Trans. Audio Speech Language Process.}, volume = {21}, number = {3}, year = {2013}, pages = {587{\textendash}597}, url = {http://www.dspace.cam.ac.uk/handle/1810/244407}, author = {Matt Shannon and Heiga Zen and William Byrne} } @conference {Lu_SSW8, title = {Combining a Vector Space Representation of Linguistic Context with a Deep Neural Network for Text-To-Speech Synthesis}, booktitle = {8th ISCA Workshop on Speech Synthesis}, year = {2013}, month = {August}, pages = {281{\textendash}285}, address = {Barcelona, Spain}, abstract = {Conventional statistical parametric speech synthesis relies on decision trees to cluster together similar contexts, resulting in tied-parameter context-dependent hidden Markov models (HMMs). However, decision tree clustering has a major weakness: it uses hard division and subdivides the model space based on one feature at a time, fragmenting the data and failing to exploit interactions between linguistic context features. These linguistic features themselves are also problematic, being noisy and of varied relevance to the acoustics. We propose to combine our previous work on vector-space representations of linguistic context, which have the added advantage of working directly from textual input, and Deep Neural Networks (DNNs), which can directly accept such continuous representations as input.
The outputs of the network are probability distributions over speech features. Maximum Likelihood Parameter Generation is then used to create parameter trajectories, which in turn drive a vocoder to generate the waveform. Various configurations of the system are compared, using both conventional and vector space context representations and with the DNN making speech parameter predictions at two different temporal resolutions: frames, or states. Both objective and subjective results are presented.}, url = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_PS3-3_Lu.pdf}, author = {Lu, Heng and King, Simon and Watts, Oliver} } @conference {Christensen2013, title = {Combining in-domain and out-of-domain speech data for automatic recognition of disordered speech}, booktitle = {Interspeech{\textquoteright}13}, year = {2013}, url = {http://staffwww.dcs.shef.ac.uk/people/H.Christensen/pubs/christensen_is13_2.pdf}, author = {H. Christensen and M. B. Aniol and Bell, P. and P. Green and T. Hain and S. King and P Swietojanski} } @conference {253, title = {Cross-domain Paraphrasing For Improving Language Modelling Using Out-of-domain Data}, booktitle = {ISCA Interspeech2013}, year = {2013}, month = {25/08/2013}, publisher = {ISCA Interspeech2013}, organization = {ISCA Interspeech2013}, address = {Lyon, France}, abstract = {In natural languages the variability in the underlying linguistic generation rules significantly alters the observed surface word sequence they create, and thus introduces a mismatch against other data generated via alternative realizations associated with, for example, a different domain. Hence, direct modelling of out-of-domain data can result in poor generalization to the in-domain data of interest. To handle this problem, this paper investigated using cross-domain paraphrastic language models to improve in-domain language modelling (LM) using out-of-domain data. Phrase level paraphrase models learnt from each domain were used to generate paraphrase variants for the data of other domains. These were used to both improve the context coverage of in-domain data, and reduce the domain mismatch of the out-of-domain data. Significant error rate reduction of 0.6\% absolute was obtained on a state-of-the-art conversational telephone speech recognition task using a cross-domain paraphrastic multi-level LM trained on a billion words of mixed conversational and broadcast news data. Consistent improvements on the in-domain data context coverage were also obtained.}, keywords = {language model, paraphrase, Speech recognition}, author = {Xunying Liu and Mark Gales and Phil Woodland} } @conference {driesen13_iwslt13_german, title = {Description of the {UEDIN} system for German {ASR}}, booktitle = {Proc. IWSLT}, year = {2013}, author = {Driesen, J. and Bell, P. and S. Renals} } @conference {Gonzalez2013, title = {Dysarthria Intelligibility Assessment in a Factor Analysis Total Variability Space}, booktitle = {Interspeech{\textquoteright}13}, year = {2013}, url = {http://staffwww.dcs.shef.ac.uk/people/H.Christensen/pubs/christensen_is13_3.pdf}, author = {D. M Gonz{\'a}lez and P. Green and H. Christensen} } @conference {Heng13, title = {FACTORIZED CONTEXT MODELLING FOR TEXT-TO-SPEECH SYNTHESIS}, booktitle = {Proc ICASSP 2013}, year = {2013}, month = {May}, address = {Vancouver, Canada}, url = {http://homepages.inf.ed.ac.uk/hlu2/pdfs/IC13HengSimon.pdf}, author = {H. Lu and S.
King} } @conference {shannon2013fast, title = {{Fast, low-artifact speech synthesis considering global variance}}, booktitle = {Proc. ICASSP 2013}, year = {2013}, publisher = {IEEE}, organization = {IEEE}, address = {Vancouver, Canada}, doi = {http://mi.eng.cam.ac.uk/~sms46/papers/shannon2013fast.pdf}, url = {http://www.dspace.cam.ac.uk/handle/1810/244408}, author = {Matt Shannon and William Byrne} } @conference {rasipuram13_gaelic_graphemes, title = {Grapheme and multilingual posterior features for under-resourced speech recognition: a study on {S}cottish {G}aelic}, booktitle = {Proc. ICASSP}, year = {2013}, month = {may}, address = {Vancouver, Canada}, abstract = {Standard automatic speech recognition (ASR) systems use phonemes as subword units. Thus, one of the primary resources required to build a good ASR system is a well developed phoneme pronunciation lexicon. However, under-resourced languages typically lack such lexical resources. In this paper, we investigate recently proposed grapheme-based ASR in the framework of Kullback-Leibler divergence based hidden Markov model (KL-HMM) for under-resourced languages, particularly Scottish Gaelic which has no lexical resources. More specifically, we study the use of grapheme and multilingual phoneme class conditional probabilities (posterior features) as feature observations in the KL-HMM. ASR studies conducted show that the proposed approach yields better system compared to the conventional HMM/GMM approach using cepstral features. Furthermore, grapheme posterior features estimated using both auxiliary data and Gaelic data yield the best system.}, url = {http://homepages.inf.ed.ac.uk/s0566164/gaelic_graphemes_icassp13.pdf}, author = {Rasipuram, Ramya and Bell, Peter and Magimai.-Doss, Mathew} } @conference {christensen_slpat13, title = {homeService: Voice-enabled assistive technology in the home using cloud-based automatic speech recognition}, booktitle = {4th Workshop on Speech and Language Processing (SLPAT)}, year = {2013}, url = {http://staffwww.dcs.shef.ac.uk/people/H.Christensen/pubs/christensen_slpat13.pdf}, author = {H. Christensen and S. Cunningham and P. Green and T. Hain} } @conference {Swietojanski:ASRU13, title = {HYBRID ACOUSTIC MODELS FOR DISTANT AND MULTICHANNEL LARGE VOCABULARY SPEECH RECOGNITION}, booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)}, year = {2013}, abstract = {We investigate the application of deep neural network (DNN)-hidden Markov model (HMM) hybrid acoustic models for far-field speech recognition of meetings recorded using microphone arrays. We show that the hybrid models achieve significantly better accuracy than conventional systems based on Gaussian mixture models (GMMs). We observe up to 8\% absolute word error rate (WER) reduction from a discriminatively trained GMM baseline when using a single distant microphone, and between 4{\textendash}6\% absolute WER reduction when using beamforming on various combinations of array channels. By training the networks on audio from multiple channels, we find the networks can recover significant part of accuracy difference between the single distant microphone and beamformed configurations. Finally, we show that the accuracy of a network recognising speech from a single distant microphone can approach that of a multi-microphone setup by training with data from other microphones.}, url = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Swietojanski_ASRU2013.pdf}, author = {P Swietojanski and Ghoshal, A. and S. 
Renals} } @conference {Lon13, title = {Improving Lightly Supervised Training for Broadcast Transcriptions}, booktitle = {Interspeech 2013}, year = {2013}, url = {http://recherche.ircam.fr/equipes/analyse-synthese/lanchant/uploads/Main/Interspeech13.pdf}, author = {Long, Y. and Gales, M.J.F. and Lanchantin, P. and Liu, X. and M.S. Seigel and Woodland, P.C.} } @conference {Merritt_SSW8, title = {Investigating the shortcomings of {HMM} synthesis}, booktitle = {8th ISCA Workshop on Speech Synthesis}, year = {2013}, month = {August}, pages = {185{\textendash}190}, address = {Barcelona, Spain}, abstract = {This paper presents the beginnings of a framework for formal testing of the causes of the current limited quality of HMM (Hidden Markov Model) speech synthesis. This framework separates each of the effects of modelling to observe their independent effects on vocoded speech parameters in order to address the issues that are restricting the progression to highly intelligible and natural-sounding speech synthesis. The simulated HMM synthesis conditions are performed on spectral speech parameters and tested via a pairwise listening test, asking listeners to perform a \“same or different\” judgement on the quality of the synthesised speech produced between these conditions. These responses are then processed using multidimensional scaling to identify the qualities in modelled speech that listeners are attending to and thus forms the basis of why they are distinguishable from natural speech. The future improvements to be made to the framework will finally be discussed which include the extension to more of the parameters modelled during speech synthesis.}, url = {http://www.cstr.ed.ac.uk/downloads/publications/2013/ssw8_PS2-4_Merritt.pdf}, author = {Merritt, Thomas and King, Simon} } @article {210, title = {Joint Uncertainty Decoding for Noise Robust Subspace Gaussian Mixture Models}, journal = {IEEE Transactions on Audio, Speech and Language Processing}, year = {2013}, abstract = {Joint uncertainty decoding (JUD) is a model-based noise compensation technique for conventional Gaussian Mixture Model (GMM) based speech recognition systems. Unlike vector Taylor series (VTS) compensation which operates on the individual Gaussian components in an acoustic model, JUD clusters the Gaussian components into a smaller number of classes, sharing the compensation parameters for the set of Gaussians in a given class. This significantly reduces the computational cost. In this paper, we investigate noise compensation for subspace Gaussian mixture model (SGMM) based speech recognition systems using JUD. The total number of Gaussian components in an SGMM is typically very large. Therefore direct compensation of the individual Gaussian components, as performed by VTS, is computationally expensive. In this paper we show that JUD-based noise compensation can be successfully applied to SGMMs in a computationally efficient way. We evaluate the JUD/SGMM technique on the standard Aurora 4 corpus. Our experimental results indicate that the JUD/SGMM system results in lower word error rates compared with a conventional GMM system with either VTS-based or JUD-based noise compensation.}, url = {http://homepages.inf.ed.ac.uk/aghoshal/pubs/taslp13-sgmm_jud.pdf}, author = {Lu, Liang and K. K. 
Chin and Ghoshal, Arnab and Renals, Steve} } @conference {christensen_pron_is13, title = {Learning speaker-specific pronunciations of disordered speech}, booktitle = {Interspeech{\textquoteright}13}, year = {2013}, url = {http://staffwww.dcs.shef.ac.uk/people/H.Christensen/pubs/christensen_is13_1.pdf}, author = {H. Christensen and P. Green and T. Hain} } @conference {bell13_lecture_transcription, title = {A lecture transcription system combining neural network acoustic and language models}, booktitle = {Proc. Interspeech}, year = {2013}, month = {aug}, address = {Lyon, France}, abstract = {This paper presents a new system for automatic transcription of lectures. The system combines a number of novel features, including deep neural network acoustic models using multi-level adaptive networks to incorporate out-of-domain information, and factored recurrent neural network language models. We demonstrate that the system achieves large improvements on the TED lecture transcription task from the 2012 IWSLT evaluation \– our results are currently the best reported on this task, showing an relative WER reduction of more than 16\% compared to the closest competing system from the evaluation.}, url = {http://homepages.inf.ed.ac.uk/pbell1/lecture_transcription_is2013.pdf}, author = {Bell, Peter and Yamamoto, Hitoshi and P Swietojanski and Wu, Youzheng and McInnes, Fergus and Hori, Chiori and Renals, Steve} } @conference {stan13_lightly_supervised_discriminative, title = {Lightly Supervised Discriminative Training of Grapheme Models for Improved Sentence-level Alignment of Speech and Text Data}, booktitle = {Proc. Interspeech}, year = {2013}, month = {aug}, address = {Lyon, France}, abstract = {This paper introduces a method for lightly supervised discriminative training using MMI to improve the alignment of speech and text data for use in training HMM-based TTS systems for low-resource languages. In TTS applications, due to the use of long-span contexts, it is important to select training utterances which have wholly correct transcriptions. In a low-resource setting, when using poorly trained grapheme models, we show that the use of MMI discriminative training at the grapheme-level enables us to increase the amount of correctly aligned data by 40\%, while maintaining a 7\% sentence error rate and 0.8\% word error rate. We present the procedure for lightly supervised discriminative training with regard to the objective of minimising sentence error rate.}, url = {http://homepages.inf.ed.ac.uk/pbell1/lightly_supervised_discriminative_is2013.pdf}, author = {Stan, Adriana and Bell, Peter and Junichi Yamagishi and King, Simon} } @conference {WILDCAT, title = {Lightly supervised learning from a damaged natural speech corpus}, booktitle = {Proc. IEEE ICASSP 2013}, year = {2013}, url = {http://staffwww.dcs.shef.ac.uk/people/C.Fox/fox_icassp13.pdf}, author = {Charles Fox and T. Hain} } @conference {bell13_mlan, title = {Multi-level adaptive networks in tandem and hybrid {ASR} systems}, booktitle = {Proc. ICASSP}, year = {2013}, month = {may}, address = {Vancouver, Canada}, abstract = {In this paper we investigate the use of Multi-level adaptive networks (MLAN) to incorporate out-of-domain data when training large vocabulary speech recognition systems. In a set of experiments on multi-genre broadcast data and on TED lecture recordings we present results using of out-of-domain features in a hybrid DNN system and explore tandem systems using a variety of input acoustic features. 
Our experiments indicate using the MLAN approach in both hybrid and tandem systems results in consistent reductions in word error rate of 5\–10\% relative.}, url = {http://homepages.inf.ed.ac.uk/s0566164/mlan_icassp2013.pdf}, author = {Bell, Peter and P Swietojanski and Renals, Steve} } @conference {201, title = {Multilingual training of deep neural networks}, booktitle = {Proc. ICASSP}, year = {2013}, month = {May}, abstract = {We investigate multilingual modeling in the context of a deep neural network (DNN) \– hidden Markov model (HMM) hybrid, where the DNN outputs are used as the HMM state likelihoods. By viewing neural networks as a cascade of feature extractors followed by a logistic regression classifier, we hypothesise that the hidden layers, which act as feature extractors, will be transferable between languages. As a corollary, we propose that training the hidden layers on multiple languages makes them more suitable for such cross-lingual transfer. We experimentally confirm these hypotheses on the GlobalPhone corpus using seven languages from three different language families: Germanic, Romance, and Slavic. The experiments demonstrate substantial improvements over a monolingual DNN-HMM hybrid baseline, and hint at avenues of further exploration.}, url = {http://homepages.inf.ed.ac.uk/aghoshal/pubs/icassp13-multiling.pdf}, author = {Ghoshal, Arnab and P Swietojanski and Renals, Steve} } @conference {211, title = {Noise adaptive training for subspace Gaussian mixture models}, booktitle = {INTERSPEECH}, year = {2013}, month = {August}, abstract = {Noise adaptive training (NAT) is an effective approach to normalise environmental distortions when training a speech recogniser on noise-corrupted speech. This paper investigates the model-based NAT scheme using joint uncertainty decoding (JUD) for subspace Gaussian mixture models (SGMMs). A typical SGMM acoustic model has much larger number of surface Gaussian components, which makes it computationally infeasible to compensate each Gaussian explicitly. JUD tackles this problem by sharing the compensation parameters among the Gaussians and hence reduces the computational and memory demands. For noise adaptive training, JUD is reformulated into a generative model, which leads to an efficient expectation-maximisation (EM) based algorithm to update the SGMM acoustic model parameters. We evaluated the SGMMs with NAT on the Aurora 4 database, and obtained higher recognition accuracy compared to systems without adaptive training. Index Terms: adaptive training, noise robustness, joint uncertainty decoding, subspace Gaussian mixture models.}, url = {http://www.cstr.ed.ac.uk/downloads/publications/2013/lu2013noise.pdf}, author = {Lu, Liang and Ghoshal, Arnab and Renals, Steve} } @conference {254, title = {PARAPHRASTIC LANGUAGE MODELS AND COMBINATION WITH NEURAL NETWORK LANGUAGE MODELS}, booktitle = {IEEE ICASSP2013}, year = {2013}, month = {26/05/2013}, publisher = {IEEE ICASSP2013}, organization = {IEEE ICASSP2013}, address = {Vancouver, Canada}, abstract = {In natural languages multiple word sequences can represent the same underlying meaning. Only modelling the observed surface word sequence can result in poor context coverage, for example, when using n-gram language models (LM). To handle this issue, paraphrastic LMs were proposed in previous research and successfully applied to a US English conversational telephone speech transcription task.
In order to exploit the complementary characteristics of paraphrastic LMs and neural network LMs (NNLM), the combination between the two is investigated in this paper. To investigate paraphrastic LMs\’ generalization ability to other languages, experiments are conducted on a Mandarin Chinese broadcast speech transcription task. Using a paraphrastic multi-level LM modelling both word and phrase sequences, significant error rate reductions of 0.9\% absolute (9\% relative) and 0.5\% absolute (5\% relative) were obtained over the baseline n-gram and NNLM systems respectively, after a combination with word and phrase level NNLMs.}, keywords = {language model, paraphrase, Speech recognition}, author = {Xunying Liu and Mark Gales and Phil Woodland} } @conference {Swietojanski:ICASSP13, title = {Revisiting Hybrid and GMM-HMM system combination techniques}, booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, year = {2013}, abstract = {In this paper we investigate techniques to combine hybrid HMM-DNN (hidden Markov model \– deep neural network) and tandem HMM-GMM (hidden Markov model \– Gaussian mixture model) acoustic models using: (1) model averaging, and (2) lattice combination with Minimum Bayes Risk decoding. We have performed experiments on the {\textquoteleft}{\textquoteleft}TED Talks{\textquoteright}{\textquoteright} task following the protocol of the IWSLT-2012 evaluation. Our experimental results suggest that DNN-based and GMM-based acoustic models are complementary, with error rates being reduced by up to 8\% relative when the DNN and GMM systems are combined at model-level in a multi-pass automatic speech recognition (ASR) system. Additionally, further gains were obtained by combining model-averaged lattices with the one obtained from baseline systems.}, url = {http://homepages.inf.ed.ac.uk/s1136550/data/Swietojanski_ICASSP2013.pdf}, author = {P Swietojanski and Ghoshal, Arnab and Renals, Steve} } @conference {212, title = {Sequence-discriminative training of deep neural networks}, booktitle = {INTERSPEECH}, year = {2013}, month = {August}, abstract = {Sequence-discriminative training of deep neural networks (DNNs) is investigated on a 300 hour American English conversational telephone speech task. Different sequence-discriminative criteria \– maximum mutual information (MMI), minimum phone error (MPE), state-level minimum Bayes risk (sMBR), and boosted MMI \– are compared. Two different heuristics are investigated to improve the performance of the DNNs trained using sequence-based criteria \– lattices are re-generated after the first iteration of training; and, for MMI and BMMI, the frames where the numerator and denominator hypotheses are disjoint are removed from the gradient computation. Starting from a competitive DNN baseline trained using cross-entropy, different sequence-discriminative criteria are shown to lower word error rates by 8-9\% relative, on average. Little difference is noticed between the different sequence-based criteria that are investigated.
The experiments are done using the open-source Kaldi toolkit, which makes it possible for the wider community to reproduce these results.}, url = {http://www.cstr.ed.ac.uk/downloads/publications/2013/is13-dnn_seq.pdf}, author = {Karel Vesel{\'y} and Ghoshal, Arnab and Luk{\'a}{\v s} Burget and Daniel Povey} } @conference {Fox13, title = {The Sheffield Wargames Corpus}, booktitle = {Proceedings of Interspeech 2013}, year = {2013}, month = {August}, address = {Lyon, France}, url = {http://staffwww.dcs.shef.ac.uk/people/C.Fox/fox_is13_wargames.pdf}, author = {Fox, C.W. and Liu, Y. and Zwyssig, E. and T. Hain} } @conference {Veaux_SPLAT13, title = {Towards Personalized Synthesized Voices for Individuals with Vocal Disabilities: Voice Banking and Reconstruction}, booktitle = {SLPAT 2013, 4th Workshop on Speech and Language Processing for Assistive Technologies}, year = {2013}, pages = {107{\textendash}111}, url = {http://www.slpat.org/slpat2013/W13/W13-3917.pdf}, author = {Christophe Veaux and Junichi Yamagishi and King, Simon} } @conference {bell13_iwslt_en, title = {The {UEDIN} English {ASR} System for the {IWSLT} 2013 Evaluation}, booktitle = {Proc. IWSLT}, year = {2013}, address = {Heidelberg, Germany}, author = {Bell, P. and McInnes, F. and Gangireddy, S. and Sinclair, M. and Birch, A. and S. Renals} } @conference {Valentini-Botinhao_SSW8, title = {Using neighbourhood density and selective {SNR} boosting to increase the intelligibility of synthetic speech in noise}, booktitle = {8th ISCA Workshop on Speech Synthesis}, year = {2013}, month = {August}, pages = {133{\textendash}138}, address = {Barcelona, Spain}, abstract = {Motivated by the fact that words are not equally confusable, we explore the idea of using word-level intelligibility predictions to selectively boost the harder-to-understand words in a sentence, aiming to improve overall intelligibility in the presence of noise. First, the intelligibility of a set of words from dense and sparse phonetic neighbourhoods was evaluated in isolation. The resulting intelligibility scores were used to inform two sentence-level experiments. In the first experiment the signal-to-noise ratio of one word was boosted to the detriment of another word. Sentence intelligibility did not generally improve. The intelligibility of words in isolation and in a sentence were found to be significantly different, both in clean and in noisy conditions. For the second experiment, one word was selectively boosted while slightly attenuating all other words in the sentence. This strategy was successful for words that were poorly recognised in that particular context. However, a reliable predictor of word-in-context intelligibility remains elusive, since this involves -- as our results indicate -- semantic, syntactic and acoustic information about the word and the sentence.}, url = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Cassia_SSW13.pdf}, author = {Valentini-Botinhao, Cassia and Wester, Mirjam and Yamagishi, Junichi and King, Simon} } @conference {christensen_is12, title = {A comparative study of adaptive, automatic recognition of disordered speech}, booktitle = {Proc. Interspeech 2012}, year = {2012}, month = {Sep}, address = {Portland, Oregon, US}, url = {http://staffwww.dcs.shef.ac.uk/people/H.Christensen/pubs/christensen_is12.pdf}, author = {H. Christensen and S. Cunningham and Charles Fox and P. Green and T. Hain} } @conference {Povey:ICASSP12a, title = {Generating exact lattices in the WFST framework}, booktitle = {Proc.
IEEE ICASSP}, year = {2012}, pages = {4213{\textendash}4216}, abstract = {We describe a lattice generation method that is exact, i.e. it satisfies all the natural properties we would want from a lattice of alternative transcriptions of an utterance. This method does not introduce substantial overhead above one-best decoding. Our method is most directly applicable when using WFST decoders where the WFST is {\textquoteleft}{\textquoteleft}fully expanded{\textquoteright}{\textquoteright}, i.e. where the arcs correspond to HMM transitions. It outputs lattices that include HMM-state-level alignments as well as word labels. The general idea is to create a state-level lattice during decoding, and to do a special form of determinization that retains only the best-scoring path for each word sequence. This special determinization algorithm is a solution to the following problem: Given a WFST A, compute a WFST B that, for each input-symbol-sequence of A, contains just the lowest-cost path through A.}, url = {http://homepages.inf.ed.ac.uk/aghoshal/pubs/icassp12-lattices.pdf}, author = {Daniel Povey and Mirko Hannemann and Gilles Boulianne and Lukas Burget and Ghoshal, Arnab and Milos Janda and Martin Karafiat and Stefan Kombrink and Petr Motlicek and Yanmin Qian and Korbinian Riedhammer and Karel Vesely and Ngoc Thang Vu} } @conference {151, title = {Joint Uncertainty Decoding with Unscented Transform for Noise Robust Subspace Gaussian Mixture Models}, booktitle = {Proc. SAPA-SCALE Conference}, year = {2012}, month = {September}, address = {Portland, OR}, abstract = {Common noise compensation techniques use vector Taylor series (VTS) to approximate the mismatch function. Recent work shows that the approximation accuracy may be improved by sampling. One such sampling technique is the unscented transform (UT), which draws samples deterministically from the clean speech and noise models to derive the noise corrupted speech parameters. This paper applies UT to noise compensation of the subspace Gaussian mixture model (SGMM). Since UT requires a relatively small number of samples for accurate estimation, it has significantly lower computational cost compared to other random sampling techniques. However, the number of surface Gaussians in an SGMM is typically very large, making the direct application of UT, for compensating individual Gaussian components, computationally impractical. In this paper, we avoid the computational burden by employing UT in the framework of joint uncertainty decoding (JUD), which groups all the Gaussian components into a small number of classes, sharing the compensation parameters by class. We evaluate the JUD-UT technique for an SGMM system using the Aurora 4 corpus. Experimental results indicate that UT can lead to increased accuracy compared to VTS approximation if the JUD phase factor is untuned, and to similar accuracy if the phase factor is tuned empirically.}, url = {http://www.sapaworkshops.org/2012/papers/sapa2012-116.pdf}, author = {Lu, Liang and Ghoshal, Arnab and Renals, Steve} } @conference {Lu:ICASSP12, title = {Maximum a posteriori adaptation of subspace Gaussian mixture models for cross-lingual speech recognition}, booktitle = {Proc. IEEE ICASSP}, year = {2012}, pages = {4877{\textendash}4880}, abstract = {This paper concerns cross-lingual acoustic modeling in the case when there are limited target language resources.
We build on an approach in which a subspace Gaussian mixture model (SGMM) is adapted to the target language by reusing the globally shared parameters estimated from out-of-language training data. In current cross-lingual systems, these parameters are fixed when training the target system, which can give rise to a mismatch between the source and target systems. We investigate a maximum a posteriori (MAP) adaptation approach to alleviate the potential mismatch. In particular, we focus on the adaptation of phonetic subspace parameters using a matrix variate Gaussian prior distribution. Experiments on the GlobalPhone corpus using the MAP adaptation approach result in word error rate reductions, compared with the cross-lingual baseline systems and systems updated using maximum likelihood, for training conditions with 1 hour and 5 hours of target language data.}, url = {http://homepages.inf.ed.ac.uk/aghoshal/pubs/icassp12-sgmm_map_xling.pdf}, author = {Lu, Liang and Ghoshal, Arnab and Renals, Steve} } @conference {152, title = {Noise Compensation for Subspace Gaussian Mixture Models}, booktitle = {Proc. Interspeech}, year = {2012}, month = {September}, address = {Portland, OR}, abstract = {Joint uncertainty decoding (JUD) is an effective model-based noise compensation technique for conventional Gaussian mixture model (GMM) based speech recognition systems. In this paper, we apply JUD to subspace Gaussian mixture model (SGMM) based acoustic models. The total number of Gaussians in the SGMM acoustic model is usually much larger than for conventional GMMs, which limits the application of approaches which explicitly compensate each Gaussian, such as vector Taylor series (VTS). However, by clustering the Gaussian components into a number of regression classes, JUD-based noise compensation can be successfully applied to SGMM systems. We evaluate the JUD/SGMM technique using the Aurora 4 corpus, and the experimental results indicated that it is more accurate than conventional GMM-based systems using either VTS or JUD noise compensation.}, url = {http://homepages.inf.ed.ac.uk/aghoshal/pubs/lu-is12-sgmm_jud.pdf}, author = {Lu, Liang and K. K. Chin and Ghoshal, Arnab and Renals, Steve} } @conference {255, title = {Paraphrastic Language Models}, booktitle = {ISCA Interspeech2012}, year = {2012}, month = {09/09/2012}, publisher = {ISCA Interspeech2012}, organization = {ISCA Interspeech2012}, address = {Portland, Oregon}, abstract = {In natural languages multiple word sequences can represent the same underlying meaning. Only modelling the observed surface word sequence can result in poor context coverage, for example, when using n-gram language models (LM). To handle this issue, this paper presents a novel form of language model, the paraphrastic LM. A phrase level transduction model that is statistically learned from standard text data is used to generate paraphrase variants. LM probabilities are then estimated by maximizing their marginal probability. Significant error rate reductions of 0.5\%-0.6\% absolute were obtained on a state-of-the-art conversational telephone speech recognition task using a paraphrastic multi-level LM modelling both word and phrase sequences.}, keywords = {language model, paraphrase, Speech recognition}, author = {Xunying Liu and Mark Gales and Phil Woodland} } @conference {Riedhammer:ICASSP12, title = {Revisiting semi-continuous hidden Markov models}, booktitle = {Proc.
IEEE ICASSP}, year = {2012}, pages = {4271{\textendash}4274}, abstract = {In the past decade, semi-continuous hidden Markov models (SC-HMMs) have not attracted much attention in the speech recognition community. Growing amounts of training data and increasing sophistication of model estimation led to the impression that continuous HMMs are the best choice of acoustic model. However, recent work on recognition of under-resourced languages faces the same old problem of estimating a large number of parameters from limited amounts of transcribed speech. This has led to a renewed interest in methods of reducing the number of parameters while maintaining or extending the modeling capabilities of continuous models. In this work, we compare classic and multiple-codebook semi-continuous models using diagonal and full covariance matrices with continuous HMMs and subspace Gaussian mixture models. Experiments on the RM and WSJ corpora show that while a classical semi-continuous system does not perform as well as a continuous one, multiple-codebook semi-continuous systems can perform better, particularly when using full-covariance Gaussians.}, url = {http://homepages.inf.ed.ac.uk/aghoshal/pubs/icassp12-semi_cont.pdf}, author = {Korbinian Riedhammer and Tobias Bocklet and Ghoshal, Arnab and Daniel Povey} } @conference {Christensen_iast2012, title = {SPECS - an embedded platform, speech-driven environmental control system evaluated in a virtuous circle framework}, booktitle = {Proc. Workshop on Innovation and Applications in Speech Technology}, year = {2012}, url = {http://www.dcs.shef.ac.uk/~heidi/pubs/iast-abstract.pdf}, author = {H. Christensen and S. Siddharth and P. O{\textquoteright}Neill and Z. Clarke and S. Judge and S. Cunningham and M. Hawley} } @article {YamagishiVoiceBank20121, title = {Speech synthesis technologies for individuals with vocal disabilities: Voice banking and reconstruction}, journal = {Acoustical Science and Technology}, volume = {33}, number = {1}, year = {2012}, pages = {1{\textendash}5}, url = {https://www.jstage.jst.go.jp/article/ast/33/1/33_1_1/_pdf}, author = {Junichi Yamagishi and Christophe Veaux and King, Simon and Renals, Steve} } @conference {FOX-LAC2012, title = {Studio report: Linux audio for multi-speaker natural speech technology}, booktitle = {Proc. Linux Audio Conference}, year = {2012}, url = {http://staffwww.dcs.shef.ac.uk/people/C.Fox/fox_lac2012.pdf}, author = {Charles Fox and H. Christensen and T. Hain} } @conference {bell12_mlan, title = {Transcription of multi-genre media archives using out-of-domain data}, booktitle = {Proc. IEEE Workshop on Spoken Language Technology}, year = {2012}, address = {Miami, Florida, USA}, abstract = {We describe our work on developing a speech recognition system for multi-genre media archives. The high diversity of the data makes this a challenging recognition task, which may benefit from systems trained on a combination of in-domain and out-of-domain data. Working with tandem HMMs, we present Multi-level Adaptive Networks (MLAN), a novel technique for incorporating information from out-of-domain posterior features using deep neural networks. We show that it provides a substantial reduction in WER over other systems, with relative WER reductions of 15\% over a PLP baseline, 9\% over in-domain tandem features and 8\% over the best out-of-domain tandem features.}, url = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/mlan_slt2012.pdf}, author = {Bell, P. and Gales, M. and Lanchantin, P. and Liu, X. and Long, Y.
and Renals, Steve and P Swietojanski and Woodland, P.} } @conference {163, title = {Unsupervised Cross-lingual Knowledge Transfer for DNN-based LVCSR}, booktitle = {Proceedings of the IEEE Workshop on Spoken Language Technology}, year = {2012}, month = {December}, abstract = {We investigate the use of cross-lingual acoustic data to initialise deep neural network (DNN) acoustic models by means of unsupervised restricted Boltzmann machine (RBM) pre-training. DNNs for German are pretrained using one or all of German, Portuguese, Spanish and Swedish. The DNNs are used in a tandem configuration, where the network outputs are used as features for a hidden Markov model (HMM) whose emission densities are modeled by Gaussian mixture models (GMMs), as well as in a hybrid configuration, where the network outputs are used as the HMM state likelihoods. The experiments show that unsupervised pretraining is more crucial for the hybrid setups, particularly with limited amounts of transcribed training data. More importantly, unsupervised pretraining is shown to be language-independent.}, url = {http://homepages.inf.ed.ac.uk/aghoshal/pubs/slt12-xling_dnn.pdf}, author = {P Swietojanski and Ghoshal, Arnab and Renals, Steve} } @conference {150, title = {Using Bayesian Networks to find relevant context features for HMM-based speech synthesis}, booktitle = {Proc. Interspeech 2012}, year = {2012}, month = {09/2012}, publisher = {Interspeech 2012}, organization = {Interspeech 2012}, address = {Portland, Oregon, US}, keywords = {Bayesian Networks, context information, HMM-based speech synthesis}, url = {http://homepages.inf.ed.ac.uk/hlu2/pdfs/HengLuSimonKing.pdf}, author = {Lu, Heng and S. King} } @conference {Lu2011, title = {Regularized Subspace Gaussian Mixture Models for Cross-lingual Speech Recognition}, booktitle = {Proc. IEEE ASRU}, year = {2011}, abstract = {We investigate cross-lingual acoustic modelling for low resource languages using the subspace Gaussian mixture model (SGMM). We assume the presence of acoustic models trained on multiple source languages, and use the global subspace parameters from those models for improved modelling in a target language with limited amounts of transcribed speech. Experiments on the GlobalPhone corpus using Spanish, Portuguese, and Swedish as source languages and German as target language (with 1 hour and 5 hours of transcribed audio) show that multilingually trained SGMM shared parameters result in lower word error rates (WERs) than using those from a single source language. We also show that regularizing the estimation of the SGMM state vectors by penalizing their l1-norm helps to overcome numerical instabilities and leads to lower WER.}, doi = {http://dx.doi.org/10.1109/ASRU.2011.6163959}, url = {http://www.cstr.ed.ac.uk/downloads/publications/2011/lu-asru-2011.pdf}, author = {Lu, Liang and Ghoshal, Arnab and Renals, Steve} } @conference {lu2016segmental, title = {Segmental Recurrent Neural Networks for End-to-end Speech Recognition}, booktitle = {Proc. INTERSPEECH}, author = {Lu, Liang and Kong, Lingpeng and Dyer, Chris and Smith, Noah A and Renals, Steve} } @conference {liang2016small, title = {Small-footprint Deep Neural Networks with Highway Connections for Speech Recognition}, booktitle = {Proc. INTERSPEECH}, author = {Lu, Liang and Renals, Steve} } @conference {lu2016training, title = {On training the recurrent neural network encoder-decoder for large vocabulary end-to-end speech recognition}, booktitle = {Proc.
ICASSP}, publisher = {IEEE}, organization = {IEEE}, author = {Lu, Liang and Zhang, Xingxing and Renals, Steve} }