@article {saz16, title = {Acoustic Adaptation to Dynamic Background Conditions with Asynchronous Transformations}, journal = {Computer Speech and Language}, year = {2016}, doi = {10.1016/j.csl.2016.06.008}, author = {Oscar Saz and Thomas Hain} } @conference {380, title = {Automatic Genre and Show Identification of Broadcast Media}, booktitle = {Proceedings of the 17th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2016}, address = {San Francisco, California, USA}, author = {Mortaza Doulaty and Oscar Saz and Raymond W. M. Ng and Thomas Hain} } @conference {DeenaHDSH16, title = {Combining Feature and Model-Based Adaptation of RNNLMs for Multi-Genre Broadcast Speech Recognition}, booktitle = {Proceedings of the 17th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2016}, address = {San Francisco, California, USA}, author = {Salil Deena and Madina Hasan and Mortaza Doulaty and Oscar Saz and Thomas Hain} } @conference {ng_is16, title = {Combining weak tokenisers for phonotactic language recognition in a resource-constrained setting}, booktitle = {Proceedings of the 17th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2016}, address = {San Francisco, CA}, author = {Raymond W. M. Ng and Bhusan Chettri and Thomas Hain} } @conference {merritt2016hybrid, title = {Deep neural network-guided unit selection synthesis}, booktitle = {Proc. 
ICASSP}, year = {2016}, abstract = {Vocoding of speech is a standard part of statistical parametric speech synthesis systems. It imposes an upper bound of the naturalness that can possibly be achieved. Hybrid systems using parametric models to guide the selection of natural speech units can combine the benefits of robust statistical models with the high level of naturalness of waveform concatenation. Existing hybrid systems use Hidden Markov Models (HMMs) as the statistical model. This paper demonstrates that the superiority of Deep Neural Network (DNN) acoustic models over HMMs in conventional statistical parametric speech synthesis also carries over to hybrid synthesis. We compare various DNN and HMM hybrid configurations, guiding the selection of waveform units in either the vocoder parameter domain, or in the domain of embeddings (bottleneck features).}, url = {http://datashare.is.ed.ac.uk/bitstream/handle/10283/1889/Merritt_ICASSP2016.pdf}, author = {Merritt, Thomas and Clark, Robert A J and Wu, Zhizheng and Yamagishi, Junichi and King, Simon} } @article {7499870, title = {Differentiable Pooling for Unsupervised Acoustic Model Adaptation}, journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing}, volume = {PP}, number = {99}, year = {2016}, pages = {1-1}, issn = {2329-9290}, doi = {10.1109/TASLP.2016.2584700}, author = {P. Swietojanski and S. Renals} } @conference {Zhang:2015cd, title = {DNN Speaker Adaptation using Parameterised Sigmoid and ReLU Hidden Activation Functions}, booktitle = {Proc. ICASSP{\textquoteright}16}, year = {2016}, address = {Shanghai, China}, url = {http://mi.eng.cam.ac.uk/~cz277/doc/Conference-ICASSP2016-ADAPT.pdf}, author = {Zhang, C. 
and Woodland, P.C.} } @conference {milner_is16, title = {DNN-based speaker clustering for speaker diarisation}, booktitle = {Proceedings of the 17th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2016}, address = {San Francisco, CA}, author = {Rosanna Milner and Thomas Hain} } @conference {olcoz_is16, title = {Error correction in lightly supervised alignment of broadcast subtitles}, booktitle = {Proceedings of the 17th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2016}, address = {San Francisco, CA}, author = {Julia Olcoz and Oscar Saz and Thomas Hain} } @conference {369, title = {Evaluating comprehension of natural and synthetic conversational speech}, booktitle = {Speech Prosody}, year = {2016}, month = {June}, publisher = {ISCA}, organization = {ISCA}, address = {Boston, MA}, abstract = {Current speech synthesis methods typically operate on isolated sentences and lack convincing prosody when generating longer segments of speech. Similarly, prevailing TTS evaluation paradigms, such as intelligibility (transcription word error rate) or MOS, only score sentences in isolation, even though overall comprehension is arguably more important for speech-based communication. In an effort to develop more ecologically-relevant evaluation techniques that go beyond isolated sentences, we investigated comprehension of natural and synthetic speech dialogues. Specifically, we tested listener comprehension on long segments of spontaneous and engaging conversational speech (three 10-minute radio interviews of comedians). Interviews were reproduced either as natural speech, synthesised from carefully prepared transcripts, or synthesised using durations from forced-alignment against the natural speech, all in a balanced design. Comprehension was measured using multiple choice questions. 
A significant difference was measured between the comprehension/retention of natural speech (74\% correct responses) and synthetic speech with forced-aligned durations (61\% correct responses). However, no significant difference was observed between natural and regular synthetic speech (70\% correct responses). Effective evaluation of comprehension remains elusive.}, keywords = {comprehension, conversational speech, evaluation, statistical parametric speech synthesis}, url = {http://www.isca-speech.org/archive/sp2016/pdfs_stamped/41.pdf}, author = {Wester, Mirjam and Watts, Oliver and Henter, Gustav Eje} } @conference {nicolao_lrec2016, title = {A framework for collecting realistic recordings of dysarthric speech - the homeService corpus}, booktitle = {The International Conference on Language Resources and Evaluation - LREC 2016}, year = {2016}, month = {05/2016}, publisher = {ELRA}, organization = {ELRA}, address = {Portoro{\v z}, SLO}, author = {Nicolao, Mauro and Christensen, Heidi and Cunningham, Stuart and Green, Phil and Hain, Thomas} } @conference {watts2016hmms, title = {From {HMM}s to {DNN}s: where do the improvements come from?}, booktitle = {Proc. ICASSP}, volume = {41}, year = {2016}, month = {March}, publisher = {IEEE}, organization = {IEEE}, address = {Shanghai, China}, abstract = {Deep neural networks (DNNs) have recently been the focus of much text-to-speech research as a replacement for decision trees and hidden Markov models (HMMs) in statistical parametric synthesis systems. Performance improvements have been reported; however, the configuration of systems evaluated makes it impossible to judge how much of the improvement is due to the new machine learning methods, and how much is due to other novel aspects of the systems. 
Specifically, whereas the decision trees in HMM-based systems typically operate at the state-level, and separate trees are used to handle separate acoustic streams, most DNN-based systems are trained to make predictions simultaneously for all streams at the level of the acoustic frame. This paper isolates the influence of three factors (machine learning method; state vs. frame predictions; separate vs. combined stream predictions) by building a continuum of systems along which only a single factor is varied at a time. We find that replacing decision trees with DNNs and moving from state-level to frame-level predictions both significantly improve listeners\&$\#$39; naturalness ratings of synthetic speech produced by the systems. No improvement is found to result from switching from separate-stream to combined-stream predictions.}, keywords = {decision tree, deep neural network, hidden Markov model, speech synthesis}, url = {http://homepages.inf.ed.ac.uk/ghenter/pubs/watts2016hmms.pdf}, author = {Watts, Oliver and Henter, Gustav Eje and Merritt, Thomas and Wu, Zhizheng and King, Simon} } @conference {ng_icassp16, title = {Groupwise learning for ASR k-best list reranking in spoken language translation}, booktitle = {Proceedings of the 2016 International Conference on Acoustic, Speech and Signal Processing (ICASSP)}, year = {2016}, address = {Shanghai, China}, author = {Raymond W. M. Ng and Kashif Shah and Lucia Specia and Thomas Hain} } @conference {371, title = {A hierarchical predictor of synthetic speech naturalness using neural networks}, booktitle = {Proc. Interspeech}, year = {2016}, month = {September}, publisher = {ISCA}, organization = {ISCA}, address = {San Francisco, CA}, abstract = {A problem when developing and tuning speech synthesis systems is that there is no well-established method of automatically rating the quality of the synthetic speech. 
This research attempts to obtain a new automated measure which is trained on the result of large-scale subjective evaluations employing many human listeners, i.e., the Blizzard Challenge. To exploit the data, we experiment with linear regression, feed-forward and convolutional neural network models, and combinations of them to regress from synthetic speech to the perceptual scores obtained from listeners. The biggest improvements were seen when combining stimulus- and system-level predictions.}, keywords = {Blizzard Challenge, naturalness, neural network, speech synthesis}, author = {Yoshimura, Takenori and Henter, Gustav Eje and Watts, Oliver and Wester, Mirjam and Yamagishi, Junichi and Tokuda, Keiichi} } @conference {wl519:2016ab, title = {Improved DNN-based Segmentation for Multi-genre Broadcast Audio}, booktitle = {Proc. ICASSP{\textquoteright}16}, year = {2016}, address = {Shanghai, China}, url = {http://mi.eng.cam.ac.uk/~cz277/doc/Conference-ICASSP2016-DNNSEG.pdf}, author = {Wang, L. and Zhang, C. and Woodland, P.C. and Gales, M.J.F. and Karanasou, P. and Lanchantin, P. and Liu, X. and Qian, Y.} } @conference {casanueva_is16, title = {Improving generalisation to new speakers in spoken dialogue state tracking}, booktitle = {Proceedings of the 17th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2016}, address = {San Francisco, CA}, author = {I{\~n}igo Casanueva and Thomas Hain and Phil Green} } @conference {qian16Integrated, title = {Integrated adaptation with multi-factor joint-learning for far-field speech recognition}, booktitle = {Proc. ICASSP{\textquoteright}16}, year = {2016}, address = {Shanghai, China}, author = {Yanmin Qian and Tian Tan and Dong Yu and Yu Zhang} } @conference {qian16An, title = {An investigation into using parallel data for far-field speech recognition}, booktitle = {Proc. 
ICASSP{\textquoteright}16}, year = {2016}, address = {Shanghai, China}, author = {Yanmin Qian and Tian Tan and Dong Yu} } @article {7462247, title = {Learning Hidden Unit Contributions for Unsupervised Acoustic Model Adaptation}, journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing}, volume = {24}, number = {8}, year = {2016}, month = {Aug}, pages = {1450--1463}, keywords = {acoustic signal processing, Acoustics, Adaptation, Adaptation models, AMI meetings, Aurora4, consistent word error rate reductions, deep neural networks (DNNs), DNN acoustic model, factorisation, feature extraction, Hidden Markov models, learning hidden unit contributions, learning hidden unit contributions (lHUC), LHUC, neural nets, neural network acoustic models, Neural networks, SAT, speaker adaptive training framework, speaker recognition, speaker-dependent manner, speaker-independent manner, speech recognition benchmarks, Switchboard, TED talks, Training, Transforms, unsupervised acoustic model adaptation, unsupervised learning}, issn = {2329-9290}, doi = {10.1109/TASLP.2016.2560534}, author = {P. Swietojanski and J. Li and S. Renals} } @conference {qian15multi, title = {Multi-task joint-learning of deep neural networks for robust speech recognition}, booktitle = {Proc. ASRU{\textquoteright}15}, year = {2016}, address = {Scottsdale, Arizona, USA}, author = {Yanmin Qian and Maofan Yin and Yongbin You and Kai Yu} } @conference {henter2016robust, title = {Robust {TTS} duration modelling using {DNN}s}, booktitle = {Proc. ICASSP}, volume = {41}, year = {2016}, month = {March}, publisher = {IEEE}, organization = {IEEE}, address = {Shanghai, China}, abstract = {Accurate modelling and prediction of speech-sound durations is an important component in generating more natural synthetic speech. Deep neural networks (DNNs) offer a powerful modelling paradigm, and large, found corpora of natural and expressive speech are easy to acquire for training them. 
Unfortunately, found datasets are seldom subject to the quality-control that traditional synthesis methods expect. Common issues likely to affect duration modelling include transcription errors, reductions, filled pauses, and forced-alignment inaccuracies. To combat this, we propose to improve modelling and prediction of speech durations using methods from robust statistics, which are able to disregard ill-fitting points in the training material. We describe a robust fitting criterion based on the density power divergence (the beta-divergence) and a robust generation heuristic using mixture density networks (MDNs). Perceptual tests indicate that subjects prefer synthetic speech generated using robust models of duration over the baselines.}, keywords = {duration modelling, robust statistics, speech synthesis}, url = {http://homepages.inf.ed.ac.uk/ghenter/pubs/henter2016robust.pdf}, author = {Henter, Gustav Eje and Ronanki, Srikanth and Watts, Oliver and Wester, Mirjam and Wu, Zhizheng and King, Simon} } @conference {Swietojanski_ICASSP2016, title = {SAT-LHUC: Speaker Adaptive Training for Learning Hidden Unit Contributions}, booktitle = {Proc. IEEE ICASSP}, year = {2016}, month = {March}, address = {Shanghai, China}, abstract = {This paper extends learning hidden unit contributions (LHUC) unsupervised speaker adaptation with speaker adaptive training (SAT). Contrary to other SAT approaches, the proposed technique does not require speaker-dependent features, the generation of auxiliary generative models to estimate or extract speaker-dependent information, or any changes to the speaker-independent model structure. SAT-LHUC is directly integrated into the objective and jointly learns speaker-independent and speaker-dependent representations. We demonstrate that the SAT-LHUC technique can match feature-space regression transforms for matched narrow-band data and outperform it on wide-band data when the runtime distribution differs significantly from training one. 
We have obtained 6.5\%, 10\% and 18.5\% relative word error rate reductions compared to speaker-independent models on Switchboard, AMI meetings and TED lectures, respectively. This corresponds to relative gains of 2\%, 4\% and 6\% compared with non-SAT LHUC adaptation. SAT-LHUC was also found to be complementary to SAT with feature-space maximum likelihood linear regression transforms.}, url = {http://homepages.inf.ed.ac.uk/s1136550/data/Swietojanski_ICASSP2016.pdf}, author = {Swietojanski, P. and Renals, S.} } @conference {lu2016segmental, title = {Segmental Recurrent Neural Networks for End-to-end Speech Recognition}, booktitle = {Proc. INTERSPEECH}, year = {2016}, author = {Lu, Liang and Kong, Lingpeng and Dyer, Chris and Smith, Noah A and Renals, Steve} } @conference {milner_icassp16, title = {Segment-oriented evaluation of speaker diarisation performance}, booktitle = {Proceedings of the 2016 International Conference on Acoustic, Speech and Signal Processing (ICASSP)}, year = {2016}, address = {Shanghai, China}, author = {Rosanna Milner and Thomas Hain} } @conference {392, title = {Selection of Multi-genre Broadcast Data for the Training of Automatic Speech Recognition Systems}, booktitle = {Proc. ICASSP}, year = {2016}, author = {P. Lanchantin and M.J.F. Gales and P. Karanasou and X. Liu and Y. Qian and L. Wang and P.C. Woodland and C. Zhang} } @conference {ng_odyssey16, title = {{Sheffield {LRE} 2015 System Description}}, booktitle = {{Odyssey: The Speaker and Language Recognition Workshop (Submitted)}}, year = {2016}, month = {June}, author = {Raymond W. M. 
Ng and Mauro Nicolao and Oscar Saz and Madina Hasan and Bhusan Chettri and Mortaza Doulaty and Tan Lee and Thomas Hain} } @conference {Liu2016, title = {The Sheffield Wargame Corpus - Day Two and Day Three}, booktitle = {Proceedings of Interspeech 2016}, year = {2016}, month = {September}, address = {San Francisco, USA}, abstract = {Improving the performance of distant speech recognition is of considerable current interest, driven by a desire to bring speech recognition into people{\textquoteright}s homes. Standard approaches to this task aim to enhance the signal prior to recognition, typically using beamforming techniques on multiple channels. Only few real-world recordings are available that allow experimentation with such techniques. This has become even more pertinent with recent works with deep neural networks aiming to learn beamforming from data. Such approaches require large multi-channel training sets, ideally with location annotation for moving speakers, which is scarce in existing corpora. This paper presents a freely available and new extended corpus of English speech recordings in a natural setting, with moving speakers. The data is recorded with diverse microphone arrays, and uniquely, with ground truth location tracking. It extends the 8.0 hour Sheffield Wargames Corpus released in Interspeech 2013, with a further 16.6 hours of fully annotated data, including 6.1 hours of female speech to improve gender bias. Additional blog-based language model data is provided alongside, as well as a Kaldi baseline system. Results are reported with a standard Kaldi configuration, and a baseline meeting recognition system. 
Index Terms: distant speech recognition, multi-channel speech recognition, natural speech corpora, deep neural network.}, pdf = {http://staffwww.dcs.shef.ac.uk/people/Y.Liu/publications/pdf/Liu2016.pdf}, author = {Yulan Liu and Charles Fox and Madina Hasan and Thomas Hain} } @conference {liang2016small, title = {Small-footprint Deep Neural Networks with Highway Connections for Speech Recognition}, booktitle = {Proc. INTERSPEECH}, year = {2016}, author = {Lu, Liang and Renals, Steve} } @conference {CassiaIS16, title = {Speech Enhancement for a Noise-Robust Text-to-Speech Synthesis System using Deep Recurrent Neural Networks}, booktitle = {Proc. Interspeech}, year = {2016}, month = {Sep.}, address = {San Francisco, US}, author = {Valentini-Botinhao, C. and Wang, X. and Takaki, S. and Yamagishi, J.} } @conference {jy308:2016ab, title = {System Combination with Log-linear Models}, booktitle = {Proc. ICASSP{\textquoteright}16}, year = {2016}, address = {Shanghai, China}, url = {http://mi.eng.cam.ac.uk/~cz277/doc/Conference-ICASSP2016-LONLIN.pdf}, author = {Yang, J. and Zhang, C. and Ragni, A. and Gales, M.J.F. and Woodland, P.C.} } @conference {370, title = {A template-based approach for speech synthesis intonation generation using LSTMs}, booktitle = {Proc. Interspeech}, year = {2016}, month = {September}, publisher = {ISCA}, organization = {ISCA}, address = {San Francisco, CA}, abstract = {The absence of convincing intonation makes current parametric speech synthesis systems sound dull and lifeless, even when trained on expressive speech data. Typically, these systems use regression techniques to predict the fundamental frequency (F0) frame-by-frame. This approach leads to overly-smooth pitch contours and fails to construct an appropriate prosodic structure across the full utterance. 
In order to capture and reproduce larger-scale pitch patterns, this paper proposes a template-based approach for automatic F0 generation, where per-syllable pitch-contour templates (from a small, automatically learned set) are predicted by a recurrent neural network (RNN). The use of syllable templates mitigates the over-smoothing problem and is able to reproduce pitch patterns observed in the data. The use of an RNN, paired with connectionist temporal classification (CTC), enables the prediction of structure in the pitch contour spanning the entire utterance. This novel F0 prediction system is used alongside separate LSTMs for predicting phone durations and the other acoustic features, to construct a complete text-to-speech system. We report the results of objective and subjective tests on an expressive speech corpus of children\&$\#$39;s audiobooks, and include comparisons to a conventional baseline that predicts F0 directly at the frame level.}, keywords = {CTC, F0 templates, intonation modelling, LSTM, speech synthesis}, author = {Ronanki, Srikanth and Henter, Gustav Eje and Wu, Zhizheng and King, Simon} } @conference {dall2016testing, title = {Testing the consistency assumption: Pronunciation variant forced alignment in read and spontaneous speech synthesis}, booktitle = {Proc. ICASSP}, volume = {41}, year = {2016}, month = {March}, publisher = {IEEE}, organization = {IEEE}, address = {Shanghai, China}, abstract = {Forced alignment for speech synthesis traditionally aligns a phoneme sequence predetermined by the front-end text processing system. This sequence is not altered during alignment, i.e., it is forced, despite possibly being faulty. The consistency assumption is the assumption that these mistakes do not degrade models, as long as the mistakes are consistent across training and synthesis. 
We present evidence that in the alignment of both standard read prompts and spontaneous speech this phoneme sequence is often wrong, and that this is likely to have a negative impact on acoustic models. A lattice-based forced alignment system allowing for pronunciation variation is implemented, resulting in improved phoneme identity accuracy for both types of speech. A perceptual evaluation of HMM-based voices showed that spontaneous models trained on this improved alignment also improved standard synthesis, despite breaking the consistency assumption}, keywords = {forced alignment, HMM, speech synthesis, TTS}, url = {http://homepages.inf.ed.ac.uk/ghenter/pubs/dall2016testing.pdf}, author = {Dall, Rasmus and Brognaux, Sandrine and Richmond, Korin and Valentini-Botinhao, Cassia and Henter, Gustav Eje and Hirschberg, Julia and Yamagishi, Junichi and King, Simon} } @conference {lu2016training, title = {On training the recurrent neural network encoder-decoder for large vocabulary end-to-end speech recognition}, booktitle = {Proc. ICASSP}, year = {2016}, publisher = {IEEE}, organization = {IEEE}, author = {Lu, Liang and Zhang, Xingxing and Renals, Steve} } @conference {hain_is16, title = {webASR 2 - Improved cloud based speech technology}, booktitle = {Proceedings of the 17th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2016}, address = {San Francisco, CA}, author = {Thomas Hain and Jeremy Christian and Oscar Saz and Salil Deena and Madina Hasan and Raymond W. M. 
Ng and Rosanna Milner and Mortaza Doulaty and Yulan Liu} } @conference {milner_ASRU2015, title = {{The 2015 Sheffield System for Longitudinal Diarisation of Broadcast Media}}, booktitle = {{Proceedings of the 2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)}}, year = {2015}, address = {Scottsdale, AZ}, author = {Rosanna Milner and Oscar Saz and Salil Deena and Mortaza Doulaty and Raymond Ng and Thomas Hain} } @conference {Saz_ASRU, title = {{The 2015 Sheffield System for Transcription of Multi{\textendash}Genre Broadcast Media}}, booktitle = {{Proceedings of the 2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)}}, year = {2015}, address = {Scottsdale, AZ}, author = {Oscar Saz and Mortaza Doulaty and Salil Deena and Rosanna Milner and Raymond Ng and Madina Hasan and Yulan Liu and Thomas Hain} } @conference {wester:listeners:IS2015, title = {Are we using enough listeners? {No! An empirically-supported critique of Interspeech 2014 TTS evaluations}}, booktitle = {Proc. of Interspeech}, year = {2015}, month = {September}, address = {Dresden}, abstract = {Tallying the numbers of listeners that took part in subjective evaluations of synthetic speech at Interspeech 2014 showed that in more than 60\% of papers conclusions are based on listening tests with less than 20 listeners. Our analysis of Blizzard 2013 data shows that for a MOS test measuring naturalness a stable level of significance is only reached when more than 30 listeners are used. In this paper, we set out a list of guidelines, i.e., a checklist for carrying out meaningful subjective evaluations. 
We further illustrate the importance of sentence coverage and number of listeners by presenting changes to rank order and number of significant pairs by re-analysing data from the Blizzard Challenge 2013.}, url = {http://homepages.inf.ed.ac.uk/mwester/publications/wester:listeners:IS2015.pdf}, author = {Wester, Mirjam and Valentini-Botinhao, Cassia and Henter, Gustav Eje} } @conference {wester:artificial:IS2015, title = {Artificial Personality and Disfluency}, booktitle = {Proc. of Interspeech}, year = {2015}, month = {September}, address = {Dresden}, abstract = {The focus of this paper is artificial voices with different personalities. Previous studies have shown links between an individual{\textquoteright}s use of disfluencies in their speech and their perceived personality. Here, filled pauses (uh and um) and discourse markers (like, you know, I mean) have been included in synthetic speech as a way of creating an artificial voice with different personalities. We discuss the automatic insertion of filled pauses and discourse markers (i.e., fillers) into otherwise fluent texts. The automatic system is compared to a ground truth of human {\textquotedblleft}acted{\textquotedblright} filler insertion. Perceived personality (as defined by the big five personality dimensions) of the synthetic speech is assessed by means of a standardised questionnaire. Synthesis without fillers is compared to synthesis with either spontaneous or synthetic fillers. 
Our findings explore how the inclusion of disfluencies influences the way in which subjects rate the perceived personality of an artificial voice.}, url = {http://homepages.inf.ed.ac.uk/mwester/publications/wester:artificial:IS2015.pdf}, author = {Wester, Mirjam and Aylett, Matthew and Tomalin, Marcus and Dall, Rasmus} } @conference {Merritt2015Attributing, title = {Attributing modelling errors in HMM synthesis by stepping gradually from natural to modelled speech}, booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, year = {2015}, address = {Brisbane}, abstract = {Even the best statistical parametric speech synthesis systems do not achieve the naturalness of good unit selection. We investigated possible causes of this. By constructing speech signals that lie inbetween natural speech and the output from a complete HMM synthesis system, we investigated various effects of modelling. We manipulated the temporal smoothness and the variance of the spectral parameters to create stimuli, then presented these to listeners alongside natural and vocoded speech, as well as output from a full HMM-based text-to-speech system and from an idealised {\textquoteleft}pseudo-HMM\&$\#$39;. All speech signals, except the natural waveform, were created using vocoders employing one of two popular spectral parameterisations: Mel-Cepstra or Mel-Line Spectral Pairs. Listeners made {\textquoteleft}same or different\&$\#$39; pairwise judgements, from which we generated a perceptual map using Multidimensional Scaling. We draw conclusions about which aspects of HMM synthesis are limiting the naturalness of the synthetic speech.}, author = {Merritt, Thomas and Latorre, Javier and King, Simon} } @conference {Woo15, title = {{C}ambridge {U}niversity {T}ranscription {S}ystems for the {M}ulti-{G}enre {B}roadcast {C}hallenge}, booktitle = {Proc. 
of ASRU}, year = {2015}, address = {Scottsdale, USA}, url = {http://mi.eng.cam.ac.uk/~xl207/publications/conferences/ASRU2015-mgbsys-asr.pdf}, author = {P.C. Woodland and X. Liu and Y. Qian and C. Zhang and M.J.F. Gales and P. Karanasou and P. Lanchantin and L. Wang} } @conference {bell15_complementary_task_mt, title = {Complementary tasks for context-dependent deep neural network acoustic models}, booktitle = {Proc. Interspeech}, year = {2015}, url = {http://www.cstr.ed.ac.uk/downloads/publications/2015/bell15_complementary_task_mt.pdf}, author = {Bell, Peter and Renals, Steve} } @conference {doulaty15, title = {Data-selective Transfer Learning for Multi-Domain Speech Recognition}, booktitle = {Proceedings of the 16th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2015}, address = {Dresden, Germany}, url = {http://www.isca-speech.org/archive/interspeech_2015/papers/i15_2897.pdf}, author = {M. Doulaty and Oscar Saz and Thomas Hain} } @article {7169536, title = {{A Deep Generative Architecture for Postfiltering in Statistical Parametric Speech Synthesis}}, journal = {Audio, Speech, and Language Processing, IEEE/ACM Transactions on}, volume = {23}, number = {11}, year = {2015}, pages = {2003-2014}, abstract = {The generated speech of hidden Markov model (HMM)-based statistical parametric speech synthesis still sounds muffled. One cause of this degradation in speech quality may be the loss of fine spectral structures. In this paper, we propose to use a deep generative architecture, a deep neural network (DNN) generatively trained, as a postfilter. The network models the conditional probability of the spectrum of natural speech given that of synthetic speech to compensate for such gap between synthetic and natural speech. The proposed probabilistic postfilter is generatively trained by cascading two restricted Boltzmann machines (RBMs) or deep belief networks (DBNs) with one bidirectional associative memory (BAM). 
We devised two types of DNN postfilters: one operating in the mel-cepstral domain and the other in the higher dimensional spectral domain. We compare these two new data-driven postfilters with other types of postfilters that are currently used in speech synthesis: a fixed mel-cepstral based postfilter, the global variance based parameter generation, and the modulation spectrum-based enhancement. Subjective evaluations using the synthetic voices of a male and female speaker confirmed that the proposed DNN-based postfilter in the spectral domain significantly improved the segmental quality of synthetic speech compared to that with conventional methods.}, keywords = {deep generative architecture, HMM, modulation spectrum, postfilter, segmental quality, speech synthesis}, issn = {2329-9290}, doi = {10.1109/TASLP.2015.2461448}, author = {Chen, Ling-Hui and Raitio, T. and Valentini-Botinhao, C. and Ling, Z. and Yamagishi, J.} } @conference {merritt2015richcontext, title = {Deep neural network context embeddings for model selection in rich-context HMM synthesis}, booktitle = {Proc. Interspeech}, year = {2015}, month = {September}, pages = {2207{\textendash}2211}, address = {Dresden, Germany}, abstract = {This paper introduces a novel form of parametric synthesis that uses context embeddings produced by the bottleneck layer of a deep neural network to guide the selection of models in a rich-context HMM-based synthesiser. Rich-context synthesis {\textendash} in which Gaussian distributions estimated from single linguistic contexts seen in the training data are used for synthesis, rather than more conventional decision tree-tied models {\textendash} was originally proposed to address over-smoothing due to averaging across contexts. Our previous investigations have confirmed experimentally that averaging across different contexts is indeed one of the largest factors contributing to the limited quality of statistical parametric speech synthesis. 
However, a possible weakness of the rich context approach as previously formulated is that a conventional tied model is still used to guide selection of Gaussians at synthesis time. Our proposed approach replaces this with context embeddings derived from a neural network.}, url = {http://www.research.ed.ac.uk/portal/files/19840195/Merritt_Interspeech2015.pdf}, author = {Merritt, Thomas and Yamagishi, Junichi and Wu, Zhizheng and Watts, Oliver and King, Simon} } @conference {WuICASSP15, title = {Deep neural networks employing multi-task learning and stacked bottleneck features for speech synthesis}, booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, year = {2015}, abstract = {Deep neural networks (DNNs) use a cascade of hidden representations to enable the learning of complex mappings from input to output features. They are able to learn the complex mapping from text-based linguistic features to speech acoustic features, and so perform text-to-speech synthesis. Recent results suggest that DNNs can produce more natural synthetic speech than conventional HMM-based statistical parametric systems. In this paper, we show that the hidden representation used within a DNN can be improved through the use of Multi-Task Learning, and that stacking multiple frames of hidden layer activations (stacked bottleneck features) also leads to improvements. 
Experimental results confirmed the effectiveness of the proposed methods, and in listening tests we find that stacked bottleneck features in particular offer a significant improvement over both a baseline DNN and a benchmark HMM system}, url = {http://www.zhizheng.org/papers/icassp2015_dnn_tts.pdf}, author = {Zhizheng Wu and Cassia Valentini-Botinhao and Oliver Watts and Simon King} } @conference {SwietojanskiICASSP15, title = {Differentiable Pooling for Unsupervised Speaker Adaptation}, booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, year = {2015}, abstract = {This paper proposes a differentiable pooling mechanism to perform model-based neural network speaker adaptation. The proposed technique learns a speaker-dependent combination of activations within pools of hidden units, was shown to work well unsupervised, and does not require speaker-adaptive training. We have conducted a set of experiments on the TED talks data, as used in the IWSLT evaluations. Our results indicate that the approach can reduce word error rates (WERs) on standard IWSLT test sets by about 5{\textendash}11\% relative compared to speaker-independent systems and was found complementary to the recently proposed learning hidden units contribution (LHUC) approach, reducing WER by 6{\textendash}13\% relative. Both methods were also found to work well when adapting with small amounts of unsupervised data {\textendash} 10 seconds is able to decrease the WER by 5\% relative compared to the baseline speaker independent system}, url = {http://homepages.inf.ed.ac.uk/s1136550/data/Swietojanski_ICASSP2015.pdf}, author = {Swietojanski, P. and Renals, S.} } @conference {dall:diss2015, title = {Disfluencies in change detection in natural, vocoded and synthetic speech}, booktitle = {Proc. 
of DiSS 2015}, year = {2015}, month = {August}, address = {Edinburgh}, abstract = {In this paper, we investigate the effect of filled pauses, a discourse marker and silent pauses in a change detection experiment in natural, vocoded and synthetic speech. In natural speech change detection has been found to increase in the presence of filled pauses, we extend this work by replicating earlier findings and explore the effect of a discourse marker, like, and silent pauses. Furthermore we report how the use of \"unnatural\" speech, namely synthetic and vocoded, affects change detection rates. It was found that the filled pauses, the discourse marker and silent pauses all increase change detection rates in natural speech, however in neither synthetic nor vocoded speech did this effect appear. Rather, change detection rates decreased in both types of \"unnatural\" speech compared to natural speech. The natural results suggests that while each type of pause increase detection rates, the type of pause may have a further effect. The \"unnatural\" results suggest that it is not the full pipeline of synthetic speech that causes the degradation, but rather that something in the pre-processing, i.e. vocoding, of the speech database limits the resulting synthesis.}, url = {http://homepages.inf.ed.ac.uk/mwester/publications/dall:diss:2015.pdf}, author = {Dall, Rasmus and Wester, Mirjam and Corley, Martin} } @book {Obi15, title = {Exploiting Alternatives for Text-To-Speech Synthesis: From Machine to Human}, series = {Speech Prosody in Speech Synthesis: Modeling and generation of prosody for high quality and flexible speech synthesis}, year = {2015}, month = {February}, pages = {189-202}, publisher = {Springer Verlag}, organization = {Springer Verlag}, chapter = {Control of Prosody in Speech Synthesis}, author = {N. Obin and C. Veaux and P. 
Lanchantin} } @conference {llu_is2015a, title = {Feature-space Speaker Adaptation for Probabilistic Linear Discriminant Analysis Acoustic Models}, booktitle = {Proc. INTERSPEECH}, year = {2015}, abstract = {Probabilistic linear discriminant analysis (PLDA) acoustic models extend Gaussian mixture models by factorizing the acoustic variability using state-dependent and observation-dependent variables. This enables the use of higher dimensional acoustic features, and the capture of intra-frame feature correlations. In this paper, we investigate the estimation of speaker adaptive feature-space (constrained) maximum likelihood linear regression transforms from PLDA-based acoustic models. This feature-space speaker transformation estimation approach is potentially very useful due to the ability of PLDA acoustic models to use different types of acoustic features, for example applying these transforms to deep neural network (DNN) acoustic models for cross adaptation. We evaluated the approach on the Switchboard corpus, and observe significant word error reduction by using both the mel-frequency cepstral coefficients and DNN bottleneck features.}, author = {Lu, Liang and Renals, Steve} } @conference {Zhang:2015ab, title = {A General Artificial Neural Network Extension for {HTK}}, booktitle = {Proc. Interspeech{\textquoteright}15}, year = {2015}, address = {Dresden, Germany}, url = {http://www.isca-speech.org/archive/interspeech_2015/papers/i15_3581.pdf}, author = {Zhang, C. and Woodland, P.C.} } @conference {wester:human:IS2015, title = {Human vs Machine Spoofing Detection on Wideband and Narrowband Data}, booktitle = {Proc. of Interspeech}, year = {2015}, month = {September}, address = {Dresden}, abstract = {How well do humans detect spoofing attacks directed at automatic speaker verification systems? This paper investigates the performance of humans at detecting spoofing attacks from speech synthesis and voice conversion systems. 
Two speaker verification tasks, in which the speakers were either humans or machines, were also conducted. The three tasks were carried out with two types of data: wideband (16kHz) and narrowband (8kHz) telephone line simulated data. Spoofing detection by humans was compared to automatic spoofing detection (ASD) algorithms. Listening tests were carefully constructed to en- sure the human and automatic tasks were as similar as possible taking into consideration listener\’s constraints (e.g., fatigue and memory limitations). Results for human trials show the error rates on narrowband data double compared to on wide- band data. The second verification task, which included only artificial speech, showed equal overall acceptance rates for both 8kHz and 16kHz. In the spoofing detection task, there was a drop in performance on most of the artificial trials as well as on human trials. At 8kHz, 20\% of human trials were incorrectly classified as artificial, compared to 12\% at 16kHz. The ASD algorithms also showed a drop in performance on 8kHz data, but outperformed human listeners across the board.}, url = {http://homepages.inf.ed.ac.uk/mwester/publications/wester:human:IS2015.pdf}, author = {Wester, Mirjam and Wu, Zhizheng and Yamagishi, Junichi} } @conference {chen2015improving, title = {Improving the training and evaluation efficiency of recurrent neural network language models}, booktitle = {Proc. 
ICASSP}, year = {2015}, address = {Brisbane, Australia}, url = {http://mi.eng.cam.ac.uk/~xc257/papers/ICASSP2015-rnnlm-extend.pdf}, author = {Chen, Xie and Liu, Xunying and Gales, Mark and Woodland, Phil} } @article { martinez2015intelligibility, title = {Intelligibility Assessment and Speech Recognizer Word Accuracy Rate Prediction for Dysarthric Speakers in a Factor Analysis Subspace}, journal = {ACM Transactions on Accessible Computing (TACCESS)}, volume = {6}, number = {3}, year = {2015}, pages = {10}, publisher = {ACM}, doi = {10.1145/2746405}, url = {http://dl.acm.org/citation.cfm?id=2746405}, author = {Mart{\'\i}nez, David and Lleida, Eduardo and Green, Phil and Christensen, Heidi and Ortega, Alfonso and Miguel, Antonio} } @conference {Liu2015, title = {An Investigation Into Speaker Informed DNN Front-end for {LVCSR}}, booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, year = {2015}, month = {April}, abstract = {Deep Neural Network (DNN) has become a standard method in many ASR tasks. Recently there is considerable interest in \“informed training\” of DNNs, where DNN input is augmented with auxiliary codes, such as i-vectors, speaker codes, speaker separation bottleneck (SSBN) features, etc. This paper compares different speaker informed DNN training methods in LVCSR task. We discuss mathematical equivalence between speaker informed DNN training and \“bias adaptation\” which uses speaker dependent biases, and give detailed analysis on influential factors such as dimension, discrimination and stability of auxiliary codes. The analysis is supported by experiments on a meeting recognition task using bottleneck feature based system. Results show that i-vector based adaptation is also effective in bottleneck feature based system (not just hybrid systems). However all tested methods show poor generalisation to unseen speakers. 
We introduce a system based on speaker classification followed by speaker adaptation of biases, which yields equivalent performance to an i-vector based system with 10.4\% relative improvement over baseline on seen speakers. The new approach can serve as a fast alternative especially for short utterances. \ }, keywords = {bias adaptation, deep neural network, speaker adaptation, speaker informed training, Speech recognition}, url = {http://staffwww.dcs.shef.ac.uk/people/Y.Liu/publications/pdf/Liu2015.pdf}, author = {Yulan Liu and Penny Karanasou and Thomas Hain} } @conference {Karanasou_IS2015, title = {I-Vector Estimation Using Informative Priors for Adaptation of Deep Neural Networks}, booktitle = {Proc. of Interspeech}, year = {2015}, author = {Penny Karanasou and Mark Gales and Philip Woodland} } @conference {MGB, title = {Latent Dirichlet Allocation Based Organisation of Broadcast Media Archives for Deep Neural Network Adaptation}, booktitle = {Proc. of ASRU}, year = {2015}, address = {Arizona, USA}, author = {M. Doulaty and Oscar Saz and Raymond W. M. Ng and Thomas Hain} } @conference {tomalin:diss:2015, title = {A Lattice-based Approach to Automatic Filled Pause Insertion}, booktitle = {Proc. of DiSS 2015}, year = {2015}, month = {August}, address = {Edinburgh}, abstract = {This paper describes a novel method for automatically inserting filled pauses (e.g., UM) into fluent texts. Although filled pauses are known to serve a wide range of psychological and structural functions in conversational speech, they have not traditionally been modelled overtly by state-of-the-art speech synthesis systems. However, several recent systems have started to model disfluencies specifically, and so there is an increasing need to create disfluent speech synthesis input by automatically inserting filled pauses into otherwise fluent text. 
The approach presented here interpolates Ngrams and Full-Output Recurrent Neural Network Language Models (f-RNNLMs) in a lattice-rescoring framework. It is shown that the interpolated system outperforms separate Ngram and f-RNNLM systems, where performance is analysed using the Precision, Recall, and F-score metrics.}, url = {http://homepages.inf.ed.ac.uk/mwester/publications/tomalin:diss:2015.pdf}, author = {Tomalin, Marcus and Wester, Mirjam and Dall, Rasmus and Byrne, Bill and King, Simon} } @conference {Hu_ICASSP2015, title = {Methods for applying dynamic sinusoidal models to statistical parametric speech synthesis}, booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, year = {2015}, author = {Qiong Hu and Yannis Stylianou and Ranniery Maia and Korin Richmond and Junichi Yamagishi} } @conference {Bell_ASRU, title = {{The MGB Challenge: Evaluating Multi-genre Broadcast Media Recognition}}, booktitle = {{Proceedings of the 2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)}}, year = {2015}, address = {Scottsdale, AZ}, url = {http://homepages.inf.ed.ac.uk/pbell1/bell15_mgb_challenge.pdf}, author = {Peter Bell and Mark Gales and Thomas Hain and Jonathan Kilgour and Pierre Lanchantin and Andrew Liu and Andrew McParland and Steve Renals and Oscar Saz and Mirjam Wester and Phil Woodland} } @conference {UriaICASSP15, title = {{Modelling acoustic feature dependencies with artificial neural networks: Trajectory-RNADE}}, booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, year = {2015}, abstract = {Given a transcription, sampling from a good model of acoustic feature trajectories should result in plausible realizations of an utterance. However, samples from current probabilistic speech synthesis systems result in low quality synthetic speech. Henter et al. 
have demonstrated the need to capture the dependencies between acoustic features conditioned on the phonetic labels in order to obtain high quality synthetic speech. These dependencies are often ignored in neural network based acoustic models. We tackle this deficiency by introducing a probabilistic neural network model of acoustic trajectories, trajectory RNADE, able to capture these dependencies.}, url = {http://www.benignouria.com/en/research/rnade_synthesis/draft.pdf}, author = {Benigno Uria and Iain Murray and Steve Renals and Cassia Valentini-Botinhao} } @conference {301, title = {MULTI-BASIS ADAPTIVE NEURAL NETWORK FOR RAPID ADAPTATION IN SPEECH RECOGNITION}, booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on}, year = {2015}, publisher = {IEEE}, organization = {IEEE}, url = {https://www.wiki.ed.ac.uk/download/attachments/152778802/mbann-rev15.pdf?version=1\&modificationDate=1412870033000\&api=v2}, author = {Wu, Chunyang and Gales, Mark} } @conference {llu_icassp15, title = {Multi-frame factorisation for long-span acoustic modelling}, booktitle = {Proc. ICASSP}, year = {2015}, abstract = {Acoustic models based on Gaussian mixture models (GMMs) typically use short span acoustic feature inputs. This does not capture long-term temporal information from speech owing to the conditional independence assumption of hidden Markov models. In this paper, we present an implicit approach that approximates the joint distribution of long span features by product of factorized models, in contrast to deep neural networks (DNNs) that model feature correlations directly. The approach is applicable to a broad range of acoustic models. 
We present experiments using GMM and probabilistic linear discriminant analysis (PLDA) based models on Switchboard, observing consistent word error rate reductions.}, author = {Lu, Liang and Renals, Steve} } @conference {ali15_multi_wer_asr, title = {Multi-reference {WER} for evaluating {ASR} for languages with no orthographic rules}, booktitle = {Proc. ASRU}, year = {2015}, author = {Ali, Ahmed and Magdy, Walid and Bell, Peter and Renals, Steve} } @conference {qian15multi, title = {Multi-task joint-learning of deep neural networks for robust speech recognition}, booktitle = {Proc. ASRU{\textquoteright}15}, year = {2015}, address = {Scottsdale, Arizona, USA}, author = {Yanmin Qian and Maofan Yin and Yongbin You and Kai Yu} } @conference {Zhang:2015ef, title = {Parameterised Sigmoid and {R}e{LU} Hidden Activation Functions for {DNN} Acoustic Modelling}, booktitle = {Proc. Interspeech{\textquoteright}15}, year = {2015}, address = {Dresden, Germany}, url = {http://mi.eng.cam.ac.uk/~cz277/doc/Conference-Interspeech2015-ACT.pdf}, author = {Zhang, C. and Woodland, P.C.} } @conference {andrewliu:2015:icassp, title = {Paraphrastic Recurrent Neural Network Language Models}, booktitle = {Proc. ICASSP}, year = {2015}, address = {Brisbane, Australia}, url = {http://mi.eng.cam.ac.uk/~xc257/papers/ICASSP2015-prnnlm.pdf}, author = {Liu, Xunying and Chen, Xie and Gales, Mark and Woodland, Phil} } @proceedings {329, title = {Reconstructing Voices Within the Multiple-Average-Voice-Model Framework}, journal = {Interspeech}, year = {2015}, url = {http://www.isca-speech.org/archive/interspeech_2015/i15_2232.html}, author = {P. Lanchantin and C. Veaux and M. J. F. Gales and S. King and J. Yamagishi} } @conference {chen2015recurrent, title = {Recurrent neural network language model training with noise contrastive estimation for speech recognition}, booktitle = {Proc.
ICASSP}, year = {2015}, address = {Brisbane, Australia}, url = {http://mi.eng.cam.ac.uk/~xc257/papers/ICASSP2015-rnnlm-nce.pdf}, author = {Chen, Xie and Liu, Xunying and Gales, Mark and Woodland, Phil} } @conference {bell15_cd_multitask, title = {Regularization of context-dependent deep neural networks with context-independent multi-task training}, booktitle = {Proc. ICASSP}, year = {2015}, url = {http://www.cstr.ed.ac.uk/downloads/publications/2015/bell15_cd_multitask.pdf}, author = {Bell, Peter and Renals, Steve} } @article {Obi15b, title = {{S}ymbolic {M}odelling of {S}peech {P}rosody: {F}rom {L}inguistics to {S}tatistics}, journal = {IEEE Transactions on Audio, Speech, and Language Processing}, volume = {23}, number = {3}, year = {2015}, month = {March}, author = {N. Obin and P. Lanchantin} } @conference {288, title = {SAS: A speaker verification spoofing database containing diverse attacks}, booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, year = {2015}, author = {Zhizheng Wu and Ali Khodabakhsh and Cenk Demiroglu and Junichi Yamagishi and Daisuke Saito and Tomoki Toda and Simon King} } @proceedings {KaranasouASRU2015, title = {Speaker Diarisation and Longitudinal Linking in Multi-Genre Broadcast Data}, journal = {Proc. of ASRU}, year = {2015}, url = {http://mi.eng.cam.ac.uk/\%7Exl207/publications/conferences/ASRU2015-mgbsys-diarisation.pdf}, author = {P. Karanasou and M. J. F. Gales and P. Lanchantin and X. Liu and Y. Qian and L. Wang and P. C. Woodland and C.
Zhang} } @conference {Christensen:2015uv, title = {{Speech-Enabled Environmental Control in an AAL setting for people with Speech Disorders: a Case Study}}, booktitle = {IET International Conference on Technologies for Active and Assisted Living}, year = {2015}, address = {London, UK}, url = {http://staffwww.dcs.shef.ac.uk/people/M.Nicolao/site/publications_files/speech-enabled-environmental.pdf}, author = {Christensen, Heidi and Nicolao, Mauro and Cunningham, Stuart and Deena, Salil and Green, Phil and Hain, Thomas} } @conference {Swietojanski2015, title = {Structured Output Layer with Auxiliary Targets for Context-Dependent Acoustic Modelling}, booktitle = {Proc. Interspeech}, year = {2015}, month = {September}, address = {Dresden, Germany}, abstract = {In previous work we have introduced a multi-task training technique for neural network acoustic modelling, in which context-dependent and context-independent targets are jointly learned. In this paper, we extend the approach by structuring the out-put layer such that the context-dependent outputs are dependent on the context-independent outputs, thus using the context-independent predictions at run-time. We have also investigated the applicability of this idea to unsupervised speaker adaptation as an approach to overcome the data sparsity issues that comes to the fore when estimating systems with a large number of context-dependent states, when data is limited. We have experimented with various amounts of training material (from 10 to 300 hours) and find the proposed techniques are particularly well suited to data-constrained conditions allowing to better utilise large context-dependent state-clustered trees. Experimental results are reported for large vocabulary speech recognition using the Switchboard and TED corpora.}, url = {http://homepages.inf.ed.ac.uk/s1136550/data/Swietojanski_Interspeech2015.pdf}, author = {Swietojanski, P. and Bell, P. 
and Renals, S.} } @conference {llu_is2015b, title = {A Study of the Recurrent Neural Network Encoder-Decoder for Large Vocabulary Speech Recognition}, booktitle = {Proc. INTERSPEECH}, year = {2015}, abstract = {Deep neural networks have advanced the state-of-the-art in automatic speech recognition, when combined with hidden Markov models (HMMs). Recently there has been interest in using systems based on recurrent neural networks (RNNs) to perform sequence modelling directly, without the requirement of an HMM superstructure. In this paper, we study the RNN encoder-decoder approach for large vocabulary end-to-end speech recognition, whereby an encoder transforms a sequence of acoustic vectors into a sequence of feature representations, from which a decoder recovers a sequence of words. We investigated this approach on the Switchboard corpus using a training set of around 300 hours of transcribed audio data. Without the use of an explicit language model or pronunciation lexicon, we achieved promising recognition accuracy, demonstrating that this approach warrants further investigation.}, author = {Lu, Liang and Zhang, Xingxing and Cho, KyungHyun and Renals, Steve} } @conference {bell15_alignment, title = {A system for automatic alignment of broadcast media captions using weighted finite-state transducers}, booktitle = {Proc. ASRU}, year = {2015}, url = {http://homepages.inf.ed.ac.uk/pbell1/bell15_alignment.pdf}, author = {Bell, Peter and Renals, Steve} } @conference {bell15_news_summarisation, title = {A system for automatic broadcast news summarisation, geolocation and translation}, booktitle = {Proc. Interspeech (demo session)}, year = {2015}, address = {Dresden, Germany}, abstract = {An increasing amount of news content is produced in audio-video form every day. To effectively analyse and monitoring this multilingual data stream, we require methods to extract and present audio content in accessible ways. 
In this paper, we describe an end-to-end system for processing and browsing audio news data. This fully automated system brings together our recent research on audio scene analysis, speech recognition, summarisation, named entity detection, geolocation, and machine translation. The graphical interface allows users to visualise the distribution of news content by entity names and story location. Browsing of news events is facilitated through extractive summaries and the ability to view transcripts in multiple languages.}, url = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/bell15_news_summarisation.pdf}, author = {Bell, Peter and Lai, Catherine and Llewellyn, Clare and Birch, Alexandra and Sinclair, Mark} } @conference {Lan15c, title = {{T}he {D}evelopment of the {C}ambridge {U}niversity {A}lignment {S}ystems for the {M}ulti-{G}enre {B}roadcast {C}hallenge}, booktitle = {Proc. of ASRU}, year = {2015}, address = {Scottsdale, USA}, url = {http://mi.eng.cam.ac.uk/~xl207/publications/conferences/ASRU2015-mgbsys-align.pdf}, author = {P. Lanchantin and P. Karanasou and M.J.F. Gales and X. Liu and L. Wang and Y. Qian and P. C. Woodland and C. Zhang} } @conference {Wester:diss:2015, title = {The Temporal Delay Hypothesis: Natural, Vocoded and Synthetic Speech}, booktitle = {Proc. of DiSS 2015}, year = {2015}, month = {August}, address = {Edinburgh}, abstract = {Including disfluencies in synthetic speech is being explored as a way of making synthetic speech sound more natural and conversational. How to measure whether the resulting speech is actually more natural, however, is not straightforward. Conventional approaches to synthetic speech evaluation fall short as a listener is either primed to prefer stimuli with filled pauses or, when they aren\&$\#$39;t primed they prefer more fluent speech. Psycholinguistic reaction time experiments may circumvent this issue. In this paper, we revisit one such reaction time experiment. 
For natural speech, delays in word onset were found to facilitate word recognition regardless of the type of delay; be they a filled pause (um), silence or a tone. We expand these experiments by examining the effect of using vocoded and synthetic speech. Our results partially replicate previous findings. For natural and vocoded speech, if the delay is a silent pause, significant increases in the speed of word recognition are found. If the delay comprises a filled pause there is a significant increase in reaction time for vocoded speech but not for natural speech. For synthetic speech, no clear effects of delay on word recognition are found. We hypothesise this is because it takes longer (requires more cognitive resources) to process synthetic speech than natural or vocoded speech.}, url = {http://homepages.inf.ed.ac.uk/mwester/publications/wester:diss:2015.pdf}, author = {Wester, Mirjam and Corley, Martin and Dall, Rasmus} } @conference {cervone15_reported_speech_prosody, title = {Towards automatic detection of reported speech in dialogue using prosodic cues}, booktitle = {Proc. Interspeech}, year = {2015}, month = {sep}, address = {Dresden, Germany}, abstract = {The phenomenon of reported speech \– whereby we quote the words, thoughts and opinions of others, or recount past dialogue \– is widespread in conversational speech. Detecting such quotations automatically has numerous applications: for example, in enhancing automatic transcription or spoken language understanding applications. However, the task is challenging, not least because lexical cues of quotations are frequently ambiguous or not present in spoken language. The aim of this paper is to identify potential prosodic cues of reported speech which could be used, along with the lexical ones, to automatically detect quotations and ascribe them to their rightful source, that is reconstructing their Attribution Relations. 
In order to do so we analyze SARC, a small corpus of telephone conversations that we have annotated with Attribution Relations. The results of the statistical analysis performed on the data show how variations in pitch, intensity, and timing features can be exploited as cues of quotations. Furthermore, we build a SVM classifier which integrates lexical and prosodic cues to automatically detect quotations in speech that performs significantly better than chance.}, url = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/cervone15_reported_speech_prosody.pdf}, author = {Cervone, Alessandra and Lai, Catherine and Pareti, Silvia and Bell, Peter} } @conference {dnncost_IS15, title = {{Towards minimum perceptual error training for DNN-based speech synthesis}}, booktitle = {Proc. Interspeech}, year = {2015}, month = {Sept.}, address = {Dresden, Germany}, abstract = {We propose to use a perceptually-oriented domain to improve the quality of text-to-speech generated by deep neural networks (DNNs). We train a DNN that predicts the parameters required for speech reconstruction but whose cost function is calculated in another domain. In this paper, to represent this perceptual domain we extract an approximated version of the Spectro-Temporal Excitation Pattern that was originally proposed as part of a model of hearing speech in noise. We train DNNs that predict band aperiodicity, fundamental frequency and Mel cepstral coefficients and compare generated speech when the spectral cost function is defined in the Mel cepstral, warped log spectrum or perceptual domains. Objective results indicate that the perceptual domain system achieves the highest quality.}, url = {http://www.isca-speech.org/archive/interspeech_2015/papers/i15_0869.pdf}, author = {Valentini-Botinhao, C. and Wu, Z. 
and King, S.} } @conference {doulaty15b, title = {Unsupervised Domain Discovery using Latent Dirichlet Allocation for Acoustic Modelling in Speech Recognition}, booktitle = {Proceedings of the 16th Annual Conference of the International Speech Communication Association (Interspeech)}, year = {2015}, address = {Dresden, Germany}, url = {http://www.isca-speech.org/archive/interspeech_2015/papers/i15_3640.pdf}, author = {M. Doulaty and Oscar Saz and Thomas Hain} } @conference {karanasou_is14, title = {Adaptation of Deep Neural Network Acoustic Models Using Factorised I-vectors}, booktitle = {Proceedings of Interspeech{\textquoteright}14}, year = {2014}, author = {Penny Karanasou and Yongqiang Wang and Mark Gales and Phil Woodland} } @conference {casanueva_is14, title = {Adaptive speech recognition and dialogue management for users with speech disorders}, booktitle = {Proceedings of Interspeech{\textquoteright}14}, year = {2014}, author = {I. Casanueva and H. Christensen and T. Hain and P. Green} } @conference {270, title = {Automatic Selection of Speakers for Improved Acoustic Modelling : Recognition of Disordered Speech with Sparse Data}, booktitle = {Spoken Language Technology Workshop, SLT{\textquoteright}14}, year = {2014}, address = {Lake Tahoe}, author = {H. Christensen and I. Casanueva and S. Cunningham and P. Green and T. Hain} } @conference {Saz14b, title = {Background-Tracking Acoustic Features for Genre Identification of Broadcast Shows}, booktitle = {Proceedings of the 2014 Spoken Language Technology (SLT) Workshop}, year = {2014}, pages = {118{\textendash}123}, address = {South Lake Tahoe NV, USA}, url = {http://staffwww.dcs.shef.ac.uk/people/O.Saztorralba/data/2014\%20-\%20oskarsaz\%20-\%20SLT.pdf}, author = {Oscar Saz and M.
Doulaty and Thomas Hain} } @article {Swietojanski:SPL14, title = {Convolutional Neural Networks for Distant Speech Recognition}, journal = {Signal Processing Letters, IEEE}, volume = {21}, number = {9}, year = {2014}, month = {September}, pages = {1120{\textendash}1124}, abstract = {We investigate convolutional neural networks (CNNs) for large vocabulary distant speech recognition, trained using speech recorded from a single distant microphone (SDM) and multiple distant microphones (MDM). In the MDM case we explore a beamformed signal input representation compared with the direct use of multiple acoustic channels as a parallel input to the CNN. We have explored different weight sharing approaches, and propose a channel-wise convolution with two-way pooling. Our experiments, using the AMI meeting corpus, found that CNNs improve the word error rate (WER) by 6.5\% relative compared to conventional deep neural network (DNN) models and 15.7\% over a discriminatively trained Gaussian mixture model (GMM) baseline. For cross-channel CNN training, the WER improves by 3.5\% relative over the comparable DNN structure. Compared with the best beamformed GMM system, cross-channel convolution reduces the WER by 9.7\% relative, and matches the accuracy of a beamformed DNN.}, issn = {1070-9908}, doi = {10.1109/LSP.2014.2325781}, author = {Swietojanski, P. and Ghoshal, A. and Renals, S.} } @conference {bell14_xling_mlan, title = {Cross-lingual adaptation with multi-task adaptive networks}, booktitle = {Proc.
Interspeech}, year = {2014}, abstract = {Posterior-based or bottleneck features derived from neural networks trained on out-of-domain data may be successfully applied to improve speech recognition performance when data is scarce for the target domain or language. In this paper we combine this approach with the use of a hierarchical deep neural network (DNN) network structure {\textendash} which we term a multi-level adaptive network (MLAN) {\textendash} and the use of multitask learning. We have applied the technique to cross-lingual speech recognition experiments on recordings of TED talks and European Parliament sessions in English (source language) and German (target language). We demonstrate that the proposed method can lead to improvements over standard methods, even when the quantity of training data for the target language is relatively high. When the complete method is applied, we achieve relative WER reductions of around 13\% compared to a monolingual hybrid DNN baseline.}, url = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/bell_xling_mlan_is2014.pdf}, author = {Bell, Peter and Driesen, Joris and Renals, Steve} } @article {lu2013cross, title = {Cross-lingual subspace {Gaussian} mixture model for low-resource speech recognition}, journal = {IEEE Transactions on Audio, Speech and Language Processing}, year = {2014}, abstract = {This paper studies cross-lingual acoustic modelling in the context of subspace Gaussian mixture models (SGMMs). SGMMs factorize the acoustic model parameters into a set that is globally shared between all the states of a hidden Markov model (HMM) and another that is specific to the HMM states. We demonstrate that the SGMM global parameters are transferable between languages, particularly when the parameters are trained multilingually. 
As a result, acoustic models may be trained using limited amounts of transcribed audio by borrowing the SGMM global parameters from one or more source languages, and only training the state-specific parameters on the target language audio. Model regularization using $\ell_1$-norm penalty is shown to be particularly effective at avoiding overtraining and leading to lower word error rates. We investigate maximum a posteriori (MAP) adaptation of subspace parameters in order to reduce the mismatch between the SGMM global parameters of the source and target languages. In addition, monolingual and cross-lingual speaker adaptive training is used to reduce the model variance introduced by speakers. We have systematically evaluated these techniques by experiments on the GlobalPhone corpus.}, url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6600925}, author = {Lu, Liang and Ghoshal, Arnab and Renals, Steve} } @conference {dall_IS14, title = {The Effect of Filled Pauses and Speaking Rate on Speech Comprehension in Natural, Vocoded and Synthetic Speech}, booktitle = {Proceedings of Interspeech}, year = {2014}, url = {http://homepages.inf.ed.ac.uk/mwester/publications/Dall_Wester_Corley_IS14.pdf}, author = {Rasmus Dall and Mirjam Wester and Martin Corley} } @conference {chen2014efficient, title = {Efficient {GPU}-based training of recurrent neural network language models using spliced sentence bunch}, booktitle = {Proc. 
Interspeech}, year = {2014}, address = {Singapore}, url = {http://mi.eng.cam.ac.uk/~xc257/papers/RNNLMTrain_Interspeech2014.pdf}, author = {Chen, Xie and Wang, Yongqiang and Liu, Xunying and Gales, Mark and Woodland, Phil} } @conference {251, title = {Efficient Lattice Rescoring Using Recurrent Neural Network Language Models}, booktitle = {IEEE ICASSP2014}, year = {2014}, month = may, publisher = {IEEE ICASSP2014}, organization = {IEEE ICASSP2014}, address = {Florence, Italy}, abstract = {Recurrent neural network language models (RNNLM) have become an increasingly popular choice for state-of-the-art speech recognition systems due to their inherently strong generalization performance. As these models use a vector representation of complete history contexts, RNNLMs are normally used to rescore N-best lists. Motivated by their intrinsic characteristics, two novel lattice rescoring methods for RNNLMs are investigated in this paper. The first uses an $n$-gram style clustering of history contexts. The second approach directly exploits the distance measure between hidden history vectors. Both methods produced 1-best performance comparable with a 10k-best rescoring baseline RNNLM system on a large vocabulary conversational telephone speech recognition task. Significant lattice size compression of over 70\% and consistent improvements after confusion network (CN) decoding were also obtained over the N-best rescoring approach.}, keywords = {language model, recurrent neural network, Speech recognition}, author = {Xunying Liu and Yongqiang Wang and Xie Chen and Mark Gales and Phil Woodland} } @conference {aylett2014, title = {A Flexible Front-End for HTS}, booktitle = {Proc. Interspeech}, year = {2014}, address = {Singapore}, abstract = {Parametric speech synthesis techniques depend on full context acoustic models generated by language front-ends, which analyse linguistic and phonetic structure. 
HTS, the leading parametric synthesis system, can use a number of different front-ends to generate full context models for synthesis and training. In this paper we explore the use of a new text processing front-end that has been added to the speech recognition toolkit Kaldi as part of an ongoing project to produce a new parametric speech synthesis system, Idlak. The use of XML specification files, a modular design, and modern coding and testing approaches, make the Idlak front-end ideal for adding, altering and experimenting with the contexts used in full context acoustic models. The Idlak front-end was evaluated against the standard Festival front-end in the HTS system. Results from the Idlak front-end compare well with the more mature Festival front-end (Idlak - 2.83 MOS vs Festival - 2.85 MOS), although a slight reduction in naturalness perceived by non-native English speakers can be attributed to Festival{\textquoteright}s insertion of non-punctuated pauses.}, url = {http://www.isca-speech.org/archive/archive_papers/interspeech_2014/i14_1283.pdf}, author = {Aylett, Matthew P. and Dall, Rasmus and Ghoshal, Arnab and Henter, Gustav Eje and Merritt, Thomas} } @conference {Dall_Tomalin_IS14, title = {Investigating Automatic \& Human Filled Pause Insertion for Speech Synthesis}, booktitle = {Proceedings of Interspeech}, year = {2014}, url = {http://homepages.inf.ed.ac.uk/mwester/publications/Dall_Tomalin_Wester.pdf}, author = {Rasmus Dall and Marcus Tomalin and Mirjam Wester and William Byrne and Simon King} } @conference {merritt2014investigating, title = {Investigating source and filter contributions, and their interaction, to statistical parametric speech synthesis}, booktitle = {Proc. Interspeech}, year = {2014}, month = sep, pages = {1509{\textendash}1513}, address = {Singapore}, abstract = {This paper presents an investigation of the separate perceptual degradations introduced by the modelling of source and filter features in statistical parametric speech synthesis. 
This is achieved using stimuli in which various permutations of natural, vocoded and modelled source and filter are combined, optionally with the addition of filter modifications (e.g. global variance or modulation spectrum scaling). We also examine the assumption of independence between source and filter parameters. Two complementary perceptual testing paradigms are adopted. In the first, we ask listeners to perform {\textquotedblleft}same or different quality{\textquotedblright} judgements between pairs of stimuli from different configurations. In the second, we ask listeners to give an opinion score for individual stimuli. Combining the findings from these tests, we draw some conclusions regarding the relative contributions of source and filter to the currently rather limited naturalness of statistical parametric synthetic speech, and test whether current independence assumptions are justified.}, url = {http://homepages.inf.ed.ac.uk/s1270339/Publications/merritt2014investigating.pdf}, author = {Merritt, Thomas and Raitio, Tuomo and King, Simon} } @conference {Swietojanski2014_lhuc, title = {Learning Hidden Unit Contributions for Unsupervised Speaker Adaptation of Neural Network Acoustic Models}, booktitle = {Proc. IEEE Workshop on Spoken Language Technology}, year = {2014}, month = dec, address = {Lake Tahoe, USA}, abstract = {This paper proposes a simple yet effective model-based neural network speaker adaptation technique that learns speaker-specific hidden unit contributions given adaptation data, without requiring any form of speaker-adaptive training, or labelled adaptation data. An additional amplitude parameter is defined for each hidden unit; the amplitude parameters are tied for each speaker, and are learned using unsupervised adaptation. We conducted experiments on the TED talks data, as used in the International Workshop on Spoken Language Translation (IWSLT) evaluations. 
Our results indicate that the approach can reduce word error rates on standard IWSLT test sets by about 8{\textendash}15\% relative compared to unadapted systems, with a further reduction of 4{\textendash}6\% relative when combined with feature-space maximum likelihood linear regression (fMLLR). The approach can be employed in most existing feed-forward neural network architectures, and we report results using various hidden unit activation functions: sigmoid, maxout, and rectifying linear units (ReLU).}, author = {Swietojanski, P and Renals, S} } @conference {279, title = {Measuring the perceptual effects of modelling assumptions in speech synthesis using stimuli constructed from repeated natural speech}, booktitle = {Proceedings of Interspeech}, year = {2014}, month = sep, address = {Singapore}, abstract = {Acoustic models used for statistical parametric speech synthesis typically incorporate many modelling assumptions. It is an open question to what extent these assumptions limit the naturalness of synthesised speech. To investigate this question, we recorded a speech corpus where each prompt was read aloud multiple times. By combining speech parameter trajectories extracted from different repetitions, we were able to quantify the perceptual effects of certain commonly used modelling assumptions. Subjective listening tests show that taking the source and filter parameters to be conditionally independent, or using diagonal covariance matrices, significantly limits the naturalness that can be achieved. 
Our experimental results also demonstrate the shortcomings of mean-based parameter generation.}, keywords = {acoustic modelling, diagonal covariance matrices, repeated speech, speech synthesis, stream independence}, url = {http://mi.eng.cam.ac.uk/~sms46/papers/henter2014measuring.pdf}, author = {Henter, Gustav Eje and Merritt, Thomas and Shannon, Matt and Mayo, Catherine and King, Simon} } @conference {Lan14, title = {Multiple-Average-Voice-based Speech Synthesis}, booktitle = {Proc. ICASSP}, year = {2014}, author = {P. Lanchantin and M. J. F. Gales and S. King and J. Yamagishi} } @conference {Renals:HSCMA14, title = {Neural Networks for Distant Speech Recognition}, booktitle = {The 4th Joint Workshop on Hands-free Speech Communication and Microphone Arrays (HSCMA)}, year = {2014}, abstract = {Distant conversational speech recognition is challenging owing to the presence of multiple, overlapping talkers, additional non-speech acoustic sources, and the effects of reverberation. In this paper we review work on distant speech recognition, with an emphasis on approaches which combine multichannel signal processing with acoustic modelling, and investigate the use of hybrid neural network / hidden Markov model acoustic models for distant speech recognition of meetings recorded using microphone arrays. In particular we investigate the use of convolutional and fully-connected neural networks with different activation functions (sigmoid, rectified linear, and maxout). We performed experiments on the AMI and ICSI meeting corpora, with results indicating that neural network models are capable of significant improvements in accuracy compared with discriminatively trained Gaussian mixture models.}, author = {Renals, S. 
and Swietojanski, P.} } @article {liu2014paraphrastic, title = {Paraphrastic language models}, journal = {Computer Speech \& Language}, volume = {28}, number = {6}, year = {2014}, pages = {1298{\textendash}1316}, publisher = {Elsevier}, url = {http://mi.eng.cam.ac.uk/~xl207/publications/journals/csl14-parlm-preprn.pdf}, author = {Liu, Xunying and Gales, Mark and Woodland, Phil} } @conference {252, title = {Paraphrastic Neural Network Language Models}, booktitle = {IEEE ICASSP2014}, year = {2014}, month = may, publisher = {IEEE ICASSP2014}, organization = {IEEE ICASSP2014}, address = {Florence, Italy}, abstract = {Expressive richness in natural languages presents a significant challenge for statistical language models (LM). As multiple word sequences can represent the same underlying meaning, only modelling the observed surface word sequence can lead to poor context coverage. To handle this issue, paraphrastic LMs were previously proposed to improve the generalization of back-off n-gram LMs. Paraphrastic neural network LMs (NNLM) are investigated in this paper. 
Using a paraphrastic multi-level feedforward NNLM modelling both word and phrase sequences, significant error rate reductions of 1.3\% absolute (8\% relative) and 0.9\% absolute (5.5\% relative) were obtained over the baseline n-gram and NNLM systems respectively on a state-of-the-art conversational telephone speech recognition system trained on 2000 hours of audio and 545 million words of texts.}, keywords = {neural network language model, paraphrase, Speech recognition}, author = {Xunying Liu and Mark Gales and Phil Woodland} } @article {lu2014a, title = {Probabilistic Linear Discriminant Analysis for Acoustic Modelling}, journal = {IEEE Signal Processing Letters}, volume = {21}, number = {6}, year = {2014}, pages = {702--706}, abstract = {In this letter, we propose a new acoustic modelling approach for automatic speech recognition based on probabilistic linear discriminant analysis (PLDA), which is used to model the state density function for the standard hidden Markov models (HMMs). Unlike the conventional Gaussian mixture models (GMMs) where the correlations are weakly modelled by using the diagonal covariance matrices, PLDA captures the correlations of feature vector in subspaces without vastly expanding the model. It also allows the usage of high dimensional feature input, and therefore is more flexible to make use of different type of acoustic features. We performed the preliminary experiments on the Switchboard corpus, and demonstrated the feasibility of this acoustic model.}, doi = {10.1109/LSP.2014.2313410}, author = {Lu, Liang and Renals, Steve} } @conference {llu_is2014, title = {Probabilistic linear discriminant analysis with bottleneck features for speech recognition}, booktitle = {Proc. INTERSPEECH}, year = {2014}, author = {Lu, Liang and Renals, Steve} } @conference {sinclairbell_interspeech14, title = {A semi-Markov model for speech segmentation with an utterance-break prior}, booktitle = {Proc. 
Interspeech}, year = {2014}, month = sep, abstract = {Speech segmentation is the problem of finding the end points of a speech utterance for passing to an automatic speech recognition (ASR) system. The quality of this segmentation can have a large impact on the accuracy of the ASR system; in this paper we demonstrate that it can have an even larger impact on downstream natural language processing tasks {\textendash} in this case, machine translation. We develop a novel semi-Markov model which allows the segmentation of audio streams into speech utterances which are optimised for the desired distribution of sentence lengths for the target domain. We compare this with existing state-of-the-art methods and show that it is able to achieve not only improved ASR performance, but also to yield significant benefits to a speech translation task.}, url = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/interspeech2014.pdf}, author = {Sinclair, Mark and Bell, Peter and Birch, Alexandra and McInnes, Fergus} } @conference {Zhang2014, title = {Semi-Supervised DNN Training in Meeting Recognition}, booktitle = {Proceedings of the 2014 IEEE Spoken Language Technology Workshop (SLT)}, year = {2014}, month = dec, address = {South Lake Tahoe, USA}, abstract = {Training acoustic models for ASR requires large amounts of labelled data which is costly to obtain. Hence it is desirable to make use of unlabelled data. While unsupervised training can give gains for standard HMM training, it is more difficult to make use of unlabelled data for discriminative models. This paper explores semi-supervised training of Deep Neural Networks (DNN) in a meeting recognition task. We first analyse the impact of imperfect transcription on the DNN and the ASR performance. As labelling error is the source of the problem, we investigate two options available to reduce that: selecting data with fewer errors, and changing the dependence on noise by reducing label precision. 
Both confidence based data selection and label resolution change are explored in the context of two scenarios of matched and unmatched unlabelled data. We introduce improved DNN based confidence score estimators and show their performance on data selection for both scenarios. Confidence score based data selection was found to yield up to 14.6\% relative WER reduction, while better balance between label resolution and recognition hypothesis accuracy allowed further WER reductions by 16.6\% relative in the mismatched scenario.}, url = {http://staffwww.dcs.shef.ac.uk/people/Y.Liu/publications/pdf/Zhang2014.pdf}, author = {Pengyuan Zhang and Yulan Liu and Thomas Hain} } @conference {Zhang:2014ab, title = {Standalone training of context-dependent deep neural network acoustic models}, booktitle = {IEEE ICASSP 2014}, year = {2014}, month = {05/2014}, address = {Florence, Italy}, abstract = {