2019 |
del Agua Teba, Miguel Ángel Contributions to Efficient Automatic Transcription of Video Lectures PhD Thesis Universitat Politècnica de València, 2019, (Advisers: Alfons Juan Ciscar and Albert Sanchis Navarro). Links | BibTeX | Tags: Automatic Speech Recognition, Confidence measures, Video Lectures
% PhD thesis record. The author is given in "von Last, First" form with the full
% given name, as spelled in this author's other entries in this list.
% pubstate and tppubtype are non-standard fields carried over from the export;
% standard styles ignore unknown fields.
@phdthesis{delTeba2019,
  author    = {del Agua Teba, Miguel Ángel},
  title     = {Contributions to Efficient Automatic Transcription of Video Lectures},
  school    = {Universitat Politècnica de València},
  year      = {2019},
  date      = {2019-01-01},
  url       = {https://www.upv.es/pls/oalu/sic_ted.mostrar_tesis?p_num_reg=10772},
  note      = {Advisers: Alfons Juan Ciscar and Albert Sanchis Navarro},
  keywords  = {Automatic Speech Recognition, Confidence measures, Video Lectures},
  pubstate  = {published},
  tppubtype = {phdthesis},
}
|
2018 |
Del-Agua, Miguel Ángel; Giménez, Adrià; Sanchis, Alberto; Civera, Jorge; Juan, Alfons Speaker-Adapted Confidence Measures for ASR using Deep Bidirectional Recurrent Neural Networks Journal Article IEEE/ACM Transactions on Audio, Speech, and Language Processing, 26 (7), pp. 1194–1202, 2018. Abstract | Links | BibTeX | Tags: Automatic Speech Recognition, Confidence estimation, Confidence measures, Deep bidirectional recurrent neural networks, Long short-term memory, Speaker adaptation
% Journal article record. Names are separated by lower-case " and "; the DOI is
% stored bare in the doi field, and url holds only the authors' version PDF.
% The acronym in the title is braced so sentence-casing styles keep it upper-case.
@article{Del-Agua2018,
  author    = {Del-Agua, Miguel Ángel and Giménez, Adrià and Sanchis, Alberto and Civera, Jorge and Juan, Alfons},
  title     = {Speaker-Adapted Confidence Measures for {ASR} using Deep Bidirectional Recurrent Neural Networks},
  journal   = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
  volume    = {26},
  number    = {7},
  pages     = {1194--1202},
  year      = {2018},
  date      = {2018-01-01},
  doi       = {10.1109/TASLP.2018.2819900},
  url       = {http://www.mllp.upv.es/wp-content/uploads/2018/04/Del-Agua2018_authors_version.pdf},
  abstract  = {In the last years, Deep Bidirectional Recurrent Neural Networks (DBRNN) and DBRNN with Long Short-Term Memory cells (DBLSTM) have outperformed the most accurate classifiers for confidence estimation in automatic speech recognition. At the same time, we have recently shown that speaker adaptation of confidence measures using DBLSTM yields significant improvements over non-adapted confidence measures. In accordance with these two recent contributions to the state of the art in confidence estimation, this paper presents a comprehensive study of speaker-adapted confidence measures using DBRNN and DBLSTM models. Firstly, we present new empirical evidences of the superiority of RNN-based confidence classifiers evaluated over a large speech corpus consisting of the English LibriSpeech and the Spanish poliMedia tasks. Secondly, we show new results on speaker-adapted confidence measures considering a multi-task framework in which RNN-based confidence classifiers trained with LibriSpeech are adapted to speakers of the TED-LIUM corpus. These experiments confirm that speaker-adapted confidence measures outperform their non-adapted counterparts. Lastly, we describe an unsupervised adaptation method of the acoustic DBLSTM model based on confidence measures which results in better automatic speech recognition performance.},
  keywords  = {Automatic Speech Recognition, Confidence estimation, Confidence measures, Deep bidirectional recurrent neural networks, Long short-term memory, Speaker adaptation},
  pubstate  = {published},
  tppubtype = {article},
}
In the last years, Deep Bidirectional Recurrent Neural Networks (DBRNN) and DBRNN with Long Short-Term Memory cells (DBLSTM) have outperformed the most accurate classifiers for confidence estimation in automatic speech recognition. At the same time, we have recently shown that speaker adaptation of confidence measures using DBLSTM yields significant improvements over non-adapted confidence measures. In accordance with these two recent contributions to the state of the art in confidence estimation, this paper presents a comprehensive study of speaker-adapted confidence measures using DBRNN and DBLSTM models. Firstly, we present new empirical evidences of the superiority of RNN-based confidence classifiers evaluated over a large speech corpus consisting of the English LibriSpeech and the Spanish poliMedia tasks. Secondly, we show new results on speaker-adapted confidence measures considering a multi-task framework in which RNN-based confidence classifiers trained with LibriSpeech are adapted to speakers of the TED-LIUM corpus. These experiments confirm that speaker-adapted confidence measures outperform their non-adapted counterparts. 
Lastly, we describe an unsupervised adaptation method of the acoustic DBLSTM model based on confidence measures which results in better automatic speech recognition performance. |
2016 |
del-Agua, Miguel Ángel; Piqueras, Santiago; Giménez, Adrià; Sanchis, Alberto; Civera, Jorge; Juan, Alfons ASR Confidence Estimation with Speaker-Adapted Recurrent Neural Networks Inproceedings Proc. of the 17th Annual Conf. of the ISCA (Interspeech 2016), pp. 3464–3468, San Francisco (USA), 2016. Abstract | Links | BibTeX | Tags: BLSTM, Confidence measures, Recurrent Neural Networks, Speaker adaptation, Speech Recognition
% Conference paper record. Authors use the "Last, First" form; the hyphenated,
% lower-case-initial surname of the first author is braced so name parsing keeps
% it as one surname instead of splitting off a "von" particle.
% The acronym in the title is braced so sentence-casing styles keep it upper-case.
@inproceedings{del-Agua2016,
  author    = {{del-Agua}, Miguel Ángel and Piqueras, Santiago and Giménez, Adrià and Sanchis, Alberto and Civera, Jorge and Juan, Alfons},
  title     = {{ASR} Confidence Estimation with Speaker-Adapted Recurrent Neural Networks},
  booktitle = {Proc. of the 17th Annual Conf. of the ISCA (Interspeech 2016)},
  pages     = {3464--3468},
  address   = {San Francisco (USA)},
  year      = {2016},
  date      = {2016-09-08},
  doi       = {10.21437/Interspeech.2016-1142},
  abstract  = {Confidence estimation for automatic speech recognition has been very recently improved by using Recurrent Neural Networks (RNNs), and also by speaker adaptation (on the basis of Conditional Random Fields). In this work, we explore how to obtain further improvements by combining RNNs and speaker adaptation. In particular, we explore different speaker-dependent and -independent data representations for Bidirectional Long Short Term Memory RNNs of various topologies. Empirical tests are reported on the LibriSpeech dataset, showing that the best results are achieved by the proposed combination of RNNs and speaker adaptation.},
  keywords  = {BLSTM, Confidence measures, Recurrent Neural Networks, Speaker adaptation, Speech Recognition},
  pubstate  = {published},
  tppubtype = {inproceedings},
}
Confidence estimation for automatic speech recognition has been very recently improved by using Recurrent Neural Networks (RNNs), and also by speaker adaptation (on the basis of Conditional Random Fields). In this work, we explore how to obtain further improvements by combining RNNs and speaker adaptation. 
In particular, we explore different speaker-dependent and -independent data representations for Bidirectional Long Short Term Memory RNNs of various topologies. Empirical tests are reported on the LibriSpeech dataset, showing that the best results are achieved by the proposed combination of RNNs and speaker adaptation. |
Sanchez-Cortina, Isaias; Andrés-Ferrer, Jesús; Sanchis, Alberto; Juan, Alfons Speaker-adapted confidence measures for speech recognition of video lectures Journal Article Computer Speech & Language, 37, pp. 11–23, 2016, ISSN: 0885-2308. Abstract | Links | BibTeX | Tags: Confidence measures, Log-linear models, Online video lectures, Speaker adaptation, Speech Recognition
% Journal article record. 0885-2308 is a serial number for the journal, so it is
% stored in issn rather than isbn. The ampersand in the journal name is escaped
% for LaTeX. url holds the article page only; the second link from the original
% export is kept in the non-standard shareurl field, which standard styles ignore.
@article{SanchezCortina2016,
  author    = {Sanchez-Cortina, Isaias and Andrés-Ferrer, Jesús and Sanchis, Alberto and Juan, Alfons},
  title     = {Speaker-adapted confidence measures for speech recognition of video lectures},
  journal   = {Computer Speech \& Language},
  volume    = {37},
  pages     = {11--23},
  year      = {2016},
  date      = {2016-01-01},
  issn      = {0885-2308},
  url       = {http://www.sciencedirect.com/science/article/pii/S0885230815000960},
  shareurl  = {http://authors.elsevier.com/a/1SAsB39HpSHRc0},
  abstract  = {Automatic Speech Recognition applications can benefit from a confidence measure (CM) to predict the reliability of the output. Previous works showed that a word-dependent naïve Bayes (NB) classifier outperforms the conventional word posterior probability as a CM. However, a discriminative formulation usually renders improved performance due to the available training techniques. Taking this into account, we propose a logistic regression (LR) classifier defined with simple input functions to approximate to the NB behaviour. Additionally, as a main contribution, we propose to adapt the CM to the speaker in cases in which it is possible to identify the speakers, such as online lecture repositories. The experiments have shown that speaker-adapted models outperform their non-adapted counterparts on two difficult tasks from English (videoLectures.net) and Spanish (poliMedia) educational lectures. They have also shown that the NB model is clearly superseded by the proposed LR classifier.},
  keywords  = {Confidence measures, Log-linear models, Online video lectures, Speaker adaptation, Speech Recognition},
  pubstate  = {published},
  tppubtype = {article},
}
Automatic Speech Recognition applications can benefit from a confidence measure (CM) to predict the reliability of the output. Previous works showed that a word-dependent naïve Bayes (NB) classifier outperforms the conventional word posterior probability as a CM. However, a discriminative formulation usually renders improved performance due to the available training techniques. Taking this into account, we propose a logistic regression (LR) classifier defined with simple input functions to approximate to the NB behaviour. Additionally, as a main contribution, we propose to adapt the CM to the speaker in cases in which it is possible to identify the speakers, such as online lecture repositories. The experiments have shown that speaker-adapted models outperform their non-adapted counterparts on two difficult tasks from English (videoLectures.net) and Spanish (poliMedia) educational lectures. They have also shown that the NB model is clearly superseded by the proposed LR classifier. |
Publications
Accessibility Automatic Speech Recognition Computer-assisted transcription Confidence measures Deep Neural Networks Docencia en Red Education language model adaptation Language Modeling Language Technologies Length modelling Log-linear models Machine Translation Massive Adaptation Models basats en seqüències de paraules Models log-lineals Multilingualism Neural Machine Translation Opencast Matterhorn Polimedia Sliding window Speaker adaptation Speech Recognition Speech Translation Statistical machine translation streaming text-to-speech transcripciones video lecture repositories Video Lectures
2019 |
Contributions to Efficient Automatic Transcription of Video Lectures PhD Thesis Universitat Politècnica de València, 2019, (Advisers: Alfons Juan Ciscar and Albert Sanchis Navarro). |
2018 |
Speaker-Adapted Confidence Measures for ASR using Deep Bidirectional Recurrent Neural Networks Journal Article IEEE/ACM Transactions on Audio, Speech, and Language Processing, 26 (7), pp. 1194–1202, 2018. |
2016 |
ASR Confidence Estimation with Speaker-Adapted Recurrent Neural Networks Inproceedings Proc. of the 17th Annual Conf. of the ISCA (Interspeech 2016), pp. 3464–3468, San Francisco (USA), 2016. |
Speaker-adapted confidence measures for speech recognition of video lectures Journal Article Computer Speech & Language, 37, pp. 11–23, 2016, ISSN: 0885-2308. |