2019
del Agua Teba, Miguel Á.: Contributions to Efficient Automatic Transcription of Video Lectures. PhD Thesis, Universitat Politècnica de València, 2019 (Advisers: Alfons Juan Ciscar and Albert Sanchis Navarro).
@phdthesis{delTeba2019,
title = {Contributions to Efficient Automatic Transcription of Video Lectures},
author = {del Agua Teba, Miguel Á.},
url = {https://www.upv.es/pls/oalu/sic_ted.mostrar_tesis?p_num_reg=10772},
year = {2019},
date = {2019-01-01},
school = {Universitat Politècnica de València},
note = {Advisers: Alfons Juan Ciscar and Albert Sanchis Navarro},
keywords = {Automatic Speech Recognition, Confidence measures, Video Lectures},
pubstate = {published},
tppubtype = {phdthesis}
}
2018
Del-Agua, Miguel Ángel; Giménez, Adrià; Sanchis, Alberto; Civera, Jorge; Juan, Alfons: Speaker-Adapted Confidence Measures for ASR using Deep Bidirectional Recurrent Neural Networks. IEEE/ACM Transactions on Audio, Speech, and Language Processing, 26 (7), pp. 1194–1202, 2018.
@article{Del-Agua2018,
title = {Speaker-Adapted Confidence Measures for ASR using Deep Bidirectional Recurrent Neural Networks},
author = {Del-Agua, Miguel Ángel and Giménez, Adrià and Sanchis, Alberto and Civera, Jorge and Juan, Alfons},
url = {http://www.mllp.upv.es/wp-content/uploads/2018/04/Del-Agua2018_authors_version.pdf
https://doi.org/10.1109/TASLP.2018.2819900},
year = {2018},
date = {2018-01-01},
journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
volume = {26},
number = {7},
pages = {1194--1202},
abstract = {In recent years, Deep Bidirectional Recurrent Neural Networks (DBRNN) and DBRNN with Long Short-Term Memory cells (DBLSTM) have outperformed the most accurate classifiers for confidence estimation in automatic speech recognition. At the same time, we have recently shown that speaker adaptation of confidence measures using DBLSTM yields significant improvements over non-adapted confidence measures. In line with these two recent contributions to the state of the art in confidence estimation, this paper presents a comprehensive study of speaker-adapted confidence measures using DBRNN and DBLSTM models. Firstly, we present new empirical evidence of the superiority of RNN-based confidence classifiers, evaluated on a large speech corpus consisting of the English LibriSpeech and the Spanish poliMedia tasks. Secondly, we show new results on speaker-adapted confidence measures considering a multi-task framework in which RNN-based confidence classifiers trained with LibriSpeech are adapted to speakers of the TED-LIUM corpus. These experiments confirm that speaker-adapted confidence measures outperform their non-adapted counterparts. Lastly, we describe an unsupervised adaptation method of the acoustic DBLSTM model based on confidence measures, which results in better automatic speech recognition performance.},
keywords = {Automatic Speech Recognition, Confidence estimation, Confidence measures, Deep bidirectional recurrent neural networks, Long short-term memory, Speaker adaptation},
pubstate = {published},
tppubtype = {article}
}
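The approach above treats confidence estimation as sequence labelling: each recognized word is mapped to a feature vector and a deep bidirectional LSTM scores it as correct or incorrect. A minimal sketch of such a classifier in PyTorch, for orientation only (the feature set, layer sizes, and training details below are illustrative assumptions, not the paper's recipe):

```python
# Minimal sketch of a DBLSTM word-confidence classifier (PyTorch).
# Feature dimensionality and layer sizes are illustrative assumptions.
import torch
import torch.nn as nn

class BLSTMConfidenceEstimator(nn.Module):
    def __init__(self, num_features: int = 20, hidden_size: int = 64, num_layers: int = 2):
        super().__init__()
        # Deep bidirectional LSTM over the word sequence of one utterance.
        self.blstm = nn.LSTM(num_features, hidden_size, num_layers,
                             bidirectional=True, batch_first=True)
        # Concatenated forward/backward states -> one confidence logit per word.
        self.head = nn.Linear(2 * hidden_size, 1)

    def forward(self, word_features: torch.Tensor) -> torch.Tensor:
        # word_features: (batch, words, num_features), e.g. posterior-based predictors.
        hidden, _ = self.blstm(word_features)
        return torch.sigmoid(self.head(hidden)).squeeze(-1)  # (batch, words) in (0, 1)

model = BLSTMConfidenceEstimator()
utterance = torch.randn(1, 12, 20)   # one utterance, 12 words, 20 features per word
confidences = model(utterance)       # per-word confidence scores
```

Trained with a binary cross-entropy loss against correct/incorrect word labels, a classifier of this shape yields the non-adapted confidence measures that the paper then adapts per speaker.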
2016
del-Agua, Miguel Ángel; Piqueras, Santiago; Giménez, Adrià; Sanchis, Alberto; Civera, Jorge; Juan, Alfons: ASR Confidence Estimation with Speaker-Adapted Recurrent Neural Networks. In: Proc. of the 17th Annual Conf. of the ISCA (Interspeech 2016), pp. 3464–3468, San Francisco (USA), 2016.
@inproceedings{del-Agua2016,
title = {ASR Confidence Estimation with Speaker-Adapted Recurrent Neural Networks},
author = {Miguel Ángel del-Agua and Santiago Piqueras and Adrià Giménez and Alberto Sanchis and Jorge Civera and Alfons Juan},
doi = {10.21437/Interspeech.2016-1142},
year = {2016},
date = {2016-09-08},
booktitle = {Proc. of the 17th Annual Conf. of the ISCA (Interspeech 2016)},
pages = {3464--3468},
address = {San Francisco (USA)},
abstract = {Confidence estimation for automatic speech recognition has recently been improved by using Recurrent Neural Networks (RNNs), and also by speaker adaptation (on the basis of Conditional Random Fields). In this work, we explore how to obtain further improvements by combining RNNs and speaker adaptation. In particular, we explore different speaker-dependent and -independent data representations for Bidirectional Long Short-Term Memory RNNs of various topologies. Empirical tests are reported on the LibriSpeech dataset, showing that the best results are achieved by the proposed combination of RNNs and speaker adaptation.},
keywords = {BLSTM, Confidence measures, Recurrent Neural Networks, Speaker adaptation, Speech Recognition},
pubstate = {published},
tppubtype = {inproceedings}
}
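One plausible reading of the speaker-adaptation step, as a hedged sketch only (which layers are adapted and how the loss is weighted are not reproduced from the paper), is to fine-tune a speaker-independent confidence classifier on a single speaker's labelled words with a small learning rate:

```python
# Hypothetical speaker adaptation by light fine-tuning (PyTorch);
# an assumption-laden illustration, not the paper's exact recipe.
import copy
import torch
import torch.nn as nn

def adapt_to_speaker(si_model: nn.Module,
                     speaker_feats: torch.Tensor,    # (batch, words, num_features)
                     speaker_labels: torch.Tensor,   # (batch, words), float 0/1 targets
                     steps: int = 50, lr: float = 1e-4) -> nn.Module:
    sd_model = copy.deepcopy(si_model)   # keep the speaker-independent model intact
    optimizer = torch.optim.Adam(sd_model.parameters(), lr=lr)
    bce = nn.BCELoss()                   # model outputs are already in (0, 1)
    for _ in range(steps):
        optimizer.zero_grad()
        loss = bce(sd_model(speaker_feats), speaker_labels)
        loss.backward()
        optimizer.step()
    return sd_model                      # speaker-dependent confidence classifier
```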
del-Agua, Miguel Ángel; Martínez-Villaronga, Adrià; Giménez, Adrià; Sanchis, Alberto; Civera, Jorge; Juan, Alfons: The MLLP system for the 4th CHiME Challenge. In: Proc. of the 4th Intl. Workshop on Speech Processing in Everyday Environments (CHiME 2016), pp. 57–59, San Francisco (USA), 2016.
@inproceedings{del-Aguadel-Agua2016,
title = {The MLLP system for the 4th CHiME Challenge},
author = {Miguel Ángel del-Agua and Adrià Martínez-Villaronga and Adrià Giménez and Alberto Sanchis and Jorge Civera and Alfons Juan},
url = {http://www.mllp.upv.es/wp-content/uploads/2017/11/DelAgua2016-The_MLLP_system_for_the_4th_CHiME_Challenge.pdf
http://hdl.handle.net/10251/177497
http://spandh.dcs.shef.ac.uk/chime_workshop/chime2016/chime2016proceedings.pdf},
year = {2016},
date = {2016-01-01},
booktitle = {Proc. of the 4th Intl. Workshop on Speech Processing in Everyday Environments (CHiME 2016)},
pages = {57--59},
address = {San Francisco (USA)},
abstract = {The MLLP's CHiME-4 system is presented in this paper. It has been built using the transLectures-UPV toolkit (TLK), developed by the MLLP research group, which makes use of state-of-the-art speech techniques. Our best system built for the CHiME-4 challenge consists of a combination of different sub-systems in order to deal with the variety of acoustic conditions. Each sub-system, in turn, follows a hybrid approach with different acoustic models, such as Deep Neural Networks or BLSTM networks.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
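The "hybrid approach" named in the abstract is the standard HMM/network combination: the network estimates HMM-state posteriors, which the decoder converts to scaled acoustic likelihoods by dividing by the state priors. The textbook formulation (not spelled out in the paper itself):

```latex
% Scaled-likelihood conversion in hybrid HMM/network decoding:
p(x_t \mid s) \;\propto\; \frac{P(s \mid x_t)}{P(s)}
```

where $P(s \mid x_t)$ is the DNN or BLSTM posterior for state $s$ given acoustic frame $x_t$, and $P(s)$ is the state prior, typically estimated from the training alignment.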
2015
del-Agua, Miguel Ángel; Martínez-Villaronga, Adrià; Piqueras, Santiago; Giménez, Adrià; Sanchis, Alberto; Civera, Jorge; Juan, Alfons: The MLLP ASR Systems for IWSLT 2015. In: Proc. of 12th Intl. Workshop on Spoken Language Translation (IWSLT 2015), pp. 39–44, Da Nang (Vietnam), 2015.
@inproceedings{delAgua15,
title = {The MLLP ASR Systems for IWSLT 2015},
author = {Miguel Ángel del-Agua and Adrià Martínez-Villaronga and Santiago Piqueras and Adrià Giménez and Alberto Sanchis and Jorge Civera and Alfons Juan},
url = {https://aclanthology.org/2015.iwslt-evaluation.5/},
year = {2015},
date = {2015-12-03},
booktitle = {Proc. of 12th Intl. Workshop on Spoken Language Translation (IWSLT 2015)},
pages = {39--44},
address = {Da Nang (Vietnam)},
abstract = {This paper describes the Machine Learning and Language Processing (MLLP) ASR systems for the 2015 IWSLT evaluation campaign. The English system is based on the combination of five different subsystems which combine two types of neural network architectures (deep feed-forward and convolutional), two types of activation functions (sigmoid and rectified linear) and two types of input features (fMLLR and FBANK). All subsystems perform a speaker adaptation step based on confidence measures, the output of which is then combined with ROVER. This system achieves a Word Error Rate (WER) of 13.3% on the official IWSLT 2015 English test set.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
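The 13.3% figure is a Word Error Rate: the word-level Levenshtein distance between hypothesis and reference, divided by the reference length. A small, standard implementation of that definition (not the evaluation campaign's official scorer):

```python
# Word Error Rate via dynamic-programming edit distance over words.
def word_error_rate(reference: str, hypothesis: str) -> float:
    ref, hyp = reference.split(), hypothesis.split()
    # dist[i][j]: edit distance between the first i reference words
    # and the first j hypothesis words.
    dist = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dist[i][0] = i                          # deletions only
    for j in range(len(hyp) + 1):
        dist[0][j] = j                          # insertions only
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            substitution = dist[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1])
            dist[i][j] = min(substitution,
                             dist[i - 1][j] + 1,    # deletion
                             dist[i][j - 1] + 1)    # insertion
    return dist[len(ref)][len(hyp)] / max(len(ref), 1)

print(word_error_rate("the cat sat down", "the bat sat"))  # 1 sub + 1 del over 4 words = 0.5
```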
2014
Piqueras, S.; del-Agua, M. A.; Giménez, A.; Civera, J.; Juan, A.: Statistical text-to-speech synthesis of Spanish subtitles. In: Proc. of VIII Jornadas en Tecnología del Habla and IV Iberian SLTech Workshop (IberSpeech 2014), Las Palmas de Gran Canaria (Spain), 2014.
@inproceedings{PiqAgu14,
title = {Statistical text-to-speech synthesis of Spanish subtitles},
author = {S. Piqueras and M. A. del-Agua and A. Giménez and J. Civera and A. Juan},
url = {http://www.mllp.upv.es/wp-content/uploads/2015/04/paper3.pdf
http://link.springer.com/chapter/10.1007%2F978-3-319-13623-3_5},
year = {2014},
date = {2014-01-01},
booktitle = {Proc. of VIII Jornadas en Tecnología del Habla and IV Iberian SLTech Workshop (IberSpeech 2014)},
address = {Las Palmas de Gran Canaria (Spain)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Martínez-Villaronga, A.; del-Agua, M. A.; Silvestre-Cerdà, J. A.; Andrés-Ferrer, J.; Juan, A.: Language model adaptation for lecture transcription by document retrieval. In: Proc. of VIII Jornadas en Tecnología del Habla and IV Iberian SLTech Workshop (IberSpeech 2014), Las Palmas de Gran Canaria (Spain), 2014.
@inproceedings{MarAgu14,
title = {Language model adaptation for lecture transcription by document retrieval},
author = {A. Martínez-Villaronga and M. A. del-Agua and J.A. Silvestre-Cerdà and J. Andrés-Ferrer and A. Juan},
url = {http://www.mllp.upv.es/wp-content/uploads/2015/04/ibsp14-cameraReady.pdf},
year = {2014},
date = {2014-01-01},
booktitle = {Proc. of VIII Jornadas en Tecnología del Habla and IV Iberian SLTech Workshop (IberSpeech 2014)},
address = {Las Palmas de Gran Canaria (Spain)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
del-Agua, M. A.; Giménez, A.; Serrano, N.; Andrés-Ferrer, J.; Civera, J.; Sanchis, A.; Juan, A.: The transLectures-UPV toolkit. In: Proc. of VIII Jornadas en Tecnología del Habla and IV Iberian SLTech Workshop (IberSpeech 2014), Las Palmas de Gran Canaria (Spain), 2014.
@inproceedings{AguGim14,
title = {The transLectures-UPV toolkit},
author = {M. A. del-Agua and A. Giménez and N. Serrano and J. Andrés-Ferrer and J. Civera and A. Sanchis and A. Juan},
url = {http://www.mllp.upv.es/wp-content/uploads/2015/04/IberSpeech2014-TLK-camready1.pdf},
year = {2014},
date = {2014-01-01},
booktitle = {Proc. of VIII Jornadas en Tecnología del Habla and IV Iberian SLTech Workshop (IberSpeech 2014)},
address = {Las Palmas de Gran Canaria (Spain)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2013
Martínez-Villaronga, A.; del Agua, M. A.; Andrés-Ferrer, J.; Juan, A.: Language model adaptation for video lectures transcription. In: Proc. of the IEEE Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP 2013), pp. 8450–8454, Vancouver (Canada), 2013.
@inproceedings{Martinez-Villaronga2013,
title = {Language model adaptation for video lectures transcription},
author = {A. Martínez-Villaronga and M.A. del Agua and J. Andrés-Ferrer and A. Juan},
url = {http://dx.doi.org/10.1109/ICASSP.2013.6639314},
year = {2013},
date = {2013-01-01},
booktitle = {Proc. of the IEEE Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP 2013)},
pages = {8450-8454},
address = {Vancouver (Canada)},
keywords = {language model adaptation, Video Lectures},
pubstate = {published},
tppubtype = {inproceedings}
}
2012
Silvestre-Cerdà, Joan Albert; Del Agua, Miguel; Garcés, Gonçal; Gascó, Guillem; Giménez-Pastor, Adrià; Martínez, Adrià; Pérez González de Martos, Alejandro; Sánchez, Isaías; Serrano Martínez-Santos, Nicolás; Spencer, Rachel; Valor Miró, Juan Daniel; Andrés-Ferrer, Jesús; Civera, Jorge; Sanchís, Alberto; Juan, Alfons: transLectures. In: Proceedings (Online) of IberSPEECH 2012, pp. 345–351, Madrid (Spain), 2012.
@inproceedings{Silvestre-Cerdà2012b,
title = {transLectures},
author = {Silvestre-Cerdà, Joan Albert and Del Agua, Miguel and Gonçal Garcés and Guillem Gascó and Adrià Giménez-Pastor and Adrià Martínez and Pérez González de Martos, Alejandro and Isaías Sánchez and Serrano Martínez-Santos, Nicolás and Rachel Spencer and Valor Miró, Juan Daniel and Jesús Andrés-Ferrer and Jorge Civera and Alberto Sanchís and Alfons Juan},
url = {http://hdl.handle.net/10251/37290
http://lorien.die.upm.es/~lapiz/rtth/JORNADAS/VII/IberSPEECH2012_OnlineProceedings.pdf
https://web.archive.org/web/20130609073144/http://iberspeech2012.ii.uam.es/IberSPEECH2012_OnlineProceedings.pdf
http://www.mllp.upv.es/wp-content/uploads/2015/04/1209IberSpeech.pdf},
year = {2012},
date = {2012-11-22},
booktitle = {Proceedings (Online) of IberSPEECH 2012},
pages = {345--351},
address = {Madrid (Spain)},
abstract = {transLectures (Transcription and Translation of Video Lectures) is an EU STREP project in which advanced automatic speech recognition and machine translation techniques are being tested on large video lecture repositories. The project began in November 2011 and will run for three years. This paper outlines the project's main motivation and objectives, and gives a brief description of the two main repositories being considered: VideoLectures.NET and poliMèdia. The first results obtained by the UPV group for the poliMèdia repository are also provided.},
keywords = {Accessibility, Automatic Speech Recognition, Education, Intelligent Interaction, Language Technologies, Machine Translation, Massive Adaptation, Multilingualism, Opencast Matterhorn, Video Lectures},
pubstate = {published},
tppubtype = {inproceedings}
}
del Agua, Miguel A.; Serrano, Nicolás; Civera, Jorge; Juan, Alfons: Character-Based Handwritten Text Recognition of Multilingual Documents. In: Advances in Speech and Language Technologies for Iberian Languages (IberSpeech 2012), vol. 328, pp. 187–196, Springer Berlin Heidelberg, 2012, ISBN: 978-3-642-35291-1 (doi: 10.1007/978-3-642-35292-8_20).
@incollection{delAgua12,
title = {Character-Based Handwritten Text Recognition of Multilingual Documents},
author = {Miguel A. del Agua and Nicolás Serrano and Jorge Civera and Alfons Juan},
url = {http://hdl.handle.net/10251/35180},
isbn = {978-3-642-35291-1},
year = {2012},
date = {2012-01-01},
booktitle = {Advances in Speech and Language Technologies for Iberian Languages (IberSpeech 2012)},
volume = {328},
pages = {187-196},
publisher = {Springer Berlin Heidelberg},
series = {Communications in Computer and Information Science},
note = {doi: 10.1007/978-3-642-35292-8_20},
keywords = {},
pubstate = {published},
tppubtype = {incollection}
}
2011
del Agua, Miguel A.; Serrano, Nicolás; Juan, Alfons: Language Identification for Interactive Handwriting Transcription of Multilingual Documents. In: Pattern Recognition and Image Analysis (IbPRIA 2011), vol. 6669, pp. 596–603, Springer Berlin Heidelberg, 2011, ISBN: 978-3-642-21256-7.
@incollection{delAgua2011b,
title = {Language Identification for Interactive Handwriting Transcription of Multilingual Documents},
author = {del Agua, Miguel A. and Serrano, Nicolás and Juan, Alfons},
url = {http://hdl.handle.net/10251/37459
http://dx.doi.org/10.1007/978-3-642-21257-4_74},
isbn = {978-3-642-21256-7},
year = {2011},
date = {2011-01-01},
booktitle = {Pattern Recognition and Image Analysis (IbPRIA 2011)},
volume = {6669},
pages = {596-603},
publisher = {Springer Berlin Heidelberg},
series = {Lecture Notes in Computer Science},
keywords = {interactive handwriting transcription, Language identification, Multilingual documents},
pubstate = {published},
tppubtype = {incollection}
}