X-Git-Url: https://git.martlubbers.net/?a=blobdiff_plain;ds=sidebyside;f=asr.tex;h=9cdbaf5582067cf91e3a6b83b320d68e1fbf6ded;hb=5945b2bce63d92454882cb7c66fb1c8d87c3a271;hp=42af838a3b64c79311e681e7c1685627e63d27a4;hpb=844805e280d5d10d0e088dec8c938c00d941b753;p=asr1617.git diff --git a/asr.tex b/asr.tex index 42af838..9cdbaf5 100644 --- a/asr.tex +++ b/asr.tex @@ -1,6 +1,7 @@ %&asr -\usepackage[nonumberlist,acronyms]{glossaries} +\usepackage[toc,nonumberlist,acronyms]{glossaries} \makeglossaries% +\newacronym{ANN}{ANN}{Artificial Neural Network} \newacronym{HMM}{HMM}{Hidden Markov Model} \newacronym{GMM}{GMM}{Gaussian Mixture Models} \newacronym{DHMM}{DHMM}{Duration-explicit \acrlong{HMM}} @@ -8,49 +9,81 @@ \newacronym{FA}{FA}{Forced alignment} \newacronym{MFC}{MFC}{Mel-frequency cepstrum} \newacronym{MFCC}{MFCC}{\acrlong{MFC} coefficient} -%\newglossaryentry{mTask}{name=mTask, -% description={is an abstraction for \glspl{Task} living on \acrshort{IoT} devices}} +\newacronym{PPF}{PPF}{Posterior Probability Features} +\newacronym{MLP}{MLP}{Multi-layer Perceptron} +\newacronym{PLP}{PLP}{Perceptual Linear Prediction} +\newacronym{ZCR}{ZCR}{Zero-crossing Rate} +\newacronym{LPC}{LPC}{Linear Prediction Coefficients} +\newacronym{LPCC}{LPCC}{\acrlong{LPC} derived cepstrum} +\newacronym{IFPI}{IFPI}{International Federation of the Phonographic Industry} +\newglossaryentry{dm}{name={Death Metal}, + description={is an extreme heavy metal music style with growling vocals and + pounding drums}} +\newglossaryentry{dom}{name={Doom Metal}, + description={is an extreme heavy metal music style with growling vocals and + pounding drums played very slowly}} +\newglossaryentry{FT}{name={Fourier Transform}, + description={is a technique of converting a time representation signal to a + frequency representation}} +\newglossaryentry{MS}{name={Mel-Scale}, + description={is a human ear inspired scale for spectral signals}} +\newglossaryentry{Viterbi}{name={Viterbi}, + description={is a 
dynamic programming algorithm for finding the most likely + sequence of hidden states in a \gls{HMM}}} \begin{document} -%Titlepage +\frontmatter{} + \maketitleru[ course={(Automatic) Speech Recognition}, institute={Radboud University Nijmegen}, - authorstext={Author:}] + authorstext={Author:}, + pagenr=1] \listoftodos[Todo] \tableofcontents +\mainmatter{} +%Berenzweig and Ellis use acoustic classifiers from speech recognition as a +%detector for singing lines. They achieve 80\% accuracy for forty 15 second +%excerpts. They mention people that wrote signal features that discriminate +%between speech and music. Neural net +%\glspl{HMM}~\cite{berenzweig_locating_2001}. +% +%In 2014 Dzhambazov et al.\ applied state of the art segmentation methods to +%polyphonic Turkish music, this might be interesting to use for heavy metal. +%They mention Fujihara (2011) to have a similar \gls{FA} system. This method uses +%phone level segmentation, first 12 \gls{MFCC}s. They first do vocal/non-vocal +%detection, then melody extraction, then alignment. They compare results with +%Mesaros \& Virtanen, 2008~\cite{dzhambazov_automatic_2014}. Later they +%specialize in long syllables in a cappella. They use \glspl{DHMM} with +%\glspl{GMM} and show that adding knowledge increases alignment (Beijing opera +%has long syllables)~\cite{dzhambazov_automatic_2016}. +% + + +%Introduction, leading to a clearly defined research question +\chapter{Introduction} +\input{intro.tex} + +\chapter{Methods} +\input{methods.tex} + +\chapter{Conclusion \& Discussion} +\input{conclusion.tex} + +%(Appendices) +\appendix +\input{appendices.tex} + +\newpage %Glossaries \glsaddall{} -\printglossaries% - -Berenzweig and Ellis use acoustic classifiers from speech recognition as a -detector for singing lines. They achive 80\% accuracy for forty 15 second -exerpts. They mention people that wrote signal features that discriminate -between speech and music. Neural net -\glspl{HMM}~\cite{berenzweig_locating_2001}. 
- -In 2014 Dzhambazov et al.\ applied state of the art segmentation methods to -polyphonic turkish music, this might be interesting to use for heavy metal. -They mention Fujihara(2011) to have a similar \gls{FA} system. This method uses -phone level segmentation, first 12 \gls{MFCC}s. They first do vocal/non-vocal -detection, then melody extraction, then alignment. They compare results with -Mesaros \& Virtanen, 2008~\cite{dzhambazov_automatic_2014}. Later they -specialize in long syllables in a capella. They use \glspl{DHMM} with -\glspl{GMM} and show that adding knowledge increases alignment (bejing opera -has long syllables)~\cite{dzhambazov_automatic_2016}. - -t\cite{fujihara_automatic_2006} -t\cite{fujihara_lyricsynchronizer:_2011} -t\cite{fujihara_three_2008} -t\cite{mauch_integrating_2012} -t\cite{mesaros_adaptation_2009} -t\cite{mesaros_automatic_2008} -t\cite{mesaros_automatic_2010} -t\cite{muller_multimodal_2012} -t\cite{pedone_phoneme-level_2011} -t\cite{yang_machine_2012} +\begingroup +\let\clearpage\relax +\let\cleardoublepage\relax +\printglossaries{} +\endgroup \bibliographystyle{ieeetr} \bibliography{asr}