asr.tex

   1 %&asr
   2 \usepackage[toc,nonumberlist,acronyms]{glossaries}
   3 \makeglossaries%
   4 \newacronym{ANN}{ANN}{Artificial Neural Network}
   5 \newacronym{DCT}{DCT}{Discrete Cosine Transform}
   6 \newacronym{DHMM}{DHMM}{Duration-explicit \acrlong{HMM}}
   7 \newacronym{FA}{FA}{Forced alignment}
   8 \newacronym{GMM}{GMM}{Gaussian Mixture Models}
   9 \newacronym{HMM}{HMM}{Hidden Markov Model}
  10 \newacronym{HTK}{HTK}{\acrlong{HMM} Toolkit}
  11 \newacronym{IFPI}{IFPI}{International Federation of the Phonographic Industry}
  12 \newacronym{LPCC}{LPCC}{\acrlong{LPC} derivec cepstrum}
  13 \newacronym{LPC}{LPC}{Linear Prediction Coefficients}
  14 \newacronym{MFCC}{MFCC}{\acrlong{MFC} coefficient}
  15 \newacronym{MFC}{MFC}{Mel-frequency cepstrum}
  16 \newacronym{MLP}{MLP}{Multi-layer Perceptron}
  17 \newacronym{PLP}{PLP}{Perceptual Linear Prediction}
  18 \newacronym{PPF}{PPF}{Posterior Probability Features}
  19 \newacronym{ZCR}{ZCR}{Zero-crossing Rate}
  20 \newglossaryentry{dm}{name={Death Metal},
  21         description={is an extreme heavy metal music style with growling vocals and
  22         pounding drums}}
  23 \newglossaryentry{dom}{name={Doom Metal},
  24         description={is an extreme heavy metal music style with growling vocals and
  25         pounding drums played very slowly}}
  26 \newglossaryentry{FT}{name={Fourier Transform},
  27         description={is a technique of converting a time representation signal to a
  28         frequency representation}}
  29 \newglossaryentry{MS}{name={Mel-Scale},
  30         description={is a human ear inspired scale for spectral signals.}}
  31 \newglossaryentry{Viterbi}{name={Viterbi},
  32         description={is a dynamic programming algorithm for finding the most likely
  33         sequence of hidden states in a \gls{HMM}}}
  34
  35 \begin{document}
  36 \frontmatter{}
  37
  38 \maketitleru[
  39         course={(Automatic) Speech Recognition},
  40         institute={Radboud University Nijmegen},
  41         authorstext={Author:},
  42         righttextheader={Supervisor:},
  43         righttext={Louis ten Bosch},
  44         pagenr=1]
  45 \listoftodos[Todo]
  46
  47 \tableofcontents
  48
  49 \mainmatter{}
  50 %Berenzweig and Ellis use acoustic classifiers from speech recognition as a
  51 %detector for singing lines.  They achive 80\% accuracy for forty 15 second
  52 %exerpts. They mention people that wrote signal features that discriminate
  53 %between speech and music. Neural net
  54 %\glspl{HMM}~\cite{berenzweig_locating_2001}.
  55 %
  56 %In 2014 Dzhambazov et al.\ applied state of the art segmentation methods to
  57 %polyphonic turkish music, this might be interesting to use for heavy metal.
  58 %They mention Fujihara (2011) to have a similar \gls{FA} system. This method uses
  59 %phone level segmentation, first 12 \gls{MFCC}s. They first do vocal/non-vocal
  60 %detection, then melody extraction, then alignment. They compare results with
  61 %Mesaros \& Virtanen, 2008~\cite{dzhambazov_automatic_2014}. Later they
  62 %specialize in long syllables in a capella. They use \glspl{DHMM} with
  63 %\glspl{GMM} and show that adding knowledge increases alignment (bejing opera
  64 %has long syllables)~\cite{dzhambazov_automatic_2016}.
  65 %
  66
  67
  68 %Introduction, leading to a clearly defined research question
  69 \chapter{Introduction}
  70 \input{intro.tex}
  71
  72 \chapter{Methods}
  73 \input{methods.tex}
  74
  75 \chapter{Conclusion \& Discussion}
  76 \input{conclusion.tex}
  77
  78 %(Appendices)
  79 \appendix
  80 \input{appendices.tex}
  81
  82 \newpage
  83 %Glossaries
  84 \glsaddall{}
  85 \begingroup
  86 \let\clearpage\relax
  87 \let\cleardoublepage\relax
  88 \printglossaries{}
  89 \endgroup
  90
  91 \bibliographystyle{ieeetr}
  92 \bibliography{asr}
  93 \end{document}