restructure repository, add literature review

[asr1617.git] / asr.tex
diff --git a/asr.tex b/asr.tex

index 42af838..9cdbaf5 100644 (file)
--- a/asr.tex
+++ b/asr.tex
@@ -1,6 +1,7 @@
  %&asr
-\usepackage[nonumberlist,acronyms]{glossaries}
+\usepackage[toc,nonumberlist,acronyms]{glossaries}
  \makeglossaries%
+\newacronym{ANN}{ANN}{Artificial Neural Network}
  \newacronym{HMM}{HMM}{Hidden Markov Model}
  \newacronym{GMM}{GMM}{Gaussian Mixture Models}
  \newacronym{DHMM}{DHMM}{Duration-explicit \acrlong{HMM}}
@@ -8,49 +9,81 @@
  \newacronym{FA}{FA}{Forced alignment}
  \newacronym{MFC}{MFC}{Mel-frequency cepstrum}
  \newacronym{MFCC}{MFCC}{\acrlong{MFC} coefficient}
-%\newglossaryentry{mTask}{name=mTask,
-%      description={is an abstraction for \glspl{Task} living on \acrshort{IoT} devices}}
+\newacronym{PPF}{PPF}{Posterior Probability Features}
+\newacronym{MLP}{MLP}{Multi-layer Perceptron}
+\newacronym{PLP}{PLP}{Perceptual Linear Prediction}
+\newacronym{ZCR}{ZCR}{Zero-crossing Rate}
+\newacronym{LPC}{LPC}{Linear Prediction Coefficients}
+\newacronym{LPCC}{LPCC}{\acrlong{LPC} derived cepstrum}
+\newacronym{IFPI}{IFPI}{International Federation of the Phonographic Industry}
+\newglossaryentry{dm}{name={Death Metal},
+       description={is an extreme heavy metal music style with growling vocals and
+       pounding drums}}
+\newglossaryentry{dom}{name={Doom Metal},
+       description={is an extreme heavy metal music style with growling vocals and
+       pounding drums played very slowly}}
+\newglossaryentry{FT}{name={Fourier Transform},
+       description={is a technique of converting a time representation signal to a
+       frequency representation}}
+\newglossaryentry{MS}{name={Mel-Scale},
+       description={is a human-ear-inspired scale for spectral signals}}
+\newglossaryentry{Viterbi}{name={Viterbi},
+       description={is a dynamic programming algorithm for finding the most likely
+       sequence of hidden states in a \gls{HMM}}}
  
  \begin{document}
-%Titlepage
+\frontmatter{}
+
  \maketitleru[
         course={(Automatic) Speech Recognition},
         institute={Radboud University Nijmegen},
-       authorstext={Author:}]
+       authorstext={Author:},
+       pagenr=1]
  \listoftodos[Todo]
  
  \tableofcontents
  
+\mainmatter{}
+%Berenzweig and Ellis use acoustic classifiers from speech recognition as a
+%detector for singing lines.  They achieve 80\% accuracy for forty 15-second
+%excerpts. They mention people that wrote signal features that discriminate
+%between speech and music. Neural net
+%\glspl{HMM}~\cite{berenzweig_locating_2001}.
+%
+%In 2014 Dzhambazov et al.\ applied state-of-the-art segmentation methods to
+%polyphonic Turkish music; this might be interesting to use for heavy metal.
+%They mention Fujihara (2011) to have a similar \gls{FA} system. This method uses
+%phone-level segmentation, first 12 \glspl{MFCC}. They first do vocal/non-vocal
+%detection, then melody extraction, then alignment. They compare results with
+%Mesaros \& Virtanen, 2008~\cite{dzhambazov_automatic_2014}. Later they
+%specialize in long syllables in a cappella. They use \glspl{DHMM} with
+%\glspl{GMM} and show that adding knowledge increases alignment (Beijing opera
+%has long syllables)~\cite{dzhambazov_automatic_2016}.
+%
+
+
+%Introduction, leading to a clearly defined research question
+\chapter{Introduction}
+\input{intro.tex}
+
+\chapter{Methods}
+\input{methods.tex}
+
+\chapter{Conclusion \& Discussion}
+\input{conclusion.tex}
+
+%(Appendices)
+\appendix
+\input{appendices.tex}
+
+\newpage
  %Glossaries
  \glsaddall{}
-\printglossaries%
-
-Berenzweig and Ellis use acoustic classifiers from speech recognition as a
-detector for singing lines.  They achive 80\% accuracy for forty 15 second
-exerpts. They mention people that wrote signal features that discriminate
-between speech and music. Neural net
-\glspl{HMM}~\cite{berenzweig_locating_2001}.
-
-In 2014 Dzhambazov et al.\ applied state of the art segmentation methods to
-polyphonic turkish music, this might be interesting to use for heavy metal.
-They mention Fujihara(2011) to have a similar \gls{FA} system. This method uses
-phone level segmentation, first 12 \gls{MFCC}s. They first do vocal/non-vocal
-detection, then melody extraction, then alignment. They compare results with
-Mesaros \& Virtanen, 2008~\cite{dzhambazov_automatic_2014}. Later they
-specialize in long syllables in a capella. They use \glspl{DHMM} with
-\glspl{GMM} and show that adding knowledge increases alignment (bejing opera
-has long syllables)~\cite{dzhambazov_automatic_2016}.
-
-t\cite{fujihara_automatic_2006}
-t\cite{fujihara_lyricsynchronizer:_2011}
-t\cite{fujihara_three_2008}
-t\cite{mauch_integrating_2012}
-t\cite{mesaros_adaptation_2009}
-t\cite{mesaros_automatic_2008}
-t\cite{mesaros_automatic_2010}
-t\cite{muller_multimodal_2012}
-t\cite{pedone_phoneme-level_2011}
-t\cite{yang_machine_2012}
+\begingroup
+\let\clearpage\relax
+\let\cleardoublepage\relax
+\printglossaries{}
+\endgroup
  
  \bibliographystyle{ieeetr}
  \bibliography{asr}