add results and update methods
[asr1617.git] / asr.tex
1 %&asr
2 \usepackage[toc,nonumberlist,acronyms]{glossaries}
3 \makeglossaries% glossaries setup; the acronyms defined below are printed by \printglossaries near the end of this file
4 \newacronym{ANN}{ANN}{Artificial Neural Network}
5 \newacronym{DCT}{DCT}{Discrete Cosine Transform}
6 \newacronym{DHMM}{DHMM}{Duration-explicit \acrlong{HMM}} % long form embeds the HMM entry defined further down
7 \newacronym{FA}{FA}{Forced alignment}
8 \newacronym{GMM}{GMM}{Gaussian Mixture Models} % NOTE(review): long form is already plural, unlike the other entries; \glspl{GMM} may render a doubled plural -- confirm
9 \newacronym{HMM}{HMM}{Hidden Markov Model}
10 \newacronym{HTK}{HTK}{\acrlong{HMM} Toolkit} % long form embeds the HMM entry defined on the previous line
11 \newacronym{IFPI}{IFPI}{International Federation of the Phonographic Industry}
12 \newacronym{LPCC}{LPCC}{\acrlong{LPC} derived cepstrum} % long form embeds the LPC entry defined on the next line
13 \newacronym{LPC}{LPC}{Linear Prediction Coefficients}
14 \newacronym{MFCC}{MFCC}{\acrlong{MFC} coefficient} % long form embeds the MFC entry defined on the next line
15 \newacronym{MFC}{MFC}{Mel-frequency cepstrum}
16 \newacronym{MLP}{MLP}{Multi-layer Perceptron}
17 \newacronym{PLP}{PLP}{Perceptual Linear Prediction}
18 \newacronym{PPF}{PPF}{Posterior Probability Features}
19 \newacronym{ZCR}{ZCR}{Zero-crossing Rate}
20 \newacronym{RELU}{ReLU}{Rectified Linear Unit} % label is upper-case RELU, displayed short form is ReLU
21 \newglossaryentry{dm}{name={Death Metal}, % non-acronym glossary term, label `dm`
22 description={is an extreme heavy metal music style with growling vocals and
23 pounding drums}}
24 \newglossaryentry{dom}{name={Doom Metal}, % label `dom`; same wording as `dm` plus "played very slowly"
25 description={is an extreme heavy metal music style with growling vocals and
26 pounding drums played very slowly}}
27 \newglossaryentry{FT}{name={Fourier Transform}, % label `FT`
28 description={is a technique of converting a time representation signal to a
29 frequency representation}}
30 \newglossaryentry{MS}{name={Mel-Scale}, % label `MS`
31 description={is a human ear inspired scale for spectral signals}}
32 \newglossaryentry{Viterbi}{name={Viterbi}, % label `Viterbi`; its description references the HMM acronym via \gls{HMM}
33 description={is a dynamic programming algorithm for finding the most likely
34 sequence of hidden states in a \gls{HMM}}}
35
36 \begin{document}
37 \frontmatter{} % front matter: title page, todo list and table of contents follow
38
39 \maketitleru[ % title-page macro not defined in this file (presumably supplied by the `asr` format named on the first line) -- confirm
40 course={(Automatic) Speech Recognition},
41 institute={Radboud University Nijmegen},
42 authorstext={Author:},
43 righttextheader={Supervisor:},
44 righttext={Louis ten Bosch},
45 pagenr=1]
46 \listoftodos[Todo] % list of todo notes with `Todo` as its optional argument (presumably the list title, todonotes-style) -- confirm
47
48 \tableofcontents
49
50 \mainmatter{} % main matter: the chapters below
51 %Berenzweig and Ellis use acoustic classifiers from speech recognition as a
52 %detector for singing lines. They achieve 80\% accuracy for forty 15 second
53 %excerpts. They mention people that wrote signal features that discriminate
54 %between speech and music. Neural net
55 %\glspl{HMM}~\cite{berenzweig_locating_2001}.
56 %
57 %In 2014 Dzhambazov et al.\ applied state-of-the-art segmentation methods to
58 %polyphonic Turkish music, this might be interesting to use for heavy metal.
59 %They mention Fujihara (2011) to have a similar \gls{FA} system. This method uses
60 %phone level segmentation, first 12 \gls{MFCC}s. They first do vocal/non-vocal
61 %detection, then melody extraction, then alignment. They compare results with
62 %Mesaros \& Virtanen, 2008~\cite{dzhambazov_automatic_2014}. Later they
63 %specialize in long syllables in a cappella. They use \glspl{DHMM} with
64 %\glspl{GMM} and show that adding knowledge increases alignment (Beijing opera
65 %has long syllables)~\cite{dzhambazov_automatic_2016}.
66 %
67
68
69 %Introduction, leading to a clearly defined research question
70 \chapter{Introduction}
71 \input{intro.tex} % chapter text is read from intro.tex
72
73 \chapter{Methods}
74 \input{methods.tex} % chapter text is read from methods.tex
75
76 \chapter{Conclusion \& Discussion}
77 \input{conclusion.tex} % chapter text is read from conclusion.tex
78
79 %(Appendices)
80 \appendix % everything from here on is appendix material
81 \input{appendices.tex} % appendix content is read from appendices.tex
82
83 \newpage
84 %Glossaries
85 \glsaddall{} % add every defined glossary entry, whether or not it was referenced in the text
86 \begingroup % keep the two \let overrides below local to the glossary output
87 \let\clearpage\relax % make \clearpage a no-op inside this group
88 \let\cleardoublepage\relax % likewise for \cleardoublepage
89 \printglossaries{} % print the glossaries while both page-break commands are disabled
90 \endgroup
91
92 \bibliographystyle{ieeetr}
93 \bibliography{asr} % bibliography database named `asr`
94 \end{document}