2 \usepackage[toc,nonumberlist,acronyms
]{glossaries
}
4 \newacronym{ANN
}{ANN
}{Artificial Neural Network
}
5 \newacronym{DCT
}{DCT
}{Discrete Cosine Transform
}
6 \newacronym{DHMM
}{DHMM
}{Duration-explicit
\acrlong{HMM
}}
7 \newacronym{FA
}{FA
}{Forced Alignment
}
8 \newacronym{GMM
}{GMM
}{Gaussian Mixture Model
}
9 \newacronym{HMM
}{HMM
}{Hidden Markov Model
}
10 \newacronym{HTK
}{HTK
}{\acrlong{HMM
} Toolkit
}
11 \newacronym{IFPI
}{IFPI
}{International Federation of the Phonographic Industry
}
12 \newacronym{LPCC
}{LPCC
}{\acrlong{LPC
} derived cepstrum
}
13 \newacronym{LPC
}{LPC
}{Linear Prediction Coefficients
}
14 \newacronym{MFCC
}{MFCC
}{\acrlong{MFC
} coefficient
}
15 \newacronym{MFC
}{MFC
}{Mel-frequency cepstrum
}
16 \newacronym{MLP
}{MLP
}{Multi-layer Perceptron
}
17 \newacronym{PLP
}{PLP
}{Perceptual Linear Prediction
}
18 \newacronym{PPF
}{PPF
}{Posterior Probability Features
}
19 \newacronym{ZCR
}{ZCR
}{Zero-crossing Rate
}
20 \newacronym{RELU
}{ReLU
}{Rectified Linear Unit
}
21 \newglossaryentry{dm
}{name=
{Death Metal
},
22 description=
{is an extreme heavy metal music style with growling vocals and
24 \newglossaryentry{dom
}{name=
{Doom Metal
},
25 description=
{is an extreme heavy metal music style with growling vocals and
26 pounding drums played very slowly
}}
27 \newglossaryentry{FT
}{name=
{Fourier Transform
},
28 description=
{is a technique of converting a time representation signal to a
29 frequency representation
}}
30 \newglossaryentry{MS
}{name=
{Mel-Scale
},
31 description=
{is a human ear inspired scale for spectral signals
}}
32 \newglossaryentry{Viterbi
}{name=
{Viterbi
},
33 description=
{is a dynamic programming algorithm for finding the most likely
34 sequence of hidden states in a
\gls{HMM
}}}
40 course=
{(Automatic) Speech Recognition
},
41 institute=
{Radboud University Nijmegen
},
42 authorstext=
{Author:
},
43 righttextheader=
{Supervisor:
},
44 righttext=
{Louis ten Bosch
},
51 %Berenzweig and Ellis use acoustic classifiers from speech recognition as a
52 %detector for singing lines. They achieve 80\% accuracy for forty 15-second
53 %excerpts. They mention people that wrote signal features that discriminate
54 %between speech and music. Neural net
55 %\glspl{HMM}~\cite{berenzweig_locating_2001}.
57 %In 2014 Dzhambazov et al.\ applied state of the art segmentation methods to
58 %polyphonic Turkish music; this might be interesting to use for heavy metal.
59 %They mention that Fujihara (2011) has a similar \gls{FA} system. This method uses
60 %phone level segmentation, first 12 \gls{MFCC}s. They first do vocal/non-vocal
61 %detection, then melody extraction, then alignment. They compare results with
62 %Mesaros \& Virtanen, 2008~\cite{dzhambazov_automatic_2014}. Later they
63 %specialize in long syllables in a cappella. They use \glspl{DHMM} with
64 %\glspl{GMM} and show that adding knowledge increases alignment (Beijing opera
65 %has long syllables)~\cite{dzhambazov_automatic_2016}.
69 %Introduction, leading to a clearly defined research question
70 \chapter{Introduction
}
76 \chapter{Conclusion \& Discussion
}
77 \input{conclusion.tex
}
81 \input{appendices.tex
}
88 \let\cleardoublepage\relax
92 \bibliographystyle{ieeetr
}