From: Mart Lubbers Date: Tue, 16 May 2017 08:54:43 +0000 (+0200) Subject: expand MFCC bit X-Git-Url: https://git.martlubbers.net/?a=commitdiff_plain;h=22a1460dd86824b96092d9a8aea420a50d30c221;p=asr1617.git expand MFCC bit --- diff --git a/asr.pre b/asr.pre index 58737a7..267e8b8 100644 --- a/asr.pre +++ b/asr.pre @@ -14,6 +14,16 @@ \graphicspath{{img/}} +\pdfstringdefDisableCommands{% + \def\acrlong#1{}% + \def\acrshort#1{}% + \def\acrfull#1{}% + \def\gls#1{}% + \def\glspl#1{}% + \def\Gls#1{}% + \def\Glspl#1{}% +} + \urlstyle{same} \hypersetup{% pdftitle={Singing voice detection in Death Metal music}, diff --git a/asr.tex b/asr.tex index 9cdbaf5..22b567e 100644 --- a/asr.tex +++ b/asr.tex @@ -2,20 +2,21 @@ \usepackage[toc,nonumberlist,acronyms]{glossaries} \makeglossaries% \newacronym{ANN}{ANN}{Artificial Neural Network} -\newacronym{HMM}{HMM}{Hidden Markov Model} -\newacronym{GMM}{GMM}{Gaussian Mixture Models} +\newacronym{DCT}{DCT}{Discrete Cosine Transform} \newacronym{DHMM}{DHMM}{Duration-explicit \acrlong{HMM}} -\newacronym{HTK}{HTK}{\acrlong{HMM} Toolkit} \newacronym{FA}{FA}{Forced alignment} -\newacronym{MFC}{MFC}{Mel-frequency cepstrum} +\newacronym{GMM}{GMM}{Gaussian Mixture Models} +\newacronym{HMM}{HMM}{Hidden Markov Model} +\newacronym{HTK}{HTK}{\acrlong{HMM} Toolkit} +\newacronym{IFPI}{IFPI}{International Federation of the Phonographic Industry} +\newacronym{LPCC}{LPCC}{\acrlong{LPC} derived cepstrum} +\newacronym{LPC}{LPC}{Linear Prediction Coefficients} \newacronym{MFCC}{MFCC}{\acrlong{MFC} coefficient} -\newacronym{PPF}{PPF}{Posterior Probability Features} +\newacronym{MFC}{MFC}{Mel-frequency cepstrum} \newacronym{MLP}{MLP}{Multi-layer Perceptron} \newacronym{PLP}{PLP}{Perceptual Linear Prediction} +\newacronym{PPF}{PPF}{Posterior Probability Features} \newacronym{ZCR}{ZCR}{Zero-crossing Rate} -\newacronym{LPC}{LPC}{Linear Prediction Coefficients} -\newacronym{LPCC}{LPCC}{\acrlong{LPC} derivec cepstrum} -\newacronym{IFPI}{IFPI}{International
Federation of the Phonographic Industry} \newglossaryentry{dm}{name={Death Metal}, description={is an extreme heavy metal music style with growling vocals and pounding drums}} diff --git a/methods.tex b/methods.tex index c49c249..a15c421 100644 --- a/methods.tex +++ b/methods.tex @@ -46,7 +46,7 @@ Lastly a band from Moscow is chosen bearing the name \emph{Who Dies in Siberian Slush}. This band is a little odd compared to the previous \gls{dm} bands because they create \gls{dom}. \gls{dom} is characterized by the very slow tempo and low tuned guitars. The vocalist has a very characteristic growl -and performs in several moscovian bands. This band also stands out because it +and performs in several Muscovite bands. This band also stands out because it uses piano's and synthesizers. The droning synthesizers often operate in the same frequency as the vocals. @@ -54,7 +54,9 @@ same frequency as the vocals. The waveforms in itself are not very suitable to be used as features due to the high dimensionality and correlation. Therefore we use the often used \glspl{MFCC} feature vectors which has shown to be -suitable\cite{rocamora_comparing_2007}. The actual conversion is done using the +suitable\cite{rocamora_comparing_2007}. It has also been found that altering +the mel scale to better suit singing does not yield better +performance\cite{you_comparative_2015}. The actual conversion is done using the \emph{python\_speech\_features}% \footnote{\url{https://github.com/jameslyons/python_speech_features}} package. @@ -71,12 +73,14 @@ steps. \item The standard \gls{FT} gives a spectral representation that has linearly scaled frequencies. This scale is converted to the \gls{MS} using triangular overlapping windows. - \item + \item The logarithm of the Mel frequencies is taken. This step is inspired by the + \emph{Weber--Fechner} law that describes how humans perceive physical + magnitudes.\footnote{Fechner, Gustav Theodor (1860).
Elemente der + Psychophysik.} + \item To decorrelate the signal, a \gls{DCT} is applied. The \gls{MFCC} + features are then the amplitudes of the resulting spectrum. \end{enumerate} - -\todo{Explain why MFCC and which parameters} - \section{\gls{ANN} Classifier} \todo{Spectrals might be enough, no decorrelation}