expand MFCC bit

author Mart Lubbers <mart@martlubbers.net>

Tue, 16 May 2017 08:54:43 +0000 (10:54 +0200)

committer Mart Lubbers <mart@martlubbers.net>

Tue, 16 May 2017 08:54:43 +0000 (10:54 +0200)
author Mart Lubbers <mart@martlubbers.net>
Tue, 16 May 2017 08:54:43 +0000 (10:54 +0200)
committer Mart Lubbers <mart@martlubbers.net>
Tue, 16 May 2017 08:54:43 +0000 (10:54 +0200)
diff --git a/asr.pre b/asr.pre

index 58737a7..267e8b8 100644 (file)
--- a/asr.pre
+++ b/asr.pre
@@ -14,6 +14,16 @@
  
  \graphicspath{{img/}}
  
+\pdfstringdefDisableCommands{%
+       \def\acrlong#1{}%
+       \def\acrshort#1{}%
+       \def\acrfull#1{}%
+       \def\gls#1{}%
+       \def\glspl#1{}%
+       \def\Gls#1{}%
+       \def\Glspl#1{}%
+}
+
  \urlstyle{same}
  \hypersetup{%
         pdftitle={Singing voice detection in Death Metal music},
diff --git a/asr.tex b/asr.tex

index 9cdbaf5..22b567e 100644 (file)
--- a/asr.tex
+++ b/asr.tex
@@ -2,20 +2,21 @@
  \usepackage[toc,nonumberlist,acronyms]{glossaries}
  \makeglossaries%
  \newacronym{ANN}{ANN}{Artificial Neural Network}
-\newacronym{HMM}{HMM}{Hidden Markov Model}
-\newacronym{GMM}{GMM}{Gaussian Mixture Models}
+\newacronym{DCT}{DCT}{Discrete Cosine Transform}
  \newacronym{DHMM}{DHMM}{Duration-explicit \acrlong{HMM}}
-\newacronym{HTK}{HTK}{\acrlong{HMM} Toolkit}
  \newacronym{FA}{FA}{Forced alignment}
-\newacronym{MFC}{MFC}{Mel-frequency cepstrum}
+\newacronym{GMM}{GMM}{Gaussian Mixture Models}
+\newacronym{HMM}{HMM}{Hidden Markov Model}
+\newacronym{HTK}{HTK}{\acrlong{HMM} Toolkit}
+\newacronym{IFPI}{IFPI}{International Federation of the Phonographic Industry}
+\newacronym{LPCC}{LPCC}{\acrlong{LPC} derived cepstrum}
+\newacronym{LPC}{LPC}{Linear Prediction Coefficients}
  \newacronym{MFCC}{MFCC}{\acrlong{MFC} coefficient}
-\newacronym{PPF}{PPF}{Posterior Probability Features}
+\newacronym{MFC}{MFC}{Mel-frequency cepstrum}
  \newacronym{MLP}{MLP}{Multi-layer Perceptron}
  \newacronym{PLP}{PLP}{Perceptual Linear Prediction}
+\newacronym{PPF}{PPF}{Posterior Probability Features}
  \newacronym{ZCR}{ZCR}{Zero-crossing Rate}
-\newacronym{LPC}{LPC}{Linear Prediction Coefficients}
-\newacronym{LPCC}{LPCC}{\acrlong{LPC} derivec cepstrum}
-\newacronym{IFPI}{IFPI}{International Federation of the Phonographic Industry}
  \newglossaryentry{dm}{name={Death Metal},
         description={is an extreme heavy metal music style with growling vocals and
         pounding drums}}
diff --git a/methods.tex b/methods.tex

index c49c249..a15c421 100644 (file)
--- a/methods.tex
+++ b/methods.tex
@@ -46,7 +46,7 @@ Lastly a band from Moscow is chosen bearing the name \emph{Who Dies in
  Siberian Slush}. This band is a little odd compared to the previous \gls{dm}
  bands because they create \gls{dom}. \gls{dom} is characterized by the very
  slow tempo and low tuned guitars. The vocalist has a very characteristic growl
-and performs in several moscovian bands. This band also stands out because it
+and performs in several Muscovite bands. This band also stands out because it
  uses piano's and synthesizers. The droning synthesizers often operate in the
  same frequency as the vocals.
  
@@ -54,7 +54,9 @@ same frequency as the vocals.
  The waveforms in itself are not very suitable to be used as features due to the
  high dimensionality and correlation. Therefore we use the often used
  \glspl{MFCC} feature vectors which has shown to be
-suitable\cite{rocamora_comparing_2007}. The actual conversion is done using the
+suitable\cite{rocamora_comparing_2007}. It has also been found that altering
+the Mel scale to better suit singing does not yield better
+performance\cite{you_comparative_2015}. The actual conversion is done using the
  \emph{python\_speech\_features}%
  \footnote{\url{https://github.com/jameslyons/python_speech_features}} package.
  
@@ -71,12 +73,14 @@ steps.
         \item The standard \gls{FT} gives a spectral representation that has
                 linearly scaled frequencies. This scale is converted to the \gls{MS}
                 using triangular overlapping windows.
-       \item
+       \item The logarithm of the Mel frequencies is taken. This step is inspired by the
+               \emph{Weber-Fechner} law that describes how humans perceive physical
+               magnitudes.\footnote{Fechner, Gustav Theodor (1860). \emph{Elemente der
+               Psychophysik}.}
+       \item To decorrelate the signal, a \gls{DCT} is applied. The \gls{MFCC}
+               features are then the amplitudes of the resulting spectrum.
  \end{enumerate}
  
-
-\todo{Explain why MFCC and which parameters}
-
  \section{\gls{ANN} Classifier}
  \todo{Spectrals might be enough, no decorrelation}
author	Mart Lubbers <mart@martlubbers.net>
	Tue, 16 May 2017 08:54:43 +0000 (10:54 +0200)
committer	Mart Lubbers <mart@martlubbers.net>
	Tue, 16 May 2017 08:54:43 +0000 (10:54 +0200)
asr.pre		patch \| blob \| history
asr.tex		patch \| blob \| history
methods.tex		patch \| blob \| history