From de289a80233dc1a484ef989e4b08f5008c3a893e Mon Sep 17 00:00:00 2001
From: Mart Lubbers
Date: Fri, 26 May 2017 14:23:19 +0200
Subject: [PATCH] add results and update methods

---
 asr.pre     |   2 +
 asr.tex     |   3 +-
 intro.tex   |   2 -
 methods.tex | 132 +++++++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 124 insertions(+), 15 deletions(-)

diff --git a/asr.pre b/asr.pre
index 267e8b8..604494a 100644
--- a/asr.pre
+++ b/asr.pre
@@ -11,6 +11,8 @@
 \usepackage{todonotes}  % Todo's
 \usepackage{float}      % Floating tables
 \usepackage{csquotes}   % Typeset quotes
+\usepackage{subcaption} % Subfigures and captions
+\usepackage{multirow}   % Multirow tables
 
 \graphicspath{{img/}}
 
diff --git a/asr.tex b/asr.tex
index a051094..273cdfa 100644
--- a/asr.tex
+++ b/asr.tex
@@ -17,6 +17,7 @@
 \newacronym{PLP}{PLP}{Perceptual Linear Prediction}
 \newacronym{PPF}{PPF}{Posterior Probability Features}
 \newacronym{ZCR}{ZCR}{Zero-crossing Rate}
+\newacronym{RELU}{ReLU}{Rectified Linear Unit}
 \newglossaryentry{dm}{name={Death Metal},
 	description={is an extreme heavy metal music style with growling
 		vocals and pounding drums}}
@@ -27,7 +28,7 @@
 	description={is a technique of converting a time representation
 		signal to a frequency representation}}
 \newglossaryentry{MS}{name={Mel-Scale},
-	description={is a human ear inspired scale for spectral signals.}}
+	description={is a human ear inspired scale for spectral signals}}
 \newglossaryentry{Viterbi}{name={Viterbi},
 	description={is a dynamic programming algorithm for finding the most
 		likely sequence of hidden states in a \gls{HMM}}}
diff --git a/intro.tex b/intro.tex
index 10be98f..a64a9b2 100644
--- a/intro.tex
+++ b/intro.tex
@@ -42,8 +42,6 @@ tenth century\cite{friis_vikings_2004}:
 	howling, only more untamed.
 \end{displayquote}
 
-\section{\gls{dm}}
-
 %Literature overview / related work
 \section{Related work}
 Applying speech related processing and classification techniques on music
diff --git a/methods.tex b/methods.tex
index 078ebbf..9478b87 100644
--- a/methods.tex
+++ b/methods.tex
@@ -68,7 +68,7 @@ The training and test data is divided as follows:
 	\end{tabular}
 \end{table}
 
-\section{\gls{MFCC} Features}
+\section{\acrlong{MFCC} Features}
 The waveforms in themselves are not very suitable to be used as features
 due to the high dimensionality and correlation. Therefore we use the
 widely used \glspl{MFCC} feature vectors, which have been shown to be
@@ -107,29 +107,137 @@ built incrementally in several steps.
 		functions.
 \end{enumerate}
 
-\section{\gls{ANN} Classifier}
-\todo{Spectrals might be enough, no decorrelation}
+The default number of \gls{MFCC} parameters is twelve. However, often a
+thirteenth value is added that represents the energy in the frame.
+
+\section{Experimental setup}
+\subsection{Features}
+All thirteen \gls{MFCC} features are fed to the classifier. The
+parameters of the \gls{MFCC} features are varied in window step and
+window length. Besides the default speech processing parameters, bigger
+window sizes are tested as well, since the minimal duration of a singing
+voice segment is arguably much larger than that of the subphone
+components for which the default parameters are tuned.
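+
+As an illustration, the following is a minimal sketch of the feature
+extraction for the parameter settings tabulated below, using the
+\texttt{python\_speech\_features} package. The package choice, the input
+file name and the mono mixdown are assumptions made for illustration;
+the text does not specify the extraction tool.
+
+\begin{verbatim}
+from scipy.io import wavfile
+from python_speech_features import mfcc
+
+# (step, length) in seconds, as tabulated below
+SETTINGS = [(0.010, 0.025), (0.040, 0.100), (0.080, 0.200)]
+
+rate, signal = wavfile.read('song.wav')  # hypothetical input file
+if signal.ndim > 1:                      # mix stereo down to mono
+    signal = signal.mean(axis=1)
+
+for step, length in SETTINGS:
+    # The FFT size must be at least the window length in samples;
+    # round up to the next power of two.
+    nfft = 1
+    while nfft < length * rate:
+        nfft *= 2
+    # numcep=13 with appendEnergy=True yields twelve cepstral
+    # coefficients plus the log frame energy.
+    feats = mfcc(signal, samplerate=rate, winstep=step,
+                 winlen=length, numcep=13, appendEnergy=True,
+                 nfft=nfft)
+    print(feats.shape)                   # (#frames, 13)
+\end{verbatim}
+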
+The parameters chosen are as follows:
+
+\begin{table}[H]
+	\centering
+	\begin{tabular}{lll}
+		\toprule
+		step (ms) & length (ms) & notes\\
+		\midrule
+		10 & 25 & Standard speech processing\\
+		40 & 100 &\\
+		80 & 200 &\\
+		\bottomrule
+	\end{tabular}
+	\caption{\Gls{MFCC} parameter settings}
+\end{table}
 
-\section{Experiments}
 \subsection{\emph{Singing} voice detection}
 The first type of experiment conducted is \emph{Singing} voice
 detection. This is the act of segmenting an audio signal into segments
 that are labeled either as \emph{Singing} or as \emph{Instrumental}. The
 input of the classifier is a feature vector and the output is the
 probability that singing is happening in
-the sample.
-
-\begin{figure}[H]
-	\centering
-	\includegraphics[width=.5\textwidth]{bcann}
-	\caption{Binary classifier network architecture}\label{fig:bcann}
-\end{figure}
+the sample. This results in an \gls{ANN} of the shape described in
+Figure~\ref{fig:bcann}. The input dimension is thirteen and the output
+dimension is one.
 
 \subsection{\emph{Singer} voice detection}
 The second type of experiment conducted is \emph{Singer} voice
 detection. This is the act of segmenting an audio signal into segments
 that are labeled either with the name of the singer or as
 \emph{Instrumental}. The input of the classifier is a feature vector and
 the outputs are probabilities for each of
-the singers and a probability for the instrumental label.
+the singers and a probability for the instrumental label. This results
+in an \gls{ANN} of the shape described in Figure~\ref{fig:mcann}. The
+input dimension is again thirteen and the output dimension is the number
+of categories. The output is one-hot encoded, meaning that the four
+categories are labeled as \texttt{1000, 0100, 0010, 0001}.
+
+\subsection{\acrlong{ANN}}
+The data is classified using standard \gls{ANN} techniques, namely
+\glspl{MLP}. The classification problems are only binary and four-class,
+so it is interesting to see where the bottleneck lies: how compact the
+network can remain while still solving the task. The \gls{ANN} is built
+with Keras\footnote{\url{https://keras.io}} using the
+TensorFlow\footnote{\url{https://github.com/tensorflow/tensorflow}}
+backend, which provides a high-level interface to the underlying
+networks.
+
+The general architecture of the networks is shown in
+Figure~\ref{fig:bcann} and Figure~\ref{fig:mcann} for the binary and the
+multiclass classification respectively. The inputs are fully connected
+to the hidden layer, which is fully connected to the output layer. The
+activation function used in the hidden layer is a \gls{RELU}. The
+\gls{RELU} function is a monotonic one-sided function that is also known
+as the ramp function; its definition is given in Equation~\ref{eq:relu}.
+\gls{RELU} was chosen because of its simplicity and efficient
+computation. The activation function between the hidden layer and the
+output layer is the sigmoid function in the case of binary
+classification, of which the definition is shown in
+Equation~\ref{eq:sigmoid}. The sigmoid is a monotonic function that is
+differentiable for all values of $x$ and always yields a positive
+derivative. For the multiclass classification the softmax function is
+used between the hidden layer and the output layer. Softmax is an
+activation function suitable for multiple output nodes because it
+normalizes the outputs into a probability distribution. Its definition
+is given in Equation~\ref{eq:softmax}.
+
+The data is shuffled before being fed to the network to mitigate the
+risk of overfitting on one album.
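+
+In Keras the two architectures can be expressed compactly. The following
+is a minimal sketch; the hidden layer width, the optimizer and the loss
+functions are assumptions made for illustration, since the text only
+fixes the input and output dimensions and the activation functions.
+
+\begin{verbatim}
+from keras.models import Sequential
+from keras.layers import Dense
+
+# Binary variant: thirteen inputs, one hidden layer, sigmoid output.
+binary = Sequential()
+binary.add(Dense(13, activation='relu', input_dim=13))  # width assumed
+binary.add(Dense(1, activation='sigmoid'))
+binary.compile(optimizer='adam', loss='binary_crossentropy',
+               metrics=['accuracy'])
+
+# Multiclass variant: softmax over the singers plus the
+# instrumental label (four one-hot encoded categories).
+multi = Sequential()
+multi.add(Dense(13, activation='relu', input_dim=13))
+multi.add(Dense(4, activation='softmax'))
+multi.compile(optimizer='adam', loss='categorical_crossentropy',
+              metrics=['accuracy'])
+
+# x: shuffled MFCC frames, y: labels; regimen as stated in the text.
+# binary.fit(x, y, epochs=10, batch_size=32)
+\end{verbatim}
+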
+Every model was trained for $10$ epochs with a batch size of $32$.
+
+\begin{equation}\label{eq:relu}
+	f(x) = \left\{\begin{array}{rcl}
+		0 & \text{for} & x < 0\\
+		x & \text{for} & x \geq 0\\
+	\end{array}\right.
+\end{equation}
+
+\begin{equation}\label{eq:sigmoid}
+	f(x) = \frac{1}{1+e^{-x}}
+\end{equation}
+
+\begin{equation}\label{eq:softmax}
+	\delta{(\boldsymbol{z})}_j = \frac{e^{z_j}}{\sum\limits^{K}_{k=1}e^{z_k}}
+\end{equation}
+
+\begin{figure}[H]
+	\begin{subfigure}{.5\textwidth}
+		\centering
+		\includegraphics[width=.8\linewidth]{bcann}
+		\caption{Binary classifier network architecture}\label{fig:bcann}
+	\end{subfigure}%
+	\begin{subfigure}{.5\textwidth}
+		\centering
+		\includegraphics[width=.8\linewidth]{mcann}
+		\caption{Multiclass classifier network architecture}\label{fig:mcann}
+	\end{subfigure}
+	\caption{\acrlong{ANN} architectures}
+\end{figure}
 
 \section{Results}
+\begin{table}[H]
+	\centering
+	\begin{tabular}{rccc}
+		\toprule
+		& \multicolumn{3}{c}{Parameters (step/length in ms)}\\
+		& 10/25 & 40/100 & 80/200\\
+		\midrule
+		3h & 0.86 (0.34) & 0.87 (0.32) & 0.85 (0.35)\\
+		5h & 0.87 (0.31) & 0.88 (0.30) & 0.87 (0.32)\\
+		8h & 0.88 (0.30) & 0.88 (0.31) & 0.88 (0.29)\\
+		13h & 0.89 (0.28) & 0.89 (0.29) & 0.88 (0.30)\\
+		\bottomrule
+	\end{tabular}
+	\caption{Binary classification results, reported as accuracy (loss)}
+\end{table}
+\begin{table}[H]
+	\centering
+	\begin{tabular}{rccc}
+		\toprule
+		& \multicolumn{3}{c}{Parameters (step/length in ms)}\\
+		& 10/25 & 40/100 & 80/200\\
+		\midrule
+		3h & 0.83 (0.48) & 0.82 (0.48) & 0.82 (0.48)\\
+		5h & 0.85 (0.43) & 0.84 (0.44) & 0.84 (0.44)\\
+		8h & 0.86 (0.41) & 0.86 (0.39) & 0.86 (0.40)\\
+		13h & 0.87 (0.37) & 0.87 (0.38) & 0.86 (0.39)\\
+		\bottomrule
+	\end{tabular}
+	\caption{Multiclass classification results, reported as accuracy (loss)}
+\end{table}
-- 
2.20.1