From c21ab5607a5f3c2b86b851f19f0a72880ec27e59 Mon Sep 17 00:00:00 2001
From: Mart Lubbers
Date: Thu, 8 Jun 2017 11:06:34 +0200
Subject: [PATCH] process Chris' comments

---
 abstract.tex         |  6 ++++++
 acknowledgements.tex |  1 +
 asr.tex              | 22 +++++++++++++++++++---
 conclusion.tex       |  8 ++++----
 glossaries.tex       | 28 ++++++++++++++++++++++++++++
 intro.tex            | 10 +++++-----
 methods.tex          | 48 ++++++++++++++++++++++++++----------------------
 results.tex          | 28 ++++++++++++++--------------
 8 files changed, 103 insertions(+), 48 deletions(-)
 create mode 100644 abstract.tex
 create mode 100644 acknowledgements.tex

diff --git a/abstract.tex b/abstract.tex
new file mode 100644
index 0000000..888068f
--- /dev/null
+++ b/abstract.tex
@@ -0,0 +1,6 @@
+This thesis presents a method for detecting \emph{singing}-voice segments in
+non-standard music genres. The research applies existing techniques to extreme
+styles like \gls{dm} and \gls{dom} and achieves good performance.
+\emph{Singer}-voice recognition and detection have also been attempted, with
+similarly positive results. This provides a basis for attempting lyrics
+synchronization and lyrics recognition.
diff --git a/acknowledgements.tex b/acknowledgements.tex
new file mode 100644
index 0000000..3eab64c
--- /dev/null
+++ b/acknowledgements.tex
@@ -0,0 +1 @@
+I would like to thank\ldots
diff --git a/asr.tex b/asr.tex
index dc11df3..9834402 100644
--- a/asr.tex
+++ b/asr.tex
@@ -1,7 +1,6 @@
 %&asr
-\usepackage[toc,nonumberlist,acronyms]{glossaries}
+\usepackage[nonumberlist]{glossaries}
 \makeglossaries%
-\input{acronyms}
 \input{glossaries}
 
 \begin{document}
@@ -15,10 +14,24 @@
     righttext={Louis ten Bosch},
     pagenr=1]
 
+%Abstract
+\addcontentsline{toc}{chapter}{Abstract}
+\chapter*{\centering Abstract}
+\begin{quotation}
+    \centering\noindent
+    \input{abstract}
+\end{quotation}
+
+% Acknowledgements
+\addcontentsline{toc}{chapter}{Acknowledgements}
+\chapter*{\centering Acknowledgements}
+\begin{quotation}
+    \centering\it\noindent
+    \input{acknowledgements}
+\end{quotation}
 
 \tableofcontents
 \glsaddall{}
-\printglossaries{}
 
 \mainmatter{}
 
@@ -38,6 +51,9 @@
 \appendix
 \input{appendices}
 
+\addcontentsline{toc}{chapter}{Glossaries \& Acronyms}
+\printglossaries%
+
 \bibliographystyle{ieeetr}
 \bibliography{asr}
 \end{document}
diff --git a/conclusion.tex b/conclusion.tex
index 778b4fc..3adbc05 100644
--- a/conclusion.tex
+++ b/conclusion.tex
@@ -28,8 +28,8 @@
 Therefore the resulting model can be very general. On the other side, it could
 also result in a model that is overfitted the three islands in entire space of
 grunting voices.
-In this case it seems that the model generalizes well. The alien data similar
-to the trainingsdata offered to the model results in a good performance.
+In this case it seems that the model generalizes well. The alien data --- similar
+to the training data --- offered to the model results in good performance.
 However, alien data that has a very different style does not perform as good.
 While testing \emph{Catacombs} the performance was very poor. Adding
 \emph{Catacombs} or a similar style to the training set can probably overcome
@@ -64,8 +64,8 @@ converting the waveforms to \gls{MFCC} can be performed by the neural network.
 The current decorrelation step might be inefficient or unnatural. The \gls{ANN}
 train the weights in such a way that performance is maximized. It would be
 interesting to see whether this results in a different normalization step. The
-downside of this is that training the model is complexer because there are many
-more weights to train.
+downside of this is that training the model is more complex because there are
+many more weights to train.
 
 \paragraph{Genre detection: }
 \emph{Singing}-voice detection and \emph{singer}-voice can be seen as a crude
diff --git a/glossaries.tex b/glossaries.tex
index 38e5895..8d90aeb 100644
--- a/glossaries.tex
+++ b/glossaries.tex
@@ -13,3 +13,31 @@
 \newglossaryentry{Viterbi}{name={Viterbi},
     description={is a dynamic programming algorithm for finding the most
     likely sequence of hidden states in a \gls{HMM}}}
+
+\newcommand{\newglossacr}[2]{\newglossaryentry{#1}{
+    name={#1},
+    first={#2 (#1)},%
+    firstplural={#2\glspluralsuffix{} (#1\glspluralsuffix{})},
+    description={#2}}}
+
+\newglossacr{GADT}{Generalized Algebraic Data Type}
+\newglossacr{ANN}{Artificial Neural Network}
+\newglossacr{DCT}{Discrete Cosine Transform}
+\newglossacr{DHMM}{Duration-explicit Hidden Markov Model}
+\newglossacr{FA}{Forced Alignment}
+\newglossacr{GMM}{Gaussian Mixture Model}
+\newglossacr{HMM}{Hidden Markov Model}
+\newglossacr{HTK}{Hidden Markov Model Toolkit}
+\newglossacr{IFPI}{International Federation of the Phonographic Industry}
+\newglossacr{LPCC}{Linear Prediction Coefficient Derived Cepstrum}
+\newglossacr{LPC}{Linear Prediction Coefficient}
+\newglossacr{MFCC}{Mel-frequency Cepstral Coefficient}
+\newglossacr{MFC}{Mel-frequency Cepstrum}
+\newglossacr{MLP}{Multi-layer Perceptron}
+\newglossacr{PLP}{Perceptual Linear Prediction}
+\newglossacr{PPF}{Posterior Probability Features}
+\newglossacr{ZCR}{Zero-crossing Rate}
+\newglossacr{RELU}{Rectified Linear Unit}
+\newglossacr{CC}{Cannibal Corpse}
+\newglossacr{DG}{Disgorge}
+\newglossacr{WDISS}{Who Dies in Siberian Slush}
diff --git a/intro.tex b/intro.tex
index eb482f1..9e47f0a 100644
--- a/intro.tex
+++ b/intro.tex
@@ -1,7 +1,7 @@
 \section{Introduction}
 The primary medium for music distribution is rapidly changing from physical
-media to digital media. In 2016 the \gls{IFPI} stated that about $43\%$ of
-music revenue arises from digital distribution. Another $39\%$ arises from the
+media to digital media. In 2016 the \gls{IFPI} stated that about $50\%$ of
+music revenue arises from digital distribution. Another $34\%$ arises from the
 physical sale and the remaining $16\%$ is made through performance and
 synchronisation revenues. The overtake of digital formats on physical formats
 took place somewhere in 2015. Moreover, ever since twenty years the music
@@ -14,7 +14,7 @@ available for consumers. Lyrics for tracks are in almost all cases amply
 available. However, a temporal alignment of the lyrics is not and creating it
 involves manual labour.
 
-A lot of the current day musical distribution goes via non-official channels
+A lot of the current day music distribution goes via non-official channels
 such as YouTube\footnote{\url{https://youtube.com}} in which fans of the
 performers often accompany the music with synchronized lyrics. This means that
 there is an enormous treasure of lyrics-annotated music available. However, the
@@ -51,7 +51,7 @@ classify audio in the categories \emph{Music} and \emph{Speech}. They found
 that music has different properties than speech. Music uses a wider spectral
 bandwidth in which events happen. Music contains more tonality and rhythm.
 Multivariate Gaussian classifiers were used to discriminate the categories with
-an average performance of $90\%$~\cite{saunders_real-time_1996}.
+an average accuracy of $90\%$~\cite{saunders_real-time_1996}.
 
 Williams and Ellis were inspired by the aforementioned research and tried to
 separate the singing segments from the instrumental segments~%
@@ -68,7 +68,7 @@ classification and used an \gls{ANN} (\gls{MLP}) using \gls{PLP} coefficients
 to detect a singing voice~\cite{berenzweig_using_2002}. Nwe et al.\ showed that
 there is not much difference in accuracy when using different features founded
 in speech processing. They tested several features and found accuracies differ
-less that a few percent. Moreover, they found that others have tried to tackle
+less than a few percent. Moreover, they found that others have tried to tackle
 the problem using myriads of different approaches such as using \gls{ZCR},
 \gls{MFCC} and \gls{LPCC} as features and \glspl{HMM} or \glspl{GMM} as
 classifiers~\cite{nwe_singing_2004}.
diff --git a/methods.tex b/methods.tex
index 0eed76e..0f694c5 100644
--- a/methods.tex
+++ b/methods.tex
@@ -17,14 +17,14 @@ visible over time.
 
 \begin{figure}[ht]
     \centering
     \includegraphics[width=.7\linewidth]{cement}
-    \caption{A vocal segment of the \acrlong{CC} song
+    \caption{A vocal segment of the Cannibal Corpse song
     \emph{Bloodstained Cement}}\label{fig:bloodstained}
 \end{figure}
 
 \begin{figure}[ht]
     \centering
     \includegraphics[width=.7\linewidth]{abominations}
-    \caption{A vocal segment of the \acrlong{DG} song
+    \caption{A vocal segment of the Disgorge song
     \emph{Enthroned Abominations}}\label{fig:abominations}
 \end{figure}
@@ -49,7 +49,7 @@ performs in several Muscovite bands. This band also stands out because it uses
 piano's and synthesizers. The droning synthesizers often operate in the same
 frequency as the vocals.
 
-Additional detailss about the dataset are listed in Appendix~\ref{app:data}.
+Additional details about the dataset are listed in Appendix~\ref{app:data}.
 The data is labeled as singing and instrumental and labeled per band. The
 distribution for this is shown in Table~\ref{tbl:distribution}.
 \begin{table}[H]
@@ -72,7 +72,7 @@ distribution for this is shown in Table~\ref{tbl:distribution}.
     \caption{Data distribution}\label{tbl:distribution}
 \end{table}
 
-\section{\acrlong{MFCC} Features}
+\section{Mel-frequency Cepstral Features}
 The waveforms in itself are not very suitable to be used as features due to
 the high dimensionality and correlation in the temporal domain. Therefore we
 use the often used \glspl{MFCC} feature vectors which have shown to be suitable in
@@ -102,8 +102,8 @@ created from a waveform incrementally using several steps:
     magnitudes\footnote{Fechner, Gustav Theodor (1860). Elemente der
     Psychophysik}. They found that energy is perceived in logarithmic
     increments. This means that twice the amount of energy does not mean
-    twice the amount of perceived loudness. Therefore we take the log of
-    the energy or amplitude of the \gls{MS} spectrum to closer match the
+    twice the amount of perceived loudness. Therefore we take the logarithm
+    of the energy or amplitude of the \gls{MS} spectrum to closer match the
     human hearing.
 \item The amplitudes of the spectrum are highly correlated and therefore
     the last step is a decorrelation step. \Gls{DCT} is applied on the
@@ -118,9 +118,9 @@
 The $c_0$ is chosen is this example. $c_0$ is the zeroth \gls{MFCC}. It
 represents the overall energy in the \gls{MS}. Another option would be
 $\log{(E)}$ which is the logarithm of the raw energy of the sample.
-\section{\acrlong{ANN}}
+\section{Artificial Neural Network}
 The data is classified using standard \gls{ANN} techniques, namely \glspl{MLP}.
-The classification problems are only binary and four-class so it is
+The classification problems are only binary or four-class, so it is
 interesting to see where the bottleneck lies; how abstract can the abstraction
 be made. The \gls{ANN} is built with the Keras\footnote{\url{https://keras.io}}
 using the TensorFlow\footnote{\url{https://github.com/tensorflow/tensorflow}}
@@ -132,20 +132,24 @@ multiclass classification. The inputs are fully connected to the hidden layer
 which is fully connected too the output layer. The activation function used is
 a \gls{RELU}. The \gls{RELU} function is a monotonic symmetric one-sided
 function that is also known as the ramp function. The definition is given in
-Equation~\ref{eq:relu}. \gls{RELU} was chosen because of its symmetry and
-efficient computation. The activation function between the hidden layer and the
-output layer is the sigmoid function in the case of binary classification, of
-which the definition is shown in Equation~\ref{eq:sigmoid}. The sigmoid is a
-monotonic function that is differentiable on all values of $x$ and always
-yields a non-negative derivative. For the multiclass classification the softmax
-function is used between the hidden layer and the output layer. Softmax is an
-activation function suitable for multiple output nodes. The definition is given
-in Equation~\ref{eq:softmax}.
+Equation~\ref{eq:relu}. \gls{RELU} has the downside that it can leave nodes
+permanently inactive (dead) in a deep network. This is not a problem in this
+network since it only has one hidden layer. \gls{RELU} was also chosen because
+it is cheap to compute and biologically inspired.
+
+The activation function between the hidden layer and the output layer is the
+sigmoid function in the case of binary classification, of which the definition
+is shown in Equation~\ref{eq:sigmoid}. The sigmoid is a monotonic function that
+is differentiable on all values of $x$ and always yields a non-negative
+derivative. For the multiclass classification the softmax function is used
+between the hidden layer and the output layer. Softmax is an activation
+function suitable for multiple output nodes. The definition is given in
+Equation~\ref{eq:softmax}.
 
 The data is shuffled before fed to the network to mitigate the risk of
-overfitting on one album. Every model was trained using $10$ epochs and a
-batch size of $32$. The training set and test set are separated by taking a
-$90\%$ slice of all the data.
+overfitting on one album. Every model was trained using $10$ epochs, which
+means that all training data is offered to the model $10$ times. The training
+set and test set are separated by taking a $90\%$ slice of all the data.
 
 \begin{equation}\label{eq:relu}
     f(x) = \left\{\begin{array}{rcl}
@@ -174,12 +178,12 @@ $90\%$ slice of all the data.
         \includegraphics[width=.8\linewidth]{mcann}
         \caption{Multiclass classifier network architecture}\label{fig:mcann}
     \end{subfigure}
-    \caption{\acrlong{ANN} architectures.}
+    \caption{Artificial Neural Network architectures.}
 \end{figure}
 
 \section{Experimental setup}
 \subsection{Features}
-The thirteen \gls{MFCC} features are used as the input. Th parameters of the
+The thirteen \gls{MFCC} features are used as the input. The parameters of the
 \gls{MFCC} features are varied in window step and window length. The default
 speech processing parameters are tested but also bigger window sizes since
 arguably the minimal size of a singing voice segment is a lot bigger than the
diff --git a/results.tex b/results.tex
index 57f6779..ac27776 100644
--- a/results.tex
+++ b/results.tex
@@ -7,16 +7,16 @@ extraction.
 
 \begin{table}[H]
     \centering
-    \begin{tabular}{rccc}
+    \begin{tabular}{rrccc}
         \toprule
-        & \multicolumn{3}{c}{Parameters (step/length)}\\
-        & 10/25 & 40/100 & 80/200\\
+        & & \multicolumn{3}{c}{Parameters (step/length in ms)}\\
+        & & 10/25 & 40/100 & 80/200\\
         \midrule
         \multirow{4}{*}{Hidden Nodes}
-        & 0.86 (0.34) & 0.87 (0.32) & 0.85 (0.35)\\
-        & 0.87 (0.31) & 0.88 (0.30) & 0.87 (0.32)\\
-        & 0.88 (0.30) & 0.88 (0.31) & 0.88 (0.29)\\
-        & 0.89 (0.28) & 0.89 (0.29) & 0.88 (0.30)\\
+        & 3 & 0.86 (0.34) & 0.87 (0.32) & 0.85 (0.35)\\
+        & 5 & 0.87 (0.31) & 0.88 (0.30) & 0.87 (0.32)\\
+        & 8 & 0.88 (0.30) & 0.88 (0.31) & 0.88 (0.29)\\
+        & 13 & 0.89 (0.28) & 0.89 (0.29) & 0.88 (0.30)\\
         \bottomrule
     \end{tabular}
     \caption{Binary classification results (accuracy (loss))}%
@@ -43,16 +43,16 @@ same metrics are used as in \emph{Singing}-voice detection.
 
 \begin{table}[H]
     \centering
-    \begin{tabular}{rccc}
+    \begin{tabular}{rrccc}
         \toprule
-        & \multicolumn{3}{c}{Parameters (step/length)}\\
-        & 10/25 & 40/100 & 80/200\\
+        & & \multicolumn{3}{c}{Parameters (step/length in ms)}\\
+        & & 10/25 & 40/100 & 80/200\\
         \midrule
        \multirow{4}{*}{Hidden Nodes}
-        & 0.83 (0.48) & 0.82 (0.48) & 0.82 (0.48)\\
-        & 0.85 (0.43) & 0.84 (0.44) & 0.84 (0.44)\\
-        & 0.86 (0.41) & 0.86 (0.39) & 0.86 (0.40)\\
-        & 0.87 (0.37) & 0.87 (0.38) & 0.86 (0.39)\\
+        & 3 & 0.83 (0.48) & 0.82 (0.48) & 0.82 (0.48)\\
+        & 5 & 0.85 (0.43) & 0.84 (0.44) & 0.84 (0.44)\\
+        & 8 & 0.86 (0.41) & 0.86 (0.39) & 0.86 (0.40)\\
+        & 13 & 0.87 (0.37) & 0.87 (0.38) & 0.86 (0.39)\\
         \bottomrule
     \end{tabular}
     \caption{Multiclass classification results (accuracy
-- 
2.20.1
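
The pipeline described in the methods.tex hunks above is small enough to sketch directly. The sketch below is illustrative only and is not the thesis code: the feature-extraction library (python_speech_features), the optimizer and the loss functions are assumptions, while the thirteen MFCCs, the 10 ms/25 ms default windowing, the single fully connected ReLU hidden layer, the sigmoid or softmax output, the 10 training epochs and the 90%/10% train/test split all follow the text of the patch.

# Minimal sketch of the described setup; optimizer, loss and MFCC library are assumptions.
import numpy as np
from scipy.io import wavfile
from python_speech_features import mfcc   # assumed extraction tool; the thesis does not name one
from keras.models import Sequential
from keras.layers import Dense

def extract_features(path, winstep=0.010, winlen=0.025):
    """Thirteen MFCCs per frame; 10 ms step and 25 ms window are the speech-processing defaults."""
    rate, signal = wavfile.read(path)
    return mfcc(signal, samplerate=rate, winstep=winstep, winlen=winlen, numcep=13)

def build_model(hidden_nodes=13, classes=2):
    """One fully connected ReLU hidden layer; sigmoid output for binary, softmax for multiclass."""
    model = Sequential()
    model.add(Dense(hidden_nodes, activation='relu', input_dim=13))
    if classes == 2:
        model.add(Dense(1, activation='sigmoid'))
        loss = 'binary_crossentropy'            # loss and optimizer are assumptions
    else:
        model.add(Dense(classes, activation='softmax'))
        loss = 'categorical_crossentropy'       # expects one-hot labels
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    return model

def train_and_test(frames, labels, model):
    """Shuffle the frames, hold out the last 10% for testing, train for 10 epochs."""
    order = np.random.permutation(len(frames))
    frames, labels = frames[order], labels[order]
    split = int(0.9 * len(frames))
    model.fit(frames[:split], labels[:split], epochs=10)
    return model.evaluate(frames[split:], labels[split:])   # returns [loss, accuracy]

With classes=2 and hidden_nodes=13 this corresponds to the strongest binary configuration in the results.tex tables; for the four-class singer task the labels would be one-hot encoded and classes set to 4.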