From: Mart Lubbers <mart@martlubbers.net>
Date: Tue, 28 Mar 2017 15:44:32 +0000 (+0200)
Subject: started writing up some stuff
X-Git-Url: https://git.martlubbers.net/?a=commitdiff_plain;h=f945aee6ab335b268bf25d476942ea8471916382;p=asr1617.git

started writing up some stuff
---

diff --git a/asr.bib b/asr.bib
index 6ad0fa3..570488f 100644
--- a/asr.bib
+++ b/asr.bib
@@ -1,14 +1,18 @@
-@book{muller_multimodal_2012,
+
+@incollection{muller_lyrics--audio_2012,
 	address = {Wadern},
-	title = {Multimodal {Music} {Processing}},
+	title = {Lyrics-to-{Audio} {Alignment} and its {Application}},
 	isbn = {978-3-939897-37-8},
 	url = {http://nbn-resolving.de/urn:nbn:de:0030-drops-27851},
 	language = {English},
 	urldate = {2017-03-02},
+	booktitle = {Multimodal {Music} {Processing}},
 	publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik GmbH},
-	author = {Müller, Meinard and Goto, Masataka and Schedl, Markus},
+	author = {Goto, Masataka and Fujihara, Hiromasa},
+	editor = {Müller, Meinard},
 	year = {2012},
 	note = {OCLC: 864001691},
+	pages = {23--36},
 	file = {3.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/A4ZSSMW5/3.pdf:application/pdf}
 }
 
@@ -167,3 +171,44 @@
 	pages = {200--210},
 	file = {untitled - 05876304.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/MM4NI9SJ/05876304.pdf:application/pdf}
 }
+
+@book{tsatsishvili_automatic_2011,
+	title = {Automatic subgenre classification of heavy metal music},
+	url = {https://jyx.jyu.fi/dspace/handle/123456789/37227},
+	urldate = {2017-03-06},
+	author = {Tsatsishvili, Valeri},
+	year = {2011},
+	file = {AUTOMATIC-SUBGENRE-CLASSIFICATION-OF-HEAVY-METAL-MUSIC.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/3HTFFPVN/AUTOMATIC-SUBGENRE-CLASSIFICATION-OF-HEAVY-METAL-MUSIC.pdf:application/pdf}
+}
+
+@inproceedings{sturm_survey_2012,
+	title = {A survey of evaluation in music genre recognition},
+	booktitle = {International {Workshop} on {Adaptive} {Multimedia} {Retrieval}},
+	publisher = {Springer},
+	author = {Sturm, Bob L},
+	year = {2012},
+	pages = {29--66},
+	file = {Sturm20121212.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/6MJKTRHE/Sturm20121212.pdf:application/pdf}
+}
+
+@article{you_comparative_2015,
+	title = {Comparative study of singing voice detection methods},
+	issn = {1380-7501, 1573-7721},
+	url = {http://link.springer.com/10.1007/s11042-015-2894-9},
+	doi = {10.1007/s11042-015-2894-9},
+	language = {en},
+	urldate = {2017-03-06},
+	journal = {Multimedia Tools and Applications},
+	author = {You, Shingchern D. and Wu, Yi-Chung and Peng, Shih-Hsien},
+	month = aug,
+	year = {2015},
+	file = {11042_2015_2894_Article 1..16 - you2015.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/QQIS2H44/you2015.pdf:application/pdf}
+}
+
+@article{boersma_praat_2002,
+	title = {Praat, a system for doing phonetics by computer},
+	volume = {5},
+	journal = {Glot international},
+	author = {Boersma, Paulus Petrus Gerardus and others},
+	year = {2002}
+}
\ No newline at end of file
diff --git a/asr.pre b/asr.pre
index c1eaa28..fbb3c42 100644
--- a/asr.pre
+++ b/asr.pre
@@ -1,4 +1,4 @@
-\documentclass[a4paper]{article}
+\documentclass[a4paper]{book}
 
 \usepackage[british]{babel}
 
@@ -7,10 +7,13 @@
 \usepackage{hyperref}                % Hyperlinks
 \usepackage{booktabs}                % Better looking tables
 \usepackage{todonotes}               % Todo's
+\usepackage{float}                   % Floating tables
+
+\graphicspath{{img/}}
 
 \urlstyle{same}
 \hypersetup{%
-	pdftitle={Singing voice detection in Brutal Death Metal music},
+	pdftitle={Singing voice detection in Death Metal music},
 	pdfauthor={Mart Lubbers},
 	pdfsubject={},
 	pdfcreator={Mart Lubbers},
@@ -19,6 +22,6 @@
 	hidelinks=true
 }
 
-\title{Singing voice detection in \emph{Brutal Death Metal} music}
+\title{Singing voice detection in Death Metal music}
 \author{Mart Lubbers}
 \date{\today}
diff --git a/asr.tex b/asr.tex
index 6c900cd..e7c6297 100644
--- a/asr.tex
+++ b/asr.tex
@@ -1,6 +1,7 @@
 %&asr
 \usepackage[nonumberlist,acronyms]{glossaries}
 \makeglossaries%
+\newacronym{ANN}{ANN}{Artificial Neural Network}
 \newacronym{HMM}{HMM}{Hidden Markov Model}
 \newacronym{GMM}{GMM}{Gaussian Mixture Models}
 \newacronym{DHMM}{DHMM}{Duration-explicit \acrlong{HMM}}
@@ -8,15 +9,18 @@
 \newacronym{FA}{FA}{Forced alignment}
 \newacronym{MFC}{MFC}{Mel-frequency cepstrum}
 \newacronym{MFCC}{MFCC}{\acrlong{MFC} coefficient}
-%\newglossaryentry{mTask}{name=mTask,
-%	description={is an abstraction for \glspl{Task} living on \acrshort{IoT} devices}}
+\newglossaryentry{dm}{name={Death Metal},
+	description={is an extreme heavy metal music style with growling vocals and
+	pounding drums}}
 
 \begin{document}
-%Titlepage
+\frontmatter{}
+
 \maketitleru[
 	course={(Automatic) Speech Recognition},
 	institute={Radboud University Nijmegen},
-	authorstext={Author:}]
+	authorstext={Author:},
+	pagenr=1]
 \listoftodos[Todo]
 
 \tableofcontents
@@ -25,6 +29,7 @@
 \glsaddall{}
 \printglossaries%
 
+\mainmatter{}
 Berenzweig and Ellis use acoustic classifiers from speech recognition as a
 detector for singing lines.  They achive 80\% accuracy for forty 15 second
 exerpts. They mention people that wrote signal features that discriminate
@@ -33,7 +38,7 @@ between speech and music. Neural net
 
 In 2014 Dzhambazov et al.\ applied state of the art segmentation methods to
 polyphonic turkish music, this might be interesting to use for heavy metal.
-They mention Fujihara(2011) to have a similar \gls{FA} system. This method uses
+They mention Fujihara (2011) to have a similar \gls{FA} system. This method uses
 phone level segmentation, first 12 \gls{MFCC}s. They first do vocal/non-vocal
 detection, then melody extraction, then alignment. They compare results with
 Mesaros \& Virtanen, 2008~\cite{dzhambazov_automatic_2014}. Later they
@@ -56,13 +61,70 @@ t\cite{yang_machine_2012}
 %Introduction, leading to a clearly defined research question
 \chapter{Introduction}
 \section{Introduction}
+Music is a leading type of data distributed on the internet. Regular music
+distribution is almost entirely digital and services like Spotify and YouTube
+allow one to listen to almost any song within a few clicks. Moreover, there are
+myriads of websites offering lyrics of songs.
+
+\todo{explain relevancy, (preprocessing for lyric alignment)}
+
+This leads to the following research question:
+\begin{center}\em%
+	Are standard \gls{ANN} based techniques for singing voice detection
+	suitable for non-standard musical genres like Death Metal?
+\end{center}
 
 %Literature overview / related work
 \section{Related work}
 
+Singing/non-singing detection has been a fairly recent topic of interest in
+academia. As recently as 2001, Berenzweig and Ellis~\cite{berenzweig_locating_2001}
+researched singing voice detection instead of the more established topic of
+discerning music from regular speech. In their research 
+
 \chapter{Methods}
 %Methodology
+
 %Experiment(s) (set-up, data, results, discussion)
+\section{Data \& Preprocessing}
+To run the experiments we have collected data from several \gls{dm} albums. The
+exact data used is available in Appendix~\ref{app:data}. The albums are
+extracted from the audio CD and converted to a mono channel waveform with the
+correct sample rate using \emph{SoX}\footnote{\url{http://sox.sourceforge.net/}}.
+When the waveforms are finished they are converted to \gls{MFCC} vectors
+using the \emph{python\_speech\_features}%
+\footnote{\url{https://github.com/jameslyons/python_speech_features}} package.
+All these steps combined result in thirteen tab-separated features per line in
+a file for every source file. Every file is annotated using
+Praat~\cite{boersma_praat_2002} where the utterances are manually
+aligned to the audio. Examples of utterances are shown in
+Figures~\ref{fig:bloodstained} and~\ref{fig:abominations}. It is clearly visible
+that within the genre of death metal there are a lot of different spectral
+patterns.
+
+\begin{figure}[ht]
+	\centering
+	\includegraphics[width=.7\linewidth]{cement}
+	\caption{A vocal segment of the \emph{Cannibal Corpse} song
+		\emph{Bloodstained Cement}}\label{fig:bloodstained}
+\end{figure}
+
+\begin{figure}[ht]
+	\centering
+	\includegraphics[width=.7\linewidth]{abominations}
+	\caption{A vocal segment of the \emph{Disgorge} song
+		\emph{Enthroned Abominations}}\label{fig:abominations}
+\end{figure}
+
+The data is collected from two\todo{more in the future}\ studio albums. The first
+band is called \emph{Cannibal Corpse} and has been producing \gls{dm} for almost
+25 years, creating the same type of music on every album. The singer of
+\emph{Cannibal Corpse} has a very raspy growl and the lyrics are quite
+comprehensible. The second band is called \emph{Disgorge} and makes even more
+violent music. The growls of the lead singer sound more like a coffee grinder
+and are more shallow. The lyrics are completely incomprehensible and therefore
+some parts are not annotated with lyrics because it was too difficult to hear
+what was being sung.
 
 \chapter{Conclusion \& Discussion}
 %Discussion section
@@ -70,6 +132,40 @@ t\cite{yang_machine_2012}
 %Acknowledgements
 %Statement on authors' contributions
 %(Appendices)
+\appendix
+\chapter{Experimental data}\label{app:data}
+\begin{table}[ht]
+	\centering
+	\begin{tabular}{cllll}
+		\toprule
+		Num. & Artist & Album & Song & Duration\\
+		\midrule
+		00 & Cannibal Corpse & A Skeletal Domain & High Velocity Impact Spatter & 04:06.91\\
+		01 & Cannibal Corpse & A Skeletal Domain & Sadistic Embodiment & 03:17.31\\
+		02 & Cannibal Corpse & A Skeletal Domain & Kill or Become & 03:50.67\\
+		03 & Cannibal Corpse & A Skeletal Domain & A Skeletal Domain & 03:38.77\\
+		04 & Cannibal Corpse & A Skeletal Domain & Headlong Into Carnage & 03:01.25\\
+		05 & Cannibal Corpse & A Skeletal Domain & The Murderer's Pact & 05:05.23\\
+		06 & Cannibal Corpse & A Skeletal Domain & Funeral Cremation & 03:41.89\\
+		07 & Cannibal Corpse & A Skeletal Domain & Icepick Lobotomy & 03:16.24\\
+		08 & Cannibal Corpse & A Skeletal Domain & Vector of Cruelty & 03:25.15\\
+		09 & Cannibal Corpse & A Skeletal Domain & Bloodstained Cement & 03:41.99\\
+		10 & Cannibal Corpse & A Skeletal Domain & Asphyxiate to Resuscitate & 03:47.40\\
+		11 & Cannibal Corpse & A Skeletal Domain & Hollowed Bodies & 03:05.80\\
+		12 & Disgorge & Parallels of Infinite Torture & Revealed in Obscurity & 05:13.20\\
+		13 & Disgorge & Parallels of Infinite Torture & Enthroned Abominations & 04:05.39\\
+		14 & Disgorge & Parallels of Infinite Torture & Atonement & 02:57.36\\
+		15 & Disgorge & Parallels of Infinite Torture & Abhorrent Desecration of Thee Iniquity & 04:17.20\\
+		16 & Disgorge & Parallels of Infinite Torture & Forgotten Scriptures & 02:01.72\\
+		17 & Disgorge & Parallels of Infinite Torture & Descending Upon Convulsive Devourment & 04:38.85\\
+		18 & Disgorge & Parallels of Infinite Torture & Condemned to Sufferance & 04:57.59\\
+		19 & Disgorge & Parallels of Infinite Torture & Parallels of Infinite Torture & 05:03.33\\
+		20 & Disgorge & Parallels of Infinite Torture & Asphyxiation of Thee Oppressed & 05:42.37\\
+		21 & Disgorge & Parallels of Infinite Torture & Ominous Sigils of Ungodly Ruin & 04:59.15\\
+		\bottomrule
+	\end{tabular}
+	\caption{Songs used in the experiments}
+\end{table}
 
 \bibliographystyle{ieeetr}
 \bibliography{asr}
diff --git a/experiment.tex b/experiment.tex
index 328c089..0e61c21 100644
--- a/experiment.tex
+++ b/experiment.tex
@@ -14,8 +14,8 @@ during growling the regions around $100$Hz have an increased intensity.
 \begin{itemize}
 	\item Sox~\footnote{\url{https://sox.sourceforge.net}} is used to convert
 		the stereo CD audio to mono $44.1Khz$ waveforms
-	\item Using the \texttt{python\_speech\_features}~%
-		\footnote{\url{https://github.com/jameslyons/python_speech_features}}
+	\item Using the \texttt{python\_speech\_features}\footnote{\url{%
+			https://github.com/jameslyons/python_speech_features}}
 		the waveforms are converted to $13$ $MFCC$ cepstrals with the default
 		$25ms$ window every $10ms$.
 	\item The data is matched with the annotated files using
@@ -32,7 +32,7 @@ because the predictions are very noisy. This is probably due to pauses in
 growling. This can easily be smoothed out by not allowing extremely short
 growling segments.
 
-\begin{figure}[h]
+\begin{figure}[ht]
 	\centering
 	\includegraphics[width=.7\linewidth]{cement}
 	\caption{A vocal segment of the \emph{Cannibal Corpse} song
diff --git a/img/abominations.png b/img/abominations.png
new file mode 100644
index 0000000..a649011
Binary files /dev/null and b/img/abominations.png differ