-@book{muller_multimodal_2012,
+
+@incollection{muller_lyrics--audio_2012,
address = {Wadern},
- title = {Multimodal {Music} {Processing}},
+ title = {Lyrics-to-{Audio} {Alignment} and its {Application}},
isbn = {978-3-939897-37-8},
url = {http://nbn-resolving.de/urn:nbn:de:0030-drops-27851},
language = {English},
urldate = {2017-03-02},
+ booktitle = {Multimodal {Music} {Processing}},
publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik GmbH},
- author = {Müller, Meinard and Goto, Masataka and Schedl, Markus},
+ author = {Goto, Masataka and Fujihara, Hiromasa},
+ editor = {Müller, Meinard},
year = {2012},
note = {OCLC: 864001691},
+ pages = {23--36},
file = {3.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/A4ZSSMW5/3.pdf:application/pdf}
}
pages = {200--210},
file = {untitled - 05876304.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/MM4NI9SJ/05876304.pdf:application/pdf}
}
+
+@mastersthesis{tsatsishvili_automatic_2011,
+ author = {Tsatsishvili, Valeri},
+ title = {Automatic subgenre classification of heavy metal music},
+ school = {University of Jyv{\"a}skyl{\"a}},
+ year = {2011},
+ url = {https://jyx.jyu.fi/dspace/handle/123456789/37227},
+ urldate = {2017-03-06},
+ file = {AUTOMATIC-SUBGENRE-CLASSIFICATION-OF-HEAVY-METAL-MUSIC.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/3HTFFPVN/AUTOMATIC-SUBGENRE-CLASSIFICATION-OF-HEAVY-METAL-MUSIC.pdf:application/pdf}
+}
+
+@inproceedings{sturm_survey_2012,
+ author = {Sturm, Bob L},
+ title = {A survey of evaluation in music genre recognition},
+ booktitle = {International {Workshop} on {Adaptive} {Multimedia} {Retrieval}},
+ pages = {29--66},
+ publisher = {Springer},
+ year = {2012},
+ file = {Sturm20121212.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/6MJKTRHE/Sturm20121212.pdf:application/pdf}
+}
+
+@article{you_comparative_2015,
+ author = {You, Shingchern D. and Wu, Yi-Chung and Peng, Shih-Hsien},
+ title = {Comparative study of singing voice detection methods},
+ journal = {Multimedia Tools and Applications},
+ year = {2015},
+ month = aug,
+ doi = {10.1007/s11042-015-2894-9},
+ url = {http://link.springer.com/10.1007/s11042-015-2894-9},
+ urldate = {2017-03-06},
+ issn = {1380-7501, 1573-7721},
+ language = {en},
+ file = {11042_2015_2894_Article 1..16 - you2015.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/QQIS2H44/you2015.pdf:application/pdf}
+}
+
+@article{boersma_praat_2002,
+ author = {Boersma, Paulus Petrus Gerardus and others},
+ title = {Praat, a system for doing phonetics by computer},
+ journal = {Glot International},
+ volume = {5},
+ year = {2002}
+}
\ No newline at end of file
%&asr
\usepackage[nonumberlist,acronyms]{glossaries}
\makeglossaries%
+\newacronym{ANN}{ANN}{Artificial Neural Network}
\newacronym{HMM}{HMM}{Hidden Markov Model}
\newacronym{GMM}{GMM}{Gaussian Mixture Model}
\newacronym{DHMM}{DHMM}{Duration-explicit \acrlong{HMM}}
\newacronym{FA}{FA}{Forced alignment}
\newacronym{MFC}{MFC}{Mel-frequency cepstrum}
\newacronym{MFCC}{MFCC}{\acrlong{MFC} coefficient}
-%\newglossaryentry{mTask}{name=mTask,
-% description={is an abstraction for \glspl{Task} living on \acrshort{IoT} devices}}
+\newglossaryentry{dm}{name={Death Metal},
+ description={is an extreme heavy metal music style with growling vocals and
+ pounding drums}}
\begin{document}
-%Titlepage
+\frontmatter{}
+
\maketitleru[
course={(Automatic) Speech Recognition},
institute={Radboud University Nijmegen},
- authorstext={Author:}]
+ authorstext={Author:},
+ pagenr=1]
\listoftodos[Todo]
\tableofcontents
\glsaddall{}
\printglossaries%
+\mainmatter{}
Berenzweig and Ellis use acoustic classifiers from speech recognition as a
detector for singing lines. They achieve 80\% accuracy for forty 15-second
excerpts. They mention people that wrote signal features that discriminate
In 2014 Dzhambazov et al.\ applied state-of-the-art segmentation methods to
polyphonic Turkish music; this might be interesting to use for heavy metal.
-They mention Fujihara(2011) to have a similar \gls{FA} system. This method uses
+They mention Fujihara (2011) to have a similar \gls{FA} system. This method uses
phone level segmentation, first 12 \gls{MFCC}s. They first do vocal/non-vocal
detection, then melody extraction, then alignment. They compare results with
Mesaros \& Virtanen, 2008~\cite{dzhambazov_automatic_2014}. Later they
%Introduction, leading to a clearly defined research question
\chapter{Introduction}
\section{Introduction}
+Music is a leading type of data distributed on the internet. Regular music
+distribution is almost entirely digital and services like Spotify and YouTube
+allow one to listen to almost any song within a few clicks. Moreover, there are
+myriads of websites offering lyrics of songs.
+
+\todo{explain relevancy, (preprocessing for lyric alignment)}
+
+This leads to the following research question:
+\begin{center}\em%
+ Are standard \gls{ANN}-based techniques for singing voice detection
+ suitable for non-standard musical genres like death metal?
+\end{center}
%Literature overview / related work
\section{Related work}
+Singing/non-singing detection has been a fairly recent topic of interest in
+academia. As recently as 2001, Berenzweig and Ellis~\cite{berenzweig_locating_2001}
+researched singing voice detection instead of the more established topic of
+discerning music from regular speech. In their research
+
\chapter{Methods}
%Methodology
+
%Experiment(s) (set-up, data, results, discussion)
+\section{Data \& Preprocessing}
+To run the experiments we have collected data from several \gls{dm} albums. The
+exact data used is available in Appendix~\ref{app:data}. The albums are
+extracted from the audio CD and converted to a mono channel waveform with the
+correct sample rate using \emph{SoX}\footnote{\url{http://sox.sourceforge.net/}}.
+When the waveforms are finished they are converted to \gls{MFCC} vectors
+using the \emph{python\_speech\_features}%
+\footnote{\url{https://github.com/jameslyons/python_speech_features}} package.
+All these steps combined result in thirteen tab-separated features per line in
+a file for every source file. Every file is annotated using
+Praat~\cite{boersma_praat_2002} where the utterances are manually
+aligned to the audio. Examples of utterances are shown in
+Figures~\ref{fig:bloodstained} and~\ref{fig:abominations}. It is clearly visible that
+within the genre of death metal there are a lot of different spectral patterns
+visible.
+
+\begin{figure}[ht]
+ \centering
+ \includegraphics[width=.7\linewidth]{cement}
+ \caption{A vocal segment of the \emph{Cannibal Corpse} song
+ \emph{Bloodstained Cement}}\label{fig:bloodstained}
+\end{figure}
+
+\begin{figure}[ht]
+ \centering
+ \includegraphics[width=.7\linewidth]{abominations}
+ \caption{A vocal segment of the \emph{Disgorge} song
+ \emph{Enthroned Abominations}}\label{fig:abominations}
+\end{figure}
+
+The data is collected from two\todo{more in the future}\ studio albums. The first
+band is called \emph{Cannibal Corpse} and has been producing \gls{dm} for almost
+25 years, creating the same type of music on every album. The singer of
+\emph{Cannibal Corpse} has very raspy growls and the lyrics are quite
+comprehensible. The second band is called \emph{Disgorge} and makes even more
+violent music. The growls of the lead singer sound more like a coffee grinder
+and are more shallow. The lyrics are completely incomprehensible and therefore
+some parts are not annotated with lyrics because it was too difficult to hear
+what was being sung.
\chapter{Conclusion \& Discussion}
%Discussion section
%Acknowledgements
%Statement on authors' contributions
%(Appendices)
+\appendix
+\chapter{Experimental data}\label{app:data}
+\begin{table}[h]
+ \centering
+ \begin{tabular}{cllll}
+ \toprule
+ Num. & Artist & Album & Song & Duration\\
+ \midrule
+ 00 & Cannibal Corpse & A Skeletal Domain & High Velocity Impact Spatter & 04:06.91\\
+ 01 & Cannibal Corpse & A Skeletal Domain & Sadistic Embodiment & 03:17.31\\
+ 02 & Cannibal Corpse & A Skeletal Domain & Kill or Become & 03:50.67\\
+ 03 & Cannibal Corpse & A Skeletal Domain & A Skeletal Domain & 03:38.77\\
+ 04 & Cannibal Corpse & A Skeletal Domain & Headlong Into Carnage & 03:01.25\\
+ 05 & Cannibal Corpse & A Skeletal Domain & The Murderer's Pact & 05:05.23\\
+ 06 & Cannibal Corpse & A Skeletal Domain & Funeral Cremation & 03:41.89\\
+ 07 & Cannibal Corpse & A Skeletal Domain & Icepick Lobotomy & 03:16.24\\
+ 08 & Cannibal Corpse & A Skeletal Domain & Vector of Cruelty & 03:25.15\\
+ 09 & Cannibal Corpse & A Skeletal Domain & Bloodstained Cement & 03:41.99\\
+ 10 & Cannibal Corpse & A Skeletal Domain & Asphyxiate to Resuscitate & 03:47.40\\
+ 11 & Cannibal Corpse & A Skeletal Domain & Hollowed Bodies & 03:05.80\\
+ 12 & Disgorge & Parallels of Infinite Torture & Revealed in Obscurity & 05:13.20\\
+ 13 & Disgorge & Parallels of Infinite Torture & Enthroned Abominations & 04:05.39\\
+ 14 & Disgorge & Parallels of Infinite Torture & Atonement & 02:57.36\\
+ 15 & Disgorge & Parallels of Infinite Torture & Abhorrent Desecration of Thee Iniquity & 04:17.20\\
+ 16 & Disgorge & Parallels of Infinite Torture & Forgotten Scriptures & 02:01.72\\
+ 17 & Disgorge & Parallels of Infinite Torture & Descending Upon Convulsive Devourment & 04:38.85\\
+ 18 & Disgorge & Parallels of Infinite Torture & Condemned to Sufferance & 04:57.59\\
+ 19 & Disgorge & Parallels of Infinite Torture & Parallels of Infinite Torture & 05:03.33\\
+ 20 & Disgorge & Parallels of Infinite Torture & Asphyxiation of Thee Oppressed & 05:42.37\\
+ 21 & Disgorge & Parallels of Infinite Torture & Ominous Sigils of Ungodly Ruin & 04:59.15\\
+ \bottomrule
+ \end{tabular}
+ \caption{Songs used in the experiments}
+\end{table}
\bibliographystyle{ieeetr}
\bibliography{asr}