journal = {Glot international},
author = {Boersma, Paulus Petrus Gerardus and {others}},
year = {2002}
+}
+
+@inproceedings{saunders_real-time_1996,
+ title = {Real-time discrimination of broadcast speech/music},
+ volume = {2},
+ booktitle = {Acoustics, {Speech}, and {Signal} {Processing}, 1996. {ICASSP}-96. {Conference} {Proceedings}., 1996 {IEEE} {International} {Conference} on},
+ publisher = {IEEE},
+ author = {Saunders, John},
+ year = {1996},
+ pages = {993--996},
+ file = {REAL-TIME DISCRIMINATION OF BROADCAST SPEECH/MUSIC - Acoustics, Speech, and Signal Processing, 1996. ICASSP-96. Conference Proceedings., 1996 IEEE Inte - saunders_j2.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/H9PG94BN/saunders_j2.pdf:application/pdf}
+}
+
+@inproceedings{scheirer_construction_1997,
+ title = {Construction and evaluation of a robust multifeature speech/music discriminator},
+ volume = {2},
+ booktitle = {Acoustics, {Speech}, and {Signal} {Processing}, 1997. {ICASSP}-97., 1997 {IEEE} {International} {Conference} on},
+ publisher = {IEEE},
+ author = {Scheirer, Eric and Slaney, Malcolm},
+ year = {1997},
+ pages = {1331--1334}
}
\ No newline at end of file
\newacronym{FA}{FA}{Forced alignment}
\newacronym{MFC}{MFC}{Mel-frequency cepstrum}
\newacronym{MFCC}{MFCC}{\acrlong{MFC} coefficient}
+\newacronym{IFPI}{IFPI}{International Federation of the Phonographic Industry}
\newglossaryentry{dm}{name={Death Metal},
description={is an extreme heavy metal music style with growling vocals and
pounding drums}}
\tableofcontents
%Glossaries
-\glsaddall{}
-\printglossaries%
+%\glsaddall{}
+%\printglossaries
\mainmatter{}
-Berenzweig and Ellis use acoustic classifiers from speech recognition as a
-detector for singing lines. They achive 80\% accuracy for forty 15 second
-exerpts. They mention people that wrote signal features that discriminate
-between speech and music. Neural net
-\glspl{HMM}~\cite{berenzweig_locating_2001}.
-
-In 2014 Dzhambazov et al.\ applied state of the art segmentation methods to
-polyphonic turkish music, this might be interesting to use for heavy metal.
-They mention Fujihara (2011) to have a similar \gls{FA} system. This method uses
-phone level segmentation, first 12 \gls{MFCC}s. They first do vocal/non-vocal
-detection, then melody extraction, then alignment. They compare results with
-Mesaros \& Virtanen, 2008~\cite{dzhambazov_automatic_2014}. Later they
-specialize in long syllables in a capella. They use \glspl{DHMM} with
-\glspl{GMM} and show that adding knowledge increases alignment (bejing opera
-has long syllables)~\cite{dzhambazov_automatic_2016}.
-
-t\cite{fujihara_automatic_2006}
-t\cite{fujihara_lyricsynchronizer:_2011}
-t\cite{fujihara_three_2008}
-t\cite{mauch_integrating_2012}
-t\cite{mesaros_adaptation_2009}
-t\cite{mesaros_automatic_2008}
-t\cite{mesaros_automatic_2010}
-t\cite{muller_multimodal_2012}
-t\cite{pedone_phoneme-level_2011}
-t\cite{yang_machine_2012}
+%Berenzweig and Ellis use acoustic classifiers from speech recognition as a
+%detector for singing lines. They achieve 80\% accuracy for forty 15 second
+%excerpts. They mention people that wrote signal features that discriminate
+%between speech and music. Neural net
+%\glspl{HMM}~\cite{berenzweig_locating_2001}.
+%
+%In 2014 Dzhambazov et al.\ applied state of the art segmentation methods to
+%polyphonic Turkish music, this might be interesting to use for heavy metal.
+%They mention Fujihara (2011) to have a similar \gls{FA} system. This method uses
+%phone level segmentation, first 12 \gls{MFCC}s. They first do vocal/non-vocal
+%detection, then melody extraction, then alignment. They compare results with
+%Mesaros \& Virtanen, 2008~\cite{dzhambazov_automatic_2014}. Later they
+%specialize in long syllables in a cappella. They use \glspl{DHMM} with
+%\glspl{GMM} and show that adding knowledge increases alignment (Beijing opera
+%has long syllables)~\cite{dzhambazov_automatic_2016}.
+%
%Introduction, leading to a clearly defined research question
\chapter{Introduction}
\section{Introduction}
-Music is a leading type of data distributed on the internet. Regular music
-distribution is almost entirely digital and services like Spotify and YouTube
-allow one to listen to almost any song within a few clicks. Moreover, there are
-myriads of websites offering lyrics of songs.
-
-\todo{explain relevancy, (preprocessing for lyric alignment)}
+The \gls{IFPI} stated that about $43\%$ of music revenue arises from digital
+distribution. Digital distribution overtook physical formats somewhere in 2015
+and for the past twenty years the music industry has seen significant
+growth.\footnote{\url{http://www.ifpi.org/facts-and-stats.php}}
+
+A lot of this musical distribution goes via non-official channels such as
+YouTube\footnote{\url{https://youtube.com}}, in which fans of the musical group
+accompany the music with synchronized lyrics so that users can sing or read
+along. Because of this interest it is very useful to devise automatic
+techniques for segmenting instrumental and vocal parts of a song and
+applying forced alignment or even lyrics recognition to the audio file.
+
+
+%A majority of the music is not only instrumental but also contains vocal
+%segments.
+%
+%Music is a leading type of data distributed on the internet. Regular music
+%distribution is almost entirely digital and services like Spotify and YouTube
+%allow one to listen to almost any song within a few clicks. Moreover, there are
+%myriads of websites offering lyrics of songs.
+%
+%\todo{explain relevancy, (preprocessing for lyric alignment)}
+%
+%This leads to the following research question:
+%\begin{center}\em%
+% Are standard \gls{ANN} based techniques for singing voice detection
+% suitable for non-standard musical genres like Death metal.
+%\end{center}
+%Literature overview / related work
+\section{Related work}
+The field of applying standard speech processing techniques to music started in
+the late 1990s~\cite{saunders_real-time_1996,scheirer_construction_1997} and it
+was found that music has different discriminating features compared to normal
+speech.
+
+Berenzweig and Ellis expanded on the aforementioned research by trying to
+separate singing from instrumental music~\cite{berenzweig_locating_2001}.
+
+\todo{Incorporate this in literary framing}
+~\cite{fujihara_automatic_2006}
+~\cite{fujihara_lyricsynchronizer:_2011}
+~\cite{fujihara_three_2008}
+~\cite{mauch_integrating_2012}
+~\cite{mesaros_adaptation_2009}
+~\cite{mesaros_automatic_2008}
+~\cite{mesaros_automatic_2010}
+%~\cite{muller_multimodal_2012}
+~\cite{pedone_phoneme-level_2011}
+~\cite{yang_machine_2012}
+
+\section{Research question}
This leads to the following research question:
\begin{center}\em%
Are standard \gls{ANN}-based techniques for singing voice detection
suitable for non-standard musical genres like Death Metal?
\end{center}
-%Literature overview / related work
-\section{Related work}
-
-Singing/non-singing detection has been fairecent topic of interest in the
-academia. Just in 2001 Berenzweig and Ellis~\cite{berenzweig_locating_2001}
-researched singing voice detection in stead of the more founded topic of
-discerning music from regular speech. In their research
-
\chapter{Methods}
%Methodology
some parts are not annotated with lyrics because it was too difficult to hear
what was being sung.
+\section{Methods}
+\todo{To remove in final thesis}
+The initial planning is still up to date. About one and a half albums have been
+annotated and a framework for setting up experiments has been created.
+Moreover, the first exploratory experiments have already been executed and
+look promising. In April the experimental dataset will be expanded and I will
+try to mimic some of the experiments done in the literature to see whether they
+perform similarly on Death Metal.
+\begin{table}[ht]
+ \centering
+ \begin{tabular}{cll}
+ \toprule
+ Month & Description\\
+ \midrule
+ March
+ & Preparing the data\\
+ & Preparing an experiment platform\\
+ & Literature research\\
+ April
+ & Running the experiments\\
+ & Fiddle with parameters\\
+ & Explore the possibilities for forced alignment\\
+ May
+ & Write up the thesis\\
+ & Possibly do forced alignment\\
+ June
+ & Finish up thesis\\
+ & Wrap up\\
+ \bottomrule
+ \end{tabular}
+ \caption{Outline}
+\end{table}
+
+\section{Results}
+
+
\chapter{Conclusion \& Discussion}
%Discussion section
%Conclusion section