4899bd135279fbcb2704186ea0697f17316d707d
[asr1617.git] / asr.tex
1 %&asr
2 \usepackage[nonumberlist,acronyms]{glossaries}
3 \makeglossaries%
4 \newacronym{ANN}{ANN}{Artificial Neural Network}
5 \newacronym{HMM}{HMM}{Hidden Markov Model}
6 \newacronym{GMM}{GMM}{Gaussian Mixture Model}
7 \newacronym{DHMM}{DHMM}{Duration-explicit \acrlong{HMM}}
8 \newacronym{HTK}{HTK}{\acrlong{HMM} Toolkit}
9 \newacronym{FA}{FA}{Forced alignment}
10 \newacronym{MFC}{MFC}{Mel-frequency cepstrum}
11 \newacronym{MFCC}{MFCC}{\acrlong{MFC} coefficient}
12 \newacronym{IFPI}{IFPI}{International Federation of the Phonographic Industry}
13 \newglossaryentry{dm}{name={Death Metal},
14 description={is an extreme heavy metal music style with growling vocals and
15 pounding drums}}
16
17 \begin{document}
18 \frontmatter{}
19
20 \maketitleru[
21 course={(Automatic) Speech Recognition},
22 institute={Radboud University Nijmegen},
23 authorstext={Author:},
24 pagenr=1]
25 \listoftodos[Todo]
26
27 \tableofcontents
28
29 %Glossaries
30 %\glsaddall{}
31 %\printglossaries
32
33 \mainmatter{}
34 %Berenzweig and Ellis use acoustic classifiers from speech recognition as a
35 %detector for singing lines. They achieve 80\% accuracy for forty 15-second
36 %excerpts. They mention people that wrote signal features that discriminate
37 %between speech and music. Neural net
38 %\glspl{HMM}~\cite{berenzweig_locating_2001}.
39 %
40 %In 2014 Dzhambazov et al.\ applied state of the art segmentation methods to
41 %polyphonic Turkish music, this might be interesting to use for heavy metal.
42 %They mention Fujihara (2011) to have a similar \gls{FA} system. This method uses
43 %phone level segmentation, first 12 \gls{MFCC}s. They first do vocal/non-vocal
44 %detection, then melody extraction, then alignment. They compare results with
45 %Mesaros \& Virtanen, 2008~\cite{dzhambazov_automatic_2014}. Later they
46 %specialize in long syllables in a cappella. They use \glspl{DHMM} with
47 %\glspl{GMM} and show that adding knowledge increases alignment (Beijing opera
48 %has long syllables)~\cite{dzhambazov_automatic_2016}.
49 %
50
51
52 %Introduction, leading to a clearly defined research question
53 \chapter{Introduction}
54 \section{Introduction}
55 The \gls{IFPI} stated that about $43\%$ of music revenue comes from digital
56 distribution. The overtake on physical formats took place somewhere in 2015 and
57 since twenty years the music industry has seen significant
58 growth.\footnote{\url{http://www.ifpi.org/facts-and-stats.php}}
59
60 A lot of this musical distribution goes via non-official channels such as
61 YouTube\footnote{\url{https://youtube.com}}, in which fans of the musical group
62 accompany the music with synchronized lyrics so that users can sing or read
63 along. Because of this interest it is very useful to devise automatic
64 techniques for segmenting instrumental and vocal parts of a song and
65 to apply forced alignment or even lyrics recognition to the audio file.
66
67
68 %A majority of the music is not only instrumental but also contains vocal
69 %segments.
70 %
71 %Music is a leading type of data distributed on the internet. Regular music
72 %distribution is almost entirely digital and services like Spotify and YouTube
73 %allow one to listen to almost any song within a few clicks. Moreover, there are
74 %myriads of websites offering lyrics of songs.
75 %
76 %\todo{explain relevancy, (preprocessing for lyric alignment)}
77 %
78 %This leads to the following research question:
79 %\begin{center}\em%
80 % Are standard \gls{ANN} based techniques for singing voice detection
81 % suitable for non-standard musical genres like Death metal.
82 %\end{center}
83
84 %Literature overview / related work
85 \section{Related work}
86 The field of applying standard speech processing techniques on music started in
87 the late 90s~\cite{saunders_real-time_1996,scheirer_construction_1997} and it
88 was found that music has different discriminating features compared to normal
89 speech.
90
91 Berenzweig and Ellis expanded on the aforementioned research by trying to
92 separate singing from instrumental music~\cite{berenzweig_locating_2001}.
93
94 \todo{Incorporate this in literary framing}
95 ~\cite{fujihara_automatic_2006}
96 ~\cite{fujihara_lyricsynchronizer:_2011}
97 ~\cite{fujihara_three_2008}
98 ~\cite{mauch_integrating_2012}
99 ~\cite{mesaros_adaptation_2009}
100 ~\cite{mesaros_automatic_2008}
101 ~\cite{mesaros_automatic_2010}
102 ~%\cite{muller_multimodal_2012}
103 ~\cite{pedone_phoneme-level_2011}
104 ~\cite{yang_machine_2012}
105
106 \section{Research question}
107 This leads to the following research question:
108 \begin{center}\em%
109 Are standard \gls{ANN}-based techniques for singing voice detection
110 suitable for non-standard musical genres like Death metal?
111 \end{center}
112
113 \chapter{Methods}
114 %Methodology
115
116 %Experiment(s) (set-up, data, results, discussion)
117 \section{Data \& Preprocessing}
118 To run the experiments we have collected data from several \gls{dm} albums. The
119 exact data used is available in Appendix~\ref{app:data}. The albums are
120 extracted from the audio CD and converted to a mono channel waveform with the
121 correct sample rate using \emph{SoX}\footnote{\url{http://sox.sourceforge.net/}}.
122 When the waveforms are finished they are converted to \gls{MFCC} vectors
123 using the \emph{python\_speech\_features}%
124 \footnote{\url{https://github.com/jameslyons/python_speech_features}} package.
125 All these steps combined result in thirteen tab-separated features per line in
126 a file for every source file. Every file is annotated using
127 Praat~\cite{boersma_praat_2002} where the utterances are manually
128 aligned to the audio. Examples of utterances are shown in
129 Figures~\ref{fig:bloodstained} and~\ref{fig:abominations}. It is clearly visible that
130 within the genre of death metal there are a lot of different spectral
131 patterns.
132
133 \begin{figure}[ht]
134 \centering
135 \includegraphics[width=.7\linewidth]{cement}
136 \caption{A vocal segment of the \emph{Cannibal Corpse} song
137 \emph{Bloodstained Cement}}\label{fig:bloodstained}
138 \end{figure}
139
140 \begin{figure}[ht]
141 \centering
142 \includegraphics[width=.7\linewidth]{abominations}
143 \caption{A vocal segment of the \emph{Disgorge} song
144 \emph{Enthroned Abominations}}\label{fig:abominations}
145 \end{figure}
146
147 The data is collected from two\todo{more in the future}\ studio albums. The first
148 band is called \emph{Cannibal Corpse} and has been producing \gls{dm} for almost
149 25 years and has been creating the same type of music on every album. The singer of
150 \emph{Cannibal Corpse} has very raspy growls and the lyrics are quite
151 comprehensible. The second band is called \emph{Disgorge} and makes even more
152 violent music. The growls of the lead singer sound more like a coffee grinder
153 and are more shallow. The lyrics are completely incomprehensible and therefore
154 some parts are not annotated with lyrics because it was too difficult to hear
155 what was being sung.
156
157 \section{Methods}
158 \todo{To remove in final thesis}
159 The initial planning is still up to date. About one and a half albums have been
160 annotated and a framework for setting up experiments has been created.
161 Moreover, the first exploratory experiments have already been executed and are
162 promising. In April the experimental dataset will be expanded and I will try to
163 mimic some of the experiments done in the literature to see whether they perform
164 similarly on Death Metal.
165 \begin{table}[ht]
166 \centering
167 \begin{tabular}{cll}
168 \toprule
169 Month & Description\\
170 \midrule
171 March
172 & Preparing the data\\
173 & Preparing an experiment platform\\
174 & Literature research\\
175 April
176 & Running the experiments\\
177 & Fiddle with parameters\\
178 & Explore the possibilities for forced alignment\\
179 May
180 & Write up the thesis\\
181 & Possibly do forced alignment\\
182 June
183 & Finish up thesis\\
184 & Wrap up\\
185 \bottomrule
186 \end{tabular}
187 \caption{Outline}
188 \end{table}
189
190 \section{Results}
191
192
193 \chapter{Conclusion \& Discussion}
194 %Discussion section
195 %Conclusion section
196 %Acknowledgements
197 %Statement on authors' contributions
198 %(Appendices)
199 \appendix
200 \chapter{Experimental data}\label{app:data}
201 \begin{table}[h]
202 \centering
203 \begin{tabular}{cllll}
204 \toprule
205 Num. & Artist & Album & Song & Duration\\
206 \midrule
207 00 & Cannibal Corpse & A Skeletal Domain & High Velocity Impact Spatter & 04:06.91\\
208 01 & Cannibal Corpse & A Skeletal Domain & Sadistic Embodiment & 03:17.31\\
209 02 & Cannibal Corpse & A Skeletal Domain & Kill or Become & 03:50.67\\
210 03 & Cannibal Corpse & A Skeletal Domain & A Skeletal Domain & 03:38.77\\
211 04 & Cannibal Corpse & A Skeletal Domain & Headlong Into Carnage & 03:01.25\\
212 05 & Cannibal Corpse & A Skeletal Domain & The Murderer's Pact & 05:05.23\\
213 06 & Cannibal Corpse & A Skeletal Domain & Funeral Cremation & 03:41.89\\
214 07 & Cannibal Corpse & A Skeletal Domain & Icepick Lobotomy & 03:16.24\\
215 08 & Cannibal Corpse & A Skeletal Domain & Vector of Cruelty & 03:25.15\\
216 09 & Cannibal Corpse & A Skeletal Domain & Bloodstained Cement & 03:41.99\\
217 10 & Cannibal Corpse & A Skeletal Domain & Asphyxiate to Resuscitate & 03:47.40\\
218 11 & Cannibal Corpse & A Skeletal Domain & Hollowed Bodies & 03:05.80\\
219 12 & Disgorge & Parallels of Infinite Torture & Revealed in Obscurity & 05:13.20\\
220 13 & Disgorge & Parallels of Infinite Torture & Enthroned Abominations & 04:05.39\\
221 14 & Disgorge & Parallels of Infinite Torture & Atonement & 02:57.36\\
222 15 & Disgorge & Parallels of Infinite Torture & Abhorrent Desecration of Thee Iniquity & 04:17.20\\
223 16 & Disgorge & Parallels of Infinite Torture & Forgotten Scriptures & 02:01.72\\
224 17 & Disgorge & Parallels of Infinite Torture & Descending Upon Convulsive Devourment & 04:38.85\\
225 18 & Disgorge & Parallels of Infinite Torture & Condemned to Sufferance & 04:57.59\\
226 19 & Disgorge & Parallels of Infinite Torture & Parallels of Infinite Torture & 05:03.33\\
227 20 & Disgorge & Parallels of Infinite Torture & Asphyxiation of Thee Oppressed & 05:42.37\\
228 21 & Disgorge & Parallels of Infinite Torture & Ominous Sigils of Ungodly Ruin & 04:59.15\\
229 \bottomrule
230 \end{tabular}
231 \caption{Songs used in the experiments}
232 \end{table}
233
234 \bibliographystyle{ieeetr}
235 \bibliography{asr}
236 \end{document}