e7c62977fdba7bbd93c512890811a93603603488
[asr1617.git] / asr.tex
1 %&asr
2 \usepackage[nonumberlist,acronyms]{glossaries}
3 \makeglossaries%
4 \newacronym{ANN}{ANN}{Artificial Neural Network}
5 \newacronym{HMM}{HMM}{Hidden Markov Model}
6 \newacronym{GMM}{GMM}{Gaussian Mixture Model}
7 \newacronym{DHMM}{DHMM}{Duration-explicit \acrlong{HMM}}
8 \newacronym{HTK}{HTK}{\acrlong{HMM} Toolkit}
9 \newacronym{FA}{FA}{Forced alignment}
10 \newacronym{MFC}{MFC}{Mel-frequency cepstrum}
11 \newacronym{MFCC}{MFCC}{\acrlong{MFC} coefficient}
12 \newglossaryentry{dm}{name={Death Metal},
13 description={is an extreme heavy metal music style with growling vocals and
14 pounding drums}}
15
16 \begin{document}
17 \frontmatter{}
18
19 \maketitleru[
20 course={(Automatic) Speech Recognition},
21 institute={Radboud University Nijmegen},
22 authorstext={Author:},
23 pagenr=1]
24 \listoftodos[Todo]
25
26 \tableofcontents
27
28 %Glossaries
29 \glsaddall{}
30 \printglossaries%
31
32 \mainmatter{}
33 Berenzweig and Ellis use acoustic classifiers from speech recognition as a
34 detector for singing lines. They achieve 80\% accuracy for forty 15-second
35 excerpts. They mention people that wrote signal features that discriminate
36 between speech and music. Neural net
37 \glspl{HMM}~\cite{berenzweig_locating_2001}.
38
39 In 2014 Dzhambazov et al.\ applied state of the art segmentation methods to
40 polyphonic Turkish music; this might be interesting to use for heavy metal.
41 They mention Fujihara (2011) to have a similar \gls{FA} system. This method uses
42 phone level segmentation, first 12 \glspl{MFCC}. They first do vocal/non-vocal
43 detection, then melody extraction, then alignment. They compare results with
44 Mesaros \& Virtanen, 2008~\cite{dzhambazov_automatic_2014}. Later they
45 specialize in long syllables in a cappella. They use \glspl{DHMM} with
46 \glspl{GMM} and show that adding knowledge increases alignment (Beijing opera
47 has long syllables)~\cite{dzhambazov_automatic_2016}.
48
49 t\cite{fujihara_automatic_2006}
50 t\cite{fujihara_lyricsynchronizer:_2011}
51 t\cite{fujihara_three_2008}
52 t\cite{mauch_integrating_2012}
53 t\cite{mesaros_adaptation_2009}
54 t\cite{mesaros_automatic_2008}
55 t\cite{mesaros_automatic_2010}
56 t\cite{muller_multimodal_2012}
57 t\cite{pedone_phoneme-level_2011}
58 t\cite{yang_machine_2012}
59
60
61 %Introduction, leading to a clearly defined research question
62 \chapter{Introduction}
63 \section{Introduction}
64 Music is a leading type of data distributed on the internet. Regular music
65 distribution is almost entirely digital and services like Spotify and YouTube
66 allow one to listen to almost any song within a few clicks. Moreover, there are
67 myriads of websites offering lyrics of songs.
68
69 \todo{explain relevancy, (preprocessing for lyric alignment)}
70
71 This leads to the following research question:
72 \begin{center}\em%
73 Are standard \gls{ANN} based techniques for singing voice detection
74 suitable for non-standard musical genres like Death metal?
75 \end{center}
76
77 %Literature overview / related work
78 \section{Related work}
79
80 Singing/non-singing detection has been a fairly recent topic of interest in
81 academia. Just in 2001 Berenzweig and Ellis~\cite{berenzweig_locating_2001}
82 researched singing voice detection instead of the more established topic of
83 discerning music from regular speech. In their research
84
85 \chapter{Methods}
86 %Methodology
87
88 %Experiment(s) (set-up, data, results, discussion)
89 \section{Data \& Preprocessing}
90 To run the experiments we have collected data from several \gls{dm} albums. The
91 exact data used is available in Appendix~\ref{app:data}. The albums are
92 extracted from the audio CD and converted to a mono channel waveform with the
93 correct samplerate using \emph{SoX}\footnote{\url{http://sox.sourceforge.net/}}.
94 When the waveforms are finished they are converted to \glspl{MFCC} vectors
95 using the \emph{python\_speech\_features}%
96 \footnote{\url{https://github.com/jameslyons/python_speech_features}} package.
97 All these steps combined result in thirteen tab separated features per line in
98 a file for every source file. Every file is annotated using
99 Praat~\cite{boersma_praat_2002} where the utterances are manually
100 aligned to the audio. Examples of utterances are shown in
101 Figures~\ref{fig:bloodstained} and~\ref{fig:abominations}. It is clear that
102 within the genre of death metal there are a lot of different spectral patterns
103 visible.
104
105 \begin{figure}[ht]
106 \centering
107 \includegraphics[width=.7\linewidth]{cement}
108 \caption{A vocal segment of the \emph{Cannibal Corpse} song
109 \emph{Bloodstained Cement}}\label{fig:bloodstained}
110 \end{figure}
111
112 \begin{figure}[ht]
113 \centering
114 \includegraphics[width=.7\linewidth]{abominations}
115 \caption{A vocal segment of the \emph{Disgorge} song
116 \emph{Enthroned Abominations}}\label{fig:abominations}
117 \end{figure}
118
119 The data is collected from two\todo{more in the future}\ studio albums. The first
120 band is called \emph{Cannibal Corpse} and has been producing \gls{dm} for almost
121 25 years and has been creating the same type of music on every album. The singer of
122 \emph{Cannibal Corpse} has very raspy growls and the lyrics are quite
123 comprehensible. The second band is called \emph{Disgorge} and makes even more
124 violent music. The growls of the lead singer sound more like a coffee grinder
125 and are more shallow. The lyrics are completely incomprehensible and therefore
126 some parts are not annotated with lyrics because it was too difficult to hear
127 what was being sung.
128
129 \chapter{Conclusion \& Discussion}
130 %Discussion section
131 %Conclusion section
132 %Acknowledgements
133 %Statement on authors' contributions
134 %(Appendices)
135 \appendix
136 \chapter{Experimental data}\label{app:data}
137 \begin{table}[h]
138 \centering
139 \begin{tabular}{cllll}
140 \toprule
141 Num. & Artist & Album & Song & Duration\\
142 \midrule
143 00 & Cannibal Corpse & A Skeletal Domain & High Velocity Impact Spatter & 04:06.91\\
144 01 & Cannibal Corpse & A Skeletal Domain & Sadistic Embodiment & 03:17.31\\
145 02 & Cannibal Corpse & A Skeletal Domain & Kill or Become & 03:50.67\\
146 03 & Cannibal Corpse & A Skeletal Domain & A Skeletal Domain & 03:38.77\\
147 04 & Cannibal Corpse & A Skeletal Domain & Headlong Into Carnage & 03:01.25\\
148 05 & Cannibal Corpse & A Skeletal Domain & The Murderer's Pact & 05:05.23\\
149 06 & Cannibal Corpse & A Skeletal Domain & Funeral Cremation & 03:41.89\\
150 07 & Cannibal Corpse & A Skeletal Domain & Icepick Lobotomy & 03:16.24\\
151 08 & Cannibal Corpse & A Skeletal Domain & Vector of Cruelty & 03:25.15\\
152 09 & Cannibal Corpse & A Skeletal Domain & Bloodstained Cement & 03:41.99\\
153 10 & Cannibal Corpse & A Skeletal Domain & Asphyxiate to Resuscitate & 03:47.40\\
154 11 & Cannibal Corpse & A Skeletal Domain & Hollowed Bodies & 03:05.80\\
155 12 & Disgorge & Parallels of Infinite Torture & Revealed in Obscurity & 05:13.20\\
156 13 & Disgorge & Parallels of Infinite Torture & Enthroned Abominations & 04:05.39\\
157 14 & Disgorge & Parallels of Infinite Torture & Atonement & 02:57.36\\
158 15 & Disgorge & Parallels of Infinite Torture & Abhorrent Desecration of Thee Iniquity & 04:17.20\\
159 16 & Disgorge & Parallels of Infinite Torture & Forgotten Scriptures & 02:01.72\\
160 17 & Disgorge & Parallels of Infinite Torture & Descending Upon Convulsive Devourment & 04:38.85\\
161 18 & Disgorge & Parallels of Infinite Torture & Condemned to Sufferance & 04:57.59\\
162 19 & Disgorge & Parallels of Infinite Torture & Parallels of Infinite Torture & 05:03.33\\
163 20 & Disgorge & Parallels of Infinite Torture & Asphyxiation of Thee Oppressed & 05:42.37\\
164 21 & Disgorge & Parallels of Infinite Torture & Ominous Sigils of Ungodly Ruin & 04:59.15\\
165 \bottomrule
166 \end{tabular}
167 \caption{Songs used in the experiments}
168 \end{table}
169
170 \bibliographystyle{ieeetr}
171 \bibliography{asr}
172 \end{document}