From 5945b2bce63d92454882cb7c66fb1c8d87c3a271 Mon Sep 17 00:00:00 2001 From: Mart Lubbers Date: Tue, 16 May 2017 10:35:49 +0200 Subject: [PATCH] restructure repository, add literature review --- Makefile | 4 +- appendices.tex | 47 +++ asr.bib | 429 ++++++++++++-------- asr.tex | 258 ++---------- conclusion.tex | 18 + experiment/Makefile | 37 ++ experiment.pre => experiment/experiment.pre | 2 +- experiment.tex => experiment/experiment.tex | 0 intro.tex | 107 +++++ methods.tex | 88 ++++ proposal/Makefile | 37 ++ proposal.pre => proposal/proposal.pre | 0 proposal.tex => proposal/proposal.tex | 0 13 files changed, 610 insertions(+), 417 deletions(-) create mode 100644 appendices.tex create mode 100644 conclusion.tex create mode 100644 experiment/Makefile rename experiment.pre => experiment/experiment.pre (95%) rename experiment.tex => experiment/experiment.tex (100%) create mode 100644 intro.tex create mode 100644 methods.tex create mode 100644 proposal/Makefile rename proposal.pre => proposal/proposal.pre (100%) rename proposal.tex => proposal/proposal.tex (100%) diff --git a/Makefile b/Makefile index bb74780..b5714bc 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -DOCS:=asr proposal experiment +DOCS:=asr GREP?=grep LATEX?=pdflatex BIBTEX?=bibtex @@ -21,8 +21,6 @@ all: $(addsuffix .pdf,$(DOCS)) if $(GREP) -q '\@istfilename' $(basename $<).aux; then $(MAKEGLOSSARIES) $(MAKEGLOSSARIESFLAGSFLAGS) $(basename $<); fi $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog $(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true - $(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true - $(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true $(RM) $(basename $@).mlog clean: $(addprefix clean-,$(DOCS)) diff --git a/appendices.tex b/appendices.tex new file mode 100644 index 0000000..3faffc3 --- /dev/null +++ b/appendices.tex @@ -0,0 +1,47 @@ +\chapter{Experimental data}\label{app:data} +\begin{table}[H] + \centering + \begin{tabular}{cll} + \toprule + Num. 
& Song & Duration\\ + \midrule + \multicolumn{3}{l}{\bf Cannibal Corpse {-} A Skeletal Domain}\\ + 00 & High Velocity Impact Spatter & 04:06.91\\ + 01 & Sadistic Embodiment & 03:17.31\\ + 02 & Kill or Become & 03:50.67\\ + 03 & A Skeletal Domain & 03:38.77\\ + 04 & Headlong Into Carnage & 03:01.25\\ + 05 & The Murderer's Pact & 05:05.23\\ + 06 & Funeral Cremation & 03:41.89\\ + 07 & Icepick Lobotomy & 03:16.24\\ + 08 & Vector of Cruelty & 03:25.15\\ + 09 & Bloodstained Cement & 03:41.99\\ + 10 & Asphyxiate to Resuscitate & 03:47.40\\ + 11 & Hollowed Bodies & 03:05.80\\ + \midrule + \multicolumn{3}{l}{\bf Disgorge {-} Parallels of Infinite Torture}\\ + 12 & Revealed in Obscurity & 05:13.20\\ + 13 & Enthroned Abominations & 04:05.39\\ + 14 & Atonement & 02:57.36\\ + 15 & Abhorrent Desecration of Thee Iniquity & 04:17.20\\ + 16 & Forgotten Scriptures & 02:01.72\\ + 17 & Descending Upon Convulsive Devourment & 04:38.85\\ + 18 & Condemned to Sufferance & 04:57.59\\ + 19 & Parallels of Infinite Torture & 05:03.33\\ + 20 & Asphyxiation of Thee Oppressed & 05:42.37\\ + 21 & Ominous Sigils of Ungodly Ruin & 04:59.15\\ + \midrule + \multicolumn{3}{l}{\bf Who Dies In Siberian Slush {-} Bitterness Of The Years That Are Lost}\\ + 22 & Leave Me & 06:35.60\\ + 23 & The Woman We Are Looking For & 06:53.63\\ + 24 & M\"obius Ring & 07:20.56\\ + 25 & Interlude & 04:26.49\\ + 26 & Завещание Гумилёва & 08:46.76\\ + 27 & An Old Road Through The Snow & 02:31.56\\ + 28 & Bitterness Of The Years That Are Lost & 09:10.49\\ + \midrule + & {\bf Total:} & 02:13:40\\ + \bottomrule + \end{tabular} + \caption{Experimental data} +\end{table} diff --git a/asr.bib b/asr.bib index d505823..e016550 100644 --- a/asr.bib +++ b/asr.bib @@ -1,69 +1,20 @@ -@incollection{muller_lyrics--audio_2012, - address = {Wadern}, - title = {Lyrics-to-{Audio} {Alignment} and its {Application}}, - isbn = {978-3-939897-37-8}, - url = {http://nbn-resolving.de/urn:nbn:de:0030-drops-27851}, - language = {English}, - urldate = {2017-03-02}, - booktitle = {Multimodal {Music} {Processing}}, - publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik GmbH}, - author = {Goto, Masataka and Fujihara, Hiromasa}, - editor = {Müller, Meinard}, - year = {2012}, - note = {OCLC: 864001691}, - pages = {23--36}, - file = {3.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/A4ZSSMW5/3.pdf:application/pdf} -} - -@inproceedings{pedone_phoneme-level_2011, - title = {Phoneme-{Level} {Text} to {Audio} {Synchronization} on {Speech} {Signals} with {Background} {Music}.}, - url = {http://ai2-s2-pdfs.s3.amazonaws.com/7fb2/210b6a9e69ea7ca0e4f496548544781c8a8b.pdf}, - urldate = {2017-03-02}, - booktitle = {{INTERSPEECH}}, - author = {Pedone, Agnes and Burred, Juan José and Maller, Simon and Leveau, Pierre}, - year = {2011}, - pages = {433--436}, - file = {210b6a9e69ea7ca0e4f496548544781c8a8b.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/NQR3WB2S/210b6a9e69ea7ca0e4f496548544781c8a8b.pdf:application/pdf} -} - -@inproceedings{fujihara_automatic_2006, - title = {Automatic synchronization between lyrics and music {CD} recordings based on {Viterbi} alignment of segregated vocal signals}, - url = {http://ieeexplore.ieee.org/abstract/document/4061176/}, - urldate = {2017-03-02}, - booktitle = {Multimedia, 2006. {ISM}'06. 
{Eighth} {IEEE} {International} {Symposium} on}, - publisher = {IEEE}, - author = {Fujihara, Hiromasa and Goto, Masataka and Ogata, Jun and Komatani, Kazunori and Ogata, Tetsuya and Okuno, Hiroshi G.}, - year = {2006}, - pages = {257--264}, - file = {04061176.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/6DU997E4/04061176.pdf:application/pdf} -} - -@inproceedings{mesaros_adaptation_2009, - title = {Adaptation of a speech recognizer for singing voice}, - url = {http://ieeexplore.ieee.org/abstract/document/7077626/}, - urldate = {2017-03-02}, - booktitle = {Signal {Processing} {Conference}, 2009 17th {European}}, - publisher = {IEEE}, - author = {Mesaros, Annamaria and Virtanen, Tuomas}, - year = {2009}, - pages = {1779--1783}, - file = {07077626.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/FN9TWMMJ/07077626.pdf:application/pdf} -} - -@article{mesaros_automatic_2010, - title = {Automatic {Recognition} of {Lyrics} in {Singing}}, - volume = {2010}, - issn = {1687-4714, 1687-4722}, - url = {http://asmp.eurasipjournals.com/content/2010/1/546047}, - doi = {10.1155/2010/546047}, +@article{yang_machine_2012, + title = {Machine {Recognition} of {Music} {Emotion}: {A} {Review}}, + volume = {3}, + issn = {21576904}, + shorttitle = {Machine {Recognition} of {Music} {Emotion}}, + url = {http://dl.acm.org/citation.cfm?doid=2168752.2168754}, + doi = {10.1145/2168752.2168754}, language = {en}, + number = {3}, urldate = {2017-03-02}, - journal = {EURASIP Journal on Audio, Speech, and Music Processing}, - author = {Mesaros, Annamaria and Virtanen, Tuomas}, - year = {2010}, - pages = {1--11}, - file = {art%3A10.1155%2F2010%2F546047.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/3BR5E733/art%3A10.1155%2F2010%2F546047.pdf:application/pdf} + journal = {ACM Transactions on Intelligent Systems and Technology}, + author = {Yang, Yi-Hsuan and Chen, Homer H.}, + month = may, + year = {2012}, + pages = {1--30}, + file = {TST00040.dvi - a40-yang.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/RGP3XNGT/a40-yang.pdf:application/pdf} } @article{dzhambazov_automatic_2016, @@ -72,29 +23,7 @@ urldate = {2017-03-02}, author = {Dzhambazov, Georgi and Yang, Yile and Repetto, Rafael Caro and Serra, Xavier}, year = {2016}, - file = {Automatic Alignment of Long Syllables in a Cappella Beijing Opera - viewcontent.cgi:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/CSCH3FAK/viewcontent.pdf:application/pdf} -} - -@inproceedings{mesaros_automatic_2008, - title = {Automatic alignment of music audio and lyrics}, - url = {http://legacy.spa.aalto.fi/dafx08/papers/dafx08_57.pdf}, - urldate = {2017-03-02}, - booktitle = {Proceedings of the 11th {Int}. {Conference} on {Digital} {Audio} {Effects} ({DAFx}-08)}, - author = {Mesaros, Annamaria and Virtanen, Tuomas}, - year = {2008}, - file = {dafx08_57.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/H24VX8KW/dafx08_57.pdf:application/pdf} -} - -@inproceedings{berenzweig_locating_2001, - title = {Locating singing voice segments within music signals}, - url = {http://ieeexplore.ieee.org/abstract/document/969557/}, - urldate = {2017-03-02}, - booktitle = {Applications of {Signal} {Processing} to {Audio} and {Acoustics}, 2001 {IEEE} {Workshop} on the}, - publisher = {IEEE}, - author = {Berenzweig, Adam L. 
and Ellis, Daniel PW}, - year = {2001}, - pages = {119--122}, - file = {Locating singing voice segments within music signals - Applicationis of Signal Processing to Audio and Acoustics, 2001 IEEE Workshop on the - 00969557.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/DWBBQPDE/00969557.pdf:application/pdf} + file = {Automatic Alignment of Long Syllables in a Cappella Beijing Opera - viewcontent.cgi:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/CSCH3FAK/viewcontent.pdf:application/pdf} } @inproceedings{dzhambazov_automatic_2014, @@ -105,55 +34,50 @@ author = {Dzhambazov, Georgi and Sentürk, Sertan and Serra, Xavier}, year = {2014}, pages = {61--64}, - file = {lyrics-to-audio-FMA_full_paper.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/K7WFQSR8/lyrics-to-audio-FMA_full_paper.pdf:application/pdf} + file = {lyrics-to-audio-FMA_full_paper.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/K7WFQSR8/lyrics-to-audio-FMA_full_paper.pdf:application/pdf} } -@inproceedings{fujihara_three_2008, - title = {Three techniques for improving automatic synchronization between music and lyrics: {Fricative} detection, filler model, and novel feature vectors for vocal activity detection}, - shorttitle = {Three techniques for improving automatic synchronization between music and lyrics}, - url = {http://ieeexplore.ieee.org/abstract/document/4517548/}, - urldate = {2017-03-02}, - booktitle = {Acoustics, {Speech} and {Signal} {Processing}, 2008. {ICASSP} 2008. {IEEE} {International} {Conference} on}, +@inproceedings{kato_acoustic_2013, + title = {Acoustic {Features} and {Auditory} {Impressions} of {Death} {Growl} and {Screaming} {Voice}}, + isbn = {978-0-7695-5120-3}, + url = {http://ieeexplore.ieee.org/document/6846676/}, + doi = {10.1109/IIH-MSP.2013.120}, + urldate = {2017-04-11}, publisher = {IEEE}, - author = {Fujihara, Hiromasa and Goto, Masataka}, - year = {2008}, - pages = {69--72}, - file = {THREE TECHNIQUES FOR IMPROVING AUTOMATIC SYNCHRONIZATION BETWEEN MUSIC AND LYRICS\: FRICATIVE DETECTION, FILLER MODEL, AND NOVEL FEATURE VECTORS FOR VOCAL ACTIVITY DETECTION - 04517548.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/CMGJ32AM/04517548.pdf:application/pdf} + author = {Kato, Keizo and Ito, Akinori}, + month = oct, + year = {2013}, + pages = {460--463}, + file = {Acoustic Features and Auditory Impressions of Death Growl and Screaming Voice - 06846676.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/VAT5AGPP/06846676.pdf:application/pdf} } -@article{yang_machine_2012, - title = {Machine {Recognition} of {Music} {Emotion}: {A} {Review}}, - volume = {3}, - issn = {21576904}, - shorttitle = {Machine {Recognition} of {Music} {Emotion}}, - url = {http://dl.acm.org/citation.cfm?doid=2168752.2168754}, - doi = {10.1145/2168752.2168754}, - language = {en}, - number = {3}, - urldate = {2017-03-02}, - journal = {ACM Transactions on Intelligent Systems and Technology}, - author = {Yang, Yi-Hsuan and Chen, Homer H.}, - month = may, - year = {2012}, - pages = {1--30}, - file = {TST00040.dvi - a40-yang.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/RGP3XNGT/a40-yang.pdf:application/pdf} +@article{boersma_praat_2002, + title = {Praat, a system for doing phonetics by computer}, + volume = {5}, + journal = {Glot international}, + author = {Boersma, Paulus Petrus Gerardus}, + year = {2002} } -@article{fujihara_lyricsynchronizer:_2011, - title = {{LyricSynchronizer}: 
{Automatic} {Synchronization} {System} {Between} {Musical} {Audio} {Signals} and {Lyrics}}, - volume = {5}, - issn = {1932-4553, 1941-0484}, - shorttitle = {{LyricSynchronizer}}, - url = {http://ieeexplore.ieee.org/document/5876296/}, - doi = {10.1109/JSTSP.2011.2159577}, - number = {6}, - urldate = {2017-03-02}, - journal = {IEEE Journal of Selected Topics in Signal Processing}, - author = {Fujihara, Hiromasa and Goto, Masataka and Ogata, Jun and Okuno, Hiroshi G.}, - month = oct, +@inproceedings{leglaive_singing_2015, + title = {Singing voice detection with deep recurrent neural networks}, + url = {http://ieeexplore.ieee.org/abstract/document/7177944/}, + urldate = {2017-04-25}, + booktitle = {Acoustics, {Speech} and {Signal} {Processing} ({ICASSP}), 2015 {IEEE} {International} {Conference} on}, + publisher = {IEEE}, + author = {Leglaive, Simon and Hennequin, Romain and Badeau, Roland}, + year = {2015}, + pages = {121--125}, + file = {SINGING VOICE DETECTION WITH DEEP RECURRENT NEURAL NETWORKS - 07177944.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/5K4JZDXC/07177944.pdf:application/pdf} +} + +@book{tsatsishvili_automatic_2011, + title = {Automatic subgenre classification of heavy metal music}, + url = {https://jyx.jyu.fi/dspace/handle/123456789/37227}, + urldate = {2017-03-06}, + author = {Tsatsishvili, Valeri}, year = {2011}, - pages = {1252--1261}, - file = {untitled - 05876296.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/Q9MQTWHC/05876296.pdf:application/pdf} + file = {AUTOMATIC-SUBGENRE-CLASSIFICATION-OF-HEAVY-METAL-MUSIC.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/3HTFFPVN/AUTOMATIC-SUBGENRE-CLASSIFICATION-OF-HEAVY-METAL-MUSIC.pdf:application/pdf} } @article{mauch_integrating_2012, @@ -169,26 +93,30 @@ month = jan, year = {2012}, pages = {200--210}, - file = {untitled - 05876304.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/MM4NI9SJ/05876304.pdf:application/pdf} + file = {untitled - 05876304.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/MM4NI9SJ/05876304.pdf:application/pdf} } -@book{tsatsishvili_automatic_2011, - title = {Automatic subgenre classification of heavy metal music}, - url = {https://jyx.jyu.fi/dspace/handle/123456789/37227}, - urldate = {2017-03-06}, - author = {Tsatsishvili, Valeri}, - year = {2011}, - file = {AUTOMATIC-SUBGENRE-CLASSIFICATION-OF-HEAVY-METAL-MUSIC.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/3HTFFPVN/AUTOMATIC-SUBGENRE-CLASSIFICATION-OF-HEAVY-METAL-MUSIC.pdf:application/pdf} +@inproceedings{saunders_real-time_1996, + title = {Real-time discrimination of broadcast speech/music}, + volume = {2}, + booktitle = {Acoustics, {Speech}, and {Signal} {Processing}, 1996. {ICASSP}-96. {Conference} {Proceedings}., 1996 {IEEE} {International} {Conference} on}, + publisher = {IEEE}, + author = {Saunders, John}, + year = {1996}, + pages = {993--996}, + file = {REAL-TIME DISCRIMINATION OF BROADCAST SPEECH/MUSIC - Acoustics, Speech, and Signal Processing, 1996. ICASSP-96. 
Conference Proceedings., 1996 IEEE Inte - saunders_j2.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/H9PG94BN/saunders_j2.pdf:application/pdf} } -@inproceedings{sturm_survey_2012, - title = {A survey of evaluation in music genre recognition}, - booktitle = {International {Workshop} on {Adaptive} {Multimedia} {Retrieval}}, - publisher = {Springer}, - author = {Sturm, Bob L}, - year = {2012}, - pages = {29--66}, - file = {Sturm20121212.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/6MJKTRHE/Sturm20121212.pdf:application/pdf} +@inproceedings{nwe_singing_2004, + title = {Singing voice detection in popular music}, + url = {http://dl.acm.org/citation.cfm?id=1027602}, + urldate = {2017-04-25}, + booktitle = {Proceedings of the 12th annual {ACM} international conference on {Multimedia}}, + publisher = {ACM}, + author = {Nwe, Tin Lay and Shenoy, Arun and Wang, Ye}, + year = {2004}, + pages = {324--327}, + file = {p324-nwe.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/HD48B4K8/p324-nwe.pdf:application/pdf} } @article{you_comparative_2015, @@ -202,26 +130,36 @@ author = {You, Shingchern D. and Wu, Yi-Chung and Peng, Shih-Hsien}, month = aug, year = {2015}, - file = {11042_2015_2894_Article 1..16 - you2015.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/QQIS2H44/you2015.pdf:application/pdf} + file = {11042_2015_2894_Article 1..16 - you2015.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/QQIS2H44/you2015.pdf:application/pdf} } -@article{boersma_praat_2002, - title = {Praat, a system for doing phonetics by computer}, - volume = {5}, - journal = {Glot international}, - author = {Boersma, Paulus Petrus Gerardus and {others}}, - year = {2002} +@inproceedings{fujihara_automatic_2006, + title = {Automatic synchronization between lyrics and music {CD} recordings based on {Viterbi} alignment of segregated vocal signals}, + url = {http://ieeexplore.ieee.org/abstract/document/4061176/}, + urldate = {2017-03-02}, + booktitle = {Multimedia, 2006. {ISM}'06. {Eighth} {IEEE} {International} {Symposium} on}, + publisher = {IEEE}, + author = {Fujihara, Hiromasa and Goto, Masataka and Ogata, Jun and Komatani, Kazunori and Ogata, Tetsuya and Okuno, Hiroshi G.}, + year = {2006}, + pages = {257--264}, + file = {04061176.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/6DU997E4/04061176.pdf:application/pdf} } -@inproceedings{saunders_real-time_1996, - title = {Real-time discrimination of broadcast speech/music}, - volume = {2}, - booktitle = {Acoustics, {Speech}, and {Signal} {Processing}, 1996. {ICASSP}-96. {Conference} {Proceedings}., 1996 {IEEE} {International} {Conference} on}, - publisher = {IEEE}, - author = {Saunders, John}, - year = {1996}, - pages = {993--996}, - file = {REAL-TIME DISCRIMINATION OF BROADCAST SPEECH/MUSIC - Acoustics, Speech, and Signal Processing, 1996. ICASSP-96. 
Conference Proceedings., 1996 IEEE Inte - saunders_j2.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/H9PG94BN/saunders_j2.pdf:application/pdf} +@incollection{muller_lyrics--audio_2012, + address = {Wadern}, + title = {Lyrics-to-{Audio} {Alignment} and its {Application}}, + isbn = {978-3-939897-37-8}, + url = {http://nbn-resolving.de/urn:nbn:de:0030-drops-27851}, + language = {English}, + urldate = {2017-03-02}, + booktitle = {Multimodal {Music} {Processing}}, + publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik GmbH}, + author = {Goto, Masataka and Fujihara, Hiromasa}, + editor = {Müller, Meinard}, + year = {2012}, + note = {OCLC: 864001691}, + pages = {23--36}, + file = {3.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/A4ZSSMW5/3.pdf:application/pdf} } @inproceedings{scheirer_construction_1997, @@ -234,18 +172,50 @@ pages = {1331--1334} } -@inproceedings{kato_acoustic_2013, - title = {Acoustic {Features} and {Auditory} {Impressions} of {Death} {Growl} and {Screaming} {Voice}}, - isbn = {978-0-7695-5120-3}, - url = {http://ieeexplore.ieee.org/document/6846676/}, - doi = {10.1109/IIH-MSP.2013.120}, - urldate = {2017-04-11}, +@inproceedings{mesaros_adaptation_2009, + title = {Adaptation of a speech recognizer for singing voice}, + url = {http://ieeexplore.ieee.org/abstract/document/7077626/}, + urldate = {2017-03-02}, + booktitle = {Signal {Processing} {Conference}, 2009 17th {European}}, publisher = {IEEE}, - author = {Kato, Keizo and Ito, Akinori}, - month = oct, - year = {2013}, - pages = {460--463}, - file = {Acoustic Features and Auditory Impressions of Death Growl and Screaming Voice - 06846676.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/VAT5AGPP/06846676.pdf:application/pdf} + author = {Mesaros, Annamaria and Virtanen, Tuomas}, + year = {2009}, + pages = {1779--1783}, + file = {07077626.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/FN9TWMMJ/07077626.pdf:application/pdf} +} + +@inproceedings{fujihara_three_2008, + title = {Three techniques for improving automatic synchronization between music and lyrics: {Fricative} detection, filler model, and novel feature vectors for vocal activity detection}, + shorttitle = {Three techniques for improving automatic synchronization between music and lyrics}, + url = {http://ieeexplore.ieee.org/abstract/document/4517548/}, + urldate = {2017-03-02}, + booktitle = {Acoustics, {Speech} and {Signal} {Processing}, 2008. {ICASSP} 2008. {IEEE} {International} {Conference} on}, + publisher = {IEEE}, + author = {Fujihara, Hiromasa and Goto, Masataka}, + year = {2008}, + pages = {69--72}, + file = {THREE TECHNIQUES FOR IMPROVING AUTOMATIC SYNCHRONIZATION BETWEEN MUSIC AND LYRICS\: FRICATIVE DETECTION, FILLER MODEL, AND NOVEL FEATURE VECTORS FOR VOCAL ACTIVITY DETECTION - 04517548.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/CMGJ32AM/04517548.pdf:application/pdf} +} + +@inproceedings{berenzweig_locating_2001, + title = {Locating singing voice segments within music signals}, + url = {http://ieeexplore.ieee.org/abstract/document/969557/}, + urldate = {2017-03-02}, + booktitle = {Applications of {Signal} {Processing} to {Audio} and {Acoustics}, 2001 {IEEE} {Workshop} on the}, + publisher = {IEEE}, + author = {Berenzweig, Adam L. 
and Ellis, Daniel PW}, + year = {2001}, + pages = {119--122}, + file = {Locating singing voice segments within music signals - Applicationis of Signal Processing to Audio and Acoustics, 2001 IEEE Workshop on the - 00969557.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/DWBBQPDE/00969557.pdf:application/pdf} +} + +@misc{friis_vikings_2004, + title = {Vikings and their {Music}}, + url = {http://www.viking.no/e/life/music/e-musikk-mogens.html}, + urldate = {2017-04-11}, + author = {Friis, Mogens}, + year = {2004}, + file = {Vikings and their Music:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/SEEXI3VR/e-musikk-mogens.html:text/html} } @inproceedings{sakakibara_growl_2004, @@ -255,14 +225,115 @@ booktitle = {Proc. {Int}. {Symp}. on {Musical} {Acoustics}}, author = {Sakakibara, K. and Fuks, Leonardo and Imagawa, Hiroshi and Tayama, Niro and Naganuma, D.}, year = {2004}, - file = {isma04.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/PUFH652B/isma04.pdf:application/pdf} + file = {isma04.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/PUFH652B/isma04.pdf:application/pdf} } -@misc{friis_vikings_2004, - title = {Vikings and their {Music}}, - url = {http://www.viking.no/e/life/music/e-musikk-mogens.html}, - urldate = {2017-04-11}, - author = {Friis, Mogens}, - year = {2004}, - file = {Vikings and their Music:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/SEEXI3VR/e-musikk-mogens.html:text/html} -} \ No newline at end of file +@inproceedings{vembu_separation_2005, + title = {Separation of {Vocals} from {Polyphonic} {Audio} {Recordings}.}, + url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.139.5510&rep=rep1&type=pdf}, + urldate = {2017-04-25}, + booktitle = {{ISMIR}}, + publisher = {Citeseer}, + author = {Vembu, Shankar and Baumann, Stephan}, + year = {2005}, + pages = {337--344}, + file = {ismir05.dvi - download:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/WZ7INPRU/download.pdf:application/pdf} +} + +@inproceedings{pedone_phoneme-level_2011, + title = {Phoneme-{Level} {Text} to {Audio} {Synchronization} on {Speech} {Signals} with {Background} {Music}.}, + url = {http://ai2-s2-pdfs.s3.amazonaws.com/7fb2/210b6a9e69ea7ca0e4f496548544781c8a8b.pdf}, + urldate = {2017-03-02}, + booktitle = {{INTERSPEECH}}, + author = {Pedone, Agnes and Burred, Juan José and Maller, Simon and Leveau, Pierre}, + year = {2011}, + pages = {433--436}, + file = {210b6a9e69ea7ca0e4f496548544781c8a8b.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/NQR3WB2S/210b6a9e69ea7ca0e4f496548544781c8a8b.pdf:application/pdf} +} + +@inproceedings{mesaros_automatic_2008, + title = {Automatic alignment of music audio and lyrics}, + url = {http://legacy.spa.aalto.fi/dafx08/papers/dafx08_57.pdf}, + urldate = {2017-03-02}, + booktitle = {Proceedings of the 11th {Int}. 
{Conference} on {Digital} {Audio} {Effects} ({DAFx}-08)}, + author = {Mesaros, Annamaria and Virtanen, Tuomas}, + year = {2008}, + file = {dafx08_57.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/H24VX8KW/dafx08_57.pdf:application/pdf} +} + +@article{mesaros_automatic_2010, + title = {Automatic {Recognition} of {Lyrics} in {Singing}}, + volume = {2010}, + issn = {1687-4714, 1687-4722}, + url = {http://asmp.eurasipjournals.com/content/2010/1/546047}, + doi = {10.1155/2010/546047}, + language = {en}, + urldate = {2017-03-02}, + journal = {EURASIP Journal on Audio, Speech, and Music Processing}, + author = {Mesaros, Annamaria and Virtanen, Tuomas}, + year = {2010}, + pages = {1--11}, + file = {art%3A10.1155%2F2010%2F546047.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/3BR5E733/art%3A10.1155%2F2010%2F546047.pdf:application/pdf} +} + +@article{fujihara_lyricsynchronizer:_2011, + title = {{LyricSynchronizer}: {Automatic} {Synchronization} {System} {Between} {Musical} {Audio} {Signals} and {Lyrics}}, + volume = {5}, + issn = {1932-4553, 1941-0484}, + shorttitle = {{LyricSynchronizer}}, + url = {http://ieeexplore.ieee.org/document/5876296/}, + doi = {10.1109/JSTSP.2011.2159577}, + number = {6}, + urldate = {2017-03-02}, + journal = {IEEE Journal of Selected Topics in Signal Processing}, + author = {Fujihara, Hiromasa and Goto, Masataka and Ogata, Jun and Okuno, Hiroshi G.}, + month = oct, + year = {2011}, + pages = {1252--1261}, + file = {untitled - 05876296.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/Q9MQTWHC/05876296.pdf:application/pdf} +} + +@inproceedings{sturm_survey_2012, + title = {A survey of evaluation in music genre recognition}, + booktitle = {International {Workshop} on {Adaptive} {Multimedia} {Retrieval}}, + publisher = {Springer}, + author = {Sturm, Bob L}, + year = {2012}, + pages = {29--66}, + file = {Sturm20121212.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/6MJKTRHE/Sturm20121212.pdf:application/pdf} +} + +@inproceedings{williams_speech/music_1999, + title = {Speech/music discrimination based on posterior probability features.}, + volume = {99}, + url = {https://pdfs.semanticscholar.org/1662/dba5ab1fc87e871605d1fc14b89b0b1a029c.pdf}, + urldate = {2017-05-16}, + booktitle = {Eurospeech}, + author = {Williams, Gethin and Ellis, Daniel PW}, + year = {1999}, + pages = {687--690}, + file = {euro99-uttclass.dvi - dba5ab1fc87e871605d1fc14b89b0b1a029c.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/PZDIDK4Q/dba5ab1fc87e871605d1fc14b89b0b1a029c.pdf:application/pdf} +} + +@inproceedings{berenzweig_using_2002, + title = {Using voice segments to improve artist classification of music}, + url = {http://www.aes.org/e-lib/browse.cfm?elib=11147}, + urldate = {2017-05-16}, + booktitle = {Audio {Engineering} {Society} {Conference}: 22nd {International} {Conference}: {Virtual}, {Synthetic}, and {Entertainment} {Audio}}, + publisher = {Audio Engineering Society}, + author = {Berenzweig, Adam L. and Ellis, Daniel PW and Lawrence, Steve}, + year = {2002}, + file = {aes02-aclass.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/WJHA7NW6/aes02-aclass.pdf:application/pdf} +} + +@inproceedings{rocamora_comparing_2007, + title = {Comparing audio descriptors for singing voice detection in music audio files}, + volume = {26}, + url = {https://pdfs.semanticscholar.org/b1c0/d8188b6459a47993c814f212556e02fcfc91.pdf}, + urldate = {2017-05-16}, + booktitle = {Brazilian symposium on computer music, 11th. 
san pablo, brazil}, + author = {Rocamora, Martın and Herrera, Perfecto}, + year = {2007}, + pages = {27}, + file = {sbcm2007Singing.dvi - d8188b6459a47993c814f212556e02fcfc91.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/3SMMC6VR/d8188b6459a47993c814f212556e02fcfc91.pdf:application/pdf} +} diff --git a/asr.tex b/asr.tex index 199f901..9cdbaf5 100644 --- a/asr.tex +++ b/asr.tex @@ -1,6 +1,6 @@ %&asr -\usepackage[nonumberlist,acronyms]{glossaries} -%\makeglossaries% +\usepackage[toc,nonumberlist,acronyms]{glossaries} +\makeglossaries% \newacronym{ANN}{ANN}{Artificial Neural Network} \newacronym{HMM}{HMM}{Hidden Markov Model} \newacronym{GMM}{GMM}{Gaussian Mixture Models} @@ -9,6 +9,12 @@ \newacronym{FA}{FA}{Forced alignment} \newacronym{MFC}{MFC}{Mel-frequency cepstrum} \newacronym{MFCC}{MFCC}{\acrlong{MFC} coefficient} +\newacronym{PPF}{PPF}{Posterior Probability Features} +\newacronym{MLP}{MLP}{Multi-layer Perceptron} +\newacronym{PLP}{PLP}{Perceptual Linear Prediction} +\newacronym{ZCR}{ZCR}{Zero-crossing Rate} +\newacronym{LPC}{LPC}{Linear Prediction Coefficients} +\newacronym{LPCC}{LPCC}{\acrlong{LPC} derivec cepstrum} \newacronym{IFPI}{IFPI}{International Federation of the Phonographic Industry} \newglossaryentry{dm}{name={Death Metal}, description={is an extreme heavy metal music style with growling vocals and @@ -21,6 +27,9 @@ frequency representation}} \newglossaryentry{MS}{name={Mel-Scale}, description={is a human ear inspired scale for spectral signals.}} +\newglossaryentry{Viterbi}{name={Viterbi}, + description={is a dynamic programming algorithm for finding the most likely + sequence of hidden states in a \gls{HMM}}} \begin{document} \frontmatter{} @@ -34,10 +43,6 @@ \tableofcontents -%Glossaries -%\glsaddall{} -%\printglossaries - \mainmatter{} %Berenzweig and Ellis use acoustic classifiers from speech recognition as a %detector for singing lines. They achive 80\% accuracy for forty 15 second @@ -59,241 +64,26 @@ %Introduction, leading to a clearly defined research question \chapter{Introduction} -\section{Introduction} -The primary medium for music distribution is rapidly changing from physical -media to digital media. The \gls{IFPI} stated that about $43\%$ of music -revenue rises from digital distribution. Another $39\%$ arises from the -physical sale and the remaining $16\%$ is made through performance and -synchronisation revenieus. The overtake of digital formats on physical formats -took place somewhere in 2015. Moreover, ever since twenty years the music -industry has seen significant growth -again\footnote{\url{http://www.ifpi.org/facts-and-stats.php}}. - -There has always been an interest in lyrics to music alignment to be used in -for example karaoke. As early as in the late 1980s karaoke machines were -available for consumers. While the lyrics for the track are almost always -available, a alignment is not and it involves manual labour to create such an -alignment. - -A lot of this musical distribution goes via non-official channels such as -YouTube\footnote{\url{https://youtube.com}} in which fans of the performers -often accompany the music with synchronized lyrics. This means that there is an -enormous treasure of lyrics-annotated music available but not within our reach -since the subtitles are almost always hardcoded into the video stream and thus -not directly usable as data. 
Because of this interest it is very useful to -device automatic techniques for segmenting instrumental and vocal parts of a -song, apply forced alignment or even lyrics recognition on the audio file. - -Such techniques are heavily researched and working systems have been created. -However, these techniques are designed to detect a clean singing voice and have -not been testen on so-called \emph{extended vocal techniques} such as grunting -or growling. Growling is heavily used in extreme metal genres such as \gls{dm} -but it must be noted that grunting is not a technique only used in extreme -metal styles. Similar or equal techniques have been used in \emph{Beijing -opera}, Japanese \emph{Noh} and but also more western styles like jazz singing -by Louis Armstrong\cite{sakakibara_growl_2004}. It might even be traced back -to viking times. For example, an arab merchant visiting a village in Denmark -wrote in the tenth century\cite{friis_vikings_2004}: - -\begin{displayquote} - Never before I have heard uglier songs than those of the Vikings in - Slesvig. The growling sound coming from their throats reminds me of dogs - howling, only more untamed. -\end{displayquote} - -\section{\gls{dm}} - -%Literature overview / related work -\section{Related work} -The field of applying standard speech processing techniques on music started in -the late 90s\cite{saunders_real-time_1996,scheirer_construction_1997} and it -was found that music has different discriminating features compared to normal -speech. - -Berenzweig and Ellis expanded on the aforementioned research by trying to -separate singing from instrumental music\cite{berenzweig_locating_2001}. - -\todo{Incorporate this in literary framing}% -~\cite{fujihara_automatic_2006}% -~\cite{fujihara_lyricsynchronizer:_2011}% -~\cite{fujihara_three_2008}% -~\cite{mauch_integrating_2012}% -~\cite{mesaros_adaptation_2009}% -~\cite{mesaros_automatic_2008}% -~\cite{mesaros_automatic_2010}% -~%\cite{muller_multimodal_2012}% -~\cite{pedone_phoneme-level_2011}% -~\cite{yang_machine_2012}% - - - -\section{Research question} -It is discutable whether the aforementioned techniques work because the -spectral properties of a growling voice is different from the spectral -properties of a clean singing voice. It has been found that growling voices -have less prominent peaks in the frequency representation and are closer to -noise then clean singing\cite{kato_acoustic_2013}. This leads us to the -research question: - -\begin{center}\em% - Are standard \gls{ANN} based techniques for singing voice detection - suitable for non-standard musical genres like \gls{dm}. -\end{center} +\input{intro.tex} \chapter{Methods} -%Methodology - -%Experiment(s) (set-up, data, results, discussion) -\section{Data \& Preprocessing} -To run the experiments data has been collected from several \gls{dm} albums. -The exact data used is available in Appendix~\ref{app:data}. The albums are -extracted from the audio CD and converted to a mono channel waveform with the -correct samplerate \emph{SoX}\footnote{\url{http://sox.sourceforge.net/}}. -Every file is annotated using -Praat\cite{boersma_praat_2002} where the utterances are manually aligned to -the audio. Examples of utterances are shown in -Figure~\ref{fig:bloodstained} and Figure~\ref{fig:abominations} where the -waveform, $1-8000$Hz spectrals and annotations are shown. It is clearly visible -that within the genre of death metal there are a different spectral patterns -visible. 
- -\begin{figure}[ht] - \centering - \includegraphics[width=.7\linewidth]{cement} - \caption{A vocal segment of the \emph{Cannibal Corpse} song - \emph{Bloodstained Cement}}\label{fig:bloodstained} -\end{figure} - -\begin{figure}[ht] - \centering - \includegraphics[width=.7\linewidth]{abominations} - \caption{A vocal segment of the \emph{Disgorge} song - \emph{Enthroned Abominations}}\label{fig:abominations} -\end{figure} - -The data is collected from three studio albums. The -first band is called \emph{Cannibal Corpse} and has been producing \gls{dm} for -almost 25 years and have been creating the same type every album. The singer of -\emph{Cannibal Corpse} has a very raspy growls and the lyrics are quite -comprehensible. The vocals produced by \emph{Cannibal Corpse} are bordering -regular shouting. - -The second band is called \emph{Disgorge} and make even more violently sounding -music. The growls of the lead singer sound like a coffee grinder and are more -shallow. In the spectrals it is clearly visible that there are overtones -produced during some parts of the growling. The lyrics are completely -incomprehensible and therefore some parts were not annotated with the actual -lyrics because it was not possible what was being sung. - -Lastly a band from Moscow is chosen bearing the name \emph{Who Dies in -Siberian Slush}. This band is a little odd compared to the previous \gls{dm} -bands because they create \gls{dom}. \gls{dom} is characterized by the very -slow tempo and low tuned guitars. The vocalist has a very characteristic growl -and performs in several moscovian bands. This band also stands out because it -uses piano's and synthesizers. The droning synthesizers often operate in the -same frequency as the vocals. - -\section{\gls{MFCC} Features} -The waveforms in itself are not very suitable to be used as features due to the -high dimensionality and correlation. Therefore we use the aften used -\glspl{MFCC} feature vectors.\todo{cite which papers use this} The actual -conversion is done using the \emph{python\_speech\_features}% -\footnote{\url{https://github.com/jameslyons/python_speech_features}} package. - -\gls{MFCC} features are nature inspired and built incrementally in a several of -steps. -\begin{enumerate} - \item The first step in the process is converting the time representation - of the signal to a spectral representation using a sliding window with - overlap. The width of the window and the step size are two important - parameters in the system. In classical phonetic analysis window sizes - of $25ms$ with a step of $10ms$ are often chosen because they are small - enough to only contain subphone entities. Singing for $25ms$ is - impossible so it is arguable that the window size is very small. - \item The standard \gls{FT} gives a spectral representation that has - linearly scaled frequencies. This scale is converted to the \gls{MS} - using triangular overlapping windows. - \item -\end{enumerate} - - -\todo{Explain why MFCC and which parameters} - -\section{\gls{ANN} Classifier} -\todo{Spectrals might be enough, no decorrelation} - -\section{Model training} - -\section{Experiments} - -\section{Results} - +\input{methods.tex} \chapter{Conclusion \& Discussion} -\section{Conclusion} -This research shows that existing techniques for singing-voice detection -designed for regular singing voices also work respectably on extreme singing -styles like grunting. With a standard \gls{ANN} classifier using \gls{MFCC} -features a performance of $85\%$ can be achieved. 
When applying smoothing this -can be increased until\todo{results}. +\input{conclusion.tex} -%Discussion section -\section{Discussion} -Singing-voice detection can be seen as a crude way of -genre-discrimination.\todo{finish} - -\todo{Novelty} -\todo{Weaknesses} -\todo{Dataset is not very varied but\ldots} - -\todo{Doom metal} -%Conclusion section -%Acknowledgements -%Statement on authors' contributions %(Appendices) \appendix -\chapter{Experimental data}\label{app:data} -\begin{table}[h] - \centering - \begin{tabular}{cllll} - \toprule - Num. & Artist & Album & Song & Duration\\ - \midrule - 00 & Cannibal Corpse & A Skeletal Domain & High Velocity Impact Spatter & 04:06.91\\ - 01 & Cannibal Corpse & A Skeletal Domain & Sadistic Embodiment & 03:17.31\\ - 02 & Cannibal Corpse & A Skeletal Domain & Kill or Become & 03:50.67\\ - 03 & Cannibal Corpse & A Skeletal Domain & A Skeletal Domain & 03:38.77\\ - 04 & Cannibal Corpse & A Skeletal Domain & Headlong Into Carnage & 03:01.25\\ - 05 & Cannibal Corpse & A Skeletal Domain & The Murderer's Pact & 05:05.23\\ - 06 & Cannibal Corpse & A Skeletal Domain & Funeral Cremation & 03:41.89\\ - 07 & Cannibal Corpse & A Skeletal Domain & Icepick Lobotomy & 03:16.24\\ - 08 & Cannibal Corpse & A Skeletal Domain & Vector of Cruelty & 03:25.15\\ - 09 & Cannibal Corpse & A Skeletal Domain & Bloodstained Cement & 03:41.99\\ - 10 & Cannibal Corpse & A Skeletal Domain & Asphyxiate to Resuscitate & 03:47.40\\ - 11 & Cannibal Corpse & A Skeletal Domain & Hollowed Bodies & 03:05.80\\ - 12 & Disgorge & Parallels of Infinite Torture & Revealed in Obscurity & 05:13.20\\ - 13 & Disgorge & Parallels of Infinite Torture & Enthroned Abominations & 04:05.39\\ - 14 & Disgorge & Parallels of Infinite Torture & Atonement & 02:57.36\\ - 15 & Disgorge & Parallels of Infinite Torture & Abhorrent Desecration of Thee Iniquity & 04:17.20\\ - 16 & Disgorge & Parallels of Infinite Torture & Forgotten Scriptures & 02:01.72\\ - 17 & Disgorge & Parallels of Infinite Torture & Descending Upon Convulsive Devourment & 04:38.85\\ - 18 & Disgorge & Parallels of Infinite Torture & Condemned to Sufferance & 04:57.59\\ - 19 & Disgorge & Parallels of Infinite Torture & Parallels of Infinite Torture & 05:03.33\\ - 20 & Disgorge & Parallels of Infinite Torture & Asphyxiation of Thee Oppressed & 05:42.37\\ - 21 & Disgorge & Parallels of Infinite Torture & Ominous Sigils of Ungodly Ruin & 04:59.15\\ - 22 & Who Dies In Siberian Slush & Bitterness Of The Years That Are Lost & Leave Me & 06:35.60\\ - 23 & Who Dies In Siberian Slush & Bitterness Of The Years That Are Lost & The Woman We Are Looking For & 06:53.63\\ - 24 & Who Dies In Siberian Slush & Bitterness Of The Years That Are Lost & M\"obius Ring & 07:20.56\\ - 25 & Who Dies In Siberian Slush & Bitterness Of The Years That Are Lost & Interlude & 04:26.49\\ - 26 & Who Dies In Siberian Slush & Bitterness Of The Years That Are Lost & Завещание Гумилёва & 08:46.76\\ - 27 & Who Dies In Siberian Slush & Bitterness Of The Years That Are Lost & An Old Road Through The Snow & 02:31.56\\ - 28 & Who Dies In Siberian Slush & Bitterness Of The Years That Are Lost & Bitterness Of The Years That Are Lost & 09:10.49\\ - \midrule - & & & Total: & 02:13:40\\ - \bottomrule - \end{tabular} - \caption{Songs used in the experiments} -\end{table} +\input{appendices.tex} + +\newpage +%Glossaries +\glsaddall{} +\begingroup +\let\clearpage\relax +\let\cleardoublepage\relax +\printglossaries{} +\endgroup \bibliographystyle{ieeetr} \bibliography{asr} diff --git 
a/conclusion.tex b/conclusion.tex
new file mode 100644
index 0000000..9d5176f
--- /dev/null
+++ b/conclusion.tex
@@ -0,0 +1,18 @@
+\section{Conclusion}
+This research shows that existing techniques for singing-voice detection
+designed for regular singing voices also work respectably on extreme singing
+styles like grunting. With a standard \gls{ANN} classifier using \gls{MFCC}
+features, a performance of $85\%$ can be achieved. When applying smoothing this
+can be increased until\todo{results}.
+
+%Discussion section
+\section{Discussion}
+Singing-voice detection can be seen as a crude way of
+genre-discrimination.\todo{finish}
+
+\todo{Novelty}
+\todo{Weaknesses}
+\todo{Dataset is not very varied but\ldots}
+
+\todo{Doom metal}
+%Conclusion section
diff --git a/experiment/Makefile b/experiment/Makefile
new file mode 100644
index 0000000..5eea47e
--- /dev/null
+++ b/experiment/Makefile
@@ -0,0 +1,37 @@
+DOCS:=experiment
+GREP?=grep
+LATEX?=pdflatex
+BIBTEX?=bibtex
+BIBTEXFLAGS:=
+MAKEGLOSSARIES?=makeglossaries
+MAKEGLOSSARIESFLAGS?=
+LATEXFLAGS:=-file-line-error -halt-on-error -no-shell-escape
+
+.PHONY: all clean
+.SECONDARY: $(addsuffix .fmt,$(DOCS))
+
+all: $(addsuffix .pdf,$(DOCS))
+
+%.fmt: %.pre
+	$(LATEX) $(LATEXFLAGS) -ini -jobname="$(basename $@)" "&$(LATEX) $<\dump"
+
+%.pdf: %.tex %.fmt $(wildcard *.bib) $(wildcard *.tex)
+	$(LATEX) $(LATEXFLAGS) $<
+	if $(GREP) -q '^\\bibdata{' $(basename $<).aux; then $(BIBTEX) $(BIBTEXFLAGS) $(basename $<); fi
+	if $(GREP) -q '\@istfilename' $(basename $<).aux; then $(MAKEGLOSSARIES) $(MAKEGLOSSARIESFLAGS) $(basename $<); fi
+	$(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog
+	$(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true
+	$(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true
+	$(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true
+	$(RM) $(basename $@).mlog
+
+clean: $(addprefix clean-,$(DOCS))
+
+clobber: $(addprefix clobber-,$(DOCS))
+
+clean-%:
+	$(RM) $(addprefix $(@:clean-%=%).,acn acr alg aux bbl blg fmt glg glo gls\
+		ist lof log lol lot nav out run.xml snm tdo toc vrb xdy)
+
+clobber-%:
+	$(RM) $(@:clobber-%=%).pdf
diff --git a/experiment.pre b/experiment/experiment.pre
similarity index 95%
rename from experiment.pre
rename to experiment/experiment.pre
index c25962c..da1f5a8 100644
--- a/experiment.pre
+++ b/experiment/experiment.pre
@@ -5,7 +5,7 @@
 \usepackage{geometry} % Papersize
 \usepackage{hyperref} % Hyperlinks
 \usepackage{graphicx} % Images
-\graphicspath{{img/}}
+\graphicspath{{../img/}}
 \urlstyle{same}
 \hypersetup{%
 	pdftitle={},
diff --git a/experiment.tex b/experiment/experiment.tex
similarity index 100%
rename from experiment.tex
rename to experiment/experiment.tex
diff --git a/intro.tex b/intro.tex
new file mode 100644
index 0000000..a1133e7
--- /dev/null
+++ b/intro.tex
@@ -0,0 +1,107 @@
+\section{Introduction}
+The primary medium for music distribution is rapidly changing from physical
+media to digital media. The \gls{IFPI} stated that about $43\%$ of music
+revenue arises from digital distribution, another $39\%$ from physical sales
+and the remaining $16\%$ from performance and synchronisation revenues.
+Digital formats overtook physical formats sometime in 2015. Moreover, for the
+first time in twenty years the music industry has seen significant growth
+again\footnote{\url{http://www.ifpi.org/facts-and-stats.php}}.
+
+There has always been an interest in lyrics-to-music alignment, for example
+for use in karaoke. As early as the late 1980s karaoke machines were
+available to consumers. While the lyrics for a track are almost always
+available, an alignment is not, and creating such an alignment involves
+manual labour.
+
+A lot of this musical distribution goes via unofficial channels such as
+YouTube\footnote{\url{https://youtube.com}}, on which fans of the performers
+often accompany the music with synchronized lyrics. This means that an
+enormous trove of lyrics-annotated music is available, but out of our reach,
+since the subtitles are almost always hardcoded into the video stream and
+thus not directly usable as data. Because of this interest it is very useful
+to devise automatic techniques for segmenting the instrumental and vocal
+parts of a song, applying forced alignment, or even performing lyrics
+recognition on the audio file.
+
+Such techniques are heavily researched and working systems have been created.
+However, these techniques are designed to detect a clean singing voice and have
+not been tested on so-called \emph{extended vocal techniques} such as grunting
+or growling. Growling is heavily used in extreme metal genres such as
+\gls{dm}, but it must be noted that grunting is not a technique used only in
+extreme metal styles. Similar or identical techniques have been used in
+\emph{Beijing opera} and Japanese \emph{Noh}, but also in more western styles
+such as the jazz singing of Louis Armstrong\cite{sakakibara_growl_2004}. It
+might even be traced back to Viking times. For example, an Arab merchant
+visiting a village in Denmark wrote in the tenth
+century\cite{friis_vikings_2004}:
+
+\begin{displayquote}
+	Never before I have heard uglier songs than those of the Vikings in
+	Slesvig. The growling sound coming from their throats reminds me of dogs
+	howling, only more untamed.
+\end{displayquote}
+
+\section{\gls{dm}}
+
+%Literature overview / related work
+\section{Related work}
+Applying speech-related processing and classification techniques to music
+started in the late 90s. Saunders devised a technique to classify audio into
+the categories \emph{Music} and \emph{Speech}\cite{saunders_real-time_1996}.
+It was found that music has different properties than speech: more bandwidth,
+more tonality and more regularity. Multivariate Gaussian classifiers were
+used to discriminate between the categories with an average performance of
+$90\%$.
+
+Williams and Ellis were inspired by the aforementioned research and tried to
+separate the singing segments from the instrumental
+segments\cite{williams_speech/music_1999}. This was later verified by
+Berenzweig and Ellis\cite{berenzweig_locating_2001}. The latter work became
+the de facto literature on singing voice detection. Both show that features
+derived from \gls{PPF}, such as energy and distribution, are highly effective
+in separating speech from non-speech signals such as music. The data used was
+already segmented.
+
+Later, Berenzweig et al.\ showed singing voice segments to be more useful for
+artist classification and used an \gls{MLP} on \gls{PLP} coefficients to
+detect the singing voice\cite{berenzweig_using_2002}. Nwe et al.\ showed that
+there is not much difference in accuracy when using different features founded
+in speech processing. They tested several features and found that accuracies
+differ by less than a few percent.
Moreover, they found that others have tried to tackle
+the problem using a myriad of different approaches, such as using \gls{ZCR},
+\gls{MFCC} and \gls{LPCC} as features and \glspl{HMM} or \glspl{GMM} as
+classifiers\cite{nwe_singing_2004}.
+
+Fujihara et al.\ took the idea to the next level by attempting to do \gls{FA}
+on music. Their approach consists of three steps. First, the accompaniment
+levels are reduced; secondly, the vocal segments are separated from the
+non-vocal segments using a simple two-state \gls{HMM}. The chain is concluded
+by applying \gls{Viterbi} alignment of the lyrics on the segregated signals.
+The system showed accuracy levels of $90\%$ on Japanese
+music\cite{fujihara_automatic_2006}. Later they improved upon
+this\cite{fujihara_three_2008} and even made a ready-to-use karaoke
+application that can do this online\cite{fujihara_lyricsynchronizer:_2011}.
+
+Singing voice detection can also be seen as a binary genre recognition
+problem, so the techniques used in that field might be of use. Genre
+recognition has a long history, which is covered in the survey by
+Sturm\cite{sturm_survey_2012}. It must be noted that of all the $485$ papers
+cited by Sturm, only one, a master's thesis, applies genre recognition to
+heavy metal genres\cite{tsatsishvili_automatic_2011}.
+
+Singing voice detection has been tried on less conventional styles in the
+past. Dzhambazov et al.\ proposed to align long syllables in Beijing Opera to
+the audio\cite{dzhambazov_automatic_2016}. Beijing Opera sometimes contains
+growling-like vocals. Dzhambazov also tried aligning lyrics to audio in
+classical Turkish music\cite{dzhambazov_automatic_2014}.
+
+\section{Research question}
+It is debatable whether the aforementioned techniques work, because the
+spectral properties of a growling voice are different from the spectral
+properties of a clean singing voice. It has been found that growling voices
+have less prominent peaks in the frequency representation and are closer to
+noise than clean singing\cite{kato_acoustic_2013}. This leads us to the
+research question:
+
+\begin{center}\em%
+	Are standard \gls{ANN} based techniques for singing voice detection
+	suitable for non-standard musical genres like \gls{dm} and \gls{dom}?
+\end{center}
diff --git a/methods.tex b/methods.tex
new file mode 100644
index 0000000..c49c249
--- /dev/null
+++ b/methods.tex
@@ -0,0 +1,88 @@
+%Methodology
+
+%Experiment(s) (set-up, data, results, discussion)
+\section{Data \& Preprocessing}
+To run the experiments, data has been collected from several \gls{dm} albums.
+The exact data used is available in Appendix~\ref{app:data}. The albums are
+extracted from the audio CDs and converted to mono channel waveforms with the
+correct sample rate using
+\emph{SoX}\footnote{\url{http://sox.sourceforge.net/}}; a conversion sketch
+is given below. Every file is annotated using Praat\cite{boersma_praat_2002},
+where the utterances are manually aligned to the audio. Examples of
+utterances are shown in Figure~\ref{fig:bloodstained} and
+Figure~\ref{fig:abominations}, where the waveform, the $1$--$8000$Hz
+spectrogram and the annotations are shown. It is clearly visible that
+different spectral patterns occur within the genre of death metal.
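+
+The sketch below illustrates this conversion step. It is an illustration
+only: the target sample rate ($16$kHz) and the file locations are assumed
+example values, not necessarily the exact settings used in the experiments.
+
+\begin{verbatim}
+# Sketch: batch-convert album tracks to mono waveforms using SoX.
+# The 16 kHz rate and the directory layout are assumptions.
+import pathlib
+import subprocess
+
+for flac in pathlib.Path("albums").glob("**/*.flac"):
+    wav = flac.with_suffix(".wav")
+    subprocess.run(["sox", str(flac), str(wav),
+                    "channels", "1",  # downmix to a mono channel
+                    "rate", "16k"],   # resample to the target rate
+                   check=True)        # raise if sox fails
+\end{verbatim}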
+
+\begin{figure}[ht]
+	\centering
+	\includegraphics[width=.7\linewidth]{cement}
+	\caption{A vocal segment of the \emph{Cannibal Corpse} song
+		\emph{Bloodstained Cement}}\label{fig:bloodstained}
+\end{figure}
+
+\begin{figure}[ht]
+	\centering
+	\includegraphics[width=.7\linewidth]{abominations}
+	\caption{A vocal segment of the \emph{Disgorge} song
+		\emph{Enthroned Abominations}}\label{fig:abominations}
+\end{figure}
+
+The data is collected from three studio albums. The first band is called
+\emph{Cannibal Corpse} and has been producing \gls{dm} for almost 25 years,
+creating the same type of music on every album. The singer of \emph{Cannibal
+Corpse} has a very raspy growl and the lyrics are quite comprehensible. The
+vocals produced by \emph{Cannibal Corpse} border on regular shouting.
+
+The second band is called \emph{Disgorge} and makes even more violent-sounding
+music. The growls of the lead singer sound like a coffee grinder and are more
+shallow. In the spectrograms it is clearly visible that there are overtones
+produced during some parts of the growling. The lyrics are completely
+incomprehensible and therefore some parts were not annotated with the actual
+lyrics because it was not possible to make out what was being sung.
+
+Lastly, a band from Moscow was chosen, bearing the name \emph{Who Dies in
+Siberian Slush}. This band is a little odd compared to the previous \gls{dm}
+bands because they create \gls{dom}. \gls{dom} is characterized by very slow
+tempos and low-tuned guitars. The vocalist has a very characteristic growl
+and performs in several Moscow-based bands. This band also stands out because
+it uses pianos and synthesizers. The droning synthesizers often operate in
+the same frequency range as the vocals.
+
+\section{\gls{MFCC} Features}
+The waveforms in themselves are not very suitable to be used as features due
+to their high dimensionality and correlation. Therefore we use the widely
+used \gls{MFCC} feature vectors, which have been shown to be
+suitable\cite{rocamora_comparing_2007}. The actual conversion is done using
+the \emph{python\_speech\_features}%
+\footnote{\url{https://github.com/jameslyons/python_speech_features}}
+package; a usage sketch is given after the enumeration below.
+
+\gls{MFCC} features are nature-inspired and built incrementally in several
+steps.
+\begin{enumerate}
+	\item The first step in the process is converting the time representation
+		of the signal to a spectral representation using a sliding window with
+		overlap. The width of the window and the step size are two important
+		parameters in the system. In classical phonetic analysis window sizes
+		of $25ms$ with a step of $10ms$ are often chosen because they are small
+		enough to only contain subphone entities. No sung utterance lasts only
+		$25ms$, so it is arguable that such a window size is very small for
+		singing.
+	\item The standard \gls{FT} gives a spectral representation that has
+		linearly scaled frequencies. This scale is converted to the \gls{MS}
+		using triangular overlapping windows, where the \gls{MS} maps a
+		frequency $f$ in Hz to $m=2595\log_{10}\left(1+f/700\right)$.
+	\item The \gls{MS} spectrum is then compressed by taking the logarithm
+		and decorrelated using a discrete cosine transformation; the
+		lower-order coefficients that remain form the \gls{MFCC} feature
+		vector.
+\end{enumerate}
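+
+As a minimal usage sketch of this extraction: the parameter values shown are
+the package defaults, matching the $25ms$/$10ms$ windowing mentioned above,
+and not necessarily the exact values used in the experiments.
+
+\begin{verbatim}
+# Sketch: compute MFCC feature vectors with python_speech_features.
+import scipy.io.wavfile as wav
+from python_speech_features import mfcc
+
+(rate, signal) = wav.read("song.wav")  # mono waveform
+features = mfcc(signal, rate,
+                winlen=0.025,  # 25 ms sliding window
+                winstep=0.01,  # 10 ms step
+                numcep=13)     # keep 13 cepstral coefficients
+# 'features' is a (number_of_frames, 13) array: one vector per window
+\end{verbatim}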
+
+
+\todo{Explain why MFCC and which parameters}
+
+\section{\gls{ANN} Classifier}
+\todo{Spectrals might be enough, no decorrelation}
+
+\section{Model training}
+
+\section{Experiments}
+
+\section{Results}
+
diff --git a/proposal/Makefile b/proposal/Makefile
new file mode 100644
index 0000000..16f7660
--- /dev/null
+++ b/proposal/Makefile
@@ -0,0 +1,37 @@
+DOCS:=proposal
+GREP?=grep
+LATEX?=pdflatex
+BIBTEX?=bibtex
+BIBTEXFLAGS:=
+MAKEGLOSSARIES?=makeglossaries
+MAKEGLOSSARIESFLAGS?=
+LATEXFLAGS:=-file-line-error -halt-on-error -no-shell-escape
+
+.PHONY: all clean
+.SECONDARY: $(addsuffix .fmt,$(DOCS))
+
+all: $(addsuffix .pdf,$(DOCS))
+
+%.fmt: %.pre
+	$(LATEX) $(LATEXFLAGS) -ini -jobname="$(basename $@)" "&$(LATEX) $<\dump"
+
+%.pdf: %.tex %.fmt $(wildcard *.bib) $(wildcard *.tex)
+	$(LATEX) $(LATEXFLAGS) $<
+	if $(GREP) -q '^\\bibdata{' $(basename $<).aux; then $(BIBTEX) $(BIBTEXFLAGS) $(basename $<); fi
+	if $(GREP) -q '\@istfilename' $(basename $<).aux; then $(MAKEGLOSSARIES) $(MAKEGLOSSARIESFLAGS) $(basename $<); fi
+	$(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog
+	$(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true
+	$(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true
+	$(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true
+	$(RM) $(basename $@).mlog
+
+clean: $(addprefix clean-,$(DOCS))
+
+clobber: $(addprefix clobber-,$(DOCS))
+
+clean-%:
+	$(RM) $(addprefix $(@:clean-%=%).,acn acr alg aux bbl blg fmt glg glo gls\
+		ist lof log lol lot nav out run.xml snm tdo toc vrb xdy)
+
+clobber-%:
+	$(RM) $(@:clobber-%=%).pdf
diff --git a/proposal.pre b/proposal/proposal.pre
similarity index 100%
rename from proposal.pre
rename to proposal/proposal.pre
diff --git a/proposal.tex b/proposal/proposal.tex
similarity index 100%
rename from proposal.tex
rename to proposal/proposal.tex
-- 
2.20.1