From: Mart Lubbers Date: Tue, 16 May 2017 08:35:49 +0000 (+0200) Subject: restructure repository, add literature review X-Git-Url: https://git.martlubbers.net/?a=commitdiff_plain;h=5945b2bce63d92454882cb7c66fb1c8d87c3a271;p=asr1617.git restructure repository, add literature review --- diff --git a/Makefile b/Makefile index bb74780..b5714bc 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -DOCS:=asr proposal experiment +DOCS:=asr GREP?=grep LATEX?=pdflatex BIBTEX?=bibtex @@ -21,8 +21,6 @@ all: $(addsuffix .pdf,$(DOCS)) if $(GREP) -q '\@istfilename' $(basename $<).aux; then $(MAKEGLOSSARIES) $(MAKEGLOSSARIESFLAGSFLAGS) $(basename $<); fi $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog $(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true - $(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true - $(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true $(RM) $(basename $@).mlog clean: $(addprefix clean-,$(DOCS)) diff --git a/appendices.tex b/appendices.tex new file mode 100644 index 0000000..3faffc3 --- /dev/null +++ b/appendices.tex @@ -0,0 +1,47 @@ +\chapter{Experimental data}\label{app:data} +\begin{table}[H] + \centering + \begin{tabular}{cll} + \toprule + Num. & Song & Duration\\ + \midrule + \multicolumn{3}{l}{\bf Cannibal Corpse {-} A Skeletal Domain}\\ + 00 & High Velocity Impact Spatter & 04:06.91\\ + 01 & Sadistic Embodiment & 03:17.31\\ + 02 & Kill or Become & 03:50.67\\ + 03 & A Skeletal Domain & 03:38.77\\ + 04 & Headlong Into Carnage & 03:01.25\\ + 05 & The Murderer's Pact & 05:05.23\\ + 06 & Funeral Cremation & 03:41.89\\ + 07 & Icepick Lobotomy & 03:16.24\\ + 08 & Vector of Cruelty & 03:25.15\\ + 09 & Bloodstained Cement & 03:41.99\\ + 10 & Asphyxiate to Resuscitate & 03:47.40\\ + 11 & Hollowed Bodies & 03:05.80\\ + \midrule + \multicolumn{3}{l}{\bf Disgorge {-} Parallels of Infinite Torture}\\ + 12 & Revealed in Obscurity & 05:13.20\\ + 13 & Enthroned Abominations & 04:05.39\\ + 14 & Atonement & 02:57.36\\ + 15 & Abhorrent Desecration of Thee Iniquity & 04:17.20\\ + 16 & Forgotten Scriptures & 02:01.72\\ + 17 & Descending Upon Convulsive Devourment & 04:38.85\\ + 18 & Condemned to Sufferance & 04:57.59\\ + 19 & Parallels of Infinite Torture & 05:03.33\\ + 20 & Asphyxiation of Thee Oppressed & 05:42.37\\ + 21 & Ominous Sigils of Ungodly Ruin & 04:59.15\\ + \midrule + \multicolumn{3}{l}{\bf Who Dies In Siberian Slush {-} Bitterness Of The Years That Are Lost}\\ + 22 & Leave Me & 06:35.60\\ + 23 & The Woman We Are Looking For & 06:53.63\\ + 24 & M\"obius Ring & 07:20.56\\ + 25 & Interlude & 04:26.49\\ + 26 & Завещание Гумилёва & 08:46.76\\ + 27 & An Old Road Through The Snow & 02:31.56\\ + 28 & Bitterness Of The Years That Are Lost & 09:10.49\\ + \midrule + & {\bf Total:} & 02:13:40\\ + \bottomrule + \end{tabular} + \caption{Experimental data} +\end{table} diff --git a/asr.bib b/asr.bib index d505823..e016550 100644 --- a/asr.bib +++ b/asr.bib @@ -1,69 +1,20 @@ -@incollection{muller_lyrics--audio_2012, - address = {Wadern}, - title = {Lyrics-to-{Audio} {Alignment} and its {Application}}, - isbn = {978-3-939897-37-8}, - url = {http://nbn-resolving.de/urn:nbn:de:0030-drops-27851}, - language = {English}, - urldate = {2017-03-02}, - booktitle = {Multimodal {Music} {Processing}}, - publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik GmbH}, - author = {Goto, Masataka and Fujihara, Hiromasa}, - editor = {Müller, Meinard}, 
- year = {2012}, - note = {OCLC: 864001691}, - pages = {23--36}, - file = {3.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/A4ZSSMW5/3.pdf:application/pdf} -} - -@inproceedings{pedone_phoneme-level_2011, - title = {Phoneme-{Level} {Text} to {Audio} {Synchronization} on {Speech} {Signals} with {Background} {Music}.}, - url = {http://ai2-s2-pdfs.s3.amazonaws.com/7fb2/210b6a9e69ea7ca0e4f496548544781c8a8b.pdf}, - urldate = {2017-03-02}, - booktitle = {{INTERSPEECH}}, - author = {Pedone, Agnes and Burred, Juan José and Maller, Simon and Leveau, Pierre}, - year = {2011}, - pages = {433--436}, - file = {210b6a9e69ea7ca0e4f496548544781c8a8b.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/NQR3WB2S/210b6a9e69ea7ca0e4f496548544781c8a8b.pdf:application/pdf} -} - -@inproceedings{fujihara_automatic_2006, - title = {Automatic synchronization between lyrics and music {CD} recordings based on {Viterbi} alignment of segregated vocal signals}, - url = {http://ieeexplore.ieee.org/abstract/document/4061176/}, - urldate = {2017-03-02}, - booktitle = {Multimedia, 2006. {ISM}'06. {Eighth} {IEEE} {International} {Symposium} on}, - publisher = {IEEE}, - author = {Fujihara, Hiromasa and Goto, Masataka and Ogata, Jun and Komatani, Kazunori and Ogata, Tetsuya and Okuno, Hiroshi G.}, - year = {2006}, - pages = {257--264}, - file = {04061176.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/6DU997E4/04061176.pdf:application/pdf} -} - -@inproceedings{mesaros_adaptation_2009, - title = {Adaptation of a speech recognizer for singing voice}, - url = {http://ieeexplore.ieee.org/abstract/document/7077626/}, - urldate = {2017-03-02}, - booktitle = {Signal {Processing} {Conference}, 2009 17th {European}}, - publisher = {IEEE}, - author = {Mesaros, Annamaria and Virtanen, Tuomas}, - year = {2009}, - pages = {1779--1783}, - file = {07077626.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/FN9TWMMJ/07077626.pdf:application/pdf} -} - -@article{mesaros_automatic_2010, - title = {Automatic {Recognition} of {Lyrics} in {Singing}}, - volume = {2010}, - issn = {1687-4714, 1687-4722}, - url = {http://asmp.eurasipjournals.com/content/2010/1/546047}, - doi = {10.1155/2010/546047}, +@article{yang_machine_2012, + title = {Machine {Recognition} of {Music} {Emotion}: {A} {Review}}, + volume = {3}, + issn = {21576904}, + shorttitle = {Machine {Recognition} of {Music} {Emotion}}, + url = {http://dl.acm.org/citation.cfm?doid=2168752.2168754}, + doi = {10.1145/2168752.2168754}, language = {en}, + number = {3}, urldate = {2017-03-02}, - journal = {EURASIP Journal on Audio, Speech, and Music Processing}, - author = {Mesaros, Annamaria and Virtanen, Tuomas}, - year = {2010}, - pages = {1--11}, - file = {art%3A10.1155%2F2010%2F546047.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/3BR5E733/art%3A10.1155%2F2010%2F546047.pdf:application/pdf} + journal = {ACM Transactions on Intelligent Systems and Technology}, + author = {Yang, Yi-Hsuan and Chen, Homer H.}, + month = may, + year = {2012}, + pages = {1--30}, + file = {TST00040.dvi - a40-yang.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/RGP3XNGT/a40-yang.pdf:application/pdf} } @article{dzhambazov_automatic_2016, @@ -72,29 +23,7 @@ urldate = {2017-03-02}, author = {Dzhambazov, Georgi and Yang, Yile and Repetto, Rafael Caro and Serra, Xavier}, year = {2016}, - file = {Automatic Alignment of Long Syllables in a Cappella Beijing Opera - 
viewcontent.cgi:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/CSCH3FAK/viewcontent.pdf:application/pdf} -} - -@inproceedings{mesaros_automatic_2008, - title = {Automatic alignment of music audio and lyrics}, - url = {http://legacy.spa.aalto.fi/dafx08/papers/dafx08_57.pdf}, - urldate = {2017-03-02}, - booktitle = {Proceedings of the 11th {Int}. {Conference} on {Digital} {Audio} {Effects} ({DAFx}-08)}, - author = {Mesaros, Annamaria and Virtanen, Tuomas}, - year = {2008}, - file = {dafx08_57.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/H24VX8KW/dafx08_57.pdf:application/pdf} -} - -@inproceedings{berenzweig_locating_2001, - title = {Locating singing voice segments within music signals}, - url = {http://ieeexplore.ieee.org/abstract/document/969557/}, - urldate = {2017-03-02}, - booktitle = {Applications of {Signal} {Processing} to {Audio} and {Acoustics}, 2001 {IEEE} {Workshop} on the}, - publisher = {IEEE}, - author = {Berenzweig, Adam L. and Ellis, Daniel PW}, - year = {2001}, - pages = {119--122}, - file = {Locating singing voice segments within music signals - Applicationis of Signal Processing to Audio and Acoustics, 2001 IEEE Workshop on the - 00969557.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/DWBBQPDE/00969557.pdf:application/pdf} + file = {Automatic Alignment of Long Syllables in a Cappella Beijing Opera - viewcontent.cgi:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/CSCH3FAK/viewcontent.pdf:application/pdf} } @inproceedings{dzhambazov_automatic_2014, @@ -105,55 +34,50 @@ author = {Dzhambazov, Georgi and Sentürk, Sertan and Serra, Xavier}, year = {2014}, pages = {61--64}, - file = {lyrics-to-audio-FMA_full_paper.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/K7WFQSR8/lyrics-to-audio-FMA_full_paper.pdf:application/pdf} + file = {lyrics-to-audio-FMA_full_paper.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/K7WFQSR8/lyrics-to-audio-FMA_full_paper.pdf:application/pdf} } -@inproceedings{fujihara_three_2008, - title = {Three techniques for improving automatic synchronization between music and lyrics: {Fricative} detection, filler model, and novel feature vectors for vocal activity detection}, - shorttitle = {Three techniques for improving automatic synchronization between music and lyrics}, - url = {http://ieeexplore.ieee.org/abstract/document/4517548/}, - urldate = {2017-03-02}, - booktitle = {Acoustics, {Speech} and {Signal} {Processing}, 2008. {ICASSP} 2008. 
{IEEE} {International} {Conference} on}, +@inproceedings{kato_acoustic_2013, + title = {Acoustic {Features} and {Auditory} {Impressions} of {Death} {Growl} and {Screaming} {Voice}}, + isbn = {978-0-7695-5120-3}, + url = {http://ieeexplore.ieee.org/document/6846676/}, + doi = {10.1109/IIH-MSP.2013.120}, + urldate = {2017-04-11}, publisher = {IEEE}, - author = {Fujihara, Hiromasa and Goto, Masataka}, - year = {2008}, - pages = {69--72}, - file = {THREE TECHNIQUES FOR IMPROVING AUTOMATIC SYNCHRONIZATION BETWEEN MUSIC AND LYRICS\: FRICATIVE DETECTION, FILLER MODEL, AND NOVEL FEATURE VECTORS FOR VOCAL ACTIVITY DETECTION - 04517548.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/CMGJ32AM/04517548.pdf:application/pdf} + author = {Kato, Keizo and Ito, Akinori}, + month = oct, + year = {2013}, + pages = {460--463}, + file = {Acoustic Features and Auditory Impressions of Death Growl and Screaming Voice - 06846676.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/VAT5AGPP/06846676.pdf:application/pdf} } -@article{yang_machine_2012, - title = {Machine {Recognition} of {Music} {Emotion}: {A} {Review}}, - volume = {3}, - issn = {21576904}, - shorttitle = {Machine {Recognition} of {Music} {Emotion}}, - url = {http://dl.acm.org/citation.cfm?doid=2168752.2168754}, - doi = {10.1145/2168752.2168754}, - language = {en}, - number = {3}, - urldate = {2017-03-02}, - journal = {ACM Transactions on Intelligent Systems and Technology}, - author = {Yang, Yi-Hsuan and Chen, Homer H.}, - month = may, - year = {2012}, - pages = {1--30}, - file = {TST00040.dvi - a40-yang.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/RGP3XNGT/a40-yang.pdf:application/pdf} +@article{boersma_praat_2002, + title = {Praat, a system for doing phonetics by computer}, + volume = {5}, + journal = {Glot international}, + author = {Boersma, Paulus Petrus Gerardus}, + year = {2002} } -@article{fujihara_lyricsynchronizer:_2011, - title = {{LyricSynchronizer}: {Automatic} {Synchronization} {System} {Between} {Musical} {Audio} {Signals} and {Lyrics}}, - volume = {5}, - issn = {1932-4553, 1941-0484}, - shorttitle = {{LyricSynchronizer}}, - url = {http://ieeexplore.ieee.org/document/5876296/}, - doi = {10.1109/JSTSP.2011.2159577}, - number = {6}, - urldate = {2017-03-02}, - journal = {IEEE Journal of Selected Topics in Signal Processing}, - author = {Fujihara, Hiromasa and Goto, Masataka and Ogata, Jun and Okuno, Hiroshi G.}, - month = oct, +@inproceedings{leglaive_singing_2015, + title = {Singing voice detection with deep recurrent neural networks}, + url = {http://ieeexplore.ieee.org/abstract/document/7177944/}, + urldate = {2017-04-25}, + booktitle = {Acoustics, {Speech} and {Signal} {Processing} ({ICASSP}), 2015 {IEEE} {International} {Conference} on}, + publisher = {IEEE}, + author = {Leglaive, Simon and Hennequin, Romain and Badeau, Roland}, + year = {2015}, + pages = {121--125}, + file = {SINGING VOICE DETECTION WITH DEEP RECURRENT NEURAL NETWORKS - 07177944.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/5K4JZDXC/07177944.pdf:application/pdf} +} + +@book{tsatsishvili_automatic_2011, + title = {Automatic subgenre classification of heavy metal music}, + url = {https://jyx.jyu.fi/dspace/handle/123456789/37227}, + urldate = {2017-03-06}, + author = {Tsatsishvili, Valeri}, year = {2011}, - pages = {1252--1261}, - file = {untitled - 05876296.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/Q9MQTWHC/05876296.pdf:application/pdf} + file = 
{AUTOMATIC-SUBGENRE-CLASSIFICATION-OF-HEAVY-METAL-MUSIC.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/3HTFFPVN/AUTOMATIC-SUBGENRE-CLASSIFICATION-OF-HEAVY-METAL-MUSIC.pdf:application/pdf} } @article{mauch_integrating_2012, @@ -169,26 +93,30 @@ month = jan, year = {2012}, pages = {200--210}, - file = {untitled - 05876304.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/MM4NI9SJ/05876304.pdf:application/pdf} + file = {untitled - 05876304.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/MM4NI9SJ/05876304.pdf:application/pdf} } -@book{tsatsishvili_automatic_2011, - title = {Automatic subgenre classification of heavy metal music}, - url = {https://jyx.jyu.fi/dspace/handle/123456789/37227}, - urldate = {2017-03-06}, - author = {Tsatsishvili, Valeri}, - year = {2011}, - file = {AUTOMATIC-SUBGENRE-CLASSIFICATION-OF-HEAVY-METAL-MUSIC.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/3HTFFPVN/AUTOMATIC-SUBGENRE-CLASSIFICATION-OF-HEAVY-METAL-MUSIC.pdf:application/pdf} +@inproceedings{saunders_real-time_1996, + title = {Real-time discrimination of broadcast speech/music}, + volume = {2}, + booktitle = {Acoustics, {Speech}, and {Signal} {Processing}, 1996. {ICASSP}-96. {Conference} {Proceedings}., 1996 {IEEE} {International} {Conference} on}, + publisher = {IEEE}, + author = {Saunders, John}, + year = {1996}, + pages = {993--996}, + file = {REAL-TIME DISCRIMINATION OF BROADCAST SPEECH/MUSIC - Acoustics, Speech, and Signal Processing, 1996. ICASSP-96. Conference Proceedings., 1996 IEEE Inte - saunders_j2.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/H9PG94BN/saunders_j2.pdf:application/pdf} } -@inproceedings{sturm_survey_2012, - title = {A survey of evaluation in music genre recognition}, - booktitle = {International {Workshop} on {Adaptive} {Multimedia} {Retrieval}}, - publisher = {Springer}, - author = {Sturm, Bob L}, - year = {2012}, - pages = {29--66}, - file = {Sturm20121212.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/6MJKTRHE/Sturm20121212.pdf:application/pdf} +@inproceedings{nwe_singing_2004, + title = {Singing voice detection in popular music}, + url = {http://dl.acm.org/citation.cfm?id=1027602}, + urldate = {2017-04-25}, + booktitle = {Proceedings of the 12th annual {ACM} international conference on {Multimedia}}, + publisher = {ACM}, + author = {Nwe, Tin Lay and Shenoy, Arun and Wang, Ye}, + year = {2004}, + pages = {324--327}, + file = {p324-nwe.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/HD48B4K8/p324-nwe.pdf:application/pdf} } @article{you_comparative_2015, @@ -202,26 +130,36 @@ author = {You, Shingchern D. 
and Wu, Yi-Chung and Peng, Shih-Hsien}, month = aug, year = {2015}, - file = {11042_2015_2894_Article 1..16 - you2015.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/QQIS2H44/you2015.pdf:application/pdf} + file = {11042_2015_2894_Article 1..16 - you2015.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/QQIS2H44/you2015.pdf:application/pdf} } -@article{boersma_praat_2002, - title = {Praat, a system for doing phonetics by computer}, - volume = {5}, - journal = {Glot international}, - author = {Boersma, Paulus Petrus Gerardus and {others}}, - year = {2002} +@inproceedings{fujihara_automatic_2006, + title = {Automatic synchronization between lyrics and music {CD} recordings based on {Viterbi} alignment of segregated vocal signals}, + url = {http://ieeexplore.ieee.org/abstract/document/4061176/}, + urldate = {2017-03-02}, + booktitle = {Multimedia, 2006. {ISM}'06. {Eighth} {IEEE} {International} {Symposium} on}, + publisher = {IEEE}, + author = {Fujihara, Hiromasa and Goto, Masataka and Ogata, Jun and Komatani, Kazunori and Ogata, Tetsuya and Okuno, Hiroshi G.}, + year = {2006}, + pages = {257--264}, + file = {04061176.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/6DU997E4/04061176.pdf:application/pdf} } -@inproceedings{saunders_real-time_1996, - title = {Real-time discrimination of broadcast speech/music}, - volume = {2}, - booktitle = {Acoustics, {Speech}, and {Signal} {Processing}, 1996. {ICASSP}-96. {Conference} {Proceedings}., 1996 {IEEE} {International} {Conference} on}, - publisher = {IEEE}, - author = {Saunders, John}, - year = {1996}, - pages = {993--996}, - file = {REAL-TIME DISCRIMINATION OF BROADCAST SPEECH/MUSIC - Acoustics, Speech, and Signal Processing, 1996. ICASSP-96. Conference Proceedings., 1996 IEEE Inte - saunders_j2.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/H9PG94BN/saunders_j2.pdf:application/pdf} +@incollection{muller_lyrics--audio_2012, + address = {Wadern}, + title = {Lyrics-to-{Audio} {Alignment} and its {Application}}, + isbn = {978-3-939897-37-8}, + url = {http://nbn-resolving.de/urn:nbn:de:0030-drops-27851}, + language = {English}, + urldate = {2017-03-02}, + booktitle = {Multimodal {Music} {Processing}}, + publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik GmbH}, + author = {Goto, Masataka and Fujihara, Hiromasa}, + editor = {Müller, Meinard}, + year = {2012}, + note = {OCLC: 864001691}, + pages = {23--36}, + file = {3.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/A4ZSSMW5/3.pdf:application/pdf} } @inproceedings{scheirer_construction_1997, @@ -234,18 +172,50 @@ pages = {1331--1334} } -@inproceedings{kato_acoustic_2013, - title = {Acoustic {Features} and {Auditory} {Impressions} of {Death} {Growl} and {Screaming} {Voice}}, - isbn = {978-0-7695-5120-3}, - url = {http://ieeexplore.ieee.org/document/6846676/}, - doi = {10.1109/IIH-MSP.2013.120}, - urldate = {2017-04-11}, +@inproceedings{mesaros_adaptation_2009, + title = {Adaptation of a speech recognizer for singing voice}, + url = {http://ieeexplore.ieee.org/abstract/document/7077626/}, + urldate = {2017-03-02}, + booktitle = {Signal {Processing} {Conference}, 2009 17th {European}}, publisher = {IEEE}, - author = {Kato, Keizo and Ito, Akinori}, - month = oct, - year = {2013}, - pages = {460--463}, - file = {Acoustic Features and Auditory Impressions of Death Growl and Screaming Voice - 
06846676.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/VAT5AGPP/06846676.pdf:application/pdf} + author = {Mesaros, Annamaria and Virtanen, Tuomas}, + year = {2009}, + pages = {1779--1783}, + file = {07077626.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/FN9TWMMJ/07077626.pdf:application/pdf} +} + +@inproceedings{fujihara_three_2008, + title = {Three techniques for improving automatic synchronization between music and lyrics: {Fricative} detection, filler model, and novel feature vectors for vocal activity detection}, + shorttitle = {Three techniques for improving automatic synchronization between music and lyrics}, + url = {http://ieeexplore.ieee.org/abstract/document/4517548/}, + urldate = {2017-03-02}, + booktitle = {Acoustics, {Speech} and {Signal} {Processing}, 2008. {ICASSP} 2008. {IEEE} {International} {Conference} on}, + publisher = {IEEE}, + author = {Fujihara, Hiromasa and Goto, Masataka}, + year = {2008}, + pages = {69--72}, + file = {THREE TECHNIQUES FOR IMPROVING AUTOMATIC SYNCHRONIZATION BETWEEN MUSIC AND LYRICS\: FRICATIVE DETECTION, FILLER MODEL, AND NOVEL FEATURE VECTORS FOR VOCAL ACTIVITY DETECTION - 04517548.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/CMGJ32AM/04517548.pdf:application/pdf} +} + +@inproceedings{berenzweig_locating_2001, + title = {Locating singing voice segments within music signals}, + url = {http://ieeexplore.ieee.org/abstract/document/969557/}, + urldate = {2017-03-02}, + booktitle = {Applications of {Signal} {Processing} to {Audio} and {Acoustics}, 2001 {IEEE} {Workshop} on the}, + publisher = {IEEE}, + author = {Berenzweig, Adam L. and Ellis, Daniel PW}, + year = {2001}, + pages = {119--122}, + file = {Locating singing voice segments within music signals - Applicationis of Signal Processing to Audio and Acoustics, 2001 IEEE Workshop on the - 00969557.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/DWBBQPDE/00969557.pdf:application/pdf} +} + +@misc{friis_vikings_2004, + title = {Vikings and their {Music}}, + url = {http://www.viking.no/e/life/music/e-musikk-mogens.html}, + urldate = {2017-04-11}, + author = {Friis, Mogens}, + year = {2004}, + file = {Vikings and their Music:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/SEEXI3VR/e-musikk-mogens.html:text/html} } @inproceedings{sakakibara_growl_2004, @@ -255,14 +225,115 @@ booktitle = {Proc. {Int}. {Symp}. on {Musical} {Acoustics}}, author = {Sakakibara, K. 
and Fuks, Leonardo and Imagawa, Hiroshi and Tayama, Niro and Naganuma, D.}, year = {2004}, - file = {isma04.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/PUFH652B/isma04.pdf:application/pdf} + file = {isma04.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/PUFH652B/isma04.pdf:application/pdf} } -@misc{friis_vikings_2004, - title = {Vikings and their {Music}}, - url = {http://www.viking.no/e/life/music/e-musikk-mogens.html}, - urldate = {2017-04-11}, - author = {Friis, Mogens}, - year = {2004}, - file = {Vikings and their Music:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/SEEXI3VR/e-musikk-mogens.html:text/html} -} \ No newline at end of file +@inproceedings{vembu_separation_2005, + title = {Separation of {Vocals} from {Polyphonic} {Audio} {Recordings}.}, + url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.139.5510&rep=rep1&type=pdf}, + urldate = {2017-04-25}, + booktitle = {{ISMIR}}, + publisher = {Citeseer}, + author = {Vembu, Shankar and Baumann, Stephan}, + year = {2005}, + pages = {337--344}, + file = {ismir05.dvi - download:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/WZ7INPRU/download.pdf:application/pdf} +} + +@inproceedings{pedone_phoneme-level_2011, + title = {Phoneme-{Level} {Text} to {Audio} {Synchronization} on {Speech} {Signals} with {Background} {Music}.}, + url = {http://ai2-s2-pdfs.s3.amazonaws.com/7fb2/210b6a9e69ea7ca0e4f496548544781c8a8b.pdf}, + urldate = {2017-03-02}, + booktitle = {{INTERSPEECH}}, + author = {Pedone, Agnes and Burred, Juan José and Maller, Simon and Leveau, Pierre}, + year = {2011}, + pages = {433--436}, + file = {210b6a9e69ea7ca0e4f496548544781c8a8b.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/NQR3WB2S/210b6a9e69ea7ca0e4f496548544781c8a8b.pdf:application/pdf} +} + +@inproceedings{mesaros_automatic_2008, + title = {Automatic alignment of music audio and lyrics}, + url = {http://legacy.spa.aalto.fi/dafx08/papers/dafx08_57.pdf}, + urldate = {2017-03-02}, + booktitle = {Proceedings of the 11th {Int}. 
{Conference} on {Digital} {Audio} {Effects} ({DAFx}-08)}, + author = {Mesaros, Annamaria and Virtanen, Tuomas}, + year = {2008}, + file = {dafx08_57.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/H24VX8KW/dafx08_57.pdf:application/pdf} +} + +@article{mesaros_automatic_2010, + title = {Automatic {Recognition} of {Lyrics} in {Singing}}, + volume = {2010}, + issn = {1687-4714, 1687-4722}, + url = {http://asmp.eurasipjournals.com/content/2010/1/546047}, + doi = {10.1155/2010/546047}, + language = {en}, + urldate = {2017-03-02}, + journal = {EURASIP Journal on Audio, Speech, and Music Processing}, + author = {Mesaros, Annamaria and Virtanen, Tuomas}, + year = {2010}, + pages = {1--11}, + file = {art%3A10.1155%2F2010%2F546047.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/3BR5E733/art%3A10.1155%2F2010%2F546047.pdf:application/pdf} +} + +@article{fujihara_lyricsynchronizer:_2011, + title = {{LyricSynchronizer}: {Automatic} {Synchronization} {System} {Between} {Musical} {Audio} {Signals} and {Lyrics}}, + volume = {5}, + issn = {1932-4553, 1941-0484}, + shorttitle = {{LyricSynchronizer}}, + url = {http://ieeexplore.ieee.org/document/5876296/}, + doi = {10.1109/JSTSP.2011.2159577}, + number = {6}, + urldate = {2017-03-02}, + journal = {IEEE Journal of Selected Topics in Signal Processing}, + author = {Fujihara, Hiromasa and Goto, Masataka and Ogata, Jun and Okuno, Hiroshi G.}, + month = oct, + year = {2011}, + pages = {1252--1261}, + file = {untitled - 05876296.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/Q9MQTWHC/05876296.pdf:application/pdf} +} + +@inproceedings{sturm_survey_2012, + title = {A survey of evaluation in music genre recognition}, + booktitle = {International {Workshop} on {Adaptive} {Multimedia} {Retrieval}}, + publisher = {Springer}, + author = {Sturm, Bob L}, + year = {2012}, + pages = {29--66}, + file = {Sturm20121212.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/6MJKTRHE/Sturm20121212.pdf:application/pdf} +} + +@inproceedings{williams_speech/music_1999, + title = {Speech/music discrimination based on posterior probability features.}, + volume = {99}, + url = {https://pdfs.semanticscholar.org/1662/dba5ab1fc87e871605d1fc14b89b0b1a029c.pdf}, + urldate = {2017-05-16}, + booktitle = {Eurospeech}, + author = {Williams, Gethin and Ellis, Daniel PW}, + year = {1999}, + pages = {687--690}, + file = {euro99-uttclass.dvi - dba5ab1fc87e871605d1fc14b89b0b1a029c.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/PZDIDK4Q/dba5ab1fc87e871605d1fc14b89b0b1a029c.pdf:application/pdf} +} + +@inproceedings{berenzweig_using_2002, + title = {Using voice segments to improve artist classification of music}, + url = {http://www.aes.org/e-lib/browse.cfm?elib=11147}, + urldate = {2017-05-16}, + booktitle = {Audio {Engineering} {Society} {Conference}: 22nd {International} {Conference}: {Virtual}, {Synthetic}, and {Entertainment} {Audio}}, + publisher = {Audio Engineering Society}, + author = {Berenzweig, Adam L. and Ellis, Daniel PW and Lawrence, Steve}, + year = {2002}, + file = {aes02-aclass.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/WJHA7NW6/aes02-aclass.pdf:application/pdf} +} + +@inproceedings{rocamora_comparing_2007, + title = {Comparing audio descriptors for singing voice detection in music audio files}, + volume = {26}, + url = {https://pdfs.semanticscholar.org/b1c0/d8188b6459a47993c814f212556e02fcfc91.pdf}, + urldate = {2017-05-16}, + booktitle = {Brazilian symposium on computer music, 11th. 
S\~{a}o Paulo, Brazil},
+	author = {Rocamora, Mart\'in and Herrera, Perfecto},
+	year = {2007},
+	pages = {27},
+	file = {sbcm2007Singing.dvi - d8188b6459a47993c814f212556e02fcfc91.pdf:/home/mrl/.mozilla/firefox/a614qfce.default/zotero/storage/3SMMC6VR/d8188b6459a47993c814f212556e02fcfc91.pdf:application/pdf}
+}
diff --git a/asr.tex b/asr.tex
index 199f901..9cdbaf5 100644
--- a/asr.tex
+++ b/asr.tex
@@ -1,6 +1,6 @@
 %&asr
-\usepackage[nonumberlist,acronyms]{glossaries}
-%\makeglossaries%
+\usepackage[toc,nonumberlist,acronyms]{glossaries}
+\makeglossaries%
 \newacronym{ANN}{ANN}{Artificial Neural Network}
 \newacronym{HMM}{HMM}{Hidden Markov Model}
 \newacronym{GMM}{GMM}{Gaussian Mixture Models}
@@ -9,6 +9,12 @@
 \newacronym{FA}{FA}{Forced alignment}
 \newacronym{MFC}{MFC}{Mel-frequency cepstrum}
 \newacronym{MFCC}{MFCC}{\acrlong{MFC} coefficient}
+\newacronym{PPF}{PPF}{Posterior Probability Features}
+\newacronym{MLP}{MLP}{Multi-layer Perceptron}
+\newacronym{PLP}{PLP}{Perceptual Linear Prediction}
+\newacronym{ZCR}{ZCR}{Zero-crossing Rate}
+\newacronym{LPC}{LPC}{Linear Prediction Coefficients}
+\newacronym{LPCC}{LPCC}{\acrlong{LPC} derived cepstrum}
 \newacronym{IFPI}{IFPI}{International Federation of the Phonographic Industry}
 \newglossaryentry{dm}{name={Death Metal},
 	description={is an extreme heavy metal music style with growling vocals and
@@ -21,6 +27,9 @@
 		frequency representation}}
 \newglossaryentry{MS}{name={Mel-Scale},
 	description={is a human ear inspired scale for spectral signals.}}
+\newglossaryentry{Viterbi}{name={Viterbi},
+	description={is a dynamic programming algorithm for finding the most likely
+	sequence of hidden states in a \gls{HMM}}}
 
 \begin{document}
 \frontmatter{}
@@ -34,10 +43,6 @@
 
 \tableofcontents
 
-%Glossaries
-%\glsaddall{}
-%\printglossaries
-
 \mainmatter{}
 %Berenzweig and Ellis use acoustic classifiers from speech recognition as a
 %detector for singing lines. They achive 80\% accuracy for forty 15 second
@@ -59,241 +64,26 @@
 %Introduction, leading to a clearly defined research question
 \chapter{Introduction}
-\section{Introduction}
-The primary medium for music distribution is rapidly changing from physical
-media to digital media. The \gls{IFPI} stated that about $43\%$ of music
-revenue rises from digital distribution. Another $39\%$ arises from the
-physical sale and the remaining $16\%$ is made through performance and
-synchronisation revenieus. The overtake of digital formats on physical formats
-took place somewhere in 2015. Moreover, ever since twenty years the music
-industry has seen significant growth
-again\footnote{\url{http://www.ifpi.org/facts-and-stats.php}}.
-
-There has always been an interest in lyrics to music alignment to be used in
-for example karaoke. As early as in the late 1980s karaoke machines were
-available for consumers. While the lyrics for the track are almost always
-available, a alignment is not and it involves manual labour to create such an
-alignment.
-
-A lot of this musical distribution goes via non-official channels such as
-YouTube\footnote{\url{https://youtube.com}} in which fans of the performers
-often accompany the music with synchronized lyrics. This means that there is an
-enormous treasure of lyrics-annotated music available but not within our reach
-since the subtitles are almost always hardcoded into the video stream and thus
-not directly usable as data. 
Because of this interest it is very useful to -device automatic techniques for segmenting instrumental and vocal parts of a -song, apply forced alignment or even lyrics recognition on the audio file. - -Such techniques are heavily researched and working systems have been created. -However, these techniques are designed to detect a clean singing voice and have -not been testen on so-called \emph{extended vocal techniques} such as grunting -or growling. Growling is heavily used in extreme metal genres such as \gls{dm} -but it must be noted that grunting is not a technique only used in extreme -metal styles. Similar or equal techniques have been used in \emph{Beijing -opera}, Japanese \emph{Noh} and but also more western styles like jazz singing -by Louis Armstrong\cite{sakakibara_growl_2004}. It might even be traced back -to viking times. For example, an arab merchant visiting a village in Denmark -wrote in the tenth century\cite{friis_vikings_2004}: - -\begin{displayquote} - Never before I have heard uglier songs than those of the Vikings in - Slesvig. The growling sound coming from their throats reminds me of dogs - howling, only more untamed. -\end{displayquote} - -\section{\gls{dm}} - -%Literature overview / related work -\section{Related work} -The field of applying standard speech processing techniques on music started in -the late 90s\cite{saunders_real-time_1996,scheirer_construction_1997} and it -was found that music has different discriminating features compared to normal -speech. - -Berenzweig and Ellis expanded on the aforementioned research by trying to -separate singing from instrumental music\cite{berenzweig_locating_2001}. - -\todo{Incorporate this in literary framing}% -~\cite{fujihara_automatic_2006}% -~\cite{fujihara_lyricsynchronizer:_2011}% -~\cite{fujihara_three_2008}% -~\cite{mauch_integrating_2012}% -~\cite{mesaros_adaptation_2009}% -~\cite{mesaros_automatic_2008}% -~\cite{mesaros_automatic_2010}% -~%\cite{muller_multimodal_2012}% -~\cite{pedone_phoneme-level_2011}% -~\cite{yang_machine_2012}% - - - -\section{Research question} -It is discutable whether the aforementioned techniques work because the -spectral properties of a growling voice is different from the spectral -properties of a clean singing voice. It has been found that growling voices -have less prominent peaks in the frequency representation and are closer to -noise then clean singing\cite{kato_acoustic_2013}. This leads us to the -research question: - -\begin{center}\em% - Are standard \gls{ANN} based techniques for singing voice detection - suitable for non-standard musical genres like \gls{dm}. -\end{center} +\input{intro.tex} \chapter{Methods} -%Methodology - -%Experiment(s) (set-up, data, results, discussion) -\section{Data \& Preprocessing} -To run the experiments data has been collected from several \gls{dm} albums. -The exact data used is available in Appendix~\ref{app:data}. The albums are -extracted from the audio CD and converted to a mono channel waveform with the -correct samplerate \emph{SoX}\footnote{\url{http://sox.sourceforge.net/}}. -Every file is annotated using -Praat\cite{boersma_praat_2002} where the utterances are manually aligned to -the audio. Examples of utterances are shown in -Figure~\ref{fig:bloodstained} and Figure~\ref{fig:abominations} where the -waveform, $1-8000$Hz spectrals and annotations are shown. It is clearly visible -that within the genre of death metal there are a different spectral patterns -visible. 
- -\begin{figure}[ht] - \centering - \includegraphics[width=.7\linewidth]{cement} - \caption{A vocal segment of the \emph{Cannibal Corpse} song - \emph{Bloodstained Cement}}\label{fig:bloodstained} -\end{figure} - -\begin{figure}[ht] - \centering - \includegraphics[width=.7\linewidth]{abominations} - \caption{A vocal segment of the \emph{Disgorge} song - \emph{Enthroned Abominations}}\label{fig:abominations} -\end{figure} - -The data is collected from three studio albums. The -first band is called \emph{Cannibal Corpse} and has been producing \gls{dm} for -almost 25 years and have been creating the same type every album. The singer of -\emph{Cannibal Corpse} has a very raspy growls and the lyrics are quite -comprehensible. The vocals produced by \emph{Cannibal Corpse} are bordering -regular shouting. - -The second band is called \emph{Disgorge} and make even more violently sounding -music. The growls of the lead singer sound like a coffee grinder and are more -shallow. In the spectrals it is clearly visible that there are overtones -produced during some parts of the growling. The lyrics are completely -incomprehensible and therefore some parts were not annotated with the actual -lyrics because it was not possible what was being sung. - -Lastly a band from Moscow is chosen bearing the name \emph{Who Dies in -Siberian Slush}. This band is a little odd compared to the previous \gls{dm} -bands because they create \gls{dom}. \gls{dom} is characterized by the very -slow tempo and low tuned guitars. The vocalist has a very characteristic growl -and performs in several moscovian bands. This band also stands out because it -uses piano's and synthesizers. The droning synthesizers often operate in the -same frequency as the vocals. - -\section{\gls{MFCC} Features} -The waveforms in itself are not very suitable to be used as features due to the -high dimensionality and correlation. Therefore we use the aften used -\glspl{MFCC} feature vectors.\todo{cite which papers use this} The actual -conversion is done using the \emph{python\_speech\_features}% -\footnote{\url{https://github.com/jameslyons/python_speech_features}} package. - -\gls{MFCC} features are nature inspired and built incrementally in a several of -steps. -\begin{enumerate} - \item The first step in the process is converting the time representation - of the signal to a spectral representation using a sliding window with - overlap. The width of the window and the step size are two important - parameters in the system. In classical phonetic analysis window sizes - of $25ms$ with a step of $10ms$ are often chosen because they are small - enough to only contain subphone entities. Singing for $25ms$ is - impossible so it is arguable that the window size is very small. - \item The standard \gls{FT} gives a spectral representation that has - linearly scaled frequencies. This scale is converted to the \gls{MS} - using triangular overlapping windows. - \item -\end{enumerate} - - -\todo{Explain why MFCC and which parameters} - -\section{\gls{ANN} Classifier} -\todo{Spectrals might be enough, no decorrelation} - -\section{Model training} - -\section{Experiments} - -\section{Results} - +\input{methods.tex} \chapter{Conclusion \& Discussion} -\section{Conclusion} -This research shows that existing techniques for singing-voice detection -designed for regular singing voices also work respectably on extreme singing -styles like grunting. With a standard \gls{ANN} classifier using \gls{MFCC} -features a performance of $85\%$ can be achieved. 
When applying smoothing this -can be increased until\todo{results}. +\input{conclusion.tex} -%Discussion section -\section{Discussion} -Singing-voice detection can be seen as a crude way of -genre-discrimination.\todo{finish} - -\todo{Novelty} -\todo{Weaknesses} -\todo{Dataset is not very varied but\ldots} - -\todo{Doom metal} -%Conclusion section -%Acknowledgements -%Statement on authors' contributions %(Appendices) \appendix -\chapter{Experimental data}\label{app:data} -\begin{table}[h] - \centering - \begin{tabular}{cllll} - \toprule - Num. & Artist & Album & Song & Duration\\ - \midrule - 00 & Cannibal Corpse & A Skeletal Domain & High Velocity Impact Spatter & 04:06.91\\ - 01 & Cannibal Corpse & A Skeletal Domain & Sadistic Embodiment & 03:17.31\\ - 02 & Cannibal Corpse & A Skeletal Domain & Kill or Become & 03:50.67\\ - 03 & Cannibal Corpse & A Skeletal Domain & A Skeletal Domain & 03:38.77\\ - 04 & Cannibal Corpse & A Skeletal Domain & Headlong Into Carnage & 03:01.25\\ - 05 & Cannibal Corpse & A Skeletal Domain & The Murderer's Pact & 05:05.23\\ - 06 & Cannibal Corpse & A Skeletal Domain & Funeral Cremation & 03:41.89\\ - 07 & Cannibal Corpse & A Skeletal Domain & Icepick Lobotomy & 03:16.24\\ - 08 & Cannibal Corpse & A Skeletal Domain & Vector of Cruelty & 03:25.15\\ - 09 & Cannibal Corpse & A Skeletal Domain & Bloodstained Cement & 03:41.99\\ - 10 & Cannibal Corpse & A Skeletal Domain & Asphyxiate to Resuscitate & 03:47.40\\ - 11 & Cannibal Corpse & A Skeletal Domain & Hollowed Bodies & 03:05.80\\ - 12 & Disgorge & Parallels of Infinite Torture & Revealed in Obscurity & 05:13.20\\ - 13 & Disgorge & Parallels of Infinite Torture & Enthroned Abominations & 04:05.39\\ - 14 & Disgorge & Parallels of Infinite Torture & Atonement & 02:57.36\\ - 15 & Disgorge & Parallels of Infinite Torture & Abhorrent Desecration of Thee Iniquity & 04:17.20\\ - 16 & Disgorge & Parallels of Infinite Torture & Forgotten Scriptures & 02:01.72\\ - 17 & Disgorge & Parallels of Infinite Torture & Descending Upon Convulsive Devourment & 04:38.85\\ - 18 & Disgorge & Parallels of Infinite Torture & Condemned to Sufferance & 04:57.59\\ - 19 & Disgorge & Parallels of Infinite Torture & Parallels of Infinite Torture & 05:03.33\\ - 20 & Disgorge & Parallels of Infinite Torture & Asphyxiation of Thee Oppressed & 05:42.37\\ - 21 & Disgorge & Parallels of Infinite Torture & Ominous Sigils of Ungodly Ruin & 04:59.15\\ - 22 & Who Dies In Siberian Slush & Bitterness Of The Years That Are Lost & Leave Me & 06:35.60\\ - 23 & Who Dies In Siberian Slush & Bitterness Of The Years That Are Lost & The Woman We Are Looking For & 06:53.63\\ - 24 & Who Dies In Siberian Slush & Bitterness Of The Years That Are Lost & M\"obius Ring & 07:20.56\\ - 25 & Who Dies In Siberian Slush & Bitterness Of The Years That Are Lost & Interlude & 04:26.49\\ - 26 & Who Dies In Siberian Slush & Bitterness Of The Years That Are Lost & Завещание Гумилёва & 08:46.76\\ - 27 & Who Dies In Siberian Slush & Bitterness Of The Years That Are Lost & An Old Road Through The Snow & 02:31.56\\ - 28 & Who Dies In Siberian Slush & Bitterness Of The Years That Are Lost & Bitterness Of The Years That Are Lost & 09:10.49\\ - \midrule - & & & Total: & 02:13:40\\ - \bottomrule - \end{tabular} - \caption{Songs used in the experiments} -\end{table} +\input{appendices.tex} + +\newpage +%Glossaries +\glsaddall{} +\begingroup +\let\clearpage\relax +\let\cleardoublepage\relax +\printglossaries{} +\endgroup \bibliographystyle{ieeetr} \bibliography{asr} diff --git 
a/conclusion.tex b/conclusion.tex
new file mode 100644
index 0000000..9d5176f
--- /dev/null
+++ b/conclusion.tex
@@ -0,0 +1,18 @@
+\section{Conclusion}
+This research shows that existing techniques for singing-voice detection
+designed for regular singing voices also work respectably on extreme singing
+styles like grunting. With a standard \gls{ANN} classifier using \gls{MFCC}
+features, a performance of $85\%$ can be achieved. When applying smoothing
+this can be increased further\todo{results}.
+
+%Discussion section
+\section{Discussion}
+Singing-voice detection can be seen as a crude way of
+genre discrimination.\todo{finish}
+
+\todo{Novelty}
+\todo{Weaknesses}
+\todo{Dataset is not very varied but\ldots}
+
+\todo{Doom metal}
+%Conclusion section
diff --git a/experiment/Makefile b/experiment/Makefile
new file mode 100644
index 0000000..5eea47e
--- /dev/null
+++ b/experiment/Makefile
@@ -0,0 +1,37 @@
+DOCS:=experiment
+GREP?=grep
+LATEX?=pdflatex
+BIBTEX?=bibtex
+BIBTEXFLAGS:=
+MAKEGLOSSARIES?=makeglossaries
+MAKEGLOSSARIESFLAGS?=
+LATEXFLAGS:=-file-line-error -halt-on-error -no-shell-escape
+
+.PHONY: all clean
+.SECONDARY: $(addsuffix .fmt,$(DOCS))
+
+all: $(addsuffix .pdf,$(DOCS))
+
+%.fmt: %.pre
+	$(LATEX) $(LATEXFLAGS) -ini -jobname="$(basename $@)" "&$(LATEX) $<\dump"
+
+%.pdf: %.tex %.fmt $(wildcard *.bib) $(wildcard *.tex)
+	$(LATEX) $(LATEXFLAGS) $<
+	if $(GREP) -q '^\\bibdata{' $(basename $<).aux; then $(BIBTEX) $(BIBTEXFLAGS) $(basename $<); fi
+	if $(GREP) -q '\@istfilename' $(basename $<).aux; then $(MAKEGLOSSARIES) $(MAKEGLOSSARIESFLAGS) $(basename $<); fi
+	$(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog
+	$(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true
+	$(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true
+	$(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true
+	$(RM) $(basename $@).mlog
+
+clean: $(addprefix clean-,$(DOCS))
+
+clobber: $(addprefix clobber-,$(DOCS))
+
+clean-%:
+	$(RM) $(addprefix $(@:clean-%=%).,acn acr alg aux bbl blg fmt glg glo gls\
+		ist lof log lol lot nav out run.xml snm tdo toc vrb xdy)
+
+clobber-%:
+	$(RM) $(@:clobber-%=%).pdf
diff --git a/experiment.pre b/experiment/experiment.pre
similarity index 95%
rename from experiment.pre
rename to experiment/experiment.pre
index c25962c..da1f5a8 100644
--- a/experiment.pre
+++ b/experiment/experiment.pre
@@ -5,7 +5,7 @@
 \usepackage{geometry} % Papersize
 \usepackage{hyperref} % Hyperlinks
 \usepackage{graphicx} % Images
-\graphicspath{{img/}}
+\graphicspath{{../img/}}
 \urlstyle{same}
 \hypersetup{%
 	pdftitle={},
diff --git a/experiment.tex b/experiment/experiment.tex
similarity index 100%
rename from experiment.tex
rename to experiment/experiment.tex
diff --git a/intro.tex b/intro.tex
new file mode 100644
index 0000000..a1133e7
--- /dev/null
+++ b/intro.tex
@@ -0,0 +1,107 @@
+\section{Introduction}
+The primary medium for music distribution is rapidly changing from physical
+media to digital media. The \gls{IFPI} stated that about $43\%$ of music
+revenue arises from digital distribution. Another $39\%$ comes from physical
+sales and the remaining $16\%$ is made through performance and
+synchronisation revenues. Digital formats overtook physical formats somewhere
+in 2015. Moreover, for the first time in twenty years the music industry has
+seen significant growth
+again\footnote{\url{http://www.ifpi.org/facts-and-stats.php}}.
+
+There has always been an interest in lyrics-to-music alignment, to be used
+in, for example, karaoke. As early as the late 1980s karaoke machines were
+available to consumers. While the lyrics for a track are almost always
+available, an alignment is not, and it involves manual labour to create such
+an alignment.
+
+A lot of this music distribution goes via non-official channels such as
+YouTube\footnote{\url{https://youtube.com}} on which fans of the performers
+often accompany the music with synchronized lyrics. This means that there is
+an enormous treasure of lyrics-annotated music available, but it is not
+within our reach since the subtitles are almost always hardcoded into the
+video stream and thus not directly usable as data. Because of this interest
+it is very useful to devise automatic techniques for segmenting the
+instrumental and vocal parts of a song, applying forced alignment, or even
+performing lyrics recognition on the audio file.
+
+Such techniques are heavily researched and working systems have been created.
+However, these techniques are designed to detect a clean singing voice and
+have not been tested on so-called \emph{extended vocal techniques} such as
+grunting or growling. Growling is heavily used in extreme metal genres such
+as \gls{dm}, but it must be noted that grunting is not a technique used only
+in extreme metal styles. Similar or equal techniques have been used in
+\emph{Beijing opera} and Japanese \emph{Noh}, but also in more western styles
+like the jazz singing of Louis Armstrong\cite{sakakibara_growl_2004}. It
+might even be traced back to Viking times. For example, an Arab merchant
+visiting a village in Denmark wrote in the tenth
+century\cite{friis_vikings_2004}:
+
+\begin{displayquote}
+	Never before I have heard uglier songs than those of the Vikings in
+	Slesvig. The growling sound coming from their throats reminds me of dogs
+	howling, only more untamed.
+\end{displayquote}
+
+\section{\gls{dm}}
+
+%Literature overview / related work
+\section{Related work}
+Applying speech-related processing and classification techniques to music
+started in the late 90s. Saunders devised a technique to classify audio into
+the categories \emph{Music} and \emph{Speech}\cite{saunders_real-time_1996}.
+It was found that music has different properties than speech: music has more
+bandwidth, tonality and regularity. Multivariate Gaussian classifiers were
+used to discriminate the categories with an average performance of $90\%$.
+
+Williams and Ellis were inspired by the aforementioned research and tried to
+separate the singing segments from the instrumental
+segments\cite{williams_speech/music_1999}. This was later verified by
+Berenzweig and Ellis\cite{berenzweig_locating_2001}. The latter became the de
+facto literature on singing voice detection. Both show that features derived
+from \glspl{PPF}, such as their energy and distribution, are highly effective
+in separating speech from non-speech signals such as music. The data used was
+already segmented.
+
+Later, Berenzweig et al.\ showed singing voice segments to be more useful for
+artist classification and used a \gls{MLP} on \gls{PLP} coefficients to
+detect the singing voice\cite{berenzweig_using_2002}. Nwe et al.\ showed that
+there is not much difference in accuracy between the different features
+originating from speech processing: they tested several features and found
+that the accuracies differ by less than a few percent. Moreover, they found
+that others have tried to tackle the problem using a myriad of different
+approaches, such as using \gls{ZCR}, \gls{MFCC} and \gls{LPCC} features and
+\glspl{HMM} or \glspl{GMM} as classifiers\cite{nwe_singing_2004}.
+
+Fujihara et al.\ took the idea to the next level by attempting to do \gls{FA}
+on music. Theirs is a three-step approach: first the accompaniment levels are
+reduced, secondly the vocal segments are separated from the non-vocal
+segments using a simple two-state \gls{HMM}, and the chain is concluded by
+applying \gls{Viterbi} alignment on the segregated signals with the lyrics.
+The system showed accuracy levels of $90\%$ on Japanese
+music\cite{fujihara_automatic_2006}. Later they improved upon
+this\cite{fujihara_three_2008} and even made a ready-to-use karaoke
+application that can do this online\cite{fujihara_lyricsynchronizer:_2011}.
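+
+To illustrate the segmentation step, the following is a minimal sketch of
+\gls{Viterbi} decoding in such a two-state vocal/non-vocal \gls{HMM}. It is
+not Fujihara et al.'s implementation: the prior and transition probabilities
+are made-up values, and the per-frame log-likelihoods are assumed to come
+from some acoustic model:
+\begin{verbatim}
+import numpy as np
+
+# States: 0 = non-vocal, 1 = vocal; hypothetical probabilities.
+prior = np.array([0.5, 0.5])
+trans = np.array([[0.9, 0.1],   # non-vocal -> non-vocal/vocal
+                  [0.2, 0.8]])  # vocal     -> non-vocal/vocal
+
+def viterbi(loglik):
+    # loglik: (T, 2) array of per-frame state log-likelihoods.
+    T = len(loglik)
+    delta = np.log(prior) + loglik[0]
+    psi = np.zeros((T, 2), dtype=int)
+    for t in range(1, T):
+        scores = delta[:, None] + np.log(trans)
+        psi[t] = scores.argmax(axis=0)
+        delta = scores.max(axis=0) + loglik[t]
+    path = [int(delta.argmax())]
+    for t in range(T - 1, 0, -1):   # backtrack
+        path.append(psi[t][path[-1]])
+    return path[::-1]  # most likely vocal/non-vocal sequence
+\end{verbatim}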
+
+Singing voice detection can also be seen as a binary genre recognition
+problem; therefore the techniques used in that field might be of use. Genre
+recognition has a long history that can be found in the survey by
+Sturm\cite{sturm_survey_2012}. It must be noted that of all the $485$ papers
+cited by Sturm, only one, a master's thesis, applies genre recognition to
+heavy metal genres\cite{tsatsishvili_automatic_2011}.
+
+Singing voice detection has been tried on less conventional styles in the
+past. Dzhambazov et al.\ proposed to align long syllables in Beijing Opera to
+the audio\cite{dzhambazov_automatic_2016}; Beijing Opera sometimes contains
+growling-like vocals. Dzhambazov et al.\ also tried aligning lyrics to audio
+in classical Turkish music\cite{dzhambazov_automatic_2014}.
+
+\section{Research question}
+It is debatable whether the aforementioned techniques work, because the
+spectral properties of a growling voice are different from the spectral
+properties of a clean singing voice. It has been found that growling voices
+have less prominent peaks in the frequency representation and are closer to
+noise than clean singing\cite{kato_acoustic_2013}. This leads us to the
+research question:
+
+\begin{center}\em%
+	Are standard \gls{ANN}-based techniques for singing voice detection
+	suitable for non-standard musical genres like \gls{dm} and \gls{dom}?
+\end{center}
diff --git a/methods.tex b/methods.tex
new file mode 100644
index 0000000..c49c249
--- /dev/null
+++ b/methods.tex
@@ -0,0 +1,88 @@
+%Methodology
+
+%Experiment(s) (set-up, data, results, discussion)
+\section{Data \& Preprocessing}
+To run the experiments, data has been collected from several \gls{dm} albums.
+The exact data used is available in Appendix~\ref{app:data}. The albums are
+extracted from the audio CD and converted to a mono-channel waveform with the
+correct sample rate using
+\emph{SoX}\footnote{\url{http://sox.sourceforge.net/}}. Every file is
+annotated using Praat\cite{boersma_praat_2002}, where the utterances are
+manually aligned to the audio. Examples of utterances are shown in
+Figure~\ref{fig:bloodstained} and Figure~\ref{fig:abominations}, where the
+waveform, the $1$--$8000$Hz spectrogram and the annotations are shown. It is
+clearly visible that even within the genre of death metal different spectral
+patterns occur.
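+
+For reference, the conversion can be scripted. This is a minimal sketch; the
+$16kHz$ sample rate and the file names are assumptions, not prescribed
+choices:
+\begin{verbatim}
+import subprocess
+
+# Downmix a CD rip to one channel and resample to an assumed
+# rate of 16kHz; SoX applies the required conversions itself.
+subprocess.run(['sox', 'track.flac', '-c', '1', '-r', '16000',
+                'track.wav'], check=True)
+\end{verbatim}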
+
+\begin{figure}[ht]
+	\centering
+	\includegraphics[width=.7\linewidth]{cement}
+	\caption{A vocal segment of the \emph{Cannibal Corpse} song
+		\emph{Bloodstained Cement}}\label{fig:bloodstained}
+\end{figure}
+
+\begin{figure}[ht]
+	\centering
+	\includegraphics[width=.7\linewidth]{abominations}
+	\caption{A vocal segment of the \emph{Disgorge} song
+		\emph{Enthroned Abominations}}\label{fig:abominations}
+\end{figure}
+
+The data is collected from three studio albums. The first band is called
+\emph{Cannibal Corpse}; it has been producing \gls{dm} for almost 25 years
+and has been creating the same type of music on every album. The singer of
+\emph{Cannibal Corpse} has a very raspy growl and the lyrics are quite
+comprehensible. The vocals produced by \emph{Cannibal Corpse} are bordering
+on regular shouting.
+
+The second band is called \emph{Disgorge} and makes even more violent-sounding
+music. The growls of the lead singer sound like a coffee grinder and are
+shallower. In the spectrograms it is clearly visible that there are overtones
+produced during some parts of the growling. The lyrics are completely
+incomprehensible and therefore some parts were not annotated with the actual
+lyrics because it was not possible to hear what was being sung.
+
+Lastly, a band from Moscow was chosen, bearing the name \emph{Who Dies in
+Siberian Slush}. This band is a little odd compared to the previous \gls{dm}
+bands because they create \gls{dom}. \gls{dom} is characterized by its very
+slow tempo and low-tuned guitars. The vocalist has a very characteristic
+growl and performs in several Moscow-based bands. This band also stands out
+because it uses pianos and synthesizers. The droning synthesizers often
+operate in the same frequency range as the vocals.
+
+\section{\gls{MFCC} Features}
+The waveforms themselves are not very suitable to be used as features due to
+their high dimensionality and correlation. Therefore we use the often-used
+\glspl{MFCC} feature vectors, which have been shown to be
+suitable\cite{rocamora_comparing_2007}. The actual conversion is done using
+the \emph{python\_speech\_features}%
+\footnote{\url{https://github.com/jameslyons/python_speech_features}} package.
+
+\gls{MFCC} features are nature inspired and built incrementally in several
+steps; an example extraction call is shown after the list.
+\begin{enumerate}
+	\item The first step in the process is converting the time representation
+		of the signal to a spectral representation using a sliding window with
+		overlap. The width of the window and the step size are two important
+		parameters in the system. In classical phonetic analysis window sizes
+		of $25ms$ with a step of $10ms$ are often chosen because they are
+		small enough to only contain subphone entities. No sung utterance is
+		as short as $25ms$, so it is arguable that this window size is very
+		small for singing.
+	\item The standard \gls{FT} gives a spectral representation that has
+		linearly scaled frequencies. This scale is converted to the \gls{MS}
+		using triangular overlapping windows.
+	\item The logarithm of the energy in every \gls{MS} band is taken, since
+		loudness is perceived logarithmically.
+	\item Finally, the log energies are decorrelated by applying a discrete
+		cosine transform, of which the first coefficients form the \gls{MFCC}
+		feature vector.
+\end{enumerate}
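+
+As an illustration, the extraction boils down to a single call. This is a
+minimal sketch; the file name and the parameter values shown are assumptions,
+not final choices:
+\begin{verbatim}
+import scipy.io.wavfile as wav
+from python_speech_features import mfcc
+
+(rate, signal) = wav.read('track.wav')  # mono waveform from SoX
+# 25ms windows with a 10ms step, 13 coefficients per frame
+features = mfcc(signal, samplerate=rate,
+                winlen=0.025, winstep=0.01, numcep=13)
+# features: one 13-dimensional vector per window
+\end{verbatim}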
+
+\todo{Explain why MFCC and which parameters}
+
+\section{\gls{ANN} Classifier}
+\todo{Spectrals might be enough, no decorrelation}
+
+\section{Model training}
+
+\section{Experiments}
+
+\section{Results}
+
diff --git a/proposal/Makefile b/proposal/Makefile
new file mode 100644
index 0000000..16f7660
--- /dev/null
+++ b/proposal/Makefile
@@ -0,0 +1,37 @@
+DOCS:=proposal
+GREP?=grep
+LATEX?=pdflatex
+BIBTEX?=bibtex
+BIBTEXFLAGS:=
+MAKEGLOSSARIES?=makeglossaries
+MAKEGLOSSARIESFLAGS?=
+LATEXFLAGS:=-file-line-error -halt-on-error -no-shell-escape
+
+.PHONY: all clean
+.SECONDARY: $(addsuffix .fmt,$(DOCS))
+
+all: $(addsuffix .pdf,$(DOCS))
+
+%.fmt: %.pre
+	$(LATEX) $(LATEXFLAGS) -ini -jobname="$(basename $@)" "&$(LATEX) $<\dump"
+
+%.pdf: %.tex %.fmt $(wildcard *.bib) $(wildcard *.tex)
+	$(LATEX) $(LATEXFLAGS) $<
+	if $(GREP) -q '^\\bibdata{' $(basename $<).aux; then $(BIBTEX) $(BIBTEXFLAGS) $(basename $<); fi
+	if $(GREP) -q '\@istfilename' $(basename $<).aux; then $(MAKEGLOSSARIES) $(MAKEGLOSSARIESFLAGS) $(basename $<); fi
+	$(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog
+	$(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true
+	$(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true
+	$(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true
+	$(RM) $(basename $@).mlog
+
+clean: $(addprefix clean-,$(DOCS))
+
+clobber: $(addprefix clobber-,$(DOCS))
+
+clean-%:
+	$(RM) $(addprefix $(@:clean-%=%).,acn acr alg aux bbl blg fmt glg glo gls\
+		ist lof log lol lot nav out run.xml snm tdo toc vrb xdy)
+
+clobber-%:
+	$(RM) $(@:clobber-%=%).pdf
diff --git a/proposal.pre b/proposal/proposal.pre
similarity index 100%
rename from proposal.pre
rename to proposal/proposal.pre
diff --git a/proposal.tex b/proposal/proposal.tex
similarity index 100%
rename from proposal.tex
rename to proposal/proposal.tex