From: Mart Lubbers Date: Tue, 28 Mar 2017 21:50:05 +0000 (+0200) Subject: update for tomorrow X-Git-Url: https://git.martlubbers.net/?a=commitdiff_plain;h=a70106a1b0f0d504f3fd311f237673ace5751b5c;p=asr1617.git update for tomorrow --- diff --git a/asr.bib b/asr.bib index 570488f..594ead2 100644 --- a/asr.bib +++ b/asr.bib @@ -211,4 +211,25 @@ journal = {Glot international}, author = {Boersma, Paulus Petrus Gerardus and {others}}, year = {2002} +} + +@inproceedings{saunders_real-time_1996, + title = {Real-time discrimination of broadcast speech/music}, + volume = {2}, + booktitle = {Acoustics, {Speech}, and {Signal} {Processing}, 1996. {ICASSP}-96. {Conference} {Proceedings}., 1996 {IEEE} {International} {Conference} on}, + publisher = {IEEE}, + author = {Saunders, John}, + year = {1996}, + pages = {993--996}, + file = {REAL-TIME DISCRIMINATION OF BROADCAST SPEECH/MUSIC - Acoustics, Speech, and Signal Processing, 1996. ICASSP-96. Conference Proceedings., 1996 IEEE Inte - saunders_j2.pdf:/home/mrl/.mozilla/firefox/7b4r727h.default-1470981082057/zotero/storage/H9PG94BN/saunders_j2.pdf:application/pdf} +} + +@inproceedings{scheirer_construction_1997, + title = {Construction and evaluation of a robust multifeature speech/music discriminator}, + volume = {2}, + booktitle = {Acoustics, {Speech}, and {Signal} {Processing}, 1997. 
{ICASSP}-97., 1997 {IEEE} {International} {Conference} on}, + publisher = {IEEE}, + author = {Scheirer, Eric and Slaney, Malcolm}, + year = {1997}, + pages = {1331--1334} } \ No newline at end of file diff --git a/asr.tex b/asr.tex index e7c6297..4899bd1 100644 --- a/asr.tex +++ b/asr.tex @@ -9,6 +9,7 @@ \newacronym{FA}{FA}{Forced alignment} \newacronym{MFC}{MFC}{Mel-frequency cepstrum} \newacronym{MFCC}{MFCC}{\acrlong{MFC} coefficient} +\newacronym{IFPI}{IFPI}{International Federation of the Phonographic Industry} \newglossaryentry{dm}{name={Death Metal}, description={is an extreme heavy metal music style with growling vocals and pounding drums}} @@ -26,62 +27,89 @@ \tableofcontents %Glossaries -\glsaddall{} -\printglossaries% +%\glsaddall{} +%\printglossaries \mainmatter{} -Berenzweig and Ellis use acoustic classifiers from speech recognition as a -detector for singing lines. They achive 80\% accuracy for forty 15 second -exerpts. They mention people that wrote signal features that discriminate -between speech and music. Neural net -\glspl{HMM}~\cite{berenzweig_locating_2001}. - -In 2014 Dzhambazov et al.\ applied state of the art segmentation methods to -polyphonic turkish music, this might be interesting to use for heavy metal. -They mention Fujihara (2011) to have a similar \gls{FA} system. This method uses -phone level segmentation, first 12 \gls{MFCC}s. They first do vocal/non-vocal -detection, then melody extraction, then alignment. They compare results with -Mesaros \& Virtanen, 2008~\cite{dzhambazov_automatic_2014}. Later they -specialize in long syllables in a capella. They use \glspl{DHMM} with -\glspl{GMM} and show that adding knowledge increases alignment (bejing opera -has long syllables)~\cite{dzhambazov_automatic_2016}. 
- -t\cite{fujihara_automatic_2006} -t\cite{fujihara_lyricsynchronizer:_2011} -t\cite{fujihara_three_2008} -t\cite{mauch_integrating_2012} -t\cite{mesaros_adaptation_2009} -t\cite{mesaros_automatic_2008} -t\cite{mesaros_automatic_2010} -t\cite{muller_multimodal_2012} -t\cite{pedone_phoneme-level_2011} -t\cite{yang_machine_2012} +%Berenzweig and Ellis use acoustic classifiers from speech recognition as a +%detector for singing lines. They achive 80\% accuracy for forty 15 second +%exerpts. They mention people that wrote signal features that discriminate +%between speech and music. Neural net +%\glspl{HMM}~\cite{berenzweig_locating_2001}. +% +%In 2014 Dzhambazov et al.\ applied state of the art segmentation methods to +%polyphonic turkish music, this might be interesting to use for heavy metal. +%They mention Fujihara (2011) to have a similar \gls{FA} system. This method uses +%phone level segmentation, first 12 \gls{MFCC}s. They first do vocal/non-vocal +%detection, then melody extraction, then alignment. They compare results with +%Mesaros \& Virtanen, 2008~\cite{dzhambazov_automatic_2014}. Later they +%specialize in long syllables in a capella. They use \glspl{DHMM} with +%\glspl{GMM} and show that adding knowledge increases alignment (bejing opera +%has long syllables)~\cite{dzhambazov_automatic_2016}. +% %Introduction, leading to a clearly defined research question \chapter{Introduction} \section{Introduction} -Music is a leading type of data distributed on the internet. Regular music -distribution is almost entirely digital and services like Spotify and YouTube -allow one to listen to almost any song within a few clicks. Moreover, there are -myriads of websites offering lyrics of songs. - -\todo{explain relevancy, (preprocessing for lyric alignment)} +The \gls{IFPI} stated that about $43\%$ of music revenue rises from digital +distribution. 
The overtake on physical formats took place somewhere in 2015 and +since twenty years the music industry has seen significant +growth\footnote{\url{http://www.ifpi.org/facts-and-stats.php}}. + +A lot of this musical distribution goes via non-official channels such as +YouTube\footnote{\url{https://youtube.com}} in which fans of the musical group +accompany the music with synchronized lyrics so that users can sing or read +along. Because of this interest it is very useful to devise automatic +techniques for segmenting instrumental and vocal parts of a song and +applying forced alignment or even lyrics recognition on the audio file. + + +%A majority of the music is not only instrumental but also contains vocal +%segments. +% +%Music is a leading type of data distributed on the internet. Regular music +%distribution is almost entirely digital and services like Spotify and YouTube +%allow one to listen to almost any song within a few clicks. Moreover, there are +%myriads of websites offering lyrics of songs. +% +%\todo{explain relevancy, (preprocessing for lyric alignment)} +% +%This leads to the following research question: +%\begin{center}\em% +% Are standard \gls{ANN} based techniques for singing voice detection +% suitable for non-standard musical genres like Death metal. +%\end{center} +%Literature overview / related work +\section{Related work} +The field of applying standard speech processing techniques on music started in +the late 90s~\cite{saunders_real-time_1996,scheirer_construction_1997} and it +was found that music has different discriminating features compared to normal +speech. + +Berenzweig and Ellis expanded on the aforementioned research by trying to +separate singing from instrumental music~\cite{berenzweig_locating_2001}. 
+ +\todo{Incorporate this in literary framing} +~\cite{fujihara_automatic_2006} +~\cite{fujihara_lyricsynchronizer:_2011} +~\cite{fujihara_three_2008} +~\cite{mauch_integrating_2012} +~\cite{mesaros_adaptation_2009} +~\cite{mesaros_automatic_2008} +~\cite{mesaros_automatic_2010} +~%\cite{muller_multimodal_2012} +~\cite{pedone_phoneme-level_2011} +~\cite{yang_machine_2012} + +\section{Research question} This leads to the following research question: \begin{center}\em% Are standard \gls{ANN} based techniques for singing voice detection suitable for non-standard musical genres like Death metal. \end{center} -%Literature overview / related work -\section{Related work} - -Singing/non-singing detection has been fairecent topic of interest in the -academia. Just in 2001 Berenzweig and Ellis~\cite{berenzweig_locating_2001} -researched singing voice detection in stead of the more founded topic of -discerning music from regular speech. In their research - \chapter{Methods} %Methodology @@ -126,6 +154,42 @@ and are more shallow. The lyrics are completely incomprehensible and therefore some parts are not annotated with lyrics because it was too difficult to hear what was being sung. +\section{Methods} +\todo{To remove in final thesis} +The initial planning is still up to date. About one and a half albums have been +annotated and a framework for setting up experiments has been created. +Moreover, the first exploratory experiments have already been executed and are +promising. 
In April the experimental dataset will be expanded and I will try to +mimic some of the experiments done in the literature to see whether it performs +similarly on Death Metal. +\begin{table}[ht] + \centering + \begin{tabular}{cll} + \toprule + Month & Description\\ + \midrule + March + & Preparing the data\\ + & Preparing an experiment platform\\ + & Literature research\\ + April + & Running the experiments\\ + & Fiddle with parameters\\ + & Explore the possibilities for forced alignment\\ + May + & Write up the thesis\\ + & Possibly do forced alignment\\ + June + & Finish up thesis\\ + & Wrap up\\ + \bottomrule + \end{tabular} + \caption{Outline} +\end{table} + +\section{Results} + + \chapter{Conclusion \& Discussion} %Discussion section %Conclusion section