From ffa8517ae9d919b4da3ebeace34bc7897b56142b Mon Sep 17 00:00:00 2001
From: Mart Lubbers
Date: Tue, 25 Apr 2017 20:32:18 +0200
Subject: [PATCH] up

---
 Makefile | 10 +++++-----
 asr.tex  | 27 +++++++++++++++------------
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/Makefile b/Makefile
index f368d8e..bb74780 100644
--- a/Makefile
+++ b/Makefile
@@ -19,11 +19,11 @@ all: $(addsuffix .pdf,$(DOCS))
 	$(LATEX) $(LATEXFLAGS) $<
 	if $(GREP) -q '^\\bibdata{' $(basename $<).aux; then $(BIBTEX) $(BIBTEXFLAGS) $(basename $<); fi
 	if $(GREP) -q '\@istfilename' $(basename $<).aux; then $(MAKEGLOSSARIES) $(MAKEGLOSSARIESFLAGSFLAGS) $(basename $<); fi
-	$(LATEX) $(LATEXFLAGS) $< | pee cat "$(GREP) -iFq 'Rerun'"\
-		&& $(LATEX) $(LATEXFLAGS) $< | pee cat "$(GREP) -iFq 'Rerun'"\
-		&& $(LATEX) $(LATEXFLAGS) $< | pee cat "$(GREP) -iFq 'Rerun'"\
-		&& $(LATEX) $(LATEXFLAGS) $< | pee cat "$(GREP) -iFq 'Rerun'"\
-		|| true
+	$(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog
+	$(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true
+	$(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true
+	$(GREP) -iFq 'Rerun' $(basename $@).mlog && $(LATEX) $(LATEXFLAGS) $< | tee $(basename $@).mlog || true
+	$(RM) $(basename $@).mlog

 clean: $(addprefix clean-,$(DOCS))

diff --git a/asr.tex b/asr.tex
index 40c6e52..f149b1d 100644
--- a/asr.tex
+++ b/asr.tex
@@ -1,6 +1,6 @@
 %&asr
 \usepackage[nonumberlist,acronyms]{glossaries}
-\makeglossaries%
+%\makeglossaries%
 \newacronym{ANN}{ANN}{Artificial Neural Network}
 \newacronym{HMM}{HMM}{Hidden Markov Model}
 \newacronym{GMM}{GMM}{Gaussian Mixture Models}
@@ -59,7 +59,7 @@ physical sale and the remaining $16\%$ is made through performance and
 synchronisation revenieus. The overtake of digital formats on physical formats
 took place somewhere in 2015. Moreover, ever since twenty years the music
 industry has seen significant growth
-again~\footnote{\url{http://www.ifpi.org/facts-and-stats.php}}.
+again\footnote{\url{http://www.ifpi.org/facts-and-stats.php}}.

 There has always been an interest in lyrics to music alignment to be used in
 for example karaoke. As early as in the late 1980s karaoke machines were
@@ -68,7 +68,7 @@ available, a alignment is not and it involves manual labour to create such an
 alignment.

 A lot of this musical distribution goes via non-official channels such as
-YouTube~\footnote{\url{https://youtube.com}} in which fans of the performers
+YouTube\footnote{\url{https://youtube.com}} in which fans of the performers
 often accompany the music with synchronized lyrics. This means that there is an
 enormous treasure of lyrics-annotated music available but not within our reach
 since the subtitles are almost always hardcoded into the video stream and thus
@@ -83,9 +83,9 @@ or growling. Growling is heavily used in extreme metal genres such as \gls{dm}
 but it must be noted that grunting is not a technique only used in extreme
 metal styles. Similar or equal techniques have been used in \emph{Beijing
 opera}, Japanese \emph{Noh} and but also more western styles like jazz singing
-by Louis Armstrong~\cite{sakakibara_growl_2004}. It might even be traced back
+by Louis Armstrong\cite{sakakibara_growl_2004}. It might even be traced back
 to viking times. For example, an arab merchant visiting a village in Denmark
-wrote in the tenth century~\cite{friis_vikings_2004}:
+wrote in the tenth century\cite{friis_vikings_2004}:

 \begin{displayquote}
 	Never before I have heard uglier songs than those of the Vikings in
@@ -98,7 +98,7 @@ wrote in the tenth century~\cite{friis_vikings_2004}:
 %Literature overview / related work
 \section{Related work}
 The field of applying standard speech processing techniques on music started in
-the late 90s~\cite{saunders_real-time_1996,scheirer_construction_1997} and it
+the late 90s\cite{saunders_real-time_1996,scheirer_construction_1997} and it
 was found that music has different discriminating features compared to normal
 speech.

@@ -140,18 +140,18 @@ research question:
 To run the experiments data has been collected from several \gls{dm} albums.
 The exact data used is available in Appendix~\ref{app:data}. The albums are
 extracted from the audio CD and converted to a mono channel waveform with the
-correct samplerate \emph{SoX}~\footnote{\url{http://sox.sourceforge.net/}}.
+correct samplerate \emph{SoX}\footnote{\url{http://sox.sourceforge.net/}}.
 When the waveforms are finished they are converted to \glspl{MFCC} vectors
 using the \emph{python\_speech\_features}%
-~\footnote{\url{https://github.com/jameslyons/python_speech_features}} package.
+\footnote{\url{https://github.com/jameslyons/python_speech_features}} package.
 All these steps combined results in thirteen tab separated features per line
 in a file for every source file. Technical info about the processing steps is
 given in the following sections. Every file is annotated using
-Praat~\cite{boersma_praat_2002} where the utterances are manually aligned to
+Praat\cite{boersma_praat_2002} where the utterances are manually aligned to
 the audio. Examples of utterances are shown in
-Figures~\ref{fig:bloodstained,fig:abominations} where the waveform, $1-8000$Hz
-spectrals and annotations are shown. It is clearly visible that
-within the genre of death metal there are a different spectral patterns
+Figure~\ref{fig:bloodstained} and Figure~\ref{fig:abominations} where the
+waveform, $1-8000$Hz spectrals and annotations are shown. It is clearly visible
+that within the genre of death metal there are different spectral patterns
 visible.

 \begin{figure}[ht]
@@ -211,6 +211,9 @@ similar on Death Metal
 \caption{Outline}
 \end{table}

+\section{Features}
+
+
 \todo{Explain why MFCC and which parameters}
 \todo{Spectrals might be enough, no decorrelation}

-- 
2.20.1
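
For reference, and not as part of the patch above: the asr.tex hunk describing the data preparation says that SoX-produced mono waveforms are converted to thirteen tab-separated MFCC values per line with the python_speech_features package. A minimal sketch of that step is given below; the file names and the library's default MFCC parameters are illustrative assumptions, not values taken from the thesis or this patch.

import numpy
import scipy.io.wavfile as wav
from python_speech_features import mfcc

# Read a mono waveform as produced by SoX at the desired sample rate
# (the file name is a placeholder, not taken from the thesis data).
rate, signal = wav.read('track01.wav')

# Thirteen cepstral coefficients per frame is the library default (numcep=13).
features = mfcc(signal, samplerate=rate)

# One line per frame, thirteen tab-separated values, as described in asr.tex.
numpy.savetxt('track01.feat', features, delimiter='\t')

Whether the thesis keeps the default 25 ms window and 10 ms step is not stated in the patch, so those defaults are an assumption as well.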