add experiment
authorMart Lubbers <mart@martlubbers.net>
Tue, 21 Mar 2017 21:17:09 +0000 (22:17 +0100)
committerMart Lubbers <mart@martlubbers.net>
Tue, 21 Mar 2017 21:17:09 +0000 (22:17 +0100)
Makefile
experiment.pre [new file with mode: 0644]
experiment.tex [new file with mode: 0644]
img/cement.png [new file with mode: 0644]

index ea2399a..f40e93c 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-DOCS:=asr proposal
+DOCS:=asr proposal experiment
 GREP?=grep
 LATEX?=pdflatex
 BIBTEX?=bibtex
diff --git a/experiment.pre b/experiment.pre
new file mode 100644 (file)
index 0000000..c25962c
--- /dev/null
@@ -0,0 +1,23 @@
+\documentclass[a4paper]{article}
+
+\usepackage[british]{babel}
+
+\usepackage{geometry}                % Papersize
+\usepackage{hyperref}                % Hyperlinks
+\usepackage{graphicx}                % Images
+\graphicspath{{img/}}
+\urlstyle{same}
+\hypersetup{%
+       pdftitle={},
+       pdfauthor={Mart Lubbers},
+       pdfsubject={},
+       pdfcreator={Mart Lubbers},
+       pdfproducer={Mart Lubbers},
+       pdfkeywords={},
+       hidelinks=true
+}
+
+\title{(Automatic) Speech Recognition\\{\large Experiment setup}}
+\author{Mart Lubbers\\
+       {\small\href{mailto:mart@martlubbers.net}{mart@martlubbers.net}}}
+\date{\today}
diff --git a/experiment.tex b/experiment.tex
new file mode 100644 (file)
index 0000000..328c089
--- /dev/null
@@ -0,0 +1,41 @@
+%&experiment
+\begin{document}
+\maketitle
+
+\section{Setup}
+At the moment a minimal framework for running experiments has been set up and
+is running.
+
+As of now a full album of the death metal band \emph{Cannibal Corpse} has been
+annotated. Figure~\ref{fig:bloodstained} shows a segment of the song
+\emph{Bloodstained Cement}. From the spectrogram it is clearly visible that
+during growling the regions around $100$\,Hz have an increased intensity.
+
+\begin{itemize}
+	\item Sox\footnote{\url{https://sox.sourceforge.net}} is used to convert
+		the stereo CD audio to mono $44.1$\,kHz waveforms.
+	\item Using the \texttt{python\_speech\_features} library%
+		\footnote{\url{https://github.com/jameslyons/python_speech_features}}
+		the waveforms are converted to $13$ MFCC cepstral coefficients with the
+		default $25$\,ms window every $10$\,ms.
+       \item The data is matched with the annotated files using
+		\texttt{pympi}\footnote{\url{https://github.com/dopefishh/pympi}}.
+	\item The framework Keras\footnote{\url{https://keras.io}} is used to
+		train models and classify the data.
+\end{itemize}
+
+\section{Preliminary results}
+The simplest models with only one hidden layer already score around $85\%$
+accuracy. In the coming weeks more data from different bands will be annotated
+to assess the robustness of the models. Moreover, smoothing needs to be applied
+because the predictions are very noisy. This is probably due to pauses in
+growling. This can easily be smoothed out by not allowing extremely short
+growling segments.
+
+\begin{figure}[htbp]
+       \centering
+       \includegraphics[width=.7\linewidth]{cement}
+       \caption{A vocal segment of the \emph{Cannibal Corpse} song
+               \emph{Bloodstained Cement}}\label{fig:bloodstained}
+\end{figure}
+\end{document}
diff --git a/img/cement.png b/img/cement.png
new file mode 100644 (file)
index 0000000..576e7df
Binary files /dev/null and b/img/cement.png differ