From 3eef972bfeccfa8ab537fedc904322afbef1d634 Mon Sep 17 00:00:00 2001 From: Mart Lubbers Date: Sat, 6 May 2017 13:53:26 +0200 Subject: [PATCH] up --- asr.tex | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/asr.tex b/asr.tex index 0aa6d0a..3afed4f 100644 --- a/asr.tex +++ b/asr.tex @@ -16,6 +16,11 @@ \newglossaryentry{dom}{name={Doom Metal}, description={is an extreme heavy metal music style with growling vocals and pounding drums played very slowly}} +\newglossaryentry{FT}{name={Fourier Transform}, + description={is a technique of converting the time representation of a signal to a + frequency representation}} +\newglossaryentry{MS}{name={Mel-Scale}, + description={is a scale for spectral signals inspired by the human ear}} \begin{document} \frontmatter{} @@ -189,12 +194,28 @@ uses piano's and synthesizers. The droning synthesizers often operate in the same frequency as the vocals. \section{\gls{MFCC} Features} -The waveforms are converted to \glspl{MFCC} feature vectors using the -\emph{python\_speech\_features}% +The waveforms themselves are not very suitable to be used as features due to their +high dimensionality and correlation. Therefore, we use the commonly used +\glspl{MFCC} feature vectors.\todo{cite which papers use this} The actual +conversion is done using the \emph{python\_speech\_features}% \footnote{\url{https://github.com/jameslyons/python_speech_features}} package. -All these steps combined results in thirteen tab separated features per line in -a file for every source file. Technical info about the processing steps is -given in the following sections. + +\gls{MFCC} features are nature-inspired and built incrementally in several +steps. +\begin{enumerate} + \item The first step in the process is converting the time representation + of the signal to a spectral representation using a sliding window with + overlap. The width of the window and the step size are two important + parameters in the system. 
In classical phonetic analysis, window sizes + of 25\,ms with a step of 10\,ms are often chosen because they are small + enough to contain only subphone entities. Singing for 25\,ms is + impossible, so it is arguable that the window size is very small. + \item The standard \gls{FT} gives a spectral representation that has + linearly scaled frequencies. This scale is converted to the \gls{MS} + using triangular overlapping windows. + \item +\end{enumerate} + \todo{Explain why MFCC and which parameters} -- 2.20.1