mfcc

[asr1617.git] / asr.tex
diff --git a/asr.tex b/asr.tex

index 0aa6d0a..3f5eaaf 100644 (file)
--- a/asr.tex
+++ b/asr.tex
@@ -16,6 +16,11 @@
  \newglossaryentry{dom}{name={Doom Metal},
         description={is an extreme heavy metal music style with growling vocals and
         pounding drums played very slowly}}
  \newglossaryentry{dom}{name={Doom Metal},
         description={is an extreme heavy metal music style with growling vocals and
         pounding drums played very slowly}}
+\newglossaryentry{FT}{name={Fourier Transform},
+       description={is a technique for converting a signal from a time
+       representation to a frequency representation}}
+\newglossaryentry{MS}{name={Mel-Scale},
+       description={is a scale for spectral signals inspired by the human ear}}
  
  \begin{document}
  \frontmatter{}
  
  \begin{document}
  \frontmatter{}
@@ -189,12 +194,28 @@ uses piano's and synthesizers. The droning synthesizers often operate in the
  same frequency as the vocals.
  
  \section{\gls{MFCC} Features}
  same frequency as the vocals.
  
  \section{\gls{MFCC} Features}
-The waveforms are converted to \glspl{MFCC} feature vectors using the
-\emph{python\_speech\_features}%
+The waveforms themselves are not very suitable to be used as features due to
+their high dimensionality and correlation. Therefore we use the often used
+\glspl{MFCC} feature vectors.\todo{cite which papers use this} The actual
+conversion is done using the \emph{python\_speech\_features}%
  \footnote{\url{https://github.com/jameslyons/python_speech_features}} package.
  \footnote{\url{https://github.com/jameslyons/python_speech_features}} package.
-All these steps combined results in thirteen tab separated features per line in
-a file for every source file. Technical info about the processing steps is
-given in the following sections. 
+
+\gls{MFCC} features are nature inspired and built incrementally in several
+steps.
+\begin{enumerate}
+       \item The first step in the process is converting the time representation
+               of the signal to a spectral representation using a sliding window with
+               overlap. The width of the window and the step size are two important
+               parameters in the system. In classical phonetic analysis window sizes
+               of 25\,ms with a step of 10\,ms are often chosen because they are
+               small enough to only contain subphone entities. Singing for 25\,ms is
+               impossible, so it is arguable that the window size is very small.
+       \item The standard \gls{FT} gives a spectral representation that has
+               linearly scaled frequencies. This scale is converted to the \gls{MS}
+               using triangular overlapping windows.
+       \item
+\end{enumerate}
+
  
  \todo{Explain why MFCC and which parameters}
  
  
  \todo{Explain why MFCC and which parameters}
  
@@ -209,7 +230,11 @@ given in the following sections.
  
  
  \chapter{Conclusion \& Discussion}
  
  
  \chapter{Conclusion \& Discussion}
+\section{Conclusion}
  %Discussion section
  %Discussion section
+
+\section{Discussion}
+
  \todo{Novelty}
  \todo{Weaknesses}
  \todo{Dataset is not very varied but\ldots}
  \todo{Novelty}
  \todo{Weaknesses}
  \todo{Dataset is not very varied but\ldots}