From 3eef972bfeccfa8ab537fedc904322afbef1d634 Mon Sep 17 00:00:00 2001
From: Mart Lubbers <mart@martlubbers.net>
Date: Sat, 6 May 2017 13:53:26 +0200
Subject: [PATCH] up

---
 asr.tex | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/asr.tex b/asr.tex
index 0aa6d0a..3afed4f 100644
--- a/asr.tex
+++ b/asr.tex
@@ -16,6 +16,11 @@
 \newglossaryentry{dom}{name={Doom Metal},
 	description={is an extreme heavy metal music style with growling vocals and
 	pounding drums played very slowly}}
+\newglossaryentry{FT}{name={Fourier Transform},
+	description={is a technique for converting the time representation of a
+	signal to a frequency representation}}
+\newglossaryentry{MS}{name={Mel-Scale},
+	description={is a human-ear-inspired scale for spectral signals}}
 
 \begin{document}
 \frontmatter{}
@@ -189,12 +194,28 @@ uses piano's and synthesizers. The droning synthesizers often operate in the
 same frequency as the vocals.
 
 \section{\gls{MFCC} Features}
-The waveforms are converted to \glspl{MFCC} feature vectors using the
-\emph{python\_speech\_features}%
+The waveforms themselves are not very suitable to be used as features due to
+their high dimensionality and correlation. Therefore we use the often used
+\glspl{MFCC} feature vectors.\todo{cite which papers use this} The actual
+conversion is done using the \emph{python\_speech\_features}%
 \footnote{\url{https://github.com/jameslyons/python_speech_features}} package.
-All these steps combined results in thirteen tab separated features per line in
-a file for every source file. Technical info about the processing steps is
-given in the following sections. 
+
+\gls{MFCC} features are nature inspired and built incrementally in several
+steps.
+\begin{enumerate}
+	\item The first step in the process is converting the time representation
+		of the signal to a spectral representation using a sliding window with
+		overlap. The width of the window and the step size are two important
+		parameters in the system. In classical phonetic analysis window sizes
+		of 25\,ms with a step of 10\,ms are often chosen because they are small
+		enough to only contain subphone entities. Singing for 25\,ms is
+		impossible, so it is arguable that the window size is very small.
+	\item The standard \gls{FT} gives a spectral representation that has
+		linearly scaled frequencies. This scale is converted to the \gls{MS}
+		using triangular overlapping windows.
+	\item
+\end{enumerate}
+
 
 \todo{Explain why MFCC and which parameters}
 
-- 
2.20.1