From: Mart Lubbers Date: Thu, 20 Oct 2016 10:43:07 +0000 (+0200) Subject: inital version exma X-Git-Url: https://git.martlubbers.net/?a=commitdiff_plain;h=5b1558b90593b0f55805ca7e1c30e3ca43452fce;p=itlast1617.git inital version exma --- diff --git a/exam/Makefile b/exam/Makefile new file mode 100644 index 0000000..b74a345 --- /dev/null +++ b/exam/Makefile @@ -0,0 +1,12 @@ +DOCUMENTS:=exam +PDFLATEXFLAGS:=-halt-on-error +PDFLATEX:=pdflatex $(PDFLATEXFLAGS) + +all: $(addsuffix .pdf,$(DOCUMENTS)) + +%.pdf: %.tex q1.tex q2.tex q3.tex + $(PDFLATEX) $< + $(PDFLATEX) $< + +clean: ## Remove the .pdf/.log/.aux files of every document listed in DOCUMENTS + $(RM) -v $(foreach doc,$(DOCUMENTS),$(addprefix $(doc),.pdf .log .aux)) diff --git a/exam/exam.tex b/exam/exam.tex new file mode 100644 index 0000000..26a6d74 --- /dev/null +++ b/exam/exam.tex @@ -0,0 +1,29 @@ +\documentclass[titlepage,a4paper]{article} + +\usepackage{rutitlepage} +\usepackage{geometry} +\usepackage{enumitem} +\usepackage{listings} + +\title{Midterm Exam} +\author{Mart Lubbers\\s4109503} +\date{\today} + +\begin{document} +\maketitleru[% + course={Introduction to Language and Speech Technology}, + authorstext={Author:}] +\begin{enumerate} + % Question 1 + \item\input{q1.tex} + + \newpage + % Question 2 + \item\input{q2.tex} + + \newpage + % Question 3 + \item\input{q3.tex} +\end{enumerate} + +\end{document} diff --git a/exam/q1.tex b/exam/q1.tex new file mode 100644 index 0000000..a79e17f --- /dev/null +++ b/exam/q1.tex @@ -0,0 +1,67 @@ +\begin{enumerate} + % Question 1a + \item Disfluencies are annotated by surrounding them with square braces. + The first bit shows the \emph{reparandum}, the second bit denoted with + the \texttt{+} shows the \emph{editing phase} and the last bit shows + the \emph{repair}. We want to only keep the repair since that depicts + the correct, meant by the speaker, speech. + + \verb#s/\[.*?\+\{.*?\}(.*?)\]/\1/g# + + Bit by bit: + \begin{itemize} + \item \verb#s/# Substitution. + \item \verb#\[# Matches the opening square bracket. 
We escape this + because \verb#[# is a regular expression control character and + we want to match a literal. + \item \verb#.*?\+# Matches non-greedily everything up to the plus + mark. Thus the \emph{reparandum}. Note that the + \emph{reparandum} can be empty (in case the speaker immediately + starts editing). We escape the \verb#+# for the same reason as + the previous segment. + \item \verb#\{.*?\}# Matches everything between the curly braces. + Thus the \emph{editing phase}. Note again that this match can + only contain empty curly braces since the \emph{editing phase} + can be empty. + \item \verb#(.*?)# Matches non-greedily everything up to the + closing square bracket and captures it in the group. Thus the + \emph{repair}. Note that we do not require this group to be the + exact same as the \emph{reparandum}. + \item \verb#\]/# Matches the closing square bracket and we proceed + to the replacement. We escape this for the same reason as + before. + \item \verb#\1/g# We replace the entire match with only the + captured \emph{repair} group and do this globally since there + can be multiple repairs in an utterance. + \end{itemize} + + % Question 1b + \item \textsc{MEMM}s use features to add extra information to words. + \textsc{IOB} tagging is a partial parsing or chunking method that only + discriminates between \emph{Beginning} (\texttt{B}), \emph{Internal} + (\texttt{I}) and \emph{Outside} (\texttt{O}) categories. + + Say we use the same segmentation as before, we should mark the + \emph{reparandum} and \emph{editing phase} as \emph{Outside} + (\texttt{O}) parts and the repair should be parsed as usual. Note that + a chunk then can include \texttt{O} marked segments. For example in ``a + car uh plane'' the ``car uh'' part will be tagged as \texttt{O}, ``a'' + as \texttt{B\_NP} and ``plane'' as \texttt{I\_NP}. + + For the algorithms it might be necessary to add a different tag to + denote internal \texttt{O} segments. 
This can be done by adding a + suffix to the \texttt{O} tag. In the previous example the text will + then be chunked as: \texttt{B\_NP O\_NP I\_NP}. + + Concerning the \textsc{MEMM} features, obviously editing phase segments + should be marked as such but also the reparandum should be tagged as + such to not confuse it with a regular segment. + + % Question 1c + \item Repairs are only noticed when you can look ahead to the \emph{editing + phase} markers. It might be necessary to either look ahead a little bit + or to work outwards from the identified \emph{editing phase}. + Right-to-left has the same problem as left-to-right in the sense that + it will see the repair first and also has to look ahead to know whether + it is part of a repair. +\end{enumerate} diff --git a/exam/q2.tex b/exam/q2.tex new file mode 100644 index 0000000..ecd1781 --- /dev/null +++ b/exam/q2.tex @@ -0,0 +1,47 @@ +\begin{enumerate} + % Question 2a + \item This can be achieved by adding disfluency rules to the \textsc{CFG}. + This has to be done for all rules that can possibly produce + disfluencies. Most likely only the lowest level of rules (unit + productions) needs such disfluency structures. For example, if we would + do it for the rule that transforms a \texttt{Noun} into a word it would + look like this: + + \begin{lstlisting} +Noun -> TrueNoun | EditNoun TrueNoun +TrueNoun -> flight | ... + +EditNoun -> TrueNoun EditWord +EditWord -> uh | ... + \end{lstlisting} + + With feature structures this can be generalized and have less + ambiguity. Features can for example force the \emph{Reparandum} to be + of the same \texttt{CAT} as the \emph{Repair} and disfluencies might + have some constraints that can also be expressed with features. + + % Question 2b + \item Standard \textsc{CKY} parsing only works for grammars in + \emph{Chomsky Normal Form} (\textsc{CNF}). 
This means that the tree + returned will not exactly represent the \textsc{CFG} since it possibly + had to be converted to \textsc{CNF}. To adapt \textsc{CKY} in a + fundamental way so that it correctly parses repair structures would be + very difficult, albeit not impossible. It basically means that, in the + deepest loop, you have to build in functionality that is similar to the + grammar that recognizes such structures and behave accordingly. While + this is probably theoretically possible, it will result in a different + algorithm that has a hard-coded sub-grammar in itself. + + % Question 2c + \item Similar to the previous sub-question; while it is possible to make the + \emph{Predictor} smarter and add disfluency structures to the chart + it would change the \emph{Earley} algorithm significantly. The change + of the algorithm would also be very specific to certain disfluency + structures and makes it possibly unusable for languages that do not + have such structures. Note that it is easier to add this to an + \emph{Earley} parser compared to adding it to a \emph{CKY} parser. For + an \emph{Earley} parser it just means hard-coding some extra grammar + rules in the \emph{Predictor}. For \emph{CKY} it means transforming + the rules to specific transformations in the table which might not be + trivial. +\end{enumerate} diff --git a/exam/q3.tex b/exam/q3.tex new file mode 100644 index 0000000..b3061a3 --- /dev/null +++ b/exam/q3.tex @@ -0,0 +1,61 @@ +\begin{enumerate} + % Question 3a + \item In an \emph{ASR} system we can expect problems in several phases. + + The first phase of an \emph{ASR} is just extracting the features. We + do not expect problems there since it will just produce slightly + different features for some part but that is not something the feature + extraction cares about. It just objectively has to extract features and + since it is still human speech, there is no problem with disfluencies. 
+ + When trying to transform the cepstral features into a sentence several + components are involved. First a phone likelihood is calculated; we + may expect slight problems here since even the phones might be + reduced and the \emph{editing phase} words could just be rudimentary + sounds instead of phones and thus it might select suboptimal + likelihoods. + + When decoding the phone likelihood into words a lexicon is used. This + lexicon might not contain the edit words and possibly also not the + reduced \emph{reparanda}. + + Finally during \emph{Viterbi} \emph{N-Gram} models come into play and + if they are not extracted from a dataset that also included + disfluencies it might be the case that the probabilities of + disfluencies appearing are so low that it tries to fit similarly + sounding real words instead of the disfluency. + + % Question 3b + \item Solving the problem of phone likelihood computation can be done by + shrinking the window of the feature extraction so that strongly reduced + phones are also correctly recognized. + + Solving the second problem can be done by adding disfluency words to + the lexicon and also more reduced pronunciations of words. + + Lastly we can increase the decoding performance by specifically + extracting the \emph{N-Gram} probabilities from data that also contains + disfluencies. + + % Question 3c + \item To add disfluencies to speech synthesis one must know how they arise. + There are some word categories that have more disfluencies than others. + Also they may be produced to give the speaker some more time to think about + the rest of the sentence. When you know such properties of disfluencies + you can model them in the speech synthesis in the normalization phase. + + In the normalization phase the system can add disfluencies at sections + that often produce them. This most likely is the most effective in + tokenisation. Specific tokens can be selected to be expanded to a + disfluency. 
+ + Later on in the pipeline the system must also be adapted. Namely in the + waveform synthesis. Depending on the technique applied some + improvements can be done. When the synthesis technique is unit + selection it might be helpful to have units for common disfluencies and + at least units for \emph{editing phase} words. It might also be helpful + to add units that represent a reduced pronunciation to be used in the + \emph{reparandum}. When \emph{diphone synthesis} is used there do not + have to be big changes to be applied since most likely the diphone + combinations already exist in the database. +\end{enumerate} diff --git a/exam/rutitlepage.sty b/exam/rutitlepage.sty new file mode 100644 index 0000000..ab0afc5 --- /dev/null +++ b/exam/rutitlepage.sty @@ -0,0 +1,58 @@ +% Radboud University Nijmegen titlepage +% Author: Mart Lubbers +% Date: 2016-06-28 +% Usage: \maketitleru[key=value,...] with the keys course, institute, authorstext, righttextheader and righttext; the page also prints \@title, \@author and \@date and includes logo.pdf (PDF mode) or logo.eps. +% TODO +% - Internationalize (Dutch logos) +% - Nice document +% - Make CTAN ready +\RequirePackage{graphicx,ifpdf,keyval} + +\makeatletter +\define@key{maketitleru}{course}{\def\@rutitlecourse{#1}} +\define@key{maketitleru}{institute}{\def\@rutitleinst{#1}} +\define@key{maketitleru}{authorstext}{\def\@rutitleauthorstext{#1}} +\define@key{maketitleru}{righttext}{\def\@rutitlerighttext{#1}} +\define@key{maketitleru}{righttextheader}{\def\@rutitlerighttextheader{#1}} +\setkeys{maketitleru}{% + course={}, + institute={Radboud University Nijmegen}, + authorstext={Authors:}, + righttextheader={}, + righttext={} +} +\newcommand{\maketitleru}[1][]{ + \setkeys{maketitleru}{#1} + \begin{titlepage} + \makeatletter + \begin{center} + \textsc{\LARGE\@rutitlecourse}\\[1.5cm] + \ifpdf\includegraphics[height=150pt]{logo.pdf}\\ + \else\includegraphics[height=150pt]{logo.eps}\\ + \fi + \vspace{0.4cm} + \textsc{\Large\@rutitleinst}\\[1cm] + \hrule + \vspace{0.4cm} + \textbf{\large\@title}\\[0.4cm] + \hrule + \vspace{2cm} + \begin{minipage}[t]{0.45\textwidth} + \begin{flushleft}\large + \textit{\@rutitleauthorstext}\\ + 
\@author{} + \end{flushleft} + \end{minipage} + \begin{minipage}[t]{0.45\textwidth} + \begin{flushright}\large + \textit{\@rutitlerighttextheader}\\ + \@rutitlerighttext + \end{flushright} + \end{minipage} + \vfill + {\large\@date} + \end{center} + \makeatother + \end{titlepage} +} +\makeatother