From da9d9fb2f16d64d20691dbb8bdec6c1047ca3912 Mon Sep 17 00:00:00 2001
From: Mart Lubbers <mart@martlubbers.net>
Date: Mon, 2 Feb 2015 21:21:55 +0100
Subject: [PATCH] update

---
 thesis2/3.methods.tex | 131 +++++++++++++++++++++++-------------------
 1 file changed, 72 insertions(+), 59 deletions(-)

diff --git a/thesis2/3.methods.tex b/thesis2/3.methods.tex
index 3a6b28b..17a180d 100644
--- a/thesis2/3.methods.tex
+++ b/thesis2/3.methods.tex
@@ -70,52 +70,65 @@ character and we mark the last node as final. From then on all words are added
 using a stepwise approach.
 \begin{itemize}
 	\item 
-
-
-
-Incrementally node-lists are added to create a graph. For example in
-{Subgraphs in Figure}~\ref{dawg1} visualizes the construction of the
-DAWG from the entries: \texttt{aibc}, \texttt{aibc} and \texttt{ajbd}.
+		Step one is finding the common prefix of the word in the graph and we chop
+		it of the word. When the suffix that remains is empty and the last state of
+		the common prefix is a final state we are done adding the word. When the
+		suffix is non empty or the common prefix does not end in a final state we
+		apply step two.
+	\item
+		We find the first confluence state in the common prefix and from there on
+		we add the suffix. When there is a confluence state the route all the way
+		back to the starting node will be cloned to make sure no extra words are
+		added to the graph unwillingly.
+	\item
+		Finally if in the suffix propagating from the end to the beginning there
+		are equal states we merge the state from the suffix to the equal state in
+		the graph.
+\end{itemize}
 
 Pseudocode for the algorithm can be found in Listing~\ref{pseudodawg}.
 
-In SG0 the graph is only the null graph, described by
-$G=(\{q_0\},\{q_0\},\{\}\{\})$ and does not contain any entries.
-
-Adding the first entry \texttt{a,bc} is not a
-hard task because there is no common prefix nor suffix so the entry becomes a
-strain of nodes starting from $q_0$ and is visualized in subgraph SG1.
-
-For the second entry \texttt{a.bc} some smart optimization has to be applied.
-The common prefix is \texttt{a} and the common suffix is \texttt{bc}. Therefore
-the first node in which the common prefix starts needs to be copied and split
-off to make room for an alternative route. This is exactly what happens in SG2.
-Node $q_2$ is copied and gets a connection to node $q_3$. From the last node of
-the common prefix a route is built towards the first node, that is the copied
-node, of the common suffix resulting in an extra path $q_1\rightarrow
-q_5\rightarrow q_3$. This results in subgraph SG2.
-
-Adding the thing node \texttt{a.bd} in the same naive way as the second node
-results in subgraph SG3 and introduces a bad side effect. Namely that a new
-word, that was not specifically added was introduced, is added to the DAWG.
-This is because within the common prefix there is a node that has some other
-arrow leading to it. When a new path is added the algorithm checks the common
-prefix and suffix for confluence nodes. Confluence nodes are nodes that have
-multiple arrows leading in and because of the multiple arrows they can lead to
-unwanted additional words. The first confluence node found must be copied and
-detached and the specific suffix from the entry must be copied with it to
-separate the path from the existing paths. Applying this technique in the
-example leads to subgraph SG4. With common prefix \texttt{a.b} and an empty
-common suffix node $q_3$ is found as a confluence node in the common prefix and
-therefore node $q_3$ is copied to the new node $q_6$ taking the paths leading
-to the final state with it. In this way no new words are added to the DAWG and
-the DAWG is still optimal.
-
+\subsection{Example}
+We visualize this with an example shown in the {Subgraphs in
+Figure}~\ref{dawg1} that builds a DAWG with the following entries:
+\texttt{abcd}, \texttt{aecd} and \texttt{aecf}. In SG0 we begin with the null
+graph.
+
+Adding the first entry \texttt{abcd} is trivial and just creates a single path
+and does not require any hard work. The result of adding the first word is
+visible in subgraph SG1.
+
+For the second entry \texttt{aecd} we will have to follow the steps described
+earlier. The common prefix is \texttt{a} and the common suffix is \texttt{bc}.
+Therefore the first node in which the common prefix starts needs to be copied
+and split off to make room for an alternative route. This is exactly what
+happens in SG2. Node $q_2$ is copied and gets a connection to node $q_3$. From
+the last node of the common prefix a route is built towards the first node,
+that is the copied node, of the common suffix resulting in an extra path
+$q_1\rightarrow q_5\rightarrow q_3$. Since there are no confluence nodes we can
+leave the prefix as it is. This results in subgraph SG2.
+
+The last entry to add is the word \texttt{aecf}. When this is done without
+confluence node checking we create extra paths in the graph that are unwanted.
+The common prefix is \texttt{aec} and the common suffix is \texttt{f}.
+\texttt{f} can not be merged into existing edges and thus a new edge will be
+created from the end of the common suffix to the end. This results in subgraph
+SG3. Note that we did not take care of confluence nodes in the common prefix
+and because of that the word \texttt{abcf} is also accepted even when it was
+not added to the graph. This is unwanted behaviour. Going back to the common
+prefix \texttt{aec}. When we check for confluence nodes we see that the last
+node of the common prefix is such a node and that we need to clone the path
+and separate the route to the final node. $q_3$ will be cloned into $q_6$ with
+the route of the common prefix. Tracking the route back we do not encounter any
+other confluence states and therefore we can merge in the suffix which results
+in the final DAWG visible in subgraph SG4. No new words are added.
+
+\texttt{abcd}, \texttt{aecd} and \texttt{aecf}. In SG0 we begin with the null
 \begin{figure}[H]
 	\caption{Step 1}
 	\label{dawg1}
 	\centering
-	\digraph[width=\linewidth]{inccons}{
+	\digraph[scale=0.4]{inccons}{
 		rankdir=LR;
 		n4 [style=invis];
 		q40 [label="q0"];
@@ -127,13 +140,13 @@ the DAWG is still optimal.
 		q46 [label="q6"];
 		n4 -> q40[label="SG4"];
 		q40 -> q41[label="a"];
-		q41 -> q42[label="i"];
-		q42 -> q43[label="b"];
-		q43 -> q44[label="c"];
-		q41 -> q45[label="j"];
-		q45 -> q46[label="b"];
+		q41 -> q42[label="b"];
+		q42 -> q43[label="c"];
+		q43 -> q44[label="d"];
+		q41 -> q45[label="e"];
+		q45 -> q46[label="c"];
 		q46 -> q44[label="d"];
-		q46 -> q44[label="c"];
+		q46 -> q44[label="f"];
 
 		n3 [style=invis];
 		q30 [label="q0"];
@@ -144,12 +157,12 @@ the DAWG is still optimal.
 		q35 [label="q5"];
 		n3 -> q30[label="SG3"];
 		q30 -> q31[label="a"];
-		q31 -> q32[label="i"];
-		q32 -> q33[label="b"];
-		q33 -> q34[label="c"];
-		q33 -> q34[label="d"iconstraint=false];
-		q31 -> q35[label="j"];
-		q35 -> q33[label="b"];
+		q31 -> q32[label="b"];
+		q32 -> q33[label="c"];
+		q33 -> q34[label="d"];
+		q33 -> q34[label="f"iconstraint=false];
+		q31 -> q35[label="e"];
+		q35 -> q33[label="c"];
 
 		n2 [style=invis];
 		q20 [label="q0"];
@@ -160,11 +173,11 @@ the DAWG is still optimal.
 		q25 [label="q5"];
 		n2 -> q20[label="SG2"];
 		q20 -> q21[label="a"];
-		q21 -> q22[label="i"];
-		q22 -> q23[label="b"];
-		q23 -> q24[label="c"];
-		q21 -> q25[label="j"];
-		q25 -> q23[label="b"];
+		q21 -> q22[label="b"];
+		q22 -> q23[label="c"];
+		q23 -> q24[label="d"];
+		q21 -> q25[label="e"];
+		q25 -> q23[label="c"];
 
 		n1 [style=invis];
 		q10 [label="q0"];
@@ -174,9 +187,9 @@ the DAWG is still optimal.
 		q14 [label="q4"ishape=doublecircle];
 		n1 -> q10[label="SG1"];
 		q10 -> q11[label="a"];
-		q11 -> q12[label="i"];
-		q12 -> q13[label="b"];
-		q13 -> q14[label="c"];
+		q11 -> q12[label="b"];
+		q12 -> q13[label="c"];
+		q13 -> q14[label="d"];
 
 		n [style=invis];
 		q0 [label="q0"];
-- 
2.20.1