From fe5c9369d437d1134f128610d9bb37b4ae13a644 Mon Sep 17 00:00:00 2001 From: Mart Lubbers Date: Mon, 9 Feb 2015 22:30:40 +0100 Subject: [PATCH] version 0.9 --- program/dawg/dawg.py | 18 ++++--- program/everything/apacheconfig | 37 ++++++++++++++ thesis2/3.methods.tex | 85 +++++++++++++++++---------------- thesis2/Makefile | 2 +- thesis2/todo | 2 - 5 files changed, 93 insertions(+), 51 deletions(-) create mode 100644 program/everything/apacheconfig diff --git a/program/dawg/dawg.py b/program/dawg/dawg.py index 82a0cda..934f39b 100644 --- a/program/dawg/dawg.py +++ b/program/dawg/dawg.py @@ -64,22 +64,26 @@ class DAWG: # words = sorted(words) while words: word = words.pop() + print('adding: {}'.format(word)) common_prefix = self.common_prefix(word) print('common_prefix=', common_prefix) last_state = self.delta_star(self.start, common_prefix) print('last_state=', last_state) current_suffix = word[len(common_prefix):] print('current_suffix=', current_suffix) - suffix = self.add_suffix(last_state, current_suffix) - print('added_suffix=', suffix) - self.replace_or_register(suffix) - print('after: register: ', self.register) + if current_suffix: + suffix = self.add_suffix(last_state, current_suffix) + print('added_suffix=', suffix) + self.replace_or_register(suffix) + print('after: register: ', self.register) + else: + self.final.add(last_state) def to_dot(self, options=''): s = 'digraph {{\n{}\nn0 [style=invis]\n'.format(options) for node in self.nodes: s += '{} [shape={}circle]\n'.format( - node, 'double' if node in self.final else '') + node, 'double' if node in self.final else '') s += 'n0 -> {}\n'.format(self.start) for node, transitions in self.delta.items(): for letter, state in transitions.items(): @@ -90,6 +94,6 @@ class DAWG: if __name__ == '__main__': d = DAWG() - d.add_words_sorted(['abd', 'bad', 'bae']) -# d.add_words_sorted(['abcde', 'fghde', 'fghdghde']) +# d.add_words_sorted(['abd', 'bad', 'bae']) + d.add_words_sorted(['abcde', 'fghde', 'fghdghde']) sys.stderr.write(d.to_dot(options='rankdir=LR')) diff --git a/program/everything/apacheconfig b/program/everything/apacheconfig new file mode 100644 index 0000000..768f7f9 --- /dev/null +++ b/program/everything/apacheconfig @@ -0,0 +1,37 @@ + + ServerAdmin webmaster@localhost + + DocumentRoot /var/www + + Options FollowSymLinks + AllowOverride None + + + Options Indexes FollowSymLinks MultiViews + AllowOverride None + Order allow,deny + allow from all + + + ScriptAlias /cgi-bin/ /usr/lib/cgi-bin/ + + AllowOverride None + Options +ExecCGI -MultiViews +SymLinksIfOwnerMatch + Order allow,deny + Allow from all + + + + AddHandler mod_python .py + PythonHandler input_app + PythonDebug on + + + ErrorLog ${APACHE_LOG_DIR}/error.log + + # Possible values include: debug, info, notice, warn, error, crit, + # alert, emerg. + LogLevel warn + + CustomLog ${APACHE_LOG_DIR}/access.log combined + diff --git a/thesis2/3.methods.tex b/thesis2/3.methods.tex index c878087..bd0bd4f 100644 --- a/thesis2/3.methods.tex +++ b/thesis2/3.methods.tex @@ -79,7 +79,7 @@ of all words present in the DAWG. \subsection{Algorithm} The algorithm of building DAWGs is an iterative process that goes roughly in -three steps. We start with the null graph that can be described by +two steps. We start with the null graph that can be described by $G_0=(\{q_0\},\{q_0\},\{\}\{\})$ and does not contain any edges, one node and $\mathcal{L}(G_0)=\emptyset$. The first word that is added to the graph will be added in a naive way. We just create a new node for every transition of @@ -87,20 +87,22 @@ character and we mark the last node as final. From then on all words are added using a stepwise approach. \begin{enumerate} \item - Step one is finding the common prefix of the word in the graph and we chop - it of the word. When the suffix that remains is empty and the last state of - the common prefix is a final state we are done adding the word. When the - suffix is non empty or the common prefix does not end in a final state we - apply step two. + Say we add word $w$ to the grahp. Step one is finding the common prefix of + the word already in the graph. The common prefix is defined as the longest + subword $w'$ for which there is a $\delta^*(q_0, w')$. When the common + prefix is found we change the starting state to the last state of the + common prefix and remain with suffix $w''$ where $w=w'w''$. \item - We find the first confluence state in the common prefix and from there on - we add the suffix. When there is a confluence state the route all the way - back to the starting node will be cloned to make sure no extra words are - added to the graph unwillingly. + We add the suffix to the graph starting from the last node of the common + prefix. + \item + When there are confluence nodes present within the common prefix they are + cloned starting from the last confluence node to avoid adding unwanted + words. \item - Finally if in the suffix propagating from the end to the beginning there - are equal states we merge the state from the suffix to the equal state in - the graph. + From the last node of the suffix up to the first node we replace the nodes + by checking if there are equivalent nodes present in the graph that we can + merge with. \end{enumerate} Pseudocode for the algorithm can be found in Listing~\ref{pseudodawg}. @@ -111,34 +113,35 @@ Figure}~\ref{dawg1} that builds a DAWG with the following entries: \texttt{abcd}, \texttt{aecd} and \texttt{aecf}. In SG0 we begin with the null graph. -Adding the first entry \texttt{abcd} is trivial and just creates a single path -and does not require any hard work. The result of adding the first word is -visible in subgraph SG1. - -For the second entry \texttt{aecd} we will have to follow the steps described -earlier. The common prefix is \texttt{a} and the common suffix is \texttt{bc}. -Therefore the first node in which the common prefix starts needs to be copied -and split off to make room for an alternative route. This is exactly what -happens in SG2. Node $q_2$ is copied and gets a connection to node $q_3$. From -the last node of the common prefix a route is built towards the first node, -that is the copied node, of the common suffix resulting in an extra path -$q_1\rightarrow q_5\rightarrow q_3$. Since there are no confluence nodes we can -leave the prefix as it is. This results in subgraph SG2. - -The last entry to add is the word \texttt{aecf}. When this is done without -confluence node checking we create extra paths in the graph that are unwanted. -The common prefix is \texttt{aec} and the common suffix is \texttt{f}. -\texttt{f} can not be merged into existing edges and thus a new edge will be -created from the end of the common suffix to the end. This results in subgraph -SG3. Note that we did not take care of confluence nodes in the common prefix -and because of that the word \texttt{abcf} is also accepted even when it was -not added to the graph. This is unwanted behaviour. Going back to the common -prefix \texttt{aec}. When we check for confluence nodes we see that the last -node of the common prefix is such a node and that we need to clone the path -and separate the route to the final node. $q_3$ will be cloned into $q_6$ with -the route of the common prefix. Tracking the route back we do not encounter any -other confluence states and therefore we can merge in the suffix which results -in the final DAWG visible in subgraph SG4. No new words are added. +Adding the first entry \texttt{abcd} is trivial because just creates a single +path and does not require any hard work. Because the common prefix we find in +Step 1 is empty the suffix will be the entire word. There is also no +possibility of merging nodes because there were no nodes to begin with. The +result of adding the first word is visible in subgraph SG1. + +For the second entry \texttt{aecd} we will have to do some extra work. +The common prefix found in Step 1 is \texttt{a} which we add to the graph. This +leaves us in Step 2 with the suffix \texttt{ecd} which we add too. In Step 3 we +see that there are no confluence nodes in our common prefix and therefore we +do not have to clone nodes. In Step 4 we traverse from the last node back to +the beginning of the suffix and find a common suffix \texttt{cd} and we can +merge these nodes. In this way we can reuse the transition from $q_3$ to $q_4$. +This leaves us with subgraph SG2. + +We now add the last entry which is the word \texttt{aecf}. When we do this +without the confluence node checking we encounter an unwanted extra word. In +Step 1 we find the common prefix \texttt{aec} and that leaves us with the +suffix \texttt{f} which we will add. This creates subgraph SG3 and we notice +there is an extra word present in the graph, namely the word \texttt{abcf}. +This extra word appeared because of the confluence node $q_3$ which was present +in the common prefix introducing an unwanted path. Therefore in Step 3 when we +check for confluence node we would see that node $q_3$ is a confluence node and +needs to be cloned off the original path, by cloning we take the added suffix +with it to create an entirely new route. Tracking the route back again we do +not encounter any other confluence nodes so we can start Step 4. For this word +there is no common suffix and therefore no merging will be applied. This +results in subgraph SG4 which is the final DAWG containing only the words +added. \begin{figure}[H] \caption{Step 1} diff --git a/thesis2/Makefile b/thesis2/Makefile index e2c2ad8..647c756 100644 --- a/thesis2/Makefile +++ b/thesis2/Makefile @@ -1,5 +1,5 @@ SHELL:=/bin/bash -VERSION:=0.8 +VERSION:=0.9 all: thesis diff --git a/thesis2/todo b/thesis2/todo index 7a76ff7..e6baa38 100644 --- a/thesis2/todo +++ b/thesis2/todo @@ -2,9 +2,7 @@ Algemeen: - Formele taal + natuurlijke Specifiek: -- Uitleggen minimaliseren + maat - Minimaliseert dit algorithme? -- Stappen in het voorbeeld beter beschrijven(punt en komma veranderen) - Referenties toevoegen en andere (huidige) toepassingen van algorithme Algoritme: -- 2.20.1