Framework for fsm almost complete, graphviz output too
author Mart Lubbers <mart@martlubbers.net>
Wed, 2 Jul 2014 17:46:41 +0000 (19:46 +0200)
committer Mart Lubbers <mart@martlubbers.net>
Wed, 2 Jul 2014 17:46:41 +0000 (19:46 +0200)
program/regexex/.gitignore [new file with mode: 0644]
program/regexex/fsm.py [new file with mode: 0644]
thesis/methods.tex [new file with mode: 0644]
thesis/thesis.pdf
thesis/thesis.tex

diff --git a/program/regexex/.gitignore b/program/regexex/.gitignore
new file mode 100644 (file)
index 0000000..a136337
--- /dev/null
@@ -0,0 +1 @@
+*.pdf
diff --git a/program/regexex/fsm.py b/program/regexex/fsm.py
new file mode 100644 (file)
index 0000000..2f14fcc
--- /dev/null
@@ -0,0 +1,194 @@
+#!/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import re
+
+
+class fsm():
+    """Finite state machine that is built up from added strings.
+
+    Nodes, connections, added strings and marking locations are kept in
+    dictionaries on the instance; the machine can be written out in
+    graphviz format.
+    """
+    # Pattern for one regex token: a character group, a group or a (possibly
+    # escaped) single character, optionally followed by a quantifier.
+    # NOTE(review): no re.VERBOSE flag is passed, so the whitespace and the
+    # "# ..." annotations inside the pattern are literal pattern text and not
+    # layout/comments -- confirm whether the flag was intended.
+    split = re.compile(r"""
(
    (?:
        \[.*?\]|        # character group
        \(.*\)|         # group
        \\?.            # single character
    )
    (?:
        \{[0-9,]+\}|    # strict quantifier
        [?+*]           # loose quantifier
    )?
)""")
+
+    def __init__(self):
+        """Create a machine that only contains the start and end nodes."""
+        # Nodes: node key -> label; -1 and -2 are the fixed start/end nodes
+        self.nodes = {-1: '#start', -2: '#end'}
+        # Connections: source node key -> set of target node keys
+        self.connections = {}
+        # Dictionary of strings that are added and have to be integrated,
+        # each mapped to its markings dictionary
+        self.strings = {}
+        # Markings: marking name -> (start node key, end node key)
+        self.markings = {}
+
+    def add_node(self, string, key=-1):
+        """Add a node to the finite state machine
+
+        Required arguments:
+        string -- the string to which the node will match
+
+        Keyword arguments:
+        key    -- key to which the node is assigned in the internal structure,
+                  when left at -1 the key is generated (default -1)
+
+        Returns the key under which the node was stored.
+        """
+        # -1 is the "generate a key" sentinel: use 0 for an empty node table,
+        # otherwise one more than the highest existing key
+        key = key if key != -1 else\
+            0 if not self.nodes else max(self.nodes) + 1
+        self.nodes[key] = string
+        return key
+
+    def add_connection(self, node_from, node_to, check=True):
+        """Add a connection from one node to another node
+
+        Required arguments:
+        node_from -- key of the starting point of the connection
+        node_to   -- key of the ending point of the connection
+
+        Keyword arguments:
+        check     -- when True the connection is always added, when False it
+                     is only added if both nodes exist (default True)
+
+        Raises Exception when check is False and a node is missing.
+        """
+        keys = self.nodes.keys()
+        # NOTE(review): because of the `check or ...` short-circuit the node
+        # lookup only happens when check is False, which looks inverted with
+        # respect to the flag's name -- confirm the intended meaning.
+        if check or (node_from in keys and node_to in keys):
+            # Targets are kept in a set per source node, duplicates collapse
+            if node_from not in self.connections:
+                self.connections[node_from] = set()
+            self.connections[node_from].add(node_to)
+        else:
+            raise Exception('One or more nodes not found')
+
+    def add_string(self, string, markings):
+        """Add a string to the finite state machine and integrate it
+
+        Required arguments:
+        string   -- raw string to integrate in the finite state machine
+        markings -- dictionary of markings of the following form:
+                    {'mark0': (start, end), 'mark1': (start, end) ... }
+        """
+        # Spaces are stored as underscores; the length stays the same
+        string = string.replace(' ', '_')
+        # Register the markings under the converted string, then build the
+        # nodes and connections for it
+        self.strings[string] = markings
+        self.integrate(string)
+
+    def integrate(self, string):
+        """Integrate the given string in the finite state machine, will throw
+        an exception if the string was not registered in self.strings
+
+        Every character of the string becomes a new node; the nodes are
+        chained from the start node (-1) to the end node (-2). The start and
+        end nodes of a marking are linked into the chain at the indices given
+        by the markings of the string.
+
+        Required arguments:
+        string -- the raw string to integrate
+        """
+        if string not in self.strings:
+            raise Exception('Given string not present in the nodes')
+        # Add all the markings that were not added
+        markings = self.strings[string]
+        for k in markings.iterkeys():
+            if k not in self.markings:
+                # One start/end node pair per marking name, reused by every
+                # string that has the same marking
+                self.markings[k] = (self.add_node('#{}_start'.format(k)),
+                                    self.add_node('#{}_end'.format(k)))
+        # Loop over all tokens to add them as a node, the chain begins at
+        # the '#start' node
+        beforelast = -1
+        for enum, token in enumerate(string):
+            last = self.add_node(token)
+            # Only the first marking that matches this index is handled,
+            # because both branches break out of the loop
+            for k, v in markings.iteritems():
+                # When the index matches the starting point of a category:
+                # previous node -> marking start -> new node
+                if v[0] == enum:
+                    self.add_connection(beforelast, self.markings[k][0])
+                    self.add_connection(self.markings[k][0], last)
+                    break
+                # When the index matches the ending point of a category:
+                # new node -> marking end, the chain continues from the
+                # marking end node
+                elif v[1] == enum:
+                    self.add_connection(beforelast, last)
+                    self.add_connection(last, self.markings[k][1])
+                    last = self.markings[k][1]
+                    break
+            # No category match (for-else: only runs when no break happened)
+            else:
+                self.add_connection(beforelast, last)
+            # Remember the last node so the new node can connect to it
+            beforelast = last
+        # Connect to the end point
+        # NOTE(review): for an empty string the loop body never runs, so
+        # `last` has not been assigned when this line is reached.
+        self.add_connection(last, -2)
+
+    def optimize(self):
+        """Optimize the finite state machine
+
+        Placeholder: the body is empty and the machine is left unchanged.
+        """
+        pass
+
+    def graphviz(self, fp='-'):
+        """Print the finite state machine in graphviz format
+
+        Lines collected under a marking name are written inside a
+        "subgraph cluster_<name>" block, the lines collected under 'none'
+        are written directly in the digraph.
+
+        Keyword arguments:
+        fp -- filepath, '-' for stdout (default '-')
+        """
+        # Open file and write header
+        fp = sys.stdout if fp == '-' else open(fp, 'w')
+        fp.write('digraph fsm{\n')
+
+        # Traverse the graph from the start node to fill the subgraphs; each
+        # entry holds (node lines, edge lines)
+        # NOTE(review): `keys() + [...]` needs keys() to return a list, as
+        # it does in Python 2.
+        subgraphs = {a: ([], []) for a in self.markings.keys() + ['none']}
+        self.travers(-1, subgraphs)
+
+        # Print the nodes that are not in a subgraph
+        for nodes in subgraphs['none'][0] + subgraphs['none'][1]:
+            fp.write('{:<2}{}\n'.format(' ', nodes))
+        del(subgraphs['none'])
+
+        # Print the nodes that are in a subgraph
+        for key, sg in subgraphs.iteritems():
+            fp.write('{0:<2}subgraph cluster_{1} {{\n'.format(' ', key))
+            for nodes in sg[0] + sg[1]:
+                fp.write('{:<4}{}\n'.format(' ', nodes))
+            fp.write('{:<2}}}\n'.format(' '))
+
+        # write footer and close file, stdout is left open
+        fp.write('}')
+        if fp != sys.stdout:
+            fp.close()
+
+    def travers(self, current, subgraphs, state='none', visited=set()):
+        """Traverse the graph and fill the dictionary for graphviz output
+
+        Required arguments:
+        current   -- current node key
+        subgraphs -- dictionary of subgraphs of the form:
+                     {'subgraph1': ([], []), 'subgraph2': ([], []) ... }
+                     the first list receives node lines, the second list
+                     receives edge lines
+
+        Keyword arguments:
+        state     -- current state the traverser is in, the key of the
+                     subgraph that receives the lines (default 'none')
+        visited   -- set of visited nodes
+
+        NOTE(review): the default `visited` set is created once and the
+        recursive call does not pass it on, so every call that omits it
+        shares the same set, also across separate traversals -- confirm
+        that this is intended.
+        """
+        # The end node (-2) and already visited nodes are not expanded
+        if current != -2 and current not in visited:
+            visited.add(current)
+            subgraphs[state][0].append('{} [label="{}"]'.format(
+                current, self.nodes[current]))
+            # NOTE(review): direct lookup, a node without an entry in
+            # self.connections raises KeyError here.
+            for c in self.connections[current]:
+                subgraphs[state][1].append('{} -> {}'.format(current, c))
+                # Entering a marking start node switches to that subgraph,
+                # entering a marking end node switches back to 'none'; the
+                # changed state also applies to the remaining connections of
+                # the current node
+                for name, markings in self.markings.iteritems():
+                    if markings[0] == c:
+                        state = name
+                        break
+                    elif markings[1] == c:
+                        state = 'none'
+                        break
+                self.travers(c, subgraphs, state)
+
+
+if __name__ == '__main__':
+    # Demo: integrate four example strings with their markings and print the
+    # resulting machine in graphviz format to stdout.
+    # The marking names are Dutch: wanneer = when, wat = what, waar = where.
+    f = fsm()
+    f.add_string('maandag 11 augustus 2014 19:30 - Neutral Milk Hotel',
+                 {'wanneer': (0, 29), 'wat': (33, 50)})
+    f.add_string('dinsdag 19 augustus 2014 22:00 - Arkells',
+                 {'wanneer': (0, 29), 'wat': (33, 39)})
+    f.add_string('maandag 24 november 2014 20:30 - Fink',
+                 {'wanneer': (0, 29), 'wat': (33, 36)})
+    f.add_string(
+        'maandag 29 december 2014 20:30 - Alain Clark - Locatie: De Duif',
+        {'wanneer': (0, 29), 'wat': (33, 43), 'waar': (47, 62)})
+    f.optimize()
+    f.graphviz()
diff --git a/thesis/methods.tex b/thesis/methods.tex
new file mode 100644 (file)
index 0000000..20b7ae8
--- /dev/null
@@ -0,0 +1,4 @@
+\section{Regular expressions, finite state machines and automata}
+
+\section{Algorithm}
+
index 186b654..abacf2e 100644 (file)
Binary files a/thesis/thesis.pdf and b/thesis/thesis.pdf differ
index 793d707..2d1d9c5 100644 (file)
 \tableofcontents
 \newpage
 
-%\begin{abstract}
-%      \input{abstract.tex}
-%\end{abstract}
+\chapter*{
+       \centering 
+       \begin{normalsize}
+               Abstract
+       \end{normalsize}
+}
+\begin{quotation}
+       \noindent
+       \input{abstract.tex}
+\end{quotation}
+\clearpage
 
 \chapter{Introduction}
 \input{introduction.tex}
 
 \chapter{Methods}
+\input{methods.tex}
 
 \chapter{Results}