#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Finite state machine built from marked-up strings, with Graphviz output."""

import sys
import re


class fsm(object):
    """Graph of character nodes built from example strings.

    Every added string becomes a chain of one node per character running
    from the reserved start node (-1) to the reserved end node (-2).
    Named markings get one shared start node and one shared end node that
    are spliced into the chain at the given character indices.
    """

    # Matches one regular-expression element (character group, group or
    # single, optionally escaped, character) followed by an optional
    # quantifier.  re.VERBOSE is required: the pattern is laid out with
    # whitespace and inline comments that must not be matched literally.
    # Not used by the methods below.
    split = re.compile(r"""
(
    (?:
        \[.*?\]|        # character group
        \(.*\)|         # group
        \\?.            # single character
    )
    (?:
        \{[0-9,]+\}|    # strict quantifier
        [?+*]           # loose quantifier
    )?
)""", re.VERBOSE)

    def __init__(self):
        # Node key -> label; -1 and -2 are the reserved start and end nodes
        self.nodes = {-1: '#start', -2: '#end'}
        # Node key -> set of node keys it connects to
        self.connections = {}
        # Added (space-replaced) string -> its markings dictionary
        self.strings = {}
        # Marking name -> (start node key, end node key)
        self.markings = {}

    def add_node(self, string, key=-1):
        """Add a node to the finite state machine and return its key

        Required arguments:
        string -- the string to which the node will match

        Keyword arguments:
        key -- key to which the node is assigned in the internal structure,
               -1 means "generate the next free key" (default -1)
        """
        if key == -1:
            # One past the highest key in use; the reserved keys are
            # negative, so the first generated key is 0
            key = max(self.nodes) + 1 if self.nodes else 0
        self.nodes[key] = string
        return key

    def add_connection(self, node_from, node_to, check=True):
        """Add a connection to the finite state machine, will throw an
        exception if the check flag is set to True and some nodes aren't
        present

        Required arguments:
        node_from -- key of the starting point of the connection
        node_to -- key of the ending point of the connection

        Keyword arguments:
        check -- flag to check if the end and start point even exists
                 before creating the connection (default True)
        """
        if check and not (node_from in self.nodes and node_to in self.nodes):
            # ValueError is still caught by callers using `except Exception`
            raise ValueError('One or more nodes not found')
        self.connections.setdefault(node_from, set()).add(node_to)

    def add_string(self, string, markings):
        """Add a string to the finite state machine

        Spaces are replaced by underscores before the string is stored and
        integrated.

        Required arguments:
        string -- raw string to integrate in the finite state machine
        markings -- dictionary of markings of the following form:
                    {'mark0': (start, end), 'mark1': (start, end) ... }
                    where start and end are character indices
        """
        string = string.replace(' ', '_')
        self.strings[string] = markings
        self.integrate(string)

    def integrate(self, string):
        """Integrate the given string in the finite state machine, will throw
        an exception if the string was not registered with add_string

        Required arguments:
        string -- the raw string to integrate
        """
        if string not in self.strings:
            raise ValueError('Given string not present in the nodes')
        markings = self.strings[string]
        # Create the shared start/end nodes of markings seen for the first
        # time
        for name in markings:
            if name not in self.markings:
                self.markings[name] = (
                    self.add_node('#{}_start'.format(name)),
                    self.add_node('#{}_end'.format(name)))
        # Both start at the start node so that an empty string simply
        # connects start to end instead of failing on an unbound name
        last = beforelast = -1
        for index, token in enumerate(string):
            last = self.add_node(token)
            for name, span in markings.items():
                mark_start, mark_end = self.markings[name]
                if span[0] == index:
                    # Marking opens here: route through its start node
                    self.add_connection(beforelast, mark_start)
                    self.add_connection(mark_start, last)
                    break
                elif span[1] == index:
                    # Marking closes here: the chain continues from its
                    # end node
                    self.add_connection(beforelast, last)
                    self.add_connection(last, mark_end)
                    last = mark_end
                    break
            else:
                # No marking boundary at this index
                self.add_connection(beforelast, last)
            # Remember the last node so the next node can connect to it
            beforelast = last
        # Connect to the end point
        self.add_connection(last, -2)

    def optimize(self):
        """Optimize the finite state machine (placeholder, does nothing)"""
        pass

    def graphviz(self, fp='-'):
        """Print the finite state machine in graphviz format

        Keyword arguments:
        fp -- filepath, '-' for stdout (default '-')
        """
        # Collect node and edge lines per subgraph before touching the
        # output, so a traversal error cannot leave a half-written file
        groups = dict((name, ([], [])) for name in self.markings)
        groups['none'] = ([], [])
        # A fresh visited set per call keeps repeated calls independent
        self.travers(-1, groups, visited=set())
        loose_nodes, loose_edges = groups.pop('none')

        out = sys.stdout if fp == '-' else open(fp, 'w')
        try:
            out.write('digraph fsm{\n')
            # Lines that are not in a subgraph
            for line in loose_nodes + loose_edges:
                out.write('{:<2}{}\n'.format(' ', line))
            # One cluster per marking
            for name, (nodes, edges) in groups.items():
                out.write('{0:<2}subgraph cluster_{1} {{\n'.format(' ', name))
                for line in nodes + edges:
                    out.write('{:<4}{}\n'.format(' ', line))
                out.write('{:<2}}}\n'.format(' '))
            out.write('}')
        finally:
            # Never close stdout, always close a file we opened
            if out is not sys.stdout:
                out.close()

    def travers(self, current, subgraphs, state='none', visited=None):
        """Traverse the graph and fill the dictionary for graphviz output

        Required arguments:
        current -- current node key
        subgraphs -- dictionary of subgraphs of the form:
                     {'subgraph1': ([], []), 'subgraph2': ([], []) ... }
                     the first list receives node lines, the second edge
                     lines

        Keyword arguments:
        state -- name of the subgraph the traverser is currently in
                 (default 'none')
        visited -- set of visited node keys, a new set is created when
                   omitted (default None)
        """
        if visited is None:
            visited = set()
        if current == -2 or current in visited:
            return
        visited.add(current)
        subgraphs[state][0].append('{} [label="{}"]'.format(
            current, self.nodes[current]))
        # A node without outgoing connections simply has no edges
        for child in self.connections.get(current, ()):
            subgraphs[state][1].append('{} -> {}'.format(current, child))
            # The state is decided per child so that entering or leaving a
            # marking through one child does not affect its siblings
            child_state = state
            for name, (mark_start, mark_end) in self.markings.items():
                if mark_start == child:
                    child_state = name
                    break
                elif mark_end == child:
                    child_state = 'none'
                    break
            self.travers(child, subgraphs, child_state, visited)


if __name__ == '__main__':
    f = fsm()
    f.add_string('maandag 11 augustus 2014 19:30 - Neutral Milk Hotel',
                 {'wanneer': (0, 29), 'wat': (33, 50)})
    f.add_string('dinsdag 19 augustus 2014 22:00 - Arkells',
                 {'wanneer': (0, 29), 'wat': (33, 39)})
    f.add_string('maandag 24 november 2014 20:30 - Fink',
                 {'wanneer': (0, 29), 'wat': (33, 36)})
    f.add_string(
        'maandag 29 december 2014 20:30 - Alain Clark - Locatie: De Duif',
        {'wanneer': (0, 29), 'wat': (33, 43), 'waar': (47, 62)})
    f.optimize()
    f.graphviz()