--- /dev/null
+#!/bin/env python
+# -*- coding: utf-8 -*-
+import sys
+import re
class fsm(object):
    """Finite state machine built from example strings with named markings.

    Every character of an added string becomes a node. Named character
    ranges ("markings") are bracketed by a shared pair of start/end marker
    nodes, and the whole string is chained between the fixed '#start' (-1)
    and '#end' (-2) nodes. The resulting graph can be dumped in graphviz
    format, with every marking rendered as a cluster.
    """

    # Matches one regex-like token: a bracketed character group, a
    # parenthesised group or a single (optionally backslash-escaped)
    # character, optionally followed by a quantifier. No method of this
    # class references it.
    # NOTE(review): the closing delimiter and the flags of this pattern
    # were missing; the inline '#' comments only work under re.VERBOSE, so
    # that flag is assumed here -- confirm against the original file.
    split = re.compile(r"""
        (?:
            \[.*?\]|        # character group
            \(.*\)|         # group
            \\?.            # single character
        )
        (?:
            \{[0-9,]+\}|    # strict quantifier
            [?+*]           # loose quantifier
        )?
        """, re.VERBOSE)

    def __init__(self):
        """Create a machine holding only the start and end nodes."""
        # Node key -> label; -1 and -2 are the fixed entry and exit nodes
        self.nodes = {-1: '#start', -2: '#end'}
        # Source node key -> set of target node keys
        self.connections = {}
        # Added string -> the markings dictionary it was added with
        self.strings = {}
        # Marking name -> (start marker node key, end marker node key)
        self.markings = {}

    def add_node(self, string, key=-1):
        """Add a node to the finite state machine and return its key

        Required arguments:
        string -- the string to which the node will match

        Keyword arguments:
        key -- key to which the node is assigned in the internal structure;
               -1 means "generate the next free key" (default -1)
        """
        if key == -1:
            # One past the highest key in use; 0 for an empty node table
            key = max(self.nodes) + 1 if self.nodes else 0
        self.nodes[key] = string
        return key

    def add_connection(self, node_from, node_to, check=True):
        """Add a directed connection to the finite state machine

        Required arguments:
        node_from -- key of the starting point of the connection
        node_to -- key of the ending point of the connection

        Keyword arguments:
        check -- when True, verify that both end points exist before
                 creating the connection (default True)

        Raises LookupError when check is True and one of the nodes is
        not present.
        """
        if check and not (node_from in self.nodes and node_to in self.nodes):
            raise LookupError('One or more nodes not found')
        self.connections.setdefault(node_from, set()).add(node_to)

    def add_string(self, string, markings):
        """Add a string to the finite state machine and integrate it

        Spaces in the string are replaced by underscores before it is
        stored and integrated.

        Required arguments:
        string -- raw string to integrate in the finite state machine
        markings -- dictionary of markings of the following form:
                    {'mark0': (start, end), 'mark1': (start, end) ... }
                    where start and end are character indices
        """
        string = string.replace(' ', '_')
        self.strings[string] = markings
        self.integrate(string)

    def integrate(self, string):
        """Integrate a previously stored string in the finite state machine

        Required arguments:
        string -- the stored string (as kept in self.strings) to integrate

        Raises LookupError when the string was never stored.
        """
        if string not in self.strings:
            raise LookupError('Given string not present in the nodes')
        markings = self.strings[string]

        # Create the start/end marker nodes of markings seen for the first
        # time; markings with the same name share one pair of nodes.
        for name in markings:
            if name not in self.markings:
                self.markings[name] = (
                    self.add_node('#{}_start'.format(name)),
                    self.add_node('#{}_end'.format(name)))

        # Chain one node per character, starting from the '#start' node
        previous = -1
        for index, token in enumerate(string):
            current = self.add_node(token)
            for name, span in markings.items():
                if span[0] == index:
                    # A marking opens here: route through its start marker
                    marker = self.markings[name][0]
                    self.add_connection(previous, marker)
                    self.add_connection(marker, current)
                    break
                elif span[1] == index:
                    # A marking closes here: the end marker becomes the
                    # node the next character attaches to
                    marker = self.markings[name][1]
                    self.add_connection(previous, current)
                    self.add_connection(current, marker)
                    current = marker
                    break
            else:
                # No marking boundary at this index
                self.add_connection(previous, current)
            previous = current

        # Connect the tail to '#end'; for an empty string this links
        # '#start' directly to '#end'.
        self.add_connection(previous, -2)

    def optimize(self):
        """Optimize the finite state machine (placeholder, does nothing)"""
        pass

    def graphviz(self, fp='-'):
        """Print the finite state machine in graphviz format

        Nodes and edges outside any marking are written first, followed
        by one "subgraph cluster_<name>" per marking.

        Keyword arguments:
        fp -- filepath, '-' for stdout (default '-')
        """
        # Bucket nodes and edges per marking; 'none' holds everything else
        subgraphs = {name: ([], [])
                     for name in list(self.markings) + ['none']}
        self.travers(-1, subgraphs)

        # Build the complete output before touching the target so a
        # traversal failure never leaves a truncated file behind.
        lines = ['digraph fsm{\n']
        loose_nodes, loose_edges = subgraphs['none']
        lines.extend('  {}\n'.format(entry)
                     for entry in loose_nodes + loose_edges)
        for name, (nodes, edges) in subgraphs.items():
            if name == 'none':
                continue
            lines.append('  subgraph cluster_{} {{\n'.format(name))
            lines.extend('    {}\n'.format(entry) for entry in nodes + edges)
            lines.append('  }\n')
        lines.append('}')
        text = ''.join(lines)

        if fp == '-':
            sys.stdout.write(text)
        else:
            with open(fp, 'w') as handle:
                handle.write(text)

    def travers(self, current, subgraphs, state='none', visited=None):
        """Traverse the graph and fill the dictionary for graphviz output

        The traversal is recursive and depth-first; the '#end' node (-2)
        is never emitted as a node, only as an edge target.

        Required arguments:
        current -- current node key
        subgraphs -- dictionary of subgraphs of the form:
                     {'subgraph1': ([], []), 'subgraph2': ([], []) ... }
                     the first list receives node statements, the second
                     edge statements

        Keyword arguments:
        state -- name of the subgraph the current node belongs to
                 (default 'none')
        visited -- set of node keys already emitted; a fresh set is
                   created when omitted (default None)
        """
        if visited is None:
            visited = set()
        if current == -2 or current in visited:
            return
        visited.add(current)

        nodes, edges = subgraphs[state]
        nodes.append('{} [label="{}"]'.format(
            current, self._dot_escape(self.nodes[current])))
        # Nodes without outgoing connections simply have no edges
        for target in self.connections.get(current, ()):
            edges.append('{} -> {}'.format(current, target))
            # Each child gets its own state so that one branch entering or
            # leaving a marking cannot affect its siblings.
            self.travers(target, subgraphs,
                         self._state_after(target, state), visited)

    def _state_after(self, node, state):
        """Return the subgraph name that applies once node is entered"""
        for name, bounds in self.markings.items():
            if bounds[0] == node:
                return name
            if bounds[1] == node:
                return 'none'
        return state

    @staticmethod
    def _dot_escape(label):
        """Escape backslashes and double quotes for a quoted DOT label"""
        return str(label).replace('\\', '\\\\').replace('"', '\\"')
if __name__ == '__main__':
    # Demo: feed a few agenda lines into the machine and dump the graph.
    # Each entry pairs a raw line with its markings, given as inclusive
    # (start, end) character indices into that line.
    samples = (
        ('maandag 11 augustus 2014 19:30 - Neutral Milk Hotel',
         {'wanneer': (0, 29), 'wat': (33, 50)}),
        ('dinsdag 19 augustus 2014 22:00 - Arkells',
         {'wanneer': (0, 29), 'wat': (33, 39)}),
        ('maandag 24 november 2014 20:30 - Fink',
         {'wanneer': (0, 29), 'wat': (33, 36)}),
        ('maandag 29 december 2014 20:30 - Alain Clark - Locatie: De Duif',
         {'wanneer': (0, 29), 'wat': (33, 43), 'waar': (47, 62)}),
    )
    machine = fsm()
    for line, marks in samples:
        machine.add_string(line, marks)
    machine.optimize()
    # Writes the graphviz representation to stdout
    machine.graphviz()