small commit
author Mart Lubbers <mart@martlubbers.net>
Mon, 7 Jul 2014 15:12:06 +0000 (17:12 +0200)
committer Mart Lubbers <mart@martlubbers.net>
Mon, 7 Jul 2014 15:12:06 +0000 (17:12 +0200)
program/regexex/fsm.py

index 1a24e88..c3b6357 100644 (file)
@@ -2,32 +2,28 @@
 # -*- coding: utf-8 -*-
 
 import sys
-import re
 
 
 class fsm():
-    split = re.compile(r"""
-(
-    (?:
-        \[.*?\]|        # character group
-        \(.*\)|         # group
-        \\?.            # single character
-    )
-    (?:
-        \{[0-9,]+\}|    # strict quantifier
-        [?+*]           # loose quantifier
-    )?
-)""")
+    """Class describing a finite state machine in the form of a directed graph
+
+    Internal variables:
+    nodes       -- dict for the node information,
+                   {id -> string}
+    connections -- dict mapping each node to the set of nodes it connects to,
+                   {from_id -> set([to_id, ...])}
+    strings     -- dict containing the strings the fsm is trained on:
+                   {string -> [(marking_start, marking_end), ...]}
+    markings    -- dictionary containing markings within the fsm
+                   {marking_name -> (node_start, node_end)}
+    """
+    nodestr = '{} [label="{}"]'
+    edgestr = '{} -> {}'
 
     def __init__(self):
-        # List of nodes number as key, info as value
         self.nodes = {-1: '#start', -2: '#end'}
-        # List of connection tuples
-#        self.connections = {-2: set()}
-        self.connections = {-2: list()}
-        # Dictionary of string that are added and have to be integrated
+        self.connections = {-2: set()}
         self.strings = {}
-        # Dictionary of marking locations in the graph
         self.markings = {}
 
     def add_node(self, string, key=-1):
@@ -61,10 +57,8 @@ class fsm():
         keys = self.nodes.keys()
         if check or (node_from in keys and node_to in keys):
             if node_from not in self.connections:
-#                self.connections[node_from] = set()
-                self.connections[node_from] = list()
-#            self.connections[node_from].add(node_to)
-            self.connections[node_from].append(node_to)
+                self.connections[node_from] = set()
+            self.connections[node_from].add(node_to)
         else:
             raise Exception('One or more nodes not found')
 
@@ -121,6 +115,7 @@ class fsm():
 
     def optimize(self):
         """Optimize the finite state machine"""
+        # Search the patterns outside the markings and try to merge
         pass
 
     def graphviz(self, fp='-'):
@@ -138,14 +133,16 @@ class fsm():
         self.travers(-1, subgraphs)
 
         # Print the nodes that are not in a subgraph
-        for nodes in subgraphs['none'][0] + subgraphs['none'][1]:
+        for nodes in [self.nodestr.format(*n) for n in subgraphs['none'][0]] +\
+                [self.edgestr.format(*n) for n in subgraphs['none'][1]]:
             fp.write('{:<2}{}\n'.format(' ', nodes))
         del(subgraphs['none'])
 
         #Print the nodes that are in a subgraph
         for key, sg in subgraphs.iteritems():
             fp.write('{0:<2}subgraph cluster_{1} {{\n'.format(' ', key))
-            for nodes in sg[0] + sg[1]:
+            for nodes in [self.nodestr.format(*n) for n in sg[0]] +\
+                    [self.edgestr.format(*n) for n in sg[1]]:
                 fp.write('{:<4}{}\n'.format(' ', nodes))
             fp.write('{:<2}}}\n'.format(' '))
 
@@ -154,35 +151,39 @@ class fsm():
         if fp != sys.stdout:
             fp.close()
 
-    def travers(self, current, subgraphs, state='none', visited=set(),
-                nodestring='{} [label="{}"]', edgestring='{} -> {}'):
+    def travers(self, current, subgraphs, state='none', visited=set()):
         """Traverse the graph and fill the dictionary for graphviz output
 
         Required arguments:
-        current   -- current node key
-        subgraphs -- dictionary of subgraphs of the form:
-                     {'subgraph1': ([], []), 'subgraph2': ([], []) ... }
+        current    -- current node key
+        subgraphs  -- dictionary of subgraphs of the form:
+                      {'subgraph1': ([label], [edge]),
+                       'subgraph2': ([label], [edge]),
+                       ... }
 
         Keyword arguments:
-        state     -- current state the traverser is in
-        visited   -- set of visited nodes
-        nodestring-- format string for the node-dot output
-        edgestring-- format string for the edge-dot output
+        state      -- current state the traverser is in
+        visited    -- set of visited nodes (default is shared across calls)
+        (nodestr and edgestr are class attributes, not arguments; the
+        tuples collected here are formatted with them afterwards)
         """
+        # Stop when the node is already visited
         if current not in visited:
+            # Add the node and mark visited
             visited.add(current)
-            subgraphs[state][0].append(nodestring.format(
-                current, self.nodes[current]))
+            subgraphs[state][0].append((current, self.nodes[current]))
+            # Go through all the connections from this node
             for c in self.connections[current]:
-                subgraphs[state][1].append(edgestring.format(current, c))
+                subgraphs[state][1].append((current, c))
+                newstate = state
+                # Check if the connected node starts or ends a marking
                 for name, markings in self.markings.iteritems():
                     if markings[0] == c:
-                        state = name
-                        break
+                        newstate = name
                     elif markings[1] == c:
-                        state = 'none'
-                        break
-                self.travers(c, subgraphs, state)
+                        newstate = 'none'
+                # Traverse the node
+                self.travers(c, subgraphs, newstate)
 
 
 if __name__ == '__main__':