dafsa first rty

author Mart Lubbers <mart@martlubbers.net>

Tue, 15 Jul 2014 15:03:39 +0000 (17:03 +0200)

committer Mart Lubbers <mart@martlubbers.net>

Tue, 15 Jul 2014 15:03:39 +0000 (17:03 +0200)
author Mart Lubbers <mart@martlubbers.net>
Tue, 15 Jul 2014 15:03:39 +0000 (17:03 +0200)
committer Mart Lubbers <mart@martlubbers.net>
Tue, 15 Jul 2014 15:03:39 +0000 (17:03 +0200)
diff --git a/program/regexex/dafsa.py b/program/regexex/dafsa.py

new file mode 100644 (file)

index 0000000..f855c7a
--- /dev/null
+++ b/program/regexex/dafsa.py
@@ -0,0 +1,185 @@
+#!/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+
+
+class Afda():
+    def __init__(self):
+        pass
+
+    def common_prefix_p(self, cons, word, res, c_node, c_index, cur):
+        if c_index >= len(word):
+            childs = []
+        else:
+            childs = [c for c in cons[c_node] if c[0] == word[c_index]]
+        if not childs or c_index >= len(word):
+            res.append(cur + [c_node])
+        else:
+            for child in childs:
+                self.common_prefix_p(cons, word, res, child[1], c_index+1,
+                                     cur+[(c_node, child)])
+
+    def common_prefix(self, cons, word):
+        results = []
+        self.common_prefix_p(cons, word, results, -1, 0, [])
+        return max(results, key=lambda x: len(x))
+
+    def first_cstate(self, connections, common_prefix):
+        print common_prefix
+        for node in common_prefix:
+            print node
+            if not isinstance(node, int):
+                node = node[1][1]
+            num = 0
+            for c, n in connections.iteritems():
+                if node in [m[1] for m in n]:
+                    num += 1
+            if num > 1:
+                return node
+
+    def find_value(self, connections, from_node, to_node):
+        return [c for c in connections[from_node] if c[1] == to_node][0][0]
+
+    def clone_p(self, connections, point):
+        """clone everything from this point on"""
+        # Translation dictionary for the new points
+        newnames = {point[0]: point[0], point[1]: self.new_num(connections)}
+        # Fix point all the endpoints(should be only one
+        for c, n in connections.iteritems():
+            if len(n) == 0:
+                newnames[c] = c
+        # Make the first connection
+        connections[point[0]].append(
+            (self.find_value(connections, point[0], point[1]),
+             newnames[point[1]]))
+        # Add the first to-clone point to the frontier and give it a new node
+        toclone = [point[1]]
+        connections[newnames[point[1]]] = []
+        # While there are still nodes to be cloned, add them and visit their
+        # children
+        while toclone:
+            to = toclone.pop()
+            for child in connections[to]:
+                if child[1] not in newnames:
+                    # If the node is an endpoint, dont create new node
+                    if not connections[child[1]]:
+                        newnames[child[1]] = newnames[child[1]]
+                    # If the node is not an endpoint create a new node and add
+                    # it to the translation dictionary
+                    else:
+                        newnames[child[1]] = self.new_num(connections)
+                        connections[newnames[child[1]]] = []
+                        toclone.append(child[1])
+                # Make the connection
+                connections[newnames[to]].append(
+                    (child[0], newnames[child[1]]))
+        return newnames[point[1]]
+
+    def clone(self, connections, states, from_node):
+        # Remember the last node number
+        last_node = states[0] if isinstance(states[0], int) else states[0][0]
+        # While the node is not found
+        for c in states:
+            # Derive the node number
+            node = c if isinstance(c, int) else c[0]
+            # If the node is the one we search for, search for its parent and
+            # start cloning
+            if node == from_node:
+                for i, rem in enumerate(connections[last_node]):
+                    if rem[1] == node:
+                        result = self.clone_p(connections, (last_node, rem[1]))
+                        del(connections[last_node][i])
+                        return result
+            last_node = node
+
+    def new_num(self, connections):
+        return max(connections.keys())+1
+
+    def add_suffix(self, connections, last_state, current_suffix, first=False):
+        if first:
+            this = current_suffix
+            that = None
+        else:
+            this = current_suffix[:-1]
+            that = current_suffix[-1]
+        for c in this:
+            newnum = self.new_num(connections)
+            connections[last_state].append((c, newnum))
+            connections[newnum] = []
+            last_state = newnum
+        if that:
+            endstate = [i for i in connections.iteritems() if not i[1]][0]
+            connections[last_state].append((that, endstate[0]))
+
+    def train(self, wlist):
+#       register = None
+#        connections = {-1: [('a', 2), ('b', 4)], 2: [('b', 3)], 3: [('d', 5)],
+#                       4: [('a', 3)], 5: []}
+        connections = {-1: []}
+        first = True
+        for word in wlist:
+            if first:
+                self.add_suffix(connections, -1, word, True)
+                first = False
+            print word
+# Find the common prefix and suffix and skip if word is in the adfa
+            common_prefix = self.common_prefix(connections, word)
+            current_suffix = word[len(common_prefix)-1:]
+            print 'common_prefix: {}'.format(common_prefix)
+            print 'current_suffix: {}'.format(current_suffix)
+            if not current_suffix:
+                continue
+
+# Find the first confluence state in the common prefix path
+            first_state = self.first_cstate(connections, common_prefix)
+            print 'first_state: {}'.format(first_state)
+            if first_state:
+                last_state = self.clone(
+                    connections, common_prefix, first_state)
+            else:
+                last_state = common_prefix[-1]
+            print 'last_state: {}'.format(last_state)
+            self.add_suffix(connections, last_state, current_suffix)
+
+# If there is a confluence state and there has been cloning, loop through the
+# new states backwards and replace or register the states
+            if first_state:
+                print 'consider replace register clone'
+                first_state = self.first_cstate(connections, common_prefix)
+                print 'first_state {}'.format(first_state)
+                current_index = common_prefix.index(first_state)
+                print range(len(common_prefix, current_index, -1))
+                for i in range(len(common_prefix, current_index, -1)):
+                    pass
+# There is no confluence state so the current index will be the last of the
+# common prefix
+            else:
+                current_index = len(common_prefix)
+
+# While we change things try to find equivalent states in the suffix to
+# replace or register
+            #changed = True
+            raw_input('continue...\n')
+
+        self.graphviz(connections, 'f.dot')
+
+    def graphviz(self, connections, fp='-'):
+        fout = sys.stdout if fp == '-' else open(fp, 'w')
+        fout.write('digraph finite_state_machine {\n')
+        fout.write('\tnode [shape = doublecircle]; {}\n'.format(
+            ' '.join(str(c) for c in connections if not connections[c])))
+        fout.write('\tnode [shape = circle]; {}\n'.format(
+            ' '.join(str(c) for c in connections if connections[c])))
+        for node, targets in connections.iteritems():
+            for target in targets:
+                fout.write('\t{} -> {} [label = "{}"];\n'.format(
+                    node, target[1], target[0]))
+        fout.write('}')
+        if fp != '-':
+            fout.close()
+
+
+if __name__ == '__main__':
+    afda = Afda()
+    afda.train(['abd', 'bad', 'bae'])
diff --git a/program/regexex/f.dot b/program/regexex/f.dot

new file mode 100644 (file)

index 0000000..4411aa4
--- /dev/null
+++ b/program/regexex/f.dot
@@ -0,0 +1,11 @@
+digraph finite_state_machine {
+       node [shape = doublecircle]; 2
+       node [shape = circle]; 0 1 3 4 -1
+       0 -> 1 [label = "b"];
+       1 -> 2 [label = "d"];
+       3 -> 4 [label = "a"];
+       4 -> 2 [label = "d"];
+       4 -> 2 [label = "e"];
+       -1 -> 0 [label = "a"];
+       -1 -> 3 [label = "b"];
+}
+\ No newline at end of file
author	Mart Lubbers <mart@martlubbers.net>
	Tue, 15 Jul 2014 15:03:39 +0000 (17:03 +0200)
committer	Mart Lubbers <mart@martlubbers.net>
	Tue, 15 Jul 2014 15:03:39 +0000 (17:03 +0200)
program/regexex/dafsa.py	[new file with mode: 0644]	patch \| blob
program/regexex/f.dot	[new file with mode: 0644]	patch \| blob