working example
authorMart Lubbers <mart@martlubbers.net>
Wed, 23 Jul 2014 18:11:42 +0000 (20:11 +0200)
committerMart Lubbers <mart@martlubbers.net>
Wed, 23 Jul 2014 18:11:42 +0000 (20:11 +0200)
program/regexex/dafsa.py
program/regexex/f.dot

index d9d4766..de29c92 100644 (file)
@@ -24,7 +24,7 @@ class Afda():
             childs = []
         # End is not reached, find all the children with the correct letter
         else:
-            childs = [c for c in cons[c_node] if c[0] == word[c_index]]
+            childs = [c for c in cons[c_node][0] if c[0] == word[c_index]]
         # If there are no childs left, append the result
         if not childs or c_index >= len(word):
             res.append(cur + [c_node])
@@ -52,91 +52,56 @@ class Afda():
         connections   -- connections dictionary
         common_prefix -- common prefix
         """
-        print common_prefix
         for node in common_prefix:
-            print node
             if not isinstance(node, int):
                 node = node[1][1]
             num = 0
-            for c, n in connections.iteritems():
+            for c, (n, _) in connections.iteritems():
                 if node in [m[1] for m in n]:
                     num += 1
             if num > 1:
                 return node
 
-    def find_value(self, connections, from_node, to_node):
-        """Find the value from a node to another
-
-        Required arguments:
-        connections -- connections dictionary
-        from_node   -- from
-        to_node     -- to
-        """
-        return [c for c in connections[from_node] if c[1] == to_node][0][0]
-
-    def clone_p(self, connections, point):
-        """Clone everything from this point on
-
-        Required arguments:
-        connections -- connections dictionary
-        point       -- point to clone from
-        """
-        # Translation dictionary for the new points
-        newnames = {point[0]: point[0], point[1]: self.new_num(connections)}
-        # Fix point all the endpoints(should be only one
-        for c, n in connections.iteritems():
-            if len(n) == 0:
-                newnames[c] = c
-        # Make the first connection
-        connections[point[0]].append(
-            (self.find_value(connections, point[0], point[1]),
-             newnames[point[1]]))
-        # Add the first to-clone point to the frontier and give it a new node
-        toclone = [point[1]]
-        connections[newnames[point[1]]] = []
-        # While there are still nodes to be cloned, add them and visit their
-        # children
-        while toclone:
-            to = toclone.pop()
-            for child in connections[to]:
-                if child[1] not in newnames:
-                    # If the node is an endpoint, dont create new node
-                    if not connections[child[1]]:
-                        newnames[child[1]] = newnames[child[1]]
-                    # If the node is not an endpoint create a new node and add
-                    # it to the translation dictionary
-                    else:
-                        newnames[child[1]] = self.new_num(connections)
-                        connections[newnames[child[1]]] = []
-                        toclone.append(child[1])
-                # Make the connection
-                connections[newnames[to]].append(
-                    (child[0], newnames[child[1]]))
-        return newnames[point[1]]
-
     def clone(self, connections, states, from_node):
-        """Clone a subtree
-
-        Required arguments:
-        connections  -- connections dictionary
-        states       -- states to clone
-        from_node    -- node to start from
-        """
-        # Remember the last node number
-        last_node = states[0] if isinstance(states[0], int) else states[0][0]
-        # While the node is not found
-        for c in states:
-            # Derive the node number
-            node = c if isinstance(c, int) else c[0]
-            # If the node is the one we search for, search for its parent and
-            # start cloning
-            if node == from_node:
-                for i, rem in enumerate(connections[last_node]):
-                    if rem[1] == node:
-                        result = self.clone_p(connections, (last_node, rem[1]))
-                        del(connections[last_node][i])
-                        return result
-            last_node = node
+        print 'clone: {} from {}'.format(states, from_node)
+        newnames = {}
+        last_state = None
+        for i, c in enumerate(states):
+            node_number = c if isinstance(c, int) else c[0]
+            if node_number == from_node:
+                newnames[node_number] = self.new_num(connections)
+                connections[newnames[node_number]] =\
+                    ([], connections[node_number][1])
+                new_con = (connections[last_state[0]][0][0][0],
+                           newnames[node_number])
+                connections[last_state[0]] =\
+                    ([new_con], connections[last_state[0]][1])
+                last_state = newnames[node_number]
+                break
+            last_state = node_number, connections[node_number][0]
+        toclone = [newnames.keys()[0]]
+        for node, (cs, _) in connections.iteritems():
+            if not cs:
+                newnames[node] = node
+        while toclone:
+            current = toclone.pop()
+            for symbol, childnode in connections[current][0]:
+                #New point, clone
+                if childnode not in newnames:
+                    newnames[childnode] = self.new_num(connections)
+                if newnames[current] not in connections:
+                    connections[newnames[current]] =\
+                        ([], connections[current][1])
+                connections[newnames[current]][0].append(
+                    (symbol, newnames[childnode]))
+                toclone.append(childnode)
+        for i, state in enumerate(states[:]):
+            if isinstance(state, int):
+                states[i] = newnames.get(state, state)
+            else:
+                fr, (sym, to) = state
+                states[i] = (newnames.get(fr, fr), (sym, newnames.get(to, to)))
+        return states, states[-1]
 
     def new_num(self, connections):
         """Generate a new node number
@@ -153,27 +118,31 @@ class Afda():
         connections -- connections dictionary
         tra         -- parent, (symbol, state)
         """
-        for c in sorted(connections.iteritems()):
-            print c
-        print 'replace or register: {}'.format(tra)
         parent, (symbol, state) = tra
-        print 'parent, symbol, state = {} {} {}'.format(parent, symbol, state)
         replace = None
-        for cparent, children in connections.iteritems():
+        for cparent, (children, _) in connections.iteritems():
             for csymbol, cstate in children:
                 if symbol == csymbol and state == cstate and parent != cparent:
                     replace = (cparent, (csymbol, cstate))
-                    print 'found possible replace match: {}'.format(replace)
-#        matches = [c for c in connections[parent]
-#                   if c[0] == symbol and c[1] != state]
         if replace:
             # Delete the child
             # Register the child to the existing state
-            print 'replacing {} with {}'.format(tra, replace)
+            for node, (conns, _) in connections.items():
+                for enum, (symbolchild, nodechild) in\
+                        reversed(list(enumerate(conns[:]))):
+                    if nodechild == tra[0]:
+                        del(connections[node][0][enum])
+                        connections[node][0].insert(
+                            enum, (symbolchild, replace[0]))
+            for enum, child in\
+                    reversed(list(enumerate(connections[tra[0]][0]))):
+                if child == tra[1]:
+                    del(connections[tra[0]][0][enum])
+            if not connections[tra[0]][0]:
+                del(connections[tra[0]])
             return replace
         else:
             # It's already in the register, just go on
-            print 'registering'
             return tra
 
     def add_suffix(self, connections, last_state, current_suffix):
@@ -186,116 +155,122 @@ class Afda():
         """
         suffix = []
         endpoints = [i[0] for i in connections.iteritems()
-                     if not i[1] and i[0] != -1]
+                     if i[1][1]]
         for i, c in enumerate(current_suffix):
             if i == len(current_suffix)-1:
-                print 'last_state reached in adding suffix {}'.format(c)
-                print 'endpoints: {}'.format(endpoints)
                 if not endpoints:
-                    print 'no end point yet'
                     newnum = self.new_num(connections)
-                    connections[newnum] = []
+                    connections[newnum] = ([], True)
                 else:
-                    print 'end point found'
-                    newnum = endpoints[0]
+                    newnum = min(endpoints)
             else:
                 newnum = self.new_num(connections)
-                connections[newnum] = []
-            connections[last_state].append((c, newnum))
+                connections[newnum] = ([], False)
+            connections[last_state][0].append((c, newnum))
             suffix.append((last_state, (c, newnum)))
             last_state = newnum
 
-        print 'registering or replacing, add_suffix'
         prev = None
-        for i, tra in list(reversed(list(enumerate(suffix)))):
+        for i, tra in list(reversed(list(enumerate(suffix[1:])))):
             if prev:
-                connections[tra[0]] = [(tra[1][0], prev[0])]
+                tra = (tra[0], (tra[1][0], prev[0]))
                 prev = None
-            print '\nfrom {}'.format(tra)
             tra2 = self.replace_or_register(connections, tra)
             if tra2 != tra:
                 prev = tra2
                 suffix[i] = tra2
-                del(connections[tra[0]])
-            print 'to {}'.format(tra2)
         return suffix
 
     def train(self, wlist):
-# 0. start
-        register = set()
+# v0. start
 #        connections = {-1: [('a', 2), ('b', 4)], 2: [('b', 3)], 3: [('d', 5)],
 #                       4: [('a', 3)], 5: []}
-        connections = {-1: []}
+        connections = {-1: ([], False)}
         for word in wlist:
-            print word
+# v0. prepare
+            raw_input('Press enter to train the next word\n\n')
+            print 'word := {}'.format(word)
+
 # v1. Find the common prefix and suffix and skip if word is in the adfa
-            print '\tI\n'
             common_prefix = self.common_prefix(connections, word)
+            print 'CommonPrefix := {}'.format(common_prefix)
             current_suffix = word[len(common_prefix)-1:]
-            print 'common_prefix: {}'.format(common_prefix)
-            print 'current_suffix: {}'.format(current_suffix)
-            if not current_suffix and not connections[common_prefix[-1]]:
+            print 'CurrentSuffix := {}'.format(current_suffix)
+            print 'if CurrentSuffix = ...'
+            if not current_suffix:
+                if not connections[common_prefix[-1]][1]:
+                    connections[common_prefix[-1]] =\
+                        (connections[common_prefix[-1]][0], True)
+                print 'continue'
                 continue
 
 # v2. Find the first confluence state in the common prefix path
-            print '\tII\n'
             first_state = self.first_cstate(connections, common_prefix)
-            print 'first_state: {}'.format(first_state)
+            print 'FirstState := {}'.format(first_state)
+            print 'if FirstState = ...'
             if not first_state:
-                print 'not cloning'
                 last_state = common_prefix[-1]
+                print 'LastState := {}'.format(last_state)
             else:
-                print 'cloning'
-                last_state = self.clone(
+                commen_prefix, last_state = self.clone(
                     connections, common_prefix, first_state)
-            print 'adding suffix: {}'.format(current_suffix)
+                print 'LastState := {} (cloned new common prefix: {}'.format(
+                    last_state, common_prefix)
             suffix = self.add_suffix(connections, last_state, current_suffix)
-            print 'suffix added: {}'.format(suffix)
-            path = [c[0] for c in common_prefix + suffix
-                    if not isinstance(c, int)]
-            print 'path: {}'.format(path)
+            print 'add_suffix({}, {})'.format(last_state, current_suffix)
 
 # 3a. If there is a confluence state and there has been cloning, loop through
 # the new states backwards and replace or register the states
-            print '\tIII\n'
+            print 'if FirstState is not ...'
             if first_state:
-                continue
-                print 'consider replace register clone'
                 first_state = self.first_cstate(connections, common_prefix)
-                print 'first_state {}'.format(first_state)
-                current_index = common_prefix.index(first_state)
-                print 'range: ', range(len(common_prefix, current_index, -1))
-                for i in range(len(common_prefix, current_index, -1)):
-                    pass
+                print 'First_state := {}'.format(first_state)
+                if first_state:
+                    current_index = None
+                    for enum, c in enumerate(common_prefix):
+                        if c[0] == first_state:
+                            current_index = enum
+                            break
+                    print 'CurrentIndex := {}'.format(current_index)
+#                    for ind in range(len(common_prefix)-1, current_index-1,
+#                    -1): print 'index: {} maps to {}'.format(ind,
+#                                                           common_prefix[ind])
+#                        self.clone(connections, common_prefix[:ind],
+#                                   first_state)
+#                        break
+#                        #Do stuff
+#                        pass
+                current_index = len(common_prefix)
 # v3b. There is no confluence state so the current index will be the last of
 # the common prefix
             else:
                 current_index = len(common_prefix)
+                print 'CurrentIndex := {}'.format(current_index)
 
 # 4. While we change things try to find equivalent states in the suffix to
 # replace or register
-            print '\tIV\n'
+            print 'part IV: pref: {}, suf: {}'.format(common_prefix,
+                                                      suffix)
             changed = True
             while changed:
+                break
                 print 'current_index: {}'.format(current_index)
                 print 'last_state: {}'.format(last_state)
                 current_index -= 1
                 current_state = None
-                current_state = path[current_index]
+#                current_state = path[current_index]
                 print 'current state: {}'.format(current_state)
                 old_state = last_state
                 if current_index > 0:
                     print 'replace or register changing state'
-                    register = register - {last_state}
+#                    register = register - {last_state}
 #                self.replace_or_register(connections, current_state)
                 changed = old_state != last_state
 
 # v5. If there has been no change and there are characters left
-            print '\tV\n'
             if not changed and current_index > 0:
-                register = register.union({current_state})
-            import pdb
-            pdb.set_trace()
+                pass
+#                register = register.union({current_state})
 
         self.graphviz(connections, 'f.dot')
 
@@ -303,10 +278,10 @@ class Afda():
         fout = sys.stdout if fp == '-' else open(fp, 'w')
         fout.write('digraph finite_state_machine {\n')
         fout.write('\tnode [shape = doublecircle]; {}\n'.format(
-            ' '.join(str(c) for c in connections if not connections[c])))
+            ' '.join(str(c) for c in connections if connections[c][1])))
         fout.write('\tnode [shape = circle]; {}\n'.format(
-            ' '.join(str(c) for c in connections if connections[c])))
-        for node, targets in connections.iteritems():
+            ' '.join(str(c) for c in connections if not connections[c][1])))
+        for node, (targets, _) in connections.iteritems():
             for target in targets:
                 fout.write('\t{} -> {} [label = "{}"];\n'.format(
                     node, target[1], target[0]))
@@ -317,4 +292,8 @@ class Afda():
 
 if __name__ == '__main__':
     afda = Afda()
-    afda.train(['abd', 'bad', 'bae'])
+#    afda.train(['abcde', 'fghde', 'fghdghde'])
+    afda.train([['wie', '_', 'h', '_', 'wat'],
+                ['wie', '_', 'h', '_', 'wat', '_', 'h', '_', 'wanneer'],
+                ['wie', '_', 'h', '_', 'wat']])
+#    afda.train(['abd', 'ab', 'bad', 'bae'])
index 4411aa4..1104fab 100644 (file)
@@ -1,11 +1,13 @@
 digraph finite_state_machine {
-       node [shape = doublecircle]; 2
-       node [shape = circle]; 0 1 3 4 -1
-       0 -> 1 [label = "b"];
-       1 -> 2 [label = "d"];
-       3 -> 4 [label = "a"];
-       4 -> 2 [label = "d"];
-       4 -> 2 [label = "e"];
-       -1 -> 0 [label = "a"];
-       -1 -> 3 [label = "b"];
+       node [shape = doublecircle]; 4
+       node [shape = circle]; 0 1 2 3 5 6 7 -1
+       0 -> 1 [label = "_"];
+       1 -> 2 [label = "h"];
+       2 -> 3 [label = "_"];
+       3 -> 4 [label = "wat"];
+       4 -> 5 [label = "_"];
+       5 -> 6 [label = "h"];
+       6 -> 7 [label = "_"];
+       7 -> 4 [label = "wanneer"];
+       -1 -> 0 [label = "wie"];
 }
\ No newline at end of file