update
authorMart Lubbers <mart@martlubbers.net>
Wed, 11 Feb 2015 12:02:38 +0000 (13:02 +0100)
committerMart Lubbers <mart@martlubbers.net>
Wed, 11 Feb 2015 12:02:38 +0000 (13:02 +0100)
program/dawg/dawg.py

index 934f39b..feefcd0 100644 (file)
-import sys
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
 
 
 class DAWG:
+    """Class representing a DAWG
+
+    Variables:
+    v     - Set of nodes
+    v0    - Start node
+    f     - Set of final nodes
+    delta - Delta function
+    """
     def __init__(self):
-        self.nodes = {'q0'}
+        """Initialize with null graph."""
+        self.v = {'q0'}
         self.delta = {'q0': {}}
-        self.final = set()
-        self.start = 'q0'
+        self.f = set()
+        self.v0 = 'q0'
         self.register = None
 
-    def common_prefix(self, word, current=None):
-        if current is None:
-            current = self.start
-        if word == '':
-            return ''
-        char, rest = word[0], word[1:]
-        if char in self.delta[current]:
+    def common_prefix(self, word, current):
+        """Find the common prefix of word starting from current."""
+        try:
+            char, rest = word[0], word[1:]
             return char + self.common_prefix(rest, self.delta[current][char])
-        else:
+        except (KeyError, IndexError):
             return ''
 
-    def delta_star(self, start, word):
-        if not word:
-            return start
-        else:
-            return self.delta_star(self.delta[start][word[0]], word[1:])
+    def delta_star(self, c, w):
+        """Calculate the final node when traversing the graph from node c with
+        word w"""
+        return c if not w else self.delta_star(self.delta[c][w[0]], w[1:])
 
     def equiv(self, a, b):
-        return (a in self.final) == (b in self.final) and\
-            len(self.delta[a].keys()) == len(self.delta[b].keys()) and\
-            self.delta[a].keys() == self.delta[b].keys()
+        """Determine if two nodes are equivalent. This is the case when they
+        have the same final flag, the same number of children and their
+        children are equal."""
+        if (a in self.f) != (b in self.f) or\
+                self.delta[a].keys() != self.delta[b].keys():
+            return False
+        return all(self.equiv(x, y) for x, y in
+                   zip(self.delta[a].values(), self.delta[b].values()))
 
     def replace_or_register(self, suffix):
+        """Starting from the back try to merge nodes in the suffix."""
         while suffix:
             parent, char, state = suffix.pop()
             for r in self.register:
                 if self.equiv(state, r):
                     self.delta[parent][char] = r
-                    if state in self.final:
-                        self.final.remove(state)
-                    self.nodes.remove(state)
+                    if state in self.f:
+                        self.f.remove(state)
+                    self.v.remove(state)
                     del(self.delta[state])
                     break
             else:
                 self.register.add(state)
 
     def add_suffix(self, state, current_suffix):
-        nodenum = max(int(w[1:]) for w in self.nodes)+1
-        reverse = []
+        """Add the current_suffix to the graph from state and return it"""
+        nodenum = max(int(w[1:]) for w in self.v)+1
+        suffix = []
         for c in current_suffix:
             newnode = 'q{}'.format(nodenum)
-            self.nodes.add(newnode)
+            self.v.add(newnode)
             nodenum += 1
             self.delta[state][c] = newnode
             self.delta[newnode] = {}
-            reverse.append((state, c, newnode))
+            suffix.append((state, c, newnode))
             state = newnode
-        self.final.add(newnode)
-        return reverse
+        self.f.add(newnode)
+        return suffix
 
-    def add_words_sorted(self, words, start=None):
+    def add_words(self, words):
+        """Add words to the dawg"""
         self.register = set()
-#        words = sorted(words)
+        words = sorted(words)
         while words:
             word = words.pop()
-            print('adding: {}'.format(word))
-            common_prefix = self.common_prefix(word)
-            print('common_prefix=', common_prefix)
-            last_state = self.delta_star(self.start, common_prefix)
-            print('last_state=', last_state)
+            common_prefix = self.common_prefix(word, self.v0)
+            last_state = self.delta_star(self.v0, common_prefix)
             current_suffix = word[len(common_prefix):]
-            print('current_suffix=', current_suffix)
             if current_suffix:
                 suffix = self.add_suffix(last_state, current_suffix)
-                print('added_suffix=', suffix)
                 self.replace_or_register(suffix)
-                print('after: register: ', self.register)
             else:
-                self.final.add(last_state)
+                self.f.add(last_state)
 
     def to_dot(self, options=''):
+        """Return the graphviz(dot) string representation of the graph"""
         s = 'digraph {{\n{}\nn0 [style=invis]\n'.format(options)
-        for node in self.nodes:
+        for node in self.v:
             s += '{} [shape={}circle]\n'.format(
-                node, 'double' if node in self.final else '')
-        s += 'n0 -> {}\n'.format(self.start)
+                node, 'double' if node in self.f else '')
+        s += 'n0 -> {}\n'.format(self.v0)
         for node, transitions in self.delta.items():
             for letter, state in transitions.items():
                 s += '{} -> {} [label="{}"]\n'.format(node, state, letter)
-        s += '}\n'
-        return s
+        return s + '}\n'
 
 
 if __name__ == '__main__':
     d = DAWG()
-#    d.add_words_sorted(['abd', 'bad', 'bae'])
-    d.add_words_sorted(['abcde', 'fghde', 'fghdghde'])
-    sys.stderr.write(d.to_dot(options='rankdir=LR'))
+    d.add_words(['abd', 'bad', 'bae'])
+#    d.add_words(['abcde', 'fghde', 'fghdghde'])
+    print repr(d.common_prefix('', d.v0))
+    print d.to_dot(options='rankdir=LR')