small dawg implementation
authorMart Lubbers <mart@martlubbers.net>
Mon, 9 Feb 2015 11:29:20 +0000 (12:29 +0100)
committerMart Lubbers <mart@martlubbers.net>
Mon, 9 Feb 2015 11:29:20 +0000 (12:29 +0100)
program/dawg/dawg.py [new file with mode: 0644]

diff --git a/program/dawg/dawg.py b/program/dawg/dawg.py
new file mode 100644 (file)
index 0000000..82a0cda
--- /dev/null
@@ -0,0 +1,95 @@
+import sys
+
+
+class DAWG:
+    def __init__(self):
+        self.nodes = {'q0'}
+        self.delta = {'q0': {}}
+        self.final = set()
+        self.start = 'q0'
+        self.register = None
+
+    def common_prefix(self, word, current=None):
+        if current is None:
+            current = self.start
+        if word == '':
+            return ''
+        char, rest = word[0], word[1:]
+        if char in self.delta[current]:
+            return char + self.common_prefix(rest, self.delta[current][char])
+        else:
+            return ''
+
+    def delta_star(self, start, word):
+        if not word:
+            return start
+        else:
+            return self.delta_star(self.delta[start][word[0]], word[1:])
+
+    def equiv(self, a, b):
+        return (a in self.final) == (b in self.final) and\
+            len(self.delta[a].keys()) == len(self.delta[b].keys()) and\
+            self.delta[a].keys() == self.delta[b].keys()
+
+    def replace_or_register(self, suffix):
+        while suffix:
+            parent, char, state = suffix.pop()
+            for r in self.register:
+                if self.equiv(state, r):
+                    self.delta[parent][char] = r
+                    if state in self.final:
+                        self.final.remove(state)
+                    self.nodes.remove(state)
+                    del(self.delta[state])
+                    break
+            else:
+                self.register.add(state)
+
+    def add_suffix(self, state, current_suffix):
+        nodenum = max(int(w[1:]) for w in self.nodes)+1
+        reverse = []
+        for c in current_suffix:
+            newnode = 'q{}'.format(nodenum)
+            self.nodes.add(newnode)
+            nodenum += 1
+            self.delta[state][c] = newnode
+            self.delta[newnode] = {}
+            reverse.append((state, c, newnode))
+            state = newnode
+        self.final.add(newnode)
+        return reverse
+
+    def add_words_sorted(self, words, start=None):
+        self.register = set()
+#        words = sorted(words)
+        while words:
+            word = words.pop()
+            common_prefix = self.common_prefix(word)
+            print('common_prefix=', common_prefix)
+            last_state = self.delta_star(self.start, common_prefix)
+            print('last_state=', last_state)
+            current_suffix = word[len(common_prefix):]
+            print('current_suffix=', current_suffix)
+            suffix = self.add_suffix(last_state, current_suffix)
+            print('added_suffix=', suffix)
+            self.replace_or_register(suffix)
+            print('after: register: ', self.register)
+
+    def to_dot(self, options=''):
+        s = 'digraph {{\n{}\nn0 [style=invis]\n'.format(options)
+        for node in self.nodes:
+            s += '{} [shape={}circle]\n'.format(
+                    node, 'double' if node in self.final else '')
+        s += 'n0 -> {}\n'.format(self.start)
+        for node, transitions in self.delta.items():
+            for letter, state in transitions.items():
+                s += '{} -> {} [label="{}"]\n'.format(node, state, letter)
+        s += '}\n'
+        return s
+
+
+if __name__ == '__main__':
+    d = DAWG()
+    d.add_words_sorted(['abd', 'bad', 'bae'])
+#    d.add_words_sorted(['abcde', 'fghde', 'fghdghde'])
+    sys.stderr.write(d.to_dot(options='rankdir=LR'))