-import sys
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
class DAWG:
+ """Class representing a DAWG
+
+ Variables:
+ v - Set of nodes
+ v0 - Start node
+ f - Set of final nodes
+ delta - Delta function
+ """
def __init__(self):
- self.nodes = {'q0'}
+ """Initialize with null graph."""
+ self.v = {'q0'}
self.delta = {'q0': {}}
- self.final = set()
- self.start = 'q0'
+ self.f = set()
+ self.v0 = 'q0'
self.register = None
- def common_prefix(self, word, current=None):
- if current is None:
- current = self.start
- if word == '':
- return ''
- char, rest = word[0], word[1:]
- if char in self.delta[current]:
+ def common_prefix(self, word, current):
+ """Find the common prefix of word starting from current."""
+ try:
+ char, rest = word[0], word[1:]
return char + self.common_prefix(rest, self.delta[current][char])
- else:
+ except (KeyError, IndexError):
return ''
- def delta_star(self, start, word):
- if not word:
- return start
- else:
- return self.delta_star(self.delta[start][word[0]], word[1:])
+ def delta_star(self, c, w):
+ """Calculate the final node when traversing the graph from node c with
+ word w"""
+ return c if not w else self.delta_star(self.delta[c][w[0]], w[1:])
def equiv(self, a, b):
- return (a in self.final) == (b in self.final) and\
- len(self.delta[a].keys()) == len(self.delta[b].keys()) and\
- self.delta[a].keys() == self.delta[b].keys()
+ """Determine if two nodes are equivalent. This is the case when they
+ have the same final flag, the same number of children and their
+ children are equal."""
+ if (a in self.f) != (b in self.f) or\
+ self.delta[a].keys() != self.delta[b].keys():
+ return False
+ return all(self.equiv(x, y) for x, y in
+ zip(self.delta[a].values(), self.delta[b].values()))
def replace_or_register(self, suffix):
+ """Starting from the back try to merge nodes in the suffix."""
while suffix:
parent, char, state = suffix.pop()
for r in self.register:
if self.equiv(state, r):
self.delta[parent][char] = r
- if state in self.final:
- self.final.remove(state)
- self.nodes.remove(state)
+ if state in self.f:
+ self.f.remove(state)
+ self.v.remove(state)
del(self.delta[state])
break
else:
self.register.add(state)
def add_suffix(self, state, current_suffix):
- nodenum = max(int(w[1:]) for w in self.nodes)+1
- reverse = []
+ """Add the current_suffix to the graph from state and return it"""
+ nodenum = max(int(w[1:]) for w in self.v)+1
+ suffix = []
for c in current_suffix:
newnode = 'q{}'.format(nodenum)
- self.nodes.add(newnode)
+ self.v.add(newnode)
nodenum += 1
self.delta[state][c] = newnode
self.delta[newnode] = {}
- reverse.append((state, c, newnode))
+ suffix.append((state, c, newnode))
state = newnode
- self.final.add(newnode)
- return reverse
+ self.f.add(newnode)
+ return suffix
- def add_words_sorted(self, words, start=None):
+ def add_words(self, words):
+ """Add words to the dawg"""
self.register = set()
-# words = sorted(words)
+ words = sorted(words)
while words:
word = words.pop()
- print('adding: {}'.format(word))
- common_prefix = self.common_prefix(word)
- print('common_prefix=', common_prefix)
- last_state = self.delta_star(self.start, common_prefix)
- print('last_state=', last_state)
+ common_prefix = self.common_prefix(word, self.v0)
+ last_state = self.delta_star(self.v0, common_prefix)
current_suffix = word[len(common_prefix):]
- print('current_suffix=', current_suffix)
if current_suffix:
suffix = self.add_suffix(last_state, current_suffix)
- print('added_suffix=', suffix)
self.replace_or_register(suffix)
- print('after: register: ', self.register)
else:
- self.final.add(last_state)
+ self.f.add(last_state)
def to_dot(self, options=''):
+ """Return the graphviz(dot) string representation of the graph"""
s = 'digraph {{\n{}\nn0 [style=invis]\n'.format(options)
- for node in self.nodes:
+ for node in self.v:
s += '{} [shape={}circle]\n'.format(
- node, 'double' if node in self.final else '')
- s += 'n0 -> {}\n'.format(self.start)
+ node, 'double' if node in self.f else '')
+ s += 'n0 -> {}\n'.format(self.v0)
for node, transitions in self.delta.items():
for letter, state in transitions.items():
s += '{} -> {} [label="{}"]\n'.format(node, state, letter)
- s += '}\n'
- return s
+ return s + '}\n'
if __name__ == '__main__':
d = DAWG()
-# d.add_words_sorted(['abd', 'bad', 'bae'])
- d.add_words_sorted(['abcde', 'fghde', 'fghdghde'])
- sys.stderr.write(d.to_dot(options='rankdir=LR'))
+ d.add_words(['abd', 'bad', 'bae'])
+# d.add_words(['abcde', 'fghde', 'fghdghde'])
+ print repr(d.common_prefix('', d.v0))
+ print d.to_dot(options='rankdir=LR')