From: Mart Lubbers Date: Wed, 11 Feb 2015 12:02:38 +0000 (+0100) Subject: update X-Git-Url: https://git.martlubbers.net/?a=commitdiff_plain;h=7cf724ebcb0d8634f436ec83e6f247dc8a00b662;p=bsc-thesis1415.git update --- diff --git a/program/dawg/dawg.py b/program/dawg/dawg.py index 934f39b..feefcd0 100644 --- a/program/dawg/dawg.py +++ b/program/dawg/dawg.py @@ -1,99 +1,108 @@ -import sys +#!/usr/bin/env python +# -*- coding: utf-8 -*- class DAWG: + """Class representing a DAWG + + Variables: + v - Set of nodes + v0 - Start node + f - Set of final nodes + delta - Delta function + """ def __init__(self): - self.nodes = {'q0'} + """Initialize with null graph.""" + self.v = {'q0'} self.delta = {'q0': {}} - self.final = set() - self.start = 'q0' + self.f = set() + self.v0 = 'q0' self.register = None - def common_prefix(self, word, current=None): - if current is None: - current = self.start - if word == '': - return '' - char, rest = word[0], word[1:] - if char in self.delta[current]: + def common_prefix(self, word, current): + """Find the common prefix of word starting from current.""" + try: + char, rest = word[0], word[1:] return char + self.common_prefix(rest, self.delta[current][char]) - else: + except (KeyError, IndexError): return '' - def delta_star(self, start, word): - if not word: - return start - else: - return self.delta_star(self.delta[start][word[0]], word[1:]) + def delta_star(self, c, w): + """Calculate the final node when traversing the graph from node c with + word w""" + return c if not w else self.delta_star(self.delta[c][w[0]], w[1:]) def equiv(self, a, b): - return (a in self.final) == (b in self.final) and\ - len(self.delta[a].keys()) == len(self.delta[b].keys()) and\ - self.delta[a].keys() == self.delta[b].keys() + """Determine if two nodes are equivalent. This is the case when they + have the same final flag, the same number of children and their + children are equal.""" + if (a in self.f) != (b in self.f) or\ + self.delta[a].keys() != self.delta[b].keys(): + return False + return all(self.equiv(x, y) for x, y in + zip(self.delta[a].values(), self.delta[b].values())) def replace_or_register(self, suffix): + """Starting from the back try to merge nodes in the suffix.""" while suffix: parent, char, state = suffix.pop() for r in self.register: if self.equiv(state, r): self.delta[parent][char] = r - if state in self.final: - self.final.remove(state) - self.nodes.remove(state) + if state in self.f: + self.f.remove(state) + self.v.remove(state) del(self.delta[state]) break else: self.register.add(state) def add_suffix(self, state, current_suffix): - nodenum = max(int(w[1:]) for w in self.nodes)+1 - reverse = [] + """Add the current_suffix to the graph from state and return it""" + nodenum = max(int(w[1:]) for w in self.v)+1 + suffix = [] for c in current_suffix: newnode = 'q{}'.format(nodenum) - self.nodes.add(newnode) + self.v.add(newnode) nodenum += 1 self.delta[state][c] = newnode self.delta[newnode] = {} - reverse.append((state, c, newnode)) + suffix.append((state, c, newnode)) state = newnode - self.final.add(newnode) - return reverse + self.f.add(newnode) + return suffix - def add_words_sorted(self, words, start=None): + def add_words(self, words): + """Add words to the dawg""" self.register = set() -# words = sorted(words) + words = sorted(words) while words: word = words.pop() - print('adding: {}'.format(word)) - common_prefix = self.common_prefix(word) - print('common_prefix=', common_prefix) - last_state = self.delta_star(self.start, common_prefix) - print('last_state=', last_state) + common_prefix = self.common_prefix(word, self.v0) + last_state = self.delta_star(self.v0, common_prefix) current_suffix = word[len(common_prefix):] - print('current_suffix=', current_suffix) if current_suffix: suffix = self.add_suffix(last_state, current_suffix) - print('added_suffix=', suffix) self.replace_or_register(suffix) - print('after: register: ', self.register) else: - self.final.add(last_state) + self.f.add(last_state) def to_dot(self, options=''): + """Return the graphviz(dot) string representation of the graph""" s = 'digraph {{\n{}\nn0 [style=invis]\n'.format(options) - for node in self.nodes: + for node in self.v: s += '{} [shape={}circle]\n'.format( - node, 'double' if node in self.final else '') - s += 'n0 -> {}\n'.format(self.start) + node, 'double' if node in self.f else '') + s += 'n0 -> {}\n'.format(self.v0) for node, transitions in self.delta.items(): for letter, state in transitions.items(): s += '{} -> {} [label="{}"]\n'.format(node, state, letter) - s += '}\n' - return s + return s + '}\n' if __name__ == '__main__': d = DAWG() -# d.add_words_sorted(['abd', 'bad', 'bae']) - d.add_words_sorted(['abcde', 'fghde', 'fghdghde']) - sys.stderr.write(d.to_dot(options='rankdir=LR')) + d.add_words(['abd', 'bad', 'bae']) +# d.add_words(['abcde', 'fghde', 'fghdghde']) + print repr(d.common_prefix('', d.v0)) + print d.to_dot(options='rankdir=LR')