rc1
[bsc-thesis1415.git] / program / dawg / dawg.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4
5 class DAWG:
6 """Class representing a DAWG
7
8 Variables:
9 v - Set of nodes
10 v0 - Start node
11 f - Set of final nodes
12 delta - Delta function
13 """
14 def __init__(self):
15 """Initialize with null graph."""
16 self.v = {'q0'}
17 self.delta = {'q0': {}}
18 self.f = set()
19 self.v0 = 'q0'
20 self.register = None
21
22 def common_prefix(self, word, current):
23 """Find the common prefix of word starting from current."""
24 try:
25 char, rest = word[0], word[1:]
26 return char + self.common_prefix(rest, self.delta[current][char])
27 except (KeyError, IndexError):
28 return ''
29
30 def delta_star(self, c, w):
31 """Calculate the final node when traversing the graph from node c with
32 word w"""
33 return c if not w else self.delta_star(self.delta[c][w[0]], w[1:])
34
35 def equiv(self, a, b):
36 """Determine if two nodes are equivalent. This is the case when they
37 have the same final flag, the same number of children and their
38 children are equal."""
39 if (a in self.f) != (b in self.f) or\
40 self.delta[a].keys() != self.delta[b].keys():
41 return False
42 return all(self.equiv(x, y) for x, y in
43 zip(self.delta[a].values(), self.delta[b].values()))
44
45 def replace_or_register(self, suffix):
46 """Starting from the back try to merge nodes in the suffix."""
47 while suffix:
48 parent, char, state = suffix.pop()
49 for r in self.register:
50 if self.equiv(state, r):
51 self.delta[parent][char] = r
52 if state in self.f:
53 self.f.remove(state)
54 self.v.remove(state)
55 del(self.delta[state])
56 break
57 else:
58 self.register.add(state)
59
60 def add_suffix(self, state, current_suffix):
61 """Add the current_suffix to the graph from state and return it"""
62 nodenum = max(int(w[1:]) for w in self.v)+1
63 suffix = []
64 for c in current_suffix:
65 newnode = 'q{}'.format(nodenum)
66 self.v.add(newnode)
67 nodenum += 1
68 self.delta[state][c] = newnode
69 self.delta[newnode] = {}
70 suffix.append((state, c, newnode))
71 state = newnode
72 self.f.add(newnode)
73 return suffix
74
75 def add_words(self, words):
76 """Add words to the dawg"""
77 self.register = set()
78 words = sorted(words)
79 while words:
80 word = words.pop()
81 common_prefix = self.common_prefix(word, self.v0)
82 last_state = self.delta_star(self.v0, common_prefix)
83 current_suffix = word[len(common_prefix):]
84 if current_suffix:
85 suffix = self.add_suffix(last_state, current_suffix)
86 self.replace_or_register(suffix)
87 else:
88 self.f.add(last_state)
89
90 def to_dot(self, options=''):
91 """Return the graphviz(dot) string representation of the graph"""
92 s = 'digraph {{\n{}\nn0 [style=invis]\n'.format(options)
93 for node in self.v:
94 s += '{} [shape={}circle]\n'.format(
95 node, 'double' if node in self.f else '')
96 s += 'n0 -> {}\n'.format(self.v0)
97 for node, transitions in self.delta.items():
98 for letter, state in transitions.items():
99 s += '{} -> {} [label="{}"]\n'.format(node, state, letter)
100 return s + '}\n'