934f39b3498f76701b51bb0c69e6727b6b22892b
[bsc-thesis1415.git] / program / dawg / dawg.py
1 import sys
2
3
4 class DAWG:
5 def __init__(self):
6 self.nodes = {'q0'}
7 self.delta = {'q0': {}}
8 self.final = set()
9 self.start = 'q0'
10 self.register = None
11
12 def common_prefix(self, word, current=None):
13 if current is None:
14 current = self.start
15 if word == '':
16 return ''
17 char, rest = word[0], word[1:]
18 if char in self.delta[current]:
19 return char + self.common_prefix(rest, self.delta[current][char])
20 else:
21 return ''
22
23 def delta_star(self, start, word):
24 if not word:
25 return start
26 else:
27 return self.delta_star(self.delta[start][word[0]], word[1:])
28
29 def equiv(self, a, b):
30 return (a in self.final) == (b in self.final) and\
31 len(self.delta[a].keys()) == len(self.delta[b].keys()) and\
32 self.delta[a].keys() == self.delta[b].keys()
33
34 def replace_or_register(self, suffix):
35 while suffix:
36 parent, char, state = suffix.pop()
37 for r in self.register:
38 if self.equiv(state, r):
39 self.delta[parent][char] = r
40 if state in self.final:
41 self.final.remove(state)
42 self.nodes.remove(state)
43 del(self.delta[state])
44 break
45 else:
46 self.register.add(state)
47
48 def add_suffix(self, state, current_suffix):
49 nodenum = max(int(w[1:]) for w in self.nodes)+1
50 reverse = []
51 for c in current_suffix:
52 newnode = 'q{}'.format(nodenum)
53 self.nodes.add(newnode)
54 nodenum += 1
55 self.delta[state][c] = newnode
56 self.delta[newnode] = {}
57 reverse.append((state, c, newnode))
58 state = newnode
59 self.final.add(newnode)
60 return reverse
61
62 def add_words_sorted(self, words, start=None):
63 self.register = set()
64 # words = sorted(words)
65 while words:
66 word = words.pop()
67 print('adding: {}'.format(word))
68 common_prefix = self.common_prefix(word)
69 print('common_prefix=', common_prefix)
70 last_state = self.delta_star(self.start, common_prefix)
71 print('last_state=', last_state)
72 current_suffix = word[len(common_prefix):]
73 print('current_suffix=', current_suffix)
74 if current_suffix:
75 suffix = self.add_suffix(last_state, current_suffix)
76 print('added_suffix=', suffix)
77 self.replace_or_register(suffix)
78 print('after: register: ', self.register)
79 else:
80 self.final.add(last_state)
81
82 def to_dot(self, options=''):
83 s = 'digraph {{\n{}\nn0 [style=invis]\n'.format(options)
84 for node in self.nodes:
85 s += '{} [shape={}circle]\n'.format(
86 node, 'double' if node in self.final else '')
87 s += 'n0 -> {}\n'.format(self.start)
88 for node, transitions in self.delta.items():
89 for letter, state in transitions.items():
90 s += '{} -> {} [label="{}"]\n'.format(node, state, letter)
91 s += '}\n'
92 return s
93
94
95 if __name__ == '__main__':
96 d = DAWG()
97 # d.add_words_sorted(['abd', 'bad', 'bae'])
98 d.add_words_sorted(['abcde', 'fghde', 'fghdghde'])
99 sys.stderr.write(d.to_dot(options='rankdir=LR'))