From 2af331855cf5b243c21c4bfcef001e87cc542ba5 Mon Sep 17 00:00:00 2001 From: Mart Lubbers Date: Wed, 23 Jul 2014 20:11:42 +0200 Subject: [PATCH] working example --- program/regexex/dafsa.py | 255 ++++++++++++++++++--------------------- program/regexex/f.dot | 20 +-- 2 files changed, 128 insertions(+), 147 deletions(-) diff --git a/program/regexex/dafsa.py b/program/regexex/dafsa.py index d9d4766..de29c92 100644 --- a/program/regexex/dafsa.py +++ b/program/regexex/dafsa.py @@ -24,7 +24,7 @@ class Afda(): childs = [] # End is not reached, find all the children with the correct letter else: - childs = [c for c in cons[c_node] if c[0] == word[c_index]] + childs = [c for c in cons[c_node][0] if c[0] == word[c_index]] # If there are no childs left, append the result if not childs or c_index >= len(word): res.append(cur + [c_node]) @@ -52,91 +52,56 @@ class Afda(): connections -- connections dictionary common_prefix -- common prefix """ - print common_prefix for node in common_prefix: - print node if not isinstance(node, int): node = node[1][1] num = 0 - for c, n in connections.iteritems(): + for c, (n, _) in connections.iteritems(): if node in [m[1] for m in n]: num += 1 if num > 1: return node - def find_value(self, connections, from_node, to_node): - """Find the value from a node to another - - Required arguments: - connections -- connections dictionary - from_node -- from - to_node -- to - """ - return [c for c in connections[from_node] if c[1] == to_node][0][0] - - def clone_p(self, connections, point): - """Clone everything from this point on - - Required arguments: - connections -- connections dictionary - point -- point to clone from - """ - # Translation dictionary for the new points - newnames = {point[0]: point[0], point[1]: self.new_num(connections)} - # Fix point all the endpoints(should be only one - for c, n in connections.iteritems(): - if len(n) == 0: - newnames[c] = c - # Make the first connection - connections[point[0]].append( - (self.find_value(connections, point[0], point[1]), - newnames[point[1]])) - # Add the first to-clone point to the frontier and give it a new node - toclone = [point[1]] - connections[newnames[point[1]]] = [] - # While there are still nodes to be cloned, add them and visit their - # children - while toclone: - to = toclone.pop() - for child in connections[to]: - if child[1] not in newnames: - # If the node is an endpoint, dont create new node - if not connections[child[1]]: - newnames[child[1]] = newnames[child[1]] - # If the node is not an endpoint create a new node and add - # it to the translation dictionary - else: - newnames[child[1]] = self.new_num(connections) - connections[newnames[child[1]]] = [] - toclone.append(child[1]) - # Make the connection - connections[newnames[to]].append( - (child[0], newnames[child[1]])) - return newnames[point[1]] - def clone(self, connections, states, from_node): - """Clone a subtree - - Required arguments: - connections -- connections dictionary - states -- states to clone - from_node -- node to start from - """ - # Remember the last node number - last_node = states[0] if isinstance(states[0], int) else states[0][0] - # While the node is not found - for c in states: - # Derive the node number - node = c if isinstance(c, int) else c[0] - # If the node is the one we search for, search for its parent and - # start cloning - if node == from_node: - for i, rem in enumerate(connections[last_node]): - if rem[1] == node: - result = self.clone_p(connections, (last_node, rem[1])) - del(connections[last_node][i]) - return result - last_node = node + print 'clone: {} from {}'.format(states, from_node) + newnames = {} + last_state = None + for i, c in enumerate(states): + node_number = c if isinstance(c, int) else c[0] + if node_number == from_node: + newnames[node_number] = self.new_num(connections) + connections[newnames[node_number]] =\ + ([], connections[node_number][1]) + new_con = (connections[last_state[0]][0][0][0], + newnames[node_number]) + connections[last_state[0]] =\ + ([new_con], connections[last_state[0]][1]) + last_state = newnames[node_number] + break + last_state = node_number, connections[node_number][0] + toclone = [newnames.keys()[0]] + for node, (cs, _) in connections.iteritems(): + if not cs: + newnames[node] = node + while toclone: + current = toclone.pop() + for symbol, childnode in connections[current][0]: + #New point, clone + if childnode not in newnames: + newnames[childnode] = self.new_num(connections) + if newnames[current] not in connections: + connections[newnames[current]] =\ + ([], connections[current][1]) + connections[newnames[current]][0].append( + (symbol, newnames[childnode])) + toclone.append(childnode) + for i, state in enumerate(states[:]): + if isinstance(state, int): + states[i] = newnames.get(state, state) + else: + fr, (sym, to) = state + states[i] = (newnames.get(fr, fr), (sym, newnames.get(to, to))) + return states, states[-1] def new_num(self, connections): """Generate a new node number @@ -153,27 +118,31 @@ class Afda(): connections -- connections dictionary tra -- parent, (symbol, state) """ - for c in sorted(connections.iteritems()): - print c - print 'replace or register: {}'.format(tra) parent, (symbol, state) = tra - print 'parent, symbol, state = {} {} {}'.format(parent, symbol, state) replace = None - for cparent, children in connections.iteritems(): + for cparent, (children, _) in connections.iteritems(): for csymbol, cstate in children: if symbol == csymbol and state == cstate and parent != cparent: replace = (cparent, (csymbol, cstate)) - print 'found possible replace match: {}'.format(replace) -# matches = [c for c in connections[parent] -# if c[0] == symbol and c[1] != state] if replace: # Delete the child # Register the child to the existing state - print 'replacing {} with {}'.format(tra, replace) + for node, (conns, _) in connections.items(): + for enum, (symbolchild, nodechild) in\ + reversed(list(enumerate(conns[:]))): + if nodechild == tra[0]: + del(connections[node][0][enum]) + connections[node][0].insert( + enum, (symbolchild, replace[0])) + for enum, child in\ + reversed(list(enumerate(connections[tra[0]][0]))): + if child == tra[1]: + del(connections[tra[0]][0][enum]) + if not connections[tra[0]][0]: + del(connections[tra[0]]) return replace else: # It's already in the register, just go on - print 'registering' return tra def add_suffix(self, connections, last_state, current_suffix): @@ -186,116 +155,122 @@ class Afda(): """ suffix = [] endpoints = [i[0] for i in connections.iteritems() - if not i[1] and i[0] != -1] + if i[1][1]] for i, c in enumerate(current_suffix): if i == len(current_suffix)-1: - print 'last_state reached in adding suffix {}'.format(c) - print 'endpoints: {}'.format(endpoints) if not endpoints: - print 'no end point yet' newnum = self.new_num(connections) - connections[newnum] = [] + connections[newnum] = ([], True) else: - print 'end point found' - newnum = endpoints[0] + newnum = min(endpoints) else: newnum = self.new_num(connections) - connections[newnum] = [] - connections[last_state].append((c, newnum)) + connections[newnum] = ([], False) + connections[last_state][0].append((c, newnum)) suffix.append((last_state, (c, newnum))) last_state = newnum - print 'registering or replacing, add_suffix' prev = None - for i, tra in list(reversed(list(enumerate(suffix)))): + for i, tra in list(reversed(list(enumerate(suffix[1:])))): if prev: - connections[tra[0]] = [(tra[1][0], prev[0])] + tra = (tra[0], (tra[1][0], prev[0])) prev = None - print '\nfrom {}'.format(tra) tra2 = self.replace_or_register(connections, tra) if tra2 != tra: prev = tra2 suffix[i] = tra2 - del(connections[tra[0]]) - print 'to {}'.format(tra2) return suffix def train(self, wlist): -# 0. start - register = set() +# v0. start # connections = {-1: [('a', 2), ('b', 4)], 2: [('b', 3)], 3: [('d', 5)], # 4: [('a', 3)], 5: []} - connections = {-1: []} + connections = {-1: ([], False)} for word in wlist: - print word +# v0. prepare + raw_input('Press enter to train the next word\n\n') + print 'word := {}'.format(word) + # v1. Find the common prefix and suffix and skip if word is in the adfa - print '\tI\n' common_prefix = self.common_prefix(connections, word) + print 'CommonPrefix := {}'.format(common_prefix) current_suffix = word[len(common_prefix)-1:] - print 'common_prefix: {}'.format(common_prefix) - print 'current_suffix: {}'.format(current_suffix) - if not current_suffix and not connections[common_prefix[-1]]: + print 'CurrentSuffix := {}'.format(current_suffix) + print 'if CurrentSuffix = ...' + if not current_suffix: + if not connections[common_prefix[-1]][1]: + connections[common_prefix[-1]] =\ + (connections[common_prefix[-1]][0], True) + print 'continue' continue # v2. Find the first confluence state in the common prefix path - print '\tII\n' first_state = self.first_cstate(connections, common_prefix) - print 'first_state: {}'.format(first_state) + print 'FirstState := {}'.format(first_state) + print 'if FirstState = ...' if not first_state: - print 'not cloning' last_state = common_prefix[-1] + print 'LastState := {}'.format(last_state) else: - print 'cloning' - last_state = self.clone( + commen_prefix, last_state = self.clone( connections, common_prefix, first_state) - print 'adding suffix: {}'.format(current_suffix) + print 'LastState := {} (cloned new common prefix: {}'.format( + last_state, common_prefix) suffix = self.add_suffix(connections, last_state, current_suffix) - print 'suffix added: {}'.format(suffix) - path = [c[0] for c in common_prefix + suffix - if not isinstance(c, int)] - print 'path: {}'.format(path) + print 'add_suffix({}, {})'.format(last_state, current_suffix) # 3a. If there is a confluence state and there has been cloning, loop through # the new states backwards and replace or register the states - print '\tIII\n' + print 'if FirstState is not ...' if first_state: - continue - print 'consider replace register clone' first_state = self.first_cstate(connections, common_prefix) - print 'first_state {}'.format(first_state) - current_index = common_prefix.index(first_state) - print 'range: ', range(len(common_prefix, current_index, -1)) - for i in range(len(common_prefix, current_index, -1)): - pass + print 'First_state := {}'.format(first_state) + if first_state: + current_index = None + for enum, c in enumerate(common_prefix): + if c[0] == first_state: + current_index = enum + break + print 'CurrentIndex := {}'.format(current_index) +# for ind in range(len(common_prefix)-1, current_index-1, +# -1): print 'index: {} maps to {}'.format(ind, +# common_prefix[ind]) +# self.clone(connections, common_prefix[:ind], +# first_state) +# break +# #Do stuff +# pass + current_index = len(common_prefix) # v3b. There is no confluence state so the current index will be the last of # the common prefix else: current_index = len(common_prefix) + print 'CurrentIndex := {}'.format(current_index) # 4. While we change things try to find equivalent states in the suffix to # replace or register - print '\tIV\n' + print 'part IV: pref: {}, suf: {}'.format(common_prefix, + suffix) changed = True while changed: + break print 'current_index: {}'.format(current_index) print 'last_state: {}'.format(last_state) current_index -= 1 current_state = None - current_state = path[current_index] +# current_state = path[current_index] print 'current state: {}'.format(current_state) old_state = last_state if current_index > 0: print 'replace or register changing state' - register = register - {last_state} +# register = register - {last_state} # self.replace_or_register(connections, current_state) changed = old_state != last_state # v5. If there has been no change and there are characters left - print '\tV\n' if not changed and current_index > 0: - register = register.union({current_state}) - import pdb - pdb.set_trace() + pass +# register = register.union({current_state}) self.graphviz(connections, 'f.dot') @@ -303,10 +278,10 @@ class Afda(): fout = sys.stdout if fp == '-' else open(fp, 'w') fout.write('digraph finite_state_machine {\n') fout.write('\tnode [shape = doublecircle]; {}\n'.format( - ' '.join(str(c) for c in connections if not connections[c]))) + ' '.join(str(c) for c in connections if connections[c][1]))) fout.write('\tnode [shape = circle]; {}\n'.format( - ' '.join(str(c) for c in connections if connections[c]))) - for node, targets in connections.iteritems(): + ' '.join(str(c) for c in connections if not connections[c][1]))) + for node, (targets, _) in connections.iteritems(): for target in targets: fout.write('\t{} -> {} [label = "{}"];\n'.format( node, target[1], target[0])) @@ -317,4 +292,8 @@ class Afda(): if __name__ == '__main__': afda = Afda() - afda.train(['abd', 'bad', 'bae']) +# afda.train(['abcde', 'fghde', 'fghdghde']) + afda.train([['wie', '_', 'h', '_', 'wat'], + ['wie', '_', 'h', '_', 'wat', '_', 'h', '_', 'wanneer'], + ['wie', '_', 'h', '_', 'wat']]) +# afda.train(['abd', 'ab', 'bad', 'bae']) diff --git a/program/regexex/f.dot b/program/regexex/f.dot index 4411aa4..1104fab 100644 --- a/program/regexex/f.dot +++ b/program/regexex/f.dot @@ -1,11 +1,13 @@ digraph finite_state_machine { - node [shape = doublecircle]; 2 - node [shape = circle]; 0 1 3 4 -1 - 0 -> 1 [label = "b"]; - 1 -> 2 [label = "d"]; - 3 -> 4 [label = "a"]; - 4 -> 2 [label = "d"]; - 4 -> 2 [label = "e"]; - -1 -> 0 [label = "a"]; - -1 -> 3 [label = "b"]; + node [shape = doublecircle]; 4 + node [shape = circle]; 0 1 2 3 5 6 7 -1 + 0 -> 1 [label = "_"]; + 1 -> 2 [label = "h"]; + 2 -> 3 [label = "_"]; + 3 -> 4 [label = "wat"]; + 4 -> 5 [label = "_"]; + 5 -> 6 [label = "h"]; + 6 -> 7 [label = "_"]; + 7 -> 4 [label = "wanneer"]; + -1 -> 0 [label = "wie"]; } \ No newline at end of file -- 2.20.1