From ab4c744faaa50417a221c7ebe4ff6cf81e2e19b2 Mon Sep 17 00:00:00 2001 From: Mart Lubbers Date: Tue, 15 Jul 2014 17:03:39 +0200 Subject: [PATCH] dafsa first rty --- program/regexex/dafsa.py | 185 +++++++++++++++++++++++++++++++++++++++ program/regexex/f.dot | 11 +++ 2 files changed, 196 insertions(+) create mode 100644 program/regexex/dafsa.py create mode 100644 program/regexex/f.dot diff --git a/program/regexex/dafsa.py b/program/regexex/dafsa.py new file mode 100644 index 0000000..f855c7a --- /dev/null +++ b/program/regexex/dafsa.py @@ -0,0 +1,185 @@ +#!/bin/env python +# -*- coding: utf-8 -*- + +import sys + + +class Afda(): + def __init__(self): + pass + + def common_prefix_p(self, cons, word, res, c_node, c_index, cur): + if c_index >= len(word): + childs = [] + else: + childs = [c for c in cons[c_node] if c[0] == word[c_index]] + if not childs or c_index >= len(word): + res.append(cur + [c_node]) + else: + for child in childs: + self.common_prefix_p(cons, word, res, child[1], c_index+1, + cur+[(c_node, child)]) + + def common_prefix(self, cons, word): + results = [] + self.common_prefix_p(cons, word, results, -1, 0, []) + return max(results, key=lambda x: len(x)) + + def first_cstate(self, connections, common_prefix): + print common_prefix + for node in common_prefix: + print node + if not isinstance(node, int): + node = node[1][1] + num = 0 + for c, n in connections.iteritems(): + if node in [m[1] for m in n]: + num += 1 + if num > 1: + return node + + def find_value(self, connections, from_node, to_node): + return [c for c in connections[from_node] if c[1] == to_node][0][0] + + def clone_p(self, connections, point): + """clone everything from this point on""" + # Translation dictionary for the new points + newnames = {point[0]: point[0], point[1]: self.new_num(connections)} + # Fix point all the endpoints(should be only one + for c, n in connections.iteritems(): + if len(n) == 0: + newnames[c] = c + # Make the first connection + connections[point[0]].append( + (self.find_value(connections, point[0], point[1]), + newnames[point[1]])) + # Add the first to-clone point to the frontier and give it a new node + toclone = [point[1]] + connections[newnames[point[1]]] = [] + # While there are still nodes to be cloned, add them and visit their + # children + while toclone: + to = toclone.pop() + for child in connections[to]: + if child[1] not in newnames: + # If the node is an endpoint, dont create new node + if not connections[child[1]]: + newnames[child[1]] = newnames[child[1]] + # If the node is not an endpoint create a new node and add + # it to the translation dictionary + else: + newnames[child[1]] = self.new_num(connections) + connections[newnames[child[1]]] = [] + toclone.append(child[1]) + # Make the connection + connections[newnames[to]].append( + (child[0], newnames[child[1]])) + return newnames[point[1]] + + def clone(self, connections, states, from_node): + # Remember the last node number + last_node = states[0] if isinstance(states[0], int) else states[0][0] + # While the node is not found + for c in states: + # Derive the node number + node = c if isinstance(c, int) else c[0] + # If the node is the one we search for, search for its parent and + # start cloning + if node == from_node: + for i, rem in enumerate(connections[last_node]): + if rem[1] == node: + result = self.clone_p(connections, (last_node, rem[1])) + del(connections[last_node][i]) + return result + last_node = node + + def new_num(self, connections): + return max(connections.keys())+1 + + def add_suffix(self, connections, last_state, current_suffix, first=False): + if first: + this = current_suffix + that = None + else: + this = current_suffix[:-1] + that = current_suffix[-1] + for c in this: + newnum = self.new_num(connections) + connections[last_state].append((c, newnum)) + connections[newnum] = [] + last_state = newnum + if that: + endstate = [i for i in connections.iteritems() if not i[1]][0] + connections[last_state].append((that, endstate[0])) + + def train(self, wlist): +# register = None +# connections = {-1: [('a', 2), ('b', 4)], 2: [('b', 3)], 3: [('d', 5)], +# 4: [('a', 3)], 5: []} + connections = {-1: []} + first = True + for word in wlist: + if first: + self.add_suffix(connections, -1, word, True) + first = False + print word +# Find the common prefix and suffix and skip if word is in the adfa + common_prefix = self.common_prefix(connections, word) + current_suffix = word[len(common_prefix)-1:] + print 'common_prefix: {}'.format(common_prefix) + print 'current_suffix: {}'.format(current_suffix) + if not current_suffix: + continue + +# Find the first confluence state in the common prefix path + first_state = self.first_cstate(connections, common_prefix) + print 'first_state: {}'.format(first_state) + if first_state: + last_state = self.clone( + connections, common_prefix, first_state) + else: + last_state = common_prefix[-1] + print 'last_state: {}'.format(last_state) + self.add_suffix(connections, last_state, current_suffix) + +# If there is a confluence state and there has been cloning, loop through the +# new states backwards and replace or register the states + if first_state: + print 'consider replace register clone' + first_state = self.first_cstate(connections, common_prefix) + print 'first_state {}'.format(first_state) + current_index = common_prefix.index(first_state) + print range(len(common_prefix, current_index, -1)) + for i in range(len(common_prefix, current_index, -1)): + pass +# There is no confluence state so the current index will be the last of the +# common prefix + else: + current_index = len(common_prefix) + +# While we change things try to find equivalent states in the suffix to +# replace or register + #changed = True + raw_input('continue...\n') + + self.graphviz(connections, 'f.dot') + + def graphviz(self, connections, fp='-'): + fout = sys.stdout if fp == '-' else open(fp, 'w') + fout.write('digraph finite_state_machine {\n') + fout.write('\tnode [shape = doublecircle]; {}\n'.format( + ' '.join(str(c) for c in connections if not connections[c]))) + fout.write('\tnode [shape = circle]; {}\n'.format( + ' '.join(str(c) for c in connections if connections[c]))) + for node, targets in connections.iteritems(): + for target in targets: + fout.write('\t{} -> {} [label = "{}"];\n'.format( + node, target[1], target[0])) + fout.write('}') + if fp != '-': + fout.close() + + +if __name__ == '__main__': + afda = Afda() + afda.train(['abd', 'bad', 'bae']) diff --git a/program/regexex/f.dot b/program/regexex/f.dot new file mode 100644 index 0000000..4411aa4 --- /dev/null +++ b/program/regexex/f.dot @@ -0,0 +1,11 @@ +digraph finite_state_machine { + node [shape = doublecircle]; 2 + node [shape = circle]; 0 1 3 4 -1 + 0 -> 1 [label = "b"]; + 1 -> 2 [label = "d"]; + 3 -> 4 [label = "a"]; + 4 -> 2 [label = "d"]; + 4 -> 2 [label = "e"]; + -1 -> 0 [label = "a"]; + -1 -> 3 [label = "b"]; +} \ No newline at end of file -- 2.20.1