From 7774d8f6027d0d71312255de0b53ca5f5ec13196 Mon Sep 17 00:00:00 2001 From: Mart Lubbers Date: Thu, 7 Aug 2014 13:24:36 +0200 Subject: [PATCH] laptop --- program/regexex/dawg.py | 71 +++++++++ program/regexex/pyDAWG | 1 + program/regexex/pydawg.py | 289 +++++++++++++++++++++++++++++++++++++ program/regexex/pydawg.pyc | Bin 0 -> 8257 bytes 4 files changed, 361 insertions(+) create mode 100644 program/regexex/dawg.py create mode 160000 program/regexex/pyDAWG create mode 100644 program/regexex/pydawg.py create mode 100644 program/regexex/pydawg.pyc diff --git a/program/regexex/dawg.py b/program/regexex/dawg.py new file mode 100644 index 0000000..be0757b --- /dev/null +++ b/program/regexex/dawg.py @@ -0,0 +1,71 @@ +#!/bin/env python +# -*- coding: utf-8 -*- + +import pydawg + + +def to_dot(filepath, q0): + nodenum = 0 + final_nodes = [] + nodes = [] + edges = [] + to_visit = [(0, q0)] + visited = set() + translation = [] + if q0.final: + final_nodes.append(nodenum) + else: + nodes.append(nodenum) + + nodenum += 1 + while to_visit: + current = to_visit.pop() + if not current[0] in visited: + visited.add(current[0]) + for char, child in current[1].children.iteritems(): + matches = [c for c in translation if c[0] == child] + curnum = -1 + if matches: + curnum = matches[-1][1] + else: + translation.append((child, nodenum)) + curnum = nodenum + nodenum += 1 + if child.final: + final_nodes.append(curnum) + else: + nodes.append(curnum) + edges.append((current[0], char, curnum)) + to_visit.append((curnum, child)) + print 'digraph dawg {' + print '\tnode [shape = doublecircle]; {}'.format( + ' '.join(str(n) for n in final_nodes)) + print '\tnode [shape = circle]; {}'.format( + ' '.join(str(n) for n in nodes)) + for fr, ch, to in edges: + print '\t{} -> {} [label = "{}"];'.format(fr, to, ch) + print '}' + + +d = pydawg.DAWG() + +regs = [ + 'wdag dag maand jaar tijd - wat', + 'dag maand jaar tijd - wat', + 'wdag dag maand jaar tijd - wat', + 'wdag dag maand jaar tijd - wat - Locatie: waar', + 'wdag dag maand jaar tijd - wat - Locatie: waar'] + +#regs = [ +# 'maandag 11 augustus 2014 19:30 - Neutral Milk Hotel', +# 'dinsdag 19 augustus 2014 22:00 - Arkells', +# 'maandag 24 november 2014 20:30 - Fink', +# 'woensdag 19 november 2014 20:00 - Michael Schulte', +# 'zondag 26 oktober 2014 21:00 - The Majority Says - Locatie: Bitterzoet', +# 'maandag 15 september 2014 20:30 - Ani DiFranco', +# 'maandag 13 oktober 2014 20:30 - Tarrus Riley', +# 'maandag 29 december 2014 20:30 - Alain Clark - Locatie: De Duif'] +for w in sorted(set(regs)): + d.add_word(w) + +to_dot('t.dot', d.q0) diff --git a/program/regexex/pyDAWG b/program/regexex/pyDAWG new file mode 160000 index 0000000..0a2eaed --- /dev/null +++ b/program/regexex/pyDAWG @@ -0,0 +1 @@ +Subproject commit 0a2eaed05a5724fb5574e11bc8e9a68ea103faba diff --git a/program/regexex/pydawg.py b/program/regexex/pydawg.py new file mode 100644 index 0000000..18475c1 --- /dev/null +++ b/program/regexex/pydawg.py @@ -0,0 +1,289 @@ +# -*- coding: utf-8 -*- +""" + This is part of pydawg Python module. + + Pure python implementation. + + Author : Wojciech Muła, wojciech_mula@poczta.onet.pl + WWW : http://0x80.pl/proj/pydawg/ + License : Public domain + Date : $Date$ + + $Id$ +""" + + +class DAWGNode: + __slots__ = ["children", "final", "number"] + + def __init__(self, char): + self.children = {} + self.final = False + self.number = None + + def get_next(self, char): + try: + return self.children[char] + except KeyError: + return None + + def set_next(self, char, child): + self.children[char] = child + + def has_transition(self, char): + return char in self.children + + def __str__(self): + return "<" + "".join(self.children.keys()) + ">" + + +def equivalence(p, q): + "check if states p and q are equivalent" + + if p.final != q.final: + return False + + if len(p.children) != len(q.children): + return False + + s = set(p.children) + if s != set(q.children): + return False + + """ + # exact definition of equivalence + for c in s: + if not equivalence(p.children[c], q.children[c]): + return False + """ + # pratical implementation - constraints make + # this much simpler and faster + for c in s: + if p.children[c] != q.children[c]: + return False + + return True + + +class DAWG: + def __init__(self): + self._numbers_valid = False + self.register = set() + self.q0 = DAWGNode(None); + self.wp = '' + + + def add_word(self, word): + assert word > self.wp + return self.add_word_unchecked(word) + + + def add_word_unchecked(self, word): + # 1. skip existing + i = 0; + s = self.q0 + while i < len(word) and s.has_transition(word[i]): + s = s.get_next(word[i]) + i = i + 1 + + assert s != None + + # 2. minimize + if i < len(self.wp): + self._replace_or_register(s, self.wp[i:]) + + + # 3. add suffix + while i < len(word): + n = DAWGNode(word[i]) + s.set_next(word[i], n) + assert n == s.get_next(word[i]) + s = n + i = i + 1 + + s.final = True + self.wp = word + self._numbers_valid = False + + + def _replace_or_register(self, state, suffix): + stack = [] + while suffix: + letter = suffix[0] + next = state.get_next(letter) + stack.append((state, letter, next)) + + state = next + suffix = suffix[1:] + + while stack: + parent, letter, state = stack.pop() + + found = False + for r in self.register: + if equivalence(state, r): + assert(parent.children[letter] == state) + parent.children[letter] = r + + found = True + break + + if not found: + self.register.add(state) + + + def freeze(self): + self._replace_or_register(self.q0, self.wp) + self._numbers_valid = False + + close = freeze + + + def _num_nodes(self): + def clear_aux(node): + node.number = None + for child in node.children.values(): + clear_aux(child) + + def num_aux(node): + if node.number is None: + n = int(node.final) + for child in node.children.values(): + n += num_aux(child) + + node.number = n + + return node.number + + if not self._numbers_valid: + clear_aux(self.q0) + num_aux(self.q0) + self._numbers_valid = True + + + def word2index(self, word): + self._num_nodes() + + state = self.q0 + index = 0 + for c in word: + try: + next = state.children[c] + except KeyError: + return None + + for C in sorted(state.children): + if C < c: + index += state.children[C].number + else: + break + + state = next + if state.final: + index = index + 1 + #for + + return index + + + def index2word(self, index): + self._num_nodes() + + state = self.q0 + count = index + output_word = "" + while True: + for c in sorted(state.children): + tmp = state.get_next(c) + if tmp.number < count: + count -= tmp.number + else: + output_word += c + state = tmp + if state.final: + count -= 1 + + break + #for + if count <= 0: + break + + return output_word + + + def as_dot(self, file): + nodes = set() + edges = [] + tmp = set() + + def aux(node): + nodes.add((id(node), node.final)) + tmp.add(node) + + for letter, child in node.children.items(): + aux(child) + + aux(self.q0) + + for node in tmp: + for letter, child in node.children.items(): + edges.append((id(node), letter, id(child))) + + import dump2dot + dump2dot.dumpdata2dot(nodes, edges, file) + + + def words(self): + L = [] + def aux(node, word): + if node.final: + L.append(word) + + for letter, child in node.children.items(): + aux(child, word + letter) + + aux(self.q0, '') + return L + + + def __iter__(self): + return iter(self.words()) + + +import os + +def main(): + words = "aimaient aimais aimait aime aiment".split() + words = "cat rat attribute tribute".split() + + def dump(name): + with open(name, 'wt') as f: + D.as_dot(f) + + + D = DAWG() + for word in sorted(words): + print(word) + D.add_word(word) + + D.freeze() + + # MPH test + for word in words: + print(word, "=>", D.word2index(word)) + + for index in range(1, len(words) + 1): + print(index, "=>", D.index2word(index)) + + + if 1: + # show image of graph + name = "dawg.dot" + dump(name) + os.system("dotty %s" % name) + + print(D.words(), set(D.words()) == set(words)) + + +if __name__ == '__main__': + main() diff --git a/program/regexex/pydawg.pyc b/program/regexex/pydawg.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20959026e322235557296546bd05380cffbb65c0 GIT binary patch literal 8257 zcmb_hOLH968NIjXABPVLacH`z}P|6jwCgedUbMrlo0Hlb%eHRgdaKe&ze9A9pGCd`}gfC&qu zzMKh*CY&(Myt$3ONsTL*2#qDP<(c?H6BW3OZDZvLb8BIWyZseKx%BQvBdOr8TZ_|5 zXRXrR4r}+;D>t^&jZVAL>V*Afv{){eZuH{_ooa2gy3ME+wbNSK=(O2;xsQ&xf}fWw z)y`(U5!E*;@AUulN$rKoz3e?`^_#WVx}Ew*X>GC7j?%?$vs|iHtDJ_Pmn$1-+I@Lx z>BYf?7ty!WjXRr5?mSE7(%X%C)J`G|ywShgY}6}Zr&VjT%cbR7%64_l^JN}KntwB# zFQ=0ra`|%g+Da#k9%1m@VsZt+f(L+@WCB=(=t*P{G}?_c2#%nhkTPRS_#IKkV~R5F zwtbKUZGswW=5AHnGbRrjx5_4d##n1f-TS*pBms^@?(|l)eKn3dvCkz%m(TtD3BKal zA9~iKG-yYI^l>bha9d+@_Mn}%b)IJUh@!)2xbKZAy147;6pW6(ARyG_f#Sx6g2j$TH=ICQsbEB37XNHJ$6w5`(ozBo?G6uofvN~4@5cQ| zP!fE)DtjsumG!Y0_8@`u0Ja>Rt{!pGWEO?V&Dgy6G@gU@n9Y}`?Gtv=)5++Y>Kwu! zV*7PGNe@Ld^h=+j0-s>zcHof|6wuK-9|e!(d{mJhAMPmgRFgwbRie-5A_yp4Y=G#Y zq?Rd08SI*-6XWsgJOiJRCHW3spn7ysg2V$?&r$55RFz$Dht7Lpe>yErU-1v3l&1K3 zu0rz=I9Hei5Md+qX?NvV;BFd88pVRKY(C&;Oz%bgyVu=kwU$C&#Ri$tFWD)W{gE3R zQGXgk#&$Gsl4C^dC(g1;KcFZ>A5F;l&!7;cmy;xlX#?fP(iybZ!Z5hkiNm1Z7Q&-& z?AQbccJ2=rCwFc4HtGo_U;sbSNT15*3Z7&h@`m)N%<|UI*TEZ7Y>;Vb3AT)hs|HF7 z^Gt4dW)7`c6SRBOL{$e?J#!qZ^=3>JbASh^da}vLg^Yv2K#84-!=-0Rr!@2m!=G~`5ES&s%c#OR4()h5ygbgZlN^n3c z3DoHnPZUf$P*c7u2-gs^QAHMrXP*f1kF$_bmyIWD{dpEb!%@_NIO;ZQ^(g4XsP4wP z9Ksw26Lr!Qzk-?z`DEf~5@o0w{)u6=y)QWN&tScqcoNcKydr|R1$)Fk=FQ}e*(dVH z>>+#9p0q`@D9|Gh=-LP;q30i{tQ#_O@FqnS-xLH8@$ERj00n|XfUklC0@>paT?Mly zCt^|C5iWq6lf+HQR0lud;P~&FqL|=L&g>Ko*2Gv?`ke_gxFNXY;bg9wokbpQZx)L4Yu#?t4&{itovxGRDRlY_b>x>EEGTY`f<;&$I~A}fT9n7rVo85(tuYuj z*0z<12UA)SQIL)RhOWnFNEOpkf^ z2uxzwa3keabh+p!YpfZZ${5B#49V>3A!-0Oaf|lmGGvl9D~fXDK9vu;P>58*-S(qo zoFBH45c~A_a;%D0>&>VZ2etly+&6m&g#4E<|4XjHotYtKEF~+6Tv7P}Dh?K)b!*bZ z?<(R1I9zNA(x26UyxG+C#q#Q`oRp&;2(sopSkYUA-ipnRvu@3yPv!|6D`=FZbB%WD zOs&9l9?dRmyTGB(vl#P*(LW-Vec{S0SnKC_6rdX@4b;dU^`<_Ip#Pa#c&K=K7wqYJ`;3J1$2q`pXgxf@9*_pac9?HmdV7_?-u0tSW+F#KN- zjLWN^(L8S-ru}E20Lu%T@6!I9(*CzbY5(9AnQIM%9G6$0r*(%}#wiDLmo&*X0MHZ)7Ncb|n0tP8fI&m6> z4lU%s0$CtyPR|_0?%oWUt!^-g2LD`tZfmp z$=Smw3*Iau*5h6Q7Pn*%yBv?lrHV;nzeB~T!XpxcpukAVEpG;&Q{&>_m_d`8L4L|p zFRy+eZ;f}jROlW?OE~nWkR;^OlzikxFHkTSGM9dyP9JW1B%PmT(%~qWy&D**pb*O- zf;^3e(*}Q4$UV#Ul{F~9YJh|+e;CV+%HhAtLK2k!CDBspT@t&*wvoi@@QbMuv`#0yG7>LH z?{ljN3xTtH_2`P=etzsRgc>=dhfd(mEg(30$n^WV6!d$P0(5fdLl(^)Pq7r_E2#t4 z7y&^;4y;S)S$x*3nriPdyo_!pn~<&p!-S!CQ%3M0XLOzC!lODRLz(cAt*@ej!3Y6d z?9Yu91+2-8TO8Ge2k5Yw*~fu_MtZziofaVU%0@K$Z=%EDo7C!9 z{&m(!KfThT}6~ zou~phK(C7uf$PkgOGy|&kWv9_1r;|qo+CKmma@1A#%8dXp==2IrvxM*KPTu8bHn3| zMSIUZf+z0cFeHJ<)=)k!z!HDYa>$6uF0|sfx5Z%bdk?(TBLBj2qZ^7t0FS#;(P`+# z{%@2(o@n+Sb{Cdt!jgZEk5IHT6Nkpaq=pGVymf?Wq8wm+bSE?nzc@w!ZSZdA`*{2M zkklC?xqVvQ-m9j{ z2H9LYI74wRuRfIx#vLO&|OYH1ob?)LEsG5cSqdJ#oMJhC#+q2d4z)`Pe& zk7ha8*xc}T;HLYU#i=Xq+A