import ast
import logging
import re
+import pydawg
def structure_data(d):
def parse_line(line):
- re_spa = re.compile('(?P<b><span.*?background-color:(?P<c>.*?);.*?>)(?P<co'
- 'ntent>.*?)(?P<e></span>)')
+ re_spa = re.compile('(?P<b><span.*?background-color:\s*(?P<c>.*?);.*?>)(?P'
+ '<content>.*?)(?P<e></span>)')
results = []
for column in line:
results.append([])
def create_nodes(d):
    """Collapse highlighted spans in titles/summaries into marker characters.

    ``d['content']`` is read as a sequence of ``(title, summary)`` string
    pairs whose first row is discarded, and ``d['matchdata']`` as a parallel
    sequence of ``(title_matches, summary_matches)`` pairs.  The first item
    of a non-empty ``*_matches`` entry must be an iterable of regex match
    objects exposing the groups ``b`` (opening tag), ``c`` (colour),
    ``content`` and ``e`` (closing tag).

    Every matched ``<span>`` is replaced by the single control character
    that ``color_dict`` assigns to its colour.

    Side effect: ``d['content']`` is rebound to itself minus its first row.

    Returns a dict ``{'Title': [...], 'Summary': [...]}`` with the rewritten
    strings.  Raises ``KeyError`` for a colour missing from ``color_dict``.
    """
    color_dict = {
        'rgb(139, 0, 0)': '\x01',  # date ('datum')
        'red': '\x02',  # time ('tijd')
        'green': '\x03',  # what ('wat')
        'blue': '\x04'  # when ('wanneer')
    }

    def _collapse_spans(text, matches):
        """Return ``text`` with each matched span replaced by its marker."""
        # Work from the rightmost match to the leftmost one so the offsets
        # of the matches still to be processed stay valid while we cut.
        ordered = sorted(matches, key=lambda x: x.end('e'), reverse=True)
        for match in ordered:
            marker = color_dict[match.group('c').strip()]
            # Within one match, cut back-to-front as well: closing tag,
            # then the content (swapped for the marker), then opening tag.
            text = text[:match.start('e')] + text[match.end('e'):]
            text = (text[:match.start('content')] + marker +
                    text[match.end('content'):])
            text = text[:match.start('b')] + text[match.end('b'):]
        return text

    d['content'] = d['content'][1:]
    # any() rather than filter(): on Python 3 a filter object is always
    # truthy, which would keep rows that have no matches at all.
    line_w_match = [(d['content'][i], m)
                    for i, m in enumerate(d['matchdata']) if any(m)]

    nodelists = {'Title': [], 'Summary': []}
    for (title_l, summary_l), (title_m, summary_m) in line_w_match:
        # NOTE(review): the indentation of this function was lost in the
        # source this was recovered from; this assumes a string is only
        # collected when its column has match data -- confirm.
        if title_m:
            nodelists['Title'].append(_collapse_spans(title_l, title_m[0]))
        if summary_m:
            nodelists['Summary'].append(
                _collapse_spans(summary_l, summary_m[0]))
    return nodelists
+
+
def to_dot(q0):
    """Print a Graphviz DOT description of the automaton rooted at ``q0``.

    ``q0`` and every node reachable from it must expose ``final`` (truthy
    for accepting states) and ``children`` (a mapping from edge label to
    child node).  Nodes are numbered in discovery order starting with 0 for
    ``q0``; accepting states are drawn as double circles.

    The text is written to standard output; nothing is returned.
    """
    final_nodes = []
    nodes = []
    edges = []
    (final_nodes if q0.final else nodes).append(0)

    # id(node) -> assigned number.  Keyed by identity so shared sub-graphs
    # get a single number without scanning a list for every edge.
    numbers = {}
    next_number = 1
    visited = set()
    to_visit = [(0, q0)]
    while to_visit:
        number, node = to_visit.pop()
        if number in visited:
            continue
        visited.add(number)
        for char, child in node.children.items():
            child_number = numbers.get(id(child))
            if child_number is None:
                child_number = next_number
                numbers[id(child)] = child_number
                next_number += 1
                (final_nodes if child.final else nodes).append(child_number)
            edges.append((number, char, child_number))
            to_visit.append((child_number, child))

    lines = [
        'digraph dawg {',
        '\tnode [shape = doublecircle]; {}'.format(
            ' '.join(str(n) for n in final_nodes)),
        '\tnode [shape = circle]; {}'.format(
            ' '.join(str(n) for n in nodes)),
    ]
    for fr, ch, to in edges:
        # Backslashes and double quotes would otherwise terminate or
        # corrupt the quoted DOT label.
        label = ch.replace('\\', '\\\\').replace('"', '\\"')
        lines.append('\t{} -> {} [label = "{}"];'.format(fr, to, label))
    lines.append('}')
    # A single parenthesised argument prints identically on Python 2 and 3.
    print('\n'.join(lines))
def main():
d['matchdata'] = []
for line in filter(None, d['content']):
d['matchdata'].append(parse_line(line))
- create_nodes(d)
+ nodelists = create_nodes(d)
+ titledawg = pydawg.DAWG()
+ for n in sorted(nodelists['Title']):
+ titledawg.add_word(n)
+ to_dot(titledawg.q0)
if __name__ == '__main__':
- logging.basicConfig(level=logging.DEBUG)
+ logging.basicConfig(level=logging.WARNING)
main()
--- /dev/null
+#!/bin/env python
+# -*- coding: utf-8 -*-
+
+import pydawg
+
+
def to_dot(filepath, q0):
    """Write a Graphviz DOT description of the automaton rooted at ``q0``.

    ``filepath`` is the path of the file to (over)write.  It used to be
    accepted but ignored, with the text always going to standard output;
    passing ``None`` still prints to standard output.

    ``q0`` and every node reachable from it must expose ``final`` (truthy
    for accepting states) and ``children`` (a mapping from edge label to
    child node).  Nodes are numbered in discovery order starting with 0 for
    ``q0``; accepting states are drawn as double circles.

    Nothing is returned.
    """
    final_nodes = []
    nodes = []
    edges = []
    (final_nodes if q0.final else nodes).append(0)

    # id(node) -> assigned number.  Keyed by identity so shared sub-graphs
    # get a single number without scanning a list for every edge.
    numbers = {}
    next_number = 1
    visited = set()
    to_visit = [(0, q0)]
    while to_visit:
        number, node = to_visit.pop()
        if number in visited:
            continue
        visited.add(number)
        for char, child in node.children.items():
            child_number = numbers.get(id(child))
            if child_number is None:
                child_number = next_number
                numbers[id(child)] = child_number
                next_number += 1
                (final_nodes if child.final else nodes).append(child_number)
            edges.append((number, char, child_number))
            to_visit.append((child_number, child))

    lines = [
        'digraph dawg {',
        '\tnode [shape = doublecircle]; {}'.format(
            ' '.join(str(n) for n in final_nodes)),
        '\tnode [shape = circle]; {}'.format(
            ' '.join(str(n) for n in nodes)),
    ]
    for fr, ch, to in edges:
        # Backslashes and double quotes would otherwise terminate or
        # corrupt the quoted DOT label.
        label = ch.replace('\\', '\\\\').replace('"', '\\"')
        lines.append('\t{} -> {} [label = "{}"];'.format(fr, to, label))
    lines.append('}')
    text = '\n'.join(lines)

    if filepath is None:
        # A single parenthesised argument prints identically on 2 and 3.
        print(text)
        return
    with open(filepath, 'w') as handle:
        handle.write(text + '\n')
+
+
# Title templates in which every recognised field of a concrete title has
# been replaced by a placeholder word.  Concrete titles of the kind these
# templates stand for look like:
#
#     maandag 11 augustus 2014 19:30 - Neutral Milk Hotel
#     dinsdag 19 augustus 2014 22:00 - Arkells
#     maandag 24 november 2014 20:30 - Fink
#     woensdag 19 november 2014 20:00 - Michael Schulte
#     zondag 26 oktober 2014 21:00 - The Majority Says - Locatie: Bitterzoet
#     maandag 15 september 2014 20:30 - Ani DiFranco
#     maandag 13 oktober 2014 20:30 - Tarrus Riley
#     maandag 29 december 2014 20:30 - Alain Clark - Locatie: De Duif
regs = [
    'wdag dag maand jaar tijd - wat',
    'dag maand jaar tijd - wat',
    'wdag dag maand jaar tijd - wat',
    'wdag dag maand jaar tijd - wat - Locatie: waar',
    'wdag dag maand jaar tijd - wat - Locatie: waar']

d = pydawg.DAWG()

# DAWG.add_word() insists on strictly increasing input, hence the
# de-duplication and sorting before insertion.
for w in sorted(set(regs)):
    d.add_word(w)

to_dot('t.dot', d.q0)
--- /dev/null
+# -*- coding: utf-8 -*-
+"""
+ This is part of pydawg Python module.
+
+ Pure python implementation.
+
+ Author : Wojciech Muła, wojciech_mula@poczta.onet.pl
+ WWW : http://0x80.pl/proj/pydawg/
+ License : Public domain
+ Date : $Date$
+
+ $Id$
+"""
+
+
class DAWGNode(object):
    """A single state of the automaton.

    Attributes:
        children -- dict mapping a transition label to the child state
        final    -- True when a word ends in this state
        number   -- scratch value used by DAWG for word numbering
                    (None until computed)
    """

    # Deriving from ``object`` makes this a new-style class on Python 2 as
    # well; ``__slots__`` is ignored on old-style classes.
    __slots__ = ["children", "final", "number"]

    def __init__(self, char=None):
        # ``char`` is accepted for call compatibility only; the label of
        # the incoming transition is stored in the parent's ``children``.
        self.children = {}
        self.final = False
        self.number = None

    def get_next(self, char):
        """Return the child reached via ``char``, or None if there is none."""
        return self.children.get(char)

    def set_next(self, char, child):
        """Add or replace the transition labelled ``char``."""
        self.children[char] = child

    def has_transition(self, char):
        """Return True if a transition labelled ``char`` exists."""
        return char in self.children

    def __str__(self):
        return "<" + "".join(self.children) + ">"
+
+
def equivalence(p, q):
    """Check if states p and q are equivalent.

    Two states are treated as equivalent when their ``final`` flags agree,
    they have transitions for exactly the same labels, and each label
    leads to children that compare equal.
    """
    if p.final != q.final:
        return False

    p_next, q_next = p.children, q.children
    if len(p_next) != len(q_next):
        return False

    labels = set(p_next)
    if labels != set(q_next):
        return False

    # The exact definition of equivalence would recurse:
    #     equivalence(p.children[c], q.children[c]) for every label c
    # Practical implementation - constraints make a direct comparison of
    # the children much simpler and faster.
    return not any(p_next[c] != q_next[c] for c in labels)
+
+
class DAWG(object):
    """Directed acyclic word graph built incrementally from sorted words.

    Words are inserted with add_word() in strictly increasing order; the
    part of the previous word that the new word does not share is
    minimised on every insertion, and freeze() minimises the last word.

    Attributes:
        q0       -- the initial state
        register -- set of states kept as unique representatives
        wp       -- the word added most recently ('' initially)
    """

    def __init__(self):
        self._numbers_valid = False
        self.register = set()
        self.q0 = DAWGNode(None)
        self.wp = ''

    def add_word(self, word):
        """Add ``word``, which must sort after the previously added word.

        Raises AssertionError otherwise.
        NOTE(review): the check is an ``assert`` and therefore disappears
        under ``python -O``.
        """
        assert word > self.wp
        return self.add_word_unchecked(word)

    def add_word_unchecked(self, word):
        """Add ``word`` without checking the ordering requirement."""
        # 1. skip existing
        i = 0
        s = self.q0
        while i < len(word) and s.has_transition(word[i]):
            s = s.get_next(word[i])
            i = i + 1

        assert s is not None

        # 2. minimize
        if i < len(self.wp):
            self._replace_or_register(s, self.wp[i:])

        # 3. add suffix
        while i < len(word):
            n = DAWGNode(word[i])
            s.set_next(word[i], n)
            assert n == s.get_next(word[i])
            s = n
            i = i + 1

        s.final = True
        self.wp = word
        self._numbers_valid = False

    def _replace_or_register(self, state, suffix):
        """Minimise the chain of states spelling ``suffix`` below ``state``.

        Walking the chain bottom-up, each state is either redirected to an
        equivalent registered state or added to the register itself.
        """
        stack = []
        for letter in suffix:
            child = state.get_next(letter)
            stack.append((state, letter, child))
            state = child

        while stack:
            parent, letter, state = stack.pop()
            for r in self.register:
                if equivalence(state, r):
                    assert parent.children[letter] == state
                    parent.children[letter] = r
                    break
            else:
                # no equivalent state registered yet
                self.register.add(state)

    def freeze(self):
        """Minimise the states of the most recently added word."""
        self._replace_or_register(self.q0, self.wp)
        self._numbers_valid = False

    close = freeze

    def _num_nodes(self):
        """Store in every state the number of words reachable from it.

        The result is cached until the graph is modified again.
        """
        def clear_aux(node):
            node.number = None
            for child in node.children.values():
                clear_aux(child)

        def num_aux(node):
            if node.number is None:
                n = int(node.final)
                for child in node.children.values():
                    n += num_aux(child)

                node.number = n

            return node.number

        if not self._numbers_valid:
            clear_aux(self.q0)
            num_aux(self.q0)
            self._numbers_valid = True

    def word2index(self, word):
        """Return the 1-based rank of ``word`` among the stored words.

        Returns None when ``word`` is not stored -- including when it is
        merely a prefix of stored words, which used to yield a meaningless
        index.
        """
        self._num_nodes()

        state = self.q0
        index = 0
        for c in word:
            try:
                child = state.children[c]
            except KeyError:
                return None

            # every word below a smaller sibling label sorts before ``word``
            index += sum(node.number
                         for label, node in state.children.items()
                         if label < c)

            state = child
            if state.final:
                index = index + 1

        if not state.final:
            return None

        return index

    def index2word(self, index):
        """Return the stored word whose 1-based rank is ``index``.

        Returns None when ``index`` is outside ``1 .. number of words``;
        such values used to produce garbage or never terminate.
        """
        self._num_nodes()

        if not 1 <= index <= self.q0.number:
            return None

        state = self.q0
        count = index
        output_word = ""
        while count > 0:
            for c in sorted(state.children):
                tmp = state.get_next(c)
                if tmp.number < count:
                    # the wanted word is not below this child
                    count -= tmp.number
                else:
                    output_word += c
                    state = tmp
                    if state.final:
                        count -= 1

                    break
            else:
                # no child covers the remaining count
                return None

        return output_word

    def as_dot(self, file):
        """Dump the graph to ``file`` via ``dump2dot.dumpdata2dot``."""
        nodes = set()
        edges = []
        tmp = set()

        def aux(node):
            if node in tmp:
                # shared state, already collected
                return

            nodes.add((id(node), node.final))
            tmp.add(node)

            for child in node.children.values():
                aux(child)

        aux(self.q0)

        for node in tmp:
            for letter, child in node.children.items():
                edges.append((id(node), letter, id(child)))

        import dump2dot
        dump2dot.dumpdata2dot(nodes, edges, file)

    def words(self):
        """Return a list of all stored words."""
        L = []

        def aux(node, word):
            if node.final:
                L.append(word)

            for letter, child in node.children.items():
                aux(child, word + letter)

        aux(self.q0, '')
        return L

    def __iter__(self):
        return iter(self.words())
+
+
+import os
+
def main():
    """Demo: build a small DAWG, exercise the word numbering, show the graph."""
    # Alternative sample set: "aimaient aimais aimait aime aiment"
    words = "cat rat attribute tribute".split()

    D = DAWG()
    for word in sorted(words):
        print(word)
        D.add_word(word)

    D.freeze()

    # MPH test
    for word in words:
        print(word, "=>", D.word2index(word))

    for index in range(1, len(words) + 1):
        print(index, "=>", D.index2word(index))

    # show image of graph
    name = "dawg.dot"
    with open(name, 'wt') as f:
        D.as_dot(f)
    os.system("dotty %s" % name)

    print(D.words(), set(D.words()) == set(words))


if __name__ == '__main__':
    main()
return false
}
-function init(a, w) {
- console.log(a)
- var b = document.createElement("DIV");
- b.id = "contextmenu";
- if (!w) var w = 120;
- b.style.width = w + "px";
- var c = '<div style="position:relative;left:5px;top:-4px;">';
- c += a;
- c += '</div>';
- b.innerHTML = c;
- b.style.position = "absolute";
- b.style.left = "0px";
- b.style.top = "0px";
- b.style.visibility = "hidden";
- b.style.overflow = "hidden";
- b.style.padding = "4px";
- b.style.backgroundColor = "#ffffff";
- b.style.border = "1px solid #6a6868";
- document.body.appendChild(b);
- delete b
-}
-
function mouseUp(e) {
var curselection = window.getSelection().getRangeAt(0);
if (curselection.endOffset - curselection.startOffset > 0)