From: Mart Lubbers <mart@martlubbers.net>
Date: Thu, 7 Aug 2014 20:04:08 +0000 (+0200)
Subject: another update
X-Git-Url: https://git.martlubbers.net/?a=commitdiff_plain;h=acb285acb6a0d89392ce5813737b2e5b4d15fb0e;p=bsc-thesis1415.git

another update
---

diff --git a/program/everything/data_processing.py b/program/everything/data_processing.py
index 08d7d0f..fffcc98 100644
--- a/program/everything/data_processing.py
+++ b/program/everything/data_processing.py
@@ -4,6 +4,7 @@
 import ast
 import logging
 import re
+import pydawg
 
 
 def structure_data(d):
@@ -27,8 +28,8 @@ def structure_data(d):
 
 
 def parse_line(line):
-    re_spa = re.compile('(?P<b><span.*?background-color:(?P<c>.*?);.*?>)(?P<co'
-                        'ntent>.*?)(?P<e></span>)')
+    re_spa = re.compile('(?P<b><span.*?background-color:\s*(?P<c>.*?);.*?>)(?P'
+                        '<content>.*?)(?P<e></span>)')
     results = []
     for column in line:
         results.append([])
@@ -39,7 +40,86 @@ def parse_line(line):
 
 
 def create_nodes(d):
-    print d
+    color_dict = {
+        'rgb(139, 0, 0)': '\x01',  # 'datum',
+        'red': '\x02',  # 'tijd',
+        'green': '\x03',  # 'wat',
+        'blue': '\x04'  # 'wanneer'
+        }
+    line_w_match = []
+    d['content'] = d['content'][1:]
+    for i, m in enumerate(d['matchdata']):
+        if filter(None, m):
+            line_w_match.append((d['content'][i], m))
+    nodelists = {'Title': [], 'Summary': []}
+    for (title_l, summary_l), (title_m, summary_m) in line_w_match:
+        # Title
+        if title_m:
+            title = title_m[0]
+            matches = reversed(sorted(title, key=lambda x: x.end('e')))
+            for match in matches:
+                title_l = title_l[:match.start('e')] + title_l[match.end('e'):]
+                title_l = title_l[:match.start('content')] +\
+                    color_dict[match.group('c').strip()] +\
+                    title_l[match.end('content'):]
+                title_l = title_l[:match.start('b')] + title_l[match.end('b'):]
+            nodelists['Title'].append(title_l)
+        if summary_m:
+            summary = summary_m[0]
+            matches = reversed(sorted(summary, key=lambda x: x.end('e')))
+            for match in matches:
+                summary_l = summary_l[:match.start('e')] +\
+                    summary_l[match.end('e'):]
+                summary_l = summary_l[:match.start('content')] +\
+                    color_dict[match.group('c').strip()] +\
+                    summary_l[match.end('content'):]
+                summary_l = summary_l[:match.start('b')] +\
+                    summary_l[match.end('b'):]
+            nodelists['Summary'].append(summary_l)
+    return nodelists
+
+
+def to_dot(q0):
+    nodenum = 0
+    final_nodes = []
+    nodes = []
+    edges = []
+    to_visit = [(0, q0)]
+    visited = set()
+    translation = []
+    if q0.final:
+        final_nodes.append(nodenum)
+    else:
+        nodes.append(nodenum)
+
+    nodenum += 1
+    while to_visit:
+        current = to_visit.pop()
+        if not current[0] in visited:
+            visited.add(current[0])
+            for char, child in current[1].children.iteritems():
+                matches = [c for c in translation if c[0] == child]
+                curnum = -1
+                if matches:
+                    curnum = matches[-1][1]
+                else:
+                    translation.append((child, nodenum))
+                    curnum = nodenum
+                    nodenum += 1
+                if child.final:
+                    final_nodes.append(curnum)
+                else:
+                    nodes.append(curnum)
+                edges.append((current[0], char, curnum))
+                to_visit.append((curnum, child))
+    print 'digraph dawg {'
+    print '\tnode [shape = doublecircle]; {}'.format(
+        ' '.join(str(n) for n in final_nodes))
+    print '\tnode [shape = circle]; {}'.format(
+        ' '.join(str(n) for n in nodes))
+    for fr, ch, to in edges:
+        print '\t{} -> {} [label = "{}"];'.format(fr, to, ch)
+    print '}'
 
 
 def main():
@@ -57,9 +137,13 @@ def main():
     d['matchdata'] = []
     for line in filter(None, d['content']):
         d['matchdata'].append(parse_line(line))
-    create_nodes(d)
+    nodelists = create_nodes(d)
+    titledawg = pydawg.DAWG()
+    for n in sorted(nodelists['Title']):
+        titledawg.add_word(n)
+    to_dot(titledawg.q0)
 
 
 if __name__ == '__main__':
-    logging.basicConfig(level=logging.DEBUG)
+    logging.basicConfig(level=logging.WARNING)
     main()
diff --git a/program/everything/dawg.py b/program/everything/dawg.py
new file mode 100644
index 0000000..be0757b
--- /dev/null
+++ b/program/everything/dawg.py
@@ -0,0 +1,71 @@
+#!/bin/env python
+# -*- coding: utf-8 -*-
+
+import pydawg
+
+
+def to_dot(filepath, q0):
+    nodenum = 0
+    final_nodes = []
+    nodes = []
+    edges = []
+    to_visit = [(0, q0)]
+    visited = set()
+    translation = []
+    if q0.final:
+        final_nodes.append(nodenum)
+    else:
+        nodes.append(nodenum)
+
+    nodenum += 1
+    while to_visit:
+        current = to_visit.pop()
+        if not current[0] in visited:
+            visited.add(current[0])
+            for char, child in current[1].children.iteritems():
+                matches = [c for c in translation if c[0] == child]
+                curnum = -1
+                if matches:
+                    curnum = matches[-1][1]
+                else:
+                    translation.append((child, nodenum))
+                    curnum = nodenum
+                    nodenum += 1
+                if child.final:
+                    final_nodes.append(curnum)
+                else:
+                    nodes.append(curnum)
+                edges.append((current[0], char, curnum))
+                to_visit.append((curnum, child))
+    print 'digraph dawg {'
+    print '\tnode [shape = doublecircle]; {}'.format(
+        ' '.join(str(n) for n in final_nodes))
+    print '\tnode [shape = circle]; {}'.format(
+        ' '.join(str(n) for n in nodes))
+    for fr, ch, to in edges:
+        print '\t{} -> {} [label = "{}"];'.format(fr, to, ch)
+    print '}'
+
+
+d = pydawg.DAWG()
+
+regs = [
+    'wdag dag maand jaar tijd - wat',
+    'dag maand jaar tijd - wat',
+    'wdag dag maand jaar tijd - wat',
+    'wdag dag maand jaar tijd - wat - Locatie: waar',
+    'wdag dag maand jaar tijd - wat - Locatie: waar']
+
+#regs = [
+#    'maandag 11 augustus 2014 19:30 - Neutral Milk Hotel',
+#    'dinsdag 19 augustus 2014 22:00 - Arkells',
+#    'maandag 24 november 2014 20:30 - Fink',
+#    'woensdag 19 november 2014 20:00 - Michael Schulte',
+#    'zondag 26 oktober 2014 21:00 - The Majority Says - Locatie: Bitterzoet',
+#    'maandag 15 september 2014 20:30 - Ani DiFranco',
+#    'maandag 13 oktober 2014 20:30 - Tarrus Riley',
+#    'maandag 29 december 2014 20:30 - Alain Clark - Locatie: De Duif']
+for w in sorted(set(regs)):
+    d.add_word(w)
+
+to_dot('t.dot', d.q0)
diff --git a/program/everything/pydawg.py b/program/everything/pydawg.py
new file mode 100644
index 0000000..18475c1
--- /dev/null
+++ b/program/everything/pydawg.py
@@ -0,0 +1,289 @@
+# -*- coding: utf-8 -*-
+"""
+	This is part of pydawg Python module.
+
+	Pure python implementation.
+
+	Author    : Wojciech Muła, wojciech_mula@poczta.onet.pl
+	WWW       : http://0x80.pl/proj/pydawg/
+	License   : Public domain
+	Date      : $Date$
+
+	$Id$
+"""
+
+
+class DAWGNode:
+	__slots__ = ["children", "final", "number"]
+
+	def __init__(self, char):
+		self.children = {}
+		self.final  = False
+		self.number = None
+
+	def get_next(self, char):
+		try:
+			return self.children[char]
+		except KeyError:
+			return None
+
+	def set_next(self, char, child):
+		self.children[char] = child
+
+	def has_transition(self, char):
+		return char in self.children
+
+	def __str__(self):
+		return "<" + "".join(self.children.keys()) + ">"
+
+
+def equivalence(p, q):
+	"check if states p and q are equivalent"
+
+	if p.final != q.final:
+		return False
+
+	if len(p.children) != len(q.children):
+		return False
+
+	s = set(p.children)
+	if s != set(q.children):
+		return False
+
+	"""
+	# exact definition of equivalence
+	for c in s:
+		if not equivalence(p.children[c], q.children[c]):
+				return False
+	"""
+	# practical implementation - constraints make
+	# this much simpler and faster
+	for c in s:
+		if p.children[c] != q.children[c]:
+			return False
+
+	return True
+
+
+class DAWG:
+	def __init__(self):
+		self._numbers_valid = False
+		self.register = set()
+		self.q0 = DAWGNode(None);
+		self.wp = ''
+
+
+	def add_word(self, word):
+		assert word > self.wp
+		return self.add_word_unchecked(word)
+
+
+	def add_word_unchecked(self, word):
+		# 1. skip existing
+		i = 0;
+		s = self.q0
+		while i < len(word) and s.has_transition(word[i]):
+			s = s.get_next(word[i])
+			i = i + 1
+
+		assert s != None
+
+		# 2. minimize
+		if i < len(self.wp):
+			self._replace_or_register(s, self.wp[i:])
+
+
+		# 3. add suffix
+		while i < len(word):
+			n = DAWGNode(word[i])
+			s.set_next(word[i], n)
+			assert n == s.get_next(word[i])
+			s = n
+			i = i + 1
+
+		s.final = True
+		self.wp = word
+		self._numbers_valid = False
+
+
+	def _replace_or_register(self, state, suffix):
+		stack = []
+		while suffix:
+			letter = suffix[0]
+			next   = state.get_next(letter)
+			stack.append((state, letter, next))
+
+			state = next
+			suffix = suffix[1:]
+
+		while stack:
+			parent, letter, state = stack.pop()
+
+			found = False
+			for r in self.register:
+				if equivalence(state, r):
+					assert(parent.children[letter] == state)
+					parent.children[letter] = r
+
+					found = True
+					break
+
+			if not found:
+				self.register.add(state)
+			
+
+	def freeze(self):
+		self._replace_or_register(self.q0, self.wp)
+		self._numbers_valid = False
+
+	close = freeze
+
+
+	def _num_nodes(self):
+		def clear_aux(node):
+			node.number = None
+			for child in node.children.values():
+				clear_aux(child)
+
+		def num_aux(node):
+			if node.number is None:
+				n = int(node.final)
+				for child in node.children.values():
+					n += num_aux(child)
+
+				node.number = n
+
+			return node.number
+
+		if not self._numbers_valid:
+			clear_aux(self.q0)
+			num_aux(self.q0)
+			self._numbers_valid = True
+
+
+	def word2index(self, word):
+		self._num_nodes()
+
+		state = self.q0
+		index = 0
+		for c in word:
+			try:
+				next = state.children[c]
+			except KeyError:
+				return None
+
+			for C in sorted(state.children):
+				if C < c:
+					index += state.children[C].number
+				else:
+					break
+
+			state = next
+			if state.final:
+				index = index + 1
+		#for
+
+		return index
+
+
+	def index2word(self, index):
+		self._num_nodes()
+
+		state = self.q0
+		count = index
+		output_word = ""
+		while True:
+			for c in sorted(state.children):
+				tmp = state.get_next(c)
+				if tmp.number < count:
+					count -= tmp.number
+				else:
+					output_word += c
+					state = tmp
+					if state.final:
+						count -= 1
+
+					break
+			#for
+			if count <= 0:
+				break
+
+		return output_word
+
+
+	def as_dot(self, file):
+		nodes = set()
+		edges = []
+		tmp   = set()
+
+		def aux(node):
+			nodes.add((id(node), node.final))
+			tmp.add(node)
+
+			for letter, child in node.children.items():
+				aux(child)
+
+		aux(self.q0)
+
+		for node in tmp:
+			for letter, child in node.children.items():
+				edges.append((id(node), letter, id(child)))
+
+		import dump2dot
+		dump2dot.dumpdata2dot(nodes, edges, file)
+
+
+	def words(self):
+		L = []
+		def aux(node, word):
+			if node.final:
+				L.append(word)
+
+			for letter, child in node.children.items():
+				aux(child, word + letter)
+
+		aux(self.q0, '')
+		return L
+
+
+	def __iter__(self):
+		return iter(self.words())
+
+
+import os
+
+def main():
+	words = "aimaient aimais aimait aime aiment".split()
+	words = "cat rat attribute tribute".split()
+
+	def dump(name):
+		with open(name, 'wt') as f:
+			D.as_dot(f)
+
+
+	D = DAWG()
+	for word in sorted(words):
+		print(word)
+		D.add_word(word)
+
+	D.freeze()
+
+	# MPH test
+	for word in words:
+		print(word, "=>", D.word2index(word))
+
+	for index in range(1, len(words) + 1):
+		print(index, "=>", D.index2word(index))
+
+
+	if 1:
+		# show image of graph
+		name = "dawg.dot"
+		dump(name)
+		os.system("dotty %s" % name)
+
+	print(D.words(), set(D.words()) == set(words))
+
+
+if __name__ == '__main__':
+	main()
diff --git a/program/everything/pydawg.pyc b/program/everything/pydawg.pyc
new file mode 100644
index 0000000..e6507ee
Binary files /dev/null and b/program/everything/pydawg.pyc differ
diff --git a/program/everything/t.dot b/program/everything/t.dot
new file mode 100644
index 0000000..ffe9df4
Binary files /dev/null and b/program/everything/t.dot differ
diff --git a/program/everything/webdata/contextmenu_o.js b/program/everything/webdata/contextmenu_o.js
index 8b0cc05..5311a67 100644
--- a/program/everything/webdata/contextmenu_o.js
+++ b/program/everything/webdata/contextmenu_o.js
@@ -20,28 +20,6 @@ function RightMouseDown() {
     return false
 }
 
-function init(a, w) {
-	console.log(a)
-    var b = document.createElement("DIV");
-    b.id = "contextmenu";
-    if (!w) var w = 120;
-    b.style.width = w + "px";
-    var c = '<div style="position:relative;left:5px;top:-4px;">';
-    c += a;
-    c += '</div>';
-    b.innerHTML = c;
-    b.style.position = "absolute";
-    b.style.left = "0px";
-    b.style.top = "0px";
-    b.style.visibility = "hidden";
-    b.style.overflow = "hidden";
-    b.style.padding = "4px";
-    b.style.backgroundColor = "#ffffff";
-    b.style.border = "1px solid #6a6868";
-    document.body.appendChild(b);
-    delete b
-}
-
 function mouseUp(e) {
     var curselection = window.getSelection().getRangeAt(0);
     if (curselection.endOffset - curselection.startOffset > 0)