update
author    Mart Lubbers <mart@martlubbers.net>
          Thu, 7 Aug 2014 13:48:48 +0000 (15:48 +0200)
committer Mart Lubbers <mart@martlubbers.net>
          Thu, 7 Aug 2014 13:48:48 +0000 (15:48 +0200)
15 files changed:
program/everything/crawler.py [new file with mode: 0644]
program/everything/data_processing.py [new file with mode: 0644]
program/everything/input_app.py [new file with mode: 0644]
program/everything/install.sh [new file with mode: 0755]
program/everything/output_data [new symlink]
program/everything/webdata/contextmenu_o.js [moved from program/hypfront/contextmenu_o.js with 97% similarity]
program/everything/webdata/index.html [moved from program/hypfront/index.html with 88% similarity]
program/everything/webdata/paradiso.rss.xml [moved from program/hypfront/paradiso.rss.xml with 100% similarity]
program/hypconvert/hypconvert.py
program/hypfront/hyper.py [deleted file]
program/hypfront/install.sh [deleted file]
thesis/methods.tex
thesis/methods_crawl.tex [new file with mode: 0644]
thesis/methods_data.tex [new file with mode: 0644]
thesis/methods_input.tex [new file with mode: 0644]

diff --git a/program/everything/crawler.py b/program/everything/crawler.py
new file mode 100644 (file)
index 0000000..0370fed
--- /dev/null
@@ -0,0 +1,3 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
diff --git a/program/everything/data_processing.py b/program/everything/data_processing.py
new file mode 100644 (file)
index 0000000..08d7d0f
--- /dev/null
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import ast
+import logging
+import re
+
+
+def structure_data(d):
+    re_hdr = re.compile('<th>(?P<h>.*?)</th>', flags=re.MULTILINE | re.DOTALL)
+    re_row = re.compile('<tr>(?P<row>.*)</tr>', flags=re.MULTILINE | re.DOTALL)
+    re_dualcel = re.compile('<td id="cel">(?P<c>.*?)</td><!--cel-->',
+                            flags=re.MULTILINE | re.DOTALL)
+    con = d['content']
+    d['content'] = []
+    d['headers'] = []
+    for line in con.split('\n\t\t'):
+        if not line:
+            continue
+        row = re_row.search(line)
+        row = row.group('row')
+        for header in re_hdr.finditer(row):
+            d['headers'].append(header.group('h'))
+        d['content'].append([])
+        for cell in re_dualcel.finditer(row):
+            d['content'][-1].append(cell.group('c'))
+
+
+def parse_line(line):
+    re_spa = re.compile('(?P<b><span.*?background-color:(?P<c>.*?);.*?>)(?P<co'
+                        'ntent>.*?)(?P<e></span>)')
+    results = []
+    for column in line:
+        results.append([])
+        markings = list(re_spa.finditer(column))
+        if markings:
+            results[-1].append(markings)
+    return results
+
+
+def create_nodes(d):
+    print d  # stub: node creation is not implemented yet
+
+
+def main():
+    with open('./output_data/raw_out.txt', 'r') as data:
+        logging.info('raw data loaded, going to parse data')
+        d = data.readline()
+        d = re.sub('\)\]}$', '}',
+                   re.sub('\)\],', ',',
+                          re.sub('\[Field\(\'.*?\', ', '', d)))
+        d = ast.literal_eval(d)
+    logging.info('raw data parsed, going to structure data')
+    structure_data(d)
+    logging.info('data structured, parsed headers: {}'.format(d['headers']))
+    logging.info('lines: {}'.format(len(d['content'])))
+    d['matchdata'] = []
+    for line in filter(None, d['content']):
+        d['matchdata'].append(parse_line(line))
+    create_nodes(d)
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG)
+    main()
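
For reference, a minimal sketch of what these two helpers do with the markup
that input_app.py's feed2html emits (the sample strings below are made up;
the real input is the FieldStorage dump written to raw_out.txt):

    # -*- coding: utf-8 -*-
    # Hypothetical usage sketch for data_processing.py; the sample row
    # mimics the '<td id="cel">...</td><!--cel-->' markup of feed2html.
    from data_processing import structure_data, parse_line

    sample = {'content':
              '\t\t<tr><th>Title</th><th>Summary</th></tr>\n'
              '\t\t<tr><td id="cel">Concert</td><!--cel-->'
              '<td id="cel"><span style="background-color:blue;">'
              'Paradiso</span></td><!--cel--></tr>\n'}

    structure_data(sample)            # fills 'headers', rebuilds 'content'
    print(sample['headers'])          # -> ['Title', 'Summary']
    marks = parse_line(sample['content'][1])
    print(marks[1][0][0].group('c'))  # -> 'blue', i.e. a "Waar" marking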
diff --git a/program/everything/input_app.py b/program/everything/input_app.py
new file mode 100644 (file)
index 0000000..4117fa5
--- /dev/null
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from mod_python import apache, util
+import feedparser
+import re
+import urllib
+import os
+
+
+def req_pre_pos(req):
+    req.log_error('handler')
+    req.content_type = 'text/html'
+    req.send_http_header()
+    args = util.FieldStorage(req)
+    req.write(
+        '<html>\n<head>\n'
+        '\t<title>VER: 0.01 - HyperFrontend RSS feed POSTREQUEST</title>'
+        '</head>\n<body>\n'
+        '\tThanks for submitting: <br />\n'
+        '\t<a href="index.html">Enter a new RSS feed</a>\n<pre>\n'
+        '{}\n</pre>\n</body>\n</html>'.format(args))
+    os.chdir('/var/www/py/files')
+    with open('raw_out.txt', 'w') as f:
+        f.write(str(args))
+
+
+def req_pre(req):
+    req.log_error('handler')
+    req.content_type = 'text/html'
+    req.send_http_header()
+    req.write(
+        '<html>\n<head>\n'
+        '\t<title>HyperFrontend RSS feed input</title>\n'
+        '\t<script src="contextmenu_o.js"></script>\n</head>\n<body>\n'
+        '\t<table>\n'
+        '\t\t<tr><td>Venue: </td><td>\n'
+        '\t\t\t<input type="text" name="venue" class="indat"></td></tr>\n'
+        '\t\t<tr><td>Frequency: </td><td>\n'
+        '\t\t\t<input type="text" name="freq" class="indat"></td></tr>\n'
+        '\t\t<tr><td>Default location name: </td><td>\n'
+        '\t\t\t<input type="text" name="dloc" class="indat"></td></tr>\n'
+        '\t\t<tr><td>Address: </td><td>\n'
+        '\t\t\t<input type="text" name="adress" class="indat"></td></tr>\n'
+        '\t\t<tr><td>Website: </td><td>\n'
+        '\t\t\t<input type="text" name="website" class="indat"></td></tr>\n'
+        '\t</table>\n'
+        '\tSelect something and click the link<br />\n'
+        '\t<button style="color:blue" onclick="javascript:f_waar()">'
+        'Waar</button>\n'
+        '\t<button style="color:green" onclick="javascript:f_wat()">'
+        'Wat</button>\n'
+        '\t<button style="color:red" onclick="javascript:f_wann_tijd()">'
+        'Tijd</button>\n'
+        '\t<button style="color:darkred" onclick="javascript:f_wann_dat()">'
+        'Datum</button>\n')
+
+
+def req_post(req):
+    req.write('\t<button onclick="javascript:post_all()">Submit</button>\n'
+              '</body>\n</html>')
+
+
+def feed2html(req, url, name):
+    url = urllib.unquote(url)
+    url = url if re.match('https?://', url) else 'http://{}'.format(url)
+    req.write(
+        '\tLoading "{}" as <p id="rssname">"{}"</p><br />\n'.format(url, name))
+    feed = feedparser.parse(url)
+    # channel = feed.feed
+    req.write('\t<table id="content-table" border="1">\n')
+    req.write('\t\t<tr><th>Title</th><th>Summary</th></tr>\n')
+    for i in feed.entries:
+        req.write(('\t\t<tr><td id="cel">{}</td><!--cel--><td id="cel">{}</td>'
+                   '<!--cel--></tr>\n').format(
+                       i['title'].encode('ascii', 'xmlcharrefreplace'),
+                       i['summary'].encode('ascii', 'xmlcharrefreplace')))
+    req.write('\t</table>\n<br />')
+
+
+def handler(req):
+    if req.method == "POST":
+        req_pre_pos(req)
+    else:
+        req_pre(req)
+        args = util.FieldStorage(req)
+        if 'url' not in args or 'name' not in args:
+            req.write('Something went wrong, empty fields?<br />')
+            req.write('<a href="index.html">back</a>')
+        else:
+            feed2html(req, args['url'], args['name'])
+        req_post(req)
+    return apache.OK
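
input_app.py is written as a mod_python handler, not a CGI script: Apache
calls handler(req) directly. The repository ships no Apache config, so the
following is an assumed minimal setup for the /var/www/py path used in
install.sh:

    <Directory /var/www/py>
        AddHandler mod_python .py
        PythonHandler input_app
        PythonDebug On
    </Directory>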
diff --git a/program/everything/install.sh b/program/everything/install.sh
new file mode 100755 (executable)
index 0000000..0e87fd6
--- /dev/null
@@ -0,0 +1,8 @@
+#!/bin/bash
+sudo rm -rv /var/www/py/*
+sudo cp -v ./input_app.py /var/www/py
+sudo cp -v ./webdata/*.{xml,html,js} /var/www/py/
+sudo mkdir /var/www/py/files
+sudo chown -vR mart:www-data /var/www/py
+sudo chmod -vR 770 /var/www/py
+ln -s /var/www/py/files/ ./output_data
diff --git a/program/everything/output_data b/program/everything/output_data
new file mode 120000 (symlink)
index 0000000..6550f0b
--- /dev/null
@@ -0,0 +1 @@
+/var/www/py/files/
\ No newline at end of file
diff --git a/program/hypfront/contextmenu_o.js b/program/everything/webdata/contextmenu_o.js
similarity index 97%
rename from program/hypfront/contextmenu_o.js
rename to program/everything/webdata/contextmenu_o.js
index feb3135..8b0cc05 100644 (file)
@@ -72,10 +72,14 @@ function stylizeHighlightedString(range, col)
     range.insertNode(span);
 }
 
-function f_wanneer() {
+function f_wann_tijd() {
     stylizeHighlightedString(selection, "red")
 }
 
+function f_wann_dat() {
+    stylizeHighlightedString(selection, "darkred")
+}
+
 function f_wat() {
     stylizeHighlightedString(selection, "green")
 }
@@ -84,6 +88,7 @@ function f_waar() {
     stylizeHighlightedString(selection, "blue")
 }
 
+
 function post_all() {
     var xmlhttp = new XMLHttpRequest();
     xmlhttp.onreadystatechange=function()
diff --git a/program/hypfront/index.html b/program/everything/webdata/index.html
similarity index 88%
rename from program/hypfront/index.html
rename to program/everything/webdata/index.html
index 788eeee..00ce528 100644 (file)
@@ -2,7 +2,7 @@
     <head>
     </head>
     <body>
-        <form method="get" action="./hyper.py">
+        <form method="get" action="./input_app.py">
             <table>
                 <tr><td><p>RSS URL:  </td><td><input type="text" name="url"
                         value="localhost/py/paradiso.rss.xml"></td></tr>
diff --git a/program/hypconvert/hypconvert.py b/program/hypconvert/hypconvert.py
index 34548ff..81193c4 100644 (file)
@@ -1,3 +1,4 @@
+#!/bin/env python
 # -*- coding: utf-8 -*-
 
 import ast
diff --git a/program/hypfront/hyper.py b/program/hypfront/hyper.py
deleted file mode 100644 (file)
index 78e2a53..0000000
+++ /dev/null
@@ -1,115 +0,0 @@
-#!/bin/env python
-# -*- coding: utf-8 -*-
-
-from mod_python import apache, util
-import feedparser
-import re
-import urllib
-import subprocess
-import os
-
-def req_pre_pos(req):
-    req.log_error('handler')
-    req.content_type = 'text/html'
-    req.send_http_header()
-    args = util.FieldStorage(req)
-    req.write("""\
-<html>
-    <head>
-        <title>VER: 0.01 - HyperFrontend RSS feed POSTREQUEST</title>
-    </head>
-    <body>
-        Thanks submitting: <br />
-        <a href="index.html">Enter new rss feed</a>
-        <pre>
-{}
-        </pre>
-    </body>
-</html>
-""".format(args))
-    os.chdir('/var/www/py/files')
-    with open('raw_out.txt', 'w') as f:
-        f.write(str(args))
-
-
-def req_pre(req):
-    req.log_error('handler')
-    req.content_type = 'text/html'
-    req.send_http_header()
-    req.write("""\
-<html>
-    <head>
-        <title>HyperFrontend RSS feed input</title>
-        <script src="contextmenu_o.js"></script>
-    </head>
-    <body>
-
-    <table>
-        <tr><td>Venue: </td><td>
-            <input type="text" name="venue" class="indat"></td></tr>
-        <tr><td>Frequency: </td><td>
-            <input type="text" name="freq" class="indat"></td></tr>
-        <tr><td>Default location name: </td><td>
-            <input type="text" name="dloc" class="indat"></td></tr>
-        <tr><td>Adress: </td><td>
-            <input type="text" name="adress" class="indat"></td></tr>
-        <tr><td>Website: </td><td>
-            <input type="text" name="website" class="indat"></td></tr>
-    </table>
-
-    Selecteer iets en klik de link<br />
-    <button style="color:blue" onclick="javascript:f_waar()">Waar</button>
-    <button style="color:green" onclick="javascript:f_wat()">Wat</button>
-    <button style="color:red" onclick="javascript:f_wanneer()">Wanneer</button>
-    <br />
-
-<div style="position:absolute;left:12px;width:500px;"></div>
-<script language="javascript" type="text/javascript">
-    var content='<b>Categorize</b><br />';
-    content+=' <a href="#" onclick="javascript:f_waar()">Waar</a><br />';
-    content+=' <a href="#" onclick="javascript:f_wat()">Wat</a>';
-    content+=' <a href="#" onclick="javascript:f_wanneer()>Wanneer</a><br />';
-    content+=' Test 123';
-    init(content,120);
-</script>
-""")
-
-
-def req_post(req):
-    req.write("""\
-            <button onclick="javascript:post_all()" method="post" target="_bla\
-nk">Submit</button>
-    </body>
-</html>
-""")
-
-
-def feed2html(req, url, name):
-    url = urllib.unquote(url)
-    url = url if re.match('https?://', url) else 'http://{}'.format(url)
-    req.write(
-        '\tLoading "{}" as <p id="rssname">"{}"</p><br />\n'.format(url, name))
-    feed = feedparser.parse(url)
-    channel = feed.feed
-    req.write('\t<table id="content-table" border="1" id="htab">\n')
-    req.write('\t\t<tr><th>Title</th><th>Summary</th></tr>\n')
-    for i in feed.entries:
-        req.write('\t\t<tr><td>{}</td><td>{}</td></tr>\n'.
-                  format(i['title'].encode('ascii', 'xmlcharrefreplace'),
-                         i['summary'].encode('ascii', 'xmlcharrefreplace')))
-    req.write('\t</table>\n<br />')
-
-
-def handler(req):
-    if req.method == "POST":
-        req_pre_pos(req)
-    else:
-        req_pre(req)
-        args = util.FieldStorage(req)
-        if 'url' not in args and 'name' not in args:
-            req.write('Something went wrong, empty fields?<br />')
-            req.write('<a href="index.html">back</a>')
-        else:
-            feed2html(req, args['url'], args['name'])
-        req_post(req)
-    return apache.OK
diff --git a/program/hypfront/install.sh b/program/hypfront/install.sh
deleted file mode 100644 (file)
index ab68ea4..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-sudo rm -rv /var/www/py/*
-sudo cp -v ./*.{xml,py,html,js} /var/www/py/
-#sudo chown -vR www-data:www-data /var/www/py
-sudo mkdir /var/www/py/files
-sudo chown -R mart:www-data /var/www/py
-sudo chmod -R 770 /var/www/py
-ln -s /var/www/py/files ../output
diff --git a/thesis/methods.tex b/thesis/methods.tex
index 05bc2ed..475d80b 100644 (file)
@@ -6,58 +6,8 @@ application and compiles them into computer interpretable patterns and the
 crawler interprets the patterns and visits the sources from time to time to
 extract the information.
 
-\section{Input application}
-The purpose of the input application is to define the patterns together with
-the user so that the information can be transferred to the data processing
-application.
-The user input all goes through the familiar interface of the user's preferred
-web browser. By visiting the crawler's train website the user can specify the
-metadata of the source it wants to be periodically crawled through simple web
-forms as seen in figure~\ref{fig:mf1}
-\begin{figure}[H]
-       \centering
-       \caption{Webforms for source metadata}
-       \label{fig:mf1}
-       \includegraphics[width=80mm]{./img/img1.png}
-\end{figure}
+\input{methods_input.tex}
 
-\section{Data processing application}
-\subsection{Directed acyclic graphs and finite state automata} Directed acyclic
-graphs(DAG) and finite state automata(FSA) have a lot in common concerning
-pattern recognition and information extraction. By feeding words\footnote{A
-word is a finite combination of letters from the graphs alphabet, thus a word
-is not limited to linguistic words but can be anything as long as the
-components are in the graphs alphabet} into an algorithm a DAG can be generated
-so that it matches certain patters present in the given words.
-Figure~\ref{fig:mg1} for example shows a FSA that matches on the words
-\textit{ab} and \textit{ac}.
-\begin{figure}[H]
-       \centering
-       \caption{Example DAG/FSA}
-       \label{fig:mg1}
-       \includegraphics[width=15mm]{./dots/graph1.png}
-\end{figure}
+\input{methods_data.tex}
 
-With this FSA we can test if a word fits to the constraints it the FSA
-describes. And with a little adaptation we can extract dynamic information from
-semi-structured data.\\
-
-
-
-\subsection{Back to DAG's and FSA's}
-Nodes in this data structure can be single letters but also bigger
-constructions. The example in Figure~\ref{fig:mg2} describes different
-separator pattern for event data with its three component: what, when, where.
-In this example the nodes with the labels \textit{what, when, where} can also
-be complete subgrahps. In this way data on a larger level can be using the
-NIP markings and data within the categories can be processed autonomously.
-\begin{figure}[H]
-       \centering
-       \caption{Example event data}
-       \label{fig:mg2}
-       \includegraphics[width=\linewidth]{./dots/graph2.png}
-\end{figure}
-
-\subsection{Algorithm}
-
-\section{Crawler application}
+\input{methods_crawl.tex}
diff --git a/thesis/methods_crawl.tex b/thesis/methods_crawl.tex
new file mode 100644 (file)
index 0000000..167d233
--- /dev/null
@@ -0,0 +1,7 @@
+\section{Crawler application}
+The crawler application is the program that periodically visits the sources
+and extracts the information from them according to the given patterns.
+When the crawler fails to extract information from a website, it sends a
+message to a system administrator, who can either add the entry to the
+training set or retrain the entire network with new data, because the
+internal structure of the visited source may have changed.
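
The thesis does not fix a crawler implementation yet; below is a minimal
sketch of the behaviour described above, reusing feedparser from
input_app.py (the source URL, the pattern, and the notification are
assumptions):

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # Sketch only: periodically visit a source, apply a stored pattern and
    # warn an administrator when the pattern no longer matches.
    import logging
    import re
    import time

    import feedparser

    SOURCES = {  # URL -> pattern; this entry is a made-up example
        'http://localhost/py/paradiso.rss.xml':
            re.compile(r'\d{2}-\d{2}-\d{4}'),
    }

    def crawl_once():
        for url, pattern in SOURCES.items():
            for entry in feedparser.parse(url).entries:
                if pattern.search(entry.get('summary', '')):
                    logging.info('match in %s: %s', url, entry.get('title'))
                else:
                    # Possibly a structural change in the source: in the
                    # thesis this is reported to a system administrator.
                    logging.warning('no match: %s', entry.get('title'))

    if __name__ == '__main__':
        logging.basicConfig(level=logging.INFO)
        while True:
            crawl_once()
            time.sleep(3600)  # visit the sources "from time to time"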
diff --git a/thesis/methods_data.tex b/thesis/methods_data.tex
new file mode 100644 (file)
index 0000000..9b16004
--- /dev/null
@@ -0,0 +1,40 @@
+\section{Data processing application}
+The data processing application takes the input from the input application
+and converts it into crawler rules. It is triggered when a user finishes
+with the input application; it then creates crawler patterns so that the
+crawler can periodically and systematically extract the information from
+the sources.
+
+\subsection{Directed acyclic graphs and finite state automata}
+Directed acyclic graphs (DAGs) and finite state automata (FSAs) have a lot
+in common concerning pattern recognition and information extraction. By
+feeding words\footnote{A word is a finite combination of letters from the
+graph's alphabet; thus a word is not limited to linguistic words but can be
+anything as long as its components are in the graph's alphabet.} into an
+algorithm, a DAG can be generated so that it matches certain patterns
+present in the given words. Figure~\ref{fig:mg1} for example shows an FSA
+that matches the words \textit{ab} and \textit{ac}.
+\begin{figure}[H]
+       \centering
+       \caption{Example DAG/FSA}
+       \label{fig:mg1}
+       \includegraphics[width=15mm]{./dots/graph1.png}
+\end{figure}
+
+With this FSA we can test whether a word fits the constraints that the FSA
+describes, and with a little adaptation we can extract dynamic information
+from semi-structured data.\\
+
+\subsection{Back to DAGs and FSAs}
+Nodes in this data structure can be single letters but also bigger
+constructions. The example in Figure~\ref{fig:mg2} describes different
+separator patterns for event data with its three components: what, when,
+where. Here the nodes labeled \textit{what, when, where} can also be complete
+subgraphs. In this way data on a larger level can be processed using the
+NIP markings, while data within the categories is processed autonomously.
+\begin{figure}[H]
+       \centering
+       \caption{Example event data}
+       \label{fig:mg2}
+       \includegraphics[width=\linewidth]{./dots/graph2.png}
+\end{figure}
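
Expressed in code, the FSA of Figure~\ref{fig:mg1} is small enough to write
down directly (a sketch, not the generation algorithm of the thesis):

    # Transition table of the FSA in fig:mg1; it accepts exactly the
    # words "ab" and "ac".
    FSA = {(0, 'a'): 1, (1, 'b'): 2, (1, 'c'): 2}
    ACCEPTING = {2}

    def accepts(word):
        state = 0
        for letter in word:
            if (state, letter) not in FSA:
                return False  # letter not in the graph's alphabet here
            state = FSA[(state, letter)]
        return state in ACCEPTING

    assert accepts('ab') and accepts('ac')
    assert not accepts('a') and not accepts('abc')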
diff --git a/thesis/methods_input.tex b/thesis/methods_input.tex
new file mode 100644 (file)
index 0000000..40c8e3b
--- /dev/null
@@ -0,0 +1,14 @@
+\section{Input application}
+The purpose of the input application is to define the patterns together
+with the user so that the information can be transferred to the data
+processing application. All user input goes through the familiar interface
+of the user's preferred web browser. By visiting the crawler's training
+website, the user can specify, through simple web forms, the metadata of
+the source that should be periodically crawled, as seen in
+Figure~\ref{fig:mf1}.
+\begin{figure}[H]
+       \centering
+       \caption{Webforms for source metadata}
+       \label{fig:mf1}
+       \includegraphics[width=80mm]{./img/img1.png}
+\end{figure}