--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import ast
+import logging
+import re
+
+
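+# structure_data: split the raw HTML table in d['content'] into rows (one row
+# per '\n\t\t' block), collect the <th> labels into d['headers'] and the
+# <td id="cel">...</td><!--cel--> cell contents into d['content'] per row.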
+def structure_data(d):
+ re_hdr = re.compile('<th>(?P<h>.*?)</th>', flags=re.MULTILINE | re.DOTALL)
+ re_row = re.compile('<tr>(?P<row>.*)</tr>', flags=re.MULTILINE | re.DOTALL)
+ re_dualcel = re.compile('<td id="cel">(?P<c>.*?)</td><!--cel-->',
+ flags=re.MULTILINE | re.DOTALL)
+ con = d['content']
+ d['content'] = []
+ d['headers'] = []
+ for line in con.split('\n\t\t'):
+ if not line:
+ continue
+        row = re_row.search(line)
+        if not row:
+            continue
+        row = row.group('row')
+ for header in re_hdr.finditer(row):
+ d['headers'].append(header.group('h'))
+ d['content'].append([])
+ for cell in re_dualcel.finditer(row):
+ d['content'][-1].append(cell.group('c'))
+
+
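+# parse_line: for every column of a structured row, collect the <span>
+# elements whose inline style carries a background-color; these are the
+# markings the user made in the web frontend.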
+def parse_line(line):
+    re_spa = re.compile('(?P<b><span.*?background-color:(?P<c>.*?);.*?>)'
+                        '(?P<content>.*?)(?P<e></span>)')
+ results = []
+ for column in line:
+ results.append([])
+ markings = list(re_spa.finditer(column))
+ if markings:
+ results[-1].append(markings)
+ return results
+
+
+def create_nodes(d):
+    # placeholder: node/graph creation is not implemented yet, just dump the
+    # structured data for inspection
+    print d
+
+
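+# main: load the raw FieldStorage dump written by the web frontend, strip the
+# Field('...', ...) wrappers with regular expressions so that the remainder is
+# a Python dict literal, evaluate it with ast.literal_eval and structure it.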
+def main():
+ with open('./output_data/raw_out.txt', 'r') as data:
+ logging.info('raw data loaded, going to parse data')
+ d = data.readline()
+        d = re.sub(r'\)\]}$', '}',
+                   re.sub(r'\)\],', ',',
+                          re.sub(r'\[Field\(\'.*?\', ', '', d)))
+ d = ast.literal_eval(d)
+ logging.info('raw data parsed, going to structure data')
+ structure_data(d)
+ logging.info('data structured, parsed headers: {}'.format(d['headers']))
+ logging.info('lines: {}'.format(len(d['content'])))
+ d['matchdata'] = []
+ for line in filter(None, d['content']):
+ d['matchdata'].append(parse_line(line))
+ create_nodes(d)
+
+
+if __name__ == '__main__':
+ logging.basicConfig(level=logging.DEBUG)
+ main()
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from mod_python import apache, util
+import feedparser
+import re
+import urllib
+import os
+
+
+def req_pre_pos(req):
+ req.log_error('handler')
+ req.content_type = 'text/html'
+ req.send_http_header()
+ args = util.FieldStorage(req)
+ req.write(
+ '<html>\n<head>\n'
+ '\t<title>VER: 0.01 - HyperFrontend RSS feed POSTREQUEST</title>'
+ '</head>\n<body>\n'
+        '\tThanks for submitting: <br />\n'
+ '\t<a href="index.html">Enter new rss feed</a>\n<pre>\n'
+ '{}\n</pre>\n</body>\n</html>'.format(args))
+ os.chdir('/var/www/py/files')
+ with open('raw_out.txt', 'w') as f:
+ f.write(str(args))
+
+
+def req_pre(req):
+ req.log_error('handler')
+ req.content_type = 'text/html'
+ req.send_http_header()
+ req.write(
+ '<html>\n<head>\n'
+ '\t<title>HyperFrontend RSS feed input</title>\n'
+ '\t<script src="contextmenu_o.js"></script>\n</head>\n<body>\n'
+ '\t<table>\n'
+ '\t\t<tr><td>Venue: </td><td>\n'
+ '\t\t\t<input type="text" name="venue" class="indat"></td></tr>\n'
+ '\t\t<tr><td>Frequency: </td><td>\n'
+ '\t\t\t<input type="text" name="freq" class="indat"></td></tr>\n'
+ '\t\t<tr><td>Default location name: </td><td>\n'
+ '\t\t\t<input type="text" name="dloc" class="indat"></td></tr>\n'
+        '\t\t<tr><td>Address: </td><td>\n'
+ '\t\t\t<input type="text" name="adress" class="indat"></td></tr>\n'
+ '\t\t<tr><td>Website: </td><td>\n'
+ '\t\t\t<input type="text" name="website" class="indat"></td></tr>\n'
+ '\t</table>\n'
+        '\tSelect something and click the link<br />\n'
+        '\t<button style="color:blue" onclick="javascript:f_waar()">'
+        'Where</button>\n'
+        '\t<button style="color:green" onclick="javascript:f_wat()">'
+        'What</button>\n'
+        '\t<button style="color:red" onclick="javascript:f_wann_tijd()">'
+        'Time</button>\n'
+        '\t<button style="color:darkred" onclick="javascript:f_wann_dat()">'
+        'Date</button>\n')
+
+
+def req_post(req):
+    req.write('\t<button onclick="javascript:post_all()">Submit</button>\n'
+              '</body>\n</html>')
+
+
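+# feed2html: fetch the RSS feed with feedparser and write its entries as an
+# HTML table; every cell is tagged with id="cel" and a <!--cel--> marker so
+# the parsing script can recognise the cells later on.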
+def feed2html(req, url, name):
+ url = urllib.unquote(url)
+ url = url if re.match('https?://', url) else 'http://{}'.format(url)
+ req.write(
+ '\tLoading "{}" as <p id="rssname">"{}"</p><br />\n'.format(url, name))
+ feed = feedparser.parse(url)
+ # channel = feed.feed
+    req.write('\t<table id="content-table" border="1">\n')
+ req.write('\t\t<tr><th>Title</th><th>Summary</th></tr>\n')
+ for i in feed.entries:
+ req.write(('\t\t<tr><td id="cel">{}</td><!--cel--><td id="cel">{}</td>'
+ '<!--cel--></tr>\n').format(
+ i['title'].encode('ascii', 'xmlcharrefreplace'),
+ i['summary'].encode('ascii', 'xmlcharrefreplace')))
+ req.write('\t</table>\n<br />')
+
+
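+# handler: mod_python entry point; on POST it echoes and stores the submitted
+# form data, otherwise it renders the input form, appends the requested feed
+# as a table when both a url and a name were supplied, and closes the page
+# with the submit button.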
+def handler(req):
+ if req.method == "POST":
+ req_pre_pos(req)
+ else:
+ req_pre(req)
+ args = util.FieldStorage(req)
+    if 'url' not in args or 'name' not in args:
+ req.write('Something went wrong, empty fields?<br />')
+ req.write('<a href="index.html">back</a>')
+ else:
+ feed2html(req, args['url'], args['name'])
+ req_post(req)
+ return apache.OK
--- /dev/null
+sudo rm -rv /var/www/py/*
+sudo cp -v ./input_app.py /var/www/py
+sudo cp -v ./webdata/*.{xml,html,js} /var/www/py/
+sudo mkdir /var/www/py/files
+sudo chown -vR mart:www-data /var/www/py
+sudo chmod -vR 770 /var/www/py
+ln -sfn /var/www/py/files/ ./output_data
--- /dev/null
+/var/www/py/files/
\ No newline at end of file
range.insertNode(span);
}
-function f_wanneer() {
+function f_wann_tijd() {
stylizeHighlightedString(selection, "red")
}
+function f_wann_dat() {
+ stylizeHighlightedString(selection, "darkred")
+}
+
function f_wat() {
stylizeHighlightedString(selection, "green")
}
stylizeHighlightedString(selection, "blue")
}
+
function post_all() {
var xmlhttp = new XMLHttpRequest();
xmlhttp.onreadystatechange=function()
<head>
</head>
<body>
- <form method="get" action="./hyper.py">
+ <form method="get" action="./input_app.py">
<table>
<tr><td><p>RSS URL: </td><td><input type="text" name="url"
value="localhost/py/paradiso.rss.xml"></td></tr>
+#!/usr/bin/env python
# -*- coding: utf-8 -*-
import ast
+++ /dev/null
-#!/bin/env python
-# -*- coding: utf-8 -*-
-
-from mod_python import apache, util
-import feedparser
-import re
-import urllib
-import subprocess
-import os
-
-def req_pre_pos(req):
- req.log_error('handler')
- req.content_type = 'text/html'
- req.send_http_header()
- args = util.FieldStorage(req)
- req.write("""\
-<html>
- <head>
- <title>VER: 0.01 - HyperFrontend RSS feed POSTREQUEST</title>
- </head>
- <body>
- Thanks submitting: <br />
- <a href="index.html">Enter new rss feed</a>
- <pre>
-{}
- </pre>
- </body>
-</html>
-""".format(args))
- os.chdir('/var/www/py/files')
- with open('raw_out.txt', 'w') as f:
- f.write(str(args))
-
-
-def req_pre(req):
- req.log_error('handler')
- req.content_type = 'text/html'
- req.send_http_header()
- req.write("""\
-<html>
- <head>
- <title>HyperFrontend RSS feed input</title>
- <script src="contextmenu_o.js"></script>
- </head>
- <body>
-
- <table>
- <tr><td>Venue: </td><td>
- <input type="text" name="venue" class="indat"></td></tr>
- <tr><td>Frequency: </td><td>
- <input type="text" name="freq" class="indat"></td></tr>
- <tr><td>Default location name: </td><td>
- <input type="text" name="dloc" class="indat"></td></tr>
- <tr><td>Adress: </td><td>
- <input type="text" name="adress" class="indat"></td></tr>
- <tr><td>Website: </td><td>
- <input type="text" name="website" class="indat"></td></tr>
- </table>
-
- Selecteer iets en klik de link<br />
- <button style="color:blue" onclick="javascript:f_waar()">Waar</button>
- <button style="color:green" onclick="javascript:f_wat()">Wat</button>
- <button style="color:red" onclick="javascript:f_wanneer()">Wanneer</button>
- <br />
-
-<div style="position:absolute;left:12px;width:500px;"></div>
-<script language="javascript" type="text/javascript">
- var content='<b>Categorize</b><br />';
- content+=' <a href="#" onclick="javascript:f_waar()">Waar</a><br />';
- content+=' <a href="#" onclick="javascript:f_wat()">Wat</a>';
- content+=' <a href="#" onclick="javascript:f_wanneer()>Wanneer</a><br />';
- content+=' Test 123';
- init(content,120);
-</script>
-""")
-
-
-def req_post(req):
- req.write("""\
- <button onclick="javascript:post_all()" method="post" target="_bla\
-nk">Submit</button>
- </body>
-</html>
-""")
-
-
-def feed2html(req, url, name):
- url = urllib.unquote(url)
- url = url if re.match('https?://', url) else 'http://{}'.format(url)
- req.write(
- '\tLoading "{}" as <p id="rssname">"{}"</p><br />\n'.format(url, name))
- feed = feedparser.parse(url)
- channel = feed.feed
- req.write('\t<table id="content-table" border="1" id="htab">\n')
- req.write('\t\t<tr><th>Title</th><th>Summary</th></tr>\n')
- for i in feed.entries:
- req.write('\t\t<tr><td>{}</td><td>{}</td></tr>\n'.
- format(i['title'].encode('ascii', 'xmlcharrefreplace'),
- i['summary'].encode('ascii', 'xmlcharrefreplace')))
- req.write('\t</table>\n<br />')
-
-
-def handler(req):
- if req.method == "POST":
- req_pre_pos(req)
- else:
- req_pre(req)
- args = util.FieldStorage(req)
- if 'url' not in args and 'name' not in args:
- req.write('Something went wrong, empty fields?<br />')
- req.write('<a href="index.html">back</a>')
- else:
- feed2html(req, args['url'], args['name'])
- req_post(req)
- return apache.OK
+++ /dev/null
-sudo rm -rv /var/www/py/*
-sudo cp -v ./*.{xml,py,html,js} /var/www/py/
-#sudo chown -vR www-data:www-data /var/www/py
-sudo mkdir /var/www/py/files
-sudo chown -R mart:www-data /var/www/py
-sudo chmod -R 770 /var/www/py
-ln -s /var/www/py/files ../output
crawler interprets the patterns and visits the sources from time to time to
extract the information.
-\section{Input application}
-The purpose of the input application is to define the patterns together with
-the user so that the information can be transferred to the data processing
-application.
-The user input all goes through the familiar interface of the user's preferred
-web browser. By visiting the crawler's train website the user can specify the
-metadata of the source it wants to be periodically crawled through simple web
-forms as seen in figure~\ref{fig:mf1}
-\begin{figure}[H]
- \centering
- \caption{Webforms for source metadata}
- \label{fig:mf1}
- \includegraphics[width=80mm]{./img/img1.png}
-\end{figure}
+\input{methods_input.tex}
-\section{Data processing application}
-\subsection{Directed acyclic graphs and finite state automata} Directed acyclic
-graphs(DAG) and finite state automata(FSA) have a lot in common concerning
-pattern recognition and information extraction. By feeding words\footnote{A
-word is a finite combination of letters from the graphs alphabet, thus a word
-is not limited to linguistic words but can be anything as long as the
-components are in the graphs alphabet} into an algorithm a DAG can be generated
-so that it matches certain patters present in the given words.
-Figure~\ref{fig:mg1} for example shows a FSA that matches on the words
-\textit{ab} and \textit{ac}.
-\begin{figure}[H]
- \centering
- \caption{Example DAG/FSA}
- \label{fig:mg1}
- \includegraphics[width=15mm]{./dots/graph1.png}
-\end{figure}
+\input{methods_data.tex}
-With this FSA we can test if a word fits to the constraints it the FSA
-describes. And with a little adaptation we can extract dynamic information from
-semi-structured data.\\
-
-
-
-\subsection{Back to DAG's and FSA's}
-Nodes in this data structure can be single letters but also bigger
-constructions. The example in Figure~\ref{fig:mg2} describes different
-separator pattern for event data with its three component: what, when, where.
-In this example the nodes with the labels \textit{what, when, where} can also
-be complete subgrahps. In this way data on a larger level can be using the
-NIP markings and data within the categories can be processed autonomously.
-\begin{figure}[H]
- \centering
- \caption{Example event data}
- \label{fig:mg2}
- \includegraphics[width=\linewidth]{./dots/graph2.png}
-\end{figure}
-
-\subsection{Algorithm}
-
-\section{Crawler application}
+\input{methods_crawl.tex}
--- /dev/null
+\section{Crawler application}
+The crawler application is the program that periodically visits the sources
+and extracts the information from them according to the given patterns. When
+the crawler fails to extract information from a website, it sends a message
+to a system administrator, so that the entry can either be added to the
+training set, or the entire network can be retrained with new data because
+the internal structure of the visited source has changed.
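+
+A minimal sketch of this behaviour is given below; the helper functions and
+the \texttt{ExtractionError} exception are placeholders for illustration and
+are not part of the actual crawler implementation.
+\begin{verbatim}
+class ExtractionError(Exception):
+    pass
+
+def crawl(sources, extract, store, notify_admin):
+    # 'sources' is an iterable of (url, pattern) pairs; 'extract', 'store'
+    # and 'notify_admin' are placeholder callables, not the real crawler API
+    for url, pattern in sources:
+        try:
+            data = extract(url, pattern)   # apply the stored pattern
+            store(data)
+        except ExtractionError:
+            # the structure of the source probably changed: warn the
+            # administrator so the entry can be added to the training set
+            # or the network can be retrained
+            notify_admin(url)
+\end{verbatim}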
--- /dev/null
+\section{Data processing application}
+The data processing application is the application that takes the input from
+the input application and converts it into crawler rules. It is triggered
+after a user is done with the input application; the data processing
+application then creates crawler patterns so that the crawler can
+periodically and systematically extract the information from the sources.
+
+\subsection{Directed acyclic graphs and finite state automata}
+Directed acyclic graphs (DAGs) and finite state automata (FSAs) have a lot in
+common concerning pattern recognition and information extraction. By feeding
+words\footnote{A word is a finite combination of letters from the graph's
+alphabet; a word is thus not limited to linguistic words but can be anything,
+as long as its components are in the graph's alphabet.} into an algorithm, a
+DAG can be generated so that it matches certain patterns present in the given
+words. Figure~\ref{fig:mg1}, for example, shows an FSA that matches the words
+\textit{ab} and \textit{ac}.
+\begin{figure}[H]
+ \centering
+ \caption{Example DAG/FSA}
+ \label{fig:mg1}
+ \includegraphics[width=15mm]{./dots/graph1.png}
+\end{figure}
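+
+As an illustration, and assuming state names $q_0$, $q_1$ and $q_2$ for the
+states drawn in the figure, this FSA can be written out as
+$M = (Q, \Sigma, \delta, q_0, F)$ with $Q = \{q_0, q_1, q_2\}$,
+$\Sigma = \{a, b, c\}$, $F = \{q_2\}$ and the transitions
+$\delta(q_0, a) = q_1$, $\delta(q_1, b) = q_2$ and $\delta(q_1, c) = q_2$;
+any word for which no transition is defined is rejected.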
+
+With this FSA we can test whether a word fits the constraints that the FSA
+describes, and with a little adaptation we can extract dynamic information
+from semi-structured data.\\
+
+\subsection{Back to DAGs and FSAs}
+Nodes in this data structure can be single letters, but also bigger
+constructions. The example in Figure~\ref{fig:mg2} describes different
+separator patterns for event data with its three components: what, when and
+where. In this example the nodes with the labels \textit{what, when, where}
+can also be complete subgraphs. In this way data on a larger level can be
+structured using the NIP markings, while the data within the categories can
+be processed autonomously.
+\begin{figure}[H]
+ \centering
+ \caption{Example event data}
+ \label{fig:mg2}
+ \includegraphics[width=\linewidth]{./dots/graph2.png}
+\end{figure}
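+
+As a sketch of this idea (the separators are an assumption here, the actual
+separators depend on the source), a source that lists its events as
+\textit{what} \texttt{-} \textit{when} \texttt{@} \textit{where} would let an
+entry such as ``Concert - 2015-03-01 @ Paradiso'' follow the path
+\textit{what} $\rightarrow$ \texttt{-} $\rightarrow$ \textit{when}
+$\rightarrow$ \texttt{@} $\rightarrow$ \textit{where} through such a graph,
+after which for instance the \textit{when} subgraph can process the date on
+its own.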
--- /dev/null
+\section{Input application}
+The purpose of the input application is to define the patterns together with
+the user so that the information can be transferred to the data processing
+application.
+All user input goes through the familiar interface of the user's preferred
+web browser. By visiting the crawler's training website the user can specify
+the metadata of the source they want to have crawled periodically, through
+simple web forms as seen in Figure~\ref{fig:mf1}.
+\begin{figure}[H]
+ \centering
+ \caption{Webforms for source metadata}
+ \label{fig:mf1}
+ \includegraphics[width=80mm]{./img/img1.png}
+\end{figure}