From cda4dfa1b759704a1ba4c12611a9f480443aa37d Mon Sep 17 00:00:00 2001
From: Mart Lubbers
Date: Thu, 7 Aug 2014 15:48:48 +0200
Subject: [PATCH] update

---
 program/everything/crawler.py          |   3 +
 program/everything/data_processing.py  |  65 ++++++++++
 program/everything/input_app.py        |  93 ++++++++++++++
 program/everything/install.sh          |   7 ++
 program/everything/output_data         |   1 +
 .../webdata}/contextmenu_o.js           |   7 +-
 .../webdata}/index.html                 |   2 +-
 .../webdata}/paradiso.rss.xml           |   0
 program/hypconvert/hypconvert.py        |   1 +
 program/hypfront/hyper.py               | 115 ------------------
 program/hypfront/install.sh             |   7 --
 thesis/methods.tex                      |  56 +--------
 thesis/methods_crawl.tex                |   7 ++
 thesis/methods_data.tex                 |  40 ++++++
 thesis/methods_input.tex                |  14 +++
 15 files changed, 241 insertions(+), 177 deletions(-)
 create mode 100644 program/everything/crawler.py
 create mode 100644 program/everything/data_processing.py
 create mode 100644 program/everything/input_app.py
 create mode 100755 program/everything/install.sh
 create mode 120000 program/everything/output_data
 rename program/{hypfront => everything/webdata}/contextmenu_o.js (97%)
 rename program/{hypfront => everything/webdata}/index.html (88%)
 rename program/{hypfront => everything/webdata}/paradiso.rss.xml (100%)
 delete mode 100644 program/hypfront/hyper.py
 delete mode 100644 program/hypfront/install.sh
 create mode 100644 thesis/methods_crawl.tex
 create mode 100644 thesis/methods_data.tex
 create mode 100644 thesis/methods_input.tex

diff --git a/program/everything/crawler.py b/program/everything/crawler.py
new file mode 100644
index 0000000..0370fed
--- /dev/null
+++ b/program/everything/crawler.py
@@ -0,0 +1,3 @@
+#!/bin/env python
+# -*- coding: utf-8 -*-
+
diff --git a/program/everything/data_processing.py b/program/everything/data_processing.py
new file mode 100644
index 0000000..08d7d0f
--- /dev/null
+++ b/program/everything/data_processing.py
@@ -0,0 +1,65 @@
+#!/bin/env python
+# -*- coding: utf-8 -*-
+
+import ast
+import logging
+import re
+
+
+def structure_data(d):
+    # patterns for the header, row and cell elements of the submitted table
+    re_hdr = re.compile('<th>(?P<h>.*?)</th>', flags=re.MULTILINE | re.DOTALL)
+    re_row = re.compile('<tr>(?P<row>.*)</tr>', flags=re.MULTILINE | re.DOTALL)
+    re_dualcel = re.compile('<t[hd]>(?P<c>.*?)</t[hd]>',
+                            flags=re.MULTILINE | re.DOTALL)
+    con = d['content']
+    d['content'] = []
+    d['headers'] = []
+    for line in con.split('\n\t\t'):
+        if not line:
+            continue
+        row = re_row.search(line)
+        row = row.group('row')
+        for header in re_hdr.finditer(row):
+            d['headers'].append(header.group('h'))
+        d['content'].append([])
+        for cell in re_dualcel.finditer(row):
+            d['content'][-1].append(cell.group('c'))
+
+
+def parse_line(line):
+    # markings are the coloured <span> wrappers added by the input application
+    re_spa = re.compile('(?P<mark><span style="background-color: '
+                        '(?P<col>.*?);.*?>)(?P<content>.*?)(?P<end></span>)')
+    results = []
+    for column in line:
+        results.append([])
+        markings = list(re_spa.finditer(column))
+        if markings:
+            results[-1].append(markings)
+    return results
+
+
+def create_nodes(d):
+    print d
+
+
+def main():
+    with open('./output_data/raw_out.txt', 'r') as data:
+        logging.info('raw data loaded, going to parse data')
+        d = data.readline()
+    d = re.sub('\)\]}$', '}',
+               re.sub('\)\],', ',',
+                      re.sub('\[Field\(\'.*?\', ', '', d)))
+    d = ast.literal_eval(d)
+    logging.info('raw data parsed, going to structure data')
+    structure_data(d)
+    logging.info('data structured, parsed headers: {}'.format(d['headers']))
+    logging.info('lines: {}'.format(len(d['content'])))
+    d['matchdata'] = []
+    for line in filter(None, d['content']):
+        d['matchdata'].append(parse_line(line))
+    create_nodes(d)
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG)
+    main()
diff --git a/program/everything/input_app.py b/program/everything/input_app.py
new file mode 100644
index 0000000..4117fa5
--- /dev/null
+++ b/program/everything/input_app.py
@@ -0,0 +1,93 @@
+#!/bin/env python
+# -*- coding: utf-8 -*-
+
+from mod_python import apache, util
+import feedparser
+import re
+import urllib
+import os
+
+
+def req_pre_pos(req):
+    req.log_error('handler')
+    req.content_type = 'text/html'
+    req.send_http_header()
+    args = util.FieldStorage(req)
+    req.write(
+        '<html>\n<head>\n'
+        '\t<title>VER: 0.01 - HyperFrontend RSS feed POSTREQUEST</title>\n'
+        '</head>\n<body>\n'
+        '\tThanks submitting:<br />\n'
+        '\t<a href="index.html">Enter new rss feed</a>\n'
+        '<pre>\n'
+        '{}\n'
+        '</pre>\n</body>\n</html>'.format(args))
+    os.chdir('/var/www/py/files')
+    with open('raw_out.txt', 'w') as f:
+        f.write(str(args))
+
+
+def req_pre(req):
+    req.log_error('handler')
+    req.content_type = 'text/html'
+    req.send_http_header()
+    req.write(
+        '<html>\n<head>\n'
+        '\t<title>HyperFrontend RSS feed input</title>\n'
+        '\t<script src="contextmenu_o.js"></script>\n</head>\n<body>\n'
+        '\t\n'
+        '\t\t\n'
+        '\t\t\n'
+        '\t\t\n'
+        '\t\t\n'
+        '\t\t\n'
+        '\tVenue: \n'
+        '\t\t\tFrequency: \n'
+        '\t\t\tDefault location name: \n'
+        '\t\t\tAddress: \n'
+        '\t\t\tWebsite: \n'
+        '\t\t\t\n'
+        '\tSelecteer iets en klik de link\n'
+        '\t\n'
+        '\t\n'
+        '\t\n'
+        '\t\n')
+
+
+def req_post(req):
+    req.write('\t</body>\n</html>\n')
+
+
+def feed2html(req, url, name):
+    url = urllib.unquote(url)
+    url = url if re.match('https?://', url) else 'http://{}'.format(url)
+    req.write(
+        '\tLoading "{}" as<br />\n'
+        '\t"{}"<br />\n'
+        '\t<br />\n'.format(url, name))
+    feed = feedparser.parse(url)
+    # channel = feed.feed
+    req.write('\t<table>\n')
+    req.write('\t\t<tr><th>Title</th><th>Summary</th></tr>\n')
+    for i in feed.entries:
+        req.write(('\t\t<tr><td>{}</td>'
+                   '<td>{}</td></tr>\n').format(
+            i['title'].encode('ascii', 'xmlcharrefreplace'),
+            i['summary'].encode('ascii', 'xmlcharrefreplace')))
+    req.write('\t</table>\n')
+
+
+def handler(req):
+    if req.method == "POST":
+        req_pre_pos(req)
+    else:
+        req_pre(req)
+        args = util.FieldStorage(req)
+        if 'url' not in args and 'name' not in args:
+            req.write('Something went wrong, empty fields?<br />')
+            req.write('<a href="index.html">back</a>')
+        else:
+            feed2html(req, args['url'], args['name'])
+        req_post(req)
+    return apache.OK
diff --git a/program/everything/install.sh b/program/everything/install.sh
new file mode 100755
index 0000000..0e87fd6
--- /dev/null
+++ b/program/everything/install.sh
@@ -0,0 +1,7 @@
+sudo rm -rv /var/www/py/*
+sudo cp -v ./input_app.py /var/www/py
+sudo cp -v ./webdata/*.{xml,html,js} /var/www/py/
+sudo mkdir /var/www/py/files
+sudo chown -vR mart:www-data /var/www/py
+sudo chmod -vR 770 /var/www/py
+ln -s /var/www/py/files/ ./output_data
diff --git a/program/everything/output_data b/program/everything/output_data
new file mode 120000
index 0000000..6550f0b
--- /dev/null
+++ b/program/everything/output_data
@@ -0,0 +1 @@
+/var/www/py/files/
\ No newline at end of file
diff --git a/program/hypfront/contextmenu_o.js b/program/everything/webdata/contextmenu_o.js
similarity index 97%
rename from program/hypfront/contextmenu_o.js
rename to program/everything/webdata/contextmenu_o.js
index feb3135..8b0cc05 100644
--- a/program/hypfront/contextmenu_o.js
+++ b/program/everything/webdata/contextmenu_o.js
@@ -72,10 +72,14 @@ function stylizeHighlightedString(range, col)
     range.insertNode(span);
 }
 
-function f_wanneer() {
+function f_wann_tijd() {
     stylizeHighlightedString(selection, "red")
 }
 
+function f_wann_dat() {
+    stylizeHighlightedString(selection, "darkred")
+}
+
 function f_wat() {
     stylizeHighlightedString(selection, "green")
 }
@@ -84,6 +88,7 @@ function f_waar() {
     stylizeHighlightedString(selection, "blue")
 }
 
+
 function post_all() {
     var xmlhttp = new XMLHttpRequest();
     xmlhttp.onreadystatechange=function()
diff --git a/program/hypfront/index.html b/program/everything/webdata/index.html
similarity index 88%
rename from program/hypfront/index.html
rename to program/everything/webdata/index.html
index 788eeee..00ce528 100644
--- a/program/hypfront/index.html
+++ b/program/everything/webdata/index.html
@@ -2,7 +2,7 @@
 
 
-
+ diff --git a/program/hypfront/paradiso.rss.xml b/program/everything/webdata/paradiso.rss.xml similarity index 100% rename from program/hypfront/paradiso.rss.xml rename to program/everything/webdata/paradiso.rss.xml diff --git a/program/hypconvert/hypconvert.py b/program/hypconvert/hypconvert.py index 34548ff..81193c4 100644 --- a/program/hypconvert/hypconvert.py +++ b/program/hypconvert/hypconvert.py @@ -1,3 +1,4 @@ +#!/bin/env python # -*- coding: utf-8 -*- import ast diff --git a/program/hypfront/hyper.py b/program/hypfront/hyper.py deleted file mode 100644 index 78e2a53..0000000 --- a/program/hypfront/hyper.py +++ /dev/null @@ -1,115 +0,0 @@ -#!/bin/env python -# -*- coding: utf-8 -*- - -from mod_python import apache, util -import feedparser -import re -import urllib -import subprocess -import os - -def req_pre_pos(req): - req.log_error('handler') - req.content_type = 'text/html' - req.send_http_header() - args = util.FieldStorage(req) - req.write("""\ - - - VER: 0.01 - HyperFrontend RSS feed POSTREQUEST - - - Thanks submitting:
- Enter new rss feed -
-{}
-        
- - -""".format(args)) - os.chdir('/var/www/py/files') - with open('raw_out.txt', 'w') as f: - f.write(str(args)) - - -def req_pre(req): - req.log_error('handler') - req.content_type = 'text/html' - req.send_http_header() - req.write("""\ - - - HyperFrontend RSS feed input - - - - -

RSS URL:

- - - - - -
Venue: -
Frequency: -
Default location name: -
Adress: -
Website: -
- - Selecteer iets en klik de link
- - - -
- -
- -""") - - -def req_post(req): - req.write("""\ - - - -""") - - -def feed2html(req, url, name): - url = urllib.unquote(url) - url = url if re.match('https?://', url) else 'http://{}'.format(url) - req.write( - '\tLoading "{}" as

"{}"


\n'.format(url, name)) - feed = feedparser.parse(url) - channel = feed.feed - req.write('\t\n') - req.write('\t\t\n') - for i in feed.entries: - req.write('\t\t\n'. - format(i['title'].encode('ascii', 'xmlcharrefreplace'), - i['summary'].encode('ascii', 'xmlcharrefreplace'))) - req.write('\t
TitleSummary
{}{}
\n
') - - -def handler(req): - if req.method == "POST": - req_pre_pos(req) - else: - req_pre(req) - args = util.FieldStorage(req) - if 'url' not in args and 'name' not in args: - req.write('Something went wrong, empty fields?
')
-            req.write('back')
-    else:
-        feed2html(req, args['url'], args['name'])
-    req_post(req)
-    return apache.OK
diff --git a/program/hypfront/install.sh b/program/hypfront/install.sh
deleted file mode 100644
index ab68ea4..0000000
--- a/program/hypfront/install.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-sudo rm -rv /var/www/py/*
-sudo cp -v ./*.{xml,py,html,js} /var/www/py/
-#sudo chown -vR www-data:www-data /var/www/py
-sudo mkdir /var/www/py/files
-sudo chown -R mart:www-data /var/www/py
-sudo chmod -R 770 /var/www/py
-ln -s /var/www/py/files ../output
diff --git a/thesis/methods.tex b/thesis/methods.tex
index 05bc2ed..475d80b 100644
--- a/thesis/methods.tex
+++ b/thesis/methods.tex
@@ -6,58 +6,8 @@ application and compiles them into computer interpretable patterns and the
 crawler interprets the patterns and visits the sources from time to time to
 extract the information.
 
-\section{Input application}
-The purpose of the input application is to define the patterns together with
-the user so that the information can be transferred to the data processing
-application.
-The user input all goes through the familiar interface of the user's preferred
-web browser. By visiting the crawler's train website the user can specify the
-metadata of the source it wants to be periodically crawled through simple web
-forms as seen in figure~\ref{fig:mf1}
-\begin{figure}[H]
-    \centering
-    \caption{Webforms for source metadata}
-    \label{fig:mf1}
-    \includegraphics[width=80mm]{./img/img1.png}
-\end{figure}
+\input{methods_input.tex}
 
-\section{Data processing application}
-\subsection{Directed acyclic graphs and finite state automata} Directed acyclic
-graphs(DAG) and finite state automata(FSA) have a lot in common concerning
-pattern recognition and information extraction. By feeding words\footnote{A
-word is a finite combination of letters from the graphs alphabet, thus a word
-is not limited to linguistic words but can be anything as long as the
-components are in the graphs alphabet} into an algorithm a DAG can be generated
-so that it matches certain patters present in the given words.
-Figure~\ref{fig:mg1} for example shows a FSA that matches on the words
-\textit{ab} and \textit{ac}.
-\begin{figure}[H]
-    \centering
-    \caption{Example DAG/FSA}
-    \label{fig:mg1}
-    \includegraphics[width=15mm]{./dots/graph1.png}
-\end{figure}
+\input{methods_data.tex}
 
-With this FSA we can test if a word fits to the constraints it the FSA
-describes. And with a little adaptation we can extract dynamic information from
-semi-structured data.\\
-
-
-
-\subsection{Back to DAG's and FSA's}
-Nodes in this data structure can be single letters but also bigger
-constructions. The example in Figure~\ref{fig:mg2} describes different
-separator pattern for event data with its three component: what, when, where.
-In this example the nodes with the labels \textit{what, when, where} can also
-be complete subgrahps. In this way data on a larger level can be using the
-NIP markings and data within the categories can be processed autonomously.
-\begin{figure}[H]
-    \centering
-    \caption{Example event data}
-    \label{fig:mg2}
-    \includegraphics[width=\linewidth]{./dots/graph2.png}
-\end{figure}
-
-\subsection{Algorithm}
-
-\section{Crawler application}
+\input{methods_crawl.tex}
diff --git a/thesis/methods_crawl.tex b/thesis/methods_crawl.tex
new file mode 100644
index 0000000..167d233
--- /dev/null
+++ b/thesis/methods_crawl.tex
@@ -0,0 +1,7 @@
+\section{Crawler application}
+The crawler application is the program that periodically visits the sources
+and extracts the information that conforms to the given patterns. When the
+crawler fails to extract information from a website it sends a message to a
+system administrator, who can either add the entry to the training set or
+retrain the entire network with new data because the internal structure of
+the visited source has changed.
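The crawl-and-notify loop described in methods_crawl.tex could look roughly like the sketch below. This is only an illustration: the names crawl_once, run, extract and notify_admin are placeholders and are not part of this patch (crawler.py is still empty at this point), and the fixed sleep interval stands in for whatever scheduling the thesis ends up using.

    # Illustrative sketch only: extract() and notify_admin() are placeholders
    # for the real pattern matcher and the administrator notification.
    import logging
    import time


    def crawl_once(sources, extract, notify_admin):
        for source in sources:
            try:
                entries = extract(source)
                logging.info('extracted %d entries from %s', len(entries), source)
            except Exception:
                # The stored pattern no longer matches; ask an administrator to
                # add the entry to the training set or to retrain with new data.
                logging.exception('extraction failed for %s', source)
                notify_admin(source)


    def run(sources, extract, notify_admin, interval=86400):
        while True:
            crawl_once(sources, extract, notify_admin)
            time.sleep(interval)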
diff --git a/thesis/methods_data.tex b/thesis/methods_data.tex
new file mode 100644
index 0000000..9b16004
--- /dev/null
+++ b/thesis/methods_data.tex
@@ -0,0 +1,40 @@
+\section{Data processing application}
+The data processing application takes the input from the input application
+and converts it to crawler rules. It is triggered after a user is done with
+the input application and creates crawler patterns so that the crawler can
+periodically and systematically extract the information from the sources.
+
+\subsection{Directed acyclic graphs and finite state automata}
+Directed acyclic graphs (DAG) and finite state automata (FSA) have a lot in
+common concerning pattern recognition and information extraction. By feeding
+words\footnote{A word is a finite combination of letters from the graph's
+alphabet; a word is thus not limited to linguistic words but can be anything
+as long as its components are in the graph's alphabet.} into an algorithm a
+DAG can be generated so that it matches certain patterns present in the given
+words. Figure~\ref{fig:mg1} for example shows an FSA that matches the words
+\textit{ab} and \textit{ac}.
+\begin{figure}[H]
+    \centering
+    \caption{Example DAG/FSA}
+    \label{fig:mg1}
+    \includegraphics[width=15mm]{./dots/graph1.png}
+\end{figure}
+
+With this FSA we can test whether a word fits the constraints that the FSA
+describes, and with a little adaptation we can extract dynamic information
+from semi-structured data.
+
+\subsection{Back to DAGs and FSAs}
+Nodes in this data structure can be single letters but also bigger
+constructions. The example in Figure~\ref{fig:mg2} describes different
+separator patterns for event data with its three components: what, when,
+where. In this example the nodes with the labels \textit{what, when, where}
+can also be complete subgraphs. In this way data on a larger level can be
+processed using the NIP markings, and data within the categories can be
+processed autonomously.
+\begin{figure}[H]
+    \centering
+    \caption{Example event data}
+    \label{fig:mg2}
+    \includegraphics[width=\linewidth]{./dots/graph2.png}
+\end{figure}
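The automaton of figure mg1 (it accepts exactly the words ab and ac) can be made concrete with a few lines of Python. The dictionary encoding, the state names q0-q2 and the accepts() helper below are illustrative only; they show the matching step, not the pattern-generation algorithm of the data processing application described above.

    # Hand-built automaton for figure mg1: it accepts exactly 'ab' and 'ac'.
    TRANSITIONS = {
        ('q0', 'a'): 'q1',
        ('q1', 'b'): 'q2',
        ('q1', 'c'): 'q2',
    }
    ACCEPTING = ['q2']


    def accepts(word, state='q0'):
        for letter in word:
            state = TRANSITIONS.get((state, letter))
            if state is None:
                return False
        return state in ACCEPTING


    if __name__ == '__main__':
        for word in ['ab', 'ac', 'ad', 'abc']:
            print '{}: {}'.format(word, accepts(word))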
diff --git a/thesis/methods_input.tex b/thesis/methods_input.tex
new file mode 100644
index 0000000..40c8e3b
--- /dev/null
+++ b/thesis/methods_input.tex
@@ -0,0 +1,14 @@
+\section{Input application}
+The purpose of the input application is to define the patterns together with
+the user so that the information can be transferred to the data processing
+application.
+All user input goes through the familiar interface of the user's preferred
+web browser. By visiting the crawler's training website the user can specify,
+through simple web forms, the metadata of the source that has to be crawled
+periodically, as seen in Figure~\ref{fig:mf1}.
+\begin{figure}[H]
+    \centering
+    \caption{Webforms for source metadata}
+    \label{fig:mf1}
+    \includegraphics[width=80mm]{./img/img1.png}
+\end{figure}
-- 
2.20.1