first commit
authorMart Lubbers <mart@martlubbers.net>
Wed, 30 Apr 2014 18:50:17 +0000 (20:50 +0200)
committerMart Lubbers <mart@martlubbers.net>
Wed, 30 Apr 2014 18:50:17 +0000 (20:50 +0200)
28 files changed:
docs/BSc-Thesis-RR-12-13-final.pdf [new file with mode: 0755]
log/2014-04-08.txt [new file with mode: 0644]
log/2014-04-09.txt [new file with mode: 0644]
log/log.bash [new file with mode: 0755]
log/skeleton [new file with mode: 0644]
program/example_crawldata [new file with mode: 0644]
program/example_structured.xml [new file with mode: 0644]
program/hypconvert/hypconvert.py [new file with mode: 0644]
program/hypcrawl/hypcrawl.py [new file with mode: 0644]
proposal/Makefile [new file with mode: 0644]
proposal/proposal.aux [new file with mode: 0644]
proposal/proposal.bbl [new file with mode: 0644]
proposal/proposal.bib [new file with mode: 0755]
proposal/proposal.blg [new file with mode: 0644]
proposal/proposal.dvi [new file with mode: 0644]
proposal/proposal.log [new file with mode: 0644]
proposal/proposal.out [new file with mode: 0644]
proposal/proposal.pdf [new file with mode: 0644]
proposal/proposal.tex [new file with mode: 0755]
proposal/proposal.toc [new file with mode: 0644]
softwaredesign/classdiagram/Makefile [new file with mode: 0644]
softwaredesign/classdiagram/class.dot [new file with mode: 0644]
softwaredesign/classdiagram/classdiagram.pdf [new file with mode: 0644]
softwaredesign/workflow.txt [new file with mode: 0644]
thesis/Makefile [new file with mode: 0644]
thesis/abstract.tex [new file with mode: 0644]
thesis/introduction.tex [new file with mode: 0644]
thesis/thesis.tex [new file with mode: 0644]

diff --git a/docs/BSc-Thesis-RR-12-13-final.pdf b/docs/BSc-Thesis-RR-12-13-final.pdf
new file mode 100755 (executable)
index 0000000..fcf92a0
Binary files /dev/null and b/docs/BSc-Thesis-RR-12-13-final.pdf differ
diff --git a/log/2014-04-08.txt b/log/2014-04-08.txt
new file mode 100644 (file)
index 0000000..88d5fab
--- /dev/null
@@ -0,0 +1,24 @@
+TIME: 3
+
+Meeting with Alessandro and discussed with Jan about the project scope.
+
+Worst case: an RSS feed crawler that can be trained by non-IT users. Best
+case: websites are parseable as well.
+
+PLANS
+=====
+literature research, compare programming languages, python, php/javascript. 
+Server of HL has python. Crawler is going to be python for sure.
+
+So basically there are three components:
+- Frontend
+       The frontend is the user interface for the non IT user and is probably a
+       plugin for chrome or firefox. This generates a scheme which is parseable by 
+       the     crawler.
+- Crawler
+       The crawler periodically crawls the sites/feeds using the generated schemes
+       and notifies the admins if there is a change in layout. The crawler 
+       generates xml that is later parsed by the backend.
+- Backend
+       The backend is not within the scope of this project but it will parse the 
+       xml given by the crawler.
diff --git a/log/2014-04-09.txt b/log/2014-04-09.txt
new file mode 100644 (file)
index 0000000..617d407
--- /dev/null
@@ -0,0 +1,6 @@
+TIME: 3
+
+Created the log files and looked up some info about php and javascript. 
+Probably choosing python anyway. XML scheme draft written for the output.
+
+Created skeleton for the thesis
diff --git a/log/log.bash b/log/log.bash
new file mode 100755 (executable)
index 0000000..4296c09
--- /dev/null
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Open today's log entry in vi, creating it from the skeleton on first use.
+# Paths are relative, so run this from the log/ directory.
+file="$(date +"%Y-%m-%d").txt"
+if [ ! -f "$file" ]
+then
+       cp skeleton "$file"
+fi
+vi "$file"
diff --git a/log/skeleton b/log/skeleton
new file mode 100644 (file)
index 0000000..0ba02e1
--- /dev/null
@@ -0,0 +1,3 @@
+TIME: 
+
+
diff --git a/program/example_crawldata b/program/example_crawldata
new file mode 100644 (file)
index 0000000..76653ac
--- /dev/null
@@ -0,0 +1,8 @@
+name   Paradiso
+
+       uri     http://www.paradiso.nl/rss.xml
+       freq    1d # SOME COMMENT
+# SOME OTHER COMMENT
+
+name   fake name
+       uri     http://test.net
diff --git a/program/example_structured.xml b/program/example_structured.xml
new file mode 100644 (file)
index 0000000..5844096
--- /dev/null
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<crawlerdata xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+       xsi:noNamespaceSchemaLocation="crawlerdata_scheme.xsd">
+       <entry>
+               <date>2014-04-02T11:30:10+01:00</date>
+               <venue>Doornroosje</venue>
+               <url type="site">http://doornroosje.com/agenda/nile</url>
+               <url type="video">http://youtube.com/video</url>                
+       </entry>
+       <entry>
+               <date>2014-04-03T12:00:00+01:00</date>
+               <venue>Paradiso</venue>
+       </entry>
+</crawlerdata>
diff --git a/program/hypconvert/hypconvert.py b/program/hypconvert/hypconvert.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/program/hypcrawl/hypcrawl.py b/program/hypcrawl/hypcrawl.py
new file mode 100644 (file)
index 0000000..04fffb3
--- /dev/null
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Read hypcrawl crawl configuration files."""
+
+import codecs
+import itertools as it
+import re
+import sys
+
+
+def parseconfig(filepath, enc='utf-8', com='#'):
+    """Parse a crawl configuration file into a list of entries.
+
+    Every setting is a line holding a key, a tab and a value.  A ``name``
+    line starts a new entry; ``uri`` and ``freq`` lines fill in the entry
+    started last.  Text from a comment character to the end of the line is
+    ignored, as are lines that do not look like a setting.  Settings with an
+    unknown key, a malformed value or no preceding ``name`` are reported on
+    stdout and skipped.
+
+    filepath -- path of the configuration file, or '-' to read stdin
+    enc      -- encoding used to decode the file (not applied to stdin)
+    com      -- comment marker; each character in it starts a comment
+
+    Returns a list of dicts, one per ``name`` line.  A parsed setting is
+    stored as the tuple of groups captured by its validator, for example
+    ``('http', 'test.net')`` for a uri; a setting that was not given keeps
+    the plain string default from the skeleton.
+    """
+    # The key may not contain a comment character, so commented-out settings
+    # are ignored; the value stops before trailing blanks and comments.
+    marker = re.escape(com)
+    pattern = re.compile(
+        r'^\s*(?P<k>[^\s{0}]+)\t(?P<v>[^{0}]*[^{0}\s])'.format(marker))
+    skeleton = {'uri': '', 'type': u'RSS', 'freq': u'1d'}
+    tests = {
+        u'name': re.compile(r'(.*)'),
+        # NOTE(review): 'm' is listed twice and matching ignores case, so
+        # minutes and months cannot be told apart -- confirm intended units.
+        u'freq': re.compile(r'([0-9]*\.?[0-9]*)([smhdwmy])', re.I),
+        # Capture everything after the scheme, not just its first character.
+        u'uri': re.compile(r'(rss|mail|https?)://(\S+)'),
+        }
+    inp = sys.stdin if filepath == '-' else codecs.open(filepath, 'r', enc)
+    entries = []
+    try:
+        for line in (pattern.search(raw) for raw in inp):
+            if line is None:
+                continue  # blank line, comment or otherwise not a setting
+            key, text = line.group('k'), line.group('v')
+            if key not in tests:
+                print('{}: unknown key, skipping'.format(key))
+                continue
+            if key == 'name':
+                entries.append(skeleton.copy())
+            elif not entries:
+                print('{}:{} found before the first name, skipping'.
+                      format(key, text))
+                continue
+            value = tests[key].match(text)
+            if value is None:
+                print('{}:{} not properly formatted, skipping'.
+                      format(key, text))
+            else:
+                entries[-1][key] = value.groups()
+    finally:
+        # Close the file even if reading or decoding fails; stdin stays open.
+        if inp is not sys.stdin:
+            inp.close()
+    return entries
+
+
+def generatecron(config, output='-'):
+    """Placeholder: not implemented yet, always raises NotImplementedError."""
+    raise NotImplementedError
+
+
+def crawlentry(entry):
+    """Placeholder: not implemented yet, always raises NotImplementedError."""
+    raise NotImplementedError
+
+
+if __name__ == '__main__':
+    # Parse the file given on the command line, or else the bundled example.
+    print(parseconfig(sys.argv[1] if len(sys.argv) > 1
+                      else '../example_crawldata'))
diff --git a/proposal/Makefile b/proposal/Makefile
new file mode 100644 (file)
index 0000000..a86dd0c
--- /dev/null
@@ -0,0 +1,16 @@
+# Build proposal.pdf from proposal.tex via latex, bibtex and dvipdfm.
+.PHONY: all proposal clean
+
+all: proposal
+
+# bibtex needs the .aux of a first latex run; two more latex runs are then
+# needed to pull in the .bbl and to resolve the citations and references.
+proposal:
+       latex proposal.tex
+       bibtex proposal.aux
+       latex proposal.tex
+       latex proposal.tex
+       dvipdfm proposal.dvi
+
+clean:
+       rm -vf *.aux *.bbl *.blg *.dvi *.log *.out *.pdf *.toc
diff --git a/proposal/proposal.aux b/proposal/proposal.aux
new file mode 100644 (file)
index 0000000..5def743
--- /dev/null
@@ -0,0 +1,29 @@
+\relax 
+\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
+\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
+\global\let\oldcontentsline\contentsline
+\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}}
+\global\let\oldnewlabel\newlabel
+\gdef\newlabel#1#2{\newlabelxx{#1}#2}
+\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
+\AtEndDocument{\ifx\hyper@anchor\@undefined
+\let\contentsline\oldcontentsline
+\let\newlabel\oldnewlabel
+\fi}
+\fi}
+\global\let\hyper@last\relax 
+\gdef\HyperFirstAtBeginDocument#1{#1}
+\providecommand*\HyPL@Entry[1]{}
+\HyPL@Entry{0<</S/D>>}
+\citation{Roelofs2009}
+\@writefile{toc}{\contentsline {section}{\numberline {1}Supervisors}{2}{section.1}}
+\@writefile{toc}{\contentsline {section}{\numberline {2}Abstract\relax \fontsize  {5}{6}\selectfont  73 words}{2}{section.2}}
+\@writefile{toc}{\contentsline {section}{\numberline {3}Project Description\relax \fontsize  {5}{6}\selectfont  484 words}{2}{section.3}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Research Question and Motivation}{2}{subsection.3.1}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Aim}{2}{subsection.3.2}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Research Plan and Schedule}{2}{subsection.3.3}}
+\bibstyle{ieeetr}
+\bibdata{proposal}
+\bibcite{Roelofs2009}{1}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.4}Weekly planning}{3}{subsection.3.4}}
+\@writefile{toc}{\contentsline {section}{\numberline {4}Scientific relevance\relax \fontsize  {5}{6}\selectfont  52 words}{3}{section.4}}
diff --git a/proposal/proposal.bbl b/proposal/proposal.bbl
new file mode 100644 (file)
index 0000000..56b5abc
--- /dev/null
@@ -0,0 +1,8 @@
+\begin{thebibliography}{1}
+
+\bibitem{Roelofs2009}
+W.~Roelofs, A.~T. Paula, and F.~Grootjen, ``{Programming by Clicking},'' in
+  {\em Proceedings of the Dutch Information Retrieval Conference}, pp.~2--3,
+  2009.
+
+\end{thebibliography}
diff --git a/proposal/proposal.bib b/proposal/proposal.bib
new file mode 100755 (executable)
index 0000000..264802f
--- /dev/null
@@ -0,0 +1,9 @@
+@inproceedings{Roelofs2009,
+author = {Roelofs, Wouter and Paula, Alessandro Tadeo and Grootjen, Franc},
+booktitle = {Proceedings of the Dutch Information Retrieval Conference},
+file = {:C$\backslash$:/Users/mart/Downloads/dir09.pdf:pdf},
+keywords = {levenshtein matching,subtree matching,web crawler},
+pages = {2--3},
+title = {{Programming by Clicking}},
+year = {2009}
+}
diff --git a/proposal/proposal.blg b/proposal/proposal.blg
new file mode 100644 (file)
index 0000000..afb404d
--- /dev/null
@@ -0,0 +1,46 @@
+This is BibTeX, Version 0.99d (TeX Live 2012/Debian)
+Capacity: max_strings=35307, hash_size=35307, hash_prime=30011
+The top-level auxiliary file: proposal.aux
+The style file: ieeetr.bst
+Database file #1: proposal.bib
+You've used 1 entry,
+            1876 wiz_defined-function locations,
+            483 strings with 3641 characters,
+and the built_in function-call counts, 305 in all, are:
+= -- 26
+> -- 11
+< -- 0
++ -- 4
+- -- 3
+* -- 21
+:= -- 49
+add.period$ -- 1
+call.type$ -- 1
+change.case$ -- 1
+chr.to.int$ -- 0
+cite$ -- 1
+duplicate$ -- 17
+empty$ -- 32
+format.name$ -- 3
+if$ -- 71
+int.to.chr$ -- 0
+int.to.str$ -- 1
+missing$ -- 1
+newline$ -- 6
+num.names$ -- 1
+pop$ -- 7
+preamble$ -- 1
+purify$ -- 0
+quote$ -- 0
+skip$ -- 9
+stack$ -- 0
+substring$ -- 16
+swap$ -- 5
+text.length$ -- 0
+text.prefix$ -- 0
+top$ -- 0
+type$ -- 0
+warning$ -- 0
+while$ -- 4
+width$ -- 2
+write$ -- 11
diff --git a/proposal/proposal.dvi b/proposal/proposal.dvi
new file mode 100644 (file)
index 0000000..6fbdc10
Binary files /dev/null and b/proposal/proposal.dvi differ
diff --git a/proposal/proposal.log b/proposal/proposal.log
new file mode 100644 (file)
index 0000000..00feb94
--- /dev/null
@@ -0,0 +1,274 @@
+This is pdfTeX, Version 3.1415926-2.4-1.40.13 (TeX Live 2012/Debian) (format=latex 2014.2.6)  8 APR 2014 13:21
+entering extended mode
+ restricted \write18 enabled.
+ %&-line parsing enabled.
+**proposal.tex
+(./proposal.tex
+LaTeX2e <2011/06/27>
+Babel <v3.8m> and hyphenation patterns for english, dumylang, nohyphenation, et
+hiopic, farsi, arabic, pinyin, croatian, bulgarian, ukrainian, russian, slovak,
+ czech, danish, dutch, usenglishmax, ukenglish, finnish, french, basque, ngerma
+n, german, swissgerman, ngerman-x-2012-05-30, german-x-2012-05-30, monogreek, g
+reek, ibycus, ancientgreek, hungarian, bengali, tamil, hindi, telugu, gujarati,
+ sanskrit, malayalam, kannada, assamese, marathi, oriya, panjabi, italian, lati
+n, latvian, lithuanian, mongolian, mongolianlmc, nynorsk, bokmal, indonesian, e
+speranto, coptic, welsh, irish, interlingua, serbian, serbianc, slovenian, friu
+lan, romansh, estonian, romanian, armenian, uppersorbian, turkish, afrikaans, i
+celandic, kurmanji, polish, portuguese, galician, catalan, spanish, swedish, th
+ai, loaded.
+(/usr/share/texlive/texmf-dist/tex/latex/base/article.cls
+Document Class: article 2007/10/19 v1.4h Standard LaTeX document class
+(/usr/share/texlive/texmf-dist/tex/latex/base/size10.clo
+File: size10.clo 2007/10/19 v1.4h Standard LaTeX file (size option)
+)
+\c@part=\count79
+\c@section=\count80
+\c@subsection=\count81
+\c@subsubsection=\count82
+\c@paragraph=\count83
+\c@subparagraph=\count84
+\c@figure=\count85
+\c@table=\count86
+\abovecaptionskip=\skip41
+\belowcaptionskip=\skip42
+\bibindent=\dimen102
+)
+(/usr/share/texlive/texmf-dist/tex/latex/hyperref/hyperref.sty
+Package: hyperref 2012/05/13 v6.82q Hypertext links for LaTeX
+
+(/usr/share/texlive/texmf-dist/tex/generic/oberdiek/hobsub-hyperref.sty
+Package: hobsub-hyperref 2012/05/28 v1.13 Bundle oberdiek, subset hyperref (HO)
+
+
+(/usr/share/texlive/texmf-dist/tex/generic/oberdiek/hobsub-generic.sty
+Package: hobsub-generic 2012/05/28 v1.13 Bundle oberdiek, subset generic (HO)
+Package: hobsub 2012/05/28 v1.13 Construct package bundles (HO)
+Package: infwarerr 2010/04/08 v1.3 Providing info/warning/error messages (HO)
+Package: ltxcmds 2011/11/09 v1.22 LaTeX kernel commands for general use (HO)
+Package: ifluatex 2010/03/01 v1.3 Provides the ifluatex switch (HO)
+Package ifluatex Info: LuaTeX not detected.
+Package: ifvtex 2010/03/01 v1.5 Detect VTeX and its facilities (HO)
+Package ifvtex Info: VTeX not detected.
+Package: intcalc 2007/09/27 v1.1 Expandable calculations with integers (HO)
+Package: ifpdf 2011/01/30 v2.3 Provides the ifpdf switch (HO)
+Package ifpdf Info: pdfTeX in PDF mode is not detected.
+Package: etexcmds 2011/02/16 v1.5 Avoid name clashes with e-TeX commands (HO)
+Package etexcmds Info: Could not find \expanded.
+(etexcmds)             That can mean that you are not using pdfTeX 1.50 or
+(etexcmds)             that some package has redefined \expanded.
+(etexcmds)             In the latter case, load this package earlier.
+Package: kvsetkeys 2012/04/25 v1.16 Key value parser (HO)
+Package: kvdefinekeys 2011/04/07 v1.3 Define keys (HO)
+Package: pdftexcmds 2011/11/29 v0.20 Utility functions of pdfTeX for LuaTeX (HO
+)
+Package pdftexcmds Info: LuaTeX not detected.
+Package pdftexcmds Info: \pdf@primitive is available.
+Package pdftexcmds Info: \pdf@ifprimitive is available.
+Package pdftexcmds Info: \pdfdraftmode is ignored in DVI mode.
+Package: pdfescape 2011/11/25 v1.13 Implements pdfTeX's escape features (HO)
+Package: bigintcalc 2012/04/08 v1.3 Expandable calculations on big integers (HO
+)
+Package: bitset 2011/01/30 v1.1 Handle bit-vector datatype (HO)
+Package: uniquecounter 2011/01/30 v1.2 Provide unlimited unique counter (HO)
+)
+Package hobsub Info: Skipping package `hobsub' (already loaded).
+Package: letltxmacro 2010/09/02 v1.4 Let assignment for LaTeX macros (HO)
+Package: hopatch 2012/05/28 v1.2 Wrapper for package hooks (HO)
+Package: xcolor-patch 2011/01/30 xcolor patch
+Package: atveryend 2011/06/30 v1.8 Hooks at the very end of document (HO)
+Package atveryend Info: \enddocument detected (standard20110627).
+Package: atbegshi 2011/10/05 v1.16 At begin shipout hook (HO)
+Package: refcount 2011/10/16 v3.4 Data extraction from label references (HO)
+Package: hycolor 2011/01/30 v1.7 Color options for hyperref/bookmark (HO)
+)
+(/usr/share/texlive/texmf-dist/tex/latex/graphics/keyval.sty
+Package: keyval 1999/03/16 v1.13 key=value parser (DPC)
+\KV@toks@=\toks14
+)
+(/usr/share/texlive/texmf-dist/tex/generic/ifxetex/ifxetex.sty
+Package: ifxetex 2010/09/12 v0.6 Provides ifxetex conditional
+)
+(/usr/share/texlive/texmf-dist/tex/latex/oberdiek/kvoptions.sty
+Package: kvoptions 2011/06/30 v3.11 Key value format for package options (HO)
+)
+\@linkdim=\dimen103
+\Hy@linkcounter=\count87
+\Hy@pagecounter=\count88
+
+(/usr/share/texlive/texmf-dist/tex/latex/hyperref/pd1enc.def
+File: pd1enc.def 2012/05/13 v6.82q Hyperref: PDFDocEncoding definition (HO)
+)
+\Hy@SavedSpaceFactor=\count89
+
+(/usr/share/texlive/texmf-dist/tex/latex/latexconfig/hyperref.cfg
+File: hyperref.cfg 2002/06/06 v1.2 hyperref configuration of TeXLive
+)
+Package hyperref Info: Hyper figures OFF on input line 4062.
+Package hyperref Info: Link nesting OFF on input line 4067.
+Package hyperref Info: Hyper index ON on input line 4070.
+Package hyperref Info: Plain pages OFF on input line 4077.
+Package hyperref Info: Backreferencing OFF on input line 4082.
+Package hyperref Info: Implicit mode ON; LaTeX internals redefined.
+Package hyperref Info: Bookmarks ON on input line 4300.
+\c@Hy@tempcnt=\count90
+
+(/usr/share/texlive/texmf-dist/tex/latex/url/url.sty
+\Urlmuskip=\muskip10
+Package: url 2006/04/12  ver 3.3  Verb mode for urls, etc.
+)
+LaTeX Info: Redefining \url on input line 4653.
+\Fld@menulength=\count91
+\Field@Width=\dimen104
+\Fld@charsize=\dimen105
+Package hyperref Info: Hyper figures OFF on input line 5773.
+Package hyperref Info: Link nesting OFF on input line 5778.
+Package hyperref Info: Hyper index ON on input line 5781.
+Package hyperref Info: backreferencing OFF on input line 5788.
+Package hyperref Info: Link coloring OFF on input line 5793.
+Package hyperref Info: Link coloring with OCG OFF on input line 5798.
+Package hyperref Info: PDF/A mode OFF on input line 5803.
+LaTeX Info: Redefining \ref on input line 5843.
+LaTeX Info: Redefining \pageref on input line 5847.
+\Hy@abspage=\count92
+\c@Item=\count93
+\c@Hfootnote=\count94
+)
+
+Package hyperref Message: Driver: hdvipdfm.
+
+(/usr/share/texlive/texmf-dist/tex/latex/hyperref/hdvipdfm.def
+File: hdvipdfm.def 2012/05/13 v6.82q Hyperref driver for dvipdfm
+\pdfm@box=\box26
+\c@Hy@AnnotLevel=\count95
+\HyField@AnnotCount=\count96
+\Fld@listcount=\count97
+\c@bookmark@seq@number=\count98
+
+(/usr/share/texlive/texmf-dist/tex/latex/oberdiek/rerunfilecheck.sty
+Package: rerunfilecheck 2011/04/15 v1.7 Rerun checks for auxiliary files (HO)
+Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 2
+82.
+)
+\Hy@SectionHShift=\skip43
+)
+(/usr/share/texlive/texmf-dist/tex/latex/tools/calc.sty
+Package: calc 2007/08/22 v4.3 Infix arithmetic (KKT,FJ)
+\calc@Acount=\count99
+\calc@Bcount=\count100
+\calc@Adimen=\dimen106
+\calc@Bdimen=\dimen107
+\calc@Askip=\skip44
+\calc@Bskip=\skip45
+LaTeX Info: Redefining \setlength on input line 76.
+LaTeX Info: Redefining \addtolength on input line 77.
+\calc@Ccount=\count101
+\calc@Cskip=\skip46
+)
+(/usr/share/texlive/texmf-dist/tex/latex/preprint/fullpage.sty
+Package: fullpage 1999/02/23 1.1 (PWD)
+\FP@margin=\skip47
+) (./proposal.aux)
+\openout1 = `proposal.aux'.
+
+LaTeX Font Info:    Checking defaults for OML/cmm/m/it on input line 13.
+LaTeX Font Info:    ... okay on input line 13.
+LaTeX Font Info:    Checking defaults for T1/cmr/m/n on input line 13.
+LaTeX Font Info:    ... okay on input line 13.
+LaTeX Font Info:    Checking defaults for OT1/cmr/m/n on input line 13.
+LaTeX Font Info:    ... okay on input line 13.
+LaTeX Font Info:    Checking defaults for OMS/cmsy/m/n on input line 13.
+LaTeX Font Info:    ... okay on input line 13.
+LaTeX Font Info:    Checking defaults for OMX/cmex/m/n on input line 13.
+LaTeX Font Info:    ... okay on input line 13.
+LaTeX Font Info:    Checking defaults for U/cmr/m/n on input line 13.
+LaTeX Font Info:    ... okay on input line 13.
+LaTeX Font Info:    Checking defaults for PD1/pdf/m/n on input line 13.
+LaTeX Font Info:    ... okay on input line 13.
+\AtBeginShipoutBox=\box27
+Package hyperref Info: Link coloring OFF on input line 13.
+ (/usr/share/texlive/texmf-dist/tex/latex/hyperref/nameref.sty
+Package: nameref 2010/04/30 v2.40 Cross-referencing by name of section
+
+(/usr/share/texlive/texmf-dist/tex/generic/oberdiek/gettitlestring.sty
+Package: gettitlestring 2010/12/03 v1.4 Cleanup title references (HO)
+)
+\c@section@level=\count102
+)
+LaTeX Info: Redefining \ref on input line 13.
+LaTeX Info: Redefining \pageref on input line 13.
+LaTeX Info: Redefining \nameref on input line 13.
+
+(./proposal.out) (./proposal.out)
+\@outlinefile=\write3
+\openout3 = `proposal.out'.
+
+LaTeX Font Info:    External font `cmex10' loaded for size
+(Font)              <12> on input line 15.
+LaTeX Font Info:    External font `cmex10' loaded for size
+(Font)              <8> on input line 15.
+LaTeX Font Info:    External font `cmex10' loaded for size
+(Font)              <6> on input line 15.
+ (./proposal.toc
+LaTeX Font Info:    External font `cmex10' loaded for size
+(Font)              <7> on input line 4.
+LaTeX Font Info:    External font `cmex10' loaded for size
+(Font)              <5> on input line 4.
+)
+\tf@toc=\write4
+\openout4 = `proposal.toc'.
+
+ [1
+
+]
+
+LaTeX Warning: Citation `Roelofs2009' on page 2 undefined on input line 62.
+
+LaTeX Font Info:    Try loading font information for OMS+cmr on input line 71.
+(/usr/share/texlive/texmf-dist/tex/latex/base/omscmr.fd
+File: omscmr.fd 1999/05/25 v2.5h Standard LaTeX font definitions
+)
+LaTeX Font Info:    Font shape `OMS/cmr/m/n' in size <10> not available
+(Font)              Font shape `OMS/cmsy/m/n' tried instead on input line 71.
+ [2]
+Overfull \hbox (3.55562pt too wide) in paragraph at lines 100--100
+[]\OT1/cmr/m/n/10 Wk| 
+ []
+
+
+Underfull \hbox (badness 7308) in paragraph at lines 103--103
+[]\OT1/cmr/m/n/10 references and test en-vi-ron-ment
+ []
+
+
+Overfull \hbox (4.03265pt too wide) in paragraph at lines 98--132
+[][] 
+ []
+
+(./proposal.bbl)
+Package atveryend Info: Empty hook `BeforeClearDocument' on input line 143.
+ [3]
+Package atveryend Info: Empty hook `AfterLastShipout' on input line 143.
+ (./proposal.aux)
+Package atveryend Info: Empty hook `AtVeryEndDocument' on input line 143.
+Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 143.
+Package rerunfilecheck Info: File `proposal.out' has not changed.
+(rerunfilecheck)             Checksum: F96E1A7A2E9548367B7EB6BB8EC41B1F;495.
+
+
+LaTeX Warning: There were undefined references.
+
+
+LaTeX Warning: Label(s) may have changed. Rerun to get cross-references right.
+
+Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 143.
+ ) 
+Here is how much of TeX's memory you used:
+ 4240 strings out of 493486
+ 63493 string characters out of 3143550
+ 144546 words of memory out of 3000000
+ 7554 multiletter control sequences out of 15000+200000
+ 8849 words of font info for 31 fonts, out of 3000000 for 9000
+ 957 hyphenation exceptions out of 8191
+ 29i,8n,28p,174b,373s stack positions out of 5000i,500n,10000p,200000b,50000s
+
+Output written on proposal.dvi (3 pages, 14824 bytes).
diff --git a/proposal/proposal.out b/proposal/proposal.out
new file mode 100644 (file)
index 0000000..5266d8e
--- /dev/null
@@ -0,0 +1,8 @@
+\BOOKMARK [1][-]{section.1}{Supervisors}{}% 1
+\BOOKMARK [1][-]{section.2}{Abstract73 words}{}% 2
+\BOOKMARK [1][-]{section.3}{Project Description484 words}{}% 3
+\BOOKMARK [2][-]{subsection.3.1}{Research Question and Motivation}{section.3}% 4
+\BOOKMARK [2][-]{subsection.3.2}{Aim}{section.3}% 5
+\BOOKMARK [2][-]{subsection.3.3}{Research Plan and Schedule}{section.3}% 6
+\BOOKMARK [2][-]{subsection.3.4}{Weekly planning}{section.3}% 7
+\BOOKMARK [1][-]{section.4}{Scientific relevance52 words}{}% 8
diff --git a/proposal/proposal.pdf b/proposal/proposal.pdf
new file mode 100644 (file)
index 0000000..b9ba2d2
Binary files /dev/null and b/proposal/proposal.pdf differ
diff --git a/proposal/proposal.tex b/proposal/proposal.tex
new file mode 100755 (executable)
index 0000000..c03cbaf
--- /dev/null
@@ -0,0 +1,143 @@
+\documentclass[a4paper]{article}
+
+\usepackage[dvipdfmx]{hyperref}
+\usepackage{calc}
+\usepackage{fullpage}
+
+\author{Mart Lubbers\\ 0651371972\\ s4109503\\
+               \href{mailto:mart@martlubbers.net}{mart@martlubbers.net}}
+\title{Non IT configurable adaptive data mining solution used in transforming
+          raw data to structured data\\\small A proposal}
+\date{\today}
+
+\begin{document}
+\maketitle
+\tableofcontents
+\newpage
+\section{Supervisors}
+\begin{center}
+       \begin{tabular}{cc}
+               Franc Grootjen  & Alessandro Paula\\
+               Radboud University Nijmegen     & Hyperleap\\
+               Nijmegen, The Netherlands       & Nijmegen, The Netherlands\\
+               \href{mailto:f.grootjen@psych.ru.nl}{f.grootjen@psych.ru.nl} & 
+                       \href{mailto:aldo@hyperleap.nl}{aldo@hyperleap.nl}
+               \\
+               \\
+               Signature       & Signature\\
+               \\
+               \rule{2.5cm}{0.4pt}     & \rule{2.5cm}{0.4pt}\\
+       \end{tabular}
+\end{center}
+
+\section{Abstract\tiny 73 words}
+Raw data from information providers is usually hard to interpret for a software
+solution and the conversion of raw data to structured data is usually done by
+hand.  This project aims towards an adaptable, configurable data transformation
+program optionally in combination with a webcrawler that can perform the
+conversion from raw data to structured data.  This is all done under the
+supervision of Franc Grootjen and Alessandro Paula and is commissioned by
+Hyperleap.
+
+\section{Project Description\tiny 484 words}
+\subsection{Research Question and Motivation}
+The main research question is: \textit{How can we make an adaptive, autonomous
+and programmable data mining program that can be set up by a non IT
+professional which is able to transform raw data into structured data?}\\
+Hyperleap is a small company that is specialized in infotainment
+(information+entertainment) and administrates several websites which bundle
+information about entertainment in an ordered and complete way.  Right now, most
+of the data input is done by hand and takes a lot of time to type in.
+
+\subsection{Aim}
+The practical goal and aim of the project is to make a crawler (web or other
+document types) that can autonomously gather information after it has been
+set up by a, not necessarily IT trained, employee via an intuitive interface.
+Optionally the crawler shouldn't be susceptible to small structure changes in
+the website, be able to handle advanced website display techniques such as
+javascript and should be able to notify the administrator when the site has
+become uncrawlable and the crawler needs to be reprogrammed for that particular
+site. But the main purpose is the translation from raw data to structured data.
+The project is in principle a continuation of a past project done by Wouter
+Roelofs\cite{Roelofs2009} which was also supervised by Franc Grootjen and
+Alessandro Paula; however, it was never taken out of the experimental phase and
+therefore is in need of continuation.
+
+\subsection{Research Plan and Schedule}
+The schedule or plan for the project can be divided into 4 stages namely the
+initial, developmental, testing and writing stage. These stages are not
+mutually exclusive and therefore can and will overlap.
+\begin{itemize}
+       \item{Initiating stage:}
+               In this stage we will look at the past project and present literature
+               on the subject and create an explicit plan for the eventual software.
+               There probably is a lot of literature written on how to parse certain
+               information fields such as dates, places and artist information. The
+               date parsing and recognizing was a main part in the past project.
+       \item{Developmental stage:}
+               The developmental stage is the stage where most of the programming is
+               done and where the algorithms for crawling and transformation are
+               implemented.  For the web frontend the framework choice has fallen upon
+               firefox extensions which are mainly written in javascript and cfx. The
+               data transformer will probably be written in python due to the robust
+               natural language tools and portability.
+       \item{Testing stage:}
+               This stage will overlap greatly with the developmental stage because
+               this will save a lot of time.
+       \item{Writing stage:}
+               The last stage will be the stage in which the thesis is written and the
+               project presented. During all other stages certain parts of the thesis
+               can already be written down.
+\end{itemize}
+
+\subsection{Weekly planning}
+Because of some mandatory courses in the first semester of the next year the
+schedule can be seen as provisional, meaning that there is room to extend the
+schedule (in practice at most until December 2014).
+
+\begin{tabular}{|p{1em}|p{1.2em}|p{5em}|p{16em}|p{15em}|}
+       \hline
+       \#      & Wk    & Date                  & Task  & Deliverables\\\hline
+       1       & 15    & 2014-04-07    & proposal and references       & 
+                                                                               proposal signed by both parties\\
+       2       & 16    & 2014-04-14    & references and test environment setup & 
+                                                                               test environment\\
+       3       & 17    & 2014-04-21    & planning for writing the tool & 
+                                                                               software design\\
+       4       & 18    & 2014-04-28    & writing thesis and programming        & 
+                                                                               introduction\\
+       5       & 19    & 2014-05-05    & writing thesis and programming &
+                                                                               \\
+       6       & 20    & 2014-05-12    & idem  & 
+                                                                               methods\\
+       7       & 21    & 2014-05-19    & idem  & 
+                                                                               first prototype software\\
+       8       & 22    & 2014-05-26    & testing, programming and thesis       &
+                                                                               \\
+       9       & 23    & 2014-06-02    & testing, implementation bigger picture &
+                                                                               \\
+       10      & 24    & 2014-06-09    & testing       & 
+                                                                               working tool and results and abstract\\
+       11      & 25    & 2014-06-16    & presentation and thesis       & 
+                                                                               discussion and presentation\\
+       12      & 26    & 2014-06-23    & presentation  & 
+                                                                               presentation\\
+       13      & 27    & 2014-06-30    & presentation  &
+                                                                               \\
+       \hline
+\end{tabular}\\
+There will also be bi-weekly meetings with both supervisors to make sure we are
+on schedule.  If necessary the frequency of meetings with the external
+supervisor can be increased.
+
+\section{Scientific relevance\tiny 52 words}
+Currently the techniques for conversion from non structured data to structured
+data are static and mainly only usable by IT specialists. There is a great need
+for data mining in non structured data because the data within companies and on
+the internet is piling up and is usually left to gather dust.
+
+
+\bibliographystyle{ieeetr}
+\bibliography{proposal}
+
+\end{document}
diff --git a/proposal/proposal.toc b/proposal/proposal.toc
new file mode 100644 (file)
index 0000000..8bbc95e
--- /dev/null
@@ -0,0 +1,8 @@
+\contentsline {section}{\numberline {1}Supervisors}{2}{section.1}
+\contentsline {section}{\numberline {2}Abstract\relax \fontsize {5}{6}\selectfont 73 words}{2}{section.2}
+\contentsline {section}{\numberline {3}Project Description\relax \fontsize {5}{6}\selectfont 484 words}{2}{section.3}
+\contentsline {subsection}{\numberline {3.1}Research Question and Motivation}{2}{subsection.3.1}
+\contentsline {subsection}{\numberline {3.2}Aim}{2}{subsection.3.2}
+\contentsline {subsection}{\numberline {3.3}Research Plan and Schedule}{2}{subsection.3.3}
+\contentsline {subsection}{\numberline {3.4}Weekly planning}{3}{subsection.3.4}
+\contentsline {section}{\numberline {4}Scientific relevance\relax \fontsize {5}{6}\selectfont 52 words}{3}{section.4}
diff --git a/softwaredesign/classdiagram/Makefile b/softwaredesign/classdiagram/Makefile
new file mode 100644 (file)
index 0000000..08ce210
--- /dev/null
@@ -0,0 +1,9 @@
+# Render the class diagram PDF from its Graphviz source.
+.PHONY: all clean
+
+all:
+       dot -T pdf -o classdiagram.pdf class.dot
+
+# -f: do not fail when there is no PDF to remove.
+clean:
+       rm -f *.pdf
diff --git a/softwaredesign/classdiagram/class.dot b/softwaredesign/classdiagram/class.dot
new file mode 100644 (file)
index 0000000..956f1a3
--- /dev/null
@@ -0,0 +1,30 @@
+// Class diagram (work in progress): Sources -> Browser Extension.
+digraph G {
+       fontname = "Bitstream Vera Sans"
+       fontsize = 8
+
+       splines=ortho
+
+       node [
+               fontname = "Bitstream Vera Sans"
+               fontsize = 8
+               shape = "record"
+       ]
+
+       edge [
+               fontname = "Bitstream Vera Sans"
+               fontsize = 8
+       ]
+
+       // A node ID containing a space must be quoted; unquoted, DOT reads
+       // "Browser" and "Extension" as two separate nodes.
+       "Browser Extension" [
+               label = ""
+       ]
+
+       Sources [
+               label = ""
+       ]
+
+       Sources -> "Browser Extension"
+}
diff --git a/softwaredesign/classdiagram/classdiagram.pdf b/softwaredesign/classdiagram/classdiagram.pdf
new file mode 100644 (file)
index 0000000..5c25dca
Binary files /dev/null and b/softwaredesign/classdiagram/classdiagram.pdf differ
diff --git a/softwaredesign/workflow.txt b/softwaredesign/workflow.txt
new file mode 100644 (file)
index 0000000..2654cd6
--- /dev/null
@@ -0,0 +1,16 @@
+The program consists of three modules named:
+- Converter: hypconvert
+- Crawler: hypcrawl
+- Frontend: hypfront
+
+The frontend consists of submodules for different data sources. For example
+rss, html, email.
+The frontend guides the user with categorizing data fields in the source and
+outputs it in a database which can be read by the crawler and the converter.
+
+The crawler periodically collects the data specified by the frontend and passes
+it through to the converter for processing.
+
+The converter processes the data acquired from the crawler with the patterns and
+rules given by the frontend and outputs the structured data in the specified
+xml format.
diff --git a/thesis/Makefile b/thesis/Makefile
new file mode 100644 (file)
index 0000000..bc01082
--- /dev/null
@@ -0,0 +1,16 @@
+# Build thesis.pdf from thesis.tex via latex and dvipdfm.
+.PHONY: all thesis clean
+
+all: thesis
+
+# Three latex runs settle the table of contents and cross-references.  When
+# the thesis gets a bibliography, enable bibtex after the first run.
+thesis:
+       latex thesis.tex
+#      bibtex thesis.aux
+       latex thesis.tex
+       latex thesis.tex
+       dvipdfm thesis.dvi
+
+clean:
+       rm -vf *.aux *.bbl *.blg *.dvi *.log *.out *.pdf *.toc
diff --git a/thesis/abstract.tex b/thesis/abstract.tex
new file mode 100644 (file)
index 0000000..5cdb1a4
--- /dev/null
@@ -0,0 +1,4 @@
+\begin{center}
+       \textbf{Abstract}\\
+\end{center}
+\lipsum[1]
diff --git a/thesis/introduction.tex b/thesis/introduction.tex
new file mode 100644 (file)
index 0000000..f6276be
--- /dev/null
@@ -0,0 +1 @@
+\lipsum[1]
diff --git a/thesis/thesis.tex b/thesis/thesis.tex
new file mode 100644 (file)
index 0000000..793d707
--- /dev/null
@@ -0,0 +1,39 @@
+\documentclass{scrbook}
+
+\usepackage{lipsum}
+
+\author{Mart Lubbers\\s4109053}
+\title{Non IT configurable adaptive data mining solution used in transforming raw data to structured data}
+\subtitle{
+       Bachelor's Thesis in Artificial Intelligence\\
+       Radboud University Nijmegen\\
+       \vspace{15mm}
+       \begin{tabular}{cp{5em}c}
+               Franc Grootjen  && Alessandro Paula\\
+               RU                              && Hyperleap
+       \end{tabular}
+       }
+
+\date{\today}
+
+\begin{document}
+\maketitle
+\tableofcontents
+\newpage
+
+%\begin{abstract}
+%      \input{abstract.tex}
+%\end{abstract}
+
+\chapter{Introduction}
+\input{introduction.tex}
+
+\chapter{Methods}
+
+\chapter{Results}
+
+\chapter{Discussion}
+
+\chapter{Appendices}
+
+\end{document}