From: Mart Lubbers Date: Wed, 30 Apr 2014 18:50:17 +0000 (+0200) Subject: first commit X-Git-Url: https://git.martlubbers.net/?a=commitdiff_plain;h=b667431dc15a4305749137ae1941b1137d14747d;p=bsc-thesis1415.git first commit --- b667431dc15a4305749137ae1941b1137d14747d diff --git a/docs/BSc-Thesis-RR-12-13-final.pdf b/docs/BSc-Thesis-RR-12-13-final.pdf new file mode 100755 index 0000000..fcf92a0 Binary files /dev/null and b/docs/BSc-Thesis-RR-12-13-final.pdf differ diff --git a/log/2014-04-08.txt b/log/2014-04-08.txt new file mode 100644 index 0000000..88d5fab --- /dev/null +++ b/log/2014-04-08.txt @@ -0,0 +1,24 @@ +TIME: 3 + +Meeting with Alessandro and discussed with Jan about the project scope. + +Worst case a trainable by non IT rss feed crawler. Best case also websites +parseable. + +PLANS +===== +literature research, compare programming languages, python, php/javascript. +Server of HL has python. Crawler is going to be python for sure. + +So basically there are three components: +- Frontend + The frontend is the user interface for the non IT user and is probably a + plugin for chrome or firefox. This generates a scheme which is parseable by + the crawler. +- Crawler + The crawler periodically crawls the sites/feeds using the generated schemes + and notifies the admins if there is a change in layout. The crawler + generates xml that is later parsed by the backend. +- Backend + The backend is not within the scope of this project but it will parse the + xml given by the crawler. diff --git a/log/2014-04-09.txt b/log/2014-04-09.txt new file mode 100644 index 0000000..617d407 --- /dev/null +++ b/log/2014-04-09.txt @@ -0,0 +1,6 @@ +TIME: 3 + +Created the log files and looked up some info about php and javascript. +Probably choosing python anyway. XML scheme draft written for the output.
+ +Created skeleton for the thesis diff --git a/log/log.bash b/log/log.bash new file mode 100755 index 0000000..4296c09 --- /dev/null +++ b/log/log.bash @@ -0,0 +1,7 @@ +#!/bin/bash +file=$(date +"%Y-%m-%d").txt +if [ ! -f $file ] +then + cp skeleton $file +fi +vi $file diff --git a/log/skeleton b/log/skeleton new file mode 100644 index 0000000..0ba02e1 --- /dev/null +++ b/log/skeleton @@ -0,0 +1,3 @@ +TIME: + + diff --git a/program/example_crawldata b/program/example_crawldata new file mode 100644 index 0000000..76653ac --- /dev/null +++ b/program/example_crawldata @@ -0,0 +1,8 @@ +name Paradiso + + uri http://www.paradiso.nl/rss.xml + freq 1d # SOME COMMENT +# SOME OTHER COMMENT + +name fake name + uri http://test.net diff --git a/program/example_structured.xml b/program/example_structured.xml new file mode 100644 index 0000000..5844096 --- /dev/null +++ b/program/example_structured.xml @@ -0,0 +1,14 @@ + + + + 2014-04-02T11:30:10+01:00 + Doornroosje + http://doornroosje.com/agenda/nile + http://youtube.com/video + + + 2014-04-03T12:00:00+01:00 + Paradiso + + diff --git a/program/hypconvert/hypconvert.py b/program/hypconvert/hypconvert.py new file mode 100644 index 0000000..e69de29 diff --git a/program/hypcrawl/hypcrawl.py b/program/hypcrawl/hypcrawl.py new file mode 100644 index 0000000..04fffb3 --- /dev/null +++ b/program/hypcrawl/hypcrawl.py @@ -0,0 +1,43 @@ +#!/bin/env python +# -*- coding: utf-8 -*- + +import codecs +import itertools as it +import re +import sys + + +def parseconfig(filepath, enc='utf-8', com='#'): + inp = sys.stdin if filepath == '-' else codecs.open(filepath, 'r', enc) + pattern = re.compile('^\s*(?P\S+)\t(?P[^{0}]*[^{0}\s])'.format(com)) + skeleton = {'uri': '', 'type': u'RSS', 'freq': u'1d'} + tests = { + u'name': re.compile('(.*)'), + u'freq': re.compile('([0-9]*\.?[0-9]*)([smhdwmy])', re.I), + u'uri': re.compile('(rss|mail|https?)://(\S?)'), + } + entries = [] + for line in it.ifilter(bool, it.imap(pattern.search, inp)): + if 
line.group('k') == 'name': + entries.append(skeleton.copy()) + key = line.group('k') + value = tests[key].match(line.group('v')) + if value is None: + print '{}:{} not properly formatted, skipping'.\ + format(key, line.group('v')) + else: + entries[-1][key] = value.groups() + if sys.stdin != inp: + inp.close() + return entries + + +def generatecron(config, output='-'): + raise NotImplementedError + + +def crawlentry(entry): + raise NotImplementedError + +if __name__ == '__main__': + print parseconfig('../example_crawldata') diff --git a/proposal/Makefile b/proposal/Makefile new file mode 100644 index 0000000..a86dd0c --- /dev/null +++ b/proposal/Makefile @@ -0,0 +1,11 @@ +all: proposal + +proposal: + latex proposal.tex + latex proposal.tex + bibtex proposal.aux + latex proposal.tex + dvipdfm proposal.dvi + +clean: + rm -vf *.aux *.bbl *.blg *.dvi *.log *.out *.pdf *.toc diff --git a/proposal/proposal.aux b/proposal/proposal.aux new file mode 100644 index 0000000..5def743 --- /dev/null +++ b/proposal/proposal.aux @@ -0,0 +1,29 @@ +\relax +\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} +\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined +\global\let\oldcontentsline\contentsline +\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}} +\global\let\oldnewlabel\newlabel +\gdef\newlabel#1#2{\newlabelxx{#1}#2} +\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} +\AtEndDocument{\ifx\hyper@anchor\@undefined +\let\contentsline\oldcontentsline +\let\newlabel\oldnewlabel +\fi} +\fi} +\global\let\hyper@last\relax +\gdef\HyperFirstAtBeginDocument#1{#1} +\providecommand*\HyPL@Entry[1]{} +\HyPL@Entry{0<>} +\citation{Roelofs2009} +\@writefile{toc}{\contentsline {section}{\numberline {1}Supervisors}{2}{section.1}} +\@writefile{toc}{\contentsline {section}{\numberline {2}Abstract\relax \fontsize {5}{6}\selectfont 73 words}{2}{section.2}} +\@writefile{toc}{\contentsline {section}{\numberline {3}Project Description\relax \fontsize {5}{6}\selectfont 484 
words}{2}{section.3}} +\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Research Question and Motivation}{2}{subsection.3.1}} +\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Aim}{2}{subsection.3.2}} +\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Research Plan and Schedule}{2}{subsection.3.3}} +\bibstyle{ieeetr} +\bibdata{proposal} +\bibcite{Roelofs2009}{1} +\@writefile{toc}{\contentsline {subsection}{\numberline {3.4}Weekly planning}{3}{subsection.3.4}} +\@writefile{toc}{\contentsline {section}{\numberline {4}Scientific relevance\relax \fontsize {5}{6}\selectfont 52 words}{3}{section.4}} diff --git a/proposal/proposal.bbl b/proposal/proposal.bbl new file mode 100644 index 0000000..56b5abc --- /dev/null +++ b/proposal/proposal.bbl @@ -0,0 +1,8 @@ +\begin{thebibliography}{1} + +\bibitem{Roelofs2009} +W.~Roelofs, A.~T. Paula, and F.~Grootjen, ``{Programming by Clicking},'' in + {\em Proceedings of the Dutch Information Retrieval Conference}, pp.~2--3, + 2009. 
+ +\end{thebibliography} diff --git a/proposal/proposal.bib b/proposal/proposal.bib new file mode 100755 index 0000000..264802f --- /dev/null +++ b/proposal/proposal.bib @@ -0,0 +1,9 @@ +@inproceedings{Roelofs2009, +author = {Roelofs, Wouter and Paula, Alessandro Tadeo and Grootjen, Franc}, +booktitle = {Proceedings of the Dutch Information Retrieval Conference}, +file = {:C$\backslash$:/Users/mart/Downloads/dir09.pdf:pdf}, +keywords = {levenshtein matching,subtree matching,web crawler}, +pages = {2--3}, +title = {{Programming by Clicking}}, +year = {2009} +} diff --git a/proposal/proposal.blg b/proposal/proposal.blg new file mode 100644 index 0000000..afb404d --- /dev/null +++ b/proposal/proposal.blg @@ -0,0 +1,46 @@ +This is BibTeX, Version 0.99d (TeX Live 2012/Debian) +Capacity: max_strings=35307, hash_size=35307, hash_prime=30011 +The top-level auxiliary file: proposal.aux +The style file: ieeetr.bst +Database file #1: proposal.bib +You've used 1 entry, + 1876 wiz_defined-function locations, + 483 strings with 3641 characters, +and the built_in function-call counts, 305 in all, are: += -- 26 +> -- 11 +< -- 0 ++ -- 4 +- -- 3 +* -- 21 +:= -- 49 +add.period$ -- 1 +call.type$ -- 1 +change.case$ -- 1 +chr.to.int$ -- 0 +cite$ -- 1 +duplicate$ -- 17 +empty$ -- 32 +format.name$ -- 3 +if$ -- 71 +int.to.chr$ -- 0 +int.to.str$ -- 1 +missing$ -- 1 +newline$ -- 6 +num.names$ -- 1 +pop$ -- 7 +preamble$ -- 1 +purify$ -- 0 +quote$ -- 0 +skip$ -- 9 +stack$ -- 0 +substring$ -- 16 +swap$ -- 5 +text.length$ -- 0 +text.prefix$ -- 0 +top$ -- 0 +type$ -- 0 +warning$ -- 0 +while$ -- 4 +width$ -- 2 +write$ -- 11 diff --git a/proposal/proposal.dvi b/proposal/proposal.dvi new file mode 100644 index 0000000..6fbdc10 Binary files /dev/null and b/proposal/proposal.dvi differ diff --git a/proposal/proposal.log b/proposal/proposal.log new file mode 100644 index 0000000..00feb94 --- /dev/null +++ b/proposal/proposal.log @@ -0,0 +1,274 @@ +This is pdfTeX, Version 3.1415926-2.4-1.40.13 (TeX Live 
2012/Debian) (format=latex 2014.2.6) 8 APR 2014 13:21 +entering extended mode + restricted \write18 enabled. + %&-line parsing enabled. +**proposal.tex +(./proposal.tex +LaTeX2e <2011/06/27> +Babel and hyphenation patterns for english, dumylang, nohyphenation, et +hiopic, farsi, arabic, pinyin, croatian, bulgarian, ukrainian, russian, slovak, + czech, danish, dutch, usenglishmax, ukenglish, finnish, french, basque, ngerma +n, german, swissgerman, ngerman-x-2012-05-30, german-x-2012-05-30, monogreek, g +reek, ibycus, ancientgreek, hungarian, bengali, tamil, hindi, telugu, gujarati, + sanskrit, malayalam, kannada, assamese, marathi, oriya, panjabi, italian, lati +n, latvian, lithuanian, mongolian, mongolianlmc, nynorsk, bokmal, indonesian, e +speranto, coptic, welsh, irish, interlingua, serbian, serbianc, slovenian, friu +lan, romansh, estonian, romanian, armenian, uppersorbian, turkish, afrikaans, i +celandic, kurmanji, polish, portuguese, galician, catalan, spanish, swedish, th +ai, loaded. 
+(/usr/share/texlive/texmf-dist/tex/latex/base/article.cls +Document Class: article 2007/10/19 v1.4h Standard LaTeX document class +(/usr/share/texlive/texmf-dist/tex/latex/base/size10.clo +File: size10.clo 2007/10/19 v1.4h Standard LaTeX file (size option) +) +\c@part=\count79 +\c@section=\count80 +\c@subsection=\count81 +\c@subsubsection=\count82 +\c@paragraph=\count83 +\c@subparagraph=\count84 +\c@figure=\count85 +\c@table=\count86 +\abovecaptionskip=\skip41 +\belowcaptionskip=\skip42 +\bibindent=\dimen102 +) +(/usr/share/texlive/texmf-dist/tex/latex/hyperref/hyperref.sty +Package: hyperref 2012/05/13 v6.82q Hypertext links for LaTeX + +(/usr/share/texlive/texmf-dist/tex/generic/oberdiek/hobsub-hyperref.sty +Package: hobsub-hyperref 2012/05/28 v1.13 Bundle oberdiek, subset hyperref (HO) + + +(/usr/share/texlive/texmf-dist/tex/generic/oberdiek/hobsub-generic.sty +Package: hobsub-generic 2012/05/28 v1.13 Bundle oberdiek, subset generic (HO) +Package: hobsub 2012/05/28 v1.13 Construct package bundles (HO) +Package: infwarerr 2010/04/08 v1.3 Providing info/warning/error messages (HO) +Package: ltxcmds 2011/11/09 v1.22 LaTeX kernel commands for general use (HO) +Package: ifluatex 2010/03/01 v1.3 Provides the ifluatex switch (HO) +Package ifluatex Info: LuaTeX not detected. +Package: ifvtex 2010/03/01 v1.5 Detect VTeX and its facilities (HO) +Package ifvtex Info: VTeX not detected. +Package: intcalc 2007/09/27 v1.1 Expandable calculations with integers (HO) +Package: ifpdf 2011/01/30 v2.3 Provides the ifpdf switch (HO) +Package ifpdf Info: pdfTeX in PDF mode is not detected. +Package: etexcmds 2011/02/16 v1.5 Avoid name clashes with e-TeX commands (HO) +Package etexcmds Info: Could not find \expanded. +(etexcmds) That can mean that you are not using pdfTeX 1.50 or +(etexcmds) that some package has redefined \expanded. +(etexcmds) In the latter case, load this package earlier. 
+Package: kvsetkeys 2012/04/25 v1.16 Key value parser (HO) +Package: kvdefinekeys 2011/04/07 v1.3 Define keys (HO) +Package: pdftexcmds 2011/11/29 v0.20 Utility functions of pdfTeX for LuaTeX (HO +) +Package pdftexcmds Info: LuaTeX not detected. +Package pdftexcmds Info: \pdf@primitive is available. +Package pdftexcmds Info: \pdf@ifprimitive is available. +Package pdftexcmds Info: \pdfdraftmode is ignored in DVI mode. +Package: pdfescape 2011/11/25 v1.13 Implements pdfTeX's escape features (HO) +Package: bigintcalc 2012/04/08 v1.3 Expandable calculations on big integers (HO +) +Package: bitset 2011/01/30 v1.1 Handle bit-vector datatype (HO) +Package: uniquecounter 2011/01/30 v1.2 Provide unlimited unique counter (HO) +) +Package hobsub Info: Skipping package `hobsub' (already loaded). +Package: letltxmacro 2010/09/02 v1.4 Let assignment for LaTeX macros (HO) +Package: hopatch 2012/05/28 v1.2 Wrapper for package hooks (HO) +Package: xcolor-patch 2011/01/30 xcolor patch +Package: atveryend 2011/06/30 v1.8 Hooks at the very end of document (HO) +Package atveryend Info: \enddocument detected (standard20110627). 
+Package: atbegshi 2011/10/05 v1.16 At begin shipout hook (HO) +Package: refcount 2011/10/16 v3.4 Data extraction from label references (HO) +Package: hycolor 2011/01/30 v1.7 Color options for hyperref/bookmark (HO) +) +(/usr/share/texlive/texmf-dist/tex/latex/graphics/keyval.sty +Package: keyval 1999/03/16 v1.13 key=value parser (DPC) +\KV@toks@=\toks14 +) +(/usr/share/texlive/texmf-dist/tex/generic/ifxetex/ifxetex.sty +Package: ifxetex 2010/09/12 v0.6 Provides ifxetex conditional +) +(/usr/share/texlive/texmf-dist/tex/latex/oberdiek/kvoptions.sty +Package: kvoptions 2011/06/30 v3.11 Key value format for package options (HO) +) +\@linkdim=\dimen103 +\Hy@linkcounter=\count87 +\Hy@pagecounter=\count88 + +(/usr/share/texlive/texmf-dist/tex/latex/hyperref/pd1enc.def +File: pd1enc.def 2012/05/13 v6.82q Hyperref: PDFDocEncoding definition (HO) +) +\Hy@SavedSpaceFactor=\count89 + +(/usr/share/texlive/texmf-dist/tex/latex/latexconfig/hyperref.cfg +File: hyperref.cfg 2002/06/06 v1.2 hyperref configuration of TeXLive +) +Package hyperref Info: Hyper figures OFF on input line 4062. +Package hyperref Info: Link nesting OFF on input line 4067. +Package hyperref Info: Hyper index ON on input line 4070. +Package hyperref Info: Plain pages OFF on input line 4077. +Package hyperref Info: Backreferencing OFF on input line 4082. +Package hyperref Info: Implicit mode ON; LaTeX internals redefined. +Package hyperref Info: Bookmarks ON on input line 4300. +\c@Hy@tempcnt=\count90 + +(/usr/share/texlive/texmf-dist/tex/latex/url/url.sty +\Urlmuskip=\muskip10 +Package: url 2006/04/12 ver 3.3 Verb mode for urls, etc. +) +LaTeX Info: Redefining \url on input line 4653. +\Fld@menulength=\count91 +\Field@Width=\dimen104 +\Fld@charsize=\dimen105 +Package hyperref Info: Hyper figures OFF on input line 5773. +Package hyperref Info: Link nesting OFF on input line 5778. +Package hyperref Info: Hyper index ON on input line 5781. +Package hyperref Info: backreferencing OFF on input line 5788. 
+Package hyperref Info: Link coloring OFF on input line 5793. +Package hyperref Info: Link coloring with OCG OFF on input line 5798. +Package hyperref Info: PDF/A mode OFF on input line 5803. +LaTeX Info: Redefining \ref on input line 5843. +LaTeX Info: Redefining \pageref on input line 5847. +\Hy@abspage=\count92 +\c@Item=\count93 +\c@Hfootnote=\count94 +) + +Package hyperref Message: Driver: hdvipdfm. + +(/usr/share/texlive/texmf-dist/tex/latex/hyperref/hdvipdfm.def +File: hdvipdfm.def 2012/05/13 v6.82q Hyperref driver for dvipdfm +\pdfm@box=\box26 +\c@Hy@AnnotLevel=\count95 +\HyField@AnnotCount=\count96 +\Fld@listcount=\count97 +\c@bookmark@seq@number=\count98 + +(/usr/share/texlive/texmf-dist/tex/latex/oberdiek/rerunfilecheck.sty +Package: rerunfilecheck 2011/04/15 v1.7 Rerun checks for auxiliary files (HO) +Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 2 +82. +) +\Hy@SectionHShift=\skip43 +) +(/usr/share/texlive/texmf-dist/tex/latex/tools/calc.sty +Package: calc 2007/08/22 v4.3 Infix arithmetic (KKT,FJ) +\calc@Acount=\count99 +\calc@Bcount=\count100 +\calc@Adimen=\dimen106 +\calc@Bdimen=\dimen107 +\calc@Askip=\skip44 +\calc@Bskip=\skip45 +LaTeX Info: Redefining \setlength on input line 76. +LaTeX Info: Redefining \addtolength on input line 77. +\calc@Ccount=\count101 +\calc@Cskip=\skip46 +) +(/usr/share/texlive/texmf-dist/tex/latex/preprint/fullpage.sty +Package: fullpage 1999/02/23 1.1 (PWD) +\FP@margin=\skip47 +) (./proposal.aux) +\openout1 = `proposal.aux'. + +LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 13. +LaTeX Font Info: ... okay on input line 13. +LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 13. +LaTeX Font Info: ... okay on input line 13. +LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 13. +LaTeX Font Info: ... okay on input line 13. +LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 13. +LaTeX Font Info: ... okay on input line 13. 
+LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 13. +LaTeX Font Info: ... okay on input line 13. +LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 13. +LaTeX Font Info: ... okay on input line 13. +LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 13. +LaTeX Font Info: ... okay on input line 13. +\AtBeginShipoutBox=\box27 +Package hyperref Info: Link coloring OFF on input line 13. + (/usr/share/texlive/texmf-dist/tex/latex/hyperref/nameref.sty +Package: nameref 2010/04/30 v2.40 Cross-referencing by name of section + +(/usr/share/texlive/texmf-dist/tex/generic/oberdiek/gettitlestring.sty +Package: gettitlestring 2010/12/03 v1.4 Cleanup title references (HO) +) +\c@section@level=\count102 +) +LaTeX Info: Redefining \ref on input line 13. +LaTeX Info: Redefining \pageref on input line 13. +LaTeX Info: Redefining \nameref on input line 13. + +(./proposal.out) (./proposal.out) +\@outlinefile=\write3 +\openout3 = `proposal.out'. + +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <12> on input line 15. +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <8> on input line 15. +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <6> on input line 15. + (./proposal.toc +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <7> on input line 4. +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <5> on input line 4. +) +\tf@toc=\write4 +\openout4 = `proposal.toc'. + + [1 + +] + +LaTeX Warning: Citation `Roelofs2009' on page 2 undefined on input line 62. + +LaTeX Font Info: Try loading font information for OMS+cmr on input line 71. +(/usr/share/texlive/texmf-dist/tex/latex/base/omscmr.fd +File: omscmr.fd 1999/05/25 v2.5h Standard LaTeX font definitions +) +LaTeX Font Info: Font shape `OMS/cmr/m/n' in size <10> not available +(Font) Font shape `OMS/cmsy/m/n' tried instead on input line 71. 
+ [2] +Overfull \hbox (3.55562pt too wide) in paragraph at lines 100--100 +[]\OT1/cmr/m/n/10 Wk| + [] + + +Underfull \hbox (badness 7308) in paragraph at lines 103--103 +[]\OT1/cmr/m/n/10 references and test en-vi-ron-ment + [] + + +Overfull \hbox (4.03265pt too wide) in paragraph at lines 98--132 +[][] + [] + +(./proposal.bbl) +Package atveryend Info: Empty hook `BeforeClearDocument' on input line 143. + [3] +Package atveryend Info: Empty hook `AfterLastShipout' on input line 143. + (./proposal.aux) +Package atveryend Info: Empty hook `AtVeryEndDocument' on input line 143. +Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 143. +Package rerunfilecheck Info: File `proposal.out' has not changed. +(rerunfilecheck) Checksum: F96E1A7A2E9548367B7EB6BB8EC41B1F;495. + + +LaTeX Warning: There were undefined references. + + +LaTeX Warning: Label(s) may have changed. Rerun to get cross-references right. + +Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 143. + ) +Here is how much of TeX's memory you used: + 4240 strings out of 493486 + 63493 string characters out of 3143550 + 144546 words of memory out of 3000000 + 7554 multiletter control sequences out of 15000+200000 + 8849 words of font info for 31 fonts, out of 3000000 for 9000 + 957 hyphenation exceptions out of 8191 + 29i,8n,28p,174b,373s stack positions out of 5000i,500n,10000p,200000b,50000s + +Output written on proposal.dvi (3 pages, 14824 bytes). 
diff --git a/proposal/proposal.out b/proposal/proposal.out new file mode 100644 index 0000000..5266d8e --- /dev/null +++ b/proposal/proposal.out @@ -0,0 +1,8 @@ +\BOOKMARK [1][-]{section.1}{Supervisors}{}% 1 +\BOOKMARK [1][-]{section.2}{Abstract73 words}{}% 2 +\BOOKMARK [1][-]{section.3}{Project Description484 words}{}% 3 +\BOOKMARK [2][-]{subsection.3.1}{Research Question and Motivation}{section.3}% 4 +\BOOKMARK [2][-]{subsection.3.2}{Aim}{section.3}% 5 +\BOOKMARK [2][-]{subsection.3.3}{Research Plan and Schedule}{section.3}% 6 +\BOOKMARK [2][-]{subsection.3.4}{Weekly planning}{section.3}% 7 +\BOOKMARK [1][-]{section.4}{Scientific relevance52 words}{}% 8 diff --git a/proposal/proposal.pdf b/proposal/proposal.pdf new file mode 100644 index 0000000..b9ba2d2 Binary files /dev/null and b/proposal/proposal.pdf differ diff --git a/proposal/proposal.tex b/proposal/proposal.tex new file mode 100755 index 0000000..c03cbaf --- /dev/null +++ b/proposal/proposal.tex @@ -0,0 +1,143 @@ +\documentclass[a4paper]{article} + +\usepackage[dvipdfmx]{hyperref} +\usepackage{calc} +\usepackage{fullpage} + +\author{Mart Lubbers\\ 0651371972\\ s4109503\\ + \href{mailto:mart@martlubbers.net}{mart@martlubbers.net}} +\title{Non IT configurable adaptive data mining solution used in transforming + raw data to structured data\\\small A proposal} +\date{\today} + +\begin{document} +\maketitle +\tableofcontents +\newpage +\section{Supervisors} +\begin{center} + \begin{tabular}{cc} + Franc Grootjen & Alessandro Paula\\ + Radboud University Nijmegen & Hyperleap\\ + Nijmegen, The Netherlands & Nijmegen, The Netherlands\\ + \href{mailto:f.grootjen@psych.ru.nl}{f.grootjen@psych.ru.nl} & + \href{mailto:aldo@hyperleap.nl}{aldo@hyperleap.nl} + \\ + \\ + Signature & Signature\\ + \\ + \rule{2.5cm}{0.4pt} & \rule{2.5cm}{0.4pt}\\ + \end{tabular} +\end{center} + +\section{Abstract\tiny 73 words} +Raw data from information providers is usually hard to interpret for a software +solution and the conversion of 
raw data to structured data is usually done by +hand. This project aims towards an adaptable, configurable data transformation +program optionally in combination with a webcrawler that can perform the +conversion from raw data to structured data. This is all done under the +supervision of Franc Grootjen and Alessandro Paula and is commissioned by +Hyperleap. + +\section{Project Description\tiny 484 words} +\subsection{Research Question and Motivation} +The main research question is: \textit{How can we make an adaptive, autonomous +and programmable data mining program that can be set up by a non IT +professional which is able to transform raw data into structured data?}\\ +Hyperleap is a small company that is specialized in infotainment +(information+entertainment) and administrates several websites which bundle +information about entertainment in an ordered and complete way. Right now, most +of the data input is done by hand and takes a lot of time to type in. + +\subsection{Aim} +The practical goal and aim of the project is to make a crawler (web or other +document types) that can autonomously gather information after it has been +set up by a, not necessarily IT trained, employee via an intuitive interface. +Optionally the crawler shouldn't be susceptible to small structural changes in +the website, be able to handle advanced website display techniques such as +javascript and should be able to notify the administrator when the site has +become uncrawlable and the crawler needs to be reprogrammed for that particular +site. But the main purpose is the translation from raw data to structured data. +The project is in principle a continuation of a past project done by Wouter +Roelofs\cite{Roelofs2009} which was also supervised by Franc Grootjen and +Alessandro Paula, however it was never taken out of the experimental phase and +therefore is in need of continuation.
+ +\subsection{Research Plan and Schedule} +The schedule or plan for the project can be divided into 4 stages, namely the +initial, developmental, testing and writing stage. These stages are not +mutually exclusive and therefore can and will overlap. +\begin{itemize} + \item{Initiating stage:} + In this stage we will look at the past project and present literature + on the subject and create an explicit plan for the eventual software. + There probably is a lot of literature written on how to parse certain + information fields such as dates, places and artist information. The + date parsing and recognition was a main part of the past project. + \item{Developmental stage:} + The developmental stage is the stage where most of the programming is + done and where the algorithms for crawling and transformation are + implemented. For the web frontend the framework choice has fallen upon + firefox extensions which are mainly written in javascript and cfx. The + data transformer will probably be written in python due to the robust + natural language tools and portability. + \item{Testing stage:} + This stage will overlap greatly with the developmental stage because + this will save a lot of time. + \item{Writing stage:} + The last stage will be the stage in which the thesis is written and the + project presented. During all other stages certain parts of the thesis + can already be written down. +\end{itemize} + +\subsection{Weekly planning} +Because of some mandatory courses in the first semester of the next year the +schedule can be seen as provisional meaning that there is room to extend the +schedule (in practice at most up to December 2014).
+ +\begin{tabular}{|p{1em}|p{1.2em}|p{5em}|p{16em}|p{15em}|} + \hline + \# & Wk & Date & Task & Deliverables\\\hline + 1 & 15 & 2014-04-07 & proposal and references & + proposal signed by both parties\\ + 2 & 16 & 2014-04-14 & references and test environment setup & + test environment\\ + 3 & 17 & 2014-04-21 & planning for writing the tool & + software design\\ + 4 & 18 & 2014-04-28 & writing thesis and programming & + introduction\\ + 5 & 19 & 2014-05-05 & writing thesis and programming & + \\ + 6 & 20 & 2014-05-12 & idem & + methods\\ + 7 & 21 & 2014-05-19 & idem & + first prototype software\\ + 8 & 22 & 2014-05-26 & testing, programming and thesis & + \\ + 9 & 23 & 2014-06-02 & testing, implementation bigger picture & + \\ + 10 & 24 & 2014-06-09 & testing & + working tool and results and abstract\\ + 11 & 25 & 2014-06-16 & presentation and thesis & + discussion and presentation\\ + 12 & 26 & 2014-06-23 & presentation & + presentation\\ + 13 & 27 & 2014-06-30 & presentation & + \\ + \hline +\end{tabular}\\ +There will also be bi-weekly meetings with both supervisors to make sure we are +on schedule. If necessary the frequency of meetings with the external +supervisor can be increased. + +\section{Scientific relevance\tiny 52 words} +Currently the techniques for conversion from non structured data to structured +data are static and mainly only usable by IT specialists. There is a great need +for data mining in non structured data because the data within companies and on +the internet is piling up and is usually left to gather dust.
+ + +\bibliographystyle{ieeetr} +\bibliography{proposal} + +\end{document} diff --git a/proposal/proposal.toc b/proposal/proposal.toc new file mode 100644 index 0000000..8bbc95e --- /dev/null +++ b/proposal/proposal.toc @@ -0,0 +1,8 @@ +\contentsline {section}{\numberline {1}Supervisors}{2}{section.1} +\contentsline {section}{\numberline {2}Abstract\relax \fontsize {5}{6}\selectfont 73 words}{2}{section.2} +\contentsline {section}{\numberline {3}Project Description\relax \fontsize {5}{6}\selectfont 484 words}{2}{section.3} +\contentsline {subsection}{\numberline {3.1}Research Question and Motivation}{2}{subsection.3.1} +\contentsline {subsection}{\numberline {3.2}Aim}{2}{subsection.3.2} +\contentsline {subsection}{\numberline {3.3}Research Plan and Schedule}{2}{subsection.3.3} +\contentsline {subsection}{\numberline {3.4}Weekly planning}{3}{subsection.3.4} +\contentsline {section}{\numberline {4}Scientific relevance\relax \fontsize {5}{6}\selectfont 52 words}{3}{section.4} diff --git a/softwaredesign/classdiagram/Makefile b/softwaredesign/classdiagram/Makefile new file mode 100644 index 0000000..08ce210 --- /dev/null +++ b/softwaredesign/classdiagram/Makefile @@ -0,0 +1,5 @@ +all: + dot -T pdf -o classdiagram.pdf class.dot + +clean: + rm *.pdf diff --git a/softwaredesign/classdiagram/class.dot b/softwaredesign/classdiagram/class.dot new file mode 100644 index 0000000..956f1a3 --- /dev/null +++ b/softwaredesign/classdiagram/class.dot @@ -0,0 +1,27 @@ +digraph G { + fontname = "Bitstream Vera Sans" + fontsize = 8 + + splines=ortho + + node [ + fontname = "Bitstream Vera Sans" + fontsize = 8 + shape = "record" + ] + + edge [ + fontname = "Bitstream Vera Sans" + fontsize = 8 + ] + + Browser Extension [ + label = "" + ] + + Sources [ + label = "" + ] + + Sources -> Browser Extension +} diff --git a/softwaredesign/classdiagram/classdiagram.pdf b/softwaredesign/classdiagram/classdiagram.pdf new file mode 100644 index 0000000..5c25dca Binary files /dev/null and 
b/softwaredesign/classdiagram/classdiagram.pdf differ diff --git a/softwaredesign/workflow.txt b/softwaredesign/workflow.txt new file mode 100644 index 0000000..2654cd6 --- /dev/null +++ b/softwaredesign/workflow.txt @@ -0,0 +1,16 @@ +The program consists of three modules named: +- Converter: hypconvert +- Crawler: hypcrawl +- Frontend: hypfront + +The frontend consists of submodules for different data sources. For example +rss, html, email. +The frontend guides the user in categorizing data fields in the source and +outputs it in a database which can be read by the crawler and the converter. + +The crawler periodically collects the data specified by the frontend and passes +it through to the converter for processing. + +The converter processes the data acquired from the crawler with the patterns and +rules given by the frontend and outputs the structured data in the specified +xml format. diff --git a/thesis/Makefile b/thesis/Makefile new file mode 100644 index 0000000..bc01082 --- /dev/null +++ b/thesis/Makefile @@ -0,0 +1,11 @@ +all: thesis + +thesis: + latex thesis.tex + latex thesis.tex +# bibtex thesis.aux + latex thesis.tex + dvipdfm thesis.dvi + +clean: + rm -vf *.aux *.bbl *.blg *.dvi *.log *.out *.pdf *.toc diff --git a/thesis/abstract.tex b/thesis/abstract.tex new file mode 100644 index 0000000..5cdb1a4 --- /dev/null +++ b/thesis/abstract.tex @@ -0,0 +1,4 @@ +\begin{center} + \textbf{Abstract}\\ +\end{center} +\lipsum[1] diff --git a/thesis/introduction.tex b/thesis/introduction.tex new file mode 100644 index 0000000..f6276be --- /dev/null +++ b/thesis/introduction.tex @@ -0,0 +1 @@ +\lipsum[1] diff --git a/thesis/thesis.tex b/thesis/thesis.tex new file mode 100644 index 0000000..793d707 --- /dev/null +++ b/thesis/thesis.tex @@ -0,0 +1,39 @@ +\documentclass{scrbook} + +\usepackage{lipsum} + +\author{Mart Lubbers\\s4109053} +\title{Non IT configurable adaptive data mining solution used in transforming raw data to structured data} +\subtitle{ + Bachelor's
Thesis in Artificial Intelligence\\ + Radboud University Nijmegen\\ + \vspace{15mm} + \begin{tabular}{cp{5em}c} + Franc Grootjen && Alessandro Paula\\ + RU && Hyperleap + \end{tabular} + } + +\date{\today} + +\begin{document} +\maketitle +\tableofcontents +\newpage + +%\begin{abstract} +% \input{abstract.tex} +%\end{abstract} + +\chapter{Introduction} +\input{introduction.tex} + +\chapter{Methods} + +\chapter{Results} + +\chapter{Discussion} + +\chapter{Appendices} + +\end{document}