From c4f80f20d471ab664664fec76127ec8c0448528e Mon Sep 17 00:00:00 2001 From: Mart Lubbers Date: Tue, 26 Aug 2014 11:03:05 +0200 Subject: [PATCH] final planning 2.0 --- README.md | 7 +- planning/final_planning.txt | 13 +- program/everything/crawler.db | 429 ++++++++++++++++++++++++++++++++++ program/everything/crawler.py | 60 +++-- program/everything/t.dot | 30 +++ thesis/introduction.tex | 14 +- thesis/methods.tex | 3 +- 7 files changed, 509 insertions(+), 47 deletions(-) create mode 100644 program/everything/crawler.db create mode 100644 program/everything/t.dot diff --git a/README.md b/README.md index 02d5943..9c558e3 100644 --- a/README.md +++ b/README.md @@ -1,6 +1 @@ -VERSION 0.02 - -todo: git instructions mailen -skype: hyperoffice - -My skype is mart.lubbers +. diff --git a/planning/final_planning.txt b/planning/final_planning.txt index 7d7194a..95e2955 100644 --- a/planning/final_planning.txt +++ b/planning/final_planning.txt @@ -1,7 +1,6 @@ -2014-08-14 - Datumprikker maken voor overleg met franc. -2014-08-21 - -2014-08-28 - Programma af -2014-09-04 - Scriptie af? -2014-09-11 - Commentaar verwerken? -2014-09-18 - Scriptie laatste versie -2014-09-25 - ?? 
Presenteren +2014-08-26 - Meeting Franc en Alessandro +2014-08-28 - Scriptie introductie af +2014-09-04 - Scriptie methods af, gui voor crawler en crawler functionaliteit +2014-09-11 - Scriptie afronden, programma afronden +2014-09-18 - Evt commentaar verwerken, programma testen +2014-09-25 - Afsluiten diff --git a/program/everything/crawler.db b/program/everything/crawler.db new file mode 100644 index 0000000..919733d --- /dev/null +++ b/program/everything/crawler.db @@ -0,0 +1,429 @@ +(dp0 +S'para' +p1 +(dp2 +S'website' +p3 +S'www.test.com' +p4 +sS'name' +p5 +g1 +sS'titledawg' +p6 +(ipydawg +DAWG +p7 +(dp8 +S'q0' +p9 +(ipydawg +DAWGNode +p10 +(dp11 +S'number' +p12 +NsS'children' +p13 +(dp14 +S'\x01' +p15 +(ipydawg +DAWGNode +p16 +(dp17 +g12 +Nsg13 +(dp18 +S' ' +p19 +(ipydawg +DAWGNode +p20 +(dp21 +g12 +Nsg13 +(dp22 +S'\x02' +p23 +(ipydawg +DAWGNode +p24 +(dp25 +g12 +Nsg13 +(dp26 +g19 +(ipydawg +DAWGNode +p27 +(dp28 +g12 +Nsg13 +(dp29 +S'-' +p30 +(ipydawg +DAWGNode +p31 +(dp32 +g12 +Nsg13 +(dp33 +g19 +(ipydawg +DAWGNode +p34 +(dp35 +g12 +Nsg13 +(dp36 +S'\x03' +p37 +(ipydawg +DAWGNode +p38 +(dp39 +g12 +Nsg13 +(dp40 +g19 +(ipydawg +DAWGNode +p41 +(dp42 +g12 +Nsg13 +(dp43 +g19 +(ipydawg +DAWGNode +p44 +(dp45 +g12 +Nsg13 +(dp46 +g19 +(ipydawg +DAWGNode +p47 +(dp48 +g12 +Nsg13 +(dp49 +g30 +(ipydawg +DAWGNode +p50 +(dp51 +g12 +Nsg13 +(dp52 +g19 +(ipydawg +DAWGNode +p53 +(dp54 +g12 +Nsg13 +(dp55 +g19 +(ipydawg +DAWGNode +p56 +(dp57 +g12 +Nsg13 +(dp58 +g19 +(ipydawg +DAWGNode +p59 +(dp60 +g12 +Nsg13 +(dp61 +S'L' +p62 +(ipydawg +DAWGNode +p63 +(dp64 +g12 +Nsg13 +(dp65 +S'o' +p66 +(ipydawg +DAWGNode +p67 +(dp68 +g12 +Nsg13 +(dp69 +S'c' +p70 +(ipydawg +DAWGNode +p71 +(dp72 +g12 +Nsg13 +(dp73 +S'a' +p74 +(ipydawg +DAWGNode +p75 +(dp76 +g12 +Nsg13 +(dp77 +S't' +p78 +(ipydawg +DAWGNode +p79 +(dp80 +g12 +Nsg13 +(dp81 +S'i' +p82 +(ipydawg +DAWGNode +p83 +(dp84 +g12 +Nsg13 +(dp85 +S'e' +p86 +(ipydawg +DAWGNode +p87 +(dp88 +g12 +Nsg13 +(dp89 +S':' +p90 +(ipydawg +DAWGNode +p91 
+(dp92 +g12 +Nsg13 +(dp93 +g19 +(ipydawg +DAWGNode +p94 +(dp95 +g12 +Nsg13 +(dp96 +S'\x04' +p97 +(ipydawg +DAWGNode +p98 +(dp99 +g12 +Nsg13 +(dp100 +sS'final' +p101 +I01 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbsg30 +g56 +ssg101 +I00 +sbssg101 +I00 +sbsg97 +(ipydawg +DAWGNode +p102 +(dp103 +g12 +Nsg13 +(dp104 +sg101 +I01 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbssg101 +I00 +sbsS'_numbers_valid' +p105 +I00 +sS'register' +p106 +c__builtin__ +set +p107 +((lp108 +g38 +ag98 +ag71 +ag59 +ag47 +ag75 +ag53 +ag44 +ag79 +ag83 +ag56 +ag41 +ag87 +ag50 +ag63 +ag91 +ag67 +ag94 +atp109 +Rp110 +sS'wp' +p111 +S'\x01 \x02 - \x04' +p112 +sbsS'dloc' +p113 +S'grote zaal' +p114 +sS'venue' +p115 +S'paradiso' +p116 +sS'content' +p117 +(lp118 +(lp119 +S'zaterdag 31 mei 2014 - Lentekabinet Festival Afterparty - Locatie: Tolhuistuin (zaal)' +p120 +aS'' +p121 +aa(lp122 +S'vrijdag 4 juli 2014 20:30 - The Crimson Projekct - Locatie: Tolhuistuin (zaal)' +p123 +aS'Muziek rond King Crimson' +p124 +aa(lp125 +S'dinsdag 10 juni 2014 20:30 - Het Ultieme Natuurkunde Feestje \xe2\x80\x93 keynote Amanda Gefter' +p126 +ag121 +aa(lp127 +S'dinsdag 12 augustus 2014 21:00 - Kevin Drew - Locatie: Bitterzoet' +p128 +aS'mede-oprichter Broken Social Scene solo' +p129 +aa(lp130 +S'vrijdag 4 juli 2014 22:00 - Palenke Soultribe' +p131 +aS'Electronische muziek en Afro-Colombiaanse ritmes' +p132 +aa(lp133 +S'maandag 3 november 2014 20:15 - Eefje de Visser: Waterwereldsteden - Locatie: Het Concertgebouw' +p134 +ag121 +aa(lp135 +S'zaterdag 27 september 2014 20:30 - A Great Big World - Locatie: Tolhuistuin (zaal)' +p136 +aS'Hitschrijvers uit New York' +p137 +aa(lp138 +S'zaterdag 7 juni 2014 23:00 - Benefietavond Marokkaanse Boot' +p139 +aS'Van Amsterdam naar Tanger' +p140 +aa(lp141 
+S'donderdag 13 november 2014 19:30 - Wouter Hamel' +p142 +aS'Sprankelende jazzy pop' +p143 +aa(lp144 +S'vrijdag 13 juni 2014 00:00 - Legends' +p145 +ag121 +aasS'headers' +p146 +(lp147 +S'Title' +p148 +aS'Summary' +p149 +asS'summarydawg' +p150 +(ipydawg +DAWG +p151 +(dp152 +g9 +(ipydawg +DAWGNode +p153 +(dp154 +g12 +Nsg13 +(dp155 +sg101 +I00 +sbsg105 +I00 +sg106 +g107 +((lp156 +tp157 +Rp158 +sg111 +g121 +sbsS'freq' +p159 +S'1w' +p160 +sS'adress' +p161 +S'test' +p162 +ss. \ No newline at end of file diff --git a/program/everything/crawler.py b/program/everything/crawler.py index a0029b0..5ab68d7 100644 --- a/program/everything/crawler.py +++ b/program/everything/crawler.py @@ -9,7 +9,15 @@ import sys URL_REG = re.compile( - ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))') + ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<' + ur'>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+' + ur'\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))') + +REPL = [ + ('\x01', '(?P.+)'), + ('\x02', '(?P.+)'), + ('\x03', '(?P.+)'), + ('\x04', '(?P.+)')] class Crawler(): @@ -47,25 +55,11 @@ class Crawler(): d_s = self.entries[name]['summarydawg'] r_t, r_s = [], [] for i, w in enumerate(d_t.words()): - w = w.replace('\x01', - '(?P.+)'.format(i)) - w = w.replace('\x02', - '(?P.+)'.format(i)) - w = w.replace('\x03', - '(?P.+)'.format(i)) - w = w.replace('\x04', - '(?P.+)'.format(i)) + w = reduce(lambda x, y: x.replace(y[0], y[1].format(i)), REPL, w) w = re.sub('\s+', '\\s+', w) r_t.append(w) for i, w in enumerate(d_s.words()): - w = w.replace('\x01', - '(?P.+)'.format(i)) - w = w.replace('\x02', - '(?P.+)'.format(i)) - w = w.replace('\x03', - '(?P.+)'.format(i)) - w = w.replace('\x04', - '(?P.+)'.format(i)) + w = reduce(lambda x, y: x.replace(y[0], y[1].format(i)), 
REPL, w) w = re.sub('\s+', '\\s+', w) r_s.append(w) return r_t, r_s @@ -119,31 +113,31 @@ class Crawler(): def test_entry(self, name, title, summary): print '\n', repr(title), repr(summary) - #Get regexes and match + # Get regexes and match r_t, r_s = self.get_regex(name) matcht = [re.search(t, title) for t in r_t] matchs = [re.search(t, summary) for t in r_s] - #Remove all the empty matches + # Remove all the empty matches matcht = filter(lambda x: x is not None, matcht) matchs = filter(lambda x: x is not None, matchs) - #Title urls + # Title urls print 'urls:' for i, u in enumerate(URL_REG.findall(title), 1): - print '{}: {}'.format(i, u) - #Title best match + print '{}: {}'.format(i, filter(None, u)) + # Title best match if matcht: pprint.pprint( [m.groupdict() for m in reversed(sorted(matcht, key=lambda x: len(x.groups())))][0]) else: print 'no title match' - #Summary urls + # Summary urls print 'urls:' for i, u in enumerate(URL_REG.findall(summary), 1): - print '{}: {}'.format(i, u) - #Summary best match + print '{}: {}'.format(i, filter(None, u)) + # Summary best match if matchs: pprint.pprint( [m.groupdict() for m in @@ -154,15 +148,17 @@ class Crawler(): def main(): cr = Crawler() - cr.test_entry('dedoelen', 'vr 5 mei, 08:00 uur - Abba live', '') - cr.test_entry('dedoelen', 'vr 5 mei08:00 uur - Abba live', '') - cr.test_entry('paradiso', + # cr.test_entry('dedoelen', 'vr 5 mei, 08:00 uur - Abba live', '') + # cr.test_entry('dedoelen', 'vr 5 mei08:00 uur - Abba live', '') + print cr.get_regex('para') + cr.test_entry('para', 'donderdag 13 november 2014 19:30 - Wouter Hamel', '') - cr.test_entry('paradiso', 'zaterdag 27 september 2014 20:30 - A Great Big ' + cr.test_entry('para', 'zaterdag 27 september 2014 20:30 - A Great Big ' 'World - Locatie: Tolhuistuin (zaal)', '') - cr.test_entry('paradiso', 'zaterdag 27 september 2014 20:30 - A Great Big ' - 'World - Locatie: Tolhuistuin (zaal)', '') - cr.to_dot('paradiso', 't.dot') + cr.test_entry('para', 'zaterdag 27 
september 2014 20:30 - A Great Big ' + 'World - Locatie: Tolhuistuin (zaal)', + 'http://wwww.test.nl') + cr.to_dot('para', 't.dot') if __name__ == '__main__': main() diff --git a/program/everything/t.dot b/program/everything/t.dot new file mode 100644 index 0000000..00f979b --- /dev/null +++ b/program/everything/t.dot @@ -0,0 +1,30 @@ +digraph dawg { + node [shape = doublecircle]; 8 22 + node [shape = circle]; 0 1 2 3 4 5 6 7 9 10 11 12 13 14 15 16 17 18 19 20 21 23 24 25 11 + 0 -> 1 [label = ""]; + 1 -> 2 [label = " "]; + 2 -> 3 [label = ""]; + 3 -> 4 [label = " "]; + 4 -> 5 [label = "-"]; + 5 -> 6 [label = " "]; + 6 -> 7 [label = ""]; + 6 -> 8 [label = ""]; + 7 -> 9 [label = " "]; + 9 -> 10 [label = " "]; + 9 -> 11 [label = "-"]; + 11 -> 12 [label = " "]; + 12 -> 13 [label = "L"]; + 13 -> 14 [label = "o"]; + 14 -> 15 [label = "c"]; + 15 -> 16 [label = "a"]; + 16 -> 17 [label = "t"]; + 17 -> 18 [label = "i"]; + 18 -> 19 [label = "e"]; + 19 -> 20 [label = ":"]; + 20 -> 21 [label = " "]; + 21 -> 22 [label = ""]; + 10 -> 23 [label = " "]; + 23 -> 24 [label = "-"]; + 24 -> 25 [label = " "]; + 25 -> 11 [label = " "]; +} diff --git a/thesis/introduction.tex b/thesis/introduction.tex index edcdf76..6e2cdb4 100644 --- a/thesis/introduction.tex +++ b/thesis/introduction.tex @@ -47,7 +47,6 @@ is capable of transforming raw data into structured data.} In practise this means that the end product is a software solution which does the previously described tasks. - \section{Scientific relevance} Currently the techniques for conversion from non structured data to structured data are static and mainly only usable by computer science experts. There is a @@ -58,3 +57,16 @@ The project is a continuation of the past project done by Roelofs et al.\cite{Roelofs2009}. The techniques described by Roelofs et al. are more focussed on extracting data from already isolated data so it can be an addition to the current project. 
+ +\section{Why RSS} +Web sites often change in overall structure, and the way the data is presented +also changes a lot because of new insights and layout changes. RSS feeds, on +the other hand, are often generated from the internal database of the venue's +site and are therefore almost always very precise, structured and consistent. +When the structure of an RSS feed changes, it is mostly because the content +management system of the website changes.\\ +Because RSS, compared to websites, does not have a structural dimension, there +is less information available. RSS feeds are basically raw strings which +contain all the information. Sometimes venues choose to put HTML in the RSS +feeds, but most of the time this is only to display big chunks of unstructured +text in a nicer fashion. diff --git a/thesis/methods.tex b/thesis/methods.tex index 5e5838d..47a02bb 100644 --- a/thesis/methods.tex +++ b/thesis/methods.tex @@ -2,11 +2,12 @@ The program can be divided into three components: input, data processing and the crawler. The applications are strictly separated and perform completely different tasks. The overall workflow is divided in two subtasks, namely -training and crawling. +training and crawling. The workflow is shown in figure~\ref{fig:meth1}. \begin{figure}[H] \centering \caption{Workflow of the application} + \label{fig:meth1} \scalebox{0.9}{ \begin{sequencediagram} \newinst{d}{:Database} -- 2.20.1