From f6f9e0b1870496d9f4165c135e3c233bf115388f Mon Sep 17 00:00:00 2001 From: Mart Lubbers Date: Tue, 16 Sep 2014 20:49:57 +0200 Subject: [PATCH] big update concerning web interface --- .../everything/{webdata => }/contextmenu_o.js | 3 +- program/everything/crawler.db | 1789 +---------------- program/everything/crawler.py | 74 +- program/everything/data_processing.py | 160 -- .../everything/{webdata => }/dedoelen.rss.xml | 0 program/everything/index.py | 31 + program/everything/input_app.py | 171 +- program/everything/install.sh | 5 +- program/everything/main.html.t | 41 + program/everything/output_data | 1 - .../everything/{webdata => }/paradiso.rss.xml | 0 .../everything/{webdata => }/podiuminfo.xml | 0 .../{webdata => }/ticketunlimitid.rss.xml | 0 .../everything/{webdata => }/tivoli.rss.xml | 0 program/everything/todo.txt | 3 - program/everything/uri.txt | 7 - program/everything/webdata/index.html | 14 - 17 files changed, 332 insertions(+), 1967 deletions(-) rename program/everything/{webdata => }/contextmenu_o.js (95%) delete mode 100644 program/everything/data_processing.py rename program/everything/{webdata => }/dedoelen.rss.xml (100%) create mode 100644 program/everything/index.py create mode 100644 program/everything/main.html.t delete mode 120000 program/everything/output_data rename program/everything/{webdata => }/paradiso.rss.xml (100%) rename program/everything/{webdata => }/podiuminfo.xml (100%) rename program/everything/{webdata => }/ticketunlimitid.rss.xml (100%) rename program/everything/{webdata => }/tivoli.rss.xml (100%) delete mode 100644 program/everything/todo.txt delete mode 100644 program/everything/uri.txt delete mode 100644 program/everything/webdata/index.html diff --git a/program/everything/webdata/contextmenu_o.js b/program/everything/contextmenu_o.js similarity index 95% rename from program/everything/webdata/contextmenu_o.js rename to program/everything/contextmenu_o.js index 5311a67..d241f82 100644 --- a/program/everything/webdata/contextmenu_o.js +++ b/program/everything/contextmenu_o.js @@ -25,8 +25,7 @@ function mouseUp(e) { if (curselection.endOffset - curselection.startOffset > 0) selection = curselection; console.log(selection) - if (e.which == 1) document.getElementById("contextmenu").style.visibility = "hidden"; - else if (e.which == 3) mouse_right = false + if (e.which == 3) mouse_right = false } function mouseDown(e) { diff --git a/program/everything/crawler.db b/program/everything/crawler.db index ac49fe2..e9287f8 100644 --- a/program/everything/crawler.db +++ b/program/everything/crawler.db @@ -1,5 +1,5 @@ (dp0 -S'Test1' +S'Paradiso_test1' p1 (dp2 S'website' @@ -11,1756 +11,89 @@ p5 g1 sS'url' p6 -S'http://www.paradiso.nl/rss.xml' +S'localhost/py/paradiso.rss.xml' p7 -sS'venue' +sS'dloc' p8 -S'Paradiso' +S'test' p9 -sS'dloc' +sS'venue' p10 -S'grote zaal' +S'p' p11 -sS'db' +sS'content' p12 -(dp13 -S'63662c13105245c8c98a5cc17443268a' -p14 -(dp15 -S'raw' +(lp13 +(lp14 +S'zaterdag 31 mei 2014 - Lentekabinet Festival Afterparty - Locatie: Tolhuistuin (zaal)' +p15 +aS'' p16 -(Vzondag 30 november 2014 21:00 - Catfish & The Bottlemen - Locatie: Bitterzoet -p17 -V +aa(lp17 +S'vrijdag 4 juli 2014 20:30 - The Crimson Projekct - Locatie: Tolhuistuin (zaal)' p18 -tp19 -sS'results' -p20 -(dp21 -S'waar2' -p22 -VBitterzoet +aS'Muziek rond King Crimson' +p19 +aa(lp20 +S'dinsdag 10 juni 2014 20:30 - Het Ultieme Natuurkunde Feestje \xe2\x80\x93 keynote Amanda Gefter' +p21 +ag16 +aa(lp22 +S'dinsdag 12 augustus 2014 21:00 - Kevin Drew - Locatie: Bitterzoet' p23 -sS'tijd2' +aS'mede-oprichter Broken Social Scene solo' p24 -V21:00 -p25 -sS'wat2' +aa(lp25 +S'vrijdag 4 juli 2014 22:00 - Palenke Soultribe' p26 -VCatfish & The Bottlemen +aS'Electronische muziek en Afro-Colombiaanse ritmes' p27 -sS'datum2' -p28 -Vzondag 30 november 2014 +aa(lp28 +S'maandag 3 november 2014 20:15 - Eefje de Visser: Waterwereldsteden - Locatie: Het Concertgebouw' p29 -sssS'497afcdf3c8fe95e5b63a7fd3483c88e' -p30 -(dp31 -g16 -(Vdonderdag 4 december 2014 21:30 - She Keeps Bees +ag16 +aa(lp30 +S'zaterdag 27 september 2014 20:30 - A Great Big World - Locatie: Tolhuistuin (zaal)' +p31 +aS'Hitschrijvers uit New York' p32 -g18 -tp33 -sg20 -(dp34 -S'tijd0' +aa(lp33 +S'zaterdag 7 juni 2014 23:00 - Benefietavond Marokkaanse Boot' +p34 +aS'Van Amsterdam naar Tanger' p35 -V21:30 -p36 -sS'wat0' +aa(lp36 +S'donderdag 13 november 2014 19:30 - Wouter Hamel' p37 -VShe Keeps Bees +aS'Sprankelende jazzy pop' p38 -sS'datum0' -p39 -Vdonderdag 4 december 2014 +aa(lp39 +S'vrijdag 13 juni 2014 00:00 - Legends' p40 -sssS'bc20fb8295d411fc5dd3b2b1f6cb0f4e' -p41 -(dp42 -g16 -(Vdonderdag 30 oktober 2014 22:00 - Dondergrondse: hosted by The Daily Indie @ Kelder -p43 -g18 -tp44 -sg20 -(dp45 -g35 -V22:00 -p46 -sg37 -VDondergrondse: hosted by The Daily Indie @ Kelder -p47 -sg39 -Vdonderdag 30 oktober 2014 -p48 -sssS'1aae9feccc63dfc41b653af9cebe97ed' -p49 -(dp50 -g16 -(Vzaterdag 22 november 2014 21:00 - Selda feat. Boom Pam - Locatie: Bitterzoet -p51 -g18 -tp52 -sg20 -(dp53 -g22 -VBitterzoet -p54 -sg24 -V21:00 -p55 -sg26 -VSelda feat. Boom Pam -p56 -sg28 -Vzaterdag 22 november 2014 -p57 -sssS'0e90f944f7d591d6e3c47bff6ba40301' -p58 -(dp59 -g16 -(Vvrijdag 7 november 2014 20:00 - The Mahones -p60 -g18 -tp61 -sg20 -(dp62 -g35 -V20:00 -p63 -sg37 -VThe Mahones -p64 -sg39 -Vvrijdag 7 november 2014 -p65 -sssS'73801ebeb1dfc852b8a441d4534bfd37' -p66 -(dp67 -g16 -(Vmaandag 24 november 2014 21:00 - Twin Forks - Locatie: Bitterzoet -p68 -g18 -tp69 -sg20 -(dp70 -g22 -VBitterzoet -p71 -sg24 -V21:00 -p72 -sg26 -VTwin Forks -p73 -sg28 -Vmaandag 24 november 2014 -p74 -sssS'6ae02cb00a5deb63cb417a870900d3cc' -p75 -(dp76 -g16 -(Vvrijdag 31 oktober 2014 20:30 - Rocket Cinema: Night of the Living Dead - Locatie: Paradiso Noord, Tolhuistuin (tuin) -p77 -VZombie Horror In Tolhuistuin -p78 -tp79 -sg20 -(dp80 -g22 -VParadiso Noord, Tolhuistuin (tuin) -p81 -sg24 -V20:30 -p82 -sg26 -VRocket Cinema: Night of the Living Dead -p83 -sg28 -Vvrijdag 31 oktober 2014 -p84 -sssS'fadc5b32d15a55b4bc60f5448a1c5342' -p85 -(dp86 -g16 -(Vdonderdag 18 december 2014 19:30 - dEUS -p87 -V\u2018\u2018Selected Songs 1994-2014\u201d -p88 -tp89 -sg20 -(dp90 -g35 -V19:30 -p91 -sg37 -VdEUS -p92 -sg39 -Vdonderdag 18 december 2014 -p93 -sssS'60632ff265f05913f6ebe8f6b0bf1995' -p94 -(dp95 -g16 -(Vdonderdag 9 oktober 2014 23:30 - Dondergrondse: hosted by Sweet Dreams @ Kelder -p96 -g18 -tp97 -sg20 -(dp98 -g35 -V23:30 -p99 -sg37 -VDondergrondse: hosted by Sweet Dreams @ Kelder -p100 -sg39 -Vdonderdag 9 oktober 2014 -p101 -sssS'ae8b5e28b321c9caaca5a7b56d892670' -p102 -(dp103 -g16 -(Vzondag 28 september 2014 11:00 - C4C Affordable Vintage & Fashion Fair - Locatie: Paradiso Noord, Tolhuistuin -p104 -VVintage Fashion Markt & V.I.P. Shoppen -p105 -tp106 -sg20 -(dp107 -g22 -VParadiso Noord, Tolhuistuin -p108 -sg24 -V11:00 -p109 -sg26 -VC4C Affordable Vintage & Fashion Fair -p110 -sg28 -Vzondag 28 september 2014 -p111 -sssS'c09b1ed0e482c4e27bb7c61fae4d0e15' -p112 -(dp113 -g16 -(Vvrijdag 16 januari 2015 20:30 - Kraftwerk The Catalogue 12345678 in 3-D - Autobahn (1974) -p114 -g18 -tp115 -sg20 -(dp116 -g35 -V3-D -p117 -sg37 -VAutobahn (1974) -p118 -sg39 -Vvrijdag 16 januari 2015 20:30 - Kraftwerk The Catalogue 12345678 in -p119 -sssS'91e127c55ea69be373514604cd3f0d54' -p120 -(dp121 -g16 -(Vzondag 8 maart 2015 20:30 - Thanasis Papakonstantinou & band -p122 -g18 -tp123 -sg20 -(dp124 -g35 -V20:30 -p125 -sg37 -VThanasis Papakonstantinou & band -p126 -sg39 -Vzondag 8 maart 2015 -p127 -sssS'492d8f49e92850c4b71345cf78b9cd5c' -p128 -(dp129 -g16 -(Vzaterdag 15 november 2014 21:00 - Kris Berry & Perquisite afscheidsconcert - Locatie: Bitterzoet -p130 -g18 -tp131 -sg20 -(dp132 -g22 -VBitterzoet -p133 -sg24 -V21:00 -p134 -sg26 -VKris Berry & Perquisite afscheidsconcert -p135 -sg28 -Vzaterdag 15 november 2014 -p136 -sssS'03e21261cc548333f352f3aeefb655ca' -p137 -(dp138 -g16 -(Vwoensdag 29 oktober 2014 19:00 - Sofia Dragt -p139 -g18 -tp140 -sg20 -(dp141 -g35 -V19:00 -p142 -sg37 -VSofia Dragt -p143 -sg39 -Vwoensdag 29 oktober 2014 -p144 -sssS'05acd9f7674136b0497d96d1d7331ed2' -p145 -(dp146 -g16 -(Vdonderdag 23 oktober 2014 23:30 - Noodlanding! -p147 -VDansnacht, alternatieve hits -p148 -tp149 -sg20 -(dp150 -g35 -V23:30 -p151 -sg37 -VNoodlanding! -p152 -sg39 -Vdonderdag 23 oktober 2014 -p153 -sssS'6e517291b28ec790c33819a477869519' -p154 -(dp155 -g16 -(Vvrijdag 31 oktober 2014 22:00 - Halloween Hairball - Locatie: Paradiso Noord, Tolhuistuin -p156 -VThe Freakiest Halloween Special Everrrr... -p157 -tp158 -sg20 -(dp159 -g22 -VParadiso Noord, Tolhuistuin -p160 -sg24 -V22:00 -p161 -sg26 -VHalloween Hairball -p162 -sg28 -Vvrijdag 31 oktober 2014 -p163 -sssS'7b6aeecede53f52256d7ba3278b35679' -p164 -(dp165 -g16 -(Vzaterdag 4 oktober 2014 20:30 - Fixkes - Locatie: Paradiso Noord, Tolhuistuin -p166 -g18 -tp167 -sg20 -(dp168 -g22 -VParadiso Noord, Tolhuistuin -p169 -sg24 -V20:30 -p170 -sg26 -VFixkes -p171 -sg28 -Vzaterdag 4 oktober 2014 -p172 -sssS'aca18d82bffadc7a8756531c0febec9f' -p173 -(dp174 -g16 -(Vdonderdag 2 oktober 2014 22:00 - Scoop -p175 -VParadiso's nieuwe Amsterdamse band-avond -p176 -tp177 -sg20 -(dp178 -g35 -V22:00 -p179 -sg37 -VScoop -p180 -sg39 -Vdonderdag 2 oktober 2014 -p181 -sssS'62db0423c9274dc47fdc8713e405cf14' -p182 -(dp183 -g16 -(Vwoensdag 12 november 2014 20:30 - The Drums - Locatie: Paradiso Noord, Tolhuistuin -p184 -VEncyclopedia -p185 -tp186 -sg20 -(dp187 -g22 -VParadiso Noord, Tolhuistuin -p188 -sg24 -V20:30 -p189 -sg26 -VThe Drums -p190 -sg28 -Vwoensdag 12 november 2014 -p191 -sssS'c0fbf389b316e3f931dd33fb0ec51a43' -p192 -(dp193 -g16 -(Vdonderdag 20 november 2014 20:30 - Bombay Bicycle Club -p194 -VIntelligente indie -p195 -tp196 -sg20 -(dp197 -g35 -V20:30 -p198 -sg37 -VBombay Bicycle Club -p199 -sg39 -Vdonderdag 20 november 2014 -p200 -ssssS'last_run' -p201 -F1410345938.705529 -sS'content' -p202 -(lp203 -(lp204 -S'maandag 24 november 2014 21:00 - Twin Forks - Locatie: Bitterzoet' -p205 -aS'' -p206 -aa(lp207 -S'zaterdag 22 november 2014 21:00 - Selda feat. Boom Pam - Locatie: Bitterzoet' -p208 -ag206 -aa(lp209 -S'donderdag 30 oktober 2014 22:00 - Dondergrondse: hosted by The Daily Indie @ Kelder' -p210 -ag206 -aa(lp211 -S'donderdag 9 oktober 2014 23:30 - Dondergrondse: hosted by Sweet Dreams @ Kelder' -p212 -ag206 -aa(lp213 -S'donderdag 23 oktober 2014 23:30 - Noodlanding!' -p214 -aS'Dansnacht, alternatieve hits' -p215 -aa(lp216 -S'donderdag 2 oktober 2014 22:00 - Scoop' -p217 -aS"Paradiso's nieuwe Amsterdamse band-avond" -p218 -aa(lp219 -S'donderdag 18 december 2014 19:30 - dEUS' -p220 -aS'\xe2\x80\x98\xe2\x80\x98Selected Songs 1994-2014\xe2\x80\x9d' -p221 -aa(lp222 -S'donderdag 4 december 2014 21:30 - She Keeps Bees' -p223 -ag206 -aa(lp224 -S'donderdag 20 november 2014 20:30 - Bombay Bicycle Club' -p225 -aS'Intelligente indie' -p226 -aa(lp227 -S'vrijdag 31 oktober 2014 20:30 - Rocket Cinema: Night of the Living Dead - Locatie: Paradiso Noord, Tolhuistuin (tuin)' -p228 -aS'Zombie Horror In Tolhuistuin' -p229 +ag16 aasS'headers' -p230 -(lp231 -S'Title' -p232 -aS'Summary' -p233 -asS'summarydawg' -p234 -(ipydawg -DAWG -p235 -(dp236 -S'q0' -p237 -(ipydawg -DAWGNode -p238 -(dp239 -S'children' -p240 -(dp241 -sS'final' -p242 -I00 -sS'number' -p243 -NsbsS'_numbers_valid' -p244 -I00 -sS'register' -p245 -c__builtin__ -set -p246 -((lp247 -tp248 -Rp249 -sS'wp' -p250 -g206 -sbsS'titledawg' -p251 -(ipydawg -DAWG -p252 -(dp253 -g237 -(ipydawg -DAWGNode -p254 -(dp255 -g240 -(dp256 -S'\x01' -p257 -(ipydawg -DAWGNode -p258 -(dp259 -g240 -(dp260 -S' ' -p261 -(ipydawg -DAWGNode -p262 -(dp263 -g240 -(dp264 -S'\x02' -p265 -(ipydawg -DAWGNode -p266 -(dp267 -g240 -(dp268 -g261 -(ipydawg -DAWGNode -p269 -(dp270 -g240 -(dp271 -S'-' -p272 -(ipydawg -DAWGNode -p273 -(dp274 -g240 -(dp275 -g261 -(ipydawg -DAWGNode -p276 -(dp277 -g240 -(dp278 -S'\x03' -p279 -(ipydawg -DAWGNode -p280 -(dp281 -g240 -(dp282 -g261 -(ipydawg -DAWGNode -p283 -(dp284 -g240 -(dp285 -g272 -(ipydawg -DAWGNode -p286 -(dp287 -g240 -(dp288 -g261 -(ipydawg -DAWGNode -p289 -(dp290 -g240 -(dp291 -S'L' -p292 -(ipydawg -DAWGNode -p293 -(dp294 -g240 -(dp295 -S'o' -p296 -(ipydawg -DAWGNode -p297 -(dp298 -g240 -(dp299 -S'c' -p300 -(ipydawg -DAWGNode -p301 -(dp302 -g240 -(dp303 -S'a' -p304 -(ipydawg -DAWGNode -p305 -(dp306 -g240 -(dp307 -S't' -p308 -(ipydawg -DAWGNode -p309 -(dp310 -g240 -(dp311 -S'i' -p312 -(ipydawg -DAWGNode -p313 -(dp314 -g240 -(dp315 -S'e' -p316 -(ipydawg -DAWGNode -p317 -(dp318 -g240 -(dp319 -S':' -p320 -(ipydawg -DAWGNode -p321 -(dp322 -g240 -(dp323 -g261 -(ipydawg -DAWGNode -p324 -(dp325 -g240 -(dp326 -S'\x04' -p327 -(ipydawg -DAWGNode -p328 -(dp329 -g240 -(dp330 -sg242 -I01 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbsg272 -(ipydawg -DAWGNode -p331 -(dp332 -g240 -(dp333 -g261 -(ipydawg -DAWGNode -p334 -(dp335 -g240 -(dp336 -g292 -(ipydawg -DAWGNode -p337 -(dp338 -g240 -(dp339 -g296 -(ipydawg -DAWGNode -p340 -(dp341 -g240 -(dp342 -g300 -(ipydawg -DAWGNode -p343 -(dp344 -g240 -(dp345 -g304 -(ipydawg -DAWGNode -p346 -(dp347 -g240 -(dp348 -g308 -(ipydawg -DAWGNode -p349 -(dp350 -g240 -(dp351 -g312 -(ipydawg -DAWGNode -p352 -(dp353 -g240 -(dp354 -g316 -(ipydawg -DAWGNode -p355 -(dp356 -g240 -(dp357 -g320 -(ipydawg -DAWGNode -p358 -(dp359 -g240 -(dp360 -g261 -(ipydawg -DAWGNode -p361 -(dp362 -g240 -(dp363 -g327 -(ipydawg -DAWGNode -p364 -(dp365 -g240 -(dp366 -sg242 -I01 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I01 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbssg242 -I00 -sg243 -Nsbsg244 -I00 -sg245 -g246 -((lp367 -g305 -ag317 -ag309 -ag283 -ag313 -ag286 -ag289 -ag321 -ag293 -ag297 -ag324 -ag328 -ag301 -atp368 -Rp369 -sg250 -S'\x01 \x02 - \x03- Locatie: \x04' -p370 -sbsS'freq' -p371 -S'1w' -p372 -sS'adress' -p373 -S'test' -p374 -ssS'paradiso' -p375 -(dp376 -S'website' -p377 -S'www.paradiso.nl' -p378 -sS'name' -p379 -g375 -sS'url' -p380 -S'http://www.paradiso.nl/rss.xml' -p381 -sS'db' -p382 -(dp383 -S'63662c13105245c8c98a5cc17443268a' -p384 -(dp385 -S'raw' -p386 -(Vzondag 30 november 2014 21:00 - Catfish & The Bottlemen - Locatie: Bitterzoet -p387 -g18 -tp388 -sS'results' -p389 -(dp390 -S'waar1' -p391 -VBitterzoet -p392 -sS'datum1' -p393 -Vzondag 30 november 2014 -p394 -sS'tijd1' -p395 -V21:00 -p396 -sS'wat1' -p397 -VCatfish & The Bottlemen -p398 -sssS'497afcdf3c8fe95e5b63a7fd3483c88e' -p399 -(dp400 -g386 -(Vdonderdag 4 december 2014 21:30 - She Keeps Bees -p401 -g18 -tp402 -sg389 -(dp403 -S'wat0' -p404 -VShe Keeps Bees -p405 -sS'tijd0' -p406 -V21:30 -p407 -sS'datum0' -p408 -Vdonderdag 4 december 2014 -p409 -sssS'bc20fb8295d411fc5dd3b2b1f6cb0f4e' -p410 -(dp411 -g386 -(Vdonderdag 30 oktober 2014 22:00 - Dondergrondse: hosted by The Daily Indie @ Kelder -p412 -g18 -tp413 -sg389 -(dp414 -g404 -VDondergrondse: hosted by The Daily Indie @ Kelder -p415 -sg406 -V22:00 -p416 -sg408 -Vdonderdag 30 oktober 2014 -p417 -sssS'1aae9feccc63dfc41b653af9cebe97ed' -p418 -(dp419 -g386 -(Vzaterdag 22 november 2014 21:00 - Selda feat. Boom Pam - Locatie: Bitterzoet -p420 -g18 -tp421 -sg389 -(dp422 -g391 -VBitterzoet -p423 -sg393 -Vzaterdag 22 november 2014 -p424 -sg395 -V21:00 -p425 -sg397 -VSelda feat. Boom Pam -p426 -sssS'03e21261cc548333f352f3aeefb655ca' -p427 -(dp428 -g386 -(Vwoensdag 29 oktober 2014 19:00 - Sofia Dragt -p429 -g18 -tp430 -sg389 -(dp431 -g404 -VSofia Dragt -p432 -sg406 -V19:00 -p433 -sg408 -Vwoensdag 29 oktober 2014 -p434 -sssS'62db0423c9274dc47fdc8713e405cf14' -p435 -(dp436 -g386 -(Vwoensdag 12 november 2014 20:30 - The Drums - Locatie: Paradiso Noord, Tolhuistuin -p437 -VEncyclopedia -p438 -tp439 -sg389 -(dp440 -g393 -Vwoensdag 12 november 2014 -p441 -sg391 -VParadiso Noord, Tolhuistuin -p442 -sS'wat0' -p443 -g438 -sg395 -V20:30 -p444 -sg397 -VThe Drums -p445 -sssS'6ae02cb00a5deb63cb417a870900d3cc' -p446 -(dp447 -g386 -(Vvrijdag 31 oktober 2014 20:30 - Rocket Cinema: Night of the Living Dead - Locatie: Paradiso Noord, Tolhuistuin (tuin) -p448 -VZombie Horror In Tolhuistuin -p449 -tp450 -sg389 -(dp451 -g393 -Vvrijdag 31 oktober 2014 -p452 -sg391 -VParadiso Noord, Tolhuistuin (tuin) -p453 -sg443 -g449 -sg395 -V20:30 -p454 -sg397 -VRocket Cinema: Night of the Living Dead -p455 -sssS'fadc5b32d15a55b4bc60f5448a1c5342' -p456 -(dp457 -g386 -(Vdonderdag 18 december 2014 19:30 - dEUS -p458 -V\u2018\u2018Selected Songs 1994-2014\u201d -p459 -tp460 -sg389 -(dp461 -g406 -V19:30 -p462 -sg443 -VdEUS -p463 -sg408 -Vdonderdag 18 december 2014 -p464 -sssS'60632ff265f05913f6ebe8f6b0bf1995' -p465 -(dp466 -g386 -(Vdonderdag 9 oktober 2014 23:30 - Dondergrondse: hosted by Sweet Dreams @ Kelder -p467 -g18 -tp468 -sg389 -(dp469 -g404 -VDondergrondse: hosted by Sweet Dreams @ Kelder -p470 -sg406 -V23:30 -p471 -sg408 -Vdonderdag 9 oktober 2014 -p472 -sssS'ae8b5e28b321c9caaca5a7b56d892670' -p473 -(dp474 -g386 -(Vzondag 28 september 2014 11:00 - C4C Affordable Vintage & Fashion Fair - Locatie: Paradiso Noord, Tolhuistuin -p475 -VVintage Fashion Markt & V.I.P. Shoppen -p476 -tp477 -sg389 -(dp478 -g393 -Vzondag 28 september 2014 -p479 -sg391 -VParadiso Noord, Tolhuistuin -p480 -sg443 -g476 -sg395 -V11:00 -p481 -sg397 -VC4C Affordable Vintage & Fashion Fair -p482 -sssS'c09b1ed0e482c4e27bb7c61fae4d0e15' -p483 -(dp484 -g386 -(Vvrijdag 16 januari 2015 20:30 - Kraftwerk The Catalogue 12345678 in 3-D - Autobahn (1974) -p485 -g18 -tp486 -sg389 -(dp487 -g404 -VAutobahn (1974) -p488 -sg406 -V3-D -p489 -sg408 -Vvrijdag 16 januari 2015 20:30 - Kraftwerk The Catalogue 12345678 in -p490 -sssS'91e127c55ea69be373514604cd3f0d54' -p491 -(dp492 -g386 -(Vzondag 8 maart 2015 20:30 - Thanasis Papakonstantinou & band -p493 -g18 -tp494 -sg389 -(dp495 -g404 -VThanasis Papakonstantinou & band -p496 -sg406 -V20:30 -p497 -sg408 -Vzondag 8 maart 2015 -p498 -sssS'492d8f49e92850c4b71345cf78b9cd5c' -p499 -(dp500 -g386 -(Vzaterdag 15 november 2014 21:00 - Kris Berry & Perquisite afscheidsconcert - Locatie: Bitterzoet -p501 -g18 -tp502 -sg389 -(dp503 -g391 -VBitterzoet -p504 -sg393 -Vzaterdag 15 november 2014 -p505 -sg395 -V21:00 -p506 -sg397 -VKris Berry & Perquisite afscheidsconcert -p507 -sssS'73801ebeb1dfc852b8a441d4534bfd37' -p508 -(dp509 -g386 -(Vmaandag 24 november 2014 21:00 - Twin Forks - Locatie: Bitterzoet -p510 -g18 -tp511 -sg389 -(dp512 -g391 -VBitterzoet -p513 -sg393 -Vmaandag 24 november 2014 -p514 -sg395 -V21:00 -p515 -sg397 -VTwin Forks -p516 -sssS'05acd9f7674136b0497d96d1d7331ed2' -p517 -(dp518 -g386 -(Vdonderdag 23 oktober 2014 23:30 - Noodlanding! -p519 -VDansnacht, alternatieve hits -p520 -tp521 -sg389 -(dp522 -g406 -V23:30 -p523 -sg443 -VNoodlanding! -p524 -sg408 -Vdonderdag 23 oktober 2014 -p525 -sssS'6e517291b28ec790c33819a477869519' -p526 -(dp527 -g386 -(Vvrijdag 31 oktober 2014 22:00 - Halloween Hairball - Locatie: Paradiso Noord, Tolhuistuin -p528 -VThe Freakiest Halloween Special Everrrr... -p529 -tp530 -sg389 -(dp531 -g393 -Vvrijdag 31 oktober 2014 -p532 -sg391 -VParadiso Noord, Tolhuistuin -p533 -sg443 -g529 -sg395 -V22:00 -p534 -sg397 -VHalloween Hairball -p535 -sssS'7b6aeecede53f52256d7ba3278b35679' -p536 -(dp537 -g386 -(Vzaterdag 4 oktober 2014 20:30 - Fixkes - Locatie: Paradiso Noord, Tolhuistuin -p538 -g18 -tp539 -sg389 -(dp540 -g391 -VParadiso Noord, Tolhuistuin -p541 -sg393 -Vzaterdag 4 oktober 2014 -p542 -sg395 -V20:30 -p543 -sg397 -VFixkes -p544 -sssS'aca18d82bffadc7a8756531c0febec9f' -p545 -(dp546 -g386 -(Vdonderdag 2 oktober 2014 22:00 - Scoop -p547 -VParadiso's nieuwe Amsterdamse band-avond -p548 -tp549 -sg389 -(dp550 -g406 -V22:00 -p551 -sg443 -VScoop -p552 -sg408 -Vdonderdag 2 oktober 2014 -p553 -sssS'0e90f944f7d591d6e3c47bff6ba40301' -p554 -(dp555 -g386 -(Vvrijdag 7 november 2014 20:00 - The Mahones -p556 -g18 -tp557 -sg389 -(dp558 -g404 -VThe Mahones -p559 -sg406 -V20:00 -p560 -sg408 -Vvrijdag 7 november 2014 -p561 -sssS'c0fbf389b316e3f931dd33fb0ec51a43' -p562 -(dp563 -g386 -(Vdonderdag 20 november 2014 20:30 - Bombay Bicycle Club -p564 -VIntelligente indie -p565 -tp566 -sg389 -(dp567 -g406 -V20:30 -p568 -sg443 -VBombay Bicycle Club -p569 -sg408 -Vdonderdag 20 november 2014 -p570 -ssssS'dloc' -p571 -S'nee' -p572 -sS'venue' -p573 -S'Paradiso' -p574 -sS'last_run' -p575 -F1410338734.431591 -sS'content' -p576 -(lp577 -(lp578 -S'zaterdag 22 november 2014 21:00 - Selda feat. Boom Pam - Locatie: Bitterzoet' -p579 -ag206 -aa(lp580 -S'donderdag 30 oktober 2014 22:00 - Dondergrondse: hosted by The Daily Indie @ Kelder' -p581 -ag206 -aa(lp582 -S'donderdag 9 oktober 2014 23:30 - Dondergrondse: hosted by Sweet Dreams @ Kelder' -p583 -ag206 -aa(lp584 -S'donderdag 23 oktober 2014 23:30 - Noodlanding!' -p585 -aS'Dansnacht, alternatieve hits' -p586 -aa(lp587 -S'donderdag 2 oktober 2014 22:00 - Scoop' -p588 -aS"Paradiso's nieuwe Amsterdamse band-avond" -p589 -aa(lp590 -S'donderdag 18 december 2014 19:30 - dEUS' -p591 -aS'\xe2\x80\x98\xe2\x80\x98Selected Songs 1994-2014\xe2\x80\x9d' -p592 -aa(lp593 -S'donderdag 4 december 2014 21:30 - She Keeps Bees' -p594 -ag206 -aa(lp595 -S'donderdag 20 november 2014 20:30 - Bombay Bicycle Club' -p596 -aS'Intelligente indie' -p597 -aa(lp598 -S'vrijdag 31 oktober 2014 20:30 - Rocket Cinema: Night of the Living Dead - Locatie: Paradiso Noord, Tolhuistuin (tuin)' -p599 -aS'Zombie Horror In Tolhuistuin' -p600 -aa(lp601 -S'vrijdag 31 oktober 2014 22:00 - Halloween Hairball - Locatie: Paradiso Noord, Tolhuistuin' -p602 -aS'The Freakiest Halloween Special Everrrr...' -p603 -aasS'headers' -p604 -(lp605 +p41 +(lp42 S'Title' -p606 +p43 aS'Summary' -p607 +p44 asS'summarydawg' -p608 -(ipydawg -DAWG -p609 -(dp610 -g237 -(ipydawg -DAWGNode -p611 -(dp612 -g242 -I00 -sg243 -Nsg240 -(dp613 -S'\x03' -p614 -(ipydawg -DAWGNode -p615 -(dp616 -g242 -I01 -sg243 -Nsg240 -(dp617 -sbssbsg244 -I00 -sg245 -g246 -((lp618 -tp619 -Rp620 -sg250 -g614 -sbsS'titledawg' -p621 -(ipydawg -DAWG -p622 -(dp623 -g237 -(ipydawg -DAWGNode -p624 -(dp625 -g242 -I00 -sg243 -Nsg240 -(dp626 -S'\x01' -p627 -(ipydawg -DAWGNode -p628 -(dp629 -g242 -I00 -sg243 -Nsg240 -(dp630 -S' ' -p631 -(ipydawg -DAWGNode -p632 -(dp633 -g242 -I00 -sg243 -Nsg240 -(dp634 -S'\x02' -p635 -(ipydawg -DAWGNode -p636 -(dp637 -g242 -I00 -sg243 -Nsg240 -(dp638 -g631 -(ipydawg -DAWGNode -p639 -(dp640 -g242 -I00 -sg243 -Nsg240 -(dp641 -S'-' -p642 -(ipydawg -DAWGNode -p643 -(dp644 -g242 -I00 -sg243 -Nsg240 -(dp645 -g631 -(ipydawg -DAWGNode -p646 -(dp647 -g242 -I00 -sg243 -Nsg240 -(dp648 -g614 -(ipydawg -DAWGNode -p649 -(dp650 -g242 -I01 -sg243 -Nsg240 -(dp651 -g631 -(ipydawg -DAWGNode -p652 -(dp653 -g242 -I00 -sg243 -Nsg240 -(dp654 -g631 -(ipydawg -DAWGNode -p655 -(dp656 -g242 -I00 -sg243 -Nsg240 -(dp657 -g631 -(ipydawg -DAWGNode -p658 -(dp659 -g242 -I00 -sg243 -Nsg240 -(dp660 -g642 -(ipydawg -DAWGNode -p661 -(dp662 -g242 -I00 -sg243 -Nsg240 -(dp663 -g631 -(ipydawg -DAWGNode -p664 -(dp665 -g242 -I00 -sg243 -Nsg240 -(dp666 -g631 -(ipydawg -DAWGNode -p667 -(dp668 -g242 -I00 -sg243 -Nsg240 -(dp669 -g631 -(ipydawg -DAWGNode -p670 -(dp671 -g242 -I00 -sg243 -Nsg240 -(dp672 -S'L' -p673 -(ipydawg -DAWGNode -p674 -(dp675 -g242 -I00 -sg243 -Nsg240 -(dp676 -S'o' -p677 -(ipydawg -DAWGNode -p678 -(dp679 -g242 -I00 -sg243 -Nsg240 -(dp680 -S'c' -p681 -(ipydawg -DAWGNode -p682 -(dp683 -g242 -I00 -sg243 -Nsg240 -(dp684 -S'a' -p685 -(ipydawg -DAWGNode -p686 -(dp687 -g242 -I00 -sg243 -Nsg240 -(dp688 -S't' -p689 -(ipydawg -DAWGNode -p690 -(dp691 -g242 -I00 -sg243 -Nsg240 -(dp692 -S'i' -p693 -(ipydawg -DAWGNode -p694 -(dp695 -g242 -I00 -sg243 -Nsg240 -(dp696 -S'e' -p697 -(ipydawg -DAWGNode -p698 -(dp699 -g242 -I00 -sg243 -Nsg240 -(dp700 -S':' -p701 -(ipydawg -DAWGNode -p702 -(dp703 -g242 -I00 -sg243 -Nsg240 -(dp704 -g631 -(ipydawg -DAWGNode -p705 -(dp706 -g242 -I00 -sg243 -Nsg240 -(dp707 -S'\x04' -p708 -(ipydawg -DAWGNode -p709 -(dp710 -g242 -I01 -sg243 -Nsg240 -(dp711 -sbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbsg244 -I00 -sg245 -g246 -((lp712 -tp713 -Rp714 -sg250 +p45 +(lp46 +sS'titledawg' +p47 +(lp48 S'\x01 \x02 - \x03 - Locatie: \x04' -p715 -sbsS'freq' -p716 +p49 +aS'\x01 \x02 - \x03' +p50 +asS'freq' +p51 S'1w' -p717 +p52 sS'adress' -p718 -S'amsterdam' -p719 +p53 +S'adres' +p54 ss. \ No newline at end of file diff --git a/program/everything/crawler.py b/program/everything/crawler.py index 2b531ff..a377d5b 100644 --- a/program/everything/crawler.py +++ b/program/everything/crawler.py @@ -8,6 +8,7 @@ import pickle import re import sys import time +import pydawg URL_REG = re.compile( @@ -27,41 +28,52 @@ REGEX_INT = re.compile('\d+[{}]'.format(''.join(TIMES.keys()))) class Crawler(): - def __init__(self, dbfile='./crawler.db'): + def __init__(self, dbfile='/var/www/py/crawler.db', init=False): if not os.path.exists(dbfile): self.entries = {} else: with open(dbfile, 'rb') as f: self.entries = pickle.loads(f.read()) + if init: + for k, v in self.entries.iteritems(): + if 'titledawg' in v and 'summarydawg' in v: + v['titledawg_t'] = pydawg.DAWG() + for t in sorted(set(v['titledawg'])): + v['titledawg_t'].add_word(t) + v['summarydawg_t'] = pydawg.DAWG() + for t in sorted(set(v['summarydawg'])): + v['summarydawg_t'].add_word(t) def list_names(self): - return str(self.entries.keys()) + return self.entries.keys() def add_entry(self, d): if d['name'] in self.entries: - print 'content already present... skipping' + raise Exception('That name is already present') else: self.entries[d['name']] = d - for e in self.entries: - print e - def write(self, path='./crawler.db'): + def write(self, path='/var/www/py/crawler.db'): + entries2 = {kk: {k: v for k, v in vv.iteritems() + if k not in ['summarydawg_t', 'titledawg_t']} + for kk, vv in self.entries.iteritems()} if os.path.exists(path): os.rename(path, '{}.bak'.format(path)) try: with open(path, 'wb') as f: - f.write(pickle.dumps(self.entries)) + f.write(pickle.dumps(entries2)) except Exception, e: - print 'something went wrong writing: {}'.format(e) - print 'restoring backup' + # print 'something went wrong writing: {}'.format(e) + # print 'restoring backup' + raise e os.rename('{}.bak'.format(path), path) finally: if os.path.exists('{}.bak'.format(path)): os.remove('{}.bak'.format(path)) def get_regex(self, name): - d_t = self.entries[name]['titledawg'] - d_s = self.entries[name]['summarydawg'] + d_t = self.entries[name]['titledawg_t'] + d_s = self.entries[name]['summarydawg_t'] r_t, r_s = [], [] for i, w in enumerate(d_t.words()): w = reduce(lambda x, y: x.replace(y[0], y[1].format(i)), REPL, w) @@ -175,37 +187,37 @@ class Crawler(): 'results': results, 'raw': (i['title'], i['summary']) } - print edict['db'][hashvalue]['raw'] - print edict['db'][hashvalue]['results'] - print hashvalue + # print edict['db'][hashvalue]['raw'] + # print edict['db'][hashvalue]['results'] + # print hashvalue raw_input('Press enter for the next one') -def main(): - if len(sys.argv) == 5 and sys.argv[1] == 'test': +def main(argv): + if len(argv) == 5 and argv[1] == 'test': + cr = Crawler(init=True) + print cr.test_entry(*argv[2:]) + elif len(argv) == 3 and argv[1] == 'del': cr = Crawler() - print cr.test_entry(*sys.argv[2:]) - elif len(sys.argv) == 3 and sys.argv[1] == 'del': - cr = Crawler() - if sys.argv[2] in cr.entries: - del(cr.entries[sys.argv[2]]) + if argv[2] in cr.entries: + del(cr.entries[argv[2]]) print 'Succesfull' cr.write() else: - print '{} not in the entries'.format(sys.argv[2]) - elif len(sys.argv) == 3 and sys.argv[1] == 'export': + print '{} not in the entries'.format(argv[2]) + elif len(argv) == 3 and argv[1] == 'export': cr = Crawler() for k, v in cr.entries.iteritems(): print k, '----' for kk, vv in sorted(v.iteritems()): print kk, ':', vv - elif len(sys.argv) == 5 and sys.argv[1] == 'edit': + elif len(argv) == 5 and argv[1] == 'edit': cr = Crawler() - name, key, value = sys.argv[2:] + name, key, value = argv[2:] cr.entries[name][key] = value cr.write() - elif len(sys.argv) >= 2 and sys.argv[1] == 'run': - args = sys.argv[2:] + elif len(argv) >= 2 and argv[1] == 'run': + args = argv[2:] force = True if '-f' in args else False cr = Crawler() to_run = [] @@ -227,9 +239,9 @@ def main(): else: print 'Skipping because last run was within interval' cr.write() - elif len(sys.argv) == 2 and sys.argv[1] == 'list': + elif len(argv) == 2 and argv[1] == 'list': cr = Crawler() - print cr.list_names() + print str(cr.list_names()) else: print ('Usage:\n' '\t{0} del crawlername\n' @@ -237,7 +249,7 @@ def main(): '\t{0} export FILE\n' '\t{0} list\n' '\t{0} run -f {{item1 item2 ...|all}}\n' - '\t{0} test crawlername title summary\n').format(sys.argv[0]) + '\t{0} test crawlername title summary\n').format(argv[0]) if __name__ == '__main__': - main() + main(sys.argv) diff --git a/program/everything/data_processing.py b/program/everything/data_processing.py deleted file mode 100644 index c4e971f..0000000 --- a/program/everything/data_processing.py +++ /dev/null @@ -1,160 +0,0 @@ -#!/bin/env python -# -*- coding: utf-8 -*- - -import ast -import logging -import re -import pydawg -import crawler - - -def structure_data(d): - re_hdr = re.compile('(?P.*?)', flags=re.MULTILINE | re.DOTALL) - re_row = re.compile('(?P.*)', flags=re.MULTILINE | re.DOTALL) - re_dualcel = re.compile('(?P.*?)', - flags=re.MULTILINE | re.DOTALL) - con = d['content'] - d['content'] = [] - d['headers'] = [] - for line in con.split('\n\t\t'): - if not line: - continue - row = re_row.search(line) - row = row.group('row') - for header in re_hdr.finditer(row): - d['headers'].append(header.group('h')) - d['content'].append([]) - for cell in re_dualcel.finditer(row): - d['content'][-1].append(cell.group('c')) - - -def parse_line(line): - re_spa = re.compile('(?P.*?);.*?>)(?P' - '.*?)(?P)') - results = [] - for column in line: - results.append([]) - markings = list(re_spa.finditer(column)) - if markings: - results[-1].append(markings) - return results - - -def create_nodes(d): - color_dict = { - 'rgb(139, 0, 0)': '\x01', # datum - 'red': '\x02', # tijd - 'green': '\x03', # wat - 'blue': '\x04' # wanneer - } - line_w_match = [] - d['content'] = d['content'][1:] - for i, m in enumerate(d['matchdata']): - if filter(None, m): - line_w_match.append((d['content'][i], m)) - nodelists = {'Title': [], 'Summary': []} - for (title_l, summary_l), (title_m, summary_m) in line_w_match: - # Title - if title_m: - title = title_m[0] - matches = reversed(sorted(title, key=lambda x: x.end('e'))) - for match in matches: - title_l = title_l[:match.start('e')] + title_l[match.end('e'):] - title_l = title_l[:match.start('content')] +\ - color_dict[match.group('c').strip()] +\ - title_l[match.end('content'):] - title_l = title_l[:match.start('b')] + title_l[match.end('b'):] - nodelists['Title'].append(title_l) - # Summary - if summary_m: - summary = summary_m[0] - matches = reversed(sorted(summary, key=lambda x: x.end('e'))) - for match in matches: - summary_l = summary_l[:match.start('e')] +\ - summary_l[match.end('e'):] - summary_l = summary_l[:match.start('content')] +\ - color_dict[match.group('c').strip()] +\ - summary_l[match.end('content'):] - summary_l = summary_l[:match.start('b')] +\ - summary_l[match.end('b'):] - nodelists['Summary'].append(summary_l) - return nodelists - - -def to_dot(q0): - nodenum = 0 - final_nodes = [] - nodes = [] - edges = [] - to_visit = [(0, q0)] - visited = set() - translation = [] - if q0.final: - final_nodes.append(nodenum) - else: - nodes.append(nodenum) - - nodenum += 1 - while to_visit: - current = to_visit.pop() - if not current[0] in visited: - visited.add(current[0]) - for char, child in current[1].children.iteritems(): - matches = [c for c in translation if c[0] == child] - curnum = -1 - if matches: - curnum = matches[-1][1] - else: - translation.append((child, nodenum)) - curnum = nodenum - nodenum += 1 - if child.final: - final_nodes.append(curnum) - else: - nodes.append(curnum) - edges.append((current[0], char, curnum)) - to_visit.append((curnum, child)) - print 'digraph dawg {' - print '\tnode [shape = doublecircle]; {}'.format( - ' '.join(str(n) for n in final_nodes)) - print '\tnode [shape = circle]; {}'.format( - ' '.join(str(n) for n in nodes)) - for fr, ch, to in edges: - print '\t{} -> {} [label = "{}"];'.format(fr, to, ch) - print '}' - - -def main(): - with open('./output_data/raw_out.txt', 'r') as data: - logging.info('raw data loaded, going to parse data') - d = data.readline() - d = re.sub('\)\]}$', '}', - re.sub('\)\],', ',', - re.sub('\[Field\(\'.*?\', ', '', d))) - d = ast.literal_eval(d) - logging.info('raw data parsed, going to structure data') - structure_data(d) - logging.info('data structured, parsed headers: {}'.format(d['headers'])) - logging.info('lines: {}'.format(len(d['content']))) - d['matchdata'] = [] - for line in filter(None, d['content']): - d['matchdata'].append(parse_line(line)) - nodelists = create_nodes(d) - titledawg = pydawg.DAWG() - for n in sorted(set(nodelists['Title'])): - titledawg.add_word(n) - summarydawg = pydawg.DAWG() - for n in sorted(set(nodelists['Summary'])): - summarydawg.add_word(n) - raw_input('Going to write to crawler and finish up ok?\n') - crawl = crawler.Crawler() - d['titledawg'] = titledawg - d['summarydawg'] = summarydawg - del(d['matchdata']) - crawl.add_entry(d) - crawl.write() - - -if __name__ == '__main__': - logging.basicConfig(level=logging.WARNING) - main() diff --git a/program/everything/webdata/dedoelen.rss.xml b/program/everything/dedoelen.rss.xml similarity index 100% rename from program/everything/webdata/dedoelen.rss.xml rename to program/everything/dedoelen.rss.xml diff --git a/program/everything/index.py b/program/everything/index.py new file mode 100644 index 0000000..e331645 --- /dev/null +++ b/program/everything/index.py @@ -0,0 +1,31 @@ +#!/bin/env python +# -*- codng: utf-8 -*- + +import crawler + + +def index(req, args, apok): + req.log_error('handler') + req.content_type = 'text/html' + req.send_http_header() + with open('/var/www/py/main.html.t', 'r') as f: + data = f.read() + cr = crawler.Crawler('/var/www/py/crawler.db') + ns = cr.list_names() + params = { + 'active_crawlers': + '\n'.join('{0}
'. + format(a) for a in ns), + 'active_crawlers_dropdown': + '\n'.join(''.format(a) for a in ns) + } + req.write(data.format(**params)) + return apok + + +def crawler_edit(req, args, apok): + return apok + + +def crawler_test(req, args, apok): + return apok diff --git a/program/everything/input_app.py b/program/everything/input_app.py index 8c153b9..6a155e3 100644 --- a/program/everything/input_app.py +++ b/program/everything/input_app.py @@ -3,9 +3,10 @@ from mod_python import apache, util import feedparser +import index +import crawler import re import urllib -import os def req_pre_pos(req): @@ -13,16 +14,146 @@ def req_pre_pos(req): req.content_type = 'text/html' req.send_http_header() args = util.FieldStorage(req) + listing = data_main(args) req.write( '\n\n' '\tVER: 0.01 - HyperFrontend RSS feed POSTREQUEST' '\n\n' '\tThanks submitting:
\n' - '\tEnter new rss feed\n
\n'
-        '{}\n
\n\n'.format(args)) - os.chdir('/var/www/py/files') - with open('raw_out.txt', 'w') as f: - f.write(str(args)) + '\tGo back...\n
\n'
+        'Current crawlers: {}\n
\n\n'.format(listing)) + + +def structure_data(d): + re_hdr = re.compile('(?P.*?)', flags=re.MULTILINE | re.DOTALL) + re_row = re.compile('(?P.*)', flags=re.MULTILINE | re.DOTALL) + re_dualcel = re.compile('(?P.*?)', + flags=re.MULTILINE | re.DOTALL) + con = d['content'] + d['content'] = [] + d['headers'] = [] + for line in con.split('\n\t\t'): + if not line: + continue + row = re_row.search(line) + row = row.group('row') + for header in re_hdr.finditer(row): + d['headers'].append(header.group('h')) + d['content'].append([]) + for cell in re_dualcel.finditer(row): + d['content'][-1].append(cell.group('c')) + + +def parse_line(line): + re_spa = re.compile('(?P.*?);.*?>)(?P' + '.*?)(?P)') + results = [] + for column in line: + results.append([]) + markings = list(re_spa.finditer(column)) + if markings: + results[-1].append(markings) + return results + + +def create_nodes(d): + color_dict = { + 'rgb(139, 0, 0)': '\x01', # datum + 'red': '\x02', # tijd + 'green': '\x03', # wat + 'blue': '\x04' # wanneer + } + line_w_match = [] + d['content'] = d['content'][1:] + for i, m in enumerate(d['matchdata']): + if filter(None, m): + line_w_match.append((d['content'][i], m)) + nodelists = {'Title': [], 'Summary': []} + for (title_l, summary_l), (title_m, summary_m) in line_w_match: + # Title + if title_m: + title = title_m[0] + matches = reversed(sorted(title, key=lambda x: x.end('e'))) + for match in matches: + title_l = title_l[:match.start('e')] + title_l[match.end('e'):] + title_l = title_l[:match.start('content')] +\ + color_dict[match.group('c').strip()] +\ + title_l[match.end('content'):] + title_l = title_l[:match.start('b')] + title_l[match.end('b'):] + nodelists['Title'].append(title_l) + # Summary + if summary_m: + summary = summary_m[0] + matches = reversed(sorted(summary, key=lambda x: x.end('e'))) + for match in matches: + summary_l = summary_l[:match.start('e')] +\ + summary_l[match.end('e'):] + summary_l = summary_l[:match.start('content')] +\ + color_dict[match.group('c').strip()] +\ + summary_l[match.end('content'):] + summary_l = summary_l[:match.start('b')] +\ + summary_l[match.end('b'):] + nodelists['Summary'].append(summary_l) + return nodelists + + +def to_dot(q0): + nodenum = 0 + final_nodes = [] + nodes = [] + edges = [] + to_visit = [(0, q0)] + visited = set() + translation = [] + if q0.final: + final_nodes.append(nodenum) + else: + nodes.append(nodenum) + + nodenum += 1 + while to_visit: + current = to_visit.pop() + if not current[0] in visited: + visited.add(current[0]) + for char, child in current[1].children.iteritems(): + matches = [c for c in translation if c[0] == child] + curnum = -1 + if matches: + curnum = matches[-1][1] + else: + translation.append((child, nodenum)) + curnum = nodenum + nodenum += 1 + if child.final: + final_nodes.append(curnum) + else: + nodes.append(curnum) + edges.append((current[0], char, curnum)) + to_visit.append((curnum, child)) + print 'digraph dawg {' + print '\tnode [shape = doublecircle]; {}'.format( + ' '.join(str(n) for n in final_nodes)) + print '\tnode [shape = circle]; {}'.format( + ' '.join(str(n) for n in nodes)) + for fr, ch, to in edges: + print '\t{} -> {} [label = "{}"];'.format(fr, to, ch) + print '}' + + +def data_main(d): + d = {k: str(v) for k, v in dict(d).iteritems()} + structure_data(d) + d['matchdata'] = [] + for line in filter(None, d['content']): + d['matchdata'].append(parse_line(line)) + nodelists = create_nodes(d) + d['titledawg'] = nodelists['Title'] + d['summarydawg'] = nodelists['Summary'] + del(d['matchdata']) + crawl = crawler.Crawler() + crawl.add_entry(d) + crawl.write() + return crawl.list_names() def req_pre(req, args): @@ -71,7 +202,6 @@ def feed2html(req, url, name): req.write( '\tLoading "{}" as

{}


\n'.format(url, name)) feed = feedparser.parse(url) -# channel = feed.feed req.write('\t\n') req.write('\t\t\n') for i in feed.entries[:10]: @@ -83,15 +213,22 @@ def feed2html(req, url, name): def handler(req): - if req.method == "POST": - req_pre_pos(req) + if req.uri.split('/')[-1] == 'index.py': + return index.index(req, util.FieldStorage(req), apache.OK) + elif req.uri.split('/')[-1] == 'crawler_test.py': + return index.crawler_test(req, util.FieldStorage(req), apache.OK) + elif req.uri.split('/')[-1] == 'crawler_edit.py': + return index.crawler_edit(req, util.FieldStorage(req), apache.OK) else: - args = util.FieldStorage(req) - req_pre(req, args) - if 'url' not in args and 'name' not in args: - req.write('Something went wrong, empty fields?
') - req.write('back') + if req.method == "POST": + req_pre_pos(req) else: - feed2html(req, args['url'], args['name']) - req_post(req) - return apache.OK + args = util.FieldStorage(req) + req_pre(req, args) + if 'url' not in args and 'name' not in args: + req.write('Something went wrong, empty fields?
') + req.write('back') + else: + feed2html(req, args['url'], args['name']) + req_post(req) + return apache.OK diff --git a/program/everything/install.sh b/program/everything/install.sh index 0e87fd6..741094d 100755 --- a/program/everything/install.sh +++ b/program/everything/install.sh @@ -1,7 +1,4 @@ sudo rm -rv /var/www/py/* -sudo cp -v ./input_app.py /var/www/py -sudo cp -v ./webdata/*.{xml,html,js} /var/www/py/ -sudo mkdir /var/www/py/files +sudo cp -v * /var/www/py/ sudo chown -vR mart:www-data /var/www/py sudo chmod -vR 770 /var/www/py -ln -s /var/www/py/files/ ./output_data diff --git a/program/everything/main.html.t b/program/everything/main.html.t new file mode 100644 index 0000000..4a7be42 --- /dev/null +++ b/program/everything/main.html.t @@ -0,0 +1,41 @@ + + + Crawler control center + + +
TitleSummary
+ + + + + + + + +
Inspect/edit crawlerAdd new crawlerTest crawler
+ {active_crawlers} + +
+ + + + +

RSS URL:

RSS Name:

+
+
+
+
+ + + + + +
+ +
Title:
Summary:
+
+
+ + diff --git a/program/everything/output_data b/program/everything/output_data deleted file mode 120000 index 6550f0b..0000000 --- a/program/everything/output_data +++ /dev/null @@ -1 +0,0 @@ -/var/www/py/files/ \ No newline at end of file diff --git a/program/everything/webdata/paradiso.rss.xml b/program/everything/paradiso.rss.xml similarity index 100% rename from program/everything/webdata/paradiso.rss.xml rename to program/everything/paradiso.rss.xml diff --git a/program/everything/webdata/podiuminfo.xml b/program/everything/podiuminfo.xml similarity index 100% rename from program/everything/webdata/podiuminfo.xml rename to program/everything/podiuminfo.xml diff --git a/program/everything/webdata/ticketunlimitid.rss.xml b/program/everything/ticketunlimitid.rss.xml similarity index 100% rename from program/everything/webdata/ticketunlimitid.rss.xml rename to program/everything/ticketunlimitid.rss.xml diff --git a/program/everything/webdata/tivoli.rss.xml b/program/everything/tivoli.rss.xml similarity index 100% rename from program/everything/webdata/tivoli.rss.xml rename to program/everything/tivoli.rss.xml diff --git a/program/everything/todo.txt b/program/everything/todo.txt deleted file mode 100644 index 20a3a77..0000000 --- a/program/everything/todo.txt +++ /dev/null @@ -1,3 +0,0 @@ -meer containers -minimale eisen rss feed -benadrukken waarom rss diff --git a/program/everything/uri.txt b/program/everything/uri.txt deleted file mode 100644 index 6029c01..0000000 --- a/program/everything/uri.txt +++ /dev/null @@ -1,7 +0,0 @@ -http://www.paradiso.nl/rss.xml -http://www.tivoli.nl/rss/agenda/ -http://www.stadsschouwburgendevereeniging.nl/_rss/rss.php?type=voorstellingen -http://www.dedoelen.nl/_rss/rss.php?type=voorstellingen -http://www.parktheater.nl/_rss/rss.php?type=voorstellingen -http://www.ticketunlimited.nl/ProductFeed/rssproductfeed.xml -podiuminfo.nl diff --git a/program/everything/webdata/index.html b/program/everything/webdata/index.html deleted file mode 100644 index 00ce528..0000000 --- a/program/everything/webdata/index.html +++ /dev/null @@ -1,14 +0,0 @@ - - - - -
- - - -

RSS URL:

RSS Name:
-
-
- - -- 2.20.1