big update concerning web interface
authorMart Lubbers <mart@martlubbers.net>
Tue, 16 Sep 2014 18:49:57 +0000 (20:49 +0200)
committerMart Lubbers <mart@martlubbers.net>
Tue, 16 Sep 2014 18:49:57 +0000 (20:49 +0200)
17 files changed:
program/everything/contextmenu_o.js [moved from program/everything/webdata/contextmenu_o.js with 95% similarity]
program/everything/crawler.db
program/everything/crawler.py
program/everything/data_processing.py [deleted file]
program/everything/dedoelen.rss.xml [moved from program/everything/webdata/dedoelen.rss.xml with 100% similarity]
program/everything/index.py [new file with mode: 0644]
program/everything/input_app.py
program/everything/install.sh
program/everything/main.html.t [new file with mode: 0644]
program/everything/output_data [deleted symlink]
program/everything/paradiso.rss.xml [moved from program/everything/webdata/paradiso.rss.xml with 100% similarity]
program/everything/podiuminfo.xml [moved from program/everything/webdata/podiuminfo.xml with 100% similarity]
program/everything/ticketunlimitid.rss.xml [moved from program/everything/webdata/ticketunlimitid.rss.xml with 100% similarity]
program/everything/tivoli.rss.xml [moved from program/everything/webdata/tivoli.rss.xml with 100% similarity]
program/everything/todo.txt [deleted file]
program/everything/uri.txt [deleted file]
program/everything/webdata/index.html [deleted file]

similarity index 95%
rename from program/everything/webdata/contextmenu_o.js
rename to program/everything/contextmenu_o.js
index 5311a67..d241f82 100644 (file)
@@ -25,8 +25,7 @@ function mouseUp(e) {
     if (curselection.endOffset - curselection.startOffset > 0)
        selection = curselection;
     console.log(selection)
-    if (e.which == 1) document.getElementById("contextmenu").style.visibility = "hidden";
-    else if (e.which == 3) mouse_right = false
+    if (e.which == 3) mouse_right = false
 }
 
 function mouseDown(e) {
index ac49fe2..e9287f8 100644 (file)
@@ -1,5 +1,5 @@
 (dp0
-S'Test1'
+S'Paradiso_test1'
 p1
 (dp2
 S'website'
@@ -11,1756 +11,89 @@ p5
 g1
 sS'url'
 p6
-S'http://www.paradiso.nl/rss.xml'
+S'localhost/py/paradiso.rss.xml'
 p7
-sS'venue'
+sS'dloc'
 p8
-S'Paradiso'
+S'test'
 p9
-sS'dloc'
+sS'venue'
 p10
-S'grote zaal'
+S'p'
 p11
-sS'db'
+sS'content'
 p12
-(dp13
-S'63662c13105245c8c98a5cc17443268a'
-p14
-(dp15
-S'raw'
+(lp13
+(lp14
+S'zaterdag 31 mei 2014  - Lentekabinet Festival Afterparty   -   Locatie: Tolhuistuin (zaal)'
+p15
+aS''
 p16
-(Vzondag 30 november 2014 21:00 - Catfish & The Bottlemen - Locatie: Bitterzoet
-p17
-V
+aa(lp17
+S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">vrijdag 4 juli 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">20:30</span> - <span class="uiWebviewHighlight" style="color: white; background-color: green;">The Crimson Projekct</span>   -   Locatie: <span class="uiWebviewHighlight" style="color: white; background-color: blue;">Tolhuistuin (zaal)</span>'
 p18
-tp19
-sS'results'
-p20
-(dp21
-S'waar2'
-p22
-VBitterzoet
+aS'Muziek rond King Crimson'
+p19
+aa(lp20
+S'dinsdag 10 juni 2014 20:30 - Het Ultieme Natuurkunde Feestje \xe2\x80\x93 keynote Amanda Gefter'
+p21
+ag16
+aa(lp22
+S'dinsdag 12 augustus 2014 21:00 - Kevin Drew - Locatie: Bitterzoet'
 p23
-sS'tijd2'
+aS'mede-oprichter Broken Social Scene solo'
 p24
-V21:00
-p25
-sS'wat2'
+aa(lp25
+S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">vrijdag 4 juli 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">22:00</span> - <span class="uiWebviewHighlight" style="color: white; background-color: green;">Palenke Soultribe</span>'
 p26
-VCatfish & The Bottlemen 
+aS'Electronische muziek en Afro-Colombiaanse ritmes'
 p27
-sS'datum2'
-p28
-Vzondag 30 november 2014
+aa(lp28
+S'maandag 3 november 2014 20:15 - Eefje de Visser: Waterwereldsteden - Locatie: Het Concertgebouw'
 p29
-sssS'497afcdf3c8fe95e5b63a7fd3483c88e'
-p30
-(dp31
-g16
-(Vdonderdag 4 december 2014 21:30 - She Keeps Bees
+ag16
+aa(lp30
+S'zaterdag 27 september 2014 20:30 - A Great Big World - Locatie: Tolhuistuin (zaal)'
+p31
+aS'Hitschrijvers uit New York'
 p32
-g18
-tp33
-sg20
-(dp34
-S'tijd0'
+aa(lp33
+S'zaterdag 7 juni 2014 23:00 - Benefietavond Marokkaanse Boot'
+p34
+aS'Van Amsterdam naar Tanger'
 p35
-V21:30
-p36
-sS'wat0'
+aa(lp36
+S'donderdag 13 november 2014 19:30 - Wouter Hamel'
 p37
-VShe Keeps Bees
+aS'Sprankelende jazzy pop'
 p38
-sS'datum0'
-p39
-Vdonderdag 4 december 2014
+aa(lp39
+S'vrijdag 13 juni 2014 00:00 - Legends'
 p40
-sssS'bc20fb8295d411fc5dd3b2b1f6cb0f4e'
-p41
-(dp42
-g16
-(Vdonderdag 30 oktober 2014 22:00 - Dondergrondse: hosted by The Daily Indie @ Kelder
-p43
-g18
-tp44
-sg20
-(dp45
-g35
-V22:00
-p46
-sg37
-VDondergrondse: hosted by The Daily Indie @ Kelder
-p47
-sg39
-Vdonderdag 30 oktober 2014
-p48
-sssS'1aae9feccc63dfc41b653af9cebe97ed'
-p49
-(dp50
-g16
-(Vzaterdag 22 november 2014 21:00 - Selda feat. Boom Pam   -   Locatie: Bitterzoet
-p51
-g18
-tp52
-sg20
-(dp53
-g22
-VBitterzoet
-p54
-sg24
-V21:00
-p55
-sg26
-VSelda feat. Boom Pam   
-p56
-sg28
-Vzaterdag 22 november 2014
-p57
-sssS'0e90f944f7d591d6e3c47bff6ba40301'
-p58
-(dp59
-g16
-(Vvrijdag 7 november 2014 20:00 - The Mahones
-p60
-g18
-tp61
-sg20
-(dp62
-g35
-V20:00
-p63
-sg37
-VThe Mahones
-p64
-sg39
-Vvrijdag 7 november 2014
-p65
-sssS'73801ebeb1dfc852b8a441d4534bfd37'
-p66
-(dp67
-g16
-(Vmaandag 24 november 2014 21:00 - Twin Forks - Locatie: Bitterzoet
-p68
-g18
-tp69
-sg20
-(dp70
-g22
-VBitterzoet
-p71
-sg24
-V21:00
-p72
-sg26
-VTwin Forks 
-p73
-sg28
-Vmaandag 24 november 2014
-p74
-sssS'6ae02cb00a5deb63cb417a870900d3cc'
-p75
-(dp76
-g16
-(Vvrijdag 31 oktober 2014 20:30 - Rocket Cinema: Night of the Living Dead - Locatie: Paradiso Noord, Tolhuistuin (tuin)
-p77
-VZombie Horror In Tolhuistuin
-p78
-tp79
-sg20
-(dp80
-g22
-VParadiso Noord, Tolhuistuin (tuin)
-p81
-sg24
-V20:30
-p82
-sg26
-VRocket Cinema: Night of the Living Dead 
-p83
-sg28
-Vvrijdag 31 oktober 2014
-p84
-sssS'fadc5b32d15a55b4bc60f5448a1c5342'
-p85
-(dp86
-g16
-(Vdonderdag 18 december 2014 19:30 - dEUS
-p87
-V\u2018\u2018Selected Songs 1994-2014\u201d
-p88
-tp89
-sg20
-(dp90
-g35
-V19:30
-p91
-sg37
-VdEUS
-p92
-sg39
-Vdonderdag 18 december 2014
-p93
-sssS'60632ff265f05913f6ebe8f6b0bf1995'
-p94
-(dp95
-g16
-(Vdonderdag 9 oktober 2014 23:30 - Dondergrondse:  hosted by Sweet Dreams @ Kelder
-p96
-g18
-tp97
-sg20
-(dp98
-g35
-V23:30
-p99
-sg37
-VDondergrondse:  hosted by Sweet Dreams @ Kelder
-p100
-sg39
-Vdonderdag 9 oktober 2014
-p101
-sssS'ae8b5e28b321c9caaca5a7b56d892670'
-p102
-(dp103
-g16
-(Vzondag 28 september 2014 11:00 - C4C Affordable Vintage & Fashion Fair   -   Locatie: Paradiso Noord, Tolhuistuin
-p104
-VVintage Fashion Markt & V.I.P. Shoppen
-p105
-tp106
-sg20
-(dp107
-g22
-VParadiso Noord, Tolhuistuin
-p108
-sg24
-V11:00
-p109
-sg26
-VC4C Affordable Vintage & Fashion Fair   
-p110
-sg28
-Vzondag 28 september 2014
-p111
-sssS'c09b1ed0e482c4e27bb7c61fae4d0e15'
-p112
-(dp113
-g16
-(Vvrijdag 16 januari 2015 20:30 - Kraftwerk The Catalogue 12345678 in 3-D - Autobahn (1974)
-p114
-g18
-tp115
-sg20
-(dp116
-g35
-V3-D
-p117
-sg37
-VAutobahn (1974)
-p118
-sg39
-Vvrijdag 16 januari 2015 20:30 - Kraftwerk The Catalogue 12345678 in
-p119
-sssS'91e127c55ea69be373514604cd3f0d54'
-p120
-(dp121
-g16
-(Vzondag 8 maart 2015 20:30 - Thanasis Papakonstantinou & band
-p122
-g18
-tp123
-sg20
-(dp124
-g35
-V20:30
-p125
-sg37
-VThanasis Papakonstantinou & band
-p126
-sg39
-Vzondag 8 maart 2015
-p127
-sssS'492d8f49e92850c4b71345cf78b9cd5c'
-p128
-(dp129
-g16
-(Vzaterdag 15 november 2014 21:00 - Kris Berry & Perquisite afscheidsconcert - Locatie: Bitterzoet
-p130
-g18
-tp131
-sg20
-(dp132
-g22
-VBitterzoet
-p133
-sg24
-V21:00
-p134
-sg26
-VKris Berry & Perquisite afscheidsconcert 
-p135
-sg28
-Vzaterdag 15 november 2014
-p136
-sssS'03e21261cc548333f352f3aeefb655ca'
-p137
-(dp138
-g16
-(Vwoensdag 29 oktober 2014 19:00 - Sofia Dragt
-p139
-g18
-tp140
-sg20
-(dp141
-g35
-V19:00
-p142
-sg37
-VSofia Dragt
-p143
-sg39
-Vwoensdag 29 oktober 2014
-p144
-sssS'05acd9f7674136b0497d96d1d7331ed2'
-p145
-(dp146
-g16
-(Vdonderdag 23 oktober 2014 23:30 - Noodlanding!
-p147
-VDansnacht, alternatieve hits
-p148
-tp149
-sg20
-(dp150
-g35
-V23:30
-p151
-sg37
-VNoodlanding!
-p152
-sg39
-Vdonderdag 23 oktober 2014
-p153
-sssS'6e517291b28ec790c33819a477869519'
-p154
-(dp155
-g16
-(Vvrijdag 31 oktober 2014 22:00 - Halloween Hairball - Locatie: Paradiso Noord, Tolhuistuin
-p156
-VThe Freakiest Halloween Special Everrrr...
-p157
-tp158
-sg20
-(dp159
-g22
-VParadiso Noord, Tolhuistuin
-p160
-sg24
-V22:00
-p161
-sg26
-VHalloween Hairball 
-p162
-sg28
-Vvrijdag 31 oktober 2014
-p163
-sssS'7b6aeecede53f52256d7ba3278b35679'
-p164
-(dp165
-g16
-(Vzaterdag 4 oktober 2014 20:30 - Fixkes - Locatie: Paradiso Noord, Tolhuistuin
-p166
-g18
-tp167
-sg20
-(dp168
-g22
-VParadiso Noord, Tolhuistuin
-p169
-sg24
-V20:30
-p170
-sg26
-VFixkes 
-p171
-sg28
-Vzaterdag 4 oktober 2014
-p172
-sssS'aca18d82bffadc7a8756531c0febec9f'
-p173
-(dp174
-g16
-(Vdonderdag 2 oktober 2014 22:00 - Scoop
-p175
-VParadiso's nieuwe Amsterdamse band-avond
-p176
-tp177
-sg20
-(dp178
-g35
-V22:00
-p179
-sg37
-VScoop
-p180
-sg39
-Vdonderdag 2 oktober 2014
-p181
-sssS'62db0423c9274dc47fdc8713e405cf14'
-p182
-(dp183
-g16
-(Vwoensdag 12 november 2014 20:30 - The Drums   -   Locatie: Paradiso Noord, Tolhuistuin
-p184
-VEncyclopedia
-p185
-tp186
-sg20
-(dp187
-g22
-VParadiso Noord, Tolhuistuin
-p188
-sg24
-V20:30
-p189
-sg26
-VThe Drums   
-p190
-sg28
-Vwoensdag 12 november 2014
-p191
-sssS'c0fbf389b316e3f931dd33fb0ec51a43'
-p192
-(dp193
-g16
-(Vdonderdag 20 november 2014 20:30 - Bombay Bicycle Club
-p194
-VIntelligente indie
-p195
-tp196
-sg20
-(dp197
-g35
-V20:30
-p198
-sg37
-VBombay Bicycle Club
-p199
-sg39
-Vdonderdag 20 november 2014
-p200
-ssssS'last_run'
-p201
-F1410345938.705529
-sS'content'
-p202
-(lp203
-(lp204
-S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">maandag 24 november 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">21:00</span> - <span class="uiWebviewHighlight" style="color: white; background-color: green;">Twin Forks </span>- Locatie: <span class="uiWebviewHighlight" style="color: white; background-color: blue;">Bitterzoet</span>'
-p205
-aS''
-p206
-aa(lp207
-S'zaterdag 22 november 2014 21:00 - Selda feat. Boom Pam   -   Locatie: Bitterzoet'
-p208
-ag206
-aa(lp209
-S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">donderdag 30 oktober 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">22:00</span> - <span class="uiWebviewHighlight" style="color: white; background-color: green;">Dondergrondse: hosted by The Daily Indie @ Kelder</span>'
-p210
-ag206
-aa(lp211
-S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">donderdag 9 oktober 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">23:30</span> - <span class="uiWebviewHighlight" style="color: white; background-color: green;">Dondergrondse:  hosted by Sweet Dreams @ Kelder</span>'
-p212
-ag206
-aa(lp213
-S'donderdag 23 oktober 2014 23:30 - Noodlanding!'
-p214
-aS'Dansnacht, alternatieve hits'
-p215
-aa(lp216
-S'donderdag 2 oktober 2014 22:00 - Scoop'
-p217
-aS"Paradiso's nieuwe Amsterdamse band-avond"
-p218
-aa(lp219
-S'donderdag 18 december 2014 19:30 - dEUS'
-p220
-aS'\xe2\x80\x98\xe2\x80\x98Selected Songs 1994-2014\xe2\x80\x9d'
-p221
-aa(lp222
-S'donderdag 4 december 2014 21:30 - She Keeps Bees'
-p223
-ag206
-aa(lp224
-S'donderdag 20 november 2014 20:30 - Bombay Bicycle Club'
-p225
-aS'Intelligente indie'
-p226
-aa(lp227
-S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">vrijdag 31 oktober 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">20:30</span> - <span class="uiWebviewHighlight" style="color: white; background-color: green;">Rocket Cinema: Night of the Living Dead</span> - Locatie: <span class="uiWebviewHighlight" style="color: white; background-color: blue;">Paradiso Noord, Tolhuistuin (tuin)</span>'
-p228
-aS'Zombie Horror In Tolhuistuin'
-p229
+ag16
 aasS'headers'
-p230
-(lp231
-S'Title'
-p232
-aS'Summary'
-p233
-asS'summarydawg'
-p234
-(ipydawg
-DAWG
-p235
-(dp236
-S'q0'
-p237
-(ipydawg
-DAWGNode
-p238
-(dp239
-S'children'
-p240
-(dp241
-sS'final'
-p242
-I00
-sS'number'
-p243
-NsbsS'_numbers_valid'
-p244
-I00
-sS'register'
-p245
-c__builtin__
-set
-p246
-((lp247
-tp248
-Rp249
-sS'wp'
-p250
-g206
-sbsS'titledawg'
-p251
-(ipydawg
-DAWG
-p252
-(dp253
-g237
-(ipydawg
-DAWGNode
-p254
-(dp255
-g240
-(dp256
-S'\x01'
-p257
-(ipydawg
-DAWGNode
-p258
-(dp259
-g240
-(dp260
-S' '
-p261
-(ipydawg
-DAWGNode
-p262
-(dp263
-g240
-(dp264
-S'\x02'
-p265
-(ipydawg
-DAWGNode
-p266
-(dp267
-g240
-(dp268
-g261
-(ipydawg
-DAWGNode
-p269
-(dp270
-g240
-(dp271
-S'-'
-p272
-(ipydawg
-DAWGNode
-p273
-(dp274
-g240
-(dp275
-g261
-(ipydawg
-DAWGNode
-p276
-(dp277
-g240
-(dp278
-S'\x03'
-p279
-(ipydawg
-DAWGNode
-p280
-(dp281
-g240
-(dp282
-g261
-(ipydawg
-DAWGNode
-p283
-(dp284
-g240
-(dp285
-g272
-(ipydawg
-DAWGNode
-p286
-(dp287
-g240
-(dp288
-g261
-(ipydawg
-DAWGNode
-p289
-(dp290
-g240
-(dp291
-S'L'
-p292
-(ipydawg
-DAWGNode
-p293
-(dp294
-g240
-(dp295
-S'o'
-p296
-(ipydawg
-DAWGNode
-p297
-(dp298
-g240
-(dp299
-S'c'
-p300
-(ipydawg
-DAWGNode
-p301
-(dp302
-g240
-(dp303
-S'a'
-p304
-(ipydawg
-DAWGNode
-p305
-(dp306
-g240
-(dp307
-S't'
-p308
-(ipydawg
-DAWGNode
-p309
-(dp310
-g240
-(dp311
-S'i'
-p312
-(ipydawg
-DAWGNode
-p313
-(dp314
-g240
-(dp315
-S'e'
-p316
-(ipydawg
-DAWGNode
-p317
-(dp318
-g240
-(dp319
-S':'
-p320
-(ipydawg
-DAWGNode
-p321
-(dp322
-g240
-(dp323
-g261
-(ipydawg
-DAWGNode
-p324
-(dp325
-g240
-(dp326
-S'\x04'
-p327
-(ipydawg
-DAWGNode
-p328
-(dp329
-g240
-(dp330
-sg242
-I01
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbsg272
-(ipydawg
-DAWGNode
-p331
-(dp332
-g240
-(dp333
-g261
-(ipydawg
-DAWGNode
-p334
-(dp335
-g240
-(dp336
-g292
-(ipydawg
-DAWGNode
-p337
-(dp338
-g240
-(dp339
-g296
-(ipydawg
-DAWGNode
-p340
-(dp341
-g240
-(dp342
-g300
-(ipydawg
-DAWGNode
-p343
-(dp344
-g240
-(dp345
-g304
-(ipydawg
-DAWGNode
-p346
-(dp347
-g240
-(dp348
-g308
-(ipydawg
-DAWGNode
-p349
-(dp350
-g240
-(dp351
-g312
-(ipydawg
-DAWGNode
-p352
-(dp353
-g240
-(dp354
-g316
-(ipydawg
-DAWGNode
-p355
-(dp356
-g240
-(dp357
-g320
-(ipydawg
-DAWGNode
-p358
-(dp359
-g240
-(dp360
-g261
-(ipydawg
-DAWGNode
-p361
-(dp362
-g240
-(dp363
-g327
-(ipydawg
-DAWGNode
-p364
-(dp365
-g240
-(dp366
-sg242
-I01
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I01
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbssg242
-I00
-sg243
-Nsbsg244
-I00
-sg245
-g246
-((lp367
-g305
-ag317
-ag309
-ag283
-ag313
-ag286
-ag289
-ag321
-ag293
-ag297
-ag324
-ag328
-ag301
-atp368
-Rp369
-sg250
-S'\x01 \x02 - \x03- Locatie: \x04'
-p370
-sbsS'freq'
-p371
-S'1w'
-p372
-sS'adress'
-p373
-S'test'
-p374
-ssS'paradiso'
-p375
-(dp376
-S'website'
-p377
-S'www.paradiso.nl'
-p378
-sS'name'
-p379
-g375
-sS'url'
-p380
-S'http://www.paradiso.nl/rss.xml'
-p381
-sS'db'
-p382
-(dp383
-S'63662c13105245c8c98a5cc17443268a'
-p384
-(dp385
-S'raw'
-p386
-(Vzondag 30 november 2014 21:00 - Catfish & The Bottlemen - Locatie: Bitterzoet
-p387
-g18
-tp388
-sS'results'
-p389
-(dp390
-S'waar1'
-p391
-VBitterzoet
-p392
-sS'datum1'
-p393
-Vzondag 30 november 2014
-p394
-sS'tijd1'
-p395
-V21:00
-p396
-sS'wat1'
-p397
-VCatfish & The Bottlemen
-p398
-sssS'497afcdf3c8fe95e5b63a7fd3483c88e'
-p399
-(dp400
-g386
-(Vdonderdag 4 december 2014 21:30 - She Keeps Bees
-p401
-g18
-tp402
-sg389
-(dp403
-S'wat0'
-p404
-VShe Keeps Bees
-p405
-sS'tijd0'
-p406
-V21:30
-p407
-sS'datum0'
-p408
-Vdonderdag 4 december 2014
-p409
-sssS'bc20fb8295d411fc5dd3b2b1f6cb0f4e'
-p410
-(dp411
-g386
-(Vdonderdag 30 oktober 2014 22:00 - Dondergrondse: hosted by The Daily Indie @ Kelder
-p412
-g18
-tp413
-sg389
-(dp414
-g404
-VDondergrondse: hosted by The Daily Indie @ Kelder
-p415
-sg406
-V22:00
-p416
-sg408
-Vdonderdag 30 oktober 2014
-p417
-sssS'1aae9feccc63dfc41b653af9cebe97ed'
-p418
-(dp419
-g386
-(Vzaterdag 22 november 2014 21:00 - Selda feat. Boom Pam   -   Locatie: Bitterzoet
-p420
-g18
-tp421
-sg389
-(dp422
-g391
-VBitterzoet
-p423
-sg393
-Vzaterdag 22 november 2014
-p424
-sg395
-V21:00
-p425
-sg397
-VSelda feat. Boom Pam  
-p426
-sssS'03e21261cc548333f352f3aeefb655ca'
-p427
-(dp428
-g386
-(Vwoensdag 29 oktober 2014 19:00 - Sofia Dragt
-p429
-g18
-tp430
-sg389
-(dp431
-g404
-VSofia Dragt
-p432
-sg406
-V19:00
-p433
-sg408
-Vwoensdag 29 oktober 2014
-p434
-sssS'62db0423c9274dc47fdc8713e405cf14'
-p435
-(dp436
-g386
-(Vwoensdag 12 november 2014 20:30 - The Drums   -   Locatie: Paradiso Noord, Tolhuistuin
-p437
-VEncyclopedia
-p438
-tp439
-sg389
-(dp440
-g393
-Vwoensdag 12 november 2014
-p441
-sg391
-VParadiso Noord, Tolhuistuin
-p442
-sS'wat0'
-p443
-g438
-sg395
-V20:30
-p444
-sg397
-VThe Drums  
-p445
-sssS'6ae02cb00a5deb63cb417a870900d3cc'
-p446
-(dp447
-g386
-(Vvrijdag 31 oktober 2014 20:30 - Rocket Cinema: Night of the Living Dead - Locatie: Paradiso Noord, Tolhuistuin (tuin)
-p448
-VZombie Horror In Tolhuistuin
-p449
-tp450
-sg389
-(dp451
-g393
-Vvrijdag 31 oktober 2014
-p452
-sg391
-VParadiso Noord, Tolhuistuin (tuin)
-p453
-sg443
-g449
-sg395
-V20:30
-p454
-sg397
-VRocket Cinema: Night of the Living Dead
-p455
-sssS'fadc5b32d15a55b4bc60f5448a1c5342'
-p456
-(dp457
-g386
-(Vdonderdag 18 december 2014 19:30 - dEUS
-p458
-V\u2018\u2018Selected Songs 1994-2014\u201d
-p459
-tp460
-sg389
-(dp461
-g406
-V19:30
-p462
-sg443
-VdEUS
-p463
-sg408
-Vdonderdag 18 december 2014
-p464
-sssS'60632ff265f05913f6ebe8f6b0bf1995'
-p465
-(dp466
-g386
-(Vdonderdag 9 oktober 2014 23:30 - Dondergrondse:  hosted by Sweet Dreams @ Kelder
-p467
-g18
-tp468
-sg389
-(dp469
-g404
-VDondergrondse:  hosted by Sweet Dreams @ Kelder
-p470
-sg406
-V23:30
-p471
-sg408
-Vdonderdag 9 oktober 2014
-p472
-sssS'ae8b5e28b321c9caaca5a7b56d892670'
-p473
-(dp474
-g386
-(Vzondag 28 september 2014 11:00 - C4C Affordable Vintage & Fashion Fair   -   Locatie: Paradiso Noord, Tolhuistuin
-p475
-VVintage Fashion Markt & V.I.P. Shoppen
-p476
-tp477
-sg389
-(dp478
-g393
-Vzondag 28 september 2014
-p479
-sg391
-VParadiso Noord, Tolhuistuin
-p480
-sg443
-g476
-sg395
-V11:00
-p481
-sg397
-VC4C Affordable Vintage & Fashion Fair  
-p482
-sssS'c09b1ed0e482c4e27bb7c61fae4d0e15'
-p483
-(dp484
-g386
-(Vvrijdag 16 januari 2015 20:30 - Kraftwerk The Catalogue 12345678 in 3-D - Autobahn (1974)
-p485
-g18
-tp486
-sg389
-(dp487
-g404
-VAutobahn (1974)
-p488
-sg406
-V3-D
-p489
-sg408
-Vvrijdag 16 januari 2015 20:30 - Kraftwerk The Catalogue 12345678 in
-p490
-sssS'91e127c55ea69be373514604cd3f0d54'
-p491
-(dp492
-g386
-(Vzondag 8 maart 2015 20:30 - Thanasis Papakonstantinou & band
-p493
-g18
-tp494
-sg389
-(dp495
-g404
-VThanasis Papakonstantinou & band
-p496
-sg406
-V20:30
-p497
-sg408
-Vzondag 8 maart 2015
-p498
-sssS'492d8f49e92850c4b71345cf78b9cd5c'
-p499
-(dp500
-g386
-(Vzaterdag 15 november 2014 21:00 - Kris Berry & Perquisite afscheidsconcert - Locatie: Bitterzoet
-p501
-g18
-tp502
-sg389
-(dp503
-g391
-VBitterzoet
-p504
-sg393
-Vzaterdag 15 november 2014
-p505
-sg395
-V21:00
-p506
-sg397
-VKris Berry & Perquisite afscheidsconcert
-p507
-sssS'73801ebeb1dfc852b8a441d4534bfd37'
-p508
-(dp509
-g386
-(Vmaandag 24 november 2014 21:00 - Twin Forks - Locatie: Bitterzoet
-p510
-g18
-tp511
-sg389
-(dp512
-g391
-VBitterzoet
-p513
-sg393
-Vmaandag 24 november 2014
-p514
-sg395
-V21:00
-p515
-sg397
-VTwin Forks
-p516
-sssS'05acd9f7674136b0497d96d1d7331ed2'
-p517
-(dp518
-g386
-(Vdonderdag 23 oktober 2014 23:30 - Noodlanding!
-p519
-VDansnacht, alternatieve hits
-p520
-tp521
-sg389
-(dp522
-g406
-V23:30
-p523
-sg443
-VNoodlanding!
-p524
-sg408
-Vdonderdag 23 oktober 2014
-p525
-sssS'6e517291b28ec790c33819a477869519'
-p526
-(dp527
-g386
-(Vvrijdag 31 oktober 2014 22:00 - Halloween Hairball - Locatie: Paradiso Noord, Tolhuistuin
-p528
-VThe Freakiest Halloween Special Everrrr...
-p529
-tp530
-sg389
-(dp531
-g393
-Vvrijdag 31 oktober 2014
-p532
-sg391
-VParadiso Noord, Tolhuistuin
-p533
-sg443
-g529
-sg395
-V22:00
-p534
-sg397
-VHalloween Hairball
-p535
-sssS'7b6aeecede53f52256d7ba3278b35679'
-p536
-(dp537
-g386
-(Vzaterdag 4 oktober 2014 20:30 - Fixkes - Locatie: Paradiso Noord, Tolhuistuin
-p538
-g18
-tp539
-sg389
-(dp540
-g391
-VParadiso Noord, Tolhuistuin
-p541
-sg393
-Vzaterdag 4 oktober 2014
-p542
-sg395
-V20:30
-p543
-sg397
-VFixkes
-p544
-sssS'aca18d82bffadc7a8756531c0febec9f'
-p545
-(dp546
-g386
-(Vdonderdag 2 oktober 2014 22:00 - Scoop
-p547
-VParadiso's nieuwe Amsterdamse band-avond
-p548
-tp549
-sg389
-(dp550
-g406
-V22:00
-p551
-sg443
-VScoop
-p552
-sg408
-Vdonderdag 2 oktober 2014
-p553
-sssS'0e90f944f7d591d6e3c47bff6ba40301'
-p554
-(dp555
-g386
-(Vvrijdag 7 november 2014 20:00 - The Mahones
-p556
-g18
-tp557
-sg389
-(dp558
-g404
-VThe Mahones
-p559
-sg406
-V20:00
-p560
-sg408
-Vvrijdag 7 november 2014
-p561
-sssS'c0fbf389b316e3f931dd33fb0ec51a43'
-p562
-(dp563
-g386
-(Vdonderdag 20 november 2014 20:30 - Bombay Bicycle Club
-p564
-VIntelligente indie
-p565
-tp566
-sg389
-(dp567
-g406
-V20:30
-p568
-sg443
-VBombay Bicycle Club
-p569
-sg408
-Vdonderdag 20 november 2014
-p570
-ssssS'dloc'
-p571
-S'nee'
-p572
-sS'venue'
-p573
-S'Paradiso'
-p574
-sS'last_run'
-p575
-F1410338734.431591
-sS'content'
-p576
-(lp577
-(lp578
-S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">zaterdag 22 november 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">21:00</span> - <span class="uiWebviewHighlight" style="color: white; background-color: green;">Selda feat. Boom Pam</span>   -   Locatie: <span class="uiWebviewHighlight" style="color: white; background-color: blue;">Bitterzoet</span>'
-p579
-ag206
-aa(lp580
-S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">donderdag 30 oktober 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">22:00</span> - <span class="uiWebviewHighlight" style="color: white; background-color: green;">Dondergrondse: hosted by The Daily Indie @ Kelder</span>'
-p581
-ag206
-aa(lp582
-S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">donderdag 9 oktober 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">23:30</span> - <span class="uiWebviewHighlight" style="color: white; background-color: green;">Dondergrondse:  hosted by Sweet Dreams @ Kelder</span>'
-p583
-ag206
-aa(lp584
-S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">donderdag 23 oktober 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">23:30</span> - <span class="uiWebviewHighlight" style="color: white; background-color: green;">Noodlanding!</span>'
-p585
-aS'<span class="uiWebviewHighlight" style="color: white; background-color: green;">Dansnacht, alternatieve hits</span>'
-p586
-aa(lp587
-S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">donderdag 2 oktober 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">22:00</span> - <span class="uiWebviewHighlight" style="color: white; background-color: green;">Scoop</span>'
-p588
-aS"Paradiso's nieuwe Amsterdamse band-avond"
-p589
-aa(lp590
-S'donderdag 18 december 2014 19:30 - dEUS'
-p591
-aS'\xe2\x80\x98\xe2\x80\x98Selected Songs 1994-2014\xe2\x80\x9d'
-p592
-aa(lp593
-S'donderdag 4 december 2014 21:30 - She Keeps Bees'
-p594
-ag206
-aa(lp595
-S'donderdag 20 november 2014 20:30 - Bombay Bicycle Club'
-p596
-aS'Intelligente indie'
-p597
-aa(lp598
-S'vrijdag 31 oktober 2014 20:30 - Rocket Cinema: Night of the Living Dead - Locatie: Paradiso Noord, Tolhuistuin (tuin)'
-p599
-aS'Zombie Horror In Tolhuistuin'
-p600
-aa(lp601
-S'vrijdag 31 oktober 2014 22:00 - Halloween Hairball - Locatie: Paradiso Noord, Tolhuistuin'
-p602
-aS'The Freakiest Halloween Special Everrrr...'
-p603
-aasS'headers'
-p604
-(lp605
+p41
+(lp42
 S'Title'
-p606
+p43
 aS'Summary'
-p607
+p44
 asS'summarydawg'
-p608
-(ipydawg
-DAWG
-p609
-(dp610
-g237
-(ipydawg
-DAWGNode
-p611
-(dp612
-g242
-I00
-sg243
-Nsg240
-(dp613
-S'\x03'
-p614
-(ipydawg
-DAWGNode
-p615
-(dp616
-g242
-I01
-sg243
-Nsg240
-(dp617
-sbssbsg244
-I00
-sg245
-g246
-((lp618
-tp619
-Rp620
-sg250
-g614
-sbsS'titledawg'
-p621
-(ipydawg
-DAWG
-p622
-(dp623
-g237
-(ipydawg
-DAWGNode
-p624
-(dp625
-g242
-I00
-sg243
-Nsg240
-(dp626
-S'\x01'
-p627
-(ipydawg
-DAWGNode
-p628
-(dp629
-g242
-I00
-sg243
-Nsg240
-(dp630
-S' '
-p631
-(ipydawg
-DAWGNode
-p632
-(dp633
-g242
-I00
-sg243
-Nsg240
-(dp634
-S'\x02'
-p635
-(ipydawg
-DAWGNode
-p636
-(dp637
-g242
-I00
-sg243
-Nsg240
-(dp638
-g631
-(ipydawg
-DAWGNode
-p639
-(dp640
-g242
-I00
-sg243
-Nsg240
-(dp641
-S'-'
-p642
-(ipydawg
-DAWGNode
-p643
-(dp644
-g242
-I00
-sg243
-Nsg240
-(dp645
-g631
-(ipydawg
-DAWGNode
-p646
-(dp647
-g242
-I00
-sg243
-Nsg240
-(dp648
-g614
-(ipydawg
-DAWGNode
-p649
-(dp650
-g242
-I01
-sg243
-Nsg240
-(dp651
-g631
-(ipydawg
-DAWGNode
-p652
-(dp653
-g242
-I00
-sg243
-Nsg240
-(dp654
-g631
-(ipydawg
-DAWGNode
-p655
-(dp656
-g242
-I00
-sg243
-Nsg240
-(dp657
-g631
-(ipydawg
-DAWGNode
-p658
-(dp659
-g242
-I00
-sg243
-Nsg240
-(dp660
-g642
-(ipydawg
-DAWGNode
-p661
-(dp662
-g242
-I00
-sg243
-Nsg240
-(dp663
-g631
-(ipydawg
-DAWGNode
-p664
-(dp665
-g242
-I00
-sg243
-Nsg240
-(dp666
-g631
-(ipydawg
-DAWGNode
-p667
-(dp668
-g242
-I00
-sg243
-Nsg240
-(dp669
-g631
-(ipydawg
-DAWGNode
-p670
-(dp671
-g242
-I00
-sg243
-Nsg240
-(dp672
-S'L'
-p673
-(ipydawg
-DAWGNode
-p674
-(dp675
-g242
-I00
-sg243
-Nsg240
-(dp676
-S'o'
-p677
-(ipydawg
-DAWGNode
-p678
-(dp679
-g242
-I00
-sg243
-Nsg240
-(dp680
-S'c'
-p681
-(ipydawg
-DAWGNode
-p682
-(dp683
-g242
-I00
-sg243
-Nsg240
-(dp684
-S'a'
-p685
-(ipydawg
-DAWGNode
-p686
-(dp687
-g242
-I00
-sg243
-Nsg240
-(dp688
-S't'
-p689
-(ipydawg
-DAWGNode
-p690
-(dp691
-g242
-I00
-sg243
-Nsg240
-(dp692
-S'i'
-p693
-(ipydawg
-DAWGNode
-p694
-(dp695
-g242
-I00
-sg243
-Nsg240
-(dp696
-S'e'
-p697
-(ipydawg
-DAWGNode
-p698
-(dp699
-g242
-I00
-sg243
-Nsg240
-(dp700
-S':'
-p701
-(ipydawg
-DAWGNode
-p702
-(dp703
-g242
-I00
-sg243
-Nsg240
-(dp704
-g631
-(ipydawg
-DAWGNode
-p705
-(dp706
-g242
-I00
-sg243
-Nsg240
-(dp707
-S'\x04'
-p708
-(ipydawg
-DAWGNode
-p709
-(dp710
-g242
-I01
-sg243
-Nsg240
-(dp711
-sbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbssbsg244
-I00
-sg245
-g246
-((lp712
-tp713
-Rp714
-sg250
+p45
+(lp46
+sS'titledawg'
+p47
+(lp48
 S'\x01 \x02 - \x03   -   Locatie: \x04'
-p715
-sbsS'freq'
-p716
+p49
+aS'\x01 \x02 - \x03'
+p50
+asS'freq'
+p51
 S'1w'
-p717
+p52
 sS'adress'
-p718
-S'amsterdam'
-p719
+p53
+S'adres'
+p54
 ss.
\ No newline at end of file
index 2b531ff..a377d5b 100644 (file)
@@ -8,6 +8,7 @@ import pickle
 import re
 import sys
 import time
+import pydawg
 
 
 URL_REG = re.compile(
@@ -27,41 +28,52 @@ REGEX_INT = re.compile('\d+[{}]'.format(''.join(TIMES.keys())))
 
 
 class Crawler():
-    def __init__(self, dbfile='./crawler.db'):
+    def __init__(self, dbfile='/var/www/py/crawler.db', init=False):
         if not os.path.exists(dbfile):
             self.entries = {}
         else:
             with open(dbfile, 'rb') as f:
                 self.entries = pickle.loads(f.read())
+        if init:
+            for k, v in self.entries.iteritems():
+                if 'titledawg' in v and 'summarydawg' in v:
+                    v['titledawg_t'] = pydawg.DAWG()
+                    for t in sorted(set(v['titledawg'])):
+                        v['titledawg_t'].add_word(t)
+                    v['summarydawg_t'] = pydawg.DAWG()
+                    for t in sorted(set(v['summarydawg'])):
+                        v['summarydawg_t'].add_word(t)
 
     def list_names(self):
-        return str(self.entries.keys())
+        return self.entries.keys()
 
     def add_entry(self, d):
         if d['name'] in self.entries:
-            print 'content already present... skipping'
+            raise Exception('That name is already present')
         else:
             self.entries[d['name']] = d
-        for e in self.entries:
-            print e
 
-    def write(self, path='./crawler.db'):
+    def write(self, path='/var/www/py/crawler.db'):
+        entries2 = {kk: {k: v for k, v in vv.iteritems()
+                    if k not in ['summarydawg_t', 'titledawg_t']}
+                    for kk, vv in self.entries.iteritems()}
         if os.path.exists(path):
             os.rename(path, '{}.bak'.format(path))
         try:
             with open(path, 'wb') as f:
-                f.write(pickle.dumps(self.entries))
+                f.write(pickle.dumps(entries2))
         except Exception, e:
-            print 'something went wrong writing: {}'.format(e)
-            print 'restoring backup'
+            #  print 'something went wrong writing: {}'.format(e)
+            #  print 'restoring backup'
+            raise e
             os.rename('{}.bak'.format(path), path)
         finally:
             if os.path.exists('{}.bak'.format(path)):
                 os.remove('{}.bak'.format(path))
 
     def get_regex(self, name):
-        d_t = self.entries[name]['titledawg']
-        d_s = self.entries[name]['summarydawg']
+        d_t = self.entries[name]['titledawg_t']
+        d_s = self.entries[name]['summarydawg_t']
         r_t, r_s = [], []
         for i, w in enumerate(d_t.words()):
             w = reduce(lambda x, y: x.replace(y[0], y[1].format(i)), REPL, w)
@@ -175,37 +187,37 @@ class Crawler():
                 'results': results,
                 'raw': (i['title'], i['summary'])
             }
-            print edict['db'][hashvalue]['raw']
-            print edict['db'][hashvalue]['results']
-            print hashvalue
+            #  print edict['db'][hashvalue]['raw']
+            #  print edict['db'][hashvalue]['results']
+            #  print hashvalue
             raw_input('Press enter for the next one')
 
 
-def main():
-    if len(sys.argv) == 5 and sys.argv[1] == 'test':
+def main(argv):
+    if len(argv) == 5 and argv[1] == 'test':
+        cr = Crawler(init=True)
+        print cr.test_entry(*argv[2:])
+    elif len(argv) == 3 and argv[1] == 'del':
         cr = Crawler()
-        print cr.test_entry(*sys.argv[2:])
-    elif len(sys.argv) == 3 and sys.argv[1] == 'del':
-        cr = Crawler()
-        if sys.argv[2] in cr.entries:
-            del(cr.entries[sys.argv[2]])
+        if argv[2] in cr.entries:
+            del(cr.entries[argv[2]])
             print 'Succesfull'
             cr.write()
         else:
-            print '{} not in the entries'.format(sys.argv[2])
-    elif len(sys.argv) == 3 and sys.argv[1] == 'export':
+            print '{} not in the entries'.format(argv[2])
+    elif len(argv) == 3 and argv[1] == 'export':
         cr = Crawler()
         for k, v in cr.entries.iteritems():
             print k, '----'
             for kk, vv in sorted(v.iteritems()):
                 print kk, ':', vv
-    elif len(sys.argv) == 5 and sys.argv[1] == 'edit':
+    elif len(argv) == 5 and argv[1] == 'edit':
         cr = Crawler()
-        name, key, value = sys.argv[2:]
+        name, key, value = argv[2:]
         cr.entries[name][key] = value
         cr.write()
-    elif len(sys.argv) >= 2 and sys.argv[1] == 'run':
-        args = sys.argv[2:]
+    elif len(argv) >= 2 and argv[1] == 'run':
+        args = argv[2:]
         force = True if '-f' in args else False
         cr = Crawler()
         to_run = []
@@ -227,9 +239,9 @@ def main():
             else:
                 print 'Skipping because last run was within interval'
         cr.write()
-    elif len(sys.argv) == 2 and sys.argv[1] == 'list':
+    elif len(argv) == 2 and argv[1] == 'list':
         cr = Crawler()
-        print cr.list_names()
+        print str(cr.list_names())
     else:
         print ('Usage:\n'
                '\t{0} del crawlername\n'
@@ -237,7 +249,7 @@ def main():
                '\t{0} export FILE\n'
                '\t{0} list\n'
                '\t{0} run -f {{item1 item2 ...|all}}\n'
-               '\t{0} test crawlername title summary\n').format(sys.argv[0])
+               '\t{0} test crawlername title summary\n').format(argv[0])
 
 if __name__ == '__main__':
-    main()
+    main(sys.argv)
diff --git a/program/everything/data_processing.py b/program/everything/data_processing.py
deleted file mode 100644 (file)
index c4e971f..0000000
+++ /dev/null
@@ -1,160 +0,0 @@
-#!/bin/env python
-# -*- coding: utf-8 -*-
-
-import ast
-import logging
-import re
-import pydawg
-import crawler
-
-
-def structure_data(d):
-    re_hdr = re.compile('<th>(?P<h>.*?)</th>', flags=re.MULTILINE | re.DOTALL)
-    re_row = re.compile('<tr>(?P<row>.*)</tr>', flags=re.MULTILINE | re.DOTALL)
-    re_dualcel = re.compile('<td id="cel">(?P<c>.*?)</td><!--cel-->',
-                            flags=re.MULTILINE | re.DOTALL)
-    con = d['content']
-    d['content'] = []
-    d['headers'] = []
-    for line in con.split('\n\t\t'):
-        if not line:
-            continue
-        row = re_row.search(line)
-        row = row.group('row')
-        for header in re_hdr.finditer(row):
-            d['headers'].append(header.group('h'))
-        d['content'].append([])
-        for cell in re_dualcel.finditer(row):
-            d['content'][-1].append(cell.group('c'))
-
-
-def parse_line(line):
-    re_spa = re.compile('(?P<b><span.*?background-color:\s*(?P<c>.*?);.*?>)(?P'
-                        '<content>.*?)(?P<e></span>)')
-    results = []
-    for column in line:
-        results.append([])
-        markings = list(re_spa.finditer(column))
-        if markings:
-            results[-1].append(markings)
-    return results
-
-
-def create_nodes(d):
-    color_dict = {
-        'rgb(139, 0, 0)': '\x01',   # datum
-        'red': '\x02',              # tijd
-        'green': '\x03',            # wat
-        'blue': '\x04'              # wanneer
-        }
-    line_w_match = []
-    d['content'] = d['content'][1:]
-    for i, m in enumerate(d['matchdata']):
-        if filter(None, m):
-            line_w_match.append((d['content'][i], m))
-    nodelists = {'Title': [], 'Summary': []}
-    for (title_l, summary_l), (title_m, summary_m) in line_w_match:
-        # Title
-        if title_m:
-            title = title_m[0]
-            matches = reversed(sorted(title, key=lambda x: x.end('e')))
-            for match in matches:
-                title_l = title_l[:match.start('e')] + title_l[match.end('e'):]
-                title_l = title_l[:match.start('content')] +\
-                    color_dict[match.group('c').strip()] +\
-                    title_l[match.end('content'):]
-                title_l = title_l[:match.start('b')] + title_l[match.end('b'):]
-            nodelists['Title'].append(title_l)
-        # Summary
-        if summary_m:
-            summary = summary_m[0]
-            matches = reversed(sorted(summary, key=lambda x: x.end('e')))
-            for match in matches:
-                summary_l = summary_l[:match.start('e')] +\
-                    summary_l[match.end('e'):]
-                summary_l = summary_l[:match.start('content')] +\
-                    color_dict[match.group('c').strip()] +\
-                    summary_l[match.end('content'):]
-                summary_l = summary_l[:match.start('b')] +\
-                    summary_l[match.end('b'):]
-            nodelists['Summary'].append(summary_l)
-    return nodelists
-
-
-def to_dot(q0):
-    nodenum = 0
-    final_nodes = []
-    nodes = []
-    edges = []
-    to_visit = [(0, q0)]
-    visited = set()
-    translation = []
-    if q0.final:
-        final_nodes.append(nodenum)
-    else:
-        nodes.append(nodenum)
-
-    nodenum += 1
-    while to_visit:
-        current = to_visit.pop()
-        if not current[0] in visited:
-            visited.add(current[0])
-            for char, child in current[1].children.iteritems():
-                matches = [c for c in translation if c[0] == child]
-                curnum = -1
-                if matches:
-                    curnum = matches[-1][1]
-                else:
-                    translation.append((child, nodenum))
-                    curnum = nodenum
-                    nodenum += 1
-                if child.final:
-                    final_nodes.append(curnum)
-                else:
-                    nodes.append(curnum)
-                edges.append((current[0], char, curnum))
-                to_visit.append((curnum, child))
-    print 'digraph dawg {'
-    print '\tnode [shape = doublecircle]; {}'.format(
-        ' '.join(str(n) for n in final_nodes))
-    print '\tnode [shape = circle]; {}'.format(
-        ' '.join(str(n) for n in nodes))
-    for fr, ch, to in edges:
-        print '\t{} -> {} [label = "{}"];'.format(fr, to, ch)
-    print '}'
-
-
-def main():
-    with open('./output_data/raw_out.txt', 'r') as data:
-        logging.info('raw data loaded, going to parse data')
-        d = data.readline()
-        d = re.sub('\)\]}$', '}',
-                   re.sub('\)\],', ',',
-                          re.sub('\[Field\(\'.*?\', ', '', d)))
-        d = ast.literal_eval(d)
-    logging.info('raw data parsed, going to structure data')
-    structure_data(d)
-    logging.info('data structured, parsed headers: {}'.format(d['headers']))
-    logging.info('lines: {}'.format(len(d['content'])))
-    d['matchdata'] = []
-    for line in filter(None, d['content']):
-        d['matchdata'].append(parse_line(line))
-    nodelists = create_nodes(d)
-    titledawg = pydawg.DAWG()
-    for n in sorted(set(nodelists['Title'])):
-        titledawg.add_word(n)
-    summarydawg = pydawg.DAWG()
-    for n in sorted(set(nodelists['Summary'])):
-        summarydawg.add_word(n)
-    raw_input('Going to write to crawler and finish up ok?\n')
-    crawl = crawler.Crawler()
-    d['titledawg'] = titledawg
-    d['summarydawg'] = summarydawg
-    del(d['matchdata'])
-    crawl.add_entry(d)
-    crawl.write()
-
-
-if __name__ == '__main__':
-    logging.basicConfig(level=logging.WARNING)
-    main()
diff --git a/program/everything/index.py b/program/everything/index.py
new file mode 100644 (file)
index 0000000..e331645
--- /dev/null
@@ -0,0 +1,31 @@
+#!/bin/env python
+# -*- codng: utf-8 -*-
+
+import crawler
+
+
+def index(req, args, apok):
+    req.log_error('handler')
+    req.content_type = 'text/html'
+    req.send_http_header()
+    with open('/var/www/py/main.html.t', 'r') as f:
+        data = f.read()
+    cr = crawler.Crawler('/var/www/py/crawler.db')
+    ns = cr.list_names()
+    params = {
+        'active_crawlers':
+            '\n'.join('<a href="./crawler_edit.py?url={0}">{0}</a><br>'.
+                      format(a) for a in ns),
+        'active_crawlers_dropdown':
+            '\n'.join('<option value={0}>{0}</option>'.format(a) for a in ns)
+    }
+    req.write(data.format(**params))
+    return apok
+
+
+def crawler_edit(req, args, apok):
+    return apok
+
+
+def crawler_test(req, args, apok):
+    return apok
index 8c153b9..6a155e3 100644 (file)
@@ -3,9 +3,10 @@
 
 from mod_python import apache, util
 import feedparser
+import index
+import crawler
 import re
 import urllib
-import os
 
 
 def req_pre_pos(req):
@@ -13,16 +14,146 @@ def req_pre_pos(req):
     req.content_type = 'text/html'
     req.send_http_header()
     args = util.FieldStorage(req)
+    listing = data_main(args)
     req.write(
         '<html>\n<head>\n'
         '\t<title>VER: 0.01 - HyperFrontend RSS feed POSTREQUEST</title>'
         '</head>\n<body>\n'
         '\tThanks submitting: <br />\n'
-        '\t<a href="index.html">Enter new rss feed</a>\n<pre>\n'
-        '{}\n</pre>\n</body>\n</html>'.format(args))
-    os.chdir('/var/www/py/files')
-    with open('raw_out.txt', 'w') as f:
-        f.write(str(args))
+        '\t<a href="index.py">Go back...</a>\n<pre>\n'
+        'Current crawlers: {}\n</pre>\n</body>\n</html>'.format(listing))
+
+
+def structure_data(d):
+    re_hdr = re.compile('<th>(?P<h>.*?)</th>', flags=re.MULTILINE | re.DOTALL)
+    re_row = re.compile('<tr>(?P<row>.*)</tr>', flags=re.MULTILINE | re.DOTALL)
+    re_dualcel = re.compile('<td id="cel">(?P<c>.*?)</td><!--cel-->',
+                            flags=re.MULTILINE | re.DOTALL)
+    con = d['content']
+    d['content'] = []
+    d['headers'] = []
+    for line in con.split('\n\t\t'):
+        if not line:
+            continue
+        row = re_row.search(line)
+        row = row.group('row')
+        for header in re_hdr.finditer(row):
+            d['headers'].append(header.group('h'))
+        d['content'].append([])
+        for cell in re_dualcel.finditer(row):
+            d['content'][-1].append(cell.group('c'))
+
+
+def parse_line(line):
+    re_spa = re.compile('(?P<b><span.*?background-color:\s*(?P<c>.*?);.*?>)(?P'
+                        '<content>.*?)(?P<e></span>)')
+    results = []
+    for column in line:
+        results.append([])
+        markings = list(re_spa.finditer(column))
+        if markings:
+            results[-1].append(markings)
+    return results
+
+
+def create_nodes(d):
+    color_dict = {
+        'rgb(139, 0, 0)': '\x01',   # datum
+        'red': '\x02',              # tijd
+        'green': '\x03',            # wat
+        'blue': '\x04'              # wanneer
+        }
+    line_w_match = []
+    d['content'] = d['content'][1:]
+    for i, m in enumerate(d['matchdata']):
+        if filter(None, m):
+            line_w_match.append((d['content'][i], m))
+    nodelists = {'Title': [], 'Summary': []}
+    for (title_l, summary_l), (title_m, summary_m) in line_w_match:
+        # Title
+        if title_m:
+            title = title_m[0]
+            matches = reversed(sorted(title, key=lambda x: x.end('e')))
+            for match in matches:
+                title_l = title_l[:match.start('e')] + title_l[match.end('e'):]
+                title_l = title_l[:match.start('content')] +\
+                    color_dict[match.group('c').strip()] +\
+                    title_l[match.end('content'):]
+                title_l = title_l[:match.start('b')] + title_l[match.end('b'):]
+            nodelists['Title'].append(title_l)
+        # Summary
+        if summary_m:
+            summary = summary_m[0]
+            matches = reversed(sorted(summary, key=lambda x: x.end('e')))
+            for match in matches:
+                summary_l = summary_l[:match.start('e')] +\
+                    summary_l[match.end('e'):]
+                summary_l = summary_l[:match.start('content')] +\
+                    color_dict[match.group('c').strip()] +\
+                    summary_l[match.end('content'):]
+                summary_l = summary_l[:match.start('b')] +\
+                    summary_l[match.end('b'):]
+            nodelists['Summary'].append(summary_l)
+    return nodelists
+
+
+def to_dot(q0):
+    nodenum = 0
+    final_nodes = []
+    nodes = []
+    edges = []
+    to_visit = [(0, q0)]
+    visited = set()
+    translation = []
+    if q0.final:
+        final_nodes.append(nodenum)
+    else:
+        nodes.append(nodenum)
+
+    nodenum += 1
+    while to_visit:
+        current = to_visit.pop()
+        if not current[0] in visited:
+            visited.add(current[0])
+            for char, child in current[1].children.iteritems():
+                matches = [c for c in translation if c[0] == child]
+                curnum = -1
+                if matches:
+                    curnum = matches[-1][1]
+                else:
+                    translation.append((child, nodenum))
+                    curnum = nodenum
+                    nodenum += 1
+                if child.final:
+                    final_nodes.append(curnum)
+                else:
+                    nodes.append(curnum)
+                edges.append((current[0], char, curnum))
+                to_visit.append((curnum, child))
+    print 'digraph dawg {'
+    print '\tnode [shape = doublecircle]; {}'.format(
+        ' '.join(str(n) for n in final_nodes))
+    print '\tnode [shape = circle]; {}'.format(
+        ' '.join(str(n) for n in nodes))
+    for fr, ch, to in edges:
+        print '\t{} -> {} [label = "{}"];'.format(fr, to, ch)
+    print '}'
+
+
+def data_main(d):
+    d = {k: str(v) for k, v in dict(d).iteritems()}
+    structure_data(d)
+    d['matchdata'] = []
+    for line in filter(None, d['content']):
+        d['matchdata'].append(parse_line(line))
+    nodelists = create_nodes(d)
+    d['titledawg'] = nodelists['Title']
+    d['summarydawg'] = nodelists['Summary']
+    del(d['matchdata'])
+    crawl = crawler.Crawler()
+    crawl.add_entry(d)
+    crawl.write()
+    return crawl.list_names()
 
 
 def req_pre(req, args):
@@ -71,7 +202,6 @@ def feed2html(req, url, name):
     req.write(
         '\tLoading "{}" as <p id="rssname">{}</p><br />\n'.format(url, name))
     feed = feedparser.parse(url)
-#    channel = feed.feed
     req.write('\t<table id="content-table" border="1" id="htab">\n')
     req.write('\t\t<tr><th>Title</th><th>Summary</th></tr>\n')
     for i in feed.entries[:10]:
@@ -83,15 +213,22 @@ def feed2html(req, url, name):
 
 
 def handler(req):
-    if req.method == "POST":
-        req_pre_pos(req)
+    if req.uri.split('/')[-1] == 'index.py':
+        return index.index(req, util.FieldStorage(req), apache.OK)
+    elif req.uri.split('/')[-1] == 'crawler_test.py':
+        return index.crawler_test(req, util.FieldStorage(req), apache.OK)
+    elif req.uri.split('/')[-1] == 'crawler_edit.py':
+        return index.crawler_edit(req, util.FieldStorage(req), apache.OK)
     else:
-        args = util.FieldStorage(req)
-        req_pre(req, args)
-        if 'url' not in args and 'name' not in args:
-            req.write('Something went wrong, empty fields?<br />')
-            req.write('<a href="index.html">back</a>')
+        if req.method == "POST":
+            req_pre_pos(req)
         else:
-            feed2html(req, args['url'], args['name'])
-        req_post(req)
-    return apache.OK
+            args = util.FieldStorage(req)
+            req_pre(req, args)
+            if 'url' not in args and 'name' not in args:
+                req.write('Something went wrong, empty fields?<br />')
+                req.write('<a href="index.html">back</a>')
+            else:
+                feed2html(req, args['url'], args['name'])
+            req_post(req)
+        return apache.OK
index 0e87fd6..741094d 100755 (executable)
@@ -1,7 +1,4 @@
 sudo rm -rv /var/www/py/*
-sudo cp -v ./input_app.py /var/www/py
-sudo cp -v ./webdata/*.{xml,html,js} /var/www/py/
-sudo mkdir /var/www/py/files
+sudo cp -v * /var/www/py/
 sudo chown -vR mart:www-data /var/www/py
 sudo chmod -vR 770 /var/www/py
-ln -s /var/www/py/files/ ./output_data
diff --git a/program/everything/main.html.t b/program/everything/main.html.t
new file mode 100644 (file)
index 0000000..4a7be42
--- /dev/null
@@ -0,0 +1,41 @@
+<html>
+<head>
+       <title>Crawler control center</title>
+</head>
+<body>
+       <table border=1>
+               <tr>
+                       <td>Inspect/edit crawler</td>
+                       <td>Add new crawler</td>
+                       <td>Test crawler</td>
+               <tr>
+                       <td>
+                               {active_crawlers}
+                       </td>
+                       <td>
+                               <form method="get" action="./input_app.py">
+                                       <table>
+                                               <tr><td><p>RSS URL:</td><td><input type="text" name="url" value="localhost/py/paradiso.rss.xml"></td></tr>
+                                               <tr><td>RSS Name:</td><td><input type="text" name="name"></td></tr>
+                                               <tr><td><input type="submit" value="Submit"></p></td></tr>
+                                       </table>
+                               </form>
+                       <td>
+                               <br />
+                               <form method="get" action="./crawler_test.py">
+                                       <table>
+                                               <tr><td>
+                                                       <select name="name">
+                                                               {active_crawlers_dropdown}
+                                                       </select>
+                                               </td></tr>
+                                               <tr><td>Title:</td><td><input type="text" name="title"></td></tr>
+                                               <tr><td>Summary:</td><td><input type="text" name="summary"></td></tr>
+                                               <tr><td><input type="submit" value="Submit"></td></tr>
+                                       </table>
+                               </form>
+                       </td>
+               </tr>
+       </table>
+</body>
+</html>
diff --git a/program/everything/output_data b/program/everything/output_data
deleted file mode 120000 (symlink)
index 6550f0b..0000000
+++ /dev/null
@@ -1 +0,0 @@
-/var/www/py/files/
\ No newline at end of file
diff --git a/program/everything/todo.txt b/program/everything/todo.txt
deleted file mode 100644 (file)
index 20a3a77..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-meer containers
-minimale eisen rss feed
-benadrukken waarom rss
diff --git a/program/everything/uri.txt b/program/everything/uri.txt
deleted file mode 100644 (file)
index 6029c01..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-http://www.paradiso.nl/rss.xml
-http://www.tivoli.nl/rss/agenda/
-http://www.stadsschouwburgendevereeniging.nl/_rss/rss.php?type=voorstellingen
-http://www.dedoelen.nl/_rss/rss.php?type=voorstellingen
-http://www.parktheater.nl/_rss/rss.php?type=voorstellingen
-http://www.ticketunlimited.nl/ProductFeed/rssproductfeed.xml
-podiuminfo.nl
diff --git a/program/everything/webdata/index.html b/program/everything/webdata/index.html
deleted file mode 100644 (file)
index 00ce528..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-<html>
-    <head>
-    </head>
-    <body>
-        <form method="get" action="./input_app.py">
-            <table>
-                <tr><td><p>RSS URL:  </td><td><input type="text" name="url"
-                        value="localhost/py/paradiso.rss.xml"></td></tr>
-                <tr><td>RSS Name: </td><td><input type="text" name="name"></td></tr>
-                <tr><td><input type="submit" value="Submit"</p>
-            </table>
-        </form>
-    </body>
-</html>