--- /dev/null
+2014-08-14 - Datumprikker maken voor overleg met franc.
+2014-08-21 -
+2014-08-28 - Programma af
+2014-09-04 - Scriptie af?
+2014-09-11 - Commentaar verwerken?
+2014-09-18 - Scriptie laatste versie
+2014-09-25 - ?? Presenteren
DAWGNode
p10
(dp11
-S'final'
+S'children'
p12
-I00
-sS'number'
-p13
-NsS'children'
-p14
-(dp15
+(dp13
S'\x01'
-p16
+p14
(ipydawg
DAWGNode
-p17
-(dp18
+p15
+(dp16
g12
-I00
-sg13
-Nsg14
-(dp19
+(dp17
S','
-p20
+p18
(ipydawg
DAWGNode
-p21
-(dp22
+p19
+(dp20
g12
-I00
-sg13
-Nsg14
-(dp23
+(dp21
S' '
-p24
+p22
(ipydawg
DAWGNode
-p25
-(dp26
+p23
+(dp24
g12
-I00
-sg13
-Nsg14
-(dp27
+(dp25
S'\x02'
-p28
+p26
(ipydawg
DAWGNode
-p29
-(dp30
+p27
+(dp28
g12
-I00
-sg13
-Nsg14
-(dp31
-g24
+(dp29
+g22
(ipydawg
DAWGNode
-p32
-(dp33
+p30
+(dp31
g12
-I00
-sg13
-Nsg14
-(dp34
+(dp32
S'u'
-p35
+p33
(ipydawg
DAWGNode
-p36
-(dp37
+p34
+(dp35
g12
-I00
-sg13
-Nsg14
-(dp38
-g35
+(dp36
+g33
(ipydawg
DAWGNode
-p39
-(dp40
+p37
+(dp38
g12
-I00
-sg13
-Nsg14
-(dp41
+(dp39
S'r'
-p42
+p40
(ipydawg
DAWGNode
-p43
-(dp44
+p41
+(dp42
g12
-I00
-sg13
-Nsg14
-(dp45
-g24
+(dp43
+g22
(ipydawg
DAWGNode
-p46
-(dp47
+p44
+(dp45
g12
-I00
-sg13
-Nsg14
-(dp48
+(dp46
S'-'
-p49
+p47
(ipydawg
DAWGNode
-p50
-(dp51
+p48
+(dp49
g12
-I00
-sg13
-Nsg14
-(dp52
-g24
+(dp50
+g22
(ipydawg
DAWGNode
-p53
-(dp54
+p51
+(dp52
g12
-I00
-sg13
-Nsg14
-(dp55
+(dp53
S'\x03'
-p56
+p54
(ipydawg
DAWGNode
-p57
-(dp58
+p55
+(dp56
g12
+(dp57
+sS'final'
+p58
I01
-sg13
-Nsg14
-(dp59
-sbssbssbssbssbssbssbssbssbssbssbssbssbsS'_numbers_valid'
+sS'number'
+p59
+Nsbssg58
+I00
+sg59
+Nsbssg58
+I00
+sg59
+Nsbssg58
+I00
+sg59
+Nsbssg58
+I00
+sg59
+Nsbssg58
+I00
+sg59
+Nsbssg58
+I00
+sg59
+Nsbssg58
+I00
+sg59
+Nsbssg58
+I00
+sg59
+Nsbssg58
+I00
+sg59
+Nsbssg58
+I00
+sg59
+Nsbssg58
+I00
+sg59
+Nsbssg58
+I00
+sg59
+NsbsS'_numbers_valid'
p60
I00
sS'register'
p67
sbsS'dloc'
p68
-S'test'
+S'Grote zaal'
p69
sS'venue'
p70
-S'De doelen'
+S'De Doelen'
p71
sS'content'
p72
(lp73
(lp74
-S'<span class="uiWebviewHighlight" style="background-color: rgb(139, 0, 0); color: white;">za 16 aug</span>, <span class="uiWebviewHighlight" style="background-color: red; color: white;">20.15</span> uur - <span class="uiWebviewHighlight" style="background-color: green; color: white;">Elvis & More 2014 - Ren\xc3\xa9 Shuman & Angel-Eye</span>'
+S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">za 16 aug</span>, <span class="uiWebviewHighlight" style="color: white; background-color: red;">20.15</span> uur - <span class="uiWebviewHighlight" style="color: white; background-color: green;">Elvis & More 2014 - Ren\xc3\xa9 Shuman & Angel-Eye</span>'
p75
aS'<b>Ode aan 60 jaar rock-and-roll door Ren\xc3\xa9 Shuman & Angel-Eye</b><br>In 1954 werd de song That\xe2\x80\x99s allright mama door Elvis Presley, The King of Rock and Roll, opgenomen en uitgebracht. Het duo Ren\xc3\xa9 Shuman & Angel-Eye houdt zijn muziek in ere met hun theatershows, tv-specials en opnamen. Ze trekken met hun showband volle zalen met hun Rock and Roll-vertolkingen. En jaarlijks geeft het duo een bijzonder concert op 16 augustus, de sterfdag van Elvis.<br><br>Ren\xc3\xa9: \xe2\x80\x98Dit jaar zal ons Elvis & More-concert plaatsvinden in de Doelen, de grootste klassieke concertzaal van Nederland met een prachtige akoestiek. We willen onze fans trakteren op een energieke en mooie show met subliem geluid en comfortabele zitplaatsen. Een hele avond genieten van 60 jaar Rock and Roll-geschiedenis!\xe2\x80\x99<br><br><ul><li><a href="http://www.elvisandmore.com" target="_blank">www.elvisandmore.com</a></li></ul>'
p76
aa(lp77
-S'<span class="uiWebviewHighlight" style="background-color: rgb(139, 0, 0); color: white;">vr 05 sep</span>, <span class="uiWebviewHighlight" style="background-color: red; color: white;">20.00</span> uur - <span class="uiWebviewHighlight" style="background-color: green; color: white;">Night of Ziryab - Amsterdams Andalusisch Orkest i.s.m. Orchestre Temsamani (Tetuan)</span>'
+S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">vr 05 sep</span>, <span class="uiWebviewHighlight" style="color: white; background-color: red;">20.00</span> uur - <span class="uiWebviewHighlight" style="color: white; background-color: green;">Night of Ziryab - Amsterdams Andalusisch Orkest i.s.m. Orchestre Temsamani (Tetuan)</span>'
p78
aS''
p79
aa(lp80
-S'<span class="uiWebviewHighlight" style="background-color: rgb(139, 0, 0); color: white;">vr 12 sep</span>, <span class="uiWebviewHighlight" style="background-color: red; color: white;">13.00</span> uur - <span class="uiWebviewHighlight" style="background-color: green; color: white;">Rotterdam Philharmonic Gergiev Festival - Locatietheater | Niemandsland (I)</span>'
+S'vr 12 sep, 13.00 uur - Rotterdam Philharmonic Gergiev Festival - Locatietheater | Niemandsland (I)'
p81
aS'Rode draad door het Gergiev Festival vormt de muziektheatervoorstelling \xe2\x80\x98Niemandsland\xe2\x80\x99 van regisseur Serge van Veggel. Een veteraan haalt herinneringen op aan zijn oorlogsverleden, dat in donkere kelders opeens weer heel dichtbij lijkt te komen. Hij symboliseert de \xe2\x80\x98Onbekende Soldaat\xe2\x80\x99 in een collage van originele dagboekfragmenten en gedichten, liedjes of soms absurde oorlogsverhalen. Muziek komt van het theatrale Ragazze Kwartet: gaandeweg zal de sublieme eenheid van het klassieke strijkkwartet door de toenemende chaos steeds meer verbrokkelen.<br><br>\xe2\x80\x98...hoe je met weinig middelen schitterende sc\xc3\xa8nes kunt bouwen\xe2\x80\x99 - NRC Handelsblad over het locatietheater van regisseur Serge van Veggel.<br><br>Deze voorstelling wordt georganiseerd in samenwerking met het muziektheatergezelschap OPERA2DAY.<br><br>Locatietheater met als startpunt de Eduard Flipse Zaal. De voorstelling bestaat uit twee gedeelten. Het tweede gedeelte is niet goed te bezoeken voor hen die slecht ter been zijn of voor rolstoelgebruikers.<br><br><ul><li><a href="http://www.gergievfestival.nl" target="_blank">www.gergievfestival.nl</a></li></ul>'
p82
p111
(dp112
g12
-I00
-sg13
-Nsg14
(dp113
-sbsg60
+sg58
+I00
+sg59
+Nsbsg60
I00
sg61
g62
g79
sbsS'freq'
p117
-S'1w'
+S'1d'
p118
sS'adress'
p119
-S'amsterdam'
+S'Straat 1 Eindhoven'
p120
ssS'paradiso'
p121
(dp122
-S'website'
+S'name'
p123
-S'www.paradiso.nl'
-p124
-sS'name'
-p125
g121
sS'titledawg'
-p126
+p124
(ipydawg
DAWG
-p127
-(dp128
+p125
+(dp126
g9
(ipydawg
DAWGNode
-p129
-(dp130
-g13
-Nsg14
-(dp131
+p127
+(dp128
+g59
+Nsg12
+(dp129
S'\x01'
-p132
+p130
(ipydawg
DAWGNode
-p133
-(dp134
-g13
-Nsg14
-(dp135
+p131
+(dp132
+g59
+Nsg12
+(dp133
S' '
-p136
+p134
(ipydawg
DAWGNode
-p137
-(dp138
-g13
-Nsg14
-(dp139
+p135
+(dp136
+g59
+Nsg12
+(dp137
S'\x02'
-p140
+p138
(ipydawg
DAWGNode
-p141
-(dp142
-g13
-Nsg14
-(dp143
-g136
+p139
+(dp140
+g59
+Nsg12
+(dp141
+g134
(ipydawg
DAWGNode
-p144
-(dp145
-g13
-Nsg14
-(dp146
+p142
+(dp143
+g59
+Nsg12
+(dp144
S'-'
-p147
+p145
(ipydawg
DAWGNode
-p148
-(dp149
-g13
-Nsg14
-(dp150
-g136
+p146
+(dp147
+g59
+Nsg12
+(dp148
+g134
(ipydawg
DAWGNode
-p151
-(dp152
-g13
-Nsg14
-(dp153
+p149
+(dp150
+g59
+Nsg12
+(dp151
S'\x03'
-p154
+p152
+(ipydawg
+DAWGNode
+p153
+(dp154
+g59
+Nsg12
+(dp155
+g134
(ipydawg
DAWGNode
-p155
-(dp156
-g13
-Nsg14
+p156
(dp157
-g136
+g59
+Nsg12
+(dp158
+g134
(ipydawg
DAWGNode
-p158
-(dp159
-g13
-Nsg14
+p159
(dp160
-g136
+g59
+Nsg12
+(dp161
+g134
(ipydawg
DAWGNode
-p161
-(dp162
-g13
-Nsg14
+p162
(dp163
-g136
+g59
+Nsg12
+(dp164
+g145
(ipydawg
DAWGNode
-p164
-(dp165
-g13
-Nsg14
+p165
(dp166
-g147
+g59
+Nsg12
+(dp167
+g134
(ipydawg
DAWGNode
-p167
-(dp168
-g13
-Nsg14
+p168
(dp169
-g136
+g59
+Nsg12
+(dp170
+g134
(ipydawg
DAWGNode
-p170
-(dp171
-g13
-Nsg14
+p171
(dp172
-g136
+g59
+Nsg12
+(dp173
+g134
(ipydawg
DAWGNode
-p173
-(dp174
-g13
-Nsg14
+p174
(dp175
-g136
-(ipydawg
-DAWGNode
-p176
-(dp177
-g13
-Nsg14
-(dp178
+g59
+Nsg12
+(dp176
S'L'
-p179
+p177
(ipydawg
DAWGNode
-p180
-(dp181
-g13
-Nsg14
-(dp182
+p178
+(dp179
+g59
+Nsg12
+(dp180
S'o'
-p183
+p181
(ipydawg
DAWGNode
-p184
-(dp185
-g13
-Nsg14
-(dp186
+p182
+(dp183
+g59
+Nsg12
+(dp184
S'c'
-p187
+p185
(ipydawg
DAWGNode
-p188
-(dp189
-g13
-Nsg14
-(dp190
+p186
+(dp187
+g59
+Nsg12
+(dp188
S'a'
-p191
+p189
(ipydawg
DAWGNode
-p192
-(dp193
-g13
-Nsg14
-(dp194
+p190
+(dp191
+g59
+Nsg12
+(dp192
S't'
-p195
+p193
(ipydawg
DAWGNode
-p196
-(dp197
-g13
-Nsg14
-(dp198
+p194
+(dp195
+g59
+Nsg12
+(dp196
S'i'
-p199
+p197
(ipydawg
DAWGNode
-p200
-(dp201
-g13
-Nsg14
-(dp202
+p198
+(dp199
+g59
+Nsg12
+(dp200
S'e'
-p203
+p201
(ipydawg
DAWGNode
-p204
-(dp205
-g13
-Nsg14
-(dp206
+p202
+(dp203
+g59
+Nsg12
+(dp204
S':'
-p207
+p205
(ipydawg
DAWGNode
-p208
-(dp209
-g13
-Nsg14
-(dp210
-g136
+p206
+(dp207
+g59
+Nsg12
+(dp208
+g134
(ipydawg
DAWGNode
-p211
-(dp212
-g13
-Nsg14
-(dp213
+p209
+(dp210
+g59
+Nsg12
+(dp211
S'\x04'
-p214
+p212
(ipydawg
DAWGNode
-p215
-(dp216
-g13
-Nsg14
-(dp217
-sg12
+p213
+(dp214
+g59
+Nsg12
+(dp215
+sg58
I01
-sbssg12
-I00
-sbssg12
-I00
-sbssg12
-I00
-sbssg12
-I00
-sbssg12
-I00
-sbssg12
-I00
-sbssg12
-I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
-I00
-sbssg12
-I00
-sbssg12
-I00
-sbssg12
-I00
-sbsg147
-(ipydawg
-DAWGNode
-p218
-(dp219
-g13
-Nsg14
-(dp220
-g136
-(ipydawg
-DAWGNode
-p221
-(dp222
-g13
-Nsg14
-(dp223
-g179
-(ipydawg
-DAWGNode
-p224
-(dp225
-g13
-Nsg14
-(dp226
-g183
-(ipydawg
-DAWGNode
-p227
-(dp228
-g13
-Nsg14
-(dp229
-g187
-(ipydawg
-DAWGNode
-p230
-(dp231
-g13
-Nsg14
-(dp232
-g191
-(ipydawg
-DAWGNode
-p233
-(dp234
-g13
-Nsg14
-(dp235
-g195
-(ipydawg
-DAWGNode
-p236
-(dp237
-g13
-Nsg14
-(dp238
-g199
-(ipydawg
-DAWGNode
-p239
-(dp240
-g13
-Nsg14
-(dp241
-g203
-(ipydawg
-DAWGNode
-p242
-(dp243
-g13
-Nsg14
-(dp244
-g207
-(ipydawg
-DAWGNode
-p245
-(dp246
-g13
-Nsg14
-(dp247
-g136
-(ipydawg
-DAWGNode
-p248
-(dp249
-g13
-Nsg14
-(dp250
-g214
-(ipydawg
-DAWGNode
-p251
-(dp252
-g13
-Nsg14
-(dp253
-sg12
-I01
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I01
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
-sbssg12
+sbssg58
I00
sbsg60
I00
sg61
g62
-((lp254
-g164
-ag188
-ag161
-ag192
-ag208
-ag196
-ag170
-ag200
-ag173
-ag204
-ag176
-ag180
-ag167
-ag211
-ag184
-ag215
-atp255
-Rp256
+((lp216
+tp217
+Rp218
sg66
-S'\x01 \x02 - \x03 - Locatie: \x04'
-p257
-sbsS'dloc'
-p258
-S'Grote Zaal'
-p259
-sS'venue'
-p260
-S'Paradiso'
-p261
-sS'content'
-p262
-(lp263
-(lp264
+S'\x01 \x02 - \x03 - Locatie: \x04'
+p219
+sbsS'content'
+p220
+(lp221
+(lp222
S'zaterdag 31 mei 2014 - Lentekabinet Festival Afterparty - Locatie: Tolhuistuin (zaal)'
-p265
+p223
ag79
-aa(lp266
-S'<span class="uiWebviewHighlight" style="background-color: rgb(139, 0, 0); color: white;">vrijdag 4 juli 2014</span> <span class="uiWebviewHighlight" style="background-color: red; color: white;">20:30</span> - <span class="uiWebviewHighlight" style="background-color: green; color: white;">The Crimson Projekct</span> - Locatie: <span class="uiWebviewHighlight" style="background-color: blue; color: white;">Tolhuistuin (zaal)</span>'
-p267
+aa(lp224
+S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">vrijdag 4 juli 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">20:30</span> - <span class="uiWebviewHighlight" style="color: white; background-color: green;">The Crimson Projekct</span> - Locatie: <span class="uiWebviewHighlight" style="color: white; background-color: blue;">Tolhuistuin (zaal)</span>'
+p225
aS'Muziek rond King Crimson'
-p268
-aa(lp269
-S'<span class="uiWebviewHighlight" style="background-color: rgb(139, 0, 0); color: white;">dinsdag 10 juni 2014</span> <span class="uiWebviewHighlight" style="background-color: red; color: white;">20:30</span> - <span class="uiWebviewHighlight" style="background-color: green; color: white;">Het Ultieme Natuurkunde Feestje \xe2\x80\x93 keynote Amanda Gefter</span>'
-p270
+p226
+aa(lp227
+S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">dinsdag 10 juni 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">20:30</span> - <span class="uiWebviewHighlight" style="color: white; background-color: green;">Het Ultieme Natuurkunde Feestje \xe2\x80\x93 keynote Amanda Gefter</span>'
+p228
ag79
-aa(lp271
-S'<span class="uiWebviewHighlight" style="background-color: rgb(139, 0, 0); color: white;">dinsdag 12 augustus 2014</span> <span class="uiWebviewHighlight" style="background-color: red; color: white;">21:00</span> - <span class="uiWebviewHighlight" style="background-color: green; color: white;">Kevin Drew</span> - Locatie: <span class="uiWebviewHighlight" style="background-color: blue; color: white;">Bitterzoet</span>'
-p272
+aa(lp229
+S'dinsdag 12 augustus 2014 21:00 - Kevin Drew - Locatie: Bitterzoet'
+p230
aS'mede-oprichter Broken Social Scene solo'
-p273
-aa(lp274
-S'vrijdag 4 juli 2014 22:00 - Palenke Soultribe'
-p275
+p231
+aa(lp232
+S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">vrijdag 4 juli 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">22:00</span> - <span class="uiWebviewHighlight" style="color: white; background-color: green;">Palenke Soultribe</span>'
+p233
aS'Electronische muziek en Afro-Colombiaanse ritmes'
-p276
-aa(lp277
+p234
+aa(lp235
S'maandag 3 november 2014 20:15 - Eefje de Visser: Waterwereldsteden - Locatie: Het Concertgebouw'
-p278
+p236
ag79
-aa(lp279
+aa(lp237
S'zaterdag 27 september 2014 20:30 - A Great Big World - Locatie: Tolhuistuin (zaal)'
-p280
+p238
aS'Hitschrijvers uit New York'
-p281
-aa(lp282
+p239
+aa(lp240
S'zaterdag 7 juni 2014 23:00 - Benefietavond Marokkaanse Boot'
-p283
+p241
aS'Van Amsterdam naar Tanger'
-p284
-aa(lp285
+p242
+aa(lp243
S'donderdag 13 november 2014 19:30 - Wouter Hamel'
-p286
+p244
aS'Sprankelende jazzy pop'
-p287
-aa(lp288
+p245
+aa(lp246
S'vrijdag 13 juni 2014 00:00 - Legends'
-p289
+p247
ag79
aasS'headers'
-p290
-(lp291
+p248
+(lp249
S'Title'
-p292
+p250
aS'Summary'
-p293
+p251
asS'summarydawg'
-p294
+p252
(ipydawg
DAWG
-p295
-(dp296
+p253
+(dp254
g9
(ipydawg
DAWGNode
-p297
-(dp298
-g13
-Nsg14
-(dp299
-sg12
+p255
+(dp256
+g59
+Nsg12
+(dp257
+sg58
I00
sbsg60
I00
sg61
g62
-((lp300
-tp301
-Rp302
+((lp258
+tp259
+Rp260
sg66
g79
-sbsS'freq'
-p303
-S'1w'
-p304
-sS'adress'
-p305
-S'Amsterdam'
-p306
-ss.
\ No newline at end of file
+sbss.
\ No newline at end of file
import pickle
import re
import os
+import pprint
+import sys
class Crawler():
d_s = self.entries[name]['summarydawg']
r_t, r_s = [], []
for i, w in enumerate(d_t.words()):
- w = w.replace('\x01', '(?P<datum{}>.*)'.format(i))
- w = w.replace('\x02', '(?P<tijd{}>.*)'.format(i))
- w = w.replace('\x03', '(?P<wat{}>.*)'.format(i))
- w = w.replace('\x04', '(?P<waar{}>.*)'.format(i))
+ w = w.replace('\x01',
+ '(?P<datum{}>.+)'.format(i))
+ w = w.replace('\x02',
+ '(?P<tijd{}>.+)'.format(i))
+ w = w.replace('\x03',
+ '(?P<wat{}>.+)'.format(i))
+ w = w.replace('\x04',
+ '(?P<waar{}>.+)'.format(i))
+ w = re.sub('\s+', '\\s+', w)
r_t.append(w)
for i, w in enumerate(d_s.words()):
- w = w.replace('\x01', '(?P<datum>.*)'.format(i))
- w = w.replace('\x02', '(?P<tijd>.*)'.format(i))
- w = w.replace('\x03', '(?P<wat>.*)'.format(i))
- w = w.replace('\x04', '(?P<waar>.*)'.format(i))
+ w = w.replace('\x01',
+ '(?P<datum{}>.+)'.format(i))
+ w = w.replace('\x02',
+ '(?P<tijd{}>.+)'.format(i))
+ w = w.replace('\x03',
+ '(?P<wat{}>.+)'.format(i))
+ w = w.replace('\x04',
+ '(?P<waar{}>.+)'.format(i))
+ w = re.sub('\s+', '\\s+', w)
r_s.append(w)
- r_t = '' if not r_t else '({})'.format('|'.join(
- reversed(sorted(r_t, key=lambda x: len(x)))))
- r_s = '' if not r_s else '({})'.format('|'.join(
- reversed(sorted(r_s, key=lambda x: len(x)))))
+# r_t = '' if not r_t else '({})'.format('|'.join(
+# reversed(sorted(r_t, key=lambda x: len(x)))))
+# r_s = '' if not r_s else '({})'.format('|'.join(
+# reversed(sorted(r_s, key=lambda x: len(x)))))
return r_t, r_s
+ def to_dot(self, name, out='-'):
+ out = sys.stdout if out == '-' else open(out, 'w')
+ try:
+ q0 = self.entries[name]['titledawg'].q0
+ nodenum = 0
+ final_nodes = []
+ nodes = []
+ edges = []
+ to_visit = [(0, q0)]
+ visited = set()
+ translation = []
+ if q0.final:
+ final_nodes.append(nodenum)
+ else:
+ nodes.append(nodenum)
+
+ nodenum += 1
+ while to_visit:
+ current = to_visit.pop()
+ if not current[0] in visited:
+ visited.add(current[0])
+ for char, child in current[1].children.iteritems():
+ matches = [c for c in translation if c[0] == child]
+ curnum = -1
+ if matches:
+ curnum = matches[-1][1]
+ else:
+ translation.append((child, nodenum))
+ curnum = nodenum
+ nodenum += 1
+ if child.final:
+ final_nodes.append(curnum)
+ else:
+ nodes.append(curnum)
+ edges.append((current[0], char, curnum))
+ to_visit.append((curnum, child))
+ out.write('digraph dawg {\n')
+ out.write('\tnode [shape = doublecircle]; {}\n'.format(
+ ' '.join(str(n) for n in final_nodes)))
+ out.write('\tnode [shape = circle]; {}\n'.format(
+ ' '.join(str(n) for n in nodes)))
+ for fr, ch, to in edges:
+ out.write('\t{} -> {} [label = "{}"];\n'.format(fr, to, ch))
+ out.write('}\n')
+ except:
+ out.close()
+
def test_entry(self, name, title, summary):
+ print '\n', repr(title), repr(summary)
r_t, r_s = self.get_regex(name)
- print r_t, r_s
- rtm = re.search(r_t, title)
- print '\ntrying to match: "{}", "{}"'.format(title, summary)
- print 'matching to: "{}", "{}"'.format(repr(r_t), repr(r_s))
- if rtm and r_t:
- for k, v in [(k, v) for k, v in rtm.groupdict().iteritems() if v]:
- print '{}: {}'.format(k, v)
+ matcht = [re.search(t, title) for t in r_t]
+ matchs = [re.search(t, summary) for t in r_s]
+
+ matcht = filter(lambda x: x is not None, matcht)
+ matchs = filter(lambda x: x is not None, matchs)
+
+ if matcht:
+ pprint.pprint(
+ [m.groupdict() for m in
+ reversed(sorted(matcht, key=lambda x: len(x.groups())))][0])
else:
print 'no title match'
- rsm = re.search(r_s, summary)
- if rsm and r_s:
- for k, v in [(k, v) for k, v in rsm.groupdict().iteritems() if v]:
- print '{}: {}'.format(k, v)
+ if matchs:
+ pprint.pprint(
+ [m.groupdict() for m in
+ reversed(sorted(matchs, key=lambda x: len(x.groups())))][0])
else:
print 'no summary match'
cr.test_entry('dedoelen', 'vr 5 mei08:00 uur - Abba live', '')
cr.test_entry('paradiso',
'donderdag 13 november 2014 19:30 - Wouter Hamel', '')
- cr.test_entry('paradiso',
- 'zaterdag 27 september 2014 20:30 - A Great Big World - '
- 'Locatie: Tolhuistuin (zaal)', '')
- cr.test_entry('paradiso',
- 'zaterdag 31 mei 2014 - Lentekabinet Festival Afterparty - '
- 'Locatie: Tolhuistuin (zaal)', '')
+ cr.test_entry('paradiso', 'zaterdag 27 september 2014 20:30 - A Great Big '
+ 'World - Locatie: Tolhuistuin (zaal)', '')
+ cr.test_entry('paradiso', 'zaterdag 27 september 2014 20:30 - A Great Big '
+ 'World - Locatie: Tolhuistuin (zaal)', '')
+ cr.to_dot('paradiso', 't.dot')
if __name__ == '__main__':
main()
+++ /dev/null
-#!/bin/env python
-# -*- coding: utf-8 -*-
-
-import pydawg
-
-
-def to_dot(filepath, q0):
- nodenum = 0
- final_nodes = []
- nodes = []
- edges = []
- to_visit = [(0, q0)]
- visited = set()
- translation = []
- if q0.final:
- final_nodes.append(nodenum)
- else:
- nodes.append(nodenum)
-
- nodenum += 1
- while to_visit:
- current = to_visit.pop()
- if not current[0] in visited:
- visited.add(current[0])
- for char, child in current[1].children.iteritems():
- matches = [c for c in translation if c[0] == child]
- curnum = -1
- if matches:
- curnum = matches[-1][1]
- else:
- translation.append((child, nodenum))
- curnum = nodenum
- nodenum += 1
- if child.final:
- final_nodes.append(curnum)
- else:
- nodes.append(curnum)
- edges.append((current[0], char, curnum))
- to_visit.append((curnum, child))
- print 'digraph dawg {'
- print '\tnode [shape = doublecircle]; {}'.format(
- ' '.join(str(n) for n in final_nodes))
- print '\tnode [shape = circle]; {}'.format(
- ' '.join(str(n) for n in nodes))
- for fr, ch, to in edges:
- print '\t{} -> {} [label = "{}"];'.format(fr, to, ch)
- print '}'
-
-
-d = pydawg.DAWG()
-
-regs = [
- 'wdag dag maand jaar tijd - wat',
- 'dag maand jaar tijd - wat',
- 'wdag dag maand jaar tijd - wat',
- 'wdag dag maand jaar tijd - wat - Locatie: waar',
- 'wdag dag maand jaar tijd - wat - Locatie: waar']
-
-#regs = [
-# 'maandag 11 augustus 2014 19:30 - Neutral Milk Hotel',
-# 'dinsdag 19 augustus 2014 22:00 - Arkells',
-# 'maandag 24 november 2014 20:30 - Fink',
-# 'woensdag 19 november 2014 20:00 - Michael Schulte',
-# 'zondag 26 oktober 2014 21:00 - The Majority Says - Locatie: Bitterzoet',
-# 'maandag 15 september 2014 20:30 - Ani DiFranco',
-# 'maandag 13 oktober 2014 20:30 - Tarrus Riley',
-# 'maandag 29 december 2014 20:30 - Alain Clark - Locatie: De Duif']
-for w in sorted(set(regs)):
- d.add_word(w)
-
-to_dot('t.dot', d.q0)
http://www.stadsschouwburgendevereeniging.nl/_rss/rss.php?type=voorstellingen
http://www.dedoelen.nl/_rss/rss.php?type=voorstellingen
http://www.parktheater.nl/_rss/rss.php?type=voorstellingen
+http://www.ticketunlimited.nl/ProductFeed/rssproductfeed.xml