--- /dev/null
+(dp0
+S'para'
+p1
+(dp2
+S'website'
+p3
+S'www.test.com'
+p4
+sS'name'
+p5
+g1
+sS'titledawg'
+p6
+(ipydawg
+DAWG
+p7
+(dp8
+S'q0'
+p9
+(ipydawg
+DAWGNode
+p10
+(dp11
+S'number'
+p12
+NsS'children'
+p13
+(dp14
+S'\x01'
+p15
+(ipydawg
+DAWGNode
+p16
+(dp17
+g12
+Nsg13
+(dp18
+S' '
+p19
+(ipydawg
+DAWGNode
+p20
+(dp21
+g12
+Nsg13
+(dp22
+S'\x02'
+p23
+(ipydawg
+DAWGNode
+p24
+(dp25
+g12
+Nsg13
+(dp26
+g19
+(ipydawg
+DAWGNode
+p27
+(dp28
+g12
+Nsg13
+(dp29
+S'-'
+p30
+(ipydawg
+DAWGNode
+p31
+(dp32
+g12
+Nsg13
+(dp33
+g19
+(ipydawg
+DAWGNode
+p34
+(dp35
+g12
+Nsg13
+(dp36
+S'\x03'
+p37
+(ipydawg
+DAWGNode
+p38
+(dp39
+g12
+Nsg13
+(dp40
+g19
+(ipydawg
+DAWGNode
+p41
+(dp42
+g12
+Nsg13
+(dp43
+g19
+(ipydawg
+DAWGNode
+p44
+(dp45
+g12
+Nsg13
+(dp46
+g19
+(ipydawg
+DAWGNode
+p47
+(dp48
+g12
+Nsg13
+(dp49
+g30
+(ipydawg
+DAWGNode
+p50
+(dp51
+g12
+Nsg13
+(dp52
+g19
+(ipydawg
+DAWGNode
+p53
+(dp54
+g12
+Nsg13
+(dp55
+g19
+(ipydawg
+DAWGNode
+p56
+(dp57
+g12
+Nsg13
+(dp58
+g19
+(ipydawg
+DAWGNode
+p59
+(dp60
+g12
+Nsg13
+(dp61
+S'L'
+p62
+(ipydawg
+DAWGNode
+p63
+(dp64
+g12
+Nsg13
+(dp65
+S'o'
+p66
+(ipydawg
+DAWGNode
+p67
+(dp68
+g12
+Nsg13
+(dp69
+S'c'
+p70
+(ipydawg
+DAWGNode
+p71
+(dp72
+g12
+Nsg13
+(dp73
+S'a'
+p74
+(ipydawg
+DAWGNode
+p75
+(dp76
+g12
+Nsg13
+(dp77
+S't'
+p78
+(ipydawg
+DAWGNode
+p79
+(dp80
+g12
+Nsg13
+(dp81
+S'i'
+p82
+(ipydawg
+DAWGNode
+p83
+(dp84
+g12
+Nsg13
+(dp85
+S'e'
+p86
+(ipydawg
+DAWGNode
+p87
+(dp88
+g12
+Nsg13
+(dp89
+S':'
+p90
+(ipydawg
+DAWGNode
+p91
+(dp92
+g12
+Nsg13
+(dp93
+g19
+(ipydawg
+DAWGNode
+p94
+(dp95
+g12
+Nsg13
+(dp96
+S'\x04'
+p97
+(ipydawg
+DAWGNode
+p98
+(dp99
+g12
+Nsg13
+(dp100
+sS'final'
+p101
+I01
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbsg30
+g56
+ssg101
+I00
+sbssg101
+I00
+sbsg97
+(ipydawg
+DAWGNode
+p102
+(dp103
+g12
+Nsg13
+(dp104
+sg101
+I01
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbssg101
+I00
+sbsS'_numbers_valid'
+p105
+I00
+sS'register'
+p106
+c__builtin__
+set
+p107
+((lp108
+g38
+ag98
+ag71
+ag59
+ag47
+ag75
+ag53
+ag44
+ag79
+ag83
+ag56
+ag41
+ag87
+ag50
+ag63
+ag91
+ag67
+ag94
+atp109
+Rp110
+sS'wp'
+p111
+S'\x01 \x02 - \x04'
+p112
+sbsS'dloc'
+p113
+S'grote zaal'
+p114
+sS'venue'
+p115
+S'paradiso'
+p116
+sS'content'
+p117
+(lp118
+(lp119
+S'zaterdag 31 mei 2014 - Lentekabinet Festival Afterparty - Locatie: Tolhuistuin (zaal)'
+p120
+aS''
+p121
+aa(lp122
+S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">vrijdag 4 juli 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">20:30</span> - <span class="uiWebviewHighlight" style="color: white; background-color: green;">The Crimson Projekct</span> - Locatie: <span class="uiWebviewHighlight" style="color: white; background-color: blue;">Tolhuistuin (zaal)</span>'
+p123
+aS'Muziek rond King Crimson'
+p124
+aa(lp125
+S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">dinsdag 10 juni 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">20:30</span> - <span class="uiWebviewHighlight" style="color: white; background-color: blue;">Het Ultieme Natuurkunde Feestje \xe2\x80\x93 keynote Amanda Gefter</span>'
+p126
+ag121
+aa(lp127
+S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">dinsdag 12 augustus 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">21:00</span> - <span class="uiWebviewHighlight" style="color: white; background-color: green;">Kevin Drew</span> - Locatie: <span class="uiWebviewHighlight" style="color: white; background-color: blue;">Bitterzoet</span>'
+p128
+aS'mede-oprichter Broken Social Scene solo'
+p129
+aa(lp130
+S'vrijdag 4 juli 2014 22:00 - Palenke Soultribe'
+p131
+aS'Electronische muziek en Afro-Colombiaanse ritmes'
+p132
+aa(lp133
+S'maandag 3 november 2014 20:15 - Eefje de Visser: Waterwereldsteden - Locatie: Het Concertgebouw'
+p134
+ag121
+aa(lp135
+S'zaterdag 27 september 2014 20:30 - A Great Big World - Locatie: Tolhuistuin (zaal)'
+p136
+aS'Hitschrijvers uit New York'
+p137
+aa(lp138
+S'zaterdag 7 juni 2014 23:00 - Benefietavond Marokkaanse Boot'
+p139
+aS'Van Amsterdam naar Tanger'
+p140
+aa(lp141
+S'donderdag 13 november 2014 19:30 - Wouter Hamel'
+p142
+aS'Sprankelende jazzy pop'
+p143
+aa(lp144
+S'vrijdag 13 juni 2014 00:00 - Legends'
+p145
+ag121
+aasS'headers'
+p146
+(lp147
+S'Title'
+p148
+aS'Summary'
+p149
+asS'summarydawg'
+p150
+(ipydawg
+DAWG
+p151
+(dp152
+g9
+(ipydawg
+DAWGNode
+p153
+(dp154
+g12
+Nsg13
+(dp155
+sg101
+I00
+sbsg105
+I00
+sg106
+g107
+((lp156
+tp157
+Rp158
+sg111
+g121
+sbsS'freq'
+p159
+S'1w'
+p160
+sS'adress'
+p161
+S'test'
+p162
+ss.
\ No newline at end of file
URL_REG = re.compile(
- ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
+ ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<'
+ ur'>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+'
+ ur'\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
+
+REPL = [
+ ('\x01', '(?P<datum{}>.+)'),
+ ('\x02', '(?P<tijd{}>.+)'),
+ ('\x03', '(?P<wat{}>.+)'),
+ ('\x04', '(?P<waar{}>.+)')]
class Crawler():
d_s = self.entries[name]['summarydawg']
r_t, r_s = [], []
for i, w in enumerate(d_t.words()):
- w = w.replace('\x01',
- '(?P<datum{}>.+)'.format(i))
- w = w.replace('\x02',
- '(?P<tijd{}>.+)'.format(i))
- w = w.replace('\x03',
- '(?P<wat{}>.+)'.format(i))
- w = w.replace('\x04',
- '(?P<waar{}>.+)'.format(i))
+ w = reduce(lambda x, y: x.replace(y[0], y[1].format(i)), REPL, w)
w = re.sub('\s+', '\\s+', w)
r_t.append(w)
for i, w in enumerate(d_s.words()):
- w = w.replace('\x01',
- '(?P<datum{}>.+)'.format(i))
- w = w.replace('\x02',
- '(?P<tijd{}>.+)'.format(i))
- w = w.replace('\x03',
- '(?P<wat{}>.+)'.format(i))
- w = w.replace('\x04',
- '(?P<waar{}>.+)'.format(i))
+ w = reduce(lambda x, y: x.replace(y[0], y[1].format(i)), REPL, w)
w = re.sub('\s+', '\\s+', w)
r_s.append(w)
return r_t, r_s
def test_entry(self, name, title, summary):
print '\n', repr(title), repr(summary)
- #Get regexes and match
+ # Get regexes and match
r_t, r_s = self.get_regex(name)
matcht = [re.search(t, title) for t in r_t]
matchs = [re.search(t, summary) for t in r_s]
- #Remove all the empty matches
+ # Remove all the empty matches
matcht = filter(lambda x: x is not None, matcht)
matchs = filter(lambda x: x is not None, matchs)
- #Title urls
+ # Title urls
print 'urls:'
for i, u in enumerate(URL_REG.findall(title), 1):
- print '{}: {}'.format(i, u)
- #Title best match
+ print '{}: {}'.format(i, filter(None, u))
+ # Title best match
if matcht:
pprint.pprint(
[m.groupdict() for m in
reversed(sorted(matcht, key=lambda x: len(x.groups())))][0])
else:
print 'no title match'
- #Summary urls
+ # Summary urls
print 'urls:'
for i, u in enumerate(URL_REG.findall(summary), 1):
- print '{}: {}'.format(i, u)
- #Summary best match
+ print '{}: {}'.format(i, filter(None, u))
+ # Summary best match
if matchs:
pprint.pprint(
[m.groupdict() for m in
def main():
cr = Crawler()
- cr.test_entry('dedoelen', 'vr 5 mei, 08:00 uur - Abba live', '')
- cr.test_entry('dedoelen', 'vr 5 mei08:00 uur - Abba live', '')
- cr.test_entry('paradiso',
+ # cr.test_entry('dedoelen', 'vr 5 mei, 08:00 uur - Abba live', '')
+ # cr.test_entry('dedoelen', 'vr 5 mei08:00 uur - Abba live', '')
+ print cr.get_regex('para')
+ cr.test_entry('para',
'donderdag 13 november 2014 19:30 - Wouter Hamel', '')
- cr.test_entry('paradiso', 'zaterdag 27 september 2014 20:30 - A Great Big '
+ cr.test_entry('para', 'zaterdag 27 september 2014 20:30 - A Great Big '
'World - Locatie: Tolhuistuin (zaal)', '')
- cr.test_entry('paradiso', 'zaterdag 27 september 2014 20:30 - A Great Big '
- 'World - Locatie: Tolhuistuin (zaal)', '')
- cr.to_dot('paradiso', 't.dot')
+ cr.test_entry('para', 'zaterdag 27 september 2014 20:30 - A Great Big '
+ 'World - Locatie: Tolhuistuin (zaal)',
+ 'http://wwww.test.nl')
+ cr.to_dot('para', 't.dot')
if __name__ == '__main__':
main()
--- /dev/null
+digraph dawg {
+ node [shape = doublecircle]; 8 22
+ node [shape = circle]; 0 1 2 3 4 5 6 7 9 10 11 12 13 14 15 16 17 18 19 20 21 23 24 25 11
+ 0 -> 1 [label = "\ 1"];
+ 1 -> 2 [label = " "];
+ 2 -> 3 [label = "\ 2"];
+ 3 -> 4 [label = " "];
+ 4 -> 5 [label = "-"];
+ 5 -> 6 [label = " "];
+ 6 -> 7 [label = "\ 3"];
+ 6 -> 8 [label = "\ 4"];
+ 7 -> 9 [label = " "];
+ 9 -> 10 [label = " "];
+ 9 -> 11 [label = "-"];
+ 11 -> 12 [label = " "];
+ 12 -> 13 [label = "L"];
+ 13 -> 14 [label = "o"];
+ 14 -> 15 [label = "c"];
+ 15 -> 16 [label = "a"];
+ 16 -> 17 [label = "t"];
+ 17 -> 18 [label = "i"];
+ 18 -> 19 [label = "e"];
+ 19 -> 20 [label = ":"];
+ 20 -> 21 [label = " "];
+ 21 -> 22 [label = "\ 4"];
+ 10 -> 23 [label = " "];
+ 23 -> 24 [label = "-"];
+ 24 -> 25 [label = " "];
+ 25 -> 11 [label = " "];
+}
In practice this means that the end product is a software solution which does
the previously described tasks.
-
\section{Scientific relevance}
Currently the techniques for conversion from unstructured data to structured
data are static and mainly only usable by computer science experts. There is a
al.\cite{Roelofs2009}. The techniques described by Roelofs et al. are more
focussed on extracting data from already isolated data so it can be an addition
to the current project.
+
+\section{Why RSS}
+Web sites often change in overall structure, and the way the data is presented
+also changes a lot because of new insights and layout changes. RSS feeds, on
+the other hand, are often generated from the internal database of the venue's
+site and are therefore almost always very precise, structured and consistent.
+When the structure of an RSS feed changes, it is mostly because the content
+management system of the website changes.\\
+Because RSS does not have a structural dimension compared to websites, there
+is less information available. RSS feeds are basically raw strings which
+contain all the information. Sometimes venues choose to put HTML in the RSS
+feeds, but most of the time this is only done to display big chunks of
+unstructured text in a nicer fashion.