(dp0
-S'Paradiso_test1'
+S'test'
p1
(dp2
S'website'
p3
-S'www.test.nl'
+S't'
p4
sS'name'
p5
p7
sS'dloc'
p8
-S'test'
+S'ut'
p9
sS'venue'
p10
-S'p'
+S'Para'
p11
sS'content'
p12
aS'mede-oprichter Broken Social Scene solo'
p24
aa(lp25
-S'<span class="uiWebviewHighlight" style="color: white; background-color: rgb(139, 0, 0);">vrijdag 4 juli 2014</span> <span class="uiWebviewHighlight" style="color: white; background-color: red;">22:00</span> - <span class="uiWebviewHighlight" style="color: white; background-color: green;">Palenke Soultribe</span>'
+S'vrijdag 4 juli 2014 22:00 - Palenke Soultribe'
p26
aS'Electronische muziek en Afro-Colombiaanse ritmes'
p27
(lp48
S'\x01 \x02 - \x03 - Locatie: \x04'
p49
-aS'\x01 \x02 - \x03'
-p50
asS'freq'
-p51
+p50
S'1w'
-p52
+p51
sS'adress'
-p53
-S'adres'
-p54
+p52
+g4
ss.
\ No newline at end of file
self.entries[d['name']] = d
def write(self, path='/var/www/py/crawler.db'):
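+ # Pickle the crawler entries to disk, keeping a .bak copy of the old file and restoring it if the write fails; returns an HTML status string for the web UI.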
+ status = ''
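+ # Copy the entries without the summarydawg_t/titledawg_t objects, so only plain data is pickled.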
entries2 = {kk: {k: v for k, v in vv.iteritems()
if k not in ['summarydawg_t', 'titledawg_t']}
for kk, vv in self.entries.iteritems()}
if os.path.exists(path):
os.rename(path, '{}.bak'.format(path))
+ status += 'Old crawler file found, created backup<br />'
try:
with open(path, 'wb') as f:
f.write(pickle.dumps(entries2))
+ status += 'Crawler written successfully<br />'
except Exception, e:
# print 'something went wrong writing: {}'.format(e)
# print 'restoring backup'
- raise e
+ status += 'Something went wrong: {}<br />'.format(e)
os.rename('{}.bak'.format(path), path)
+ status += 'Writing failed, restored backup<br />'
finally:
if os.path.exists('{}.bak'.format(path)):
os.remove('{}.bak'.format(path))
+ status += 'Backup file removed<br />'
+ return status
def get_regex(self, name):
d_t = self.entries[name]['titledawg_t']
if matchs:
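+ # Prefer the match that captured the most groups; its groupdict becomes the summary result.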
matches = sorted(matchs, key=lambda x: len(x.groups()))
results['summary'] = list(reversed(matches))[0].groupdict()
- return dict(results['summary'].items() + results['title'].items())
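+ # Merge the title and summary captures, stripping digits from the keys (presumably suffixes used to keep duplicate group names unique).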
+ outputdct = dict(results['summary'].items() + results['title'].items())
+ return {re.sub('\d', '', k): v for k, v in outputdct.iteritems()}
def has_to_run(self, interval, last_run, now):
time_wait = sum(
cr = crawler.Crawler('/var/www/py/crawler.db')
ns = cr.list_names()
params = {
- 'active_crawlers':
- '\n'.join('<a href="./crawler_edit.py?url={0}">{0}</a><br>'.
- format(a) for a in ns),
'active_crawlers_dropdown':
'\n'.join('<option value={0}>{0}</option>'.format(a) for a in ns)
}
def crawler_edit(req, args, apok):
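+ # mod_python handler behind the index page's edit form: 'remove' deletes the named entry and rewrites the database; 'edit' currently only echoes the name.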
+ args['name'] = args.get('name', '')
+ args['action'] = args.get('action', '')
+ req.log_error('handler')
+ req.content_type = 'text/html'
+ req.send_http_header()
+ if args['action'] == 'remove':
+ req.write('Remove {}<br />'.format(args['name']))
+ cr = crawler.Crawler()
+ status = ''
+ try:
+ del cr.entries[args['name']]
+ status = 'Success...<br />'
+ status += cr.write()
+ except KeyError:
+ status = 'Name not in the crawler<br />'
+ except Exception, e:
+ status = 'Other exception thrown: {}<br />'.format(e)
+ req.write(status)
+ elif args['action'] == 'edit':
+ req.write('Edit {}\n'.format(args['name']))
+ else:
+ req.write('Unknown editing action: {}'.format(args['action']))
+ req.write('<a href="index.py">Go back</a>')
return apok
def crawler_test(req, args, apok):
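+ # Handler for the test form: runs cr.test_entry() for the given name against the submitted title/summary and lists the resulting fields.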
+ req.log_error('handler')
+ req.content_type = 'text/html'
+ req.send_http_header()
+ cr = crawler.Crawler(init=True)
+ if 'name' not in args or str(args['name']) not in cr.entries:
+ req.write('Name not in the crawler...<br/>')
+ else:
+ args['summary'] = args.get('summary', '')
+ args['title'] = args.get('title', '')
+ response = cr.test_entry(str(args['name']), str(args['title']),
+ str(args['summary']))
+ req.write('<b>Title:</b> {}<br/><b>Summary: </b>{}<br/>'.format(
+ str(args['title']), str(args['summary'])))
+ req.write('<br/><b>Results:</b><br/>')
+ req.write('<br/>'.join('{}: {}'.format(k, v)
+ for k, v in sorted(response.iteritems())))
+ req.write('<br/><br/><a href="index.py">Go back</a>')
return apok
<body>
<table border=1>
<tr>
- <td>Inspect/edit crawler</td>
+ <td>Edit/Remove crawler</td>
<td>Add new crawler</td>
<td>Test crawler</td>
</tr>
<tr>
<td>
- {active_crawlers}
+ <form method="get" action="./crawler_edit.py">
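+ <!-- the selected name and action are submitted to crawler_edit.py, which performs the remove/edit -->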
+ <table>
+ <tr><td>
+ <select name="name">
+ {active_crawlers_dropdown}
+ </select>
+ </td></tr><tr><td>
+ <select name="action">
+ <option value="remove">Remove</option>
+ <option value="edit">Edit</option>
+ </select>
+ </td></tr><tr><td>
+ <input type="submit" value="Submit">
+ </td></tr>
+ </table>
+ </form>
</td>
<td>
<form method="get" action="./input_app.py">