import re
import os
import sys
+import time
URL_REG = re.compile(
('\x03', '(?P<wat{}>.+)'),
('\x04', '(?P<waar{}>.+)')]
# Number of seconds represented by each interval-unit suffix.
# Month and minute need distinct keys: a dict literal with two 'm' keys
# silently keeps only the last one.  'm' stays "minute" (the value it
# effectively had before) and the month is exposed as 'M'.
TIMES = {
    'M': 2628000,  # month: one twelfth of a 365-day year
    'w': 604800,   # week
    'd': 86400,    # day
    'h': 3600,     # hour
    'm': 60,       # minute
    's': 1,        # second
}

# Matches a single "<digits><unit>" token such as "15m" or "2w"; the unit
# character class is built from the keys of TIMES.
REGEX_INT = re.compile(r'\d+[{}]'.format(''.join(TIMES)))
+
class Crawler():
def __init__(self, dbfile='./crawler.db'):
with open(dbfile, 'rb') as f:
self.entries = pickle.loads(f.read())
+ def list_names(self):
+ return str(self.entries.keys())
+
def add_entry(self, d):
if d['name'] in self.entries:
print 'content already present... skipping'
results['summary'] = list(reversed(matches))[0].groupdict()
return results
def has_to_run(self, interval, last_run, now):
    """Tell whether ``interval`` has elapsed between ``last_run`` and ``now``.

    ``interval`` is scanned for ``<digits><unit>`` tokens (as matched by
    ``REGEX_INT``); each token contributes ``digits * TIMES[unit]`` and
    the contributions are summed.  Text that matches no token is
    ignored.

    :param interval: interval description, e.g. ``'1d12h'``.
    :param last_run: time of the previous run, in the same unit as
        ``now``.
    :param now: current time.
    :returns: ``True`` when ``now - last_run`` is at least the summed
        interval.
    :raises ValueError: when the summed interval is 0, i.e. the string
        contains no token or only zero amounts.
    """
    time_wait = 0
    for token in REGEX_INT.findall(interval):
        # The last character is the unit, everything before it the amount.
        amount, unit = int(token[:-1]), token[-1]
        time_wait += TIMES[unit] * amount
    if time_wait == 0:
        # ValueError is still caught by callers using "except Exception".
        raise ValueError('interval is either 0 or unparsable')
    return now - last_run >= time_wait
+
def run_entry(self, name):
    """Placeholder for running an entry: only prints a notice.

    :param name: name of the entry, echoed after the text 'force run'.
    """
    # The single-argument call form prints the same text as the
    # two-item print statement: both parts separated by one space.
    print('%s %s' % ('force run', name))
+
def main():
    """Command-line entry point; dispatches on ``sys.argv``.

    Supported invocations:

    * ``test crawlername title summary`` -- print the result of
      ``test_entry`` called on a ``Crawler`` with the three arguments.
    * ``run [-f] [name ...|all]`` -- run the named entries, or every
      entry when no name or ``all`` is given.  An entry that has a
      ``last_run`` value is skipped unless ``has_to_run`` reports that
      its ``freq`` interval has elapsed; ``-f`` disables that check.
    * ``list`` -- print the result of ``list_names``.

    Any other argument list prints a usage summary.
    """
    argv = sys.argv
    if len(argv) == 5 and argv[1] == 'test':
        cr = Crawler()
        print(cr.test_entry(*argv[2:]))
    elif len(argv) >= 2 and argv[1] == 'run':
        args = argv[2:]
        force = '-f' in args
        # Separate the flag from the entry names so that "run -f" alone
        # selects every entry, exactly like a bare "run" does.
        names = [arg for arg in args if arg != '-f']
        cr = Crawler()
        if not names or 'all' in names:
            to_run = list(cr.entries)
        else:
            to_run = []
            for name in names:
                if name in cr.entries:
                    to_run.append(name)
                else:
                    print('"{}" not in the entries.'.format(name))
        for name in to_run:
            data = cr.entries[name]
            # Read the clock once so the check and the stored timestamp
            # agree.
            now = time.time()
            if force or 'last_run' not in data or \
                    cr.has_to_run(data['freq'], data['last_run'], now):
                # NOTE(review): this only updates the in-memory entry;
                # nothing visible here writes it back to the db file.
                data['last_run'] = now
                cr.run_entry(name)
            else:
                print('Skipping because last run was within interval')
    elif len(argv) == 2 and argv[1] == 'list':
        cr = Crawler()
        print(cr.list_names())
    else:
        # Literal braces are doubled, otherwise str.format treats
        # "{item1 ...}" as a replacement field and raises KeyError.
        usage = ('{0} list\n'
                 '{0} test crawlername title summary\n'
                 '{0} run -f {{item1 item2 ...|all}}')
        print(usage.format(argv[0]))
if __name__ == '__main__':
    # Only dispatch on the command line when executed as a script, not
    # when the module is imported.
    main()