# Excerpt of program/everything/crawler.py as changed by the commit
# "Run, test and list options in crawler" (Mart Lubbers, Sun, 7 Sep 2014,
# cfb2b7c261c76000b2f949aee46d7751eb88bf53; todo from the commit message:
# edit dictionary values gui).
#
# Only the definitions that are completely visible in that change are
# reproduced below.  URL_REG, REPL, Crawler.__init__, Crawler.add_entry and
# Crawler.test_entry exist in the real module but are only partially shown
# by the change, so they are deliberately left out rather than guessed at.
import pickle
import re
import os
import sys
import time

# Number of seconds per interval unit, keyed by the one-letter suffix used in
# an entry's frequency string (e.g. '1d12h').  'm' keeps meaning "minute",
# which is the value that actually took effect before; the month unit, which
# used to be shadowed by the duplicate 'm' key, is now the distinct key 'M'.
TIMES = {
    'M': 2628000,  # average month (365 days / 12)
    'w': 604800,
    'd': 86400,
    'h': 3600,
    'm': 60,
    's': 1,
}

# Matches one "<digits><unit>" component of a frequency string.
REGEX_INT = re.compile(r'\d+[{}]'.format(''.join(sorted(TIMES))))


class Crawler():
    """Collection of crawler entries (excerpt).

    NOTE: this excerpt contains only the methods that are fully visible in
    the change; the constructor that fills ``self.entries`` and the
    ``add_entry``/``test_entry`` methods used elsewhere are not shown here.
    """

    def list_names(self):
        """Return the names of all entries, formatted as a list literal."""
        # list() renders identically to Python 2's dict.keys() and avoids the
        # "dict_keys([...])" wrapper on newer interpreters.
        return str(list(self.entries))

    def has_to_run(self, interval, last_run, now):
        """Tell whether an entry is due to run again.

        interval -- frequency string built from "<digits><unit>" parts, with
                    units taken from TIMES (for example '1d', '1h30m').
        last_run -- timestamp (seconds) of the previous run.
        now      -- current timestamp (seconds).

        Returns True when at least ``interval`` has elapsed since
        ``last_run``.  Raises ValueError when the interval adds up to zero
        seconds, which is also what happens when nothing in it can be parsed.
        """
        time_wait = sum(
            TIMES[part[-1]] * int(part[:-1])
            for part in REGEX_INT.findall(interval))
        if time_wait == 0:
            raise ValueError('interval is either 0 or unparsable')
        return now - last_run >= time_wait

    def run_entry(self, name):
        """Run the entry called ``name`` (stub: only announces the run)."""
        print('force run {}'.format(name))


def _usage(prog):
    """Return the multi-line command-line help text for ``prog``."""
    # The literal braces are doubled so str.format() does not treat
    # "{item1 item2 ...|all}" as a replacement field.
    return '\n'.join([
        '{0} list',
        '{0} test crawlername title summary',
        '{0} run [-f] {{item1 item2 ...|all}}',
    ]).format(prog)


def main():
    """Command-line entry point.

    Supported invocations (see _usage):
      list                          print the entry names
      test crawlername title summary
                                    print the result of Crawler.test_entry
      run [-f] [names...|all]       run the named entries, or all of them when
                                    no name or 'all' is given; -f ignores the
                                    entry's interval
    Anything else prints the usage text.
    """
    argv = sys.argv
    if len(argv) == 5 and argv[1] == 'test':
        cr = Crawler()
        print(cr.test_entry(*argv[2:]))
    elif len(argv) >= 2 and argv[1] == 'run':
        args = argv[2:]
        force = '-f' in args
        # Separate the flag from the entry names so that "run -f" behaves
        # like "run" (everything) instead of silently selecting nothing.
        names = [arg for arg in args if arg != '-f']
        cr = Crawler()
        if not names or 'all' in names:
            to_run = list(cr.entries)
        else:
            to_run = []
            for name in names:
                if name in cr.entries:
                    to_run.append(name)
                else:
                    print('"{}" not in the entries.'.format(name))
        for name in to_run:
            data = cr.entries[name]
            now = time.time()
            if force or 'last_run' not in data or \
                    cr.has_to_run(data['freq'], data['last_run'], now):
                # NOTE(review): last_run is only updated in memory here;
                # nothing visible in this excerpt writes it back to the
                # database file -- confirm that it is persisted elsewhere.
                data['last_run'] = now
                cr.run_entry(name)
            else:
                print('Skipping because last run was within interval')
    elif len(argv) == 2 and argv[1] == 'list':
        cr = Crawler()
        print(cr.list_names())
    else:
        print(_usage(argv[0]))


if __name__ == '__main__':
    main()