From: Mart Lubbers Date: Tue, 2 Sep 2014 13:15:36 +0000 (+0200) Subject: new X-Git-Url: https://git.martlubbers.net/?a=commitdiff_plain;h=3362aa39198494e289e9c1b5c97541055a27e1ee;p=bsc-thesis1415.git new --- diff --git a/program/everything/crawler.py b/program/everything/crawler.py index 13c6b08..767fb80 100644 --- a/program/everything/crawler.py +++ b/program/everything/crawler.py @@ -4,7 +4,6 @@ import pickle import re import os -import pprint import sys @@ -112,7 +111,7 @@ class Crawler(): out.close() def test_entry(self, name, title, summary): - print '\n', repr(title), repr(summary) + results = {} # Get regexes and match r_t, r_s = self.get_regex(name) matcht = [re.search(t, title) for t in r_t] @@ -123,34 +122,32 @@ class Crawler(): matchs = filter(lambda x: x is not None, matchs) # Title urls - print 'urls:' - for i, u in enumerate(URL_REG.findall(title), 1): - print '{}: {}'.format(i, filter(None, u)) + for u in URL_REG.findall(title): + results['url'] += list(filter(None, u)) + results['url'] = URL_REG.findall(title) + # Title best match + results['title'] = {} if matcht: - pprint.pprint( - [m.groupdict() for m in - reversed(sorted(matcht, key=lambda x: len(x.groups())))][0]) - else: - print 'no title match' + matches = sorted(matcht, key=lambda x: len(x.groups())) + results['title'] = list(reversed(matches))[0].groupdict() + # Summary urls - print 'urls:' - for i, u in enumerate(URL_REG.findall(summary), 1): - print '{}: {}'.format(i, filter(None, u)) + for u in URL_REG.findall(summary): + results['url'] += list(filter(None, u)) # Summary best match + results['summary'] = {} if matchs: - pprint.pprint( - [m.groupdict() for m in - reversed(sorted(matchs, key=lambda x: len(x.groups())))][0]) - else: - print 'no summary match' + matches = sorted(matchs, key=lambda x: len(x.groups())) + results['summary'] = list(reversed(matches))[0].groupdict() + return results def main(): if len(sys.argv) == 5 and sys.argv[1] == 'test': - print 'Crawler: {}\nTitle: {}\nSummary: {}'.format(*sys.argv[2:]) + # print 'Crawler: {}\nTitle: {}\nSummary: {}'.format(*sys.argv[2:]) cr = Crawler() - cr.test_entry(*sys.argv[2:]) + print cr.test_entry(*sys.argv[2:]) elif len(sys.argv) == 3 and sys.argv[1] == 'run': print 'Crawler: {}'.format(sys.argv[2]) cr = Crawler()