import pickle
import re
import os
-import pprint
import sys
out.close()
def test_entry(self, name, title, summary):
    """Match one entry's title and summary against a crawler's regexes.

    Instead of printing, the findings are collected and returned so the
    caller decides how to present them.

    Args:
        name: Crawler name; passed to ``self.get_regex`` to obtain the
            pair ``(title_patterns, summary_patterns)``.
        title: Title text to search.
        summary: Summary text to search.

    Returns:
        A dict with exactly three keys:

        * ``'url'`` -- list of the non-empty pieces of every ``URL_REG``
          hit in the title, followed by those found in the summary.
        * ``'title'`` -- ``groupdict()`` of the best title match, or ``{}``
          when no title pattern matched.
        * ``'summary'`` -- ``groupdict()`` of the best summary match, or
          ``{}`` when no summary pattern matched.

        "Best" is the match whose pattern defines the most groups; on a
        tie the pattern listed last wins.
    """

    def find_matches(patterns, text):
        # Keep only the patterns that actually matched; ``re.search``
        # returns None otherwise and None has no ``groups()``.
        candidates = (re.search(pattern, text) for pattern in patterns)
        return [m for m in candidates if m is not None]

    def best_groupdict(matches):
        if not matches:
            return {}
        # ``sorted`` is stable, so taking the last element is the same as
        # reversing the sorted list and taking the first one.
        ranked = sorted(matches, key=lambda m: len(m.groups()))
        return ranked[-1].groupdict()

    def collect_urls(text):
        # NOTE(review): assumes URL_REG has several groups so that
        # ``findall`` yields tuples -- confirm against its definition.
        found = []
        for hit in URL_REG.findall(text):
            found.extend(part for part in hit if part)
        return found

    # Get regexes and match. The summary is matched against its own
    # pattern list, mirroring what is done for the title.
    r_t, r_s = self.get_regex(name)
    matcht = find_matches(r_t, title)
    matchs = find_matches(r_s, summary)

    results = {}
    # The list is created up front so that accumulating into it can never
    # raise KeyError; title URLs come first, then summary URLs.
    results['url'] = collect_urls(title) + collect_urls(summary)
    results['title'] = best_groupdict(matcht)
    results['summary'] = best_groupdict(matchs)
    return results
def main():
if len(sys.argv) == 5 and sys.argv[1] == 'test':
- print 'Crawler: {}\nTitle: {}\nSummary: {}'.format(*sys.argv[2:])
+ # print 'Crawler: {}\nTitle: {}\nSummary: {}'.format(*sys.argv[2:])
cr = Crawler()
- cr.test_entry(*sys.argv[2:])
+ print cr.test_entry(*sys.argv[2:])
elif len(sys.argv) == 3 and sys.argv[1] == 'run':
print 'Crawler: {}'.format(sys.argv[2])
cr = Crawler()