URL_REG = re.compile(
- ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
+ ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<'
+ ur'>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+'
+ ur'\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
class Crawler():
def main():
    """Command-line entry point; dispatches on the sub-command in sys.argv[1].

    Supported invocations::

        <script> test crawlername title summary
        <script> run crawlername

    ``test`` echoes the three arguments and passes them to
    ``Crawler.test_entry``.  ``run`` echoes the crawler name and constructs a
    ``Crawler``.  Any other argument list prints the two usage lines above.
    """
    argv = sys.argv
    if len(argv) == 5 and argv[1] == 'test':
        entry = argv[2:]  # crawlername, title, summary - in that order
        print('Crawler: {}\nTitle: {}\nSummary: {}'.format(*entry))
        cr = Crawler()
        cr.test_entry(*entry)
    elif len(argv) == 3 and argv[1] == 'run':
        print('Crawler: {}'.format(argv[2]))
        # NOTE(review): the crawler is only constructed here; nothing visible
        # in this function starts a crawl for argv[2] - confirm whether a
        # call is still missing.
        Crawler()
    else:
        # Unknown sub-command or wrong number of arguments: show usage.
        print('{} test crawlername title summary'.format(argv[0]))
        print('{} run crawlername'.format(argv[0]))


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()