-#!/bin/env python
# -*- coding: utf-8 -*-
-import pprint
-import json
+
import ast
+import logging
+import pprint
import re
# Pre-compiled patterns used to pull apart the HTML table markup.
# They only match attribute-less <th>/<tr>/<td> tags whose content sits on a
# single line ('.' does not match newlines).
re_hdr = re.compile(r'<th>(?P<h>.*?)</th>')
re_row = re.compile(r'<tr>(?P<row>.*?)</tr>')
re_cel = re.compile(r'<td>(?P<c>.*?)</td>')
# A <span> whose opening tag carries a background-color declaration:
#   b -> the opening tag, c -> the colour value, e -> the closing tag.
# The tag interior is matched with [^>]* instead of a greedy '.*' so that
# several highlighted spans inside one cell are reported as separate matches
# and 'b' never runs past the end of the opening tag.
re_spa = re.compile(
    r'(?P<b><span[^>]*background-color:(?P<c>[^;>]*);[^>]*>)'
    r'.*?(?P<e></span>)'
)
+
+
def structure_data(d):
    """Split the raw table markup in ``d['content']`` into headers and rows.

    ``d`` is modified in place and nothing is returned:

    * ``d['headers']`` receives the text of every ``<th>`` cell found by the
      module-level ``re_hdr`` pattern.
    * ``d['content']`` is replaced by a list with one entry per ``<tr>`` row
      (``re_row``); each entry is the list of that row's ``<td>`` texts
      (``re_cel``). A row without ``<td>`` cells yields an empty list.
    """
    markup = d['content']
    d['headers'] = [m.group('h') for m in re_hdr.finditer(markup)]
    logging.debug('headers parsed: %s', d['headers'])

    rows = d['content'] = []
    for line in re_row.finditer(markup):
        logging.debug('going to parse: %s', line.groupdict())
        rows.append([c.group('c') for c in re_cel.finditer(line.group('row'))])
        logging.debug('parsed into: %s', rows[-1])
    logging.debug('structured into: %s', d)
+
+
def parse_line(line):
    """Scan every cell of one table row for highlighted ``<span>`` markings.

    ``line`` is an iterable of cell strings. Each cell is searched with the
    module-level ``re_spa`` pattern and the resulting list of match objects
    is printed to stdout. Nothing is returned.
    """
    logging.debug('parsing line: %s', line)
    for column in line:
        logging.debug('parsing column: %s', column)
        markings = list(re_spa.finditer(column))
        # print() with a single argument behaves the same on Python 2 and 3,
        # unlike the bare ``print markings`` statement.
        print(markings)
+
+
def main():
    """Load the raw dump, turn it into a dict and parse its table rows.

    Only the first line of ``../output/raw_out.txt`` is read. The
    ``[Field('...', `` wrappers and their closing ``)]`` are stripped with
    regular expressions so the line can be evaluated as a Python literal by
    ``ast.literal_eval``. The result is handed to ``structure_data`` and
    every non-empty row is then passed to ``parse_line``.
    """
    logging.debug('loading file')
    with open('../output/raw_out.txt', 'r') as data:
        logging.info('raw data loaded, going to parse data')
        d = data.readline()
    logging.debug('loaded into: %s', d)

    # Raw strings keep the regex escapes (\[, \(, \), \]) from being treated
    # as (invalid) string escape sequences. The substitutions run in the same
    # order as the original nested calls: innermost first.
    d = re.sub(r"\[Field\('.*?', ", '', d)
    d = re.sub(r'\)\],', ',', d)
    d = re.sub(r'\)\]}$', '}', d)
    logging.debug('converted to: %s', d)

    d = ast.literal_eval(d)
    logging.debug('parsed into: %s', d)

    logging.info('raw data parsed, going to structure data')
    structure_data(d)
    logging.info('data structured, parsed headers: %s', d['headers'])

    # filter(None, ...) skips rows that produced no cells.
    for line in filter(None, d['content']):
        parse_line(line)
+
# Script entry point: log at INFO level, then run the parser.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()
feed = feedparser.parse(url)
channel = feed.feed
req.write('\t<table id="content-table" border="1" id="htab">\n')
- req.write('\t\t<tr><th>Title</th><th>Summary</th><th>...</th></tr>\n')
+ req.write('\t\t<tr><th>Title</th><th>Summary</th></tr>\n')
for i in feed.entries:
- req.write('\t\t<tr><td>{}</td><td>{}</td><td>...</td></tr>\n'.
+ req.write('\t\t<tr><td>{}</td><td>{}</td></tr>\n'.
format(i['title'].encode('ascii', 'xmlcharrefreplace'),
i['summary'].encode('ascii', 'xmlcharrefreplace')))
req.write('\t</table>\n<br />')