re_hdr = re.compile('<th>(?P<h>.*?)</th>')
re_row = re.compile('<tr>(?P<row>.*?)</tr>')
re_cel = re.compile('<td>(?P<c>.*?)</td>')
-re_spa = re.compile('(?P<b><span.*?background-color:(?P<c>.*?);.*?>)(?P<content>.*?)(?P<e></span>)')
+# Raw string literals keep the regex escape `\s` out of Python's own string
+# escape handling; the pattern is split between groups, not inside a group name.
+re_spa = re.compile(r'(?P<b><span.*?background-color:\s*(?P<c>.*?);.*?>)'
+ r'(?P<content>.*?)(?P<e></span>)')
+
+# Maps a span's background color to the label paired with its content span.
+colordict = {'green': 'wat', 'blue': 'waar', 'red': 'wanneer'}
def structure_data(d):
logging.debug('parsing column: {}'.format(column))
markings = list(re_spa.finditer(column))
if markings:
+ # Normalize the captured color before the lookup: the lazy capture can carry
+ # whitespace before the `;` and CSS color keywords are case-insensitive.
+ # NOTE(review): a color that is not a key of colordict still raises KeyError.
+ markings = [(colordict[m.group('c').strip().lower()], m.span('content'))
+ for m in markings]
results.append(markings)
logging.debug('found {} spans in the column'.format(len(markings)))
logging.debug('found {} columns with markings'.format(len(results)))
+ return results
def main():
logging.info('raw data parsed, going to structure data')
structure_data(d)
logging.info('data structured, parsed headers: {}'.format(d['headers']))
- for line in filter(None, d['content']):
- parse_line(line)
+ # Isolate the span objects from the lines and columns
+ lines = filter(None, d['content'])
+ # Parse each line exactly once, then keep only lines with a truthy result.
+ parsed = ((line, parse_line(line)) for line in lines)
+ matches = [(line, result) for line, result in parsed if result]
+ pprint.pprint(matches)
if __name__ == '__main__':