From: Mart Lubbers Date: Mon, 2 Jun 2014 14:06:06 +0000 (+0200) Subject: euao X-Git-Url: https://git.martlubbers.net/?a=commitdiff_plain;h=65c4edb0769d9f9c78730f230a5ba7287c59a344;p=bsc-thesis1415.git euao --- diff --git a/program/hypconvert/hypconvert.py b/program/hypconvert/hypconvert.py index 34548ff..b93f6fd 100644 --- a/program/hypconvert/hypconvert.py +++ b/program/hypconvert/hypconvert.py @@ -8,7 +8,10 @@ import re re_hdr = re.compile('(?P.*?)') re_row = re.compile('(?P.*?)') re_cel = re.compile('(?P.*?)') -re_spa = re.compile('(?P.*?);.*?>)(?P.*?)(?P)') +re_spa = re.compile('(?P.*?);.*?>)(?P.*?)(?P)') + +colordict = {'green': 'wat', 'blue': 'waar', 'red': 'wanneer'} def structure_data(d): @@ -31,9 +34,12 @@ def parse_line(line): logging.debug('parsing column: {}'.format(column)) markings = list(re_spa.finditer(column)) if markings: + markings = [(colordict[m.group('c')], m.span('content')) + for m in markings] results.append(markings) logging.debug('found {} spans in the column'.format(len(markings))) logging.debug('found {} columns with markings'.format(len(results))) + return results def main(): @@ -51,8 +57,10 @@ def main(): logging.info('raw data parsed, going to structure data') structure_data(d) logging.info('data structured, parsed headers: {}'.format(d['headers'])) - for line in filter(None, d['content']): - parse_line(line) + # Isolate the span objects from the lines and columns + lines = filter(None, d['content']) + matches = [(line, parse_line(line)) for line in lines if parse_line(line)] + pprint.pprint(matches) if __name__ == '__main__':