From 9d60e58160d75c0d07e0c53d4794975dc74faf53 Mon Sep 17 00:00:00 2001 From: Mart Lubbers Date: Thu, 22 May 2014 12:05:56 +0200 Subject: [PATCH] crawlen column and row separation done --- README.md | 2 +- program/hypconvert/hypconvert.py | 46 +++++++++++++++++++++++++++++--- program/hypfront/hyper.py | 4 +-- program/hypfront/install.sh | 2 +- 4 files changed, 46 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 5f41d2f..02d5943 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -VERSION 0.01 +VERSION 0.02 todo: git instructions mailen skype: hyperoffice diff --git a/program/hypconvert/hypconvert.py b/program/hypconvert/hypconvert.py index 7aa7cd0..cf67dfd 100644 --- a/program/hypconvert/hypconvert.py +++ b/program/hypconvert/hypconvert.py @@ -1,18 +1,56 @@ -#!/bin/env python # -*- coding: utf-8 -*- -import pprint -import json + import ast +import logging +import pprint import re +re_hdr = re.compile('(?P.*?)') +re_row = re.compile('(?P.*?)') +re_cel = re.compile('(?P.*?)') +re_spa = re.compile('(?P.*?);.*>).*?(?P)') + + +def structure_data(d): + con = d['content'] + d['content'] = [] + d['headers'] = [m.group('h') for m in re_hdr.finditer(con)] + logging.debug('headers parsed: {}'.format(d['headers'])) + for line in re_row.finditer(con): + logging.debug('going to parse: {}'.format(line.groupdict())) + row = line.group('row') + d['content'].append([c.group('c') for c in re_cel.finditer(row)]) + logging.debug('parsed into: {}'.format(d['content'][-1])) + logging.debug('structured into: {}'.format(d)) + + +def parse_line(line): + logging.debug('parsing line: {}'.format(line)) + for column in line: + logging.debug('parsing column: {}'.format(column)) + markings = list(re_spa.finditer(column)) + print markings + + def main(): + logging.debug('loading file') with open('../output/raw_out.txt', 'r') as data: + logging.info('raw data loaded, going to parse data') d = data.readline() + logging.debug('loaded into: {}'.format(d)) d = re.sub('\)\]}$', '}', re.sub('\)\],', ',', re.sub('\[Field\(\'.*?\', ', '', d))) + logging.debug('converted to: {}'.format(d)) d = ast.literal_eval(d) - pprint.pprint(d) + logging.debug('parsed into: {}'.format(d)) + logging.info('raw data parsed, going to structure data') + structure_data(d) + logging.info('data structured, parsed headers: {}'.format(d['headers'])) + for line in filter(None, d['content']): + parse_line(line) + if __name__ == '__main__': + logging.basicConfig(level=logging.INFO) main() diff --git a/program/hypfront/hyper.py b/program/hypfront/hyper.py index 2591b6f..78e2a53 100644 --- a/program/hypfront/hyper.py +++ b/program/hypfront/hyper.py @@ -92,9 +92,9 @@ def feed2html(req, url, name): feed = feedparser.parse(url) channel = feed.feed req.write('\t\n') - req.write('\t\t\n') + req.write('\t\t\n') for i in feed.entries: - req.write('\t\t\n'. + req.write('\t\t\n'. format(i['title'].encode('ascii', 'xmlcharrefreplace'), i['summary'].encode('ascii', 'xmlcharrefreplace'))) req.write('\t
TitleSummary...
TitleSummary
{}{}...
{}{}
\n
') diff --git a/program/hypfront/install.sh b/program/hypfront/install.sh index 868501d..ab68ea4 100644 --- a/program/hypfront/install.sh +++ b/program/hypfront/install.sh @@ -4,4 +4,4 @@ sudo cp -v ./*.{xml,py,html,js} /var/www/py/ sudo mkdir /var/www/py/files sudo chown -R mart:www-data /var/www/py sudo chmod -R 770 /var/www/py -ln -s /var/www/py/files/ ../output +ln -s /var/www/py/files ../output -- 2.20.1