Crawler column and row separation done
authorMart Lubbers <mart@martlubbers.net>
Thu, 22 May 2014 10:05:56 +0000 (12:05 +0200)
committerMart Lubbers <mart@martlubbers.net>
Thu, 22 May 2014 10:05:56 +0000 (12:05 +0200)
README.md
program/hypconvert/hypconvert.py
program/hypfront/hyper.py
program/hypfront/install.sh

index 5f41d2f..02d5943 100644 (file)
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-VERSION 0.01
+VERSION 0.02
 
 todo: git instructions mailen
 skype: hyperoffice
index 7aa7cd0..cf67dfd 100644 (file)
@@ -1,18 +1,56 @@
-#!/bin/env python
 # -*- coding: utf-8 -*-
-import pprint
-import json
+
 import ast
+import logging
+import pprint
 import re
 
# Pre-compiled patterns for picking apart the raw HTML table dump.
re_hdr = re.compile('<th>(?P<h>.*?)</th>')   # one header cell's text
re_row = re.compile('<tr>(?P<row>.*?)</tr>')  # one table row's inner HTML
re_cel = re.compile('<td>(?P<c>.*?)</td>')   # one data cell within a row
# A <span> whose inline style sets a background colour; named groups: the
# opening tag (b), the colour value (c) and the closing tag (e).
# NOTE(review): the greedy `.*` around background-color can over-match when
# several spans share a line — confirm against real crawler output.
re_spa = re.compile('(?P<b><span.*background-color:(?P<c>.*?);.*>).*?(?P<e></span>)')
+
+
def structure_data(d):
    """Split the HTML table dump in d['content'] into headers and rows.

    Mutates d in place: d['headers'] becomes the list of <th> texts and
    d['content'] is replaced by a list of rows, each a list of <td> texts.
    Rows without <td> cells (such as the header row) yield empty lists.
    """
    raw = d['content']
    d['content'] = []
    d['headers'] = [hit.group('h') for hit in re_hdr.finditer(raw)]
    logging.debug('headers parsed: {}'.format(d['headers']))
    for match in re_row.finditer(raw):
        logging.debug('going to parse: {}'.format(match.groupdict()))
        cells = [cell.group('c') for cell in re_cel.finditer(match.group('row'))]
        d['content'].append(cells)
        logging.debug('parsed into: {}'.format(d['content'][-1]))
    logging.debug('structured into: {}'.format(d))
+
+
def parse_line(line):
    """Scan each column of one parsed table row for colour-span markings.

    line -- a list of cell strings as produced by structure_data().
    """
    logging.debug('parsing line: {}'.format(line))
    for column in line:
        logging.debug('parsing column: {}'.format(column))
        markings = list(re_spa.finditer(column))
        # Fix: was the bare Python-2 statement `print markings` — a leftover
        # debug print (and a syntax error under Python 3) that bypassed the
        # logging configuration used everywhere else in this module.
        logging.debug('markings found: {}'.format(markings))
+
+
def main():
    """Load the raw crawler dump, normalise it and parse every table row."""
    logging.debug('loading file')
    with open('../output/raw_out.txt', 'r') as data:
        logging.info('raw data loaded, going to parse data')
        raw = data.readline()
        logging.debug('loaded into: {}'.format(raw))
        # Strip the Field('...', ...) wrappers step by step so the remainder
        # is a plain Python literal that ast.literal_eval can handle.
        raw = re.sub('\[Field\(\'.*?\', ', '', raw)
        raw = re.sub('\)\],', ',', raw)
        raw = re.sub('\)\]}$', '}', raw)
        logging.debug('converted to: {}'.format(raw))
        d = ast.literal_eval(raw)
        logging.debug('parsed into: {}'.format(d))
    logging.info('raw data parsed, going to structure data')
    structure_data(d)
    logging.info('data structured, parsed headers: {}'.format(d['headers']))
    # Rows with no <td> cells (e.g. the header row) parse to [] — skip them.
    for line in filter(None, d['content']):
        parse_line(line)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()
index 2591b6f..78e2a53 100644 (file)
@@ -92,9 +92,9 @@ def feed2html(req, url, name):
     feed = feedparser.parse(url)
     channel = feed.feed
     req.write('\t<table id="content-table" border="1" id="htab">\n')
-    req.write('\t\t<tr><th>Title</th><th>Summary</th><th>...</th></tr>\n')
+    req.write('\t\t<tr><th>Title</th><th>Summary</th></tr>\n')
     for i in feed.entries:
-        req.write('\t\t<tr><td>{}</td><td>{}</td><td>...</td></tr>\n'.
+        req.write('\t\t<tr><td>{}</td><td>{}</td></tr>\n'.
                   format(i['title'].encode('ascii', 'xmlcharrefreplace'),
                          i['summary'].encode('ascii', 'xmlcharrefreplace')))
     req.write('\t</table>\n<br />')
index 868501d..ab68ea4 100644 (file)
@@ -4,4 +4,4 @@ sudo cp -v ./*.{xml,py,html,js} /var/www/py/
 sudo mkdir /var/www/py/files
 sudo chown -R mart:www-data /var/www/py
 sudo chmod -R 770 /var/www/py
-ln -s /var/www/py/files/ ../output
+ln -s /var/www/py/files ../output