[bsc-thesis1415.git] / program / everything / input_app.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from mod_python import apache, util
import feedparser
import index
import crawler
import re
import urllib
import buttons


def req_pre_pos(req):
    req.log_error('handler')
    req.content_type = 'text/html'
    req.send_http_header()
    args = util.FieldStorage(req)
    listing, crawl = data_main(args)
    req.write(
        '<html>\n<head>\n'
        '\t<title>VER: 0.01 - HyperFrontend RSS feed POSTREQUEST</title>'
        '</head>\n<body>\n'
        '\tThanks for submitting: <br />\n'
        '\t<a href="index.py">Go back...</a>\n<pre>\n'
        'Current crawlers: {}\n</pre>\n</body>\n</html>'.format(listing))


def structure_data(d):
    re_hdr = re.compile('<th>(?P<h>.*?)</th>', flags=re.MULTILINE | re.DOTALL)
    re_row = re.compile('<tr>(?P<row>.*)</tr>', flags=re.MULTILINE | re.DOTALL)
    re_dualcel = re.compile('<td id="cel">(?P<c>.*?)</td><!--cel-->',
                            flags=re.MULTILINE | re.DOTALL)
    con = d['content']
    d['content'] = []
    d['headers'] = []
    for line in con.split('<!--line-->'):
        if not line or '<!--cel-->' not in line:
            continue
        row = re_row.search(line)
        row = row.group('row')
        for header in re_hdr.finditer(row):
            d['headers'].append(header.group('h'))
        d['content'].append([])
        for cell in re_dualcel.finditer(row):
            d['content'][-1].append(cell.group('c'))


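# Illustrative sketch of what structure_data() produces (an assumption based
# on the markers used above, not output captured from the running app): a
# posted chunk delimited by <!--line--> such as
#   <tr><td id="cel">Some title</td><!--cel--><td id="cel">Some summary</td><!--cel--></tr>
# ends up as one entry d['content'][-1] == ['Some title', 'Some summary'],
# while any <th>...</th> headers found in the same chunk are collected into
# d['headers'].
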
def parse_line(line):
    re_spa = re.compile('(?P<b><span.*?background-color:\s*(?P<c>.*?);.*?>)(?P'
                        '<content>.*?)(?P<e></span>)')
    results = []
    for column in line:
        results.append([])
        markings = list(re_spa.finditer(column))
        if markings:
            results[-1].append(markings)
    return results


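# Illustrative sketch (an assumption about the client-side markup, inferred
# from the regex above): a highlighted fragment posted back as
#   <span style="background-color: rgb(255, 0, 0);">2014</span>
# yields one match whose group 'c' is the colour, group 'content' is '2014',
# and groups 'b'/'e' delimit the wrapping tags that create_nodes() strips out.
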
def create_nodes(d):
    color_dict = {'rgb({}, {}, {})'.format(
        int(x[1:3], 16), int(x[3:5], 16), int(x[5:], 16)): chr(i)
        for i, (x, _) in enumerate(buttons.BUTTONPAIRS, 1)}
    line_w_match = []
    # d['content'] = d['content'][1:]
    for i, m in enumerate(d['matchdata']):
        if filter(None, m):
            line_w_match.append((d['content'][i], m))
    nodelists = {'Title': [], 'Summary': []}
    for (title_l, summary_l), (title_m, summary_m) in line_w_match:
        # Title
        if title_m:
            title = title_m[0]
            matches = reversed(sorted(title, key=lambda x: x.end('e')))
            for match in matches:
                title_l = title_l[:match.start('e')] + title_l[match.end('e'):]
                title_l = title_l[:match.start('content')] +\
                    color_dict[match.group('c').strip()] +\
                    title_l[match.end('content'):]
                title_l = title_l[:match.start('b')] + title_l[match.end('b'):]
            nodelists['Title'].append(title_l)
        # Summary
        if summary_m:
            summary = summary_m[0]
            matches = reversed(sorted(summary, key=lambda x: x.end('e')))
            for match in matches:
                summary_l = summary_l[:match.start('e')] +\
                    summary_l[match.end('e'):]
                summary_l = summary_l[:match.start('content')] +\
                    color_dict[match.group('c').strip()] +\
                    summary_l[match.end('content'):]
                summary_l = summary_l[:match.start('b')] +\
                    summary_l[match.end('b'):]
            nodelists['Summary'].append(summary_l)
    return nodelists


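# Illustrative sketch (assumption: BUTTONPAIRS holds '#rrggbb' colour strings,
# as the hex parsing above implies): with BUTTONPAIRS[0][0] == '#ff0000',
# color_dict maps 'rgb(255, 0, 0)' to chr(1), so a marked-up cell such as
# '<span style="background-color: rgb(255, 0, 0);">2014</span> news' is
# rewritten to '\x01 news' before being handed to the DAWG builder.
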
def to_dot(q0):
    nodenum = 0
    final_nodes = []
    nodes = []
    edges = []
    to_visit = [(0, q0)]
    visited = set()
    translation = []
    if q0.final:
        final_nodes.append(nodenum)
    else:
        nodes.append(nodenum)

    nodenum += 1
    while to_visit:
        current = to_visit.pop()
        if current[0] not in visited:
            visited.add(current[0])
            for char, child in current[1].children.iteritems():
                matches = [c for c in translation if c[0] == child]
                curnum = -1
                if matches:
                    curnum = matches[-1][1]
                else:
                    translation.append((child, nodenum))
                    curnum = nodenum
                    nodenum += 1
                if child.final:
                    final_nodes.append(curnum)
                else:
                    nodes.append(curnum)
                edges.append((current[0], char, curnum))
                to_visit.append((curnum, child))
    print 'digraph dawg {'
    print '\tnode [shape = doublecircle]; {}'.format(
        ' '.join(str(n) for n in final_nodes))
    print '\tnode [shape = circle]; {}'.format(
        ' '.join(str(n) for n in nodes))
    for fr, ch, to in edges:
        print '\t{} -> {} [label = "{}"];'.format(fr, to, ch)
    print '}'


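# Illustrative output shape of to_dot() (assumption: a tiny three-state
# automaton, not a real run):
#   digraph dawg {
#       node [shape = doublecircle]; 2
#       node [shape = circle]; 0 1
#       0 -> 1 [label = "a"];
#       1 -> 2 [label = "b"];
#   }
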
def data_main(d):
    """Turn the posted, marked-up feed table into a crawler entry and store it."""
    d = {k: str(v) for k, v in dict(d).iteritems() if k != 'write'}
    structure_data(d)
    d['matchdata'] = []
    for line in filter(None, d['content']):
        d['matchdata'].append(parse_line(line))
    nodelists = create_nodes(d)
    d['titledawg'] = nodelists['Title']
    d['summarydawg'] = nodelists['Summary']
    del d['matchdata']
    crawl = crawler.Crawler()
    crawl.add_entry(d)
    crawl.write()
    return crawl.list_names(), crawl


def feed2html(req, url, name):
    url = urllib.unquote(url)
    url = url if re.match('https?://', url) else 'http://{}'.format(url)
    req.write(
        '\tLoading "{}" as <p id="rssname">{}</p><br />\n'.format(url, name))
    feed = feedparser.parse(url)
    req.write('\t<table id="content-table" border="1">\n')
    req.write('\t\t<tr><th>Title</th><th>Summary</th></tr>\n')
    for i in feed.entries[:10]:
        req.write(('\t\t<tr><td id="cel">{}</td><!--cel--><td id="cel">{}</td>'
                   '<!--cel--></tr>\n').format(
            i['title'].encode('ascii', 'xmlcharrefreplace'),
            i['summary'].encode('ascii', 'xmlcharrefreplace')))
    req.write('\t</table>\n<br />')


def handler(req):
    if req.uri.split('/')[-1] == 'index.py':
        return index.index(req, util.FieldStorage(req), apache.OK)
    elif req.uri.split('/')[-1] == 'crawler_test.py':
        return index.crawler_test(req, util.FieldStorage(req), apache.OK)
    elif req.uri.split('/')[-1] == 'crawler_edit.py':
        return index.crawler_edit(req, util.FieldStorage(req), apache.OK)
    elif req.uri.split('/')[-1] == 'crawler_new.py':
        return index.crawler_new(req, util.FieldStorage(req), apache.OK)
    elif req.uri.split('/')[-1] == 'crawler_xml.py':
        return index.crawler_xml(req, util.FieldStorage(req), apache.OK)
    else:
        if req.method == "POST":
            if req.uri.split('/')[-1] == 'hyper.py':
                req_pre_pos(req)
            elif req.uri.split('/')[-1] == 'preview.py':
                args = util.FieldStorage(req)
                cr = crawler.Crawler()
                listing, crawl = data_main(args)
                returncode = index.crawler_test(req, args, apache.OK)
                cr.write()
                return returncode
        else:
            req.write('Unknown case')
    return apache.OK
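
# One possible Apache/mod_python configuration for this handler (illustrative
# sketch only; the directory path is an assumption and the real deployment
# config is not part of this file):
#   <Directory /var/www/everything>
#       AddHandler mod_python .py
#       PythonHandler input_app
#       PythonDebug On
#   </Directory>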