[bsc-thesis1415.git] / program / everything / input_app.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from mod_python import apache, util
import feedparser
import index
import crawler
import re
import urllib
import buttons


def req_pre_pos(req):
    req.log_error('handler')
    req.content_type = 'text/html'
    req.send_http_header()
    args = util.FieldStorage(req)
    listing, crawl = data_main(args)
    req.write(
        '<html>\n<head>\n'
        '\t<title>VER: 0.01 - HyperFrontend RSS feed POSTREQUEST</title>'
        '</head>\n<body>\n'
        '\tThanks for submitting: <br />\n'
        '\t<a href="index.py">Go back...</a>\n<pre>\n'
        'Current crawlers: {}\n</pre>\n</body>\n</html>'.format(listing))


def structure_data(d):
    re_hdr = re.compile('<th>(?P<h>.*?)</th>', flags=re.MULTILINE | re.DOTALL)
    re_row = re.compile('<tr>(?P<row>.*)</tr>', flags=re.MULTILINE | re.DOTALL)
    re_dualcel = re.compile('<td id="cel">(?P<c>.*?)</td><!--cel-->',
                            flags=re.MULTILINE | re.DOTALL)
    con = d['content']
    d['content'] = []
    d['headers'] = []
    for line in con.split('<!--line-->'):
        if not line or '<!--cel-->' not in line:
            continue
        row = re_row.search(line)
        row = row.group('row')
        for header in re_hdr.finditer(row):
            d['headers'].append(header.group('h'))
        d['content'].append([])
        for cell in re_dualcel.finditer(row):
            d['content'][-1].append(cell.group('c'))


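# Illustrative sketch of what structure_data() produces (an assumption based
# on the markers used above, not output captured from the running app): a
# posted chunk delimited by <!--line--> such as
#   <tr><td id="cel">Some title</td><!--cel--><td id="cel">Some summary</td><!--cel--></tr>
# ends up as one entry d['content'][-1] == ['Some title', 'Some summary'],
# while any <th>...</th> headers found in the same chunk are collected into
# d['headers'].
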
def parse_line(line):
    re_spa = re.compile('(?P<b><span.*?background-color:\s*(?P<c>.*?);.*?>)(?P'
                        '<content>.*?)(?P<e></span>)')
    results = []
    for column in line:
        results.append([])
        markings = list(re_spa.finditer(column))
        if markings:
            results[-1].append(markings)
    return results


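# Illustrative sketch (an assumption about the client-side markup, inferred
# from the regex above): a highlighted fragment posted back as
#   <span style="background-color: rgb(255, 0, 0);">2014</span>
# yields one match whose group 'c' is the colour, group 'content' is '2014',
# and groups 'b'/'e' delimit the wrapping tags that create_nodes() strips out.
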
def create_nodes(d):
    color_dict = {'rgb({}, {}, {})'.format(
        int(x[1:3], 16), int(x[3:5], 16), int(x[5:], 16)): chr(i)
        for i, (x, _) in enumerate(buttons.BUTTONPAIRS, 1)}
    line_w_match = []
    # d['content'] = d['content'][1:]
    for i, m in enumerate(d['matchdata']):
        if filter(None, m):
            line_w_match.append((d['content'][i], m))
    nodelists = {'Title': [], 'Summary': []}
    for (title_l, summary_l), (title_m, summary_m) in line_w_match:
        # Title
        if title_m:
            title = title_m[0]
            matches = reversed(sorted(title, key=lambda x: x.end('e')))
            for match in matches:
                title_l = title_l[:match.start('e')] + title_l[match.end('e'):]
                title_l = title_l[:match.start('content')] +\
                    color_dict[match.group('c').strip()] +\
                    title_l[match.end('content'):]
                title_l = title_l[:match.start('b')] + title_l[match.end('b'):]
            nodelists['Title'].append(title_l)
        # Summary
        if summary_m:
            summary = summary_m[0]
            matches = reversed(sorted(summary, key=lambda x: x.end('e')))
            for match in matches:
                summary_l = summary_l[:match.start('e')] +\
                    summary_l[match.end('e'):]
                summary_l = summary_l[:match.start('content')] +\
                    color_dict[match.group('c').strip()] +\
                    summary_l[match.end('content'):]
                summary_l = summary_l[:match.start('b')] +\
                    summary_l[match.end('b'):]
            nodelists['Summary'].append(summary_l)
    return nodelists


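# Illustrative sketch (assumption: BUTTONPAIRS holds '#rrggbb' colour strings,
# as the hex parsing above implies): with BUTTONPAIRS[0][0] == '#ff0000',
# color_dict maps 'rgb(255, 0, 0)' to chr(1), so a marked-up cell such as
# '<span style="background-color: rgb(255, 0, 0);">2014</span> news' is
# rewritten to '\x01 news' before being handed to the DAWG builder.
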
def to_dot(q0):
    nodenum = 0
    final_nodes = []
    nodes = []
    edges = []
    to_visit = [(0, q0)]
    visited = set()
    translation = []
    if q0.final:
        final_nodes.append(nodenum)
    else:
        nodes.append(nodenum)

    nodenum += 1
    while to_visit:
        current = to_visit.pop()
        if current[0] not in visited:
            visited.add(current[0])
            for char, child in current[1].children.iteritems():
                matches = [c for c in translation if c[0] == child]
                curnum = -1
                if matches:
                    curnum = matches[-1][1]
                else:
                    translation.append((child, nodenum))
                    curnum = nodenum
                    nodenum += 1
                if child.final:
                    final_nodes.append(curnum)
                else:
                    nodes.append(curnum)
                edges.append((current[0], char, curnum))
                to_visit.append((curnum, child))
    print 'digraph dawg {'
    print '\tnode [shape = doublecircle]; {}'.format(
        ' '.join(str(n) for n in final_nodes))
    print '\tnode [shape = circle]; {}'.format(
        ' '.join(str(n) for n in nodes))
    for fr, ch, to in edges:
        print '\t{} -> {} [label = "{}"];'.format(fr, to, ch)
    print '}'


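# Illustrative output shape of to_dot() (assumption: a tiny three-state
# automaton, not a real run):
#   digraph dawg {
#       node [shape = doublecircle]; 2
#       node [shape = circle]; 0 1
#       0 -> 1 [label = "a"];
#       1 -> 2 [label = "b"];
#   }
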
def data_main(d):
    """Turn the posted, marked-up feed table into a crawler entry and store it."""
    d = {k: str(v) for k, v in dict(d).iteritems() if k != 'write'}
    structure_data(d)
    d['matchdata'] = []
    for line in filter(None, d['content']):
        d['matchdata'].append(parse_line(line))
    nodelists = create_nodes(d)
    d['titledawg'] = nodelists['Title']
    d['summarydawg'] = nodelists['Summary']
    del d['matchdata']
    crawl = crawler.Crawler()
    crawl.add_entry(d)
    crawl.write()
    return crawl.list_names(), crawl


def feed2html(req, url, name):
    url = urllib.unquote(url)
    url = url if re.match('https?://', url) else 'http://{}'.format(url)
    req.write(
        '\tLoading "{}" as <p id="rssname">{}</p><br />\n'.format(url, name))
    feed = feedparser.parse(url)
    req.write('\t<table id="content-table" border="1">\n')
    req.write('\t\t<tr><th>Title</th><th>Summary</th></tr>\n')
    for i in feed.entries[:10]:
        req.write(('\t\t<tr><td id="cel">{}</td><!--cel--><td id="cel">{}</td>'
                   '<!--cel--></tr>\n').format(
            i['title'].encode('ascii', 'xmlcharrefreplace'),
            i['summary'].encode('ascii', 'xmlcharrefreplace')))
    req.write('\t</table>\n<br />')


def handler(req):
    if req.uri.split('/')[-1] == 'index.py':
        return index.index(req, util.FieldStorage(req), apache.OK)
    elif req.uri.split('/')[-1] == 'crawler_test.py':
        return index.crawler_test(req, util.FieldStorage(req), apache.OK)
    elif req.uri.split('/')[-1] == 'crawler_edit.py':
        return index.crawler_edit(req, util.FieldStorage(req), apache.OK)
    elif req.uri.split('/')[-1] == 'crawler_new.py':
        return index.crawler_new(req, util.FieldStorage(req), apache.OK)
    elif req.uri.split('/')[-1] == 'crawler_xml.py':
        return index.crawler_xml(req, util.FieldStorage(req), apache.OK)
    else:
        if req.method == "POST":
            if req.uri.split('/')[-1] == 'hyper.py':
                req_pre_pos(req)
            elif req.uri.split('/')[-1] == 'preview.py':
                args = util.FieldStorage(req)
                cr = crawler.Crawler()
                listing, crawl = data_main(args)
                returncode = index.crawler_test(req, args, apache.OK)
                cr.write()
                return returncode
        else:
            req.write('Unknown case')
    return apache.OK
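
# One possible Apache/mod_python configuration for this handler (illustrative
# sketch only; the directory path is an assumption and the real deployment
# config is not part of this file):
#   <Directory /var/www/everything>
#       AddHandler mod_python .py
#       PythonHandler input_app
#       PythonDebug On
#   </Directory>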