-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathyle.py
executable file
·32 lines (27 loc) · 1.03 KB
/
yle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import sys
import urllib2

from BeautifulSoup import BeautifulSoup
def fetch_data():
    """Download the YLE teksti-TV page P100_01 and return its raw body.

    Returns:
        The undecoded response body exactly as read from the HTTP response.

    Raises:
        urllib2.URLError: propagated from ``urllib2.urlopen`` on failure.
    """
    response = urllib2.urlopen("http://www.yle.fi/tekstitv/txt/P100_01.html")
    try:
        return response.read()
    finally:
        # Release the connection explicitly instead of leaving it to
        # garbage collection.
        response.close()
def print_http_headers(content_length):
    """Write the HTTP response headers and the terminating blank line.

    Args:
        content_length: Integer size of the body that follows; formatted
            with ``%d`` into the ``Content-Length`` header.

    The output is the four header lines followed by exactly one empty line,
    so the next thing written to stdout is the first byte of the body.
    """
    # Single-argument parenthesised prints behave the same whether ``print``
    # is a statement or a function.
    print("\n".join((
        "Expires: -1",
        "Cache-Control: private, max-age=0",
        "Content-Type: text/html; charset=utf-8",
        "Content-Length: %d" % content_length,
    )))
    # One empty line ends the header section. Printing "\n" here would emit
    # two newlines and push a stray newline into the body.
    print("")
# Script body: fetch the page, turn every <big> headline that links to a
# numbered page into a <div class="item"> link, and emit the result as an
# HTTP response on stdout.

# Matches one serialized <big> element: group 1 is the link target, group 2
# the numeric link text, group 3 whatever markup follows the link.
headline_re = re.compile(r'<big><a href="([^"]+)">(\d+)</a>(.+)</big>')

soup = BeautifulSoup(fetch_data().decode("iso-8859-1"))
bigs = soup.findAll('big')

items = []
for big in bigs:
    m = headline_re.search(str(big))
    if not m:
        continue
    href = m.group(1)
    inner = m.group(3)
    # Re-parse the trailing markup and keep only its text nodes.
    title = ''.join(BeautifulSoup(inner).findAll(text=True))
    items.append(
        '<div class="item"><a target="_blank" href="%s">%s</a></div>\n'
        % (href, title)
    )
response_text = ''.join(items)

# NOTE(review): the byte replacement below swaps one specific UTF-8 sequence
# for "ä"; it looks like a workaround for a mis-decoded character in the
# re-parsed titles -- confirm before removing.
body = response_text.encode("utf-8").replace("\xec\xb1\x84", "ä")

# Content-Length must describe the bytes actually sent, so measure the body
# after encoding and replacement, and write it without a trailing newline.
print_http_headers(len(body))
sys.stdout.write(body)