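# twitter.py
#
# Convert tweets (tweepy-style status objects) into tweets.json, caching any
# attached images and videos on disk along the way. Python 2.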
import re
import os
import sys
import json
import time
import textwrap
import hashlib
import calendar
import traceback
from collections import namedtuple
from operator import itemgetter
from cStringIO import StringIO
from contextlib import closing
import requests
from PIL import Image
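# blocked.txt holds one blocked user name per line; see is_tweet_garbage().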
blocked = set(line.strip() for line in open("blocked.txt") if line.strip())
Video = namedtuple("Video", "duration url")
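# Extract the display text, image URLs, and best video variant from a tweet's
# entities, stripping media URLs out of the text as we go.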
def extract_content(tweet):
text = tweet.text
all_entities = tweet.entities.copy()
if hasattr(tweet, 'extended_entities'):
all_entities.update(tweet.extended_entities)
sorted_entities = []
for etype, entities in all_entities.iteritems():
for entity in entities:
s, e = entity['indices']
sorted_entities.append((s, e, etype, entity))
sorted_entities.sort(reverse=True)
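    # Process entities from the end of the text backwards so that earlier
    # indices stay valid after each replacement.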
    def replace(string, s, e, new):
        return string[:s] + new + string[e:]
images = []
video = None
for s, e, etype, entity in sorted_entities:
if etype == 'media':
text = replace(text, s, e, '')
url = None
for size in ('large', 'medium'): # try the preferred sizes
size_info = entity['sizes'].get(size)
if size_info and size_info['h'] < 2048 and size_info['w'] < 2048:
url = entity['media_url_https'] + ":" + size
break
if url is None:
url = entity['media_url_https']
images.append(url)
video_info = entity.get('video_info')
if video_info:
print >>sys.stderr, "got videos: %r" % (video_info,)
duration = video_info['duration_millis'] / 1000. \
if 'duration_millis' in video_info else None
usable_videos = []
for variant in video_info['variants']:
if variant['content_type'] == 'video/mp4':
usable_videos.append((variant['bitrate'], variant['url']))
usable_videos.sort(reverse=True) # highest bitrate first
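                # Drop variants above 1 Mbps as long as a lower-bitrate fallback remains.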
while len(usable_videos) >= 2:
highest_bitrate = usable_videos[0][0]
if highest_bitrate > 1000000:
print >>sys.stderr, "discarding variant with too high bitrate %d" % highest_bitrate
usable_videos.pop(0)
else:
break
if usable_videos:
video = Video(duration, usable_videos[0][1])
elif etype == 'urls':
text = replace(text, s, e, entity['display_url'])
# why does twitter return html entities!?
text = text.replace("&", "&")
text = text.replace("<", "<")
text = text.replace(">", ">")
return text, images, video
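# Media files are cached on disk under names keyed by the md5 of their URL,
# so repeated runs reuse earlier downloads.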
def cache_image(url, ext='jpg'):
cache_name = 'cache-image-%s.%s' % (hashlib.md5(url).hexdigest(), ext)
print >>sys.stderr, 'caching %s' % url
if not os.path.exists(cache_name):
try:
r = requests.get(url, timeout=20)
fobj = StringIO(r.content)
im = Image.open(fobj) # test if it opens
del fobj
im.save(cache_name)
        except Exception:
traceback.print_exc()
return
return cache_name
def cache_video(url):
cache_name = 'cache-video-%s.mp4' % hashlib.md5(url).hexdigest()
print >>sys.stderr, 'caching %s' % url
if not os.path.exists(cache_name):
try:
with closing(requests.get(url, stream=True, timeout=20)) as r:
                with open(cache_name, "wb") as out:
                    for chunk in r.iter_content(chunk_size=2**16):
out.write(chunk)
        except Exception:
traceback.print_exc()
return
return cache_name
def cache_images(urls):
cached_images = []
for url in urls:
cached = cache_image(url)
if cached:
cached_images.append(cached)
return cached_images
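# Wrap tweet text for display: wrap each line separately, but if that yields
# more than 8 lines, rewrap the whole text ignoring the original line breaks.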
def wrap(text, width=30):
out = []
for line in text.splitlines():
out.extend(textwrap.wrap(line, width=width))
if len(out) > 8: # more than 8 lines
out = textwrap.wrap(text, width=width)
return out
def profile_image(url):
    url = url.replace('normal', '200x200')  # request the 200x200 avatar variant
image = cache_image(url, 'png')
if not image:
return 'default-profile.png'
return image
def clean_whitespace(text):
    return re.sub(r"\s+", " ", text).strip()
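# Build the JSON-ready dict for one tweet, caching its media as a side effect.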
def convert(tweet):
text, images, video = extract_content(tweet)
cached_images = cache_images(images)
cached_video = cache_video(video.url) if video else None
converted = dict(
name = tweet.user.name,
created_at = calendar.timegm(tweet.created_at.utctimetuple()),
screen_name = tweet.user.screen_name,
text = clean_whitespace(text),
lines = wrap(text),
profile_image = profile_image(tweet.user.profile_image_url_https),
images = cached_images,
)
if cached_video:
converted['video'] = dict(
filename = cached_video,
)
if video.duration:
converted['video']['duration'] = video.duration
return converted
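# Convert all tweets and write them, newest first, to tweets.json as UTF-8 JSON.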
def save_tweets(tweets):
tweets = [convert(tweet) for tweet in tweets]
tweets.sort(key=itemgetter("created_at"), reverse=True)
    with open('tweets.json', 'wb') as f:
f.write(json.dumps(tweets, ensure_ascii=False).encode('utf-8'))
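# Heuristics for tweets that should never be shown; each rejection is logged
# to stderr with its reason.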
def is_tweet_garbage(tweet):
if tweet.user.name in blocked:
print >>sys.stderr, "GARBAGE: blocked user"
return True
if hasattr(tweet, 'retweeted_status'):
print >>sys.stderr, "GARBAGE: ehh. retweet"
return True
if tweet.user.default_profile:
print >>sys.stderr, "GARBAGE: Default profile"
return True
if tweet.user.default_profile_image:
print >>sys.stderr, "GARBAGE: Default profile image"
return True
if len(tweet.text) < 10:
print >>sys.stderr, "GARBAGE: Too short"
return True
if tweet.text.startswith("."):
print >>sys.stderr, "GARBAGE: Dot tweet"
return True
if tweet.text.startswith("@"):
print >>sys.stderr, "GARBAGE: starts with @"
return True
if tweet.text.startswith("RT "):
print >>sys.stderr, "GARBAGE: starts with RT"
return True
if tweet.user.followers_count < 10:
print >>sys.stderr, "GARBAGE: too few followers"
return True
if tweet.user.description is None:
print >>sys.stderr, "GARBAGE: no description"
return True
return False
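# Keep at most `count` tweets dated on or after not_before, optionally dropping
# garbage, then write them out.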
def filter_and_save(tweets, not_before, count, filter_garbage):
print >>sys.stderr, "got %d tweets" % len(tweets)
for tweet in tweets:
print >>sys.stderr, "%s %s" % (
tweet.created_at.date(), not_before
)
tweets = [
tweet for tweet in tweets
if tweet.created_at.date() >= not_before and
(not filter_garbage or not is_tweet_garbage(tweet))
][:count]
print >>sys.stderr, "handling %d tweets" % len(tweets)
save_tweets(tweets)
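# Remove cached media older than max_age seconds and reload the blocked list.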
def cleanup(max_age=12*3600):
global blocked
    blocked = set(line.strip() for line in open("blocked.txt") if line.strip())
now = time.time()
for filename in os.listdir("."):
if not filename.startswith('cache-'):
continue
age = now - os.path.getctime(filename)
if age > max_age:
try:
os.unlink(filename)
            except Exception:
traceback.print_exc()
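
# A minimal driver sketch, assuming the status objects come from tweepy; the
# credential environment variable names and the two-day window are
# illustrative assumptions, not part of the module's contract.
if __name__ == "__main__":
    import datetime
    import tweepy  # assumed source of the status objects used above

    # Illustrative credential plumbing; substitute your own configuration.
    auth = tweepy.OAuthHandler(os.environ["CONSUMER_KEY"],
                               os.environ["CONSUMER_SECRET"])
    auth.set_access_token(os.environ["ACCESS_TOKEN"],
                          os.environ["ACCESS_TOKEN_SECRET"])
    api = tweepy.API(auth)

    cleanup()  # prune stale cache files and reload the blocked list
    not_before = datetime.date.today() - datetime.timedelta(days=2)
    filter_and_save(api.home_timeline(count=200), not_before,
                    count=20, filter_garbage=True)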