# twitter_streaming_tweepy.py -- forked from paticake/twitter-scraper
# this code allows us to search Twitter for keyword(s) and collect the tweets on a .csv file
# it can also collect tweets within a certain timeframe
import tweepy
import csv
import time
import json
# use the Tweepy python library to access the Twitter API
# Twitter API credentials -- fill these in before running.
# NOTE(review): keep real keys out of version control.
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
OAUTH_TOKEN = ''
OAUTH_TOKEN_SECRET = ''
# OAuth 1a sign-in: app credentials first, then the user access token.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
# REST client used by the search loop below. NOTE(review): tweepy expects
# wait_on_rate_limit=True to be passed HERE (tweepy.API), not to search --
# the comment further down suggests it was intended; confirm and add it.
api = tweepy.API(auth)
# this function takes out characters that can break import into Excel and replaces them with spaces
def getVal(val):
    """Return *val* with the CSV delimiter '|' and CR/LF replaced by spaces.

    Returns '' for None or empty input.  The original returned UTF-8 bytes
    (`val.encode('utf-8')`, a Python 2 habit); under Python 3 that made
    csv.writer emit literal b'...' strings into the file, so a plain str
    is returned instead and encoding is left to the file object.
    """
    if not val:
        return ""
    # '|' is the csv delimiter used below; CR/LF would break row structure.
    return val.replace('|', ' ').replace('\n', ' ').replace('\r', ' ')
# sets up a file to write to: '|' delimiter (pipes inside tweet text are
# stripped by getVal).  newline='' stops the csv module from doubling line
# endings on Windows; utf-8 keeps emoji / non-ASCII tweet text intact.
csvfile = open('streamed_tweets.csv', 'w', newline='', encoding='utf-8')
csvwriter = csv.writer(csvfile, delimiter='|')
query = ''  # keyword(s) to search for
max_tweets = 10000  # upper bound on the number of tweets to collect
#shows full text of truncated tweets
# NOTE(review): the `self` parameter suggests this was copied out of a tweepy
# StreamListener subclass; as written it is a module-level function and
# nothing visible in this file calls it -- confirm whether it is still needed.
def on_status(self, status):
    # Retweets carry the untruncated text on the *retweeted* status object,
    # not on the wrapper, so unwrap it first.
    if hasattr(status, "retweeted_status"): # Check if Retweet
        try:
            # extended_tweet holds 'full_text' when the tweet was truncated
            print(status.retweeted_status.extended_tweet["full_text"])
        except AttributeError:
            # short tweet: no extended_tweet attribute, plain .text is complete
            print(status.retweeted_status.text)
    else:
        try:
            print(status.extended_tweet["full_text"])
        except AttributeError:
            print(status.text)
# since_id and max_id bound the scrape to a tweet-ID window when not
# collecting tweets live; leave them as None to collect the most recent.
# tweet_mode='extended' avoids tweets showing as truncated.
# NOTE(review): wait_on_rate_limit is an argument of tweepy.API(...), not of
# search -- pass it where `api` is constructed so rate limits are waited out.
since_id = None  # only tweets newer than this id (exclusive), or None
max_id = None    # only tweets up to this id (inclusive), or None

# Original had `since_id=,` / `max_id=,` -- a syntax error; build the
# keyword set conditionally instead so unset bounds are simply omitted.
search_kwargs = {'q': query, 'tweet_mode': 'extended'}
if since_id is not None:
    search_kwargs['since_id'] = since_id
if max_id is not None:
    search_kwargs['max_id'] = max_id

for status in tweepy.Cursor(api.search, **search_kwargs).items(max_tweets):
    try:
        tweet = dict(status._json)
        # extended mode puts the untruncated text under 'full_text';
        # fall back to 'text' in case a payload arrives non-extended.
        tweet_text = tweet.get('full_text', tweet.get('text', ''))
        print(tweet_text)
        # write one pipe-delimited row per tweet to the .csv file
        csvwriter.writerow([
            tweet['created_at'],
            getVal(tweet['user']['screen_name']),
            getVal(tweet_text),
            getVal(tweet['user']['location']),
            tweet['user']['geo_enabled'],
            tweet['place'],
            tweet['lang'],
            tweet['retweet_count'],
            tweet['source'],
            tweet['user']['verified'],
            tweet['user']['statuses_count'],
            tweet['user']['followers_count'],
            tweet['user']['friends_count'],
            tweet['user']['created_at'],
            tweet['user']['id']
        ])
    except (KeyError, TypeError) as err:
        # Original used Python 2 `except Exception, err:` (a syntax error in
        # Python 3) and silently swallowed everything; report and skip the
        # malformed payload instead of dying mid-scrape.
        print(err)

csvfile.close()  # flush buffered rows to disk