-
Notifications
You must be signed in to change notification settings - Fork 0
/
Response_Retriver_Brexit.py
123 lines (95 loc) · 4.87 KB
/
Response_Retriver_Brexit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import tweepy as tw
import pandas as pd
import subprocess
import shlex
import os
import io
# Twitter API credentials.
# Each value is read from an environment variable so that personal keys do not
# have to be stored in this file. When a variable is not set the value falls
# back to the empty string, so anyone reusing this code can still paste their
# own credentials in place of the "" defaults below.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "")
atoken = os.environ.get("TWITTER_ACCESS_TOKEN", "")
asecret = os.environ.get("TWITTER_ACCESS_SECRET", "")

# Build the authenticated Tweepy client used by every request in this script.
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(atoken, asecret)
# NOTE(review): wait_on_rate_limit=True presumably makes Tweepy pause until the
# rate-limit window resets instead of failing - confirm against the installed
# Tweepy version.
api = tw.API(auth, wait_on_rate_limit=True)
### Build the initial (original) tweets DataFrame (df1)
# df1 is meant to hold all original tweets about one topic (the search words).
# The search could instead be driven by a list of hashtags kept in an external
# text file; this script is still a practice version and the general search
# strategy is undecided.
# If a list of old tweet ids is available, api.get_status(<id>) could be used
# to look the tweets up and build df1 that way.
search_words = "#brexit" + " -filter:retweets"
date_since = "2018-11-16"

# Request at most 50 English-language results for the query.
original_tweets = tw.Cursor(
    api.search,
    q=search_words,
    lang="en",
    since=date_since,
).items(50)

# One record per tweet: author handle, author location, text, tweet id, likes.
users_locs = [
    [
        status.user.screen_name,
        status.user.location,
        status.text,
        status.id,
        status.favorite_count,
    ]
    for status in original_tweets
]
tweet_text = pd.DataFrame(
    data=users_locs,
    columns=['user', "location", "text", "id", "likes"],
)

# Row count before de-duplication.
ldf1 = len(tweet_text)

# df1 is the name used in the documentation for the initial (original) tweets;
# rows with identical text are collapsed, keeping the last occurrence.
df1 = tweet_text.drop_duplicates(subset='text', keep="last")
### Extract responses to the original tweets based on tweet id
# For every de-duplicated original tweet in df1:
#   1. fetch one tweet from the author's timeline, using since_id = id - 1;
#   2. scan up to 10 recent tweets addressed to that author ("to:<name>");
#   3. keep those whose in_reply_to_status_id_str equals the fetched tweet's id.
# The four lists below are filled in lockstep, one entry per matching reply.
replies = []
full_text = []
user_name = []
response_likes = []

# Read the author and id from df1 itself. Indexing tweet_text by a running
# position would walk the rows *before* de-duplication and therefore pair the
# loop with the wrong tweets as soon as one duplicate has been dropped.
for name, tweet_id in zip(df1["user"], df1["id"]):
    o_id = tweet_id - 1
    # NOTE(review): .items(1) takes the first tweet the timeline call returns
    # after since_id; presumably that is the author's newest tweet, which is
    # not necessarily the original one - confirm.
    for full_tweets in tw.Cursor(api.user_timeline, screen_name=name, since_id=o_id, timeout=20).items(1):
        for retweet in tw.Cursor(api.search, q='to:' + name, result_type='recent', timeout=15).items(10):
            if hasattr(retweet, 'in_reply_to_status_id_str'):
                if retweet.in_reply_to_status_id_str == full_tweets.id_str:
                    replies.append(retweet.text)
                    full_text.append(full_tweets.text)
                    user_name.append(name)
                    response_likes.append(retweet.favorite_count)
                    print(replies)
                else:
                    print("no responses")
            else:
                print("Error")

# Assemble the collected replies; the column order matches the order in which
# the columns were previously attached one by one.
retweets_df = pd.DataFrame({
    "original_text": full_text,
    "response": replies,
    "name": user_name,
    "response_likes": response_likes,
})
# df2: replies with identical text are collapsed, keeping the last occurrence.
df2 = retweets_df.drop_duplicates(subset='response', keep="last")
### Create the text file that SentiStrength can work with
# Original tweet texts come first, followed by the response texts. Each entry
# is a one-element list taken from a single-column frame, so every line of the
# file is the string form of that list.
d1 = df1[['text']].values.tolist()
d2 = df2[["response"]].values.tolist()
d_full = d1 + d2
with open('tweets_text.txt', 'w', encoding="utf-8") as f:
    f.writelines("%s\n" % item for item in d_full)
## SentiStrength at work
# Every line of FileToClassify is sent to a SentiStrength process on stdin and
# the rating it prints is written, followed by a tab and the original line, to
# classifiedSentimentFile.
SentiStrengthLocation = 'sentiment_tool/SentiStrength.jar'  # The location of SentiStrength on your computer
SentiStrengthLanguageFolder = 'sentiment_tool/SentiStrength_Data/'  # The location of the unzipped SentiStrength data files on your computer
FileToClassify = "tweets_text.txt"
classifiedSentimentFile = "tweets_rating.txt"

# Build the argument list once. Passing a list avoids assembling a quoted
# command string and re-parsing it for every line, and it keeps working when a
# path contains a quote character.
senti_command = [
    "java", "-jar", SentiStrengthLocation,
    "stdin", "sentidata", SentiStrengthLanguageFolder,
]

# The output file is opened once for the whole run instead of once per line.
# NOTE(review): append mode is kept from the original, so ratings from earlier
# runs stay in the file - confirm this accumulation is intended.
with io.open(FileToClassify, encoding="utf-8") as f, \
        open(classifiedSentimentFile, "a", encoding="utf-8") as myfile:
    for line in f:
        p = subprocess.Popen(senti_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        b = bytes(line.replace(" ", "+"), 'utf-8')  # Can't send string in Python 3, must send bytes
        stdout_byte, stderr_text = p.communicate(b)
        if p.returncode != 0:
            # Report the failure instead of silently writing an empty rating.
            print("SentiStrength exited with code %d: %s" % (p.returncode, stderr_text.decode("utf-8", errors="replace").strip()))
        stdout_text = stdout_byte.decode("utf-8")  # convert from byte
        # Separate the positive and negative ratings with a tab instead of a space.
        stdout_text = stdout_text.rstrip().replace(" ", "\t")
        myfile.write(stdout_text + "\t" + line)
print("Finished! The results will be in:\n" + classifiedSentimentFile)
### Create the finished file
# Planned steps:
# 1. Replace the tab spacing of the SentiStrength output with "[" so that the
#    CSV file separates correctly.
# 2. Because the column separator is "[" after step 1, the delimiter used when
#    reading the CSV has to be set to "[" as well.
# 3. Separate the rows again into original tweets (df1) and responses (df2).
# 4. Combine the files with the extra information from the initial tweets.
# Only step 1 is carried out below.
with open('tweets_rating.txt', encoding="utf-8", errors="ignore") as infile, \
        open('tweets_rating_raw.csv', 'w', encoding="utf-8", errors="ignore") as outfile:
    outfile.writelines(rated_line.replace('\t', '[') for rated_line in infile)