-
Notifications
You must be signed in to change notification settings - Fork 2
/
spikes.py
170 lines (146 loc) · 5.14 KB
/
spikes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
null = ""
false = False
true = True
import pandas as pd
import json
from datetime import datetime
prev_avg = 0
prev_count = 0
def extract_datetime(tweet):
'''
Extracts datetime object from tweet object
'''
return datetime.strptime(tweet['created_at'], "%a %b %d %H:%M:%S +0000 %Y")
def datestring(tweet):
'''
Extracts date string
'''
x = tweet['datetime']
return str(x)[0:10]
def moving_average(row):
'''
Calculates moving average for each row
'''
global prev_avg
alpha = 1/row['n']
new_avg = (1-alpha)*prev_avg + alpha*row['count']
prev_avg = new_avg
return new_avg
def plus_one(row):
'''
Adds 1 to the index to create a column for 'n'-
the number of days thus far. We didn't directly
use the index because we sort the df and reset the
index a couple of times
'''
return row['index'] +1
def add_dates(start_date,increment):
'''
Helper function to add dates within a single month
'''
new_date = int(start_date[-2:]) + increment
new_string = start_date[:-2] + str(new_date)
return new_string
def get_tweetbuckets(counts_df):
'''
Convert df to appropriate output format
required to send to highcharts for plotting
'''
test = counts_df[['count','datestring']]
tweetbuckets = []
for index,row in test.iterrows():
dictionary = pd.Series.to_dict(row)
tweetbuckets.append(dictionary)
return tweetbuckets
def change_avg(row):
'''
Compute change over moving average
'''
val = row['count']
avg = row['moving_average']
return (val-avg)/avg
def change_count(row):
'''
Compute change over prev day's count
'''
global prev_count
if prev_count == 0:
prev_count = 1
current = row['count']
day_change = (current - prev_count)/prev_count
prev_count = current
return day_change
def spikes(sample_hashtag,threshold=0.2,spikes=3):
'''
Take in a hashtag dictionary, a threshold and the number of
spikes required and return
boolean: whether or not there is a spikes
spike_dates : list of dates on which spikes occurred
output : dictionary required to plot graph
list_of_list_of_tweets : tweet text needed for tweet scraper
'''
df = pd.DataFrame.from_dict(sample_hashtag['tweets'])
hashtag = sample_hashtag['hashtag']
df['datetime'] = df.apply (lambda row: extract_datetime(row),axis=1)
df['datestring'] = df.apply (lambda row: datestring(row), axis=1 )
counts = df['datestring'].value_counts()
counts_df = pd.Series.to_frame(counts)
counts_df['index'] = counts_df.index
counts_df.columns = ['count','datestring']
#sort the dataframe by date
counts_df = counts_df.sort_values(by='datestring')
counts_df = counts_df.reset_index()
counts_df = counts_df.drop(['index'],axis = 1)
counts_df['index'] = counts_df.index
prev_count = counts_df.iloc[0]['count']
counts_df['change_prev'] = counts_df.apply (lambda row: change_count(row), axis=1 )
counts_df['n'] = counts_df.apply (lambda row: plus_one(row), axis=1 )
prev_avg = 0
counts_df['moving_average'] = counts_df.apply (lambda row: moving_average(row), axis=1 )
counts_df['change'] = counts_df.apply (lambda row: change_avg(row), axis=1 )
counts_df = counts_df.sort_values(by='change',ascending=False)
has_spikes = []
output={}
spike_dates = []
list_of_list_of_tweets = []
for i in range(spikes):
change1 = counts_df.iloc[i]['change']
change2 = counts_df.iloc[i]['change_prev']
if change1 >= threshold and change2 >= threshold*2:
has_spike = True
spike_date = counts_df.iloc[i]['datestring']
filtered_df = df.loc[df['datestring'] == spike_date]
tweet_text_list = filtered_df['tweet_text'].tolist()
spike_dates.append(spike_date)
has_spikes.append(has_spike)
list_of_list_of_tweets.append(tweet_text_list)
if len(has_spikes)>0:
tweetbuckets = get_tweetbuckets(counts_df)
output['hashtag'] = hashtag
output['tweetbuckets'] = tweetbuckets
return True, spike_dates, output, list_of_list_of_tweets
return False, None, None, None
def insert_missing_data(start_date, end_date, counts_df):
'''
Helper function to plug in values for dates on which
RCC was down - ultimately, highcharts handles this issue
'''
# index by date
indexed_df = counts_df.set_index(['datestring'])
start_value = int(indexed_df['count'][start_date])
end_value = int(indexed_df['count'][end_date])
num_days = int(end_date[-2:]) - int(start_date[-2:])
missing_dict = {}
for i in range(num_days):
key = add_dates(start_date,i+1)
value = start_value + (end_value-start_value)/num_days
start_value = value
missing_dict[key]=value
counts_df = counts_df[counts_df['datestring'] != str(key)]
missing = pd.Series(missing_dict, name='count')
missing.index.name = 'datestring'
df = missing.to_frame()
df = df.reset_index()
new = counts_df.append(df, ignore_index=True)
new = new.sort_values(by='datestring')
return new