#!/usr/local/bin/python3
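"""
Process one chunk of a Twitter edge-list crawl job.

Each run loads the number of the current crontab job, reads the next chunk of
Twitter IDs for that job, fetches the accounts each ID follows via the
Twitter API, appends the resulting (user, friend) edges to the job's
edge-list CSV, and advances the chunk start index for the next run.
"""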
import csv
import os
from time import gmtime, sleep, strftime

import pandas as pd

from constants import (
    EDGELIST_FAILED_FNAME,
    EDGELIST_FNAME,
    EDGELIST_JOBS_FOLDER_NAME,
    NEXT_CHUNK_START_FNAME,
    NEXT_EDGELIST_JOB_FNAME,
    TWITTER_IDS_FNAME,
)
from utils.job_load import get_edgelist_job_dir_path
from utils.twitter_auth import get_twitter_api

def load_crontab_job_num():
    """
    Load the number of the crontab job to process in this run.
    """
    with open(
        f"{os.path.abspath(os.getcwd())}{EDGELIST_JOBS_FOLDER_NAME}{NEXT_EDGELIST_JOB_FNAME}",
        "r",
    ) as f:
        # The file stores the number of the *next* job to be set up, so the
        # current job is one less
        job_num = int(f.readline()) - 1
    print(f"{strftime('%Y-%m-%d %H:%M:%S', gmtime())}\tProcessing job {job_num}")
    return job_num
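
# Example for load_crontab_job_num (hypothetical file contents): if the job
# file holds "3", this run processes job 2, since the file stores the number
# of the *next* job rather than the current one.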

def load_chunk(job_dir, max_chunk_size=14):
    """
    Load a chunk of Twitter IDs from the job's ID file. The start index of
    the current chunk is persisted in a separate file between runs.
    """
    # Load the start index of the current chunk, creating the file on first run
    fname = f"{job_dir}{NEXT_CHUNK_START_FNAME}"
    if not os.path.exists(fname):
        start = 0
        with open(fname, "w") as f:
            f.write(str(start))
    else:
        with open(fname, "r") as f:
            start = int(f.readline())
    # Load the chunk itself
    with open(f"{job_dir}{TWITTER_IDS_FNAME}", "r") as f:
        lines = f.readlines()
    id_list_len = len(lines)
    # Make sure start and end don't exceed the length of the ID list
    start = min(start, id_list_len)
    end = min(start + max_chunk_size, id_list_len)
    chunk = [line.strip() for line in lines[start:end]]
    print(
        f"{strftime('%Y-%m-%d %H:%M:%S', gmtime())}\tLoaded chunk of at most "
        f"{max_chunk_size} IDs, starting at line {start}"
    )
    return chunk
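
# Worked example of the clamping above (hypothetical numbers): with 100 IDs in
# the file and start=95, end = min(95 + 14, 100) = 100, so the final chunk
# holds the last 5 IDs; once start reaches 100 the chunk comes back empty.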

def process_chunk(api, chunk, job_dir, max_chunk_size=14, friends_limit=15000):
    """
    Process a chunk of Twitter IDs: fetch the accounts each ID follows and
    append the resulting edges to the job's edge list.
    """
    failed = []
    # Initialize counters: the total request budget for this run and the
    # number of paginated friends/ids calls still allowed for the current ID
    req_left = max_chunk_size
    req_per_id_left = 3
    cur_cursor = -1
    explored = set()
    with open(f"{job_dir}{EDGELIST_FNAME}", "a") as f:
        writer = csv.writer(f)
        # Keep going while IDs remain and the remaining request budget still
        # covers the calls allowed for the current ID
        while len(chunk) > 0 and (req_left > 2 or req_left >= req_per_id_left):
            user_id = chunk.pop(0)
            print("Selected ID:", user_id)
            res = []
            try:
                # Fetch one page of the accounts this ID follows
                res = api.get_friend_ids(
                    user_id=user_id, stringify_ids=True, cursor=cur_cursor
                )
                print(
                    f"{strftime('%Y-%m-%d %H:%M:%S', gmtime())}\tExtracted {len(res[0])} friends of ID {user_id}"
                )
                sleep(2)
            except Exception as e:
                print("Exception:", repr(str(e)))
                print("Response:", repr(res))
                print(
                    f"{strftime('%Y-%m-%d %H:%M:%S', gmtime())}\tFailed to load friends of ID {user_id}"
                )
                failed.append(
                    {"id_str": user_id, "error_message": str(e).split("\n")[0]}
                )
            req_left -= 1
            # The response is usable only if it contains at least one friend ID
            is_res_valid = False
            try:
                if res[0][0]:
                    is_res_valid = True
            except (IndexError, TypeError):
                pass
            if is_res_valid:
                next_cursor = res[1][1]
                excluded = False
                if next_cursor != 0 and req_per_id_left == 3:
                    # First page of a multi-page following list: check the
                    # account's total friend count and exclude heavy followers
                    user = api.get_user(user_id=user_id)._json
                    sleep(1)
                    print(f"User follows {user['friends_count']} accounts")
                    if user["friends_count"] > friends_limit:
                        failed.append(
                            {
                                "id_str": user_id,
                                "error_message": f"User follows over {friends_limit} accounts",
                            }
                        )
                        print(
                            f"User {user_id} excluded because they follow over {friends_limit} accounts"
                        )
                        # Reset cursor and per-ID counter for the next ID
                        cur_cursor = -1
                        req_per_id_left = 3
                        excluded = True
                if not excluded:
                    if next_cursor != 0:
                        # End of the following list not reached: requeue the
                        # ID and continue from the next cursor
                        cur_cursor = next_cursor
                        chunk.insert(0, user_id)
                        req_per_id_left -= 1
                    else:
                        # Following list exhausted: reset for the next ID
                        cur_cursor = -1
                        req_per_id_left = 3
                    # Append one edge per (user, friend) pair
                    for friend_id in res[0]:
                        writer.writerow([user_id, friend_id])
                    explored.add(user_id)
    # Save failed IDs to file
    pd.DataFrame(failed).to_csv(
        f"{job_dir}{EDGELIST_FAILED_FNAME}", mode="a", header=False, index=False
    )
    print(
        f"Sent {max_chunk_size - req_left} requests to explore "
        f"{len(explored) + len(failed)} users. {len(failed)} users were excluded"
    )
    print(f"{strftime('%Y-%m-%d %H:%M:%S', gmtime())}\tChunk processed")
    return len(explored)
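
# Note on the response shape assumed in process_chunk: when tweepy's cursored
# endpoints such as get_friend_ids are called with an explicit `cursor`
# argument, they return an (ids, (previous_cursor, next_cursor)) pair, which
# is why the code indexes res[0] for the page of friend IDs and res[1][1] for
# the next cursor.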

def move_next_chunk_start(job_dir, move_by):
    """
    Advance the start index of the next chunk by the number of IDs explored.
    """
    fname = f"{job_dir}{NEXT_CHUNK_START_FNAME}"
    with open(fname, "r") as f:
        start = int(f.readline())
    print(f"New chunk start: {start} + {move_by} = {start + move_by}")
    with open(fname, "w") as f:
        f.write(str(start + move_by))

if __name__ == "__main__":
    api = get_twitter_api()
    job_num = load_crontab_job_num()
    job_dir = get_edgelist_job_dir_path(job_num)
    chunk = load_chunk(job_dir, max_chunk_size=14)
    # Advance the chunk start by the number of distinct IDs explored this run
    move_by = process_chunk(api, chunk, job_dir, friends_limit=15000)
    move_next_chunk_start(job_dir, move_by=move_by)
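
# Deployment sketch (hypothetical path and schedule, not part of this repo):
# Twitter API v1.1 limits GET friends/ids to 15 requests per 15-minute window,
# which is consistent with max_chunk_size defaulting to 14 and leaving one
# call spare for the occasional get_user lookup. One run per window stays
# inside the limit:
#
#   */15 * * * * cd /path/to/job && /usr/local/bin/python3 process_chunk.py >> process_chunk.log 2>&1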