This repository has been archived by the owner on Jul 12, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtwitter_stream.py
143 lines (107 loc) · 4.37 KB
/
twitter_stream.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python
# Copyright 2020 @TwitterDev
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modifications copyright (C) 2022 @morsapaes
import logging
import requests
import json
import os
from kafka import KafkaProducer
# Producer for the Kafka-compatible Redpanda broker that the stream feeds.
p = KafkaProducer(bootstrap_servers='redpanda:9092')
# Twitter API v2 bearer token; raises KeyError at import time if BEARER_TOKEN is unset.
bearer_token= os.environ['BEARER_TOKEN']
def bearer_oauth(r):
    """Request hook that stamps bearer-token auth headers onto *r*.

    Passed as ``auth=`` to requests calls; returns the mutated request.
    """
    r.headers.update({
        "Authorization": f"Bearer {bearer_token}",
        "User-Agent": "v2FilteredStreamPython",
    })
    return r
def get_rules():
    """Fetch the currently active filtered-stream rules from the Twitter API.

    Returns:
        dict: Decoded JSON rules payload (may contain a "data" list of rules).

    Raises:
        Exception: If the API responds with a non-200 status code.
    """
    response = requests.get(
        "https://api.twitter.com/2/tweets/search/stream/rules", auth=bearer_oauth
    )
    if response.status_code != 200:
        raise Exception(
            "Cannot get rules (HTTP {}): {}".format(response.status_code, response.text)
        )
    # Parse the body once instead of calling response.json() twice
    # (once for logging, once for the return value).
    rules = response.json()
    logging.info(json.dumps(rules))
    return rules
def delete_rules(r_get):
    """Delete every stream rule listed in *r_get*.

    *r_get* is the payload returned by ``get_rules``. When it is None or
    carries no "data" entry there is nothing to delete, so the function
    returns None without contacting the API.

    Raises:
        Exception: If the delete request returns a non-200 status code.
    """
    if r_get is None or "data" not in r_get:
        return None
    rule_ids = [rule["id"] for rule in r_get["data"]]
    response = requests.post(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        auth=bearer_oauth,
        json={"delete": {"ids": rule_ids}},
    )
    if response.status_code != 200:
        raise Exception(
            "Cannot delete rules (HTTP {}): {}".format(
                response.status_code, response.text
            )
        )
    logging.info(json.dumps(response.json()))
def set_rules(r_delete):
    """Register the stream rule matching Data Council content.

    The rule matches mentions of @DataCouncilAI or the phrase "Data Council"
    while excluding retweets (tweets, quote tweets and replies still match).

    Raises:
        Exception: If the add-rules request returns a status other than 201.
    """
    new_rules = [
        {"value": "(@DataCouncilAI OR \"Data Council\") -is:retweet", "tag": "Tweets about Data Council Austin 2022"}
    ]
    response = requests.post(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        auth=bearer_oauth,
        json={"add": new_rules},
    )
    if response.status_code != 201:
        raise Exception(
            "Cannot add rules (HTTP {}): {}".format(response.status_code, response.text)
        )
    logging.info(json.dumps(response.json()))
def get_stream(r_filter):
    """Consume the filtered stream and fan records out to Redpanda topics.

    Tweet objects go to ``dc_tweets``; expanded user objects go to
    ``dc_users`` and place objects to ``dc_places``, each keyed by its id.

    Raises:
        Exception: If the stream endpoint responds with a non-200 status.
    """
    response = requests.get(
        "https://api.twitter.com/2/tweets/search/stream",
        auth=bearer_oauth,
        stream=True,
        params={'expansions': 'author_id,geo.place_id',
                'tweet.fields': 'author_id,created_at,in_reply_to_user_id,geo,attachments,referenced_tweets',
                'place.fields': 'name,place_type',
                'user.fields': 'location'}
    )
    logging.info(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Cannot get stream (HTTP {}): {}".format(
                response.status_code, response.text
            )
        )
    for response_line in response.iter_lines():
        if not response_line:
            # Keep-alive heartbeat (blank line); nothing to process.
            continue
        json_response = json.loads(response_line)
        # The stream also delivers error/operational payloads that carry no
        # 'data' key. The original code raised KeyError on those, which is
        # NOT caught by the ChunkedEncodingError restart loop and would kill
        # the whole process — log and skip instead.
        if 'data' not in json_response:
            logging.warning("Skipping stream payload without 'data': %s", json_response)
            continue
        p.send(topic='dc_tweets', value=json.dumps(json_response['data'], ensure_ascii=False).encode('utf-8'))
        # 'includes' (and its sub-keys) are only present when expansions
        # resolved; default to empty rather than indexing blindly.
        includes = json_response.get('includes', {})
        for usr in includes.get('users', []):
            p.send(topic='dc_users', key=json.dumps(usr['id']).encode('utf8'), value=json.dumps(usr, ensure_ascii=False).encode('utf-8'))
        for pl in includes.get('places', []):
            p.send(topic='dc_places', key=json.dumps(pl['id']).encode('utf8'), value=json.dumps(pl, ensure_ascii=False).encode('utf-8'))
        p.flush()
def main():
    """Run one full cycle: read, clear and re-register stream rules, then consume."""
    existing = get_rules()
    cleared = delete_rules(existing)
    get_stream(set_rules(cleared))
if __name__ == "__main__":
    # Append all log output (DEBUG and up) to ./logfile.
    logging.basicConfig(level=logging.DEBUG, filename="logfile", filemode="a+",
                        format="%(asctime)-15s %(levelname)-8s %(message)s")
    # Reconnect forever: long-lived streaming connections periodically drop
    # with a ChunkedEncodingError. Any other exception terminates the process.
    while True:
        try:
            main()
        except requests.exceptions.ChunkedEncodingError:
            print('restarting')