-
Notifications
You must be signed in to change notification settings - Fork 0
/
backup_fireflies_meetings.py
341 lines (295 loc) · 10 KB
/
backup_fireflies_meetings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
# %%
# Visit Fireflies.ai in a logged-in state and open the developer tools in your browser
# then go to https://app.fireflies.ai/notebook/all
# and look at the network tab in the developer tools
# identify the request to the URL https://app.fireflies.ai/api/v2/graphql
# copy the request headers and payload from the browser request, by right clicking on the request
# then select copy as fetch
# extract the headers from the fetch request
# or use the official Fireflies API and get an API key to access it,
# but note that it may have rate limits that prevent you from downloading all your data at once
# Copy the headers from the browser request here:
# Shared by every requests.post / requests.get call in this script.
# It is empty as committed; fill it with the headers copied from the browser.
headers = {}
# Ensure you have the requests library installed by running `pip install requests`,
# then run this script like so: `python backup_fireflies_meetings.py`
import os
import random
from time import sleep
import requests
# GraphQL endpoint that every POST request in this script is sent to.
url = "https://app.fireflies.ai/api/v2/graphql"
# JSON body for the meeting-list request (operation "fetchChannelMeetings").
# The query declares an $isPrivate variable, but "variables" supplies no value
# for it. The query string is sent verbatim, so its text must not be edited
# casually.
get_meeting_graphql_payload = {
    "operationName": "fetchChannelMeetings",
    # A single request is made with these values; presumably "from" is the
    # starting offset and "size" the page size -- TODO confirm against the API.
    "variables": {"channelId": "all", "from": 0, "size": 400, "search": ""},
    "query": """query fetchChannelMeetings($from: Int!, $size: Int!, $channelId: String!, $search: String, $isPrivate: Boolean) {
getChannelMeetings(
from: $from
size: $size
channelId: $channelId
search: $search
isPrivate: $isPrivate
) {
total
meetings {
parseId
hasCaptions
date
owner
title
createdAt
channelIds
creator_email
creatorProfile {
picture
name
lastName
firstName
email
__typename
}
privacy
id
users
validAttendees
durationMins
duration
allEmails
audioOnly
processMeetingStatus
audioServiceMetadata {
apiSource
silentMeeting
__typename
}
audio_url
addedBy
meetingExpirySettings {
meetingTtl
unit
deletionTime
__typename
}
__typename
}
__typename
}
}""",
}
# GraphQL document for the per-meeting request (operation "fetchNotepadMeeting").
# It takes one required variable, $meetingNoteId, and selects the captions,
# speakerMeta and audio_url fields that the functions below read, among others.
# NOTE: the CoreAskFredFields fragment selects `rating` and `answer` twice; the
# string is kept exactly as captured because it is sent verbatim to the server.
fetch_meeting_notes_graphql_query_string = """
fragment CoreAskFredFields on FredResponse {
id
question
answer
rating
status
rating
meetingId
answer
privacy
source
__typename
}
query fetchNotepadMeeting($meetingNoteId: String!) {
meetingNote(_id: $meetingNoteId) {
_id
transcriptParseId
captions {
index
sentence
speaker_id
time
endTime
match
metrics {
word
category
__typename
}
sentiment
sentimentType
filterType
__typename
}
attendees {
email
displayName
__typename
}
title
audio_url
date
parseId
processMeetingStatus
createdAt
hasCaptions
creator_email
waveformData
userPermission
privacy
owner
audioOnly
speakerMeta
sentenceMeta
paragraphMeta
isGuestAccessEnabled
labelMeta
video_url
txService
defaultChannelAccess
audioServiceMetadata {
silentMeeting
languageCode
preferredLanguage
__typename
}
addedBy
allEmails
fredQA {
...CoreAskFredFields
__typename
}
durationMins
meetingExpirySettings {
meetingTtl
unit
deletionTime
__typename
}
summaryStatus
__typename
}
}"""
def random_sleep(min_seconds=0.2, max_seconds=3.0):
    """Pause for a random duration between two bounds.

    It's polite to sleep for a random amount of time between requests to
    avoid overloading the server.

    Args:
        min_seconds: Lower bound of the pause, in seconds.
        max_seconds: Upper bound of the pause, in seconds.

    The defaults reproduce the original fixed 0.2-3.0 second range, so
    existing ``random_sleep()`` calls behave as before.
    """
    sleep(random.uniform(min_seconds, max_seconds))
def get_meetings(timeout=60):
    """Fetch the meeting list and return a summary dict per meeting.

    Posts ``get_meeting_graphql_payload`` to ``url`` with the module-level
    ``headers``.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A list of dicts with the keys ``"title"``, ``"date"`` and ``"id"``,
        in the order the server returned the meetings.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    response = requests.post(
        url, headers=headers, json=get_meeting_graphql_payload, timeout=timeout
    )
    response.raise_for_status()
    channel_meetings = response.json()["data"]["getChannelMeetings"]
    fetched = channel_meetings["meetings"]

    # Only one request is made, so anything beyond the requested "size" is
    # missing from the result. Say so instead of silently producing an
    # incomplete backup.
    total = channel_meetings.get("total")
    if isinstance(total, int) and total > len(fetched):
        print(
            f"Warning: the server reports {total} meetings but only "
            f"{len(fetched)} were returned; the backup will be incomplete."
        )

    return [
        {"title": item["title"], "date": item["date"], "id": item["id"]}
        for item in fetched
    ]
# Fetch the meeting list once at module level; the later cells in this file
# iterate over this same `meetings` list.
meetings = get_meetings()
# Print each summary dict (title, date, id) so the result can be inspected.
for meeting in meetings:
    print(meeting)
# %%
def get_meeting_transcript(meeting_note_id, timeout=60):
    """Fetch one meeting note, save its raw JSON, and return a text transcript.

    The raw response body is written to ``<meeting_note_id>.json`` in the
    current working directory. The transcript has one line per caption, in
    the form ``<time>: <speaker> said: "<sentence>"``.

    Args:
        meeting_note_id: Meeting id, sent as the ``meetingNoteId`` variable.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The transcript as a single newline-joined string. It is empty when
        the response contains no captions.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    get_meeting_notes_graphql_query_payload = {
        "operationName": "fetchNotepadMeeting",
        "variables": {"meetingNoteId": meeting_note_id},
        "query": fetch_meeting_notes_graphql_query_string,
    }
    response = requests.post(
        url,
        headers=headers,
        json=get_meeting_notes_graphql_query_payload,
        timeout=timeout,
    )
    response.raise_for_status()
    data = response.json()

    # Save the JSON response to a file named by the meeting_note_id. The
    # encoding is explicit so non-ASCII names and sentences do not fail on
    # platforms whose default encoding is not UTF-8.
    final_filename = f"{meeting_note_id}.json"
    with open(final_filename, "w", encoding="utf-8") as f:
        f.write(response.text)

    meeting_note = data["data"]["meetingNote"]
    # Either field may come back as null (presumably for meetings without
    # captions -- TODO confirm); treat that as "nothing to render" rather
    # than crashing.
    captions = meeting_note["captions"] or []
    speakers = meeting_note["speakerMeta"] or {}

    transcript_list = []
    for caption in captions:
        raw_speaker_id = caption["speaker_id"]
        if raw_speaker_id is None:
            speaker_name = "Unknown Speaker"
        else:
            # The speaker ID is 0 based in captions, but 1 based in
            # speakerMeta, so shift by one before the lookup.
            speaker_name = speakers.get(str(raw_speaker_id + 1), "Unknown Speaker")
        timestamp = caption["time"]
        sentence = caption["sentence"]
        transcript_list.append(f'{timestamp}: {speaker_name} said: "{sentence}"')
    return "\n".join(transcript_list)
# %%
# Test that we can get a transcript. The check is skipped when no meetings were
# returned, because indexing an empty list would raise IndexError.
if meetings:
    print(get_meeting_transcript(meetings[0]["id"]))
else:
    print("No meetings found; skipping the transcript test.")
# %%
# Download every transcript, keep it on the meeting dict, and save it as .txt.
for meeting in meetings:
    print(meeting["date"], meeting["title"])
    transcript = get_meeting_transcript(meeting["id"])
    print(transcript)
    meeting["transcript"] = transcript
    # Also save a .txt file with the transcript, using the meeting date, title
    # and id in the filename.
    filename = f"{meeting['date']}_{meeting['title']}_{meeting['id']}"
    # Only allow alphanumeric characters, spaces and underscores in the
    # filename, because other characters cause issues when saving files.
    safe_filename = "".join(c if c.isalnum() or c in " _" else "_" for c in filename)
    safe_filename += ".txt"
    # Explicit UTF-8: transcripts can contain characters that the platform's
    # default encoding cannot represent, which would abort the whole backup.
    with open(safe_filename, "w", encoding="utf-8") as f:
        f.write(transcript)
    random_sleep()
# %%
# This potentially more efficient approach did not work: the MP3 URL paths it returned were observed to differ from those in the per-meeting JSON response, and requesting them failed with a 403 (the MP3 URLs were not accessible):
def get_meeting_audio_urls():
    """Request id, date and audio_url for meetings via the list query.

    Sends one "fetchChannelMeetings" request to ``url`` with the module-level
    ``headers`` and returns a list of dicts with the keys ``"id"``,
    ``"date"`` and ``"audio_url"``, in the order the server returned them.
    """
    # The query text is sent verbatim, so it is kept exactly as written.
    list_query = """query fetchChannelMeetings($from: Int!, $size: Int!, $channelId: String!, $search: String, $isPrivate: Boolean) {
getChannelMeetings(
from: $from
size: $size
channelId: $channelId
search: $search
isPrivate: $isPrivate
) {
total
meetings {
id
date
audio_url
}
}
}"""
    query_variables = {"channelId": "all", "from": 0, "size": 200, "search": ""}
    request_body = {
        "operationName": "fetchChannelMeetings",
        "variables": query_variables,
        "query": list_query,
    }

    reply = requests.post(url, headers=headers, json=request_body)
    reply.raise_for_status()
    listed = reply.json()["data"]["getChannelMeetings"]["meetings"]

    wanted_keys = ("id", "date", "audio_url")
    return [{key: entry[key] for key in wanted_keys} for entry in listed]
# %%
# This was repurposed from the get_meeting_transcript function to get the audio url, because that did provide working audio URLs:
def get_meeting_audio_url(meeting_note_id, timeout=60):
    """Return the ``audio_url`` of a single meeting note.

    Sends the same "fetchNotepadMeeting" request that the transcript download
    uses and reads only the ``audio_url`` field from the response.

    Args:
        meeting_note_id: Meeting id, sent as the ``meetingNoteId`` variable.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The value of ``data.meetingNote.audio_url`` from the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    payload = {
        "operationName": "fetchNotepadMeeting",
        "variables": {"meetingNoteId": meeting_note_id},
        "query": fetch_meeting_notes_graphql_query_string,
    }
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()["data"]["meetingNote"]["audio_url"]
# Download the audio of every meeting that has not been saved yet.
for meeting in meetings:
    # Make a filename string for the audio file, using the meeting date and id
    # from the meeting object.
    audio_filename = f"{meeting['date']}_{meeting['id']}"
    # Certain characters are not allowed in filenames, so we replace them with
    # underscores:
    safe_audio_filename = "".join(
        c if c.isalnum() or c in " _" else "_" for c in audio_filename
    )
    safe_audio_filename += ".mp3"

    # An existing file marks the meeting as already downloaded.
    if os.path.exists(safe_audio_filename):
        continue

    print(meeting["date"], meeting["title"])
    audio_url = get_meeting_audio_url(meeting["id"])
    print(audio_url)
    meeting["audio_url"] = audio_url

    if not audio_url:
        # Nothing to download; requests.get would fail on an empty URL.
        print("No audio URL for this meeting; skipping.")
        random_sleep()
        continue

    # Download BEFORE creating the target file. Opening the file first left an
    # empty .mp3 behind whenever the download failed, and the existence check
    # above then skipped that meeting on every later run.
    # NOTE(review): the same `headers` used for the API are sent to the audio
    # URL's host -- confirm that host is meant to receive them.
    audio_response = requests.get(audio_url, headers=headers, timeout=60)
    audio_response.raise_for_status()

    # Write to a temporary name and rename at the end, so an interrupted write
    # never leaves a truncated file under the final name.
    partial_filename = safe_audio_filename + ".part"
    with open(partial_filename, "wb") as f:
        f.write(audio_response.content)
    os.replace(partial_filename, safe_audio_filename)

    # Sleep only after real network activity; skipped files cost nothing.
    random_sleep()