Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write xmls #8

Merged
merged 4 commits into from
Aug 29, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 51 additions & 31 deletions scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def get_authenticated_service():

# Trusted testers can download this discovery document from the developers page;
# it must be placed in the same directory as this code.
with open("youtube-v3-api-captions.json", "r") as f:
with open("youtube-v3-api-captions.json", "r", encoding='utf8') as f:
doc = f.read()
return build_from_document(doc, http=credentials.authorize(httplib2.Http()))

Expand All @@ -70,7 +70,7 @@ def get_upload_playlist_id(youtube, channel_id):
part="contentDetails"
).execute()
upload_playlist_id = results["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
print(upload_playlist_id)
# print(upload_playlist_id)
return upload_playlist_id

def get_playlist_video_id(youtube, playlist_id, **kwargs):
Expand All @@ -84,63 +84,83 @@ def get_playlist_video_id(youtube, playlist_id, **kwargs):
return results


def collect_all_pages():
    """Fetch every page of the upload playlist and merge them into one response.

    Reads the module-level names ``youtube`` and ``upload_playlist_id`` (both
    are assigned by the ``__main__`` section) and calls
    ``get_playlist_video_id`` once per page, following ``nextPageToken`` until
    a page arrives without one.

    Returns:
        The first page's response dict, with the ``"items"`` of every later
        page appended to its ``"items"`` list and any ``"nextPageToken"`` key
        removed.
    """
    playlist_videos = get_playlist_video_id(youtube, upload_playlist_id)

    # A playlist that fits on a single page carries no "nextPageToken" key,
    # so look it up with .get() instead of indexing (which raised KeyError).
    next_page_token = playlist_videos.get("nextPageToken")

    while next_page_token:
        next_page = get_playlist_video_id(
            youtube, upload_playlist_id, pageToken=next_page_token
        )
        # Extend in place rather than rebuilding the whole list per page.
        playlist_videos["items"].extend(next_page["items"])
        next_page_token = next_page.get("nextPageToken")

    # The merged result is complete, so the paging cursor no longer applies.
    playlist_videos.pop("nextPageToken", None)
    return playlist_videos



# Call the API's captions.list method to list the existing caption tracks.
def get_caption_id(youtube, video_id):
    """Return the id of the first caption track listed for ``video_id``.

    Calls ``youtube.captions().list(part="snippet", videoId=video_id)`` and
    reads ``["items"][0]["id"]`` from the executed response.

    Args:
        youtube: API client object exposing ``captions().list(...).execute()``.
        video_id: Id of the video whose caption tracks are listed.

    Returns:
        The first caption track's id, or ``None`` when the response holds no
        usable first item (a message naming the lookup error and the video id
        is printed in that case).
    """
    results = youtube.captions().list(
        part="snippet",
        videoId=video_id
    ).execute()

    # Only the lookup itself is guarded: errors raised by the API call above
    # must propagate to the caller instead of being reported as a missing
    # caption track.
    try:
        return results["items"][0]["id"]
    except (IndexError, KeyError, TypeError) as exc:
        # For an empty "items" list this prints "IndexError <video_id>",
        # exactly as before.
        print("%s %s" % (type(exc).__name__, video_id))
        return None




# Call the API's captions.download method to download an existing caption track.
def download_caption(youtube, caption_id, tfmt):
    """Download one caption track and return the executed response.

    Args:
        youtube: API client object exposing
            ``captions().download(...).execute()``.
        caption_id: Id of the caption track, or ``None`` when the video has
            no track to download.
        tfmt: Caption format passed through to the API as ``tfmt``
            (the caller in this file uses ``'ttml'``).

    Returns:
        Whatever ``execute()`` returns for the download request, or ``None``
        when ``caption_id`` is ``None`` (no API call is made in that case).
    """
    if caption_id is None:
        # Nothing to fetch; never send a request with id=None.
        return None

    return youtube.captions().download(
        id=caption_id,
        tfmt=tfmt
    ).execute()

def write_caption(path_name, object):
    """Write caption bytes to ``path_name`` unless that path already exists.

    Args:
        path_name: Destination file path.
        object: Bytes to write, or ``None`` when there is nothing to save.
            (The name shadows the builtin but is kept so existing keyword
            callers continue to work.)

    Returns:
        None. The function is a no-op when ``object`` is ``None`` or when
        ``path_name`` already exists.
    """
    import os  # local import keeps this block self-contained

    if object is None:
        # Previously the file was opened (and so created empty) before
        # write(None) raised TypeError; the stray empty file then made every
        # later run skip this path.
        return

    parent = os.path.dirname(path_name)
    if parent:
        os.makedirs(parent, exist_ok=True)

    try:
        # "x" mode creates the file exclusively, so the exists-check and the
        # create happen in one step with no race between them.
        with open(path_name, "xb") as f:
            f.write(object)
    except FileExistsError:
        # Already written on an earlier run; keep the existing file.
        pass

if __name__ == "__main__":
    # Script entry point: list every video in the channel's upload playlist
    # and save each video's caption track as ./transcripts/<video_id>.xml.
    #
    # NOTE: `youtube` and `upload_playlist_id` are intentionally module-level
    # names; collect_all_pages() reads them as globals.
    youtube = get_authenticated_service()
    try:
        upload_playlist_id = get_upload_playlist_id(youtube, channel_id)
        pages = collect_all_pages()
        for item in pages["items"]:
            video_id = item["contentDetails"]["videoId"]

            caption_id = get_caption_id(youtube, video_id)
            if caption_id is None:
                # No caption track for this video: skip it so that a single
                # uncaptioned video no longer aborts the whole run through
                # the TypeError handler below.
                continue

            # Format is 'sbv' (plain text) or 'ttml' (XML). 'ttml' is used so
            # the text and the start/end time of each line can be parsed.
            transcript = download_caption(youtube, caption_id, 'ttml')
            write_caption("./transcripts/%s.xml" % video_id, transcript)
    except TypeError:
        print("error")
    except HttpError as e:
        print("An HTTP error %d occurred:\n%s" % (e.resp.status, e.content))
    else:
        print("done")
71 changes: 71 additions & 0 deletions transcripts/-Q9iAXpWxjA.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
<?xml version="1.0" encoding="utf-8" ?>
<tt xml:lang="en" xmlns="http://www.w3.org/ns/ttml" xmlns:ttm="http://www.w3.org/ns/ttml#metadata" xmlns:tts="http://www.w3.org/ns/ttml#styling" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" ttp:profile="http://www.w3.org/TR/profile/sdp-us" >
<head>
<styling>
<style xml:id="s1" tts:textAlign="center" tts:extent="90% 90%" tts:origin="5% 5%" tts:displayAlign="after"/>
<style xml:id="s2" tts:fontSize=".72c" tts:backgroundColor="black" tts:color="white"/>
<style xml:id="s3" tts:color="#E5E5E5"/>
<style xml:id="s4" tts:color="#CCCCCC"/>
</styling>
<layout>
<region xml:id="r1" style="s1"/>
</layout>
</head>
<body region="r1">
<div>
<p begin="00:00:00.000" end="00:00:09.960" style="s2"><span style="s3">I</span> have a night class in graph theory</p>
<p begin="00:00:07.830" end="00:00:12.000" style="s2">that I hate because it&#39;s a night class</p>
<p begin="00:00:09.960" end="00:00:14.610" style="s2">and it&#39;s filled with<span style="s3"> algorithms</span> that are</p>
<p begin="00:00:12.000" end="00:00:17.460" style="s2">hard to pronounce but last week&#39;s class</p>
<p begin="00:00:14.610" end="00:00:19.380" style="s2">was particularly weird after the lecture</p>
<p begin="00:00:17.460" end="00:00:21.539" style="s2">the professor took questions from the</p>
<p begin="00:00:19.380" end="00:00:23.100" style="s2">class that weren&#39;t so much questions as</p>
<p begin="00:00:21.539" end="00:00:24.960" style="s2"><span style="s4">they</span> were<span style="s3"> long-winded commentary</span></p>
<p begin="00:00:23.100" end="00:00:31.679" style="s2">punctuated by question marks because the</p>
<p begin="00:00:24.960" end="00:00:33.390" style="s2">asker had an inflection but then the</p>
<p begin="00:00:31.679" end="00:00:37.050" style="s2">class changed when<span style="s4"> the</span> professor ended</p>
<p begin="00:00:33.390" end="00:00:38.640" style="s2">his lecture and said and now it&#39;s<span style="s3"> time</span></p>
<p begin="00:00:37.050" end="00:00:41.040" style="s2"><span style="s3">for</span><span style="s4"> the</span> civic hacking portion of the</p>
<p begin="00:00:38.640" end="00:00:43.590" style="s2">evening all of a sudden<span style="s4"> the</span> class</p>
<p begin="00:00:41.040" end="00:00:45.719" style="s2">exploded people were by<span style="s3"> white boards and</span></p>
<p begin="00:00:43.590" end="00:00:48.390" style="s2">desks and groups working on problems</p>
<p begin="00:00:45.719" end="00:00:50.430" style="s2">like making it more effective to spread</p>
<p begin="00:00:48.390" end="00:00:53.100" style="s2">resources and support<span style="s4"> to</span><span style="s3"> parts</span> of the</p>
<p begin="00:00:50.430" end="00:00:54.809" style="s2">city and even designing more efficient</p>
<p begin="00:00:53.100" end="00:00:57.390" style="s2">transport so that citizens could have</p>
<p begin="00:00:54.809" end="00:00:59.579" style="s2">better access to<span style="s3"> jobs</span> not only<span style="s3"> were</span> we</p>
<p begin="00:00:57.390" end="00:01:02.309" style="s2">finally learning what Dijkstra&#39;s theorem</p>
<p begin="00:00:59.579" end="00:01:05.369" style="s2">was good for but we had choice over what</p>
<p begin="00:01:02.309" end="00:01:08.070" style="s2">we got to work on<span style="s3"> with</span> a real impact and</p>
<p begin="00:01:05.369" end="00:01:10.470" style="s2">I hadn&#39;t noticed it but<span style="s3"> a</span> few a few</p>
<p begin="00:01:08.070" end="00:01:12.180" style="s2"><span style="s3">people</span> who were psychology and policy</p>
<p begin="00:01:10.470" end="00:01:15.000" style="s2">majors had snuck in helping us build</p>
<p begin="00:01:12.180" end="00:01:17.280" style="s2">maps and CS students don&#39;t like to admit</p>
<p begin="00:01:15.000" end="00:01:18.659" style="s2">it<span style="s4"> but</span> since they were there we felt a</p>
<p begin="00:01:17.280" end="00:01:22.250" style="s2"><span style="s4">little</span> more comfortable looking up<span style="s3"> from</span></p>
<p begin="00:01:18.659" end="00:01:24.960" style="s2">our laptop and talking to each other</p>
<p begin="00:01:22.250" end="00:01:27.060" style="s2">people even stayed afterwards late into</p>
<p begin="00:01:24.960" end="00:01:28.470" style="s2">the night working on problems and that</p>
<p begin="00:01:27.060" end="00:01:32.850" style="s2">may have had<span style="s4"> something</span> to<span style="s3"> do</span> with<span style="s3"> their</span></p>
<p begin="00:01:28.470" end="00:01:34.409" style="s2">being empanadas<span style="s4"> in</span> the room<span style="s3"> but</span> when I</p>
<p begin="00:01:32.850" end="00:01:37.590" style="s2">took a bite of one of those empanadas</p>
<p begin="00:01:34.409" end="00:01:40.530" style="s2">and it tasted just like the brown rice</p>
<p begin="00:01:37.590" end="00:01:42.540" style="s2">from our school&#39;s cafeteria I knew that</p>
<p begin="00:01:40.530" end="00:01:44.740" style="s2"><span style="s4">I</span> had fallen asleep<span style="s3"> in</span> lecture again and</p>
<p begin="00:01:42.540" end="00:01:48.820" style="s2">it all just been<span style="s3"> a</span> dream</p>
<p begin="00:01:44.740" end="00:01:50.200" style="s2">I should have I should have realized</p>
<p begin="00:01:48.820" end="00:01:51.490" style="s2">sooner because in the back of the</p>
<p begin="00:01:50.200" end="00:01:56.649" style="s2">classroom there was a pool with a</p>
<p begin="00:01:51.490" end="00:02:01.409" style="s2"><span style="s3">dolphin</span> in it and unlike my professor</p>
<p begin="00:01:56.649" end="00:02:08.170" style="s2">this dolphin never criticized me so</p>
<p begin="00:02:01.409" end="00:02:11.620" style="s2"><span style="s3">finally</span><span style="s4"> I dolphin table what hit every</span></p>
<p begin="00:02:08.170" end="00:02:16.330" style="s2">class<span style="s4"> and</span> in a series of trips and</p>
<p begin="00:02:11.620" end="00:02:18.340" style="s2"><span style="s4">splitters</span> she told<span style="s3"> me</span> it&#39;s going<span style="s3"> to</span> be a</p>
<p begin="00:02:16.330" end="00:02:20.680" style="s2">long time when<span style="s3"> H</span> before your classroom</p>
<p begin="00:02:18.340" end="00:02:24.730" style="s2">looks like this but your other night</p>
<p begin="00:02:20.680" end="00:02:28.470" style="s2">class on Tuesday at 6pm tells you that</p>
<p begin="00:02:24.730" end="00:02:28.470" style="s2">the city is your<span style="s4"> classroom</span></p>
<p begin="00:02:35.390" end="00:02:37.450" style="s2">you</p>
</div>
</body>
</tt>
Loading