-
Notifications
You must be signed in to change notification settings - Fork 7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
handle youtube #110
handle youtube #110
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
from align_data.sources.youtube.youtube import YouTubeChannelDataset, YouTubePlaylistDataset

# (dataset name, channel id, credited authors) for each tracked channel.
# An empty author list means the channel title is used at processing time.
_CHANNELS = [
    ('rob_miles_ai_safety', 'UCLB7AzTwc6VFZrBsO2ucBMg', ['Rob Miles']),
    ('ai_safety_talks', 'UCXowyqjXvFS-tMKF1GwhpkA', ['Evan Hubinger']),
    ('ai_safety_reading_group', 'UC-C23F-9rK2gtRiJZMWsTzQ', []),
    ('ai_tech_tu_delft', 'UCPK-Ell2WYxyfP5UYzRzjAA', []),
    ('ai_explained', 'UCNJ1Ymd5yFuUPtn21xtRbbw', []),
]

# All curated playlists are grouped into a single dataset; duplicates with
# the channel datasets above are deduplicated downstream.
_PLAYLIST_IDS = [
    'PLqYmG7hTraZCRwoyGxvQkqVrZgDQi4m-5',
    'PLqYmG7hTraZBiUr6_Qf8YTS2Oqy3OGZEj',
    'PLAPVC5uNprwY0q4_nyeeHqIT07wZqwjGO',
    'PLCRVRLd2RhZTpdUdEzJjo3qhmX3y3skWA',
    'PLTYHZYmxohXpn5uf8JZ2OouB1PsDJAk-x',
]

YOUTUBE_REGISTRY = [
    YouTubeChannelDataset(name=name, channel_id=channel_id, authors=authors)
    for name, channel_id, authors in _CHANNELS
] + [
    YouTubePlaylistDataset(
        name='ai_alignment_playlist',
        playlist_ids=_PLAYLIST_IDS,
    ),
]
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
import collections | ||
import logging | ||
from dataclasses import dataclass | ||
from typing import List | ||
|
||
from googleapiclient.discovery import build | ||
from youtube_transcript_api import YouTubeTranscriptApi | ||
from youtube_transcript_api._errors import NoTranscriptFound, VideoUnavailable, TranscriptsDisabled | ||
|
||
from align_data.settings import YOUTUBE_API_KEY | ||
from align_data.common.alignment_dataset import AlignmentDataset | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class YouTubeDataset(AlignmentDataset):
    """Base dataset that collects video transcripts from YouTube.

    Subclasses provide ``collection_ids`` (channel or playlist ids) and a
    ``next_page`` implementation that pages through the YouTube Data API;
    this class iterates the resulting videos, fetches English transcripts
    and turns them into data entries.
    """

    done_key = 'url'
    batch_size = 1
    # Authors credited on every entry; when None, the channel title is used.
    authors = None
    # Ids of the channels/playlists to crawl; subclasses override this
    # (as a property) with their configured ids.
    collection_ids = []

    def setup(self):
        """Build the YouTube Data API client.

        Raises:
            ValueError: if no ``YOUTUBE_API_KEY`` is configured.
        """
        super().setup()
        if not YOUTUBE_API_KEY:
            raise ValueError('No YOUTUBE_API_KEY provided!')
        self.youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)

    def next_page(self, collection_id, next_page_token):
        """Return one API response page for `collection_id`.

        Base implementation returns an empty page; subclasses call the
        appropriate API endpoint.
        """
        return {'items': []}

    @staticmethod
    def _get_id(item):
        """Extract the video id from a search result or playlist item.

        Returns None for any other resource kind (e.g. channels returned
        by the search endpoint).
        """
        if item.get('kind') == 'youtube#searchResult':
            resource = item['id']
        elif item.get('kind') == 'youtube#playlistItem':
            resource = item['snippet']['resourceId']
        else:
            return None

        if resource['kind'] == 'youtube#video':
            return resource['videoId']
        return None

    def fetch_videos(self, collection_id):
        """Yield every video item in the collection, following pagination."""
        next_page_token = None
        while True:
            videos_response = self.next_page(collection_id, next_page_token)

            # Default to an empty list: a response without 'items' would
            # previously make the for-loop iterate over None and raise.
            for item in videos_response.get('items', []):
                if self._get_id(item):
                    yield item

            next_page_token = videos_response.get('nextPageToken')
            if not next_page_token:
                return

    @property
    def items_list(self):
        # Lazily chain the videos of all configured collections.
        return (
            video
            for collection_id in self.collection_ids
            for video in self.fetch_videos(collection_id)
        )

    def get_item_key(self, item):
        """Videos are keyed by their canonical watch URL (matches `done_key`)."""
        video_id = self._get_id(item)
        return f'https://www.youtube.com/watch?v={video_id}'

    def _get_contents(self, video):
        """Return the English transcript text for `video`, or None.

        Missing/unavailable transcripts are skipped silently; disabled
        transcripts are logged so they can be revisited later.
        """
        video_id = self._get_id(video)
        try:
            transcript = (
                YouTubeTranscriptApi
                .list_transcripts(video_id)
                .find_transcript(['en', 'en-GB'])
                .fetch()
            )
            return '\n'.join([i['text'] for i in transcript])
        except (NoTranscriptFound, VideoUnavailable):
            return None
        except TranscriptsDisabled:
            logger.error(f'Transcripts disabled for https://www.youtube.com/watch?v={video_id} - skipping')
            return None

    def extract_authors(self, video):
        """Use the configured authors, falling back to the channel title."""
        if self.authors:
            return self.authors
        return [video['snippet']['channelTitle'].strip()]

    def process_entry(self, video):
        """Turn a video item into a data entry; None when no transcript exists."""
        video_url = self.get_item_key(video)
        contents = self._get_contents(video)

        # Videos without a usable transcript are skipped entirely.
        if not contents:
            return None

        return self.make_data_entry({
            "text": contents,
            "url": video_url,
            "title": video['snippet']['title'],
            "source": self.name,
            "source_type": "youtube",
            "date_published": self._get_published_date(video),
            "authors": self.extract_authors(video),
        })
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did you find that we were able to get fairly decent metadata this way? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes. The main missing thing is the authors, as it just does the channel name, but even that should be ok. |
||
|
||
|
||
@dataclass
class YouTubeChannelDataset(YouTubeDataset):
    """Dataset covering every video uploaded to a single YouTube channel."""

    channel_id: str
    authors: List[str]

    @property
    def collection_ids(self):
        # A channel dataset crawls exactly one collection: the channel itself.
        return [self.channel_id]

    def next_page(self, collection_id, next_page_token):
        # Channel videos are paged through the search endpoint, 50 at a time.
        request = self.youtube.search().list(
            part='snippet',
            channelId=collection_id,
            maxResults=50,
            pageToken=next_page_token
        )
        return request.execute()

    def _get_published_date(self, video):
        # Search results expose the publication timestamp as 'publishTime'.
        timestamp = video['snippet']['publishTime']
        return super()._get_published_date(timestamp)
|
||
|
||
@dataclass
class YouTubePlaylistDataset(YouTubeDataset):
    """Dataset covering the videos of one or more YouTube playlists."""

    # Fixed annotation: this field holds a list of playlist ids (see the
    # registry and `collection_ids` below), not a single string.
    playlist_ids: List[str]

    @property
    def collection_ids(self):
        # Each playlist is crawled as its own collection.
        return self.playlist_ids

    def next_page(self, collection_id, next_page_token):
        """Fetch one 50-item page of a playlist via the playlistItems endpoint."""
        return self.youtube.playlistItems().list(
            part='snippet',
            playlistId=collection_id,
            maxResults=50,
            pageToken=next_page_token,
        ).execute()

    def _get_published_date(self, video):
        # Playlist items expose the publication timestamp as 'publishedAt'.
        return super()._get_published_date(video['snippet']['publishedAt'])
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice! Each channel is a dataset and all the playlists are a dataset? So to add an individual video, we add it to a playlist we're already tracking? Might have quite a few duplicates since all of Rob's videos will also be in a playlist too? And are we skipping "Robert Miles 2" channel since it's only a handful of videos and more off topic?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Duplicates are fine - they will be ignored. This is part of the reason why I put all the playlists together :D
That's part of the reason I skipped it - I just wanted the main body of this to be merged, as later additions will be simple