handle youtube #110

Merged - 1 commit - Aug 4, 2023
10 changes: 10 additions & 0 deletions .github/workflows/fetch-dataset.yml
@@ -9,6 +9,9 @@ on:
coda_token:
type: string
required: true
youtube_api_key:
type: string
required: true
db_user:
type: string
required: true
@@ -29,6 +32,11 @@ on:
- aipulse
- aisafety.camp
- aisafety.info
- ai_alignment_playlist
- ai_explained
- ai_safety_talks
- ai_safety_reading_group
- ai_tech_tu_delft
- alignmentforum
- alignment_newsletter
- arbital
@@ -56,6 +64,7 @@ on:
- openai.research
- pdfs
- reports
- rob_miles_ai_safety
- vkrakovna_blog
- yudkowsky_blog

@@ -78,6 +87,7 @@ jobs:
- name: Generate dataset file
env:
CODA_TOKEN: ${{ secrets.CODA_TOKEN || inputs.coda_token }}
YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY || inputs.youtube_api_key }}
ARD_DB_USER: ${{ secrets.ARD_DB_USER || inputs.db_user }}
ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD || inputs.db_password }}
ARD_DB_HOST: ${{ secrets.ARD_DB_HOST || inputs.db_host }}
7 changes: 7 additions & 0 deletions .github/workflows/fetch-weekly.yml
@@ -13,6 +13,11 @@ jobs:
- aiimpacts
- aipulse
- aisafety.camp
- ai_alignment_playlist
- ai_explained
- ai_safety_talks
- ai_safety_reading_group
- ai_tech_tu_delft
- alignment_newsletter
- arbital
- arxiv
@@ -39,13 +44,15 @@
- openai.research
- pdfs
- reports
- rob_miles_ai_safety
- vkrakovna_blog
- yudkowsky_blog

uses: ./.github/workflows/fetch-dataset.yml
with:
datasource: ${{ matrix.datasource }}
coda_token: ${{ inputs.coda_token }}
youtube_api_key: ${{ inputs.youtube_api_key }}
db_user: ${{ inputs.db_user }}
db_password: ${{ inputs.db_password }}
db_host: ${{ inputs.db_host }}
7 changes: 7 additions & 0 deletions .github/workflows/push-datasets.yml
@@ -14,6 +14,11 @@ on:
- aipulse
- aisafety.camp
- aisafety.info
- ai_alignment_playlist
- ai_explained
- ai_safety_talks
- ai_safety_reading_group
- ai_tech_tu_delft
- alignmentforum
- alignment_newsletter
- arbital
@@ -42,6 +47,7 @@ on:
- openai.research
- pdfs
- reports
- rob_miles_ai_safety
- vkrakovna_blog
- yudkowsky_blog

@@ -64,6 +70,7 @@ jobs:
- name: Generate dataset file
env:
CODA_TOKEN: ${{ secrets.CODA_TOKEN }}
YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
ARD_DB_USER: ${{ secrets.ARD_DB_USER }}
ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD }}
ARD_DB_HOST: ${{ secrets.ARD_DB_HOST }}
2 changes: 2 additions & 0 deletions align_data/__init__.py
@@ -8,6 +8,7 @@
import align_data.sources.stampy as stampy
import align_data.sources.alignment_newsletter as alignment_newsletter
import align_data.sources.distill as distill
import align_data.sources.youtube as youtube

DATASET_REGISTRY = (
arbital.ARBITAL_REGISTRY
@@ -20,6 +21,7 @@
+ stampy.STAMPY_REGISTRY
+ distill.DISTILL_REGISTRY
+ alignment_newsletter.ALIGNMENT_NEWSLETTER_REGISTRY
+ youtube.YOUTUBE_REGISTRY
)

ALL_DATASETS = sorted([dataset.name for dataset in DATASET_REGISTRY])
7 changes: 5 additions & 2 deletions align_data/common/alignment_dataset.py
@@ -154,9 +154,12 @@ def _load_outputted_items(self):
"""Load the output file (if it exists) in order to know which items have already been output."""
with make_session() as session:
if hasattr(Article, self.done_key):
return set(session.scalars(select(getattr(Article, self.done_key)).where(Article.source==self.name)).all())
# This doesn't filter by self.name. The good thing about that is that it should handle a lot more
# duplicates. The bad thing is that this could potentially return a massive amount of data if there
# are lots of items.
return set(session.scalars(select(getattr(Article, self.done_key))).all())
# TODO: Properly handle this - it should create a proper SQL JSON select
return {item.get(self.done_key) for item in session.scalars(select(Article.meta).where(Article.source==self.name)).all()}
return {item.get(self.done_key) for item in session.scalars(select(Article.meta)).all()}

def unprocessed_items(self, items=None):
"""Return a list of all items to be processed.
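The TODO above calls for a proper SQL JSON select, i.e. pulling the done_key field out of the JSON column in the database instead of loading every meta blob into Python. A rough, untested sketch of one way that could look with SQLAlchemy's JSON operators (assuming Article.meta is a JSON column on a backend with JSON path support, such as MySQL):

    # Untested sketch for the TODO above: extract the done_key value in SQL
    # rather than materialising every meta dict in Python.
    query = select(Article.meta[self.done_key].as_string())
    return set(session.scalars(query).all())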
5 changes: 4 additions & 1 deletion align_data/settings.py
@@ -15,6 +15,9 @@
METADATA_SOURCE_SHEET = os.environ.get('METADATA_SOURCE_SHEET', 'special_docs.csv')
METADATA_OUTPUT_SPREADSHEET = os.environ.get('METADATA_OUTPUT_SPREADSHEET', '1l3azVJVukGAvZPgg0GyeqiaQe8bEMZvycBJaA8cRXf4')

### YouTube ###
YOUTUBE_API_KEY = os.environ.get('YOUTUBE_API_KEY')

### MYSQL ###
user = os.environ.get('ARD_DB_USER', 'user')
password = os.environ.get('ARD_DB_PASSWORD', 'we all live in a yellow submarine')
@@ -46,4 +49,4 @@

### MISCELLANEOUS ###
CHUNK_SIZE = 1750
MAX_NUM_AUTHORS_IN_SIGNATURE = 3
MAX_NUM_AUTHORS_IN_SIGNATURE = 3
39 changes: 39 additions & 0 deletions align_data/sources/youtube/__init__.py
@@ -0,0 +1,39 @@
from align_data.sources.youtube.youtube import YouTubeChannelDataset, YouTubePlaylistDataset

YOUTUBE_REGISTRY = [
YouTubeChannelDataset(
name='rob_miles_ai_safety',
channel_id='UCLB7AzTwc6VFZrBsO2ucBMg',
authors=['Rob Miles'],
),
YouTubeChannelDataset(
name='ai_safety_talks',
channel_id='UCXowyqjXvFS-tMKF1GwhpkA',
authors=['Evan Hubinger'],
),
YouTubeChannelDataset(
name='ai_safety_reading_group',
channel_id='UC-C23F-9rK2gtRiJZMWsTzQ',
authors=[],
),
YouTubeChannelDataset(
name='ai_tech_tu_delft',
channel_id='UCPK-Ell2WYxyfP5UYzRzjAA',
authors=[],
),
YouTubeChannelDataset(
name='ai_explained',
channel_id='UCNJ1Ymd5yFuUPtn21xtRbbw',
authors=[],
),
YouTubePlaylistDataset(
name='ai_alignment_playlist',
playlist_ids=[
'PLqYmG7hTraZCRwoyGxvQkqVrZgDQi4m-5',
'PLqYmG7hTraZBiUr6_Qf8YTS2Oqy3OGZEj',
'PLAPVC5uNprwY0q4_nyeeHqIT07wZqwjGO',
'PLCRVRLd2RhZTpdUdEzJjo3qhmX3y3skWA',
'PLTYHZYmxohXpn5uf8JZ2OouB1PsDJAk-x',
]
Collaborator:
Nice! Each channel is a dataset and all the playlists are a dataset? So to add an individual video, we add it to a playlist we're already tracking? Might have quite a few duplicates since all of Rob's videos will also be in a playlist too? And are we skipping "Robert Miles 2" channel since it's only a handful of videos and more off topic?

Collaborator (Author):
Duplicates are fine - they will be ignored. This is part of the reason why I put all the playlists together :D
That's also part of the reason I skipped "Robert Miles 2" - I just wanted the main body of this to be merged, as later additions will be simple

),
]
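As discussed in the thread above, overlap between the channel datasets and the playlist dataset is expected to be harmless: done_key is the video url, so anything already written to the database should be skipped on later runs. A minimal sketch of that assumed behaviour, where dataset stands for any entry in the registry above:

    # Assumed dedup behaviour: a video is skipped when its url (the done_key)
    # is already among the previously written items.
    seen_urls = dataset._load_outputted_items()
    fresh = (video for video in dataset.items_list
             if dataset.get_item_key(video) not in seen_urls)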
145 changes: 145 additions & 0 deletions align_data/sources/youtube/youtube.py
@@ -0,0 +1,145 @@
import collections
import logging
from dataclasses import dataclass
from typing import List

from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import NoTranscriptFound, VideoUnavailable, TranscriptsDisabled

from align_data.settings import YOUTUBE_API_KEY
from align_data.common.alignment_dataset import AlignmentDataset


logger = logging.getLogger(__name__)


class YouTubeDataset(AlignmentDataset):

done_key = 'url'
batch_size = 1
# COOLDOWN = 2
authors = None
collection_ids = []

def setup(self):
super().setup()
if not YOUTUBE_API_KEY:
raise ValueError('No YOUTUBE_API_KEY provided!')
self.youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)

def next_page(self, collection_id, next_page_token):
return {'items': []}

@staticmethod
def _get_id(item):
if item.get('kind') == 'youtube#searchResult':
resource = item['id']
elif item.get('kind') == 'youtube#playlistItem':
resource = item['snippet']['resourceId']
else:
return None

if resource['kind'] == 'youtube#video':
return resource['videoId']

def fetch_videos(self, collection_id):
next_page_token = None
while True:
videos_response = self.next_page(collection_id, next_page_token)

for item in videos_response.get('items'):
if self._get_id(item):
yield item

next_page_token = videos_response.get('nextPageToken')
if not next_page_token:
return

@property
def items_list(self):
return (
video
for collection_id in self.collection_ids
for video in self.fetch_videos(collection_id)
)

def get_item_key(self, item):
video_id = self._get_id(item)
return f'https://www.youtube.com/watch?v={video_id}'

def _get_contents(self, video):
video_id = self._get_id(video)
try:
transcript = YouTubeTranscriptApi.list_transcripts(video_id).find_transcript(['en', 'en-GB']).fetch()
return '\n'.join([i['text'] for i in transcript])
except (NoTranscriptFound, VideoUnavailable):
return None
except TranscriptsDisabled:
logger.error(f'Transcripts disabled for https://www.youtube.com/watch?v={video_id} - skipping')
return None
Collaborator:
Were there many that had transcriptions disabled or unavailable? Would something like this help? https://huggingface.co/spaces/SteveDigital/free-fast-youtube-url-video-to-text-using-openai-whisper

Collaborator (Author):

yes. Or some kind of alternative. Issue added - #112
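For reference, a rough sketch of the kind of Whisper fallback discussed above (not part of this PR; tracked in #112). It assumes the yt-dlp and openai-whisper packages; the function name and options are illustrative only:

    import whisper
    from yt_dlp import YoutubeDL

    def transcribe_with_whisper(video_url, model_name='base'):
        # Download the audio track, then run a local Whisper model over it.
        audio_path = 'audio.m4a'
        with YoutubeDL({'format': 'bestaudio[ext=m4a]', 'outtmpl': audio_path}) as ydl:
            ydl.download([video_url])
        model = whisper.load_model(model_name)
        return model.transcribe(audio_path)['text']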


def extract_authors(self, video):
if self.authors:
return self.authors
return [video['snippet']['channelTitle'].strip()]

def process_entry(self, video):
video_url = self.get_item_key(video)
contents = self._get_contents(video)

if not contents:
return None

return self.make_data_entry({
"text": contents,
"url": video_url,
"title": video['snippet']['title'],
"source": self.name,
"source_type": "youtube",
"date_published": self._get_published_date(video),
"authors": self.extract_authors(video),
})
Collaborator:

Did you find that we were able to get fairly decent metadata this way?

Collaborator (Author):

yes. The main missing thing is the authors, as it just does the channel name, but even that should be ok.
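For context on the metadata question above, everything process_entry needs comes from the item's snippet. An illustrative (abridged, placeholder values) search result item looks roughly like:

    item = {
        'kind': 'youtube#searchResult',
        'id': {'kind': 'youtube#video', 'videoId': '<video id>'},
        'snippet': {
            'title': '<video title>',
            'channelTitle': '<channel name>',  # used as the fallback author
            'publishTime': '2023-01-01T00:00:00Z',
        },
    }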



@dataclass
class YouTubeChannelDataset(YouTubeDataset):

channel_id: str
authors: List[str]

@property
def collection_ids(self):
return [self.channel_id]

def next_page(self, collection_id, next_page_token):
return self.youtube.search().list(
part='snippet',
channelId=collection_id,
maxResults=50,
pageToken=next_page_token
).execute()

def _get_published_date(self, video):
return super()._get_published_date(video['snippet']['publishTime'])


@dataclass
class YouTubePlaylistDataset(YouTubeDataset):

playlist_ids: List[str]

@property
def collection_ids(self):
return self.playlist_ids

def next_page(self, collection_id, next_page_token):
return self.youtube.playlistItems().list(
part='snippet',
playlistId=collection_id,
maxResults=50,
pageToken=next_page_token,
).execute()

def _get_published_date(self, video):
return super()._get_published_date(video['snippet']['publishedAt'])
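A minimal end-to-end sketch of how the channel datasets drive the API, mirroring the pagination and transcript logic in this file without the dataset machinery. It assumes a valid YOUTUBE_API_KEY; the channel id is Rob Miles' from the registry above:

    from googleapiclient.discovery import build
    from youtube_transcript_api import YouTubeTranscriptApi

    from align_data.settings import YOUTUBE_API_KEY

    youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
    page_token = None
    while True:
        response = youtube.search().list(
            part='snippet',
            channelId='UCLB7AzTwc6VFZrBsO2ucBMg',
            maxResults=50,
            pageToken=page_token,
        ).execute()
        for item in response.get('items', []):
            if item.get('kind') == 'youtube#searchResult' and item['id']['kind'] == 'youtube#video':
                video_id = item['id']['videoId']
                # Transcript lookup can raise NoTranscriptFound etc., handled above in _get_contents.
                transcript = YouTubeTranscriptApi.list_transcripts(video_id).find_transcript(['en', 'en-GB']).fetch()
                print(f'https://www.youtube.com/watch?v={video_id}', len(transcript))
        page_token = response.get('nextPageToken')
        if not page_token:
            break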
1 change: 1 addition & 0 deletions requirements.txt
@@ -27,6 +27,7 @@ google-auth-oauthlib
google-auth-httplib2
google-api-python-client
gspread
youtube-transcript-api

alembic
mysqlclient