From 42e75f089f6b6b6198912c8d7fc8ac0353559c9b Mon Sep 17 00:00:00 2001 From: Daniel O'Connell Date: Tue, 22 Aug 2023 23:51:11 +0200 Subject: [PATCH] Bunch up blogs, special_docs and youtube (#147) * Bunch up blogs, special_docs and youtube * update readme to match bunched datasets --------- Co-authored-by: ccstan99 --- README.md | 87 +++++++++++++------------ align_data/common/alignment_dataset.py | 37 +++++++++++ align_data/sources/articles/__init__.py | 9 ++- align_data/sources/articles/datasets.py | 4 +- align_data/sources/blogs/__init__.py | 8 ++- align_data/sources/youtube/__init__.py | 8 ++- align_data/sources/youtube/youtube.py | 1 - 7 files changed, 107 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index 5afbcf19..ec3927ab 100644 --- a/README.md +++ b/README.md @@ -4,63 +4,68 @@ The AI Alignment Research Dataset is a collection of documents related to AI Ali ## Sources -The following list of sources may change and items may be renamed: - -- [agentmodels](https://agentmodels.org/) -- [aiimpacts](https://aiimpacts.org/) -- [aisafety.camp](https://aisafety.camp/) -- [aisafety.info](https://aisafety.info/) -- [ai_alignment_playlist]() -- [ai_explained](https://www.youtube.com/@ai-explained-) -- [ai_safety_talks](https://www.youtube.com/@aisafetytalks) -- [ai_safety_reading_group](https://www.youtube.com/@aisafetyreadinggroup/videos) -- [ai_tech_tu_delft](https://www.youtube.com/@AiTechTUDelft/) +Here is the list of sources along with sample contents: +
+- [agentmodel](https://agentmodels.org/) +- [aisafety.info](https://aisafety.info/) - Stampy's FAQ - [alignmentforum](https://www.alignmentforum.org) - [alignment_newsletter](https://rohinshah.com/alignment-newsletter/) - [arbital](https://arbital.com/) -- arxiv - alignment research papers from [arxiv](https://arxiv.org/) -- [carado.moe](https://carado.moe/) -- [cold_takes](https://www.cold-takes.com/) -- [deepmind_blog](https://deepmindsafetyresearch.medium.com/) -- 
[deepmind_technical_blog](https://www.deepmind.com/blog-categories/technical-blogs) +- [arxiv](https://arxiv.org/) - relevant research papers + +- blogs - entire websites automatically scraped + - [AI Impacts](https://aiimpacts.org/) + - [AI Safety Camp](https://aisafety.camp/) + - [carado.moe](https://carado.moe/) + - [Cold Takes](https://www.cold-takes.com/) + - [DeepMind technical blogs](https://www.deepmind.com/blog-categories/technical-blogs) + - [DeepMind AI Safety Research](https://deepmindsafetyresearch.medium.com/) + - [EleutherAI](https://blog.eleuther.ai/) + - [generative.ink](https://generative.ink/posts/) + - [Gwern Branwen's blog](https://gwern.net/) + - [Jack Clark's Import AI](https://importai.substack.com/) + - [MIRI](https://intelligence.org/) + - [Jacob Steinhardt's blog](https://jsteinhardt.wordpress.com/) + - [ML Safety Newsletter](https://newsletter.mlsafety.org/) + - [Transformer Circuits Thread](https://transformer-circuits.pub/) + - [OpenAI Research](https://openai.com/research/) + - [Victoria Krakovna's blog](https://vkrakovna.wordpress.com/) + - [Eliezer Yudkowsky's blog](https://www.yudkowsky.net/) + - [distill](https://distill.pub/) - [eaforum](https://forum.effectivealtruism.org/) - selected posts -- [eleuther.ai](https://blog.eleuther.ai/) -- [generative.ink](https://generative.ink/posts/) -- [gwern_blog](https://gwern.net/) -- gdocs - various doc files stored on Google drive -- html_articles - various articles on websites -- [import.ai](https://importai.substack.com) -- [jsteinhardt_blog](https://jsteinhardt.wordpress.com/) - [lesswrong](https://www.lesswrong.com/) - selected posts -- markdown -- [miri](https://intelligence.org/) - MIRI -- [ml_safety_newsletter](https://newsletter.mlsafety.org) -- [openai.research](https://openai.com/research) -- pdfs - various pdfs from different places -- [rob_miles_ai_safety](https://www.youtube.com/@RobertMilesAI) -- [vkrakovna_blog](https://vkrakovna.wordpress.com) -- 
[waitbutwhy](https://waitbutwhy.com/) -- [yudkowsky_blog](https://www.yudkowsky.net/) -- xmls - various articles stored as XML files +- special_docs - individual documents curated from various resources + - [Make a suggestion](https://bit.ly/ard-suggestion) for sources not already in the dataset + +- youtube - playlists & channels + - [AI Alignment playlist](https://www.youtube.com/playlist?list=PLCRVRLd2RhZTpdUdEzJjo3qhmX3y3skWA) and other lists + - [AI Explained](https://www.youtube.com/@aiexplained-official) + - [Evan Hubinger's AI Safety Talks](https://www.youtube.com/@aisafetytalks) + - [AI Safety Reading Group](https://www.youtube.com/@aisafetyreadinggroup/videos) + - [AiTech - TU Delft](https://www.youtube.com/@AiTechTUDelft/) + - [Rob Miles AI](https://www.youtube.com/@RobertMilesAI) ## Keys -Not all of the entries contain the same keys, but they all have the following: +All entries contain the following keys: -- `id` - unique identifier -- `source` - based on the data source listed in the previous section -- `title` - title of document +- `id` - string of unique identifier +- `source` - string of data source listed above +- `title` - string of document title +- `authors` - list of strings - `text` - full text of document content -- `url` - some values may be `'n/a'`, still being updated -- `date_published` - some `'n/a'` +- `url` - string of valid link to text content +- `date_published` - in UTC format -The values of the keys are still being cleaned up for consistency. Additional keys are available depending on the source document. +Additional keys may be available depending on the source document. ## Development Environment -To set up the development environment, run the following steps. You'll have to also set up [mysqlclient](https://pypi.org/project/mysqlclient/): +Follow the [instructions to install **mysqlclient** for your operating system](https://pypi.org/project/mysqlclient/); the OS-specific installation steps are near the bottom of the linked page. 
+ +To set up the development environment, run the following steps: ```bash git clone https://github.com/StampyAI/alignment-research-dataset diff --git a/align_data/common/alignment_dataset.py b/align_data/common/alignment_dataset.py index 54e4ff2e..78753169 100644 --- a/align_data/common/alignment_dataset.py +++ b/align_data/common/alignment_dataset.py @@ -259,3 +259,40 @@ def merge(item): return item session.add_all(map(merge, batch)) + + +@dataclass +class MultiDataset(AlignmentDataset): + + datasets: List[AlignmentDataset] + + @property + def names(self): + return [dataset.name for dataset in self.datasets] + + @property + def items_list(self) -> Iterable: + """Returns a collection of items to be processed.""" + return ((item, dataset) for dataset in self.datasets for item in dataset.items_list) + + def setup(self): + for dataset in self.datasets: + dataset.setup() + + def get_item_key(self, entry): + item, dataset = entry + return dataset.get_item_key(item) + + def process_entry(self, entry) -> Optional[Article]: + item, dataset = entry + article = dataset.process_entry(item) + article.add_meta('initial_source', article.source) + article.source = self.name + + def fetch_entries(self): + for dataset in self.datasets: + for article in dataset.fetch_entries(): + if article.source != self.name: + article.add_meta('initial_source', article.source) + article.source = self.name + yield article diff --git a/align_data/sources/articles/__init__.py b/align_data/sources/articles/__init__.py index b5264ce9..6fd45fbc 100644 --- a/align_data/sources/articles/__init__.py +++ b/align_data/sources/articles/__init__.py @@ -9,8 +9,10 @@ XMLArticles, ) from align_data.sources.articles.indices import IndicesDataset +from align_data.common.alignment_dataset import MultiDataset -ARTICLES_REGISTRY = [ + +ARTICLES_DATASETS = [ PDFArticles( name="pdfs", spreadsheet_id="1l3azVJVukGAvZPgg0GyeqiaQe8bEMZvycBJaA8cRXf4", @@ -46,6 +48,11 @@ 
spreadsheet_id="1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI", sheet_id="980957638", ), +] + + +ARTICLES_REGISTRY = [ + MultiDataset(name='special_docs', datasets=ARTICLES_DATASETS), ArxivPapers( name="arxiv", spreadsheet_id="1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI", diff --git a/align_data/sources/articles/datasets.py b/align_data/sources/articles/datasets.py index f8f15ab0..37ab0940 100644 --- a/align_data/sources/articles/datasets.py +++ b/align_data/sources/articles/datasets.py @@ -186,7 +186,7 @@ class XMLArticles(SpreadsheetDataset): def _get_text(self, item): vals = extract_gdrive_contents(item.source_url) - return vals["text"] + return vals.get("text") class MarkdownArticles(SpreadsheetDataset): @@ -195,7 +195,7 @@ class MarkdownArticles(SpreadsheetDataset): def _get_text(self, item): file_id = item.source_url.split("/")[-2] vals = fetch_markdown(file_id) - return vals["text"] + return vals.get("text") class DocArticles(SpreadsheetDataset): diff --git a/align_data/sources/blogs/__init__.py b/align_data/sources/blogs/__init__.py index a0ede6d7..05831f3e 100644 --- a/align_data/sources/blogs/__init__.py +++ b/align_data/sources/blogs/__init__.py @@ -11,9 +11,10 @@ ) from align_data.sources.blogs.substack_blog import SubstackBlog from align_data.sources.articles.parsers import MediumParser +from align_data.common.alignment_dataset import MultiDataset -BLOG_REGISTRY = [ +BLOG_DATASETS = [ WordpressBlog(name="aiimpacts", url="https://aiimpacts.org"), WordpressBlog(name="aisafety.camp", url="https://aisafety.camp"), WordpressBlog(name="miri", url="https://intelligence.org"), @@ -57,3 +58,8 @@ ), TransformerCircuits(name='transformer-circuits', url='https://transformer-circuits.pub/'), ] + + +BLOG_REGISTRY = [ + MultiDataset(name='blogs', datasets=BLOG_DATASETS), +] diff --git a/align_data/sources/youtube/__init__.py b/align_data/sources/youtube/__init__.py index 06c8defe..ca0d9b33 100644 --- a/align_data/sources/youtube/__init__.py +++ 
b/align_data/sources/youtube/__init__.py @@ -1,9 +1,10 @@ +from align_data.common.alignment_dataset import MultiDataset from align_data.sources.youtube.youtube import ( YouTubeChannelDataset, YouTubePlaylistDataset, ) -YOUTUBE_REGISTRY = [ +YOUTUBE_DATASETS = [ YouTubeChannelDataset( name="rob_miles_ai_safety", channel_id="UCLB7AzTwc6VFZrBsO2ucBMg", @@ -40,3 +41,8 @@ ], ), ] + + +YOUTUBE_REGISTRY = [ + MultiDataset(name='youtube', datasets=YOUTUBE_DATASETS), +] diff --git a/align_data/sources/youtube/youtube.py b/align_data/sources/youtube/youtube.py index 8670b691..876dc09e 100644 --- a/align_data/sources/youtube/youtube.py +++ b/align_data/sources/youtube/youtube.py @@ -1,4 +1,3 @@ -import collections import logging from dataclasses import dataclass from typing import List