Merge branch 'main' into implement_more_parsers

StampyAI · Aug 27, 2023 · cf0bdf4 · cf0bdf4
2 parents 70a9757 + f0bacc9
commit cf0bdf4
Show file tree

Hide file tree

Showing 77 changed files with 2,552 additions and 1,286 deletions.
diff --git a/.env.example b/.env.example
@@ -9,3 +9,4 @@ OPENAI_API_KEY="sk-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
 PINECONE_INDEX_NAME="stampy-chat-ard"
 PINECONE_API_KEY="xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
 PINECONE_ENVIRONMENT="xx-xxxxx-gcp"
+YOUTUBE_API_KEY=""
diff --git a/.github/workflows/fetch-daily.yml b/.github/workflows/fetch-daily.yml
@@ -14,6 +14,7 @@ jobs:
     with:
       datasource: ${{ matrix.datasource }}
       coda_token: ${{ inputs.coda_token }}
+      airtable_api_key: ${{ inputs.airtable_api_key }}
       youtube_api_key: ${{ inputs.youtube_api_key }}
       db_user: ${{ inputs.db_user }}
       db_password: ${{ inputs.db_password }}

diff --git a/.github/workflows/fetch-dataset.yml b/.github/workflows/fetch-dataset.yml
@@ -9,6 +9,9 @@ on:
       coda_token:
         type: string
         required: true
+      airtable_api_key:
+        type: string
+        required: true
       youtube_api_key:
         type: string
         required: true
@@ -28,44 +31,19 @@ on:
         type: choice
         options:
           - agentmodels
-          - aiimpacts
-          - aisafety.camp
+          - agisf
           - aisafety.info
-          - ai_alignment_playlist
-          - ai_explained
-          - ai_safety_talks
-          - ai_safety_reading_group
-          - ai_tech_tu_delft
-          - alignmentforum
           - alignment_newsletter
+          - alignmentforum
           - arbital
           - arxiv
-          - carado.moe
-          - cold_takes
-          - deepmind_blog
-          - deepmind_technical_blog
+          - blogs
           - distill
           - eaforum
-          - ebooks
-          - eleuther.ai
-          - gdocs
-          - generative.ink
-          - gwern_blog
-          - html_articles
-          - importai
           - indices
-          - jsteinhardt_blog
           - lesswrong
-          - markdown
-          - miri
-          - ml_safety_newsletter
-          - openai.research
-          - pdfs
-          - rob_miles_ai_safety
           - special_docs
-          - vkrakovna_blog
-          - yudkowsky_blog
-          - xmls
+          - youtube
 
 jobs:
   build-dataset:
@@ -93,6 +71,7 @@ jobs:
     - name: Process dataset
       env:
         CODA_TOKEN: ${{ secrets.CODA_TOKEN || inputs.coda_token }}
+        AIRTABLE_API_KEY: ${{ secrets.AIRTABLE_API_KEY || inputs.airtable_api_key }}
         YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY || inputs.youtube_api_key }}
         ARD_DB_USER: ${{ secrets.ARD_DB_USER || inputs.db_user }}
         ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD || inputs.db_password }}

diff --git a/.github/workflows/fetch-weekly.yml b/.github/workflows/fetch-weekly.yml
@@ -14,6 +14,7 @@ jobs:
     with:
       datasource: ${{ matrix.datasource }}
       coda_token: ${{ inputs.coda_token }}
+      airtable_api_key: ${{ inputs.airtable_api_key }}
       youtube_api_key: ${{ inputs.youtube_api_key }}
       db_user: ${{ inputs.db_user }}
       db_password: ${{ inputs.db_password }}

diff --git a/.github/workflows/push-dataset.yml b/.github/workflows/push-dataset.yml
@@ -24,37 +24,17 @@ on:
         options:
           - all
           - agentmodels
-          - aiimpacts
-          - aisafety.camp
+          - agisf
           - aisafety.info
-          - ai_alignment_playlist
-          - ai_explained
-          - ai_safety_talks
-          - ai_safety_reading_group
-          - ai_tech_tu_delft
           - alignmentforum
           - arbital
           - arxiv
-          - carado.moe
-          - cold_takes
-          - deepmind_blog
-          - deepmind_technical_blog
+          - blogs
           - distill
           - eaforum
-          - eleuther.ai
-          - gdocs
-          - generative.ink
-          - gwern_blog
-          - importai
-          - jsteinhardt_blog
           - lesswrong
-          - miri
-          - ml_safety_newsletter
-          - openai.research
-          - rob_miles_ai_safety
           - special_docs
-          - vkrakovna_blog
-          - yudkowsky_blog
+          - youtube
 
 jobs:
   generate-dataset:

diff --git a/.github/workflows/update-metadata.yml b/.github/workflows/update-metadata.yml
@@ -31,4 +31,9 @@ jobs:
         run: curl -L "${{ inputs.csv_url }}" -o data.csv
 
       - name: Run Script
+        env:
+          ARD_DB_USER: ${{ secrets.ARD_DB_USER }}
+          ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD }}
+          ARD_DB_HOST: ${{ secrets.ARD_DB_HOST }}
+          ARD_DB_NAME: alignment_research_dataset
         run: python main.py update data.csv ${{ inputs.delimiter }}
diff --git a/.github/workflows/update-pinecone.yml b/.github/workflows/update-pinecone.yml
@@ -32,43 +32,18 @@ on:
         options:
           - all
           - agentmodels
-          - aiimpacts
-          - aisafety.camp
+          - agisf
           - aisafety.info
-          - ai_alignment_playlist
-          - ai_explained
-          - ai_safety_talks
-          - ai_safety_reading_group
-          - ai_tech_tu_delft
           - alignmentforum
           - arbital
           - arxiv
-          - carado.moe
-          - cold_takes
-          - deepmind_blog
-          - deepmind_technical_blog
+          - blogs
           - distill
           - eaforum
-          - ebooks
-          - eleuther.ai
-          - gdocs
-          - generative.ink
-          - gwern_blog
-          - html_articles
-          - importai
           - indices
-          - jsteinhardt_blog
           - lesswrong
-          - markdown
-          - miri
-          - ml_safety_newsletter
-          - openai.research
-          - pdfs
-          - rob_miles_ai_safety
           - special_docs
-          - vkrakovna_blog
-          - yudkowsky_blog
-          - xmls
+          - youtube
 
 jobs:
   build-dataset:
@@ -78,28 +53,28 @@ jobs:
     - name: Checkout repository
       uses: actions/checkout@v2
 
-    - name: Setup Python environment
-      uses: actions/setup-python@v2
-      with:
-        python-version: '3.x'
+    # - name: Setup Python environment
+    #   uses: actions/setup-python@v2
+    #   with:
+    #     python-version: '3.x'
 
-    - name: Install dependencies
-      run: |
-        pip install -r requirements.txt;
-        python -c 'import nltk; nltk.download("punkt")'
+    # - name: Install dependencies
+    #   run: |
+    #     pip install -r requirements.txt;
+    #     python -c 'import nltk; nltk.download("punkt")'
 
-    - name: Process dataset
-      env:
-        ARD_DB_USER: ${{ secrets.ARD_DB_USER || inputs.db_user }}
-        ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD || inputs.db_password }}
-        ARD_DB_HOST: ${{ secrets.ARD_DB_HOST || inputs.db_host }}
-        ARD_DB_NAME: alignment_research_dataset
-        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY || inputs.openai_api_key }}
-        PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY || inputs.pinecone_api_key }}
-        PINECONE_ENVIRONMENT: ${{ secrets.PINECONE_ENVIRONMENT || inputs.pinecone_environment }}
-      run: |
-        if [ "${{ inputs.datasource }}" = "all" ]; then
-          python main.py pinecone_update_all
-        else
-          python main.py pinecone_update ${{ inputs.datasource }}
-        fi
+    # - name: Process dataset
+    #   env:
+    #     ARD_DB_USER: ${{ secrets.ARD_DB_USER || inputs.db_user }}
+    #     ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD || inputs.db_password }}
+    #     ARD_DB_HOST: ${{ secrets.ARD_DB_HOST || inputs.db_host }}
+    #     ARD_DB_NAME: alignment_research_dataset
+    #     OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY || inputs.openai_api_key }}
+    #     PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY || inputs.pinecone_api_key }}
+    #     PINECONE_ENVIRONMENT: ${{ secrets.PINECONE_ENVIRONMENT || inputs.pinecone_environment }}
+    #   run: |
+    #     if [ "${{ inputs.datasource }}" = "all" ]; then
+    #       python main.py pinecone_update_all
+    #     else
+    #       python main.py pinecone_update ${{ inputs.datasource }}
+    #     fi
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,11 @@
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+    -   id: trailing-whitespace
+    -   id: end-of-file-fixer
+-   repo: https://github.com/psf/black
+    rev: 23.7.0
+    hooks:
+    -   id: black
+        language_version: python3.11
diff --git a/README.md b/README.md
@@ -4,63 +4,69 @@ The AI Alignment Research Dataset is a collection of documents related to AI Ali
 
 ## Sources
 
-The following list of sources may change and items may be renamed:
-
-- [agentmodels](https://agentmodels.org/)
-- [aiimpacts](https://aiimpacts.org/)
-- [aisafety.camp](https://aisafety.camp/)
-- [aisafety.info](https://aisafety.info/)
-- [ai_alignment_playlist]()
-- [ai_explained](https://www.youtube.com/@ai-explained-)
-- [ai_safety_talks](https://www.youtube.com/@aisafetytalks)
-- [ai_safety_reading_group](https://www.youtube.com/@aisafetyreadinggroup/videos)
-- [ai_tech_tu_delft](https://www.youtube.com/@AiTechTUDelft/)
+Here is the list of sources, along with sample contents:
+
+- [agentmodels](https://agentmodels.org/)
+- [agisf](https://course.aisafetyfundamentals.com/) - recommended readings from AGI Safety Fundamentals
+- [aisafety.info](https://aisafety.info/) - Stampy's FAQ
 - [alignmentforum](https://www.alignmentforum.org)
 - [alignment_newsletter](https://rohinshah.com/alignment-newsletter/)
 - [arbital](https://arbital.com/)
-- arxiv - alignment research papers from [arxiv](https://arxiv.org/)
-- [carado.moe](https://carado.moe/)
-- [cold_takes](https://www.cold-takes.com/)
-- [deepmind_blog](https://deepmindsafetyresearch.medium.com/)
-- [deepmind_technical_blog](https://www.deepmind.com/blog-categories/technical-blogs)
+- [arxiv](https://arxiv.org/) - relevant research papers
+
+- blogs - entire websites automatically scraped
+  - [AI Impacts](https://aiimpacts.org/)
+  - [AI Safety Camp](https://aisafety.camp/)
+  - [carado.moe](https://carado.moe/)
+  - [Cold Takes](https://www.cold-takes.com/)
+  - [DeepMind technical blogs](https://www.deepmind.com/blog-categories/technical-blogs)
+  - [DeepMind AI Safety Research](https://deepmindsafetyresearch.medium.com/)
+  - [EleutherAI](https://blog.eleuther.ai/)
+  - [generative.ink](https://generative.ink/posts/)
+  - [Gwern Branwen's blog](https://gwern.net/)
+  - [Jack Clark's Import AI](https://importai.substack.com/)
+  - [MIRI](https://intelligence.org/)
+  - [Jacob Steinhardt's blog](https://jsteinhardt.wordpress.com/)
+  - [ML Safety Newsletter](https://newsletter.mlsafety.org/)
+  - [Transformer Circuits Thread](https://transformer-circuits.pub/)
+  - [OpenAI Research](https://openai.com/research/)
+  - [Victoria Krakovna's blog](https://vkrakovna.wordpress.com/)
+  - [Eliezer Yudkowsky's blog](https://www.yudkowsky.net/)
+
 - [distill](https://distill.pub/)
 - [eaforum](https://forum.effectivealtruism.org/) - selected posts
-- [eleuther.ai](https://blog.eleuther.ai/)
-- [generative.ink](https://generative.ink/posts/)
-- [gwern_blog](https://gwern.net/)
-- gdocs - various doc files stored on Google drive
-- html_articles - various articles on websites
-- [import.ai](https://importai.substack.com)
-- [jsteinhardt_blog](https://jsteinhardt.wordpress.com/)
 - [lesswrong](https://www.lesswrong.com/) - selected posts
-- markdown
-- [miri](https://intelligence.org/) - MIRI
-- [ml_safety_newsletter](https://newsletter.mlsafety.org)
-- [openai.research](https://openai.com/research)
-- pdfs - various pdfs from different places
-- [rob_miles_ai_safety](https://www.youtube.com/@RobertMilesAI)
-- [vkrakovna_blog](https://vkrakovna.wordpress.com)
-- [waitbutwhy](https://waitbutwhy.com/)
-- [yudkowsky_blog](https://www.yudkowsky.net/)
-- xmls - various articles stored as XML files
 
+- special_docs - individual documents curated from various resources
+  - [Make a suggestion](https://bit.ly/ard-suggestion) for sources not already in the dataset
+
+- youtube - playlists & channels
+  - [AI Alignment playlist](https://www.youtube.com/playlist?list=PLCRVRLd2RhZTpdUdEzJjo3qhmX3y3skWA) and other lists
+  - [AI Explained](https://www.youtube.com/@aiexplained-official)
+  - [Evan Hubinger's AI Safety Talks](https://www.youtube.com/@aisafetytalks)
+  - [AI Safety Reading Group](https://www.youtube.com/@aisafetyreadinggroup/videos)
+  - [AiTech - TU Delft](https://www.youtube.com/@AiTechTUDelft/)
+  - [Rob Miles AI](https://www.youtube.com/@RobertMilesAI)
 
 ## Keys
 
-Not all of the entries contain the same keys, but they all have the following:
+All entries contain the following keys:
 
-- `id` - unique identifier
-- `source` - based on the data source listed in the previous section
-- `title` - title of document
+- `id` - string, unique identifier
+- `source` - string, one of the data sources listed above
+- `title` - string, title of the document
+- `authors` - list of strings
 - `text` - full text of document content
-- `url` - some values may be `'n/a'`, still being updated
-- `date_published` - some `'n/a'`
+- `url` - string of valid link to text content
+- `date_published` - in UTC format
 
-The values of the keys are still being cleaned up for consistency. Additional keys are available depending on the source document.
+Additional keys may be available depending on the source document.
 
 ## Development Environment
 
-To set up the development environment, run the following steps. You'll have to also set up [mysqlclient](https://pypi.org/project/mysqlclient/):
+First, follow the [instructions to install **mysqlclient** on your operating system](https://pypi.org/project/mysqlclient/); they are located toward the middle to bottom of the linked page.
+
+To set up the development environment, run the following steps:
 
 ```bash
 git clone https://github.com/StampyAI/alignment-research-dataset

diff --git a/align_data/__init__.py b/align_data/__init__.py
@@ -1,5 +1,6 @@
 import align_data.sources.arbital as arbital
 import align_data.sources.articles as articles
+import align_data.sources.agisf as agisf
 import align_data.sources.blogs as blogs
 import align_data.sources.ebooks as ebooks
 import align_data.sources.greaterwrong as greaterwrong
@@ -11,6 +12,7 @@
 DATASET_REGISTRY = (
     arbital.ARBITAL_REGISTRY
     + articles.ARTICLES_REGISTRY
+    + agisf.AGISF_DATASETS
     + blogs.BLOG_REGISTRY
     + ebooks.EBOOK_REGISTRY
     + greaterwrong.GREATERWRONG_REGISTRY

diff --git a/align_data/analysis/analyse_jsonl_data.py b/align_data/analysis/analyse_jsonl_data.py
@@ -69,9 +69,7 @@ def process_jsonl_files(data_dir):
 
     for id, duplicates in seen_urls.items():
         if len(duplicates) > 1:
-            list_of_duplicates = "\n".join(
-                get_data_dict_str(duplicate) for duplicate in duplicates
-            )
+            list_of_duplicates = "\n".join(get_data_dict_str(duplicate) for duplicate in duplicates)
             print(
                 f"{len(duplicates)} duplicate ids found. \nId: {id}\n{list_of_duplicates}\n\n\n\n"
             )