Skip to content

Commit

Permalink
Merge branch 'main' into implement_more_parsers
Browse files Browse the repository at this point in the history
  • Loading branch information
Thomas-Lemoine committed Aug 27, 2023
2 parents 70a9757 + f0bacc9 commit cf0bdf4
Show file tree
Hide file tree
Showing 77 changed files with 2,552 additions and 1,286 deletions.
1 change: 1 addition & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ OPENAI_API_KEY="sk-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
PINECONE_INDEX_NAME="stampy-chat-ard"
PINECONE_API_KEY="xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
PINECONE_ENVIRONMENT="xx-xxxxx-gcp"
YOUTUBE_API_KEY=""
1 change: 1 addition & 0 deletions .github/workflows/fetch-daily.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ jobs:
with:
datasource: ${{ matrix.datasource }}
coda_token: ${{ inputs.coda_token }}
airtable_api_key: ${{ inputs.airtable_api_key }}
youtube_api_key: ${{ inputs.youtube_api_key }}
db_user: ${{ inputs.db_user }}
db_password: ${{ inputs.db_password }}
Expand Down
37 changes: 8 additions & 29 deletions .github/workflows/fetch-dataset.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ on:
coda_token:
type: string
required: true
airtable_api_key:
type: string
required: true
youtube_api_key:
type: string
required: true
Expand All @@ -28,44 +31,19 @@ on:
type: choice
options:
- agentmodels
- aiimpacts
- aisafety.camp
- agisf
- aisafety.info
- ai_alignment_playlist
- ai_explained
- ai_safety_talks
- ai_safety_reading_group
- ai_tech_tu_delft
- alignmentforum
- alignment_newsletter
- alignmentforum
- arbital
- arxiv
- carado.moe
- cold_takes
- deepmind_blog
- deepmind_technical_blog
- blogs
- distill
- eaforum
- ebooks
- eleuther.ai
- gdocs
- generative.ink
- gwern_blog
- html_articles
- importai
- indices
- jsteinhardt_blog
- lesswrong
- markdown
- miri
- ml_safety_newsletter
- openai.research
- pdfs
- rob_miles_ai_safety
- special_docs
- vkrakovna_blog
- yudkowsky_blog
- xmls
- youtube

jobs:
build-dataset:
Expand Down Expand Up @@ -93,6 +71,7 @@ jobs:
- name: Process dataset
env:
CODA_TOKEN: ${{ secrets.CODA_TOKEN || inputs.coda_token }}
AIRTABLE_API_KEY: ${{ secrets.AIRTABLE_API_KEY || inputs.airtable_api_key }}
YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY || inputs.youtube_api_key }}
ARD_DB_USER: ${{ secrets.ARD_DB_USER || inputs.db_user }}
ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD || inputs.db_password }}
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/fetch-weekly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ jobs:
with:
datasource: ${{ matrix.datasource }}
coda_token: ${{ inputs.coda_token }}
airtable_api_key: ${{ inputs.airtable_api_key }}
youtube_api_key: ${{ inputs.youtube_api_key }}
db_user: ${{ inputs.db_user }}
db_password: ${{ inputs.db_password }}
Expand Down
26 changes: 3 additions & 23 deletions .github/workflows/push-dataset.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,37 +24,17 @@ on:
options:
- all
- agentmodels
- aiimpacts
- aisafety.camp
- agisf
- aisafety.info
- ai_alignment_playlist
- ai_explained
- ai_safety_talks
- ai_safety_reading_group
- ai_tech_tu_delft
- alignmentforum
- arbital
- arxiv
- carado.moe
- cold_takes
- deepmind_blog
- deepmind_technical_blog
- blogs
- distill
- eaforum
- eleuther.ai
- gdocs
- generative.ink
- gwern_blog
- importai
- jsteinhardt_blog
- lesswrong
- miri
- ml_safety_newsletter
- openai.research
- rob_miles_ai_safety
- special_docs
- vkrakovna_blog
- yudkowsky_blog
- youtube

jobs:
generate-dataset:
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/update-metadata.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,9 @@ jobs:
run: curl -L "${{ inputs.csv_url }}" -o data.csv

- name: Run Script
env:
ARD_DB_USER: ${{ secrets.ARD_DB_USER }}
ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD }}
ARD_DB_HOST: ${{ secrets.ARD_DB_HOST }}
ARD_DB_NAME: alignment_research_dataset
run: python main.py update data.csv ${{ inputs.delimiter }}
77 changes: 26 additions & 51 deletions .github/workflows/update-pinecone.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,43 +32,18 @@ on:
options:
- all
- agentmodels
- aiimpacts
- aisafety.camp
- agisf
- aisafety.info
- ai_alignment_playlist
- ai_explained
- ai_safety_talks
- ai_safety_reading_group
- ai_tech_tu_delft
- alignmentforum
- arbital
- arxiv
- carado.moe
- cold_takes
- deepmind_blog
- deepmind_technical_blog
- blogs
- distill
- eaforum
- ebooks
- eleuther.ai
- gdocs
- generative.ink
- gwern_blog
- html_articles
- importai
- indices
- jsteinhardt_blog
- lesswrong
- markdown
- miri
- ml_safety_newsletter
- openai.research
- pdfs
- rob_miles_ai_safety
- special_docs
- vkrakovna_blog
- yudkowsky_blog
- xmls
- youtube

jobs:
build-dataset:
Expand All @@ -78,28 +53,28 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v2

- name: Setup Python environment
uses: actions/setup-python@v2
with:
python-version: '3.x'
# - name: Setup Python environment
# uses: actions/setup-python@v2
# with:
# python-version: '3.x'

- name: Install dependencies
run: |
pip install -r requirements.txt;
python -c 'import nltk; nltk.download("punkt")'
# - name: Install dependencies
# run: |
# pip install -r requirements.txt;
# python -c 'import nltk; nltk.download("punkt")'

- name: Process dataset
env:
ARD_DB_USER: ${{ secrets.ARD_DB_USER || inputs.db_user }}
ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD || inputs.db_password }}
ARD_DB_HOST: ${{ secrets.ARD_DB_HOST || inputs.db_host }}
ARD_DB_NAME: alignment_research_dataset
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY || inputs.openai_api_key }}
PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY || inputs.pinecone_api_key }}
PINECONE_ENVIRONMENT: ${{ secrets.PINECONE_ENVIRONMENT || inputs.pinecone_environment }}
run: |
if [ "${{ inputs.datasource }}" = "all" ]; then
python main.py pinecone_update_all
else
python main.py pinecone_update ${{ inputs.datasource }}
fi
# - name: Process dataset
# env:
# ARD_DB_USER: ${{ secrets.ARD_DB_USER || inputs.db_user }}
# ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD || inputs.db_password }}
# ARD_DB_HOST: ${{ secrets.ARD_DB_HOST || inputs.db_host }}
# ARD_DB_NAME: alignment_research_dataset
# OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY || inputs.openai_api_key }}
# PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY || inputs.pinecone_api_key }}
# PINECONE_ENVIRONMENT: ${{ secrets.PINECONE_ENVIRONMENT || inputs.pinecone_environment }}
# run: |
# if [ "${{ inputs.datasource }}" = "all" ]; then
# python main.py pinecone_update_all
# else
# python main.py pinecone_update ${{ inputs.datasource }}
# fi
11 changes: 11 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- repo: https://github.com/psf/black
rev: 23.7.0
hooks:
- id: black
language_version: python3.11
88 changes: 47 additions & 41 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,63 +4,69 @@ The AI Alignment Research Dataset is a collection of documents related to AI Ali

## Sources

The following list of sources may change and items may be renamed:

- [agentmodels](https://agentmodels.org/)
- [aiimpacts](https://aiimpacts.org/)
- [aisafety.camp](https://aisafety.camp/)
- [aisafety.info](https://aisafety.info/)
- [ai_alignment_playlist]()
- [ai_explained](https://www.youtube.com/@ai-explained-)
- [ai_safety_talks](https://www.youtube.com/@aisafetytalks)
- [ai_safety_reading_group](https://www.youtube.com/@aisafetyreadinggroup/videos)
- [ai_tech_tu_delft](https://www.youtube.com/@AiTechTUDelft/)
Here is the list of sources along with sample contents:

- [agentmodels](https://agentmodels.org/)
- [agisf](https://course.aisafetyfundamentals.com/) - recommended readings from AGI Safety Fundamentals
- [aisafety.info](https://aisafety.info/) - Stampy's FAQ
- [alignmentforum](https://www.alignmentforum.org)
- [alignment_newsletter](https://rohinshah.com/alignment-newsletter/)
- [arbital](https://arbital.com/)
- arxiv - alignment research papers from [arxiv](https://arxiv.org/)
- [carado.moe](https://carado.moe/)
- [cold_takes](https://www.cold-takes.com/)
- [deepmind_blog](https://deepmindsafetyresearch.medium.com/)
- [deepmind_technical_blog](https://www.deepmind.com/blog-categories/technical-blogs)
- [arxiv](https://arxiv.org/) - relevant research papers

- blogs - entire websites automatically scraped
- [AI Impacts](https://aiimpacts.org/)
- [AI Safety Camp](https://aisafety.camp/)
- [carado.moe](https://carado.moe/)
- [Cold Takes](https://www.cold-takes.com/)
- [DeepMind technical blogs](https://www.deepmind.com/blog-categories/technical-blogs)
- [DeepMind AI Safety Research](https://deepmindsafetyresearch.medium.com/)
- [EleutherAI](https://blog.eleuther.ai/)
- [generative.ink](https://generative.ink/posts/)
- [Gwern Branwen's blog](https://gwern.net/)
- [Jack Clark's Import AI](https://importai.substack.com/)
- [MIRI](https://intelligence.org/)
- [Jacob Steinhardt's blog](https://jsteinhardt.wordpress.com/)
- [ML Safety Newsletter](https://newsletter.mlsafety.org/)
- [Transformer Circuits Thread](https://transformer-circuits.pub/)
- [Open AI Research](https://openai.com/research/)
- [Victoria Krakovna's blog](https://vkrakovna.wordpress.com/)
- [Eliezer Yudkowsky's blog](https://www.yudkowsky.net/)

- [distill](https://distill.pub/)
- [eaforum](https://forum.effectivealtruism.org/) - selected posts
- [eleuther.ai](https://blog.eleuther.ai/)
- [generative.ink](https://generative.ink/posts/)
- [gwern_blog](https://gwern.net/)
- gdocs - various doc files stored on Google drive
- html_articles - various articles on websites
- [import.ai](https://importai.substack.com)
- [jsteinhardt_blog](https://jsteinhardt.wordpress.com/)
- [lesswrong](https://www.lesswrong.com/) - selected posts
- markdown
- [miri](https://intelligence.org/) - MIRI
- [ml_safety_newsletter](https://newsletter.mlsafety.org)
- [openai.research](https://openai.com/research)
- pdfs - various pdfs from different places
- [rob_miles_ai_safety](https://www.youtube.com/@RobertMilesAI)
- [vkrakovna_blog](https://vkrakovna.wordpress.com)
- [waitbutwhy](https://waitbutwhy.com/)
- [yudkowsky_blog](https://www.yudkowsky.net/)
- xmls - various articles stored as XML files

- special_docs - individual documents curated from various resources
- [Make a suggestion](https://bit.ly/ard-suggestion) for sources not already in the dataset

- youtube - playlists & channels
- [AI Alignment playlist](https://www.youtube.com/playlist?list=PLCRVRLd2RhZTpdUdEzJjo3qhmX3y3skWA) and other lists
- [AI Explained](https://www.youtube.com/@aiexplained-official)
- [Evan Hubinger's AI Safety Talks](https://www.youtube.com/@aisafetytalks)
- [AI Safety Reading Group](https://www.youtube.com/@aisafetyreadinggroup/videos)
- [AiTech - TU Delft](https://www.youtube.com/@AiTechTUDelft/)
- [Rob Miles AI](https://www.youtube.com/@RobertMilesAI)

## Keys

Not all of the entries contain the same keys, but they all have the following:
All entries contain the following keys:

- `id` - unique identifier
- `source` - based on the data source listed in the previous section
- `title` - title of document
- `id` - string of unique identifier
- `source` - string of data source listed above
- `title` - string of document title
- `authors` - list of strings
- `text` - full text of document content
- `url` - some values may be `'n/a'`, still being updated
- `date_published` - some `'n/a'`
- `url` - string of valid link to text content
- `date_published` - in UTC format

The values of the keys are still being cleaned up for consistency. Additional keys are available depending on the source document.
Additional keys may be available depending on the source document.

## Development Environment

To set up the development environment, run the following steps. You'll have to also set up [mysqlclient](https://pypi.org/project/mysqlclient/):
Follow the [instructions to install **mysqlclient** on your operating system](https://pypi.org/project/mysqlclient/), which are found toward the middle to bottom of the linked page.

To set up the development environment, run the following steps:

```bash
git clone https://github.com/StampyAI/alignment-research-dataset
Expand Down
2 changes: 2 additions & 0 deletions align_data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import align_data.sources.arbital as arbital
import align_data.sources.articles as articles
import align_data.sources.agisf as agisf
import align_data.sources.blogs as blogs
import align_data.sources.ebooks as ebooks
import align_data.sources.greaterwrong as greaterwrong
Expand All @@ -11,6 +12,7 @@
DATASET_REGISTRY = (
arbital.ARBITAL_REGISTRY
+ articles.ARTICLES_REGISTRY
+ agisf.AGISF_DATASETS
+ blogs.BLOG_REGISTRY
+ ebooks.EBOOK_REGISTRY
+ greaterwrong.GREATERWRONG_REGISTRY
Expand Down
4 changes: 1 addition & 3 deletions align_data/analysis/analyse_jsonl_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,7 @@ def process_jsonl_files(data_dir):

for id, duplicates in seen_urls.items():
if len(duplicates) > 1:
list_of_duplicates = "\n".join(
get_data_dict_str(duplicate) for duplicate in duplicates
)
list_of_duplicates = "\n".join(get_data_dict_str(duplicate) for duplicate in duplicates)
print(
f"{len(duplicates)} duplicate ids found. \nId: {id}\n{list_of_duplicates}\n\n\n\n"
)
Expand Down
Loading

0 comments on commit cf0bdf4

Please sign in to comment.