Regular pinecone updates (#120)
mruwnik authored Aug 10, 2023
1 parent a814bbc commit 99292e8
Showing 6 changed files with 172 additions and 41 deletions.
1 change: 1 addition & 0 deletions .env.example
@@ -1,3 +1,4 @@
+LOG_LEVEL="INFO"
 CODA_TOKEN=""
 ARD_DB_USER="user"
 ARD_DB_PASSWORD="we all live in a yellow submarine"
25 changes: 20 additions & 5 deletions .github/workflows/fetch-daily.yml
@@ -5,19 +5,34 @@ on:
     - cron: "0 0 * * *" # Every day at midnight

 jobs:
-  update_dateset:
+  update_dataset:
     strategy:
       matrix:
-        datasource:
-          - lesswrong
-          - alignmentforum
-          - aisafety.info
+        datasource: ${{ fromJson(vars.DAILY_DATASOURCES) }}

     uses: ./.github/workflows/fetch-dataset.yml
     with:
       datasource: ${{ matrix.datasource }}
       coda_token: ${{ inputs.coda_token }}
       youtube_api_key: ${{ inputs.youtube_api_key }}
       db_user: ${{ inputs.db_user }}
       db_password: ${{ inputs.db_password }}
       db_host: ${{ inputs.db_host }}
     secrets: inherit
+
+  update_indexes:
+    needs: update_dataset
+    strategy:
+      matrix:
+        datasource: ${{ fromJson(vars.DAILY_DATASOURCES) }}
+
+    uses: ./.github/workflows/update-pinecone.yml
+    with:
+      datasource: ${{ matrix.datasource }}
+      db_user: ${{ inputs.db_user }}
+      db_password: ${{ inputs.db_password }}
+      db_host: ${{ inputs.db_host }}
+      openai_api_key: ${{ inputs.openai_api_key }}
+      pinecone_api_key: ${{ inputs.pinecone_api_key }}
+      pinecone_environment: ${{ inputs.pinecone_environment }}
+    secrets: inherit
55 changes: 19 additions & 36 deletions .github/workflows/fetch-weekly.yml
@@ -5,44 +5,10 @@ on:
     - cron: "0 0 * * 0" # Every Sunday at midnight

 jobs:
-  update_dateset:
+  update_dataset:
     strategy:
       matrix:
-        datasource:
-          - agentmodels
-          - aiimpacts
-          - aisafety.camp
-          - ai_alignment_playlist
-          - ai_explained
-          - ai_safety_talks
-          - ai_safety_reading_group
-          - ai_tech_tu_delft
-          - alignment_newsletter
-          - arbital
-          - arxiv
-          - carado.moe
-          - cold_takes
-          - deepmind_blog
-          - deepmind_technical_blog
-          - distill
-          - eaforum
-          - ebooks
-          - eleuther.ai
-          - gdocs
-          - generative.ink
-          - gwern_blog
-          - html_articles
-          - importai
-          - jsteinhardt_blog
-          - markdown
-          - miri
-          - ml_safety_newsletter
-          - openai.research
-          - pdfs
-          - rob_miles_ai_safety
-          - vkrakovna_blog
-          - yudkowsky_blog
-          - xmls
+        datasource: ${{ fromJson(vars.WEEKLY_DATASOURCES) }}

     uses: ./.github/workflows/fetch-dataset.yml
     with:
@@ -53,3 +19,20 @@ jobs:
       db_password: ${{ inputs.db_password }}
       db_host: ${{ inputs.db_host }}
     secrets: inherit
+
+  update_indexes:
+    needs: update_dataset
+    strategy:
+      matrix:
+        datasource: ${{ fromJson(vars.WEEKLY_DATASOURCES) }}
+
+    uses: ./.github/workflows/update-pinecone.yml
+    with:
+      datasource: ${{ matrix.datasource }}
+      db_user: ${{ inputs.db_user }}
+      db_password: ${{ inputs.db_password }}
+      db_host: ${{ inputs.db_host }}
+      openai_api_key: ${{ inputs.openai_api_key }}
+      pinecone_api_key: ${{ inputs.pinecone_api_key }}
+      pinecone_environment: ${{ inputs.pinecone_environment }}
+    secrets: inherit
103 changes: 103 additions & 0 deletions .github/workflows/update-pinecone.yml
@@ -0,0 +1,103 @@
+name: Update Alignment Research Dataset embeddings in Pinecone
+
+on:
+  workflow_call:
+    inputs:
+      datasource:
+        type: string
+        required: true
+      db_user:
+        type: string
+        required: true
+      db_password:
+        type: string
+        required: true
+      db_host:
+        type: string
+        required: true
+      pinecone_api_key:
+        type: string
+        required: true
+      pinecone_environment:
+        type: string
+        required: true
+      openai_api_key:
+        type: string
+        required: true
+  workflow_dispatch: # allow manual triggering
+    inputs:
+      datasource:
+        description: 'The datasource to process'
+        type: choice
+        options:
+          - all
+          - agentmodels
+          - aiimpacts
+          - aisafety.camp
+          - aisafety.info
+          - ai_alignment_playlist
+          - ai_explained
+          - ai_safety_talks
+          - ai_safety_reading_group
+          - ai_tech_tu_delft
+          - alignmentforum
+          - arbital
+          - arxiv
+          - carado.moe
+          - cold_takes
+          - deepmind_blog
+          - deepmind_technical_blog
+          - distill
+          - eaforum
+          - ebooks
+          - eleuther.ai
+          - gdocs
+          - generative.ink
+          - gwern_blog
+          - html_articles
+          - importai
+          - indices
+          - jsteinhardt_blog
+          - lesswrong
+          - markdown
+          - miri
+          - ml_safety_newsletter
+          - openai.research
+          - pdfs
+          - rob_miles_ai_safety
+          - special_docs
+          - vkrakovna_blog
+          - yudkowsky_blog
+          - xmls
+
+jobs:
+  build-dataset:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: Setup Python environment
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.x'
+
+      - name: Install dependencies
+        run: pip install -r requirements.txt
+
+      - name: Process dataset
+        env:
+          ARD_DB_USER: ${{ secrets.ARD_DB_USER || inputs.db_user }}
+          ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD || inputs.db_password }}
+          ARD_DB_HOST: ${{ secrets.ARD_DB_HOST || inputs.db_host }}
+          ARD_DB_NAME: alignment_research_dataset
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY || inputs.openai_api_key }}
+          PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY || inputs.pinecone_api_key }}
+          PINECONE_ENVIRONMENT: ${{ secrets.PINECONE_ENVIRONMENT || inputs.pinecone_environment }}
+        run: |
+          if [ "${{ inputs.datasource }}" = "all" ]; then
+            python main.py pinecone_update_all
+          else
+            python main.py pinecone_update ${{ inputs.datasource }}
+          fi
25 changes: 25 additions & 0 deletions README.md
@@ -146,10 +146,35 @@ There are Datasets defined for various types of data sources - first check if an

To update the whole dataset, run `python main.py fetch_all`. You can also fetch a specific datasource by name, for example `python main.py fetch aisafety.info`.

## Configuration

Various subcomponents rely on external services and therefore need credentials. These are provided via environment variables; the easiest way to set them is to copy `.env.example` to `.env` and fill in the appropriate values.

### Logging

The log level can be configured with the `LOG_LEVEL` environment variable. The default level is `WARNING`.
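
For example, to get debug-level output for a single fetch:

```sh
LOG_LEVEL=DEBUG python main.py fetch aisafety.info
```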

### Coda

To update the Stampy portion of the dataset, you will need a Coda token. Go to coda.io, log in, and generate an API token in your account settings. Add restrictions: Doc or table, Read only, for the doc with url https://coda.io/d/_dfau7sl2hmG. Then add `CODA_TOKEN="<coda_token>"` to the `.env` file at the root of the alignment research dataset. It will be read in `align_data/stampy/stampy.py`.
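
For reference, `align_data/settings.py` reads the token from the environment:

```python
import os

### CODA ###
CODA_TOKEN = os.environ.get("CODA_TOKEN")
CODA_DOC_ID = os.environ.get("CODA_DOC_ID", "fau7sl2hmG")
```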

### MySQL

The datasets are stored in MySQL. The connection string can be configured via the `ARD_DB_USER`,
`ARD_DB_PASSWORD`, `ARD_DB_HOST`, `ARD_DB_PORT` and `ARD_DB_NAME` environment variables. A local
database can be started in Docker by running

./local_db.sh
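
As a rough sketch of how these variables fit together (the `mysqldb` driver and the defaults here are illustrative assumptions, not necessarily what the project uses):

```python
import os

# Hypothetical assembly of a MySQL connection URI from the ARD_* variables.
user = os.environ.get("ARD_DB_USER", "user")
password = os.environ.get("ARD_DB_PASSWORD", "")
host = os.environ.get("ARD_DB_HOST", "127.0.0.1")
port = os.environ.get("ARD_DB_PORT", "3306")
db_name = os.environ.get("ARD_DB_NAME", "alignment_research_dataset")

DB_CONNECTION_URI = f"mysql+mysqldb://{user}:{password}@{host}:{port}/{db_name}"
```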

### Pinecone

For Pinecone updates to work, you'll need to configure the API key (a sketch of this setup follows the list):

1. Get an API key, as described [here](https://docs.pinecone.io/docs/quickstart#2-get-and-verify-your-pinecone-api-key)
2. Create a Pinecone index named "stampy-chat-ard" (or whatever is set as `PINECONE_INDEX_NAME`) with the `dotproduct` metric and 1536 dimensions
3. Set `PINECONE_API_KEY` to the key from step 1
4. Set `PINECONE_ENVIRONMENT` to the environment of your index
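
A minimal sketch of that setup, assuming the v2 `pinecone-client` package (the repo's actual client code may differ):

```python
import os

import pinecone

# Connect using the environment variables described above.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"],
)

# "stampy-chat-ard" is the default index name from step 2.
index_name = os.environ.get("PINECONE_INDEX_NAME", "stampy-chat-ard")

# Create the index on first use: dotproduct metric, 1536 dimensions,
# as described in step 2.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, metric="dotproduct", dimension=1536)

index = pinecone.Index(index_name)
```

Once everything is configured, embeddings can be pushed with the same commands the new workflow runs:

```sh
python main.py pinecone_update <datasource>
# or, for all datasources:
python main.py pinecone_update_all
```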

### Metadata updates

There are a couple of datasources that consist of individual articles (html, pdfs, ebooks, etc.) rather than the full contents of a given website. These are managed in [Google sheets](https://docs.google.com/spreadsheets/d/1l3azVJVukGAvZPgg0GyeqiaQe8bEMZvycBJaA8cRXf4/edit#gid=0). It's assumed that the contents of that document are clean, in that all required fields are set and there is a `source_url` pointing to a valid document. Rather than having to fill these fields manually, there is a magical script that automatically populates them from a messy [input worksheet](https://docs.google.com/spreadsheets/d/1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI/edit?pli=1#gid=980957638), which contains all kinds of info. The following will execute this script:
4 changes: 4 additions & 0 deletions align_data/settings.py
@@ -1,8 +1,12 @@
 import os
+import logging
 from dotenv import load_dotenv

 load_dotenv()

+LOG_LEVEL = os.environ.get('LOG_LEVEL', 'WARNING').upper()
+logging.basicConfig(level=LOG_LEVEL)
+
 ### CODA ###
 CODA_TOKEN = os.environ.get("CODA_TOKEN")
 CODA_DOC_ID = os.environ.get("CODA_DOC_ID", "fau7sl2hmG")