Merge remote-tracking branch 'origin/main' into pinecone-updater
henri123lemoine committed Jul 30, 2023
2 parents 9256573 + 3bc0633 commit ba76656
Showing 27 changed files with 202 additions and 368 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/fetch-daily.yml
@@ -16,4 +16,8 @@ jobs:
uses: ./.github/workflows/fetch-dataset.yml
with:
datasource: ${{ matrix.datasource }}
coda_token: ${{ inputs.coda_token }}
db_user: ${{ inputs.db_user }}
db_password: ${{ inputs.db_password }}
db_host: ${{ inputs.db_host }}
secrets: inherit
58 changes: 20 additions & 38 deletions .github/workflows/fetch-dataset.yml
@@ -6,6 +6,18 @@ on:
datasource:
type: string
required: true
coda_token:
type: string
required: true
db_user:
type: string
required: true
db_password:
type: string
required: true
db_host:
type: string
required: true
workflow_dispatch: # allow manual triggering
inputs:
datasource:
@@ -20,7 +32,6 @@ on:
- alignmentforum
- alignment_newsletter
- arbital
- audio_transcripts
- carado.moe
- cold_takes
- deepmind_blog
@@ -37,7 +48,7 @@ on:
- importai
- jsteinhardt_blog
- lesswrong
- markdown.ebooks
- markdown
- miri
- ml_safety_newsletter
- nonarxiv_papers
@@ -64,40 +75,11 @@ jobs:
- name: Install dependencies
run: pip install -r requirements.txt

- name: Fetch dataset
- name: Generate dataset file
env:
CODA_TOKEN: ${{ secrets.CODA_TOKEN }}
run: python main.py fetch ${{ inputs.datasource }} --fetch_prev=True

- name: Upload Artifact
uses: actions/upload-artifact@v3
with:
name: ${{ inputs.datasource }}
path: data/${{ inputs.datasource }}.jsonl
retention-days: 1

upload:
runs-on: ubuntu-latest
needs: build-dataset

if: github.ref == 'refs/heads/main'
steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: Setup Python environment
uses: actions/setup-python@v2
with:
python-version: '3.x'

- name: Setup Huggingface client
run: pip install huggingface_hub gdown jsonlines datasets

- name: Download a single artifact
uses: actions/download-artifact@v3
with:
name: ${{ inputs.datasource }}
path: data/

- name: Upload file
run: python upload_to_huggingface.py ${{ secrets.HUGGINGFACE_TOKEN }} ${{ inputs.datasource }}
CODA_TOKEN: ${{ secrets.CODA_TOKEN || inputs.coda_token }}
ARD_DB_USER: ${{ secrets.ARD_DB_USER || inputs.db_user }}
ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD || inputs.db_password }}
ARD_DB_HOST: ${{ secrets.ARD_DB_HOST || inputs.db_host }}
ARD_DB_NAME: alignment_research_dataset
run: python main.py fetch ${{ inputs.datasource }}
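
With the artifact and Hugging Face upload steps removed, the fetch job now writes straight to a database configured through the ARD_DB_* variables (taken from repository secrets, or from caller inputs when invoked as a reusable workflow). A minimal sketch of how those variables might be turned into a SQLAlchemy engine; the repository's actual connection helper is not part of this diff, and the MySQL dialect is an assumption based on the LONGTEXT column in models.py:

```python
import os
from sqlalchemy import create_engine

# Assumed wiring only: the real connection code lives elsewhere in the repo.
user = os.environ["ARD_DB_USER"]
password = os.environ["ARD_DB_PASSWORD"]
host = os.environ["ARD_DB_HOST"]
db_name = os.environ["ARD_DB_NAME"]

# MySQL is assumed (models.py uses LONGTEXT); the driver could differ.
engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}/{db_name}")
```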
5 changes: 5 additions & 0 deletions .github/workflows/fetch-weekly.yml
@@ -1,5 +1,6 @@
name: Weekly dataset updates
on:
workflow_dispatch: # allow manual triggering
schedule:
- cron: "0 0 * * 0" # Every Sunday at midnight

@@ -45,4 +46,8 @@ jobs:
uses: ./.github/workflows/fetch-dataset.yml
with:
datasource: ${{ matrix.datasource }}
coda_token: ${{ inputs.coda_token }}
db_user: ${{ inputs.db_user }}
db_password: ${{ inputs.db_password }}
db_host: ${{ inputs.db_host }}
secrets: inherit
35 changes: 33 additions & 2 deletions .github/workflows/push-datasets.yml
@@ -18,7 +18,6 @@ on:
- alignment_newsletter
- arbital
- arxiv
- audio_transcripts
- carado.moe
- cold_takes
- deepmind_blog
@@ -35,7 +34,7 @@ on:
- importai
- jsteinhardt_blog
- lesswrong
- markdown.ebooks
- markdown
- miri
- ml_safety_newsletter
- nonarxiv_papers
@@ -47,8 +46,40 @@ on:
- yudkowsky_blog

jobs:
generate-dataset:
runs-on: ubuntu-latest

steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: Setup Python environment
uses: actions/setup-python@v2
with:
python-version: '3.x'

- name: Install dependencies
run: pip install -r requirements.txt

- name: Generate dataset file
env:
CODA_TOKEN: ${{ secrets.CODA_TOKEN }}
ARD_DB_USER: ${{ secrets.ARD_DB_USER }}
ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD }}
ARD_DB_HOST: ${{ secrets.ARD_DB_HOST }}
ARD_DB_NAME: alignment_research_dataset
run: python main.py generate_jsonl_files ${{ inputs.datasource }}

- name: Upload Artifact
uses: actions/upload-artifact@v3
with:
name: ${{ inputs.datasource }}
path: data/${{ inputs.datasource }}.jsonl
retention-days: 1

upload:
runs-on: ubuntu-latest
needs: generate-dataset

steps:
- name: Checkout repository
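
The new generate-dataset job exports the JSONL file from the database and hands it to a separate upload job via a short-lived artifact; the upload steps (truncated above) presumably mirror the ones removed from fetch-dataset.yml, ending with upload_to_huggingface.py. A hedged sketch of what that upload amounts to, using huggingface_hub directly; the repo_id and function name below are assumptions, not taken from this commit:

```python
from huggingface_hub import HfApi

def upload_dataset(token: str, datasource: str) -> None:
    """Push the generated JSONL to a Hugging Face dataset repo (illustrative only)."""
    api = HfApi(token=token)
    api.upload_file(
        path_or_fileobj=f"data/{datasource}.jsonl",
        path_in_repo=f"{datasource}.jsonl",
        repo_id="StampyAI/alignment-research-dataset",  # assumed repo name
        repo_type="dataset",
    )
```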
2 changes: 0 additions & 2 deletions align_data/__init__.py
@@ -6,7 +6,6 @@
import align_data.sources.reports as reports
import align_data.sources.greaterwrong as greaterwrong
import align_data.sources.stampy as stampy
import align_data.sources.audio_transcripts as audio_transcripts
import align_data.sources.alignment_newsletter as alignment_newsletter
import align_data.sources.distill as distill
import align_data.sources.gdocs as gdocs
@@ -20,7 +19,6 @@
+ reports.REPORT_REGISTRY
+ greaterwrong.GREATERWRONG_REGISTRY
+ stampy.STAMPY_REGISTRY
+ audio_transcripts.AUDIO_TRANSCRIPTS_REGISTRY
+ distill.DISTILL_REGISTRY
+ alignment_newsletter.ALIGNMENT_NEWSLETTER_REGISTRY
+ gdocs.GDOCS_REGISTRY
31 changes: 18 additions & 13 deletions align_data/common/alignment_dataset.py
@@ -53,14 +53,13 @@ class AlignmentDataset:
"""The key of the entry containing the summary contents. This is used both to get the summary, but also where
it should be put in the target entry."""

glob = '*.md'
"""How to identify files to be processed when going through a folder for files"""

COOLDOWN = 0
"""An optional cool down between processing entries"""

lazy_eval = False
"""Whether to lazy fetch items. This is nice in that it will start processing, but messes up the progress bar."""
batch_size = 20
"""The number of items to collect before flushing to the database."""

# Internal housekeeping variables
_entry_idx = 0
@@ -80,8 +79,6 @@ def __post_init__(self, data_path=Path(__file__).parent / '../../data/'):

# set the default place to look for data
self.files_path = self.raw_data_path / self.name
# TODO: get rid of self.jsonl_path
self.jsonl_path = self.data_path / f"{self.name}.jsonl"

def make_data_entry(self, data, **kwargs):
data = dict(data, **kwargs)
@@ -102,10 +99,12 @@ def to_jsonl(self, out_path=None, filename=None):

if not filename:
filename = f"{self.name}.jsonl"
filename = Path(out_path) / filename

with jsonlines.open(Path(out_path) / filename, 'w') as jsonl_writer:
with jsonlines.open(filename, 'w') as jsonl_writer:
for article in self.read_entries():
jsonl_writer.write(article.to_dict())
return filename.resolve()

def read_entries(self, sort_by=None):
"""Iterate through all the saved entries."""
@@ -125,8 +124,9 @@ def commit():
session.rollback()

with make_session() as session:
while batch := tuple(islice(entries, 20)):
session.add_all(entries)
items = iter(entries)
while batch := tuple(islice(items, self.batch_size)):
session.add_all(batch)
# there might be duplicates in the batch, so if they cause
# an exception, try to commit them one by one
if not commit():
@@ -136,15 +136,12 @@ def commit():
logger.error(f'found duplicate of {entry}')

def setup(self):
# make sure the path to the raw data exists
self.files_path.mkdir(parents=True, exist_ok=True)

self._outputted_items = self._load_outputted_items()

@property
def items_list(self):
"""Returns a generator of items to be processed."""
return self.files_path.glob(self.glob)
return []

def get_item_key(self, item):
"""Get the identifier of the given `item` so it can be checked to see whether it's been output.
@@ -159,7 +156,7 @@ def _load_outputted_items(self):
if hasattr(Article, self.done_key):
return set(session.scalars(select(getattr(Article, self.done_key)).where(Article.source==self.name)).all())
# TODO: Properly handle this - it should create a proper SQL JSON select
return {getattr(item, self.done_key) for item in session.scalars(select(Article.meta).where(Article.source==self.name)).all()}
return {item.get(self.done_key) for item in session.scalars(select(Article.meta).where(Article.source==self.name)).all()}

def unprocessed_items(self, items=None):
"""Return a list of all items to be processed.
@@ -216,6 +213,14 @@ class GdocDataset(AlignmentDataset):
gdrive_address: str
"""The full URL to the gdrive file"""

glob = '*.md'
"""How to identify files to be processed when going through a folder for files"""

@property
def items_list(self):
"""Returns a generator of items to be processed."""
return self.files_path.glob(self.glob)

@property
def zip_file(self):
"""The name of the downloaded data, if a zip file."""
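
alignment_dataset.py drops the per-dataset JSONL plumbing: items_list now defaults to an empty list, the glob and folder-scanning behaviour moves down into GdocDataset, and add_entries flushes to the database in batches of batch_size instead of a hard-coded 20 (also fixing the bug where the whole entries iterable was re-added on every pass). The batching pattern in isolation, as a standalone sketch rather than code from this commit:

```python
from itertools import islice

def batched(entries, batch_size=20):
    """Consume any iterable lazily and yield tuples of at most batch_size items,
    mirroring the while/islice loop in add_entries."""
    items = iter(entries)
    while batch := tuple(islice(items, batch_size)):
        yield batch

# list(batched(range(7), 3)) -> [(0, 1, 2), (3, 4, 5), (6,)]
```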
1 change: 1 addition & 0 deletions align_data/db/models.py
@@ -34,6 +34,7 @@ class Article(Base):
source_type: Mapped[Optional[str]] = mapped_column(String(128))
authors: Mapped[str] = mapped_column(String(1024))
text: Mapped[Optional[str]] = mapped_column(LONGTEXT)
confidence: Mapped[Optional[float]] # Describes the confidence in how good this article is, as a value <0, 1>
date_published: Mapped[Optional[datetime]]
meta: Mapped[Optional[JSON]] = mapped_column(JSON, name='metadata', default='{}')
date_created: Mapped[datetime] = mapped_column(DateTime, default=func.now())
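
Article gains an optional confidence column describing how good the article is, as a float between 0 and 1. A hedged example of how it could be queried with the select style already used in alignment_dataset.py; the source filter and threshold are illustrative, not part of this commit:

```python
from sqlalchemy import select

# Illustrative only: fetch reasonably trusted arxiv articles.
high_confidence = select(Article).where(
    Article.source == 'arxiv',
    Article.confidence >= 0.8,
)
```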
@@ -1,6 +1,7 @@
# %%
import logging
from datetime import datetime, timezone
from pathlib import Path
import pandas as pd

from dataclasses import dataclass
@@ -17,8 +18,13 @@ class AlignmentNewsletter(AlignmentDataset):
source_key = 'url'
summary_key = 'text'

def __post_init__(self, data_path=Path(__file__).parent / '../../../data/'):
self.data_path = data_path
self.raw_data_path = self.data_path / 'raw'

def setup(self) -> None:
super().setup()

self.newsletter_xlsx_path = self.raw_data_path / "alignment_newsletter.xlsx"
self.df = pd.read_excel(self.newsletter_xlsx_path)

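AlignmentNewsletter now resolves data_path and raw_data_path itself in __post_init__ (the base class no longer provides a jsonl_path), and setup loads the spreadsheet into a DataFrame. A hypothetical usage sketch, with constructor arguments assumed from the registry entries elsewhere in the repo:

```python
# Assumed instantiation; the real registry entry may pass more arguments.
dataset = AlignmentNewsletter(name='alignment_newsletter')
dataset.setup()                         # reads raw_data_path / "alignment_newsletter.xlsx" into dataset.df
print(dataset.df.columns.tolist())      # inspect the spreadsheet's columns
```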
9 changes: 8 additions & 1 deletion align_data/sources/articles/__init__.py
@@ -1,4 +1,6 @@
from align_data.sources.articles.datasets import PDFArticles, HTMLArticles, EbookArticles, XMLArticles
from align_data.sources.articles.datasets import (
PDFArticles, HTMLArticles, EbookArticles, XMLArticles, MarkdownArticles
)

ARTICLES_REGISTRY = [
PDFArticles(
@@ -21,4 +23,9 @@
spreadsheet_id='1l3azVJVukGAvZPgg0GyeqiaQe8bEMZvycBJaA8cRXf4',
sheet_id='823056509'
),
MarkdownArticles(
name='markdown',
spreadsheet_id='1l3azVJVukGAvZPgg0GyeqiaQe8bEMZvycBJaA8cRXf4',
sheet_id='1003473759'
),
]
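
MarkdownArticles replaces the old markdown.ebooks source (hence the datasource renames in the workflow files above) and is read from another sheet of the same Google spreadsheet. Registered datasets are addressed by name, e.g. `python main.py fetch markdown`; a small illustrative lookup against the registry:

```python
from align_data.sources.articles import ARTICLES_REGISTRY

# Illustrative only: the normal entry point is `python main.py fetch markdown`.
markdown_dataset = next(d for d in ARTICLES_REGISTRY if d.name == 'markdown')
```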