Merge remote-tracking branch 'origin/main' into pinecone-updater
henri123lemoine committed Jul 30, 2023
2 parents 9256573 + 3bc0633 commit ba76656
Showing 27 changed files with 202 additions and 368 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/fetch-daily.yml
@@ -16,4 +16,8 @@ jobs:
uses: ./.github/workflows/fetch-dataset.yml
with:
datasource: ${{ matrix.datasource }}
coda_token: ${{ inputs.coda_token }}
db_user: ${{ inputs.db_user }}
db_password: ${{ inputs.db_password }}
db_host: ${{ inputs.db_host }}
secrets: inherit
58 changes: 20 additions & 38 deletions .github/workflows/fetch-dataset.yml
@@ -6,6 +6,18 @@ on:
datasource:
type: string
required: true
coda_token:
type: string
required: true
db_user:
type: string
required: true
db_password:
type: string
required: true
db_host:
type: string
required: true
workflow_dispatch: # allow manual triggering
inputs:
datasource:
@@ -20,7 +32,6 @@ on:
- alignmentforum
- alignment_newsletter
- arbital
- audio_transcripts
- carado.moe
- cold_takes
- deepmind_blog
@@ -37,7 +48,7 @@ on:
- importai
- jsteinhardt_blog
- lesswrong
- markdown.ebooks
- markdown
- miri
- ml_safety_newsletter
- nonarxiv_papers
@@ -64,40 +75,11 @@ jobs:
- name: Install dependencies
run: pip install -r requirements.txt

- name: Fetch dataset
- name: Generate dataset file
env:
CODA_TOKEN: ${{ secrets.CODA_TOKEN }}
run: python main.py fetch ${{ inputs.datasource }} --fetch_prev=True

- name: Upload Artifact
uses: actions/upload-artifact@v3
with:
name: ${{ inputs.datasource }}
path: data/${{ inputs.datasource }}.jsonl
retention-days: 1

upload:
runs-on: ubuntu-latest
needs: build-dataset

if: github.ref == 'refs/heads/main'
steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: Setup Python environment
uses: actions/setup-python@v2
with:
python-version: '3.x'

- name: Setup Huggingface client
run: pip install huggingface_hub gdown jsonlines datasets

- name: Download a single artifact
uses: actions/download-artifact@v3
with:
name: ${{ inputs.datasource }}
path: data/

- name: Upload file
run: python upload_to_huggingface.py ${{ secrets.HUGGINGFACE_TOKEN }} ${{ inputs.datasource }}
CODA_TOKEN: ${{ secrets.CODA_TOKEN || inputs.coda_token }}
ARD_DB_USER: ${{ secrets.ARD_DB_USER || inputs.db_user }}
ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD || inputs.db_password }}
ARD_DB_HOST: ${{ secrets.ARD_DB_HOST || inputs.db_host }}
ARD_DB_NAME: alignment_research_dataset
run: python main.py fetch ${{ inputs.datasource }}
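
With the artifact and Hugging Face upload steps removed, the fetch job now writes straight to a database configured through the ARD_DB_* variables (taken from repository secrets, or from caller inputs when invoked as a reusable workflow). A minimal sketch of how those variables might be turned into a SQLAlchemy engine; the repository's actual connection helper is not part of this diff, and the MySQL dialect is an assumption based on the LONGTEXT column in models.py:

```python
import os
from sqlalchemy import create_engine

# Assumed wiring only: the real connection code lives elsewhere in the repo.
user = os.environ["ARD_DB_USER"]
password = os.environ["ARD_DB_PASSWORD"]
host = os.environ["ARD_DB_HOST"]
db_name = os.environ["ARD_DB_NAME"]

# MySQL is assumed (models.py uses LONGTEXT); the driver could differ.
engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}/{db_name}")
```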
5 changes: 5 additions & 0 deletions .github/workflows/fetch-weekly.yml
@@ -1,5 +1,6 @@
name: Weekly dataset updates
on:
workflow_dispatch: # allow manual triggering
schedule:
- cron: "0 0 * * 0" # Every Sunday at midnight

@@ -45,4 +46,8 @@ jobs:
uses: ./.github/workflows/fetch-dataset.yml
with:
datasource: ${{ matrix.datasource }}
coda_token: ${{ inputs.coda_token }}
db_user: ${{ inputs.db_user }}
db_password: ${{ inputs.db_password }}
db_host: ${{ inputs.db_host }}
secrets: inherit
35 changes: 33 additions & 2 deletions .github/workflows/push-datasets.yml
@@ -18,7 +18,6 @@ on:
- alignment_newsletter
- arbital
- arxiv
- audio_transcripts
- carado.moe
- cold_takes
- deepmind_blog
@@ -35,7 +34,7 @@ on:
- importai
- jsteinhardt_blog
- lesswrong
- markdown.ebooks
- markdown
- miri
- ml_safety_newsletter
- nonarxiv_papers
@@ -47,8 +46,40 @@ on:
- yudkowsky_blog

jobs:
generate-dataset:
runs-on: ubuntu-latest

steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: Setup Python environment
uses: actions/setup-python@v2
with:
python-version: '3.x'

- name: Install dependencies
run: pip install -r requirements.txt

- name: Generate dataset file
env:
CODA_TOKEN: ${{ secrets.CODA_TOKEN }}
ARD_DB_USER: ${{ secrets.ARD_DB_USER }}
ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD }}
ARD_DB_HOST: ${{ secrets.ARD_DB_HOST }}
ARD_DB_NAME: alignment_research_dataset
run: python main.py generate_jsonl_files ${{ inputs.datasource }}

- name: Upload Artifact
uses: actions/upload-artifact@v3
with:
name: ${{ inputs.datasource }}
path: data/${{ inputs.datasource }}.jsonl
retention-days: 1

upload:
runs-on: ubuntu-latest
needs: generate-dataset

steps:
- name: Checkout repository
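
The new generate-dataset job exports the JSONL file from the database and hands it to a separate upload job via a short-lived artifact; the upload steps (truncated above) presumably mirror the ones removed from fetch-dataset.yml, ending with upload_to_huggingface.py. A hedged sketch of what that upload amounts to, using huggingface_hub directly; the repo_id and function name below are assumptions, not taken from this commit:

```python
from huggingface_hub import HfApi

def upload_dataset(token: str, datasource: str) -> None:
    """Push the generated JSONL to a Hugging Face dataset repo (illustrative only)."""
    api = HfApi(token=token)
    api.upload_file(
        path_or_fileobj=f"data/{datasource}.jsonl",
        path_in_repo=f"{datasource}.jsonl",
        repo_id="StampyAI/alignment-research-dataset",  # assumed repo name
        repo_type="dataset",
    )
```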
2 changes: 0 additions & 2 deletions align_data/__init__.py
@@ -6,7 +6,6 @@
import align_data.sources.reports as reports
import align_data.sources.greaterwrong as greaterwrong
import align_data.sources.stampy as stampy
import align_data.sources.audio_transcripts as audio_transcripts
import align_data.sources.alignment_newsletter as alignment_newsletter
import align_data.sources.distill as distill
import align_data.sources.gdocs as gdocs
@@ -20,7 +19,6 @@
+ reports.REPORT_REGISTRY
+ greaterwrong.GREATERWRONG_REGISTRY
+ stampy.STAMPY_REGISTRY
+ audio_transcripts.AUDIO_TRANSCRIPTS_REGISTRY
+ distill.DISTILL_REGISTRY
+ alignment_newsletter.ALIGNMENT_NEWSLETTER_REGISTRY
+ gdocs.GDOCS_REGISTRY
31 changes: 18 additions & 13 deletions align_data/common/alignment_dataset.py
@@ -53,14 +53,13 @@ class AlignmentDataset:
"""The key of the entry containing the summary contents. This is used both to get the summary, but also where
it should be put in the target entry."""

glob = '*.md'
"""How to identify files to be processed when going through a folder for files"""

COOLDOWN = 0
"""An optional cool down between processing entries"""

lazy_eval = False
"""Whether to lazy fetch items. This is nice in that it will start processing, but messes up the progress bar."""
batch_size = 20
"""The number of items to collect before flushing to the database."""

# Internal housekeeping variables
_entry_idx = 0
@@ -80,8 +79,6 @@ def __post_init__(self, data_path=Path(__file__).parent / '../../data/'):

# set the default place to look for data
self.files_path = self.raw_data_path / self.name
# TODO: get rid of self.jsonl_path
self.jsonl_path = self.data_path / f"{self.name}.jsonl"

def make_data_entry(self, data, **kwargs):
data = dict(data, **kwargs)
@@ -102,10 +99,12 @@ def to_jsonl(self, out_path=None, filename=None):

if not filename:
filename = f"{self.name}.jsonl"
filename = Path(out_path) / filename

with jsonlines.open(Path(out_path) / filename, 'w') as jsonl_writer:
with jsonlines.open(filename, 'w') as jsonl_writer:
for article in self.read_entries():
jsonl_writer.write(article.to_dict())
return filename.resolve()

def read_entries(self, sort_by=None):
"""Iterate through all the saved entries."""
@@ -125,8 +124,9 @@ def commit():
session.rollback()

with make_session() as session:
while batch := tuple(islice(entries, 20)):
session.add_all(entries)
items = iter(entries)
while batch := tuple(islice(items, self.batch_size)):
session.add_all(batch)
# there might be duplicates in the batch, so if they cause
# an exception, try to commit them one by one
if not commit():
@@ -136,15 +136,12 @@ def commit():
logger.error(f'found duplicate of {entry}')

def setup(self):
# make sure the path to the raw data exists
self.files_path.mkdir(parents=True, exist_ok=True)

self._outputted_items = self._load_outputted_items()

@property
def items_list(self):
"""Returns a generator of items to be processed."""
return self.files_path.glob(self.glob)
return []

def get_item_key(self, item):
"""Get the identifier of the given `item` so it can be checked to see whether it's been output.
@@ -159,7 +156,7 @@ def _load_outputted_items(self):
if hasattr(Article, self.done_key):
return set(session.scalars(select(getattr(Article, self.done_key)).where(Article.source==self.name)).all())
# TODO: Properly handle this - it should create a proper SQL JSON select
return {getattr(item, self.done_key) for item in session.scalars(select(Article.meta).where(Article.source==self.name)).all()}
return {item.get(self.done_key) for item in session.scalars(select(Article.meta).where(Article.source==self.name)).all()}

def unprocessed_items(self, items=None):
"""Return a list of all items to be processed.
@@ -216,6 +213,14 @@ class GdocDataset(AlignmentDataset):
gdrive_address: str
"""The full URL to the gdrive file"""

glob = '*.md'
"""How to identify files to be processed when going through a folder for files"""

@property
def items_list(self):
"""Returns a generator of items to be processed."""
return self.files_path.glob(self.glob)

@property
def zip_file(self):
"""The name of the downloaded data, if a zip file."""
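
alignment_dataset.py drops the per-dataset JSONL plumbing: items_list now defaults to an empty list, the glob and folder-scanning behaviour moves down into GdocDataset, and add_entries flushes to the database in batches of batch_size instead of a hard-coded 20 (also fixing the bug where the whole entries iterable was re-added on every pass). The batching pattern in isolation, as a standalone sketch rather than code from this commit:

```python
from itertools import islice

def batched(entries, batch_size=20):
    """Consume any iterable lazily and yield tuples of at most batch_size items,
    mirroring the while/islice loop in add_entries."""
    items = iter(entries)
    while batch := tuple(islice(items, batch_size)):
        yield batch

# list(batched(range(7), 3)) -> [(0, 1, 2), (3, 4, 5), (6,)]
```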
1 change: 1 addition & 0 deletions align_data/db/models.py
@@ -34,6 +34,7 @@ class Article(Base):
source_type: Mapped[Optional[str]] = mapped_column(String(128))
authors: Mapped[str] = mapped_column(String(1024))
text: Mapped[Optional[str]] = mapped_column(LONGTEXT)
confidence: Mapped[Optional[float]] # Describes the confidence in how good this article is, as a value <0, 1>
date_published: Mapped[Optional[datetime]]
meta: Mapped[Optional[JSON]] = mapped_column(JSON, name='metadata', default='{}')
date_created: Mapped[datetime] = mapped_column(DateTime, default=func.now())
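
Article gains an optional confidence column describing how good the article is, as a float between 0 and 1. A hedged example of how it could be queried with the select style already used in alignment_dataset.py; the source filter and threshold are illustrative, not part of this commit:

```python
from sqlalchemy import select

# Illustrative only: fetch reasonably trusted arxiv articles.
high_confidence = select(Article).where(
    Article.source == 'arxiv',
    Article.confidence >= 0.8,
)
```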
@@ -1,6 +1,7 @@
# %%
import logging
from datetime import datetime, timezone
from pathlib import Path
import pandas as pd

from dataclasses import dataclass
@@ -17,8 +18,13 @@ class AlignmentNewsletter(AlignmentDataset):
source_key = 'url'
summary_key = 'text'

def __post_init__(self, data_path=Path(__file__).parent / '../../../data/'):
self.data_path = data_path
self.raw_data_path = self.data_path / 'raw'

def setup(self) -> None:
super().setup()

self.newsletter_xlsx_path = self.raw_data_path / "alignment_newsletter.xlsx"
self.df = pd.read_excel(self.newsletter_xlsx_path)

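AlignmentNewsletter now resolves data_path and raw_data_path itself in __post_init__ (the base class no longer provides a jsonl_path), and setup loads the spreadsheet into a DataFrame. A hypothetical usage sketch, with constructor arguments assumed from the registry entries elsewhere in the repo:

```python
# Assumed instantiation; the real registry entry may pass more arguments.
dataset = AlignmentNewsletter(name='alignment_newsletter')
dataset.setup()                         # reads raw_data_path / "alignment_newsletter.xlsx" into dataset.df
print(dataset.df.columns.tolist())      # inspect the spreadsheet's columns
```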
9 changes: 8 additions & 1 deletion align_data/sources/articles/__init__.py
@@ -1,4 +1,6 @@
from align_data.sources.articles.datasets import PDFArticles, HTMLArticles, EbookArticles, XMLArticles
from align_data.sources.articles.datasets import (
PDFArticles, HTMLArticles, EbookArticles, XMLArticles, MarkdownArticles
)

ARTICLES_REGISTRY = [
PDFArticles(
@@ -21,4 +23,9 @@
spreadsheet_id='1l3azVJVukGAvZPgg0GyeqiaQe8bEMZvycBJaA8cRXf4',
sheet_id='823056509'
),
MarkdownArticles(
name='markdown',
spreadsheet_id='1l3azVJVukGAvZPgg0GyeqiaQe8bEMZvycBJaA8cRXf4',
sheet_id='1003473759'
),
]
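
MarkdownArticles replaces the old markdown.ebooks source (hence the datasource renames in the workflow files above) and is read from another sheet of the same Google spreadsheet. Registered datasets are addressed by name, e.g. `python main.py fetch markdown`; a small illustrative lookup against the registry:

```python
from align_data.sources.articles import ARTICLES_REGISTRY

# Illustrative only: the normal entry point is `python main.py fetch markdown`.
markdown_dataset = next(d for d in ARTICLES_REGISTRY if d.name == 'markdown')
```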