From ebf34817b3d98b2258f94c88995f19d7c782aaa1 Mon Sep 17 00:00:00 2001
From: Daniel O'Connell
Date: Mon, 7 Aug 2023 21:08:10 +0200
Subject: [PATCH] Fix actions (#111)

* Remove unused datasets
* remove reports
* remove GdocsDataset
* alignment newsletter
* update actions names
* weekly HF sync
* PR changes
---
 .github/workflows/fetch-dataset.yml           |  16 ++-
 .github/workflows/fetch-weekly.yml            |   6 +-
 .../{push-datasets.yml => push-dataset.yml}   |  36 +++--
 .github/workflows/upload-to-huggingface.yml   |  56 ++++++++
 README.md                                     |  22 +++-
 align_data/__init__.py                        |   2 -
 align_data/common/alignment_dataset.py        | 124 ++++++++----------
 align_data/db/models.py                       |  29 +++-
 align_data/db/session.py                      |   2 +-
 .../alignment_newsletter.py                   |  28 ++--
 align_data/sources/articles/__init__.py       |   2 +-
 align_data/sources/articles/datasets.py       |  16 ++-
 .../sources/arxiv_papers/arxiv_papers.py      |   2 +-
 align_data/sources/blogs/__init__.py          |   1 -
 align_data/sources/reports/__init__.py        |   9 --
 align_data/sources/reports/reports.py         |  58 --------
 main.py                                       |   2 +
 tests/align_data/articles/test_datasets.py    |   5 +-
 tests/align_data/test_alignment_newsletter.py |  14 +-
 tests/align_data/test_arxiv.py                |   3 +-
 20 files changed, 227 insertions(+), 206 deletions(-)
 rename .github/workflows/{push-datasets.yml => push-dataset.yml} (76%)
 create mode 100644 .github/workflows/upload-to-huggingface.yml
 delete mode 100644 align_data/sources/reports/__init__.py
 delete mode 100644 align_data/sources/reports/reports.py

diff --git a/.github/workflows/fetch-dataset.yml b/.github/workflows/fetch-dataset.yml
index b896304b..788e24fc 100644
--- a/.github/workflows/fetch-dataset.yml
+++ b/.github/workflows/fetch-dataset.yml
@@ -29,7 +29,6 @@ on:
        options:
          - agentmodels
          - aiimpacts
-          - aipulse
          - aisafety.camp
          - aisafety.info
          - ai_alignment_playlist
@@ -40,6 +39,7 @@ on:
          - alignmentforum
          - alignment_newsletter
          - arbital
+          - arxiv
          - carado.moe
          - cold_takes
          - deepmind_blog
@@ -49,7 +49,6 @@
          - ebooks
          - eleuther.ai
          - gdocs
-          - gdrive_ebooks
          - generative.ink
          - gwern_blog
          - html_articles
@@ -59,14 +58,12 @@
          - markdown
          - miri
          - ml_safety_newsletter
-          - nonarxiv_papers
-          - qualiacomputing
          - openai.research
          - pdfs
-          - reports
          - rob_miles_ai_safety
          - vkrakovna_blog
          - yudkowsky_blog
+          - xmls

jobs:
  build-dataset:
@@ -81,10 +78,17 @@ jobs:
      with:
        python-version: '3.x'

+      - name: Install Pandoc
+        run: |
+          if [ "${{ inputs.datasource }}" = "gdocs" ]; then
+            sudo apt-get update
+            sudo apt-get -y install pandoc
+          fi
+
      - name: Install dependencies
        run: pip install -r requirements.txt

-      - name: Generate dataset file
+      - name: Process dataset
        env:
          CODA_TOKEN: ${{ secrets.CODA_TOKEN || inputs.coda_token }}
          YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY || inputs.youtube_api_key }}
diff --git a/.github/workflows/fetch-weekly.yml b/.github/workflows/fetch-weekly.yml
index 0ff31b9e..4af1fa26 100644
--- a/.github/workflows/fetch-weekly.yml
+++ b/.github/workflows/fetch-weekly.yml
@@ -11,7 +11,6 @@ jobs:
        datasource:
          - agentmodels
          - aiimpacts
-          - aipulse
          - aisafety.camp
          - ai_alignment_playlist
          - ai_explained
@@ -30,7 +29,6 @@ jobs:
          - ebooks
          - eleuther.ai
          - gdocs
-          - gdrive_ebooks
          - generative.ink
          - gwern_blog
          - html_articles
@@ -39,14 +37,12 @@ jobs:
          - markdown
          - miri
          - ml_safety_newsletter
-          - nonarxiv_papers
-          - qualiacomputing
          - openai.research
          - pdfs
-          - reports
          - rob_miles_ai_safety
          - vkrakovna_blog
          - yudkowsky_blog
+          - xmls

    uses: ./.github/workflows/fetch-dataset.yml
    with:
diff --git a/.github/workflows/push-datasets.yml b/.github/workflows/push-dataset.yml
similarity index 76%
rename from .github/workflows/push-datasets.yml
rename to .github/workflows/push-dataset.yml
index 5fd67742..768cf2a6 100644
--- a/.github/workflows/push-datasets.yml
+++ b/.github/workflows/push-dataset.yml
@@ -1,6 +1,23 @@
name: Synch uploaded jsonl files to HuggingFace

on:
+  workflow_call:
+    inputs:
+      datasource:
+        type: string
+        required: true
+      coda_token:
+        type: string
+        required: true
+      db_user:
+        type: string
+        required: true
+      db_password:
+        type: string
+        required: true
+      db_host:
+        type: string
+        required: true
  workflow_dispatch: # allow manual triggering
    inputs:
      datasource:
@@ -8,10 +25,8 @@ on:
        type: choice
        default: all
        options:
-          - all
          - agentmodels
          - aiimpacts
-          - aipulse
          - aisafety.camp
          - aisafety.info
          - ai_alignment_playlist
@@ -20,7 +35,6 @@
          - ai_safety_reading_group
          - ai_tech_tu_delft
          - alignmentforum
-          - alignment_newsletter
          - arbital
          - arxiv
          - carado.moe
@@ -29,10 +43,8 @@
          - deepmind_technical_blog
          - distill
          - eaforum
-          - ebooks
          - eleuther.ai
          - gdocs
-          - gdrive_ebooks
          - generative.ink
          - gwern_blog
          - html_articles
@@ -42,14 +54,12 @@
          - markdown
          - miri
          - ml_safety_newsletter
-          - nonarxiv_papers
-          - qualiacomputing
          - openai.research
          - pdfs
-          - reports
          - rob_miles_ai_safety
          - vkrakovna_blog
          - yudkowsky_blog
+          - xmls

jobs:
  generate-dataset:
@@ -69,11 +79,11 @@ jobs:
      - name: Generate dataset file
        env:
-          CODA_TOKEN: ${{ secrets.CODA_TOKEN }}
-          YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
-          ARD_DB_USER: ${{ secrets.ARD_DB_USER }}
-          ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD }}
-          ARD_DB_HOST: ${{ secrets.ARD_DB_HOST }}
+          CODA_TOKEN: ${{ secrets.CODA_TOKEN || inputs.coda_token }}
+          YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY || inputs.youtube_api_key }}
+          ARD_DB_USER: ${{ secrets.ARD_DB_USER || inputs.db_user }}
+          ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD || inputs.db_password }}
+          ARD_DB_HOST: ${{ secrets.ARD_DB_HOST || inputs.db_host }}
          ARD_DB_NAME: alignment_research_dataset
        run: python main.py generate_jsonl_files ${{ inputs.datasource }}
diff --git a/.github/workflows/upload-to-huggingface.yml b/.github/workflows/upload-to-huggingface.yml
new file mode 100644
index 00000000..eaac2ceb
--- /dev/null
+++ b/.github/workflows/upload-to-huggingface.yml
@@ -0,0 +1,56 @@
+name: Upload datasets to Huggingface
+on:
+  workflow_dispatch: # allow manual triggering
+  schedule:
+    - cron: "0 3 * * 0" # Every Sunday at 3 AM
+
+jobs:
+  update_dateset:
+    strategy:
+      matrix:
+        datasource:
+          - agentmodels
+          - aiimpacts
+          - aisafety.camp
+          - aisafety.info
+          - ai_alignment_playlist
+          - ai_explained
+          - ai_safety_talks
+          - ai_safety_reading_group
+          - ai_tech_tu_delft
+          - alignmentforum
+          - arbital
+          - arxiv
+          - carado.moe
+          - cold_takes
+          - deepmind_blog
+          - deepmind_technical_blog
+          - distill
+          - eaforum
+          - eleuther.ai
+          - gdocs
+          - generative.ink
+          - gwern_blog
+          - html_articles
+          - importai
+          - jsteinhardt_blog
+          - lesswrong
+          - markdown
+          - miri
+          - ml_safety_newsletter
+          - openai.research
+          - pdfs
+          - rob_miles_ai_safety
+          - vkrakovna_blog
+          - yudkowsky_blog
+          - xmls
+
+    uses: ./.github/workflows/push-dataset.yml
+    with:
+      datasource: ${{ matrix.datasource }}
+      coda_token: ${{ inputs.coda_token }}
+      youtube_api_key: ${{ inputs.youtube_api_key }}
+      db_user: ${{ inputs.db_user }}
+      db_password: ${{ inputs.db_password }}
+      db_host: ${{ inputs.db_host }}
+    secrets: inherit
diff --git a/README.md b/README.md
index 3d3974aa..3e820519 100644
--- a/README.md
+++ b/README.md
@@ -10,32 +10,40 @@ The following list of sources may change and items may be renamed:
- [aiimpacts](https://aiimpacts.org/)
- [aisafety.camp](https://aisafety.camp/)
- [aisafety.info](https://aisafety.info/)
+- [ai_alignment_playlist]()
+- [ai_explained](https://www.youtube.com/@ai-explained-)
+- [ai_safety_talks](https://www.youtube.com/@aisafetytalks)
+- [ai_safety_reading_group](https://www.youtube.com/@aisafetyreadinggroup/videos)
+- [ai_tech_tu_delft](https://www.youtube.com/@AiTechTUDelft/)
- [alignmentforum](https://www.alignmentforum.org)
- [alignment_newsletter](https://rohinshah.com/alignment-newsletter/)
- [arbital](https://arbital.com/)
- arxiv - alignment research papers from [arxiv](https://arxiv.org/)
-- audio_transcripts - transcripts from interviews with various researchers and other audio recordings
- [carado.moe](https://carado.moe/)
- [cold_takes](https://www.cold-takes.com/)
- [deepmind_blog](https://deepmindsafetyresearch.medium.com/)
+- [deepmind_technical_blog](https://www.deepmind.com/blog-categories/technical-blogs)
- [distill](https://distill.pub/)
- [eaforum](https://forum.effectivealtruism.org/) - selected posts
-- gdocs
-- gdrive_ebooks - books include [Superintelligence](https://www.goodreads.com/book/show/20527133-superintelligence), [Human Compatible](https://www.goodreads.com/book/show/44767248-human-compatible), [Life 3.0](https://www.goodreads.com/book/show/34272565-life-3-0), [The Precipice](https://www.goodreads.com/book/show/50485582-the-precipice), and others
+- [eleuther.ai](https://blog.eleuther.ai/)
- [generative.ink](https://generative.ink/posts/)
- [gwern_blog](https://gwern.net/)
+- gdocs - various doc files stored on Google drive
+- html_articles - various articles on websites
- [import.ai](https://importai.substack.com)
- [jsteinhardt_blog](https://jsteinhardt.wordpress.com/)
- [lesswrong](https://www.lesswrong.com/) - selected posts
-- markdown.ebooks
+- markdown
- [miri](https://intelligence.org/) - MIRI
- [ml_safety_newsletter](https://newsletter.mlsafety.org)
-- nonarxiv_papers - other alignment research papers
-- [qualiacomputing](https://qualiacomputing.com/)
-- reports
+- [openai.research](https://openai.com/research)
+- pdfs - various pdfs from different places
+- [rob_miles_ai_safety](https://www.youtube.com/@RobertMilesAI)
- [vkrakovna_blog](https://vkrakovna.wordpress.com)
- [waitbutwhy](https://waitbutwhy.com/)
- [yudkowsky_blog](https://www.yudkowsky.net/)
+- xmls - various articles stored as XML files
+

## Keys
diff --git a/align_data/__init__.py b/align_data/__init__.py
index 563c38df..a602f121 100644
--- a/align_data/__init__.py
+++ b/align_data/__init__.py
@@ -3,7 +3,6 @@
import align_data.sources.blogs as blogs
import align_data.sources.ebooks as ebooks
import align_data.sources.arxiv_papers as arxiv_papers
-import align_data.sources.reports as reports
import align_data.sources.greaterwrong as greaterwrong
import align_data.sources.stampy as stampy
import align_data.sources.alignment_newsletter as alignment_newsletter
@@ -16,7 +15,6 @@
    + blogs.BLOG_REGISTRY
    + ebooks.EBOOK_REGISTRY
    + arxiv_papers.ARXIV_REGISTRY
-    + reports.REPORT_REGISTRY
    + greaterwrong.GREATERWRONG_REGISTRY
    + stampy.STAMPY_REGISTRY
    + distill.DISTILL_REGISTRY
diff --git a/align_data/common/alignment_dataset.py b/align_data/common/alignment_dataset.py
index b761181e..c07f2844 100644
--- a/align_data/common/alignment_dataset.py
+++ b/align_data/common/alignment_dataset.py
@@ -1,19 +1,19 @@
+from datetime import datetime
import logging
import time
-import zipfile
from dataclasses import dataclass, field, KW_ONLY
from itertools import islice
from pathlib import Path
-from typing import List
+from typing import Iterable, List, Optional, Set

from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
+from sqlalchemy.orm import joinedload

-import gdown
import jsonlines
import pytz
from dateutil.parser import parse, ParserError
from tqdm import tqdm

-from align_data.db.models import Article
+from align_data.db.models import Article, Summary
from align_data.db.session import make_session
@@ -43,15 +43,7 @@ class AlignmentDataset:
    """The path where data can be found. Usually a folder"""

    done_key = 'id'
-    """The key of the entry to use as the id when checking if already processed."""
-    # Used to extract summaries - if `source_key` is set, the class will be deemed to collect summaries of other
-    # articles.
-    source_key = None
-    """The key of the entry to use as an identifier of the article which it's summarizing - should be an URL"""
-    summary_key = None
-    """The key of the entry containing the summary contents. This is used both to get the summary, but also where
-    it should be put in the target entry."""

    COOLDOWN = 0
    """An optional cool down between processing entries"""
@@ -80,20 +72,29 @@ def __post_init__(self, data_path=Path(__file__).parent / '../../data/'):
        # set the default place to look for data
        self.files_path = self.raw_data_path / self.name

-    def make_data_entry(self, data, **kwargs):
-        data = dict(data, **kwargs)
+    def _add_authors(self, article: Article, authors: List[str]) -> Article:
        # TODO: Don't keep adding the same authors - come up with some way to reuse them
-        # TODO: Prettify this
-        data['authors'] = ','.join(data.get('authors', []))
-        if summary := ('summary' in data and data.pop('summary')):
-            data['summaries'] = [summary]
-        return Article(
+        article.authors = ','.join(authors)
+        if len(article.authors) > 1024:
+            article.authors = ','.join(article.authors[:1024].split(',')[:-1])
+        return article
+
+    def make_data_entry(self, data, **kwargs) -> Article:
+        data = dict(data, **kwargs)
+        summary = data.pop('summary', None)
+        authors = data.pop('authors', [])
+
+        article = Article(
            id_fields=self.id_fields,
            meta={k: v for k, v in data.items() if k not in INIT_DICT},
            **{k: v for k, v in data.items() if k in INIT_DICT},
        )
+        self._add_authors(article, authors)
+        if summary:
+            article.summaries.append(Summary(text=summary, source=self.name))
+        return article

-    def to_jsonl(self, out_path=None, filename=None):
+    def to_jsonl(self, out_path=None, filename=None) -> Path:
        if not out_path:
            out_path=Path(__file__).parent / '../../data/'
@@ -115,6 +116,9 @@ def read_entries(self, sort_by=None):
        for item in session.scalars(query):
            yield item

+    def _add_batch(self, session, batch):
+        session.add_all(batch)
+
    def add_entries(self, entries):
        def commit():
            try:
@@ -126,7 +130,7 @@ def commit():
        with make_session() as session:
            items = iter(entries)
            while batch := tuple(islice(items, self.batch_size)):
-                session.add_all(batch)
+                self._add_batch(session, batch)
                # there might be duplicates in the batch, so if they cause
                # an exception, try to commit them one by one
                if not commit():
@@ -139,7 +143,7 @@ def setup(self):
        self._outputted_items = self._load_outputted_items()

    @property
-    def items_list(self):
+    def items_list(self) -> Iterable:
        """Returns a collection of items to be processed."""
        return []
@@ -150,7 +154,7 @@ def get_item_key(self, item):
        """
        return item.name

-    def _load_outputted_items(self):
+    def _load_outputted_items(self) -> Set[str]:
        """Load the output file (if it exists) in order to know which items have already been output."""
        with make_session() as session:
            if hasattr(Article, self.done_key):
@@ -161,7 +165,7 @@
            # TODO: Properly handle this - it should create a proper SQL JSON select
            return {item.get(self.done_key) for item in session.scalars(select(Article.meta)).all()}

-    def unprocessed_items(self, items=None):
+    def unprocessed_items(self, items=None) -> Iterable:
        """Return a list of all items to be processed.

        This will automatically remove any items that have already been processed,
@@ -178,11 +182,11 @@ def not_processed(item):
        if not self.lazy_eval:
            filtered = list(filtered)

-        return tqdm(filtered, desc=f"Processing {self.name}")
+        return filtered

    def fetch_entries(self):
        """Get all entries to be written to the file."""
-        for item in self.unprocessed_items():
+        for item in tqdm(self.unprocessed_items(), desc=f"Processing {self.name}"):
            entry = self.process_entry(item)
            if not entry:
                continue
@@ -197,10 +201,10 @@ def process_entry(self, entry):
        raise NotImplementedError

    @staticmethod
-    def _format_datetime(date):
+    def _format_datetime(date) -> str:
        return date.strftime("%Y-%m-%dT%H:%M:%SZ")

-    def _get_published_date(self, date):
+    def _get_published_date(self, date) -> Optional[datetime]:
        try:
            # Totally ignore any timezone info, forcing everything to UTC
            return parse(str(date)).replace(tzinfo=pytz.UTC)
@@ -209,52 +213,30 @@ def _get_published_date(self, date):
            return None


-@dataclass
-class GdocDataset(AlignmentDataset):
-    """A base Dataset handler for files that are saved on Gdrive,"""
-
-    gdrive_address: str
-    """The full URL to the gdrive file"""
+class SummaryDataset(AlignmentDataset):

-    glob = '*.md'
-    """How to identify files to be processed when going through a folder for files"""
-
-    @property
-    def items_list(self):
-        """Returns a generator of items to be processed."""
-        return self.files_path.glob(self.glob)
+    def unprocessed_items(self, items=None) -> Iterable:
+        # This breaks the possible lazy loading of the items. Should be fine...
+        items = list(super().unprocessed_items(items))

-    @property
-    def zip_file(self):
-        """The name of the downloaded data, if a zip file."""
-        return self.raw_data_path / f"{self.name}.zip"
-
-    def zip_from_gdrive(self, url=None, filename=None, path=None):
-        """Fetch the data a zip file from Gdrive.
-
-        :param str url: the url to the file. Will use `self.gdrive_address` if empty
-        :param str filename: the name of the zip file. Will use `self.zip_file` if empty
-        :param str path: the path where the zip file should be extracted to. Will use `self.files_path` if empty
-        """
-        filename = filename or self.zip_file
+        urls = map(self.get_item_key, items)
+        with make_session() as session:
+            self.articles = {
+                a.url: a for a in session.query(Article).options(joinedload(Article.summaries)).filter(Article.url.in_(urls))
+                if a.url
+            }

-        with open(filename, 'wb') as output:
-            gdown.download(url=url or self.gdrive_address,
-                           output=output,
-                           quiet=False)
+        return items

-        logger.info("Unzipping")
-        with zipfile.ZipFile(filename, 'r') as zip_ref:
-            zip_ref.extractall(path or self.files_path)
+    def _load_outputted_items(self) -> Set[str]:
+        """Load the output file (if it exists) in order to know which items have already been output."""
+        with make_session() as session:
+            return set(session.scalars(select(Article.url).join(Article.summaries).filter(Summary.source == self.name)))

-    def folder_from_gdrive(self, url=None, output=None):
-        """Download a folder from gdrive.
+    def _add_batch(self, session, batch):
+        def merge(item):
+            if prev := self.articles.get(item.url):
+                return session.merge(item.update(prev))
+            return item

-        :param str url: the url to the file. Will use `self.gdrive_address` if empty
-        :param str output: the path where the folder should be downloaded to. Will use `self.files_path` if empty
-        """
-        gdown.download_folder(
-            url=url or self.gdrive_address,
-            output=str(output or self.files_path),
-            quiet=False
-        )
+        session.add_all(map(merge, batch))
diff --git a/align_data/db/models.py b/align_data/db/models.py
index 378b553a..38402d2f 100644
--- a/align_data/db/models.py
+++ b/align_data/db/models.py
@@ -65,6 +65,21 @@ def verify_id(self):
        id_from_fields = hashlib.md5(id_string).hexdigest()
        assert self.id == id_from_fields, f"Entry id {self.id} does not match id from id_fields, {id_from_fields}"

+    def update(self, other):
+        for field in self.__table__.columns.keys():
+            if field not in ['id', 'hash_id', 'metadata'] and getattr(other, field):
+                setattr(self, field, getattr(other, field))
+        self.meta.update({k: v for k, v in other.meta.items() if k and v})
+
+        if other._id:
+            self._id = other._id
+        self.id = None # update the hash id so it calculates a new one if needed
+        return self
+
+    def _set_id(self):
+        id_string = self.generate_id_string()
+        self.id = hashlib.md5(id_string).hexdigest()
+
    @classmethod
    def before_write(cls, mapper, connection, target):
        target.verify_fields()
@@ -72,12 +87,16 @@ def before_write(cls, mapper, connection, target):
        if target.id:
            target.verify_id()
        else:
-            id_string = target.generate_id_string()
-            target.id = hashlib.md5(id_string).hexdigest()
+            target._set_id()

    def to_dict(self):
        if date := self.date_published:
            date = date.replace(tzinfo=pytz.UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+        authors = []
+        if self.authors and self.authors.strip():
+            authors = [i.strip() for i in self.authors.split(',')]
+
        return {
            'id': self.id,
            'title': self.title,
@@ -86,9 +105,9 @@ def to_dict(self):
            'source_type': self.source_type,
            'text': self.text,
            'date_published': date,
-            'authors': [i.strip() for i in self.authors.split(',')] if self.authors.strip() else [],
-            'summaries': [s.text for s in self.summaries],
-            **self.meta,
+            'authors': authors,
+            'summaries': [s.text for s in (self.summaries or [])],
+            **(self.meta or {}),
        }
diff --git a/align_data/db/session.py b/align_data/db/session.py
index 16ff48e4..c8949f8f 100644
--- a/align_data/db/session.py
+++ b/align_data/db/session.py
@@ -7,7 +7,7 @@
@contextmanager
def make_session(auto_commit=False):
    engine = create_engine(DB_CONNECTION_URI, echo=False)
-    with Session(engine) as session:
+    with Session(engine).no_autoflush as session:
        yield session
        if auto_commit:
            session.commit()
diff --git a/align_data/sources/alignment_newsletter/alignment_newsletter.py b/align_data/sources/alignment_newsletter/alignment_newsletter.py
index 1dc3a670..fa613640 100644
--- a/align_data/sources/alignment_newsletter/alignment_newsletter.py
+++ b/align_data/sources/alignment_newsletter/alignment_newsletter.py
@@ -5,18 +5,15 @@
import pandas as pd

from dataclasses import dataclass
-from align_data.common.alignment_dataset import AlignmentDataset
+from align_data.common.alignment_dataset import SummaryDataset

logger = logging.getLogger(__name__)


@dataclass
-class AlignmentNewsletter(AlignmentDataset):
+class AlignmentNewsletter(SummaryDataset):

-    done_key = "title"
-
-    source_key = 'url'
-    summary_key = 'text'
+    done_key = "url"

    def __post_init__(self, data_path=Path(__file__).parent / '../../../data/'):
        self.data_path = data_path
@@ -28,12 +25,17 @@ def setup(self) -> None:
        self.newsletter_xlsx_path = self.raw_data_path / "alignment_newsletter.xlsx"
        self.df = pd.read_excel(self.newsletter_xlsx_path)

+    @staticmethod
+    def maybe(val):
+        if pd.isna(val):
+            return None
+        return val
+
    def get_item_key(self, row):
-        return row.Title
+        return self.maybe(row.URL)

-    @staticmethod
-    def _get_published_date(year):
-        if not year or pd.isna(year):
+    def _get_published_date(self, year):
+        if not self.maybe(year):
            return None
        return datetime(int(year), 1, 1, tzinfo=timezone.utc)
@@ -47,11 +49,11 @@ def process_entry(self, row):
        converted_with, source_type, venue, newsletter_category, highlight, newsletter_number,
        summarizer, opinion, prerequisites, read_more, title, authors, date_published, text
        """
-        if pd.isna(row.Summary) or not row.Summary:
+        if not self.maybe(row.Summary) or not self.maybe(row.URL):
            return None

        def handle_na(v, cast=None):
-            if not v or pd.isna(v):
+            if not self.maybe(v):
                return ''
            if cast:
                return cast(v)
@@ -73,5 +75,5 @@ def handle_na(v, cast=None):
            "title": handle_na(row.Title, str),
            "authors": [i.strip() for i in str(row.Authors).split(',')],
            "date_published": self._get_published_date(row.Year),
-            "text": handle_na(row.Summary, str),
+            "summary": handle_na(row.Summary, str),
        })
diff --git a/align_data/sources/articles/__init__.py b/align_data/sources/articles/__init__.py
index a6fff663..6775e496 100644
--- a/align_data/sources/articles/__init__.py
+++ b/align_data/sources/articles/__init__.py
@@ -19,7 +19,7 @@
        sheet_id='1800487220'
    ),
    XMLArticles(
-        name='nonarxiv_papers',
+        name='xmls',
        spreadsheet_id='1l3azVJVukGAvZPgg0GyeqiaQe8bEMZvycBJaA8cRXf4',
        sheet_id='823056509'
    ),
diff --git a/align_data/sources/articles/datasets.py b/align_data/sources/articles/datasets.py
index 9c7767af..a6328f42 100644
--- a/align_data/sources/articles/datasets.py
+++ b/align_data/sources/articles/datasets.py
@@ -82,12 +82,16 @@ class PDFArticles(SpreadsheetDataset):
    COOLDOWN = 1
    batch_size = 1

+    def setup(self):
+        super().setup()
+        self.files_path.mkdir(exist_ok=True, parents=True)
+
    def _get_text(self, item):
        url = f'https://drive.google.com/uc?id={item.file_id}'

        filename = self.files_path / f'{item.title}.pdf'
-        download(str(filename), id=item.file_id)
-        return read_pdf(filename)
+        if download(output=str(filename), id=item.file_id):
+            return read_pdf(filename)


class HTMLArticles(SpreadsheetDataset):
@@ -107,6 +111,10 @@ class EbookArticles(SpreadsheetDataset):
    COOLDOWN = 10 # Add a large cooldown, as google complains a lot
    batch_size = 1

+    def setup(self):
+        super().setup()
+        self.files_path.mkdir(exist_ok=True, parents=True)
+
    def _get_text(self, item):
        file_id = item.source_url.split('/')[-2]
        filename = download(output=str(self.files_path / f'{item.title}.epub'), id=file_id)
@@ -136,6 +144,10 @@ class DocArticles(SpreadsheetDataset):

    source_filetype = 'docx'

+    def setup(self):
+        super().setup()
+        self.files_path.mkdir(exist_ok=True, parents=True)
+
    def _get_text(self, item):
        pandoc_path = Path('data/raw/pandoc/pandoc/')
        if pandoc_path.exists():
diff --git a/align_data/sources/arxiv_papers/arxiv_papers.py b/align_data/sources/arxiv_papers/arxiv_papers.py
index d4eef69f..ae9b7cb9 100644
--- a/align_data/sources/arxiv_papers/arxiv_papers.py
+++ b/align_data/sources/arxiv_papers/arxiv_papers.py
@@ -62,7 +62,7 @@ def process_entry(self, item) -> None:
            "authors": authors,
            "date_published": self._get_published_date(self.is_val(item.date_published) or paper.get('date_published')),
"data_last_modified": str(metadata.updated), - "abstract": metadata.summary.replace("\n", " "), + "summary": metadata.summary.replace("\n", " "), "author_comment": metadata.comment, "journal_ref": metadata.journal_ref, "doi": metadata.doi, diff --git a/align_data/sources/blogs/__init__.py b/align_data/sources/blogs/__init__.py index 8f1d5fc1..7021c994 100644 --- a/align_data/sources/blogs/__init__.py +++ b/align_data/sources/blogs/__init__.py @@ -12,7 +12,6 @@ WordpressBlog(name="aisafety.camp", url="https://aisafety.camp"), WordpressBlog(name="miri", url="https://intelligence.org"), WordpressBlog(name="jsteinhardt_blog", url="https://jsteinhardt.wordpress.com"), - WordpressBlog(name="qualiacomputing", url="https://qualiacomputing.com"), WordpressBlog(name="vkrakovna_blog", url="https://vkrakovna.wordpress.com"), WordpressBlog(name="yudkowsky_blog", url="https://yudkowsky.net"), MediumBlog(name="deepmind_blog", url="https://deepmindsafetyresearch.medium.com/", authors=["DeepMind Safety Research"]), diff --git a/align_data/sources/reports/__init__.py b/align_data/sources/reports/__init__.py deleted file mode 100644 index af75ef38..00000000 --- a/align_data/sources/reports/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .reports import Reports - - -REPORT_REGISTRY = [ - Reports( - name='reports', - gdrive_address="https://drive.google.com/uc?id=1TzOyQ_TTc7BptFijJiojPDkA8vDm1laN" - ) -] diff --git a/align_data/sources/reports/reports.py b/align_data/sources/reports/reports.py deleted file mode 100644 index e79a8909..00000000 --- a/align_data/sources/reports/reports.py +++ /dev/null @@ -1,58 +0,0 @@ -from dataclasses import dataclass -from align_data.common.alignment_dataset import GdocDataset -import logging -import grobid_tei_xml - -from datetime import datetime, timezone -from dateutil.parser import parse - -logger = logging.getLogger(__name__) - -@dataclass -class Reports(GdocDataset): - - done_key = "filename" - glob = "*.xml" - - def setup(self): - super().setup() - - logger.info('Fetching data from Gdrive') - self.files_path = self.raw_data_path / 'report_teis' - self.zip_from_gdrive(path=self.raw_data_path) - logger.info('Fetched data') - - @property - def zip_file(self): - return self.raw_data_path / "report_teis.zip" - - @staticmethod - def _get_published_data(doc_dict): - date_str = doc_dict["header"].get('date') - if date_str: - return parse(date_str).astimezone(timezone.utc) - return None - - def process_entry(self, filename): - logger.info(f"Processing {filename.name}") - xml_text = filename.read_text(encoding='utf-8') - try: - doc_dict = grobid_tei_xml.parse_document_xml(xml_text).to_dict() - abstract = doc_dict.get("abstract") - logger.info(f"Doc: {list(doc_dict.keys())}") - return self.make_data_entry({ - "summary": [abstract] if abstract else [], - "authors": [xx["full_name"] for xx in doc_dict["header"]["authors"]], - "title": doc_dict["header"]["title"], - "text": doc_dict["body"], - "source": self.name, - "source_type": "pdf", - "date_published": self._get_published_data(doc_dict), - "url": "", - "filename": filename.name, - }) - except Exception as e: - logger.error(f"Error: {e}") - logger.info('Skipping %s', filename.name) - - return None diff --git a/main.py b/main.py index 4578ffe7..78371d7d 100644 --- a/main.py +++ b/main.py @@ -62,6 +62,8 @@ def generate_jsonl_files(self, *names): :param List[str] names: The names of the datasets to generate """ + if names == ('all',): + names = ALL_DATASETS missing = {name for name in names if name not in ALL_DATASETS} assert not 
        for name in names:
diff --git a/tests/align_data/articles/test_datasets.py b/tests/align_data/articles/test_datasets.py
index 3773539f..48340000 100644
--- a/tests/align_data/articles/test_datasets.py
+++ b/tests/align_data/articles/test_datasets.py
@@ -55,9 +55,10 @@ def test_pdf_articles_get_text():
    dataset = PDFArticles(name='bla', spreadsheet_id='123', sheet_id='456')
    item = Mock(file_id='23423', title='bla bla bla')

-    def check_downloads(filename, id):
-        assert filename == str(dataset.files_path / 'bla bla bla.pdf')
+    def check_downloads(output, id):
+        assert output == str(dataset.files_path / 'bla bla bla.pdf')
        assert id == '23423'
+        return output

    def read_pdf(filename):
        assert filename == dataset.files_path / 'bla bla bla.pdf'
diff --git a/tests/align_data/test_alignment_newsletter.py b/tests/align_data/test_alignment_newsletter.py
index 0e9db7a4..249f77e9 100644
--- a/tests/align_data/test_alignment_newsletter.py
+++ b/tests/align_data/test_alignment_newsletter.py
@@ -19,14 +19,14 @@ def test_xlsx_file_loaded(dataset):

def test_get_item_key(dataset):
    items = list(dataset.items_list)
-    assert dataset.get_item_key(items[0]) == 'Adversarial Examples Are Not Bugs, They Are Features'
+    assert dataset.get_item_key(items[0]) == 'http://gradientscience.org/adv/'


def test_process_entry_no_summary(dataset):
    items = pd.DataFrame([
-        {'Title': 'An item without a summary field'},
-        {'Title': 'An item with a None summary field', 'Summary': None},
-        {'Title': 'An item with an invalid summary field', 'Summary': pd.NA},
+        {'Url': 'http://bla.bla/3', 'Title': 'An item without a summary field'},
+        {'Url': 'http://bla.bla/2', 'Title': 'An item with a None summary field', 'Summary': None},
+        {'Url': 'http://bla.bla/1', 'Title': 'An item with an invalid summary field', 'Summary': pd.NA},
    ])
    for item in items.itertuples():
        assert dataset.process_entry(item) is None
@@ -68,8 +68,7 @@ def test_process_entry(dataset):
        'source': 'text',
        'source_type': 'google-sheets',
        'summarizer': 'Rohin',
-        'summaries': [],
-        'text': (
+        'summaries': [(
            '_Distill published a discussion of this paper. This highlights '
            'section will cover the full discussion; all of these summaries and '
            'opinions are meant to be read together._\n'
@@ -114,8 +113,9 @@ def test_process_entry(dataset):
            'chosen to be y + 1. For both datasets, if you train a new model on '
            'the dataset, you get good performance **on the original test set**, '
            'showing that the "non-robust features" do generalize.'
-        ),
+        )],
        'title': 'Adversarial Examples Are Not Bugs, They Are Features',
        'url': 'http://gradientscience.org/adv/',
        'venue': 'arXiv',
+        'text': None,
    }
diff --git a/tests/align_data/test_arxiv.py b/tests/align_data/test_arxiv.py
index 00b07969..30717d9e 100644
--- a/tests/align_data/test_arxiv.py
+++ b/tests/align_data/test_arxiv.py
@@ -44,7 +44,6 @@ def test_process_entry():
    with patch('align_data.arxiv_papers.arxiv_papers.parse_vanity', return_value=contents):
        with patch('align_data.arxiv_papers.arxiv_papers.arxiv', arxiv):
            assert dataset.process_entry(item).to_dict() == {
-                'abstract': 'abstract bla bla',
                'author_comment': 'no comment',
                'authors': ['mr blobby'],
                'categories': 'wut',
@@ -56,7 +55,7 @@
                'primary_category': 'cat',
                'source': 'asd',
                'source_type': 'html',
-                'summaries': [],
+                'summaries': ['abstract bla bla'],
                'text': 'this is the text',
                'title': 'this is the title',
                'url': 'https://arxiv.org/abs/2001.11038',