From ebf34817b3d98b2258f94c88995f19d7c782aaa1 Mon Sep 17 00:00:00 2001
From: Daniel O'Connell
Date: Mon, 7 Aug 2023 21:08:10 +0200
Subject: [PATCH] Fix actions (#111)

* Remove unused datasets
* remove reports
* remove GdocsDataset
* alignment newsletter
* update actions names
* weekly HF sync
* PR changes
---
 .github/workflows/fetch-dataset.yml           |  16 ++-
 .github/workflows/fetch-weekly.yml            |   6 +-
 .../{push-datasets.yml => push-dataset.yml}   |  36 +++--
 .github/workflows/upload-to-huggingface.yml   |  56 ++++++++
 README.md                                     |  22 +++-
 align_data/__init__.py                        |   2 -
 align_data/common/alignment_dataset.py        | 124 ++++++++----------
 align_data/db/models.py                       |  29 +++-
 align_data/db/session.py                      |   2 +-
 .../alignment_newsletter.py                   |  28 ++--
 align_data/sources/articles/__init__.py       |   2 +-
 align_data/sources/articles/datasets.py       |  16 ++-
 .../sources/arxiv_papers/arxiv_papers.py      |   2 +-
 align_data/sources/blogs/__init__.py          |   1 -
 align_data/sources/reports/__init__.py        |   9 --
 align_data/sources/reports/reports.py         |  58 --------
 main.py                                       |   2 +
 tests/align_data/articles/test_datasets.py    |   5 +-
 tests/align_data/test_alignment_newsletter.py |  14 +-
 tests/align_data/test_arxiv.py                |   3 +-
 20 files changed, 227 insertions(+), 206 deletions(-)
 rename .github/workflows/{push-datasets.yml => push-dataset.yml} (76%)
 create mode 100644 .github/workflows/upload-to-huggingface.yml
 delete mode 100644 align_data/sources/reports/__init__.py
 delete mode 100644 align_data/sources/reports/reports.py

diff --git a/.github/workflows/fetch-dataset.yml b/.github/workflows/fetch-dataset.yml
index b896304b..788e24fc 100644
--- a/.github/workflows/fetch-dataset.yml
+++ b/.github/workflows/fetch-dataset.yml
@@ -29,7 +29,6 @@ on:
        options:
          - agentmodels
          - aiimpacts
-          - aipulse
          - aisafety.camp
          - aisafety.info
          - ai_alignment_playlist
@@ -40,6 +39,7 @@ on:
          - alignmentforum
          - alignment_newsletter
          - arbital
+          - arxiv
          - carado.moe
          - cold_takes
          - deepmind_blog
@@ -49,7 +49,6 @@
          - ebooks
          - eleuther.ai
          - gdocs
-          - gdrive_ebooks
          - generative.ink
          - gwern_blog
          - html_articles
@@ -59,14 +58,12 @@
          - markdown
          - miri
          - ml_safety_newsletter
-          - nonarxiv_papers
-          - qualiacomputing
          - openai.research
          - pdfs
-          - reports
          - rob_miles_ai_safety
          - vkrakovna_blog
          - yudkowsky_blog
+          - xmls

jobs:
  build-dataset:
@@ -81,10 +78,17 @@ jobs:
      with:
        python-version: '3.x'

+      - name: Install Pandoc
+        run: |
+          if [ "${{ inputs.datasource }}" = "gdocs" ]; then
+            sudo apt-get update
+            sudo apt-get -y install pandoc
+          fi
+
      - name: Install dependencies
        run: pip install -r requirements.txt

-      - name: Generate dataset file
+      - name: Process dataset
        env:
          CODA_TOKEN: ${{ secrets.CODA_TOKEN || inputs.coda_token }}
          YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY || inputs.youtube_api_key }}
diff --git a/.github/workflows/fetch-weekly.yml b/.github/workflows/fetch-weekly.yml
index 0ff31b9e..4af1fa26 100644
--- a/.github/workflows/fetch-weekly.yml
+++ b/.github/workflows/fetch-weekly.yml
@@ -11,7 +11,6 @@ jobs:
        datasource:
          - agentmodels
          - aiimpacts
-          - aipulse
          - aisafety.camp
          - ai_alignment_playlist
          - ai_explained
@@ -30,7 +29,6 @@ jobs:
          - ebooks
          - eleuther.ai
          - gdocs
-          - gdrive_ebooks
          - generative.ink
          - gwern_blog
          - html_articles
@@ -39,14 +37,12 @@ jobs:
          - markdown
          - miri
          - ml_safety_newsletter
-          - nonarxiv_papers
-          - qualiacomputing
          - openai.research
          - pdfs
-          - reports
          - rob_miles_ai_safety
          - vkrakovna_blog
          - yudkowsky_blog
+          - xmls

    uses: ./.github/workflows/fetch-dataset.yml
    with:
diff --git a/.github/workflows/push-datasets.yml b/.github/workflows/push-dataset.yml
similarity index 76%
rename from .github/workflows/push-datasets.yml
rename to .github/workflows/push-dataset.yml
index 5fd67742..768cf2a6 100644
--- a/.github/workflows/push-datasets.yml
+++ b/.github/workflows/push-dataset.yml
@@ -1,6 +1,23 @@
name: Synch uploaded jsonl files to HuggingFace

on:
+  workflow_call:
+    inputs:
+      datasource:
+        type: string
+        required: true
+      coda_token:
+        type: string
+        required: true
+      db_user:
+        type: string
+        required: true
+      db_password:
+        type: string
+        required: true
+      db_host:
+        type: string
+        required: true
  workflow_dispatch: # allow manual triggering
    inputs:
      datasource:
@@ -8,10 +25,8 @@ on:
        type: choice
        default: all
        options:
-          - all
          - agentmodels
          - aiimpacts
-          - aipulse
          - aisafety.camp
          - aisafety.info
          - ai_alignment_playlist
@@ -20,7 +35,6 @@
          - ai_safety_reading_group
          - ai_tech_tu_delft
          - alignmentforum
-          - alignment_newsletter
          - arbital
          - arxiv
          - carado.moe
@@ -29,10 +43,8 @@
          - deepmind_technical_blog
          - distill
          - eaforum
-          - ebooks
          - eleuther.ai
          - gdocs
-          - gdrive_ebooks
          - generative.ink
          - gwern_blog
          - html_articles
@@ -42,14 +54,12 @@
          - markdown
          - miri
          - ml_safety_newsletter
-          - nonarxiv_papers
-          - qualiacomputing
          - openai.research
          - pdfs
-          - reports
          - rob_miles_ai_safety
          - vkrakovna_blog
          - yudkowsky_blog
+          - xmls

jobs:
  generate-dataset:
@@ -69,11 +79,11 @@ jobs:
      - name: Generate dataset file
        env:
-          CODA_TOKEN: ${{ secrets.CODA_TOKEN }}
-          YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
-          ARD_DB_USER: ${{ secrets.ARD_DB_USER }}
-          ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD }}
-          ARD_DB_HOST: ${{ secrets.ARD_DB_HOST }}
+          CODA_TOKEN: ${{ secrets.CODA_TOKEN || inputs.coda_token }}
+          YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY || inputs.youtube_api_key }}
+          ARD_DB_USER: ${{ secrets.ARD_DB_USER || inputs.db_user }}
+          ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD || inputs.db_password }}
+          ARD_DB_HOST: ${{ secrets.ARD_DB_HOST || inputs.db_host }}
          ARD_DB_NAME: alignment_research_dataset
        run: python main.py generate_jsonl_files ${{ inputs.datasource }}
diff --git a/.github/workflows/upload-to-huggingface.yml b/.github/workflows/upload-to-huggingface.yml
new file mode 100644
index 00000000..eaac2ceb
--- /dev/null
+++ b/.github/workflows/upload-to-huggingface.yml
@@ -0,0 +1,56 @@
+name: Upload datasets to Huggingface
+on:
+  workflow_dispatch: # allow manual triggering
+  schedule:
+    - cron: "0 3 * * 0" # Every Sunday at 3 AM
+
+jobs:
+  update_dateset:
+    strategy:
+      matrix:
+        datasource:
+          - agentmodels
+          - aiimpacts
+          - aisafety.camp
+          - aisafety.info
+          - ai_alignment_playlist
+          - ai_explained
+          - ai_safety_talks
+          - ai_safety_reading_group
+          - ai_tech_tu_delft
+          - alignmentforum
+          - arbital
+          - arxiv
+          - carado.moe
+          - cold_takes
+          - deepmind_blog
+          - deepmind_technical_blog
+          - distill
+          - eaforum
+          - eleuther.ai
+          - gdocs
+          - generative.ink
+          - gwern_blog
+          - html_articles
+          - importai
+          - jsteinhardt_blog
+          - lesswrong
+          - markdown
+          - miri
+          - ml_safety_newsletter
+          - openai.research
+          - pdfs
+          - rob_miles_ai_safety
+          - vkrakovna_blog
+          - yudkowsky_blog
+          - xmls
+
+    uses: ./.github/workflows/push-dataset.yml
+    with:
+      datasource: ${{ matrix.datasource }}
+      coda_token: ${{ inputs.coda_token }}
+      youtube_api_key: ${{ inputs.youtube_api_key }}
+      db_user: ${{ inputs.db_user }}
+      db_password: ${{ inputs.db_password }}
+      db_host: ${{ inputs.db_host }}
+    secrets: inherit
diff --git a/README.md b/README.md
index 3d3974aa..3e820519 100644
--- a/README.md
+++ b/README.md
@@ -10,32 +10,40 @@ The following list of sources may change and items may be renamed:
- [aiimpacts](https://aiimpacts.org/)
- [aisafety.camp](https://aisafety.camp/)
- [aisafety.info](https://aisafety.info/)
+- [ai_alignment_playlist]()
+- [ai_explained](https://www.youtube.com/@ai-explained-)
+- [ai_safety_talks](https://www.youtube.com/@aisafetytalks)
+- [ai_safety_reading_group](https://www.youtube.com/@aisafetyreadinggroup/videos)
+- [ai_tech_tu_delft](https://www.youtube.com/@AiTechTUDelft/)
- [alignmentforum](https://www.alignmentforum.org)
- [alignment_newsletter](https://rohinshah.com/alignment-newsletter/)
- [arbital](https://arbital.com/)
- arxiv - alignment research papers from [arxiv](https://arxiv.org/)
-- audio_transcripts - transcripts from interviews with various researchers and other audio recordings
- [carado.moe](https://carado.moe/)
- [cold_takes](https://www.cold-takes.com/)
- [deepmind_blog](https://deepmindsafetyresearch.medium.com/)
+- [deepmind_technical_blog](https://www.deepmind.com/blog-categories/technical-blogs)
- [distill](https://distill.pub/)
- [eaforum](https://forum.effectivealtruism.org/) - selected posts
-- gdocs
-- gdrive_ebooks - books include [Superintelligence](https://www.goodreads.com/book/show/20527133-superintelligence), [Human Compatible](https://www.goodreads.com/book/show/44767248-human-compatible), [Life 3.0](https://www.goodreads.com/book/show/34272565-life-3-0), [The Precipice](https://www.goodreads.com/book/show/50485582-the-precipice), and others
+- [eleuther.ai](https://blog.eleuther.ai/)
- [generative.ink](https://generative.ink/posts/)
- [gwern_blog](https://gwern.net/)
+- gdocs - various doc files stored on Google drive
+- html_articles - various articles on websites
- [import.ai](https://importai.substack.com)
- [jsteinhardt_blog](https://jsteinhardt.wordpress.com/)
- [lesswrong](https://www.lesswrong.com/) - selected posts
-- markdown.ebooks
+- markdown
- [miri](https://intelligence.org/) - MIRI
- [ml_safety_newsletter](https://newsletter.mlsafety.org)
-- nonarxiv_papers - other alignment research papers
-- [qualiacomputing](https://qualiacomputing.com/)
-- reports
+- [openai.research](https://openai.com/research)
+- pdfs - various pdfs from different places
+- [rob_miles_ai_safety](https://www.youtube.com/@RobertMilesAI)
- [vkrakovna_blog](https://vkrakovna.wordpress.com)
- [waitbutwhy](https://waitbutwhy.com/)
- [yudkowsky_blog](https://www.yudkowsky.net/)
+- xmls - various articles stored as XML files
+

## Keys
diff --git a/align_data/__init__.py b/align_data/__init__.py
index 563c38df..a602f121 100644
--- a/align_data/__init__.py
+++ b/align_data/__init__.py
@@ -3,7 +3,6 @@
import align_data.sources.blogs as blogs
import align_data.sources.ebooks as ebooks
import align_data.sources.arxiv_papers as arxiv_papers
-import align_data.sources.reports as reports
import align_data.sources.greaterwrong as greaterwrong
import align_data.sources.stampy as stampy
import align_data.sources.alignment_newsletter as alignment_newsletter
@@ -16,7 +15,6 @@
    + blogs.BLOG_REGISTRY
    + ebooks.EBOOK_REGISTRY
    + arxiv_papers.ARXIV_REGISTRY
-    + reports.REPORT_REGISTRY
    + greaterwrong.GREATERWRONG_REGISTRY
    + stampy.STAMPY_REGISTRY
    + distill.DISTILL_REGISTRY
diff --git a/align_data/common/alignment_dataset.py b/align_data/common/alignment_dataset.py
index b761181e..c07f2844 100644
--- a/align_data/common/alignment_dataset.py
+++ b/align_data/common/alignment_dataset.py
@@ -1,19 +1,19 @@
+from datetime import datetime
import logging
import time
-import zipfile
from dataclasses import dataclass, field, KW_ONLY
from itertools import islice
from pathlib import Path
-from typing import List
+from typing import Iterable, List, Optional, Set

from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
+from sqlalchemy.orm import joinedload

-import gdown
import jsonlines
import pytz
from dateutil.parser import parse, ParserError
from tqdm import tqdm

-from align_data.db.models import Article
+from align_data.db.models import Article, Summary
from align_data.db.session import make_session
@@ -43,15 +43,7 @@ class AlignmentDataset:
    """The path where data can be found. Usually a folder"""

    done_key = 'id'
-    """The key of the entry to use as the id when checking if already processed."""
-    # Used to extract summaries - if `source_key` is set, the class will be deemed to collect summaries of other
-    # articles.
-    source_key = None
-    """The key of the entry to use as an identifier of the article which it's summarizing - should be an URL"""
-    summary_key = None
-    """The key of the entry containing the summary contents. This is used both to get the summary, but also where
-    it should be put in the target entry."""

    COOLDOWN = 0
    """An optional cool down between processing entries"""
@@ -80,20 +72,29 @@ def __post_init__(self, data_path=Path(__file__).parent / '../../data/'):
        # set the default place to look for data
        self.files_path = self.raw_data_path / self.name

-    def make_data_entry(self, data, **kwargs):
-        data = dict(data, **kwargs)
+    def _add_authors(self, article: Article, authors: List[str]) -> Article:
        # TODO: Don't keep adding the same authors - come up with some way to reuse them
-        # TODO: Prettify this
-        data['authors'] = ','.join(data.get('authors', []))
-        if summary := ('summary' in data and data.pop('summary')):
-            data['summaries'] = [summary]
-        return Article(
+        article.authors = ','.join(authors)
+        if len(article.authors) > 1024:
+            article.authors = ','.join(article.authors[:1024].split(',')[:-1])
+        return article
+
+    def make_data_entry(self, data, **kwargs) -> Article:
+        data = dict(data, **kwargs)
+        summary = data.pop('summary', None)
+        authors = data.pop('authors', [])
+
+        article = Article(
            id_fields=self.id_fields,
            meta={k: v for k, v in data.items() if k not in INIT_DICT},
            **{k: v for k, v in data.items() if k in INIT_DICT},
        )
+        self._add_authors(article, authors)
+        if summary:
+            article.summaries.append(Summary(text=summary, source=self.name))
+        return article

-    def to_jsonl(self, out_path=None, filename=None):
+    def to_jsonl(self, out_path=None, filename=None) -> Path:
        if not out_path:
            out_path=Path(__file__).parent / '../../data/'
@@ -115,6 +116,9 @@ def read_entries(self, sort_by=None):
        for item in session.scalars(query):
            yield item

+    def _add_batch(self, session, batch):
+        session.add_all(batch)
+
    def add_entries(self, entries):
        def commit():
            try:
@@ -126,7 +130,7 @@ def commit():
        with make_session() as session:
            items = iter(entries)
            while batch := tuple(islice(items, self.batch_size)):
-                session.add_all(batch)
+                self._add_batch(session, batch)
                # there might be duplicates in the batch, so if they cause
                # an exception, try to commit them one by one
                if not commit():
@@ -139,7 +143,7 @@ def setup(self):
        self._outputted_items = self._load_outputted_items()

    @property
-    def items_list(self):
+    def items_list(self) -> Iterable:
        """Returns a collection of items to be processed."""
        return []
@@ -150,7 +154,7 @@ def get_item_key(self, item):
        """
        return item.name

-    def _load_outputted_items(self):
+    def _load_outputted_items(self) -> Set[str]:
        """Load the output file (if it exists) in order to know which items have already been output."""
        with make_session() as session:
            if hasattr(Article, self.done_key):
@@ -161,7 +165,7 @@
            # TODO: Properly handle this - it should create a proper SQL JSON select
            return {item.get(self.done_key) for item in session.scalars(select(Article.meta)).all()}

-    def unprocessed_items(self, items=None):
+    def unprocessed_items(self, items=None) -> Iterable:
        """Return a list of all items to be processed.

        This will automatically remove any items that have already been processed,
@@ -178,11 +182,11 @@ def not_processed(item):
        if not self.lazy_eval:
            filtered = list(filtered)

-        return tqdm(filtered, desc=f"Processing {self.name}")
+        return filtered

    def fetch_entries(self):
        """Get all entries to be written to the file."""
-        for item in self.unprocessed_items():
+        for item in tqdm(self.unprocessed_items(), desc=f"Processing {self.name}"):
            entry = self.process_entry(item)
            if not entry:
                continue
@@ -197,10 +201,10 @@ def process_entry(self, entry):
        raise NotImplementedError

    @staticmethod
-    def _format_datetime(date):
+    def _format_datetime(date) -> str:
        return date.strftime("%Y-%m-%dT%H:%M:%SZ")

-    def _get_published_date(self, date):
+    def _get_published_date(self, date) -> Optional[datetime]:
        try:
            # Totally ignore any timezone info, forcing everything to UTC
            return parse(str(date)).replace(tzinfo=pytz.UTC)
@@ -209,52 +213,30 @@ def _get_published_date(self, date):
            return None


-@dataclass
-class GdocDataset(AlignmentDataset):
-    """A base Dataset handler for files that are saved on Gdrive,"""
-
-    gdrive_address: str
-    """The full URL to the gdrive file"""
+class SummaryDataset(AlignmentDataset):

-    glob = '*.md'
-    """How to identify files to be processed when going through a folder for files"""
-
-    @property
-    def items_list(self):
-        """Returns a generator of items to be processed."""
-        return self.files_path.glob(self.glob)
+    def unprocessed_items(self, items=None) -> Iterable:
+        # This breaks the possible lazy loading of the items. Should be fine...
+        items = list(super().unprocessed_items(items))

-    @property
-    def zip_file(self):
-        """The name of the downloaded data, if a zip file."""
-        return self.raw_data_path / f"{self.name}.zip"
-
-    def zip_from_gdrive(self, url=None, filename=None, path=None):
-        """Fetch the data a zip file from Gdrive.
-
-        :param str url: the url to the file. Will use `self.gdrive_address` if empty
-        :param str filename: the name of the zip file. Will use `self.zip_file` if empty
-        :param str path: the path where the zip file should be extracted to. Will use `self.files_path` if empty
-        """
-        filename = filename or self.zip_file
+        urls = map(self.get_item_key, items)
+        with make_session() as session:
+            self.articles = {
+                a.url: a for a in session.query(Article).options(joinedload(Article.summaries)).filter(Article.url.in_(urls))
+                if a.url
+            }

-        with open(filename, 'wb') as output:
-            gdown.download(url=url or self.gdrive_address,
-                           output=output,
-                           quiet=False)
+        return items

-        logger.info("Unzipping")
-        with zipfile.ZipFile(filename, 'r') as zip_ref:
-            zip_ref.extractall(path or self.files_path)
+    def _load_outputted_items(self) -> Set[str]:
+        """Load the output file (if it exists) in order to know which items have already been output."""
+        with make_session() as session:
+            return set(session.scalars(select(Article.url).join(Article.summaries).filter(Summary.source == self.name)))

-    def folder_from_gdrive(self, url=None, output=None):
-        """Download a folder from gdrive.
+    def _add_batch(self, session, batch):
+        def merge(item):
+            if prev := self.articles.get(item.url):
+                return session.merge(item.update(prev))
+            return item

-        :param str url: the url to the file. Will use `self.gdrive_address` if empty
-        :param str output: the path where the folder should be downloaded to. Will use `self.files_path` if empty
-        """
-        gdown.download_folder(
-            url=url or self.gdrive_address,
-            output=str(output or self.files_path),
-            quiet=False
-        )
+        session.add_all(map(merge, batch))
diff --git a/align_data/db/models.py b/align_data/db/models.py
index 378b553a..38402d2f 100644
--- a/align_data/db/models.py
+++ b/align_data/db/models.py
@@ -65,6 +65,21 @@ def verify_id(self):
        id_from_fields = hashlib.md5(id_string).hexdigest()
        assert self.id == id_from_fields, f"Entry id {self.id} does not match id from id_fields, {id_from_fields}"

+    def update(self, other):
+        for field in self.__table__.columns.keys():
+            if field not in ['id', 'hash_id', 'metadata'] and getattr(other, field):
+                setattr(self, field, getattr(other, field))
+        self.meta.update({k: v for k, v in other.meta.items() if k and v})
+
+        if other._id:
+            self._id = other._id
+        self.id = None # update the hash id so it calculates a new one if needed
+        return self
+
+    def _set_id(self):
+        id_string = self.generate_id_string()
+        self.id = hashlib.md5(id_string).hexdigest()
+
    @classmethod
    def before_write(cls, mapper, connection, target):
        target.verify_fields()
@@ -72,12 +87,16 @@ def before_write(cls, mapper, connection, target):
        if target.id:
            target.verify_id()
        else:
-            id_string = target.generate_id_string()
-            target.id = hashlib.md5(id_string).hexdigest()
+            target._set_id()

    def to_dict(self):
        if date := self.date_published:
            date = date.replace(tzinfo=pytz.UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+        authors = []
+        if self.authors and self.authors.strip():
+            authors = [i.strip() for i in self.authors.split(',')]
+
        return {
            'id': self.id,
            'title': self.title,
@@ -86,9 +105,9 @@ def to_dict(self):
            'source_type': self.source_type,
            'text': self.text,
            'date_published': date,
-            'authors': [i.strip() for i in self.authors.split(',')] if self.authors.strip() else [],
-            'summaries': [s.text for s in self.summaries],
-            **self.meta,
+            'authors': authors,
+            'summaries': [s.text for s in (self.summaries or [])],
+            **(self.meta or {}),
        }
diff --git a/align_data/db/session.py b/align_data/db/session.py
index 16ff48e4..c8949f8f 100644
--- a/align_data/db/session.py
+++ b/align_data/db/session.py
@@ -7,7 +7,7 @@
@contextmanager
def make_session(auto_commit=False):
    engine = create_engine(DB_CONNECTION_URI, echo=False)
-    with Session(engine) as session:
+    with Session(engine).no_autoflush as session:
        yield session
        if auto_commit:
            session.commit()
diff --git a/align_data/sources/alignment_newsletter/alignment_newsletter.py b/align_data/sources/alignment_newsletter/alignment_newsletter.py
index 1dc3a670..fa613640 100644
--- a/align_data/sources/alignment_newsletter/alignment_newsletter.py
+++ b/align_data/sources/alignment_newsletter/alignment_newsletter.py
@@ -5,18 +5,15 @@
import pandas as pd

from dataclasses import dataclass
-from align_data.common.alignment_dataset import AlignmentDataset
+from align_data.common.alignment_dataset import SummaryDataset

logger = logging.getLogger(__name__)


@dataclass
-class AlignmentNewsletter(AlignmentDataset):
+class AlignmentNewsletter(SummaryDataset):

-    done_key = "title"
-
-    source_key = 'url'
-    summary_key = 'text'
+    done_key = "url"

    def __post_init__(self, data_path=Path(__file__).parent / '../../../data/'):
        self.data_path = data_path
@@ -28,12 +25,17 @@ def setup(self) -> None:
        self.newsletter_xlsx_path = self.raw_data_path / "alignment_newsletter.xlsx"
        self.df = pd.read_excel(self.newsletter_xlsx_path)

+    @staticmethod
+    def maybe(val):
+        if pd.isna(val):
+            return None
+        return val
+
    def get_item_key(self, row):
-        return row.Title
+        return self.maybe(row.URL)

-    @staticmethod
-    def _get_published_date(year):
-        if not year or pd.isna(year):
+    def _get_published_date(self, year):
+        if not self.maybe(year):
            return None
        return datetime(int(year), 1, 1, tzinfo=timezone.utc)
@@ -47,11 +49,11 @@ def process_entry(self, row):
        converted_with, source_type, venue, newsletter_category, highlight, newsletter_number,
        summarizer, opinion, prerequisites, read_more, title, authors, date_published, text
        """
-        if pd.isna(row.Summary) or not row.Summary:
+        if not self.maybe(row.Summary) or not self.maybe(row.URL):
            return None

        def handle_na(v, cast=None):
-            if not v or pd.isna(v):
+            if not self.maybe(v):
                return ''
            if cast:
                return cast(v)
@@ -73,5 +75,5 @@ def handle_na(v, cast=None):
            "title": handle_na(row.Title, str),
            "authors": [i.strip() for i in str(row.Authors).split(',')],
            "date_published": self._get_published_date(row.Year),
-            "text": handle_na(row.Summary, str),
+            "summary": handle_na(row.Summary, str),
        })
diff --git a/align_data/sources/articles/__init__.py b/align_data/sources/articles/__init__.py
index a6fff663..6775e496 100644
--- a/align_data/sources/articles/__init__.py
+++ b/align_data/sources/articles/__init__.py
@@ -19,7 +19,7 @@
        sheet_id='1800487220'
    ),
    XMLArticles(
-        name='nonarxiv_papers',
+        name='xmls',
        spreadsheet_id='1l3azVJVukGAvZPgg0GyeqiaQe8bEMZvycBJaA8cRXf4',
        sheet_id='823056509'
    ),
diff --git a/align_data/sources/articles/datasets.py b/align_data/sources/articles/datasets.py
index 9c7767af..a6328f42 100644
--- a/align_data/sources/articles/datasets.py
+++ b/align_data/sources/articles/datasets.py
@@ -82,12 +82,16 @@ class PDFArticles(SpreadsheetDataset):
    COOLDOWN = 1
    batch_size = 1

+    def setup(self):
+        super().setup()
+        self.files_path.mkdir(exist_ok=True, parents=True)
+
    def _get_text(self, item):
        url = f'https://drive.google.com/uc?id={item.file_id}'

        filename = self.files_path / f'{item.title}.pdf'
-        download(str(filename), id=item.file_id)
-        return read_pdf(filename)
+        if download(output=str(filename), id=item.file_id):
+            return read_pdf(filename)


class HTMLArticles(SpreadsheetDataset):
@@ -107,6 +111,10 @@ class EbookArticles(SpreadsheetDataset):
    COOLDOWN = 10 # Add a large cooldown, as google complains a lot
    batch_size = 1

+    def setup(self):
+        super().setup()
+        self.files_path.mkdir(exist_ok=True, parents=True)
+
    def _get_text(self, item):
        file_id = item.source_url.split('/')[-2]
        filename = download(output=str(self.files_path / f'{item.title}.epub'), id=file_id)
@@ -136,6 +144,10 @@ class DocArticles(SpreadsheetDataset):

    source_filetype = 'docx'

+    def setup(self):
+        super().setup()
+        self.files_path.mkdir(exist_ok=True, parents=True)
+
    def _get_text(self, item):
        pandoc_path = Path('data/raw/pandoc/pandoc/')
        if pandoc_path.exists():
diff --git a/align_data/sources/arxiv_papers/arxiv_papers.py b/align_data/sources/arxiv_papers/arxiv_papers.py
index d4eef69f..ae9b7cb9 100644
--- a/align_data/sources/arxiv_papers/arxiv_papers.py
+++ b/align_data/sources/arxiv_papers/arxiv_papers.py
@@ -62,7 +62,7 @@ def process_entry(self, item) -> None:
            "authors": authors,
            "date_published": self._get_published_date(self.is_val(item.date_published) or paper.get('date_published')),
"data_last_modified": str(metadata.updated), - "abstract": metadata.summary.replace("\n", " "), + "summary": metadata.summary.replace("\n", " "), "author_comment": metadata.comment, "journal_ref": metadata.journal_ref, "doi": metadata.doi, diff --git a/align_data/sources/blogs/__init__.py b/align_data/sources/blogs/__init__.py index 8f1d5fc1..7021c994 100644 --- a/align_data/sources/blogs/__init__.py +++ b/align_data/sources/blogs/__init__.py @@ -12,7 +12,6 @@ WordpressBlog(name="aisafety.camp", url="https://aisafety.camp"), WordpressBlog(name="miri", url="https://intelligence.org"), WordpressBlog(name="jsteinhardt_blog", url="https://jsteinhardt.wordpress.com"), - WordpressBlog(name="qualiacomputing", url="https://qualiacomputing.com"), WordpressBlog(name="vkrakovna_blog", url="https://vkrakovna.wordpress.com"), WordpressBlog(name="yudkowsky_blog", url="https://yudkowsky.net"), MediumBlog(name="deepmind_blog", url="https://deepmindsafetyresearch.medium.com/", authors=["DeepMind Safety Research"]), diff --git a/align_data/sources/reports/__init__.py b/align_data/sources/reports/__init__.py deleted file mode 100644 index af75ef38..00000000 --- a/align_data/sources/reports/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .reports import Reports - - -REPORT_REGISTRY = [ - Reports( - name='reports', - gdrive_address="https://drive.google.com/uc?id=1TzOyQ_TTc7BptFijJiojPDkA8vDm1laN" - ) -] diff --git a/align_data/sources/reports/reports.py b/align_data/sources/reports/reports.py deleted file mode 100644 index e79a8909..00000000 --- a/align_data/sources/reports/reports.py +++ /dev/null @@ -1,58 +0,0 @@ -from dataclasses import dataclass -from align_data.common.alignment_dataset import GdocDataset -import logging -import grobid_tei_xml - -from datetime import datetime, timezone -from dateutil.parser import parse - -logger = logging.getLogger(__name__) - -@dataclass -class Reports(GdocDataset): - - done_key = "filename" - glob = "*.xml" - - def setup(self): - super().setup() - - logger.info('Fetching data from Gdrive') - self.files_path = self.raw_data_path / 'report_teis' - self.zip_from_gdrive(path=self.raw_data_path) - logger.info('Fetched data') - - @property - def zip_file(self): - return self.raw_data_path / "report_teis.zip" - - @staticmethod - def _get_published_data(doc_dict): - date_str = doc_dict["header"].get('date') - if date_str: - return parse(date_str).astimezone(timezone.utc) - return None - - def process_entry(self, filename): - logger.info(f"Processing {filename.name}") - xml_text = filename.read_text(encoding='utf-8') - try: - doc_dict = grobid_tei_xml.parse_document_xml(xml_text).to_dict() - abstract = doc_dict.get("abstract") - logger.info(f"Doc: {list(doc_dict.keys())}") - return self.make_data_entry({ - "summary": [abstract] if abstract else [], - "authors": [xx["full_name"] for xx in doc_dict["header"]["authors"]], - "title": doc_dict["header"]["title"], - "text": doc_dict["body"], - "source": self.name, - "source_type": "pdf", - "date_published": self._get_published_data(doc_dict), - "url": "", - "filename": filename.name, - }) - except Exception as e: - logger.error(f"Error: {e}") - logger.info('Skipping %s', filename.name) - - return None diff --git a/main.py b/main.py index 4578ffe7..78371d7d 100644 --- a/main.py +++ b/main.py @@ -62,6 +62,8 @@ def generate_jsonl_files(self, *names): :param List[str] names: The names of the datasets to generate """ + if names == ('all',): + names = ALL_DATASETS missing = {name for name in names if name not in ALL_DATASETS} assert not 
        for name in names:
diff --git a/tests/align_data/articles/test_datasets.py b/tests/align_data/articles/test_datasets.py
index 3773539f..48340000 100644
--- a/tests/align_data/articles/test_datasets.py
+++ b/tests/align_data/articles/test_datasets.py
@@ -55,9 +55,10 @@ def test_pdf_articles_get_text():
    dataset = PDFArticles(name='bla', spreadsheet_id='123', sheet_id='456')
    item = Mock(file_id='23423', title='bla bla bla')

-    def check_downloads(filename, id):
-        assert filename == str(dataset.files_path / 'bla bla bla.pdf')
+    def check_downloads(output, id):
+        assert output == str(dataset.files_path / 'bla bla bla.pdf')
        assert id == '23423'
+        return output

    def read_pdf(filename):
        assert filename == dataset.files_path / 'bla bla bla.pdf'
diff --git a/tests/align_data/test_alignment_newsletter.py b/tests/align_data/test_alignment_newsletter.py
index 0e9db7a4..249f77e9 100644
--- a/tests/align_data/test_alignment_newsletter.py
+++ b/tests/align_data/test_alignment_newsletter.py
@@ -19,14 +19,14 @@ def test_xlsx_file_loaded(dataset):

def test_get_item_key(dataset):
    items = list(dataset.items_list)
-    assert dataset.get_item_key(items[0]) == 'Adversarial Examples Are Not Bugs, They Are Features'
+    assert dataset.get_item_key(items[0]) == 'http://gradientscience.org/adv/'


def test_process_entry_no_summary(dataset):
    items = pd.DataFrame([
-        {'Title': 'An item without a summary field'},
-        {'Title': 'An item with a None summary field', 'Summary': None},
-        {'Title': 'An item with an invalid summary field', 'Summary': pd.NA},
+        {'Url': 'http://bla.bla/3', 'Title': 'An item without a summary field'},
+        {'Url': 'http://bla.bla/2', 'Title': 'An item with a None summary field', 'Summary': None},
+        {'Url': 'http://bla.bla/1', 'Title': 'An item with an invalid summary field', 'Summary': pd.NA},
    ])
    for item in items.itertuples():
        assert dataset.process_entry(item) is None
@@ -68,8 +68,7 @@ def test_process_entry(dataset):
        'source': 'text',
        'source_type': 'google-sheets',
        'summarizer': 'Rohin',
-        'summaries': [],
-        'text': (
+        'summaries': [(
            '_Distill published a discussion of this paper. This highlights '
            'section will cover the full discussion; all of these summaries and '
            'opinions are meant to be read together._\n'
@@ -114,8 +113,9 @@ def test_process_entry(dataset):
            'chosen to be y + 1. For both datasets, if you train a new model on '
            'the dataset, you get good performance **on the original test set**, '
            'showing that the "non-robust features" do generalize.'
-        ),
+        )],
        'title': 'Adversarial Examples Are Not Bugs, They Are Features',
        'url': 'http://gradientscience.org/adv/',
        'venue': 'arXiv',
+        'text': None,
    }
diff --git a/tests/align_data/test_arxiv.py b/tests/align_data/test_arxiv.py
index 00b07969..30717d9e 100644
--- a/tests/align_data/test_arxiv.py
+++ b/tests/align_data/test_arxiv.py
@@ -44,7 +44,6 @@ def test_process_entry():
    with patch('align_data.arxiv_papers.arxiv_papers.parse_vanity', return_value=contents):
        with patch('align_data.arxiv_papers.arxiv_papers.arxiv', arxiv):
            assert dataset.process_entry(item).to_dict() == {
-                'abstract': 'abstract bla bla',
                'author_comment': 'no comment',
                'authors': ['mr blobby'],
                'categories': 'wut',
@@ -56,7 +55,7 @@
                'primary_category': 'cat',
                'source': 'asd',
                'source_type': 'html',
-                'summaries': [],
+                'summaries': ['abstract bla bla'],
                'text': 'this is the text',
                'title': 'this is the title',
                'url': 'https://arxiv.org/abs/2001.11038',