Handle Arxiv links in special docs (#131)
* Handle Arxiv links in special docs

* Handle retracted arxiv articles

* rename Retracted -> Withdrawn
mruwnik authored Aug 14, 2023
1 parent 188c827 commit e47d8b0
Showing 13 changed files with 341 additions and 154 deletions.
2 changes: 0 additions & 2 deletions align_data/__init__.py
@@ -2,7 +2,6 @@
import align_data.sources.articles as articles
import align_data.sources.blogs as blogs
import align_data.sources.ebooks as ebooks
import align_data.sources.arxiv_papers as arxiv_papers
import align_data.sources.greaterwrong as greaterwrong
import align_data.sources.stampy as stampy
import align_data.sources.alignment_newsletter as alignment_newsletter
@@ -14,7 +13,6 @@
+ articles.ARTICLES_REGISTRY
+ blogs.BLOG_REGISTRY
+ ebooks.EBOOK_REGISTRY
+ arxiv_papers.ARXIV_REGISTRY
+ greaterwrong.GREATERWRONG_REGISTRY
+ stampy.STAMPY_REGISTRY
+ distill.DISTILL_REGISTRY
5 changes: 3 additions & 2 deletions align_data/common/alignment_dataset.py
@@ -209,15 +209,16 @@ def fetch_entries(self):
if self.COOLDOWN:
time.sleep(self.COOLDOWN)

def process_entry(self, entry):
def process_entry(self, entry) -> Optional[Article]:
"""Process a single entry."""
raise NotImplementedError

@staticmethod
def _format_datetime(date) -> str:
return date.strftime("%Y-%m-%dT%H:%M:%SZ")

def _get_published_date(self, date) -> Optional[datetime]:
@staticmethod
def _get_published_date(date) -> Optional[datetime]:
try:
# Totally ignore any timezone info, forcing everything to UTC
return parse(str(date)).replace(tzinfo=pytz.UTC)
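Because _get_published_date is now a @staticmethod, it can be called straight off the class, which is what ArxivPapers.get_contents does further down via cls._get_published_date. A minimal sketch of that usage (the date string is made up):

from align_data.common.alignment_dataset import AlignmentDataset

# Any timezone info in the input is discarded and the result is pinned to UTC,
# as the comment in the diff above describes.
dt = AlignmentDataset._get_published_date("14 Aug 2023 10:00 +0200")
print(dt)  # 2023-08-14 10:00:00+00:00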
8 changes: 7 additions & 1 deletion align_data/sources/articles/__init__.py
@@ -1,5 +1,6 @@
from align_data.sources.articles.datasets import (
EbookArticles, DocArticles, HTMLArticles, MarkdownArticles, PDFArticles, SpecialDocs, XMLArticles
ArxivPapers, EbookArticles, DocArticles, HTMLArticles,
MarkdownArticles, PDFArticles, SpecialDocs, XMLArticles
)
from align_data.sources.articles.indices import IndicesDataset

@@ -39,5 +40,10 @@
spreadsheet_id='1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI',
sheet_id='980957638',
),
ArxivPapers(
name="arxiv",
spreadsheet_id="1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI",
sheet_id="655836697",
),
IndicesDataset('indices'),
]
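With arxiv now registered here as an ordinary spreadsheet-backed dataset (and the standalone ARXIV_REGISTRY deleted, see the last file below), it can be driven like any other entry in this list. A hedged sketch, assuming the list above is the ARTICLES_REGISTRY imported in align_data/__init__.py and that fetch_entries() from alignment_dataset.py is the entry point:

from align_data.sources.articles import ARTICLES_REGISTRY

for dataset in ARTICLES_REGISTRY:
    if dataset.name == "arxiv":  # the ArxivPapers instance registered above
        for entry in dataset.fetch_entries():
            print(entry)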
2 changes: 1 addition & 1 deletion align_data/sources/articles/articles.py
@@ -65,7 +65,7 @@ def process_row(row, sheets):
row.set_status(error)
return

data_source = contents.get("data_source")
data_source = contents.get("source_type")
if data_source not in sheets:
error = "Unhandled data type"
logger.error(error)
46 changes: 41 additions & 5 deletions align_data/sources/articles/datasets.py
@@ -2,6 +2,7 @@
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Dict
from urllib.parse import urlparse

import pandas as pd
@@ -13,8 +14,11 @@
from align_data.common.alignment_dataset import AlignmentDataset
from align_data.db.models import Article
from align_data.sources.articles.google_cloud import fetch_file, fetch_markdown
from align_data.sources.articles.parsers import HTML_PARSERS, extract_gdrive_contents, item_metadata
from align_data.sources.articles.parsers import (
HTML_PARSERS, extract_gdrive_contents, item_metadata, parse_domain
)
from align_data.sources.articles.pdf import read_pdf
from align_data.sources.arxiv_papers.arxiv_papers import fetch as fetch_arxiv

logger = logging.getLogger(__name__)

@@ -81,21 +85,30 @@ def _query_items(self):
special_docs_types = ["pdf", "html", "xml", "markdown", "docx"]
return select(Article).where(Article.source.in_(special_docs_types))

def process_entry(self, item):
def get_contents(self, item) -> Dict:
metadata = {}
if url := self.maybe(item.source_url) or self.maybe(item.url):
metadata = item_metadata(url)

return self.make_data_entry({
'source': metadata.get('data_source') or self.name,
return {
'url': self.maybe(item.url),
'title': self.maybe(item.title) or metadata.get('title'),
'source': metadata.get('source_type') or self.name,
'source_type': self.maybe(item.source_type),
'date_published': self._get_published_date(item.date_published) or metadata.get('date_published'),
'authors': self.extract_authors(item) or metadata.get('authors', []),
'text': metadata.get('text'),
'status': metadata.get('error'),
})
}

def process_entry(self, item):
if parse_domain(item.url) == "arxiv.org":
contents = ArxivPapers.get_contents(item)
contents['source'] = 'arxiv'
else:
contents = self.get_contents(item)

return self.make_data_entry(contents)


class PDFArticles(SpreadsheetDataset):
@@ -175,3 +188,26 @@ def _get_text(self, item):
file_id = item.source_url.split("/")[-2]
file_name = fetch_file(file_id)
return convert_file(file_name, "md", format="docx", extra_args=["--wrap=none"])


class ArxivPapers(SpreadsheetDataset):
COOLDOWN: int = 1

@classmethod
def get_contents(cls, item) -> Dict:
contents = fetch_arxiv(item.url or item.source_url)

if cls.maybe(item.authors) and item.authors.strip():
contents['authors'] = [i.strip() for i in item.authors.split(',')]
if cls.maybe(item.title):
contents['title'] = cls.maybe(item.title)

contents['date_published'] = cls._get_published_date(
cls.maybe(item.date_published) or contents.get('date_published')
)
return contents

def process_entry(self, item):
logger.info(f"Processing {item.title}")

return self.make_data_entry(self.get_contents(item), source=self.name)
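To make the override behaviour of ArxivPapers.get_contents concrete: values filled in on the spreadsheet row win over whatever fetch_arxiv returns, and the published date is normalised through _get_published_date. A self-contained sketch of that precedence (SimpleNamespace stands in for the real row type and the fetched dict is a made-up stand-in for a fetch_arxiv result, so nothing hits the network):

from types import SimpleNamespace

item = SimpleNamespace(
    url="https://arxiv.org/abs/2306.00001",      # made-up arxiv link
    title="Title as entered in the spreadsheet",
    authors="Jane Doe, John Smith",
)
fetched = {"title": "Original arxiv title", "authors": ["J. Doe"], "text": "..."}

# Same precedence rules as get_contents above: spreadsheet values take priority.
contents = dict(fetched)
if item.authors and item.authors.strip():
    contents["authors"] = [a.strip() for a in item.authors.split(",")]
if item.title:
    contents["title"] = item.title

print(contents["title"])    # Title as entered in the spreadsheet
print(contents["authors"])  # ['Jane Doe', 'John Smith']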
10 changes: 5 additions & 5 deletions align_data/sources/articles/google_cloud.py
@@ -143,7 +143,7 @@ def fetch_markdown(file_id):
file_name = fetch_file(file_id)
return {
"text": Path(file_name).read_text(),
"data_source": "markdown",
"source_type": "markdown",
}
except Exception as e:
return {'error': str(e)}
@@ -156,15 +156,15 @@ def parse_grobid(contents):
if not doc_dict.get('body'):
return {
'error': 'No contents in XML file',
'data_source': 'xml',
'source_type': 'xml',
}

return {
"title": doc_dict.get("header", {}).get("title"),
"abstract": doc_dict.get("abstract"),
"text": doc_dict["body"],
"authors": list(filter(None, authors)),
"data_source": "xml",
"source_type": "xml",
}


@@ -198,7 +198,7 @@ def extract_gdrive_contents(link):
elif content_type & {'text/markdown'}:
result.update(fetch_markdown(file_id))
elif content_type & {'application/epub+zip', 'application/epub'}:
result['data_source'] = 'ebook'
result['source_type'] = 'ebook'
elif content_type & {'text/html'}:
res = fetch(url)
if 'Google Drive - Virus scan warning' in res.text:
@@ -213,7 +213,7 @@
soup = BeautifulSoup(res.content, "html.parser")
result.update({
'text': MarkdownConverter().convert_soup(soup.select_one('body')).strip(),
'data_source': 'html',
'source_type': 'html',
})
else:
result['error'] = f'unknown content type: {content_type}'
10 changes: 7 additions & 3 deletions align_data/sources/articles/parsers.py
@@ -250,8 +250,12 @@ def getter(url):
}


def parse_domain(url: str) -> str:
return url and urlparse(url).netloc.lstrip('www.')


def item_metadata(url) -> Dict[str, str]:
domain = urlparse(url).netloc.lstrip('www.')
domain = parse_domain(url)
try:
res = fetch(url, 'head')
except (MissingSchema, InvalidSchema, ConnectionError) as e:
@@ -265,7 +269,7 @@ def item_metadata(url) -> Dict[str, str]:
if parser := HTML_PARSERS.get(domain):
if res := parser(url):
# Proper contents were found on the page, so use them
return {'source_url': url, 'data_source': 'html', 'text': res}
return {'source_url': url, 'source_type': 'html', 'text': res}

if parser := PDF_PARSERS.get(domain):
if res := parser(url):
@@ -286,6 +290,6 @@
elif content_type & {"application/epub+zip", "application/epub"}:
# it looks like an ebook. Assume it's fine.
# TODO: validate that the ebook is readable
return {"source_url": url, "data_source": "ebook"}
return {"source_url": url, "source_type": "ebook"}
else:
return {"error": f"Unhandled content type: {content_type}"}
4 changes: 2 additions & 2 deletions align_data/sources/articles/pdf.py
@@ -66,7 +66,7 @@ def fetch_pdf(link):
return {
"source_url": link,
"text": "\n".join(page.extract_text() for page in pdf_reader.pages),
"data_source": "pdf",
"source_type": "pdf",
}
except (TypeError, PdfReadError) as e:
logger.error('Could not read PDF file: %s', e)
@@ -170,5 +170,5 @@ def get_first_child(item):
"authors": authors,
"text": text,
"date_published": date_published,
"data_source": "html",
"source_type": "html",
}
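As a quick check of the renamed key from a caller's side, a hedged sketch of consuming fetch_pdf (the URL is a placeholder and error handling is elided):

from align_data.sources.articles.pdf import fetch_pdf

result = fetch_pdf("https://example.com/paper.pdf")  # placeholder URL
if result.get("source_type") == "pdf":
    print(result["text"][:200])  # first 200 characters of the extracted text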
9 changes: 0 additions & 9 deletions align_data/sources/arxiv_papers/__init__.py
@@ -1,9 +0,0 @@
from .arxiv_papers import ArxivPapers

ARXIV_REGISTRY = [
ArxivPapers(
name="arxiv",
spreadsheet_id="1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI",
sheet_id="655836697",
)
]