Handle Arxiv links in special docs (#131)
* Handle Arxiv links in special docs

* Handle retracted arxiv articles

* rename Retracted -> Withdrawn
mruwnik authored Aug 14, 2023
1 parent 188c827 commit e47d8b0
Showing 13 changed files with 341 additions and 154 deletions.
2 changes: 0 additions & 2 deletions align_data/__init__.py
@@ -2,7 +2,6 @@
import align_data.sources.articles as articles
import align_data.sources.blogs as blogs
import align_data.sources.ebooks as ebooks
import align_data.sources.arxiv_papers as arxiv_papers
import align_data.sources.greaterwrong as greaterwrong
import align_data.sources.stampy as stampy
import align_data.sources.alignment_newsletter as alignment_newsletter
@@ -14,7 +13,6 @@
+ articles.ARTICLES_REGISTRY
+ blogs.BLOG_REGISTRY
+ ebooks.EBOOK_REGISTRY
+ arxiv_papers.ARXIV_REGISTRY
+ greaterwrong.GREATERWRONG_REGISTRY
+ stampy.STAMPY_REGISTRY
+ distill.DISTILL_REGISTRY
5 changes: 3 additions & 2 deletions align_data/common/alignment_dataset.py
@@ -209,15 +209,16 @@ def fetch_entries(self):
if self.COOLDOWN:
time.sleep(self.COOLDOWN)

def process_entry(self, entry):
def process_entry(self, entry) -> Optional[Article]:
"""Process a single entry."""
raise NotImplementedError

@staticmethod
def _format_datetime(date) -> str:
return date.strftime("%Y-%m-%dT%H:%M:%SZ")

def _get_published_date(self, date) -> Optional[datetime]:
@staticmethod
def _get_published_date(date) -> Optional[datetime]:
try:
# Totally ignore any timezone info, forcing everything to UTC
return parse(str(date)).replace(tzinfo=pytz.UTC)
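Because _get_published_date is now a @staticmethod, it can be called straight off the class, which is what ArxivPapers.get_contents does further down via cls._get_published_date. A minimal sketch of that usage (the date string is made up):

from align_data.common.alignment_dataset import AlignmentDataset

# Any timezone info in the input is discarded and the result is pinned to UTC,
# as the comment in the diff above describes.
dt = AlignmentDataset._get_published_date("14 Aug 2023 10:00 +0200")
print(dt)  # 2023-08-14 10:00:00+00:00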
8 changes: 7 additions & 1 deletion align_data/sources/articles/__init__.py
@@ -1,5 +1,6 @@
from align_data.sources.articles.datasets import (
EbookArticles, DocArticles, HTMLArticles, MarkdownArticles, PDFArticles, SpecialDocs, XMLArticles
ArxivPapers, EbookArticles, DocArticles, HTMLArticles,
MarkdownArticles, PDFArticles, SpecialDocs, XMLArticles
)
from align_data.sources.articles.indices import IndicesDataset

@@ -39,5 +40,10 @@
spreadsheet_id='1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI',
sheet_id='980957638',
),
ArxivPapers(
name="arxiv",
spreadsheet_id="1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI",
sheet_id="655836697",
),
IndicesDataset('indices'),
]
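With arxiv now registered here as an ordinary spreadsheet-backed dataset (and the standalone ARXIV_REGISTRY deleted, see the last file below), it can be driven like any other entry in this list. A hedged sketch, assuming the list above is the ARTICLES_REGISTRY imported in align_data/__init__.py and that fetch_entries() from alignment_dataset.py is the entry point:

from align_data.sources.articles import ARTICLES_REGISTRY

for dataset in ARTICLES_REGISTRY:
    if dataset.name == "arxiv":  # the ArxivPapers instance registered above
        for entry in dataset.fetch_entries():
            print(entry)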
2 changes: 1 addition & 1 deletion align_data/sources/articles/articles.py
@@ -65,7 +65,7 @@ def process_row(row, sheets):
row.set_status(error)
return

data_source = contents.get("data_source")
data_source = contents.get("source_type")
if data_source not in sheets:
error = "Unhandled data type"
logger.error(error)
46 changes: 41 additions & 5 deletions align_data/sources/articles/datasets.py
@@ -2,6 +2,7 @@
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Dict
from urllib.parse import urlparse

import pandas as pd
@@ -13,8 +14,11 @@
from align_data.common.alignment_dataset import AlignmentDataset
from align_data.db.models import Article
from align_data.sources.articles.google_cloud import fetch_file, fetch_markdown
from align_data.sources.articles.parsers import HTML_PARSERS, extract_gdrive_contents, item_metadata
from align_data.sources.articles.parsers import (
HTML_PARSERS, extract_gdrive_contents, item_metadata, parse_domain
)
from align_data.sources.articles.pdf import read_pdf
from align_data.sources.arxiv_papers.arxiv_papers import fetch as fetch_arxiv

logger = logging.getLogger(__name__)

@@ -81,21 +85,30 @@ def _query_items(self):
special_docs_types = ["pdf", "html", "xml", "markdown", "docx"]
return select(Article).where(Article.source.in_(special_docs_types))

def process_entry(self, item):
def get_contents(self, item) -> Dict:
metadata = {}
if url := self.maybe(item.source_url) or self.maybe(item.url):
metadata = item_metadata(url)

return self.make_data_entry({
'source': metadata.get('data_source') or self.name,
return {
'url': self.maybe(item.url),
'title': self.maybe(item.title) or metadata.get('title'),
'source': metadata.get('source_type') or self.name,
'source_type': self.maybe(item.source_type),
'date_published': self._get_published_date(item.date_published) or metadata.get('date_published'),
'authors': self.extract_authors(item) or metadata.get('authors', []),
'text': metadata.get('text'),
'status': metadata.get('error'),
})
}

def process_entry(self, item):
if parse_domain(item.url) == "arxiv.org":
contents = ArxivPapers.get_contents(item)
contents['source'] = 'arxiv'
else:
contents = self.get_contents(item)

return self.make_data_entry(contents)


class PDFArticles(SpreadsheetDataset):
@@ -175,3 +188,26 @@ def _get_text(self, item):
file_id = item.source_url.split("/")[-2]
file_name = fetch_file(file_id)
return convert_file(file_name, "md", format="docx", extra_args=["--wrap=none"])


class ArxivPapers(SpreadsheetDataset):
COOLDOWN: int = 1

@classmethod
def get_contents(cls, item) -> Dict:
contents = fetch_arxiv(item.url or item.source_url)

if cls.maybe(item.authors) and item.authors.strip():
contents['authors'] = [i.strip() for i in item.authors.split(',')]
if cls.maybe(item.title):
contents['title'] = cls.maybe(item.title)

contents['date_published'] = cls._get_published_date(
cls.maybe(item.date_published) or contents.get('date_published')
)
return contents

def process_entry(self, item):
logger.info(f"Processing {item.title}")

return self.make_data_entry(self.get_contents(item), source=self.name)
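To make the override behaviour of ArxivPapers.get_contents concrete: values filled in on the spreadsheet row win over whatever fetch_arxiv returns, and the published date is normalised through _get_published_date. A self-contained sketch of that precedence (SimpleNamespace stands in for the real row type and the fetched dict is a made-up stand-in for a fetch_arxiv result, so nothing hits the network):

from types import SimpleNamespace

item = SimpleNamespace(
    url="https://arxiv.org/abs/2306.00001",      # made-up arxiv link
    title="Title as entered in the spreadsheet",
    authors="Jane Doe, John Smith",
)
fetched = {"title": "Original arxiv title", "authors": ["J. Doe"], "text": "..."}

# Same precedence rules as get_contents above: spreadsheet values take priority.
contents = dict(fetched)
if item.authors and item.authors.strip():
    contents["authors"] = [a.strip() for a in item.authors.split(",")]
if item.title:
    contents["title"] = item.title

print(contents["title"])    # Title as entered in the spreadsheet
print(contents["authors"])  # ['Jane Doe', 'John Smith']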
10 changes: 5 additions & 5 deletions align_data/sources/articles/google_cloud.py
@@ -143,7 +143,7 @@ def fetch_markdown(file_id):
file_name = fetch_file(file_id)
return {
"text": Path(file_name).read_text(),
"data_source": "markdown",
"source_type": "markdown",
}
except Exception as e:
return {'error': str(e)}
@@ -156,15 +156,15 @@ def parse_grobid(contents):
if not doc_dict.get('body'):
return {
'error': 'No contents in XML file',
'data_source': 'xml',
'source_type': 'xml',
}

return {
"title": doc_dict.get("header", {}).get("title"),
"abstract": doc_dict.get("abstract"),
"text": doc_dict["body"],
"authors": list(filter(None, authors)),
"data_source": "xml",
"source_type": "xml",
}


@@ -198,7 +198,7 @@ def extract_gdrive_contents(link):
elif content_type & {'text/markdown'}:
result.update(fetch_markdown(file_id))
elif content_type & {'application/epub+zip', 'application/epub'}:
result['data_source'] = 'ebook'
result['source_type'] = 'ebook'
elif content_type & {'text/html'}:
res = fetch(url)
if 'Google Drive - Virus scan warning' in res.text:
@@ -213,7 +213,7 @@
soup = BeautifulSoup(res.content, "html.parser")
result.update({
'text': MarkdownConverter().convert_soup(soup.select_one('body')).strip(),
'data_source': 'html',
'source_type': 'html',
})
else:
result['error'] = f'unknown content type: {content_type}'
10 changes: 7 additions & 3 deletions align_data/sources/articles/parsers.py
@@ -250,8 +250,12 @@ def getter(url):
}


def parse_domain(url: str) -> str:
return url and urlparse(url).netloc.lstrip('www.')


def item_metadata(url) -> Dict[str, str]:
domain = urlparse(url).netloc.lstrip('www.')
domain = parse_domain(url)
try:
res = fetch(url, 'head')
except (MissingSchema, InvalidSchema, ConnectionError) as e:
@@ -265,7 +269,7 @@ def item_metadata(url) -> Dict[str, str]:
if parser := HTML_PARSERS.get(domain):
if res := parser(url):
# Proper contents were found on the page, so use them
return {'source_url': url, 'data_source': 'html', 'text': res}
return {'source_url': url, 'source_type': 'html', 'text': res}

if parser := PDF_PARSERS.get(domain):
if res := parser(url):
@@ -286,6 +290,6 @@
elif content_type & {"application/epub+zip", "application/epub"}:
# it looks like an ebook. Assume it's fine.
# TODO: validate that the ebook is readable
return {"source_url": url, "data_source": "ebook"}
return {"source_url": url, "source_type": "ebook"}
else:
return {"error": f"Unhandled content type: {content_type}"}
4 changes: 2 additions & 2 deletions align_data/sources/articles/pdf.py
@@ -66,7 +66,7 @@ def fetch_pdf(link):
return {
"source_url": link,
"text": "\n".join(page.extract_text() for page in pdf_reader.pages),
"data_source": "pdf",
"source_type": "pdf",
}
except (TypeError, PdfReadError) as e:
logger.error('Could not read PDF file: %s', e)
@@ -170,5 +170,5 @@ def get_first_child(item):
"authors": authors,
"text": text,
"date_published": date_published,
"data_source": "html",
"source_type": "html",
}
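As a quick check of the renamed key from a caller's side, a hedged sketch of consuming fetch_pdf (the URL is a placeholder and error handling is elided):

from align_data.sources.articles.pdf import fetch_pdf

result = fetch_pdf("https://example.com/paper.pdf")  # placeholder URL
if result.get("source_type") == "pdf":
    print(result["text"][:200])  # first 200 characters of the extracted text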
9 changes: 0 additions & 9 deletions align_data/sources/arxiv_papers/__init__.py
@@ -1,9 +0,0 @@
from .arxiv_papers import ArxivPapers

ARXIV_REGISTRY = [
ArxivPapers(
name="arxiv",
spreadsheet_id="1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI",
sheet_id="655836697",
)
]