Arxiv pypdf (#82)

* Pypdf module and requests with retries

* Request with retries

* Fixing one record in the CSV

* Added pypdf to requirements

* use parsers

* remove arxiv csv file

---------

Co-authored-by: Daniel O'Connell <[email protected]>
jknowak and mruwnik authored Jul 31, 2023
1 parent 2ca30e8 commit 0de45e9
Showing 9 changed files with 184 additions and 1,874 deletions.
4 changes: 3 additions & 1 deletion align_data/sources/articles/articles.py
@@ -2,10 +2,12 @@
import logging

from tqdm import tqdm
import gspread

from align_data.sources.articles.google_cloud import iterate_rows, get_spreadsheet, get_sheet, upload_file, OK, with_retry
from align_data.sources.articles.parsers import item_metadata, fetch
from align_data.sources.articles.indices import fetch_all
from align_data.sources.articles.html import with_retry
from align_data.settings import PDFS_FOLDER_ID


@@ -36,7 +38,7 @@ def save_pdf(filename, link):
)


@with_retry(times=3)
@with_retry(times=3, exceptions=gspread.exceptions.APIError)
def process_row(row, sheets):
"""Check the given `row` and fetch its metadata + optional extra stuff."""
logger.info('Checking "%s"', row['title'])
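
A minimal sketch of what the updated decorator does here, assuming a hypothetical `read_cell` helper: transient `gspread.exceptions.APIError`s (e.g. rate limits) are now retried instead of failing the row.

import gspread
from align_data.sources.articles.html import with_retry

@with_retry(times=3, exceptions=gspread.exceptions.APIError)
def read_cell(worksheet, row, col):
    # Retried up to 3 times with a growing sleep if the Sheets API errors out.
    return worksheet.cell(row, col).value
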
6 changes: 6 additions & 0 deletions align_data/sources/articles/datasets.py
@@ -32,6 +32,12 @@ def maybe(val):
return None
return val

@staticmethod
def is_val(val):
if pd.isna(val):
return None
return val

@property
def items_list(self):
logger.info(f'Fetching https://docs.google.com/spreadsheets/d/{self.spreadsheet_id}/export?format=csv&gid={self.sheet_id}')
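
For illustration, a standalone copy of the new `is_val` helper with hypothetical sample values: empty spreadsheet cells (NaN in pandas) become None, real values pass through unchanged.

import pandas as pd

def is_val(val):
    # Standalone copy of the helper added above, for illustration only.
    if pd.isna(val):
        return None
    return val

print(is_val(float("nan")))                 # None - empty spreadsheet cell
print(is_val("Attention Is All You Need"))  # real value passes through
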
17 changes: 17 additions & 0 deletions align_data/sources/articles/html.py
@@ -1,3 +1,4 @@
import time
import logging
from typing import Union

@@ -13,6 +14,22 @@
}


def with_retry(times=3, exceptions=requests.exceptions.RequestException):
"""A decorator that will retry the wrapped function up to `times` times in case of google sheets errors."""
def wrapper(f):
def retrier(*args, **kwargs):
for i in range(times):
try:
return f(*args, **kwargs)
except exceptions as e:
logger.error(f'{e} - retrying up to {times - i} times')
# Quadratic backoff between attempts
time.sleep((i + 1) ** 2)
raise ValueError(f'Gave up after {times} tries')
return retrier
return wrapper


def fetch(url, method='get', headers=DEFAULT_HEADERS):
"""Fetch the given `url`.
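
A hedged usage sketch of the new decorator with a hypothetical `get_status` helper: each failed attempt logs the error and sleeps (i + 1) ** 2 seconds (1s, 4s, 9s for times=3), and a ValueError is raised once all attempts are spent.

import requests
from align_data.sources.articles.html import with_retry

@with_retry(times=3, exceptions=requests.exceptions.RequestException)
def get_status(url):
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # HTTPError is a RequestException subclass
    return response.status_code
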
4 changes: 3 additions & 1 deletion align_data/sources/articles/parsers.py
@@ -5,7 +5,7 @@
import grobid_tei_xml
import regex as re
from align_data.sources.articles.html import element_extractor, fetch, fetch_element
from align_data.sources.articles.pdf import doi_getter, fetch_pdf, get_pdf_from_page, get_arxiv_pdf
from align_data.sources.articles.pdf import doi_getter, fetch_pdf, get_pdf_from_page, get_arxiv_pdf, parse_vanity
from align_data.sources.articles.google_cloud import fetch_markdown
from markdownify import MarkdownConverter
from bs4 import BeautifulSoup
@@ -154,6 +154,8 @@ def getter(url):
HTML_PARSERS = {
'academic.oup.com': element_extractor('#ContentTab'),
'ai.googleblog.com': element_extractor('div.post-body.entry-content'),
'arxiv-vanity.com': parse_vanity,
'ar5iv.labs.arxiv.org': parse_vanity,
'bair.berkeley.edu': element_extractor('article'),
'mediangroup.org': element_extractor('div.entry-content'),
'www.alexirpan.com': element_extractor('article'),
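
A sketch of how such a domain-keyed table can be consulted; the `parse_url` dispatcher below is hypothetical and only illustrates that arxiv-vanity.com and ar5iv.labs.arxiv.org pages now route through parse_vanity.

from urllib.parse import urlparse

def parse_url(url):
    # Hypothetical dispatcher: look up a parser by domain (without "www.").
    domain = urlparse(url).netloc.removeprefix("www.")
    parser = HTML_PARSERS.get(domain)
    return parser(url) if parser else None
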
42 changes: 41 additions & 1 deletion align_data/sources/articles/pdf.py
@@ -8,8 +8,9 @@
import pandas as pd
from PyPDF2 import PdfReader
from PyPDF2.errors import PdfReadError
from markdownify import MarkdownConverter

from align_data.sources.articles.html import fetch, fetch_element
from align_data.sources.articles.html import fetch, fetch_element, with_retry

logger = logging.getLogger(__name__)

@@ -41,6 +42,7 @@ def read_pdf(filename):
return None


@with_retry(times=3)
def fetch_pdf(link):
"""Return the contents of the pdf file at `link` as a markdown string.
@@ -151,3 +153,41 @@ def getter(url):
return pdf
return {'error': f'Could not fetch pdf from {link}'}
return getter


def parse_vanity(url):
contents = fetch_element(url, 'article')
if not contents:
return None

if title := contents.select_one('h1.ltx_title'):
title = title.text

def get_first_child(item):
child = next(item.children)
if not child:
return []

if not isinstance(child, str):
child = child.text
return child.split(',')

authors = [
a.strip() for item in contents.select('div.ltx_authors .ltx_personname') for a in get_first_child(item)
]

if date_published := contents.select_one('div.ltx_dates'):
date_published = date_published.text.strip('()')

text = '\n\n'.join([
MarkdownConverter().convert_soup(elem).strip()
for elem in contents.select('section.ltx_section')
])

return {
'title': title,
'authors': authors,
'text': text,
'date_published': date_published,
'data_source': 'html',
}
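
A hypothetical call showing the shape of parse_vanity's return value; the arXiv id is an arbitrary example.

contents = parse_vanity("https://ar5iv.org/abs/1706.03762")
if contents:
    print(contents["title"])           # paper title from h1.ltx_title
    print(contents["authors"][:3])     # names split out of .ltx_personname
    print(contents["date_published"])  # contents of div.ltx_dates, parentheses stripped
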
6 changes: 5 additions & 1 deletion align_data/sources/arxiv_papers/__init__.py
@@ -1,5 +1,9 @@
from .arxiv_papers import ArxivPapers

ARXIV_REGISTRY = [
ArxivPapers(name = "arxiv")
ArxivPapers(
name="arxiv",
spreadsheet_id='1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI',
sheet_id='655836697'
)
]
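
For reference, the ids above are the ones items_list (datasets.py) interpolates into a Google Sheets CSV export URL; a sketch, with the pandas read being an assumption about the parent SpreadsheetDataset rather than something shown in this diff.

import pandas as pd

spreadsheet_id = "1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI"
sheet_id = "655836697"
url = (
    f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}"
    f"/export?format=csv&gid={sheet_id}"
)
df = pd.read_csv(url)  # assumption: one row per article in the arxiv sheet
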
191 changes: 46 additions & 145 deletions align_data/sources/arxiv_papers/arxiv_papers.py
@@ -1,171 +1,72 @@
import arxiv
import requests
import logging
import time
import jsonlines

import pandas as pd

import re
from dataclasses import dataclass
from markdownify import markdownify
from bs4 import BeautifulSoup
from tqdm import tqdm
from align_data.common.alignment_dataset import AlignmentDataset

import arxiv
from align_data.sources.articles.datasets import SpreadsheetDataset
from align_data.sources.articles.pdf import fetch_pdf, parse_vanity

logger = logging.getLogger(__name__)


@dataclass
class ArxivPapers(AlignmentDataset):
class ArxivPapers(SpreadsheetDataset):
summary_key: str = 'summary'
COOLDOWN: int = 1
done_key = "url"
batch_size = 1

def _get_arxiv_metadata(self, paper_id) -> arxiv.Result:
"""
Get metadata from arxiv
"""
try:
search = arxiv.Search(id_list=[paper_id], max_results=1)
return next(search.results())
except Exception as e:
logger.error(e)
return None
return next(search.results())

@property
def items_list(self):
self.papers_csv_path = self.raw_data_path / "ai-alignment-arxiv-papers.csv"

self.df = pd.read_csv(self.papers_csv_path)
self.df_arxiv = self.df[self.df["Url"].str.contains(
"arxiv.org/abs") == True].drop_duplicates(subset="Url", keep="first")

return [xx.split('/abs/')[1] for xx in self.df_arxiv.Url]

def process_entry(self, ids) -> None:
logger.info(f"Processing {ids}")

markdown = self.process_id(ids)

paper = self._get_arxiv_metadata(ids)
if markdown is None or paper is None:
logger.info(f"Skipping {ids}")
return None
else:
new_entry = self.make_data_entry({
"url": self.get_item_key(ids),
"source": self.name,
"source_type": "html",
"converted_with": "markdownify",
"title": paper.title,
"authors": [str(x) for x in paper.authors],
"date_published": paper.published,
"data_last_modified": str(paper.updated),
"abstract": paper.summary.replace("\n", " "),
"author_comment": paper.comment,
"journal_ref": paper.journal_ref,
"doi": paper.doi,
"primary_category": paper.primary_category,
"categories": paper.categories,
"text": markdown,
})
return new_entry

def get_id(self, item):
if res := re.search(r'https://arxiv.org/abs/(.*?)/?$', item.url):
return res.group(1)

def _is_bad_soup(self, soup, parser='vanity') -> bool:
if parser == 'vanity':
vanity_wrapper = soup.find("div", class_="arxiv-vanity-wrapper")
if vanity_wrapper is None:
return None
vanity_wrapper = vanity_wrapper.text
return vanity_wrapper and "don’t have to squint at a PDF" not in vanity_wrapper
if parser == 'ar5iv':
ar5iv_error = soup.find("span", class_="ltx_ERROR")
if ar5iv_error is None:
return False
else:
ar5iv_error = ar5iv_error.text
if "document may be truncated or damaged" in ar5iv_error:
return True
return False
def get_contents(self, item) -> dict:
paper_id = self.get_id(item)
for link in [f"https://www.arxiv-vanity.com/papers/{paper_id}", f"https://ar5iv.org/abs/{paper_id}"]:
if contents := parse_vanity(link):
return contents
return fetch_pdf(f'https://arxiv.org/pdf/{paper_id}.pdf')

def process_entry(self, item) -> None:
logger.info(f"Processing {item.title}")

def _is_dud(self, markdown) -> bool:
"""
Check if markdown is a dud
"""
return (
"Paper Not Renderable" in markdown or
"This document may be truncated" in markdown or
"don’t have to squint at a PDF" not in markdown
)

def _article_markdown_from_soup(self, soup):
"""
Get markdown of the article from BeautifulSoup object of the page
"""
article = soup.article
if article is None:
paper = self.get_contents(item)
if not paper or not paper.get('text'):
return None
article = self._remove_bib_from_article_soup(article)
markdown = markdownify(str(article))
return markdown


def _get_parser_markdown(self, paper_id, parser="vanity") -> str:
"""
Get markdown from the parser website, arxiv-vanity or ar5iv.org
"""
if parser == "vanity":
link = f"https://www.arxiv-vanity.com/papers/{paper_id}"
elif parser == "ar5iv":
link = f"https://ar5iv.org/abs/{paper_id}"
logger.info(f"Fetching {link}")
try:
r = requests.get(link, timeout=5 * self.COOLDOWN)
except ValueError as e:
logger.error(f'{e}')
return None
if "//arxiv.org" in r.url:
return None
try:
soup = BeautifulSoup(r.content, features="xml")
except ValueError as e:
logger.error(f'{e}')
return None
if not self._is_bad_soup(soup,parser=parser):
return self._article_markdown_from_soup(soup)
return None


def get_item_key(self, paper_id) -> str:
"""
Get arxiv link
"""
return f"https://arxiv.org/abs/{paper_id}"

def _remove_bib_from_article_soup(self, article_soup) -> str:
"""
Strip markdown
"""
bib = article_soup.find("section", id="bib")
if bib:
bib.decompose()
return article_soup

def _strip_markdown(self, s_markdown):
return s_markdown.split("\nReferences\n")[0].replace("\n\n", "\n")

def process_id(self, paper_id) -> str:
"""
Process arxiv id
"""
markdown = self._get_parser_markdown(paper_id, parser="vanity")
if markdown is None:
markdown = self._get_parser_markdown(paper_id, parser="ar5iv")
if markdown is None:
return None
mardown_excerpt = markdown.replace('\n', '')[:100]
logger.info(f"Stripping markdown, {mardown_excerpt}")
s_markdown = self._strip_markdown(markdown)
return s_markdown
metadata = self._get_arxiv_metadata(self.get_id(item))
if self.is_val(item.authors) and item.authors.strip():
authors = item.authors.split(',')
elif metadata and metadata.authors:
authors = metadata.authors
else:
authors = paper.get('authors') or []
authors = [str(a).strip() for a in authors]

return self.make_data_entry({
"url": self.get_item_key(item),
"source": self.name,
"source_type": paper['data_source'],
"title": self.is_val(item.title) or paper.get('title'),
"authors": authors,
"date_published": self._get_published_date(self.is_val(item.date_published) or paper.get('date_published')),
"data_last_modified": str(metadata.updated),
"abstract": metadata.summary.replace("\n", " "),
"author_comment": metadata.comment,
"journal_ref": metadata.journal_ref,
"doi": metadata.doi,
"primary_category": metadata.primary_category,
"categories": metadata.categories,
"text": paper['text'],
})
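
A hedged walk-through of the new fallback order in get_contents for a single paper; the id below is an arbitrary example rather than a row from the sheet.

from align_data.sources.articles.pdf import fetch_pdf, parse_vanity

paper_id = "1706.03762"
for link in [f"https://www.arxiv-vanity.com/papers/{paper_id}",
             f"https://ar5iv.org/abs/{paper_id}"]:
    if contents := parse_vanity(link):  # HTML renderers tried first
        break
else:
    # Neither renderer could be parsed; fall back to the raw PDF.
    contents = fetch_pdf(f"https://arxiv.org/pdf/{paper_id}.pdf")
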