Arxiv pypdf (#82)

* Pypdf module and requests with retries

* Request with retries

* Fixing one record in the CSV

* Added pypdf to requirements

* use parsers

* remove arxiv csv file

---------

Co-authored-by: Daniel O'Connell <[email protected]>
jknowak and mruwnik authored Jul 31, 2023
1 parent 2ca30e8 commit 0de45e9
Showing 9 changed files with 184 additions and 1,874 deletions.
4 changes: 3 additions & 1 deletion align_data/sources/articles/articles.py
@@ -2,10 +2,12 @@
import logging

from tqdm import tqdm
import gspread

from align_data.sources.articles.google_cloud import iterate_rows, get_spreadsheet, get_sheet, upload_file, OK, with_retry
from align_data.sources.articles.parsers import item_metadata, fetch
from align_data.sources.articles.indices import fetch_all
from align_data.sources.articles.html import with_retry
from align_data.settings import PDFS_FOLDER_ID


@@ -36,7 +38,7 @@ def save_pdf(filename, link):
)


@with_retry(times=3)
@with_retry(times=3, exceptions=gspread.exceptions.APIError)
def process_row(row, sheets):
"""Check the given `row` and fetch its metadata + optional extra stuff."""
logger.info('Checking "%s"', row['title'])
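
A minimal sketch of what the updated decorator does here, assuming a hypothetical `read_cell` helper: transient `gspread.exceptions.APIError`s (e.g. rate limits) are now retried instead of failing the row.

import gspread
from align_data.sources.articles.html import with_retry

@with_retry(times=3, exceptions=gspread.exceptions.APIError)
def read_cell(worksheet, row, col):
    # Retried up to 3 times with a growing sleep if the Sheets API errors out.
    return worksheet.cell(row, col).value
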
6 changes: 6 additions & 0 deletions align_data/sources/articles/datasets.py
@@ -32,6 +32,12 @@ def maybe(val):
return None
return val

@staticmethod
def is_val(val):
if pd.isna(val):
return None
return val

@property
def items_list(self):
logger.info(f'Fetching https://docs.google.com/spreadsheets/d/{self.spreadsheet_id}/export?format=csv&gid={self.sheet_id}')
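
For illustration, a standalone copy of the new `is_val` helper with hypothetical sample values: empty spreadsheet cells (NaN in pandas) become None, real values pass through unchanged.

import pandas as pd

def is_val(val):
    # Standalone copy of the helper added above, for illustration only.
    if pd.isna(val):
        return None
    return val

print(is_val(float("nan")))                 # None - empty spreadsheet cell
print(is_val("Attention Is All You Need"))  # real value passes through
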
17 changes: 17 additions & 0 deletions align_data/sources/articles/html.py
@@ -1,3 +1,4 @@
import time
import logging
from typing import Union

@@ -13,6 +14,22 @@
}


def with_retry(times=3, exceptions=requests.exceptions.RequestException):
"""A decorator that will retry the wrapped function up to `times` times in case of google sheets errors."""
def wrapper(f):
def retrier(*args, **kwargs):
for i in range(times):
try:
return f(*args, **kwargs)
except exceptions as e:
logger.error(f'{e} - retrying up to {times - i} times')
# Quadratic backoff between attempts
time.sleep((i + 1) ** 2)
raise ValueError(f'Gave up after {times} tries')
return retrier
return wrapper


def fetch(url, method='get', headers=DEFAULT_HEADERS):
"""Fetch the given `url`.
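
A hedged usage sketch of the new decorator with a hypothetical `get_status` helper: each failed attempt logs the error and sleeps (i + 1) ** 2 seconds (1s, 4s, 9s for times=3), and a ValueError is raised once all attempts are spent.

import requests
from align_data.sources.articles.html import with_retry

@with_retry(times=3, exceptions=requests.exceptions.RequestException)
def get_status(url):
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # HTTPError is a RequestException subclass
    return response.status_code
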
4 changes: 3 additions & 1 deletion align_data/sources/articles/parsers.py
@@ -5,7 +5,7 @@
import grobid_tei_xml
import regex as re
from align_data.sources.articles.html import element_extractor, fetch, fetch_element
from align_data.sources.articles.pdf import doi_getter, fetch_pdf, get_pdf_from_page, get_arxiv_pdf
from align_data.sources.articles.pdf import doi_getter, fetch_pdf, get_pdf_from_page, get_arxiv_pdf, parse_vanity
from align_data.sources.articles.google_cloud import fetch_markdown
from markdownify import MarkdownConverter
from bs4 import BeautifulSoup
@@ -154,6 +154,8 @@ def getter(url):
HTML_PARSERS = {
'academic.oup.com': element_extractor('#ContentTab'),
'ai.googleblog.com': element_extractor('div.post-body.entry-content'),
'arxiv-vanity.com': parse_vanity,
'ar5iv.labs.arxiv.org': parse_vanity,
'bair.berkeley.edu': element_extractor('article'),
'mediangroup.org': element_extractor('div.entry-content'),
'www.alexirpan.com': element_extractor('article'),
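
A sketch of how such a domain-keyed table can be consulted; the `parse_url` dispatcher below is hypothetical and only illustrates that arxiv-vanity.com and ar5iv.labs.arxiv.org pages now route through parse_vanity.

from urllib.parse import urlparse

def parse_url(url):
    # Hypothetical dispatcher: look up a parser by domain (without "www.").
    domain = urlparse(url).netloc.removeprefix("www.")
    parser = HTML_PARSERS.get(domain)
    return parser(url) if parser else None
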
42 changes: 41 additions & 1 deletion align_data/sources/articles/pdf.py
@@ -8,8 +8,9 @@
import pandas as pd
from PyPDF2 import PdfReader
from PyPDF2.errors import PdfReadError
from markdownify import MarkdownConverter

from align_data.sources.articles.html import fetch, fetch_element
from align_data.sources.articles.html import fetch, fetch_element, with_retry

logger = logging.getLogger(__name__)

@@ -41,6 +42,7 @@ def read_pdf(filename):
return None


@with_retry(times=3)
def fetch_pdf(link):
"""Return the contents of the pdf file at `link` as a markdown string.
@@ -151,3 +153,41 @@ def getter(url):
return pdf
return {'error': f'Could not fetch pdf from {link}'}
return getter


def parse_vanity(url):
contents = fetch_element(url, 'article')
if not contents:
return None

if title := contents.select_one('h1.ltx_title'):
title = title.text

def get_first_child(item):
child = next(item.children)
if not child:
return []

if not isinstance(child, str):
child = child.text
return child.split(',')

authors = [
a.strip() for item in contents.select('div.ltx_authors .ltx_personname') for a in get_first_child(item)
]

if date_published := contents.select_one('div.ltx_dates'):
date_published = date_published.text.strip('()')

text = '\n\n'.join([
MarkdownConverter().convert_soup(elem).strip()
for elem in contents.select('section.ltx_section')
])

return {
'title': title,
'authors': authors,
'text': text,
'date_published': date_published,
'data_source': 'html',
}
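
A hypothetical call showing the shape of parse_vanity's return value; the arXiv id is an arbitrary example.

contents = parse_vanity("https://ar5iv.org/abs/1706.03762")
if contents:
    print(contents["title"])           # paper title from h1.ltx_title
    print(contents["authors"][:3])     # names split out of .ltx_personname
    print(contents["date_published"])  # contents of div.ltx_dates, parentheses stripped
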
6 changes: 5 additions & 1 deletion align_data/sources/arxiv_papers/__init__.py
@@ -1,5 +1,9 @@
from .arxiv_papers import ArxivPapers

ARXIV_REGISTRY = [
ArxivPapers(name = "arxiv")
ArxivPapers(
name="arxiv",
spreadsheet_id='1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI',
sheet_id='655836697'
)
]
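
For reference, the ids above are the ones items_list (datasets.py) interpolates into a Google Sheets CSV export URL; a sketch, with the pandas read being an assumption about the parent SpreadsheetDataset rather than something shown in this diff.

import pandas as pd

spreadsheet_id = "1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI"
sheet_id = "655836697"
url = (
    f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}"
    f"/export?format=csv&gid={sheet_id}"
)
df = pd.read_csv(url)  # assumption: one row per article in the arxiv sheet
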
191 changes: 46 additions & 145 deletions align_data/sources/arxiv_papers/arxiv_papers.py
@@ -1,171 +1,72 @@
import arxiv
import requests
import logging
import time
import jsonlines

import pandas as pd

import re
from dataclasses import dataclass
from markdownify import markdownify
from bs4 import BeautifulSoup
from tqdm import tqdm
from align_data.common.alignment_dataset import AlignmentDataset

import arxiv
from align_data.sources.articles.datasets import SpreadsheetDataset
from align_data.sources.articles.pdf import fetch_pdf, parse_vanity

logger = logging.getLogger(__name__)


@dataclass
class ArxivPapers(AlignmentDataset):
class ArxivPapers(SpreadsheetDataset):
summary_key: str = 'summary'
COOLDOWN: int = 1
done_key = "url"
batch_size = 1

def _get_arxiv_metadata(self, paper_id) -> arxiv.Result:
"""
Get metadata from arxiv
"""
try:
search = arxiv.Search(id_list=[paper_id], max_results=1)
return next(search.results())
except Exception as e:
logger.error(e)
return None
return next(search.results())

@property
def items_list(self):
self.papers_csv_path = self.raw_data_path / "ai-alignment-arxiv-papers.csv"

self.df = pd.read_csv(self.papers_csv_path)
self.df_arxiv = self.df[self.df["Url"].str.contains(
"arxiv.org/abs") == True].drop_duplicates(subset="Url", keep="first")

return [xx.split('/abs/')[1] for xx in self.df_arxiv.Url]

def process_entry(self, ids) -> None:
logger.info(f"Processing {ids}")

markdown = self.process_id(ids)

paper = self._get_arxiv_metadata(ids)
if markdown is None or paper is None:
logger.info(f"Skipping {ids}")
return None
else:
new_entry = self.make_data_entry({
"url": self.get_item_key(ids),
"source": self.name,
"source_type": "html",
"converted_with": "markdownify",
"title": paper.title,
"authors": [str(x) for x in paper.authors],
"date_published": paper.published,
"data_last_modified": str(paper.updated),
"abstract": paper.summary.replace("\n", " "),
"author_comment": paper.comment,
"journal_ref": paper.journal_ref,
"doi": paper.doi,
"primary_category": paper.primary_category,
"categories": paper.categories,
"text": markdown,
})
return new_entry

def get_id(self, item):
if res := re.search(r'https://arxiv.org/abs/(.*?)/?$', item.url):
return res.group(1)

def _is_bad_soup(self, soup, parser='vanity') -> bool:
if parser == 'vanity':
vanity_wrapper = soup.find("div", class_="arxiv-vanity-wrapper")
if vanity_wrapper is None:
return None
vanity_wrapper = vanity_wrapper.text
return vanity_wrapper and "don’t have to squint at a PDF" not in vanity_wrapper
if parser == 'ar5iv':
ar5iv_error = soup.find("span", class_="ltx_ERROR")
if ar5iv_error is None:
return False
else:
ar5iv_error = ar5iv_error.text
if "document may be truncated or damaged" in ar5iv_error:
return True
return False
def get_contents(self, item) -> dict:
paper_id = self.get_id(item)
for link in [f"https://www.arxiv-vanity.com/papers/{paper_id}", f"https://ar5iv.org/abs/{paper_id}"]:
if contents := parse_vanity(link):
return contents
return fetch_pdf(f'https://arxiv.org/pdf/{paper_id}.pdf')

def process_entry(self, item) -> None:
logger.info(f"Processing {item.title}")

def _is_dud(self, markdown) -> bool:
"""
Check if markdown is a dud
"""
return (
"Paper Not Renderable" in markdown or
"This document may be truncated" in markdown or
"don’t have to squint at a PDF" not in markdown
)

def _article_markdown_from_soup(self, soup):
"""
Get markdown of the article from BeautifulSoup object of the page
"""
article = soup.article
if article is None:
paper = self.get_contents(item)
if not paper or not paper.get('text'):
return None
article = self._remove_bib_from_article_soup(article)
markdown = markdownify(str(article))
return markdown


def _get_parser_markdown(self, paper_id, parser="vanity") -> str:
"""
Get markdown from the parser website, arxiv-vanity or ar5iv.org
"""
if parser == "vanity":
link = f"https://www.arxiv-vanity.com/papers/{paper_id}"
elif parser == "ar5iv":
link = f"https://ar5iv.org/abs/{paper_id}"
logger.info(f"Fetching {link}")
try:
r = requests.get(link, timeout=5 * self.COOLDOWN)
except ValueError as e:
logger.error(f'{e}')
return None
if "//arxiv.org" in r.url:
return None
try:
soup = BeautifulSoup(r.content, features="xml")
except ValueError as e:
logger.error(f'{e}')
return None
if not self._is_bad_soup(soup,parser=parser):
return self._article_markdown_from_soup(soup)
return None


def get_item_key(self, paper_id) -> str:
"""
Get arxiv link
"""
return f"https://arxiv.org/abs/{paper_id}"

def _remove_bib_from_article_soup(self, article_soup) -> str:
"""
Strip markdown
"""
bib = article_soup.find("section", id="bib")
if bib:
bib.decompose()
return article_soup

def _strip_markdown(self, s_markdown):
return s_markdown.split("\nReferences\n")[0].replace("\n\n", "\n")

def process_id(self, paper_id) -> str:
"""
Process arxiv id
"""
markdown = self._get_parser_markdown(paper_id, parser="vanity")
if markdown is None:
markdown = self._get_parser_markdown(paper_id, parser="ar5iv")
if markdown is None:
return None
mardown_excerpt = markdown.replace('\n', '')[:100]
logger.info(f"Stripping markdown, {mardown_excerpt}")
s_markdown = self._strip_markdown(markdown)
return s_markdown
metadata = self._get_arxiv_metadata(self.get_id(item))
if self.is_val(item.authors) and item.authors.strip():
authors = item.authors.split(',')
elif metadata and metadata.authors:
authors = metadata.authors
else:
authors = paper.get('authors') or []
authors = [str(a).strip() for a in authors]

return self.make_data_entry({
"url": self.get_item_key(item),
"source": self.name,
"source_type": paper['data_source'],
"title": self.is_val(item.title) or paper.get('title'),
"authors": authors,
"date_published": self._get_published_date(self.is_val(item.date_published) or paper.get('date_published')),
"data_last_modified": str(metadata.updated),
"abstract": metadata.summary.replace("\n", " "),
"author_comment": metadata.comment,
"journal_ref": metadata.journal_ref,
"doi": metadata.doi,
"primary_category": metadata.primary_category,
"categories": metadata.categories,
"text": paper['text'],
})
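
A hedged walk-through of the new fallback order in get_contents for a single paper; the id below is an arbitrary example rather than a row from the sheet.

from align_data.sources.articles.pdf import fetch_pdf, parse_vanity

paper_id = "1706.03762"
for link in [f"https://www.arxiv-vanity.com/papers/{paper_id}",
             f"https://ar5iv.org/abs/{paper_id}"]:
    if contents := parse_vanity(link):  # HTML renderers tried first
        break
else:
    # Neither renderer could be parsed; fall back to the raw PDF.
    contents = fetch_pdf(f"https://arxiv.org/pdf/{paper_id}.pdf")
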