Commit
* Pypdf module and requests with retries
* Request with retries
* Fixing one record in the CSV
* Added pypdf to requirements
* use parsers
* remove arxiv csv file

Co-authored-by: Daniel O'Connell <[email protected]>
Showing 9 changed files with 184 additions and 1,874 deletions.
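The commit message mentions making requests with retries. The retry helper itself is not among the diffs shown on this page, so the following is only a minimal sketch of the usual pattern (urllib3's `Retry` mounted on a `requests.Session`), an assumption rather than the repository's actual code:

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def make_session(retries: int = 3, backoff: float = 0.5) -> requests.Session:
    """Return a Session that retries transient failures with exponential backoff.

    Hypothetical helper; the commit's real implementation may differ.
    """
    retry = Retry(
        total=retries,
        backoff_factor=backoff,
        status_forcelist=[429, 500, 502, 503, 504],  # rate limits and server errors
    )
    adapter = HTTPAdapter(max_retries=retry)
    session = requests.Session()
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session
```

With the adapter mounted on both schemes, every call made through the session transparently retries flaky responses instead of failing on the first error.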
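The message also notes that pypdf was added to the requirements, presumably backing the `fetch_pdf` fallback used in the arxiv dataset below. A hypothetical sketch of PDF text extraction with pypdf follows; the repository's `fetch_pdf` in `align_data.sources.articles.pdf` may differ in detail:

```python
import io

import requests
from pypdf import PdfReader


def extract_pdf_text(url: str) -> str:
    """Download a PDF and return its text, page by page (illustrative only)."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    reader = PdfReader(io.BytesIO(response.content))
    # extract_text() can return None for image-only pages, hence the `or ""`
    return "\n".join(page.extract_text() or "" for page in reader.pages)
```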
```diff
@@ -1,5 +1,9 @@
 from .arxiv_papers import ArxivPapers
 
 ARXIV_REGISTRY = [
-    ArxivPapers(name = "arxiv")
+    ArxivPapers(
+        name="arxiv",
+        spreadsheet_id='1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI',
+        sheet_id='655836697'
+    )
 ]
```
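The registry entry now carries a `spreadsheet_id` and `sheet_id` instead of relying on a bundled CSV, which matches the switch to `SpreadsheetDataset` in the next file. Presumably those ids point at a published Google Sheet; a hedged sketch of how such ids are commonly turned into a dataframe (not necessarily how `SpreadsheetDataset` itself does it):

```python
import pandas as pd


def sheet_to_dataframe(spreadsheet_id: str, sheet_id: str) -> pd.DataFrame:
    """Load a publicly shared Google Sheet tab as CSV (illustrative assumption)."""
    url = (
        f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}"
        f"/export?format=csv&gid={sheet_id}"
    )
    return pd.read_csv(url)
```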
```diff
@@ -1,171 +1,72 @@
-import arxiv
-import requests
 import logging
-import time
-import jsonlines
 
-import pandas as pd
 
 import re
 from dataclasses import dataclass
-from markdownify import markdownify
-from bs4 import BeautifulSoup
-from tqdm import tqdm
-from align_data.common.alignment_dataset import AlignmentDataset
 
+import arxiv
+from align_data.sources.articles.datasets import SpreadsheetDataset
+from align_data.sources.articles.pdf import fetch_pdf, parse_vanity
 
 logger = logging.getLogger(__name__)
 
 
 @dataclass
-class ArxivPapers(AlignmentDataset):
+class ArxivPapers(SpreadsheetDataset):
     summary_key: str = 'summary'
     COOLDOWN: int = 1
     done_key = "url"
     batch_size = 1
 
     def _get_arxiv_metadata(self, paper_id) -> arxiv.Result:
         """
         Get metadata from arxiv
         """
         try:
             search = arxiv.Search(id_list=[paper_id], max_results=1)
+            return next(search.results())
         except Exception as e:
             logger.error(e)
             return None
-        return next(search.results())
 
-    @property
-    def items_list(self):
-        self.papers_csv_path = self.raw_data_path / "ai-alignment-arxiv-papers.csv"
-
-        self.df = pd.read_csv(self.papers_csv_path)
-        self.df_arxiv = self.df[self.df["Url"].str.contains(
-            "arxiv.org/abs") == True].drop_duplicates(subset="Url", keep="first")
-
-        return [xx.split('/abs/')[1] for xx in self.df_arxiv.Url]
-
-    def process_entry(self, ids) -> None:
-        logger.info(f"Processing {ids}")
-
-        markdown = self.process_id(ids)
-
-        paper = self._get_arxiv_metadata(ids)
-        if markdown is None or paper is None:
-            logger.info(f"Skipping {ids}")
-            return None
-        else:
-            new_entry = self.make_data_entry({
-                "url": self.get_item_key(ids),
-                "source": self.name,
-                "source_type": "html",
-                "converted_with": "markdownify",
-                "title": paper.title,
-                "authors": [str(x) for x in paper.authors],
-                "date_published": paper.published,
-                "data_last_modified": str(paper.updated),
-                "abstract": paper.summary.replace("\n", " "),
-                "author_comment": paper.comment,
-                "journal_ref": paper.journal_ref,
-                "doi": paper.doi,
-                "primary_category": paper.primary_category,
-                "categories": paper.categories,
-                "text": markdown,
-            })
-            return new_entry
 
+    def get_id(self, item):
+        if res := re.search(r'https://arxiv.org/abs/(.*?)/?$', item.url):
+            return res.group(1)
 
-    def _is_bad_soup(self, soup, parser='vanity') -> bool:
-        if parser == 'vanity':
-            vanity_wrapper = soup.find("div", class_="arxiv-vanity-wrapper")
-            if vanity_wrapper is None:
-                return None
-            vanity_wrapper = vanity_wrapper.text
-            return vanity_wrapper and "don’t have to squint at a PDF" not in vanity_wrapper
-        if parser == 'ar5iv':
-            ar5iv_error = soup.find("span", class_="ltx_ERROR")
-            if ar5iv_error is None:
-                return False
-            else:
-                ar5iv_error = ar5iv_error.text
-            if "document may be truncated or damaged" in ar5iv_error:
-                return True
-            return False
+    def get_contents(self, item) -> dict:
+        paper_id = self.get_id(item)
+        for link in [f"https://www.arxiv-vanity.com/papers/{paper_id}", f"https://ar5iv.org/abs/{paper_id}"]:
+            if contents := parse_vanity(link):
+                return contents
+        return fetch_pdf(f'https://arxiv.org/pdf/{paper_id}.pdf')
 
+    def process_entry(self, item) -> None:
+        logger.info(f"Processing {item.title}")
 
-    def _is_dud(self, markdown) -> bool:
-        """
-        Check if markdown is a dud
-        """
-        return (
-            "Paper Not Renderable" in markdown or
-            "This document may be truncated" in markdown or
-            "don’t have to squint at a PDF" not in markdown
-        )
 
-    def _article_markdown_from_soup(self, soup):
-        """
-        Get markdown of the article from BeautifulSoup object of the page
-        """
-        article = soup.article
-        if article is None:
+        paper = self.get_contents(item)
+        if not paper or not paper.get('text'):
             return None
-        article = self._remove_bib_from_article_soup(article)
-        markdown = markdownify(str(article))
-        return markdown
 
-
-    def _get_parser_markdown(self, paper_id, parser="vanity") -> str:
-        """
-        Get markdown from the parser website, arxiv-vanity or ar5iv.org
-        """
-        if parser == "vanity":
-            link = f"https://www.arxiv-vanity.com/papers/{paper_id}"
-        elif parser == "ar5iv":
-            link = f"https://ar5iv.org/abs/{paper_id}"
-        logger.info(f"Fetching {link}")
-        try:
-            r = requests.get(link, timeout=5 * self.COOLDOWN)
-        except ValueError as e:
-            logger.error(f'{e}')
-            return None
-        if "//arxiv.org" in r.url:
-            return None
-        try:
-            soup = BeautifulSoup(r.content, features="xml")
-        except ValueError as e:
-            logger.error(f'{e}')
-            return None
-        if not self._is_bad_soup(soup,parser=parser):
-            return self._article_markdown_from_soup(soup)
-        return None
-
-
-    def get_item_key(self, paper_id) -> str:
-        """
-        Get arxiv link
-        """
-        return f"https://arxiv.org/abs/{paper_id}"
-
-    def _remove_bib_from_article_soup(self, article_soup) -> str:
-        """
-        Strip markdown
-        """
-        bib = article_soup.find("section", id="bib")
-        if bib:
-            bib.decompose()
-        return article_soup
-
-    def _strip_markdown(self, s_markdown):
-        return s_markdown.split("\nReferences\n")[0].replace("\n\n", "\n")
-
-    def process_id(self, paper_id) -> str:
-        """
-        Process arxiv id
-        """
-        markdown = self._get_parser_markdown(paper_id, parser="vanity")
-        if markdown is None:
-            markdown = self._get_parser_markdown(paper_id, parser="ar5iv")
-        if markdown is None:
-            return None
-        mardown_excerpt = markdown.replace('\n', '')[:100]
-        logger.info(f"Stripping markdown, {mardown_excerpt}")
-        s_markdown = self._strip_markdown(markdown)
-        return s_markdown
+        metadata = self._get_arxiv_metadata(self.get_id(item))
+        if self.is_val(item.authors) and item.authors.strip():
+            authors = item.authors.split(',')
+        elif metadata and metadata.authors:
+            authors = metadata.authors
+        else:
+            authors = paper.get('authors') or []
+        authors = [str(a).strip() for a in authors]
 
+        return self.make_data_entry({
+            "url": self.get_item_key(item),
+            "source": self.name,
+            "source_type": paper['data_source'],
+            "title": self.is_val(item.title) or paper.get('title'),
+            "authors": authors,
+            "date_published": self._get_published_date(self.is_val(item.date_published) or paper.get('date_published')),
+            "data_last_modified": str(metadata.updated),
+            "abstract": metadata.summary.replace("\n", " "),
+            "author_comment": metadata.comment,
+            "journal_ref": metadata.journal_ref,
+            "doi": metadata.doi,
+            "primary_category": metadata.primary_category,
+            "categories": metadata.categories,
+            "text": paper['text'],
+        })
```
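A note on the contract implied by the new code: `process_entry` only reads a handful of keys from whatever `get_contents` returns, so `parse_vanity` and `fetch_pdf` presumably both yield a dict shaped roughly like the sketch below. This is inferred from usage only; the real parsers may return more fields, and the values shown are invented:

```python
# Shape inferred from process_entry()'s lookups: paper['text'], paper['data_source'],
# and the .get() fallbacks for title, authors and date_published. Values are illustrative.
paper = {
    "text": "Full text of the paper, converted to markdown...",
    "data_source": "html",                  # becomes the entry's source_type
    "title": "An Example Paper",            # fallback when the sheet row has no title
    "authors": ["A. Author", "B. Author"],  # fallback author list
    "date_published": "2023-01-01",         # fallback publication date
}
```

If `text` is missing or empty the item is skipped, and the bibliographic fields are filled in from the arxiv metadata returned by `_get_arxiv_metadata`.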