diff --git a/align_data/__init__.py b/align_data/__init__.py index 9f6c9893..54041500 100644 --- a/align_data/__init__.py +++ b/align_data/__init__.py @@ -2,7 +2,6 @@ import align_data.sources.articles as articles import align_data.sources.blogs as blogs import align_data.sources.ebooks as ebooks -import align_data.sources.arxiv_papers as arxiv_papers import align_data.sources.greaterwrong as greaterwrong import align_data.sources.stampy as stampy import align_data.sources.alignment_newsletter as alignment_newsletter @@ -14,7 +13,6 @@ + articles.ARTICLES_REGISTRY + blogs.BLOG_REGISTRY + ebooks.EBOOK_REGISTRY - + arxiv_papers.ARXIV_REGISTRY + greaterwrong.GREATERWRONG_REGISTRY + stampy.STAMPY_REGISTRY + distill.DISTILL_REGISTRY diff --git a/align_data/common/alignment_dataset.py b/align_data/common/alignment_dataset.py index 6bbb2109..b129407a 100644 --- a/align_data/common/alignment_dataset.py +++ b/align_data/common/alignment_dataset.py @@ -209,7 +209,7 @@ def fetch_entries(self): if self.COOLDOWN: time.sleep(self.COOLDOWN) - def process_entry(self, entry): + def process_entry(self, entry) -> Optional[Article]: """Process a single entry.""" raise NotImplementedError @@ -217,7 +217,8 @@ def process_entry(self, entry): def _format_datetime(date) -> str: return date.strftime("%Y-%m-%dT%H:%M:%SZ") - def _get_published_date(self, date) -> Optional[datetime]: + @staticmethod + def _get_published_date(date) -> Optional[datetime]: try: # Totally ignore any timezone info, forcing everything to UTC return parse(str(date)).replace(tzinfo=pytz.UTC) diff --git a/align_data/sources/articles/__init__.py b/align_data/sources/articles/__init__.py index 7e9fdbde..da7f3a6b 100644 --- a/align_data/sources/articles/__init__.py +++ b/align_data/sources/articles/__init__.py @@ -1,5 +1,6 @@ from align_data.sources.articles.datasets import ( - EbookArticles, DocArticles, HTMLArticles, MarkdownArticles, PDFArticles, SpecialDocs, XMLArticles + ArxivPapers, EbookArticles, DocArticles, HTMLArticles, + MarkdownArticles, PDFArticles, SpecialDocs, XMLArticles ) from align_data.sources.articles.indices import IndicesDataset @@ -39,5 +40,10 @@ spreadsheet_id='1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI', sheet_id='980957638', ), + ArxivPapers( + name="arxiv", + spreadsheet_id="1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI", + sheet_id="655836697", + ), IndicesDataset('indices'), ] diff --git a/align_data/sources/articles/articles.py b/align_data/sources/articles/articles.py index 7485ce9e..9f16da77 100644 --- a/align_data/sources/articles/articles.py +++ b/align_data/sources/articles/articles.py @@ -65,7 +65,7 @@ def process_row(row, sheets): row.set_status(error) return - data_source = contents.get("data_source") + data_source = contents.get("source_type") if data_source not in sheets: error = "Unhandled data type" logger.error(error) diff --git a/align_data/sources/articles/datasets.py b/align_data/sources/articles/datasets.py index ceec27f5..6b223b61 100644 --- a/align_data/sources/articles/datasets.py +++ b/align_data/sources/articles/datasets.py @@ -2,6 +2,7 @@ import os from dataclasses import dataclass from pathlib import Path +from typing import Dict from urllib.parse import urlparse import pandas as pd @@ -13,8 +14,11 @@ from align_data.common.alignment_dataset import AlignmentDataset from align_data.db.models import Article from align_data.sources.articles.google_cloud import fetch_file, fetch_markdown -from align_data.sources.articles.parsers import HTML_PARSERS, extract_gdrive_contents, item_metadata +from 
align_data.sources.articles.parsers import ( + HTML_PARSERS, extract_gdrive_contents, item_metadata, parse_domain +) from align_data.sources.articles.pdf import read_pdf +from align_data.sources.arxiv_papers.arxiv_papers import fetch as fetch_arxiv logger = logging.getLogger(__name__) @@ -81,21 +85,30 @@ def _query_items(self): special_docs_types = ["pdf", "html", "xml", "markdown", "docx"] return select(Article).where(Article.source.in_(special_docs_types)) - def process_entry(self, item): + def get_contents(self, item) -> Dict: metadata = {} if url := self.maybe(item.source_url) or self.maybe(item.url): metadata = item_metadata(url) - return self.make_data_entry({ - 'source': metadata.get('data_source') or self.name, + return { 'url': self.maybe(item.url), 'title': self.maybe(item.title) or metadata.get('title'), + 'source': metadata.get('source_type') or self.name, 'source_type': self.maybe(item.source_type), 'date_published': self._get_published_date(item.date_published) or metadata.get('date_published'), 'authors': self.extract_authors(item) or metadata.get('authors', []), 'text': metadata.get('text'), 'status': metadata.get('error'), - }) + } + + def process_entry(self, item): + if parse_domain(item.url) == "arxiv.org": + contents = ArxivPapers.get_contents(item) + contents['source'] = 'arxiv' + else: + contents = self.get_contents(item) + + return self.make_data_entry(contents) class PDFArticles(SpreadsheetDataset): @@ -175,3 +188,26 @@ def _get_text(self, item): file_id = item.source_url.split("/")[-2] file_name = fetch_file(file_id) return convert_file(file_name, "md", format="docx", extra_args=["--wrap=none"]) + + +class ArxivPapers(SpreadsheetDataset): + COOLDOWN: int = 1 + + @classmethod + def get_contents(cls, item) -> Dict: + contents = fetch_arxiv(item.url or item.source_url) + + if cls.maybe(item.authors) and item.authors.strip(): + contents['authors'] = [i.strip() for i in item.authors.split(',')] + if cls.maybe(item.title): + contents['title'] = cls.maybe(item.title) + + contents['date_published'] = cls._get_published_date( + cls.maybe(item.date_published) or contents.get('date_published') + ) + return contents + + def process_entry(self, item): + logger.info(f"Processing {item.title}") + + return self.make_data_entry(self.get_contents(item), source=self.name) diff --git a/align_data/sources/articles/google_cloud.py b/align_data/sources/articles/google_cloud.py index 6cd9e337..b1e957f8 100644 --- a/align_data/sources/articles/google_cloud.py +++ b/align_data/sources/articles/google_cloud.py @@ -143,7 +143,7 @@ def fetch_markdown(file_id): file_name = fetch_file(file_id) return { "text": Path(file_name).read_text(), - "data_source": "markdown", + "source_type": "markdown", } except Exception as e: return {'error': str(e)} @@ -156,7 +156,7 @@ def parse_grobid(contents): if not doc_dict.get('body'): return { 'error': 'No contents in XML file', - 'data_source': 'xml', + 'source_type': 'xml', } return { @@ -164,7 +164,7 @@ def parse_grobid(contents): "abstract": doc_dict.get("abstract"), "text": doc_dict["body"], "authors": list(filter(None, authors)), - "data_source": "xml", + "source_type": "xml", } @@ -198,7 +198,7 @@ def extract_gdrive_contents(link): elif content_type & {'text/markdown'}: result.update(fetch_markdown(file_id)) elif content_type & {'application/epub+zip', 'application/epub'}: - result['data_source'] = 'ebook' + result['source_type'] = 'ebook' elif content_type & {'text/html'}: res = fetch(url) if 'Google Drive - Virus scan warning' in res.text: @@ -213,7 
+213,7 @@ def extract_gdrive_contents(link): soup = BeautifulSoup(res.content, "html.parser") result.update({ 'text': MarkdownConverter().convert_soup(soup.select_one('body')).strip(), - 'data_source': 'html', + 'source_type': 'html', }) else: result['error'] = f'unknown content type: {content_type}' diff --git a/align_data/sources/articles/parsers.py b/align_data/sources/articles/parsers.py index 85d23fe8..42c25c9f 100644 --- a/align_data/sources/articles/parsers.py +++ b/align_data/sources/articles/parsers.py @@ -250,8 +250,12 @@ def getter(url): } +def parse_domain(url: str) -> str: + return url and urlparse(url).netloc.lstrip('www.') + + def item_metadata(url) -> Dict[str, str]: - domain = urlparse(url).netloc.lstrip('www.') + domain = parse_domain(url) try: res = fetch(url, 'head') except (MissingSchema, InvalidSchema, ConnectionError) as e: @@ -265,7 +269,7 @@ def item_metadata(url) -> Dict[str, str]: if parser := HTML_PARSERS.get(domain): if res := parser(url): # Proper contents were found on the page, so use them - return {'source_url': url, 'data_source': 'html', 'text': res} + return {'source_url': url, 'source_type': 'html', 'text': res} if parser := PDF_PARSERS.get(domain): if res := parser(url): @@ -286,6 +290,6 @@ def item_metadata(url) -> Dict[str, str]: elif content_type & {"application/epub+zip", "application/epub"}: # it looks like an ebook. Assume it's fine. # TODO: validate that the ebook is readable - return {"source_url": url, "data_source": "ebook"} + return {"source_url": url, "source_type": "ebook"} else: return {"error": f"Unhandled content type: {content_type}"} diff --git a/align_data/sources/articles/pdf.py b/align_data/sources/articles/pdf.py index 9db52b9b..aca627f1 100644 --- a/align_data/sources/articles/pdf.py +++ b/align_data/sources/articles/pdf.py @@ -66,7 +66,7 @@ def fetch_pdf(link): return { "source_url": link, "text": "\n".join(page.extract_text() for page in pdf_reader.pages), - "data_source": "pdf", + "source_type": "pdf", } except (TypeError, PdfReadError) as e: logger.error('Could not read PDF file: %s', e) @@ -170,5 +170,5 @@ def get_first_child(item): "authors": authors, "text": text, "date_published": date_published, - "data_source": "html", + "source_type": "html", } diff --git a/align_data/sources/arxiv_papers/__init__.py b/align_data/sources/arxiv_papers/__init__.py index 29258480..e69de29b 100644 --- a/align_data/sources/arxiv_papers/__init__.py +++ b/align_data/sources/arxiv_papers/__init__.py @@ -1,9 +0,0 @@ -from .arxiv_papers import ArxivPapers - -ARXIV_REGISTRY = [ - ArxivPapers( - name="arxiv", - spreadsheet_id="1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI", - sheet_id="655836697", - ) -] diff --git a/align_data/sources/arxiv_papers/arxiv_papers.py b/align_data/sources/arxiv_papers/arxiv_papers.py index 42f6dcaa..04bb85b8 100644 --- a/align_data/sources/arxiv_papers/arxiv_papers.py +++ b/align_data/sources/arxiv_papers/arxiv_papers.py @@ -1,75 +1,86 @@ import logging import re -from dataclasses import dataclass +from typing import Dict, Optional import arxiv -from align_data.sources.articles.datasets import SpreadsheetDataset from align_data.sources.articles.pdf import fetch_pdf, parse_vanity +from align_data.sources.articles.html import fetch_element logger = logging.getLogger(__name__) -@dataclass -class ArxivPapers(SpreadsheetDataset): - summary_key: str = "summary" - COOLDOWN: int = 1 - done_key = "url" - batch_size = 1 - - def _get_arxiv_metadata(self, paper_id) -> arxiv.Result: - """ - Get metadata from arxiv - """ - try: - 
search = arxiv.Search(id_list=[paper_id], max_results=1) - return next(search.results()) - except Exception as e: - logger.error(e) - return None - - def get_id(self, item): - if res := re.search(r"https://arxiv.org/abs/(.*?)/?$", item.url): - return res.group(1) - - def get_contents(self, item) -> dict: - paper_id = self.get_id(item) - for link in [ - f"https://www.arxiv-vanity.com/papers/{paper_id}", - f"https://ar5iv.org/abs/{paper_id}", - ]: - if contents := parse_vanity(link): - return contents - return fetch_pdf(f"https://arxiv.org/pdf/{paper_id}.pdf") - - def process_entry(self, item) -> None: - logger.info(f"Processing {item.title}") - - paper = self.get_contents(item) - if not paper or not paper.get("text"): - return None - - metadata = self._get_arxiv_metadata(self.get_id(item)) - if self.maybe(item.authors) and item.authors.strip(): - authors = item.authors.split(',') - elif metadata and metadata.authors: - authors = metadata.authors - else: - authors = paper.get("authors") or [] - authors = [str(a).strip() for a in authors] - - return self.make_data_entry({ - "url": self.get_item_key(item), - "source": self.name, - "source_type": paper['data_source'], - "title": self.maybe(item.title) or paper.get('title'), - "authors": authors, - "date_published": self._get_published_date(self.maybe(item.date_published) or paper.get('date_published')), - "data_last_modified": str(metadata.updated), - "summary": metadata.summary.replace("\n", " "), - "author_comment": metadata.comment, - "journal_ref": metadata.journal_ref, - "doi": metadata.doi, - "primary_category": metadata.primary_category, - "categories": metadata.categories, - "text": paper['text'], - }) +def get_arxiv_metadata(paper_id) -> arxiv.Result: + """ + Get metadata from arxiv + """ + try: + search = arxiv.Search(id_list=[paper_id], max_results=1) + return next(search.results()) + except Exception as e: + logger.error(e) + return None + + +def get_id(url: str) -> Optional[str]: + if res := re.search(r"https?://arxiv.org/(?:abs|pdf)/(.*?)(?:v\d+)?(?:/|\.pdf)?$", url): + return res.group(1) + + +def canonical_url(url: str) -> str: + if paper_id := get_id(url): + return f'https://arxiv.org/abs/{paper_id}' + return url + + +def get_contents(paper_id: str) -> dict: + for link in [ + f"https://www.arxiv-vanity.com/papers/{paper_id}", + f"https://ar5iv.org/abs/{paper_id}", + ]: + if contents := parse_vanity(link): + return contents + return fetch_pdf(f"https://arxiv.org/pdf/{paper_id}.pdf") + + +def get_version(id: str) -> Optional[str]: + if res := re.search(r'.*v(\d+)$', id): + return res.group(1) + + +def is_withdrawn(url: str): + if elem := fetch_element(canonical_url(url), '.extra-services .full-text ul'): + return elem.text.strip().lower() == 'withdrawn' + return None + + +def fetch(url) -> Dict: + paper_id = get_id(url) + if not paper_id: + return {'error': 'Could not extract arxiv id'} + + metadata = get_arxiv_metadata(paper_id) + + if is_withdrawn(url): + paper = {'status': 'Withdrawn'} + else: + paper = get_contents(paper_id) + if metadata and metadata.authors: + authors = metadata.authors + else: + authors = paper.get("authors") or [] + authors = [str(a).strip() for a in authors] + + return dict({ + "title": metadata.title, + "url": canonical_url(url), + "authors": authors, + "date_published": metadata.published, + "data_last_modified": metadata.updated.isoformat(), + "summary": metadata.summary.replace("\n", " "), + "comment": metadata.comment, + "journal_ref": metadata.journal_ref, + "doi": metadata.doi, + 
"primary_category": metadata.primary_category, + "categories": metadata.categories, + "version": get_version(metadata.get_short_id()), + }, **paper) diff --git a/tests/align_data/articles/test_datasets.py b/tests/align_data/articles/test_datasets.py index eed98094..11a02816 100644 --- a/tests/align_data/articles/test_datasets.py +++ b/tests/align_data/articles/test_datasets.py @@ -1,14 +1,17 @@ +from datetime import datetime from unittest.mock import Mock, patch import pandas as pd import pytest from align_data.sources.articles.datasets import ( + ArxivPapers, EbookArticles, DocArticles, HTMLArticles, MarkdownArticles, PDFArticles, SpreadsheetDataset, + SpecialDocs, XMLArticles, ) @@ -32,6 +35,26 @@ def articles(): return pd.DataFrame(articles) +@pytest.fixture +def mock_arxiv(): + metadata = Mock( + summary="abstract bla bla", + comment="no comment", + categories="wut", + updated=datetime.fromisoformat("2023-01-01T00:00:00"), + authors=[], + doi="123", + journal_ref="sdf", + primary_category="cat", + ) + metadata.get_short_id.return_value = '2001.11038' + arxiv = Mock() + arxiv.Search.return_value.results.return_value = iter([metadata]) + + with patch("align_data.sources.arxiv_papers.arxiv_papers.arxiv", arxiv): + yield + + def test_spreadsheet_dataset_items_list(articles): dataset = SpreadsheetDataset(name="bla", spreadsheet_id="123", sheet_id="456") df = pd.concat( @@ -288,3 +311,149 @@ def test_doc_articles_process_entry(articles): "title": "article no 0", "url": "http://example.com/item/0", } + + +@patch('requests.get', return_value=Mock(content='')) +def test_arxiv_process_entry(_, mock_arxiv): + dataset = ArxivPapers(name="asd", spreadsheet_id="ad", sheet_id="da") + item = Mock( + title="this is the title", + url="https://arxiv.org/abs/2001.11038", + authors="", + date_published="2020-01-29", + ) + contents = { + "text": "this is the text", + "date_published": "December 12, 2021", + "authors": ["mr blobby"], + "source_type": "html", + } + with patch( + "align_data.sources.arxiv_papers.arxiv_papers.parse_vanity", return_value=contents + ): + assert dataset.process_entry(item).to_dict() == { + "comment": "no comment", + "authors": ["mr blobby"], + "categories": "wut", + "data_last_modified": "2023-01-01T00:00:00", + "date_published": "2020-01-29T00:00:00Z", + "doi": "123", + "id": None, + "journal_ref": "sdf", + "primary_category": "cat", + "source": "asd", + "source_type": "html", + "summaries": ["abstract bla bla"], + "text": "this is the text", + "title": "this is the title", + "url": "https://arxiv.org/abs/2001.11038", + } + + +def test_arxiv_process_entry_retracted(mock_arxiv): + dataset = ArxivPapers(name="asd", spreadsheet_id="ad", sheet_id="da") + item = Mock( + title="this is the title", + url="https://arxiv.org/abs/2001.11038", + authors="", + date_published="2020-01-29", + ) + response = """ +
+ """ + + with patch('requests.get', return_value=Mock(content=response)): + assert dataset.process_entry(item).to_dict() == { + "comment": "no comment", + "authors": [], + "categories": "wut", + "data_last_modified": "2023-01-01T00:00:00", + "date_published": "2020-01-29T00:00:00Z", + "doi": "123", + "id": None, + "journal_ref": "sdf", + "primary_category": "cat", + "source": "asd", + "source_type": None, + "summaries": ["abstract bla bla"], + "title": "this is the title", + "url": "https://arxiv.org/abs/2001.11038", + "status": "Withdrawn", + "text": None, + } + + +def test_special_docs_process_entry(): + dataset = SpecialDocs(name="asd", spreadsheet_id="ad", sheet_id="da") + item = Mock( + title="this is the title", + url="https://bla.bla.bla", + authors="mr. blobby", + date_published="2023-10-02T01:23:45", + source_type=None, + ) + contents = { + "text": "this is the text", + "date_published": "December 12, 2021", + "authors": ["mr blobby"], + "source_type": "html", + } + + with patch("align_data.sources.articles.datasets.item_metadata", return_value=contents): + assert dataset.process_entry(item).to_dict() == { + 'authors': ['mr. blobby'], + 'date_published': '2023-10-02T01:23:45Z', + 'id': None, + 'source': 'html', + 'source_type': None, + 'summaries': [], + 'text': 'this is the text', + 'title': 'this is the title', + 'url': 'https://bla.bla.bla', + } + + +@patch('requests.get', return_value=Mock(content='')) +def test_special_docs_process_entry_arxiv(_, mock_arxiv): + dataset = SpecialDocs(name="asd", spreadsheet_id="ad", sheet_id="da") + item = Mock( + title="this is the title", + url="https://arxiv.org/abs/2001.11038", + authors="", + date_published="2020-01-29", + ) + contents = { + "text": "this is the text", + "date_published": "December 12, 2021", + "authors": ["mr blobby"], + "source_type": "pdf", + } + + with patch( + "align_data.sources.arxiv_papers.arxiv_papers.parse_vanity", return_value=contents + ): + assert dataset.process_entry(item).to_dict() == { + "comment": "no comment", + "authors": ["mr blobby"], + "categories": "wut", + "data_last_modified": "2023-01-01T00:00:00", + "date_published": "2020-01-29T00:00:00Z", + "doi": "123", + "id": None, + "journal_ref": "sdf", + "primary_category": "cat", + "source": "arxiv", + "source_type": "pdf", + "summaries": ["abstract bla bla"], + "text": "this is the text", + "title": "this is the title", + "url": "https://arxiv.org/abs/2001.11038", + } diff --git a/tests/align_data/articles/test_google_cloud.py b/tests/align_data/articles/test_google_cloud.py index 3232978a..7b268e43 100644 --- a/tests/align_data/articles/test_google_cloud.py +++ b/tests/align_data/articles/test_google_cloud.py @@ -78,7 +78,7 @@ def test_parse_grobid(): 'authors': ['Cullen Oâ\x80\x99Keefe'], 'text': 'This is the contents', 'title': 'The title!!', - 'data_source': 'xml', + 'source_type': 'xml', } @@ -100,7 +100,7 @@ def test_parse_grobid_no_body(): """ - assert parse_grobid(xml) == {'error': 'No contents in XML file', 'data_source': 'xml'} + assert parse_grobid(xml) == {'error': 'No contents in XML file', 'source_type': 'xml'} @pytest.mark.parametrize('header, expected', ( @@ -160,7 +160,7 @@ def test_extract_gdrive_contents_ebook(header): assert extract_gdrive_contents(url) == { 'downloaded_from': 'google drive', 'source_url': 'https://drive.google.com/file/d/1OrKZlksba2a8gKa5bAQfP2qF717O_57I/view?usp=sharing', - 'data_source': 'ebook', + 'source_type': 'ebook', } @@ -185,7 +185,7 @@ def test_extract_gdrive_contents_html(): 'downloaded_from': 'google 
drive', 'source_url': 'https://drive.google.com/file/d/1OrKZlksba2a8gKa5bAQfP2qF717O_57I/view?usp=sharing', 'text': 'bla bla', - 'data_source': 'html', + 'source_type': 'html', } @@ -207,7 +207,7 @@ def test_extract_gdrive_contents_xml(): 'source_url': 'https://drive.google.com/file/d/1OrKZlksba2a8gKa5bAQfP2qF717O_57I/view?usp=sharing', 'text': 'This is the contents', 'title': 'The title!!', - 'data_source': 'xml', + 'source_type': 'xml', } @@ -238,7 +238,7 @@ def fetcher(link, *args, **kwargs): 'source_url': 'https://drive.google.com/file/d/1OrKZlksba2a8gKa5bAQfP2qF717O_57I/view?usp=sharing', 'text': 'This is the contents', 'title': 'The title!!', - 'data_source': 'xml', + 'source_type': 'xml', } diff --git a/tests/align_data/test_arxiv.py b/tests/align_data/test_arxiv.py index 5817fd2c..898fef70 100644 --- a/tests/align_data/test_arxiv.py +++ b/tests/align_data/test_arxiv.py @@ -1,7 +1,5 @@ -from datetime import datetime -from unittest.mock import patch, Mock import pytest -from align_data.sources.arxiv_papers.arxiv_papers import ArxivPapers +from align_data.sources.arxiv_papers.arxiv_papers import get_id, canonical_url, get_version @pytest.mark.parametrize( @@ -13,55 +11,28 @@ ), ) def test_get_id(url, expected): - dataset = ArxivPapers(name="asd", spreadsheet_id="ad", sheet_id="da") - assert dataset.get_id(Mock(url="https://arxiv.org/abs/2001.11038")) == "2001.11038" + assert get_id("https://arxiv.org/abs/2001.11038") == "2001.11038" -def test_process_entry(): - dataset = ArxivPapers(name="asd", spreadsheet_id="ad", sheet_id="da") - item = Mock( - title="this is the title", - url="https://arxiv.org/abs/2001.11038", - authors="", - date_published="2020-01-29", - ) - contents = { - "text": "this is the text", - "date_published": "December 12, 2021", - "authors": ["mr blobby"], - "data_source": "html", - } - metadata = Mock( - summary="abstract bla bla", - comment="no comment", - categories="wut", - updated="2023-01-01", - authors=[], - doi="123", - journal_ref="sdf", - primary_category="cat", - ) - arxiv = Mock() - arxiv.Search.return_value.results.return_value = iter([metadata]) +@pytest.mark.parametrize('url, expected', ( + ("http://bla.bla", "http://bla.bla"), + ("http://arxiv.org/abs/2001.11038", "https://arxiv.org/abs/2001.11038"), + ("https://arxiv.org/abs/2001.11038", "https://arxiv.org/abs/2001.11038"), + ("https://arxiv.org/abs/2001.11038/", "https://arxiv.org/abs/2001.11038"), + ("https://arxiv.org/pdf/2001.11038", "https://arxiv.org/abs/2001.11038"), + ("https://arxiv.org/pdf/2001.11038.pdf", "https://arxiv.org/abs/2001.11038"), + ("https://arxiv.org/pdf/2001.11038v3.pdf", "https://arxiv.org/abs/2001.11038"), + ("https://arxiv.org/abs/math/2001.11038", "https://arxiv.org/abs/math/2001.11038"), +)) +def test_canonical_url(url, expected): + assert canonical_url(url) == expected - with patch( - "align_data.arxiv_papers.arxiv_papers.parse_vanity", return_value=contents - ): - with patch("align_data.arxiv_papers.arxiv_papers.arxiv", arxiv): - assert dataset.process_entry(item).to_dict() == { - "author_comment": "no comment", - "authors": ["mr blobby"], - "categories": "wut", - "data_last_modified": "2023-01-01", - "date_published": "2020-01-29T00:00:00Z", - "doi": "123", - "id": None, - "journal_ref": "sdf", - "primary_category": "cat", - "source": "asd", - "source_type": "html", - "summaries": ["abstract bla bla"], - "text": "this is the text", - "title": "this is the title", - "url": "https://arxiv.org/abs/2001.11038", - } + +@pytest.mark.parametrize('id, version', ( + 
('123.123', None), + ('math/312', None), + ('3123123v1', '1'), + ('3123123v123', '123'), +)) +def test_get_version(id, version): + assert get_version(id) == version
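
For reference, a short usage sketch (not part of the patch itself) of the module-level helpers introduced in align_data/sources/arxiv_papers/arxiv_papers.py; the concrete values mirror tests/align_data/test_arxiv.py, and running fetch() assumes the package is importable and network access is available.

# Minimal sketch of how the refactored arxiv helpers are meant to be used;
# expected values below come from the tests in tests/align_data/test_arxiv.py.
from align_data.sources.arxiv_papers.arxiv_papers import (
    canonical_url,
    fetch,
    get_id,
    get_version,
)

# /abs/ and /pdf/ URLs, trailing slashes and version suffixes all collapse to
# one paper id and one canonical abstract URL.
assert get_id("https://arxiv.org/pdf/2001.11038v3.pdf") == "2001.11038"
assert canonical_url("http://arxiv.org/abs/2001.11038") == "https://arxiv.org/abs/2001.11038"
assert get_version("2001.11038v3") == "3"

if __name__ == "__main__":
    # fetch() queries the arxiv API for metadata and pulls full text from
    # arxiv-vanity, ar5iv, or the PDF as a fallback, so it needs network
    # access. Withdrawn papers come back with {"status": "Withdrawn"} and
    # no text.
    paper = fetch("https://arxiv.org/abs/2001.11038")
    print(paper["url"], paper.get("source_type"), paper.get("status"))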