diff --git a/align_data/__init__.py b/align_data/__init__.py index 9f6c9893..54041500 100644 --- a/align_data/__init__.py +++ b/align_data/__init__.py @@ -2,7 +2,6 @@ import align_data.sources.articles as articles import align_data.sources.blogs as blogs import align_data.sources.ebooks as ebooks -import align_data.sources.arxiv_papers as arxiv_papers import align_data.sources.greaterwrong as greaterwrong import align_data.sources.stampy as stampy import align_data.sources.alignment_newsletter as alignment_newsletter @@ -14,7 +13,6 @@ + articles.ARTICLES_REGISTRY + blogs.BLOG_REGISTRY + ebooks.EBOOK_REGISTRY - + arxiv_papers.ARXIV_REGISTRY + greaterwrong.GREATERWRONG_REGISTRY + stampy.STAMPY_REGISTRY + distill.DISTILL_REGISTRY diff --git a/align_data/common/alignment_dataset.py b/align_data/common/alignment_dataset.py index 6bbb2109..b129407a 100644 --- a/align_data/common/alignment_dataset.py +++ b/align_data/common/alignment_dataset.py @@ -209,7 +209,7 @@ def fetch_entries(self): if self.COOLDOWN: time.sleep(self.COOLDOWN) - def process_entry(self, entry): + def process_entry(self, entry) -> Optional[Article]: """Process a single entry.""" raise NotImplementedError @@ -217,7 +217,8 @@ def process_entry(self, entry): def _format_datetime(date) -> str: return date.strftime("%Y-%m-%dT%H:%M:%SZ") - def _get_published_date(self, date) -> Optional[datetime]: + @staticmethod + def _get_published_date(date) -> Optional[datetime]: try: # Totally ignore any timezone info, forcing everything to UTC return parse(str(date)).replace(tzinfo=pytz.UTC) diff --git a/align_data/sources/articles/__init__.py b/align_data/sources/articles/__init__.py index 7e9fdbde..da7f3a6b 100644 --- a/align_data/sources/articles/__init__.py +++ b/align_data/sources/articles/__init__.py @@ -1,5 +1,6 @@ from align_data.sources.articles.datasets import ( - EbookArticles, DocArticles, HTMLArticles, MarkdownArticles, PDFArticles, SpecialDocs, XMLArticles + ArxivPapers, EbookArticles, DocArticles, HTMLArticles, + MarkdownArticles, PDFArticles, SpecialDocs, XMLArticles ) from align_data.sources.articles.indices import IndicesDataset @@ -39,5 +40,10 @@ spreadsheet_id='1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI', sheet_id='980957638', ), + ArxivPapers( + name="arxiv", + spreadsheet_id="1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI", + sheet_id="655836697", + ), IndicesDataset('indices'), ] diff --git a/align_data/sources/articles/articles.py b/align_data/sources/articles/articles.py index 7485ce9e..9f16da77 100644 --- a/align_data/sources/articles/articles.py +++ b/align_data/sources/articles/articles.py @@ -65,7 +65,7 @@ def process_row(row, sheets): row.set_status(error) return - data_source = contents.get("data_source") + data_source = contents.get("source_type") if data_source not in sheets: error = "Unhandled data type" logger.error(error) diff --git a/align_data/sources/articles/datasets.py b/align_data/sources/articles/datasets.py index ceec27f5..6b223b61 100644 --- a/align_data/sources/articles/datasets.py +++ b/align_data/sources/articles/datasets.py @@ -2,6 +2,7 @@ import os from dataclasses import dataclass from pathlib import Path +from typing import Dict from urllib.parse import urlparse import pandas as pd @@ -13,8 +14,11 @@ from align_data.common.alignment_dataset import AlignmentDataset from align_data.db.models import Article from align_data.sources.articles.google_cloud import fetch_file, fetch_markdown -from align_data.sources.articles.parsers import HTML_PARSERS, extract_gdrive_contents, item_metadata +from 
align_data.sources.articles.parsers import ( + HTML_PARSERS, extract_gdrive_contents, item_metadata, parse_domain +) from align_data.sources.articles.pdf import read_pdf +from align_data.sources.arxiv_papers.arxiv_papers import fetch as fetch_arxiv logger = logging.getLogger(__name__) @@ -81,21 +85,30 @@ def _query_items(self): special_docs_types = ["pdf", "html", "xml", "markdown", "docx"] return select(Article).where(Article.source.in_(special_docs_types)) - def process_entry(self, item): + def get_contents(self, item) -> Dict: metadata = {} if url := self.maybe(item.source_url) or self.maybe(item.url): metadata = item_metadata(url) - return self.make_data_entry({ - 'source': metadata.get('data_source') or self.name, + return { 'url': self.maybe(item.url), 'title': self.maybe(item.title) or metadata.get('title'), + 'source': metadata.get('source_type') or self.name, 'source_type': self.maybe(item.source_type), 'date_published': self._get_published_date(item.date_published) or metadata.get('date_published'), 'authors': self.extract_authors(item) or metadata.get('authors', []), 'text': metadata.get('text'), 'status': metadata.get('error'), - }) + } + + def process_entry(self, item): + if parse_domain(item.url) == "arxiv.org": + contents = ArxivPapers.get_contents(item) + contents['source'] = 'arxiv' + else: + contents = self.get_contents(item) + + return self.make_data_entry(contents) class PDFArticles(SpreadsheetDataset): @@ -175,3 +188,26 @@ def _get_text(self, item): file_id = item.source_url.split("/")[-2] file_name = fetch_file(file_id) return convert_file(file_name, "md", format="docx", extra_args=["--wrap=none"]) + + +class ArxivPapers(SpreadsheetDataset): + COOLDOWN: int = 1 + + @classmethod + def get_contents(cls, item) -> Dict: + contents = fetch_arxiv(item.url or item.source_url) + + if cls.maybe(item.authors) and item.authors.strip(): + contents['authors'] = [i.strip() for i in item.authors.split(',')] + if cls.maybe(item.title): + contents['title'] = cls.maybe(item.title) + + contents['date_published'] = cls._get_published_date( + cls.maybe(item.date_published) or contents.get('date_published') + ) + return contents + + def process_entry(self, item): + logger.info(f"Processing {item.title}") + + return self.make_data_entry(self.get_contents(item), source=self.name) diff --git a/align_data/sources/articles/google_cloud.py b/align_data/sources/articles/google_cloud.py index 6cd9e337..b1e957f8 100644 --- a/align_data/sources/articles/google_cloud.py +++ b/align_data/sources/articles/google_cloud.py @@ -143,7 +143,7 @@ def fetch_markdown(file_id): file_name = fetch_file(file_id) return { "text": Path(file_name).read_text(), - "data_source": "markdown", + "source_type": "markdown", } except Exception as e: return {'error': str(e)} @@ -156,7 +156,7 @@ def parse_grobid(contents): if not doc_dict.get('body'): return { 'error': 'No contents in XML file', - 'data_source': 'xml', + 'source_type': 'xml', } return { @@ -164,7 +164,7 @@ def parse_grobid(contents): "abstract": doc_dict.get("abstract"), "text": doc_dict["body"], "authors": list(filter(None, authors)), - "data_source": "xml", + "source_type": "xml", } @@ -198,7 +198,7 @@ def extract_gdrive_contents(link): elif content_type & {'text/markdown'}: result.update(fetch_markdown(file_id)) elif content_type & {'application/epub+zip', 'application/epub'}: - result['data_source'] = 'ebook' + result['source_type'] = 'ebook' elif content_type & {'text/html'}: res = fetch(url) if 'Google Drive - Virus scan warning' in res.text: @@ -213,7 
+213,7 @@ def extract_gdrive_contents(link): soup = BeautifulSoup(res.content, "html.parser") result.update({ 'text': MarkdownConverter().convert_soup(soup.select_one('body')).strip(), - 'data_source': 'html', + 'source_type': 'html', }) else: result['error'] = f'unknown content type: {content_type}' diff --git a/align_data/sources/articles/parsers.py b/align_data/sources/articles/parsers.py index 85d23fe8..42c25c9f 100644 --- a/align_data/sources/articles/parsers.py +++ b/align_data/sources/articles/parsers.py @@ -250,8 +250,12 @@ def getter(url): } +def parse_domain(url: str) -> str: + return url and urlparse(url).netloc.lstrip('www.') + + def item_metadata(url) -> Dict[str, str]: - domain = urlparse(url).netloc.lstrip('www.') + domain = parse_domain(url) try: res = fetch(url, 'head') except (MissingSchema, InvalidSchema, ConnectionError) as e: @@ -265,7 +269,7 @@ def item_metadata(url) -> Dict[str, str]: if parser := HTML_PARSERS.get(domain): if res := parser(url): # Proper contents were found on the page, so use them - return {'source_url': url, 'data_source': 'html', 'text': res} + return {'source_url': url, 'source_type': 'html', 'text': res} if parser := PDF_PARSERS.get(domain): if res := parser(url): @@ -286,6 +290,6 @@ def item_metadata(url) -> Dict[str, str]: elif content_type & {"application/epub+zip", "application/epub"}: # it looks like an ebook. Assume it's fine. # TODO: validate that the ebook is readable - return {"source_url": url, "data_source": "ebook"} + return {"source_url": url, "source_type": "ebook"} else: return {"error": f"Unhandled content type: {content_type}"} diff --git a/align_data/sources/articles/pdf.py b/align_data/sources/articles/pdf.py index 9db52b9b..aca627f1 100644 --- a/align_data/sources/articles/pdf.py +++ b/align_data/sources/articles/pdf.py @@ -66,7 +66,7 @@ def fetch_pdf(link): return { "source_url": link, "text": "\n".join(page.extract_text() for page in pdf_reader.pages), - "data_source": "pdf", + "source_type": "pdf", } except (TypeError, PdfReadError) as e: logger.error('Could not read PDF file: %s', e) @@ -170,5 +170,5 @@ def get_first_child(item): "authors": authors, "text": text, "date_published": date_published, - "data_source": "html", + "source_type": "html", } diff --git a/align_data/sources/arxiv_papers/__init__.py b/align_data/sources/arxiv_papers/__init__.py index 29258480..e69de29b 100644 --- a/align_data/sources/arxiv_papers/__init__.py +++ b/align_data/sources/arxiv_papers/__init__.py @@ -1,9 +0,0 @@ -from .arxiv_papers import ArxivPapers - -ARXIV_REGISTRY = [ - ArxivPapers( - name="arxiv", - spreadsheet_id="1pgG3HzercOhf4gniaqp3tBc3uvZnHpPhXErwHcthmbI", - sheet_id="655836697", - ) -] diff --git a/align_data/sources/arxiv_papers/arxiv_papers.py b/align_data/sources/arxiv_papers/arxiv_papers.py index 42f6dcaa..04bb85b8 100644 --- a/align_data/sources/arxiv_papers/arxiv_papers.py +++ b/align_data/sources/arxiv_papers/arxiv_papers.py @@ -1,75 +1,86 @@ import logging import re -from dataclasses import dataclass +from typing import Dict, Optional import arxiv -from align_data.sources.articles.datasets import SpreadsheetDataset from align_data.sources.articles.pdf import fetch_pdf, parse_vanity +from align_data.sources.articles.html import fetch_element logger = logging.getLogger(__name__) -@dataclass -class ArxivPapers(SpreadsheetDataset): - summary_key: str = "summary" - COOLDOWN: int = 1 - done_key = "url" - batch_size = 1 - - def _get_arxiv_metadata(self, paper_id) -> arxiv.Result: - """ - Get metadata from arxiv - """ - try: - 
search = arxiv.Search(id_list=[paper_id], max_results=1) - return next(search.results()) - except Exception as e: - logger.error(e) - return None - - def get_id(self, item): - if res := re.search(r"https://arxiv.org/abs/(.*?)/?$", item.url): - return res.group(1) - - def get_contents(self, item) -> dict: - paper_id = self.get_id(item) - for link in [ - f"https://www.arxiv-vanity.com/papers/{paper_id}", - f"https://ar5iv.org/abs/{paper_id}", - ]: - if contents := parse_vanity(link): - return contents - return fetch_pdf(f"https://arxiv.org/pdf/{paper_id}.pdf") - - def process_entry(self, item) -> None: - logger.info(f"Processing {item.title}") - - paper = self.get_contents(item) - if not paper or not paper.get("text"): - return None - - metadata = self._get_arxiv_metadata(self.get_id(item)) - if self.maybe(item.authors) and item.authors.strip(): - authors = item.authors.split(',') - elif metadata and metadata.authors: - authors = metadata.authors - else: - authors = paper.get("authors") or [] - authors = [str(a).strip() for a in authors] - - return self.make_data_entry({ - "url": self.get_item_key(item), - "source": self.name, - "source_type": paper['data_source'], - "title": self.maybe(item.title) or paper.get('title'), - "authors": authors, - "date_published": self._get_published_date(self.maybe(item.date_published) or paper.get('date_published')), - "data_last_modified": str(metadata.updated), - "summary": metadata.summary.replace("\n", " "), - "author_comment": metadata.comment, - "journal_ref": metadata.journal_ref, - "doi": metadata.doi, - "primary_category": metadata.primary_category, - "categories": metadata.categories, - "text": paper['text'], - }) +def get_arxiv_metadata(paper_id) -> arxiv.Result: + """ + Get metadata from arxiv + """ + try: + search = arxiv.Search(id_list=[paper_id], max_results=1) + return next(search.results()) + except Exception as e: + logger.error(e) + return None + + +def get_id(url: str) -> Optional[str]: + if res := re.search(r"https?://arxiv.org/(?:abs|pdf)/(.*?)(?:v\d+)?(?:/|\.pdf)?$", url): + return res.group(1) + + +def canonical_url(url: str) -> str: + if paper_id := get_id(url): + return f'https://arxiv.org/abs/{paper_id}' + return url + + +def get_contents(paper_id: str) -> dict: + for link in [ + f"https://www.arxiv-vanity.com/papers/{paper_id}", + f"https://ar5iv.org/abs/{paper_id}", + ]: + if contents := parse_vanity(link): + return contents + return fetch_pdf(f"https://arxiv.org/pdf/{paper_id}.pdf") + + +def get_version(id: str) -> Optional[str]: + if res := re.search(r'.*v(\d+)$', id): + return res.group(1) + + +def is_withdrawn(url: str): + if elem := fetch_element(canonical_url(url), '.extra-services .full-text ul'): + return elem.text.strip().lower() == 'withdrawn' + return None + + +def fetch(url) -> Dict: + paper_id = get_id(url) + if not paper_id: + return {'error': 'Could not extract arxiv id'} + + metadata = get_arxiv_metadata(paper_id) + + if is_withdrawn(url): + paper = {'status': 'Withdrawn'} + else: + paper = get_contents(paper_id) + if metadata and metadata.authors: + authors = metadata.authors + else: + authors = paper.get("authors") or [] + authors = [str(a).strip() for a in authors] + + return dict({ + "title": metadata.title, + "url": canonical_url(url), + "authors": authors, + "date_published": metadata.published, + "data_last_modified": metadata.updated.isoformat(), + "summary": metadata.summary.replace("\n", " "), + "comment": metadata.comment, + "journal_ref": metadata.journal_ref, + "doi": metadata.doi, + 
"primary_category": metadata.primary_category, + "categories": metadata.categories, + "version": get_version(metadata.get_short_id()), + }, **paper) diff --git a/tests/align_data/articles/test_datasets.py b/tests/align_data/articles/test_datasets.py index eed98094..11a02816 100644 --- a/tests/align_data/articles/test_datasets.py +++ b/tests/align_data/articles/test_datasets.py @@ -1,14 +1,17 @@ +from datetime import datetime from unittest.mock import Mock, patch import pandas as pd import pytest from align_data.sources.articles.datasets import ( + ArxivPapers, EbookArticles, DocArticles, HTMLArticles, MarkdownArticles, PDFArticles, SpreadsheetDataset, + SpecialDocs, XMLArticles, ) @@ -32,6 +35,26 @@ def articles(): return pd.DataFrame(articles) +@pytest.fixture +def mock_arxiv(): + metadata = Mock( + summary="abstract bla bla", + comment="no comment", + categories="wut", + updated=datetime.fromisoformat("2023-01-01T00:00:00"), + authors=[], + doi="123", + journal_ref="sdf", + primary_category="cat", + ) + metadata.get_short_id.return_value = '2001.11038' + arxiv = Mock() + arxiv.Search.return_value.results.return_value = iter([metadata]) + + with patch("align_data.sources.arxiv_papers.arxiv_papers.arxiv", arxiv): + yield + + def test_spreadsheet_dataset_items_list(articles): dataset = SpreadsheetDataset(name="bla", spreadsheet_id="123", sheet_id="456") df = pd.concat( @@ -288,3 +311,149 @@ def test_doc_articles_process_entry(articles): "title": "article no 0", "url": "http://example.com/item/0", } + + +@patch('requests.get', return_value=Mock(content='')) +def test_arxiv_process_entry(_, mock_arxiv): + dataset = ArxivPapers(name="asd", spreadsheet_id="ad", sheet_id="da") + item = Mock( + title="this is the title", + url="https://arxiv.org/abs/2001.11038", + authors="", + date_published="2020-01-29", + ) + contents = { + "text": "this is the text", + "date_published": "December 12, 2021", + "authors": ["mr blobby"], + "source_type": "html", + } + with patch( + "align_data.sources.arxiv_papers.arxiv_papers.parse_vanity", return_value=contents + ): + assert dataset.process_entry(item).to_dict() == { + "comment": "no comment", + "authors": ["mr blobby"], + "categories": "wut", + "data_last_modified": "2023-01-01T00:00:00", + "date_published": "2020-01-29T00:00:00Z", + "doi": "123", + "id": None, + "journal_ref": "sdf", + "primary_category": "cat", + "source": "asd", + "source_type": "html", + "summaries": ["abstract bla bla"], + "text": "this is the text", + "title": "this is the title", + "url": "https://arxiv.org/abs/2001.11038", + } + + +def test_arxiv_process_entry_retracted(mock_arxiv): + dataset = ArxivPapers(name="asd", spreadsheet_id="ad", sheet_id="da") + item = Mock( + title="this is the title", + url="https://arxiv.org/abs/2001.11038", + authors="", + date_published="2020-01-29", + ) + response = """ +
+ """ + + with patch('requests.get', return_value=Mock(content=response)): + assert dataset.process_entry(item).to_dict() == { + "comment": "no comment", + "authors": [], + "categories": "wut", + "data_last_modified": "2023-01-01T00:00:00", + "date_published": "2020-01-29T00:00:00Z", + "doi": "123", + "id": None, + "journal_ref": "sdf", + "primary_category": "cat", + "source": "asd", + "source_type": None, + "summaries": ["abstract bla bla"], + "title": "this is the title", + "url": "https://arxiv.org/abs/2001.11038", + "status": "Withdrawn", + "text": None, + } + + +def test_special_docs_process_entry(): + dataset = SpecialDocs(name="asd", spreadsheet_id="ad", sheet_id="da") + item = Mock( + title="this is the title", + url="https://bla.bla.bla", + authors="mr. blobby", + date_published="2023-10-02T01:23:45", + source_type=None, + ) + contents = { + "text": "this is the text", + "date_published": "December 12, 2021", + "authors": ["mr blobby"], + "source_type": "html", + } + + with patch("align_data.sources.articles.datasets.item_metadata", return_value=contents): + assert dataset.process_entry(item).to_dict() == { + 'authors': ['mr. blobby'], + 'date_published': '2023-10-02T01:23:45Z', + 'id': None, + 'source': 'html', + 'source_type': None, + 'summaries': [], + 'text': 'this is the text', + 'title': 'this is the title', + 'url': 'https://bla.bla.bla', + } + + +@patch('requests.get', return_value=Mock(content='')) +def test_special_docs_process_entry_arxiv(_, mock_arxiv): + dataset = SpecialDocs(name="asd", spreadsheet_id="ad", sheet_id="da") + item = Mock( + title="this is the title", + url="https://arxiv.org/abs/2001.11038", + authors="", + date_published="2020-01-29", + ) + contents = { + "text": "this is the text", + "date_published": "December 12, 2021", + "authors": ["mr blobby"], + "source_type": "pdf", + } + + with patch( + "align_data.sources.arxiv_papers.arxiv_papers.parse_vanity", return_value=contents + ): + assert dataset.process_entry(item).to_dict() == { + "comment": "no comment", + "authors": ["mr blobby"], + "categories": "wut", + "data_last_modified": "2023-01-01T00:00:00", + "date_published": "2020-01-29T00:00:00Z", + "doi": "123", + "id": None, + "journal_ref": "sdf", + "primary_category": "cat", + "source": "arxiv", + "source_type": "pdf", + "summaries": ["abstract bla bla"], + "text": "this is the text", + "title": "this is the title", + "url": "https://arxiv.org/abs/2001.11038", + } diff --git a/tests/align_data/articles/test_google_cloud.py b/tests/align_data/articles/test_google_cloud.py index 3232978a..7b268e43 100644 --- a/tests/align_data/articles/test_google_cloud.py +++ b/tests/align_data/articles/test_google_cloud.py @@ -78,7 +78,7 @@ def test_parse_grobid(): 'authors': ['Cullen Oâ\x80\x99Keefe'], 'text': 'This is the contents', 'title': 'The title!!', - 'data_source': 'xml', + 'source_type': 'xml', } @@ -100,7 +100,7 @@ def test_parse_grobid_no_body(): """ - assert parse_grobid(xml) == {'error': 'No contents in XML file', 'data_source': 'xml'} + assert parse_grobid(xml) == {'error': 'No contents in XML file', 'source_type': 'xml'} @pytest.mark.parametrize('header, expected', ( @@ -160,7 +160,7 @@ def test_extract_gdrive_contents_ebook(header): assert extract_gdrive_contents(url) == { 'downloaded_from': 'google drive', 'source_url': 'https://drive.google.com/file/d/1OrKZlksba2a8gKa5bAQfP2qF717O_57I/view?usp=sharing', - 'data_source': 'ebook', + 'source_type': 'ebook', } @@ -185,7 +185,7 @@ def test_extract_gdrive_contents_html(): 'downloaded_from': 'google 
drive', 'source_url': 'https://drive.google.com/file/d/1OrKZlksba2a8gKa5bAQfP2qF717O_57I/view?usp=sharing', 'text': 'bla bla', - 'data_source': 'html', + 'source_type': 'html', } @@ -207,7 +207,7 @@ def test_extract_gdrive_contents_xml(): 'source_url': 'https://drive.google.com/file/d/1OrKZlksba2a8gKa5bAQfP2qF717O_57I/view?usp=sharing', 'text': 'This is the contents', 'title': 'The title!!', - 'data_source': 'xml', + 'source_type': 'xml', } @@ -238,7 +238,7 @@ def fetcher(link, *args, **kwargs): 'source_url': 'https://drive.google.com/file/d/1OrKZlksba2a8gKa5bAQfP2qF717O_57I/view?usp=sharing', 'text': 'This is the contents', 'title': 'The title!!', - 'data_source': 'xml', + 'source_type': 'xml', } diff --git a/tests/align_data/test_arxiv.py b/tests/align_data/test_arxiv.py index 5817fd2c..898fef70 100644 --- a/tests/align_data/test_arxiv.py +++ b/tests/align_data/test_arxiv.py @@ -1,7 +1,5 @@ -from datetime import datetime -from unittest.mock import patch, Mock import pytest -from align_data.sources.arxiv_papers.arxiv_papers import ArxivPapers +from align_data.sources.arxiv_papers.arxiv_papers import get_id, canonical_url, get_version @pytest.mark.parametrize( @@ -13,55 +11,28 @@ ), ) def test_get_id(url, expected): - dataset = ArxivPapers(name="asd", spreadsheet_id="ad", sheet_id="da") - assert dataset.get_id(Mock(url="https://arxiv.org/abs/2001.11038")) == "2001.11038" + assert get_id("https://arxiv.org/abs/2001.11038") == "2001.11038" -def test_process_entry(): - dataset = ArxivPapers(name="asd", spreadsheet_id="ad", sheet_id="da") - item = Mock( - title="this is the title", - url="https://arxiv.org/abs/2001.11038", - authors="", - date_published="2020-01-29", - ) - contents = { - "text": "this is the text", - "date_published": "December 12, 2021", - "authors": ["mr blobby"], - "data_source": "html", - } - metadata = Mock( - summary="abstract bla bla", - comment="no comment", - categories="wut", - updated="2023-01-01", - authors=[], - doi="123", - journal_ref="sdf", - primary_category="cat", - ) - arxiv = Mock() - arxiv.Search.return_value.results.return_value = iter([metadata]) +@pytest.mark.parametrize('url, expected', ( + ("http://bla.bla", "http://bla.bla"), + ("http://arxiv.org/abs/2001.11038", "https://arxiv.org/abs/2001.11038"), + ("https://arxiv.org/abs/2001.11038", "https://arxiv.org/abs/2001.11038"), + ("https://arxiv.org/abs/2001.11038/", "https://arxiv.org/abs/2001.11038"), + ("https://arxiv.org/pdf/2001.11038", "https://arxiv.org/abs/2001.11038"), + ("https://arxiv.org/pdf/2001.11038.pdf", "https://arxiv.org/abs/2001.11038"), + ("https://arxiv.org/pdf/2001.11038v3.pdf", "https://arxiv.org/abs/2001.11038"), + ("https://arxiv.org/abs/math/2001.11038", "https://arxiv.org/abs/math/2001.11038"), +)) +def test_canonical_url(url, expected): + assert canonical_url(url) == expected - with patch( - "align_data.arxiv_papers.arxiv_papers.parse_vanity", return_value=contents - ): - with patch("align_data.arxiv_papers.arxiv_papers.arxiv", arxiv): - assert dataset.process_entry(item).to_dict() == { - "author_comment": "no comment", - "authors": ["mr blobby"], - "categories": "wut", - "data_last_modified": "2023-01-01", - "date_published": "2020-01-29T00:00:00Z", - "doi": "123", - "id": None, - "journal_ref": "sdf", - "primary_category": "cat", - "source": "asd", - "source_type": "html", - "summaries": ["abstract bla bla"], - "text": "this is the text", - "title": "this is the title", - "url": "https://arxiv.org/abs/2001.11038", - } + +@pytest.mark.parametrize('id, version', ( + 
('123.123', None), + ('math/312', None), + ('3123123v1', '1'), + ('3123123v123', '123'), +)) +def test_get_version(id, version): + assert get_version(id) == version
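
For reference, a short usage sketch (not part of the patch itself) of the module-level helpers introduced in align_data/sources/arxiv_papers/arxiv_papers.py; the concrete values mirror tests/align_data/test_arxiv.py, and running fetch() assumes the package is importable and network access is available.

# Minimal sketch of how the refactored arxiv helpers are meant to be used;
# expected values below come from the tests in tests/align_data/test_arxiv.py.
from align_data.sources.arxiv_papers.arxiv_papers import (
    canonical_url,
    fetch,
    get_id,
    get_version,
)

# /abs/ and /pdf/ URLs, trailing slashes and version suffixes all collapse to
# one paper id and one canonical abstract URL.
assert get_id("https://arxiv.org/pdf/2001.11038v3.pdf") == "2001.11038"
assert canonical_url("http://arxiv.org/abs/2001.11038") == "https://arxiv.org/abs/2001.11038"
assert get_version("2001.11038v3") == "3"

if __name__ == "__main__":
    # fetch() queries the arxiv API for metadata and pulls full text from
    # arxiv-vanity, ar5iv, or the PDF as a fallback, so it needs network
    # access. Withdrawn papers come back with {"status": "Withdrawn"} and
    # no text.
    paper = fetch("https://arxiv.org/abs/2001.11038")
    print(paper["url"], paper.get("source_type"), paper.get("status"))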