From 047ef0f61301cb0edd9bbb33058e9d0ec1215f8c Mon Sep 17 00:00:00 2001
From: Daniel O'Connell
Date: Mon, 21 Aug 2023 11:20:51 +0200
Subject: [PATCH 1/2] filter out empty values when merging dicts

---
 align_data/sources/arxiv_papers.py | 13 ++++++++---
 tests/align_data/test_arxiv.py     | 37 +++++++++++++++++++++++++++++-
 2 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/align_data/sources/arxiv_papers.py b/align_data/sources/arxiv_papers.py
index 85c3a152..2b98223f 100644
--- a/align_data/sources/arxiv_papers.py
+++ b/align_data/sources/arxiv_papers.py
@@ -9,6 +9,13 @@
 logger = logging.getLogger(__name__)
 
 
+def merge_dicts(*dicts):
+    final = {}
+    for d in dicts:
+        final = dict(final, **{k: v for k, v in d.items() if v})
+    return final
+
+
 def get_arxiv_metadata(paper_id) -> arxiv.Result:
     """
     Get metadata from arxiv
@@ -59,7 +66,7 @@ def add_metadata(data, paper_id):
     metadata = get_arxiv_metadata(paper_id)
     if not metadata:
         return {}
-    return dict({
+    return merge_dicts({
         "authors": metadata.authors,
         "title": metadata.title,
         "date_published": metadata.published,
@@ -71,7 +78,7 @@ def add_metadata(data, paper_id):
         "primary_category": metadata.primary_category,
         "categories": metadata.categories,
         "version": get_version(metadata.get_short_id()),
-    }, **data)
+    }, data)
 
 
 def fetch_arxiv(url) -> Dict:
@@ -91,4 +98,4 @@ def fetch_arxiv(url) -> Dict:
     authors = data.get('authors') or paper.get("authors") or []
     data['authors'] = [str(a).strip() for a in authors]
 
-    return dict(data, **paper)
+    return merge_dicts(data, paper)
diff --git a/tests/align_data/test_arxiv.py b/tests/align_data/test_arxiv.py
index d5bf1c8e..25c40dbc 100644
--- a/tests/align_data/test_arxiv.py
+++ b/tests/align_data/test_arxiv.py
@@ -1,5 +1,5 @@
 import pytest
-from align_data.sources.arxiv_papers import get_id, canonical_url, get_version
+from align_data.sources.arxiv_papers import get_id, canonical_url, get_version, merge_dicts
 
 
 @pytest.mark.parametrize(
@@ -36,3 +36,38 @@ def test_canonical_url(url, expected):
 ))
 def test_get_version(id, version):
     assert get_version(id) == version
+
+
+def test_merge_dicts_no_args():
+    """Test merge_dicts function with no arguments."""
+    result = merge_dicts()
+    assert result == {}
+
+
+def test_merge_dicts_single_dict():
+    """Test merge_dicts function with a single dictionary."""
+    result = merge_dicts({'a': 1, 'b': 2})
+    assert result == {'a': 1, 'b': 2}
+
+
+def test_merge_dicts_dicts_with_no_overlap():
+    """Test merge_dicts function with multiple dictionaries with no overlapping keys."""
+    result = merge_dicts({'a': 1}, {'b': 2}, {'c': 3})
+    assert result == {'a': 1, 'b': 2, 'c': 3}
+
+
+def test_merge_dicts_dicts_with_overlap():
+    """Test merge_dicts function with multiple dictionaries with overlapping keys."""
+    result = merge_dicts({'a': 1, 'b': 2}, {'b': 3, 'c': 4}, {'c': 5, 'd': 6})
+    assert result == {'a': 1, 'b': 3, 'c': 5, 'd': 6}
+
+
+@pytest.mark.parametrize("input_dicts, expected", [
+    ([{'a': 1, 'b': None}, {'b': 3}], {'a': 1, 'b': 3}),
+    ([{'a': 0, 'b': 2}, {'b': None}], {'b': 2}),
+    ([{'a': ''}, {'b': 'test'}], {'b': 'test'}),
+])
+def test_merge_dicts_with_none_or_falsey_values(input_dicts, expected):
+    """Test merge_dicts function with dictionaries containing None or falsey values."""
+    result = merge_dicts(*input_dicts)
+    assert result == expected

From 48e89c535456d0a8e5841e38f643b2bac00fc324 Mon Sep 17 00:00:00 2001
From: Daniel O'Connell
Date: Mon, 21 Aug 2023 11:35:07 +0200
Subject: [PATCH 2/2] handle invalid dates in arxiv vanity

---
 align_data/sources/articles/pdf.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/align_data/sources/articles/pdf.py b/align_data/sources/articles/pdf.py
index 205802f9..b0e31951 100644
--- a/align_data/sources/articles/pdf.py
+++ b/align_data/sources/articles/pdf.py
@@ -2,6 +2,7 @@
 import logging
 from urllib.parse import urlparse
 from typing import Dict, Any
+from dateutil.parser import ParserError, parse
 
 import requests
 from PyPDF2 import PdfReader
@@ -155,7 +156,10 @@ def get_first_child(item):
     ]
 
     if date_published := contents.select_one("div.ltx_dates"):
-        date_published = date_published.text.strip("()")
+        try:
+            date_published = parse(date_published.text.strip("()"))
+        except ParserError:
+            "If the date couldn't be parsed, hope that later phases will be more successful"
 
     text = "\n\n".join(
         MarkdownConverter().convert_soup(elem).strip()
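
A minimal sketch (not part of the patches) of the merge semantics the new tests in PATCH 1/2 pin down; the metadata/scraped values below are hypothetical, for illustration only:

    # merge_dicts as added in PATCH 1/2: later dicts win, but falsy values
    # (None, '', 0, [], ...) are skipped, so an empty scraped field can no
    # longer clobber a field the arxiv metadata already filled in.
    def merge_dicts(*dicts):
        final = {}
        for d in dicts:
            final = dict(final, **{k: v for k, v in d.items() if v})
        return final

    # Hypothetical example values.
    metadata = {"title": "Some paper", "abstract": "Real abstract", "version": "2"}
    scraped = {"title": "Some paper (v2)", "abstract": "", "authors": None}

    print(merge_dicts(metadata, scraped))
    # {'title': 'Some paper (v2)', 'abstract': 'Real abstract', 'version': '2'}

    # The previous dict(metadata, **scraped) style merge kept abstract='' and
    # authors=None, overwriting values the arxiv metadata already provided.
    print(dict(metadata, **scraped))
    # {'title': 'Some paper (v2)', 'abstract': '', 'version': '2', 'authors': None}

Similarly, a small sketch of the fallback PATCH 2/2 relies on: dateutil raises ParserError for strings it cannot interpret, so an unparseable arxiv-vanity date block is simply left for later processing (the input strings here are made up):

    from dateutil.parser import ParserError, parse

    for raw in ["21 Aug 2023", "(no date given)"]:
        try:
            # The first string parses to 2023-08-21 00:00:00.
            print(parse(raw))
        except ParserError:
            # The second has no recognisable date tokens, so it is skipped.
            print(f"could not parse {raw!r}; leaving the date for later phases")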