Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tidy up #144

Merged
merged 2 commits into from
Aug 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion align_data/sources/articles/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
from urllib.parse import urlparse
from typing import Dict, Any
from dateutil.parser import ParserError, parse

import requests
from PyPDF2 import PdfReader
Expand Down Expand Up @@ -155,7 +156,10 @@ def get_first_child(item):
]

if date_published := contents.select_one("div.ltx_dates"):
date_published = date_published.text.strip("()")
try:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just noticed while merging #97 into main that if there's a ParserError, the behaviour
"If the date couldn't be parsed, hope that later phases will be more successful"
is probably a typo. maybe you mean to log it or to return it as an error, but date_published probably ought to be None?

Copy link
Collaborator

@Thomas-Lemoine Thomas-Lemoine Aug 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe just pass and have that as a comment, too

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

    selected_date = contents.select_one("div.ltx_dates")
    try:
        date_published = parse(selected_date.text.strip("()")) if selected_date else None
    except ParserError:
        # "If the date couldn't be parsed, hope that later phases will be more successful"
        date_published = None

?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok I modified it there, #97

date_published = parse(date_published.text.strip("()"))
except ParserError:
"If the date couldn't be parsed, hope that later phases will be more successful"

text = "\n\n".join(
MarkdownConverter().convert_soup(elem).strip()
Expand Down
13 changes: 10 additions & 3 deletions align_data/sources/arxiv_papers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@
logger = logging.getLogger(__name__)


def merge_dicts(*dicts):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This works but what benefit does it have over the previous implementation?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dict1 = {'a': 1, 'b': None}
dict2 = {'a': 2, 'b': 3, 'c': 4}

dict(dict1, **dict2) == {'a': 2, 'b': 3, 'c': 4)
dict(dict2, **dict1) == {'a': 1, 'b': None, 'c': 4}

merge_dicts(dict1, dict2) == {'a': 2, 'b': 3, 'c': 4}
merge_dicts(dict2, dict1) == {'a': 1, 'b': 3, 'c': 4} # this is crucial one

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I see, thanks

final = {}
for d in dicts:
final = dict(final, **{k: v for k, v in d.items() if v})
return final


def get_arxiv_metadata(paper_id) -> arxiv.Result:
"""
Get metadata from arxiv
Expand Down Expand Up @@ -59,7 +66,7 @@ def add_metadata(data, paper_id):
metadata = get_arxiv_metadata(paper_id)
if not metadata:
return {}
return dict({
return merge_dicts({
"authors": metadata.authors,
"title": metadata.title,
"date_published": metadata.published,
Expand All @@ -71,7 +78,7 @@ def add_metadata(data, paper_id):
"primary_category": metadata.primary_category,
"categories": metadata.categories,
"version": get_version(metadata.get_short_id()),
}, **data)
}, data)


def fetch_arxiv(url) -> Dict:
Expand All @@ -91,4 +98,4 @@ def fetch_arxiv(url) -> Dict:
authors = data.get('authors') or paper.get("authors") or []
data['authors'] = [str(a).strip() for a in authors]

return dict(data, **paper)
return merge_dicts(data, paper)
37 changes: 36 additions & 1 deletion tests/align_data/test_arxiv.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pytest
from align_data.sources.arxiv_papers import get_id, canonical_url, get_version
from align_data.sources.arxiv_papers import get_id, canonical_url, get_version, merge_dicts


@pytest.mark.parametrize(
Expand Down Expand Up @@ -36,3 +36,38 @@ def test_canonical_url(url, expected):
))
def test_get_version(id, version):
assert get_version(id) == version


def test_merge_dicts_no_args():
"""Test merge_dicts function with no arguments."""
result = merge_dicts()
assert result == {}


def test_merge_dicts_single_dict():
"""Test merge_dicts function with a single dictionary."""
result = merge_dicts({'a': 1, 'b': 2})
assert result == {'a': 1, 'b': 2}


def test_merge_dicts_dicts_with_no_overlap():
"""Test merge_dicts function with multiple dictionaries with no overlapping keys."""
result = merge_dicts({'a': 1}, {'b': 2}, {'c': 3})
assert result == {'a': 1, 'b': 2, 'c': 3}


def test_merge_dicts_dicts_with_overlap():
"""Test merge_dicts function with multiple dictionaries with overlapping keys."""
result = merge_dicts({'a': 1, 'b': 2}, {'b': 3, 'c': 4}, {'c': 5, 'd': 6})
assert result == {'a': 1, 'b': 3, 'c': 5, 'd': 6}


@pytest.mark.parametrize("input_dicts, expected", [
([{'a': 1, 'b': None}, {'b': 3}], {'a': 1, 'b': 3}),
([{'a': 0, 'b': 2}, {'b': None}], {'b': 2}),
([{'a': ''}, {'b': 'test'}], {'b': 'test'}),
])
def test_merge_dicts_with_none_or_falsey_values(input_dicts, expected):
"""Test merge_dicts function with dictionaries containing None or falsey values."""
result = merge_dicts(*input_dicts)
assert result == expected