Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check all available checksum algorithm in DataVerse registry population #437

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 41 additions & 4 deletions pooch/downloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -1157,7 +1157,44 @@
The pooch instance that the registry will be added to.
"""

for filedata in self.api_response.json()["data"]["latestVersion"]["files"]:
pooch.registry[filedata["dataFile"]["filename"]] = (
f"md5:{filedata['dataFile']['md5']}"
)
for file in self.api_response.json()["data"]["latestVersion"]["files"]:
filedata = file["dataFile"]
# Support old API: algorithm listed in the dataFile key
algorithms = [
k for k in {"md5", "sha1", "sha256", "sha512"} if k in filedata.keys()
]
if algorithms:
(algorithm,) = algorithms
pooch.registry[filedata["filename"]] = (
f"{algorithm}:{filedata[algorithm]}"
)

# Support new API
elif "checksum" in filedata.keys():
algorithm = self._parse_hashing_algorithm(filedata["checksum"]["type"])
pooch.registry[filedata["filename"]] = (
f"{algorithm}:{filedata['checksum']['value']}"
)

else:
raise ValueError(

Check warning on line 1180 in pooch/downloaders.py

View check run for this annotation

Codecov / codecov/patch

pooch/downloaders.py#L1180

Added line #L1180 was not covered by tests
f"Checksum for file '{filedata['filename']}'"
" not found in the DataVerse API response."
)

def _parse_hashing_algorithm(self, algorithm):
"""
Parse hashing algorithm in Dataverse API responses.

Parse the algorithms (MD5, SHA-1, SHA-256, SHA-512, etc.) present under
the "checksum" key in Dataverse API responses to the corresponding ones
supported by Pooch.
"""
algorithm = algorithm.lower()
if algorithm == "sha-1":
return "sha1"

Check warning on line 1195 in pooch/downloaders.py

View check run for this annotation

Codecov / codecov/patch

pooch/downloaders.py#L1195

Added line #L1195 was not covered by tests
if algorithm == "sha-256":
return "sha256"

Check warning on line 1197 in pooch/downloaders.py

View check run for this annotation

Codecov / codecov/patch

pooch/downloaders.py#L1197

Added line #L1197 was not covered by tests
if algorithm == "sha-512":
return "sha512"

Check warning on line 1199 in pooch/downloaders.py

View check run for this annotation

Codecov / codecov/patch

pooch/downloaders.py#L1199

Added line #L1199 was not covered by tests
return algorithm
144 changes: 144 additions & 0 deletions pooch/tests/test_downloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -543,3 +543,147 @@ def test_populate_registry(self, httpserver, tmp_path, api_response):
# Populate registry
downloader.populate_registry(puppy)
assert puppy.registry == {self.file_name: f"md5:{self.file_checksum}"}


class MockOldResponse:
    """
    Mock request response to test checksum algorithm in DataverseRepository.

    Despite its name, the mock can imitate both layouts of the API response:
    the old one (checksum stored under a key named after the algorithm) and
    the new one (checksum stored under a ``"checksum"`` key).
    """

    status_code = 200

    # Checksum type names as they are spelled under the "checksum" key of the
    # new API (hyphenated for the SHA family), keyed by the hyphen-free
    # lowercase algorithm name passed to the constructor.
    _checksum_types = {
        "md5": "MD5",
        "sha1": "SHA-1",
        "sha256": "SHA-256",
        "sha512": "SHA-512",
    }

    def __init__(self, algorithm: str, checksum_key=False):
        """
        Parameters
        ----------
        algorithm : str
            Hashing algorithm that will be used in the fake response.
        checksum_key : bool, optional
            Whether to add the 'checksum' key in the response (new API), or
            use the hashing algorithm type as key (old API).

        Notes
        -----

        Old API response:

        .. code::

            {
                ...
                "data": {
                    "latestVersion": {
                        "files": [
                            {
                                "label": "foobar.txt",
                                "dataFile": {
                                    ...
                                    "id": 12345,
                                    "filename": "foobar.txt",
                                    "md5": "0123456789abcdef",
                                }
                            }
                            ...
                        ]
                    }
                }
            }


        New API response:

        .. code::

            {
                ...
                "data": {
                    "latestVersion": {
                        "files": [
                            {
                                "label": "foobar.txt",
                                "dataFile": {
                                    ...
                                    "id": 12345,
                                    "filename": "foobar.txt",
                                    "checksum": {
                                        "type": "MD5",
                                        "value": "0123456789abcdef"
                                    }
                                }
                            }
                            ...
                        ]
                    }
                }
            }
        """

        self._algorithm = algorithm
        self._checksum_key = checksum_key

    @property
    def _file(self) -> dict:
        """
        Build the single fake file entry listed in the response.
        """
        checksum = "0123456789abcdef"
        data_file = {
            "id": 12345,
            "filename": "foobar.txt",
        }
        if self._checksum_key:
            # New API: report the algorithm with its hyphenated spelling
            # ("SHA-1", "SHA-256", ...) rather than a plain upper-cased
            # "SHA1"/"SHA256", so the hyphen handling of the downloader is
            # actually exercised. Unknown names fall back to upper-casing.
            checksum_type = self._checksum_types.get(
                self._algorithm.lower(), self._algorithm.upper()
            )
            data_file["checksum"] = {
                "type": checksum_type,
                "value": checksum,
            }
        else:
            # Old API: the algorithm name itself is the key.
            data_file[self._algorithm] = checksum
        return {"label": "foobar.txt", "dataFile": data_file}

    def json(self) -> dict:
        """
        Return a dictionary with a fake response.
        """
        files = [self._file]
        response = {"data": {"latestVersion": {"files": files}}}
        return response


class MockDataverse(DataverseRepository):
    """
    DataverseRepository stand-in whose API response is injected directly.

    Used to test checksum algorithms in DataverseRepository with a canned
    response object supplied by the test.
    """

    def __init__(self, response):
        """
        Parameters
        ----------
        response : object
            Fake response stored on the instance as ``_api_response``.
            NOTE(review): presumably this is what the parent's
            ``api_response`` accessor returns -- confirm against
            DataverseRepository.
        """
        # The parent is initialised without a DOI or an archive URL.
        super().__init__(doi=None, archive_url=None)
        self._api_response = response


class TestDataversePopulateRegistry:
    """
    Check registry population for each checksum algorithm and API layout.
    """

    @pytest.mark.parametrize("checksum_key", (False, True), ids=["old_api", "new_api"])
    @pytest.mark.parametrize("algorithm", ["md5", "sha1", "sha256", "sha512"])
    def test_populate_registry(self, tmp_path, algorithm, checksum_key):
        """
        Populate a Pooch registry from a mocked Dataverse response.

        Runs once per algorithm and per response layout (old and new API)
        and checks that the single mocked file ends up in the registry with
        the expected ``<algorithm>:<checksum>`` entry.
        """
        repository = MockDataverse(MockOldResponse(algorithm, checksum_key))
        pup = Pooch(path=tmp_path, base_url="https://foo.bar/")
        repository.populate_registry(pup)
        expected_registry = {"foobar.txt": f"{algorithm}:0123456789abcdef"}
        assert pup.registry == expected_registry
Loading