From 4c494eb309e0f9aafafd8854789ba782ac511147 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 18 Dec 2023 09:54:25 -0500 Subject: [PATCH 01/16] Use dandischema 0.9 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index afb287298..122340fe7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,7 +33,7 @@ install_requires = bidsschematools ~= 0.7.0 click >= 7.1 click-didyoumean - dandischema ~= 0.8.0 + dandischema ~= 0.9.0 etelemetry >= 0.2.2 fasteners fscacher >= 0.3.0 From 28513d8fb3a0f21328415fc3536bfdb0c391551a Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 18 Dec 2023 09:55:06 -0500 Subject: [PATCH 02/16] Update pydantic requirement to ~= 2.0 --- setup.cfg | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 122340fe7..34f7e7637 100644 --- a/setup.cfg +++ b/setup.cfg @@ -46,7 +46,7 @@ install_requires = packaging platformdirs pycryptodomex # for EncryptedKeyring backend in keyrings.alt - pydantic >= 1.9.0 + pydantic ~= 2.0 pynwb >= 1.0.3,!=1.1.0,!=2.3.0 nwbinspector >= 0.4.28,!=0.4.32 pyout >=0.5, !=0.6.0 @@ -218,4 +218,3 @@ ignore_missing_imports = True [pydantic-mypy] init_forbid_extra = True -warn_untypes_fields = True From 5f46be39b14f79a136b6c78487c93e0694def6d5 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 18 Dec 2023 16:11:03 -0500 Subject: [PATCH 03/16] Update for Pydantic v2 --- dandi/cli/cmd_download.py | 6 +- dandi/cli/cmd_ls.py | 6 +- dandi/cli/cmd_service_scripts.py | 2 +- .../update_dandiset_from_doi/biorxiv.json | 2 +- .../data/update_dandiset_from_doi/elife.json | 2 +- .../update_dandiset_from_doi/jneurosci.json | 2 +- .../data/update_dandiset_from_doi/nature.json | 2 +- .../data/update_dandiset_from_doi/neuron.json | 2 +- dandi/dandiapi.py | 73 +++++++++---------- dandi/dandiarchive.py | 7 +- dandi/files/bases.py | 4 +- dandi/files/bids.py | 9 ++- dandi/metadata/core.py | 6 +- dandi/metadata/util.py | 10 +-- .../data/metadata/metadata2asset_simple1.json | 2 +- dandi/tests/test_dandiapi.py | 2 +- dandi/tests/test_metadata.py | 55 +++++++------- dandi/upload.py | 2 +- dandi/utils.py | 57 ++++++++++----- 19 files changed, 140 insertions(+), 111 deletions(-) diff --git a/dandi/cli/cmd_download.py b/dandi/cli/cmd_download.py index 7bd4bf76d..ee43459d6 100644 --- a/dandi/cli/cmd_download.py +++ b/dandi/cli/cmd_download.py @@ -9,7 +9,7 @@ from ..dandiarchive import _dandi_url_parser, parse_dandi_url from ..dandiset import Dandiset from ..download import DownloadExisting, DownloadFormat, PathType -from ..utils import get_instance +from ..utils import get_instance, joinurl # The use of f-strings apparently makes this not a proper docstring, and so @@ -131,9 +131,9 @@ def download( pass else: if instance.gui is not None: - url = [f"{instance.gui}/#/dandiset/{dandiset_id}/draft"] + url = [joinurl(instance.gui, f"/#/dandiset/{dandiset_id}/draft")] else: - url = [f"{instance.api}/dandisets/{dandiset_id}/"] + url = [joinurl(instance.api, f"/dandisets/{dandiset_id}/")] return download.download( url, diff --git a/dandi/cli/cmd_ls.py b/dandi/cli/cmd_ls.py index 72e933aad..eeb3dbecd 100644 --- a/dandi/cli/cmd_ls.py +++ b/dandi/cli/cmd_ls.py @@ -96,8 +96,8 @@ def ls( all_fields = tuple( sorted( set(common_fields) - | models.Dandiset.__fields__.keys() - | models.Asset.__fields__.keys() + | models.Dandiset.model_fields.keys() + | models.Asset.model_fields.keys() ) ) else: @@ -345,7 +345,7 @@ def fn(): path, schema_version=schema, 
digest=Digest.dandi_etag(digest), - ).json_dict() + ).model_dump(mode="json", exclude_none=True) else: if path.endswith(tuple(ZARR_EXTENSIONS)): if use_fake_digest: diff --git a/dandi/cli/cmd_service_scripts.py b/dandi/cli/cmd_service_scripts.py index e8a2c4b11..9e25b13db 100644 --- a/dandi/cli/cmd_service_scripts.py +++ b/dandi/cli/cmd_service_scripts.py @@ -104,7 +104,7 @@ def reextract_metadata(url: str, diff: bool, when: str) -> None: lgr.info("Extracting new metadata for asset") metadata = nwb2asset(asset.as_readable(), digest=digest) metadata.path = asset.path - mddict = metadata.json_dict() + mddict = metadata.model_dump(mode="json", exclude_none=True) if diff: oldmd = asset.get_raw_metadata() oldmd_str = yaml_dump(oldmd) diff --git a/dandi/cli/tests/data/update_dandiset_from_doi/biorxiv.json b/dandi/cli/tests/data/update_dandiset_from_doi/biorxiv.json index eb6a40f38..6fe83f612 100644 --- a/dandi/cli/tests/data/update_dandiset_from_doi/biorxiv.json +++ b/dandi/cli/tests/data/update_dandiset_from_doi/biorxiv.json @@ -386,7 +386,7 @@ "includeInCitation": true } ], - "dateCreated": "2023-04-25T16:28:26.500181+00:00", + "dateCreated": "2023-04-25T16:28:26.500181Z", "description": "Progress in science requires standardized assays whose results can be readily shared, compared, and reproduced across laboratories. Reproducibility, however, has been a concern in neuroscience, particularly for measurements of mouse behavior. Here we show that a standardized task to probe decision-making in mice produces reproducible results across multiple laboratories. We designed a task for head-fixed mice that combines established assays of perceptual and value-based decision making, and we standardized training protocol and experimental hardware, software, and procedures. We trained 140 mice across seven laboratories in three countries, and we collected 5 million mouse choices into a publicly available database. Learning speed was variable across mice and laboratories, but once training was complete there were no significant differences in behavior across laboratories. Mice in different laboratories adopted similar reliance on visual stimuli, on past successes and failures, and on estimates of stimulus prior probability to guide their choices. These results reveal that a complex mouse behavior can be successfully reproduced across multiple laboratories. They establish a standard for reproducible rodent behavior, and provide an unprecedented dataset and open-access tools to study decision-making in mice. More generally, they indicate a path towards achieving reproducibility in neuroscience through collaborative open-science approaches.", "assetsSummary": { "schemaKey": "AssetsSummary", diff --git a/dandi/cli/tests/data/update_dandiset_from_doi/elife.json b/dandi/cli/tests/data/update_dandiset_from_doi/elife.json index 7008fa0ab..54dbf581f 100644 --- a/dandi/cli/tests/data/update_dandiset_from_doi/elife.json +++ b/dandi/cli/tests/data/update_dandiset_from_doi/elife.json @@ -105,7 +105,7 @@ "includeInCitation": true } ], - "dateCreated": "2023-04-25T16:28:30.453019+00:00", + "dateCreated": "2023-04-25T16:28:30.453019Z", "description": "Proprioception, the sense of body position, movement, and associated forces, remains poorly understood, despite its critical role in movement. Most studies of area 2, a proprioceptive area of somatosensory cortex, have simply compared neurons\u2019 activities to the movement of the hand through space. 
Using motion tracking, we sought to elaborate this relationship by characterizing how area 2 activity relates to whole arm movements. We found that a whole-arm model, unlike classic models, successfully predicted how features of neural activity changed as monkeys reached to targets in two workspaces. However, when we then evaluated this whole-arm model across active and passive movements, we found that many neurons did not consistently represent the whole arm over both conditions. These results suggest that 1) neural activity in area 2 includes representation of the whole arm during reaching and 2) many of these neurons represented limb state differently during active and passive movements.", "assetsSummary": { "schemaKey": "AssetsSummary", diff --git a/dandi/cli/tests/data/update_dandiset_from_doi/jneurosci.json b/dandi/cli/tests/data/update_dandiset_from_doi/jneurosci.json index ebed2dcaf..14f081ebd 100644 --- a/dandi/cli/tests/data/update_dandiset_from_doi/jneurosci.json +++ b/dandi/cli/tests/data/update_dandiset_from_doi/jneurosci.json @@ -45,7 +45,7 @@ "includeInCitation": true } ], - "dateCreated": "2023-04-25T16:28:28.308094+00:00", + "dateCreated": "2023-04-25T16:28:28.308094Z", "description": "Reinforcement learning theory plays a key role in understanding the behavioral and neural mechanisms of choice behavior in animals and humans. Especially, intermediate variables of learning models estimated from behavioral data, such as the expectation of reward for each candidate choice (action value), have been used in searches for the neural correlates of computational elements in learning and decision making. The aims of the present study are as follows: (1) to test which computational model best captures the choice learning process in animals and (2) to elucidate how action values are represented in different parts of the corticobasal ganglia circuit. We compared different behavioral learning algorithms to predict the choice sequences generated by rats during a free-choice task and analyzed associated neural activity in the nucleus accumbens (NAc) and ventral pallidum (VP). The major findings of this study were as follows: (1) modified versions of an action\u2013value learning model captured a variety of choice strategies of rats, including win-stay\u2013lose-switch and persevering behavior, and predicted rats' choice sequences better than the best multistep Markov model; and (2) information about action values and future actions was coded in both the NAc and VP, but was less dominant than information about trial types, selected actions, and reward outcome. The results of our model-based analysis suggest that the primary role of the NAc and VP is to monitor information important for updating choice behaviors. Information represented in the NAc and VP might contribute to a choice mechanism that is situated elsewhere.", "assetsSummary": { "schemaKey": "AssetsSummary", diff --git a/dandi/cli/tests/data/update_dandiset_from_doi/nature.json b/dandi/cli/tests/data/update_dandiset_from_doi/nature.json index 17f456374..ce5449d09 100644 --- a/dandi/cli/tests/data/update_dandiset_from_doi/nature.json +++ b/dandi/cli/tests/data/update_dandiset_from_doi/nature.json @@ -46,7 +46,7 @@ "includeInCitation": true } ], - "dateCreated": "2023-04-25T16:28:31.601155+00:00", + "dateCreated": "2023-04-25T16:28:31.601155Z", "description": "AbstractSpatial cognition depends on an accurate representation of orientation within an environment. 
Head direction cells in distributed brain regions receive a range of sensory inputs, but visual input is particularly important for aligning their responses to environmental landmarks. To investigate how population-level heading responses are aligned to visual input, we recorded from retrosplenial cortex (RSC) of head-fixed mice in a moving environment using two-photon calcium imaging. We show that RSC neurons are tuned to the animal\u2019s relative orientation in the environment, even in the absence of head movement. Next, we found that RSC receives functionally distinct projections from visual and thalamic areas and contains several functional classes of neurons. While some functional classes mirror RSC inputs, a newly discovered class coregisters visual and thalamic signals. Finally, decoding analyses reveal unique contributions to heading from each class. Our results suggest an RSC circuit for anchoring heading representations to environmental visual landmarks.", "assetsSummary": { "schemaKey": "AssetsSummary", diff --git a/dandi/cli/tests/data/update_dandiset_from_doi/neuron.json b/dandi/cli/tests/data/update_dandiset_from_doi/neuron.json index 87264d4a1..82a81a7fa 100644 --- a/dandi/cli/tests/data/update_dandiset_from_doi/neuron.json +++ b/dandi/cli/tests/data/update_dandiset_from_doi/neuron.json @@ -45,7 +45,7 @@ "includeInCitation": true } ], - "dateCreated": "2023-04-25T16:28:29.373034+00:00", + "dateCreated": "2023-04-25T16:28:29.373034Z", "description": "A test Dandiset", "assetsSummary": { "schemaKey": "AssetsSummary", diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index d50b94827..5b00f743c 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -13,7 +13,7 @@ import re from time import sleep, time from types import TracebackType -from typing import TYPE_CHECKING, Any, ClassVar, Dict, FrozenSet, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional from urllib.parse import quote_plus, urlparse, urlunparse import click @@ -44,6 +44,7 @@ get_instance, is_interactive, is_page2_url, + joinurl, ) if TYPE_CHECKING: @@ -285,16 +286,12 @@ def request( def get_url(self, path: str) -> str: """ Append a slash-separated ``path`` to the instance's base URL. The two - components are separated by a single slash, and any trailing slashes - are removed. + components are separated by a single slash, removing any excess slashes + that would be present after naïve concatenation. If ``path`` is already an absolute URL, it is returned unchanged. """ - # Construct the url - if path.lower().startswith(("http://", "https://")): - return path - else: - return self.api_url.rstrip("/") + "/" + path.lstrip("/") + return joinurl(self.api_url, path) def get(self, path: str, **kwargs: Any) -> Any: """ @@ -614,7 +611,8 @@ def get_asset(self, asset_id: str) -> BaseRemoteAsset: return BaseRemoteAsset.from_base_data(self, info, metadata) -class APIBase(BaseModel): +# `arbitrary_types_allowed` is needed for `client: DandiAPIClient` +class APIBase(BaseModel, populate_by_name=True, arbitrary_types_allowed=True): """ Base class for API objects implemented in pydantic. @@ -622,21 +620,12 @@ class APIBase(BaseModel): detail; do not rely on it. 
""" - JSON_EXCLUDE: ClassVar[FrozenSet[str]] = frozenset(["client"]) - def json_dict(self) -> dict[str, Any]: """ Convert to a JSONable `dict`, omitting the ``client`` attribute and using the same field names as in the API """ - data = json.loads(self.json(exclude=self.JSON_EXCLUDE, by_alias=True)) - assert isinstance(data, dict) - return data - - class Config: - allow_population_by_field_name = True - # To allow `client: Session`: - arbitrary_types_allowed = True + return self.model_dump(mode="json", by_alias=True) class Version(APIBase): @@ -710,7 +699,7 @@ class RemoteDandisetData(APIBase): modified: datetime contact_person: str embargo_status: EmbargoStatus - most_recent_published_version: Optional[Version] + most_recent_published_version: Optional[Version] = None draft_version: Version @@ -752,7 +741,7 @@ def __init__( self._version = version self._data: RemoteDandisetData | None if data is not None: - self._data = RemoteDandisetData.parse_obj(data) + self._data = RemoteDandisetData.model_validate(data) else: self._data = None @@ -762,7 +751,7 @@ def __str__(self) -> str: def _get_data(self) -> RemoteDandisetData: if self._data is None: try: - self._data = RemoteDandisetData.parse_obj( + self._data = RemoteDandisetData.model_validate( self.client.get(f"/dandisets/{self.identifier}/") ) except HTTP404Error: @@ -875,9 +864,9 @@ def from_data(cls, client: DandiAPIClient, data: dict[str, Any]) -> RemoteDandis when acquiring data using means outside of this library. """ if data.get("most_recent_published_version") is not None: - version = Version.parse_obj(data["most_recent_published_version"]) + version = Version.model_validate(data["most_recent_published_version"]) else: - version = Version.parse_obj(data["draft_version"]) + version = Version.model_validate(data["draft_version"]) return cls( client=client, identifier=data["identifier"], version=version, data=data ) @@ -917,7 +906,7 @@ def get_versions(self, order: str | None = None) -> Iterator[Version]: for v in self.client.paginate( f"{self.api_path}versions/", params={"order": order} ): - yield Version.parse_obj(v) + yield Version.model_validate(v) except HTTP404Error: raise NotFoundError(f"No such Dandiset: {self.identifier!r}") @@ -932,7 +921,7 @@ def get_version(self, version_id: str) -> VersionInfo: `Version`. """ try: - return VersionInfo.parse_obj( + return VersionInfo.model_validate( self.client.get( f"/dandisets/{self.identifier}/versions/{version_id}/info/" ) @@ -978,7 +967,7 @@ def get_metadata(self) -> models.Dandiset: metadata. Consider using `get_raw_metadata()` instead in order to fetch unstructured, possibly-invalid metadata. 
""" - return models.Dandiset.parse_obj(self.get_raw_metadata()) + return models.Dandiset.model_validate(self.get_raw_metadata()) def get_raw_metadata(self) -> dict[str, Any]: """ @@ -996,7 +985,7 @@ def set_metadata(self, metadata: models.Dandiset) -> None: """ Set the metadata for this version of the Dandiset to the given value """ - self.set_raw_metadata(metadata.json_dict()) + self.set_raw_metadata(metadata.model_dump(mode="json", exclude_none=True)) def set_raw_metadata(self, metadata: dict[str, Any]) -> None: """ @@ -1049,7 +1038,7 @@ def publish(self, max_time: float = 120) -> RemoteDandiset: ) start = time() while time() - start < max_time: - v = Version.parse_obj(self.client.get(f"{draft_api_path}info/")) + v = Version.model_validate(self.client.get(f"{draft_api_path}info/")) if v.status is VersionStatus.PUBLISHED: break sleep(0.5) @@ -1273,7 +1262,7 @@ class BaseRemoteAsset(ABC, APIBase): #: The `DandiAPIClient` instance that returned this `BaseRemoteAsset` #: and which the latter will use for API requests - client: DandiAPIClient + client: DandiAPIClient = Field(exclude=True) #: The asset identifier identifier: str = Field(alias="asset_id") #: The asset's (forward-slash-separated) path @@ -1294,6 +1283,15 @@ def __init__(self, **data: Any) -> None: # type: ignore[no-redef] # underscores, so we have to do it ourselves. self._metadata = data.get("metadata", data.get("_metadata")) + def __eq__(self, other: Any) -> bool: + if type(self) is type(other): + # dict() includes fields with `exclude=True` (which are absent from + # the return value of `model_dump()`) but not private fields. We + # want to compare the former but not the latter. + return dict(self) == dict(other) + else: + return NotImplemented + def __str__(self) -> str: return f"{self.client._instance_id}:assets/{self.identifier}" @@ -1360,7 +1358,7 @@ def get_metadata(self) -> models.Asset: valid metadata. Consider using `get_raw_metadata()` instead in order to fetch unstructured, possibly-invalid metadata. """ - return models.Asset.parse_obj(self.get_raw_metadata()) + return models.Asset.model_validate(self.get_raw_metadata()) def get_raw_metadata(self) -> dict[str, Any]: """Fetch the metadata for the asset as an unprocessed `dict`""" @@ -1610,7 +1608,7 @@ def iterfiles(self, prefix: str | None = None) -> Iterator[RemoteZarrEntry]: for r in self.client.paginate( f"{self.client.api_url}/zarr/{self.zarr}/files", params={"prefix": prefix} ): - data = ZarrEntryServerData.parse_obj(r) + data = ZarrEntryServerData.model_validate(r) yield RemoteZarrEntry.from_server_data(self, data) def get_entry_by_path(self, path: str) -> RemoteZarrEntry: @@ -1667,13 +1665,12 @@ class RemoteAsset(BaseRemoteAsset): `RemoteDandiset`. """ - JSON_EXCLUDE = frozenset(["client", "dandiset_id", "version_id"]) - #: The identifier for the Dandiset to which the asset belongs - dandiset_id: str + dandiset_id: str = Field(exclude=True) + #: The identifier for the version of the Dandiset to which the asset #: belongs - version_id: str + version_id: str = Field(exclude=True) @classmethod def from_data( @@ -1738,7 +1735,9 @@ def set_metadata(self, metadata: models.Asset) -> None: Set the metadata for the asset to the given value and update the `RemoteAsset` in place. 
""" - return self.set_raw_metadata(metadata.json_dict()) + return self.set_raw_metadata( + metadata.model_dump(mode="json", exclude_none=True) + ) @abstractmethod def set_raw_metadata(self, metadata: dict[str, Any]) -> None: diff --git a/dandi/dandiarchive.py b/dandi/dandiarchive.py index 3a849bbe9..53c3506cb 100644 --- a/dandi/dandiarchive.py +++ b/dandi/dandiarchive.py @@ -37,7 +37,7 @@ from typing import Any from urllib.parse import unquote as urlunquote -from pydantic import AnyHttpUrl, parse_obj_as +from pydantic import AnyHttpUrl, TypeAdapter import requests from . import get_logger @@ -82,9 +82,8 @@ class ParsedDandiURL(ABC): def api_url(self) -> AnyHttpUrl: """The base URL of the Dandi API service, without a trailing slash""" # Kept for backwards compatibility - r = parse_obj_as(AnyHttpUrl, self.instance.api.rstrip("/")) - assert isinstance(r, AnyHttpUrl) - return r # type: ignore[no-any-return] + adapter = TypeAdapter(AnyHttpUrl) + return adapter.validate_python(self.instance.api.rstrip("/")) def get_client(self) -> DandiAPIClient: """ diff --git a/dandi/files/bases.py b/dandi/files/bases.py index c105f9bd6..563ed07de 100644 --- a/dandi/files/bases.py +++ b/dandi/files/bases.py @@ -97,7 +97,7 @@ def get_metadata( """Return the Dandiset metadata inside the file""" with open(self.filepath) as f: meta = yaml_load(f, typ="safe") - return DandisetMeta.unvalidated(**meta) + return DandisetMeta.model_construct(**meta) # TODO: @validate_cache.memoize_path def get_validation_errors( @@ -183,7 +183,7 @@ def get_validation_errors( ) try: asset = self.get_metadata(digest=self._DUMMY_DIGEST) - BareAsset(**asset.dict()) + BareAsset(**asset.model_dump()) except ValidationError as e: if devel_debug: raise diff --git a/dandi/files/bids.py b/dandi/files/bids.py index 0ab0784f0..8541b5030 100644 --- a/dandi/files/bids.py +++ b/dandi/files/bids.py @@ -94,7 +94,9 @@ def _validate(self) -> None: ) # Don't apply eta-reduction to the lambda, as mypy needs to be # assured that defaultdict's argument takes no parameters. - self._asset_metadata = defaultdict(lambda: BareAsset.unvalidated()) + self._asset_metadata = defaultdict( + lambda: BareAsset.model_construct() # type: ignore[call-arg] + ) for result in results: if result.id in BIDS_ASSET_ERRORS: assert result.path @@ -230,7 +232,10 @@ def get_metadata( bids_metadata = BIDSAsset.get_metadata(self, digest, ignore_errors) nwb_metadata = NWBAsset.get_metadata(self, digest, ignore_errors) return BareAsset( - **{**bids_metadata.dict(), **nwb_metadata.dict(exclude_none=True)} + **{ + **bids_metadata.model_dump(), + **nwb_metadata.model_dump(exclude_none=True), + } ) diff --git a/dandi/metadata/core.py b/dandi/metadata/core.py index 3e97dbc24..12a53264e 100644 --- a/dandi/metadata/core.py +++ b/dandi/metadata/core.py @@ -5,7 +5,7 @@ import re from dandischema import models -from pydantic import ByteSize, parse_obj_as +from pydantic import ByteSize from .util import extract_model, get_generator from .. 
import get_logger @@ -18,7 +18,7 @@ def get_default_metadata( path: str | Path | Readable, digest: Digest | None = None ) -> models.BareAsset: - metadata = models.BareAsset.unvalidated() + metadata = models.BareAsset.model_construct() # type: ignore[call-arg] start_time = end_time = datetime.now().astimezone() add_common_metadata(metadata, path, start_time, end_time, digest) return metadata @@ -56,7 +56,7 @@ def add_common_metadata( ) if m: size = int(m["size"]) - metadata.contentSize = parse_obj_as(ByteSize, size) + metadata.contentSize = ByteSize(size) if metadata.wasGeneratedBy is None: metadata.wasGeneratedBy = [] metadata.wasGeneratedBy.append(get_generator(start_time, end_time)) diff --git a/dandi/metadata/util.py b/dandi/metadata/util.py index cfc57bb49..b9cb33aa5 100644 --- a/dandi/metadata/util.py +++ b/dandi/metadata/util.py @@ -498,13 +498,13 @@ def extract_anatomy(metadata: dict) -> list[models.Anatomy] | None: def extract_model(modelcls: type[M], metadata: dict, **kwargs: Any) -> M: - m = modelcls.unvalidated() - for field in m.__fields__.keys(): + m = modelcls.model_construct() + for field in m.model_fields.keys(): value = kwargs.get(field, extract_field(field, metadata)) if value is not None: setattr(m, field, value) - # return modelcls(**m.dict()) - return m + # I can't figure out why mypy doesn't like this line: + return m # type: ignore[return-value] def extract_model_list( @@ -514,7 +514,7 @@ def func(metadata: dict) -> list[M]: m = extract_model( modelcls, metadata, **{id_field: metadata.get(id_source)}, **kwargs ) - if all(v is None for k, v in m.dict().items() if k != "schemaKey"): + if all(v is None for k, v in m.model_dump().items() if k != "schemaKey"): return [] else: return [m] diff --git a/dandi/tests/data/metadata/metadata2asset_simple1.json b/dandi/tests/data/metadata/metadata2asset_simple1.json index 9c742c1b9..04babc1a1 100644 --- a/dandi/tests/data/metadata/metadata2asset_simple1.json +++ b/dandi/tests/data/metadata/metadata2asset_simple1.json @@ -18,7 +18,7 @@ "identifier": "session_id1", "name": "session_id1", "description": "session_description1", - "startDate": "2017-04-15T12:00:00+00:00" + "startDate": "2017-04-15T12:00:00Z" } ], "contentSize": 69105, diff --git a/dandi/tests/test_dandiapi.py b/dandi/tests/test_dandiapi.py index 4d354b809..077f16ce7 100644 --- a/dandi/tests/test_dandiapi.py +++ b/dandi/tests/test_dandiapi.py @@ -496,7 +496,7 @@ def test_set_asset_metadata(text_dandiset: SampleDandiset) -> None: md = asset.get_metadata() md.blobDateModified = datetime(2038, 1, 19, 3, 14, 7, tzinfo=timezone.utc) asset.set_metadata(md) - assert asset.get_raw_metadata()["blobDateModified"] == "2038-01-19T03:14:07+00:00" + assert asset.get_raw_metadata()["blobDateModified"] == "2038-01-19T03:14:07Z" def test_remote_dandiset_json_dict(text_dandiset: SampleDandiset) -> None: diff --git a/dandi/tests/test_metadata.py b/dandi/tests/test_metadata.py index 4c0e3bf0b..07c19b24a 100644 --- a/dandi/tests/test_metadata.py +++ b/dandi/tests/test_metadata.py @@ -27,6 +27,7 @@ ) from dandischema.models import Dandiset as DandisetMeta from dateutil.tz import tzutc +from pydantic import ByteSize import pytest from semantic_version import Version @@ -397,7 +398,7 @@ def test_timedelta2duration(td: timedelta, duration: str) -> None: ], ) def test_prepare_metadata(filename: str, metadata: dict[str, Any]) -> None: - data = prepare_metadata(metadata).json_dict() + data = prepare_metadata(metadata).model_dump(mode="json", exclude_none=True) with (METADATA_DIR / filename).open() 
as fp: data_as_dict = json.load(fp) data_as_dict["schemaVersion"] = DANDI_SCHEMA_VERSION @@ -492,7 +493,7 @@ def test_parseobourl(url, value): @mark.skipif_no_network def test_species(): m = {"species": "http://purl.obolibrary.org/obo/NCBITaxon_28584"} - assert extract_species(m).json_dict() == { + assert extract_species(m).model_dump(mode="json", exclude_none=True) == { "identifier": "http://purl.obolibrary.org/obo/NCBITaxon_28584", "schemaKey": "SpeciesType", "name": "Drosophila suzukii", @@ -772,7 +773,7 @@ def test_species_map(): ], ) def test_ndtypes(ndtypes, asset_dict): - metadata = BareAsset.unvalidated( + metadata = BareAsset( contentSize=1, encodingFormat="application/x-nwb", digest={DigestType.dandi_etag: "0" * 32 + "-1"}, @@ -790,24 +791,26 @@ def test_ndtypes(ndtypes, asset_dict): @mark.skipif_no_network def test_nwb2asset(simple2_nwb: Path) -> None: - assert nwb2asset(simple2_nwb, digest=DUMMY_DANDI_ETAG) == BareAsset.unvalidated( + # Classes with ANY_AWARE_DATETIME fields need to be constructed with + # model_construct() + assert nwb2asset(simple2_nwb, digest=DUMMY_DANDI_ETAG) == BareAsset.model_construct( schemaKey="Asset", schemaVersion=DANDI_SCHEMA_VERSION, keywords=["keyword1", "keyword 2"], access=[ - AccessRequirements.unvalidated( + AccessRequirements( schemaKey="AccessRequirements", status=AccessType.OpenAccess ) ], wasGeneratedBy=[ - Session.unvalidated( + Session.model_construct( schemaKey="Session", identifier="session_id1", name="session_id1", description="session_description1", startDate=ANY_AWARE_DATETIME, ), - Activity.unvalidated( + Activity.model_construct( id=AnyFullmatch( r"urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}" ), @@ -817,7 +820,7 @@ def test_nwb2asset(simple2_nwb: Path) -> None: startDate=ANY_AWARE_DATETIME, endDate=ANY_AWARE_DATETIME, wasAssociatedWith=[ - Software.unvalidated( + Software( schemaKey="Software", identifier="RRID:SCR_019009", name="DANDI Command Line Interface", @@ -827,27 +830,27 @@ def test_nwb2asset(simple2_nwb: Path) -> None: ], ), ], - contentSize=19664, + contentSize=ByteSize(19664), encodingFormat="application/x-nwb", digest={DigestType.dandi_etag: "dddddddddddddddddddddddddddddddd-1"}, path=str(simple2_nwb), dateModified=ANY_AWARE_DATETIME, blobDateModified=ANY_AWARE_DATETIME, wasAttributedTo=[ - Participant.unvalidated( + Participant( identifier="mouse001", schemaKey="Participant", - age=PropertyValue.unvalidated( + age=PropertyValue( schemaKey="PropertyValue", unitText="ISO-8601 duration", value="P135DT43200S", - valueReference=PropertyValue.unvalidated( + valueReference=PropertyValue( schemaKey="PropertyValue", value=AgeReferenceType.BirthReference, ), ), - sex=SexType.unvalidated(schemaKey="SexType", name="Unknown"), - species=SpeciesType.unvalidated( + sex=SexType(schemaKey="SexType", name="Unknown"), + species=SpeciesType( schemaKey="SpeciesType", identifier="http://purl.obolibrary.org/obo/NCBITaxon_10090", name="Mus musculus - House mouse", @@ -867,24 +870,26 @@ def test_nwb2asset_remote_asset(nwb_dandiset: SampleDandiset) -> None: mtime = ensure_datetime(asset.get_raw_metadata()["blobDateModified"]) assert isinstance(asset, RemoteBlobAsset) r = asset.as_readable() - assert nwb2asset(r, digest=digest) == BareAsset.unvalidated( + # Classes with ANY_AWARE_DATETIME fields need to be constructed with + # model_construct() + assert nwb2asset(r, digest=digest) == BareAsset.model_construct( schemaKey="Asset", schemaVersion=DANDI_SCHEMA_VERSION, keywords=["keyword1", "keyword 2"], access=[ - 
AccessRequirements.unvalidated( + AccessRequirements( schemaKey="AccessRequirements", status=AccessType.OpenAccess ) ], wasGeneratedBy=[ - Session.unvalidated( + Session.model_construct( schemaKey="Session", identifier="session_id1", name="session_id1", description="session_description1", startDate=ANY_AWARE_DATETIME, ), - Activity.unvalidated( + Activity.model_construct( id=AnyFullmatch( r"urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}" ), @@ -894,7 +899,7 @@ def test_nwb2asset_remote_asset(nwb_dandiset: SampleDandiset) -> None: startDate=ANY_AWARE_DATETIME, endDate=ANY_AWARE_DATETIME, wasAssociatedWith=[ - Software.unvalidated( + Software( schemaKey="Software", identifier="RRID:SCR_019009", name="DANDI Command Line Interface", @@ -904,27 +909,27 @@ def test_nwb2asset_remote_asset(nwb_dandiset: SampleDandiset) -> None: ], ), ], - contentSize=asset.size, + contentSize=ByteSize(asset.size), encodingFormat="application/x-nwb", digest={DigestType.dandi_etag: digest.value}, path="sub-mouse001.nwb", dateModified=ANY_AWARE_DATETIME, blobDateModified=mtime, wasAttributedTo=[ - Participant.unvalidated( + Participant( identifier="mouse001", schemaKey="Participant", - age=PropertyValue.unvalidated( + age=PropertyValue( schemaKey="PropertyValue", unitText="ISO-8601 duration", value="P135DT43200S", - valueReference=PropertyValue.unvalidated( + valueReference=PropertyValue( schemaKey="PropertyValue", value=AgeReferenceType.BirthReference, ), ), - sex=SexType.unvalidated(schemaKey="SexType", name="Unknown"), - species=SpeciesType.unvalidated( + sex=SexType(schemaKey="SexType", name="Unknown"), + species=SpeciesType( schemaKey="SpeciesType", identifier="http://purl.obolibrary.org/obo/NCBITaxon_10090", name="Mus musculus - House mouse", diff --git a/dandi/upload.py b/dandi/upload.py index 1fec6e75c..c5f553514 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -345,7 +345,7 @@ def process_path(dfile: DandiFile) -> Iterator[dict]: try: metadata = dfile.get_metadata( digest=file_etag, ignore_errors=allow_any_path - ).json_dict() + ).model_dump(mode="json", exclude_none=True) except Exception as e: raise UploadError("failed to extract metadata: %s" % str(e)) diff --git a/dandi/utils.py b/dandi/utils.py index 1fada784d..9576f8e5b 100644 --- a/dandi/utils.py +++ b/dandi/utils.py @@ -26,7 +26,7 @@ from urllib.parse import parse_qs, urlparse, urlunparse import dateutil.parser -from pydantic import AnyHttpUrl, BaseModel, Field +from pydantic import BaseModel, Field import requests import ruamel.yaml from semantic_version import Version @@ -531,7 +531,9 @@ def delayed(*args, **kwargs): class ServiceURL(BaseModel): - url: AnyHttpUrl + # Don't use pydantic.AnyHttpUrl, as that adds a trailing slash, and so URLs + # retrieved for known instances won't match the known values + url: str class ServerServices(BaseModel): @@ -557,11 +559,11 @@ def get_instance(dandi_instance_id: str | DandiInstance) -> DandiInstance: instance = dandi_instance_id dandi_id = instance.name elif dandi_instance_id.lower().startswith(("http://", "https://")): - redirector_url = dandi_instance_id - dandi_id = known_instances_rev.get(redirector_url.rstrip("/")) + redirector_url = dandi_instance_id.rstrip("/") + dandi_id = known_instances_rev.get(redirector_url) if dandi_id is not None: instance = known_instances[dandi_id] - is_api = instance.api.rstrip("/") == redirector_url.rstrip("/") + is_api = instance.api.rstrip("/") == redirector_url else: instance = None is_api = False @@ -574,7 +576,7 @@ def 
get_instance(dandi_instance_id: str | DandiInstance) -> DandiInstance: assert instance is not None return _get_instance(instance.api.rstrip("/"), True, instance, dandi_id) else: - return _get_instance(redirector_url.rstrip("/"), is_api, instance, dandi_id) + return _get_instance(redirector_url, is_api, instance, dandi_id) @lru_cache @@ -583,13 +585,13 @@ def _get_instance( ) -> DandiInstance: try: if is_api: - r = requests.get(f"{url}/info/") + r = requests.get(joinurl(url, "/info/")) else: - r = requests.get(f"{url}/server-info") + r = requests.get(joinurl(url, "/server-info")) if r.status_code == 404: - r = requests.get(f"{url}/api/info/") + r = requests.get(joinurl(url, "/api/info/")) r.raise_for_status() - server_info = ServerInfo.parse_obj(r.json()) + server_info = ServerInfo.model_validate(r.json()) except Exception as e: lgr.warning("Request to %s failed (%s)", url, str(e)) if instance is not None: @@ -615,18 +617,23 @@ def _get_instance( raise BadCliVersionError(our_version, minversion, bad_versions) api_url = server_info.services.api.url if dandi_id is None: - dandi_id = api_url.host - assert dandi_id is not None - if api_url.port is not None: - if ":" in dandi_id: - dandi_id = f"[{dandi_id}]" - dandi_id += f":{api_url.port}" + # Don't use pydantic.AnyHttpUrl, as that sets the `port` attribute even + # if it's not present in the string. + bits = urlparse(api_url) + if bits.hostname is not None: + dandi_id = bits.hostname + if bits.port is not None: + if ":" in dandi_id: + dandi_id = f"[{dandi_id}]" + dandi_id += f":{bits.port}" + else: + dandi_id = api_url return DandiInstance( name=dandi_id, - gui=str(server_info.services.webui.url) + gui=server_info.services.webui.url if server_info.services.webui is not None else None, - api=str(api_url), + api=api_url, ) @@ -860,3 +867,17 @@ def post_upload_size_check(path: Path, pre_check_size: int, erroring: bool) -> N lgr.error(msg) else: raise RuntimeError(msg) + + +def joinurl(base: str, path: str) -> str: + """ + Append a slash-separated ``path`` to a base URL ``base``. The two + components are separated by a single slash, removing any excess slashes + that would be present after naïve concatenation. + + If ``path`` is already an absolute URL, it is returned unchanged. + """ + if path.lower().startswith(("http://", "https://")): + return path + else: + return base.rstrip("/") + "/" + path.lstrip("/") From 9ca1d309101bde5249d398308f67fa2874d804cb Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 12 Feb 2024 14:41:25 -0500 Subject: [PATCH 04/16] Accept both dandischema 0.9.x and 0.10.x --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 34f7e7637..f6ca8b1c8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,7 +33,7 @@ install_requires = bidsschematools ~= 0.7.0 click >= 7.1 click-didyoumean - dandischema ~= 0.9.0 + dandischema >= 0.9.0, < 0.11 etelemetry >= 0.2.2 fasteners fscacher >= 0.3.0 From 728882f13cd824254e77bdce61b65500957a6c28 Mon Sep 17 00:00:00 2001 From: "John T. 
Wodder II" Date: Thu, 22 Feb 2024 15:30:58 -0500 Subject: [PATCH 05/16] Adjust joinurl() docs --- dandi/utils.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/dandi/utils.py b/dandi/utils.py index 9576f8e5b..ec06d8ce7 100644 --- a/dandi/utils.py +++ b/dandi/utils.py @@ -630,9 +630,11 @@ def _get_instance( dandi_id = api_url return DandiInstance( name=dandi_id, - gui=server_info.services.webui.url - if server_info.services.webui is not None - else None, + gui=( + server_info.services.webui.url + if server_info.services.webui is not None + else None + ), api=api_url, ) @@ -871,11 +873,14 @@ def post_upload_size_check(path: Path, pre_check_size: int, erroring: bool) -> N def joinurl(base: str, path: str) -> str: """ - Append a slash-separated ``path`` to a base URL ``base``. The two + Append a slash-separated ``path`` to a base HTTP(S) URL ``base``. The two components are separated by a single slash, removing any excess slashes that would be present after naïve concatenation. - If ``path`` is already an absolute URL, it is returned unchanged. + If ``path`` is already an absolute HTTP(S) URL, it is returned unchanged. + + Note that this function differs from `urllib.parse.urljoin()` when the path + portion of ``base`` is nonempty and does not end in a slash. """ if path.lower().startswith(("http://", "https://")): return path From fc5c81302240cefe6efec2ed7f87d6232be3863f Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 26 Feb 2024 16:47:48 -0500 Subject: [PATCH 06/16] upload: Rename "upload" pyout column to "progress" --- .pre-commit-config.yaml | 2 +- dandi/files/bases.py | 2 +- dandi/files/zarr.py | 2 +- dandi/support/pyout.py | 2 +- dandi/upload.py | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6e3ed5c6e..ae903d142 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,7 +24,7 @@ repos: - id: codespell exclude: ^(dandi/_version\.py|dandi/due\.py|versioneer\.py)$ - repo: https://github.com/PyCQA/flake8 - rev: 4.0.1 + rev: 7.0.0 hooks: - id: flake8 exclude: ^(dandi/_version\.py|dandi/due\.py|versioneer\.py)$ diff --git a/dandi/files/bases.py b/dandi/files/bases.py index 563ed07de..751f6d27e 100644 --- a/dandi/files/bases.py +++ b/dandi/files/bases.py @@ -401,7 +401,7 @@ def iter_upload( bytes_uploaded += out_part["size"] yield { "status": "uploading", - "upload": 100 * bytes_uploaded / total_size, + "progress": 100 * bytes_uploaded / total_size, "current": bytes_uploaded, } parts_out.append(out_part) diff --git a/dandi/files/zarr.py b/dandi/files/zarr.py index 9b7e82cd1..029eecbc9 100644 --- a/dandi/files/zarr.py +++ b/dandi/files/zarr.py @@ -497,7 +497,7 @@ def mkzarr() -> str: bytes_uploaded += size yield { "status": "uploading", - "upload": 100 + "progress": 100 * bytes_uploaded / to_upload.total_size, "current": bytes_uploaded, diff --git a/dandi/support/pyout.py b/dandi/support/pyout.py index 25719ada7..7ceaf5680 100644 --- a/dandi/support/pyout.py +++ b/dandi/support/pyout.py @@ -160,7 +160,7 @@ def get_style(hide_if_missing=True): ), aggregate=counts, ), - "upload": progress_style, + "progress": progress_style, "done%": progress_style, "checksum": dict( align="center", diff --git a/dandi/upload.py b/dandi/upload.py index c5f553514..3a8be977f 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -146,13 +146,13 @@ def check_len(obj: io.IOBase, name: Any) -> None: raise RuntimeError( f"requests.utils.super_len() reported size of 0 for" 
f" {name!r}, but os.stat() reported size" - f" {stat_size} bytes {i+1} tries later" + f" {stat_size} bytes {i + 1} tries later" ) if fstat_size not in (None, 0): raise RuntimeError( f"requests.utils.super_len() reported size of 0 for" f" {name!r}, but os.fstat() reported size" - f" {fstat_size} bytes {i+1} tries later" + f" {fstat_size} bytes {i + 1} tries later" ) lgr.debug( "- Size of %r still appears to be 0 after 10 rounds of" @@ -403,9 +403,9 @@ def upload_agg(*ignored: Any) -> str: return "%s/s" % naturalsize(speed) pyout_style = pyouts.get_style(hide_if_missing=False) - pyout_style["upload"]["aggregate"] = upload_agg + pyout_style["progress"]["aggregate"] = upload_agg - rec_fields = ["path", "size", "errors", "upload", "status", "message"] + rec_fields = ["path", "size", "errors", "progress", "status", "message"] out = pyouts.LogSafeTabular( style=pyout_style, columns=rec_fields, max_workers=jobs or 5 ) From 5970a3bbaddcfd7e379ac91c4281bb95f73d32b9 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 26 Feb 2024 17:38:25 -0500 Subject: [PATCH 07/16] Report progress in deleting Zarr entries during upload --- dandi/files/zarr.py | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/dandi/files/zarr.py b/dandi/files/zarr.py index 029eecbc9..1c5140c08 100644 --- a/dandi/files/zarr.py +++ b/dandi/files/zarr.py @@ -19,6 +19,7 @@ from dandi import get_logger from dandi.consts import ( MAX_ZARR_DEPTH, + ZARR_DELETE_BATCH_SIZE, ZARR_MIME_TYPE, ZARR_UPLOAD_BATCH_SIZE, EmbargoStatus, @@ -442,7 +443,11 @@ def mkzarr() -> str: local_digest, ) if to_delete: - a.rmfiles(to_delete, reingest=False) + yield from _rmfiles( + asset=a, + entries=to_delete, + status="deleting conflicting remote files", + ) else: yield {"status": "traversing local Zarr"} for local_entry in self.iterfiles(): @@ -506,13 +511,16 @@ def mkzarr() -> str: lgr.debug("%s: All files uploaded", asset_path) old_zarr_files = list(old_zarr_entries.values()) if old_zarr_files: - yield {"status": "deleting extra remote files"} lgr.debug( "%s: Deleting %s in remote Zarr not present locally", asset_path, pluralize(len(old_zarr_files), "file"), ) - a.rmfiles(old_zarr_files, reingest=False) + yield from _rmfiles( + asset=a, + entries=old_zarr_files, + status="deleting extra remote files", + ) changed = True if changed: lgr.debug( @@ -533,9 +541,9 @@ def mkzarr() -> str: lgr.info( "%s: Asset checksum mismatch (local: %s;" " server: %s); redoing upload", + asset_path, our_checksum, server_checksum, - asset_path, ) yield {"status": "Checksum mismatch"} break @@ -677,3 +685,24 @@ def _cmp_digests( else: lgr.debug("%s: File %s already on server; skipping", asset_path, local_entry) return (local_entry, local_digest, False) + + +def _rmfiles( + asset: RemoteZarrAsset, entries: list[RemoteZarrEntry], status: str +) -> Iterator[dict]: + # Do the batching outside of the rmfiles() method so that we can report + # progress on the completion of each batch + yield { + "status": status, + "progress": 0, + "current": 0, + } + deleted = 0 + for ents in chunked(entries, ZARR_DELETE_BATCH_SIZE): + asset.rmfiles(ents, reingest=False) + deleted += len(ents) + yield { + "status": status, + "progress": deleted / len(entries) * 100, + "current": deleted, + } From c99385df1cf54fd219e767a186bfb58d227f3dc9 Mon Sep 17 00:00:00 2001 From: DANDI Bot Date: Thu, 29 Feb 2024 14:34:37 +0000 Subject: [PATCH 08/16] Update CHANGELOG.md [skip ci] --- CHANGELOG.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) 
diff --git a/CHANGELOG.md b/CHANGELOG.md index 69572ee93..2d8c02ec9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,21 @@ +# 0.60.0 (Thu Feb 29 2024) + +#### 🚀 Enhancement + +- Report progress in deleting Zarr entries during upload [#1412](https://github.com/dandi/dandi-cli/pull/1412) ([@jwodder](https://github.com/jwodder)) +- Update for Pydantic v2 [#1381](https://github.com/dandi/dandi-cli/pull/1381) ([@jwodder](https://github.com/jwodder)) + +#### 🏠 Internal + +- [gh-actions](deps): Bump codecov/codecov-action from 3 to 4 [#1402](https://github.com/dandi/dandi-cli/pull/1402) ([@dependabot[bot]](https://github.com/dependabot[bot])) + +#### Authors: 2 + +- [@dependabot[bot]](https://github.com/dependabot[bot]) +- John T. Wodder II ([@jwodder](https://github.com/jwodder)) + +--- + # 0.59.1 (Fri Feb 02 2024) #### 🏠 Internal From b4ccbeea72e6d13150b1ba7d5556d4c3a981d288 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Wed, 28 Feb 2024 12:02:52 -0500 Subject: [PATCH 09/16] Add arguments for API query parameters when fetching all Dandisets --- dandi/dandiapi.py | 56 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 5b00f743c..af35008ba 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -554,13 +554,63 @@ def get_dandiset( return d.for_version(version_id) return d - def get_dandisets(self) -> Iterator[RemoteDandiset]: + def get_dandisets( + self, + *, + draft: bool | None = None, + embargoed: bool | None = None, + empty: bool | None = None, + mine: bool | None = None, + order: str | None = None, + search: str | None = None, + ) -> Iterator[RemoteDandiset]: """ Returns a generator of all Dandisets on the server. For each Dandiset, the `RemoteDandiset`'s version is set to the most recent published version if there is one, otherwise to the draft version. - """ - for data in self.paginate("/dandisets/"): + + .. versionchanged:: 0.61.0 + + ``draft``, ``embargoed``, ``empty``, ``mine``, ``order``, and + ``search`` parameters added + + :param draft: + If true, Dandisets that have only draft versions (i.e., that + haven't yet been published) will be included in the results + (default true) + + :param embargoed: + If true, embargoed Dandisets will be included in the results + (default false) + + :param empty: + If true, empty Dandisets will be included in the results (default + true) + + :param mine: + If true, only Dandisets owned by the authenticated user will be + retrieved (default false) + + :param order: + The field to sort the results by. The accepted field names are + ``"id"``, ``"name"``, ``"modified"``, and ``"size"``. Prepend a + hyphen to the field name to reverse the sort order. + + :param search: + A search string to filter the returned Dandisets by. The string is + searched for in the metadata of Dandiset versions. + """ + for data in self.paginate( + "/dandisets/", + params={ + "draft": draft, + "embargoed": embargoed, + "empty": empty, + "ordering": order, + "search": search, + "user": "me" if mine else None, + }, + ): yield RemoteDandiset.from_data(self, data) def create_dandiset(self, name: str, metadata: dict[str, Any]) -> RemoteDandiset: From 939cbf119924f62e800d0fa6cd92467faf79154b Mon Sep 17 00:00:00 2001 From: "John T. 
Wodder II" Date: Thu, 29 Feb 2024 13:39:00 -0500 Subject: [PATCH 10/16] Add `embargo` option to `create_dandiset()` --- dandi/dandiapi.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index af35008ba..95960825e 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -613,10 +613,24 @@ def get_dandisets( ): yield RemoteDandiset.from_data(self, data) - def create_dandiset(self, name: str, metadata: dict[str, Any]) -> RemoteDandiset: - """Creates a Dandiset with the given name & metadata""" + def create_dandiset( + self, name: str, metadata: dict[str, Any], *, embargo: bool = False + ) -> RemoteDandiset: + """ + Creates a Dandiset with the given name & metadata. If ``embargo`` is + `True`, the resulting Dandiset will be embargoed. + + .. versionchanged:: 0.61.0 + + ``embargo`` argument added + """ return RemoteDandiset.from_data( - self, self.post("/dandisets/", json={"name": name, "metadata": metadata}) + self, + self.post( + "/dandisets/", + json={"name": name, "metadata": metadata}, + params={"embargo": "true" if embargo else "false"}, + ), ) def check_schema_version(self, schema_version: str | None = None) -> None: From 1ed5a5562bf0cde70507d9aa664b0807702c9220 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 29 Feb 2024 14:14:03 -0500 Subject: [PATCH 11/16] Add tests --- dandi/tests/fixtures.py | 3 ++- dandi/tests/test_dandiapi.py | 44 ++++++++++++++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/dandi/tests/fixtures.py b/dandi/tests/fixtures.py index 2981ddd62..a18123e15 100644 --- a/dandi/tests/fixtures.py +++ b/dandi/tests/fixtures.py @@ -520,7 +520,7 @@ class SampleDandisetFactory: local_dandi_api: DandiAPI tmp_path_factory: pytest.TempPathFactory - def mkdandiset(self, name: str) -> SampleDandiset: + def mkdandiset(self, name: str, embargo: bool = False) -> SampleDandiset: d = self.local_dandi_api.client.create_dandiset( name, # Minimal metadata needed to create a publishable Dandiset: @@ -539,6 +539,7 @@ def mkdandiset(self, name: str) -> SampleDandiset: } ], }, + embargo=embargo, ) dspath = self.tmp_path_factory.mktemp("dandiset") (dspath / dandiset_metadata_file).write_text(f"identifier: '{d.identifier}'\n") diff --git a/dandi/tests/test_dandiapi.py b/dandi/tests/test_dandiapi.py index 077f16ce7..38d558da7 100644 --- a/dandi/tests/test_dandiapi.py +++ b/dandi/tests/test_dandiapi.py @@ -16,7 +16,7 @@ import requests import responses -from .fixtures import DandiAPI, SampleDandiset +from .fixtures import DandiAPI, SampleDandiset, SampleDandisetFactory from .skip import mark from .. 
import dandiapi from ..consts import ( @@ -329,7 +329,47 @@ def test_check_schema_version_mismatch() -> None: def test_get_dandisets(text_dandiset: SampleDandiset) -> None: dandisets = list(text_dandiset.client.get_dandisets()) - assert sum(1 for d in dandisets if d.identifier == text_dandiset.dandiset_id) == 1 + assert text_dandiset.dandiset_id in [d.identifier for d in dandisets] + + +def test_get_embargoed_dandisets( + sample_dandiset_factory: SampleDandisetFactory, +) -> None: + ds = sample_dandiset_factory.mkdandiset("Embargoed Dandiset", embargo=True) + dandisets = list(ds.client.get_dandisets(embargoed=True)) + assert ds.dandiset_id in [d.identifier for d in dandisets] + dandisets = list(ds.client.get_dandisets(embargoed=False)) + assert ds.dandiset_id not in [d.identifier for d in dandisets] + dandisets = list(ds.client.get_dandisets(embargoed=None)) + assert ds.dandiset_id not in [d.identifier for d in dandisets] + + +def test_get_draft_dandisets(new_dandiset: SampleDandiset) -> None: + dandisets = list(new_dandiset.client.get_dandisets(draft=True)) + assert new_dandiset.dandiset_id in [d.identifier for d in dandisets] + dandisets = list(new_dandiset.client.get_dandisets(draft=False)) + assert new_dandiset.dandiset_id not in [d.identifier for d in dandisets] + dandisets = list(new_dandiset.client.get_dandisets(draft=None)) + assert new_dandiset.dandiset_id in [d.identifier for d in dandisets] + + +def test_get_empty_dandisets(new_dandiset: SampleDandiset) -> None: + dandisets = list(new_dandiset.client.get_dandisets(empty=True)) + assert new_dandiset.dandiset_id in [d.identifier for d in dandisets] + dandisets = list(new_dandiset.client.get_dandisets(empty=False)) + assert new_dandiset.dandiset_id not in [d.identifier for d in dandisets] + dandisets = list(new_dandiset.client.get_dandisets(empty=None)) + assert new_dandiset.dandiset_id in [d.identifier for d in dandisets] + + +def test_search_get_dandisets( + sample_dandiset_factory: SampleDandisetFactory, +) -> None: + ds = sample_dandiset_factory.mkdandiset("Unicorn Dandiset") + dandisets = list(ds.client.get_dandisets(search="Unicorn")) + assert ds.dandiset_id in [d.identifier for d in dandisets] + dandisets = list(ds.client.get_dandisets(search="Dragon")) + assert ds.dandiset_id not in [d.identifier for d in dandisets] def test_get_dandiset_lazy( From 94eae70205ed8c1c52f5138ab959f1afc6d3a37a Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 1 Mar 2024 12:04:14 -0500 Subject: [PATCH 12/16] Replace most uses of urllib with yarl --- dandi/dandiapi.py | 11 ++++++----- dandi/utils.py | 32 ++++++++++++++++++-------------- setup.cfg | 1 + 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 95960825e..a7169c3af 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -14,13 +14,13 @@ from time import sleep, time from types import TracebackType from typing import TYPE_CHECKING, Any, Dict, List, Optional -from urllib.parse import quote_plus, urlparse, urlunparse import click from dandischema import models from pydantic import BaseModel, Field, PrivateAttr import requests import tenacity +from yarl import URL from . 
import get_logger from .consts import ( @@ -1521,7 +1521,7 @@ def get_content_url( else: raise # reraise since we need to figure out how to handle such a case if strip_query: - url = urlunparse(urlparse(url)._replace(query="")) + url = str(URL(url).with_query(None)) return url def get_download_file_iter( @@ -1970,9 +1970,10 @@ def get_download_file_iter( Returns a function that when called (optionally with an offset into the file to start downloading at) returns a generator of chunks of the file """ - prefix = quote_plus(str(self)) - url = self.client.get_url( - f"/zarr/{self.zarr_id}/files?prefix={prefix}&download=true" + url = str( + URL(self.client.get_url(f"/zarr/{self.zarr_id}/files/")).with_query( + {"prefix": str(self), "download": "true"} + ) ) def downloader(start_at: int = 0) -> Iterator[bytes]: diff --git a/dandi/utils.py b/dandi/utils.py index ec06d8ce7..b7f9ed5b5 100644 --- a/dandi/utils.py +++ b/dandi/utils.py @@ -23,13 +23,14 @@ import traceback import types from typing import IO, Any, List, Optional, Protocol, TypeVar, Union -from urllib.parse import parse_qs, urlparse, urlunparse import dateutil.parser +from multidict import MultiDict # dependency of yarl from pydantic import BaseModel, Field import requests import ruamel.yaml from semantic_version import Version +from yarl import URL from . import __version__, get_logger from .consts import DandiInstance, known_instances, known_instances_rev @@ -567,8 +568,9 @@ def get_instance(dandi_instance_id: str | DandiInstance) -> DandiInstance: else: instance = None is_api = False - bits = urlparse(redirector_url) - redirector_url = urlunparse((bits[0], bits[1], "", "", "", "")) + redirector_url = str( + URL(redirector_url).with_path("").with_query(None).with_fragment(None) + ) else: dandi_id = dandi_instance_id instance = known_instances[dandi_id] @@ -619,13 +621,13 @@ def _get_instance( if dandi_id is None: # Don't use pydantic.AnyHttpUrl, as that sets the `port` attribute even # if it's not present in the string. - bits = urlparse(api_url) - if bits.hostname is not None: - dandi_id = bits.hostname - if bits.port is not None: + u = URL(api_url) + if u.host is not None: + dandi_id = u.host + if (port := u.explicit_port) is not None: if ":" in dandi_id: dandi_id = f"[{dandi_id}]" - dandi_id += f":{bits.port}" + dandi_id += f":{port}" else: dandi_id = api_url return DandiInstance( @@ -790,12 +792,14 @@ def is_page2_url(page1: str, page2: str) -> bool: Tests whether the URL ``page2`` is the same as ``page1`` but with the ``page`` query parameter set to ``2`` """ - bits1 = urlparse(page1) - params1 = parse_qs(bits1.query) - params1["page"] = ["2"] - bits2 = urlparse(page2) - params2 = parse_qs(bits2.query) - return (bits1[:3], params1, bits1.fragment) == (bits2[:3], params2, bits2.fragment) + url1 = URL(page1) + params1 = MultiDict(url1.query) + params1["page"] = "2" + url1 = url1.with_query(None) + url2 = URL(page2) + params2 = url2.query + url2 = url2.with_query(None) + return (url1, sorted(params1.items())) == (url2, sorted(params2.items())) def exclude_from_zarr(path: Path) -> bool: diff --git a/setup.cfg b/setup.cfg index f6ca8b1c8..0e0ebcd8b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -55,6 +55,7 @@ install_requires = ruamel.yaml >=0.15, <1 semantic-version tenacity + yarl ~= 1.9 zarr ~= 2.10 zarr_checksum ~= 0.4.0 zip_safe = False From 1610bf95011cf71c8ba71127ec04b6bc2c5ce775 Mon Sep 17 00:00:00 2001 From: "John T. 
Wodder II" Date: Fri, 1 Mar 2024 12:14:11 -0500 Subject: [PATCH 13/16] Use yarl in `is_same_url()` --- dandi/delete.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dandi/delete.py b/dandi/delete.py index cae325968..214fde211 100644 --- a/dandi/delete.py +++ b/dandi/delete.py @@ -6,6 +6,7 @@ from pathlib import Path import click +from yarl import URL from .consts import DRAFT, ZARR_EXTENSIONS, DandiInstance, dandiset_metadata_file from .dandiapi import DandiAPIClient, RemoteAsset, RemoteDandiset @@ -246,5 +247,8 @@ def find_local_asset(filepath: str) -> tuple[str, str]: def is_same_url(url1: str, url2: str) -> bool: - # TODO: Use a real URL library like furl, purl, or yarl - return url1.rstrip("/") == url2.rstrip("/") + u1 = URL(url1) + u1 = u1.with_path(u1.path.rstrip("/")) + u2 = URL(url2) + u2 = u2.with_path(u2.path.rstrip("/")) + return u1 == u2 From a62fe7df0a33399b46fe65919c7456eccf5c3aec Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 1 Mar 2024 12:40:52 -0500 Subject: [PATCH 14/16] Clean up URL parsing in `extract_species()` --- dandi/metadata/util.py | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/dandi/metadata/util.py b/dandi/metadata/util.py index b9cb33aa5..e517928e4 100644 --- a/dandi/metadata/util.py +++ b/dandi/metadata/util.py @@ -329,66 +329,68 @@ def extract_cellLine(metadata: dict) -> str | None: return None +SPECIES_URI_TEMPLATE = "http://purl.obolibrary.org/obo/NCBITaxon_{}" + # common_names, prefix, uri, name species_map = [ ( ["mouse"], "mus", - "http://purl.obolibrary.org/obo/NCBITaxon_10090", + SPECIES_URI_TEMPLATE.format("10090"), "Mus musculus - House mouse", ), ( ["human"], "homo", - "http://purl.obolibrary.org/obo/NCBITaxon_9606", + SPECIES_URI_TEMPLATE.format("9606"), "Homo sapiens - Human", ), ( ["rat", "norvegicus"], None, - "http://purl.obolibrary.org/obo/NCBITaxon_10116", + SPECIES_URI_TEMPLATE.format("10116"), "Rattus norvegicus - Norway rat", ), ( ["rattus rattus"], None, - "http://purl.obolibrary.org/obo/NCBITaxon_10117", + SPECIES_URI_TEMPLATE.format("10117"), "Rattus rattus - Black rat", ), ( ["mulatta", "rhesus"], None, - "http://purl.obolibrary.org/obo/NCBITaxon_9544", + SPECIES_URI_TEMPLATE.format("9544"), "Macaca mulatta - Rhesus monkey", ), ( ["jacchus"], None, - "http://purl.obolibrary.org/obo/NCBITaxon_9483", + SPECIES_URI_TEMPLATE.format("9483"), "Callithrix jacchus - Common marmoset", ), ( ["melanogaster", "fruit fly"], None, - "http://purl.obolibrary.org/obo/NCBITaxon_7227", + SPECIES_URI_TEMPLATE.format("7227"), "Drosophila melanogaster - Fruit fly", ), ( ["danio", "zebrafish", "zebra fish"], None, - "http://purl.obolibrary.org/obo/NCBITaxon_7955", + SPECIES_URI_TEMPLATE.format("7955"), "Danio rerio - Zebra fish", ), ( ["c. 
elegans", "caenorhabditis elegans"], "caenorhabditis", - "http://purl.obolibrary.org/obo/NCBITaxon_6239", + SPECIES_URI_TEMPLATE.format("6239"), "Caenorhabditis elegans", ), ( ["pig-tailed macaque", "pigtail monkey", "pigtail macaque"], None, - "http://purl.obolibrary.org/obo/NCBITaxon_9545", + SPECIES_URI_TEMPLATE.format("9545"), "Macaca nemestrina", ), ] @@ -434,14 +436,18 @@ def extract_species(metadata: dict) -> models.SpeciesType | None: value_orig = metadata.get("species", None) value_id = None if value_orig is not None and value_orig != "": - value = value_orig.lower().rstrip("/") - if value.startswith("http://purl.obolibrary.org/obo/NCBITaxon_".lower()): - for common_names, prefix, uri, name in species_map: - if value.split("//")[1] == uri.lower().rstrip("/").split("//")[1]: + if m := re.fullmatch( + r"https?://purl\.obolibrary\.org/obo/NCBITaxon_([0-9]+)/?", + value_orig, + flags=re.I, + ): + normed_value = SPECIES_URI_TEMPLATE.format(m[1]) + for _common_names, _prefix, uri, name in species_map: + if uri == normed_value: value_id = uri value = name break - if value_id is None: + else: value_id = value_orig lookup = ("rdfs:label", "oboInOwl:hasExactSynonym") try: @@ -457,9 +463,10 @@ def extract_species(metadata: dict) -> models.SpeciesType | None: [result[key] for key in lookup if key in result] ) else: + lower_value = value_orig.lower() for common_names, prefix, uri, name in species_map: - if any(key in value for key in common_names) or ( - prefix and value.startswith(prefix) + if any(key in lower_value for key in common_names) or ( + prefix is not None and lower_value.startswith(prefix) ): value_id = uri value = name From a6704745f3cb037112437b65d39521f01acfef84 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 1 Mar 2024 17:16:20 -0500 Subject: [PATCH 15/16] [DATALAD RUNCMD] Rename SPECIES_URI_TEMPLATE into NCBITAXON_URI_TEMPLATE since otherwise while looking at the use not clear what that index is. May be later it would be even some other level (but still the same template) than species so best to just mention that it is NCBITAXON URL === Do not change lines below === { "chain": [], "cmd": "git-sedi SPECIES_URI_TEMPLATE NCBITAXON_URI_TEMPLATE", "exit": 0, "extra_inputs": [], "inputs": [], "outputs": [], "pwd": "." 
} ^^^ Do not change lines above ^^^ --- dandi/metadata/util.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/dandi/metadata/util.py b/dandi/metadata/util.py index e517928e4..8849a9164 100644 --- a/dandi/metadata/util.py +++ b/dandi/metadata/util.py @@ -329,68 +329,68 @@ def extract_cellLine(metadata: dict) -> str | None: return None -SPECIES_URI_TEMPLATE = "http://purl.obolibrary.org/obo/NCBITaxon_{}" +NCBITAXON_URI_TEMPLATE = "http://purl.obolibrary.org/obo/NCBITaxon_{}" # common_names, prefix, uri, name species_map = [ ( ["mouse"], "mus", - SPECIES_URI_TEMPLATE.format("10090"), + NCBITAXON_URI_TEMPLATE.format("10090"), "Mus musculus - House mouse", ), ( ["human"], "homo", - SPECIES_URI_TEMPLATE.format("9606"), + NCBITAXON_URI_TEMPLATE.format("9606"), "Homo sapiens - Human", ), ( ["rat", "norvegicus"], None, - SPECIES_URI_TEMPLATE.format("10116"), + NCBITAXON_URI_TEMPLATE.format("10116"), "Rattus norvegicus - Norway rat", ), ( ["rattus rattus"], None, - SPECIES_URI_TEMPLATE.format("10117"), + NCBITAXON_URI_TEMPLATE.format("10117"), "Rattus rattus - Black rat", ), ( ["mulatta", "rhesus"], None, - SPECIES_URI_TEMPLATE.format("9544"), + NCBITAXON_URI_TEMPLATE.format("9544"), "Macaca mulatta - Rhesus monkey", ), ( ["jacchus"], None, - SPECIES_URI_TEMPLATE.format("9483"), + NCBITAXON_URI_TEMPLATE.format("9483"), "Callithrix jacchus - Common marmoset", ), ( ["melanogaster", "fruit fly"], None, - SPECIES_URI_TEMPLATE.format("7227"), + NCBITAXON_URI_TEMPLATE.format("7227"), "Drosophila melanogaster - Fruit fly", ), ( ["danio", "zebrafish", "zebra fish"], None, - SPECIES_URI_TEMPLATE.format("7955"), + NCBITAXON_URI_TEMPLATE.format("7955"), "Danio rerio - Zebra fish", ), ( ["c. elegans", "caenorhabditis elegans"], "caenorhabditis", - SPECIES_URI_TEMPLATE.format("6239"), + NCBITAXON_URI_TEMPLATE.format("6239"), "Caenorhabditis elegans", ), ( ["pig-tailed macaque", "pigtail monkey", "pigtail macaque"], None, - SPECIES_URI_TEMPLATE.format("9545"), + NCBITAXON_URI_TEMPLATE.format("9545"), "Macaca nemestrina", ), ] @@ -441,7 +441,7 @@ def extract_species(metadata: dict) -> models.SpeciesType | None: value_orig, flags=re.I, ): - normed_value = SPECIES_URI_TEMPLATE.format(m[1]) + normed_value = NCBITAXON_URI_TEMPLATE.format(m[1]) for _common_names, _prefix, uri, name in species_map: if uri == normed_value: value_id = uri From e58df6254f9767c355b69eb9d13e7ecd9e52b322 Mon Sep 17 00:00:00 2001 From: "John T. 
Wodder II" Date: Tue, 5 Mar 2024 07:42:07 -0500 Subject: [PATCH 16/16] Add tests for `is_same_url()` --- dandi/tests/test_delete.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/dandi/tests/test_delete.py b/dandi/tests/test_delete.py index 310b1205e..2b33367ad 100644 --- a/dandi/tests/test_delete.py +++ b/dandi/tests/test_delete.py @@ -8,7 +8,7 @@ from .fixtures import DandiAPI, SampleDandiset from ..consts import DRAFT, dandiset_metadata_file from ..dandiapi import RESTFullAPIClient -from ..delete import delete +from ..delete import delete, is_same_url from ..download import download from ..exceptions import NotFoundError from ..utils import list_paths @@ -439,3 +439,16 @@ def test_delete_zarr_path( assert list_paths(tmp_path) == [ tmp_path / zarr_dandiset.dandiset_id / "dandiset.yaml" ] + + +@pytest.mark.parametrize( + "url1,url2,r", + [ + ("https://example.com/api", "https://example.com/api/", True), + ("https://example.com/api", "http://example.com/api", False), + ("https://example.com/api", "HTTPS://EXAMPLE.COM/api", True), + ("https://example.珠宝/api", "https://example.xn--pbt977c/api", True), + ], +) +def test_is_same_url(url1: str, url2: str, r: bool) -> None: + assert is_same_url(url1, url2) is r
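
Notes on the URL handling introduced above, with minimal doctest-style sketches. The hosts, IDs, and query values in these examples are made up for illustration; only the yarl/multidict calls mirror the patched code.

Patch 12 replaces two manual constructions: `URL.with_query(None)` drops a query string outright (where `get_content_url()` previously round-tripped through `urlparse()`/`urlunparse()`), and `URL.with_query(dict)` builds a properly encoded query (where the zarr downloader previously called `quote_plus()` by hand):

    >>> from yarl import URL
    >>> # Strip an existing query, as get_content_url(strip_query=True) now does:
    >>> str(URL("https://example.com/assets/123/download/?content_disposition=attachment").with_query(None))
    'https://example.com/assets/123/download/'
    >>> # Build an encoded query from a dict, as the zarr file-download URL is now built:
    >>> str(URL("https://example.com/zarr/abc123/files/").with_query({"prefix": "foo", "download": "true"}))
    'https://example.com/zarr/abc123/files/?prefix=foo&download=true'

`is_page2_url()` goes the other way: it copies the parsed query into a mutable MultiDict so that "page" can be forced to "2" before an order-insensitive comparison:

    >>> from multidict import MultiDict
    >>> params = MultiDict(URL("https://example.com/api/?order=name&page=1").query)
    >>> params["page"] = "2"
    >>> sorted(params.items())
    [('order', 'name'), ('page', '2')]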
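
The regular expression added in patch 14 (using the template renamed in patch 15) canonicalizes any spelling of an NCBITaxon URI (http or https, any letter case, optional trailing slash) before it is compared against the entries of species_map. A worked example with the same pattern and template as the patch, applied to the mouse taxon ID from species_map:

    >>> import re
    >>> NCBITAXON_URI_TEMPLATE = "http://purl.obolibrary.org/obo/NCBITaxon_{}"
    >>> m = re.fullmatch(
    ...     r"https?://purl\.obolibrary\.org/obo/NCBITaxon_([0-9]+)/?",
    ...     "HTTPS://PURL.OBOLIBRARY.ORG/obo/NCBITaxon_10090/",
    ...     flags=re.I,
    ... )
    >>> NCBITAXON_URI_TEMPLATE.format(m[1])
    'http://purl.obolibrary.org/obo/NCBITaxon_10090'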
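
Finally, the behaviors pinned down by the `is_same_url()` tests in patch 16 come from two places. Under the `yarl ~= 1.9` pin added in patch 12, URL equality already normalizes scheme case, host case, and the IDNA encoding of non-ASCII hostnames; what yarl does not do is treat "/api" and "/api/" as the same path, which is exactly why `is_same_url()` strips trailing slashes from the path explicitly. The same facts, shown on bare URL objects rather than through `is_same_url()`:

    >>> from yarl import URL
    >>> URL("https://example.com/api") == URL("https://example.com/api/")
    False
    >>> URL("HTTPS://EXAMPLE.COM/api") == URL("https://example.com/api")
    True
    >>> URL("https://example.珠宝/api") == URL("https://example.xn--pbt977c/api")
    True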