From 8abd6471bc2e5c7ac0b50fc54a4c666a30af78a3 Mon Sep 17 00:00:00 2001 From: Nikita Yurasov Date: Mon, 2 Sep 2024 18:02:35 +0200 Subject: [PATCH 1/3] fix: download subtree without clashes --- dbxio/blobs/download.py | 18 +++++++++++++++--- dbxio/core/cloud/azure/object_storage.py | 4 ++++ dbxio/core/cloud/client/object_storage.py | 4 ++++ 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/dbxio/blobs/download.py b/dbxio/blobs/download.py index 1a122f3..c0b0ae5 100644 --- a/dbxio/blobs/download.py +++ b/dbxio/blobs/download.py @@ -9,13 +9,25 @@ @retry(stop=stop_after_attempt(3), wait=wait_fixed(10)) def download_blob_tree( - object_storage_client: 'ObjectStorageClient', local_path: Path, prefix_path: Optional[str] = None + object_storage_client: 'ObjectStorageClient', + local_path: Path, + prefix_path: Optional[str] = None, ): for blob in object_storage_client.list_blobs(prefix=prefix_path): - if blob.name == prefix_path: + is_dir = object_storage_client.is_directory(blob.name) + if blob.name == prefix_path and is_dir: + # prefix path is subdir, skip it on first iteration continue + relative_blob_path = blob.name[len(prefix_path) + 1 :] if prefix_path else blob.name - if blob.content_settings.content_type is None: + if not relative_blob_path: + # if the prefix path is full path to one file + relative_blob_path = blob.name[len(object_storage_client.blobs_path) + 1 :] + + if is_dir: + if not blob.name.startswith(f'{prefix_path}/'): + # we found a directory with the same prefix, but it's not from our subtree + continue # it's a directory, create it Path(local_path / relative_blob_path).mkdir(parents=True, exist_ok=True) continue diff --git a/dbxio/core/cloud/azure/object_storage.py b/dbxio/core/cloud/azure/object_storage.py index 4c55d27..9c79857 100644 --- a/dbxio/core/cloud/azure/object_storage.py +++ b/dbxio/core/cloud/azure/object_storage.py @@ -55,6 +55,10 @@ def list_blobs(self, prefix: Optional[str] = None, **kwargs) -> Iterator: container_client = self.blob_service_client.get_container_client(self.container_name) return container_client.list_blobs(name_starts_with=prefix, **kwargs) + def is_directory(self, blob_name: str) -> bool: + blob_client = self.blob_service_client.get_container_client(self.container_name).get_blob_client(blob_name) + return blob_client.get_blob_properties().metadata.get('hdi_isfolder') == 'true' + def download_blob(self, blob_name: str) -> bytes: blob_client = self.blob_service_client.get_blob_client(container=self.container_name, blob=blob_name) return blob_client.download_blob().readall() diff --git a/dbxio/core/cloud/client/object_storage.py b/dbxio/core/cloud/client/object_storage.py index e50e4f3..1551b29 100644 --- a/dbxio/core/cloud/client/object_storage.py +++ b/dbxio/core/cloud/client/object_storage.py @@ -51,6 +51,10 @@ def try_delete_blob(self, blob_name: str): def list_blobs(self, prefix: Optional[str] = None, **kwargs) -> Iterator: raise NotImplementedError + @abstractmethod + def is_directory(self, blob_name: str) -> bool: + raise NotImplementedError + @abstractmethod def download_blob(self, blob_name: str) -> bytes: raise NotImplementedError From f76dce7675577636eb448731066d26809ec275a9 Mon Sep 17 00:00:00 2001 From: Nikita Yurasov Date: Mon, 2 Sep 2024 18:03:22 +0200 Subject: [PATCH 2/3] bump version --- dbxio/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbxio/__init__.py b/dbxio/__init__.py index f0ac856..75f72d1 100644 --- a/dbxio/__init__.py +++ b/dbxio/__init__.py @@ -4,4 +4,4 @@ from dbxio.utils import * # noqa: F403 from dbxio.volume import * # noqa: F403 -__version__ = '0.4.0' # single source of truth +__version__ = '0.4.1' # single source of truth From c4dcccb47965ed1b8fbdef3832ce7feb0a531592 Mon Sep 17 00:00:00 2001 From: Nikita Yurasov Date: Mon, 2 Sep 2024 18:06:33 +0200 Subject: [PATCH 3/3] refactoring --- dbxio/blobs/download.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dbxio/blobs/download.py b/dbxio/blobs/download.py index c0b0ae5..9d233f0 100644 --- a/dbxio/blobs/download.py +++ b/dbxio/blobs/download.py @@ -22,6 +22,7 @@ def download_blob_tree( relative_blob_path = blob.name[len(prefix_path) + 1 :] if prefix_path else blob.name if not relative_blob_path: # if the prefix path is full path to one file + assert object_storage_client.blobs_path, 'blobs_path is not set' relative_blob_path = blob.name[len(object_storage_client.blobs_path) + 1 :] if is_dir: