Skip to content

Commit

Permalink
fix: download subtree without clashes
Browse files Browse the repository at this point in the history
  • Loading branch information
NikitaYurasov committed Sep 2, 2024
1 parent 81d69e2 commit 8abd647
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 3 deletions.
18 changes: 15 additions & 3 deletions dbxio/blobs/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,25 @@

@retry(stop=stop_after_attempt(3), wait=wait_fixed(10))
def download_blob_tree(
object_storage_client: 'ObjectStorageClient', local_path: Path, prefix_path: Optional[str] = None
object_storage_client: 'ObjectStorageClient',
local_path: Path,
prefix_path: Optional[str] = None,
):
for blob in object_storage_client.list_blobs(prefix=prefix_path):
if blob.name == prefix_path:
is_dir = object_storage_client.is_directory(blob.name)
if blob.name == prefix_path and is_dir:
# prefix path is subdir, skip it on first iteration
continue

relative_blob_path = blob.name[len(prefix_path) + 1 :] if prefix_path else blob.name
if blob.content_settings.content_type is None:
if not relative_blob_path:
# if the prefix path is full path to one file
relative_blob_path = blob.name[len(object_storage_client.blobs_path) + 1 :]

if is_dir:
if not blob.name.startswith(f'{prefix_path}/'):
# we found a directory with the same prefix, but it's not from our subtree
continue
# it's a directory, create it
Path(local_path / relative_blob_path).mkdir(parents=True, exist_ok=True)
continue
Expand Down
4 changes: 4 additions & 0 deletions dbxio/core/cloud/azure/object_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ def list_blobs(self, prefix: Optional[str] = None, **kwargs) -> Iterator:
container_client = self.blob_service_client.get_container_client(self.container_name)
return container_client.list_blobs(name_starts_with=prefix, **kwargs)

def is_directory(self, blob_name: str) -> bool:
    """
    Tell whether the blob called ``blob_name`` is marked as a folder.

    Looks up the blob's properties in the configured container and checks
    whether its metadata carries ``hdi_isfolder`` set to the string
    ``'true'``. Any other value, or a missing key, yields ``False``.
    """
    container = self.blob_service_client.get_container_client(self.container_name)
    properties = container.get_blob_client(blob_name).get_blob_properties()
    folder_flag = properties.metadata.get('hdi_isfolder')
    return folder_flag == 'true'

def download_blob(self, blob_name: str) -> bytes:
    """
    Fetch the full content of ``blob_name`` from the configured container.

    Obtains a blob client for the container/blob pair, starts a download
    and returns everything it reads.
    """
    client = self.blob_service_client.get_blob_client(
        container=self.container_name,
        blob=blob_name,
    )
    downloader = client.download_blob()
    return downloader.readall()
Expand Down
4 changes: 4 additions & 0 deletions dbxio/core/cloud/client/object_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ def try_delete_blob(self, blob_name: str):
def list_blobs(self, prefix: Optional[str] = None, **kwargs) -> Iterator:
raise NotImplementedError

# Abstract hook: report whether the blob named `blob_name` is a directory.
# This declaration only raises NotImplementedError; being marked with
# @abstractmethod, it is meant to be overridden with the actual check.
@abstractmethod
def is_directory(self, blob_name: str) -> bool:
raise NotImplementedError

# Abstract hook: return the content of the blob named `blob_name` as bytes.
# This declaration only raises NotImplementedError; being marked with
# @abstractmethod, it is meant to be overridden with the actual download.
@abstractmethod
def download_blob(self, blob_name: str) -> bytes:
raise NotImplementedError
Expand Down

0 comments on commit 8abd647

Please sign in to comment.