From 9c6b808189c6cb99e83933bd606426f00cc1ac66 Mon Sep 17 00:00:00 2001 From: BRAUN REMI Date: Fri, 13 Dec 2024 11:10:21 +0100 Subject: [PATCH 01/18] Test with UPath --- CI/SCRIPTS/script_utils.py | 9 ++- CI/SCRIPTS/test_ci.py | 27 ++++---- CI/SCRIPTS/test_files.py | 125 +++++++++++++++++-------------------- CI/SCRIPTS/test_s3.py | 5 +- CI/SCRIPTS/test_unistra.py | 4 +- sertit/__init__.py | 12 +++- sertit/files.py | 36 +++++------ sertit/path.py | 22 +++++-- sertit/rasters_rio.py | 4 +- sertit/s3.py | 33 ++++++++++ sertit/types.py | 7 ++- sertit/vectors.py | 4 +- 12 files changed, 166 insertions(+), 122 deletions(-) diff --git a/CI/SCRIPTS/script_utils.py b/CI/SCRIPTS/script_utils.py index 5afe358..1d8863e 100644 --- a/CI/SCRIPTS/script_utils.py +++ b/CI/SCRIPTS/script_utils.py @@ -39,8 +39,13 @@ class Polarization(ListEnum): def get_s3_ci_path(): """Get S3 CI path""" - unistra.define_s3_client() - return AnyPath("s3://sertit-sertit-utils-ci") + # unistra.define_s3_client() + + from sertit.unistra import UNISTRA_S3_ENDPOINT + + return AnyPath( + "s3://sertit-sertit-utils-ci", endpoint_url=f"https://{UNISTRA_S3_ENDPOINT}" + ) def get_proj_path(): diff --git a/CI/SCRIPTS/test_ci.py b/CI/SCRIPTS/test_ci.py index fb0a65d..72a5632 100644 --- a/CI/SCRIPTS/test_ci.py +++ b/CI/SCRIPTS/test_ci.py @@ -22,7 +22,7 @@ from lxml import etree from CI.SCRIPTS.script_utils import files_path, rasters_path, s3_env, vectors_path -from sertit import ci, path, rasters, rasters_rio, vectors +from sertit import ci, path, rasters, rasters_rio, s3, vectors ci.reduce_verbosity() @@ -169,15 +169,15 @@ def test_assert_raster(): @s3_env -def test_assert_xml(): +def test_assert_xml(tmp_path): # XML xml_folder = files_path().joinpath("LM05_L1TP_200030_20121230_20200820_02_T2_CI") xml_path = xml_folder.joinpath("LM05_L1TP_200030_20121230_20200820_02_T2_MTL.xml") xml_bad_path = xml_folder.joinpath("false_xml.xml") if path.is_cloud_path(files_path()): - xml_path = xml_path.fspath - xml_bad_path = xml_bad_path.fspath + xml_path = s3.download(xml_path, tmp_path) + xml_bad_path = s3.download(xml_bad_path, tmp_path) xml_ok = etree.parse(str(xml_path)).getroot() xml_nok = etree.parse(str(xml_bad_path)).getroot() @@ -188,19 +188,18 @@ def test_assert_xml(): @s3_env -def test_assert_html(): +def test_assert_html(tmp_path): # HTML html_path = files_path().joinpath("productPreview.html") html_bad_path = files_path().joinpath("false.html") - with tempfile.TemporaryDirectory() as tmp_dir: - if path.is_cloud_path(files_path()): - html_path = html_path.download_to(tmp_dir) - html_bad_path = html_bad_path.download_to(tmp_dir) + if path.is_cloud_path(files_path()): + html_path = s3.download(html_path, tmp_path) + html_bad_path = s3.download(html_bad_path, tmp_path) - html_ok = etree.parse(str(html_path)).getroot() - html_nok = etree.parse(str(html_bad_path)).getroot() + html_ok = etree.parse(str(html_path)).getroot() + html_nok = etree.parse(str(html_bad_path)).getroot() - ci.assert_xml_equal(html_ok, html_ok) - with pytest.raises(AssertionError): - ci.assert_xml_equal(html_ok, html_nok) + ci.assert_xml_equal(html_ok, html_ok) + with pytest.raises(AssertionError): + ci.assert_xml_equal(html_ok, html_nok) diff --git a/CI/SCRIPTS/test_files.py b/CI/SCRIPTS/test_files.py index 0339720..56aa530 100644 --- a/CI/SCRIPTS/test_files.py +++ b/CI/SCRIPTS/test_files.py @@ -25,7 +25,7 @@ from lxml import etree, html from CI.SCRIPTS.script_utils import Polarization, files_path, s3_env -from sertit import AnyPath, ci, files, path, vectors +from sertit import AnyPath, ci, files, path, s3, vectors ci.reduce_verbosity() @@ -94,7 +94,7 @@ def test_archive(): @s3_env -def test_archived_files(): +def test_archived_files(tmp_path): landsat_name = "LM05_L1TP_200030_20121230_20200820_02_T2_CI" ok_folder = files_path().joinpath(landsat_name) zip_file = files_path().joinpath(f"{landsat_name}.zip") @@ -114,70 +114,63 @@ def test_archived_files(): ci.assert_geom_equal(vect_ok, vect_zip) ci.assert_geom_equal(vect_ok, vect_tar) - with tempfile.TemporaryDirectory() as tmp_dir: - # XML - xml_name = "LM05_L1TP_200030_20121230_20200820_02_T2_MTL.xml" - xml_ok_path = ok_folder.joinpath(xml_name) - if path.is_cloud_path(files_path()): - xml_ok_path = str(xml_ok_path.download_to(tmp_dir)) - else: - xml_ok_path = str(xml_ok_path) - - xml_regex = f".*{xml_name}" - xml_zip = files.read_archived_xml(zip_file, xml_regex) - xml_tar = files.read_archived_xml(tar_file, r".*_MTL\.xml") - xml_ok = etree.parse(xml_ok_path).getroot() - ci.assert_xml_equal(xml_ok, xml_zip) - ci.assert_xml_equal(xml_ok, xml_tar) - - # FILE + HTML - html_zip_file = files_path().joinpath("productPreview.zip") - html_tar_file = files_path().joinpath("productPreview.tar") - html_name = "productPreview.html" - html_ok_path = files_path().joinpath(html_name) - if path.is_cloud_path(files_path()): - html_ok_path = str(html_ok_path.download_to(tmp_dir)) - else: - html_ok_path = str(html_ok_path) - - html_regex = f".*{html_name}" - - # FILE - file_zip = files.read_archived_file(html_zip_file, html_regex) - file_tar = files.read_archived_file(html_tar_file, html_regex) - html_ok = html.parse(html_ok_path).getroot() - ci.assert_html_equal(html_ok, html.fromstring(file_zip)) - ci.assert_html_equal(html_ok, html.fromstring(file_tar)) - - file_list = path.get_archived_file_list(html_zip_file) - ci.assert_html_equal( - html_ok, - html.fromstring( - files.read_archived_file(html_zip_file, html_regex, file_list=file_list) - ), - ) - - # HTML - html_zip = files.read_archived_html(html_zip_file, html_regex) - html_tar = files.read_archived_html(html_tar_file, html_regex) - ci.assert_html_equal(html_ok, html_zip) - ci.assert_html_equal(html_ok, html_tar) - ci.assert_html_equal( - html_ok, - files.read_archived_html( - html_tar_file, - html_regex, - file_list=path.get_archived_file_list(html_tar_file), - ), - ) - - # ERRORS - with pytest.raises(TypeError): - files.read_archived_file(targz_file, xml_regex) - with pytest.raises(TypeError): - files.read_archived_file(sz_file, xml_regex) - with pytest.raises(FileNotFoundError): - files.read_archived_file(zip_file, "cdzeferf") + # XML + xml_name = "LM05_L1TP_200030_20121230_20200820_02_T2_MTL.xml" + xml_ok_path = ok_folder.joinpath(xml_name) + xml_ok_path = str(s3.download(xml_ok_path, tmp_path)) + + xml_regex = f".*{xml_name}" + xml_zip = files.read_archived_xml(zip_file, xml_regex) + xml_tar = files.read_archived_xml(tar_file, r".*_MTL\.xml") + xml_ok = etree.parse(xml_ok_path).getroot() + ci.assert_xml_equal(xml_ok, xml_zip) + ci.assert_xml_equal(xml_ok, xml_tar) + + # FILE + HTML + html_zip_file = files_path().joinpath("productPreview.zip") + html_tar_file = files_path().joinpath("productPreview.tar") + html_name = "productPreview.html" + html_ok_path = files_path().joinpath(html_name) + html_ok_path = str(s3.download(html_ok_path, tmp_path)) + + html_regex = f".*{html_name}" + + # FILE + file_zip = files.read_archived_file(html_zip_file, html_regex) + file_tar = files.read_archived_file(html_tar_file, html_regex) + html_ok = html.parse(html_ok_path).getroot() + ci.assert_html_equal(html_ok, html.fromstring(file_zip)) + ci.assert_html_equal(html_ok, html.fromstring(file_tar)) + + file_list = path.get_archived_file_list(html_zip_file) + ci.assert_html_equal( + html_ok, + html.fromstring( + files.read_archived_file(html_zip_file, html_regex, file_list=file_list) + ), + ) + + # HTML + html_zip = files.read_archived_html(html_zip_file, html_regex) + html_tar = files.read_archived_html(html_tar_file, html_regex) + ci.assert_html_equal(html_ok, html_zip) + ci.assert_html_equal(html_ok, html_tar) + ci.assert_html_equal( + html_ok, + files.read_archived_html( + html_tar_file, + html_regex, + file_list=path.get_archived_file_list(html_tar_file), + ), + ) + + # ERRORS + with pytest.raises(TypeError): + files.read_archived_file(targz_file, xml_regex) + with pytest.raises(TypeError): + files.read_archived_file(sz_file, xml_regex) + with pytest.raises(FileNotFoundError): + files.read_archived_file(zip_file, "cdzeferf") def test_cp_rm(): diff --git a/CI/SCRIPTS/test_s3.py b/CI/SCRIPTS/test_s3.py index f99b073..032376b 100644 --- a/CI/SCRIPTS/test_s3.py +++ b/CI/SCRIPTS/test_s3.py @@ -19,11 +19,10 @@ import pytest import rasterio -from cloudpathlib import AnyPath, S3Client from tempenv import tempenv from CI.SCRIPTS.script_utils import CI_SERTIT_S3 -from sertit import rasters +from sertit import AnyPath, rasters from sertit.s3 import USE_S3_STORAGE, s3_env, temp_s3 @@ -43,6 +42,8 @@ def with_s3(variable_1, variable_2): def without_s3(): + from cloudpathlib import S3Client + S3Client().set_as_default_client() return base_fct(None) diff --git a/CI/SCRIPTS/test_unistra.py b/CI/SCRIPTS/test_unistra.py index bb31894..6f58bb9 100644 --- a/CI/SCRIPTS/test_unistra.py +++ b/CI/SCRIPTS/test_unistra.py @@ -16,11 +16,11 @@ # limitations under the License. """ Script testing the CI """ import pytest -from cloudpathlib import AnyPath, S3Client +from cloudpathlib import S3Client from tempenv import tempenv from CI.SCRIPTS.script_utils import CI_SERTIT_S3 -from sertit import ci, misc, rasters, s3 +from sertit import AnyPath, ci, misc, rasters, s3 from sertit.unistra import ( _get_db_path, get_db2_path, diff --git a/sertit/__init__.py b/sertit/__init__.py index d63b9d5..5189825 100644 --- a/sertit/__init__.py +++ b/sertit/__init__.py @@ -20,11 +20,17 @@ .. include:: ../README.md """ try: - from cloudpathlib import AnyPath + from upath import UPath + + AnyPath = UPath - AnyPath = AnyPath except ImportError: - pass + try: + from cloudpathlib import AnyPath + + AnyPath = AnyPath + except ImportError: + pass # flake8: noqa from .__meta__ import ( diff --git a/sertit/files.py b/sertit/files.py index 918a173..a5dea19 100644 --- a/sertit/files.py +++ b/sertit/files.py @@ -36,7 +36,7 @@ from lxml import etree, html from tqdm import tqdm -from sertit import AnyPath, logs, path +from sertit import AnyPath, logs, path, s3 from sertit.logs import SU_NAME from sertit.strings import DATE_FORMAT from sertit.types import AnyPathStrType, AnyPathType @@ -515,26 +515,20 @@ def archive( 'D:/path/to/output/folder_to_archive.tar.gz' """ archive_path = AnyPath(archive_path) - folder_path = AnyPath(folder_path) - - tmp_dir = None - if path.is_cloud_path(folder_path): - tmp_dir = tempfile.TemporaryDirectory() - folder_path = folder_path.download_to(tmp_dir.name) - - # Shutil make_archive needs a path without extension - archive_base = os.path.splitext(archive_path)[0] - - # Archive the folder - archive_fn = shutil.make_archive( - archive_base, - format=fmt, - root_dir=folder_path.parent, - base_dir=folder_path.name, - ) - if tmp_dir is not None: - tmp_dir.cleanup() + with tempfile.TemporaryDirectory() as tmp_path: + folder_path = s3.download(AnyPath(folder_path), tmp_path) + + # Shutil make_archive needs a path without extension + archive_base = os.path.splitext(archive_path)[0] + + # Archive the folder + archive_fn = shutil.make_archive( + archive_base, + format=fmt, + root_dir=folder_path.parent, + base_dir=folder_path.name, + ) return AnyPath(archive_fn) @@ -755,7 +749,7 @@ def copy(src: AnyPathStrType, dst: AnyPathStrType) -> AnyPathType: src = AnyPath(src) if path.is_cloud_path(src): - out = src.download_to(dst) + out = s3.download(src, dst) else: out = None try: diff --git a/sertit/path.py b/sertit/path.py index 94a8106..079ec5a 100644 --- a/sertit/path.py +++ b/sertit/path.py @@ -593,11 +593,22 @@ def is_cloud_path(path: AnyPathStrType): bool: True if the file is store on the cloud. """ try: - from cloudpathlib import CloudPath + return AnyPath(path).protocol in [ + "s3", + "az", + "adl", + "abfs", + "abfss", + "gs", + "gcs", + ] + except ImportError: + try: + from cloudpathlib import CloudPath - return isinstance(AnyPath(path), CloudPath) - except Exception: - return False + return isinstance(AnyPath(path), CloudPath) + except Exception: + return False def is_path(path: Any) -> bool: @@ -613,5 +624,6 @@ def is_path(path: Any) -> bool: from pathlib import Path from cloudpathlib import CloudPath + from upath import UPath - return isinstance(path, (str, Path, CloudPath)) + return isinstance(path, (str, Path, CloudPath, UPath)) diff --git a/sertit/rasters_rio.py b/sertit/rasters_rio.py index a95af4e..6a0b9ee 100644 --- a/sertit/rasters_rio.py +++ b/sertit/rasters_rio.py @@ -44,7 +44,7 @@ "Please install 'rasterio' to use the 'rasters_rio' package." ) from ex -from sertit import AnyPath, geometry, logs, misc, path, strings, vectors, xml +from sertit import AnyPath, geometry, logs, misc, path, s3, strings, vectors, xml from sertit.logs import SU_NAME from sertit.types import AnyNumpyArray, AnyPathStrType, AnyPathType, AnyRasterType @@ -1435,7 +1435,7 @@ def merge_vrt( crs_path = AnyPath(crs_path) # Download file if VRT is needed if path.is_cloud_path(crs_path): - crs_path = crs_path.download_to(merged_path.parent) + crs_path = s3.download(crs_path, merged_path.parent) with rasterio.open(str(crs_path)) as src: if first_crs is None: diff --git a/sertit/s3.py b/sertit/s3.py index d9bf2e8..f1e35e1 100644 --- a/sertit/s3.py +++ b/sertit/s3.py @@ -24,6 +24,7 @@ from cloudpathlib import S3Client +from sertit import AnyPath, path from sertit.logs import SU_NAME LOGGER = logging.getLogger(SU_NAME) @@ -272,3 +273,35 @@ def define_s3_client( client = S3Client(**args_s3_client) client.set_as_default_client() + + +def download(src, dst): + + # By default, use the src path + downloaded_path = src + + if path.is_path(src): + from cloudpathlib import CloudPath + from upath import UPath + + # Universal pathlib + if isinstance(src, UPath): + import shutil + + dst = AnyPath(dst) + if dst.is_dir(): + downloaded_path = dst / src.name + else: + downloaded_path = dst + + with src.open("rb") as f0, downloaded_path.open("wb") as f1: + shutil.copyfileobj(f0, f1) + + # cloudpathlib + elif isinstance(src, CloudPath): + if dst is None: + downloaded_path = src.fspath + else: + downloaded_path = src.download_to(dst) + + return downloaded_path diff --git a/sertit/types.py b/sertit/types.py index ef02fcb..d44b625 100644 --- a/sertit/types.py +++ b/sertit/types.py @@ -8,11 +8,12 @@ from cloudpathlib import CloudPath from rasterio.io import DatasetReader, DatasetWriter from shapely import MultiPolygon, Polygon +from upath import UPath -AnyPathType = Union[CloudPath, Path] -"""Any Path Type (derived from Pathlib and CloudpathLib)""" +AnyPathType = Union[CloudPath, Path, UPath] +"""Any Path Type (derived from Pathlib, Universal Pathlib and CloudpathLib)""" -AnyPathStrType = Union[str, CloudPath, Path] +AnyPathStrType = Union[str, AnyPathType] """Same as :code:`AnyPathType` but appened with :code:`str`""" AnyXrDataStructure = Union[xr.DataArray, xr.Dataset] diff --git a/sertit/vectors.py b/sertit/vectors.py index 818b7ee..ef3b626 100644 --- a/sertit/vectors.py +++ b/sertit/vectors.py @@ -35,7 +35,7 @@ from cloudpathlib.exceptions import AnyPathTypeError from shapely import Polygon, wkt -from sertit import AnyPath, files, geometry, logs, misc, path, strings +from sertit import AnyPath, files, geometry, logs, misc, path, s3, strings from sertit.logs import SU_NAME from sertit.types import AnyPathStrType, AnyPathType @@ -717,7 +717,7 @@ def ogr2geojson( else: # vector_path should be downloaded to work with 'ogr2ogr' if path.is_cloud_path(vector_path): - vector_path = AnyPath(vector_path).fspath + vector_path = s3.download(vector_path, out_dir) vect_path = vector_path vect_path_gj = os.path.join( From 7a2cb5db715838a4adf7e858235b92f5041e8a1c Mon Sep 17 00:00:00 2001 From: BRAUN REMI Date: Fri, 13 Dec 2024 11:57:43 +0100 Subject: [PATCH 02/18] Updates to make UPath work with zipfile and tarfiles --- CI/SCRIPTS/test_s3.py | 3 ++- CI/SCRIPTS/test_types.py | 3 ++- CI/SCRIPTS/test_unistra.py | 6 +++--- sertit/files.py | 25 +++++++++++++++++++------ sertit/path.py | 21 +++++++++++++++------ sertit/s3.py | 12 ++++++++++++ sertit/vectors.py | 17 ++++++++++++++--- sertit/xml.py | 6 +++--- 8 files changed, 70 insertions(+), 23 deletions(-) diff --git a/CI/SCRIPTS/test_s3.py b/CI/SCRIPTS/test_s3.py index 032376b..d7e1604 100644 --- a/CI/SCRIPTS/test_s3.py +++ b/CI/SCRIPTS/test_s3.py @@ -19,10 +19,11 @@ import pytest import rasterio +from cloudpathlib import AnyPath from tempenv import tempenv from CI.SCRIPTS.script_utils import CI_SERTIT_S3 -from sertit import AnyPath, rasters +from sertit import rasters from sertit.s3 import USE_S3_STORAGE, s3_env, temp_s3 diff --git a/CI/SCRIPTS/test_types.py b/CI/SCRIPTS/test_types.py index 53e6f95..6013570 100644 --- a/CI/SCRIPTS/test_types.py +++ b/CI/SCRIPTS/test_types.py @@ -3,6 +3,7 @@ import numpy as np from cloudpathlib import CloudPath +from upath import UPath from sertit import AnyPath from sertit.types import AnyPathType, is_iterable, make_iterable @@ -10,7 +11,7 @@ def test_types(): """Test some type aliases""" - assert AnyPathType == Union[Path, CloudPath] + assert AnyPathType == Union[Path, CloudPath, UPath] def test_is_iterable(): diff --git a/CI/SCRIPTS/test_unistra.py b/CI/SCRIPTS/test_unistra.py index 6f58bb9..7406400 100644 --- a/CI/SCRIPTS/test_unistra.py +++ b/CI/SCRIPTS/test_unistra.py @@ -16,11 +16,11 @@ # limitations under the License. """ Script testing the CI """ import pytest -from cloudpathlib import S3Client +from cloudpathlib import AnyPath, S3Client from tempenv import tempenv from CI.SCRIPTS.script_utils import CI_SERTIT_S3 -from sertit import AnyPath, ci, misc, rasters, s3 +from sertit import ci, misc, rasters, s3 from sertit.unistra import ( _get_db_path, get_db2_path, @@ -73,7 +73,7 @@ def test_unistra_s3(): assert with_s3() == 1 # Test get_geodatastore with s3 - assert str(get_geodatastore()) == "s3://sertit-geodatastore" + assert str(get_geodatastore()) == "s3://sertit-geodatastore/" # Test get_geodatastore without s3 with tempenv.TemporaryEnvironment({s3.USE_S3_STORAGE: "0"}): diff --git a/sertit/files.py b/sertit/files.py index a5dea19..5d30cd1 100644 --- a/sertit/files.py +++ b/sertit/files.py @@ -221,16 +221,25 @@ def extract_sub_dir(arch, filename_list): arch.extractall(archive_output) # Manage archive type + if file_path.suffix == ".zip": + if path.is_cloud_path(file_path): + file_path = s3.read(file_path) with zipfile.ZipFile(file_path, "r") as zip_file: extract_sub_dir(zip_file, zip_file.namelist()) elif file_path.suffix == ".tar" or file_path.suffixes == [".tar", ".gz"]: - with tarfile.open(file_path, "r") as tar_file: + if path.is_cloud_path(file_path): + args = {"fileobj": s3.read(file_path), "mode": "r"} + else: + args = {"name": file_path, "mode": "r"} + with tarfile.open(**args) as tar_file: extract_sub_dir(tar_file, tar_file.getnames()) elif file_path.suffix == ".7z": try: import py7zr + if path.is_cloud_path(file_path): + file_path = s3.read(file_path) with py7zr.SevenZipFile(file_path, "r") as z7_file: extract_sub_dir(z7_file, z7_file.getnames()) except ModuleNotFoundError: @@ -394,14 +403,18 @@ def read_archived_file( bytes: Archived file in bytes """ archive_path = AnyPath(archive_path) - + archive_fn = get_filename(archive_path) # Compile regex regex = re.compile(regex) # Open tar and zip XML try: if archive_path.suffix == ".tar": - with tarfile.open(archive_path) as tar_ds: + if path.is_cloud_path(archive_path): + args = {"fileobj": s3.read(archive_path), "mode": "r"} + else: + args = {"name": archive_path, "mode": "r"} + with tarfile.open(**args) as tar_ds: # file_list is not very useful for TAR files... if file_list is None: tar_mb = tar_ds.getmembers() @@ -410,6 +423,8 @@ def read_archived_file( tarinfo = tar_ds.getmember(name) file_str = tar_ds.extractfile(tarinfo).read() elif archive_path.suffix == ".zip": + if path.is_cloud_path(archive_path): + archive_path = s3.read(archive_path) with zipfile.ZipFile(archive_path) as zip_ds: if file_list is None: file_list = [f.filename for f in zip_ds.filelist] @@ -425,9 +440,7 @@ def read_archived_file( "Only .zip and .tar files can be read from inside its archive." ) except IndexError: - raise FileNotFoundError( - f"Impossible to find file {regex} in {path.get_filename(archive_path)}" - ) + raise FileNotFoundError(f"Impossible to find file {regex} in {archive_fn}") return file_str diff --git a/sertit/path.py b/sertit/path.py index 079ec5a..69b97d6 100644 --- a/sertit/path.py +++ b/sertit/path.py @@ -26,7 +26,7 @@ import zipfile from typing import Any, Union -from sertit import AnyPath, logs +from sertit import AnyPath, logs, s3 from sertit.logs import SU_NAME from sertit.types import AnyPathStrType, AnyPathType @@ -167,18 +167,27 @@ def get_archived_file_list(archive_path: AnyPathStrType) -> list: ['file_1.txt', 'file_2.tif', 'file_3.xml', 'file_4.geojson'] """ archive_path = AnyPath(archive_path) - if archive_path.suffix == ".zip": + + is_zip = archive_path.suffix == ".zip" + archive_fn = get_filename(archive_path) + if is_zip: + + if is_cloud_path(archive_path): + archive_path = s3.read(archive_path) + with zipfile.ZipFile(archive_path) as zip_ds: file_list = [f.filename for f in zip_ds.filelist] else: try: - with tarfile.open(archive_path) as tar_ds: + if is_cloud_path(archive_path): + args = {"fileobj": s3.read(archive_path), "mode": "r"} + else: + args = {"name": archive_path, "mode": "r"} + with tarfile.open(**args) as tar_ds: tar_mb = tar_ds.getmembers() file_list = [mb.name for mb in tar_mb] except tarfile.ReadError as ex: - raise tarfile.ReadError( - f"Impossible to open archive: {archive_path}" - ) from ex + raise tarfile.ReadError(f"Impossible to open archive: {archive_fn}") from ex return file_list diff --git a/sertit/s3.py b/sertit/s3.py index f1e35e1..3b92d19 100644 --- a/sertit/s3.py +++ b/sertit/s3.py @@ -21,6 +21,7 @@ import os from contextlib import contextmanager from functools import wraps +from io import BytesIO from cloudpathlib import S3Client @@ -305,3 +306,14 @@ def download(src, dst): downloaded_path = src.download_to(dst) return downloaded_path + + +def read(src): + src = AnyPath(src) + try: + b = src.read_bytes() + except Exception: + with src.open("rb") as f: + b = f.read() + + return BytesIO(b) diff --git a/sertit/vectors.py b/sertit/vectors.py index ef3b626..7a39e17 100644 --- a/sertit/vectors.py +++ b/sertit/vectors.py @@ -255,8 +255,11 @@ def get_aoi_wkt(aoi_path: AnyPathStrType, as_str: bool = True) -> Union[str, Pol if aoi_path.suffix == ".wkt": try: - with open(aoi_path, "r") as aoi_f: - aoi = wkt.load(aoi_f) + if path.is_cloud_path(aoi_path): + aoi = wkt.load(s3.read(aoi_path)) + else: + with open(aoi_path, "r") as aoi_f: + aoi = wkt.load(aoi_f) except Exception as ex: raise ValueError("AOI WKT cannot be read") from ex else: @@ -707,11 +710,19 @@ def ogr2geojson( vector_path = AnyPath(vector_path) # archived vector_path are extracted in a tmp folder so no need to be downloaded + if vector_path.suffix == ".zip": + if path.is_cloud_path(vector_path): + vector_path = s3.read(vector_path) with zipfile.ZipFile(vector_path, "r") as zip_ds: vect_path = zip_ds.extract(arch_vect_path, out_dir) elif vector_path.suffix == ".tar": - with tarfile.open(vector_path, "r") as tar_ds: + if path.is_cloud_path(vector_path): + args = {"fileobj": s3.read(vector_path), "mode": "r"} + else: + args = {"name": vector_path, "mode": "r"} + + with tarfile.open(**args) as tar_ds: tar_ds.extract(arch_vect_path, out_dir) vect_path = os.path.join(out_dir, arch_vect_path) else: diff --git a/sertit/xml.py b/sertit/xml.py index 0f358af..738c544 100644 --- a/sertit/xml.py +++ b/sertit/xml.py @@ -29,7 +29,7 @@ ) from lxml.html.builder import E -from sertit import AnyPath, files, path +from sertit import AnyPath, files, path, s3 from sertit.logs import SU_NAME from sertit.misc import ListEnum from sertit.types import AnyPathStrType @@ -55,12 +55,12 @@ def read(xml_path: AnyPathStrType) -> _Element: try: # Try using read_text (faster) root = fromstring(xml_path.read_text()) - except ValueError: + except (ValueError, PermissionError): # Try using read_bytes # Slower but works with: # {ValueError}Unicode strings with encoding declaration are not supported. # Please use bytes input or XML fragments without declaration. - root = fromstring(xml_path.read_bytes()) + root = fromstring(s3.read(xml_path)) else: # pylint: disable=I1101: # Module 'lxml.etree' has no 'parse' member, but source is unavailable. From 118cad12473f9cc3b28f702d2a5b0422bdf36754 Mon Sep 17 00:00:00 2001 From: BRAUN REMI Date: Fri, 13 Dec 2024 12:33:56 +0100 Subject: [PATCH 03/18] Don't recreate Path without storage options to make it work with UPath --- CI/SCRIPTS/test_vectors.py | 5 ++++- CI/SCRIPTS/test_xml.py | 5 +++-- sertit/vectors.py | 6 +++++- sertit/xml.py | 38 +++++++++++++++++++++++++++++--------- 4 files changed, 41 insertions(+), 13 deletions(-) diff --git a/CI/SCRIPTS/test_vectors.py b/CI/SCRIPTS/test_vectors.py index 0ecd93a..d84e2c0 100644 --- a/CI/SCRIPTS/test_vectors.py +++ b/CI/SCRIPTS/test_vectors.py @@ -279,7 +279,10 @@ def test_read_archived(): map_overlay_extracted = vectors.read(map_overlay_extracted_path) ci.assert_geom_equal( - map_overlay_extracted, vectors.read(f"{zip_landsat}!{landsat}/{map_overlay}") + map_overlay_extracted, + vectors.read( + zip_landsat.parent / (zip_landsat.name + f"!{landsat}/{map_overlay}") + ), ) ci.assert_geom_equal( map_overlay_extracted, diff --git a/CI/SCRIPTS/test_xml.py b/CI/SCRIPTS/test_xml.py index fb74f16..63c23c9 100644 --- a/CI/SCRIPTS/test_xml.py +++ b/CI/SCRIPTS/test_xml.py @@ -111,7 +111,7 @@ def test_xml(): _assert_str(cv_xml.findtext(".//Age"), "20") # Write - true_xml = str(xml_path() / "true.xml") + true_xml = xml_path() / "true.xml" with tempfile.TemporaryDirectory() as tmp_dir: tmp_xml = os.path.join(tmp_dir, "tmp.xml") xml.write(cv_xml, tmp_xml) @@ -121,7 +121,8 @@ def test_xml(): # Based on `files.read_archived_xml`, so it is considered to work. # Just test the case with complete path to the archive l8_archived = files_path() / "LM05_L1TP_200030_20121230_20200820_02_T2_CI.zip" - xml_archived = f"{l8_archived}!LM05_L1TP_200030_20121230_20200820_02_T2_CI/LM05_L1TP_200030_20121230_20200820_02_T2_MTL.xml" + xml_path_in_zip = "!LM05_L1TP_200030_20121230_20200820_02_T2_CI/LM05_L1TP_200030_20121230_20200820_02_T2_MTL.xml" + xml_archived = l8_archived.parent / (l8_archived.name + xml_path_in_zip) ci.assert_xml_equal( xml.read_archive(l8_archived, r".*_MTL\.xml"), xml.read_archive(xml_archived) diff --git a/sertit/vectors.py b/sertit/vectors.py index 7a39e17..75769eb 100644 --- a/sertit/vectors.py +++ b/sertit/vectors.py @@ -476,7 +476,11 @@ def read( if "!" in str(vector_path): split_vect = str(vector_path).split("!") archive_regex = ".*{0}".format(split_vect[1].replace(".", r"\.")) - vector_path = AnyPath(split_vect[0]) + try: + vector_path = AnyPath(split_vect[0], **vector_path.storage_options) + except Exception: + # Cloudpathlib + vector_path = AnyPath(split_vect[0]) # Manage archive case if vector_path.suffix in [".tar", ".zip"]: diff --git a/sertit/xml.py b/sertit/xml.py index 738c544..86e8471 100644 --- a/sertit/xml.py +++ b/sertit/xml.py @@ -29,7 +29,7 @@ ) from lxml.html.builder import E -from sertit import AnyPath, files, path, s3 +from sertit import AnyPath, files, logs, path, s3 from sertit.logs import SU_NAME from sertit.misc import ListEnum from sertit.types import AnyPathStrType @@ -60,7 +60,7 @@ def read(xml_path: AnyPathStrType) -> _Element: # Slower but works with: # {ValueError}Unicode strings with encoding declaration are not supported. # Please use bytes input or XML fragments without declaration. - root = fromstring(s3.read(xml_path)) + root = fromstring(s3.read(xml_path).read()) else: # pylint: disable=I1101: # Module 'lxml.etree' has no 'parse' member, but source is unavailable. @@ -74,7 +74,10 @@ def read(xml_path: AnyPathStrType) -> _Element: def read_archive( - path: AnyPathStrType, regex: str = None, file_list: list = None + archive_path: AnyPathStrType = None, + regex: str = None, + file_list: list = None, + **kwargs, ) -> _Element: """ Read an XML file from inside an archive (zip or tar) @@ -86,25 +89,42 @@ def read_archive( - path to the archive plus a regex looking inside the archive. Duplicate behaviour to :py:func:`files.read_archived_xml` Args: - path (AnyPathStrType): Path to the XML file, stored inside an archive or path to the archive itself + archive_path (AnyPathStrType): Path to the XML file, stored inside an archive or path to the archive itself regex (str): Optional. If specified, the path should be the archive path and the regex should be the key to find the XML file inside the archive. file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed. Returns: _Element: XML Root """ + if archive_path is None: + logs.deprecation_warning( + "'path' argument is deprecated, use 'archive_path' instead." + ) + archive_path = kwargs.pop("path") try: if not regex: - path, basename = str(path).split("!") + archive_base_path, basename = str(archive_path).split("!") regex = basename - if path.startswith("zip://") or path.startswith("tar://"): - path = path[5:] + if archive_base_path.startswith("zip://") or archive_base_path.startswith( + "tar://" + ): + archive_base_path = archive_base_path[5:] - return files.read_archived_xml(path, regex, file_list=file_list) + # For UPath + try: + archive_base_path = AnyPath( + archive_base_path, **archive_path.storage_options + ) + except Exception: + pass + else: + archive_base_path = archive_path + + return files.read_archived_xml(archive_base_path, regex, file_list=file_list) except XMLSyntaxError: - raise ValueError(f"Invalid metadata XML for {path}!") + raise ValueError(f"Invalid metadata XML for {archive_path}!") def write(xml: _Element, path: str) -> None: From e6ba4387856b440291bb3142d4e7dcc21ba7ea10 Mon Sep 17 00:00:00 2001 From: BRAUN REMI Date: Fri, 13 Dec 2024 14:30:27 +0100 Subject: [PATCH 04/18] Create archives module (to avoid circular imports between path and files) + remove some depr functions + fix erroneous merge --- CI/SCRIPTS/test_archives.py | 147 ++++++++++ CI/SCRIPTS/test_files.py | 146 +--------- CI/SCRIPTS/test_path.py | 57 +--- CI/SCRIPTS/test_vectors.py | 4 +- sertit/archives.py | 558 ++++++++++++++++++++++++++++++++++++ sertit/files.py | 458 +---------------------------- sertit/path.py | 188 +----------- sertit/vectors.py | 25 +- sertit/xml.py | 36 ++- 9 files changed, 756 insertions(+), 863 deletions(-) create mode 100644 CI/SCRIPTS/test_archives.py create mode 100644 sertit/archives.py diff --git a/CI/SCRIPTS/test_archives.py b/CI/SCRIPTS/test_archives.py new file mode 100644 index 0000000..3415618 --- /dev/null +++ b/CI/SCRIPTS/test_archives.py @@ -0,0 +1,147 @@ +import os +import shutil + +import pytest +from lxml import etree, html + +from CI.SCRIPTS.script_utils import files_path, s3_env +from sertit import archives, ci, files, path, s3, vectors + + +def test_archive(tmp_path): + """Test extracting functions""" + # Archives + zip_file = files_path().joinpath("test_zip.zip") + zip2_file = files_path().joinpath("test_zip.zip") # For overwrite + zip_without_directory = files_path().joinpath("test_zip_without_directory.zip") + tar_file = files_path().joinpath("test_tar.tar") + tar_gz_file = files_path().joinpath("test_targz.tar.gz") + + # Core dir + core_dir = files_path().joinpath("core") + folder = core_dir + arch = [ + zip_file, + tar_file, + tar_gz_file, + folder, + zip2_file, + zip_without_directory, + ] + + # Extract + extracted_dirs = archives.extract_files(arch, tmp_path, overwrite=True) + archives.extract_files([zip2_file], tmp_path, overwrite=False) # Already existing + + # Test + for ex_dir in extracted_dirs: + ci.assert_dir_equal(core_dir, ex_dir) + + # Archive + archive_base = os.path.join(tmp_path, "archive") + for fmt in ["zip", "tar", "gztar"]: + archive_fn = archives.archive( + folder_path=core_dir, archive_path=archive_base, fmt=fmt + ) + out = archives.extract_file(archive_fn, tmp_path) + # an additional folder is created + out_dir = path.listdir_abspath(out)[0] + ci.assert_dir_equal(core_dir, out_dir) + + # Remove out directory in order to avoid any interferences + files.remove(out) + + # Add to zip + zip_out = zip2_file if path.is_cloud_path(zip2_file) else archive_base + ".zip" + core_copy = files.copy(core_dir, os.path.join(tmp_path, "core2")) + zip_out = archives.add_to_zip(zip_out, core_copy) + + # Extract + unzip_out = os.path.join(tmp_path, "out") + unzip_out = archives.extract_file(zip_out, unzip_out) + + # Test + unzip_dirs = path.listdir_abspath(unzip_out) + + assert len(unzip_dirs) == 2 + ci.assert_dir_equal(unzip_dirs[0], unzip_dirs[1]) + + +@s3_env +def test_archived_files(tmp_path): + landsat_name = "LM05_L1TP_200030_20121230_20200820_02_T2_CI" + ok_folder = files_path().joinpath(landsat_name) + zip_file = files_path().joinpath(f"{landsat_name}.zip") + tar_file = files_path().joinpath(f"{landsat_name}.tar") + targz_file = files_path().joinpath(f"{landsat_name}.tar.gz") + sz_file = files_path().joinpath(f"{landsat_name}.7z") + + # VECTORS + vect_name = "map-overlay.kml" + vec_ok_path = ok_folder.joinpath(vect_name) + if shutil.which("ogr2ogr"): # Only works if ogr2ogr can be found. + vect_regex = f".*{vect_name}" + vect_zip = vectors.read(zip_file, archive_regex=vect_regex) + vect_tar = vectors.read(tar_file, archive_regex=r".*overlay\.kml") + vect_ok = vectors.read(vec_ok_path) + assert not vect_ok.empty + ci.assert_geom_equal(vect_ok, vect_zip) + ci.assert_geom_equal(vect_ok, vect_tar) + + # XML + xml_name = "LM05_L1TP_200030_20121230_20200820_02_T2_MTL.xml" + xml_ok_path = ok_folder.joinpath(xml_name) + xml_ok_path = str(s3.download(xml_ok_path, tmp_path)) + + xml_regex = f".*{xml_name}" + xml_zip = archives.read_archived_xml(zip_file, xml_regex) + xml_tar = archives.read_archived_xml(tar_file, r".*_MTL\.xml") + xml_ok = etree.parse(xml_ok_path).getroot() + ci.assert_xml_equal(xml_ok, xml_zip) + ci.assert_xml_equal(xml_ok, xml_tar) + + # FILE + HTML + html_zip_file = files_path().joinpath("productPreview.zip") + html_tar_file = files_path().joinpath("productPreview.tar") + html_name = "productPreview.html" + html_ok_path = files_path().joinpath(html_name) + html_ok_path = str(s3.download(html_ok_path, tmp_path)) + + html_regex = f".*{html_name}" + + # FILE + file_zip = archives.read_archived_file(html_zip_file, html_regex) + file_tar = archives.read_archived_file(html_tar_file, html_regex) + html_ok = html.parse(html_ok_path).getroot() + ci.assert_html_equal(html_ok, html.fromstring(file_zip)) + ci.assert_html_equal(html_ok, html.fromstring(file_tar)) + + file_list = archives.get_archived_file_list(html_zip_file) + ci.assert_html_equal( + html_ok, + html.fromstring( + archives.read_archived_file(html_zip_file, html_regex, file_list=file_list) + ), + ) + + # HTML + html_zip = archives.read_archived_html(html_zip_file, html_regex) + html_tar = archives.read_archived_html(html_tar_file, html_regex) + ci.assert_html_equal(html_ok, html_zip) + ci.assert_html_equal(html_ok, html_tar) + ci.assert_html_equal( + html_ok, + archives.read_archived_html( + html_tar_file, + html_regex, + file_list=archives.get_archived_file_list(html_tar_file), + ), + ) + + # ERRORS + with pytest.raises(TypeError): + archives.read_archived_file(targz_file, xml_regex) + with pytest.raises(TypeError): + archives.read_archived_file(sz_file, xml_regex) + with pytest.raises(FileNotFoundError): + archives.read_archived_file(zip_file, "cdzeferf") diff --git a/CI/SCRIPTS/test_files.py b/CI/SCRIPTS/test_files.py index 04015b7..a0f9889 100644 --- a/CI/SCRIPTS/test_files.py +++ b/CI/SCRIPTS/test_files.py @@ -16,160 +16,18 @@ """Script testing the files""" import os -import shutil import tempfile from datetime import date, datetime import numpy as np import pytest -from lxml import etree, html -from CI.SCRIPTS.script_utils import Polarization, files_path, s3_env -from sertit import AnyPath, ci, files, path, s3, vectors +from CI.SCRIPTS.script_utils import Polarization +from sertit import AnyPath, ci, files ci.reduce_verbosity() -def test_archive(): - """Test extracting functions""" - with tempfile.TemporaryDirectory() as tmp_dir: - # Archives - zip_file = files_path().joinpath("test_zip.zip") - zip2_file = files_path().joinpath("test_zip.zip") # For overwrite - zip_without_directory = files_path().joinpath("test_zip_without_directory.zip") - tar_file = files_path().joinpath("test_tar.tar") - tar_gz_file = files_path().joinpath("test_targz.tar.gz") - - # Core dir - core_dir = files_path().joinpath("core") - folder = core_dir - archives = [ - zip_file, - tar_file, - tar_gz_file, - folder, - zip2_file, - zip_without_directory, - ] - - # Extract - extracted_dirs = files.extract_files(archives, tmp_dir, overwrite=True) - files.extract_files([zip2_file], tmp_dir, overwrite=False) # Already existing - - # Test - for ex_dir in extracted_dirs: - ci.assert_dir_equal(core_dir, ex_dir) - - # Archive - archive_base = os.path.join(tmp_dir, "archive") - for fmt in ["zip", "tar", "gztar"]: - archive_fn = files.archive( - folder_path=core_dir, archive_path=archive_base, fmt=fmt - ) - out = files.extract_file(archive_fn, tmp_dir) - # an additional folder is created - out_dir = path.listdir_abspath(out)[0] - ci.assert_dir_equal(core_dir, out_dir) - - # Remove out directory in order to avoid any interferences - files.remove(out) - - # Add to zip - zip_out = zip2_file if path.is_cloud_path(zip2_file) else archive_base + ".zip" - core_copy = files.copy(core_dir, os.path.join(tmp_dir, "core2")) - zip_out = files.add_to_zip(zip_out, core_copy) - - # Extract - unzip_out = os.path.join(tmp_dir, "out") - unzip_out = files.extract_file(zip_out, unzip_out) - - # Test - unzip_dirs = path.listdir_abspath(unzip_out) - - assert len(unzip_dirs) == 2 - ci.assert_dir_equal(unzip_dirs[0], unzip_dirs[1]) - - -@s3_env -def test_archived_files(tmp_path): - landsat_name = "LM05_L1TP_200030_20121230_20200820_02_T2_CI" - ok_folder = files_path().joinpath(landsat_name) - zip_file = files_path().joinpath(f"{landsat_name}.zip") - tar_file = files_path().joinpath(f"{landsat_name}.tar") - targz_file = files_path().joinpath(f"{landsat_name}.tar.gz") - sz_file = files_path().joinpath(f"{landsat_name}.7z") - - # VECTORS - vect_name = "map-overlay.kml" - vec_ok_path = ok_folder.joinpath(vect_name) - if shutil.which("ogr2ogr"): # Only works if ogr2ogr can be found. - vect_regex = f".*{vect_name}" - vect_zip = vectors.read(zip_file, archive_regex=vect_regex) - vect_tar = vectors.read(tar_file, archive_regex=r".*overlay\.kml") - vect_ok = vectors.read(vec_ok_path) - assert not vect_ok.empty - ci.assert_geom_equal(vect_ok, vect_zip) - ci.assert_geom_equal(vect_ok, vect_tar) - - # XML - xml_name = "LM05_L1TP_200030_20121230_20200820_02_T2_MTL.xml" - xml_ok_path = ok_folder.joinpath(xml_name) - xml_ok_path = str(s3.download(xml_ok_path, tmp_path)) - - xml_regex = f".*{xml_name}" - xml_zip = files.read_archived_xml(zip_file, xml_regex) - xml_tar = files.read_archived_xml(tar_file, r".*_MTL\.xml") - xml_ok = etree.parse(xml_ok_path).getroot() - ci.assert_xml_equal(xml_ok, xml_zip) - ci.assert_xml_equal(xml_ok, xml_tar) - - # FILE + HTML - html_zip_file = files_path().joinpath("productPreview.zip") - html_tar_file = files_path().joinpath("productPreview.tar") - html_name = "productPreview.html" - html_ok_path = files_path().joinpath(html_name) - html_ok_path = str(s3.download(html_ok_path, tmp_path)) - - html_regex = f".*{html_name}" - - # FILE - file_zip = files.read_archived_file(html_zip_file, html_regex) - file_tar = files.read_archived_file(html_tar_file, html_regex) - html_ok = html.parse(html_ok_path).getroot() - ci.assert_html_equal(html_ok, html.fromstring(file_zip)) - ci.assert_html_equal(html_ok, html.fromstring(file_tar)) - - file_list = path.get_archived_file_list(html_zip_file) - ci.assert_html_equal( - html_ok, - html.fromstring( - files.read_archived_file(html_zip_file, html_regex, file_list=file_list) - ), - ) - - # HTML - html_zip = files.read_archived_html(html_zip_file, html_regex) - html_tar = files.read_archived_html(html_tar_file, html_regex) - ci.assert_html_equal(html_ok, html_zip) - ci.assert_html_equal(html_ok, html_tar) - ci.assert_html_equal( - html_ok, - files.read_archived_html( - html_tar_file, - html_regex, - file_list=path.get_archived_file_list(html_tar_file), - ), - ) - - # ERRORS - with pytest.raises(TypeError): - files.read_archived_file(targz_file, xml_regex) - with pytest.raises(TypeError): - files.read_archived_file(sz_file, xml_regex) - with pytest.raises(FileNotFoundError): - files.read_archived_file(zip_file, "cdzeferf") - - def test_cp_rm(): """Test CP/RM functions""" with tempfile.TemporaryDirectory() as tmp_dir: diff --git a/CI/SCRIPTS/test_path.py b/CI/SCRIPTS/test_path.py index bf335ec..d173bc3 100644 --- a/CI/SCRIPTS/test_path.py +++ b/CI/SCRIPTS/test_path.py @@ -16,13 +16,12 @@ """Script testing the files""" import os -import shutil import tempfile import pytest -from CI.SCRIPTS.script_utils import files_path, get_s3_ci_path, s3_env -from sertit import AnyPath, ci, misc, path, vectors +from CI.SCRIPTS.script_utils import get_s3_ci_path +from sertit import AnyPath, ci, misc, path ci.reduce_verbosity() @@ -65,58 +64,6 @@ def test_paths(): assert not path.is_writable("cvfgbherth") # Non-existing -@s3_env -def test_archived_paths(): - landsat_name = "LM05_L1TP_200030_20121230_20200820_02_T2_CI" - ok_folder = files_path().joinpath(landsat_name) - zip_file = files_path().joinpath(f"{landsat_name}.zip") - tar_file = files_path().joinpath(f"{landsat_name}.tar") - targz_file = files_path().joinpath(f"{landsat_name}.tar.gz") - sz_file = files_path().joinpath(f"{landsat_name}.7z") - - # Archive file - tif_name = "LM05_L1TP_200030_20121230_20200820_02_T2_QA_RADSAT.TIF" - tif_ok = f"{ok_folder.name}/{tif_name}" - tif_regex = f".*{tif_name}" - assert tif_ok == path.get_archived_path(zip_file, tif_regex) - assert tif_ok == path.get_archived_path(zip_file, tif_regex, as_list=True)[0] - assert tif_ok == path.get_archived_path(tar_file, ".*RADSAT") - - # RASTERIO - tif_zip = path.get_archived_rio_path(zip_file, tif_regex) - tif_list = path.get_archived_rio_path(zip_file, tif_regex, as_list=True) - tif_tar = path.get_archived_rio_path(tar_file, ".*RADSAT") - tif_ok = ok_folder.joinpath(tif_name) - ci.assert_raster_equal(tif_ok, tif_zip) - ci.assert_raster_equal(tif_ok, tif_list[0]) - ci.assert_raster_equal(tif_ok, tif_tar) - - file_list = path.get_archived_file_list(zip_file) - ci.assert_raster_equal( - tif_ok, path.get_archived_rio_path(zip_file, tif_regex, file_list=file_list) - ) - - # VECTORS - vect_name = "map-overlay.kml" - vec_ok_path = ok_folder.joinpath(vect_name) - if shutil.which("ogr2ogr"): # Only works if ogr2ogr can be found. - vect_regex = f".*{vect_name}" - vect_zip = vectors.read(zip_file, archive_regex=vect_regex) - vect_tar = vectors.read(tar_file, archive_regex=r".*overlay\.kml") - vect_ok = vectors.read(vec_ok_path) - assert not vect_ok.empty - ci.assert_geom_equal(vect_ok, vect_zip) - ci.assert_geom_equal(vect_ok, vect_tar) - - # ERRORS - with pytest.raises(TypeError): - path.get_archived_rio_path(targz_file, tif_regex) - with pytest.raises(TypeError): - path.get_archived_rio_path(sz_file, tif_regex) - with pytest.raises(FileNotFoundError): - path.get_archived_rio_path(zip_file, "cdzeferf") - - def test_get_file_name(): """Test get_file_name""" file_name = path.get_filename(__file__) diff --git a/CI/SCRIPTS/test_vectors.py b/CI/SCRIPTS/test_vectors.py index 5f9bd92..5a79272 100644 --- a/CI/SCRIPTS/test_vectors.py +++ b/CI/SCRIPTS/test_vectors.py @@ -25,7 +25,7 @@ from shapely import wkt from CI.SCRIPTS.script_utils import KAPUT_KWARGS, files_path, s3_env, vectors_path -from sertit import ci, files, path, vectors +from sertit import archives, ci, files, path, vectors from sertit.vectors import EPSG_4326, DataSourceError ci.reduce_verbosity() @@ -294,7 +294,7 @@ def test_read_archived(): vectors.read(tar_landsat, archive_regex=map_overlay_regex), ) - file_list = path.get_archived_file_list(tar_landsat) + file_list = archives.get_archived_file_list(tar_landsat) ci.assert_geom_equal( map_overlay_extracted, vectors.read(tar_landsat, archive_regex=map_overlay_regex, file_list=file_list), diff --git a/sertit/archives.py b/sertit/archives.py new file mode 100644 index 0000000..2115071 --- /dev/null +++ b/sertit/archives.py @@ -0,0 +1,558 @@ +import logging +import os +import re +import shutil +import tarfile +import tempfile +import zipfile +from contextlib import contextmanager +from pathlib import Path +from typing import Union + +from lxml import etree, html +from tqdm import tqdm + +from sertit import AnyPath, logs, path, s3 +from sertit.logs import SU_NAME +from sertit.types import AnyPathStrType, AnyPathType + +LOGGER = logging.getLogger(SU_NAME) + + +@contextmanager +def open_zipfile(file_path, mode="r"): + if path.is_cloud_path(file_path): + file_path = s3.read(file_path) + + with zipfile.ZipFile(file_path, mode) as zip_file: + yield zip_file + + +@contextmanager +def open_tarfile(file_path, mode="r"): + if path.is_cloud_path(file_path): + args = {"fileobj": s3.read(file_path), "mode": mode} + else: + args = {"name": file_path, "mode": mode} + with tarfile.open(**args) as tar_file: + yield tar_file + + +def extract_file( + file_path: AnyPathStrType, + output: AnyPathStrType, + overwrite: bool = False, +) -> AnyPathType: + """ + Extract an archived file (zip or others). Overwrites if specified. + If the archive don't contain a root directory with the name of the archive without the extension, create it + + Args: + file_path (str): Archive file path + output (str): Output where to put the extracted directory + overwrite (bool): Overwrite found extracted directory + + Returns: + AnyPathType: Extracted directory paths + + Example: + >>> file_path = 'D:/path/to/zip.zip' + >>> output = 'D:/path/to/output' + >>> extract_file(file_path, output, overwrite=True) + D:/path/to/output/zip' + """ + # Convert to path + file_path = AnyPath(file_path) + output = AnyPath(output) + + # In case a folder is given, returns it (this means that the file is already extracted) + if file_path.is_dir(): + return file_path + + # Beware with .SEN3 and .SAFE extensions + archive_output = output.joinpath(path.get_filename(file_path)) + + # In case not overwrite and the extracted directory already exists + if not overwrite and archive_output.exists(): + LOGGER.debug( + "Already existing extracted %s. It won't be overwritten.", + archive_output, + ) + return archive_output + + def extract_sub_dir(arch, filename_list): + top_level_files = list({item.split("/")[0] for item in filename_list}) + + # When the only root directory in the archive has the right name, we don't have to create it + if len(top_level_files) == 1 and archive_output.name == path.get_filename( + top_level_files[0] + ): + arch.extractall(archive_output.parent) + archive_output.parent.joinpath(top_level_files[0]).rename(archive_output) + else: + arch.extractall(archive_output) + + # Manage archive type + if file_path.suffix == ".zip": + with open_zipfile(file_path) as zip_file: + extract_sub_dir(zip_file, zip_file.namelist()) + elif file_path.suffix == ".tar" or file_path.suffixes == [".tar", ".gz"]: + with open_tarfile(file_path) as tar_file: + extract_sub_dir(tar_file, tar_file.getnames()) + elif file_path.suffix == ".7z": + try: + import py7zr + + with py7zr.SevenZipFile(file_path, "r") as z7_file: + extract_sub_dir(z7_file, z7_file.getnames()) + except ModuleNotFoundError as exc: + raise TypeError("Please install 'py7zr' to extract .7z files") from exc + else: + raise TypeError( + f"Only .zip, .tar, .tar.gz and .7z files can be extracted, not {file_path}" + ) + + return archive_output + + +def extract_files( + archives: list, output: AnyPathStrType, overwrite: bool = False +) -> list: + """ + Extract all archived files. Overwrites if specified. + + Example: + >>> file_path = ['D:/path/to/zip1.zip', 'D:/path/to/zip2.zip'] + >>> output = 'D:/path/to/output' + >>> extract_files(file_path, output, overwrite=True) + ['D:/path/to/output.zip1', 'D:/path/to/output.zip2'] + + Args: + archives (list of str): List of archives to be extracted + output (str): Output folder where extracted files will be written + overwrite (bool): Overwrite found extracted files + + Returns: + list: Extracted files (even pre-existing ones) + """ + LOGGER.info("Extracting products in %s", output) + progress_bar = tqdm(archives) + extracts = [] + for arch in progress_bar: + progress_bar.set_description(f"Extracting product {os.path.basename(arch)}") + extracts.append(extract_file(arch, output, overwrite)) + + return extracts + + +def read_archived_file( + archive_path: AnyPathStrType, regex: str, file_list: list = None +) -> bytes: + """ + Read archived file (in bytes) from :code:`zip` or :code:`tar` archives. + + You can use this `site `_ to build your regex. + + Args: + archive_path (AnyPathStrType): Archive path + regex (str): Regex (used by re) as it can be found in the getmembers() list + file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed. + + Returns: + bytes: Archived file in bytes + """ + archive_path = AnyPath(archive_path) + + # Compile regex + regex = re.compile(regex) + + # Open tar and zip XML + try: + if archive_path.suffix == ".tar": + with open_tarfile(archive_path) as tar_ds: + # file_list is not very useful for TAR files... + if file_list is None: + tar_mb = tar_ds.getmembers() + file_list = [mb.name for mb in tar_mb] + name = list(filter(regex.match, file_list))[0] + tarinfo = tar_ds.getmember(name) + file_str = tar_ds.extractfile(tarinfo).read() + elif archive_path.suffix == ".zip": + with open_zipfile(archive_path) as zip_ds: + if file_list is None: + file_list = [f.filename for f in zip_ds.filelist] + name = list(filter(regex.match, file_list))[0] + file_str = zip_ds.read(name) + + elif archive_path.suffix == ".tar.gz": + raise TypeError( + ".tar.gz files are too slow to read from inside the archive. Please extract them instead." + ) + else: + raise TypeError( + "Only .zip and .tar files can be read from inside its archive." + ) + except IndexError as exc: + raise FileNotFoundError( + f"Impossible to find file {regex} in {path.get_filename(archive_path)}" + ) from exc + + return file_str + + +def read_archived_xml( + archive_path: AnyPathStrType, regex: str = None, file_list: list = None, **kwargs +) -> etree._Element: + """ + Read archived XML from :code:`zip` or :code:`tar` archives. + + You can use this `site `_ to build your regex. + + Args: + archive_path (AnyPathStrType): Archive path + regex (str): XML regex (used by re) as it can be found in the getmembers() list + file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed. + + Returns: + etree._Element: XML file + + Example: + >>> arch_path = 'D:/path/to/zip.zip' + >>> file_regex = '.*dir.*file_name' # Use .* for any character + >>> read_archived_xml(arch_path, file_regex) + + """ + if regex is None: + logs.deprecation_warning( + "'xml_regex' is deprecated, please use 'regex' instead." + ) + regex = kwargs.pop("xml_regex") + + xml_bytes = read_archived_file(archive_path, regex=regex, file_list=file_list) + + return etree.fromstring(xml_bytes) + + +def read_archived_html( + archive_path: AnyPathStrType, regex: str, file_list: list = None +) -> html.HtmlElement: + """ + Read archived HTML from :code:`zip` or :code:`tar` archives. + + You can use this `site `_ to build your regex. + + Args: + archive_path (AnyPathStrType): Archive path + regex (str): HTML regex (used by re) as it can be found in the getmembers() list + file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed. + + Returns: + html._Element: HTML file + + Example: + >>> arch_path = 'D:/path/to/zip.zip' + >>> file_regex = '.*dir.*file_name' # Use .* for any character + >>> read_archived_html(arch_path, file_regex) + + """ + html_bytes = read_archived_file(archive_path, regex, file_list=file_list) + + return html.fromstring(html_bytes) + + +def archive( + folder_path: AnyPathStrType, + archive_path: AnyPathStrType, + fmt: str = "zip", +) -> AnyPathType: + """ + Archives a folder recursively. + + Args: + folder_path (AnyPathStrType): Folder to archive + archive_path (AnyPathStrType): Archive path, with or without extension + fmt (str): Format of the archive, used by :code:`shutil.make_archive`. Choose between [zip, tar, gztar, bztar, xztar] + + Returns: + str: Archive filename + + Example: + >>> folder_path = 'D:/path/to/folder_to_archive' + >>> archive_path = 'D:/path/to/output' + >>> archive = archive(folder_path, archive_path, fmt="gztar") + 'D:/path/to/output/folder_to_archive.tar.gz' + """ + archive_path = AnyPath(archive_path) + folder_path = AnyPath(folder_path) + + tmp_dir = None + if path.is_cloud_path(folder_path): + tmp_dir = tempfile.TemporaryDirectory() + folder_path = folder_path.download_to(tmp_dir.name) + + # Shutil make_archive needs a path without extension + archive_base = os.path.splitext(archive_path)[0] + + # Archive the folder + archive_fn = shutil.make_archive( + archive_base, + format=fmt, + root_dir=folder_path.parent, + base_dir=folder_path.name, + ) + + if tmp_dir is not None: + tmp_dir.cleanup() + + return AnyPath(archive_fn) + + +def add_to_zip( + zip_path: AnyPathStrType, + dirs_to_add: Union[list, AnyPathStrType], +) -> AnyPathType: + """ + Add folders to an already existing zip file (recursively). + + Args: + zip_path (AnyPathStrType): Already existing zip file + dirs_to_add (Union[list, AnyPathStrType]): Directories to add + + Returns: + AnyPathType: Updated zip_path + + Example: + >>> zip_path = 'D:/path/to/zip.zip' + >>> dirs_to_add = ['D:/path/to/dir1', 'D:/path/to/dir2'] + >>> add_to_zip(zip_path, dirs_to_add) + zip.zip contains 2 more folders, dir1 and dir2 + """ + zip_path = AnyPath(zip_path) + + # If the zip is on the cloud, cache it (zipfile doesn't like cloud paths) + if path.is_cloud_path(zip_path): + zip_path = AnyPath(zip_path.fspath) + + # Check if existing zipfile + if not zip_path.is_file(): + raise FileNotFoundError(f"Non existing {zip_path}") + + # Convert to list if needed + if not isinstance(dirs_to_add, list): + dirs_to_add = [dirs_to_add] + + # Add all folders to the existing zip + # Forced to use ZipFile because make_archive only works with one folder and not existing zipfile + with open_zipfile(zip_path, "a") as zip_file: + progress_bar = tqdm(dirs_to_add) + for dir_to_add_path in progress_bar: + # Just to be sure, use str instead of Paths + if isinstance(dir_to_add_path, Path): + dir_to_add = str(dir_to_add_path) + elif path.is_cloud_path(dir_to_add_path): + dir_to_add = dir_to_add_path.fspath + else: + dir_to_add = dir_to_add_path + + progress_bar.set_description( + f"Adding {os.path.basename(dir_to_add)} to {os.path.basename(zip_path)}" + ) + tmp = tempfile.TemporaryDirectory() + if os.path.isfile(dir_to_add): + dir_to_add = extract_file(dir_to_add, tmp.name) + + for root, _, files in os.walk(dir_to_add): + base_path = os.path.join(dir_to_add, "..") + + # Write dir (in namelist at least) + zip_file.write(root, os.path.relpath(root, base_path)) + + # Write files + for file in files: + zip_file.write( + os.path.join(root, file), + os.path.relpath( + os.path.join(root, file), os.path.join(dir_to_add, "..") + ), + ) + + # Clean tmp + tmp.cleanup() + + return zip_path + + +def get_archived_file_list(archive_path: AnyPathStrType) -> list: + """ + Get the list of all the files contained in an archive. + + Args: + archive_path (AnyPathStrType): Archive path + + Returns: + list: All files contained in the given archive + + Example: + >>> arch_path = 'D:/path/to/zip.zip' + >>> get_archived_file_list(arch_path, file_regex) + ['file_1.txt', 'file_2.tif', 'file_3.xml', 'file_4.geojson'] + """ + archive_path = AnyPath(archive_path) + + is_zip = archive_path.suffix == ".zip" + archive_fn = path.get_filename(archive_path) + if is_zip: + with open_zipfile(archive_path) as zip_ds: + file_list = [f.filename for f in zip_ds.filelist] + else: + try: + with open_tarfile(archive_path) as tar_ds: + tar_mb = tar_ds.getmembers() + file_list = [mb.name for mb in tar_mb] + except tarfile.ReadError as ex: + raise tarfile.ReadError(f"Impossible to open archive: {archive_fn}") from ex + + return file_list + + +def get_archived_path( + archive_path: AnyPathStrType, + regex: str, + as_list: bool = False, + case_sensitive: bool = False, + file_list: list = None, + **kwargs, +) -> Union[list, AnyPathType]: + """ + Get archived file path from inside the archive. + + .. WARNING:: + If :code:`as_list` is :code:`False`, it will only return the first file matched ! + + You can use this `site `_ to build your regex. + + Args: + archive_path (AnyPathStrType): Archive path + regex (str): File regex (used by re) as it can be found in the getmembers() list + as_list (bool): If true, returns a list (including all found files). If false, returns only the first match + case_sensitive (bool): If true, the regex is case-sensitive. + file_list (list): List of files to get archived from. Optional, if not given it will be re-computed. + + Returns: + Union[list, str]: Path from inside the zipfile + + Example: + >>> arch_path = 'D:/path/to/zip.zip' + >>> file_regex = '.*dir.*file_name' # Use .* for any character + >>> path = get_archived_path(arch_path, file_regex) + 'dir/filename.tif' + """ + if regex is None: + logs.deprecation_warning( + "'file_regex' is deprecated, please use 'regex' instead." + ) + regex = kwargs.pop("file_regex") + + # Get file list + archive_path = AnyPath(archive_path) + + # Offer the ability to give the file list directly, as this operation is expensive when done with large archives stored on the cloud + if file_list is None: + file_list = get_archived_file_list(archive_path) + + # Search for file + re_rgx = re.compile(regex) if case_sensitive else re.compile(regex, re.IGNORECASE) + archived_band_paths = list(filter(re_rgx.match, file_list)) + if not archived_band_paths: + raise FileNotFoundError( + f"Impossible to find file {regex} in {path.get_filename(archive_path)}" + ) + + # Convert to str if needed + if not as_list: + archived_band_paths = archived_band_paths[0] + + return archived_band_paths + + +def get_archived_rio_path( + archive_path: AnyPathStrType, + regex: str, + as_list: bool = False, + file_list: list = None, + **kwargs, +) -> Union[list, AnyPathType]: + """ + Get archived file path from inside the archive, to be read with rasterio: + + - :code:`zip+file://{zip_path}!{file_name}` + - :code:`tar+file://{tar_path}!{file_name}` + + + See `here `_ + for more information. + + .. WARNING:: + It wont be readable by pandas, geopandas or xmltree ! + + .. WARNING:: + If :code:`as_list` is :code:`False`, it will only return the first file matched ! + + You can use this `site `_ to build your regex. + + Args: + archive_path (AnyPathStrType): Archive path + regex (str): File regex (used by re) as it can be found in the getmembers() list + as_list (bool): If true, returns a list (including all found files). If false, returns only the first match + file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed. + + Returns: + Union[list, str]: Band path that can be read by rasterio + + Example: + >>> arch_path = 'D:/path/to/zip.zip' + >>> file_regex = '.*dir.*file_name' # Use .* for any character + >>> path = get_archived_tif_path(arch_path, file_regex) + 'zip+file://D:/path/to/output.zip!dir/filename.tif' + >>> rasterio.open(path) + + """ + if regex is None: + logs.deprecation_warning( + "'file_regex' is deprecated, please use 'regex' instead." + ) + regex = kwargs.pop("file_regex") + + archive_path = AnyPath(archive_path) + if archive_path.suffix in [".tar", ".zip"]: + prefix = archive_path.suffix[-3:] + elif archive_path.suffix == ".tar.gz": + raise TypeError( + ".tar.gz files are too slow to be read from inside the archive. Please extract them instead." + ) + else: + raise TypeError("Only .zip and .tar files can be read from inside its archive.") + + # Search for file + archived_band_paths = get_archived_path( + archive_path, regex=regex, as_list=True, file_list=file_list + ) + + # Convert to rio path + if path.is_cloud_path(archive_path): + archived_band_paths = [ + f"{prefix}+file+{archive_path}!{p}" for p in archived_band_paths + ] + else: + # archived_band_paths = [ + # f"{prefix}+file://{archive_path}!{path}" for path in archived_band_paths + # ] + archived_band_paths = [ + f"/vsi{prefix}/{archive_path}/{p}" for p in archived_band_paths + ] + + # Convert to str if needed + if not as_list: + archived_band_paths = archived_band_paths[0] + + return archived_band_paths diff --git a/sertit/files.py b/sertit/files.py index 4bdb6c2..c2ec5a3 100644 --- a/sertit/files.py +++ b/sertit/files.py @@ -19,11 +19,7 @@ import json import logging import os -import re import shutil -import tarfile -import tempfile -import zipfile from datetime import date, datetime from enum import Enum from json import JSONDecoder, JSONEncoder @@ -32,10 +28,8 @@ import dill import numpy as np -from lxml import etree, html -from tqdm import tqdm -from sertit import AnyPath, logs, path +from sertit import AnyPath, logs, path, s3 from sertit.logs import SU_NAME from sertit.strings import DATE_FORMAT from sertit.types import AnyPathStrType, AnyPathType @@ -165,454 +159,6 @@ def real_rel_path(raw_path: AnyPathStrType, start: AnyPathStrType) -> AnyPathTyp return path.real_rel_path(raw_path, start) -def extract_file( - file_path: AnyPathStrType, - output: AnyPathStrType, - overwrite: bool = False, -) -> AnyPathType: - """ - Extract an archived file (zip or others). Overwrites if specified. - If the archive don't contain a root directory with the name of the archive without the extension, create it - - Args: - file_path (str): Archive file path - output (str): Output where to put the extracted directory - overwrite (bool): Overwrite found extracted directory - - Returns: - AnyPathType: Extracted directory paths - - Example: - >>> file_path = 'D:/path/to/zip.zip' - >>> output = 'D:/path/to/output' - >>> extract_file(file_path, output, overwrite=True) - D:/path/to/output/zip' - """ - # Convert to path - file_path = AnyPath(file_path) - output = AnyPath(output) - - # In case a folder is given, returns it (this means that the file is already extracted) - if file_path.is_dir(): - return file_path - - # Beware with .SEN3 and .SAFE extensions - archive_output = output.joinpath(path.get_filename(file_path)) - - # In case not overwrite and the extracted directory already exists - if not overwrite and archive_output.exists(): - LOGGER.debug( - "Already existing extracted %s. It won't be overwritten.", - archive_output, - ) - return archive_output - - def extract_sub_dir(arch, filename_list): - top_level_files = list({item.split("/")[0] for item in filename_list}) - - # When the only root directory in the archive has the right name, we don't have to create it - if len(top_level_files) == 1 and archive_output.name == path.get_filename( - top_level_files[0] - ): - arch.extractall(archive_output.parent) - archive_output.parent.joinpath(top_level_files[0]).rename(archive_output) - else: - arch.extractall(archive_output) - - # Manage archive type - if file_path.suffix == ".zip": - with zipfile.ZipFile(file_path, "r") as zip_file: - extract_sub_dir(zip_file, zip_file.namelist()) - elif file_path.suffix == ".tar" or file_path.suffixes == [".tar", ".gz"]: - with tarfile.open(file_path, "r") as tar_file: - extract_sub_dir(tar_file, tar_file.getnames()) - elif file_path.suffix == ".7z": - try: - import py7zr - - with py7zr.SevenZipFile(file_path, "r") as z7_file: - extract_sub_dir(z7_file, z7_file.getnames()) - except ModuleNotFoundError as exc: - raise TypeError("Please install 'py7zr' to extract .7z files") from exc - else: - raise TypeError( - f"Only .zip, .tar, .tar.gz and .7z files can be extracted, not {file_path}" - ) - - return archive_output - - -def extract_files( - archives: list, output: AnyPathStrType, overwrite: bool = False -) -> list: - """ - Extract all archived files. Overwrites if specified. - - Example: - >>> file_path = ['D:/path/to/zip1.zip', 'D:/path/to/zip2.zip'] - >>> output = 'D:/path/to/output' - >>> extract_files(file_path, output, overwrite=True) - ['D:/path/to/output.zip1', 'D:/path/to/output.zip2'] - - Args: - archives (list of str): List of archives to be extracted - output (str): Output folder where extracted files will be written - overwrite (bool): Overwrite found extracted files - - Returns: - list: Extracted files (even pre-existing ones) - """ - LOGGER.info("Extracting products in %s", output) - progress_bar = tqdm(archives) - extracts = [] - for arch in progress_bar: - progress_bar.set_description(f"Extracting product {os.path.basename(arch)}") - extracts.append(extract_file(arch, output, overwrite)) - - return extracts - - -def get_archived_file_list(archive_path: AnyPathStrType) -> list: - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files` - - Get the list of all the files contained in an archive. - - Args: - archive_path (AnyPathStrType): Archive path - - Returns: - list: All files contained in the given archive - - Example: - >>> arch_path = 'D:/path/to/zip.zip' - >>> get_archived_file_list(arch_path, file_regex) - ['file_1.txt', 'file_2.tif', 'file_3.xml', 'file_4.geojson'] - """ - logs.deprecation_warning( - "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'" - ) - return path.get_archived_file_list(archive_path) - - -def get_archived_path( - archive_path: AnyPathStrType, file_regex: str, as_list: bool = False -) -> Union[list, AnyPathType]: - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files` - - Get archived file path from inside the archive. - - .. WARNING:: - If :code:`as_list` is :code:`False`, it will only return the first file matched ! - - You can use this `site `_ to build your regex. - - Example: - >>> arch_path = 'D:/path/to/zip.zip' - >>> file_regex = '.*dir.*file_name' # Use .* for any character - >>> path = get_archived_path(arch_path, file_regex) - 'dir/filename.tif' - - Args: - archive_path (AnyPathStrType): Archive path - file_regex (str): File regex (used by re) as it can be found in the getmembers() list - as_list (bool): If true, returns a list (including all found files). If false, returns only the first match - - Returns: - Union[list, str]: Path from inside the zipfile - """ - logs.deprecation_warning( - "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'" - ) - return path.get_archived_path(archive_path, file_regex, as_list) - - -def get_archived_rio_path( - archive_path: AnyPathStrType, file_regex: str, as_list: bool = False -) -> Union[list, AnyPathType]: - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files` - - Get archived file path from inside the archive, to be read with rasterio: - - - :code:`zip+file://{zip_path}!{file_name}` - - :code:`tar+file://{tar_path}!{file_name}` - - - See `here `_ - for more information. - - .. WARNING:: - It won't be readable by pandas, geopandas or xmltree ! - - .. WARNING:: - If :code:`as_list` is :code:`False`, it will only return the first file matched ! - - You can use this `site `_ to build your regex. - - Args: - archive_path (AnyPathStrType): Archive path - file_regex (str): File regex (used by re) as it can be found in the getmembers() list - as_list (bool): If true, returns a list (including all found files). If false, returns only the first match - - Returns: - Union[list, str]: Band path that can be read by rasterio - - Example: - >>> arch_path = 'D:/path/to/zip.zip' - >>> file_regex = '.*dir.*file_name' # Use .* for any character - >>> path = get_archived_tif_path(arch_path, file_regex) - 'zip+file://D:/path/to/output.zip!dir/filename.tif' - >>> rasterio.open(path) - - """ - logs.deprecation_warning( - "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'" - ) - return path.get_archived_rio_path(archive_path, file_regex, as_list) - - -def read_archived_file( - archive_path: AnyPathStrType, regex: str, file_list: list = None -) -> bytes: - """ - Read archived file (in bytes) from :code:`zip` or :code:`tar` archives. - - You can use this `site `_ to build your regex. - - Args: - archive_path (AnyPathStrType): Archive path - regex (str): Regex (used by re) as it can be found in the getmembers() list - file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed. - - Returns: - bytes: Archived file in bytes - """ - archive_path = AnyPath(archive_path) - - # Compile regex - regex = re.compile(regex) - - # Open tar and zip XML - try: - if archive_path.suffix == ".tar": - with tarfile.open(archive_path) as tar_ds: - # file_list is not very useful for TAR files... - if file_list is None: - tar_mb = tar_ds.getmembers() - file_list = [mb.name for mb in tar_mb] - name = list(filter(regex.match, file_list))[0] - tarinfo = tar_ds.getmember(name) - file_str = tar_ds.extractfile(tarinfo).read() - elif archive_path.suffix == ".zip": - with zipfile.ZipFile(archive_path) as zip_ds: - if file_list is None: - file_list = [f.filename for f in zip_ds.filelist] - name = list(filter(regex.match, file_list))[0] - file_str = zip_ds.read(name) - - elif archive_path.suffix == ".tar.gz": - raise TypeError( - ".tar.gz files are too slow to read from inside the archive. Please extract them instead." - ) - else: - raise TypeError( - "Only .zip and .tar files can be read from inside its archive." - ) - except IndexError as exc: - raise FileNotFoundError( - f"Impossible to find file {regex} in {path.get_filename(archive_path)}" - ) from exc - - return file_str - - -def read_archived_xml( - archive_path: AnyPathStrType, regex: str = None, file_list: list = None, **kwargs -) -> etree._Element: - """ - Read archived XML from :code:`zip` or :code:`tar` archives. - - You can use this `site `_ to build your regex. - - Args: - archive_path (AnyPathStrType): Archive path - regex (str): XML regex (used by re) as it can be found in the getmembers() list - file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed. - - Returns: - etree._Element: XML file - - Example: - >>> arch_path = 'D:/path/to/zip.zip' - >>> file_regex = '.*dir.*file_name' # Use .* for any character - >>> read_archived_xml(arch_path, file_regex) - - """ - if regex is None: - logs.deprecation_warning( - "'xml_regex' is deprecated, please use 'regex' instead." - ) - regex = kwargs.pop("xml_regex") - - xml_bytes = read_archived_file(archive_path, regex=regex, file_list=file_list) - - return etree.fromstring(xml_bytes) - - -def read_archived_html( - archive_path: AnyPathStrType, regex: str, file_list: list = None -) -> html.HtmlElement: - """ - Read archived HTML from :code:`zip` or :code:`tar` archives. - - You can use this `site `_ to build your regex. - - Args: - archive_path (AnyPathStrType): Archive path - regex (str): HTML regex (used by re) as it can be found in the getmembers() list - file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed. - - Returns: - html._Element: HTML file - - Example: - >>> arch_path = 'D:/path/to/zip.zip' - >>> file_regex = '.*dir.*file_name' # Use .* for any character - >>> read_archived_html(arch_path, file_regex) - - """ - html_bytes = read_archived_file(archive_path, regex, file_list=file_list) - - return html.fromstring(html_bytes) - - -def archive( - folder_path: AnyPathStrType, - archive_path: AnyPathStrType, - fmt: str = "zip", -) -> AnyPathType: - """ - Archives a folder recursively. - - Args: - folder_path (AnyPathStrType): Folder to archive - archive_path (AnyPathStrType): Archive path, with or without extension - fmt (str): Format of the archive, used by :code:`shutil.make_archive`. Choose between [zip, tar, gztar, bztar, xztar] - - Returns: - str: Archive filename - - Example: - >>> folder_path = 'D:/path/to/folder_to_archive' - >>> archive_path = 'D:/path/to/output' - >>> archive = archive(folder_path, archive_path, fmt="gztar") - 'D:/path/to/output/folder_to_archive.tar.gz' - """ - archive_path = AnyPath(archive_path) - folder_path = AnyPath(folder_path) - - tmp_dir = None - if path.is_cloud_path(folder_path): - tmp_dir = tempfile.TemporaryDirectory() - folder_path = folder_path.download_to(tmp_dir.name) - - # Shutil make_archive needs a path without extension - archive_base = os.path.splitext(archive_path)[0] - - # Archive the folder - archive_fn = shutil.make_archive( - archive_base, - format=fmt, - root_dir=folder_path.parent, - base_dir=folder_path.name, - ) - - if tmp_dir is not None: - tmp_dir.cleanup() - - return AnyPath(archive_fn) - - -def add_to_zip( - zip_path: AnyPathStrType, - dirs_to_add: Union[list, AnyPathStrType], -) -> AnyPathType: - """ - Add folders to an already existing zip file (recursively). - - Args: - zip_path (AnyPathStrType): Already existing zip file - dirs_to_add (Union[list, AnyPathStrType]): Directories to add - - Returns: - AnyPathType: Updated zip_path - - Example: - >>> zip_path = 'D:/path/to/zip.zip' - >>> dirs_to_add = ['D:/path/to/dir1', 'D:/path/to/dir2'] - >>> add_to_zip(zip_path, dirs_to_add) - zip.zip contains 2 more folders, dir1 and dir2 - """ - zip_path = AnyPath(zip_path) - - # If the zip is on the cloud, cache it (zipfile doesn't like cloud paths) - if path.is_cloud_path(zip_path): - zip_path = AnyPath(zip_path.fspath) - - # Check if existing zipfile - if not zip_path.is_file(): - raise FileNotFoundError(f"Non existing {zip_path}") - - # Convert to list if needed - if not isinstance(dirs_to_add, list): - dirs_to_add = [dirs_to_add] - - # Add all folders to the existing zip - # Forced to use ZipFile because make_archive only works with one folder and not existing zipfile - with zipfile.ZipFile(zip_path, "a") as zip_file: - progress_bar = tqdm(dirs_to_add) - for dir_to_add_path in progress_bar: - # Just to be sure, use str instead of Paths - if isinstance(dir_to_add_path, Path): - dir_to_add = str(dir_to_add_path) - elif path.is_cloud_path(dir_to_add_path): - dir_to_add = dir_to_add_path.fspath - else: - dir_to_add = dir_to_add_path - - progress_bar.set_description( - f"Adding {os.path.basename(dir_to_add)} to {os.path.basename(zip_path)}" - ) - tmp = tempfile.TemporaryDirectory() - if os.path.isfile(dir_to_add): - dir_to_add = extract_file(dir_to_add, tmp.name) - - for root, _, files in os.walk(dir_to_add): - base_path = os.path.join(dir_to_add, "..") - - # Write dir (in namelist at least) - zip_file.write(root, os.path.relpath(root, base_path)) - - # Write files - for file in files: - zip_file.write( - os.path.join(root, file), - os.path.relpath( - os.path.join(root, file), os.path.join(dir_to_add, "..") - ), - ) - - # Clean tmp - tmp.cleanup() - - return zip_path - - def get_filename(file_path: AnyPathStrType, other_exts: Union[list, str] = None) -> str: """ .. deprecated:: 1.30.0 @@ -754,7 +300,7 @@ def copy(src: AnyPathStrType, dst: AnyPathStrType) -> AnyPathType: src = AnyPath(src) if path.is_cloud_path(src): - out = src.download_to(dst) + out = s3.download(src, dst) else: out = None try: diff --git a/sertit/path.py b/sertit/path.py index 48e9b90..30451e1 100644 --- a/sertit/path.py +++ b/sertit/path.py @@ -19,13 +19,10 @@ import logging import os import pprint -import re -import tarfile import tempfile -import zipfile from typing import Any, Union -from sertit import AnyPath, logs, s3 +from sertit import AnyPath from sertit.logs import SU_NAME from sertit.types import AnyPathStrType, AnyPathType @@ -150,189 +147,6 @@ def real_rel_path(raw_path: AnyPathStrType, start: AnyPathStrType) -> AnyPathTyp return rel_path -def get_archived_file_list(archive_path: AnyPathStrType) -> list: - """ - Get the list of all the files contained in an archive. - - Args: - archive_path (AnyPathStrType): Archive path - - Returns: - list: All files contained in the given archive - - Example: - >>> arch_path = 'D:/path/to/zip.zip' - >>> get_archived_file_list(arch_path, file_regex) - ['file_1.txt', 'file_2.tif', 'file_3.xml', 'file_4.geojson'] - """ - archive_path = AnyPath(archive_path) - - is_zip = archive_path.suffix == ".zip" - archive_fn = get_filename(archive_path) - if is_zip: - if is_cloud_path(archive_path): - archive_path = s3.read(archive_path) - - with zipfile.ZipFile(archive_path) as zip_ds: - file_list = [f.filename for f in zip_ds.filelist] - else: - try: - if is_cloud_path(archive_path): - args = {"fileobj": s3.read(archive_path), "mode": "r"} - else: - args = {"name": archive_path, "mode": "r"} - with tarfile.open(**args) as tar_ds: - tar_mb = tar_ds.getmembers() - file_list = [mb.name for mb in tar_mb] - except tarfile.ReadError as ex: - raise tarfile.ReadError(f"Impossible to open archive: {archive_fn}") from ex - - return file_list - - -def get_archived_path( - archive_path: AnyPathStrType, - regex: str, - as_list: bool = False, - case_sensitive: bool = False, - file_list: list = None, - **kwargs, -) -> Union[list, AnyPathType]: - """ - Get archived file path from inside the archive. - - .. WARNING:: - If :code:`as_list` is :code:`False`, it will only return the first file matched ! - - You can use this `site `_ to build your regex. - - Args: - archive_path (AnyPathStrType): Archive path - regex (str): File regex (used by re) as it can be found in the getmembers() list - as_list (bool): If true, returns a list (including all found files). If false, returns only the first match - case_sensitive (bool): If true, the regex is case-sensitive. - file_list (list): List of files to get archived from. Optional, if not given it will be re-computed. - - Returns: - Union[list, str]: Path from inside the zipfile - - Example: - >>> arch_path = 'D:/path/to/zip.zip' - >>> file_regex = '.*dir.*file_name' # Use .* for any character - >>> path = get_archived_path(arch_path, file_regex) - 'dir/filename.tif' - """ - if regex is None: - logs.deprecation_warning( - "'file_regex' is deprecated, please use 'regex' instead." - ) - regex = kwargs.pop("file_regex") - - # Get file list - archive_path = AnyPath(archive_path) - - # Offer the ability to give the file list directly, as this operation is expensive when done with large archives stored on the cloud - if file_list is None: - file_list = get_archived_file_list(archive_path) - - # Search for file - re_rgx = re.compile(regex) if case_sensitive else re.compile(regex, re.IGNORECASE) - archived_band_paths = list(filter(re_rgx.match, file_list)) - if not archived_band_paths: - raise FileNotFoundError( - f"Impossible to find file {regex} in {get_filename(archive_path)}" - ) - - # Convert to str if needed - if not as_list: - archived_band_paths = archived_band_paths[0] - - return archived_band_paths - - -def get_archived_rio_path( - archive_path: AnyPathStrType, - regex: str, - as_list: bool = False, - file_list: list = None, - **kwargs, -) -> Union[list, AnyPathType]: - """ - Get archived file path from inside the archive, to be read with rasterio: - - - :code:`zip+file://{zip_path}!{file_name}` - - :code:`tar+file://{tar_path}!{file_name}` - - - See `here `_ - for more information. - - .. WARNING:: - It wont be readable by pandas, geopandas or xmltree ! - - .. WARNING:: - If :code:`as_list` is :code:`False`, it will only return the first file matched ! - - You can use this `site `_ to build your regex. - - Args: - archive_path (AnyPathStrType): Archive path - regex (str): File regex (used by re) as it can be found in the getmembers() list - as_list (bool): If true, returns a list (including all found files). If false, returns only the first match - file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed. - - Returns: - Union[list, str]: Band path that can be read by rasterio - - Example: - >>> arch_path = 'D:/path/to/zip.zip' - >>> file_regex = '.*dir.*file_name' # Use .* for any character - >>> path = get_archived_tif_path(arch_path, file_regex) - 'zip+file://D:/path/to/output.zip!dir/filename.tif' - >>> rasterio.open(path) - - """ - if regex is None: - logs.deprecation_warning( - "'file_regex' is deprecated, please use 'regex' instead." - ) - regex = kwargs.pop("file_regex") - - archive_path = AnyPath(archive_path) - if archive_path.suffix in [".tar", ".zip"]: - prefix = archive_path.suffix[-3:] - elif archive_path.suffix == ".tar.gz": - raise TypeError( - ".tar.gz files are too slow to be read from inside the archive. Please extract them instead." - ) - else: - raise TypeError("Only .zip and .tar files can be read from inside its archive.") - - # Search for file - archived_band_paths = get_archived_path( - archive_path, regex=regex, as_list=True, file_list=file_list - ) - - # Convert to rio path - if is_cloud_path(archive_path): - archived_band_paths = [ - f"{prefix}+file+{archive_path}!{path}" for path in archived_band_paths - ] - else: - # archived_band_paths = [ - # f"{prefix}+file://{archive_path}!{path}" for path in archived_band_paths - # ] - archived_band_paths = [ - f"/vsi{prefix}/{archive_path}/{path}" for path in archived_band_paths - ] - - # Convert to str if needed - if not as_list: - archived_band_paths = archived_band_paths[0] - - return archived_band_paths - - def get_filename(file_path: AnyPathStrType, other_exts: Union[list, str] = None) -> str: """ Get file name (without extension) from file path, ie: diff --git a/sertit/vectors.py b/sertit/vectors.py index 9b214b2..87856e9 100644 --- a/sertit/vectors.py +++ b/sertit/vectors.py @@ -23,9 +23,7 @@ import os import re import shutil -import tarfile import tempfile -import zipfile from collections.abc import Generator from contextlib import contextmanager from typing import Any, Union @@ -36,7 +34,7 @@ from cloudpathlib.exceptions import AnyPathTypeError from shapely import Polygon, wkt -from sertit import AnyPath, files, geometry, logs, misc, path, strings +from sertit import AnyPath, archives, files, geometry, logs, misc, path, s3, strings from sertit.logs import SU_NAME from sertit.types import AnyPathStrType, AnyPathType @@ -256,8 +254,11 @@ def get_aoi_wkt(aoi_path: AnyPathStrType, as_str: bool = True) -> Union[str, Pol if aoi_path.suffix == ".wkt": try: - with open(aoi_path) as aoi_f: - aoi = wkt.load(aoi_f) + if path.is_cloud_path(aoi_path): + aoi = wkt.load(s3.read(aoi_path)) + else: + with open(aoi_path) as aoi_f: + aoi = wkt.load(aoi_f) except Exception as ex: raise ValueError("AOI WKT cannot be read") from ex else: @@ -471,13 +472,17 @@ def read( if "!" in str(vector_path): split_vect = str(vector_path).split("!") archive_regex = ".*{}".format(split_vect[1].replace(".", r"\.")) - vector_path = AnyPath(split_vect[0]) + try: + vector_path = AnyPath(split_vect[0], **vector_path.storage_options) + except Exception: + # Cloudpathlib + vector_path = AnyPath(split_vect[0]) # Manage archive case if vector_path.suffix in [".tar", ".zip"]: prefix = vector_path.suffix[-3:] file_list = kwargs.pop( - "file_list", path.get_archived_file_list(vector_path) + "file_list", archives.get_archived_file_list(vector_path) ) try: @@ -710,16 +715,16 @@ def ogr2geojson( # archived vector_path are extracted in a tmp folder so no need to be downloaded if vector_path.suffix == ".zip": - with zipfile.ZipFile(vector_path, "r") as zip_ds: + with archives.open_zipfile(vector_path, "r") as zip_ds: vect_path = zip_ds.extract(arch_vect_path, out_dir) elif vector_path.suffix == ".tar": - with tarfile.open(vector_path, "r") as tar_ds: + with archives.open_tarfile(vector_path, "r") as tar_ds: tar_ds.extract(arch_vect_path, out_dir) vect_path = os.path.join(out_dir, arch_vect_path) else: # vector_path should be downloaded to work with 'ogr2ogr' if path.is_cloud_path(vector_path): - vector_path = AnyPath(vector_path).fspath + vector_path = s3.download(vector_path, out_dir) vect_path = vector_path vect_path_gj = os.path.join( diff --git a/sertit/xml.py b/sertit/xml.py index 9ddc44a..8d334ef 100644 --- a/sertit/xml.py +++ b/sertit/xml.py @@ -30,7 +30,7 @@ ) from lxml.html.builder import E -from sertit import AnyPath, files, path +from sertit import AnyPath, archives, logs, path, s3 from sertit.logs import SU_NAME from sertit.misc import ListEnum from sertit.types import AnyPathStrType @@ -61,7 +61,7 @@ def read(xml_path: AnyPathStrType) -> _Element: # Slower but works with: # {ValueError}Unicode strings with encoding declaration are not supported. # Please use bytes input or XML fragments without declaration. - root = fromstring(xml_path.read_bytes()) + root = fromstring(s3.read(xml_path).read()) else: # pylint: disable=I1101: # Module 'lxml.etree' has no 'parse' member, but source is unavailable. @@ -75,7 +75,10 @@ def read(xml_path: AnyPathStrType) -> _Element: def read_archive( - path: AnyPathStrType, regex: str = None, file_list: list = None + archive_path: AnyPathStrType, + regex: str = None, + file_list: list = None, + **kwargs, ) -> _Element: """ Read an XML file from inside an archive (zip or tar) @@ -87,25 +90,40 @@ def read_archive( - path to the archive plus a regex looking inside the archive. Duplicate behaviour to :py:func:`files.read_archived_xml` Args: - path (AnyPathStrType): Path to the XML file, stored inside an archive or path to the archive itself + archive_path (AnyPathStrType): Path to the XML file, stored inside an archive or path to the archive itself regex (str): Optional. If specified, the path should be the archive path and the regex should be the key to find the XML file inside the archive. file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed. Returns: _Element: XML Root """ + if archive_path is None: + logs.deprecation_warning( + "'path' argument is deprecated, use 'archive_path' instead." + ) + archive_path = kwargs.pop("path") try: if not regex: - path, basename = str(path).split("!") + archive_base_path, basename = str(archive_path).split("!") regex = basename - if path.startswith("zip://") or path.startswith("tar://"): - path = path[5:] + if archive_base_path.startswith("zip://") or archive_base_path.startswith( + "tar://" + ): + archive_base_path = archive_base_path[5:] + + # For UPath + with contextlib.suppress(Exception): + archive_base_path = AnyPath( + archive_base_path, **archive_path.storage_options + ) + else: + archive_base_path = archive_path - return files.read_archived_xml(path, regex, file_list=file_list) + return archives.read_archived_xml(archive_base_path, regex, file_list=file_list) except XMLSyntaxError as exc: - raise ValueError(f"Invalid metadata XML for {path}!") from exc + raise ValueError(f"Invalid metadata XML for {archive_path}!") from exc def write(xml: _Element, path: str) -> None: From fa06fafb1054a8c522b7a3bbe0fae42628c00bfc Mon Sep 17 00:00:00 2001 From: BRAUN REMI Date: Fri, 13 Dec 2024 14:57:26 +0100 Subject: [PATCH 05/18] Simplify pre-commit hooks --- .pre-commit-config.yaml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 38aaf7b..1275d62 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,16 +10,12 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks.git rev: v5.0.0 hooks: - - id: trailing-whitespace - - id: end-of-file-fixer - id: check-json - id: check-yaml args: [ --allow-multiple-documents, --unsafe ] - id: check-xml - id: check-added-large-files args: [ '--maxkb=1600' ] - - id: debug-statements - - id: check-merge-conflict - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. @@ -30,3 +26,13 @@ repos: args: [ --fix ] # Run the formatter. - id: ruff-format + + + + + + + + + + From 8b95aac87cd925628e6c3bb374335244e147ac5c Mon Sep 17 00:00:00 2001 From: BRAUN REMI Date: Fri, 13 Dec 2024 15:50:21 +0100 Subject: [PATCH 06/18] Make UPath work with all archive functions and dcmp --- CI/SCRIPTS/test_archives.py | 8 ++- sertit/archives.py | 108 +++++++++++++++++++----------------- sertit/ci.py | 47 +++++++++------- sertit/s3.py | 17 +++++- 4 files changed, 106 insertions(+), 74 deletions(-) diff --git a/CI/SCRIPTS/test_archives.py b/CI/SCRIPTS/test_archives.py index 3415618..ddb5d0f 100644 --- a/CI/SCRIPTS/test_archives.py +++ b/CI/SCRIPTS/test_archives.py @@ -8,6 +8,7 @@ from sertit import archives, ci, files, path, s3, vectors +@s3_env def test_archive(tmp_path): """Test extracting functions""" # Archives @@ -31,6 +32,11 @@ def test_archive(tmp_path): # Extract extracted_dirs = archives.extract_files(arch, tmp_path, overwrite=True) + + # Test + for ex_dir in extracted_dirs: + ci.assert_dir_equal(core_dir, ex_dir) + archives.extract_files([zip2_file], tmp_path, overwrite=False) # Already existing # Test @@ -54,7 +60,7 @@ def test_archive(tmp_path): # Add to zip zip_out = zip2_file if path.is_cloud_path(zip2_file) else archive_base + ".zip" core_copy = files.copy(core_dir, os.path.join(tmp_path, "core2")) - zip_out = archives.add_to_zip(zip_out, core_copy) + zip_out = archives.add_to_zip(s3.download(zip_out, tmp_path), core_copy) # Extract unzip_out = os.path.join(tmp_path, "out") diff --git a/sertit/archives.py b/sertit/archives.py index 2115071..b739e5f 100644 --- a/sertit/archives.py +++ b/sertit/archives.py @@ -285,10 +285,14 @@ def archive( archive_path = AnyPath(archive_path) folder_path = AnyPath(folder_path) + # with zipfile.ZipFile(archive_path, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf: + # for f in folder_path.glob("**"): + # zipf.write(f, f.relative_to(folder_path.name)) + tmp_dir = None if path.is_cloud_path(folder_path): tmp_dir = tempfile.TemporaryDirectory() - folder_path = folder_path.download_to(tmp_dir.name) + folder_path = s3.download(folder_path, tmp_dir.name) # Shutil make_archive needs a path without extension archive_base = os.path.splitext(archive_path)[0] @@ -304,7 +308,12 @@ def archive( if tmp_dir is not None: tmp_dir.cleanup() - return AnyPath(archive_fn) + try: + arch = AnyPath(archive_fn, folder_path.storage_options) + except Exception: + arch = AnyPath(archive_fn) + + return arch def add_to_zip( @@ -329,55 +338,54 @@ def add_to_zip( """ zip_path = AnyPath(zip_path) - # If the zip is on the cloud, cache it (zipfile doesn't like cloud paths) - if path.is_cloud_path(zip_path): - zip_path = AnyPath(zip_path.fspath) - - # Check if existing zipfile - if not zip_path.is_file(): - raise FileNotFoundError(f"Non existing {zip_path}") - - # Convert to list if needed - if not isinstance(dirs_to_add, list): - dirs_to_add = [dirs_to_add] - - # Add all folders to the existing zip - # Forced to use ZipFile because make_archive only works with one folder and not existing zipfile - with open_zipfile(zip_path, "a") as zip_file: - progress_bar = tqdm(dirs_to_add) - for dir_to_add_path in progress_bar: - # Just to be sure, use str instead of Paths - if isinstance(dir_to_add_path, Path): - dir_to_add = str(dir_to_add_path) - elif path.is_cloud_path(dir_to_add_path): - dir_to_add = dir_to_add_path.fspath - else: - dir_to_add = dir_to_add_path - - progress_bar.set_description( - f"Adding {os.path.basename(dir_to_add)} to {os.path.basename(zip_path)}" + with tempfile.TemporaryDirectory() as tmp_dir: + # If the zip is on the cloud, cache it (zipfile doesn't like cloud paths) + if path.is_cloud_path(zip_path): + raise NotImplementedError( + "Impossible (for now) to update a zip stored in the cloud!" ) - tmp = tempfile.TemporaryDirectory() - if os.path.isfile(dir_to_add): - dir_to_add = extract_file(dir_to_add, tmp.name) - - for root, _, files in os.walk(dir_to_add): - base_path = os.path.join(dir_to_add, "..") - - # Write dir (in namelist at least) - zip_file.write(root, os.path.relpath(root, base_path)) - - # Write files - for file in files: - zip_file.write( - os.path.join(root, file), - os.path.relpath( - os.path.join(root, file), os.path.join(dir_to_add, "..") - ), - ) - - # Clean tmp - tmp.cleanup() + + # Check if existing zipfile + if not zip_path.is_file(): + raise FileNotFoundError(f"Non existing {zip_path}") + + # Convert to list if needed + if not isinstance(dirs_to_add, list): + dirs_to_add = [dirs_to_add] + + # Add all folders to the existing zip + # Forced to use ZipFile because make_archive only works with one folder and not existing zipfile + with open_zipfile(zip_path, "a") as zip_file: + progress_bar = tqdm(dirs_to_add) + for dir_to_add_path in progress_bar: + # Just to be sure, use str instead of Paths + if isinstance(dir_to_add_path, Path): + dir_to_add = str(dir_to_add_path) + elif path.is_cloud_path(dir_to_add_path): + dir_to_add = dir_to_add_path.fspath + else: + dir_to_add = dir_to_add_path + + progress_bar.set_description( + f"Adding {os.path.basename(dir_to_add)} to {os.path.basename(zip_path)}" + ) + if os.path.isfile(dir_to_add): + dir_to_add = extract_file(dir_to_add, tmp_dir) + + for root, _, files in os.walk(dir_to_add): + base_path = os.path.join(dir_to_add, "..") + + # Write dir (in namelist at least) + zip_file.write(root, os.path.relpath(root, base_path)) + + # Write files + for file in files: + zip_file.write( + os.path.join(root, file), + os.path.relpath( + os.path.join(root, file), os.path.join(dir_to_add, "..") + ), + ) return zip_path diff --git a/sertit/ci.py b/sertit/ci.py index 8b000ba..a0280e4 100644 --- a/sertit/ci.py +++ b/sertit/ci.py @@ -20,6 +20,7 @@ import filecmp import logging import pprint +import tempfile from doctest import Example from typing import Any, Union @@ -30,7 +31,7 @@ from shapely import force_2d, normalize from shapely.testing import assert_geometries_equal -from sertit import AnyPath, files, s3, unistra +from sertit import AnyPath, files, path, s3, unistra from sertit.logs import SU_NAME, deprecation_warning from sertit.types import AnyPathStrType, AnyXrDataStructure @@ -381,27 +382,33 @@ def assert_dir_equal(path_1: AnyPathStrType, path_2: AnyPathStrType) -> None: assert path_1.is_dir(), f"{path_1} is not a directory!" assert path_2.is_dir(), f"{path_2} is not a directory!" - dcmp = filecmp.dircmp(path_1, path_2) - try: - assert ( - dcmp.left_only == [] - ), f"More files in {path_1}!\n{pprint.pformat(list(dcmp.left_only))}" - assert ( - dcmp.right_only == [] - ), f"More files in {path_2}!\n{pprint.pformat(list(dcmp.right_only))}" - except FileNotFoundError: - files_1 = [AnyPath(p).name for p in AnyPath(path_1).iterdir()] - files_2 = [AnyPath(p).name for p in AnyPath(path_2).iterdir()] - - for f1 in files_1: - assert ( - f1 in files_2 - ), f"File missing!\n{f1} not in {pprint.pformat(files_2)}" + with tempfile.TemporaryDirectory() as tmpdir: + if path.is_cloud_path(path_1): + path_1 = s3.download(path_1, tmpdir) + if path.is_cloud_path(path_2): + path_2 = s3.download(path_2, tmpdir) - for f2 in files_2: + dcmp = filecmp.dircmp(path_1, path_2) + try: + assert ( + dcmp.left_only == [] + ), f"More files in {path_1}!\n{pprint.pformat(list(dcmp.left_only))}" assert ( - f2 in files_1 - ), f"File missing!\n{f2} not in {pprint.pformat(files_1)}" + dcmp.right_only == [] + ), f"More files in {path_2}!\n{pprint.pformat(list(dcmp.right_only))}" + except FileNotFoundError: + files_1 = [p.name for p in path_1.iterdir()] + files_2 = [p.name for p in path_2.iterdir()] + + for f1 in files_1: + assert ( + f1 in files_2 + ), f"File missing!\n{f1} not in {pprint.pformat(files_2)}" + + for f2 in files_2: + assert ( + f2 in files_1 + ), f"File missing!\n{f2} not in {pprint.pformat(files_1)}" def assert_geom_equal( diff --git a/sertit/s3.py b/sertit/s3.py index aba8660..55cae8f 100644 --- a/sertit/s3.py +++ b/sertit/s3.py @@ -288,10 +288,21 @@ def download(src, dst): import shutil dst = AnyPath(dst) - downloaded_path = dst / src.name if dst.is_dir() else dst + if dst.is_dir() and src.name != dst.name: + downloaded_path = dst / src.name + else: + downloaded_path = dst - with src.open("rb") as f0, downloaded_path.open("wb") as f1: - shutil.copyfileobj(f0, f1) + if src.is_file(): + with src.open("rb") as f0, downloaded_path.open("wb") as f1: + shutil.copyfileobj(f0, f1) + else: + for f in src.glob("**"): + dst_file = downloaded_path / f.name + if f.is_file(): + dst_file.parent.mkdir(parents=True, exist_ok=True) + with f.open("rb") as f0, dst_file.open("wb") as f1: + shutil.copyfileobj(f0, f1) # cloudpathlib elif isinstance(src, CloudPath): From 39190377a5e026cb2d722649f0ef9f31eff884c8 Mon Sep 17 00:00:00 2001 From: BRAUN REMI Date: Fri, 13 Dec 2024 16:19:22 +0100 Subject: [PATCH 07/18] Make the functions work with cloudpathlib also --- CI/SCRIPTS/script_utils.py | 13 ++++++--- CI/SCRIPTS/test_types.py | 12 ++++++-- CI/SCRIPTS/test_unistra.py | 5 +++- sertit/archives.py | 4 +-- sertit/ci.py | 7 +++-- sertit/path.py | 22 +++++++++++---- sertit/s3.py | 58 +++++++++++++++++++++----------------- sertit/types.py | 12 ++++++-- sertit/vectors.py | 6 ++-- sertit/xml.py | 4 +-- 10 files changed, 94 insertions(+), 49 deletions(-) diff --git a/CI/SCRIPTS/script_utils.py b/CI/SCRIPTS/script_utils.py index ba450df..583682a 100644 --- a/CI/SCRIPTS/script_utils.py +++ b/CI/SCRIPTS/script_utils.py @@ -38,13 +38,18 @@ class Polarization(ListEnum): def get_s3_ci_path(): """Get S3 CI path""" - # unistra.define_s3_client() from sertit.unistra import UNISTRA_S3_ENDPOINT - return AnyPath( - "s3://sertit-sertit-utils-ci", endpoint_url=f"https://{UNISTRA_S3_ENDPOINT}" - ) + try: + ci_path = AnyPath( + "s3://sertit-sertit-utils-ci", endpoint_url=f"https://{UNISTRA_S3_ENDPOINT}" + ) + except TypeError: + unistra.define_s3_client() + ci_path = AnyPath("s3://sertit-sertit-utils-ci") + + return ci_path def get_proj_path(): diff --git a/CI/SCRIPTS/test_types.py b/CI/SCRIPTS/test_types.py index 01b246e..1daf305 100644 --- a/CI/SCRIPTS/test_types.py +++ b/CI/SCRIPTS/test_types.py @@ -2,12 +2,20 @@ from typing import Union import numpy as np -from cloudpathlib import CloudPath -from upath import UPath from sertit import AnyPath from sertit.types import AnyPathType, is_iterable, make_iterable +try: + from upath import UPath +except ImportError: + UPath = None + +try: + from cloudpathlib import CloudPath +except ImportError: + CloudPath = None + def test_types(): """Test some type aliases""" diff --git a/CI/SCRIPTS/test_unistra.py b/CI/SCRIPTS/test_unistra.py index 8910b34..a0738da 100644 --- a/CI/SCRIPTS/test_unistra.py +++ b/CI/SCRIPTS/test_unistra.py @@ -73,7 +73,10 @@ def test_unistra_s3(): assert with_s3() == 1 # Test get_geodatastore with s3 - assert str(get_geodatastore()) == "s3://sertit-geodatastore/" + try: + assert str(get_geodatastore()) == "s3://sertit-geodatastore/" + except AssertionError: + assert str(get_geodatastore()) == "s3://sertit-geodatastore" # Test get_geodatastore without s3 with tempenv.TemporaryEnvironment({s3.USE_S3_STORAGE: "0"}): diff --git a/sertit/archives.py b/sertit/archives.py index b739e5f..990dad2 100644 --- a/sertit/archives.py +++ b/sertit/archives.py @@ -309,8 +309,8 @@ def archive( tmp_dir.cleanup() try: - arch = AnyPath(archive_fn, folder_path.storage_options) - except Exception: + arch = AnyPath(archive_fn, storage_options=folder_path.storage_options) + except AttributeError: arch = AnyPath(archive_fn) return arch diff --git a/sertit/ci.py b/sertit/ci.py index a0280e4..d8d2956 100644 --- a/sertit/ci.py +++ b/sertit/ci.py @@ -382,11 +382,14 @@ def assert_dir_equal(path_1: AnyPathStrType, path_2: AnyPathStrType) -> None: assert path_1.is_dir(), f"{path_1} is not a directory!" assert path_2.is_dir(), f"{path_2} is not a directory!" - with tempfile.TemporaryDirectory() as tmpdir: + with ( + tempfile.TemporaryDirectory() as tmpdir, + tempfile.TemporaryDirectory() as tmpdir2, + ): if path.is_cloud_path(path_1): path_1 = s3.download(path_1, tmpdir) if path.is_cloud_path(path_2): - path_2 = s3.download(path_2, tmpdir) + path_2 = s3.download(path_2, tmpdir2) dcmp = filecmp.dircmp(path_1, path_2) try: diff --git a/sertit/path.py b/sertit/path.py index 30451e1..504c526 100644 --- a/sertit/path.py +++ b/sertit/path.py @@ -15,6 +15,7 @@ # limitations under the License. """Tools for paths""" +import contextlib import errno import logging import os @@ -420,7 +421,7 @@ def is_cloud_path(path: AnyPathStrType): "gs", "gcs", ] - except ImportError: + except AttributeError: try: from cloudpathlib import CloudPath @@ -431,17 +432,26 @@ def is_cloud_path(path: AnyPathStrType): def is_path(path: Any) -> bool: """ - Determine whether the path corresponds to a file stored on the cloud or not. + Determine whether the path is really a path or not: either str, Path, UPath or CloudPath Args: path (AnyPathStrType): File path Returns: - bool: True if the file is store on the cloud. + bool: True if the file is a path """ from pathlib import Path - from cloudpathlib import CloudPath - from upath import UPath + is_path = isinstance(path, (str, Path)) + + with contextlib.suppress(ImportError): + from upath import UPath + + is_path = is_path or isinstance(path, UPath) + + with contextlib.suppress(ImportError): + from cloudpathlib import CloudPath + + is_path = is_path or isinstance(path, CloudPath) - return isinstance(path, (str, Path, CloudPath, UPath)) + return is_path diff --git a/sertit/s3.py b/sertit/s3.py index 55cae8f..bc14535 100644 --- a/sertit/s3.py +++ b/sertit/s3.py @@ -17,6 +17,7 @@ S3 tools """ +import contextlib import logging import os from contextlib import contextmanager @@ -279,34 +280,39 @@ def download(src, dst): # By default, use the src path downloaded_path = src - if path.is_path(src): - from cloudpathlib import CloudPath - from upath import UPath - - # Universal pathlib - if isinstance(src, UPath): - import shutil - - dst = AnyPath(dst) - if dst.is_dir() and src.name != dst.name: - downloaded_path = dst / src.name - else: - downloaded_path = dst - - if src.is_file(): - with src.open("rb") as f0, downloaded_path.open("wb") as f1: - shutil.copyfileobj(f0, f1) - else: - for f in src.glob("**"): - dst_file = downloaded_path / f.name - if f.is_file(): - dst_file.parent.mkdir(parents=True, exist_ok=True) - with f.open("rb") as f0, dst_file.open("wb") as f1: - shutil.copyfileobj(f0, f1) + # Universal pathlib + if path.is_cloud_path(src): + import shutil + + with contextlib.suppress(ImportError): + from upath import UPath + + if isinstance(src, UPath): + dst = AnyPath(dst) + if dst.is_dir() and src.name != dst.name: + downloaded_path = dst / src.name + else: + downloaded_path = dst + + if src.is_file(): + with src.open("rb") as f0, downloaded_path.open("wb") as f1: + shutil.copyfileobj(f0, f1) + else: + downloaded_path.parent.mkdir(parents=True, exist_ok=True) + + for f in src.glob("**"): + dst_file = downloaded_path / f.name + if f.is_file(): + dst_file.parent.mkdir(parents=True, exist_ok=True) + with f.open("rb") as f0, dst_file.open("wb") as f1: + shutil.copyfileobj(f0, f1) # cloudpathlib - elif isinstance(src, CloudPath): - downloaded_path = src.fspath if dst is None else src.download_to(dst) + with contextlib.suppress(ImportError): + from cloudpathlib import CloudPath + + if isinstance(src, CloudPath): + downloaded_path = src.fspath if dst is None else src.download_to(dst) return downloaded_path diff --git a/sertit/types.py b/sertit/types.py index 433fe29..3cea7a9 100644 --- a/sertit/types.py +++ b/sertit/types.py @@ -5,10 +5,18 @@ import geopandas as gpd import numpy as np import xarray as xr -from cloudpathlib import CloudPath from rasterio.io import DatasetReader, DatasetWriter from shapely import MultiPolygon, Polygon -from upath import UPath + +try: + from upath import UPath +except ImportError: + UPath = None + +try: + from cloudpathlib import CloudPath +except ImportError: + CloudPath = None AnyPathType = Union[CloudPath, Path, UPath] """Any Path Type (derived from Pathlib, Universal Pathlib and CloudpathLib)""" diff --git a/sertit/vectors.py b/sertit/vectors.py index 87856e9..2f9df17 100644 --- a/sertit/vectors.py +++ b/sertit/vectors.py @@ -473,8 +473,10 @@ def read( split_vect = str(vector_path).split("!") archive_regex = ".*{}".format(split_vect[1].replace(".", r"\.")) try: - vector_path = AnyPath(split_vect[0], **vector_path.storage_options) - except Exception: + vector_path = AnyPath( + split_vect[0], storage_options=vector_path.storage_options + ) + except AttributeError: # Cloudpathlib vector_path = AnyPath(split_vect[0]) diff --git a/sertit/xml.py b/sertit/xml.py index 8d334ef..d52dfa0 100644 --- a/sertit/xml.py +++ b/sertit/xml.py @@ -113,9 +113,9 @@ def read_archive( archive_base_path = archive_base_path[5:] # For UPath - with contextlib.suppress(Exception): + with contextlib.suppress(AttributeError): archive_base_path = AnyPath( - archive_base_path, **archive_path.storage_options + archive_base_path, storage_options=archive_path.storage_options ) else: archive_base_path = archive_path From fd953c2c82043bf0b003f1ae0fcaf9e6aeca10c4 Mon Sep 17 00:00:00 2001 From: BRAUN REMI Date: Fri, 13 Dec 2024 16:50:00 +0100 Subject: [PATCH 08/18] **ENH: Use `universal_pathlib` instead of `cloudpathlib` (even if the code is still compatible with `cloudpathlib`)** --- CHANGES.md | 4 ++++ pyproject.toml | 5 +++-- requirements.txt | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index bb9dc5f..ec37141 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,9 @@ # Release History +## 2.0.0 (20xx-xx-xx) + +- **ENH: Use `universal_pathlib` instead of `cloudpathlib` (even if the code is still compatible with `cloudpathlib`)** ([#4](https://github.com/sertit/sertit-utils/issues/4)) + ## 1.44.x (20xx-xx-xx) - **ENH: Drop `isort`, `black` and `flake8` and use `ruff`** diff --git a/pyproject.toml b/pyproject.toml index 1faf5d5..aed03c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,8 +35,8 @@ dependencies = [ "dill", "psutil", "geopandas>=0.14.4", - "cloudpathlib[all]>=0.12.1", "xarray>=2024.06.0", + "universal_pathlib>=0.2.6" ] dynamic = ["version"] @@ -60,7 +60,8 @@ dask = [ "odc-geo>=0.4.6", "xarray-spatial>=0.3.6", ] -full = ["sertit[colorlog,rasters_rio,rasters,dask]"] +cloudpathlib = ["cloudpathlib[all]>=0.12.1"] +full = ["sertit[colorlog,rasters_rio,rasters,dask,cloudpathlib]"] [project.urls] Bug_Tracker = "https://github.com/sertit/sertit-utils/issues" diff --git a/requirements.txt b/requirements.txt index d02da9c..c4682aa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,6 +19,7 @@ lxml dill psutil geopandas>=0.14.4 +universal_pathlib>=0.2.6 cloudpathlib[all]>=0.12.1 xarray>=2024.06.0 shapely >= 2.0.0 From 455716a9df927ae1092d7bc5f7ce3d7d49065e6d Mon Sep 17 00:00:00 2001 From: BRAUN REMI Date: Fri, 13 Dec 2024 16:58:34 +0100 Subject: [PATCH 09/18] BREAKING CHANGE: Remove all deprecations from `sertit==1.*` #3 --- CHANGES.md | 6 + CI/SCRIPTS/test_files.py | 5 - CI/SCRIPTS/test_rasters.py | 52 ------ CI/SCRIPTS/test_rasters_rio.py | 54 ------ CI/SCRIPTS/test_vectors.py | 10 -- sertit/archives.py | 20 +-- sertit/arcpy.py | 28 --- sertit/ci.py | 59 +------ sertit/files.py | 314 +-------------------------------- sertit/rasters.py | 46 ----- sertit/rasters_rio.py | 53 +----- sertit/vectors.py | 42 +---- sertit/xml.py | 8 +- 13 files changed, 15 insertions(+), 682 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index ec37141..c82e4cb 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,12 @@ ## 2.0.0 (20xx-xx-xx) +- **BREAKING CHANGE**: Remove all deprecations from `sertit==1.*` ([#3](https://github.com/sertit/sertit-utils/issues/3)): + - Duplication between `path` and `files` modules + - Duplication between `ci`, `s3` and `unistra` modules + - Arguments in functions + - Renaming functions + - Others - **ENH: Use `universal_pathlib` instead of `cloudpathlib` (even if the code is still compatible with `cloudpathlib`)** ([#4](https://github.com/sertit/sertit-utils/issues/4)) ## 1.44.x (20xx-xx-xx) diff --git a/CI/SCRIPTS/test_files.py b/CI/SCRIPTS/test_files.py index a0f9889..84539f7 100644 --- a/CI/SCRIPTS/test_files.py +++ b/CI/SCRIPTS/test_files.py @@ -20,7 +20,6 @@ from datetime import date, datetime import numpy as np -import pytest from CI.SCRIPTS.script_utils import Polarization from sertit import AnyPath, ci, files @@ -93,10 +92,6 @@ def test_json(): ) # Enum are stored following their value assert obj == test_dict - # Test deprecation - with pytest.deprecated_call(): - files.save_json(json_file, test_dict) - def test_pickle(): """Test pickle functions""" diff --git a/CI/SCRIPTS/test_rasters.py b/CI/SCRIPTS/test_rasters.py index 0fe7d2b..d97c5ea 100644 --- a/CI/SCRIPTS/test_rasters.py +++ b/CI/SCRIPTS/test_rasters.py @@ -32,8 +32,6 @@ INT8_NODATA, UINT8_NODATA, UINT16_NODATA, - any_raster_to_xr_ds, - get_nodata_value_from_dtype, get_nodata_value_from_xr, ) from sertit.vectors import EPSG_4326 @@ -675,11 +673,6 @@ def test_write(dtype, nodata_val, tmp_path, xda): ) _test_raster_after_write(test_path, dtype, nodata_val) - # test deprecation warning - test_deprecated_path = os.path.join(tmp_path, "test_depr.tif") - with pytest.deprecated_call(): - rasters.write(xda, path=test_deprecated_path, dtype=dtype) - def test_dim(): """Test on BEAM-DIMAP function""" @@ -900,51 +893,6 @@ def test_rasterize(tmp_path, raster_path): ci.assert_raster_almost_equal(raster_true_path, out_path, decimal=4) -@s3_env -def test_decorator_deprecation(raster_path): - from sertit.rasters import path_xarr_dst - - @any_raster_to_xr_ds - def _ok_rasters(xds): - assert isinstance(xds, xr.DataArray) - return xds - - @path_xarr_dst - def _depr_rasters(xds): - assert isinstance(xds, xr.DataArray) - return xds - - # Not able to warn deprecation from inside the decorator - xr.testing.assert_equal(_ok_rasters(raster_path), _depr_rasters(raster_path)) - - -def test_get_nodata_deprecation(): - """Test deprecation of get_nodata_value""" - # Test deprecation - for dtype in [ - np.uint8, - np.int8, - np.uint16, - np.uint32, - np.int32, - np.int64, - np.uint64, - int, - "int", - np.int16, - np.float32, - np.float64, - float, - "float", - ]: - with pytest.deprecated_call(): - from sertit.rasters import get_nodata_value - - ci.assert_val( - get_nodata_value_from_dtype(dtype), get_nodata_value(dtype), dtype - ) - - @s3_env @dask_env def test_get_notata_from_xr(raster_path): diff --git a/CI/SCRIPTS/test_rasters_rio.py b/CI/SCRIPTS/test_rasters_rio.py index 7eb921a..62e6664 100644 --- a/CI/SCRIPTS/test_rasters_rio.py +++ b/CI/SCRIPTS/test_rasters_rio.py @@ -26,7 +26,6 @@ from CI.SCRIPTS.script_utils import KAPUT_KWARGS, rasters_path, s3_env from sertit import ci, rasters_rio, vectors -from sertit.rasters_rio import any_raster_to_rio_ds, get_nodata_value_from_dtype from sertit.vectors import EPSG_4326 ci.reduce_verbosity() @@ -421,56 +420,3 @@ def _test_idx(idx_list): _test_idx([1]) _test_idx([1, 2]) _test_idx(1) - - -@s3_env -def test_decorator_deprecation(raster_path): - from sertit.rasters_rio import path_arr_dst - - @any_raster_to_rio_ds - def _ok_rasters(ds): - return ds.read() - - @path_arr_dst - def _depr_rasters(ds): - return ds.read() - - # Not able to warn deprecation from inside the decorator - np.testing.assert_equal(_ok_rasters(raster_path), _depr_rasters(raster_path)) - - -def test_get_nodata_deprecation(): - """Test deprecation of get_nodata_value""" - # Test deprecation - for dtype in [ - np.uint8, - np.int8, - np.uint16, - np.uint32, - np.int32, - np.int64, - np.uint64, - int, - "int", - np.int16, - np.float32, - np.float64, - float, - "float", - ]: - with pytest.deprecated_call(): - from sertit.rasters_rio import get_nodata_value - - ci.assert_val( - get_nodata_value_from_dtype(dtype), get_nodata_value(dtype), dtype - ) - - -@s3_env -def test_write_deprecated(tmp_path, raster_path): - test_deprecated_path = os.path.join(tmp_path, "test_depr.tif") - raster, mtd = rasters_rio.read(raster_path) - - # test deprecation warning - with pytest.deprecated_call(): - rasters_rio.write(raster, mtd, path=test_deprecated_path) diff --git a/CI/SCRIPTS/test_vectors.py b/CI/SCRIPTS/test_vectors.py index 5a79272..98c9dd2 100644 --- a/CI/SCRIPTS/test_vectors.py +++ b/CI/SCRIPTS/test_vectors.py @@ -21,7 +21,6 @@ import geopandas as gpd import pytest -from rasterio import CRS from shapely import wkt from CI.SCRIPTS.script_utils import KAPUT_KWARGS, files_path, s3_env, vectors_path @@ -81,15 +80,6 @@ def test_vectors(): aoi = vectors.read(kml_path, **KAPUT_KWARGS) _assert_attributes(aoi, kml_path) - with pytest.deprecated_call(): - assert ( - vectors.corresponding_utm_projection(aoi.centroid.x, aoi.centroid.y) - == "EPSG:32638" - ) - assert CRS.from_string("EPSG:32638") == vectors.to_utm_crs( - aoi.centroid.x, aoi.centroid.y - ) - env = aoi.envelope[0] # Test kwargs (should be slightly not equal toi AOI to prove bbox does sth) diff --git a/sertit/archives.py b/sertit/archives.py index 990dad2..d8dc86a 100644 --- a/sertit/archives.py +++ b/sertit/archives.py @@ -12,7 +12,7 @@ from lxml import etree, html from tqdm import tqdm -from sertit import AnyPath, logs, path, s3 +from sertit import AnyPath, path, s3 from sertit.logs import SU_NAME from sertit.types import AnyPathStrType, AnyPathType @@ -222,12 +222,6 @@ def read_archived_xml( >>> read_archived_xml(arch_path, file_regex) """ - if regex is None: - logs.deprecation_warning( - "'xml_regex' is deprecated, please use 'regex' instead." - ) - regex = kwargs.pop("xml_regex") - xml_bytes = read_archived_file(archive_path, regex=regex, file_list=file_list) return etree.fromstring(xml_bytes) @@ -455,12 +449,6 @@ def get_archived_path( >>> path = get_archived_path(arch_path, file_regex) 'dir/filename.tif' """ - if regex is None: - logs.deprecation_warning( - "'file_regex' is deprecated, please use 'regex' instead." - ) - regex = kwargs.pop("file_regex") - # Get file list archive_path = AnyPath(archive_path) @@ -525,12 +513,6 @@ def get_archived_rio_path( >>> rasterio.open(path) """ - if regex is None: - logs.deprecation_warning( - "'file_regex' is deprecated, please use 'regex' instead." - ) - regex = kwargs.pop("file_regex") - archive_path = AnyPath(archive_path) if archive_path.suffix in [".tar", ".zip"]: prefix = archive_path.suffix[-3:] diff --git a/sertit/arcpy.py b/sertit/arcpy.py index a82f603..64cc9f9 100644 --- a/sertit/arcpy.py +++ b/sertit/arcpy.py @@ -1,8 +1,6 @@ import logging import logging.handlers -from sertit.logs import deprecation_warning - # Arcpy types from inside a schema SHORT = "int32:4" """ 'Short' type for ArcGis GDB """ @@ -153,32 +151,6 @@ def emit(self, record): super(ArcPyLogHandler, self).emit(record) -def feature_layer_to_path(feature_layer) -> str: - """ - .. deprecated:: 1.36.0 - Use :py:func:`gp_layer_to_path` instead. - - Use :func:`gp_layer_to_path` instead. - - Convert a feature layer to its source path. - - Args: - feature_layer: Feature layer - - Returns: - str: Path to the feature layer source - - """ - deprecation_warning("This function is deprecated. Use gp_layer_to_path instead.") - # Get path - if hasattr(feature_layer, "dataSource"): - path = feature_layer.dataSource - else: - path = str(feature_layer) - - return path - - def gp_layer_to_path(feature_layer) -> str: """ Convert a GP layer to its source path. diff --git a/sertit/ci.py b/sertit/ci.py index d8d2956..43b28e3 100644 --- a/sertit/ci.py +++ b/sertit/ci.py @@ -31,8 +31,8 @@ from shapely import force_2d, normalize from shapely.testing import assert_geometries_equal -from sertit import AnyPath, files, path, s3, unistra -from sertit.logs import SU_NAME, deprecation_warning +from sertit import AnyPath, files, path, s3 +from sertit.logs import SU_NAME from sertit.types import AnyPathStrType, AnyXrDataStructure LOGGER = logging.getLogger(SU_NAME) @@ -43,61 +43,6 @@ AWS_S3_ENDPOINT = s3.AWS_S3_ENDPOINT -def s3_env(*args, **kwargs): - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.unistra` instead of :py:mod:`sertit.ci` - """ - deprecation_warning( - "This function is deprecated. Import it from 'sertit.unistra' instead of 'sertit.ci'" - ) - return unistra.s3_env(*args, **kwargs) - - -def define_s3_client(): - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.unistra` instead of :py:mod:`sertit.ci` - """ - deprecation_warning( - "This function is deprecated. Import it from 'sertit.unistra' instead of 'sertit.ci'" - ) - return unistra.define_s3_client() - - -def get_db2_path(): - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.unistra` instead of :py:mod:`sertit.ci` - """ - deprecation_warning( - "This function is deprecated. Import it from 'sertit.unistra' instead of 'sertit.ci'" - ) - return unistra.get_db2_path() - - -def get_db3_path(): - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.unistra` instead of :py:mod:`sertit.ci` - """ - deprecation_warning( - "This function is deprecated. Import it from 'sertit.unistra' instead of 'sertit.ci'" - ) - return unistra.get_db3_path() - - -def get_db4_path(): - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.unistra` instead of :py:mod:`sertit.ci` - """ - deprecation_warning( - "This function is deprecated. Import it from 'sertit.unistra' instead of 'sertit.ci'" - ) - return unistra.get_db4_path() - - def assert_val(val_1: Any, val_2: Any, field: str) -> None: """ Compare two values corresponding to a field diff --git a/sertit/files.py b/sertit/files.py index c2ec5a3..7be06e9 100644 --- a/sertit/files.py +++ b/sertit/files.py @@ -24,12 +24,12 @@ from enum import Enum from json import JSONDecoder, JSONEncoder from pathlib import Path -from typing import Any, Union +from typing import Any import dill import numpy as np -from sertit import AnyPath, logs, path, s3 +from sertit import AnyPath, path, s3 from sertit.logs import SU_NAME from sertit.strings import DATE_FORMAT from sertit.types import AnyPathStrType, AnyPathType @@ -37,180 +37,6 @@ LOGGER = logging.getLogger(SU_NAME) -def get_root_path() -> AnyPathType: - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files` - - Get the root path of the current disk: - - - On Linux this returns :code:`/` - - On Windows this returns :code:`C:/` or whatever the current drive is - - Example: - >>> get_root_path() - "/" on Linux - "C:/" on Windows (if you run this code from the C: drive) - """ - logs.deprecation_warning( - "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'" - ) - return path.get_root_path() - - -def listdir_abspath(directory: AnyPathStrType) -> list: - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files` - - Get absolute path of all files in the given directory. - - It is the same function than :code:`os.listdir` but returning absolute paths. - - Args: - directory (AnyPathStrType): Relative or absolute path to the directory to be scanned - - Returns: - str: Absolute path of all files in the given directory - - Example: - >>> folder = "." - >>> listdir_abspath(folder) - ['D:/_SERTIT_UTILS/sertit-utils/sertit/files.py', - 'D:/_SERTIT_UTILS/sertit-utils/sertit/logs.py', - 'D:/_SERTIT_UTILS/sertit-utils/sertit/misc.py', - 'D:/_SERTIT_UTILS/sertit-utils/sertit/network.py', - 'D:/_SERTIT_UTILS/sertit-utils/sertit/rasters_rio.py', - 'D:/_SERTIT_UTILS/sertit-utils/sertit/strings.py', - 'D:/_SERTIT_UTILS/sertit-utils/sertit/vectors.py', - 'D:/_SERTIT_UTILS/sertit-utils/sertit/version.py', - 'D:/_SERTIT_UTILS/sertit-utils/sertit/__init__.py'] - """ - logs.deprecation_warning( - "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'" - ) - return path.listdir_abspath(directory) - - -def to_abspath( - raw_path: AnyPathStrType, - create: bool = True, - raise_file_not_found: bool = True, -) -> AnyPathType: - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files` - - Return the absolute path of the specified path and check if it exists - - If not: - - - If it is a file (aka has an extension), it raises an exception - - If it is a folder, it creates it - - To be used with argparse to retrieve the absolute path of a file, like: - - Args: - raw_path (AnyPathStrType): Path as a string (relative or absolute) - create (bool): Create directory if not existing - - Returns: - AnyPathType: Absolute path - - Example: - >>> parser = argparse.ArgumentParser() - >>> # Add config file path key - >>> parser.add_argument( - >>> "--config", - >>> help="Config file path (absolute or relative)", - >>> type=to_abspath - >>> ) - """ - logs.deprecation_warning( - "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'" - ) - return path.to_abspath(raw_path, create, raise_file_not_found) - - -def real_rel_path(raw_path: AnyPathStrType, start: AnyPathStrType) -> AnyPathType: - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files` - - Gives the real relative path from a starting folder. - (and not just adding :code:`../..` between the start and the target) - - Args: - raw_path (AnyPathStrType): Path to make relative - start (AnyPathStrType): Start, the path being relative from this folder. - - Returns: - Relative path - - Example: - >>> path = r'D:/_SERTIT_UTILS/sertit-utils/sertit' - >>> start = os.path.join(".", "..", "..") - >>> real_rel_path(path, start) - 'sertit-utils/sertit' - """ - logs.deprecation_warning( - "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'" - ) - return path.real_rel_path(raw_path, start) - - -def get_filename(file_path: AnyPathStrType, other_exts: Union[list, str] = None) -> str: - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files` - - Get file name (without extension) from file path, i.e.: - - Args: - file_path (AnyPathStrType): Absolute or relative file path (the file doesn't need to exist) - other_exts (Union[list, str]): Other double extensions to discard - - Returns: - str: File name (without extension) - - Example: - >>> file_path = 'D:/path/to/filename.zip' - >>> get_file_name(file_path) - 'filename' - """ - logs.deprecation_warning( - "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'" - ) - return path.get_filename(file_path, other_exts) - - -def get_ext(file_path: AnyPathStrType) -> str: - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files` - - Get file extension from file path. - - .. WARNING:: - Extension is given WITHOUT THE FIRST POINT - - Args: - file_path (AnyPathStrType): Absolute or relative file path (the file doesn't need to exist) - - Returns: - str: File name (without extension) - - Example: - >>> file_path = 'D:/path/to/filename.zip' - >>> get_ext(file_path) - 'zip' - """ - logs.deprecation_warning( - "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'" - ) - return path.get_ext(file_path) - - def remove(path: AnyPathStrType) -> None: """ Deletes a file or a directory (recursively) using :code:`shutil.rmtree` or :code:`os.remove`. @@ -318,54 +144,6 @@ def copy(src: AnyPathStrType, dst: AnyPathStrType) -> AnyPathType: return out -def find_files( - names: Union[list, str], - root_paths: Union[list, AnyPathStrType], - max_nof_files: int = -1, - get_as_str: bool = False, -) -> Union[list, str]: - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files` - - Returns matching files recursively from a list of root paths. - - Regex are allowed (using glob) - - Args: - names (Union[list, str]): File names. - root_paths (Union[list, str]): Root paths - max_nof_files (int): Maximum number of files (set to -1 for unlimited) - get_as_str (bool): if only one file is found, it can be retrieved as a string instead of a list - - Returns: - list: File name - - Examples: - >>> root_path = 'D:/root' - >>> dir1_path = 'D:/root/dir1' - >>> dir2_path = 'D:/root/dir2' - >>> - >>> os.listdir(dir1_path) - ["haha.txt", "huhu.txt", "hoho.txt"] - >>> os.listdir(dir2_path) - ["huhu.txt", "hehe.txt"] - >>> - >>> find_files("huhu.txt", root_path) - ['D:/root/dir1/huhu.txt', 'D:/root/dir2/huhu.txt'] - >>> - >>> find_files("huhu.txt", root_path, max_nof_files=1) - ['D:/root/dir1/huhu.txt'] - - >>> find_files("huhu.txt", root_path, max_nof_files=1, get_as_str=True) - found = 'D:/root/dir1/huhu.txt' - """ - logs.deprecation_warning( - "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'" - ) - return path.find_files(names, root_paths, max_nof_files, get_as_str) - - # subclass JSONDecoder class CustomDecoder(JSONDecoder): """Decoder for JSON with methods for datetimes""" @@ -473,15 +251,6 @@ def save_json(json_dict: dict, output_json: AnyPathStrType, **kwargs) -> None: >>> json_dict = {"A": np.int64(1), "B": datetime.today(), "C": SomeEnum.some_name} >>> save_json(output_json, json_dict) """ - if isinstance(output_json, dict): - # Old order. Swap the variables. - logs.deprecation_warning( - "The order of the function has changed. Please set json_dict in first!" - ) - tmp = output_json - output_json = json_dict - json_dict = tmp - kwargs["indent"] = kwargs.get("indent", 3) kwargs["cls"] = kwargs.get("cls", CustomEncoder) @@ -528,66 +297,6 @@ def load_obj(path: AnyPathStrType) -> Any: return dill.load(file) -# too many arguments -# pylint: disable=R0913 -def get_file_in_dir( - directory: AnyPathStrType, - pattern_str: str, - extension: str = None, - filename_only: bool = False, - get_list: bool = False, - exact_name: bool = False, -) -> Union[AnyPathType, list]: - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files` - - Get one or all matching files (pattern + extension) from inside a directory. - - Note that the pattern is a regex with glob's convention, i.e. :code:`*pattern*`. - - If :code:`exact_name` is :code:`False`, the searched pattern will be :code:`*{pattern}*.{extension}`, - else :code:`{pattern}.{extension}`. - - Args: - directory (str): Directory where to find the files - pattern_str (str): Pattern wanted as a string, with glob's convention. - extension (str): Extension wanted, optional. With or without point. (:code:`yaml` or :code:`.yaml` accepted) - filename_only (bool): Get only the filename - get_list (bool): Get the whole list of matching files - exact_name (bool): Get the exact name (without adding :code:`*` before and after the given pattern) - - Returns: - Union[AnyPathType, list]: File - - Example: - >>> directory = 'D:/path/to/dir' - >>> os.listdir(directory) - ["haha.txt", "huhu1.txt", "huhu1.geojson", "hoho.txt"] - >>> - >>> get_file_in_dir(directory, "huhu") - 'D:/path/to/dir/huhu1.geojson' - >>> - >>> get_file_in_dir(directory, "huhu", extension="txt") - 'D:/path/to/dir/huhu1.txt' - >>> - >>> get_file_in_dir(directory, "huhu", get_list=True) - ['D:/path/to/dir/huhu1.txt', 'D:/path/to/dir/huhu1.geojson'] - >>> - >>> get_file_in_dir(directory, "huhu", filename_only=True, get_list=True) - ['huhu1.txt', 'huhu1.geojson'] - >>> - >>> get_file_in_dir(directory, "huhu", get_list=True, exact_name=True) - [] - """ - logs.deprecation_warning( - "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'" - ) - return path.get_file_in_dir( - directory, pattern_str, extension, filename_only, get_list, exact_name - ) - - # pylint: disable=E1121 def hash_file_content(file_content: str, len_param: int = 5) -> str: """ @@ -610,22 +319,3 @@ def hash_file_content(file_content: str, len_param: int = 5) -> str: hasher = hashlib.shake_256() hasher.update(str.encode(file_content)) return hasher.hexdigest(len_param) - - -def is_writable(dir_path: AnyPathStrType): - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files` - - Determine whether the directory is writeable or not. - - Args: - dir_path (AnyPathStrType): Directory path - - Returns: - bool: True if the directory is writable - """ - logs.deprecation_warning( - "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'" - ) - return path.is_writable(dir_path) diff --git a/sertit/rasters.py b/sertit/rasters.py index aa7dd35..ccbd2a4 100644 --- a/sertit/rasters.py +++ b/sertit/rasters.py @@ -126,25 +126,6 @@ def get_nodata_value_from_dtype(dtype) -> float: return rasters_rio.get_nodata_value_from_dtype(dtype) -def get_nodata_value(dtype) -> float: - """ - .. deprecated:: 1.41.0 - Use :code:`get_nodata_value_from_dtype` instead. - - Get default nodata value: - - Args: - dtype: Dtype for the wanted nodata. Best if numpy's dtype. - - Returns: - float: Nodata value - """ - logs.deprecation_warning( - "This function is deprecated. Use 'get_nodata_value_from_dtype' instead." - ) - return get_nodata_value_from_dtype(dtype) - - def any_raster_to_xr_ds(function: Callable) -> Callable: """ Allows a function to ingest AnyRasterType and convert it into a xr.DataArray: @@ -256,27 +237,6 @@ def wrapper(any_raster_type: AnyRasterType, *args, **kwargs) -> Any: return wrapper -def path_xarr_dst(function: Callable) -> Callable: - """ - .. deprecated:: 1.40.0 - Use :py:func:`rasters.any_raster_to_xr_ds` instead. - """ - logs.deprecation_warning( - "Deprecated 'path_xarr_dst' decorator. Please use 'any_raster_to_xr_ds' instead." - ) - return any_raster_to_xr_ds(function) - - -@any_raster_to_xr_ds -def get_nodata_mask(xds: AnyXrDataStructure) -> np.ndarray: - """ - .. deprecated:: 1.36.0 - Use :py:func:`rasters.get_data_mask` instead. - """ - logs.deprecation_warning("This function is deprecated. Use 'get_data_mask' instead") - return get_data_mask(xds) - - @any_raster_to_xr_ds def get_data_mask(xds: AnyXrDataStructure) -> np.ndarray: """ @@ -1104,12 +1064,6 @@ def write( >>> # Rewrite it >>> write(xds, raster_out) """ - if output_path is None: - logs.deprecation_warning( - "'path' is deprecated in 'rasters.write'. Use 'output_path' instead." - ) - output_path = kwargs.pop("path") - # Prune empty kwargs to avoid throwing GDAL warnings/errors kwargs = {k: v for k, v in kwargs.items() if v is not None} diff --git a/sertit/rasters_rio.py b/sertit/rasters_rio.py index c01971b..9663590 100644 --- a/sertit/rasters_rio.py +++ b/sertit/rasters_rio.py @@ -43,7 +43,7 @@ "Please install 'rasterio' to use the 'rasters_rio' package." ) from ex -from sertit import AnyPath, geometry, logs, misc, path, s3, strings, vectors, xml +from sertit import AnyPath, geometry, misc, path, s3, strings, vectors, xml from sertit.logs import SU_NAME from sertit.types import AnyNumpyArray, AnyPathStrType, AnyPathType, AnyRasterType @@ -112,25 +112,6 @@ def get_nodata_value_from_dtype(dtype) -> float: return nodata -def get_nodata_value(dtype) -> float: - """ - .. deprecated:: 1.41.0 - Use :code:`get_nodata_value_from_dtype` instead. - - Get default nodata value: - - Args: - dtype: Dtype for the wanted nodata. Best if numpy's dtype. - - Returns: - float: Nodata value - """ - logs.deprecation_warning( - "This function is deprecated. Use 'get_nodata_value_from_dtype' instead." - ) - return get_nodata_value_from_dtype(dtype) - - def bigtiff_value(arr: Any) -> str: """ Returns :code:`YES` if array is larger than 4 GB, :code:`IF_NEEDED` otherwise. @@ -250,17 +231,6 @@ def wrapper(any_raster_type: AnyRasterType, *args, **kwargs) -> Any: return wrapper -def path_arr_dst(function: Callable) -> Callable: - """ - .. deprecated:: 1.40.0 - Use :py:func:`rasters.any_raster_to_rio_ds` instead. - """ - logs.deprecation_warning( - "Deprecated 'path_arr_dst' decorator. Please use 'any_raster_to_rio_ds' instead." - ) - return any_raster_to_rio_ds(function) - - @any_raster_to_rio_ds def get_new_shape( ds: AnyRasterType, @@ -424,19 +394,6 @@ def update_meta(arr: AnyNumpyArray, meta: dict) -> dict: return out_meta -def get_nodata_mask( - array: AnyNumpyArray, - has_nodata: bool, - default_nodata: int = 0, -) -> np.ndarray: - """ - .. deprecated:: 1.36.0 - Use :py:func:`rasters_rio.get_data_mask` instead. - """ - logs.deprecation_warning("This function is deprecated. Use 'get_data_mask' instead") - return get_data_mask(array, has_nodata, default_nodata) - - def get_data_mask( array: AnyNumpyArray, has_nodata: bool, @@ -540,7 +497,7 @@ def rasterize( if not np.can_cast(np.array(nodata, dtype=ds.dtypes[0]), dtype): old_nodata = nodata - nodata = get_nodata_value(dtype) + nodata = get_nodata_value_from_dtype(dtype) # Only throw a warning if the value is really different (we don't care about 255.0 being replaced by 255) if old_nodata - nodata != 0.0: @@ -1090,12 +1047,6 @@ def write( >>> # Rewrite it on disk >>> write(raster, meta, raster_out) """ - if output_path is None: - logs.deprecation_warning( - "'path' is deprecated in 'rasters_rio.write'. Use 'output_path' instead." - ) - output_path = kwargs.pop("path") - raster_out = raster.copy() # Prune empty kwargs to avoid throwing GDAL warnings/errors diff --git a/sertit/vectors.py b/sertit/vectors.py index 2f9df17..10ab4f7 100644 --- a/sertit/vectors.py +++ b/sertit/vectors.py @@ -34,7 +34,7 @@ from cloudpathlib.exceptions import AnyPathTypeError from shapely import Polygon, wkt -from sertit import AnyPath, archives, files, geometry, logs, misc, path, s3, strings +from sertit import AnyPath, archives, files, geometry, misc, path, s3, strings from sertit.logs import SU_NAME from sertit.types import AnyPathStrType, AnyPathType @@ -78,9 +78,6 @@ def is_geopandas_1_0(): def to_utm_crs(lon: float, lat: float) -> "CRS": # noqa: F821 """ - .. deprecated:: 1.29.1 - Use `estimate_utm_crs `_ instead, which directly returs a CRS instead of a string. - Find the EPSG code of the UTM CRS from a lon/lat in WGS84. Args: @@ -116,43 +113,6 @@ def to_utm_crs(lon: float, lat: float) -> "CRS": # noqa: F821 return gpd.GeoDataFrame(geometry=point, crs=EPSG_4326).estimate_utm_crs() -def corresponding_utm_projection(lon: float, lat: float) -> str: - """ - .. deprecated:: 1.29.1 - Use `estimate_utm_crs `_ instead, which directly returs a CRS instead of a string. - - Find the EPSG code of the UTM CRS from a lon/lat in WGS84. - - Args: - lon (float): Longitude (WGS84, epsg:4326) - lat (float): Latitude (WGS84, epsg:4326) - - Returns: - CRS: UTM CRS - - Example: - >>> to_utm_crs(lon=7.8, lat=48.6) # Strasbourg - - Name: WGS 84 / UTM zone 32N - Axis Info [cartesian]: - - E[east]: Easting (metre) - - N[north]: Northing (metre) - Area of Use: - - bounds: (6.0, 0.0, 12.0, 84.0) - Coordinate Operation: - - name: UTM zone 32N - - method: Transverse Mercator - Datum: World Geodetic System 1984 ensemble - - Ellipsoid: WGS 84 - - Prime Meridian: Greenwich - - """ - logs.deprecation_warning( - "Deprecated, use 'to_utm_crs' instead, which directly returs a CRS instead of a string." - ) - return to_utm_crs(lon, lat).to_string() - - def get_geodf(geom: Union[Polygon, list, gpd.GeoSeries], crs: str) -> gpd.GeoDataFrame: """ Get a GeoDataFrame from a geometry and a crs diff --git a/sertit/xml.py b/sertit/xml.py index d52dfa0..0bae6c9 100644 --- a/sertit/xml.py +++ b/sertit/xml.py @@ -30,7 +30,7 @@ ) from lxml.html.builder import E -from sertit import AnyPath, archives, logs, path, s3 +from sertit import AnyPath, archives, path, s3 from sertit.logs import SU_NAME from sertit.misc import ListEnum from sertit.types import AnyPathStrType @@ -97,12 +97,6 @@ def read_archive( Returns: _Element: XML Root """ - if archive_path is None: - logs.deprecation_warning( - "'path' argument is deprecated, use 'archive_path' instead." - ) - archive_path = kwargs.pop("path") - try: if not regex: archive_base_path, basename = str(archive_path).split("!") From 829ef817445765505d647956a20b437a39e058ca Mon Sep 17 00:00:00 2001 From: BRAUN REMI Date: Fri, 13 Dec 2024 17:11:26 +0100 Subject: [PATCH 10/18] Fix storage option handling --- sertit/archives.py | 2 +- sertit/vectors.py | 4 +--- sertit/xml.py | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/sertit/archives.py b/sertit/archives.py index d8dc86a..db5d540 100644 --- a/sertit/archives.py +++ b/sertit/archives.py @@ -303,7 +303,7 @@ def archive( tmp_dir.cleanup() try: - arch = AnyPath(archive_fn, storage_options=folder_path.storage_options) + arch = AnyPath(archive_fn, **folder_path.storage_options) except AttributeError: arch = AnyPath(archive_fn) diff --git a/sertit/vectors.py b/sertit/vectors.py index 10ab4f7..1113bb9 100644 --- a/sertit/vectors.py +++ b/sertit/vectors.py @@ -433,9 +433,7 @@ def read( split_vect = str(vector_path).split("!") archive_regex = ".*{}".format(split_vect[1].replace(".", r"\.")) try: - vector_path = AnyPath( - split_vect[0], storage_options=vector_path.storage_options - ) + vector_path = AnyPath(split_vect[0], **vector_path.storage_options) except AttributeError: # Cloudpathlib vector_path = AnyPath(split_vect[0]) diff --git a/sertit/xml.py b/sertit/xml.py index 0bae6c9..befa80f 100644 --- a/sertit/xml.py +++ b/sertit/xml.py @@ -109,7 +109,7 @@ def read_archive( # For UPath with contextlib.suppress(AttributeError): archive_base_path = AnyPath( - archive_base_path, storage_options=archive_path.storage_options + archive_base_path, **archive_path.storage_options ) else: archive_base_path = archive_path From b6e8509f799fd54619711e37dc72ec405fc7bc27 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 12:30:07 +0000 Subject: [PATCH 11/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ci/test_archives.py | 2 +- ci/test_ci.py | 2 +- ci/test_files.py | 2 +- ci/test_path.py | 2 +- ci/test_vectors.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/test_archives.py b/ci/test_archives.py index ddb5d0f..d6e6801 100644 --- a/ci/test_archives.py +++ b/ci/test_archives.py @@ -2,9 +2,9 @@ import shutil import pytest +from CI.SCRIPTS.script_utils import files_path, s3_env from lxml import etree, html -from CI.SCRIPTS.script_utils import files_path, s3_env from sertit import archives, ci, files, path, s3, vectors diff --git a/ci/test_ci.py b/ci/test_ci.py index be911cc..14370c6 100644 --- a/ci/test_ci.py +++ b/ci/test_ci.py @@ -19,9 +19,9 @@ import tempfile import pytest +from CI.SCRIPTS.script_utils import files_path, rasters_path, s3_env, vectors_path from lxml import etree -from CI.SCRIPTS.script_utils import files_path, rasters_path, s3_env, vectors_path from sertit import ci, path, rasters, rasters_rio, s3, vectors ci.reduce_verbosity() diff --git a/ci/test_files.py b/ci/test_files.py index 84539f7..b5233c2 100644 --- a/ci/test_files.py +++ b/ci/test_files.py @@ -20,8 +20,8 @@ from datetime import date, datetime import numpy as np - from CI.SCRIPTS.script_utils import Polarization + from sertit import AnyPath, ci, files ci.reduce_verbosity() diff --git a/ci/test_path.py b/ci/test_path.py index d173bc3..bb94f47 100644 --- a/ci/test_path.py +++ b/ci/test_path.py @@ -19,8 +19,8 @@ import tempfile import pytest - from CI.SCRIPTS.script_utils import get_s3_ci_path + from sertit import AnyPath, ci, misc, path ci.reduce_verbosity() diff --git a/ci/test_vectors.py b/ci/test_vectors.py index 98c9dd2..750435d 100644 --- a/ci/test_vectors.py +++ b/ci/test_vectors.py @@ -21,9 +21,9 @@ import geopandas as gpd import pytest +from CI.SCRIPTS.script_utils import KAPUT_KWARGS, files_path, s3_env, vectors_path from shapely import wkt -from CI.SCRIPTS.script_utils import KAPUT_KWARGS, files_path, s3_env, vectors_path from sertit import archives, ci, files, path, vectors from sertit.vectors import EPSG_4326, DataSourceError From c9e3f3c93f5437f53c21e25cea8455315325376b Mon Sep 17 00:00:00 2001 From: BRAUN REMI Date: Mon, 23 Dec 2024 13:32:20 +0100 Subject: [PATCH 12/18] FIX: Allow str as paths in `ci.assert_files_equal` --- CHANGES.md | 1 + ci/test_ci.py | 1 + sertit/ci.py | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 3e0d1f1..f6bbfee 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -18,6 +18,7 @@ - FIX: Fix deprecation warning for `get_nodata_value_from_dtype` in `rasters_rio` - FIX: Force blocksize to 128 when writing small COGs on disk (in order to have multiple overview levels) - FIX: Use `np.tan` in `rasters.slope` +- FIX: Allow str as paths in `ci.assert_files_equal` - OPTIM: Compute the spatial index by default in `vectors.read` (set `vectors.read(..., compute_sindex=False)` if you don't want to compute them) - CI: Rename CI folder and remove unnecessary intermediate folder diff --git a/ci/test_ci.py b/ci/test_ci.py index 14370c6..0e2125d 100644 --- a/ci/test_ci.py +++ b/ci/test_ci.py @@ -73,6 +73,7 @@ def test_assert_files(): false_path = files_path().joinpath("false.html") ci.assert_files_equal(ok_path, ok_path) + ci.assert_files_equal(str(ok_path), str(ok_path)) with pytest.raises(AssertionError): ci.assert_files_equal(ok_path, false_path) diff --git a/sertit/ci.py b/sertit/ci.py index 43b28e3..9e07937 100644 --- a/sertit/ci.py +++ b/sertit/ci.py @@ -86,7 +86,7 @@ def assert_files_equal(file_1: AnyPathStrType, file_2: AnyPathStrType): file_1 (str): Path to file 1 file_2 (str): Path to file 2 """ - with file_1.open("r") as f1, file_2.open("r") as f2: + with AnyPath(file_1).open("r") as f1, AnyPath(file_2).open("r") as f2: assert files.hash_file_content(f1.read()) == files.hash_file_content(f2.read()) From 2be7216620b3dd27a46ce31086ce726f48459dd5 Mon Sep 17 00:00:00 2001 From: BRAUN REMI Date: Mon, 23 Dec 2024 14:09:26 +0100 Subject: [PATCH 13/18] Fix files after CI folder renaming --- .gitignore | 6 +++--- .gitlab-ci.yml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 5eb2f90..b054ac0 100644 --- a/.gitignore +++ b/.gitignore @@ -24,9 +24,9 @@ dist dask-worker-space/* # Data in CI -CI/*.tif -CI/*.zip -CI/*.vrt +ci/*.tif +ci/*.zip +ci/*.vrt # Docs & Notebooks docs/_build/* diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index cf8aeb1..aa372cb 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -27,7 +27,7 @@ pytest: - pip install --ignore-installed PyYAML - pip install -e .[full] script: - - python -m pytest -v --durations=0 --cov-report term --cov-report xml:cov.xml --cov=sertit ci/on_push --cov-config=.coveragerc --log-cli-level DEBUG --capture=sys + - python -m pytest -v --durations=0 --cov-report term --cov-report xml:cov.xml --cov=sertit ci --cov-config=.coveragerc --log-cli-level DEBUG --capture=sys coverage: '/TOTAL\s+\d+\s+\d+\s+(\d+%)/' tags: - sertit @@ -50,7 +50,7 @@ pytest_s3: - pip install --ignore-installed PyYAML - pip install -e .[full] script: - - python -m pytest -v --durations=0 --cov-report term --cov-report xml:cov.xml --cov=sertit ci/on_push --cov-config=.coveragerc --log-cli-level DEBUG --capture=sys + - python -m pytest -v --durations=0 --cov-report term --cov-report xml:cov.xml --cov=sertit ci --cov-config=.coveragerc --log-cli-level DEBUG --capture=sys coverage: '/TOTAL\s+\d+\s+\d+\s+(\d+%)/' tags: - sertit From 28b5f25719e0d69a18af83f1d9b67a17e4d9cabb Mon Sep 17 00:00:00 2001 From: BRAUN REMI Date: Mon, 23 Dec 2024 14:27:46 +0100 Subject: [PATCH 14/18] Remove useless fct in CI --- ci/script_utils.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/ci/script_utils.py b/ci/script_utils.py index 583682a..33f0b05 100644 --- a/ci/script_utils.py +++ b/ci/script_utils.py @@ -52,21 +52,12 @@ def get_s3_ci_path(): return ci_path -def get_proj_path(): - """Get project path""" - if int(os.getenv(CI_SERTIT_S3, 1)) and sys.platform != "win32": - return get_s3_ci_path() - else: - # ON DISK - return AnyPath(unistra.get_db3_path()) - - def get_ci_data_path(): """Get CI DATA path""" if int(os.getenv(CI_SERTIT_S3, 1)) and sys.platform != "win32": - return get_proj_path().joinpath("DATA") + return get_s3_ci_path() / "DATA" else: - return get_proj_path().joinpath("CI", "sertit_utils", "DATA") + return AnyPath(unistra.get_db3_path()) / "CI" / "sertit_utils" / "DATA" def dask_env(function): From fcc353abb5b752267f4ff55f53f94535cadbf8c7 Mon Sep 17 00:00:00 2001 From: BRAUN REMI Date: Mon, 23 Dec 2024 14:29:55 +0100 Subject: [PATCH 15/18] FIX: Better alignement between `rasters.read` function and `rasters.any_raster_to_xr_ds` decorator --- CHANGES.md | 1 + ci/test_rasters.py | 2 +- sertit/rasters.py | 55 ++++++++++++++-------------------------------- 3 files changed, 18 insertions(+), 40 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index f6bbfee..7ac82ef 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -19,6 +19,7 @@ - FIX: Force blocksize to 128 when writing small COGs on disk (in order to have multiple overview levels) - FIX: Use `np.tan` in `rasters.slope` - FIX: Allow str as paths in `ci.assert_files_equal` +- FIX: Better alignement between `rasters.read` function and `rasters.any_raster_to_xr_ds` decorator - OPTIM: Compute the spatial index by default in `vectors.read` (set `vectors.read(..., compute_sindex=False)` if you don't want to compute them) - CI: Rename CI folder and remove unnecessary intermediate folder diff --git a/ci/test_rasters.py b/ci/test_rasters.py index 70145e0..99cb7ac 100644 --- a/ci/test_rasters.py +++ b/ci/test_rasters.py @@ -102,7 +102,7 @@ def ds_name(raster_path): @pytest.fixture def ds_dtype(raster_path): with rasterio.open(str(raster_path)) as ds: - return ds.meta["dtype"] + return getattr(np, ds.meta["dtype"]) @pytest.fixture diff --git a/sertit/rasters.py b/sertit/rasters.py index ec338e4..c2a71e8 100644 --- a/sertit/rasters.py +++ b/sertit/rasters.py @@ -33,7 +33,7 @@ try: import rasterio import rioxarray - from rasterio import MemoryFile, features + from rasterio import features from rasterio.enums import Resampling from rioxarray.exceptions import MissingCRS except ModuleNotFoundError as ex: @@ -172,8 +172,8 @@ def wrapper(any_raster_type: AnyRasterType, *args, **kwargs) -> Any: if any_raster_type is None: raise ValueError("'any_raster_type' shouldn't be None!") - default_chunks = True if dask.get_client() is not None else None - + default_chunks = "auto" if dask.get_client() is not None else None + masked = kwargs.get("masked", True) # By default, try with the input fct try: out = function(any_raster_type, *args, **kwargs) @@ -197,41 +197,12 @@ def wrapper(any_raster_type: AnyRasterType, *args, **kwargs) -> Any: except Exception as ex: raise TypeError("Function not available for xarray.Dataset") from ex - elif isinstance(any_raster_type, tuple): - arr, meta = any_raster_type - with ( - MemoryFile() as memfile, - memfile.open( - **meta, BIGTIFF=rasters_rio.bigtiff_value(any_raster_type) - ) as ds, - ): - ds.write(arr.data) - - with rioxarray.open_rasterio( - any_raster_type, - masked=True, - default_name=ds.name, - chunks=kwargs.pop("chunks", default_chunks), - ) as xds: - out = function(xds, *args, **kwargs) else: - # Get the path from the input - if path.is_path(any_raster_type): - name = str(any_raster_type) - any_raster_type = str(any_raster_type) - else: - # For rasterio datasets, '.name' gives the path - name = any_raster_type.name - - # Convert path or rasterio.dataset to xr.dataset - with rioxarray.open_rasterio( - any_raster_type, - masked=True, - default_name=name, - chunks=kwargs.pop("chunks", default_chunks), - ) as xds: - out = function(xds, *args, **kwargs) - + out = function( + read(any_raster_type, chunks=default_chunks, masked=masked), + *args, + **kwargs, + ) return out return wrapper @@ -948,13 +919,19 @@ def read( rioxarray.set_options(export_grid_mapping=False), rioxarray.open_rasterio( ds, - lock=False, default_name=path.get_filename(ds.name), chunks=chunks, + masked=masked, **kwargs, ) as xda, ): - orig_dtype = xda.dtype + orig_dtype = xda.encoding.get( + "rasterio_dtype", xda.encoding.get("dtype", xda.dtype) + ) + + if isinstance(orig_dtype, str): + with contextlib.suppress(AttributeError): + orig_dtype = getattr(np, orig_dtype) # Windows if window is not None: From 69bf6d133854c202b5fb053e8f3c1ce35d994d78 Mon Sep 17 00:00:00 2001 From: BRAUN REMI Date: Mon, 23 Dec 2024 14:49:30 +0100 Subject: [PATCH 16/18] FIX: Fix `rasters.sieve` function with `xr.apply_ufunc` --- CHANGES.md | 1 + ci/test_rasters.py | 6 +++++- sertit/rasters.py | 9 +++++---- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 7ac82ef..7527a5f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -20,6 +20,7 @@ - FIX: Use `np.tan` in `rasters.slope` - FIX: Allow str as paths in `ci.assert_files_equal` - FIX: Better alignement between `rasters.read` function and `rasters.any_raster_to_xr_ds` decorator +- FIX: Fix `rasters.sieve` function with `xr.apply_ufunc` - OPTIM: Compute the spatial index by default in `vectors.read` (set `vectors.read(..., compute_sindex=False)` if you don't want to compute them) - CI: Rename CI folder and remove unnecessary intermediate folder diff --git a/ci/test_rasters.py b/ci/test_rasters.py index 99cb7ac..74680a3 100644 --- a/ci/test_rasters.py +++ b/ci/test_rasters.py @@ -314,7 +314,7 @@ def test_crop(tmp_path, xda, xds, xda_dask, mask): @s3_env @dask_env -def test_sieve(tmp_path, xda, xds, xda_dask): +def test_sieve(tmp_path, raster_path, xda, xds, xda_dask): """Test sieve function""" # DataArray xda_sieved = os.path.join(tmp_path, "test_sieved_xda.tif") @@ -348,6 +348,10 @@ def test_sieve(tmp_path, xda, xds, xda_dask): ci.assert_raster_equal(xda_sieved, raster_sieved_path) ci.assert_raster_equal(xds_sieved, raster_sieved_path) + # From path + sieve_xda_path = rasters.sieve(raster_path, sieve_thresh=20, connectivity=4) + np.testing.assert_array_equal(sieve_xda, sieve_xda_path) + @s3_env @dask_env diff --git a/sertit/rasters.py b/sertit/rasters.py index c2a71e8..52ca67c 100644 --- a/sertit/rasters.py +++ b/sertit/rasters.py @@ -1334,14 +1334,15 @@ def sieve( assert connectivity in [4, 8] - # Use this trick to make the sieve work - mask = np.where(np.isnan(xds.data), 0, 1).astype(np.uint8) - data = xds.data.astype(np.uint8) + mask = xr.where(np.isnan(xds), 0, 1).astype(np.uint8).data + data = xds.astype(np.uint8).data # Sieve try: sieved_arr = xr.apply_ufunc( - features.sieve, data, sieve_thresh, connectivity, mask + features.sieve, + data, + kwargs={"size": sieve_thresh, "connectivity": connectivity, "mask": mask}, ) except ValueError: sieved_arr = features.sieve( From c47581a51645a24f64870867623e4d750ea7c651 Mon Sep 17 00:00:00 2001 From: BRAUN REMI Date: Mon, 23 Dec 2024 15:16:37 +0100 Subject: [PATCH 17/18] Fix CI --- ci/test_archives.py | 2 +- ci/test_ci.py | 9 ++++++--- ci/test_files.py | 2 +- ci/test_path.py | 2 +- ci/test_vectors.py | 2 +- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/ci/test_archives.py b/ci/test_archives.py index d6e6801..8283d6d 100644 --- a/ci/test_archives.py +++ b/ci/test_archives.py @@ -2,9 +2,9 @@ import shutil import pytest -from CI.SCRIPTS.script_utils import files_path, s3_env from lxml import etree, html +from ci.script_utils import files_path, s3_env from sertit import archives, ci, files, path, s3, vectors diff --git a/ci/test_ci.py b/ci/test_ci.py index 0e2125d..3625bfb 100644 --- a/ci/test_ci.py +++ b/ci/test_ci.py @@ -19,9 +19,9 @@ import tempfile import pytest -from CI.SCRIPTS.script_utils import files_path, rasters_path, s3_env, vectors_path from lxml import etree +from ci.script_utils import files_path, rasters_path, s3_env, vectors_path from sertit import ci, path, rasters, rasters_rio, s3, vectors ci.reduce_verbosity() @@ -67,13 +67,16 @@ def test_assert_dir(): @s3_env -def test_assert_files(): +def test_assert_files(tmp_path): """Test CI functions""" ok_path = files_path().joinpath("productPreview.html") false_path = files_path().joinpath("false.html") ci.assert_files_equal(ok_path, ok_path) - ci.assert_files_equal(str(ok_path), str(ok_path)) + if path.is_cloud_path(ok_path): + str_ok_path = str(s3.download(ok_path, tmp_path)) + + ci.assert_files_equal(str_ok_path, str_ok_path) with pytest.raises(AssertionError): ci.assert_files_equal(ok_path, false_path) diff --git a/ci/test_files.py b/ci/test_files.py index b5233c2..d8de011 100644 --- a/ci/test_files.py +++ b/ci/test_files.py @@ -20,8 +20,8 @@ from datetime import date, datetime import numpy as np -from CI.SCRIPTS.script_utils import Polarization +from ci.script_utils import Polarization from sertit import AnyPath, ci, files ci.reduce_verbosity() diff --git a/ci/test_path.py b/ci/test_path.py index bb94f47..6da5458 100644 --- a/ci/test_path.py +++ b/ci/test_path.py @@ -19,8 +19,8 @@ import tempfile import pytest -from CI.SCRIPTS.script_utils import get_s3_ci_path +from ci.script_utils import get_s3_ci_path from sertit import AnyPath, ci, misc, path ci.reduce_verbosity() diff --git a/ci/test_vectors.py b/ci/test_vectors.py index 750435d..3aea31d 100644 --- a/ci/test_vectors.py +++ b/ci/test_vectors.py @@ -21,9 +21,9 @@ import geopandas as gpd import pytest -from CI.SCRIPTS.script_utils import KAPUT_KWARGS, files_path, s3_env, vectors_path from shapely import wkt +from ci.script_utils import KAPUT_KWARGS, files_path, s3_env, vectors_path from sertit import archives, ci, files, path, vectors from sertit.vectors import EPSG_4326, DataSourceError From e54eb497fb0aa3c7fee362d25cdc64a8b2b980ae Mon Sep 17 00:00:00 2001 From: BRAUN REMI Date: Mon, 23 Dec 2024 16:08:37 +0100 Subject: [PATCH 18/18] Fix CI --- ci/test_ci.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/test_ci.py b/ci/test_ci.py index 3625bfb..3a836d6 100644 --- a/ci/test_ci.py +++ b/ci/test_ci.py @@ -75,6 +75,8 @@ def test_assert_files(tmp_path): ci.assert_files_equal(ok_path, ok_path) if path.is_cloud_path(ok_path): str_ok_path = str(s3.download(ok_path, tmp_path)) + else: + str_ok_path = ok_path ci.assert_files_equal(str_ok_path, str_ok_path) with pytest.raises(AssertionError):