diff --git a/.gitignore b/.gitignore
index 5eb2f90..b054ac0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,9 +24,9 @@ dist
dask-worker-space/*
# Data in CI
-CI/*.tif
-CI/*.zip
-CI/*.vrt
+ci/*.tif
+ci/*.zip
+ci/*.vrt
# Docs & Notebooks
docs/_build/*
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index cf8aeb1..aa372cb 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -27,7 +27,7 @@ pytest:
- pip install --ignore-installed PyYAML
- pip install -e .[full]
script:
- - python -m pytest -v --durations=0 --cov-report term --cov-report xml:cov.xml --cov=sertit ci/on_push --cov-config=.coveragerc --log-cli-level DEBUG --capture=sys
+ - python -m pytest -v --durations=0 --cov-report term --cov-report xml:cov.xml --cov=sertit ci --cov-config=.coveragerc --log-cli-level DEBUG --capture=sys
coverage: '/TOTAL\s+\d+\s+\d+\s+(\d+%)/'
tags:
- sertit
@@ -50,7 +50,7 @@ pytest_s3:
- pip install --ignore-installed PyYAML
- pip install -e .[full]
script:
- - python -m pytest -v --durations=0 --cov-report term --cov-report xml:cov.xml --cov=sertit ci/on_push --cov-config=.coveragerc --log-cli-level DEBUG --capture=sys
+ - python -m pytest -v --durations=0 --cov-report term --cov-report xml:cov.xml --cov=sertit ci --cov-config=.coveragerc --log-cli-level DEBUG --capture=sys
coverage: '/TOTAL\s+\d+\s+\d+\s+(\d+%)/'
tags:
- sertit
diff --git a/CHANGES.md b/CHANGES.md
index 42f0bd8..7527a5f 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,15 @@
# Release History
+## 2.0.0 (20xx-xx-xx)
+
+- **BREAKING CHANGE**: Remove all deprecations from `sertit==1.*` ([#3](https://github.com/sertit/sertit-utils/issues/3)):
+ - Duplication between `path` and `files` modules
+ - Duplication between `ci`, `s3` and `unistra` modules
+ - Deprecated function arguments
+ - Renamed functions
+ - Other deprecated helpers
+- **ENH: Use `universal_pathlib` instead of `cloudpathlib` (the code remains compatible with `cloudpathlib`)** ([#4](https://github.com/sertit/sertit-utils/issues/4))
+
## 1.44.x (20xx-xx-xx)
- **ENH: Drop `isort`, `black` and `flake8` and use `ruff`**
@@ -8,6 +18,9 @@
- FIX: Fix deprecation warning for `get_nodata_value_from_dtype` in `rasters_rio`
- FIX: Force blocksize to 128 when writing small COGs on disk (in order to have multiple overview levels)
- FIX: Use `np.tan` in `rasters.slope`
+- FIX: Allow str as paths in `ci.assert_files_equal`
+- FIX: Better alignment between the `rasters.read` function and the `rasters.any_raster_to_xr_ds` decorator
+- FIX: Fix `rasters.sieve` function with `xr.apply_ufunc`
- OPTIM: Compute the spatial index by default in `vectors.read` (set `vectors.read(..., compute_sindex=False)` if you don't want to compute them)
- CI: Rename CI folder and remove unnecessary intermediate folder
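As a quick orientation for the 2.0.0 breaking change above, a minimal before/after sketch of the module moves (the function names come from this diff; the archive paths are purely illustrative):

    # sertit >= 2.0: archive helpers moved from `files`/`path` to the new `archives` module
    from sertit import archives

    archives.extract_file("product.zip", "/tmp/out")           # was files.extract_file
    archives.read_archived_xml("product.zip", r".*_MTL\.xml")  # was files.read_archived_xml
    archives.get_archived_file_list("product.zip")             # was path.get_archived_file_list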
diff --git a/ci/script_utils.py b/ci/script_utils.py
index 63679bd..33f0b05 100644
--- a/ci/script_utils.py
+++ b/ci/script_utils.py
@@ -38,25 +38,26 @@ class Polarization(ListEnum):
def get_s3_ci_path():
"""Get S3 CI path"""
- unistra.define_s3_client()
- return AnyPath("s3://sertit-sertit-utils-ci")
+ from sertit.unistra import UNISTRA_S3_ENDPOINT
+ try:
+ ci_path = AnyPath(
+ "s3://sertit-sertit-utils-ci", endpoint_url=f"https://{UNISTRA_S3_ENDPOINT}"
+ )
+ except TypeError:
+ unistra.define_s3_client()
+ ci_path = AnyPath("s3://sertit-sertit-utils-ci")
+
+ return ci_path
-def get_proj_path():
- """Get project path"""
- if int(os.getenv(CI_SERTIT_S3, 1)) and sys.platform != "win32":
- return get_s3_ci_path()
- else:
- # ON DISK
- return AnyPath(unistra.get_db3_path())
def get_ci_data_path():
"""Get CI DATA path"""
if int(os.getenv(CI_SERTIT_S3, 1)) and sys.platform != "win32":
- return get_proj_path().joinpath("DATA")
+ return get_s3_ci_path() / "DATA"
else:
- return get_proj_path().joinpath("CI", "sertit_utils", "DATA")
+ return AnyPath(unistra.get_db3_path()) / "CI" / "sertit_utils" / "DATA"
def dask_env(function):
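The rewritten `get_s3_ci_path` above tells the two path backends apart by signature: `universal_pathlib`'s `UPath` forwards keyword arguments such as `endpoint_url` to fsspec, while `cloudpathlib`'s `AnyPath` raises `TypeError` on unexpected keywords. A minimal sketch of that fallback pattern, with an illustrative bucket and endpoint:

    from sertit import AnyPath, unistra

    def open_bucket(bucket: str, endpoint: str):
        try:
            # universal_pathlib backend: storage options are passed directly
            return AnyPath(f"s3://{bucket}", endpoint_url=f"https://{endpoint}")
        except TypeError:
            # cloudpathlib backend: configure a default S3 client instead
            unistra.define_s3_client()
            return AnyPath(f"s3://{bucket}")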
diff --git a/ci/test_archives.py b/ci/test_archives.py
new file mode 100644
index 0000000..8283d6d
--- /dev/null
+++ b/ci/test_archives.py
@@ -0,0 +1,153 @@
+import os
+import shutil
+
+import pytest
+from lxml import etree, html
+
+from ci.script_utils import files_path, s3_env
+from sertit import archives, ci, files, path, s3, vectors
+
+
+@s3_env
+def test_archive(tmp_path):
+ """Test extracting functions"""
+ # Archives
+ zip_file = files_path().joinpath("test_zip.zip")
+ zip2_file = files_path().joinpath("test_zip.zip") # For overwrite
+ zip_without_directory = files_path().joinpath("test_zip_without_directory.zip")
+ tar_file = files_path().joinpath("test_tar.tar")
+ tar_gz_file = files_path().joinpath("test_targz.tar.gz")
+
+ # Core dir
+ core_dir = files_path().joinpath("core")
+ folder = core_dir
+ arch = [
+ zip_file,
+ tar_file,
+ tar_gz_file,
+ folder,
+ zip2_file,
+ zip_without_directory,
+ ]
+
+ # Extract
+ extracted_dirs = archives.extract_files(arch, tmp_path, overwrite=True)
+
+ # Test
+ for ex_dir in extracted_dirs:
+ ci.assert_dir_equal(core_dir, ex_dir)
+
+ archives.extract_files([zip2_file], tmp_path, overwrite=False) # Already existing
+
+ # Test
+ for ex_dir in extracted_dirs:
+ ci.assert_dir_equal(core_dir, ex_dir)
+
+ # Archive
+ archive_base = os.path.join(tmp_path, "archive")
+ for fmt in ["zip", "tar", "gztar"]:
+ archive_fn = archives.archive(
+ folder_path=core_dir, archive_path=archive_base, fmt=fmt
+ )
+ out = archives.extract_file(archive_fn, tmp_path)
+ # an additional folder is created
+ out_dir = path.listdir_abspath(out)[0]
+ ci.assert_dir_equal(core_dir, out_dir)
+
+ # Remove out directory in order to avoid any interferences
+ files.remove(out)
+
+ # Add to zip
+ zip_out = zip2_file if path.is_cloud_path(zip2_file) else archive_base + ".zip"
+ core_copy = files.copy(core_dir, os.path.join(tmp_path, "core2"))
+ zip_out = archives.add_to_zip(s3.download(zip_out, tmp_path), core_copy)
+
+ # Extract
+ unzip_out = os.path.join(tmp_path, "out")
+ unzip_out = archives.extract_file(zip_out, unzip_out)
+
+ # Test
+ unzip_dirs = path.listdir_abspath(unzip_out)
+
+ assert len(unzip_dirs) == 2
+ ci.assert_dir_equal(unzip_dirs[0], unzip_dirs[1])
+
+
+@s3_env
+def test_archived_files(tmp_path):
+ landsat_name = "LM05_L1TP_200030_20121230_20200820_02_T2_CI"
+ ok_folder = files_path().joinpath(landsat_name)
+ zip_file = files_path().joinpath(f"{landsat_name}.zip")
+ tar_file = files_path().joinpath(f"{landsat_name}.tar")
+ targz_file = files_path().joinpath(f"{landsat_name}.tar.gz")
+ sz_file = files_path().joinpath(f"{landsat_name}.7z")
+
+ # VECTORS
+ vect_name = "map-overlay.kml"
+ vec_ok_path = ok_folder.joinpath(vect_name)
+ if shutil.which("ogr2ogr"): # Only works if ogr2ogr can be found.
+ vect_regex = f".*{vect_name}"
+ vect_zip = vectors.read(zip_file, archive_regex=vect_regex)
+ vect_tar = vectors.read(tar_file, archive_regex=r".*overlay\.kml")
+ vect_ok = vectors.read(vec_ok_path)
+ assert not vect_ok.empty
+ ci.assert_geom_equal(vect_ok, vect_zip)
+ ci.assert_geom_equal(vect_ok, vect_tar)
+
+ # XML
+ xml_name = "LM05_L1TP_200030_20121230_20200820_02_T2_MTL.xml"
+ xml_ok_path = ok_folder.joinpath(xml_name)
+ xml_ok_path = str(s3.download(xml_ok_path, tmp_path))
+
+ xml_regex = f".*{xml_name}"
+ xml_zip = archives.read_archived_xml(zip_file, xml_regex)
+ xml_tar = archives.read_archived_xml(tar_file, r".*_MTL\.xml")
+ xml_ok = etree.parse(xml_ok_path).getroot()
+ ci.assert_xml_equal(xml_ok, xml_zip)
+ ci.assert_xml_equal(xml_ok, xml_tar)
+
+ # FILE + HTML
+ html_zip_file = files_path().joinpath("productPreview.zip")
+ html_tar_file = files_path().joinpath("productPreview.tar")
+ html_name = "productPreview.html"
+ html_ok_path = files_path().joinpath(html_name)
+ html_ok_path = str(s3.download(html_ok_path, tmp_path))
+
+ html_regex = f".*{html_name}"
+
+ # FILE
+ file_zip = archives.read_archived_file(html_zip_file, html_regex)
+ file_tar = archives.read_archived_file(html_tar_file, html_regex)
+ html_ok = html.parse(html_ok_path).getroot()
+ ci.assert_html_equal(html_ok, html.fromstring(file_zip))
+ ci.assert_html_equal(html_ok, html.fromstring(file_tar))
+
+ file_list = archives.get_archived_file_list(html_zip_file)
+ ci.assert_html_equal(
+ html_ok,
+ html.fromstring(
+ archives.read_archived_file(html_zip_file, html_regex, file_list=file_list)
+ ),
+ )
+
+ # HTML
+ html_zip = archives.read_archived_html(html_zip_file, html_regex)
+ html_tar = archives.read_archived_html(html_tar_file, html_regex)
+ ci.assert_html_equal(html_ok, html_zip)
+ ci.assert_html_equal(html_ok, html_tar)
+ ci.assert_html_equal(
+ html_ok,
+ archives.read_archived_html(
+ html_tar_file,
+ html_regex,
+ file_list=archives.get_archived_file_list(html_tar_file),
+ ),
+ )
+
+ # ERRORS
+ with pytest.raises(TypeError):
+ archives.read_archived_file(targz_file, xml_regex)
+ with pytest.raises(TypeError):
+ archives.read_archived_file(sz_file, xml_regex)
+ with pytest.raises(FileNotFoundError):
+ archives.read_archived_file(zip_file, "cdzeferf")
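Note that `add_to_zip` is called on `s3.download(zip_out, tmp_path)` rather than on the cloud path itself: as the new `sertit/archives.py` below makes explicit, updating a zip stored on the cloud raises `NotImplementedError`, so the archive must be cached locally first. A short sketch of that pattern (paths are illustrative):

    from sertit import archives, s3

    # Cache the cloud-hosted zip locally, then append a folder to it
    local_zip = s3.download("s3://bucket/archive.zip", "/tmp")
    archives.add_to_zip(local_zip, "/data/folder_to_add")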
diff --git a/ci/test_ci.py b/ci/test_ci.py
index 803fe05..3a836d6 100644
--- a/ci/test_ci.py
+++ b/ci/test_ci.py
@@ -22,7 +22,7 @@
from lxml import etree
from ci.script_utils import files_path, rasters_path, s3_env, vectors_path
-from sertit import ci, path, rasters, rasters_rio, vectors
+from sertit import ci, path, rasters, rasters_rio, s3, vectors
ci.reduce_verbosity()
@@ -67,12 +67,18 @@ def test_assert_dir():
@s3_env
-def test_assert_files():
+def test_assert_files(tmp_path):
"""Test CI functions"""
ok_path = files_path().joinpath("productPreview.html")
false_path = files_path().joinpath("false.html")
ci.assert_files_equal(ok_path, ok_path)
+ if path.is_cloud_path(ok_path):
+ str_ok_path = str(s3.download(ok_path, tmp_path))
+ else:
+ str_ok_path = str(ok_path)
+
+ ci.assert_files_equal(str_ok_path, str_ok_path)
with pytest.raises(AssertionError):
ci.assert_files_equal(ok_path, false_path)
@@ -169,15 +175,15 @@ def test_assert_raster():
@s3_env
-def test_assert_xml():
+def test_assert_xml(tmp_path):
# XML
xml_folder = files_path().joinpath("LM05_L1TP_200030_20121230_20200820_02_T2_CI")
xml_path = xml_folder.joinpath("LM05_L1TP_200030_20121230_20200820_02_T2_MTL.xml")
xml_bad_path = xml_folder.joinpath("false_xml.xml")
if path.is_cloud_path(files_path()):
- xml_path = xml_path.fspath
- xml_bad_path = xml_bad_path.fspath
+ xml_path = s3.download(xml_path, tmp_path)
+ xml_bad_path = s3.download(xml_bad_path, tmp_path)
xml_ok = etree.parse(str(xml_path)).getroot()
xml_nok = etree.parse(str(xml_bad_path)).getroot()
@@ -188,19 +194,18 @@ def test_assert_xml():
@s3_env
-def test_assert_html():
+def test_assert_html(tmp_path):
# HTML
html_path = files_path().joinpath("productPreview.html")
html_bad_path = files_path().joinpath("false.html")
- with tempfile.TemporaryDirectory() as tmp_dir:
- if path.is_cloud_path(files_path()):
- html_path = html_path.download_to(tmp_dir)
- html_bad_path = html_bad_path.download_to(tmp_dir)
+ if path.is_cloud_path(files_path()):
+ html_path = s3.download(html_path, tmp_path)
+ html_bad_path = s3.download(html_bad_path, tmp_path)
- html_ok = etree.parse(str(html_path)).getroot()
- html_nok = etree.parse(str(html_bad_path)).getroot()
+ html_ok = etree.parse(str(html_path)).getroot()
+ html_nok = etree.parse(str(html_bad_path)).getroot()
- ci.assert_xml_equal(html_ok, html_ok)
- with pytest.raises(AssertionError):
- ci.assert_xml_equal(html_ok, html_nok)
+ ci.assert_xml_equal(html_ok, html_ok)
+ with pytest.raises(AssertionError):
+ ci.assert_xml_equal(html_ok, html_nok)
diff --git a/ci/test_files.py b/ci/test_files.py
index 0a06572..d8de011 100644
--- a/ci/test_files.py
+++ b/ci/test_files.py
@@ -16,167 +16,17 @@
"""Script testing the files"""
import os
-import shutil
import tempfile
from datetime import date, datetime
import numpy as np
-import pytest
-from lxml import etree, html
-from ci.script_utils import Polarization, files_path, s3_env
-from sertit import AnyPath, ci, files, path, vectors
+from ci.script_utils import Polarization
+from sertit import AnyPath, ci, files
ci.reduce_verbosity()
-def test_archive():
- """Test extracting functions"""
- with tempfile.TemporaryDirectory() as tmp_dir:
- # Archives
- zip_file = files_path().joinpath("test_zip.zip")
- zip2_file = files_path().joinpath("test_zip.zip") # For overwrite
- zip_without_directory = files_path().joinpath("test_zip_without_directory.zip")
- tar_file = files_path().joinpath("test_tar.tar")
- tar_gz_file = files_path().joinpath("test_targz.tar.gz")
-
- # Core dir
- core_dir = files_path().joinpath("core")
- folder = core_dir
- archives = [
- zip_file,
- tar_file,
- tar_gz_file,
- folder,
- zip2_file,
- zip_without_directory,
- ]
-
- # Extract
- extracted_dirs = files.extract_files(archives, tmp_dir, overwrite=True)
- files.extract_files([zip2_file], tmp_dir, overwrite=False) # Already existing
-
- # Test
- for ex_dir in extracted_dirs:
- ci.assert_dir_equal(core_dir, ex_dir)
-
- # Archive
- archive_base = os.path.join(tmp_dir, "archive")
- for fmt in ["zip", "tar", "gztar"]:
- archive_fn = files.archive(
- folder_path=core_dir, archive_path=archive_base, fmt=fmt
- )
- out = files.extract_file(archive_fn, tmp_dir)
- # an additional folder is created
- out_dir = path.listdir_abspath(out)[0]
- ci.assert_dir_equal(core_dir, out_dir)
-
- # Remove out directory in order to avoid any interferences
- files.remove(out)
-
- # Add to zip
- zip_out = zip2_file if path.is_cloud_path(zip2_file) else archive_base + ".zip"
- core_copy = files.copy(core_dir, os.path.join(tmp_dir, "core2"))
- zip_out = files.add_to_zip(zip_out, core_copy)
-
- # Extract
- unzip_out = os.path.join(tmp_dir, "out")
- unzip_out = files.extract_file(zip_out, unzip_out)
-
- # Test
- unzip_dirs = path.listdir_abspath(unzip_out)
-
- assert len(unzip_dirs) == 2
- ci.assert_dir_equal(unzip_dirs[0], unzip_dirs[1])
-
-
-@s3_env
-def test_archived_files():
- landsat_name = "LM05_L1TP_200030_20121230_20200820_02_T2_CI"
- ok_folder = files_path().joinpath(landsat_name)
- zip_file = files_path().joinpath(f"{landsat_name}.zip")
- tar_file = files_path().joinpath(f"{landsat_name}.tar")
- targz_file = files_path().joinpath(f"{landsat_name}.tar.gz")
- sz_file = files_path().joinpath(f"{landsat_name}.7z")
-
- # VECTORS
- vect_name = "map-overlay.kml"
- vec_ok_path = ok_folder.joinpath(vect_name)
- if shutil.which("ogr2ogr"): # Only works if ogr2ogr can be found.
- vect_regex = f".*{vect_name}"
- vect_zip = vectors.read(zip_file, archive_regex=vect_regex)
- vect_tar = vectors.read(tar_file, archive_regex=r".*overlay\.kml")
- vect_ok = vectors.read(vec_ok_path)
- assert not vect_ok.empty
- ci.assert_geom_equal(vect_ok, vect_zip)
- ci.assert_geom_equal(vect_ok, vect_tar)
-
- with tempfile.TemporaryDirectory() as tmp_dir:
- # XML
- xml_name = "LM05_L1TP_200030_20121230_20200820_02_T2_MTL.xml"
- xml_ok_path = ok_folder.joinpath(xml_name)
- if path.is_cloud_path(files_path()):
- xml_ok_path = str(xml_ok_path.download_to(tmp_dir))
- else:
- xml_ok_path = str(xml_ok_path)
-
- xml_regex = f".*{xml_name}"
- xml_zip = files.read_archived_xml(zip_file, xml_regex)
- xml_tar = files.read_archived_xml(tar_file, r".*_MTL\.xml")
- xml_ok = etree.parse(xml_ok_path).getroot()
- ci.assert_xml_equal(xml_ok, xml_zip)
- ci.assert_xml_equal(xml_ok, xml_tar)
-
- # FILE + HTML
- html_zip_file = files_path().joinpath("productPreview.zip")
- html_tar_file = files_path().joinpath("productPreview.tar")
- html_name = "productPreview.html"
- html_ok_path = files_path().joinpath(html_name)
- if path.is_cloud_path(files_path()):
- html_ok_path = str(html_ok_path.download_to(tmp_dir))
- else:
- html_ok_path = str(html_ok_path)
-
- html_regex = f".*{html_name}"
-
- # FILE
- file_zip = files.read_archived_file(html_zip_file, html_regex)
- file_tar = files.read_archived_file(html_tar_file, html_regex)
- html_ok = html.parse(html_ok_path).getroot()
- ci.assert_html_equal(html_ok, html.fromstring(file_zip))
- ci.assert_html_equal(html_ok, html.fromstring(file_tar))
-
- file_list = path.get_archived_file_list(html_zip_file)
- ci.assert_html_equal(
- html_ok,
- html.fromstring(
- files.read_archived_file(html_zip_file, html_regex, file_list=file_list)
- ),
- )
-
- # HTML
- html_zip = files.read_archived_html(html_zip_file, html_regex)
- html_tar = files.read_archived_html(html_tar_file, html_regex)
- ci.assert_html_equal(html_ok, html_zip)
- ci.assert_html_equal(html_ok, html_tar)
- ci.assert_html_equal(
- html_ok,
- files.read_archived_html(
- html_tar_file,
- html_regex,
- file_list=path.get_archived_file_list(html_tar_file),
- ),
- )
-
- # ERRORS
- with pytest.raises(TypeError):
- files.read_archived_file(targz_file, xml_regex)
- with pytest.raises(TypeError):
- files.read_archived_file(sz_file, xml_regex)
- with pytest.raises(FileNotFoundError):
- files.read_archived_file(zip_file, "cdzeferf")
-
-
def test_cp_rm():
"""Test CP/RM functions"""
with tempfile.TemporaryDirectory() as tmp_dir:
@@ -242,10 +92,6 @@ def test_json():
) # Enum are stored following their value
assert obj == test_dict
- # Test deprecation
- with pytest.deprecated_call():
- files.save_json(json_file, test_dict)
-
def test_pickle():
"""Test pickle functions"""
diff --git a/ci/test_path.py b/ci/test_path.py
index 66ad609..6da5458 100644
--- a/ci/test_path.py
+++ b/ci/test_path.py
@@ -16,13 +16,12 @@
"""Script testing the files"""
import os
-import shutil
import tempfile
import pytest
-from ci.script_utils import files_path, get_s3_ci_path, s3_env
-from sertit import AnyPath, ci, misc, path, vectors
+from ci.script_utils import get_s3_ci_path
+from sertit import AnyPath, ci, misc, path
ci.reduce_verbosity()
@@ -65,58 +64,6 @@ def test_paths():
assert not path.is_writable("cvfgbherth") # Non-existing
-@s3_env
-def test_archived_paths():
- landsat_name = "LM05_L1TP_200030_20121230_20200820_02_T2_CI"
- ok_folder = files_path().joinpath(landsat_name)
- zip_file = files_path().joinpath(f"{landsat_name}.zip")
- tar_file = files_path().joinpath(f"{landsat_name}.tar")
- targz_file = files_path().joinpath(f"{landsat_name}.tar.gz")
- sz_file = files_path().joinpath(f"{landsat_name}.7z")
-
- # Archive file
- tif_name = "LM05_L1TP_200030_20121230_20200820_02_T2_QA_RADSAT.TIF"
- tif_ok = f"{ok_folder.name}/{tif_name}"
- tif_regex = f".*{tif_name}"
- assert tif_ok == path.get_archived_path(zip_file, tif_regex)
- assert tif_ok == path.get_archived_path(zip_file, tif_regex, as_list=True)[0]
- assert tif_ok == path.get_archived_path(tar_file, ".*RADSAT")
-
- # RASTERIO
- tif_zip = path.get_archived_rio_path(zip_file, tif_regex)
- tif_list = path.get_archived_rio_path(zip_file, tif_regex, as_list=True)
- tif_tar = path.get_archived_rio_path(tar_file, ".*RADSAT")
- tif_ok = ok_folder.joinpath(tif_name)
- ci.assert_raster_equal(tif_ok, tif_zip)
- ci.assert_raster_equal(tif_ok, tif_list[0])
- ci.assert_raster_equal(tif_ok, tif_tar)
-
- file_list = path.get_archived_file_list(zip_file)
- ci.assert_raster_equal(
- tif_ok, path.get_archived_rio_path(zip_file, tif_regex, file_list=file_list)
- )
-
- # VECTORS
- vect_name = "map-overlay.kml"
- vec_ok_path = ok_folder.joinpath(vect_name)
- if shutil.which("ogr2ogr"): # Only works if ogr2ogr can be found.
- vect_regex = f".*{vect_name}"
- vect_zip = vectors.read(zip_file, archive_regex=vect_regex)
- vect_tar = vectors.read(tar_file, archive_regex=r".*overlay\.kml")
- vect_ok = vectors.read(vec_ok_path)
- assert not vect_ok.empty
- ci.assert_geom_equal(vect_ok, vect_zip)
- ci.assert_geom_equal(vect_ok, vect_tar)
-
- # ERRORS
- with pytest.raises(TypeError):
- path.get_archived_rio_path(targz_file, tif_regex)
- with pytest.raises(TypeError):
- path.get_archived_rio_path(sz_file, tif_regex)
- with pytest.raises(FileNotFoundError):
- path.get_archived_rio_path(zip_file, "cdzeferf")
-
-
def test_get_file_name():
"""Test get_file_name"""
file_name = path.get_filename(__file__)
diff --git a/ci/test_rasters.py b/ci/test_rasters.py
index cca0295..74680a3 100644
--- a/ci/test_rasters.py
+++ b/ci/test_rasters.py
@@ -32,8 +32,6 @@
INT8_NODATA,
UINT8_NODATA,
UINT16_NODATA,
- any_raster_to_xr_ds,
- get_nodata_value_from_dtype,
get_nodata_value_from_xr,
)
from sertit.vectors import EPSG_4326
@@ -104,7 +102,7 @@ def ds_name(raster_path):
@pytest.fixture
def ds_dtype(raster_path):
with rasterio.open(str(raster_path)) as ds:
- return ds.meta["dtype"]
+ return getattr(np, ds.meta["dtype"])
@pytest.fixture
@@ -316,7 +314,7 @@ def test_crop(tmp_path, xda, xds, xda_dask, mask):
@s3_env
@dask_env
-def test_sieve(tmp_path, xda, xds, xda_dask):
+def test_sieve(tmp_path, raster_path, xda, xds, xda_dask):
"""Test sieve function"""
# DataArray
xda_sieved = os.path.join(tmp_path, "test_sieved_xda.tif")
@@ -350,6 +348,10 @@ def test_sieve(tmp_path, xda, xds, xda_dask):
ci.assert_raster_equal(xda_sieved, raster_sieved_path)
ci.assert_raster_equal(xds_sieved, raster_sieved_path)
+ # From path
+ sieve_xda_path = rasters.sieve(raster_path, sieve_thresh=20, connectivity=4)
+ np.testing.assert_array_equal(sieve_xda, sieve_xda_path)
+
@s3_env
@dask_env
@@ -675,11 +677,6 @@ def test_write(dtype, nodata_val, tmp_path, xda):
)
_test_raster_after_write(test_path, dtype, nodata_val)
- # test deprecation warning
- test_deprecated_path = os.path.join(tmp_path, "test_depr.tif")
- with pytest.deprecated_call():
- rasters.write(xda, path=test_deprecated_path, dtype=dtype)
-
def test_dim():
"""Test on BEAM-DIMAP function"""
@@ -900,51 +897,6 @@ def test_rasterize(tmp_path, raster_path):
ci.assert_raster_almost_equal(raster_true_path, out_path, decimal=4)
-@s3_env
-def test_decorator_deprecation(raster_path):
- from sertit.rasters import path_xarr_dst
-
- @any_raster_to_xr_ds
- def _ok_rasters(xds):
- assert isinstance(xds, xr.DataArray)
- return xds
-
- @path_xarr_dst
- def _depr_rasters(xds):
- assert isinstance(xds, xr.DataArray)
- return xds
-
- # Not able to warn deprecation from inside the decorator
- xr.testing.assert_equal(_ok_rasters(raster_path), _depr_rasters(raster_path))
-
-
-def test_get_nodata_deprecation():
- """Test deprecation of get_nodata_value"""
- # Test deprecation
- for dtype in [
- np.uint8,
- np.int8,
- np.uint16,
- np.uint32,
- np.int32,
- np.int64,
- np.uint64,
- int,
- "int",
- np.int16,
- np.float32,
- np.float64,
- float,
- "float",
- ]:
- with pytest.deprecated_call():
- from sertit.rasters import get_nodata_value
-
- ci.assert_val(
- get_nodata_value_from_dtype(dtype), get_nodata_value(dtype), dtype
- )
-
-
@s3_env
@dask_env
def test_get_notata_from_xr(raster_path):
diff --git a/ci/test_rasters_rio.py b/ci/test_rasters_rio.py
index 54a7d92..e69dc2c 100644
--- a/ci/test_rasters_rio.py
+++ b/ci/test_rasters_rio.py
@@ -26,7 +26,6 @@
from ci.script_utils import KAPUT_KWARGS, rasters_path, s3_env
from sertit import ci, rasters_rio, vectors
-from sertit.rasters_rio import any_raster_to_rio_ds, get_nodata_value_from_dtype
from sertit.vectors import EPSG_4326
ci.reduce_verbosity()
@@ -421,56 +420,3 @@ def _test_idx(idx_list):
_test_idx([1])
_test_idx([1, 2])
_test_idx(1)
-
-
-@s3_env
-def test_decorator_deprecation(raster_path):
- from sertit.rasters_rio import path_arr_dst
-
- @any_raster_to_rio_ds
- def _ok_rasters(ds):
- return ds.read()
-
- @path_arr_dst
- def _depr_rasters(ds):
- return ds.read()
-
- # Not able to warn deprecation from inside the decorator
- np.testing.assert_equal(_ok_rasters(raster_path), _depr_rasters(raster_path))
-
-
-def test_get_nodata_deprecation():
- """Test deprecation of get_nodata_value"""
- # Test deprecation
- for dtype in [
- np.uint8,
- np.int8,
- np.uint16,
- np.uint32,
- np.int32,
- np.int64,
- np.uint64,
- int,
- "int",
- np.int16,
- np.float32,
- np.float64,
- float,
- "float",
- ]:
- with pytest.deprecated_call():
- from sertit.rasters_rio import get_nodata_value
-
- ci.assert_val(
- get_nodata_value_from_dtype(dtype), get_nodata_value(dtype), dtype
- )
-
-
-@s3_env
-def test_write_deprecated(tmp_path, raster_path):
- test_deprecated_path = os.path.join(tmp_path, "test_depr.tif")
- raster, mtd = rasters_rio.read(raster_path)
-
- # test deprecation warning
- with pytest.deprecated_call():
- rasters_rio.write(raster, mtd, path=test_deprecated_path)
diff --git a/ci/test_s3.py b/ci/test_s3.py
index 81069d2..6cfe091 100644
--- a/ci/test_s3.py
+++ b/ci/test_s3.py
@@ -19,7 +19,7 @@
import pytest
import rasterio
-from cloudpathlib import AnyPath, S3Client
+from cloudpathlib import AnyPath
from tempenv import tempenv
from ci.script_utils import CI_SERTIT_S3
@@ -43,6 +43,8 @@ def with_s3(variable_1, variable_2):
def without_s3():
+ from cloudpathlib import S3Client
+
S3Client().set_as_default_client()
return base_fct(None)
diff --git a/ci/test_types.py b/ci/test_types.py
index b0cd0e3..1daf305 100644
--- a/ci/test_types.py
+++ b/ci/test_types.py
@@ -2,15 +2,24 @@
from typing import Union
import numpy as np
-from cloudpathlib import CloudPath
from sertit import AnyPath
from sertit.types import AnyPathType, is_iterable, make_iterable
+try:
+ from upath import UPath
+except ImportError:
+ UPath = None
+
+try:
+ from cloudpathlib import CloudPath
+except ImportError:
+ CloudPath = None
+
def test_types():
"""Test some type aliases"""
- assert AnyPathType == Union[Path, CloudPath]
+ assert AnyPathType == Union[Path, CloudPath, UPath]
def test_is_iterable():
diff --git a/ci/test_unistra.py b/ci/test_unistra.py
index c754b90..7d7c144 100644
--- a/ci/test_unistra.py
+++ b/ci/test_unistra.py
@@ -73,7 +73,10 @@ def test_unistra_s3():
assert with_s3() == 1
# Test get_geodatastore with s3
- assert str(get_geodatastore()) == "s3://sertit-geodatastore"
+ assert str(get_geodatastore()) in (
+ "s3://sertit-geodatastore/",
+ "s3://sertit-geodatastore",
+ )
# Test get_geodatastore without s3
with tempenv.TemporaryEnvironment({s3.USE_S3_STORAGE: "0"}):
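The membership check above accepts both spellings because the string form of a bucket root differs between backends: `upath.UPath` may render a trailing slash where `cloudpathlib` does not. A quick illustration, assuming `universal_pathlib` (plus `s3fs` for the s3 protocol) is installed; the bucket name is just an example:

    from upath import UPath

    p = UPath("s3://sertit-geodatastore")
    # Depending on the universal_pathlib version, this prints the bucket root
    # with or without a trailing slash, hence the two accepted values above.
    print(str(p))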
diff --git a/ci/test_vectors.py b/ci/test_vectors.py
index 273077c..3aea31d 100644
--- a/ci/test_vectors.py
+++ b/ci/test_vectors.py
@@ -21,11 +21,10 @@
import geopandas as gpd
import pytest
-from rasterio import CRS
from shapely import wkt
from ci.script_utils import KAPUT_KWARGS, files_path, s3_env, vectors_path
-from sertit import ci, files, path, vectors
+from sertit import archives, ci, files, path, vectors
from sertit.vectors import EPSG_4326, DataSourceError
ci.reduce_verbosity()
@@ -81,15 +80,6 @@ def test_vectors():
aoi = vectors.read(kml_path, **KAPUT_KWARGS)
_assert_attributes(aoi, kml_path)
- with pytest.deprecated_call():
- assert (
- vectors.corresponding_utm_projection(aoi.centroid.x, aoi.centroid.y)
- == "EPSG:32638"
- )
- assert CRS.from_string("EPSG:32638") == vectors.to_utm_crs(
- aoi.centroid.x, aoi.centroid.y
- )
-
env = aoi.envelope[0]
# Test kwargs (should be slightly different from the AOI to prove bbox does something)
@@ -280,7 +270,10 @@ def test_read_archived():
map_overlay_extracted = vectors.read(map_overlay_extracted_path)
ci.assert_geom_equal(
- map_overlay_extracted, vectors.read(f"{zip_landsat}!{landsat}/{map_overlay}")
+ map_overlay_extracted,
+ vectors.read(
+ zip_landsat.parent / (zip_landsat.name + f"!{landsat}/{map_overlay}")
+ ),
)
ci.assert_geom_equal(
map_overlay_extracted,
@@ -291,7 +284,7 @@ def test_read_archived():
vectors.read(tar_landsat, archive_regex=map_overlay_regex),
)
- file_list = path.get_archived_file_list(tar_landsat)
+ file_list = archives.get_archived_file_list(tar_landsat)
ci.assert_geom_equal(
map_overlay_extracted,
vectors.read(tar_landsat, archive_regex=map_overlay_regex, file_list=file_list),
diff --git a/ci/test_xml.py b/ci/test_xml.py
index 2236d23..000df2a 100644
--- a/ci/test_xml.py
+++ b/ci/test_xml.py
@@ -111,7 +111,7 @@ def test_xml():
_assert_str(cv_xml.findtext(".//Age"), "20")
# Write
- true_xml = str(xml_path() / "true.xml")
+ true_xml = xml_path() / "true.xml"
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_xml = os.path.join(tmp_dir, "tmp.xml")
xml.write(cv_xml, tmp_xml)
@@ -121,7 +121,8 @@ def test_xml():
# Based on `files.read_archived_xml`, so it is considered to work.
# Just test the case with complete path to the archive
l8_archived = files_path() / "LM05_L1TP_200030_20121230_20200820_02_T2_CI.zip"
- xml_archived = f"{l8_archived}!LM05_L1TP_200030_20121230_20200820_02_T2_CI/LM05_L1TP_200030_20121230_20200820_02_T2_MTL.xml"
+ xml_path_in_zip = "!LM05_L1TP_200030_20121230_20200820_02_T2_CI/LM05_L1TP_200030_20121230_20200820_02_T2_MTL.xml"
+ xml_archived = l8_archived.parent / (l8_archived.name + xml_path_in_zip)
ci.assert_xml_equal(
xml.read_archive(l8_archived, r".*_MTL\.xml"), xml.read_archive(xml_archived)
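Both rewritten call sites build the `{archive}!{inner_path}` string with `parent / (name + ...)` instead of interpolating the whole path object into an f-string, since the string form of a `UPath` is not guaranteed to match the old `cloudpathlib` rendering. A sketch of the convention (file names are illustrative):

    from sertit import AnyPath, vectors

    zip_path = AnyPath("/data/landsat.zip")
    inner = "!LANDSAT_DIR/map-overlay.kml"
    # vectors.read understands the `archive!inner/path` convention
    vec = vectors.read(zip_path.parent / (zip_path.name + inner))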
diff --git a/pyproject.toml b/pyproject.toml
index 4bfa4ce..d78644f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,8 +35,8 @@ dependencies = [
"dill",
"psutil",
"geopandas>=0.14.4",
- "cloudpathlib[all]>=0.12.1",
"xarray>=2024.06.0",
+ "universal_pathlib>=0.2.6"
]
dynamic = ["version"]
@@ -60,7 +60,8 @@ dask = [
"odc-geo>=0.4.6",
"xarray-spatial>=0.3.6",
]
-full = ["sertit[colorlog,rasters_rio,rasters,dask]"]
+cloudpathlib = ["cloudpathlib[all]>=0.12.1"]
+full = ["sertit[colorlog,rasters_rio,rasters,dask,cloudpathlib]"]
[project.urls]
Bug_Tracker = "https://github.com/sertit/sertit-utils/issues"
diff --git a/requirements.txt b/requirements.txt
index d02da9c..c4682aa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,6 +19,7 @@ lxml
dill
psutil
geopandas>=0.14.4
+universal_pathlib>=0.2.6
cloudpathlib[all]>=0.12.1
xarray>=2024.06.0
shapely >= 2.0.0
diff --git a/sertit/__init__.py b/sertit/__init__.py
index 4f4a348..ca7071b 100644
--- a/sertit/__init__.py
+++ b/sertit/__init__.py
@@ -21,11 +21,17 @@
"""
try:
- from cloudpathlib import AnyPath
+ from upath import UPath
+
+ AnyPath = UPath
- AnyPath = AnyPath
except ImportError:
- pass
+ try:
+ from cloudpathlib import AnyPath
+
+ AnyPath = AnyPath
+ except ImportError:
+ pass
# flake8: noqa
from .__meta__ import (
diff --git a/sertit/archives.py b/sertit/archives.py
new file mode 100644
index 0000000..db5d540
--- /dev/null
+++ b/sertit/archives.py
@@ -0,0 +1,548 @@
+import logging
+import os
+import re
+import shutil
+import tarfile
+import tempfile
+import zipfile
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Union
+
+from lxml import etree, html
+from tqdm import tqdm
+
+from sertit import AnyPath, path, s3
+from sertit.logs import SU_NAME
+from sertit.types import AnyPathStrType, AnyPathType
+
+LOGGER = logging.getLogger(SU_NAME)
+
+
+@contextmanager
+def open_zipfile(file_path, mode="r"):
+ if path.is_cloud_path(file_path):
+ file_path = s3.read(file_path)
+
+ with zipfile.ZipFile(file_path, mode) as zip_file:
+ yield zip_file
+
+
+@contextmanager
+def open_tarfile(file_path, mode="r"):
+ if path.is_cloud_path(file_path):
+ args = {"fileobj": s3.read(file_path), "mode": mode}
+ else:
+ args = {"name": file_path, "mode": mode}
+ with tarfile.open(**args) as tar_file:
+ yield tar_file
+
+
+def extract_file(
+ file_path: AnyPathStrType,
+ output: AnyPathStrType,
+ overwrite: bool = False,
+) -> AnyPathType:
+ """
+ Extract an archived file (zip or others). Overwrites if specified.
+ If the archive doesn't contain a root directory named after the archive (without its extension), one is created.
+
+ Args:
+ file_path (str): Archive file path
+ output (str): Output where to put the extracted directory
+ overwrite (bool): Overwrite found extracted directory
+
+ Returns:
+ AnyPathType: Extracted directory path
+
+ Example:
+ >>> file_path = 'D:/path/to/zip.zip'
+ >>> output = 'D:/path/to/output'
+ >>> extract_file(file_path, output, overwrite=True)
+ 'D:/path/to/output/zip'
+ """
+ # Convert to path
+ file_path = AnyPath(file_path)
+ output = AnyPath(output)
+
+ # In case a folder is given, returns it (this means that the file is already extracted)
+ if file_path.is_dir():
+ return file_path
+
+ # Beware with .SEN3 and .SAFE extensions
+ archive_output = output.joinpath(path.get_filename(file_path))
+
+ # In case not overwrite and the extracted directory already exists
+ if not overwrite and archive_output.exists():
+ LOGGER.debug(
+ "Already existing extracted %s. It won't be overwritten.",
+ archive_output,
+ )
+ return archive_output
+
+ def extract_sub_dir(arch, filename_list):
+ top_level_files = list({item.split("/")[0] for item in filename_list})
+
+ # When the only root directory in the archive has the right name, we don't have to create it
+ if len(top_level_files) == 1 and archive_output.name == path.get_filename(
+ top_level_files[0]
+ ):
+ arch.extractall(archive_output.parent)
+ archive_output.parent.joinpath(top_level_files[0]).rename(archive_output)
+ else:
+ arch.extractall(archive_output)
+
+ # Manage archive type
+ if file_path.suffix == ".zip":
+ with open_zipfile(file_path) as zip_file:
+ extract_sub_dir(zip_file, zip_file.namelist())
+ elif file_path.suffix == ".tar" or file_path.suffixes == [".tar", ".gz"]:
+ with open_tarfile(file_path) as tar_file:
+ extract_sub_dir(tar_file, tar_file.getnames())
+ elif file_path.suffix == ".7z":
+ try:
+ import py7zr
+
+ with py7zr.SevenZipFile(file_path, "r") as z7_file:
+ extract_sub_dir(z7_file, z7_file.getnames())
+ except ModuleNotFoundError as exc:
+ raise TypeError("Please install 'py7zr' to extract .7z files") from exc
+ else:
+ raise TypeError(
+ f"Only .zip, .tar, .tar.gz and .7z files can be extracted, not {file_path}"
+ )
+
+ return archive_output
+
+
+def extract_files(
+ archives: list, output: AnyPathStrType, overwrite: bool = False
+) -> list:
+ """
+ Extract all archived files. Overwrites if specified.
+
+ Example:
+ >>> file_path = ['D:/path/to/zip1.zip', 'D:/path/to/zip2.zip']
+ >>> output = 'D:/path/to/output'
+ >>> extract_files(file_path, output, overwrite=True)
+ ['D:/path/to/output/zip1', 'D:/path/to/output/zip2']
+
+ Args:
+ archives (list of str): List of archives to be extracted
+ output (str): Output folder where extracted files will be written
+ overwrite (bool): Overwrite found extracted files
+
+ Returns:
+ list: Extracted files (even pre-existing ones)
+ """
+ LOGGER.info("Extracting products in %s", output)
+ progress_bar = tqdm(archives)
+ extracts = []
+ for arch in progress_bar:
+ progress_bar.set_description(f"Extracting product {os.path.basename(arch)}")
+ extracts.append(extract_file(arch, output, overwrite))
+
+ return extracts
+
+
+def read_archived_file(
+ archive_path: AnyPathStrType, regex: str, file_list: list = None
+) -> bytes:
+ """
+ Read archived file (in bytes) from :code:`zip` or :code:`tar` archives.
+
+ You can use this `site <https://regex101.com/>`_ to build your regex.
+
+ Args:
+ archive_path (AnyPathStrType): Archive path
+ regex (str): Regex (used by re) as it can be found in the getmembers() list
+ file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
+
+ Returns:
+ bytes: Archived file in bytes
+ """
+ archive_path = AnyPath(archive_path)
+
+ # Compile regex
+ regex = re.compile(regex)
+
+ # Open tar and zip XML
+ try:
+ if archive_path.suffix == ".tar":
+ with open_tarfile(archive_path) as tar_ds:
+ # file_list is not very useful for TAR files...
+ if file_list is None:
+ tar_mb = tar_ds.getmembers()
+ file_list = [mb.name for mb in tar_mb]
+ name = list(filter(regex.match, file_list))[0]
+ tarinfo = tar_ds.getmember(name)
+ file_str = tar_ds.extractfile(tarinfo).read()
+ elif archive_path.suffix == ".zip":
+ with open_zipfile(archive_path) as zip_ds:
+ if file_list is None:
+ file_list = [f.filename for f in zip_ds.filelist]
+ name = list(filter(regex.match, file_list))[0]
+ file_str = zip_ds.read(name)
+
+ elif archive_path.suffix == ".tar.gz":
+ raise TypeError(
+ ".tar.gz files are too slow to read from inside the archive. Please extract them instead."
+ )
+ else:
+ raise TypeError(
+ "Only .zip and .tar files can be read from inside its archive."
+ )
+ except IndexError as exc:
+ raise FileNotFoundError(
+ f"Impossible to find file {regex} in {path.get_filename(archive_path)}"
+ ) from exc
+
+ return file_str
+
+
+def read_archived_xml(
+ archive_path: AnyPathStrType, regex: str = None, file_list: list = None, **kwargs
+) -> etree._Element:
+ """
+ Read archived XML from :code:`zip` or :code:`tar` archives.
+
+ You can use this `site <https://regex101.com/>`_ to build your regex.
+
+ Args:
+ archive_path (AnyPathStrType): Archive path
+ regex (str): XML regex (used by re) as it can be found in the getmembers() list
+ file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
+
+ Returns:
+ etree._Element: XML file
+
+ Example:
+ >>> arch_path = 'D:/path/to/zip.zip'
+ >>> file_regex = '.*dir.*file_name' # Use .* for any character
+ >>> read_archived_xml(arch_path, file_regex)
+ <Element ... at 0x...>
+ """
+ xml_bytes = read_archived_file(archive_path, regex=regex, file_list=file_list)
+
+ return etree.fromstring(xml_bytes)
+
+
+def read_archived_html(
+ archive_path: AnyPathStrType, regex: str, file_list: list = None
+) -> html.HtmlElement:
+ """
+ Read archived HTML from :code:`zip` or :code:`tar` archives.
+
+ You can use this `site <https://regex101.com/>`_ to build your regex.
+
+ Args:
+ archive_path (AnyPathStrType): Archive path
+ regex (str): HTML regex (used by re) as it can be found in the getmembers() list
+ file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
+
+ Returns:
+ html.HtmlElement: HTML file
+
+ Example:
+ >>> arch_path = 'D:/path/to/zip.zip'
+ >>> file_regex = '.*dir.*file_name' # Use .* for any character
+ >>> read_archived_html(arch_path, file_regex)
+ <Element html at 0x...>
+ """
+ html_bytes = read_archived_file(archive_path, regex, file_list=file_list)
+
+ return html.fromstring(html_bytes)
+
+
+def archive(
+ folder_path: AnyPathStrType,
+ archive_path: AnyPathStrType,
+ fmt: str = "zip",
+) -> AnyPathType:
+ """
+ Archives a folder recursively.
+
+ Args:
+ folder_path (AnyPathStrType): Folder to archive
+ archive_path (AnyPathStrType): Archive path, with or without extension
+ fmt (str): Format of the archive, used by :code:`shutil.make_archive`. Choose between [zip, tar, gztar, bztar, xztar]
+
+ Returns:
+ AnyPathType: Archive path
+
+ Example:
+ >>> folder_path = 'D:/path/to/folder_to_archive'
+ >>> archive_path = 'D:/path/to/output'
+ >>> archive(folder_path, archive_path, fmt="gztar")
+ 'D:/path/to/output/folder_to_archive.tar.gz'
+ """
+ archive_path = AnyPath(archive_path)
+ folder_path = AnyPath(folder_path)
+
+ # with zipfile.ZipFile(archive_path, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
+ # for f in folder_path.glob("**"):
+ # zipf.write(f, f.relative_to(folder_path.name))
+
+ tmp_dir = None
+ if path.is_cloud_path(folder_path):
+ tmp_dir = tempfile.TemporaryDirectory()
+ folder_path = s3.download(folder_path, tmp_dir.name)
+
+ # Shutil make_archive needs a path without extension
+ archive_base = os.path.splitext(archive_path)[0]
+
+ # Archive the folder
+ archive_fn = shutil.make_archive(
+ archive_base,
+ format=fmt,
+ root_dir=folder_path.parent,
+ base_dir=folder_path.name,
+ )
+
+ if tmp_dir is not None:
+ tmp_dir.cleanup()
+
+ try:
+ arch = AnyPath(archive_fn, **folder_path.storage_options)
+ except AttributeError:
+ arch = AnyPath(archive_fn)
+
+ return arch
+
+
+def add_to_zip(
+ zip_path: AnyPathStrType,
+ dirs_to_add: Union[list, AnyPathStrType],
+) -> AnyPathType:
+ """
+ Add folders to an already existing zip file (recursively).
+
+ Args:
+ zip_path (AnyPathStrType): Already existing zip file
+ dirs_to_add (Union[list, AnyPathStrType]): Directories to add
+
+ Returns:
+ AnyPathType: Updated zip_path
+
+ Example:
+ >>> zip_path = 'D:/path/to/zip.zip'
+ >>> dirs_to_add = ['D:/path/to/dir1', 'D:/path/to/dir2']
+ >>> add_to_zip(zip_path, dirs_to_add)
+ zip.zip contains 2 more folders, dir1 and dir2
+ """
+ zip_path = AnyPath(zip_path)
+
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ # If the zip is on the cloud, cache it (zipfile doesn't like cloud paths)
+ if path.is_cloud_path(zip_path):
+ raise NotImplementedError(
+ "Impossible (for now) to update a zip stored in the cloud!"
+ )
+
+ # Check if existing zipfile
+ if not zip_path.is_file():
+ raise FileNotFoundError(f"Non existing {zip_path}")
+
+ # Convert to list if needed
+ if not isinstance(dirs_to_add, list):
+ dirs_to_add = [dirs_to_add]
+
+ # Add all folders to the existing zip
+ # Forced to use ZipFile because make_archive only works with one folder and not existing zipfile
+ with open_zipfile(zip_path, "a") as zip_file:
+ progress_bar = tqdm(dirs_to_add)
+ for dir_to_add_path in progress_bar:
+ # Just to be sure, use str instead of Paths
+ if isinstance(dir_to_add_path, Path):
+ dir_to_add = str(dir_to_add_path)
+ elif path.is_cloud_path(dir_to_add_path):
+ dir_to_add = dir_to_add_path.fspath
+ else:
+ dir_to_add = dir_to_add_path
+
+ progress_bar.set_description(
+ f"Adding {os.path.basename(dir_to_add)} to {os.path.basename(zip_path)}"
+ )
+ if os.path.isfile(dir_to_add):
+ dir_to_add = extract_file(dir_to_add, tmp_dir)
+
+ for root, _, files in os.walk(dir_to_add):
+ base_path = os.path.join(dir_to_add, "..")
+
+ # Write dir (in namelist at least)
+ zip_file.write(root, os.path.relpath(root, base_path))
+
+ # Write files
+ for file in files:
+ zip_file.write(
+ os.path.join(root, file),
+ os.path.relpath(
+ os.path.join(root, file), base_path
+ ),
+ )
+
+ return zip_path
+
+
+def get_archived_file_list(archive_path: AnyPathStrType) -> list:
+ """
+ Get the list of all the files contained in an archive.
+
+ Args:
+ archive_path (AnyPathStrType): Archive path
+
+ Returns:
+ list: All files contained in the given archive
+
+ Example:
+ >>> arch_path = 'D:/path/to/zip.zip'
+ >>> get_archived_file_list(arch_path)
+ ['file_1.txt', 'file_2.tif', 'file_3.xml', 'file_4.geojson']
+ """
+ archive_path = AnyPath(archive_path)
+
+ is_zip = archive_path.suffix == ".zip"
+ archive_fn = path.get_filename(archive_path)
+ if is_zip:
+ with open_zipfile(archive_path) as zip_ds:
+ file_list = [f.filename for f in zip_ds.filelist]
+ else:
+ try:
+ with open_tarfile(archive_path) as tar_ds:
+ tar_mb = tar_ds.getmembers()
+ file_list = [mb.name for mb in tar_mb]
+ except tarfile.ReadError as ex:
+ raise tarfile.ReadError(f"Impossible to open archive: {archive_fn}") from ex
+
+ return file_list
+
+
+def get_archived_path(
+ archive_path: AnyPathStrType,
+ regex: str,
+ as_list: bool = False,
+ case_sensitive: bool = False,
+ file_list: list = None,
+ **kwargs,
+) -> Union[list, AnyPathType]:
+ """
+ Get archived file path from inside the archive.
+
+ .. WARNING::
+ If :code:`as_list` is :code:`False`, it will only return the first file matched!
+
+ You can use this `site <https://regex101.com/>`_ to build your regex.
+
+ Args:
+ archive_path (AnyPathStrType): Archive path
+ regex (str): File regex (used by re) as it can be found in the getmembers() list
+ as_list (bool): If true, returns a list (including all found files). If false, returns only the first match
+ case_sensitive (bool): If true, the regex is case-sensitive.
+ file_list (list): List of files to get archived from. Optional, if not given it will be re-computed.
+
+ Returns:
+ Union[list, str]: Path from inside the zipfile
+
+ Example:
+ >>> arch_path = 'D:/path/to/zip.zip'
+ >>> file_regex = '.*dir.*file_name' # Use .* for any character
+ >>> path = get_archived_path(arch_path, file_regex)
+ 'dir/filename.tif'
+ """
+ # Get file list
+ archive_path = AnyPath(archive_path)
+
+ # Offer the ability to give the file list directly, as this operation is expensive when done with large archives stored on the cloud
+ if file_list is None:
+ file_list = get_archived_file_list(archive_path)
+
+ # Search for file
+ re_rgx = re.compile(regex) if case_sensitive else re.compile(regex, re.IGNORECASE)
+ archived_band_paths = list(filter(re_rgx.match, file_list))
+ if not archived_band_paths:
+ raise FileNotFoundError(
+ f"Impossible to find file {regex} in {path.get_filename(archive_path)}"
+ )
+
+ # Convert to str if needed
+ if not as_list:
+ archived_band_paths = archived_band_paths[0]
+
+ return archived_band_paths
+
+
+def get_archived_rio_path(
+ archive_path: AnyPathStrType,
+ regex: str,
+ as_list: bool = False,
+ file_list: list = None,
+ **kwargs,
+) -> Union[list, AnyPathType]:
+ """
+ Get archived file path from inside the archive, to be read with rasterio:
+
+ - :code:`zip+file://{zip_path}!{file_name}`
+ - :code:`tar+file://{tar_path}!{file_name}`
+
+
+ See the `rasterio documentation <https://rasterio.readthedocs.io/>`_ on dataset identifiers
+ for more information.
+
+ .. WARNING::
+ It won't be readable by pandas, geopandas or xmltree!
+
+ .. WARNING::
+ If :code:`as_list` is :code:`False`, it will only return the first file matched!
+
+ You can use this `site <https://regex101.com/>`_ to build your regex.
+
+ Args:
+ archive_path (AnyPathStrType): Archive path
+ regex (str): File regex (used by re) as it can be found in the getmembers() list
+ as_list (bool): If true, returns a list (including all found files). If false, returns only the first match
+ file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
+
+ Returns:
+ Union[list, str]: Band path that can be read by rasterio
+
+ Example:
+ >>> arch_path = 'D:/path/to/zip.zip'
+ >>> file_regex = '.*dir.*file_name' # Use .* for any character
+ >>> path = get_archived_rio_path(arch_path, file_regex)
+ '/vsizip/D:/path/to/output.zip/dir/filename.tif'
+ >>> rasterio.open(path)
+ <open DatasetReader name='/vsizip/D:/path/to/output.zip/dir/filename.tif' mode='r'>
+ """
+ archive_path = AnyPath(archive_path)
+ if archive_path.suffix in [".tar", ".zip"]:
+ prefix = archive_path.suffix[-3:]
+ elif archive_path.suffix == ".tar.gz":
+ raise TypeError(
+ ".tar.gz files are too slow to be read from inside the archive. Please extract them instead."
+ )
+ else:
+ raise TypeError("Only .zip and .tar files can be read from inside its archive.")
+
+ # Search for file
+ archived_band_paths = get_archived_path(
+ archive_path, regex=regex, as_list=True, file_list=file_list
+ )
+
+ # Convert to rio path
+ if path.is_cloud_path(archive_path):
+ archived_band_paths = [
+ f"{prefix}+file+{archive_path}!{p}" for p in archived_band_paths
+ ]
+ else:
+ # archived_band_paths = [
+ # f"{prefix}+file://{archive_path}!{path}" for path in archived_band_paths
+ # ]
+ archived_band_paths = [
+ f"/vsi{prefix}/{archive_path}/{p}" for p in archived_band_paths
+ ]
+
+ # Convert to str if needed
+ if not as_list:
+ archived_band_paths = archived_band_paths[0]
+
+ return archived_band_paths
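A sketch of reading a band straight out of an archive with `get_archived_rio_path` above (archive path and regex are illustrative; `rasterio` is required):

    import rasterio

    from sertit import archives

    rio_path = archives.get_archived_rio_path("/data/landsat.zip", r".*_QA_RADSAT\.TIF")
    with rasterio.open(rio_path) as ds:
        print(ds.meta)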
diff --git a/sertit/arcpy.py b/sertit/arcpy.py
index a82f603..64cc9f9 100644
--- a/sertit/arcpy.py
+++ b/sertit/arcpy.py
@@ -1,8 +1,6 @@
import logging
import logging.handlers
-from sertit.logs import deprecation_warning
-
# Arcpy types from inside a schema
SHORT = "int32:4"
""" 'Short' type for ArcGis GDB """
@@ -153,32 +151,6 @@ def emit(self, record):
super(ArcPyLogHandler, self).emit(record)
-def feature_layer_to_path(feature_layer) -> str:
- """
- .. deprecated:: 1.36.0
- Use :py:func:`gp_layer_to_path` instead.
-
- Use :func:`gp_layer_to_path` instead.
-
- Convert a feature layer to its source path.
-
- Args:
- feature_layer: Feature layer
-
- Returns:
- str: Path to the feature layer source
-
- """
- deprecation_warning("This function is deprecated. Use gp_layer_to_path instead.")
- # Get path
- if hasattr(feature_layer, "dataSource"):
- path = feature_layer.dataSource
- else:
- path = str(feature_layer)
-
- return path
-
-
def gp_layer_to_path(feature_layer) -> str:
"""
Convert a GP layer to its source path.
diff --git a/sertit/ci.py b/sertit/ci.py
index 8b000ba..9e07937 100644
--- a/sertit/ci.py
+++ b/sertit/ci.py
@@ -20,6 +20,7 @@
import filecmp
import logging
import pprint
+import tempfile
from doctest import Example
from typing import Any, Union
@@ -30,8 +31,8 @@
from shapely import force_2d, normalize
from shapely.testing import assert_geometries_equal
-from sertit import AnyPath, files, s3, unistra
-from sertit.logs import SU_NAME, deprecation_warning
+from sertit import AnyPath, files, path, s3
+from sertit.logs import SU_NAME
from sertit.types import AnyPathStrType, AnyXrDataStructure
LOGGER = logging.getLogger(SU_NAME)
@@ -42,61 +43,6 @@
AWS_S3_ENDPOINT = s3.AWS_S3_ENDPOINT
-def s3_env(*args, **kwargs):
- """
- .. deprecated:: 1.30.0
- Import it from :py:mod:`sertit.unistra` instead of :py:mod:`sertit.ci`
- """
- deprecation_warning(
- "This function is deprecated. Import it from 'sertit.unistra' instead of 'sertit.ci'"
- )
- return unistra.s3_env(*args, **kwargs)
-
-
-def define_s3_client():
- """
- .. deprecated:: 1.30.0
- Import it from :py:mod:`sertit.unistra` instead of :py:mod:`sertit.ci`
- """
- deprecation_warning(
- "This function is deprecated. Import it from 'sertit.unistra' instead of 'sertit.ci'"
- )
- return unistra.define_s3_client()
-
-
-def get_db2_path():
- """
- .. deprecated:: 1.30.0
- Import it from :py:mod:`sertit.unistra` instead of :py:mod:`sertit.ci`
- """
- deprecation_warning(
- "This function is deprecated. Import it from 'sertit.unistra' instead of 'sertit.ci'"
- )
- return unistra.get_db2_path()
-
-
-def get_db3_path():
- """
- .. deprecated:: 1.30.0
- Import it from :py:mod:`sertit.unistra` instead of :py:mod:`sertit.ci`
- """
- deprecation_warning(
- "This function is deprecated. Import it from 'sertit.unistra' instead of 'sertit.ci'"
- )
- return unistra.get_db3_path()
-
-
-def get_db4_path():
- """
- .. deprecated:: 1.30.0
- Import it from :py:mod:`sertit.unistra` instead of :py:mod:`sertit.ci`
- """
- deprecation_warning(
- "This function is deprecated. Import it from 'sertit.unistra' instead of 'sertit.ci'"
- )
- return unistra.get_db4_path()
-
-
def assert_val(val_1: Any, val_2: Any, field: str) -> None:
"""
Compare two values corresponding to a field
@@ -140,7 +86,7 @@ def assert_files_equal(file_1: AnyPathStrType, file_2: AnyPathStrType):
file_1 (str): Path to file 1
file_2 (str): Path to file 2
"""
- with file_1.open("r") as f1, file_2.open("r") as f2:
+ with AnyPath(file_1).open("r") as f1, AnyPath(file_2).open("r") as f2:
assert files.hash_file_content(f1.read()) == files.hash_file_content(f2.read())
@@ -381,27 +327,36 @@ def assert_dir_equal(path_1: AnyPathStrType, path_2: AnyPathStrType) -> None:
assert path_1.is_dir(), f"{path_1} is not a directory!"
assert path_2.is_dir(), f"{path_2} is not a directory!"
- dcmp = filecmp.dircmp(path_1, path_2)
- try:
- assert (
- dcmp.left_only == []
- ), f"More files in {path_1}!\n{pprint.pformat(list(dcmp.left_only))}"
- assert (
- dcmp.right_only == []
- ), f"More files in {path_2}!\n{pprint.pformat(list(dcmp.right_only))}"
- except FileNotFoundError:
- files_1 = [AnyPath(p).name for p in AnyPath(path_1).iterdir()]
- files_2 = [AnyPath(p).name for p in AnyPath(path_2).iterdir()]
-
- for f1 in files_1:
- assert (
- f1 in files_2
- ), f"File missing!\n{f1} not in {pprint.pformat(files_2)}"
+ with (
+ tempfile.TemporaryDirectory() as tmpdir,
+ tempfile.TemporaryDirectory() as tmpdir2,
+ ):
+ if path.is_cloud_path(path_1):
+ path_1 = s3.download(path_1, tmpdir)
+ if path.is_cloud_path(path_2):
+ path_2 = s3.download(path_2, tmpdir2)
- for f2 in files_2:
+ dcmp = filecmp.dircmp(path_1, path_2)
+ try:
+ assert (
+ dcmp.left_only == []
+ ), f"More files in {path_1}!\n{pprint.pformat(list(dcmp.left_only))}"
assert (
- f2 in files_1
- ), f"File missing!\n{f2} not in {pprint.pformat(files_1)}"
+ dcmp.right_only == []
+ ), f"More files in {path_2}!\n{pprint.pformat(list(dcmp.right_only))}"
+ except FileNotFoundError:
+ files_1 = [p.name for p in path_1.iterdir()]
+ files_2 = [p.name for p in path_2.iterdir()]
+
+ for f1 in files_1:
+ assert (
+ f1 in files_2
+ ), f"File missing!\n{f1} not in {pprint.pformat(files_2)}"
+
+ for f2 in files_2:
+ assert (
+ f2 in files_1
+ ), f"File missing!\n{f2} not in {pprint.pformat(files_1)}"
def assert_geom_equal(
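Together with the 1.44.x changelog entry "Allow str as paths in `ci.assert_files_equal`", the `AnyPath` coercion and the download-before-compare logic above mean the assert helpers accept plain strings, local paths or cloud paths. A minimal sketch (file and bucket names are illustrative):

    from sertit import AnyPath, ci

    # Plain strings now work for file comparison
    ci.assert_files_equal("expected.html", "produced.html")

    # Cloud directories are cached locally before the filecmp-based comparison
    ci.assert_dir_equal(AnyPath("/data/expected_dir"), AnyPath("s3://bucket/produced_dir"))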
diff --git a/sertit/files.py b/sertit/files.py
index 4bdb6c2..7be06e9 100644
--- a/sertit/files.py
+++ b/sertit/files.py
@@ -19,23 +19,17 @@
import json
import logging
import os
-import re
import shutil
-import tarfile
-import tempfile
-import zipfile
from datetime import date, datetime
from enum import Enum
from json import JSONDecoder, JSONEncoder
from pathlib import Path
-from typing import Any, Union
+from typing import Any
import dill
import numpy as np
-from lxml import etree, html
-from tqdm import tqdm
-from sertit import AnyPath, logs, path
+from sertit import AnyPath, path, s3
from sertit.logs import SU_NAME
from sertit.strings import DATE_FORMAT
from sertit.types import AnyPathStrType, AnyPathType
@@ -43,628 +37,6 @@
LOGGER = logging.getLogger(SU_NAME)
-def get_root_path() -> AnyPathType:
- """
- .. deprecated:: 1.30.0
- Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
- Get the root path of the current disk:
-
- - On Linux this returns :code:`/`
- - On Windows this returns :code:`C:/` or whatever the current drive is
-
- Example:
- >>> get_root_path()
- "/" on Linux
- "C:/" on Windows (if you run this code from the C: drive)
- """
- logs.deprecation_warning(
- "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
- )
- return path.get_root_path()
-
-
-def listdir_abspath(directory: AnyPathStrType) -> list:
- """
- .. deprecated:: 1.30.0
- Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
- Get absolute path of all files in the given directory.
-
- It is the same function than :code:`os.listdir` but returning absolute paths.
-
- Args:
- directory (AnyPathStrType): Relative or absolute path to the directory to be scanned
-
- Returns:
- str: Absolute path of all files in the given directory
-
- Example:
- >>> folder = "."
- >>> listdir_abspath(folder)
- ['D:/_SERTIT_UTILS/sertit-utils/sertit/files.py',
- 'D:/_SERTIT_UTILS/sertit-utils/sertit/logs.py',
- 'D:/_SERTIT_UTILS/sertit-utils/sertit/misc.py',
- 'D:/_SERTIT_UTILS/sertit-utils/sertit/network.py',
- 'D:/_SERTIT_UTILS/sertit-utils/sertit/rasters_rio.py',
- 'D:/_SERTIT_UTILS/sertit-utils/sertit/strings.py',
- 'D:/_SERTIT_UTILS/sertit-utils/sertit/vectors.py',
- 'D:/_SERTIT_UTILS/sertit-utils/sertit/version.py',
- 'D:/_SERTIT_UTILS/sertit-utils/sertit/__init__.py']
- """
- logs.deprecation_warning(
- "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
- )
- return path.listdir_abspath(directory)
-
-
-def to_abspath(
- raw_path: AnyPathStrType,
- create: bool = True,
- raise_file_not_found: bool = True,
-) -> AnyPathType:
- """
- .. deprecated:: 1.30.0
- Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
- Return the absolute path of the specified path and check if it exists
-
- If not:
-
- - If it is a file (aka has an extension), it raises an exception
- - If it is a folder, it creates it
-
- To be used with argparse to retrieve the absolute path of a file, like:
-
- Args:
- raw_path (AnyPathStrType): Path as a string (relative or absolute)
- create (bool): Create directory if not existing
-
- Returns:
- AnyPathType: Absolute path
-
- Example:
- >>> parser = argparse.ArgumentParser()
- >>> # Add config file path key
- >>> parser.add_argument(
- >>> "--config",
- >>> help="Config file path (absolute or relative)",
- >>> type=to_abspath
- >>> )
- """
- logs.deprecation_warning(
- "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
- )
- return path.to_abspath(raw_path, create, raise_file_not_found)
-
-
-def real_rel_path(raw_path: AnyPathStrType, start: AnyPathStrType) -> AnyPathType:
- """
- .. deprecated:: 1.30.0
- Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
- Gives the real relative path from a starting folder.
- (and not just adding :code:`../..` between the start and the target)
-
- Args:
- raw_path (AnyPathStrType): Path to make relative
- start (AnyPathStrType): Start, the path being relative from this folder.
-
- Returns:
- Relative path
-
- Example:
- >>> path = r'D:/_SERTIT_UTILS/sertit-utils/sertit'
- >>> start = os.path.join(".", "..", "..")
- >>> real_rel_path(path, start)
- 'sertit-utils/sertit'
- """
- logs.deprecation_warning(
- "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
- )
- return path.real_rel_path(raw_path, start)
-
-
-def extract_file(
- file_path: AnyPathStrType,
- output: AnyPathStrType,
- overwrite: bool = False,
-) -> AnyPathType:
- """
- Extract an archived file (zip or others). Overwrites if specified.
- If the archive don't contain a root directory with the name of the archive without the extension, create it
-
- Args:
- file_path (str): Archive file path
- output (str): Output where to put the extracted directory
- overwrite (bool): Overwrite found extracted directory
-
- Returns:
- AnyPathType: Extracted directory paths
-
- Example:
- >>> file_path = 'D:/path/to/zip.zip'
- >>> output = 'D:/path/to/output'
- >>> extract_file(file_path, output, overwrite=True)
- D:/path/to/output/zip'
- """
- # Convert to path
- file_path = AnyPath(file_path)
- output = AnyPath(output)
-
- # In case a folder is given, returns it (this means that the file is already extracted)
- if file_path.is_dir():
- return file_path
-
- # Beware with .SEN3 and .SAFE extensions
- archive_output = output.joinpath(path.get_filename(file_path))
-
- # In case not overwrite and the extracted directory already exists
- if not overwrite and archive_output.exists():
- LOGGER.debug(
- "Already existing extracted %s. It won't be overwritten.",
- archive_output,
- )
- return archive_output
-
- def extract_sub_dir(arch, filename_list):
- top_level_files = list({item.split("/")[0] for item in filename_list})
-
- # When the only root directory in the archive has the right name, we don't have to create it
- if len(top_level_files) == 1 and archive_output.name == path.get_filename(
- top_level_files[0]
- ):
- arch.extractall(archive_output.parent)
- archive_output.parent.joinpath(top_level_files[0]).rename(archive_output)
- else:
- arch.extractall(archive_output)
-
- # Manage archive type
- if file_path.suffix == ".zip":
- with zipfile.ZipFile(file_path, "r") as zip_file:
- extract_sub_dir(zip_file, zip_file.namelist())
- elif file_path.suffix == ".tar" or file_path.suffixes == [".tar", ".gz"]:
- with tarfile.open(file_path, "r") as tar_file:
- extract_sub_dir(tar_file, tar_file.getnames())
- elif file_path.suffix == ".7z":
- try:
- import py7zr
-
- with py7zr.SevenZipFile(file_path, "r") as z7_file:
- extract_sub_dir(z7_file, z7_file.getnames())
- except ModuleNotFoundError as exc:
- raise TypeError("Please install 'py7zr' to extract .7z files") from exc
- else:
- raise TypeError(
- f"Only .zip, .tar, .tar.gz and .7z files can be extracted, not {file_path}"
- )
-
- return archive_output
-
-
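A hedged usage sketch of :code:`extract_file` (paths hypothetical). Note that this diff removes the function from :code:`sertit.files`; since the :code:`vectors.py` and :code:`xml.py` hunks below import a new :code:`sertit.archives` module, the sketch assumes the archive helpers now live there.

# Hypothetical paths; the 'sertit.archives' home is an assumption, not confirmed by this diff.
from sertit import archives

extracted = archives.extract_file("D:/path/to/zip.zip", "D:/path/to/output", overwrite=True)
print(extracted)  # e.g. D:/path/to/output/zip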
-def extract_files(
- archives: list, output: AnyPathStrType, overwrite: bool = False
-) -> list:
- """
- Extract all archived files. Overwrites if specified.
-
- Example:
- >>> file_path = ['D:/path/to/zip1.zip', 'D:/path/to/zip2.zip']
- >>> output = 'D:/path/to/output'
- >>> extract_files(file_path, output, overwrite=True)
- ['D:/path/to/output/zip1', 'D:/path/to/output/zip2']
-
- Args:
- archives (list of str): List of archives to be extracted
- output (AnyPathStrType): Output folder where extracted files will be written
- overwrite (bool): Overwrite found extracted files
-
- Returns:
- list: Extracted files (even pre-existing ones)
- """
- LOGGER.info("Extracting products in %s", output)
- progress_bar = tqdm(archives)
- extracts = []
- for arch in progress_bar:
- progress_bar.set_description(f"Extracting product {os.path.basename(arch)}")
- extracts.append(extract_file(arch, output, overwrite))
-
- return extracts
-
-
-def get_archived_file_list(archive_path: AnyPathStrType) -> list:
- """
- .. deprecated:: 1.30.0
- Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
- Get the list of all the files contained in an archive.
-
- Args:
- archive_path (AnyPathStrType): Archive path
-
- Returns:
- list: All files contained in the given archive
-
- Example:
- >>> arch_path = 'D:/path/to/zip.zip'
- >>> get_archived_file_list(arch_path)
- ['file_1.txt', 'file_2.tif', 'file_3.xml', 'file_4.geojson']
- """
- logs.deprecation_warning(
- "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
- )
- return path.get_archived_file_list(archive_path)
-
-
-def get_archived_path(
- archive_path: AnyPathStrType, file_regex: str, as_list: bool = False
-) -> Union[list, AnyPathType]:
- """
- .. deprecated:: 1.30.0
- Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
- Get archived file path from inside the archive.
-
- .. WARNING::
- If :code:`as_list` is :code:`False`, it will only return the first file matched!
-
- You can use this `site `_ to build your regex.
-
- Example:
- >>> arch_path = 'D:/path/to/zip.zip'
- >>> file_regex = '.*dir.*file_name' # Use .* for any character
- >>> path = get_archived_path(arch_path, file_regex)
- 'dir/filename.tif'
-
- Args:
- archive_path (AnyPathStrType): Archive path
- file_regex (str): File regex (used by re) as it can be found in the getmembers() list
- as_list (bool): If true, returns a list (including all found files). If false, returns only the first match
-
- Returns:
- Union[list, str]: Path from inside the zipfile
- """
- logs.deprecation_warning(
- "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
- )
- return path.get_archived_path(archive_path, file_regex, as_list)
-
-
-def get_archived_rio_path(
- archive_path: AnyPathStrType, file_regex: str, as_list: bool = False
-) -> Union[list, AnyPathType]:
- """
- .. deprecated:: 1.30.0
- Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
- Get archived file path from inside the archive, to be read with rasterio:
-
- - :code:`zip+file://{zip_path}!{file_name}`
- - :code:`tar+file://{tar_path}!{file_name}`
-
-
- See `here `_
- for more information.
-
- .. WARNING::
- It won't be readable by pandas, geopandas or xmltree!
-
- .. WARNING::
- If :code:`as_list` is :code:`False`, it will only return the first file matched!
-
- You can use this `site `_ to build your regex.
-
- Args:
- archive_path (AnyPathStrType): Archive path
- file_regex (str): File regex (used by re) as it can be found in the getmembers() list
- as_list (bool): If true, returns a list (including all found files). If false, returns only the first match
-
- Returns:
- Union[list, str]: Band path that can be read by rasterio
-
- Example:
- >>> arch_path = 'D:/path/to/zip.zip'
- >>> file_regex = '.*dir.*file_name' # Use .* for any character
- >>> path = get_archived_rio_path(arch_path, file_regex)
- 'zip+file://D:/path/to/output.zip!dir/filename.tif'
- >>> rasterio.open(path)
-
- """
- logs.deprecation_warning(
- "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
- )
- return path.get_archived_rio_path(archive_path, file_regex, as_list)
-
-
-def read_archived_file(
- archive_path: AnyPathStrType, regex: str, file_list: list = None
-) -> bytes:
- """
- Read archived file (in bytes) from :code:`zip` or :code:`tar` archives.
-
- You can use this `site `_ to build your regex.
-
- Args:
- archive_path (AnyPathStrType): Archive path
- regex (str): Regex (used by re) as it can be found in the getmembers() list
- file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
-
- Returns:
- bytes: Archived file in bytes
- """
- archive_path = AnyPath(archive_path)
-
- # Compile regex
- regex = re.compile(regex)
-
- # Open tar and zip XML
- try:
- if archive_path.suffix == ".tar":
- with tarfile.open(archive_path) as tar_ds:
- # file_list is not very useful for TAR files...
- if file_list is None:
- tar_mb = tar_ds.getmembers()
- file_list = [mb.name for mb in tar_mb]
- name = list(filter(regex.match, file_list))[0]
- tarinfo = tar_ds.getmember(name)
- file_str = tar_ds.extractfile(tarinfo).read()
- elif archive_path.suffix == ".zip":
- with zipfile.ZipFile(archive_path) as zip_ds:
- if file_list is None:
- file_list = [f.filename for f in zip_ds.filelist]
- name = list(filter(regex.match, file_list))[0]
- file_str = zip_ds.read(name)
-
- elif archive_path.suffixes == [".tar", ".gz"]:
- raise TypeError(
- ".tar.gz files are too slow to read from inside the archive. Please extract them instead."
- )
- else:
- raise TypeError(
- "Only .zip and .tar files can be read from inside its archive."
- )
- except IndexError as exc:
- raise FileNotFoundError(
- f"Impossible to find file {regex} in {path.get_filename(archive_path)}"
- ) from exc
-
- return file_str
-
-
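A minimal sketch of how :code:`read_archived_file` (defined above) is meant to be called; archive path and regex are hypothetical. The regex is matched against the archive's member names and the first match is returned as bytes.

# Hypothetical archive and regex; the first matching member is returned as bytes.
xml_bytes = read_archived_file("D:/path/to/zip.zip", regex=r".*metadata\.xml")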
-def read_archived_xml(
- archive_path: AnyPathStrType, regex: str = None, file_list: list = None, **kwargs
-) -> etree._Element:
- """
- Read archived XML from :code:`zip` or :code:`tar` archives.
-
- You can use this `site `_ to build your regex.
-
- Args:
- archive_path (AnyPathStrType): Archive path
- regex (str): XML regex (used by re) as it can be found in the getmembers() list
- file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
-
- Returns:
- etree._Element: XML file
-
- Example:
- >>> arch_path = 'D:/path/to/zip.zip'
- >>> file_regex = '.*dir.*file_name' # Use .* for any character
- >>> read_archived_xml(arch_path, file_regex)
-
- """
- if regex is None:
- logs.deprecation_warning(
- "'xml_regex' is deprecated, please use 'regex' instead."
- )
- regex = kwargs.pop("xml_regex")
-
- xml_bytes = read_archived_file(archive_path, regex=regex, file_list=file_list)
-
- return etree.fromstring(xml_bytes)
-
-
-def read_archived_html(
- archive_path: AnyPathStrType, regex: str, file_list: list = None
-) -> html.HtmlElement:
- """
- Read archived HTML from :code:`zip` or :code:`tar` archives.
-
- You can use this `site `_ to build your regex.
-
- Args:
- archive_path (AnyPathStrType): Archive path
- regex (str): HTML regex (used by re) as it can be found in the getmembers() list
- file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
-
- Returns:
- html.HtmlElement: HTML file
-
- Example:
- >>> arch_path = 'D:/path/to/zip.zip'
- >>> file_regex = '.*dir.*file_name' # Use .* for any character
- >>> read_archived_html(arch_path, file_regex)
-
- """
- html_bytes = read_archived_file(archive_path, regex, file_list=file_list)
-
- return html.fromstring(html_bytes)
-
-
-def archive(
- folder_path: AnyPathStrType,
- archive_path: AnyPathStrType,
- fmt: str = "zip",
-) -> AnyPathType:
- """
- Archives a folder recursively.
-
- Args:
- folder_path (AnyPathStrType): Folder to archive
- archive_path (AnyPathStrType): Archive path, with or without extension
- fmt (str): Format of the archive, used by :code:`shutil.make_archive`. Choose between [zip, tar, gztar, bztar, xztar]
-
- Returns:
- AnyPathType: Archive path
-
- Example:
- >>> folder_path = 'D:/path/to/folder_to_archive'
- >>> archive_path = 'D:/path/to/output'
- >>> archive = archive(folder_path, archive_path, fmt="gztar")
- 'D:/path/to/output/folder_to_archive.tar.gz'
- """
- archive_path = AnyPath(archive_path)
- folder_path = AnyPath(folder_path)
-
- tmp_dir = None
- if path.is_cloud_path(folder_path):
- tmp_dir = tempfile.TemporaryDirectory()
- folder_path = folder_path.download_to(tmp_dir.name)
-
- # Shutil make_archive needs a path without extension
- archive_base = os.path.splitext(archive_path)[0]
-
- # Archive the folder
- archive_fn = shutil.make_archive(
- archive_base,
- format=fmt,
- root_dir=folder_path.parent,
- base_dir=folder_path.name,
- )
-
- if tmp_dir is not None:
- tmp_dir.cleanup()
-
- return AnyPath(archive_fn)
-
-
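For reference, a sketch of the :code:`shutil.make_archive` pattern that :code:`archive` relies on (paths hypothetical): the base name is given without extension, and :code:`root_dir`/:code:`base_dir` keep the folder itself as the root entry of the archive.

# Hypothetical paths; 'gztar' appends '.tar.gz' to the base name.
import shutil

archive_fn = shutil.make_archive(
    "/tmp/out/folder_to_archive",  # base name, without extension
    format="gztar",
    root_dir="/tmp/data",  # parent of the folder to archive
    base_dir="folder_to_archive",  # the folder, kept inside the archive
)
# -> '/tmp/out/folder_to_archive.tar.gz'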
-def add_to_zip(
- zip_path: AnyPathStrType,
- dirs_to_add: Union[list, AnyPathStrType],
-) -> AnyPathType:
- """
- Add folders to an already existing zip file (recursively).
-
- Args:
- zip_path (AnyPathStrType): Already existing zip file
- dirs_to_add (Union[list, AnyPathStrType]): Directories to add
-
- Returns:
- AnyPathType: Updated zip_path
-
- Example:
- >>> zip_path = 'D:/path/to/zip.zip'
- >>> dirs_to_add = ['D:/path/to/dir1', 'D:/path/to/dir2']
- >>> add_to_zip(zip_path, dirs_to_add)
- >>> # zip.zip now contains 2 more folders: dir1 and dir2
- """
- zip_path = AnyPath(zip_path)
-
- # If the zip is on the cloud, cache it (zipfile doesn't like cloud paths)
- if path.is_cloud_path(zip_path):
- zip_path = AnyPath(zip_path.fspath)
-
- # Check if existing zipfile
- if not zip_path.is_file():
- raise FileNotFoundError(f"Non existing {zip_path}")
-
- # Convert to list if needed
- if not isinstance(dirs_to_add, list):
- dirs_to_add = [dirs_to_add]
-
- # Add all folders to the existing zip
- # Forced to use ZipFile because make_archive only works with one folder and not existing zipfile
- with zipfile.ZipFile(zip_path, "a") as zip_file:
- progress_bar = tqdm(dirs_to_add)
- for dir_to_add_path in progress_bar:
- # Just to be sure, use str instead of Paths
- if isinstance(dir_to_add_path, Path):
- dir_to_add = str(dir_to_add_path)
- elif path.is_cloud_path(dir_to_add_path):
- dir_to_add = dir_to_add_path.fspath
- else:
- dir_to_add = dir_to_add_path
-
- progress_bar.set_description(
- f"Adding {os.path.basename(dir_to_add)} to {os.path.basename(zip_path)}"
- )
- tmp = tempfile.TemporaryDirectory()
- if os.path.isfile(dir_to_add):
- dir_to_add = extract_file(dir_to_add, tmp.name)
-
- for root, _, files in os.walk(dir_to_add):
- base_path = os.path.join(dir_to_add, "..")
-
- # Write dir (in namelist at least)
- zip_file.write(root, os.path.relpath(root, base_path))
-
- # Write files
- for file in files:
- zip_file.write(
- os.path.join(root, file),
- os.path.relpath(
- os.path.join(root, file), os.path.join(dir_to_add, "..")
- ),
- )
-
- # Clean tmp
- tmp.cleanup()
-
- return zip_path
-
-
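A standalone sketch of the append pattern used by :code:`add_to_zip` above: :code:`zipfile.ZipFile` in :code:`"a"` mode, with arcnames made relative to the folder's parent so the folder name is preserved inside the zip (paths hypothetical).

import os
import zipfile

def append_dir(zip_path, dir_to_add):
    # Arcnames are relative to the parent, so 'dir_to_add' stays a root entry.
    base = os.path.join(dir_to_add, "..")
    with zipfile.ZipFile(zip_path, "a") as zf:
        for root, _, fnames in os.walk(dir_to_add):
            zf.write(root, os.path.relpath(root, base))
            for fname in fnames:
                full = os.path.join(root, fname)
                zf.write(full, os.path.relpath(full, base))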
-def get_filename(file_path: AnyPathStrType, other_exts: Union[list, str] = None) -> str:
- """
- .. deprecated:: 1.30.0
- Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
- Get file name (without extension) from file path, i.e.:
-
- Args:
- file_path (AnyPathStrType): Absolute or relative file path (the file doesn't need to exist)
- other_exts (Union[list, str]): Other double extensions to discard
-
- Returns:
- str: File name (without extension)
-
- Example:
- >>> file_path = 'D:/path/to/filename.zip'
- >>> get_filename(file_path)
- 'filename'
- """
- logs.deprecation_warning(
- "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
- )
- return path.get_filename(file_path, other_exts)
-
-
-def get_ext(file_path: AnyPathStrType) -> str:
- """
- .. deprecated:: 1.30.0
- Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
- Get file extension from file path.
-
- .. WARNING::
- Extension is given WITHOUT THE LEADING DOT
-
- Args:
- file_path (AnyPathStrType): Absolute or relative file path (the file doesn't need to exist)
-
- Returns:
- str: File extension (without the leading dot)
-
- Example:
- >>> file_path = 'D:/path/to/filename.zip'
- >>> get_ext(file_path)
- 'zip'
- """
- logs.deprecation_warning(
- "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
- )
- return path.get_ext(file_path)
-
-
def remove(path: AnyPathStrType) -> None:
"""
Deletes a file or a directory (recursively) using :code:`shutil.rmtree` or :code:`os.remove`.
@@ -754,7 +126,7 @@ def copy(src: AnyPathStrType, dst: AnyPathStrType) -> AnyPathType:
src = AnyPath(src)
if path.is_cloud_path(src):
- out = src.download_to(dst)
+ out = s3.download(src, dst)
else:
out = None
try:
@@ -772,54 +144,6 @@ def copy(src: AnyPathStrType, dst: AnyPathStrType) -> AnyPathType:
return out
-def find_files(
- names: Union[list, str],
- root_paths: Union[list, AnyPathStrType],
- max_nof_files: int = -1,
- get_as_str: bool = False,
-) -> Union[list, str]:
- """
- .. deprecated:: 1.30.0
- Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
- Returns matching files recursively from a list of root paths.
-
- Regexes are allowed (using glob's convention)
-
- Args:
- names (Union[list, str]): File names.
- root_paths (Union[list, str]): Root paths
- max_nof_files (int): Maximum number of files (set to -1 for unlimited)
- get_as_str (bool): if only one file is found, it can be retrieved as a string instead of a list
-
- Returns:
- list: Matching file paths
-
- Examples:
- >>> root_path = 'D:/root'
- >>> dir1_path = 'D:/root/dir1'
- >>> dir2_path = 'D:/root/dir2'
- >>>
- >>> os.listdir(dir1_path)
- ["haha.txt", "huhu.txt", "hoho.txt"]
- >>> os.listdir(dir2_path)
- ["huhu.txt", "hehe.txt"]
- >>>
- >>> find_files("huhu.txt", root_path)
- ['D:/root/dir1/huhu.txt', 'D:/root/dir2/huhu.txt']
- >>>
- >>> find_files("huhu.txt", root_path, max_nof_files=1)
- ['D:/root/dir1/huhu.txt']
-
- >>> find_files("huhu.txt", root_path, max_nof_files=1, get_as_str=True)
- 'D:/root/dir1/huhu.txt'
- """
- logs.deprecation_warning(
- "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
- )
- return path.find_files(names, root_paths, max_nof_files, get_as_str)
-
-
# subclass JSONDecoder
class CustomDecoder(JSONDecoder):
"""Decoder for JSON with methods for datetimes"""
@@ -927,15 +251,6 @@ def save_json(json_dict: dict, output_json: AnyPathStrType, **kwargs) -> None:
>>> json_dict = {"A": np.int64(1), "B": datetime.today(), "C": SomeEnum.some_name}
>>> save_json(json_dict, output_json)
"""
- if isinstance(output_json, dict):
- # Old order. Swap the variables.
- logs.deprecation_warning(
- "The order of the function has changed. Please set json_dict in first!"
- )
- tmp = output_json
- output_json = json_dict
- json_dict = tmp
-
kwargs["indent"] = kwargs.get("indent", 3)
kwargs["cls"] = kwargs.get("cls", CustomEncoder)
@@ -982,66 +297,6 @@ def load_obj(path: AnyPathStrType) -> Any:
return dill.load(file)
-# too many arguments
-# pylint: disable=R0913
-def get_file_in_dir(
- directory: AnyPathStrType,
- pattern_str: str,
- extension: str = None,
- filename_only: bool = False,
- get_list: bool = False,
- exact_name: bool = False,
-) -> Union[AnyPathType, list]:
- """
- .. deprecated:: 1.30.0
- Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
- Get one or all matching files (pattern + extension) from inside a directory.
-
- Note that the pattern is a regex with glob's convention, i.e. :code:`*pattern*`.
-
- If :code:`exact_name` is :code:`False`, the searched pattern will be :code:`*{pattern}*.{extension}`,
- else :code:`{pattern}.{extension}`.
-
- Args:
- directory (str): Directory where to find the files
- pattern_str (str): Pattern wanted as a string, with glob's convention.
- extension (str): Extension wanted, optional. With or without point. (:code:`yaml` or :code:`.yaml` accepted)
- filename_only (bool): Get only the filename
- get_list (bool): Get the whole list of matching files
- exact_name (bool): Get the exact name (without adding :code:`*` before and after the given pattern)
-
- Returns:
- Union[AnyPathType, list]: Matching file path(s)
-
- Example:
- >>> directory = 'D:/path/to/dir'
- >>> os.listdir(directory)
- ["haha.txt", "huhu1.txt", "huhu1.geojson", "hoho.txt"]
- >>>
- >>> get_file_in_dir(directory, "huhu")
- 'D:/path/to/dir/huhu1.geojson'
- >>>
- >>> get_file_in_dir(directory, "huhu", extension="txt")
- 'D:/path/to/dir/huhu1.txt'
- >>>
- >>> get_file_in_dir(directory, "huhu", get_list=True)
- ['D:/path/to/dir/huhu1.txt', 'D:/path/to/dir/huhu1.geojson']
- >>>
- >>> get_file_in_dir(directory, "huhu", filename_only=True, get_list=True)
- ['huhu1.txt', 'huhu1.geojson']
- >>>
- >>> get_file_in_dir(directory, "huhu", get_list=True, exact_name=True)
- []
- """
- logs.deprecation_warning(
- "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
- )
- return path.get_file_in_dir(
- directory, pattern_str, extension, filename_only, get_list, exact_name
- )
-
-
# pylint: disable=E1121
def hash_file_content(file_content: str, len_param: int = 5) -> str:
"""
@@ -1064,22 +319,3 @@ def hash_file_content(file_content: str, len_param: int = 5) -> str:
hasher = hashlib.shake_256()
hasher.update(str.encode(file_content))
return hasher.hexdigest(len_param)
-
-
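A quick sketch of the :code:`shake_256` behaviour relied on above: :code:`hexdigest(n)` returns :code:`2*n` hex characters, so the default :code:`len_param=5` yields a 10-character hash.

import hashlib

hasher = hashlib.shake_256()
hasher.update(b"some file content")
assert len(hasher.hexdigest(5)) == 10  # 5 bytes -> 10 hex characters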
-def is_writable(dir_path: AnyPathStrType):
- """
- .. deprecated:: 1.30.0
- Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
- Determine whether the directory is writable or not.
-
- Args:
- dir_path (AnyPathStrType): Directory path
-
- Returns:
- bool: True if the directory is writable
- """
- logs.deprecation_warning(
- "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
- )
- return path.is_writable(dir_path)
diff --git a/sertit/path.py b/sertit/path.py
index aeb1f12..504c526 100644
--- a/sertit/path.py
+++ b/sertit/path.py
@@ -15,17 +15,15 @@
# limitations under the License.
"""Tools for paths"""
+import contextlib
import errno
import logging
import os
import pprint
-import re
-import tarfile
import tempfile
-import zipfile
from typing import Any, Union
-from sertit import AnyPath, logs
+from sertit import AnyPath
from sertit.logs import SU_NAME
from sertit.types import AnyPathStrType, AnyPathType
@@ -150,181 +148,6 @@ def real_rel_path(raw_path: AnyPathStrType, start: AnyPathStrType) -> AnyPathTyp
return rel_path
-def get_archived_file_list(archive_path: AnyPathStrType) -> list:
- """
- Get the list of all the files contained in an archive.
-
- Args:
- archive_path (AnyPathStrType): Archive path
-
- Returns:
- list: All files contained in the given archive
-
- Example:
- >>> arch_path = 'D:/path/to/zip.zip'
- >>> get_archived_file_list(arch_path)
- ['file_1.txt', 'file_2.tif', 'file_3.xml', 'file_4.geojson']
- """
- archive_path = AnyPath(archive_path)
- if archive_path.suffix == ".zip":
- with zipfile.ZipFile(archive_path) as zip_ds:
- file_list = [f.filename for f in zip_ds.filelist]
- else:
- try:
- with tarfile.open(archive_path) as tar_ds:
- tar_mb = tar_ds.getmembers()
- file_list = [mb.name for mb in tar_mb]
- except tarfile.ReadError as ex:
- raise tarfile.ReadError(
- f"Impossible to open archive: {archive_path}"
- ) from ex
-
- return file_list
-
-
-def get_archived_path(
- archive_path: AnyPathStrType,
- regex: str,
- as_list: bool = False,
- case_sensitive: bool = False,
- file_list: list = None,
- **kwargs,
-) -> Union[list, AnyPathType]:
- """
- Get archived file path from inside the archive.
-
- .. WARNING::
- If :code:`as_list` is :code:`False`, it will only return the first file matched!
-
- You can use this `site `_ to build your regex.
-
- Args:
- archive_path (AnyPathStrType): Archive path
- regex (str): File regex (used by re) as it can be found in the getmembers() list
- as_list (bool): If true, returns a list (including all found files). If false, returns only the first match
- case_sensitive (bool): If true, the regex is case-sensitive.
- file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
-
- Returns:
- Union[list, str]: Path from inside the zipfile
-
- Example:
- >>> arch_path = 'D:/path/to/zip.zip'
- >>> file_regex = '.*dir.*file_name' # Use .* for any character
- >>> path = get_archived_path(arch_path, file_regex)
- 'dir/filename.tif'
- """
- if regex is None:
- logs.deprecation_warning(
- "'file_regex' is deprecated, please use 'regex' instead."
- )
- regex = kwargs.pop("file_regex")
-
- # Get file list
- archive_path = AnyPath(archive_path)
-
- # Offer the ability to give the file list directly, as this operation is expensive when done with large archives stored on the cloud
- if file_list is None:
- file_list = get_archived_file_list(archive_path)
-
- # Search for file
- re_rgx = re.compile(regex) if case_sensitive else re.compile(regex, re.IGNORECASE)
- archived_band_paths = list(filter(re_rgx.match, file_list))
- if not archived_band_paths:
- raise FileNotFoundError(
- f"Impossible to find file {regex} in {get_filename(archive_path)}"
- )
-
- # Convert to str if needed
- if not as_list:
- archived_band_paths = archived_band_paths[0]
-
- return archived_band_paths
-
-
-def get_archived_rio_path(
- archive_path: AnyPathStrType,
- regex: str,
- as_list: bool = False,
- file_list: list = None,
- **kwargs,
-) -> Union[list, AnyPathType]:
- """
- Get archived file path from inside the archive, to be read with rasterio:
-
- - :code:`zip+file://{zip_path}!{file_name}`
- - :code:`tar+file://{tar_path}!{file_name}`
-
-
- See `here `_
- for more information.
-
- .. WARNING::
- It won't be readable by pandas, geopandas or xmltree!
-
- .. WARNING::
- If :code:`as_list` is :code:`False`, it will only return the first file matched!
-
- You can use this `site `_ to build your regex.
-
- Args:
- archive_path (AnyPathStrType): Archive path
- regex (str): File regex (used by re) as it can be found in the getmembers() list
- as_list (bool): If true, returns a list (including all found files). If false, returns only the first match
- file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
-
- Returns:
- Union[list, str]: Band path that can be read by rasterio
-
- Example:
- >>> arch_path = 'D:/path/to/zip.zip'
- >>> file_regex = '.*dir.*file_name' # Use .* for any character
- >>> path = get_archived_rio_path(arch_path, file_regex)
- 'zip+file://D:/path/to/output.zip!dir/filename.tif'
- >>> rasterio.open(path)
-
- """
- if regex is None:
- logs.deprecation_warning(
- "'file_regex' is deprecated, please use 'regex' instead."
- )
- regex = kwargs.pop("file_regex")
-
- archive_path = AnyPath(archive_path)
- if archive_path.suffix in [".tar", ".zip"]:
- prefix = archive_path.suffix[-3:]
- elif archive_path.suffixes == [".tar", ".gz"]:
- raise TypeError(
- ".tar.gz files are too slow to be read from inside the archive. Please extract them instead."
- )
- else:
- raise TypeError("Only .zip and .tar files can be read from inside its archive.")
-
- # Search for file
- archived_band_paths = get_archived_path(
- archive_path, regex=regex, as_list=True, file_list=file_list
- )
-
- # Convert to rio path
- if is_cloud_path(archive_path):
- archived_band_paths = [
- f"{prefix}+file+{archive_path}!{path}" for path in archived_band_paths
- ]
- else:
- # archived_band_paths = [
- # f"{prefix}+file://{archive_path}!{path}" for path in archived_band_paths
- # ]
- archived_band_paths = [
- f"/vsi{prefix}/{archive_path}/{path}" for path in archived_band_paths
- ]
-
- # Convert to str if needed
- if not as_list:
- archived_band_paths = archived_band_paths[0]
-
- return archived_band_paths
-
-
def get_filename(file_path: AnyPathStrType, other_exts: Union[list, str] = None) -> str:
"""
Get file name (without extension) from file path, i.e.:
@@ -589,25 +412,46 @@ def is_cloud_path(path: AnyPathStrType):
bool: True if the file is stored on the cloud.
"""
try:
- from cloudpathlib import CloudPath
+ return AnyPath(path).protocol in [
+ "s3",
+ "az",
+ "adl",
+ "abfs",
+ "abfss",
+ "gs",
+ "gcs",
+ ]
+ except AttributeError:
+ try:
+ from cloudpathlib import CloudPath
- return isinstance(AnyPath(path), CloudPath)
- except Exception:
- return False
+ return isinstance(AnyPath(path), CloudPath)
+ except Exception:
+ return False
def is_path(path: Any) -> bool:
"""
- Determine whether the path corresponds to a file stored on the cloud or not.
+ Determine whether the given object really is a path: either str, Path, UPath or CloudPath.
Args:
path (Any): Object to check
Returns:
- bool: True if the file is store on the cloud.
+ bool: True if the object is a path
"""
from pathlib import Path
- from cloudpathlib import CloudPath
+ is_path = isinstance(path, (str, Path))
+
+ with contextlib.suppress(ImportError):
+ from upath import UPath
+
+ is_path = is_path or isinstance(path, UPath)
+
+ with contextlib.suppress(ImportError):
+ from cloudpathlib import CloudPath
+
+ is_path = is_path or isinstance(path, CloudPath)
- return isinstance(path, (str, Path, CloudPath))
+ return is_path
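A small sketch of the protocol attribute the rewritten :code:`is_cloud_path` inspects first (requires :code:`universal_pathlib`, plus :code:`s3fs` for the s3 protocol; bucket hypothetical):

from upath import UPath

cloud = UPath("s3://bucket/key.tif")  # hypothetical bucket
print(cloud.protocol)  # 's3' -> flagged as a cloud path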
diff --git a/sertit/rasters.py b/sertit/rasters.py
index a835a66..52ca67c 100644
--- a/sertit/rasters.py
+++ b/sertit/rasters.py
@@ -33,7 +33,7 @@
try:
import rasterio
import rioxarray
- from rasterio import MemoryFile, features
+ from rasterio import features
from rasterio.enums import Resampling
from rioxarray.exceptions import MissingCRS
except ModuleNotFoundError as ex:
@@ -126,25 +126,6 @@ def get_nodata_value_from_dtype(dtype) -> float:
return rasters_rio.get_nodata_value_from_dtype(dtype)
-def get_nodata_value(dtype) -> float:
- """
- .. deprecated:: 1.41.0
- Use :code:`get_nodata_value_from_dtype` instead.
-
- Get default nodata value:
-
- Args:
- dtype: Dtype for the wanted nodata. Best if numpy's dtype.
-
- Returns:
- float: Nodata value
- """
- logs.deprecation_warning(
- "This function is deprecated. Use 'get_nodata_value_from_dtype' instead."
- )
- return get_nodata_value_from_dtype(dtype)
-
-
def any_raster_to_xr_ds(function: Callable) -> Callable:
"""
Allows a function to ingest AnyRasterType and convert it into a xr.DataArray:
@@ -191,8 +172,8 @@ def wrapper(any_raster_type: AnyRasterType, *args, **kwargs) -> Any:
if any_raster_type is None:
raise ValueError("'any_raster_type' shouldn't be None!")
- default_chunks = True if dask.get_client() is not None else None
-
+ default_chunks = "auto" if dask.get_client() is not None else None
+ masked = kwargs.get("masked", True)
# By default, try with the input fct
try:
out = function(any_raster_type, *args, **kwargs)
@@ -216,67 +197,17 @@ def wrapper(any_raster_type: AnyRasterType, *args, **kwargs) -> Any:
except Exception as ex:
raise TypeError("Function not available for xarray.Dataset") from ex
- elif isinstance(any_raster_type, tuple):
- arr, meta = any_raster_type
- with (
- MemoryFile() as memfile,
- memfile.open(
- **meta, BIGTIFF=rasters_rio.bigtiff_value(any_raster_type)
- ) as ds,
- ):
- ds.write(arr.data)
-
- with rioxarray.open_rasterio(
- any_raster_type,
- masked=True,
- default_name=ds.name,
- chunks=kwargs.pop("chunks", default_chunks),
- ) as xds:
- out = function(xds, *args, **kwargs)
else:
- # Get the path from the input
- if path.is_path(any_raster_type):
- name = str(any_raster_type)
- any_raster_type = str(any_raster_type)
- else:
- # For rasterio datasets, '.name' gives the path
- name = any_raster_type.name
-
- # Convert path or rasterio.dataset to xr.dataset
- with rioxarray.open_rasterio(
- any_raster_type,
- masked=True,
- default_name=name,
- chunks=kwargs.pop("chunks", default_chunks),
- ) as xds:
- out = function(xds, *args, **kwargs)
-
+ out = function(
+ read(any_raster_type, chunks=default_chunks, masked=masked),
+ *args,
+ **kwargs,
+ )
return out
return wrapper
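A hedged sketch of the decorator's contract after this rewrite: the wrapped function always receives an :code:`xr.DataArray`/:code:`xr.Dataset`, whatever raster-like input is passed (path shown is hypothetical).

from sertit.rasters import any_raster_to_xr_ds

@any_raster_to_xr_ds
def band_count(xds):
    # 'xds' is guaranteed to be an xarray object here.
    return xds.rio.count

# band_count("D:/path/to/raster.tif")  # str/Path/cloud path all accepted
# band_count(rasterio_dataset)         # rasterio datasets too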
-def path_xarr_dst(function: Callable) -> Callable:
- """
- .. deprecated:: 1.40.0
- Use :py:func:`rasters.any_raster_to_xr_ds` instead.
- """
- logs.deprecation_warning(
- "Deprecated 'path_xarr_dst' decorator. Please use 'any_raster_to_xr_ds' instead."
- )
- return any_raster_to_xr_ds(function)
-
-
-@any_raster_to_xr_ds
-def get_nodata_mask(xds: AnyXrDataStructure) -> np.ndarray:
- """
- .. deprecated:: 1.36.0
- Use :py:func:`rasters.get_data_mask` instead.
- """
- logs.deprecation_warning("This function is deprecated. Use 'get_data_mask' instead")
- return get_data_mask(xds)
-
-
@any_raster_to_xr_ds
def get_data_mask(xds: AnyXrDataStructure) -> np.ndarray:
"""
@@ -988,13 +919,19 @@ def read(
rioxarray.set_options(export_grid_mapping=False),
rioxarray.open_rasterio(
ds,
- lock=False,
default_name=path.get_filename(ds.name),
chunks=chunks,
+ masked=masked,
**kwargs,
) as xda,
):
- orig_dtype = xda.dtype
+ orig_dtype = xda.encoding.get(
+ "rasterio_dtype", xda.encoding.get("dtype", xda.dtype)
+ )
+
+ if isinstance(orig_dtype, str):
+ with contextlib.suppress(AttributeError):
+ orig_dtype = getattr(np, orig_dtype)
# Windows
if window is not None:
@@ -1104,12 +1041,6 @@ def write(
>>> # Rewrite it
>>> write(xds, raster_out)
"""
- if output_path is None:
- logs.deprecation_warning(
- "'path' is deprecated in 'rasters.write'. Use 'output_path' instead."
- )
- output_path = kwargs.pop("path")
-
# Prune empty kwargs to avoid throwing GDAL warnings/errors
kwargs = {k: v for k, v in kwargs.items() if v is not None}
@@ -1403,14 +1334,15 @@ def sieve(
assert connectivity in [4, 8]
- # Use this trick to make the sieve work
- mask = np.where(np.isnan(xds.data), 0, 1).astype(np.uint8)
- data = xds.data.astype(np.uint8)
+ mask = xr.where(np.isnan(xds), 0, 1).astype(np.uint8).data
+ data = xds.astype(np.uint8).data
# Sieve
try:
sieved_arr = xr.apply_ufunc(
- features.sieve, data, sieve_thresh, connectivity, mask
+ features.sieve,
+ data,
+ kwargs={"size": sieve_thresh, "connectivity": connectivity, "mask": mask},
)
except ValueError:
sieved_arr = features.sieve(
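A toy, self-contained sketch of the :code:`xr.apply_ufunc` pattern adopted above: extra arguments are passed through :code:`kwargs=` rather than as positional xarray inputs.

import numpy as np
import xarray as xr

def clip(arr, lo=0.0, hi=1.0):
    return np.clip(arr, lo, hi)

xda = xr.DataArray(np.linspace(-1.0, 2.0, 6).reshape(2, 3))
clipped = xr.apply_ufunc(clip, xda, kwargs={"lo": 0.0, "hi": 1.0})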
diff --git a/sertit/rasters_rio.py b/sertit/rasters_rio.py
index bc0fcdf..9663590 100644
--- a/sertit/rasters_rio.py
+++ b/sertit/rasters_rio.py
@@ -43,7 +43,7 @@
"Please install 'rasterio' to use the 'rasters_rio' package."
) from ex
-from sertit import AnyPath, geometry, logs, misc, path, strings, vectors, xml
+from sertit import AnyPath, geometry, misc, path, s3, strings, vectors, xml
from sertit.logs import SU_NAME
from sertit.types import AnyNumpyArray, AnyPathStrType, AnyPathType, AnyRasterType
@@ -112,25 +112,6 @@ def get_nodata_value_from_dtype(dtype) -> float:
return nodata
-def get_nodata_value(dtype) -> float:
- """
- .. deprecated:: 1.41.0
- Use :code:`get_nodata_value_from_dtype` instead.
-
- Get default nodata value:
-
- Args:
- dtype: Dtype for the wanted nodata. Best if numpy's dtype.
-
- Returns:
- float: Nodata value
- """
- logs.deprecation_warning(
- "This function is deprecated. Use 'get_nodata_value_from_dtype' instead."
- )
- return get_nodata_value_from_dtype(dtype)
-
-
def bigtiff_value(arr: Any) -> str:
"""
Returns :code:`YES` if array is larger than 4 GB, :code:`IF_NEEDED` otherwise.
@@ -250,17 +231,6 @@ def wrapper(any_raster_type: AnyRasterType, *args, **kwargs) -> Any:
return wrapper
-def path_arr_dst(function: Callable) -> Callable:
- """
- .. deprecated:: 1.40.0
- Use :py:func:`rasters.any_raster_to_rio_ds` instead.
- """
- logs.deprecation_warning(
- "Deprecated 'path_arr_dst' decorator. Please use 'any_raster_to_rio_ds' instead."
- )
- return any_raster_to_rio_ds(function)
-
-
@any_raster_to_rio_ds
def get_new_shape(
ds: AnyRasterType,
@@ -424,19 +394,6 @@ def update_meta(arr: AnyNumpyArray, meta: dict) -> dict:
return out_meta
-def get_nodata_mask(
- array: AnyNumpyArray,
- has_nodata: bool,
- default_nodata: int = 0,
-) -> np.ndarray:
- """
- .. deprecated:: 1.36.0
- Use :py:func:`rasters_rio.get_data_mask` instead.
- """
- logs.deprecation_warning("This function is deprecated. Use 'get_data_mask' instead")
- return get_data_mask(array, has_nodata, default_nodata)
-
-
def get_data_mask(
array: AnyNumpyArray,
has_nodata: bool,
@@ -1090,12 +1047,6 @@ def write(
>>> # Rewrite it on disk
>>> write(raster, meta, raster_out)
"""
- if output_path is None:
- logs.deprecation_warning(
- "'path' is deprecated in 'rasters_rio.write'. Use 'output_path' instead."
- )
- output_path = kwargs.pop("path")
-
raster_out = raster.copy()
# Prune empty kwargs to avoid throwing GDAL warnings/errors
@@ -1427,7 +1378,7 @@ def merge_vrt(
crs_path = AnyPath(crs_path)
# Download file if VRT is needed
if path.is_cloud_path(crs_path):
- crs_path = crs_path.download_to(merged_path.parent)
+ crs_path = s3.download(crs_path, merged_path.parent)
with rasterio.open(str(crs_path)) as src:
if first_crs is None:
diff --git a/sertit/s3.py b/sertit/s3.py
index 37588e9..bc14535 100644
--- a/sertit/s3.py
+++ b/sertit/s3.py
@@ -17,13 +17,16 @@
S3 tools
"""
+import contextlib
import logging
import os
from contextlib import contextmanager
from functools import wraps
+from io import BytesIO
from cloudpathlib import S3Client
+from sertit import AnyPath, path
from sertit.logs import SU_NAME
LOGGER = logging.getLogger(SU_NAME)
@@ -271,3 +274,55 @@ def define_s3_client(
client = S3Client(**args_s3_client)
client.set_as_default_client()
+
+
+def download(src, dst):
+ """Download a file or a folder from the cloud to a local destination and return the local path (the source is returned as-is if it is already local)."""
+ # By default, use the src path
+ downloaded_path = src
+
+ # Universal pathlib
+ if path.is_cloud_path(src):
+ import shutil
+
+ with contextlib.suppress(ImportError):
+ from upath import UPath
+
+ if isinstance(src, UPath):
+ dst = AnyPath(dst)
+ if dst.is_dir() and src.name != dst.name:
+ downloaded_path = dst / src.name
+ else:
+ downloaded_path = dst
+
+ if src.is_file():
+ with src.open("rb") as f0, downloaded_path.open("wb") as f1:
+ shutil.copyfileobj(f0, f1)
+ else:
+ downloaded_path.parent.mkdir(parents=True, exist_ok=True)
+
+ for f in src.glob("**"):
+ dst_file = downloaded_path / f.relative_to(src)  # keep the folder structure
+ if f.is_file():
+ dst_file.parent.mkdir(parents=True, exist_ok=True)
+ with f.open("rb") as f0, dst_file.open("wb") as f1:
+ shutil.copyfileobj(f0, f1)
+
+ # cloudpathlib
+ with contextlib.suppress(ImportError):
+ from cloudpathlib import CloudPath
+
+ if isinstance(src, CloudPath):
+ downloaded_path = src.fspath if dst is None else src.download_to(dst)
+
+ return downloaded_path
+
+
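A hedged usage sketch of :code:`download` (bucket and key hypothetical; S3 credentials must already be configured, e.g. via :code:`define_s3_client`):

from sertit import AnyPath, s3

local = s3.download(AnyPath("s3://bucket/data.tif"), "/tmp/data")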
+def read(src):
+ """Read a (possibly cloud-stored) file into a seekable BytesIO buffer."""
+ src = AnyPath(src)
+ try:
+ b = src.read_bytes()
+ except Exception:
+ with src.open("rb") as f:
+ b = f.read()
+
+ return BytesIO(b)
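Usage sketch of :code:`read` (key hypothetical): the returned :code:`BytesIO` is a seekable binary file object, usable wherever one is expected.

from sertit import s3

buf = s3.read("s3://bucket/metadata.xml")
header = buf.read(10)  # behaves like an open('rb') handle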
diff --git a/sertit/types.py b/sertit/types.py
index 59c99fe..3cea7a9 100644
--- a/sertit/types.py
+++ b/sertit/types.py
@@ -5,14 +5,23 @@
import geopandas as gpd
import numpy as np
import xarray as xr
-from cloudpathlib import CloudPath
from rasterio.io import DatasetReader, DatasetWriter
from shapely import MultiPolygon, Polygon
-AnyPathType = Union[CloudPath, Path]
-"""Any Path Type (derived from Pathlib and CloudpathLib)"""
+try:
+ from upath import UPath
+except ImportError:
+ UPath = None
-AnyPathStrType = Union[str, CloudPath, Path]
+try:
+ from cloudpathlib import CloudPath
+except ImportError:
+ CloudPath = None
+
+AnyPathType = Union[CloudPath, Path, UPath]
+"""Any Path Type (derived from Pathlib, Universal Pathlib and CloudpathLib)"""
+
+AnyPathStrType = Union[str, AnyPathType]
"""Same as :code:`AnyPathType` but appened with :code:`str`"""
AnyXrDataStructure = Union[xr.DataArray, xr.Dataset]
diff --git a/sertit/vectors.py b/sertit/vectors.py
index 15074e8..42aa728 100644
--- a/sertit/vectors.py
+++ b/sertit/vectors.py
@@ -23,9 +23,7 @@
import os
import re
import shutil
-import tarfile
import tempfile
-import zipfile
from collections.abc import Generator
from contextlib import contextmanager
from typing import Any, Union
@@ -36,7 +34,7 @@
from cloudpathlib.exceptions import AnyPathTypeError
from shapely import Polygon, wkt
-from sertit import AnyPath, files, geometry, logs, misc, path, strings
+from sertit import AnyPath, archives, files, geometry, misc, path, s3, strings
from sertit.logs import SU_NAME
from sertit.types import AnyPathStrType, AnyPathType
@@ -80,9 +78,6 @@ def is_geopandas_1_0():
def to_utm_crs(lon: float, lat: float) -> "CRS": # noqa: F821
"""
- .. deprecated:: 1.29.1
- Use `estimate_utm_crs `_ instead, which directly returns a CRS instead of a string.
-
Find the EPSG code of the UTM CRS from a lon/lat in WGS84.
Args:
@@ -118,43 +113,6 @@ def to_utm_crs(lon: float, lat: float) -> "CRS": # noqa: F821
return gpd.GeoDataFrame(geometry=point, crs=EPSG_4326).estimate_utm_crs()
-def corresponding_utm_projection(lon: float, lat: float) -> str:
- """
- .. deprecated:: 1.29.1
- Use `estimate_utm_crs `_ instead, which directly returns a CRS instead of a string.
-
- Find the EPSG code of the UTM CRS from a lon/lat in WGS84.
-
- Args:
- lon (float): Longitude (WGS84, epsg:4326)
- lat (float): Latitude (WGS84, epsg:4326)
-
- Returns:
- str: UTM CRS as an EPSG string
-
- Example:
- >>> corresponding_utm_projection(lon=7.8, lat=48.6) # Strasbourg
- 'EPSG:32632'
-
- """
- logs.deprecation_warning(
- "Deprecated, use 'to_utm_crs' instead, which directly returs a CRS instead of a string."
- )
- return to_utm_crs(lon, lat).to_string()
-
-
def get_geodf(geom: Union[Polygon, list, gpd.GeoSeries], crs: str) -> gpd.GeoDataFrame:
"""
Get a GeoDataFrame from a geometry and a crs
@@ -256,8 +214,11 @@ def get_aoi_wkt(aoi_path: AnyPathStrType, as_str: bool = True) -> Union[str, Pol
if aoi_path.suffix == ".wkt":
try:
- with open(aoi_path) as aoi_f:
- aoi = wkt.load(aoi_f)
+ if path.is_cloud_path(aoi_path):
+ aoi = wkt.load(s3.read(aoi_path))
+ else:
+ with open(aoi_path) as aoi_f:
+ aoi = wkt.load(aoi_f)
except Exception as ex:
raise ValueError("AOI WKT cannot be read") from ex
else:
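This works because shapely's :code:`wkt.load` accepts any readable file object, the :code:`BytesIO` returned by :code:`s3.read` included. A local, self-contained sketch:

from io import StringIO

from shapely import wkt

aoi = wkt.load(StringIO("POINT (7.8 48.6)"))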
@@ -472,13 +433,17 @@ def read(
if "!" in str(vector_path):
split_vect = str(vector_path).split("!")
archive_regex = ".*{}".format(split_vect[1].replace(".", r"\."))
- vector_path = AnyPath(split_vect[0])
+ try:
+ vector_path = AnyPath(split_vect[0], **vector_path.storage_options)
+ except AttributeError:
+ # Cloudpathlib
+ vector_path = AnyPath(split_vect[0])
# Manage archive case
if vector_path.suffix in [".tar", ".zip"]:
prefix = vector_path.suffix[-3:]
file_list = kwargs.pop(
- "file_list", path.get_archived_file_list(vector_path)
+ "file_list", archives.get_archived_file_list(vector_path)
)
try:
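A hedged sketch of the :code:`<archive>!<member>` convention handled above (paths hypothetical): :code:`vectors.read` splits on :code:`!` and reads the member directly from the zip/tar archive.

from sertit import vectors

gdf = vectors.read("D:/path/to/arch.zip!aoi.shp")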
@@ -715,16 +680,16 @@ def ogr2geojson(
# archived vector_path are extracted in a tmp folder so no need to be downloaded
if vector_path.suffix == ".zip":
- with zipfile.ZipFile(vector_path, "r") as zip_ds:
+ with archives.open_zipfile(vector_path, "r") as zip_ds:
vect_path = zip_ds.extract(arch_vect_path, out_dir)
elif vector_path.suffix == ".tar":
- with tarfile.open(vector_path, "r") as tar_ds:
+ with archives.open_tarfile(vector_path, "r") as tar_ds:
tar_ds.extract(arch_vect_path, out_dir)
vect_path = os.path.join(out_dir, arch_vect_path)
else:
# vector_path should be downloaded to work with 'ogr2ogr'
if path.is_cloud_path(vector_path):
- vector_path = AnyPath(vector_path).fspath
+ vector_path = s3.download(vector_path, out_dir)
vect_path = vector_path
vect_path_gj = os.path.join(
diff --git a/sertit/xml.py b/sertit/xml.py
index 9ddc44a..befa80f 100644
--- a/sertit/xml.py
+++ b/sertit/xml.py
@@ -30,7 +30,7 @@
)
from lxml.html.builder import E
-from sertit import AnyPath, files, path
+from sertit import AnyPath, archives, path, s3
from sertit.logs import SU_NAME
from sertit.misc import ListEnum
from sertit.types import AnyPathStrType
@@ -61,7 +61,7 @@ def read(xml_path: AnyPathStrType) -> _Element:
# Slower but works with:
# {ValueError}Unicode strings with encoding declaration are not supported.
# Please use bytes input or XML fragments without declaration.
- root = fromstring(xml_path.read_bytes())
+ root = fromstring(s3.read(xml_path).read())
else:
# pylint: disable=I1101:
# Module 'lxml.etree' has no 'parse' member, but source is unavailable.
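A self-contained sketch of the bytes-input rule noted in the comment above: lxml rejects a str carrying an encoding declaration but accepts the same document as bytes.

from lxml.etree import fromstring

root = fromstring(b"<?xml version='1.0' encoding='UTF-8'?><root/>")  # OK as bytes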
@@ -75,7 +75,10 @@ def read(xml_path: AnyPathStrType) -> _Element:
def read_archive(
- path: AnyPathStrType, regex: str = None, file_list: list = None
+ archive_path: AnyPathStrType,
+ regex: str = None,
+ file_list: list = None,
+ **kwargs,
) -> _Element:
"""
Read an XML file from inside an archive (zip or tar)
@@ -87,25 +90,34 @@ def read_archive(
- path to the archive plus a regex looking inside the archive. Duplicates the behaviour of :py:func:`files.read_archived_xml`
Args:
- path (AnyPathStrType): Path to the XML file, stored inside an archive or path to the archive itself
+ archive_path (AnyPathStrType): Path to the XML file, stored inside an archive or path to the archive itself
regex (str): Optional. If specified, the path should be the archive path and the regex should be the key to find the XML file inside the archive.
file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
Returns:
_Element: XML Root
"""
-
try:
if not regex:
- path, basename = str(path).split("!")
+ archive_base_path, basename = str(archive_path).split("!")
regex = basename
- if path.startswith("zip://") or path.startswith("tar://"):
- path = path[5:]
+ if archive_base_path.startswith("zip://") or archive_base_path.startswith(
+ "tar://"
+ ):
+ archive_base_path = archive_base_path[5:]
+
+ # For UPath
+ with contextlib.suppress(AttributeError):
+ archive_base_path = AnyPath(
+ archive_base_path, **archive_path.storage_options
+ )
+ else:
+ archive_base_path = archive_path
- return files.read_archived_xml(path, regex, file_list=file_list)
+ return archives.read_archived_xml(archive_base_path, regex, file_list=file_list)
except XMLSyntaxError as exc:
- raise ValueError(f"Invalid metadata XML for {path}!") from exc
+ raise ValueError(f"Invalid metadata XML for {archive_path}!") from exc
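A hedged sketch of the two calling conventions :code:`read_archive` accepts after this change (paths hypothetical):

from sertit import xml

root = xml.read_archive("D:/path/to/arch.zip", regex=r".*metadata\.xml")
root = xml.read_archive("zip://D:/path/to/arch.zip!dir/metadata.xml")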
def write(xml: _Element, path: str) -> None: