diff --git a/.gitignore b/.gitignore
index 5eb2f90..b054ac0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,9 +24,9 @@ dist
 dask-worker-space/*
 
 # Data in CI
-CI/*.tif
-CI/*.zip
-CI/*.vrt
+ci/*.tif
+ci/*.zip
+ci/*.vrt
 
 # Docs & Notebooks
 docs/_build/*
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index cf8aeb1..aa372cb 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -27,7 +27,7 @@ pytest:
     - pip install --ignore-installed PyYAML
     - pip install -e .[full]
   script:
-    - python -m pytest -v --durations=0 --cov-report term --cov-report xml:cov.xml --cov=sertit ci/on_push --cov-config=.coveragerc --log-cli-level DEBUG --capture=sys
+    - python -m pytest -v --durations=0 --cov-report term --cov-report xml:cov.xml --cov=sertit ci --cov-config=.coveragerc --log-cli-level DEBUG --capture=sys
   coverage: '/TOTAL\s+\d+\s+\d+\s+(\d+%)/'
   tags:
     - sertit
@@ -50,7 +50,7 @@ pytest_s3:
    - pip install --ignore-installed PyYAML
    - pip install -e .[full]
  script:
-    - python -m pytest -v --durations=0 --cov-report term --cov-report xml:cov.xml --cov=sertit ci/on_push --cov-config=.coveragerc --log-cli-level DEBUG --capture=sys
+    - python -m pytest -v --durations=0 --cov-report term --cov-report xml:cov.xml --cov=sertit ci --cov-config=.coveragerc --log-cli-level DEBUG --capture=sys
  coverage: '/TOTAL\s+\d+\s+\d+\s+(\d+%)/'
  tags:
    - sertit
diff --git a/CHANGES.md b/CHANGES.md
index 42f0bd8..7527a5f 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,15 @@
 # Release History
 
+## 2.0.0 (20xx-xx-xx)
+
+- **BREAKING CHANGE**: Remove all deprecations from `sertit==1.*` ([#3](https://github.com/sertit/sertit-utils/issues/3)):
+  - Duplication between `path` and `files` modules
+  - Duplication between `ci`, `s3` and `unistra` modules
+  - Arguments in functions
+  - Renaming functions
+  - Others
+- **ENH: Use `universal_pathlib` instead of `cloudpathlib` (even if the code is still compatible with `cloudpathlib`)** ([#4](https://github.com/sertit/sertit-utils/issues/4))
+
 ## 1.44.x (20xx-xx-xx)
 
 - **ENH: Drop `isort`, `black` and `flake8` and use `ruff`**
@@ -8,6 +18,9 @@
 - FIX: Fix deprecation warning for `get_nodata_value_from_dtype` in `rasters_rio`
 - FIX: Force blocksize to 128 when writing small COGs on disk (in order to have multiple overview levels)
 - FIX: Use `np.tan` in `rasters.slope`
+- FIX: Allow str as paths in `ci.assert_files_equal`
+- FIX: Better alignment between the `rasters.read` function and the `rasters.any_raster_to_xr_ds` decorator
+- FIX: Fix `rasters.sieve` function with `xr.apply_ufunc`
 - OPTIM: Compute the spatial index by default in `vectors.read` (set `vectors.read(..., compute_sindex=False)` if you don't want to compute them)
 - CI: Rename CI folder and remove unnecessary intermediate folder
diff --git a/ci/script_utils.py b/ci/script_utils.py
index 63679bd..33f0b05 100644
--- a/ci/script_utils.py
+++ b/ci/script_utils.py
@@ -38,25 +38,26 @@ class Polarization(ListEnum):
 
 def get_s3_ci_path():
     """Get S3 CI path"""
-    unistra.define_s3_client()
-    return AnyPath("s3://sertit-sertit-utils-ci")
+    from sertit.unistra import UNISTRA_S3_ENDPOINT
 
-def get_proj_path():
-    """Get project path"""
-    if int(os.getenv(CI_SERTIT_S3, 1)) and sys.platform != "win32":
-        return get_s3_ci_path()
-    else:
-        # ON DISK
-        return AnyPath(unistra.get_db3_path())
+    try:
+        ci_path = AnyPath(
"s3://sertit-sertit-utils-ci", endpoint_url=f"https://{UNISTRA_S3_ENDPOINT}" + ) + except TypeError: + unistra.define_s3_client() + ci_path = AnyPath("s3://sertit-sertit-utils-ci") + + return ci_path def get_ci_data_path(): """Get CI DATA path""" if int(os.getenv(CI_SERTIT_S3, 1)) and sys.platform != "win32": - return get_proj_path().joinpath("DATA") + return get_s3_ci_path() / "DATA" else: - return get_proj_path().joinpath("CI", "sertit_utils", "DATA") + return AnyPath(unistra.get_db3_path()) / "CI" / "sertit_utils" / "DATA" def dask_env(function): diff --git a/ci/test_archives.py b/ci/test_archives.py new file mode 100644 index 0000000..8283d6d --- /dev/null +++ b/ci/test_archives.py @@ -0,0 +1,153 @@ +import os +import shutil + +import pytest +from lxml import etree, html + +from ci.script_utils import files_path, s3_env +from sertit import archives, ci, files, path, s3, vectors + + +@s3_env +def test_archive(tmp_path): + """Test extracting functions""" + # Archives + zip_file = files_path().joinpath("test_zip.zip") + zip2_file = files_path().joinpath("test_zip.zip") # For overwrite + zip_without_directory = files_path().joinpath("test_zip_without_directory.zip") + tar_file = files_path().joinpath("test_tar.tar") + tar_gz_file = files_path().joinpath("test_targz.tar.gz") + + # Core dir + core_dir = files_path().joinpath("core") + folder = core_dir + arch = [ + zip_file, + tar_file, + tar_gz_file, + folder, + zip2_file, + zip_without_directory, + ] + + # Extract + extracted_dirs = archives.extract_files(arch, tmp_path, overwrite=True) + + # Test + for ex_dir in extracted_dirs: + ci.assert_dir_equal(core_dir, ex_dir) + + archives.extract_files([zip2_file], tmp_path, overwrite=False) # Already existing + + # Test + for ex_dir in extracted_dirs: + ci.assert_dir_equal(core_dir, ex_dir) + + # Archive + archive_base = os.path.join(tmp_path, "archive") + for fmt in ["zip", "tar", "gztar"]: + archive_fn = archives.archive( + folder_path=core_dir, archive_path=archive_base, fmt=fmt + ) + out = archives.extract_file(archive_fn, tmp_path) + # an additional folder is created + out_dir = path.listdir_abspath(out)[0] + ci.assert_dir_equal(core_dir, out_dir) + + # Remove out directory in order to avoid any interferences + files.remove(out) + + # Add to zip + zip_out = zip2_file if path.is_cloud_path(zip2_file) else archive_base + ".zip" + core_copy = files.copy(core_dir, os.path.join(tmp_path, "core2")) + zip_out = archives.add_to_zip(s3.download(zip_out, tmp_path), core_copy) + + # Extract + unzip_out = os.path.join(tmp_path, "out") + unzip_out = archives.extract_file(zip_out, unzip_out) + + # Test + unzip_dirs = path.listdir_abspath(unzip_out) + + assert len(unzip_dirs) == 2 + ci.assert_dir_equal(unzip_dirs[0], unzip_dirs[1]) + + +@s3_env +def test_archived_files(tmp_path): + landsat_name = "LM05_L1TP_200030_20121230_20200820_02_T2_CI" + ok_folder = files_path().joinpath(landsat_name) + zip_file = files_path().joinpath(f"{landsat_name}.zip") + tar_file = files_path().joinpath(f"{landsat_name}.tar") + targz_file = files_path().joinpath(f"{landsat_name}.tar.gz") + sz_file = files_path().joinpath(f"{landsat_name}.7z") + + # VECTORS + vect_name = "map-overlay.kml" + vec_ok_path = ok_folder.joinpath(vect_name) + if shutil.which("ogr2ogr"): # Only works if ogr2ogr can be found. 
+ vect_regex = f".*{vect_name}" + vect_zip = vectors.read(zip_file, archive_regex=vect_regex) + vect_tar = vectors.read(tar_file, archive_regex=r".*overlay\.kml") + vect_ok = vectors.read(vec_ok_path) + assert not vect_ok.empty + ci.assert_geom_equal(vect_ok, vect_zip) + ci.assert_geom_equal(vect_ok, vect_tar) + + # XML + xml_name = "LM05_L1TP_200030_20121230_20200820_02_T2_MTL.xml" + xml_ok_path = ok_folder.joinpath(xml_name) + xml_ok_path = str(s3.download(xml_ok_path, tmp_path)) + + xml_regex = f".*{xml_name}" + xml_zip = archives.read_archived_xml(zip_file, xml_regex) + xml_tar = archives.read_archived_xml(tar_file, r".*_MTL\.xml") + xml_ok = etree.parse(xml_ok_path).getroot() + ci.assert_xml_equal(xml_ok, xml_zip) + ci.assert_xml_equal(xml_ok, xml_tar) + + # FILE + HTML + html_zip_file = files_path().joinpath("productPreview.zip") + html_tar_file = files_path().joinpath("productPreview.tar") + html_name = "productPreview.html" + html_ok_path = files_path().joinpath(html_name) + html_ok_path = str(s3.download(html_ok_path, tmp_path)) + + html_regex = f".*{html_name}" + + # FILE + file_zip = archives.read_archived_file(html_zip_file, html_regex) + file_tar = archives.read_archived_file(html_tar_file, html_regex) + html_ok = html.parse(html_ok_path).getroot() + ci.assert_html_equal(html_ok, html.fromstring(file_zip)) + ci.assert_html_equal(html_ok, html.fromstring(file_tar)) + + file_list = archives.get_archived_file_list(html_zip_file) + ci.assert_html_equal( + html_ok, + html.fromstring( + archives.read_archived_file(html_zip_file, html_regex, file_list=file_list) + ), + ) + + # HTML + html_zip = archives.read_archived_html(html_zip_file, html_regex) + html_tar = archives.read_archived_html(html_tar_file, html_regex) + ci.assert_html_equal(html_ok, html_zip) + ci.assert_html_equal(html_ok, html_tar) + ci.assert_html_equal( + html_ok, + archives.read_archived_html( + html_tar_file, + html_regex, + file_list=archives.get_archived_file_list(html_tar_file), + ), + ) + + # ERRORS + with pytest.raises(TypeError): + archives.read_archived_file(targz_file, xml_regex) + with pytest.raises(TypeError): + archives.read_archived_file(sz_file, xml_regex) + with pytest.raises(FileNotFoundError): + archives.read_archived_file(zip_file, "cdzeferf") diff --git a/ci/test_ci.py b/ci/test_ci.py index 803fe05..3a836d6 100644 --- a/ci/test_ci.py +++ b/ci/test_ci.py @@ -22,7 +22,7 @@ from lxml import etree from ci.script_utils import files_path, rasters_path, s3_env, vectors_path -from sertit import ci, path, rasters, rasters_rio, vectors +from sertit import ci, path, rasters, rasters_rio, s3, vectors ci.reduce_verbosity() @@ -67,12 +67,18 @@ def test_assert_dir(): @s3_env -def test_assert_files(): +def test_assert_files(tmp_path): """Test CI functions""" ok_path = files_path().joinpath("productPreview.html") false_path = files_path().joinpath("false.html") ci.assert_files_equal(ok_path, ok_path) + if path.is_cloud_path(ok_path): + str_ok_path = str(s3.download(ok_path, tmp_path)) + else: + str_ok_path = ok_path + + ci.assert_files_equal(str_ok_path, str_ok_path) with pytest.raises(AssertionError): ci.assert_files_equal(ok_path, false_path) @@ -169,15 +175,15 @@ def test_assert_raster(): @s3_env -def test_assert_xml(): +def test_assert_xml(tmp_path): # XML xml_folder = files_path().joinpath("LM05_L1TP_200030_20121230_20200820_02_T2_CI") xml_path = xml_folder.joinpath("LM05_L1TP_200030_20121230_20200820_02_T2_MTL.xml") xml_bad_path = xml_folder.joinpath("false_xml.xml") if path.is_cloud_path(files_path()): - 
xml_path = xml_path.fspath - xml_bad_path = xml_bad_path.fspath + xml_path = s3.download(xml_path, tmp_path) + xml_bad_path = s3.download(xml_bad_path, tmp_path) xml_ok = etree.parse(str(xml_path)).getroot() xml_nok = etree.parse(str(xml_bad_path)).getroot() @@ -188,19 +194,18 @@ def test_assert_xml(): @s3_env -def test_assert_html(): +def test_assert_html(tmp_path): # HTML html_path = files_path().joinpath("productPreview.html") html_bad_path = files_path().joinpath("false.html") - with tempfile.TemporaryDirectory() as tmp_dir: - if path.is_cloud_path(files_path()): - html_path = html_path.download_to(tmp_dir) - html_bad_path = html_bad_path.download_to(tmp_dir) + if path.is_cloud_path(files_path()): + html_path = s3.download(html_path, tmp_path) + html_bad_path = s3.download(html_bad_path, tmp_path) - html_ok = etree.parse(str(html_path)).getroot() - html_nok = etree.parse(str(html_bad_path)).getroot() + html_ok = etree.parse(str(html_path)).getroot() + html_nok = etree.parse(str(html_bad_path)).getroot() - ci.assert_xml_equal(html_ok, html_ok) - with pytest.raises(AssertionError): - ci.assert_xml_equal(html_ok, html_nok) + ci.assert_xml_equal(html_ok, html_ok) + with pytest.raises(AssertionError): + ci.assert_xml_equal(html_ok, html_nok) diff --git a/ci/test_files.py b/ci/test_files.py index 0a06572..d8de011 100644 --- a/ci/test_files.py +++ b/ci/test_files.py @@ -16,167 +16,17 @@ """Script testing the files""" import os -import shutil import tempfile from datetime import date, datetime import numpy as np -import pytest -from lxml import etree, html -from ci.script_utils import Polarization, files_path, s3_env -from sertit import AnyPath, ci, files, path, vectors +from ci.script_utils import Polarization +from sertit import AnyPath, ci, files ci.reduce_verbosity() -def test_archive(): - """Test extracting functions""" - with tempfile.TemporaryDirectory() as tmp_dir: - # Archives - zip_file = files_path().joinpath("test_zip.zip") - zip2_file = files_path().joinpath("test_zip.zip") # For overwrite - zip_without_directory = files_path().joinpath("test_zip_without_directory.zip") - tar_file = files_path().joinpath("test_tar.tar") - tar_gz_file = files_path().joinpath("test_targz.tar.gz") - - # Core dir - core_dir = files_path().joinpath("core") - folder = core_dir - archives = [ - zip_file, - tar_file, - tar_gz_file, - folder, - zip2_file, - zip_without_directory, - ] - - # Extract - extracted_dirs = files.extract_files(archives, tmp_dir, overwrite=True) - files.extract_files([zip2_file], tmp_dir, overwrite=False) # Already existing - - # Test - for ex_dir in extracted_dirs: - ci.assert_dir_equal(core_dir, ex_dir) - - # Archive - archive_base = os.path.join(tmp_dir, "archive") - for fmt in ["zip", "tar", "gztar"]: - archive_fn = files.archive( - folder_path=core_dir, archive_path=archive_base, fmt=fmt - ) - out = files.extract_file(archive_fn, tmp_dir) - # an additional folder is created - out_dir = path.listdir_abspath(out)[0] - ci.assert_dir_equal(core_dir, out_dir) - - # Remove out directory in order to avoid any interferences - files.remove(out) - - # Add to zip - zip_out = zip2_file if path.is_cloud_path(zip2_file) else archive_base + ".zip" - core_copy = files.copy(core_dir, os.path.join(tmp_dir, "core2")) - zip_out = files.add_to_zip(zip_out, core_copy) - - # Extract - unzip_out = os.path.join(tmp_dir, "out") - unzip_out = files.extract_file(zip_out, unzip_out) - - # Test - unzip_dirs = path.listdir_abspath(unzip_out) - - assert len(unzip_dirs) == 2 - 
ci.assert_dir_equal(unzip_dirs[0], unzip_dirs[1]) - - -@s3_env -def test_archived_files(): - landsat_name = "LM05_L1TP_200030_20121230_20200820_02_T2_CI" - ok_folder = files_path().joinpath(landsat_name) - zip_file = files_path().joinpath(f"{landsat_name}.zip") - tar_file = files_path().joinpath(f"{landsat_name}.tar") - targz_file = files_path().joinpath(f"{landsat_name}.tar.gz") - sz_file = files_path().joinpath(f"{landsat_name}.7z") - - # VECTORS - vect_name = "map-overlay.kml" - vec_ok_path = ok_folder.joinpath(vect_name) - if shutil.which("ogr2ogr"): # Only works if ogr2ogr can be found. - vect_regex = f".*{vect_name}" - vect_zip = vectors.read(zip_file, archive_regex=vect_regex) - vect_tar = vectors.read(tar_file, archive_regex=r".*overlay\.kml") - vect_ok = vectors.read(vec_ok_path) - assert not vect_ok.empty - ci.assert_geom_equal(vect_ok, vect_zip) - ci.assert_geom_equal(vect_ok, vect_tar) - - with tempfile.TemporaryDirectory() as tmp_dir: - # XML - xml_name = "LM05_L1TP_200030_20121230_20200820_02_T2_MTL.xml" - xml_ok_path = ok_folder.joinpath(xml_name) - if path.is_cloud_path(files_path()): - xml_ok_path = str(xml_ok_path.download_to(tmp_dir)) - else: - xml_ok_path = str(xml_ok_path) - - xml_regex = f".*{xml_name}" - xml_zip = files.read_archived_xml(zip_file, xml_regex) - xml_tar = files.read_archived_xml(tar_file, r".*_MTL\.xml") - xml_ok = etree.parse(xml_ok_path).getroot() - ci.assert_xml_equal(xml_ok, xml_zip) - ci.assert_xml_equal(xml_ok, xml_tar) - - # FILE + HTML - html_zip_file = files_path().joinpath("productPreview.zip") - html_tar_file = files_path().joinpath("productPreview.tar") - html_name = "productPreview.html" - html_ok_path = files_path().joinpath(html_name) - if path.is_cloud_path(files_path()): - html_ok_path = str(html_ok_path.download_to(tmp_dir)) - else: - html_ok_path = str(html_ok_path) - - html_regex = f".*{html_name}" - - # FILE - file_zip = files.read_archived_file(html_zip_file, html_regex) - file_tar = files.read_archived_file(html_tar_file, html_regex) - html_ok = html.parse(html_ok_path).getroot() - ci.assert_html_equal(html_ok, html.fromstring(file_zip)) - ci.assert_html_equal(html_ok, html.fromstring(file_tar)) - - file_list = path.get_archived_file_list(html_zip_file) - ci.assert_html_equal( - html_ok, - html.fromstring( - files.read_archived_file(html_zip_file, html_regex, file_list=file_list) - ), - ) - - # HTML - html_zip = files.read_archived_html(html_zip_file, html_regex) - html_tar = files.read_archived_html(html_tar_file, html_regex) - ci.assert_html_equal(html_ok, html_zip) - ci.assert_html_equal(html_ok, html_tar) - ci.assert_html_equal( - html_ok, - files.read_archived_html( - html_tar_file, - html_regex, - file_list=path.get_archived_file_list(html_tar_file), - ), - ) - - # ERRORS - with pytest.raises(TypeError): - files.read_archived_file(targz_file, xml_regex) - with pytest.raises(TypeError): - files.read_archived_file(sz_file, xml_regex) - with pytest.raises(FileNotFoundError): - files.read_archived_file(zip_file, "cdzeferf") - - def test_cp_rm(): """Test CP/RM functions""" with tempfile.TemporaryDirectory() as tmp_dir: @@ -242,10 +92,6 @@ def test_json(): ) # Enum are stored following their value assert obj == test_dict - # Test deprecation - with pytest.deprecated_call(): - files.save_json(json_file, test_dict) - def test_pickle(): """Test pickle functions""" diff --git a/ci/test_path.py b/ci/test_path.py index 66ad609..6da5458 100644 --- a/ci/test_path.py +++ b/ci/test_path.py @@ -16,13 +16,12 @@ """Script testing the files""" 
import os -import shutil import tempfile import pytest -from ci.script_utils import files_path, get_s3_ci_path, s3_env -from sertit import AnyPath, ci, misc, path, vectors +from ci.script_utils import get_s3_ci_path +from sertit import AnyPath, ci, misc, path ci.reduce_verbosity() @@ -65,58 +64,6 @@ def test_paths(): assert not path.is_writable("cvfgbherth") # Non-existing -@s3_env -def test_archived_paths(): - landsat_name = "LM05_L1TP_200030_20121230_20200820_02_T2_CI" - ok_folder = files_path().joinpath(landsat_name) - zip_file = files_path().joinpath(f"{landsat_name}.zip") - tar_file = files_path().joinpath(f"{landsat_name}.tar") - targz_file = files_path().joinpath(f"{landsat_name}.tar.gz") - sz_file = files_path().joinpath(f"{landsat_name}.7z") - - # Archive file - tif_name = "LM05_L1TP_200030_20121230_20200820_02_T2_QA_RADSAT.TIF" - tif_ok = f"{ok_folder.name}/{tif_name}" - tif_regex = f".*{tif_name}" - assert tif_ok == path.get_archived_path(zip_file, tif_regex) - assert tif_ok == path.get_archived_path(zip_file, tif_regex, as_list=True)[0] - assert tif_ok == path.get_archived_path(tar_file, ".*RADSAT") - - # RASTERIO - tif_zip = path.get_archived_rio_path(zip_file, tif_regex) - tif_list = path.get_archived_rio_path(zip_file, tif_regex, as_list=True) - tif_tar = path.get_archived_rio_path(tar_file, ".*RADSAT") - tif_ok = ok_folder.joinpath(tif_name) - ci.assert_raster_equal(tif_ok, tif_zip) - ci.assert_raster_equal(tif_ok, tif_list[0]) - ci.assert_raster_equal(tif_ok, tif_tar) - - file_list = path.get_archived_file_list(zip_file) - ci.assert_raster_equal( - tif_ok, path.get_archived_rio_path(zip_file, tif_regex, file_list=file_list) - ) - - # VECTORS - vect_name = "map-overlay.kml" - vec_ok_path = ok_folder.joinpath(vect_name) - if shutil.which("ogr2ogr"): # Only works if ogr2ogr can be found. 
- vect_regex = f".*{vect_name}" - vect_zip = vectors.read(zip_file, archive_regex=vect_regex) - vect_tar = vectors.read(tar_file, archive_regex=r".*overlay\.kml") - vect_ok = vectors.read(vec_ok_path) - assert not vect_ok.empty - ci.assert_geom_equal(vect_ok, vect_zip) - ci.assert_geom_equal(vect_ok, vect_tar) - - # ERRORS - with pytest.raises(TypeError): - path.get_archived_rio_path(targz_file, tif_regex) - with pytest.raises(TypeError): - path.get_archived_rio_path(sz_file, tif_regex) - with pytest.raises(FileNotFoundError): - path.get_archived_rio_path(zip_file, "cdzeferf") - - def test_get_file_name(): """Test get_file_name""" file_name = path.get_filename(__file__) diff --git a/ci/test_rasters.py b/ci/test_rasters.py index cca0295..74680a3 100644 --- a/ci/test_rasters.py +++ b/ci/test_rasters.py @@ -32,8 +32,6 @@ INT8_NODATA, UINT8_NODATA, UINT16_NODATA, - any_raster_to_xr_ds, - get_nodata_value_from_dtype, get_nodata_value_from_xr, ) from sertit.vectors import EPSG_4326 @@ -104,7 +102,7 @@ def ds_name(raster_path): @pytest.fixture def ds_dtype(raster_path): with rasterio.open(str(raster_path)) as ds: - return ds.meta["dtype"] + return getattr(np, ds.meta["dtype"]) @pytest.fixture @@ -316,7 +314,7 @@ def test_crop(tmp_path, xda, xds, xda_dask, mask): @s3_env @dask_env -def test_sieve(tmp_path, xda, xds, xda_dask): +def test_sieve(tmp_path, raster_path, xda, xds, xda_dask): """Test sieve function""" # DataArray xda_sieved = os.path.join(tmp_path, "test_sieved_xda.tif") @@ -350,6 +348,10 @@ def test_sieve(tmp_path, xda, xds, xda_dask): ci.assert_raster_equal(xda_sieved, raster_sieved_path) ci.assert_raster_equal(xds_sieved, raster_sieved_path) + # From path + sieve_xda_path = rasters.sieve(raster_path, sieve_thresh=20, connectivity=4) + np.testing.assert_array_equal(sieve_xda, sieve_xda_path) + @s3_env @dask_env @@ -675,11 +677,6 @@ def test_write(dtype, nodata_val, tmp_path, xda): ) _test_raster_after_write(test_path, dtype, nodata_val) - # test deprecation warning - test_deprecated_path = os.path.join(tmp_path, "test_depr.tif") - with pytest.deprecated_call(): - rasters.write(xda, path=test_deprecated_path, dtype=dtype) - def test_dim(): """Test on BEAM-DIMAP function""" @@ -900,51 +897,6 @@ def test_rasterize(tmp_path, raster_path): ci.assert_raster_almost_equal(raster_true_path, out_path, decimal=4) -@s3_env -def test_decorator_deprecation(raster_path): - from sertit.rasters import path_xarr_dst - - @any_raster_to_xr_ds - def _ok_rasters(xds): - assert isinstance(xds, xr.DataArray) - return xds - - @path_xarr_dst - def _depr_rasters(xds): - assert isinstance(xds, xr.DataArray) - return xds - - # Not able to warn deprecation from inside the decorator - xr.testing.assert_equal(_ok_rasters(raster_path), _depr_rasters(raster_path)) - - -def test_get_nodata_deprecation(): - """Test deprecation of get_nodata_value""" - # Test deprecation - for dtype in [ - np.uint8, - np.int8, - np.uint16, - np.uint32, - np.int32, - np.int64, - np.uint64, - int, - "int", - np.int16, - np.float32, - np.float64, - float, - "float", - ]: - with pytest.deprecated_call(): - from sertit.rasters import get_nodata_value - - ci.assert_val( - get_nodata_value_from_dtype(dtype), get_nodata_value(dtype), dtype - ) - - @s3_env @dask_env def test_get_notata_from_xr(raster_path): diff --git a/ci/test_rasters_rio.py b/ci/test_rasters_rio.py index 54a7d92..e69dc2c 100644 --- a/ci/test_rasters_rio.py +++ b/ci/test_rasters_rio.py @@ -26,7 +26,6 @@ from ci.script_utils import KAPUT_KWARGS, rasters_path, s3_env from sertit 
import ci, rasters_rio, vectors -from sertit.rasters_rio import any_raster_to_rio_ds, get_nodata_value_from_dtype from sertit.vectors import EPSG_4326 ci.reduce_verbosity() @@ -421,56 +420,3 @@ def _test_idx(idx_list): _test_idx([1]) _test_idx([1, 2]) _test_idx(1) - - -@s3_env -def test_decorator_deprecation(raster_path): - from sertit.rasters_rio import path_arr_dst - - @any_raster_to_rio_ds - def _ok_rasters(ds): - return ds.read() - - @path_arr_dst - def _depr_rasters(ds): - return ds.read() - - # Not able to warn deprecation from inside the decorator - np.testing.assert_equal(_ok_rasters(raster_path), _depr_rasters(raster_path)) - - -def test_get_nodata_deprecation(): - """Test deprecation of get_nodata_value""" - # Test deprecation - for dtype in [ - np.uint8, - np.int8, - np.uint16, - np.uint32, - np.int32, - np.int64, - np.uint64, - int, - "int", - np.int16, - np.float32, - np.float64, - float, - "float", - ]: - with pytest.deprecated_call(): - from sertit.rasters_rio import get_nodata_value - - ci.assert_val( - get_nodata_value_from_dtype(dtype), get_nodata_value(dtype), dtype - ) - - -@s3_env -def test_write_deprecated(tmp_path, raster_path): - test_deprecated_path = os.path.join(tmp_path, "test_depr.tif") - raster, mtd = rasters_rio.read(raster_path) - - # test deprecation warning - with pytest.deprecated_call(): - rasters_rio.write(raster, mtd, path=test_deprecated_path) diff --git a/ci/test_s3.py b/ci/test_s3.py index 81069d2..6cfe091 100644 --- a/ci/test_s3.py +++ b/ci/test_s3.py @@ -19,7 +19,7 @@ import pytest import rasterio -from cloudpathlib import AnyPath, S3Client +from cloudpathlib import AnyPath from tempenv import tempenv from ci.script_utils import CI_SERTIT_S3 @@ -43,6 +43,8 @@ def with_s3(variable_1, variable_2): def without_s3(): + from cloudpathlib import S3Client + S3Client().set_as_default_client() return base_fct(None) diff --git a/ci/test_types.py b/ci/test_types.py index b0cd0e3..1daf305 100644 --- a/ci/test_types.py +++ b/ci/test_types.py @@ -2,15 +2,24 @@ from typing import Union import numpy as np -from cloudpathlib import CloudPath from sertit import AnyPath from sertit.types import AnyPathType, is_iterable, make_iterable +try: + from upath import UPath +except ImportError: + UPath = None + +try: + from cloudpathlib import CloudPath +except ImportError: + CloudPath = None + def test_types(): """Test some type aliases""" - assert AnyPathType == Union[Path, CloudPath] + assert AnyPathType == Union[Path, CloudPath, UPath] def test_is_iterable(): diff --git a/ci/test_unistra.py b/ci/test_unistra.py index c754b90..7d7c144 100644 --- a/ci/test_unistra.py +++ b/ci/test_unistra.py @@ -73,7 +73,10 @@ def test_unistra_s3(): assert with_s3() == 1 # Test get_geodatastore with s3 - assert str(get_geodatastore()) == "s3://sertit-geodatastore" + try: + assert str(get_geodatastore()) == "s3://sertit-geodatastore/" + except AssertionError: + assert str(get_geodatastore()) == "s3://sertit-geodatastore" # Test get_geodatastore without s3 with tempenv.TemporaryEnvironment({s3.USE_S3_STORAGE: "0"}): diff --git a/ci/test_vectors.py b/ci/test_vectors.py index 273077c..3aea31d 100644 --- a/ci/test_vectors.py +++ b/ci/test_vectors.py @@ -21,11 +21,10 @@ import geopandas as gpd import pytest -from rasterio import CRS from shapely import wkt from ci.script_utils import KAPUT_KWARGS, files_path, s3_env, vectors_path -from sertit import ci, files, path, vectors +from sertit import archives, ci, files, path, vectors from sertit.vectors import EPSG_4326, DataSourceError 
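# A minimal sketch of the archive-path pattern the tests below rely on: with
# universal_pathlib-backed paths, an in-archive suffix is appended to the
# archive *name* rather than f-stringed onto the whole path. The archive name
# "archive.zip" and inner member "folder/vector.kml" are hypothetical.
from sertit import AnyPath, vectors

zip_path = AnyPath("archive.zip")
inner_path = zip_path.parent / (zip_path.name + "!folder/vector.kml")
vect = vectors.read(inner_path)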
ci.reduce_verbosity() @@ -81,15 +80,6 @@ def test_vectors(): aoi = vectors.read(kml_path, **KAPUT_KWARGS) _assert_attributes(aoi, kml_path) - with pytest.deprecated_call(): - assert ( - vectors.corresponding_utm_projection(aoi.centroid.x, aoi.centroid.y) - == "EPSG:32638" - ) - assert CRS.from_string("EPSG:32638") == vectors.to_utm_crs( - aoi.centroid.x, aoi.centroid.y - ) - env = aoi.envelope[0] # Test kwargs (should be slightly not equal toi AOI to prove bbox does sth) @@ -280,7 +270,10 @@ def test_read_archived(): map_overlay_extracted = vectors.read(map_overlay_extracted_path) ci.assert_geom_equal( - map_overlay_extracted, vectors.read(f"{zip_landsat}!{landsat}/{map_overlay}") + map_overlay_extracted, + vectors.read( + zip_landsat.parent / (zip_landsat.name + f"!{landsat}/{map_overlay}") + ), ) ci.assert_geom_equal( map_overlay_extracted, @@ -291,7 +284,7 @@ def test_read_archived(): vectors.read(tar_landsat, archive_regex=map_overlay_regex), ) - file_list = path.get_archived_file_list(tar_landsat) + file_list = archives.get_archived_file_list(tar_landsat) ci.assert_geom_equal( map_overlay_extracted, vectors.read(tar_landsat, archive_regex=map_overlay_regex, file_list=file_list), diff --git a/ci/test_xml.py b/ci/test_xml.py index 2236d23..000df2a 100644 --- a/ci/test_xml.py +++ b/ci/test_xml.py @@ -111,7 +111,7 @@ def test_xml(): _assert_str(cv_xml.findtext(".//Age"), "20") # Write - true_xml = str(xml_path() / "true.xml") + true_xml = xml_path() / "true.xml" with tempfile.TemporaryDirectory() as tmp_dir: tmp_xml = os.path.join(tmp_dir, "tmp.xml") xml.write(cv_xml, tmp_xml) @@ -121,7 +121,8 @@ def test_xml(): # Based on `files.read_archived_xml`, so it is considered to work. # Just test the case with complete path to the archive l8_archived = files_path() / "LM05_L1TP_200030_20121230_20200820_02_T2_CI.zip" - xml_archived = f"{l8_archived}!LM05_L1TP_200030_20121230_20200820_02_T2_CI/LM05_L1TP_200030_20121230_20200820_02_T2_MTL.xml" + xml_path_in_zip = "!LM05_L1TP_200030_20121230_20200820_02_T2_CI/LM05_L1TP_200030_20121230_20200820_02_T2_MTL.xml" + xml_archived = l8_archived.parent / (l8_archived.name + xml_path_in_zip) ci.assert_xml_equal( xml.read_archive(l8_archived, r".*_MTL\.xml"), xml.read_archive(xml_archived) diff --git a/pyproject.toml b/pyproject.toml index 4bfa4ce..d78644f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,8 +35,8 @@ dependencies = [ "dill", "psutil", "geopandas>=0.14.4", - "cloudpathlib[all]>=0.12.1", "xarray>=2024.06.0", + "universal_pathlib>=0.2.6" ] dynamic = ["version"] @@ -60,7 +60,8 @@ dask = [ "odc-geo>=0.4.6", "xarray-spatial>=0.3.6", ] -full = ["sertit[colorlog,rasters_rio,rasters,dask]"] +cloudpathlib = ["cloudpathlib[all]>=0.12.1"] +full = ["sertit[colorlog,rasters_rio,rasters,dask,cloudpathlib]"] [project.urls] Bug_Tracker = "https://github.com/sertit/sertit-utils/issues" diff --git a/requirements.txt b/requirements.txt index d02da9c..c4682aa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,6 +19,7 @@ lxml dill psutil geopandas>=0.14.4 +universal_pathlib>=0.2.6 cloudpathlib[all]>=0.12.1 xarray>=2024.06.0 shapely >= 2.0.0 diff --git a/sertit/__init__.py b/sertit/__init__.py index 4f4a348..ca7071b 100644 --- a/sertit/__init__.py +++ b/sertit/__init__.py @@ -21,11 +21,17 @@ """ try: - from cloudpathlib import AnyPath + from upath import UPath + + AnyPath = UPath - AnyPath = AnyPath except ImportError: - pass + try: + from cloudpathlib import AnyPath + + AnyPath = AnyPath + except ImportError: + pass # flake8: noqa from .__meta__ 
import (
diff --git a/sertit/archives.py b/sertit/archives.py
new file mode 100644
index 0000000..db5d540
--- /dev/null
+++ b/sertit/archives.py
@@ -0,0 +1,548 @@
+import logging
+import os
+import re
+import shutil
+import tarfile
+import tempfile
+import zipfile
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Union
+
+from lxml import etree, html
+from tqdm import tqdm
+
+from sertit import AnyPath, path, s3
+from sertit.logs import SU_NAME
+from sertit.types import AnyPathStrType, AnyPathType
+
+LOGGER = logging.getLogger(SU_NAME)
+
+
+@contextmanager
+def open_zipfile(file_path, mode="r"):
+    if path.is_cloud_path(file_path):
+        file_path = s3.read(file_path)
+
+    with zipfile.ZipFile(file_path, mode) as zip_file:
+        yield zip_file
+
+
+@contextmanager
+def open_tarfile(file_path, mode="r"):
+    if path.is_cloud_path(file_path):
+        args = {"fileobj": s3.read(file_path), "mode": mode}
+    else:
+        args = {"name": file_path, "mode": mode}
+    with tarfile.open(**args) as tar_file:
+        yield tar_file
+
+
+def extract_file(
+    file_path: AnyPathStrType,
+    output: AnyPathStrType,
+    overwrite: bool = False,
+) -> AnyPathType:
+    """
+    Extract an archived file (zip or others). Overwrites if specified.
+    If the archive doesn't contain a root directory named after the archive (without its extension), create it.
+
+    Args:
+        file_path (AnyPathStrType): Archive file path
+        output (AnyPathStrType): Output where to put the extracted directory
+        overwrite (bool): Overwrite found extracted directory
+
+    Returns:
+        AnyPathType: Extracted directory path
+
+    Example:
+        >>> file_path = 'D:/path/to/zip.zip'
+        >>> output = 'D:/path/to/output'
+        >>> extract_file(file_path, output, overwrite=True)
+        'D:/path/to/output/zip'
+    """
+    # Convert to path
+    file_path = AnyPath(file_path)
+    output = AnyPath(output)
+
+    # In case a folder is given, returns it (this means that the file is already extracted)
+    if file_path.is_dir():
+        return file_path
+
+    # Beware with .SEN3 and .SAFE extensions
+    archive_output = output.joinpath(path.get_filename(file_path))
+
+    # In case not overwrite and the extracted directory already exists
+    if not overwrite and archive_output.exists():
+        LOGGER.debug(
+            "Already existing extracted %s. It won't be overwritten.",
+            archive_output,
+        )
+        return archive_output
+
+    def extract_sub_dir(arch, filename_list):
+        top_level_files = list({item.split("/")[0] for item in filename_list})
+
+        # When the only root directory in the archive has the right name, we don't have to create it
+        if len(top_level_files) == 1 and archive_output.name == path.get_filename(
+            top_level_files[0]
+        ):
+            arch.extractall(archive_output.parent)
+            archive_output.parent.joinpath(top_level_files[0]).rename(archive_output)
+        else:
+            arch.extractall(archive_output)
+
+    # Manage archive type
+    if file_path.suffix == ".zip":
+        with open_zipfile(file_path) as zip_file:
+            extract_sub_dir(zip_file, zip_file.namelist())
+    elif file_path.suffix == ".tar" or file_path.suffixes == [".tar", ".gz"]:
+        with open_tarfile(file_path) as tar_file:
+            extract_sub_dir(tar_file, tar_file.getnames())
+    elif file_path.suffix == ".7z":
+        try:
+            import py7zr
+
+            with py7zr.SevenZipFile(file_path, "r") as z7_file:
+                extract_sub_dir(z7_file, z7_file.getnames())
+        except ModuleNotFoundError as exc:
+            raise TypeError("Please install 'py7zr' to extract .7z files") from exc
+    else:
+        raise TypeError(
+            f"Only .zip, .tar, .tar.gz and .7z files can be extracted, not {file_path}"
+        )
+
+    return archive_output
+
+
+def extract_files(
+    archives: list, output: AnyPathStrType, overwrite: bool = False
+) -> list:
+    """
+    Extract all archived files. Overwrites if specified.
+
+    Example:
+        >>> file_path = ['D:/path/to/zip1.zip', 'D:/path/to/zip2.zip']
+        >>> output = 'D:/path/to/output'
+        >>> extract_files(file_path, output, overwrite=True)
+        ['D:/path/to/output/zip1', 'D:/path/to/output/zip2']
+
+    Args:
+        archives (list of str): List of archives to be extracted
+        output (str): Output folder where extracted files will be written
+        overwrite (bool): Overwrite found extracted files
+
+    Returns:
+        list: Extracted files (even pre-existing ones)
+    """
+    LOGGER.info("Extracting products in %s", output)
+    progress_bar = tqdm(archives)
+    extracts = []
+    for arch in progress_bar:
+        progress_bar.set_description(f"Extracting product {os.path.basename(arch)}")
+        extracts.append(extract_file(arch, output, overwrite))
+
+    return extracts
+
+
+def read_archived_file(
+    archive_path: AnyPathStrType, regex: str, file_list: list = None
+) -> bytes:
+    """
+    Read archived file (in bytes) from :code:`zip` or :code:`tar` archives.
+
+    You can use this `site `_ to build your regex.
+
+    Args:
+        archive_path (AnyPathStrType): Archive path
+        regex (str): Regex (used by re) as it can be found in the getmembers() list
+        file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
+
+    Returns:
+        bytes: Archived file in bytes
+    """
+    archive_path = AnyPath(archive_path)
+
+    # Compile regex
+    regex = re.compile(regex)
+
+    # Open tar and zip XML
+    try:
+        if archive_path.suffix == ".tar":
+            with open_tarfile(archive_path) as tar_ds:
+                # file_list is not very useful for TAR files...
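+                # Passing a cached `file_list` mainly pays off for archives stored on
+                # the cloud, where recomputing it means re-reading the archive over
+                # the network (see get_archived_path below).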
+                if file_list is None:
+                    tar_mb = tar_ds.getmembers()
+                    file_list = [mb.name for mb in tar_mb]
+                name = list(filter(regex.match, file_list))[0]
+                tarinfo = tar_ds.getmember(name)
+                file_str = tar_ds.extractfile(tarinfo).read()
+        elif archive_path.suffix == ".zip":
+            with open_zipfile(archive_path) as zip_ds:
+                if file_list is None:
+                    file_list = [f.filename for f in zip_ds.filelist]
+                name = list(filter(regex.match, file_list))[0]
+                file_str = zip_ds.read(name)
+
+        elif archive_path.suffixes == [".tar", ".gz"]:
+            raise TypeError(
+                ".tar.gz files are too slow to read from inside the archive. Please extract them instead."
+            )
+        else:
+            raise TypeError(
+                "Only .zip and .tar files can be read from inside their archive."
+            )
+    except IndexError as exc:
+        raise FileNotFoundError(
+            f"Impossible to find file {regex} in {path.get_filename(archive_path)}"
+        ) from exc
+
+    return file_str
+
+
+def read_archived_xml(
+    archive_path: AnyPathStrType, regex: str = None, file_list: list = None, **kwargs
+) -> etree._Element:
+    """
+    Read archived XML from :code:`zip` or :code:`tar` archives.
+
+    You can use this `site `_ to build your regex.
+
+    Args:
+        archive_path (AnyPathStrType): Archive path
+        regex (str): XML regex (used by re) as it can be found in the getmembers() list
+        file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
+
+    Returns:
+        etree._Element: XML file
+
+    Example:
+        >>> arch_path = 'D:/path/to/zip.zip'
+        >>> file_regex = '.*dir.*file_name'  # Use .* for any character
+        >>> read_archived_xml(arch_path, file_regex)
+    """
+    xml_bytes = read_archived_file(archive_path, regex=regex, file_list=file_list)
+
+    return etree.fromstring(xml_bytes)
+
+
+def read_archived_html(
+    archive_path: AnyPathStrType, regex: str, file_list: list = None
+) -> html.HtmlElement:
+    """
+    Read archived HTML from :code:`zip` or :code:`tar` archives.
+
+    You can use this `site `_ to build your regex.
+
+    Args:
+        archive_path (AnyPathStrType): Archive path
+        regex (str): HTML regex (used by re) as it can be found in the getmembers() list
+        file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
+
+    Returns:
+        html.HtmlElement: HTML file
+
+    Example:
+        >>> arch_path = 'D:/path/to/zip.zip'
+        >>> file_regex = '.*dir.*file_name'  # Use .* for any character
+        >>> read_archived_html(arch_path, file_regex)
+    """
+    html_bytes = read_archived_file(archive_path, regex, file_list=file_list)
+
+    return html.fromstring(html_bytes)
+
+
+def archive(
+    folder_path: AnyPathStrType,
+    archive_path: AnyPathStrType,
+    fmt: str = "zip",
+) -> AnyPathType:
+    """
+    Archives a folder recursively.
+
+    Args:
+        folder_path (AnyPathStrType): Folder to archive
+        archive_path (AnyPathStrType): Archive path, with or without extension
+        fmt (str): Format of the archive, used by :code:`shutil.make_archive`. Choose between [zip, tar, gztar, bztar, xztar]
+
+    Returns:
+        AnyPathType: Archive filename
+
+    Example:
+        >>> folder_path = 'D:/path/to/folder_to_archive'
+        >>> archive_path = 'D:/path/to/output'
+        >>> archive = archive(folder_path, archive_path, fmt="gztar")
+        'D:/path/to/output/folder_to_archive.tar.gz'
+    """
+    archive_path = AnyPath(archive_path)
+    folder_path = AnyPath(folder_path)
+
+    # with zipfile.ZipFile(archive_path, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
+    #     for f in folder_path.glob("**"):
+    #         zipf.write(f, f.relative_to(folder_path.name))
+
+    tmp_dir = None
+    if path.is_cloud_path(folder_path):
+        tmp_dir = tempfile.TemporaryDirectory()
+        folder_path = s3.download(folder_path, tmp_dir.name)
+
+    # Shutil make_archive needs a path without extension
+    archive_base = os.path.splitext(archive_path)[0]
+
+    # Archive the folder
+    archive_fn = shutil.make_archive(
+        archive_base,
+        format=fmt,
+        root_dir=folder_path.parent,
+        base_dir=folder_path.name,
+    )
+
+    if tmp_dir is not None:
+        tmp_dir.cleanup()
+
+    try:
+        arch = AnyPath(archive_fn, **folder_path.storage_options)
+    except AttributeError:
+        arch = AnyPath(archive_fn)
+
+    return arch
+
+
+def add_to_zip(
+    zip_path: AnyPathStrType,
+    dirs_to_add: Union[list, AnyPathStrType],
+) -> AnyPathType:
+    """
+    Add folders to an already existing zip file (recursively).
+
+    Args:
+        zip_path (AnyPathStrType): Already existing zip file
+        dirs_to_add (Union[list, AnyPathStrType]): Directories to add
+
+    Returns:
+        AnyPathType: Updated zip_path
+
+    Example:
+        >>> zip_path = 'D:/path/to/zip.zip'
+        >>> dirs_to_add = ['D:/path/to/dir1', 'D:/path/to/dir2']
+        >>> add_to_zip(zip_path, dirs_to_add)
+        zip.zip contains 2 more folders, dir1 and dir2
+    """
+    zip_path = AnyPath(zip_path)
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        # Updating a zip stored on the cloud is not supported yet (zipfile doesn't like cloud paths)
+        if path.is_cloud_path(zip_path):
+            raise NotImplementedError(
+                "Impossible (for now) to update a zip stored in the cloud!"
+            )
+
+        # Check if existing zipfile
+        if not zip_path.is_file():
+            raise FileNotFoundError(f"Non existing {zip_path}")
+
+        # Convert to list if needed
+        if not isinstance(dirs_to_add, list):
+            dirs_to_add = [dirs_to_add]
+
+        # Add all folders to the existing zip
+        # Forced to use ZipFile because make_archive only works with one folder and not existing zipfile
+        with open_zipfile(zip_path, "a") as zip_file:
+            progress_bar = tqdm(dirs_to_add)
+            for dir_to_add_path in progress_bar:
+                # Just to be sure, use str instead of Paths
+                if isinstance(dir_to_add_path, Path):
+                    dir_to_add = str(dir_to_add_path)
+                elif path.is_cloud_path(dir_to_add_path):
+                    dir_to_add = dir_to_add_path.fspath
+                else:
+                    dir_to_add = dir_to_add_path
+
+                progress_bar.set_description(
+                    f"Adding {os.path.basename(dir_to_add)} to {os.path.basename(zip_path)}"
+                )
+                if os.path.isfile(dir_to_add):
+                    dir_to_add = extract_file(dir_to_add, tmp_dir)
+
+                for root, _, files in os.walk(dir_to_add):
+                    base_path = os.path.join(dir_to_add, "..")
+
+                    # Write dir (in namelist at least)
+                    zip_file.write(root, os.path.relpath(root, base_path))
+
+                    # Write files
+                    for file in files:
+                        zip_file.write(
+                            os.path.join(root, file),
+                            os.path.relpath(
+                                os.path.join(root, file), os.path.join(dir_to_add, "..")
+                            ),
+                        )
+
+    return zip_path
+
+
+def get_archived_file_list(archive_path: AnyPathStrType) -> list:
+    """
+    Get the list of all the files contained in an archive.
+
+    Args:
+        archive_path (AnyPathStrType): Archive path
+
+    Returns:
+        list: All files contained in the given archive
+
+    Example:
+        >>> arch_path = 'D:/path/to/zip.zip'
+        >>> get_archived_file_list(arch_path)
+        ['file_1.txt', 'file_2.tif', 'file_3.xml', 'file_4.geojson']
+    """
+    archive_path = AnyPath(archive_path)
+
+    is_zip = archive_path.suffix == ".zip"
+    archive_fn = path.get_filename(archive_path)
+    if is_zip:
+        with open_zipfile(archive_path) as zip_ds:
+            file_list = [f.filename for f in zip_ds.filelist]
+    else:
+        try:
+            with open_tarfile(archive_path) as tar_ds:
+                tar_mb = tar_ds.getmembers()
+                file_list = [mb.name for mb in tar_mb]
+        except tarfile.ReadError as ex:
+            raise tarfile.ReadError(f"Impossible to open archive: {archive_fn}") from ex
+
+    return file_list
+
+
+def get_archived_path(
+    archive_path: AnyPathStrType,
+    regex: str,
+    as_list: bool = False,
+    case_sensitive: bool = False,
+    file_list: list = None,
+    **kwargs,
+) -> Union[list, AnyPathType]:
+    """
+    Get archived file path from inside the archive.
+
+    .. WARNING::
+        If :code:`as_list` is :code:`False`, it will only return the first file matched !
+
+    You can use this `site `_ to build your regex.
+
+    Args:
+        archive_path (AnyPathStrType): Archive path
+        regex (str): File regex (used by re) as it can be found in the getmembers() list
+        as_list (bool): If true, returns a list (including all found files). If false, returns only the first match
+        case_sensitive (bool): If true, the regex is case-sensitive.
+        file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
+
+    Returns:
+        Union[list, str]: Path from inside the zipfile
+
+    Example:
+        >>> arch_path = 'D:/path/to/zip.zip'
+        >>> file_regex = '.*dir.*file_name'  # Use .* for any character
+        >>> path = get_archived_path(arch_path, file_regex)
+        'dir/filename.tif'
+    """
+    # Get file list
+    archive_path = AnyPath(archive_path)
+
+    # Offer the ability to give the file list directly, as this operation is expensive when done with large archives stored on the cloud
+    if file_list is None:
+        file_list = get_archived_file_list(archive_path)
+
+    # Search for file
+    re_rgx = re.compile(regex) if case_sensitive else re.compile(regex, re.IGNORECASE)
+    archived_band_paths = list(filter(re_rgx.match, file_list))
+    if not archived_band_paths:
+        raise FileNotFoundError(
+            f"Impossible to find file {regex} in {path.get_filename(archive_path)}"
+        )
+
+    # Convert to str if needed
+    if not as_list:
+        archived_band_paths = archived_band_paths[0]
+
+    return archived_band_paths
+
+
+def get_archived_rio_path(
+    archive_path: AnyPathStrType,
+    regex: str,
+    as_list: bool = False,
+    file_list: list = None,
+    **kwargs,
+) -> Union[list, AnyPathType]:
+    """
+    Get archived file path from inside the archive, to be read with rasterio:
+
+    - :code:`zip+file://{zip_path}!{file_name}`
+    - :code:`tar+file://{tar_path}!{file_name}`
+
+    See `here `_
+    for more information.
+
+    .. WARNING::
+        It won't be readable by pandas, geopandas or xmltree !
+
+    .. WARNING::
+        If :code:`as_list` is :code:`False`, it will only return the first file matched !
+
+    You can use this `site `_ to build your regex.
+
+    Args:
+        archive_path (AnyPathStrType): Archive path
+        regex (str): File regex (used by re) as it can be found in the getmembers() list
+        as_list (bool): If true, returns a list (including all found files). If false, returns only the first match
+        file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
+
+    Returns:
+        Union[list, str]: Band path that can be read by rasterio
+
+    Example:
+        >>> arch_path = 'D:/path/to/zip.zip'
+        >>> file_regex = '.*dir.*file_name'  # Use .* for any character
+        >>> path = get_archived_rio_path(arch_path, file_regex)
+        'zip+file://D:/path/to/output.zip!dir/filename.tif'
+        >>> rasterio.open(path)
+
+    """
+    archive_path = AnyPath(archive_path)
+    if archive_path.suffix in [".tar", ".zip"]:
+        prefix = archive_path.suffix[-3:]
+    elif archive_path.suffixes == [".tar", ".gz"]:
+        raise TypeError(
+            ".tar.gz files are too slow to be read from inside the archive. Please extract them instead."
+        )
+    else:
+        raise TypeError("Only .zip and .tar files can be read from inside their archive.")
+
+    # Search for file
+    archived_band_paths = get_archived_path(
+        archive_path, regex=regex, as_list=True, file_list=file_list
+    )
+
+    # Convert to rio path
+    if path.is_cloud_path(archive_path):
+        archived_band_paths = [
+            f"{prefix}+file+{archive_path}!{p}" for p in archived_band_paths
+        ]
+    else:
+        # archived_band_paths = [
+        #     f"{prefix}+file://{archive_path}!{path}" for path in archived_band_paths
+        # ]
+        archived_band_paths = [
+            f"/vsi{prefix}/{archive_path}/{p}" for p in archived_band_paths
+        ]
+
+    # Convert to str if needed
+    if not as_list:
+        archived_band_paths = archived_band_paths[0]
+
+    return archived_band_paths
diff --git a/sertit/arcpy.py b/sertit/arcpy.py
index a82f603..64cc9f9 100644
--- a/sertit/arcpy.py
+++ b/sertit/arcpy.py
@@ -1,8 +1,6 @@
 import logging
 import logging.handlers
 
-from sertit.logs import deprecation_warning
-
 # Arcpy types from inside a schema
 SHORT = "int32:4"
 """ 'Short' type for ArcGis GDB """
@@ -153,32 +151,6 @@ def emit(self, record):
         super(ArcPyLogHandler, self).emit(record)
 
 
-def feature_layer_to_path(feature_layer) -> str:
-    """
-    .. deprecated:: 1.36.0
-        Use :py:func:`gp_layer_to_path` instead.
-
-    Use :func:`gp_layer_to_path` instead.
-
-    Convert a feature layer to its source path.
-
-    Args:
-        feature_layer: Feature layer
-
-    Returns:
-        str: Path to the feature layer source
-
-    """
-    deprecation_warning("This function is deprecated. Use gp_layer_to_path instead.")
-    # Get path
-    if hasattr(feature_layer, "dataSource"):
-        path = feature_layer.dataSource
-    else:
-        path = str(feature_layer)
-
-    return path
-
-
 def gp_layer_to_path(feature_layer) -> str:
     """
     Convert a GP layer to its source path.
diff --git a/sertit/ci.py b/sertit/ci.py
index 8b000ba..9e07937 100644
--- a/sertit/ci.py
+++ b/sertit/ci.py
@@ -20,6 +20,7 @@
 import filecmp
 import logging
 import pprint
+import tempfile
 from doctest import Example
 from typing import Any, Union
 
@@ -30,8 +31,8 @@
 from shapely import force_2d, normalize
 from shapely.testing import assert_geometries_equal
 
-from sertit import AnyPath, files, s3, unistra
-from sertit.logs import SU_NAME, deprecation_warning
+from sertit import AnyPath, files, path, s3
+from sertit.logs import SU_NAME
 from sertit.types import AnyPathStrType, AnyXrDataStructure
 
 LOGGER = logging.getLogger(SU_NAME)
@@ -42,61 +43,6 @@
 AWS_S3_ENDPOINT = s3.AWS_S3_ENDPOINT
 
 
-def s3_env(*args, **kwargs):
-    """
-    .. deprecated:: 1.30.0
-        Import it from :py:mod:`sertit.unistra` instead of :py:mod:`sertit.ci`
-    """
-    deprecation_warning(
-        "This function is deprecated. Import it from 'sertit.unistra' instead of 'sertit.ci'"
-    )
-    return unistra.s3_env(*args, **kwargs)
-
-
-def define_s3_client():
-    """
-    .. deprecated:: 1.30.0
-        Import it from :py:mod:`sertit.unistra` instead of :py:mod:`sertit.ci`
-    """
-    deprecation_warning(
-        "This function is deprecated.
Import it from 'sertit.unistra' instead of 'sertit.ci'" - ) - return unistra.define_s3_client() - - -def get_db2_path(): - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.unistra` instead of :py:mod:`sertit.ci` - """ - deprecation_warning( - "This function is deprecated. Import it from 'sertit.unistra' instead of 'sertit.ci'" - ) - return unistra.get_db2_path() - - -def get_db3_path(): - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.unistra` instead of :py:mod:`sertit.ci` - """ - deprecation_warning( - "This function is deprecated. Import it from 'sertit.unistra' instead of 'sertit.ci'" - ) - return unistra.get_db3_path() - - -def get_db4_path(): - """ - .. deprecated:: 1.30.0 - Import it from :py:mod:`sertit.unistra` instead of :py:mod:`sertit.ci` - """ - deprecation_warning( - "This function is deprecated. Import it from 'sertit.unistra' instead of 'sertit.ci'" - ) - return unistra.get_db4_path() - - def assert_val(val_1: Any, val_2: Any, field: str) -> None: """ Compare two values corresponding to a field @@ -140,7 +86,7 @@ def assert_files_equal(file_1: AnyPathStrType, file_2: AnyPathStrType): file_1 (str): Path to file 1 file_2 (str): Path to file 2 """ - with file_1.open("r") as f1, file_2.open("r") as f2: + with AnyPath(file_1).open("r") as f1, AnyPath(file_2).open("r") as f2: assert files.hash_file_content(f1.read()) == files.hash_file_content(f2.read()) @@ -381,27 +327,36 @@ def assert_dir_equal(path_1: AnyPathStrType, path_2: AnyPathStrType) -> None: assert path_1.is_dir(), f"{path_1} is not a directory!" assert path_2.is_dir(), f"{path_2} is not a directory!" - dcmp = filecmp.dircmp(path_1, path_2) - try: - assert ( - dcmp.left_only == [] - ), f"More files in {path_1}!\n{pprint.pformat(list(dcmp.left_only))}" - assert ( - dcmp.right_only == [] - ), f"More files in {path_2}!\n{pprint.pformat(list(dcmp.right_only))}" - except FileNotFoundError: - files_1 = [AnyPath(p).name for p in AnyPath(path_1).iterdir()] - files_2 = [AnyPath(p).name for p in AnyPath(path_2).iterdir()] - - for f1 in files_1: - assert ( - f1 in files_2 - ), f"File missing!\n{f1} not in {pprint.pformat(files_2)}" + with ( + tempfile.TemporaryDirectory() as tmpdir, + tempfile.TemporaryDirectory() as tmpdir2, + ): + if path.is_cloud_path(path_1): + path_1 = s3.download(path_1, tmpdir) + if path.is_cloud_path(path_2): + path_2 = s3.download(path_2, tmpdir2) - for f2 in files_2: + dcmp = filecmp.dircmp(path_1, path_2) + try: + assert ( + dcmp.left_only == [] + ), f"More files in {path_1}!\n{pprint.pformat(list(dcmp.left_only))}" assert ( - f2 in files_1 - ), f"File missing!\n{f2} not in {pprint.pformat(files_1)}" + dcmp.right_only == [] + ), f"More files in {path_2}!\n{pprint.pformat(list(dcmp.right_only))}" + except FileNotFoundError: + files_1 = [p.name for p in path_1.iterdir()] + files_2 = [p.name for p in path_2.iterdir()] + + for f1 in files_1: + assert ( + f1 in files_2 + ), f"File missing!\n{f1} not in {pprint.pformat(files_2)}" + + for f2 in files_2: + assert ( + f2 in files_1 + ), f"File missing!\n{f2} not in {pprint.pformat(files_1)}" def assert_geom_equal( diff --git a/sertit/files.py b/sertit/files.py index 4bdb6c2..7be06e9 100644 --- a/sertit/files.py +++ b/sertit/files.py @@ -19,23 +19,17 @@ import json import logging import os -import re import shutil -import tarfile -import tempfile -import zipfile from datetime import date, datetime from enum import Enum from json import JSONDecoder, JSONEncoder from pathlib import Path -from typing import Any, Union +from typing import 
diff --git a/sertit/files.py b/sertit/files.py
--- a/sertit/files.py
+++ b/sertit/files.py
 from typing import Any

 import dill
 import numpy as np
-from lxml import etree, html
-from tqdm import tqdm

-from sertit import AnyPath, logs, path
+from sertit import AnyPath, path, s3
 from sertit.logs import SU_NAME
 from sertit.strings import DATE_FORMAT
 from sertit.types import AnyPathStrType, AnyPathType
@@ -43,628 +37,6 @@
 LOGGER = logging.getLogger(SU_NAME)


-def get_root_path() -> AnyPathType:
-    """
-    .. deprecated:: 1.30.0
-        Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
-    Get the root path of the current disk:
-
-    - On Linux this returns :code:`/`
-    - On Windows this returns :code:`C:/` or whatever the current drive is
-
-    Example:
-        >>> get_root_path()
-        "/" on Linux
-        "C:/" on Windows (if you run this code from the C: drive)
-    """
-    logs.deprecation_warning(
-        "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
-    )
-    return path.get_root_path()
-
-
-def listdir_abspath(directory: AnyPathStrType) -> list:
-    """
-    .. deprecated:: 1.30.0
-        Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
-    Get absolute path of all files in the given directory.
-
-    It is the same function than :code:`os.listdir` but returning absolute paths.
-
-    Args:
-        directory (AnyPathStrType): Relative or absolute path to the directory to be scanned
-
-    Returns:
-        str: Absolute path of all files in the given directory
-
-    Example:
-        >>> folder = "."
-        >>> listdir_abspath(folder)
-        ['D:/_SERTIT_UTILS/sertit-utils/sertit/files.py',
-        'D:/_SERTIT_UTILS/sertit-utils/sertit/logs.py',
-        'D:/_SERTIT_UTILS/sertit-utils/sertit/misc.py',
-        'D:/_SERTIT_UTILS/sertit-utils/sertit/network.py',
-        'D:/_SERTIT_UTILS/sertit-utils/sertit/rasters_rio.py',
-        'D:/_SERTIT_UTILS/sertit-utils/sertit/strings.py',
-        'D:/_SERTIT_UTILS/sertit-utils/sertit/vectors.py',
-        'D:/_SERTIT_UTILS/sertit-utils/sertit/version.py',
-        'D:/_SERTIT_UTILS/sertit-utils/sertit/__init__.py']
-    """
-    logs.deprecation_warning(
-        "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
-    )
-    return path.listdir_abspath(directory)
-
-
-def to_abspath(
-    raw_path: AnyPathStrType,
-    create: bool = True,
-    raise_file_not_found: bool = True,
-) -> AnyPathType:
-    """
-    .. deprecated:: 1.30.0
-        Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
-    Return the absolute path of the specified path and check if it exists
-
-    If not:
-
-    - If it is a file (aka has an extension), it raises an exception
-    - If it is a folder, it creates it
-
-    To be used with argparse to retrieve the absolute path of a file, like:
-
-    Args:
-        raw_path (AnyPathStrType): Path as a string (relative or absolute)
-        create (bool): Create directory if not existing
-
-    Returns:
-        AnyPathType: Absolute path
-
-    Example:
-        >>> parser = argparse.ArgumentParser()
-        >>> # Add config file path key
-        >>> parser.add_argument(
-        >>>     "--config",
-        >>>     help="Config file path (absolute or relative)",
-        >>>     type=to_abspath
-        >>> )
-    """
-    logs.deprecation_warning(
-        "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
-    )
-    return path.to_abspath(raw_path, create, raise_file_not_found)
-
-
-def real_rel_path(raw_path: AnyPathStrType, start: AnyPathStrType) -> AnyPathType:
-    """
-    .. deprecated:: 1.30.0
-        Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
-    Gives the real relative path from a starting folder.
-    (and not just adding :code:`../..` between the start and the target)
-
-    Args:
-        raw_path (AnyPathStrType): Path to make relative
-        start (AnyPathStrType): Start, the path being relative from this folder.
-
-    Returns:
-        Relative path
-
-    Example:
-        >>> path = r'D:/_SERTIT_UTILS/sertit-utils/sertit'
-        >>> start = os.path.join(".", "..", "..")
-        >>> real_rel_path(path, start)
-        'sertit-utils/sertit'
-    """
-    logs.deprecation_warning(
-        "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
-    )
-    return path.real_rel_path(raw_path, start)
-
-
-def extract_file(
-    file_path: AnyPathStrType,
-    output: AnyPathStrType,
-    overwrite: bool = False,
-) -> AnyPathType:
-    """
-    Extract an archived file (zip or others). Overwrites if specified.
-    If the archive don't contain a root directory with the name of the archive without the extension, create it
-
-    Args:
-        file_path (str): Archive file path
-        output (str): Output where to put the extracted directory
-        overwrite (bool): Overwrite found extracted directory
-
-    Returns:
-        AnyPathType: Extracted directory paths
-
-    Example:
-        >>> file_path = 'D:/path/to/zip.zip'
-        >>> output = 'D:/path/to/output'
-        >>> extract_file(file_path, output, overwrite=True)
-        D:/path/to/output/zip'
-    """
-    # Convert to path
-    file_path = AnyPath(file_path)
-    output = AnyPath(output)
-
-    # In case a folder is given, returns it (this means that the file is already extracted)
-    if file_path.is_dir():
-        return file_path
-
-    # Beware with .SEN3 and .SAFE extensions
-    archive_output = output.joinpath(path.get_filename(file_path))
-
-    # In case not overwrite and the extracted directory already exists
-    if not overwrite and archive_output.exists():
-        LOGGER.debug(
-            "Already existing extracted %s. It won't be overwritten.",
-            archive_output,
-        )
-        return archive_output
-
-    def extract_sub_dir(arch, filename_list):
-        top_level_files = list({item.split("/")[0] for item in filename_list})
-
-        # When the only root directory in the archive has the right name, we don't have to create it
-        if len(top_level_files) == 1 and archive_output.name == path.get_filename(
-            top_level_files[0]
-        ):
-            arch.extractall(archive_output.parent)
-            archive_output.parent.joinpath(top_level_files[0]).rename(archive_output)
-        else:
-            arch.extractall(archive_output)
-
-    # Manage archive type
-    if file_path.suffix == ".zip":
-        with zipfile.ZipFile(file_path, "r") as zip_file:
-            extract_sub_dir(zip_file, zip_file.namelist())
-    elif file_path.suffix == ".tar" or file_path.suffixes == [".tar", ".gz"]:
-        with tarfile.open(file_path, "r") as tar_file:
-            extract_sub_dir(tar_file, tar_file.getnames())
-    elif file_path.suffix == ".7z":
-        try:
-            import py7zr
-
-            with py7zr.SevenZipFile(file_path, "r") as z7_file:
-                extract_sub_dir(z7_file, z7_file.getnames())
-        except ModuleNotFoundError as exc:
-            raise TypeError("Please install 'py7zr' to extract .7z files") from exc
-    else:
-        raise TypeError(
-            f"Only .zip, .tar, .tar.gz and .7z files can be extracted, not {file_path}"
-        )
-
-    return archive_output
-
-
-def extract_files(
-    archives: list, output: AnyPathStrType, overwrite: bool = False
-) -> list:
-    """
-    Extract all archived files. Overwrites if specified.
-
-    Example:
-        >>> file_path = ['D:/path/to/zip1.zip', 'D:/path/to/zip2.zip']
-        >>> output = 'D:/path/to/output'
-        >>> extract_files(file_path, output, overwrite=True)
-        ['D:/path/to/output.zip1', 'D:/path/to/output.zip2']
-
-    Args:
-        archives (list of str): List of archives to be extracted
-        output (str): Output folder where extracted files will be written
-        overwrite (bool): Overwrite found extracted files
-
-    Returns:
-        list: Extracted files (even pre-existing ones)
-    """
-    LOGGER.info("Extracting products in %s", output)
-    progress_bar = tqdm(archives)
-    extracts = []
-    for arch in progress_bar:
-        progress_bar.set_description(f"Extracting product {os.path.basename(arch)}")
-        extracts.append(extract_file(arch, output, overwrite))
-
-    return extracts
-
-
-def get_archived_file_list(archive_path: AnyPathStrType) -> list:
-    """
-    .. deprecated:: 1.30.0
-        Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
-    Get the list of all the files contained in an archive.
-
-    Args:
-        archive_path (AnyPathStrType): Archive path
-
-    Returns:
-        list: All files contained in the given archive
-
-    Example:
-        >>> arch_path = 'D:/path/to/zip.zip'
-        >>> get_archived_file_list(arch_path, file_regex)
-        ['file_1.txt', 'file_2.tif', 'file_3.xml', 'file_4.geojson']
-    """
-    logs.deprecation_warning(
-        "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
-    )
-    return path.get_archived_file_list(archive_path)
-
-
-def get_archived_path(
-    archive_path: AnyPathStrType, file_regex: str, as_list: bool = False
-) -> Union[list, AnyPathType]:
-    """
-    .. deprecated:: 1.30.0
-        Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
-    Get archived file path from inside the archive.
-
-    .. WARNING::
-        If :code:`as_list` is :code:`False`, it will only return the first file matched !
-
-    You can use this `site <https://regexr.com/>`_ to build your regex.
-
-    Example:
-        >>> arch_path = 'D:/path/to/zip.zip'
-        >>> file_regex = '.*dir.*file_name'  # Use .* for any character
-        >>> path = get_archived_path(arch_path, file_regex)
-        'dir/filename.tif'
-
-    Args:
-        archive_path (AnyPathStrType): Archive path
-        file_regex (str): File regex (used by re) as it can be found in the getmembers() list
-        as_list (bool): If true, returns a list (including all found files). If false, returns only the first match
-
-    Returns:
-        Union[list, str]: Path from inside the zipfile
-    """
-    logs.deprecation_warning(
-        "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
-    )
-    return path.get_archived_path(archive_path, file_regex, as_list)
-
-
-def get_archived_rio_path(
-    archive_path: AnyPathStrType, file_regex: str, as_list: bool = False
-) -> Union[list, AnyPathType]:
-    """
-    .. deprecated:: 1.30.0
-        Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
-    Get archived file path from inside the archive, to be read with rasterio:
-
-    - :code:`zip+file://{zip_path}!{file_name}`
-    - :code:`tar+file://{tar_path}!{file_name}`
-
-    See `here <https://rasterio.readthedocs.io/en/latest/topics/datasets.html>`_
-    for more information.
-
-    .. WARNING::
-        It won't be readable by pandas, geopandas or xmltree !
-
-    .. WARNING::
-        If :code:`as_list` is :code:`False`, it will only return the first file matched !
-
-    You can use this `site <https://regexr.com/>`_ to build your regex.
-
-    Args:
-        archive_path (AnyPathStrType): Archive path
-        file_regex (str): File regex (used by re) as it can be found in the getmembers() list
-        as_list (bool): If true, returns a list (including all found files). If false, returns only the first match
-
-    Returns:
-        Union[list, str]: Band path that can be read by rasterio
-
-    Example:
-        >>> arch_path = 'D:/path/to/zip.zip'
-        >>> file_regex = '.*dir.*file_name'  # Use .* for any character
-        >>> path = get_archived_tif_path(arch_path, file_regex)
-        'zip+file://D:/path/to/output.zip!dir/filename.tif'
-        >>> rasterio.open(path)
-
-    """
-    logs.deprecation_warning(
-        "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
-    )
-    return path.get_archived_rio_path(archive_path, file_regex, as_list)
-
-
-def read_archived_file(
-    archive_path: AnyPathStrType, regex: str, file_list: list = None
-) -> bytes:
-    """
-    Read archived file (in bytes) from :code:`zip` or :code:`tar` archives.
-
-    You can use this `site <https://regexr.com/>`_ to build your regex.
-
-    Args:
-        archive_path (AnyPathStrType): Archive path
-        regex (str): Regex (used by re) as it can be found in the getmembers() list
-        file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
-
-    Returns:
-        bytes: Archived file in bytes
-    """
-    archive_path = AnyPath(archive_path)
-
-    # Compile regex
-    regex = re.compile(regex)
-
-    # Open tar and zip XML
-    try:
-        if archive_path.suffix == ".tar":
-            with tarfile.open(archive_path) as tar_ds:
-                # file_list is not very useful for TAR files...
-                if file_list is None:
-                    tar_mb = tar_ds.getmembers()
-                    file_list = [mb.name for mb in tar_mb]
-                name = list(filter(regex.match, file_list))[0]
-                tarinfo = tar_ds.getmember(name)
-                file_str = tar_ds.extractfile(tarinfo).read()
-        elif archive_path.suffix == ".zip":
-            with zipfile.ZipFile(archive_path) as zip_ds:
-                if file_list is None:
-                    file_list = [f.filename for f in zip_ds.filelist]
-                name = list(filter(regex.match, file_list))[0]
-                file_str = zip_ds.read(name)
-
-        elif archive_path.suffix == ".tar.gz":
-            raise TypeError(
-                ".tar.gz files are too slow to read from inside the archive. Please extract them instead."
-            )
-        else:
-            raise TypeError(
-                "Only .zip and .tar files can be read from inside its archive."
-            )
-    except IndexError as exc:
-        raise FileNotFoundError(
-            f"Impossible to find file {regex} in {path.get_filename(archive_path)}"
-        ) from exc
-
-    return file_str
-
-
-def read_archived_xml(
-    archive_path: AnyPathStrType, regex: str = None, file_list: list = None, **kwargs
-) -> etree._Element:
-    """
-    Read archived XML from :code:`zip` or :code:`tar` archives.
-
-    You can use this `site <https://regexr.com/>`_ to build your regex.
-
-    Args:
-        archive_path (AnyPathStrType): Archive path
-        regex (str): XML regex (used by re) as it can be found in the getmembers() list
-        file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
-
-    Returns:
-        etree._Element: XML file
-
-    Example:
-        >>> arch_path = 'D:/path/to/zip.zip'
-        >>> file_regex = '.*dir.*file_name'  # Use .* for any character
-        >>> read_archived_xml(arch_path, file_regex)
-
-    """
-    if regex is None:
-        logs.deprecation_warning(
-            "'xml_regex' is deprecated, please use 'regex' instead."
-        )
-        regex = kwargs.pop("xml_regex")
-
-    xml_bytes = read_archived_file(archive_path, regex=regex, file_list=file_list)
-
-    return etree.fromstring(xml_bytes)
-
-
-def read_archived_html(
-    archive_path: AnyPathStrType, regex: str, file_list: list = None
-) -> html.HtmlElement:
-    """
-    Read archived HTML from :code:`zip` or :code:`tar` archives.
-
-    You can use this `site <https://regexr.com/>`_ to build your regex.
-
-    Args:
-        archive_path (AnyPathStrType): Archive path
-        regex (str): HTML regex (used by re) as it can be found in the getmembers() list
-        file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
-
-    Returns:
-        html._Element: HTML file
-
-    Example:
-        >>> arch_path = 'D:/path/to/zip.zip'
-        >>> file_regex = '.*dir.*file_name'  # Use .* for any character
-        >>> read_archived_html(arch_path, file_regex)
-
-    """
-    html_bytes = read_archived_file(archive_path, regex, file_list=file_list)
-
-    return html.fromstring(html_bytes)
-
-
-def archive(
-    folder_path: AnyPathStrType,
-    archive_path: AnyPathStrType,
-    fmt: str = "zip",
-) -> AnyPathType:
-    """
-    Archives a folder recursively.
-
-    Args:
-        folder_path (AnyPathStrType): Folder to archive
-        archive_path (AnyPathStrType): Archive path, with or without extension
-        fmt (str): Format of the archive, used by :code:`shutil.make_archive`. Choose between [zip, tar, gztar, bztar, xztar]
-
-    Returns:
-        str: Archive filename
-
-    Example:
-        >>> folder_path = 'D:/path/to/folder_to_archive'
-        >>> archive_path = 'D:/path/to/output'
-        >>> archive = archive(folder_path, archive_path, fmt="gztar")
-        'D:/path/to/output/folder_to_archive.tar.gz'
-    """
-    archive_path = AnyPath(archive_path)
-    folder_path = AnyPath(folder_path)
-
-    tmp_dir = None
-    if path.is_cloud_path(folder_path):
-        tmp_dir = tempfile.TemporaryDirectory()
-        folder_path = folder_path.download_to(tmp_dir.name)
-
-    # Shutil make_archive needs a path without extension
-    archive_base = os.path.splitext(archive_path)[0]
-
-    # Archive the folder
-    archive_fn = shutil.make_archive(
-        archive_base,
-        format=fmt,
-        root_dir=folder_path.parent,
-        base_dir=folder_path.name,
-    )
-
-    if tmp_dir is not None:
-        tmp_dir.cleanup()
-
-    return AnyPath(archive_fn)
-
-
-def add_to_zip(
-    zip_path: AnyPathStrType,
-    dirs_to_add: Union[list, AnyPathStrType],
-) -> AnyPathType:
-    """
-    Add folders to an already existing zip file (recursively).
-
-    Args:
-        zip_path (AnyPathStrType): Already existing zip file
-        dirs_to_add (Union[list, AnyPathStrType]): Directories to add
-
-    Returns:
-        AnyPathType: Updated zip_path
-
-    Example:
-        >>> zip_path = 'D:/path/to/zip.zip'
-        >>> dirs_to_add = ['D:/path/to/dir1', 'D:/path/to/dir2']
-        >>> add_to_zip(zip_path, dirs_to_add)
-        zip.zip contains 2 more folders, dir1 and dir2
-    """
-    zip_path = AnyPath(zip_path)
-
-    # If the zip is on the cloud, cache it (zipfile doesn't like cloud paths)
-    if path.is_cloud_path(zip_path):
-        zip_path = AnyPath(zip_path.fspath)
-
-    # Check if existing zipfile
-    if not zip_path.is_file():
-        raise FileNotFoundError(f"Non existing {zip_path}")
-
-    # Convert to list if needed
-    if not isinstance(dirs_to_add, list):
-        dirs_to_add = [dirs_to_add]
-
-    # Add all folders to the existing zip
-    # Forced to use ZipFile because make_archive only works with one folder and not existing zipfile
-    with zipfile.ZipFile(zip_path, "a") as zip_file:
-        progress_bar = tqdm(dirs_to_add)
-        for dir_to_add_path in progress_bar:
-            # Just to be sure, use str instead of Paths
-            if isinstance(dir_to_add_path, Path):
-                dir_to_add = str(dir_to_add_path)
-            elif path.is_cloud_path(dir_to_add_path):
-                dir_to_add = dir_to_add_path.fspath
-            else:
-                dir_to_add = dir_to_add_path
-
-            progress_bar.set_description(
-                f"Adding {os.path.basename(dir_to_add)} to {os.path.basename(zip_path)}"
-            )
-            tmp = tempfile.TemporaryDirectory()
-            if os.path.isfile(dir_to_add):
-                dir_to_add = extract_file(dir_to_add, tmp.name)
-
-            for root, _, files in os.walk(dir_to_add):
-                base_path = os.path.join(dir_to_add, "..")
-
-                # Write dir (in namelist at least)
-                zip_file.write(root, os.path.relpath(root, base_path))
-
-                # Write files
-                for file in files:
-                    zip_file.write(
-                        os.path.join(root, file),
-                        os.path.relpath(
-                            os.path.join(root, file), os.path.join(dir_to_add, "..")
-                        ),
-                    )
-
-            # Clean tmp
-            tmp.cleanup()
-
-    return zip_path
-
-
-def get_filename(file_path: AnyPathStrType, other_exts: Union[list, str] = None) -> str:
-    """
-    .. deprecated:: 1.30.0
-        Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
-    Get file name (without extension) from file path, i.e.:
-
-    Args:
-        file_path (AnyPathStrType): Absolute or relative file path (the file doesn't need to exist)
-        other_exts (Union[list, str]): Other double extensions to discard
-
-    Returns:
-        str: File name (without extension)
-
-    Example:
-        >>> file_path = 'D:/path/to/filename.zip'
-        >>> get_file_name(file_path)
-        'filename'
-    """
-    logs.deprecation_warning(
-        "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
-    )
-    return path.get_filename(file_path, other_exts)
-
-
-def get_ext(file_path: AnyPathStrType) -> str:
-    """
-    .. deprecated:: 1.30.0
-        Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
-    Get file extension from file path.
-
-    .. WARNING::
-        Extension is given WITHOUT THE FIRST POINT
-
-    Args:
-        file_path (AnyPathStrType): Absolute or relative file path (the file doesn't need to exist)
-
-    Returns:
-        str: File name (without extension)
-
-    Example:
-        >>> file_path = 'D:/path/to/filename.zip'
-        >>> get_ext(file_path)
-        'zip'
-    """
-    logs.deprecation_warning(
-        "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
-    )
-    return path.get_ext(file_path)
-
-
 def remove(path: AnyPathStrType) -> None:
     """
     Deletes a file or a directory (recursively) using :code:`shutil.rmtree` or :code:`os.remove`.
@@ -754,7 +126,7 @@ def copy(src: AnyPathStrType, dst: AnyPathStrType) -> AnyPathType:
     src = AnyPath(src)

     if path.is_cloud_path(src):
-        out = src.download_to(dst)
+        out = s3.download(src, dst)
     else:
         out = None
         try:
@@ -772,54 +144,6 @@
     return out


-def find_files(
-    names: Union[list, str],
-    root_paths: Union[list, AnyPathStrType],
-    max_nof_files: int = -1,
-    get_as_str: bool = False,
-) -> Union[list, str]:
-    """
-    .. deprecated:: 1.30.0
-        Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
-    Returns matching files recursively from a list of root paths.
-
-    Regex are allowed (using glob)
-
-    Args:
-        names (Union[list, str]): File names.
-        root_paths (Union[list, str]): Root paths
-        max_nof_files (int): Maximum number of files (set to -1 for unlimited)
-        get_as_str (bool): if only one file is found, it can be retrieved as a string instead of a list
-
-    Returns:
-        list: File name
-
-    Examples:
-        >>> root_path = 'D:/root'
-        >>> dir1_path = 'D:/root/dir1'
-        >>> dir2_path = 'D:/root/dir2'
-        >>>
-        >>> os.listdir(dir1_path)
-        ["haha.txt", "huhu.txt", "hoho.txt"]
-        >>> os.listdir(dir2_path)
-        ["huhu.txt", "hehe.txt"]
-        >>>
-        >>> find_files("huhu.txt", root_path)
-        ['D:/root/dir1/huhu.txt', 'D:/root/dir2/huhu.txt']
-        >>>
-        >>> find_files("huhu.txt", root_path, max_nof_files=1)
-        ['D:/root/dir1/huhu.txt']
-
-        >>> find_files("huhu.txt", root_path, max_nof_files=1, get_as_str=True)
-        found = 'D:/root/dir1/huhu.txt'
-    """
-    logs.deprecation_warning(
-        "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
-    )
-    return path.find_files(names, root_paths, max_nof_files, get_as_str)
-
-
 # subclass JSONDecoder
 class CustomDecoder(JSONDecoder):
     """Decoder for JSON with methods for datetimes"""
@@ -927,15 +251,6 @@ def save_json(json_dict: dict, output_json: AnyPathStrType, **kwargs) -> None:
         >>> json_dict = {"A": np.int64(1), "B": datetime.today(), "C": SomeEnum.some_name}
         >>> save_json(output_json, json_dict)
     """
-    if isinstance(output_json, dict):
-        # Old order. Swap the variables.
-        logs.deprecation_warning(
-            "The order of the function has changed. Please set json_dict in first!"
-        )
-        tmp = output_json
-        output_json = json_dict
-        json_dict = tmp
-
     kwargs["indent"] = kwargs.get("indent", 3)
     kwargs["cls"] = kwargs.get("cls", CustomEncoder)
@@ -982,66 +297,6 @@ def load_obj(path: AnyPathStrType) -> Any:
         return dill.load(file)


-# too many arguments
-# pylint: disable=R0913
-def get_file_in_dir(
-    directory: AnyPathStrType,
-    pattern_str: str,
-    extension: str = None,
-    filename_only: bool = False,
-    get_list: bool = False,
-    exact_name: bool = False,
-) -> Union[AnyPathType, list]:
-    """
-    .. deprecated:: 1.30.0
-        Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
-    Get one or all matching files (pattern + extension) from inside a directory.
-
-    Note that the pattern is a regex with glob's convention, i.e. :code:`*pattern*`.
-
-    If :code:`exact_name` is :code:`False`, the searched pattern will be :code:`*{pattern}*.{extension}`,
-    else :code:`{pattern}.{extension}`.
-
-    Args:
-        directory (str): Directory where to find the files
-        pattern_str (str): Pattern wanted as a string, with glob's convention.
-        extension (str): Extension wanted, optional. With or without point. (:code:`yaml` or :code:`.yaml` accepted)
-        filename_only (bool): Get only the filename
-        get_list (bool): Get the whole list of matching files
-        exact_name (bool): Get the exact name (without adding :code:`*` before and after the given pattern)
-
-    Returns:
-        Union[AnyPathType, list]: File
-
-    Example:
-        >>> directory = 'D:/path/to/dir'
-        >>> os.listdir(directory)
-        ["haha.txt", "huhu1.txt", "huhu1.geojson", "hoho.txt"]
-        >>>
-        >>> get_file_in_dir(directory, "huhu")
-        'D:/path/to/dir/huhu1.geojson'
-        >>>
-        >>> get_file_in_dir(directory, "huhu", extension="txt")
-        'D:/path/to/dir/huhu1.txt'
-        >>>
-        >>> get_file_in_dir(directory, "huhu", get_list=True)
-        ['D:/path/to/dir/huhu1.txt', 'D:/path/to/dir/huhu1.geojson']
-        >>>
-        >>> get_file_in_dir(directory, "huhu", filename_only=True, get_list=True)
-        ['huhu1.txt', 'huhu1.geojson']
-        >>>
-        >>> get_file_in_dir(directory, "huhu", get_list=True, exact_name=True)
-        []
-    """
-    logs.deprecation_warning(
-        "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
-    )
-    return path.get_file_in_dir(
-        directory, pattern_str, extension, filename_only, get_list, exact_name
-    )
-
-
 # pylint: disable=E1121
 def hash_file_content(file_content: str, len_param: int = 5) -> str:
     """
@@ -1064,22 +319,3 @@ def hash_file_content(file_content: str, len_param: int = 5) -> str:
     hasher = hashlib.shake_256()
     hasher.update(str.encode(file_content))
     return hasher.hexdigest(len_param)
-
-
-def is_writable(dir_path: AnyPathStrType):
-    """
-    .. deprecated:: 1.30.0
-        Import it from :py:mod:`sertit.path` instead of :py:mod:`sertit.files`
-
-    Determine whether the directory is writeable or not.
-
-    Args:
-        dir_path (AnyPathStrType): Directory path
-
-    Returns:
-        bool: True if the directory is writable
-    """
-    logs.deprecation_warning(
-        "This function is deprecated. Import it from 'sertit.path' instead of 'sertit.files'"
-    )
-    return path.is_writable(dir_path)
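Reviewer note on the `files` cleanup above: every 1.x shim now has a single home, so downstream code must update its imports. A minimal migration sketch (paths are illustrative; `archives` is the new module this PR introduces):

```python
from sertit import archives, path

# files.get_filename() / files.get_ext() now live in sertit.path
stem = path.get_filename("D:/path/to/filename.zip")  # 'filename'
ext = path.get_ext("D:/path/to/filename.zip")        # 'zip'

# files.extract_file() / files.read_archived_xml() now live in sertit.archives
out_dir = archives.extract_file("D:/path/to/zip.zip", "D:/path/to/output")
root = archives.read_archived_xml("D:/path/to/zip.zip", r".*\.xml")
```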
diff --git a/sertit/path.py b/sertit/path.py
index aeb1f12..504c526 100644
--- a/sertit/path.py
+++ b/sertit/path.py
@@ -15,17 +15,15 @@
 # limitations under the License.
 """Tools for paths"""

+import contextlib
 import errno
 import logging
 import os
 import pprint
-import re
-import tarfile
 import tempfile
-import zipfile
 from typing import Any, Union

-from sertit import AnyPath, logs
+from sertit import AnyPath
 from sertit.logs import SU_NAME
 from sertit.types import AnyPathStrType, AnyPathType
@@ -150,181 +148,6 @@ def real_rel_path(raw_path: AnyPathStrType, start: AnyPathStrType) -> AnyPathTyp
     return rel_path


-def get_archived_file_list(archive_path: AnyPathStrType) -> list:
-    """
-    Get the list of all the files contained in an archive.
-
-    Args:
-        archive_path (AnyPathStrType): Archive path
-
-    Returns:
-        list: All files contained in the given archive
-
-    Example:
-        >>> arch_path = 'D:/path/to/zip.zip'
-        >>> get_archived_file_list(arch_path, file_regex)
-        ['file_1.txt', 'file_2.tif', 'file_3.xml', 'file_4.geojson']
-    """
-    archive_path = AnyPath(archive_path)
-    if archive_path.suffix == ".zip":
-        with zipfile.ZipFile(archive_path) as zip_ds:
-            file_list = [f.filename for f in zip_ds.filelist]
-    else:
-        try:
-            with tarfile.open(archive_path) as tar_ds:
-                tar_mb = tar_ds.getmembers()
-                file_list = [mb.name for mb in tar_mb]
-        except tarfile.ReadError as ex:
-            raise tarfile.ReadError(
-                f"Impossible to open archive: {archive_path}"
-            ) from ex
-
-    return file_list
-
-
-def get_archived_path(
-    archive_path: AnyPathStrType,
-    regex: str,
-    as_list: bool = False,
-    case_sensitive: bool = False,
-    file_list: list = None,
-    **kwargs,
-) -> Union[list, AnyPathType]:
-    """
-    Get archived file path from inside the archive.
-
-    .. WARNING::
-        If :code:`as_list` is :code:`False`, it will only return the first file matched !
-
-    You can use this `site <https://regexr.com/>`_ to build your regex.
-
-    Args:
-        archive_path (AnyPathStrType): Archive path
-        regex (str): File regex (used by re) as it can be found in the getmembers() list
-        as_list (bool): If true, returns a list (including all found files). If false, returns only the first match
-        case_sensitive (bool): If true, the regex is case-sensitive.
-        file_list (list): List of files to get archived from. Optional, if not given it will be re-computed.
-
-    Returns:
-        Union[list, str]: Path from inside the zipfile
-
-    Example:
-        >>> arch_path = 'D:/path/to/zip.zip'
-        >>> file_regex = '.*dir.*file_name'  # Use .* for any character
-        >>> path = get_archived_path(arch_path, file_regex)
-        'dir/filename.tif'
-    """
-    if regex is None:
-        logs.deprecation_warning(
-            "'file_regex' is deprecated, please use 'regex' instead."
-        )
-        regex = kwargs.pop("file_regex")
-
-    # Get file list
-    archive_path = AnyPath(archive_path)
-
-    # Offer the ability to give the file list directly, as this operation is expensive when done with large archives stored on the cloud
-    if file_list is None:
-        file_list = get_archived_file_list(archive_path)
-
-    # Search for file
-    re_rgx = re.compile(regex) if case_sensitive else re.compile(regex, re.IGNORECASE)
-    archived_band_paths = list(filter(re_rgx.match, file_list))
-    if not archived_band_paths:
-        raise FileNotFoundError(
-            f"Impossible to find file {regex} in {get_filename(archive_path)}"
-        )
-
-    # Convert to str if needed
-    if not as_list:
-        archived_band_paths = archived_band_paths[0]
-
-    return archived_band_paths
-
-
-def get_archived_rio_path(
-    archive_path: AnyPathStrType,
-    regex: str,
-    as_list: bool = False,
-    file_list: list = None,
-    **kwargs,
-) -> Union[list, AnyPathType]:
-    """
-    Get archived file path from inside the archive, to be read with rasterio:
-
-    - :code:`zip+file://{zip_path}!{file_name}`
-    - :code:`tar+file://{tar_path}!{file_name}`
-
-    See `here <https://rasterio.readthedocs.io/en/latest/topics/datasets.html>`_
-    for more information.
-
-    .. WARNING::
-        It wont be readable by pandas, geopandas or xmltree !
-
-    .. WARNING::
-        If :code:`as_list` is :code:`False`, it will only return the first file matched !
-
-    You can use this `site <https://regexr.com/>`_ to build your regex.
-
-    Args:
-        archive_path (AnyPathStrType): Archive path
-        regex (str): File regex (used by re) as it can be found in the getmembers() list
-        as_list (bool): If true, returns a list (including all found files). If false, returns only the first match
-        file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
-
-    Returns:
-        Union[list, str]: Band path that can be read by rasterio
-
-    Example:
-        >>> arch_path = 'D:/path/to/zip.zip'
-        >>> file_regex = '.*dir.*file_name'  # Use .* for any character
-        >>> path = get_archived_tif_path(arch_path, file_regex)
-        'zip+file://D:/path/to/output.zip!dir/filename.tif'
-        >>> rasterio.open(path)
-
-    """
-    if regex is None:
-        logs.deprecation_warning(
-            "'file_regex' is deprecated, please use 'regex' instead."
-        )
-        regex = kwargs.pop("file_regex")
-
-    archive_path = AnyPath(archive_path)
-    if archive_path.suffix in [".tar", ".zip"]:
-        prefix = archive_path.suffix[-3:]
-    elif archive_path.suffix == ".tar.gz":
-        raise TypeError(
-            ".tar.gz files are too slow to be read from inside the archive. Please extract them instead."
-        )
-    else:
-        raise TypeError("Only .zip and .tar files can be read from inside its archive.")
-
-    # Search for file
-    archived_band_paths = get_archived_path(
-        archive_path, regex=regex, as_list=True, file_list=file_list
-    )
-
-    # Convert to rio path
-    if is_cloud_path(archive_path):
-        archived_band_paths = [
-            f"{prefix}+file+{archive_path}!{path}" for path in archived_band_paths
-        ]
-    else:
-        # archived_band_paths = [
-        #     f"{prefix}+file://{archive_path}!{path}" for path in archived_band_paths
-        # ]
-        archived_band_paths = [
-            f"/vsi{prefix}/{archive_path}/{path}" for path in archived_band_paths
-        ]
-
-    # Convert to str if needed
-    if not as_list:
-        archived_band_paths = archived_band_paths[0]
-
-    return archived_band_paths
-
-
 def get_filename(file_path: AnyPathStrType, other_exts: Union[list, str] = None) -> str:
     """
     Get file name (without extension) from file path, ie:
@@ -589,25 +412,46 @@ def is_cloud_path(path: AnyPathStrType):
         bool: True if the file is store on the cloud.
     """
     try:
-        from cloudpathlib import CloudPath
+        return AnyPath(path).protocol in [
+            "s3",
+            "az",
+            "adl",
+            "abfs",
+            "abfss",
+            "gs",
+            "gcs",
+        ]
+    except AttributeError:
+        try:
+            from cloudpathlib import CloudPath

-        return isinstance(AnyPath(path), CloudPath)
-    except Exception:
-        return False
+            return isinstance(AnyPath(path), CloudPath)
+        except Exception:
+            return False


 def is_path(path: Any) -> bool:
     """
-    Determine whether the path corresponds to a file stored on the cloud or not.
+    Determine whether the path is really a path or not: either str, Path, UPath or CloudPath

     Args:
        path (AnyPathStrType): File path

     Returns:
-        bool: True if the file is store on the cloud.
+        bool: True if the file is a path
     """
     from pathlib import Path

-    from cloudpathlib import CloudPath
+    is_path = isinstance(path, (str, Path))
+
+    with contextlib.suppress(ImportError):
+        from upath import UPath
+
+        is_path = is_path or isinstance(path, UPath)
+
+    with contextlib.suppress(ImportError):
+        from cloudpathlib import CloudPath
+
+        is_path = is_path or isinstance(path, CloudPath)

-    return isinstance(path, (str, Path, CloudPath))
+    return is_path
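The rewritten `is_cloud_path` asks `universal_pathlib` for the path's `protocol` first, and only falls back to the `cloudpathlib` isinstance check when the object has no such attribute (plain `pathlib` paths, or cloudpathlib objects). A quick sanity sketch, assuming `universal_pathlib` is installed and the bucket name is made up:

```python
from sertit import AnyPath
from sertit.path import is_cloud_path, is_path

cloud = AnyPath("s3://hypothetical-bucket/raster.tif")  # UPath with protocol 's3'
local = AnyPath("/tmp/raster.tif")                      # plain PosixPath, no protocol

assert is_cloud_path(cloud)
assert not is_cloud_path(local)
# is_path() now accepts str, pathlib.Path, UPath and CloudPath alike
assert is_path("/tmp/raster.tif") and is_path(cloud)
```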
diff --git a/sertit/rasters.py b/sertit/rasters.py
index a835a66..52ca67c 100644
--- a/sertit/rasters.py
+++ b/sertit/rasters.py
@@ -33,7 +33,7 @@
 try:
     import rasterio
     import rioxarray
-    from rasterio import MemoryFile, features
+    from rasterio import features
     from rasterio.enums import Resampling
     from rioxarray.exceptions import MissingCRS
 except ModuleNotFoundError as ex:
@@ -126,25 +126,6 @@ def get_nodata_value_from_dtype(dtype) -> float:
     return rasters_rio.get_nodata_value_from_dtype(dtype)


-def get_nodata_value(dtype) -> float:
-    """
-    .. deprecated:: 1.41.0
-        Use :code:`get_nodata_value_from_dtype` instead.
-
-    Get default nodata value:
-
-    Args:
-        dtype: Dtype for the wanted nodata. Best if numpy's dtype.
-
-    Returns:
-        float: Nodata value
-    """
-    logs.deprecation_warning(
-        "This function is deprecated. Use 'get_nodata_value_from_dtype' instead."
-    )
-    return get_nodata_value_from_dtype(dtype)
-
-
 def any_raster_to_xr_ds(function: Callable) -> Callable:
     """
     Allows a function to ingest AnyRasterType and convert it into a xr.DataArray:
@@ -191,8 +172,8 @@ def wrapper(any_raster_type: AnyRasterType, *args, **kwargs) -> Any:
         if any_raster_type is None:
             raise ValueError("'any_raster_type' shouldn't be None!")

-        default_chunks = True if dask.get_client() is not None else None
-
+        default_chunks = "auto" if dask.get_client() is not None else None
+        masked = kwargs.get("masked", True)
         # By default, try with the input fct
         try:
             out = function(any_raster_type, *args, **kwargs)
@@ -216,67 +197,17 @@ def wrapper(any_raster_type: AnyRasterType, *args, **kwargs) -> Any:
             except Exception as ex:
                 raise TypeError("Function not available for xarray.Dataset") from ex

-        elif isinstance(any_raster_type, tuple):
-            arr, meta = any_raster_type
-            with (
-                MemoryFile() as memfile,
-                memfile.open(
-                    **meta, BIGTIFF=rasters_rio.bigtiff_value(any_raster_type)
-                ) as ds,
-            ):
-                ds.write(arr.data)
-
-                with rioxarray.open_rasterio(
-                    any_raster_type,
-                    masked=True,
-                    default_name=ds.name,
-                    chunks=kwargs.pop("chunks", default_chunks),
-                ) as xds:
-                    out = function(xds, *args, **kwargs)
         else:
-            # Get the path from the input
-            if path.is_path(any_raster_type):
-                name = str(any_raster_type)
-                any_raster_type = str(any_raster_type)
-            else:
-                # For rasterio datasets, '.name' gives the path
-                name = any_raster_type.name
-
-            # Convert path or rasterio.dataset to xr.dataset
-            with rioxarray.open_rasterio(
-                any_raster_type,
-                masked=True,
-                default_name=name,
-                chunks=kwargs.pop("chunks", default_chunks),
-            ) as xds:
-                out = function(xds, *args, **kwargs)
-
+            out = function(
+                read(any_raster_type, chunks=default_chunks, masked=masked),
+                *args,
+                **kwargs,
+            )
         return out

     return wrapper


-def path_xarr_dst(function: Callable) -> Callable:
-    """
-    .. deprecated:: 1.40.0
-        Use :py:func:`rasters.any_raster_to_xr_ds` instead.
-    """
-    logs.deprecation_warning(
-        "Deprecated 'path_xarr_dst' decorator. Please use 'any_raster_to_xr_ds' instead."
-    )
-    return any_raster_to_xr_ds(function)
-
-
-@any_raster_to_xr_ds
-def get_nodata_mask(xds: AnyXrDataStructure) -> np.ndarray:
-    """
-    .. deprecated:: 1.36.0
-        Use :py:func:`rasters.get_data_mask` instead.
-    """
-    logs.deprecation_warning("This function is deprecated. Use 'get_data_mask' instead")
-    return get_data_mask(xds)
-
-
 @any_raster_to_xr_ds
 def get_data_mask(xds: AnyXrDataStructure) -> np.ndarray:
     """
@@ -988,13 +919,19 @@ def read(
         rioxarray.set_options(export_grid_mapping=False),
         rioxarray.open_rasterio(
             ds,
-            lock=False,
             default_name=path.get_filename(ds.name),
             chunks=chunks,
+            masked=masked,
             **kwargs,
         ) as xda,
     ):
-        orig_dtype = xda.dtype
+        orig_dtype = xda.encoding.get(
+            "rasterio_dtype", xda.encoding.get("dtype", xda.dtype)
+        )
+
+        if isinstance(orig_dtype, str):
+            with contextlib.suppress(AttributeError):
+                orig_dtype = getattr(np, orig_dtype)

         # Windows
         if window is not None:
@@ -1104,12 +1041,6 @@ def write(
         >>> # Rewrite it
         >>> write(xds, raster_out)
     """
-    if output_path is None:
-        logs.deprecation_warning(
-            "'path' is deprecated in 'rasters.write'. Use 'output_path' instead."
-        )
-        output_path = kwargs.pop("path")
-
     # Prune empty kwargs to avoid throwing GDAL warnings/errors
     kwargs = {k: v for k, v in kwargs.items() if v is not None}
@@ -1403,14 +1334,15 @@ def sieve(

     assert connectivity in [4, 8]

-    # Use this trick to make the sieve work
-    mask = np.where(np.isnan(xds.data), 0, 1).astype(np.uint8)
-    data = xds.data.astype(np.uint8)
+    mask = xr.where(np.isnan(xds), 0, 1).astype(np.uint8).data
+    data = xds.astype(np.uint8).data

     # Sieve
     try:
         sieved_arr = xr.apply_ufunc(
-            features.sieve, data, sieve_thresh, connectivity, mask
+            features.sieve,
+            data,
+            kwargs={"size": sieve_thresh, "connectivity": connectivity, "mask": mask},
         )
     except ValueError:
         sieved_arr = features.sieve(
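With the tuple/`MemoryFile` branches gone, `any_raster_to_xr_ds` funnels every non-xarray input through `rasters.read`, so a wrapped function always sees an `xr.DataArray` whatever it was given. A hedged sketch of the contract (the TIF path is invented):

```python
import xarray as xr
from sertit import rasters

@rasters.any_raster_to_xr_ds
def band_max(xds: xr.DataArray) -> float:
    # Receives a DataArray whether called with a path, a rasterio dataset or a DataArray
    return float(xds.max())

# band_max("/path/to/raster.tif")                 # hypothetical file, read with masked=True
# band_max(rasters.read("/path/to/raster.tif"))   # already-read input short-circuits the decorator
```

Note also that the `xr.apply_ufunc` fix in `sieve` passes `size`, `connectivity` and `mask` through `kwargs` so that only the array itself is broadcast, which matches the CHANGES.md entry for this fix.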
diff --git a/sertit/rasters_rio.py b/sertit/rasters_rio.py
index bc0fcdf..9663590 100644
--- a/sertit/rasters_rio.py
+++ b/sertit/rasters_rio.py
@@ -43,7 +43,7 @@
         "Please install 'rasterio' to use the 'rasters_rio' package."
     ) from ex

-from sertit import AnyPath, geometry, logs, misc, path, strings, vectors, xml
+from sertit import AnyPath, geometry, misc, path, s3, strings, vectors, xml
 from sertit.logs import SU_NAME
 from sertit.types import AnyNumpyArray, AnyPathStrType, AnyPathType, AnyRasterType
@@ -112,25 +112,6 @@ def get_nodata_value_from_dtype(dtype) -> float:
     return nodata


-def get_nodata_value(dtype) -> float:
-    """
-    .. deprecated:: 1.41.0
-        Use :code:`get_nodata_value_from_dtype` instead.
-
-    Get default nodata value:
-
-    Args:
-        dtype: Dtype for the wanted nodata. Best if numpy's dtype.
-
-    Returns:
-        float: Nodata value
-    """
-    logs.deprecation_warning(
-        "This function is deprecated. Use 'get_nodata_value_from_dtype' instead."
-    )
-    return get_nodata_value_from_dtype(dtype)
-
-
 def bigtiff_value(arr: Any) -> str:
     """
     Returns :code:`YES` if array is larger than 4 GB, :code:`IF_NEEDED` otherwise.
@@ -250,17 +231,6 @@ def wrapper(any_raster_type: AnyRasterType, *args, **kwargs) -> Any:
     return wrapper


-def path_arr_dst(function: Callable) -> Callable:
-    """
-    .. deprecated:: 1.40.0
-        Use :py:func:`rasters.any_raster_to_rio_ds` instead.
-    """
-    logs.deprecation_warning(
-        "Deprecated 'path_arr_dst' decorator. Please use 'any_raster_to_rio_ds' instead."
-    )
-    return any_raster_to_rio_ds(function)
-
-
 @any_raster_to_rio_ds
 def get_new_shape(
     ds: AnyRasterType,
@@ -424,19 +394,6 @@ def update_meta(arr: AnyNumpyArray, meta: dict) -> dict:
     return out_meta


-def get_nodata_mask(
-    array: AnyNumpyArray,
-    has_nodata: bool,
-    default_nodata: int = 0,
-) -> np.ndarray:
-    """
-    .. deprecated:: 1.36.0
-        Use :py:func:`rasters_rio.get_data_mask` instead.
-    """
-    logs.deprecation_warning("This function is deprecated. Use 'get_data_mask' instead")
-    return get_data_mask(array, has_nodata, default_nodata)
-
-
 def get_data_mask(
     array: AnyNumpyArray,
     has_nodata: bool,
@@ -1090,12 +1047,6 @@ def write(
         >>> # Rewrite it on disk
         >>> write(raster, meta, raster_out)
     """
-    if output_path is None:
-        logs.deprecation_warning(
-            "'path' is deprecated in 'rasters_rio.write'. Use 'output_path' instead."
-        )
-        output_path = kwargs.pop("path")
-
     raster_out = raster.copy()

     # Prune empty kwargs to avoid throwing GDAL warnings/errors
@@ -1427,7 +1378,7 @@ def merge_vrt(
             crs_path = AnyPath(crs_path)
             # Download file if VRT is needed
             if path.is_cloud_path(crs_path):
-                crs_path = crs_path.download_to(merged_path.parent)
+                crs_path = s3.download(crs_path, merged_path.parent)

             with rasterio.open(str(crs_path)) as src:
                 if first_crs is None:
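`rasters_rio` loses the same deprecation shims as `rasters`; the surviving entry point is `get_nodata_value_from_dtype`, which callers should now use directly. The values below follow the usual sertit conventions and are worth double-checking against the module:

```python
import numpy as np
from sertit import rasters_rio

# One default nodata value per dtype (e.g. 255 for uint8 rasters)
print(rasters_rio.get_nodata_value_from_dtype(np.uint8))
print(rasters_rio.get_nodata_value_from_dtype(np.float32))
```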
diff --git a/sertit/s3.py b/sertit/s3.py
index 37588e9..bc14535 100644
--- a/sertit/s3.py
+++ b/sertit/s3.py
@@ -17,13 +17,16 @@
 S3 tools
 """

+import contextlib
 import logging
 import os
 from contextlib import contextmanager
 from functools import wraps
+from io import BytesIO

 from cloudpathlib import S3Client

+from sertit import AnyPath, path
 from sertit.logs import SU_NAME

 LOGGER = logging.getLogger(SU_NAME)
@@ -271,3 +274,55 @@ def define_s3_client(

     client = S3Client(**args_s3_client)
     client.set_as_default_client()
+
+
+def download(src, dst):
+    # By default, use the src path
+    downloaded_path = src
+
+    # Universal pathlib
+    if path.is_cloud_path(src):
+        import shutil
+
+        with contextlib.suppress(ImportError):
+            from upath import UPath
+
+            if isinstance(src, UPath):
+                dst = AnyPath(dst)
+                if dst.is_dir() and src.name != dst.name:
+                    downloaded_path = dst / src.name
+                else:
+                    downloaded_path = dst
+
+                if src.is_file():
+                    with src.open("rb") as f0, downloaded_path.open("wb") as f1:
+                        shutil.copyfileobj(f0, f1)
+                else:
+                    downloaded_path.parent.mkdir(parents=True, exist_ok=True)
+
+                    for f in src.glob("**"):
+                        dst_file = downloaded_path / f.name
+                        if f.is_file():
+                            dst_file.parent.mkdir(parents=True, exist_ok=True)
+                            with f.open("rb") as f0, dst_file.open("wb") as f1:
+                                shutil.copyfileobj(f0, f1)
+
+    # cloudpathlib
+    with contextlib.suppress(ImportError):
+        from cloudpathlib import CloudPath
+
+        if isinstance(src, CloudPath):
+            downloaded_path = src.fspath if dst is None else src.download_to(dst)
+
+    return downloaded_path
+
+
+def read(src):
+    src = AnyPath(src)
+    try:
+        b = src.read_bytes()
+    except Exception:
+        with src.open("rb") as f:
+            b = f.read()
+
+    return BytesIO(b)
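The two new helpers above are the seam that lets the rest of the library stay backend-agnostic: `s3.download` copies a cloud file or directory tree locally (UPath via a streamed copy, CloudPath via `download_to`) and returns the resulting path, while `s3.read` returns the whole object as an in-memory `BytesIO`. A usage sketch with a made-up object:

```python
from sertit import AnyPath, s3

src = AnyPath("s3://hypothetical-bucket/DATA/metadata.xml")

local = s3.download(src, "/tmp/ci_data")  # local paths would pass through unchanged
payload = s3.read(src).read()             # bytes, whatever backend opened the path
```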
diff --git a/sertit/types.py b/sertit/types.py
index 59c99fe..3cea7a9 100644
--- a/sertit/types.py
+++ b/sertit/types.py
@@ -5,14 +5,23 @@
 import geopandas as gpd
 import numpy as np
 import xarray as xr
-from cloudpathlib import CloudPath
 from rasterio.io import DatasetReader, DatasetWriter
 from shapely import MultiPolygon, Polygon

-AnyPathType = Union[CloudPath, Path]
-"""Any Path Type (derived from Pathlib and CloudpathLib)"""
+try:
+    from upath import UPath
+except ImportError:
+    UPath = None

-AnyPathStrType = Union[str, CloudPath, Path]
+try:
+    from cloudpathlib import CloudPath
+except ImportError:
+    CloudPath = None
+
+AnyPathType = Union[CloudPath, Path, UPath]
+"""Any Path Type (derived from Pathlib, Universal Pathlib and CloudpathLib)"""
+
+AnyPathStrType = Union[str, AnyPathType]
 """Same as :code:`AnyPathType` but appended with :code:`str`"""

 AnyXrDataStructure = Union[xr.DataArray, xr.Dataset]
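`types` now degrades gracefully: a missing optional backend leaves its name bound to `None` inside the Union instead of breaking the import. In practice these aliases are used for annotations only, e.g.:

```python
from sertit import AnyPath
from sertit.types import AnyPathStrType, AnyPathType

def normalize(any_path: AnyPathStrType) -> AnyPathType:
    # str, pathlib.Path, UPath or CloudPath in; AnyPathType out
    return AnyPath(any_path)

print(type(normalize("/tmp/file.txt")))  # PosixPath on Linux
```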
diff --git a/sertit/vectors.py b/sertit/vectors.py
index 15074e8..42aa728 100644
--- a/sertit/vectors.py
+++ b/sertit/vectors.py
@@ -23,9 +23,7 @@
 import os
 import re
 import shutil
-import tarfile
 import tempfile
-import zipfile
 from collections.abc import Generator
 from contextlib import contextmanager
 from typing import Any, Union
@@ -36,7 +34,7 @@
 from cloudpathlib.exceptions import AnyPathTypeError
 from shapely import Polygon, wkt

-from sertit import AnyPath, files, geometry, logs, misc, path, strings
+from sertit import AnyPath, archives, files, geometry, misc, path, s3, strings
 from sertit.logs import SU_NAME
 from sertit.types import AnyPathStrType, AnyPathType
@@ -80,9 +78,6 @@ def is_geopandas_1_0():

 def to_utm_crs(lon: float, lat: float) -> "CRS":  # noqa: F821
     """
-    .. deprecated:: 1.29.1
-        Use `estimate_utm_crs <https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoDataFrame.estimate_utm_crs.html>`_ instead, which directly returs a CRS instead of a string.
-
     Find the EPSG code of the UTM CRS from a lon/lat in WGS84.

     Args:
@@ -118,43 +113,6 @@
     return gpd.GeoDataFrame(geometry=point, crs=EPSG_4326).estimate_utm_crs()


-def corresponding_utm_projection(lon: float, lat: float) -> str:
-    """
-    .. deprecated:: 1.29.1
-        Use `estimate_utm_crs <https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoDataFrame.estimate_utm_crs.html>`_ instead, which directly returs a CRS instead of a string.
-
-    Find the EPSG code of the UTM CRS from a lon/lat in WGS84.
-
-    Args:
-        lon (float): Longitude (WGS84, epsg:4326)
-        lat (float): Latitude (WGS84, epsg:4326)
-
-    Returns:
-        CRS: UTM CRS
-
-    Example:
-        >>> to_utm_crs(lon=7.8, lat=48.6)  # Strasbourg
-        Name: WGS 84 / UTM zone 32N
-        Axis Info [cartesian]:
-        - E[east]: Easting (metre)
-        - N[north]: Northing (metre)
-        Area of Use:
-        - bounds: (6.0, 0.0, 12.0, 84.0)
-        Coordinate Operation:
-        - name: UTM zone 32N
-        - method: Transverse Mercator
-        Datum: World Geodetic System 1984 ensemble
-        - Ellipsoid: WGS 84
-        - Prime Meridian: Greenwich
-
-    """
-    logs.deprecation_warning(
-        "Deprecated, use 'to_utm_crs' instead, which directly returs a CRS instead of a string."
-    )
-    return to_utm_crs(lon, lat).to_string()
-
-
 def get_geodf(geom: Union[Polygon, list, gpd.GeoSeries], crs: str) -> gpd.GeoDataFrame:
     """
     Get a GeoDataFrame from a geometry and a crs
@@ -256,8 +214,11 @@ def get_aoi_wkt(aoi_path: AnyPathStrType, as_str: bool = True) -> Union[str, Pol

     if aoi_path.suffix == ".wkt":
         try:
-            with open(aoi_path) as aoi_f:
-                aoi = wkt.load(aoi_f)
+            if path.is_cloud_path(aoi_path):
+                aoi = wkt.load(s3.read(aoi_path))
+            else:
+                with open(aoi_path) as aoi_f:
+                    aoi = wkt.load(aoi_f)
         except Exception as ex:
             raise ValueError("AOI WKT cannot be read") from ex
     else:
@@ -472,13 +433,17 @@ def read(
     if "!" in str(vector_path):
         split_vect = str(vector_path).split("!")
         archive_regex = ".*{}".format(split_vect[1].replace(".", r"\."))
-        vector_path = AnyPath(split_vect[0])
+        try:
+            vector_path = AnyPath(split_vect[0], **vector_path.storage_options)
+        except AttributeError:
+            # Cloudpathlib
+            vector_path = AnyPath(split_vect[0])

     # Manage archive case
     if vector_path.suffix in [".tar", ".zip"]:
         prefix = vector_path.suffix[-3:]
         file_list = kwargs.pop(
-            "file_list", path.get_archived_file_list(vector_path)
+            "file_list", archives.get_archived_file_list(vector_path)
         )

         try:
@@ -715,16 +680,16 @@ def ogr2geojson(

     # archived vector_path are extracted in a tmp folder so no need to be downloaded
     if vector_path.suffix == ".zip":
-        with zipfile.ZipFile(vector_path, "r") as zip_ds:
+        with archives.open_zipfile(vector_path, "r") as zip_ds:
             vect_path = zip_ds.extract(arch_vect_path, out_dir)
     elif vector_path.suffix == ".tar":
-        with tarfile.open(vector_path, "r") as tar_ds:
+        with archives.open_tarfile(vector_path, "r") as tar_ds:
             tar_ds.extract(arch_vect_path, out_dir)
         vect_path = os.path.join(out_dir, arch_vect_path)
     else:
         # vector_path should be downloaded to work with 'ogr2ogr'
         if path.is_cloud_path(vector_path):
-            vector_path = AnyPath(vector_path).fspath
+            vector_path = s3.download(vector_path, out_dir)
         vect_path = vector_path

     vect_path_gj = os.path.join(
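`vectors.read` keeps its `archive!inner_path` convention, but the rebuilt archive path now carries UPath `storage_options` along, and archive listing goes through the new `archives` module. A hedged example, file names invented:

```python
from sertit import archives, vectors

aoi = vectors.read("/path/to/shapes.zip!shapes/aoi.shp")

# For big cloud-hosted archives, pre-compute the file list once and reuse it
file_list = archives.get_archived_file_list("/path/to/shapes.zip")
aoi = vectors.read("/path/to/shapes.zip!shapes/aoi.shp", file_list=file_list)
```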
diff --git a/sertit/xml.py b/sertit/xml.py
index 9ddc44a..befa80f 100644
--- a/sertit/xml.py
+++ b/sertit/xml.py
@@ -30,7 +30,7 @@
 )
 from lxml.html.builder import E

-from sertit import AnyPath, files, path
+from sertit import AnyPath, archives, path, s3
 from sertit.logs import SU_NAME
 from sertit.misc import ListEnum
 from sertit.types import AnyPathStrType
@@ -61,7 +61,7 @@ def read(xml_path: AnyPathStrType) -> _Element:
             # Slower but works with:
             # {ValueError}Unicode strings with encoding declaration are not supported.
             # Please use bytes input or XML fragments without declaration.
-            root = fromstring(xml_path.read_bytes())
+            root = fromstring(s3.read(xml_path).read())
         else:
             # pylint: disable=I1101:
             # Module 'lxml.etree' has no 'parse' member, but source is unavailable.
@@ -75,7 +75,10 @@ def read(xml_path: AnyPathStrType) -> _Element:


 def read_archive(
-    path: AnyPathStrType, regex: str = None, file_list: list = None
+    archive_path: AnyPathStrType,
+    regex: str = None,
+    file_list: list = None,
+    **kwargs,
 ) -> _Element:
     """
     Read an XML file from inside an archive (zip or tar)
@@ -87,25 +90,34 @@ def read_archive(
     - path to the archive plus a regex looking inside the archive. Duplicate behaviour to :py:func:`files.read_archived_xml`

     Args:
-        path (AnyPathStrType): Path to the XML file, stored inside an archive or path to the archive itself
+        archive_path (AnyPathStrType): Path to the XML file, stored inside an archive or path to the archive itself
         regex (str): Optional. If specified, the path should be the archive path and the regex should be the key to find the XML file inside the archive.
        file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.

     Returns:
         _Element: XML Root
     """
-
     try:
         if not regex:
-            path, basename = str(path).split("!")
+            archive_base_path, basename = str(archive_path).split("!")
             regex = basename
-            if path.startswith("zip://") or path.startswith("tar://"):
-                path = path[5:]
+            if archive_base_path.startswith("zip://") or archive_base_path.startswith(
+                "tar://"
+            ):
+                archive_base_path = archive_base_path[5:]
+
+            # For UPath
+            with contextlib.suppress(AttributeError):
+                archive_base_path = AnyPath(
+                    archive_base_path, **archive_path.storage_options
+                )
+        else:
+            archive_base_path = archive_path

-        return files.read_archived_xml(path, regex, file_list=file_list)
+        return archives.read_archived_xml(archive_base_path, regex, file_list=file_list)
     except XMLSyntaxError as exc:
-        raise ValueError(f"Invalid metadata XML for {path}!") from exc
+        raise ValueError(f"Invalid metadata XML for {archive_path}!") from exc


 def write(xml: _Element, path: str) -> None:
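After the `path` → `archive_path` rename (which also stops shadowing the `sertit.path` module inside the function), `read_archive` still accepts both call styles; a sketch on a hypothetical product:

```python
from sertit import xml

# Explicit regex: the archive is listed and the first matching XML is parsed
root = xml.read_archive("/path/to/product.zip", regex=r".*metadata\.xml")

# Bang syntax: archive path and inner path in one string
root = xml.read_archive("/path/to/product.zip!product/metadata.xml")
```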