diff --git a/.github/workflows/workflows.yaml b/.github/workflows/workflows.yaml
index fb1fdea0..50e9f216 100644
--- a/.github/workflows/workflows.yaml
+++ b/.github/workflows/workflows.yaml
@@ -7,11 +7,12 @@ jobs:
     strategy:
       matrix:
         python-version: ["3.9"]
-        os: ["ubuntu-latest", "macos-latest"]
+        os: ["ubuntu-latest"]
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v2
+      - uses: iterative/setup-cml@v1
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v2
         with:
@@ -20,8 +21,15 @@ jobs:
         run: |
           brew install c-blosc hdf5
         if: matrix.os == 'macos-latest'
+      - name: Do some Ubuntu specific installs for Python 3.9
+        if: runner.os == 'Linux'
+        run: |
+          sudo apt install ${{inputs.sudo_apt_install}}
       - name: Install dependencies
         run: |
+          # $CONDA is an environment variable pointing to the root of the miniconda directory
+          $CONDA/bin/conda install eccodes numpy matplotlib rasterio satpy[all] cartopy -c conda-forge
+          $CONDA/bin/pip install -e .
           python -m pip install --upgrade pip
           pip install pytest-xdist
           if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
diff --git a/README.md b/README.md
index 4a51c318..35a17746 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,14 @@ pre-commit install
 
 ## Operation
 
+### Getting your own API key
+
+In order to contribute to development or just test-run some scripts, you will need your own EUMETSAT API key. Please follow these steps:
+
+1. Go to https://eoportal.eumetsat.int and register an account.
+2. Log in and go to https://data.eumetsat.int/ to check the available data services. From there, go to your profile and choose the option "API key", or go to https://api.eumetsat.int/api-key/ directly.
+3. Make sure that you add the key and secret to your user's environment variables.
+
 ### Downloading EUMETSAT Data
 
 The following command will download the last 2 hours of RSS imagery into NetCDF files at the specified location
diff --git a/satip/app.py b/satip/app.py
index 42445523..562e3a96 100644
--- a/satip/app.py
+++ b/satip/app.py
@@ -68,6 +68,7 @@ def run(api_key, api_secret, save_dir, history, db_url: Optional[str] = None):
         api_secret: Secret for EUMETSAT
         save_dir: Save directory
         history: History time
+        db_url: URL of database
     """
 
     logger.info(f'Running application and saving to "{save_dir}"')
diff --git a/satip/download.py b/satip/download.py
index 5b59050e..8b86ace1 100644
--- a/satip/download.py
+++ b/satip/download.py
@@ -65,6 +65,7 @@ def download_eumetsat_data(
     auth_filename: Optional[str] = None,
     number_of_processes: int = 0,
     product: Union[str, List[str]] = ["rss", "cloud"],
+    enforce_full_days: bool = True,
 ):
     """Downloads EUMETSAT RSS and Cloud Masks
 
@@ -83,6 +84,10 @@ def download_eumetsat_data(
         auth_filename: Path to a file containing the user_secret and user_key
         number_of_processes: Number of processes to use
         product: Which product(s) to download
+        enforce_full_days: If True, data is downloaded in full-day batches, i.e. no matter
+            which time you set in end_date, you will always get the complete final day.
+            If False, the download adheres strictly to the start_date/end_date you set,
+            so you can also get incomplete days.
""" # Get authentication @@ -91,6 +96,12 @@ def download_eumetsat_data( raise RuntimeError("Please do not set BOTH auth_filename AND user_key or user_secret!") user_key, user_secret = _load_key_secret(auth_filename) + # If neither the auth-file-name nor the user-key / secret have been given, + # try to infer from the environment variable: + if auth_filename is None and (user_key is None or user_secret is None): + user_key = os.environ.get("EUMETSAT_USER_KEY") + user_secret = os.environ.get("EUMETSAT_USER_SECRET") + if backfill: # Set to year before data started to ensure gets everything # No downside to requesting an earlier time @@ -105,15 +116,19 @@ def download_eumetsat_data( products_to_use.append(RSS_ID) if "cloud" in product: products_to_use.append(CLOUD_ID) + for product_id in products_to_use: # Do this to clear out any partially downloaded days _sanity_check_files_and_move_to_directory( directory=download_directory, product_id=product_id ) - times_to_use = _determine_datetimes_to_download_files( - download_directory, start_date, end_date, product_id=product_id - ) + if enforce_full_days: + times_to_use = _determine_datetimes_to_download_files( + download_directory, start_date, end_date, product_id=product_id + ) + else: + times_to_use = [(pd.to_datetime(start_date), pd.to_datetime(end_date))] _LOG.info(times_to_use) if number_of_processes > 0: diff --git a/satip/eumetsat.py b/satip/eumetsat.py index 7397f061..9b600ac5 100644 --- a/satip/eumetsat.py +++ b/satip/eumetsat.py @@ -259,7 +259,10 @@ def __init__( self.data_dir = data_dir if not os.path.exists(self.data_dir): - os.makedirs(self.data_dir) + try: + os.makedirs(self.data_dir) + except PermissionError: + raise PermissionError(f"No permission to create {self.data_dir}.") # Adding satip helper functions self.identify_available_datasets = identify_available_datasets diff --git a/satip/jpeg_xl_float_with_nans.py b/satip/jpeg_xl_float_with_nans.py index fb2847a5..ded95e3d 100644 --- a/satip/jpeg_xl_float_with_nans.py +++ b/satip/jpeg_xl_float_with_nans.py @@ -7,6 +7,9 @@ from numcodecs.registry import register_codec # See the docs in `encode_nans` for an explanation of what these consts do. +# Also, please be aware that for sensical results, the values should follow the ordering: +# LOWER_BOUND_FOR_REAL_PIXELS > NAN_THRESHOLD > NAN_VALUE. +# Otherwise encoding and decoding back will lead to valid pixels being marked NaN. LOWER_BOUND_FOR_REAL_PIXELS = 0.075 NAN_THRESHOLD = 0.05 NAN_VALUE = 0.025 @@ -87,7 +90,7 @@ def encode(self, buf: np.ndarray) -> None: If all the information is squished into, say, the range [0, 0.1] then JPEG-XL will interpret the image as very dark, and will agressively compress the data because JPEG-XL assumes that human viewers do not - notice if detail is lost in the shaddows. + notice if detail is lost in the shadows. 
""" assert buf.dtype in ( np.float16, diff --git a/satip/scale_to_zero_to_one.py b/satip/scale_to_zero_to_one.py index ba1727b7..9347311c 100644 --- a/satip/scale_to_zero_to_one.py +++ b/satip/scale_to_zero_to_one.py @@ -79,7 +79,7 @@ def __init__( self.maxs = maxs self.variable_order = variable_order - def fit(self, dataset: xr.Dataset, dims: Iterable = ("time", "y", "x")) -> None: + def fit(self, dataset: xr.Dataset, dims: Iterable = ("time", "y", "x")) -> object: """ Calculate new min and max values for the compression @@ -97,6 +97,8 @@ def fit(self, dataset: xr.Dataset, dims: Iterable = ("time", "y", "x")) -> None: print(f"The maxs are: {self.maxs}") print(f"The variable order is: {self.variable_order}") + return self + def rescale(self, dataarray: xr.DataArray) -> Union[xr.DataArray, None]: """ Rescale Xarray DataArray so all values lie in the range [0, 1]. @@ -105,6 +107,7 @@ def rescale(self, dataarray: xr.DataArray) -> Union[xr.DataArray, None]: Args: dataarray: DataArray to rescale. + Dims MUST be named ('time', 'x_geostationary', 'y_geostationary', 'variable')! Returns: The DataArray rescaled to [0, 1]. NaNs in the original `dataarray` will still diff --git a/satip/utils.py b/satip/utils.py index 12023be3..4d655996 100644 --- a/satip/utils.py +++ b/satip/utils.py @@ -193,6 +193,8 @@ def load_native_to_dataset( return dataarray, hrv_dataarray +# TODO: temp_directory is unused and has no effect. But for the sake of interface consistency +# with load_native_to_dataset, can also stay. def load_cloudmask_to_dataset( filename: Path, temp_directory: Path, area: str, calculate_osgb: bool = True ) -> xr.DataArray: @@ -647,7 +649,7 @@ def move_older_files_to_different_location(save_dir: str, history_time: pd.Times Args: save_dir: Directory where data is being saved - history: History time to keep files + history_time: History time to keep files """ filesystem = fsspec.open(save_dir).fs diff --git a/scripts/generate_test_plots.py b/scripts/generate_test_plots.py index 91033a7f..e555f7f6 100644 --- a/scripts/generate_test_plots.py +++ b/scripts/generate_test_plots.py @@ -19,149 +19,157 @@ from satip import eumetsat from satip.utils import load_cloudmask_to_dataset, load_native_to_dataset, save_dataset_to_zarr -RSS_ID = "EO:EUM:DAT:MSG:MSG15-RSS" -CLOUD_ID = "EO:EUM:DAT:MSG:RSS-CLM" - -user_key = os.environ.get("EUMETSAT_USER_KEY") -user_secret = os.environ.get("EUMETSAT_USER_SECRET") - -download_manager = eumetsat.DownloadManager( - user_key=user_key, - user_secret=user_secret, - data_dir=os.getcwd(), - logger_name="Plotting_test", -) - - -def _plot_tailored(input_name: str) -> None: - """Plots the results of a download of tailored datasets.""" - geotiff_files = list(glob.glob(os.path.join(os.getcwd(), "*.tif"))) - image = rasterio.open(geotiff_files[0]) - plt.imshow(image.read(1)) - plt.title(f"Tailored {input_name}") - plt.savefig(os.path.join(os.getcwd(), f"tailored_{input_name}.png"), dpi=300) - plt.cla() - plt.clf() - os.remove(geotiff_files[0]) - - -# Then tailored ones: Download for the tailored date-range and plot. 
-download_manager.download_tailored_date_range(
-    start_date="2020-06-01 11:59:00",
-    end_date="2020-06-01 12:02:00",
-    file_format="geotiff",
-    product_id=CLOUD_ID,
-)
-# _plot_tailored("cloud_mask")
-download_manager.download_tailored_date_range(
-    start_date="2020-06-01 11:59:00",
-    end_date="2020-06-01 12:00:00",
-    file_format="geotiff",
-    product_id=RSS_ID,
-)
-# _plot_tailored("rss")
-
-# Get 1 RSS native file and 1 cloud mask file
-download_manager.download_date_range(
-    start_date="2020-06-01 11:59:00", end_date="2020-06-01 12:00:00", product_id=RSS_ID
-)
-# 1 Cloud mask
-download_manager.download_date_range(
-    start_date="2020-06-01 11:59:00", end_date="2020-06-01 12:02:00", product_id=CLOUD_ID
-)
-
-# Convert to Xarray DataArray
-rss_filename = list(glob.glob(os.path.join(os.getcwd(), "*.nat")))
-cloud_mask_filename = list(glob.glob(os.path.join(os.getcwd(), "*.grb")))
-
-
-def _plot_dataset(dataset: xr.DataArray, name: str, area: str) -> None:
-    """Plots a xarray-dataset and saves the output to a defined filename.
-
-    Args:
-        dataset: xarray data set to plot.
-        name: File base name to write to.
-        area: Area suffix for the filename; get appended to `name`.
+
+def generate_test_plots():
+    """Function to generate test plots.
+
+    Test plot generation code is encapsulated in a function to allow for easier testing.
     """
-    if area == "UK":
-        ax = plt.axes(projection=ccrs.OSGB())
-        dataset.plot.pcolormesh(
-            ax=ax,
-            transform=ccrs.OSGB(),
-            x="x_osgb",
-            y="y_osgb",
-            add_colorbar=True,
-        )
-    else:
-        ax = plt.axes(projection=ccrs.Geostationary(central_longitude=9.5))
-        dataset.plot.pcolormesh(
-            ax=ax,
-            transform=ccrs.Geostationary(central_longitude=9.5),
-            x="x_geostationary",
-            y="y_geostationary",
-            add_colorbar=True,
-        )
-    ax.coastlines()
-    plt.savefig(os.path.join(os.getcwd(), f"{name}_{area}.png"))
-    plt.cla()
-    plt.clf()
+    RSS_ID = "EO:EUM:DAT:MSG:MSG15-RSS"
+    CLOUD_ID = "EO:EUM:DAT:MSG:RSS-CLM"
 
+    user_key = os.environ.get("EUMETSAT_USER_KEY")
+    user_secret = os.environ.get("EUMETSAT_USER_SECRET")
 
-for area in ["UK", "RSS"]:
-    # First do it with the cloud mask
-    cloudmask_dataset = load_cloudmask_to_dataset(
-        Path(cloud_mask_filename[0]), temp_directory=Path(os.getcwd()), area=area
-    )
-    rss_dataset, hrv_dataset = load_native_to_dataset(
-        Path(rss_filename[0]), temp_directory=Path(os.getcwd()), area=area
+    download_manager = eumetsat.DownloadManager(
+        user_key=user_key,
+        user_secret=user_secret,
+        data_dir=os.getcwd(),
+        logger_name="Plotting_test",
     )
 
-    # Save to Zarrs, to then load them back
-    save_dataset_to_zarr(
-        cloudmask_dataset,
-        zarr_path=os.path.join(os.getcwd(), "cloud.zarr"),
-        compressor_name="bz2",
-        zarr_mode="w",
+    def _plot_tailored(input_name: str) -> None:
+        """Plots the results of a download of tailored datasets."""
+        geotiff_files = list(glob.glob(os.path.join(os.getcwd(), "*.tif")))
+        image = rasterio.open(geotiff_files[0])
+        plt.imshow(image.read(1))
+        plt.title(f"Tailored {input_name}")
+        plt.savefig(os.path.join(os.getcwd(), f"tailored_{input_name}.png"), dpi=300)
+        plt.cla()
+        plt.clf()
+        os.remove(geotiff_files[0])
+
+    # Then tailored ones: Download for the tailored date-range and plot.
+    download_manager.download_tailored_date_range(
+        start_date="2020-06-01 11:59:00",
+        end_date="2020-06-01 12:02:00",
+        file_format="geotiff",
+        product_id=CLOUD_ID,
     )
-    del cloudmask_dataset
+    # _plot_tailored("cloud_mask")
 
-    save_dataset_to_zarr(
-        rss_dataset,
-        zarr_path=os.path.join(os.getcwd(), "rss.zarr"),
-        compressor_name="jpeg-xl",
-        zarr_mode="w",
+    download_manager.download_tailored_date_range(
+        start_date="2020-06-01 11:59:00",
+        end_date="2020-06-01 12:00:00",
+        file_format="geotiff",
+        product_id=RSS_ID,
     )
-    del rss_dataset
+    # _plot_tailored("rss")
 
-    save_dataset_to_zarr(
-        hrv_dataset,
-        zarr_path=os.path.join(os.getcwd(), "hrv.zarr"),
-        compressor_name="jpeg-xl",
-        zarr_mode="w",
+    # Get 1 RSS native file and 1 cloud mask file
+    download_manager.download_date_range(
+        start_date="2020-06-01 11:59:00", end_date="2020-06-01 12:00:00", product_id=RSS_ID
     )
-    del hrv_dataset
+    # 1 Cloud mask
+    download_manager.download_date_range(
+        start_date="2020-06-01 11:59:00", end_date="2020-06-01 12:02:00", product_id=CLOUD_ID
     )
-
-    # Load them from Zarr to ensure its the same as the output from satip
-    cloudmask_dataset = (
-        xr.open_zarr(os.path.join(os.getcwd(), "cloud.zarr"), consolidated=True)["data"]
-        .isel(time=0)
-        .sel(variable="cloud_mask")
-    )
-    rss_dataset = (
-        xr.open_zarr(os.path.join(os.getcwd(), "rss.zarr"), consolidated=True)["data"]
-        .isel(time=0)
-        .sel(variable="IR_016")
-    )
-    hrv_dataset = (
-        xr.open_zarr(os.path.join(os.getcwd(), "hrv.zarr"), consolidated=True)["data"]
-        .isel(time=0)
-        .sel(variable="HRV")
+    # Store the filenames just downloaded to later convert them to Xarray DataArrays:
+    rss_filenames = list(glob.glob(os.path.join(os.getcwd(), "*.nat")))
+    cloud_mask_filenames = list(glob.glob(os.path.join(os.getcwd(), "*.grb")))
+
+    def _plot_dataset(dataset: xr.DataArray, name: str, area: str) -> None:
+        """Plots an xarray-dataset and saves the output to a defined filename.
+
+        Args:
+            dataset: xarray data set to plot.
+            name: File base name to write to.
+            area: Area suffix for the filename; gets appended to `name`.
+ """ + if area == "UK": + ax = plt.axes(projection=ccrs.OSGB()) + dataset.plot.pcolormesh( + ax=ax, + transform=ccrs.OSGB(), + x="x_osgb", + y="y_osgb", + add_colorbar=True, + ) + else: + ax = plt.axes(projection=ccrs.Geostationary(central_longitude=9.5)) + dataset.plot.pcolormesh( + ax=ax, + transform=ccrs.Geostationary(central_longitude=9.5), + x="x_geostationary", + y="y_geostationary", + add_colorbar=True, + ) + ax.coastlines() + plt.savefig(os.path.join(os.getcwd(), f"{name}_{area}.png")) + plt.cla() + plt.clf() + + for area in ["UK", "RSS"]: + # First do it with the cloud mask + cloudmask_dataset = load_cloudmask_to_dataset( + Path(cloud_mask_filenames[0]), temp_directory=Path(os.getcwd()), area=area + ) + rss_dataset, hrv_dataset = load_native_to_dataset( + Path(rss_filenames[0]), temp_directory=Path(os.getcwd()), area=area + ) + + # Save to Zarrs, to then load them back + save_dataset_to_zarr( + cloudmask_dataset, + zarr_path=os.path.join(os.getcwd(), "cloud.zarr"), + compressor_name="bz2", + zarr_mode="w", + ) + del cloudmask_dataset + + save_dataset_to_zarr( + rss_dataset, + zarr_path=os.path.join(os.getcwd(), "rss.zarr"), + compressor_name="jpeg-xl", + zarr_mode="w", + ) + del rss_dataset + + save_dataset_to_zarr( + hrv_dataset, + zarr_path=os.path.join(os.getcwd(), "hrv.zarr"), + compressor_name="jpeg-xl", + zarr_mode="w", + ) + del hrv_dataset + + # Load them from Zarr to ensure its the same as the output from satip + cloudmask_dataset = ( + xr.open_zarr(os.path.join(os.getcwd(), "cloud.zarr"), consolidated=True)["data"] + .isel(time=0) + .sel(variable="cloud_mask") + ) + rss_dataset = ( + xr.open_zarr(os.path.join(os.getcwd(), "rss.zarr"), consolidated=True)["data"] + .isel(time=0) + .sel(variable="IR_016") + ) + hrv_dataset = ( + xr.open_zarr(os.path.join(os.getcwd(), "hrv.zarr"), consolidated=True)["data"] + .isel(time=0) + .sel(variable="HRV") + ) + + print(cloudmask_dataset) + print(rss_dataset) + print(hrv_dataset) + + _plot_dataset(hrv_dataset, "hrv", area) + _plot_dataset(rss_dataset, "rss", area) + _plot_dataset(cloudmask_dataset, "cloud_mask", area) + - _plot_dataset(hrv_dataset, "hrv", area) - _plot_dataset(rss_dataset, "rss", area) - _plot_dataset(cloudmask_dataset, "cloud_mask", area) +if __name__ == "__main__": + generate_test_plots() diff --git a/scripts/get_raw_eumetsat_data.py b/scripts/get_raw_eumetsat_data.py index 17d36c77..47709013 100644 --- a/scripts/get_raw_eumetsat_data.py +++ b/scripts/get_raw_eumetsat_data.py @@ -9,6 +9,7 @@ python3 get_raw_eumetsat_data.py """ +import os from datetime import datetime import click @@ -33,9 +34,8 @@ def _validate_date(ctx, param, value): @click.option( "--download_directory", "--dir", - default="/storage/", - help="""Where to download the data to. - Also where the script searches for previously downloaded data.""", + default=str(os.getcwd() + "/storage/"), + prompt="""Where to download the data to and/or search for previously downloaded data.\n""", ) @click.option( "--start_date", diff --git a/tests/disabled_utils.py b/tests/disabled_utils.py new file mode 100644 index 00000000..231dc523 --- /dev/null +++ b/tests/disabled_utils.py @@ -0,0 +1,93 @@ +"""Tests for the Satip-utils. + +Please note that at the moment, these tests are marked disabled for two reasons: +a) There is a weird installation-bug which makes it such that on identical github-workflows, + script.generate_test_plots runs while this code here seems to draw from the wrong library. 
+b) We want to have script.generate_test_plots as our try-it-out script for new users, so
+   we have to test that anyway. Since that means utils gets tested as well, individual
+   util-tests seem redundant.
+"""
+import glob
+import os
+import unittest
+from pathlib import Path
+
+import xarray
+
+from satip.utils import load_cloudmask_to_dataset, load_native_to_dataset, save_dataset_to_zarr
+
+USER_KEY = os.environ.get("EUMETSAT_USER_KEY")
+USER_SECRET = os.environ.get("EUMETSAT_USER_SECRET")
+RSS_ID = "EO:EUM:DAT:MSG:MSG15-RSS"
+CLOUD_ID = "EO:EUM:DAT:MSG:RSS-CLM"
+
+
+class TestSatipUtils(unittest.TestCase):
+    """Tests for satip.utils."""
+
+    def setUp(self) -> None:  # noqa D102
+        # If there is no downloaded RSS data set or cloudmask yet, then download and store them:
+        if len(list(glob.glob(os.path.join(os.getcwd(), "*.nat")))) == 0:
+            from satip import eumetsat
+
+            download_manager = eumetsat.DownloadManager(
+                user_key=USER_KEY,
+                user_secret=USER_SECRET,
+                data_dir=os.getcwd(),
+                logger_name="Plotting_test",
+            )
+
+            # Download one set of RSS data and one cloudmask and store them on disk:
+            download_manager.download_date_range(
+                start_date="2020-06-01 11:59:00", end_date="2020-06-01 12:00:00", product_id=RSS_ID
+            )
+            download_manager.download_date_range(
+                start_date="2020-06-01 11:59:00",
+                end_date="2020-06-01 12:02:00",
+                product_id=CLOUD_ID,
+            )
+
+        # Now that we can be sure that those files exist, we can store the filenames.
+        # Note: The following fields should now contain the full paths to the RSS/cloudmask data.
+        # As per 01.03.2022, given the above date-range, the files you get should be:
+        # - [path]/MSG3-SEVI-MSG15-0100-NA-20200601115916.810000000Z-NA.nat for the RSS data
+        # - [path]/MSG3-SEVI-MSGCLMK-0100-0100-20200601120000.000000000Z-NA.grb for the cloudmask
+        # However, there is no guarantee that future API releases will keep the naming stable.
+        self.rss_filename = list(glob.glob(os.path.join(os.getcwd(), "*.nat")))[0]
+        self.cloud_mask_filename = list(glob.glob(os.path.join(os.getcwd(), "*.grb")))[0]
+
+        return super().setUp()
+
+    def test_load_cloudmask_to_dataset(self):  # noqa D102
+        for area in ["UK", "RSS"]:
+            cloudmask_dataset = load_cloudmask_to_dataset(
+                Path(self.cloud_mask_filename), temp_directory=Path(os.getcwd()), area=area
+            )
+            self.assertEqual(type(cloudmask_dataset), xarray.DataArray)
+
+    def test_load_native_to_dataset(self):  # noqa D102
+        for area in ["UK", "RSS"]:
+            rss_dataset, hrv_dataset = load_native_to_dataset(
+                Path(self.rss_filename), temp_directory=Path(os.getcwd()), area=area
+            )
+            self.assertEqual(type(rss_dataset), xarray.DataArray)
+            self.assertEqual(type(hrv_dataset), xarray.DataArray)
+
+    def test_save_dataset_to_zarr(self):  # noqa D102
+        # The following is a bit ugly, but since we do not want to lump two tests into one
+        # test function, and save_dataset_to_zarr depends on a dataset being loaded,
+        # we have to reload the dataset here. This means that this test can theoretically
+        # fail for two reasons: either the data-loading failed, or the data-saving failed.
+        rss_dataset, _ = load_native_to_dataset(
+            Path(self.rss_filename), temp_directory=Path(os.getcwd()), area="UK"
+        )
+
+        zarr_path = os.path.join(os.getcwd(), "tmp.zarr")
+
+        save_dataset_to_zarr(
+            rss_dataset,
+            zarr_path=zarr_path,
+            compressor_name="bz2",
+            zarr_mode="w",
+        )
+        self.assertEqual(1, len(list(glob.glob(zarr_path))))
diff --git a/tests/test_download.py b/tests/test_download.py
new file mode 100644
index 00000000..2621d5ba
--- /dev/null
+++ b/tests/test_download.py
@@ -0,0 +1,84 @@
+"""Tests for satip.download.py."""
+import os
+import unittest
+
+import pandas as pd
+
+from satip.download import (
+    _determine_datetimes_to_download_files,
+    _get_missing_datetimes_from_list_of_files,
+    download_eumetsat_data,
+)
+from satip.utils import format_dt_str
+
+
+class TestDownload(unittest.TestCase):
+    """Test case for downloader tests."""
+
+    def setUp(self) -> None:  # noqa
+        return super().setUp()
+
+    def test_download_eumetsat_data(self):  # noqa
+        # Call the downloader on a very short chunk of data:
+        self.assertIsNone(
+            download_eumetsat_data(
+                download_directory=str(os.getcwd() + "/storage/"),
+                start_date=format_dt_str("2020-06-01 11:59:00"),
+                end_date=format_dt_str("2020-06-01 12:02:00"),
+                user_key=os.environ.get("EUMETSAT_USER_KEY"),
+                user_secret=os.environ.get("EUMETSAT_USER_SECRET"),
+                auth_filename=None,
+                number_of_processes=2,
+                product=["cloud", "rss"],
+                enforce_full_days=False,
+            )
+        )
+
+    def test_determine_datetime_to_download_files(self):
+        """Tests correct day-wise-chunked lists.
+
+        Given an empty directory, the function is supposed to return a complete
+        list of dates, chunked into daily intervals.
+        """
+        datetimes = _determine_datetimes_to_download_files(
+            ".",
+            start_date=pd.to_datetime("2020-03-08 12:00"),
+            end_date=pd.to_datetime("2020-03-10 09:00"),
+            product_id="dummy",
+        )
+        print(datetimes)
+        print(datetimes[0])
+        print(type(datetimes[0][0]))
+        self.assertEqual(datetimes[0][0], pd.to_datetime("2020-03-08 11:59:00"))
+        self.assertEqual(datetimes[0][1], pd.to_datetime("2020-03-09 11:58:00"))
+        self.assertEqual(datetimes[1][0], pd.to_datetime("2020-03-09 11:59:00"))
+        self.assertEqual(datetimes[1][1], pd.to_datetime("2020-03-10 11:58:00"))
+
+    def test_get_missing_datetimes_from_list_of_files(self):
+        """Tests padding of datetimes if the files present are missing data for given days."""
+        # Assume we only have two files present, as something went very wrong
+        # with the download.
+        filenames = [
+            "MSG3-SEVI-MSG15-0100-NA-20190308114036.810000000Z-NA.nat",
+            "MSG3-SEVI-MSG15-0100-NA-20220308133711.810000000Z-NA.nat",
+        ]
+
+        # We then expect the function to pad this by adding the missing time ranges:
+        # everything from the beginning of the first day to the timestamp of the first file
+        # (case A), then everything between the two files (case B), and finally the rest
+        # until the end of the second day (case C).
+        expected_timestamps = [
+            (pd.to_datetime("2019-03-08 00:00:00"), pd.to_datetime("2019-03-08 11:40:00")),
+            (pd.to_datetime("2019-03-08 11:40:00"), pd.to_datetime("2022-03-08 13:37:00")),
+            (pd.to_datetime("2022-03-08 13:37:00"), pd.to_datetime("2022-03-08 23:58:00")),
+        ]
+
+        # Generate the list of missing datetime interval boundaries:
+        res = _get_missing_datetimes_from_list_of_files(filenames)
+
+        for i, interval in enumerate(expected_timestamps):
+            for b, boundary in enumerate(interval):
+                # Note that the function returns datetime objects, but we defined the expected
+                # values as pd-datetimes. Though their str-repr is different, they still pass
+                # the equality check when compared for the same dates.
+                self.assertEqual(boundary, res[i][b])
diff --git a/tests/test_jpeg_xl_float_with_nans.py b/tests/test_jpeg_xl_float_with_nans.py
new file mode 100644
index 00000000..aa8ef6fe
--- /dev/null
+++ b/tests/test_jpeg_xl_float_with_nans.py
@@ -0,0 +1,117 @@
+"""Unit tests for satip.jpeg_xl_float_with_nans."""
+
+import unittest
+
+import numpy as np
+
+from satip.jpeg_xl_float_with_nans import (
+    LOWER_BOUND_FOR_REAL_PIXELS,
+    NAN_VALUE,
+    JpegXlFloatWithNaNs,
+    decode_nans,
+    encode_nans,
+)
+
+
+class TestJpegXlFloatWithNaNs(unittest.TestCase):
+    """Test class for unittest for the class methods and the functions.
+
+    We only test our home-written functions.
+    The two similarly named class functions encode and decode are mostly wrappers
+    around our home-written function output piped into an external library.
+    Testing the functionality of the external functions is out of scope.
+    """
+
+    def setUp(self) -> None:  # noqa D102
+        # Define a synthetic input array and the expected target array:
+        self.buf = np.asarray([np.nan, 0.0, 0.5, 1.0], dtype=np.float32)
+        self.encoded = np.asarray(
+            [NAN_VALUE, LOWER_BOUND_FOR_REAL_PIXELS, 0.5 * (1 + LOWER_BOUND_FOR_REAL_PIXELS), 1.0],
+            dtype=np.float32,
+        )
+
+        self.jpegxl = JpegXlFloatWithNaNs()
+
+        return super().setUp()
+
+    def test_encode(self):
+        """Tests the encoding function.
+
+        After encoding the raw array, the nan-values should be gone and the
+        real values should be transformed to the range specified by the
+        constants imported from the source code. See there for more details.
+        """
+        # Check that the encoded buffer matches the expected target
+        # (attention: use a copy of the originals!):
+        self.assertTrue(np.isclose(encode_nans(self.buf.copy()), self.encoded).all())
+
+    def test_decode(self):
+        """Tests the decoding function.
+
+        When taking what was previously the encoded array and decoding it,
+        we expect to get the original buf-array back again.
+        """
+        # As np.nan != np.nan (!) and thus np.isclose or array comparison do not consider
+        # two nan-values to be close or equal, we have to replace all nan-values with
+        # a numeric value before comparison. This numeric value should be one that
+        # can not be created via decoding (e.g. a negative number).
+        nan_replacement = -3.14
+        self.assertTrue(
+            np.isclose(
+                np.nan_to_num(self.buf, nan_replacement),
+                np.nan_to_num(decode_nans(self.encoded.copy()), nan_replacement),
+            ).all()
+        )
+
+    def test_class_roundtrip(self):
+        """Tests the class-defined wrappers around our home-written functions.
+
+        We test whether a back-and-forth transformation (nested encode-decode)
+        will give us back our original input value.
+        """
+        reshaped_buf = self.buf.copy().reshape((1, -1, 1))
+
+        roundtrip_result = self.jpegxl.decode(self.jpegxl.encode(reshaped_buf.copy()))
+
+        # For reasons explained in the decoding test, we have to manually replace
+        # the nan-values to make them comparable:
+        nan_replacement = -3.14
+        reshaped_buf = np.nan_to_num(reshaped_buf, nan_replacement)
+        roundtrip_result = np.nan_to_num(roundtrip_result, nan_replacement)
+
+        # When we do the comparison, we have to be very lenient, as the external library
+        # will have worked its compression magic, so values will not completely align.
+        # Also, going back and forth removes the information about the channel number
+        # in our test case (presumably b/c we here only have one channel for simplicity's sake).
+        # So we have to reshape both:
+        self.assertTrue(
+            np.isclose(reshaped_buf.reshape((-1)), roundtrip_result.reshape((-1)), atol=0.1).all()
+        )
+
+    def test_consistent_init_params(self):
+        """The JpegXlFloat-class has to be initialised with specific parameter combinations.
+
+        Stuff that is allowed:
+        1. If lossless = None, then everything is allowed.
+        2. If lossless = True, then level has to be None and distance has to be None or 0.
+        3. If lossless = False, then everything is allowed.
+
+        To test this, we will try various parameters and see that the class gets
+        initialised properly, w/o throwing any errors.
+        """
+
+        # Sub-case 1:
+        self.assertTrue(JpegXlFloatWithNaNs(lossless=None, level="very_high", distance=-10))
+
+        # Sub-case 2:
+        with self.assertRaises(AssertionError):
+            JpegXlFloatWithNaNs(lossless=True, level=1, distance=1)
+
+        with self.assertRaises(AssertionError):
+            JpegXlFloatWithNaNs(lossless=True, level=None, distance=1)
+
+        with self.assertRaises(AssertionError):
+            JpegXlFloatWithNaNs(lossless=True, level=2, distance=0)
+
+        # Sub-case 3:
+        self.assertTrue(JpegXlFloatWithNaNs(lossless=False))
diff --git a/tests/test_scale_to_zero_to_one.py b/tests/test_scale_to_zero_to_one.py
new file mode 100644
index 00000000..f04d14f2
--- /dev/null
+++ b/tests/test_scale_to_zero_to_one.py
@@ -0,0 +1,108 @@
+"""Tests for scale_to_zero_to_one.py."""
+import unittest
+
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+from satip.scale_to_zero_to_one import ScaleToZeroToOne, is_dataset_clean
+
+
+class TestScaleToZeroToOne(unittest.TestCase):
+    """Test class for methods of class scale_to_zero_to_one.ScaleToZeroToOne.
+
+    We will set up a mock dataset and try the various methods in ScaleToZeroToOne,
+    checking whether the expected results manifest themselves.
+    """
+
+    def setUp(self) -> None:
+        """Set up for all following tests, where a common dataarray is defined."""
+        # Set dimensionality of the fake dataset:
+        Nx, Ny, Nt = 2, 3, 10
+
+        # Generate the fake four-dimensional data:
+        data = np.zeros((Nx, Ny, Nt, 2))
+        data[:, :, :, 0] = np.linspace(-10, 10, Nt, endpoint=True) + np.random.rand(Nx, Ny, Nt)
+        data[:, :, :, 1] = np.linspace(10, -10, Nt, endpoint=True) + np.random.rand(Nx, Ny, Nt)
+
+        # Set some random values in the middle to NaN:
+        data[Nx // 2, Ny // 2, Nt // 2, :] = np.nan
+
+        self.dataset = xr.DataArray(
+            data=data,
+            coords=dict(
+                lon=(
+                    ["x_geostationary", "y_geostationary"],
+                    np.linspace(0, 1.0, Nx, endpoint=True).reshape((Nx, 1)) + np.zeros((Nx, Ny)),
+                ),
+                lat=(
+                    ["x_geostationary", "y_geostationary"],
+                    np.linspace(-1.0, 0.0, Ny, endpoint=True).reshape((1, Ny)) + np.zeros((Nx, Ny)),
+                ),
+                time=pd.date_range("2019-03-08", periods=Nt),
+            ),
+            dims=["x_geostationary", "y_geostationary", "time", "variable"],
+            attrs=dict(
+                description="Some randomly permuted lines in time and space.\
+                    If you find meaning in this, please see a shrink."
+ ), + ) + + return super().setUp() + + def test_fit(self): # noqa: D102 + + scaler = ScaleToZeroToOne( + mins=np.asarray([-5, 0]), + maxs=np.asarray([5, 20]), + variable_order=["wrong_var_name_one", "wrong_var_name_two"], + ) + scaler.fit(self.dataset, dims=("x_geostationary", "y_geostationary", "time")) + + # Test whether the min/max-values are logged: + self.assertListEqual( + scaler.mins.values.tolist(), + self.dataset.min(("x_geostationary", "y_geostationary", "time")) + .compute() + .values.tolist(), + ) + self.assertListEqual( + scaler.maxs.values.tolist(), + self.dataset.max(("x_geostationary", "y_geostationary", "time")) + .compute() + .values.tolist(), + ) + + # Test whether the initially wrong variable names are set correctly now: + self.assertListEqual(scaler.variable_order.tolist(), [0, 1]) + + def test_rescale(self): # noqa: D102 + scaler = ScaleToZeroToOne().fit( + self.dataset, dims=("x_geostationary", "y_geostationary", "time") + ) + dataset = scaler.rescale(self.dataset) + scaler.fit(dataset, dims=("x_geostationary", "y_geostationary", "time")) + + # Assert that all values are between zero and one: + self.assertListEqual(scaler.mins.values.tolist(), [0, 0]) + self.assertListEqual(scaler.maxs.values.tolist(), [1, 1]) + + # Are the NaN still in there? + self.assertTrue(np.isnan(dataset).any()) + + def test_compress_mask(self): # noqa: D102 + # Generate a dataset and rescale it. + # The result should be a dataset which still contains NaNs. + scaler = ScaleToZeroToOne().fit( + self.dataset, dims=("x_geostationary", "y_geostationary", "time") + ) + dataset = scaler.rescale(self.dataset) + + # Now compress the dataset and then check if the NaN-values have been replaced with -1: + dataset = scaler.compress_mask(dataset) + + self.assertTrue(dataset.min() == -1) + self.assertFalse(np.isnan(dataset).any()) + + # While we are at it, lets also test the is_dataset_clean-method: + self.assertTrue(is_dataset_clean(dataset))