Commit
Update notebooks to work with changes to dask chunking, adapt to new Raven code conventions (#395)

Fixes 1/2 of Ouranosinc/pavics-jupyter-env-issues#12.

For the new Jupyter env in Ouranosinc/PAVICS-e2e-workflow-tests#137.

FYI @tlvu 

### Changes

- Updates the date handling of datasets so that chunking with `dask` is properly performed (see the sketch below).
- Minor fixes to comments (formatting, typos).
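
For context, a minimal, self-contained sketch of the date-handling and chunking pattern the notebooks now follow. The toy dataset and the `t2m` variable name are placeholders for the CMIP6/ERA5 data actually read in notebook 08, and `cftime` plus `dask` are assumed to be installed (as they are in the Jupyter env):

```python
import numpy as np
import xarray as xr

# Toy daily dataset standing in for the CMIP6 data read in notebook 08;
# the random values and the "t2m" name are placeholders.
time = xr.cftime_range("1980-01-01 12:00", periods=10, freq="D", calendar="noleap")
ds = xr.Dataset({"t2m": ("time", np.random.rand(10))}, coords={"time": time})

# Normalize the calendar so the time axis becomes numpy.datetime64.
ds = ds.convert_calendar("standard")

# Move each timestamp to midnight of its day so dates line up across datasets.
ds = ds.assign_coords(time=ds.time.dt.floor("D"))

# Rechunk with per-dimension keyword syntax and a single chunk along time,
# replacing the positional .chunk(-1, -1, -1) calls shown in the diff below.
t2m = ds["t2m"].chunk(time=-1)
```

In short, chunk sizes are now given per dimension by name, and the time coordinate is normalized (standard calendar, midnight timestamps) before the dask-backed operations, which is what makes the chunking behave as expected.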
tlvu authored Nov 14, 2024
2 parents 88bfc6f + c06bf1f commit fb74e38
Showing 10 changed files with 211 additions and 200 deletions.
21 changes: 12 additions & 9 deletions .github/workflows/main.yml
@@ -52,13 +52,13 @@ jobs:
fail-fast: false
matrix:
os: [ 'ubuntu-latest' ] # 'macos-latest' disabled until a new build of raven-hydro is available
python-version:
- "3.9"
- "3.10"
- "3.11"
- "3.12"
python-version: [ "3.9", "3.11", "3.12" ]
tox-env: [ 'false' ]
# - "3.13" # not yet supported by dependencies
upstream-branch: [ 'main' ]
include:
- os: 'ubuntu-latest'
python-version: '3.10'
tox-env: 'py3.10-coveralls-upstream'
steps:
- name: Harden Runner
uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
@@ -113,7 +113,11 @@ jobs:
python3 -m pip install --require-hashes -r CI/requirements_ci.txt
- name: Test with tox and report coverage
run: |
python3 -m tox
if [ "${{ matrix.tox-env }}" != "false" ]; then
python3 -m tox -e ${{ matrix.tox-env }}
else
python3 -m tox -e py${{ matrix.python-version }}-coveralls
fi
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COVERALLS_FLAG_NAME: run-Python${{ matrix.python-version }}-${{ matrix.os }}
@@ -128,8 +132,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
os: [ "ubuntu-latest" ]
# - macos-latest # disabled until a new build of raven-hydro is available
# - windows-latest # disabled until xesmf is available
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
98 changes: 52 additions & 46 deletions docs/notebooks/08_Getting_and_bias_correcting_CMIP6_data.ipynb
@@ -85,10 +85,10 @@
},
"outputs": [],
"source": [
"# We get the basin contour for testing on a server. You can replace the getfile method by a string containing the path\n",
"# to your own geojson\n",
"# We get the basin contour for testing on a server.\n",
"# You can replace the getfile method by a string containing the path to your own geojson.\n",
"\n",
"# Get basin contour\n",
"# Get basin contour.\n",
"basin_contour = get_file(\"notebook_inputs/input.geojson\")\n",
"\n",
"reference_start_day = dt.datetime(1980, 12, 31)\n",
@@ -158,15 +158,16 @@
},
"outputs": [],
"source": [
"# Prepare the filesystem that allows reading data. Data is read on the Google Cloud Services, which host a copy of the CMIP6 (and other) data.\n",
"# Prepare the filesystem that allows reading data.\n",
"# Data is read on the Google Cloud Services, which host a copy of the CMIP6 (and other) data.\n",
"fsCMIP = gcsfs.GCSFileSystem(token=\"anon\", access=\"read_only\")\n",
"\n",
"# Get the catalog info from the pangeo dataset, which basically is a list of links to the various products.\n",
"# Get the catalog info from the PANGEO dataset, which basically is a list of links to the various products.\n",
"col = intake.open_esm_datastore(\n",
" \"https://storage.googleapis.com/cmip6/pangeo-cmip6.json\"\n",
")\n",
"\n",
"# Print the contents of the catalog, so we can see the classification system\n",
"# Print the contents of the catalog, so we can see the classification system.\n",
"display(col)"
]
},
@@ -185,7 +186,8 @@
},
"outputs": [],
"source": [
"# Get the list of models. Replace \"source_id\" with any of the catalog categories (table_id, activity_id, variable_id, etc.)\n",
"# Get the list of models.\n",
"# Replace \"source_id\" with any of the catalog categories (table_id, activity_id, variable_id, etc.)\n",
"list(col.df.source_id.unique())"
]
},
@@ -225,11 +227,10 @@
" member_id=\"r1i1p1f1\",\n",
" source_id=climate_model,\n",
")\n",
"col_subset = col.search(\n",
" require_all_on=[\"source_id\"], **query\n",
") # Command that will return the filtered list\n",
"# Return the filtered list.\n",
"col_subset = col.search(require_all_on=[\"source_id\"], **query)\n",
"\n",
"# Show the filtered list:\n",
"# Show the filtered list.\n",
"display(col_subset.df)"
]
},
@@ -249,7 +250,7 @@
},
"outputs": [],
"source": [
"# Get the object locator object\n",
"# Get the object locator object.\n",
"mapper = fsCMIP.get_mapper(col_subset.df.zstore[0])"
]
},
@@ -277,17 +278,20 @@
},
"outputs": [],
"source": [
"# Get the CMIP6 data from Google Cloud and read it in memory using xarray. This is done via \"lazy loading\" and is not actually reading the data in memory\n",
"# yet, but is keeping track of what it will need to get, eventually.\n",
"# Get the CMIP6 data from Google Cloud and read it in memory using xarray.\n",
"# This is done via \"lazy loading\" and is not actually reading the data in memory yet, but is keeping track of what it will need to get, eventually.\n",
"ds = xr.open_zarr(mapper, consolidated=True)\n",
"\n",
"# Convert to numpy.datetime64 object to be compatbile\n",
"if type(ds.time[0].values) is not type(np.datetime64(\"1980-01-01\")):\n",
" ds = ds.convert_calendar(\"standard\")\n",
"# Convert to numpy.datetime64 object for compatibility.\n",
"ds = ds.convert_calendar(\"standard\")\n",
"\n",
"# Extract only the dates that we really want. Again, this is done via lazy loading, and is not actually using memory at this point.\n",
"# Extract only the dates that we really want.\n",
"# Again, this is done via lazy loading, and is not actually using memory at this point.\n",
"ds = ds.sel(time=slice(reference_start_day, reference_end_day))\n",
"\n",
"# Set the date to the midnight of the given day.\n",
"ds = ds.assign_coords(time=ds.time.dt.floor(\"D\"))\n",
"\n",
"# Use the clisops subsetting tools to extract the data for the watershed boundaries and take the spatial average\n",
"ds = average.average_shape(ds, basin_contour)\n",
"\n",
@@ -322,14 +326,16 @@
" with xr.set_options(keep_attrs=True):\n",
" ds = xr.open_zarr(mapper, consolidated=True)\n",
"\n",
" # Convert to numpy.datetime64 object to be compatbile\n",
" if type(ds.time[0].values) is not type(np.datetime64(\"1980-01-01\")):\n",
" ds = ds.convert_calendar(\"standard\")\n",
" # Convert to numpy.datetime64 object for compatibility.\n",
" ds = ds.convert_calendar(\"standard\")\n",
"\n",
" # Set the date to the midnight of the given day.\n",
" ds = ds.assign_coords(time=ds.time.dt.floor(\"D\"))\n",
"\n",
" # Compute average over region\n",
" # Compute the average over region.\n",
" out = average.average_shape(ds.sel(time=slice(start, end)), geometry)\n",
"\n",
" # Convert geometry variables into attributes\n",
" # Convert geometry variables into attributes.\n",
" attrs = {\n",
" key: out[key].values.item()\n",
" for key in out.coords\n",
@@ -431,9 +437,15 @@
" ERA5_reference = subset.subset_shape(\n",
" ds.sel(time=slice(reference_start_day, reference_end_day)), basin_contour\n",
" ).mean({\"latitude\", \"longitude\"})\n",
" ERA5_tmin = ERA5_reference[\"t2m\"].resample(time=\"1D\").min().chunk(-1, -1, -1)\n",
" ERA5_tmax = ERA5_reference[\"t2m\"].resample(time=\"1D\").max().chunk(-1, -1, -1)\n",
" ERA5_pr = ERA5_reference[\"tp\"].resample(time=\"1D\").sum().chunk(-1, -1, -1)"
" ERA5_tmin = (\n",
" ERA5_reference.t2m.resample(time=\"1D\")\n",
" .min()\n",
" .chunk(\n",
" time=-1,\n",
" )\n",
" )\n",
" ERA5_tmax = ERA5_reference.t2m.resample(time=\"1D\").max().chunk(time=-1)\n",
" ERA5_pr = ERA5_reference.tp.resample(time=\"1D\").sum().chunk(time=-1)"
]
},
{
@@ -455,8 +467,8 @@
},
"outputs": [],
"source": [
"# Here we need to make sure that our units are all in the correct format. You can play around with the tools we've seen thus far to explore the units\n",
"# and make sure everything is consistent.\n",
"# Here we need to make sure that our units are all in the correct format.\n",
"# You can play around with the tools we've seen thus far to explore the units and make sure everything is consistent.\n",
"\n",
"# Let's start with precipitation:\n",
"ERA5_pr = xclim.core.units.convert_units_to(ERA5_pr, \"mm\", context=\"hydro\")\n",
@@ -497,25 +509,25 @@
},
"outputs": [],
"source": [
"# Use xclim utilities (sbda) to give information on the type of window used for the bias correction.\n",
"# Use xclim utilities (SDBA) to give information on the type of window used for the bias correction.\n",
"group_month_window = sdba.utils.Grouper(\"time.dayofyear\", window=15)\n",
"\n",
"# This is an adjusting function. It builds the tool that will perform the corrections.\n",
"Adjustment = sdba.DetrendedQuantileMapping.train(\n",
" ref=ERA5_pr, hist=historical_pr, nquantiles=50, kind=\"+\", group=group_month_window\n",
")\n",
"\n",
"# Apply the correction factors on the reference period\n",
"# Apply the correction factors on the reference period.\n",
"corrected_ref_precip = Adjustment.adjust(historical_pr, interp=\"linear\")\n",
"\n",
"# Apply the correction factors on the future period\n",
"# Apply the correction factors on the future period.\n",
"corrected_fut_precip = Adjustment.adjust(future_pr, interp=\"linear\")\n",
"\n",
"# Ensure that the precipitation is non-negative, which can happen with some climate models\n",
"# Ensure that the precipitation is non-negative, which can happen with some climate models.\n",
"corrected_ref_precip = corrected_ref_precip.where(corrected_ref_precip > 0, 0)\n",
"corrected_fut_precip = corrected_fut_precip.where(corrected_fut_precip > 0, 0)\n",
"\n",
"# Train the model to find the correction factors for the maximum temperature (tasmax) data\n",
"# Train the model to find the correction factors for the maximum temperature (tasmax) data.\n",
"Adjustment = sdba.DetrendedQuantileMapping.train(\n",
" ref=ERA5_tmax,\n",
" hist=historical_tasmax,\n",
@@ -524,13 +536,13 @@
" group=group_month_window,\n",
")\n",
"\n",
"# Apply the correction factors on the reference period\n",
"# Apply the correction factors on the reference period.\n",
"corrected_ref_tasmax = Adjustment.adjust(historical_tasmax, interp=\"linear\")\n",
"\n",
"# Apply the correction factors on the future period\n",
"# Apply the correction factors on the future period.\n",
"corrected_fut_tasmax = Adjustment.adjust(future_tasmax, interp=\"linear\")\n",
"\n",
"# Train the model to find the correction factors for the minimum temperature (tasmin) data\n",
"# Train the model to find the correction factors for the minimum temperature (tasmin) data.\n",
"Adjustment = sdba.DetrendedQuantileMapping.train(\n",
" ref=ERA5_tmin,\n",
" hist=historical_tasmin,\n",
@@ -561,7 +573,8 @@
},
"outputs": [],
"source": [
"# Convert the reference corrected data into netCDF file. We will then apply a special code to remove a dimension in the dataset to make it applicable to the RAVEN models.\n",
"# Convert the reference corrected data into netCDF file.\n",
"# We will then apply a special code to remove a dimension in the dataset to make it applicable to the RAVEN models.\n",
"ref_dataset = xr.merge(\n",
" [\n",
" corrected_ref_precip.to_dataset(name=\"pr\"),\n",
Expand All @@ -570,11 +583,11 @@
" ]\n",
")\n",
"\n",
"# Write to temporary folder\n",
"# Write to temporary folder.\n",
"fn_ref = tmp / \"reference_dataset.nc\"\n",
"ref_dataset.to_netcdf(fn_ref)\n",
"\n",
"# Convert the future corrected data into netCDF file\n",
"# Convert the future corrected data into netCDF file.\n",
"fut_dataset = xr.merge(\n",
" [\n",
" corrected_fut_precip.to_dataset(name=\"pr\"),\n",
@@ -610,13 +623,6 @@
"# Compare it to the future precipitation without bias-correction.\n",
"future_pr.plot()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
11 changes: 2 additions & 9 deletions docs/notebooks/10_Data_assimilation.ipynb
@@ -156,14 +156,7 @@
" adj=\"MULTIPLICATIVE\",\n",
" ),\n",
" rc.ForcingPerturbation(\n",
" forcing=\"TEMP_MAX\",\n",
" dist=\"DIST_NORMAL\",\n",
" p1=0.0,\n",
" p2=2.0,\n",
" adj=\"ADDITIVE\",\n",
" ),\n",
" rc.ForcingPerturbation(\n",
" forcing=\"TEMP_MIN\",\n",
" forcing=\"TEMP_AVE\",\n",
" dist=\"DIST_NORMAL\",\n",
" p1=0.0,\n",
" p2=2.0,\n",
@@ -456,7 +449,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.12.5"
}
},
"nbformat": 4,
(The remaining 7 changed files are not shown.)
