From 9fe2ba23770657993543dbb88e9565498d1e916c Mon Sep 17 00:00:00 2001
From: ouyangwenyu <wenyuouyang@outlook.com>
Date: Mon, 25 Mar 2024 21:58:36 +0800
Subject: [PATCH] add a script for 1-CAMELS-basin calibraion

---
 hydromodel/datasets/data_preprocess.py |  14 +--
 hydromodel/trainers/calibrate_sceua.py |   3 -
 scripts/calibrate_xaj_camels.py        | 155 +++++++++++++++++++++++++
 scripts/datapreprocess4calibrate.py    | 146 -----------------------
 test/test_data_preprocess.py           |  23 ++--
 5 files changed, 168 insertions(+), 173 deletions(-)
 create mode 100644 scripts/calibrate_xaj_camels.py
 delete mode 100644 scripts/datapreprocess4calibrate.py

diff --git a/hydromodel/datasets/data_preprocess.py b/hydromodel/datasets/data_preprocess.py
index 89ef33a..0560de4 100644
--- a/hydromodel/datasets/data_preprocess.py
+++ b/hydromodel/datasets/data_preprocess.py
@@ -278,14 +278,14 @@ def process_and_save_data_as_nc(
     return True
 
 
-def split_train_test(ts_file, train_period, test_period):
+def split_train_test(ts_data, train_period, test_period):
     """
     Split all data to train and test parts with same format
 
     Parameters
     ----------
-    ts_file
-        nc file of all time series data
+    ts_data: xr.Dataset
+        time series data
     train_period
         training period
     test_period
@@ -295,7 +295,6 @@ def split_train_test(ts_file, train_period, test_period):
     -------
     None
     """
-    ts_data = xr.open_dataset(ts_file)
     # Convert date strings to pandas datetime objects
     train_start, train_end = pd.to_datetime(train_period[0]), pd.to_datetime(
         train_period[1]
@@ -332,14 +331,14 @@ def validate_freq(freq):
         return False
 
 
-def cross_valid_data(ts_file, period, warmup, cv_fold, freq="1D"):
+def cross_valid_data(ts_data, period, warmup, cv_fold, freq="1D"):
     """
     Split all data to train and test parts with same format for cross validation.
 
     Parameters
     ----------
-    ts_file : str
-        Path to the NetCDF file of time series data.
+    ts_data : xr.Dataset
+        time series data.
     period : tuple of str
         The whole period in the format ("start_date", "end_date").
     warmup : int
@@ -358,7 +357,6 @@ def cross_valid_data(ts_file, period, warmup, cv_fold, freq="1D"):
         raise ValueError(
             "Time unit must be number with either 'Y','M','W','D','h','m' or 's', such as 3D."
         )
-    ts_data = xr.open_dataset(ts_file)
 
     # Convert the whole period to pandas datetime
     start_date, end_date = pd.to_datetime(period[0]), pd.to_datetime(period[1])
diff --git a/hydromodel/trainers/calibrate_sceua.py b/hydromodel/trainers/calibrate_sceua.py
index 7e3c290..893b2bb 100644
--- a/hydromodel/trainers/calibrate_sceua.py
+++ b/hydromodel/trainers/calibrate_sceua.py
@@ -5,9 +5,6 @@
 from spotpy.parameter import Uniform, ParameterSet
 from hydromodel.models.model_config import MODEL_PARAM_DICT
 from hydromodel.models.model_dict import CRITERION_DICT, MODEL_DICT
-from hydromodel.models.gr4j import gr4j
-from hydromodel.models.hymod import hymod
-from hydromodel.models.xaj import xaj
 
 
 class SpotSetup(object):
diff --git a/scripts/calibrate_xaj_camels.py b/scripts/calibrate_xaj_camels.py
new file mode 100644
index 0000000..579a86f
--- /dev/null
+++ b/scripts/calibrate_xaj_camels.py
@@ -0,0 +1,155 @@
+"""
+Author: Wenyu Ouyang
+Date: 2022-11-19 17:27:05
+LastEditTime: 2024-03-25 20:52:08
+LastEditors: Wenyu Ouyang
+Description: the script to calibrate a model for CAMELS basin
+FilePath: \hydro-model-xaj\scripts\calibrate_xaj_camels.py
+Copyright (c) 2021-2022 Wenyu Ouyang. All rights reserved.
+"""
+
+import numpy as np
+import argparse
+import sys
+import os
+from pathlib import Path
+
+from hydrodataset import Camels
+from hydrodata.utils.utils import streamflow_unit_conv
+
+repo_path = os.path.dirname(Path(os.path.abspath(__file__)).parent)
+sys.path.append(repo_path)
+from hydromodel import SETTING
+from hydromodel.datasets.data_preprocess import cross_valid_data, split_train_test
+from hydromodel.trainers.calibrate_sceua import calibrate_by_sceua
+
+
+def main(args):
+    camels_name = args.camels_name
+    exp = args.exp
+    cv_fold = args.cv_fold
+    train_period = args.calibrate_period
+    test_period = args.test_period
+    periods = args.period
+    warmup = args.warmup
+    basin_ids = args.basin_id
+    camels_data_dir = os.path.join(
+        SETTING["local_data_path"]["datasets-origin"], "camels", camels_name
+    )
+    camels = Camels(camels_data_dir)
+    ts_data = camels.read_ts_xrdataset(
+        basin_ids, periods, ["prcp", "PET", "streamflow"]
+    )
+    where_save = Path(os.path.join(repo_path, "result", exp))
+    if os.path.exists(where_save) is False:
+        os.makedirs(where_save)
+
+    if cv_fold <= 1:
+        # no cross validation
+        periods = np.sort(
+            [train_period[0], train_period[1], test_period[0], test_period[1]]
+        )
+    if cv_fold > 1:
+        train_and_test_data = cross_valid_data(ts_data, periods, warmup, cv_fold)
+    else:
+        # when using train_test_split, the warmup period is not used
+        # so you should include the warmup period in the train and test period
+        train_and_test_data = split_train_test(ts_data, train_period, test_period)
+    print("Start to calibrate the model")
+    p_and_e = (
+        train_and_test_data[0][["prcp", "PET"]].to_array().to_numpy().transpose(2, 1, 0)
+    )
+    # trans unit to mm/day
+    qobs_ = train_and_test_data[0][["streamflow"]]
+    basin_area = camels.read_area(basin_ids)
+    r_mmd = streamflow_unit_conv(qobs_, basin_area, target_unit="mm/d")
+    qobs = np.expand_dims(r_mmd["streamflow"].to_numpy().transpose(1, 0), axis=2)
+    calibrate_by_sceua(
+        p_and_e,
+        qobs,
+        os.path.join(where_save, "sceua_xaj"),
+        warmup,
+        model={
+            "name": "xaj_mz",
+            "source_type": "sources",
+            "source_book": "HF",
+        },
+        algorithm={
+            "name": "SCE_UA",
+            "random_seed": 1234,
+            "rep": 5,
+            "ngs": 7,
+            "kstop": 3,
+            "peps": 0.1,
+            "pcento": 0.1,
+        },
+        metric={
+            "type": "time_series",
+            "obj_func": "RMSE",
+            "events": None,
+        },
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Run hydro-model-xaj models with the CAMELS dataset"
+    )
+    parser.add_argument(
+        "--camels_name",
+        dest="camels_name",
+        help="the name of CAMELS-formatted data directory",
+        default="camels_us",
+        type=str,
+    )
+    parser.add_argument(
+        "--exp",
+        dest="exp",
+        help="An exp is corresponding to one data setting",
+        default="expcamels001",
+        type=str,
+    )
+    parser.add_argument(
+        "--cv_fold",
+        dest="cv_fold",
+        help="the number of cross-validation fold",
+        default=1,
+        type=int,
+    )
+    parser.add_argument(
+        "--warmup",
+        dest="warmup",
+        help="the number of warmup days",
+        default=365,
+        type=int,
+    )
+    parser.add_argument(
+        "--period",
+        dest="period",
+        help="The whole period",
+        default=["2007-01-01", "2014-01-01"],
+        nargs="+",
+    )
+    parser.add_argument(
+        "--calibrate_period",
+        dest="calibrate_period",
+        help="The training period",
+        default=["2007-01-01", "2014-01-01"],
+        nargs="+",
+    )
+    parser.add_argument(
+        "--test_period",
+        dest="test_period",
+        help="The testing period",
+        default=["2007-01-01", "2014-01-01"],
+        nargs="+",
+    )
+    parser.add_argument(
+        "--basin_id",
+        dest="basin_id",
+        help="The basins' ids",
+        default=["01439500", "06885500", "08104900", "09510200"],
+        nargs="+",
+    )
+    the_args = parser.parse_args()
+    main(the_args)
diff --git a/scripts/datapreprocess4calibrate.py b/scripts/datapreprocess4calibrate.py
deleted file mode 100644
index 54b1b37..0000000
--- a/scripts/datapreprocess4calibrate.py
+++ /dev/null
@@ -1,146 +0,0 @@
-"""
-Author: Wenyu Ouyang
-Date: 2022-11-19 17:27:05
-LastEditTime: 2024-03-25 17:10:51
-LastEditors: Wenyu Ouyang
-Description: the script to preprocess data for models in hydro-model-xaj
-FilePath: \hydro-model-xaj\scripts\datapreprocess4calibrate.py
-Copyright (c) 2021-2022 Wenyu Ouyang. All rights reserved.
-"""
-
-import numpy as np
-import hydrodataset
-import argparse
-import sys
-import os
-from pathlib import Path
-
-sys.path.append(os.path.dirname(Path(os.path.abspath(__file__)).parent))
-from hydromodel.datasets.data_preprocess import (
-    cross_valid_data,
-    split_train_test,
-)
-
-
-def main(args):
-    camels_name = args.camels_name
-    exp = args.exp
-    cv_fold = args.cv_fold
-    train_period = args.calibrate_period
-    test_period = args.test_period
-    periods = args.period
-    warmup = args.warmup
-    basin_ids = args.basin_id
-    camels_data_dir = hydrodataset.ROOT_DIR
-    # where_save_cache = hydrodataset.CACHE_DIR
-    where_save_cache = Path(
-        os.path.join(definitions.ROOT_DIR, "hydromodel", "example", exp)
-    )
-    if os.path.exists(where_save_cache) is False:
-        os.makedirs(where_save_cache)
-    json_file = where_save_cache.joinpath("data_info.json")
-    npy_file = where_save_cache.joinpath("basins_lump_p_pe_q.npy")
-
-    if not (cv_fold > 1):
-        # no cross validation
-        periods = np.sort(
-            [train_period[0], train_period[1], test_period[0], test_period[1]]
-        )
-    trans_camels_format_to_xaj_format(
-        camels_data_dir.joinpath("camels", camels_name),
-        basin_ids,
-        [periods[0], periods[-1]],
-        json_file,
-        npy_file,
-    )
-    if cv_fold > 1:
-        cross_valid_data(json_file, npy_file, periods, warmup, cv_fold)
-    else:
-        # when using train_test_split, the warmup period is not used
-        # so you should include the warmup period in the train and test period
-        split_train_test(json_file, npy_file, train_period, test_period)
-
-
-# python datapreprocess4calibrate.py --camels_name camels_cc --exp exp004 --calibrate_period 2014-10-01 2019-10-01 --test_period 2018-10-01 2021-10-01 --basin_id 60668 61561 63002 63007 63486 92354 94560
-# python datapreprocess4calibrate.py --camels_name camels_cc --exp exp201 --cv_fold 2 --warmup 365 --period 2014-10-01 2021-10-01 --basin_id 60650 60668 61239 61277 61561 61716 62618 63002 63486 63490 92354 94560
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Prepare data for hydro-model-xaj models."
-    )
-    parser.add_argument(
-        "--camels_name",
-        dest="camels_name",
-        help="the name of CAMELS-formatted data directory",
-        default="camels_cc",
-        # default="camels_us",
-        type=str,
-    )
-    parser.add_argument(
-        "--exp",
-        dest="exp",
-        help="An exp is corresponding to one data setting",
-        default="exp004",
-        type=str,
-    )
-    parser.add_argument(
-        "--cv_fold",
-        dest="cv_fold",
-        help="the number of cross-validation fold",
-        default=1,
-        type=int,
-    )
-    parser.add_argument(
-        "--warmup",
-        dest="warmup",
-        help="the number of warmup days",
-        default=365,
-        type=int,
-    )
-    parser.add_argument(
-        "--period",
-        dest="period",
-        help="The whole period",
-        default=["2014-10-01", "2021-10-01"],
-        # default=["2007-01-01", "2014-01-01"],
-        nargs="+",
-    )
-    parser.add_argument(
-        "--calibrate_period",
-        dest="calibrate_period",
-        help="The training period",
-        default=["2016-10-01", "2021-10-01"],
-        # default=["2007-01-01", "2014-01-01"],
-        nargs="+",
-    )
-    parser.add_argument(
-        "--test_period",
-        dest="test_period",
-        help="The testing period",
-        default=["2014-10-01", "2017-10-01"],
-        # default=["2007-01-01", "2014-01-01"],
-        nargs="+",
-    )
-    parser.add_argument(
-        "--basin_id",
-        dest="basin_id",
-        help="The basins' ids",
-        default=["60668", "61561", "63002", "63007", "63486", "92354", "94560"],
-        # default=[
-        #     "60650",
-        #     "60668",
-        #     "61239",
-        #     "61277",
-        #     "61561",
-        #     "61716",
-        #     "62618",
-        #     "63002",
-        #     "63486",
-        #     "63490",
-        #     "92354",
-        #     "94560",
-        # ],
-        # default=["01439500", "06885500", "08104900", "09510200"],
-        nargs="+",
-    )
-    the_args = parser.parse_args()
-    main(the_args)
diff --git a/test/test_data_preprocess.py b/test/test_data_preprocess.py
index 78b5ea9..fae225b 100644
--- a/test/test_data_preprocess.py
+++ b/test/test_data_preprocess.py
@@ -4,7 +4,6 @@
 import os
 import pandas as pd
 import xarray as xr
-from sklearn.model_selection import KFold
 
 from hydromodel import SETTING
 from hydromodel.datasets import *
@@ -261,11 +260,10 @@ def test_load_dataset():
     print(data)
 
 
-def create_temp_netCDF(tmp_path, periods=10):
-    """temp NetCDF file for test"""
-    ts_file = tmp_path / "time_series.nc"
+@pytest.fixture
+def ts_data_tmp(periods=10):
     basins = ["basin1", "basin2", "basin3"]
-    data = xr.Dataset(
+    return xr.Dataset(
         {
             "flow": (("time", "basin"), np.random.rand(periods, 3)),
             "prcp": (("time", "basin"), np.random.rand(periods, 3)),
@@ -275,32 +273,25 @@ def create_temp_netCDF(tmp_path, periods=10):
             "basin": basins,
         },
     )
-    data.to_netcdf(ts_file)
-    return str(ts_file)
-
-
-@pytest.fixture
-def ts_file_fixture(tmp_path):
-    return create_temp_netCDF(tmp_path)
 
 
-def test_cross_valid_data(ts_file_fixture):
+def test_cross_valid_data(ts_data_tmp):
     period = ("2022-01-01", "2022-01-10")
     warmup = 3
     cv_fold = 3
 
-    train_test_data = cross_valid_data(ts_file_fixture, period, warmup, cv_fold)
+    train_test_data = cross_valid_data(ts_data_tmp, period, warmup, cv_fold)
 
     assert len(train_test_data) == cv_fold
 
 
-def test_split_train_test(ts_file_fixture):
+def test_split_train_test(ts_data_tmp):
     # Define the train and test periods
     train_period = ("2022-01-01", "2022-01-05")
     test_period = ("2022-01-06", "2022-01-10")
 
     # Call the function to split the data
-    train_data, test_data = split_train_test(ts_file_fixture, train_period, test_period)
+    train_data, test_data = split_train_test(ts_data_tmp, train_period, test_period)
 
     # Assert that the train and test data have the correct length and shape
     basins = ["basin1", "basin2", "basin3"]