From 9fe2ba23770657993543dbb88e9565498d1e916c Mon Sep 17 00:00:00 2001 From: ouyangwenyu Date: Mon, 25 Mar 2024 21:58:36 +0800 Subject: [PATCH] add a script for 1-CAMELS-basin calibraion --- hydromodel/datasets/data_preprocess.py | 14 +-- hydromodel/trainers/calibrate_sceua.py | 3 - scripts/calibrate_xaj_camels.py | 155 +++++++++++++++++++++++++ scripts/datapreprocess4calibrate.py | 146 ----------------------- test/test_data_preprocess.py | 23 ++-- 5 files changed, 168 insertions(+), 173 deletions(-) create mode 100644 scripts/calibrate_xaj_camels.py delete mode 100644 scripts/datapreprocess4calibrate.py diff --git a/hydromodel/datasets/data_preprocess.py b/hydromodel/datasets/data_preprocess.py index 89ef33a..0560de4 100644 --- a/hydromodel/datasets/data_preprocess.py +++ b/hydromodel/datasets/data_preprocess.py @@ -278,14 +278,14 @@ def process_and_save_data_as_nc( return True -def split_train_test(ts_file, train_period, test_period): +def split_train_test(ts_data, train_period, test_period): """ Split all data to train and test parts with same format Parameters ---------- - ts_file - nc file of all time series data + ts_data: xr.Dataset + time series data train_period training period test_period @@ -295,7 +295,6 @@ def split_train_test(ts_file, train_period, test_period): ------- None """ - ts_data = xr.open_dataset(ts_file) # Convert date strings to pandas datetime objects train_start, train_end = pd.to_datetime(train_period[0]), pd.to_datetime( train_period[1] @@ -332,14 +331,14 @@ def validate_freq(freq): return False -def cross_valid_data(ts_file, period, warmup, cv_fold, freq="1D"): +def cross_valid_data(ts_data, period, warmup, cv_fold, freq="1D"): """ Split all data to train and test parts with same format for cross validation. Parameters ---------- - ts_file : str - Path to the NetCDF file of time series data. + ts_data : xr.Dataset + time series data. period : tuple of str The whole period in the format ("start_date", "end_date"). warmup : int @@ -358,7 +357,6 @@ def cross_valid_data(ts_file, period, warmup, cv_fold, freq="1D"): raise ValueError( "Time unit must be number with either 'Y','M','W','D','h','m' or 's', such as 3D." ) - ts_data = xr.open_dataset(ts_file) # Convert the whole period to pandas datetime start_date, end_date = pd.to_datetime(period[0]), pd.to_datetime(period[1]) diff --git a/hydromodel/trainers/calibrate_sceua.py b/hydromodel/trainers/calibrate_sceua.py index 7e3c290..893b2bb 100644 --- a/hydromodel/trainers/calibrate_sceua.py +++ b/hydromodel/trainers/calibrate_sceua.py @@ -5,9 +5,6 @@ from spotpy.parameter import Uniform, ParameterSet from hydromodel.models.model_config import MODEL_PARAM_DICT from hydromodel.models.model_dict import CRITERION_DICT, MODEL_DICT -from hydromodel.models.gr4j import gr4j -from hydromodel.models.hymod import hymod -from hydromodel.models.xaj import xaj class SpotSetup(object): diff --git a/scripts/calibrate_xaj_camels.py b/scripts/calibrate_xaj_camels.py new file mode 100644 index 0000000..579a86f --- /dev/null +++ b/scripts/calibrate_xaj_camels.py @@ -0,0 +1,155 @@ +""" +Author: Wenyu Ouyang +Date: 2022-11-19 17:27:05 +LastEditTime: 2024-03-25 20:52:08 +LastEditors: Wenyu Ouyang +Description: the script to calibrate a model for CAMELS basin +FilePath: \hydro-model-xaj\scripts\calibrate_xaj_camels.py +Copyright (c) 2021-2022 Wenyu Ouyang. All rights reserved. +""" + +import numpy as np +import argparse +import sys +import os +from pathlib import Path + +from hydrodataset import Camels +from hydrodata.utils.utils import streamflow_unit_conv + +repo_path = os.path.dirname(Path(os.path.abspath(__file__)).parent) +sys.path.append(repo_path) +from hydromodel import SETTING +from hydromodel.datasets.data_preprocess import cross_valid_data, split_train_test +from hydromodel.trainers.calibrate_sceua import calibrate_by_sceua + + +def main(args): + camels_name = args.camels_name + exp = args.exp + cv_fold = args.cv_fold + train_period = args.calibrate_period + test_period = args.test_period + periods = args.period + warmup = args.warmup + basin_ids = args.basin_id + camels_data_dir = os.path.join( + SETTING["local_data_path"]["datasets-origin"], "camels", camels_name + ) + camels = Camels(camels_data_dir) + ts_data = camels.read_ts_xrdataset( + basin_ids, periods, ["prcp", "PET", "streamflow"] + ) + where_save = Path(os.path.join(repo_path, "result", exp)) + if os.path.exists(where_save) is False: + os.makedirs(where_save) + + if cv_fold <= 1: + # no cross validation + periods = np.sort( + [train_period[0], train_period[1], test_period[0], test_period[1]] + ) + if cv_fold > 1: + train_and_test_data = cross_valid_data(ts_data, periods, warmup, cv_fold) + else: + # when using train_test_split, the warmup period is not used + # so you should include the warmup period in the train and test period + train_and_test_data = split_train_test(ts_data, train_period, test_period) + print("Start to calibrate the model") + p_and_e = ( + train_and_test_data[0][["prcp", "PET"]].to_array().to_numpy().transpose(2, 1, 0) + ) + # trans unit to mm/day + qobs_ = train_and_test_data[0][["streamflow"]] + basin_area = camels.read_area(basin_ids) + r_mmd = streamflow_unit_conv(qobs_, basin_area, target_unit="mm/d") + qobs = np.expand_dims(r_mmd["streamflow"].to_numpy().transpose(1, 0), axis=2) + calibrate_by_sceua( + p_and_e, + qobs, + os.path.join(where_save, "sceua_xaj"), + warmup, + model={ + "name": "xaj_mz", + "source_type": "sources", + "source_book": "HF", + }, + algorithm={ + "name": "SCE_UA", + "random_seed": 1234, + "rep": 5, + "ngs": 7, + "kstop": 3, + "peps": 0.1, + "pcento": 0.1, + }, + metric={ + "type": "time_series", + "obj_func": "RMSE", + "events": None, + }, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Run hydro-model-xaj models with the CAMELS dataset" + ) + parser.add_argument( + "--camels_name", + dest="camels_name", + help="the name of CAMELS-formatted data directory", + default="camels_us", + type=str, + ) + parser.add_argument( + "--exp", + dest="exp", + help="An exp is corresponding to one data setting", + default="expcamels001", + type=str, + ) + parser.add_argument( + "--cv_fold", + dest="cv_fold", + help="the number of cross-validation fold", + default=1, + type=int, + ) + parser.add_argument( + "--warmup", + dest="warmup", + help="the number of warmup days", + default=365, + type=int, + ) + parser.add_argument( + "--period", + dest="period", + help="The whole period", + default=["2007-01-01", "2014-01-01"], + nargs="+", + ) + parser.add_argument( + "--calibrate_period", + dest="calibrate_period", + help="The training period", + default=["2007-01-01", "2014-01-01"], + nargs="+", + ) + parser.add_argument( + "--test_period", + dest="test_period", + help="The testing period", + default=["2007-01-01", "2014-01-01"], + nargs="+", + ) + parser.add_argument( + "--basin_id", + dest="basin_id", + help="The basins' ids", + default=["01439500", "06885500", "08104900", "09510200"], + nargs="+", + ) + the_args = parser.parse_args() + main(the_args) diff --git a/scripts/datapreprocess4calibrate.py b/scripts/datapreprocess4calibrate.py deleted file mode 100644 index 54b1b37..0000000 --- a/scripts/datapreprocess4calibrate.py +++ /dev/null @@ -1,146 +0,0 @@ -""" -Author: Wenyu Ouyang -Date: 2022-11-19 17:27:05 -LastEditTime: 2024-03-25 17:10:51 -LastEditors: Wenyu Ouyang -Description: the script to preprocess data for models in hydro-model-xaj -FilePath: \hydro-model-xaj\scripts\datapreprocess4calibrate.py -Copyright (c) 2021-2022 Wenyu Ouyang. All rights reserved. -""" - -import numpy as np -import hydrodataset -import argparse -import sys -import os -from pathlib import Path - -sys.path.append(os.path.dirname(Path(os.path.abspath(__file__)).parent)) -from hydromodel.datasets.data_preprocess import ( - cross_valid_data, - split_train_test, -) - - -def main(args): - camels_name = args.camels_name - exp = args.exp - cv_fold = args.cv_fold - train_period = args.calibrate_period - test_period = args.test_period - periods = args.period - warmup = args.warmup - basin_ids = args.basin_id - camels_data_dir = hydrodataset.ROOT_DIR - # where_save_cache = hydrodataset.CACHE_DIR - where_save_cache = Path( - os.path.join(definitions.ROOT_DIR, "hydromodel", "example", exp) - ) - if os.path.exists(where_save_cache) is False: - os.makedirs(where_save_cache) - json_file = where_save_cache.joinpath("data_info.json") - npy_file = where_save_cache.joinpath("basins_lump_p_pe_q.npy") - - if not (cv_fold > 1): - # no cross validation - periods = np.sort( - [train_period[0], train_period[1], test_period[0], test_period[1]] - ) - trans_camels_format_to_xaj_format( - camels_data_dir.joinpath("camels", camels_name), - basin_ids, - [periods[0], periods[-1]], - json_file, - npy_file, - ) - if cv_fold > 1: - cross_valid_data(json_file, npy_file, periods, warmup, cv_fold) - else: - # when using train_test_split, the warmup period is not used - # so you should include the warmup period in the train and test period - split_train_test(json_file, npy_file, train_period, test_period) - - -# python datapreprocess4calibrate.py --camels_name camels_cc --exp exp004 --calibrate_period 2014-10-01 2019-10-01 --test_period 2018-10-01 2021-10-01 --basin_id 60668 61561 63002 63007 63486 92354 94560 -# python datapreprocess4calibrate.py --camels_name camels_cc --exp exp201 --cv_fold 2 --warmup 365 --period 2014-10-01 2021-10-01 --basin_id 60650 60668 61239 61277 61561 61716 62618 63002 63486 63490 92354 94560 -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Prepare data for hydro-model-xaj models." - ) - parser.add_argument( - "--camels_name", - dest="camels_name", - help="the name of CAMELS-formatted data directory", - default="camels_cc", - # default="camels_us", - type=str, - ) - parser.add_argument( - "--exp", - dest="exp", - help="An exp is corresponding to one data setting", - default="exp004", - type=str, - ) - parser.add_argument( - "--cv_fold", - dest="cv_fold", - help="the number of cross-validation fold", - default=1, - type=int, - ) - parser.add_argument( - "--warmup", - dest="warmup", - help="the number of warmup days", - default=365, - type=int, - ) - parser.add_argument( - "--period", - dest="period", - help="The whole period", - default=["2014-10-01", "2021-10-01"], - # default=["2007-01-01", "2014-01-01"], - nargs="+", - ) - parser.add_argument( - "--calibrate_period", - dest="calibrate_period", - help="The training period", - default=["2016-10-01", "2021-10-01"], - # default=["2007-01-01", "2014-01-01"], - nargs="+", - ) - parser.add_argument( - "--test_period", - dest="test_period", - help="The testing period", - default=["2014-10-01", "2017-10-01"], - # default=["2007-01-01", "2014-01-01"], - nargs="+", - ) - parser.add_argument( - "--basin_id", - dest="basin_id", - help="The basins' ids", - default=["60668", "61561", "63002", "63007", "63486", "92354", "94560"], - # default=[ - # "60650", - # "60668", - # "61239", - # "61277", - # "61561", - # "61716", - # "62618", - # "63002", - # "63486", - # "63490", - # "92354", - # "94560", - # ], - # default=["01439500", "06885500", "08104900", "09510200"], - nargs="+", - ) - the_args = parser.parse_args() - main(the_args) diff --git a/test/test_data_preprocess.py b/test/test_data_preprocess.py index 78b5ea9..fae225b 100644 --- a/test/test_data_preprocess.py +++ b/test/test_data_preprocess.py @@ -4,7 +4,6 @@ import os import pandas as pd import xarray as xr -from sklearn.model_selection import KFold from hydromodel import SETTING from hydromodel.datasets import * @@ -261,11 +260,10 @@ def test_load_dataset(): print(data) -def create_temp_netCDF(tmp_path, periods=10): - """temp NetCDF file for test""" - ts_file = tmp_path / "time_series.nc" +@pytest.fixture +def ts_data_tmp(periods=10): basins = ["basin1", "basin2", "basin3"] - data = xr.Dataset( + return xr.Dataset( { "flow": (("time", "basin"), np.random.rand(periods, 3)), "prcp": (("time", "basin"), np.random.rand(periods, 3)), @@ -275,32 +273,25 @@ def create_temp_netCDF(tmp_path, periods=10): "basin": basins, }, ) - data.to_netcdf(ts_file) - return str(ts_file) - - -@pytest.fixture -def ts_file_fixture(tmp_path): - return create_temp_netCDF(tmp_path) -def test_cross_valid_data(ts_file_fixture): +def test_cross_valid_data(ts_data_tmp): period = ("2022-01-01", "2022-01-10") warmup = 3 cv_fold = 3 - train_test_data = cross_valid_data(ts_file_fixture, period, warmup, cv_fold) + train_test_data = cross_valid_data(ts_data_tmp, period, warmup, cv_fold) assert len(train_test_data) == cv_fold -def test_split_train_test(ts_file_fixture): +def test_split_train_test(ts_data_tmp): # Define the train and test periods train_period = ("2022-01-01", "2022-01-05") test_period = ("2022-01-06", "2022-01-10") # Call the function to split the data - train_data, test_data = split_train_test(ts_file_fixture, train_period, test_period) + train_data, test_data = split_train_test(ts_data_tmp, train_period, test_period) # Assert that the train and test data have the correct length and shape basins = ["basin1", "basin2", "basin3"]