diff --git a/env-dev.yml b/env-dev.yml index bf5d19a..a0bfd98 100644 --- a/env-dev.yml +++ b/env-dev.yml @@ -18,7 +18,6 @@ dependencies: - sphinx - black - flake8 - - pytest # pip - pip - pip: diff --git a/hydromodel/datasets/data_preprocess.py b/hydromodel/datasets/data_preprocess.py index 0560de4..266efb5 100644 --- a/hydromodel/datasets/data_preprocess.py +++ b/hydromodel/datasets/data_preprocess.py @@ -1,7 +1,7 @@ """ Author: Wenyu Ouyang Date: 2022-10-25 21:16:22 -LastEditTime: 2024-03-25 19:54:15 +LastEditTime: 2024-03-26 19:18:29 LastEditors: Wenyu Ouyang Description: preprocess data for models in hydro-model-xaj FilePath: \hydro-model-xaj\hydromodel\datasets\data_preprocess.py @@ -10,15 +10,15 @@ import os import re +from hydrodataset import Camels import numpy as np import pandas as pd from sklearn.model_selection import KFold -from collections import OrderedDict import xarray as xr -from hydroutils import hydro_time, hydro_file +from hydrodata.utils.utils import streamflow_unit_conv -from hydromodel import CACHE_DIR +from hydromodel import CACHE_DIR, SETTING from hydromodel.datasets import * @@ -293,7 +293,8 @@ def split_train_test(ts_data, train_period, test_period): Returns ------- - None + tuple of xr.Dataset + A tuple of xr.Dataset for training and testing data """ # Convert date strings to pandas datetime objects train_start, train_end = pd.to_datetime(train_period[0]), pd.to_datetime( @@ -387,3 +388,93 @@ def cross_valid_data(ts_data, period, warmup, cv_fold, freq="1D"): train_test_data.append((train_data, test_data)) return train_test_data + + +def get_ts_from_diffsource(data_type, data_dir, periods, basin_ids): + """Get time series data from different sources and unify the format and unit of streamflow. + + Parameters + ---------- + data_type + The type of the data source, 'camels' or 'owndata' + data_dir + The directory of the data source + periods + The periods of the time series data + basin_ids + The ids of the basins + + Returns + ------- + xr.Dataset + The time series data + + Raises + ------ + NotImplementedError + The data type is not 'camels' or 'owndata' + """ + prcp_name = remove_unit_from_name(PRCP_NAME) + pet_name = remove_unit_from_name(PET_NAME) + flow_name = remove_unit_from_name(FLOW_NAME) + area_name = remove_unit_from_name(AREA_NAME) + if data_type == "camels": + camels_data_dir = os.path.join( + SETTING["local_data_path"]["datasets-origin"], "camels", data_dir + ) + camels = Camels(camels_data_dir) + ts_data = camels.read_ts_xrdataset( + basin_ids, periods, ["prcp", "PET", "streamflow"] + ) + basin_area = camels.read_area(basin_ids) + # trans unit to mm/day + qobs_ = ts_data[["streamflow"]] + target_unit = ts_data["prcp"].attrs.get("units", "unknown") + r_mmd = streamflow_unit_conv(qobs_, basin_area, target_unit=target_unit) + ts_data[flow_name] = r_mmd["streamflow"] + ts_data[flow_name].attrs["units"] = target_unit + ts_data = ts_data.rename({"PET": pet_name}) + # ts_data = ts_data.drop_vars('streamflow') + elif data_type == "owndata": + ts_data = xr.open_dataset( + os.path.join(os.path.dirname(data_dir), "timeseries.nc") + ) + attr_data = xr.open_dataset( + os.path.join(os.path.dirname(data_dir), "attributes.nc") + ) + basin_area = attr_data[area_name].values + target_unit = ts_data[prcp_name].attrs.get("units", "unknown") + qobs_ = ts_data[[flow_name]] + r_mmd = streamflow_unit_conv(qobs_, basin_area, target_unit=target_unit) + ts_data[flow_name] = r_mmd[flow_name] + ts_data[flow_name].attrs["units"] = target_unit + else: + raise NotImplementedError( + "You should set the data type as 'camels' or 'owndata'" + ) + + return ts_data + + +def get_pe_q_from_ts(ts_xr_dataset): + """Transform the time series data to the format that can be used in the calibration process + + Parameters + ---------- + ts_xr_dataset : xr.Dataset + The time series data + + Returns + ------- + tuple[np.ndarray, np.ndarray] + The tuple contains the precipitation and evaporation data and the observed streamflow data + """ + prcp_name = remove_unit_from_name(PRCP_NAME) + pet_name = remove_unit_from_name(PET_NAME) + flow_name = remove_unit_from_name(FLOW_NAME) + p_and_e = ( + ts_xr_dataset[[prcp_name, pet_name]].to_array().to_numpy().transpose(2, 1, 0) + ) + qobs = np.expand_dims(ts_xr_dataset[flow_name].to_numpy().transpose(1, 0), axis=2) + + return p_and_e, qobs diff --git a/hydromodel/trainers/train_utils.py b/hydromodel/trainers/train_utils.py index 9dd6341..b6a42db 100644 --- a/hydromodel/trainers/train_utils.py +++ b/hydromodel/trainers/train_utils.py @@ -1,7 +1,7 @@ """ Author: Wenyu Ouyang Date: 2022-10-25 21:16:22 -LastEditTime: 2024-03-22 20:07:08 +LastEditTime: 2024-03-26 18:20:57 LastEditors: Wenyu Ouyang Description: Plots for calibration and testing results FilePath: \hydro-model-xaj\hydromodel\trainers\train_utils.py @@ -64,7 +64,6 @@ def plot_train_iteration(likelihood, save_fig): def show_calibrate_result( - spot_setup, sceua_calibrated_file, warmup_length, save_dir, @@ -79,8 +78,6 @@ def show_calibrate_result( Parameters ---------- - spot_setup - Spotpy's setup class instance sceua_calibrated_file the result file saved after optimizing basin_id diff --git a/scripts/calibrate_xaj.py b/scripts/calibrate_xaj.py index 1e4d462..1367c4d 100644 --- a/scripts/calibrate_xaj.py +++ b/scripts/calibrate_xaj.py @@ -1,7 +1,7 @@ """ Author: Wenyu Ouyang Date: 2022-11-19 17:27:05 -LastEditTime: 2024-03-26 16:54:05 +LastEditTime: 2024-03-26 18:55:25 LastEditors: Wenyu Ouyang Description: the script to calibrate a model for CAMELS basin FilePath: \hydro-model-xaj\scripts\calibrate_xaj.py @@ -14,18 +14,18 @@ import sys import os from pathlib import Path -import xarray as xr import yaml -from hydrodataset import Camels -from hydrodata.utils.utils import streamflow_unit_conv - repo_path = os.path.dirname(Path(os.path.abspath(__file__)).parent) sys.path.append(repo_path) -from hydromodel import SETTING from hydromodel.datasets import * -from hydromodel.datasets.data_preprocess import cross_valid_data, split_train_test +from hydromodel.datasets.data_preprocess import ( + cross_valid_data, + split_train_test, + get_ts_from_diffsource, + get_pe_q_from_ts, +) from hydromodel.trainers.calibrate_sceua import calibrate_by_sceua @@ -42,22 +42,7 @@ def calibrate(args): model_info = args.model algo_info = args.algorithm loss = args.loss - if data_type == "camels": - camels_data_dir = os.path.join( - SETTING["local_data_path"]["datasets-origin"], "camels", data_dir - ) - camels = Camels(camels_data_dir) - ts_data = camels.read_ts_xrdataset( - basin_ids, periods, ["prcp", "PET", "streamflow"] - ) - elif data_type == "owndata": - ts_data = xr.open_dataset( - os.path.join(os.path.dirname(data_dir), "timeseries.nc") - ) - else: - raise NotImplementedError( - "You should set the data type as 'camels' or 'owndata'" - ) + ts_data = get_ts_from_diffsource(data_type, data_dir, periods, basin_ids) where_save = Path(os.path.join(repo_path, "result", exp)) if os.path.exists(where_save) is False: @@ -68,55 +53,39 @@ def calibrate(args): periods = np.sort( [train_period[0], train_period[1], test_period[0], test_period[1]] ) - if cv_fold > 1: - train_and_test_data = cross_valid_data(ts_data, periods, warmup, cv_fold) - else: - # when using train_test_split, the warmup period is not used - # so you should include the warmup period in the train and test period train_and_test_data = split_train_test(ts_data, train_period, test_period) + else: + # cross validation + train_and_test_data = cross_valid_data(ts_data, periods, warmup, cv_fold) + print("Start to calibrate the model") - if data_type == "camels": - basin_area = camels.read_area(basin_ids) - p_and_e = ( - train_and_test_data[0][["prcp", "PET"]] - .to_array() - .to_numpy() - .transpose(2, 1, 0) - ) - # trans unit to mm/day - qobs_ = train_and_test_data[0][["streamflow"]] - r_mmd = streamflow_unit_conv(qobs_, basin_area, target_unit="mm/d") - qobs = np.expand_dims(r_mmd["streamflow"].to_numpy().transpose(1, 0), axis=2) - elif data_type == "owndata": - attr_data = xr.open_dataset( - os.path.join(os.path.dirname(data_dir), "attributes.nc") - ) - basin_area = attr_data["area"].values - p_and_e = ( - train_and_test_data[0][[PRCP_NAME, PET_NAME]] - .to_array() - .to_numpy() - .transpose(2, 1, 0) - ) - qobs = np.expand_dims( - train_and_test_data[0][[FLOW_NAME]].to_array().to_numpy().transpose(1, 0), - axis=2, + if cv_fold <= 1: + p_and_e, qobs = get_pe_q_from_ts(train_and_test_data[0]) + calibrate_by_sceua( + basin_ids, + p_and_e, + qobs, + os.path.join(where_save, "sceua_xaj"), + warmup, + model=model_info, + algorithm=algo_info, + loss=loss, ) else: - raise NotImplementedError( - "You should set the data type as 'camels' or 'owndata'" - ) - calibrate_by_sceua( - basin_ids, - p_and_e, - qobs, - os.path.join(where_save, "sceua_xaj"), - warmup, - model=model_info, - algorithm=algo_info, - loss=loss, - ) + for i in range(cv_fold): + train_data, _ = train_and_test_data[i] + p_and_e_cv, qobs_cv = get_pe_q_from_ts(train_data) + calibrate_by_sceua( + basin_ids, + p_and_e_cv, + qobs_cv, + os.path.join(where_save, f"sceua_xaj_cv{i+1}"), + warmup, + model=model_info, + algorithm=algo_info, + loss=loss, + ) # Convert the arguments to a dictionary args_dict = vars(args) # Save the arguments to a YAML file @@ -155,7 +124,7 @@ def calibrate(args): "--cv_fold", dest="cv_fold", help="the number of cross-validation fold", - default=1, + default=2, type=int, ) parser.add_argument( diff --git a/scripts/calibrate_xaj_for_multicases.py b/scripts/calibrate_xaj_for_multicases.py index 64d3021..babb920 100644 --- a/scripts/calibrate_xaj_for_multicases.py +++ b/scripts/calibrate_xaj_for_multicases.py @@ -13,7 +13,7 @@ from pathlib import Path sys.path.append(os.path.dirname(Path(os.path.abspath(__file__)).parent)) -from scripts.evaluate_xaj import calibrate +from scripts.evaluate_xaj import evaluate matplotlib.use("Agg") exp = "exp61561" @@ -49,4 +49,4 @@ def __init__(self, exp, book, source, rep, ngs): rep = reps[i] ng = ngs[j] xaj_calibrate = XAJCalibrateMultiCases(exp, book, source, rep, ng) - calibrate(xaj_calibrate) + evaluate(xaj_calibrate) diff --git a/scripts/evaluate_xaj.py b/scripts/evaluate_xaj.py index 7a44e97..f529e9c 100644 --- a/scripts/evaluate_xaj.py +++ b/scripts/evaluate_xaj.py @@ -1,16 +1,17 @@ import argparse -import json import socket import fnmatch from datetime import datetime import numpy as np import pandas as pd +import yaml import os import sys from pathlib import Path from hydroutils import hydro_file -sys.path.append(os.path.dirname(Path(os.path.abspath(__file__)).parent)) +repo_path = os.path.dirname(Path(os.path.abspath(__file__)).parent) +sys.path.append(repo_path) from hydromodel.trainers.calibrate_sceua import calibrate_by_sceua from hydromodel.datasets.data_postprocess import ( renormalize_params, @@ -24,51 +25,28 @@ from hydromodel.trainers.calibrate_ga import calibrate_by_ga, show_ga_result -def calibrate(args): +def read_yaml_config(file_path): + with open(file_path, "r") as file: + config = yaml.safe_load(file) + return config + + +def evaluate(args): exp = args.exp warmup = args.warmup_length - data_dir = os.path.join( - "D:/研究生/毕业论文/new毕业论文/预答辩/碧流河水库/模型运行/" - ) - kfold = [ - int(f_name[len("data_info_fold") : -len("_test.json")]) - for f_name in os.listdir(data_dir) # 输出文件夹下的所有文件 - if fnmatch.fnmatch(f_name, "*_fold*_test.json") - ] + cali_dir = Path(os.path.join(repo_path, "result", exp)) + cali_config = read_yaml_config(os.path.join(cali_dir, "config.yaml")) kfold = np.sort(kfold) for fold in kfold: print(f"Start to calibrate the {fold}-th fold") - train_data_info_file = os.path.join( - data_dir, f"data_info_fold{str(fold)}_train.json" - ) - train_data_file = os.path.join(data_dir, f"data_info_fold{str(fold)}_train.npy") - test_data_info_file = os.path.join( - data_dir, f"data_info_fold{str(fold)}_test.json" - ) - test_data_file = os.path.join(data_dir, f"data_info_fold{str(fold)}_test.npy") - if ( - os.path.exists(train_data_info_file) is False - or os.path.exists(train_data_file) is False - or os.path.exists(test_data_info_file) is False - or os.path.exists(test_data_file) is False - ): - raise FileNotFoundError( - "The data files are not found, please run datapreprocess4calibrate.py first." - ) - data_train = hydro_file.unserialize_numpy(train_data_file) - data_test = hydro_file.unserialize_numpy(test_data_file) - data_info_train = hydro_file.unserialize_json_ordered(train_data_info_file) - data_info_test = hydro_file.unserialize_json_ordered(test_data_info_file) current_time = datetime.now().strftime("%b%d_%H-%M-%S") save_dir = os.path.join( - data_dir, + cali_dir, current_time + "_" + socket.gethostname() + "_fold" + str(fold) - + "_" - + comment, ) # 读输入文件 if os.path.exists(save_dir) is False: @@ -86,16 +64,6 @@ def calibrate(args): if not os.path.exists(spotpy_db_dir): os.makedirs(spotpy_db_dir) db_name = os.path.join(spotpy_db_dir, "SCEUA_" + model_info["name"]) - - sampler = calibrate_by_sceua( # 率定 - data_train[:, i : i + 1, 0:2], - data_train[:, i : i + 1, -1:], - db_name, - warmup_length=warmup, - model=model_info, - algorithm=algo_info, - ) - show_calibrate_result( # 展示率定结果 sampler.setup, db_name, @@ -223,4 +191,4 @@ def calibrate(args): type=int, ) the_args = parser.parse_args() - calibrate(the_args) + evaluate(the_args)