diff --git a/definitions.py b/definitions.py
index b45dc23..dcadc13 100644
--- a/definitions.py
+++ b/definitions.py
@@ -10,7 +10,7 @@
 import os
 from pathlib import Path
 
-ROOT_DIR = os.path.dirname(os.path.abspath(__file__))  # This is your Project Root
+ROOT_DIR = os.path.dirname(os.path.abspath('/home/ldaning/code/biye/hydro-model-xaj/definitions.py'))  # This is your Project Root
 path = Path(ROOT_DIR)
 DATASET_DIR = os.path.join(path.parent.parent.absolute(), "data")
 print("Please Check your directory:")
diff --git a/hydromodel/calibrate/calibrate_sceua.py b/hydromodel/calibrate/calibrate_sceua.py
index 3ff3b76..e65f818 100644
--- a/hydromodel/calibrate/calibrate_sceua.py
+++ b/hydromodel/calibrate/calibrate_sceua.py
@@ -1,6 +1,7 @@
 from typing import Union
 import numpy as np
 import spotpy
+import pandas as pd
 from spotpy.parameter import Uniform, ParameterSet
 from spotpy.objectivefunctions import rmse
 from hydromodel.models.model_config import MODEL_PARAM_DICT
@@ -14,7 +15,7 @@ def __init__(
         self,
         p_and_e,
         qobs,
-        warmup_length=30,
+        warmup_length=365,
         model={
             "name": "xaj_mz",
             "source_type": "sources",
@@ -127,24 +128,52 @@ def objectivefunction(
         float
             likelihood
         """
-        # SPOTPY expects to get one or multiple values back,
-        # that define the performance of the model run
+        # slice by flood calibration periods (kept commented out for reference)
+        # time = pd.read_excel('/home/ldaning/code/biye/hydro-model-xaj/hydromodel/example/zhandian/洪水率定时间1.xlsx')
+        # calibrate_starttime = pd.to_datetime("2014-7-10 0:00")
+        # calibrate_endtime = pd.to_datetime("2020-6-24 0:00")
+        # total = 0
+        # count = 0
+        # for i in range(len(time)):
+        #     if(time.iloc[i,0]
diff --git a/hydromodel/models/xaj.py b/hydromodel/models/xaj.py
 def calculate_evap(lm, c, wu0, wl0, prcp, pet) -> tuple[np.array, np.array, np.array]:
     """
     Three-layers evaporation model from "Watershed Hydrologic Simulation" written by Prof. RenJun Zhao.
@@ -62,7 +63,8 @@ def calculate_evap(lm, c, wu0, wl0, prcp, pet) -> tuple[np.a
     return eu, el, ed
 
 
-@jit
+# @jit
+@jit(nopython=True)
 def calculate_prcp_runoff(b, im, wm, w0, pe) -> tuple[np.array, np.array]:
     """
     Calculates the amount of runoff generated from rainfall after entering the underlying surface.
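A note on the `-@jit` / `+@jit(nopython=True)` edits in `hydromodel/models/xaj.py`: with `nopython=True`, numba fails loudly at compile time if it cannot compile the whole function to machine code, whereas a bare `@jit` on older numba releases could silently fall back to the much slower object mode. A minimal sketch of the decorator's behavior (`weighted_sum` is a made-up function, not repo code):

```python
# Minimal numba sketch: nopython mode either compiles the whole loop to
# machine code or raises a TypingError, instead of degrading to object mode.
import numpy as np
from numba import jit

@jit(nopython=True)
def weighted_sum(x, w):
    total = 0.0
    for i in range(x.shape[0]):  # plain loops like this are what numba accelerates
        total += x[i] * w[i]
    return total

print(weighted_sum(np.arange(5.0), np.ones(5)))  # 10.0
```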
@@ -424,7 +426,7 @@ def sources5mm( kg, s0=None, fr0=None, - time_interval_hours=24, + time_interval_hours=1, book="HF", ): """ @@ -606,7 +608,8 @@ def sources5mm( return (rs, rss, rg), (s_ds[-1], fr_ds[-1]) -@jit +# @jit +@jit(nopython=True) def linear_reservoir(x, weight, last_y=None) -> np.array: """ Linear reservoir's release function @@ -712,7 +715,7 @@ def xaj( p_and_e, params: Union[np.array, list], return_state=False, - warmup_length=30, + warmup_length=365, **kwargs, ) -> Union[tuple, np.array]: """ @@ -748,7 +751,7 @@ def xaj( """ # default values for some function parameters model_name = kwargs["name"] if "name" in kwargs else "xaj" - source_type = kwargs["source_type"] if "source_type" in kwargs else "sources" + source_type = kwargs["source_type"] if "source_type" in kwargs else "sources5mm" source_book = kwargs["source_book"] if "source_book" in kwargs else "HF" # params param_ranges = MODEL_PARAM_DICT[model_name]["param_range"] @@ -764,6 +767,8 @@ def xaj( (value[1] - value[0]) * params[:, i] + value[0] for i, (key, value) in enumerate(param_ranges.items()) ] + # xaj_params = [param_ranges[key] for key in param_ranges] + k = xaj_params[0] b = xaj_params[1] im = xaj_params[2] @@ -778,6 +783,7 @@ def xaj( # ki+kg should be smaller than 1; if not, we scale them ki = np.where(ki_ + kg_ < 1.0, ki_, (1.0 - PRECISION) / (ki_ + kg_) * ki_) kg = np.where(ki_ + kg_ < 1.0, kg_, (1.0 - PRECISION) / (ki_ + kg_) * kg_) + if route_method == "CSL": cs = xaj_params[11] l = xaj_params[12] @@ -789,7 +795,7 @@ def xaj( kernel_size = int(xaj_params[15]) else: raise NotImplementedError( - "We don't provide this route method now! Please use 'CS' or 'MZ'!" + "We don't provide this route method now! Please use 'CSL' or 'MZ'!" ) ci = xaj_params[13] cg = xaj_params[14] @@ -811,6 +817,7 @@ def xaj( qi0 = np.full(ci.shape, 0.1) qg0 = np.full(cg.shape, 0.1) + # state_variables inputs = p_and_e[warmup_length:, :, :] runoff_ims_ = np.full(inputs.shape[:2], 0.0) diff --git a/hydromodel/utils/hydro_utils.py b/hydromodel/utils/hydro_utils.py new file mode 100644 index 0000000..bd402b2 --- /dev/null +++ b/hydromodel/utils/hydro_utils.py @@ -0,0 +1,514 @@ +import json +import os +import re +import zipfile +import datetime as dt, datetime +from typing import List +import pickle +from collections import OrderedDict +import numpy as np +import urllib +from urllib import parse + +import requests +import matplotlib.pyplot as plt +from itertools import combinations + +import threading +import functools + +import tqdm + +import logging + + +# -----------------------------------------------logger setting---------------------------------------------------- +def get_hydro_logger(log_level_param): + logger = logging.getLogger(__name__) + # StreamHandler + stream_handler = logging.StreamHandler() # console stream output + stream_handler.setLevel(level=log_level_param) + logger.addHandler(stream_handler) + return logger + + +log_level = logging.INFO +hydro_logger = get_hydro_logger(log_level) + + +# ------------------------------------------------progress bar---------------------------------------------------- +def provide_progress_bar( + function, estimated_time, tstep=0.2, tqdm_kwargs={}, args=[], kwargs={} +): + """ + Tqdm wrapper for a long-running function + + Parameters + ---------- + function + function to run + estimated_time + how long you expect the function to take + tstep + time delta (seconds) for progress bar updates + tqdm_kwargs + kwargs to construct the progress bar + args + args to pass to the function + 
kwargs
+        keyword args to pass to the function
+
+    Returns
+    -------
+    the return value of the wrapped function
+    """
+    ret = [None]  # Mutable var so the function can store its return value
+
+    def myrunner(function, ret, *args, **kwargs):
+        ret[0] = function(*args, **kwargs)
+
+    thread = threading.Thread(
+        target=myrunner, args=(function, ret) + tuple(args), kwargs=kwargs
+    )
+    pbar = tqdm.tqdm(total=estimated_time, **tqdm_kwargs)
+
+    thread.start()
+    while thread.is_alive():
+        thread.join(timeout=tstep)
+        pbar.update(tstep)
+    pbar.close()
+    return ret[0]
+
+
+def progress_wrapped(estimated_time, tstep=0.2, tqdm_kwargs={}):
+    """Decorate a function to add a progress bar"""
+
+    def real_decorator(function):
+        @functools.wraps(function)
+        def wrapper(*args, **kwargs):
+            return provide_progress_bar(
+                function,
+                estimated_time=estimated_time,
+                tstep=tstep,
+                tqdm_kwargs=tqdm_kwargs,
+                args=args,
+                kwargs=kwargs,
+            )
+
+        return wrapper
+
+    return real_decorator
+
+
+def setup_log(tag="VOC_TOPICS"):
+    # create logger
+    logger = logging.getLogger(tag)
+    # logger.handlers = []
+    logger.propagate = False
+    logger.setLevel(logging.DEBUG)
+    # create console handler and set level to debug
+    ch = logging.StreamHandler()
+    ch.setLevel(logging.DEBUG)
+    # create formatter
+    formatter = logging.Formatter(
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    )
+    # add formatter to ch
+    ch.setFormatter(formatter)
+    # add ch to logger
+    # logger.handlers = []
+    logger.addHandler(ch)
+    return logger
+
+
+# -------------------------------------------------plot utils-------------------------------------------------------
+def save_or_show_plot(file_nm: str, save: bool, save_path=""):
+    if save:
+        plt.savefig(os.path.join(save_path, file_nm))
+    else:
+        plt.show()
+
+
+# ----------------------------------------------------data tools--------------------------------------------------------
+def unzip_file(dataset_zip, path_unzip):
+    """extract a zip file"""
+    with zipfile.ZipFile(dataset_zip, "r") as zip_temp:
+        zip_temp.extractall(path_unzip)
+
+
+def unzip_nested_zip(dataset_zip, path_unzip):
+    """Extract a zip file including any nested zip files"""
+    with zipfile.ZipFile(dataset_zip, "r") as zfile:
+        zfile.extractall(path=path_unzip)
+    for root, dirs, files in os.walk(path_unzip):
+        for filename in files:
+            if re.search(r"\.zip$", filename):
+                file_spec = os.path.join(root, filename)
+                new_dir = os.path.join(root, filename[0:-4])
+                unzip_nested_zip(file_spec, new_dir)
+
+
+def zip_file_name_from_url(data_url, data_dir):
+    data_url_str = data_url.split("/")
+    filename = parse.unquote(data_url_str[-1])
+    zipfile_path = os.path.join(data_dir, filename)
+    unzip_dir = os.path.join(data_dir, filename[0:-4])
+    return zipfile_path, unzip_dir
+
+
+def is_there_file(zipfile_path, unzip_dir):
+    """check whether the file already exists; unzip it if only the zip is present"""
+    if os.path.isfile(zipfile_path):
+        if os.path.isdir(unzip_dir):
+            return True
+        unzip_nested_zip(zipfile_path, unzip_dir)
+        return True
+
+
+def download_one_zip(data_url, data_dir):
+    """download one zip file from url as data_file"""
+    zipfile_path, unzip_dir = zip_file_name_from_url(data_url, data_dir)
+    if not is_there_file(zipfile_path, unzip_dir):
+        if not os.path.isdir(unzip_dir):
+            os.mkdir(unzip_dir)
+        r = requests.get(data_url, stream=True)
+        with open(zipfile_path, "wb") as py_file:
+            for chunk in r.iter_content(chunk_size=1024):  # 1024 bytes
+                if chunk:
+                    py_file.write(chunk)
+        unzip_nested_zip(zipfile_path, unzip_dir)
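The `progress_wrapped` decorator above runs the wrapped call in a worker thread while the main thread ticks a tqdm bar. A hypothetical usage sketch (`slow_job` is made up, not repo code; the import path assumes the new module lands at `hydromodel/utils/hydro_utils.py` as in this diff):

```python
# Hypothetical usage of the progress_wrapped decorator defined above.
import time
from hydromodel.utils.hydro_utils import progress_wrapped

@progress_wrapped(estimated_time=5, tstep=0.2, tqdm_kwargs={"desc": "calibrating"})
def slow_job():
    time.sleep(5)  # stands in for a long model run
    return "done"

print(slow_job())  # the bar updates while the job runs in a background thread
```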
+
+
+def download_small_zip(data_url, data_dir):
+    """download a zip file and unzip it"""
+    zipfile_path, unzip_dir = zip_file_name_from_url(data_url, data_dir)
+    if not is_there_file(zipfile_path, unzip_dir):
+        if not os.path.isdir(unzip_dir):
+            os.mkdir(unzip_dir)
+        zipfile_path, _ = urllib.request.urlretrieve(data_url, zipfile_path)
+        unzip_nested_zip(zipfile_path, unzip_dir)
+
+
+def download_small_file(data_url, temp_file):
+    """download data from url to the temp_file"""
+    r = requests.get(data_url)
+    with open(temp_file, "w") as f:
+        f.write(r.text)
+
+
+def download_excel(data_url, temp_file):
+    """download an excel file according to its url"""
+    if not os.path.isfile(temp_file):
+        urllib.request.urlretrieve(data_url, temp_file)
+
+
+def download_a_file_from_google_drive(drive, dir_id, download_dir):
+    file_list = drive.ListFile(
+        {"q": "'" + dir_id + "' in parents and trashed=false"}
+    ).GetList()
+    for file in file_list:
+        print("title: %s, id: %s" % (file["title"], file["id"]))
+        file_dl = drive.CreateFile({"id": file["id"]})
+        print("mimetype is %s" % file_dl["mimeType"])
+        if file_dl["mimeType"] == "application/vnd.google-apps.folder":
+            download_dir_sub = os.path.join(download_dir, file_dl["title"])
+            if not os.path.isdir(download_dir_sub):
+                os.makedirs(download_dir_sub)
+            download_a_file_from_google_drive(drive, file_dl["id"], download_dir_sub)
+        else:
+            # download
+            temp_file = os.path.join(download_dir, file_dl["title"])
+            if os.path.isfile(temp_file):
+                print("file has been downloaded")
+                continue
+            file_dl.GetContentFile(os.path.join(download_dir, file_dl["title"]))
+            print("Downloading file finished")
+
+
+class NumpyArrayEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        return json.JSONEncoder.default(self, obj)
+
+
+def serialize_json_np(my_dict, my_file):
+    with open(my_file, "w") as FP:
+        json.dump(my_dict, FP, cls=NumpyArrayEncoder)
+
+
+def serialize_json(my_dict, my_file):
+    with open(my_file, "w") as FP:
+        json.dump(my_dict, FP, indent=4)
+
+
+def unserialize_json_ordered(my_file):
+    # m_file = os.path.join(my_file, 'master.json')
+    with open(my_file, "r") as fp:
+        m_dict = json.load(fp, object_pairs_hook=OrderedDict)
+    return m_dict
+
+
+def unserialize_json(my_file):
+    with open(my_file, "r") as fp:  # the context manager acquires and releases the file automatically
+        my_object = json.load(fp)
+    return my_object
+
+
+def serialize_pickle(my_object, my_file):
+    f = open(my_file, "wb")
+    pickle.dump(my_object, f)
+    f.close()
+
+
+def unserialize_pickle(my_file):
+    f = open(my_file, "rb")
+    my_object = pickle.load(f)
+    f.close()
+    return my_object
+
+
+def serialize_numpy(my_array, my_file):
+    np.save(my_file, my_array)
+
+
+def unserialize_numpy(my_file):
+    y = np.load(my_file, allow_pickle=True)  # read the .npy file
+    return y
+
+
+# -------------------------------------------------time & date tools--------------------------------------------------
+def t2dt(t, hr=False):
+    t_out = None
+    if type(t) is int:
+        if 30000000 > t > 10000000:
+            t = dt.datetime.strptime(str(t), "%Y%m%d").date()
+            t_out = t if hr is False else t.datetime()
+
+    if type(t) is dt.date:
+        t_out = t if hr is False else t.datetime()
+
+    if type(t) is dt.datetime:
+        t_out = t.date() if hr is False else t
+
+    if t_out is None:
+        raise Exception("hydroDL.utils.t2dt failed")
+    return t_out
+
+
+def t_range2_array(t_range, *, step=np.timedelta64(1, "D")):
+    sd = t2dt(t_range[0])
+    ed = t2dt(t_range[1])
+    tArray = np.arange(sd, ed, step)
+    return tArray
+
+
+def t_range_days(t_range, *, step=np.timedelta64(1, "h")):
+    sd = dt.datetime.strptime(t_range[0], "%Y-%m-%d %H:%M:%S")
+    ed = 
dt.datetime.strptime(t_range[1], "%Y-%m-%d %H:%M:%S") + t_array = np.arange(sd, ed, step) + return t_array + + +def t_days_lst2range(t_array): + if type(t_array[0]) == np.datetime64: + t0 = t_array[0].astype(datetime.datetime) + t1 = t_array[-1].astype(datetime.datetime) + else: + t0 = t_array[0] + t1 = t_array[-1] + sd = t0.strftime("%Y-%m-%d") + ed = t1.strftime("%Y-%m-%d") + return [sd, ed] + + +def t_range_years(t_range): + """t_range is a left-closed and right-open interval, if t_range[1] is not Jan.1 then end_year should be included""" + start_year = int(t_range[0].split("-")[0]) + end_year = int(t_range[1].split("-")[0]) + end_month = int(t_range[1].split("-")[1]) + end_day = int(t_range[1].split("-")[2]) + if end_month == 1 and end_day == 1: + year_range_list = np.arange(start_year, end_year) + else: + year_range_list = np.arange(start_year, end_year + 1) + return year_range_list + + +def get_year(a_time): + if isinstance(a_time, datetime.date): + return a_time.year + elif isinstance(a_time, np.datetime64): + return a_time.astype("datetime64[Y]").astype(int) + 1970 + else: + return int(a_time[0:4]) + + +def intersect(t_lst1, t_lst2): + C, ind1, ind2 = np.intersect1d(t_lst1, t_lst2, return_indices=True) + return ind1, ind2 + + +def date_to_julian(a_time): + if type(a_time) == str: + fmt = "%Y-%m-%d" + dt = datetime.datetime.strptime(a_time, fmt) + else: + dt = a_time + tt = dt.timetuple() + julian_date = tt.tm_yday + return julian_date + + +def t_range_to_julian(t_range): + t_array = t_range_days(t_range) + t_array_str = np.datetime_as_string(t_array) + julian_dates = [date_to_julian(a_time[0:10]) for a_time in t_array_str] + return julian_dates + + +# --------------------------------------------------MATH CALCULATION--------------------------------------------------- +def subset_of_dict(dict, chosen_keys): + """make a new dict from key-values of chosen keys in a list""" + return {key: value for key, value in dict.items() if key in chosen_keys} + + +def pair_comb(combine_attrs): + if len(combine_attrs) == 1: + values = list(combine_attrs[0].values())[0] + key = list(combine_attrs[0].keys())[0] + results = [] + for value in values: + d = dict() + d[key] = value + results.append(d) + return results + items_all = list() + for dict_item in combine_attrs: + list_temp = list(dict_item.values())[0] + items_all = items_all + list_temp + all_combs = list(combinations(items_all, 2)) + + def is_in_same_item(item1, item2): + for dict_item in combine_attrs: + list_now = list(dict_item.values())[0] + if item1 in list_now and item2 in list_now: + return True + return False + + def which_dict(item): + for dict_item in combine_attrs: + list_now = list(dict_item.values())[0] + if item in list_now: + return list(dict_item.keys())[0] + + combs = [comb for comb in all_combs if not is_in_same_item(comb[0], comb[1])] + combs_dict = [ + {which_dict(comb[0]): comb[0], which_dict(comb[1]): comb[1]} for comb in combs + ] + return combs_dict + + +def flat_data(x): + xArrayTemp = x.flatten() + xArray = xArrayTemp[~np.isnan(xArrayTemp)] + xSort = np.sort(xArray) + return xSort + + +def interpNan(x, mode="linear"): + if len(x.shape) == 1: + ngrid = 1 + nt = x.shape[0] + else: + ngrid, nt = x.shape + for k in range(ngrid): + xx = x[k, :] + xx = interpNan1d(xx, mode) + return x + + +def interpNan1d(x, mode="linear"): + i0 = np.where(np.isnan(x))[0] + i1 = np.where(~np.isnan(x))[0] + if len(i1) > 0: + if mode == "linear": + x[i0] = np.interp(i0, i1, x[i1]) + if mode == "pre": + x0 = x[i1[0]] + for k in 
range(len(x)): + if k in i0: + x[k] = x0 + else: + x0 = x[k] + + return x + + +def is_any_elem_in_a_lst(lst1, lst2, return_index=False, include=False): + do_exist = False + idx_lst = [] + for j in range(len(lst1)): + if include: + for lst2_elem in lst2: + if lst1[j] in lst2_elem: + idx_lst.append(j) + do_exist = True + else: + if lst1[j] in lst2: + idx_lst.append(j) + do_exist = True + if return_index: + return do_exist, idx_lst + return do_exist + + +def random_choice_no_return(arr, num_lst): + """sampling without replacement multi-times, and the num of each time is in num_lst""" + num_lst_arr = np.array(num_lst) + num_sum = num_lst_arr.sum() + if type(arr) == list: + arr = np.array(arr) + assert num_sum <= arr.size + results = [] + arr_residue = np.arange(arr.size) + for num in num_lst_arr: + idx_chosen = np.random.choice(arr_residue.size, num, replace=False) + chosen_idx_in_arr = np.sort(arr_residue[idx_chosen]) + results.append(arr[chosen_idx_in_arr]) + arr_residue = np.delete(arr_residue, idx_chosen) + return results + + +def find_integer_factors_close_to_square_root(integer): + start = int(np.sqrt(integer)) + factor = integer / start + while not is_integer(factor): + start += 1 + factor = integer / start + return int(factor), start + + +def is_integer(number): + if int(number) == number: + return True + else: + return False + + +def random_index(ngrid, nt, dim_subset): + batch_size, rho = dim_subset + i_grid = np.random.randint(0, ngrid, [batch_size]) + i_t = np.random.randint(0, nt - rho, [batch_size]) + return i_grid, i_t + + +def flatten_list_function(input_list: List): + return [item for sublist in input_list for item in sublist] + + +def nanlog(x): + if x != x: + return np.nan + else: + return np.log(x) diff --git a/hydromodel/utils/plots.py b/hydromodel/utils/plots.py index 0dd6061..03cc2e4 100644 --- a/hydromodel/utils/plots.py +++ b/hydromodel/utils/plots.py @@ -1,10 +1,10 @@ """ Author: Wenyu Ouyang Date: 2022-10-25 21:16:22 -LastEditTime: 2023-12-17 21:11:58 +LastEditTime: 2024-03-21 18:41:51 LastEditors: Wenyu Ouyang Description: Plots for calibration and testing results -FilePath: \\hydro-model-xaj\\hydromodel\\utils\\plots.py +FilePath: \hydro-model-xaj\hydromodel\utils\plots.py Copyright (c) 2021-2022 Wenyu Ouyang. All rights reserved. 
""" @@ -72,8 +72,9 @@ def show_calibrate_result( save_dir, basin_id, train_period, - result_unit="mm/day", + result_unit="mm/hour", basin_area=None, + prcp=None, ): """ Plot all year result to see the effect of optimized parameters @@ -98,23 +99,32 @@ def show_calibrate_result( None """ # Load the results gained with the sceua sampler, stored in SCEUA_xaj.csv - results = spotpy.analyser.load_csv_results(sceua_calibrated_file) + + # results = [] + # for chunk in pd.read_csv(sceua_calibrated_file, chunksize=100000 ): + # results.append(chunk) + # results = pd.concat(results) + results = spotpy.analyser.load_csv_results(sceua_calibrated_file) # 读取结果 # Plot how the objective function was minimized during sampling - if not os.path.exists(save_dir): + if not os.path.exists(save_dir): # 绘制采样过程中目标函数的最小化情况 os.makedirs(save_dir) plot_train_iteration( - results["like1"], os.path.join(save_dir, "train_iteration.png") + results["like1"], + os.path.join(save_dir, "train_iteration.png"), # 绘制迭代中的RMSE ) # Plot the best model run # Find the run_id with the minimal objective function value - bestindex, bestobjf = spotpy.analyser.get_minlikeindex(results) + bestindex, bestobjf = spotpy.analyser.get_minlikeindex( + results + ) # 绘制最佳模型图并找到run—id # Select best model run - best_model_run = results[bestindex] - # Filter results for simulation results + best_model_run = results[bestindex] # 选择最佳模型结果 + # Filter results for simulation results #最佳模型模拟结果 fields = [word for word in best_model_run.dtype.names if word.startswith("sim")] best_simulation = list(best_model_run[fields]) convert_unit_sim = units.convert_unit( np.array(best_simulation).reshape(1, -1), + # np.array(list(map(float, best_simulation)), dtype=float).reshape(1, -1), result_unit, units.unit["streamflow"], basin_area=basin_area, @@ -125,7 +135,8 @@ def show_calibrate_result( units.unit["streamflow"], basin_area=basin_area, ) - # save calibrated results of calibration period + + # save calibrated results of calibration period #保存率定的结果 train_result_file = os.path.join( save_dir, "train_qsim_" + spot_setup.model["name"] + "_" + str(basin_id) + ".csv", @@ -145,11 +156,36 @@ def show_calibrate_result( hydro_file.serialize_json_np( stat_error, os.path.join(save_dir, "train_metrics.json") ) + + # 循还画图 + # time = pd.read_excel('/home/ldaning/code/biye/hydro-model-xaj/hydromodel/example/zhandian/洪水率定时间1.xlsx') + # calibrate_starttime = pd.to_datetime("2014-7-10 0:00") + # calibrate_endtime = pd.to_datetime("2020-6-24 0:00") + # basin_area = float(basin_area) + # best_simulation = [x * (basin_area*1000000/1000/3600) for x in best_simulation] + # obs = [x * (basin_area*1000000/1000/3600) for x in spot_setup.evaluation()] + # for i in range(len(time)): + ## if(time.iloc[i,0]=0): + numerator+=pow(abs(sim[i])-obs[i],2) + meangauge+=obs[i] + count+=1 + meangauge=meangauge/count + for i in range(len(obs)): + if (obs[i]>=0): + denominator+=pow(obs[i]-meangauge,2) + NSE= 1-numerator/denominator + plt.text(0.9, 0.6, 'NSE=%.2f' % NSE, + horizontalalignment='center', + verticalalignment='center', + transform = ax.transAxes, + fontsize=10) + # plt.xticks(date) + ax2 = ax.twinx() + ax2.bar(date,prcp, label='Precipitation', color='royalblue',alpha=0.9,width=0.05) + ax2.set_ylabel('Precipitation(mm)') + plt.yticks(fontproperties = 'Times New Roman', size = 10) + prcp_max = np.max(prcp) + ax2.set_ylim(0, prcp_max*4) + ax2.invert_yaxis() #y轴反向 + ax2.legend(loc='upper left') + plt.tight_layout() # 自动调整子图参数,使之填充整个图像区域 + plt.savefig(save_fig, bbox_inches="tight") + plt.close() + 
+
+
+def plot_train_iteration(likelihood, save_fig):
+    # matplotlib.use("Agg")
+    fig = plt.figure(figsize=(9, 6))  # a figure with a single subplot
+    ax = fig.subplots()
+    ax.plot(likelihood)
+    ax.set_ylabel("RMSE")
+    ax.set_xlabel("Iteration")
+    plt.savefig(save_fig, bbox_inches="tight")
+    # plt.cla()
+    plt.close()
diff --git a/scripts/calibrate_xaj.py b/scripts/calibrate_xaj.py
index 24830bf..b0629d8 100644
--- a/scripts/calibrate_xaj.py
+++ b/scripts/calibrate_xaj.py
@@ -8,7 +8,6 @@
 import os
 import sys
 from pathlib import Path
-
 from hydroutils import hydro_file
 
 sys.path.append(os.path.dirname(Path(os.path.abspath(__file__)).parent))
@@ -34,9 +33,12 @@ def calibrate(args):
     algo_info = args.algorithm
     comment = args.comment
     data_dir = os.path.join(definitions.ROOT_DIR, "hydromodel", "example", exp)
+    data_dir = os.path.join(
+        "/home/ldaning/code/biye/hydro-model-xaj/hydromodel/example/model_run_wuxi7"
+    )
     kfold = [
         int(f_name[len("data_info_fold") : -len("_test.json")])
-        for f_name in os.listdir(data_dir)
+        for f_name in os.listdir(data_dir)  # all the files under the data folder
         if fnmatch.fnmatch(f_name, "*_fold*_test.json")
     ]
     kfold = np.sort(kfold)
@@ -45,15 +47,11 @@ def calibrate(args):
         train_data_info_file = os.path.join(
             data_dir, f"data_info_fold{str(fold)}_train.json"
         )
-        train_data_file = os.path.join(
-            data_dir, f"basins_lump_p_pe_q_fold{str(fold)}_train.npy"
-        )
+        train_data_file = os.path.join(data_dir, f"data_info_fold{str(fold)}_train.npy")
         test_data_info_file = os.path.join(
             data_dir, f"data_info_fold{str(fold)}_test.json"
         )
-        test_data_file = os.path.join(
-            data_dir, f"basins_lump_p_pe_q_fold{str(fold)}_test.npy"
-        )
+        test_data_file = os.path.join(data_dir, f"data_info_fold{str(fold)}_test.npy")
         if (
             os.path.exists(train_data_info_file) is False
             or os.path.exists(train_data_file) is False
@@ -78,6 +76,7 @@ def calibrate(args):
             + "_"
             + comment,
         )
+        # read the input files
         if os.path.exists(save_dir) is False:
             os.makedirs(save_dir)
         hydro_file.serialize_json(vars(args), os.path.join(save_dir, "args.json"))
@@ -86,14 +85,15 @@ def calibrate(args):
             basin_id = data_info_train["basin"][i]
             basin_area = data_info_train["area"][i]
             # one directory for one model + one hyperparam setting and one basin
-            spotpy_db_dir = os.path.join(
+            spotpy_db_dir = os.path.join(  # one folder per model
                 save_dir,
                 basin_id,
             )
             if not os.path.exists(spotpy_db_dir):
                 os.makedirs(spotpy_db_dir)
             db_name = os.path.join(spotpy_db_dir, "SCEUA_" + model_info["name"])
-            sampler = calibrate_by_sceua(
+
+            sampler = calibrate_by_sceua(  # calibrate
                 data_train[:, i : i + 1, 0:2],
                 data_train[:, i : i + 1, -1:],
                 db_name,
@@ -102,7 +102,7 @@
                 algorithm=algo_info,
             )
 
-            show_calibrate_result(
+            show_calibrate_result(  # show the calibration results
                 sampler.setup,
                 db_name,
                 warmup_length=warmup,
@@ -110,13 +110,14 @@
                 basin_id=basin_id,
                 train_period=data_info_train["time"],
                 basin_area=basin_area,
+                prcp=data_train[365:, i : i + 1, 0:1].flatten(),
             )
 
-            params = read_save_sceua_calibrated_params(
+            params = read_save_sceua_calibrated_params(  # save the calibrated parameter file
                 basin_id, spotpy_db_dir, db_name
             )
             # _ is et which we didn't use here
-            qsim, _ = xaj(
+            qsim, _ = xaj(  # compute the simulation results
                 data_test[:, i : i + 1, 0:2],
                 params,
                 warmup_length=warmup,
@@ -125,12 +126,14 @@
             qsim = units.convert_unit(
                 qsim,
+                # TODO: to unify "mm/hour"
                 unit_now="mm/day",
                 unit_final=units.unit["streamflow"],
                 basin_area=basin_area,
             )
             qobs = units.convert_unit(
                 data_test[warmup:, i : i + 1, -1:],
+                # TODO: to unify "mm/hour"
                 unit_now="mm/day",
                 unit_final=units.unit["streamflow"],
                 basin_area=basin_area,
             )
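The two `TODO: to unify "mm/hour"` notes flag a real inconsistency: the data are hourly, yet `convert_unit` is still called with `unit_now="mm/day"`. The conversion itself is just depth × area ÷ time; a sketch under that assumption (`depth_to_discharge` is a hypothetical helper, not the repo's `units.convert_unit`):

```python
# Hypothetical helper showing the depth-to-discharge arithmetic:
# runoff depth in mm per time step -> discharge in m3/s, given area in km2.
def depth_to_discharge(depth_mm: float, basin_area_km2: float, step_s: float = 3600.0) -> float:
    volume_m3 = depth_mm / 1000.0 * basin_area_km2 * 1e6  # mm -> m; km2 -> m2
    return volume_m3 / step_s

# 1 mm over one hour across the 1992.62 km2 Wuxi basin is ~553 m3/s;
# treating the same depth as a daily step (86400 s) gives a value 24x smaller.
print(depth_to_discharge(1.0, 1992.62))           # ~553.5
print(depth_to_discharge(1.0, 1992.62, 86400.0))  # ~23.1
```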
@@ -145,11 +148,17 @@ def calibrate(args):
                 index=False,
                 header=False,
             )
-            test_date = pd.to_datetime(
-                data_info_test["time"][warmup:]
-            ).values.astype("datetime64[D]")
+            test_date = pd.to_datetime(data_info_test["time"][:]).values.astype(
+                "datetime64[h]"
+            )
             show_test_result(
-                basin_id, test_date, qsim, qobs, save_dir=spotpy_db_dir
+                basin_id,
+                test_date,
+                qsim,
+                qobs,
+                save_dir=spotpy_db_dir,
+                warmup_length=warmup,
+                prcp=data_test[365:, i : i + 1, 0:1].flatten(),
             )
     elif algo_info["name"] == "GA":
         for i in range(len(data_info_train["basin"])):
@@ -205,13 +214,13 @@ def calibrate(args):
 # the exp must be same as the exp in data_preprocess.py
 # python calibrate_xaj.py --exp exp201 --warmup_length 365 --model {\"name\":\"xaj_mz\",\"source_type\":\"sources\",\"source_book\":\"HF\"} --algorithm {\"name\":\"SCE_UA\",\"random_seed\":1234,\"rep\":2000,\"ngs\":20,\"kstop\":3,\"peps\":0.1,\"pcento\":0.1}
 # python calibrate_xaj.py --exp exp61561 --warmup_length 365 --model {\"name\":\"xaj_mz\",\"source_type\":\"sources\",\"source_book\":\"HF\"} --algorithm {\"name\":\"GA\",\"random_seed\":1234,\"run_counts\":50,\"pop_num\":50,\"cross_prob\":0.5,\"mut_prob\":0.5,\"save_freq\":1}
-if __name__ == "__main__":
+if __name__ == "__main__":  # standard argparse boilerplate
     parser = argparse.ArgumentParser(description="Calibrate a hydrological model.")
     parser.add_argument(
         "--exp",
         dest="exp",
         help="An exp is corresponding to a data plan from data_preprocess.py",
-        default="exp004",
+        default="/home/ldaning/code/biye/hydro-model-xaj/hydromodel/example/model_run_wuxi7",
         type=str,
     )
     parser.add_argument(
@@ -226,8 +235,8 @@ def calibrate(args):
         dest="model",
         help="which hydro model you want to calibrate and the parameters setting for model function, note: not hydromodel parameters but function's parameters",
         default={
-            "name": "xaj_mz",
-            "source_type": "sources",
+            "name": "xaj",
+            "source_type": "sources5mm",
             "source_book": "HF",
         },
         type=json.loads,
     )
     parser.add_argument(
@@ -241,31 +250,31 @@
         "ngs is the number of complex, better larger than your hydro-model-params number (nopt) but not too large, because the number of population's individuals is ngs * (2*nopt+1), larger ngs need more evaluations;"
         "kstop is the number of evolution (not evaluation) loops, some small numbers such as 2, 3, 5, ... 
are recommended, if too large it is hard to finish optimizing;" "peps and pcento are two loop-stop criterion, 0.1 (its unit is %, 0.1 means a relative change of 1/1000) is a good choice", - # default={ - # "name": "SCE_UA", - # "random_seed": 1234, - # "rep": 5000, - # "ngs": 20, - # "kstop": 3, - # "peps": 0.1, - # "pcento": 0.1, - # }, default={ - "name": "GA", + "name": "SCE_UA", "random_seed": 1234, - "run_counts": 2, - "pop_num": 50, - "cross_prob": 0.5, - "mut_prob": 0.5, - "save_freq": 1, + "rep": 600, + "ngs": 1000, + "kstop": 1000, + "peps": 0.01, + "pcento": 0.01, }, + # default={ + # "name": "GA", + # "random_seed": 1234, + # "run_counts": 2, + # "pop_num": 50, + # "cross_prob": 0.5, + # "mut_prob": 0.5, + # "save_freq": 1, + # }, type=json.loads, ) parser.add_argument( "--comment", dest="comment", help="A tag for a plan, we will use it when postprocessing results", - default="HFsources", + default="HF", type=str, ) the_args = parser.parse_args() diff --git a/scripts/datapostprocess4calibrate.py b/scripts/datapostprocess4calibrate.py index 8f96e6c..4c6e08f 100644 --- a/scripts/datapostprocess4calibrate.py +++ b/scripts/datapostprocess4calibrate.py @@ -1,12 +1,13 @@ """ Author: Wenyu Ouyang Date: 2022-11-19 17:27:05 -LastEditTime: 2023-06-03 16:21:12 +LastEditTime: 2024-03-21 18:44:31 LastEditors: Wenyu Ouyang Description: the script to postprocess calibrated models in hydro-model-xaj -FilePath: /hydro-model-xaj/scripts/datapostprocess4calibrate.py +FilePath: \hydro-model-xaj\scripts\datapostprocess4calibrate.py Copyright (c) 2021-2022 Wenyu Ouyang. All rights reserved. """ + from tqdm import tqdm import numpy as np import pandas as pd @@ -27,7 +28,10 @@ def statistics(args): cases = args.cases cv_fold = args.cv_fold where_save_cache = Path( - os.path.join(definitions.ROOT_DIR, "hydromodel", "example", exp) + os.path.join( + "/home/ldaning/code/biye/hydro-model-xaj/hydromodel/example/model_run_wuxi7" + ) + # definitions.ROOT_DIR, "hydromodel", "example", exp ) if os.path.exists(where_save_cache) is False: raise NotImplementedError( @@ -54,7 +58,6 @@ def statistics(args): comment = case_dir.name.split("_")[-1] comment_lst.append(comment) comments = np.unique(comment_lst) - for comment in tqdm(comments, desc="All settings in an exp directory"): comment_folds_test = [] comment_folds_train = [] @@ -62,11 +65,15 @@ def statistics(args): comment_fold_dir = [] for case in cases: case_dir = where_save_cache.joinpath(case) + # print(case_dir) if case_dir.is_dir() and fnmatch.fnmatch( case_dir.name, f"*_fold{str(fold)}_" + comment ): comment_fold_dir.append(case_dir) + comment_fold_dir = [str(path) for path in comment_fold_dir] + print(comment_fold_dir) comment_fold_dir_newest = np.sort(comment_fold_dir)[-1] + comment_fold_dir_newest = Path(comment_fold_dir_newest) read_and_save_et_ouputs(comment_fold_dir_newest, fold=fold) comment_fold_file_test = comment_fold_dir_newest.joinpath( "basins_metrics_test.csv" @@ -135,7 +142,7 @@ def statistics(args): "--exp", dest="exp", help="An exp is corresponding to one data setting", - default="example", + default="exp61561", type=str, ) parser.add_argument( diff --git a/test/test_data.py b/test/test_data.py index af32d10..30f90f0 100644 --- a/test/test_data.py +++ b/test/test_data.py @@ -1,79 +1,172 @@ """ Author: Wenyu Ouyang Date: 2022-10-25 21:16:22 -LastEditTime: 2023-10-28 09:18:48 +LastEditTime: 2024-03-21 18:44:13 LastEditors: Wenyu Ouyang Description: Test for data preprocess FilePath: \hydro-model-xaj\test\test_data.py Copyright (c) 2021-2022 
Wenyu Ouyang. All rights reserved.
 """
+
 import os
 from collections import OrderedDict
 import numpy as np
 import pandas as pd
 import pytest
+import fnmatch
+import socket
+from datetime import datetime
+import pathlib
 
 from hydroutils import hydro_file
 
 import definitions
+from hydromodel.utils import hydro_utils
+from hydromodel.data.data_preprocess import (
+    cross_valid_data,
+    split_train_test,
+)
 
-@pytest.fixture()
-def txt_file():
-    return os.path.join(
-        definitions.ROOT_DIR, "hydromodel", "example", "01013500_lump_p_pe_q.txt"
-    )
+# @pytest.fixture()
+# def txt_file():
+#     return os.path.join(
+#         definitions.ROOT_DIR, "hydromodel", "example", "01013500_lump_p_pe_q.txt"
+#     )
 
-@pytest.fixture()
-def json_file():
-    return os.path.join(definitions.ROOT_DIR, "hydromodel", "example", "data_info.json")
+# @pytest.fixture()
+# def json_file():
+#     return os.path.join(definitions.ROOT_DIR, "hydromodel", "example", "data_info.json")
 
-@pytest.fixture()
-def npy_file():
-    return os.path.join(
-        definitions.ROOT_DIR, "hydromodel", "example", "basins_lump_p_pe_q.npy"
-    )
+# @pytest.fixture()
+# def npy_file():
+#     return os.path.join(
+#         definitions.ROOT_DIR, "hydromodel", "example", "basins_lump_p_pe_q.npy"
+#     )
 
+txt_file = pathlib.Path(
+    "/home/ldaning/code/biye/hydro-model-xaj/hydromodel/example/wuxi.csv"
+)
+forcing_data = pathlib.Path(
+    "/home/ldaning/code/biye/hydro-model-xaj/hydromodel/example/wuxi.csv"
+)
+json_file = pathlib.Path(
+    "/home/ldaning/code/biye/hydro-model-xaj/hydromodel/example/model_run_wuxi7/data_info.json"
+)
+npy_file = pathlib.Path(
+    "/home/ldaning/code/biye/hydro-model-xaj/hydromodel/example/model_run_wuxi7/data_info.npy"
+)
 
-def test_save_data(txt_file, json_file, npy_file):
-    data = pd.read_csv(txt_file)
-    print(data.columns)
-    # Note: The units are all mm/day! For streamflow, data is divided by basin's area
-    variables = ["prcp(mm/day)", "petfao56(mm/day)", "streamflow(mm/day)"]
-    data_info = OrderedDict(
-        {
-            "time": data["date"].values.tolist(),
-            "basin": ["01013500"],
-            "variable": variables,
-        }
-    )
-    hydro_file.serialize_json(data_info, json_file)
-    # 1 ft3 = 0.02831685 m3
-    ft3tom3 = 2.831685e-2
-    # 1 km2 = 10^6 m2
-    km2tom2 = 1e6
-    # 1 m = 1000 mm
-    mtomm = 1000
-    # 1 day = 24 * 3600 s
-    daytos = 24 * 3600
-    # trans ft3/s to mm/day
-    basin_area = 2252.7
-    data[variables[-1]] = (
-        data[["streamflow(ft3/s)"]].values
-        * ft3tom3
-        / (basin_area * km2tom2)
-        * mtomm
-        * daytos
-    )
-    df = data[variables]
-    hydro_file.serialize_numpy(np.expand_dims(df.values, axis=1), npy_file)
+
+# def test_save_data(txt_file, json_file, npy_file):
+data = pd.read_csv(txt_file)
+# Note: precipitation and PET are in mm/hour here; streamflow is in m3/s and is converted to mm/hour below by dividing by basin area
+# variables = ["prcp(mm/day)", "petfao56(mm/day)", "streamflow(mm/day)"]
+variables = ["prcp(mm/hour)", "pev(mm/hour)", "streamflow(m3/s)"]
+data_info = OrderedDict(
+    {
+        "time": data["date"].values.tolist(),
+        "basin": ["wuxi"],
+        "variable": variables,
+        "area": ["1992.62"],
+    }
+)
+hydro_utils.serialize_json(data_info, json_file)
+# 1 ft3 = 0.02831685 m3
+# ft3tom3 = 2.831685e-2
+
+# 1 km2 = 10^6 m2
+km2tom2 = 1e6
+# 1 m = 1000 mm
+mtomm = 1000
+# 1 day = 24 * 3600 s
+# daytos = 24 * 3600
+hourtos = 3600
+# convert m3/s to mm/hour
+# basin_area = 2055.56
+basin_area = 1992.62
+data[variables[-1]] = (
+    data[["streamflow(m3/s)"]].values
+    # * ft3tom3
+    / (basin_area * km2tom2)
+    * mtomm
+    * hourtos
+)
+df = data[variables]
+hydro_utils.serialize_numpy(np.expand_dims(df.values, axis=1), npy_file)
+
+
+# def test_load_data(txt_file, npy_file):
+#     data_ = pd.read_csv(txt_file)
+#     df = data_[["prcp(mm/day)", "petfao56(mm/day)"]]
+#     data = hydro_utils.unserialize_numpy(npy_file)[:, :, :2]
+#     np.testing.assert_array_equal(data, np.expand_dims(df.values, axis=1))
 
-def test_load_data(txt_file, npy_file):
-    data_ = pd.read_csv(txt_file)
-    df = data_[["prcp(mm/day)", "petfao56(mm/day)"]]
-    data = hydro_file.unserialize_numpy(npy_file)[:, :, :2]
-    np.testing.assert_array_equal(data, np.expand_dims(df.values, axis=1))
 
+# start_train = datetime(2014, 5, 1, 1)
+# end_train = datetime(2020, 1, 1, 7)
+# start_test = datetime(2020, 1, 1, 8)
+# end_test = datetime(2021, 10, 11, 23)
+# train_period = ["2014-05-01 09:00:00", "2019-01-01 08:00:00"]
+test_period = ["2019-01-01 07:00:00", "2021-10-12 09:00:00"]
+# test_period = ["2019-01-01 08:00:00", "2021-10-11 09:00:00"]
+train_period = ["2014-05-01 09:00:00", "2019-01-01 07:00:00"]
+period = ["2014-05-01 09:00:00", "2021-10-12 09:00:00"]
+cv_fold = 1
+warmup_length = 365
+
+# if not (cv_fold > 1):
+#     # no cross validation
+#     periods = np.sort(
+#         [train_period[0], train_period[1], test_period[0], test_period[1]]
+#     )
+#     print(periods)
+if cv_fold > 1:
+    cross_valid_data(json_file, npy_file, period, warmup_length, cv_fold)
+else:
+    split_train_test(json_file, npy_file, train_period, test_period)
+
+
+kfold = [
+    int(f_name[len("data_info_fold") : -len("_test.json")])
+    for f_name in os.listdir(os.path.dirname(txt_file))
+    if fnmatch.fnmatch(f_name, "*_fold*_test.json")
+]
+kfold = np.sort(kfold)
+for fold in kfold:
+    print(f"Start to calibrate the {fold}-th fold")
+    train_data_info_file = os.path.join(
+        os.path.dirname(forcing_data), f"data_info_fold{str(fold)}_train.json"
+    )
+    train_data_file = os.path.join(
+        os.path.dirname(forcing_data), f"data_info_fold{str(fold)}_train.npy"
+    )
+    test_data_info_file = os.path.join(
+        os.path.dirname(forcing_data), f"data_info_fold{str(fold)}_test.json"
+    )
+    test_data_file = os.path.join(
+        os.path.dirname(forcing_data), f"data_info_fold{str(fold)}_test.npy"
+    )
+    if (
+        os.path.exists(train_data_info_file) is False
+        or os.path.exists(train_data_file) is False
+        or os.path.exists(test_data_info_file) is False
+        or os.path.exists(test_data_file) is False
+    ):
+        raise FileNotFoundError(
+            "The data files are not found, please run datapreprocess4calibrate.py first."
+ ) + data_train = hydro_utils.unserialize_numpy(train_data_file) + print(data_train.shape) + data_test = hydro_utils.unserialize_numpy(test_data_file) + data_info_train = hydro_utils.unserialize_json_ordered(train_data_info_file) + data_info_test = hydro_utils.unserialize_json_ordered(test_data_info_file) + current_time = datetime.now().strftime("%b%d_%H-%M-%S") + # one directory for one model + one hyperparam setting and one basin + save_dir = os.path.join( + os.path.dirname(forcing_data), + current_time + "_" + socket.gethostname() + "_fold" + str(fold), + )
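The k-fold bookkeeping above relies purely on file-name conventions shared between test_data.py and calibrate_xaj.py. A self-contained sketch of that fold-index parsing (the file names below are stand-ins, not actual repo data):

```python
# Sketch of the fold-number parsing used above: the fold index is sliced
# out of names like "data_info_fold0_test.json" after an fnmatch filter.
import fnmatch

names = ["data_info_fold0_test.json", "data_info_fold1_test.json", "wuxi.csv"]
kfold = sorted(
    int(name[len("data_info_fold") : -len("_test.json")])
    for name in names
    if fnmatch.fnmatch(name, "*_fold*_test.json")
)
print(kfold)  # [0, 1]
```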