From d08274c61c043cc7e1fa08351601c4b24600674d Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Tue, 18 Apr 2023 14:59:00 -0500 Subject: [PATCH 001/105] Tool to generate list of number of upstream catchments --- subsetting/README.md | 7 +++++ subsetting/ncatch_upstream.py | 51 +++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 subsetting/ncatch_upstream.py diff --git a/subsetting/README.md b/subsetting/README.md index a3c9e10..953443a 100644 --- a/subsetting/README.md +++ b/subsetting/README.md @@ -13,3 +13,10 @@ The subset algorithm will find all features upstream of the `catchment_id` and t # Note A current shortcut is being used to map `wb` and `cat` ids that isn't a valid assumption, and will be fixed in the future. This means you might get a subset that isn't topologically consistent, so use at your own risk. + +# ncatch_upstream +To get a list of how many catchments are upstream of each catchment, enter the following command +`python subset.py -i -o ` + +where `path_to_hydrofabric` can be a local geopkg, or a remote resource (s3 or http URL), +and `path_to_output_text_file` is the full path to where you want the list output \ No newline at end of file diff --git a/subsetting/ncatch_upstream.py b/subsetting/ncatch_upstream.py new file mode 100644 index 0000000..7481f9b --- /dev/null +++ b/subsetting/ncatch_upstream.py @@ -0,0 +1,51 @@ +import geopandas as gpd +import argparse +from subset import get_upstream_ids + +def main(): + #setup the argument parser + parser = argparse.ArgumentParser() + parser.add_argument("-i", dest="infile", type=str, required=True, help="A gpkg file containing divides and nexus layers") + parser.add_argument("-o", dest="outfile", type=str, required=True, help="A text file containing the number of upstream catchments for each catchment") + args = parser.parse_args() + + infile = args.infile + outfile = args.outfile + + print("Reading catchment data...") + df_cat = gpd.read_file(str(infile), layer="divides") + + print("Reading nexus data...") + df_nex = gpd.read_file(str(infile), layer="nexus") + + df_cat_org = df_cat.copy() + df_nex_org = df_nex.copy() + + df_cat.set_index('id', inplace=True) + + print("Finding upstream catchments...") + upstream = nupstream(df_cat_org, df_nex_org,df_cat.index) + + with open(outfile,'w') as fp: + for jcatch in upstream: + fp.write(f'{jcatch} : {upstream[jcatch]}\n') + + print(f'Done! 
- > {outfile}') + +def nupstream(divides,nexus,cat_list): + """ + Find the number of upstream catchments for each catchment + """ + upstream = {} + for j in range(len(cat_list)): + jcat_id = cat_list[j] + cat_up_ids, nexus_up_ids = get_upstream_ids(divides, nexus, jcat_id) + jnupstream = len(cat_up_ids) + upstream[jcat_id] = jnupstream + + upstream = dict(sorted(upstream.items(), key=lambda x:x[1], reverse=True)) + + return upstream + +if __name__ == "__main__": + main() \ No newline at end of file From 174b2a08775b2dd840d4602caed4b44f3147ff2c Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Tue, 18 Apr 2023 15:11:41 -0500 Subject: [PATCH 002/105] Made arguments positional --- subsetting/README.md | 2 +- subsetting/ncatch_upstream.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/subsetting/README.md b/subsetting/README.md index 953443a..ee2ebc7 100644 --- a/subsetting/README.md +++ b/subsetting/README.md @@ -16,7 +16,7 @@ This means you might get a subset that isn't topologically consistent, so use at # ncatch_upstream To get a list of how many catchments are upstream of each catchment, enter the following command -`python subset.py -i -o ` +`python subset.py ` where `path_to_hydrofabric` can be a local geopkg, or a remote resource (s3 or http URL), and `path_to_output_text_file` is the full path to where you want the list output \ No newline at end of file diff --git a/subsetting/ncatch_upstream.py b/subsetting/ncatch_upstream.py index 7481f9b..4983bd4 100644 --- a/subsetting/ncatch_upstream.py +++ b/subsetting/ncatch_upstream.py @@ -1,12 +1,12 @@ import geopandas as gpd -import argparse +import argparse, os from subset import get_upstream_ids def main(): #setup the argument parser parser = argparse.ArgumentParser() - parser.add_argument("-i", dest="infile", type=str, required=True, help="A gpkg file containing divides and nexus layers") - parser.add_argument("-o", dest="outfile", type=str, required=True, help="A text file containing the number of upstream catchments for each catchment") + parser.add_argument(dest="infile", type=str, help="A gpkg file containing divides and nexus layers") + parser.add_argument(dest="outfile", type=str, help="A text file containing the number of upstream catchments for each catchment") args = parser.parse_args() infile = args.infile @@ -27,6 +27,7 @@ def main(): upstream = nupstream(df_cat_org, df_nex_org,df_cat.index) with open(outfile,'w') as fp: + fp.write(f'Catchment IDs and the number of upstream catchments\nGenerated with file {os.path.basename(infile)}\n') for jcatch in upstream: fp.write(f'{jcatch} : {upstream[jcatch]}\n') From ca213088734fc1b9858adb3e3336c1528c4fc4d9 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Tue, 18 Apr 2023 16:32:10 -0500 Subject: [PATCH 003/105] Tool for isolating forcing files based on the catchments within a geojson --- subsetting/subset_forcing.py | 49 ++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 subsetting/subset_forcing.py diff --git a/subsetting/subset_forcing.py b/subsetting/subset_forcing.py new file mode 100644 index 0000000..9199442 --- /dev/null +++ b/subsetting/subset_forcing.py @@ -0,0 +1,49 @@ +import argparse, os, json + +def main(): + """ + Find forcing files in a directory that match the catchments within a catchment.geojson + + """ + #setup the argument parser + parser = argparse.ArgumentParser() + parser.add_argument(dest="forcing_dir", type=str, help="Path to forcing files") + parser.add_argument(dest="forcing_dir_out", type=str, 
help="Path to output the forcing files subset") + parser.add_argument(dest="catchment_file", type=str, help="A catchment geojson file") + args = parser.parse_args() + + indir = args.forcing_dir + outdir = args.forcing_dir_out + catch_file = args.catchment_file + + if not os.path.exists(outdir): + os.system(f'mkdir {outdir}') + + forcing_files = os.listdir(indir) + + print("Reading catchment data...") + with open(catch_file) as fp: + data = json.load(fp) + + # User should validate the catch file. + # Would do here with ngen-cal, just don't want to create the dependency + feats = data['features'] + forcing_out = [] + for jfeat in feats: + found = False + try: # Geopandas/pydantic descrepancy + cat_id = jfeat['id'] + except: + cat_id = jfeat['properties']['id'] + for jforcing in forcing_files: + if jforcing.find(cat_id) >= 0: + found = True + forcing_out.append(jforcing) + os.system(f'cp {os.path.join(indir,jforcing)} {os.path.join(outdir,jforcing)}') + if not found: + print(f'Couldn\'t find forcing file for {cat_id}!') + else: + print(f'Found forcing file for {cat_id}!') + +if __name__ == "__main__": + main() \ No newline at end of file From f80b5ca96b736555798d57b50a17b9c48defa1e6 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Tue, 18 Apr 2023 19:30:44 -0500 Subject: [PATCH 004/105] Removed unnecessary copy --- subsetting/ncatch_upstream.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/subsetting/ncatch_upstream.py b/subsetting/ncatch_upstream.py index 4983bd4..51c9032 100644 --- a/subsetting/ncatch_upstream.py +++ b/subsetting/ncatch_upstream.py @@ -18,13 +18,10 @@ def main(): print("Reading nexus data...") df_nex = gpd.read_file(str(infile), layer="nexus") - df_cat_org = df_cat.copy() - df_nex_org = df_nex.copy() - df_cat.set_index('id', inplace=True) print("Finding upstream catchments...") - upstream = nupstream(df_cat_org, df_nex_org,df_cat.index) + upstream = nupstream(df_cat.reset_index(), df_nex.reset_index(),df_cat.index) with open(outfile,'w') as fp: fp.write(f'Catchment IDs and the number of upstream catchments\nGenerated with file {os.path.basename(infile)}\n') From c86455bac132ea98fe38b6a4a497461718eb834a Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Tue, 25 Apr 2023 17:10:09 -0500 Subject: [PATCH 005/105] configurable python script to generate catchment forcing files for ngen --- prep_hydrofab_forcings_ngen.py | 465 +++++++++++++++++++++++++++++++++ user_input_ngen.json | 15 ++ 2 files changed, 480 insertions(+) create mode 100644 prep_hydrofab_forcings_ngen.py create mode 100644 user_input_ngen.json diff --git a/prep_hydrofab_forcings_ngen.py b/prep_hydrofab_forcings_ngen.py new file mode 100644 index 0000000..b423883 --- /dev/null +++ b/prep_hydrofab_forcings_ngen.py @@ -0,0 +1,465 @@ + +# https://github.com/jameshalgren/data-access-examples/blob/DONOTMERGE_VPU16/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb + +# !pip install --upgrade google-api-python-client +# !pip install --upgrade google-cloud-storage + +import pickle +import time +import pandas as pd +import argparse, os, json +import gc +from pathlib import Path +import geopandas as gpd +import pandas as pd +import numpy as np +import xarray as xr +from google.cloud import storage +from rasterio.io import MemoryFile +from rasterio.features import rasterize + +from nwm_filenames.listofnwmfilenames import create_file_list + +TEMPLATE_BLOB_NAME = ( + "nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" +) +NWM_BUCKET = "national-water-model" + +# WKT strings 
extracted from NWM grids +CONUS_NWM_WKT = 'PROJCS["Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]], \ +PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ +PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-97.0],PARAMETER["standard_parallel_1",30.0],\ +PARAMETER["standard_parallel_2",60.0],PARAMETER["latitude_of_origin",40.0],UNIT["Meter",1.0]]' + +HI_NWM_WKT = 'PROJCS["Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]],\ +PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ +PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-157.42],PARAMETER["standard_parallel_1",10.0],\ +PARAMETER["standard_parallel_2",30.0],PARAMETER["latitude_of_origin",20.6],UNIT["Meter",1.0]]' + +PR_NWM_WKT = 'PROJCS["Sphere_Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]],\ +PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ +PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-65.91],PARAMETER["standard_parallel_1",18.1],\ +PARAMETER["standard_parallel_2",18.1],PARAMETER["latitude_of_origin",18.1],UNIT["Meter",1.0]]' + +# paths +CACHE_DIR = Path(Path.home(), "code", "data_access_examples", "raw_forcing_data") +NWM_CACHE_DIR = os.path.join(CACHE_DIR, "nwm") +USGS_CACHE_DIR = os.path.join(CACHE_DIR, "usgs") +GEO_CACHE_DIR = os.path.join(CACHE_DIR, "geo") + +NWM_CACHE_H5 = os.path.join(NWM_CACHE_DIR, "gcp_client.h5") + +PARQUET_CACHE_DIR = os.path.join(CACHE_DIR, "parquet") +MEDIUM_RANGE_FORCING_PARQUET = os.path.join(PARQUET_CACHE_DIR, "forcing_medium_range") +FORCING_ANALYSIS_ASSIM_PARQUET = os.path.join( + PARQUET_CACHE_DIR, "forcing_analysis_assim" +) +MEDIUM_RANGE_PARQUET = os.path.join(PARQUET_CACHE_DIR, "medium_range") +USGS_PARQUET = os.path.join(PARQUET_CACHE_DIR, "usgs") + +HUC10_SHP_FILEPATH = os.path.join(GEO_CACHE_DIR, "wbdhu10_conus.shp") +HUC10_PARQUET_FILEPATH = os.path.join(GEO_CACHE_DIR, "wbdhu10_conus.parquet") +HUC10_MEDIUM_RANGE_WEIGHTS_FILEPATH = os.path.join( + GEO_CACHE_DIR, "wbdhu10_medium_range_weights.pkl" +) + +ROUTE_LINK_FILE = os.path.join(NWM_CACHE_DIR, "RouteLink_CONUS.nc") +ROUTE_LINK_PARQUET = os.path.join(NWM_CACHE_DIR, "route_link_conus.parquet") + + +def parquet_to_gdf(parquet_filepath: str) -> gpd.GeoDataFrame: + gdf = gpd.read_parquet(parquet_filepath) + return gdf + +def get_cache_dir(create: bool = True): + if not os.path.exists(NWM_CACHE_DIR) and create: + os.mkdir(NWM_CACHE_DIR) + if not os.path.exists(NWM_CACHE_DIR): + raise NotADirectoryError + return NWM_CACHE_DIR + +def make_parent_dir(filepath): + Path(filepath).parent.mkdir(parents=True, exist_ok=True) + +def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: + """Retrieve a blob from the data service as xarray.Dataset. + Based largely on OWP HydroTools. + Parameters + ---------- + blob_name: str, required + Name of blob to retrieve. + use_cache: bool, default True + If cache should be used. + If True, checks to see if file is in cache, and + If fetched from remote, will save to cache. + Returns + ------- + ds : xarray.Dataset + The data stored in the blob. 
+ """ + # TODO: Check to see if this does any better than kerchunk + # the caching should help, but probably needs to be managed to function asynchronously. + # Perhaps if the files is not cached, we can create the dataset from + # kerchunk with a remote path and then asynchronously do a download to cache it + # for next time. The hypothesis would be that the download speed will not be any slower than + # just accessing the file remotely. + nc_filepath = os.path.join(get_cache_dir(), blob_name) + make_parent_dir(nc_filepath) + + # If the file exists and use_cache = True + if os.path.exists(nc_filepath) and use_cache: + # Get dataset from cache + ds = xr.load_dataset( + nc_filepath, + engine="h5netcdf", + ) + return ds + else: + # Get raw bytes + raw_bytes = get_blob(blob_name) + # Create Dataset + ds = xr.load_dataset( + MemoryFile(raw_bytes), + engine="h5netcdf", + ) + if use_cache: + # Subset and cache + ds["RAINRATE"].to_netcdf( + nc_filepath, + engine="h5netcdf", + ) + return ds + +def generate_weights_file( + gdf: gpd.GeoDataFrame, + src: xr.DataArray, + weights_filepath: str, + crosswalk_dict_key: str, +): + """Generate a weights file.""" + + gdf_proj = gdf.to_crs(CONUS_NWM_WKT) + + crosswalk_dict = {} + + # This is a probably a really poor performing way to do this + # TODO: Consider vectorizing -- would require digging into the + # other end of these where we unpack the weights... + i = 0 + for index, row in gdf_proj.iterrows(): + geom_rasterize = rasterize( + [(row["geometry"], 1)], + out_shape=src.rio.shape, + transform=src.rio.transform(), + all_touched=True, + fill=0, # IS FILL 0 + dtype="uint8", + ) + if crosswalk_dict_key: + crosswalk_dict[row[crosswalk_dict_key]] = np.where(geom_rasterize == 1) + else: + crosswalk_dict[index] = np.where(geom_rasterize == 1) + + if i % 100 == 0: + perc = i/len(gdf_proj)*100 + print(f"{i}, {perc:.2f}%".ljust(40), end="\r") + if perc > 0.01: break + i += 1 + + with open(weights_filepath, "wb") as f: + # TODO: This is a dict of ndarrays, which could be easily stored as a set of parquet files for safekeeping. + pickle.dump(crosswalk_dict, f) + +def add_zonalstats_to_gdf_weights( + gdf: gpd.GeoDataFrame, + src: xr.DataArray, + weights_filepath: str, +) -> gpd.GeoDataFrame: + """Calculates zonal stats and adds to GeoDataFrame""" + + df = calc_zonal_stats_weights(src, weights_filepath) + gdf_map = gdf.merge(df, left_on="huc10", right_on="catchment_id") + + return gdf_map + + +def get_blob(blob_name: str, bucket: str = NWM_BUCKET) -> bytes: + """Retrieve a blob from the data service as bytes. + Based largely on OWP HydroTools. + Parameters + ---------- + blob_name : str, required + Name of blob to retrieve. + Returns + ------- + data : bytes + The data stored in the blob. + """ + # Setup anonymous client and retrieve blob data + client = storage.Client.create_anonymous_client() + bucket = client.bucket(bucket) + return bucket.blob(blob_name).download_as_bytes(timeout=120) + + +def calc_zonal_stats_weights( + src: xr.DataArray, + weights_filepath: str, +) -> pd.DataFrame: + """Calculates zonal stats""" + + # Open weights dict from pickle + # This could probably be done once and passed as a reference. 
+ with open(weights_filepath, "rb") as f: + crosswalk_dict = pickle.load(f) + + r_array = src.values[0] + r_array[r_array == src.rio.nodata] = np.nan + + mean_dict = {} + for key, value in crosswalk_dict.items(): + mean_dict[key] = np.nanmean(r_array[value]) + + df = pd.DataFrame.from_dict(mean_dict, orient="index", columns=["value"]) + + df.reset_index(inplace=True, names="catchment_id") + + # This should not be needed, but without memory usage grows + del crosswalk_dict + del f + gc.collect() + + return df + + +def get_forcing_dict_RTIway( + pickle_file, # This would be a Feature list for parallel calling -- + # if there is a stored weights file, we use it + # (checking for an optional flag to force re-creation of the weights...) + folder_prefix, + file_list, +): + + var = "RAINRATE" + reng = "rasterio" + filehandles = [ + xr.open_dataset(folder_prefix / f, engine=reng)[var] for f in file_list + ] + # filehandles = [get_dataset("data/" + f, use_cache=True) for f in file_list] + stats = [] + + for _i, f in enumerate(filehandles): + print(f"{_i}, {round(_i/len(file_list), 2)*100}".ljust(40), end="\r") + stats.append(calc_zonal_stats_weights(f, pickle_file)) + + [f.close() for f in filehandles] + return stats + + +def get_forcing_dict_RTIway2( + pickle_file, # This would be a Feature list for parallel calling -- + # if there is a stored weights file, we use it + # (checking for an optional flag to force re-creation of the weights...) + gpkg_divides, + folder_prefix, + filelist, + var_list, +): + reng = "rasterio" + pick_val = "value" + + df_dict = {} + dl_dict = {} + for _v in var_list: + df_dict[_v] = pd.DataFrame(index=gpkg_divides.index) + dl_dict[_v] = [] + + # ds_list = [] + for _i, _nc_file in enumerate(filelist): + # _nc_file = ("nwm.t00z.medium_range.forcing.f001.conus.nc") + _full_nc_file = folder_prefix.joinpath(_nc_file) + + try: + # with xr.open_dataset(_full_nc_file, engine=reng) as _xds: + with xr.open_dataset(_full_nc_file) as _xds: + # _xds = ds_list[_i] + # _xds.rio.write_crs(rasterio.crs.CRS.from_wkt(CONUS_NWM_WKT), inplace=True) + print(f"{_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") + for _v in var_list: + _src = _xds[_v] + _df_zonal_stats = calc_zonal_stats_weights(_src, pickle_file) + # if adding statistics back to original GeoDataFrame + # gdf3 = pd.concat([gpkg_divides, _df_zonal_stats], axis=1) + _df = pd.DataFrame(index=gpkg_divides.index) + _df[_xds.time.values[0]] = _df_zonal_stats[pick_val] + # TODO: This same line could add the new values directly + # to the same dictionary. But after adding about 100 of them, + # pandas starts to complain about degraded performance due to + # fragmentation of the dataframe. We tried it this was as a + # workaround, with the loop below to accomplish the concatenation. + dl_dict[_v].append(_df) + except: + print(f"No such file: {_full_nc_file}") + + for _v in var_list: + df_dict[_v] = pd.concat(dl_dict[_v], axis=1) + + # [_xds.close() for _xds in ds_list] + + return df_dict + + +def main(): + """ + Primary function to retrieve hydrofabrics data and convert it into files that can be ingested into ngen. + Also, the forcing data is retrieved. 
+ + Inputs: JSON config file specifying start_date, end_date, and vpu + + Outputs: ngen catchment/nexus configs and forcing files + + Will store files in the same folder as the JSON config to run this script + """ + parser = argparse.ArgumentParser() + parser.add_argument(dest="infile", type=str, help="A json containing user inputs to run ngen") + args = parser.parse_args() + + # Take in user config + conf = json.load(open(args.infile)) + start_date = conf['forcing']['start_date'] + end_date = conf['forcing']['end_date'] + runinput = conf['forcing']['runinput'] + varinput = conf['forcing']['varinput'] + geoinput = conf['forcing']['geoinput'] + meminput = conf['forcing']['meminput'] + urlbaseinput = conf['forcing']['urlbaseinput'] + + vpu = conf['hydrofab']['vpu'] + # Subsetting ??? + + top_dir = os.path.dirname(args.infile) + data_dir = os.path.join(top_dir,'raw_forcing_data') + output_dir = os.path.join(top_dir,'catchment_forcing_data') + + if not os.path.exists(data_dir): + os.system(f'mkdir {data_dir}') + + if not os.path.exists(output_dir): + os.system(f'mkdir {output_dir}') + + # Generate list of file names to retrieve for forcing data + n = 6 + fcst_cycle = [n*x for x in range(24//n)] + lead_time = [x+1 for x in range(n)] + + # TODO: These need to be in the configuration file + + + print(f'Creating list of file names to pull...') + nwm_forcing_files = create_file_list( + runinput, + varinput, + geoinput, + meminput, + start_date, + end_date, + fcst_cycle, + urlbaseinput, + lead_time, + ) + + print(f'Pulling files...') + local_files = [] + for jfile in nwm_forcing_files: + file_parts = jfile.split('/') + local_file = os.path.join(data_dir,file_parts[-1]) + local_files.append(local_file) + if os.path.exists(local_file): + continue + else: + command = f'wget -P {data_dir} -c https://storage.googleapis.com/national-water-model/{jfile}' + os.system(command) + + # TODO wget this if needed + gpkg = '/home/jlaser/code/data/nextgen_03W.gpkg' + ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) + src = ds["RAINRATE"] + + # Why are we converting to paquet and then back into geopandas dataframe? + polygonfile = gpd.read_file(gpkg, layer="divides") + parq_file = os.path.join(data_dir,"ng_03.parquet") + polygonfile.to_parquet(parq_file) + pkl_file = os.path.join(data_dir,"weights.pkl") + generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") + calc_zonal_stats_weights(src, pkl_file) + + var_list = [ + "U2D", + "V2D", + "LWDOWN", + "RAINRATE", + "T2D", + "Q2D", + "PSFC", + "SWDOWN", + ] + + just_files = [] + for jfile in local_files: + splt = jfile.split('/') # Need a way to do this that doesn't break on windows + just_files.append(splt[-1]) + + fd2 = get_forcing_dict_RTIway2( + pkl_file, + polygonfile, + Path(data_dir), + just_files, + var_list, + ) + + # pcp_var and pcp_var2 are indentical? 
+ pcp_var = fd2["RAINRATE"] + lw_var = fd2["LWDOWN"] + sw_var = fd2["SWDOWN"] + sp_var = fd2["PSFC"] + tmp_var = fd2["T2D"] + u2d_var = fd2["U2D"] + v2d_var = fd2["V2D"] + pcp_var2 = fd2["RAINRATE"] + + ncatchments = len(polygonfile["id"]) + for _i in range(0, ncatchments): + + pcp_var_0 = pcp_var.transpose()[_i].rename("APCP_surface") + lw_var_0 = lw_var.transpose()[_i].rename("DLWRF_surface") + sw_var_0 = sw_var.transpose()[_i].rename("DSWRF_surface") + sp_var_0 = sp_var.transpose()[_i].rename("SPFH_2maboveground") + tmp_var_0 = tmp_var.transpose()[_i].rename("TMP_2maboveground") + u2d_var_0 = u2d_var.transpose()[_i].rename("UGRD_10maboveground") + v2d_var_0 = v2d_var.transpose()[_i].rename("VGRD_10maboveground") + pcp_var2_0 = pcp_var2.transpose()[_i].rename("precip_rate") ##BROKEN!! + + d = pd.concat( + [ + pcp_var_0, + lw_var_0, + sw_var_0, + sp_var_0, + tmp_var_0, + u2d_var_0, + v2d_var_0, + pcp_var2_0, + ], + axis=1, + ) + d.index.name = "time" + + id = polygonfile["id"][_i] + splt = id.split('-') + csvname = f"{output_dir}/cat{vpu}_{splt[1]}.csv" + d.to_csv(csvname) + + print(f'\n\nDone! Catchment forcing files have been generated for VPU {vpu} in {output_dir}\n\n') + +if __name__ == "__main__": + main() + \ No newline at end of file diff --git a/user_input_ngen.json b/user_input_ngen.json new file mode 100644 index 0000000..6529f90 --- /dev/null +++ b/user_input_ngen.json @@ -0,0 +1,15 @@ +{ + "forcing" : { + "start_date" : "20220822", + "end_date" : "20220822", + "runinput" : 2, + "varinput" : 5, + "geoinput" : 1, + "meminput" : 0, + "urlbaseinput" : null + }, + + "hydrofab" : { + "vpu" : 14 + } +} \ No newline at end of file From 890e6986a7ca2eb307e9303866bbfcc0bd48f996 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Wed, 26 Apr 2023 17:29:04 -0500 Subject: [PATCH 006/105] Speed up:implemented new data -> data frame and write functions --- prep_hydrofab_forcings_ngen.py | 130 +++++++++++++++++++++++++++------ user_input_ngen.json | 14 ++-- 2 files changed, 116 insertions(+), 28 deletions(-) diff --git a/prep_hydrofab_forcings_ngen.py b/prep_hydrofab_forcings_ngen.py index b423883..d885829 100644 --- a/prep_hydrofab_forcings_ngen.py +++ b/prep_hydrofab_forcings_ngen.py @@ -1,13 +1,12 @@ - # https://github.com/jameshalgren/data-access-examples/blob/DONOTMERGE_VPU16/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb # !pip install --upgrade google-api-python-client # !pip install --upgrade google-cloud-storage import pickle -import time import pandas as pd import argparse, os, json +from sys import getsizeof import gc from pathlib import Path import geopandas as gpd @@ -17,8 +16,11 @@ from google.cloud import storage from rasterio.io import MemoryFile from rasterio.features import rasterize +import rasterio +import time from nwm_filenames.listofnwmfilenames import create_file_list +from ngen_forcing.process_nwm_forcing_to_ngen import * TEMPLATE_BLOB_NAME = ( "nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" @@ -141,7 +143,6 @@ def generate_weights_file( gdf_proj = gdf.to_crs(CONUS_NWM_WKT) crosswalk_dict = {} - # This is a probably a really poor performing way to do this # TODO: Consider vectorizing -- would require digging into the # other end of these where we unpack the weights... 
@@ -163,7 +164,6 @@ def generate_weights_file( if i % 100 == 0: perc = i/len(gdf_proj)*100 print(f"{i}, {perc:.2f}%".ljust(40), end="\r") - if perc > 0.01: break i += 1 with open(weights_filepath, "wb") as f: @@ -176,7 +176,7 @@ def add_zonalstats_to_gdf_weights( weights_filepath: str, ) -> gpd.GeoDataFrame: """Calculates zonal stats and adds to GeoDataFrame""" - + df = calc_zonal_stats_weights(src, weights_filepath) gdf_map = gdf.merge(df, left_on="huc10", right_on="catchment_id") @@ -200,10 +200,37 @@ def get_blob(blob_name: str, bucket: str = NWM_BUCKET) -> bytes: bucket = client.bucket(bucket) return bucket.blob(blob_name).download_as_bytes(timeout=120) +def calc_zonal_stats_weights_new( + src: np.ndarray, + weights_filepath: str, +) -> pd.DataFrame: + """Calculates zonal stats""" + + # Open weights dict from pickle + # This could probably be done once and passed as a reference. + with open(weights_filepath, "rb") as f: + crosswalk_dict = pickle.load(f) + + nvar = src.shape[0] + mean_dict = {} + for key, value in crosswalk_dict.items(): + mean_dict[key] = np.zeros((nvar,),dtype=np.float64) + + mean_dict = {} + for key, value in crosswalk_dict.items(): + mean_dict[key] = np.nanmean(src[:,value[0],value[1]],axis=1) + + # This should not be needed, but without memory usage grows + del crosswalk_dict + del f + gc.collect() + + return mean_dict + def calc_zonal_stats_weights( src: xr.DataArray, - weights_filepath: str, + weights_filepath: str, ) -> pd.DataFrame: """Calculates zonal stats""" @@ -230,7 +257,6 @@ def calc_zonal_stats_weights( return df - def get_forcing_dict_RTIway( pickle_file, # This would be a Feature list for parallel calling -- # if there is a stored weights file, we use it @@ -254,6 +280,39 @@ def get_forcing_dict_RTIway( [f.close() for f in filehandles] return stats +def get_forcing_dict_JL( + pickle_file, + folder_prefix, + filelist, + var_list, + var_list_out +): + t1 = time.perf_counter() + df_by_t = [] + for _i, _nc_file in enumerate(filelist): + _full_nc_file = folder_prefix.joinpath(_nc_file) + print(f"Indexing data out of {_full_nc_file} {_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") + with xr.open_dataset(_full_nc_file) as _xds: + shp = _xds['U2D'].shape + data_allvars = np.zeros( + shape=(len(var_list),shp[1],shp[2]), + dtype=_xds['U2D'].dtype) + for var_dx, jvar in enumerate(var_list): + data_allvars[var_dx,:,:] = np.squeeze(_xds[jvar].values) + _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, pickle_file) + df_by_t.append(_df_zonal_stats) + + print(f'Reformating and converting data into dataframe') + dfs = {} + for jcat in list(df_by_t[0].keys()): + data_catch = [] + for jt in range(len(df_by_t)): + data_catch.append(df_by_t[jt][jcat]) + dfs[jcat] = pd.DataFrame(data_catch,columns = var_list_out) + + print(f"Indexing data and generating the dataframes (JL) {time.perf_counter() - t1:.2f} s") + + return dfs def get_forcing_dict_RTIway2( pickle_file, # This would be a Feature list for parallel calling -- @@ -264,6 +323,7 @@ def get_forcing_dict_RTIway2( filelist, var_list, ): + t1=time.perf_counter() reng = "rasterio" pick_val = "value" @@ -304,6 +364,7 @@ def get_forcing_dict_RTIway2( df_dict[_v] = pd.concat(dl_dict[_v], axis=1) # [_xds.close() for _xds in ds_list] + print(f"Indexing data and generating the dataframes (RTI) {time.perf_counter() - t1:.2f} s") return df_dict @@ -336,25 +397,20 @@ def main(): vpu = conf['hydrofab']['vpu'] # Subsetting ??? 
+ # Set paths and make directories if needed top_dir = os.path.dirname(args.infile) data_dir = os.path.join(top_dir,'raw_forcing_data') output_dir = os.path.join(top_dir,'catchment_forcing_data') - if not os.path.exists(data_dir): os.system(f'mkdir {data_dir}') - if not os.path.exists(output_dir): os.system(f'mkdir {output_dir}') # Generate list of file names to retrieve for forcing data + print(f'Creating list of file names to pull...') n = 6 fcst_cycle = [n*x for x in range(24//n)] lead_time = [x+1 for x in range(n)] - - # TODO: These need to be in the configuration file - - - print(f'Creating list of file names to pull...') nwm_forcing_files = create_file_list( runinput, varinput, @@ -367,7 +423,7 @@ def main(): lead_time, ) - print(f'Pulling files...') + # Check to see if we have files cached, if not wget them local_files = [] for jfile in nwm_forcing_files: file_parts = jfile.split('/') @@ -389,33 +445,64 @@ def main(): parq_file = os.path.join(data_dir,"ng_03.parquet") polygonfile.to_parquet(parq_file) pkl_file = os.path.join(data_dir,"weights.pkl") - generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") - calc_zonal_stats_weights(src, pkl_file) + print("Generating weights") + t1 = time.perf_counter() + # generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") + print(f"Generating the weights took {time.perf_counter() - t1:.2f} s") var_list = [ "U2D", "V2D", "LWDOWN", "RAINRATE", "T2D", - "Q2D", "PSFC", "SWDOWN", ] + var_list_out = [ + "UGRD_10maboveground", + "VGRD_10maboveground", + "DLWRF_surface", + "APCP_surface", + "TMP_2maboveground", + "SPFH_2maboveground", + "DSWRF_surface", + ] + just_files = [] for jfile in local_files: splt = jfile.split('/') # Need a way to do this that doesn't break on windows just_files.append(splt[-1]) + + fd2 = get_forcing_dict_JL( + pkl_file, + Path(data_dir), + just_files, + var_list, + var_list_out, + ) + ncatch_out = len(fd2.keys()) + + t0 = time.perf_counter() + for jcatch in fd2.keys(): + arr = fd2[jcatch] + splt = jcatch.split('-') + csvname = f"{output_dir}/cat{vpu}_{splt[1]}.csv" + arr.to_csv(csvname) + + print(f'JL write took {time.perf_counter() - t0:.2f} s') + fd2 = get_forcing_dict_RTIway2( pkl_file, polygonfile, Path(data_dir), just_files, var_list, - ) + ) + t0 = time.perf_counter() # pcp_var and pcp_var2 are indentical? pcp_var = fd2["RAINRATE"] lw_var = fd2["LWDOWN"] @@ -426,8 +513,7 @@ def main(): v2d_var = fd2["V2D"] pcp_var2 = fd2["RAINRATE"] - ncatchments = len(polygonfile["id"]) - for _i in range(0, ncatchments): + for _i in range(0, ncatch_out): pcp_var_0 = pcp_var.transpose()[_i].rename("APCP_surface") lw_var_0 = lw_var.transpose()[_i].rename("DLWRF_surface") @@ -458,6 +544,8 @@ def main(): csvname = f"{output_dir}/cat{vpu}_{splt[1]}.csv" d.to_csv(csvname) + print(f'RTI write took {time.perf_counter() - t0:.2f} s') + print(f'\n\nDone! 
Catchment forcing files have been generated for VPU {vpu} in {output_dir}\n\n') if __name__ == "__main__": diff --git a/user_input_ngen.json b/user_input_ngen.json index 6529f90..e3d5bef 100644 --- a/user_input_ngen.json +++ b/user_input_ngen.json @@ -1,15 +1,15 @@ { "forcing" : { - "start_date" : "20220822", - "end_date" : "20220822", - "runinput" : 2, - "varinput" : 5, - "geoinput" : 1, - "meminput" : 0, + "start_date" : "20220822", + "end_date" : "20220822", + "runinput" : 2, + "varinput" : 5, + "geoinput" : 1, + "meminput" : 0, "urlbaseinput" : null }, "hydrofab" : { - "vpu" : 14 + "vpu" : 14 } } \ No newline at end of file From 83ac0b2ce710d28b0da46ead8272c3bda0581aa1 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Wed, 26 Apr 2023 18:11:53 -0500 Subject: [PATCH 007/105] Included preop_rate (which is broken) --- prep_hydrofab_forcings_ngen.py | 131 +++++++++++++++++++-------------- 1 file changed, 77 insertions(+), 54 deletions(-) diff --git a/prep_hydrofab_forcings_ngen.py b/prep_hydrofab_forcings_ngen.py index d885829..017c922 100644 --- a/prep_hydrofab_forcings_ngen.py +++ b/prep_hydrofab_forcings_ngen.py @@ -68,6 +68,28 @@ ROUTE_LINK_FILE = os.path.join(NWM_CACHE_DIR, "RouteLink_CONUS.nc") ROUTE_LINK_PARQUET = os.path.join(NWM_CACHE_DIR, "route_link_conus.parquet") +# TODO: Implemenent these function to appropriately calculate precip_rate +def rho(temp): + """ + Calculate water density at temperature + """ + return 999.99399 + 0.04216485*temp - 0.007097451*(temp**2) + 0.00003509571*(temp**3) - 9.9037785E-8*(temp**4) + +def aorc_as_rate(dataFrame): + """ + Convert kg/m^2 -> m/s + """ + if isinstance(dataFrame.index, pd.MultiIndex): + interval = pd.Series(dataFrame.index.get_level_values(0)) + else: + interval = pd.Series(dataFrame.index) + interval = ( interval.shift(-1) - interval ) / np.timedelta64(1, 's') + interval.index = dataFrame.index + precip_rate = ( dataFrame['APCP_surface'].shift(-1) / dataFrame['TMP_2maboveground'].apply(rho) ) / interval + precip_rate.name = 'precip_rate' + return precip_rate + +###### def parquet_to_gdf(parquet_filepath: str) -> gpd.GeoDataFrame: gdf = gpd.read_parquet(parquet_filepath) @@ -291,7 +313,7 @@ def get_forcing_dict_JL( df_by_t = [] for _i, _nc_file in enumerate(filelist): _full_nc_file = folder_prefix.joinpath(_nc_file) - print(f"Indexing data out of {_full_nc_file} {_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") + print(f"Data indexing progress -> {_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") with xr.open_dataset(_full_nc_file) as _xds: shp = _xds['U2D'].shape data_allvars = np.zeros( @@ -302,7 +324,7 @@ def get_forcing_dict_JL( _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, pickle_file) df_by_t.append(_df_zonal_stats) - print(f'Reformating and converting data into dataframe') + print(f'Reformating and converting data into dataframe', end="\r") dfs = {} for jcat in list(df_by_t[0].keys()): data_catch = [] @@ -455,6 +477,7 @@ def main(): "V2D", "LWDOWN", "RAINRATE", + "RAINRATE", "T2D", "PSFC", "SWDOWN", @@ -465,6 +488,7 @@ def main(): "VGRD_10maboveground", "DLWRF_surface", "APCP_surface", + "precip_rate", # BROKEN "TMP_2maboveground", "SPFH_2maboveground", "DSWRF_surface", @@ -483,7 +507,6 @@ def main(): var_list, var_list_out, ) - ncatch_out = len(fd2.keys()) t0 = time.perf_counter() for jcatch in fd2.keys(): @@ -494,57 +517,57 @@ def main(): print(f'JL write took {time.perf_counter() - t0:.2f} s') - fd2 = get_forcing_dict_RTIway2( - pkl_file, - polygonfile, - Path(data_dir), - just_files, - 
var_list, - ) - - t0 = time.perf_counter() - # pcp_var and pcp_var2 are indentical? - pcp_var = fd2["RAINRATE"] - lw_var = fd2["LWDOWN"] - sw_var = fd2["SWDOWN"] - sp_var = fd2["PSFC"] - tmp_var = fd2["T2D"] - u2d_var = fd2["U2D"] - v2d_var = fd2["V2D"] - pcp_var2 = fd2["RAINRATE"] - - for _i in range(0, ncatch_out): - - pcp_var_0 = pcp_var.transpose()[_i].rename("APCP_surface") - lw_var_0 = lw_var.transpose()[_i].rename("DLWRF_surface") - sw_var_0 = sw_var.transpose()[_i].rename("DSWRF_surface") - sp_var_0 = sp_var.transpose()[_i].rename("SPFH_2maboveground") - tmp_var_0 = tmp_var.transpose()[_i].rename("TMP_2maboveground") - u2d_var_0 = u2d_var.transpose()[_i].rename("UGRD_10maboveground") - v2d_var_0 = v2d_var.transpose()[_i].rename("VGRD_10maboveground") - pcp_var2_0 = pcp_var2.transpose()[_i].rename("precip_rate") ##BROKEN!! - - d = pd.concat( - [ - pcp_var_0, - lw_var_0, - sw_var_0, - sp_var_0, - tmp_var_0, - u2d_var_0, - v2d_var_0, - pcp_var2_0, - ], - axis=1, - ) - d.index.name = "time" - - id = polygonfile["id"][_i] - splt = id.split('-') - csvname = f"{output_dir}/cat{vpu}_{splt[1]}.csv" - d.to_csv(csvname) - - print(f'RTI write took {time.perf_counter() - t0:.2f} s') + # fd2 = get_forcing_dict_RTIway2( + # pkl_file, + # polygonfile, + # Path(data_dir), + # just_files, + # var_list, + # ) + + # t0 = time.perf_counter() + # # pcp_var and pcp_var2 are indentical? + # pcp_var = fd2["RAINRATE"] + # lw_var = fd2["LWDOWN"] + # sw_var = fd2["SWDOWN"] + # sp_var = fd2["PSFC"] + # tmp_var = fd2["T2D"] + # u2d_var = fd2["U2D"] + # v2d_var = fd2["V2D"] + # pcp_var2 = fd2["RAINRATE"] + + # for _i in range(0, ncatch_out): + + # pcp_var_0 = pcp_var.transpose()[_i].rename("APCP_surface") + # lw_var_0 = lw_var.transpose()[_i].rename("DLWRF_surface") + # sw_var_0 = sw_var.transpose()[_i].rename("DSWRF_surface") + # sp_var_0 = sp_var.transpose()[_i].rename("SPFH_2maboveground") + # tmp_var_0 = tmp_var.transpose()[_i].rename("TMP_2maboveground") + # u2d_var_0 = u2d_var.transpose()[_i].rename("UGRD_10maboveground") + # v2d_var_0 = v2d_var.transpose()[_i].rename("VGRD_10maboveground") + # pcp_var2_0 = pcp_var2.transpose()[_i].rename("precip_rate") ##BROKEN!! + + # d = pd.concat( + # [ + # pcp_var_0, + # lw_var_0, + # sw_var_0, + # sp_var_0, + # tmp_var_0, + # u2d_var_0, + # v2d_var_0, + # pcp_var2_0, + # ], + # axis=1, + # ) + # d.index.name = "time" + + # id = polygonfile["id"][_i] + # splt = id.split('-') + # csvname = f"{output_dir}/cat{vpu}_{splt[1]}.csv" + # d.to_csv(csvname) + + # print(f'RTI write took {time.perf_counter() - t0:.2f} s') print(f'\n\nDone! 
Catchment forcing files have been generated for VPU {vpu} in {output_dir}\n\n') From b8c95e2d10ab95034b44bd0ffc67563c908ab2d4 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 1 May 2023 13:19:41 -0500 Subject: [PATCH 008/105] user input markdown --- user_input_ngen.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 user_input_ngen.md diff --git a/user_input_ngen.md b/user_input_ngen.md new file mode 100644 index 0000000..d25fc7d --- /dev/null +++ b/user_input_ngen.md @@ -0,0 +1,33 @@ +# Manual for ngen user options + +## Example +filename = 'user_input_ngen.json' + +contents: + + { + "forcing" : { + "start_date" : "20220822", + "end_date" : "20220822", + "runinput" : 2, + "varinput" : 5, + "geoinput" : 1, + "meminput" : 0, + "urlbaseinput" : null + }, + + "hydrofab" : { + "vpu" : 14 + } + } + +### forcing +| Field Name | Data Type | Description | +| --- | --- | --- | +| start_date | `string` | YYYYMMDD | +| end_date | `string` | YYYYMMDD | +| runinput | `int` |
1. short_range <br> 2. medium_range <br> 3. medium_range_no_da <br> 4. long_range <br> 5. analysis_assim <br> 6. analysis_assim_extend <br> 7. analysis_assim_extend_no_da <br> 8. analysis_assim_long <br> 9. analysis_assim_long_no_da <br> 10. analysis_assim_no_da <br> 11. short_range_no_da |
| varinput | `int` | 1. channel_rt: for real-time channel data <br> 2. land: for land data <br> 3. reservoir: for reservoir data <br> 4. terrain_rt: for real-time terrain data <br> 5. forcing: for forcing data |
| geoinput | `int` | 1. conus: for continental US <br> 2. hawaii: for Hawaii <br> 3. puertorico: for Puerto Rico |
| meminput | `int` | 1. mem_1 <br> 2. mem_2 <br> 3. mem_3 <br> 4. mem_4 <br> 5. mem_5 <br> 6. mem_6 <br> 7. mem_7 |
| urlbaseinput | `int` | 1. Empty string: use local files <br> 2. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod/: for real-time operational data from NOAA <br> 3. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/post-processed/WMS/: for post-processed data from NOAA's Web Map Service <br> 4. https://storage.googleapis.com/national-water-model/: for input/output data stored on Google Cloud Storage <br> 5. https://storage.cloud.google.com/national-water-model/: for input/output data stored on Google Cloud Storage <br> 6. gs://national-water-model/: for input/output data stored on Google Cloud Storage <br> 7. https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/: for retrospective data from AWS S3 <br> 8. s3://noaa-nwm-retrospective-2-1-pds/model_output/: for retrospective data from AWS S3
| From c0770d76a8a3f8e9769d3d4dec7e628052686c3e Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 1 May 2023 13:26:59 -0500 Subject: [PATCH 009/105] added hydrofab field --- user_input_ngen.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/user_input_ngen.md b/user_input_ngen.md index d25fc7d..953a520 100644 --- a/user_input_ngen.md +++ b/user_input_ngen.md @@ -31,3 +31,9 @@ contents: | geoinput | `int` |
1. conus: for continental US <br> 2. hawaii: for Hawaii <br> 3. puertorico: for Puerto Rico |
| meminput | `int` | 1. mem_1 <br> 2. mem_2 <br> 3. mem_3 <br> 4. mem_4 <br> 5. mem_5 <br> 6. mem_6 <br> 7. mem_7 |
| urlbaseinput | `int` | 1. Empty string: use local files <br> 2. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod/: for real-time operational data from NOAA <br> 3. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/post-processed/WMS/: for post-processed data from NOAA's Web Map Service <br> 4. https://storage.googleapis.com/national-water-model/: for input/output data stored on Google Cloud Storage <br> 5. https://storage.cloud.google.com/national-water-model/: for input/output data stored on Google Cloud Storage <br> 6. gs://national-water-model/: for input/output data stored on Google Cloud Storage <br> 7. https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/: for retrospective data from AWS S3 <br> 8. s3://noaa-nwm-retrospective-2-1-pds/model_output/: for retrospective data from AWS S3
| + + +### hydrofab +| Field Name | Data Type | Description | +| --- | --- | --- | +| vpu | `int` | Check here for map of VPUs https://noaa-owp.github.io/hydrofabric/articles/data_access.html | From 6d0eb21ffe59167f024c713c717b38376e23e628 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 1 May 2023 14:33:09 -0500 Subject: [PATCH 010/105] gitignore initial commit --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c6e39ed --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.vscode/ +data/* +nwm_filenames/__pycache__/ +subsetting/__pycache__/ +venv/ \ No newline at end of file From 84ac5a2a96b2b258cf266f65aa786f57c3823967 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 1 May 2023 15:38:15 -0500 Subject: [PATCH 011/105] Added options and explanations --- user_input_ngen.json | 10 +++++++--- user_input_ngen.md | 9 ++++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/user_input_ngen.json b/user_input_ngen.json index e3d5bef..a6e757f 100644 --- a/user_input_ngen.json +++ b/user_input_ngen.json @@ -6,10 +6,14 @@ "varinput" : 5, "geoinput" : 1, "meminput" : 0, - "urlbaseinput" : null + "urlbaseinput" : 3 }, "hydrofab" : { - "vpu" : 14 - } + "vpu" : "03W" + }, + + "verbose" : true, + "output_dir" : "local", + "cache" : true } \ No newline at end of file diff --git a/user_input_ngen.md b/user_input_ngen.md index 953a520..eacadf6 100644 --- a/user_input_ngen.md +++ b/user_input_ngen.md @@ -36,4 +36,11 @@ contents: ### hydrofab | Field Name | Data Type | Description | | --- | --- | --- | -| vpu | `int` | Check here for map of VPUs https://noaa-owp.github.io/hydrofabric/articles/data_access.html | +| vpu | `string` | Check here for map of VPUs https://noaa-owp.github.io/hydrofabric/articles/data_access.html | + +### other options +| Field Name | Data Type | Description | +| --- | --- | --- | +| verbose | `bool` | Print raw forcing files | +| output_dir | `string` |
  1. "local" : output to ./data/catchment_forcing_data/
| +| cache | `bool` |
  • true: store forcing files locally
  • false: interact with forcing files remotely
  • | From 3bd2d8e6c65dd68ebbf9d7fe520893881e753653 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 1 May 2023 15:40:22 -0500 Subject: [PATCH 012/105] Updated example --- user_input_ngen.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/user_input_ngen.md b/user_input_ngen.md index eacadf6..3c6b82e 100644 --- a/user_input_ngen.md +++ b/user_input_ngen.md @@ -13,12 +13,16 @@ contents: "varinput" : 5, "geoinput" : 1, "meminput" : 0, - "urlbaseinput" : null + "urlbaseinput" : 3 }, "hydrofab" : { - "vpu" : 14 - } + "vpu" : "03W" + }, + + "verbose" : true, + "output_dir" : "local", + "cache" : true } ### forcing From c984dfdaa031974fb2844c707dd5ea2ecd26b9bf Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 1 May 2023 15:55:37 -0500 Subject: [PATCH 013/105] Made many of the operations conditional and added options --- prep_hydrofab_forcings_ngen.py | 197 ++++++++++++++++----------------- 1 file changed, 93 insertions(+), 104 deletions(-) diff --git a/prep_hydrofab_forcings_ngen.py b/prep_hydrofab_forcings_ngen.py index 017c922..172b585 100644 --- a/prep_hydrofab_forcings_ngen.py +++ b/prep_hydrofab_forcings_ngen.py @@ -16,7 +16,6 @@ from google.cloud import storage from rasterio.io import MemoryFile from rasterio.features import rasterize -import rasterio import time from nwm_filenames.listofnwmfilenames import create_file_list @@ -44,7 +43,7 @@ PARAMETER["standard_parallel_2",18.1],PARAMETER["latitude_of_origin",18.1],UNIT["Meter",1.0]]' # paths -CACHE_DIR = Path(Path.home(), "code", "data_access_examples", "raw_forcing_data") +CACHE_DIR = Path(Path.home(), "code", "data_access_examples", "data", "raw_forcing_data") NWM_CACHE_DIR = os.path.join(CACHE_DIR, "nwm") USGS_CACHE_DIR = os.path.join(CACHE_DIR, "usgs") GEO_CACHE_DIR = os.path.join(CACHE_DIR, "geo") @@ -390,6 +389,12 @@ def get_forcing_dict_RTIway2( return df_dict +def wget(cmd,name): + resp = os.system(cmd) + if resp > 0: + raise Exception (f'\nwget failed! Tried: {name}\n') + else: + print(f'Successful download of {name}') def main(): """ @@ -402,31 +407,42 @@ def main(): Will store files in the same folder as the JSON config to run this script """ + + t00 = time.perf_counter() + parser = argparse.ArgumentParser() parser.add_argument(dest="infile", type=str, help="A json containing user inputs to run ngen") - args = parser.parse_args() + args = parser.parse_args() # Take in user config conf = json.load(open(args.infile)) - start_date = conf['forcing']['start_date'] - end_date = conf['forcing']['end_date'] - runinput = conf['forcing']['runinput'] - varinput = conf['forcing']['varinput'] - geoinput = conf['forcing']['geoinput'] - meminput = conf['forcing']['meminput'] + start_date = conf['forcing']['start_date'] + end_date = conf['forcing']['end_date'] + runinput = conf['forcing']['runinput'] + varinput = conf['forcing']['varinput'] + geoinput = conf['forcing']['geoinput'] + meminput = conf['forcing']['meminput'] urlbaseinput = conf['forcing']['urlbaseinput'] + vpu = conf['hydrofab']['vpu'] + ii_verbose = conf['verbose'] + output_dir = conf['output_dir'] + ii_cache = conf['output_dir'] - vpu = conf['hydrofab']['vpu'] - # Subsetting ??? + # TODO: Subsetting! 
+ # # Set paths and make directories if needed top_dir = os.path.dirname(args.infile) - data_dir = os.path.join(top_dir,'raw_forcing_data') - output_dir = os.path.join(top_dir,'catchment_forcing_data') - if not os.path.exists(data_dir): - os.system(f'mkdir {data_dir}') - if not os.path.exists(output_dir): - os.system(f'mkdir {output_dir}') + if not os.path.exists(CACHE_DIR): + os.system(f'mkdir {CACHE_DIR}') + + # TODO: Be able to write to anywhere we want (especially AWS bucket) + if output_dir == "local": + output_dir = Path(top_dir,'data/catchment_forcing_data') + if not os.path.exists(output_dir): + os.system(f'mkdir {output_dir}') + else: + raise NotImplementedError(f"{output_dir} is not an option for output_dir") # Generate list of file names to retrieve for forcing data print(f'Creating list of file names to pull...') @@ -446,32 +462,63 @@ def main(): ) # Check to see if we have files cached, if not wget them - local_files = [] - for jfile in nwm_forcing_files: - file_parts = jfile.split('/') - local_file = os.path.join(data_dir,file_parts[-1]) - local_files.append(local_file) - if os.path.exists(local_file): - continue - else: - command = f'wget -P {data_dir} -c https://storage.googleapis.com/national-water-model/{jfile}' - os.system(command) - - # TODO wget this if needed - gpkg = '/home/jlaser/code/data/nextgen_03W.gpkg' - ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) - src = ds["RAINRATE"] - - # Why are we converting to paquet and then back into geopandas dataframe? - polygonfile = gpd.read_file(gpkg, layer="divides") - parq_file = os.path.join(data_dir,"ng_03.parquet") - polygonfile.to_parquet(parq_file) - pkl_file = os.path.join(data_dir,"weights.pkl") + if ii_cache: + local_files = [] + for jfile in nwm_forcing_files: + if ii_verbose: print(f'Looking for {jfile}') + file_parts = Path(jfile).parts + + local_file = os.path.join(CACHE_DIR,file_parts[-1]) + local_files.append(local_file) + if os.path.exists(local_file): + if ii_verbose: print(f'Found and using raw forcing file {local_file}') + continue + else: + if ii_verbose: print(f'Forcing file not found! Downloading {jfile}') + command = f'wget -P {CACHE_DIR} -c {jfile}' + wget(command,jfile) + + cache_files = [] + for jfile in local_files: + splt = Path(jfile).parts + cache_files.append(splt[-1]) + + forcing_files = cache_files # interacting with files locally + else: + forcing_files = nwm_forcing_files # interacting with files remotely + + # Do we need a parquet file? 
+ # parq_file = os.path.join(CACHE_DIR,"ng_03.parquet") + # polygonfile.to_parquet(parq_file) + + # Generate weight file only if one doesn't exist already + # Very time consuming so we don't want to do this if we can avoid it + pkl_file = os.path.join(CACHE_DIR,"weights.pkl") + if not os.path.exists(pkl_file): + # Search for geopackage that matches the requested VPU, if it exists + gpkg = None + for jfile in os.listdir(os.path.join(top_dir,'data')): + if jfile.find(vpu) >= 0: + gpkg = Path(top_dir,"data",jfile) + print(f'Found and using geopackge file {gpkg}') + if gpkg == None: + url = f'https://nextgen-hydrofabric.s3.amazonaws.com/05_nextgen/nextgen_{vpu}.gpkg' + command = f'wget -P {CACHE_DIR} -c {url}' + wget(command,url) + + print(f'Opening {gpkg}...') + polygonfile = gpd.read_file(gpkg, layer="divides") + + ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) + src = ds["RAINRATE"] + + print("Generating weights") + t1 = time.perf_counter() + generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") + print(f"Generating the weights took {time.perf_counter() - t1:.2f} s") + else: + print(f"Not creating weight file! Delete this if you want to create a new one: {pkl_file}") - print("Generating weights") - t1 = time.perf_counter() - # generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") - print(f"Generating the weights took {time.perf_counter() - t1:.2f} s") var_list = [ "U2D", "V2D", @@ -488,22 +535,16 @@ def main(): "VGRD_10maboveground", "DLWRF_surface", "APCP_surface", - "precip_rate", # BROKEN + "precip_rate", # BROKEN (Identical to APCP!) "TMP_2maboveground", "SPFH_2maboveground", "DSWRF_surface", ] - - just_files = [] - for jfile in local_files: - splt = jfile.split('/') # Need a way to do this that doesn't break on windows - just_files.append(splt[-1]) - fd2 = get_forcing_dict_JL( pkl_file, - Path(data_dir), - just_files, + CACHE_DIR, + forcing_files, var_list, var_list_out, ) @@ -516,60 +557,8 @@ def main(): arr.to_csv(csvname) print(f'JL write took {time.perf_counter() - t0:.2f} s') - - # fd2 = get_forcing_dict_RTIway2( - # pkl_file, - # polygonfile, - # Path(data_dir), - # just_files, - # var_list, - # ) - - # t0 = time.perf_counter() - # # pcp_var and pcp_var2 are indentical? - # pcp_var = fd2["RAINRATE"] - # lw_var = fd2["LWDOWN"] - # sw_var = fd2["SWDOWN"] - # sp_var = fd2["PSFC"] - # tmp_var = fd2["T2D"] - # u2d_var = fd2["U2D"] - # v2d_var = fd2["V2D"] - # pcp_var2 = fd2["RAINRATE"] - - # for _i in range(0, ncatch_out): - - # pcp_var_0 = pcp_var.transpose()[_i].rename("APCP_surface") - # lw_var_0 = lw_var.transpose()[_i].rename("DLWRF_surface") - # sw_var_0 = sw_var.transpose()[_i].rename("DSWRF_surface") - # sp_var_0 = sp_var.transpose()[_i].rename("SPFH_2maboveground") - # tmp_var_0 = tmp_var.transpose()[_i].rename("TMP_2maboveground") - # u2d_var_0 = u2d_var.transpose()[_i].rename("UGRD_10maboveground") - # v2d_var_0 = v2d_var.transpose()[_i].rename("VGRD_10maboveground") - # pcp_var2_0 = pcp_var2.transpose()[_i].rename("precip_rate") ##BROKEN!! - - # d = pd.concat( - # [ - # pcp_var_0, - # lw_var_0, - # sw_var_0, - # sp_var_0, - # tmp_var_0, - # u2d_var_0, - # v2d_var_0, - # pcp_var2_0, - # ], - # axis=1, - # ) - # d.index.name = "time" - - # id = polygonfile["id"][_i] - # splt = id.split('-') - # csvname = f"{output_dir}/cat{vpu}_{splt[1]}.csv" - # d.to_csv(csvname) - - # print(f'RTI write took {time.perf_counter() - t0:.2f} s') - print(f'\n\nDone! 
Catchment forcing files have been generated for VPU {vpu} in {output_dir}\n\n') + print(f'Total run time: {time.perf_counter() - t00:.2f} s') if __name__ == "__main__": main() From fe903108e7de5495ff79795b93ee2b69b21965f1 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Wed, 3 May 2023 14:35:26 -0500 Subject: [PATCH 014/105] Added options to write to S3 bucket in either csv or parquet --- prep_hydrofab_forcings_ngen.py | 109 ++++++++++++++++++++++----------- user_input_ngen.json | 7 ++- user_input_ngen.md | 5 +- 3 files changed, 82 insertions(+), 39 deletions(-) diff --git a/prep_hydrofab_forcings_ngen.py b/prep_hydrofab_forcings_ngen.py index 172b585..d23df19 100644 --- a/prep_hydrofab_forcings_ngen.py +++ b/prep_hydrofab_forcings_ngen.py @@ -6,7 +6,8 @@ import pickle import pandas as pd import argparse, os, json -from sys import getsizeof +import pyarrow as pa +import pyarrow.parquet as pq import gc from pathlib import Path import geopandas as gpd @@ -17,6 +18,8 @@ from rasterio.io import MemoryFile from rasterio.features import rasterize import time +import boto3 +from io import StringIO, BytesIO from nwm_filenames.listofnwmfilenames import create_file_list from ngen_forcing.process_nwm_forcing_to_ngen import * @@ -182,9 +185,9 @@ def generate_weights_file( else: crosswalk_dict[index] = np.where(geom_rasterize == 1) - if i % 100 == 0: - perc = i/len(gdf_proj)*100 - print(f"{i}, {perc:.2f}%".ljust(40), end="\r") + # if i % 100 == 0: + # perc = i/len(gdf_proj)*100 + # print(f"{i}, {perc:.2f}%".ljust(40), end="\r") i += 1 with open(weights_filepath, "wb") as f: @@ -303,17 +306,14 @@ def get_forcing_dict_RTIway( def get_forcing_dict_JL( pickle_file, - folder_prefix, filelist, var_list, var_list_out ): t1 = time.perf_counter() df_by_t = [] - for _i, _nc_file in enumerate(filelist): - _full_nc_file = folder_prefix.joinpath(_nc_file) - print(f"Data indexing progress -> {_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") - with xr.open_dataset(_full_nc_file) as _xds: + for _i, _nc_file in enumerate(filelist): + with xr.open_dataset(_nc_file) as _xds: shp = _xds['U2D'].shape data_allvars = np.zeros( shape=(len(var_list),shp[1],shp[2]), @@ -322,8 +322,9 @@ def get_forcing_dict_JL( data_allvars[var_dx,:,:] = np.squeeze(_xds[jvar].values) _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, pickle_file) df_by_t.append(_df_zonal_stats) + print(f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}%", end="\r") - print(f'Reformating and converting data into dataframe', end="\r") + print(f'Reformating and converting data into dataframe') dfs = {} for jcat in list(df_by_t[0].keys()): data_catch = [] @@ -331,7 +332,7 @@ def get_forcing_dict_JL( data_catch.append(df_by_t[jt][jcat]) dfs[jcat] = pd.DataFrame(data_catch,columns = var_list_out) - print(f"Indexing data and generating the dataframes (JL) {time.perf_counter() - t1:.2f} s") + print(f"Indexing data and generating the dataframes (JL) {time.perf_counter() - t1:.2f}s") return dfs @@ -425,8 +426,18 @@ def main(): urlbaseinput = conf['forcing']['urlbaseinput'] vpu = conf['hydrofab']['vpu'] ii_verbose = conf['verbose'] - output_dir = conf['output_dir'] - ii_cache = conf['output_dir'] + bucket_type = conf['bucket_type'] + bucket_name = conf['bucket_name'] + file_prefix = conf['file_prefix'] + file_type = conf['file_type'] + ii_cache = conf['cache'] + + file_types = ['csv','parquet'] + assert file_type in file_types,f'{file_type} for file_type is not accepted! 
Accepted: {file_types}' + + bucket_types = ['local','S3'] + assert bucket_type in bucket_types,f'{bucket_type} for bucket_type is not accepted! Accepted: {bucket_types}' + # TODO: Subsetting! # @@ -435,14 +446,18 @@ def main(): top_dir = os.path.dirname(args.infile) if not os.path.exists(CACHE_DIR): os.system(f'mkdir {CACHE_DIR}') - - # TODO: Be able to write to anywhere we want (especially AWS bucket) - if output_dir == "local": - output_dir = Path(top_dir,'data/catchment_forcing_data') - if not os.path.exists(output_dir): - os.system(f'mkdir {output_dir}') - else: - raise NotImplementedError(f"{output_dir} is not an option for output_dir") + if not os.path.exists(CACHE_DIR): + raise Exception(f'Creating {CACHE_DIR} failed!') + + # Prep output directory + if bucket_type == "local": + bucket_path = Path(top_dir,file_prefix,bucket_name) + if not os.path.exists(bucket_path): + os.system(f'mkdir {bucket_path}') + if not os.path.exists(bucket_path): + raise Exception(f'Creating {bucket_path} failed!') + elif bucket_type == 'S3': + s3 = boto3.client('s3') # Generate list of file names to retrieve for forcing data print(f'Creating list of file names to pull...') @@ -461,8 +476,10 @@ def main(): lead_time, ) - # Check to see if we have files cached, if not wget them + # Download whole files and store locally if cache is true, + # otherwise index remotely and save catchment based forcings if ii_cache: + # Check to see if we have files cached, if not wget them local_files = [] for jfile in nwm_forcing_files: if ii_verbose: print(f'Looking for {jfile}') @@ -478,12 +495,7 @@ def main(): command = f'wget -P {CACHE_DIR} -c {jfile}' wget(command,jfile) - cache_files = [] - for jfile in local_files: - splt = Path(jfile).parts - cache_files.append(splt[-1]) - - forcing_files = cache_files # interacting with files locally + forcing_files = local_files # interacting with files locally else: forcing_files = nwm_forcing_files # interacting with files remotely @@ -543,21 +555,46 @@ def main(): fd2 = get_forcing_dict_JL( pkl_file, - CACHE_DIR, forcing_files, var_list, var_list_out, ) + # Write CSVs to file t0 = time.perf_counter() - for jcatch in fd2.keys(): - arr = fd2[jcatch] + write_int = 100 + write_break = 1000 + for j, jcatch in enumerate(fd2.keys()): + df = fd2[jcatch] splt = jcatch.split('-') - csvname = f"{output_dir}/cat{vpu}_{splt[1]}.csv" - arr.to_csv(csvname) - - print(f'JL write took {time.perf_counter() - t0:.2f} s') - print(f'\n\nDone! Catchment forcing files have been generated for VPU {vpu} in {output_dir}\n\n') + + if bucket_type == 'local': + if file_type == 'csv': + csvname = Path(bucket_path,f"cat{vpu}_{splt[1]}.csv") + df.to_csv(csvname) + if file_type == 'parquet': + parq_file = Path(bucket_path,f"cat{vpu}_{splt[1]}.parquet") + df.to_parquet(parq_file) + elif bucket_type == 'S3': + buf = BytesIO() + if file_type == 'parquet': + parq_file = f"cat{vpu}_{splt[1]}.parquet" + df.to_parquet(buf) + elif file_type == 'csv': + csvname = f"cat{vpu}_{splt[1]}.csv" + df.to_csv(buf, index=False) + buf.seek(0) + key_name = f'{file_prefix}{csvname}' + s3.put_object(Bucket=bucket_name, Key=key_name, Body=buf.getvalue()) + + if (j+1) % write_int == 0: + print(f"{j+1} files written out of {len(fd2)}, {(j+1)/len(fd2)*100:.2f}%", end="\r") + + if j == write_break: break + + print(f'{file_type} write took {time.perf_counter() - t0:.2f} s\n') + + print(f'\n\nDone! 
Catchment forcing files have been generated for VPU {vpu} in {bucket_type}\n\n') print(f'Total run time: {time.perf_counter() - t00:.2f} s') if __name__ == "__main__": diff --git a/user_input_ngen.json b/user_input_ngen.json index a6e757f..19bfa9b 100644 --- a/user_input_ngen.json +++ b/user_input_ngen.json @@ -13,7 +13,10 @@ "vpu" : "03W" }, - "verbose" : true, - "output_dir" : "local", + "verbose" : false, + "bucket_type" : "local", + "bucket_name" : "ciroh-devconf", + "file_prefix" : "data/", + "file_type" : "csv", "cache" : true } \ No newline at end of file diff --git a/user_input_ngen.md b/user_input_ngen.md index 3c6b82e..fef5ac0 100644 --- a/user_input_ngen.md +++ b/user_input_ngen.md @@ -46,5 +46,8 @@ contents: | Field Name | Data Type | Description | | --- | --- | --- | | verbose | `bool` | Print raw forcing files | -| output_dir | `string` |
    1. "local" : output to ./data/catchment_forcing_data/
    | +| output_dir | `string` |
    1. "local" : write to local directory
    2. "S3" : output to AWS S3 bucket
    | +| bucket_name | `string` | If local, this is the name of the folder the data will be placed in. If S3, this is the name of the S3 bucket, which must exist already. | +| file_prefix | `string` | If local, this is the relative path to the bucket_name folder. If S3, this is the relative path within the S3 bucket_name bucket to store files | +| file_type | `string` |
    1. "csv" : write data as csv files/
    2. "parquet" : write data as parquet files
    | | cache | `bool` |
  • true: store forcing files locally
  • false: interact with forcing files remotely
  • | From 5940ea2d61bfad93486ad56fda6840650f42465e Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Wed, 3 May 2023 14:36:33 -0500 Subject: [PATCH 015/105] removed precip_rate functions --- prep_hydrofab_forcings_ngen.py | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/prep_hydrofab_forcings_ngen.py b/prep_hydrofab_forcings_ngen.py index d23df19..ae815e2 100644 --- a/prep_hydrofab_forcings_ngen.py +++ b/prep_hydrofab_forcings_ngen.py @@ -6,8 +6,6 @@ import pickle import pandas as pd import argparse, os, json -import pyarrow as pa -import pyarrow.parquet as pq import gc from pathlib import Path import geopandas as gpd @@ -19,7 +17,7 @@ from rasterio.features import rasterize import time import boto3 -from io import StringIO, BytesIO +from io import BytesIO from nwm_filenames.listofnwmfilenames import create_file_list from ngen_forcing.process_nwm_forcing_to_ngen import * @@ -70,29 +68,6 @@ ROUTE_LINK_FILE = os.path.join(NWM_CACHE_DIR, "RouteLink_CONUS.nc") ROUTE_LINK_PARQUET = os.path.join(NWM_CACHE_DIR, "route_link_conus.parquet") -# TODO: Implemenent these function to appropriately calculate precip_rate -def rho(temp): - """ - Calculate water density at temperature - """ - return 999.99399 + 0.04216485*temp - 0.007097451*(temp**2) + 0.00003509571*(temp**3) - 9.9037785E-8*(temp**4) - -def aorc_as_rate(dataFrame): - """ - Convert kg/m^2 -> m/s - """ - if isinstance(dataFrame.index, pd.MultiIndex): - interval = pd.Series(dataFrame.index.get_level_values(0)) - else: - interval = pd.Series(dataFrame.index) - interval = ( interval.shift(-1) - interval ) / np.timedelta64(1, 's') - interval.index = dataFrame.index - precip_rate = ( dataFrame['APCP_surface'].shift(-1) / dataFrame['TMP_2maboveground'].apply(rho) ) / interval - precip_rate.name = 'precip_rate' - return precip_rate - -###### - def parquet_to_gdf(parquet_filepath: str) -> gpd.GeoDataFrame: gdf = gpd.read_parquet(parquet_filepath) return gdf @@ -563,7 +538,6 @@ def main(): # Write CSVs to file t0 = time.perf_counter() write_int = 100 - write_break = 1000 for j, jcatch in enumerate(fd2.keys()): df = fd2[jcatch] splt = jcatch.split('-') @@ -590,8 +564,6 @@ def main(): if (j+1) % write_int == 0: print(f"{j+1} files written out of {len(fd2)}, {(j+1)/len(fd2)*100:.2f}%", end="\r") - if j == write_break: break - print(f'{file_type} write took {time.perf_counter() - t0:.2f} s\n') print(f'\n\nDone! 
Catchment forcing files have been generated for VPU {vpu} in {bucket_type}\n\n') From 475721678d351fec8a4ff38a3105a678f38e966e Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Wed, 3 May 2023 15:09:29 -0500 Subject: [PATCH 016/105] updated config and readme --- user_input_ngen.json | 2 +- user_input_ngen.md | 43 +++++++++++++++++++++++-------------------- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/user_input_ngen.json b/user_input_ngen.json index 19bfa9b..c699e97 100644 --- a/user_input_ngen.json +++ b/user_input_ngen.json @@ -14,7 +14,7 @@ }, "verbose" : false, - "bucket_type" : "local", + "bucket_type" : "S3", "bucket_name" : "ciroh-devconf", "file_prefix" : "data/", "file_type" : "csv", diff --git a/user_input_ngen.md b/user_input_ngen.md index fef5ac0..7d24459 100644 --- a/user_input_ngen.md +++ b/user_input_ngen.md @@ -5,25 +5,28 @@ filename = 'user_input_ngen.json' contents: - { - "forcing" : { - "start_date" : "20220822", - "end_date" : "20220822", - "runinput" : 2, - "varinput" : 5, - "geoinput" : 1, - "meminput" : 0, - "urlbaseinput" : 3 - }, - - "hydrofab" : { - "vpu" : "03W" - }, - - "verbose" : true, - "output_dir" : "local", - "cache" : true - } +{ + "forcing" : { + "start_date" : "20220822", + "end_date" : "20220822", + "runinput" : 2, + "varinput" : 5, + "geoinput" : 1, + "meminput" : 0, + "urlbaseinput" : 3 + }, + + "hydrofab" : { + "vpu" : "03W" + }, + + "verbose" : false, + "bucket_type" : "S3", + "bucket_name" : "ciroh-devconf", + "file_prefix" : "data/", + "file_type" : "csv", + "cache" : true +} ### forcing | Field Name | Data Type | Description | @@ -46,7 +49,7 @@ contents: | Field Name | Data Type | Description | | --- | --- | --- | | verbose | `bool` | Print raw forcing files | -| output_dir | `string` |
    1. "local" : write to local directory
    2. "S3" : output to AWS S3 bucket
    | +| bucket_type | `string` |
    1. "local" : write to local directory
    2. "S3" : output to AWS S3 bucket
    | | bucket_name | `string` | If local, this is the name of the folder the data will be placed in. If S3, this is the name of S3 bucket, which must exist already. | | file_prefix | `string` | If local, this is the relative path to the bucket_name folder. If S3, this is the relative path within the S3 bucket_name bucket to store files | | file_type | `string` |
    1. "csv" : write data as csv files/
    2. "parquet" : write data as parquet files
    | From 1ba8ba2b80823f44c1e89c72846365b85edb80bc Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Wed, 3 May 2023 15:12:32 -0500 Subject: [PATCH 017/105] Indent for code block --- user_input_ngen.md | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/user_input_ngen.md b/user_input_ngen.md index 7d24459..902e079 100644 --- a/user_input_ngen.md +++ b/user_input_ngen.md @@ -5,28 +5,28 @@ filename = 'user_input_ngen.json' contents: -{ - "forcing" : { - "start_date" : "20220822", - "end_date" : "20220822", - "runinput" : 2, - "varinput" : 5, - "geoinput" : 1, - "meminput" : 0, - "urlbaseinput" : 3 - }, - - "hydrofab" : { - "vpu" : "03W" - }, - - "verbose" : false, - "bucket_type" : "S3", - "bucket_name" : "ciroh-devconf", - "file_prefix" : "data/", - "file_type" : "csv", - "cache" : true -} + { + "forcing" : { + "start_date" : "20220822", + "end_date" : "20220822", + "runinput" : 2, + "varinput" : 5, + "geoinput" : 1, + "meminput" : 0, + "urlbaseinput" : 3 + }, + + "hydrofab" : { + "vpu" : "03W" + }, + + "verbose" : false, + "bucket_type" : "S3", + "bucket_name" : "ciroh-devconf", + "file_prefix" : "data/", + "file_type" : "csv", + "cache" : true + } ### forcing | Field Name | Data Type | Description | From 5816e096718b1cfe1c959a87cdc56e27aabae7e0 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 11:46:16 -0500 Subject: [PATCH 018/105] moved files and implemented threaded download --- filenames.txt | 3 + ngen_forcing/__pycache__/defs.cpython-311.pyc | Bin 0 -> 1345 bytes ngen_forcing/__pycache__/defs.cpython-38.pyc | Bin 0 -> 938 bytes ...rocess_nwm_forcing_to_ngen.cpython-311.pyc | Bin 0 -> 6098 bytes ...process_nwm_forcing_to_ngen.cpython-38.pyc | Bin 0 -> 2977 bytes ...rocess_nwm_forcing_to_ngen.cpython-311.pyc | Bin 0 -> 6097 bytes ngen_forcing/defs.py | 25 + ngen_forcing/denno.py | 465 ++++++++++++++++++ .../prep_hydrofab_forcings_ngen.py | 103 ++-- ngen_forcing/process_nwm_forcing_to_ngen.py | 259 ++++++++++ .../test_process_nwm_forcing_to_ngen.py | 211 ++++++++ .../user_input_ngen.json | 7 +- .../user_input_ngen.md | 10 +- 13 files changed, 1048 insertions(+), 35 deletions(-) create mode 100644 filenames.txt create mode 100644 ngen_forcing/__pycache__/defs.cpython-311.pyc create mode 100644 ngen_forcing/__pycache__/defs.cpython-38.pyc create mode 100644 ngen_forcing/__pycache__/process_nwm_forcing_to_ngen.cpython-311.pyc create mode 100644 ngen_forcing/__pycache__/process_nwm_forcing_to_ngen.cpython-38.pyc create mode 100644 ngen_forcing/__pycache__/test_process_nwm_forcing_to_ngen.cpython-311.pyc create mode 100644 ngen_forcing/defs.py create mode 100644 ngen_forcing/denno.py rename prep_hydrofab_forcings_ngen.py => ngen_forcing/prep_hydrofab_forcings_ngen.py (89%) create mode 100644 ngen_forcing/process_nwm_forcing_to_ngen.py create mode 100644 ngen_forcing/test_process_nwm_forcing_to_ngen.py rename user_input_ngen.json => ngen_forcing/user_input_ngen.json (79%) rename user_input_ngen.md => ngen_forcing/user_input_ngen.md (89%) diff --git a/filenames.txt b/filenames.txt new file mode 100644 index 0000000..87440af --- /dev/null +++ b/filenames.txt @@ -0,0 +1,3 @@ +https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f006.conus.nc +https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f007.conus.nc 
+https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f008.conus.nc \ No newline at end of file diff --git a/ngen_forcing/__pycache__/defs.cpython-311.pyc b/ngen_forcing/__pycache__/defs.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6a40066f7994994d36d7134d8f5ca0faff26564 GIT binary patch literal 1345 zcmb7DPfHs?6rY*hO`i1MOlmOHmiDppoA>6uH}CCxZ$89g?Fi`U z<405wguY7Qia=I$o`P);QADwea(IqlL_<{D#B&-AQyq4lMyLV1LGcVqwtbUsnQ2E; z?hTN9NK5a6x0hli45GH{T19WgSz_ok5t|MO%PU#t=F{qEsqfj-{8tCB5u|j;9@u zva~d`Y{y|jSZvMC6i`jQ-4%a&3yL9)-!Stu-=b^ery4t@ZfFBa-h_tnzgEcxrNu5W5Q8}yM z;4u(Y_Ve@e3Elxx5V;D?yFirv688ZvE+AZ;Xa?N)S7=8ay;Z%3cY@Q+?SBrMJ>;VU z*ncM_(&{mTj&SCP1bS95H!xP07-!HuLPZc-&E~1M z8tAIex@w|oULx`9_dD0P#hFd5h9#G?D(fQd0hCZJ8rvD&9^HLVnS3)@nyQ7PCBJ6I z%h5A4zO%8tu|ITT4xXBW)yQC@0?io&cgd>{`vOl5i*8P9oi#bpyb0cUbh#4~`phB2<8!D{PQ*F(6sj#{DHz==9) Og)RdR%d;Cnh5rdLFes$} literal 0 HcmV?d00001 diff --git a/ngen_forcing/__pycache__/defs.cpython-38.pyc b/ngen_forcing/__pycache__/defs.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e52dfc5c1af0db3166cdc12f95d47beb992e5322 GIT binary patch literal 938 zcmZuv&2AGh5VpPEWb;=6Efo?Rx$Gt3PE}Px$^|Kx!X-jlt#+Jb+f6naZ%KldTczZ} zJG95VlCPY21uihNX-Yth%8p=v9(yswHkRKLEQ#l zz0#vlZ;(j3+Y`%m+Q+o*DX~Ia$&D<sYP=Wg9S79GQ&(+e|XM=@n#aR{=`Asr0 z;BGV9$R$C7lpsPb%#Fnn9HexG&zU@uO zyLXk55qn7vrpF#poFvBD$SmXOJTta88=F})$}vf2;~toi?ao)O5%JUf4Hv;45ff5N zCA3hyetmhlcj%sFc@k&#y!#k~4RQ}P26R+#Dumc#m9LNsgxpOR?4PNM%%2JQ#K$(B z|Lw&)y0EW_mEum6@?KIG6o~5pPq`bVcPC4APb~#;DYkO3RZgeYWX@kq44P0o1=iIT z=?wCP)10BxT$dfN3S9;4Xo(S;7^0mk1G1ctkWCp#t?)U?iO-e?s?o`M1@R3Dj}ZL_ zh_6bo)TKYFfU>j$%R0lw4h13CI;`PSmY*@>NpfBaOQ(x;YLF-KU$n=6qm9^+{SF{E z6}})#sQUd-4rpu#gRnO2>_#iiyIhLuFXSYu4pcc-Q3wD1DN%4X&EY09=REV4({HP0 HqZ#}Lt|Z1L literal 0 HcmV?d00001 diff --git a/ngen_forcing/__pycache__/process_nwm_forcing_to_ngen.cpython-311.pyc b/ngen_forcing/__pycache__/process_nwm_forcing_to_ngen.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21bd3c010c14424f9de91c76acddb9af51ac64dc GIT binary patch literal 6098 zcmeHLTTC3+8J?NF&BE@oH)dJx5WCJM4q)R3+aV4ZY&XWHjd7fG6FOS$3}orvdSo+qF#GYJbN=(6+du#L&iReo-9W*kPrNStv74g)iiyf&-xi)MfN-4>D1lB=DLP80 zm?)F7L@hMtF-dF67PY19QM-}0Bpp!}@-0bcw1J_{P=fUuCD=sUTw}$sXrFUckU@qb1 zL`sY>x`j_@F6Hh!KT&F}TN()q!5olr%#K+=s zQIdFZA(qM}MX5hME2jCG42dVwv;A3;F-p@{Qk6Vj&hV(ymt7=IFl>MW1gQyNsrpy4 zvPD!wS+ln_L+>WY1M)g()zVO7;I0i@=1d88<=ycCC(W z({dAP-LBTm*%kX7wr4zZ)m6s^Xxwy}EzlH0$vw4-3Ty7a(qvA* z3%-)`=6pH7X}*0G3yN283!YhKv&q~}&MWV)=unh=peC2~6@MXE%d1IdUozV?_beAM zt%K8C3TqoRRw0;cR+{IUD=pMKzW%Gpaq>$Q4OshTC7^gWy*FX(x2xYuE~JDCbt_-f zH}^T$vh7)6r9}zX902^ME;ZLusjqkxZiXp)*2+7p^h%H5yGAL_ams9$rW6;1KL4c7 zl9(imB#|NbS>K{{yO^F$q{Y*>Y`Rk>v9ts+E2Ud!<5@|!l5AYkEpY;3Jd<4nbsGd{ zap5U*u=^24qDKP4^8in2HxygMw4l?AI=ui1tB^=Z_#dNqyc$?Q8AAttI!|5(xK7Q& zCo|3EXoW67diP@<%!kZMXo@0k(8CVJlNm|8#pq1mfX)b#gyZV&Cg|f&8-|k!Nseby z*%Mb^oE7D2@D&nqnNN#XuEZAmhLf3iEGeA;p`!d9b0n-Y3q*HivJmodGk{%1XXi4BbT%fxr(51UJg(a>$CC4+WW;dYg7c4V zrQ2g!s3ho2R?wZeA)O|$59v-ZJ)aUuOcr%pmL$@$ZX=m_(6c1Pv~Ekz%}X-DNi0_y zB_y_p(;b&}%hg0yXXT8POu&|-TXC}_=!mckVk`|7GVw^O?w%22@;niFV?A|OC5cl2 zg_)!v5FI~<(hmt&Z8mFo0(50t4W72!y7$k_{%z-&>h=dA?Q^)&Fl7ULg0k|+oPiAz&y09 zH{kGiV{UavoMd1&1>GjeF&X?8`YN@8k?uVo!%Otg9-cvXsEj+CG!9O+L~VH}7q>M2 z&=*+iDsLWfG~d+H*;24`ZBh$Hmd-zDY$>tL zC4c)`pXT3p-`{u7->3QqH2*-!?ZemQExG+$ZimbMm;(43VQG9TNHzJE&Xj^3TNGm( zr^}dUOD+5FxAfg>>DyRPTaIciNAvCy+w_6wea}Z^?PrDD-HXN0Ni}p*17ycEcC2W; zUwOP>%)@O_wC$(?dFBDvvX7bxdB-ZV+?a2C$aNK5D)*Acy|hI&*j{~% 
zu-uTR^Ft3<_o}bxIR@`X;`-=~c;TSh*{^l>tL^=Fd@47naf5fpz+749zUQ@jp4Yx$ zRL_{^8T&F=WG7X2Qe!7UD78L+*E`i zjj6)bV$)%@>9E#xIB$IrY`GCwj~52lqZ{1kU7vNSk>gtAxEg*%3%{ZUhqd5v-uWQZ zT?nb6h!%=$Q8wqR1}vY+kLKl4xZ~5X7LIK6XyJqTvk$m^g^L@$=beqbF=t^5IwLap0`wXL26&sC`| z9EE)55h$%nTbZybE*Q;o#vy^rwdXpvEp;j#N@pGQZLLq$QD07JuTtNAf`1R%+X}Qd zFiG&tVK82T?s}Tw`GK?};73F{5xNk%5zyrc`a0Q%fQOMsNGk~*MF>)gZ-vd07ts7c zfO-;3`XS@$kv%vH@VCn%kwxKIH1-bK8$iJOgGiVhLisSlAi@!Z?;;pC5J<)iT1tjc zI*#xP0N6P!U)1B!byZN1mv1z9N{R!F@#e9kuGu?y@HT)>&yaU8@k2~7dT>d?YV!5D97rX!2Qji3g`YGD{+kb;MdcRrT=<+O`Dh0Wv z^Cnhkvkd^RvJL#tutK2V_u2M)Z2P)Uc)i%sr?UMT+g~)^hu-k|0oB{9d3(1gyY09E z%U0mKhqn3d%SF%2+k7|jar)D=+J0~&rg8%sH*hDQaz~&T-us^6d!FID7gW!P<{9}y zD6$hOJE5@?Ae5T9wM%Q$g+p5N{(Qs3wtagp}J zdV%+Xp8X$F-bj<~tG^KK6T}&*94U-of>#OU7;Q%hZwht$0-Pdai$=`S?b%pbh)Ly; zg277nzhzt#pBLr#GJ^7E8OL`45CkcLq9}@bvFs>mI*poCa^18|Th({14c z7izJz$jl-a`B=@gy=FQ?-QrtL$ydl{*lSPjNAM}nv!raf6HjM)C@}lSz5vg&`|R#- zamV-T1TOvVJM611LjHw?i&q5-Hy{;%0%3&Fgru}bQ>$mCcF(3*XC+Qr=~Ys<=bEye zRC^xO+exige~U1ORqhj3;g!JxcL$9Vr?J8zq=;1i{fLOlB<)h*$tqMjm~zdpA${ z?jQjJVmHbe-(_Ge2&0G>MZm{lI!t)6n;r5j=;tDev%}q?$W3c@oSxSOG7s>bo#8}O z!8)eG0`aIRM>rSf*9K%fNN+$YG8iDEWkr$!Ro2u}HnY~r8C4Z#pIW7>+<`rC%BrlW zs>!?4>KV=)-Y`3-Zs{G8vZm~P^6?90-Pov~(Q-l6rk=67Ftc?=DLHyOCS^l8@NTE{ zWo=Mbgt@0wHI%%B8Xm{+CpX`K)0r>5dD{5u1GH@^wWmk69idqIMty8;PRjbq0Gg7XZF>IFWYE7-L6X-waU`;hu zw{Pu{ztN-DAk~ll!IoJ2K2f!OasuO54OlqK2ig;1A-RZif$Qr5)h^EtYP|-Ch z!mNPFPPKCw4GZmvVN_^45|EAZ;RIfH6GL)A<9#J2fZJserCz_6- z;IKF?@QJ%1LVxEME)WOOZAe820lMye>y4E(va0&gp;SfGz+8Q!l!!3DJ zz)sXPo{ds2LdkVyDB?`&ipWRso}KVaSCYY~kitd-=A+kz2qze_NNW3&c&I&@7fB5J zO*;q-fiZ(n@-PDnd9=Nu{XP%nNbtbSiEf-1aZu2hCya|=D0n{}>smhsBA6EMUMOZc zN5JF*FvhG1*eFdW+7Bd$jSrm;LLsyhjIJ?Vh346pK^6sQ2;S?DlH_r*%KCxnxfR^g zG}hI5SHz3hX&PvE4#gTg=cA$nFb|$}9fuc~*y?H|a)>6=l|qIR)*kfrKj(Y(8Pkez zE3Si(s$;EDk6LiojY95`TX&$YPP-O9+g6M2&^Fv%+M*5GggTFISxqYb2o|2g)^kwt z7+e4TKVu6J*MJ1bngi<7IiTtWP%Z$Z39y<0stSPe=79294FFYV3kFclZvoUIpryfl z094ZeswJ0ZWdKzB98k)ida9+C`#7015rXT}l{3<}u?GOR14s_we8ZryTV9gg!7AW<32?rl zE*S{^RW$(SJAmVExmjMGL-UHdtgZkwH&6kZU9~xf=9ggebJk|d7m&FD$m|>lAEw{X zQZylVQ?#(WhO&-=hnl#AvWapT1y_vNLU|tKA#Ppq9h5G}|42gdT{Q4Ll*h-o0igIk z)O`Wz3He@q#ufom+~Y!W_U#B1FQfHWP=0{2gR+Z)$GW(Rg2)p;M0phji7S4B@>7)8 zP!MY3XDB~M`31@z$}dq&FmRiSYbZZL`4!0abK(Z(Uq|^h3jQXDeUvv*Zlb(}@;1sl zD8E5@7v)2co{uLD{(u=?i34o-Eedi-{0`-Pln+2O?Tb5Dz(Yt|K?>Z%gyqMVdOx>hZ{3Lf%Q@z_dz$Xji+64pv5Ayquwn-R$i4^_WepMCcI zK@de{y!&}(zHh$yX7-zJelvgd`#lKK!rzZc#Xf}oLkrEsEobh33z;vGgd|2m3C8|s z6CC`r3eGqZ4yM6#CR{d;Pw?>1DejCX;mLRt-i$Bd%lH$1TMjFMOfV5#ZWBs`SR{w0 zx|*Fzj%y4;*Wqm>2Dy>syn+(lk_%{9?veNpS;`|mXc@i+x$6y4prDmGYriq~60Mzx zF_yQGRYgV6bWzu0*y>Nox-gj~$+Vghq;ygjRQb{+@v;@&fFa~WA}WfkY%GyhFUmxh zB`da(j;2IQ85oA&6!i<1Go4biL{2(dqd>?Dc{=^~e+xr}sb`7~i7h(QsOU2A7<%7T zBlPg`_qWYEm0?q1PkaGiXx@89o1+oBJwbQ`{bmT1wh(-AKx!}k@EuUx7ivn0#mI~ z&fCjpMQ6FDFv}lb_^0es+=_>^^)f%z>^omA?^&Qv7d6=b-_P^bMc6Rxu;=AW>5B5Jx!(pd{WqQ?l9&3&Xms=|YZVZ4vcSrpXZWtUv|hxyuhchn2LZC$pK{ z(ZcrC%RYWs$tFcbI|`Ym)H`6176np56}+?YT$M`aK23a*`1rjm@3kuLgA9s=EPg7R zR&#cY5OX=0NsFB(mM;rgfsR3wb;~WQ`HW0NUA7!Kl2&!gL9%%e@A(T69|=Ftp)n(h;amM}Eg^ppVked}X$Lsut|KN@jjl#*4iFYIh}Y5S}mcb8r2Hn?LnO|DXNU zr}r#;RP7%z`$xXrR_%YGA#po!ds7{G+_W1? 
z%g1Wr{yBE`P}hIc*M~D!+R5*vuPq_-nTr8C>Xsp;djmE56Sg#XC)V8Qk%9mnO(&_#jDmTZzV4>XG*&P8u5Ywy+^FtKuI(kt;}V zMMu~VI0<*g^VLzU$ViSNyTM+^O3oXu&h|wvg(Ut@Oc5K5QCA0DIn=UR!$?LXT*1{)byG1*G7O z&@~s#JiLum^gc2ByB^#A?se?%z-0ed&k3)we`P5?iqHF>*v$8=v7Cmz<+R>vUDtC0 zPt5+_$F_ex&k6oC_Dcv*Y2So}DHiHLl{CEBsPZYuETc*{1bhUcoB0Y$>^B@>M-MdT z3R?l*#r2VWg?L7m(tuU~C8gx}@&PeEX|a3v7u@k=R?Ta1HA!fYim`;Qcz`lw8D`p0 zgKUDzLL?)m)%Eb72wd%XAOJZK+v{8*wO2+f{7VbZ0%<*Uu0zD$AD8GJxJ0+P!h!W+ z?Ql}m_Z&F1_rQS>A)nK9B8z~(@*eWyutSw<<`lauvB~PEvk`Li59nGpy z{em0?WEdUPl(@swrS%J-2Mm{}(&pJaZaEv3OCjhKfIg-i z%SoxA7WO<$+cJ=E)8$&ODhxfWos!Mwv?!6`Y_G{uym0){g*O~VVf1m8tg${8AoNTN zK%aW5#ib~Ur}NbS(tJvW1CLHFTP`@QP%KV%fjD`F5(<4ShYjfo1t)|q6+)o~p*vG7 zME1a+<)_Vr?4+$xv+$&_cfaMM4g?t9_8&4#<=hmyH;}y7<~Cf7?50c)+_(^3pyvsT zo0LW<*$29${klzawBmW1eZj7HA*a!AyBfwfP&}rqill%qz^g@J5tPtkPh_s|^YPj7 z($DK`bGa;@c>TMP@5Fz;`>%KHHFS@zH2|u;PLT2--xyOGTMYNj9+~f1_+=%0uo^yS zh7Xp;>#REjYqCj7r|O$f*OobPo|#LUp=V34)w;LLeKa4Q%bVRhN^jJ{Tg>otx5LA? z!o$_@h#4L!jsM_5-F>qIpKqSsTzY*GyDOeuRXk+kp$Z;a^lYl}VUr)I@jW%Zx5jtX z_|QGC(+jo%DINbIfE=FRZ=Z>MHZ(Id=c#(0t>SGa-d4fe?soNdXx&!D+fBT^g10Yv zp7|-X-leu4uHqvmK2pI)7QI7jwbuN-wP060$ori45RmUFDUJQm{m5p&RK+iw_~i?h&9-LOA>#`1<88B#d6WR3?=3iLFChr_rfUz!4h9eS!!^)N9KybG;TAVg;Rb4$FO7YC^2*7Q{i_3S6I9#t^FK8$ Bf^Yx; literal 0 HcmV?d00001 diff --git a/ngen_forcing/defs.py b/ngen_forcing/defs.py new file mode 100644 index 0000000..516fdbb --- /dev/null +++ b/ngen_forcing/defs.py @@ -0,0 +1,25 @@ +import rasterio.mask as riomask + +def polymask(dataset, invert=False, all_touched=False): + def _polymask(poly): + return riomask.raster_geometry_mask( + dataset, [poly], invert=invert, all_touched=all_touched, crop=True + ) + + return _polymask + + +def xr_read_window(ds, window, mask=None): + data = ds.isel(window) + if mask is None: + return data + else: + return data.where(mask) + + +def xr_read_window_time(ds, window, mask=None, idx=None, time=None): + data = ds.isel(window) + if mask is None: + return idx, time, data + else: + return idx, time, data.where(mask) \ No newline at end of file diff --git a/ngen_forcing/denno.py b/ngen_forcing/denno.py new file mode 100644 index 0000000..37eb829 --- /dev/null +++ b/ngen_forcing/denno.py @@ -0,0 +1,465 @@ + +# https://github.com/jameshalgren/data-access-examples/blob/DONOTMERGE_VPU16/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb + +# !pip install --upgrade google-api-python-client +# !pip install --upgrade google-cloud-storage + +import pickle +import time +import pandas as pd +import argparse, os, json +import gc +from pathlib import Path +import geopandas as gpd +import pandas as pd +import numpy as np +import xarray as xr +from google.cloud import storage +from rasterio.io import MemoryFile +from rasterio.features import rasterize + +from nwm_filenames.listofnwmfilenames import create_file_list + +DATA_DIR = Path(Path.home(),"code","data") + +TEMPLATE_BLOB_NAME = ( + "nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" +) +NWM_BUCKET = "national-water-model" + +# WKT strings extracted from NWM grids +CONUS_NWM_WKT = 'PROJCS["Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]], \ +PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ +PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-97.0],PARAMETER["standard_parallel_1",30.0],\ +PARAMETER["standard_parallel_2",60.0],PARAMETER["latitude_of_origin",40.0],UNIT["Meter",1.0]]' + +HI_NWM_WKT = 
'PROJCS["Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]],\ +PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ +PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-157.42],PARAMETER["standard_parallel_1",10.0],\ +PARAMETER["standard_parallel_2",30.0],PARAMETER["latitude_of_origin",20.6],UNIT["Meter",1.0]]' + +PR_NWM_WKT = 'PROJCS["Sphere_Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]],\ +PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ +PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-65.91],PARAMETER["standard_parallel_1",18.1],\ +PARAMETER["standard_parallel_2",18.1],PARAMETER["latitude_of_origin",18.1],UNIT["Meter",1.0]]' + +# paths +CACHE_DIR = Path(Path.home(), "code", "data_access_examples", "forcing_data") +NWM_CACHE_DIR = os.path.join(CACHE_DIR, "nwm") +USGS_CACHE_DIR = os.path.join(CACHE_DIR, "usgs") +GEO_CACHE_DIR = os.path.join(CACHE_DIR, "geo") + +NWM_CACHE_H5 = os.path.join(NWM_CACHE_DIR, "gcp_client.h5") + +PARQUET_CACHE_DIR = os.path.join(CACHE_DIR, "parquet") +MEDIUM_RANGE_FORCING_PARQUET = os.path.join(PARQUET_CACHE_DIR, "forcing_medium_range") +FORCING_ANALYSIS_ASSIM_PARQUET = os.path.join( + PARQUET_CACHE_DIR, "forcing_analysis_assim" +) +MEDIUM_RANGE_PARQUET = os.path.join(PARQUET_CACHE_DIR, "medium_range") +USGS_PARQUET = os.path.join(PARQUET_CACHE_DIR, "usgs") + +HUC10_SHP_FILEPATH = os.path.join(GEO_CACHE_DIR, "wbdhu10_conus.shp") +HUC10_PARQUET_FILEPATH = os.path.join(GEO_CACHE_DIR, "wbdhu10_conus.parquet") +HUC10_MEDIUM_RANGE_WEIGHTS_FILEPATH = os.path.join( + GEO_CACHE_DIR, "wbdhu10_medium_range_weights.pkl" +) + +ROUTE_LINK_FILE = os.path.join(NWM_CACHE_DIR, "RouteLink_CONUS.nc") +ROUTE_LINK_PARQUET = os.path.join(NWM_CACHE_DIR, "route_link_conus.parquet") + + +def parquet_to_gdf(parquet_filepath: str) -> gpd.GeoDataFrame: + gdf = gpd.read_parquet(parquet_filepath) + return gdf + +def get_cache_dir(create: bool = True): + if not os.path.exists(NWM_CACHE_DIR) and create: + os.mkdir(NWM_CACHE_DIR) + if not os.path.exists(NWM_CACHE_DIR): + raise NotADirectoryError + return NWM_CACHE_DIR + +def make_parent_dir(filepath): + Path(filepath).parent.mkdir(parents=True, exist_ok=True) + +def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: + """Retrieve a blob from the data service as xarray.Dataset. + Based largely on OWP HydroTools. + Parameters + ---------- + blob_name: str, required + Name of blob to retrieve. + use_cache: bool, default True + If cache should be used. + If True, checks to see if file is in cache, and + If fetched from remote, will save to cache. + Returns + ------- + ds : xarray.Dataset + The data stored in the blob. + """ + # TODO: Check to see if this does any better than kerchunk + # the caching should help, but probably needs to be managed to function asynchronously. + # Perhaps if the files is not cached, we can create the dataset from + # kerchunk with a remote path and then asynchronously do a download to cache it + # for next time. The hypothesis would be that the download speed will not be any slower than + # just accessing the file remotely. 
+ nc_filepath = os.path.join(get_cache_dir(), blob_name) + make_parent_dir(nc_filepath) + + # If the file exists and use_cache = True + if os.path.exists(nc_filepath) and use_cache: + # Get dataset from cache + ds = xr.load_dataset( + nc_filepath, + engine="h5netcdf", + ) + return ds + else: + # Get raw bytes + raw_bytes = get_blob(blob_name) + # Create Dataset + ds = xr.load_dataset( + MemoryFile(raw_bytes), + engine="h5netcdf", + ) + if use_cache: + # Subset and cache + ds["RAINRATE"].to_netcdf( + nc_filepath, + engine="h5netcdf", + ) + return ds + +def generate_weights_file( + gdf: gpd.GeoDataFrame, + src: xr.DataArray, + weights_filepath: str, + crosswalk_dict_key: str, +): + """Generate a weights file.""" + + gdf_proj = gdf.to_crs(CONUS_NWM_WKT) + + crosswalk_dict = {} + + # This is a probably a really poor performing way to do this + # TODO: Consider vectorizing -- would require digging into the + # other end of these where we unpack the weights... + i = 0 + for index, row in gdf_proj.iterrows(): + geom_rasterize = rasterize( + [(row["geometry"], 1)], + out_shape=src.rio.shape, + transform=src.rio.transform(), + all_touched=True, + fill=0, # IS FILL 0 + dtype="uint8", + ) + if crosswalk_dict_key: + crosswalk_dict[row[crosswalk_dict_key]] = np.where(geom_rasterize == 1) + else: + crosswalk_dict[index] = np.where(geom_rasterize == 1) + + if i % 100 == 0: + perc = i/len(gdf_proj)*100 + print(f"{i}, {perc:.2f}%".ljust(40), end="\r") + if perc > 0.01: break + i += 1 + + with open(weights_filepath, "wb") as f: + # TODO: This is a dict of ndarrays, which could be easily stored as a set of parquet files for safekeeping. + pickle.dump(crosswalk_dict, f) + +def add_zonalstats_to_gdf_weights( + gdf: gpd.GeoDataFrame, + src: xr.DataArray, + weights_filepath: str, +) -> gpd.GeoDataFrame: + """Calculates zonal stats and adds to GeoDataFrame""" + + df = calc_zonal_stats_weights(src, weights_filepath) + gdf_map = gdf.merge(df, left_on="huc10", right_on="catchment_id") + + return gdf_map + + +def get_blob(blob_name: str, bucket: str = NWM_BUCKET) -> bytes: + """Retrieve a blob from the data service as bytes. + Based largely on OWP HydroTools. + Parameters + ---------- + blob_name : str, required + Name of blob to retrieve. + Returns + ------- + data : bytes + The data stored in the blob. + """ + # Setup anonymous client and retrieve blob data + client = storage.Client.create_anonymous_client() + bucket = client.bucket(bucket) + return bucket.blob(blob_name).download_as_bytes(timeout=120) + + +def calc_zonal_stats_weights( + src: xr.DataArray, + weights_filepath: str, +) -> pd.DataFrame: + """Calculates zonal stats""" + + # Open weights dict from pickle + # This could probably be done once and passed as a reference. + with open(weights_filepath, "rb") as f: + crosswalk_dict = pickle.load(f) + + r_array = src.values[0] + r_array[r_array == src.rio.nodata] = np.nan + + mean_dict = {} + for key, value in crosswalk_dict.items(): + mean_dict[key] = np.nanmean(r_array[value]) + + df = pd.DataFrame.from_dict(mean_dict, orient="index", columns=["value"]) + + df.reset_index(inplace=True, names="catchment_id") + + # This should not be needed, but without memory usage grows + del crosswalk_dict + del f + gc.collect() + + return df + + +def get_forcing_dict_RTIway( + pickle_file, # This would be a Feature list for parallel calling -- + # if there is a stored weights file, we use it + # (checking for an optional flag to force re-creation of the weights...) 
+ folder_prefix, + file_list, +): + + var = "RAINRATE" + reng = "rasterio" + filehandles = [ + xr.open_dataset(folder_prefix / f, engine=reng)[var] for f in file_list + ] + # filehandles = [get_dataset("data/" + f, use_cache=True) for f in file_list] + stats = [] + + for _i, f in enumerate(filehandles): + print(f"{_i}, {round(_i/len(file_list), 2)*100}".ljust(40), end="\r") + stats.append(calc_zonal_stats_weights(f, pickle_file)) + + [f.close() for f in filehandles] + return stats + + +def get_forcing_dict_RTIway2( + pickle_file, # This would be a Feature list for parallel calling -- + # if there is a stored weights file, we use it + # (checking for an optional flag to force re-creation of the weights...) + gpkg_divides, + folder_prefix, + filelist, + var_list, +): + reng = "rasterio" + pick_val = "value" + + df_dict = {} + dl_dict = {} + for _v in var_list: + df_dict[_v] = pd.DataFrame(index=gpkg_divides.index) + dl_dict[_v] = [] + + # ds_list = [] + for _i, _nc_file in enumerate(filelist): + # _nc_file = ("nwm.t00z.medium_range.forcing.f001.conus.nc") + _full_nc_file = folder_prefix.joinpath(_nc_file) + + try: + # with xr.open_dataset(_full_nc_file, engine=reng) as _xds: + with xr.open_dataset(_full_nc_file) as _xds: + # _xds = ds_list[_i] + # _xds.rio.write_crs(rasterio.crs.CRS.from_wkt(CONUS_NWM_WKT), inplace=True) + print(f"{_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") + for _v in var_list: + _src = _xds[_v] + _df_zonal_stats = calc_zonal_stats_weights(_src, pickle_file) + # if adding statistics back to original GeoDataFrame + # gdf3 = pd.concat([gpkg_divides, _df_zonal_stats], axis=1) + _df = pd.DataFrame(index=gpkg_divides.index) + _df[_xds.time.values[0]] = _df_zonal_stats[pick_val] + # TODO: This same line could add the new values directly + # to the same dictionary. But after adding about 100 of them, + # pandas starts to complain about degraded performance due to + # fragmentation of the dataframe. We tried it this was as a + # workaround, with the loop below to accomplish the concatenation. + dl_dict[_v].append(_df) + except: + print(f"No such file: {_full_nc_file}") + + for _v in var_list: + df_dict[_v] = pd.concat(dl_dict[_v], axis=1) + + # [_xds.close() for _xds in ds_list] + + return df_dict + + +def main(): + """ + Primary function to retrieve hydrofabrics data and convert it into files that can be ingested into ngen. + Also, the forcing data is retrieved. 
+ + Inputs: JSON config file specifying start_date, end_date, and vpu + + Outputs: ngen catchment/nexus configs and forcing files + + Will store files in the same folder as the JSON config to run this script + """ + parser = argparse.ArgumentParser() + parser.add_argument(dest="infile", type=str, help="A json containing user inputs to run ngen") + args = parser.parse_args() + + # Take in user config + conf = json.load(open(args.infile)) + start_date = conf['forcing']['start_date'] + end_date = conf['forcing']['end_date'] + vpu = conf['hydrofab']['vpu'] + + top_dir = os.path.dirname(args.infile) + data_dir = os.path.join(top_dir,'forcing_data') + if not os.path.exists(data_dir): + os.system(f'mkdir {data_dir}') + + # Generate list of file names to retrieve for forcing data + # Going to make assumptions here as to which forecasts we want + # Check the dictionaries at the top of listofnwmfilenames for options + n = 6 # How rapidly we want our forecasts, I think 3 is the highest frequency + fcst_cycle = [n*x for x in range(24//n)] + lead_time = [x+1 for x in range(n)] + # fcst_cycle = None # Retrieves a full day for each day within the range given. + runinput = 2 + varinput = 5 + geoinput = 1 + meminput = 0 + urlbaseinput = None + + print(f'Creating list of file names to pull...') + nwm_forcing_files = create_file_list( + runinput, + varinput, + geoinput, + meminput, + start_date, + end_date, + fcst_cycle, + urlbaseinput, + lead_time, + ) + + + print(f'Pulling files...') + local_files = [] + for jfile in nwm_forcing_files: + file_parts = jfile.split('/') + local_file = os.path.join(data_dir,file_parts[-1]) + local_files.append(local_file) + if os.path.exists(local_file): continue + else: + command = f'wget -P {data_dir} -c https://storage.googleapis.com/national-water-model/{jfile}' + os.system(command) + + # Download dataset, read into df with geopandas + gpkg = os.path.join(DATA_DIR,'nextgen_03W.gpkg') + ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) + src = ds["RAINRATE"] + + # Why are we converting to paquet and then back into geopandas dataframe? 
+ polygonfile = gpd.read_file(gpkg, layer="divides") + parq_file = os.path.join(DATA_DIR,"ng_03.parquet") + polygonfile.to_parquet(parq_file) + pkl_file = os.path.join(DATA_DIR,"weights.pkl") + generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") + calc_zonal_stats_weights(src, pkl_file) + + folder_prefix = DATA_DIR + + var_list = [ + "U2D", + "V2D", + "LWDOWN", + "RAINRATE", + "T2D", + "Q2D", + "PSFC", + "SWDOWN", + ] + + var_list + start_time = time.time() + print(f"Working on the new way") + fd2 = get_forcing_dict_RTIway2( + pkl_file, + polygonfile, + folder_prefix, + nwm_forcing_files, + var_list, + ) + print(time.time() - start_time) + + fd2["U2D"] + pcp_var.transpose()[0] + pcp_var = fd2["RAINRATE"] + lw_var = fd2["LWDOWN"] + sw_var = fd2["SWDOWN"] + sp_var = fd2["PSFC"] + tmp_var = fd2["T2D"] + u2d_var = fd2["U2D"] + v2d_var = fd2["V2D"] + pcp_var2 = fd2["RAINRATE"] + + for _i in range(0, 40000): + # _i = 0 + try: + pcp_var_0 = pcp_var.transpose()[_i].rename("APCP_surface") + lw_var_0 = lw_var.transpose()[_i].rename("DLWRF_surface") + sw_var_0 = sw_var.transpose()[_i].rename("DSWRF_surface") + sp_var_0 = sp_var.transpose()[_i].rename("SPFH_2maboveground") + tmp_var_0 = tmp_var.transpose()[_i].rename("TMP_2maboveground") + u2d_var_0 = u2d_var.transpose()[_i].rename("UGRD_10maboveground") + v2d_var_0 = v2d_var.transpose()[_i].rename("VGRD_10maboveground") + pcp_var2_0 = pcp_var2.transpose()[_i].rename("precip_rate") ##BROKEN!! + + d = pd.concat( + [ + pcp_var_0, + lw_var_0, + sw_var_0, + sp_var_0, + tmp_var_0, + u2d_var_0, + v2d_var_0, + pcp_var2_0, + ], + axis=1, + ) + d.index.name = "time" + + d.to_csv(f"input_data/cat16_{_i:07}.csv") + except: + print(f"no data for watershed {_i}", end="\t") + + ## Make a shell script string to rename the csvs... 
+ gpkg_divides["id"] + for _i, cat_id in enumerate(gpkg_divides["id"]): + print(f"mv cat16_{_i:07}.csv cat16_{cat_id}.csv") + +if __name__ == "__main__": + + main() + \ No newline at end of file diff --git a/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py similarity index 89% rename from prep_hydrofab_forcings_ngen.py rename to ngen_forcing/prep_hydrofab_forcings_ngen.py index ae815e2..7a0a39d 100644 --- a/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -5,7 +5,7 @@ import pickle import pandas as pd -import argparse, os, json +import argparse, os, json, sys import gc from pathlib import Path import geopandas as gpd @@ -19,8 +19,11 @@ import boto3 from io import BytesIO -from nwm_filenames.listofnwmfilenames import create_file_list -from ngen_forcing.process_nwm_forcing_to_ngen import * +import threading + +pkg_dir = Path(Path(os.path.dirname(__file__)).parent,'nwm_filenames') +sys.path.append(str(pkg_dir)) +from listofnwmfilenames import create_file_list TEMPLATE_BLOB_NAME = ( "nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" @@ -44,7 +47,10 @@ PARAMETER["standard_parallel_2",18.1],PARAMETER["latitude_of_origin",18.1],UNIT["Meter",1.0]]' # paths -CACHE_DIR = Path(Path.home(), "code", "data_access_examples", "data", "raw_forcing_data") + +#TODO Make CACHE_DIR configurable +CACHE_DIR = Path(pkg_dir.parent, "data", "raw_forcing_data") # Maybe this should have a date attached to the name + NWM_CACHE_DIR = os.path.join(CACHE_DIR, "nwm") USGS_CACHE_DIR = os.path.join(CACHE_DIR, "usgs") GEO_CACHE_DIR = os.path.join(CACHE_DIR, "geo") @@ -160,9 +166,9 @@ def generate_weights_file( else: crosswalk_dict[index] = np.where(geom_rasterize == 1) - # if i % 100 == 0: - # perc = i/len(gdf_proj)*100 - # print(f"{i}, {perc:.2f}%".ljust(40), end="\r") + if i % 100 == 0: + perc = i/len(gdf_proj)*100 + print(f"{i}, {perc:.2f}%".ljust(40), end="\r") i += 1 with open(weights_filepath, "wb") as f: @@ -365,12 +371,15 @@ def get_forcing_dict_RTIway2( return df_dict -def wget(cmd,name): +def wget(cmd,name,semaphore=None): + if not semaphore == None: semaphore.acquire() resp = os.system(cmd) if resp > 0: - raise Exception (f'\nwget failed! Tried: {name}\n') + raise Exception(f'\nwget failed! Tried: {name}\n') else: - print(f'Successful download of {name}') + print(f'Successful download of {name}') + + if not semaphore == None: semaphore.release() def main(): """ @@ -390,10 +399,17 @@ def main(): parser.add_argument(dest="infile", type=str, help="A json containing user inputs to run ngen") args = parser.parse_args() + # Increase for more threads + dl_threads = 10 + # Take in user config conf = json.load(open(args.infile)) start_date = conf['forcing']['start_date'] end_date = conf['forcing']['end_date'] + if 'nwm_files' in conf['forcing']: + nwm_files = conf['forcing']['nwm_files'] + else: + nwm_files = "" runinput = conf['forcing']['runinput'] varinput = conf['forcing']['varinput'] geoinput = conf['forcing']['geoinput'] @@ -406,6 +422,7 @@ def main(): file_prefix = conf['file_prefix'] file_type = conf['file_type'] ii_cache = conf['cache'] + dl_threads = conf['dl_threads'] file_types = ['csv','parquet'] assert file_type in file_types,f'{file_type} for file_type is not accepted! 
Accepted: {file_types}' @@ -434,28 +451,38 @@ def main(): elif bucket_type == 'S3': s3 = boto3.client('s3') - # Generate list of file names to retrieve for forcing data - print(f'Creating list of file names to pull...') - n = 6 - fcst_cycle = [n*x for x in range(24//n)] - lead_time = [x+1 for x in range(n)] - nwm_forcing_files = create_file_list( - runinput, - varinput, - geoinput, - meminput, - start_date, - end_date, - fcst_cycle, - urlbaseinput, - lead_time, - ) - + # Get nwm forcing file names + if len(nwm_files) == 0: + print(f'Creating list of file names to pull...') + n = 6 + fcst_cycle = [n*x for x in range(24//n)] + lead_time = [x+1 for x in range(n)] + nwm_forcing_files = create_file_list( + runinput, + varinput, + geoinput, + meminput, + start_date, + end_date, + fcst_cycle, + urlbaseinput, + lead_time, + ) + else: + print(f'Reading list of file names from {nwm_files}...') + nwm_forcing_files = [] + with open(nwm_files,'r') as f: + for line in f: + nwm_forcing_files.append(line) + # Download whole files and store locally if cache is true, # otherwise index remotely and save catchment based forcings + t0 = time.perf_counter() if ii_cache: # Check to see if we have files cached, if not wget them local_files = [] + cmds = [] + fls = [] for jfile in nwm_forcing_files: if ii_verbose: print(f'Looking for {jfile}') file_parts = Path(jfile).parts @@ -468,11 +495,27 @@ def main(): else: if ii_verbose: print(f'Forcing file not found! Downloading {jfile}') command = f'wget -P {CACHE_DIR} -c {jfile}' - wget(command,jfile) - - forcing_files = local_files # interacting with files locally + cmds.append(command) + fls.append(jfile) + + # TODO make this async! + # wget(command,jfile) + + threads = [] + semaphore = threading.Semaphore(dl_threads) + for i,jcmd in enumerate(cmds): + t = threading.Thread(target = wget, args = [jcmd, fls[i],semaphore]) + t.start() + threads.append(t) + + for jt in threads: + jt.join() + + forcing_files = local_files # interacting with files locally else: forcing_files = nwm_forcing_files # interacting with files remotely + + print(f'SERIAL Time to dl files {time.perf_counter() - t0}') # Do we need a parquet file? 
# parq_file = os.path.join(CACHE_DIR,"ng_03.parquet") diff --git a/ngen_forcing/process_nwm_forcing_to_ngen.py b/ngen_forcing/process_nwm_forcing_to_ngen.py new file mode 100644 index 0000000..8957438 --- /dev/null +++ b/ngen_forcing/process_nwm_forcing_to_ngen.py @@ -0,0 +1,259 @@ +from ngen_forcing.defs import xr_read_window, polymask, xr_read_window_time +from rasterio import _io, windows +import xarray as xr +import pandas as pd + + +class MemoryDataset(_io.MemoryDataset, windows.WindowMethodsMixin): + pass + + +def get_forcing_dict_newway( + feature_index, + feature_list, + folder_prefix, + file_list, + var_list, +): + reng = "rasterio" + + _xds_dummy = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) + _template_arr = _xds_dummy.U2D.values + _u2d = MemoryDataset( + _template_arr, + transform=_xds_dummy.U2D.rio.transform(), + gcps=None, + rpcs=None, + crs=None, + copy=False, + ) + + # Open .nc files ahead of time + ds_list = [] + for _nc_file in file_list: + _full_nc_file = folder_prefix.joinpath(_nc_file) + ds_list.append(xr.open_dataset(_full_nc_file, engine=reng)) + + df_dict = {} + for _v in var_list: + df_dict[_v] = pd.DataFrame(index=feature_index) + + for i, feature in enumerate(feature_list): + print(f"{i}, {round(i/len(feature_list), 5)*100}".ljust(40), end="\r") + mask, _, window = polymask(_u2d)(feature) + mask = xr.DataArray(mask, dims=["y", "x"]) + winslices = dict(zip(["y", "x"], window.toslices())) + for j, _xds in enumerate(ds_list): + time_value = _xds.time.values[0] + cropped = xr_read_window(_xds, winslices, mask=mask) + stats = cropped.mean() + for var in var_list: + df_dict[var].loc[i, time_value] = stats[var] + + [ds.close() for ds in ds_list] + return df_dict + + +# def get_forcing_dict_newway_parallel( +# feature_index, +# feature_list, +# folder_prefix, +# file_list, +# var_list, +# para="thread", +# para_n=2, +# ): + +# import concurrent.futures + +# reng = "rasterio" +# _xds = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) +# _template_arr = _xds.U2D.values +# _u2d = MemoryDataset( +# _template_arr, +# transform=_xds.U2D.rio.transform(), +# gcps=None, +# rpcs=None, +# crs=None, +# copy=False, +# ) +# ds_list = [xr.open_dataset(folder_prefix.joinpath(f)) for f in file_list] +# # ds_list = [xr.open_dataset(folder_prefix.joinpath(f), engine=reng) for f in file_list] +# # TODO: figure out why using the rasterio engine DOES NOT WORK with parallel +# # TODO: figure out why NOT using the rasterio engine produces a different result + +# if para == "process": +# pool = concurrent.futures.ProcessPoolExecutor +# elif para == "thread": +# pool = concurrent.futures.ThreadPoolExecutor +# else: +# pool = concurrent.futures.ThreadPoolExecutor + +# stats = [] +# future_list = [] + +# with pool(max_workers=para_n) as executor: + +# for _i, _m in enumerate(map(polymask(_u2d), feature_list)): +# print(f"{_i}, {round(_i/len(feature_list), 5)*100}".ljust(40), end="\r") +# mask, _, window = _m +# mask = xr.DataArray(mask, dims=["y", "x"]) +# winslices = dict(zip(["y", "x"], window.toslices())) +# for ds in ds_list: +# _t = ds.time.values[0] +# future = executor.submit( +# xr_read_window_time, ds, winslices, mask=mask, idx=_i, time=_t +# ) +# # cropped = xr_read_window(f, winslices, mask=mask) +# # stats.append(cropped.mean()) +# future_list.append(future) +# for _f in concurrent.futures.as_completed(future_list): +# _j, _t, _s = _f.result() +# stats.append((_j, _t, _s)) + +# df_dict = {} +# for _v in var_list: +# df_dict[_v] = 
pd.DataFrame(index=feature_index) + +# for j, t, s in stats: +# for var in var_list: +# df_dict[var].loc[j, t] = s[var].mean() + +# [ds.close() for ds in ds_list] +# return df_dict + + +def get_forcing_dict_newway_inverted( + feature_index, + feature_list, + folder_prefix, + file_list, + var_list, +): + reng = "rasterio" + + _xds_dummy = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) + _template_arr = _xds_dummy.U2D.values + _u2d = MemoryDataset( + _template_arr, + transform=_xds_dummy.U2D.rio.transform(), + gcps=None, + rpcs=None, + crs=None, + copy=False, + ) + ds_list = [] + for _nc_file in file_list: + _full_nc_file = folder_prefix.joinpath(_nc_file) + ds_list.append(xr.open_dataset(_full_nc_file, engine=reng)) + + stats = [] + mask_win_list = [] + + for i, feature in enumerate(feature_list): + print(f"{i}, {round(i/len(feature_list), 5)*100}".ljust(40), end="\r") + mask, _, window = polymask(_u2d)(feature) + mask = xr.DataArray(mask, dims=["y", "x"]) + winslices = dict(zip(["y", "x"], window.toslices())) + mask_win_list.append((mask, winslices)) + + for i, f in enumerate(ds_list): + print(f"{i}, {round(i/len(file_list), 2)*100}".ljust(40), end="\r") + time_value = f.time.values[0] + # TODO: when we read the window, could the time be added as a dimension? + for j, (_m, _w) in enumerate(mask_win_list): + cropped = xr_read_window(f, _w, mask=_m) + stats.append((j, time_value, cropped.mean())) + + df_dict = {} + for _v in var_list: + df_dict[_v] = pd.DataFrame(index=feature_index) + + for j, t, s in stats: + for var in var_list: + df_dict[var].loc[j, t] = s[var] + + [ds.close() for ds in ds_list] + return df_dict + + +# def get_forcing_dict_newway_inverted_parallel( +# feature_index, +# feature_list, +# folder_prefix, +# file_list, +# var_list, +# para="thread", +# para_n=2, +# ): +# import concurrent.futures + +# reng = "rasterio" +# _xds = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) +# _template_arr = _xds.U2D.values +# _u2d = MemoryDataset( +# _template_arr, +# transform=_xds.U2D.rio.transform(), +# gcps=None, +# rpcs=None, +# crs=None, +# copy=False, +# ) + +# ds_list = [xr.open_dataset("data/" + f) for f in file_list] + +# stats = [] +# future_list = [] +# mask_win_list = [] + +# for i, feature in enumerate(feature_list): +# print(f"{i}, {round(i/len(feature_list), 5)*100}".ljust(40), end="\r") +# mask, _, window = polymask(_u2d)(feature) +# mask = xr.DataArray(mask, dims=["y", "x"]) +# winslices = dict(zip(["y", "x"], window.toslices())) +# mask_win_list.append((mask, winslices)) + +# ds_list = [xr.open_dataset(folder_prefix.joinpath(f)) for f in file_list] +# # ds_list = [xr.open_dataset(folder_prefix.joinpath(f), engine=reng) for f in file_list] +# # TODO: figure out why using the rasterio engine DOES NOT WORK with parallel +# # TODO: figure out why NOT using the rasterio engine produces a different result + +# stats = [] +# future_list = [] + +# if para == "process": +# pool = concurrent.futures.ProcessPoolExecutor +# elif para == "thread": +# pool = concurrent.futures.ThreadPoolExecutor +# else: +# pool = concurrent.futures.ThreadPoolExecutor + +# with pool(max_workers=para_n) as executor: +# df_dict = {} +# for _v in var_list: +# df_dict[_v] = pd.DataFrame(index=feature_index) + +# for j, ds in enumerate(ds_list): +# print(f"{j}, {round(i/len(file_list), 2)*100}".ljust(40), end="\r") +# _t = ds.time.values[0] +# for _i, (_m, _w) in enumerate(mask_win_list): +# future = executor.submit( +# xr_read_window_time, ds, _w, mask=_m, idx=_i, 
time=_t +# ) +# # cropped = xr_read_window(ds, _w, mask=_m) +# # stats.append(cropped.mean()) +# future_list.append(future) +# for _f in concurrent.futures.as_completed(future_list): +# _j, _t, _s = _f.result() +# stats.append((_j, _t, _s)) + +# df_dict = {} +# for _v in var_list: +# df_dict[_v] = pd.DataFrame(index=feature_index) + +# for j, t, s in stats: +# for var in var_list: +# df_dict[var].loc[j, t] = s[var].mean() + +# [ds.close() for ds in ds_list] +# return df_dict \ No newline at end of file diff --git a/ngen_forcing/test_process_nwm_forcing_to_ngen.py b/ngen_forcing/test_process_nwm_forcing_to_ngen.py new file mode 100644 index 0000000..545fe95 --- /dev/null +++ b/ngen_forcing/test_process_nwm_forcing_to_ngen.py @@ -0,0 +1,211 @@ +# import rioxarray as rxr +import xarray as xr +import geopandas as gpd +from rasterstats import zonal_stats + +# import rasterio +import pandas as pd + +import time + +from process_nwm_forcing_to_ngen import ( + get_forcing_dict_newway, + get_forcing_dict_newway_parallel, + get_forcing_dict_newway_inverted, + get_forcing_dict_newway_inverted_parallel, +) + +from pathlib import Path +import warnings + +warnings.simplefilter("ignore") + +# Read forcing files +# Generate List of files + +# TODO: Add looping through lists of forcing files +# consider looking at the "listofnwmfilenames.py" in the data_access_examples repository. +# Integer values for runinput, varinput, etc. are listed at the top of the file +# and an example is given in the `main` function. + +# import listofnwmfilenames +# create_file_list( +# runinput, +# varinput, +# geoinput, +# meminput, +# start_date, +# end_date, +# fcst_cycle, +# ) + +""" +A set of test files can be generated downloading these files +wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f001.conus.nc +wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f002.conus.nc +wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f003.conus.nc +wget -P 03w -c https://nextgen-hydrofabric.s3.amazonaws.com/v1.2/nextgen_03W.gpkg +""" + + +def get_forcing_dict( + feature_index, + feature_list, + folder_prefix, + filelist, + var_list, +): + reng = "rasterio" + sum_stat = "mean" + + df_dict = {} + for _v in var_list: + df_dict[_v] = pd.DataFrame(index=feature_index) + + ds_list = [] + for _nc_file in filelist: + # _nc_file = ("nwm.t00z.medium_range.forcing.f001.conus.nc") + _full_nc_file = folder_prefix.joinpath(_nc_file) + ds_list.append(xr.open_dataset(_full_nc_file, engine=reng)) + + for _i, _nc_file in enumerate(filelist): + _xds = ds_list[_i] + print(f"{_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") + if 1 == 1: + for _v in var_list: + _src = _xds[_v] + _aff2 = _src.rio.transform() + _arr2 = _src.values[0] + + _df_zonal_stats = pd.DataFrame( + zonal_stats(feature_list, _arr2, affine=_aff2) + ) + # if adding statistics back to original GeoDataFrame + # gdf3 = pd.concat([gpkg_divides, _df_zonal_stats], axis=1) + df_dict[_v][_xds.time.values[0]] = _df_zonal_stats[sum_stat] + + [_xds.close() for _xds in ds_list] + + return df_dict + + +# TODO: Convert the output to CSV with something like +# `gdf3.to_csv` + + +def main(): + folder_prefix = Path("data") + list_of_files = [ + f"nwm.t12z.medium_range.forcing.f{_r:03}.conus.nc" for _r in range(1, 241) + ] + + # Read basin boundary file + 
f_03 = "03w/nextgen_03W.gpkg" + gpkg_divides = gpd.read_file(f_03, layer="divides") + var_list = [ + "U2D", + "V2D", + "LWDOWN", + "RAINRATE", + "T2D", + "Q2D", + "PSFC", + "SWDOWN", + ] + + # file_list = list_of_files[0:30] + # gpkg_subset = gpkg_divides[0:2000] + file_list = list_of_files[0:3] + gpkg_subset = gpkg_divides[0:200] + feature_list = gpkg_subset.geometry.to_list() + + # This way is extremely slow for anything more than a + # few files, so we comment it out of the test + + start_time = time.time() + print(f"Working on the old (slow) way") + fd1 = get_forcing_dict( + gpkg_subset.index, + feature_list, + folder_prefix, + file_list, + var_list, + ) + print(time.time() - start_time) + + start_time = time.time() + print(f"Working on the new way") + fd2 = get_forcing_dict_newway( + gpkg_subset.index, + feature_list, + folder_prefix, + file_list, + var_list, + ) + print(time.time() - start_time) + + start_time = time.time() + + print(f"Working on the new way with threading parallel.") + fd3t = get_forcing_dict_newway_parallel( + gpkg_subset.index, + feature_list, + folder_prefix, + file_list, + var_list, + para="thread", + para_n=16, + ) + print(time.time() - start_time) + + start_time = time.time() + print(f"Working on the new way with process parallel.") + fd3p = get_forcing_dict_newway_parallel( + gpkg_subset.index, + feature_list, + folder_prefix, + file_list, + var_list, + para="process", + para_n=16, + ) + print(time.time() - start_time) + start_time = time.time() + print(f"Working on the new way with loops reversed.") + fd4 = get_forcing_dict_newway_inverted( + gpkg_subset.index, + feature_list, + folder_prefix, + file_list, + var_list, + ) + print(time.time() - start_time) + + start_time = time.time() + print(f"Working on the new way with loops reversed with threading parallel.") + fd5t = get_forcing_dict_newway_inverted_parallel( + gpkg_subset.index, + feature_list, + folder_prefix, + file_list, + var_list, + para="thread", + para_n=16, + ) + print(time.time() - start_time) + start_time = time.time() + print(f"Working on the new way with loops reversed with process parallel.") + fd5p = get_forcing_dict_newway_inverted_parallel( + gpkg_subset.index, + feature_list, + folder_prefix, + file_list, + var_list, + para="process", + para_n=16, + ) + print(time.time() - start_time) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/user_input_ngen.json b/ngen_forcing/user_input_ngen.json similarity index 79% rename from user_input_ngen.json rename to ngen_forcing/user_input_ngen.json index c699e97..5059e3d 100644 --- a/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -2,6 +2,7 @@ "forcing" : { "start_date" : "20220822", "end_date" : "20220822", + "nwm_files" : "", "runinput" : 2, "varinput" : 5, "geoinput" : 1, @@ -13,10 +14,12 @@ "vpu" : "03W" }, - "verbose" : false, + "verbose" : true, "bucket_type" : "S3", "bucket_name" : "ciroh-devconf", "file_prefix" : "data/", "file_type" : "csv", - "cache" : true + "cache" : true, + "dl_threads" : 10 + } \ No newline at end of file diff --git a/user_input_ngen.md b/ngen_forcing/user_input_ngen.md similarity index 89% rename from user_input_ngen.md rename to ngen_forcing/user_input_ngen.md index 902e079..9e0cd45 100644 --- a/user_input_ngen.md +++ b/ngen_forcing/user_input_ngen.md @@ -9,6 +9,7 @@ contents: "forcing" : { "start_date" : "20220822", "end_date" : "20220822", + "nwm_files" : "", "runinput" : 2, "varinput" : 5, "geoinput" : 1, @@ -20,12 +21,14 @@ contents: "vpu" : "03W" }, - "verbose" : 
false, + "verbose" : true, "bucket_type" : "S3", "bucket_name" : "ciroh-devconf", "file_prefix" : "data/", "file_type" : "csv", - "cache" : true + "cache" : true, + "dl_threads" : 10 + } ### forcing @@ -33,13 +36,13 @@ contents: | --- | --- | --- | | start_date | `string` | YYYYMMDD | | end_date | `string` | YYYYMMDD | +| nwm_files | `string` | Path to a text file containing nwm file names. One filename per line. To have nwm forcing file names generated automatically, leave this option out of the config or set it to "". | | runinput | `int` |
    1. short_range
    2. medium_range
    3. medium_range_no_da
    4. long_range
    5. analysis_assim
    6. analysis_assim_extend
    7. analysis_assim_extend_no_da
    8. analysis_assim_long
    9. analysis_assim_long_no_da
    10. analysis_assim_no_da
    11. short_range_no_da
    | | varinput | `int` |
    1. channel_rt: for real-time channel data
    2. land: for land data
    3. reservoir: for reservoir data
    4. terrain_rt: for real-time terrain data
    5. forcing: for forcing data
    | | geoinput | `int` |
    1. conus: for continental US
    2. hawaii: for Hawaii
    3. puertorico: for Puerto Rico
    | | meminput | `int` |
    1. mem_1
    2. mem_2
    3. mem_3
    4. mem_4
    5. mem_5
    6. mem_6
    7. mem_7
    | | urlbaseinput | `int` |
    1. Empty string: use local files
    2. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod/: for real-time operational data from NOAA
    3. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/post-processed/WMS/: for post-processed data from NOAA's Web Map Service
    4. https://storage.googleapis.com/national-water-model/: for input/output data stored on Google Cloud Storage
    5. https://storage.cloud.google.com/national-water-model/: for input/output data stored on Google Cloud Storage
    6. gs://national-water-model/: for input/output data stored on Google Cloud Storage
    7. https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/: for retrospective data from AWS S3
    8. s3://noaa-nwm-retrospective-2-1-pds/model_output/: for retrospective data from AWS S3
    | - ### hydrofab | Field Name | Data Type | Description | | --- | --- | --- | @@ -54,3 +57,4 @@ contents: | file_prefix | `string` | If local, this is the relative path to the bucket_name folder. If S3, this is the relative path within the S3 bucket_name bucket to store files | | file_type | `string` |
    1. "csv" : write data as csv files/
    2. "parquet" : write data as parquet files
    | | cache | `bool` |
  • true: store forcing files locally
  • false: interact with forcing files remotely
  • | +| dl_threads | `int` | Number of threads to use while downloading | From 367cc3ac1aef93f3dcb2d2ff306109dfd6566464 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 12:26:18 -0500 Subject: [PATCH 019/105] weights write to json, removed functions --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 165 +++----------------- 1 file changed, 19 insertions(+), 146 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 7a0a39d..0571d85 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -74,10 +74,6 @@ ROUTE_LINK_FILE = os.path.join(NWM_CACHE_DIR, "RouteLink_CONUS.nc") ROUTE_LINK_PARQUET = os.path.join(NWM_CACHE_DIR, "route_link_conus.parquet") -def parquet_to_gdf(parquet_filepath: str) -> gpd.GeoDataFrame: - gdf = gpd.read_parquet(parquet_filepath) - return gdf - def get_cache_dir(create: bool = True): if not os.path.exists(NWM_CACHE_DIR) and create: os.mkdir(NWM_CACHE_DIR) @@ -171,21 +167,15 @@ def generate_weights_file( print(f"{i}, {perc:.2f}%".ljust(40), end="\r") i += 1 - with open(weights_filepath, "wb") as f: - # TODO: This is a dict of ndarrays, which could be easily stored as a set of parquet files for safekeeping. - pickle.dump(crosswalk_dict, f) - -def add_zonalstats_to_gdf_weights( - gdf: gpd.GeoDataFrame, - src: xr.DataArray, - weights_filepath: str, -) -> gpd.GeoDataFrame: - """Calculates zonal stats and adds to GeoDataFrame""" - - df = calc_zonal_stats_weights(src, weights_filepath) - gdf_map = gdf.merge(df, left_on="huc10", right_on="catchment_id") + # with open(weights_filepath, "wb") as f: + # # TODO: This is a dict of ndarrays, which could be easily stored as a set of parquet files for safekeeping. + # pickle.dump(crosswalk_dict, f) - return gdf_map + # This block was taken from https://github.com/RTIInternational/hydro-evaluation/blob/dev-denno-4-1/src/evaluation/utils.py + # TODO: Perhaps import RTI's module, but just do this for now. + weights_json = json.dumps({k: [x.tolist() for x in v] for k, v in crosswalk_dict.items()}) + with open(weights_filepath, "w") as f: + f.write(weights_json) def get_blob(blob_name: str, bucket: str = NWM_BUCKET) -> bytes: @@ -213,8 +203,8 @@ def calc_zonal_stats_weights_new( # Open weights dict from pickle # This could probably be done once and passed as a reference. - with open(weights_filepath, "rb") as f: - crosswalk_dict = pickle.load(f) + with open(weights_filepath, "r") as f: + crosswalk_dict = json.load(f) nvar = src.shape[0] mean_dict = {} @@ -232,61 +222,8 @@ def calc_zonal_stats_weights_new( return mean_dict - -def calc_zonal_stats_weights( - src: xr.DataArray, - weights_filepath: str, -) -> pd.DataFrame: - """Calculates zonal stats""" - - # Open weights dict from pickle - # This could probably be done once and passed as a reference. 
- with open(weights_filepath, "rb") as f: - crosswalk_dict = pickle.load(f) - - r_array = src.values[0] - r_array[r_array == src.rio.nodata] = np.nan - - mean_dict = {} - for key, value in crosswalk_dict.items(): - mean_dict[key] = np.nanmean(r_array[value]) - - df = pd.DataFrame.from_dict(mean_dict, orient="index", columns=["value"]) - - df.reset_index(inplace=True, names="catchment_id") - - # This should not be needed, but without memory usage grows - del crosswalk_dict - del f - gc.collect() - - return df - -def get_forcing_dict_RTIway( - pickle_file, # This would be a Feature list for parallel calling -- - # if there is a stored weights file, we use it - # (checking for an optional flag to force re-creation of the weights...) - folder_prefix, - file_list, -): - - var = "RAINRATE" - reng = "rasterio" - filehandles = [ - xr.open_dataset(folder_prefix / f, engine=reng)[var] for f in file_list - ] - # filehandles = [get_dataset("data/" + f, use_cache=True) for f in file_list] - stats = [] - - for _i, f in enumerate(filehandles): - print(f"{_i}, {round(_i/len(file_list), 2)*100}".ljust(40), end="\r") - stats.append(calc_zonal_stats_weights(f, pickle_file)) - - [f.close() for f in filehandles] - return stats - def get_forcing_dict_JL( - pickle_file, + wgt_file, filelist, var_list, var_list_out @@ -301,7 +238,7 @@ def get_forcing_dict_JL( dtype=_xds['U2D'].dtype) for var_dx, jvar in enumerate(var_list): data_allvars[var_dx,:,:] = np.squeeze(_xds[jvar].values) - _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, pickle_file) + _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, wgt_file) df_by_t.append(_df_zonal_stats) print(f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}%", end="\r") @@ -317,60 +254,6 @@ def get_forcing_dict_JL( return dfs -def get_forcing_dict_RTIway2( - pickle_file, # This would be a Feature list for parallel calling -- - # if there is a stored weights file, we use it - # (checking for an optional flag to force re-creation of the weights...) - gpkg_divides, - folder_prefix, - filelist, - var_list, -): - t1=time.perf_counter() - reng = "rasterio" - pick_val = "value" - - df_dict = {} - dl_dict = {} - for _v in var_list: - df_dict[_v] = pd.DataFrame(index=gpkg_divides.index) - dl_dict[_v] = [] - - # ds_list = [] - for _i, _nc_file in enumerate(filelist): - # _nc_file = ("nwm.t00z.medium_range.forcing.f001.conus.nc") - _full_nc_file = folder_prefix.joinpath(_nc_file) - - try: - # with xr.open_dataset(_full_nc_file, engine=reng) as _xds: - with xr.open_dataset(_full_nc_file) as _xds: - # _xds = ds_list[_i] - # _xds.rio.write_crs(rasterio.crs.CRS.from_wkt(CONUS_NWM_WKT), inplace=True) - print(f"{_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") - for _v in var_list: - _src = _xds[_v] - _df_zonal_stats = calc_zonal_stats_weights(_src, pickle_file) - # if adding statistics back to original GeoDataFrame - # gdf3 = pd.concat([gpkg_divides, _df_zonal_stats], axis=1) - _df = pd.DataFrame(index=gpkg_divides.index) - _df[_xds.time.values[0]] = _df_zonal_stats[pick_val] - # TODO: This same line could add the new values directly - # to the same dictionary. But after adding about 100 of them, - # pandas starts to complain about degraded performance due to - # fragmentation of the dataframe. We tried it this was as a - # workaround, with the loop below to accomplish the concatenation. 
- dl_dict[_v].append(_df) - except: - print(f"No such file: {_full_nc_file}") - - for _v in var_list: - df_dict[_v] = pd.concat(dl_dict[_v], axis=1) - - # [_xds.close() for _xds in ds_list] - print(f"Indexing data and generating the dataframes (RTI) {time.perf_counter() - t1:.2f} s") - - return df_dict - def wget(cmd,name,semaphore=None): if not semaphore == None: semaphore.acquire() resp = os.system(cmd) @@ -399,9 +282,6 @@ def main(): parser.add_argument(dest="infile", type=str, help="A json containing user inputs to run ngen") args = parser.parse_args() - # Increase for more threads - dl_threads = 10 - # Take in user config conf = json.load(open(args.infile)) start_date = conf['forcing']['start_date'] @@ -435,7 +315,7 @@ def main(): # # Set paths and make directories if needed - top_dir = os.path.dirname(args.infile) + top_dir = Path(os.path.dirname(args.infile)).parent if not os.path.exists(CACHE_DIR): os.system(f'mkdir {CACHE_DIR}') if not os.path.exists(CACHE_DIR): @@ -497,9 +377,6 @@ def main(): command = f'wget -P {CACHE_DIR} -c {jfile}' cmds.append(command) fls.append(jfile) - - # TODO make this async! - # wget(command,jfile) threads = [] semaphore = threading.Semaphore(dl_threads) @@ -515,16 +392,12 @@ def main(): else: forcing_files = nwm_forcing_files # interacting with files remotely - print(f'SERIAL Time to dl files {time.perf_counter() - t0}') - - # Do we need a parquet file? - # parq_file = os.path.join(CACHE_DIR,"ng_03.parquet") - # polygonfile.to_parquet(parq_file) + print(f'Time to download files {time.perf_counter() - t0}') # Generate weight file only if one doesn't exist already # Very time consuming so we don't want to do this if we can avoid it - pkl_file = os.path.join(CACHE_DIR,"weights.pkl") - if not os.path.exists(pkl_file): + wgt_file = os.path.join(CACHE_DIR,"weights.json") + if not os.path.exists(wgt_file): # Search for geopackage that matches the requested VPU, if it exists gpkg = None for jfile in os.listdir(os.path.join(top_dir,'data')): @@ -544,10 +417,10 @@ def main(): print("Generating weights") t1 = time.perf_counter() - generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") + generate_weights_file(polygonfile, src, wgt_file, crosswalk_dict_key="id") print(f"Generating the weights took {time.perf_counter() - t1:.2f} s") else: - print(f"Not creating weight file! Delete this if you want to create a new one: {pkl_file}") + print(f"Not creating weight file! 
Delete this if you want to create a new one: {wgt_file}") var_list = [ "U2D", @@ -572,7 +445,7 @@ def main(): ] fd2 = get_forcing_dict_JL( - pkl_file, + wgt_file, forcing_files, var_list, var_list_out, From 82a67d32e3d63add6e8113ccaf8eedf451233b4a Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 15:21:36 -0500 Subject: [PATCH 020/105] Applied black formatting --- ngen_forcing/defs.py | 3 +- ngen_forcing/denno.py | 97 +++--- ngen_forcing/prep_hydrofab_forcings_ngen.py | 316 ++++++++++-------- ngen_forcing/process_nwm_forcing_to_ngen.py | 2 +- .../test_process_nwm_forcing_to_ngen.py | 2 +- ngen_forcing/user_input_ngen.md | 6 +- 6 files changed, 229 insertions(+), 197 deletions(-) diff --git a/ngen_forcing/defs.py b/ngen_forcing/defs.py index 516fdbb..18175c8 100644 --- a/ngen_forcing/defs.py +++ b/ngen_forcing/defs.py @@ -1,5 +1,6 @@ import rasterio.mask as riomask + def polymask(dataset, invert=False, all_touched=False): def _polymask(poly): return riomask.raster_geometry_mask( @@ -22,4 +23,4 @@ def xr_read_window_time(ds, window, mask=None, idx=None, time=None): if mask is None: return idx, time, data else: - return idx, time, data.where(mask) \ No newline at end of file + return idx, time, data.where(mask) diff --git a/ngen_forcing/denno.py b/ngen_forcing/denno.py index 37eb829..bc6c369 100644 --- a/ngen_forcing/denno.py +++ b/ngen_forcing/denno.py @@ -1,4 +1,3 @@ - # https://github.com/jameshalgren/data-access-examples/blob/DONOTMERGE_VPU16/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb # !pip install --upgrade google-api-python-client @@ -20,7 +19,7 @@ from nwm_filenames.listofnwmfilenames import create_file_list -DATA_DIR = Path(Path.home(),"code","data") +DATA_DIR = Path(Path.home(), "code", "data") TEMPLATE_BLOB_NAME = ( "nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" @@ -73,6 +72,7 @@ def parquet_to_gdf(parquet_filepath: str) -> gpd.GeoDataFrame: gdf = gpd.read_parquet(parquet_filepath) return gdf + def get_cache_dir(create: bool = True): if not os.path.exists(NWM_CACHE_DIR) and create: os.mkdir(NWM_CACHE_DIR) @@ -80,9 +80,11 @@ def get_cache_dir(create: bool = True): raise NotADirectoryError return NWM_CACHE_DIR + def make_parent_dir(filepath): Path(filepath).parent.mkdir(parents=True, exist_ok=True) + def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: """Retrieve a blob from the data service as xarray.Dataset. Based largely on OWP HydroTools. @@ -131,7 +133,8 @@ def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: engine="h5netcdf", ) return ds - + + def generate_weights_file( gdf: gpd.GeoDataFrame, src: xr.DataArray, @@ -146,8 +149,8 @@ def generate_weights_file( # This is a probably a really poor performing way to do this # TODO: Consider vectorizing -- would require digging into the - # other end of these where we unpack the weights... - i = 0 + # other end of these where we unpack the weights... 
+ i = 0 for index, row in gdf_proj.iterrows(): geom_rasterize = rasterize( [(row["geometry"], 1)], @@ -161,17 +164,19 @@ def generate_weights_file( crosswalk_dict[row[crosswalk_dict_key]] = np.where(geom_rasterize == 1) else: crosswalk_dict[index] = np.where(geom_rasterize == 1) - + if i % 100 == 0: - perc = i/len(gdf_proj)*100 + perc = i / len(gdf_proj) * 100 print(f"{i}, {perc:.2f}%".ljust(40), end="\r") - if perc > 0.01: break + if perc > 0.01: + break i += 1 with open(weights_filepath, "wb") as f: # TODO: This is a dict of ndarrays, which could be easily stored as a set of parquet files for safekeeping. pickle.dump(crosswalk_dict, f) + def add_zonalstats_to_gdf_weights( gdf: gpd.GeoDataFrame, src: xr.DataArray, @@ -240,7 +245,6 @@ def get_forcing_dict_RTIway( folder_prefix, file_list, ): - var = "RAINRATE" reng = "rasterio" filehandles = [ @@ -312,7 +316,7 @@ def get_forcing_dict_RTIway2( def main(): """ - Primary function to retrieve hydrofabrics data and convert it into files that can be ingested into ngen. + Primary function to retrieve hydrofabrics data and convert it into files that can be ingested into ngen. Also, the forcing data is retrieved. Inputs: JSON config file specifying start_date, end_date, and vpu @@ -322,68 +326,70 @@ def main(): Will store files in the same folder as the JSON config to run this script """ parser = argparse.ArgumentParser() - parser.add_argument(dest="infile", type=str, help="A json containing user inputs to run ngen") + parser.add_argument( + dest="infile", type=str, help="A json containing user inputs to run ngen" + ) args = parser.parse_args() # Take in user config conf = json.load(open(args.infile)) - start_date = conf['forcing']['start_date'] - end_date = conf['forcing']['end_date'] - vpu = conf['hydrofab']['vpu'] + start_date = conf["forcing"]["start_date"] + end_date = conf["forcing"]["end_date"] + vpu = conf["hydrofab"]["vpu"] - top_dir = os.path.dirname(args.infile) - data_dir = os.path.join(top_dir,'forcing_data') + top_dir = os.path.dirname(args.infile) + data_dir = os.path.join(top_dir, "forcing_data") if not os.path.exists(data_dir): - os.system(f'mkdir {data_dir}') + os.system(f"mkdir {data_dir}") # Generate list of file names to retrieve for forcing data # Going to make assumptions here as to which forecasts we want # Check the dictionaries at the top of listofnwmfilenames for options - n = 6 # How rapidly we want our forecasts, I think 3 is the highest frequency - fcst_cycle = [n*x for x in range(24//n)] - lead_time = [x+1 for x in range(n)] + n = 6 # How rapidly we want our forecasts, I think 3 is the highest frequency + fcst_cycle = [n * x for x in range(24 // n)] + lead_time = [x + 1 for x in range(n)] # fcst_cycle = None # Retrieves a full day for each day within the range given. 
- runinput = 2 + runinput = 2 varinput = 5 geoinput = 1 meminput = 0 urlbaseinput = None - print(f'Creating list of file names to pull...') + print(f"Creating list of file names to pull...") nwm_forcing_files = create_file_list( - runinput, - varinput, - geoinput, - meminput, - start_date, - end_date, - fcst_cycle, - urlbaseinput, - lead_time, - ) - + runinput, + varinput, + geoinput, + meminput, + start_date, + end_date, + fcst_cycle, + urlbaseinput, + lead_time, + ) - print(f'Pulling files...') + print(f"Pulling files...") local_files = [] for jfile in nwm_forcing_files: - file_parts = jfile.split('/') - local_file = os.path.join(data_dir,file_parts[-1]) + file_parts = jfile.split("/") + local_file = os.path.join(data_dir, file_parts[-1]) local_files.append(local_file) - if os.path.exists(local_file): continue + if os.path.exists(local_file): + continue else: - command = f'wget -P {data_dir} -c https://storage.googleapis.com/national-water-model/{jfile}' - os.system(command) + command = f"wget -P {data_dir} -c https://storage.googleapis.com/national-water-model/{jfile}" + os.system(command) # Download dataset, read into df with geopandas - gpkg = os.path.join(DATA_DIR,'nextgen_03W.gpkg') - ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) - src = ds["RAINRATE"] + gpkg = os.path.join(DATA_DIR, "nextgen_03W.gpkg") + ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) + src = ds["RAINRATE"] # Why are we converting to paquet and then back into geopandas dataframe? polygonfile = gpd.read_file(gpkg, layer="divides") - parq_file = os.path.join(DATA_DIR,"ng_03.parquet") + parq_file = os.path.join(DATA_DIR, "ng_03.parquet") polygonfile.to_parquet(parq_file) - pkl_file = os.path.join(DATA_DIR,"weights.pkl") + pkl_file = os.path.join(DATA_DIR, "weights.pkl") generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") calc_zonal_stats_weights(src, pkl_file) @@ -459,7 +465,6 @@ def main(): for _i, cat_id in enumerate(gpkg_divides["id"]): print(f"mv cat16_{_i:07}.csv cat16_{cat_id}.csv") + if __name__ == "__main__": - main() - \ No newline at end of file diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 0571d85..4256e01 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -1,9 +1,10 @@ +# TODO NOTE a lot of this code is borrowed from https://github.com/RTIInternational/hydro-evaluation +# In the future, import this package # https://github.com/jameshalgren/data-access-examples/blob/DONOTMERGE_VPU16/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb # !pip install --upgrade google-api-python-client # !pip install --upgrade google-cloud-storage -import pickle import pandas as pd import argparse, os, json, sys import gc @@ -21,7 +22,7 @@ import threading -pkg_dir = Path(Path(os.path.dirname(__file__)).parent,'nwm_filenames') +pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "nwm_filenames") sys.path.append(str(pkg_dir)) from listofnwmfilenames import create_file_list @@ -48,8 +49,10 @@ # paths -#TODO Make CACHE_DIR configurable -CACHE_DIR = Path(pkg_dir.parent, "data", "raw_forcing_data") # Maybe this should have a date attached to the name +# TODO Make CACHE_DIR configurable +CACHE_DIR = Path( + pkg_dir.parent, "data", "raw_forcing_data" +) # Maybe this should have a date attached to the name NWM_CACHE_DIR = os.path.join(CACHE_DIR, "nwm") USGS_CACHE_DIR = os.path.join(CACHE_DIR, "usgs") @@ -74,6 +77,7 @@ ROUTE_LINK_FILE = os.path.join(NWM_CACHE_DIR, 
"RouteLink_CONUS.nc") ROUTE_LINK_PARQUET = os.path.join(NWM_CACHE_DIR, "route_link_conus.parquet") + def get_cache_dir(create: bool = True): if not os.path.exists(NWM_CACHE_DIR) and create: os.mkdir(NWM_CACHE_DIR) @@ -81,9 +85,11 @@ def get_cache_dir(create: bool = True): raise NotADirectoryError return NWM_CACHE_DIR + def make_parent_dir(filepath): Path(filepath).parent.mkdir(parents=True, exist_ok=True) + def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: """Retrieve a blob from the data service as xarray.Dataset. Based largely on OWP HydroTools. @@ -132,7 +138,10 @@ def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: engine="h5netcdf", ) return ds - + + +# TODO: Import this instead! +# Adapted from https://github.com/RTIInternational/hydro-evaluation/blob/dev-denno-4-1/src/evaluation/loading/generate_weights.py def generate_weights_file( gdf: gpd.GeoDataFrame, src: xr.DataArray, @@ -146,8 +155,8 @@ def generate_weights_file( crosswalk_dict = {} # This is a probably a really poor performing way to do this # TODO: Consider vectorizing -- would require digging into the - # other end of these where we unpack the weights... - i = 0 + # other end of these where we unpack the weights... + i = 0 for index, row in gdf_proj.iterrows(): geom_rasterize = rasterize( [(row["geometry"], 1)], @@ -161,19 +170,15 @@ def generate_weights_file( crosswalk_dict[row[crosswalk_dict_key]] = np.where(geom_rasterize == 1) else: crosswalk_dict[index] = np.where(geom_rasterize == 1) - + if i % 100 == 0: - perc = i/len(gdf_proj)*100 + perc = i / len(gdf_proj) * 100 print(f"{i}, {perc:.2f}%".ljust(40), end="\r") i += 1 - # with open(weights_filepath, "wb") as f: - # # TODO: This is a dict of ndarrays, which could be easily stored as a set of parquet files for safekeeping. - # pickle.dump(crosswalk_dict, f) - - # This block was taken from https://github.com/RTIInternational/hydro-evaluation/blob/dev-denno-4-1/src/evaluation/utils.py - # TODO: Perhaps import RTI's module, but just do this for now. 
- weights_json = json.dumps({k: [x.tolist() for x in v] for k, v in crosswalk_dict.items()}) + weights_json = json.dumps( + {k: [x.tolist() for x in v] for k, v in crosswalk_dict.items()} + ) with open(weights_filepath, "w") as f: f.write(weights_json) @@ -195,9 +200,10 @@ def get_blob(blob_name: str, bucket: str = NWM_BUCKET) -> bytes: bucket = client.bucket(bucket) return bucket.blob(blob_name).download_as_bytes(timeout=120) + def calc_zonal_stats_weights_new( src: np.ndarray, - weights_filepath: str, + weights_filepath: str, ) -> pd.DataFrame: """Calculates zonal stats""" @@ -209,11 +215,11 @@ def calc_zonal_stats_weights_new( nvar = src.shape[0] mean_dict = {} for key, value in crosswalk_dict.items(): - mean_dict[key] = np.zeros((nvar,),dtype=np.float64) - + mean_dict[key] = np.zeros((nvar,), dtype=np.float64) + mean_dict = {} for key, value in crosswalk_dict.items(): - mean_dict[key] = np.nanmean(src[:,value[0],value[1]],axis=1) + mean_dict[key] = np.nanmean(src[:, value[0], value[1]], axis=1) # This should not be needed, but without memory usage grows del crosswalk_dict @@ -222,51 +228,56 @@ def calc_zonal_stats_weights_new( return mean_dict -def get_forcing_dict_JL( - wgt_file, - filelist, - var_list, - var_list_out -): + +def get_forcing_dict_JL(wgt_file, filelist, var_list, var_list_out): t1 = time.perf_counter() - df_by_t = [] - for _i, _nc_file in enumerate(filelist): + df_by_t = [] + for _i, _nc_file in enumerate(filelist): with xr.open_dataset(_nc_file) as _xds: - shp = _xds['U2D'].shape + shp = _xds["U2D"].shape data_allvars = np.zeros( - shape=(len(var_list),shp[1],shp[2]), - dtype=_xds['U2D'].dtype) + shape=(len(var_list), shp[1], shp[2]), dtype=_xds["U2D"].dtype + ) for var_dx, jvar in enumerate(var_list): - data_allvars[var_dx,:,:] = np.squeeze(_xds[jvar].values) + data_allvars[var_dx, :, :] = np.squeeze(_xds[jvar].values) _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, wgt_file) df_by_t.append(_df_zonal_stats) - print(f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}%", end="\r") + print( + f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}%", + end="\r", + ) - print(f'Reformating and converting data into dataframe') + print(f"Reformating and converting data into dataframe") dfs = {} for jcat in list(df_by_t[0].keys()): data_catch = [] - for jt in range(len(df_by_t)): + for jt in range(len(df_by_t)): data_catch.append(df_by_t[jt][jcat]) - dfs[jcat] = pd.DataFrame(data_catch,columns = var_list_out) + dfs[jcat] = pd.DataFrame(data_catch, columns=var_list_out) - print(f"Indexing data and generating the dataframes (JL) {time.perf_counter() - t1:.2f}s") + print( + f"Indexing data and generating the dataframes (JL) {time.perf_counter() - t1:.2f}s" + ) return dfs -def wget(cmd,name,semaphore=None): - if not semaphore == None: semaphore.acquire() - resp = os.system(cmd) + +def wget(cmd, name, semaphore=None): + if not semaphore == None: + semaphore.acquire() + resp = os.system(cmd) if resp > 0: - raise Exception(f'\nwget failed! Tried: {name}\n') + raise Exception(f"\nwget failed! Tried: {name}\n") else: - print(f'Successful download of {name}') + print(f"Successful download of {name}") + + if not semaphore == None: + semaphore.release() - if not semaphore == None: semaphore.release() def main(): """ - Primary function to retrieve hydrofabrics data and convert it into files that can be ingested into ngen. 
+ Primary function to retrieve hydrofabrics data and convert it into files that can be ingested into ngen. Also, the forcing data is retrieved. Inputs: JSON config file specifying start_date, end_date, and vpu @@ -279,148 +290,158 @@ def main(): t00 = time.perf_counter() parser = argparse.ArgumentParser() - parser.add_argument(dest="infile", type=str, help="A json containing user inputs to run ngen") - args = parser.parse_args() + parser.add_argument( + dest="infile", type=str, help="A json containing user inputs to run ngen" + ) + args = parser.parse_args() # Take in user config conf = json.load(open(args.infile)) - start_date = conf['forcing']['start_date'] - end_date = conf['forcing']['end_date'] - if 'nwm_files' in conf['forcing']: - nwm_files = conf['forcing']['nwm_files'] + start_date = conf["forcing"]["start_date"] + end_date = conf["forcing"]["end_date"] + if "nwm_files" in conf["forcing"]: + nwm_files = conf["forcing"]["nwm_files"] else: nwm_files = "" - runinput = conf['forcing']['runinput'] - varinput = conf['forcing']['varinput'] - geoinput = conf['forcing']['geoinput'] - meminput = conf['forcing']['meminput'] - urlbaseinput = conf['forcing']['urlbaseinput'] - vpu = conf['hydrofab']['vpu'] - ii_verbose = conf['verbose'] - bucket_type = conf['bucket_type'] - bucket_name = conf['bucket_name'] - file_prefix = conf['file_prefix'] - file_type = conf['file_type'] - ii_cache = conf['cache'] - dl_threads = conf['dl_threads'] - - file_types = ['csv','parquet'] - assert file_type in file_types,f'{file_type} for file_type is not accepted! Accepted: {file_types}' - - bucket_types = ['local','S3'] - assert bucket_type in bucket_types,f'{bucket_type} for bucket_type is not accepted! Accepted: {bucket_types}' - + runinput = conf["forcing"]["runinput"] + varinput = conf["forcing"]["varinput"] + geoinput = conf["forcing"]["geoinput"] + meminput = conf["forcing"]["meminput"] + urlbaseinput = conf["forcing"]["urlbaseinput"] + vpu = conf["hydrofab"]["vpu"] + ii_verbose = conf["verbose"] + bucket_type = conf["bucket_type"] + bucket_name = conf["bucket_name"] + file_prefix = conf["file_prefix"] + file_type = conf["file_type"] + ii_cache = conf["cache"] + dl_threads = conf["dl_threads"] + + file_types = ["csv", "parquet"] + assert ( + file_type in file_types + ), f"{file_type} for file_type is not accepted! Accepted: {file_types}" + + bucket_types = ["local", "S3"] + assert ( + bucket_type in bucket_types + ), f"{bucket_type} for bucket_type is not accepted! Accepted: {bucket_types}" # TODO: Subsetting! 
# # Set paths and make directories if needed - top_dir = Path(os.path.dirname(args.infile)).parent + top_dir = Path(os.path.dirname(args.infile)).parent if not os.path.exists(CACHE_DIR): - os.system(f'mkdir {CACHE_DIR}') + os.system(f"mkdir {CACHE_DIR}") if not os.path.exists(CACHE_DIR): - raise Exception(f'Creating {CACHE_DIR} failed!') + raise Exception(f"Creating {CACHE_DIR} failed!") # Prep output directory if bucket_type == "local": - bucket_path = Path(top_dir,file_prefix,bucket_name) + bucket_path = Path(top_dir, file_prefix, bucket_name) if not os.path.exists(bucket_path): - os.system(f'mkdir {bucket_path}') + os.system(f"mkdir {bucket_path}") if not os.path.exists(bucket_path): - raise Exception(f'Creating {bucket_path} failed!') - elif bucket_type == 'S3': - s3 = boto3.client('s3') + raise Exception(f"Creating {bucket_path} failed!") + elif bucket_type == "S3": + s3 = boto3.client("s3") # Get nwm forcing file names if len(nwm_files) == 0: - print(f'Creating list of file names to pull...') + print(f"Creating list of file names to pull...") n = 6 - fcst_cycle = [n*x for x in range(24//n)] - lead_time = [x+1 for x in range(n)] + fcst_cycle = [n * x for x in range(24 // n)] + lead_time = [x + 1 for x in range(n)] nwm_forcing_files = create_file_list( - runinput, - varinput, - geoinput, - meminput, - start_date, - end_date, - fcst_cycle, - urlbaseinput, - lead_time, - ) + runinput, + varinput, + geoinput, + meminput, + start_date, + end_date, + fcst_cycle, + urlbaseinput, + lead_time, + ) else: - print(f'Reading list of file names from {nwm_files}...') + print(f"Reading list of file names from {nwm_files}...") nwm_forcing_files = [] - with open(nwm_files,'r') as f: + with open(nwm_files, "r") as f: for line in f: nwm_forcing_files.append(line) - # Download whole files and store locally if cache is true, + # Download whole files and store locally if cache is true, # otherwise index remotely and save catchment based forcings t0 = time.perf_counter() if ii_cache: - # Check to see if we have files cached, if not wget them + # Check to see if we have files cached, if not wget them local_files = [] cmds = [] - fls = [] + fls = [] for jfile in nwm_forcing_files: - if ii_verbose: print(f'Looking for {jfile}') + if ii_verbose: + print(f"Looking for {jfile}") file_parts = Path(jfile).parts - local_file = os.path.join(CACHE_DIR,file_parts[-1]) + local_file = os.path.join(CACHE_DIR, file_parts[-1]) local_files.append(local_file) - if os.path.exists(local_file): - if ii_verbose: print(f'Found and using raw forcing file {local_file}') + if os.path.exists(local_file): + if ii_verbose: + print(f"Found and using raw forcing file {local_file}") continue else: - if ii_verbose: print(f'Forcing file not found! Downloading {jfile}') - command = f'wget -P {CACHE_DIR} -c {jfile}' + if ii_verbose: + print(f"Forcing file not found! 
Downloading {jfile}") + command = f"wget -P {CACHE_DIR} -c {jfile}" cmds.append(command) fls.append(jfile) - threads = [] + threads = [] semaphore = threading.Semaphore(dl_threads) - for i,jcmd in enumerate(cmds): - t = threading.Thread(target = wget, args = [jcmd, fls[i],semaphore]) + for i, jcmd in enumerate(cmds): + t = threading.Thread(target=wget, args=[jcmd, fls[i], semaphore]) t.start() threads.append(t) for jt in threads: jt.join() - forcing_files = local_files # interacting with files locally + forcing_files = local_files # interacting with files locally else: - forcing_files = nwm_forcing_files # interacting with files remotely + forcing_files = nwm_forcing_files # interacting with files remotely - print(f'Time to download files {time.perf_counter() - t0}') + print(f"Time to download files {time.perf_counter() - t0}") # Generate weight file only if one doesn't exist already # Very time consuming so we don't want to do this if we can avoid it - wgt_file = os.path.join(CACHE_DIR,"weights.json") + wgt_file = os.path.join(CACHE_DIR, "weights.json") if not os.path.exists(wgt_file): # Search for geopackage that matches the requested VPU, if it exists gpkg = None - for jfile in os.listdir(os.path.join(top_dir,'data')): + for jfile in os.listdir(os.path.join(top_dir, "data")): if jfile.find(vpu) >= 0: - gpkg = Path(top_dir,"data",jfile) - print(f'Found and using geopackge file {gpkg}') + gpkg = Path(top_dir, "data", jfile) + print(f"Found and using geopackge file {gpkg}") if gpkg == None: - url = f'https://nextgen-hydrofabric.s3.amazonaws.com/05_nextgen/nextgen_{vpu}.gpkg' - command = f'wget -P {CACHE_DIR} -c {url}' - wget(command,url) + url = f"https://nextgen-hydrofabric.s3.amazonaws.com/05_nextgen/nextgen_{vpu}.gpkg" + command = f"wget -P {CACHE_DIR} -c {url}" + wget(command, url) - print(f'Opening {gpkg}...') + print(f"Opening {gpkg}...") polygonfile = gpd.read_file(gpkg, layer="divides") - ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) - src = ds["RAINRATE"] + ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) + src = ds["RAINRATE"] print("Generating weights") t1 = time.perf_counter() generate_weights_file(polygonfile, src, wgt_file, crosswalk_dict_key="id") print(f"Generating the weights took {time.perf_counter() - t1:.2f} s") else: - print(f"Not creating weight file! Delete this if you want to create a new one: {wgt_file}") + print( + f"Not creating weight file! Delete this if you want to create a new one: {wgt_file}" + ) var_list = [ "U2D", @@ -438,12 +459,12 @@ def main(): "VGRD_10maboveground", "DLWRF_surface", "APCP_surface", - "precip_rate", # BROKEN (Identical to APCP!) + "precip_rate", # BROKEN (Identical to APCP!) 
"TMP_2maboveground", "SPFH_2maboveground", "DSWRF_surface", ] - + fd2 = get_forcing_dict_JL( wgt_file, forcing_files, @@ -454,37 +475,42 @@ def main(): # Write CSVs to file t0 = time.perf_counter() write_int = 100 - for j, jcatch in enumerate(fd2.keys()): - df = fd2[jcatch] - splt = jcatch.split('-') - - if bucket_type == 'local': - if file_type == 'csv': - csvname = Path(bucket_path,f"cat{vpu}_{splt[1]}.csv") + for j, jcatch in enumerate(fd2.keys()): + df = fd2[jcatch] + splt = jcatch.split("-") + + if bucket_type == "local": + if file_type == "csv": + csvname = Path(bucket_path, f"cat{vpu}_{splt[1]}.csv") df.to_csv(csvname) - if file_type == 'parquet': - parq_file = Path(bucket_path,f"cat{vpu}_{splt[1]}.parquet") + if file_type == "parquet": + parq_file = Path(bucket_path, f"cat{vpu}_{splt[1]}.parquet") df.to_parquet(parq_file) - elif bucket_type == 'S3': - buf = BytesIO() - if file_type == 'parquet': - parq_file = f"cat{vpu}_{splt[1]}.parquet" - df.to_parquet(buf) - elif file_type == 'csv': - csvname = f"cat{vpu}_{splt[1]}.csv" + elif bucket_type == "S3": + buf = BytesIO() + if file_type == "parquet": + parq_file = f"cat{vpu}_{splt[1]}.parquet" + df.to_parquet(buf) + elif file_type == "csv": + csvname = f"cat{vpu}_{splt[1]}.csv" df.to_csv(buf, index=False) buf.seek(0) - key_name = f'{file_prefix}{csvname}' - s3.put_object(Bucket=bucket_name, Key=key_name, Body=buf.getvalue()) + key_name = f"{file_prefix}{csvname}" + s3.put_object(Bucket=bucket_name, Key=key_name, Body=buf.getvalue()) - if (j+1) % write_int == 0: - print(f"{j+1} files written out of {len(fd2)}, {(j+1)/len(fd2)*100:.2f}%", end="\r") + if (j + 1) % write_int == 0: + print( + f"{j+1} files written out of {len(fd2)}, {(j+1)/len(fd2)*100:.2f}%", + end="\r", + ) + + print(f"{file_type} write took {time.perf_counter() - t0:.2f} s\n") - print(f'{file_type} write took {time.perf_counter() - t0:.2f} s\n') + print( + f"\n\nDone! Catchment forcing files have been generated for VPU {vpu} in {bucket_type}\n\n" + ) + print(f"Total run time: {time.perf_counter() - t00:.2f} s") - print(f'\n\nDone! Catchment forcing files have been generated for VPU {vpu} in {bucket_type}\n\n') - print(f'Total run time: {time.perf_counter() - t00:.2f} s') if __name__ == "__main__": main() - \ No newline at end of file diff --git a/ngen_forcing/process_nwm_forcing_to_ngen.py b/ngen_forcing/process_nwm_forcing_to_ngen.py index 8957438..23c73ba 100644 --- a/ngen_forcing/process_nwm_forcing_to_ngen.py +++ b/ngen_forcing/process_nwm_forcing_to_ngen.py @@ -256,4 +256,4 @@ def get_forcing_dict_newway_inverted( # df_dict[var].loc[j, t] = s[var].mean() # [ds.close() for ds in ds_list] -# return df_dict \ No newline at end of file +# return df_dict diff --git a/ngen_forcing/test_process_nwm_forcing_to_ngen.py b/ngen_forcing/test_process_nwm_forcing_to_ngen.py index 545fe95..ed98587 100644 --- a/ngen_forcing/test_process_nwm_forcing_to_ngen.py +++ b/ngen_forcing/test_process_nwm_forcing_to_ngen.py @@ -208,4 +208,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/ngen_forcing/user_input_ngen.md b/ngen_forcing/user_input_ngen.md index 9e0cd45..8877637 100644 --- a/ngen_forcing/user_input_ngen.md +++ b/ngen_forcing/user_input_ngen.md @@ -36,7 +36,7 @@ contents: | --- | --- | --- | | start_date | `string` | YYYYMMDD | | end_date | `string` | YYYYMMDD | -| nwm_files | `string` | Path to a text file containing nwm file names. One filename per line. 
To have nwm forcing file names generated automatically, leave this option out of the config or set it to "". |
+| nwm_files | `string` | Path to a text file containing nwm file names. One filename per line. To have nwm forcing file names generated automatically, leave this option out of the config or set it to "" |
| runinput | `int` |
    1. short_range
    2. medium_range
    3. medium_range_no_da
    4. long_range
    5. analysis_assim
    6. analysis_assim_extend
    7. analysis_assim_extend_no_da
    8. analysis_assim_long
    9. analysis_assim_long_no_da
    10. analysis_assim_no_da
    11. short_range_no_da
    |
| varinput | `int` |
    1. channel_rt: for real-time channel data
    2. land: for land data
    3. reservoir: for reservoir data
    4. terrain_rt: for real-time terrain data
    5. forcing: for forcing data
    |
| geoinput | `int` |
    1. conus: for continental US
    2. hawaii: for Hawaii
    3. puertorico: for Puerto Rico
    |
@@ -56,5 +56,5 @@ contents:
| bucket_name | `string` | If local, this is the name of the folder the data will be placed in. If S3, this is the name of S3 bucket, which must exist already. |
| file_prefix | `string` | If local, this is the relative path to the bucket_name folder. If S3, this is the relative path within the S3 bucket_name bucket to store files |
| file_type | `string` |
    1. "csv" : write data as csv files/
    2. "parquet" : write data as parquet files
    |
-| cache | `bool` |
  • true: store forcing files locally
  • false: interact with forcing files remotely
  • |
-| dl_threads | `int` | Number of threads to use while downloading |
+| cache | `bool` |
  • true: Store forcing files locally. Must specify dl_threads
  • false: Interact with forcing files remotely
  • | +| dl_threads | `int` | Number of threads to use while downloading. | From 72066ab620fb6f16b4d91fb5d19e47d223c3a6b4 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 15:23:27 -0500 Subject: [PATCH 021/105] block pycache in ngen_forcing --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index c6e39ed..125a2ec 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ data/* nwm_filenames/__pycache__/ subsetting/__pycache__/ +ngen_forcing/__pycache__/ venv/ \ No newline at end of file From c9630ce66f6b6b0c4dea5fbcd3d654229b4baa6e Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 15:24:33 -0500 Subject: [PATCH 022/105] removed pycache --- ngen_forcing/__pycache__/defs.cpython-311.pyc | Bin 1345 -> 0 bytes ngen_forcing/__pycache__/defs.cpython-38.pyc | Bin 938 -> 0 bytes .../process_nwm_forcing_to_ngen.cpython-311.pyc | Bin 6098 -> 0 bytes .../process_nwm_forcing_to_ngen.cpython-38.pyc | Bin 2977 -> 0 bytes ...t_process_nwm_forcing_to_ngen.cpython-311.pyc | Bin 6097 -> 0 bytes 5 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 ngen_forcing/__pycache__/defs.cpython-311.pyc delete mode 100644 ngen_forcing/__pycache__/defs.cpython-38.pyc delete mode 100644 ngen_forcing/__pycache__/process_nwm_forcing_to_ngen.cpython-311.pyc delete mode 100644 ngen_forcing/__pycache__/process_nwm_forcing_to_ngen.cpython-38.pyc delete mode 100644 ngen_forcing/__pycache__/test_process_nwm_forcing_to_ngen.cpython-311.pyc diff --git a/ngen_forcing/__pycache__/defs.cpython-311.pyc b/ngen_forcing/__pycache__/defs.cpython-311.pyc deleted file mode 100644 index c6a40066f7994994d36d7134d8f5ca0faff26564..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1345 zcmb7DPfHs?6rY*hO`i1MOlmOHmiDppoA>6uH}CCxZ$89g?Fi`U z<405wguY7Qia=I$o`P);QADwea(IqlL_<{D#B&-AQyq4lMyLV1LGcVqwtbUsnQ2E; z?hTN9NK5a6x0hli45GH{T19WgSz_ok5t|MO%PU#t=F{qEsqfj-{8tCB5u|j;9@u zva~d`Y{y|jSZvMC6i`jQ-4%a&3yL9)-!Stu-=b^ery4t@ZfFBa-h_tnzgEcxrNu5W5Q8}yM z;4u(Y_Ve@e3Elxx5V;D?yFirv688ZvE+AZ;Xa?N)S7=8ay;Z%3cY@Q+?SBrMJ>;VU z*ncM_(&{mTj&SCP1bS95H!xP07-!HuLPZc-&E~1M z8tAIex@w|oULx`9_dD0P#hFd5h9#G?D(fQd0hCZJ8rvD&9^HLVnS3)@nyQ7PCBJ6I z%h5A4zO%8tu|ITT4xXBW)yQC@0?io&cgd>{`vOl5i*8P9oi#bpyb0cUbh#4~`phB2<8!D{PQ*F(6sj#{DHz==9) Og)RdR%d;Cnh5rdLFes$} diff --git a/ngen_forcing/__pycache__/defs.cpython-38.pyc b/ngen_forcing/__pycache__/defs.cpython-38.pyc deleted file mode 100644 index e52dfc5c1af0db3166cdc12f95d47beb992e5322..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 938 zcmZuv&2AGh5VpPEWb;=6Efo?Rx$Gt3PE}Px$^|Kx!X-jlt#+Jb+f6naZ%KldTczZ} zJG95VlCPY21uihNX-Yth%8p=v9(yswHkRKLEQ#l zz0#vlZ;(j3+Y`%m+Q+o*DX~Ia$&D<sYP=Wg9S79GQ&(+e|XM=@n#aR{=`Asr0 z;BGV9$R$C7lpsPb%#Fnn9HexG&zU@uO zyLXk55qn7vrpF#poFvBD$SmXOJTta88=F})$}vf2;~toi?ao)O5%JUf4Hv;45ff5N zCA3hyetmhlcj%sFc@k&#y!#k~4RQ}P26R+#Dumc#m9LNsgxpOR?4PNM%%2JQ#K$(B z|Lw&)y0EW_mEum6@?KIG6o~5pPq`bVcPC4APb~#;DYkO3RZgeYWX@kq44P0o1=iIT z=?wCP)10BxT$dfN3S9;4Xo(S;7^0mk1G1ctkWCp#t?)U?iO-e?s?o`M1@R3Dj}ZL_ zh_6bo)TKYFfU>j$%R0lw4h13CI;`PSmY*@>NpfBaOQ(x;YLF-KU$n=6qm9^+{SF{E z6}})#sQUd-4rpu#gRnO2>_#iiyIhLuFXSYu4pcc-Q3wD1DN%4X&EY09=REV4({HP0 HqZ#}Lt|Z1L diff --git a/ngen_forcing/__pycache__/process_nwm_forcing_to_ngen.cpython-311.pyc b/ngen_forcing/__pycache__/process_nwm_forcing_to_ngen.cpython-311.pyc deleted file mode 100644 index 21bd3c010c14424f9de91c76acddb9af51ac64dc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6098 
zcmeHLTTC3+8J?NF&BE@oH)dJx5WCJM4q)R3+aV4ZY&XWHjd7fG6FOS$3}orvdSo+qF#GYJbN=(6+du#L&iReo-9W*kPrNStv74g)iiyf&-xi)MfN-4>D1lB=DLP80 zm?)F7L@hMtF-dF67PY19QM-}0Bpp!}@-0bcw1J_{P=fUuCD=sUTw}$sXrFUckU@qb1 zL`sY>x`j_@F6Hh!KT&F}TN()q!5olr%#K+=s zQIdFZA(qM}MX5hME2jCG42dVwv;A3;F-p@{Qk6Vj&hV(ymt7=IFl>MW1gQyNsrpy4 zvPD!wS+ln_L+>WY1M)g()zVO7;I0i@=1d88<=ycCC(W z({dAP-LBTm*%kX7wr4zZ)m6s^Xxwy}EzlH0$vw4-3Ty7a(qvA* z3%-)`=6pH7X}*0G3yN283!YhKv&q~}&MWV)=unh=peC2~6@MXE%d1IdUozV?_beAM zt%K8C3TqoRRw0;cR+{IUD=pMKzW%Gpaq>$Q4OshTC7^gWy*FX(x2xYuE~JDCbt_-f zH}^T$vh7)6r9}zX902^ME;ZLusjqkxZiXp)*2+7p^h%H5yGAL_ams9$rW6;1KL4c7 zl9(imB#|NbS>K{{yO^F$q{Y*>Y`Rk>v9ts+E2Ud!<5@|!l5AYkEpY;3Jd<4nbsGd{ zap5U*u=^24qDKP4^8in2HxygMw4l?AI=ui1tB^=Z_#dNqyc$?Q8AAttI!|5(xK7Q& zCo|3EXoW67diP@<%!kZMXo@0k(8CVJlNm|8#pq1mfX)b#gyZV&Cg|f&8-|k!Nseby z*%Mb^oE7D2@D&nqnNN#XuEZAmhLf3iEGeA;p`!d9b0n-Y3q*HivJmodGk{%1XXi4BbT%fxr(51UJg(a>$CC4+WW;dYg7c4V zrQ2g!s3ho2R?wZeA)O|$59v-ZJ)aUuOcr%pmL$@$ZX=m_(6c1Pv~Ekz%}X-DNi0_y zB_y_p(;b&}%hg0yXXT8POu&|-TXC}_=!mckVk`|7GVw^O?w%22@;niFV?A|OC5cl2 zg_)!v5FI~<(hmt&Z8mFo0(50t4W72!y7$k_{%z-&>h=dA?Q^)&Fl7ULg0k|+oPiAz&y09 zH{kGiV{UavoMd1&1>GjeF&X?8`YN@8k?uVo!%Otg9-cvXsEj+CG!9O+L~VH}7q>M2 z&=*+iDsLWfG~d+H*;24`ZBh$Hmd-zDY$>tL zC4c)`pXT3p-`{u7->3QqH2*-!?ZemQExG+$ZimbMm;(43VQG9TNHzJE&Xj^3TNGm( zr^}dUOD+5FxAfg>>DyRPTaIciNAvCy+w_6wea}Z^?PrDD-HXN0Ni}p*17ycEcC2W; zUwOP>%)@O_wC$(?dFBDvvX7bxdB-ZV+?a2C$aNK5D)*Acy|hI&*j{~% zu-uTR^Ft3<_o}bxIR@`X;`-=~c;TSh*{^l>tL^=Fd@47naf5fpz+749zUQ@jp4Yx$ zRL_{^8T&F=WG7X2Qe!7UD78L+*E`i zjj6)bV$)%@>9E#xIB$IrY`GCwj~52lqZ{1kU7vNSk>gtAxEg*%3%{ZUhqd5v-uWQZ zT?nb6h!%=$Q8wqR1}vY+kLKl4xZ~5X7LIK6XyJqTvk$m^g^L@$=beqbF=t^5IwLap0`wXL26&sC`| z9EE)55h$%nTbZybE*Q;o#vy^rwdXpvEp;j#N@pGQZLLq$QD07JuTtNAf`1R%+X}Qd zFiG&tVK82T?s}Tw`GK?};73F{5xNk%5zyrc`a0Q%fQOMsNGk~*MF>)gZ-vd07ts7c zfO-;3`XS@$kv%vH@VCn%kwxKIH1-bK8$iJOgGiVhLisSlAi@!Z?;;pC5J<)iT1tjc zI*#xP0N6P!U)1B!byZN1mv1z9N{R!F@#e9kuGu?y@HT)>&yaU8@k2~7dT>d?YV!5D97rX!2Qji3g`YGD{+kb;MdcRrT=<+O`Dh0Wv z^Cnhkvkd^RvJL#tutK2V_u2M)Z2P)Uc)i%sr?UMT+g~)^hu-k|0oB{9d3(1gyY09E z%U0mKhqn3d%SF%2+k7|jar)D=+J0~&rg8%sH*hDQaz~&T-us^6d!FID7gW!P<{9}y zD6$hOJE5@?Ae5T9wM%Q$g+p5N{(Qs3wtagp}J zdV%+Xp8X$F-bj<~tG^KK6T}&*94U-of>#OU7;Q%hZwht$0-Pdai$=`S?b%pbh)Ly; zg277nzhzt#pBLr#GJ^7E8OL`45CkcLq9}@bvFs>mI*poCa^18|Th({14c z7izJz$jl-a`B=@gy=FQ?-QrtL$ydl{*lSPjNAM}nv!raf6HjM)C@}lSz5vg&`|R#- zamV-T1TOvVJM611LjHw?i&q5-Hy{;%0%3&Fgru}bQ>$mCcF(3*XC+Qr=~Ys<=bEye zRC^xO+exige~U1ORqhj3;g!JxcL$9Vr?J8zq=;1i{fLOlB<)h*$tqMjm~zdpA${ z?jQjJVmHbe-(_Ge2&0G>MZm{lI!t)6n;r5j=;tDev%}q?$W3c@oSxSOG7s>bo#8}O z!8)eG0`aIRM>rSf*9K%fNN+$YG8iDEWkr$!Ro2u}HnY~r8C4Z#pIW7>+<`rC%BrlW zs>!?4>KV=)-Y`3-Zs{G8vZm~P^6?90-Pov~(Q-l6rk=67Ftc?=DLHyOCS^l8@NTE{ zWo=Mbgt@0wHI%%B8Xm{+CpX`K)0r>5dD{5u1GH@^wWmk69idqIMty8;PRjbq0Gg7XZF>IFWYE7-L6X-waU`;hu zw{Pu{ztN-DAk~ll!IoJ2K2f!OasuO54OlqK2ig;1A-RZif$Qr5)h^EtYP|-Ch z!mNPFPPKCw4GZmvVN_^45|EAZ;RIfH6GL)A<9#J2fZJserCz_6- z;IKF?@QJ%1LVxEME)WOOZAe820lMye>y4E(va0&gp;SfGz+8Q!l!!3DJ zz)sXPo{ds2LdkVyDB?`&ipWRso}KVaSCYY~kitd-=A+kz2qze_NNW3&c&I&@7fB5J zO*;q-fiZ(n@-PDnd9=Nu{XP%nNbtbSiEf-1aZu2hCya|=D0n{}>smhsBA6EMUMOZc zN5JF*FvhG1*eFdW+7Bd$jSrm;LLsyhjIJ?Vh346pK^6sQ2;S?DlH_r*%KCxnxfR^g zG}hI5SHz3hX&PvE4#gTg=cA$nFb|$}9fuc~*y?H|a)>6=l|qIR)*kfrKj(Y(8Pkez zE3Si(s$;EDk6LiojY95`TX&$YPP-O9+g6M2&^Fv%+M*5GggTFISxqYb2o|2g)^kwt z7+e4TKVu6J*MJ1bngi<7IiTtWP%Z$Z39y<0stSPe=79294FFYV3kFclZvoUIpryfl z094ZeswJ0ZWdKzB98k)ida9+C`#7015rXT}l{3<}u?GOR14s_we8ZryTV9gg!7AW<32?rl zE*S{^RW$(SJAmVExmjMGL-UHdtgZkwH&6kZU9~xf=9ggebJk|d7m&FD$m|>lAEw{X zQZylVQ?#(WhO&-=hnl#AvWapT1y_vNLU|tKA#Ppq9h5G}|42gdT{Q4Ll*h-o0igIk 
z)O`Wz3He@q#ufom+~Y!W_U#B1FQfHWP=0{2gR+Z)$GW(Rg2)p;M0phji7S4B@>7)8 zP!MY3XDB~M`31@z$}dq&FmRiSYbZZL`4!0abK(Z(Uq|^h3jQXDeUvv*Zlb(}@;1sl zD8E5@7v)2co{uLD{(u=?i34o-Eedi-{0`-Pln+2O?Tb5Dz(Yt|K?>Z%gyqMVdOx>hZ{3Lf%Q@z_dz$Xji+64pv5Ayquwn-R$i4^_WepMCcI zK@de{y!&}(zHh$yX7-zJelvgd`#lKK!rzZc#Xf}oLkrEsEobh33z;vGgd|2m3C8|s z6CC`r3eGqZ4yM6#CR{d;Pw?>1DejCX;mLRt-i$Bd%lH$1TMjFMOfV5#ZWBs`SR{w0 zx|*Fzj%y4;*Wqm>2Dy>syn+(lk_%{9?veNpS;`|mXc@i+x$6y4prDmGYriq~60Mzx zF_yQGRYgV6bWzu0*y>Nox-gj~$+Vghq;ygjRQb{+@v;@&fFa~WA}WfkY%GyhFUmxh zB`da(j;2IQ85oA&6!i<1Go4biL{2(dqd>?Dc{=^~e+xr}sb`7~i7h(QsOU2A7<%7T zBlPg`_qWYEm0?q1PkaGiXx@89o1+oBJwbQ`{bmT1wh(-AKx!}k@EuUx7ivn0#mI~ z&fCjpMQ6FDFv}lb_^0es+=_>^^)f%z>^omA?^&Qv7d6=b-_P^bMc6Rxu;=AW>5B5Jx!(pd{WqQ?l9&3&Xms=|YZVZ4vcSrpXZWtUv|hxyuhchn2LZC$pK{ z(ZcrC%RYWs$tFcbI|`Ym)H`6176np56}+?YT$M`aK23a*`1rjm@3kuLgA9s=EPg7R zR&#cY5OX=0NsFB(mM;rgfsR3wb;~WQ`HW0NUA7!Kl2&!gL9%%e@A(T69|=Ftp)n(h;amM}Eg^ppVked}X$Lsut|KN@jjl#*4iFYIh}Y5S}mcb8r2Hn?LnO|DXNU zr}r#;RP7%z`$xXrR_%YGA#po!ds7{G+_W1? z%g1Wr{yBE`P}hIc*M~D!+R5*vuPq_-nTr8C>Xsp;djmE56Sg#XC)V8Qk%9mnO(&_#jDmTZzV4>XG*&P8u5Ywy+^FtKuI(kt;}V zMMu~VI0<*g^VLzU$ViSNyTM+^O3oXu&h|wvg(Ut@Oc5K5QCA0DIn=UR!$?LXT*1{)byG1*G7O z&@~s#JiLum^gc2ByB^#A?se?%z-0ed&k3)we`P5?iqHF>*v$8=v7Cmz<+R>vUDtC0 zPt5+_$F_ex&k6oC_Dcv*Y2So}DHiHLl{CEBsPZYuETc*{1bhUcoB0Y$>^B@>M-MdT z3R?l*#r2VWg?L7m(tuU~C8gx}@&PeEX|a3v7u@k=R?Ta1HA!fYim`;Qcz`lw8D`p0 zgKUDzLL?)m)%Eb72wd%XAOJZK+v{8*wO2+f{7VbZ0%<*Uu0zD$AD8GJxJ0+P!h!W+ z?Ql}m_Z&F1_rQS>A)nK9B8z~(@*eWyutSw<<`lauvB~PEvk`Li59nGpy z{em0?WEdUPl(@swrS%J-2Mm{}(&pJaZaEv3OCjhKfIg-i z%SoxA7WO<$+cJ=E)8$&ODhxfWos!Mwv?!6`Y_G{uym0){g*O~VVf1m8tg${8AoNTN zK%aW5#ib~Ur}NbS(tJvW1CLHFTP`@QP%KV%fjD`F5(<4ShYjfo1t)|q6+)o~p*vG7 zME1a+<)_Vr?4+$xv+$&_cfaMM4g?t9_8&4#<=hmyH;}y7<~Cf7?50c)+_(^3pyvsT zo0LW<*$29${klzawBmW1eZj7HA*a!AyBfwfP&}rqill%qz^g@J5tPtkPh_s|^YPj7 z($DK`bGa;@c>TMP@5Fz;`>%KHHFS@zH2|u;PLT2--xyOGTMYNj9+~f1_+=%0uo^yS zh7Xp;>#REjYqCj7r|O$f*OobPo|#LUp=V34)w;LLeKa4Q%bVRhN^jJ{Tg>otx5LA? 
z!o$_@h#4L!jsM_5-F>qIpKqSsTzY*GyDOeuRXk+kp$Z;a^lYl}VUr)I@jW%Zx5jtX z_|QGC(+jo%DINbIfE=FRZ=Z>MHZ(Id=c#(0t>SGa-d4fe?soNdXx&!D+fBT^g10Yv zp7|-X-leu4uHqvmK2pI)7QI7jwbuN-wP060$ori45RmUFDUJQm{m5p&RK+iw_~i?h&9-LOA>#`1<88B#d6WR3?=3iLFChr_rfUz!4h9eS!!^)N9KybG;TAVg;Rb4$FO7YC^2*7Q{i_3S6I9#t^FK8$ Bf^Yx; From 106fb086b52ad656ecbb22a666359ba081603dc3 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 15:26:25 -0500 Subject: [PATCH 023/105] removed raw script --- ngen_forcing/denno.py | 470 ------------------------------------------ 1 file changed, 470 deletions(-) delete mode 100644 ngen_forcing/denno.py diff --git a/ngen_forcing/denno.py b/ngen_forcing/denno.py deleted file mode 100644 index bc6c369..0000000 --- a/ngen_forcing/denno.py +++ /dev/null @@ -1,470 +0,0 @@ -# https://github.com/jameshalgren/data-access-examples/blob/DONOTMERGE_VPU16/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb - -# !pip install --upgrade google-api-python-client -# !pip install --upgrade google-cloud-storage - -import pickle -import time -import pandas as pd -import argparse, os, json -import gc -from pathlib import Path -import geopandas as gpd -import pandas as pd -import numpy as np -import xarray as xr -from google.cloud import storage -from rasterio.io import MemoryFile -from rasterio.features import rasterize - -from nwm_filenames.listofnwmfilenames import create_file_list - -DATA_DIR = Path(Path.home(), "code", "data") - -TEMPLATE_BLOB_NAME = ( - "nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" -) -NWM_BUCKET = "national-water-model" - -# WKT strings extracted from NWM grids -CONUS_NWM_WKT = 'PROJCS["Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]], \ -PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ -PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-97.0],PARAMETER["standard_parallel_1",30.0],\ -PARAMETER["standard_parallel_2",60.0],PARAMETER["latitude_of_origin",40.0],UNIT["Meter",1.0]]' - -HI_NWM_WKT = 'PROJCS["Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]],\ -PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ -PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-157.42],PARAMETER["standard_parallel_1",10.0],\ -PARAMETER["standard_parallel_2",30.0],PARAMETER["latitude_of_origin",20.6],UNIT["Meter",1.0]]' - -PR_NWM_WKT = 'PROJCS["Sphere_Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]],\ -PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ -PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-65.91],PARAMETER["standard_parallel_1",18.1],\ -PARAMETER["standard_parallel_2",18.1],PARAMETER["latitude_of_origin",18.1],UNIT["Meter",1.0]]' - -# paths -CACHE_DIR = Path(Path.home(), "code", "data_access_examples", "forcing_data") -NWM_CACHE_DIR = os.path.join(CACHE_DIR, "nwm") -USGS_CACHE_DIR = os.path.join(CACHE_DIR, "usgs") -GEO_CACHE_DIR = os.path.join(CACHE_DIR, "geo") - -NWM_CACHE_H5 = os.path.join(NWM_CACHE_DIR, "gcp_client.h5") - -PARQUET_CACHE_DIR = os.path.join(CACHE_DIR, "parquet") -MEDIUM_RANGE_FORCING_PARQUET = os.path.join(PARQUET_CACHE_DIR, "forcing_medium_range") -FORCING_ANALYSIS_ASSIM_PARQUET = os.path.join( - PARQUET_CACHE_DIR, 
"forcing_analysis_assim" -) -MEDIUM_RANGE_PARQUET = os.path.join(PARQUET_CACHE_DIR, "medium_range") -USGS_PARQUET = os.path.join(PARQUET_CACHE_DIR, "usgs") - -HUC10_SHP_FILEPATH = os.path.join(GEO_CACHE_DIR, "wbdhu10_conus.shp") -HUC10_PARQUET_FILEPATH = os.path.join(GEO_CACHE_DIR, "wbdhu10_conus.parquet") -HUC10_MEDIUM_RANGE_WEIGHTS_FILEPATH = os.path.join( - GEO_CACHE_DIR, "wbdhu10_medium_range_weights.pkl" -) - -ROUTE_LINK_FILE = os.path.join(NWM_CACHE_DIR, "RouteLink_CONUS.nc") -ROUTE_LINK_PARQUET = os.path.join(NWM_CACHE_DIR, "route_link_conus.parquet") - - -def parquet_to_gdf(parquet_filepath: str) -> gpd.GeoDataFrame: - gdf = gpd.read_parquet(parquet_filepath) - return gdf - - -def get_cache_dir(create: bool = True): - if not os.path.exists(NWM_CACHE_DIR) and create: - os.mkdir(NWM_CACHE_DIR) - if not os.path.exists(NWM_CACHE_DIR): - raise NotADirectoryError - return NWM_CACHE_DIR - - -def make_parent_dir(filepath): - Path(filepath).parent.mkdir(parents=True, exist_ok=True) - - -def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: - """Retrieve a blob from the data service as xarray.Dataset. - Based largely on OWP HydroTools. - Parameters - ---------- - blob_name: str, required - Name of blob to retrieve. - use_cache: bool, default True - If cache should be used. - If True, checks to see if file is in cache, and - If fetched from remote, will save to cache. - Returns - ------- - ds : xarray.Dataset - The data stored in the blob. - """ - # TODO: Check to see if this does any better than kerchunk - # the caching should help, but probably needs to be managed to function asynchronously. - # Perhaps if the files is not cached, we can create the dataset from - # kerchunk with a remote path and then asynchronously do a download to cache it - # for next time. The hypothesis would be that the download speed will not be any slower than - # just accessing the file remotely. - nc_filepath = os.path.join(get_cache_dir(), blob_name) - make_parent_dir(nc_filepath) - - # If the file exists and use_cache = True - if os.path.exists(nc_filepath) and use_cache: - # Get dataset from cache - ds = xr.load_dataset( - nc_filepath, - engine="h5netcdf", - ) - return ds - else: - # Get raw bytes - raw_bytes = get_blob(blob_name) - # Create Dataset - ds = xr.load_dataset( - MemoryFile(raw_bytes), - engine="h5netcdf", - ) - if use_cache: - # Subset and cache - ds["RAINRATE"].to_netcdf( - nc_filepath, - engine="h5netcdf", - ) - return ds - - -def generate_weights_file( - gdf: gpd.GeoDataFrame, - src: xr.DataArray, - weights_filepath: str, - crosswalk_dict_key: str, -): - """Generate a weights file.""" - - gdf_proj = gdf.to_crs(CONUS_NWM_WKT) - - crosswalk_dict = {} - - # This is a probably a really poor performing way to do this - # TODO: Consider vectorizing -- would require digging into the - # other end of these where we unpack the weights... 
- i = 0 - for index, row in gdf_proj.iterrows(): - geom_rasterize = rasterize( - [(row["geometry"], 1)], - out_shape=src.rio.shape, - transform=src.rio.transform(), - all_touched=True, - fill=0, # IS FILL 0 - dtype="uint8", - ) - if crosswalk_dict_key: - crosswalk_dict[row[crosswalk_dict_key]] = np.where(geom_rasterize == 1) - else: - crosswalk_dict[index] = np.where(geom_rasterize == 1) - - if i % 100 == 0: - perc = i / len(gdf_proj) * 100 - print(f"{i}, {perc:.2f}%".ljust(40), end="\r") - if perc > 0.01: - break - i += 1 - - with open(weights_filepath, "wb") as f: - # TODO: This is a dict of ndarrays, which could be easily stored as a set of parquet files for safekeeping. - pickle.dump(crosswalk_dict, f) - - -def add_zonalstats_to_gdf_weights( - gdf: gpd.GeoDataFrame, - src: xr.DataArray, - weights_filepath: str, -) -> gpd.GeoDataFrame: - """Calculates zonal stats and adds to GeoDataFrame""" - - df = calc_zonal_stats_weights(src, weights_filepath) - gdf_map = gdf.merge(df, left_on="huc10", right_on="catchment_id") - - return gdf_map - - -def get_blob(blob_name: str, bucket: str = NWM_BUCKET) -> bytes: - """Retrieve a blob from the data service as bytes. - Based largely on OWP HydroTools. - Parameters - ---------- - blob_name : str, required - Name of blob to retrieve. - Returns - ------- - data : bytes - The data stored in the blob. - """ - # Setup anonymous client and retrieve blob data - client = storage.Client.create_anonymous_client() - bucket = client.bucket(bucket) - return bucket.blob(blob_name).download_as_bytes(timeout=120) - - -def calc_zonal_stats_weights( - src: xr.DataArray, - weights_filepath: str, -) -> pd.DataFrame: - """Calculates zonal stats""" - - # Open weights dict from pickle - # This could probably be done once and passed as a reference. - with open(weights_filepath, "rb") as f: - crosswalk_dict = pickle.load(f) - - r_array = src.values[0] - r_array[r_array == src.rio.nodata] = np.nan - - mean_dict = {} - for key, value in crosswalk_dict.items(): - mean_dict[key] = np.nanmean(r_array[value]) - - df = pd.DataFrame.from_dict(mean_dict, orient="index", columns=["value"]) - - df.reset_index(inplace=True, names="catchment_id") - - # This should not be needed, but without memory usage grows - del crosswalk_dict - del f - gc.collect() - - return df - - -def get_forcing_dict_RTIway( - pickle_file, # This would be a Feature list for parallel calling -- - # if there is a stored weights file, we use it - # (checking for an optional flag to force re-creation of the weights...) - folder_prefix, - file_list, -): - var = "RAINRATE" - reng = "rasterio" - filehandles = [ - xr.open_dataset(folder_prefix / f, engine=reng)[var] for f in file_list - ] - # filehandles = [get_dataset("data/" + f, use_cache=True) for f in file_list] - stats = [] - - for _i, f in enumerate(filehandles): - print(f"{_i}, {round(_i/len(file_list), 2)*100}".ljust(40), end="\r") - stats.append(calc_zonal_stats_weights(f, pickle_file)) - - [f.close() for f in filehandles] - return stats - - -def get_forcing_dict_RTIway2( - pickle_file, # This would be a Feature list for parallel calling -- - # if there is a stored weights file, we use it - # (checking for an optional flag to force re-creation of the weights...) 
- gpkg_divides, - folder_prefix, - filelist, - var_list, -): - reng = "rasterio" - pick_val = "value" - - df_dict = {} - dl_dict = {} - for _v in var_list: - df_dict[_v] = pd.DataFrame(index=gpkg_divides.index) - dl_dict[_v] = [] - - # ds_list = [] - for _i, _nc_file in enumerate(filelist): - # _nc_file = ("nwm.t00z.medium_range.forcing.f001.conus.nc") - _full_nc_file = folder_prefix.joinpath(_nc_file) - - try: - # with xr.open_dataset(_full_nc_file, engine=reng) as _xds: - with xr.open_dataset(_full_nc_file) as _xds: - # _xds = ds_list[_i] - # _xds.rio.write_crs(rasterio.crs.CRS.from_wkt(CONUS_NWM_WKT), inplace=True) - print(f"{_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") - for _v in var_list: - _src = _xds[_v] - _df_zonal_stats = calc_zonal_stats_weights(_src, pickle_file) - # if adding statistics back to original GeoDataFrame - # gdf3 = pd.concat([gpkg_divides, _df_zonal_stats], axis=1) - _df = pd.DataFrame(index=gpkg_divides.index) - _df[_xds.time.values[0]] = _df_zonal_stats[pick_val] - # TODO: This same line could add the new values directly - # to the same dictionary. But after adding about 100 of them, - # pandas starts to complain about degraded performance due to - # fragmentation of the dataframe. We tried it this was as a - # workaround, with the loop below to accomplish the concatenation. - dl_dict[_v].append(_df) - except: - print(f"No such file: {_full_nc_file}") - - for _v in var_list: - df_dict[_v] = pd.concat(dl_dict[_v], axis=1) - - # [_xds.close() for _xds in ds_list] - - return df_dict - - -def main(): - """ - Primary function to retrieve hydrofabrics data and convert it into files that can be ingested into ngen. - Also, the forcing data is retrieved. - - Inputs: JSON config file specifying start_date, end_date, and vpu - - Outputs: ngen catchment/nexus configs and forcing files - - Will store files in the same folder as the JSON config to run this script - """ - parser = argparse.ArgumentParser() - parser.add_argument( - dest="infile", type=str, help="A json containing user inputs to run ngen" - ) - args = parser.parse_args() - - # Take in user config - conf = json.load(open(args.infile)) - start_date = conf["forcing"]["start_date"] - end_date = conf["forcing"]["end_date"] - vpu = conf["hydrofab"]["vpu"] - - top_dir = os.path.dirname(args.infile) - data_dir = os.path.join(top_dir, "forcing_data") - if not os.path.exists(data_dir): - os.system(f"mkdir {data_dir}") - - # Generate list of file names to retrieve for forcing data - # Going to make assumptions here as to which forecasts we want - # Check the dictionaries at the top of listofnwmfilenames for options - n = 6 # How rapidly we want our forecasts, I think 3 is the highest frequency - fcst_cycle = [n * x for x in range(24 // n)] - lead_time = [x + 1 for x in range(n)] - # fcst_cycle = None # Retrieves a full day for each day within the range given. 
- runinput = 2 - varinput = 5 - geoinput = 1 - meminput = 0 - urlbaseinput = None - - print(f"Creating list of file names to pull...") - nwm_forcing_files = create_file_list( - runinput, - varinput, - geoinput, - meminput, - start_date, - end_date, - fcst_cycle, - urlbaseinput, - lead_time, - ) - - print(f"Pulling files...") - local_files = [] - for jfile in nwm_forcing_files: - file_parts = jfile.split("/") - local_file = os.path.join(data_dir, file_parts[-1]) - local_files.append(local_file) - if os.path.exists(local_file): - continue - else: - command = f"wget -P {data_dir} -c https://storage.googleapis.com/national-water-model/{jfile}" - os.system(command) - - # Download dataset, read into df with geopandas - gpkg = os.path.join(DATA_DIR, "nextgen_03W.gpkg") - ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) - src = ds["RAINRATE"] - - # Why are we converting to paquet and then back into geopandas dataframe? - polygonfile = gpd.read_file(gpkg, layer="divides") - parq_file = os.path.join(DATA_DIR, "ng_03.parquet") - polygonfile.to_parquet(parq_file) - pkl_file = os.path.join(DATA_DIR, "weights.pkl") - generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") - calc_zonal_stats_weights(src, pkl_file) - - folder_prefix = DATA_DIR - - var_list = [ - "U2D", - "V2D", - "LWDOWN", - "RAINRATE", - "T2D", - "Q2D", - "PSFC", - "SWDOWN", - ] - - var_list - start_time = time.time() - print(f"Working on the new way") - fd2 = get_forcing_dict_RTIway2( - pkl_file, - polygonfile, - folder_prefix, - nwm_forcing_files, - var_list, - ) - print(time.time() - start_time) - - fd2["U2D"] - pcp_var.transpose()[0] - pcp_var = fd2["RAINRATE"] - lw_var = fd2["LWDOWN"] - sw_var = fd2["SWDOWN"] - sp_var = fd2["PSFC"] - tmp_var = fd2["T2D"] - u2d_var = fd2["U2D"] - v2d_var = fd2["V2D"] - pcp_var2 = fd2["RAINRATE"] - - for _i in range(0, 40000): - # _i = 0 - try: - pcp_var_0 = pcp_var.transpose()[_i].rename("APCP_surface") - lw_var_0 = lw_var.transpose()[_i].rename("DLWRF_surface") - sw_var_0 = sw_var.transpose()[_i].rename("DSWRF_surface") - sp_var_0 = sp_var.transpose()[_i].rename("SPFH_2maboveground") - tmp_var_0 = tmp_var.transpose()[_i].rename("TMP_2maboveground") - u2d_var_0 = u2d_var.transpose()[_i].rename("UGRD_10maboveground") - v2d_var_0 = v2d_var.transpose()[_i].rename("VGRD_10maboveground") - pcp_var2_0 = pcp_var2.transpose()[_i].rename("precip_rate") ##BROKEN!! - - d = pd.concat( - [ - pcp_var_0, - lw_var_0, - sw_var_0, - sp_var_0, - tmp_var_0, - u2d_var_0, - v2d_var_0, - pcp_var2_0, - ], - axis=1, - ) - d.index.name = "time" - - d.to_csv(f"input_data/cat16_{_i:07}.csv") - except: - print(f"no data for watershed {_i}", end="\t") - - ## Make a shell script string to rename the csvs... 
- gpkg_divides["id"] - for _i, cat_id in enumerate(gpkg_divides["id"]): - print(f"mv cat16_{_i:07}.csv cat16_{cat_id}.csv") - - -if __name__ == "__main__": - main() From 3d778378ada70d0bd87615540cdfd0b7c6c8c0f7 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 15:29:41 -0500 Subject: [PATCH 024/105] Removed alternates for now --- ngen_forcing/defs.py | 26 -- ngen_forcing/process_nwm_forcing_to_ngen.py | 259 ------------------ .../test_process_nwm_forcing_to_ngen.py | 211 -------------- 3 files changed, 496 deletions(-) delete mode 100644 ngen_forcing/defs.py delete mode 100644 ngen_forcing/process_nwm_forcing_to_ngen.py delete mode 100644 ngen_forcing/test_process_nwm_forcing_to_ngen.py diff --git a/ngen_forcing/defs.py b/ngen_forcing/defs.py deleted file mode 100644 index 18175c8..0000000 --- a/ngen_forcing/defs.py +++ /dev/null @@ -1,26 +0,0 @@ -import rasterio.mask as riomask - - -def polymask(dataset, invert=False, all_touched=False): - def _polymask(poly): - return riomask.raster_geometry_mask( - dataset, [poly], invert=invert, all_touched=all_touched, crop=True - ) - - return _polymask - - -def xr_read_window(ds, window, mask=None): - data = ds.isel(window) - if mask is None: - return data - else: - return data.where(mask) - - -def xr_read_window_time(ds, window, mask=None, idx=None, time=None): - data = ds.isel(window) - if mask is None: - return idx, time, data - else: - return idx, time, data.where(mask) diff --git a/ngen_forcing/process_nwm_forcing_to_ngen.py b/ngen_forcing/process_nwm_forcing_to_ngen.py deleted file mode 100644 index 23c73ba..0000000 --- a/ngen_forcing/process_nwm_forcing_to_ngen.py +++ /dev/null @@ -1,259 +0,0 @@ -from ngen_forcing.defs import xr_read_window, polymask, xr_read_window_time -from rasterio import _io, windows -import xarray as xr -import pandas as pd - - -class MemoryDataset(_io.MemoryDataset, windows.WindowMethodsMixin): - pass - - -def get_forcing_dict_newway( - feature_index, - feature_list, - folder_prefix, - file_list, - var_list, -): - reng = "rasterio" - - _xds_dummy = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) - _template_arr = _xds_dummy.U2D.values - _u2d = MemoryDataset( - _template_arr, - transform=_xds_dummy.U2D.rio.transform(), - gcps=None, - rpcs=None, - crs=None, - copy=False, - ) - - # Open .nc files ahead of time - ds_list = [] - for _nc_file in file_list: - _full_nc_file = folder_prefix.joinpath(_nc_file) - ds_list.append(xr.open_dataset(_full_nc_file, engine=reng)) - - df_dict = {} - for _v in var_list: - df_dict[_v] = pd.DataFrame(index=feature_index) - - for i, feature in enumerate(feature_list): - print(f"{i}, {round(i/len(feature_list), 5)*100}".ljust(40), end="\r") - mask, _, window = polymask(_u2d)(feature) - mask = xr.DataArray(mask, dims=["y", "x"]) - winslices = dict(zip(["y", "x"], window.toslices())) - for j, _xds in enumerate(ds_list): - time_value = _xds.time.values[0] - cropped = xr_read_window(_xds, winslices, mask=mask) - stats = cropped.mean() - for var in var_list: - df_dict[var].loc[i, time_value] = stats[var] - - [ds.close() for ds in ds_list] - return df_dict - - -# def get_forcing_dict_newway_parallel( -# feature_index, -# feature_list, -# folder_prefix, -# file_list, -# var_list, -# para="thread", -# para_n=2, -# ): - -# import concurrent.futures - -# reng = "rasterio" -# _xds = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) -# _template_arr = _xds.U2D.values -# _u2d = MemoryDataset( -# _template_arr, -# transform=_xds.U2D.rio.transform(), -# 
gcps=None, -# rpcs=None, -# crs=None, -# copy=False, -# ) -# ds_list = [xr.open_dataset(folder_prefix.joinpath(f)) for f in file_list] -# # ds_list = [xr.open_dataset(folder_prefix.joinpath(f), engine=reng) for f in file_list] -# # TODO: figure out why using the rasterio engine DOES NOT WORK with parallel -# # TODO: figure out why NOT using the rasterio engine produces a different result - -# if para == "process": -# pool = concurrent.futures.ProcessPoolExecutor -# elif para == "thread": -# pool = concurrent.futures.ThreadPoolExecutor -# else: -# pool = concurrent.futures.ThreadPoolExecutor - -# stats = [] -# future_list = [] - -# with pool(max_workers=para_n) as executor: - -# for _i, _m in enumerate(map(polymask(_u2d), feature_list)): -# print(f"{_i}, {round(_i/len(feature_list), 5)*100}".ljust(40), end="\r") -# mask, _, window = _m -# mask = xr.DataArray(mask, dims=["y", "x"]) -# winslices = dict(zip(["y", "x"], window.toslices())) -# for ds in ds_list: -# _t = ds.time.values[0] -# future = executor.submit( -# xr_read_window_time, ds, winslices, mask=mask, idx=_i, time=_t -# ) -# # cropped = xr_read_window(f, winslices, mask=mask) -# # stats.append(cropped.mean()) -# future_list.append(future) -# for _f in concurrent.futures.as_completed(future_list): -# _j, _t, _s = _f.result() -# stats.append((_j, _t, _s)) - -# df_dict = {} -# for _v in var_list: -# df_dict[_v] = pd.DataFrame(index=feature_index) - -# for j, t, s in stats: -# for var in var_list: -# df_dict[var].loc[j, t] = s[var].mean() - -# [ds.close() for ds in ds_list] -# return df_dict - - -def get_forcing_dict_newway_inverted( - feature_index, - feature_list, - folder_prefix, - file_list, - var_list, -): - reng = "rasterio" - - _xds_dummy = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) - _template_arr = _xds_dummy.U2D.values - _u2d = MemoryDataset( - _template_arr, - transform=_xds_dummy.U2D.rio.transform(), - gcps=None, - rpcs=None, - crs=None, - copy=False, - ) - ds_list = [] - for _nc_file in file_list: - _full_nc_file = folder_prefix.joinpath(_nc_file) - ds_list.append(xr.open_dataset(_full_nc_file, engine=reng)) - - stats = [] - mask_win_list = [] - - for i, feature in enumerate(feature_list): - print(f"{i}, {round(i/len(feature_list), 5)*100}".ljust(40), end="\r") - mask, _, window = polymask(_u2d)(feature) - mask = xr.DataArray(mask, dims=["y", "x"]) - winslices = dict(zip(["y", "x"], window.toslices())) - mask_win_list.append((mask, winslices)) - - for i, f in enumerate(ds_list): - print(f"{i}, {round(i/len(file_list), 2)*100}".ljust(40), end="\r") - time_value = f.time.values[0] - # TODO: when we read the window, could the time be added as a dimension? 
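        # One way the TODO above could be handled (an untested sketch; cropped_list
        # is a hypothetical accumulator, not a variable defined in this file):
        # tag each window read with its timestamp and concatenate along a new
        # "time" dimension once the loops finish, e.g.
        #     cropped = xr_read_window(f, _w, mask=_m).expand_dims({"time": [time_value]})
        #     cropped_list.append(cropped)
        #     stacked = xr.concat(cropped_list, dim="time")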
- for j, (_m, _w) in enumerate(mask_win_list): - cropped = xr_read_window(f, _w, mask=_m) - stats.append((j, time_value, cropped.mean())) - - df_dict = {} - for _v in var_list: - df_dict[_v] = pd.DataFrame(index=feature_index) - - for j, t, s in stats: - for var in var_list: - df_dict[var].loc[j, t] = s[var] - - [ds.close() for ds in ds_list] - return df_dict - - -# def get_forcing_dict_newway_inverted_parallel( -# feature_index, -# feature_list, -# folder_prefix, -# file_list, -# var_list, -# para="thread", -# para_n=2, -# ): -# import concurrent.futures - -# reng = "rasterio" -# _xds = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) -# _template_arr = _xds.U2D.values -# _u2d = MemoryDataset( -# _template_arr, -# transform=_xds.U2D.rio.transform(), -# gcps=None, -# rpcs=None, -# crs=None, -# copy=False, -# ) - -# ds_list = [xr.open_dataset("data/" + f) for f in file_list] - -# stats = [] -# future_list = [] -# mask_win_list = [] - -# for i, feature in enumerate(feature_list): -# print(f"{i}, {round(i/len(feature_list), 5)*100}".ljust(40), end="\r") -# mask, _, window = polymask(_u2d)(feature) -# mask = xr.DataArray(mask, dims=["y", "x"]) -# winslices = dict(zip(["y", "x"], window.toslices())) -# mask_win_list.append((mask, winslices)) - -# ds_list = [xr.open_dataset(folder_prefix.joinpath(f)) for f in file_list] -# # ds_list = [xr.open_dataset(folder_prefix.joinpath(f), engine=reng) for f in file_list] -# # TODO: figure out why using the rasterio engine DOES NOT WORK with parallel -# # TODO: figure out why NOT using the rasterio engine produces a different result - -# stats = [] -# future_list = [] - -# if para == "process": -# pool = concurrent.futures.ProcessPoolExecutor -# elif para == "thread": -# pool = concurrent.futures.ThreadPoolExecutor -# else: -# pool = concurrent.futures.ThreadPoolExecutor - -# with pool(max_workers=para_n) as executor: -# df_dict = {} -# for _v in var_list: -# df_dict[_v] = pd.DataFrame(index=feature_index) - -# for j, ds in enumerate(ds_list): -# print(f"{j}, {round(i/len(file_list), 2)*100}".ljust(40), end="\r") -# _t = ds.time.values[0] -# for _i, (_m, _w) in enumerate(mask_win_list): -# future = executor.submit( -# xr_read_window_time, ds, _w, mask=_m, idx=_i, time=_t -# ) -# # cropped = xr_read_window(ds, _w, mask=_m) -# # stats.append(cropped.mean()) -# future_list.append(future) -# for _f in concurrent.futures.as_completed(future_list): -# _j, _t, _s = _f.result() -# stats.append((_j, _t, _s)) - -# df_dict = {} -# for _v in var_list: -# df_dict[_v] = pd.DataFrame(index=feature_index) - -# for j, t, s in stats: -# for var in var_list: -# df_dict[var].loc[j, t] = s[var].mean() - -# [ds.close() for ds in ds_list] -# return df_dict diff --git a/ngen_forcing/test_process_nwm_forcing_to_ngen.py b/ngen_forcing/test_process_nwm_forcing_to_ngen.py deleted file mode 100644 index ed98587..0000000 --- a/ngen_forcing/test_process_nwm_forcing_to_ngen.py +++ /dev/null @@ -1,211 +0,0 @@ -# import rioxarray as rxr -import xarray as xr -import geopandas as gpd -from rasterstats import zonal_stats - -# import rasterio -import pandas as pd - -import time - -from process_nwm_forcing_to_ngen import ( - get_forcing_dict_newway, - get_forcing_dict_newway_parallel, - get_forcing_dict_newway_inverted, - get_forcing_dict_newway_inverted_parallel, -) - -from pathlib import Path -import warnings - -warnings.simplefilter("ignore") - -# Read forcing files -# Generate List of files - -# TODO: Add looping through lists of forcing files -# consider looking at the 
"listofnwmfilenames.py" in the data_access_examples repository. -# Integer values for runinput, varinput, etc. are listed at the top of the file -# and an example is given in the `main` function. - -# import listofnwmfilenames -# create_file_list( -# runinput, -# varinput, -# geoinput, -# meminput, -# start_date, -# end_date, -# fcst_cycle, -# ) - -""" -A set of test files can be generated downloading these files -wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f001.conus.nc -wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f002.conus.nc -wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f003.conus.nc -wget -P 03w -c https://nextgen-hydrofabric.s3.amazonaws.com/v1.2/nextgen_03W.gpkg -""" - - -def get_forcing_dict( - feature_index, - feature_list, - folder_prefix, - filelist, - var_list, -): - reng = "rasterio" - sum_stat = "mean" - - df_dict = {} - for _v in var_list: - df_dict[_v] = pd.DataFrame(index=feature_index) - - ds_list = [] - for _nc_file in filelist: - # _nc_file = ("nwm.t00z.medium_range.forcing.f001.conus.nc") - _full_nc_file = folder_prefix.joinpath(_nc_file) - ds_list.append(xr.open_dataset(_full_nc_file, engine=reng)) - - for _i, _nc_file in enumerate(filelist): - _xds = ds_list[_i] - print(f"{_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") - if 1 == 1: - for _v in var_list: - _src = _xds[_v] - _aff2 = _src.rio.transform() - _arr2 = _src.values[0] - - _df_zonal_stats = pd.DataFrame( - zonal_stats(feature_list, _arr2, affine=_aff2) - ) - # if adding statistics back to original GeoDataFrame - # gdf3 = pd.concat([gpkg_divides, _df_zonal_stats], axis=1) - df_dict[_v][_xds.time.values[0]] = _df_zonal_stats[sum_stat] - - [_xds.close() for _xds in ds_list] - - return df_dict - - -# TODO: Convert the output to CSV with something like -# `gdf3.to_csv` - - -def main(): - folder_prefix = Path("data") - list_of_files = [ - f"nwm.t12z.medium_range.forcing.f{_r:03}.conus.nc" for _r in range(1, 241) - ] - - # Read basin boundary file - f_03 = "03w/nextgen_03W.gpkg" - gpkg_divides = gpd.read_file(f_03, layer="divides") - var_list = [ - "U2D", - "V2D", - "LWDOWN", - "RAINRATE", - "T2D", - "Q2D", - "PSFC", - "SWDOWN", - ] - - # file_list = list_of_files[0:30] - # gpkg_subset = gpkg_divides[0:2000] - file_list = list_of_files[0:3] - gpkg_subset = gpkg_divides[0:200] - feature_list = gpkg_subset.geometry.to_list() - - # This way is extremely slow for anything more than a - # few files, so we comment it out of the test - - start_time = time.time() - print(f"Working on the old (slow) way") - fd1 = get_forcing_dict( - gpkg_subset.index, - feature_list, - folder_prefix, - file_list, - var_list, - ) - print(time.time() - start_time) - - start_time = time.time() - print(f"Working on the new way") - fd2 = get_forcing_dict_newway( - gpkg_subset.index, - feature_list, - folder_prefix, - file_list, - var_list, - ) - print(time.time() - start_time) - - start_time = time.time() - - print(f"Working on the new way with threading parallel.") - fd3t = get_forcing_dict_newway_parallel( - gpkg_subset.index, - feature_list, - folder_prefix, - file_list, - var_list, - para="thread", - para_n=16, - ) - print(time.time() - start_time) - - start_time = time.time() - print(f"Working on the new way with process parallel.") - fd3p = 
get_forcing_dict_newway_parallel( - gpkg_subset.index, - feature_list, - folder_prefix, - file_list, - var_list, - para="process", - para_n=16, - ) - print(time.time() - start_time) - start_time = time.time() - print(f"Working on the new way with loops reversed.") - fd4 = get_forcing_dict_newway_inverted( - gpkg_subset.index, - feature_list, - folder_prefix, - file_list, - var_list, - ) - print(time.time() - start_time) - - start_time = time.time() - print(f"Working on the new way with loops reversed with threading parallel.") - fd5t = get_forcing_dict_newway_inverted_parallel( - gpkg_subset.index, - feature_list, - folder_prefix, - file_list, - var_list, - para="thread", - para_n=16, - ) - print(time.time() - start_time) - start_time = time.time() - print(f"Working on the new way with loops reversed with process parallel.") - fd5p = get_forcing_dict_newway_inverted_parallel( - gpkg_subset.index, - feature_list, - folder_prefix, - file_list, - var_list, - para="process", - para_n=16, - ) - print(time.time() - start_time) - - -if __name__ == "__main__": - main() From ded1a98ad6808caa8088230cf2e5feb29536fdcf Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 15:33:43 -0500 Subject: [PATCH 025/105] blacked subsetting and added new line to end of file --- .gitignore | 2 +- ngen_forcing/user_input_ngen.json | 2 +- subsetting/ncatch_upstream.py | 41 +++++++++++++++++----------- subsetting/subset_forcing.py | 44 ++++++++++++++++++------------- 4 files changed, 54 insertions(+), 35 deletions(-) diff --git a/.gitignore b/.gitignore index 125a2ec..688d779 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,4 @@ data/* nwm_filenames/__pycache__/ subsetting/__pycache__/ ngen_forcing/__pycache__/ -venv/ \ No newline at end of file +venv/ diff --git a/ngen_forcing/user_input_ngen.json b/ngen_forcing/user_input_ngen.json index 5059e3d..d5610bc 100644 --- a/ngen_forcing/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -22,4 +22,4 @@ "cache" : true, "dl_threads" : 10 -} \ No newline at end of file +} diff --git a/subsetting/ncatch_upstream.py b/subsetting/ncatch_upstream.py index 51c9032..31221c0 100644 --- a/subsetting/ncatch_upstream.py +++ b/subsetting/ncatch_upstream.py @@ -1,16 +1,23 @@ import geopandas as gpd import argparse, os -from subset import get_upstream_ids +from subset import get_upstream_ids + def main(): - #setup the argument parser + # setup the argument parser parser = argparse.ArgumentParser() - parser.add_argument(dest="infile", type=str, help="A gpkg file containing divides and nexus layers") - parser.add_argument(dest="outfile", type=str, help="A text file containing the number of upstream catchments for each catchment") + parser.add_argument( + dest="infile", type=str, help="A gpkg file containing divides and nexus layers" + ) + parser.add_argument( + dest="outfile", + type=str, + help="A text file containing the number of upstream catchments for each catchment", + ) args = parser.parse_args() - infile = args.infile - outfile = args.outfile + infile = args.infile + outfile = args.outfile print("Reading catchment data...") df_cat = gpd.read_file(str(infile), layer="divides") @@ -18,19 +25,22 @@ def main(): print("Reading nexus data...") df_nex = gpd.read_file(str(infile), layer="nexus") - df_cat.set_index('id', inplace=True) + df_cat.set_index("id", inplace=True) print("Finding upstream catchments...") - upstream = nupstream(df_cat.reset_index(), df_nex.reset_index(),df_cat.index) + upstream = nupstream(df_cat.reset_index(), df_nex.reset_index(), 
df_cat.index) - with open(outfile,'w') as fp: - fp.write(f'Catchment IDs and the number of upstream catchments\nGenerated with file {os.path.basename(infile)}\n') + with open(outfile, "w") as fp: + fp.write( + f"Catchment IDs and the number of upstream catchments\nGenerated with file {os.path.basename(infile)}\n" + ) for jcatch in upstream: - fp.write(f'{jcatch} : {upstream[jcatch]}\n') + fp.write(f"{jcatch} : {upstream[jcatch]}\n") + + print(f"Done! - > {outfile}") - print(f'Done! - > {outfile}') -def nupstream(divides,nexus,cat_list): +def nupstream(divides, nexus, cat_list): """ Find the number of upstream catchments for each catchment """ @@ -41,9 +51,10 @@ def nupstream(divides,nexus,cat_list): jnupstream = len(cat_up_ids) upstream[jcat_id] = jnupstream - upstream = dict(sorted(upstream.items(), key=lambda x:x[1], reverse=True)) + upstream = dict(sorted(upstream.items(), key=lambda x: x[1], reverse=True)) return upstream + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/subsetting/subset_forcing.py b/subsetting/subset_forcing.py index 9199442..2a7b7b3 100644 --- a/subsetting/subset_forcing.py +++ b/subsetting/subset_forcing.py @@ -1,23 +1,28 @@ import argparse, os, json + def main(): """ Find forcing files in a directory that match the catchments within a catchment.geojson - + """ - #setup the argument parser + # setup the argument parser parser = argparse.ArgumentParser() - parser.add_argument(dest="forcing_dir", type=str, help="Path to forcing files") - parser.add_argument(dest="forcing_dir_out", type=str, help="Path to output the forcing files subset") - parser.add_argument(dest="catchment_file", type=str, help="A catchment geojson file") + parser.add_argument(dest="forcing_dir", type=str, help="Path to forcing files") + parser.add_argument( + dest="forcing_dir_out", type=str, help="Path to output the forcing files subset" + ) + parser.add_argument( + dest="catchment_file", type=str, help="A catchment geojson file" + ) args = parser.parse_args() - indir = args.forcing_dir - outdir = args.forcing_dir_out + indir = args.forcing_dir + outdir = args.forcing_dir_out catch_file = args.catchment_file if not os.path.exists(outdir): - os.system(f'mkdir {outdir}') + os.system(f"mkdir {outdir}") forcing_files = os.listdir(indir) @@ -27,23 +32,26 @@ def main(): # User should validate the catch file. 
# Would do here with ngen-cal, just don't want to create the dependency - feats = data['features'] + feats = data["features"] forcing_out = [] for jfeat in feats: found = False - try: # Geopandas/pydantic descrepancy - cat_id = jfeat['id'] - except: - cat_id = jfeat['properties']['id'] + try: # Geopandas/pydantic descrepancy + cat_id = jfeat["id"] + except: + cat_id = jfeat["properties"]["id"] for jforcing in forcing_files: if jforcing.find(cat_id) >= 0: found = True forcing_out.append(jforcing) - os.system(f'cp {os.path.join(indir,jforcing)} {os.path.join(outdir,jforcing)}') - if not found: - print(f'Couldn\'t find forcing file for {cat_id}!') + os.system( + f"cp {os.path.join(indir,jforcing)} {os.path.join(outdir,jforcing)}" + ) + if not found: + print(f"Couldn't find forcing file for {cat_id}!") else: - print(f'Found forcing file for {cat_id}!') + print(f"Found forcing file for {cat_id}!") + if __name__ == "__main__": - main() \ No newline at end of file + main() From ca06fe168fe0721ede38704173f2c6de46a5fc30 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 15:54:09 -0500 Subject: [PATCH 026/105] Added hydrofabric version to config --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 13 +++++++++---- ngen_forcing/user_input_ngen.json | 1 + ngen_forcing/user_input_ngen.md | 3 +++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 4256e01..d51ac05 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -308,6 +308,7 @@ def main(): geoinput = conf["forcing"]["geoinput"] meminput = conf["forcing"]["meminput"] urlbaseinput = conf["forcing"]["urlbaseinput"] + version = conf["hydrofab"]["version"] vpu = conf["hydrofab"]["vpu"] ii_verbose = conf["verbose"] bucket_type = conf["bucket_type"] @@ -315,7 +316,8 @@ def main(): file_prefix = conf["file_prefix"] file_type = conf["file_type"] ii_cache = conf["cache"] - dl_threads = conf["dl_threads"] + if ii_cache: + dl_threads = conf["dl_threads"] file_types = ["csv", "parquet"] assert ( @@ -351,8 +353,11 @@ def main(): if len(nwm_files) == 0: print(f"Creating list of file names to pull...") n = 6 - fcst_cycle = [n * x for x in range(24 // n)] - lead_time = [x + 1 for x in range(n)] + # fcst_cycle = [n*x for x in range(24//n)] + # lead_time = [x+1 for x in range(n)] + fcst_cycle = [] + lead_time = None + nwm_forcing_files = create_file_list( runinput, varinput, @@ -424,7 +429,7 @@ def main(): gpkg = Path(top_dir, "data", jfile) print(f"Found and using geopackge file {gpkg}") if gpkg == None: - url = f"https://nextgen-hydrofabric.s3.amazonaws.com/05_nextgen/nextgen_{vpu}.gpkg" + url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/nextgen_{vpu}.gpkg" command = f"wget -P {CACHE_DIR} -c {url}" wget(command, url) diff --git a/ngen_forcing/user_input_ngen.json b/ngen_forcing/user_input_ngen.json index d5610bc..94ecc89 100644 --- a/ngen_forcing/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -11,6 +11,7 @@ }, "hydrofab" : { + "version" : "v1.2", "vpu" : "03W" }, diff --git a/ngen_forcing/user_input_ngen.md b/ngen_forcing/user_input_ngen.md index 8877637..d0d9ed0 100644 --- a/ngen_forcing/user_input_ngen.md +++ b/ngen_forcing/user_input_ngen.md @@ -18,6 +18,7 @@ contents: }, "hydrofab" : { + "version" : "v1.2", "vpu" : "03W" }, @@ -30,6 +31,7 @@ contents: "dl_threads" : 10 } + ### forcing | Field Name | Data Type | Description | @@ -46,6 +48,7 @@ contents: ### 
hydrofab | Field Name | Data Type | Description | | --- | --- | --- | +| version | `string` | Current hydrofabric version | | vpu | `string` | Check here for map of VPUs https://noaa-owp.github.io/hydrofabric/articles/data_access.html | ### other options From a3ac55e6aa941d75c6c8aed3d224845f71b8a139 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 16:02:15 -0500 Subject: [PATCH 027/105] Fixed wget gpkg bug --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index d51ac05..839e168 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -352,11 +352,10 @@ def main(): # Get nwm forcing file names if len(nwm_files) == 0: print(f"Creating list of file names to pull...") - n = 6 + # n = 6 # fcst_cycle = [n*x for x in range(24//n)] # lead_time = [x+1 for x in range(n)] - fcst_cycle = [] - lead_time = None + fcst_cycle = [0] nwm_forcing_files = create_file_list( runinput, @@ -367,7 +366,6 @@ def main(): end_date, fcst_cycle, urlbaseinput, - lead_time, ) else: print(f"Reading list of file names from {nwm_files}...") @@ -429,12 +427,14 @@ def main(): gpkg = Path(top_dir, "data", jfile) print(f"Found and using geopackge file {gpkg}") if gpkg == None: - url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/nextgen_{vpu}.gpkg" + gpkg = f"nextgen_{vpu}.gpkg" + url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/{gpkg}" command = f"wget -P {CACHE_DIR} -c {url}" wget(command, url) + local_gpkg = Path(top_dir, "data",gpkg) - print(f"Opening {gpkg}...") - polygonfile = gpd.read_file(gpkg, layer="divides") + print(f"Opening {local_gpkg}...") + polygonfile = gpd.read_file(local_gpkg, layer="divides") ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) src = ds["RAINRATE"] From ef427f5adcd35dab855123d18c9c66924a16f7b5 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 16:35:55 -0500 Subject: [PATCH 028/105] Default to short range --- ngen_forcing/user_input_ngen.json | 2 +- ngen_forcing/user_input_ngen.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ngen_forcing/user_input_ngen.json b/ngen_forcing/user_input_ngen.json index 94ecc89..a72ffe2 100644 --- a/ngen_forcing/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -3,7 +3,7 @@ "start_date" : "20220822", "end_date" : "20220822", "nwm_files" : "", - "runinput" : 2, + "runinput" : 1, "varinput" : 5, "geoinput" : 1, "meminput" : 0, diff --git a/ngen_forcing/user_input_ngen.md b/ngen_forcing/user_input_ngen.md index d0d9ed0..dce7042 100644 --- a/ngen_forcing/user_input_ngen.md +++ b/ngen_forcing/user_input_ngen.md @@ -10,7 +10,7 @@ contents: "start_date" : "20220822", "end_date" : "20220822", "nwm_files" : "", - "runinput" : 2, + "runinput" : 1, "varinput" : 5, "geoinput" : 1, "meminput" : 0, From 33ad1a875aa7799d5deebe0b48c4a0988f2a9048 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 16:44:05 -0500 Subject: [PATCH 029/105] Fixed pathing issues --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 839e168..6ed8115 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -51,7 +51,7 @@ # TODO Make CACHE_DIR 
configurable CACHE_DIR = Path( - pkg_dir.parent, "data", "raw_forcing_data" + pkg_dir.parent, "data", "raw_data" ) # Maybe this should have a date attached to the name NWM_CACHE_DIR = os.path.join(CACHE_DIR, "nwm") @@ -422,19 +422,18 @@ def main(): if not os.path.exists(wgt_file): # Search for geopackage that matches the requested VPU, if it exists gpkg = None - for jfile in os.listdir(os.path.join(top_dir, "data")): - if jfile.find(vpu) >= 0: - gpkg = Path(top_dir, "data", jfile) + for jfile in os.listdir(CACHE_DIR): + if jfile.find(f"nextgen_{vpu}.gpkg") >= 0: + gpkg = Path(CACHE_DIR, jfile) print(f"Found and using geopackge file {gpkg}") if gpkg == None: - gpkg = f"nextgen_{vpu}.gpkg" - url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/{gpkg}" + url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/nextgen_{vpu}.gpkg" command = f"wget -P {CACHE_DIR} -c {url}" wget(command, url) - local_gpkg = Path(top_dir, "data",gpkg) + gpkg = Path(CACHE_DIR, f"nextgen_{vpu}.gpkg") - print(f"Opening {local_gpkg}...") - polygonfile = gpd.read_file(local_gpkg, layer="divides") + print(f"Opening {gpkg}...") + polygonfile = gpd.read_file(gpkg, layer="divides") ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) src = ds["RAINRATE"] @@ -469,7 +468,7 @@ def main(): "SPFH_2maboveground", "DSWRF_surface", ] - + fd2 = get_forcing_dict_JL( wgt_file, forcing_files, @@ -477,6 +476,7 @@ def main(): var_list_out, ) + print(f'Writting data!') # Write CSVs to file t0 = time.perf_counter() write_int = 100 From f7592a08b67c86387d766a57fcfe0214d4980f79 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Fri, 5 May 2023 14:29:21 -0500 Subject: [PATCH 030/105] Added remote indexing. Moved cache into forcing. --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 214 ++++++++++++-------- ngen_forcing/user_input_ngen.json | 10 +- ngen_forcing/user_input_ngen.md | 13 +- 3 files changed, 139 insertions(+), 98 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 6ed8115..db76a73 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -108,7 +108,7 @@ def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: """ # TODO: Check to see if this does any better than kerchunk # the caching should help, but probably needs to be managed to function asynchronously. - # Perhaps if the files is not cached, we can create the dataset from + # Perhaps if theget_dataset files is not cached, we can create the dataset from # kerchunk with a remote path and then asynchronously do a download to cache it # for next time. The hypothesis would be that the download speed will not be any slower than # just accessing the file remotely. @@ -229,21 +229,36 @@ def calc_zonal_stats_weights_new( return mean_dict -def get_forcing_dict_JL(wgt_file, filelist, var_list, var_list_out): +def get_forcing_dict_JL( + wgt_file : str, + local_filelist : list, + remote_filelist : list, + var_list : list , + var_list_out : list, + ii_cache : bool + ): + t1 = time.perf_counter() + nlocal = len(local_filelist) + full_list = local_filelist + remote_filelist df_by_t = [] - for _i, _nc_file in enumerate(filelist): - with xr.open_dataset(_nc_file) as _xds: + # NOTE this scheme uses the same algorithm for remote and local processing. 
This may not be desireable + if ii_cache: + eng = 'h5netcdf' + for _i, _nc_file in enumerate(full_list): + if _i == nlocal: eng = 'rasterio' # switch engine for remote processing + with xr.open_dataset(_nc_file,engine=eng) as _xds: shp = _xds["U2D"].shape + dtp = _xds["U2D"].dtype data_allvars = np.zeros( - shape=(len(var_list), shp[1], shp[2]), dtype=_xds["U2D"].dtype - ) + shape=(len(var_list), shp[1], shp[2]), dtype=dtp + ) for var_dx, jvar in enumerate(var_list): data_allvars[var_dx, :, :] = np.squeeze(_xds[jvar].values) _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, wgt_file) df_by_t.append(_df_zonal_stats) print( - f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}%", + f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(full_list)}, {(_i+1)/len(full_list)*100:.2f}% {time.perf_counter() - t1:.2f}s elapsed", end="\r", ) @@ -262,18 +277,78 @@ def get_forcing_dict_JL(wgt_file, filelist, var_list, var_list_out): return dfs -def wget(cmd, name, semaphore=None): +def threaded_cmd(cmd, semaphore=None): + """ + Execute many system commands using python threading. Semaphore is set outside this function + """ if not semaphore == None: semaphore.acquire() resp = os.system(cmd) if resp > 0: - raise Exception(f"\nwget failed! Tried: {name}\n") - else: - print(f"Successful download of {name}") - + raise Exception(f"\Threaded command failed! Tried: {cmd}\n") if not semaphore == None: semaphore.release() +def locate_dl_files_threaded( + ii_cache: bool, + ii_verbose : bool, + forcing_file_names : list, + dl_threads : int +): + """ + Look for forcing files locally, if found, will apend to local file list for local processing + If not found and if we do not wish to cache, append to remote files for remote processing + If not found and if we do wish to cache, append to local file list for local processing and perform a threaded download + """ + + local_files = [] + remote_files = [] + dl_files = [] + cmds = [] + for jfile in forcing_file_names: + if ii_verbose: + print(f"Looking for {jfile}") + file_parts = Path(jfile).parts + + local_file = os.path.join(CACHE_DIR, file_parts[-1]) + + # decide whether to use local file, download it, or index it remotely + if os.path.exists(local_file): + # If the file exists local, get data from this file regardless of ii_cache option + if ii_verbose and ii_cache: + print(f"Found and using local raw forcing file {local_file}") + elif ii_verbose and not ii_cache: + print(f"CACHE OPTION OVERRIDE : Found and using local raw forcing file {local_file}") + local_files.append(local_file) + elif not os.path.exists(local_file) and not ii_cache: + # If file is not found locally, and we don't want to cache it, append to remote file list + remote_files.append(jfile) + elif not os.path.exists(local_file) and ii_cache: + # Download file + if ii_verbose: + print(f"Forcing file not found! 
Downloading {jfile}") + command = f"wget -P {CACHE_DIR} -c {jfile}" + cmds.append(command) + dl_files.append(jfile) + local_files.append(local_file) + + # Do threaded download if we have any files to download + n_files = len(dl_files) + if n_files > 0: + t0 = time.perf_counter() + threads = [] + semaphore = threading.Semaphore(dl_threads) + for i, jcmd in enumerate(cmds): + t = threading.Thread(target=threaded_cmd, args=[jcmd, semaphore]) + t.start() + threads.append(t) + + for jt in threads: + jt.join() + + print(f"Time to download {n_files} files {time.perf_counter() - t0}") + + return local_files, remote_files def main(): """ @@ -299,15 +374,16 @@ def main(): conf = json.load(open(args.infile)) start_date = conf["forcing"]["start_date"] end_date = conf["forcing"]["end_date"] - if "nwm_files" in conf["forcing"]: - nwm_files = conf["forcing"]["nwm_files"] + if "nwm_file" in conf["forcing"]: + nwm_file = conf["forcing"]["nwm_file"] else: - nwm_files = "" + nwm_file = "" runinput = conf["forcing"]["runinput"] varinput = conf["forcing"]["varinput"] geoinput = conf["forcing"]["geoinput"] meminput = conf["forcing"]["meminput"] urlbaseinput = conf["forcing"]["urlbaseinput"] + ii_cache = conf["forcing"]["cache"] version = conf["hydrofab"]["version"] vpu = conf["hydrofab"]["vpu"] ii_verbose = conf["verbose"] @@ -315,9 +391,7 @@ def main(): bucket_name = conf["bucket_name"] file_prefix = conf["file_prefix"] file_type = conf["file_type"] - ii_cache = conf["cache"] - if ii_cache: - dl_threads = conf["dl_threads"] + dl_threads = conf["dl_threads"] file_types = ["csv", "parquet"] assert ( @@ -349,73 +423,6 @@ def main(): elif bucket_type == "S3": s3 = boto3.client("s3") - # Get nwm forcing file names - if len(nwm_files) == 0: - print(f"Creating list of file names to pull...") - # n = 6 - # fcst_cycle = [n*x for x in range(24//n)] - # lead_time = [x+1 for x in range(n)] - fcst_cycle = [0] - - nwm_forcing_files = create_file_list( - runinput, - varinput, - geoinput, - meminput, - start_date, - end_date, - fcst_cycle, - urlbaseinput, - ) - else: - print(f"Reading list of file names from {nwm_files}...") - nwm_forcing_files = [] - with open(nwm_files, "r") as f: - for line in f: - nwm_forcing_files.append(line) - - # Download whole files and store locally if cache is true, - # otherwise index remotely and save catchment based forcings - t0 = time.perf_counter() - if ii_cache: - # Check to see if we have files cached, if not wget them - local_files = [] - cmds = [] - fls = [] - for jfile in nwm_forcing_files: - if ii_verbose: - print(f"Looking for {jfile}") - file_parts = Path(jfile).parts - - local_file = os.path.join(CACHE_DIR, file_parts[-1]) - local_files.append(local_file) - if os.path.exists(local_file): - if ii_verbose: - print(f"Found and using raw forcing file {local_file}") - continue - else: - if ii_verbose: - print(f"Forcing file not found! 
Downloading {jfile}") - command = f"wget -P {CACHE_DIR} -c {jfile}" - cmds.append(command) - fls.append(jfile) - - threads = [] - semaphore = threading.Semaphore(dl_threads) - for i, jcmd in enumerate(cmds): - t = threading.Thread(target=wget, args=[jcmd, fls[i], semaphore]) - t.start() - threads.append(t) - - for jt in threads: - jt.join() - - forcing_files = local_files # interacting with files locally - else: - forcing_files = nwm_forcing_files # interacting with files remotely - - print(f"Time to download files {time.perf_counter() - t0}") - # Generate weight file only if one doesn't exist already # Very time consuming so we don't want to do this if we can avoid it wgt_file = os.path.join(CACHE_DIR, "weights.json") @@ -429,7 +436,7 @@ def main(): if gpkg == None: url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/nextgen_{vpu}.gpkg" command = f"wget -P {CACHE_DIR} -c {url}" - wget(command, url) + threaded_cmd(command, url) gpkg = Path(CACHE_DIR, f"nextgen_{vpu}.gpkg") print(f"Opening {gpkg}...") @@ -445,7 +452,32 @@ def main(): else: print( f"Not creating weight file! Delete this if you want to create a new one: {wgt_file}" + ) + + # Get nwm forcing file names + if len(nwm_file) == 0: + print(f"Creating list of file names to locate...") + fcst_cycle = [0] + + nwm_forcing_files = create_file_list( + runinput, + varinput, + geoinput, + meminput, + start_date, + end_date, + fcst_cycle, + urlbaseinput, ) + else: + print(f"Reading list of file names from {nwm_file}...") + nwm_forcing_files = [] + with open(nwm_file, "r") as f: + for line in f: + nwm_forcing_files.append(line) + + # This will look for local raw forcing files and download them if needed + local_nwm_files, remote_nwm_files = locate_dl_files_threaded(ii_cache,ii_verbose,nwm_forcing_files,dl_threads) var_list = [ "U2D", @@ -468,13 +500,21 @@ def main(): "SPFH_2maboveground", "DSWRF_surface", ] + + # Considering possible memory constraints in this operation, + # let's loop though a certain number of files, write them out, and go back for more + t0 = time.perf_counter() + local_nwm_files = [] fd2 = get_forcing_dict_JL( wgt_file, - forcing_files, + local_nwm_files, + remote_nwm_files[:5], var_list, var_list_out, + ii_cache ) + print(f'Time to create forcing dictionary {time.perf_counter() - t0}') print(f'Writting data!') # Write CSVs to file diff --git a/ngen_forcing/user_input_ngen.json b/ngen_forcing/user_input_ngen.json index a72ffe2..11b5962 100644 --- a/ngen_forcing/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -2,12 +2,13 @@ "forcing" : { "start_date" : "20220822", "end_date" : "20220822", - "nwm_files" : "", + "nwm_file" : "", "runinput" : 1, "varinput" : 5, "geoinput" : 1, "meminput" : 0, - "urlbaseinput" : 3 + "urlbaseinput" : 3, + "cache" : false }, "hydrofab" : { @@ -16,11 +17,10 @@ }, "verbose" : true, - "bucket_type" : "S3", - "bucket_name" : "ciroh-devconf", + "bucket_type" : "local", + "bucket_name" : "out_data_CIROH", "file_prefix" : "data/", "file_type" : "csv", - "cache" : true, "dl_threads" : 10 } diff --git a/ngen_forcing/user_input_ngen.md b/ngen_forcing/user_input_ngen.md index dce7042..033fc5b 100644 --- a/ngen_forcing/user_input_ngen.md +++ b/ngen_forcing/user_input_ngen.md @@ -9,12 +9,13 @@ contents: "forcing" : { "start_date" : "20220822", "end_date" : "20220822", - "nwm_files" : "", + "nwm_file" : "", "runinput" : 1, "varinput" : 5, "geoinput" : 1, "meminput" : 0, - "urlbaseinput" : 3 + "urlbaseinput" : 3, + "cache" : false }, "hydrofab" : { @@ -23,15 +24,15 @@ contents: }, 
"verbose" : true, - "bucket_type" : "S3", - "bucket_name" : "ciroh-devconf", + "bucket_type" : "local", + "bucket_name" : "out_data_CIROH", "file_prefix" : "data/", "file_type" : "csv", - "cache" : true, "dl_threads" : 10 } + ### forcing | Field Name | Data Type | Description | @@ -44,6 +45,7 @@ contents: | geoinput | `int` |
    1. conus: for continental US <br> 2. hawaii: for Hawaii <br> 3. puertorico: for Puerto Rico
    | | meminput | `int` |
    1. mem_1 <br> 2. mem_2 <br> 3. mem_3 <br> 4. mem_4 <br> 5. mem_5 <br> 6. mem_6 <br> 7. mem_7
    | | urlbaseinput | `int` |
    1. Empty string: use local files <br> 2. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod/: for real-time operational data from NOAA <br> 3. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/post-processed/WMS/: for post-processed data from NOAA's Web Map Service <br> 4. https://storage.googleapis.com/national-water-model/: for input/output data stored on Google Cloud Storage <br> 5. https://storage.cloud.google.com/national-water-model/: for input/output data stored on Google Cloud Storage <br> 6. gs://national-water-model/: for input/output data stored on Google Cloud Storage <br> 7. https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/: for retrospective data from AWS S3 <br> 8. s3://noaa-nwm-retrospective-2-1-pds/model_output/: for retrospective data from AWS S3
    | +| cache | `bool` |
  • true: Store forcing files locally. Must specify dl_threads <br> • false: Interact with forcing files remotely
  • | ### hydrofab | Field Name | Data Type | Description | @@ -59,5 +61,4 @@ contents: | bucket_name | `string` | If local, this is the name of the folder the data will be placed in. If S3, this is the name of S3 bucket, which must exist already. | | file_prefix | `string` | If local, this is the relative path to the bucket_name folder. If S3, this is the relative path within the S3 bucket_name bucket to store files | | file_type | `string` |
    1. "csv" : write data as csv files/
    2. "parquet" : write data as parquet files
    | -| cache | `bool` |
  • true: Store forcing files locally. Must specify dl_threads <br> • false: Interact with forcing files remotely
  • | | dl_threads | `int` | Number of threads to use while downloading. | From 093320ec7cea14e5ce5b8a51c9447a93131dcb89 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Fri, 5 May 2023 14:37:48 -0500 Subject: [PATCH 031/105] Removed indexing (bug) --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index db76a73..f354595 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -509,7 +509,7 @@ def main(): fd2 = get_forcing_dict_JL( wgt_file, local_nwm_files, - remote_nwm_files[:5], + remote_nwm_files, var_list, var_list_out, ii_cache From 37488b58bc324dd58802531225f91c743d46fc1d Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Fri, 5 May 2023 14:39:00 -0500 Subject: [PATCH 032/105] Removed bug #12412 --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index f354595..d933412 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -504,7 +504,6 @@ def main(): # Considering possible memory constraints in this operation, # let's loop though a certain number of files, write them out, and go back for more t0 = time.perf_counter() - local_nwm_files = [] fd2 = get_forcing_dict_JL( wgt_file, From 125408645977194cbea33ec6e8c974a45b743051 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Fri, 5 May 2023 14:42:46 -0500 Subject: [PATCH 033/105] Removed bug #12413 --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index d933412..d9d8f49 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -243,7 +243,7 @@ def get_forcing_dict_JL( full_list = local_filelist + remote_filelist df_by_t = [] # NOTE this scheme uses the same algorithm for remote and local processing. This may not be desireable - if ii_cache: + if nlocal > 0: eng = 'h5netcdf' for _i, _nc_file in enumerate(full_list): if _i == nlocal: eng = 'rasterio' # switch engine for remote processing From 53ac8d5f89812ae8a94c6ff403307972d2eb5d04 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Fri, 5 May 2023 15:03:39 -0500 Subject: [PATCH 034/105] blacked and print statements fixes --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 175 ++++++++++---------- 1 file changed, 85 insertions(+), 90 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index d9d8f49..8b3bac0 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -230,29 +230,27 @@ def calc_zonal_stats_weights_new( def get_forcing_dict_JL( - wgt_file : str, - local_filelist : list, - remote_filelist : list, - var_list : list , - var_list_out : list, - ii_cache : bool - ): - + wgt_file: str, + local_filelist: list, + remote_filelist: list, + var_list: list, + var_list_out: list, + ii_cache: bool, +): t1 = time.perf_counter() nlocal = len(local_filelist) full_list = local_filelist + remote_filelist df_by_t = [] # NOTE this scheme uses the same algorithm for remote and local processing. 
This may not be desireable if nlocal > 0: - eng = 'h5netcdf' + eng = "h5netcdf" for _i, _nc_file in enumerate(full_list): - if _i == nlocal: eng = 'rasterio' # switch engine for remote processing - with xr.open_dataset(_nc_file,engine=eng) as _xds: + if _i == nlocal: + eng = "rasterio" # switch engine for remote processing + with xr.open_dataset(_nc_file, engine=eng) as _xds: shp = _xds["U2D"].shape dtp = _xds["U2D"].dtype - data_allvars = np.zeros( - shape=(len(var_list), shp[1], shp[2]), dtype=dtp - ) + data_allvars = np.zeros(shape=(len(var_list), shp[1], shp[2]), dtype=dtp) for var_dx, jvar in enumerate(var_list): data_allvars[var_dx, :, :] = np.squeeze(_xds[jvar].values) _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, wgt_file) @@ -262,7 +260,7 @@ def get_forcing_dict_JL( end="\r", ) - print(f"Reformating and converting data into dataframe") + print(f"\nReformating and converting data into dataframe") dfs = {} for jcat in list(df_by_t[0].keys()): data_catch = [] @@ -289,71 +287,71 @@ def threaded_cmd(cmd, semaphore=None): if not semaphore == None: semaphore.release() + def locate_dl_files_threaded( - ii_cache: bool, - ii_verbose : bool, - forcing_file_names : list, - dl_threads : int + ii_cache: bool, ii_verbose: bool, forcing_file_names: list, dl_threads: int ): - """ - Look for forcing files locally, if found, will apend to local file list for local processing - If not found and if we do not wish to cache, append to remote files for remote processing - If not found and if we do wish to cache, append to local file list for local processing and perform a threaded download - """ - - local_files = [] - remote_files = [] - dl_files = [] - cmds = [] - for jfile in forcing_file_names: + """ + Look for forcing files locally, if found, will apend to local file list for local processing + If not found and if we do not wish to cache, append to remote files for remote processing + If not found and if we do wish to cache, append to local file list for local processing and perform a threaded download + """ + + local_files = [] + remote_files = [] + dl_files = [] + cmds = [] + for jfile in forcing_file_names: + if ii_verbose: + print(f"Looking for {jfile}") + file_parts = Path(jfile).parts + + local_file = os.path.join(CACHE_DIR, file_parts[-1]) + + # decide whether to use local file, download it, or index it remotely + if os.path.exists(local_file): + # If the file exists local, get data from this file regardless of ii_cache option + if ii_verbose and ii_cache: + print(f"Found and using local raw forcing file {local_file}") + elif ii_verbose and not ii_cache: + print( + f"CACHE OPTION OVERRIDE : Found and using local raw forcing file {local_file}" + ) + local_files.append(local_file) + elif not os.path.exists(local_file) and not ii_cache: + # If file is not found locally, and we don't want to cache it, append to remote file list + remote_files.append(jfile) + elif not os.path.exists(local_file) and ii_cache: + # Download file if ii_verbose: - print(f"Looking for {jfile}") - file_parts = Path(jfile).parts - - local_file = os.path.join(CACHE_DIR, file_parts[-1]) - - # decide whether to use local file, download it, or index it remotely - if os.path.exists(local_file): - # If the file exists local, get data from this file regardless of ii_cache option - if ii_verbose and ii_cache: - print(f"Found and using local raw forcing file {local_file}") - elif ii_verbose and not ii_cache: - print(f"CACHE OPTION OVERRIDE : Found and using local raw forcing file {local_file}") - 
local_files.append(local_file) - elif not os.path.exists(local_file) and not ii_cache: - # If file is not found locally, and we don't want to cache it, append to remote file list - remote_files.append(jfile) - elif not os.path.exists(local_file) and ii_cache: - # Download file - if ii_verbose: - print(f"Forcing file not found! Downloading {jfile}") - command = f"wget -P {CACHE_DIR} -c {jfile}" - cmds.append(command) - dl_files.append(jfile) - local_files.append(local_file) - - # Do threaded download if we have any files to download - n_files = len(dl_files) - if n_files > 0: - t0 = time.perf_counter() - threads = [] - semaphore = threading.Semaphore(dl_threads) - for i, jcmd in enumerate(cmds): - t = threading.Thread(target=threaded_cmd, args=[jcmd, semaphore]) - t.start() - threads.append(t) - - for jt in threads: - jt.join() - - print(f"Time to download {n_files} files {time.perf_counter() - t0}") - - return local_files, remote_files + print(f"Forcing file not found! Downloading {jfile}") + command = f"wget -P {CACHE_DIR} -c {jfile}" + cmds.append(command) + dl_files.append(jfile) + local_files.append(local_file) + + # Do threaded download if we have any files to download + n_files = len(dl_files) + if n_files > 0: + t0 = time.perf_counter() + threads = [] + semaphore = threading.Semaphore(dl_threads) + for i, jcmd in enumerate(cmds): + t = threading.Thread(target=threaded_cmd, args=[jcmd, semaphore]) + t.start() + threads.append(t) + + for jt in threads: + jt.join() + + print(f"Time to download {n_files} files {time.perf_counter() - t0}") + + return local_files, remote_files + def main(): """ - Primary function to retrieve hydrofabrics data and convert it into files that can be ingested into ngen. - Also, the forcing data is retrieved. + Primary function to retrieve forcing and hydrofabric data and convert it into files that can be ingested into ngen. Inputs: JSON config file specifying start_date, end_date, and vpu @@ -384,7 +382,7 @@ def main(): meminput = conf["forcing"]["meminput"] urlbaseinput = conf["forcing"]["urlbaseinput"] ii_cache = conf["forcing"]["cache"] - version = conf["hydrofab"]["version"] + version = conf["hydrofab"]["version"] vpu = conf["hydrofab"]["vpu"] ii_verbose = conf["verbose"] bucket_type = conf["bucket_type"] @@ -448,11 +446,11 @@ def main(): print("Generating weights") t1 = time.perf_counter() generate_weights_file(polygonfile, src, wgt_file, crosswalk_dict_key="id") - print(f"Generating the weights took {time.perf_counter() - t1:.2f} s") + print(f"\nGenerating the weights took {time.perf_counter() - t1:.2f} s") else: print( f"Not creating weight file! 
Delete this if you want to create a new one: {wgt_file}" - ) + ) # Get nwm forcing file names if len(nwm_file) == 0: @@ -474,10 +472,12 @@ def main(): nwm_forcing_files = [] with open(nwm_file, "r") as f: for line in f: - nwm_forcing_files.append(line) + nwm_forcing_files.append(line) # This will look for local raw forcing files and download them if needed - local_nwm_files, remote_nwm_files = locate_dl_files_threaded(ii_cache,ii_verbose,nwm_forcing_files,dl_threads) + local_nwm_files, remote_nwm_files = locate_dl_files_threaded( + ii_cache, ii_verbose, nwm_forcing_files, dl_threads + ) var_list = [ "U2D", @@ -501,21 +501,16 @@ def main(): "DSWRF_surface", ] - # Considering possible memory constraints in this operation, + # TODO: Considering possible memory constraints in this operation, # let's loop though a certain number of files, write them out, and go back for more t0 = time.perf_counter() - + fd2 = get_forcing_dict_JL( - wgt_file, - local_nwm_files, - remote_nwm_files, - var_list, - var_list_out, - ii_cache + wgt_file, local_nwm_files, remote_nwm_files, var_list, var_list_out, ii_cache ) - print(f'Time to create forcing dictionary {time.perf_counter() - t0}') + print(f"Time to create forcing dictionary {time.perf_counter() - t0}") - print(f'Writting data!') + print(f"Writing data!") # Write CSVs to file t0 = time.perf_counter() write_int = 100 @@ -548,7 +543,7 @@ def main(): end="\r", ) - print(f"{file_type} write took {time.perf_counter() - t0:.2f} s\n") + print(f"\n{file_type} write took {time.perf_counter() - t0:.2f} s\n") print( f"\n\nDone! Catchment forcing files have been generated for VPU {vpu} in {bucket_type}\n\n" From 4aeca19042888811a31865d895f3eeaa403d7dc9 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 8 May 2023 09:12:14 -0500 Subject: [PATCH 035/105] moved file names template --- ngen_forcing/filenames.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 ngen_forcing/filenames.txt diff --git a/ngen_forcing/filenames.txt b/ngen_forcing/filenames.txt new file mode 100644 index 0000000..87440af --- /dev/null +++ b/ngen_forcing/filenames.txt @@ -0,0 +1,3 @@ +https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f006.conus.nc +https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f007.conus.nc +https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f008.conus.nc \ No newline at end of file From 3fcc1fc2a4e16ac5386b00a0c2c362549443e8dc Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 8 May 2023 14:54:18 -0500 Subject: [PATCH 036/105] fixed threading, organized config --- filenames.txt | 3 - ngen_forcing/prep_hydrofab_forcings_ngen.py | 300 ++++++++++++++------ ngen_forcing/user_input_ngen.json | 27 +- ngen_forcing/user_input_ngen.md | 53 ++-- 4 files changed, 254 insertions(+), 129 deletions(-) delete mode 100644 filenames.txt diff --git a/filenames.txt b/filenames.txt deleted file mode 100644 index 87440af..0000000 --- a/filenames.txt +++ /dev/null @@ -1,3 +0,0 @@ -https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f006.conus.nc -https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f007.conus.nc -https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f008.conus.nc \ No newline at end 
of file diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 8b3bac0..e1ee25c 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -229,24 +229,38 @@ def calc_zonal_stats_weights_new( return mean_dict -def get_forcing_dict_JL( +def get_forcing_timelist( wgt_file: str, - local_filelist: list, - remote_filelist: list, + filelist: list, var_list: list, - var_list_out: list, - ii_cache: bool, + jt = None, + out = None, ): + """ + General function to read either remote or local nwm forcing files. + + Inputs: + wgt_file: a path to the weights json, + filelist: list of filenames (urls for remote, local paths otherwise), + var_list: list (list of variable names to read), + jt: the index to place the file. This is used to ensure elements increase in time, regardless of thread number, + out: a list (in time) of forcing data, (THIS IS A THREADING OUTPUT) + + Outputs: + df_by_t : (returned for local files) a list (in time) of forcing data. Note that this list may not be consistent in time + OR + out : (returned for remote files) a list (in time) of forcing data. + Each thread will write into this list such that time increases, but may not be consistent + + """ + t1 = time.perf_counter() - nlocal = len(local_filelist) - full_list = local_filelist + remote_filelist - df_by_t = [] - # NOTE this scheme uses the same algorithm for remote and local processing. This may not be desireable - if nlocal > 0: - eng = "h5netcdf" - for _i, _nc_file in enumerate(full_list): - if _i == nlocal: + df_by_t = [] + for _i, _nc_file in enumerate(filelist): + if _nc_file[:5] == 'https': eng = "rasterio" # switch engine for remote processing + else: + eng = "h5netcdf" with xr.open_dataset(_nc_file, engine=eng) as _xds: shp = _xds["U2D"].shape dtp = _xds["U2D"].dtype @@ -255,46 +269,70 @@ def get_forcing_dict_JL( data_allvars[var_dx, :, :] = np.squeeze(_xds[jvar].values) _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, wgt_file) df_by_t.append(_df_zonal_stats) - print( - f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(full_list)}, {(_i+1)/len(full_list)*100:.2f}% {time.perf_counter() - t1:.2f}s elapsed", - end="\r", - ) - print(f"\nReformating and converting data into dataframe") + if jt == None: + print( + f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}% {time.perf_counter() - t1:.2f}s elapsed", + end="\r", + ) + + if not jt == None: + out[jt] = df_by_t + + return df_by_t + +def time2catchment(time_list, var_list_out): + """ + Convert a list of catchment dictionaries into a single dictionary of dataframes for each catchment + + Inputs: + time_list : a list returned by get_forcing_timelist. It is assumed this list is consistent in time. + var_list_out : a list of clomun headers for the dataframes + + Outputs: + dfs : a dictionary of catchment based dataframes + + """ + dfs = {} - for jcat in list(df_by_t[0].keys()): + for jcat in list(time_list[0].keys()): data_catch = [] - for jt in range(len(df_by_t)): - data_catch.append(df_by_t[jt][jcat]) + for jt in range(len(time_list)): + data_catch.append(time_list[jt][jcat]) dfs[jcat] = pd.DataFrame(data_catch, columns=var_list_out) - print( - f"Indexing data and generating the dataframes (JL) {time.perf_counter() - t1:.2f}s" - ) - return dfs - -def threaded_cmd(cmd, semaphore=None): +def cmd(cmd,out=None): """ - Execute many system commands using python threading. 
Semaphore is set outside this function + Execute system commands + + Inputs + cmd : the command to execute + """ - if not semaphore == None: - semaphore.acquire() resp = os.system(cmd) if resp > 0: raise Exception(f"\Threaded command failed! Tried: {cmd}\n") - if not semaphore == None: - semaphore.release() def locate_dl_files_threaded( - ii_cache: bool, ii_verbose: bool, forcing_file_names: list, dl_threads: int + ii_cache: bool, ii_verbose: bool, forcing_file_names: list, nthreads: int ): """ Look for forcing files locally, if found, will apend to local file list for local processing If not found and if we do not wish to cache, append to remote files for remote processing If not found and if we do wish to cache, append to local file list for local processing and perform a threaded download + + Inputs: + ii_cache : user-defined caching bool + ii_verbose : user-defined verbosity bool + forcing_file_names : a list of forcing files names + nthreads : user-defined maximum number of threads + + Outputs: + local_files : list of paths to the local files. Note that even if ii_cache if false, if a file is found locally, it will be used. + remote_files : list of urls to the remote files. """ local_files = [] @@ -302,10 +340,7 @@ def locate_dl_files_threaded( dl_files = [] cmds = [] for jfile in forcing_file_names: - if ii_verbose: - print(f"Looking for {jfile}") file_parts = Path(jfile).parts - local_file = os.path.join(CACHE_DIR, file_parts[-1]) # decide whether to use local file, download it, or index it remotely @@ -330,24 +365,53 @@ def locate_dl_files_threaded( dl_files.append(jfile) local_files.append(local_file) - # Do threaded download if we have any files to download - n_files = len(dl_files) - if n_files > 0: - t0 = time.perf_counter() - threads = [] - semaphore = threading.Semaphore(dl_threads) + if len(cmds) > 0: + args = [] for i, jcmd in enumerate(cmds): - t = threading.Thread(target=threaded_cmd, args=[jcmd, semaphore]) - t.start() - threads.append(t) + args.append([jcmd]) + out = threaded_fun(cmd,nthreads,args) - for jt in threads: - jt.join() + return local_files, remote_files - print(f"Time to download {n_files} files {time.perf_counter() - t0}") +def threaded_fun(fun, + nthreads : int, + args : list): + + """ + Threaded function call + """ + threads = [] + out = [None for x in range(len(args))] + for i in range(len(args)): + + if i >= nthreads: # Assign new jobs as threads finish + k = 0 + while True: + jj = k % nthreads + jthread = threads[jj] + if jthread.is_alive(): + k += 1 + time.sleep(0.25) + else: + t = threading.Thread(target=fun, args= [*args[i], out]) + t.start() + threads[jj] = t + break + else: # Initial set of threads + t = threading.Thread(target=fun, args=[*args[i], out]) + t.start() + threads.append(t) - return local_files, remote_files + # Ensure all threads are finished + done = 0 + while done < len(threads): + done = 0 + for jthread in threads: + if not jthread.is_alive(): + done += 1 + time.sleep(0.25) + return out def main(): """ @@ -362,13 +426,14 @@ def main(): t00 = time.perf_counter() + # Take in user config parser = argparse.ArgumentParser() parser.add_argument( dest="infile", type=str, help="A json containing user inputs to run ngen" ) args = parser.parse_args() - - # Take in user config + + # Extract configurations conf = json.load(open(args.infile)) start_date = conf["forcing"]["start_date"] end_date = conf["forcing"]["end_date"] @@ -384,26 +449,29 @@ def main(): ii_cache = conf["forcing"]["cache"] version = conf["hydrofab"]["version"] vpu = 
conf["hydrofab"]["vpu"] - ii_verbose = conf["verbose"] - bucket_type = conf["bucket_type"] - bucket_name = conf["bucket_name"] - file_prefix = conf["file_prefix"] - file_type = conf["file_type"] - dl_threads = conf["dl_threads"] + bucket_type = conf["storage"]["bucket_type"] + bucket_name = conf["storage"]["bucket_name"] + file_prefix = conf["storage"]["file_prefix"] + file_type = conf["storage"]["file_type"] + ii_verbose = conf["run"]["verbose"] + nthreads = conf["run"]["nthreads"] + print(f'\nWelcome to Preparing Data for NextGen-Based Simulations!\n') + if not ii_verbose: print(f'Generating files now! This may take a few moments...') + + dl_time = 0 + proc_time = 0 + + # configuration validation file_types = ["csv", "parquet"] assert ( file_type in file_types ), f"{file_type} for file_type is not accepted! Accepted: {file_types}" - bucket_types = ["local", "S3"] assert ( bucket_type in bucket_types ), f"{bucket_type} for bucket_type is not accepted! Accepted: {bucket_types}" - # TODO: Subsetting! - # - # Set paths and make directories if needed top_dir = Path(os.path.dirname(args.infile)).parent if not os.path.exists(CACHE_DIR): @@ -430,31 +498,36 @@ def main(): for jfile in os.listdir(CACHE_DIR): if jfile.find(f"nextgen_{vpu}.gpkg") >= 0: gpkg = Path(CACHE_DIR, jfile) - print(f"Found and using geopackge file {gpkg}") + if ii_verbose:print(f"Found and using geopackge file {gpkg}") if gpkg == None: url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/nextgen_{vpu}.gpkg" command = f"wget -P {CACHE_DIR} -c {url}" - threaded_cmd(command, url) + t0 = time.perf_counter() + cmd(command) + dl_time += time.perf_counter() - t0 gpkg = Path(CACHE_DIR, f"nextgen_{vpu}.gpkg") - print(f"Opening {gpkg}...") + if ii_verbose:print(f"Opening {gpkg}...") + t0 = time.perf_counter() polygonfile = gpd.read_file(gpkg, layer="divides") ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) src = ds["RAINRATE"] - print("Generating weights") + if ii_verbose:print("Generating weights") t1 = time.perf_counter() generate_weights_file(polygonfile, src, wgt_file, crosswalk_dict_key="id") - print(f"\nGenerating the weights took {time.perf_counter() - t1:.2f} s") + if ii_verbose:print(f"\nGenerating the weights took {time.perf_counter() - t1:.2f} s") + proc_time +=time.perf_counter() - t0 else: - print( - f"Not creating weight file! Delete this if you want to create a new one: {wgt_file}" - ) + if ii_verbose: + print( + f"Not creating weight file! 
Delete this if you want to create a new one: {wgt_file}" + ) # Get nwm forcing file names + t0 = time.perf_counter() if len(nwm_file) == 0: - print(f"Creating list of file names to locate...") fcst_cycle = [0] nwm_forcing_files = create_file_list( @@ -468,16 +541,23 @@ def main(): urlbaseinput, ) else: - print(f"Reading list of file names from {nwm_file}...") nwm_forcing_files = [] with open(nwm_file, "r") as f: for line in f: nwm_forcing_files.append(line) + if ii_verbose: + print(f'Raw file names:') + for jfile in nwm_forcing_files: + print(f'{jfile}') + + proc_time += time.perf_counter() - t0 # This will look for local raw forcing files and download them if needed + t0 = time.perf_counter() local_nwm_files, remote_nwm_files = locate_dl_files_threaded( - ii_cache, ii_verbose, nwm_forcing_files, dl_threads + ii_cache, ii_verbose, nwm_forcing_files, nthreads ) + dl_time += time.perf_counter() - t0 var_list = [ "U2D", @@ -500,22 +580,46 @@ def main(): "SPFH_2maboveground", "DSWRF_surface", ] - - # TODO: Considering possible memory constraints in this operation, - # let's loop though a certain number of files, write them out, and go back for more + t0 = time.perf_counter() - fd2 = get_forcing_dict_JL( - wgt_file, local_nwm_files, remote_nwm_files, var_list, var_list_out, ii_cache - ) - print(f"Time to create forcing dictionary {time.perf_counter() - t0}") - - print(f"Writing data!") - # Write CSVs to file + # Index remote files with threads + if len(remote_nwm_files) > 0: + args = [] + for i in range(len(remote_nwm_files)): + if ii_verbose: print(f'Doing a threaded remote data retrieval for file {remote_nwm_files[i]}') + args.append([wgt_file, [remote_nwm_files[i]], var_list, i]) + out = threaded_fun(get_forcing_timelist,nthreads,args) + + # If we have any local files, index locally serially + if len(local_nwm_files) > 0: + time_list = get_forcing_timelist( + wgt_file, local_nwm_files, var_list + ) + + # Sync in time between remote and local files + complete_timelist = [] + for i, ifile in enumerate(nwm_forcing_files): + filename = Path(ifile).parts[-1] + for j, jfile in enumerate(local_nwm_files): + if jfile.find(filename) >= 0: + complete_timelist.append(time_list[j]) + for j, jfile in enumerate(remote_nwm_files): + if jfile.find(filename) >= 0: + complete_timelist.append(out[j][0]) + + # Convert time-synced list of catchment dictionaries + # to catchment based dataframes + dfs = time2catchment(complete_timelist, var_list_out) + proc_time = time.perf_counter() - t0 + + # Write to file + if ii_verbose: print(f"Writing data!") t0 = time.perf_counter() - write_int = 100 - for j, jcatch in enumerate(fd2.keys()): - df = fd2[jcatch] + nfiles = len(dfs) + write_int = 1000 + for j, jcatch in enumerate(dfs.keys()): + df = dfs[jcatch] splt = jcatch.split("-") if bucket_type == "local": @@ -539,17 +643,27 @@ def main(): if (j + 1) % write_int == 0: print( - f"{j+1} files written out of {len(fd2)}, {(j+1)/len(fd2)*100:.2f}%", + f"{j+1} files written out of {len(dfs)}, {(j+1)/len(dfs)*100:.2f}%", end="\r", ) + if j == nfiles-1: + print( + f"{j+1} files written out of {len(dfs)}, {(j+1)/len(dfs)*100:.2f}%", + end="\r", + ) + write_time = time.perf_counter() - t0 + total_time = time.perf_counter() - t00 - print(f"\n{file_type} write took {time.perf_counter() - t0:.2f} s\n") - - print( - f"\n\nDone! 
Catchment forcing files have been generated for VPU {vpu} in {bucket_type}\n\n" - ) - print(f"Total run time: {time.perf_counter() - t00:.2f} s") - + print(f'\n\n--------SUMMARY-------') + if bucket_type == 'local': + msg = f'\nData has been written locally to {bucket_path}' + else: + msg = f'\nData has been written to S3 bucket {bucket_name} at {file_prefix}' + msg += f'\nDownloading data : {dl_time:.2f}s' + msg += f'\nProcessing data : {proc_time:.2f}s' + msg += f'\nWriting data : {write_time:.2f}s' + msg += f'\nTotal time : {total_time:.2f}s\n' + print(msg) if __name__ == "__main__": main() diff --git a/ngen_forcing/user_input_ngen.json b/ngen_forcing/user_input_ngen.json index 11b5962..4fef300 100644 --- a/ngen_forcing/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -2,25 +2,30 @@ "forcing" : { "start_date" : "20220822", "end_date" : "20220822", + "cache" : true, "nwm_file" : "", "runinput" : 1, "varinput" : 5, "geoinput" : 1, "meminput" : 0, - "urlbaseinput" : 3, - "cache" : false + "urlbaseinput" : 3 }, "hydrofab" : { - "version" : "v1.2", - "vpu" : "03W" + "version" : "v1.2", + "vpu" : "03W" }, - "verbose" : true, - "bucket_type" : "local", - "bucket_name" : "out_data_CIROH", - "file_prefix" : "data/", - "file_type" : "csv", - "dl_threads" : 10 - + "storage":{ + "bucket_type" : "local", + "bucket_name" : "out_data_CIROH", + "file_prefix" : "data/", + "file_type" : "csv" + }, + + "run" : { + "verbose" : false, + "nthreads" : 1 + } + } diff --git a/ngen_forcing/user_input_ngen.md b/ngen_forcing/user_input_ngen.md index 033fc5b..1b13256 100644 --- a/ngen_forcing/user_input_ngen.md +++ b/ngen_forcing/user_input_ngen.md @@ -9,43 +9,47 @@ contents: "forcing" : { "start_date" : "20220822", "end_date" : "20220822", + "cache" : false, "nwm_file" : "", "runinput" : 1, "varinput" : 5, "geoinput" : 1, "meminput" : 0, - "urlbaseinput" : 3, - "cache" : false + "urlbaseinput" : 3 }, - + "hydrofab" : { - "version" : "v1.2", - "vpu" : "03W" + "version" : "v1.2", + "vpu" : "03W" }, - - "verbose" : true, - "bucket_type" : "local", - "bucket_name" : "out_data_CIROH", - "file_prefix" : "data/", - "file_type" : "csv", - "dl_threads" : 10 - + + "storage":{ + "bucket_type" : "local", + "bucket_name" : "out_data_CIROH", + "file_prefix" : "data/", + "file_type" : "csv" + }, + + "run" : { + "verbose" : false, + "nthreads" : 5 + } + } - - ### forcing | Field Name | Data Type | Description | | --- | --- | --- | | start_date | `string` | YYYYMMDD | | end_date | `string` | YYYYMMDD | -| nwm_files | `string` | Path to a text file containing nwm file names. One filename per line. To have nwm forcing file names generated automatically, leave this option out of the config or set it to "" | +| cache | `bool` |
  • true: Store forcing files locally. Must specify dl_threads
  • false: Interact with forcing files remotely
  • | +| nwm_file | `string` | Path to a text file containing nwm file names. One filename per line. To have nwm forcing file names generated automatically, leave this option out of the config or set it to "" | | runinput | `int` |
    1. short_range
    2. medium_range
    3. medium_range_no_da
    4. long_range
    5. analysis_assim
    6. analysis_assim_extend
    7. analysis_assim_extend_no_da
    8. analysis_assim_long
    9. analysis_assim_long_no_da
    10. analysis_assim_no_da
    11. short_range_no_da
    | -| varinput | `int` |
    1. channel_rt: for real-time channel data
    2. land: for land data
    3. reservoir: for reservoir data
    4. terrain_rt: for real-time terrain data
    5. forcing: for forcing data
    | -| geoinput | `int` |
    1. conus: for continental US
    2. hawaii: for Hawaii
    3. puertorico: for Puerto Rico
    | +| varinput | `int` |
    1. channel_rt
    2. land
    3. reservoir
    4. terrain_rt
    5. forcing
    | +| geoinput | `int` |
    1. conus
    2. hawaii
    3. puertorico
    | | meminput | `int` |
    1. mem_1
    2. mem_2
    3. mem_3
    4. mem_4
    5. mem_5
    6. mem_6
    7. mem_7
    | | urlbaseinput | `int` |
    1. Empty string: use local files
    2. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod/: for real-time operational data from NOAA
    3. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/post-processed/WMS/: for post-processed data from NOAA's Web Map Service
    4. https://storage.googleapis.com/national-water-model/: for input/output data stored on Google Cloud Storage
    5. https://storage.cloud.google.com/national-water-model/: for input/output data stored on Google Cloud Storage
    6. gs://national-water-model/: for input/output data stored on Google Cloud Storage
    7. https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/: for retrospective data from AWS S3
    8. s3://noaa-nwm-retrospective-2-1-pds/model_output/: for retrospective data from AWS S3
    | -| cache | `bool` |
  • true: Store forcing files locally. Must specify dl_threads
  • false: Interact with forcing files remotely
  • | + ### hydrofab | Field Name | Data Type | Description | @@ -53,12 +57,17 @@ contents: | version | `string` | Current hydrofabric version | | vpu | `string` | Check here for map of VPUs https://noaa-owp.github.io/hydrofabric/articles/data_access.html | -### other options +### storage | Field Name | Data Type | Description | | --- | --- | --- | -| verbose | `bool` | Print raw forcing files | | bucket_type | `string` |
    1. "local" : write to local directory
    2. "S3" : output to AWS S3 bucket
    | | bucket_name | `string` | If local, this is the name of the folder the data will be placed in. If S3, this is the name of S3 bucket, which must exist already. | | file_prefix | `string` | If local, this is the relative path to the bucket_name folder. If S3, this is the relative path within the S3 bucket_name bucket to store files | | file_type | `string` |
    1. "csv" : write data as csv files/
    2. "parquet" : write data as parquet files
    | -| dl_threads | `int` | Number of threads to use while downloading. | + + +### run +| Field Name | Data Type | Description | +| --- | --- | --- | +| verbose | `bool` | Print raw forcing files | +| nthreads | `int` | Number of threads to use while downloading. | From 7ffb9cf902afe83e5d1687c0a181df454c8cbb8e Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 8 May 2023 14:55:20 -0500 Subject: [PATCH 037/105] blacked --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 122 +++++++++++--------- 1 file changed, 65 insertions(+), 57 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index e1ee25c..9b5edfa 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -233,31 +233,31 @@ def get_forcing_timelist( wgt_file: str, filelist: list, var_list: list, - jt = None, - out = None, + jt=None, + out=None, ): """ - General function to read either remote or local nwm forcing files. + General function to read either remote or local nwm forcing files. Inputs: wgt_file: a path to the weights json, filelist: list of filenames (urls for remote, local paths otherwise), - var_list: list (list of variable names to read), + var_list: list (list of variable names to read), jt: the index to place the file. This is used to ensure elements increase in time, regardless of thread number, out: a list (in time) of forcing data, (THIS IS A THREADING OUTPUT) - + Outputs: df_by_t : (returned for local files) a list (in time) of forcing data. Note that this list may not be consistent in time OR - out : (returned for remote files) a list (in time) of forcing data. + out : (returned for remote files) a list (in time) of forcing data. Each thread will write into this list such that time increases, but may not be consistent - + """ t1 = time.perf_counter() - df_by_t = [] + df_by_t = [] for _i, _nc_file in enumerate(filelist): - if _nc_file[:5] == 'https': + if _nc_file[:5] == "https": eng = "rasterio" # switch engine for remote processing else: eng = "h5netcdf" @@ -270,7 +270,7 @@ def get_forcing_timelist( _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, wgt_file) df_by_t.append(_df_zonal_stats) - if jt == None: + if jt == None: print( f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}% {time.perf_counter() - t1:.2f}s elapsed", end="\r", @@ -281,11 +281,12 @@ def get_forcing_timelist( return df_by_t + def time2catchment(time_list, var_list_out): """ Convert a list of catchment dictionaries into a single dictionary of dataframes for each catchment - Inputs: + Inputs: time_list : a list returned by get_forcing_timelist. It is assumed this list is consistent in time. 
var_list_out : a list of clomun headers for the dataframes @@ -303,7 +304,8 @@ def time2catchment(time_list, var_list_out): return dfs -def cmd(cmd,out=None): + +def cmd(cmd, out=None): """ Execute system commands @@ -369,22 +371,19 @@ def locate_dl_files_threaded( args = [] for i, jcmd in enumerate(cmds): args.append([jcmd]) - out = threaded_fun(cmd,nthreads,args) + out = threaded_fun(cmd, nthreads, args) return local_files, remote_files -def threaded_fun(fun, - nthreads : int, - args : list): - + +def threaded_fun(fun, nthreads: int, args: list): """ Threaded function call """ threads = [] out = [None for x in range(len(args))] - for i in range(len(args)): - - if i >= nthreads: # Assign new jobs as threads finish + for i in range(len(args)): + if i >= nthreads: # Assign new jobs as threads finish k = 0 while True: jj = k % nthreads @@ -392,12 +391,12 @@ def threaded_fun(fun, if jthread.is_alive(): k += 1 time.sleep(0.25) - else: - t = threading.Thread(target=fun, args= [*args[i], out]) + else: + t = threading.Thread(target=fun, args=[*args[i], out]) t.start() threads[jj] = t break - else: # Initial set of threads + else: # Initial set of threads t = threading.Thread(target=fun, args=[*args[i], out]) t.start() threads.append(t) @@ -407,12 +406,13 @@ def threaded_fun(fun, while done < len(threads): done = 0 for jthread in threads: - if not jthread.is_alive(): + if not jthread.is_alive(): done += 1 time.sleep(0.25) return out + def main(): """ Primary function to retrieve forcing and hydrofabric data and convert it into files that can be ingested into ngen. @@ -432,7 +432,7 @@ def main(): dest="infile", type=str, help="A json containing user inputs to run ngen" ) args = parser.parse_args() - + # Extract configurations conf = json.load(open(args.infile)) start_date = conf["forcing"]["start_date"] @@ -452,12 +452,13 @@ def main(): bucket_type = conf["storage"]["bucket_type"] bucket_name = conf["storage"]["bucket_name"] file_prefix = conf["storage"]["file_prefix"] - file_type = conf["storage"]["file_type"] + file_type = conf["storage"]["file_type"] ii_verbose = conf["run"]["verbose"] nthreads = conf["run"]["nthreads"] - print(f'\nWelcome to Preparing Data for NextGen-Based Simulations!\n') - if not ii_verbose: print(f'Generating files now! This may take a few moments...') + print(f"\nWelcome to Preparing Data for NextGen-Based Simulations!\n") + if not ii_verbose: + print(f"Generating files now! 
This may take a few moments...") dl_time = 0 proc_time = 0 @@ -498,7 +499,8 @@ def main(): for jfile in os.listdir(CACHE_DIR): if jfile.find(f"nextgen_{vpu}.gpkg") >= 0: gpkg = Path(CACHE_DIR, jfile) - if ii_verbose:print(f"Found and using geopackge file {gpkg}") + if ii_verbose: + print(f"Found and using geopackge file {gpkg}") if gpkg == None: url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/nextgen_{vpu}.gpkg" command = f"wget -P {CACHE_DIR} -c {url}" @@ -507,18 +509,21 @@ def main(): dl_time += time.perf_counter() - t0 gpkg = Path(CACHE_DIR, f"nextgen_{vpu}.gpkg") - if ii_verbose:print(f"Opening {gpkg}...") + if ii_verbose: + print(f"Opening {gpkg}...") t0 = time.perf_counter() polygonfile = gpd.read_file(gpkg, layer="divides") ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) src = ds["RAINRATE"] - if ii_verbose:print("Generating weights") + if ii_verbose: + print("Generating weights") t1 = time.perf_counter() generate_weights_file(polygonfile, src, wgt_file, crosswalk_dict_key="id") - if ii_verbose:print(f"\nGenerating the weights took {time.perf_counter() - t1:.2f} s") - proc_time +=time.perf_counter() - t0 + if ii_verbose: + print(f"\nGenerating the weights took {time.perf_counter() - t1:.2f} s") + proc_time += time.perf_counter() - t0 else: if ii_verbose: print( @@ -545,10 +550,10 @@ def main(): with open(nwm_file, "r") as f: for line in f: nwm_forcing_files.append(line) - if ii_verbose: - print(f'Raw file names:') + if ii_verbose: + print(f"Raw file names:") for jfile in nwm_forcing_files: - print(f'{jfile}') + print(f"{jfile}") proc_time += time.perf_counter() - t0 @@ -580,22 +585,23 @@ def main(): "SPFH_2maboveground", "DSWRF_surface", ] - + t0 = time.perf_counter() # Index remote files with threads - if len(remote_nwm_files) > 0: + if len(remote_nwm_files) > 0: args = [] - for i in range(len(remote_nwm_files)): - if ii_verbose: print(f'Doing a threaded remote data retrieval for file {remote_nwm_files[i]}') + for i in range(len(remote_nwm_files)): + if ii_verbose: + print( + f"Doing a threaded remote data retrieval for file {remote_nwm_files[i]}" + ) args.append([wgt_file, [remote_nwm_files[i]], var_list, i]) - out = threaded_fun(get_forcing_timelist,nthreads,args) + out = threaded_fun(get_forcing_timelist, nthreads, args) # If we have any local files, index locally serially - if len(local_nwm_files) > 0: - time_list = get_forcing_timelist( - wgt_file, local_nwm_files, var_list - ) + if len(local_nwm_files) > 0: + time_list = get_forcing_timelist(wgt_file, local_nwm_files, var_list) # Sync in time between remote and local files complete_timelist = [] @@ -606,7 +612,7 @@ def main(): complete_timelist.append(time_list[j]) for j, jfile in enumerate(remote_nwm_files): if jfile.find(filename) >= 0: - complete_timelist.append(out[j][0]) + complete_timelist.append(out[j][0]) # Convert time-synced list of catchment dictionaries # to catchment based dataframes @@ -614,7 +620,8 @@ def main(): proc_time = time.perf_counter() - t0 # Write to file - if ii_verbose: print(f"Writing data!") + if ii_verbose: + print(f"Writing data!") t0 = time.perf_counter() nfiles = len(dfs) write_int = 1000 @@ -646,24 +653,25 @@ def main(): f"{j+1} files written out of {len(dfs)}, {(j+1)/len(dfs)*100:.2f}%", end="\r", ) - if j == nfiles-1: + if j == nfiles - 1: print( f"{j+1} files written out of {len(dfs)}, {(j+1)/len(dfs)*100:.2f}%", end="\r", - ) + ) write_time = time.perf_counter() - t0 total_time = time.perf_counter() - t00 - print(f'\n\n--------SUMMARY-------') - if bucket_type == 
'local': - msg = f'\nData has been written locally to {bucket_path}' + print(f"\n\n--------SUMMARY-------") + if bucket_type == "local": + msg = f"\nData has been written locally to {bucket_path}" else: - msg = f'\nData has been written to S3 bucket {bucket_name} at {file_prefix}' - msg += f'\nDownloading data : {dl_time:.2f}s' - msg += f'\nProcessing data : {proc_time:.2f}s' - msg += f'\nWriting data : {write_time:.2f}s' - msg += f'\nTotal time : {total_time:.2f}s\n' + msg = f"\nData has been written to S3 bucket {bucket_name} at {file_prefix}" + msg += f"\nDownloading data : {dl_time:.2f}s" + msg += f"\nProcessing data : {proc_time:.2f}s" + msg += f"\nWriting data : {write_time:.2f}s" + msg += f"\nTotal time : {total_time:.2f}s\n" print(msg) + if __name__ == "__main__": main() From ebd7d326ef551251f425c561b7aabe4b87781f15 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Tue, 9 May 2023 01:50:57 -0500 Subject: [PATCH 038/105] Fixed threading and added local file check --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 172 ++++++++++---------- ngen_forcing/user_input_ngen.json | 4 +- 2 files changed, 87 insertions(+), 89 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 9b5edfa..3932095 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -20,6 +20,8 @@ import boto3 from io import BytesIO + +import concurrent.futures as cf import threading pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "nwm_filenames") @@ -233,8 +235,7 @@ def get_forcing_timelist( wgt_file: str, filelist: list, var_list: list, - jt=None, - out=None, + jt=None ): """ General function to read either remote or local nwm forcing files. @@ -244,18 +245,16 @@ def get_forcing_timelist( filelist: list of filenames (urls for remote, local paths otherwise), var_list: list (list of variable names to read), jt: the index to place the file. This is used to ensure elements increase in time, regardless of thread number, - out: a list (in time) of forcing data, (THIS IS A THREADING OUTPUT) Outputs: df_by_t : (returned for local files) a list (in time) of forcing data. Note that this list may not be consistent in time - OR - out : (returned for remote files) a list (in time) of forcing data. 
- Each thread will write into this list such that time increases, but may not be consistent + t : model_output_valid_time for each """ t1 = time.perf_counter() df_by_t = [] + t = [] for _i, _nc_file in enumerate(filelist): if _nc_file[:5] == "https": eng = "rasterio" # switch engine for remote processing @@ -269,20 +268,20 @@ def get_forcing_timelist( data_allvars[var_dx, :, :] = np.squeeze(_xds[jvar].values) _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, wgt_file) df_by_t.append(_df_zonal_stats) + time_splt = _xds.attrs["model_output_valid_time"].split("_") + t.append(time_splt[0] + " " + time_splt[1]) if jt == None: print( f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}% {time.perf_counter() - t1:.2f}s elapsed", end="\r", ) + if _i == len(filelist) -1: print() - if not jt == None: - out[jt] = df_by_t - - return df_by_t + return df_by_t, t -def time2catchment(time_list, var_list_out): +def time2catchment(data_list, time_list, var_list_out): """ Convert a list of catchment dictionaries into a single dictionary of dataframes for each catchment @@ -296,16 +295,18 @@ def time2catchment(time_list, var_list_out): """ dfs = {} - for jcat in list(time_list[0].keys()): + for jcat in list(data_list[0].keys()): data_catch = [] - for jt in range(len(time_list)): - data_catch.append(time_list[jt][jcat]) + for jt in range(len(data_list)): + data_catch.append(data_list[jt][jcat]) dfs[jcat] = pd.DataFrame(data_catch, columns=var_list_out) + dfs[jcat]["time"] = time_list + dfs[jcat] = dfs[jcat][["time"] + var_list_out] return dfs -def cmd(cmd, out=None): +def cmd(cmd): """ Execute system commands @@ -344,21 +345,37 @@ def locate_dl_files_threaded( for jfile in forcing_file_names: file_parts = Path(jfile).parts local_file = os.path.join(CACHE_DIR, file_parts[-1]) + ii_dl = False # decide whether to use local file, download it, or index it remotely if os.path.exists(local_file): - # If the file exists local, get data from this file regardless of ii_cache option - if ii_verbose and ii_cache: - print(f"Found and using local raw forcing file {local_file}") - elif ii_verbose and not ii_cache: - print( - f"CACHE OPTION OVERRIDE : Found and using local raw forcing file {local_file}" - ) - local_files.append(local_file) + + # Check to make sure file is not broken + try: + with xr.open_dataset(local_file, engine="h5netcdf") as _xds: + pass + if ii_cache: + if ii_verbose: print(f"Found and using local raw forcing file {local_file}") + else: + if ii_verbose: print( + f"CACHE OPTION OVERRIDE : Found and using local raw forcing file {local_file}" + ) + local_files.append(local_file) + except: + if ii_cache: + if ii_verbose: print(f"{local_file} is broken! Will Download") + ii_dl = True + else: + if ii_verbose: print(f"{local_file} is broken! Will index remotely") + remote_files.append(jfile) + elif not os.path.exists(local_file) and not ii_cache: # If file is not found locally, and we don't want to cache it, append to remote file list remote_files.append(jfile) elif not os.path.exists(local_file) and ii_cache: + ii_dl = True + + if ii_dl: # Download file if ii_verbose: print(f"Forcing file not found! 
Downloading {jfile}") @@ -367,52 +384,14 @@ def locate_dl_files_threaded( dl_files.append(jfile) local_files.append(local_file) + # Get files with pool if len(cmds) > 0: - args = [] - for i, jcmd in enumerate(cmds): - args.append([jcmd]) - out = threaded_fun(cmd, nthreads, args) + pool = cf.ThreadPoolExecutor(max_workers=nthreads) + pool.map(cmd, cmds) + pool.shutdown() return local_files, remote_files - -def threaded_fun(fun, nthreads: int, args: list): - """ - Threaded function call - """ - threads = [] - out = [None for x in range(len(args))] - for i in range(len(args)): - if i >= nthreads: # Assign new jobs as threads finish - k = 0 - while True: - jj = k % nthreads - jthread = threads[jj] - if jthread.is_alive(): - k += 1 - time.sleep(0.25) - else: - t = threading.Thread(target=fun, args=[*args[i], out]) - t.start() - threads[jj] = t - break - else: # Initial set of threads - t = threading.Thread(target=fun, args=[*args[i], out]) - t.start() - threads.append(t) - - # Ensure all threads are finished - done = 0 - while done < len(threads): - done = 0 - for jthread in threads: - if not jthread.is_alive(): - done += 1 - time.sleep(0.25) - - return out - - def main(): """ Primary function to retrieve forcing and hydrofabric data and convert it into files that can be ingested into ngen. @@ -587,41 +566,60 @@ def main(): ] t0 = time.perf_counter() - # Index remote files with threads - if len(remote_nwm_files) > 0: - args = [] - for i in range(len(remote_nwm_files)): - if ii_verbose: - print( - f"Doing a threaded remote data retrieval for file {remote_nwm_files[i]}" - ) - args.append([wgt_file, [remote_nwm_files[i]], var_list, i]) - out = threaded_fun(get_forcing_timelist, nthreads, args) + pool = cf.ThreadPoolExecutor(max_workers=nthreads) + arg0 = [] + arg1 = [] + arg2 = [] + arg3 = [] + for i in range(len(remote_nwm_files)): + if ii_verbose: + print( + f"Doing a threaded remote data retrieval for file {remote_nwm_files[i]}" + ) + arg0.append(wgt_file) + arg1.append([remote_nwm_files[i]]) + arg2.append(var_list) + arg3.append(i) + results = pool.map(get_forcing_timelist, arg0,arg1,arg2,arg3) + + # Get data + remote_data_list = [] + for jres in results: + remote_data_list.append(jres) + + # Build time axis + t_ax_remote = [] + for i in range(len(remote_nwm_files)): + t_ax_remote.append(remote_data_list[i][1]) # If we have any local files, index locally serially if len(local_nwm_files) > 0: - time_list = get_forcing_timelist(wgt_file, local_nwm_files, var_list) + data_list, t_ax_local = get_forcing_timelist( + wgt_file, local_nwm_files, var_list + ) # Sync in time between remote and local files - complete_timelist = [] + complete_data_timelist = [] + timelist = [] for i, ifile in enumerate(nwm_forcing_files): filename = Path(ifile).parts[-1] for j, jfile in enumerate(local_nwm_files): if jfile.find(filename) >= 0: - complete_timelist.append(time_list[j]) + complete_data_timelist.append(data_list[j]) + timelist.append(t_ax_local[j]) for j, jfile in enumerate(remote_nwm_files): if jfile.find(filename) >= 0: - complete_timelist.append(out[j][0]) + complete_data_timelist.append(remote_data_list[j][0][0]) + timelist.append(t_ax_remote[j]) # Convert time-synced list of catchment dictionaries # to catchment based dataframes - dfs = time2catchment(complete_timelist, var_list_out) - proc_time = time.perf_counter() - t0 + if ii_verbose: print(f'Reformatting data into dataframes...') + dfs = time2catchment(complete_data_timelist, timelist, var_list_out) + proc_time += time.perf_counter() - t0 # Write to 
file - if ii_verbose: - print(f"Writing data!") t0 = time.perf_counter() nfiles = len(dfs) write_int = 1000 @@ -632,7 +630,7 @@ def main(): if bucket_type == "local": if file_type == "csv": csvname = Path(bucket_path, f"cat{vpu}_{splt[1]}.csv") - df.to_csv(csvname) + df.to_csv(csvname, index=False) if file_type == "parquet": parq_file = Path(bucket_path, f"cat{vpu}_{splt[1]}.parquet") df.to_parquet(parq_file) @@ -666,9 +664,9 @@ def main(): msg = f"\nData has been written locally to {bucket_path}" else: msg = f"\nData has been written to S3 bucket {bucket_name} at {file_prefix}" - msg += f"\nDownloading data : {dl_time:.2f}s" - msg += f"\nProcessing data : {proc_time:.2f}s" - msg += f"\nWriting data : {write_time:.2f}s" + msg += f"\Check and DL data : {dl_time:.2f}s" + msg += f"\nProcess data : {proc_time:.2f}s" + msg += f"\nWrite data : {write_time:.2f}s" msg += f"\nTotal time : {total_time:.2f}s\n" print(msg) diff --git a/ngen_forcing/user_input_ngen.json b/ngen_forcing/user_input_ngen.json index 4fef300..eebfb5e 100644 --- a/ngen_forcing/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -24,8 +24,8 @@ }, "run" : { - "verbose" : false, - "nthreads" : 1 + "verbose" : true, + "nthreads" : 10 } } From c789cd94f6eba0dfa1f82582abc3a2cf325e362d Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 11 May 2023 13:52:01 -0500 Subject: [PATCH 039/105] removed print statements --- subsetting/subset.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/subsetting/subset.py b/subsetting/subset.py index 973fa10..aa03f21 100644 --- a/subsetting/subset.py +++ b/subsetting/subset.py @@ -95,7 +95,11 @@ def get_upstream_ids(divides, nexus, catchment_id): # os._exit(1) for cid in catchment_ids: graph_nodes.put((catchment_id, 0, True)) - graph_nodes.put((cat_index.loc[cid].item(), 0, False)) + try: + graph_nodes.put((cat_index.loc[cid].item(), 0, False)) + except: + raise Exception(f'catchment id {cid} is not found in geopackage!') + cat_ids = set() nex_ids = set() @@ -150,9 +154,9 @@ def subset_upstream(hydrofabric: Path, ids: "List") -> None: # print(nex_ids) # print(wb_ids) # Useful for looking at the name of each layer and which id index is needed to subset it - for layer in layers: + # for layer in layers: # df = gpd.read_file(hydrofabric, layer=layer) - print(layer) + # print(layer) # print(df.head()) flowpaths = ( From aa6c0c0dd43115336cb4afce3615de8b77d7d071 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 11 May 2023 14:53:38 -0500 Subject: [PATCH 040/105] Threaded local data processing and updated user_inputs --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 288 ++++++++++++-------- ngen_forcing/user_input_ngen.json | 20 +- ngen_forcing/user_input_ngen.md | 43 ++- 3 files changed, 215 insertions(+), 136 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 3932095..fe919c6 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -19,15 +19,18 @@ import time import boto3 from io import BytesIO - - +import matplotlib.pyplot as plt +from mpl_toolkits.basemap import Basemap import concurrent.futures as cf -import threading pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "nwm_filenames") sys.path.append(str(pkg_dir)) from listofnwmfilenames import create_file_list +pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "subsetting") +sys.path.append(str(pkg_dir)) +from subset import subset_upstream + TEMPLATE_BLOB_NAME = ( 
"nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" ) @@ -53,7 +56,7 @@ # TODO Make CACHE_DIR configurable CACHE_DIR = Path( - pkg_dir.parent, "data", "raw_data" + pkg_dir.parent, "data", "cache" ) # Maybe this should have a date attached to the name NWM_CACHE_DIR = os.path.join(CACHE_DIR, "nwm") @@ -231,12 +234,7 @@ def calc_zonal_stats_weights_new( return mean_dict -def get_forcing_timelist( - wgt_file: str, - filelist: list, - var_list: list, - jt=None -): +def get_forcing_timelist(wgt_file: str, filelist: list, var_list: list): """ General function to read either remote or local nwm forcing files. @@ -271,13 +269,6 @@ def get_forcing_timelist( time_splt = _xds.attrs["model_output_valid_time"].split("_") t.append(time_splt[0] + " " + time_splt[1]) - if jt == None: - print( - f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}% {time.perf_counter() - t1:.2f}s elapsed", - end="\r", - ) - if _i == len(filelist) -1: print() - return df_by_t, t @@ -349,26 +340,29 @@ def locate_dl_files_threaded( # decide whether to use local file, download it, or index it remotely if os.path.exists(local_file): - # Check to make sure file is not broken try: with xr.open_dataset(local_file, engine="h5netcdf") as _xds: - pass + pass if ii_cache: - if ii_verbose: print(f"Found and using local raw forcing file {local_file}") + if ii_verbose: + print(f"Found and using local raw forcing file {local_file}") else: - if ii_verbose: print( - f"CACHE OPTION OVERRIDE : Found and using local raw forcing file {local_file}" - ) + if ii_verbose: + print( + f"CACHE OPTION OVERRIDE : Found and using local raw forcing file {local_file}" + ) local_files.append(local_file) - except: + except: if ii_cache: - if ii_verbose: print(f"{local_file} is broken! Will Download") + if ii_verbose: + print(f"{local_file} is broken! Will Download") ii_dl = True else: - if ii_verbose: print(f"{local_file} is broken! Will index remotely") - remote_files.append(jfile) - + if ii_verbose: + print(f"{local_file} is broken! Will index remotely") + remote_files.append(jfile) + elif not os.path.exists(local_file) and not ii_cache: # If file is not found locally, and we don't want to cache it, append to remote file list remote_files.append(jfile) @@ -392,6 +386,34 @@ def locate_dl_files_threaded( return local_files, remote_files +def threaded_data_extract(files,nthreads,ii_verbose,wgt_file,var_list): + """ + Sets up the thread pool for get_forcing_timelist and returns the data and time axis ordered in time + + """ + pool = cf.ThreadPoolExecutor(max_workers=nthreads) + arg0 = [] + arg1 = [] + arg2 = [] + for i in range(len(files)): + arg0.append(wgt_file) + arg1.append([files[i]]) + arg2.append(var_list) + + results = pool.map(get_forcing_timelist, arg0, arg1, arg2) + + data_list = [] + for jres in results: + data_list.append(jres) + + # Build time axis + t_ax_local = [] + for i in range(len(files)): + t_ax_local.append(data_list[i][1]) + + return data_list, t_ax_local + + def main(): """ Primary function to retrieve forcing and hydrofabric data and convert it into files that can be ingested into ngen. 
@@ -414,35 +436,44 @@ def main(): # Extract configurations conf = json.load(open(args.infile)) - start_date = conf["forcing"]["start_date"] - end_date = conf["forcing"]["end_date"] - if "nwm_file" in conf["forcing"]: - nwm_file = conf["forcing"]["nwm_file"] - else: - nwm_file = "" - runinput = conf["forcing"]["runinput"] - varinput = conf["forcing"]["varinput"] - geoinput = conf["forcing"]["geoinput"] - meminput = conf["forcing"]["meminput"] - urlbaseinput = conf["forcing"]["urlbaseinput"] + forcing_type = conf["forcing"]["forcing_type"] ii_cache = conf["forcing"]["cache"] - version = conf["hydrofab"]["version"] - vpu = conf["hydrofab"]["vpu"] + + start_date = conf["forcing"].get("start_date",None) + end_date = conf["forcing"].get("end_date",None) + runinput = conf["forcing"].get("runinput",None) + varinput = conf["forcing"].get("varinput",None) + geoinput = conf["forcing"].get("geoinput",None) + meminput = conf["forcing"].get("meminput",None) + urlbaseinput = conf["forcing"].get("urlbaseinput",None) + nwm_file = conf["forcing"].get("nwm_file",None) + fcst_cycle = conf["forcing"].get("fcst_cycle",None) + lead_time = conf["forcing"].get("lead_time",None) + + version = conf["hydrofab"].get('version','v1.2') + vpu = conf["hydrofab"].get("vpu") + catchment_subset = conf['hydrofab'].get("catch_subset") + geopkg_file = conf["hydrofab"].get("geopkg_file") + ii_weights_only = conf['hydrofab'].get('weights_only',False) + bucket_type = conf["storage"]["bucket_type"] bucket_name = conf["storage"]["bucket_name"] file_prefix = conf["storage"]["file_prefix"] file_type = conf["storage"]["file_type"] - ii_verbose = conf["run"]["verbose"] - nthreads = conf["run"]["nthreads"] + + ii_verbose = conf["run"]["verbose"] + dl_threads = conf["run"]["dl_threads"] + proc_threads = conf["run"]["proc_threads"] print(f"\nWelcome to Preparing Data for NextGen-Based Simulations!\n") - if not ii_verbose: - print(f"Generating files now! This may take a few moments...") dl_time = 0 proc_time = 0 # configuration validation + accepted = ['operational_archive','retrospective','from_file'] + msg = f'{forcing_type} is not a valid input for \"forcing_type\"\nAccepted inputs: {accepted}' + assert forcing_type in accepted, msg file_types = ["csv", "parquet"] assert ( file_type in file_types @@ -451,6 +482,7 @@ def main(): assert ( bucket_type in bucket_types ), f"{bucket_type} for bucket_type is not accepted! 
Accepted: {bucket_types}" + assert vpu is not None or geopkg_file is not None, "Need to input either vpu or geopkg_file" # Set paths and make directories if needed top_dir = Path(os.path.dirname(args.infile)).parent @@ -462,57 +494,100 @@ def main(): # Prep output directory if bucket_type == "local": bucket_path = Path(top_dir, file_prefix, bucket_name) + forcing_path = Path(bucket_path, 'forcing') if not os.path.exists(bucket_path): - os.system(f"mkdir {bucket_path}") + os.system(f"mkdir {bucket_path}") + os.system(f"mkdir {forcing_path}") if not os.path.exists(bucket_path): raise Exception(f"Creating {bucket_path} failed!") elif bucket_type == "S3": s3 = boto3.client("s3") # Generate weight file only if one doesn't exist already - # Very time consuming so we don't want to do this if we can avoid it - wgt_file = os.path.join(CACHE_DIR, "weights.json") + if catchment_subset is not None: + wgt_file = os.path.join(CACHE_DIR, f"{catchment_subset}_upstream_weights.json") + else: + wgt_file = os.path.join(CACHE_DIR, f"{vpu}_weights.json") if not os.path.exists(wgt_file): - # Search for geopackage that matches the requested VPU, if it exists - gpkg = None - for jfile in os.listdir(CACHE_DIR): - if jfile.find(f"nextgen_{vpu}.gpkg") >= 0: - gpkg = Path(CACHE_DIR, jfile) - if ii_verbose: - print(f"Found and using geopackge file {gpkg}") - if gpkg == None: - url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/nextgen_{vpu}.gpkg" - command = f"wget -P {CACHE_DIR} -c {url}" - t0 = time.perf_counter() - cmd(command) - dl_time += time.perf_counter() - t0 - gpkg = Path(CACHE_DIR, f"nextgen_{vpu}.gpkg") - if ii_verbose: - print(f"Opening {gpkg}...") - t0 = time.perf_counter() - polygonfile = gpd.read_file(gpkg, layer="divides") + # Use geopkg_file if given + if geopkg_file is not None: + gpkg = Path(Path(os.path.dirname(__file__)).parent,geopkg_file) + if not gpkg.exists: + raise Exception(f"{gpkg} doesn't exist!!") + + elif catchment_subset is not None: + gpkg = Path(Path(os.path.dirname(__file__)).parent,catchment_subset + '_upstream_subset.gpkg') + + # Default to geopackage that matches the requested VPU + else: + gpkg = None + for jfile in os.listdir(CACHE_DIR): + if jfile.find(f"nextgen_{vpu}.gpkg") >= 0: + gpkg = Path(CACHE_DIR, jfile) + if ii_verbose: + print(f"Found and using geopackge file {gpkg}") + if gpkg == None: + url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/nextgen_{vpu}.gpkg" + command = f"wget -P {CACHE_DIR} -c {url}" + t0 = time.perf_counter() + cmd(command) + dl_time += time.perf_counter() - t0 + gpkg = Path(CACHE_DIR, f"nextgen_{vpu}.gpkg") + + if not os.path.exists(gpkg): + + # Generate geopackage through subsetting routine. This will generate ngen geojsons files + if catchment_subset is not None: + if ii_verbose: print(f'Subsetting catchment with id {catchment_subset} from {gpkg}') + subset_upstream(gpkg,catchment_subset) + + # geojsons will be placed in working directory. 
Copy them to bucket + if bucket_type == 'local': + out_path = Path(bucket_path,'configs') + if not os.path.exists(out_path): os.system(f'mkdir {out_path}') + os.system(f"mv ./catchments.geojson ./nexus.geojson ./crosswalk.json ./flowpaths.geojson ./flowpath_edge_list.json {out_path}") + else: + print(f'UNTESTED!!') + files = ["./catchments.geojson" "./nexus.geojson" "./crosswalk.json" "./flowpaths.geojson" "./flowpath_edge_list.json"] + buf = BytesIO() + for jfile in files: + s3.put_object( + Body=json.dumps(jfile), + Bucket={bucket_name} + ) + + # TODO: Create Realization file + # TODO: Validate configs + else: + if ii_verbose: + print(f"Opening {gpkg}...") + t0 = time.perf_counter() + polygonfile = gpd.read_file(gpkg, layer="divides") - ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) - src = ds["RAINRATE"] + ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) + src = ds["RAINRATE"] - if ii_verbose: - print("Generating weights") - t1 = time.perf_counter() - generate_weights_file(polygonfile, src, wgt_file, crosswalk_dict_key="id") - if ii_verbose: - print(f"\nGenerating the weights took {time.perf_counter() - t1:.2f} s") - proc_time += time.perf_counter() - t0 + if ii_verbose: + print("Generating weights") + t1 = time.perf_counter() + generate_weights_file(polygonfile, src, wgt_file, crosswalk_dict_key="id") + if ii_verbose: + print(f"\nGenerating the weights took {time.perf_counter() - t1:.2f} s") + proc_time += time.perf_counter() - t0 else: if ii_verbose: print( f"Not creating weight file! Delete this if you want to create a new one: {wgt_file}" ) + # Exit early if we only want to calculate the weights + if ii_weights_only: + exit + # Get nwm forcing file names t0 = time.perf_counter() - if len(nwm_file) == 0: - fcst_cycle = [0] + if not forcing_type == 'from_file': nwm_forcing_files = create_file_list( runinput, @@ -523,6 +598,7 @@ def main(): end_date, fcst_cycle, urlbaseinput, + lead_time ) else: nwm_forcing_files = [] @@ -539,7 +615,7 @@ def main(): # This will look for local raw forcing files and download them if needed t0 = time.perf_counter() local_nwm_files, remote_nwm_files = locate_dl_files_threaded( - ii_cache, ii_verbose, nwm_forcing_files, nthreads + ii_cache, ii_verbose, nwm_forcing_files, dl_threads ) dl_time += time.perf_counter() - t0 @@ -567,46 +643,29 @@ def main(): t0 = time.perf_counter() # Index remote files with threads - pool = cf.ThreadPoolExecutor(max_workers=nthreads) - arg0 = [] - arg1 = [] - arg2 = [] - arg3 = [] - for i in range(len(remote_nwm_files)): + if len(remote_nwm_files) > 0: if ii_verbose: print( - f"Doing a threaded remote data retrieval for file {remote_nwm_files[i]}" + f"Performing threaded remote data extraction with {proc_threads} workers..." 
) - arg0.append(wgt_file) - arg1.append([remote_nwm_files[i]]) - arg2.append(var_list) - arg3.append(i) - results = pool.map(get_forcing_timelist, arg0,arg1,arg2,arg3) - - # Get data - remote_data_list = [] - for jres in results: - remote_data_list.append(jres) - - # Build time axis - t_ax_remote = [] - for i in range(len(remote_nwm_files)): - t_ax_remote.append(remote_data_list[i][1]) + remote_data_list, t_ax_remote = threaded_data_extract(remote_nwm_files,proc_threads,ii_verbose,wgt_file,var_list) - # If we have any local files, index locally serially + # Index local files with threads if len(local_nwm_files) > 0: - data_list, t_ax_local = get_forcing_timelist( - wgt_file, local_nwm_files, var_list - ) + if ii_verbose: + print( + f"Performing threaded local data extraction with {proc_threads} workers..." + ) + local_data_list, t_ax_local = threaded_data_extract(local_nwm_files,proc_threads,ii_verbose,wgt_file,var_list) # Sync in time between remote and local files complete_data_timelist = [] timelist = [] - for i, ifile in enumerate(nwm_forcing_files): + for ifile in nwm_forcing_files: filename = Path(ifile).parts[-1] for j, jfile in enumerate(local_nwm_files): if jfile.find(filename) >= 0: - complete_data_timelist.append(data_list[j]) + complete_data_timelist.append(local_data_list[j][0][0]) timelist.append(t_ax_local[j]) for j, jfile in enumerate(remote_nwm_files): if jfile.find(filename) >= 0: @@ -615,7 +674,8 @@ def main(): # Convert time-synced list of catchment dictionaries # to catchment based dataframes - if ii_verbose: print(f'Reformatting data into dataframes...') + if ii_verbose: + print(f"Reformatting data into dataframes...") dfs = time2catchment(complete_data_timelist, timelist, var_list_out) proc_time += time.perf_counter() - t0 @@ -629,10 +689,10 @@ def main(): if bucket_type == "local": if file_type == "csv": - csvname = Path(bucket_path, f"cat{vpu}_{splt[1]}.csv") + csvname = Path(forcing_path, f"cat{vpu}_{splt[1]}.csv") df.to_csv(csvname, index=False) if file_type == "parquet": - parq_file = Path(bucket_path, f"cat{vpu}_{splt[1]}.parquet") + parq_file = Path(forcing_path, f"cat{vpu}_{splt[1]}.parquet") df.to_parquet(parq_file) elif bucket_type == "S3": buf = BytesIO() @@ -643,7 +703,7 @@ def main(): csvname = f"cat{vpu}_{splt[1]}.csv" df.to_csv(buf, index=False) buf.seek(0) - key_name = f"{file_prefix}{csvname}" + key_name = f"{file_prefix}/forcing/{csvname}" s3.put_object(Bucket=bucket_name, Key=key_name, Body=buf.getvalue()) if (j + 1) % write_int == 0: @@ -664,10 +724,10 @@ def main(): msg = f"\nData has been written locally to {bucket_path}" else: msg = f"\nData has been written to S3 bucket {bucket_name} at {file_prefix}" - msg += f"\Check and DL data : {dl_time:.2f}s" - msg += f"\nProcess data : {proc_time:.2f}s" - msg += f"\nWrite data : {write_time:.2f}s" - msg += f"\nTotal time : {total_time:.2f}s\n" + msg += f"\nCheck and DL data : {dl_time:.2f}s" + msg += f"\nProcess data : {proc_time:.2f}s" + msg += f"\nWrite data : {write_time:.2f}s" + msg += f"\nTotal time : {total_time:.2f}s\n" print(msg) diff --git a/ngen_forcing/user_input_ngen.json b/ngen_forcing/user_input_ngen.json index eebfb5e..94f64df 100644 --- a/ngen_forcing/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -1,31 +1,37 @@ { "forcing" : { + "forcing_type" : "operational_archive", "start_date" : "20220822", "end_date" : "20220822", - "cache" : true, + "cache" : true, "nwm_file" : "", "runinput" : 1, "varinput" : 5, "geoinput" : 1, "meminput" : 0, - "urlbaseinput" : 3 + 
"urlbaseinput" : 3, + "fcst_cycle" : [0], + "lead_time" : null }, "hydrofab" : { - "version" : "v1.2", - "vpu" : "03W" + "version" : "v1.2", + "vpu" : "03W", + "catch_subset" : "cat-112977", + "weights_only" : false }, "storage":{ "bucket_type" : "local", - "bucket_name" : "out_data_CIROH", + "bucket_name" : "ngen_inputs", "file_prefix" : "data/", "file_type" : "csv" }, "run" : { - "verbose" : true, - "nthreads" : 10 + "verbose" : true, + "dl_threads" : 10, + "proc_threads" : 2 } } diff --git a/ngen_forcing/user_input_ngen.md b/ngen_forcing/user_input_ngen.md index 1b13256..6da6cab 100644 --- a/ngen_forcing/user_input_ngen.md +++ b/ngen_forcing/user_input_ngen.md @@ -7,61 +7,73 @@ contents: { "forcing" : { + "forcing_type" : "operational_archive", "start_date" : "20220822", "end_date" : "20220822", - "cache" : false, + "cache" : true, "nwm_file" : "", "runinput" : 1, "varinput" : 5, "geoinput" : 1, "meminput" : 0, - "urlbaseinput" : 3 + "urlbaseinput" : 3, + "fcst_cycle" : [0], + "lead_time" : null }, - + "hydrofab" : { - "version" : "v1.2", - "vpu" : "03W" + "version" : "v1.2", + "vpu" : "03W", + "catch_subset" : "cat-112977", + "weights_only" : false }, - + "storage":{ "bucket_type" : "local", - "bucket_name" : "out_data_CIROH", + "bucket_name" : "ngen_inputs", "file_prefix" : "data/", "file_type" : "csv" }, - + "run" : { - "verbose" : false, - "nthreads" : 5 + "verbose" : true, + "nthreads" : 2 } - + } ### forcing | Field Name | Data Type | Description | | --- | --- | --- | +| forcing_type | `string` |
  • operational_archive
  • retrospective
  • from_file
  • | | start_date | `string` | YYYYMMDD | | end_date | `string` | YYYYMMDD | | cache | `bool` |
  • true: Store forcing files locally. Must specify dl_threads
  • false: Interact with forcing files remotely
  • | -| nwm_file | `string` | Path to a text file containing nwm file names. One filename per line. To have nwm forcing file names generated automatically, leave this option out of the config or set it to "" | +| nwm_file | `string` | Path to a text file containing nwm file names. One filename per line. Set this only if forcing_type is set to 'from_file' | | runinput | `int` |
    1. short_range
    2. medium_range
    3. medium_range_no_da
    4. long_range
    5. analysis_assim
    6. analysis_assim_extend
    7. analysis_assim_extend_no_da
    8. analysis_assim_long
    9. analysis_assim_long_no_da
    10. analysis_assim_no_da
    11. short_range_no_da
    | | varinput | `int` |
    1. channel_rt
    2. land
    3. reservoir
    4. terrain_rt
    5. forcing
    | | geoinput | `int` |
    1. conus
    2. hawaii
    3. puertorico
    | | meminput | `int` |
    1. mem_1
    2. mem_2
    3. mem_3
    4. mem_4
    5. mem_5
    6. mem_6
    7. mem_7
    | | urlbaseinput | `int` |
    1. Empty string: use local files
    2. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod/: for real-time operational data from NOAA
    3. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/post-processed/WMS/: for post-processed data from NOAA's Web Map Service
    4. https://storage.googleapis.com/national-water-model/: for input/output data stored on Google Cloud Storage
    5. https://storage.cloud.google.com/national-water-model/: for input/output data stored on Google Cloud Storage
    6. gs://national-water-model/: for input/output data stored on Google Cloud Storage
    7. https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/: for retrospective data from AWS S3
    8. s3://noaa-nwm-retrospective-2-1-pds/model_output/: for retrospective data from AWS S3
    | +| fcst_cycle | `list` | List of forecast cycles in UTC. If empty, will use all available cycles | +| lead_time | `list` | List of lead times in hours. If empty, will use all available lead times | ### hydrofab | Field Name | Data Type | Description | | --- | --- | --- | -| version | `string` | Current hydrofabric version | +| version | `string` | Desired hydrofabric data version | | vpu | `string` | Check here for map of VPUs https://noaa-owp.github.io/hydrofabric/articles/data_access.html | +| geopkg_file | `string` | Path to file containing catchment polygons. Must exist locally | +| catch_subset | `string` | catchment id of the form "cat-#". If provided, a subsetted geopackage will be created from vpu geopackage. NGen config files will be generated as well | +| weights_only | `bool` |
  • true: Generate weight file and exit.
  • false: Proceed with full script, generate forcing files
  • | + ### storage | Field Name | Data Type | Description | | --- | --- | --- | | bucket_type | `string` |
    1. "local" : write to local directory
    2. "S3" : output to AWS S3 bucket
    | -| bucket_name | `string` | If local, this is the name of the folder the data will be placed in. If S3, this is the name of S3 bucket, which must exist already. | +| bucket_name | `string` | If local, this is the name of the folder the data will be placed in. If S3, this is the name of S3 bucket, which must exist already | | file_prefix | `string` | If local, this is the relative path to the bucket_name folder. If S3, this is the relative path within the S3 bucket_name bucket to store files | | file_type | `string` |
    1. "csv" : write data as csv files/
    2. "parquet" : write data as parquet files
    | @@ -70,4 +82,5 @@ contents: | Field Name | Data Type | Description | | --- | --- | --- | | verbose | `bool` | Print raw forcing files | -| nthreads | `int` | Number of threads to use while downloading. | +| dl_threads | `int` | Number of threads to use while downloading. | +| proc_threads | `int` | Number of threads to use while processing data (either remotely or locally). | From b710cff48b049a0247f20b94b1193a979611e03d Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Fri, 12 May 2023 12:11:19 -0500 Subject: [PATCH 041/105] Retrospective file names --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 59 ++++++++++++++++----- ngen_forcing/user_input_ngen.json | 20 +++---- ngen_forcing/user_input_ngen.md | 4 +- 3 files changed, 60 insertions(+), 23 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index fe919c6..bee591c 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -27,9 +27,15 @@ sys.path.append(str(pkg_dir)) from listofnwmfilenames import create_file_list +retro_file = Path(pkg_dir,'listofnwmfilenamesretro.py') +ii_retro = False +if retro_file.exists(): + ii_retro = True + from listofnwmfilenamesretro import create_file_list_retro + pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "subsetting") sys.path.append(str(pkg_dir)) -from subset import subset_upstream +from subset import subset_upstream, subset_upstream_prerelease TEMPLATE_BLOB_NAME = ( "nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" @@ -437,6 +443,8 @@ def main(): # Extract configurations conf = json.load(open(args.infile)) forcing_type = conf["forcing"]["forcing_type"] + if not ii_retro and forcing_type == "retrospective": + raise NotImplementedError("Need listofnwmfilenamesretro for this!") ii_cache = conf["forcing"]["cache"] start_date = conf["forcing"].get("start_date",None) @@ -449,6 +457,8 @@ def main(): nwm_file = conf["forcing"].get("nwm_file",None) fcst_cycle = conf["forcing"].get("fcst_cycle",None) lead_time = conf["forcing"].get("lead_time",None) + data_type = conf["forcing"].get("data_type",None) + object_type = conf["forcing"].get("object_type",None) version = conf["hydrofab"].get('version','v1.2') vpu = conf["hydrofab"].get("vpu") @@ -540,7 +550,13 @@ def main(): # Generate geopackage through subsetting routine. This will generate ngen geojsons files if catchment_subset is not None: if ii_verbose: print(f'Subsetting catchment with id {catchment_subset} from {gpkg}') - subset_upstream(gpkg,catchment_subset) + if catchment_subset.find("release"): + try: + subset_upstream_prerelease(gpkg,catchment_subset) + except: + raise NotImplementedError(f"Need Tony's version of subset.py!") + else: + subset_upstream(gpkg,catchment_subset) # geojsons will be placed in working directory. 
Copy them to bucket if bucket_type == 'local': @@ -589,17 +605,34 @@ def main(): t0 = time.perf_counter() if not forcing_type == 'from_file': - nwm_forcing_files = create_file_list( - runinput, - varinput, - geoinput, - meminput, - start_date, - end_date, - fcst_cycle, - urlbaseinput, - lead_time - ) + if forcing_type == "operational_archive": + nwm_forcing_files = create_file_list( + runinput, + varinput, + geoinput, + meminput, + start_date, + end_date, + fcst_cycle, + urlbaseinput, + lead_time + ) + elif forcing_type == "retrospective": + nwm_forcing_files = create_file_list_retro( + runinput, + varinput, + geoinput, + meminput, + start_date + "0000", # Hack + end_date + "0000", # Hack + fcst_cycle, + urlbaseinput, + lead_time, + data_type, + object_type + ) + nwm_forcing_files = nwm_forcing_files[0] + else: nwm_forcing_files = [] with open(nwm_file, "r") as f: diff --git a/ngen_forcing/user_input_ngen.json b/ngen_forcing/user_input_ngen.json index 94f64df..613ea7d 100644 --- a/ngen_forcing/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -1,17 +1,19 @@ { "forcing" : { - "forcing_type" : "operational_archive", - "start_date" : "20220822", - "end_date" : "20220822", + "forcing_type" : "retrospective", + "start_date" : "19790201", + "end_date" : "19790202", "cache" : true, "nwm_file" : "", - "runinput" : 1, - "varinput" : 5, + "runinput" : 2, + "varinput" : 1, "geoinput" : 1, - "meminput" : 0, - "urlbaseinput" : 3, - "fcst_cycle" : [0], - "lead_time" : null + "meminput" : 1, + "urlbaseinput" : 6, + "fcst_cycle" : [12,18], + "lead_time" : [1, 2, 240], + "data_type" : [6], + "object_type" : 1 }, "hydrofab" : { diff --git a/ngen_forcing/user_input_ngen.md b/ngen_forcing/user_input_ngen.md index 6da6cab..9d7ac58 100644 --- a/ngen_forcing/user_input_ngen.md +++ b/ngen_forcing/user_input_ngen.md @@ -54,9 +54,11 @@ contents: | varinput | `int` |
    1. channel_rt
    2. land
    3. reservoir
    4. terrain_rt
    5. forcing
    | | geoinput | `int` |
    1. conus
    2. hawaii
    3. puertorico
    | | meminput | `int` |
    1. mem_1
    2. mem_2
    3. mem_3
    4. mem_4
    5. mem_5
    6. mem_6
    7. mem_7
    | -| urlbaseinput | `int` |
    1. Empty string: use local files
    2. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod/: for real-time operational data from NOAA
    3. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/post-processed/WMS/: for post-processed data from NOAA's Web Map Service
    4. https://storage.googleapis.com/national-water-model/: for input/output data stored on Google Cloud Storage
    5. https://storage.cloud.google.com/national-water-model/: for input/output data stored on Google Cloud Storage
    6. gs://national-water-model/: for input/output data stored on Google Cloud Storage
    7. https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/: for retrospective data from AWS S3
    8. s3://noaa-nwm-retrospective-2-1-pds/model_output/: for retrospective data from AWS S3
    | +| urlbaseinput | `int` |
    1. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod/: for real-time operational data from NOAA
    2. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/post-processed/WMS/: for post-processed data from NOAA's Web Map Service
    3. https://storage.googleapis.com/national-water-model/: for input/output data stored on Google Cloud Storage
    4. https://storage.cloud.google.com/national-water-model/: for input/output data stored on Google Cloud Storage
    5. gs://national-water-model/: for input/output data stored on Google Cloud Storage
    6. https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/: for retrospective data from AWS S3
    7. s3://noaa-nwm-retrospective-2-1-pds/model_output/: for retrospective data from AWS S3
    | | fcst_cycle | `list` | List of forecast cycles in UTC. If empty, will use all available cycles | | lead_time | `list` | List of lead times in hours. If empty, will use all available lead times | +| data_type | `list` | Only required for retrospective
    1. CHRTOUT_DOMAIN1
    2. GWOUT_DOMAIN1
    3. LAKEOUT_DOMAIN1
    4. LDASOUT_DOMAIN1
    5. RTOUT_DOMAIN1
    6. LDASIN_DOMAIN1
    | +| object_type | `list` or `int` | Only required for retrospective
    1. forcing
    2. model_output
    | ### hydrofab From 1d0190e122f5b0f87cca8a0556fa5240a56991ec Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Fri, 12 May 2023 13:41:25 -0500 Subject: [PATCH 042/105] Removed import shield --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index bee591c..7d6040a 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -28,10 +28,7 @@ from listofnwmfilenames import create_file_list retro_file = Path(pkg_dir,'listofnwmfilenamesretro.py') -ii_retro = False -if retro_file.exists(): - ii_retro = True - from listofnwmfilenamesretro import create_file_list_retro +from listofnwmfilenamesretro import create_file_list_retro pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "subsetting") sys.path.append(str(pkg_dir)) @@ -443,8 +440,6 @@ def main(): # Extract configurations conf = json.load(open(args.infile)) forcing_type = conf["forcing"]["forcing_type"] - if not ii_retro and forcing_type == "retrospective": - raise NotImplementedError("Need listofnwmfilenamesretro for this!") ii_cache = conf["forcing"]["cache"] start_date = conf["forcing"].get("start_date",None) From e5ae8f1d345d26806d5a77240c341d8b1323fdb7 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Tue, 18 Apr 2023 14:59:00 -0500 Subject: [PATCH 043/105] Tool to generate list of number of upstream catchments --- subsetting/README.md | 7 +++++ subsetting/ncatch_upstream.py | 51 +++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 subsetting/ncatch_upstream.py diff --git a/subsetting/README.md b/subsetting/README.md index a3c9e10..953443a 100644 --- a/subsetting/README.md +++ b/subsetting/README.md @@ -13,3 +13,10 @@ The subset algorithm will find all features upstream of the `catchment_id` and t # Note A current shortcut is being used to map `wb` and `cat` ids that isn't a valid assumption, and will be fixed in the future. This means you might get a subset that isn't topologically consistent, so use at your own risk. 
+ +# ncatch_upstream +To get a list of how many catchments are upstream of each catchment, enter the following command +`python subset.py -i -o ` + +where `path_to_hydrofabric` can be a local geopkg, or a remote resource (s3 or http URL), +and `path_to_output_text_file` is the full path to where you want the list output \ No newline at end of file diff --git a/subsetting/ncatch_upstream.py b/subsetting/ncatch_upstream.py new file mode 100644 index 0000000..7481f9b --- /dev/null +++ b/subsetting/ncatch_upstream.py @@ -0,0 +1,51 @@ +import geopandas as gpd +import argparse +from subset import get_upstream_ids + +def main(): + #setup the argument parser + parser = argparse.ArgumentParser() + parser.add_argument("-i", dest="infile", type=str, required=True, help="A gpkg file containing divides and nexus layers") + parser.add_argument("-o", dest="outfile", type=str, required=True, help="A text file containing the number of upstream catchments for each catchment") + args = parser.parse_args() + + infile = args.infile + outfile = args.outfile + + print("Reading catchment data...") + df_cat = gpd.read_file(str(infile), layer="divides") + + print("Reading nexus data...") + df_nex = gpd.read_file(str(infile), layer="nexus") + + df_cat_org = df_cat.copy() + df_nex_org = df_nex.copy() + + df_cat.set_index('id', inplace=True) + + print("Finding upstream catchments...") + upstream = nupstream(df_cat_org, df_nex_org,df_cat.index) + + with open(outfile,'w') as fp: + for jcatch in upstream: + fp.write(f'{jcatch} : {upstream[jcatch]}\n') + + print(f'Done! - > {outfile}') + +def nupstream(divides,nexus,cat_list): + """ + Find the number of upstream catchments for each catchment + """ + upstream = {} + for j in range(len(cat_list)): + jcat_id = cat_list[j] + cat_up_ids, nexus_up_ids = get_upstream_ids(divides, nexus, jcat_id) + jnupstream = len(cat_up_ids) + upstream[jcat_id] = jnupstream + + upstream = dict(sorted(upstream.items(), key=lambda x:x[1], reverse=True)) + + return upstream + +if __name__ == "__main__": + main() \ No newline at end of file From 801d89de19383c3c9cb26b73e8445e2059e848fe Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Tue, 18 Apr 2023 15:11:41 -0500 Subject: [PATCH 044/105] Made arguments positional --- subsetting/README.md | 2 +- subsetting/ncatch_upstream.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/subsetting/README.md b/subsetting/README.md index 953443a..ee2ebc7 100644 --- a/subsetting/README.md +++ b/subsetting/README.md @@ -16,7 +16,7 @@ This means you might get a subset that isn't topologically consistent, so use at # ncatch_upstream To get a list of how many catchments are upstream of each catchment, enter the following command -`python subset.py -i -o ` +`python subset.py ` where `path_to_hydrofabric` can be a local geopkg, or a remote resource (s3 or http URL), and `path_to_output_text_file` is the full path to where you want the list output \ No newline at end of file diff --git a/subsetting/ncatch_upstream.py b/subsetting/ncatch_upstream.py index 7481f9b..4983bd4 100644 --- a/subsetting/ncatch_upstream.py +++ b/subsetting/ncatch_upstream.py @@ -1,12 +1,12 @@ import geopandas as gpd -import argparse +import argparse, os from subset import get_upstream_ids def main(): #setup the argument parser parser = argparse.ArgumentParser() - parser.add_argument("-i", dest="infile", type=str, required=True, help="A gpkg file containing divides and nexus layers") - parser.add_argument("-o", dest="outfile", type=str, required=True, help="A text file 
containing the number of upstream catchments for each catchment") + parser.add_argument(dest="infile", type=str, help="A gpkg file containing divides and nexus layers") + parser.add_argument(dest="outfile", type=str, help="A text file containing the number of upstream catchments for each catchment") args = parser.parse_args() infile = args.infile @@ -27,6 +27,7 @@ def main(): upstream = nupstream(df_cat_org, df_nex_org,df_cat.index) with open(outfile,'w') as fp: + fp.write(f'Catchment IDs and the number of upstream catchments\nGenerated with file {os.path.basename(infile)}\n') for jcatch in upstream: fp.write(f'{jcatch} : {upstream[jcatch]}\n') From 6831f117067d2d047f48350db7fa6de9fbc3b9b9 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Tue, 18 Apr 2023 16:32:10 -0500 Subject: [PATCH 045/105] Tool for isolating forcing files based on the catchments within a geojson --- subsetting/subset_forcing.py | 49 ++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 subsetting/subset_forcing.py diff --git a/subsetting/subset_forcing.py b/subsetting/subset_forcing.py new file mode 100644 index 0000000..9199442 --- /dev/null +++ b/subsetting/subset_forcing.py @@ -0,0 +1,49 @@ +import argparse, os, json + +def main(): + """ + Find forcing files in a directory that match the catchments within a catchment.geojson + + """ + #setup the argument parser + parser = argparse.ArgumentParser() + parser.add_argument(dest="forcing_dir", type=str, help="Path to forcing files") + parser.add_argument(dest="forcing_dir_out", type=str, help="Path to output the forcing files subset") + parser.add_argument(dest="catchment_file", type=str, help="A catchment geojson file") + args = parser.parse_args() + + indir = args.forcing_dir + outdir = args.forcing_dir_out + catch_file = args.catchment_file + + if not os.path.exists(outdir): + os.system(f'mkdir {outdir}') + + forcing_files = os.listdir(indir) + + print("Reading catchment data...") + with open(catch_file) as fp: + data = json.load(fp) + + # User should validate the catch file. 
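# Editor's aside (illustrative sketch, not part of the patch series): the comment
# above notes that the catchment file should be validated. A dependency-free check
# of just the structure this loop relies on might look like the following; the
# function name is hypothetical.
def validate_catchment_geojson(data: dict) -> None:
    feats = data.get("features")
    if not feats:
        raise ValueError("catchment geojson has no 'features' list")
    for jfeat in feats:
        # 'id' may sit at the feature level or under 'properties', matching the
        # geopandas/pydantic discrepancy handled in the loop below.
        if "id" not in jfeat and "id" not in jfeat.get("properties", {}):
            raise ValueError("feature is missing an 'id'")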
+ # Would do here with ngen-cal, just don't want to create the dependency + feats = data['features'] + forcing_out = [] + for jfeat in feats: + found = False + try: # Geopandas/pydantic descrepancy + cat_id = jfeat['id'] + except: + cat_id = jfeat['properties']['id'] + for jforcing in forcing_files: + if jforcing.find(cat_id) >= 0: + found = True + forcing_out.append(jforcing) + os.system(f'cp {os.path.join(indir,jforcing)} {os.path.join(outdir,jforcing)}') + if not found: + print(f'Couldn\'t find forcing file for {cat_id}!') + else: + print(f'Found forcing file for {cat_id}!') + +if __name__ == "__main__": + main() \ No newline at end of file From 72a6e0ec97706113eccd54eab0cf58a50bcc87d7 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Tue, 18 Apr 2023 19:30:44 -0500 Subject: [PATCH 046/105] Removed unnecessary copy --- subsetting/ncatch_upstream.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/subsetting/ncatch_upstream.py b/subsetting/ncatch_upstream.py index 4983bd4..51c9032 100644 --- a/subsetting/ncatch_upstream.py +++ b/subsetting/ncatch_upstream.py @@ -18,13 +18,10 @@ def main(): print("Reading nexus data...") df_nex = gpd.read_file(str(infile), layer="nexus") - df_cat_org = df_cat.copy() - df_nex_org = df_nex.copy() - df_cat.set_index('id', inplace=True) print("Finding upstream catchments...") - upstream = nupstream(df_cat_org, df_nex_org,df_cat.index) + upstream = nupstream(df_cat.reset_index(), df_nex.reset_index(),df_cat.index) with open(outfile,'w') as fp: fp.write(f'Catchment IDs and the number of upstream catchments\nGenerated with file {os.path.basename(infile)}\n') From 0b7d4118b4eb6aedd44b8e373fbf615a5b3aa267 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Tue, 25 Apr 2023 17:10:09 -0500 Subject: [PATCH 047/105] configurable python script to generate catchment forcing files for ngen --- prep_hydrofab_forcings_ngen.py | 465 +++++++++++++++++++++++++++++++++ user_input_ngen.json | 15 ++ 2 files changed, 480 insertions(+) create mode 100644 prep_hydrofab_forcings_ngen.py create mode 100644 user_input_ngen.json diff --git a/prep_hydrofab_forcings_ngen.py b/prep_hydrofab_forcings_ngen.py new file mode 100644 index 0000000..b423883 --- /dev/null +++ b/prep_hydrofab_forcings_ngen.py @@ -0,0 +1,465 @@ + +# https://github.com/jameshalgren/data-access-examples/blob/DONOTMERGE_VPU16/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb + +# !pip install --upgrade google-api-python-client +# !pip install --upgrade google-cloud-storage + +import pickle +import time +import pandas as pd +import argparse, os, json +import gc +from pathlib import Path +import geopandas as gpd +import pandas as pd +import numpy as np +import xarray as xr +from google.cloud import storage +from rasterio.io import MemoryFile +from rasterio.features import rasterize + +from nwm_filenames.listofnwmfilenames import create_file_list + +TEMPLATE_BLOB_NAME = ( + "nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" +) +NWM_BUCKET = "national-water-model" + +# WKT strings extracted from NWM grids +CONUS_NWM_WKT = 'PROJCS["Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]], \ +PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ +PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-97.0],PARAMETER["standard_parallel_1",30.0],\ +PARAMETER["standard_parallel_2",60.0],PARAMETER["latitude_of_origin",40.0],UNIT["Meter",1.0]]' + +HI_NWM_WKT 
= 'PROJCS["Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]],\ +PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ +PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-157.42],PARAMETER["standard_parallel_1",10.0],\ +PARAMETER["standard_parallel_2",30.0],PARAMETER["latitude_of_origin",20.6],UNIT["Meter",1.0]]' + +PR_NWM_WKT = 'PROJCS["Sphere_Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]],\ +PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ +PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-65.91],PARAMETER["standard_parallel_1",18.1],\ +PARAMETER["standard_parallel_2",18.1],PARAMETER["latitude_of_origin",18.1],UNIT["Meter",1.0]]' + +# paths +CACHE_DIR = Path(Path.home(), "code", "data_access_examples", "raw_forcing_data") +NWM_CACHE_DIR = os.path.join(CACHE_DIR, "nwm") +USGS_CACHE_DIR = os.path.join(CACHE_DIR, "usgs") +GEO_CACHE_DIR = os.path.join(CACHE_DIR, "geo") + +NWM_CACHE_H5 = os.path.join(NWM_CACHE_DIR, "gcp_client.h5") + +PARQUET_CACHE_DIR = os.path.join(CACHE_DIR, "parquet") +MEDIUM_RANGE_FORCING_PARQUET = os.path.join(PARQUET_CACHE_DIR, "forcing_medium_range") +FORCING_ANALYSIS_ASSIM_PARQUET = os.path.join( + PARQUET_CACHE_DIR, "forcing_analysis_assim" +) +MEDIUM_RANGE_PARQUET = os.path.join(PARQUET_CACHE_DIR, "medium_range") +USGS_PARQUET = os.path.join(PARQUET_CACHE_DIR, "usgs") + +HUC10_SHP_FILEPATH = os.path.join(GEO_CACHE_DIR, "wbdhu10_conus.shp") +HUC10_PARQUET_FILEPATH = os.path.join(GEO_CACHE_DIR, "wbdhu10_conus.parquet") +HUC10_MEDIUM_RANGE_WEIGHTS_FILEPATH = os.path.join( + GEO_CACHE_DIR, "wbdhu10_medium_range_weights.pkl" +) + +ROUTE_LINK_FILE = os.path.join(NWM_CACHE_DIR, "RouteLink_CONUS.nc") +ROUTE_LINK_PARQUET = os.path.join(NWM_CACHE_DIR, "route_link_conus.parquet") + + +def parquet_to_gdf(parquet_filepath: str) -> gpd.GeoDataFrame: + gdf = gpd.read_parquet(parquet_filepath) + return gdf + +def get_cache_dir(create: bool = True): + if not os.path.exists(NWM_CACHE_DIR) and create: + os.mkdir(NWM_CACHE_DIR) + if not os.path.exists(NWM_CACHE_DIR): + raise NotADirectoryError + return NWM_CACHE_DIR + +def make_parent_dir(filepath): + Path(filepath).parent.mkdir(parents=True, exist_ok=True) + +def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: + """Retrieve a blob from the data service as xarray.Dataset. + Based largely on OWP HydroTools. + Parameters + ---------- + blob_name: str, required + Name of blob to retrieve. + use_cache: bool, default True + If cache should be used. + If True, checks to see if file is in cache, and + If fetched from remote, will save to cache. + Returns + ------- + ds : xarray.Dataset + The data stored in the blob. + """ + # TODO: Check to see if this does any better than kerchunk + # the caching should help, but probably needs to be managed to function asynchronously. + # Perhaps if the files is not cached, we can create the dataset from + # kerchunk with a remote path and then asynchronously do a download to cache it + # for next time. The hypothesis would be that the download speed will not be any slower than + # just accessing the file remotely. 
+ nc_filepath = os.path.join(get_cache_dir(), blob_name) + make_parent_dir(nc_filepath) + + # If the file exists and use_cache = True + if os.path.exists(nc_filepath) and use_cache: + # Get dataset from cache + ds = xr.load_dataset( + nc_filepath, + engine="h5netcdf", + ) + return ds + else: + # Get raw bytes + raw_bytes = get_blob(blob_name) + # Create Dataset + ds = xr.load_dataset( + MemoryFile(raw_bytes), + engine="h5netcdf", + ) + if use_cache: + # Subset and cache + ds["RAINRATE"].to_netcdf( + nc_filepath, + engine="h5netcdf", + ) + return ds + +def generate_weights_file( + gdf: gpd.GeoDataFrame, + src: xr.DataArray, + weights_filepath: str, + crosswalk_dict_key: str, +): + """Generate a weights file.""" + + gdf_proj = gdf.to_crs(CONUS_NWM_WKT) + + crosswalk_dict = {} + + # This is a probably a really poor performing way to do this + # TODO: Consider vectorizing -- would require digging into the + # other end of these where we unpack the weights... + i = 0 + for index, row in gdf_proj.iterrows(): + geom_rasterize = rasterize( + [(row["geometry"], 1)], + out_shape=src.rio.shape, + transform=src.rio.transform(), + all_touched=True, + fill=0, # IS FILL 0 + dtype="uint8", + ) + if crosswalk_dict_key: + crosswalk_dict[row[crosswalk_dict_key]] = np.where(geom_rasterize == 1) + else: + crosswalk_dict[index] = np.where(geom_rasterize == 1) + + if i % 100 == 0: + perc = i/len(gdf_proj)*100 + print(f"{i}, {perc:.2f}%".ljust(40), end="\r") + if perc > 0.01: break + i += 1 + + with open(weights_filepath, "wb") as f: + # TODO: This is a dict of ndarrays, which could be easily stored as a set of parquet files for safekeeping. + pickle.dump(crosswalk_dict, f) + +def add_zonalstats_to_gdf_weights( + gdf: gpd.GeoDataFrame, + src: xr.DataArray, + weights_filepath: str, +) -> gpd.GeoDataFrame: + """Calculates zonal stats and adds to GeoDataFrame""" + + df = calc_zonal_stats_weights(src, weights_filepath) + gdf_map = gdf.merge(df, left_on="huc10", right_on="catchment_id") + + return gdf_map + + +def get_blob(blob_name: str, bucket: str = NWM_BUCKET) -> bytes: + """Retrieve a blob from the data service as bytes. + Based largely on OWP HydroTools. + Parameters + ---------- + blob_name : str, required + Name of blob to retrieve. + Returns + ------- + data : bytes + The data stored in the blob. + """ + # Setup anonymous client and retrieve blob data + client = storage.Client.create_anonymous_client() + bucket = client.bucket(bucket) + return bucket.blob(blob_name).download_as_bytes(timeout=120) + + +def calc_zonal_stats_weights( + src: xr.DataArray, + weights_filepath: str, +) -> pd.DataFrame: + """Calculates zonal stats""" + + # Open weights dict from pickle + # This could probably be done once and passed as a reference. + with open(weights_filepath, "rb") as f: + crosswalk_dict = pickle.load(f) + + r_array = src.values[0] + r_array[r_array == src.rio.nodata] = np.nan + + mean_dict = {} + for key, value in crosswalk_dict.items(): + mean_dict[key] = np.nanmean(r_array[value]) + + df = pd.DataFrame.from_dict(mean_dict, orient="index", columns=["value"]) + + df.reset_index(inplace=True, names="catchment_id") + + # This should not be needed, but without memory usage grows + del crosswalk_dict + del f + gc.collect() + + return df + + +def get_forcing_dict_RTIway( + pickle_file, # This would be a Feature list for parallel calling -- + # if there is a stored weights file, we use it + # (checking for an optional flag to force re-creation of the weights...) 
+ folder_prefix, + file_list, +): + + var = "RAINRATE" + reng = "rasterio" + filehandles = [ + xr.open_dataset(folder_prefix / f, engine=reng)[var] for f in file_list + ] + # filehandles = [get_dataset("data/" + f, use_cache=True) for f in file_list] + stats = [] + + for _i, f in enumerate(filehandles): + print(f"{_i}, {round(_i/len(file_list), 2)*100}".ljust(40), end="\r") + stats.append(calc_zonal_stats_weights(f, pickle_file)) + + [f.close() for f in filehandles] + return stats + + +def get_forcing_dict_RTIway2( + pickle_file, # This would be a Feature list for parallel calling -- + # if there is a stored weights file, we use it + # (checking for an optional flag to force re-creation of the weights...) + gpkg_divides, + folder_prefix, + filelist, + var_list, +): + reng = "rasterio" + pick_val = "value" + + df_dict = {} + dl_dict = {} + for _v in var_list: + df_dict[_v] = pd.DataFrame(index=gpkg_divides.index) + dl_dict[_v] = [] + + # ds_list = [] + for _i, _nc_file in enumerate(filelist): + # _nc_file = ("nwm.t00z.medium_range.forcing.f001.conus.nc") + _full_nc_file = folder_prefix.joinpath(_nc_file) + + try: + # with xr.open_dataset(_full_nc_file, engine=reng) as _xds: + with xr.open_dataset(_full_nc_file) as _xds: + # _xds = ds_list[_i] + # _xds.rio.write_crs(rasterio.crs.CRS.from_wkt(CONUS_NWM_WKT), inplace=True) + print(f"{_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") + for _v in var_list: + _src = _xds[_v] + _df_zonal_stats = calc_zonal_stats_weights(_src, pickle_file) + # if adding statistics back to original GeoDataFrame + # gdf3 = pd.concat([gpkg_divides, _df_zonal_stats], axis=1) + _df = pd.DataFrame(index=gpkg_divides.index) + _df[_xds.time.values[0]] = _df_zonal_stats[pick_val] + # TODO: This same line could add the new values directly + # to the same dictionary. But after adding about 100 of them, + # pandas starts to complain about degraded performance due to + # fragmentation of the dataframe. We tried it this was as a + # workaround, with the loop below to accomplish the concatenation. + dl_dict[_v].append(_df) + except: + print(f"No such file: {_full_nc_file}") + + for _v in var_list: + df_dict[_v] = pd.concat(dl_dict[_v], axis=1) + + # [_xds.close() for _xds in ds_list] + + return df_dict + + +def main(): + """ + Primary function to retrieve hydrofabrics data and convert it into files that can be ingested into ngen. + Also, the forcing data is retrieved. + + Inputs: JSON config file specifying start_date, end_date, and vpu + + Outputs: ngen catchment/nexus configs and forcing files + + Will store files in the same folder as the JSON config to run this script + """ + parser = argparse.ArgumentParser() + parser.add_argument(dest="infile", type=str, help="A json containing user inputs to run ngen") + args = parser.parse_args() + + # Take in user config + conf = json.load(open(args.infile)) + start_date = conf['forcing']['start_date'] + end_date = conf['forcing']['end_date'] + runinput = conf['forcing']['runinput'] + varinput = conf['forcing']['varinput'] + geoinput = conf['forcing']['geoinput'] + meminput = conf['forcing']['meminput'] + urlbaseinput = conf['forcing']['urlbaseinput'] + + vpu = conf['hydrofab']['vpu'] + # Subsetting ??? 
+ + top_dir = os.path.dirname(args.infile) + data_dir = os.path.join(top_dir,'raw_forcing_data') + output_dir = os.path.join(top_dir,'catchment_forcing_data') + + if not os.path.exists(data_dir): + os.system(f'mkdir {data_dir}') + + if not os.path.exists(output_dir): + os.system(f'mkdir {output_dir}') + + # Generate list of file names to retrieve for forcing data + n = 6 + fcst_cycle = [n*x for x in range(24//n)] + lead_time = [x+1 for x in range(n)] + + # TODO: These need to be in the configuration file + + + print(f'Creating list of file names to pull...') + nwm_forcing_files = create_file_list( + runinput, + varinput, + geoinput, + meminput, + start_date, + end_date, + fcst_cycle, + urlbaseinput, + lead_time, + ) + + print(f'Pulling files...') + local_files = [] + for jfile in nwm_forcing_files: + file_parts = jfile.split('/') + local_file = os.path.join(data_dir,file_parts[-1]) + local_files.append(local_file) + if os.path.exists(local_file): + continue + else: + command = f'wget -P {data_dir} -c https://storage.googleapis.com/national-water-model/{jfile}' + os.system(command) + + # TODO wget this if needed + gpkg = '/home/jlaser/code/data/nextgen_03W.gpkg' + ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) + src = ds["RAINRATE"] + + # Why are we converting to paquet and then back into geopandas dataframe? + polygonfile = gpd.read_file(gpkg, layer="divides") + parq_file = os.path.join(data_dir,"ng_03.parquet") + polygonfile.to_parquet(parq_file) + pkl_file = os.path.join(data_dir,"weights.pkl") + generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") + calc_zonal_stats_weights(src, pkl_file) + + var_list = [ + "U2D", + "V2D", + "LWDOWN", + "RAINRATE", + "T2D", + "Q2D", + "PSFC", + "SWDOWN", + ] + + just_files = [] + for jfile in local_files: + splt = jfile.split('/') # Need a way to do this that doesn't break on windows + just_files.append(splt[-1]) + + fd2 = get_forcing_dict_RTIway2( + pkl_file, + polygonfile, + Path(data_dir), + just_files, + var_list, + ) + + # pcp_var and pcp_var2 are indentical? + pcp_var = fd2["RAINRATE"] + lw_var = fd2["LWDOWN"] + sw_var = fd2["SWDOWN"] + sp_var = fd2["PSFC"] + tmp_var = fd2["T2D"] + u2d_var = fd2["U2D"] + v2d_var = fd2["V2D"] + pcp_var2 = fd2["RAINRATE"] + + ncatchments = len(polygonfile["id"]) + for _i in range(0, ncatchments): + + pcp_var_0 = pcp_var.transpose()[_i].rename("APCP_surface") + lw_var_0 = lw_var.transpose()[_i].rename("DLWRF_surface") + sw_var_0 = sw_var.transpose()[_i].rename("DSWRF_surface") + sp_var_0 = sp_var.transpose()[_i].rename("SPFH_2maboveground") + tmp_var_0 = tmp_var.transpose()[_i].rename("TMP_2maboveground") + u2d_var_0 = u2d_var.transpose()[_i].rename("UGRD_10maboveground") + v2d_var_0 = v2d_var.transpose()[_i].rename("VGRD_10maboveground") + pcp_var2_0 = pcp_var2.transpose()[_i].rename("precip_rate") ##BROKEN!! + + d = pd.concat( + [ + pcp_var_0, + lw_var_0, + sw_var_0, + sp_var_0, + tmp_var_0, + u2d_var_0, + v2d_var_0, + pcp_var2_0, + ], + axis=1, + ) + d.index.name = "time" + + id = polygonfile["id"][_i] + splt = id.split('-') + csvname = f"{output_dir}/cat{vpu}_{splt[1]}.csv" + d.to_csv(csvname) + + print(f'\n\nDone! 
Catchment forcing files have been generated for VPU {vpu} in {output_dir}\n\n') + +if __name__ == "__main__": + main() + \ No newline at end of file diff --git a/user_input_ngen.json b/user_input_ngen.json new file mode 100644 index 0000000..6529f90 --- /dev/null +++ b/user_input_ngen.json @@ -0,0 +1,15 @@ +{ + "forcing" : { + "start_date" : "20220822", + "end_date" : "20220822", + "runinput" : 2, + "varinput" : 5, + "geoinput" : 1, + "meminput" : 0, + "urlbaseinput" : null + }, + + "hydrofab" : { + "vpu" : 14 + } +} \ No newline at end of file From cd5346d7455969fb76da1bd546a003fd45ba9aa2 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Wed, 26 Apr 2023 17:29:04 -0500 Subject: [PATCH 048/105] Speed up:implemented new data -> data frame and write functions --- prep_hydrofab_forcings_ngen.py | 130 +++++++++++++++++++++++++++------ user_input_ngen.json | 14 ++-- 2 files changed, 116 insertions(+), 28 deletions(-) diff --git a/prep_hydrofab_forcings_ngen.py b/prep_hydrofab_forcings_ngen.py index b423883..d885829 100644 --- a/prep_hydrofab_forcings_ngen.py +++ b/prep_hydrofab_forcings_ngen.py @@ -1,13 +1,12 @@ - # https://github.com/jameshalgren/data-access-examples/blob/DONOTMERGE_VPU16/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb # !pip install --upgrade google-api-python-client # !pip install --upgrade google-cloud-storage import pickle -import time import pandas as pd import argparse, os, json +from sys import getsizeof import gc from pathlib import Path import geopandas as gpd @@ -17,8 +16,11 @@ from google.cloud import storage from rasterio.io import MemoryFile from rasterio.features import rasterize +import rasterio +import time from nwm_filenames.listofnwmfilenames import create_file_list +from ngen_forcing.process_nwm_forcing_to_ngen import * TEMPLATE_BLOB_NAME = ( "nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" @@ -141,7 +143,6 @@ def generate_weights_file( gdf_proj = gdf.to_crs(CONUS_NWM_WKT) crosswalk_dict = {} - # This is a probably a really poor performing way to do this # TODO: Consider vectorizing -- would require digging into the # other end of these where we unpack the weights... @@ -163,7 +164,6 @@ def generate_weights_file( if i % 100 == 0: perc = i/len(gdf_proj)*100 print(f"{i}, {perc:.2f}%".ljust(40), end="\r") - if perc > 0.01: break i += 1 with open(weights_filepath, "wb") as f: @@ -176,7 +176,7 @@ def add_zonalstats_to_gdf_weights( weights_filepath: str, ) -> gpd.GeoDataFrame: """Calculates zonal stats and adds to GeoDataFrame""" - + df = calc_zonal_stats_weights(src, weights_filepath) gdf_map = gdf.merge(df, left_on="huc10", right_on="catchment_id") @@ -200,10 +200,37 @@ def get_blob(blob_name: str, bucket: str = NWM_BUCKET) -> bytes: bucket = client.bucket(bucket) return bucket.blob(blob_name).download_as_bytes(timeout=120) +def calc_zonal_stats_weights_new( + src: np.ndarray, + weights_filepath: str, +) -> pd.DataFrame: + """Calculates zonal stats""" + + # Open weights dict from pickle + # This could probably be done once and passed as a reference. 
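# Editor's aside (illustrative sketch, not part of the patch series): the weights
# pickle loaded just below maps each catchment id to the (row, col) grid indices
# that rasterize() marked for that catchment, and the per-catchment value is a
# fancy-indexed nanmean over those cells. A minimal, self-contained example of the
# same pattern (all names and numbers here are synthetic):
import numpy as np

grid = np.arange(2 * 4 * 4, dtype=float).reshape(2, 4, 4)          # (nvar, y, x)
crosswalk = {"cat-1": (np.array([0, 0, 1]), np.array([0, 1, 0]))}  # id -> (rows, cols)
means = {cat: np.nanmean(grid[:, rows, cols], axis=1)              # one mean per variable
         for cat, (rows, cols) in crosswalk.items()}
# means["cat-1"] ~ array([ 1.667, 17.667])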
+ with open(weights_filepath, "rb") as f: + crosswalk_dict = pickle.load(f) + + nvar = src.shape[0] + mean_dict = {} + for key, value in crosswalk_dict.items(): + mean_dict[key] = np.zeros((nvar,),dtype=np.float64) + + mean_dict = {} + for key, value in crosswalk_dict.items(): + mean_dict[key] = np.nanmean(src[:,value[0],value[1]],axis=1) + + # This should not be needed, but without memory usage grows + del crosswalk_dict + del f + gc.collect() + + return mean_dict + def calc_zonal_stats_weights( src: xr.DataArray, - weights_filepath: str, + weights_filepath: str, ) -> pd.DataFrame: """Calculates zonal stats""" @@ -230,7 +257,6 @@ def calc_zonal_stats_weights( return df - def get_forcing_dict_RTIway( pickle_file, # This would be a Feature list for parallel calling -- # if there is a stored weights file, we use it @@ -254,6 +280,39 @@ def get_forcing_dict_RTIway( [f.close() for f in filehandles] return stats +def get_forcing_dict_JL( + pickle_file, + folder_prefix, + filelist, + var_list, + var_list_out +): + t1 = time.perf_counter() + df_by_t = [] + for _i, _nc_file in enumerate(filelist): + _full_nc_file = folder_prefix.joinpath(_nc_file) + print(f"Indexing data out of {_full_nc_file} {_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") + with xr.open_dataset(_full_nc_file) as _xds: + shp = _xds['U2D'].shape + data_allvars = np.zeros( + shape=(len(var_list),shp[1],shp[2]), + dtype=_xds['U2D'].dtype) + for var_dx, jvar in enumerate(var_list): + data_allvars[var_dx,:,:] = np.squeeze(_xds[jvar].values) + _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, pickle_file) + df_by_t.append(_df_zonal_stats) + + print(f'Reformating and converting data into dataframe') + dfs = {} + for jcat in list(df_by_t[0].keys()): + data_catch = [] + for jt in range(len(df_by_t)): + data_catch.append(df_by_t[jt][jcat]) + dfs[jcat] = pd.DataFrame(data_catch,columns = var_list_out) + + print(f"Indexing data and generating the dataframes (JL) {time.perf_counter() - t1:.2f} s") + + return dfs def get_forcing_dict_RTIway2( pickle_file, # This would be a Feature list for parallel calling -- @@ -264,6 +323,7 @@ def get_forcing_dict_RTIway2( filelist, var_list, ): + t1=time.perf_counter() reng = "rasterio" pick_val = "value" @@ -304,6 +364,7 @@ def get_forcing_dict_RTIway2( df_dict[_v] = pd.concat(dl_dict[_v], axis=1) # [_xds.close() for _xds in ds_list] + print(f"Indexing data and generating the dataframes (RTI) {time.perf_counter() - t1:.2f} s") return df_dict @@ -336,25 +397,20 @@ def main(): vpu = conf['hydrofab']['vpu'] # Subsetting ??? 
+ # Set paths and make directories if needed top_dir = os.path.dirname(args.infile) data_dir = os.path.join(top_dir,'raw_forcing_data') output_dir = os.path.join(top_dir,'catchment_forcing_data') - if not os.path.exists(data_dir): os.system(f'mkdir {data_dir}') - if not os.path.exists(output_dir): os.system(f'mkdir {output_dir}') # Generate list of file names to retrieve for forcing data + print(f'Creating list of file names to pull...') n = 6 fcst_cycle = [n*x for x in range(24//n)] lead_time = [x+1 for x in range(n)] - - # TODO: These need to be in the configuration file - - - print(f'Creating list of file names to pull...') nwm_forcing_files = create_file_list( runinput, varinput, @@ -367,7 +423,7 @@ def main(): lead_time, ) - print(f'Pulling files...') + # Check to see if we have files cached, if not wget them local_files = [] for jfile in nwm_forcing_files: file_parts = jfile.split('/') @@ -389,33 +445,64 @@ def main(): parq_file = os.path.join(data_dir,"ng_03.parquet") polygonfile.to_parquet(parq_file) pkl_file = os.path.join(data_dir,"weights.pkl") - generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") - calc_zonal_stats_weights(src, pkl_file) + print("Generating weights") + t1 = time.perf_counter() + # generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") + print(f"Generating the weights took {time.perf_counter() - t1:.2f} s") var_list = [ "U2D", "V2D", "LWDOWN", "RAINRATE", "T2D", - "Q2D", "PSFC", "SWDOWN", ] + var_list_out = [ + "UGRD_10maboveground", + "VGRD_10maboveground", + "DLWRF_surface", + "APCP_surface", + "TMP_2maboveground", + "SPFH_2maboveground", + "DSWRF_surface", + ] + just_files = [] for jfile in local_files: splt = jfile.split('/') # Need a way to do this that doesn't break on windows just_files.append(splt[-1]) + + fd2 = get_forcing_dict_JL( + pkl_file, + Path(data_dir), + just_files, + var_list, + var_list_out, + ) + ncatch_out = len(fd2.keys()) + + t0 = time.perf_counter() + for jcatch in fd2.keys(): + arr = fd2[jcatch] + splt = jcatch.split('-') + csvname = f"{output_dir}/cat{vpu}_{splt[1]}.csv" + arr.to_csv(csvname) + + print(f'JL write took {time.perf_counter() - t0:.2f} s') + fd2 = get_forcing_dict_RTIway2( pkl_file, polygonfile, Path(data_dir), just_files, var_list, - ) + ) + t0 = time.perf_counter() # pcp_var and pcp_var2 are indentical? pcp_var = fd2["RAINRATE"] lw_var = fd2["LWDOWN"] @@ -426,8 +513,7 @@ def main(): v2d_var = fd2["V2D"] pcp_var2 = fd2["RAINRATE"] - ncatchments = len(polygonfile["id"]) - for _i in range(0, ncatchments): + for _i in range(0, ncatch_out): pcp_var_0 = pcp_var.transpose()[_i].rename("APCP_surface") lw_var_0 = lw_var.transpose()[_i].rename("DLWRF_surface") @@ -458,6 +544,8 @@ def main(): csvname = f"{output_dir}/cat{vpu}_{splt[1]}.csv" d.to_csv(csvname) + print(f'RTI write took {time.perf_counter() - t0:.2f} s') + print(f'\n\nDone! 
Catchment forcing files have been generated for VPU {vpu} in {output_dir}\n\n') if __name__ == "__main__": diff --git a/user_input_ngen.json b/user_input_ngen.json index 6529f90..e3d5bef 100644 --- a/user_input_ngen.json +++ b/user_input_ngen.json @@ -1,15 +1,15 @@ { "forcing" : { - "start_date" : "20220822", - "end_date" : "20220822", - "runinput" : 2, - "varinput" : 5, - "geoinput" : 1, - "meminput" : 0, + "start_date" : "20220822", + "end_date" : "20220822", + "runinput" : 2, + "varinput" : 5, + "geoinput" : 1, + "meminput" : 0, "urlbaseinput" : null }, "hydrofab" : { - "vpu" : 14 + "vpu" : 14 } } \ No newline at end of file From 6a8cab6f59d7493226b224702de8315d3b528e43 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Wed, 26 Apr 2023 18:11:53 -0500 Subject: [PATCH 049/105] Included preop_rate (which is broken) --- prep_hydrofab_forcings_ngen.py | 131 +++++++++++++++++++-------------- 1 file changed, 77 insertions(+), 54 deletions(-) diff --git a/prep_hydrofab_forcings_ngen.py b/prep_hydrofab_forcings_ngen.py index d885829..017c922 100644 --- a/prep_hydrofab_forcings_ngen.py +++ b/prep_hydrofab_forcings_ngen.py @@ -68,6 +68,28 @@ ROUTE_LINK_FILE = os.path.join(NWM_CACHE_DIR, "RouteLink_CONUS.nc") ROUTE_LINK_PARQUET = os.path.join(NWM_CACHE_DIR, "route_link_conus.parquet") +# TODO: Implemenent these function to appropriately calculate precip_rate +def rho(temp): + """ + Calculate water density at temperature + """ + return 999.99399 + 0.04216485*temp - 0.007097451*(temp**2) + 0.00003509571*(temp**3) - 9.9037785E-8*(temp**4) + +def aorc_as_rate(dataFrame): + """ + Convert kg/m^2 -> m/s + """ + if isinstance(dataFrame.index, pd.MultiIndex): + interval = pd.Series(dataFrame.index.get_level_values(0)) + else: + interval = pd.Series(dataFrame.index) + interval = ( interval.shift(-1) - interval ) / np.timedelta64(1, 's') + interval.index = dataFrame.index + precip_rate = ( dataFrame['APCP_surface'].shift(-1) / dataFrame['TMP_2maboveground'].apply(rho) ) / interval + precip_rate.name = 'precip_rate' + return precip_rate + +###### def parquet_to_gdf(parquet_filepath: str) -> gpd.GeoDataFrame: gdf = gpd.read_parquet(parquet_filepath) @@ -291,7 +313,7 @@ def get_forcing_dict_JL( df_by_t = [] for _i, _nc_file in enumerate(filelist): _full_nc_file = folder_prefix.joinpath(_nc_file) - print(f"Indexing data out of {_full_nc_file} {_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") + print(f"Data indexing progress -> {_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") with xr.open_dataset(_full_nc_file) as _xds: shp = _xds['U2D'].shape data_allvars = np.zeros( @@ -302,7 +324,7 @@ def get_forcing_dict_JL( _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, pickle_file) df_by_t.append(_df_zonal_stats) - print(f'Reformating and converting data into dataframe') + print(f'Reformating and converting data into dataframe', end="\r") dfs = {} for jcat in list(df_by_t[0].keys()): data_catch = [] @@ -455,6 +477,7 @@ def main(): "V2D", "LWDOWN", "RAINRATE", + "RAINRATE", "T2D", "PSFC", "SWDOWN", @@ -465,6 +488,7 @@ def main(): "VGRD_10maboveground", "DLWRF_surface", "APCP_surface", + "precip_rate", # BROKEN "TMP_2maboveground", "SPFH_2maboveground", "DSWRF_surface", @@ -483,7 +507,6 @@ def main(): var_list, var_list_out, ) - ncatch_out = len(fd2.keys()) t0 = time.perf_counter() for jcatch in fd2.keys(): @@ -494,57 +517,57 @@ def main(): print(f'JL write took {time.perf_counter() - t0:.2f} s') - fd2 = get_forcing_dict_RTIway2( - pkl_file, - polygonfile, - Path(data_dir), - just_files, - 
var_list, - ) - - t0 = time.perf_counter() - # pcp_var and pcp_var2 are indentical? - pcp_var = fd2["RAINRATE"] - lw_var = fd2["LWDOWN"] - sw_var = fd2["SWDOWN"] - sp_var = fd2["PSFC"] - tmp_var = fd2["T2D"] - u2d_var = fd2["U2D"] - v2d_var = fd2["V2D"] - pcp_var2 = fd2["RAINRATE"] - - for _i in range(0, ncatch_out): - - pcp_var_0 = pcp_var.transpose()[_i].rename("APCP_surface") - lw_var_0 = lw_var.transpose()[_i].rename("DLWRF_surface") - sw_var_0 = sw_var.transpose()[_i].rename("DSWRF_surface") - sp_var_0 = sp_var.transpose()[_i].rename("SPFH_2maboveground") - tmp_var_0 = tmp_var.transpose()[_i].rename("TMP_2maboveground") - u2d_var_0 = u2d_var.transpose()[_i].rename("UGRD_10maboveground") - v2d_var_0 = v2d_var.transpose()[_i].rename("VGRD_10maboveground") - pcp_var2_0 = pcp_var2.transpose()[_i].rename("precip_rate") ##BROKEN!! - - d = pd.concat( - [ - pcp_var_0, - lw_var_0, - sw_var_0, - sp_var_0, - tmp_var_0, - u2d_var_0, - v2d_var_0, - pcp_var2_0, - ], - axis=1, - ) - d.index.name = "time" - - id = polygonfile["id"][_i] - splt = id.split('-') - csvname = f"{output_dir}/cat{vpu}_{splt[1]}.csv" - d.to_csv(csvname) - - print(f'RTI write took {time.perf_counter() - t0:.2f} s') + # fd2 = get_forcing_dict_RTIway2( + # pkl_file, + # polygonfile, + # Path(data_dir), + # just_files, + # var_list, + # ) + + # t0 = time.perf_counter() + # # pcp_var and pcp_var2 are indentical? + # pcp_var = fd2["RAINRATE"] + # lw_var = fd2["LWDOWN"] + # sw_var = fd2["SWDOWN"] + # sp_var = fd2["PSFC"] + # tmp_var = fd2["T2D"] + # u2d_var = fd2["U2D"] + # v2d_var = fd2["V2D"] + # pcp_var2 = fd2["RAINRATE"] + + # for _i in range(0, ncatch_out): + + # pcp_var_0 = pcp_var.transpose()[_i].rename("APCP_surface") + # lw_var_0 = lw_var.transpose()[_i].rename("DLWRF_surface") + # sw_var_0 = sw_var.transpose()[_i].rename("DSWRF_surface") + # sp_var_0 = sp_var.transpose()[_i].rename("SPFH_2maboveground") + # tmp_var_0 = tmp_var.transpose()[_i].rename("TMP_2maboveground") + # u2d_var_0 = u2d_var.transpose()[_i].rename("UGRD_10maboveground") + # v2d_var_0 = v2d_var.transpose()[_i].rename("VGRD_10maboveground") + # pcp_var2_0 = pcp_var2.transpose()[_i].rename("precip_rate") ##BROKEN!! + + # d = pd.concat( + # [ + # pcp_var_0, + # lw_var_0, + # sw_var_0, + # sp_var_0, + # tmp_var_0, + # u2d_var_0, + # v2d_var_0, + # pcp_var2_0, + # ], + # axis=1, + # ) + # d.index.name = "time" + + # id = polygonfile["id"][_i] + # splt = id.split('-') + # csvname = f"{output_dir}/cat{vpu}_{splt[1]}.csv" + # d.to_csv(csvname) + + # print(f'RTI write took {time.perf_counter() - t0:.2f} s') print(f'\n\nDone! 
Catchment forcing files have been generated for VPU {vpu} in {output_dir}\n\n') From 79b21d701b377bbe5ab4d050279c22b1b3d6ea65 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 1 May 2023 13:19:41 -0500 Subject: [PATCH 050/105] user input markdown --- user_input_ngen.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 user_input_ngen.md diff --git a/user_input_ngen.md b/user_input_ngen.md new file mode 100644 index 0000000..d25fc7d --- /dev/null +++ b/user_input_ngen.md @@ -0,0 +1,33 @@ +# Manual for ngen user options + +## Example +filename = 'user_input_ngen.json' + +contents: + + { + "forcing" : { + "start_date" : "20220822", + "end_date" : "20220822", + "runinput" : 2, + "varinput" : 5, + "geoinput" : 1, + "meminput" : 0, + "urlbaseinput" : null + }, + + "hydrofab" : { + "vpu" : 14 + } + } + +### forcing +| Field Name | Data Type | Description | +| --- | --- | --- | +| start_date | `string` | YYYYMMDD | +| end_date | `string` | YYYYMMDD | +| runinput | `int` |
    1. short_range
    2. medium_range
    3. medium_range_no_da
    4. long_range
    5. analysis_assim
    6. analysis_assim_extend
    7. analysis_assim_extend_no_da
    8. analysis_assim_long
    9. analysis_assim_long_no_da
    10. analysis_assim_no_da
    11. short_range_no_da
    | +| varinput | `int` |
    1. channel_rt: for real-time channel data
    2. land: for land data
    3. reservoir: for reservoir data
    4. terrain_rt: for real-time terrain data
    5. forcing: for forcing data
    | +| geoinput | `int` |
    1. conus: for continental US
    2. hawaii: for Hawaii
    3. puertorico: for Puerto Rico
    | +| meminput | `int` |
    1. mem_1
    2. mem_2
    3. mem_3
    4. mem_4
    5. mem_5
    6. mem_6
    7. mem_7
    | +| urlbaseinput | `int` |
    1. Empty string: use local files
    2. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod/: for real-time operational data from NOAA
    3. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/post-processed/WMS/: for post-processed data from NOAA's Web Map Service
    4. https://storage.googleapis.com/national-water-model/: for input/output data stored on Google Cloud Storage
    5. https://storage.cloud.google.com/national-water-model/: for input/output data stored on Google Cloud Storage
    6. gs://national-water-model/: for input/output data stored on Google Cloud Storage
    7. https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/: for retrospective data from AWS S3
    8. s3://noaa-nwm-retrospective-2-1-pds/model_output/: for retrospective data from AWS S3
    | From 23e4ec814adf8fe62ff51bd787b551cf3c663988 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 1 May 2023 13:26:59 -0500 Subject: [PATCH 051/105] added hydrofab field --- user_input_ngen.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/user_input_ngen.md b/user_input_ngen.md index d25fc7d..953a520 100644 --- a/user_input_ngen.md +++ b/user_input_ngen.md @@ -31,3 +31,9 @@ contents: | geoinput | `int` |
    1. conus: for continental US
    2. hawaii: for Hawaii
    3. puertorico: for Puerto Rico
    | | meminput | `int` |
    1. mem_1
    2. mem_2
    3. mem_3
    4. mem_4
    5. mem_5
    6. mem_6
    7. mem_7
    | | urlbaseinput | `int` |
    1. Empty string: use local files
    2. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod/: for real-time operational data from NOAA
    3. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/post-processed/WMS/: for post-processed data from NOAA's Web Map Service
    4. https://storage.googleapis.com/national-water-model/: for input/output data stored on Google Cloud Storage
    5. https://storage.cloud.google.com/national-water-model/: for input/output data stored on Google Cloud Storage
    6. gs://national-water-model/: for input/output data stored on Google Cloud Storage
    7. https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/: for retrospective data from AWS S3
    8. s3://noaa-nwm-retrospective-2-1-pds/model_output/: for retrospective data from AWS S3
    | + + +### hydrofab +| Field Name | Data Type | Description | +| --- | --- | --- | +| vpu | `int` | Check here for map of VPUs https://noaa-owp.github.io/hydrofabric/articles/data_access.html | From 6297d643700f61d7dba0ffc3b3b2ced7aee3ab7a Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 1 May 2023 14:33:09 -0500 Subject: [PATCH 052/105] gitignore initial commit --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c6e39ed --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.vscode/ +data/* +nwm_filenames/__pycache__/ +subsetting/__pycache__/ +venv/ \ No newline at end of file From 2475605597eea5d71a148e76324374377ea92f81 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 1 May 2023 15:38:15 -0500 Subject: [PATCH 053/105] Added options and explanations --- user_input_ngen.json | 10 +++++++--- user_input_ngen.md | 9 ++++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/user_input_ngen.json b/user_input_ngen.json index e3d5bef..a6e757f 100644 --- a/user_input_ngen.json +++ b/user_input_ngen.json @@ -6,10 +6,14 @@ "varinput" : 5, "geoinput" : 1, "meminput" : 0, - "urlbaseinput" : null + "urlbaseinput" : 3 }, "hydrofab" : { - "vpu" : 14 - } + "vpu" : "03W" + }, + + "verbose" : true, + "output_dir" : "local", + "cache" : true } \ No newline at end of file diff --git a/user_input_ngen.md b/user_input_ngen.md index 953a520..eacadf6 100644 --- a/user_input_ngen.md +++ b/user_input_ngen.md @@ -36,4 +36,11 @@ contents: ### hydrofab | Field Name | Data Type | Description | | --- | --- | --- | -| vpu | `int` | Check here for map of VPUs https://noaa-owp.github.io/hydrofabric/articles/data_access.html | +| vpu | `string` | Check here for map of VPUs https://noaa-owp.github.io/hydrofabric/articles/data_access.html | + +### other options +| Field Name | Data Type | Description | +| --- | --- | --- | +| verbose | `bool` | Print raw forcing files | +| output_dir | `string` |
    1. "local" : output to ./data/catchment_forcing_data/
    | +| cache | `bool` |
  • true: store forcing files locally
  • false: interact with forcing files remotely
  • | From 5bf52f13d63f881a9ac95658918329ca2b94ad9a Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 1 May 2023 15:40:22 -0500 Subject: [PATCH 054/105] Updated example --- user_input_ngen.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/user_input_ngen.md b/user_input_ngen.md index eacadf6..3c6b82e 100644 --- a/user_input_ngen.md +++ b/user_input_ngen.md @@ -13,12 +13,16 @@ contents: "varinput" : 5, "geoinput" : 1, "meminput" : 0, - "urlbaseinput" : null + "urlbaseinput" : 3 }, "hydrofab" : { - "vpu" : 14 - } + "vpu" : "03W" + }, + + "verbose" : true, + "output_dir" : "local", + "cache" : true } ### forcing From 634a3c5fcdb24b41782263806d5f6f92fda7e703 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 1 May 2023 15:55:37 -0500 Subject: [PATCH 055/105] Made many of the operations conditional and added options --- prep_hydrofab_forcings_ngen.py | 197 ++++++++++++++++----------------- 1 file changed, 93 insertions(+), 104 deletions(-) diff --git a/prep_hydrofab_forcings_ngen.py b/prep_hydrofab_forcings_ngen.py index 017c922..172b585 100644 --- a/prep_hydrofab_forcings_ngen.py +++ b/prep_hydrofab_forcings_ngen.py @@ -16,7 +16,6 @@ from google.cloud import storage from rasterio.io import MemoryFile from rasterio.features import rasterize -import rasterio import time from nwm_filenames.listofnwmfilenames import create_file_list @@ -44,7 +43,7 @@ PARAMETER["standard_parallel_2",18.1],PARAMETER["latitude_of_origin",18.1],UNIT["Meter",1.0]]' # paths -CACHE_DIR = Path(Path.home(), "code", "data_access_examples", "raw_forcing_data") +CACHE_DIR = Path(Path.home(), "code", "data_access_examples", "data", "raw_forcing_data") NWM_CACHE_DIR = os.path.join(CACHE_DIR, "nwm") USGS_CACHE_DIR = os.path.join(CACHE_DIR, "usgs") GEO_CACHE_DIR = os.path.join(CACHE_DIR, "geo") @@ -390,6 +389,12 @@ def get_forcing_dict_RTIway2( return df_dict +def wget(cmd,name): + resp = os.system(cmd) + if resp > 0: + raise Exception (f'\nwget failed! Tried: {name}\n') + else: + print(f'Successful download of {name}') def main(): """ @@ -402,31 +407,42 @@ def main(): Will store files in the same folder as the JSON config to run this script """ + + t00 = time.perf_counter() + parser = argparse.ArgumentParser() parser.add_argument(dest="infile", type=str, help="A json containing user inputs to run ngen") - args = parser.parse_args() + args = parser.parse_args() # Take in user config conf = json.load(open(args.infile)) - start_date = conf['forcing']['start_date'] - end_date = conf['forcing']['end_date'] - runinput = conf['forcing']['runinput'] - varinput = conf['forcing']['varinput'] - geoinput = conf['forcing']['geoinput'] - meminput = conf['forcing']['meminput'] + start_date = conf['forcing']['start_date'] + end_date = conf['forcing']['end_date'] + runinput = conf['forcing']['runinput'] + varinput = conf['forcing']['varinput'] + geoinput = conf['forcing']['geoinput'] + meminput = conf['forcing']['meminput'] urlbaseinput = conf['forcing']['urlbaseinput'] + vpu = conf['hydrofab']['vpu'] + ii_verbose = conf['verbose'] + output_dir = conf['output_dir'] + ii_cache = conf['output_dir'] - vpu = conf['hydrofab']['vpu'] - # Subsetting ??? + # TODO: Subsetting! 
+ # # Set paths and make directories if needed top_dir = os.path.dirname(args.infile) - data_dir = os.path.join(top_dir,'raw_forcing_data') - output_dir = os.path.join(top_dir,'catchment_forcing_data') - if not os.path.exists(data_dir): - os.system(f'mkdir {data_dir}') - if not os.path.exists(output_dir): - os.system(f'mkdir {output_dir}') + if not os.path.exists(CACHE_DIR): + os.system(f'mkdir {CACHE_DIR}') + + # TODO: Be able to write to anywhere we want (especially AWS bucket) + if output_dir == "local": + output_dir = Path(top_dir,'data/catchment_forcing_data') + if not os.path.exists(output_dir): + os.system(f'mkdir {output_dir}') + else: + raise NotImplementedError(f"{output_dir} is not an option for output_dir") # Generate list of file names to retrieve for forcing data print(f'Creating list of file names to pull...') @@ -446,32 +462,63 @@ def main(): ) # Check to see if we have files cached, if not wget them - local_files = [] - for jfile in nwm_forcing_files: - file_parts = jfile.split('/') - local_file = os.path.join(data_dir,file_parts[-1]) - local_files.append(local_file) - if os.path.exists(local_file): - continue - else: - command = f'wget -P {data_dir} -c https://storage.googleapis.com/national-water-model/{jfile}' - os.system(command) - - # TODO wget this if needed - gpkg = '/home/jlaser/code/data/nextgen_03W.gpkg' - ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) - src = ds["RAINRATE"] - - # Why are we converting to paquet and then back into geopandas dataframe? - polygonfile = gpd.read_file(gpkg, layer="divides") - parq_file = os.path.join(data_dir,"ng_03.parquet") - polygonfile.to_parquet(parq_file) - pkl_file = os.path.join(data_dir,"weights.pkl") + if ii_cache: + local_files = [] + for jfile in nwm_forcing_files: + if ii_verbose: print(f'Looking for {jfile}') + file_parts = Path(jfile).parts + + local_file = os.path.join(CACHE_DIR,file_parts[-1]) + local_files.append(local_file) + if os.path.exists(local_file): + if ii_verbose: print(f'Found and using raw forcing file {local_file}') + continue + else: + if ii_verbose: print(f'Forcing file not found! Downloading {jfile}') + command = f'wget -P {CACHE_DIR} -c {jfile}' + wget(command,jfile) + + cache_files = [] + for jfile in local_files: + splt = Path(jfile).parts + cache_files.append(splt[-1]) + + forcing_files = cache_files # interacting with files locally + else: + forcing_files = nwm_forcing_files # interacting with files remotely + + # Do we need a parquet file? 
+ # parq_file = os.path.join(CACHE_DIR,"ng_03.parquet") + # polygonfile.to_parquet(parq_file) + + # Generate weight file only if one doesn't exist already + # Very time consuming so we don't want to do this if we can avoid it + pkl_file = os.path.join(CACHE_DIR,"weights.pkl") + if not os.path.exists(pkl_file): + # Search for geopackage that matches the requested VPU, if it exists + gpkg = None + for jfile in os.listdir(os.path.join(top_dir,'data')): + if jfile.find(vpu) >= 0: + gpkg = Path(top_dir,"data",jfile) + print(f'Found and using geopackge file {gpkg}') + if gpkg == None: + url = f'https://nextgen-hydrofabric.s3.amazonaws.com/05_nextgen/nextgen_{vpu}.gpkg' + command = f'wget -P {CACHE_DIR} -c {url}' + wget(command,url) + + print(f'Opening {gpkg}...') + polygonfile = gpd.read_file(gpkg, layer="divides") + + ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) + src = ds["RAINRATE"] + + print("Generating weights") + t1 = time.perf_counter() + generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") + print(f"Generating the weights took {time.perf_counter() - t1:.2f} s") + else: + print(f"Not creating weight file! Delete this if you want to create a new one: {pkl_file}") - print("Generating weights") - t1 = time.perf_counter() - # generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") - print(f"Generating the weights took {time.perf_counter() - t1:.2f} s") var_list = [ "U2D", "V2D", @@ -488,22 +535,16 @@ def main(): "VGRD_10maboveground", "DLWRF_surface", "APCP_surface", - "precip_rate", # BROKEN + "precip_rate", # BROKEN (Identical to APCP!) "TMP_2maboveground", "SPFH_2maboveground", "DSWRF_surface", ] - - just_files = [] - for jfile in local_files: - splt = jfile.split('/') # Need a way to do this that doesn't break on windows - just_files.append(splt[-1]) - fd2 = get_forcing_dict_JL( pkl_file, - Path(data_dir), - just_files, + CACHE_DIR, + forcing_files, var_list, var_list_out, ) @@ -516,60 +557,8 @@ def main(): arr.to_csv(csvname) print(f'JL write took {time.perf_counter() - t0:.2f} s') - - # fd2 = get_forcing_dict_RTIway2( - # pkl_file, - # polygonfile, - # Path(data_dir), - # just_files, - # var_list, - # ) - - # t0 = time.perf_counter() - # # pcp_var and pcp_var2 are indentical? - # pcp_var = fd2["RAINRATE"] - # lw_var = fd2["LWDOWN"] - # sw_var = fd2["SWDOWN"] - # sp_var = fd2["PSFC"] - # tmp_var = fd2["T2D"] - # u2d_var = fd2["U2D"] - # v2d_var = fd2["V2D"] - # pcp_var2 = fd2["RAINRATE"] - - # for _i in range(0, ncatch_out): - - # pcp_var_0 = pcp_var.transpose()[_i].rename("APCP_surface") - # lw_var_0 = lw_var.transpose()[_i].rename("DLWRF_surface") - # sw_var_0 = sw_var.transpose()[_i].rename("DSWRF_surface") - # sp_var_0 = sp_var.transpose()[_i].rename("SPFH_2maboveground") - # tmp_var_0 = tmp_var.transpose()[_i].rename("TMP_2maboveground") - # u2d_var_0 = u2d_var.transpose()[_i].rename("UGRD_10maboveground") - # v2d_var_0 = v2d_var.transpose()[_i].rename("VGRD_10maboveground") - # pcp_var2_0 = pcp_var2.transpose()[_i].rename("precip_rate") ##BROKEN!! - - # d = pd.concat( - # [ - # pcp_var_0, - # lw_var_0, - # sw_var_0, - # sp_var_0, - # tmp_var_0, - # u2d_var_0, - # v2d_var_0, - # pcp_var2_0, - # ], - # axis=1, - # ) - # d.index.name = "time" - - # id = polygonfile["id"][_i] - # splt = id.split('-') - # csvname = f"{output_dir}/cat{vpu}_{splt[1]}.csv" - # d.to_csv(csvname) - - # print(f'RTI write took {time.perf_counter() - t0:.2f} s') - print(f'\n\nDone! 
Catchment forcing files have been generated for VPU {vpu} in {output_dir}\n\n') + print(f'Total run time: {time.perf_counter() - t00:.2f} s') if __name__ == "__main__": main() From aa00098a78b6f750bf58c84538d5ad0338fc2028 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Wed, 3 May 2023 14:35:26 -0500 Subject: [PATCH 056/105] Added options to write to S3 bucket in either csv or parquet --- prep_hydrofab_forcings_ngen.py | 109 ++++++++++++++++++++++----------- user_input_ngen.json | 7 ++- user_input_ngen.md | 5 +- 3 files changed, 82 insertions(+), 39 deletions(-) diff --git a/prep_hydrofab_forcings_ngen.py b/prep_hydrofab_forcings_ngen.py index 172b585..d23df19 100644 --- a/prep_hydrofab_forcings_ngen.py +++ b/prep_hydrofab_forcings_ngen.py @@ -6,7 +6,8 @@ import pickle import pandas as pd import argparse, os, json -from sys import getsizeof +import pyarrow as pa +import pyarrow.parquet as pq import gc from pathlib import Path import geopandas as gpd @@ -17,6 +18,8 @@ from rasterio.io import MemoryFile from rasterio.features import rasterize import time +import boto3 +from io import StringIO, BytesIO from nwm_filenames.listofnwmfilenames import create_file_list from ngen_forcing.process_nwm_forcing_to_ngen import * @@ -182,9 +185,9 @@ def generate_weights_file( else: crosswalk_dict[index] = np.where(geom_rasterize == 1) - if i % 100 == 0: - perc = i/len(gdf_proj)*100 - print(f"{i}, {perc:.2f}%".ljust(40), end="\r") + # if i % 100 == 0: + # perc = i/len(gdf_proj)*100 + # print(f"{i}, {perc:.2f}%".ljust(40), end="\r") i += 1 with open(weights_filepath, "wb") as f: @@ -303,17 +306,14 @@ def get_forcing_dict_RTIway( def get_forcing_dict_JL( pickle_file, - folder_prefix, filelist, var_list, var_list_out ): t1 = time.perf_counter() df_by_t = [] - for _i, _nc_file in enumerate(filelist): - _full_nc_file = folder_prefix.joinpath(_nc_file) - print(f"Data indexing progress -> {_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") - with xr.open_dataset(_full_nc_file) as _xds: + for _i, _nc_file in enumerate(filelist): + with xr.open_dataset(_nc_file) as _xds: shp = _xds['U2D'].shape data_allvars = np.zeros( shape=(len(var_list),shp[1],shp[2]), @@ -322,8 +322,9 @@ def get_forcing_dict_JL( data_allvars[var_dx,:,:] = np.squeeze(_xds[jvar].values) _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, pickle_file) df_by_t.append(_df_zonal_stats) + print(f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}%", end="\r") - print(f'Reformating and converting data into dataframe', end="\r") + print(f'Reformating and converting data into dataframe') dfs = {} for jcat in list(df_by_t[0].keys()): data_catch = [] @@ -331,7 +332,7 @@ def get_forcing_dict_JL( data_catch.append(df_by_t[jt][jcat]) dfs[jcat] = pd.DataFrame(data_catch,columns = var_list_out) - print(f"Indexing data and generating the dataframes (JL) {time.perf_counter() - t1:.2f} s") + print(f"Indexing data and generating the dataframes (JL) {time.perf_counter() - t1:.2f}s") return dfs @@ -425,8 +426,18 @@ def main(): urlbaseinput = conf['forcing']['urlbaseinput'] vpu = conf['hydrofab']['vpu'] ii_verbose = conf['verbose'] - output_dir = conf['output_dir'] - ii_cache = conf['output_dir'] + bucket_type = conf['bucket_type'] + bucket_name = conf['bucket_name'] + file_prefix = conf['file_prefix'] + file_type = conf['file_type'] + ii_cache = conf['cache'] + + file_types = ['csv','parquet'] + assert file_type in file_types,f'{file_type} for file_type is not accepted! 
Accepted: {file_types}' + + bucket_types = ['local','S3'] + assert bucket_type in bucket_types,f'{bucket_type} for bucket_type is not accepted! Accepted: {bucket_types}' + # TODO: Subsetting! # @@ -435,14 +446,18 @@ def main(): top_dir = os.path.dirname(args.infile) if not os.path.exists(CACHE_DIR): os.system(f'mkdir {CACHE_DIR}') - - # TODO: Be able to write to anywhere we want (especially AWS bucket) - if output_dir == "local": - output_dir = Path(top_dir,'data/catchment_forcing_data') - if not os.path.exists(output_dir): - os.system(f'mkdir {output_dir}') - else: - raise NotImplementedError(f"{output_dir} is not an option for output_dir") + if not os.path.exists(CACHE_DIR): + raise Exception(f'Creating {CACHE_DIR} failed!') + + # Prep output directory + if bucket_type == "local": + bucket_path = Path(top_dir,file_prefix,bucket_name) + if not os.path.exists(bucket_path): + os.system(f'mkdir {bucket_path}') + if not os.path.exists(bucket_path): + raise Exception(f'Creating {bucket_path} failed!') + elif bucket_type == 'S3': + s3 = boto3.client('s3') # Generate list of file names to retrieve for forcing data print(f'Creating list of file names to pull...') @@ -461,8 +476,10 @@ def main(): lead_time, ) - # Check to see if we have files cached, if not wget them + # Download whole files and store locally if cache is true, + # otherwise index remotely and save catchment based forcings if ii_cache: + # Check to see if we have files cached, if not wget them local_files = [] for jfile in nwm_forcing_files: if ii_verbose: print(f'Looking for {jfile}') @@ -478,12 +495,7 @@ def main(): command = f'wget -P {CACHE_DIR} -c {jfile}' wget(command,jfile) - cache_files = [] - for jfile in local_files: - splt = Path(jfile).parts - cache_files.append(splt[-1]) - - forcing_files = cache_files # interacting with files locally + forcing_files = local_files # interacting with files locally else: forcing_files = nwm_forcing_files # interacting with files remotely @@ -543,21 +555,46 @@ def main(): fd2 = get_forcing_dict_JL( pkl_file, - CACHE_DIR, forcing_files, var_list, var_list_out, ) + # Write CSVs to file t0 = time.perf_counter() - for jcatch in fd2.keys(): - arr = fd2[jcatch] + write_int = 100 + write_break = 1000 + for j, jcatch in enumerate(fd2.keys()): + df = fd2[jcatch] splt = jcatch.split('-') - csvname = f"{output_dir}/cat{vpu}_{splt[1]}.csv" - arr.to_csv(csvname) - - print(f'JL write took {time.perf_counter() - t0:.2f} s') - print(f'\n\nDone! Catchment forcing files have been generated for VPU {vpu} in {output_dir}\n\n') + + if bucket_type == 'local': + if file_type == 'csv': + csvname = Path(bucket_path,f"cat{vpu}_{splt[1]}.csv") + df.to_csv(csvname) + if file_type == 'parquet': + parq_file = Path(bucket_path,f"cat{vpu}_{splt[1]}.parquet") + df.to_parquet(parq_file) + elif bucket_type == 'S3': + buf = BytesIO() + if file_type == 'parquet': + parq_file = f"cat{vpu}_{splt[1]}.parquet" + df.to_parquet(buf) + elif file_type == 'csv': + csvname = f"cat{vpu}_{splt[1]}.csv" + df.to_csv(buf, index=False) + buf.seek(0) + key_name = f'{file_prefix}{csvname}' + s3.put_object(Bucket=bucket_name, Key=key_name, Body=buf.getvalue()) + + if (j+1) % write_int == 0: + print(f"{j+1} files written out of {len(fd2)}, {(j+1)/len(fd2)*100:.2f}%", end="\r") + + if j == write_break: break + + print(f'{file_type} write took {time.perf_counter() - t0:.2f} s\n') + + print(f'\n\nDone! 
Catchment forcing files have been generated for VPU {vpu} in {bucket_type}\n\n') print(f'Total run time: {time.perf_counter() - t00:.2f} s') if __name__ == "__main__": diff --git a/user_input_ngen.json b/user_input_ngen.json index a6e757f..19bfa9b 100644 --- a/user_input_ngen.json +++ b/user_input_ngen.json @@ -13,7 +13,10 @@ "vpu" : "03W" }, - "verbose" : true, - "output_dir" : "local", + "verbose" : false, + "bucket_type" : "local", + "bucket_name" : "ciroh-devconf", + "file_prefix" : "data/", + "file_type" : "csv", "cache" : true } \ No newline at end of file diff --git a/user_input_ngen.md b/user_input_ngen.md index 3c6b82e..fef5ac0 100644 --- a/user_input_ngen.md +++ b/user_input_ngen.md @@ -46,5 +46,8 @@ contents: | Field Name | Data Type | Description | | --- | --- | --- | | verbose | `bool` | Print raw forcing files | -| output_dir | `string` |
    1. "local" : output to ./data/catchment_forcing_data/
    | +| output_dir | `string` |
    1. "local" : write to local directory
    2. "S3" : output to AWS S3 bucket
    | +| bucket_name | `string` | If local, this is the name of the folder the data will be placed in. If S3, this is the name of S3 bucket, which must exist already. | +| file_prefix | `string` | If local, this is the relative path to the bucket_name folder. If S3, this is the relative path within the S3 bucket_name bucket to store files | +| file_type | `string` |
    1. "csv" : write data as csv files/
    2. "parquet" : write data as parquet files
    | | cache | `bool` |
  • true: store forcing files locally
  • false: interact with forcing files remotely
  • | From 4106e767615cf0f416fdc26471a2760e97be47c6 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Wed, 3 May 2023 14:36:33 -0500 Subject: [PATCH 057/105] removed precip_rate functions --- prep_hydrofab_forcings_ngen.py | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/prep_hydrofab_forcings_ngen.py b/prep_hydrofab_forcings_ngen.py index d23df19..ae815e2 100644 --- a/prep_hydrofab_forcings_ngen.py +++ b/prep_hydrofab_forcings_ngen.py @@ -6,8 +6,6 @@ import pickle import pandas as pd import argparse, os, json -import pyarrow as pa -import pyarrow.parquet as pq import gc from pathlib import Path import geopandas as gpd @@ -19,7 +17,7 @@ from rasterio.features import rasterize import time import boto3 -from io import StringIO, BytesIO +from io import BytesIO from nwm_filenames.listofnwmfilenames import create_file_list from ngen_forcing.process_nwm_forcing_to_ngen import * @@ -70,29 +68,6 @@ ROUTE_LINK_FILE = os.path.join(NWM_CACHE_DIR, "RouteLink_CONUS.nc") ROUTE_LINK_PARQUET = os.path.join(NWM_CACHE_DIR, "route_link_conus.parquet") -# TODO: Implemenent these function to appropriately calculate precip_rate -def rho(temp): - """ - Calculate water density at temperature - """ - return 999.99399 + 0.04216485*temp - 0.007097451*(temp**2) + 0.00003509571*(temp**3) - 9.9037785E-8*(temp**4) - -def aorc_as_rate(dataFrame): - """ - Convert kg/m^2 -> m/s - """ - if isinstance(dataFrame.index, pd.MultiIndex): - interval = pd.Series(dataFrame.index.get_level_values(0)) - else: - interval = pd.Series(dataFrame.index) - interval = ( interval.shift(-1) - interval ) / np.timedelta64(1, 's') - interval.index = dataFrame.index - precip_rate = ( dataFrame['APCP_surface'].shift(-1) / dataFrame['TMP_2maboveground'].apply(rho) ) / interval - precip_rate.name = 'precip_rate' - return precip_rate - -###### - def parquet_to_gdf(parquet_filepath: str) -> gpd.GeoDataFrame: gdf = gpd.read_parquet(parquet_filepath) return gdf @@ -563,7 +538,6 @@ def main(): # Write CSVs to file t0 = time.perf_counter() write_int = 100 - write_break = 1000 for j, jcatch in enumerate(fd2.keys()): df = fd2[jcatch] splt = jcatch.split('-') @@ -590,8 +564,6 @@ def main(): if (j+1) % write_int == 0: print(f"{j+1} files written out of {len(fd2)}, {(j+1)/len(fd2)*100:.2f}%", end="\r") - if j == write_break: break - print(f'{file_type} write took {time.perf_counter() - t0:.2f} s\n') print(f'\n\nDone! 
Catchment forcing files have been generated for VPU {vpu} in {bucket_type}\n\n') From b5cb3a74bd155384c26238eb639a43b38983f6cb Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Wed, 3 May 2023 15:09:29 -0500 Subject: [PATCH 058/105] updated config and readme --- user_input_ngen.json | 2 +- user_input_ngen.md | 43 +++++++++++++++++++++++-------------------- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/user_input_ngen.json b/user_input_ngen.json index 19bfa9b..c699e97 100644 --- a/user_input_ngen.json +++ b/user_input_ngen.json @@ -14,7 +14,7 @@ }, "verbose" : false, - "bucket_type" : "local", + "bucket_type" : "S3", "bucket_name" : "ciroh-devconf", "file_prefix" : "data/", "file_type" : "csv", diff --git a/user_input_ngen.md b/user_input_ngen.md index fef5ac0..7d24459 100644 --- a/user_input_ngen.md +++ b/user_input_ngen.md @@ -5,25 +5,28 @@ filename = 'user_input_ngen.json' contents: - { - "forcing" : { - "start_date" : "20220822", - "end_date" : "20220822", - "runinput" : 2, - "varinput" : 5, - "geoinput" : 1, - "meminput" : 0, - "urlbaseinput" : 3 - }, - - "hydrofab" : { - "vpu" : "03W" - }, - - "verbose" : true, - "output_dir" : "local", - "cache" : true - } +{ + "forcing" : { + "start_date" : "20220822", + "end_date" : "20220822", + "runinput" : 2, + "varinput" : 5, + "geoinput" : 1, + "meminput" : 0, + "urlbaseinput" : 3 + }, + + "hydrofab" : { + "vpu" : "03W" + }, + + "verbose" : false, + "bucket_type" : "S3", + "bucket_name" : "ciroh-devconf", + "file_prefix" : "data/", + "file_type" : "csv", + "cache" : true +} ### forcing | Field Name | Data Type | Description | @@ -46,7 +49,7 @@ contents: | Field Name | Data Type | Description | | --- | --- | --- | | verbose | `bool` | Print raw forcing files | -| output_dir | `string` |
    1. "local" : write to local directory
    2. "S3" : output to AWS S3 bucket
    | +| bucket_type | `string` |
    1. "local" : write to local directory
    2. "S3" : output to AWS S3 bucket
    | | bucket_name | `string` | If local, this is the name of the folder the data will be placed in. If S3, this is the name of S3 bucket, which must exist already. | | file_prefix | `string` | If local, this is the relative path to the bucket_name folder. If S3, this is the relative path within the S3 bucket_name bucket to store files | | file_type | `string` |
    1. "csv" : write data as csv files/
    2. "parquet" : write data as parquet files
    | From 30e0e295eb0e84fccddb26956c8280f22ad0d4ff Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Wed, 3 May 2023 15:12:32 -0500 Subject: [PATCH 059/105] Indent for code block --- user_input_ngen.md | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/user_input_ngen.md b/user_input_ngen.md index 7d24459..902e079 100644 --- a/user_input_ngen.md +++ b/user_input_ngen.md @@ -5,28 +5,28 @@ filename = 'user_input_ngen.json' contents: -{ - "forcing" : { - "start_date" : "20220822", - "end_date" : "20220822", - "runinput" : 2, - "varinput" : 5, - "geoinput" : 1, - "meminput" : 0, - "urlbaseinput" : 3 - }, - - "hydrofab" : { - "vpu" : "03W" - }, - - "verbose" : false, - "bucket_type" : "S3", - "bucket_name" : "ciroh-devconf", - "file_prefix" : "data/", - "file_type" : "csv", - "cache" : true -} + { + "forcing" : { + "start_date" : "20220822", + "end_date" : "20220822", + "runinput" : 2, + "varinput" : 5, + "geoinput" : 1, + "meminput" : 0, + "urlbaseinput" : 3 + }, + + "hydrofab" : { + "vpu" : "03W" + }, + + "verbose" : false, + "bucket_type" : "S3", + "bucket_name" : "ciroh-devconf", + "file_prefix" : "data/", + "file_type" : "csv", + "cache" : true + } ### forcing | Field Name | Data Type | Description | From cf8639a474af961a81d8ec819e6b7dee8aa66637 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 11:46:16 -0500 Subject: [PATCH 060/105] moved files and implemented threaded download --- filenames.txt | 3 + ngen_forcing/__pycache__/defs.cpython-311.pyc | Bin 0 -> 1345 bytes ngen_forcing/__pycache__/defs.cpython-38.pyc | Bin 0 -> 938 bytes ...rocess_nwm_forcing_to_ngen.cpython-311.pyc | Bin 0 -> 6098 bytes ...process_nwm_forcing_to_ngen.cpython-38.pyc | Bin 0 -> 2977 bytes ...rocess_nwm_forcing_to_ngen.cpython-311.pyc | Bin 0 -> 6097 bytes ngen_forcing/defs.py | 25 + ngen_forcing/denno.py | 465 ++++++++++++++++++ .../prep_hydrofab_forcings_ngen.py | 103 ++-- ngen_forcing/process_nwm_forcing_to_ngen.py | 259 ++++++++++ .../test_process_nwm_forcing_to_ngen.py | 211 ++++++++ .../user_input_ngen.json | 7 +- .../user_input_ngen.md | 10 +- 13 files changed, 1048 insertions(+), 35 deletions(-) create mode 100644 filenames.txt create mode 100644 ngen_forcing/__pycache__/defs.cpython-311.pyc create mode 100644 ngen_forcing/__pycache__/defs.cpython-38.pyc create mode 100644 ngen_forcing/__pycache__/process_nwm_forcing_to_ngen.cpython-311.pyc create mode 100644 ngen_forcing/__pycache__/process_nwm_forcing_to_ngen.cpython-38.pyc create mode 100644 ngen_forcing/__pycache__/test_process_nwm_forcing_to_ngen.cpython-311.pyc create mode 100644 ngen_forcing/defs.py create mode 100644 ngen_forcing/denno.py rename prep_hydrofab_forcings_ngen.py => ngen_forcing/prep_hydrofab_forcings_ngen.py (89%) create mode 100644 ngen_forcing/process_nwm_forcing_to_ngen.py create mode 100644 ngen_forcing/test_process_nwm_forcing_to_ngen.py rename user_input_ngen.json => ngen_forcing/user_input_ngen.json (79%) rename user_input_ngen.md => ngen_forcing/user_input_ngen.md (89%) diff --git a/filenames.txt b/filenames.txt new file mode 100644 index 0000000..87440af --- /dev/null +++ b/filenames.txt @@ -0,0 +1,3 @@ +https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f006.conus.nc +https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f007.conus.nc 
+https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f008.conus.nc \ No newline at end of file diff --git a/ngen_forcing/__pycache__/defs.cpython-311.pyc b/ngen_forcing/__pycache__/defs.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6a40066f7994994d36d7134d8f5ca0faff26564 GIT binary patch literal 1345 zcmb7DPfHs?6rY*hO`i1MOlmOHmiDppoA>6uH}CCxZ$89g?Fi`U z<405wguY7Qia=I$o`P);QADwea(IqlL_<{D#B&-AQyq4lMyLV1LGcVqwtbUsnQ2E; z?hTN9NK5a6x0hli45GH{T19WgSz_ok5t|MO%PU#t=F{qEsqfj-{8tCB5u|j;9@u zva~d`Y{y|jSZvMC6i`jQ-4%a&3yL9)-!Stu-=b^ery4t@ZfFBa-h_tnzgEcxrNu5W5Q8}yM z;4u(Y_Ve@e3Elxx5V;D?yFirv688ZvE+AZ;Xa?N)S7=8ay;Z%3cY@Q+?SBrMJ>;VU z*ncM_(&{mTj&SCP1bS95H!xP07-!HuLPZc-&E~1M z8tAIex@w|oULx`9_dD0P#hFd5h9#G?D(fQd0hCZJ8rvD&9^HLVnS3)@nyQ7PCBJ6I z%h5A4zO%8tu|ITT4xXBW)yQC@0?io&cgd>{`vOl5i*8P9oi#bpyb0cUbh#4~`phB2<8!D{PQ*F(6sj#{DHz==9) Og)RdR%d;Cnh5rdLFes$} literal 0 HcmV?d00001 diff --git a/ngen_forcing/__pycache__/defs.cpython-38.pyc b/ngen_forcing/__pycache__/defs.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e52dfc5c1af0db3166cdc12f95d47beb992e5322 GIT binary patch literal 938 zcmZuv&2AGh5VpPEWb;=6Efo?Rx$Gt3PE}Px$^|Kx!X-jlt#+Jb+f6naZ%KldTczZ} zJG95VlCPY21uihNX-Yth%8p=v9(yswHkRKLEQ#l zz0#vlZ;(j3+Y`%m+Q+o*DX~Ia$&D<sYP=Wg9S79GQ&(+e|XM=@n#aR{=`Asr0 z;BGV9$R$C7lpsPb%#Fnn9HexG&zU@uO zyLXk55qn7vrpF#poFvBD$SmXOJTta88=F})$}vf2;~toi?ao)O5%JUf4Hv;45ff5N zCA3hyetmhlcj%sFc@k&#y!#k~4RQ}P26R+#Dumc#m9LNsgxpOR?4PNM%%2JQ#K$(B z|Lw&)y0EW_mEum6@?KIG6o~5pPq`bVcPC4APb~#;DYkO3RZgeYWX@kq44P0o1=iIT z=?wCP)10BxT$dfN3S9;4Xo(S;7^0mk1G1ctkWCp#t?)U?iO-e?s?o`M1@R3Dj}ZL_ zh_6bo)TKYFfU>j$%R0lw4h13CI;`PSmY*@>NpfBaOQ(x;YLF-KU$n=6qm9^+{SF{E z6}})#sQUd-4rpu#gRnO2>_#iiyIhLuFXSYu4pcc-Q3wD1DN%4X&EY09=REV4({HP0 HqZ#}Lt|Z1L literal 0 HcmV?d00001 diff --git a/ngen_forcing/__pycache__/process_nwm_forcing_to_ngen.cpython-311.pyc b/ngen_forcing/__pycache__/process_nwm_forcing_to_ngen.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21bd3c010c14424f9de91c76acddb9af51ac64dc GIT binary patch literal 6098 zcmeHLTTC3+8J?NF&BE@oH)dJx5WCJM4q)R3+aV4ZY&XWHjd7fG6FOS$3}orvdSo+qF#GYJbN=(6+du#L&iReo-9W*kPrNStv74g)iiyf&-xi)MfN-4>D1lB=DLP80 zm?)F7L@hMtF-dF67PY19QM-}0Bpp!}@-0bcw1J_{P=fUuCD=sUTw}$sXrFUckU@qb1 zL`sY>x`j_@F6Hh!KT&F}TN()q!5olr%#K+=s zQIdFZA(qM}MX5hME2jCG42dVwv;A3;F-p@{Qk6Vj&hV(ymt7=IFl>MW1gQyNsrpy4 zvPD!wS+ln_L+>WY1M)g()zVO7;I0i@=1d88<=ycCC(W z({dAP-LBTm*%kX7wr4zZ)m6s^Xxwy}EzlH0$vw4-3Ty7a(qvA* z3%-)`=6pH7X}*0G3yN283!YhKv&q~}&MWV)=unh=peC2~6@MXE%d1IdUozV?_beAM zt%K8C3TqoRRw0;cR+{IUD=pMKzW%Gpaq>$Q4OshTC7^gWy*FX(x2xYuE~JDCbt_-f zH}^T$vh7)6r9}zX902^ME;ZLusjqkxZiXp)*2+7p^h%H5yGAL_ams9$rW6;1KL4c7 zl9(imB#|NbS>K{{yO^F$q{Y*>Y`Rk>v9ts+E2Ud!<5@|!l5AYkEpY;3Jd<4nbsGd{ zap5U*u=^24qDKP4^8in2HxygMw4l?AI=ui1tB^=Z_#dNqyc$?Q8AAttI!|5(xK7Q& zCo|3EXoW67diP@<%!kZMXo@0k(8CVJlNm|8#pq1mfX)b#gyZV&Cg|f&8-|k!Nseby z*%Mb^oE7D2@D&nqnNN#XuEZAmhLf3iEGeA;p`!d9b0n-Y3q*HivJmodGk{%1XXi4BbT%fxr(51UJg(a>$CC4+WW;dYg7c4V zrQ2g!s3ho2R?wZeA)O|$59v-ZJ)aUuOcr%pmL$@$ZX=m_(6c1Pv~Ekz%}X-DNi0_y zB_y_p(;b&}%hg0yXXT8POu&|-TXC}_=!mckVk`|7GVw^O?w%22@;niFV?A|OC5cl2 zg_)!v5FI~<(hmt&Z8mFo0(50t4W72!y7$k_{%z-&>h=dA?Q^)&Fl7ULg0k|+oPiAz&y09 zH{kGiV{UavoMd1&1>GjeF&X?8`YN@8k?uVo!%Otg9-cvXsEj+CG!9O+L~VH}7q>M2 z&=*+iDsLWfG~d+H*;24`ZBh$Hmd-zDY$>tL zC4c)`pXT3p-`{u7->3QqH2*-!?ZemQExG+$ZimbMm;(43VQG9TNHzJE&Xj^3TNGm( zr^}dUOD+5FxAfg>>DyRPTaIciNAvCy+w_6wea}Z^?PrDD-HXN0Ni}p*17ycEcC2W; zUwOP>%)@O_wC$(?dFBDvvX7bxdB-ZV+?a2C$aNK5D)*Acy|hI&*j{~% 
zu-uTR^Ft3<_o}bxIR@`X;`-=~c;TSh*{^l>tL^=Fd@47naf5fpz+749zUQ@jp4Yx$ zRL_{^8T&F=WG7X2Qe!7UD78L+*E`i zjj6)bV$)%@>9E#xIB$IrY`GCwj~52lqZ{1kU7vNSk>gtAxEg*%3%{ZUhqd5v-uWQZ zT?nb6h!%=$Q8wqR1}vY+kLKl4xZ~5X7LIK6XyJqTvk$m^g^L@$=beqbF=t^5IwLap0`wXL26&sC`| z9EE)55h$%nTbZybE*Q;o#vy^rwdXpvEp;j#N@pGQZLLq$QD07JuTtNAf`1R%+X}Qd zFiG&tVK82T?s}Tw`GK?};73F{5xNk%5zyrc`a0Q%fQOMsNGk~*MF>)gZ-vd07ts7c zfO-;3`XS@$kv%vH@VCn%kwxKIH1-bK8$iJOgGiVhLisSlAi@!Z?;;pC5J<)iT1tjc zI*#xP0N6P!U)1B!byZN1mv1z9N{R!F@#e9kuGu?y@HT)>&yaU8@k2~7dT>d?YV!5D97rX!2Qji3g`YGD{+kb;MdcRrT=<+O`Dh0Wv z^Cnhkvkd^RvJL#tutK2V_u2M)Z2P)Uc)i%sr?UMT+g~)^hu-k|0oB{9d3(1gyY09E z%U0mKhqn3d%SF%2+k7|jar)D=+J0~&rg8%sH*hDQaz~&T-us^6d!FID7gW!P<{9}y zD6$hOJE5@?Ae5T9wM%Q$g+p5N{(Qs3wtagp}J zdV%+Xp8X$F-bj<~tG^KK6T}&*94U-of>#OU7;Q%hZwht$0-Pdai$=`S?b%pbh)Ly; zg277nzhzt#pBLr#GJ^7E8OL`45CkcLq9}@bvFs>mI*poCa^18|Th({14c z7izJz$jl-a`B=@gy=FQ?-QrtL$ydl{*lSPjNAM}nv!raf6HjM)C@}lSz5vg&`|R#- zamV-T1TOvVJM611LjHw?i&q5-Hy{;%0%3&Fgru}bQ>$mCcF(3*XC+Qr=~Ys<=bEye zRC^xO+exige~U1ORqhj3;g!JxcL$9Vr?J8zq=;1i{fLOlB<)h*$tqMjm~zdpA${ z?jQjJVmHbe-(_Ge2&0G>MZm{lI!t)6n;r5j=;tDev%}q?$W3c@oSxSOG7s>bo#8}O z!8)eG0`aIRM>rSf*9K%fNN+$YG8iDEWkr$!Ro2u}HnY~r8C4Z#pIW7>+<`rC%BrlW zs>!?4>KV=)-Y`3-Zs{G8vZm~P^6?90-Pov~(Q-l6rk=67Ftc?=DLHyOCS^l8@NTE{ zWo=Mbgt@0wHI%%B8Xm{+CpX`K)0r>5dD{5u1GH@^wWmk69idqIMty8;PRjbq0Gg7XZF>IFWYE7-L6X-waU`;hu zw{Pu{ztN-DAk~ll!IoJ2K2f!OasuO54OlqK2ig;1A-RZif$Qr5)h^EtYP|-Ch z!mNPFPPKCw4GZmvVN_^45|EAZ;RIfH6GL)A<9#J2fZJserCz_6- z;IKF?@QJ%1LVxEME)WOOZAe820lMye>y4E(va0&gp;SfGz+8Q!l!!3DJ zz)sXPo{ds2LdkVyDB?`&ipWRso}KVaSCYY~kitd-=A+kz2qze_NNW3&c&I&@7fB5J zO*;q-fiZ(n@-PDnd9=Nu{XP%nNbtbSiEf-1aZu2hCya|=D0n{}>smhsBA6EMUMOZc zN5JF*FvhG1*eFdW+7Bd$jSrm;LLsyhjIJ?Vh346pK^6sQ2;S?DlH_r*%KCxnxfR^g zG}hI5SHz3hX&PvE4#gTg=cA$nFb|$}9fuc~*y?H|a)>6=l|qIR)*kfrKj(Y(8Pkez zE3Si(s$;EDk6LiojY95`TX&$YPP-O9+g6M2&^Fv%+M*5GggTFISxqYb2o|2g)^kwt z7+e4TKVu6J*MJ1bngi<7IiTtWP%Z$Z39y<0stSPe=79294FFYV3kFclZvoUIpryfl z094ZeswJ0ZWdKzB98k)ida9+C`#7015rXT}l{3<}u?GOR14s_we8ZryTV9gg!7AW<32?rl zE*S{^RW$(SJAmVExmjMGL-UHdtgZkwH&6kZU9~xf=9ggebJk|d7m&FD$m|>lAEw{X zQZylVQ?#(WhO&-=hnl#AvWapT1y_vNLU|tKA#Ppq9h5G}|42gdT{Q4Ll*h-o0igIk z)O`Wz3He@q#ufom+~Y!W_U#B1FQfHWP=0{2gR+Z)$GW(Rg2)p;M0phji7S4B@>7)8 zP!MY3XDB~M`31@z$}dq&FmRiSYbZZL`4!0abK(Z(Uq|^h3jQXDeUvv*Zlb(}@;1sl zD8E5@7v)2co{uLD{(u=?i34o-Eedi-{0`-Pln+2O?Tb5Dz(Yt|K?>Z%gyqMVdOx>hZ{3Lf%Q@z_dz$Xji+64pv5Ayquwn-R$i4^_WepMCcI zK@de{y!&}(zHh$yX7-zJelvgd`#lKK!rzZc#Xf}oLkrEsEobh33z;vGgd|2m3C8|s z6CC`r3eGqZ4yM6#CR{d;Pw?>1DejCX;mLRt-i$Bd%lH$1TMjFMOfV5#ZWBs`SR{w0 zx|*Fzj%y4;*Wqm>2Dy>syn+(lk_%{9?veNpS;`|mXc@i+x$6y4prDmGYriq~60Mzx zF_yQGRYgV6bWzu0*y>Nox-gj~$+Vghq;ygjRQb{+@v;@&fFa~WA}WfkY%GyhFUmxh zB`da(j;2IQ85oA&6!i<1Go4biL{2(dqd>?Dc{=^~e+xr}sb`7~i7h(QsOU2A7<%7T zBlPg`_qWYEm0?q1PkaGiXx@89o1+oBJwbQ`{bmT1wh(-AKx!}k@EuUx7ivn0#mI~ z&fCjpMQ6FDFv}lb_^0es+=_>^^)f%z>^omA?^&Qv7d6=b-_P^bMc6Rxu;=AW>5B5Jx!(pd{WqQ?l9&3&Xms=|YZVZ4vcSrpXZWtUv|hxyuhchn2LZC$pK{ z(ZcrC%RYWs$tFcbI|`Ym)H`6176np56}+?YT$M`aK23a*`1rjm@3kuLgA9s=EPg7R zR&#cY5OX=0NsFB(mM;rgfsR3wb;~WQ`HW0NUA7!Kl2&!gL9%%e@A(T69|=Ftp)n(h;amM}Eg^ppVked}X$Lsut|KN@jjl#*4iFYIh}Y5S}mcb8r2Hn?LnO|DXNU zr}r#;RP7%z`$xXrR_%YGA#po!ds7{G+_W1? 
z%g1Wr{yBE`P}hIc*M~D!+R5*vuPq_-nTr8C>Xsp;djmE56Sg#XC)V8Qk%9mnO(&_#jDmTZzV4>XG*&P8u5Ywy+^FtKuI(kt;}V zMMu~VI0<*g^VLzU$ViSNyTM+^O3oXu&h|wvg(Ut@Oc5K5QCA0DIn=UR!$?LXT*1{)byG1*G7O z&@~s#JiLum^gc2ByB^#A?se?%z-0ed&k3)we`P5?iqHF>*v$8=v7Cmz<+R>vUDtC0 zPt5+_$F_ex&k6oC_Dcv*Y2So}DHiHLl{CEBsPZYuETc*{1bhUcoB0Y$>^B@>M-MdT z3R?l*#r2VWg?L7m(tuU~C8gx}@&PeEX|a3v7u@k=R?Ta1HA!fYim`;Qcz`lw8D`p0 zgKUDzLL?)m)%Eb72wd%XAOJZK+v{8*wO2+f{7VbZ0%<*Uu0zD$AD8GJxJ0+P!h!W+ z?Ql}m_Z&F1_rQS>A)nK9B8z~(@*eWyutSw<<`lauvB~PEvk`Li59nGpy z{em0?WEdUPl(@swrS%J-2Mm{}(&pJaZaEv3OCjhKfIg-i z%SoxA7WO<$+cJ=E)8$&ODhxfWos!Mwv?!6`Y_G{uym0){g*O~VVf1m8tg${8AoNTN zK%aW5#ib~Ur}NbS(tJvW1CLHFTP`@QP%KV%fjD`F5(<4ShYjfo1t)|q6+)o~p*vG7 zME1a+<)_Vr?4+$xv+$&_cfaMM4g?t9_8&4#<=hmyH;}y7<~Cf7?50c)+_(^3pyvsT zo0LW<*$29${klzawBmW1eZj7HA*a!AyBfwfP&}rqill%qz^g@J5tPtkPh_s|^YPj7 z($DK`bGa;@c>TMP@5Fz;`>%KHHFS@zH2|u;PLT2--xyOGTMYNj9+~f1_+=%0uo^yS zh7Xp;>#REjYqCj7r|O$f*OobPo|#LUp=V34)w;LLeKa4Q%bVRhN^jJ{Tg>otx5LA? z!o$_@h#4L!jsM_5-F>qIpKqSsTzY*GyDOeuRXk+kp$Z;a^lYl}VUr)I@jW%Zx5jtX z_|QGC(+jo%DINbIfE=FRZ=Z>MHZ(Id=c#(0t>SGa-d4fe?soNdXx&!D+fBT^g10Yv zp7|-X-leu4uHqvmK2pI)7QI7jwbuN-wP060$ori45RmUFDUJQm{m5p&RK+iw_~i?h&9-LOA>#`1<88B#d6WR3?=3iLFChr_rfUz!4h9eS!!^)N9KybG;TAVg;Rb4$FO7YC^2*7Q{i_3S6I9#t^FK8$ Bf^Yx; literal 0 HcmV?d00001 diff --git a/ngen_forcing/defs.py b/ngen_forcing/defs.py new file mode 100644 index 0000000..516fdbb --- /dev/null +++ b/ngen_forcing/defs.py @@ -0,0 +1,25 @@ +import rasterio.mask as riomask + +def polymask(dataset, invert=False, all_touched=False): + def _polymask(poly): + return riomask.raster_geometry_mask( + dataset, [poly], invert=invert, all_touched=all_touched, crop=True + ) + + return _polymask + + +def xr_read_window(ds, window, mask=None): + data = ds.isel(window) + if mask is None: + return data + else: + return data.where(mask) + + +def xr_read_window_time(ds, window, mask=None, idx=None, time=None): + data = ds.isel(window) + if mask is None: + return idx, time, data + else: + return idx, time, data.where(mask) \ No newline at end of file diff --git a/ngen_forcing/denno.py b/ngen_forcing/denno.py new file mode 100644 index 0000000..37eb829 --- /dev/null +++ b/ngen_forcing/denno.py @@ -0,0 +1,465 @@ + +# https://github.com/jameshalgren/data-access-examples/blob/DONOTMERGE_VPU16/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb + +# !pip install --upgrade google-api-python-client +# !pip install --upgrade google-cloud-storage + +import pickle +import time +import pandas as pd +import argparse, os, json +import gc +from pathlib import Path +import geopandas as gpd +import pandas as pd +import numpy as np +import xarray as xr +from google.cloud import storage +from rasterio.io import MemoryFile +from rasterio.features import rasterize + +from nwm_filenames.listofnwmfilenames import create_file_list + +DATA_DIR = Path(Path.home(),"code","data") + +TEMPLATE_BLOB_NAME = ( + "nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" +) +NWM_BUCKET = "national-water-model" + +# WKT strings extracted from NWM grids +CONUS_NWM_WKT = 'PROJCS["Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]], \ +PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ +PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-97.0],PARAMETER["standard_parallel_1",30.0],\ +PARAMETER["standard_parallel_2",60.0],PARAMETER["latitude_of_origin",40.0],UNIT["Meter",1.0]]' + +HI_NWM_WKT = 
'PROJCS["Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]],\ +PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ +PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-157.42],PARAMETER["standard_parallel_1",10.0],\ +PARAMETER["standard_parallel_2",30.0],PARAMETER["latitude_of_origin",20.6],UNIT["Meter",1.0]]' + +PR_NWM_WKT = 'PROJCS["Sphere_Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]],\ +PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ +PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-65.91],PARAMETER["standard_parallel_1",18.1],\ +PARAMETER["standard_parallel_2",18.1],PARAMETER["latitude_of_origin",18.1],UNIT["Meter",1.0]]' + +# paths +CACHE_DIR = Path(Path.home(), "code", "data_access_examples", "forcing_data") +NWM_CACHE_DIR = os.path.join(CACHE_DIR, "nwm") +USGS_CACHE_DIR = os.path.join(CACHE_DIR, "usgs") +GEO_CACHE_DIR = os.path.join(CACHE_DIR, "geo") + +NWM_CACHE_H5 = os.path.join(NWM_CACHE_DIR, "gcp_client.h5") + +PARQUET_CACHE_DIR = os.path.join(CACHE_DIR, "parquet") +MEDIUM_RANGE_FORCING_PARQUET = os.path.join(PARQUET_CACHE_DIR, "forcing_medium_range") +FORCING_ANALYSIS_ASSIM_PARQUET = os.path.join( + PARQUET_CACHE_DIR, "forcing_analysis_assim" +) +MEDIUM_RANGE_PARQUET = os.path.join(PARQUET_CACHE_DIR, "medium_range") +USGS_PARQUET = os.path.join(PARQUET_CACHE_DIR, "usgs") + +HUC10_SHP_FILEPATH = os.path.join(GEO_CACHE_DIR, "wbdhu10_conus.shp") +HUC10_PARQUET_FILEPATH = os.path.join(GEO_CACHE_DIR, "wbdhu10_conus.parquet") +HUC10_MEDIUM_RANGE_WEIGHTS_FILEPATH = os.path.join( + GEO_CACHE_DIR, "wbdhu10_medium_range_weights.pkl" +) + +ROUTE_LINK_FILE = os.path.join(NWM_CACHE_DIR, "RouteLink_CONUS.nc") +ROUTE_LINK_PARQUET = os.path.join(NWM_CACHE_DIR, "route_link_conus.parquet") + + +def parquet_to_gdf(parquet_filepath: str) -> gpd.GeoDataFrame: + gdf = gpd.read_parquet(parquet_filepath) + return gdf + +def get_cache_dir(create: bool = True): + if not os.path.exists(NWM_CACHE_DIR) and create: + os.mkdir(NWM_CACHE_DIR) + if not os.path.exists(NWM_CACHE_DIR): + raise NotADirectoryError + return NWM_CACHE_DIR + +def make_parent_dir(filepath): + Path(filepath).parent.mkdir(parents=True, exist_ok=True) + +def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: + """Retrieve a blob from the data service as xarray.Dataset. + Based largely on OWP HydroTools. + Parameters + ---------- + blob_name: str, required + Name of blob to retrieve. + use_cache: bool, default True + If cache should be used. + If True, checks to see if file is in cache, and + If fetched from remote, will save to cache. + Returns + ------- + ds : xarray.Dataset + The data stored in the blob. + """ + # TODO: Check to see if this does any better than kerchunk + # the caching should help, but probably needs to be managed to function asynchronously. + # Perhaps if the files is not cached, we can create the dataset from + # kerchunk with a remote path and then asynchronously do a download to cache it + # for next time. The hypothesis would be that the download speed will not be any slower than + # just accessing the file remotely. 
+ nc_filepath = os.path.join(get_cache_dir(), blob_name) + make_parent_dir(nc_filepath) + + # If the file exists and use_cache = True + if os.path.exists(nc_filepath) and use_cache: + # Get dataset from cache + ds = xr.load_dataset( + nc_filepath, + engine="h5netcdf", + ) + return ds + else: + # Get raw bytes + raw_bytes = get_blob(blob_name) + # Create Dataset + ds = xr.load_dataset( + MemoryFile(raw_bytes), + engine="h5netcdf", + ) + if use_cache: + # Subset and cache + ds["RAINRATE"].to_netcdf( + nc_filepath, + engine="h5netcdf", + ) + return ds + +def generate_weights_file( + gdf: gpd.GeoDataFrame, + src: xr.DataArray, + weights_filepath: str, + crosswalk_dict_key: str, +): + """Generate a weights file.""" + + gdf_proj = gdf.to_crs(CONUS_NWM_WKT) + + crosswalk_dict = {} + + # This is a probably a really poor performing way to do this + # TODO: Consider vectorizing -- would require digging into the + # other end of these where we unpack the weights... + i = 0 + for index, row in gdf_proj.iterrows(): + geom_rasterize = rasterize( + [(row["geometry"], 1)], + out_shape=src.rio.shape, + transform=src.rio.transform(), + all_touched=True, + fill=0, # IS FILL 0 + dtype="uint8", + ) + if crosswalk_dict_key: + crosswalk_dict[row[crosswalk_dict_key]] = np.where(geom_rasterize == 1) + else: + crosswalk_dict[index] = np.where(geom_rasterize == 1) + + if i % 100 == 0: + perc = i/len(gdf_proj)*100 + print(f"{i}, {perc:.2f}%".ljust(40), end="\r") + if perc > 0.01: break + i += 1 + + with open(weights_filepath, "wb") as f: + # TODO: This is a dict of ndarrays, which could be easily stored as a set of parquet files for safekeeping. + pickle.dump(crosswalk_dict, f) + +def add_zonalstats_to_gdf_weights( + gdf: gpd.GeoDataFrame, + src: xr.DataArray, + weights_filepath: str, +) -> gpd.GeoDataFrame: + """Calculates zonal stats and adds to GeoDataFrame""" + + df = calc_zonal_stats_weights(src, weights_filepath) + gdf_map = gdf.merge(df, left_on="huc10", right_on="catchment_id") + + return gdf_map + + +def get_blob(blob_name: str, bucket: str = NWM_BUCKET) -> bytes: + """Retrieve a blob from the data service as bytes. + Based largely on OWP HydroTools. + Parameters + ---------- + blob_name : str, required + Name of blob to retrieve. + Returns + ------- + data : bytes + The data stored in the blob. + """ + # Setup anonymous client and retrieve blob data + client = storage.Client.create_anonymous_client() + bucket = client.bucket(bucket) + return bucket.blob(blob_name).download_as_bytes(timeout=120) + + +def calc_zonal_stats_weights( + src: xr.DataArray, + weights_filepath: str, +) -> pd.DataFrame: + """Calculates zonal stats""" + + # Open weights dict from pickle + # This could probably be done once and passed as a reference. + with open(weights_filepath, "rb") as f: + crosswalk_dict = pickle.load(f) + + r_array = src.values[0] + r_array[r_array == src.rio.nodata] = np.nan + + mean_dict = {} + for key, value in crosswalk_dict.items(): + mean_dict[key] = np.nanmean(r_array[value]) + + df = pd.DataFrame.from_dict(mean_dict, orient="index", columns=["value"]) + + df.reset_index(inplace=True, names="catchment_id") + + # This should not be needed, but without memory usage grows + del crosswalk_dict + del f + gc.collect() + + return df + + +def get_forcing_dict_RTIway( + pickle_file, # This would be a Feature list for parallel calling -- + # if there is a stored weights file, we use it + # (checking for an optional flag to force re-creation of the weights...) 
+ folder_prefix, + file_list, +): + + var = "RAINRATE" + reng = "rasterio" + filehandles = [ + xr.open_dataset(folder_prefix / f, engine=reng)[var] for f in file_list + ] + # filehandles = [get_dataset("data/" + f, use_cache=True) for f in file_list] + stats = [] + + for _i, f in enumerate(filehandles): + print(f"{_i}, {round(_i/len(file_list), 2)*100}".ljust(40), end="\r") + stats.append(calc_zonal_stats_weights(f, pickle_file)) + + [f.close() for f in filehandles] + return stats + + +def get_forcing_dict_RTIway2( + pickle_file, # This would be a Feature list for parallel calling -- + # if there is a stored weights file, we use it + # (checking for an optional flag to force re-creation of the weights...) + gpkg_divides, + folder_prefix, + filelist, + var_list, +): + reng = "rasterio" + pick_val = "value" + + df_dict = {} + dl_dict = {} + for _v in var_list: + df_dict[_v] = pd.DataFrame(index=gpkg_divides.index) + dl_dict[_v] = [] + + # ds_list = [] + for _i, _nc_file in enumerate(filelist): + # _nc_file = ("nwm.t00z.medium_range.forcing.f001.conus.nc") + _full_nc_file = folder_prefix.joinpath(_nc_file) + + try: + # with xr.open_dataset(_full_nc_file, engine=reng) as _xds: + with xr.open_dataset(_full_nc_file) as _xds: + # _xds = ds_list[_i] + # _xds.rio.write_crs(rasterio.crs.CRS.from_wkt(CONUS_NWM_WKT), inplace=True) + print(f"{_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") + for _v in var_list: + _src = _xds[_v] + _df_zonal_stats = calc_zonal_stats_weights(_src, pickle_file) + # if adding statistics back to original GeoDataFrame + # gdf3 = pd.concat([gpkg_divides, _df_zonal_stats], axis=1) + _df = pd.DataFrame(index=gpkg_divides.index) + _df[_xds.time.values[0]] = _df_zonal_stats[pick_val] + # TODO: This same line could add the new values directly + # to the same dictionary. But after adding about 100 of them, + # pandas starts to complain about degraded performance due to + # fragmentation of the dataframe. We tried it this was as a + # workaround, with the loop below to accomplish the concatenation. + dl_dict[_v].append(_df) + except: + print(f"No such file: {_full_nc_file}") + + for _v in var_list: + df_dict[_v] = pd.concat(dl_dict[_v], axis=1) + + # [_xds.close() for _xds in ds_list] + + return df_dict + + +def main(): + """ + Primary function to retrieve hydrofabrics data and convert it into files that can be ingested into ngen. + Also, the forcing data is retrieved. 
+ + Inputs: JSON config file specifying start_date, end_date, and vpu + + Outputs: ngen catchment/nexus configs and forcing files + + Will store files in the same folder as the JSON config to run this script + """ + parser = argparse.ArgumentParser() + parser.add_argument(dest="infile", type=str, help="A json containing user inputs to run ngen") + args = parser.parse_args() + + # Take in user config + conf = json.load(open(args.infile)) + start_date = conf['forcing']['start_date'] + end_date = conf['forcing']['end_date'] + vpu = conf['hydrofab']['vpu'] + + top_dir = os.path.dirname(args.infile) + data_dir = os.path.join(top_dir,'forcing_data') + if not os.path.exists(data_dir): + os.system(f'mkdir {data_dir}') + + # Generate list of file names to retrieve for forcing data + # Going to make assumptions here as to which forecasts we want + # Check the dictionaries at the top of listofnwmfilenames for options + n = 6 # How rapidly we want our forecasts, I think 3 is the highest frequency + fcst_cycle = [n*x for x in range(24//n)] + lead_time = [x+1 for x in range(n)] + # fcst_cycle = None # Retrieves a full day for each day within the range given. + runinput = 2 + varinput = 5 + geoinput = 1 + meminput = 0 + urlbaseinput = None + + print(f'Creating list of file names to pull...') + nwm_forcing_files = create_file_list( + runinput, + varinput, + geoinput, + meminput, + start_date, + end_date, + fcst_cycle, + urlbaseinput, + lead_time, + ) + + + print(f'Pulling files...') + local_files = [] + for jfile in nwm_forcing_files: + file_parts = jfile.split('/') + local_file = os.path.join(data_dir,file_parts[-1]) + local_files.append(local_file) + if os.path.exists(local_file): continue + else: + command = f'wget -P {data_dir} -c https://storage.googleapis.com/national-water-model/{jfile}' + os.system(command) + + # Download dataset, read into df with geopandas + gpkg = os.path.join(DATA_DIR,'nextgen_03W.gpkg') + ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) + src = ds["RAINRATE"] + + # Why are we converting to paquet and then back into geopandas dataframe? 
+ polygonfile = gpd.read_file(gpkg, layer="divides") + parq_file = os.path.join(DATA_DIR,"ng_03.parquet") + polygonfile.to_parquet(parq_file) + pkl_file = os.path.join(DATA_DIR,"weights.pkl") + generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") + calc_zonal_stats_weights(src, pkl_file) + + folder_prefix = DATA_DIR + + var_list = [ + "U2D", + "V2D", + "LWDOWN", + "RAINRATE", + "T2D", + "Q2D", + "PSFC", + "SWDOWN", + ] + + var_list + start_time = time.time() + print(f"Working on the new way") + fd2 = get_forcing_dict_RTIway2( + pkl_file, + polygonfile, + folder_prefix, + nwm_forcing_files, + var_list, + ) + print(time.time() - start_time) + + fd2["U2D"] + pcp_var.transpose()[0] + pcp_var = fd2["RAINRATE"] + lw_var = fd2["LWDOWN"] + sw_var = fd2["SWDOWN"] + sp_var = fd2["PSFC"] + tmp_var = fd2["T2D"] + u2d_var = fd2["U2D"] + v2d_var = fd2["V2D"] + pcp_var2 = fd2["RAINRATE"] + + for _i in range(0, 40000): + # _i = 0 + try: + pcp_var_0 = pcp_var.transpose()[_i].rename("APCP_surface") + lw_var_0 = lw_var.transpose()[_i].rename("DLWRF_surface") + sw_var_0 = sw_var.transpose()[_i].rename("DSWRF_surface") + sp_var_0 = sp_var.transpose()[_i].rename("SPFH_2maboveground") + tmp_var_0 = tmp_var.transpose()[_i].rename("TMP_2maboveground") + u2d_var_0 = u2d_var.transpose()[_i].rename("UGRD_10maboveground") + v2d_var_0 = v2d_var.transpose()[_i].rename("VGRD_10maboveground") + pcp_var2_0 = pcp_var2.transpose()[_i].rename("precip_rate") ##BROKEN!! + + d = pd.concat( + [ + pcp_var_0, + lw_var_0, + sw_var_0, + sp_var_0, + tmp_var_0, + u2d_var_0, + v2d_var_0, + pcp_var2_0, + ], + axis=1, + ) + d.index.name = "time" + + d.to_csv(f"input_data/cat16_{_i:07}.csv") + except: + print(f"no data for watershed {_i}", end="\t") + + ## Make a shell script string to rename the csvs... 
+ gpkg_divides["id"] + for _i, cat_id in enumerate(gpkg_divides["id"]): + print(f"mv cat16_{_i:07}.csv cat16_{cat_id}.csv") + +if __name__ == "__main__": + + main() + \ No newline at end of file diff --git a/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py similarity index 89% rename from prep_hydrofab_forcings_ngen.py rename to ngen_forcing/prep_hydrofab_forcings_ngen.py index ae815e2..7a0a39d 100644 --- a/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -5,7 +5,7 @@ import pickle import pandas as pd -import argparse, os, json +import argparse, os, json, sys import gc from pathlib import Path import geopandas as gpd @@ -19,8 +19,11 @@ import boto3 from io import BytesIO -from nwm_filenames.listofnwmfilenames import create_file_list -from ngen_forcing.process_nwm_forcing_to_ngen import * +import threading + +pkg_dir = Path(Path(os.path.dirname(__file__)).parent,'nwm_filenames') +sys.path.append(str(pkg_dir)) +from listofnwmfilenames import create_file_list TEMPLATE_BLOB_NAME = ( "nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" @@ -44,7 +47,10 @@ PARAMETER["standard_parallel_2",18.1],PARAMETER["latitude_of_origin",18.1],UNIT["Meter",1.0]]' # paths -CACHE_DIR = Path(Path.home(), "code", "data_access_examples", "data", "raw_forcing_data") + +#TODO Make CACHE_DIR configurable +CACHE_DIR = Path(pkg_dir.parent, "data", "raw_forcing_data") # Maybe this should have a date attached to the name + NWM_CACHE_DIR = os.path.join(CACHE_DIR, "nwm") USGS_CACHE_DIR = os.path.join(CACHE_DIR, "usgs") GEO_CACHE_DIR = os.path.join(CACHE_DIR, "geo") @@ -160,9 +166,9 @@ def generate_weights_file( else: crosswalk_dict[index] = np.where(geom_rasterize == 1) - # if i % 100 == 0: - # perc = i/len(gdf_proj)*100 - # print(f"{i}, {perc:.2f}%".ljust(40), end="\r") + if i % 100 == 0: + perc = i/len(gdf_proj)*100 + print(f"{i}, {perc:.2f}%".ljust(40), end="\r") i += 1 with open(weights_filepath, "wb") as f: @@ -365,12 +371,15 @@ def get_forcing_dict_RTIway2( return df_dict -def wget(cmd,name): +def wget(cmd,name,semaphore=None): + if not semaphore == None: semaphore.acquire() resp = os.system(cmd) if resp > 0: - raise Exception (f'\nwget failed! Tried: {name}\n') + raise Exception(f'\nwget failed! Tried: {name}\n') else: - print(f'Successful download of {name}') + print(f'Successful download of {name}') + + if not semaphore == None: semaphore.release() def main(): """ @@ -390,10 +399,17 @@ def main(): parser.add_argument(dest="infile", type=str, help="A json containing user inputs to run ngen") args = parser.parse_args() + # Increase for more threads + dl_threads = 10 + # Take in user config conf = json.load(open(args.infile)) start_date = conf['forcing']['start_date'] end_date = conf['forcing']['end_date'] + if 'nwm_files' in conf['forcing']: + nwm_files = conf['forcing']['nwm_files'] + else: + nwm_files = "" runinput = conf['forcing']['runinput'] varinput = conf['forcing']['varinput'] geoinput = conf['forcing']['geoinput'] @@ -406,6 +422,7 @@ def main(): file_prefix = conf['file_prefix'] file_type = conf['file_type'] ii_cache = conf['cache'] + dl_threads = conf['dl_threads'] file_types = ['csv','parquet'] assert file_type in file_types,f'{file_type} for file_type is not accepted! 
Accepted: {file_types}' @@ -434,28 +451,38 @@ def main(): elif bucket_type == 'S3': s3 = boto3.client('s3') - # Generate list of file names to retrieve for forcing data - print(f'Creating list of file names to pull...') - n = 6 - fcst_cycle = [n*x for x in range(24//n)] - lead_time = [x+1 for x in range(n)] - nwm_forcing_files = create_file_list( - runinput, - varinput, - geoinput, - meminput, - start_date, - end_date, - fcst_cycle, - urlbaseinput, - lead_time, - ) - + # Get nwm forcing file names + if len(nwm_files) == 0: + print(f'Creating list of file names to pull...') + n = 6 + fcst_cycle = [n*x for x in range(24//n)] + lead_time = [x+1 for x in range(n)] + nwm_forcing_files = create_file_list( + runinput, + varinput, + geoinput, + meminput, + start_date, + end_date, + fcst_cycle, + urlbaseinput, + lead_time, + ) + else: + print(f'Reading list of file names from {nwm_files}...') + nwm_forcing_files = [] + with open(nwm_files,'r') as f: + for line in f: + nwm_forcing_files.append(line) + # Download whole files and store locally if cache is true, # otherwise index remotely and save catchment based forcings + t0 = time.perf_counter() if ii_cache: # Check to see if we have files cached, if not wget them local_files = [] + cmds = [] + fls = [] for jfile in nwm_forcing_files: if ii_verbose: print(f'Looking for {jfile}') file_parts = Path(jfile).parts @@ -468,11 +495,27 @@ def main(): else: if ii_verbose: print(f'Forcing file not found! Downloading {jfile}') command = f'wget -P {CACHE_DIR} -c {jfile}' - wget(command,jfile) - - forcing_files = local_files # interacting with files locally + cmds.append(command) + fls.append(jfile) + + # TODO make this async! + # wget(command,jfile) + + threads = [] + semaphore = threading.Semaphore(dl_threads) + for i,jcmd in enumerate(cmds): + t = threading.Thread(target = wget, args = [jcmd, fls[i],semaphore]) + t.start() + threads.append(t) + + for jt in threads: + jt.join() + + forcing_files = local_files # interacting with files locally else: forcing_files = nwm_forcing_files # interacting with files remotely + + print(f'SERIAL Time to dl files {time.perf_counter() - t0}') # Do we need a parquet file? 
# parq_file = os.path.join(CACHE_DIR,"ng_03.parquet") diff --git a/ngen_forcing/process_nwm_forcing_to_ngen.py b/ngen_forcing/process_nwm_forcing_to_ngen.py new file mode 100644 index 0000000..8957438 --- /dev/null +++ b/ngen_forcing/process_nwm_forcing_to_ngen.py @@ -0,0 +1,259 @@ +from ngen_forcing.defs import xr_read_window, polymask, xr_read_window_time +from rasterio import _io, windows +import xarray as xr +import pandas as pd + + +class MemoryDataset(_io.MemoryDataset, windows.WindowMethodsMixin): + pass + + +def get_forcing_dict_newway( + feature_index, + feature_list, + folder_prefix, + file_list, + var_list, +): + reng = "rasterio" + + _xds_dummy = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) + _template_arr = _xds_dummy.U2D.values + _u2d = MemoryDataset( + _template_arr, + transform=_xds_dummy.U2D.rio.transform(), + gcps=None, + rpcs=None, + crs=None, + copy=False, + ) + + # Open .nc files ahead of time + ds_list = [] + for _nc_file in file_list: + _full_nc_file = folder_prefix.joinpath(_nc_file) + ds_list.append(xr.open_dataset(_full_nc_file, engine=reng)) + + df_dict = {} + for _v in var_list: + df_dict[_v] = pd.DataFrame(index=feature_index) + + for i, feature in enumerate(feature_list): + print(f"{i}, {round(i/len(feature_list), 5)*100}".ljust(40), end="\r") + mask, _, window = polymask(_u2d)(feature) + mask = xr.DataArray(mask, dims=["y", "x"]) + winslices = dict(zip(["y", "x"], window.toslices())) + for j, _xds in enumerate(ds_list): + time_value = _xds.time.values[0] + cropped = xr_read_window(_xds, winslices, mask=mask) + stats = cropped.mean() + for var in var_list: + df_dict[var].loc[i, time_value] = stats[var] + + [ds.close() for ds in ds_list] + return df_dict + + +# def get_forcing_dict_newway_parallel( +# feature_index, +# feature_list, +# folder_prefix, +# file_list, +# var_list, +# para="thread", +# para_n=2, +# ): + +# import concurrent.futures + +# reng = "rasterio" +# _xds = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) +# _template_arr = _xds.U2D.values +# _u2d = MemoryDataset( +# _template_arr, +# transform=_xds.U2D.rio.transform(), +# gcps=None, +# rpcs=None, +# crs=None, +# copy=False, +# ) +# ds_list = [xr.open_dataset(folder_prefix.joinpath(f)) for f in file_list] +# # ds_list = [xr.open_dataset(folder_prefix.joinpath(f), engine=reng) for f in file_list] +# # TODO: figure out why using the rasterio engine DOES NOT WORK with parallel +# # TODO: figure out why NOT using the rasterio engine produces a different result + +# if para == "process": +# pool = concurrent.futures.ProcessPoolExecutor +# elif para == "thread": +# pool = concurrent.futures.ThreadPoolExecutor +# else: +# pool = concurrent.futures.ThreadPoolExecutor + +# stats = [] +# future_list = [] + +# with pool(max_workers=para_n) as executor: + +# for _i, _m in enumerate(map(polymask(_u2d), feature_list)): +# print(f"{_i}, {round(_i/len(feature_list), 5)*100}".ljust(40), end="\r") +# mask, _, window = _m +# mask = xr.DataArray(mask, dims=["y", "x"]) +# winslices = dict(zip(["y", "x"], window.toslices())) +# for ds in ds_list: +# _t = ds.time.values[0] +# future = executor.submit( +# xr_read_window_time, ds, winslices, mask=mask, idx=_i, time=_t +# ) +# # cropped = xr_read_window(f, winslices, mask=mask) +# # stats.append(cropped.mean()) +# future_list.append(future) +# for _f in concurrent.futures.as_completed(future_list): +# _j, _t, _s = _f.result() +# stats.append((_j, _t, _s)) + +# df_dict = {} +# for _v in var_list: +# df_dict[_v] = 
pd.DataFrame(index=feature_index) + +# for j, t, s in stats: +# for var in var_list: +# df_dict[var].loc[j, t] = s[var].mean() + +# [ds.close() for ds in ds_list] +# return df_dict + + +def get_forcing_dict_newway_inverted( + feature_index, + feature_list, + folder_prefix, + file_list, + var_list, +): + reng = "rasterio" + + _xds_dummy = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) + _template_arr = _xds_dummy.U2D.values + _u2d = MemoryDataset( + _template_arr, + transform=_xds_dummy.U2D.rio.transform(), + gcps=None, + rpcs=None, + crs=None, + copy=False, + ) + ds_list = [] + for _nc_file in file_list: + _full_nc_file = folder_prefix.joinpath(_nc_file) + ds_list.append(xr.open_dataset(_full_nc_file, engine=reng)) + + stats = [] + mask_win_list = [] + + for i, feature in enumerate(feature_list): + print(f"{i}, {round(i/len(feature_list), 5)*100}".ljust(40), end="\r") + mask, _, window = polymask(_u2d)(feature) + mask = xr.DataArray(mask, dims=["y", "x"]) + winslices = dict(zip(["y", "x"], window.toslices())) + mask_win_list.append((mask, winslices)) + + for i, f in enumerate(ds_list): + print(f"{i}, {round(i/len(file_list), 2)*100}".ljust(40), end="\r") + time_value = f.time.values[0] + # TODO: when we read the window, could the time be added as a dimension? + for j, (_m, _w) in enumerate(mask_win_list): + cropped = xr_read_window(f, _w, mask=_m) + stats.append((j, time_value, cropped.mean())) + + df_dict = {} + for _v in var_list: + df_dict[_v] = pd.DataFrame(index=feature_index) + + for j, t, s in stats: + for var in var_list: + df_dict[var].loc[j, t] = s[var] + + [ds.close() for ds in ds_list] + return df_dict + + +# def get_forcing_dict_newway_inverted_parallel( +# feature_index, +# feature_list, +# folder_prefix, +# file_list, +# var_list, +# para="thread", +# para_n=2, +# ): +# import concurrent.futures + +# reng = "rasterio" +# _xds = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) +# _template_arr = _xds.U2D.values +# _u2d = MemoryDataset( +# _template_arr, +# transform=_xds.U2D.rio.transform(), +# gcps=None, +# rpcs=None, +# crs=None, +# copy=False, +# ) + +# ds_list = [xr.open_dataset("data/" + f) for f in file_list] + +# stats = [] +# future_list = [] +# mask_win_list = [] + +# for i, feature in enumerate(feature_list): +# print(f"{i}, {round(i/len(feature_list), 5)*100}".ljust(40), end="\r") +# mask, _, window = polymask(_u2d)(feature) +# mask = xr.DataArray(mask, dims=["y", "x"]) +# winslices = dict(zip(["y", "x"], window.toslices())) +# mask_win_list.append((mask, winslices)) + +# ds_list = [xr.open_dataset(folder_prefix.joinpath(f)) for f in file_list] +# # ds_list = [xr.open_dataset(folder_prefix.joinpath(f), engine=reng) for f in file_list] +# # TODO: figure out why using the rasterio engine DOES NOT WORK with parallel +# # TODO: figure out why NOT using the rasterio engine produces a different result + +# stats = [] +# future_list = [] + +# if para == "process": +# pool = concurrent.futures.ProcessPoolExecutor +# elif para == "thread": +# pool = concurrent.futures.ThreadPoolExecutor +# else: +# pool = concurrent.futures.ThreadPoolExecutor + +# with pool(max_workers=para_n) as executor: +# df_dict = {} +# for _v in var_list: +# df_dict[_v] = pd.DataFrame(index=feature_index) + +# for j, ds in enumerate(ds_list): +# print(f"{j}, {round(i/len(file_list), 2)*100}".ljust(40), end="\r") +# _t = ds.time.values[0] +# for _i, (_m, _w) in enumerate(mask_win_list): +# future = executor.submit( +# xr_read_window_time, ds, _w, mask=_m, idx=_i, 
time=_t +# ) +# # cropped = xr_read_window(ds, _w, mask=_m) +# # stats.append(cropped.mean()) +# future_list.append(future) +# for _f in concurrent.futures.as_completed(future_list): +# _j, _t, _s = _f.result() +# stats.append((_j, _t, _s)) + +# df_dict = {} +# for _v in var_list: +# df_dict[_v] = pd.DataFrame(index=feature_index) + +# for j, t, s in stats: +# for var in var_list: +# df_dict[var].loc[j, t] = s[var].mean() + +# [ds.close() for ds in ds_list] +# return df_dict \ No newline at end of file diff --git a/ngen_forcing/test_process_nwm_forcing_to_ngen.py b/ngen_forcing/test_process_nwm_forcing_to_ngen.py new file mode 100644 index 0000000..545fe95 --- /dev/null +++ b/ngen_forcing/test_process_nwm_forcing_to_ngen.py @@ -0,0 +1,211 @@ +# import rioxarray as rxr +import xarray as xr +import geopandas as gpd +from rasterstats import zonal_stats + +# import rasterio +import pandas as pd + +import time + +from process_nwm_forcing_to_ngen import ( + get_forcing_dict_newway, + get_forcing_dict_newway_parallel, + get_forcing_dict_newway_inverted, + get_forcing_dict_newway_inverted_parallel, +) + +from pathlib import Path +import warnings + +warnings.simplefilter("ignore") + +# Read forcing files +# Generate List of files + +# TODO: Add looping through lists of forcing files +# consider looking at the "listofnwmfilenames.py" in the data_access_examples repository. +# Integer values for runinput, varinput, etc. are listed at the top of the file +# and an example is given in the `main` function. + +# import listofnwmfilenames +# create_file_list( +# runinput, +# varinput, +# geoinput, +# meminput, +# start_date, +# end_date, +# fcst_cycle, +# ) + +""" +A set of test files can be generated downloading these files +wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f001.conus.nc +wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f002.conus.nc +wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f003.conus.nc +wget -P 03w -c https://nextgen-hydrofabric.s3.amazonaws.com/v1.2/nextgen_03W.gpkg +""" + + +def get_forcing_dict( + feature_index, + feature_list, + folder_prefix, + filelist, + var_list, +): + reng = "rasterio" + sum_stat = "mean" + + df_dict = {} + for _v in var_list: + df_dict[_v] = pd.DataFrame(index=feature_index) + + ds_list = [] + for _nc_file in filelist: + # _nc_file = ("nwm.t00z.medium_range.forcing.f001.conus.nc") + _full_nc_file = folder_prefix.joinpath(_nc_file) + ds_list.append(xr.open_dataset(_full_nc_file, engine=reng)) + + for _i, _nc_file in enumerate(filelist): + _xds = ds_list[_i] + print(f"{_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") + if 1 == 1: + for _v in var_list: + _src = _xds[_v] + _aff2 = _src.rio.transform() + _arr2 = _src.values[0] + + _df_zonal_stats = pd.DataFrame( + zonal_stats(feature_list, _arr2, affine=_aff2) + ) + # if adding statistics back to original GeoDataFrame + # gdf3 = pd.concat([gpkg_divides, _df_zonal_stats], axis=1) + df_dict[_v][_xds.time.values[0]] = _df_zonal_stats[sum_stat] + + [_xds.close() for _xds in ds_list] + + return df_dict + + +# TODO: Convert the output to CSV with something like +# `gdf3.to_csv` + + +def main(): + folder_prefix = Path("data") + list_of_files = [ + f"nwm.t12z.medium_range.forcing.f{_r:03}.conus.nc" for _r in range(1, 241) + ] + + # Read basin boundary file + 
f_03 = "03w/nextgen_03W.gpkg" + gpkg_divides = gpd.read_file(f_03, layer="divides") + var_list = [ + "U2D", + "V2D", + "LWDOWN", + "RAINRATE", + "T2D", + "Q2D", + "PSFC", + "SWDOWN", + ] + + # file_list = list_of_files[0:30] + # gpkg_subset = gpkg_divides[0:2000] + file_list = list_of_files[0:3] + gpkg_subset = gpkg_divides[0:200] + feature_list = gpkg_subset.geometry.to_list() + + # This way is extremely slow for anything more than a + # few files, so we comment it out of the test + + start_time = time.time() + print(f"Working on the old (slow) way") + fd1 = get_forcing_dict( + gpkg_subset.index, + feature_list, + folder_prefix, + file_list, + var_list, + ) + print(time.time() - start_time) + + start_time = time.time() + print(f"Working on the new way") + fd2 = get_forcing_dict_newway( + gpkg_subset.index, + feature_list, + folder_prefix, + file_list, + var_list, + ) + print(time.time() - start_time) + + start_time = time.time() + + print(f"Working on the new way with threading parallel.") + fd3t = get_forcing_dict_newway_parallel( + gpkg_subset.index, + feature_list, + folder_prefix, + file_list, + var_list, + para="thread", + para_n=16, + ) + print(time.time() - start_time) + + start_time = time.time() + print(f"Working on the new way with process parallel.") + fd3p = get_forcing_dict_newway_parallel( + gpkg_subset.index, + feature_list, + folder_prefix, + file_list, + var_list, + para="process", + para_n=16, + ) + print(time.time() - start_time) + start_time = time.time() + print(f"Working on the new way with loops reversed.") + fd4 = get_forcing_dict_newway_inverted( + gpkg_subset.index, + feature_list, + folder_prefix, + file_list, + var_list, + ) + print(time.time() - start_time) + + start_time = time.time() + print(f"Working on the new way with loops reversed with threading parallel.") + fd5t = get_forcing_dict_newway_inverted_parallel( + gpkg_subset.index, + feature_list, + folder_prefix, + file_list, + var_list, + para="thread", + para_n=16, + ) + print(time.time() - start_time) + start_time = time.time() + print(f"Working on the new way with loops reversed with process parallel.") + fd5p = get_forcing_dict_newway_inverted_parallel( + gpkg_subset.index, + feature_list, + folder_prefix, + file_list, + var_list, + para="process", + para_n=16, + ) + print(time.time() - start_time) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/user_input_ngen.json b/ngen_forcing/user_input_ngen.json similarity index 79% rename from user_input_ngen.json rename to ngen_forcing/user_input_ngen.json index c699e97..5059e3d 100644 --- a/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -2,6 +2,7 @@ "forcing" : { "start_date" : "20220822", "end_date" : "20220822", + "nwm_files" : "", "runinput" : 2, "varinput" : 5, "geoinput" : 1, @@ -13,10 +14,12 @@ "vpu" : "03W" }, - "verbose" : false, + "verbose" : true, "bucket_type" : "S3", "bucket_name" : "ciroh-devconf", "file_prefix" : "data/", "file_type" : "csv", - "cache" : true + "cache" : true, + "dl_threads" : 10 + } \ No newline at end of file diff --git a/user_input_ngen.md b/ngen_forcing/user_input_ngen.md similarity index 89% rename from user_input_ngen.md rename to ngen_forcing/user_input_ngen.md index 902e079..9e0cd45 100644 --- a/user_input_ngen.md +++ b/ngen_forcing/user_input_ngen.md @@ -9,6 +9,7 @@ contents: "forcing" : { "start_date" : "20220822", "end_date" : "20220822", + "nwm_files" : "", "runinput" : 2, "varinput" : 5, "geoinput" : 1, @@ -20,12 +21,14 @@ contents: "vpu" : "03W" }, - "verbose" : 
false, + "verbose" : true, "bucket_type" : "S3", "bucket_name" : "ciroh-devconf", "file_prefix" : "data/", "file_type" : "csv", - "cache" : true + "cache" : true, + "dl_threads" : 10 + } ### forcing @@ -33,13 +36,13 @@ contents: | --- | --- | --- | | start_date | `string` | YYYYMMDD | | end_date | `string` | YYYYMMDD | +| nwm_files | `string` | Path to a text file containing nwm file names. One filename per line. To have nwm forcing file names generated automatically, leave this option out of the config or set it to "". | | runinput | `int` |
    1. short_range
    2. medium_range
    3. medium_range_no_da
    4. long_range
    5. analysis_assim
    6. analysis_assim_extend
    7. analysis_assim_extend_no_da
    8. analysis_assim_long
    9. analysis_assim_long_no_da
    10. analysis_assim_no_da
    11. short_range_no_da
    | | varinput | `int` |
    1. channel_rt: for real-time channel data
    2. land: for land data
    3. reservoir: for reservoir data
    4. terrain_rt: for real-time terrain data
    5. forcing: for forcing data
    | | geoinput | `int` |
    1. conus: for continental US
    2. hawaii: for Hawaii
    3. puertorico: for Puerto Rico
    | | meminput | `int` |
    1. mem_1
    2. mem_2
    3. mem_3
    4. mem_4
    5. mem_5
    6. mem_6
    7. mem_7
    | | urlbaseinput | `int` |
    1. Empty string: use local files
    2. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod/: for real-time operational data from NOAA
    3. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/post-processed/WMS/: for post-processed data from NOAA's Web Map Service
    4. https://storage.googleapis.com/national-water-model/: for input/output data stored on Google Cloud Storage
    5. https://storage.cloud.google.com/national-water-model/: for input/output data stored on Google Cloud Storage
    6. gs://national-water-model/: for input/output data stored on Google Cloud Storage
    7. https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/: for retrospective data from AWS S3
    8. s3://noaa-nwm-retrospective-2-1-pds/model_output/: for retrospective data from AWS S3
    | - ### hydrofab | Field Name | Data Type | Description | | --- | --- | --- | @@ -54,3 +57,4 @@ contents: | file_prefix | `string` | If local, this is the relative path to the bucket_name folder. If S3, this is the relative path within the S3 bucket_name bucket to store files | | file_type | `string` |
    1. "csv" : write data as csv files/
    2. "parquet" : write data as parquet files
    | | cache | `bool` |
  • true: store forcing files locally
  • false: interact with forcing files remotely
  • | +| dl_threads | `int` | Number of threads to use while downloading | From 45fe379b16232723d444beecba7fc88f98b7d6e3 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 12:26:18 -0500 Subject: [PATCH 061/105] weights write to json, removed functions --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 165 +++----------------- 1 file changed, 19 insertions(+), 146 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 7a0a39d..0571d85 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -74,10 +74,6 @@ ROUTE_LINK_FILE = os.path.join(NWM_CACHE_DIR, "RouteLink_CONUS.nc") ROUTE_LINK_PARQUET = os.path.join(NWM_CACHE_DIR, "route_link_conus.parquet") -def parquet_to_gdf(parquet_filepath: str) -> gpd.GeoDataFrame: - gdf = gpd.read_parquet(parquet_filepath) - return gdf - def get_cache_dir(create: bool = True): if not os.path.exists(NWM_CACHE_DIR) and create: os.mkdir(NWM_CACHE_DIR) @@ -171,21 +167,15 @@ def generate_weights_file( print(f"{i}, {perc:.2f}%".ljust(40), end="\r") i += 1 - with open(weights_filepath, "wb") as f: - # TODO: This is a dict of ndarrays, which could be easily stored as a set of parquet files for safekeeping. - pickle.dump(crosswalk_dict, f) - -def add_zonalstats_to_gdf_weights( - gdf: gpd.GeoDataFrame, - src: xr.DataArray, - weights_filepath: str, -) -> gpd.GeoDataFrame: - """Calculates zonal stats and adds to GeoDataFrame""" - - df = calc_zonal_stats_weights(src, weights_filepath) - gdf_map = gdf.merge(df, left_on="huc10", right_on="catchment_id") + # with open(weights_filepath, "wb") as f: + # # TODO: This is a dict of ndarrays, which could be easily stored as a set of parquet files for safekeeping. + # pickle.dump(crosswalk_dict, f) - return gdf_map + # This block was taken from https://github.com/RTIInternational/hydro-evaluation/blob/dev-denno-4-1/src/evaluation/utils.py + # TODO: Perhaps import RTI's module, but just do this for now. + weights_json = json.dumps({k: [x.tolist() for x in v] for k, v in crosswalk_dict.items()}) + with open(weights_filepath, "w") as f: + f.write(weights_json) def get_blob(blob_name: str, bucket: str = NWM_BUCKET) -> bytes: @@ -213,8 +203,8 @@ def calc_zonal_stats_weights_new( # Open weights dict from pickle # This could probably be done once and passed as a reference. - with open(weights_filepath, "rb") as f: - crosswalk_dict = pickle.load(f) + with open(weights_filepath, "r") as f: + crosswalk_dict = json.load(f) nvar = src.shape[0] mean_dict = {} @@ -232,61 +222,8 @@ def calc_zonal_stats_weights_new( return mean_dict - -def calc_zonal_stats_weights( - src: xr.DataArray, - weights_filepath: str, -) -> pd.DataFrame: - """Calculates zonal stats""" - - # Open weights dict from pickle - # This could probably be done once and passed as a reference. 
- with open(weights_filepath, "rb") as f: - crosswalk_dict = pickle.load(f) - - r_array = src.values[0] - r_array[r_array == src.rio.nodata] = np.nan - - mean_dict = {} - for key, value in crosswalk_dict.items(): - mean_dict[key] = np.nanmean(r_array[value]) - - df = pd.DataFrame.from_dict(mean_dict, orient="index", columns=["value"]) - - df.reset_index(inplace=True, names="catchment_id") - - # This should not be needed, but without memory usage grows - del crosswalk_dict - del f - gc.collect() - - return df - -def get_forcing_dict_RTIway( - pickle_file, # This would be a Feature list for parallel calling -- - # if there is a stored weights file, we use it - # (checking for an optional flag to force re-creation of the weights...) - folder_prefix, - file_list, -): - - var = "RAINRATE" - reng = "rasterio" - filehandles = [ - xr.open_dataset(folder_prefix / f, engine=reng)[var] for f in file_list - ] - # filehandles = [get_dataset("data/" + f, use_cache=True) for f in file_list] - stats = [] - - for _i, f in enumerate(filehandles): - print(f"{_i}, {round(_i/len(file_list), 2)*100}".ljust(40), end="\r") - stats.append(calc_zonal_stats_weights(f, pickle_file)) - - [f.close() for f in filehandles] - return stats - def get_forcing_dict_JL( - pickle_file, + wgt_file, filelist, var_list, var_list_out @@ -301,7 +238,7 @@ def get_forcing_dict_JL( dtype=_xds['U2D'].dtype) for var_dx, jvar in enumerate(var_list): data_allvars[var_dx,:,:] = np.squeeze(_xds[jvar].values) - _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, pickle_file) + _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, wgt_file) df_by_t.append(_df_zonal_stats) print(f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}%", end="\r") @@ -317,60 +254,6 @@ def get_forcing_dict_JL( return dfs -def get_forcing_dict_RTIway2( - pickle_file, # This would be a Feature list for parallel calling -- - # if there is a stored weights file, we use it - # (checking for an optional flag to force re-creation of the weights...) - gpkg_divides, - folder_prefix, - filelist, - var_list, -): - t1=time.perf_counter() - reng = "rasterio" - pick_val = "value" - - df_dict = {} - dl_dict = {} - for _v in var_list: - df_dict[_v] = pd.DataFrame(index=gpkg_divides.index) - dl_dict[_v] = [] - - # ds_list = [] - for _i, _nc_file in enumerate(filelist): - # _nc_file = ("nwm.t00z.medium_range.forcing.f001.conus.nc") - _full_nc_file = folder_prefix.joinpath(_nc_file) - - try: - # with xr.open_dataset(_full_nc_file, engine=reng) as _xds: - with xr.open_dataset(_full_nc_file) as _xds: - # _xds = ds_list[_i] - # _xds.rio.write_crs(rasterio.crs.CRS.from_wkt(CONUS_NWM_WKT), inplace=True) - print(f"{_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") - for _v in var_list: - _src = _xds[_v] - _df_zonal_stats = calc_zonal_stats_weights(_src, pickle_file) - # if adding statistics back to original GeoDataFrame - # gdf3 = pd.concat([gpkg_divides, _df_zonal_stats], axis=1) - _df = pd.DataFrame(index=gpkg_divides.index) - _df[_xds.time.values[0]] = _df_zonal_stats[pick_val] - # TODO: This same line could add the new values directly - # to the same dictionary. But after adding about 100 of them, - # pandas starts to complain about degraded performance due to - # fragmentation of the dataframe. We tried it this was as a - # workaround, with the loop below to accomplish the concatenation. 
- dl_dict[_v].append(_df) - except: - print(f"No such file: {_full_nc_file}") - - for _v in var_list: - df_dict[_v] = pd.concat(dl_dict[_v], axis=1) - - # [_xds.close() for _xds in ds_list] - print(f"Indexing data and generating the dataframes (RTI) {time.perf_counter() - t1:.2f} s") - - return df_dict - def wget(cmd,name,semaphore=None): if not semaphore == None: semaphore.acquire() resp = os.system(cmd) @@ -399,9 +282,6 @@ def main(): parser.add_argument(dest="infile", type=str, help="A json containing user inputs to run ngen") args = parser.parse_args() - # Increase for more threads - dl_threads = 10 - # Take in user config conf = json.load(open(args.infile)) start_date = conf['forcing']['start_date'] @@ -435,7 +315,7 @@ def main(): # # Set paths and make directories if needed - top_dir = os.path.dirname(args.infile) + top_dir = Path(os.path.dirname(args.infile)).parent if not os.path.exists(CACHE_DIR): os.system(f'mkdir {CACHE_DIR}') if not os.path.exists(CACHE_DIR): @@ -497,9 +377,6 @@ def main(): command = f'wget -P {CACHE_DIR} -c {jfile}' cmds.append(command) fls.append(jfile) - - # TODO make this async! - # wget(command,jfile) threads = [] semaphore = threading.Semaphore(dl_threads) @@ -515,16 +392,12 @@ def main(): else: forcing_files = nwm_forcing_files # interacting with files remotely - print(f'SERIAL Time to dl files {time.perf_counter() - t0}') - - # Do we need a parquet file? - # parq_file = os.path.join(CACHE_DIR,"ng_03.parquet") - # polygonfile.to_parquet(parq_file) + print(f'Time to download files {time.perf_counter() - t0}') # Generate weight file only if one doesn't exist already # Very time consuming so we don't want to do this if we can avoid it - pkl_file = os.path.join(CACHE_DIR,"weights.pkl") - if not os.path.exists(pkl_file): + wgt_file = os.path.join(CACHE_DIR,"weights.json") + if not os.path.exists(wgt_file): # Search for geopackage that matches the requested VPU, if it exists gpkg = None for jfile in os.listdir(os.path.join(top_dir,'data')): @@ -544,10 +417,10 @@ def main(): print("Generating weights") t1 = time.perf_counter() - generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") + generate_weights_file(polygonfile, src, wgt_file, crosswalk_dict_key="id") print(f"Generating the weights took {time.perf_counter() - t1:.2f} s") else: - print(f"Not creating weight file! Delete this if you want to create a new one: {pkl_file}") + print(f"Not creating weight file! 
Delete this if you want to create a new one: {wgt_file}") var_list = [ "U2D", @@ -572,7 +445,7 @@ def main(): ] fd2 = get_forcing_dict_JL( - pkl_file, + wgt_file, forcing_files, var_list, var_list_out, From 3fdb6fa50a77d1b39c4f33f752a10bef481110d6 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 15:21:36 -0500 Subject: [PATCH 062/105] Applied black formatting --- ngen_forcing/defs.py | 3 +- ngen_forcing/denno.py | 97 +++--- ngen_forcing/prep_hydrofab_forcings_ngen.py | 316 ++++++++++-------- ngen_forcing/process_nwm_forcing_to_ngen.py | 2 +- .../test_process_nwm_forcing_to_ngen.py | 2 +- ngen_forcing/user_input_ngen.md | 6 +- 6 files changed, 229 insertions(+), 197 deletions(-) diff --git a/ngen_forcing/defs.py b/ngen_forcing/defs.py index 516fdbb..18175c8 100644 --- a/ngen_forcing/defs.py +++ b/ngen_forcing/defs.py @@ -1,5 +1,6 @@ import rasterio.mask as riomask + def polymask(dataset, invert=False, all_touched=False): def _polymask(poly): return riomask.raster_geometry_mask( @@ -22,4 +23,4 @@ def xr_read_window_time(ds, window, mask=None, idx=None, time=None): if mask is None: return idx, time, data else: - return idx, time, data.where(mask) \ No newline at end of file + return idx, time, data.where(mask) diff --git a/ngen_forcing/denno.py b/ngen_forcing/denno.py index 37eb829..bc6c369 100644 --- a/ngen_forcing/denno.py +++ b/ngen_forcing/denno.py @@ -1,4 +1,3 @@ - # https://github.com/jameshalgren/data-access-examples/blob/DONOTMERGE_VPU16/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb # !pip install --upgrade google-api-python-client @@ -20,7 +19,7 @@ from nwm_filenames.listofnwmfilenames import create_file_list -DATA_DIR = Path(Path.home(),"code","data") +DATA_DIR = Path(Path.home(), "code", "data") TEMPLATE_BLOB_NAME = ( "nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" @@ -73,6 +72,7 @@ def parquet_to_gdf(parquet_filepath: str) -> gpd.GeoDataFrame: gdf = gpd.read_parquet(parquet_filepath) return gdf + def get_cache_dir(create: bool = True): if not os.path.exists(NWM_CACHE_DIR) and create: os.mkdir(NWM_CACHE_DIR) @@ -80,9 +80,11 @@ def get_cache_dir(create: bool = True): raise NotADirectoryError return NWM_CACHE_DIR + def make_parent_dir(filepath): Path(filepath).parent.mkdir(parents=True, exist_ok=True) + def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: """Retrieve a blob from the data service as xarray.Dataset. Based largely on OWP HydroTools. @@ -131,7 +133,8 @@ def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: engine="h5netcdf", ) return ds - + + def generate_weights_file( gdf: gpd.GeoDataFrame, src: xr.DataArray, @@ -146,8 +149,8 @@ def generate_weights_file( # This is a probably a really poor performing way to do this # TODO: Consider vectorizing -- would require digging into the - # other end of these where we unpack the weights... - i = 0 + # other end of these where we unpack the weights... 
+ i = 0 for index, row in gdf_proj.iterrows(): geom_rasterize = rasterize( [(row["geometry"], 1)], @@ -161,17 +164,19 @@ def generate_weights_file( crosswalk_dict[row[crosswalk_dict_key]] = np.where(geom_rasterize == 1) else: crosswalk_dict[index] = np.where(geom_rasterize == 1) - + if i % 100 == 0: - perc = i/len(gdf_proj)*100 + perc = i / len(gdf_proj) * 100 print(f"{i}, {perc:.2f}%".ljust(40), end="\r") - if perc > 0.01: break + if perc > 0.01: + break i += 1 with open(weights_filepath, "wb") as f: # TODO: This is a dict of ndarrays, which could be easily stored as a set of parquet files for safekeeping. pickle.dump(crosswalk_dict, f) + def add_zonalstats_to_gdf_weights( gdf: gpd.GeoDataFrame, src: xr.DataArray, @@ -240,7 +245,6 @@ def get_forcing_dict_RTIway( folder_prefix, file_list, ): - var = "RAINRATE" reng = "rasterio" filehandles = [ @@ -312,7 +316,7 @@ def get_forcing_dict_RTIway2( def main(): """ - Primary function to retrieve hydrofabrics data and convert it into files that can be ingested into ngen. + Primary function to retrieve hydrofabrics data and convert it into files that can be ingested into ngen. Also, the forcing data is retrieved. Inputs: JSON config file specifying start_date, end_date, and vpu @@ -322,68 +326,70 @@ def main(): Will store files in the same folder as the JSON config to run this script """ parser = argparse.ArgumentParser() - parser.add_argument(dest="infile", type=str, help="A json containing user inputs to run ngen") + parser.add_argument( + dest="infile", type=str, help="A json containing user inputs to run ngen" + ) args = parser.parse_args() # Take in user config conf = json.load(open(args.infile)) - start_date = conf['forcing']['start_date'] - end_date = conf['forcing']['end_date'] - vpu = conf['hydrofab']['vpu'] + start_date = conf["forcing"]["start_date"] + end_date = conf["forcing"]["end_date"] + vpu = conf["hydrofab"]["vpu"] - top_dir = os.path.dirname(args.infile) - data_dir = os.path.join(top_dir,'forcing_data') + top_dir = os.path.dirname(args.infile) + data_dir = os.path.join(top_dir, "forcing_data") if not os.path.exists(data_dir): - os.system(f'mkdir {data_dir}') + os.system(f"mkdir {data_dir}") # Generate list of file names to retrieve for forcing data # Going to make assumptions here as to which forecasts we want # Check the dictionaries at the top of listofnwmfilenames for options - n = 6 # How rapidly we want our forecasts, I think 3 is the highest frequency - fcst_cycle = [n*x for x in range(24//n)] - lead_time = [x+1 for x in range(n)] + n = 6 # How rapidly we want our forecasts, I think 3 is the highest frequency + fcst_cycle = [n * x for x in range(24 // n)] + lead_time = [x + 1 for x in range(n)] # fcst_cycle = None # Retrieves a full day for each day within the range given. 
- runinput = 2 + runinput = 2 varinput = 5 geoinput = 1 meminput = 0 urlbaseinput = None - print(f'Creating list of file names to pull...') + print(f"Creating list of file names to pull...") nwm_forcing_files = create_file_list( - runinput, - varinput, - geoinput, - meminput, - start_date, - end_date, - fcst_cycle, - urlbaseinput, - lead_time, - ) - + runinput, + varinput, + geoinput, + meminput, + start_date, + end_date, + fcst_cycle, + urlbaseinput, + lead_time, + ) - print(f'Pulling files...') + print(f"Pulling files...") local_files = [] for jfile in nwm_forcing_files: - file_parts = jfile.split('/') - local_file = os.path.join(data_dir,file_parts[-1]) + file_parts = jfile.split("/") + local_file = os.path.join(data_dir, file_parts[-1]) local_files.append(local_file) - if os.path.exists(local_file): continue + if os.path.exists(local_file): + continue else: - command = f'wget -P {data_dir} -c https://storage.googleapis.com/national-water-model/{jfile}' - os.system(command) + command = f"wget -P {data_dir} -c https://storage.googleapis.com/national-water-model/{jfile}" + os.system(command) # Download dataset, read into df with geopandas - gpkg = os.path.join(DATA_DIR,'nextgen_03W.gpkg') - ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) - src = ds["RAINRATE"] + gpkg = os.path.join(DATA_DIR, "nextgen_03W.gpkg") + ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) + src = ds["RAINRATE"] # Why are we converting to paquet and then back into geopandas dataframe? polygonfile = gpd.read_file(gpkg, layer="divides") - parq_file = os.path.join(DATA_DIR,"ng_03.parquet") + parq_file = os.path.join(DATA_DIR, "ng_03.parquet") polygonfile.to_parquet(parq_file) - pkl_file = os.path.join(DATA_DIR,"weights.pkl") + pkl_file = os.path.join(DATA_DIR, "weights.pkl") generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") calc_zonal_stats_weights(src, pkl_file) @@ -459,7 +465,6 @@ def main(): for _i, cat_id in enumerate(gpkg_divides["id"]): print(f"mv cat16_{_i:07}.csv cat16_{cat_id}.csv") + if __name__ == "__main__": - main() - \ No newline at end of file diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 0571d85..4256e01 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -1,9 +1,10 @@ +# TODO NOTE a lot of this code is borrowed from https://github.com/RTIInternational/hydro-evaluation +# In the future, import this package # https://github.com/jameshalgren/data-access-examples/blob/DONOTMERGE_VPU16/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb # !pip install --upgrade google-api-python-client # !pip install --upgrade google-cloud-storage -import pickle import pandas as pd import argparse, os, json, sys import gc @@ -21,7 +22,7 @@ import threading -pkg_dir = Path(Path(os.path.dirname(__file__)).parent,'nwm_filenames') +pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "nwm_filenames") sys.path.append(str(pkg_dir)) from listofnwmfilenames import create_file_list @@ -48,8 +49,10 @@ # paths -#TODO Make CACHE_DIR configurable -CACHE_DIR = Path(pkg_dir.parent, "data", "raw_forcing_data") # Maybe this should have a date attached to the name +# TODO Make CACHE_DIR configurable +CACHE_DIR = Path( + pkg_dir.parent, "data", "raw_forcing_data" +) # Maybe this should have a date attached to the name NWM_CACHE_DIR = os.path.join(CACHE_DIR, "nwm") USGS_CACHE_DIR = os.path.join(CACHE_DIR, "usgs") @@ -74,6 +77,7 @@ ROUTE_LINK_FILE = os.path.join(NWM_CACHE_DIR, 
"RouteLink_CONUS.nc") ROUTE_LINK_PARQUET = os.path.join(NWM_CACHE_DIR, "route_link_conus.parquet") + def get_cache_dir(create: bool = True): if not os.path.exists(NWM_CACHE_DIR) and create: os.mkdir(NWM_CACHE_DIR) @@ -81,9 +85,11 @@ def get_cache_dir(create: bool = True): raise NotADirectoryError return NWM_CACHE_DIR + def make_parent_dir(filepath): Path(filepath).parent.mkdir(parents=True, exist_ok=True) + def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: """Retrieve a blob from the data service as xarray.Dataset. Based largely on OWP HydroTools. @@ -132,7 +138,10 @@ def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: engine="h5netcdf", ) return ds - + + +# TODO: Import this instead! +# Adapted from https://github.com/RTIInternational/hydro-evaluation/blob/dev-denno-4-1/src/evaluation/loading/generate_weights.py def generate_weights_file( gdf: gpd.GeoDataFrame, src: xr.DataArray, @@ -146,8 +155,8 @@ def generate_weights_file( crosswalk_dict = {} # This is a probably a really poor performing way to do this # TODO: Consider vectorizing -- would require digging into the - # other end of these where we unpack the weights... - i = 0 + # other end of these where we unpack the weights... + i = 0 for index, row in gdf_proj.iterrows(): geom_rasterize = rasterize( [(row["geometry"], 1)], @@ -161,19 +170,15 @@ def generate_weights_file( crosswalk_dict[row[crosswalk_dict_key]] = np.where(geom_rasterize == 1) else: crosswalk_dict[index] = np.where(geom_rasterize == 1) - + if i % 100 == 0: - perc = i/len(gdf_proj)*100 + perc = i / len(gdf_proj) * 100 print(f"{i}, {perc:.2f}%".ljust(40), end="\r") i += 1 - # with open(weights_filepath, "wb") as f: - # # TODO: This is a dict of ndarrays, which could be easily stored as a set of parquet files for safekeeping. - # pickle.dump(crosswalk_dict, f) - - # This block was taken from https://github.com/RTIInternational/hydro-evaluation/blob/dev-denno-4-1/src/evaluation/utils.py - # TODO: Perhaps import RTI's module, but just do this for now. 
- weights_json = json.dumps({k: [x.tolist() for x in v] for k, v in crosswalk_dict.items()}) + weights_json = json.dumps( + {k: [x.tolist() for x in v] for k, v in crosswalk_dict.items()} + ) with open(weights_filepath, "w") as f: f.write(weights_json) @@ -195,9 +200,10 @@ def get_blob(blob_name: str, bucket: str = NWM_BUCKET) -> bytes: bucket = client.bucket(bucket) return bucket.blob(blob_name).download_as_bytes(timeout=120) + def calc_zonal_stats_weights_new( src: np.ndarray, - weights_filepath: str, + weights_filepath: str, ) -> pd.DataFrame: """Calculates zonal stats""" @@ -209,11 +215,11 @@ def calc_zonal_stats_weights_new( nvar = src.shape[0] mean_dict = {} for key, value in crosswalk_dict.items(): - mean_dict[key] = np.zeros((nvar,),dtype=np.float64) - + mean_dict[key] = np.zeros((nvar,), dtype=np.float64) + mean_dict = {} for key, value in crosswalk_dict.items(): - mean_dict[key] = np.nanmean(src[:,value[0],value[1]],axis=1) + mean_dict[key] = np.nanmean(src[:, value[0], value[1]], axis=1) # This should not be needed, but without memory usage grows del crosswalk_dict @@ -222,51 +228,56 @@ def calc_zonal_stats_weights_new( return mean_dict -def get_forcing_dict_JL( - wgt_file, - filelist, - var_list, - var_list_out -): + +def get_forcing_dict_JL(wgt_file, filelist, var_list, var_list_out): t1 = time.perf_counter() - df_by_t = [] - for _i, _nc_file in enumerate(filelist): + df_by_t = [] + for _i, _nc_file in enumerate(filelist): with xr.open_dataset(_nc_file) as _xds: - shp = _xds['U2D'].shape + shp = _xds["U2D"].shape data_allvars = np.zeros( - shape=(len(var_list),shp[1],shp[2]), - dtype=_xds['U2D'].dtype) + shape=(len(var_list), shp[1], shp[2]), dtype=_xds["U2D"].dtype + ) for var_dx, jvar in enumerate(var_list): - data_allvars[var_dx,:,:] = np.squeeze(_xds[jvar].values) + data_allvars[var_dx, :, :] = np.squeeze(_xds[jvar].values) _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, wgt_file) df_by_t.append(_df_zonal_stats) - print(f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}%", end="\r") + print( + f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}%", + end="\r", + ) - print(f'Reformating and converting data into dataframe') + print(f"Reformating and converting data into dataframe") dfs = {} for jcat in list(df_by_t[0].keys()): data_catch = [] - for jt in range(len(df_by_t)): + for jt in range(len(df_by_t)): data_catch.append(df_by_t[jt][jcat]) - dfs[jcat] = pd.DataFrame(data_catch,columns = var_list_out) + dfs[jcat] = pd.DataFrame(data_catch, columns=var_list_out) - print(f"Indexing data and generating the dataframes (JL) {time.perf_counter() - t1:.2f}s") + print( + f"Indexing data and generating the dataframes (JL) {time.perf_counter() - t1:.2f}s" + ) return dfs -def wget(cmd,name,semaphore=None): - if not semaphore == None: semaphore.acquire() - resp = os.system(cmd) + +def wget(cmd, name, semaphore=None): + if not semaphore == None: + semaphore.acquire() + resp = os.system(cmd) if resp > 0: - raise Exception(f'\nwget failed! Tried: {name}\n') + raise Exception(f"\nwget failed! Tried: {name}\n") else: - print(f'Successful download of {name}') + print(f"Successful download of {name}") + + if not semaphore == None: + semaphore.release() - if not semaphore == None: semaphore.release() def main(): """ - Primary function to retrieve hydrofabrics data and convert it into files that can be ingested into ngen. 
+ Primary function to retrieve hydrofabrics data and convert it into files that can be ingested into ngen. Also, the forcing data is retrieved. Inputs: JSON config file specifying start_date, end_date, and vpu @@ -279,148 +290,158 @@ def main(): t00 = time.perf_counter() parser = argparse.ArgumentParser() - parser.add_argument(dest="infile", type=str, help="A json containing user inputs to run ngen") - args = parser.parse_args() + parser.add_argument( + dest="infile", type=str, help="A json containing user inputs to run ngen" + ) + args = parser.parse_args() # Take in user config conf = json.load(open(args.infile)) - start_date = conf['forcing']['start_date'] - end_date = conf['forcing']['end_date'] - if 'nwm_files' in conf['forcing']: - nwm_files = conf['forcing']['nwm_files'] + start_date = conf["forcing"]["start_date"] + end_date = conf["forcing"]["end_date"] + if "nwm_files" in conf["forcing"]: + nwm_files = conf["forcing"]["nwm_files"] else: nwm_files = "" - runinput = conf['forcing']['runinput'] - varinput = conf['forcing']['varinput'] - geoinput = conf['forcing']['geoinput'] - meminput = conf['forcing']['meminput'] - urlbaseinput = conf['forcing']['urlbaseinput'] - vpu = conf['hydrofab']['vpu'] - ii_verbose = conf['verbose'] - bucket_type = conf['bucket_type'] - bucket_name = conf['bucket_name'] - file_prefix = conf['file_prefix'] - file_type = conf['file_type'] - ii_cache = conf['cache'] - dl_threads = conf['dl_threads'] - - file_types = ['csv','parquet'] - assert file_type in file_types,f'{file_type} for file_type is not accepted! Accepted: {file_types}' - - bucket_types = ['local','S3'] - assert bucket_type in bucket_types,f'{bucket_type} for bucket_type is not accepted! Accepted: {bucket_types}' - + runinput = conf["forcing"]["runinput"] + varinput = conf["forcing"]["varinput"] + geoinput = conf["forcing"]["geoinput"] + meminput = conf["forcing"]["meminput"] + urlbaseinput = conf["forcing"]["urlbaseinput"] + vpu = conf["hydrofab"]["vpu"] + ii_verbose = conf["verbose"] + bucket_type = conf["bucket_type"] + bucket_name = conf["bucket_name"] + file_prefix = conf["file_prefix"] + file_type = conf["file_type"] + ii_cache = conf["cache"] + dl_threads = conf["dl_threads"] + + file_types = ["csv", "parquet"] + assert ( + file_type in file_types + ), f"{file_type} for file_type is not accepted! Accepted: {file_types}" + + bucket_types = ["local", "S3"] + assert ( + bucket_type in bucket_types + ), f"{bucket_type} for bucket_type is not accepted! Accepted: {bucket_types}" # TODO: Subsetting! 
# # Set paths and make directories if needed - top_dir = Path(os.path.dirname(args.infile)).parent + top_dir = Path(os.path.dirname(args.infile)).parent if not os.path.exists(CACHE_DIR): - os.system(f'mkdir {CACHE_DIR}') + os.system(f"mkdir {CACHE_DIR}") if not os.path.exists(CACHE_DIR): - raise Exception(f'Creating {CACHE_DIR} failed!') + raise Exception(f"Creating {CACHE_DIR} failed!") # Prep output directory if bucket_type == "local": - bucket_path = Path(top_dir,file_prefix,bucket_name) + bucket_path = Path(top_dir, file_prefix, bucket_name) if not os.path.exists(bucket_path): - os.system(f'mkdir {bucket_path}') + os.system(f"mkdir {bucket_path}") if not os.path.exists(bucket_path): - raise Exception(f'Creating {bucket_path} failed!') - elif bucket_type == 'S3': - s3 = boto3.client('s3') + raise Exception(f"Creating {bucket_path} failed!") + elif bucket_type == "S3": + s3 = boto3.client("s3") # Get nwm forcing file names if len(nwm_files) == 0: - print(f'Creating list of file names to pull...') + print(f"Creating list of file names to pull...") n = 6 - fcst_cycle = [n*x for x in range(24//n)] - lead_time = [x+1 for x in range(n)] + fcst_cycle = [n * x for x in range(24 // n)] + lead_time = [x + 1 for x in range(n)] nwm_forcing_files = create_file_list( - runinput, - varinput, - geoinput, - meminput, - start_date, - end_date, - fcst_cycle, - urlbaseinput, - lead_time, - ) + runinput, + varinput, + geoinput, + meminput, + start_date, + end_date, + fcst_cycle, + urlbaseinput, + lead_time, + ) else: - print(f'Reading list of file names from {nwm_files}...') + print(f"Reading list of file names from {nwm_files}...") nwm_forcing_files = [] - with open(nwm_files,'r') as f: + with open(nwm_files, "r") as f: for line in f: nwm_forcing_files.append(line) - # Download whole files and store locally if cache is true, + # Download whole files and store locally if cache is true, # otherwise index remotely and save catchment based forcings t0 = time.perf_counter() if ii_cache: - # Check to see if we have files cached, if not wget them + # Check to see if we have files cached, if not wget them local_files = [] cmds = [] - fls = [] + fls = [] for jfile in nwm_forcing_files: - if ii_verbose: print(f'Looking for {jfile}') + if ii_verbose: + print(f"Looking for {jfile}") file_parts = Path(jfile).parts - local_file = os.path.join(CACHE_DIR,file_parts[-1]) + local_file = os.path.join(CACHE_DIR, file_parts[-1]) local_files.append(local_file) - if os.path.exists(local_file): - if ii_verbose: print(f'Found and using raw forcing file {local_file}') + if os.path.exists(local_file): + if ii_verbose: + print(f"Found and using raw forcing file {local_file}") continue else: - if ii_verbose: print(f'Forcing file not found! Downloading {jfile}') - command = f'wget -P {CACHE_DIR} -c {jfile}' + if ii_verbose: + print(f"Forcing file not found! 
Downloading {jfile}") + command = f"wget -P {CACHE_DIR} -c {jfile}" cmds.append(command) fls.append(jfile) - threads = [] + threads = [] semaphore = threading.Semaphore(dl_threads) - for i,jcmd in enumerate(cmds): - t = threading.Thread(target = wget, args = [jcmd, fls[i],semaphore]) + for i, jcmd in enumerate(cmds): + t = threading.Thread(target=wget, args=[jcmd, fls[i], semaphore]) t.start() threads.append(t) for jt in threads: jt.join() - forcing_files = local_files # interacting with files locally + forcing_files = local_files # interacting with files locally else: - forcing_files = nwm_forcing_files # interacting with files remotely + forcing_files = nwm_forcing_files # interacting with files remotely - print(f'Time to download files {time.perf_counter() - t0}') + print(f"Time to download files {time.perf_counter() - t0}") # Generate weight file only if one doesn't exist already # Very time consuming so we don't want to do this if we can avoid it - wgt_file = os.path.join(CACHE_DIR,"weights.json") + wgt_file = os.path.join(CACHE_DIR, "weights.json") if not os.path.exists(wgt_file): # Search for geopackage that matches the requested VPU, if it exists gpkg = None - for jfile in os.listdir(os.path.join(top_dir,'data')): + for jfile in os.listdir(os.path.join(top_dir, "data")): if jfile.find(vpu) >= 0: - gpkg = Path(top_dir,"data",jfile) - print(f'Found and using geopackge file {gpkg}') + gpkg = Path(top_dir, "data", jfile) + print(f"Found and using geopackge file {gpkg}") if gpkg == None: - url = f'https://nextgen-hydrofabric.s3.amazonaws.com/05_nextgen/nextgen_{vpu}.gpkg' - command = f'wget -P {CACHE_DIR} -c {url}' - wget(command,url) + url = f"https://nextgen-hydrofabric.s3.amazonaws.com/05_nextgen/nextgen_{vpu}.gpkg" + command = f"wget -P {CACHE_DIR} -c {url}" + wget(command, url) - print(f'Opening {gpkg}...') + print(f"Opening {gpkg}...") polygonfile = gpd.read_file(gpkg, layer="divides") - ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) - src = ds["RAINRATE"] + ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) + src = ds["RAINRATE"] print("Generating weights") t1 = time.perf_counter() generate_weights_file(polygonfile, src, wgt_file, crosswalk_dict_key="id") print(f"Generating the weights took {time.perf_counter() - t1:.2f} s") else: - print(f"Not creating weight file! Delete this if you want to create a new one: {wgt_file}") + print( + f"Not creating weight file! Delete this if you want to create a new one: {wgt_file}" + ) var_list = [ "U2D", @@ -438,12 +459,12 @@ def main(): "VGRD_10maboveground", "DLWRF_surface", "APCP_surface", - "precip_rate", # BROKEN (Identical to APCP!) + "precip_rate", # BROKEN (Identical to APCP!) 
"TMP_2maboveground", "SPFH_2maboveground", "DSWRF_surface", ] - + fd2 = get_forcing_dict_JL( wgt_file, forcing_files, @@ -454,37 +475,42 @@ def main(): # Write CSVs to file t0 = time.perf_counter() write_int = 100 - for j, jcatch in enumerate(fd2.keys()): - df = fd2[jcatch] - splt = jcatch.split('-') - - if bucket_type == 'local': - if file_type == 'csv': - csvname = Path(bucket_path,f"cat{vpu}_{splt[1]}.csv") + for j, jcatch in enumerate(fd2.keys()): + df = fd2[jcatch] + splt = jcatch.split("-") + + if bucket_type == "local": + if file_type == "csv": + csvname = Path(bucket_path, f"cat{vpu}_{splt[1]}.csv") df.to_csv(csvname) - if file_type == 'parquet': - parq_file = Path(bucket_path,f"cat{vpu}_{splt[1]}.parquet") + if file_type == "parquet": + parq_file = Path(bucket_path, f"cat{vpu}_{splt[1]}.parquet") df.to_parquet(parq_file) - elif bucket_type == 'S3': - buf = BytesIO() - if file_type == 'parquet': - parq_file = f"cat{vpu}_{splt[1]}.parquet" - df.to_parquet(buf) - elif file_type == 'csv': - csvname = f"cat{vpu}_{splt[1]}.csv" + elif bucket_type == "S3": + buf = BytesIO() + if file_type == "parquet": + parq_file = f"cat{vpu}_{splt[1]}.parquet" + df.to_parquet(buf) + elif file_type == "csv": + csvname = f"cat{vpu}_{splt[1]}.csv" df.to_csv(buf, index=False) buf.seek(0) - key_name = f'{file_prefix}{csvname}' - s3.put_object(Bucket=bucket_name, Key=key_name, Body=buf.getvalue()) + key_name = f"{file_prefix}{csvname}" + s3.put_object(Bucket=bucket_name, Key=key_name, Body=buf.getvalue()) - if (j+1) % write_int == 0: - print(f"{j+1} files written out of {len(fd2)}, {(j+1)/len(fd2)*100:.2f}%", end="\r") + if (j + 1) % write_int == 0: + print( + f"{j+1} files written out of {len(fd2)}, {(j+1)/len(fd2)*100:.2f}%", + end="\r", + ) + + print(f"{file_type} write took {time.perf_counter() - t0:.2f} s\n") - print(f'{file_type} write took {time.perf_counter() - t0:.2f} s\n') + print( + f"\n\nDone! Catchment forcing files have been generated for VPU {vpu} in {bucket_type}\n\n" + ) + print(f"Total run time: {time.perf_counter() - t00:.2f} s") - print(f'\n\nDone! Catchment forcing files have been generated for VPU {vpu} in {bucket_type}\n\n') - print(f'Total run time: {time.perf_counter() - t00:.2f} s') if __name__ == "__main__": main() - \ No newline at end of file diff --git a/ngen_forcing/process_nwm_forcing_to_ngen.py b/ngen_forcing/process_nwm_forcing_to_ngen.py index 8957438..23c73ba 100644 --- a/ngen_forcing/process_nwm_forcing_to_ngen.py +++ b/ngen_forcing/process_nwm_forcing_to_ngen.py @@ -256,4 +256,4 @@ def get_forcing_dict_newway_inverted( # df_dict[var].loc[j, t] = s[var].mean() # [ds.close() for ds in ds_list] -# return df_dict \ No newline at end of file +# return df_dict diff --git a/ngen_forcing/test_process_nwm_forcing_to_ngen.py b/ngen_forcing/test_process_nwm_forcing_to_ngen.py index 545fe95..ed98587 100644 --- a/ngen_forcing/test_process_nwm_forcing_to_ngen.py +++ b/ngen_forcing/test_process_nwm_forcing_to_ngen.py @@ -208,4 +208,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/ngen_forcing/user_input_ngen.md b/ngen_forcing/user_input_ngen.md index 9e0cd45..8877637 100644 --- a/ngen_forcing/user_input_ngen.md +++ b/ngen_forcing/user_input_ngen.md @@ -36,7 +36,7 @@ contents: | --- | --- | --- | | start_date | `string` | YYYYMMDD | | end_date | `string` | YYYYMMDD | -| nwm_files | `string` | Path to a text file containing nwm file names. One filename per line. 
To have nwm forcing file names generated automatically, leave this option out of the config or set it to "". | +| nwm_files | `string` | Path to a text file containing nwm file names. One filename per line. To have nwm forcing file names generated automatically, leave this option out of the config or set it to "" | | runinput | `int` |
    1. short_range
    2. medium_range
    3. medium_range_no_da
    4. long_range
    5. analysis_assim
    6. analysis_assim_extend
    7. analysis_assim_extend_no_da
    8. analysis_assim_long
    9. analysis_assim_long_no_da
    10. analysis_assim_no_da
    11. short_range_no_da
    | | varinput | `int` |
    1. channel_rt: for real-time channel data
    2. land: for land data
    3. reservoir: for reservoir data
    4. terrain_rt: for real-time terrain data
    5. forcing: for forcing data
    | | geoinput | `int` |
    1. conus: for continental US
    2. hawaii: for Hawaii
    3. puertorico: for Puerto Rico
    | @@ -56,5 +56,5 @@ contents: | bucket_name | `string` | If local, this is the name of the folder the data will be placed in. If S3, this is the name of S3 bucket, which must exist already. | | file_prefix | `string` | If local, this is the relative path to the bucket_name folder. If S3, this is the relative path within the S3 bucket_name bucket to store files | | file_type | `string` |
    1. "csv" : write data as csv files/
    2. "parquet" : write data as parquet files
    | -| cache | `bool` |
  • true: store forcing files locally
  • false: interact with forcing files remotely
  • | -| dl_threads | `int` | Number of threads to use while downloading | +| cache | `bool` |
  • true: Store forcing files locally. Must specify dl_threads
  • false: Interact with forcing files remotely
  • | +| dl_threads | `int` | Number of threads to use while downloading. | From 3c0c7f57dc2a0635de3a0f224db02dd3711d0f15 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 15:23:27 -0500 Subject: [PATCH 063/105] block pycache in ngen_forcing --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index c6e39ed..125a2ec 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ data/* nwm_filenames/__pycache__/ subsetting/__pycache__/ +ngen_forcing/__pycache__/ venv/ \ No newline at end of file From b4dca394182b5431c19d4bc107774c39a379607c Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 15:24:33 -0500 Subject: [PATCH 064/105] removed pycache --- ngen_forcing/__pycache__/defs.cpython-311.pyc | Bin 1345 -> 0 bytes ngen_forcing/__pycache__/defs.cpython-38.pyc | Bin 938 -> 0 bytes .../process_nwm_forcing_to_ngen.cpython-311.pyc | Bin 6098 -> 0 bytes .../process_nwm_forcing_to_ngen.cpython-38.pyc | Bin 2977 -> 0 bytes ...t_process_nwm_forcing_to_ngen.cpython-311.pyc | Bin 6097 -> 0 bytes 5 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 ngen_forcing/__pycache__/defs.cpython-311.pyc delete mode 100644 ngen_forcing/__pycache__/defs.cpython-38.pyc delete mode 100644 ngen_forcing/__pycache__/process_nwm_forcing_to_ngen.cpython-311.pyc delete mode 100644 ngen_forcing/__pycache__/process_nwm_forcing_to_ngen.cpython-38.pyc delete mode 100644 ngen_forcing/__pycache__/test_process_nwm_forcing_to_ngen.cpython-311.pyc diff --git a/ngen_forcing/__pycache__/defs.cpython-311.pyc b/ngen_forcing/__pycache__/defs.cpython-311.pyc deleted file mode 100644 index c6a40066f7994994d36d7134d8f5ca0faff26564..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1345 zcmb7DPfHs?6rY*hO`i1MOlmOHmiDppoA>6uH}CCxZ$89g?Fi`U z<405wguY7Qia=I$o`P);QADwea(IqlL_<{D#B&-AQyq4lMyLV1LGcVqwtbUsnQ2E; z?hTN9NK5a6x0hli45GH{T19WgSz_ok5t|MO%PU#t=F{qEsqfj-{8tCB5u|j;9@u zva~d`Y{y|jSZvMC6i`jQ-4%a&3yL9)-!Stu-=b^ery4t@ZfFBa-h_tnzgEcxrNu5W5Q8}yM z;4u(Y_Ve@e3Elxx5V;D?yFirv688ZvE+AZ;Xa?N)S7=8ay;Z%3cY@Q+?SBrMJ>;VU z*ncM_(&{mTj&SCP1bS95H!xP07-!HuLPZc-&E~1M z8tAIex@w|oULx`9_dD0P#hFd5h9#G?D(fQd0hCZJ8rvD&9^HLVnS3)@nyQ7PCBJ6I z%h5A4zO%8tu|ITT4xXBW)yQC@0?io&cgd>{`vOl5i*8P9oi#bpyb0cUbh#4~`phB2<8!D{PQ*F(6sj#{DHz==9) Og)RdR%d;Cnh5rdLFes$} diff --git a/ngen_forcing/__pycache__/defs.cpython-38.pyc b/ngen_forcing/__pycache__/defs.cpython-38.pyc deleted file mode 100644 index e52dfc5c1af0db3166cdc12f95d47beb992e5322..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 938 zcmZuv&2AGh5VpPEWb;=6Efo?Rx$Gt3PE}Px$^|Kx!X-jlt#+Jb+f6naZ%KldTczZ} zJG95VlCPY21uihNX-Yth%8p=v9(yswHkRKLEQ#l zz0#vlZ;(j3+Y`%m+Q+o*DX~Ia$&D<sYP=Wg9S79GQ&(+e|XM=@n#aR{=`Asr0 z;BGV9$R$C7lpsPb%#Fnn9HexG&zU@uO zyLXk55qn7vrpF#poFvBD$SmXOJTta88=F})$}vf2;~toi?ao)O5%JUf4Hv;45ff5N zCA3hyetmhlcj%sFc@k&#y!#k~4RQ}P26R+#Dumc#m9LNsgxpOR?4PNM%%2JQ#K$(B z|Lw&)y0EW_mEum6@?KIG6o~5pPq`bVcPC4APb~#;DYkO3RZgeYWX@kq44P0o1=iIT z=?wCP)10BxT$dfN3S9;4Xo(S;7^0mk1G1ctkWCp#t?)U?iO-e?s?o`M1@R3Dj}ZL_ zh_6bo)TKYFfU>j$%R0lw4h13CI;`PSmY*@>NpfBaOQ(x;YLF-KU$n=6qm9^+{SF{E z6}})#sQUd-4rpu#gRnO2>_#iiyIhLuFXSYu4pcc-Q3wD1DN%4X&EY09=REV4({HP0 HqZ#}Lt|Z1L diff --git a/ngen_forcing/__pycache__/process_nwm_forcing_to_ngen.cpython-311.pyc b/ngen_forcing/__pycache__/process_nwm_forcing_to_ngen.cpython-311.pyc deleted file mode 100644 index 21bd3c010c14424f9de91c76acddb9af51ac64dc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6098 
zcmeHLTTC3+8J?NF&BE@oH)dJx5WCJM4q)R3+aV4ZY&XWHjd7fG6FOS$3}orvdSo+qF#GYJbN=(6+du#L&iReo-9W*kPrNStv74g)iiyf&-xi)MfN-4>D1lB=DLP80 zm?)F7L@hMtF-dF67PY19QM-}0Bpp!}@-0bcw1J_{P=fUuCD=sUTw}$sXrFUckU@qb1 zL`sY>x`j_@F6Hh!KT&F}TN()q!5olr%#K+=s zQIdFZA(qM}MX5hME2jCG42dVwv;A3;F-p@{Qk6Vj&hV(ymt7=IFl>MW1gQyNsrpy4 zvPD!wS+ln_L+>WY1M)g()zVO7;I0i@=1d88<=ycCC(W z({dAP-LBTm*%kX7wr4zZ)m6s^Xxwy}EzlH0$vw4-3Ty7a(qvA* z3%-)`=6pH7X}*0G3yN283!YhKv&q~}&MWV)=unh=peC2~6@MXE%d1IdUozV?_beAM zt%K8C3TqoRRw0;cR+{IUD=pMKzW%Gpaq>$Q4OshTC7^gWy*FX(x2xYuE~JDCbt_-f zH}^T$vh7)6r9}zX902^ME;ZLusjqkxZiXp)*2+7p^h%H5yGAL_ams9$rW6;1KL4c7 zl9(imB#|NbS>K{{yO^F$q{Y*>Y`Rk>v9ts+E2Ud!<5@|!l5AYkEpY;3Jd<4nbsGd{ zap5U*u=^24qDKP4^8in2HxygMw4l?AI=ui1tB^=Z_#dNqyc$?Q8AAttI!|5(xK7Q& zCo|3EXoW67diP@<%!kZMXo@0k(8CVJlNm|8#pq1mfX)b#gyZV&Cg|f&8-|k!Nseby z*%Mb^oE7D2@D&nqnNN#XuEZAmhLf3iEGeA;p`!d9b0n-Y3q*HivJmodGk{%1XXi4BbT%fxr(51UJg(a>$CC4+WW;dYg7c4V zrQ2g!s3ho2R?wZeA)O|$59v-ZJ)aUuOcr%pmL$@$ZX=m_(6c1Pv~Ekz%}X-DNi0_y zB_y_p(;b&}%hg0yXXT8POu&|-TXC}_=!mckVk`|7GVw^O?w%22@;niFV?A|OC5cl2 zg_)!v5FI~<(hmt&Z8mFo0(50t4W72!y7$k_{%z-&>h=dA?Q^)&Fl7ULg0k|+oPiAz&y09 zH{kGiV{UavoMd1&1>GjeF&X?8`YN@8k?uVo!%Otg9-cvXsEj+CG!9O+L~VH}7q>M2 z&=*+iDsLWfG~d+H*;24`ZBh$Hmd-zDY$>tL zC4c)`pXT3p-`{u7->3QqH2*-!?ZemQExG+$ZimbMm;(43VQG9TNHzJE&Xj^3TNGm( zr^}dUOD+5FxAfg>>DyRPTaIciNAvCy+w_6wea}Z^?PrDD-HXN0Ni}p*17ycEcC2W; zUwOP>%)@O_wC$(?dFBDvvX7bxdB-ZV+?a2C$aNK5D)*Acy|hI&*j{~% zu-uTR^Ft3<_o}bxIR@`X;`-=~c;TSh*{^l>tL^=Fd@47naf5fpz+749zUQ@jp4Yx$ zRL_{^8T&F=WG7X2Qe!7UD78L+*E`i zjj6)bV$)%@>9E#xIB$IrY`GCwj~52lqZ{1kU7vNSk>gtAxEg*%3%{ZUhqd5v-uWQZ zT?nb6h!%=$Q8wqR1}vY+kLKl4xZ~5X7LIK6XyJqTvk$m^g^L@$=beqbF=t^5IwLap0`wXL26&sC`| z9EE)55h$%nTbZybE*Q;o#vy^rwdXpvEp;j#N@pGQZLLq$QD07JuTtNAf`1R%+X}Qd zFiG&tVK82T?s}Tw`GK?};73F{5xNk%5zyrc`a0Q%fQOMsNGk~*MF>)gZ-vd07ts7c zfO-;3`XS@$kv%vH@VCn%kwxKIH1-bK8$iJOgGiVhLisSlAi@!Z?;;pC5J<)iT1tjc zI*#xP0N6P!U)1B!byZN1mv1z9N{R!F@#e9kuGu?y@HT)>&yaU8@k2~7dT>d?YV!5D97rX!2Qji3g`YGD{+kb;MdcRrT=<+O`Dh0Wv z^Cnhkvkd^RvJL#tutK2V_u2M)Z2P)Uc)i%sr?UMT+g~)^hu-k|0oB{9d3(1gyY09E z%U0mKhqn3d%SF%2+k7|jar)D=+J0~&rg8%sH*hDQaz~&T-us^6d!FID7gW!P<{9}y zD6$hOJE5@?Ae5T9wM%Q$g+p5N{(Qs3wtagp}J zdV%+Xp8X$F-bj<~tG^KK6T}&*94U-of>#OU7;Q%hZwht$0-Pdai$=`S?b%pbh)Ly; zg277nzhzt#pBLr#GJ^7E8OL`45CkcLq9}@bvFs>mI*poCa^18|Th({14c z7izJz$jl-a`B=@gy=FQ?-QrtL$ydl{*lSPjNAM}nv!raf6HjM)C@}lSz5vg&`|R#- zamV-T1TOvVJM611LjHw?i&q5-Hy{;%0%3&Fgru}bQ>$mCcF(3*XC+Qr=~Ys<=bEye zRC^xO+exige~U1ORqhj3;g!JxcL$9Vr?J8zq=;1i{fLOlB<)h*$tqMjm~zdpA${ z?jQjJVmHbe-(_Ge2&0G>MZm{lI!t)6n;r5j=;tDev%}q?$W3c@oSxSOG7s>bo#8}O z!8)eG0`aIRM>rSf*9K%fNN+$YG8iDEWkr$!Ro2u}HnY~r8C4Z#pIW7>+<`rC%BrlW zs>!?4>KV=)-Y`3-Zs{G8vZm~P^6?90-Pov~(Q-l6rk=67Ftc?=DLHyOCS^l8@NTE{ zWo=Mbgt@0wHI%%B8Xm{+CpX`K)0r>5dD{5u1GH@^wWmk69idqIMty8;PRjbq0Gg7XZF>IFWYE7-L6X-waU`;hu zw{Pu{ztN-DAk~ll!IoJ2K2f!OasuO54OlqK2ig;1A-RZif$Qr5)h^EtYP|-Ch z!mNPFPPKCw4GZmvVN_^45|EAZ;RIfH6GL)A<9#J2fZJserCz_6- z;IKF?@QJ%1LVxEME)WOOZAe820lMye>y4E(va0&gp;SfGz+8Q!l!!3DJ zz)sXPo{ds2LdkVyDB?`&ipWRso}KVaSCYY~kitd-=A+kz2qze_NNW3&c&I&@7fB5J zO*;q-fiZ(n@-PDnd9=Nu{XP%nNbtbSiEf-1aZu2hCya|=D0n{}>smhsBA6EMUMOZc zN5JF*FvhG1*eFdW+7Bd$jSrm;LLsyhjIJ?Vh346pK^6sQ2;S?DlH_r*%KCxnxfR^g zG}hI5SHz3hX&PvE4#gTg=cA$nFb|$}9fuc~*y?H|a)>6=l|qIR)*kfrKj(Y(8Pkez zE3Si(s$;EDk6LiojY95`TX&$YPP-O9+g6M2&^Fv%+M*5GggTFISxqYb2o|2g)^kwt z7+e4TKVu6J*MJ1bngi<7IiTtWP%Z$Z39y<0stSPe=79294FFYV3kFclZvoUIpryfl z094ZeswJ0ZWdKzB98k)ida9+C`#7015rXT}l{3<}u?GOR14s_we8ZryTV9gg!7AW<32?rl zE*S{^RW$(SJAmVExmjMGL-UHdtgZkwH&6kZU9~xf=9ggebJk|d7m&FD$m|>lAEw{X zQZylVQ?#(WhO&-=hnl#AvWapT1y_vNLU|tKA#Ppq9h5G}|42gdT{Q4Ll*h-o0igIk 
z)O`Wz3He@q#ufom+~Y!W_U#B1FQfHWP=0{2gR+Z)$GW(Rg2)p;M0phji7S4B@>7)8 zP!MY3XDB~M`31@z$}dq&FmRiSYbZZL`4!0abK(Z(Uq|^h3jQXDeUvv*Zlb(}@;1sl zD8E5@7v)2co{uLD{(u=?i34o-Eedi-{0`-Pln+2O?Tb5Dz(Yt|K?>Z%gyqMVdOx>hZ{3Lf%Q@z_dz$Xji+64pv5Ayquwn-R$i4^_WepMCcI zK@de{y!&}(zHh$yX7-zJelvgd`#lKK!rzZc#Xf}oLkrEsEobh33z;vGgd|2m3C8|s z6CC`r3eGqZ4yM6#CR{d;Pw?>1DejCX;mLRt-i$Bd%lH$1TMjFMOfV5#ZWBs`SR{w0 zx|*Fzj%y4;*Wqm>2Dy>syn+(lk_%{9?veNpS;`|mXc@i+x$6y4prDmGYriq~60Mzx zF_yQGRYgV6bWzu0*y>Nox-gj~$+Vghq;ygjRQb{+@v;@&fFa~WA}WfkY%GyhFUmxh zB`da(j;2IQ85oA&6!i<1Go4biL{2(dqd>?Dc{=^~e+xr}sb`7~i7h(QsOU2A7<%7T zBlPg`_qWYEm0?q1PkaGiXx@89o1+oBJwbQ`{bmT1wh(-AKx!}k@EuUx7ivn0#mI~ z&fCjpMQ6FDFv}lb_^0es+=_>^^)f%z>^omA?^&Qv7d6=b-_P^bMc6Rxu;=AW>5B5Jx!(pd{WqQ?l9&3&Xms=|YZVZ4vcSrpXZWtUv|hxyuhchn2LZC$pK{ z(ZcrC%RYWs$tFcbI|`Ym)H`6176np56}+?YT$M`aK23a*`1rjm@3kuLgA9s=EPg7R zR&#cY5OX=0NsFB(mM;rgfsR3wb;~WQ`HW0NUA7!Kl2&!gL9%%e@A(T69|=Ftp)n(h;amM}Eg^ppVked}X$Lsut|KN@jjl#*4iFYIh}Y5S}mcb8r2Hn?LnO|DXNU zr}r#;RP7%z`$xXrR_%YGA#po!ds7{G+_W1? z%g1Wr{yBE`P}hIc*M~D!+R5*vuPq_-nTr8C>Xsp;djmE56Sg#XC)V8Qk%9mnO(&_#jDmTZzV4>XG*&P8u5Ywy+^FtKuI(kt;}V zMMu~VI0<*g^VLzU$ViSNyTM+^O3oXu&h|wvg(Ut@Oc5K5QCA0DIn=UR!$?LXT*1{)byG1*G7O z&@~s#JiLum^gc2ByB^#A?se?%z-0ed&k3)we`P5?iqHF>*v$8=v7Cmz<+R>vUDtC0 zPt5+_$F_ex&k6oC_Dcv*Y2So}DHiHLl{CEBsPZYuETc*{1bhUcoB0Y$>^B@>M-MdT z3R?l*#r2VWg?L7m(tuU~C8gx}@&PeEX|a3v7u@k=R?Ta1HA!fYim`;Qcz`lw8D`p0 zgKUDzLL?)m)%Eb72wd%XAOJZK+v{8*wO2+f{7VbZ0%<*Uu0zD$AD8GJxJ0+P!h!W+ z?Ql}m_Z&F1_rQS>A)nK9B8z~(@*eWyutSw<<`lauvB~PEvk`Li59nGpy z{em0?WEdUPl(@swrS%J-2Mm{}(&pJaZaEv3OCjhKfIg-i z%SoxA7WO<$+cJ=E)8$&ODhxfWos!Mwv?!6`Y_G{uym0){g*O~VVf1m8tg${8AoNTN zK%aW5#ib~Ur}NbS(tJvW1CLHFTP`@QP%KV%fjD`F5(<4ShYjfo1t)|q6+)o~p*vG7 zME1a+<)_Vr?4+$xv+$&_cfaMM4g?t9_8&4#<=hmyH;}y7<~Cf7?50c)+_(^3pyvsT zo0LW<*$29${klzawBmW1eZj7HA*a!AyBfwfP&}rqill%qz^g@J5tPtkPh_s|^YPj7 z($DK`bGa;@c>TMP@5Fz;`>%KHHFS@zH2|u;PLT2--xyOGTMYNj9+~f1_+=%0uo^yS zh7Xp;>#REjYqCj7r|O$f*OobPo|#LUp=V34)w;LLeKa4Q%bVRhN^jJ{Tg>otx5LA? 
z!o$_@h#4L!jsM_5-F>qIpKqSsTzY*GyDOeuRXk+kp$Z;a^lYl}VUr)I@jW%Zx5jtX z_|QGC(+jo%DINbIfE=FRZ=Z>MHZ(Id=c#(0t>SGa-d4fe?soNdXx&!D+fBT^g10Yv zp7|-X-leu4uHqvmK2pI)7QI7jwbuN-wP060$ori45RmUFDUJQm{m5p&RK+iw_~i?h&9-LOA>#`1<88B#d6WR3?=3iLFChr_rfUz!4h9eS!!^)N9KybG;TAVg;Rb4$FO7YC^2*7Q{i_3S6I9#t^FK8$ Bf^Yx; From 66e40e3ed1b0d751b25e0adf94d24ab4bc0114e7 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 15:26:25 -0500 Subject: [PATCH 065/105] removed raw script --- ngen_forcing/denno.py | 470 ------------------------------------------ 1 file changed, 470 deletions(-) delete mode 100644 ngen_forcing/denno.py diff --git a/ngen_forcing/denno.py b/ngen_forcing/denno.py deleted file mode 100644 index bc6c369..0000000 --- a/ngen_forcing/denno.py +++ /dev/null @@ -1,470 +0,0 @@ -# https://github.com/jameshalgren/data-access-examples/blob/DONOTMERGE_VPU16/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb - -# !pip install --upgrade google-api-python-client -# !pip install --upgrade google-cloud-storage - -import pickle -import time -import pandas as pd -import argparse, os, json -import gc -from pathlib import Path -import geopandas as gpd -import pandas as pd -import numpy as np -import xarray as xr -from google.cloud import storage -from rasterio.io import MemoryFile -from rasterio.features import rasterize - -from nwm_filenames.listofnwmfilenames import create_file_list - -DATA_DIR = Path(Path.home(), "code", "data") - -TEMPLATE_BLOB_NAME = ( - "nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" -) -NWM_BUCKET = "national-water-model" - -# WKT strings extracted from NWM grids -CONUS_NWM_WKT = 'PROJCS["Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]], \ -PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ -PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-97.0],PARAMETER["standard_parallel_1",30.0],\ -PARAMETER["standard_parallel_2",60.0],PARAMETER["latitude_of_origin",40.0],UNIT["Meter",1.0]]' - -HI_NWM_WKT = 'PROJCS["Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]],\ -PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ -PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-157.42],PARAMETER["standard_parallel_1",10.0],\ -PARAMETER["standard_parallel_2",30.0],PARAMETER["latitude_of_origin",20.6],UNIT["Meter",1.0]]' - -PR_NWM_WKT = 'PROJCS["Sphere_Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]],\ -PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ -PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-65.91],PARAMETER["standard_parallel_1",18.1],\ -PARAMETER["standard_parallel_2",18.1],PARAMETER["latitude_of_origin",18.1],UNIT["Meter",1.0]]' - -# paths -CACHE_DIR = Path(Path.home(), "code", "data_access_examples", "forcing_data") -NWM_CACHE_DIR = os.path.join(CACHE_DIR, "nwm") -USGS_CACHE_DIR = os.path.join(CACHE_DIR, "usgs") -GEO_CACHE_DIR = os.path.join(CACHE_DIR, "geo") - -NWM_CACHE_H5 = os.path.join(NWM_CACHE_DIR, "gcp_client.h5") - -PARQUET_CACHE_DIR = os.path.join(CACHE_DIR, "parquet") -MEDIUM_RANGE_FORCING_PARQUET = os.path.join(PARQUET_CACHE_DIR, "forcing_medium_range") -FORCING_ANALYSIS_ASSIM_PARQUET = os.path.join( - PARQUET_CACHE_DIR, 
"forcing_analysis_assim" -) -MEDIUM_RANGE_PARQUET = os.path.join(PARQUET_CACHE_DIR, "medium_range") -USGS_PARQUET = os.path.join(PARQUET_CACHE_DIR, "usgs") - -HUC10_SHP_FILEPATH = os.path.join(GEO_CACHE_DIR, "wbdhu10_conus.shp") -HUC10_PARQUET_FILEPATH = os.path.join(GEO_CACHE_DIR, "wbdhu10_conus.parquet") -HUC10_MEDIUM_RANGE_WEIGHTS_FILEPATH = os.path.join( - GEO_CACHE_DIR, "wbdhu10_medium_range_weights.pkl" -) - -ROUTE_LINK_FILE = os.path.join(NWM_CACHE_DIR, "RouteLink_CONUS.nc") -ROUTE_LINK_PARQUET = os.path.join(NWM_CACHE_DIR, "route_link_conus.parquet") - - -def parquet_to_gdf(parquet_filepath: str) -> gpd.GeoDataFrame: - gdf = gpd.read_parquet(parquet_filepath) - return gdf - - -def get_cache_dir(create: bool = True): - if not os.path.exists(NWM_CACHE_DIR) and create: - os.mkdir(NWM_CACHE_DIR) - if not os.path.exists(NWM_CACHE_DIR): - raise NotADirectoryError - return NWM_CACHE_DIR - - -def make_parent_dir(filepath): - Path(filepath).parent.mkdir(parents=True, exist_ok=True) - - -def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: - """Retrieve a blob from the data service as xarray.Dataset. - Based largely on OWP HydroTools. - Parameters - ---------- - blob_name: str, required - Name of blob to retrieve. - use_cache: bool, default True - If cache should be used. - If True, checks to see if file is in cache, and - If fetched from remote, will save to cache. - Returns - ------- - ds : xarray.Dataset - The data stored in the blob. - """ - # TODO: Check to see if this does any better than kerchunk - # the caching should help, but probably needs to be managed to function asynchronously. - # Perhaps if the files is not cached, we can create the dataset from - # kerchunk with a remote path and then asynchronously do a download to cache it - # for next time. The hypothesis would be that the download speed will not be any slower than - # just accessing the file remotely. - nc_filepath = os.path.join(get_cache_dir(), blob_name) - make_parent_dir(nc_filepath) - - # If the file exists and use_cache = True - if os.path.exists(nc_filepath) and use_cache: - # Get dataset from cache - ds = xr.load_dataset( - nc_filepath, - engine="h5netcdf", - ) - return ds - else: - # Get raw bytes - raw_bytes = get_blob(blob_name) - # Create Dataset - ds = xr.load_dataset( - MemoryFile(raw_bytes), - engine="h5netcdf", - ) - if use_cache: - # Subset and cache - ds["RAINRATE"].to_netcdf( - nc_filepath, - engine="h5netcdf", - ) - return ds - - -def generate_weights_file( - gdf: gpd.GeoDataFrame, - src: xr.DataArray, - weights_filepath: str, - crosswalk_dict_key: str, -): - """Generate a weights file.""" - - gdf_proj = gdf.to_crs(CONUS_NWM_WKT) - - crosswalk_dict = {} - - # This is a probably a really poor performing way to do this - # TODO: Consider vectorizing -- would require digging into the - # other end of these where we unpack the weights... 
- i = 0 - for index, row in gdf_proj.iterrows(): - geom_rasterize = rasterize( - [(row["geometry"], 1)], - out_shape=src.rio.shape, - transform=src.rio.transform(), - all_touched=True, - fill=0, # IS FILL 0 - dtype="uint8", - ) - if crosswalk_dict_key: - crosswalk_dict[row[crosswalk_dict_key]] = np.where(geom_rasterize == 1) - else: - crosswalk_dict[index] = np.where(geom_rasterize == 1) - - if i % 100 == 0: - perc = i / len(gdf_proj) * 100 - print(f"{i}, {perc:.2f}%".ljust(40), end="\r") - if perc > 0.01: - break - i += 1 - - with open(weights_filepath, "wb") as f: - # TODO: This is a dict of ndarrays, which could be easily stored as a set of parquet files for safekeeping. - pickle.dump(crosswalk_dict, f) - - -def add_zonalstats_to_gdf_weights( - gdf: gpd.GeoDataFrame, - src: xr.DataArray, - weights_filepath: str, -) -> gpd.GeoDataFrame: - """Calculates zonal stats and adds to GeoDataFrame""" - - df = calc_zonal_stats_weights(src, weights_filepath) - gdf_map = gdf.merge(df, left_on="huc10", right_on="catchment_id") - - return gdf_map - - -def get_blob(blob_name: str, bucket: str = NWM_BUCKET) -> bytes: - """Retrieve a blob from the data service as bytes. - Based largely on OWP HydroTools. - Parameters - ---------- - blob_name : str, required - Name of blob to retrieve. - Returns - ------- - data : bytes - The data stored in the blob. - """ - # Setup anonymous client and retrieve blob data - client = storage.Client.create_anonymous_client() - bucket = client.bucket(bucket) - return bucket.blob(blob_name).download_as_bytes(timeout=120) - - -def calc_zonal_stats_weights( - src: xr.DataArray, - weights_filepath: str, -) -> pd.DataFrame: - """Calculates zonal stats""" - - # Open weights dict from pickle - # This could probably be done once and passed as a reference. - with open(weights_filepath, "rb") as f: - crosswalk_dict = pickle.load(f) - - r_array = src.values[0] - r_array[r_array == src.rio.nodata] = np.nan - - mean_dict = {} - for key, value in crosswalk_dict.items(): - mean_dict[key] = np.nanmean(r_array[value]) - - df = pd.DataFrame.from_dict(mean_dict, orient="index", columns=["value"]) - - df.reset_index(inplace=True, names="catchment_id") - - # This should not be needed, but without memory usage grows - del crosswalk_dict - del f - gc.collect() - - return df - - -def get_forcing_dict_RTIway( - pickle_file, # This would be a Feature list for parallel calling -- - # if there is a stored weights file, we use it - # (checking for an optional flag to force re-creation of the weights...) - folder_prefix, - file_list, -): - var = "RAINRATE" - reng = "rasterio" - filehandles = [ - xr.open_dataset(folder_prefix / f, engine=reng)[var] for f in file_list - ] - # filehandles = [get_dataset("data/" + f, use_cache=True) for f in file_list] - stats = [] - - for _i, f in enumerate(filehandles): - print(f"{_i}, {round(_i/len(file_list), 2)*100}".ljust(40), end="\r") - stats.append(calc_zonal_stats_weights(f, pickle_file)) - - [f.close() for f in filehandles] - return stats - - -def get_forcing_dict_RTIway2( - pickle_file, # This would be a Feature list for parallel calling -- - # if there is a stored weights file, we use it - # (checking for an optional flag to force re-creation of the weights...) 
- gpkg_divides, - folder_prefix, - filelist, - var_list, -): - reng = "rasterio" - pick_val = "value" - - df_dict = {} - dl_dict = {} - for _v in var_list: - df_dict[_v] = pd.DataFrame(index=gpkg_divides.index) - dl_dict[_v] = [] - - # ds_list = [] - for _i, _nc_file in enumerate(filelist): - # _nc_file = ("nwm.t00z.medium_range.forcing.f001.conus.nc") - _full_nc_file = folder_prefix.joinpath(_nc_file) - - try: - # with xr.open_dataset(_full_nc_file, engine=reng) as _xds: - with xr.open_dataset(_full_nc_file) as _xds: - # _xds = ds_list[_i] - # _xds.rio.write_crs(rasterio.crs.CRS.from_wkt(CONUS_NWM_WKT), inplace=True) - print(f"{_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") - for _v in var_list: - _src = _xds[_v] - _df_zonal_stats = calc_zonal_stats_weights(_src, pickle_file) - # if adding statistics back to original GeoDataFrame - # gdf3 = pd.concat([gpkg_divides, _df_zonal_stats], axis=1) - _df = pd.DataFrame(index=gpkg_divides.index) - _df[_xds.time.values[0]] = _df_zonal_stats[pick_val] - # TODO: This same line could add the new values directly - # to the same dictionary. But after adding about 100 of them, - # pandas starts to complain about degraded performance due to - # fragmentation of the dataframe. We tried it this was as a - # workaround, with the loop below to accomplish the concatenation. - dl_dict[_v].append(_df) - except: - print(f"No such file: {_full_nc_file}") - - for _v in var_list: - df_dict[_v] = pd.concat(dl_dict[_v], axis=1) - - # [_xds.close() for _xds in ds_list] - - return df_dict - - -def main(): - """ - Primary function to retrieve hydrofabrics data and convert it into files that can be ingested into ngen. - Also, the forcing data is retrieved. - - Inputs: JSON config file specifying start_date, end_date, and vpu - - Outputs: ngen catchment/nexus configs and forcing files - - Will store files in the same folder as the JSON config to run this script - """ - parser = argparse.ArgumentParser() - parser.add_argument( - dest="infile", type=str, help="A json containing user inputs to run ngen" - ) - args = parser.parse_args() - - # Take in user config - conf = json.load(open(args.infile)) - start_date = conf["forcing"]["start_date"] - end_date = conf["forcing"]["end_date"] - vpu = conf["hydrofab"]["vpu"] - - top_dir = os.path.dirname(args.infile) - data_dir = os.path.join(top_dir, "forcing_data") - if not os.path.exists(data_dir): - os.system(f"mkdir {data_dir}") - - # Generate list of file names to retrieve for forcing data - # Going to make assumptions here as to which forecasts we want - # Check the dictionaries at the top of listofnwmfilenames for options - n = 6 # How rapidly we want our forecasts, I think 3 is the highest frequency - fcst_cycle = [n * x for x in range(24 // n)] - lead_time = [x + 1 for x in range(n)] - # fcst_cycle = None # Retrieves a full day for each day within the range given. 
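# --- Illustrative aside (not part of the patches): what the cycle/lead-time lists hold ---
# With n = 6, the two comprehensions above evaluate to four daily forecast cycles and six
# lead times, which is what gets handed to create_file_list:
n = 6
fcst_cycle = [n * x for x in range(24 // n)]  # [0, 6, 12, 18]
lead_time = [x + 1 for x in range(n)]         # [1, 2, 3, 4, 5, 6]
assert fcst_cycle == [0, 6, 12, 18] and lead_time == [1, 2, 3, 4, 5, 6]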
- runinput = 2 - varinput = 5 - geoinput = 1 - meminput = 0 - urlbaseinput = None - - print(f"Creating list of file names to pull...") - nwm_forcing_files = create_file_list( - runinput, - varinput, - geoinput, - meminput, - start_date, - end_date, - fcst_cycle, - urlbaseinput, - lead_time, - ) - - print(f"Pulling files...") - local_files = [] - for jfile in nwm_forcing_files: - file_parts = jfile.split("/") - local_file = os.path.join(data_dir, file_parts[-1]) - local_files.append(local_file) - if os.path.exists(local_file): - continue - else: - command = f"wget -P {data_dir} -c https://storage.googleapis.com/national-water-model/{jfile}" - os.system(command) - - # Download dataset, read into df with geopandas - gpkg = os.path.join(DATA_DIR, "nextgen_03W.gpkg") - ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) - src = ds["RAINRATE"] - - # Why are we converting to paquet and then back into geopandas dataframe? - polygonfile = gpd.read_file(gpkg, layer="divides") - parq_file = os.path.join(DATA_DIR, "ng_03.parquet") - polygonfile.to_parquet(parq_file) - pkl_file = os.path.join(DATA_DIR, "weights.pkl") - generate_weights_file(polygonfile, src, pkl_file, crosswalk_dict_key="id") - calc_zonal_stats_weights(src, pkl_file) - - folder_prefix = DATA_DIR - - var_list = [ - "U2D", - "V2D", - "LWDOWN", - "RAINRATE", - "T2D", - "Q2D", - "PSFC", - "SWDOWN", - ] - - var_list - start_time = time.time() - print(f"Working on the new way") - fd2 = get_forcing_dict_RTIway2( - pkl_file, - polygonfile, - folder_prefix, - nwm_forcing_files, - var_list, - ) - print(time.time() - start_time) - - fd2["U2D"] - pcp_var.transpose()[0] - pcp_var = fd2["RAINRATE"] - lw_var = fd2["LWDOWN"] - sw_var = fd2["SWDOWN"] - sp_var = fd2["PSFC"] - tmp_var = fd2["T2D"] - u2d_var = fd2["U2D"] - v2d_var = fd2["V2D"] - pcp_var2 = fd2["RAINRATE"] - - for _i in range(0, 40000): - # _i = 0 - try: - pcp_var_0 = pcp_var.transpose()[_i].rename("APCP_surface") - lw_var_0 = lw_var.transpose()[_i].rename("DLWRF_surface") - sw_var_0 = sw_var.transpose()[_i].rename("DSWRF_surface") - sp_var_0 = sp_var.transpose()[_i].rename("SPFH_2maboveground") - tmp_var_0 = tmp_var.transpose()[_i].rename("TMP_2maboveground") - u2d_var_0 = u2d_var.transpose()[_i].rename("UGRD_10maboveground") - v2d_var_0 = v2d_var.transpose()[_i].rename("VGRD_10maboveground") - pcp_var2_0 = pcp_var2.transpose()[_i].rename("precip_rate") ##BROKEN!! - - d = pd.concat( - [ - pcp_var_0, - lw_var_0, - sw_var_0, - sp_var_0, - tmp_var_0, - u2d_var_0, - v2d_var_0, - pcp_var2_0, - ], - axis=1, - ) - d.index.name = "time" - - d.to_csv(f"input_data/cat16_{_i:07}.csv") - except: - print(f"no data for watershed {_i}", end="\t") - - ## Make a shell script string to rename the csvs... 
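# --- Illustrative aside (not part of the patches): shape of the per-catchment CSV ---
# The write-out loop above concatenates one renamed, time-indexed column per forcing
# variable before calling to_csv. A toy version with made-up values and timestamps,
# reusing two of the output column names from that loop (APCP_surface, TMP_2maboveground):
import pandas as pd

times = pd.to_datetime(["2022-08-22 00:00", "2022-08-22 01:00", "2022-08-22 02:00"])
rain = pd.Series([0.0, 0.1, 0.2], index=times, name="APCP_surface")
temp = pd.Series([290.0, 291.0, 292.0], index=times, name="TMP_2maboveground")

d = pd.concat([rain, temp], axis=1)
d.index.name = "time"
print(d)  # one row per timestep, one column per variable, ready for d.to_csv(...)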
- gpkg_divides["id"] - for _i, cat_id in enumerate(gpkg_divides["id"]): - print(f"mv cat16_{_i:07}.csv cat16_{cat_id}.csv") - - -if __name__ == "__main__": - main() From 007c1aa8afd1ad12ba933feaefcdc64d5d144a4f Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 15:29:41 -0500 Subject: [PATCH 066/105] Removed alternates for now --- ngen_forcing/defs.py | 26 -- ngen_forcing/process_nwm_forcing_to_ngen.py | 259 ------------------ .../test_process_nwm_forcing_to_ngen.py | 211 -------------- 3 files changed, 496 deletions(-) delete mode 100644 ngen_forcing/defs.py delete mode 100644 ngen_forcing/process_nwm_forcing_to_ngen.py delete mode 100644 ngen_forcing/test_process_nwm_forcing_to_ngen.py diff --git a/ngen_forcing/defs.py b/ngen_forcing/defs.py deleted file mode 100644 index 18175c8..0000000 --- a/ngen_forcing/defs.py +++ /dev/null @@ -1,26 +0,0 @@ -import rasterio.mask as riomask - - -def polymask(dataset, invert=False, all_touched=False): - def _polymask(poly): - return riomask.raster_geometry_mask( - dataset, [poly], invert=invert, all_touched=all_touched, crop=True - ) - - return _polymask - - -def xr_read_window(ds, window, mask=None): - data = ds.isel(window) - if mask is None: - return data - else: - return data.where(mask) - - -def xr_read_window_time(ds, window, mask=None, idx=None, time=None): - data = ds.isel(window) - if mask is None: - return idx, time, data - else: - return idx, time, data.where(mask) diff --git a/ngen_forcing/process_nwm_forcing_to_ngen.py b/ngen_forcing/process_nwm_forcing_to_ngen.py deleted file mode 100644 index 23c73ba..0000000 --- a/ngen_forcing/process_nwm_forcing_to_ngen.py +++ /dev/null @@ -1,259 +0,0 @@ -from ngen_forcing.defs import xr_read_window, polymask, xr_read_window_time -from rasterio import _io, windows -import xarray as xr -import pandas as pd - - -class MemoryDataset(_io.MemoryDataset, windows.WindowMethodsMixin): - pass - - -def get_forcing_dict_newway( - feature_index, - feature_list, - folder_prefix, - file_list, - var_list, -): - reng = "rasterio" - - _xds_dummy = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) - _template_arr = _xds_dummy.U2D.values - _u2d = MemoryDataset( - _template_arr, - transform=_xds_dummy.U2D.rio.transform(), - gcps=None, - rpcs=None, - crs=None, - copy=False, - ) - - # Open .nc files ahead of time - ds_list = [] - for _nc_file in file_list: - _full_nc_file = folder_prefix.joinpath(_nc_file) - ds_list.append(xr.open_dataset(_full_nc_file, engine=reng)) - - df_dict = {} - for _v in var_list: - df_dict[_v] = pd.DataFrame(index=feature_index) - - for i, feature in enumerate(feature_list): - print(f"{i}, {round(i/len(feature_list), 5)*100}".ljust(40), end="\r") - mask, _, window = polymask(_u2d)(feature) - mask = xr.DataArray(mask, dims=["y", "x"]) - winslices = dict(zip(["y", "x"], window.toslices())) - for j, _xds in enumerate(ds_list): - time_value = _xds.time.values[0] - cropped = xr_read_window(_xds, winslices, mask=mask) - stats = cropped.mean() - for var in var_list: - df_dict[var].loc[i, time_value] = stats[var] - - [ds.close() for ds in ds_list] - return df_dict - - -# def get_forcing_dict_newway_parallel( -# feature_index, -# feature_list, -# folder_prefix, -# file_list, -# var_list, -# para="thread", -# para_n=2, -# ): - -# import concurrent.futures - -# reng = "rasterio" -# _xds = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) -# _template_arr = _xds.U2D.values -# _u2d = MemoryDataset( -# _template_arr, -# transform=_xds.U2D.rio.transform(), -# 
gcps=None, -# rpcs=None, -# crs=None, -# copy=False, -# ) -# ds_list = [xr.open_dataset(folder_prefix.joinpath(f)) for f in file_list] -# # ds_list = [xr.open_dataset(folder_prefix.joinpath(f), engine=reng) for f in file_list] -# # TODO: figure out why using the rasterio engine DOES NOT WORK with parallel -# # TODO: figure out why NOT using the rasterio engine produces a different result - -# if para == "process": -# pool = concurrent.futures.ProcessPoolExecutor -# elif para == "thread": -# pool = concurrent.futures.ThreadPoolExecutor -# else: -# pool = concurrent.futures.ThreadPoolExecutor - -# stats = [] -# future_list = [] - -# with pool(max_workers=para_n) as executor: - -# for _i, _m in enumerate(map(polymask(_u2d), feature_list)): -# print(f"{_i}, {round(_i/len(feature_list), 5)*100}".ljust(40), end="\r") -# mask, _, window = _m -# mask = xr.DataArray(mask, dims=["y", "x"]) -# winslices = dict(zip(["y", "x"], window.toslices())) -# for ds in ds_list: -# _t = ds.time.values[0] -# future = executor.submit( -# xr_read_window_time, ds, winslices, mask=mask, idx=_i, time=_t -# ) -# # cropped = xr_read_window(f, winslices, mask=mask) -# # stats.append(cropped.mean()) -# future_list.append(future) -# for _f in concurrent.futures.as_completed(future_list): -# _j, _t, _s = _f.result() -# stats.append((_j, _t, _s)) - -# df_dict = {} -# for _v in var_list: -# df_dict[_v] = pd.DataFrame(index=feature_index) - -# for j, t, s in stats: -# for var in var_list: -# df_dict[var].loc[j, t] = s[var].mean() - -# [ds.close() for ds in ds_list] -# return df_dict - - -def get_forcing_dict_newway_inverted( - feature_index, - feature_list, - folder_prefix, - file_list, - var_list, -): - reng = "rasterio" - - _xds_dummy = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) - _template_arr = _xds_dummy.U2D.values - _u2d = MemoryDataset( - _template_arr, - transform=_xds_dummy.U2D.rio.transform(), - gcps=None, - rpcs=None, - crs=None, - copy=False, - ) - ds_list = [] - for _nc_file in file_list: - _full_nc_file = folder_prefix.joinpath(_nc_file) - ds_list.append(xr.open_dataset(_full_nc_file, engine=reng)) - - stats = [] - mask_win_list = [] - - for i, feature in enumerate(feature_list): - print(f"{i}, {round(i/len(feature_list), 5)*100}".ljust(40), end="\r") - mask, _, window = polymask(_u2d)(feature) - mask = xr.DataArray(mask, dims=["y", "x"]) - winslices = dict(zip(["y", "x"], window.toslices())) - mask_win_list.append((mask, winslices)) - - for i, f in enumerate(ds_list): - print(f"{i}, {round(i/len(file_list), 2)*100}".ljust(40), end="\r") - time_value = f.time.values[0] - # TODO: when we read the window, could the time be added as a dimension? 
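# --- Illustrative aside (not part of the patches): the submit / as_completed pattern ---
# The commented-out *_parallel readers above all hand work to a ThreadPoolExecutor (or
# ProcessPoolExecutor) and collect results with as_completed. A self-contained toy version
# of that pattern; toy_work is a stand-in for xr_read_window_time, not the real function.
import concurrent.futures

def toy_work(idx, value):
    # pretend this crops a window and reduces it; return the index so results can be re-keyed
    return idx, value * 2

results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    futures = [executor.submit(toy_work, i, v) for i, v in enumerate([10, 20, 30])]
    for fut in concurrent.futures.as_completed(futures):
        results.append(fut.result())  # completion order may differ from submission order

results.sort(key=lambda pair: pair[0])
print(results)  # [(0, 20), (1, 40), (2, 60)]
# Because as_completed yields in completion order, the commented-out code above keys each
# result by its feature index and timestamp (df_dict[var].loc[j, t] = ...), so ordering
# of completion does not matter there.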
- for j, (_m, _w) in enumerate(mask_win_list): - cropped = xr_read_window(f, _w, mask=_m) - stats.append((j, time_value, cropped.mean())) - - df_dict = {} - for _v in var_list: - df_dict[_v] = pd.DataFrame(index=feature_index) - - for j, t, s in stats: - for var in var_list: - df_dict[var].loc[j, t] = s[var] - - [ds.close() for ds in ds_list] - return df_dict - - -# def get_forcing_dict_newway_inverted_parallel( -# feature_index, -# feature_list, -# folder_prefix, -# file_list, -# var_list, -# para="thread", -# para_n=2, -# ): -# import concurrent.futures - -# reng = "rasterio" -# _xds = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) -# _template_arr = _xds.U2D.values -# _u2d = MemoryDataset( -# _template_arr, -# transform=_xds.U2D.rio.transform(), -# gcps=None, -# rpcs=None, -# crs=None, -# copy=False, -# ) - -# ds_list = [xr.open_dataset("data/" + f) for f in file_list] - -# stats = [] -# future_list = [] -# mask_win_list = [] - -# for i, feature in enumerate(feature_list): -# print(f"{i}, {round(i/len(feature_list), 5)*100}".ljust(40), end="\r") -# mask, _, window = polymask(_u2d)(feature) -# mask = xr.DataArray(mask, dims=["y", "x"]) -# winslices = dict(zip(["y", "x"], window.toslices())) -# mask_win_list.append((mask, winslices)) - -# ds_list = [xr.open_dataset(folder_prefix.joinpath(f)) for f in file_list] -# # ds_list = [xr.open_dataset(folder_prefix.joinpath(f), engine=reng) for f in file_list] -# # TODO: figure out why using the rasterio engine DOES NOT WORK with parallel -# # TODO: figure out why NOT using the rasterio engine produces a different result - -# stats = [] -# future_list = [] - -# if para == "process": -# pool = concurrent.futures.ProcessPoolExecutor -# elif para == "thread": -# pool = concurrent.futures.ThreadPoolExecutor -# else: -# pool = concurrent.futures.ThreadPoolExecutor - -# with pool(max_workers=para_n) as executor: -# df_dict = {} -# for _v in var_list: -# df_dict[_v] = pd.DataFrame(index=feature_index) - -# for j, ds in enumerate(ds_list): -# print(f"{j}, {round(i/len(file_list), 2)*100}".ljust(40), end="\r") -# _t = ds.time.values[0] -# for _i, (_m, _w) in enumerate(mask_win_list): -# future = executor.submit( -# xr_read_window_time, ds, _w, mask=_m, idx=_i, time=_t -# ) -# # cropped = xr_read_window(ds, _w, mask=_m) -# # stats.append(cropped.mean()) -# future_list.append(future) -# for _f in concurrent.futures.as_completed(future_list): -# _j, _t, _s = _f.result() -# stats.append((_j, _t, _s)) - -# df_dict = {} -# for _v in var_list: -# df_dict[_v] = pd.DataFrame(index=feature_index) - -# for j, t, s in stats: -# for var in var_list: -# df_dict[var].loc[j, t] = s[var].mean() - -# [ds.close() for ds in ds_list] -# return df_dict diff --git a/ngen_forcing/test_process_nwm_forcing_to_ngen.py b/ngen_forcing/test_process_nwm_forcing_to_ngen.py deleted file mode 100644 index ed98587..0000000 --- a/ngen_forcing/test_process_nwm_forcing_to_ngen.py +++ /dev/null @@ -1,211 +0,0 @@ -# import rioxarray as rxr -import xarray as xr -import geopandas as gpd -from rasterstats import zonal_stats - -# import rasterio -import pandas as pd - -import time - -from process_nwm_forcing_to_ngen import ( - get_forcing_dict_newway, - get_forcing_dict_newway_parallel, - get_forcing_dict_newway_inverted, - get_forcing_dict_newway_inverted_parallel, -) - -from pathlib import Path -import warnings - -warnings.simplefilter("ignore") - -# Read forcing files -# Generate List of files - -# TODO: Add looping through lists of forcing files -# consider looking at the 
"listofnwmfilenames.py" in the data_access_examples repository. -# Integer values for runinput, varinput, etc. are listed at the top of the file -# and an example is given in the `main` function. - -# import listofnwmfilenames -# create_file_list( -# runinput, -# varinput, -# geoinput, -# meminput, -# start_date, -# end_date, -# fcst_cycle, -# ) - -""" -A set of test files can be generated downloading these files -wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f001.conus.nc -wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f002.conus.nc -wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f003.conus.nc -wget -P 03w -c https://nextgen-hydrofabric.s3.amazonaws.com/v1.2/nextgen_03W.gpkg -""" - - -def get_forcing_dict( - feature_index, - feature_list, - folder_prefix, - filelist, - var_list, -): - reng = "rasterio" - sum_stat = "mean" - - df_dict = {} - for _v in var_list: - df_dict[_v] = pd.DataFrame(index=feature_index) - - ds_list = [] - for _nc_file in filelist: - # _nc_file = ("nwm.t00z.medium_range.forcing.f001.conus.nc") - _full_nc_file = folder_prefix.joinpath(_nc_file) - ds_list.append(xr.open_dataset(_full_nc_file, engine=reng)) - - for _i, _nc_file in enumerate(filelist): - _xds = ds_list[_i] - print(f"{_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") - if 1 == 1: - for _v in var_list: - _src = _xds[_v] - _aff2 = _src.rio.transform() - _arr2 = _src.values[0] - - _df_zonal_stats = pd.DataFrame( - zonal_stats(feature_list, _arr2, affine=_aff2) - ) - # if adding statistics back to original GeoDataFrame - # gdf3 = pd.concat([gpkg_divides, _df_zonal_stats], axis=1) - df_dict[_v][_xds.time.values[0]] = _df_zonal_stats[sum_stat] - - [_xds.close() for _xds in ds_list] - - return df_dict - - -# TODO: Convert the output to CSV with something like -# `gdf3.to_csv` - - -def main(): - folder_prefix = Path("data") - list_of_files = [ - f"nwm.t12z.medium_range.forcing.f{_r:03}.conus.nc" for _r in range(1, 241) - ] - - # Read basin boundary file - f_03 = "03w/nextgen_03W.gpkg" - gpkg_divides = gpd.read_file(f_03, layer="divides") - var_list = [ - "U2D", - "V2D", - "LWDOWN", - "RAINRATE", - "T2D", - "Q2D", - "PSFC", - "SWDOWN", - ] - - # file_list = list_of_files[0:30] - # gpkg_subset = gpkg_divides[0:2000] - file_list = list_of_files[0:3] - gpkg_subset = gpkg_divides[0:200] - feature_list = gpkg_subset.geometry.to_list() - - # This way is extremely slow for anything more than a - # few files, so we comment it out of the test - - start_time = time.time() - print(f"Working on the old (slow) way") - fd1 = get_forcing_dict( - gpkg_subset.index, - feature_list, - folder_prefix, - file_list, - var_list, - ) - print(time.time() - start_time) - - start_time = time.time() - print(f"Working on the new way") - fd2 = get_forcing_dict_newway( - gpkg_subset.index, - feature_list, - folder_prefix, - file_list, - var_list, - ) - print(time.time() - start_time) - - start_time = time.time() - - print(f"Working on the new way with threading parallel.") - fd3t = get_forcing_dict_newway_parallel( - gpkg_subset.index, - feature_list, - folder_prefix, - file_list, - var_list, - para="thread", - para_n=16, - ) - print(time.time() - start_time) - - start_time = time.time() - print(f"Working on the new way with process parallel.") - fd3p = 
get_forcing_dict_newway_parallel( - gpkg_subset.index, - feature_list, - folder_prefix, - file_list, - var_list, - para="process", - para_n=16, - ) - print(time.time() - start_time) - start_time = time.time() - print(f"Working on the new way with loops reversed.") - fd4 = get_forcing_dict_newway_inverted( - gpkg_subset.index, - feature_list, - folder_prefix, - file_list, - var_list, - ) - print(time.time() - start_time) - - start_time = time.time() - print(f"Working on the new way with loops reversed with threading parallel.") - fd5t = get_forcing_dict_newway_inverted_parallel( - gpkg_subset.index, - feature_list, - folder_prefix, - file_list, - var_list, - para="thread", - para_n=16, - ) - print(time.time() - start_time) - start_time = time.time() - print(f"Working on the new way with loops reversed with process parallel.") - fd5p = get_forcing_dict_newway_inverted_parallel( - gpkg_subset.index, - feature_list, - folder_prefix, - file_list, - var_list, - para="process", - para_n=16, - ) - print(time.time() - start_time) - - -if __name__ == "__main__": - main() From ee82e79d2cfa1d703dbf9f4d1aa4f90dc77ff362 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 15:33:43 -0500 Subject: [PATCH 067/105] blacked subsetting and added new line to end of file --- .gitignore | 2 +- ngen_forcing/user_input_ngen.json | 2 +- subsetting/ncatch_upstream.py | 41 +++++++++++++++++----------- subsetting/subset_forcing.py | 44 ++++++++++++++++++------------- 4 files changed, 54 insertions(+), 35 deletions(-) diff --git a/.gitignore b/.gitignore index 125a2ec..688d779 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,4 @@ data/* nwm_filenames/__pycache__/ subsetting/__pycache__/ ngen_forcing/__pycache__/ -venv/ \ No newline at end of file +venv/ diff --git a/ngen_forcing/user_input_ngen.json b/ngen_forcing/user_input_ngen.json index 5059e3d..d5610bc 100644 --- a/ngen_forcing/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -22,4 +22,4 @@ "cache" : true, "dl_threads" : 10 -} \ No newline at end of file +} diff --git a/subsetting/ncatch_upstream.py b/subsetting/ncatch_upstream.py index 51c9032..31221c0 100644 --- a/subsetting/ncatch_upstream.py +++ b/subsetting/ncatch_upstream.py @@ -1,16 +1,23 @@ import geopandas as gpd import argparse, os -from subset import get_upstream_ids +from subset import get_upstream_ids + def main(): - #setup the argument parser + # setup the argument parser parser = argparse.ArgumentParser() - parser.add_argument(dest="infile", type=str, help="A gpkg file containing divides and nexus layers") - parser.add_argument(dest="outfile", type=str, help="A text file containing the number of upstream catchments for each catchment") + parser.add_argument( + dest="infile", type=str, help="A gpkg file containing divides and nexus layers" + ) + parser.add_argument( + dest="outfile", + type=str, + help="A text file containing the number of upstream catchments for each catchment", + ) args = parser.parse_args() - infile = args.infile - outfile = args.outfile + infile = args.infile + outfile = args.outfile print("Reading catchment data...") df_cat = gpd.read_file(str(infile), layer="divides") @@ -18,19 +25,22 @@ def main(): print("Reading nexus data...") df_nex = gpd.read_file(str(infile), layer="nexus") - df_cat.set_index('id', inplace=True) + df_cat.set_index("id", inplace=True) print("Finding upstream catchments...") - upstream = nupstream(df_cat.reset_index(), df_nex.reset_index(),df_cat.index) + upstream = nupstream(df_cat.reset_index(), df_nex.reset_index(), 
df_cat.index) - with open(outfile,'w') as fp: - fp.write(f'Catchment IDs and the number of upstream catchments\nGenerated with file {os.path.basename(infile)}\n') + with open(outfile, "w") as fp: + fp.write( + f"Catchment IDs and the number of upstream catchments\nGenerated with file {os.path.basename(infile)}\n" + ) for jcatch in upstream: - fp.write(f'{jcatch} : {upstream[jcatch]}\n') + fp.write(f"{jcatch} : {upstream[jcatch]}\n") + + print(f"Done! - > {outfile}") - print(f'Done! - > {outfile}') -def nupstream(divides,nexus,cat_list): +def nupstream(divides, nexus, cat_list): """ Find the number of upstream catchments for each catchment """ @@ -41,9 +51,10 @@ def nupstream(divides,nexus,cat_list): jnupstream = len(cat_up_ids) upstream[jcat_id] = jnupstream - upstream = dict(sorted(upstream.items(), key=lambda x:x[1], reverse=True)) + upstream = dict(sorted(upstream.items(), key=lambda x: x[1], reverse=True)) return upstream + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/subsetting/subset_forcing.py b/subsetting/subset_forcing.py index 9199442..2a7b7b3 100644 --- a/subsetting/subset_forcing.py +++ b/subsetting/subset_forcing.py @@ -1,23 +1,28 @@ import argparse, os, json + def main(): """ Find forcing files in a directory that match the catchments within a catchment.geojson - + """ - #setup the argument parser + # setup the argument parser parser = argparse.ArgumentParser() - parser.add_argument(dest="forcing_dir", type=str, help="Path to forcing files") - parser.add_argument(dest="forcing_dir_out", type=str, help="Path to output the forcing files subset") - parser.add_argument(dest="catchment_file", type=str, help="A catchment geojson file") + parser.add_argument(dest="forcing_dir", type=str, help="Path to forcing files") + parser.add_argument( + dest="forcing_dir_out", type=str, help="Path to output the forcing files subset" + ) + parser.add_argument( + dest="catchment_file", type=str, help="A catchment geojson file" + ) args = parser.parse_args() - indir = args.forcing_dir - outdir = args.forcing_dir_out + indir = args.forcing_dir + outdir = args.forcing_dir_out catch_file = args.catchment_file if not os.path.exists(outdir): - os.system(f'mkdir {outdir}') + os.system(f"mkdir {outdir}") forcing_files = os.listdir(indir) @@ -27,23 +32,26 @@ def main(): # User should validate the catch file. 
# Would do here with ngen-cal, just don't want to create the dependency - feats = data['features'] + feats = data["features"] forcing_out = [] for jfeat in feats: found = False - try: # Geopandas/pydantic descrepancy - cat_id = jfeat['id'] - except: - cat_id = jfeat['properties']['id'] + try: # Geopandas/pydantic descrepancy + cat_id = jfeat["id"] + except: + cat_id = jfeat["properties"]["id"] for jforcing in forcing_files: if jforcing.find(cat_id) >= 0: found = True forcing_out.append(jforcing) - os.system(f'cp {os.path.join(indir,jforcing)} {os.path.join(outdir,jforcing)}') - if not found: - print(f'Couldn\'t find forcing file for {cat_id}!') + os.system( + f"cp {os.path.join(indir,jforcing)} {os.path.join(outdir,jforcing)}" + ) + if not found: + print(f"Couldn't find forcing file for {cat_id}!") else: - print(f'Found forcing file for {cat_id}!') + print(f"Found forcing file for {cat_id}!") + if __name__ == "__main__": - main() \ No newline at end of file + main() From 1d6254d75c0cb662196de617801e1479a2862c77 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 15:54:09 -0500 Subject: [PATCH 068/105] Added hydrofabric version to config --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 13 +++++++++---- ngen_forcing/user_input_ngen.json | 1 + ngen_forcing/user_input_ngen.md | 3 +++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 4256e01..d51ac05 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -308,6 +308,7 @@ def main(): geoinput = conf["forcing"]["geoinput"] meminput = conf["forcing"]["meminput"] urlbaseinput = conf["forcing"]["urlbaseinput"] + version = conf["hydrofab"]["version"] vpu = conf["hydrofab"]["vpu"] ii_verbose = conf["verbose"] bucket_type = conf["bucket_type"] @@ -315,7 +316,8 @@ def main(): file_prefix = conf["file_prefix"] file_type = conf["file_type"] ii_cache = conf["cache"] - dl_threads = conf["dl_threads"] + if ii_cache: + dl_threads = conf["dl_threads"] file_types = ["csv", "parquet"] assert ( @@ -351,8 +353,11 @@ def main(): if len(nwm_files) == 0: print(f"Creating list of file names to pull...") n = 6 - fcst_cycle = [n * x for x in range(24 // n)] - lead_time = [x + 1 for x in range(n)] + # fcst_cycle = [n*x for x in range(24//n)] + # lead_time = [x+1 for x in range(n)] + fcst_cycle = [] + lead_time = None + nwm_forcing_files = create_file_list( runinput, varinput, @@ -424,7 +429,7 @@ def main(): gpkg = Path(top_dir, "data", jfile) print(f"Found and using geopackge file {gpkg}") if gpkg == None: - url = f"https://nextgen-hydrofabric.s3.amazonaws.com/05_nextgen/nextgen_{vpu}.gpkg" + url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/nextgen_{vpu}.gpkg" command = f"wget -P {CACHE_DIR} -c {url}" wget(command, url) diff --git a/ngen_forcing/user_input_ngen.json b/ngen_forcing/user_input_ngen.json index d5610bc..94ecc89 100644 --- a/ngen_forcing/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -11,6 +11,7 @@ }, "hydrofab" : { + "version" : "v1.2", "vpu" : "03W" }, diff --git a/ngen_forcing/user_input_ngen.md b/ngen_forcing/user_input_ngen.md index 8877637..d0d9ed0 100644 --- a/ngen_forcing/user_input_ngen.md +++ b/ngen_forcing/user_input_ngen.md @@ -18,6 +18,7 @@ contents: }, "hydrofab" : { + "version" : "v1.2", "vpu" : "03W" }, @@ -30,6 +31,7 @@ contents: "dl_threads" : 10 } + ### forcing | Field Name | Data Type | Description | @@ -46,6 +48,7 @@ contents: ### 
hydrofab | Field Name | Data Type | Description | | --- | --- | --- | +| version | `string` | Current hydrofabric version | | vpu | `string` | Check here for map of VPUs https://noaa-owp.github.io/hydrofabric/articles/data_access.html | ### other options From 1a0b492381a1eab37d95713d1699388b578289c3 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 16:02:15 -0500 Subject: [PATCH 069/105] Fixed wget gpkg bug --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index d51ac05..839e168 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -352,11 +352,10 @@ def main(): # Get nwm forcing file names if len(nwm_files) == 0: print(f"Creating list of file names to pull...") - n = 6 + # n = 6 # fcst_cycle = [n*x for x in range(24//n)] # lead_time = [x+1 for x in range(n)] - fcst_cycle = [] - lead_time = None + fcst_cycle = [0] nwm_forcing_files = create_file_list( runinput, @@ -367,7 +366,6 @@ def main(): end_date, fcst_cycle, urlbaseinput, - lead_time, ) else: print(f"Reading list of file names from {nwm_files}...") @@ -429,12 +427,14 @@ def main(): gpkg = Path(top_dir, "data", jfile) print(f"Found and using geopackge file {gpkg}") if gpkg == None: - url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/nextgen_{vpu}.gpkg" + gpkg = f"nextgen_{vpu}.gpkg" + url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/{gpkg}" command = f"wget -P {CACHE_DIR} -c {url}" wget(command, url) + local_gpkg = Path(top_dir, "data",gpkg) - print(f"Opening {gpkg}...") - polygonfile = gpd.read_file(gpkg, layer="divides") + print(f"Opening {local_gpkg}...") + polygonfile = gpd.read_file(local_gpkg, layer="divides") ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) src = ds["RAINRATE"] From 2403ba04e4c780ad0e1d3bdf5625e7d384fe3e36 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 16:35:55 -0500 Subject: [PATCH 070/105] Default to short range --- ngen_forcing/user_input_ngen.json | 2 +- ngen_forcing/user_input_ngen.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ngen_forcing/user_input_ngen.json b/ngen_forcing/user_input_ngen.json index 94ecc89..a72ffe2 100644 --- a/ngen_forcing/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -3,7 +3,7 @@ "start_date" : "20220822", "end_date" : "20220822", "nwm_files" : "", - "runinput" : 2, + "runinput" : 1, "varinput" : 5, "geoinput" : 1, "meminput" : 0, diff --git a/ngen_forcing/user_input_ngen.md b/ngen_forcing/user_input_ngen.md index d0d9ed0..dce7042 100644 --- a/ngen_forcing/user_input_ngen.md +++ b/ngen_forcing/user_input_ngen.md @@ -10,7 +10,7 @@ contents: "start_date" : "20220822", "end_date" : "20220822", "nwm_files" : "", - "runinput" : 2, + "runinput" : 1, "varinput" : 5, "geoinput" : 1, "meminput" : 0, From c8354eda38a593cfcd538449e2b7f2b2026ea73b Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 4 May 2023 16:44:05 -0500 Subject: [PATCH 071/105] Fixed pathing issues --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 839e168..6ed8115 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -51,7 +51,7 @@ # TODO Make CACHE_DIR 
configurable CACHE_DIR = Path( - pkg_dir.parent, "data", "raw_forcing_data" + pkg_dir.parent, "data", "raw_data" ) # Maybe this should have a date attached to the name NWM_CACHE_DIR = os.path.join(CACHE_DIR, "nwm") @@ -422,19 +422,18 @@ def main(): if not os.path.exists(wgt_file): # Search for geopackage that matches the requested VPU, if it exists gpkg = None - for jfile in os.listdir(os.path.join(top_dir, "data")): - if jfile.find(vpu) >= 0: - gpkg = Path(top_dir, "data", jfile) + for jfile in os.listdir(CACHE_DIR): + if jfile.find(f"nextgen_{vpu}.gpkg") >= 0: + gpkg = Path(CACHE_DIR, jfile) print(f"Found and using geopackge file {gpkg}") if gpkg == None: - gpkg = f"nextgen_{vpu}.gpkg" - url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/{gpkg}" + url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/nextgen_{vpu}.gpkg" command = f"wget -P {CACHE_DIR} -c {url}" wget(command, url) - local_gpkg = Path(top_dir, "data",gpkg) + gpkg = Path(CACHE_DIR, f"nextgen_{vpu}.gpkg") - print(f"Opening {local_gpkg}...") - polygonfile = gpd.read_file(local_gpkg, layer="divides") + print(f"Opening {gpkg}...") + polygonfile = gpd.read_file(gpkg, layer="divides") ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) src = ds["RAINRATE"] @@ -469,7 +468,7 @@ def main(): "SPFH_2maboveground", "DSWRF_surface", ] - + fd2 = get_forcing_dict_JL( wgt_file, forcing_files, @@ -477,6 +476,7 @@ def main(): var_list_out, ) + print(f'Writting data!') # Write CSVs to file t0 = time.perf_counter() write_int = 100 From 715002d16c063ed99b6448bfd36ce96985c65f07 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Fri, 5 May 2023 14:29:21 -0500 Subject: [PATCH 072/105] Added remote indexing. Moved cache into forcing. --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 214 ++++++++++++-------- ngen_forcing/user_input_ngen.json | 10 +- ngen_forcing/user_input_ngen.md | 13 +- 3 files changed, 139 insertions(+), 98 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 6ed8115..db76a73 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -108,7 +108,7 @@ def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: """ # TODO: Check to see if this does any better than kerchunk # the caching should help, but probably needs to be managed to function asynchronously. - # Perhaps if the files is not cached, we can create the dataset from + # Perhaps if theget_dataset files is not cached, we can create the dataset from # kerchunk with a remote path and then asynchronously do a download to cache it # for next time. The hypothesis would be that the download speed will not be any slower than # just accessing the file remotely. @@ -229,21 +229,36 @@ def calc_zonal_stats_weights_new( return mean_dict -def get_forcing_dict_JL(wgt_file, filelist, var_list, var_list_out): +def get_forcing_dict_JL( + wgt_file : str, + local_filelist : list, + remote_filelist : list, + var_list : list , + var_list_out : list, + ii_cache : bool + ): + t1 = time.perf_counter() + nlocal = len(local_filelist) + full_list = local_filelist + remote_filelist df_by_t = [] - for _i, _nc_file in enumerate(filelist): - with xr.open_dataset(_nc_file) as _xds: + # NOTE this scheme uses the same algorithm for remote and local processing. 
This may not be desireable + if ii_cache: + eng = 'h5netcdf' + for _i, _nc_file in enumerate(full_list): + if _i == nlocal: eng = 'rasterio' # switch engine for remote processing + with xr.open_dataset(_nc_file,engine=eng) as _xds: shp = _xds["U2D"].shape + dtp = _xds["U2D"].dtype data_allvars = np.zeros( - shape=(len(var_list), shp[1], shp[2]), dtype=_xds["U2D"].dtype - ) + shape=(len(var_list), shp[1], shp[2]), dtype=dtp + ) for var_dx, jvar in enumerate(var_list): data_allvars[var_dx, :, :] = np.squeeze(_xds[jvar].values) _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, wgt_file) df_by_t.append(_df_zonal_stats) print( - f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}%", + f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(full_list)}, {(_i+1)/len(full_list)*100:.2f}% {time.perf_counter() - t1:.2f}s elapsed", end="\r", ) @@ -262,18 +277,78 @@ def get_forcing_dict_JL(wgt_file, filelist, var_list, var_list_out): return dfs -def wget(cmd, name, semaphore=None): +def threaded_cmd(cmd, semaphore=None): + """ + Execute many system commands using python threading. Semaphore is set outside this function + """ if not semaphore == None: semaphore.acquire() resp = os.system(cmd) if resp > 0: - raise Exception(f"\nwget failed! Tried: {name}\n") - else: - print(f"Successful download of {name}") - + raise Exception(f"\Threaded command failed! Tried: {cmd}\n") if not semaphore == None: semaphore.release() +def locate_dl_files_threaded( + ii_cache: bool, + ii_verbose : bool, + forcing_file_names : list, + dl_threads : int +): + """ + Look for forcing files locally, if found, will apend to local file list for local processing + If not found and if we do not wish to cache, append to remote files for remote processing + If not found and if we do wish to cache, append to local file list for local processing and perform a threaded download + """ + + local_files = [] + remote_files = [] + dl_files = [] + cmds = [] + for jfile in forcing_file_names: + if ii_verbose: + print(f"Looking for {jfile}") + file_parts = Path(jfile).parts + + local_file = os.path.join(CACHE_DIR, file_parts[-1]) + + # decide whether to use local file, download it, or index it remotely + if os.path.exists(local_file): + # If the file exists local, get data from this file regardless of ii_cache option + if ii_verbose and ii_cache: + print(f"Found and using local raw forcing file {local_file}") + elif ii_verbose and not ii_cache: + print(f"CACHE OPTION OVERRIDE : Found and using local raw forcing file {local_file}") + local_files.append(local_file) + elif not os.path.exists(local_file) and not ii_cache: + # If file is not found locally, and we don't want to cache it, append to remote file list + remote_files.append(jfile) + elif not os.path.exists(local_file) and ii_cache: + # Download file + if ii_verbose: + print(f"Forcing file not found! 
Downloading {jfile}") + command = f"wget -P {CACHE_DIR} -c {jfile}" + cmds.append(command) + dl_files.append(jfile) + local_files.append(local_file) + + # Do threaded download if we have any files to download + n_files = len(dl_files) + if n_files > 0: + t0 = time.perf_counter() + threads = [] + semaphore = threading.Semaphore(dl_threads) + for i, jcmd in enumerate(cmds): + t = threading.Thread(target=threaded_cmd, args=[jcmd, semaphore]) + t.start() + threads.append(t) + + for jt in threads: + jt.join() + + print(f"Time to download {n_files} files {time.perf_counter() - t0}") + + return local_files, remote_files def main(): """ @@ -299,15 +374,16 @@ def main(): conf = json.load(open(args.infile)) start_date = conf["forcing"]["start_date"] end_date = conf["forcing"]["end_date"] - if "nwm_files" in conf["forcing"]: - nwm_files = conf["forcing"]["nwm_files"] + if "nwm_file" in conf["forcing"]: + nwm_file = conf["forcing"]["nwm_file"] else: - nwm_files = "" + nwm_file = "" runinput = conf["forcing"]["runinput"] varinput = conf["forcing"]["varinput"] geoinput = conf["forcing"]["geoinput"] meminput = conf["forcing"]["meminput"] urlbaseinput = conf["forcing"]["urlbaseinput"] + ii_cache = conf["forcing"]["cache"] version = conf["hydrofab"]["version"] vpu = conf["hydrofab"]["vpu"] ii_verbose = conf["verbose"] @@ -315,9 +391,7 @@ def main(): bucket_name = conf["bucket_name"] file_prefix = conf["file_prefix"] file_type = conf["file_type"] - ii_cache = conf["cache"] - if ii_cache: - dl_threads = conf["dl_threads"] + dl_threads = conf["dl_threads"] file_types = ["csv", "parquet"] assert ( @@ -349,73 +423,6 @@ def main(): elif bucket_type == "S3": s3 = boto3.client("s3") - # Get nwm forcing file names - if len(nwm_files) == 0: - print(f"Creating list of file names to pull...") - # n = 6 - # fcst_cycle = [n*x for x in range(24//n)] - # lead_time = [x+1 for x in range(n)] - fcst_cycle = [0] - - nwm_forcing_files = create_file_list( - runinput, - varinput, - geoinput, - meminput, - start_date, - end_date, - fcst_cycle, - urlbaseinput, - ) - else: - print(f"Reading list of file names from {nwm_files}...") - nwm_forcing_files = [] - with open(nwm_files, "r") as f: - for line in f: - nwm_forcing_files.append(line) - - # Download whole files and store locally if cache is true, - # otherwise index remotely and save catchment based forcings - t0 = time.perf_counter() - if ii_cache: - # Check to see if we have files cached, if not wget them - local_files = [] - cmds = [] - fls = [] - for jfile in nwm_forcing_files: - if ii_verbose: - print(f"Looking for {jfile}") - file_parts = Path(jfile).parts - - local_file = os.path.join(CACHE_DIR, file_parts[-1]) - local_files.append(local_file) - if os.path.exists(local_file): - if ii_verbose: - print(f"Found and using raw forcing file {local_file}") - continue - else: - if ii_verbose: - print(f"Forcing file not found! 
Downloading {jfile}") - command = f"wget -P {CACHE_DIR} -c {jfile}" - cmds.append(command) - fls.append(jfile) - - threads = [] - semaphore = threading.Semaphore(dl_threads) - for i, jcmd in enumerate(cmds): - t = threading.Thread(target=wget, args=[jcmd, fls[i], semaphore]) - t.start() - threads.append(t) - - for jt in threads: - jt.join() - - forcing_files = local_files # interacting with files locally - else: - forcing_files = nwm_forcing_files # interacting with files remotely - - print(f"Time to download files {time.perf_counter() - t0}") - # Generate weight file only if one doesn't exist already # Very time consuming so we don't want to do this if we can avoid it wgt_file = os.path.join(CACHE_DIR, "weights.json") @@ -429,7 +436,7 @@ def main(): if gpkg == None: url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/nextgen_{vpu}.gpkg" command = f"wget -P {CACHE_DIR} -c {url}" - wget(command, url) + threaded_cmd(command, url) gpkg = Path(CACHE_DIR, f"nextgen_{vpu}.gpkg") print(f"Opening {gpkg}...") @@ -445,7 +452,32 @@ def main(): else: print( f"Not creating weight file! Delete this if you want to create a new one: {wgt_file}" + ) + + # Get nwm forcing file names + if len(nwm_file) == 0: + print(f"Creating list of file names to locate...") + fcst_cycle = [0] + + nwm_forcing_files = create_file_list( + runinput, + varinput, + geoinput, + meminput, + start_date, + end_date, + fcst_cycle, + urlbaseinput, ) + else: + print(f"Reading list of file names from {nwm_file}...") + nwm_forcing_files = [] + with open(nwm_file, "r") as f: + for line in f: + nwm_forcing_files.append(line) + + # This will look for local raw forcing files and download them if needed + local_nwm_files, remote_nwm_files = locate_dl_files_threaded(ii_cache,ii_verbose,nwm_forcing_files,dl_threads) var_list = [ "U2D", @@ -468,13 +500,21 @@ def main(): "SPFH_2maboveground", "DSWRF_surface", ] + + # Considering possible memory constraints in this operation, + # let's loop though a certain number of files, write them out, and go back for more + t0 = time.perf_counter() + local_nwm_files = [] fd2 = get_forcing_dict_JL( wgt_file, - forcing_files, + local_nwm_files, + remote_nwm_files[:5], var_list, var_list_out, + ii_cache ) + print(f'Time to create forcing dictionary {time.perf_counter() - t0}') print(f'Writting data!') # Write CSVs to file diff --git a/ngen_forcing/user_input_ngen.json b/ngen_forcing/user_input_ngen.json index a72ffe2..11b5962 100644 --- a/ngen_forcing/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -2,12 +2,13 @@ "forcing" : { "start_date" : "20220822", "end_date" : "20220822", - "nwm_files" : "", + "nwm_file" : "", "runinput" : 1, "varinput" : 5, "geoinput" : 1, "meminput" : 0, - "urlbaseinput" : 3 + "urlbaseinput" : 3, + "cache" : false }, "hydrofab" : { @@ -16,11 +17,10 @@ }, "verbose" : true, - "bucket_type" : "S3", - "bucket_name" : "ciroh-devconf", + "bucket_type" : "local", + "bucket_name" : "out_data_CIROH", "file_prefix" : "data/", "file_type" : "csv", - "cache" : true, "dl_threads" : 10 } diff --git a/ngen_forcing/user_input_ngen.md b/ngen_forcing/user_input_ngen.md index dce7042..033fc5b 100644 --- a/ngen_forcing/user_input_ngen.md +++ b/ngen_forcing/user_input_ngen.md @@ -9,12 +9,13 @@ contents: "forcing" : { "start_date" : "20220822", "end_date" : "20220822", - "nwm_files" : "", + "nwm_file" : "", "runinput" : 1, "varinput" : 5, "geoinput" : 1, "meminput" : 0, - "urlbaseinput" : 3 + "urlbaseinput" : 3, + "cache" : false }, "hydrofab" : { @@ -23,15 +24,15 @@ contents: }, 
"verbose" : true, - "bucket_type" : "S3", - "bucket_name" : "ciroh-devconf", + "bucket_type" : "local", + "bucket_name" : "out_data_CIROH", "file_prefix" : "data/", "file_type" : "csv", - "cache" : true, "dl_threads" : 10 } + ### forcing | Field Name | Data Type | Description | @@ -44,6 +45,7 @@ contents: | geoinput | `int` |
    1. conus: for continental US
    2. hawaii: for Hawaii
    3. puertorico: for Puerto Rico
    | | meminput | `int` |
    1. mem_1
    2. mem_2
    3. mem_3
    4. mem_4
    5. mem_5
    6. mem_6
    7. mem_7
    | | urlbaseinput | `int` |
    1. Empty string: use local files
    2. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod/: for real-time operational data from NOAA
    3. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/post-processed/WMS/: for post-processed data from NOAA's Web Map Service
    4. https://storage.googleapis.com/national-water-model/: for input/output data stored on Google Cloud Storage
    5. https://storage.cloud.google.com/national-water-model/: for input/output data stored on Google Cloud Storage
    6. gs://national-water-model/: for input/output data stored on Google Cloud Storage
    7. https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/: for retrospective data from AWS S3
    8. s3://noaa-nwm-retrospective-2-1-pds/model_output/: for retrospective data from AWS S3
    | +| cache | `bool` |
  • true: Store forcing files locally. Must specify dl_threads
  • false: Interact with forcing files remotely
  • | ### hydrofab | Field Name | Data Type | Description | @@ -59,5 +61,4 @@ contents: | bucket_name | `string` | If local, this is the name of the folder the data will be placed in. If S3, this is the name of S3 bucket, which must exist already. | | file_prefix | `string` | If local, this is the relative path to the bucket_name folder. If S3, this is the relative path within the S3 bucket_name bucket to store files | | file_type | `string` |
    1. "csv" : write data as csv files/
    2. "parquet" : write data as parquet files
    | -| cache | `bool` |
  • true: Store forcing files locally. Must specify dl_threads
  • false: Interact with forcing files remotely
  • | | dl_threads | `int` | Number of threads to use while downloading. | From 6117014610e05aac40f28a90c4ad2998afe1e38a Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Fri, 5 May 2023 14:37:48 -0500 Subject: [PATCH 073/105] Removed indexing (bug) --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index db76a73..f354595 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -509,7 +509,7 @@ def main(): fd2 = get_forcing_dict_JL( wgt_file, local_nwm_files, - remote_nwm_files[:5], + remote_nwm_files, var_list, var_list_out, ii_cache From b9175923bba239210712a65ccf2c3207eaa1ec09 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Fri, 5 May 2023 14:39:00 -0500 Subject: [PATCH 074/105] Removed bug #12412 --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index f354595..d933412 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -504,7 +504,6 @@ def main(): # Considering possible memory constraints in this operation, # let's loop though a certain number of files, write them out, and go back for more t0 = time.perf_counter() - local_nwm_files = [] fd2 = get_forcing_dict_JL( wgt_file, From 4ddd40c302db386f5971b2fe8f9cb4e5f35255cb Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Fri, 5 May 2023 14:42:46 -0500 Subject: [PATCH 075/105] Removed bug #12413 --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index d933412..d9d8f49 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -243,7 +243,7 @@ def get_forcing_dict_JL( full_list = local_filelist + remote_filelist df_by_t = [] # NOTE this scheme uses the same algorithm for remote and local processing. This may not be desireable - if ii_cache: + if nlocal > 0: eng = 'h5netcdf' for _i, _nc_file in enumerate(full_list): if _i == nlocal: eng = 'rasterio' # switch engine for remote processing From 52da0d74bc5c619baa701bc8e807c84bff3bf49b Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Fri, 5 May 2023 15:03:39 -0500 Subject: [PATCH 076/105] blacked and print statements fixes --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 175 ++++++++++---------- 1 file changed, 85 insertions(+), 90 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index d9d8f49..8b3bac0 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -230,29 +230,27 @@ def calc_zonal_stats_weights_new( def get_forcing_dict_JL( - wgt_file : str, - local_filelist : list, - remote_filelist : list, - var_list : list , - var_list_out : list, - ii_cache : bool - ): - + wgt_file: str, + local_filelist: list, + remote_filelist: list, + var_list: list, + var_list_out: list, + ii_cache: bool, +): t1 = time.perf_counter() nlocal = len(local_filelist) full_list = local_filelist + remote_filelist df_by_t = [] # NOTE this scheme uses the same algorithm for remote and local processing. 
This may not be desireable if nlocal > 0: - eng = 'h5netcdf' + eng = "h5netcdf" for _i, _nc_file in enumerate(full_list): - if _i == nlocal: eng = 'rasterio' # switch engine for remote processing - with xr.open_dataset(_nc_file,engine=eng) as _xds: + if _i == nlocal: + eng = "rasterio" # switch engine for remote processing + with xr.open_dataset(_nc_file, engine=eng) as _xds: shp = _xds["U2D"].shape dtp = _xds["U2D"].dtype - data_allvars = np.zeros( - shape=(len(var_list), shp[1], shp[2]), dtype=dtp - ) + data_allvars = np.zeros(shape=(len(var_list), shp[1], shp[2]), dtype=dtp) for var_dx, jvar in enumerate(var_list): data_allvars[var_dx, :, :] = np.squeeze(_xds[jvar].values) _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, wgt_file) @@ -262,7 +260,7 @@ def get_forcing_dict_JL( end="\r", ) - print(f"Reformating and converting data into dataframe") + print(f"\nReformating and converting data into dataframe") dfs = {} for jcat in list(df_by_t[0].keys()): data_catch = [] @@ -289,71 +287,71 @@ def threaded_cmd(cmd, semaphore=None): if not semaphore == None: semaphore.release() + def locate_dl_files_threaded( - ii_cache: bool, - ii_verbose : bool, - forcing_file_names : list, - dl_threads : int + ii_cache: bool, ii_verbose: bool, forcing_file_names: list, dl_threads: int ): - """ - Look for forcing files locally, if found, will apend to local file list for local processing - If not found and if we do not wish to cache, append to remote files for remote processing - If not found and if we do wish to cache, append to local file list for local processing and perform a threaded download - """ - - local_files = [] - remote_files = [] - dl_files = [] - cmds = [] - for jfile in forcing_file_names: + """ + Look for forcing files locally, if found, will apend to local file list for local processing + If not found and if we do not wish to cache, append to remote files for remote processing + If not found and if we do wish to cache, append to local file list for local processing and perform a threaded download + """ + + local_files = [] + remote_files = [] + dl_files = [] + cmds = [] + for jfile in forcing_file_names: + if ii_verbose: + print(f"Looking for {jfile}") + file_parts = Path(jfile).parts + + local_file = os.path.join(CACHE_DIR, file_parts[-1]) + + # decide whether to use local file, download it, or index it remotely + if os.path.exists(local_file): + # If the file exists local, get data from this file regardless of ii_cache option + if ii_verbose and ii_cache: + print(f"Found and using local raw forcing file {local_file}") + elif ii_verbose and not ii_cache: + print( + f"CACHE OPTION OVERRIDE : Found and using local raw forcing file {local_file}" + ) + local_files.append(local_file) + elif not os.path.exists(local_file) and not ii_cache: + # If file is not found locally, and we don't want to cache it, append to remote file list + remote_files.append(jfile) + elif not os.path.exists(local_file) and ii_cache: + # Download file if ii_verbose: - print(f"Looking for {jfile}") - file_parts = Path(jfile).parts - - local_file = os.path.join(CACHE_DIR, file_parts[-1]) - - # decide whether to use local file, download it, or index it remotely - if os.path.exists(local_file): - # If the file exists local, get data from this file regardless of ii_cache option - if ii_verbose and ii_cache: - print(f"Found and using local raw forcing file {local_file}") - elif ii_verbose and not ii_cache: - print(f"CACHE OPTION OVERRIDE : Found and using local raw forcing file {local_file}") - 
local_files.append(local_file) - elif not os.path.exists(local_file) and not ii_cache: - # If file is not found locally, and we don't want to cache it, append to remote file list - remote_files.append(jfile) - elif not os.path.exists(local_file) and ii_cache: - # Download file - if ii_verbose: - print(f"Forcing file not found! Downloading {jfile}") - command = f"wget -P {CACHE_DIR} -c {jfile}" - cmds.append(command) - dl_files.append(jfile) - local_files.append(local_file) - - # Do threaded download if we have any files to download - n_files = len(dl_files) - if n_files > 0: - t0 = time.perf_counter() - threads = [] - semaphore = threading.Semaphore(dl_threads) - for i, jcmd in enumerate(cmds): - t = threading.Thread(target=threaded_cmd, args=[jcmd, semaphore]) - t.start() - threads.append(t) - - for jt in threads: - jt.join() - - print(f"Time to download {n_files} files {time.perf_counter() - t0}") - - return local_files, remote_files + print(f"Forcing file not found! Downloading {jfile}") + command = f"wget -P {CACHE_DIR} -c {jfile}" + cmds.append(command) + dl_files.append(jfile) + local_files.append(local_file) + + # Do threaded download if we have any files to download + n_files = len(dl_files) + if n_files > 0: + t0 = time.perf_counter() + threads = [] + semaphore = threading.Semaphore(dl_threads) + for i, jcmd in enumerate(cmds): + t = threading.Thread(target=threaded_cmd, args=[jcmd, semaphore]) + t.start() + threads.append(t) + + for jt in threads: + jt.join() + + print(f"Time to download {n_files} files {time.perf_counter() - t0}") + + return local_files, remote_files + def main(): """ - Primary function to retrieve hydrofabrics data and convert it into files that can be ingested into ngen. - Also, the forcing data is retrieved. + Primary function to retrieve forcing and hydrofabric data and convert it into files that can be ingested into ngen. Inputs: JSON config file specifying start_date, end_date, and vpu @@ -384,7 +382,7 @@ def main(): meminput = conf["forcing"]["meminput"] urlbaseinput = conf["forcing"]["urlbaseinput"] ii_cache = conf["forcing"]["cache"] - version = conf["hydrofab"]["version"] + version = conf["hydrofab"]["version"] vpu = conf["hydrofab"]["vpu"] ii_verbose = conf["verbose"] bucket_type = conf["bucket_type"] @@ -448,11 +446,11 @@ def main(): print("Generating weights") t1 = time.perf_counter() generate_weights_file(polygonfile, src, wgt_file, crosswalk_dict_key="id") - print(f"Generating the weights took {time.perf_counter() - t1:.2f} s") + print(f"\nGenerating the weights took {time.perf_counter() - t1:.2f} s") else: print( f"Not creating weight file! 
Delete this if you want to create a new one: {wgt_file}" - ) + ) # Get nwm forcing file names if len(nwm_file) == 0: @@ -474,10 +472,12 @@ def main(): nwm_forcing_files = [] with open(nwm_file, "r") as f: for line in f: - nwm_forcing_files.append(line) + nwm_forcing_files.append(line) # This will look for local raw forcing files and download them if needed - local_nwm_files, remote_nwm_files = locate_dl_files_threaded(ii_cache,ii_verbose,nwm_forcing_files,dl_threads) + local_nwm_files, remote_nwm_files = locate_dl_files_threaded( + ii_cache, ii_verbose, nwm_forcing_files, dl_threads + ) var_list = [ "U2D", @@ -501,21 +501,16 @@ def main(): "DSWRF_surface", ] - # Considering possible memory constraints in this operation, + # TODO: Considering possible memory constraints in this operation, # let's loop though a certain number of files, write them out, and go back for more t0 = time.perf_counter() - + fd2 = get_forcing_dict_JL( - wgt_file, - local_nwm_files, - remote_nwm_files, - var_list, - var_list_out, - ii_cache + wgt_file, local_nwm_files, remote_nwm_files, var_list, var_list_out, ii_cache ) - print(f'Time to create forcing dictionary {time.perf_counter() - t0}') + print(f"Time to create forcing dictionary {time.perf_counter() - t0}") - print(f'Writting data!') + print(f"Writing data!") # Write CSVs to file t0 = time.perf_counter() write_int = 100 @@ -548,7 +543,7 @@ def main(): end="\r", ) - print(f"{file_type} write took {time.perf_counter() - t0:.2f} s\n") + print(f"\n{file_type} write took {time.perf_counter() - t0:.2f} s\n") print( f"\n\nDone! Catchment forcing files have been generated for VPU {vpu} in {bucket_type}\n\n" From 5e14877fceb51aaa35c908566be33b38e7ca6835 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 8 May 2023 09:12:14 -0500 Subject: [PATCH 077/105] moved file names template --- ngen_forcing/filenames.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 ngen_forcing/filenames.txt diff --git a/ngen_forcing/filenames.txt b/ngen_forcing/filenames.txt new file mode 100644 index 0000000..87440af --- /dev/null +++ b/ngen_forcing/filenames.txt @@ -0,0 +1,3 @@ +https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f006.conus.nc +https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f007.conus.nc +https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f008.conus.nc \ No newline at end of file From 4eddf713491a42d0be130da5dd657c1075435fc2 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 8 May 2023 14:54:18 -0500 Subject: [PATCH 078/105] fixed threading, organized config --- filenames.txt | 3 - ngen_forcing/prep_hydrofab_forcings_ngen.py | 300 ++++++++++++++------ ngen_forcing/user_input_ngen.json | 27 +- ngen_forcing/user_input_ngen.md | 53 ++-- 4 files changed, 254 insertions(+), 129 deletions(-) delete mode 100644 filenames.txt diff --git a/filenames.txt b/filenames.txt deleted file mode 100644 index 87440af..0000000 --- a/filenames.txt +++ /dev/null @@ -1,3 +0,0 @@ -https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f006.conus.nc -https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f007.conus.nc -https://storage.googleapis.com/national-water-model/nwm.20220822/forcing_medium_range/nwm.t00z.medium_range.forcing.f008.conus.nc \ No newline at end 
of file diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 8b3bac0..e1ee25c 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -229,24 +229,38 @@ def calc_zonal_stats_weights_new( return mean_dict -def get_forcing_dict_JL( +def get_forcing_timelist( wgt_file: str, - local_filelist: list, - remote_filelist: list, + filelist: list, var_list: list, - var_list_out: list, - ii_cache: bool, + jt = None, + out = None, ): + """ + General function to read either remote or local nwm forcing files. + + Inputs: + wgt_file: a path to the weights json, + filelist: list of filenames (urls for remote, local paths otherwise), + var_list: list (list of variable names to read), + jt: the index to place the file. This is used to ensure elements increase in time, regardless of thread number, + out: a list (in time) of forcing data, (THIS IS A THREADING OUTPUT) + + Outputs: + df_by_t : (returned for local files) a list (in time) of forcing data. Note that this list may not be consistent in time + OR + out : (returned for remote files) a list (in time) of forcing data. + Each thread will write into this list such that time increases, but may not be consistent + + """ + t1 = time.perf_counter() - nlocal = len(local_filelist) - full_list = local_filelist + remote_filelist - df_by_t = [] - # NOTE this scheme uses the same algorithm for remote and local processing. This may not be desireable - if nlocal > 0: - eng = "h5netcdf" - for _i, _nc_file in enumerate(full_list): - if _i == nlocal: + df_by_t = [] + for _i, _nc_file in enumerate(filelist): + if _nc_file[:5] == 'https': eng = "rasterio" # switch engine for remote processing + else: + eng = "h5netcdf" with xr.open_dataset(_nc_file, engine=eng) as _xds: shp = _xds["U2D"].shape dtp = _xds["U2D"].dtype @@ -255,46 +269,70 @@ def get_forcing_dict_JL( data_allvars[var_dx, :, :] = np.squeeze(_xds[jvar].values) _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, wgt_file) df_by_t.append(_df_zonal_stats) - print( - f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(full_list)}, {(_i+1)/len(full_list)*100:.2f}% {time.perf_counter() - t1:.2f}s elapsed", - end="\r", - ) - print(f"\nReformating and converting data into dataframe") + if jt == None: + print( + f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}% {time.perf_counter() - t1:.2f}s elapsed", + end="\r", + ) + + if not jt == None: + out[jt] = df_by_t + + return df_by_t + +def time2catchment(time_list, var_list_out): + """ + Convert a list of catchment dictionaries into a single dictionary of dataframes for each catchment + + Inputs: + time_list : a list returned by get_forcing_timelist. It is assumed this list is consistent in time. + var_list_out : a list of clomun headers for the dataframes + + Outputs: + dfs : a dictionary of catchment based dataframes + + """ + dfs = {} - for jcat in list(df_by_t[0].keys()): + for jcat in list(time_list[0].keys()): data_catch = [] - for jt in range(len(df_by_t)): - data_catch.append(df_by_t[jt][jcat]) + for jt in range(len(time_list)): + data_catch.append(time_list[jt][jcat]) dfs[jcat] = pd.DataFrame(data_catch, columns=var_list_out) - print( - f"Indexing data and generating the dataframes (JL) {time.perf_counter() - t1:.2f}s" - ) - return dfs - -def threaded_cmd(cmd, semaphore=None): +def cmd(cmd,out=None): """ - Execute many system commands using python threading. 
Semaphore is set outside this function + Execute system commands + + Inputs + cmd : the command to execute + """ - if not semaphore == None: - semaphore.acquire() resp = os.system(cmd) if resp > 0: raise Exception(f"\Threaded command failed! Tried: {cmd}\n") - if not semaphore == None: - semaphore.release() def locate_dl_files_threaded( - ii_cache: bool, ii_verbose: bool, forcing_file_names: list, dl_threads: int + ii_cache: bool, ii_verbose: bool, forcing_file_names: list, nthreads: int ): """ Look for forcing files locally, if found, will apend to local file list for local processing If not found and if we do not wish to cache, append to remote files for remote processing If not found and if we do wish to cache, append to local file list for local processing and perform a threaded download + + Inputs: + ii_cache : user-defined caching bool + ii_verbose : user-defined verbosity bool + forcing_file_names : a list of forcing files names + nthreads : user-defined maximum number of threads + + Outputs: + local_files : list of paths to the local files. Note that even if ii_cache if false, if a file is found locally, it will be used. + remote_files : list of urls to the remote files. """ local_files = [] @@ -302,10 +340,7 @@ def locate_dl_files_threaded( dl_files = [] cmds = [] for jfile in forcing_file_names: - if ii_verbose: - print(f"Looking for {jfile}") file_parts = Path(jfile).parts - local_file = os.path.join(CACHE_DIR, file_parts[-1]) # decide whether to use local file, download it, or index it remotely @@ -330,24 +365,53 @@ def locate_dl_files_threaded( dl_files.append(jfile) local_files.append(local_file) - # Do threaded download if we have any files to download - n_files = len(dl_files) - if n_files > 0: - t0 = time.perf_counter() - threads = [] - semaphore = threading.Semaphore(dl_threads) + if len(cmds) > 0: + args = [] for i, jcmd in enumerate(cmds): - t = threading.Thread(target=threaded_cmd, args=[jcmd, semaphore]) - t.start() - threads.append(t) + args.append([jcmd]) + out = threaded_fun(cmd,nthreads,args) - for jt in threads: - jt.join() + return local_files, remote_files - print(f"Time to download {n_files} files {time.perf_counter() - t0}") +def threaded_fun(fun, + nthreads : int, + args : list): + + """ + Threaded function call + """ + threads = [] + out = [None for x in range(len(args))] + for i in range(len(args)): + + if i >= nthreads: # Assign new jobs as threads finish + k = 0 + while True: + jj = k % nthreads + jthread = threads[jj] + if jthread.is_alive(): + k += 1 + time.sleep(0.25) + else: + t = threading.Thread(target=fun, args= [*args[i], out]) + t.start() + threads[jj] = t + break + else: # Initial set of threads + t = threading.Thread(target=fun, args=[*args[i], out]) + t.start() + threads.append(t) - return local_files, remote_files + # Ensure all threads are finished + done = 0 + while done < len(threads): + done = 0 + for jthread in threads: + if not jthread.is_alive(): + done += 1 + time.sleep(0.25) + return out def main(): """ @@ -362,13 +426,14 @@ def main(): t00 = time.perf_counter() + # Take in user config parser = argparse.ArgumentParser() parser.add_argument( dest="infile", type=str, help="A json containing user inputs to run ngen" ) args = parser.parse_args() - - # Take in user config + + # Extract configurations conf = json.load(open(args.infile)) start_date = conf["forcing"]["start_date"] end_date = conf["forcing"]["end_date"] @@ -384,26 +449,29 @@ def main(): ii_cache = conf["forcing"]["cache"] version = conf["hydrofab"]["version"] vpu = 
conf["hydrofab"]["vpu"] - ii_verbose = conf["verbose"] - bucket_type = conf["bucket_type"] - bucket_name = conf["bucket_name"] - file_prefix = conf["file_prefix"] - file_type = conf["file_type"] - dl_threads = conf["dl_threads"] + bucket_type = conf["storage"]["bucket_type"] + bucket_name = conf["storage"]["bucket_name"] + file_prefix = conf["storage"]["file_prefix"] + file_type = conf["storage"]["file_type"] + ii_verbose = conf["run"]["verbose"] + nthreads = conf["run"]["nthreads"] + print(f'\nWelcome to Preparing Data for NextGen-Based Simulations!\n') + if not ii_verbose: print(f'Generating files now! This may take a few moments...') + + dl_time = 0 + proc_time = 0 + + # configuration validation file_types = ["csv", "parquet"] assert ( file_type in file_types ), f"{file_type} for file_type is not accepted! Accepted: {file_types}" - bucket_types = ["local", "S3"] assert ( bucket_type in bucket_types ), f"{bucket_type} for bucket_type is not accepted! Accepted: {bucket_types}" - # TODO: Subsetting! - # - # Set paths and make directories if needed top_dir = Path(os.path.dirname(args.infile)).parent if not os.path.exists(CACHE_DIR): @@ -430,31 +498,36 @@ def main(): for jfile in os.listdir(CACHE_DIR): if jfile.find(f"nextgen_{vpu}.gpkg") >= 0: gpkg = Path(CACHE_DIR, jfile) - print(f"Found and using geopackge file {gpkg}") + if ii_verbose:print(f"Found and using geopackge file {gpkg}") if gpkg == None: url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/nextgen_{vpu}.gpkg" command = f"wget -P {CACHE_DIR} -c {url}" - threaded_cmd(command, url) + t0 = time.perf_counter() + cmd(command) + dl_time += time.perf_counter() - t0 gpkg = Path(CACHE_DIR, f"nextgen_{vpu}.gpkg") - print(f"Opening {gpkg}...") + if ii_verbose:print(f"Opening {gpkg}...") + t0 = time.perf_counter() polygonfile = gpd.read_file(gpkg, layer="divides") ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) src = ds["RAINRATE"] - print("Generating weights") + if ii_verbose:print("Generating weights") t1 = time.perf_counter() generate_weights_file(polygonfile, src, wgt_file, crosswalk_dict_key="id") - print(f"\nGenerating the weights took {time.perf_counter() - t1:.2f} s") + if ii_verbose:print(f"\nGenerating the weights took {time.perf_counter() - t1:.2f} s") + proc_time +=time.perf_counter() - t0 else: - print( - f"Not creating weight file! Delete this if you want to create a new one: {wgt_file}" - ) + if ii_verbose: + print( + f"Not creating weight file! 
Delete this if you want to create a new one: {wgt_file}" + ) # Get nwm forcing file names + t0 = time.perf_counter() if len(nwm_file) == 0: - print(f"Creating list of file names to locate...") fcst_cycle = [0] nwm_forcing_files = create_file_list( @@ -468,16 +541,23 @@ def main(): urlbaseinput, ) else: - print(f"Reading list of file names from {nwm_file}...") nwm_forcing_files = [] with open(nwm_file, "r") as f: for line in f: nwm_forcing_files.append(line) + if ii_verbose: + print(f'Raw file names:') + for jfile in nwm_forcing_files: + print(f'{jfile}') + + proc_time += time.perf_counter() - t0 # This will look for local raw forcing files and download them if needed + t0 = time.perf_counter() local_nwm_files, remote_nwm_files = locate_dl_files_threaded( - ii_cache, ii_verbose, nwm_forcing_files, dl_threads + ii_cache, ii_verbose, nwm_forcing_files, nthreads ) + dl_time += time.perf_counter() - t0 var_list = [ "U2D", @@ -500,22 +580,46 @@ def main(): "SPFH_2maboveground", "DSWRF_surface", ] - - # TODO: Considering possible memory constraints in this operation, - # let's loop though a certain number of files, write them out, and go back for more + t0 = time.perf_counter() - fd2 = get_forcing_dict_JL( - wgt_file, local_nwm_files, remote_nwm_files, var_list, var_list_out, ii_cache - ) - print(f"Time to create forcing dictionary {time.perf_counter() - t0}") - - print(f"Writing data!") - # Write CSVs to file + # Index remote files with threads + if len(remote_nwm_files) > 0: + args = [] + for i in range(len(remote_nwm_files)): + if ii_verbose: print(f'Doing a threaded remote data retrieval for file {remote_nwm_files[i]}') + args.append([wgt_file, [remote_nwm_files[i]], var_list, i]) + out = threaded_fun(get_forcing_timelist,nthreads,args) + + # If we have any local files, index locally serially + if len(local_nwm_files) > 0: + time_list = get_forcing_timelist( + wgt_file, local_nwm_files, var_list + ) + + # Sync in time between remote and local files + complete_timelist = [] + for i, ifile in enumerate(nwm_forcing_files): + filename = Path(ifile).parts[-1] + for j, jfile in enumerate(local_nwm_files): + if jfile.find(filename) >= 0: + complete_timelist.append(time_list[j]) + for j, jfile in enumerate(remote_nwm_files): + if jfile.find(filename) >= 0: + complete_timelist.append(out[j][0]) + + # Convert time-synced list of catchment dictionaries + # to catchment based dataframes + dfs = time2catchment(complete_timelist, var_list_out) + proc_time = time.perf_counter() - t0 + + # Write to file + if ii_verbose: print(f"Writing data!") t0 = time.perf_counter() - write_int = 100 - for j, jcatch in enumerate(fd2.keys()): - df = fd2[jcatch] + nfiles = len(dfs) + write_int = 1000 + for j, jcatch in enumerate(dfs.keys()): + df = dfs[jcatch] splt = jcatch.split("-") if bucket_type == "local": @@ -539,17 +643,27 @@ def main(): if (j + 1) % write_int == 0: print( - f"{j+1} files written out of {len(fd2)}, {(j+1)/len(fd2)*100:.2f}%", + f"{j+1} files written out of {len(dfs)}, {(j+1)/len(dfs)*100:.2f}%", end="\r", ) + if j == nfiles-1: + print( + f"{j+1} files written out of {len(dfs)}, {(j+1)/len(dfs)*100:.2f}%", + end="\r", + ) + write_time = time.perf_counter() - t0 + total_time = time.perf_counter() - t00 - print(f"\n{file_type} write took {time.perf_counter() - t0:.2f} s\n") - - print( - f"\n\nDone! 
Catchment forcing files have been generated for VPU {vpu} in {bucket_type}\n\n" - ) - print(f"Total run time: {time.perf_counter() - t00:.2f} s") - + print(f'\n\n--------SUMMARY-------') + if bucket_type == 'local': + msg = f'\nData has been written locally to {bucket_path}' + else: + msg = f'\nData has been written to S3 bucket {bucket_name} at {file_prefix}' + msg += f'\nDownloading data : {dl_time:.2f}s' + msg += f'\nProcessing data : {proc_time:.2f}s' + msg += f'\nWriting data : {write_time:.2f}s' + msg += f'\nTotal time : {total_time:.2f}s\n' + print(msg) if __name__ == "__main__": main() diff --git a/ngen_forcing/user_input_ngen.json b/ngen_forcing/user_input_ngen.json index 11b5962..4fef300 100644 --- a/ngen_forcing/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -2,25 +2,30 @@ "forcing" : { "start_date" : "20220822", "end_date" : "20220822", + "cache" : true, "nwm_file" : "", "runinput" : 1, "varinput" : 5, "geoinput" : 1, "meminput" : 0, - "urlbaseinput" : 3, - "cache" : false + "urlbaseinput" : 3 }, "hydrofab" : { - "version" : "v1.2", - "vpu" : "03W" + "version" : "v1.2", + "vpu" : "03W" }, - "verbose" : true, - "bucket_type" : "local", - "bucket_name" : "out_data_CIROH", - "file_prefix" : "data/", - "file_type" : "csv", - "dl_threads" : 10 - + "storage":{ + "bucket_type" : "local", + "bucket_name" : "out_data_CIROH", + "file_prefix" : "data/", + "file_type" : "csv" + }, + + "run" : { + "verbose" : false, + "nthreads" : 1 + } + } diff --git a/ngen_forcing/user_input_ngen.md b/ngen_forcing/user_input_ngen.md index 033fc5b..1b13256 100644 --- a/ngen_forcing/user_input_ngen.md +++ b/ngen_forcing/user_input_ngen.md @@ -9,43 +9,47 @@ contents: "forcing" : { "start_date" : "20220822", "end_date" : "20220822", + "cache" : false, "nwm_file" : "", "runinput" : 1, "varinput" : 5, "geoinput" : 1, "meminput" : 0, - "urlbaseinput" : 3, - "cache" : false + "urlbaseinput" : 3 }, - + "hydrofab" : { - "version" : "v1.2", - "vpu" : "03W" + "version" : "v1.2", + "vpu" : "03W" }, - - "verbose" : true, - "bucket_type" : "local", - "bucket_name" : "out_data_CIROH", - "file_prefix" : "data/", - "file_type" : "csv", - "dl_threads" : 10 - + + "storage":{ + "bucket_type" : "local", + "bucket_name" : "out_data_CIROH", + "file_prefix" : "data/", + "file_type" : "csv" + }, + + "run" : { + "verbose" : false, + "nthreads" : 5 + } + } - - ### forcing | Field Name | Data Type | Description | | --- | --- | --- | | start_date | `string` | YYYYMMDD | | end_date | `string` | YYYYMMDD | -| nwm_files | `string` | Path to a text file containing nwm file names. One filename per line. To have nwm forcing file names generated automatically, leave this option out of the config or set it to "" | +| cache | `bool` |
  • true: Store forcing files locally. Must specify dl_threads
  • false: Interact with forcing files remotely
  • | +| nwm_file | `string` | Path to a text file containing nwm file names. One filename per line. To have nwm forcing file names generated automatically, leave this option out of the config or set it to "" | | runinput | `int` |
    1. short_range
    2. medium_range
    3. medium_range_no_da
    4. long_range
    5. analysis_assim
    6. analysis_assim_extend
    7. analysis_assim_extend_no_da
    8. analysis_assim_long
    9. analysis_assim_long_no_da
    10. analysis_assim_no_da
    11. short_range_no_da
    | -| varinput | `int` |
    1. channel_rt: for real-time channel data
    2. land: for land data
    3. reservoir: for reservoir data
    4. terrain_rt: for real-time terrain data
    5. forcing: for forcing data
    | -| geoinput | `int` |
    1. conus: for continental US
    2. hawaii: for Hawaii
    3. puertorico: for Puerto Rico
    | +| varinput | `int` |
    1. channel_rt
    2. land
    3. reservoir
    4. terrain_rt
    5. forcing
    | +| geoinput | `int` |
    1. conus
    2. hawaii
    3. puertorico
    | | meminput | `int` |
    1. mem_1
    2. mem_2
    3. mem_3
    4. mem_4
    5. mem_5
    6. mem_6
    7. mem_7
    | | urlbaseinput | `int` |
    1. Empty string: use local files
    2. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod/: for real-time operational data from NOAA
    3. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/post-processed/WMS/: for post-processed data from NOAA's Web Map Service
    4. https://storage.googleapis.com/national-water-model/: for input/output data stored on Google Cloud Storage
    5. https://storage.cloud.google.com/national-water-model/: for input/output data stored on Google Cloud Storage
    6. gs://national-water-model/: for input/output data stored on Google Cloud Storage
    7. https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/: for retrospective data from AWS S3
    8. s3://noaa-nwm-retrospective-2-1-pds/model_output/: for retrospective data from AWS S3
    | -| cache | `bool` |
  • true: Store forcing files locally. Must specify dl_threads
  • false: Interact with forcing files remotely
  • | + ### hydrofab | Field Name | Data Type | Description | @@ -53,12 +57,17 @@ contents: | version | `string` | Current hydrofabric version | | vpu | `string` | Check here for map of VPUs https://noaa-owp.github.io/hydrofabric/articles/data_access.html | -### other options +### storage | Field Name | Data Type | Description | | --- | --- | --- | -| verbose | `bool` | Print raw forcing files | | bucket_type | `string` |
    1. "local" : write to local directory
    2. "S3" : output to AWS S3 bucket
    | | bucket_name | `string` | If local, this is the name of the folder the data will be placed in. If S3, this is the name of S3 bucket, which must exist already. | | file_prefix | `string` | If local, this is the relative path to the bucket_name folder. If S3, this is the relative path within the S3 bucket_name bucket to store files | | file_type | `string` |
    1. "csv" : write data as csv files
    2. "parquet" : write data as parquet files
    | -| dl_threads | `int` | Number of threads to use while downloading. | + + +### run +| Field Name | Data Type | Description | +| --- | --- | --- | +| verbose | `bool` | Print raw forcing files | +| nthreads | `int` | Number of threads to use while downloading. | From dd7a721e1f57db584a5100b5fa8a874e02936eb9 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 8 May 2023 14:55:20 -0500 Subject: [PATCH 079/105] blacked --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 122 +++++++++++--------- 1 file changed, 65 insertions(+), 57 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index e1ee25c..9b5edfa 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -233,31 +233,31 @@ def get_forcing_timelist( wgt_file: str, filelist: list, var_list: list, - jt = None, - out = None, + jt=None, + out=None, ): """ - General function to read either remote or local nwm forcing files. + General function to read either remote or local nwm forcing files. Inputs: wgt_file: a path to the weights json, filelist: list of filenames (urls for remote, local paths otherwise), - var_list: list (list of variable names to read), + var_list: list (list of variable names to read), jt: the index to place the file. This is used to ensure elements increase in time, regardless of thread number, out: a list (in time) of forcing data, (THIS IS A THREADING OUTPUT) - + Outputs: df_by_t : (returned for local files) a list (in time) of forcing data. Note that this list may not be consistent in time OR - out : (returned for remote files) a list (in time) of forcing data. + out : (returned for remote files) a list (in time) of forcing data. Each thread will write into this list such that time increases, but may not be consistent - + """ t1 = time.perf_counter() - df_by_t = [] + df_by_t = [] for _i, _nc_file in enumerate(filelist): - if _nc_file[:5] == 'https': + if _nc_file[:5] == "https": eng = "rasterio" # switch engine for remote processing else: eng = "h5netcdf" @@ -270,7 +270,7 @@ def get_forcing_timelist( _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, wgt_file) df_by_t.append(_df_zonal_stats) - if jt == None: + if jt == None: print( f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}% {time.perf_counter() - t1:.2f}s elapsed", end="\r", @@ -281,11 +281,12 @@ def get_forcing_timelist( return df_by_t + def time2catchment(time_list, var_list_out): """ Convert a list of catchment dictionaries into a single dictionary of dataframes for each catchment - Inputs: + Inputs: time_list : a list returned by get_forcing_timelist. It is assumed this list is consistent in time. 
var_list_out : a list of clomun headers for the dataframes @@ -303,7 +304,8 @@ def time2catchment(time_list, var_list_out): return dfs -def cmd(cmd,out=None): + +def cmd(cmd, out=None): """ Execute system commands @@ -369,22 +371,19 @@ def locate_dl_files_threaded( args = [] for i, jcmd in enumerate(cmds): args.append([jcmd]) - out = threaded_fun(cmd,nthreads,args) + out = threaded_fun(cmd, nthreads, args) return local_files, remote_files -def threaded_fun(fun, - nthreads : int, - args : list): - + +def threaded_fun(fun, nthreads: int, args: list): """ Threaded function call """ threads = [] out = [None for x in range(len(args))] - for i in range(len(args)): - - if i >= nthreads: # Assign new jobs as threads finish + for i in range(len(args)): + if i >= nthreads: # Assign new jobs as threads finish k = 0 while True: jj = k % nthreads @@ -392,12 +391,12 @@ def threaded_fun(fun, if jthread.is_alive(): k += 1 time.sleep(0.25) - else: - t = threading.Thread(target=fun, args= [*args[i], out]) + else: + t = threading.Thread(target=fun, args=[*args[i], out]) t.start() threads[jj] = t break - else: # Initial set of threads + else: # Initial set of threads t = threading.Thread(target=fun, args=[*args[i], out]) t.start() threads.append(t) @@ -407,12 +406,13 @@ def threaded_fun(fun, while done < len(threads): done = 0 for jthread in threads: - if not jthread.is_alive(): + if not jthread.is_alive(): done += 1 time.sleep(0.25) return out + def main(): """ Primary function to retrieve forcing and hydrofabric data and convert it into files that can be ingested into ngen. @@ -432,7 +432,7 @@ def main(): dest="infile", type=str, help="A json containing user inputs to run ngen" ) args = parser.parse_args() - + # Extract configurations conf = json.load(open(args.infile)) start_date = conf["forcing"]["start_date"] @@ -452,12 +452,13 @@ def main(): bucket_type = conf["storage"]["bucket_type"] bucket_name = conf["storage"]["bucket_name"] file_prefix = conf["storage"]["file_prefix"] - file_type = conf["storage"]["file_type"] + file_type = conf["storage"]["file_type"] ii_verbose = conf["run"]["verbose"] nthreads = conf["run"]["nthreads"] - print(f'\nWelcome to Preparing Data for NextGen-Based Simulations!\n') - if not ii_verbose: print(f'Generating files now! This may take a few moments...') + print(f"\nWelcome to Preparing Data for NextGen-Based Simulations!\n") + if not ii_verbose: + print(f"Generating files now! 
This may take a few moments...") dl_time = 0 proc_time = 0 @@ -498,7 +499,8 @@ def main(): for jfile in os.listdir(CACHE_DIR): if jfile.find(f"nextgen_{vpu}.gpkg") >= 0: gpkg = Path(CACHE_DIR, jfile) - if ii_verbose:print(f"Found and using geopackge file {gpkg}") + if ii_verbose: + print(f"Found and using geopackge file {gpkg}") if gpkg == None: url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/nextgen_{vpu}.gpkg" command = f"wget -P {CACHE_DIR} -c {url}" @@ -507,18 +509,21 @@ def main(): dl_time += time.perf_counter() - t0 gpkg = Path(CACHE_DIR, f"nextgen_{vpu}.gpkg") - if ii_verbose:print(f"Opening {gpkg}...") + if ii_verbose: + print(f"Opening {gpkg}...") t0 = time.perf_counter() polygonfile = gpd.read_file(gpkg, layer="divides") ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) src = ds["RAINRATE"] - if ii_verbose:print("Generating weights") + if ii_verbose: + print("Generating weights") t1 = time.perf_counter() generate_weights_file(polygonfile, src, wgt_file, crosswalk_dict_key="id") - if ii_verbose:print(f"\nGenerating the weights took {time.perf_counter() - t1:.2f} s") - proc_time +=time.perf_counter() - t0 + if ii_verbose: + print(f"\nGenerating the weights took {time.perf_counter() - t1:.2f} s") + proc_time += time.perf_counter() - t0 else: if ii_verbose: print( @@ -545,10 +550,10 @@ def main(): with open(nwm_file, "r") as f: for line in f: nwm_forcing_files.append(line) - if ii_verbose: - print(f'Raw file names:') + if ii_verbose: + print(f"Raw file names:") for jfile in nwm_forcing_files: - print(f'{jfile}') + print(f"{jfile}") proc_time += time.perf_counter() - t0 @@ -580,22 +585,23 @@ def main(): "SPFH_2maboveground", "DSWRF_surface", ] - + t0 = time.perf_counter() # Index remote files with threads - if len(remote_nwm_files) > 0: + if len(remote_nwm_files) > 0: args = [] - for i in range(len(remote_nwm_files)): - if ii_verbose: print(f'Doing a threaded remote data retrieval for file {remote_nwm_files[i]}') + for i in range(len(remote_nwm_files)): + if ii_verbose: + print( + f"Doing a threaded remote data retrieval for file {remote_nwm_files[i]}" + ) args.append([wgt_file, [remote_nwm_files[i]], var_list, i]) - out = threaded_fun(get_forcing_timelist,nthreads,args) + out = threaded_fun(get_forcing_timelist, nthreads, args) # If we have any local files, index locally serially - if len(local_nwm_files) > 0: - time_list = get_forcing_timelist( - wgt_file, local_nwm_files, var_list - ) + if len(local_nwm_files) > 0: + time_list = get_forcing_timelist(wgt_file, local_nwm_files, var_list) # Sync in time between remote and local files complete_timelist = [] @@ -606,7 +612,7 @@ def main(): complete_timelist.append(time_list[j]) for j, jfile in enumerate(remote_nwm_files): if jfile.find(filename) >= 0: - complete_timelist.append(out[j][0]) + complete_timelist.append(out[j][0]) # Convert time-synced list of catchment dictionaries # to catchment based dataframes @@ -614,7 +620,8 @@ def main(): proc_time = time.perf_counter() - t0 # Write to file - if ii_verbose: print(f"Writing data!") + if ii_verbose: + print(f"Writing data!") t0 = time.perf_counter() nfiles = len(dfs) write_int = 1000 @@ -646,24 +653,25 @@ def main(): f"{j+1} files written out of {len(dfs)}, {(j+1)/len(dfs)*100:.2f}%", end="\r", ) - if j == nfiles-1: + if j == nfiles - 1: print( f"{j+1} files written out of {len(dfs)}, {(j+1)/len(dfs)*100:.2f}%", end="\r", - ) + ) write_time = time.perf_counter() - t0 total_time = time.perf_counter() - t00 - print(f'\n\n--------SUMMARY-------') - if bucket_type == 
'local': - msg = f'\nData has been written locally to {bucket_path}' + print(f"\n\n--------SUMMARY-------") + if bucket_type == "local": + msg = f"\nData has been written locally to {bucket_path}" else: - msg = f'\nData has been written to S3 bucket {bucket_name} at {file_prefix}' - msg += f'\nDownloading data : {dl_time:.2f}s' - msg += f'\nProcessing data : {proc_time:.2f}s' - msg += f'\nWriting data : {write_time:.2f}s' - msg += f'\nTotal time : {total_time:.2f}s\n' + msg = f"\nData has been written to S3 bucket {bucket_name} at {file_prefix}" + msg += f"\nDownloading data : {dl_time:.2f}s" + msg += f"\nProcessing data : {proc_time:.2f}s" + msg += f"\nWriting data : {write_time:.2f}s" + msg += f"\nTotal time : {total_time:.2f}s\n" print(msg) + if __name__ == "__main__": main() From 3461a7cf0536ab8aee5ee301bab3e400b619ac61 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Tue, 9 May 2023 01:50:57 -0500 Subject: [PATCH 080/105] Fixed threading and added local file check --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 172 ++++++++++---------- ngen_forcing/user_input_ngen.json | 4 +- 2 files changed, 87 insertions(+), 89 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 9b5edfa..3932095 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -20,6 +20,8 @@ import boto3 from io import BytesIO + +import concurrent.futures as cf import threading pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "nwm_filenames") @@ -233,8 +235,7 @@ def get_forcing_timelist( wgt_file: str, filelist: list, var_list: list, - jt=None, - out=None, + jt=None ): """ General function to read either remote or local nwm forcing files. @@ -244,18 +245,16 @@ def get_forcing_timelist( filelist: list of filenames (urls for remote, local paths otherwise), var_list: list (list of variable names to read), jt: the index to place the file. This is used to ensure elements increase in time, regardless of thread number, - out: a list (in time) of forcing data, (THIS IS A THREADING OUTPUT) Outputs: df_by_t : (returned for local files) a list (in time) of forcing data. Note that this list may not be consistent in time - OR - out : (returned for remote files) a list (in time) of forcing data. 
- Each thread will write into this list such that time increases, but may not be consistent + t : model_output_valid_time for each """ t1 = time.perf_counter() df_by_t = [] + t = [] for _i, _nc_file in enumerate(filelist): if _nc_file[:5] == "https": eng = "rasterio" # switch engine for remote processing @@ -269,20 +268,20 @@ def get_forcing_timelist( data_allvars[var_dx, :, :] = np.squeeze(_xds[jvar].values) _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, wgt_file) df_by_t.append(_df_zonal_stats) + time_splt = _xds.attrs["model_output_valid_time"].split("_") + t.append(time_splt[0] + " " + time_splt[1]) if jt == None: print( f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}% {time.perf_counter() - t1:.2f}s elapsed", end="\r", ) + if _i == len(filelist) -1: print() - if not jt == None: - out[jt] = df_by_t - - return df_by_t + return df_by_t, t -def time2catchment(time_list, var_list_out): +def time2catchment(data_list, time_list, var_list_out): """ Convert a list of catchment dictionaries into a single dictionary of dataframes for each catchment @@ -296,16 +295,18 @@ def time2catchment(time_list, var_list_out): """ dfs = {} - for jcat in list(time_list[0].keys()): + for jcat in list(data_list[0].keys()): data_catch = [] - for jt in range(len(time_list)): - data_catch.append(time_list[jt][jcat]) + for jt in range(len(data_list)): + data_catch.append(data_list[jt][jcat]) dfs[jcat] = pd.DataFrame(data_catch, columns=var_list_out) + dfs[jcat]["time"] = time_list + dfs[jcat] = dfs[jcat][["time"] + var_list_out] return dfs -def cmd(cmd, out=None): +def cmd(cmd): """ Execute system commands @@ -344,21 +345,37 @@ def locate_dl_files_threaded( for jfile in forcing_file_names: file_parts = Path(jfile).parts local_file = os.path.join(CACHE_DIR, file_parts[-1]) + ii_dl = False # decide whether to use local file, download it, or index it remotely if os.path.exists(local_file): - # If the file exists local, get data from this file regardless of ii_cache option - if ii_verbose and ii_cache: - print(f"Found and using local raw forcing file {local_file}") - elif ii_verbose and not ii_cache: - print( - f"CACHE OPTION OVERRIDE : Found and using local raw forcing file {local_file}" - ) - local_files.append(local_file) + + # Check to make sure file is not broken + try: + with xr.open_dataset(local_file, engine="h5netcdf") as _xds: + pass + if ii_cache: + if ii_verbose: print(f"Found and using local raw forcing file {local_file}") + else: + if ii_verbose: print( + f"CACHE OPTION OVERRIDE : Found and using local raw forcing file {local_file}" + ) + local_files.append(local_file) + except: + if ii_cache: + if ii_verbose: print(f"{local_file} is broken! Will Download") + ii_dl = True + else: + if ii_verbose: print(f"{local_file} is broken! Will index remotely") + remote_files.append(jfile) + elif not os.path.exists(local_file) and not ii_cache: # If file is not found locally, and we don't want to cache it, append to remote file list remote_files.append(jfile) elif not os.path.exists(local_file) and ii_cache: + ii_dl = True + + if ii_dl: # Download file if ii_verbose: print(f"Forcing file not found! 
Downloading {jfile}") @@ -367,52 +384,14 @@ def locate_dl_files_threaded( dl_files.append(jfile) local_files.append(local_file) + # Get files with pool if len(cmds) > 0: - args = [] - for i, jcmd in enumerate(cmds): - args.append([jcmd]) - out = threaded_fun(cmd, nthreads, args) + pool = cf.ThreadPoolExecutor(max_workers=nthreads) + pool.map(cmd, cmds) + pool.shutdown() return local_files, remote_files - -def threaded_fun(fun, nthreads: int, args: list): - """ - Threaded function call - """ - threads = [] - out = [None for x in range(len(args))] - for i in range(len(args)): - if i >= nthreads: # Assign new jobs as threads finish - k = 0 - while True: - jj = k % nthreads - jthread = threads[jj] - if jthread.is_alive(): - k += 1 - time.sleep(0.25) - else: - t = threading.Thread(target=fun, args=[*args[i], out]) - t.start() - threads[jj] = t - break - else: # Initial set of threads - t = threading.Thread(target=fun, args=[*args[i], out]) - t.start() - threads.append(t) - - # Ensure all threads are finished - done = 0 - while done < len(threads): - done = 0 - for jthread in threads: - if not jthread.is_alive(): - done += 1 - time.sleep(0.25) - - return out - - def main(): """ Primary function to retrieve forcing and hydrofabric data and convert it into files that can be ingested into ngen. @@ -587,41 +566,60 @@ def main(): ] t0 = time.perf_counter() - # Index remote files with threads - if len(remote_nwm_files) > 0: - args = [] - for i in range(len(remote_nwm_files)): - if ii_verbose: - print( - f"Doing a threaded remote data retrieval for file {remote_nwm_files[i]}" - ) - args.append([wgt_file, [remote_nwm_files[i]], var_list, i]) - out = threaded_fun(get_forcing_timelist, nthreads, args) + pool = cf.ThreadPoolExecutor(max_workers=nthreads) + arg0 = [] + arg1 = [] + arg2 = [] + arg3 = [] + for i in range(len(remote_nwm_files)): + if ii_verbose: + print( + f"Doing a threaded remote data retrieval for file {remote_nwm_files[i]}" + ) + arg0.append(wgt_file) + arg1.append([remote_nwm_files[i]]) + arg2.append(var_list) + arg3.append(i) + results = pool.map(get_forcing_timelist, arg0,arg1,arg2,arg3) + + # Get data + remote_data_list = [] + for jres in results: + remote_data_list.append(jres) + + # Build time axis + t_ax_remote = [] + for i in range(len(remote_nwm_files)): + t_ax_remote.append(remote_data_list[i][1]) # If we have any local files, index locally serially if len(local_nwm_files) > 0: - time_list = get_forcing_timelist(wgt_file, local_nwm_files, var_list) + data_list, t_ax_local = get_forcing_timelist( + wgt_file, local_nwm_files, var_list + ) # Sync in time between remote and local files - complete_timelist = [] + complete_data_timelist = [] + timelist = [] for i, ifile in enumerate(nwm_forcing_files): filename = Path(ifile).parts[-1] for j, jfile in enumerate(local_nwm_files): if jfile.find(filename) >= 0: - complete_timelist.append(time_list[j]) + complete_data_timelist.append(data_list[j]) + timelist.append(t_ax_local[j]) for j, jfile in enumerate(remote_nwm_files): if jfile.find(filename) >= 0: - complete_timelist.append(out[j][0]) + complete_data_timelist.append(remote_data_list[j][0][0]) + timelist.append(t_ax_remote[j]) # Convert time-synced list of catchment dictionaries # to catchment based dataframes - dfs = time2catchment(complete_timelist, var_list_out) - proc_time = time.perf_counter() - t0 + if ii_verbose: print(f'Reformatting data into dataframes...') + dfs = time2catchment(complete_data_timelist, timelist, var_list_out) + proc_time += time.perf_counter() - t0 # Write to 
file - if ii_verbose: - print(f"Writing data!") t0 = time.perf_counter() nfiles = len(dfs) write_int = 1000 @@ -632,7 +630,7 @@ def main(): if bucket_type == "local": if file_type == "csv": csvname = Path(bucket_path, f"cat{vpu}_{splt[1]}.csv") - df.to_csv(csvname) + df.to_csv(csvname, index=False) if file_type == "parquet": parq_file = Path(bucket_path, f"cat{vpu}_{splt[1]}.parquet") df.to_parquet(parq_file) @@ -666,9 +664,9 @@ def main(): msg = f"\nData has been written locally to {bucket_path}" else: msg = f"\nData has been written to S3 bucket {bucket_name} at {file_prefix}" - msg += f"\nDownloading data : {dl_time:.2f}s" - msg += f"\nProcessing data : {proc_time:.2f}s" - msg += f"\nWriting data : {write_time:.2f}s" + msg += f"\Check and DL data : {dl_time:.2f}s" + msg += f"\nProcess data : {proc_time:.2f}s" + msg += f"\nWrite data : {write_time:.2f}s" msg += f"\nTotal time : {total_time:.2f}s\n" print(msg) diff --git a/ngen_forcing/user_input_ngen.json b/ngen_forcing/user_input_ngen.json index 4fef300..eebfb5e 100644 --- a/ngen_forcing/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -24,8 +24,8 @@ }, "run" : { - "verbose" : false, - "nthreads" : 1 + "verbose" : true, + "nthreads" : 10 } } From bf96bd3e915866e9e30403689f09f49030ae4713 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 11 May 2023 13:52:01 -0500 Subject: [PATCH 081/105] removed print statements --- subsetting/subset.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/subsetting/subset.py b/subsetting/subset.py index 973fa10..aa03f21 100644 --- a/subsetting/subset.py +++ b/subsetting/subset.py @@ -95,7 +95,11 @@ def get_upstream_ids(divides, nexus, catchment_id): # os._exit(1) for cid in catchment_ids: graph_nodes.put((catchment_id, 0, True)) - graph_nodes.put((cat_index.loc[cid].item(), 0, False)) + try: + graph_nodes.put((cat_index.loc[cid].item(), 0, False)) + except: + raise Exception(f'catchment id {cid} is not found in geopackage!') + cat_ids = set() nex_ids = set() @@ -150,9 +154,9 @@ def subset_upstream(hydrofabric: Path, ids: "List") -> None: # print(nex_ids) # print(wb_ids) # Useful for looking at the name of each layer and which id index is needed to subset it - for layer in layers: + # for layer in layers: # df = gpd.read_file(hydrofabric, layer=layer) - print(layer) + # print(layer) # print(df.head()) flowpaths = ( From 37100148e84c7ee93a5b7028cece663eb9d8fba7 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 11 May 2023 14:53:38 -0500 Subject: [PATCH 082/105] Threaded local data processing and updated user_inputs --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 288 ++++++++++++-------- ngen_forcing/user_input_ngen.json | 20 +- ngen_forcing/user_input_ngen.md | 43 ++- 3 files changed, 215 insertions(+), 136 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 3932095..fe919c6 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -19,15 +19,18 @@ import time import boto3 from io import BytesIO - - +import matplotlib.pyplot as plt +from mpl_toolkits.basemap import Basemap import concurrent.futures as cf -import threading pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "nwm_filenames") sys.path.append(str(pkg_dir)) from listofnwmfilenames import create_file_list +pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "subsetting") +sys.path.append(str(pkg_dir)) +from subset import subset_upstream + TEMPLATE_BLOB_NAME = ( 
"nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" ) @@ -53,7 +56,7 @@ # TODO Make CACHE_DIR configurable CACHE_DIR = Path( - pkg_dir.parent, "data", "raw_data" + pkg_dir.parent, "data", "cache" ) # Maybe this should have a date attached to the name NWM_CACHE_DIR = os.path.join(CACHE_DIR, "nwm") @@ -231,12 +234,7 @@ def calc_zonal_stats_weights_new( return mean_dict -def get_forcing_timelist( - wgt_file: str, - filelist: list, - var_list: list, - jt=None -): +def get_forcing_timelist(wgt_file: str, filelist: list, var_list: list): """ General function to read either remote or local nwm forcing files. @@ -271,13 +269,6 @@ def get_forcing_timelist( time_splt = _xds.attrs["model_output_valid_time"].split("_") t.append(time_splt[0] + " " + time_splt[1]) - if jt == None: - print( - f"Indexing catchment data progress -> {_i+1} files proccessed out of {len(filelist)}, {(_i+1)/len(filelist)*100:.2f}% {time.perf_counter() - t1:.2f}s elapsed", - end="\r", - ) - if _i == len(filelist) -1: print() - return df_by_t, t @@ -349,26 +340,29 @@ def locate_dl_files_threaded( # decide whether to use local file, download it, or index it remotely if os.path.exists(local_file): - # Check to make sure file is not broken try: with xr.open_dataset(local_file, engine="h5netcdf") as _xds: - pass + pass if ii_cache: - if ii_verbose: print(f"Found and using local raw forcing file {local_file}") + if ii_verbose: + print(f"Found and using local raw forcing file {local_file}") else: - if ii_verbose: print( - f"CACHE OPTION OVERRIDE : Found and using local raw forcing file {local_file}" - ) + if ii_verbose: + print( + f"CACHE OPTION OVERRIDE : Found and using local raw forcing file {local_file}" + ) local_files.append(local_file) - except: + except: if ii_cache: - if ii_verbose: print(f"{local_file} is broken! Will Download") + if ii_verbose: + print(f"{local_file} is broken! Will Download") ii_dl = True else: - if ii_verbose: print(f"{local_file} is broken! Will index remotely") - remote_files.append(jfile) - + if ii_verbose: + print(f"{local_file} is broken! Will index remotely") + remote_files.append(jfile) + elif not os.path.exists(local_file) and not ii_cache: # If file is not found locally, and we don't want to cache it, append to remote file list remote_files.append(jfile) @@ -392,6 +386,34 @@ def locate_dl_files_threaded( return local_files, remote_files +def threaded_data_extract(files,nthreads,ii_verbose,wgt_file,var_list): + """ + Sets up the thread pool for get_forcing_timelist and returns the data and time axis ordered in time + + """ + pool = cf.ThreadPoolExecutor(max_workers=nthreads) + arg0 = [] + arg1 = [] + arg2 = [] + for i in range(len(files)): + arg0.append(wgt_file) + arg1.append([files[i]]) + arg2.append(var_list) + + results = pool.map(get_forcing_timelist, arg0, arg1, arg2) + + data_list = [] + for jres in results: + data_list.append(jres) + + # Build time axis + t_ax_local = [] + for i in range(len(files)): + t_ax_local.append(data_list[i][1]) + + return data_list, t_ax_local + + def main(): """ Primary function to retrieve forcing and hydrofabric data and convert it into files that can be ingested into ngen. 
@@ -414,35 +436,44 @@ def main(): # Extract configurations conf = json.load(open(args.infile)) - start_date = conf["forcing"]["start_date"] - end_date = conf["forcing"]["end_date"] - if "nwm_file" in conf["forcing"]: - nwm_file = conf["forcing"]["nwm_file"] - else: - nwm_file = "" - runinput = conf["forcing"]["runinput"] - varinput = conf["forcing"]["varinput"] - geoinput = conf["forcing"]["geoinput"] - meminput = conf["forcing"]["meminput"] - urlbaseinput = conf["forcing"]["urlbaseinput"] + forcing_type = conf["forcing"]["forcing_type"] ii_cache = conf["forcing"]["cache"] - version = conf["hydrofab"]["version"] - vpu = conf["hydrofab"]["vpu"] + + start_date = conf["forcing"].get("start_date",None) + end_date = conf["forcing"].get("end_date",None) + runinput = conf["forcing"].get("runinput",None) + varinput = conf["forcing"].get("varinput",None) + geoinput = conf["forcing"].get("geoinput",None) + meminput = conf["forcing"].get("meminput",None) + urlbaseinput = conf["forcing"].get("urlbaseinput",None) + nwm_file = conf["forcing"].get("nwm_file",None) + fcst_cycle = conf["forcing"].get("fcst_cycle",None) + lead_time = conf["forcing"].get("lead_time",None) + + version = conf["hydrofab"].get('version','v1.2') + vpu = conf["hydrofab"].get("vpu") + catchment_subset = conf['hydrofab'].get("catch_subset") + geopkg_file = conf["hydrofab"].get("geopkg_file") + ii_weights_only = conf['hydrofab'].get('weights_only',False) + bucket_type = conf["storage"]["bucket_type"] bucket_name = conf["storage"]["bucket_name"] file_prefix = conf["storage"]["file_prefix"] file_type = conf["storage"]["file_type"] - ii_verbose = conf["run"]["verbose"] - nthreads = conf["run"]["nthreads"] + + ii_verbose = conf["run"]["verbose"] + dl_threads = conf["run"]["dl_threads"] + proc_threads = conf["run"]["proc_threads"] print(f"\nWelcome to Preparing Data for NextGen-Based Simulations!\n") - if not ii_verbose: - print(f"Generating files now! This may take a few moments...") dl_time = 0 proc_time = 0 # configuration validation + accepted = ['operational_archive','retrospective','from_file'] + msg = f'{forcing_type} is not a valid input for \"forcing_type\"\nAccepted inputs: {accepted}' + assert forcing_type in accepted, msg file_types = ["csv", "parquet"] assert ( file_type in file_types @@ -451,6 +482,7 @@ def main(): assert ( bucket_type in bucket_types ), f"{bucket_type} for bucket_type is not accepted! 
Accepted: {bucket_types}" + assert vpu is not None or geopkg_file is not None, "Need to input either vpu or geopkg_file" # Set paths and make directories if needed top_dir = Path(os.path.dirname(args.infile)).parent @@ -462,57 +494,100 @@ def main(): # Prep output directory if bucket_type == "local": bucket_path = Path(top_dir, file_prefix, bucket_name) + forcing_path = Path(bucket_path, 'forcing') if not os.path.exists(bucket_path): - os.system(f"mkdir {bucket_path}") + os.system(f"mkdir {bucket_path}") + os.system(f"mkdir {forcing_path}") if not os.path.exists(bucket_path): raise Exception(f"Creating {bucket_path} failed!") elif bucket_type == "S3": s3 = boto3.client("s3") # Generate weight file only if one doesn't exist already - # Very time consuming so we don't want to do this if we can avoid it - wgt_file = os.path.join(CACHE_DIR, "weights.json") + if catchment_subset is not None: + wgt_file = os.path.join(CACHE_DIR, f"{catchment_subset}_upstream_weights.json") + else: + wgt_file = os.path.join(CACHE_DIR, f"{vpu}_weights.json") if not os.path.exists(wgt_file): - # Search for geopackage that matches the requested VPU, if it exists - gpkg = None - for jfile in os.listdir(CACHE_DIR): - if jfile.find(f"nextgen_{vpu}.gpkg") >= 0: - gpkg = Path(CACHE_DIR, jfile) - if ii_verbose: - print(f"Found and using geopackge file {gpkg}") - if gpkg == None: - url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/nextgen_{vpu}.gpkg" - command = f"wget -P {CACHE_DIR} -c {url}" - t0 = time.perf_counter() - cmd(command) - dl_time += time.perf_counter() - t0 - gpkg = Path(CACHE_DIR, f"nextgen_{vpu}.gpkg") - if ii_verbose: - print(f"Opening {gpkg}...") - t0 = time.perf_counter() - polygonfile = gpd.read_file(gpkg, layer="divides") + # Use geopkg_file if given + if geopkg_file is not None: + gpkg = Path(Path(os.path.dirname(__file__)).parent,geopkg_file) + if not gpkg.exists: + raise Exception(f"{gpkg} doesn't exist!!") + + elif catchment_subset is not None: + gpkg = Path(Path(os.path.dirname(__file__)).parent,catchment_subset + '_upstream_subset.gpkg') + + # Default to geopackage that matches the requested VPU + else: + gpkg = None + for jfile in os.listdir(CACHE_DIR): + if jfile.find(f"nextgen_{vpu}.gpkg") >= 0: + gpkg = Path(CACHE_DIR, jfile) + if ii_verbose: + print(f"Found and using geopackge file {gpkg}") + if gpkg == None: + url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/nextgen_{vpu}.gpkg" + command = f"wget -P {CACHE_DIR} -c {url}" + t0 = time.perf_counter() + cmd(command) + dl_time += time.perf_counter() - t0 + gpkg = Path(CACHE_DIR, f"nextgen_{vpu}.gpkg") + + if not os.path.exists(gpkg): + + # Generate geopackage through subsetting routine. This will generate ngen geojsons files + if catchment_subset is not None: + if ii_verbose: print(f'Subsetting catchment with id {catchment_subset} from {gpkg}') + subset_upstream(gpkg,catchment_subset) + + # geojsons will be placed in working directory. 
Copy them to bucket + if bucket_type == 'local': + out_path = Path(bucket_path,'configs') + if not os.path.exists(out_path): os.system(f'mkdir {out_path}') + os.system(f"mv ./catchments.geojson ./nexus.geojson ./crosswalk.json ./flowpaths.geojson ./flowpath_edge_list.json {out_path}") + else: + print(f'UNTESTED!!') + files = ["./catchments.geojson" "./nexus.geojson" "./crosswalk.json" "./flowpaths.geojson" "./flowpath_edge_list.json"] + buf = BytesIO() + for jfile in files: + s3.put_object( + Body=json.dumps(jfile), + Bucket={bucket_name} + ) + + # TODO: Create Realization file + # TODO: Validate configs + else: + if ii_verbose: + print(f"Opening {gpkg}...") + t0 = time.perf_counter() + polygonfile = gpd.read_file(gpkg, layer="divides") - ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) - src = ds["RAINRATE"] + ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) + src = ds["RAINRATE"] - if ii_verbose: - print("Generating weights") - t1 = time.perf_counter() - generate_weights_file(polygonfile, src, wgt_file, crosswalk_dict_key="id") - if ii_verbose: - print(f"\nGenerating the weights took {time.perf_counter() - t1:.2f} s") - proc_time += time.perf_counter() - t0 + if ii_verbose: + print("Generating weights") + t1 = time.perf_counter() + generate_weights_file(polygonfile, src, wgt_file, crosswalk_dict_key="id") + if ii_verbose: + print(f"\nGenerating the weights took {time.perf_counter() - t1:.2f} s") + proc_time += time.perf_counter() - t0 else: if ii_verbose: print( f"Not creating weight file! Delete this if you want to create a new one: {wgt_file}" ) + # Exit early if we only want to calculate the weights + if ii_weights_only: + exit + # Get nwm forcing file names t0 = time.perf_counter() - if len(nwm_file) == 0: - fcst_cycle = [0] + if not forcing_type == 'from_file': nwm_forcing_files = create_file_list( runinput, @@ -523,6 +598,7 @@ def main(): end_date, fcst_cycle, urlbaseinput, + lead_time ) else: nwm_forcing_files = [] @@ -539,7 +615,7 @@ def main(): # This will look for local raw forcing files and download them if needed t0 = time.perf_counter() local_nwm_files, remote_nwm_files = locate_dl_files_threaded( - ii_cache, ii_verbose, nwm_forcing_files, nthreads + ii_cache, ii_verbose, nwm_forcing_files, dl_threads ) dl_time += time.perf_counter() - t0 @@ -567,46 +643,29 @@ def main(): t0 = time.perf_counter() # Index remote files with threads - pool = cf.ThreadPoolExecutor(max_workers=nthreads) - arg0 = [] - arg1 = [] - arg2 = [] - arg3 = [] - for i in range(len(remote_nwm_files)): + if len(remote_nwm_files) > 0: if ii_verbose: print( - f"Doing a threaded remote data retrieval for file {remote_nwm_files[i]}" + f"Performing threaded remote data extraction with {proc_threads} workers..." 
) - arg0.append(wgt_file) - arg1.append([remote_nwm_files[i]]) - arg2.append(var_list) - arg3.append(i) - results = pool.map(get_forcing_timelist, arg0,arg1,arg2,arg3) - - # Get data - remote_data_list = [] - for jres in results: - remote_data_list.append(jres) - - # Build time axis - t_ax_remote = [] - for i in range(len(remote_nwm_files)): - t_ax_remote.append(remote_data_list[i][1]) + remote_data_list, t_ax_remote = threaded_data_extract(remote_nwm_files,proc_threads,ii_verbose,wgt_file,var_list) - # If we have any local files, index locally serially + # Index local files with threads if len(local_nwm_files) > 0: - data_list, t_ax_local = get_forcing_timelist( - wgt_file, local_nwm_files, var_list - ) + if ii_verbose: + print( + f"Performing threaded local data extraction with {proc_threads} workers..." + ) + local_data_list, t_ax_local = threaded_data_extract(local_nwm_files,proc_threads,ii_verbose,wgt_file,var_list) # Sync in time between remote and local files complete_data_timelist = [] timelist = [] - for i, ifile in enumerate(nwm_forcing_files): + for ifile in nwm_forcing_files: filename = Path(ifile).parts[-1] for j, jfile in enumerate(local_nwm_files): if jfile.find(filename) >= 0: - complete_data_timelist.append(data_list[j]) + complete_data_timelist.append(local_data_list[j][0][0]) timelist.append(t_ax_local[j]) for j, jfile in enumerate(remote_nwm_files): if jfile.find(filename) >= 0: @@ -615,7 +674,8 @@ def main(): # Convert time-synced list of catchment dictionaries # to catchment based dataframes - if ii_verbose: print(f'Reformatting data into dataframes...') + if ii_verbose: + print(f"Reformatting data into dataframes...") dfs = time2catchment(complete_data_timelist, timelist, var_list_out) proc_time += time.perf_counter() - t0 @@ -629,10 +689,10 @@ def main(): if bucket_type == "local": if file_type == "csv": - csvname = Path(bucket_path, f"cat{vpu}_{splt[1]}.csv") + csvname = Path(forcing_path, f"cat{vpu}_{splt[1]}.csv") df.to_csv(csvname, index=False) if file_type == "parquet": - parq_file = Path(bucket_path, f"cat{vpu}_{splt[1]}.parquet") + parq_file = Path(forcing_path, f"cat{vpu}_{splt[1]}.parquet") df.to_parquet(parq_file) elif bucket_type == "S3": buf = BytesIO() @@ -643,7 +703,7 @@ def main(): csvname = f"cat{vpu}_{splt[1]}.csv" df.to_csv(buf, index=False) buf.seek(0) - key_name = f"{file_prefix}{csvname}" + key_name = f"{file_prefix}/forcing/{csvname}" s3.put_object(Bucket=bucket_name, Key=key_name, Body=buf.getvalue()) if (j + 1) % write_int == 0: @@ -664,10 +724,10 @@ def main(): msg = f"\nData has been written locally to {bucket_path}" else: msg = f"\nData has been written to S3 bucket {bucket_name} at {file_prefix}" - msg += f"\Check and DL data : {dl_time:.2f}s" - msg += f"\nProcess data : {proc_time:.2f}s" - msg += f"\nWrite data : {write_time:.2f}s" - msg += f"\nTotal time : {total_time:.2f}s\n" + msg += f"\nCheck and DL data : {dl_time:.2f}s" + msg += f"\nProcess data : {proc_time:.2f}s" + msg += f"\nWrite data : {write_time:.2f}s" + msg += f"\nTotal time : {total_time:.2f}s\n" print(msg) diff --git a/ngen_forcing/user_input_ngen.json b/ngen_forcing/user_input_ngen.json index eebfb5e..94f64df 100644 --- a/ngen_forcing/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -1,31 +1,37 @@ { "forcing" : { + "forcing_type" : "operational_archive", "start_date" : "20220822", "end_date" : "20220822", - "cache" : true, + "cache" : true, "nwm_file" : "", "runinput" : 1, "varinput" : 5, "geoinput" : 1, "meminput" : 0, - "urlbaseinput" : 3 + 
"urlbaseinput" : 3, + "fcst_cycle" : [0], + "lead_time" : null }, "hydrofab" : { - "version" : "v1.2", - "vpu" : "03W" + "version" : "v1.2", + "vpu" : "03W", + "catch_subset" : "cat-112977", + "weights_only" : false }, "storage":{ "bucket_type" : "local", - "bucket_name" : "out_data_CIROH", + "bucket_name" : "ngen_inputs", "file_prefix" : "data/", "file_type" : "csv" }, "run" : { - "verbose" : true, - "nthreads" : 10 + "verbose" : true, + "dl_threads" : 10, + "proc_threads" : 2 } } diff --git a/ngen_forcing/user_input_ngen.md b/ngen_forcing/user_input_ngen.md index 1b13256..6da6cab 100644 --- a/ngen_forcing/user_input_ngen.md +++ b/ngen_forcing/user_input_ngen.md @@ -7,61 +7,73 @@ contents: { "forcing" : { + "forcing_type" : "operational_archive", "start_date" : "20220822", "end_date" : "20220822", - "cache" : false, + "cache" : true, "nwm_file" : "", "runinput" : 1, "varinput" : 5, "geoinput" : 1, "meminput" : 0, - "urlbaseinput" : 3 + "urlbaseinput" : 3, + "fcst_cycle" : [0], + "lead_time" : null }, - + "hydrofab" : { - "version" : "v1.2", - "vpu" : "03W" + "version" : "v1.2", + "vpu" : "03W", + "catch_subset" : "cat-112977", + "weights_only" : false }, - + "storage":{ "bucket_type" : "local", - "bucket_name" : "out_data_CIROH", + "bucket_name" : "ngen_inputs", "file_prefix" : "data/", "file_type" : "csv" }, - + "run" : { - "verbose" : false, - "nthreads" : 5 + "verbose" : true, + "nthreads" : 2 } - + } ### forcing | Field Name | Data Type | Description | | --- | --- | --- | +| forcing_type | `string` |
  • operational_archive
  • retrospective
  • from_file
  • | | start_date | `string` | YYYYMMDD | | end_date | `string` | YYYYMMDD | | cache | `bool` |
  • true: Store forcing files locally. Must specify dl_threads
  • false: Interact with forcing files remotely
  • | -| nwm_file | `string` | Path to a text file containing nwm file names. One filename per line. To have nwm forcing file names generated automatically, leave this option out of the config or set it to "" | +| nwm_file | `string` | Path to a text file containing nwm file names. One filename per line. Set this only if forcing_type is set to 'from_file' | | runinput | `int` |
    1. short_range
    2. medium_range
    3. medium_range_no_da
    4. long_range
    5. analysis_assim
    6. analysis_assim_extend
    7. analysis_assim_extend_no_da
    8. analysis_assim_long
    9. analysis_assim_long_no_da
    10. analysis_assim_no_da
    11. short_range_no_da
    | | varinput | `int` |
    1. channel_rt
    2. land
    3. reservoir
    4. terrain_rt terrain
    5. forcing
    | | geoinput | `int` |
    1. conus
    2. hawaii
    3. puertorico
    | | meminput | `int` |
    1. mem_1
    2. mem_2
    3. mem_3
    4. mem_4
    5. mem_5
    6. mem_6
    7. mem_7
    | | urlbaseinput | `int` |
    1. Empty string: use local files
    2. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod/: for real-time operational data from NOAA
    3. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/post-processed/WMS/: for post-processed data from NOAA's Web Map Service
    4. https://storage.googleapis.com/national-water-model/: for input/output data stored on Google Cloud Storage
    5. https://storage.cloud.google.com/national-water-model/: for input/output data stored on Google Cloud Storage
    6. gs://national-water-model/: for input/output data stored on Google Cloud Storage
    7. https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/: for retrospective data from AWS S3
    8. s3://noaa-nwm-retrospective-2-1-pds/model_output/: for retrospective data from AWS S3
    | +| fcst_cycle | `list` | List of forecast cycles in UTC. If empty, will use all available cycles | +| lead_time | `list` | List of lead times in hours. If empty, will use all available lead times | ### hydrofab | Field Name | Data Type | Description | | --- | --- | --- | -| version | `string` | Current hydrofabric version | +| version | `string` | Desired hydrofabric data version | | vpu | `string` | Check here for map of VPUs https://noaa-owp.github.io/hydrofabric/articles/data_access.html | +| geopkg_file | `string` | Path to file containing catchment polygons. Must exist locally | +| catch_subset | `string` | catchment id of the form "cat-#". If provided, a subsetted geopackage will be created from vpu geopackage. NGen config files will be generated as well | +| weights_only | `bool` |
  • true: Generate weight file and exit.
  • false: Proceed with full script, generate forcing files
  • | + ### storage | Field Name | Data Type | Description | | --- | --- | --- | | bucket_type | `string` |
    1. "local" : write to local directory
    2. "S3" : output to AWS S3 bucket
    | -| bucket_name | `string` | If local, this is the name of the folder the data will be placed in. If S3, this is the name of S3 bucket, which must exist already. | +| bucket_name | `string` | If local, this is the name of the folder the data will be placed in. If S3, this is the name of S3 bucket, which must exist already | | file_prefix | `string` | If local, this is the relative path to the bucket_name folder. If S3, this is the relative path within the S3 bucket_name bucket to store files | | file_type | `string` |
    1. "csv" : write data as csv files/
    2. "parquet" : write data as parquet files
    | @@ -70,4 +82,5 @@ contents: | Field Name | Data Type | Description | | --- | --- | --- | | verbose | `bool` | Print raw forcing files | -| nthreads | `int` | Number of threads to use while downloading. | +| dl_threads | `int` | Number of threads to use while downloading. | +| proc_threads | `int` | Number of threads to use while processing data (either remotely or locally). | From f14faa68a95b94602ed3d3d5fea4878b99af05aa Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Fri, 12 May 2023 12:11:19 -0500 Subject: [PATCH 083/105] Retrospective file names --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 59 ++++++++++++++++----- ngen_forcing/user_input_ngen.json | 20 +++---- ngen_forcing/user_input_ngen.md | 4 +- 3 files changed, 60 insertions(+), 23 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index fe919c6..bee591c 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -27,9 +27,15 @@ sys.path.append(str(pkg_dir)) from listofnwmfilenames import create_file_list +retro_file = Path(pkg_dir,'listofnwmfilenamesretro.py') +ii_retro = False +if retro_file.exists(): + ii_retro = True + from listofnwmfilenamesretro import create_file_list_retro + pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "subsetting") sys.path.append(str(pkg_dir)) -from subset import subset_upstream +from subset import subset_upstream, subset_upstream_prerelease TEMPLATE_BLOB_NAME = ( "nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" @@ -437,6 +443,8 @@ def main(): # Extract configurations conf = json.load(open(args.infile)) forcing_type = conf["forcing"]["forcing_type"] + if not ii_retro and forcing_type == "retrospective": + raise NotImplementedError("Need listofnwmfilenamesretro for this!") ii_cache = conf["forcing"]["cache"] start_date = conf["forcing"].get("start_date",None) @@ -449,6 +457,8 @@ def main(): nwm_file = conf["forcing"].get("nwm_file",None) fcst_cycle = conf["forcing"].get("fcst_cycle",None) lead_time = conf["forcing"].get("lead_time",None) + data_type = conf["forcing"].get("data_type",None) + object_type = conf["forcing"].get("object_type",None) version = conf["hydrofab"].get('version','v1.2') vpu = conf["hydrofab"].get("vpu") @@ -540,7 +550,13 @@ def main(): # Generate geopackage through subsetting routine. This will generate ngen geojsons files if catchment_subset is not None: if ii_verbose: print(f'Subsetting catchment with id {catchment_subset} from {gpkg}') - subset_upstream(gpkg,catchment_subset) + if catchment_subset.find("release"): + try: + subset_upstream_prerelease(gpkg,catchment_subset) + except: + raise NotImplementedError(f"Need Tony's version of subset.py!") + else: + subset_upstream(gpkg,catchment_subset) # geojsons will be placed in working directory. 
Copy them to bucket if bucket_type == 'local': @@ -589,17 +605,34 @@ def main(): t0 = time.perf_counter() if not forcing_type == 'from_file': - nwm_forcing_files = create_file_list( - runinput, - varinput, - geoinput, - meminput, - start_date, - end_date, - fcst_cycle, - urlbaseinput, - lead_time - ) + if forcing_type == "operational_archive": + nwm_forcing_files = create_file_list( + runinput, + varinput, + geoinput, + meminput, + start_date, + end_date, + fcst_cycle, + urlbaseinput, + lead_time + ) + elif forcing_type == "retrospective": + nwm_forcing_files = create_file_list_retro( + runinput, + varinput, + geoinput, + meminput, + start_date + "0000", # Hack + end_date + "0000", # Hack + fcst_cycle, + urlbaseinput, + lead_time, + data_type, + object_type + ) + nwm_forcing_files = nwm_forcing_files[0] + else: nwm_forcing_files = [] with open(nwm_file, "r") as f: diff --git a/ngen_forcing/user_input_ngen.json b/ngen_forcing/user_input_ngen.json index 94f64df..613ea7d 100644 --- a/ngen_forcing/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -1,17 +1,19 @@ { "forcing" : { - "forcing_type" : "operational_archive", - "start_date" : "20220822", - "end_date" : "20220822", + "forcing_type" : "retrospective", + "start_date" : "19790201", + "end_date" : "19790202", "cache" : true, "nwm_file" : "", - "runinput" : 1, - "varinput" : 5, + "runinput" : 2, + "varinput" : 1, "geoinput" : 1, - "meminput" : 0, - "urlbaseinput" : 3, - "fcst_cycle" : [0], - "lead_time" : null + "meminput" : 1, + "urlbaseinput" : 6, + "fcst_cycle" : [12,18], + "lead_time" : [1, 2, 240], + "data_type" : [6], + "object_type" : 1 }, "hydrofab" : { diff --git a/ngen_forcing/user_input_ngen.md b/ngen_forcing/user_input_ngen.md index 6da6cab..9d7ac58 100644 --- a/ngen_forcing/user_input_ngen.md +++ b/ngen_forcing/user_input_ngen.md @@ -54,9 +54,11 @@ contents: | varinput | `int` |
    1. channel_rt
    2. land
    3. reservoir
    4. terrain_rt terrain
    5. forcing
    | | geoinput | `int` |
    1. conus
    2. hawaii
    3. puertorico
    | | meminput | `int` |
    1. mem_1
    2. mem_2
    3. mem_3
    4. mem_4
    5. mem_5
    6. mem_6
    7. mem_7
    | -| urlbaseinput | `int` |
    1. Empty string: use local files
    2. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod/: for real-time operational data from NOAA
    3. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/post-processed/WMS/: for post-processed data from NOAA's Web Map Service
    4. https://storage.googleapis.com/national-water-model/: for input/output data stored on Google Cloud Storage
    5. https://storage.cloud.google.com/national-water-model/: for input/output data stored on Google Cloud Storage
    6. gs://national-water-model/: for input/output data stored on Google Cloud Storage
    7. https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/: for retrospective data from AWS S3
    8. s3://noaa-nwm-retrospective-2-1-pds/model_output/: for retrospective data from AWS S3
    | +| urlbaseinput | `int` |
    1. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/prod/: for real-time operational data from NOAA
    2. https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/post-processed/WMS/: for post-processed data from NOAA's Web Map Service
    3. https://storage.googleapis.com/national-water-model/: for input/output data stored on Google Cloud Storage
    4. https://storage.cloud.google.com/national-water-model/: for input/output data stored on Google Cloud Storage
    5. gs://national-water-model/: for input/output data stored on Google Cloud Storage
    6. https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/: for retrospective data from AWS S3
    7. s3://noaa-nwm-retrospective-2-1-pds/model_output/: for retrospective data from AWS S3
    |
 | fcst_cycle | `list` | List of forecast cycles in UTC. If empty, will use all available cycles |
 | lead_time | `list` | List of lead times in hours. If empty, will use all available lead times |
+| data_type | `list` | Only required for retrospective
    1. CHRTOUT_DOMAIN1
    2. GWOUT_DOMAIN1
    3. LAKEOUT_DOMAIN1
    4. LDASOUT_DOMAIN1
    5. RTOUT_DOMAIN1
    6. LDASIN_DOMAIN1
    |
+| object_type | `list` or `int` | Only required for retrospective
    1. forcing
    2. model_output
    | ### hydrofab From 159d33cd6f21bd7dce055bb305511a4ed2929697 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Fri, 12 May 2023 13:41:25 -0500 Subject: [PATCH 084/105] Removed import shield --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index bee591c..7d6040a 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -28,10 +28,7 @@ from listofnwmfilenames import create_file_list retro_file = Path(pkg_dir,'listofnwmfilenamesretro.py') -ii_retro = False -if retro_file.exists(): - ii_retro = True - from listofnwmfilenamesretro import create_file_list_retro +from listofnwmfilenamesretro import create_file_list_retro pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "subsetting") sys.path.append(str(pkg_dir)) @@ -443,8 +440,6 @@ def main(): # Extract configurations conf = json.load(open(args.infile)) forcing_type = conf["forcing"]["forcing_type"] - if not ii_retro and forcing_type == "retrospective": - raise NotImplementedError("Need listofnwmfilenamesretro for this!") ii_cache = conf["forcing"]["cache"] start_date = conf["forcing"].get("start_date",None) From d813d7f0e196e15f0ab5590eac98bc688e37d2cb Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Wed, 17 May 2023 16:57:44 -0600 Subject: [PATCH 085/105] Shielded subset function --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 7d6040a..7aa8ca7 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -32,7 +32,9 @@ pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "subsetting") sys.path.append(str(pkg_dir)) -from subset import subset_upstream, subset_upstream_prerelease +from subset import subset_upstream +if os.path.exists(pkg_dir): + from subset import subset_upstream_prerelease TEMPLATE_BLOB_NAME = ( "nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" From 7b584e6c5add69df10b42b6aa5e03dc1aca016c9 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 22 May 2023 08:15:45 -0600 Subject: [PATCH 086/105] Removed unnecessary imports --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 7aa8ca7..8883953 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -19,8 +19,6 @@ import time import boto3 from io import BytesIO -import matplotlib.pyplot as plt -from mpl_toolkits.basemap import Basemap import concurrent.futures as cf pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "nwm_filenames") From 9fe69124f5b76d977cd86bdbeb5364bf1db45ff4 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 22 May 2023 08:17:50 -0600 Subject: [PATCH 087/105] Removed tony's subset import --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 8883953..00028cf 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -31,8 +31,6 @@ pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "subsetting") 
sys.path.append(str(pkg_dir)) from subset import subset_upstream -if os.path.exists(pkg_dir): - from subset import subset_upstream_prerelease TEMPLATE_BLOB_NAME = ( "nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc" From 3c79bdb2b596d976345f2b2e2411d76bd54d6411 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 22 May 2023 09:25:05 -0500 Subject: [PATCH 088/105] find bug fix --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 00028cf..61b1fa3 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -543,7 +543,7 @@ def main(): # Generate geopackage through subsetting routine. This will generate ngen geojsons files if catchment_subset is not None: if ii_verbose: print(f'Subsetting catchment with id {catchment_subset} from {gpkg}') - if catchment_subset.find("release"): + if catchment_subset.find("release") >= 0: try: subset_upstream_prerelease(gpkg,catchment_subset) except: From 82db7a3ed7b15d63c78550907386333b460dcd66 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Tue, 30 May 2023 13:23:04 -0500 Subject: [PATCH 089/105] pytests for listofnwmfilenames and prep data script --- .gitignore | 2 + ngen_forcing/prep_hydrofab_forcings_ngen.py | 31 ++-- tests/data/test_config.json | 39 +++++ tests/test_retro.py | 163 ++++++++++++++++++++ 4 files changed, 219 insertions(+), 16 deletions(-) create mode 100644 tests/data/test_config.json create mode 100644 tests/test_retro.py diff --git a/.gitignore b/.gitignore index 688d779..2a12cb2 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,5 @@ nwm_filenames/__pycache__/ subsetting/__pycache__/ ngen_forcing/__pycache__/ venv/ +tests/__pycache__/ +tests/data/* diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 61b1fa3..d86f403 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -10,7 +10,6 @@ import gc from pathlib import Path import geopandas as gpd -import pandas as pd import numpy as np import xarray as xr from google.cloud import storage @@ -308,7 +307,7 @@ def cmd(cmd): """ resp = os.system(cmd) if resp > 0: - raise Exception(f"\Threaded command failed! Tried: {cmd}\n") + raise Exception(f"\nThreaded command failed! Tried: {cmd}\n") def locate_dl_files_threaded( @@ -415,7 +414,7 @@ def threaded_data_extract(files,nthreads,ii_verbose,wgt_file,var_list): return data_list, t_ax_local -def main(): +def prep_ngen_data(conf): """ Primary function to retrieve forcing and hydrofabric data and convert it into files that can be ingested into ngen. @@ -428,15 +427,6 @@ def main(): t00 = time.perf_counter() - # Take in user config - parser = argparse.ArgumentParser() - parser.add_argument( - dest="infile", type=str, help="A json containing user inputs to run ngen" - ) - args = parser.parse_args() - - # Extract configurations - conf = json.load(open(args.infile)) forcing_type = conf["forcing"]["forcing_type"] ii_cache = conf["forcing"]["cache"] @@ -487,15 +477,15 @@ def main(): ), f"{bucket_type} for bucket_type is not accepted! 
Accepted: {bucket_types}" assert vpu is not None or geopkg_file is not None, "Need to input either vpu or geopkg_file" - # Set paths and make directories if needed - top_dir = Path(os.path.dirname(args.infile)).parent + # Set paths and make directories if needed if not os.path.exists(CACHE_DIR): os.system(f"mkdir {CACHE_DIR}") if not os.path.exists(CACHE_DIR): raise Exception(f"Creating {CACHE_DIR} failed!") - # Prep output directory + # Prep output directory if bucket_type == "local": + top_dir = Path(os.path.dirname(__file__)).parent bucket_path = Path(top_dir, file_prefix, bucket_name) forcing_path = Path(bucket_path, 'forcing') if not os.path.exists(bucket_path): @@ -758,4 +748,13 @@ def main(): if __name__ == "__main__": - main() + # Take in user config + parser = argparse.ArgumentParser() + parser.add_argument( + dest="infile", type=str, help="A json containing user inputs to run ngen" + ) + args = parser.parse_args() + + # Extract configurations + conf = json.load(open(args.infile)) + prep_ngen_data(conf) diff --git a/tests/data/test_config.json b/tests/data/test_config.json new file mode 100644 index 0000000..e331318 --- /dev/null +++ b/tests/data/test_config.json @@ -0,0 +1,39 @@ +{ + "forcing" : { + "forcing_type" : "retrospective", + "start_date" : "19790201", + "end_date" : "19790202", + "cache" : true, + "nwm_file" : "", + "runinput" : 2, + "varinput" : 1, + "geoinput" : 1, + "meminput" : 1, + "urlbaseinput" : 6, + "fcst_cycle" : [12,18], + "lead_time" : [1, 2, 240], + "data_type" : [6], + "object_type" : 1 + }, + + "hydrofab" : { + "version" : "v1.2", + "vpu" : "03W", + "catch_subset" : "cat-112977", + "weights_only" : false + }, + + "storage":{ + "bucket_type" : "local", + "bucket_name" : "ngen_inputs", + "file_prefix" : "tests/data", + "file_type" : "csv" + }, + + "run" : { + "verbose" : true, + "dl_threads" : 10, + "proc_threads" : 2 + } + +} diff --git a/tests/test_retro.py b/tests/test_retro.py new file mode 100644 index 0000000..8cffea1 --- /dev/null +++ b/tests/test_retro.py @@ -0,0 +1,163 @@ +import pytest +import os, sys, json +from pathlib import Path +pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "ngen_forcing") +sys.path.append(str(pkg_dir)) +from prep_hydrofab_forcings_ngen import prep_ngen_data + +pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "nwm_filenames") +sys.path.append(str(pkg_dir)) +from listofnwmfilenames import create_file_list +from listofnwmfilenamesretro import create_file_list_retro + +def test_filenames_operational_archive(): + runinput = 2 + varinput = 1 + geoinput = 1 + meminput = 1 + start_date = "20220822" + end_date = "20220824" + fcst_cycle = [12, 18] + urlbaseinput = None + lead_time = [1, 2, 240] + + nwm_forcing_files = create_file_list( + runinput, + varinput, + geoinput, + meminput, + start_date, + end_date, + fcst_cycle, + urlbaseinput, + lead_time + ) + + nwm_forcing_files_truth = [ + 'nwm.20220822/medium_range_mem1/nwm.t12z.medium_range.channel_rt_1.f001.conus.nc', + 'nwm.20220822/medium_range_mem1/nwm.t12z.medium_range.channel_rt_1.f002.conus.nc', + 'nwm.20220822/medium_range_mem1/nwm.t12z.medium_range.channel_rt_1.f240.conus.nc', + 'nwm.20220822/medium_range_mem1/nwm.t18z.medium_range.channel_rt_1.f001.conus.nc', + 'nwm.20220822/medium_range_mem1/nwm.t18z.medium_range.channel_rt_1.f002.conus.nc', + 'nwm.20220822/medium_range_mem1/nwm.t18z.medium_range.channel_rt_1.f240.conus.nc', + 'nwm.20220823/medium_range_mem1/nwm.t12z.medium_range.channel_rt_1.f001.conus.nc', + 
'nwm.20220823/medium_range_mem1/nwm.t12z.medium_range.channel_rt_1.f002.conus.nc', + 'nwm.20220823/medium_range_mem1/nwm.t12z.medium_range.channel_rt_1.f240.conus.nc', + 'nwm.20220823/medium_range_mem1/nwm.t18z.medium_range.channel_rt_1.f001.conus.nc', + 'nwm.20220823/medium_range_mem1/nwm.t18z.medium_range.channel_rt_1.f002.conus.nc', + 'nwm.20220823/medium_range_mem1/nwm.t18z.medium_range.channel_rt_1.f240.conus.nc', + 'nwm.20220824/medium_range_mem1/nwm.t12z.medium_range.channel_rt_1.f001.conus.nc', + 'nwm.20220824/medium_range_mem1/nwm.t12z.medium_range.channel_rt_1.f002.conus.nc', + 'nwm.20220824/medium_range_mem1/nwm.t12z.medium_range.channel_rt_1.f240.conus.nc', + 'nwm.20220824/medium_range_mem1/nwm.t18z.medium_range.channel_rt_1.f001.conus.nc', + 'nwm.20220824/medium_range_mem1/nwm.t18z.medium_range.channel_rt_1.f002.conus.nc', + 'nwm.20220824/medium_range_mem1/nwm.t18z.medium_range.channel_rt_1.f240.conus.nc' + ] + + for j,jf in enumerate(nwm_forcing_files_truth): + assert nwm_forcing_files[j] == jf + + +def test_filenames_retrospective(): + runinput = 2 + varinput = 1 + geoinput = 1 + meminput = 1 + start_date = "19790201" + end_date = "19790202" + fcst_cycle = [12, 18] + urlbaseinput = 6 + lead_time = [1, 2, 240] + data_type = [6] + object_type = 1 + + nwm_forcing_files = create_file_list_retro( + runinput, + varinput, + geoinput, + meminput, + start_date + "0000", # Hack + end_date + "0000", # Hack + fcst_cycle, + urlbaseinput, + lead_time, + data_type, + object_type + ) + + nwm_forcing_files = nwm_forcing_files[0] + + nwm_forcing_files_truth = [ + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902010000.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902010100.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902010200.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902010300.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902010400.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902010500.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902010600.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902010700.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902010800.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902010900.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902011000.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902011100.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902011200.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902011300.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902011400.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902011500.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902011600.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902011700.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902011800.LDASIN_DOMAIN1', + 
'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902011900.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902012000.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902012100.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902012200.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902012300.LDASIN_DOMAIN1', + 'https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/forcing/1979/197902020000.LDASIN_DOMAIN1' + ] + + for j,jf in enumerate(nwm_forcing_files_truth): + assert nwm_forcing_files[j] == jf + +def test_data_prep(): + """ + This tests the entire script. + #TODO: Break this into seperate test functions + """ + + conf = { + "forcing":{ + "forcing_type" : "retrospective", + "start_date" : "19790201", + "end_date" : "19790202", + "cache" : True, + "nwm_file" : "", + "runinput" : 2, + "varinput" : 1, + "geoinput" : 1, + "meminput" : 1, + "urlbaseinput" : 6, + "fcst_cycle" : [12,18], + "lead_time" : [1, 2, 240], + "data_type" : [6], + "object_type" : 1 + }, + "hydrofab":{ + "version" : "v1.2", + "vpu" : "03W", + "catch_subset" : "cat-112977", + "weights_only" : False + }, + "storage": { + "bucket_type" : "local", + "bucket_name" : "ngen_inputs", + "file_prefix" : "tests/data/", + "file_type" : "csv" + }, + "run":{ + "verbose" : True, + "dl_threads" : 10, + "proc_threads" : 2 + } + } + + prep_ngen_data(conf) From e465a4972c87691be42e698f72a7148e3512d663 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Wed, 31 May 2023 10:10:01 -0500 Subject: [PATCH 090/105] split the tests up --- tests/{test_retro.py => test_filenames.py} | 52 +--------------------- tests/test_prep_ngen.py | 49 ++++++++++++++++++++ 2 files changed, 51 insertions(+), 50 deletions(-) rename tests/{test_retro.py => test_filenames.py} (81%) create mode 100644 tests/test_prep_ngen.py diff --git a/tests/test_retro.py b/tests/test_filenames.py similarity index 81% rename from tests/test_retro.py rename to tests/test_filenames.py index 8cffea1..ea04260 100644 --- a/tests/test_retro.py +++ b/tests/test_filenames.py @@ -1,9 +1,5 @@ -import pytest -import os, sys, json +import os, sys from pathlib import Path -pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "ngen_forcing") -sys.path.append(str(pkg_dir)) -from prep_hydrofab_forcings_ngen import prep_ngen_data pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "nwm_filenames") sys.path.append(str(pkg_dir)) @@ -116,48 +112,4 @@ def test_filenames_retrospective(): ] for j,jf in enumerate(nwm_forcing_files_truth): - assert nwm_forcing_files[j] == jf - -def test_data_prep(): - """ - This tests the entire script. 
- #TODO: Break this into seperate test functions - """ - - conf = { - "forcing":{ - "forcing_type" : "retrospective", - "start_date" : "19790201", - "end_date" : "19790202", - "cache" : True, - "nwm_file" : "", - "runinput" : 2, - "varinput" : 1, - "geoinput" : 1, - "meminput" : 1, - "urlbaseinput" : 6, - "fcst_cycle" : [12,18], - "lead_time" : [1, 2, 240], - "data_type" : [6], - "object_type" : 1 - }, - "hydrofab":{ - "version" : "v1.2", - "vpu" : "03W", - "catch_subset" : "cat-112977", - "weights_only" : False - }, - "storage": { - "bucket_type" : "local", - "bucket_name" : "ngen_inputs", - "file_prefix" : "tests/data/", - "file_type" : "csv" - }, - "run":{ - "verbose" : True, - "dl_threads" : 10, - "proc_threads" : 2 - } - } - - prep_ngen_data(conf) + assert nwm_forcing_files[j] == jf \ No newline at end of file diff --git a/tests/test_prep_ngen.py b/tests/test_prep_ngen.py new file mode 100644 index 0000000..4c9f4ab --- /dev/null +++ b/tests/test_prep_ngen.py @@ -0,0 +1,49 @@ +import os, sys +from pathlib import Path +pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "ngen_forcing") +sys.path.append(str(pkg_dir)) +from prep_hydrofab_forcings_ngen import prep_ngen_data + +def test_data_prep(): + """ + This tests the entire script. + #TODO: Break this into separate test functions + """ + + conf = { + "forcing":{ + "forcing_type" : "retrospective", + "start_date" : "19790201", + "end_date" : "19790202", + "cache" : True, + "nwm_file" : "", + "runinput" : 2, + "varinput" : 1, + "geoinput" : 1, + "meminput" : 1, + "urlbaseinput" : 6, + "fcst_cycle" : [12,18], + "lead_time" : [1, 2, 240], + "data_type" : [6], + "object_type" : 1 + }, + "hydrofab":{ + "version" : "v1.2", + "vpu" : "03W", + "catch_subset" : "cat-112977", + "weights_only" : False + }, + "storage": { + "bucket_type" : "local", + "bucket_name" : "ngen_inputs", + "file_prefix" : "tests/data/", + "file_type" : "csv" + }, + "run":{ + "verbose" : True, + "dl_threads" : 10, + "proc_threads" : 2 + } + } + + prep_ngen_data(conf) From 4d61218e7ca8b6b5aed02c4b02ce601935e440a9 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 6 Jul 2023 10:08:31 -0600 Subject: [PATCH 091/105] Garbage collected and updated storage variables --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 190 +++++++++----------- 1 file changed, 80 insertions(+), 110 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index d86f403..a5fb087 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -1,7 +1,3 @@ -# TODO NOTE a lot of this code is borrowed from https://github.com/RTIInternational/hydro-evaluation -# In the future, import this package -# https://github.com/jameshalgren/data-access-examples/blob/DONOTMERGE_VPU16/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb - # !pip install --upgrade google-api-python-client # !pip install --upgrade google-cloud-storage @@ -42,60 +38,17 @@ PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-97.0],PARAMETER["standard_parallel_1",30.0],\ PARAMETER["standard_parallel_2",60.0],PARAMETER["latitude_of_origin",40.0],UNIT["Meter",1.0]]' -HI_NWM_WKT = 'PROJCS["Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]],\ -PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ -PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-157.42],PARAMETER["standard_parallel_1",10.0],\ 
-PARAMETER["standard_parallel_2",30.0],PARAMETER["latitude_of_origin",20.6],UNIT["Meter",1.0]]' - -PR_NWM_WKT = 'PROJCS["Sphere_Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]],\ -PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\ -PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-65.91],PARAMETER["standard_parallel_1",18.1],\ -PARAMETER["standard_parallel_2",18.1],PARAMETER["latitude_of_origin",18.1],UNIT["Meter",1.0]]' - -# paths - -# TODO Make CACHE_DIR configurable -CACHE_DIR = Path( - pkg_dir.parent, "data", "cache" -) # Maybe this should have a date attached to the name - -NWM_CACHE_DIR = os.path.join(CACHE_DIR, "nwm") -USGS_CACHE_DIR = os.path.join(CACHE_DIR, "usgs") -GEO_CACHE_DIR = os.path.join(CACHE_DIR, "geo") - -NWM_CACHE_H5 = os.path.join(NWM_CACHE_DIR, "gcp_client.h5") - -PARQUET_CACHE_DIR = os.path.join(CACHE_DIR, "parquet") -MEDIUM_RANGE_FORCING_PARQUET = os.path.join(PARQUET_CACHE_DIR, "forcing_medium_range") -FORCING_ANALYSIS_ASSIM_PARQUET = os.path.join( - PARQUET_CACHE_DIR, "forcing_analysis_assim" -) -MEDIUM_RANGE_PARQUET = os.path.join(PARQUET_CACHE_DIR, "medium_range") -USGS_PARQUET = os.path.join(PARQUET_CACHE_DIR, "usgs") - -HUC10_SHP_FILEPATH = os.path.join(GEO_CACHE_DIR, "wbdhu10_conus.shp") -HUC10_PARQUET_FILEPATH = os.path.join(GEO_CACHE_DIR, "wbdhu10_conus.parquet") -HUC10_MEDIUM_RANGE_WEIGHTS_FILEPATH = os.path.join( - GEO_CACHE_DIR, "wbdhu10_medium_range_weights.pkl" -) - -ROUTE_LINK_FILE = os.path.join(NWM_CACHE_DIR, "RouteLink_CONUS.nc") -ROUTE_LINK_PARQUET = os.path.join(NWM_CACHE_DIR, "route_link_conus.parquet") - - -def get_cache_dir(create: bool = True): - if not os.path.exists(NWM_CACHE_DIR) and create: - os.mkdir(NWM_CACHE_DIR) - if not os.path.exists(NWM_CACHE_DIR): +def get_cache_dir(nwm_cache_dir: str,create: bool = True): + if not os.path.exists(nwm_cache_dir) and create: + os.mkdir(nwm_cache_dir) + if not os.path.exists(nwm_cache_dir): raise NotADirectoryError - return NWM_CACHE_DIR - + return nwm_cache_dir def make_parent_dir(filepath): Path(filepath).parent.mkdir(parents=True, exist_ok=True) - -def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: +def get_dataset(nwm_cache_dir: str, blob_name: str, use_cache: bool = True) -> xr.Dataset: """Retrieve a blob from the data service as xarray.Dataset. Based largely on OWP HydroTools. Parameters @@ -117,7 +70,7 @@ def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: # kerchunk with a remote path and then asynchronously do a download to cache it # for next time. The hypothesis would be that the download speed will not be any slower than # just accessing the file remotely. - nc_filepath = os.path.join(get_cache_dir(), blob_name) + nc_filepath = os.path.join(get_cache_dir(nwm_cache_dir), blob_name) make_parent_dir(nc_filepath) # If the file exists and use_cache = True @@ -144,9 +97,6 @@ def get_dataset(blob_name: str, use_cache: bool = True) -> xr.Dataset: ) return ds - -# TODO: Import this instead! 
-# Adapted from https://github.com/RTIInternational/hydro-evaluation/blob/dev-denno-4-1/src/evaluation/loading/generate_weights.py def generate_weights_file( gdf: gpd.GeoDataFrame, src: xr.DataArray, @@ -158,9 +108,6 @@ def generate_weights_file( gdf_proj = gdf.to_crs(CONUS_NWM_WKT) crosswalk_dict = {} - # This is a probably a really poor performing way to do this - # TODO: Consider vectorizing -- would require digging into the - # other end of these where we unpack the weights... i = 0 for index, row in gdf_proj.iterrows(): geom_rasterize = rasterize( @@ -258,6 +205,7 @@ def get_forcing_timelist(wgt_file: str, filelist: list, var_list: list): eng = "rasterio" # switch engine for remote processing else: eng = "h5netcdf" + with xr.open_dataset(_nc_file, engine=eng) as _xds: shp = _xds["U2D"].shape dtp = _xds["U2D"].dtype @@ -311,7 +259,7 @@ def cmd(cmd): def locate_dl_files_threaded( - ii_cache: bool, ii_verbose: bool, forcing_file_names: list, nthreads: int + cache_dir: str, ii_cache: bool, ii_verbose: bool, forcing_file_names: list, nthreads: int ): """ Look for forcing files locally, if found, will apend to local file list for local processing @@ -335,7 +283,7 @@ def locate_dl_files_threaded( cmds = [] for jfile in forcing_file_names: file_parts = Path(jfile).parts - local_file = os.path.join(CACHE_DIR, file_parts[-1]) + local_file = os.path.join(cache_dir, file_parts[-1]) ii_dl = False # decide whether to use local file, download it, or index it remotely @@ -373,7 +321,7 @@ def locate_dl_files_threaded( # Download file if ii_verbose: print(f"Forcing file not found! Downloading {jfile}") - command = f"wget -P {CACHE_DIR} -c {jfile}" + command = f"wget -P {cache_dir} -c {jfile}" cmds.append(command) dl_files.append(jfile) local_files.append(local_file) @@ -449,10 +397,12 @@ def prep_ngen_data(conf): geopkg_file = conf["hydrofab"].get("geopkg_file") ii_weights_only = conf['hydrofab'].get('weights_only',False) - bucket_type = conf["storage"]["bucket_type"] - bucket_name = conf["storage"]["bucket_name"] - file_prefix = conf["storage"]["file_prefix"] - file_type = conf["storage"]["file_type"] + storage_type = conf["storage"]["type"] + output_bucket = conf["storage"]["output_bucket"] + output_bucket_path = conf["storage"]["output_bucket_path"] + cache_bucket = conf["storage"]["cache_bucket"] + cache_bucket_path = conf["storage"]["cache_bucket_path"] + output_file_type = conf["storage"]["output_file_type"] ii_verbose = conf["run"]["verbose"] dl_threads = conf["run"]["dl_threads"] @@ -469,44 +419,58 @@ def prep_ngen_data(conf): assert forcing_type in accepted, msg file_types = ["csv", "parquet"] assert ( - file_type in file_types - ), f"{file_type} for file_type is not accepted! Accepted: {file_types}" + output_file_type in file_types + ), f"{output_file_type} for output_file_type is not accepted! Accepted: {file_types}" bucket_types = ["local", "S3"] assert ( - bucket_type in bucket_types - ), f"{bucket_type} for bucket_type is not accepted! Accepted: {bucket_types}" + type in bucket_types + ), f"{storage_type} for storage_type is not accepted! 
Accepted: {bucket_types}" assert vpu is not None or geopkg_file is not None, "Need to input either vpu or geopkg_file" + + if storage_type == "local": - # Set paths and make directories if needed - if not os.path.exists(CACHE_DIR): - os.system(f"mkdir {CACHE_DIR}") - if not os.path.exists(CACHE_DIR): - raise Exception(f"Creating {CACHE_DIR} failed!") - - # Prep output directory - if bucket_type == "local": + # Prep output directory top_dir = Path(os.path.dirname(__file__)).parent - bucket_path = Path(top_dir, file_prefix, bucket_name) - forcing_path = Path(bucket_path, 'forcing') + bucket_path = Path(top_dir, output_bucket_path, output_bucket) + forcing_path = Path(bucket_path, 'forcing') if not os.path.exists(bucket_path): os.system(f"mkdir {bucket_path}") os.system(f"mkdir {forcing_path}") if not os.path.exists(bucket_path): raise Exception(f"Creating {bucket_path} failed!") - elif bucket_type == "S3": + + # Prep cache directory + cache_dir = Path(Path(os.path.dirname(__file__)).parent,cache_bucket_path) + nwm_cache_dir = os.path.join(cache_dir, "nwm") + if not os.path.exists(cache_dir): + os.system(f"mkdir {cache_dir}") + if not os.path.exists(cache_dir): + raise Exception(f"Creating {cache_dir} failed!") + + elif storage_type == "S3": s3 = boto3.client("s3") + # Prep cache directory + # TODO: test that the bucket exists + # cache_dir = Path(Path(os.path.dirname(__file__)).parent,cache_bucket_path) + # nwm_cache_dir = os.path.join(cache_dir, "nwm") + try: + bucket = s3.create_bucket(Bucket=cache_bucket) + # nwm bucket should be created when we store the nwm file + except: + raise Exception(f'Provided bucket {cache_bucket} does not exist and cannot be created!') + # Generate weight file only if one doesn't exist already if catchment_subset is not None: - wgt_file = os.path.join(CACHE_DIR, f"{catchment_subset}_upstream_weights.json") + wgt_file = os.path.join(cache_dir, f"{catchment_subset}_upstream_weights.json") else: - wgt_file = os.path.join(CACHE_DIR, f"{vpu}_weights.json") + wgt_file = os.path.join(cache_dir, f"{vpu}_weights.json") if not os.path.exists(wgt_file): # Use geopkg_file if given if geopkg_file is not None: gpkg = Path(Path(os.path.dirname(__file__)).parent,geopkg_file) - if not gpkg.exists: + if not gpkg.exists(): raise Exception(f"{gpkg} doesn't exist!!") elif catchment_subset is not None: @@ -515,18 +479,18 @@ def prep_ngen_data(conf): # Default to geopackage that matches the requested VPU else: gpkg = None - for jfile in os.listdir(CACHE_DIR): + for jfile in os.listdir(cache_dir): if jfile.find(f"nextgen_{vpu}.gpkg") >= 0: - gpkg = Path(CACHE_DIR, jfile) + gpkg = Path(cache_dir, jfile) if ii_verbose: print(f"Found and using geopackge file {gpkg}") if gpkg == None: url = f"https://nextgen-hydrofabric.s3.amazonaws.com/{version}/nextgen_{vpu}.gpkg" - command = f"wget -P {CACHE_DIR} -c {url}" + command = f"wget -P {cache_dir} -c {url}" t0 = time.perf_counter() cmd(command) dl_time += time.perf_counter() - t0 - gpkg = Path(CACHE_DIR, f"nextgen_{vpu}.gpkg") + gpkg = Path(cache_dir, f"nextgen_{vpu}.gpkg") if not os.path.exists(gpkg): @@ -542,19 +506,25 @@ def prep_ngen_data(conf): subset_upstream(gpkg,catchment_subset) # geojsons will be placed in working directory. 
Copy them to bucket - if bucket_type == 'local': + if storage_type == 'local': out_path = Path(bucket_path,'configs') if not os.path.exists(out_path): os.system(f'mkdir {out_path}') os.system(f"mv ./catchments.geojson ./nexus.geojson ./crosswalk.json ./flowpaths.geojson ./flowpath_edge_list.json {out_path}") else: - print(f'UNTESTED!!') - files = ["./catchments.geojson" "./nexus.geojson" "./crosswalk.json" "./flowpaths.geojson" "./flowpath_edge_list.json"] - buf = BytesIO() - for jfile in files: - s3.put_object( - Body=json.dumps(jfile), - Bucket={bucket_name} - ) + # Just don't worry about all the output files from subsetting for now + os.system(f"rm ./catchments.geojson ./nexus.geojson ./crosswalk.json ./flowpaths.geojson ./flowpath_edge_list.json") + # print(f'UNTESTED!!') + # files = ["./catchments.geojson", + # "./nexus.geojson", + # "./crosswalk.json", + # "./flowpaths.geojson", + # "./flowpath_edge_list.json"] + # buf = BytesIO() + # for jfile in files: + # s3.put_object( + # Body=json.dumps(jfile), + # Bucket={output_bucket} + # ) # TODO: Create Realization file # TODO: Validate configs @@ -564,7 +534,7 @@ def prep_ngen_data(conf): t0 = time.perf_counter() polygonfile = gpd.read_file(gpkg, layer="divides") - ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True) + ds = get_dataset(nwm_cache_dir,TEMPLATE_BLOB_NAME, use_cache=True) src = ds["RAINRATE"] if ii_verbose: @@ -631,7 +601,7 @@ def prep_ngen_data(conf): # This will look for local raw forcing files and download them if needed t0 = time.perf_counter() local_nwm_files, remote_nwm_files = locate_dl_files_threaded( - ii_cache, ii_verbose, nwm_forcing_files, dl_threads + cache_dir, ii_cache, ii_verbose, nwm_forcing_files, dl_threads ) dl_time += time.perf_counter() - t0 @@ -703,24 +673,24 @@ def prep_ngen_data(conf): df = dfs[jcatch] splt = jcatch.split("-") - if bucket_type == "local": - if file_type == "csv": + if storage_type == "local": + if output_file_type == "csv": csvname = Path(forcing_path, f"cat{vpu}_{splt[1]}.csv") df.to_csv(csvname, index=False) - if file_type == "parquet": + if output_file_type == "parquet": parq_file = Path(forcing_path, f"cat{vpu}_{splt[1]}.parquet") df.to_parquet(parq_file) - elif bucket_type == "S3": + elif storage_type == "S3": buf = BytesIO() - if file_type == "parquet": + if output_file_type == "parquet": parq_file = f"cat{vpu}_{splt[1]}.parquet" df.to_parquet(buf) - elif file_type == "csv": + elif output_file_type == "csv": csvname = f"cat{vpu}_{splt[1]}.csv" df.to_csv(buf, index=False) buf.seek(0) - key_name = f"{file_prefix}/forcing/{csvname}" - s3.put_object(Bucket=bucket_name, Key=key_name, Body=buf.getvalue()) + key_name = f"{output_bucket_path}/forcing/{csvname}" + s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue()) if (j + 1) % write_int == 0: print( @@ -736,10 +706,10 @@ def prep_ngen_data(conf): total_time = time.perf_counter() - t00 print(f"\n\n--------SUMMARY-------") - if bucket_type == "local": + if storage_type == "local": msg = f"\nData has been written locally to {bucket_path}" else: - msg = f"\nData has been written to S3 bucket {bucket_name} at {file_prefix}" + msg = f"\nData has been written to S3 bucket {output_bucket} at {output_bucket_path}" msg += f"\nCheck and DL data : {dl_time:.2f}s" msg += f"\nProcess data : {proc_time:.2f}s" msg += f"\nWrite data : {write_time:.2f}s" From b41bd2d4b73cad648c6f154a7ec6d2282094d187 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Tue, 11 Jul 2023 10:50:00 -0600 Subject: [PATCH 092/105] Initial lambda function files 
WIP --- Dockerfile | 45 +++++++ forcing_processor_lambda.tf | 136 ++++++++++++++++++++ jl_dev.tfvars | 8 ++ lambda_function.py | 24 ++++ ngen_forcing/prep_hydrofab_forcings_ngen.py | 95 ++++++++------ push_docker_ecr.sh | 83 ++++++++++++ requirements.txt | 50 +++++++ 7 files changed, 399 insertions(+), 42 deletions(-) create mode 100644 Dockerfile create mode 100644 forcing_processor_lambda.tf create mode 100644 jl_dev.tfvars create mode 100644 lambda_function.py create mode 100644 push_docker_ecr.sh create mode 100644 requirements.txt diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..40d0db4 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,45 @@ +# Define custom function directory +ARG FUNCTION_DIR="/function" + +FROM python:3.9 as build-image + +# Include global arg in this stage of the build +ARG FUNCTION_DIR + +# Copy function code +RUN mkdir -p ${FUNCTION_DIR} +RUN mkdir -p ${FUNCTION_DIR}/ngen_forcing +RUN mkdir -p ${FUNCTION_DIR}/subsetting +RUN mkdir -p ${FUNCTION_DIR}/nwm_filenames +COPY ./ngen_forcing ${FUNCTION_DIR}/ngen_forcing +COPY ./subsetting ${FUNCTION_DIR}/subsetting +COPY ./nwm_filenames ${FUNCTION_DIR}/nwm_filenames +COPY requirements.txt ${FUNCTION_DIR} +COPY lambda_function.py ${FUNCTION_DIR} + +# Install the function's dependencies +RUN pip install \ + --target ${FUNCTION_DIR} \ + awslambdaric + +# Use a slim version of the base Python image to reduce the final image size +FROM python:3.9-slim + +# Include global arg in this stage of the build +ARG FUNCTION_DIR + +# Set working directory to function root directory +WORKDIR ${FUNCTION_DIR} + +# Copy in the built dependencies +COPY --from=build-image ${FUNCTION_DIR} ${FUNCTION_DIR} + +RUN pip install -r "${FUNCTION_DIR}/requirements.txt" --target ${FUNCTION_DIR} +RUN pip install --upgrade google-api-python-client +RUN pip install --upgrade google-cloud-storage + +# Set runtime interface client as default command for the container runtime +ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ] + +# # Pass the name of the function handler as an argument to the runtime +CMD [ "lambda_function.handler" ] diff --git a/forcing_processor_lambda.tf b/forcing_processor_lambda.tf new file mode 100644 index 0000000..f57098e --- /dev/null +++ b/forcing_processor_lambda.tf @@ -0,0 +1,136 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 4.0" + } + } +} + +# Variable declarations +variable "region" { + type = string +} + +variable "trigger_bucket" { + type = string +} + +variable "ecr_repo" { + type = string +} + +variable "function_name" { + type = string +} + +variable "trigger_file_prefix" { + type = string +} + +variable "trigger_file_suffix" { + type = string +} + +variable "image_tag" { + type = string +} + +variable "memory_size" { + type = number +} + + +# Resources +provider "aws" { + region = var.region +} + +data "aws_ecr_repository" "image_repo" { + name = var.ecr_repo +} + +resource "aws_lambda_function" "forcing_processor_function" { + function_name = var.function_name + timeout = 900 # 900 is max + image_uri = "${data.aws_ecr_repository.image_repo.repository_url}:${var.image_tag}" + package_type = "Image" + + memory_size = var.memory_size + + + role = aws_iam_role.forcing_processor_function_role.arn + +} + +resource "aws_iam_role" "forcing_processor_function_role" { + name = "forcing-processor" + + assume_role_policy = jsonencode({ + Statement = [ + { + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "lambda.amazonaws.com" + } + 
}, + ] + }) +} + +resource "aws_s3_bucket" "trigger_bucket" { + bucket = var.trigger_bucket +} + +resource "aws_s3_bucket_notification" "bucket_notification" { + bucket = aws_s3_bucket.trigger_bucket.id + + lambda_function { + lambda_function_arn = aws_lambda_function.forcing_processor_function.arn + events = ["s3:ObjectCreated:*"] + filter_prefix = var.trigger_file_prefix + filter_suffix = var.trigger_file_suffix + } + + depends_on = [aws_lambda_permission.allow_bucket] +} + +resource "aws_lambda_permission" "allow_bucket" { + statement_id = "AllowExecutionFromS3Bucket" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.forcing_processor_function.arn + principal = "s3.amazonaws.com" + source_arn = aws_s3_bucket.trigger_bucket.arn +} + +resource "aws_cloudwatch_log_group" "function_log_group" { + name = "${aws_lambda_function.forcing_processor_function.function_name}" + retention_in_days = 7 + lifecycle { + prevent_destroy = false + } +} + +resource "aws_iam_policy" "function_logging_policy" { + name = "function-logging-policy" + policy = jsonencode({ + "Version" : "2012-10-17", + "Statement" : [ + { + Action : [ + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents" + ], + Effect : "Allow", + Resource : "arn:aws:logs:*:*:*" + } + ] + }) +} + +resource "aws_iam_role_policy_attachment" "function_logging_policy_attachment" { + role = aws_iam_role.forcing_processor_function_role.id + policy_arn = aws_iam_policy.function_logging_policy.arn +} diff --git a/jl_dev.tfvars b/jl_dev.tfvars new file mode 100644 index 0000000..834364a --- /dev/null +++ b/jl_dev.tfvars @@ -0,0 +1,8 @@ +region = "us-west-2" +trigger_bucket = "nwm.test" +ecr_repo = "nextgenforcing" +function_name = "forcingprocessor" +trigger_file_prefix = "" +trigger_file_suffix = "02.conus.nc.txt" +image_tag = "forcingprocessor" +memory_size = 512 \ No newline at end of file diff --git a/lambda_function.py b/lambda_function.py new file mode 100644 index 0000000..bef536f --- /dev/null +++ b/lambda_function.py @@ -0,0 +1,24 @@ +import sys, os, json +from pathlib import Path + +print(f'Importing my code...') +sys.path.append("/function/ngen_forcing") +print(help('modules')) +from prep_hydrofab_forcings_ngen import prep_ngen_data + +def handler(event, context): + + print(f'Made it to handler!') + + # load template config + conf = json.load(open('./ngen_forcing/ngen_forcings_lambda.json')) + + # get date from event + + # modify config + + # call function + prep_ngen_data(conf) + + print('TRY v9' + sys.version + '!') + return 'TRY v9' + sys.version + '!' 
diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index a5fb087..87f0231 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -3,7 +3,6 @@ import pandas as pd import argparse, os, json, sys -import gc from pathlib import Path import geopandas as gpd import numpy as np @@ -152,18 +151,23 @@ def get_blob(blob_name: str, bucket: str = NWM_BUCKET) -> bytes: bucket = client.bucket(bucket) return bucket.blob(blob_name).download_as_bytes(timeout=120) +def get_weights_dict(weights_file): + # Open weights dict from pickle + # The if statement is here to decide how to read the weight file based on local or bucket + if type(weights_file) is dict: + crosswalk_dict = json.loads(weights_file["Body"].read().decode()) + else: + with open(weights_file, "r") as f: + crosswalk_dict = json.load(f) + + return crosswalk_dict def calc_zonal_stats_weights_new( src: np.ndarray, - weights_filepath: str, + crosswalk_dict: dict, ) -> pd.DataFrame: """Calculates zonal stats""" - # Open weights dict from pickle - # This could probably be done once and passed as a reference. - with open(weights_filepath, "r") as f: - crosswalk_dict = json.load(f) - nvar = src.shape[0] mean_dict = {} for key, value in crosswalk_dict.items(): @@ -173,15 +177,10 @@ def calc_zonal_stats_weights_new( for key, value in crosswalk_dict.items(): mean_dict[key] = np.nanmean(src[:, value[0], value[1]], axis=1) - # This should not be needed, but without memory usage grows - del crosswalk_dict - del f - gc.collect() - return mean_dict -def get_forcing_timelist(wgt_file: str, filelist: list, var_list: list): +def get_forcing_timelist(crosswalk_dict: dict, filelist: list, var_list: list): """ General function to read either remote or local nwm forcing files. 
@@ -197,7 +196,6 @@ def get_forcing_timelist(wgt_file: str, filelist: list, var_list: list): """ - t1 = time.perf_counter() df_by_t = [] t = [] for _i, _nc_file in enumerate(filelist): @@ -212,7 +210,7 @@ def get_forcing_timelist(wgt_file: str, filelist: list, var_list: list): data_allvars = np.zeros(shape=(len(var_list), shp[1], shp[2]), dtype=dtp) for var_dx, jvar in enumerate(var_list): data_allvars[var_dx, :, :] = np.squeeze(_xds[jvar].values) - _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, wgt_file) + _df_zonal_stats = calc_zonal_stats_weights_new(data_allvars, crosswalk_dict) df_by_t.append(_df_zonal_stats) time_splt = _xds.attrs["model_output_valid_time"].split("_") t.append(time_splt[0] + " " + time_splt[1]) @@ -334,7 +332,7 @@ def locate_dl_files_threaded( return local_files, remote_files -def threaded_data_extract(files,nthreads,ii_verbose,wgt_file,var_list): +def threaded_data_extract(files,nthreads,ii_verbose,crosswalk_dict,var_list): """ Sets up the thread pool for get_forcing_timelist and returns the data and time axis ordered in time @@ -344,7 +342,7 @@ def threaded_data_extract(files,nthreads,ii_verbose,wgt_file,var_list): arg1 = [] arg2 = [] for i in range(len(files)): - arg0.append(wgt_file) + arg0.append(crosswalk_dict) arg1.append([files[i]]) arg2.append(var_list) @@ -397,7 +395,7 @@ def prep_ngen_data(conf): geopkg_file = conf["hydrofab"].get("geopkg_file") ii_weights_only = conf['hydrofab'].get('weights_only',False) - storage_type = conf["storage"]["type"] + storage_type = conf["storage"]["storage_type"] output_bucket = conf["storage"]["output_bucket"] output_bucket_path = conf["storage"]["output_bucket_path"] cache_bucket = conf["storage"]["cache_bucket"] @@ -423,10 +421,15 @@ def prep_ngen_data(conf): ), f"{output_file_type} for output_file_type is not accepted! Accepted: {file_types}" bucket_types = ["local", "S3"] assert ( - type in bucket_types + storage_type in bucket_types ), f"{storage_type} for storage_type is not accepted! 
Accepted: {bucket_types}" assert vpu is not None or geopkg_file is not None, "Need to input either vpu or geopkg_file" + if catchment_subset is not None: + vpu_or_subset = catchment_subset + "_upstream" + else: + vpu_or_subset = vpu + if storage_type == "local": # Prep output directory @@ -445,27 +448,29 @@ def prep_ngen_data(conf): if not os.path.exists(cache_dir): os.system(f"mkdir {cache_dir}") if not os.path.exists(cache_dir): - raise Exception(f"Creating {cache_dir} failed!") + raise Exception(f"Creating {cache_dir} failed!") - elif storage_type == "S3": - s3 = boto3.client("s3") + wgt_file = os.path.join(cache_dir, f"{vpu_or_subset}_weights.json") + ii_wgt_file = os.path.exists(wgt_file) - # Prep cache directory - # TODO: test that the bucket exists - # cache_dir = Path(Path(os.path.dirname(__file__)).parent,cache_bucket_path) - # nwm_cache_dir = os.path.join(cache_dir, "nwm") + elif storage_type == "S3": + with open(Path(Path(os.path.dirname(__file__)),"credentials")) as f: + creds = f.readlines() + s3 = boto3.client("s3", + aws_access_key_id=creds[1].split(' = ')[1][:-1], + aws_secret_access_key=creds[2].split(' = ')[1][:-1] + ) try: - bucket = s3.create_bucket(Bucket=cache_bucket) - # nwm bucket should be created when we store the nwm file - except: - raise Exception(f'Provided bucket {cache_bucket} does not exist and cannot be created!') + wgt_file = s3.get_object(Bucket=cache_bucket, Key=f"{vpu_or_subset}_weights.json") + ii_wgt_file = True + except : + ii_wgt_file = False + raise NotImplementedError(f'Need to implement weight file creation in bucket') # Generate weight file only if one doesn't exist already - if catchment_subset is not None: - wgt_file = os.path.join(cache_dir, f"{catchment_subset}_upstream_weights.json") - else: - wgt_file = os.path.join(cache_dir, f"{vpu}_weights.json") - if not os.path.exists(wgt_file): + # TODO: This will break hard if looking for the weight file in S3, + # this code block assumes the weight files are local + if not ii_wgt_file: # Use geopkg_file if given if geopkg_file is not None: @@ -545,6 +550,7 @@ def prep_ngen_data(conf): print(f"\nGenerating the weights took {time.perf_counter() - t1:.2f} s") proc_time += time.perf_counter() - t0 else: + crosswalk_dict = get_weights_dict(wgt_file) if ii_verbose: print( f"Not creating weight file! Delete this if you want to create a new one: {wgt_file}" @@ -599,11 +605,15 @@ def prep_ngen_data(conf): proc_time += time.perf_counter() - t0 # This will look for local raw forcing files and download them if needed - t0 = time.perf_counter() - local_nwm_files, remote_nwm_files = locate_dl_files_threaded( - cache_dir, ii_cache, ii_verbose, nwm_forcing_files, dl_threads - ) - dl_time += time.perf_counter() - t0 + if storage_type == 'local': + t0 = time.perf_counter() + local_nwm_files, remote_nwm_files = locate_dl_files_threaded( + cache_dir, ii_cache, ii_verbose, nwm_forcing_files, dl_threads + ) + dl_time += time.perf_counter() - t0 + else: + remote_nwm_files = nwm_forcing_files + local_nwm_files = [] var_list = [ "U2D", @@ -634,7 +644,7 @@ def prep_ngen_data(conf): print( f"Performing threaded remote data extraction with {proc_threads} workers..." 
) - remote_data_list, t_ax_remote = threaded_data_extract(remote_nwm_files,proc_threads,ii_verbose,wgt_file,var_list) + remote_data_list, t_ax_remote = threaded_data_extract(remote_nwm_files,proc_threads,ii_verbose,crosswalk_dict,var_list) # Index local files with threads if len(local_nwm_files) > 0: @@ -642,7 +652,7 @@ def prep_ngen_data(conf): print( f"Performing threaded local data extraction with {proc_threads} workers..." ) - local_data_list, t_ax_local = threaded_data_extract(local_nwm_files,proc_threads,ii_verbose,wgt_file,var_list) + local_data_list, t_ax_local = threaded_data_extract(local_nwm_files,proc_threads,ii_verbose,crosswalk_dict,var_list) # Sync in time between remote and local files complete_data_timelist = [] @@ -670,6 +680,7 @@ def prep_ngen_data(conf): nfiles = len(dfs) write_int = 1000 for j, jcatch in enumerate(dfs.keys()): + if j > 10: break # TODO: remove this break for actual deployment. Just don't want to get charged for uploads. df = dfs[jcatch] splt = jcatch.split("-") diff --git a/push_docker_ecr.sh b/push_docker_ecr.sh new file mode 100644 index 0000000..e40815c --- /dev/null +++ b/push_docker_ecr.sh @@ -0,0 +1,83 @@ +# Set default values +AWS_ACCT_ID="010036277732" +IMAGE_NAME="forcingprocessor" +REPO_NAME="nextgenforcing" +REGION="us-west-2" + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case "$1" in + -a|--aws-acct-id) + AWS_ACCT_ID="$2" + shift 2 + ;; + -i|--image-name) + IMAGE_NAME="$2" + shift 2 + ;; + -r|--repo-name) + REPO_NAME="$2" + shift 2 + ;; + --region) + REGION="$2" + shift 2 + ;; + *) + echo "Unknown argument: $1" + exit 1 + ;; + esac +done + +# Use the variables as needed +echo "AWS_ACCT_ID: $AWS_ACCT_ID" +echo "IMAGE_NAME: $IMAGE_NAME" +echo "REPO_NAME: $REPO_NAME" +echo "REGION: $REGION" + +# build docker image +echo "Building docker image ${REPO_NAME}" +docker build --no-cache -t ${IMAGE_NAME} . 
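# Example invocation (illustrative values that mirror the defaults set at the top
# of this script; the account id is a placeholder, not a real account):
#   bash push_docker_ecr.sh -a 123456789012 -i forcingprocessor -r nextgenforcing --region us-west-2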
+ +# Validate docker credentials +echo "Validating credentials" +aws ecr get-login-password --region ${REGION} | docker login --username AWS --password-stdin "${AWS_ACCT_ID}.dkr.ecr.us-west-2.amazonaws.com" + +# Create repo +echo "Checking image repo status" +repository_info=$(aws ecr describe-repositories --region ${REGION} --repository-names ${REPO_NAME} 2>/dev/null) +if [[ -z "${repository_info}" ]]; then + echo "${REPO_NAME} doesn't exist" + echo "Creating ${REPO_NAME}" + aws ecr create-repository \ + --region ${REGION} \ + --repository-name ${REPO_NAME} \ + --image-scanning-configuration scanOnPush=true + sleep 10 + +else + echo "${REPO_NAME} exists" +fi + +# Tag the image +echo "Tagging docker image as ${IMAGE_NAME}" +docker tag ${IMAGE_NAME} "${AWS_ACCT_ID}.dkr.ecr.${REGION}.amazonaws.com/${REPO_NAME}:${IMAGE_NAME}" + +# Push the image +echo "Pushing ${IMAGE_NAME} to ${AWS_ACCT_ID}.dkr.ecr.${REGION}.amazonaws.com/${REPO_NAME}" +docker push "${AWS_ACCT_ID}.dkr.ecr.${REGION}.amazonaws.com/${REPO_NAME}:${IMAGE_NAME}" + +# Update the function code +echo "Updating function code" +aws lambda update-function-code \ + --function-name forcingprocessor \ + --image-uri ${AWS_ACCT_ID}.dkr.ecr.${REGION}.amazonaws.com/${REPO_NAME}:${IMAGE_NAME} \ + --region ${REGION} + +# Test +echo "Testing deployment" +aws s3api put-object \ + --bucket nwm.test \ + --key 02.conus.nc.txt \ + --body ~/code/data_access_examples/data/cache/02.conus.nc.txt \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d3e9db8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,50 @@ +affine==2.4.0 +attrs==23.1.0 +boto3==1.27.1 +botocore==1.30.1 +cachetools==5.3.1 +certifi==2023.5.7 +cftime==1.6.2 +charset-normalizer==3.1.0 +click==8.1.3 +click-plugins==1.1.1 +cligj==0.7.2 +Fiona==1.9.4.post1 +google-api-core==2.11.1 +google-api-python-client==2.92.0 +google-auth==2.21.0 +google-auth-httplib2==0.1.0 +google-cloud-core==2.3.3 +google-cloud-storage==2.10.0 +google-crc32c==1.5.0 +google-resumable-media==2.5.0 +googleapis-common-protos==1.59.1 +httplib2==0.22.0 +idna==3.4 +importlib-metadata==6.7.0 +jmespath==1.0.1 +netCDF4==1.6.4 +numpy==1.25.0 +packaging==23.1 +pandas==2.0.3 +pathlib==1.0.1 +protobuf==4.23.3 +pyasn1==0.5.0 +pyasn1-modules==0.3.0 +pyparsing==3.1.0 +pyproj==3.6.0 +python-dateutil==2.8.2 +pytz==2023.3 +rasterio==1.3.8 +requests==2.31.0 +rioxarray==0.14.1 +rsa==4.9 +s3transfer==0.6.1 +shapely==2.0.1 +six==1.16.0 +snuggs==1.4.7 +tzdata==2023.3 +uritemplate==4.1.1 +urllib3==1.26.16 +xarray==2023.6.0 +zipp==3.15.0 From 267acafcf5d3611c6d94cb118faab6840c92d5a0 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 13 Jul 2023 18:57:18 -0600 Subject: [PATCH 093/105] implemented metadata outputs, lambda function works though still WIP --- Dockerfile | 10 +- ngen_forcing/prep_hydrofab_forcings_ngen.py | 390 ++++++++++++++------ requirements.txt | 84 +++-- 3 files changed, 350 insertions(+), 134 deletions(-) diff --git a/Dockerfile b/Dockerfile index 40d0db4..558dfce 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,7 +18,8 @@ COPY requirements.txt ${FUNCTION_DIR} COPY lambda_function.py ${FUNCTION_DIR} # Install the function's dependencies -RUN pip install \ +RUN pip3 install --upgrade pip +RUN pip3 install --no-cache-dir \ --target ${FUNCTION_DIR} \ awslambdaric @@ -34,12 +35,11 @@ WORKDIR ${FUNCTION_DIR} # Copy in the built dependencies COPY --from=build-image ${FUNCTION_DIR} ${FUNCTION_DIR} -RUN pip install -r "${FUNCTION_DIR}/requirements.txt" --target 
${FUNCTION_DIR} -RUN pip install --upgrade google-api-python-client -RUN pip install --upgrade google-cloud-storage +RUN pip3 install --upgrade pip +RUN pip3 install --no-cache-dir -r "${FUNCTION_DIR}/requirements.txt" # Set runtime interface client as default command for the container runtime ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ] -# # Pass the name of the function handler as an argument to the runtime +# Pass the name of the function handler as an argument to the runtime CMD [ "lambda_function.handler" ] diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 87f0231..e5f63ae 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -3,6 +3,8 @@ import pandas as pd import argparse, os, json, sys +import fsspec +import s3fs from pathlib import Path import geopandas as gpd import numpy as np @@ -12,8 +14,11 @@ from rasterio.features import rasterize import time import boto3 -from io import BytesIO +from io import BytesIO, TextIOWrapper import concurrent.futures as cf +import git +import gzip +from datetime import datetime pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "nwm_filenames") sys.path.append(str(pkg_dir)) @@ -180,7 +185,7 @@ def calc_zonal_stats_weights_new( return mean_dict -def get_forcing_timelist(crosswalk_dict: dict, filelist: list, var_list: list): +def get_forcing_timelist(crosswalk_dict: dict, filelist: list, var_list: list, cache_dir=None, s3=None): """ General function to read either remote or local nwm forcing files. @@ -195,6 +200,7 @@ def get_forcing_timelist(crosswalk_dict: dict, filelist: list, var_list: list): t : model_output_valid_time for each """ + fs = fsspec.filesystem('s3', anon=True) df_by_t = [] t = [] @@ -202,9 +208,16 @@ def get_forcing_timelist(crosswalk_dict: dict, filelist: list, var_list: list): if _nc_file[:5] == "https": eng = "rasterio" # switch engine for remote processing else: - eng = "h5netcdf" + pass + eng = "h5netcdf" - with xr.open_dataset(_nc_file, engine=eng) as _xds: + # Create an S3 filesystem object + # Open the NetCDF file using xarray's open_dataset function + _nc_file = 's3://' + cache_dir + '/' + _nc_file.split('/')[-1] + s3_file_obj = s3.open(_nc_file, mode='rb') + + # with xr.open_dataset(_nc_file, engine=eng) as _xds: + with xr.open_dataset(s3_file_obj, engine=eng) as _xds: shp = _xds["U2D"].shape dtp = _xds["U2D"].dtype data_allvars = np.zeros(shape=(len(var_list), shp[1], shp[2]), dtype=dtp) @@ -218,7 +231,7 @@ def get_forcing_timelist(crosswalk_dict: dict, filelist: list, var_list: list): return df_by_t, t -def time2catchment(data_list, time_list, var_list_out): +def time2catchment(data_list, time_list, var_list_out, ii_collect_stats): """ Convert a list of catchment dictionaries into a single dictionary of dataframes for each catchment @@ -230,8 +243,11 @@ def time2catchment(data_list, time_list, var_list_out): dfs : a dictionary of catchment based dataframes """ + ii_collect_stats = True dfs = {} + stats_avg = [] + stats_median = [] for jcat in list(data_list[0].keys()): data_catch = [] for jt in range(len(data_list)): @@ -240,7 +256,18 @@ def time2catchment(data_list, time_list, var_list_out): dfs[jcat]["time"] = time_list dfs[jcat] = dfs[jcat][["time"] + var_list_out] - return dfs + if ii_collect_stats: + stacked = np.stack(data_catch) + data_avg = np.average(stacked,axis=0) + data_median = np.median(stacked,axis=0) + stats_avg.append([jcat] + list(data_avg)) + stats_median.append([jcat] + 
list(data_median)) + + if ii_collect_stats: + avg_df = pd.DataFrame(stats_avg,columns=['catchment'] + var_list_out) + median_df = pd.DataFrame(stats_median,columns=['catchment'] + var_list_out) + + return dfs, avg_df, median_df def cmd(cmd): @@ -257,7 +284,7 @@ def cmd(cmd): def locate_dl_files_threaded( - cache_dir: str, ii_cache: bool, ii_verbose: bool, forcing_file_names: list, nthreads: int + cache_dir: str, ii_cache: bool, ii_verbose: bool, forcing_file_names: list, nthreads: int, s3=None ): """ Look for forcing files locally, if found, will apend to local file list for local processing @@ -280,49 +307,59 @@ def locate_dl_files_threaded( dl_files = [] cmds = [] for jfile in forcing_file_names: + file_parts = Path(jfile).parts local_file = os.path.join(cache_dir, file_parts[-1]) ii_dl = False - # decide whether to use local file, download it, or index it remotely - if os.path.exists(local_file): - # Check to make sure file is not broken + if s3 is not None: try: - with xr.open_dataset(local_file, engine="h5netcdf") as _xds: - pass - if ii_cache: - if ii_verbose: - print(f"Found and using local raw forcing file {local_file}") - else: - if ii_verbose: - print( - f"CACHE OPTION OVERRIDE : Found and using local raw forcing file {local_file}" - ) - local_files.append(local_file) + # if this succeeds, file has been found in bucket + s3_obj = s3.get_object(Bucket=cache_dir,Key=local_file.split('/')[1]) + local_files.append(jfile) except: - if ii_cache: - if ii_verbose: - print(f"{local_file} is broken! Will Download") - ii_dl = True - else: - if ii_verbose: - print(f"{local_file} is broken! Will index remotely") - remote_files.append(jfile) - - elif not os.path.exists(local_file) and not ii_cache: - # If file is not found locally, and we don't want to cache it, append to remote file list - remote_files.append(jfile) - elif not os.path.exists(local_file) and ii_cache: - ii_dl = True + raise Exception(f'{local_file} not found in {cache_dir}!!') + else: - if ii_dl: - # Download file - if ii_verbose: - print(f"Forcing file not found! Downloading {jfile}") - command = f"wget -P {cache_dir} -c {jfile}" - cmds.append(command) - dl_files.append(jfile) - local_files.append(local_file) + # decide whether to use local file, download it, or index it remotely + if os.path.exists(local_file): + # Check to make sure file is not broken + try: + with xr.open_dataset(local_file, engine="h5netcdf") as _xds: + pass + if ii_cache: + if ii_verbose: + print(f"Found and using local raw forcing file {local_file}") + else: + if ii_verbose: + print( + f"CACHE OPTION OVERRIDE : Found and using local raw forcing file {local_file}" + ) + local_files.append(local_file) + except: + if ii_cache: + if ii_verbose: + print(f"{local_file} is broken! Will Download") + ii_dl = True + else: + if ii_verbose: + print(f"{local_file} is broken! Will index remotely") + remote_files.append(jfile) + + elif not os.path.exists(local_file) and not ii_cache: + # If file is not found locally, and we don't want to cache it, append to remote file list + remote_files.append(jfile) + elif not os.path.exists(local_file) and ii_cache: + ii_dl = True + + if ii_dl: + # Download file + if ii_verbose: + print(f"Forcing file not found! 
Downloading {jfile}") + command = f"wget -P {cache_dir} -c {jfile}" + cmds.append(command) + dl_files.append(jfile) + local_files.append(local_file) # Get files with pool if len(cmds) > 0: @@ -332,32 +369,40 @@ def locate_dl_files_threaded( return local_files, remote_files -def threaded_data_extract(files,nthreads,ii_verbose,crosswalk_dict,var_list): +# TODO: Clean up script by implementing read/write file functions +def write_file( + filename : str, + filepath : str, + storage_type : str, + output_file_type : str, + s3 = None, + bucket = None, + df = pd.DataFrame +): """ - Sets up the thread pool for get_forcing_timelist and returns the data and time axis ordered in time - + filename : name of the file to be written + filepath : if storage_type is local, this is the local path to output folder """ - pool = cf.ThreadPoolExecutor(max_workers=nthreads) - arg0 = [] - arg1 = [] - arg2 = [] - for i in range(len(files)): - arg0.append(crosswalk_dict) - arg1.append([files[i]]) - arg2.append(var_list) - - results = pool.map(get_forcing_timelist, arg0, arg1, arg2) - - data_list = [] - for jres in results: - data_list.append(jres) - - # Build time axis - t_ax_local = [] - for i in range(len(files)): - t_ax_local.append(data_list[i][1]) - - return data_list, t_ax_local + if storage_type == "local": + forcing_path = filepath + if output_file_type == "csv": + csvname = Path(forcing_path, f"{filename}.csv") + df.to_csv(filename, index=False) + if output_file_type == "parquet": + parq_file = Path(forcing_path, f"{filename}.parquet") + df.to_parquet(filename) + elif storage_type == "S3": + output_bucket_path = filepath + buf = BytesIO() + if output_file_type == "parquet": + parq_file = f"{filename}.parquet" + df.to_parquet(buf) + elif output_file_type == "csv": + csvname = f"{filename}.csv" + df.to_csv(buf, index=False) + buf.seek(0) + key_name = f"{output_bucket_path}/forcing/{csvname}" + s3.put_object(Bucket=bucket, Key=key_name, Body=buf.getvalue()) def prep_ngen_data(conf): @@ -371,6 +416,8 @@ def prep_ngen_data(conf): Will store files in the same folder as the JSON config to run this script """ + datentime = datetime.utcnow().strftime("%m%d%y_%H%M%S") + t00 = time.perf_counter() forcing_type = conf["forcing"]["forcing_type"] @@ -384,6 +431,7 @@ def prep_ngen_data(conf): meminput = conf["forcing"].get("meminput",None) urlbaseinput = conf["forcing"].get("urlbaseinput",None) nwm_file = conf["forcing"].get("nwm_file",None) + path_override = conf["forcing"].get("path_override",None) fcst_cycle = conf["forcing"].get("fcst_cycle",None) lead_time = conf["forcing"].get("lead_time",None) data_type = conf["forcing"].get("data_type",None) @@ -404,7 +452,7 @@ def prep_ngen_data(conf): ii_verbose = conf["run"]["verbose"] dl_threads = conf["run"]["dl_threads"] - proc_threads = conf["run"]["proc_threads"] + ii_collect_stats = conf["run"].get("collect_stats",True) print(f"\nWelcome to Preparing Data for NextGen-Based Simulations!\n") @@ -435,10 +483,12 @@ def prep_ngen_data(conf): # Prep output directory top_dir = Path(os.path.dirname(__file__)).parent bucket_path = Path(top_dir, output_bucket_path, output_bucket) - forcing_path = Path(bucket_path, 'forcing') + forcing_path = Path(bucket_path, 'forcing') + meta_path = Path(forcing_path, 'metadata') if not os.path.exists(bucket_path): os.system(f"mkdir {bucket_path}") os.system(f"mkdir {forcing_path}") + os.system(f"mkdir {meta_path}") if not os.path.exists(bucket_path): raise Exception(f"Creating {bucket_path} failed!") @@ -454,18 +504,40 @@ def prep_ngen_data(conf): 
ii_wgt_file = os.path.exists(wgt_file) elif storage_type == "S3": - with open(Path(Path(os.path.dirname(__file__)),"credentials")) as f: + cache_dir = cache_bucket + + + + # TODO: Authenticate with Vault + print(f'SECURITY VULNERABILITY: CREDENTIALS IN IMAGE') + with open(os.path.join(os.getcwd(),"ngen_forcing/credentials")) as f: creds = f.readlines() + + key_id = creds[1].split(' = ')[1][:-1] + key = creds[2].split(' = ')[1][:-1] + + fs_s3 = s3fs.S3FileSystem(anon=False, + key=key_id, + secret=key) + s3 = boto3.client("s3", - aws_access_key_id=creds[1].split(' = ')[1][:-1], - aws_secret_access_key=creds[2].split(' = ')[1][:-1] + aws_access_key_id=key_id, + aws_secret_access_key=key ) + try: wgt_file = s3.get_object(Bucket=cache_bucket, Key=f"{vpu_or_subset}_weights.json") ii_wgt_file = True except : ii_wgt_file = False raise NotImplementedError(f'Need to implement weight file creation in bucket') + + # Save config to metadata + s3.put_object( + Body=json.dumps(conf), + Bucket=output_bucket, + Key=f"{output_bucket_path}/forcing/metadata/{datentime}/conf.json" + ) # Generate weight file only if one doesn't exist already # TODO: This will break hard if looking for the weight file in S3, @@ -563,7 +635,6 @@ def prep_ngen_data(conf): # Get nwm forcing file names t0 = time.perf_counter() if not forcing_type == 'from_file': - if forcing_type == "operational_archive": nwm_forcing_files = create_file_list( runinput, @@ -592,6 +663,13 @@ def prep_ngen_data(conf): ) nwm_forcing_files = nwm_forcing_files[0] + if path_override is not None: + print(f'Over riding path with {path_override}') + files = [] + for jfile in nwm_forcing_files: + files.append(path_override + jfile.split('/')[-1]) + nwm_forcing_files = files + else: nwm_forcing_files = [] with open(nwm_file, "r") as f: @@ -605,15 +683,11 @@ def prep_ngen_data(conf): proc_time += time.perf_counter() - t0 # This will look for local raw forcing files and download them if needed - if storage_type == 'local': - t0 = time.perf_counter() - local_nwm_files, remote_nwm_files = locate_dl_files_threaded( - cache_dir, ii_cache, ii_verbose, nwm_forcing_files, dl_threads - ) - dl_time += time.perf_counter() - t0 - else: - remote_nwm_files = nwm_forcing_files - local_nwm_files = [] + t0 = time.perf_counter() + local_nwm_files, remote_nwm_files = locate_dl_files_threaded( + cache_dir, ii_cache, ii_verbose, nwm_forcing_files, dl_threads, s3 + ) + dl_time += time.perf_counter() - t0 var_list = [ "U2D", @@ -638,21 +712,11 @@ def prep_ngen_data(conf): ] t0 = time.perf_counter() - # Index remote files with threads - if len(remote_nwm_files) > 0: - if ii_verbose: - print( - f"Performing threaded remote data extraction with {proc_threads} workers..." - ) - remote_data_list, t_ax_remote = threaded_data_extract(remote_nwm_files,proc_threads,ii_verbose,crosswalk_dict,var_list) - # Index local files with threads - if len(local_nwm_files) > 0: - if ii_verbose: - print( - f"Performing threaded local data extraction with {proc_threads} workers..." - ) - local_data_list, t_ax_local = threaded_data_extract(local_nwm_files,proc_threads,ii_verbose,crosswalk_dict,var_list) + + # AWS does not support thread pools so this will have to be unthreaded... 
+ if len(remote_nwm_files) > 0: remote_data_list, t_ax_remote = get_forcing_timelist(crosswalk_dict,remote_nwm_files,var_list,cache_dir,fs_s3) + if len(local_nwm_files) > 0: local_data_list, t_ax_local = get_forcing_timelist(crosswalk_dict,local_nwm_files,var_list,cache_dir,fs_s3) # Sync in time between remote and local files complete_data_timelist = [] @@ -661,26 +725,27 @@ def prep_ngen_data(conf): filename = Path(ifile).parts[-1] for j, jfile in enumerate(local_nwm_files): if jfile.find(filename) >= 0: - complete_data_timelist.append(local_data_list[j][0][0]) + complete_data_timelist.append(local_data_list[j]) timelist.append(t_ax_local[j]) for j, jfile in enumerate(remote_nwm_files): if jfile.find(filename) >= 0: - complete_data_timelist.append(remote_data_list[j][0][0]) + complete_data_timelist.append(remote_data_list[j]) timelist.append(t_ax_remote[j]) # Convert time-synced list of catchment dictionaries # to catchment based dataframes if ii_verbose: print(f"Reformatting data into dataframes...") - dfs = time2catchment(complete_data_timelist, timelist, var_list_out) + dfs, stats_avg, stats_median = time2catchment(complete_data_timelist, timelist, var_list_out, ii_collect_stats) proc_time += time.perf_counter() - t0 # Write to file t0 = time.perf_counter() nfiles = len(dfs) write_int = 1000 + catch_lim = 10 for j, jcatch in enumerate(dfs.keys()): - if j > 10: break # TODO: remove this break for actual deployment. Just don't want to get charged for uploads. + if j > catch_lim: break # TODO: remove this break for actual deployment. Just don't want to get charged for uploads. df = dfs[jcatch] splt = jcatch.split("-") @@ -701,7 +766,7 @@ def prep_ngen_data(conf): df.to_csv(buf, index=False) buf.seek(0) key_name = f"{output_bucket_path}/forcing/{csvname}" - s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue()) + s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue()) if (j + 1) % write_int == 0: print( @@ -713,21 +778,136 @@ def prep_ngen_data(conf): f"{j+1} files written out of {len(dfs)}, {(j+1)/len(dfs)*100:.2f}%", end="\r", ) - write_time = time.perf_counter() - t0 + if storage_type == "S3": buf.close() + write_time = time.perf_counter() - t0 + total_time = time.perf_counter() - t00 + # Metadata + if ii_collect_stats: + t000 = time.perf_counter() + print(f'\nSaving metadata...') + # Write out a csv with script runtime stats + nwm_file_sizes = [] + for jfile in nwm_forcing_files: + response = s3.head_object( + Bucket=cache_bucket, + Key=jfile.split('/')[-1] + ) + nwm_file_sizes.append(response['ContentLength']) + nwm_file_size_avg = np.average(nwm_file_sizes) + nwm_file_size_med = np.median(nwm_file_sizes) + nwm_file_size_std = np.std(nwm_file_sizes) + + catchment_sizes = [] + zipped_sizes = [] + for j, jcatch in enumerate(dfs.keys()): + if j > catch_lim: break # TODO: remove this break for actual deployment. Just don't want to get charged for uploads. 
+ + # Check forcing size + splt = jcatch.split("-") + csvname = f"cat{vpu}_{splt[1]}.csv" + key_name = f"{output_bucket_path}/forcing/{csvname}" + response = s3.head_object( + Bucket=output_bucket, + Key=key_name + ) + catchment_sizes.append(response['ContentLength']) + + # zip + zipname = csvname[:-4] + '.zip' + buf = BytesIO() + buf.seek(0) + df = dfs[jcatch] + with gzip.GzipFile(mode='w', fileobj=buf) as zipped_file: + df.to_csv(TextIOWrapper(zipped_file, 'utf8'), index=False) + key_name = f"{output_bucket_path}/forcing/metadata/{datentime}/zipped_forcing/{zipname}" + s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue()) + buf.close() + + # Check zipped size + response = s3.head_object( + Bucket=output_bucket, + Key=key_name + ) + zipped_sizes.append(response['ContentLength']) + + catch_file_size_avg = np.average(catchment_sizes) + catch_file_size_med = np.median(catchment_sizes) + catch_file_size_std = np.std(catchment_sizes) + + catch_file_zip_size_avg = np.average(zipped_sizes) + catch_file_zip_size_med = np.median(zipped_sizes) + catch_file_zip_size_std = np.std(zipped_sizes) + + metadata_script = { + "runtime_s" : [round(total_time,2)], + "nvars_intput" : [len(var_list)], + "nvars_output" : [len(var_list_out)], + "nwmfiles_input" : [len(nwm_forcing_files)], + "nwm_file_size_avg" : [nwm_file_size_avg], + "nwm_file_size_med" : [nwm_file_size_med], + "nwm_file_size_std" : [nwm_file_size_std], + "catch_files_output" : [nfiles], + "catch_file_size_avg" : [catch_file_size_avg], + "catch_file_size_med" : [catch_file_size_med], + "catch_file_size_std" : [catch_file_size_std], + "catch_file_zip_size_avg" : [catch_file_zip_size_avg], + "catch_file_zip_size_med" : [catch_file_zip_size_med], + "catch_file_zip_size_std" : [catch_file_zip_size_std], + } + + # TODO: if we pull this script from a repo, include the hash + if False: + repo = git.Repo(search_parent_directories=True) + sha = repo.head.object.hexsha + metadata_script["commit"] = sha + + # Save input config file and script commit + metadata_df = pd.DataFrame.from_dict(metadata_script) + if storage_type == 'S3': + buf = BytesIO() + + csvname = f"metadata_{vpu_or_subset}_{datentime}.csv" + metadata_df.to_csv(buf, index=False) + buf.seek(0) + key_name = f"{output_bucket_path}/forcing/metadata/{datentime}/{csvname}" + s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue()) + + # Save catchment based forcing stats + csvname = f"catchments_avg.csv" + stats_avg.to_csv(buf, index=False) + buf.seek(0) + key_name = f"{output_bucket_path}/forcing/metadata/{datentime}/{csvname}" + s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue()) + + csvname = f"catchments_median.csv" + stats_median.to_csv(buf, index=False) + buf.seek(0) + key_name = f"{output_bucket_path}/forcing/metadata/{datentime}/{csvname}" + s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue()) + + buf.close() + + else: + # TODO + raise NotImplementedError + + meta_time = time.perf_counter() - t000 + print(f"\n\n--------SUMMARY-------") if storage_type == "local": msg = f"\nData has been written locally to {bucket_path}" else: - msg = f"\nData has been written to S3 bucket {output_bucket} at {output_bucket_path}" + msg = f"\nData has been written to S3 bucket {output_bucket} at /{output_bucket_path}/forcing" msg += f"\nCheck and DL data : {dl_time:.2f}s" msg += f"\nProcess data : {proc_time:.2f}s" msg += f"\nWrite data : {write_time:.2f}s" msg += f"\nTotal time : {total_time:.2f}s\n" + if ii_collect_stats: msg += 
f"\nCollect stats : {meta_time:.2f}s" + print(msg) - - + if __name__ == "__main__": # Take in user config parser = argparse.ArgumentParser() diff --git a/requirements.txt b/requirements.txt index d3e9db8..5bddbc9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,50 +1,86 @@ affine==2.4.0 +asciitree==0.3.3 attrs==23.1.0 -boto3==1.27.1 -botocore==1.30.1 -cachetools==5.3.1 +beautifulsoup4==4.12.2 +bokeh==3.2.0 +boto3==1.28.3 +botocore==1.31.3 +Bottleneck==1.3.7 certifi==2023.5.7 cftime==1.6.2 -charset-normalizer==3.1.0 -click==8.1.3 +charset-normalizer==3.2.0 +click==8.1.5 click-plugins==1.1.1 cligj==0.7.2 +cloudpickle==2.2.1 +contourpy==1.1.0 +cycler==0.11.0 +dask==2023.7.0 +distributed==2023.7.0 +docopt==0.6.2 +entrypoints==0.4 +fasteners==0.18 Fiona==1.9.4.post1 -google-api-core==2.11.1 -google-api-python-client==2.92.0 -google-auth==2.21.0 -google-auth-httplib2==0.1.0 -google-cloud-core==2.3.3 -google-cloud-storage==2.10.0 -google-crc32c==1.5.0 -google-resumable-media==2.5.0 -googleapis-common-protos==1.59.1 -httplib2==0.22.0 +flox==0.7.2 +fonttools==4.41.0 +fsspec==2023.6.0 +geopandas==0.13.2 +h5netcdf==1.2.0 +h5py==3.9.0 idna==3.4 -importlib-metadata==6.7.0 +importlib-metadata==6.8.0 +importlib-resources==6.0.0 +Jinja2==3.1.2 jmespath==1.0.1 +kiwisolver==1.4.4 +llvmlite==0.40.1 +locket==1.0.0 +lz4==4.3.2 +MarkupSafe==2.1.3 +matplotlib==3.7.2 +msgpack==1.0.5 +nc-time-axis==1.4.1 netCDF4==1.6.4 -numpy==1.25.0 +numba==0.57.1 +numbagg==0.2.2 +numcodecs==0.11.0 +numpy==1.24.4 +numpy-groupies==0.9.22 packaging==23.1 pandas==2.0.3 +partd==1.4.0 pathlib==1.0.1 -protobuf==4.23.3 -pyasn1==0.5.0 -pyasn1-modules==0.3.0 -pyparsing==3.1.0 +Pillow==10.0.0 +platformdirs==3.8.1 +pooch==1.7.0 +psutil==5.9.5 +pyarrow==12.0.1 +pydap==3.4.1 +pyparsing==3.0.9 pyproj==3.6.0 python-dateutil==2.8.2 pytz==2023.3 +PyYAML==6.0 rasterio==1.3.8 requests==2.31.0 rioxarray==0.14.1 -rsa==4.9 +s3fs==0.4.2 s3transfer==0.6.1 +scipy==1.11.1 +seaborn==0.12.2 shapely==2.0.1 six==1.16.0 snuggs==1.4.7 +sortedcontainers==2.4.0 +soupsieve==2.4.1 +tblib==2.0.0 +toolz==0.12.0 +tornado==6.3.2 tzdata==2023.3 -uritemplate==4.1.1 urllib3==1.26.16 +WebOb==1.8.7 xarray==2023.6.0 -zipp==3.15.0 +xyzservices==2023.7.0 +zarr==2.15.0 +zict==3.0.0 +zipp==3.16.1 From 9fe5d3d56a0a84da8c767eecfb2de8c212b27fe0 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Fri, 14 Jul 2023 12:46:46 -0600 Subject: [PATCH 094/105] Added security manager --- .gitignore | 9 +- forcing_processor_lambda.tf | 28 ++- jl_dev.tfvars | 2 +- lambda_function.py | 21 +-- ngen_forcing/prep_hydrofab_forcings_ngen.py | 179 +++++++++++--------- ngen_forcing/user_input_ngen.json | 6 +- 6 files changed, 141 insertions(+), 104 deletions(-) diff --git a/.gitignore b/.gitignore index 2a12cb2..5a5fb11 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,3 @@ -.vscode/ data/* -nwm_filenames/__pycache__/ -subsetting/__pycache__/ -ngen_forcing/__pycache__/ -venv/ -tests/__pycache__/ -tests/data/* +venv9/ +tests/data/ngen_inputs/forcing/* diff --git a/forcing_processor_lambda.tf b/forcing_processor_lambda.tf index f57098e..8d5a577 100644 --- a/forcing_processor_lambda.tf +++ b/forcing_processor_lambda.tf @@ -40,8 +40,6 @@ variable "memory_size" { type = number } - -# Resources provider "aws" { region = var.region } @@ -50,6 +48,7 @@ data "aws_ecr_repository" "image_repo" { name = var.ecr_repo } +# Create function and set role resource "aws_lambda_function" "forcing_processor_function" { function_name = var.function_name timeout = 900 # 900 is max @@ -79,6 +78,7 @@ resource 
"aws_iam_role" "forcing_processor_function_role" { }) } +# Set up the trigger resource "aws_s3_bucket" "trigger_bucket" { bucket = var.trigger_bucket } @@ -104,6 +104,7 @@ resource "aws_lambda_permission" "allow_bucket" { source_arn = aws_s3_bucket.trigger_bucket.arn } +# Logging resource "aws_cloudwatch_log_group" "function_log_group" { name = "${aws_lambda_function.forcing_processor_function.function_name}" retention_in_days = 7 @@ -134,3 +135,26 @@ resource "aws_iam_role_policy_attachment" "function_logging_policy_attachment" { role = aws_iam_role.forcing_processor_function_role.id policy_arn = aws_iam_policy.function_logging_policy.arn } + +# Add secret access to lambda function +resource "aws_iam_policy" "secrets_manager_policy" { + name = "secrets_manager_access_policy" + description = "Allows access to Secrets Manager" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Sid = "AllowSecretsManagerAccess" + Effect = "Allow" + Action = ["secretsmanager:GetSecretValue"] + Resource = "*" + } + ] + }) +} + +resource "aws_iam_role_policy_attachment" "secrets_manager_attachment" { + role = aws_iam_role.forcing_processor_function_role.name + policy_arn = aws_iam_policy.secrets_manager_policy.arn +} diff --git a/jl_dev.tfvars b/jl_dev.tfvars index 834364a..8b98917 100644 --- a/jl_dev.tfvars +++ b/jl_dev.tfvars @@ -5,4 +5,4 @@ function_name = "forcingprocessor" trigger_file_prefix = "" trigger_file_suffix = "02.conus.nc.txt" image_tag = "forcingprocessor" -memory_size = 512 \ No newline at end of file +memory_size = 4096 \ No newline at end of file diff --git a/lambda_function.py b/lambda_function.py index bef536f..419d63f 100644 --- a/lambda_function.py +++ b/lambda_function.py @@ -1,24 +1,15 @@ -import sys, os, json -from pathlib import Path - -print(f'Importing my code...') -sys.path.append("/function/ngen_forcing") -print(help('modules')) -from prep_hydrofab_forcings_ngen import prep_ngen_data +import sys, json +# from aws_lambda_powertools.utilities import parameters def handler(event, context): - print(f'Made it to handler!') - # load template config - conf = json.load(open('./ngen_forcing/ngen_forcings_lambda.json')) + conf = json.load(open('/function/ngen_forcing/ngen_forcings_lambda.json')) # get date from event - # modify config - # call function - prep_ngen_data(conf) + from ngen_forcing import prep_hydrofab_forcings_ngen + prep_hydrofab_forcings_ngen.prep_ngen_data(conf) - print('TRY v9' + sys.version + '!') - return 'TRY v9' + sys.version + '!' + return 'Done!' 
diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index e5f63ae..dacbccb 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -3,20 +3,18 @@ import pandas as pd import argparse, os, json, sys -import fsspec import s3fs from pathlib import Path import geopandas as gpd import numpy as np import xarray as xr -from google.cloud import storage from rasterio.io import MemoryFile from rasterio.features import rasterize import time import boto3 +from botocore.exceptions import ClientError from io import BytesIO, TextIOWrapper import concurrent.futures as cf -import git import gzip from datetime import datetime @@ -52,54 +50,54 @@ def get_cache_dir(nwm_cache_dir: str,create: bool = True): def make_parent_dir(filepath): Path(filepath).parent.mkdir(parents=True, exist_ok=True) -def get_dataset(nwm_cache_dir: str, blob_name: str, use_cache: bool = True) -> xr.Dataset: - """Retrieve a blob from the data service as xarray.Dataset. - Based largely on OWP HydroTools. - Parameters - ---------- - blob_name: str, required - Name of blob to retrieve. - use_cache: bool, default True - If cache should be used. - If True, checks to see if file is in cache, and - If fetched from remote, will save to cache. - Returns - ------- - ds : xarray.Dataset - The data stored in the blob. - """ - # TODO: Check to see if this does any better than kerchunk - # the caching should help, but probably needs to be managed to function asynchronously. - # Perhaps if theget_dataset files is not cached, we can create the dataset from - # kerchunk with a remote path and then asynchronously do a download to cache it - # for next time. The hypothesis would be that the download speed will not be any slower than - # just accessing the file remotely. - nc_filepath = os.path.join(get_cache_dir(nwm_cache_dir), blob_name) - make_parent_dir(nc_filepath) - - # If the file exists and use_cache = True - if os.path.exists(nc_filepath) and use_cache: - # Get dataset from cache - ds = xr.load_dataset( - nc_filepath, - engine="h5netcdf", - ) - return ds - else: - # Get raw bytes - raw_bytes = get_blob(blob_name) - # Create Dataset - ds = xr.load_dataset( - MemoryFile(raw_bytes), - engine="h5netcdf", - ) - if use_cache: - # Subset and cache - ds["RAINRATE"].to_netcdf( - nc_filepath, - engine="h5netcdf", - ) - return ds +# def get_dataset(nwm_cache_dir: str, blob_name: str, use_cache: bool = True) -> xr.Dataset: +# """Retrieve a blob from the data service as xarray.Dataset. +# Based largely on OWP HydroTools. +# Parameters +# ---------- +# blob_name: str, required +# Name of blob to retrieve. +# use_cache: bool, default True +# If cache should be used. +# If True, checks to see if file is in cache, and +# If fetched from remote, will save to cache. +# Returns +# ------- +# ds : xarray.Dataset +# The data stored in the blob. +# """ +# # TODO: Check to see if this does any better than kerchunk +# # the caching should help, but probably needs to be managed to function asynchronously. +# # Perhaps if theget_dataset files is not cached, we can create the dataset from +# # kerchunk with a remote path and then asynchronously do a download to cache it +# # for next time. The hypothesis would be that the download speed will not be any slower than +# # just accessing the file remotely. 
+# nc_filepath = os.path.join(get_cache_dir(nwm_cache_dir), blob_name) +# make_parent_dir(nc_filepath) + +# # If the file exists and use_cache = True +# if os.path.exists(nc_filepath) and use_cache: +# # Get dataset from cache +# ds = xr.load_dataset( +# nc_filepath, +# engine="h5netcdf", +# ) +# return ds +# else: +# # Get raw bytes +# raw_bytes = get_blob(blob_name) +# # Create Dataset +# ds = xr.load_dataset( +# MemoryFile(raw_bytes), +# engine="h5netcdf", +# ) +# if use_cache: +# # Subset and cache +# ds["RAINRATE"].to_netcdf( +# nc_filepath, +# engine="h5netcdf", +# ) +# return ds def generate_weights_file( gdf: gpd.GeoDataFrame, @@ -139,22 +137,22 @@ def generate_weights_file( f.write(weights_json) -def get_blob(blob_name: str, bucket: str = NWM_BUCKET) -> bytes: - """Retrieve a blob from the data service as bytes. - Based largely on OWP HydroTools. - Parameters - ---------- - blob_name : str, required - Name of blob to retrieve. - Returns - ------- - data : bytes - The data stored in the blob. - """ - # Setup anonymous client and retrieve blob data - client = storage.Client.create_anonymous_client() - bucket = client.bucket(bucket) - return bucket.blob(blob_name).download_as_bytes(timeout=120) +# def get_blob(blob_name: str, bucket: str = NWM_BUCKET) -> bytes: +# """Retrieve a blob from the data service as bytes. +# Based largely on OWP HydroTools. +# Parameters +# ---------- +# blob_name : str, required +# Name of blob to retrieve. +# Returns +# ------- +# data : bytes +# The data stored in the blob. +# """ +# # Setup anonymous client and retrieve blob data +# client = storage.Client.create_anonymous_client() +# bucket = client.bucket(bucket) +# return bucket.blob(blob_name).download_as_bytes(timeout=120) def get_weights_dict(weights_file): # Open weights dict from pickle @@ -200,7 +198,6 @@ def get_forcing_timelist(crosswalk_dict: dict, filelist: list, var_list: list, c t : model_output_valid_time for each """ - fs = fsspec.filesystem('s3', anon=True) df_by_t = [] t = [] @@ -369,6 +366,37 @@ def locate_dl_files_threaded( return local_files, remote_files + +def get_secret( + secret_name : str, + region_name : str, + +): + + # Create a Secrets Manager client + session = boto3.session.Session() + client = session.client( + service_name='secretsmanager', + region_name=region_name + ) + + try: + get_secret_value_response = client.get_secret_value( + SecretId=secret_name + ) + except ClientError as e: + # For a list of exceptions thrown, see + # https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html + raise e + + # Decrypts secret using the associated KMS key. + secret = get_secret_value_response['SecretString'] + + # Your code goes here. 
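    # The secret string retrieved above is expected to be JSON carrying
    # "aws_access_key_id" and "aws_secret_access_key"; prep_ngen_data below
    # reads exactly those two keys.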
+ + return json.loads(secret) + + # TODO: Clean up script by implementing read/write file functions def write_file( filename : str, @@ -453,6 +481,8 @@ def prep_ngen_data(conf): ii_verbose = conf["run"]["verbose"] dl_threads = conf["run"]["dl_threads"] ii_collect_stats = conf["run"].get("collect_stats",True) + secret_name = conf["run"]["secret_name"] + region_name = conf["run"]["region_name"] print(f"\nWelcome to Preparing Data for NextGen-Based Simulations!\n") @@ -506,15 +536,10 @@ def prep_ngen_data(conf): elif storage_type == "S3": cache_dir = cache_bucket - - - # TODO: Authenticate with Vault - print(f'SECURITY VULNERABILITY: CREDENTIALS IN IMAGE') - with open(os.path.join(os.getcwd(),"ngen_forcing/credentials")) as f: - creds = f.readlines() + secret = get_secret(secret_name,region_name) - key_id = creds[1].split(' = ')[1][:-1] - key = creds[2].split(' = ')[1][:-1] + key_id = secret['aws_access_key_id'] + key = secret['aws_secret_access_key'] fs_s3 = s3fs.S3FileSystem(anon=False, key=key_id, @@ -684,6 +709,7 @@ def prep_ngen_data(conf): # This will look for local raw forcing files and download them if needed t0 = time.perf_counter() + if ii_verbose: print(f'Locating nwm input forcing files...') local_nwm_files, remote_nwm_files = locate_dl_files_threaded( cache_dir, ii_cache, ii_verbose, nwm_forcing_files, dl_threads, s3 ) @@ -715,6 +741,7 @@ def prep_ngen_data(conf): # AWS does not support thread pools so this will have to be unthreaded... + if ii_verbose: print(f'Extracting data...') if len(remote_nwm_files) > 0: remote_data_list, t_ax_remote = get_forcing_timelist(crosswalk_dict,remote_nwm_files,var_list,cache_dir,fs_s3) if len(local_nwm_files) > 0: local_data_list, t_ax_local = get_forcing_timelist(crosswalk_dict,local_nwm_files,var_list,cache_dir,fs_s3) diff --git a/ngen_forcing/user_input_ngen.json b/ngen_forcing/user_input_ngen.json index 613ea7d..f9fd91f 100644 --- a/ngen_forcing/user_input_ngen.json +++ b/ngen_forcing/user_input_ngen.json @@ -1,8 +1,8 @@ { "forcing" : { - "forcing_type" : "retrospective", - "start_date" : "19790201", - "end_date" : "19790202", + "forcing_type" : "operational_archive", + "start_date" : "20220822", + "end_date" : "20220822", "cache" : true, "nwm_file" : "", "runinput" : 2, From 66adf2ea4feb37cc320a866f3a231f56799428fd Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Mon, 24 Jul 2023 13:00:44 -0600 Subject: [PATCH 095/105] Added lambda function versioning --- forcing_processor_lambda.tf | 17 ++++++----------- jl_dev.tfvars | 1 + 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/forcing_processor_lambda.tf b/forcing_processor_lambda.tf index 8d5a577..ddbb6f7 100644 --- a/forcing_processor_lambda.tf +++ b/forcing_processor_lambda.tf @@ -8,6 +8,10 @@ terraform { } # Variable declarations +variable "deployment_version" { + type = string +} + variable "region" { type = string } @@ -50,9 +54,9 @@ data "aws_ecr_repository" "image_repo" { # Create function and set role resource "aws_lambda_function" "forcing_processor_function" { - function_name = var.function_name + function_name = "${var.function_name}_${var.deployment_version}" timeout = 900 # 900 is max - image_uri = "${data.aws_ecr_repository.image_repo.repository_url}:${var.image_tag}" + image_uri = "${data.aws_ecr_repository.image_repo.repository_url}:${var.image_tag}_${deployment_version}" package_type = "Image" memory_size = var.memory_size @@ -104,15 +108,6 @@ resource "aws_lambda_permission" "allow_bucket" { source_arn = aws_s3_bucket.trigger_bucket.arn } -# Logging -resource 
"aws_cloudwatch_log_group" "function_log_group" { - name = "${aws_lambda_function.forcing_processor_function.function_name}" - retention_in_days = 7 - lifecycle { - prevent_destroy = false - } -} - resource "aws_iam_policy" "function_logging_policy" { name = "function-logging-policy" policy = jsonencode({ diff --git a/jl_dev.tfvars b/jl_dev.tfvars index 8b98917..94293cb 100644 --- a/jl_dev.tfvars +++ b/jl_dev.tfvars @@ -1,3 +1,4 @@ +deployment_version = "0.2" region = "us-west-2" trigger_bucket = "nwm.test" ecr_repo = "nextgenforcing" From 6ba50db5a9a1c4ae8eb043e64fbb76b8637818f3 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Wed, 16 Aug 2023 13:06:57 -0600 Subject: [PATCH 096/105] Implemented hashing in metadata --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 220 +++++++------------- 1 file changed, 73 insertions(+), 147 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index dacbccb..39befc9 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -17,6 +17,7 @@ import concurrent.futures as cf import gzip from datetime import datetime +import hashlib pkg_dir = Path(Path(os.path.dirname(__file__)).parent, "nwm_filenames") sys.path.append(str(pkg_dir)) @@ -50,55 +51,6 @@ def get_cache_dir(nwm_cache_dir: str,create: bool = True): def make_parent_dir(filepath): Path(filepath).parent.mkdir(parents=True, exist_ok=True) -# def get_dataset(nwm_cache_dir: str, blob_name: str, use_cache: bool = True) -> xr.Dataset: -# """Retrieve a blob from the data service as xarray.Dataset. -# Based largely on OWP HydroTools. -# Parameters -# ---------- -# blob_name: str, required -# Name of blob to retrieve. -# use_cache: bool, default True -# If cache should be used. -# If True, checks to see if file is in cache, and -# If fetched from remote, will save to cache. -# Returns -# ------- -# ds : xarray.Dataset -# The data stored in the blob. -# """ -# # TODO: Check to see if this does any better than kerchunk -# # the caching should help, but probably needs to be managed to function asynchronously. -# # Perhaps if theget_dataset files is not cached, we can create the dataset from -# # kerchunk with a remote path and then asynchronously do a download to cache it -# # for next time. The hypothesis would be that the download speed will not be any slower than -# # just accessing the file remotely. -# nc_filepath = os.path.join(get_cache_dir(nwm_cache_dir), blob_name) -# make_parent_dir(nc_filepath) - -# # If the file exists and use_cache = True -# if os.path.exists(nc_filepath) and use_cache: -# # Get dataset from cache -# ds = xr.load_dataset( -# nc_filepath, -# engine="h5netcdf", -# ) -# return ds -# else: -# # Get raw bytes -# raw_bytes = get_blob(blob_name) -# # Create Dataset -# ds = xr.load_dataset( -# MemoryFile(raw_bytes), -# engine="h5netcdf", -# ) -# if use_cache: -# # Subset and cache -# ds["RAINRATE"].to_netcdf( -# nc_filepath, -# engine="h5netcdf", -# ) -# return ds - def generate_weights_file( gdf: gpd.GeoDataFrame, src: xr.DataArray, @@ -136,24 +88,6 @@ def generate_weights_file( with open(weights_filepath, "w") as f: f.write(weights_json) - -# def get_blob(blob_name: str, bucket: str = NWM_BUCKET) -> bytes: -# """Retrieve a blob from the data service as bytes. -# Based largely on OWP HydroTools. -# Parameters -# ---------- -# blob_name : str, required -# Name of blob to retrieve. -# Returns -# ------- -# data : bytes -# The data stored in the blob. 
-# """ -# # Setup anonymous client and retrieve blob data -# client = storage.Client.create_anonymous_client() -# bucket = client.bucket(bucket) -# return bucket.blob(blob_name).download_as_bytes(timeout=120) - def get_weights_dict(weights_file): # Open weights dict from pickle # The if statement is here to decide how to read the weight file based on local or bucket @@ -396,43 +330,6 @@ def get_secret( return json.loads(secret) - -# TODO: Clean up script by implementing read/write file functions -def write_file( - filename : str, - filepath : str, - storage_type : str, - output_file_type : str, - s3 = None, - bucket = None, - df = pd.DataFrame -): - """ - filename : name of the file to be written - filepath : if storage_type is local, this is the local path to output folder - """ - if storage_type == "local": - forcing_path = filepath - if output_file_type == "csv": - csvname = Path(forcing_path, f"{filename}.csv") - df.to_csv(filename, index=False) - if output_file_type == "parquet": - parq_file = Path(forcing_path, f"{filename}.parquet") - df.to_parquet(filename) - elif storage_type == "S3": - output_bucket_path = filepath - buf = BytesIO() - if output_file_type == "parquet": - parq_file = f"{filename}.parquet" - df.to_parquet(buf) - elif output_file_type == "csv": - csvname = f"{filename}.csv" - df.to_csv(buf, index=False) - buf.seek(0) - key_name = f"{output_bucket_path}/forcing/{csvname}" - s3.put_object(Bucket=bucket, Key=key_name, Body=buf.getvalue()) - - def prep_ngen_data(conf): """ Primary function to retrieve forcing and hydrofabric data and convert it into files that can be ingested into ngen. @@ -508,6 +405,9 @@ def prep_ngen_data(conf): else: vpu_or_subset = vpu + if output_bucket_path == "": + output_bucket_path = start_date + if storage_type == "local": # Prep output directory @@ -715,6 +615,9 @@ def prep_ngen_data(conf): ) dl_time += time.perf_counter() - t0 + # HACK + local_nwm_files = local_nwm_files[:3] + var_list = [ "U2D", "V2D", @@ -739,7 +642,6 @@ def prep_ngen_data(conf): t0 = time.perf_counter() - # AWS does not support thread pools so this will have to be unthreaded... if ii_verbose: print(f'Extracting data...') if len(remote_nwm_files) > 0: remote_data_list, t_ax_remote = get_forcing_timelist(crosswalk_dict,remote_nwm_files,var_list,cache_dir,fs_s3) @@ -766,30 +668,37 @@ def prep_ngen_data(conf): dfs, stats_avg, stats_median = time2catchment(complete_data_timelist, timelist, var_list_out, ii_collect_stats) proc_time += time.perf_counter() - t0 - # Write to file + # Write forcings to file t0 = time.perf_counter() nfiles = len(dfs) write_int = 1000 catch_lim = 10 + forcing_cat_ids = [] + forcing_hashes = [] for j, jcatch in enumerate(dfs.keys()): if j > catch_lim: break # TODO: remove this break for actual deployment. Just don't want to get charged for uploads. 
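        # Hashing scheme used below: each catchment's forcing dataframe is fingerprinted
        # as sha256 of its JSON serialization; a root hash over the concatenated
        # per-catchment hashes is written out with the rest of the metadata.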
df = dfs[jcatch] - splt = jcatch.split("-") + cat_id = jcatch.split("-")[1] + + forcing_cat_ids.append(cat_id) + + sha256_hash = hashlib.sha256(df.to_json().encode()).hexdigest() + forcing_hashes.append(sha256_hash) if storage_type == "local": if output_file_type == "csv": - csvname = Path(forcing_path, f"cat{vpu}_{splt[1]}.csv") + csvname = Path(forcing_path, f"cat{vpu}_{cat_id}.csv") df.to_csv(csvname, index=False) if output_file_type == "parquet": - parq_file = Path(forcing_path, f"cat{vpu}_{splt[1]}.parquet") + parq_file = Path(forcing_path, f"cat{vpu}_{cat_id}.parquet") df.to_parquet(parq_file) elif storage_type == "S3": buf = BytesIO() if output_file_type == "parquet": - parq_file = f"cat{vpu}_{splt[1]}.parquet" + parq_file = f"cat{vpu}_{cat_id}.parquet" df.to_parquet(buf) elif output_file_type == "csv": - csvname = f"cat{vpu}_{splt[1]}.csv" + csvname = f"cat{vpu}_{cat_id}.csv" df.to_csv(buf, index=False) buf.seek(0) key_name = f"{output_bucket_path}/forcing/{csvname}" @@ -802,7 +711,7 @@ def prep_ngen_data(conf): ) if j == nfiles - 1: print( - f"{j+1} files written out of {len(dfs)}, {(j+1)/len(dfs)*100:.2f}%", + f"{j+1} files written out of {len(dfs)}, {(j+1)/len(dfs)*100:.2f}%\n", end="\r", ) if storage_type == "S3": buf.close() @@ -812,8 +721,21 @@ def prep_ngen_data(conf): # Metadata if ii_collect_stats: + + # Forcing hashes + full_str = '' + for x in forcing_hashes: full_str = full_str + x + root_hash = hashlib.sha256(full_str.encode()).hexdigest() + + hash_dict = {"root_hash":root_hash, + "cat_ids":forcing_cat_ids, + "hash":forcing_hashes, + } + + hash_df = pd.DataFrame(dict([(key, pd.Series(value)) for key, value in hash_dict.items()])) + t000 = time.perf_counter() - print(f'\nSaving metadata...') + print(f'Saving metadata...') # Write out a csv with script runtime stats nwm_file_sizes = [] for jfile in nwm_forcing_files: @@ -848,7 +770,7 @@ def prep_ngen_data(conf): df = dfs[jcatch] with gzip.GzipFile(mode='w', fileobj=buf) as zipped_file: df.to_csv(TextIOWrapper(zipped_file, 'utf8'), index=False) - key_name = f"{output_bucket_path}/forcing/metadata/{datentime}/zipped_forcing/{zipname}" + key_name = f"{output_bucket_path}/metadata/{datentime}/zipped_forcing/{zipname}" s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue()) buf.close() @@ -867,58 +789,61 @@ def prep_ngen_data(conf): catch_file_zip_size_med = np.median(zipped_sizes) catch_file_zip_size_std = np.std(zipped_sizes) - metadata_script = { + mil = 1000000 + + metadata = { "runtime_s" : [round(total_time,2)], - "nvars_intput" : [len(var_list)], - "nvars_output" : [len(var_list_out)], + "nvars_intput" : [len(var_list)], "nwmfiles_input" : [len(nwm_forcing_files)], - "nwm_file_size_avg" : [nwm_file_size_avg], - "nwm_file_size_med" : [nwm_file_size_med], - "nwm_file_size_std" : [nwm_file_size_std], + "nwm_file_size_avg_MB" : [nwm_file_size_avg/mil], + "nwm_file_size_med_MB" : [nwm_file_size_med/mil], + "nwm_file_size_std_MB" : [nwm_file_size_std/mil], "catch_files_output" : [nfiles], - "catch_file_size_avg" : [catch_file_size_avg], - "catch_file_size_med" : [catch_file_size_med], - "catch_file_size_std" : [catch_file_size_std], - "catch_file_zip_size_avg" : [catch_file_zip_size_avg], - "catch_file_zip_size_med" : [catch_file_zip_size_med], - "catch_file_zip_size_std" : [catch_file_zip_size_std], + "nvars_output" : [len(var_list_out)], + "catch_file_size_avg_MB" : [catch_file_size_avg/mil], + "catch_file_size_med_MB" : [catch_file_size_med/mil], + "catch_file_size_std_MB" : [catch_file_size_std/mil], + 
"catch_file_zip_size_avg_MB" : [catch_file_zip_size_avg/mil], + "catch_file_zip_size_med_MB" : [catch_file_zip_size_med/mil], + "catch_file_zip_size_std_MB" : [catch_file_zip_size_std/mil], } - # TODO: if we pull this script from a repo, include the hash - if False: - repo = git.Repo(search_parent_directories=True) - sha = repo.head.object.hexsha - metadata_script["commit"] = sha - # Save input config file and script commit - metadata_df = pd.DataFrame.from_dict(metadata_script) + metadata_df = pd.DataFrame.from_dict(metadata) if storage_type == 'S3': + # Write files to s3 bucket buf = BytesIO() - + csvname = f"hashes_{vpu_or_subset}_{datentime}.csv" + hash_df.to_csv(buf, index=False) + buf.seek(0) + key_name = f"{output_bucket_path}/metadata/{csvname}" + s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue()) csvname = f"metadata_{vpu_or_subset}_{datentime}.csv" metadata_df.to_csv(buf, index=False) buf.seek(0) - key_name = f"{output_bucket_path}/forcing/metadata/{datentime}/{csvname}" + key_name = f"{output_bucket_path}/metadata/{csvname}" s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue()) - - # Save catchment based forcing stats - csvname = f"catchments_avg.csv" + csvname = f"catchments_avg_{datentime}.csv" stats_avg.to_csv(buf, index=False) buf.seek(0) - key_name = f"{output_bucket_path}/forcing/metadata/{datentime}/{csvname}" + key_name = f"{output_bucket_path}/metadata/{csvname}" s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue()) - - csvname = f"catchments_median.csv" + csvname = f"catchments_median_{datentime}.csv" stats_median.to_csv(buf, index=False) buf.seek(0) - key_name = f"{output_bucket_path}/forcing/metadata/{datentime}/{csvname}" + key_name = f"{output_bucket_path}/metadata/{csvname}" s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue()) - buf.close() - else: - # TODO - raise NotImplementedError + # Write files locally + csvname = Path(forcing_path, f"hashes_{vpu_or_subset}_{datentime}.csv") + hash_df.to_csv(csvname, index=False) + csvname = Path(forcing_path, f"metadata_{vpu_or_subset}_{datentime}.csv") + metadata_df.to_csv(csvname, index=False) + csvname = Path(forcing_path, f"catchments_avg_{datentime}.csv") + stats_avg.to_csv(csvname, index=False) + csvname = Path(forcing_path, f"catchments_median_{datentime}.csv") + stats_median.to_csv(csvname, index=False) meta_time = time.perf_counter() - t000 @@ -934,7 +859,8 @@ def prep_ngen_data(conf): if ii_collect_stats: msg += f"\nCollect stats : {meta_time:.2f}s" print(msg) - + + if __name__ == "__main__": # Take in user config parser = argparse.ArgumentParser() From c4d0f1a13e1f60eb4699c9b3a801c0b0adb6420f Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Wed, 16 Aug 2023 13:07:21 -0600 Subject: [PATCH 097/105] removed hard coded variables and made testing optional --- push_docker_ecr.sh | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/push_docker_ecr.sh b/push_docker_ecr.sh index e40815c..53c41fa 100644 --- a/push_docker_ecr.sh +++ b/push_docker_ecr.sh @@ -1,9 +1,3 @@ -# Set default values -AWS_ACCT_ID="010036277732" -IMAGE_NAME="forcingprocessor" -REPO_NAME="nextgenforcing" -REGION="us-west-2" - # Parse command line arguments while [[ $# -gt 0 ]]; do case "$1" in @@ -23,6 +17,10 @@ while [[ $# -gt 0 ]]; do REGION="$2" shift 2 ;; + --test) + TEST="$2" + shift 2 + ;; *) echo "Unknown argument: $1" exit 1 @@ -35,12 +33,13 @@ echo "AWS_ACCT_ID: $AWS_ACCT_ID" echo "IMAGE_NAME: $IMAGE_NAME" echo "REPO_NAME: 
$REPO_NAME" echo "REGION: $REGION" +echo "TEST: $TEST" # build docker image echo "Building docker image ${REPO_NAME}" docker build --no-cache -t ${IMAGE_NAME} . -# Validate docker credentials +# Validate aws credentials echo "Validating credentials" aws ecr get-login-password --region ${REGION} | docker login --username AWS --password-stdin "${AWS_ACCT_ID}.dkr.ecr.us-west-2.amazonaws.com" @@ -76,8 +75,12 @@ aws lambda update-function-code \ --region ${REGION} # Test -echo "Testing deployment" -aws s3api put-object \ - --bucket nwm.test \ - --key 02.conus.nc.txt \ - --body ~/code/data_access_examples/data/cache/02.conus.nc.txt \ No newline at end of file +if [[ "$TEST" == "TRUE" ]]; then + echo "Testing deployment" + aws s3api put-object \ + --bucket nwm.test \ + --key 02.conus.nc.txt \ + --body ~/code/data_access_examples/data/cache/02.conus.nc.txt +else + echo "Lambda function was not tested" +fi From daabf3c50d00278dfeea8b9a8cd360bed7533145 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Wed, 16 Aug 2023 13:39:07 -0600 Subject: [PATCH 098/105] moved conf into proper metadata folder --- ngen_forcing/prep_hydrofab_forcings_ngen.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py index 39befc9..4ae030d 100644 --- a/ngen_forcing/prep_hydrofab_forcings_ngen.py +++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py @@ -461,7 +461,7 @@ def prep_ngen_data(conf): s3.put_object( Body=json.dumps(conf), Bucket=output_bucket, - Key=f"{output_bucket_path}/forcing/metadata/{datentime}/conf.json" + Key=f"{output_bucket_path}/metadata/{datentime}/conf.json" ) # Generate weight file only if one doesn't exist already @@ -812,26 +812,27 @@ def prep_ngen_data(conf): metadata_df = pd.DataFrame.from_dict(metadata) if storage_type == 'S3': # Write files to s3 bucket + meta_path = f"{output_bucket_path}/metadata/" buf = BytesIO() csvname = f"hashes_{vpu_or_subset}_{datentime}.csv" hash_df.to_csv(buf, index=False) buf.seek(0) - key_name = f"{output_bucket_path}/metadata/{csvname}" + key_name = meta_path + csvname s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue()) csvname = f"metadata_{vpu_or_subset}_{datentime}.csv" metadata_df.to_csv(buf, index=False) buf.seek(0) - key_name = f"{output_bucket_path}/metadata/{csvname}" + key_name = meta_path + csvname s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue()) csvname = f"catchments_avg_{datentime}.csv" stats_avg.to_csv(buf, index=False) buf.seek(0) - key_name = f"{output_bucket_path}/metadata/{csvname}" + key_name = meta_path + csvname s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue()) csvname = f"catchments_median_{datentime}.csv" stats_median.to_csv(buf, index=False) buf.seek(0) - key_name = f"{output_bucket_path}/metadata/{csvname}" + key_name = meta_path + csvname s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue()) buf.close() else: From 652aea28ac03df0fd01ec0cb8336e0be93aa1d08 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Fri, 25 Aug 2023 14:45:03 -0600 Subject: [PATCH 099/105] personal -> ciroh changes --- forcing_processor_lambda.tf | 8 ++----- jl_dev.tfvars | 5 ++-- ngen_forcing/prep_hydrofab_forcings_ngen.py | 26 ++++++++++----------- push_docker_ecr.sh | 2 +- 4 files changed, 18 insertions(+), 23 deletions(-) diff --git a/forcing_processor_lambda.tf b/forcing_processor_lambda.tf index ddbb6f7..4ecd7fd 100644 --- a/forcing_processor_lambda.tf +++ 
From 652aea28ac03df0fd01ec0cb8336e0be93aa1d08 Mon Sep 17 00:00:00 2001
From: Jordan Laser
Date: Fri, 25 Aug 2023 14:45:03 -0600
Subject: [PATCH 099/105] personal -> ciroh changes

---
 forcing_processor_lambda.tf                 |  8 ++------
 jl_dev.tfvars                               |  5 ++---
 ngen_forcing/prep_hydrofab_forcings_ngen.py | 26 +++++++++++-----------
 push_docker_ecr.sh                          |  2 +-
 4 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/forcing_processor_lambda.tf b/forcing_processor_lambda.tf
index ddbb6f7..4ecd7fd 100644
--- a/forcing_processor_lambda.tf
+++ b/forcing_processor_lambda.tf
@@ -8,10 +8,6 @@ terraform {
 }
 
 # Variable declarations
-variable "deployment_version" {
-  type = string
-}
-
 variable "region" {
   type = string
 }
@@ -54,9 +50,9 @@ data "aws_ecr_repository" "image_repo" {
 
 # Create function and set role
 resource "aws_lambda_function" "forcing_processor_function" {
-  function_name = "${var.function_name}_${var.deployment_version}"
+  function_name = "${var.function_name}"
   timeout = 900 # 900 is max
-  image_uri = "${data.aws_ecr_repository.image_repo.repository_url}:${var.image_tag}_${deployment_version}"
+  image_uri = "${data.aws_ecr_repository.image_repo.repository_url}:${var.image_tag}"
   package_type = "Image"
   memory_size = var.memory_size
 
diff --git a/jl_dev.tfvars b/jl_dev.tfvars
index 94293cb..17b31d0 100644
--- a/jl_dev.tfvars
+++ b/jl_dev.tfvars
@@ -1,6 +1,5 @@
-deployment_version = "0.2"
-region = "us-west-2"
-trigger_bucket = "nwm.test"
+region = "us-east-2"
+trigger_bucket = "ngenresources"
 ecr_repo = "nextgenforcing"
 function_name = "forcingprocessor"
 trigger_file_prefix = ""
diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py
index 4ae030d..7f0477b 100644
--- a/ngen_forcing/prep_hydrofab_forcings_ngen.py
+++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py
@@ -406,7 +406,7 @@ def prep_ngen_data(conf):
         vpu_or_subset = vpu
 
     if output_bucket_path == "":
-        output_bucket_path = start_date
+        output_bucket_path = datentime
 
     if storage_type == "local":
 
@@ -414,7 +414,7 @@ def prep_ngen_data(conf):
         top_dir = Path(os.path.dirname(__file__)).parent
         bucket_path = Path(top_dir, output_bucket_path, output_bucket)
         forcing_path = Path(bucket_path, 'forcing')
-        meta_path = Path(forcing_path, 'metadata')
+        meta_path = Path(forcing_path, 'forcing_metadata')
         if not os.path.exists(bucket_path):
             os.system(f"mkdir {bucket_path}")
             os.system(f"mkdir {forcing_path}")
@@ -461,7 +461,7 @@ def prep_ngen_data(conf):
         s3.put_object(
             Body=json.dumps(conf),
             Bucket=output_bucket,
-            Key=f"{output_bucket_path}/metadata/{datentime}/conf.json"
+            Key=f"{output_bucket_path}/forcing_metadata/conf.json"
         )
 
     # Generate weight file only if one doesn't exist already
@@ -770,7 +770,7 @@ def prep_ngen_data(conf):
             df = dfs[jcatch]
             with gzip.GzipFile(mode='w', fileobj=buf) as zipped_file:
                 df.to_csv(TextIOWrapper(zipped_file, 'utf8'), index=False)
-            key_name = f"{output_bucket_path}/metadata/{datentime}/zipped_forcing/{zipname}"
+            key_name = f"{output_bucket_path}/forcing_metadata/zipped_forcing/{zipname}"
             s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue())
             buf.close()
 
@@ -812,24 +812,24 @@ def prep_ngen_data(conf):
     metadata_df = pd.DataFrame.from_dict(metadata)
     if storage_type == 'S3':
         # Write files to s3 bucket
-        meta_path = f"{output_bucket_path}/metadata/"
+        meta_path = f"{output_bucket_path}/forcing_metadata/"
         buf = BytesIO()
-        csvname = f"hashes_{vpu_or_subset}_{datentime}.csv"
+        csvname = f"hashes_{vpu_or_subset}.csv"
         hash_df.to_csv(buf, index=False)
         buf.seek(0)
         key_name = meta_path + csvname
         s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue())
-        csvname = f"metadata_{vpu_or_subset}_{datentime}.csv"
+        csvname = f"metadata_{vpu_or_subset}.csv"
         metadata_df.to_csv(buf, index=False)
         buf.seek(0)
         key_name = meta_path + csvname
         s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue())
-        csvname = f"catchments_avg_{datentime}.csv"
+        csvname = f"catchments_avg.csv"
         stats_avg.to_csv(buf, index=False)
         buf.seek(0)
         key_name = meta_path + csvname
         s3.put_object(Bucket=output_bucket, Key=key_name, Body=buf.getvalue())
-        csvname = f"catchments_median_{datentime}.csv"
+        csvname = f"catchments_median.csv"
         stats_median.to_csv(buf, index=False)
         buf.seek(0)
         key_name = meta_path + csvname
@@ -837,13 +837,13 @@ def prep_ngen_data(conf):
         buf.close()
     else:
         # Write files locally
-        csvname = Path(forcing_path, f"hashes_{vpu_or_subset}_{datentime}.csv")
+        csvname = Path(forcing_path, f"hashes_{vpu_or_subset}.csv")
         hash_df.to_csv(csvname, index=False)
-        csvname = Path(forcing_path, f"metadata_{vpu_or_subset}_{datentime}.csv")
+        csvname = Path(forcing_path, f"metadata_{vpu_or_subset}.csv")
         metadata_df.to_csv(csvname, index=False)
-        csvname = Path(forcing_path, f"catchments_avg_{datentime}.csv")
+        csvname = Path(forcing_path, f"catchments_avg.csv")
         stats_avg.to_csv(csvname, index=False)
-        csvname = Path(forcing_path, f"catchments_median_{datentime}.csv")
+        csvname = Path(forcing_path, f"catchments_median.csv")
         stats_median.to_csv(csvname, index=False)
 
     meta_time = time.perf_counter() - t000
 
diff --git a/push_docker_ecr.sh b/push_docker_ecr.sh
index 53c41fa..ee8e30a 100644
--- a/push_docker_ecr.sh
+++ b/push_docker_ecr.sh
@@ -41,7 +41,7 @@ docker build --no-cache -t ${IMAGE_NAME} .
 
 # Validate aws credentials
 echo "Validating credentials"
-aws ecr get-login-password --region ${REGION} | docker login --username AWS --password-stdin "${AWS_ACCT_ID}.dkr.ecr.us-west-2.amazonaws.com"
+aws ecr get-login-password --region ${REGION} | docker login --username AWS --password-stdin "${AWS_ACCT_ID}.dkr.ecr.${REGION}.amazonaws.com"
 
 # Create repo
 echo "Checking image repo status"

From 3ae22900bb5a7f86b293f5cc87477190c616dee3 Mon Sep 17 00:00:00 2001
From: Jordan Laser
Date: Wed, 30 Aug 2023 13:05:42 -0600
Subject: [PATCH 100/105] fix path

---
 lambda_function.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lambda_function.py b/lambda_function.py
index 419d63f..bfbdb93 100644
--- a/lambda_function.py
+++ b/lambda_function.py
@@ -4,7 +4,7 @@ def handler(event, context):
 
     # load template config
-    conf = json.load(open('/function/ngen_forcing/ngen_forcings_lambda.json'))
+    conf = json.load(open('/data_access_examples/ngen_forcing/ngen_forcings_lambda.json'))
 
     # get date from event
 

From 2a063953c4de75da155e2d86f26af9ecfd12d680 Mon Sep 17 00:00:00 2001
From: Jordan Laser
Date: Wed, 30 Aug 2023 13:32:25 -0600
Subject: [PATCH 101/105] config file

---
 ngen_forcing/ngen_forcings_lambda.json | 42 ++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 ngen_forcing/ngen_forcings_lambda.json

diff --git a/ngen_forcing/ngen_forcings_lambda.json b/ngen_forcing/ngen_forcings_lambda.json
new file mode 100644
index 0000000..ca8a308
--- /dev/null
+++ b/ngen_forcing/ngen_forcings_lambda.json
@@ -0,0 +1,42 @@
+{
+    "forcing" : {
+        "forcing_type" : "operational_archive",
+        "start_date" : "20230705",
+        "end_date" : "20230705",
+        "cache" : true,
+        "nwm_file" : "",
+        "path_override": "",
+        "runinput" : 1,
+        "varinput" : 5,
+        "geoinput" : 1,
+        "meminput" : 0,
+        "urlbaseinput" : 3,
+        "fcst_cycle" : [0],
+        "lead_time" : [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]
+    },
+
+    "hydrofab" : {
+        "version" : "v1.2",
+        "vpu" : "03W",
+        "geopkg_file" : "data/cache/nextgen_03W.gpkg",
+        "catch_subset" : "cat-115362",
+        "weights_only" : false
+    },
+
+    "storage":{
+        "storage_type" : "S3",
+        "output_bucket" : "ngenforcing",
+        "output_bucket_path" : "",
+        "cache_bucket" : "ngenresources",
+        "cache_bucket_path" : "",
+        "output_file_type" : "csv"
+    },
+
+    "run" : {
+        "verbose" : true,
+        "dl_threads" : 1,
+        "collect_stats" : true,
+        "secret_name" : "jlaser_creds",
+        "region_name" : "us-east-2"
+    }
+}
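The template configuration added in PATCH 101 above is what the Lambda handler from PATCH 100 loads at run time. Below is a hedged sketch of pulling nested options out of it with defaults, in the same conf[section].get(key, default) style the processor uses elsewhere; the default values shown here are illustrative, not the project's authoritative settings.

```python
# Sketch only: read the template config and resolve options with defaults.
import json

def load_options(conf_path: str) -> dict:
    with open(conf_path) as f:
        conf = json.load(f)

    return {
        "start_date": conf["forcing"]["start_date"],
        "version": conf["hydrofab"].get("version", "v1.2"),
        "vpu": conf["hydrofab"].get("vpu"),
        "weights_only": conf["hydrofab"].get("weights_only", False),
        "storage_type": conf["storage"].get("storage_type", "local"),
        "verbose": conf["run"].get("verbose", True),
    }

# opts = load_options("ngen_forcing/ngen_forcings_lambda.json")
```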
From a6f39b7664852381aa192e68664ebf6b7d1495f7 Mon Sep 17 00:00:00 2001
From: Jordan Laser
Date: Mon, 18 Sep 2023 10:50:37 -0600
Subject: [PATCH 102/105] Allows VPU to be given as env variable

---
 ngen_forcing/prep_hydrofab_forcings_ngen.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py
index 7f0477b..1bc59ba 100644
--- a/ngen_forcing/prep_hydrofab_forcings_ngen.py
+++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py
@@ -364,6 +364,8 @@ def prep_ngen_data(conf):
     version = conf["hydrofab"].get('version','v1.2')
     vpu = conf["hydrofab"].get("vpu")
+    if vpu =='ENV': vpu = os.getenv('VPU') # allows lambdas to be unique
+
     catchment_subset = conf['hydrofab'].get("catch_subset")
     geopkg_file = conf["hydrofab"].get("geopkg_file")
     ii_weights_only = conf['hydrofab'].get('weights_only',False)
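PATCH 102 above lets the VPU come from the environment, so one container image can be deployed per VPU without editing the baked-in config. A short, hedged sketch of that resolution pattern; the "ENV" sentinel mirrors the patch, while the error handling is an illustrative addition rather than the repository's behavior.

```python
# Sketch only: resolve the VPU from the config or the environment.
import os

def resolve_vpu(conf: dict) -> str:
    vpu = conf["hydrofab"].get("vpu")
    if vpu == "ENV":
        vpu = os.getenv("VPU")  # each deployed Lambda can export a different VPU
    if not vpu:
        raise ValueError("Set hydrofab.vpu in the config or export the VPU environment variable")
    return vpu
```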
From 3e86ad876dd0f74c0a12dcdb3e03c60ee8555cb6 Mon Sep 17 00:00:00 2001
From: Jordan Laser
Date: Mon, 18 Sep 2023 23:44:26 -0600
Subject: [PATCH 103/105] aws updates

---
 ngen_forcing/ngen_forcings_lambda.json      | 14 ++---
 ngen_forcing/prep_hydrofab_forcings_ngen.py | 66 +++++++++++++++++----
 2 files changed, 63 insertions(+), 17 deletions(-)

diff --git a/ngen_forcing/ngen_forcings_lambda.json b/ngen_forcing/ngen_forcings_lambda.json
index ca8a308..505f084 100644
--- a/ngen_forcing/ngen_forcings_lambda.json
+++ b/ngen_forcing/ngen_forcings_lambda.json
@@ -3,7 +3,7 @@
         "forcing_type" : "operational_archive",
         "start_date" : "20230705",
         "end_date" : "20230705",
-        "cache" : true,
+        "cache" : false,
         "nwm_file" : "",
         "path_override": "",
         "runinput" : 1,
@@ -17,17 +17,17 @@
 
     "hydrofab" : {
         "version" : "v1.2",
-        "vpu" : "03W",
-        "geopkg_file" : "data/cache/nextgen_03W.gpkg",
-        "catch_subset" : "cat-115362",
+        "vpu" : "ENV",
+        "geopkg_file" : "",
+        "catch_subset" : "",
         "weights_only" : false
     },
 
     "storage":{
         "storage_type" : "S3",
-        "output_bucket" : "ngenforcing",
+        "output_bucket" : "ngenforcingdev1",
         "output_bucket_path" : "",
-        "cache_bucket" : "ngenresources",
+        "cache_bucket" : "ngenforcingresources",
         "cache_bucket_path" : "",
         "output_file_type" : "csv"
     },
@@ -37,6 +37,6 @@
         "dl_threads" : 1,
         "collect_stats" : true,
         "secret_name" : "jlaser_creds",
-        "region_name" : "us-east-2"
+        "region_name" : "us-west-2"
     }
 }
diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py
index 1bc59ba..40272ba 100644
--- a/ngen_forcing/prep_hydrofab_forcings_ngen.py
+++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py
@@ -31,7 +31,7 @@ from subset import subset_upstream
 
 TEMPLATE_BLOB_NAME = (
-    "nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc"
+    "nwm.t00z.medium_range.forcing.f001.conus.nc"
 )
 NWM_BUCKET = "national-water-model"
 
@@ -41,6 +41,52 @@ PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-97.0],PARAMETER["standard_parallel_1",30.0],\
 PARAMETER["standard_parallel_2",60.0],PARAMETER["latitude_of_origin",40.0],UNIT["Meter",1.0]]'
 
+def get_dataset(
+    blob_name: str,
+    use_cache: bool = True
+) -> xr.Dataset:
+    """Retrieve a blob from the data service as xarray.Dataset.
+    Based largely on OWP HydroTools.
+    Parameters
+    ----------
+    blob_name: str, required
+        Name of blob to retrieve.
+    use_cacahe: bool, default True
+        If cache should be used.
+        If True, checks to see if file is in cache, and
+        if fetched from remote will save to cache.
+    Returns
+    -------
+    ds : xarray.Dataset
+        The data stored in the blob.
+    """
+    # nc_filepath = os.path.join(get_cache_dir(), blob_name)
+    # make_parent_dir(nc_filepath)
+
+    # # If the file exists and use_cache = True
+    # if os.path.exists(nc_filepath) and use_cache:
+    # Get dataset from cache
+    ds = xr.open_dataset(
+        blob_name,
+        engine='h5netcdf',
+    )
+    return ds
+    # else:
+    #     # Get raw bytes
+    #     raw_bytes = get_blob(blob_name)
+    #     # Create Dataset
+    #     ds = xr.load_dataset(
+    #         MemoryFile(raw_bytes),
+    #         engine='h5netcdf',
+    #     )
+    #     if use_cache:
+    #         # Subset and cache
+    #         ds["RAINRATE"].to_netcdf(
+    #             nc_filepath,
+    #             engine='h5netcdf',
+    #         )
+    #     return ds
+
 def get_cache_dir(nwm_cache_dir: str,create: bool = True):
     if not os.path.exists(nwm_cache_dir) and create:
         os.mkdir(nwm_cache_dir)
@@ -396,13 +442,13 @@ def prep_ngen_data(conf):
     assert (
         output_file_type in file_types
     ), f"{output_file_type} for output_file_type is not accepted! Accepted: {file_types}"
-    bucket_types = ["local", "S3"]
+    bucket_types = ["local", "s3"]
     assert (
-        storage_type in bucket_types
+        storage_type.lower() in bucket_types
     ), f"{storage_type} for storage_type is not accepted! Accepted: {bucket_types}"
-    assert vpu is not None or geopkg_file is not None, "Need to input either vpu or geopkg_file"
+    assert vpu is not None or geopkg_file is not None, "Need to input either vpu or geopkg_file"
 
-    if catchment_subset is not None:
+    if len(catchment_subset) > 0:
         vpu_or_subset = catchment_subset + "_upstream"
     else:
         vpu_or_subset = vpu
@@ -418,7 +464,8 @@ def prep_ngen_data(conf):
         forcing_path = Path(bucket_path, 'forcing')
         meta_path = Path(forcing_path, 'forcing_metadata')
         if not os.path.exists(bucket_path):
+            os.system(f"mkdir {bucket_path}")
             os.system(f"mkdir {bucket_path}")
             os.system(f"mkdir {forcing_path}")
             os.system(f"mkdir {meta_path}")
@@ -432,6 +479,7 @@ def prep_ngen_data(conf):
         if not os.path.exists(cache_dir):
             raise Exception(f"Creating {cache_dir} failed!")
 
+        if len(geopkg_file) == 0: geopkg_file = os.path.join(cache_dir,"nextgen_" + vpu + ".gpkg")
         wgt_file = os.path.join(cache_dir, f"{vpu_or_subset}_weights.json")
         ii_wgt_file = os.path.exists(wgt_file)
@@ -538,7 +586,8 @@ def prep_ngen_data(conf):
         t0 = time.perf_counter()
         polygonfile = gpd.read_file(gpkg, layer="divides")
 
-        ds = get_dataset(nwm_cache_dir,TEMPLATE_BLOB_NAME, use_cache=True)
+        ds = get_dataset(os.path.join(nwm_cache_dir,TEMPLATE_BLOB_NAME), use_cache=True)
+        # ds = get_dataset(nwm_cache_dir,TEMPLATE_BLOB_NAME, use_cache=True)
         src = ds["RAINRATE"]
 
         if ii_verbose:
@@ -557,7 +606,7 @@ def prep_ngen_data(conf):
 
     # Exit early if we only want to calculate the weights
     if ii_weights_only:
-        exit
+        return
 
     # Get nwm forcing file names
     t0 = time.perf_counter()
@@ -617,9 +666,6 @@ def prep_ngen_data(conf):
         )
         dl_time += time.perf_counter() - t0
 
-        # HACK
-        local_nwm_files = local_nwm_files[:3]
-
     var_list = [
         "U2D",
         "V2D",
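PATCH 103 above reworks get_dataset so it opens the cached NWM template file directly with xarray's h5netcdf engine, then the caller pulls the RAINRATE grid used for weight generation. A minimal, hedged sketch of that loading step follows; the cache directory is a placeholder, and the helper name is an illustrative assumption.

```python
# Sketch only: open the cached NWM template and grab the RAINRATE grid.
import os
import xarray as xr

TEMPLATE_BLOB_NAME = "nwm.t00z.medium_range.forcing.f001.conus.nc"

def open_template_rainrate(nwm_cache_dir: str) -> xr.DataArray:
    path = os.path.join(nwm_cache_dir, TEMPLATE_BLOB_NAME)
    ds = xr.open_dataset(path, engine="h5netcdf")  # same engine as get_dataset above
    return ds["RAINRATE"]

# src = open_template_rainrate("data/cache")  # hypothetical cache location
```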
From 3dccd0b9165396943d991fb61d0db58e17c41313 Mon Sep 17 00:00:00 2001
From: Jordan Laser
Date: Tue, 19 Sep 2023 00:01:17 -0600
Subject: [PATCH 104/105] debug

---
 ngen_forcing/prep_hydrofab_forcings_ngen.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py
index 40272ba..64634d4 100644
--- a/ngen_forcing/prep_hydrofab_forcings_ngen.py
+++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py
@@ -501,7 +501,8 @@ def prep_ngen_data(conf):
         )
 
         try:
-            wgt_file = s3.get_object(Bucket=cache_bucket, Key=f"{vpu_or_subset}_weights.json")
+            print(f'{cache_bucket} {vpu_or_subset}')
+            wgt_file = s3.get_object(Bucket=cache_bucket, Key=f"{vpu_or_subset}_weights.json")
             ii_wgt_file = True
         except :
             ii_wgt_file = False

From a1e20ead3f7b0f19c154ea0bbf86cddfabf70752 Mon Sep 17 00:00:00 2001
From: Jordan Laser
Date: Tue, 19 Sep 2023 10:13:01 -0600
Subject: [PATCH 105/105] removing file cap for production

---
 ngen_forcing/prep_hydrofab_forcings_ngen.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ngen_forcing/prep_hydrofab_forcings_ngen.py b/ngen_forcing/prep_hydrofab_forcings_ngen.py
index 64634d4..09025ae 100644
--- a/ngen_forcing/prep_hydrofab_forcings_ngen.py
+++ b/ngen_forcing/prep_hydrofab_forcings_ngen.py
@@ -725,7 +725,7 @@ def prep_ngen_data(conf):
     forcing_cat_ids = []
     forcing_hashes = []
     for j, jcatch in enumerate(dfs.keys()):
-        if j > catch_lim: break # TODO: remove this break for actual deployment. Just don't want to get charged for uploads.
+        # if j > catch_lim: break # TODO: remove this break for actual deployment. Just don't want to get charged for uploads.
 
         df = dfs[jcatch]
         cat_id = jcatch.split("-")[1]
@@ -800,7 +800,7 @@ def prep_ngen_data(conf):
     catchment_sizes = []
     zipped_sizes = []
     for j, jcatch in enumerate(dfs.keys()):
-        if j > catch_lim: break # TODO: remove this break for actual deployment. Just don't want to get charged for uploads.
+        # if j > catch_lim: break # TODO: remove this break for actual deployment. Just don't want to get charged for uploads.
 
         # Check forcing size
         splt = jcatch.split("-")
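PATCH 104 above leaves the cached-weights lookup inside a bare try/except, so any get_object failure is treated as "no weights file yet". Below is a hedged sketch of a slightly more explicit existence check using head_object and a narrowed exception; it is an alternative framing rather than the repository's implementation, and the bucket and key names are placeholders.

```python
# Sketch only: check whether a cached weights file exists in S3 before regenerating it.
import boto3
from botocore.exceptions import ClientError

s3 = boto3.client("s3")

def weights_exist(bucket: str, vpu_or_subset: str) -> bool:
    key = f"{vpu_or_subset}_weights.json"
    try:
        s3.head_object(Bucket=bucket, Key=key)  # cheaper than get_object for a presence check
        return True
    except ClientError as e:
        if e.response["Error"]["Code"] in ("404", "NoSuchKey", "NotFound"):
            return False
        raise  # credential or permission problems should surface, not look like a cache miss

# if not weights_exist("ngenforcingresources", "03W"):
#     ...generate and upload the weights file...
```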