From 1bfe3059a21dc9f0f4c2af7cfa8133c1d0d5b170 Mon Sep 17 00:00:00 2001
From: Baptiste Vandecrux <35140661+BaptisteVandecrux@users.noreply.github.com>
Date: Mon, 1 Jul 2024 10:58:17 +0200
Subject: [PATCH] printing nan fields in L2, list old variables to drop in
 join_l3, fixed warning in join_l3

---
 src/pypromice/process/L2toL3.py               | 16 ++++---
 src/pypromice/process/aws.py                  |  1 +
 src/pypromice/process/join_l3.py              | 42 +++++++++++--------
 src/pypromice/process/write.py                | 12 +++++-
 .../resources/variable_aliases_GC-Net.csv     |  6 +--
 5 files changed, 49 insertions(+), 28 deletions(-)

diff --git a/src/pypromice/process/L2toL3.py b/src/pypromice/process/L2toL3.py
index cd9e70fa..50848be6 100755
--- a/src/pypromice/process/L2toL3.py
+++ b/src/pypromice/process/L2toL3.py
@@ -5,7 +5,7 @@
 import pandas as pd
 import numpy as np
 import xarray as xr
-import toml
+import toml, os
 from sklearn.linear_model import LinearRegression
 import logging
 
@@ -128,11 +128,15 @@ def gpsCoordinatePostprocessing(ds, var, config_folder='../aws-l0/metadata/stati
     # fetching the station relocation dates at which the coordinates will/should
     # have a break
     config_file = config_folder +"/" + ds.attrs['station_id'] + ".toml"
-    with open(config_file, "r") as f:
-        config_data = toml.load(f)
-
-    # Extract station relocations from the TOML data
-    station_relocations = config_data.get("station_relocation", [])
+    if os.path.isfile(config_file):
+        with open(config_file, "r") as f:
+            config_data = toml.load(f)
+
+        # Extract station relocations from the TOML data
+        station_relocations = config_data.get("station_relocation", [])
+    else:
+        station_relocations = []
+        logger.warning('Did not find config file for '+ds.attrs['station_id']+'. Assuming no station relocation.')
 
     # Convert the ISO8601 strings to pandas datetime objects
     breaks = [pd.to_datetime(date_str) for date_str in station_relocations]
diff --git a/src/pypromice/process/aws.py b/src/pypromice/process/aws.py
index 7314a19e..5e337cc7 100644
--- a/src/pypromice/process/aws.py
+++ b/src/pypromice/process/aws.py
@@ -91,6 +91,7 @@ def getL1(self):
         logger.info('Level 1 processing...')
         self.L0 = [utilities.addBasicMeta(item, self.vars) for item in self.L0]
         self.L1 = [toL1(item, self.vars) for item in self.L0]
+        self.L1.reverse()
         self.L1A = reduce(xr.Dataset.combine_first, self.L1)
 
     def getL2(self):
diff --git a/src/pypromice/process/join_l3.py b/src/pypromice/process/join_l3.py
index 10a2c769..f4cf0d26 100644
--- a/src/pypromice/process/join_l3.py
+++ b/src/pypromice/process/join_l3.py
@@ -105,8 +105,20 @@ def readNead(infile):
     # combining thermocouple and CS100 temperatures
     ds['TA1'] = ds['TA1'].combine_first(ds['TA3'])
     ds['TA2'] = ds['TA2'].combine_first(ds['TA4'])
-    
+
     ds=ds.rename(var_name)
+
+    standard_vars_to_drop = ["NR", "TA3", "TA4", "TA5", "NR_cor",
+                             "z_surf_1", "z_surf_2", "z_surf_combined",
+                             "TA2m", "RH2m", "VW10m", "SZA", "SAA",
+                             "depth_t_i_1", "depth_t_i_2", "depth_t_i_3", "depth_t_i_4", "depth_t_i_5",
+                             "depth_t_i_6", "depth_t_i_7", "depth_t_i_8", "depth_t_i_9", "depth_t_i_10", "t_i_10m"
+                             ]
+    standard_vars_to_drop = standard_vars_to_drop + [v for v in list(ds.keys()) if v.endswith("_adj_flag")]
+
+    # Drop the variables if they are present in the dataset
+    ds = ds.drop_vars([var for var in standard_vars_to_drop if var in ds])
+
     ds=ds.rename({'timestamp':'time'})
     return ds
 
@@ -121,7 +133,8 @@ def loadArr(infile, isNead):
         ds = xr.Dataset.from_dataframe(df)
 
     elif infile.split('.')[-1].lower() in 'nc':
-        ds = xr.open_dataset(infile)
+        with xr.open_dataset(infile) as ds:
+            ds.load()
     # Remove encoding attributes from NetCDF
     for varname in ds.variables:
         if ds[varname].encoding!={}:
@@ -211,10 +224,17 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me
             filepath = os.path.join(folder_gcnet, stid+'.csv')
             isNead = True
         if not os.path.isfile(filepath):
-            logger.info(stid+' is from an project '+folder_l3+' or '+folder_gcnet)
+            logger.info(stid+' was listed as a station but could not be found in '+folder_l3+' nor '+folder_gcnet)
             continue
-        l3, _ = loadArr(filepath, isNead)
+        l3, _ = loadArr(filepath, isNead)
+
+        # removing specific variables from a given file
+        specific_vars_to_drop = station_info.get("skipped_variables",[])
+        if len(specific_vars_to_drop)>0:
+            logger.info("Skipping %s from %s"%(specific_vars_to_drop, stid))
+            l3 = l3.drop_vars([var for var in specific_vars_to_drop if var in l3])
+
         list_station_data.append((l3, station_info))
 
     # Sort the list in reverse chronological order so that we start with the latest data
@@ -251,19 +271,7 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me
         for v in l3_merged.data_vars:
             if v not in l3.data_vars:
                 l3[v] = l3.t_u*np.nan
-
-        # if l3 (older data) has variables that does not have l3_merged (newer data)
-        # then they are removed from l3
-        list_dropped = []
-        for v in l3.data_vars:
-            if v not in l3_merged.data_vars:
-                if v != 'z_stake':
-                    list_dropped.append(v)
-                    l3 = l3.drop(v)
-                else:
-                    l3_merged[v] = ('time', l3_merged.t_u.data*np.nan)
-        logger.info('Unused variables in older dataset: '+' '.join(list_dropped))
-
+
         # saving attributes of station under an attribute called $stid
         st_attrs = l3_merged.attrs.get('stations_attributes', {})
         st_attrs[stid] = l3.attrs.copy()
diff --git a/src/pypromice/process/write.py b/src/pypromice/process/write.py
index e8d9e6a7..922c82d0 100644
--- a/src/pypromice/process/write.py
+++ b/src/pypromice/process/write.py
@@ -63,7 +63,11 @@ def prepare_and_write(dataset, outpath, vars_df=None, meta_dict=None, time='60mi
     d2 = roundValues(d2, vars_df)
 
     # Get variable names to write out
-    col_names = getColNames(vars_df, d2, remove_nan_fields=True)
+    if 'site_id' in d2.attrs.keys():
+        remove_nan_fields = True
+    else:
+        remove_nan_fields = False
+    col_names = getColNames(vars_df, d2, remove_nan_fields=remove_nan_fields)
 
     # Define filename based on resample rate
     t = int(pd.Timedelta((d2['time'][1] - d2['time'][0]).values).total_seconds())
@@ -256,12 +260,16 @@ def addMeta(ds, meta):
     elif 'gps_lon' in ds.keys():
         # calculating average coordinates based on the measured coords (can be gappy)
         for v in ['gps_lat','gps_lon','gps_alt']:
-            ds.attrs[v+'_avg'] = ds[v].mean().item()
+            if v in ds.keys():
+                ds.attrs[v+'_avg'] = ds[v].mean().item()
+            else:
+                ds.attrs[v+'_avg'] = np.nan
     # dropping the less accurate standard coordinates given in the
     # raw or tx config files
     for v in ['latitude','longitude']:
         if v in ds.attrs.keys():
             del ds.attrs[v]
+
 
     # Attribute convention for data discovery
     # https://wiki.esipfed.org/Attribute_Convention_for_Data_Discovery_1-3
diff --git a/src/pypromice/resources/variable_aliases_GC-Net.csv b/src/pypromice/resources/variable_aliases_GC-Net.csv
index a0b4fe14..d84c0a71 100644
--- a/src/pypromice/resources/variable_aliases_GC-Net.csv
+++ b/src/pypromice/resources/variable_aliases_GC-Net.csv
@@ -47,9 +47,9 @@ t_i_11,
 tilt_x,
 tilt_y,
 rot,
-gps_lat,latitude
-gps_lon,longitude
-gps_alt,elevation
+lat,latitude
+lon,longitude
+alt,elevation
 gps_time,
 gps_geounit,
 gps_hdop,
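
Note for reviewers: the L2toL3.py hunk makes gpsCoordinatePostprocessing tolerate a missing station configuration file. Below is a minimal, self-contained sketch of that fallback, assuming a hypothetical helper name load_station_relocations; the patch itself keeps this logic inline in gpsCoordinatePostprocessing.

import logging
import os

import pandas as pd
import toml

logger = logging.getLogger(__name__)

def load_station_relocations(config_folder, station_id):
    # Hypothetical helper, illustrative only; not part of pypromice's API.
    config_file = os.path.join(config_folder, station_id + ".toml")
    if os.path.isfile(config_file):
        with open(config_file, "r") as f:
            config_data = toml.load(f)
        # Extract station relocations from the TOML data
        station_relocations = config_data.get("station_relocation", [])
    else:
        # Same fallback as the patch: warn and assume no relocation
        station_relocations = []
        logger.warning('Did not find config file for %s. Assuming no station relocation.', station_id)
    # Convert the ISO8601 strings to pandas datetime objects
    return [pd.to_datetime(date_str) for date_str in station_relocations]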
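
Similarly, the write.py hunk gates NaN-field removal on whether the dataset carries a 'site_id' attribute, so nan-only columns are dropped only from joined site files while single-station files keep all fields. A hedged sketch of that gate; getColNames is pypromice's real helper, and column_names below is a mock used only to show the calling convention (the site id 'CEN' is illustrative):

import numpy as np
import xarray as xr

def column_names(ds, remove_nan_fields):
    # Stand-in for getColNames: optionally drop variables that are all-NaN
    names = list(ds.data_vars)
    if remove_nan_fields:
        names = [v for v in names if not bool(ds[v].isnull().all())]
    return names

ds = xr.Dataset({'t_u': ('time', [1.0, 2.0]),
                 'z_stake': ('time', [np.nan, np.nan])})
ds.attrs['site_id'] = 'CEN'  # joined site products carry 'site_id'; station files do not

# Same gate as the patched prepare_and_write: drop all-NaN fields only for sites
remove_nan_fields = 'site_id' in ds.attrs.keys()
print(column_names(ds, remove_nan_fields))  # -> ['t_u']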