From bd1a8cee413416b62a4483f71eacd89409b6367d Mon Sep 17 00:00:00 2001
From: Baptiste Vandecrux <35140661+BaptisteVandecrux@users.noreply.github.com>
Date: Wed, 14 Aug 2024 14:16:08 +0200
Subject: [PATCH] making surface height calc failsafe
 + caught exception when there's insufficient data for resample
 + set default values of station_config when no config file is found
 + better adjustment of height in join_l3 when there's overlap between old
 and new data

---
 src/pypromice/process/L2toL3.py     | 50 ++++++++++++++++++-----------
 src/pypromice/process/get_l2tol3.py | 17 ++++++++--
 src/pypromice/process/join_l3.py    | 46 ++++++++++++++++++++------
 src/pypromice/process/write.py      | 25 ++++++++-------
 src/pypromice/qc/persistence.py     |  2 +-
 5 files changed, 98 insertions(+), 42 deletions(-)

diff --git a/src/pypromice/process/L2toL3.py b/src/pypromice/process/L2toL3.py
index 4e1c9fa1..292f2d04 100755
--- a/src/pypromice/process/L2toL3.py
+++ b/src/pypromice/process/L2toL3.py
@@ -105,7 +105,11 @@ def toL3(L2, station_config={}, T_0=273.15):
         ds[var.replace('gps_','')] = ('time', gps_coordinate_postprocessing(ds, var, station_config))
 
     # processing continuous surface height, ice surface height, snow height
-    ds = process_surface_height(ds, station_config)
+    try:
+        ds = process_surface_height(ds, station_config)
+    except Exception as e:
+        logger.error("Error processing surface height at %s"%station_config['stid'])
+        logger.error(e, exc_info=True)
 
     # making sure dataset has the attributes contained in the config files
     ds.attrs['project'] = station_config['project']
@@ -168,28 +172,36 @@ def process_surface_height(ds, station_config={}):
 
     # Convert to dataframe and combine surface height variables
     df_in = ds[[v for v in ['z_surf_1', 'z_surf_2', 'z_ice_surf'] if v in ds.data_vars]].to_dataframe()
 
+
     (ds['z_surf_combined'], ds['z_ice_surf'],
-     ds['z_surf_1_adj'], ds['z_surf_2_adj'])= combine_surface_height(df_in, ds.attrs['site_type'])
+     ds['z_surf_1_adj'], ds['z_surf_2_adj']) = combine_surface_height(df_in, ds.attrs['site_type'])
+
     if ds.attrs['site_type'] == 'ablation':
         # Calculate rolling minimum for ice surface height and snow height
-        ts_interpolated = np.minimum(xr.where(ds.z_ice_surf.notnull(),ds.z_ice_surf,ds.z_surf_combined),
-                                     ds.z_surf_combined).to_series().resample('H').interpolate(limit=72)
-
-        # Apply the rolling window with median calculation
-        z_ice_surf = (ts_interpolated
-                      .rolling('14D', center=True, min_periods=1)
-                      .median())
-
-        # Overprint the first and last 7 days with interpolated values
-        # because of edge effect of rolling windows
-        z_ice_surf.iloc[:24*7] = (ts_interpolated.iloc[:24*7]
-                                  .rolling('1D', center=True, min_periods=1)
-                                  .median().values)
-        z_ice_surf.iloc[-24*7:] = (ts_interpolated.iloc[-24*7:]
-                                   .rolling('1D', center=True, min_periods=1)
-                                   .median().values)
-
+        ts_interpolated = np.minimum(
+            xr.where(ds.z_ice_surf.notnull(),
+                     ds.z_ice_surf, ds.z_surf_combined),
+            ds.z_surf_combined).to_series().resample('H').interpolate(limit=72)
+
+        if len(ts_interpolated) > 24*7:
+            # Apply the rolling window with median calculation
+            z_ice_surf = (ts_interpolated
+                          .rolling('14D', center=True, min_periods=1)
+                          .median())
+            # Overprint the first and last 7 days with interpolated values
+            # because of edge effect of rolling windows
+            z_ice_surf.iloc[:24*7] = (ts_interpolated.iloc[:24*7]
+                                      .rolling('1D', center=True, min_periods=1)
+                                      .median().values)
+            z_ice_surf.iloc[-24*7:] = (ts_interpolated.iloc[-24*7:]
+                                       .rolling('1D', center=True, min_periods=1)
+                                       .median().values)
+        else:
+            z_ice_surf = (ts_interpolated
+                          .rolling('1D', center=True, min_periods=1)
+                          .median())
+
         z_ice_surf = z_ice_surf.loc[ds.time]
         # here we make sure that the periods where both z_stake and z_pt are
         # missing are also missing in z_ice_surf
diff --git a/src/pypromice/process/get_l2tol3.py b/src/pypromice/process/get_l2tol3.py
index ca08b17c..8805db74 100644
--- a/src/pypromice/process/get_l2tol3.py
+++ b/src/pypromice/process/get_l2tol3.py
@@ -50,8 +50,21 @@ def get_l2tol3(config_folder, inpath, outpath, variables, metadata):
 
     # importing station_config (dict) from config_folder (str path)
     config_file = config_folder+l2.attrs['station_id']+'.toml'
-    with open(config_file) as fp:
-        station_config = toml.load(fp)
+
+    if os.path.exists(config_file):
+        # File exists, load the configuration
+        with open(config_file) as fp:
+            station_config = toml.load(fp)
+    else:
+        # File does not exist, initialize with standard info
+        # (this was preferred by RSF over exiting with an error)
+        logger.error("\n***\nNo station_configuration file for %s.\nPlease create one on AWS-L0/metadata/station_configurations.\n***"%l2.attrs['station_id'])
+        station_config = {"stid": l2.attrs['station_id'],
+                          "station_site": l2.attrs['station_id'],
+                          "project": "PROMICE",
+                          "location_type": "ice sheet",
+                          }
+
 
     # Perform Level 3 processing
     l3 = toL3(l2, station_config)
diff --git a/src/pypromice/process/join_l3.py b/src/pypromice/process/join_l3.py
index 3377b107..83488b20 100644
--- a/src/pypromice/process/join_l3.py
+++ b/src/pypromice/process/join_l3.py
@@ -154,12 +154,15 @@ def loadArr(infile, isNead):
 
 def align_surface_heights(data_series_new, data_series_old):
     """
-    Align two surface height time series based on the gap between their end and start.
+    Align two surface height time series based on the gap between their end and
+    start.
 
-    If the gap between the end of `data_series_old` and the start of `data_series_new` is less than a week,
-    the function aligns them based on the median value of the last week of `data_series_old` and the first
-    week of `data_series_new`. If the gap is larger than a week, it aligns them using a linear fit.
-    If there is overlap, the function uses the overlapping period to adjust the newer time series.
+    If the gap between the end of `data_series_old` and the start of `data_series_new`
+    is less than a week, the function aligns them based on the median value of
+    the last week of `data_series_old` and the first week of `data_series_new`.
+    If the gap is larger than a week, it aligns them using a linear fit. If
+    there is overlap, the function uses the overlapping period to adjust the
+    newer time series.
 
     Parameters
     ----------
@@ -181,15 +184,22 @@ def align_surface_heights(data_series_new, data_series_old):
     if first_new_idx <= last_old_idx:
         # Find the overlapping period
         overlap_start = first_new_idx
-        overlap_end = min(last_old_idx, data_series_new.last_valid_index())
+        overlap_end = min(last_old_idx, overlap_start + pd.to_timedelta('7D'))
 
         # Compute the median values for the overlapping period
         overlap_old = data_series_old[overlap_start:overlap_end].median()
         overlap_new = data_series_new[overlap_start:overlap_end].median()
 
+        if np.isnan(overlap_old) or np.isnan(overlap_new):
+            overlap_end = min(last_old_idx, data_series_new.last_valid_index())
+
+            # Compute the median values for the overlapping period
+            overlap_old = data_series_old[overlap_start:overlap_end].median()
+            overlap_new = data_series_new[overlap_start:overlap_end].median()
+
         # Align based on the overlapping median values
         data_series_new = data_series_new - overlap_new + overlap_old
-
+
     elif (first_new_idx - last_old_idx).days <= 7:
         # Compute the median of the last week of data in the old series
         last_week_old = data_series_old[last_old_idx - pd.Timedelta(weeks=1):last_old_idx].median()
@@ -232,6 +242,7 @@ def build_station_list(config_folder: str, target_station_site: str) -> list:
 
     """
     station_info_list = []  # Initialize an empty list to store station information
+    found_as_station = False
     for filename in os.listdir(config_folder):
         if filename.endswith(".toml"):
             file_path = os.path.join(config_folder, filename)
@@ -242,10 +253,25 @@ def build_station_list(config_folder: str, target_station_site: str) -> list:
             stid = data.get("stid")  # Get the station ID
 
             # Check if the station site matches the target and stid is unique
+            if stid == target_station_site:
+                found_as_station = True
             if station_site == target_station_site and stid:
                 station_info = data.copy()  # Copy all attributes from the TOML file
                 station_info_list.append(station_info)  # Add the station info to the list
 
+
+    if len(station_info_list) == 0 and not found_as_station:
+        logger.error('\n***\nNo station_configuration file found for %s.\nProcessing it as a single-station PROMICE site.\n***'%target_station_site)
+        station_info = {
+            "stid": target_station_site,
+            "station_site": target_station_site,
+            "project": "PROMICE",
+            "location_type": "ice sheet",
+        }
+        station_info_list.append(station_info)
+    elif len(station_info_list) == 0:
+        logger.error('\n***\nThe name \"%s\" passed to join_l3 is a station name and not a site name (e.g. SCO_Lv3 instead of SCO_L). Please provide a site name that is listed at least once in the \"station_site\" attribute of the station configuration files.\n***'%target_station_site)
+
     return station_info_list
 
 def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, metadata):
@@ -264,7 +290,7 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me
             filepath = os.path.join(folder_gcnet, stid+'.csv')
             isNead = True
         if not os.path.isfile(filepath):
-            logger.info(stid+' was listed as station but could not be found in '+folder_l3+' nor '+folder_gcnet)
+            logger.error('\n***\n'+stid+' was listed as station but could not be found in '+folder_l3+' nor '+folder_gcnet+'\n***')
             continue
 
         l3, _ = loadArr(filepath, isNead)
@@ -283,6 +309,7 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me
     logger.info('joining %s' % ' '.join(sorted_stids))
 
     l3_merged = None
+
     for l3, station_info in sorted_list_station_data:
         stid = station_info["stid"]
 
@@ -346,7 +373,8 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me
 
     # Assign site id
     if not l3_merged:
-        logger.error('No level 2 data file found for '+site)
+        logger.error('No level 3 station data file found for '+site)
+        return None, sorted_list_station_data
     l3_merged.attrs['site_id'] = site
     l3_merged.attrs['stations'] = ' '.join(sorted_stids)
     l3_merged.attrs['level'] = 'L3'
diff --git a/src/pypromice/process/write.py b/src/pypromice/process/write.py
index 28adbfd5..db513f23 100644
--- a/src/pypromice/process/write.py
+++ b/src/pypromice/process/write.py
@@ -32,6 +32,9 @@ def prepare_and_write(dataset, outpath, vars_df=None, meta_dict=None, time='60mi
     if resample:
         d2 = resample_dataset(dataset, time)
         logger.info('Resampling to '+str(time))
+        if len(d2.time) == 1:
+            logger.warning('Output of resample has length 1. Not enough data to calculate daily/monthly average.')
+            return None
     else:
         d2 = dataset.copy()
 
@@ -275,17 +278,17 @@ def addMeta(ds, meta):
     # https://wiki.esipfed.org/Attribute_Convention_for_Data_Discovery_1-3
 
     # Determine the temporal resolution
-    time_diff = pd.Timedelta((ds['time'][1] - ds['time'][0]).values)
-    if time_diff == pd.Timedelta('10min'):
-        sample_rate = "10min"
-    elif time_diff == pd.Timedelta('1H'):
-        sample_rate = "hourly"
-    elif time_diff == pd.Timedelta('1D'):
-        sample_rate = "daily"
-    elif 28 <= time_diff.days <= 31:
-        sample_rate = "monthly"
-    else:
-        sample_rate = "unknown_sample_rate"
+    sample_rate = "unknown_sample_rate"
+    if len(ds['time']) > 1:
+        time_diff = pd.Timedelta((ds['time'][1] - ds['time'][0]).values)
+        if time_diff == pd.Timedelta('10min'):
+            sample_rate = "10min"
+        elif time_diff == pd.Timedelta('1H'):
+            sample_rate = "hourly"
+        elif time_diff == pd.Timedelta('1D'):
+            sample_rate = "daily"
+        elif 28 <= time_diff.days <= 31:
+            sample_rate = "monthly"
 
     if 'station_id' in ds.attrs.keys():
         ds.attrs['id'] = 'dk.geus.promice.station.' + ds.attrs['station_id']+'.'+sample_rate
diff --git a/src/pypromice/qc/persistence.py b/src/pypromice/qc/persistence.py
index 82fe6df8..2146cf0c 100644
--- a/src/pypromice/qc/persistence.py
+++ b/src/pypromice/qc/persistence.py
@@ -82,7 +82,7 @@ def persistence_qc(
     if variable_thresholds is None:
         variable_thresholds = DEFAULT_VARIABLE_THRESHOLDS
 
-    logger.info(f"Running persistence_qc using {variable_thresholds}")
+    logger.debug(f"Running persistence_qc using {variable_thresholds}")
 
     for k in variable_thresholds.keys():
         if k in ["t", "p", "rh", "wspd", "wdir", "z_boom"]:
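
Note: the following is a minimal, self-contained sketch (not part of the patch)
of the overlap handling introduced above in align_surface_heights. The series,
dates, and values are synthetic and purely illustrative: the overlap window is
first capped at seven days, and the full overlapping period is used as a
fallback when the capped window contains no valid data.

import numpy as np
import pandas as pd

# Two synthetic surface-height series with a one-month overlap; the "new"
# logger reads roughly 2 m higher than the "old" one.
old = pd.Series(np.linspace(0.0, -1.8, 181),
                index=pd.date_range('2023-01-01', '2023-06-30', freq='D'))
new = pd.Series(np.linspace(2.0, 0.2, 214),
                index=pd.date_range('2023-06-01', '2023-12-31', freq='D'))

first_new_idx = new.first_valid_index()
last_old_idx = old.last_valid_index()

if first_new_idx <= last_old_idx:
    overlap_start = first_new_idx
    # Cap the overlap window at 7 days so a long, drifting overlap does not
    # bias the offset estimate
    overlap_end = min(last_old_idx, overlap_start + pd.to_timedelta('7D'))
    overlap_old = old[overlap_start:overlap_end].median()
    overlap_new = new[overlap_start:overlap_end].median()
    if np.isnan(overlap_old) or np.isnan(overlap_new):
        # Fall back to the full overlapping period
        overlap_end = min(last_old_idx, new.last_valid_index())
        overlap_old = old[overlap_start:overlap_end].median()
        overlap_new = new[overlap_start:overlap_end].median()
    # Shift the newer series onto the older series' datum
    new = new - overlap_new + overlap_old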
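
A second standalone sketch, showing the length guard added around the rolling
median in process_surface_height: with more than a week of hourly data, a
14-day centred median is applied and the first and last week are overprinted
with a 1-day median to limit the edge effect of the rolling window; shorter
series go straight to the 1-day median. The input series here is synthetic.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
ts_interpolated = pd.Series(
    rng.normal(0, 0.05, 24 * 30).cumsum(),
    index=pd.date_range('2023-07-01', periods=24 * 30, freq='H'))

if len(ts_interpolated) > 24 * 7:
    z_ice_surf = (ts_interpolated
                  .rolling('14D', center=True, min_periods=1)
                  .median())
    # Overprint the first and last 7 days with a 1-day median because of the
    # edge effect of the rolling window
    z_ice_surf.iloc[:24 * 7] = (ts_interpolated.iloc[:24 * 7]
                                .rolling('1D', center=True, min_periods=1)
                                .median().values)
    z_ice_surf.iloc[-24 * 7:] = (ts_interpolated.iloc[-24 * 7:]
                                 .rolling('1D', center=True, min_periods=1)
                                 .median().values)
else:
    z_ice_surf = (ts_interpolated
                  .rolling('1D', center=True, min_periods=1)
                  .median())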