making surface height calc failsafe
+ caught exception when there's insufficient data for resample
+ set default values of station_config when no config file is found
+ better adjustment of height in join_l3 when there's overlap between old and new data
BaptisteVandecrux committed Aug 14, 2024
1 parent 8098629 commit bd1a8ce
Showing 5 changed files with 98 additions and 42 deletions.
50 changes: 31 additions & 19 deletions src/pypromice/process/L2toL3.py
@@ -105,7 +105,11 @@ def toL3(L2, station_config={}, T_0=273.15):
ds[var.replace('gps_','')] = ('time', gps_coordinate_postprocessing(ds, var, station_config))

# processing continuous surface height, ice surface height, snow height
ds = process_surface_height(ds, station_config)
try:
ds = process_surface_height(ds, station_config)
except Exception as e:
logger.error("Error processing surface height at %s"%station_config['stid'])
logging.error(e, exc_info=True)

# making sure dataset has the attributes contained in the config files
ds.attrs['project'] = station_config['project']
@@ -168,28 +172,36 @@ def process_surface_height(ds, station_config={}):

# Convert to dataframe and combine surface height variables
df_in = ds[[v for v in ['z_surf_1', 'z_surf_2', 'z_ice_surf'] if v in ds.data_vars]].to_dataframe()

(ds['z_surf_combined'], ds['z_ice_surf'],
ds['z_surf_1_adj'], ds['z_surf_2_adj'])= combine_surface_height(df_in, ds.attrs['site_type'])
ds['z_surf_1_adj'], ds['z_surf_2_adj']) = combine_surface_height(df_in, ds.attrs['site_type'])


if ds.attrs['site_type'] == 'ablation':
# Calculate rolling minimum for ice surface height and snow height
ts_interpolated = np.minimum(xr.where(ds.z_ice_surf.notnull(),ds.z_ice_surf,ds.z_surf_combined),
ds.z_surf_combined).to_series().resample('H').interpolate(limit=72)

# Apply the rolling window with median calculation
z_ice_surf = (ts_interpolated
.rolling('14D', center=True, min_periods=1)
.median())

# Overprint the first and last 7 days with interpolated values
# because of edge effect of rolling windows
z_ice_surf.iloc[:24*7] = (ts_interpolated.iloc[:24*7]
.rolling('1D', center=True, min_periods=1)
.median().values)
z_ice_surf.iloc[-24*7:] = (ts_interpolated.iloc[-24*7:]
.rolling('1D', center=True, min_periods=1)
.median().values)

ts_interpolated = np.minimum(
xr.where(ds.z_ice_surf.notnull(),
ds.z_ice_surf,ds.z_surf_combined),
ds.z_surf_combined).to_series().resample('H').interpolate(limit=72)

if len(ts_interpolated)>24*7:
# Apply the rolling window with median calculation
z_ice_surf = (ts_interpolated
.rolling('14D', center=True, min_periods=1)
.median())
# Overprint the first and last 7 days with interpolated values
# because of edge effect of rolling windows
z_ice_surf.iloc[:24*7] = (ts_interpolated.iloc[:24*7]
.rolling('1D', center=True, min_periods=1)
.median().values)
z_ice_surf.iloc[-24*7:] = (ts_interpolated.iloc[-24*7:]
.rolling('1D', center=True, min_periods=1)
.median().values)
else:
z_ice_surf = (ts_interpolated
.rolling('1D', center=True, min_periods=1)
.median())

z_ice_surf = z_ice_surf.loc[ds.time]
# here we make sure that the periods where both z_stake and z_pt are
# missing are also missing in z_ice_surf
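
For context, the 14-day rolling median above is only meaningful when more than a week of hourly data is available, which is what the new length check guards against. The following standalone sketch (hypothetical data, not part of the commit) reproduces that guarded pattern on a synthetic hourly series, including the 1-day-window overwrite of the first and last week that limits the rolling-window edge effect; it assumes pandas >= 1.3 for centred time-based rolling windows.

import numpy as np
import pandas as pd

# Hypothetical hourly series standing in for ts_interpolated (30 days of data)
idx = pd.date_range("2024-01-01", periods=24 * 30, freq="h")
ts_interpolated = pd.Series(np.linspace(0.0, -0.5, len(idx)), index=idx)

if len(ts_interpolated) > 24 * 7:
    # Enough data: 14-day centred rolling median, as in the commit
    z_ice_surf = ts_interpolated.rolling("14D", center=True, min_periods=1).median()
    # Overwrite the first and last week with a 1-day window to limit edge effects
    z_ice_surf.iloc[:24 * 7] = (ts_interpolated.iloc[:24 * 7]
                                .rolling("1D", center=True, min_periods=1)
                                .median().values)
    z_ice_surf.iloc[-24 * 7:] = (ts_interpolated.iloc[-24 * 7:]
                                 .rolling("1D", center=True, min_periods=1)
                                 .median().values)
else:
    # Too short for a 14-day window: use the 1-day window everywhere
    z_ice_surf = ts_interpolated.rolling("1D", center=True, min_periods=1).median()
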
17 changes: 15 additions & 2 deletions src/pypromice/process/get_l2tol3.py
@@ -50,8 +50,21 @@ def get_l2tol3(config_folder, inpath, outpath, variables, metadata):

# importing station_config (dict) from config_folder (str path)
config_file = config_folder+l2.attrs['station_id']+'.toml'
with open(config_file) as fp:
station_config = toml.load(fp)

if os.path.exists(config_file):
# File exists, load the configuration
with open(config_file) as fp:
station_config = toml.load(fp)
else:
# File does not exist, initialize with standard info
# this was preferred by RSF over exiting with error
logger.error("\n***\nNo station_configuration file for %s.\nPlease create one on AWS-L0/metadata/station_configurations.\n***"%l2.attrs['station_id'])
station_config = {"stid":l2.attrs['station_id'],
"station_site":l2.attrs['station_id'],
"project": "PROMICE",
"location_type": "ice sheet",
}

# Perform Level 3 processing
l3 = toL3(l2, station_config)

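
The fallback above avoids aborting L3 processing when a station has no configuration TOML. A minimal, hypothetical helper sketching the same behaviour (the function name load_station_config, the folder path, and the station id are illustrative, not part of pypromice):

import logging
import os

import toml

logger = logging.getLogger(__name__)

def load_station_config(config_folder, station_id):
    # Hypothetical helper mirroring the fallback in get_l2tol3
    config_file = os.path.join(config_folder, station_id + ".toml")
    if os.path.exists(config_file):
        with open(config_file) as fp:
            return toml.load(fp)
    # No configuration file found: warn and fall back to a default PROMICE entry
    logger.error("No station_configuration file for %s", station_id)
    return {
        "stid": station_id,
        "station_site": station_id,
        "project": "PROMICE",
        "location_type": "ice sheet",
    }

# Hypothetical folder and station id, for illustration only
station_config = load_station_config("metadata/station_configurations/", "XXX_U")
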
46 changes: 37 additions & 9 deletions src/pypromice/process/join_l3.py
@@ -154,12 +154,15 @@ def loadArr(infile, isNead):

def align_surface_heights(data_series_new, data_series_old):
"""
Align two surface height time series based on the gap between their end and start.
Align two surface height time series based on the gap between their end and
start.
If the gap between the end of `data_series_old` and the start of `data_series_new` is less than a week,
the function aligns them based on the median value of the last week of `data_series_old` and the first
week of `data_series_new`. If the gap is larger than a week, it aligns them using a linear fit.
If there is overlap, the function uses the overlapping period to adjust the newer time series.
If the gap between the end of `data_series_old` and the start of `data_series_new`
is less than a week, the function aligns them based on the median value of
the last week of `data_series_old` and the first week of `data_series_new`.
If the gap is larger than a week, it aligns them using a linear fit. If
there is overlap, the function uses the overlapping period to adjust the
newer time series.
Parameters
----------
@@ -181,15 +184,22 @@ def align_surface_heights(data_series_new, data_series_old):
if first_new_idx <= last_old_idx:
# Find the overlapping period
overlap_start = first_new_idx
overlap_end = min(last_old_idx, data_series_new.last_valid_index())
overlap_end = min(last_old_idx, overlap_start+pd.to_timedelta('7D'))

# Compute the median values for the overlapping period
overlap_old = data_series_old[overlap_start:overlap_end].median()
overlap_new = data_series_new[overlap_start:overlap_end].median()

if np.isnan(overlap_old) or np.isnan(overlap_new):
overlap_end = min(last_old_idx, data_series_new.last_valid_index())

# Compute the median values for the overlapping period
overlap_old = data_series_old[overlap_start:overlap_end].median()
overlap_new = data_series_new[overlap_start:overlap_end].median()

# Align based on the overlapping median values
data_series_new = data_series_new - overlap_new + overlap_old

elif (first_new_idx - last_old_idx).days <= 7:
# Compute the median of the last week of data in the old series
last_week_old = data_series_old[last_old_idx - pd.Timedelta(weeks=1):last_old_idx].median()
@@ -232,6 +242,7 @@ def build_station_list(config_folder: str, target_station_site: str) -> list:
"""
station_info_list = [] # Initialize an empty list to store station information

found_as_station = False
for filename in os.listdir(config_folder):
if filename.endswith(".toml"):
file_path = os.path.join(config_folder, filename)
Expand All @@ -242,10 +253,25 @@ def build_station_list(config_folder: str, target_station_site: str) -> list:
stid = data.get("stid") # Get the station ID

# Check if the station site matches the target and stid is unique
if stid == target_station_site:
found_as_station = True
if station_site == target_station_site and stid:
station_info = data.copy() # Copy all attributes from the TOML file
station_info_list.append(station_info) # Add the station info to the list


if len(station_info_list)==0 and not found_as_station:
logger.error('\n***\nNo station_configuration file found for %s.\nProcessing it as a single-station PROMICE site.\n***'%target_station_site)
station_info = {
"stid": target_station_site,
"station_site": target_station_site,
"project": "PROMICE",
"location_type": "ice sheet",
}
station_info_list.append(station_info)
elif len(station_info_list)==0 :
logger.error('\n***\nThe name \"%s\" passed to join_l3 is a station name and not a site name (e.g. SCO_Lv3 instead of SCO_L). Please provide a site name that is named at least once in the "station_site" attribute of the station configuration files.\n***'%target_station_site)

return station_info_list

def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, metadata):
@@ -264,7 +290,7 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me
filepath = os.path.join(folder_gcnet, stid+'.csv')
isNead = True
if not os.path.isfile(filepath):
logger.info(stid+' was listed as station but could not be found in '+folder_l3+' nor '+folder_gcnet)
logger.error('\n***\n'+stid+' was listed as station but could not be found in '+folder_l3+' nor '+folder_gcnet+'\n***')
continue

l3, _ = loadArr(filepath, isNead)
Expand All @@ -283,6 +309,7 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me
logger.info('joining %s' % ' '.join(sorted_stids))

l3_merged = None

for l3, station_info in sorted_list_station_data:
stid = station_info["stid"]

@@ -346,7 +373,8 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me

# Assign site id
if not l3_merged:
logger.error('No level 2 data file found for '+site)
logger.error('No level 3 station data file found for '+site)
return None, sorted_list_station_data
l3_merged.attrs['site_id'] = site
l3_merged.attrs['stations'] = ' '.join(sorted_stids)
l3_merged.attrs['level'] = 'L3'
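
The revised overlap handling in align_surface_heights prefers the first week of overlap for computing the offset between the two series, widening the window only when that week holds no valid data. A rough standalone sketch of that idea on synthetic daily series (illustrative only, not the actual function):

import numpy as np
import pandas as pd

# Two hypothetical overlapping daily surface-height series
old = pd.Series(np.linspace(0.0, -1.0, 30),
                index=pd.date_range("2024-05-01", periods=30, freq="D"))
new = pd.Series(np.linspace(2.0, 1.0, 30),
                index=pd.date_range("2024-05-20", periods=30, freq="D"))

overlap_start = new.first_valid_index()
# Prefer a one-week overlap window for the adjustment
overlap_end = min(old.last_valid_index(), overlap_start + pd.to_timedelta("7D"))
offset_old = old[overlap_start:overlap_end].median()
offset_new = new[overlap_start:overlap_end].median()

if np.isnan(offset_old) or np.isnan(offset_new):
    # Fall back to the full overlapping period if the first week has no valid data
    overlap_end = min(old.last_valid_index(), new.last_valid_index())
    offset_old = old[overlap_start:overlap_end].median()
    offset_new = new[overlap_start:overlap_end].median()

# Shift the newer series so the two agree over the overlap
new_adjusted = new - offset_new + offset_old
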
25 changes: 14 additions & 11 deletions src/pypromice/process/write.py
@@ -32,6 +32,9 @@ def prepare_and_write(dataset, outpath, vars_df=None, meta_dict=None, time='60mi
if resample:
d2 = resample_dataset(dataset, time)
logger.info('Resampling to '+str(time))
if len(d2.time) == 1:
logger.warning('Output of resample has length 1. Not enough data to calculate daily/monthly average.')
return None
else:
d2 = dataset.copy()

@@ -275,17 +278,17 @@ def addMeta(ds, meta):
# https://wiki.esipfed.org/Attribute_Convention_for_Data_Discovery_1-3

# Determine the temporal resolution
time_diff = pd.Timedelta((ds['time'][1] - ds['time'][0]).values)
if time_diff == pd.Timedelta('10min'):
sample_rate = "10min"
elif time_diff == pd.Timedelta('1H'):
sample_rate = "hourly"
elif time_diff == pd.Timedelta('1D'):
sample_rate = "daily"
elif 28 <= time_diff.days <= 31:
sample_rate = "monthly"
else:
sample_rate = "unknown_sample_rate"
sample_rate = "unknown_sample_rate"
if len(ds['time'])>1:
time_diff = pd.Timedelta((ds['time'][1] - ds['time'][0]).values)
if time_diff == pd.Timedelta('10min'):
sample_rate = "10min"
elif time_diff == pd.Timedelta('1H'):
sample_rate = "hourly"
elif time_diff == pd.Timedelta('1D'):
sample_rate = "daily"
elif 28 <= time_diff.days <= 31:
sample_rate = "monthly"

if 'station_id' in ds.attrs.keys():
ds.attrs['id'] = 'dk.geus.promice.station.' + ds.attrs['station_id']+'.'+sample_rate
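
The reworked block above defaults to "unknown_sample_rate" and only inspects the first time step when at least two timestamps exist, so single-row datasets no longer fail. A small, hypothetical sketch of the same logic wrapped in a helper (infer_sample_rate is illustrative, not part of pypromice):

import pandas as pd
import xarray as xr

def infer_sample_rate(ds):
    # Hypothetical helper mirroring the guarded logic in addMeta
    sample_rate = "unknown_sample_rate"
    if len(ds["time"]) > 1:
        time_diff = pd.Timedelta((ds["time"][1] - ds["time"][0]).values)
        if time_diff == pd.Timedelta("10min"):
            sample_rate = "10min"
        elif time_diff == pd.Timedelta("1h"):
            sample_rate = "hourly"
        elif time_diff == pd.Timedelta("1D"):
            sample_rate = "daily"
        elif 28 <= time_diff.days <= 31:
            sample_rate = "monthly"
    return sample_rate

# A single-timestamp dataset falls back to "unknown_sample_rate"
ds = xr.Dataset(coords={"time": pd.date_range("2024-08-01", periods=1, freq="h")})
print(infer_sample_rate(ds))
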
2 changes: 1 addition & 1 deletion src/pypromice/qc/persistence.py
@@ -82,7 +82,7 @@ def persistence_qc(
if variable_thresholds is None:
variable_thresholds = DEFAULT_VARIABLE_THRESHOLDS

logger.info(f"Running persistence_qc using {variable_thresholds}")
logger.debug(f"Running persistence_qc using {variable_thresholds}")

for k in variable_thresholds.keys():
if k in ["t", "p", "rh", "wspd", "wdir", "z_boom"]:
