From fced84af6ba46c6d0f2da95d6810d8756ea903c5 Mon Sep 17 00:00:00 2001
From: BaptisteVandecrux <b.vandecrux@gmail.com>
Date: Mon, 29 Jul 2024 23:01:22 +0200
Subject: [PATCH] small adjustments following latest assessment

---
 src/pypromice/process/L2toL3.py | 149 +++++++++++++++++++++++++-------
 1 file changed, 116 insertions(+), 33 deletions(-)

diff --git a/src/pypromice/process/L2toL3.py b/src/pypromice/process/L2toL3.py
index 650f0702..23cdd97c 100755
--- a/src/pypromice/process/L2toL3.py
+++ b/src/pypromice/process/L2toL3.py
@@ -142,7 +142,7 @@ def process_surface_height(ds, station_config={}):
         # Calculate surface heights for ablation sites
         ds['z_surf_1'] = 2.6 - ds['z_boom_u']
         if ds.z_stake.notnull().any():
-            first_valid_index = ds.time.where(ds.z_stake.notnull(), drop=True).data[0]
+            first_valid_index = ds.time.where((ds.z_stake + ds.z_boom_u).notnull(), drop=True).data[0]
             ds['z_surf_2'] = ds.z_surf_1.sel(time=first_valid_index) + ds.z_stake.sel(time=first_valid_index) - ds['z_stake']
         
         # Use corrected point data if available
@@ -153,24 +153,45 @@ def process_surface_height(ds, station_config={}):
         # Calculate surface heights for other site types
         first_valid_index = ds.time.where(ds.z_boom_u.notnull(), drop=True).data[0]
         ds['z_surf_1'] = ds.z_boom_u.sel(time=first_valid_index) - ds['z_boom_u']
-        if 'z_boom_l' in ds.data_vars:
-            first_valid_index = ds.time.where(ds.z_boom_l.notnull(), drop=True).data[0]
-            ds['z_surf_2'] = ds.z_boom_l.sel(time=first_valid_index) - ds['z_boom_l']
         if 'z_stake' in ds.data_vars and ds.z_stake.notnull().any():
             first_valid_index = ds.time.where(ds.z_stake.notnull(), drop=True).data[0]
-            ds['z_surf_2'] = ds.z_stake.sel(time=first_valid_index) - ds['z_stake']        
+            ds['z_surf_2'] = ds.z_stake.sel(time=first_valid_index) - ds['z_stake'] 
+        if 'z_boom_l' in ds.data_vars:
+            # need a combine first because KAN_U switches from having a z_stake
+            # to having a z_boom_l
+            first_valid_index = ds.time.where(ds.z_boom_l.notnull(), drop=True).data[0]
+            ds['z_surf_2'] = ds['z_surf_2'].combine_first(
+                ds.z_boom_l.sel(time=first_valid_index) - ds['z_boom_l'])
 
     # Adjust data for the created surface height variables
     ds = adjustData(ds, var_list=['z_surf_1', 'z_surf_2', 'z_ice_surf'])
 
     # Convert to dataframe and combine surface height variables
     df_in = ds[[v for v in ['z_surf_1', 'z_surf_2', 'z_ice_surf'] if v in ds.data_vars]].to_dataframe()
-    ds['z_surf_combined'], ds['z_ice_surf'] = combine_surface_height(df_in, ds.attrs['site_type'])
+    (ds['z_surf_combined'], ds['z_ice_surf'],
+         ds['z_surf_1_adj'], ds['z_surf_2_adj'])= combine_surface_height(df_in, ds.attrs['site_type'])
 
     if ds.attrs['site_type'] == 'ablation':
         # Calculate rolling minimum for ice surface height and snow height
-        ds['z_ice_surf'] = ds.z_surf_combined.to_series().rolling('365D').min()
-        ds['snow_height'] = ds['z_surf_combined'] - ds['z_ice_surf']
+        ts_interpolated = ds.z_surf_combined.to_series().resample('H').interpolate(limit=72)
+        
+        # Apply the rolling window with median calculation
+        z_ice_surf = (ts_interpolated
+                      .rolling('14D', center=True, min_periods=1)
+                      .median())
+        
+        # Overprint the first and last 7 days with interpolated values
+        # because of edge effect of rolling windows
+        z_ice_surf.iloc[:24*7] = (ts_interpolated.iloc[:24*7]
+                                  .rolling('1D', center=True, min_periods=1)
+                                  .median().values)
+        z_ice_surf.iloc[-24*7:] = (ts_interpolated.iloc[-24*7:]
+                                   .rolling('1D', center=True, min_periods=1)
+                                   .median().values)
+        
+        ds['z_ice_surf'] = z_ice_surf.cummin()
+        
+        ds['snow_height'] = np.maximum(0, ds['z_surf_combined'] - ds['z_ice_surf'])
     elif ds.attrs['site_type'] in ['accumulation', 'bedrock']:
         # Handle accumulation and bedrock site types
         ds['z_ice_surf'] = ('time', ds['z_surf_1'].data * np.nan)
@@ -241,8 +262,9 @@ def combine_surface_height(df, site_type, threshold_ablation = -0.0002):
         elif df.z_surf_2_adj.notnull().any():
             df['z_surf_combined'] = df.z_surf_2_adj.values
 
-        df["z_surf_combined"] = hampel(df["z_surf_combined"].interpolate(limit=72)).values
-        return df["z_surf_combined"], df["z_surf_combined"]*np.nan
+        # df["z_surf_combined"] = hampel(df["z_surf_combined"].interpolate(limit=72)).values
+        return (df['z_surf_combined'], df["z_surf_combined"]*np.nan, 
+                    df["z_surf_1_adj"], df["z_surf_2_adj"])
 
     else:
         logger.info('-> ablation site')
@@ -251,6 +273,9 @@ def combine_surface_height(df, site_type, threshold_ablation = -0.0002):
         df["z_surf_1_adj"] = hampel(df["z_surf_1"].interpolate(limit=72)).values
         df["z_surf_2_adj"] = hampel(df["z_surf_2"].interpolate(limit=72)).values
 
+        df["z_surf_1_adj"] = hampel(df["z_surf_1"].interpolate(limit=72), k=24, t0=5).values
+        df["z_surf_2_adj"] = hampel(df["z_surf_2"].interpolate(limit=72), k=24, t0=5).values
+
         # defining ice ablation period from the decrease of a smoothed version of z_pt
         # meaning when smoothed_z_pt.diff() < threshold_ablation 
         # first smoothing
@@ -263,7 +288,7 @@ def combine_surface_height(df, site_type, threshold_ablation = -0.0002):
         smoothed_PT = smoothed_PT.rolling('14D', center=True, min_periods=1).mean()
 
         smoothed_PT = smoothed_PT.reindex(df.index,method='ffill')
-        smoothed_PT.loc[df.z_ice_surf.isnull()] = np.nan
+        # smoothed_PT.loc[df.z_ice_surf.isnull()] = np.nan
 
         # logical index where ablation is detected
         ind_ablation = np.logical_and(smoothed_PT.diff().values < threshold_ablation, 
@@ -272,6 +297,16 @@ def combine_surface_height(df, site_type, threshold_ablation = -0.0002):
         # finding the beginning and end of each period with True
         idx = np.argwhere(np.diff(np.r_[False,ind_ablation, False])).reshape(-1, 2)
         idx[:, 1] -= 1
+        
+        # because the smooth_PT sees 7 days ahead, it starts showing a decline 
+        # 7 days in advance, we therefore need to exclude the first 7 days of 
+        # each ablation period
+        for start, end in idx:
+            period_start = df.index[start]
+            period_end = period_start + pd.Timedelta(days=7)
+            exclusion_period = (df.index >= period_start) & (df.index < period_end)
+            ind_ablation[exclusion_period] = False
+            
         # fill small gaps in the ice ablation periods.
         for i in range(len(idx)-1):
             ind = idx[i]
@@ -287,6 +322,7 @@ def combine_surface_height(df, site_type, threshold_ablation = -0.0002):
         z=df["z_ice_surf_adj"].interpolate(limit=24*2).copy()
 
         # the surface heights are adjusted so that they start at 0
+
         if any(~np.isnan(hs1.iloc[:24*7])):
             hs1 = hs1 - hs1.iloc[:24*7].mean()
         if any(~np.isnan(hs2.iloc[:24*7])):
@@ -337,9 +373,11 @@ def combine_surface_height(df, site_type, threshold_ablation = -0.0002):
                 ind_abl_yr = np.logical_and(ind_yr, df.index.month.isin([6,7,8]))
                 ind_ablation[ind_yr] = ind_abl_yr[ind_yr]
                 logger.debug(str(y)+' no z_ice_surf, just using JJA')
+                
+            else:
+                logger.debug(str(y)+ ' derived from z_ice_surf')
 
             if np.any(ind_abl_yr):
-                logger.debug(str(y)+ ' derived from z_ice_surf')
                 # if there are some ablation flagged for that year
                 # then find begining and end
                 ind_start[i] = np.argwhere(ind_abl_yr)[0][0]
@@ -358,6 +396,8 @@ def combine_surface_height(df, site_type, threshold_ablation = -0.0002):
         # to hs1 and hs2 the year after.
 
         for i, y in enumerate(years):
+            # if y == 2021:
+            #     import pdb; pdb.set_trace()
             logger.debug(str(y))
             # defining subsets of hs1, hs2, z
             hs1_jja =  hs1[str(y)+'-06-01':str(y)+'-09-01']
@@ -394,28 +434,52 @@ def combine_surface_height(df, site_type, threshold_ablation = -0.0002):
                 # we need to adjust z to match hs2 when ablation starts
                 hs2_ref = 1
 
-
+            # adjustment at the start of the ablation season
             if hs2_ref:
+                # if hs2 has been taken as reference in the previous years
+                # then we check if pressure transducer is reinstalled and needs
+                # to be adjusted
                 if ind_start[i] != -999:
                     # the first year there is both ablation and PT data available
                     # then PT is adjusted to hs2
                     if any(~np.isnan(z_ablation)) and any(~np.isnan(hs2_ablation)):
-                        logger.debug('adjusting z to hs2')
                         tmp1 = z_ablation.copy()
                         tmp2 = hs2_ablation.copy()
-                        tmp1[np.isnan(tmp2)] = np.nan
-                        tmp2[np.isnan(tmp1)] = np.nan
+                        # tmp1[np.isnan(tmp2)] = np.nan
+                        # tmp2[np.isnan(tmp1)] = np.nan
 
                         # in some instances, the PT data is available but no ablation
                         # is recorded, then hs2 remains the reference during that time.
                         # When eventually there is ablation, then we need to find the 
                         # first index in these preceding ablation-free years
                         # the shift will be applied back from this point
-                        first_index = z[:z[str(y)].first_valid_index()].isnull().iloc[::-1].idxmax()
-                        z[first_index:] = z[first_index:] -  np.nanmean(tmp1)  +  np.nanmean(tmp2)
+                        # first_index = z[:z[str(y)].first_valid_index()].isnull().iloc[::-1].idxmax()
+                        # z[first_index:] = z[first_index:] -  np.nanmean(tmp1)  +  np.nanmean(tmp2)
+                        # hs2_ref = 0 # from now on PT is the reference
+
+                        # in some other instance, z just need to be adjusted to hs2
+                        # first_index = z[str(y)].first_valid_index()
+                        first_index = z.iloc[ind_start[i]:].first_valid_index() # of ablation
+                        if np.isnan(hs2[first_index]):
+                            first_index_2 = hs2.iloc[ind_start[i]:].first_valid_index()
+                            if (first_index_2 - first_index)>pd.Timedelta('30d'):
+                                logger.debug('adjusting z to hs1')
+                                if np.isnan(hs1[first_index]):
+                                    first_index = hs1.iloc[ind_start[i]:].first_valid_index()
+                                z[first_index:] = z[first_index:] -  z[first_index]   +  hs1[first_index]
+                            else:
+                                logger.debug('adjusting z to hs1')
+                                first_index = hs2.iloc[ind_start[i]:].first_valid_index()
+                                z[first_index:] = z[first_index:] -  z[first_index]   +  hs2[first_index] 
+                        else:
+                            logger.debug('adjusting z to hs1')                            
+                            z[first_index:] = z[first_index:] -  z[first_index]   +  hs2[first_index] 
                         hs2_ref = 0 # from now on PT is the reference
-            else:         
-                # if there is some ablation
+                        
+                        
+            else:
+                # if z_pt is the reference and there is some ablation
+                # then hs1 and hs2 are adjusted to z_pt
                 if (ind_start[i] != -999) & z_year.notnull().any():
                     # calculating first index with PT, hs1 and hs2
                     first_index = z_year.first_valid_index()
@@ -447,10 +511,13 @@ def combine_surface_height(df, site_type, threshold_ablation = -0.0002):
                                             -  np.nanmean(hs2[first_index:first_index+pd.to_timedelta('1D')])  \
                                                 +  np.nanmean(z[first_index:first_index+pd.to_timedelta('1D')])
 
-            # adapting hs2
+            # adjustment taking place at the end of the ablation period
             if (ind_end[i] != -999): 
-                # and if there are PT data available at the end of the melt season
-                if np.any(~np.isnan(z.iloc[(ind_end[i]-24*7):(ind_end[i]+24*7)])):
+                # if y == 2023:
+                #     import pdb; pdb.set_trace()
+                # if there's ablation and
+                # if there are PT data available at the end of the melt season
+                if z.iloc[(ind_end[i]-24*7):(ind_end[i]+24*7)].notnull().any():
                     logger.debug('adjusting hs2 to z')
                     # then we adjust hs2 to the end-of-ablation z    
                     # first trying at the end of melt season
@@ -509,7 +576,7 @@ def combine_surface_height(df, site_type, threshold_ablation = -0.0002):
                     logger.debug('adjusting hs1')
                     # and if there are some hs2 during the accumulation period
                     if any(~np.isnan(hs2_following_winter)):
-                        logger.debug('to hs2')
+                        logger.debug('to hs2, minimizing winter difference')
                         # then we adjust hs1 to hs2 during the accumulation area
                         # adjustment is done so that the mean hs1 and mean hs2 match 
                         # for the period when both are available
@@ -518,25 +585,40 @@ def combine_surface_height(df, site_type, threshold_ablation = -0.0002):
 
                         tmp1[np.isnan(tmp2)] = np.nan
                         tmp2[np.isnan(tmp1)] = np.nan
+                        if tmp1.isnull().all():
+                            tmp1 = hs1_following_winter.copy()
+                            tmp2 = hs2_following_winter.copy()
 
+                            tmp1[np.isnan(tmp2)] = np.nan
+                            tmp2[np.isnan(tmp1)] = np.nan
                         hs1.iloc[ind_end[i]:] = hs1.iloc[ind_end[i]:] -  np.nanmean(tmp1)  +  np.nanmean(tmp2)
 
                     # if no hs2, then use PT data available at the end of the melt season
-                    elif np.any(~np.isnan(z.iloc[(ind_end[i]-24*7):(ind_end[i]+24*7)])):
+                    elif np.any(~np.isnan(z.iloc[(ind_end[i]-24*14):(ind_end[i]+24*7)])):
                         logger.debug('to z')
                         # then we adjust hs2 to the end-of-ablation z    
                         # first trying at the end of melt season
-                        if ~np.isnan(np.nanmean(hs1.iloc[(ind_end[i]-24*7):(ind_end[i]+24*30)])): 
+                        if ~np.isnan(np.nanmean(hs1.iloc[(ind_end[i]-24*14):(ind_end[i]+24*30)])): 
                             logger.debug('using end of melt season')
                             hs1.iloc[ind_end[i]:] = hs1.iloc[ind_end[i]:] - \
-                                np.nanmean(hs1.iloc[(ind_end[i]-24*7):(ind_end[i]+24*30)])  + \
-                                    np.nanmean(z.iloc[(ind_end[i]-24*7):(ind_end[i]+24*30)])
+                                np.nanmean(hs1.iloc[(ind_end[i]-24*14):(ind_end[i]+24*30)])  + \
+                                    np.nanmean(z.iloc[(ind_end[i]-24*14):(ind_end[i]+24*30)])
                         # if not possible, then trying the end of the following accumulation season
-                        elif ind_start[i+1]!=-999 and any(~np.isnan(hs1.iloc[(ind_start[i+1]-24*7):(ind_start[i+1]+24*7)]+ z.iloc[(ind_start[i+1]-24*7):(ind_start[i+1]+24*7)])): 
+                        elif ind_start[i+1]!=-999 and any(~np.isnan(hs1.iloc[(ind_start[i+1]-24*14):(ind_start[i+1]+24*7)]+ z.iloc[(ind_start[i+1]-24*14):(ind_start[i+1]+24*7)])): 
                             logger.debug('using end of accumulation season')
                             hs1.iloc[ind_end[i]:] = hs1.iloc[ind_end[i]:] - \
-                                np.nanmean(hs1.iloc[(ind_start[i+1]-24*7):(ind_start[i+1]+24*7)])  + \
-                                    np.nanmean(z.iloc[(ind_start[i+1]-24*7):(ind_start[i+1]+24*7)])
+                                np.nanmean(hs1.iloc[(ind_start[i+1]-24*14):(ind_start[i+1]+24*7)])  + \
+                                    np.nanmean(z.iloc[(ind_start[i+1]-24*14):(ind_start[i+1]+24*7)])
+                    elif any(~np.isnan(hs2_year)):
+                        logger.debug('to the last value of hs2')
+                        # then we adjust hs1 to hs2 during the accumulation area
+                        # adjustment is done so that the mean hs1 and mean hs2 match 
+                        # for the period when both are available
+                        half_span = pd.to_timedelta('7D')
+                        tmp1 = hs1_year.loc[(hs2_year.last_valid_index()-half_span):(hs2_year.last_valid_index()+half_span)].copy()
+                        tmp2 = hs2_year.loc[(hs2_year.last_valid_index()-half_span):(hs2_year.last_valid_index()+half_span)].copy()
+
+                        hs1.iloc[ind_end[i]:] = hs1.iloc[ind_end[i]:] -  np.nanmean(tmp1)  +  np.nanmean(tmp2)
 
         df["z_surf_1_adj"] = hs1.interpolate(limit=2*24).values
         df["z_surf_2_adj"] = hs2.interpolate(limit=2*24).values
@@ -554,6 +636,7 @@ def combine_surface_height(df, site_type, threshold_ablation = -0.0002):
         # 2) we use SR1 when SR2 is not available (commented) 
         # the later one can cause jumps when SR2 starts to be available few days after SR1
         data_update = df["z_surf_2_adj"].interpolate(limit=72).values 
+                
         ind_update = ind_ablation 
         #ind_update = np.logical_and(ind_ablation,  ~np.isnan(data_update))
         df.loc[ind_update,"z_surf_combined"] = data_update[ind_update]  
@@ -563,9 +646,9 @@ def combine_surface_height(df, site_type, threshold_ablation = -0.0002):
         ind_update = np.logical_and(ind_ablation, ~np.isnan(data_update))
         df.loc[ind_update,"z_surf_combined"] = data_update[ind_update] 
     logger.info('surface height combination finished')
-    return df['z_surf_combined'], df["z_ice_surf_adj"] 
+    return df['z_surf_combined'], df["z_ice_surf_adj"], df["z_surf_1_adj"], df["z_surf_2_adj"]
 
-def hampel(vals_orig, k=7*24, t0=3):
+def hampel(vals_orig, k=7*24, t0=15):
     '''
     vals: pandas series of values from which to remove outliers
     k: size of window (including the sample; 7 is equal to 3 on either side of value)