From f4a358fb01a0dc648f02bc6577f137d0b8c6dadb Mon Sep 17 00:00:00 2001 From: BaptisteVandecrux Date: Thu, 27 Jun 2024 11:20:48 +0200 Subject: [PATCH 1/5] adding surface height processing functions --- src/pypromice/process/L2toL3.py | 652 ++++++++++++++++++++++++++++++++ 1 file changed, 652 insertions(+) diff --git a/src/pypromice/process/L2toL3.py b/src/pypromice/process/L2toL3.py index 0db1dffb..a1b91186 100755 --- a/src/pypromice/process/L2toL3.py +++ b/src/pypromice/process/L2toL3.py @@ -6,6 +6,9 @@ import numpy as np import xarray as xr from statsmodels.nonparametric.smoothers_lowess import lowess +from sklearn.linear_model import LinearRegression +from pypromice.qc.github_data_issues import adjustData +from scipy.interpolate import interp1d import logging logger = logging.getLogger(__name__) @@ -14,6 +17,8 @@ def toL3(L2, T_0=273.15): derived variables: - Turbulent fluxes - smoothed and inter/extrapolated GPS coordinates + - continuous surface height, ice surface height, snow height + - thermistor depths Parameters @@ -78,8 +83,655 @@ def toL3(L2, T_0=273.15): # adding average coordinate as attribute for v in ['lat','lon','alt']: ds.attrs[v+'_avg'] = ds[v].mean(dim='time').item() + + # processing continuous surface height, ice surface height, snow height + ds = surfaceHeightProcessing(ds) + + return ds + +def surfaceHeightProcessing(ds): + # creating surface height variables + ds['z_surf_1'] = ('time', ds['z_boom_u'].data*np.nan) + ds['z_surf_2'] = ('time', ds['z_boom_u'].data*np.nan) + + if ds.attrs['site_type'] == 'ablation': + ds['z_surf_1'] = 2.6 - ds['z_boom_u'] + first_valid_index = ds.time.where(ds.z_stake.notnull(),drop=True).data[0] + if first_valid_index: + ds['z_surf_2'] = ds.z_surf_1.sel(time=first_valid_index) + ds.z_stake.sel(time=first_valid_index) - ds['z_stake'] + + if 'z_pt_cor' in ds.data_vars: + ds['z_ice_surf'] = ('time', ds['z_pt_cor'].data) + + else: + first_valid_index = ds.time.where(ds.z_boom_u.notnull(),drop=True).data[0] + ds['z_surf_1'] = ds.z_boom_u.sel(time=first_valid_index) - ds['z_boom_u'] + if 'z_boom_l' in ds.data_vars: + first_valid_index = ds.time.where(ds.z_boom_l.notnull(),drop=True).data[0] + ds['z_surf_2'] = ds.z_boom_l.sel(time=first_valid_index) - ds['z_boom_l'] + if 'z_stake' in ds.data_vars: + if ds.z_stake.notnull().any(): + first_valid_index = ds.time.where(ds.z_stake.notnull(),drop=True).data[0] + ds['z_surf_2'] = ds.z_stake.sel(time=first_valid_index) - ds['z_stake'] + + + ds = adjustData(ds, var_list=['z_surf_1', 'z_surf_2','z_ice_surf']) + + df_in = ds[[v for v in ['z_surf_1', 'z_surf_2','z_ice_surf'] if v in ds.data_vars]].to_dataframe() + + ds['z_surf_combined'], ds['z_ice_surf'] = combineSurfaceHeight(df_in, ds.attrs['site_type']) + + if ds.attrs['site_type'] == 'ablation': + ds['z_ice_surf'] = ds.z_surf_combined.to_series().rolling('365D').min() + ds['snow_height'] = ds['z_surf_combined'] -ds['z_ice_surf'] + elif ds.attrs['site_type'] == 'accumulation': + ds['z_ice_surf'] = ('time', ds['z_surf_1'].data*np.nan) + ds['snow_height'] = ds['z_surf_combined'] + elif ds.attrs['site_type'] == 'bedrock': + ds['z_ice_surf'] = ('time', ds['z_surf_1'].data*np.nan) + ds['snow_height'] = ds['z_surf_combined'] + else: + logger.info('other site type') + + if ds.attrs['site_type'] != 'bedrock': + ice_temp_vars = [v for v in ds.data_vars if 't_i_' in v] + vars_out = [v.replace('t','d_t') for v in ice_temp_vars] + vars_out.append('t_i_10m') + df_out = thermistorDepth( + ds[ice_temp_vars+['z_surf_combined']].to_dataframe(), + 
ds.attrs['station_id'])
+        for var in df_out.columns:
+            ds[var] = ('time', df_out[var].values)
+
+    return ds
+
+def combineSurfaceHeight(df, site_type, threshold_ablation = -0.0002):
+    '''Combines the data from three sensors: the two sonic rangers and the
+    pressure transducer, to recreate the surface height, the ice surface height
+    and the snow depth through the years. For the accumulation sites, it is
+    only the average of the two sonic rangers (after manual adjustments to
+    correct maintenance shifts). For the ablation sites, first an ablation
+    period is estimated each year (either the period when z_pt_cor decreases
+    or JJA if no better estimate), then different adjustments are conducted
+    to stitch the three time series together: z_ice_surf (adjusted from
+    z_pt_cor) or, if unavailable, z_surf_2 (adjusted from z_stake)
+    is used in the ablation period, while an average of z_surf_1 and z_surf_2
+    is used otherwise, after being adjusted to z_ice_surf at the end
+    of the ablation season.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        Dataframe with datetime index and variables z_surf_1, z_surf_2 and z_ice_surf
+    site_type : str
+        Either 'accumulation' or 'ablation'
+    threshold_ablation : float
+        Threshold to which a z_pt_cor hourly decrease is compared. If the decrease
+        is larger (i.e. more negative) than this threshold, then there is ablation.
+    '''
+    logger.info('Combining surface height')
+    if 'z_surf_2' not in df.columns:
+        logger.info('-> did not find z_surf_2')
+        df["z_surf_combined"] = hampel(df["z_surf_1"].interpolate(limit=72)).values
+        return df["z_surf_combined"], df["z_surf_combined"]*np.nan
+
+    elif site_type == 'accumulation':
+        logger.info('-> no z_pt or accumulation site: averaging z_surf_1 and z_surf_2')
+        df["z_surf_1_adj"] = hampel(df["z_surf_1"].interpolate(limit=72)).values
+        df["z_surf_2_adj"] = hampel(df["z_surf_2"].interpolate(limit=72)).values
+        # adjusting z_surf_2 to z_surf_1
+        df["z_surf_2_adj"] = df["z_surf_2_adj"] + (df["z_surf_1_adj"]- df["z_surf_2_adj"]).mean()
+        # z_surf_combined is the average of the two z_surf
+        if df.z_surf_1_adj.notnull().any() & df.z_surf_2_adj.notnull().any():
+            df['z_surf_combined'] = np.nanmean(
+                df[['z_surf_1_adj', 'z_surf_2_adj']].values, axis = 1)
+        elif df.z_surf_1_adj.notnull().any():
+            df['z_surf_combined'] = df.z_surf_1_adj.values
+        elif df.z_surf_2_adj.notnull().any():
+            df['z_surf_combined'] = df.z_surf_2_adj.values
+
+        df["z_surf_combined"] = hampel(df["z_surf_combined"].interpolate(limit=72)).values
+        return df["z_surf_combined"], df["z_surf_combined"]*np.nan
+
+    else:
+        logger.info('-> ablation site')
+        # smoothing and filtering pressure transducer data
+        df["z_ice_surf_adj"] = hampel(df["z_ice_surf"].interpolate(limit=72)).values
+        df["z_surf_1_adj"] = hampel(df["z_surf_1"].interpolate(limit=72)).values
+        df["z_surf_2_adj"] = hampel(df["z_surf_2"].interpolate(limit=72)).values
+
+        # defining ice ablation period from the decrease of a smoothed version of z_pt
+        # meaning when smoothed_z_pt.diff() < threshold_ablation
+        # first smoothing
+        smoothed_PT = (df['z_ice_surf']
+                       .resample('H')
+                       .interpolate(limit=72)
+                       .rolling('14D',center=True, min_periods=1)
+                       .mean())
+        # second smoothing
+        smoothed_PT = smoothed_PT.rolling('14D', center=True, min_periods=1).mean()
+
+        smoothed_PT = smoothed_PT.reindex(df.index,method='ffill')
+        smoothed_PT.loc[df.z_ice_surf.isnull()] = np.nan
+
+        # logical index where ablation is detected
+        ind_ablation = np.logical_and(smoothed_PT.diff().values < threshold_ablation,
+                                      np.isin(smoothed_PT.diff().index.month, [6,
+                                               7, 8, 9]))
+
+        # finding the beginning and end of each period with True
+        idx = np.argwhere(np.diff(np.r_[False,ind_ablation, False])).reshape(-1, 2)
+        idx[:, 1] -= 1
+        # fill small gaps in the ice ablation periods.
+        for i in range(len(idx)-1):
+            ind = idx[i]
+            ind_next = idx[i+1]
+            # if the end of an ablation period is less than 15 days away from
+            # the next ablation, then it is still considered as the same ablation
+            # season
+            if df.index[ind_next[0]]-df.index[ind[1]] < pd.to_timedelta('15D'):
+                ind_ablation[ind[1]:ind_next[0]] = True
+
+        # interpolated versions of the adjusted sonic ranger and pressure transducer data
+        hs1 = df["z_surf_1_adj"].interpolate(limit=72).copy()
+        hs2 = df["z_surf_2_adj"].interpolate(limit=72).copy()
+        z = df["z_ice_surf_adj"].interpolate(limit=72).copy()
+
+        if ((z.first_valid_index() - hs1.first_valid_index()) <= pd.to_timedelta('251D')
+                and (z.first_valid_index() - hs1.first_valid_index()) > pd.to_timedelta('0H')):
+            # if the pressure transducer is installed the year after then
+            # we use the mean surface height 1 on its first week as a 0
+            # for the ice height
+            z = z - z.loc[
+                z.first_valid_index():(z.first_valid_index()+pd.to_timedelta('14D'))
+                ].mean() + hs1.iloc[:24*7].mean()
+        else:
+            # if there is more than a year (actually 251 days) between the
+            # initiation of the AWS and the installation of the pressure transducer
+            # we remove the intercept in the pressure transducer data.
+            # Removing the intercept
+            # means that we consider the ice surface height at 0 when the AWS
+            # is installed, and not when the pressure transducer is installed.
+            Y = z.iloc[:].values.reshape(-1, 1)
+            X = z.iloc[~np.isnan(Y)].index.astype(np.int64).values.reshape(-1, 1)
+            Y = Y[~np.isnan(Y)]
+            linear_regressor = LinearRegression()
+            linear_regressor.fit(X, Y)
+            Y_pred = linear_regressor.predict(z.index.astype(np.int64).values.reshape(-1, 1) )
+            z = z-Y_pred[0]
+
+        years = df.index.year.unique().values
+        ind_start = years.copy()
+        ind_end = years.copy()
+        logger.info('-> estimating ablation period for each year')
+        for i, y in enumerate(years):
+            # for each year
+            ind_yr = df.index.year.values==y
+            ind_abl_yr = np.logical_and(ind_yr, ind_ablation)
+
+            if df.loc[
+                np.logical_and(ind_yr, df.index.month.isin([6,7,8])),
+                "z_ice_surf_adj"].isnull().all():
+
+                ind_abl_yr = np.logical_and(ind_yr, df.index.month.isin([6,7,8]))
+                ind_ablation[ind_yr] = ind_abl_yr[ind_yr]
+                logger.info(y, 'no z_ice_surf, just using JJA')
+
+            if np.any(ind_abl_yr):
+                logger.info(y, 'derived from z_ice_surf')
+                # if there are some ablation flagged for that year
+                # then find beginning and end
+                ind_start[i] = np.argwhere(ind_abl_yr)[0][0]
+                ind_end[i] = np.argwhere(ind_abl_yr)[-1][0]
+
+            else:
+                logger.info(y, 'could not estimate ablation season')
+                # otherwise left as nan
+                ind_start[i] = -999
+                ind_end[i] = -999
+
+        # adjustment loop
+        missing_hs2 = 0 # if hs2 is missing then when it comes back it is adjusted to hs1
+        hs2_ref = 0 # by default, the PT is the reference: hs1 and 2 will be adjusted to PT
+        # but if it is missing one year or one winter, then it needs to be readjusted
+        # to hs1 and hs2 the year after.
+ + for i, y in enumerate(years): + logger.info(y) + # defining subsets of hs1, hs2, z + hs1_jja = hs1[str(y)+'-06-01':str(y)+'-09-01'] + hs2_jja = hs2[str(y)+'-06-01':str(y)+'-09-01'] + z_jja = z[str(y)+'-06-01':str(y)+'-09-01'] + + z_ablation = z.iloc[ind_start[i]:ind_end[i]] + hs2_ablation = hs2.iloc[ind_start[i]:ind_end[i]] + + hs1_year = hs1[str(y)] + hs2_year = hs2[str(y)] + + hs2_winter = hs2[str(y)+'-01-01':str(y)+'-03-01'].copy() + z_winter = z[str(y)+'-01-01':str(y)+'-03-01'].copy() + + hs1_following_winter = hs1[str(y)+'-09-01':str(y+1)+'-03-01'].copy() + hs2_following_winter = hs2[str(y)+'-09-01':str(y+1)+'-03-01'].copy() + + z_year = z[str(y)] + if hs1_jja.isnull().all() and hs2_jja.isnull().all() and z_jja.isnull().all(): + # if there is no height for a year between June and September + # then the adjustment cannot be made automatically + # it needs to be specified manually on the adjustment files + # on https://github.com/GEUS-Glaciology-and-Climate/PROMICE-AWS-data-issues + continue + + if all(np.isnan(z_jja)) and any(~np.isnan(hs2_jja)): + # if there is no PT for a given year, but there is some hs2 + # then z will be adjusted to hs2 next time it is available + hs2_ref = 1 + + if all(np.isnan(z_winter)) and all(np.isnan(hs2_winter)): + # if there is no PT nor hs2 during the winter, then again + # we need to adjust z to match hs2 when ablation starts + hs2_ref = 1 + + + if hs2_ref: + if ind_start[i] != -999: + # the first year there is both ablation and PT data available + # then PT is adjusted to hs2 + if any(~np.isnan(z_ablation)) and any(~np.isnan(hs2_ablation)): + logger.info('adjusting z to hs2') + tmp1 = z_ablation.copy() + tmp2 = hs2_ablation.copy() + tmp1[np.isnan(tmp2)] = np.nan + tmp2[np.isnan(tmp1)] = np.nan + + # in some instances, the PT data is available but no ablation + # is recorded, then hs2 remains the reference during that time. 
+                    # When eventually there is ablation, then we need to find the
+                    # first index in these preceding ablation-free years
+                    # the shift will be applied back from this point
+                    first_index = z[:z[str(y)].first_valid_index()].isnull().iloc[::-1].idxmax()
+                    z[first_index:] = z[first_index:] - np.nanmean(tmp1) + np.nanmean(tmp2)
+                    hs2_ref = 0 # from now on PT is the reference
+            else:
+                # if there is some ablation
+                if (ind_start[i] != -999) & z_year.notnull().any():
+                    # calculating first index with PT, hs1 and hs2
+                    first_index = z_year.first_valid_index()
+                    if hs1_year.notnull().any():
+                        first_index = np.max(np.array(
+                            [first_index,
+                             hs1_year.first_valid_index()]))
+                    if hs2_year.notnull().any():
+                        first_index = np.max(np.array(
+                            [first_index,
+                             hs2_year.first_valid_index()]))
+
+                    # if PT, hs1 and hs2 are all nan until the station is reactivated,
+                    # then hs1 and hs2 are shifted to match z_pt where the data resume
+                    first_day_of_year = pd.to_datetime(str(y)+'-01-01')
+
+                    if len(z[first_day_of_year:first_index-pd.to_timedelta('1D')])>0:
+                        if z[first_day_of_year:first_index-pd.to_timedelta('1D')].isnull().all() & \
+                            hs1[first_day_of_year:first_index-pd.to_timedelta('1D')].isnull().all() & \
+                            hs2[first_day_of_year:first_index-pd.to_timedelta('1D')].isnull().all():
+                            if (~np.isnan(np.nanmean(z[first_index:first_index+pd.to_timedelta('1D')])) \
+                                and ~np.isnan(np.nanmean(hs2[first_index:first_index+pd.to_timedelta('1D')]))):
+                                logger.info(' ======= adjusting hs1 and hs2 to z_pt')
+                                if ~np.isnan(np.nanmean(hs1[first_index:first_index+pd.to_timedelta('1D')]) ):
+                                    hs1[first_index:] = hs1[first_index:] \
+                                        - np.nanmean(hs1[first_index:first_index+pd.to_timedelta('1D')]) \
+                                        + np.nanmean(z[first_index:first_index+pd.to_timedelta('1D')])
+                                if ~np.isnan(np.nanmean(hs2[first_index:first_index+pd.to_timedelta('1D')]) ):
+                                    hs2[first_index:] = hs2[first_index:] \
+                                        - np.nanmean(hs2[first_index:first_index+pd.to_timedelta('1D')]) \
+                                        + np.nanmean(z[first_index:first_index+pd.to_timedelta('1D')])
+
+            # adapting hs2
+            if (ind_end[i] != -999):
+                # and if there are PT data available at the end of the melt season
+                if np.any(~np.isnan(z.iloc[(ind_end[i]-24*7):(ind_end[i]+24*7)])):
+                    logger.info('adjusting hs2 to z')
+                    # then we adjust hs2 to the end-of-ablation z
+                    # first trying at the end of melt season
+                    if ~np.isnan(np.nanmean(hs2.iloc[(ind_end[i]-24*7):(ind_end[i]+24*30)])):
+                        logger.info('using end of melt season')
+                        hs2.iloc[ind_end[i]:] = hs2.iloc[ind_end[i]:] - \
+                            np.nanmean(hs2.iloc[(ind_end[i]-24*7):(ind_end[i]+24*30)]) + \
+                            np.nanmean(z.iloc[(ind_end[i]-24*7):(ind_end[i]+24*30)])
+                    # if not possible, then trying the end of the following accumulation season
+                    elif (i+1 < len(ind_start)):
+                        if ind_start[i+1]!=-999 and any(~np.isnan(hs2.iloc[(ind_start[i+1]-24*7):(ind_start[i+1]+24*7)]+ z.iloc[(ind_start[i+1]-24*7):(ind_start[i+1]+24*7)])):
+                            logger.info('using end of accumulation season')
+                            hs2.iloc[ind_end[i]:] = hs2.iloc[ind_end[i]:] - \
+                                np.nanmean(hs2.iloc[(ind_start[i+1]-24*7):(ind_start[i+1]+24*7)]) + \
+                                np.nanmean(z.iloc[(ind_start[i+1]-24*7):(ind_start[i+1]+24*7)])
+            else:
+                logger.info('no ablation')
+
+            if all(np.isnan(hs2_following_winter)):
+                logger.info('no hs2')
+                missing_hs2 = 1
+            elif missing_hs2 == 1:
+                logger.info('adjusting hs2')
+                # and if there are some hs1 during the accumulation period
+                if any(~np.isnan(hs1_following_winter)):
+                    logger.info('to hs1')
+                    # then we adjust hs2 to hs1 during the accumulation period
+                    # adjustment is done so that the mean hs1 and mean hs2 match
+                    # for the period when both are available
+                    hs2_following_winter[np.isnan(hs1_following_winter)] = np.nan
+                    hs1_following_winter[np.isnan(hs2_following_winter)] = np.nan
+
+                    hs2[str(y)+'-01-01':] = hs2[str(y)+'-01-01':] \
+                        - np.nanmean(hs2_following_winter) + np.nanmean(hs1_following_winter)
+                    missing_hs2 = 0
+
+            # adjusting hs1 to hs2 (no ablation case)
+            if any(~np.isnan(hs1_following_winter)):
+                logger.info('adjusting hs1')
+                # and if there are some hs2 during the accumulation period
+                if any(~np.isnan(hs2_following_winter)):
+                    logger.info('to hs2')
+                    # then we adjust hs1 to hs2 during the accumulation period
+                    # adjustment is done so that the mean hs1 and mean hs2 match
+                    # for the period when both are available
+                    hs1_following_winter[np.isnan(hs2_following_winter)] = np.nan
+                    hs2_following_winter[np.isnan(hs1_following_winter)] = np.nan
+
+                    hs1[str(y)+'-09-01':] = hs1[str(y)+'-09-01':] \
+                        - np.nanmean(hs1_following_winter) + np.nanmean(hs2_following_winter)
+
+            if ind_end[i] != -999:
+                # if there is some hs1
+                if any(~np.isnan(hs1_following_winter)):
+                    logger.info('adjusting hs1')
+                    # and if there are some hs2 during the accumulation period
+                    if any(~np.isnan(hs2_following_winter)):
+                        logger.info('to hs2')
+                        # then we adjust hs1 to hs2 during the accumulation period
+                        # adjustment is done so that the mean hs1 and mean hs2 match
+                        # for the period when both are available
+                        tmp1 = hs1.iloc[ind_end[i]:min(len(hs1),ind_end[i]+24*30*9)].copy()
+                        tmp2 = hs2.iloc[ind_end[i]:min(len(hs2),ind_end[i]+24*30*9)].copy()
+
+                        tmp1[np.isnan(tmp2)] = np.nan
+                        tmp2[np.isnan(tmp1)] = np.nan
+
+                        hs1.iloc[ind_end[i]:] = hs1.iloc[ind_end[i]:] - np.nanmean(tmp1) + np.nanmean(tmp2)
+
+                    # if no hs2, then use PT data available at the end of the melt season
+                    elif np.any(~np.isnan(z.iloc[(ind_end[i]-24*7):(ind_end[i]+24*7)])):
+                        logger.info('to z')
+                        # then we adjust hs1 to the end-of-ablation z
+                        # first trying at the end of melt season
+                        if ~np.isnan(np.nanmean(hs1.iloc[(ind_end[i]-24*7):(ind_end[i]+24*30)])):
+                            logger.info('using end of melt season')
+                            hs1.iloc[ind_end[i]:] = hs1.iloc[ind_end[i]:] - \
+                                np.nanmean(hs1.iloc[(ind_end[i]-24*7):(ind_end[i]+24*30)]) + \
+                                np.nanmean(z.iloc[(ind_end[i]-24*7):(ind_end[i]+24*30)])
+                        # if not possible, then trying the end of the following accumulation season
+                        elif ind_start[i+1]!=-999 and any(~np.isnan(hs1.iloc[(ind_start[i+1]-24*7):(ind_start[i+1]+24*7)]+ z.iloc[(ind_start[i+1]-24*7):(ind_start[i+1]+24*7)])):
+                            logger.info('using end of accumulation season')
+                            hs1.iloc[ind_end[i]:] = hs1.iloc[ind_end[i]:] - \
+                                np.nanmean(hs1.iloc[(ind_start[i+1]-24*7):(ind_start[i+1]+24*7)]) + \
+                                np.nanmean(z.iloc[(ind_start[i+1]-24*7):(ind_start[i+1]+24*7)])
+
+        df["z_surf_1_adj"] = hs1.interpolate(limit=2*24).values
+        df["z_surf_2_adj"] = hs2.interpolate(limit=2*24).values
+        df["z_ice_surf_adj"] = z.interpolate(limit=2*24).values
+
+        # making a summary of the surface height
+        df["z_surf_combined"] = np.nan
+
+        # in winter, both SR1 and SR2 are used
+        df["z_surf_combined"] = np.nanmean( df[["z_surf_1_adj",
+                                                "z_surf_2_adj"]].values,
+                                           axis = 1)
+
+        # in ablation season we use SR2 instead of the SR1&2 average
+        # here two options:
+        # 1) we ignore the SR1 and only use SR2
+        # 2) we use SR1 when SR2 is not available (commented)
+        # the latter one can cause jumps when SR2 starts to be available a few days after SR1
+        data_update = df["z_surf_2_adj"].interpolate(limit=72).values
+        ind_update = ind_ablation
+        #ind_update = np.logical_and(ind_ablation, ~np.isnan(data_update))
+        df.loc[ind_update,"z_surf_combined"] = data_update[ind_update]
+
+        # in ablation season we use the pressure transducer over all other options
+        data_update = df["z_ice_surf_adj"].interpolate(limit=72).values
+        ind_update = np.logical_and(ind_ablation, ~np.isnan(data_update))
+        df.loc[ind_update,"z_surf_combined"] = data_update[ind_update]
+        logger.info('surface height combination finished')
+        return df['z_surf_combined'], df["z_ice_surf_adj"]
+
+def hampel(vals_orig, k=7*24, t0=3):
+    '''
+    vals_orig: pandas series of values from which to remove outliers
+    k: size of the rolling window, in number of samples (k=7 means the sample
+       itself plus 3 samples on either side)
+    t0: number of scaled median absolute deviations beyond which a value is
+        flagged as an outlier
+    '''
+    # Make copy so original not edited
+    vals=vals_orig.copy()
+    # Hampel Filter
+    L= 1.4826
+    rolling_median=vals.rolling(k).median()
+    difference=np.abs(rolling_median-vals)
+    median_abs_deviation=difference.rolling(k).median()
+    threshold= t0 *L * median_abs_deviation
+    outlier_idx=difference>threshold
+    outlier_idx[0:round(k/2)]=False
+    vals.loc[outlier_idx]=np.nan
+    return(vals)
+
+def thermistorDepth(df_in, site,
+                    installation_info_file = '../PROMICE-AWS-data-issues/fieldwork_summary_PROMICE_GC-Net.csv'):
+    '''Calculates the depth of the thermistors through time based on their
+    installation depth (collected in a google sheet) and on the change of surface
+    height: instruments getting buried under new snow or surfacing due to ablation.
+    There is a potential for additional filtering of thermistor data for surfaced
+    (or just noisy) thermistors, but that is currently deactivated because slow.
+
+    Parameters
+    ----------
+    df_in : pandas.DataFrame
+        dataframe containing the ice/firn temperature t_i_* as well as the
+        combined surface height z_surf_combined
+    site : str
+        stid, so that maintenance dates and sensor installation depths can be found
+        in the database
+    installation_info_file : str
+        path to file with thermistor installation dates and depths
+    '''
+
+    logger.info('Calculating thermistor depth')
+
+    maintenance_string = pd.read_csv(installation_info_file)
+
+    maintenance_string = maintenance_string.replace("OUT", np.nan)
+    maintenance_string[
+        "length_of_thermistor_string_on_surface_from_surface_marking"
+    ] = maintenance_string[
+        "length_of_thermistor_string_on_surface_from_surface_marking"
+    ].astype(
+        float
+    )
+    maintenance_string["date"] = pd.to_datetime(maintenance_string["date_visit"])
+    maintenance = maintenance_string.loc[maintenance_string.station == site]
+
+    temp_cols_name = ['t_i_'+str(i) for i in range(12) if 't_i_'+str(i) in df_in.columns]
+    num_therm = len(temp_cols_name)
+    depth_cols_name = ['d_t_i_'+str(i) for i in range(1,num_therm+1)]
+    if num_therm == 8:
+        ini_depth = [1, 2, 3, 4, 5, 6, 7, 10]
+    else:
+        ini_depth = [0, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5]
+    df_in[depth_cols_name] = np.nan
+
+    # filtering the surface height
+    surface_height = df_in["z_surf_combined"].copy()
+    ind_filter = surface_height.rolling(window=14, center=True).var() > 0.1
+    if any(ind_filter):
+        surface_height[ind_filter] = np.nan
+    df_in["z_surf_combined"] = surface_height.values
+    z_surf_interp = df_in["z_surf_combined"].interpolate()
+
+    # first initialization of the depths
+    for i, col in enumerate(depth_cols_name):
+        df_in[col] = (
+            ini_depth[i]
+            + z_surf_interp.values
+            - z_surf_interp[z_surf_interp.first_valid_index()]
+        )
+
+    # resetting depth at maintenance
+    if len(maintenance.date) == 0:
+        logger.info("No maintenance at "+site)
+
+    for date in maintenance.date:
+        if date > z_surf_interp.last_valid_index():
+            continue
+        new_depth = maintenance.loc[
+            maintenance.date == date
+        ].depth_new_thermistor_m.values[0]
+        if isinstance(new_depth, str):
+            logger.info('thermistor reinstalled on '+date.strftime('%Y-%m-%d %X'))
+            new_depth = [float(x) for x in new_depth.split(",")]
+            for i, col in enumerate(depth_cols_name):
+                tmp = df_in[col].copy()
+                tmp.loc[date:] = (
+                    new_depth[i]
+                    + z_surf_interp[date:].values
+                    - z_surf_interp[date:][
+                        z_surf_interp[date:].first_valid_index()
+                    ]
+                )
+                df_in[col] = tmp.values
+
+    # Filtering thermistor data
+    # for i in range(len(temp_cols_name)):
+    #     tmp = df_in[temp_cols_name[i]].copy()
+
+    #     # variance filter
+    #     ind_filter = (
+    #         df_in[temp_cols_name[i]]
+    #         .interpolate(limit=14)
+    #         .rolling(window=7)
+    #         .var()
+    #         > 0.5
+    #     )
+    #     month = (
+    #         df_in[temp_cols_name[i]].interpolate(limit=14).index.month.values
+    #     )
+    #     ind_filter.loc[np.isin(month, [5, 6, 7])] = False
+    #     if any(ind_filter):
+    #         tmp.loc[ind_filter] = np.nan
+
+    #     # before and after maintenance adaptation filter
+    #     if len(maintenance.date) > 0:
+    #         for date in maintenance.date:
+    #             if isinstance(
+    #                 maintenance.loc[
+    #                     maintenance.date == date
+    #                 ].depth_new_thermistor_m.values[0],
+    #                 str,
+    #             ):
+    #                 ind_adapt = np.abs(
+    #                     tmp.interpolate(limit=14).index.values
+    #                     - pd.to_datetime(date).to_datetime64()
+    #                 ) < np.timedelta64(7, "D")
+    #                 if any(ind_adapt):
+    #                     tmp.loc[ind_adapt] = np.nan
+
+    #     # surfaced thermistor
+    #     ind_pos = df_in[depth_cols_name[i]] < 0.1
+    #     if any(ind_pos):
+    #         tmp.loc[ind_pos] = np.nan
+    #     # copying the filtered values to the original table
+    #     df_in[temp_cols_name[i]] = tmp.values
+
+    # interpolating 10 m firn/ice temp
+    df_in['t_i_10m'] = interpolate_temperature(
+        df_in.index.values,
+        df_in[depth_cols_name].values.astype(float),
+        df_in[temp_cols_name].values.astype(float),
+        kind="linear",
+        min_diff_to_depth=1.5,
+    ).set_index('date').values
+
+    # filtering
+    ind_pos = df_in["t_i_10m"] > 0.1
+    ind_low = df_in["t_i_10m"] < -70
+    df_in.loc[ind_pos, "t_i_10m"] = np.nan
+    df_in.loc[ind_low, "t_i_10m"] = np.nan
+    depth_cols_name.append('t_i_10m')
+    return df_in[depth_cols_name]
+
+
+def interpolate_temperature(dates, depth_cor, temp, depth=10, min_diff_to_depth=2,
+                            kind="quadratic"):
+    '''Interpolates, for each time step, the subsurface temperature profile
+    (depth_cor, temp) at a constant target depth (10 m by default). The
+    interpolation is only done when at least two valid measurements are
+    available and when one of them lies within min_diff_to_depth of the
+    target depth.
+
+    Parameters
+    ----------
+    dates : numpy.array
+        array of datetime64
+    depth_cor : numpy.ndarray
+        matrix of depths
+    temp : numpy.ndarray
+        matrix of temperatures
+    depth : float
+        constant depth at which (depth_cor, temp) should be interpolated.
+    min_diff_to_depth: float
+        maximum difference allowed between the available depth and the target depth
+        for the interpolation to be done.
+    kind : str
+        type of interpolation from scipy.interpolate.interp1d
+    '''
+
+    depth_cor = depth_cor.astype(float)
+    df_interp = pd.DataFrame()
+    df_interp["date"] = dates
+    df_interp["temperatureObserved"] = np.nan
+
+    # preprocessing temperatures for small gaps
+    tmp = pd.DataFrame(temp)
+    tmp["time"] = dates
+    tmp = tmp.set_index("time")
+    tmp = tmp.resample("H").mean()
+    # tmp = tmp.interpolate(limit=24*7)
+    temp = tmp.loc[dates].values
+    for i in (range(len(dates))):
+        x = depth_cor[i, :].astype(float)
+        y = temp[i, :].astype(float)
+        ind_no_nan = ~np.isnan(x + y)
+        x = x[ind_no_nan]
+        y = y[ind_no_nan]
+        x, indices = np.unique(x, return_index=True)
+        y = y[indices]
+        if len(x) < 2 or np.min(np.abs(x - depth)) > min_diff_to_depth:
+            continue
+        f = interp1d(x, y, kind, fill_value="extrapolate")
+        df_interp.iloc[i, 1] = np.minimum(f(depth), 0)  # cap at the melting point
+
+    if df_interp.iloc[:5, 1].std() > 0.1:
+        df_interp.iloc[:5, 1] = np.nan
+
+    return df_interp
+
 def gpsCoordinatePostprocessing(ds, var):
     # saving the static value of 'lat','lon' or 'alt' stored in attribute
     # as it might be the only coordinate available for certain stations (e.g. bedrock)

From 9685a1ed14edc6c72f948c2a83137eddf7f3e73c Mon Sep 17 00:00:00 2001
From: BaptisteVandecrux
Date: Thu, 27 Jun 2024 11:22:53 +0200
Subject: [PATCH 2/5] better doc for surfaceHeightProcessing

---
 src/pypromice/process/L2toL3.py | 91 +++++++++++++++++++++------------
 1 file changed, 57 insertions(+), 34 deletions(-)

diff --git a/src/pypromice/process/L2toL3.py b/src/pypromice/process/L2toL3.py
index a1b91186..66515fd1 100755
--- a/src/pypromice/process/L2toL3.py
+++ b/src/pypromice/process/L2toL3.py
@@ -89,61 +89,84 @@ def toL3(L2, T_0=273.15):
 
     return ds
 
+
 def surfaceHeightProcessing(ds):
-    # creating surface height variables
-    ds['z_surf_1'] = ('time', ds['z_boom_u'].data*np.nan)
-    ds['z_surf_2'] = ('time', ds['z_boom_u'].data*np.nan)
+    """
+    Process surface height data for different site types and create
+    surface height variables.
 
-    if ds.attrs['site_type'] == 'ablation':
+    Parameters
+    ----------
+    ds : xarray.Dataset
+        The dataset containing various measurements and attributes including
+        'site_type' which determines the type of site (e.g., 'ablation',
+        'accumulation', 'bedrock') and other relevant data variables such as
+        'z_boom_u', 'z_stake', 'z_pt_cor', etc.
+
+    Returns
+    -------
+    xarray.Dataset
+        The dataset with additional processed surface height variables:
+        'z_surf_1', 'z_surf_2', 'z_ice_surf', 'z_surf_combined', 'snow_height',
+        and possibly depth variables derived from temperature measurements.
+ """ + # Initialize surface height variables with NaNs + ds['z_surf_1'] = ('time', ds['z_boom_u'].data * np.nan) + ds['z_surf_2'] = ('time', ds['z_boom_u'].data * np.nan) + + if ds.attrs['site_type'] == 'ablation': + # Calculate surface heights for ablation sites ds['z_surf_1'] = 2.6 - ds['z_boom_u'] - first_valid_index = ds.time.where(ds.z_stake.notnull(),drop=True).data[0] + first_valid_index = ds.time.where(ds.z_stake.notnull(), drop=True).data[0] if first_valid_index: - ds['z_surf_2'] = ds.z_surf_1.sel(time=first_valid_index) + ds.z_stake.sel(time=first_valid_index) - ds['z_stake'] - + ds['z_surf_2'] = ds.z_surf_1.sel(time=first_valid_index) + ds.z_stake.sel(time=first_valid_index) - ds['z_stake'] + + # Use corrected point data if available if 'z_pt_cor' in ds.data_vars: ds['z_ice_surf'] = ('time', ds['z_pt_cor'].data) - + else: - first_valid_index = ds.time.where(ds.z_boom_u.notnull(),drop=True).data[0] + # Calculate surface heights for other site types + first_valid_index = ds.time.where(ds.z_boom_u.notnull(), drop=True).data[0] ds['z_surf_1'] = ds.z_boom_u.sel(time=first_valid_index) - ds['z_boom_u'] if 'z_boom_l' in ds.data_vars: - first_valid_index = ds.time.where(ds.z_boom_l.notnull(),drop=True).data[0] + first_valid_index = ds.time.where(ds.z_boom_l.notnull(), drop=True).data[0] ds['z_surf_2'] = ds.z_boom_l.sel(time=first_valid_index) - ds['z_boom_l'] - if 'z_stake' in ds.data_vars: - if ds.z_stake.notnull().any(): - first_valid_index = ds.time.where(ds.z_stake.notnull(),drop=True).data[0] - ds['z_surf_2'] = ds.z_stake.sel(time=first_valid_index) - ds['z_stake'] - - - ds = adjustData(ds, var_list=['z_surf_1', 'z_surf_2','z_ice_surf']) - - df_in = ds[[v for v in ['z_surf_1', 'z_surf_2','z_ice_surf'] if v in ds.data_vars]].to_dataframe() - - ds['z_surf_combined'], ds['z_ice_surf'] = combineSurfaceHeight(df_in, ds.attrs['site_type']) - - if ds.attrs['site_type'] == 'ablation': - ds['z_ice_surf'] = ds.z_surf_combined.to_series().rolling('365D').min() - ds['snow_height'] = ds['z_surf_combined'] -ds['z_ice_surf'] - elif ds.attrs['site_type'] == 'accumulation': - ds['z_ice_surf'] = ('time', ds['z_surf_1'].data*np.nan) - ds['snow_height'] = ds['z_surf_combined'] - elif ds.attrs['site_type'] == 'bedrock': - ds['z_ice_surf'] = ('time', ds['z_surf_1'].data*np.nan) + if 'z_stake' in ds.data_vars and ds.z_stake.notnull().any(): + first_valid_index = ds.time.where(ds.z_stake.notnull(), drop=True).data[0] + ds['z_surf_2'] = ds.z_stake.sel(time=first_valid_index) - ds['z_stake'] + + # Adjust data for the created surface height variables + ds = adjustData(ds, var_list=['z_surf_1', 'z_surf_2', 'z_ice_surf']) + + # Convert to dataframe and combine surface height variables + df_in = ds[[v for v in ['z_surf_1', 'z_surf_2', 'z_ice_surf'] if v in ds.data_vars]].to_dataframe() + ds['z_surf_combined'], ds['z_ice_surf'] = combineSurfaceHeight(df_in, ds.attrs['site_type']) + + if ds.attrs['site_type'] == 'ablation': + # Calculate rolling minimum for ice surface height and snow height + ds['z_ice_surf'] = ds.z_surf_combined.to_series().rolling('365D').min() + ds['snow_height'] = ds['z_surf_combined'] - ds['z_ice_surf'] + elif ds.attrs['site_type'] in ['accumulation', 'bedrock']: + # Handle accumulation and bedrock site types + ds['z_ice_surf'] = ('time', ds['z_surf_1'].data * np.nan) ds['snow_height'] = ds['z_surf_combined'] else: + # Log info for other site types logger.info('other site type') if ds.attrs['site_type'] != 'bedrock': + # Process ice temperature data and create depth variables ice_temp_vars 
= [v for v in ds.data_vars if 't_i_' in v] - vars_out = [v.replace('t','d_t') for v in ice_temp_vars] + vars_out = [v.replace('t', 'd_t') for v in ice_temp_vars] vars_out.append('t_i_10m') - df_out = thermistorDepth( - ds[ice_temp_vars+['z_surf_combined']].to_dataframe(), - ds.attrs['station_id']) + df_out = thermistorDepth(ds[ice_temp_vars + ['z_surf_combined']].to_dataframe(), ds.attrs['station_id']) for var in df_out.columns: ds[var] = ('time', df_out[var].values) + return ds + def combineSurfaceHeight(df, site_type, threshold_ablation = -0.0002): '''Combines the data from three sensor: the two sonic rangers and the pressure transducer, to recreate the surface height, the ice surface height From 057124873b62ef8527ef298fa1a69efc214baa7f Mon Sep 17 00:00:00 2001 From: BaptisteVandecrux Date: Thu, 27 Jun 2024 11:43:21 +0200 Subject: [PATCH 3/5] added new variables to variable.csv --- src/pypromice/resources/variables.csv | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/pypromice/resources/variables.csv b/src/pypromice/resources/variables.csv index 77eaa7b8..5979d63d 100644 --- a/src/pypromice/resources/variables.csv +++ b/src/pypromice/resources/variables.csv @@ -88,4 +88,21 @@ rh_i_cor,relative_humidity_corrected,Relative humidity (instantaneous) – corre wspd_i,wind_speed,Wind speed (instantaneous),m s-1,physicalMeasurement,time,TRUE,,0,100,wdir_i wspd_x_i wspd_y_i,all,1,1,1,4 wdir_i,wind_from_direction,Wind from direction (instantaneous),degrees,physicalMeasurement,time,TRUE,,1,360,wspd_x_i wspd_y_i,all,1,1,1,4 wspd_x_i,wind_speed_from_x_direction,Wind speed from x direction (instantaneous),m s-1,modelResult,time,TRUE,L2 or later,-100,100,wdir_i wspd_i,all,0,1,1,4 -wspd_y_i,wind_speed_from_y_direction,Wind speed from y direction (instantaneous),m s-1,modelResult,time,TRUE,L2 or later,-100,100,wdir_i wspd_i,all,0,1,1,4 \ No newline at end of file +wspd_y_i,wind_speed_from_y_direction,Wind speed from y direction (instantaneous),m s-1,modelResult,time,TRUE,L2 or later,-100,100,wdir_i wspd_i,all,0,1,1,4 +z_surf_1,height_of_surface_1,"Surface height 1, relative to surface height at installation and calculated from boom (upper one if two)",m,modelResult,time,FALSE,L3,,,,all,0,0,1,4 +z_surf_2,height_of_surface_2,"Surface height 2, relative to surface height at installation and calculated from stake booms and pt",m,modelResult,time,FALSE,L3,,,,all,0,0,1,4 +z_surf_combined,height_of_surface_combined,"Surface height combined from multiple sensors, relative to ice surface height at installation",m,modelResult,time,FALSE,L3,,,,all,0,0,1,4 +z_ice_surf,height_of_ice_surface,"Ice surface height, relative to ice surface height at installation and calculated from pt_cor and z_stake",m,modelResult,time,FALSE,L3,,,,one-boom,0,0,1,4 +snow_height,height_of_snow,"Snow surface height, relative to ice surface",m,modelResult,time,FALSE,L3,0,,,one-boom,0,0,1,4 +d_t_i_1,depth_of_thermistor_1,Depth of thermistor 1,m,modelResult,time,FALSE,L3,-10,100,,all,0,0,1,4 +d_t_i_2,depth_of_thermistor_2,Depth of thermistor 2,m,modelResult,time,FALSE,L3,-10,100,,all,0,0,1,4 +d_t_i_3,depth_of_thermistor_3,Depth of thermistor 3,m,modelResult,time,FALSE,L3,-10,100,,all,0,0,1,4 +d_t_i_4,depth_of_thermistor_4,Depth of thermistor 4,m,modelResult,time,FALSE,L3,-10,100,,all,0,0,1,4 +d_t_i_5,depth_of_thermistor_5,Depth of thermistor 5,m,modelResult,time,FALSE,L3,-10,100,,all,0,0,1,4 +d_t_i_6,depth_of_thermistor_6,Depth of thermistor 6,m,modelResult,time,FALSE,L3,-10,100,,all,0,0,1,4 
+d_t_i_7,depth_of_thermistor_7,Depth of thermistor 7,m,modelResult,time,FALSE,L3,-10,100,,all,0,0,1,4
+d_t_i_8,depth_of_thermistor_8,Depth of thermistor 8,m,modelResult,time,FALSE,L3,-10,100,,all,0,0,1,4
+d_t_i_9,depth_of_thermistor_9,Depth of thermistor 9,m,modelResult,time,FALSE,L3,-10,100,,two-boom,0,0,1,4
+d_t_i_10,depth_of_thermistor_10,Depth of thermistor 10,m,modelResult,time,FALSE,L3,-10,100,,two-boom,0,0,1,4
+d_t_i_11,depth_of_thermistor_11,Depth of thermistor 11,m,modelResult,time,FALSE,L3,-10,100,,two-boom,0,0,1,4
+t_i_10m,10m_subsurface_temperature,10 m subsurface temperature,degrees_C,modelResult,time,FALSE,L3,-70,0,,all,0,0,1,4

From c9f4ceb2f2af9571a7c2c2c16e756705b40b0f43 Mon Sep 17 00:00:00 2001
From: BaptisteVandecrux
Date: Thu, 27 Jun 2024 11:57:25 +0200
Subject: [PATCH 4/5] fixing tests

added site_type to dummy test dataset
---
 src/pypromice/process/L2toL3.py      | 33 +++++++++++++++++-----------
 src/pypromice/process/test.py        |  1 +
 src/pypromice/test/test_config1.toml |  1 +
 src/pypromice/test/test_config2.toml |  1 +
 4 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/src/pypromice/process/L2toL3.py b/src/pypromice/process/L2toL3.py
index 66515fd1..39ae0793 100755
--- a/src/pypromice/process/L2toL3.py
+++ b/src/pypromice/process/L2toL3.py
@@ -9,7 +9,7 @@ from sklearn.linear_model import LinearRegression
 from pypromice.qc.github_data_issues import adjustData
 from scipy.interpolate import interp1d
 
-import logging
+import logging, os
 logger = logging.getLogger(__name__)
 
 def toL3(L2, T_0=273.15):
@@ -205,8 +205,7 @@ def combineSurfaceHeight(df, site_type, threshold_ablation = -0.0002):
         df["z_surf_2_adj"] = df["z_surf_2_adj"] + (df["z_surf_1_adj"]- df["z_surf_2_adj"]).mean()
         # z_surf_combined is the average of the two z_surf
         if df.z_surf_1_adj.notnull().any() & df.z_surf_2_adj.notnull().any():
-            df['z_surf_combined'] = np.nanmean(
-                df[['z_surf_1_adj', 'z_surf_2_adj']].values, axis = 1)
+            df['z_surf_combined'] = df[['z_surf_1_adj', 'z_surf_2_adj']].mean(axis = 1).values
         elif df.z_surf_1_adj.notnull().any():
             df['z_surf_combined'] = df.z_surf_1_adj.values
         elif df.z_surf_2_adj.notnull().any():
@@ -307,17 +306,17 @@ def combineSurfaceHeight(df, site_type, threshold_ablation = -0.0002):
 
                 ind_abl_yr = np.logical_and(ind_yr, df.index.month.isin([6,7,8]))
                 ind_ablation[ind_yr] = ind_abl_yr[ind_yr]
-                logger.info(y, 'no z_ice_surf, just using JJA')
+                logger.info(str(y)+' no z_ice_surf, just using JJA')
 
             if np.any(ind_abl_yr):
-                logger.info(y, 'derived from z_ice_surf')
+                logger.info(str(y)+ ' derived from z_ice_surf')
                 # if there are some ablation flagged for that year
                 # then find beginning and end
                 ind_start[i] = np.argwhere(ind_abl_yr)[0][0]
                 ind_end[i] = np.argwhere(ind_abl_yr)[-1][0]
 
             else:
-                logger.info(y, 'could not estimate ablation season')
+                logger.info(str(y) + ' could not estimate ablation season')
                 # otherwise left as nan
                 ind_start[i] = -999
                 ind_end[i] = -999
@@ -329,7 +328,7 @@ def combineSurfaceHeight(df, site_type, threshold_ablation = -0.0002):
     # to hs1 and hs2 the year after.
 
         for i, y in enumerate(years):
-            logger.info(y)
+            logger.info(str(y))
            # defining subsets of hs1, hs2, z
            hs1_jja = hs1[str(y)+'-06-01':str(y)+'-09-01']
            hs2_jja = hs2[str(y)+'-06-01':str(y)+'-09-01']
@@ -517,9 +516,7 @@ def combineSurfaceHeight(df, site_type, threshold_ablation = -0.0002):
         df["z_surf_combined"] = np.nan
 
         # in winter, both SR1 and SR2 are used
-        df["z_surf_combined"] = np.nanmean( df[["z_surf_1_adj",
-                                                "z_surf_2_adj"]].values,
-                                           axis = 1)
+        df["z_surf_combined"] = df[["z_surf_1_adj", "z_surf_2_adj"]].mean(axis=1).values
 
         # in ablation season we use SR2 instead of the SR1&2 average
         # here two options:
@@ -557,7 +554,7 @@ def hampel(vals_orig, k=7*24, t0=3):
     return(vals)
 
 def thermistorDepth(df_in, site,
-                    installation_info_file = '../PROMICE-AWS-data-issues/fieldwork_summary_PROMICE_GC-Net.csv'):
+                    installation_info_file = '../aws-l0/metadata/fieldwork_summary_PROMICE_GC-Net.csv'):
     '''Calculates the depth of the thermistors through time based on their
     installation depth (collected in a google sheet) and on the change of surface
     height: instruments getting buried under new snow or surfacing due to ablation.
@@ -577,8 +574,18 @@ def thermistorDepth(df_in, site,
     '''
 
     logger.info('Calculating thermistor depth')
-
-    maintenance_string = pd.read_csv(installation_info_file)
+
+    if os.path.isfile(installation_info_file):
+        maintenance_string = pd.read_csv(installation_info_file)
+        logger.info('loading '+installation_info_file)
+    else:
+        maintenance_string = pd.DataFrame(columns=['date_visit', 'station',
+            'length_of_thermistor_string_on_surface_from_surface_marking',
+            'depth_new_thermistor_m', 'note', 'depth_ablation_hose_arrival_m',
+            'depth_ablation_hose_departure_m', 'height_sr50_aws_arrival_cm',
+            'height_sr50_aws_departure_cm', 'height_sr50_stakes_arrival_cm',
+            'height_sr50_stakes_departure_cm', 'note.1'])
+        logger.info('cannot find '+installation_info_file)
 
     maintenance_string = maintenance_string.replace("OUT", np.nan)
     maintenance_string[
diff --git a/src/pypromice/process/test.py b/src/pypromice/process/test.py
index f5e26601..5c2ce6bb 100644
--- a/src/pypromice/process/test.py
+++ b/src/pypromice/process/test.py
@@ -40,6 +40,7 @@ def testAddAll(self):
                        datetime.datetime.now()-timedelta(days=365)]
         d.attrs['station_id']='TEST'
         d.attrs['level']='L2_test'
+        d.attrs['site_type']='ablation'
         meta = getMeta()
         d = addVars(d, v)
         d = addMeta(d, meta)
diff --git a/src/pypromice/test/test_config1.toml b/src/pypromice/test/test_config1.toml
index 09392c08..6b9e2e95 100644
--- a/src/pypromice/test/test_config1.toml
+++ b/src/pypromice/test/test_config1.toml
@@ -2,6 +2,7 @@ station_id = 'TEST1'
 logger_type = 'CR1000X'
 nodata = ['-999', 'NAN'] # if one is a string, all must be strings
 number_of_booms = 1 #1-boom = promice, 2-boom = gc-net
+site_type = 'ablation'
 
 latitude = 79.91
 longitude = 24.09
diff --git a/src/pypromice/test/test_config2.toml b/src/pypromice/test/test_config2.toml
index 8213d955..195cd2ac 100644
--- a/src/pypromice/test/test_config2.toml
+++ b/src/pypromice/test/test_config2.toml
@@ -2,6 +2,7 @@ station_id = 'TEST2'
 logger_type = 'CR1000X'
 nodata = ['-999', 'NAN'] # if one is a string, all must be strings
 number_of_booms = 2
+site_type = 'accumulation'
 
 ['test_raw_transmitted2.txt']
 format = 'TX'

From 0b98589f3cec4fe9161b91bffe0e3e816de37a4e Mon Sep 17 00:00:00 2001
From: BaptisteVandecrux
Date: Thu, 27 Jun 2024 15:50:30 +0200
Subject: [PATCH 5/5] removing resampling within T10m calculation

---
 src/pypromice/process/L2toL3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/pypromice/process/L2toL3.py b/src/pypromice/process/L2toL3.py index 39ae0793..ee7b32a8 100755 --- a/src/pypromice/process/L2toL3.py +++ b/src/pypromice/process/L2toL3.py @@ -741,7 +741,7 @@ def interpolate_temperature(dates, depth_cor, temp, depth=10, min_diff_to_depth= tmp = pd.DataFrame(temp) tmp["time"] = dates tmp = tmp.set_index("time") - tmp = tmp.resample("H").mean() + # tmp = tmp.resample("H").mean() # tmp = tmp.interpolate(limit=24*7) temp = tmp.loc[dates].values for i in (range(len(dates))):
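
A short, self-contained sketch of how the Hampel filter introduced in patch 1 behaves may help when reviewing. The synthetic series, spike positions and window below are invented for illustration; the filter body mirrors hampel() from the patch:

    import numpy as np
    import pandas as pd

    def hampel(vals_orig, k=7*24, t0=3):
        # flag values further than t0 * 1.4826 * MAD from the rolling median
        vals = vals_orig.copy()
        rolling_median = vals.rolling(k).median()
        difference = np.abs(rolling_median - vals)
        mad = difference.rolling(k).median()
        outlier_idx = difference > t0 * 1.4826 * mad
        outlier_idx[0:round(k/2)] = False  # leave the spin-up of the window untouched
        vals.loc[outlier_idx] = np.nan
        return vals

    # hourly surface height rising 0.5 m over 60 days, with two spurious spikes
    t = pd.date_range('2020-01-01', periods=24*60, freq='h')
    z = pd.Series(np.linspace(0, 0.5, t.size), index=t)
    z.iloc[500] += 2.0
    z.iloc[900] -= 2.0
    print(hampel(z).isna().sum())  # should print 2: only the spikes are removed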
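
For a single time step, the 10 m temperature interpolation in interpolate_temperature() reduces to the following (depths and temperatures are invented; the two-point minimum and the min_diff_to_depth gate are the ones used in the patch):

    import numpy as np
    from scipy.interpolate import interp1d

    depths = np.array([2.0, 4.0, 6.0, 8.0, 9.5, 11.0])         # thermistor depths (m)
    temps = np.array([-12.1, -11.0, -10.2, -9.6, -9.3, -9.0])  # deg C

    ind = ~np.isnan(depths + temps)                      # keep pairs where both are valid
    x, idx = np.unique(depths[ind], return_index=True)   # unique, sorted depths
    y = temps[ind][idx]

    if len(x) >= 2 and np.min(np.abs(x - 10.0)) <= 1.5:
        f = interp1d(x, y, kind='linear', fill_value='extrapolate')
        t_10m = min(float(f(10.0)), 0.0)                 # cap at the melting point
        print(t_10m)                                     # -9.2 for the profile above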
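
Likewise, the ablation-period detection in combineSurfaceHeight() (double 14-day smoothing of the pressure-transducer record, then flagging hours where the smoothed series drops faster than threshold_ablation, June to September only) can be exercised on a synthetic record. This sketch assumes pandas >= 1.3 for centered time-based rolling windows; the melt-season dates and the 1.5 m of ablation are made up:

    import numpy as np
    import pandas as pd

    threshold_ablation = -0.0002  # m per hour, as in the patch

    t = pd.date_range('2021-01-01', '2021-12-31', freq='h')
    z_pt = pd.Series(0.0, index=t)                    # stable winter surface
    melt = (t >= '2021-06-10') & (t <= '2021-08-20')
    z_pt[melt] = np.linspace(0, -1.5, melt.sum())     # steady summer lowering
    z_pt[t > '2021-08-20'] = -1.5

    smoothed = z_pt.rolling('14D', center=True, min_periods=1).mean()
    smoothed = smoothed.rolling('14D', center=True, min_periods=1).mean()
    ind_ablation = np.logical_and(smoothed.diff().values < threshold_ablation,
                                  np.isin(smoothed.index.month, [6, 7, 8, 9]))
    print(t[ind_ablation].min(), t[ind_ablation].max())  # roughly the prescribed melt season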