From e77de0abf38b6767cce1f934a9bf417a4b730c40 Mon Sep 17 00:00:00 2001
From: Baptiste Vandecrux <35140661+BaptisteVandecrux@users.noreply.github.com>
Date: Mon, 1 Jul 2024 09:12:01 +0200
Subject: [PATCH] Feature/smoothing and extrapolating gps coordinates (#271)

* rework get_l2tol3 and l2tol3
- both functions now take the station_configuration config file as input
- added some error handling in l2tol3.py for when variables required for processing are not available
- breaks in station locations are now listed in the station_configurations file
- the smoothed lat/lon/alt between two breaks (i.e. between two station relocations) is now linear and no longer uses lowess

* removed statsmodels from setup and environment

* fixed arguments in get_l2tol3.py and logger in join_l3

* fixed level_2 file not found case

* removed old lat/lon/alt attributes, replaced by lat_avg/lon_avg/alt_avg

* fixed issues
---
 environment.yml                        |   1 -
 setup.py                               |   2 +-
 src/pypromice/process/L2toL3.py        | 251 ++++++++++++++-----------
 src/pypromice/process/get_l2tol3.py    |   8 +-
 src/pypromice/process/join_l3.py       |  13 +-
 src/pypromice/process/write.py         |  99 +++++++---
 src/pypromice/qc/github_data_issues.py |  11 +-
 src/pypromice/qc/persistence.py        |   4 +-
 8 files changed, 236 insertions(+), 153 deletions(-)

diff --git a/environment.yml b/environment.yml
index 96a569f4..b982311f 100644
--- a/environment.yml
+++ b/environment.yml
@@ -72,7 +72,6 @@ dependencies:
   - setuptools=68.2.2=py38h06a4308_0
   - six=1.16.0=pyh6c4a22f_0
   - sqlite=3.41.2=h5eee18b_0
-  - statsmodels=0.13.2=py39h2bbff1b_0
   - tbb=2021.8.0=hdb19cb5_0
   - threadpoolctl=3.4.0=pyhc1e730c_0
   - tk=8.6.12=h1ccaba5_0
diff --git a/setup.py b/setup.py
index ce8c9bd4..52a9b216 100644
--- a/setup.py
+++ b/setup.py
@@ -35,7 +35,7 @@
         "pypromice.qc.percentiles": ["thresholds.csv"],
         "pypromice.postprocess": ["station_configurations.toml", "positions_seed.csv"],
     },
-    install_requires=['numpy~=1.23', 'pandas>=1.5.0', 'xarray>=2022.6.0', 'toml', 'scipy>=1.9.0', 'Bottleneck', 'netcdf4', 'pyDataverse==0.3.1', 'eccodes', 'scikit-learn>=1.1.0', 'statsmodels==0.13.2'],
+    install_requires=['numpy~=1.23', 'pandas>=1.5.0', 'xarray>=2022.6.0', 'toml', 'scipy>=1.9.0', 'Bottleneck', 'netcdf4', 'pyDataverse==0.3.1', 'eccodes', 'scikit-learn>=1.1.0'],
 #    extras_require={'postprocess': ['eccodes','scikit-learn>=1.1.0']},
     entry_points={
     'console_scripts': [
diff --git a/src/pypromice/process/L2toL3.py b/src/pypromice/process/L2toL3.py
index ee7b32a8..f265d9b2 100755
--- a/src/pypromice/process/L2toL3.py
+++ b/src/pypromice/process/L2toL3.py
@@ -5,14 +5,14 @@
 import pandas as pd
 import numpy as np
 import xarray as xr
-from statsmodels.nonparametric.smoothers_lowess import lowess
+import toml
 from sklearn.linear_model import LinearRegression
 from pypromice.qc.github_data_issues import adjustData
 from scipy.interpolate import interp1d
 import logging, os
 logger = logging.getLogger(__name__)

-def toL3(L2, T_0=273.15):
+def toL3(L2, config_folder='../aws-l0/metadata/station_configurations/', T_0=273.15):
     '''Process one Level 2 (L2) product to Level 3 (L3) meaning calculating all
     derived variables:
         - Turbulent fluxes
@@ -34,48 +34,68 @@
     T_100 = _getTempK(T_0)                          # Get steam point temperature as K

     # Turbulent heat flux calculation
-    # Upper boom bulk calculation
-    T_h_u = ds['t_u'].copy()                        # Copy for processing
-    p_h_u = ds['p_u'].copy()
-    WS_h_u = ds['wspd_u'].copy()
-    RH_cor_h_u = ds['rh_u_cor'].copy()
-    Tsurf_h = ds['t_surf'].copy()                   # T surf from derived upper boom product. TODO is this okay to use with lower boom parameters?
-    z_WS_u = ds['z_boom_u'].copy() + 0.4            # Get height of Anemometer
-    z_T_u = ds['z_boom_u'].copy() - 0.1             # Get height of thermometer
+    if ('t_u' in ds.keys()) and \
+        ('p_u' in ds.keys()) and \
+            ('rh_u_cor' in ds.keys()):
+        # Upper boom bulk calculation
+        T_h_u = ds['t_u'].copy()                    # Copy for processing
+        p_h_u = ds['p_u'].copy()
+        RH_cor_h_u = ds['rh_u_cor'].copy()
+
+        q_h_u = calcSpHumid(T_0, T_100, T_h_u, p_h_u, RH_cor_h_u)    # Calculate specific humidity
+        if ('wspd_u' in ds.keys()) and \
+            ('t_surf' in ds.keys()) and \
+                ('z_boom_u' in ds.keys()):
+            WS_h_u = ds['wspd_u'].copy()
+            Tsurf_h = ds['t_surf'].copy()           # T surf from derived upper boom product. TODO is this okay to use with lower boom parameters?
+            z_WS_u = ds['z_boom_u'].copy() + 0.4    # Get height of anemometer
+            z_T_u = ds['z_boom_u'].copy() - 0.1     # Get height of thermometer
+
+            if not ds.attrs['bedrock']:
+                SHF_h_u, LHF_h_u= calcHeatFlux(T_0, T_h_u, Tsurf_h, WS_h_u,    # Calculate latent and sensible heat fluxes
+                                               z_WS_u, z_T_u, q_h_u, p_h_u)
-    q_h_u = calcSpHumid(T_0, T_100, T_h_u, p_h_u, RH_cor_h_u)    # Calculate specific humidity
-
-    if not ds.attrs['bedrock']:
-        SHF_h_u, LHF_h_u= calcHeatFlux(T_0, T_h_u, Tsurf_h, WS_h_u,    # Calculate latent and sensible heat fluxes
-                                       z_WS_u, z_T_u, q_h_u, p_h_u)
-
-        ds['dshf_u'] = (('time'), SHF_h_u.data)
-        ds['dlhf_u'] = (('time'), LHF_h_u.data)
+                ds['dshf_u'] = (('time'), SHF_h_u.data)
+                ds['dlhf_u'] = (('time'), LHF_h_u.data)
+        else:
+            logger.info('wspd_u, t_surf or z_boom_u missing, cannot calculate turbulent heat fluxes')
-    q_h_u = 1000 * q_h_u                            # Convert sp.humid from kg/kg to g/kg
-    ds['qh_u'] = (('time'), q_h_u.data)
+        q_h_u = 1000 * q_h_u                        # Convert sp.humid from kg/kg to g/kg
+        ds['qh_u'] = (('time'), q_h_u.data)
+    else:
+        logger.info('t_u, p_u or rh_u_cor missing, cannot calculate turbulent heat fluxes')

     # Lower boom bulk calculation
     if ds.attrs['number_of_booms']==2:
-        T_h_l = ds['t_l'].copy()                    # Copy for processing
-        p_h_l = ds['p_l'].copy()
-        WS_h_l = ds['wspd_l'].copy()
-        RH_cor_h_l = ds['rh_l_cor'].copy()
-        z_WS_l = ds['z_boom_l'].copy() + 0.4        # Get height of anemometer
-        z_T_l = ds['z_boom_l'].copy() - 0.1         # Get height of thermometer
+        if ('t_l' in ds.keys()) and \
+            ('p_l' in ds.keys()) and \
+                ('rh_l_cor' in ds.keys()):
+            T_h_l = ds['t_l'].copy()                # Copy for processing
+            p_h_l = ds['p_l'].copy()
+            RH_cor_h_l = ds['rh_l_cor'].copy()
+
+            q_h_l = calcSpHumid(T_0, T_100, T_h_l, p_h_l, RH_cor_h_l)    # Calculate sp.humidity
+
+            if ('wspd_l' in ds.keys()) and \
+                ('t_surf' in ds.keys()) and \
+                    ('z_boom_l' in ds.keys()):
+                z_WS_l = ds['z_boom_l'].copy() + 0.4    # Get height of anemometer
+                z_T_l = ds['z_boom_l'].copy() - 0.1     # Get height of thermometer
+                WS_h_l = ds['wspd_l'].copy()
+                if not ds.attrs['bedrock']:
+                    SHF_h_l, LHF_h_l= calcHeatFlux(T_0, T_h_l, Tsurf_h, WS_h_l,    # Calculate latent and sensible heat fluxes
+                                                   z_WS_l, z_T_l, q_h_l, p_h_l)
-        q_h_l = calcSpHumid(T_0, T_100, T_h_l, p_h_l, RH_cor_h_l)    # Calculate sp.humidity
-
-        if not ds.attrs['bedrock']:
-            SHF_h_l, LHF_h_l= calcHeatFlux(T_0, T_h_l, Tsurf_h, WS_h_l,    # Calculate latent and sensible heat fluxes
-                                           z_WS_l, z_T_l, q_h_l, p_h_l)
-
-            ds['dshf_l'] = (('time'), SHF_h_l.data)
-            ds['dlhf_l'] = (('time'), LHF_h_l.data)
-        q_h_l = 1000 * q_h_l                        # Convert sp.humid from kg/kg to g/kg
-
-        ds['qh_l'] = (('time'), q_h_l.data)
+                    ds['dshf_l'] = (('time'), SHF_h_l.data)
+                    ds['dlhf_l'] = (('time'), LHF_h_l.data)
+                else:
+                    logger.info('wspd_l, t_surf or z_boom_l missing, cannot calculate turbulent heat fluxes')
+            q_h_l = 1000 * q_h_l                    # Convert sp.humid from kg/kg to g/kg
+            ds['qh_l'] = (('time'), q_h_l.data)
+        else:
+            logger.info('t_l, p_l or rh_l_cor missing, cannot calculate turbulent heat fluxes')
+
     # Smoothing and inter/extrapolation of GPS coordinates
     for var in ['gps_lat', 'gps_lon', 'gps_alt']:
         ds[var.replace('gps_','')] = gpsCoordinatePostprocessing(ds, var)

@@ -762,21 +782,18 @@ def interpolate_temperature(dates, depth_cor, temp, depth=10, min_diff_to_depth=

     return df_interp

-def gpsCoordinatePostprocessing(ds, var):
+def gpsCoordinatePostprocessing(ds, var, config_folder='../aws-l0/metadata/station_configurations/'):
     # saving the static value of 'lat','lon' or 'alt' stored in attribute
     # as it might be the only coordinate available for certain stations (e.g. bedrock)
     var_out = var.replace('gps_','')
-
-    if var_out == 'alt':
-        if 'altitude' in list(ds.attrs.keys()):
-            static_value = float(ds.attrs['altitude'])
-        else:
-            print('no standard altitude for', ds.attrs['station_id'])
-            static_value = np.nan
-    elif var_out == 'lat':
-        static_value = float(ds.attrs['latitude'])
-    elif var_out == 'lon':
-        static_value = float(ds.attrs['longitude'])
+    coord_names = {'alt':'altitude', 'lat':'latitude','lon':'longitude'}
+
+    if var_out+'_avg' in list(ds.attrs.keys()):
+        static_value = float(ds.attrs[var_out+'_avg'])
+    elif coord_names[var_out] in list(ds.attrs.keys()):
+        static_value = float(ds.attrs[coord_names[var_out]])
+    else:
+        static_value = np.nan

     # if there is no gps observations, then we use the static value repeated
     # for each time stamp
@@ -788,16 +805,22 @@ def gpsCoordinatePostprocessing(ds, var):
         print('no',var,'at',ds.attrs['station_id'])
         return ('time', np.ones_like(ds['t_u'].data)*static_value)

-    # here we detect potential relocation of the station in the form of a
-    # break in the general trend of the latitude, longitude and altitude
-    # in the future, this could/should be listed in an external file to
-    # avoid missed relocations or sensor issues interpreted as a relocation
-    if var == 'gps_alt':
-        _, breaks = find_breaks(ds[var].to_series(), alpha=8)
-    else:
-        _, breaks = find_breaks(ds[var].to_series(), alpha=6)
+    # fetching the station relocation dates at which the coordinates will/should
+    # have a break
+    config_file = config_folder +"/" + ds.attrs['station_id'] + ".toml"
+    with open(config_file, "r") as f:
+        config_data = toml.load(f)
+
+    # Extract station relocations from the TOML data
+    station_relocations = config_data.get("station_relocation", [])

-    # smoothing and inter/extrapolation of the coordinate
+    # Convert the ISO8601 strings to pandas datetime objects
+    breaks = [pd.to_datetime(date_str) for date_str in station_relocations]
+    if len(breaks)==0:
+        logger.info('processing '+var+' without relocation')
+    else:
+        logger.info('processing '+var+' with relocation on ' + ', '.join([br.strftime('%Y-%m-%dT%H:%M:%S') for br in breaks]))
+
     return ('time', piecewise_smoothing_and_interpolation(ds[var].to_series(), breaks))


@@ -1047,75 +1070,77 @@ def calcSpHumid(T_0, T_100, T_h, p_h, RH_cor_h, es_0=6.1071, es_100=1013.246, ep

     # Convert to kg/kg
     return RH_cor_h * q_sat / 100
-
-def find_breaks(df,alpha):
-    '''Detects potential relocation of the station from the GPS measurements.
-    The code first makes a forward linear interpolation of the coordinates and
-    then looks for important jumps in latitude, longitude and altitude. The jumps
-    that are higher than a given threshold (expressed as a multiple of the
-    standard deviation) are mostly caused by the station being moved during
-    maintenance. To avoid misclassification, only the jumps detected in May-Sept.
-    are kept.
-
-    Parameters
-    ----------
-    df : pandas.Series
-        series of observed latitude, longitude or elevation
-    alpha: float
-        coefficient to be applied to the the standard deviation of the daily
-        coordinate fluctuation
-    '''
-    diff = df.resample('D').median().interpolate(
-        method='linear', limit_area='inside', limit_direction='forward').diff()
-    thresh = diff.std() * alpha
-    list_diff = diff.loc[diff.abs()>thresh].reset_index()
-    list_diff = list_diff.loc[list_diff.time.dt.month.isin([5,6,7,8,9])]
-    list_diff['year']=list_diff.time.dt.year
-    list_diff=list_diff.groupby('year').max()
-    return diff, [None]+list_diff.time.to_list()+[None]
-
-
 def piecewise_smoothing_and_interpolation(df_in, breaks):
-    '''Smoothes, inter- or extrapolate the gps observations. The processing is
-    done piecewise so that each period between station relocation are done
-    separately (no smoothing of the jump due to relocation). Locally Weighted
-    Scatterplot Smoothing (lowess) is then used to smooth the available
-    observations. Then this smoothed curve is interpolated linearly over internal
-    gaps. Eventually, this interpolated curve is extrapolated linearly for
-    timestamps before the first valid measurement and after the last valid
-    measurement.
+    '''Smooths, interpolates or extrapolates the GPS observations. The processing
+    is done piecewise so that each period between station relocations is treated
+    separately (no smoothing of the jump due to relocation). Piecewise linear
+    regression is then used to smooth the available observations. Then this
+    smoothed curve is interpolated linearly over internal gaps. Finally, this
+    interpolated curve is extrapolated linearly for timestamps before the first
+    valid measurement and after the last valid measurement.

     Parameters
     ----------
     df_in : pandas.Series
-        series of observed latitude, longitude or elevation
+        Series of observed latitude, longitude or elevation with datetime index.
     breaks: list
         List of timestamps of station relocation. First and last item should be
         None so that they can be used in slice(breaks[i], breaks[i+1])
+
+    Returns
+    -------
+    np.ndarray
+        Smoothed and interpolated values corresponding to the input series.
''' - df_all = pd.Series() # dataframe gathering all the smoothed pieces - for i in range(len(breaks)-1): + df_all = pd.Series(dtype=float) # Initialize an empty Series to gather all smoothed pieces + breaks = [None] + breaks + [None] + for i in range(len(breaks) - 1): df = df_in.loc[slice(breaks[i], breaks[i+1])].copy() + + # Drop NaN values and calculate the number of segments based on valid data + df_valid = df.dropna() + if df_valid.shape[0] > 2: + # Fit linear regression model to the valid data range + x = pd.to_numeric(df_valid.index).values.reshape(-1, 1) + y = df_valid.values.reshape(-1, 1) + + model = LinearRegression() + model.fit(x, y) + + # Predict using the model for the entire segment range + x_pred = pd.to_numeric(df.index).values.reshape(-1, 1) + + y_pred = model.predict(x_pred) + df = pd.Series(y_pred.flatten(), index=df.index) + - y_sm = lowess(df, - pd.to_numeric(df.index), - is_sorted=True, frac=1/3, it=0, - ) - df.loc[df.notnull()] = y_sm[:,1] - df = df.interpolate(method='linear', limit_area='inside') - - last_valid_6_months = slice(df.last_valid_index()-pd.to_timedelta('180D'),None) - df.loc[last_valid_6_months] = (df.loc[last_valid_6_months].interpolate( axis=0, - method='spline',order=1, limit_direction='forward', fill_value="extrapolate")).values - - first_valid_6_months = slice(None, df.first_valid_index()+pd.to_timedelta('180D')) - df.loc[first_valid_6_months] = (df.loc[first_valid_6_months].interpolate( axis=0, - method='spline',order=1, limit_direction='backward', fill_value="extrapolate")).values - df_all=pd.concat((df_all, df)) - + # Update df_all with predicted values for the current segment + df_all = pd.concat([df_all, df]) + + # Fill internal gaps with linear interpolation + df_all = df_all.interpolate(method='linear', limit_area='inside') + + # Extrapolate for timestamps before the first valid measurement + first_valid_6_months = slice(None, df_in.first_valid_index() + pd.to_timedelta('180D')) + df_all.loc[first_valid_6_months] = ( + df_all.loc[first_valid_6_months].interpolate( + method='linear', limit_direction='backward', fill_value="extrapolate" + ) + ).values + + # Extrapolate for timestamps after the last valid measurement + last_valid_6_months = slice(df_in.last_valid_index() - pd.to_timedelta('180D'), None) + df_all.loc[last_valid_6_months] = ( + df_all.loc[last_valid_6_months].interpolate( + method='linear', limit_direction='forward', fill_value="extrapolate" + ) + ).values + + # Remove duplicate indices and return values as numpy array df_all = df_all[~df_all.index.duplicated(keep='first')] return df_all.values + def _calcAtmosDens(p_h, R_d, T_h, T_0): # TODO: check this shouldn't be in this step somewhere '''Calculate atmospheric density''' return 100 * p_h / R_d / (T_h + T_0) diff --git a/src/pypromice/process/get_l2tol3.py b/src/pypromice/process/get_l2tol3.py index 75be5b41..84a33e20 100644 --- a/src/pypromice/process/get_l2tol3.py +++ b/src/pypromice/process/get_l2tol3.py @@ -12,6 +12,8 @@ def parse_arguments_l2tol3(debug_args=None): parser = ArgumentParser(description="AWS L3 script for the processing L3 "+ "data from L2. 
An hourly, daily and monthly L3 "+ "data product is outputted to the defined output path") + parser.add_argument('-c', '--config_folder', type=str, required=True, + help='Path to folder with sites configuration (TOML) files') parser.add_argument('-i', '--inpath', type=str, required=True, help='Path to Level 2 .nc data file') parser.add_argument('-o', '--outpath', default=None, type=str, required=False, @@ -24,7 +26,7 @@ def parse_arguments_l2tol3(debug_args=None): args = parser.parse_args(args=debug_args) return args -def get_l2tol3(inpath, outpath, variables, metadata): +def get_l2tol3(config_folder, inpath, outpath, variables, metadata): logging.basicConfig( format="%(asctime)s; %(levelname)s; %(name)s; %(message)s", level=logging.INFO, @@ -46,7 +48,7 @@ def get_l2tol3(inpath, outpath, variables, metadata): l2.attrs['number_of_booms'] = int(l2.attrs['number_of_booms']) # Perform Level 3 processing - l3 = toL3(l2) + l3 = toL3(l2, config_folder) # Write Level 3 dataset to file if output directory given v = getVars(variables) @@ -59,7 +61,7 @@ def get_l2tol3(inpath, outpath, variables, metadata): def main(): args = parse_arguments_l2tol3() - _ = get_l2tol3(args.inpath, args.outpath, args.variables, args.metadata) + _ = get_l2tol3(args.config_folder, args.inpath, args.outpath, args.variables, args.metadata) if __name__ == "__main__": main() diff --git a/src/pypromice/process/join_l3.py b/src/pypromice/process/join_l3.py index e062911c..10a2c769 100644 --- a/src/pypromice/process/join_l3.py +++ b/src/pypromice/process/join_l3.py @@ -6,6 +6,11 @@ import numpy as np import pandas as pd import xarray as xr +logging.basicConfig( + format="%(asctime)s; %(levelname)s; %(name)s; %(message)s", + level=logging.INFO, + stream=sys.stdout, +) logger = logging.getLogger(__name__) def parse_arguments_joinl3(debug_args=None): @@ -202,16 +207,16 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me filepath = os.path.join(folder_l3, stid, stid+'_hour.nc') isNead = False - if station_info["project"].lower() in ["historical gc-net", "glaciobasis"]: + if station_info["project"].lower() in ["historical gc-net"]: filepath = os.path.join(folder_gcnet, stid+'.csv') isNead = True - if not os.path.isfile(filepath): + if not os.path.isfile(filepath): logger.info(stid+' is from an project '+folder_l3+' or '+folder_gcnet) continue l3, _ = loadArr(filepath, isNead) list_station_data.append((l3, station_info)) - + # Sort the list in reverse chronological order so that we start with the latest data sorted_list_station_data = sorted(list_station_data, key=lambda x: x[0].time.max(), reverse=True) sorted_stids = [info["stid"] for _, info in sorted_list_station_data] @@ -280,6 +285,8 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me # Assign site id + if not l3_merged: + logger.error('No level 2 data file found for '+site) l3_merged.attrs['site_id'] = site l3_merged.attrs['stations'] = ' '.join(sorted_stids) l3_merged.attrs['level'] = 'L3' diff --git a/src/pypromice/process/write.py b/src/pypromice/process/write.py index cc4d8fe5..e8d9e6a7 100644 --- a/src/pypromice/process/write.py +++ b/src/pypromice/process/write.py @@ -48,7 +48,7 @@ def prepare_and_write(dataset, outpath, vars_df=None, meta_dict=None, time='60mi if 'gps_lon' in d2.keys(): d2 = reformat_lon(d2) else: - logger.info('%s does not have gpd_lon'%name) + logger.info('%s does not have gps_lon'%name) # Add variable attributes and metadata if vars_df is None: @@ -100,7 +100,6 @@ def 
prepare_and_write(dataset, outpath, vars_df=None, meta_dict=None, time='60mi
         writeCSV(out_csv, d2, col_names)

         # Write to netcdf file
-        col_names = col_names + ['lat', 'lon', 'alt']
         writeNC(out_nc, d2, col_names)
         logger.info(f'Written to {out_csv}')
         logger.info(f'Written to {out_nc}')
@@ -245,16 +244,24 @@ def addMeta(ds, meta):
     ds : xarray.Dataset
         Dataset with metadata
     '''
-    if 'gps_lon' in ds.keys():
-        ds['lon'] = ds['gps_lon'].mean()
-        ds['lon'].attrs = ds['gps_lon'].attrs
-
-        ds['lat'] = ds['gps_lat'].mean()
-        ds['lat'].attrs = ds['gps_lat'].attrs
-
-        ds['alt'] = ds['gps_alt'].mean()
-        ds['alt'].attrs = ds['gps_alt'].attrs
-
+    if 'lon' in ds.keys():
+        # calculating average coordinates based on the extra/interpolated coords
+        for v in ['lat','lon','alt']:
+            ds.attrs[v+'_avg'] = ds[v].mean().item()
+        # dropping the less accurate standard coordinates given in the
+        # raw or tx config files
+        for v in ['latitude','longitude']:
+            if v in ds.attrs.keys():
+                del ds.attrs[v]
+    elif 'gps_lon' in ds.keys():
+        # calculating average coordinates based on the measured coords (can be gappy)
+        for v in ['gps_lat','gps_lon','gps_alt']:
+            ds.attrs[v+'_avg'] = ds[v].mean().item()
+        # dropping the less accurate standard coordinates given in the
+        # raw or tx config files
+        for v in ['latitude','longitude']:
+            if v in ds.attrs.keys():
+                del ds.attrs[v]

     # Attribute convention for data discovery
     # https://wiki.esipfed.org/Attribute_Convention_for_Data_Discovery_1-3
@@ -283,19 +290,61 @@ def addMeta(ds, meta):
     ds.attrs['date_metadata_modified'] = ds.attrs['date_created']
     ds.attrs['processing_level'] = ds.attrs['level'].replace('L','level ')
+
+    if 'lat' in ds.keys():
+        lat_min = ds['lat'].min().values
+        lat_max = ds['lat'].max().values
+    elif 'gps_lat' in ds.keys():
+        lat_min = ds['gps_lat'].min().values
+        lat_max = ds['gps_lat'].max().values
+    elif 'latitude' in ds.attrs.keys():
+        lat_min = ds.attrs['latitude']
+        lat_max = ds.attrs['latitude']
+    else:
+        lat_min = np.nan
+        lat_max = np.nan
+
+    if 'lon' in ds.keys():
+        lon_min = ds['lon'].min().values
+        lon_max = ds['lon'].max().values
+    elif 'gps_lon' in ds.keys():
+        lon_min = ds['gps_lon'].min().values
+        lon_max = ds['gps_lon'].max().values
+    elif 'longitude' in ds.attrs.keys():
+        lon_min = ds.attrs['longitude']
+        lon_max = ds.attrs['longitude']
+    else:
+        lon_min = np.nan
+        lon_max = np.nan
+
+    if 'alt' in ds.keys():
+        alt_min = ds['alt'].min().values
+        alt_max = ds['alt'].max().values
+    elif 'gps_alt' in ds.keys():
+        alt_min = ds['gps_alt'].min().values
+        alt_max = ds['gps_alt'].max().values
+    elif 'altitude' in ds.attrs.keys():
+        alt_min = ds.attrs['altitude']
+        alt_max = ds.attrs['altitude']
+    else:
+        alt_min = np.nan
+        alt_max = np.nan
+
     ds.attrs['geospatial_bounds'] = "POLYGON((" + \
-        f"{ds['lat'].min().values} {ds['lon'].min().values}, " + \
-        f"{ds['lat'].min().values} {ds['lon'].max().values}, " + \
-        f"{ds['lat'].max().values} {ds['lon'].max().values}, " + \
-        f"{ds['lat'].max().values} {ds['lon'].min().values}, " + \
-        f"{ds['lat'].min().values} {ds['lon'].min().values}))"
-
-    ds.attrs['geospatial_lat_min'] = str(ds['lat'].min().values)
-    ds.attrs['geospatial_lat_max'] = str(ds['lat'].max().values)
-    ds.attrs['geospatial_lon_min'] = str(ds['lon'].min().values)
-    ds.attrs['geospatial_lon_max'] = str(ds['lon'].max().values)
-    ds.attrs['geospatial_vertical_min'] = str(ds['alt'].min().values)
-    ds.attrs['geospatial_vertical_max'] = str(ds['alt'].max().values)
+        f"{lat_min} {lon_min}, " + \
+        f"{lat_min} {lon_max}, " + \
+        f"{lat_max} {lon_max}, " + \
+
f"{lat_max} {lon_min}, " + \ + f"{lat_min} {lon_min}))" + + ds.attrs['geospatial_lat_min'] = str(lat_min) + ds.attrs['geospatial_lat_max'] = str(lat_max) + ds.attrs['geospatial_lon_min'] = str(lon_min) + ds.attrs['geospatial_lon_max'] = str(lon_max) + ds.attrs['geospatial_vertical_min'] = str(alt_min) + ds.attrs['geospatial_vertical_max'] = str(alt_max) + ds.attrs['geospatial_vertical_positive'] = 'up' ds.attrs['time_coverage_start'] = str(ds['time'][0].values) ds.attrs['time_coverage_end'] = str(ds['time'][-1].values) @@ -387,4 +436,4 @@ def reformat_lon(dataset, exempt=['UWN', 'Roof_GEUS', 'Roof_PROMICE']): if 'gps_lon' not in dataset.keys(): return dataset dataset['gps_lon'] = dataset['gps_lon'] * -1 - return dataset \ No newline at end of file + return dataset diff --git a/src/pypromice/qc/github_data_issues.py b/src/pypromice/qc/github_data_issues.py index fd4d96e0..41603ed8 100644 --- a/src/pypromice/qc/github_data_issues.py +++ b/src/pypromice/qc/github_data_issues.py @@ -65,10 +65,10 @@ def flagNAN(ds_in, for v in varlist: if v in list(ds.keys()): - logger.info(f'---> flagging {t0} {t1} {v}') + logger.debug(f'---> flagging {t0} {t1} {v}') ds[v] = ds[v].where((ds['time'] < t0) | (ds['time'] > t1)) else: - logger.info(f'---> could not flag {v} not in dataset') + logger.debug(f'---> could not flag {v} not in dataset') return ds @@ -206,13 +206,14 @@ def adjustData(ds, t1 = pd.to_datetime(t1, utc=True).tz_localize(None) index_slice = dict(time=slice(t0, t1)) - if len(ds_out[var].loc[index_slice].time.time) == 0: + logger.info(f'---> {t0} {t1} {var} {func} {val}') logger.info("Time range does not intersect with dataset") continue - logger.info(f'---> {t0} {t1} {var} {func} {val}') - + else: + logger.debug(f'---> {t0} {t1} {var} {func} {val}') + if func == "add": ds_out[var].loc[index_slice] = ds_out[var].loc[index_slice].values + val # flagging adjusted values diff --git a/src/pypromice/qc/persistence.py b/src/pypromice/qc/persistence.py index 5f5d55f4..963ff786 100644 --- a/src/pypromice/qc/persistence.py +++ b/src/pypromice/qc/persistence.py @@ -83,7 +83,7 @@ def persistence_qc( mask = mask & (df[v]<99) n_masked = mask.sum() n_samples = len(mask) - logger.info( + logger.debug( f"Applying persistent QC in {v}. Filtering {n_masked}/{n_samples} samples" ) # setting outliers to NaN @@ -96,7 +96,7 @@ def persistence_qc( n_masked = mask.sum() n_samples = len(mask) - logger.info( + logger.debug( f"Applying persistent QC in {v}. Filtering {n_masked}/{n_samples} samples" ) # setting outliers to NaN