From e77de0abf38b6767cce1f934a9bf417a4b730c40 Mon Sep 17 00:00:00 2001
From: Baptiste Vandecrux <35140661+BaptisteVandecrux@users.noreply.github.com>
Date: Mon, 1 Jul 2024 09:12:01 +0200
Subject: [PATCH] Feature/smoothing and extrapolating gps coordinates (#271)

* rework get_l2tol3 and l2tol3
- both functions now take the station_configuration config file as input
- added some error handling in l2tol3.py for when variables required for processing are not available
- breaks in station locations are now listed in the station_configurations file
- the smoothed lat/lon/alt between two breaks (i.e. between two station relocations) is now linear and no longer uses lowess

* removed statsmodels from setup and environment

* fixed arguments in get_l2tol3.py and logger in join_l3

* fixed level_2 file not found case

* removed old lat/lon/alt attributes, replaced by lat_avg/lon_avg/alt_avg

* fixed issues
---
 environment.yml                        |   1 -
 setup.py                               |   2 +-
 src/pypromice/process/L2toL3.py        | 251 ++++++++++++++-----------
 src/pypromice/process/get_l2tol3.py    |   8 +-
 src/pypromice/process/join_l3.py       |  13 +-
 src/pypromice/process/write.py         |  99 +++++++---
 src/pypromice/qc/github_data_issues.py |  11 +-
 src/pypromice/qc/persistence.py        |   4 +-
 8 files changed, 236 insertions(+), 153 deletions(-)

diff --git a/environment.yml b/environment.yml
index 96a569f4..b982311f 100644
--- a/environment.yml
+++ b/environment.yml
@@ -72,7 +72,6 @@ dependencies:
   - setuptools=68.2.2=py38h06a4308_0
   - six=1.16.0=pyh6c4a22f_0
   - sqlite=3.41.2=h5eee18b_0
-  - statsmodels=0.13.2=py39h2bbff1b_0
   - tbb=2021.8.0=hdb19cb5_0
   - threadpoolctl=3.4.0=pyhc1e730c_0
   - tk=8.6.12=h1ccaba5_0
diff --git a/setup.py b/setup.py
index ce8c9bd4..52a9b216 100644
--- a/setup.py
+++ b/setup.py
@@ -35,7 +35,7 @@
         "pypromice.qc.percentiles": ["thresholds.csv"],
         "pypromice.postprocess": ["station_configurations.toml", "positions_seed.csv"],
     },
-    install_requires=['numpy~=1.23', 'pandas>=1.5.0', 'xarray>=2022.6.0', 'toml', 'scipy>=1.9.0', 'Bottleneck', 'netcdf4', 'pyDataverse==0.3.1', 'eccodes', 'scikit-learn>=1.1.0', 'statsmodels==0.13.2'],
+    install_requires=['numpy~=1.23', 'pandas>=1.5.0', 'xarray>=2022.6.0', 'toml', 'scipy>=1.9.0', 'Bottleneck', 'netcdf4', 'pyDataverse==0.3.1', 'eccodes', 'scikit-learn>=1.1.0'],
 #    extras_require={'postprocess': ['eccodes','scikit-learn>=1.1.0']},
     entry_points={
     'console_scripts': [
diff --git a/src/pypromice/process/L2toL3.py b/src/pypromice/process/L2toL3.py
index ee7b32a8..f265d9b2 100755
--- a/src/pypromice/process/L2toL3.py
+++ b/src/pypromice/process/L2toL3.py
@@ -5,14 +5,14 @@
 import pandas as pd
 import numpy as np
 import xarray as xr
-from statsmodels.nonparametric.smoothers_lowess import lowess
+import toml
 from sklearn.linear_model import LinearRegression
 from pypromice.qc.github_data_issues import adjustData
 from scipy.interpolate import interp1d
 import logging, os
 logger = logging.getLogger(__name__)

-def toL3(L2, T_0=273.15):
+def toL3(L2, config_folder='../aws-l0/metadata/station_configurations/', T_0=273.15):
     '''Process one Level 2 (L2) product to Level 3 (L3) meaning calculating all
     derived variables:
         - Turbulent fluxes
@@ -34,48 +34,68 @@
     T_100 = _getTempK(T_0)                          # Get steam point temperature as K

     # Turbulent heat flux calculation
-    # Upper boom bulk calculation
-    T_h_u = ds['t_u'].copy()                        # Copy for processing
-    p_h_u = ds['p_u'].copy()
-    WS_h_u = ds['wspd_u'].copy()
-    RH_cor_h_u = ds['rh_u_cor'].copy()
-    Tsurf_h = ds['t_surf'].copy()                   # T surf from derived upper boom product. TODO is this okay to use with lower boom parameters?
-    z_WS_u = ds['z_boom_u'].copy() + 0.4            # Get height of Anemometer
-    z_T_u = ds['z_boom_u'].copy() - 0.1             # Get height of thermometer
+    if ('t_u' in ds.keys()) and \
+        ('p_u' in ds.keys()) and \
+            ('rh_u_cor' in ds.keys()):
+        # Upper boom bulk calculation
+        T_h_u = ds['t_u'].copy()                    # Copy for processing
+        p_h_u = ds['p_u'].copy()
+        RH_cor_h_u = ds['rh_u_cor'].copy()
+
+        q_h_u = calcSpHumid(T_0, T_100, T_h_u, p_h_u, RH_cor_h_u)    # Calculate specific humidity
+        if ('wspd_u' in ds.keys()) and \
+            ('t_surf' in ds.keys()) and \
+                ('z_boom_u' in ds.keys()):
+            WS_h_u = ds['wspd_u'].copy()
+            Tsurf_h = ds['t_surf'].copy()           # T surf from derived upper boom product. TODO is this okay to use with lower boom parameters?
+            z_WS_u = ds['z_boom_u'].copy() + 0.4    # Get height of anemometer
+            z_T_u = ds['z_boom_u'].copy() - 0.1     # Get height of thermometer
+
+            if not ds.attrs['bedrock']:
+                SHF_h_u, LHF_h_u= calcHeatFlux(T_0, T_h_u, Tsurf_h, WS_h_u,    # Calculate latent and sensible heat fluxes
+                                               z_WS_u, z_T_u, q_h_u, p_h_u)
-    q_h_u = calcSpHumid(T_0, T_100, T_h_u, p_h_u, RH_cor_h_u)    # Calculate specific humidity
-
-    if not ds.attrs['bedrock']:
-        SHF_h_u, LHF_h_u= calcHeatFlux(T_0, T_h_u, Tsurf_h, WS_h_u,    # Calculate latent and sensible heat fluxes
-                                       z_WS_u, z_T_u, q_h_u, p_h_u)
-
-        ds['dshf_u'] = (('time'), SHF_h_u.data)
-        ds['dlhf_u'] = (('time'), LHF_h_u.data)
+                ds['dshf_u'] = (('time'), SHF_h_u.data)
+                ds['dlhf_u'] = (('time'), LHF_h_u.data)
+        else:
+            logger.info('wspd_u, t_surf or z_boom_u missing, cannot calculate turbulent heat fluxes')
-    q_h_u = 1000 * q_h_u                            # Convert sp.humid from kg/kg to g/kg
-    ds['qh_u'] = (('time'), q_h_u.data)
+        q_h_u = 1000 * q_h_u                        # Convert sp.humid from kg/kg to g/kg
+        ds['qh_u'] = (('time'), q_h_u.data)
+    else:
+        logger.info('t_u, p_u or rh_u_cor missing, cannot calculate turbulent heat fluxes')

     # Lower boom bulk calculation
     if ds.attrs['number_of_booms']==2:
-        T_h_l = ds['t_l'].copy()                    # Copy for processing
-        p_h_l = ds['p_l'].copy()
-        WS_h_l = ds['wspd_l'].copy()
-        RH_cor_h_l = ds['rh_l_cor'].copy()
-        z_WS_l = ds['z_boom_l'].copy() + 0.4        # Get height of anemometer
-        z_T_l = ds['z_boom_l'].copy() - 0.1         # Get height of thermometer
+        if ('t_l' in ds.keys()) and \
+            ('p_l' in ds.keys()) and \
+                ('rh_l_cor' in ds.keys()):
+            T_h_l = ds['t_l'].copy()                # Copy for processing
+            p_h_l = ds['p_l'].copy()
+            RH_cor_h_l = ds['rh_l_cor'].copy()
+
+            q_h_l = calcSpHumid(T_0, T_100, T_h_l, p_h_l, RH_cor_h_l)    # Calculate sp.humidity
+
+            if ('wspd_l' in ds.keys()) and \
+                ('t_surf' in ds.keys()) and \
+                    ('z_boom_l' in ds.keys()):
+                z_WS_l = ds['z_boom_l'].copy() + 0.4    # Get height of anemometer
+                z_T_l = ds['z_boom_l'].copy() - 0.1     # Get height of thermometer
+                WS_h_l = ds['wspd_l'].copy()
+                if not ds.attrs['bedrock']:
+                    SHF_h_l, LHF_h_l= calcHeatFlux(T_0, T_h_l, Tsurf_h, WS_h_l,    # Calculate latent and sensible heat fluxes
+                                                   z_WS_l, z_T_l, q_h_l, p_h_l)
-        q_h_l = calcSpHumid(T_0, T_100, T_h_l, p_h_l, RH_cor_h_l)    # Calculate sp.humidity
-
-        if not ds.attrs['bedrock']:
-            SHF_h_l, LHF_h_l= calcHeatFlux(T_0, T_h_l, Tsurf_h, WS_h_l,    # Calculate latent and sensible heat fluxes
-                                           z_WS_l, z_T_l, q_h_l, p_h_l)
-
-            ds['dshf_l'] = (('time'), SHF_h_l.data)
-            ds['dlhf_l'] = (('time'), LHF_h_l.data)
-        q_h_l = 1000 * q_h_l                        # Convert sp.humid from kg/kg to g/kg
-
-        ds['qh_l'] = (('time'), q_h_l.data)
+                    ds['dshf_l'] = (('time'), SHF_h_l.data)
+                    ds['dlhf_l'] = (('time'), LHF_h_l.data)
+                else:
+                    logger.info('wspd_l, t_surf or z_boom_l missing, cannot calculate turbulent heat fluxes')
+            q_h_l = 1000 * q_h_l                    # Convert sp.humid from kg/kg to g/kg
+            ds['qh_l'] = (('time'), q_h_l.data)
+        else:
+            logger.info('t_l, p_l or rh_l_cor missing, cannot calculate turbulent heat fluxes')
+
     # Smoothing and inter/extrapolation of GPS coordinates
     for var in ['gps_lat', 'gps_lon', 'gps_alt']:
         ds[var.replace('gps_','')] = gpsCoordinatePostprocessing(ds, var)

@@ -762,21 +782,18 @@ def interpolate_temperature(dates, depth_cor, temp, depth=10, min_diff_to_depth=

     return df_interp

-def gpsCoordinatePostprocessing(ds, var):
+def gpsCoordinatePostprocessing(ds, var, config_folder='../aws-l0/metadata/station_configurations/'):
     # saving the static value of 'lat','lon' or 'alt' stored in attribute
     # as it might be the only coordinate available for certain stations (e.g. bedrock)
     var_out = var.replace('gps_','')
-
-    if var_out == 'alt':
-        if 'altitude' in list(ds.attrs.keys()):
-            static_value = float(ds.attrs['altitude'])
-        else:
-            print('no standard altitude for', ds.attrs['station_id'])
-            static_value = np.nan
-    elif var_out == 'lat':
-        static_value = float(ds.attrs['latitude'])
-    elif var_out == 'lon':
-        static_value = float(ds.attrs['longitude'])
+    coord_names = {'alt':'altitude', 'lat':'latitude','lon':'longitude'}
+
+    if var_out+'_avg' in list(ds.attrs.keys()):
+        static_value = float(ds.attrs[var_out+'_avg'])
+    elif coord_names[var_out] in list(ds.attrs.keys()):
+        static_value = float(ds.attrs[coord_names[var_out]])
+    else:
+        static_value = np.nan

     # if there is no gps observations, then we use the static value repeated
     # for each time stamp
@@ -788,16 +805,22 @@ def gpsCoordinatePostprocessing(ds, var):
         print('no',var,'at',ds.attrs['station_id'])
         return ('time', np.ones_like(ds['t_u'].data)*static_value)

-    # here we detect potential relocation of the station in the form of a
-    # break in the general trend of the latitude, longitude and altitude
-    # in the future, this could/should be listed in an external file to
-    # avoid missed relocations or sensor issues interpreted as a relocation
-    if var == 'gps_alt':
-        _, breaks = find_breaks(ds[var].to_series(), alpha=8)
-    else:
-        _, breaks = find_breaks(ds[var].to_series(), alpha=6)
+    # fetching the station relocation dates at which the coordinates will/should
+    # have a break
+    config_file = config_folder +"/" + ds.attrs['station_id'] + ".toml"
+    with open(config_file, "r") as f:
+        config_data = toml.load(f)
+
+    # Extract station relocations from the TOML data
+    station_relocations = config_data.get("station_relocation", [])

-    # smoothing and inter/extrapolation of the coordinate
+    # Convert the ISO8601 strings to pandas datetime objects
+    breaks = [pd.to_datetime(date_str) for date_str in station_relocations]
+    if len(breaks)==0:
+        logger.info('processing '+var+' without relocation')
+    else:
+        logger.info('processing '+var+' with relocation on ' + ', '.join([br.strftime('%Y-%m-%dT%H:%M:%S') for br in breaks]))
+
     return ('time', piecewise_smoothing_and_interpolation(ds[var].to_series(), breaks))


@@ -1047,75 +1070,77 @@ def calcSpHumid(T_0, T_100, T_h, p_h, RH_cor_h, es_0=6.1071, es_100=1013.246, ep

     # Convert to kg/kg
     return RH_cor_h * q_sat / 100
-
-def find_breaks(df,alpha):
-    '''Detects potential relocation of the station from the GPS measurements.
-    The code first makes a forward linear interpolation of the coordinates and
-    then looks for important jumps in latitude, longitude and altitude. The jumps
-    that are higher than a given threshold (expressed as a multiple of the
-    standard deviation) are mostly caused by the station being moved during
-    maintenance. To avoid misclassification, only the jumps detected in May-Sept.
-    are kept.
-
-    Parameters
-    ----------
-    df : pandas.Series
-        series of observed latitude, longitude or elevation
-    alpha: float
-        coefficient to be applied to the the standard deviation of the daily
-        coordinate fluctuation
-    '''
-    diff = df.resample('D').median().interpolate(
-        method='linear', limit_area='inside', limit_direction='forward').diff()
-    thresh = diff.std() * alpha
-    list_diff = diff.loc[diff.abs()>thresh].reset_index()
-    list_diff = list_diff.loc[list_diff.time.dt.month.isin([5,6,7,8,9])]
-    list_diff['year']=list_diff.time.dt.year
-    list_diff=list_diff.groupby('year').max()
-    return diff, [None]+list_diff.time.to_list()+[None]
-
-
 def piecewise_smoothing_and_interpolation(df_in, breaks):
-    '''Smoothes, inter- or extrapolate the gps observations. The processing is
-    done piecewise so that each period between station relocation are done
-    separately (no smoothing of the jump due to relocation). Locally Weighted
-    Scatterplot Smoothing (lowess) is then used to smooth the available
-    observations. Then this smoothed curve is interpolated linearly over internal
-    gaps. Eventually, this interpolated curve is extrapolated linearly for
-    timestamps before the first valid measurement and after the last valid
-    measurement.
+    '''Smooths, interpolates or extrapolates the GPS observations. The processing
+    is done piecewise so that each period between station relocations is treated
+    separately (no smoothing of the jump due to relocation). Piecewise linear
+    regression is then used to smooth the available observations. Then this
+    smoothed curve is interpolated linearly over internal gaps. Finally, this
+    interpolated curve is extrapolated linearly for timestamps before the first
+    valid measurement and after the last valid measurement.

     Parameters
     ----------
     df_in : pandas.Series
-        series of observed latitude, longitude or elevation
+        Series of observed latitude, longitude or elevation with datetime index.
     breaks: list
         List of timestamps of station relocation. First and last item should be
         None so that they can be used in slice(breaks[i], breaks[i+1])
+
+    Returns
+    -------
+    np.ndarray
+        Smoothed and interpolated values corresponding to the input series.
''' - df_all = pd.Series() # dataframe gathering all the smoothed pieces - for i in range(len(breaks)-1): + df_all = pd.Series(dtype=float) # Initialize an empty Series to gather all smoothed pieces + breaks = [None] + breaks + [None] + for i in range(len(breaks) - 1): df = df_in.loc[slice(breaks[i], breaks[i+1])].copy() + + # Drop NaN values and calculate the number of segments based on valid data + df_valid = df.dropna() + if df_valid.shape[0] > 2: + # Fit linear regression model to the valid data range + x = pd.to_numeric(df_valid.index).values.reshape(-1, 1) + y = df_valid.values.reshape(-1, 1) + + model = LinearRegression() + model.fit(x, y) + + # Predict using the model for the entire segment range + x_pred = pd.to_numeric(df.index).values.reshape(-1, 1) + + y_pred = model.predict(x_pred) + df = pd.Series(y_pred.flatten(), index=df.index) + - y_sm = lowess(df, - pd.to_numeric(df.index), - is_sorted=True, frac=1/3, it=0, - ) - df.loc[df.notnull()] = y_sm[:,1] - df = df.interpolate(method='linear', limit_area='inside') - - last_valid_6_months = slice(df.last_valid_index()-pd.to_timedelta('180D'),None) - df.loc[last_valid_6_months] = (df.loc[last_valid_6_months].interpolate( axis=0, - method='spline',order=1, limit_direction='forward', fill_value="extrapolate")).values - - first_valid_6_months = slice(None, df.first_valid_index()+pd.to_timedelta('180D')) - df.loc[first_valid_6_months] = (df.loc[first_valid_6_months].interpolate( axis=0, - method='spline',order=1, limit_direction='backward', fill_value="extrapolate")).values - df_all=pd.concat((df_all, df)) - + # Update df_all with predicted values for the current segment + df_all = pd.concat([df_all, df]) + + # Fill internal gaps with linear interpolation + df_all = df_all.interpolate(method='linear', limit_area='inside') + + # Extrapolate for timestamps before the first valid measurement + first_valid_6_months = slice(None, df_in.first_valid_index() + pd.to_timedelta('180D')) + df_all.loc[first_valid_6_months] = ( + df_all.loc[first_valid_6_months].interpolate( + method='linear', limit_direction='backward', fill_value="extrapolate" + ) + ).values + + # Extrapolate for timestamps after the last valid measurement + last_valid_6_months = slice(df_in.last_valid_index() - pd.to_timedelta('180D'), None) + df_all.loc[last_valid_6_months] = ( + df_all.loc[last_valid_6_months].interpolate( + method='linear', limit_direction='forward', fill_value="extrapolate" + ) + ).values + + # Remove duplicate indices and return values as numpy array df_all = df_all[~df_all.index.duplicated(keep='first')] return df_all.values + def _calcAtmosDens(p_h, R_d, T_h, T_0): # TODO: check this shouldn't be in this step somewhere '''Calculate atmospheric density''' return 100 * p_h / R_d / (T_h + T_0) diff --git a/src/pypromice/process/get_l2tol3.py b/src/pypromice/process/get_l2tol3.py index 75be5b41..84a33e20 100644 --- a/src/pypromice/process/get_l2tol3.py +++ b/src/pypromice/process/get_l2tol3.py @@ -12,6 +12,8 @@ def parse_arguments_l2tol3(debug_args=None): parser = ArgumentParser(description="AWS L3 script for the processing L3 "+ "data from L2. 
An hourly, daily and monthly L3 "+ "data product is outputted to the defined output path") + parser.add_argument('-c', '--config_folder', type=str, required=True, + help='Path to folder with sites configuration (TOML) files') parser.add_argument('-i', '--inpath', type=str, required=True, help='Path to Level 2 .nc data file') parser.add_argument('-o', '--outpath', default=None, type=str, required=False, @@ -24,7 +26,7 @@ def parse_arguments_l2tol3(debug_args=None): args = parser.parse_args(args=debug_args) return args -def get_l2tol3(inpath, outpath, variables, metadata): +def get_l2tol3(config_folder, inpath, outpath, variables, metadata): logging.basicConfig( format="%(asctime)s; %(levelname)s; %(name)s; %(message)s", level=logging.INFO, @@ -46,7 +48,7 @@ def get_l2tol3(inpath, outpath, variables, metadata): l2.attrs['number_of_booms'] = int(l2.attrs['number_of_booms']) # Perform Level 3 processing - l3 = toL3(l2) + l3 = toL3(l2, config_folder) # Write Level 3 dataset to file if output directory given v = getVars(variables) @@ -59,7 +61,7 @@ def get_l2tol3(inpath, outpath, variables, metadata): def main(): args = parse_arguments_l2tol3() - _ = get_l2tol3(args.inpath, args.outpath, args.variables, args.metadata) + _ = get_l2tol3(args.config_folder, args.inpath, args.outpath, args.variables, args.metadata) if __name__ == "__main__": main() diff --git a/src/pypromice/process/join_l3.py b/src/pypromice/process/join_l3.py index e062911c..10a2c769 100644 --- a/src/pypromice/process/join_l3.py +++ b/src/pypromice/process/join_l3.py @@ -6,6 +6,11 @@ import numpy as np import pandas as pd import xarray as xr +logging.basicConfig( + format="%(asctime)s; %(levelname)s; %(name)s; %(message)s", + level=logging.INFO, + stream=sys.stdout, +) logger = logging.getLogger(__name__) def parse_arguments_joinl3(debug_args=None): @@ -202,16 +207,16 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me filepath = os.path.join(folder_l3, stid, stid+'_hour.nc') isNead = False - if station_info["project"].lower() in ["historical gc-net", "glaciobasis"]: + if station_info["project"].lower() in ["historical gc-net"]: filepath = os.path.join(folder_gcnet, stid+'.csv') isNead = True - if not os.path.isfile(filepath): + if not os.path.isfile(filepath): logger.info(stid+' is from an project '+folder_l3+' or '+folder_gcnet) continue l3, _ = loadArr(filepath, isNead) list_station_data.append((l3, station_info)) - + # Sort the list in reverse chronological order so that we start with the latest data sorted_list_station_data = sorted(list_station_data, key=lambda x: x[0].time.max(), reverse=True) sorted_stids = [info["stid"] for _, info in sorted_list_station_data] @@ -280,6 +285,8 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me # Assign site id + if not l3_merged: + logger.error('No level 2 data file found for '+site) l3_merged.attrs['site_id'] = site l3_merged.attrs['stations'] = ' '.join(sorted_stids) l3_merged.attrs['level'] = 'L3' diff --git a/src/pypromice/process/write.py b/src/pypromice/process/write.py index cc4d8fe5..e8d9e6a7 100644 --- a/src/pypromice/process/write.py +++ b/src/pypromice/process/write.py @@ -48,7 +48,7 @@ def prepare_and_write(dataset, outpath, vars_df=None, meta_dict=None, time='60mi if 'gps_lon' in d2.keys(): d2 = reformat_lon(d2) else: - logger.info('%s does not have gpd_lon'%name) + logger.info('%s does not have gps_lon'%name) # Add variable attributes and metadata if vars_df is None: @@ -100,7 +100,6 @@ def 
prepare_and_write(dataset, outpath, vars_df=None, meta_dict=None, time='60mi
         writeCSV(out_csv, d2, col_names)

         # Write to netcdf file
-        col_names = col_names + ['lat', 'lon', 'alt']
         writeNC(out_nc, d2, col_names)
         logger.info(f'Written to {out_csv}')
         logger.info(f'Written to {out_nc}')
@@ -245,16 +244,24 @@ def addMeta(ds, meta):
     ds : xarray.Dataset
         Dataset with metadata
     '''
-    if 'gps_lon' in ds.keys():
-        ds['lon'] = ds['gps_lon'].mean()
-        ds['lon'].attrs = ds['gps_lon'].attrs
-
-        ds['lat'] = ds['gps_lat'].mean()
-        ds['lat'].attrs = ds['gps_lat'].attrs
-
-        ds['alt'] = ds['gps_alt'].mean()
-        ds['alt'].attrs = ds['gps_alt'].attrs
-
+    if 'lon' in ds.keys():
+        # calculating average coordinates based on the extra/interpolated coords
+        for v in ['lat','lon','alt']:
+            ds.attrs[v+'_avg'] = ds[v].mean().item()
+        # dropping the less accurate standard coordinates given in the
+        # raw or tx config files
+        for v in ['latitude','longitude']:
+            if v in ds.attrs.keys():
+                del ds.attrs[v]
+    elif 'gps_lon' in ds.keys():
+        # calculating average coordinates based on the measured coords (can be gappy)
+        for v in ['gps_lat','gps_lon','gps_alt']:
+            ds.attrs[v+'_avg'] = ds[v].mean().item()
+        # dropping the less accurate standard coordinates given in the
+        # raw or tx config files
+        for v in ['latitude','longitude']:
+            if v in ds.attrs.keys():
+                del ds.attrs[v]

     # Attribute convention for data discovery
     # https://wiki.esipfed.org/Attribute_Convention_for_Data_Discovery_1-3
@@ -283,19 +290,61 @@ def addMeta(ds, meta):
     ds.attrs['date_metadata_modified'] = ds.attrs['date_created']
     ds.attrs['processing_level'] = ds.attrs['level'].replace('L','level ')
+
+    if 'lat' in ds.keys():
+        lat_min = ds['lat'].min().values
+        lat_max = ds['lat'].max().values
+    elif 'gps_lat' in ds.keys():
+        lat_min = ds['gps_lat'].min().values
+        lat_max = ds['gps_lat'].max().values
+    elif 'latitude' in ds.attrs.keys():
+        lat_min = ds.attrs['latitude']
+        lat_max = ds.attrs['latitude']
+    else:
+        lat_min = np.nan
+        lat_max = np.nan
+
+    if 'lon' in ds.keys():
+        lon_min = ds['lon'].min().values
+        lon_max = ds['lon'].max().values
+    elif 'gps_lon' in ds.keys():
+        lon_min = ds['gps_lon'].min().values
+        lon_max = ds['gps_lon'].max().values
+    elif 'longitude' in ds.attrs.keys():
+        lon_min = ds.attrs['longitude']
+        lon_max = ds.attrs['longitude']
+    else:
+        lon_min = np.nan
+        lon_max = np.nan
+
+    if 'alt' in ds.keys():
+        alt_min = ds['alt'].min().values
+        alt_max = ds['alt'].max().values
+    elif 'gps_alt' in ds.keys():
+        alt_min = ds['gps_alt'].min().values
+        alt_max = ds['gps_alt'].max().values
+    elif 'altitude' in ds.attrs.keys():
+        alt_min = ds.attrs['altitude']
+        alt_max = ds.attrs['altitude']
+    else:
+        alt_min = np.nan
+        alt_max = np.nan
+
     ds.attrs['geospatial_bounds'] = "POLYGON((" + \
-        f"{ds['lat'].min().values} {ds['lon'].min().values}, " + \
-        f"{ds['lat'].min().values} {ds['lon'].max().values}, " + \
-        f"{ds['lat'].max().values} {ds['lon'].max().values}, " + \
-        f"{ds['lat'].max().values} {ds['lon'].min().values}, " + \
-        f"{ds['lat'].min().values} {ds['lon'].min().values}))"
-
-    ds.attrs['geospatial_lat_min'] = str(ds['lat'].min().values)
-    ds.attrs['geospatial_lat_max'] = str(ds['lat'].max().values)
-    ds.attrs['geospatial_lon_min'] = str(ds['lon'].min().values)
-    ds.attrs['geospatial_lon_max'] = str(ds['lon'].max().values)
-    ds.attrs['geospatial_vertical_min'] = str(ds['alt'].min().values)
-    ds.attrs['geospatial_vertical_max'] = str(ds['alt'].max().values)
+        f"{lat_min} {lon_min}, " + \
+        f"{lat_min} {lon_max}, " + \
+        f"{lat_max} {lon_max}, " + \
+
f"{lat_max} {lon_min}, " + \ + f"{lat_min} {lon_min}))" + + ds.attrs['geospatial_lat_min'] = str(lat_min) + ds.attrs['geospatial_lat_max'] = str(lat_max) + ds.attrs['geospatial_lon_min'] = str(lon_min) + ds.attrs['geospatial_lon_max'] = str(lon_max) + ds.attrs['geospatial_vertical_min'] = str(alt_min) + ds.attrs['geospatial_vertical_max'] = str(alt_max) + ds.attrs['geospatial_vertical_positive'] = 'up' ds.attrs['time_coverage_start'] = str(ds['time'][0].values) ds.attrs['time_coverage_end'] = str(ds['time'][-1].values) @@ -387,4 +436,4 @@ def reformat_lon(dataset, exempt=['UWN', 'Roof_GEUS', 'Roof_PROMICE']): if 'gps_lon' not in dataset.keys(): return dataset dataset['gps_lon'] = dataset['gps_lon'] * -1 - return dataset \ No newline at end of file + return dataset diff --git a/src/pypromice/qc/github_data_issues.py b/src/pypromice/qc/github_data_issues.py index fd4d96e0..41603ed8 100644 --- a/src/pypromice/qc/github_data_issues.py +++ b/src/pypromice/qc/github_data_issues.py @@ -65,10 +65,10 @@ def flagNAN(ds_in, for v in varlist: if v in list(ds.keys()): - logger.info(f'---> flagging {t0} {t1} {v}') + logger.debug(f'---> flagging {t0} {t1} {v}') ds[v] = ds[v].where((ds['time'] < t0) | (ds['time'] > t1)) else: - logger.info(f'---> could not flag {v} not in dataset') + logger.debug(f'---> could not flag {v} not in dataset') return ds @@ -206,13 +206,14 @@ def adjustData(ds, t1 = pd.to_datetime(t1, utc=True).tz_localize(None) index_slice = dict(time=slice(t0, t1)) - if len(ds_out[var].loc[index_slice].time.time) == 0: + logger.info(f'---> {t0} {t1} {var} {func} {val}') logger.info("Time range does not intersect with dataset") continue - logger.info(f'---> {t0} {t1} {var} {func} {val}') - + else: + logger.debug(f'---> {t0} {t1} {var} {func} {val}') + if func == "add": ds_out[var].loc[index_slice] = ds_out[var].loc[index_slice].values + val # flagging adjusted values diff --git a/src/pypromice/qc/persistence.py b/src/pypromice/qc/persistence.py index 5f5d55f4..963ff786 100644 --- a/src/pypromice/qc/persistence.py +++ b/src/pypromice/qc/persistence.py @@ -83,7 +83,7 @@ def persistence_qc( mask = mask & (df[v]<99) n_masked = mask.sum() n_samples = len(mask) - logger.info( + logger.debug( f"Applying persistent QC in {v}. Filtering {n_masked}/{n_samples} samples" ) # setting outliers to NaN @@ -96,7 +96,7 @@ def persistence_qc( n_masked = mask.sum() n_samples = len(mask) - logger.info( + logger.debug( f"Applying persistent QC in {v}. Filtering {n_masked}/{n_samples} samples" ) # setting outliers to NaN