Feature/smoothing and extrapolating gps coordinates #268

Merged
Changes from 10 commits
325 changes: 204 additions & 121 deletions src/pypromice/process/L2toL3.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/pypromice/process/aws.py
@@ -91,6 +91,7 @@ def getL1(self):
logger.info('Level 1 processing...')
self.L0 = [utilities.addBasicMeta(item, self.vars) for item in self.L0]
self.L1 = [toL1(item, self.vars) for item in self.L0]
self.L1.reverse()
self.L1A = reduce(xr.Dataset.combine_first, self.L1)

def getL2(self):
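The one-line addition above reverses self.L1 before the reduce(xr.Dataset.combine_first, self.L1) call. Because combine_first keeps the left-hand (already accumulated) values wherever datasets overlap, the first element of the list wins on shared timestamps, so reversing the list flips which file takes precedence. A minimal sketch of that precedence rule (illustrative only, not code from the PR):

```python
# Illustrative sketch: how list order changes the outcome of
# reduce(xr.Dataset.combine_first, ...). Not part of the PR.
from functools import reduce
import numpy as np
import xarray as xr

time = np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]")
older = xr.Dataset({"t_u": ("time", [1.0, np.nan])}, coords={"time": time})
newer = xr.Dataset({"t_u": ("time", [2.0, 3.0])}, coords={"time": time})

# The first dataset in the list keeps its values on overlapping timestamps.
print(reduce(xr.Dataset.combine_first, [older, newer])["t_u"].values)  # [1. 3.]
print(reduce(xr.Dataset.combine_first, [newer, older])["t_u"].values)  # [2. 3.]
```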
12 changes: 7 additions & 5 deletions src/pypromice/process/get_l2tol3.py
@@ -12,6 +12,8 @@ def parse_arguments_l2tol3(debug_args=None):
parser = ArgumentParser(description="AWS L3 script for the processing L3 "+
"data from L2. An hourly, daily and monthly L3 "+
"data product is outputted to the defined output path")
parser.add_argument('-c', '--config_folder', type=str, required=True,
help='Path to folder with sites configuration (TOML) files')
parser.add_argument('-i', '--inpath', type=str, required=True,
help='Path to Level 2 .nc data file')
parser.add_argument('-o', '--outpath', default=None, type=str, required=False,
@@ -24,7 +26,7 @@ def parse_arguments_l2tol3(debug_args=None):
args = parser.parse_args(args=debug_args)
return args

def get_l2tol3(inpath, outpath, variables, metadata):
def get_l2tol3(config_folder, inpath, outpath, variables, metadata):
logging.basicConfig(
format="%(asctime)s; %(levelname)s; %(name)s; %(message)s",
level=logging.INFO,
@@ -38,15 +40,15 @@ def get_l2tol3(inpath, outpath, variables, metadata):
# Remove encoding attributes from NetCDF
for varname in l2.variables:
if l2[varname].encoding!={}:
l2[varname].encoding = {}
l2[varname].encoding = {}

if 'bedrock' in l2.attrs.keys():
l2.attrs['bedrock'] = l2.attrs['bedrock'] == 'True'
if 'number_of_booms' in l2.attrs.keys():
l2.attrs['number_of_booms'] = int(l2.attrs['number_of_booms'])

# Perform Level 3 processing
l3 = toL3(l2)
l3 = toL3(l2, config_folder)

# Write Level 3 dataset to file if output directory given
v = getVars(variables)
@@ -59,7 +61,7 @@ def get_l2tol3(inpath, outpath, variables, metadata):

def main():
args = parse_arguments_l2tol3()
_ = get_l2tol3(args.inpath, args.outpath, args.variables, args.metadata)
_ = get_l2tol3(args.config_folder, args.inpath, args.outpath, args.variables, args.metadata)

if __name__ == "__main__":
main()
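With config_folder now a required argument, a call to the updated entry point could look like the sketch below; every path is a placeholder and the resource-file names are assumptions rather than values from the PR.

```python
# Hypothetical usage of the updated get_l2tol3 signature; all paths below are placeholders.
from pypromice.process.get_l2tol3 import get_l2tol3

l3 = get_l2tol3(
    config_folder="metadata/station_configurations/",  # folder holding the per-site TOML files (assumed layout)
    inpath="level_2/KAN_U/KAN_U_hour.nc",              # Level 2 NetCDF produced upstream (placeholder)
    outpath="level_3/",                                # hourly/daily/monthly products are written here
    variables="resources/variables.csv",               # placeholder variable look-up table
    metadata="resources/metadata.csv",                 # placeholder file-attribute metadata
)
```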
2 changes: 1 addition & 1 deletion src/pypromice/process/join_l2.py
@@ -29,7 +29,7 @@ def loadArr(infile):
elif infile.split('.')[-1].lower() == 'nc':
with xr.open_dataset(infile) as ds:
ds.load()
# Remove encoding attributes from NetCDF
# Remove encoding attributes from NetCDF
for varname in ds.variables:
if ds[varname].encoding!={}:
ds[varname].encoding = {}
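The small indentation fix above touches the block that loads a NetCDF file and strips its encoding. The same pattern — load into memory inside a context manager so the file handle is released, then clear per-variable encoding so stale settings (dtype, _FillValue, chunking) from the source file cannot leak into later writes — is also introduced in join_l3.py below. A generic sketch of the pattern, not tied to this repository:

```python
# Generic sketch of the load-then-clear-encoding pattern used in the joiners.
import xarray as xr

def load_clean(path: str) -> xr.Dataset:
    # Load the data into memory so the underlying file can be closed ...
    with xr.open_dataset(path) as ds:
        ds.load()
    # ... then drop encoding inherited from the source NetCDF so it cannot
    # override compression/fill-value choices when the dataset is re-written.
    for name in ds.variables:
        if ds[name].encoding:
            ds[name].encoding = {}
    return ds
```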
55 changes: 35 additions & 20 deletions src/pypromice/process/join_l3.py
@@ -6,6 +6,11 @@
import numpy as np
import pandas as pd
import xarray as xr
logging.basicConfig(
format="%(asctime)s; %(levelname)s; %(name)s; %(message)s",
level=logging.INFO,
stream=sys.stdout,
)
logger = logging.getLogger(__name__)

def parse_arguments_joinl3(debug_args=None):
@@ -100,8 +105,20 @@ def readNead(infile):
# combining thermocouple and CS100 temperatures
ds['TA1'] = ds['TA1'].combine_first(ds['TA3'])
ds['TA2'] = ds['TA2'].combine_first(ds['TA4'])

ds=ds.rename(var_name)

standard_vars_to_drop = ["NR", "TA3", "TA4", "TA5", "NR_cor",
Contributor:
Are these variables related to the NEAD format or specifically for the files you are reading? 🤔
Consider creating a module dedicated for reading (and preprocessing) historical gc-net data.

Contributor:

Would it make sense to add them to skipped_variables in the station configuration file?

Member Author:

Technically, they are related to the files we are reading (here historical GC-Net).
But there are 31 historical GC-Net stations and I'd rather have one list here than 31 skipped_variables lists in the config files. Additionally, the only other NEAD files we might read at some point are the old glaciobasis stations (4-5 AWS). For those few glaciobasis stations I'd consider using the skipped_variables info.
Lastly, this standard_vars_to_drop list will be reduced very soon, as

"z_surf_1",   "z_surf_2", "z_surf_combined", 
            "depth_t_i_1", "depth_t_i_2", "depth_t_i_3", "depth_t_i_4", "depth_t_i_5", 
            "depth_t_i_6", "depth_t_i_7", "depth_t_i_8", "depth_t_i_9", "depth_t_i_10", "t_i_10m"

will shortly be derived for the GEUS stations, and therefore won't need to be skipped anymore in the historical files.

and, in the longer term, we could also calculate

"TA2m", "RH2m", "VW10m", "SZA", "SAA",

and thereafter remove them from the list of variables to skip in the historical files.

Once again, keeping this list in one single place (rather than multiple config files) makes every update easier.

Member:
I think we will look at making a smart solution for this in the future, but I think it works fine for now

"z_surf_1", "z_surf_2", "z_surf_combined",
"TA2m", "RH2m", "VW10m", "SZA", "SAA",
"depth_t_i_1", "depth_t_i_2", "depth_t_i_3", "depth_t_i_4", "depth_t_i_5",
"depth_t_i_6", "depth_t_i_7", "depth_t_i_8", "depth_t_i_9", "depth_t_i_10", "t_i_10m"
]
standard_vars_to_drop = standard_vars_to_drop + [v for v in list(ds.keys()) if v.endswith("_adj_flag")]

# Drop the variables if they are present in the dataset
ds = ds.drop_vars([var for var in standard_vars_to_drop if var in ds])

ds=ds.rename({'timestamp':'time'})
return ds
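On the skipped_variables discussion in the conversation above: the per-station mechanism the reviewers mention is what join_l3 applies further down (station_info.get("skipped_variables", [])), while the list here covers variables shared by all historical GC-Net files. A hedged sketch of the per-station approach, with the TOML layout and loading library as assumptions:

```python
# Sketch only: drop the variables a station's TOML configuration lists under
# "skipped_variables". The file layout and the use of tomli are assumptions.
import tomli  # use tomllib from the standard library on Python >= 3.11
import xarray as xr

def drop_skipped_variables(ds: xr.Dataset, config_path: str) -> xr.Dataset:
    with open(config_path, "rb") as f:
        station_info = tomli.load(f)
    to_drop = station_info.get("skipped_variables", [])
    # Ignore names that are not present in this particular dataset.
    return ds.drop_vars([v for v in to_drop if v in ds])
```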

@@ -116,7 +133,8 @@ def loadArr(infile, isNead):
ds = xr.Dataset.from_dataframe(df)

elif infile.split('.')[-1].lower() in 'nc':
ds = xr.open_dataset(infile)
with xr.open_dataset(infile) as ds:
ds.load()
# Remove encoding attributes from NetCDF
for varname in ds.variables:
if ds[varname].encoding!={}:
@@ -202,16 +220,23 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me

filepath = os.path.join(folder_l3, stid, stid+'_hour.nc')
isNead = False
if station_info["project"].lower() in ["historical gc-net", "glaciobasis"]:
if station_info["project"].lower() in ["historical gc-net"]:
filepath = os.path.join(folder_gcnet, stid+'.csv')
isNead = True
if not os.path.isfile(filepath):
logger.info(stid+' is from an project '+folder_l3+' or '+folder_gcnet)
if not os.path.isfile(filepath):
logger.info(stid+' was listed as station but could not be found in '+folder_l3+' nor '+folder_gcnet)
Contributor:
I anticipated that this would trigger an error and raise an exception. If the station list specifies data files, I would have expected those files to exist.

Member Author:
My intention was that if a historical station is listed but cannot be found, then the latest data (which can most likely be found because it was produced by pypromice) should still be loaded and written to an l3/sites file, so that people fetching the latest data are not affected if there is a mess-up with the historical files' path.

continue

l3, _ = loadArr(filepath, isNead)
l3, _ = loadArr(filepath, isNead)

# removing specific variable from a given file
specific_vars_to_drop = station_info.get("skipped_variables",[])
if len(specific_vars_to_drop)>0:
logger.info("Skipping %s from %s"%(specific_vars_to_drop, stid))
l3 = l3.drop_vars([var for var in specific_vars_to_drop if var in l3])

list_station_data.append((l3, station_info))

# Sort the list in reverse chronological order so that we start with the latest data
sorted_list_station_data = sorted(list_station_data, key=lambda x: x[0].time.max(), reverse=True)
sorted_stids = [info["stid"] for _, info in sorted_list_station_data]
@@ -246,19 +271,7 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me
for v in l3_merged.data_vars:
if v not in l3.data_vars:
l3[v] = l3.t_u*np.nan

# if l3 (older data) has variables that does not have l3_merged (newer data)
# then they are removed from l3
list_dropped = []
for v in l3.data_vars:
if v not in l3_merged.data_vars:
if v != 'z_stake':
list_dropped.append(v)
l3 = l3.drop(v)
else:
l3_merged[v] = ('time', l3_merged.t_u.data*np.nan)
logger.info('Unused variables in older dataset: '+' '.join(list_dropped))


# saving attributes of station under an attribute called $stid
st_attrs = l3_merged.attrs.get('stations_attributes', {})
st_attrs[stid] = l3.attrs.copy()
@@ -280,6 +293,8 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me


# Assign site id
if not l3_merged:
logger.error('No level 2 data file found for '+site)
l3_merged.attrs['site_id'] = site
l3_merged.attrs['stations'] = ' '.join(sorted_stids)
l3_merged.attrs['level'] = 'L3'
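Earlier in this file's diff, the per-station datasets are sorted in reverse chronological order ("so that we start with the latest data") before being folded into l3_merged, so the newest data is handled first and older stations only fill the remaining gaps. The merge itself is not fully visible in this diff; the sketch below shows one way such an order-then-merge step can be written, assuming combine_first semantics:

```python
# Simplified order-then-merge sketch; the real join_l3 logic is only partly shown in this diff.
import xarray as xr

def merge_newest_first(datasets: list) -> xr.Dataset:
    # Start from the dataset with the most recent timestamp ...
    datasets = sorted(datasets, key=lambda d: d.time.max().values, reverse=True)
    merged = datasets[0]
    # ... then let older datasets only fill gaps: combine_first never
    # overwrites values already present in the accumulated result.
    for older in datasets[1:]:
        merged = merged.combine_first(older)
    return merged
```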
109 changes: 83 additions & 26 deletions src/pypromice/process/write.py
@@ -48,7 +48,7 @@ def prepare_and_write(dataset, outpath, vars_df=None, meta_dict=None, time='60mi
if 'gps_lon' in d2.keys():
d2 = reformat_lon(d2)
else:
logger.info('%s does not have gpd_lon'%name)
logger.info('%s does not have gps_lon'%name)

# Add variable attributes and metadata
if vars_df is None:
@@ -63,7 +63,11 @@ def prepare_and_write(dataset, outpath, vars_df=None, meta_dict=None, time='60mi
d2 = roundValues(d2, vars_df)

# Get variable names to write out
col_names = getColNames(vars_df, d2, remove_nan_fields=True)
if 'site_id' in d2.attrs.keys():
remove_nan_fields = True
else:
remove_nan_fields = False
col_names = getColNames(vars_df, d2, remove_nan_fields=remove_nan_fields)

# Define filename based on resample rate
t = int(pd.Timedelta((d2['time'][1] - d2['time'][0]).values).total_seconds())
@@ -100,7 +104,6 @@ def prepare_and_write(dataset, outpath, vars_df=None, meta_dict=None, time='60mi
writeCSV(out_csv, d2, col_names)

# Write to netcdf file
col_names = col_names + ['lat', 'lon', 'alt']
writeNC(out_nc, d2, col_names)
logger.info(f'Written to {out_csv}')
logger.info(f'Written to {out_nc}')
Expand Down Expand Up @@ -245,16 +248,28 @@ def addMeta(ds, meta):
ds : xarray.Dataset
Dataset with metadata
'''
if 'gps_lon' in ds.keys():
ds['lon'] = ds['gps_lon'].mean()
ds['lon'].attrs = ds['gps_lon'].attrs

ds['lat'] = ds['gps_lat'].mean()
ds['lat'].attrs = ds['gps_lat'].attrs

ds['alt'] = ds['gps_alt'].mean()
ds['alt'].attrs = ds['gps_alt'].attrs

if 'lon' in ds.keys():
# calculating average coordinates based on the extra/interpolated coords
for v in ['lat','lon','alt']:
ds.attrs[v+'_avg'] = ds[v].mean().item()
# dropping the less accurate standard coordinates given in the
# raw or tx config files
for v in ['latitude','longitude']:
if v in ds.attrs.keys():
del ds.attrs[v]
elif 'gps_lon' in ds.keys():
# calculating average coordinates based on the measured coords (can be gappy)
for v in ['gps_lat','gps_lon','gps_alt']:
if v in ds.keys():
ds.attrs[v+'_avg'] = ds[v].mean().item()
else:
ds.attrs[v+'_avg'] = np.nan
# dropping the less accurate standard coordinates given in the
# raw or tx config files
for v in ['latitude','longitude']:
if v in ds.attrs.keys():
del ds.attrs[v]

# Attribute convention for data discovery
# https://wiki.esipfed.org/Attribute_Convention_for_Data_Discovery_1-3

@@ -283,19 +298,61 @@ def addMeta(ds, meta):
ds.attrs['date_metadata_modified'] = ds.attrs['date_created']
ds.attrs['processing_level'] = ds.attrs['level'].replace('L','level ')


if 'lat' in ds.keys():
lat_min = ds['lat'].min().values
lat_max = ds['lat'].max().values
elif 'gps_lat' in ds.keys():
lat_min = ds['gps_lat'].min().values
lat_max = ds['gps_lat'].max().values
elif 'latitude' in ds.attrs.keys():
lat_min = ds.attrs['latitude']
lat_max = ds.attrs['latitude']
else:
lat_min =np.nan
lat_max = np.nan


if 'lon' in ds.keys():
lon_min = ds['lon'].min().values
lon_max = ds['lon'].max().values
elif 'gps_lon' in ds.keys():
lon_min = ds['gps_lon'].min().values
lon_max = ds['gps_lon'].max().values
elif 'longitude' in ds.attrs.keys():
lon_min = ds.attrs['longitude']
lon_max = ds.attrs['longitude']
else:
lon_min =np.nan
lon_max = np.nan

if 'alt' in ds.keys():
alt_min = ds['alt'].min().values
alt_max = ds['alt'].max().values
elif 'gps_alt' in ds.keys():
alt_min = ds['gps_alt'].min().values
alt_max = ds['gps_alt'].max().values
elif 'altitude' in ds.attrs.keys():
alt_min = ds.attrs['altitude']
alt_max = ds.attrs['altitude']
else:
alt_min =np.nan
alt_max = np.nan

ds.attrs['geospatial_bounds'] = "POLYGON((" + \
f"{ds['lat'].min().values} {ds['lon'].min().values}, " + \
f"{ds['lat'].min().values} {ds['lon'].max().values}, " + \
f"{ds['lat'].max().values} {ds['lon'].max().values}, " + \
f"{ds['lat'].max().values} {ds['lon'].min().values}, " + \
f"{ds['lat'].min().values} {ds['lon'].min().values}))"

ds.attrs['geospatial_lat_min'] = str(ds['lat'].min().values)
ds.attrs['geospatial_lat_max'] = str(ds['lat'].max().values)
ds.attrs['geospatial_lon_min'] = str(ds['lon'].min().values)
ds.attrs['geospatial_lon_max'] = str(ds['lon'].max().values)
ds.attrs['geospatial_vertical_min'] = str(ds['alt'].min().values)
ds.attrs['geospatial_vertical_max'] = str(ds['alt'].max().values)
f"{lat_min} {lon_min}, " + \
f"{lat_min} {lon_max}, " + \
f"{lat_max} {lon_max}, " + \
f"{lat_max} {lon_min}, " + \
f"{lat_min} {lon_min}))"

ds.attrs['geospatial_lat_min'] = str(lat_min)
ds.attrs['geospatial_lat_max'] = str(lat_max)
ds.attrs['geospatial_lon_min'] = str(lon_min)
ds.attrs['geospatial_lon_max'] = str(lon_max)
ds.attrs['geospatial_vertical_min'] = str(alt_min)
ds.attrs['geospatial_vertical_max'] = str(alt_max)

ds.attrs['geospatial_vertical_positive'] = 'up'
ds.attrs['time_coverage_start'] = str(ds['time'][0].values)
ds.attrs['time_coverage_end'] = str(ds['time'][-1].values)
Expand Down Expand Up @@ -387,4 +444,4 @@ def reformat_lon(dataset, exempt=['UWN', 'Roof_GEUS', 'Roof_PROMICE']):
if 'gps_lon' not in dataset.keys():
return dataset
dataset['gps_lon'] = dataset['gps_lon'] * -1
return dataset
return dataset
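The new addMeta logic repeats the same three-level fallback (smoothed lat/lon/alt variable, then the raw gps_* record, then the static attribute from the config, else NaN) for latitude, longitude and altitude when building the geospatial attributes. A possible refactoring sketch, offered purely as an illustration of that fallback order and not as code from the PR:

```python
# Illustration of the fallback order used for the geospatial_* attributes; not PR code.
import numpy as np
import xarray as xr

def coord_range(ds: xr.Dataset, var: str, gps_var: str, attr: str):
    # Prefer the smoothed/extrapolated coordinate, then the raw GPS record,
    # then the static value from the station configuration, else NaN.
    if var in ds:
        return ds[var].min().values, ds[var].max().values
    if gps_var in ds:
        return ds[gps_var].min().values, ds[gps_var].max().values
    if attr in ds.attrs:
        return ds.attrs[attr], ds.attrs[attr]
    return np.nan, np.nan

# e.g. lat_min, lat_max = coord_range(ds, 'lat', 'gps_lat', 'latitude')
```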
11 changes: 6 additions & 5 deletions src/pypromice/qc/github_data_issues.py
@@ -65,10 +65,10 @@ def flagNAN(ds_in,

for v in varlist:
if v in list(ds.keys()):
logger.info(f'---> flagging {t0} {t1} {v}')
logger.debug(f'---> flagging {t0} {t1} {v}')
ds[v] = ds[v].where((ds['time'] < t0) | (ds['time'] > t1))
else:
logger.info(f'---> could not flag {v} not in dataset')
logger.debug(f'---> could not flag {v} not in dataset')

return ds

@@ -206,13 +206,14 @@ def adjustData(ds,
t1 = pd.to_datetime(t1, utc=True).tz_localize(None)

index_slice = dict(time=slice(t0, t1))

if len(ds_out[var].loc[index_slice].time.time) == 0:
logger.info(f'---> {t0} {t1} {var} {func} {val}')
logger.info("Time range does not intersect with dataset")
continue

logger.info(f'---> {t0} {t1} {var} {func} {val}')

else:
logger.debug(f'---> {t0} {t1} {var} {func} {val}')

if func == "add":
ds_out[var].loc[index_slice] = ds_out[var].loc[index_slice].values + val
# flagging adjusted values
4 changes: 2 additions & 2 deletions src/pypromice/qc/persistence.py
@@ -83,7 +83,7 @@ def persistence_qc(
mask = mask & (df[v]<99)
n_masked = mask.sum()
n_samples = len(mask)
logger.info(
logger.debug(
f"Applying persistent QC in {v}. Filtering {n_masked}/{n_samples} samples"
)
# setting outliers to NaN
@@ -96,7 +96,7 @@

n_masked = mask.sum()
n_samples = len(mask)
logger.info(
logger.debug(
f"Applying persistent QC in {v}. Filtering {n_masked}/{n_samples} samples"
)
# setting outliers to NaN
6 changes: 3 additions & 3 deletions src/pypromice/resources/variable_aliases_GC-Net.csv
@@ -47,9 +47,9 @@ t_i_11,
tilt_x,
tilt_y,
rot,
gps_lat,latitude
gps_lon,longitude
gps_alt,elevation
lat,latitude
lon,longitude
alt,elevation
gps_time,
gps_geounit,
gps_hdop,