From 777c5e846d42ad6550aa26c35b13b4e741348d95 Mon Sep 17 00:00:00 2001 From: Baptiste Vandecrux <35140661+BaptisteVandecrux@users.noreply.github.com> Date: Thu, 15 Aug 2024 22:30:53 +0200 Subject: [PATCH] Feature/surface heights and thermistor depths (#278) * processes surface heights variables: `z_surf_combined`, `z_ice_surf`, `snow_height`, and thermistors' depths: `d_t_i_1-11` * `variable.csv` was updated accordingly * some clean-up of turbulent fluxes calculation, including renaming functions * handling empty station configuration files and making errors understandable * updated join_l3 so that surface height and thermistor depths in historical data are no longer ignored and to adjust the surface height between the merged datasets * calculated either from `gps_lat, gps_lon, gps_alt` or `lat, lon, alt`, static values called `latitude`, `longitude` and `altitude` are saved as attributes along with `latitude_origin`, `longitude_origin` and `altitude_origin` which states whether they come from gappy observations `gps_lat, gps_lon, gps_alt` or from gap-filled postprocess `lat, lon, alt` * changed "H" to "h" in pandas and added ".iloc" when necessary to remove deprecation warnings * made `make_metadata_csv.py` to update latest location in `aws-l3/AWS_station_metadata.csv` and `aws-l3/AWS_sites_metadata.csv` --------- Co-authored-by: Penny How * L2toL3 test added (#282) * 3.8 and 3.9 tests removed, tests only for 3.10 and 3.11 * echo syntax changed * updated input file paths --------- --- .github/workflows/process_test.yml | 16 +- setup.py | 1 + .../postprocess/create_bufr_files.py | 2 +- .../postprocess/make_metadata_csv.py | 214 ++++ .../postprocess/real_time_utilities.py | 4 +- src/pypromice/process/L2toL3.py | 1010 +++++++++++++++-- src/pypromice/process/get_l2tol3.py | 17 +- src/pypromice/process/join_l3.py | 152 ++- src/pypromice/process/write.py | 71 +- src/pypromice/qc/persistence.py | 9 +- src/pypromice/resources/file_attributes.csv | 111 +- .../resources/variable_aliases_GC-Net.csv | 22 +- src/pypromice/resources/variables.csv | 29 +- tests/data/test_config1.toml | 1 + tests/data/test_config2.toml | 1 + tests/unit/qc/test_persistence.py | 28 +- 16 files changed, 1415 insertions(+), 273 deletions(-) create mode 100644 src/pypromice/postprocess/make_metadata_csv.py diff --git a/.github/workflows/process_test.yml b/.github/workflows/process_test.yml index 049b7117..e5bf3362 100644 --- a/.github/workflows/process_test.yml +++ b/.github/workflows/process_test.yml @@ -41,23 +41,15 @@ jobs: for i in $(echo ${{ env.TEST_STATION }} | tr ' ' '\n'); do python3 $GITHUB_WORKSPACE/main/src/pypromice/process/get_l2.py -c $GITHUB_WORKSPACE/aws-l0/tx/config/$i.toml -i $GITHUB_WORKSPACE/aws-l0/tx -o $GITHUB_WORKSPACE/out/L0toL2/ done -# - name: Run L0 to L2 processing -# env: -# TEST_STATION: KAN_U HUM -# shell: bash -# run: | -# mkdir $GITHUB_WORKSPACE/out/L2toL3/ -# for i in $(echo ${{ env.TEST_STATION }} | tr ' ' '\n'); do -# python3 $GITHUB_WORKSPACE/main/src/pypromice/process/get_l2tol3.py -i $GITHUB_WORKSPACE/out/L0toL2/$i/$i_hour.nc -o $GITHUB_WORKSPACE/out/L2toL3 -t 60min -# done - - name: Run L0 to L3 processing + - name: Run L2 to L3 processing env: TEST_STATION: KAN_U HUM shell: bash run: | - mkdir $GITHUB_WORKSPACE/out/L0toL3/ + mkdir $GITHUB_WORKSPACE/out/L2toL3/ for i in $(echo ${{ env.TEST_STATION }} | tr ' ' '\n'); do - python3 $GITHUB_WORKSPACE/main/src/pypromice/process/get_l2.py -c $GITHUB_WORKSPACE/aws-l0/tx/config/$i.toml -i $GITHUB_WORKSPACE/aws-l0/tx -o $GITHUB_WORKSPACE/out/L2/ + echo ${i}_hour.nc + python3 $GITHUB_WORKSPACE/main/src/pypromice/process/get_l2tol3.py -c $GITHUB_WORKSPACE/aws-l0/metadata/station_configurations/ -i $GITHUB_WORKSPACE/out/L0toL2/${i}/${i}_hour.nc -o $GITHUB_WORKSPACE/out/L2toL3/ done - name: Upload test output uses: actions/upload-artifact@v3 diff --git a/setup.py b/setup.py index 08b72656..4f126237 100644 --- a/setup.py +++ b/setup.py @@ -45,6 +45,7 @@ 'join_l3 = pypromice.process.join_l3:main', 'get_l2 = pypromice.process.get_l2:main', 'get_l2tol3 = pypromice.process.get_l2tol3:main', + 'make_metadata_csv = pypromice.postprocess.make_metadata_csv:main', 'get_watsontx = pypromice.tx.get_watsontx:get_watsontx', 'get_bufr = pypromice.postprocess.get_bufr:main', 'create_bufr_files = pypromice.postprocess.create_bufr_files:main', diff --git a/src/pypromice/postprocess/create_bufr_files.py b/src/pypromice/postprocess/create_bufr_files.py index a6cb7842..33178deb 100644 --- a/src/pypromice/postprocess/create_bufr_files.py +++ b/src/pypromice/postprocess/create_bufr_files.py @@ -47,7 +47,7 @@ def create_bufr_files( Suffix for the compiled output file """ - periods = pd.date_range(period_start, period_end, freq="H") + periods = pd.date_range(period_start, period_end, freq="h") output_individual_root = output_root / "individual" output_compiled_root = output_root / "compiled" output_individual_root.mkdir(parents=True, exist_ok=True) diff --git a/src/pypromice/postprocess/make_metadata_csv.py b/src/pypromice/postprocess/make_metadata_csv.py new file mode 100644 index 00000000..c0004799 --- /dev/null +++ b/src/pypromice/postprocess/make_metadata_csv.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python +import os, sys, argparse +import pandas as pd +import xarray as xr +import logging + +logging.basicConfig( + format="%(asctime)s; %(levelname)s; %(name)s; %(message)s", + level=logging.INFO, + stream=sys.stdout, +) +logger = logging.getLogger(__name__) + +def extract_metadata_from_nc(file_path: str, data_type: str, label_s_id: str) -> pd.Series: + """ + Extract metadata from a NetCDF file and return it as a pandas Series. + + Parameters: + - file_path (str): The path to the NetCDF file. + - data_type (str): The type of data ('station' or 'site'). + - label_s_id (str): The label for the station or site ID. + + Returns: + - pd.Series: A pandas Series containing the extracted metadata. + """ + try: + with xr.open_dataset(file_path) as nc_file: + # Extract attributes + s_id = nc_file.attrs.get(label_s_id, 'N/A') + location_type = nc_file.attrs.get('location_type', 'N/A') + project = nc_file.attrs.get('project', 'N/A') + if data_type == 'site': + stations = nc_file.attrs.get('stations', s_id) + if data_type == 'station': + number_of_booms = nc_file.attrs.get('number_of_booms', 'N/A') + + # Extract the time variable as datetime64 + time_var = nc_file['time'].values.astype('datetime64[s]') + + # Extract the first and last timestamps + date_installation_str = pd.Timestamp(time_var[0]).strftime('%Y-%m-%d') + last_valid_date_str = pd.Timestamp(time_var[-1]).strftime('%Y-%m-%d') + + # Extract the first and last values of lat, lon, and alt + lat_installation = nc_file['lat'].isel(time=0).values.item() + lon_installation = nc_file['lon'].isel(time=0).values.item() + alt_installation = nc_file['alt'].isel(time=0).values.item() + + lat_last_known = nc_file['lat'].isel(time=-1).values.item() + lon_last_known = nc_file['lon'].isel(time=-1).values.item() + alt_last_known = nc_file['alt'].isel(time=-1).values.item() + + # Create a pandas Series for the metadata + if data_type == 'site': + row = pd.Series({ + 'project': project.replace('\r',''), + 'location_type': location_type, + 'stations': stations, + 'date_installation': date_installation_str, + 'latitude_installation': lat_installation, + 'longitude_installation': lon_installation, + 'altitude_installation': alt_installation, + 'date_last_valid': last_valid_date_str, + 'latitude_last_valid': lat_last_known, + 'longitude_last_valid': lon_last_known, + 'altitude_last_valid': alt_last_known + }, name=s_id) + else: + row = pd.Series({ + 'project': project.replace('\r',''), + 'number_of_booms': number_of_booms, + 'location_type': location_type, + 'date_installation': date_installation_str, + 'latitude_installation': lat_installation, + 'longitude_installation': lon_installation, + 'altitude_installation': alt_installation, + 'date_last_valid': last_valid_date_str, + 'latitude_last_valid': lat_last_known, + 'longitude_last_valid': lon_last_known, + 'altitude_last_valid': alt_last_known + }, name=s_id) + return row + except Exception as e: + logger.info(f"Warning: Error processing {file_path}: {str(e)}") + return pd.Series() # Return an empty Series in case of an error + +def process_files(base_dir: str, csv_file_path: str, data_type: str) -> pd.DataFrame: + """ + Process all files in the base directory to generate new metadata. + + Parameters: + - base_dir (str): The base directory containing the NetCDF files. + - csv_file_path (str): The path to the existing metadata CSV file. + - data_type (str): The type of data ('station' or 'site'). + + Returns: + - pd.DataFrame: The combined metadata DataFrame. + """ + label_s_id = 'station_id' if data_type == 'station' else 'site_id' + + # Initialize a list to hold the rows (Series) of DataFrame + rows = [] + + # Read existing metadata if the CSV file exists + if os.path.exists(csv_file_path) and os.path.getsize(csv_file_path) > 0: + logger.info("Updating " + str(csv_file_path)) + existing_metadata_df = pd.read_csv(csv_file_path, index_col=label_s_id) + else: + logger.info("Creating " + str(csv_file_path)) + existing_metadata_df = pd.DataFrame() + + # Track updated sites or stations to avoid duplicate updates + updated_s = [] + new_s = [] + + # Traverse through all the subfolders and files in the base directory + for subdir, _, files in os.walk(base_dir): + for file in files: + if file.endswith('_hour.nc'): + file_path = os.path.join(subdir, file) + row = extract_metadata_from_nc(file_path, data_type, label_s_id) + if not row.empty: + s_id = row.name + if s_id in existing_metadata_df.index: + # Compare with existing metadata + existing_row = existing_metadata_df.loc[s_id] + old_date_installation = existing_row['date_installation'] + old_last_valid_date = existing_row['date_last_valid'] + + # Update the existing metadata + existing_metadata_df.loc[s_id] = row + + # Print message if dates are updated + if old_last_valid_date != row['date_last_valid']: + logger.info(f"Updated {label_s_id}: {s_id} date_last_valid: {old_last_valid_date} --> {row['date_last_valid']}") + + updated_s.append(s_id) + else: + new_s.append(s_id) + # Append new metadata row to the list + rows.append(row) + + # Convert the list of rows to a DataFrame + new_metadata_df = pd.DataFrame(rows) + + # Concatenate the existing metadata with the new metadata + combined_metadata_df = pd.concat([existing_metadata_df, new_metadata_df], ignore_index=False) + + # Exclude some sites + sites_to_exclude = [s for s in ['XXX', 'Roof_GEUS', 'Roof_PROMICE'] if s in combined_metadata_df.index] + excluded_metadata_df = combined_metadata_df.loc[sites_to_exclude].copy() + combined_metadata_df.drop(sites_to_exclude, inplace=True) + + # Sort the DataFrame by index (s_id) + combined_metadata_df.sort_index(inplace=True) + + # Print excluded lines + if not excluded_metadata_df.empty: + pd.set_option('display.max_columns', None) # Show all columns + pd.set_option('display.max_colwidth', None) # Show full width of columns + pd.set_option('display.width', None) # Disable line wrapping + logger.info("\nExcluded lines from combined metadata.csv:") + print(excluded_metadata_df) + + # Drop excluded lines from combined_metadata_df + combined_metadata_df.drop(sites_to_exclude, errors='ignore', inplace=True) + + # Save to csv + combined_metadata_df.to_csv(csv_file_path, index_label=label_s_id) + + return combined_metadata_df, existing_metadata_df, new_s, updated_s + +def compare_and_log_updates(combined_metadata_df: pd.DataFrame, existing_metadata_df: pd.DataFrame, new_s: list, updated_s: list): + """ + Compare the combined metadata with the existing metadata and log the updates. + + Parameters: + - combined_metadata_df (pd.DataFrame): The combined metadata DataFrame. + - existing_metadata_df (pd.DataFrame): The existing metadata DataFrame. + - new_s (list): List of new station/site IDs. + - updated_s (list): List of updated station/site IDs. + """ + # Determine which lines were not updated (reused) and which were added + if not existing_metadata_df.empty: + reused_s = [s_id for s_id in existing_metadata_df.index if ((s_id not in new_s) & (s_id not in updated_s))] + reused_lines = existing_metadata_df.loc[reused_s] + added_lines = combined_metadata_df.loc[combined_metadata_df.index.difference(existing_metadata_df.index)] + + logger.info("\nLines from the old metadata.csv that are reused (not updated):") + print(reused_lines) + + if not added_lines.empty: + logger.info("\nLines that were not present in the old metadata.csv and are added:") + print(added_lines) + else: + logger.info("\nAll lines are added (no old metadata.csv found)") + +def main(): + parser = argparse.ArgumentParser(description='Process station or site data.') + parser.add_argument('-t', '--type', choices=['station', 'site'], + required=True, + help='Type of data to process: "station" or "site"') + parser.add_argument('-r', '--root_dir', required=True, help='Root directory ' + + 'containing the aws-l3 station or site folder') + parser.add_argument('-m','--metadata_file', required=True, + help='File path to metadata csv file (existing or '+ + 'intended output path') + + args = parser.parse_args() + combined_metadata_df, existing_metadata_df, new_s, updated_s = process_files(args.root_dir, args.metadata_file, args.type) + compare_and_log_updates(combined_metadata_df, existing_metadata_df, new_s, updated_s) + +if __name__ == '__main__': + main() diff --git a/src/pypromice/postprocess/real_time_utilities.py b/src/pypromice/postprocess/real_time_utilities.py index f79f9ca0..dd058286 100644 --- a/src/pypromice/postprocess/real_time_utilities.py +++ b/src/pypromice/postprocess/real_time_utilities.py @@ -73,8 +73,8 @@ def get_latest_data( # Apply smoothing to z_boom_u # require at least 2 hourly obs? Sometimes seeing once/day data for z_boom_u - df_limited = rolling_window(df_limited, "z_boom_u", "72H", 2, 3) - + df_limited = rolling_window(df_limited, "z_boom_u", "72h", 2, 3) + # limit to single most recent valid row (convert to series) s_current = df_limited.loc[last_valid_index] diff --git a/src/pypromice/process/L2toL3.py b/src/pypromice/process/L2toL3.py index d78161fd..004cd615 100755 --- a/src/pypromice/process/L2toL3.py +++ b/src/pypromice/process/L2toL3.py @@ -5,10 +5,11 @@ import pandas as pd import numpy as np import xarray as xr -import toml, os from sklearn.linear_model import LinearRegression - +from pypromice.qc.github_data_issues import adjustData +from scipy.interpolate import interp1d import logging + logger = logging.getLogger(__name__) def toL3(L2, station_config={}, T_0=273.15): @@ -16,7 +17,9 @@ def toL3(L2, station_config={}, T_0=273.15): derived variables: - Turbulent fluxes - smoothed and inter/extrapolated GPS coordinates - + - continuous surface height, ice surface height, snow height + - thermistor depths + Parameters ---------- @@ -32,7 +35,7 @@ def toL3(L2, station_config={}, T_0=273.15): ds = L2 ds.attrs['level'] = 'L3' - T_100 = _getTempK(T_0) # Get steam point temperature as K + T_100 = T_0+100 # Get steam point temperature as K # Turbulent heat flux calculation if ('t_u' in ds.keys()) and \ @@ -43,7 +46,7 @@ def toL3(L2, station_config={}, T_0=273.15): p_h_u = ds['p_u'].copy() RH_cor_h_u = ds['rh_u_cor'].copy() - q_h_u = calcSpHumid(T_0, T_100, T_h_u, p_h_u, RH_cor_h_u) # Calculate specific humidity + q_h_u = calculate_specific_humidity(T_0, T_100, T_h_u, p_h_u, RH_cor_h_u) # Calculate specific humidity if ('wspd_u' in ds.keys()) and \ ('t_surf' in ds.keys()) and \ ('z_boom_u' in ds.keys()): @@ -53,7 +56,7 @@ def toL3(L2, station_config={}, T_0=273.15): z_T_u = ds['z_boom_u'].copy() - 0.1 # Get height of thermometer if not ds.attrs['bedrock']: - SHF_h_u, LHF_h_u= calcHeatFlux(T_0, T_h_u, Tsurf_h, WS_h_u, # Calculate latent and sensible heat fluxes + SHF_h_u, LHF_h_u= calculate_tubulent_heat_fluxes(T_0, T_h_u, Tsurf_h, WS_h_u, # Calculate latent and sensible heat fluxes z_WS_u, z_T_u, q_h_u, p_h_u) ds['dshf_u'] = (('time'), SHF_h_u.data) @@ -75,7 +78,7 @@ def toL3(L2, station_config={}, T_0=273.15): p_h_l = ds['p_l'].copy() RH_cor_h_l = ds['rh_l_cor'].copy() - q_h_l = calcSpHumid(T_0, T_100, T_h_l, p_h_l, RH_cor_h_l) # Calculate sp.humidity + q_h_l = calculate_specific_humidity(T_0, T_100, T_h_l, p_h_l, RH_cor_h_l) # Calculate sp.humidity if ('wspd_l' in ds.keys()) and \ ('t_surf' in ds.keys()) and \ @@ -84,7 +87,7 @@ def toL3(L2, station_config={}, T_0=273.15): z_T_l = ds['z_boom_l'].copy() - 0.1 # Get height of thermometer WS_h_l = ds['wspd_l'].copy() if not ds.attrs['bedrock']: - SHF_h_l, LHF_h_l= calcHeatFlux(T_0, T_h_l, Tsurf_h, WS_h_l, # Calculate latent and sensible heat fluxes + SHF_h_l, LHF_h_l= calculate_tubulent_heat_fluxes(T_0, T_h_l, Tsurf_h, WS_h_l, # Calculate latent and sensible heat fluxes z_WS_l, z_T_l, q_h_l, p_h_l) ds['dshf_l'] = (('time'), SHF_h_l.data) @@ -97,24 +100,857 @@ def toL3(L2, station_config={}, T_0=273.15): else: logger.info('t_l, p_l or rh_l_cor missing, cannot calulate tubrulent heat fluxes') - # Smoothing and inter/extrapolation of GPS coordinates + if len(station_config)==0: + logger.warning('\n***\nThe station configuration file is missing or improperly passed to pypromice. Some processing steps might fail.\n***\n') + + # Smoothing and inter/extrapolation of GPS coordinates for var in ['gps_lat', 'gps_lon', 'gps_alt']: - ds[var.replace('gps_','')] = ('time', gpsCoordinatePostprocessing(ds, var, station_config)) + ds[var.replace('gps_','')] = ('time', gps_coordinate_postprocessing(ds, var, station_config)) + + # processing continuous surface height, ice surface height, snow height + try: + ds = process_surface_height(ds, station_config) + except Exception as e: + logger.error("Error processing surface height at %s"%L2.attrs['station_id']) + logging.error(e, exc_info=True) + + # making sure dataset has the attributes contained in the config files + if 'project' in station_config.keys(): + ds.attrs['project'] = station_config['project'] + else: + logger.error('No project info in station_config. Using \"PROMICE\".') + ds.attrs['project'] = "PROMICE" + + if 'location_type' in station_config.keys(): + ds.attrs['location_type'] = station_config['location_type'] + else: + logger.error('No project info in station_config. Using \"ice sheet\".') + ds.attrs['location_type'] = "ice sheet" + + return ds + + +def process_surface_height(ds, station_config={}): + """ + Process surface height data for different site types and create + surface height variables. + + Parameters + ---------- + ds : xarray.Dataset + The dataset containing various measurements and attributes including + 'site_type' which determines the type of site (e.g., 'ablation', + 'accumulation', 'bedrock') and other relevant data variables such as + 'z_boom_u', 'z_stake', 'z_pt_cor', etc. - # adding average coordinate as attribute - for v in ['lat','lon','alt']: - ds.attrs[v+'_avg'] = ds[v].mean(dim='time').item() + Returns + ------- + xarray.Dataset + The dataset with additional processed surface height variables: + 'z_surf_1', 'z_surf_2', 'z_ice_surf', 'z_surf_combined', 'snow_height', + and possibly depth variables derived from temperature measurements. + """ + # Initialize surface height variables with NaNs + ds['z_surf_1'] = ('time', ds['z_boom_u'].data * np.nan) + ds['z_surf_2'] = ('time', ds['z_boom_u'].data * np.nan) + + if ds.attrs['site_type'] == 'ablation': + # Calculate surface heights for ablation sites + ds['z_surf_1'] = 2.6 - ds['z_boom_u'] + if ds.z_stake.notnull().any(): + first_valid_index = ds.time.where((ds.z_stake + ds.z_boom_u).notnull(), drop=True).data[0] + ds['z_surf_2'] = ds.z_surf_1.sel(time=first_valid_index) + ds.z_stake.sel(time=first_valid_index) - ds['z_stake'] + + # Use corrected point data if available + if 'z_pt_cor' in ds.data_vars: + ds['z_ice_surf'] = ('time', ds['z_pt_cor'].data) + + else: + # Calculate surface heights for other site types + first_valid_index = ds.time.where(ds.z_boom_u.notnull(), drop=True).data[0] + ds['z_surf_1'] = ds.z_boom_u.sel(time=first_valid_index) - ds['z_boom_u'] + if 'z_stake' in ds.data_vars and ds.z_stake.notnull().any(): + first_valid_index = ds.time.where(ds.z_stake.notnull(), drop=True).data[0] + ds['z_surf_2'] = ds.z_stake.sel(time=first_valid_index) - ds['z_stake'] + if 'z_boom_l' in ds.data_vars: + # need a combine first because KAN_U switches from having a z_stake + # to having a z_boom_l + first_valid_index = ds.time.where(ds.z_boom_l.notnull(), drop=True).data[0] + ds['z_surf_2'] = ds['z_surf_2'].combine_first( + ds.z_boom_l.sel(time=first_valid_index) - ds['z_boom_l']) + + # Adjust data for the created surface height variables + ds = adjustData(ds, var_list=['z_surf_1', 'z_surf_2', 'z_ice_surf']) + + # Convert to dataframe and combine surface height variables + df_in = ds[[v for v in ['z_surf_1', 'z_surf_2', 'z_ice_surf'] if v in ds.data_vars]].to_dataframe() + + (ds['z_surf_combined'], ds['z_ice_surf'], + ds['z_surf_1_adj'], ds['z_surf_2_adj']) = combine_surface_height(df_in, ds.attrs['site_type']) + + + if ds.attrs['site_type'] == 'ablation': + # Calculate rolling minimum for ice surface height and snow height + ts_interpolated = np.minimum( + xr.where(ds.z_ice_surf.notnull(), + ds.z_ice_surf,ds.z_surf_combined), + ds.z_surf_combined).to_series().resample('h').interpolate(limit=72) + + if len(ts_interpolated)>24*7: + # Apply the rolling window with median calculation + z_ice_surf = (ts_interpolated + .rolling('14D', center=True, min_periods=1) + .median()) + # Overprint the first and last 7 days with interpolated values + # because of edge effect of rolling windows + z_ice_surf.iloc[:24*7] = (ts_interpolated.iloc[:24*7] + .rolling('1D', center=True, min_periods=1) + .median().values) + z_ice_surf.iloc[-24*7:] = (ts_interpolated.iloc[-24*7:] + .rolling('1D', center=True, min_periods=1) + .median().values) + else: + z_ice_surf = (ts_interpolated + .rolling('1D', center=True, min_periods=1) + .median()) + + z_ice_surf = z_ice_surf.loc[ds.time] + # here we make sure that the periods where both z_stake and z_pt are + # missing are also missing in z_ice_surf + msk = ds['z_ice_surf'].notnull() | ds['z_surf_2_adj'].notnull() + z_ice_surf = z_ice_surf.where(msk) + + # taking running minimum to get ice + z_ice_surf = z_ice_surf.cummin() + + # filling gaps only if they are less than a year long and if values on both + # sides are less than 0.01 m appart + + # Forward and backward fill to identify bounds of gaps + df_filled = z_ice_surf.fillna(method='ffill').fillna(method='bfill') + + # Identify gaps and their start and end dates + gaps = pd.DataFrame(index=z_ice_surf[z_ice_surf.isna()].index) + gaps['prev_value'] = df_filled.shift(1) + gaps['next_value'] = df_filled.shift(-1) + gaps['gap_start'] = gaps.index.to_series().shift(1) + gaps['gap_end'] = gaps.index.to_series().shift(-1) + gaps['gap_duration'] = (gaps['gap_end'] - gaps['gap_start']).dt.days + gaps['value_diff'] = (gaps['next_value'] - gaps['prev_value']).abs() + + # Determine which gaps to fill + mask = (gaps['gap_duration'] < 365) & (gaps['value_diff'] < 0.01) + gaps_to_fill = gaps[mask].index + + # Fill gaps in the original Series + z_ice_surf.loc[gaps_to_fill] = df_filled.loc[gaps_to_fill] + + # bringing the variable into the dataset + ds['z_ice_surf'] = z_ice_surf + + ds['z_surf_combined'] = np.maximum(ds['z_surf_combined'], ds['z_ice_surf']) + ds['snow_height'] = np.maximum(0, ds['z_surf_combined'] - ds['z_ice_surf']) + elif ds.attrs['site_type'] in ['accumulation', 'bedrock']: + # Handle accumulation and bedrock site types + ds['z_ice_surf'] = ('time', ds['z_surf_1'].data * np.nan) + ds['snow_height'] = ds['z_surf_combined'] + else: + # Log info for other site types + logger.info('other site type') + + if ds.attrs['site_type'] != 'bedrock': + # Process ice temperature data and create depth variables + ice_temp_vars = [v for v in ds.data_vars if 't_i_' in v] + vars_out = [v.replace('t', 'd_t') for v in ice_temp_vars] + vars_out.append('t_i_10m') + df_out = get_thermistor_depth( + ds[ice_temp_vars + ['z_surf_combined']].to_dataframe(), + ds.attrs['station_id'], + station_config) + for var in df_out.columns: + ds[var] = ('time', df_out[var].values) + return ds -def gpsCoordinatePostprocessing(ds, var, station_config={}): +def combine_surface_height(df, site_type, threshold_ablation = -0.0002): + '''Combines the data from three sensor: the two sonic rangers and the + pressure transducer, to recreate the surface height, the ice surface height + and the snow depth through the years. For the accumulation sites, it is + only the average of the two sonic rangers (after manual adjustments to + correct maintenance shifts). For the ablation sites, first an ablation + period is estimated each year (either the period when z_pt_cor decreases + or JJA if no better estimate) then different adjustmnents are conducted + to stitch the three time series together: z_ice_surface (adjusted from + z_pt_cor) or if unvailable, z_surf_2 (adjusted from z_stake) + are used in the ablation period while an average of z_surf_1 and z_surf_2 + are used otherwise, after they are being adjusted to z_ice_surf at the end + of the ablation season. + + Parameters + ---------- + df : pandas.dataframe + Dataframe with datetime index and variables z_surf_1, z_surf_2 and z_ice_surf + site_type : str + Either 'accumulation' or 'ablation' + threshold_ablation : float + Threshold to which a z_pt_cor hourly decrease is compared. If the decrease + is higher, then there is ablation. + ''' + logger.info('Combining surface height') + + if 'z_surf_2' not in df.columns: + logger.info('-> did not find z_surf_2') + df["z_surf_2"] = df["z_surf_1"].values*np.nan + + if 'z_ice_surf' not in df.columns: + logger.info('-> did not find z_ice_surf') + df["z_ice_surf"] = df["z_surf_1"].values*np.nan + + if site_type in ['accumulation', 'bedrock']: + logger.info('-> no z_pt or accumulation site: averaging z_surf_1 and z_surf_2') + df["z_surf_1_adj"] = hampel(df["z_surf_1"].interpolate(limit=72)).values + df["z_surf_2_adj"] = hampel(df["z_surf_2"].interpolate(limit=72)).values + # adjusting z_surf_2 to z_surf_1 + df["z_surf_2_adj"] = df["z_surf_2_adj"] + (df["z_surf_1_adj"]- df["z_surf_2_adj"]).mean() + # z_surf_combined is the average of the two z_surf + if df.z_surf_1_adj.notnull().any() & df.z_surf_2_adj.notnull().any(): + df['z_surf_combined'] = df[['z_surf_1_adj', 'z_surf_2_adj']].mean(axis = 1).values + elif df.z_surf_1_adj.notnull().any(): + df['z_surf_combined'] = df.z_surf_1_adj.values + elif df.z_surf_2_adj.notnull().any(): + df['z_surf_combined'] = df.z_surf_2_adj.values + + # df["z_surf_combined"] = hampel(df["z_surf_combined"].interpolate(limit=72)).values + return (df['z_surf_combined'], df["z_surf_combined"]*np.nan, + df["z_surf_1_adj"], df["z_surf_2_adj"]) + + else: + logger.info('-> ablation site') + # smoothing and filtering pressure transducer data + df["z_ice_surf_adj"] = hampel(df["z_ice_surf"].interpolate(limit=72)).values + df["z_surf_1_adj"] = hampel(df["z_surf_1"].interpolate(limit=72)).values + df["z_surf_2_adj"] = hampel(df["z_surf_2"].interpolate(limit=72)).values + + df["z_surf_1_adj"] = hampel(df["z_surf_1"].interpolate(limit=72), k=24, t0=5).values + df["z_surf_2_adj"] = hampel(df["z_surf_2"].interpolate(limit=72), k=24, t0=5).values + + # defining ice ablation period from the decrease of a smoothed version of z_pt + # meaning when smoothed_z_pt.diff() < threshold_ablation + # first smoothing + smoothed_PT = (df['z_ice_surf'] + .resample('h') + .interpolate(limit=72) + .rolling('14D',center=True, min_periods=1) + .mean()) + # second smoothing + smoothed_PT = smoothed_PT.rolling('14D', center=True, min_periods=1).mean() + + smoothed_PT = smoothed_PT.reindex(df.index,method='ffill') + # smoothed_PT.loc[df.z_ice_surf.isnull()] = np.nan + + # logical index where ablation is detected + ind_ablation = np.logical_and(smoothed_PT.diff().values < threshold_ablation, + np.isin(smoothed_PT.diff().index.month, [6, 7, 8, 9])) + + + # finding the beginning and end of each period with True + idx = np.argwhere(np.diff(np.r_[False,ind_ablation, False])).reshape(-1, 2) + idx[:, 1] -= 1 + + # fill small gaps in the ice ablation periods. + for i in range(len(idx)-1): + ind = idx[i] + ind_next = idx[i+1] + # if the end of an ablation period is less than 60 days away from + # the next ablation, then it is still considered like the same ablation + # season + if df.index[ind_next[0]]-df.index[ind[1]]= period_start) & (df.index < period_end) + ind_ablation[exclusion_period] = False + + hs1=df["z_surf_1_adj"].interpolate(limit=24*2).copy() + hs2=df["z_surf_2_adj"].interpolate(limit=24*2).copy() + z=df["z_ice_surf_adj"].interpolate(limit=24*2).copy() + + # the surface heights are adjusted so that they start at 0 + + + if any(~np.isnan(hs2.iloc[:24*7])): + hs2 = hs2 - hs2.iloc[:24*7].mean() + + if any(~np.isnan(hs2.iloc[:24*7])) & any(~np.isnan(hs1.iloc[:24*7])): + hs2 = hs2 + hs1.iloc[:24*7].mean() - hs2.iloc[:24*7].mean() + + if any(~np.isnan(z.iloc[:24*7])): + # expressing ice surface height relative to its mean value in the + # first week of the record + z = z - z.iloc[:24*7].mean() + elif z.notnull().any(): + # if there is no data in the first week but that there are some + # PT data afterwards + if ((z.first_valid_index() - hs1.first_valid_index()) < pd.to_timedelta('251D')) &\ + ((z.first_valid_index() - hs1.first_valid_index()) > pd.to_timedelta('0H')): + # if the pressure transducer is installed the year after then + # we use the mean surface height 1 on its first week as a 0 + # for the ice height + z = z - z.loc[ + z.first_valid_index():(z.first_valid_index()+pd.to_timedelta('14D')) + ].mean() + hs1.iloc[:24*7].mean() + else: + # if there is more than a year (actually 251 days) between the + # initiation of the AWS and the installation of the pressure transducer + # we remove the intercept in the pressure transducer data. + # Removing the intercept + # means that we consider the ice surface height at 0 when the AWS + # is installed, and not when the pressure transducer is installed. + Y = z.iloc[:].values.reshape(-1, 1) + X = z.iloc[~np.isnan(Y)].index.astype(np.int64).values.reshape(-1, 1) + Y = Y[~np.isnan(Y)] + linear_regressor = LinearRegression() + linear_regressor.fit(X, Y) + Y_pred = linear_regressor.predict(z.index.astype(np.int64).values.reshape(-1, 1) ) + z = z-Y_pred[0] + + years = df.index.year.unique().values + ind_start = years.copy() + ind_end = years.copy() + logger.debug('-> estimating ablation period for each year') + for i, y in enumerate(years): + # for each year + ind_yr = df.index.year.values==y + ind_abl_yr = np.logical_and(ind_yr, ind_ablation) + + if df.loc[ + np.logical_and(ind_yr, df.index.month.isin([6,7,8])), + "z_ice_surf_adj"].isnull().all(): + + ind_abl_yr = np.logical_and(ind_yr, df.index.month.isin([6,7,8])) + ind_ablation[ind_yr] = ind_abl_yr[ind_yr] + logger.debug(str(y)+' no z_ice_surf, just using JJA') + + else: + logger.debug(str(y)+ ' derived from z_ice_surf') + + if np.any(ind_abl_yr): + # if there are some ablation flagged for that year + # then find begining and end + ind_start[i] = np.argwhere(ind_abl_yr)[0][0] + ind_end[i] = np.argwhere(ind_abl_yr)[-1][0] + + else: + logger.debug(str(y) + ' could not estimate ablation season') + # otherwise left as nan + ind_start[i] = -999 + ind_end[i] = -999 + + # adjustement loop + missing_hs2 = 0 # if hs2 is missing then when it comes back it is adjusted to hs1 + hs2_ref = 0 # by default, the PT is the reference: hs1 and 2 will be adjusted to PT + # but if it is missing one year or one winter, then it needs to be rajusted + # to hs1 and hs2 the year after. + + for i, y in enumerate(years): + # if y == 2014: + # import pdb; pdb.set_trace() + logger.debug(str(y)) + # defining subsets of hs1, hs2, z + hs1_jja = hs1[str(y)+'-06-01':str(y)+'-09-01'] + hs2_jja = hs2[str(y)+'-06-01':str(y)+'-09-01'] + z_jja = z[str(y)+'-06-01':str(y)+'-09-01'] + + z_ablation = z.iloc[ind_start[i]:ind_end[i]] + hs2_ablation = hs2.iloc[ind_start[i]:ind_end[i]] + + hs1_year = hs1[str(y)] + hs2_year = hs2[str(y)] + + hs2_winter = hs2[str(y)+'-01-01':str(y)+'-03-01'].copy() + z_winter = z[str(y)+'-01-01':str(y)+'-03-01'].copy() + + z_year = z[str(y)] + if hs1_jja.isnull().all() and hs2_jja.isnull().all() and z_jja.isnull().all(): + # if there is no height for a year between June and September + # then the adjustment cannot be made automatically + # it needs to be specified manually on the adjustment files + # on https://github.com/GEUS-Glaciology-and-Climate/PROMICE-AWS-data-issues + continue + + if all(np.isnan(z_jja)) and any(~np.isnan(hs2_jja)): + # if there is no PT for a given year, but there is some hs2 + # then z will be adjusted to hs2 next time it is available + hs2_ref = 1 + + if all(np.isnan(z_winter)) and all(np.isnan(hs2_winter)): + # if there is no PT nor hs2 during the winter, then again + # we need to adjust z to match hs2 when ablation starts + hs2_ref = 1 + + # adjustment at the start of the ablation season + if hs2_ref: + # if hs2 has been taken as reference in the previous years + # then we check if pressure transducer is reinstalled and needs + # to be adjusted + if ind_start[i] != -999: + # the first year there is both ablation and PT data available + # then PT is adjusted to hs2 + if any(~np.isnan(z_ablation)) and any(~np.isnan(hs2_ablation)): + tmp1 = z_ablation.copy() + tmp2 = hs2_ablation.copy() + # tmp1[np.isnan(tmp2)] = np.nan + # tmp2[np.isnan(tmp1)] = np.nan + + # in some instances, the PT data is available but no ablation + # is recorded, then hs2 remains the reference during that time. + # When eventually there is ablation, then we need to find the + # first index in these preceding ablation-free years + # the shift will be applied back from this point + # first_index = z[:z[str(y)].first_valid_index()].isnull().iloc[::-1].idxmax() + # z[first_index:] = z[first_index:] - np.nanmean(tmp1) + np.nanmean(tmp2) + # hs2_ref = 0 # from now on PT is the reference + + # in some other instance, z just need to be adjusted to hs2 + # first_index = z[str(y)].first_valid_index() + first_index = z.iloc[ind_start[i]:].first_valid_index() # of ablation + if np.isnan(hs2[first_index]): + first_index_2 = hs2.iloc[ind_start[i]:].first_valid_index() + if (first_index_2 - first_index)>pd.Timedelta('30d'): + logger.debug('adjusting z to hs1') + if np.isnan(hs1[first_index]): + first_index = hs1.iloc[ind_start[i]:].first_valid_index() + z[first_index:] = z[first_index:] - z[first_index] + hs1[first_index] + else: + logger.debug('adjusting z to hs1') + first_index = hs2.iloc[ind_start[i]:].first_valid_index() + z[first_index:] = z[first_index:] - z[first_index] + hs2[first_index] + else: + logger.debug('adjusting z to hs1') + z[first_index:] = z[first_index:] - z[first_index] + hs2[first_index] + hs2_ref = 0 # from now on PT is the reference + + + else: + # if z_pt is the reference and there is some ablation + # then hs1 and hs2 are adjusted to z_pt + if (ind_start[i] != -999) & z_year.notnull().any(): + # calculating first index with PT, hs1 and hs2 + first_index = z_year.first_valid_index() + if hs1_year.notnull().any(): + first_index = np.max(np.array( + [first_index, + hs1_year.first_valid_index()])) + if hs2_year.notnull().any(): + first_index = np.max(np.array( + [first_index, + hs2_year.first_valid_index()])) + + # if PT, hs1 and hs2 are all nan until station is reactivated, then + first_day_of_year = pd.to_datetime(str(y)+'-01-01') + + if len(z[first_day_of_year:first_index-pd.to_timedelta('1D')])>0: + if z[first_day_of_year:first_index-pd.to_timedelta('1D')].isnull().all() & \ + hs1[first_day_of_year:first_index-pd.to_timedelta('1D')].isnull().all() & \ + hs2[first_day_of_year:first_index-pd.to_timedelta('1D')].isnull().all(): + if (~np.isnan(np.nanmean(z[first_index:first_index+pd.to_timedelta('1D')])) \ + and ~np.isnan(np.nanmean(hs2[first_index:first_index+pd.to_timedelta('1D')]))): + logger.debug(' ======= adjusting hs1 and hs2 to z_pt') + if ~np.isnan(np.nanmean(hs1[first_index:first_index+pd.to_timedelta('1D')]) ): + hs1[first_index:] = hs1[first_index:] \ + - np.nanmean(hs1[first_index:first_index+pd.to_timedelta('1D')]) \ + + np.nanmean(z[first_index:first_index+pd.to_timedelta('1D')]) + if ~np.isnan(np.nanmean(hs2[first_index:first_index+pd.to_timedelta('1D')]) ): + hs2[first_index:] = hs2[first_index:] \ + - np.nanmean(hs2[first_index:first_index+pd.to_timedelta('1D')]) \ + + np.nanmean(z[first_index:first_index+pd.to_timedelta('1D')]) + + # adjustment taking place at the end of the ablation period + if (ind_end[i] != -999): + # if y == 2023: + # import pdb; pdb.set_trace() + # if there's ablation and + # if there are PT data available at the end of the melt season + if z.iloc[(ind_end[i]-24*7):(ind_end[i]+24*7)].notnull().any(): + logger.debug('adjusting hs2 to z') + # then we adjust hs2 to the end-of-ablation z + # first trying at the end of melt season + if ~np.isnan(np.nanmean(hs2.iloc[(ind_end[i]-24*7):(ind_end[i]+24*30)])): + logger.debug('using end of melt season') + hs2.iloc[ind_end[i]:] = hs2.iloc[ind_end[i]:] - \ + np.nanmean(hs2.iloc[(ind_end[i]-24*7):(ind_end[i]+24*30)]) + \ + np.nanmean(z.iloc[(ind_end[i]-24*7):(ind_end[i]+24*30)]) + # if not possible, then trying the end of the following accumulation season + elif (i+1 < len(ind_start)): + if ind_start[i+1]!=-999 and any(~np.isnan(hs2.iloc[(ind_start[i+1]-24*7):(ind_start[i+1]+24*7)]+ z.iloc[(ind_start[i+1]-24*7):(ind_start[i+1]+24*7)])): + logger.debug('using end of accumulation season') + hs2.iloc[ind_end[i]:] = hs2.iloc[ind_end[i]:] - \ + np.nanmean(hs2.iloc[(ind_start[i+1]-24*7):(ind_start[i+1]+24*7)]) + \ + np.nanmean(z.iloc[(ind_start[i+1]-24*7):(ind_start[i+1]+24*7)]) + else: + logger.debug('no ablation') + hs1_following_winter = hs1[str(y)+'-09-01':str(y+1)+'-03-01'].copy() + hs2_following_winter = hs2[str(y)+'-09-01':str(y+1)+'-03-01'].copy() + if all(np.isnan(hs2_following_winter)): + logger.debug('no hs2') + missing_hs2 = 1 + elif missing_hs2 == 1: + logger.debug('adjusting hs2') + # and if there are some hs2 during the accumulation period + if any(~np.isnan(hs1_following_winter)): + logger.debug('to hs1') + # then we adjust hs1 to hs2 during the accumulation area + # adjustment is done so that the mean hs1 and mean hs2 match + # for the period when both are available + hs2_following_winter[np.isnan(hs1_following_winter)] = np.nan + hs1_following_winter[np.isnan(hs2_following_winter)] = np.nan + + hs2[str(y)+'-01-01':] = hs2[str(y)+'-01-01':] \ + - np.nanmean(hs2_following_winter) + np.nanmean(hs1_following_winter) + missing_hs2 = 0 + + + hs1_following_winter = hs1[str(y)+'-09-01':str(y+1)+'-03-01'].copy() + hs2_following_winter = hs2[str(y)+'-09-01':str(y+1)+'-03-01'].copy() + # adjusting hs1 to hs2 (no ablation case) + if any(~np.isnan(hs1_following_winter)): + logger.debug('adjusting hs1') + # and if there are some hs2 during the accumulation period + if any(~np.isnan(hs2_following_winter)): + logger.debug('to hs2') + # then we adjust hs1 to hs2 during the accumulation area + # adjustment is done so that the mean hs1 and mean hs2 match + # for the period when both are available + hs1_following_winter[np.isnan(hs2_following_winter)] = np.nan + hs2_following_winter[np.isnan(hs1_following_winter)] = np.nan + + hs1[str(y)+'-09-01':] = hs1[str(y)+'-09-01':] \ + - np.nanmean(hs1_following_winter) + np.nanmean(hs2_following_winter) + hs1_following_winter = hs1[str(y)+'-09-01':str(y+1)+'-03-01'].copy() + + if ind_end[i] != -999: + # if there is some hs1 + hs1_following_winter = hs1[str(y)+'-09-01':str(y+1)+'-03-01'].copy() + hs2_following_winter = hs2[str(y)+'-09-01':str(y+1)+'-03-01'].copy() + if any(~np.isnan(hs1_following_winter)): + logger.debug('adjusting hs1') + # and if there are some hs2 during the accumulation period + if any(~np.isnan(hs2_following_winter)): + logger.debug('to hs2, minimizing winter difference') + # then we adjust hs1 to hs2 during the accumulation area + # adjustment is done so that the mean hs1 and mean hs2 match + # for the period when both are available + tmp1 = hs1.iloc[ind_end[i]:min(len(hs1),ind_end[i]+24*30*9)].copy() + tmp2 = hs2.iloc[ind_end[i]:min(len(hs2),ind_end[i]+24*30*9)].copy() + + tmp1[np.isnan(tmp2)] = np.nan + tmp2[np.isnan(tmp1)] = np.nan + if tmp1.isnull().all(): + tmp1 = hs1_following_winter.copy() + tmp2 = hs2_following_winter.copy() + + tmp1[np.isnan(tmp2)] = np.nan + tmp2[np.isnan(tmp1)] = np.nan + hs1.iloc[ind_end[i]:] = hs1.iloc[ind_end[i]:] - np.nanmean(tmp1) + np.nanmean(tmp2) + + # if no hs2, then use PT data available at the end of the melt season + elif np.any(~np.isnan(z.iloc[(ind_end[i]-24*14):(ind_end[i]+24*7)])): + logger.debug('to z') + # then we adjust hs2 to the end-of-ablation z + # first trying at the end of melt season + if ~np.isnan(np.nanmean(hs1.iloc[(ind_end[i]-24*14):(ind_end[i]+24*30)])): + logger.debug('using end of melt season') + hs1.iloc[ind_end[i]:] = hs1.iloc[ind_end[i]:] - \ + np.nanmean(hs1.iloc[(ind_end[i]-24*14):(ind_end[i]+24*30)]) + \ + np.nanmean(z.iloc[(ind_end[i]-24*14):(ind_end[i]+24*30)]) + # if not possible, then trying the end of the following accumulation season + elif ind_start[i+1]!=-999 and any(~np.isnan(hs1.iloc[(ind_start[i+1]-24*14):(ind_start[i+1]+24*7)]+ z.iloc[(ind_start[i+1]-24*14):(ind_start[i+1]+24*7)])): + logger.debug('using end of accumulation season') + hs1.iloc[ind_end[i]:] = hs1.iloc[ind_end[i]:] - \ + np.nanmean(hs1.iloc[(ind_start[i+1]-24*14):(ind_start[i+1]+24*7)]) + \ + np.nanmean(z.iloc[(ind_start[i+1]-24*14):(ind_start[i+1]+24*7)]) + elif any(~np.isnan(hs2_year)): + logger.debug('to the last value of hs2') + # then we adjust hs1 to hs2 during the accumulation area + # adjustment is done so that the mean hs1 and mean hs2 match + # for the period when both are available + half_span = pd.to_timedelta('7D') + tmp1 = hs1_year.loc[(hs2_year.last_valid_index()-half_span):(hs2_year.last_valid_index()+half_span)].copy() + tmp2 = hs2_year.loc[(hs2_year.last_valid_index()-half_span):(hs2_year.last_valid_index()+half_span)].copy() + + hs1.iloc[ind_end[i]:] = hs1.iloc[ind_end[i]:] - np.nanmean(tmp1) + np.nanmean(tmp2) + + df["z_surf_1_adj"] = hs1.interpolate(limit=2*24).values + df["z_surf_2_adj"] = hs2.interpolate(limit=2*24).values + df["z_ice_surf_adj"] = z.interpolate(limit=2*24).values + + # making a summary of the surface height + df["z_surf_combined"] = np.nan + + # in winter, both SR1 and SR2 are used + df["z_surf_combined"] = df["z_surf_2_adj"].interpolate(limit=72).values + + + # in ablation season we use SR2 instead of the SR1&2 average + # here two options: + # 1) we ignore the SR1 and only use SR2 + # 2) we use SR1 when SR2 is not available (commented) + # the later one can cause jumps when SR2 starts to be available few days after SR1 + data_update = df[["z_surf_1_adj", "z_surf_2_adj"]].mean(axis=1).values + + ind_update = ~ind_ablation + #ind_update = np.logical_and(ind_ablation, ~np.isnan(data_update)) + df.loc[ind_update,"z_surf_combined"] = data_update[ind_update] + + # in ablation season we use pressure transducer over all other options + data_update = df[ "z_ice_surf_adj"].interpolate(limit=72).values + ind_update = np.logical_and(ind_ablation, ~np.isnan(data_update)) + df.loc[ind_update,"z_surf_combined"] = data_update[ind_update] + + logger.info('surface height combination finished') + return df['z_surf_combined'], df["z_ice_surf_adj"], df["z_surf_1_adj"], df["z_surf_2_adj"] + +def hampel(vals_orig, k=7*24, t0=15): + ''' + vals: pandas series of values from which to remove outliers + k: size of window (including the sample; 7 is equal to 3 on either side of value) + ''' + #Make copy so original not edited + vals=vals_orig.copy() + #Hampel Filter + L= 1.4826 + rolling_median=vals.rolling(k).median() + difference=np.abs(rolling_median-vals) + median_abs_deviation=difference.rolling(k).median() + threshold= t0 *L * median_abs_deviation + outlier_idx=difference>threshold + outlier_idx[0:round(k/2)]=False + vals.loc[outlier_idx]=np.nan + return(vals) + + +def get_thermistor_depth(df_in, site, station_config): + '''Calculates the depth of the thermistors through time based on their + installation depth (collected in a google sheet) and on the change of surface + height: instruments getting buried under new snow or surfacing due to ablation. + There is a potential for additional filtering of thermistor data for surfaced + (or just noisy) thermistors, but that is currently deactivated because slow. + + Parameters + ---------- + df_in : pandas:dataframe + dataframe containing the ice/firn temperature t_i_* as well as the + combined surface height z_surf_combined + site : str + stid, so that maintenance date and sensor installation depths can be found + in database + station_config : dict + potentially containing the key string_maintenance + with station_config["string_maintenance"] being a list of dictionaries + containing maintenance information in the format: + [ + {"date": "2007-08-20", "installation_depth": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 6.0]}, + {"date": "2008-07-17", "installation_depth": [1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 10.2]} + # Add more entries as needed + ] + ''' + + temp_cols_name = ['t_i_'+str(i) for i in range(12) if 't_i_'+str(i) in df_in.columns] + num_therm = len(temp_cols_name) + depth_cols_name = ['d_t_i_'+str(i) for i in range(1,num_therm+1)] + + if df_in['z_surf_combined'].isnull().all(): + logger.info('No valid surface height at '+site+', cannot calculate thermistor depth') + df_in[depth_cols_name + ['t_i_10m']] = np.nan + else: + logger.info('Calculating thermistor depth') + + # Convert maintenance_info to DataFrame for easier manipulation + maintenance_string = pd.DataFrame( + station_config.get("string_maintenance",[]), + columns = ['date', 'installation_depths'] + ) + maintenance_string["date"] = pd.to_datetime(maintenance_string["date"]) + maintenance_string = maintenance_string.sort_values(by='date', ascending=True) + + + if num_therm == 8: + ini_depth = [1, 2, 3, 4, 5, 6, 7, 10] + else: + ini_depth = [0, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5] + df_in[depth_cols_name] = np.nan + + # filtering the surface height + surface_height = df_in["z_surf_combined"].copy() + ind_filter = surface_height.rolling(window=14, center=True).var() > 0.1 + if any(ind_filter): + surface_height[ind_filter] = np.nan + df_in["z_surf_combined"] = surface_height.values + z_surf_interp = df_in["z_surf_combined"].interpolate() + + # first initialization of the depths + for i, col in enumerate(depth_cols_name): + df_in[col] = ( + ini_depth[i] + + z_surf_interp.values + - z_surf_interp[z_surf_interp.first_valid_index()] + ) + + # reseting depth at maintenance + if len(maintenance_string.date) == 0: + logger.info("No maintenance at "+site) + + for date in maintenance_string.date: + if date > z_surf_interp.last_valid_index(): + continue + new_depth = maintenance_string.loc[ + maintenance_string.date == date + ].installation_depths.values[0] + + for i, col in enumerate(depth_cols_name[:len(new_depth)]): + tmp = df_in[col].copy() + tmp.loc[date:] = ( + new_depth[i] + + z_surf_interp[date:].values + - z_surf_interp[date:][ + z_surf_interp[date:].first_valid_index() + ] + ) + df_in[col] = tmp.values + + # % Filtering thermistor data + for i in range(len(temp_cols_name)): + tmp = df_in[temp_cols_name[i]].copy() + + # variance filter + # ind_filter = ( + # df_in[temp_cols_name[i]] + # .interpolate(limit=14) + # .rolling(window=7) + # .var() + # > 0.5 + # ) + # month = ( + # df_in[temp_cols_name[i]].interpolate(limit=14).index.month.values + # ) + # ind_filter.loc[np.isin(month, [5, 6, 7])] = False + # if any(ind_filter): + # tmp.loc[ind_filter] = np.nan + + # before and after maintenance adaptation filter + if len(maintenance_string.date) > 0: + for date in maintenance_string.date: + if isinstance( + maintenance_string.loc[ + maintenance_string.date == date + ].installation_depths.values[0], + str, + ): + ind_adapt = np.abs( + tmp.interpolate(limit=14).index.values + - pd.to_datetime(date).to_datetime64() + ) < np.timedelta64(7, "D") + if any(ind_adapt): + tmp.loc[ind_adapt] = np.nan + + # surfaced thermistor + ind_pos = df_in[depth_cols_name[i]] < 0.1 + if any(ind_pos): + tmp.loc[ind_pos] = np.nan + + # copying the filtered values to the original table + df_in[temp_cols_name[i]] = tmp.values + + # removing negative depth + df_in.loc[df_in[depth_cols_name[i]]<0, depth_cols_name[i]] = np.nan + logger.info("interpolating 10 m firn/ice temperature") + df_in['t_i_10m'] = interpolate_temperature( + df_in.index.values, + df_in[depth_cols_name].values.astype(float), + df_in[temp_cols_name].values.astype(float), + kind="linear", + min_diff_to_depth=1.5, + ).set_index('date').values + + # filtering + ind_pos = df_in["t_i_10m"] > 0.1 + ind_low = df_in["t_i_10m"] < -70 + df_in.loc[ind_pos, "t_i_10m"] = np.nan + df_in.loc[ind_low, "t_i_10m"] = np.nan + + return df_in[depth_cols_name + ['t_i_10m']] + + +def interpolate_temperature(dates, depth_cor, temp, depth=10, min_diff_to_depth=2, + kind="quadratic"): + '''Calculates the depth of the thermistors through time based on their + installation depth (collected in a google sheet) and on the change of surface + height: instruments getting buried under new snow or surfacing due to ablation. + There is a potential for additional filtering of thermistor data for surfaced + (or just noisy) thermistors, but that is currently deactivated because slow. + + Parameters + ---------- + dates : numpy.array + array of datetime64 + depth_cor : numpy.ndarray + matrix of depths + temp : numpy.ndarray + matrix of temperatures + depth : float + constant depth at which (depth_cor, temp) should be interpolated. + min_diff_to_depth: float + maximum difference allowed between the available depht and the target depth + for the interpolation to be done. + kind : str + type of interpolation from scipy.interpolate.interp1d + ''' + + depth_cor = depth_cor.astype(float) + df_interp = pd.DataFrame() + df_interp["date"] = dates + df_interp["temperatureObserved"] = np.nan + + # preprocessing temperatures for small gaps + tmp = pd.DataFrame(temp) + tmp["time"] = dates + tmp = tmp.set_index("time") + # tmp = tmp.resample("H").mean() + # tmp = tmp.interpolate(limit=24*7) + temp = tmp.loc[dates].values + for i in (range(len(dates))): + x = depth_cor[i, :].astype(float) + y = temp[i, :].astype(float) + ind_no_nan = ~np.isnan(x + y) + x = x[ind_no_nan] + y = y[ind_no_nan] + x, indices = np.unique(x, return_index=True) + y = y[indices] + if len(x) < 2 or np.min(np.abs(x - depth)) > min_diff_to_depth: + continue + f = interp1d(x, y, kind, fill_value="extrapolate") + df_interp.iloc[i, 1] = np.min(f(depth), 0) + + if df_interp.iloc[:5, 1].std() > 0.1: + df_interp.iloc[:5, 1] = np.nan + + return df_interp + +def gps_coordinate_postprocessing(ds, var, station_config={}): # saving the static value of 'lat','lon' or 'alt' stored in attribute # as it might be the only coordinate available for certain stations (e.g. bedrock) var_out = var.replace('gps_','') - coord_names = {'alt':'altitude', 'lat':'latitude','lon':'longitude'} - - if var_out+'_avg' in list(ds.attrs.keys()): - static_value = float(ds.attrs[var_out+'_avg']) - elif coord_names[var_out] in list(ds.attrs.keys()): + coord_names = {'lat':'latitude','lon':'longitude', 'alt':'altitude'} + if coord_names[var_out] in list(ds.attrs.keys()): static_value = float(ds.attrs[coord_names[var_out]]) else: static_value = np.nan @@ -129,7 +965,7 @@ def gpsCoordinatePostprocessing(ds, var, station_config={}): print('no',var,'at',ds.attrs['station_id']) return np.ones_like(ds['t_u'].data)*static_value - # Extract station relocations from the TOML data + # Extract station relocations from the config dict station_relocations = station_config.get("station_relocation", []) # Convert the ISO8601 strings to pandas datetime objects @@ -140,9 +976,63 @@ def gpsCoordinatePostprocessing(ds, var, station_config={}): logger.info('processing '+var+' with relocation on ' + ', '.join([br.strftime('%Y-%m-%dT%H:%M:%S') for br in breaks])) return piecewise_smoothing_and_interpolation(ds[var].to_series(), breaks) + +def piecewise_smoothing_and_interpolation(data_series, breaks): + '''Smoothes, inter- or extrapolate the GPS observations. The processing is + done piecewise so that each period between station relocations are done + separately (no smoothing of the jump due to relocation). Piecewise linear + regression is then used to smooth the available observations. Then this + smoothed curve is interpolated linearly over internal gaps. Eventually, this + interpolated curve is extrapolated linearly for timestamps before the first + valid measurement and after the last valid measurement. + + Parameters + ---------- + data_series : pandas.Series + Series of observed latitude, longitude or elevation with datetime index. + breaks: list + List of timestamps of station relocation. First and last item should be + None so that they can be used in slice(breaks[i], breaks[i+1]) + + Returns + ------- + np.ndarray + Smoothed and interpolated values corresponding to the input series. + ''' + df_all = pd.Series(dtype=float) # Initialize an empty Series to gather all smoothed pieces + breaks = [None] + breaks + [None] + _inferred_series = [] + for i in range(len(breaks) - 1): + df = data_series.loc[slice(breaks[i], breaks[i+1])] + + # Drop NaN values and calculate the number of segments based on valid data + df_valid = df.dropna() + if df_valid.shape[0] > 2: + # Fit linear regression model to the valid data range + x = pd.to_numeric(df_valid.index).values.reshape(-1, 1) + y = df_valid.values.reshape(-1, 1) - -def calcHeatFlux(T_0, T_h, Tsurf_h, WS_h, z_WS, z_T, q_h, p_h, + model = LinearRegression() + model.fit(x, y) + + # Predict using the model for the entire segment range + x_pred = pd.to_numeric(df.index).values.reshape(-1, 1) + + y_pred = model.predict(x_pred) + df = pd.Series(y_pred.flatten(), index=df.index) + # adds to list the predicted values for the current segment + _inferred_series.append(df) + + df_all = pd.concat(_inferred_series) + + # Fill internal gaps with linear interpolation + df_all = df_all.interpolate(method='linear', limit_area='inside') + + # Remove duplicate indices and return values as numpy array + df_all = df_all[~df_all.index.duplicated(keep='last')] + return df_all.values + +def calculate_tubulent_heat_fluxes(T_0, T_h, Tsurf_h, WS_h, z_WS, z_T, q_h, p_h, kappa=0.4, WS_lim=1., z_0=0.001, g=9.82, es_0=6.1071, eps=0.622, gamma=16., L_sub=2.83e6, L_dif_max=0.01, c_pd=1005., aa=0.7, bb=0.75, cc=5., dd=0.35, R_d=287.05): @@ -213,7 +1103,7 @@ def calcHeatFlux(T_0, T_h, Tsurf_h, WS_h, z_WS, z_T, q_h, p_h, Latent heat flux ''' rho_atm = 100 * p_h / R_d / (T_h + T_0) # Calculate atmospheric density - nu = calcVisc(T_h, T_0, rho_atm) # Calculate kinematic viscosity + nu = calculate_viscosity(T_h, T_0, rho_atm) # Calculate kinematic viscosity SHF_h = xr.zeros_like(T_h) # Create empty xarrays LHF_h = xr.zeros_like(T_h) @@ -307,7 +1197,7 @@ def calcHeatFlux(T_0, T_h, Tsurf_h, WS_h, z_WS, z_T, q_h, p_h, LHF_h[HF_nan] = np.nan return SHF_h, LHF_h -def calcVisc(T_h, T_0, rho_atm): +def calculate_viscosity(T_h, T_0, rho_atm): '''Calculate kinematic viscosity of air Parameters @@ -330,7 +1220,7 @@ def calcVisc(T_h, T_0, rho_atm): # Kinematic viscosity of air in m^2/s return mu / rho_atm -def calcSpHumid(T_0, T_100, T_h, p_h, RH_cor_h, es_0=6.1071, es_100=1013.246, eps=0.622): +def calculate_specific_humidity(T_0, T_100, T_h, p_h, RH_cor_h, es_0=6.1071, es_100=1013.246, eps=0.622): '''Calculate specific humidity Parameters ---------- @@ -379,76 +1269,6 @@ def calcSpHumid(T_0, T_100, T_h, p_h, RH_cor_h, es_0=6.1071, es_100=1013.246, ep # Convert to kg/kg return RH_cor_h * q_sat / 100 - -def piecewise_smoothing_and_interpolation(data_series, breaks): - '''Smoothes, inter- or extrapolate the GPS observations. The processing is - done piecewise so that each period between station relocations are done - separately (no smoothing of the jump due to relocation). Piecewise linear - regression is then used to smooth the available observations. Then this - smoothed curve is interpolated linearly over internal gaps. Eventually, this - interpolated curve is extrapolated linearly for timestamps before the first - valid measurement and after the last valid measurement. - - Parameters - ---------- - data_series : pandas.Series - Series of observed latitude, longitude or elevation with datetime index. - breaks: list - List of timestamps of station relocation. First and last item should be - None so that they can be used in slice(breaks[i], breaks[i+1]) - - Returns - ------- - np.ndarray - Smoothed and interpolated values corresponding to the input series. - ''' - df_all = pd.Series(dtype=float) # Initialize an empty Series to gather all smoothed pieces - breaks = [None] + breaks + [None] - _inferred_series = [] - for i in range(len(breaks) - 1): - df = data_series.loc[slice(breaks[i], breaks[i+1])] - - # Drop NaN values and calculate the number of segments based on valid data - df_valid = df.dropna() - if df_valid.shape[0] > 2: - # Fit linear regression model to the valid data range - x = pd.to_numeric(df_valid.index).values.reshape(-1, 1) - y = df_valid.values.reshape(-1, 1) - - model = LinearRegression() - model.fit(x, y) - - # Predict using the model for the entire segment range - x_pred = pd.to_numeric(df.index).values.reshape(-1, 1) - - y_pred = model.predict(x_pred) - df = pd.Series(y_pred.flatten(), index=df.index) - # adds to list the predicted values for the current segment - _inferred_series.append(df) - - df_all = pd.concat(_inferred_series) - - # Fill internal gaps with linear interpolation - df_all = df_all.interpolate(method='linear', limit_area='inside') - - # Remove duplicate indices and return values as numpy array - df_all = df_all[~df_all.index.duplicated(keep='last')] - return df_all.values - - -def _calcAtmosDens(p_h, R_d, T_h, T_0): # TODO: check this shouldn't be in this step somewhere - '''Calculate atmospheric density''' - return 100 * p_h / R_d / (T_h + T_0) - -def _getTempK(T_0): - '''Return steam point temperature in K''' - return T_0+100 - -def _getRotation(): - '''Return degrees-to-radians and radians-to-degrees''' - deg2rad = np.pi / 180 - rad2deg = 1 / deg2rad - return deg2rad, rad2deg if __name__ == "__main__": # unittest.main() diff --git a/src/pypromice/process/get_l2tol3.py b/src/pypromice/process/get_l2tol3.py index ca08b17c..8805db74 100644 --- a/src/pypromice/process/get_l2tol3.py +++ b/src/pypromice/process/get_l2tol3.py @@ -50,8 +50,21 @@ def get_l2tol3(config_folder, inpath, outpath, variables, metadata): # importing station_config (dict) from config_folder (str path) config_file = config_folder+l2.attrs['station_id']+'.toml' - with open(config_file) as fp: - station_config = toml.load(fp) + + if os.path.exists(config_file): + # File exists, load the configuration + with open(config_file) as fp: + station_config = toml.load(fp) + else: + # File does not exist, initialize with standard info + # this was prefered by RSF over exiting with error + logger.error("\n***\nNo station_configuration file for %s.\nPlease create one on AWS-L0/metadata/station_configurations.\n***"%l2.attrs['station_id']) + station_config = {"stid":l2.attrs['station_id'], + "station_site":l2.attrs['station_id'], + "project": "PROMICE", + "location_type": "ice sheet", + } + # Perform Level 3 processing l3 = toL3(l2, station_config) diff --git a/src/pypromice/process/join_l3.py b/src/pypromice/process/join_l3.py index cf5b1e16..96963415 100644 --- a/src/pypromice/process/join_l3.py +++ b/src/pypromice/process/join_l3.py @@ -105,21 +105,25 @@ def readNead(infile): # combining thermocouple and CS100 temperatures ds['TA1'] = ds['TA1'].combine_first(ds['TA3']) ds['TA2'] = ds['TA2'].combine_first(ds['TA4']) - + + # renaming variables to the GEUS names ds=ds.rename(var_name) - standard_vars_to_drop = ["NR", "TA3", "TA4", "TA5", "NR_cor", - "z_surf_1", "z_surf_2", "z_surf_combined", - "TA2m", "RH2m", "VW10m", "SZA", "SAA", - "depth_t_i_1", "depth_t_i_2", "depth_t_i_3", "depth_t_i_4", "depth_t_i_5", - "depth_t_i_6", "depth_t_i_7", "depth_t_i_8", "depth_t_i_9", "depth_t_i_10", "t_i_10m" - ] + # variables always dropped from the historical GC-Net files + # could be move to the config files at some point + standard_vars_to_drop = ["NR", "TA3", "TA4", "TA5", "NR_cor", "TA2m", + "RH2m", "VW10m", "SZA", "SAA"] standard_vars_to_drop = standard_vars_to_drop + [v for v in list(ds.keys()) if v.endswith("_adj_flag")] # Drop the variables if they are present in the dataset ds = ds.drop_vars([var for var in standard_vars_to_drop if var in ds]) ds=ds.rename({'timestamp':'time'}) + + # in the historical GC-Net processing, periods with missing z_surf_combined + # are filled with a constant value, these values should be removed to + # allow a better alignement with the z_surf_combined estimated for the GEUS stations + ds['z_surf_combined'] = ds['z_surf_combined'].where(ds['z_surf_combined'].diff(dim='time')!=0) return ds def loadArr(infile, isNead): @@ -148,32 +152,78 @@ def loadArr(infile, isNead): print(f'{name} array loaded from {infile}') return ds, name -# %% will be used in the future -# def aligning_surface_heights(l3_merged, l3): - # df_aux['z_surf_combined'] = \ - # df_aux['z_surf_combined'] \ - # - df_aux.loc[df_aux.z_surf_combined.last_valid_index(), 'z_surf_combined'] \ - # + df_v6.loc[df_v6.z_surf_combined.first_valid_index(), 'z_surf_combined'] - - # if s == 'Swiss Camp 10m': - # df.loc[:df.HS_combined.first_valid_index(), 'HS_combined'] = \ - # df2.loc[:df.HS_combined.first_valid_index(), 'HS_combined'] \ - # - df2.loc[df2.HS_combined.last_valid_index(), 'HS_combined'] \ - # + df.loc[df.HS_combined.first_valid_index(), 'HS_combined'] +def align_surface_heights(data_series_new, data_series_old): + """ + Align two surface height time series based on the gap between their end and + start. + + If the gap between the end of `data_series_old` and the start of `data_series_new` + is less than a week, the function aligns them based on the median value of + the last week of `data_series_old` and the first week of `data_series_new`. + If the gap is larger than a week, it aligns them using a linear fit. If + there is overlap, the function uses the overlapping period to adjust the + newer time series. + + Parameters + ---------- + data_series_old : pandas.Series + The older time series data. + data_series_new : pandas.Series + The newer time series data. + Returns + ------- + numpy.ndarray + Array containing the aligned newer time series data. + """ + # Get the first and last valid indices of both series + last_old_idx = data_series_old.last_valid_index() + first_new_idx = data_series_new.first_valid_index() + + # Check for overlap + if first_new_idx <= last_old_idx: + # Find the overlapping period + overlap_start = first_new_idx + overlap_end = min(last_old_idx, overlap_start+pd.to_timedelta('7D')) + + # Compute the median values for the overlapping period + overlap_old = data_series_old[overlap_start:overlap_end].median() + overlap_new = data_series_new[overlap_start:overlap_end].median() + + if np.isnan(overlap_old) or np.isnan(overlap_new): + overlap_end = min(last_old_idx, data_series_new.last_valid_index()) + + # Compute the median values for the overlapping period + overlap_old = data_series_old[overlap_start:overlap_end].median() + overlap_new = data_series_new[overlap_start:overlap_end].median() - # df.loc[df.HS_combined.diff()==0,'HS_combined'] = np.nan - - # fit = np.polyfit(df.loc[df.HS_combined.notnull(),:].index.astype('int64'), - # df.loc[df.HS_combined.notnull(),'HS_combined'], 1) - # fit_fn = np.poly1d(fit) + # Align based on the overlapping median values + data_series_new = data_series_new - overlap_new + overlap_old + + elif (first_new_idx - last_old_idx).days <= 7: + # Compute the median of the last week of data in the old series + last_week_old = data_series_old[last_old_idx - pd.Timedelta(weeks=1):last_old_idx].median() + + # Compute the median of the first week of data in the new series + first_week_new = data_series_new[first_new_idx:first_new_idx + pd.Timedelta(weeks=1)].median() + + # Align based on the median values + data_series_new = data_series_new - first_week_new + last_week_old + else: + # Perform a linear fit + combined_series = pd.concat([data_series_old, data_series_new]) + fit = np.polyfit( + combined_series.dropna().index.astype('int64'), + combined_series.dropna().values, 1 + ) + fit_fn = np.poly1d(fit) + + data_series_new = data_series_new.values \ + - fit_fn(data_series_new.index.astype('int64')[0]) \ + + data_series_old[last_old_idx] - # df['HS_combined'] = df['HS_combined'].values \ - # - fit_fn( - # df_in.loc[[df_in.z_surf_combined.first_valid_index()],:].index.astype('int64')[0] - # ) + df_in.loc[df_in.z_surf_combined.first_valid_index(), 'z_surf_combined'] - # return l3_merged - # %% + return data_series_new + def build_station_list(config_folder: str, target_station_site: str) -> list: """ Get a list of unique station information dictionaries for a given station site. @@ -192,6 +242,7 @@ def build_station_list(config_folder: str, target_station_site: str) -> list: """ station_info_list = [] # Initialize an empty list to store station information + found_as_station = False for filename in os.listdir(config_folder): if filename.endswith(".toml"): file_path = os.path.join(config_folder, filename) @@ -202,10 +253,25 @@ def build_station_list(config_folder: str, target_station_site: str) -> list: stid = data.get("stid") # Get the station ID # Check if the station site matches the target and stid is unique + if stid == target_station_site: + found_as_station = True if station_site == target_station_site and stid: station_info = data.copy() # Copy all attributes from the TOML file station_info_list.append(station_info) # Add the station info to the list + + if len(station_info_list)==0 and not found_as_station: + logger.error('\n***\nNo station_configuration file found for %s.\nProcessing it as a single-station PROMICE site.\n***'%target_station_site) + station_info = { + "stid": target_station_site, + "station_site": target_station_site, + "project": "PROMICE", + "location_type": "ice sheet", + } + station_info_list.append(station_info) + elif len(station_info_list)==0 : + logger.error('\n***\nThe name \"%s\" passed to join_l3 is a station name and not a site name (e.g. SCO_Lv3 instead of SCO_L). Please provide a site name that is named at least once in the "station_site" attribute of the station configuration files.\n***'%target_station_site) + return station_info_list def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, metadata): @@ -224,7 +290,7 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me filepath = os.path.join(folder_gcnet, stid+'.csv') isNead = True if not os.path.isfile(filepath): - logger.info(stid+' was listed as station but could not be found in '+folder_l3+' nor '+folder_gcnet) + logger.error('\n***\n'+stid+' was listed as station but could not be found in '+folder_l3+' nor '+folder_gcnet+'\n***') continue l3, _ = loadArr(filepath, isNead) @@ -238,11 +304,12 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me list_station_data.append((l3, station_info)) # Sort the list in reverse chronological order so that we start with the latest data - sorted_list_station_data = sorted(list_station_data, key=lambda x: x[0].time.max(), reverse=True) + sorted_list_station_data = sorted(list_station_data, key=lambda x: x[0].time.min(), reverse=True) sorted_stids = [info["stid"] for _, info in sorted_list_station_data] logger.info('joining %s' % ' '.join(sorted_stids)) l3_merged = None + for l3, station_info in sorted_list_station_data: stid = station_info["stid"] @@ -264,7 +331,7 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me # creating the station_attributes attribute in l3_merged l3_merged.attrs["stations_attributes"] = st_attrs - + else: # if l3 (older data) is missing variables compared to l3_merged (newer data) # , then we fill them with nan @@ -285,8 +352,18 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me for k in attrs_list: del l3.attrs[k] - l3_merged.attrs['stations_attributes'][stid]['first_timestamp'] = l3.time.isel(time=0).dt.strftime( date_format='%Y-%m-%d %H:%M:%S').item() - l3_merged.attrs['stations_attributes'][stid]['last_timestamp'] = l3_merged.time.isel(time=0).dt.strftime( date_format='%Y-%m-%d %H:%M:%S').item() + l3_merged.attrs['stations_attributes'][stid]['first_timestamp'] = \ + l3.time.isel(time=0).dt.strftime( date_format='%Y-%m-%d %H:%M:%S').item() + l3_merged.attrs['stations_attributes'][stid]['last_timestamp'] = \ + l3_merged.time.isel(time=0).dt.strftime( date_format='%Y-%m-%d %H:%M:%S').item() + + # adjusting surface height in the most recent data (l3_merged) + # so that it shows continuity with the older data (l3) + l3_merged['z_surf_combined'] = ('time', + align_surface_heights( + l3_merged.z_surf_combined.to_series(), + l3.z_surf_combined.to_series()) + ) # merging by time block l3_merged = xr.concat((l3.sel( @@ -297,10 +374,13 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me # Assign site id if not l3_merged: - logger.error('No level 2 data file found for '+site) + logger.error('No level 3 station data file found for '+site) + return None, sorted_list_station_data l3_merged.attrs['site_id'] = site l3_merged.attrs['stations'] = ' '.join(sorted_stids) l3_merged.attrs['level'] = 'L3' + l3_merged.attrs['project'] = sorted_list_station_data[0][1]['project'] + l3_merged.attrs['location_type'] = sorted_list_station_data[0][1]['location_type'] v = getVars(variables) m = getMeta(metadata) diff --git a/src/pypromice/process/write.py b/src/pypromice/process/write.py index 647d6f75..58b87278 100644 --- a/src/pypromice/process/write.py +++ b/src/pypromice/process/write.py @@ -32,6 +32,9 @@ def prepare_and_write(dataset, outpath, vars_df=None, meta_dict=None, time='60mi if resample: d2 = resample_dataset(dataset, time) logger.info('Resampling to '+str(time)) + if len(d2.time) == 1: + logger.warning('Output of resample has length 1. Not enough data to calculate daily/monthly average.') + return None else: d2 = dataset.copy() @@ -248,43 +251,44 @@ def addMeta(ds, meta): ds : xarray.Dataset Dataset with metadata ''' - if 'lon' in ds.keys(): - # caluclating average coordinates based on the extra/interpolated coords - for v in ['lat','lon','alt']: - ds.attrs[v+'_avg'] = ds[v].mean().item() - # dropping the less accurate standard coordinates given in the - # raw or tx config files - for v in ['latitude','longitude']: - if v in ds.attrs.keys(): - del ds.attrs[v] - elif 'gps_lon' in ds.keys(): - # caluclating average coordinates based on the measured coords (can be gappy) - for v in ['gps_lat','gps_lon','gps_alt']: - if v in ds.keys(): - ds.attrs[v+'_avg'] = ds[v].mean().item() - else: - ds.attrs[v+'_avg'] = np.nan - # dropping the less accurate standard coordinates given in the - # raw or tx config files - for v in ['latitude','longitude']: - if v in ds.attrs.keys(): - del ds.attrs[v] + + # a static latitude, longitude and altitude is saved as attribute along its origin + var_alias = {'lat':'latitude','lon':'longitude','alt':'altitude'} + for v in ['lat','lon','alt']: + # saving the reference latitude/longitude/altitude + original_value = np.nan + if var_alias[v] in ds.attrs.keys(): + original_value = ds.attrs[var_alias[v]] + if v in ds.keys(): + # if possible, replacing it with average coordinates based on the extra/interpolated coords + ds.attrs[var_alias[v]] = ds[v].mean().item() + ds.attrs[var_alias[v]+'_origin'] = 'average of gap-filled postprocessed '+v + elif 'gps_'+v in ds.keys(): + # if possible, replacing it with average coordinates based on the measured coords (can be gappy) + ds.attrs[var_alias[v]] = ds['gps_'+v].mean().item() + ds.attrs[var_alias[v]+'_origin'] = 'average of GPS-measured '+v+', potentially including gaps' + + if np.isnan(ds.attrs[var_alias[v]]): + # if no better data was available to update the coordinate, then we + # re-use the original value + ds.attrs[var_alias[v]] = original_value + ds.attrs[var_alias[v]+'_origin'] = 'reference value, origin unknown' # Attribute convention for data discovery # https://wiki.esipfed.org/Attribute_Convention_for_Data_Discovery_1-3 # Determine the temporal resolution - time_diff = pd.Timedelta((ds['time'][1] - ds['time'][0]).values) - if time_diff == pd.Timedelta('10min'): - sample_rate = "10min" - elif time_diff == pd.Timedelta('1H'): - sample_rate = "hourly" - elif time_diff == pd.Timedelta('1D'): - sample_rate = "daily" - elif 28 <= time_diff.days <= 31: - sample_rate = "monthly" - else: - sample_rate = "unknown_sample_rate" + sample_rate = "unknown_sample_rate" + if len(ds['time'])>1: + time_diff = pd.Timedelta((ds['time'][1] - ds['time'][0]).values) + if time_diff == pd.Timedelta('10min'): + sample_rate = "10min" + elif time_diff == pd.Timedelta('1h'): + sample_rate = "hourly" + elif time_diff == pd.Timedelta('1D'): + sample_rate = "daily" + elif 28 <= time_diff.days <= 31: + sample_rate = "monthly" if 'station_id' in ds.attrs.keys(): ds.attrs['id'] = 'dk.geus.promice.station.' + ds.attrs['station_id']+'.'+sample_rate @@ -444,4 +448,7 @@ def reformat_lon(dataset, exempt=['UWN', 'Roof_GEUS', 'Roof_PROMICE']): if 'gps_lon' not in dataset.keys(): return dataset dataset['gps_lon'] = np.abs(dataset['gps_lon']) * -1 + if 'lon' not in dataset.keys(): + return dataset + dataset['lon'] = np.abs(dataset['lon']) * -1 return dataset diff --git a/src/pypromice/qc/persistence.py b/src/pypromice/qc/persistence.py index 82fe6df8..3825f5df 100644 --- a/src/pypromice/qc/persistence.py +++ b/src/pypromice/qc/persistence.py @@ -81,8 +81,9 @@ def persistence_qc( if variable_thresholds is None: variable_thresholds = DEFAULT_VARIABLE_THRESHOLDS - - logger.info(f"Running persistence_qc using {variable_thresholds}") + logger.debug(f"Running persistence_qc using {variable_thresholds}") + else: + logger.info(f"Running persistence_qc using custom thresholds:\n {variable_thresholds}") for k in variable_thresholds.keys(): if k in ["t", "p", "rh", "wspd", "wdir", "z_boom"]: @@ -180,9 +181,7 @@ def get_duration_consecutive_true( """ is_first = series.astype("int").diff() == 1 - delta_time = (series.index.diff().total_seconds() / 3600).to_series( - index=series.index - ) + delta_time = (series.index.to_series().diff().dt.total_seconds() / 3600) cumsum = delta_time.cumsum() offset = (is_first * (cumsum - delta_time)).replace(0, np.nan).ffill().fillna(0) diff --git a/src/pypromice/resources/file_attributes.csv b/src/pypromice/resources/file_attributes.csv index 9fa56bc4..a2482b7d 100644 --- a/src/pypromice/resources/file_attributes.csv +++ b/src/pypromice/resources/file_attributes.csv @@ -1,56 +1,55 @@ -attribute,entry -acknowledgements,The Programme for Monitoring of the Greenland Ice Sheet (PROMICE) -alt.axis,Z -alt.coverage_content_type,coordinate -gps_alt.positive,up -cdm_data_type, -comment,https://doi.org/10.22008/promice/data/aws -contributor_name, -contributor_role, -conventions,ACDD-1.3; CF-1.7 -creater_email,pho@geus.dk -creater_url,https://promice.dk -creator_institution,Geological Survey of Denmark and Greenland (GEUS) -creator_name,Penelope How -creator_type,person -featureType,timeSeries -geospatial_bounds_crs,EPSG:4979 -geospatial_lat_extents_match,gps_lat -geospatial_lat_resolution, -geospatial_lat_units,degrees_north -geospatial_lon_extents_match,gps_lon -geospatial_lon_resolution, -geospatial_lon_units,degrees_east -geospatial_vertical_resolution, -geospatial_vertical_units,EPSG:4979 -institution,Geological Survey of Denmark and Greenland (GEUS) -instrument,See https://doi.org/10.5194/essd-13-3819-2021 -instrument_vocabulary,GCMD:GCMD Keywords -keywords,GCMDSK:EARTH SCIENCE > CRYOSPHERE > GLACIERS/ICE SHEETS > ICE SHEETS > ICE SHEET MEASUREMENTS; GCMDSK:EARTH SCIENCE > CRYOSPHERE > GLACIERS/ICE SHEETS > GLACIER MASS BALANCE/ICE SHEET MASS BALANCE; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > SNOW/ICE TEMPERATURE; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > SNOW MELT; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > SNOW DEPTH; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > ICE VELOCITY; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > ALBEDO; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > ALBEDO; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > ICE GROWTH/MELT; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > ICE VELOCITY; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > SNOW DEPTH; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > SNOW MELT; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > SNOW/ICE TEMPERATURE; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC PRESSURE; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > ALBEDO; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > INCOMING SOLAR RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > LONGWAVE RADIATION > DOWNWELLING LONGWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > LONGWAVE RADIATION > UPWELLING LONGWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > LONGWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > NET RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > OUTGOING LONGWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > RADIATIVE FLUX; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > RADIATIVE FORCING; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > SHORTWAVE RADIATION > DOWNWELLING SHORTWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > SHORTWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > SUNSHINE; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC TEMPERATURE > SURFACE TEMPERATURE > AIR TEMPERATURE; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WATER VAPOR > WATER VAPOR INDICATORS > HUMIDITY > ABSOLUTE HUMIDITY; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WATER VAPOR > WATER VAPOR INDICATORS > HUMIDITY > RELATIVE HUMIDITY; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > LOCAL WINDS; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS > U/V WIND COMPONENTS; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS > WIND DIRECTION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS > WIND SPEED; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS; GCMDSK:EARTH SCIENCE > ATMOSPHERE > CLOUDS; GCMDSK:EARTH SCIENCE > ATMOSPHERE > PRECIPITATION -keywords_vocabulary,GCMDSK:GCMD Science Keywords:https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/sciencekeywords -lat.axis,Y -lat.coverage_content_type,coordinate -lat.long_name,station latitude -license,Creative Commons Attribution 4.0 International (CC-BY-4.0) https://creativecommons.org/licenses/by/4.0 -lon.axis,X -lon.coverage_content_type,coordinate -lon.long_name,station longitude -lon.units,degrees_east -metadata_link, -naming_authority,dk.geus.promice -platform, -platform_vocabulary,GCMD:GCMD Keywords -processing_level,Level 3 -product_status,beta -product_version,4 -program,PROMICE -project,PROMICE -publisher_email,info@promice.dk -publisher_institution,GEUS -publisher_name,GEUS -publisher_type,institution -publisher_url,https://promice.dk -references,"How, P.; Abermann, J.; Ahlstrøm, A.P.; Andersen, S.B.; Box, J. E.; Citterio, M.; Colgan, W.T.; Fausto. R.S.; Karlsson, N.B.; Jakobsen, J.; Langley, K.; Larsen, S.H.; Mankoff, K.D.; Pedersen, A.Ø.; Rutishauser, A.; Shield, C.L.; Solgaard, A.M.; van As, D.; Vandecrux, B.; Wright, P.J., 2022, ""PROMICE and GC-Net automated weather station data in Greenland"", https://doi.org/10.22008/FK2/IW73UU, GEUS Dataverse" -references_bib,@article{How2022; doi = {10.22008/FK2/IW73UU}; url = {https://doi.org/10.22008/FK2/IW73UU}; year = {2022}; month=10; publisher= {GEUS Dataverse}; author = {Penelope How and Jakob Abermann and Andreas P. Ahlstr{\o}m and Signe B. Andersen and Jason E. Box and Michele Citterio and William Colgan and Robert S. Fausto and Nanna B. Karlsson and Jakob Jakobsen and Kirsty Langley and Signe Hillerup Larsen and Kenneth D. Mankoff and Allan {\O}. Pedersen and Anja Rutishauser and Christopher L. Shields and Anne M. Solgaard and Dirk van As and Baptiste Vandecrux}; title = {PROMICE and GC-Net automated weather station data in Greenland}; journal = {GEUS Dataverse}} -standard_name_vocabulary,CF Standard Name Table (v77; 19 January 2021) -summary,"The Programme for Monitoring of the Greenland Ice Sheet (PROMICE) and Greenland Climate Network (GC-Net) have been measuring climate and ice sheet properties since 2007 and 1995, respectively. The PROMICE weather station network monitors glacier mass balance in the melt zone of the Greenland Ice Sheet, providing ground truth data to calibrate mass budget models. GC-Net weather stations measure snowfall and surface properties in the accumulation zone, providing valuable knowledge on the Greenland Ice Sheet’s mass gain and climatology.Accurate measurements of the surface and near-surface atmospheric conditions in a changing climate is important for reliable present and future assessment of changes to the Greenland Ice Sheet. All measurements are handled and processed with pypromice, which is a peer-reviewed and freely available Python package with source code available at https://github.com/GEUS-Glaciology-and-Climate/pypromice. A user-contributable dynamic web-based database of known data quality issues is associated with the data products at https://github.com/GEUS-PROMICE/ PROMICE-AWS-data-issues/." +attribute,entry +acknowledgements,The Programme for Monitoring of the Greenland Ice Sheet (PROMICE) +alt.axis,Z +alt.coverage_content_type,coordinate +gps_alt.positive,up +cdm_data_type, +comment,https://doi.org/10.22008/promice/data/aws +contributor_name, +contributor_role, +conventions,ACDD-1.3; CF-1.7 +creater_email,pho@geus.dk +creater_url,https://promice.dk +creator_institution,Geological Survey of Denmark and Greenland (GEUS) +creator_name,Penelope How +creator_type,person +featureType,timeSeries +geospatial_bounds_crs,EPSG:4979 +geospatial_lat_extents_match,gps_lat +geospatial_lat_resolution, +geospatial_lat_units,degrees_north +geospatial_lon_extents_match,gps_lon +geospatial_lon_resolution, +geospatial_lon_units,degrees_east +geospatial_vertical_resolution, +geospatial_vertical_units,EPSG:4979 +institution,Geological Survey of Denmark and Greenland (GEUS) +instrument,See https://doi.org/10.5194/essd-13-3819-2021 +instrument_vocabulary,GCMD:GCMD Keywords +keywords,GCMDSK:EARTH SCIENCE > CRYOSPHERE > GLACIERS/ICE SHEETS > ICE SHEETS > ICE SHEET MEASUREMENTS; GCMDSK:EARTH SCIENCE > CRYOSPHERE > GLACIERS/ICE SHEETS > GLACIER MASS BALANCE/ICE SHEET MASS BALANCE; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > SNOW/ICE TEMPERATURE; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > SNOW MELT; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > SNOW DEPTH; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > ICE VELOCITY; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > ALBEDO; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > ALBEDO; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > ICE GROWTH/MELT; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > ICE VELOCITY; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > SNOW DEPTH; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > SNOW MELT; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > SNOW/ICE TEMPERATURE; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC PRESSURE; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > ALBEDO; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > INCOMING SOLAR RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > LONGWAVE RADIATION > DOWNWELLING LONGWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > LONGWAVE RADIATION > UPWELLING LONGWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > LONGWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > NET RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > OUTGOING LONGWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > RADIATIVE FLUX; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > RADIATIVE FORCING; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > SHORTWAVE RADIATION > DOWNWELLING SHORTWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > SHORTWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > SUNSHINE; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC TEMPERATURE > SURFACE TEMPERATURE > AIR TEMPERATURE; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WATER VAPOR > WATER VAPOR INDICATORS > HUMIDITY > ABSOLUTE HUMIDITY; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WATER VAPOR > WATER VAPOR INDICATORS > HUMIDITY > RELATIVE HUMIDITY; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > LOCAL WINDS; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS > U/V WIND COMPONENTS; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS > WIND DIRECTION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS > WIND SPEED; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS; GCMDSK:EARTH SCIENCE > ATMOSPHERE > CLOUDS; GCMDSK:EARTH SCIENCE > ATMOSPHERE > PRECIPITATION +keywords_vocabulary,GCMDSK:GCMD Science Keywords:https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/sciencekeywords +lat.axis,Y +lat.coverage_content_type,coordinate +lat.long_name,station latitude +license,Creative Commons Attribution 4.0 International (CC-BY-4.0) https://creativecommons.org/licenses/by/4.0 +lon.axis,X +lon.coverage_content_type,coordinate +lon.long_name,station longitude +lon.units,degrees_east +metadata_link, +naming_authority,dk.geus.promice +platform, +platform_vocabulary,GCMD:GCMD Keywords +processing_level,Level 3 +product_status,beta +product_version,4 +program,PROMICE +publisher_email,info@promice.dk +publisher_institution,GEUS +publisher_name,GEUS +publisher_type,institution +publisher_url,https://promice.dk +references,"How, P.; Abermann, J.; Ahlstrøm, A.P.; Andersen, S.B.; Box, J. E.; Citterio, M.; Colgan, W.T.; Fausto. R.S.; Karlsson, N.B.; Jakobsen, J.; Langley, K.; Larsen, S.H.; Mankoff, K.D.; Pedersen, A.Ø.; Rutishauser, A.; Shield, C.L.; Solgaard, A.M.; van As, D.; Vandecrux, B.; Wright, P.J., 2022, ""PROMICE and GC-Net automated weather station data in Greenland"", https://doi.org/10.22008/FK2/IW73UU, GEUS Dataverse" +references_bib,@article{How2022; doi = {10.22008/FK2/IW73UU}; url = {https://doi.org/10.22008/FK2/IW73UU}; year = {2022}; month=10; publisher= {GEUS Dataverse}; author = {Penelope How and Jakob Abermann and Andreas P. Ahlstr{\o}m and Signe B. Andersen and Jason E. Box and Michele Citterio and William Colgan and Robert S. Fausto and Nanna B. Karlsson and Jakob Jakobsen and Kirsty Langley and Signe Hillerup Larsen and Kenneth D. Mankoff and Allan {\O}. Pedersen and Anja Rutishauser and Christopher L. Shields and Anne M. Solgaard and Dirk van As and Baptiste Vandecrux}; title = {PROMICE and GC-Net automated weather station data in Greenland}; journal = {GEUS Dataverse}} +standard_name_vocabulary,CF Standard Name Table (v77; 19 January 2021) +summary,"The Programme for Monitoring of the Greenland Ice Sheet (PROMICE) and Greenland Climate Network (GC-Net) have been measuring climate and ice sheet properties since 2007 and 1995, respectively. The PROMICE weather station network monitors glacier mass balance in the melt zone of the Greenland Ice Sheet, providing ground truth data to calibrate mass budget models. GC-Net weather stations measure snowfall and surface properties in the accumulation zone, providing valuable knowledge on the Greenland Ice Sheet’s mass gain and climatology.Accurate measurements of the surface and near-surface atmospheric conditions in a changing climate is important for reliable present and future assessment of changes to the Greenland Ice Sheet. All measurements are handled and processed with pypromice, which is a peer-reviewed and freely available Python package with source code available at https://github.com/GEUS-Glaciology-and-Climate/pypromice. A user-contributable dynamic web-based database of known data quality issues is associated with the data products at https://github.com/GEUS-PROMICE/ PROMICE-AWS-data-issues/." diff --git a/src/pypromice/resources/variable_aliases_GC-Net.csv b/src/pypromice/resources/variable_aliases_GC-Net.csv index d84c0a71..afac35f7 100644 --- a/src/pypromice/resources/variable_aliases_GC-Net.csv +++ b/src/pypromice/resources/variable_aliases_GC-Net.csv @@ -64,15 +64,15 @@ z_surf_2,HS2 z_surf_1_adj_flag,HS1_adj_flag z_surf_2_adj_flag,HS2_adj_flag z_surf_combined,HS_combined -depth_t_i_1,DTS1 -depth_t_i_2,DTS2 -depth_t_i_3,DTS3 -depth_t_i_4,DTS4 -depth_t_i_5,DTS5 -depth_t_i_6,DTS6 -depth_t_i_7,DTS7 -depth_t_i_8,DTS8 -depth_t_i_9,DTS9 -depth_t_i_10,DTS10 -depth_t_i_11, +d_t_i_1,DTS1 +d_t_i_2,DTS2 +d_t_i_3,DTS3 +d_t_i_4,DTS4 +d_t_i_5,DTS5 +d_t_i_6,DTS6 +d_t_i_7,DTS7 +d_t_i_8,DTS8 +d_t_i_9,DTS9 +d_t_i_10,DTS10 +d_t_i_11, t_i_10m,TS_10m diff --git a/src/pypromice/resources/variables.csv b/src/pypromice/resources/variables.csv index 78354105..7630e813 100644 --- a/src/pypromice/resources/variables.csv +++ b/src/pypromice/resources/variables.csv @@ -41,6 +41,9 @@ z_stake,distance_to_surface_from_stake_assembly,Stake height,m,physicalMeasureme z_stake_q,distance_to_surface_from_stake_assembly_quality,Stake height (quality),-,qualityInformation,time,TRUE,L0 or L2,,,,one-boom,1,1,0,4 z_pt,depth_of_pressure_transducer_in_ice,Depth of pressure transducer in ice,m,physicalMeasurement,time,FALSE,,0,30,z_pt_cor,one-boom,1,1,1,4 z_pt_cor,depth_of_pressure_transducer_in_ice_corrected,Depth of pressure transducer in ice - corrected,m,modelResult,time,FALSE,L2 or later,0,30,,one-boom,0,1,1,4 +z_surf_combined,height_of_surface_combined,"Surface height combined from multiple sensors, relative to ice surface height at installation",m,modelResult,time,FALSE,L3,,,,all,0,0,1,4 +z_ice_surf,height_of_ice_surface,"Ice surface height, relative to ice surface height at installation and calculated from pt_cor and z_stake",m,modelResult,time,FALSE,L3,,,,one-boom,0,0,1,4 +snow_height,height_of_snow,"Snow surface height, relative to ice surface",m,modelResult,time,FALSE,L3,0,,,one-boom,0,0,1,4 precip_u,precipitation,Precipitation (upper boom) (cumulative solid & liquid),mm,physicalMeasurement,time,TRUE,,0,,precip_u_cor precip_u_rate,all,1,1,1,4 precip_u_cor,precipitation_corrected,Precipitation (upper boom) (cumulative solid & liquid) – corrected,mm,modelResult,time,TRUE,L2 or later,0,,,all,0,1,1,4 precip_u_rate,precipitation_rate,Precipitation rate (upper boom) (cumulative solid & liquid) – corrected,mm,modelResult,time,TRUE,L2 or later,0,,,all,0,1,1,4 @@ -58,21 +61,33 @@ t_i_8,ice_temperature_at_t8,Ice temperature at sensor 8,degrees_C,physicalMeasur t_i_9,ice_temperature_at_t9,Ice temperature at sensor 9,degrees_C,physicalMeasurement,time,FALSE,,-80,1,,two-boom,1,1,1,4 t_i_10,ice_temperature_at_t10,Ice temperature at sensor 10,degrees_C,physicalMeasurement,time,FALSE,,-80,1,,two-boom,1,1,1,4 t_i_11,ice_temperature_at_t11,Ice temperature at sensor 11,degrees_C,physicalMeasurement,time,FALSE,,-80,1,,two-boom,1,1,1,4 +d_t_i_1,depth_of_thermistor_1,Depth of thermistor 1,m,modelResult,time,FALSE,L3,-10,100,,all,0,0,1,4 +d_t_i_2,depth_of_thermistor_2,Depth of thermistor 2,m,modelResult,time,FALSE,L3,-10,100,,all,0,0,1,4 +d_t_i_3,depth_of_thermistor_3,Depth of thermistor 3,m,modelResult,time,FALSE,L3,-10,100,,all,0,0,1,4 +d_t_i_4,depth_of_thermistor_4,Depth of thermistor 4,m,modelResult,time,FALSE,L3,-10,100,,all,0,0,1,4 +d_t_i_5,depth_of_thermistor_5,Depth of thermistor 5,m,modelResult,time,FALSE,L3,-10,100,,all,0,0,1,4 +d_t_i_6,depth_of_thermistor_6,Depth of thermistor 6,m,modelResult,time,FALSE,L3,-10,100,,all,0,0,1,4 +d_t_i_7,depth_of_thermistor_7,Depth of thermistor 7,m,modelResult,time,FALSE,L3,-10,100,,all,0,0,1,4 +d_t_i_8,depth_of_thermistor_8,Depth of thermistor 8,m,modelResult,time,FALSE,L3,-10,100,,all,0,0,1,4 +d_t_i_9,depth_of_thermistor_9,Depth of thermistor 9,m,modelResult,time,FALSE,L3,-10,100,,two-boom,0,0,1,4 +d_t_i_10,depth_of_thermistor_10,Depth of thermistor 10,m,modelResult,time,FALSE,L3,-10,100,,two-boom,0,0,1,4 +d_t_i_11,depth_of_thermistor_11,Depth of thermistor 11,m,modelResult,time,FALSE,L3,-10,100,,two-boom,0,0,1,4 +t_i_10m,10m_subsurface_temperature,10 m subsurface temperature,degrees_C,modelResult,time,FALSE,L3,-70,0,,all,0,0,1,4 tilt_x,platform_view_angle_x,Tilt to east,degrees,physicalMeasurement,time,FALSE,,-30,30,dsr_cor usr_cor albedo,all,1,1,1,4 tilt_y,platform_view_angle_y,Tilt to north,degrees,physicalMeasurement,time,FALSE,,-30,30,dsr_cor usr_cor albedo,all,1,1,1,4 rot,platform_azimuth_angle,Station rotation from true North,degrees,physicalMeasurement,time,FALSE,,0,360,,all,1,1,1,2 -gps_lat,gps_latitude,Latitude,degrees_north,coordinate,time,TRUE,,50,83,,all,1,1,1,6 -gps_lon,gps_longitude,Longitude,degrees_east,coordinate,time,TRUE,,5,70,,all,1,1,1,6 -gps_alt,gps_altitude,Altitude,m,coordinate,time,TRUE,,0,3000,,all,1,1,1,2 +gps_lat,gps_latitude,Latitude,degrees_north,physicalMeasurement,time,TRUE,,50,83,,all,1,1,1,6 +gps_lon,gps_longitude,Longitude,degrees_east,physicalMeasurement,time,TRUE,,5,70,,all,1,1,1,6 +gps_alt,gps_altitude,Altitude above mean sea level (orthometric height),m,physicalMeasurement,time,TRUE,,0,3000,,all,1,1,1,2 gps_time,gps_time,GPS time,s,physicalMeasurement,time,TRUE,L0 or L2,0,240000,,all,1,1,0, gps_geoid,gps_geoid_separation,Height of EGM96 geoid over WGS84 ellipsoid,m,physicalMeasurement,time,TRUE,L0 or L2,,,,one-boom,1,1,0, gps_geounit,gps_geounit,GeoUnit,-,qualityInformation,time,TRUE,L0 or L2,,,,all,1,1,0, gps_hdop,gps_hdop,GPS horizontal dillution of precision (HDOP),m,qualityInformation,time,TRUE,L0 or L2,,,,all,1,1,0,2 gps_numsat,gps_numsat,GPS number of satellites,-,qualityInformation,time,TRUE,L0 or L2,,,,,1,1,0,0 gps_q,gps_q,Quality,-,qualityInformation,time,TRUE,L0 or L2,,,,,1,1,0, -lat,latitude_postprocessed,smoothed and interpolated latitude of station (best estimate),degrees_north,modelResult,time,TRUE,L3,,,,all,0,0,1,6 -lon,longitude_postprocessed,smoothed and interpolated longitude of station (best estimate),degrees_east,modelResult,time,TRUE,L3,,,,all,0,0,1,6 -alt,altitude_postprocessed,smoothed and interpolated altitude of station (best estimate),m,modelResult,time,TRUE,L3,,,,all,0,0,1,2 +lat,latitude_postprocessed,smoothed and interpolated latitude of station,degrees_north,modelResult,time,TRUE,L3,,,,all,0,0,1,6 +lon,longitude_postprocessed,smoothed and interpolated longitude of station,degrees_east,modelResult,time,TRUE,L3,,,,all,0,0,1,6 +alt,altitude_postprocessed,smoothed and interpolated altitude of station above mean sea level (orthometric height),m,modelResult,time,TRUE,L3,,,,all,0,0,1,2 batt_v,battery_voltage,Battery voltage,V,physicalMeasurement,time,TRUE,,0,30,,all,1,1,1,2 batt_v_ini,,,-,physicalMeasurement,time,TRUE,L0 or L2,0,30,,,1,1,0,2 batt_v_ss,battery_voltage_at_sample_start,Battery voltage (sample start),V,physicalMeasurement,time,TRUE,L0 or L2,0,30,,,1,1,0,2 @@ -88,4 +103,4 @@ rh_i_cor,relative_humidity_corrected,Relative humidity (instantaneous) – corre wspd_i,wind_speed,Wind speed (instantaneous),m s-1,physicalMeasurement,time,TRUE,,0,100,wdir_i wspd_x_i wspd_y_i,all,1,1,1,4 wdir_i,wind_from_direction,Wind from direction (instantaneous),degrees,physicalMeasurement,time,TRUE,,1,360,wspd_x_i wspd_y_i,all,1,1,1,4 wspd_x_i,wind_speed_from_x_direction,Wind speed from x direction (instantaneous),m s-1,modelResult,time,TRUE,L2 or later,-100,100,wdir_i wspd_i,all,0,1,1,4 -wspd_y_i,wind_speed_from_y_direction,Wind speed from y direction (instantaneous),m s-1,modelResult,time,TRUE,L2 or later,-100,100,wdir_i wspd_i,all,0,1,1,4 \ No newline at end of file +wspd_y_i,wind_speed_from_y_direction,Wind speed from y direction (instantaneous),m s-1,modelResult,time,TRUE,L2 or later,-100,100,wdir_i wspd_i,all,0,1,1,4 diff --git a/tests/data/test_config1.toml b/tests/data/test_config1.toml index 09392c08..6b9e2e95 100644 --- a/tests/data/test_config1.toml +++ b/tests/data/test_config1.toml @@ -2,6 +2,7 @@ station_id = 'TEST1' logger_type = 'CR1000X' nodata = ['-999', 'NAN'] # if one is a string, all must be strings number_of_booms = 1 #1-boom = promice, 2-boom = gc-net +site_type = 'ablation' latitude = 79.91 longitude = 24.09 diff --git a/tests/data/test_config2.toml b/tests/data/test_config2.toml index 8213d955..2c632abe 100644 --- a/tests/data/test_config2.toml +++ b/tests/data/test_config2.toml @@ -2,6 +2,7 @@ station_id = 'TEST2' logger_type = 'CR1000X' nodata = ['-999', 'NAN'] # if one is a string, all must be strings number_of_booms = 2 +site_type = 'accumulation' ['test_raw_transmitted2.txt'] format = 'TX' diff --git a/tests/unit/qc/test_persistence.py b/tests/unit/qc/test_persistence.py index d343b0bc..8f4d813a 100644 --- a/tests/unit/qc/test_persistence.py +++ b/tests/unit/qc/test_persistence.py @@ -23,10 +23,10 @@ def _test_1_hour_repeat(self, index: int): start="2023-01-26", end="2023-01-27", freq="h", tz="utc", inclusive="left" ) input_series = pd.Series(index=time_range, data=np.arange(0, len(time_range))) - input_series[index + 1] = input_series[index] + input_series.iloc[index + 1] = input_series.iloc[index] min_repeats = 1 expected_output = input_series.map(lambda _: False) - expected_output[index + 1] = True + expected_output.iloc[index + 1] = True persistent_mask = find_persistent_regions( input_series, min_repeats=min_repeats, max_diff=0.001 @@ -62,9 +62,9 @@ def test_persistent_period_longer_than_period_threshold(self): expected_filter_start = 27 expected_filter_end = 33 input_series = pd.Series(index=time_range, data=np.arange(0, len(time_range))) - input_series[index_start:index_end] = input_series[index_start] + input_series.iloc[index_start:index_end] = input_series.iloc[index_start] expected_output = input_series.map(lambda _: False) - expected_output[expected_filter_start:expected_filter_end] = True + expected_output.iloc[expected_filter_start:expected_filter_end] = True persistent_mask = find_persistent_regions( input_series, min_repeats=min_repeats, max_diff=0.001 @@ -82,7 +82,7 @@ def test_period_threshold_longer_than_persistent_period(self): index_end = 27 min_repeats = 10 input_series = pd.Series(index=time_range, data=np.arange(0, len(time_range))) - input_series[index_start:index_end] = input_series[index_start] + input_series.iloc[index_start:index_end] = input_series.iloc[index_start] expected_output = input_series.map(lambda _: False) persistent_mask = find_persistent_regions( @@ -101,7 +101,7 @@ def test_persistent_period_at_the_end(self): min_repeats = 4 expected_filter_start = 27 input_series = pd.Series(index=time_range, data=np.arange(0, len(time_range))) - input_series[index_start:] = input_series[index_start] + input_series.iloc[index_start:] = input_series.iloc[index_start] expected_output = input_series.map(lambda _: False) expected_output[expected_filter_start:] = True @@ -121,10 +121,10 @@ def test_dont_filter_nan_values(self): index=time_range, data=np.zeros_like(time_range, dtype="float") ) min_repeats = 4 - input_series[:] = np.nan - input_series[9] = -11 - input_series[10:12] = -10 - input_series[15] = -9 + input_series.iloc[:] = np.nan + input_series.iloc[9] = -11 + input_series.iloc[10:12] = -10 + input_series.iloc[15] = -9 # There are >=4 repeats if the nan values are forward filled. [10:15] == -10 # The output mask shouldn't filter nan values. expected_output = input_series.map(lambda _: False) @@ -175,19 +175,19 @@ def test_get_duration_consecutive_true(self): np.isnan(duration_consecutive_true[0]), "The first index should be ignored" ) np.testing.assert_almost_equal( - duration_consecutive_true[1], + duration_consecutive_true.iloc[1], delta_time_hours[1], ) np.testing.assert_almost_equal( - duration_consecutive_true[6], + duration_consecutive_true.iloc[6], delta_time_hours[6], ) np.testing.assert_almost_equal( - duration_consecutive_true[10:14], + duration_consecutive_true.iloc[10:14], delta_time_hours[10:14].cumsum(), ) np.testing.assert_almost_equal( - duration_consecutive_true[-3:], + duration_consecutive_true.iloc[-3:], delta_time_hours[-3:].cumsum(), )