Feature/surface heights and thermistor depths (#278)
* processes surface height variables `z_surf_combined`, `z_ice_surf`, `snow_height`, and thermistor depths `d_t_i_1-11`
* `variable.csv` was updated accordingly
* some clean-up of the turbulent flux calculation, including renaming of functions
* handling of empty station configuration files and more understandable error messages
* updated `join_l3` so that surface height and thermistor depths in historical data are no longer ignored, and so that the surface height is adjusted between the merged datasets

* static values called `latitude`, `longitude` and `altitude`, calculated either from `gps_lat, gps_lon, gps_alt` or from `lat, lon, alt`, are saved as attributes, along with `latitude_origin`, `longitude_origin` and `altitude_origin`, which state whether they come from the gappy observations `gps_lat, gps_lon, gps_alt` or from the gap-filled post-processed `lat, lon, alt` (an illustrative sketch follows this list)
* changed "H" to "h" in pandas frequency strings and added `.iloc` where necessary to remove deprecation warnings

* added `make_metadata_csv.py`, which updates the latest station/site locations in `aws-l3/AWS_station_metadata.csv` and `aws-l3/AWS_sites_metadata.csv`
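
For illustration only, a minimal sketch (not the actual pypromice code) of how static coordinate attributes with an `_origin` flag could be attached to an xarray dataset; the use of a plain mean and the helper name `add_static_coordinates` are assumptions.

```python
import xarray as xr

def add_static_coordinates(ds: xr.Dataset) -> xr.Dataset:
    """Attach static latitude/longitude/altitude attributes and their origin."""
    sources = {
        "latitude": ("gps_lat", "lat"),
        "longitude": ("gps_lon", "lon"),
        "altitude": ("gps_alt", "alt"),
    }
    for attr, (gps_var, filled_var) in sources.items():
        # Prefer the raw (gappy) GPS observations when they hold any valid
        # samples; otherwise fall back to the gap-filled post-processed series.
        if gps_var in ds and bool(ds[gps_var].notnull().any()):
            ds.attrs[attr] = float(ds[gps_var].mean(skipna=True))
            ds.attrs[attr + "_origin"] = gps_var
        else:
            ds.attrs[attr] = float(ds[filled_var].mean(skipna=True))
            ds.attrs[attr + "_origin"] = filled_var
    return ds
```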

---------

Co-authored-by: Penny How <[email protected]>

* L2toL3 test added (#282)

* Python 3.8 and 3.9 tests removed; tests now run only for 3.10 and 3.11

* echo syntax changed

* updated input file paths
---------
BaptisteVandecrux committed Aug 20, 2024
1 parent 5064f7e commit ec47142
Showing 16 changed files with 1,415 additions and 273 deletions.
16 changes: 4 additions & 12 deletions .github/workflows/process_test.yml
@@ -41,23 +41,15 @@ jobs:
           for i in $(echo ${{ env.TEST_STATION }} | tr ' ' '\n'); do
             python3 $GITHUB_WORKSPACE/main/src/pypromice/process/get_l2.py -c $GITHUB_WORKSPACE/aws-l0/tx/config/$i.toml -i $GITHUB_WORKSPACE/aws-l0/tx -o $GITHUB_WORKSPACE/out/L0toL2/
           done
-      # - name: Run L0 to L2 processing
-      #   env:
-      #     TEST_STATION: KAN_U HUM
-      #   shell: bash
-      #   run: |
-      #     mkdir $GITHUB_WORKSPACE/out/L2toL3/
-      #     for i in $(echo ${{ env.TEST_STATION }} | tr ' ' '\n'); do
-      #       python3 $GITHUB_WORKSPACE/main/src/pypromice/process/get_l2tol3.py -i $GITHUB_WORKSPACE/out/L0toL2/$i/$i_hour.nc -o $GITHUB_WORKSPACE/out/L2toL3 -t 60min
-      #     done
-      - name: Run L0 to L3 processing
+      - name: Run L2 to L3 processing
         env:
           TEST_STATION: KAN_U HUM
         shell: bash
         run: |
-          mkdir $GITHUB_WORKSPACE/out/L0toL3/
+          mkdir $GITHUB_WORKSPACE/out/L2toL3/
           for i in $(echo ${{ env.TEST_STATION }} | tr ' ' '\n'); do
-            python3 $GITHUB_WORKSPACE/main/src/pypromice/process/get_l2.py -c $GITHUB_WORKSPACE/aws-l0/tx/config/$i.toml -i $GITHUB_WORKSPACE/aws-l0/tx -o $GITHUB_WORKSPACE/out/L2/
+            echo ${i}_hour.nc
+            python3 $GITHUB_WORKSPACE/main/src/pypromice/process/get_l2tol3.py -c $GITHUB_WORKSPACE/aws-l0/metadata/station_configurations/ -i $GITHUB_WORKSPACE/out/L0toL2/${i}/${i}_hour.nc -o $GITHUB_WORKSPACE/out/L2toL3/
           done
       - name: Upload test output
         uses: actions/upload-artifact@v3
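Outside CI, the new L2-to-L3 step can be reproduced with the same arguments shown in the workflow above; a hedged sketch, where the relative paths and working directory are assumptions rather than the repository's actual layout:

```python
import subprocess

station = "KAN_U"  # one of the TEST_STATION entries used in the workflow
subprocess.run(
    [
        "python3", "src/pypromice/process/get_l2tol3.py",
        # new: directory of station configuration files passed via -c
        "-c", "aws-l0/metadata/station_configurations/",
        "-i", f"out/L0toL2/{station}/{station}_hour.nc",
        "-o", "out/L2toL3/",
    ],
    check=True,  # fail loudly if the processing step errors out
)
```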
1 change: 1 addition & 0 deletions setup.py
@@ -45,6 +45,7 @@
         'join_l3 = pypromice.process.join_l3:main',
         'get_l2 = pypromice.process.get_l2:main',
         'get_l2tol3 = pypromice.process.get_l2tol3:main',
+        'make_metadata_csv = pypromice.postprocess.make_metadata_csv:main',
         'get_watsontx = pypromice.tx.get_watsontx:get_watsontx',
         'get_bufr = pypromice.postprocess.get_bufr:main',
         'create_bufr_files = pypromice.postprocess.create_bufr_files:main',
2 changes: 1 addition & 1 deletion src/pypromice/postprocess/create_bufr_files.py
@@ -47,7 +47,7 @@ def create_bufr_files(
         Suffix for the compiled output file
     """
-    periods = pd.date_range(period_start, period_end, freq="H")
+    periods = pd.date_range(period_start, period_end, freq="h")
     output_individual_root = output_root / "individual"
     output_compiled_root = output_root / "compiled"
     output_individual_root.mkdir(parents=True, exist_ok=True)
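For context, a standalone sketch of the pandas frequency-alias change behind this edit and the `real_time_utilities.py` edit further down; recent pandas releases deprecate the uppercase "H" alias in favour of lowercase "h". The data, column name, and the plain `.rolling()` call (standing in for the `rolling_window()` helper) are illustrative assumptions.

```python
import pandas as pd

# Hourly range: lowercase "h" avoids the FutureWarning raised for "H".
periods = pd.date_range("2024-08-01", "2024-08-02", freq="h")

# The lowercase alias also applies to time-based rolling windows, e.g. a
# 72-hour window comparable to the smoothing done in real_time_utilities.py.
df = pd.DataFrame({"z_boom_u": range(len(periods))}, index=periods)
smoothed = df["z_boom_u"].rolling("72h", min_periods=2).median()
print(smoothed.tail())
```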
214 changes: 214 additions & 0 deletions src/pypromice/postprocess/make_metadata_csv.py
@@ -0,0 +1,214 @@
#!/usr/bin/env python
import os, sys, argparse
import pandas as pd
import xarray as xr
import logging

logging.basicConfig(
format="%(asctime)s; %(levelname)s; %(name)s; %(message)s",
level=logging.INFO,
stream=sys.stdout,
)
logger = logging.getLogger(__name__)

def extract_metadata_from_nc(file_path: str, data_type: str, label_s_id: str) -> pd.Series:
"""
Extract metadata from a NetCDF file and return it as a pandas Series.
Parameters:
- file_path (str): The path to the NetCDF file.
- data_type (str): The type of data ('station' or 'site').
- label_s_id (str): The label for the station or site ID.
Returns:
- pd.Series: A pandas Series containing the extracted metadata.
"""
try:
with xr.open_dataset(file_path) as nc_file:
# Extract attributes
s_id = nc_file.attrs.get(label_s_id, 'N/A')
location_type = nc_file.attrs.get('location_type', 'N/A')
project = nc_file.attrs.get('project', 'N/A')
if data_type == 'site':
stations = nc_file.attrs.get('stations', s_id)
if data_type == 'station':
number_of_booms = nc_file.attrs.get('number_of_booms', 'N/A')

# Extract the time variable as datetime64
time_var = nc_file['time'].values.astype('datetime64[s]')

# Extract the first and last timestamps
date_installation_str = pd.Timestamp(time_var[0]).strftime('%Y-%m-%d')
last_valid_date_str = pd.Timestamp(time_var[-1]).strftime('%Y-%m-%d')

# Extract the first and last values of lat, lon, and alt
lat_installation = nc_file['lat'].isel(time=0).values.item()
lon_installation = nc_file['lon'].isel(time=0).values.item()
alt_installation = nc_file['alt'].isel(time=0).values.item()

lat_last_known = nc_file['lat'].isel(time=-1).values.item()
lon_last_known = nc_file['lon'].isel(time=-1).values.item()
alt_last_known = nc_file['alt'].isel(time=-1).values.item()

# Create a pandas Series for the metadata
if data_type == 'site':
row = pd.Series({
'project': project.replace('\r',''),
'location_type': location_type,
'stations': stations,
'date_installation': date_installation_str,
'latitude_installation': lat_installation,
'longitude_installation': lon_installation,
'altitude_installation': alt_installation,
'date_last_valid': last_valid_date_str,
'latitude_last_valid': lat_last_known,
'longitude_last_valid': lon_last_known,
'altitude_last_valid': alt_last_known
}, name=s_id)
else:
row = pd.Series({
'project': project.replace('\r',''),
'number_of_booms': number_of_booms,
'location_type': location_type,
'date_installation': date_installation_str,
'latitude_installation': lat_installation,
'longitude_installation': lon_installation,
'altitude_installation': alt_installation,
'date_last_valid': last_valid_date_str,
'latitude_last_valid': lat_last_known,
'longitude_last_valid': lon_last_known,
'altitude_last_valid': alt_last_known
}, name=s_id)
return row
except Exception as e:
logger.info(f"Warning: Error processing {file_path}: {str(e)}")
return pd.Series() # Return an empty Series in case of an error

def process_files(base_dir: str, csv_file_path: str, data_type: str) -> pd.DataFrame:
"""
Process all files in the base directory to generate new metadata.
Parameters:
- base_dir (str): The base directory containing the NetCDF files.
- csv_file_path (str): The path to the existing metadata CSV file.
- data_type (str): The type of data ('station' or 'site').
    Returns:
    - tuple: (combined_metadata_df, existing_metadata_df, new_s, updated_s) --
      the combined metadata DataFrame, the previously existing metadata,
      and the lists of new and updated station/site IDs.
"""
label_s_id = 'station_id' if data_type == 'station' else 'site_id'

# Initialize a list to hold the rows (Series) of DataFrame
rows = []

# Read existing metadata if the CSV file exists
if os.path.exists(csv_file_path) and os.path.getsize(csv_file_path) > 0:
logger.info("Updating " + str(csv_file_path))
existing_metadata_df = pd.read_csv(csv_file_path, index_col=label_s_id)
else:
logger.info("Creating " + str(csv_file_path))
existing_metadata_df = pd.DataFrame()

# Track updated sites or stations to avoid duplicate updates
updated_s = []
new_s = []

# Traverse through all the subfolders and files in the base directory
for subdir, _, files in os.walk(base_dir):
for file in files:
if file.endswith('_hour.nc'):
file_path = os.path.join(subdir, file)
row = extract_metadata_from_nc(file_path, data_type, label_s_id)
if not row.empty:
s_id = row.name
if s_id in existing_metadata_df.index:
# Compare with existing metadata
existing_row = existing_metadata_df.loc[s_id]
old_date_installation = existing_row['date_installation']
old_last_valid_date = existing_row['date_last_valid']

# Update the existing metadata
existing_metadata_df.loc[s_id] = row

# Print message if dates are updated
if old_last_valid_date != row['date_last_valid']:
logger.info(f"Updated {label_s_id}: {s_id} date_last_valid: {old_last_valid_date} --> {row['date_last_valid']}")

updated_s.append(s_id)
else:
new_s.append(s_id)
# Append new metadata row to the list
rows.append(row)

# Convert the list of rows to a DataFrame
new_metadata_df = pd.DataFrame(rows)

# Concatenate the existing metadata with the new metadata
combined_metadata_df = pd.concat([existing_metadata_df, new_metadata_df], ignore_index=False)

# Exclude some sites
sites_to_exclude = [s for s in ['XXX', 'Roof_GEUS', 'Roof_PROMICE'] if s in combined_metadata_df.index]
excluded_metadata_df = combined_metadata_df.loc[sites_to_exclude].copy()
combined_metadata_df.drop(sites_to_exclude, inplace=True)

# Sort the DataFrame by index (s_id)
combined_metadata_df.sort_index(inplace=True)

# Print excluded lines
if not excluded_metadata_df.empty:
pd.set_option('display.max_columns', None) # Show all columns
pd.set_option('display.max_colwidth', None) # Show full width of columns
pd.set_option('display.width', None) # Disable line wrapping
logger.info("\nExcluded lines from combined metadata.csv:")
print(excluded_metadata_df)

# Drop excluded lines from combined_metadata_df
combined_metadata_df.drop(sites_to_exclude, errors='ignore', inplace=True)

# Save to csv
combined_metadata_df.to_csv(csv_file_path, index_label=label_s_id)

return combined_metadata_df, existing_metadata_df, new_s, updated_s

def compare_and_log_updates(combined_metadata_df: pd.DataFrame, existing_metadata_df: pd.DataFrame, new_s: list, updated_s: list):
"""
Compare the combined metadata with the existing metadata and log the updates.
Parameters:
- combined_metadata_df (pd.DataFrame): The combined metadata DataFrame.
- existing_metadata_df (pd.DataFrame): The existing metadata DataFrame.
- new_s (list): List of new station/site IDs.
- updated_s (list): List of updated station/site IDs.
"""
# Determine which lines were not updated (reused) and which were added
if not existing_metadata_df.empty:
reused_s = [s_id for s_id in existing_metadata_df.index if ((s_id not in new_s) & (s_id not in updated_s))]
reused_lines = existing_metadata_df.loc[reused_s]
added_lines = combined_metadata_df.loc[combined_metadata_df.index.difference(existing_metadata_df.index)]

logger.info("\nLines from the old metadata.csv that are reused (not updated):")
print(reused_lines)

if not added_lines.empty:
logger.info("\nLines that were not present in the old metadata.csv and are added:")
print(added_lines)
else:
logger.info("\nAll lines are added (no old metadata.csv found)")

def main():
parser = argparse.ArgumentParser(description='Process station or site data.')
parser.add_argument('-t', '--type', choices=['station', 'site'],
required=True,
help='Type of data to process: "station" or "site"')
parser.add_argument('-r', '--root_dir', required=True, help='Root directory ' +
'containing the aws-l3 station or site folder')
    parser.add_argument('-m', '--metadata_file', required=True,
                        help='File path to metadata csv file (existing or ' +
                             'intended output path)')

args = parser.parse_args()
combined_metadata_df, existing_metadata_df, new_s, updated_s = process_files(args.root_dir, args.metadata_file, args.type)
compare_and_log_updates(combined_metadata_df, existing_metadata_df, new_s, updated_s)

if __name__ == '__main__':
main()
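For reference, a hedged sketch of driving the new module directly from Python instead of the `make_metadata_csv` console script registered in setup.py; the paths below are hypothetical placeholders, not the operational aws-l3 layout.

```python
from pypromice.postprocess.make_metadata_csv import (
    compare_and_log_updates,
    process_files,
)

# Hypothetical paths; point these at a local aws-l3 checkout.
combined, existing, new_ids, updated_ids = process_files(
    base_dir="aws-l3/stations",
    csv_file_path="aws-l3/AWS_station_metadata.csv",
    data_type="station",
)
compare_and_log_updates(combined, existing, new_ids, updated_ids)
```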
4 changes: 2 additions & 2 deletions src/pypromice/postprocess/real_time_utilities.py
@@ -73,8 +73,8 @@ def get_latest_data(

     # Apply smoothing to z_boom_u
     # require at least 2 hourly obs? Sometimes seeing once/day data for z_boom_u
-    df_limited = rolling_window(df_limited, "z_boom_u", "72H", 2, 3)
 
+    df_limited = rolling_window(df_limited, "z_boom_u", "72h", 2, 3)
     # limit to single most recent valid row (convert to series)
     s_current = df_limited.loc[last_valid_index]
 