feature/Automatically update AWS_stations_metadata.csv and AWS_sites_metadata.csv from L3 files #277

Merged
1 change: 1 addition & 0 deletions setup.py
@@ -45,6 +45,7 @@
'join_l3 = pypromice.process.join_l3:main',
'get_l2 = pypromice.process.get_l2:main',
'get_l2tol3 = pypromice.process.get_l2tol3:main',
'make_metadata_csv = pypromice.postprocess.make_metadata_csv:main',
'get_watsontx = pypromice.tx.get_watsontx:get_watsontx',
'get_bufr = pypromice.postprocess.get_bufr:main',
'get_msg = pypromice.tx.get_msg:get_msg'
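The console_scripts entry added above exposes a make_metadata_csv command that calls the main() function of the new module below. A minimal sketch of the equivalent programmatic call, assuming the package is installed and using a hypothetical /data/promice root that contains the aws-l3 folder:

import sys
from pypromice.postprocess.make_metadata_csv import main

# Equivalent to running: make_metadata_csv -t station --root_dir /data/promice
sys.argv = ['make_metadata_csv', '-t', 'station', '--root_dir', '/data/promice']
main()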
180 changes: 180 additions & 0 deletions src/pypromice/postprocess/make_metadata_csv.py
@@ -0,0 +1,180 @@
#!/usr/bin/env python
import os
import argparse
import pandas as pd
import xarray as xr
import logging
logger = logging.getLogger(__name__)
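# Note: this module only creates a module-level logger; the logger.info
# messages below are emitted only if the calling process configures logging
# handlers, e.g. via logging.basicConfig(level=logging.INFO).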

def process_files(base_dir, data_type):
    # Determine the CSV file path based on the data type
    if data_type == 'station':
        logger.info("Updating AWS_stations_metadata.csv")
        csv_file_path = os.path.join(base_dir, '../AWS_stations_metadata.csv')
        label_s_id = 'station_id'
    elif data_type == 'site':
        logger.info("Updating AWS_sites_metadata.csv")
        csv_file_path = os.path.join(base_dir, '../AWS_sites_metadata.csv')
        label_s_id = 'site_id'

    # Initialize a list to hold the rows (Series) of DataFrame
    rows = []

    # Read existing metadata if the CSV file exists
    if os.path.exists(csv_file_path):
        existing_metadata_df = pd.read_csv(csv_file_path, index_col=label_s_id)
    else:
        existing_metadata_df = pd.DataFrame()

    # Drop the 'timestamp_last_known_coordinates' column if it exists
    if 'timestamp_last_known_coordinates' in existing_metadata_df.columns:
        existing_metadata_df.drop(columns=['timestamp_last_known_coordinates'], inplace=True)

    # Track updated sites or stations to avoid duplicate updates
    updated_s = []
    new_s = []

    # Traverse through all the subfolders and files in the base directory
    for subdir, _, files in os.walk(base_dir):
        for file in files:
            if file.endswith('_hour.nc'):
                file_path = os.path.join(subdir, file)
                try:
                    with xr.open_dataset(file_path) as nc_file:
                        # Extract attributes
                        s_id = nc_file.attrs.get(label_s_id, 'N/A')

                        number_of_booms = nc_file.attrs.get('number_of_booms', 'N/A')
                        if number_of_booms == '1':
                            station_type = 'one boom'
                        elif number_of_booms == '2':
                            station_type = 'two booms'
                        else:
                            station_type = 'N/A'

                        # Keep the existing location_type if it exists
                        if s_id in existing_metadata_df.index:
                            location_type = existing_metadata_df.loc[s_id, 'location_type']
                        else:
                            location_type = nc_file.attrs.get('location_type', 'N/A')

                        # Extract the time variable as datetime64
                        time_var = nc_file['time'].values.astype('datetime64[s]')

                        # Extract the first and last timestamps
                        date_installation_str = pd.Timestamp(time_var[0]).strftime('%Y-%m-%d')
                        last_valid_date_str = pd.Timestamp(time_var[-1]).strftime('%Y-%m-%d')

                        # Extract the first and last values of lat, lon, and alt
                        lat_installation = nc_file['lat'].isel(time=0).values.item()
                        lon_installation = nc_file['lon'].isel(time=0).values.item()
                        alt_installation = nc_file['alt'].isel(time=0).values.item()

                        lat_last_known = nc_file['lat'].isel(time=-1).values.item()
                        lon_last_known = nc_file['lon'].isel(time=-1).values.item()
                        alt_last_known = nc_file['alt'].isel(time=-1).values.item()

                        # Create a pandas Series for the metadata
                        row = pd.Series({
                            'station_type': station_type,
                            'location_type': location_type,
                            'date_installation': date_installation_str,
                            'last_valid_date': last_valid_date_str,
                            'lat_installation': lat_installation,
                            'lon_installation': lon_installation,
                            'alt_installation': alt_installation,
                            'lat_last_known': lat_last_known,
                            'lon_last_known': lon_last_known,
                            'alt_last_known': alt_last_known
                        }, name=s_id)

                        # Check if this s_id is already in the existing metadata
                        if s_id in existing_metadata_df.index:
                            # Compare with existing metadata
                            existing_row = existing_metadata_df.loc[s_id]
                            old_date_installation = existing_row['date_installation']
                            old_last_valid_date = existing_row['last_valid_date']

                            # Update the existing metadata
                            existing_metadata_df.loc[s_id] = row

                            # Print message if dates are updated
                            if old_date_installation != date_installation_str or old_last_valid_date != last_valid_date_str:
                                logger.info(f"Updated {label_s_id}: {s_id}")
                                logger.info(f" Old date_installation: {old_date_installation} --> New date_installation: {date_installation_str}")
                                logger.info(f" Old last_valid_date: {old_last_valid_date} --> New last_valid_date: {last_valid_date_str}")

                            updated_s.append(s_id)
                        else:
                            new_s.append(s_id)
                            # Append new metadata row to the list
                            rows.append(row)

                except Exception as e:
                    logger.info(f"Warning: Error processing {file_path}: {str(e)}")
                    continue  # Continue to next file if there's an error

    # Convert the list of rows to a DataFrame
    new_metadata_df = pd.DataFrame(rows)

    # Concatenate the existing metadata with the new metadata
    combined_metadata_df = pd.concat([existing_metadata_df, new_metadata_df], ignore_index=False)

    # excluding some sites
    sites_to_exclude = [s for s in ['XXX', 'Roof_GEUS', 'Roof_PROMICE'] if s in combined_metadata_df.index]
    excluded_metadata_df = combined_metadata_df.loc[sites_to_exclude].copy()
    combined_metadata_df.drop(sites_to_exclude, inplace=True)

    # Sort the DataFrame by index (s_id)
    combined_metadata_df.sort_index(inplace=True)

    # Print excluded lines
    if not excluded_metadata_df.empty:
        pd.set_option('display.max_columns', None)  # Show all columns
        pd.set_option('display.max_colwidth', None)  # Show full width of columns
        pd.set_option('display.width', None)  # Disable line wrapping

        logger.info("\nExcluded lines from combined metadata.csv:")
        print(excluded_metadata_df)

    # Drop excluded lines from combined_metadata_df
    combined_metadata_df.drop(sites_to_exclude, errors='ignore', inplace=True)

    if label_s_id == 'site_id':
        combined_metadata_df.drop(columns=['station_type'], inplace=True)

    # saving to csv
    combined_metadata_df.to_csv(csv_file_path, index_label=label_s_id)

    # Determine which lines were not updated (reused) and which were added
    if not existing_metadata_df.empty:
        reused_s = [s_id for s_id in existing_metadata_df.index if ((s_id not in new_s) & (s_id not in updated_s))]
        reused_lines = existing_metadata_df.loc[reused_s]
        added_lines = combined_metadata_df.loc[combined_metadata_df.index.difference(existing_metadata_df.index)]

        logger.info("\nLines from the old metadata.csv that are reused (not updated):")
        print(reused_lines)

        if not added_lines.empty:
            logger.info("\nLines that were not present in the old metadata.csv and are added:")
            print(added_lines)
    else:
        logger.info("\nAll lines are added (no old metadata.csv found)")

def main():
    parser = argparse.ArgumentParser(description='Process station or site data.')
    parser.add_argument('-t', '--type', choices=['station', 'site'], required=True, help='Type of data to process: "station" or "site"')
    parser.add_argument('--root_dir', required=True, help='Root directory containing the aws-l3 folder')
    args = parser.parse_args()

    if args.type == 'station':
        base_dir = os.path.join(args.root_dir, 'aws-l3/stations/')
    elif args.type == 'site':
        base_dir = os.path.join(args.root_dir, 'aws-l3/sites/')

    process_files(base_dir, args.type)

if __name__ == '__main__':
    main()
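For reference, a minimal sketch of calling process_files directly and inspecting the CSV it writes; the /data/promice root is hypothetical, and the column names follow the pandas Series assembled above:

import os
import pandas as pd
from pypromice.postprocess.make_metadata_csv import process_files

root_dir = '/data/promice'  # hypothetical root containing the aws-l3 folder
process_files(os.path.join(root_dir, 'aws-l3/stations/'), 'station')

# The CSV is written one level above the stations folder, indexed by station_id
df = pd.read_csv(os.path.join(root_dir, 'aws-l3/AWS_stations_metadata.csv'), index_col='station_id')
print(df[['date_installation', 'last_valid_date', 'lat_last_known', 'lon_last_known', 'alt_last_known']].head())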