From 403fe012d61d88de26d88870abaf367930b722f2 Mon Sep 17 00:00:00 2001 From: Baptiste Vandecrux <35140661+BaptisteVandecrux@users.noreply.github.com> Date: Fri, 5 Jul 2024 15:34:24 +0200 Subject: [PATCH 01/10] added make_metadata_csv.py, made it a CLI --- setup.py | 1 + .../postprocess/make_metadata_csv.py | 175 ++++++++++++++++++ 2 files changed, 176 insertions(+) create mode 100644 src/pypromice/postprocess/make_metadata_csv.py diff --git a/setup.py b/setup.py index 52a9b216..92146024 100644 --- a/setup.py +++ b/setup.py @@ -45,6 +45,7 @@ 'join_l3 = pypromice.process.join_l3:main', 'get_l2 = pypromice.process.get_l2:main', 'get_l2tol3 = pypromice.process.get_l2tol3:main', + 'make_metadata_csv = pypromice.postprocess.make_metadata_csv:main', 'get_watsontx = pypromice.tx.get_watsontx:get_watsontx', 'get_bufr = pypromice.postprocess.get_bufr:main', 'get_msg = pypromice.tx.get_msg:get_msg' diff --git a/src/pypromice/postprocess/make_metadata_csv.py b/src/pypromice/postprocess/make_metadata_csv.py new file mode 100644 index 00000000..2c2ad56b --- /dev/null +++ b/src/pypromice/postprocess/make_metadata_csv.py @@ -0,0 +1,175 @@ +import os +import argparse +import pandas as pd +import xarray as xr + +def process_files(base_dir, data_type): + # Determine the CSV file path based on the data type + if data_type == 'station': + csv_file_path = os.path.join(base_dir, '../AWS_stations_metadata.csv') + label_s_id = 'station_id' + elif data_type == 'site': + csv_file_path = os.path.join(base_dir, '../AWS_sites_metadata.csv') + label_s_id = 'site_id' + + # Initialize a list to hold the rows (Series) of DataFrame + rows = [] + + # Read existing metadata if the CSV file exists + if os.path.exists(csv_file_path): + existing_metadata_df = pd.read_csv(csv_file_path, index_col=label_s_id) + else: + existing_metadata_df = pd.DataFrame() + + # Drop the 'timestamp_last_known_coordinates' column if it exists + if 'timestamp_last_known_coordinates' in existing_metadata_df.columns: + existing_metadata_df.drop(columns=['timestamp_last_known_coordinates'], inplace=True) + + # Track updated sites or stations to avoid duplicate updates + updated_s = [] + new_s = [] + + # Traverse through all the subfolders and files in the base directory + for subdir, _, files in os.walk(base_dir): + for file in files: + if file.endswith('_hour.nc'): + file_path = os.path.join(subdir, file) + try: + with xr.open_dataset(file_path) as nc_file: + # Extract attributes + s_id = nc_file.attrs.get(label_s_id, 'N/A') + + number_of_booms = nc_file.attrs.get('number_of_booms', 'N/A') + if number_of_booms == '1': + station_type = 'one boom' + elif number_of_booms == '2': + station_type = 'two booms' + else: + station_type = 'N/A' + + # Keep the existing location_type if it exists + if s_id in existing_metadata_df.index: + location_type = existing_metadata_df.loc[s_id, 'location_type'] + else: + location_type = nc_file.attrs.get('location_type', 'N/A') + + # Extract the time variable as datetime64 + time_var = nc_file['time'].values.astype('datetime64[s]') + + # Extract the first and last timestamps + date_installation_str = pd.Timestamp(time_var[0]).strftime('%Y-%m-%d') + last_valid_date_str = pd.Timestamp(time_var[-1]).strftime('%Y-%m-%d') + + # Extract the first and last values of lat, lon, and alt + lat_installation = nc_file['lat'].isel(time=0).values.item() + lon_installation = nc_file['lon'].isel(time=0).values.item() + alt_installation = nc_file['alt'].isel(time=0).values.item() + + lat_last_known = 
nc_file['lat'].isel(time=-1).values.item() + lon_last_known = nc_file['lon'].isel(time=-1).values.item() + alt_last_known = nc_file['alt'].isel(time=-1).values.item() + + # Create a pandas Series for the metadata + row = pd.Series({ + 'station_type': station_type, + 'location_type': location_type, + 'date_installation': date_installation_str, + 'last_valid_date': last_valid_date_str, + 'lat_installation': lat_installation, + 'lon_installation': lon_installation, + 'alt_installation': alt_installation, + 'lat_last_known': lat_last_known, + 'lon_last_known': lon_last_known, + 'alt_last_known': alt_last_known + }, name=s_id) + + # Check if this s_id is already in the existing metadata + if s_id in existing_metadata_df.index: + # Compare with existing metadata + existing_row = existing_metadata_df.loc[s_id] + old_date_installation = existing_row['date_installation'] + old_last_valid_date = existing_row['last_valid_date'] + + # Update the existing metadata + existing_metadata_df.loc[s_id] = row + + # Print message if dates are updated + if old_date_installation != date_installation_str or old_last_valid_date != last_valid_date_str: + print(f"Updated {label_s_id}: {s_id}") + print(f" Old date_installation: {old_date_installation} --> New date_installation: {date_installation_str}") + print(f" Old last_valid_date: {old_last_valid_date} --> New last_valid_date: {last_valid_date_str}") + + updated_s.append(s_id) + else: + new_s.append(s_id) + # Append new metadata row to the list + rows.append(row) + + except Exception as e: + print(f"Warning: Error processing {file_path}: {str(e)}") + continue # Continue to next file if there's an error + + # Convert the list of rows to a DataFrame + new_metadata_df = pd.DataFrame(rows) + + # Convert the list of excluded rows to a DataFrame + + # Concatenate the existing metadata with the new metadata and excluded metadata + combined_metadata_df = pd.concat([existing_metadata_df, new_metadata_df], ignore_index=False) + + # excluding some sites + sites_to_exclude = [s for s in ['XXX', 'Roof_GEUS', 'Roof_PROMICE'] if s in combined_metadata_df.index] + excluded_metadata_df = combined_metadata_df.loc[sites_to_exclude].copy() + combined_metadata_df.drop(sites_to_exclude, inplace=True) + + # Sort the DataFrame by index (s_id) + combined_metadata_df.sort_index(inplace=True) + + # Print excluded lines + if not excluded_metadata_df.empty: + pd.set_option('display.max_columns', None) # Show all columns + pd.set_option('display.max_colwidth', None) # Show full width of columns + pd.set_option('display.width', None) # Disable line wrapping + + print("\nExcluded lines from combined metadata.csv:") + print(excluded_metadata_df) + + # Drop excluded lines from combined_metadata_df + combined_metadata_df.drop(sites_to_exclude, errors='ignore', inplace=True) + + if label_s_id == 'site_id': + combined_metadata_df.drop(columns=['station_type'], inplace=True) + + # saving to csv + combined_metadata_df.to_csv(csv_file_path, index_label=label_s_id) + + # Determine which lines were not updated (reused) and which were added + if not existing_metadata_df.empty: + reused_s = [s_id for s_id in existing_metadata_df.index if ((s_id not in new_s) & (s_id not in updated_s))] + reused_lines = existing_metadata_df.loc[reused_s] + added_lines = combined_metadata_df.loc[combined_metadata_df.index.difference(existing_metadata_df.index)] + + print("\nLines from the old metadata.csv that are reused (not updated):") + print(reused_lines) + + if not added_lines.empty: + print("\nLines that were not 
present in the old metadata.csv and are added:") + print(added_lines) + else: + print("\nAll lines are added (no old metadata.csv found)") + +def main(): + parser = argparse.ArgumentParser(description='Process station or site data.') + parser.add_argument('-t', '--type', choices=['station', 'site'], required=True, help='Type of data to process: "station" or "site"') + parser.add_argument('--root_dir', required=True, help='Root directory containing the aws-l3 folder') + args = parser.parse_args() + + if args.type == 'station': + base_dir = os.path.join(args.root_dir, 'aws-l3/stations/') + elif args.type == 'site': + base_dir = os.path.join(args.root_dir, 'aws-l3/sites/') + + process_files(base_dir, args.type) + +if __name__ == '__main__': + main() From 84e66908f4f4b6fa95b8ff5cb45166fdd440eff5 Mon Sep 17 00:00:00 2001 From: Baptiste Vandecrux <35140661+BaptisteVandecrux@users.noreply.github.com> Date: Fri, 5 Jul 2024 15:39:59 +0200 Subject: [PATCH 02/10] switch print to logger --- .../postprocess/make_metadata_csv.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/pypromice/postprocess/make_metadata_csv.py b/src/pypromice/postprocess/make_metadata_csv.py index 2c2ad56b..e0e41379 100644 --- a/src/pypromice/postprocess/make_metadata_csv.py +++ b/src/pypromice/postprocess/make_metadata_csv.py @@ -1,14 +1,19 @@ +#!/usr/bin/env python import os import argparse import pandas as pd import xarray as xr +import logging +logger = logging.getLogger(__name__) def process_files(base_dir, data_type): # Determine the CSV file path based on the data type if data_type == 'station': + logger.info("Updating AWS_stations_metadata.csv") csv_file_path = os.path.join(base_dir, '../AWS_stations_metadata.csv') label_s_id = 'station_id' elif data_type == 'site': + logger.info("Updating AWS_sites_metadata.csv") csv_file_path = os.path.join(base_dir, '../AWS_sites_metadata.csv') label_s_id = 'site_id' @@ -95,9 +100,9 @@ def process_files(base_dir, data_type): # Print message if dates are updated if old_date_installation != date_installation_str or old_last_valid_date != last_valid_date_str: - print(f"Updated {label_s_id}: {s_id}") - print(f" Old date_installation: {old_date_installation} --> New date_installation: {date_installation_str}") - print(f" Old last_valid_date: {old_last_valid_date} --> New last_valid_date: {last_valid_date_str}") + logger.info(f"Updated {label_s_id}: {s_id}") + logger.info(f" Old date_installation: {old_date_installation} --> New date_installation: {date_installation_str}") + logger.info(f" Old last_valid_date: {old_last_valid_date} --> New last_valid_date: {last_valid_date_str}") updated_s.append(s_id) else: @@ -106,7 +111,7 @@ def process_files(base_dir, data_type): rows.append(row) except Exception as e: - print(f"Warning: Error processing {file_path}: {str(e)}") + logger.info(f"Warning: Error processing {file_path}: {str(e)}") continue # Continue to next file if there's an error # Convert the list of rows to a DataFrame @@ -131,7 +136,7 @@ def process_files(base_dir, data_type): pd.set_option('display.max_colwidth', None) # Show full width of columns pd.set_option('display.width', None) # Disable line wrapping - print("\nExcluded lines from combined metadata.csv:") + logger.info("\nExcluded lines from combined metadata.csv:") print(excluded_metadata_df) # Drop excluded lines from combined_metadata_df @@ -149,14 +154,14 @@ def process_files(base_dir, data_type): reused_lines = existing_metadata_df.loc[reused_s] added_lines = 
combined_metadata_df.loc[combined_metadata_df.index.difference(existing_metadata_df.index)] - print("\nLines from the old metadata.csv that are reused (not updated):") + logger.info("\nLines from the old metadata.csv that are reused (not updated):") print(reused_lines) if not added_lines.empty: - print("\nLines that were not present in the old metadata.csv and are added:") + logger.info("\nLines that were not present in the old metadata.csv and are added:") print(added_lines) else: - print("\nAll lines are added (no old metadata.csv found)") + logger.info("\nAll lines are added (no old metadata.csv found)") def main(): parser = argparse.ArgumentParser(description='Process station or site data.') From 437d7f44777ee8d91bf38b6eae74e38b029ebdad Mon Sep 17 00:00:00 2001 From: Penny How Date: Tue, 9 Jul 2024 16:22:05 -0100 Subject: [PATCH 03/10] File paths specified rather than inferred (#279) * paths specified --- .../postprocess/make_metadata_csv.py | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/pypromice/postprocess/make_metadata_csv.py b/src/pypromice/postprocess/make_metadata_csv.py index e0e41379..b19a1cf6 100644 --- a/src/pypromice/postprocess/make_metadata_csv.py +++ b/src/pypromice/postprocess/make_metadata_csv.py @@ -5,16 +5,13 @@ import xarray as xr import logging logger = logging.getLogger(__name__) - -def process_files(base_dir, data_type): + +def process_files(base_dir, csv_file_path, data_type): + # Determine the CSV file path based on the data type if data_type == 'station': - logger.info("Updating AWS_stations_metadata.csv") - csv_file_path = os.path.join(base_dir, '../AWS_stations_metadata.csv') label_s_id = 'station_id' elif data_type == 'site': - logger.info("Updating AWS_sites_metadata.csv") - csv_file_path = os.path.join(base_dir, '../AWS_sites_metadata.csv') label_s_id = 'site_id' # Initialize a list to hold the rows (Series) of DataFrame @@ -22,8 +19,10 @@ def process_files(base_dir, data_type): # Read existing metadata if the CSV file exists if os.path.exists(csv_file_path): + logger.info("Updating "+str(csv_file_path)) existing_metadata_df = pd.read_csv(csv_file_path, index_col=label_s_id) else: + logger.info("Creating "+str(csv_file_path)) existing_metadata_df = pd.DataFrame() # Drop the 'timestamp_last_known_coordinates' column if it exists @@ -135,7 +134,6 @@ def process_files(base_dir, data_type): pd.set_option('display.max_columns', None) # Show all columns pd.set_option('display.max_colwidth', None) # Show full width of columns pd.set_option('display.width', None) # Disable line wrapping - logger.info("\nExcluded lines from combined metadata.csv:") print(excluded_metadata_df) @@ -165,16 +163,17 @@ def process_files(base_dir, data_type): def main(): parser = argparse.ArgumentParser(description='Process station or site data.') - parser.add_argument('-t', '--type', choices=['station', 'site'], required=True, help='Type of data to process: "station" or "site"') - parser.add_argument('--root_dir', required=True, help='Root directory containing the aws-l3 folder') + parser.add_argument('-t', '--type', choices=['station', 'site'], + required=True, + help='Type of data to process: "station" or "site"') + parser.add_argument('-r', '--root_dir', required=True, help='Root directory ' + + 'containing the aws-l3 station or site folder') + parser.add_argument('-m','--metadata_file', required=True, + help='File path to metadata csv file (existing or '+ + 'intended output path') + args = parser.parse_args() - - if args.type == 
'station': - base_dir = os.path.join(args.root_dir, 'aws-l3/stations/') - elif args.type == 'site': - base_dir = os.path.join(args.root_dir, 'aws-l3/sites/') - - process_files(base_dir, args.type) + process_files(args.root_dir, args.metadata_file, args.type) if __name__ == '__main__': main() From 3546fa0c85675d46fe17e5ec28760de0353a9c56 Mon Sep 17 00:00:00 2001 From: Baptiste Vandecrux <35140661+BaptisteVandecrux@users.noreply.github.com> Date: Fri, 5 Jul 2024 15:34:24 +0200 Subject: [PATCH 04/10] added make_metadata_csv.py, made it a CLI --- setup.py | 1 + .../postprocess/make_metadata_csv.py | 175 ++++++++++++++++++ 2 files changed, 176 insertions(+) create mode 100644 src/pypromice/postprocess/make_metadata_csv.py diff --git a/setup.py b/setup.py index 52a9b216..92146024 100644 --- a/setup.py +++ b/setup.py @@ -45,6 +45,7 @@ 'join_l3 = pypromice.process.join_l3:main', 'get_l2 = pypromice.process.get_l2:main', 'get_l2tol3 = pypromice.process.get_l2tol3:main', + 'make_metadata_csv = pypromice.postprocess.make_metadata_csv:main', 'get_watsontx = pypromice.tx.get_watsontx:get_watsontx', 'get_bufr = pypromice.postprocess.get_bufr:main', 'get_msg = pypromice.tx.get_msg:get_msg' diff --git a/src/pypromice/postprocess/make_metadata_csv.py b/src/pypromice/postprocess/make_metadata_csv.py new file mode 100644 index 00000000..2c2ad56b --- /dev/null +++ b/src/pypromice/postprocess/make_metadata_csv.py @@ -0,0 +1,175 @@ +import os +import argparse +import pandas as pd +import xarray as xr + +def process_files(base_dir, data_type): + # Determine the CSV file path based on the data type + if data_type == 'station': + csv_file_path = os.path.join(base_dir, '../AWS_stations_metadata.csv') + label_s_id = 'station_id' + elif data_type == 'site': + csv_file_path = os.path.join(base_dir, '../AWS_sites_metadata.csv') + label_s_id = 'site_id' + + # Initialize a list to hold the rows (Series) of DataFrame + rows = [] + + # Read existing metadata if the CSV file exists + if os.path.exists(csv_file_path): + existing_metadata_df = pd.read_csv(csv_file_path, index_col=label_s_id) + else: + existing_metadata_df = pd.DataFrame() + + # Drop the 'timestamp_last_known_coordinates' column if it exists + if 'timestamp_last_known_coordinates' in existing_metadata_df.columns: + existing_metadata_df.drop(columns=['timestamp_last_known_coordinates'], inplace=True) + + # Track updated sites or stations to avoid duplicate updates + updated_s = [] + new_s = [] + + # Traverse through all the subfolders and files in the base directory + for subdir, _, files in os.walk(base_dir): + for file in files: + if file.endswith('_hour.nc'): + file_path = os.path.join(subdir, file) + try: + with xr.open_dataset(file_path) as nc_file: + # Extract attributes + s_id = nc_file.attrs.get(label_s_id, 'N/A') + + number_of_booms = nc_file.attrs.get('number_of_booms', 'N/A') + if number_of_booms == '1': + station_type = 'one boom' + elif number_of_booms == '2': + station_type = 'two booms' + else: + station_type = 'N/A' + + # Keep the existing location_type if it exists + if s_id in existing_metadata_df.index: + location_type = existing_metadata_df.loc[s_id, 'location_type'] + else: + location_type = nc_file.attrs.get('location_type', 'N/A') + + # Extract the time variable as datetime64 + time_var = nc_file['time'].values.astype('datetime64[s]') + + # Extract the first and last timestamps + date_installation_str = pd.Timestamp(time_var[0]).strftime('%Y-%m-%d') + last_valid_date_str = pd.Timestamp(time_var[-1]).strftime('%Y-%m-%d') + + # Extract the 
first and last values of lat, lon, and alt + lat_installation = nc_file['lat'].isel(time=0).values.item() + lon_installation = nc_file['lon'].isel(time=0).values.item() + alt_installation = nc_file['alt'].isel(time=0).values.item() + + lat_last_known = nc_file['lat'].isel(time=-1).values.item() + lon_last_known = nc_file['lon'].isel(time=-1).values.item() + alt_last_known = nc_file['alt'].isel(time=-1).values.item() + + # Create a pandas Series for the metadata + row = pd.Series({ + 'station_type': station_type, + 'location_type': location_type, + 'date_installation': date_installation_str, + 'last_valid_date': last_valid_date_str, + 'lat_installation': lat_installation, + 'lon_installation': lon_installation, + 'alt_installation': alt_installation, + 'lat_last_known': lat_last_known, + 'lon_last_known': lon_last_known, + 'alt_last_known': alt_last_known + }, name=s_id) + + # Check if this s_id is already in the existing metadata + if s_id in existing_metadata_df.index: + # Compare with existing metadata + existing_row = existing_metadata_df.loc[s_id] + old_date_installation = existing_row['date_installation'] + old_last_valid_date = existing_row['last_valid_date'] + + # Update the existing metadata + existing_metadata_df.loc[s_id] = row + + # Print message if dates are updated + if old_date_installation != date_installation_str or old_last_valid_date != last_valid_date_str: + print(f"Updated {label_s_id}: {s_id}") + print(f" Old date_installation: {old_date_installation} --> New date_installation: {date_installation_str}") + print(f" Old last_valid_date: {old_last_valid_date} --> New last_valid_date: {last_valid_date_str}") + + updated_s.append(s_id) + else: + new_s.append(s_id) + # Append new metadata row to the list + rows.append(row) + + except Exception as e: + print(f"Warning: Error processing {file_path}: {str(e)}") + continue # Continue to next file if there's an error + + # Convert the list of rows to a DataFrame + new_metadata_df = pd.DataFrame(rows) + + # Convert the list of excluded rows to a DataFrame + + # Concatenate the existing metadata with the new metadata and excluded metadata + combined_metadata_df = pd.concat([existing_metadata_df, new_metadata_df], ignore_index=False) + + # excluding some sites + sites_to_exclude = [s for s in ['XXX', 'Roof_GEUS', 'Roof_PROMICE'] if s in combined_metadata_df.index] + excluded_metadata_df = combined_metadata_df.loc[sites_to_exclude].copy() + combined_metadata_df.drop(sites_to_exclude, inplace=True) + + # Sort the DataFrame by index (s_id) + combined_metadata_df.sort_index(inplace=True) + + # Print excluded lines + if not excluded_metadata_df.empty: + pd.set_option('display.max_columns', None) # Show all columns + pd.set_option('display.max_colwidth', None) # Show full width of columns + pd.set_option('display.width', None) # Disable line wrapping + + print("\nExcluded lines from combined metadata.csv:") + print(excluded_metadata_df) + + # Drop excluded lines from combined_metadata_df + combined_metadata_df.drop(sites_to_exclude, errors='ignore', inplace=True) + + if label_s_id == 'site_id': + combined_metadata_df.drop(columns=['station_type'], inplace=True) + + # saving to csv + combined_metadata_df.to_csv(csv_file_path, index_label=label_s_id) + + # Determine which lines were not updated (reused) and which were added + if not existing_metadata_df.empty: + reused_s = [s_id for s_id in existing_metadata_df.index if ((s_id not in new_s) & (s_id not in updated_s))] + reused_lines = existing_metadata_df.loc[reused_s] + added_lines = 
combined_metadata_df.loc[combined_metadata_df.index.difference(existing_metadata_df.index)] + + print("\nLines from the old metadata.csv that are reused (not updated):") + print(reused_lines) + + if not added_lines.empty: + print("\nLines that were not present in the old metadata.csv and are added:") + print(added_lines) + else: + print("\nAll lines are added (no old metadata.csv found)") + +def main(): + parser = argparse.ArgumentParser(description='Process station or site data.') + parser.add_argument('-t', '--type', choices=['station', 'site'], required=True, help='Type of data to process: "station" or "site"') + parser.add_argument('--root_dir', required=True, help='Root directory containing the aws-l3 folder') + args = parser.parse_args() + + if args.type == 'station': + base_dir = os.path.join(args.root_dir, 'aws-l3/stations/') + elif args.type == 'site': + base_dir = os.path.join(args.root_dir, 'aws-l3/sites/') + + process_files(base_dir, args.type) + +if __name__ == '__main__': + main() From cd582f0b4dd446e8b94ed82616b904f2b633edb3 Mon Sep 17 00:00:00 2001 From: Baptiste Vandecrux <35140661+BaptisteVandecrux@users.noreply.github.com> Date: Fri, 5 Jul 2024 15:39:59 +0200 Subject: [PATCH 05/10] switch print to logger --- .../postprocess/make_metadata_csv.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/pypromice/postprocess/make_metadata_csv.py b/src/pypromice/postprocess/make_metadata_csv.py index 2c2ad56b..e0e41379 100644 --- a/src/pypromice/postprocess/make_metadata_csv.py +++ b/src/pypromice/postprocess/make_metadata_csv.py @@ -1,14 +1,19 @@ +#!/usr/bin/env python import os import argparse import pandas as pd import xarray as xr +import logging +logger = logging.getLogger(__name__) def process_files(base_dir, data_type): # Determine the CSV file path based on the data type if data_type == 'station': + logger.info("Updating AWS_stations_metadata.csv") csv_file_path = os.path.join(base_dir, '../AWS_stations_metadata.csv') label_s_id = 'station_id' elif data_type == 'site': + logger.info("Updating AWS_sites_metadata.csv") csv_file_path = os.path.join(base_dir, '../AWS_sites_metadata.csv') label_s_id = 'site_id' @@ -95,9 +100,9 @@ def process_files(base_dir, data_type): # Print message if dates are updated if old_date_installation != date_installation_str or old_last_valid_date != last_valid_date_str: - print(f"Updated {label_s_id}: {s_id}") - print(f" Old date_installation: {old_date_installation} --> New date_installation: {date_installation_str}") - print(f" Old last_valid_date: {old_last_valid_date} --> New last_valid_date: {last_valid_date_str}") + logger.info(f"Updated {label_s_id}: {s_id}") + logger.info(f" Old date_installation: {old_date_installation} --> New date_installation: {date_installation_str}") + logger.info(f" Old last_valid_date: {old_last_valid_date} --> New last_valid_date: {last_valid_date_str}") updated_s.append(s_id) else: @@ -106,7 +111,7 @@ def process_files(base_dir, data_type): rows.append(row) except Exception as e: - print(f"Warning: Error processing {file_path}: {str(e)}") + logger.info(f"Warning: Error processing {file_path}: {str(e)}") continue # Continue to next file if there's an error # Convert the list of rows to a DataFrame @@ -131,7 +136,7 @@ def process_files(base_dir, data_type): pd.set_option('display.max_colwidth', None) # Show full width of columns pd.set_option('display.width', None) # Disable line wrapping - print("\nExcluded lines from combined metadata.csv:") + logger.info("\nExcluded lines 
from combined metadata.csv:") print(excluded_metadata_df) # Drop excluded lines from combined_metadata_df @@ -149,14 +154,14 @@ def process_files(base_dir, data_type): reused_lines = existing_metadata_df.loc[reused_s] added_lines = combined_metadata_df.loc[combined_metadata_df.index.difference(existing_metadata_df.index)] - print("\nLines from the old metadata.csv that are reused (not updated):") + logger.info("\nLines from the old metadata.csv that are reused (not updated):") print(reused_lines) if not added_lines.empty: - print("\nLines that were not present in the old metadata.csv and are added:") + logger.info("\nLines that were not present in the old metadata.csv and are added:") print(added_lines) else: - print("\nAll lines are added (no old metadata.csv found)") + logger.info("\nAll lines are added (no old metadata.csv found)") def main(): parser = argparse.ArgumentParser(description='Process station or site data.') From f615d4b7be5c0962bf8dedb9928cfde22adc1ce3 Mon Sep 17 00:00:00 2001 From: Penny How Date: Tue, 9 Jul 2024 16:22:05 -0100 Subject: [PATCH 06/10] File paths specified rather than inferred (#279) * paths specified --- .../postprocess/make_metadata_csv.py | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/pypromice/postprocess/make_metadata_csv.py b/src/pypromice/postprocess/make_metadata_csv.py index e0e41379..b19a1cf6 100644 --- a/src/pypromice/postprocess/make_metadata_csv.py +++ b/src/pypromice/postprocess/make_metadata_csv.py @@ -5,16 +5,13 @@ import xarray as xr import logging logger = logging.getLogger(__name__) - -def process_files(base_dir, data_type): + +def process_files(base_dir, csv_file_path, data_type): + # Determine the CSV file path based on the data type if data_type == 'station': - logger.info("Updating AWS_stations_metadata.csv") - csv_file_path = os.path.join(base_dir, '../AWS_stations_metadata.csv') label_s_id = 'station_id' elif data_type == 'site': - logger.info("Updating AWS_sites_metadata.csv") - csv_file_path = os.path.join(base_dir, '../AWS_sites_metadata.csv') label_s_id = 'site_id' # Initialize a list to hold the rows (Series) of DataFrame @@ -22,8 +19,10 @@ def process_files(base_dir, data_type): # Read existing metadata if the CSV file exists if os.path.exists(csv_file_path): + logger.info("Updating "+str(csv_file_path)) existing_metadata_df = pd.read_csv(csv_file_path, index_col=label_s_id) else: + logger.info("Creating "+str(csv_file_path)) existing_metadata_df = pd.DataFrame() # Drop the 'timestamp_last_known_coordinates' column if it exists @@ -135,7 +134,6 @@ def process_files(base_dir, data_type): pd.set_option('display.max_columns', None) # Show all columns pd.set_option('display.max_colwidth', None) # Show full width of columns pd.set_option('display.width', None) # Disable line wrapping - logger.info("\nExcluded lines from combined metadata.csv:") print(excluded_metadata_df) @@ -165,16 +163,17 @@ def process_files(base_dir, data_type): def main(): parser = argparse.ArgumentParser(description='Process station or site data.') - parser.add_argument('-t', '--type', choices=['station', 'site'], required=True, help='Type of data to process: "station" or "site"') - parser.add_argument('--root_dir', required=True, help='Root directory containing the aws-l3 folder') + parser.add_argument('-t', '--type', choices=['station', 'site'], + required=True, + help='Type of data to process: "station" or "site"') + parser.add_argument('-r', '--root_dir', required=True, help='Root directory ' + + 'containing the 
aws-l3 station or site folder') + parser.add_argument('-m','--metadata_file', required=True, + help='File path to metadata csv file (existing or '+ + 'intended output path') + args = parser.parse_args() - - if args.type == 'station': - base_dir = os.path.join(args.root_dir, 'aws-l3/stations/') - elif args.type == 'site': - base_dir = os.path.join(args.root_dir, 'aws-l3/sites/') - - process_files(base_dir, args.type) + process_files(args.root_dir, args.metadata_file, args.type) if __name__ == '__main__': main() From 15a13ca8b810abb46d070e85318b6bcd5f14798a Mon Sep 17 00:00:00 2001 From: BaptisteVandecrux Date: Thu, 11 Jul 2024 17:42:27 +0200 Subject: [PATCH 07/10] fixed EOL in file attributes --- src/pypromice/resources/file_attributes.csv | 112 ++++++++++---------- 1 file changed, 56 insertions(+), 56 deletions(-) diff --git a/src/pypromice/resources/file_attributes.csv b/src/pypromice/resources/file_attributes.csv index 9fa56bc4..01b79ff5 100644 --- a/src/pypromice/resources/file_attributes.csv +++ b/src/pypromice/resources/file_attributes.csv @@ -1,56 +1,56 @@ -attribute,entry -acknowledgements,The Programme for Monitoring of the Greenland Ice Sheet (PROMICE) -alt.axis,Z -alt.coverage_content_type,coordinate -gps_alt.positive,up -cdm_data_type, -comment,https://doi.org/10.22008/promice/data/aws -contributor_name, -contributor_role, -conventions,ACDD-1.3; CF-1.7 -creater_email,pho@geus.dk -creater_url,https://promice.dk -creator_institution,Geological Survey of Denmark and Greenland (GEUS) -creator_name,Penelope How -creator_type,person -featureType,timeSeries -geospatial_bounds_crs,EPSG:4979 -geospatial_lat_extents_match,gps_lat -geospatial_lat_resolution, -geospatial_lat_units,degrees_north -geospatial_lon_extents_match,gps_lon -geospatial_lon_resolution, -geospatial_lon_units,degrees_east -geospatial_vertical_resolution, -geospatial_vertical_units,EPSG:4979 -institution,Geological Survey of Denmark and Greenland (GEUS) -instrument,See https://doi.org/10.5194/essd-13-3819-2021 -instrument_vocabulary,GCMD:GCMD Keywords -keywords,GCMDSK:EARTH SCIENCE > CRYOSPHERE > GLACIERS/ICE SHEETS > ICE SHEETS > ICE SHEET MEASUREMENTS; GCMDSK:EARTH SCIENCE > CRYOSPHERE > GLACIERS/ICE SHEETS > GLACIER MASS BALANCE/ICE SHEET MASS BALANCE; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > SNOW/ICE TEMPERATURE; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > SNOW MELT; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > SNOW DEPTH; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > ICE VELOCITY; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > ALBEDO; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > ALBEDO; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > ICE GROWTH/MELT; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > ICE VELOCITY; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > SNOW DEPTH; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > SNOW MELT; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > SNOW/ICE TEMPERATURE; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC PRESSURE; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > ALBEDO; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > INCOMING SOLAR RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > LONGWAVE RADIATION > DOWNWELLING LONGWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > LONGWAVE RADIATION > UPWELLING LONGWAVE RADIATION; 
GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > LONGWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > NET RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > OUTGOING LONGWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > RADIATIVE FLUX; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > RADIATIVE FORCING; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > SHORTWAVE RADIATION > DOWNWELLING SHORTWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > SHORTWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > SUNSHINE; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC TEMPERATURE > SURFACE TEMPERATURE > AIR TEMPERATURE; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WATER VAPOR > WATER VAPOR INDICATORS > HUMIDITY > ABSOLUTE HUMIDITY; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WATER VAPOR > WATER VAPOR INDICATORS > HUMIDITY > RELATIVE HUMIDITY; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > LOCAL WINDS; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS > U/V WIND COMPONENTS; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS > WIND DIRECTION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS > WIND SPEED; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS; GCMDSK:EARTH SCIENCE > ATMOSPHERE > CLOUDS; GCMDSK:EARTH SCIENCE > ATMOSPHERE > PRECIPITATION -keywords_vocabulary,GCMDSK:GCMD Science Keywords:https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/sciencekeywords -lat.axis,Y -lat.coverage_content_type,coordinate -lat.long_name,station latitude -license,Creative Commons Attribution 4.0 International (CC-BY-4.0) https://creativecommons.org/licenses/by/4.0 -lon.axis,X -lon.coverage_content_type,coordinate -lon.long_name,station longitude -lon.units,degrees_east -metadata_link, -naming_authority,dk.geus.promice -platform, -platform_vocabulary,GCMD:GCMD Keywords -processing_level,Level 3 -product_status,beta -product_version,4 -program,PROMICE -project,PROMICE -publisher_email,info@promice.dk -publisher_institution,GEUS -publisher_name,GEUS -publisher_type,institution -publisher_url,https://promice.dk -references,"How, P.; Abermann, J.; Ahlstrøm, A.P.; Andersen, S.B.; Box, J. E.; Citterio, M.; Colgan, W.T.; Fausto. R.S.; Karlsson, N.B.; Jakobsen, J.; Langley, K.; Larsen, S.H.; Mankoff, K.D.; Pedersen, A.Ø.; Rutishauser, A.; Shield, C.L.; Solgaard, A.M.; van As, D.; Vandecrux, B.; Wright, P.J., 2022, ""PROMICE and GC-Net automated weather station data in Greenland"", https://doi.org/10.22008/FK2/IW73UU, GEUS Dataverse" -references_bib,@article{How2022; doi = {10.22008/FK2/IW73UU}; url = {https://doi.org/10.22008/FK2/IW73UU}; year = {2022}; month=10; publisher= {GEUS Dataverse}; author = {Penelope How and Jakob Abermann and Andreas P. Ahlstr{\o}m and Signe B. Andersen and Jason E. Box and Michele Citterio and William Colgan and Robert S. Fausto and Nanna B. Karlsson and Jakob Jakobsen and Kirsty Langley and Signe Hillerup Larsen and Kenneth D. Mankoff and Allan {\O}. Pedersen and Anja Rutishauser and Christopher L. Shields and Anne M. 
Solgaard and Dirk van As and Baptiste Vandecrux}; title = {PROMICE and GC-Net automated weather station data in Greenland}; journal = {GEUS Dataverse}} -standard_name_vocabulary,CF Standard Name Table (v77; 19 January 2021) -summary,"The Programme for Monitoring of the Greenland Ice Sheet (PROMICE) and Greenland Climate Network (GC-Net) have been measuring climate and ice sheet properties since 2007 and 1995, respectively. The PROMICE weather station network monitors glacier mass balance in the melt zone of the Greenland Ice Sheet, providing ground truth data to calibrate mass budget models. GC-Net weather stations measure snowfall and surface properties in the accumulation zone, providing valuable knowledge on the Greenland Ice Sheet’s mass gain and climatology.Accurate measurements of the surface and near-surface atmospheric conditions in a changing climate is important for reliable present and future assessment of changes to the Greenland Ice Sheet. All measurements are handled and processed with pypromice, which is a peer-reviewed and freely available Python package with source code available at https://github.com/GEUS-Glaciology-and-Climate/pypromice. A user-contributable dynamic web-based database of known data quality issues is associated with the data products at https://github.com/GEUS-PROMICE/ PROMICE-AWS-data-issues/." +attribute,entry +acknowledgements,The Programme for Monitoring of the Greenland Ice Sheet (PROMICE) +alt.axis,Z +alt.coverage_content_type,coordinate +gps_alt.positive,up +cdm_data_type, +comment,https://doi.org/10.22008/promice/data/aws +contributor_name, +contributor_role, +conventions,ACDD-1.3; CF-1.7 +creater_email,pho@geus.dk +creater_url,https://promice.dk +creator_institution,Geological Survey of Denmark and Greenland (GEUS) +creator_name,Penelope How +creator_type,person +featureType,timeSeries +geospatial_bounds_crs,EPSG:4979 +geospatial_lat_extents_match,gps_lat +geospatial_lat_resolution, +geospatial_lat_units,degrees_north +geospatial_lon_extents_match,gps_lon +geospatial_lon_resolution, +geospatial_lon_units,degrees_east +geospatial_vertical_resolution, +geospatial_vertical_units,EPSG:4979 +institution,Geological Survey of Denmark and Greenland (GEUS) +instrument,See https://doi.org/10.5194/essd-13-3819-2021 +instrument_vocabulary,GCMD:GCMD Keywords +keywords,GCMDSK:EARTH SCIENCE > CRYOSPHERE > GLACIERS/ICE SHEETS > ICE SHEETS > ICE SHEET MEASUREMENTS; GCMDSK:EARTH SCIENCE > CRYOSPHERE > GLACIERS/ICE SHEETS > GLACIER MASS BALANCE/ICE SHEET MASS BALANCE; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > SNOW/ICE TEMPERATURE; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > SNOW MELT; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > SNOW DEPTH; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > ICE VELOCITY; GCMDSK:EARTH SCIENCE > CRYOSPHERE > SNOW/ICE > ALBEDO; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > ALBEDO; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > ICE GROWTH/MELT; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > ICE VELOCITY; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > SNOW DEPTH; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > SNOW MELT; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE > SNOW/ICE TEMPERATURE; GCMDSK:EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > SNOW/ICE; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC PRESSURE; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > ALBEDO; GCMDSK:EARTH SCIENCE > ATMOSPHERE > 
ATMOSPHERIC RADIATION > INCOMING SOLAR RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > LONGWAVE RADIATION > DOWNWELLING LONGWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > LONGWAVE RADIATION > UPWELLING LONGWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > LONGWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > NET RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > OUTGOING LONGWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > RADIATIVE FLUX; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > RADIATIVE FORCING; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > SHORTWAVE RADIATION > DOWNWELLING SHORTWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > SHORTWAVE RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION > SUNSHINE; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC RADIATION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC TEMPERATURE > SURFACE TEMPERATURE > AIR TEMPERATURE; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WATER VAPOR > WATER VAPOR INDICATORS > HUMIDITY > ABSOLUTE HUMIDITY; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WATER VAPOR > WATER VAPOR INDICATORS > HUMIDITY > RELATIVE HUMIDITY; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > LOCAL WINDS; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS > U/V WIND COMPONENTS; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS > WIND DIRECTION; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS > WIND SPEED; GCMDSK:EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS; GCMDSK:EARTH SCIENCE > ATMOSPHERE > CLOUDS; GCMDSK:EARTH SCIENCE > ATMOSPHERE > PRECIPITATION +keywords_vocabulary,GCMDSK:GCMD Science Keywords:https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/sciencekeywords +lat.axis,Y +lat.coverage_content_type,coordinate +lat.long_name,station latitude +license,Creative Commons Attribution 4.0 International (CC-BY-4.0) https://creativecommons.org/licenses/by/4.0 +lon.axis,X +lon.coverage_content_type,coordinate +lon.long_name,station longitude +lon.units,degrees_east +metadata_link, +naming_authority,dk.geus.promice +platform, +platform_vocabulary,GCMD:GCMD Keywords +processing_level,Level 3 +product_status,beta +product_version,4 +program,PROMICE +project,PROMICE +publisher_email,info@promice.dk +publisher_institution,GEUS +publisher_name,GEUS +publisher_type,institution +publisher_url,https://promice.dk +references,"How, P.; Abermann, J.; Ahlstrøm, A.P.; Andersen, S.B.; Box, J. E.; Citterio, M.; Colgan, W.T.; Fausto. R.S.; Karlsson, N.B.; Jakobsen, J.; Langley, K.; Larsen, S.H.; Mankoff, K.D.; Pedersen, A.Ø.; Rutishauser, A.; Shield, C.L.; Solgaard, A.M.; van As, D.; Vandecrux, B.; Wright, P.J., 2022, ""PROMICE and GC-Net automated weather station data in Greenland"", https://doi.org/10.22008/FK2/IW73UU, GEUS Dataverse" +references_bib,@article{How2022; doi = {10.22008/FK2/IW73UU}; url = {https://doi.org/10.22008/FK2/IW73UU}; year = {2022}; month=10; publisher= {GEUS Dataverse}; author = {Penelope How and Jakob Abermann and Andreas P. Ahlstr{\o}m and Signe B. Andersen and Jason E. Box and Michele Citterio and William Colgan and Robert S. Fausto and Nanna B. Karlsson and Jakob Jakobsen and Kirsty Langley and Signe Hillerup Larsen and Kenneth D. Mankoff and Allan {\O}. Pedersen and Anja Rutishauser and Christopher L. Shields and Anne M. 
Solgaard and Dirk van As and Baptiste Vandecrux}; title = {PROMICE and GC-Net automated weather station data in Greenland}; journal = {GEUS Dataverse}} +standard_name_vocabulary,CF Standard Name Table (v77; 19 January 2021) +summary,"The Programme for Monitoring of the Greenland Ice Sheet (PROMICE) and Greenland Climate Network (GC-Net) have been measuring climate and ice sheet properties since 2007 and 1995, respectively. The PROMICE weather station network monitors glacier mass balance in the melt zone of the Greenland Ice Sheet, providing ground truth data to calibrate mass budget models. GC-Net weather stations measure snowfall and surface properties in the accumulation zone, providing valuable knowledge on the Greenland Ice Sheet’s mass gain and climatology.Accurate measurements of the surface and near-surface atmospheric conditions in a changing climate is important for reliable present and future assessment of changes to the Greenland Ice Sheet. All measurements are handled and processed with pypromice, which is a peer-reviewed and freely available Python package with source code available at https://github.com/GEUS-Glaciology-and-Climate/pypromice. A user-contributable dynamic web-based database of known data quality issues is associated with the data products at https://github.com/GEUS-PROMICE/ PROMICE-AWS-data-issues/." From 9ddc1a91698c1dcbfb1cb135cbe2aeaed5e1f493 Mon Sep 17 00:00:00 2001 From: BaptisteVandecrux Date: Thu, 11 Jul 2024 18:12:40 +0200 Subject: [PATCH 08/10] added project and stations as columns in metadata CSV --- .../postprocess/make_metadata_csv.py | 53 ++++++++++++++----- src/pypromice/process/join_l3.py | 3 ++ src/pypromice/resources/file_attributes.csv | 1 - 3 files changed, 42 insertions(+), 15 deletions(-) diff --git a/src/pypromice/postprocess/make_metadata_csv.py b/src/pypromice/postprocess/make_metadata_csv.py index b19a1cf6..099939c9 100644 --- a/src/pypromice/postprocess/make_metadata_csv.py +++ b/src/pypromice/postprocess/make_metadata_csv.py @@ -1,9 +1,13 @@ #!/usr/bin/env python -import os -import argparse +import os, sys, argparse import pandas as pd import xarray as xr import logging +logging.basicConfig( + format="%(asctime)s; %(levelname)s; %(name)s; %(message)s", + level=logging.INFO, + stream=sys.stdout, +) logger = logging.getLogger(__name__) def process_files(base_dir, csv_file_path, data_type): @@ -57,6 +61,9 @@ def process_files(base_dir, csv_file_path, data_type): else: location_type = nc_file.attrs.get('location_type', 'N/A') + project = nc_file.attrs.get('project', 'N/A') + if data_type == 'site': + stations = nc_file.attrs.get('stations', s_id) # Extract the time variable as datetime64 time_var = nc_file['time'].values.astype('datetime64[s]') @@ -74,18 +81,36 @@ def process_files(base_dir, csv_file_path, data_type): alt_last_known = nc_file['alt'].isel(time=-1).values.item() # Create a pandas Series for the metadata - row = pd.Series({ - 'station_type': station_type, - 'location_type': location_type, - 'date_installation': date_installation_str, - 'last_valid_date': last_valid_date_str, - 'lat_installation': lat_installation, - 'lon_installation': lon_installation, - 'alt_installation': alt_installation, - 'lat_last_known': lat_last_known, - 'lon_last_known': lon_last_known, - 'alt_last_known': alt_last_known - }, name=s_id) + if data_type == 'site': + row = pd.Series({ + 'project': project.replace('\r',''), + 'station_type': station_type, + 'location_type': location_type, + 'stations': stations, + 'date_installation': 
date_installation_str, + 'lat_installation': lat_installation, + 'lon_installation': lon_installation, + 'alt_installation': alt_installation, + 'last_valid_date': last_valid_date_str, + 'lat_last_known': lat_last_known, + 'lon_last_known': lon_last_known, + 'alt_last_known': alt_last_known + }, name=s_id) + else: + row = pd.Series({ + 'project': project.replace('\r',''), + 'station_type': station_type, + 'location_type': location_type, + 'date_installation': date_installation_str, + 'lat_installation': lat_installation, + 'lon_installation': lon_installation, + 'alt_installation': alt_installation, + 'last_valid_date': last_valid_date_str, + 'lat_last_known': lat_last_known, + 'lon_last_known': lon_last_known, + 'alt_last_known': alt_last_known + }, name=s_id) + # Check if this s_id is already in the existing metadata if s_id in existing_metadata_df.index: diff --git a/src/pypromice/process/join_l3.py b/src/pypromice/process/join_l3.py index a61b2c64..1e29a9a9 100644 --- a/src/pypromice/process/join_l3.py +++ b/src/pypromice/process/join_l3.py @@ -304,6 +304,9 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me # creating the station_attributes attribute in l3_merged l3_merged.attrs["stations_attributes"] = st_attrs + + if "project" in st_attrs[stid].keys(): + l3_merged.attrs["project"] = st_attrs[stid]["project"] else: # if l3 (older data) is missing variables compared to l3_merged (newer data) diff --git a/src/pypromice/resources/file_attributes.csv b/src/pypromice/resources/file_attributes.csv index 01b79ff5..a2482b7d 100644 --- a/src/pypromice/resources/file_attributes.csv +++ b/src/pypromice/resources/file_attributes.csv @@ -44,7 +44,6 @@ processing_level,Level 3 product_status,beta product_version,4 program,PROMICE -project,PROMICE publisher_email,info@promice.dk publisher_institution,GEUS publisher_name,GEUS From d03b92b8cc22333404a42843bf13c1dc2aee1b84 Mon Sep 17 00:00:00 2001 From: Baptiste Vandecrux <35140661+BaptisteVandecrux@users.noreply.github.com> Date: Tue, 16 Jul 2024 09:59:20 +0200 Subject: [PATCH 09/10] properly passing the project attribute from L2 to L3 --- src/pypromice/process/L2toL3.py | 3 +++ src/pypromice/process/join_l3.py | 4 +--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/pypromice/process/L2toL3.py b/src/pypromice/process/L2toL3.py index 87d051cd..6774e155 100755 --- a/src/pypromice/process/L2toL3.py +++ b/src/pypromice/process/L2toL3.py @@ -107,6 +107,9 @@ def toL3(L2, station_config={}, T_0=273.15): # processing continuous surface height, ice surface height, snow height ds = process_surface_height(ds, station_config) + # making sure dataset has project as attribute + ds.attrs['project'] = station_config['project'] + return ds diff --git a/src/pypromice/process/join_l3.py b/src/pypromice/process/join_l3.py index 1e29a9a9..fa02a2d0 100644 --- a/src/pypromice/process/join_l3.py +++ b/src/pypromice/process/join_l3.py @@ -305,9 +305,6 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me # creating the station_attributes attribute in l3_merged l3_merged.attrs["stations_attributes"] = st_attrs - if "project" in st_attrs[stid].keys(): - l3_merged.attrs["project"] = st_attrs[stid]["project"] - else: # if l3 (older data) is missing variables compared to l3_merged (newer data) # , then we fill them with nan @@ -353,6 +350,7 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me l3_merged.attrs['site_id'] = site l3_merged.attrs['stations'] = ' 
'.join(sorted_stids) l3_merged.attrs['level'] = 'L3' + l3_merged.attrs['project'] = sorted_list_station_data[0][1]['project'] v = getVars(variables) m = getMeta(metadata) From 4fdb027758af92027aa06e49a91f49e069823f97 Mon Sep 17 00:00:00 2001 From: BaptisteVandecrux Date: Tue, 16 Jul 2024 13:51:48 +0200 Subject: [PATCH 10/10] update make_metadata_csv.py after review, store location_type attribute from config file into L3 dataset attribute --- .../postprocess/make_metadata_csv.py | 248 +++++++++--------- src/pypromice/process/L2toL3.py | 3 +- src/pypromice/process/join_l3.py | 1 + 3 files changed, 132 insertions(+), 120 deletions(-) diff --git a/src/pypromice/postprocess/make_metadata_csv.py b/src/pypromice/postprocess/make_metadata_csv.py index 099939c9..7bf819cb 100644 --- a/src/pypromice/postprocess/make_metadata_csv.py +++ b/src/pypromice/postprocess/make_metadata_csv.py @@ -3,36 +3,112 @@ import pandas as pd import xarray as xr import logging + logging.basicConfig( format="%(asctime)s; %(levelname)s; %(name)s; %(message)s", level=logging.INFO, stream=sys.stdout, ) logger = logging.getLogger(__name__) - -def process_files(base_dir, csv_file_path, data_type): - - # Determine the CSV file path based on the data type - if data_type == 'station': - label_s_id = 'station_id' - elif data_type == 'site': - label_s_id = 'site_id' + +def extract_metadata_from_nc(file_path: str, data_type: str, label_s_id: str) -> pd.Series: + """ + Extract metadata from a NetCDF file and return it as a pandas Series. + + Parameters: + - file_path (str): The path to the NetCDF file. + - data_type (str): The type of data ('station' or 'site'). + - label_s_id (str): The label for the station or site ID. + + Returns: + - pd.Series: A pandas Series containing the extracted metadata. 
+ """ + try: + with xr.open_dataset(file_path) as nc_file: + # Extract attributes + s_id = nc_file.attrs.get(label_s_id, 'N/A') + location_type = nc_file.attrs.get('location_type', 'N/A') + project = nc_file.attrs.get('project', 'N/A') + if data_type == 'site': + stations = nc_file.attrs.get('stations', s_id) + if data_type == 'station': + number_of_booms = nc_file.attrs.get('number_of_booms', 'N/A') + + # Extract the time variable as datetime64 + time_var = nc_file['time'].values.astype('datetime64[s]') + + # Extract the first and last timestamps + date_installation_str = pd.Timestamp(time_var[0]).strftime('%Y-%m-%d') + last_valid_date_str = pd.Timestamp(time_var[-1]).strftime('%Y-%m-%d') + + # Extract the first and last values of lat, lon, and alt + lat_installation = nc_file['lat'].isel(time=0).values.item() + lon_installation = nc_file['lon'].isel(time=0).values.item() + alt_installation = nc_file['alt'].isel(time=0).values.item() + + lat_last_known = nc_file['lat'].isel(time=-1).values.item() + lon_last_known = nc_file['lon'].isel(time=-1).values.item() + alt_last_known = nc_file['alt'].isel(time=-1).values.item() + + # Create a pandas Series for the metadata + if data_type == 'site': + row = pd.Series({ + 'project': project.replace('\r',''), + 'location_type': location_type, + 'stations': stations, + 'date_installation': date_installation_str, + 'latitude_installation': lat_installation, + 'longitude_installation': lon_installation, + 'altitude_installation': alt_installation, + 'date_last_valid': last_valid_date_str, + 'latitude_last_valid': lat_last_known, + 'longitude_last_valid': lon_last_known, + 'altitude_last_valid': alt_last_known + }, name=s_id) + else: + row = pd.Series({ + 'project': project.replace('\r',''), + 'number_of_booms': number_of_booms, + 'location_type': location_type, + 'date_installation': date_installation_str, + 'latitude_installation': lat_installation, + 'longitude_installation': lon_installation, + 'altitude_installation': alt_installation, + 'date_last_valid': last_valid_date_str, + 'latitude_last_valid': lat_last_known, + 'longitude_last_valid': lon_last_known, + 'altitude_last_valid': alt_last_known + }, name=s_id) + return row + except Exception as e: + logger.info(f"Warning: Error processing {file_path}: {str(e)}") + return pd.Series() # Return an empty Series in case of an error + +def process_files(base_dir: str, csv_file_path: str, data_type: str) -> pd.DataFrame: + """ + Process all files in the base directory to generate new metadata. + + Parameters: + - base_dir (str): The base directory containing the NetCDF files. + - csv_file_path (str): The path to the existing metadata CSV file. + - data_type (str): The type of data ('station' or 'site'). + + Returns: + - pd.DataFrame: The combined metadata DataFrame. 
+ """ + label_s_id = 'station_id' if data_type == 'station' else 'site_id' # Initialize a list to hold the rows (Series) of DataFrame rows = [] # Read existing metadata if the CSV file exists if os.path.exists(csv_file_path): - logger.info("Updating "+str(csv_file_path)) + logger.info("Updating " + str(csv_file_path)) existing_metadata_df = pd.read_csv(csv_file_path, index_col=label_s_id) else: - logger.info("Creating "+str(csv_file_path)) + logger.info("Creating " + str(csv_file_path)) existing_metadata_df = pd.DataFrame() - # Drop the 'timestamp_last_known_coordinates' column if it exists - if 'timestamp_last_known_coordinates' in existing_metadata_df.columns: - existing_metadata_df.drop(columns=['timestamp_last_known_coordinates'], inplace=True) - # Track updated sites or stations to avoid duplicate updates updated_s = [] new_s = [] @@ -42,111 +118,35 @@ def process_files(base_dir, csv_file_path, data_type): for file in files: if file.endswith('_hour.nc'): file_path = os.path.join(subdir, file) - try: - with xr.open_dataset(file_path) as nc_file: - # Extract attributes - s_id = nc_file.attrs.get(label_s_id, 'N/A') - - number_of_booms = nc_file.attrs.get('number_of_booms', 'N/A') - if number_of_booms == '1': - station_type = 'one boom' - elif number_of_booms == '2': - station_type = 'two booms' - else: - station_type = 'N/A' - - # Keep the existing location_type if it exists - if s_id in existing_metadata_df.index: - location_type = existing_metadata_df.loc[s_id, 'location_type'] - else: - location_type = nc_file.attrs.get('location_type', 'N/A') - - project = nc_file.attrs.get('project', 'N/A') - if data_type == 'site': - stations = nc_file.attrs.get('stations', s_id) - # Extract the time variable as datetime64 - time_var = nc_file['time'].values.astype('datetime64[s]') - - # Extract the first and last timestamps - date_installation_str = pd.Timestamp(time_var[0]).strftime('%Y-%m-%d') - last_valid_date_str = pd.Timestamp(time_var[-1]).strftime('%Y-%m-%d') - - # Extract the first and last values of lat, lon, and alt - lat_installation = nc_file['lat'].isel(time=0).values.item() - lon_installation = nc_file['lon'].isel(time=0).values.item() - alt_installation = nc_file['alt'].isel(time=0).values.item() - - lat_last_known = nc_file['lat'].isel(time=-1).values.item() - lon_last_known = nc_file['lon'].isel(time=-1).values.item() - alt_last_known = nc_file['alt'].isel(time=-1).values.item() - - # Create a pandas Series for the metadata - if data_type == 'site': - row = pd.Series({ - 'project': project.replace('\r',''), - 'station_type': station_type, - 'location_type': location_type, - 'stations': stations, - 'date_installation': date_installation_str, - 'lat_installation': lat_installation, - 'lon_installation': lon_installation, - 'alt_installation': alt_installation, - 'last_valid_date': last_valid_date_str, - 'lat_last_known': lat_last_known, - 'lon_last_known': lon_last_known, - 'alt_last_known': alt_last_known - }, name=s_id) - else: - row = pd.Series({ - 'project': project.replace('\r',''), - 'station_type': station_type, - 'location_type': location_type, - 'date_installation': date_installation_str, - 'lat_installation': lat_installation, - 'lon_installation': lon_installation, - 'alt_installation': alt_installation, - 'last_valid_date': last_valid_date_str, - 'lat_last_known': lat_last_known, - 'lon_last_known': lon_last_known, - 'alt_last_known': alt_last_known - }, name=s_id) - - - # Check if this s_id is already in the existing metadata - if s_id in existing_metadata_df.index: - # 
Compare with existing metadata - existing_row = existing_metadata_df.loc[s_id] - old_date_installation = existing_row['date_installation'] - old_last_valid_date = existing_row['last_valid_date'] - - # Update the existing metadata - existing_metadata_df.loc[s_id] = row - - # Print message if dates are updated - if old_date_installation != date_installation_str or old_last_valid_date != last_valid_date_str: - logger.info(f"Updated {label_s_id}: {s_id}") - logger.info(f" Old date_installation: {old_date_installation} --> New date_installation: {date_installation_str}") - logger.info(f" Old last_valid_date: {old_last_valid_date} --> New last_valid_date: {last_valid_date_str}") - - updated_s.append(s_id) - else: - new_s.append(s_id) - # Append new metadata row to the list - rows.append(row) - - except Exception as e: - logger.info(f"Warning: Error processing {file_path}: {str(e)}") - continue # Continue to next file if there's an error + row = extract_metadata_from_nc(file_path, data_type, label_s_id) + if not row.empty: + s_id = row.name + if s_id in existing_metadata_df.index: + # Compare with existing metadata + existing_row = existing_metadata_df.loc[s_id] + old_date_installation = existing_row['date_installation'] + old_last_valid_date = existing_row['date_last_valid'] + + # Update the existing metadata + existing_metadata_df.loc[s_id] = row + + # Print message if dates are updated + if old_last_valid_date != row['date_last_valid']: + logger.info(f"Updated {label_s_id}: {s_id} date_last_valid: {old_last_valid_date} --> {row['date_last_valid']}") + + updated_s.append(s_id) + else: + new_s.append(s_id) + # Append new metadata row to the list + rows.append(row) # Convert the list of rows to a DataFrame new_metadata_df = pd.DataFrame(rows) - # Convert the list of excluded rows to a DataFrame - - # Concatenate the existing metadata with the new metadata and excluded metadata + # Concatenate the existing metadata with the new metadata combined_metadata_df = pd.concat([existing_metadata_df, new_metadata_df], ignore_index=False) - # excluding some sites + # Exclude some sites sites_to_exclude = [s for s in ['XXX', 'Roof_GEUS', 'Roof_PROMICE'] if s in combined_metadata_df.index] excluded_metadata_df = combined_metadata_df.loc[sites_to_exclude].copy() combined_metadata_df.drop(sites_to_exclude, inplace=True) @@ -165,12 +165,21 @@ def process_files(base_dir, csv_file_path, data_type): # Drop excluded lines from combined_metadata_df combined_metadata_df.drop(sites_to_exclude, errors='ignore', inplace=True) - if label_s_id == 'site_id': - combined_metadata_df.drop(columns=['station_type'], inplace=True) - - # saving to csv + # Save to csv combined_metadata_df.to_csv(csv_file_path, index_label=label_s_id) - + + return combined_metadata_df, existing_metadata_df, new_s, updated_s + +def compare_and_log_updates(combined_metadata_df: pd.DataFrame, existing_metadata_df: pd.DataFrame, new_s: list, updated_s: list): + """ + Compare the combined metadata with the existing metadata and log the updates. + + Parameters: + - combined_metadata_df (pd.DataFrame): The combined metadata DataFrame. + - existing_metadata_df (pd.DataFrame): The existing metadata DataFrame. + - new_s (list): List of new station/site IDs. + - updated_s (list): List of updated station/site IDs. 
+ """ # Determine which lines were not updated (reused) and which were added if not existing_metadata_df.empty: reused_s = [s_id for s_id in existing_metadata_df.index if ((s_id not in new_s) & (s_id not in updated_s))] @@ -198,7 +207,8 @@ def main(): 'intended output path') args = parser.parse_args() - process_files(args.root_dir, args.metadata_file, args.type) + combined_metadata_df, existing_metadata_df, new_s, updated_s = process_files(args.root_dir, args.metadata_file, args.type) + compare_and_log_updates(combined_metadata_df, existing_metadata_df, new_s, updated_s) if __name__ == '__main__': main() diff --git a/src/pypromice/process/L2toL3.py b/src/pypromice/process/L2toL3.py index 6774e155..650f0702 100755 --- a/src/pypromice/process/L2toL3.py +++ b/src/pypromice/process/L2toL3.py @@ -107,8 +107,9 @@ def toL3(L2, station_config={}, T_0=273.15): # processing continuous surface height, ice surface height, snow height ds = process_surface_height(ds, station_config) - # making sure dataset has project as attribute + # making sure dataset has the attributes contained in the config files ds.attrs['project'] = station_config['project'] + ds.attrs['location_type'] = station_config['location_type'] return ds diff --git a/src/pypromice/process/join_l3.py b/src/pypromice/process/join_l3.py index fa02a2d0..3377b107 100644 --- a/src/pypromice/process/join_l3.py +++ b/src/pypromice/process/join_l3.py @@ -351,6 +351,7 @@ def join_l3(config_folder, site, folder_l3, folder_gcnet, outpath, variables, me l3_merged.attrs['stations'] = ' '.join(sorted_stids) l3_merged.attrs['level'] = 'L3' l3_merged.attrs['project'] = sorted_list_station_data[0][1]['project'] + l3_merged.attrs['location_type'] = sorted_list_station_data[0][1]['location_type'] v = getVars(variables) m = getMeta(metadata)
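
A minimal usage sketch of the tooling introduced by this series, mirroring what main() does after the final patch (PATCH 10/10). From the shell the same functionality is reachable through the `make_metadata_csv` entry point registered in setup.py, e.g. `make_metadata_csv -t site -r <root_dir> -m <metadata_csv>`, where `-t/--type`, `-r/--root_dir` and `-m/--metadata_file` are the flags defined in the final argparse setup. The directory and file paths below are hypothetical placeholders, not paths from the repository.

    from pypromice.postprocess.make_metadata_csv import (
        process_files,
        compare_and_log_updates,
    )

    # Walk the root directory for *_hour.nc files, extract per-site attributes
    # (project, location_type, installation/last-valid dates and coordinates),
    # merge them with any existing metadata CSV and write the result back.
    combined, existing, new_ids, updated_ids = process_files(
        base_dir="/data/aws-l3/sites",                  # hypothetical L3 site folder
        csv_file_path="/data/AWS_sites_metadata.csv",   # existing or intended output CSV
        data_type="site",                               # 'station' or 'site'
    )

    # Log which rows were reused, updated or newly added relative to the old CSV.
    compare_and_log_updates(combined, existing, new_ids, updated_ids)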