From 0b04bf82b16be38802157b49f0e186474ee5f5b1 Mon Sep 17 00:00:00 2001 From: Mads Christian Lund Date: Fri, 21 Jun 2024 15:07:24 +0200 Subject: [PATCH 01/16] Fixed bug in get_bufr Configuration variables were too strictly validated. * Made bufr_integration_test explicit --- src/pypromice/postprocess/get_bufr.py | 20 +- .../bufr_export/test_get_bufr_integration.py | 257 +++++++++++------- 2 files changed, 169 insertions(+), 108 deletions(-) diff --git a/src/pypromice/postprocess/get_bufr.py b/src/pypromice/postprocess/get_bufr.py index 6b7bc217..0ad4b3a6 100644 --- a/src/pypromice/postprocess/get_bufr.py +++ b/src/pypromice/postprocess/get_bufr.py @@ -491,26 +491,30 @@ def get_bufr_variables( BUFRVariables used by bufr_utilities """ - heightOfStationGroundAboveMeanSeaLevel = np.nan - if isinstance(station_configuration.height_of_gps_from_station_ground, float): + if station_configuration.height_of_gps_from_station_ground is None: + heightOfStationGroundAboveMeanSeaLevel = np.nan + else: heightOfStationGroundAboveMeanSeaLevel = ( data["gps_alt_fit"] - station_configuration.height_of_gps_from_station_ground ) - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = np.nan - if isinstance(station_configuration.temperature_from_sonic_ranger, float): + if station_configuration.temperature_from_sonic_ranger is None: + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = np.nan + else: heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = ( data["z_boom_u_smooth"]+ station_configuration.temperature_from_sonic_ranger ) - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = np.nan - if isinstance(station_configuration.anemometer_from_sonic_ranger, float): + if station_configuration.anemometer_from_sonic_ranger is None: + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = np.nan + else: heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = ( data["z_boom_u_smooth"] + station_configuration.anemometer_from_sonic_ranger ) - 
heightOfBarometerAboveMeanSeaLevel = np.nan - if isinstance(station_configuration.barometer_from_gps, float): + if station_configuration.barometer_from_gps is None: + heightOfBarometerAboveMeanSeaLevel = np.nan + else: heightOfBarometerAboveMeanSeaLevel = ( data["gps_alt_fit"] + station_configuration.barometer_from_gps ) diff --git a/tests/unit/bufr_export/test_get_bufr_integration.py b/tests/unit/bufr_export/test_get_bufr_integration.py index b60d235a..541962cd 100644 --- a/tests/unit/bufr_export/test_get_bufr_integration.py +++ b/tests/unit/bufr_export/test_get_bufr_integration.py @@ -3,7 +3,6 @@ """ import datetime -import hashlib import logging import pickle import shutil @@ -17,6 +16,7 @@ import pandas as pd from pypromice.postprocess import get_bufr +from pypromice.postprocess.bufr_utilities import read_bufr_message, BUFRVariables from pypromice.postprocess.get_bufr import ( DEFAULT_STATION_CONFIGURATION_PATH, StationConfiguration, @@ -38,10 +38,9 @@ def run_get_bufr( latest_timestamps: Optional[Dict[str, datetime.datetime]], station_configuration_mapping=None, **get_bufr_kwargs, -) -> Dict[str, str]: +) -> Optional[BUFRVariables]: """ Run get_bufr using a temporary folder structure for input and output data - The output bufr files can be verified using the sha256 hashes. 
Parameters ---------- @@ -51,7 +50,8 @@ def run_get_bufr( Returns ------- - mapping from station id to sha256 hashes + Optional[BUFRVariables] + BUFR variables if the output file was generated successfully """ with TemporaryDirectory() as output_path: @@ -88,13 +88,13 @@ def run_get_bufr( **get_bufr_kwargs, ) - output_bufr_files = bufr_out.glob("*.bufr") - file_hashes = dict() - for p in output_bufr_files: - with p.open("rb") as fp: - file_hashes[p.stem] = hashlib.sha256(fp.read()).hexdigest() + output_path = bufr_out.joinpath(f"{stid}.bufr") + if not output_path.exists(): + return None + + with output_path.open("rb") as fp: + return read_bufr_message(fp) - return file_hashes class PreRefactoringBufrTestCase(TestCase): @@ -104,10 +104,10 @@ def get_station_configuration_mapping( wmo_id: str, station_site: Optional[str] = None, station_type: str = "mobile", - barometer_from_gps: float = 0, + barometer_from_gps: float = 0.0, anemometer_from_sonic_ranger: float = 0.4, temperature_from_sonic_ranger: float = -0.1, - height_of_gps_from_station_ground: float = 0, + height_of_gps_from_station_ground: float = 0.0, skipped_variables=(), comment=None, export_bufr=True, @@ -137,11 +137,8 @@ def test_get_bufr_has_new_data(self): # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"DY2": datetime.datetime(2023, 12, 1)} now_timestamp = datetime.datetime(2023, 12, 8) - expected_file_hashes = { - stid: "2b94d2ef611cfddb6dd537ca63d0ec4fb5d8e880943f81a6d5e724c042ac8971" - } mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, now_timestamp=now_timestamp, latest_timestamps=latest_timestamps, @@ -150,10 +147,28 @@ def test_get_bufr_has_new_data(self): time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, + expected_bufr_variables = BUFRVariables( + wmo_id="04464", + station_type="mobile", + # Newest 
measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 + timestamp=datetime.datetime(2023, 12, 7, 23, 00), + relativeHumidity=69, + airTemperature=256.0, + pressure=77300.0, + windDirection=149, + windSpeed=14.9, + latitude=66.48249, + longitude=-46.29427, + heightOfStationGroundAboveMeanSeaLevel=2124.7, + heightOfBarometerAboveMeanSeaLevel=2124.7, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, ) + pd.testing.assert_series_equal( + bufr_data.as_series(), + expected_bufr_variables.as_series(), + ) + def test_get_bufr_has_new_data_dont_store_position(self): l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") @@ -162,11 +177,8 @@ def test_get_bufr_has_new_data_dont_store_position(self): # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"DY2": datetime.datetime(2023, 12, 1)} now_timestamp = datetime.datetime(2023, 12, 8) - expected_file_hashes = { - stid: "2b94d2ef611cfddb6dd537ca63d0ec4fb5d8e880943f81a6d5e724c042ac8971" - } mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, now_timestamp=now_timestamp, latest_timestamps=latest_timestamps, @@ -175,11 +187,29 @@ def test_get_bufr_has_new_data_dont_store_position(self): time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, + expected_bufr_variables = BUFRVariables( + wmo_id="04464", + station_type="mobile", + # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 + timestamp=datetime.datetime(2023, 12, 7, 23, 00), + relativeHumidity=69, + airTemperature=256.0, + pressure=77300.0, + windDirection=149, + windSpeed=14.9, + latitude=66.48249, + longitude=-46.29427, + heightOfStationGroundAboveMeanSeaLevel=2124.7, + heightOfBarometerAboveMeanSeaLevel=2124.7, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, + 
heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, + ) + pd.testing.assert_series_equal( + bufr_data.as_series(), + expected_bufr_variables.as_series(), ) + def test_get_bufr_stid_to_skip(self): l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") l3_src = pd.read_csv(l3_src_filepath) @@ -187,11 +217,10 @@ def test_get_bufr_stid_to_skip(self): # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"DY2": datetime.datetime(2023, 12, 1)} now_timestamp = datetime.datetime(2023, 12, 6) - expected_file_hashes = {} mapping = self.get_station_configuration_mapping( stid, wmo_id="04464", export_bufr=False ) - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, now_timestamp=now_timestamp, latest_timestamps=latest_timestamps, @@ -200,10 +229,7 @@ def test_get_bufr_stid_to_skip(self): time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, - ) + self.assertIsNone(bufr_data) def test_get_bufr_has_no_data_newer_than_latests_timestamps(self): l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") @@ -212,10 +238,9 @@ def test_get_bufr_has_no_data_newer_than_latests_timestamps(self): # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {stid: datetime.datetime(2023, 12, 7, 23, 00)} now_timestamp = datetime.datetime(2023, 12, 8) - expected_file_hashes = {} mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, now_timestamp=now_timestamp, latest_timestamps=latest_timestamps, @@ -224,10 +249,7 @@ def test_get_bufr_has_no_data_newer_than_latests_timestamps(self): time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, - ) + self.assertIsNone(bufr_data) def test_get_bufr_includes_datasets_not_in_latests_timestamps(self): l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") @@ -235,12 +257,8 
@@ def test_get_bufr_includes_datasets_not_in_latests_timestamps(self): stid = "DY2" latest_timestamps = {} now_timestamp = datetime.datetime(2023, 12, 8) - expected_file_hashes = { - stid: "2b94d2ef611cfddb6dd537ca63d0ec4fb5d8e880943f81a6d5e724c042ac8971" - } - mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, now_timestamp=now_timestamp, latest_timestamps=latest_timestamps, @@ -249,9 +267,27 @@ def test_get_bufr_includes_datasets_not_in_latests_timestamps(self): time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, + + expected_bufr_variables = BUFRVariables( + wmo_id="04464", + station_type="mobile", + # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 + timestamp=datetime.datetime(2023, 12, 7, 23, 00), + relativeHumidity=69, + airTemperature=256.0, + pressure=77300.0, + windDirection=149, + windSpeed=14.9, + latitude=66.48249, + longitude=-46.29427, + heightOfStationGroundAboveMeanSeaLevel=2124.7, + heightOfBarometerAboveMeanSeaLevel=2124.7, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, + ) + pd.testing.assert_series_equal( + bufr_data.as_series(), + expected_bufr_variables.as_series(), ) def test_get_bufr_has_old_data_compared_to_now(self): @@ -261,10 +297,9 @@ def test_get_bufr_has_old_data_compared_to_now(self): l3_src = pd.read_csv(l3_src_filepath) latest_timestamps = {stid: datetime.datetime(2023, 12, 6)} now_timestamp = datetime.datetime(2023, 12, 20) - expected_file_hashes = {} mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, now_timestamp=now_timestamp, latest_timestamps=latest_timestamps, @@ -273,10 +308,7 @@ def test_get_bufr_has_old_data_compared_to_now(self): time_limit="91d", 
station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, - ) + self.assertIsNone(bufr_data) def test_invalid_value_at_last_index(self): stid = "DY2" @@ -287,12 +319,8 @@ def test_invalid_value_at_last_index(self): l3_src.loc[140:, "p_i"] = np.nan latest_timestamps = {stid: datetime.datetime(2023, 12, 1)} now_timestamp = datetime.datetime(2023, 12, 8) - expected_file_hashes = { - stid: "bb951e0245ce3f6fe656b9bb5c85f097753a6969cc60b2cf8b34e0764495e627" - } - mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, now_timestamp=now_timestamp, latest_timestamps=latest_timestamps, @@ -301,9 +329,26 @@ def test_invalid_value_at_last_index(self): time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, + expected_bufr_variables = BUFRVariables( + wmo_id="04464", + station_type="mobile", + # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 + timestamp=datetime.datetime(2023, 12, 7, 23, 00), + relativeHumidity=69, + airTemperature=256.0, + pressure=np.nan, + windDirection=149, + windSpeed=14.9, + latitude=66.48249, + longitude=-46.29427, + heightOfStationGroundAboveMeanSeaLevel=2124.7, + heightOfBarometerAboveMeanSeaLevel=2124.7, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, + ) + pd.testing.assert_series_equal( + bufr_data.as_series(), + expected_bufr_variables.as_series(), ) def test_multiple_last_valid_indices_all_instantaneous_timestamps_are_none(self): @@ -324,10 +369,8 @@ def test_multiple_last_valid_indices_all_instantaneous_timestamps_are_none(self) ] = np.nan latest_timestamps = {stid: datetime.datetime(2023, 12, 1)} now_timestamp = datetime.datetime(2023, 12, 6) - expected_file_hashes = {} - mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes 
= run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, now_timestamp=now_timestamp, latest_timestamps=latest_timestamps, @@ -336,10 +379,8 @@ def test_multiple_last_valid_indices_all_instantaneous_timestamps_are_none(self) time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, - ) + + self.assertIsNone(bufr_data) def test_multiple_last_valid_indices_all_older_than_2days(self): stid = "DY2" @@ -350,10 +391,9 @@ def test_multiple_last_valid_indices_all_older_than_2days(self): l3_src.loc[140:, "p_i"] = np.nan latest_timestamps = {stid: datetime.datetime(2023, 12, 1)} now_timestamp = datetime.datetime(2023, 12, 10) - expected_file_hashes = {} mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, now_timestamp=now_timestamp, latest_timestamps=latest_timestamps, @@ -362,10 +402,7 @@ def test_multiple_last_valid_indices_all_older_than_2days(self): time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, - ) + self.assertIsNone(bufr_data) def test_min_data_wx_failed(self): l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") @@ -375,9 +412,8 @@ def test_min_data_wx_failed(self): # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"DY2": datetime.datetime(2023, 12, 1)} now_timestamp = datetime.datetime(2023, 12, 6) - expected_file_hashes = {} mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, now_timestamp=now_timestamp, latest_timestamps=latest_timestamps, @@ -386,10 +422,8 @@ def test_min_data_wx_failed(self): time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, - ) + + self.assertIsNone(bufr_data) def test_min_data_pos_failed(self): l3_src_filepath = 
DATA_DIR.joinpath("tx_l3_test1.csv") @@ -399,9 +433,8 @@ def test_min_data_pos_failed(self): # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"DY2": datetime.datetime(2023, 12, 1)} now_timestamp = datetime.datetime(2023, 12, 6) - expected_file_hashes = {} mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, now_timestamp=now_timestamp, latest_timestamps=latest_timestamps, @@ -410,10 +443,7 @@ def test_min_data_pos_failed(self): time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, - ) + self.assertIsNone(bufr_data) def test_ignore_newer_data_than_now_input(self): l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") @@ -427,11 +457,8 @@ def test_ignore_newer_data_than_now_input(self): 12, 6, ) - expected_file_hashes = { - stid: "976a24edef2d0e6e2f29fa13d6242419fa05b24905db715fe351c19a1aa1d577" - } mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, now_timestamp=now_timestamp, latest_timestamps=latest_timestamps, @@ -440,9 +467,26 @@ def test_ignore_newer_data_than_now_input(self): time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, + expected_bufr_variables = BUFRVariables( + wmo_id="04464", + station_type="mobile", + # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 but now_timestamp is 2023-12-06 + timestamp=datetime.datetime(2023, 12, 6, 0, 0), + relativeHumidity=82, + airTemperature=250.8, + pressure=77370.0, + windDirection=153, + windSpeed=10.4, + latitude=66.48249, + longitude=-46.29426, + heightOfStationGroundAboveMeanSeaLevel=2124.3, + heightOfBarometerAboveMeanSeaLevel=2124.3, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, + 
heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, + ) + pd.testing.assert_series_equal( + bufr_data.as_series(), + expected_bufr_variables.as_series(), ) def test_land_station_export(self): @@ -452,14 +496,10 @@ def test_land_station_export(self): # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"WEG_B": datetime.datetime(2023, 12, 1)} now_timestamp = datetime.datetime(2023, 12, 8) - expected_file_hashes = { - stid: "eb42044f38326a295bcd18bd42fba5ed88800c5a688f885b87147aacaa5f5001" - } - mapping = self.get_station_configuration_mapping( stid, wmo_id="460", station_type="land" ) - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, now_timestamp=now_timestamp, latest_timestamps=latest_timestamps, @@ -468,7 +508,24 @@ def test_land_station_export(self): time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, + expected_bufr_variables = BUFRVariables( + wmo_id="460", + station_type="land", + # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 + timestamp=datetime.datetime(2023, 12, 7, 23, 00), + relativeHumidity=69, + airTemperature=256.0, + pressure=77300.0, + windDirection=149, + windSpeed=14.9, + latitude=66.48249, + longitude=-46.29427, + heightOfStationGroundAboveMeanSeaLevel=2124.7, + heightOfBarometerAboveMeanSeaLevel=2124.7, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, + ) + pd.testing.assert_series_equal( + bufr_data.as_series(), + expected_bufr_variables.as_series(), ) From 76070b0f14ac92aec766f0384f81565f93186a17 Mon Sep 17 00:00:00 2001 From: Mads Christian Lund Date: Tue, 11 Jun 2024 08:32:02 +0200 Subject: [PATCH 02/16] Added __all__ to get_bufr.py --- src/pypromice/postprocess/get_bufr.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/pypromice/postprocess/get_bufr.py b/src/pypromice/postprocess/get_bufr.py index 
0ad4b3a6..f92bbabf 100644 --- a/src/pypromice/postprocess/get_bufr.py +++ b/src/pypromice/postprocess/get_bufr.py @@ -23,6 +23,14 @@ from pypromice.postprocess.bufr_utilities import write_bufr_message, BUFRVariables from pypromice.postprocess.real_time_utilities import get_latest_data +__all__ = [ + 'get_bufr', + 'main', + 'DEFAULT_STATION_CONFIGURATION_PATH', + 'DEFAULT_POSITION_SEED_PATH', + 'DEFAULT_LIN_REG_TIME_LIMIT', +] + logger = logging.getLogger(__name__) DEFAULT_STATION_CONFIGURATION_PATH = Path(__file__).parent.joinpath( From fdc14071425a5843044b83c6754c39289ffbe7c1 Mon Sep 17 00:00:00 2001 From: Mads Christian Lund Date: Fri, 21 Jun 2024 12:08:19 +0200 Subject: [PATCH 03/16] Applied black code formatting --- src/pypromice/postprocess/bufr_utilities.py | 2 ++ src/pypromice/postprocess/get_bufr.py | 30 +++++++++++-------- .../postprocess/real_time_utilities.py | 4 +-- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/pypromice/postprocess/bufr_utilities.py b/src/pypromice/postprocess/bufr_utilities.py index d53f2733..e189c4fe 100644 --- a/src/pypromice/postprocess/bufr_utilities.py +++ b/src/pypromice/postprocess/bufr_utilities.py @@ -45,6 +45,7 @@ def round(value: float): return round + # Enforce precision # Note the sensor accuracies listed here: # https://essd.copernicus.org/articles/13/3819/2021/#section8 @@ -64,6 +65,7 @@ class BUFRVariables: * heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD: Corresponds to "#7#heightOfSensorAboveLocalGroundOrDeckOfMarinePlatform" which is height if anemometer relative to ground or deck of marine platform. 
""" + wmo_id: str station_type: str timestamp: datetime.datetime diff --git a/src/pypromice/postprocess/get_bufr.py b/src/pypromice/postprocess/get_bufr.py index f92bbabf..433f94e2 100644 --- a/src/pypromice/postprocess/get_bufr.py +++ b/src/pypromice/postprocess/get_bufr.py @@ -24,11 +24,11 @@ from pypromice.postprocess.real_time_utilities import get_latest_data __all__ = [ - 'get_bufr', - 'main', - 'DEFAULT_STATION_CONFIGURATION_PATH', - 'DEFAULT_POSITION_SEED_PATH', - 'DEFAULT_LIN_REG_TIME_LIMIT', + "get_bufr", + "main", + "DEFAULT_STATION_CONFIGURATION_PATH", + "DEFAULT_POSITION_SEED_PATH", + "DEFAULT_LIN_REG_TIME_LIMIT", ] logger = logging.getLogger(__name__) @@ -39,6 +39,7 @@ DEFAULT_POSITION_SEED_PATH = Path(__file__).parent.joinpath("positions_seed.csv") DEFAULT_LIN_REG_TIME_LIMIT = "91d" + def parse_arguments_bufr() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() @@ -109,10 +110,10 @@ def parse_arguments_bufr() -> argparse.ArgumentParser: ) parser.add_argument( - '--latest_timestamp', + "--latest_timestamp", default=datetime.utcnow(), type=pd.Timestamp, - help="Timestamp used to determine latest data. Default utcnow." + help="Timestamp used to determine latest data. 
Default utcnow.", ) parser.add_argument("--verbose", "-v", default=False, action="store_true") @@ -503,31 +504,32 @@ def get_bufr_variables( heightOfStationGroundAboveMeanSeaLevel = np.nan else: heightOfStationGroundAboveMeanSeaLevel = ( - data["gps_alt_fit"] - station_configuration.height_of_gps_from_station_ground + data["gps_alt_fit"] + - station_configuration.height_of_gps_from_station_ground ) if station_configuration.temperature_from_sonic_ranger is None: heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = np.nan else: heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = ( - data["z_boom_u_smooth"]+ station_configuration.temperature_from_sonic_ranger + data["z_boom_u_smooth"] + + station_configuration.temperature_from_sonic_ranger ) if station_configuration.anemometer_from_sonic_ranger is None: heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = np.nan else: heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = ( - data["z_boom_u_smooth"] + station_configuration.anemometer_from_sonic_ranger + data["z_boom_u_smooth"] + station_configuration.anemometer_from_sonic_ranger ) if station_configuration.barometer_from_gps is None: heightOfBarometerAboveMeanSeaLevel = np.nan else: heightOfBarometerAboveMeanSeaLevel = ( - data["gps_alt_fit"] + station_configuration.barometer_from_gps + data["gps_alt_fit"] + station_configuration.barometer_from_gps ) - output_row = BUFRVariables( wmo_id=station_configuration.wmo_id, station_type=station_configuration.station_type, @@ -604,6 +606,7 @@ def min_data_check(s): return min_data_wx_result, min_data_pos_result + def main(): args = parse_arguments_bufr().parse_args() @@ -637,5 +640,6 @@ def main(): positions_seed_path=args.position_seed, ) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/pypromice/postprocess/real_time_utilities.py b/src/pypromice/postprocess/real_time_utilities.py index 952a69d8..bbc0b337 100644 --- 
a/src/pypromice/postprocess/real_time_utilities.py +++ b/src/pypromice/postprocess/real_time_utilities.py @@ -166,9 +166,7 @@ def find_positions(df, time_limit): df_limited[f"{k}_fit"] = df.loc[df_limited.index, f"{k}_fit"] else: logger.info(f"----> No data exists for {k}. Stubbing out with NaN.") - df_limited[f"{k}_fit"] = pd.Series( - np.nan, index=df_limited.index - ) + df_limited[f"{k}_fit"] = pd.Series(np.nan, index=df_limited.index) return df_limited From 6ff1afa0c2cc15f56780ae70576737d78ce2e3c1 Mon Sep 17 00:00:00 2001 From: Mads Christian Lund Date: Tue, 11 Jun 2024 08:33:21 +0200 Subject: [PATCH 04/16] Made bufr_to_csv as cli script in setup.py * Updated read_bufr_file to use wmo_id as index --- setup.py | 1 + src/pypromice/postprocess/bufr_to_csv.py | 7 ++++++- src/pypromice/postprocess/bufr_utilities.py | 5 +++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 52a9b216..3e39237b 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ 'get_l2tol3 = pypromice.process.get_l2tol3:main', 'get_watsontx = pypromice.tx.get_watsontx:get_watsontx', 'get_bufr = pypromice.postprocess.get_bufr:main', + 'bufr_to_csv = pypromice.postprocess.bufr_to_csv:main', 'get_msg = pypromice.tx.get_msg:get_msg' ], }, diff --git a/src/pypromice/postprocess/bufr_to_csv.py b/src/pypromice/postprocess/bufr_to_csv.py index 788aef39..d80f99a3 100644 --- a/src/pypromice/postprocess/bufr_to_csv.py +++ b/src/pypromice/postprocess/bufr_to_csv.py @@ -3,9 +3,14 @@ from pypromice.postprocess.bufr_utilities import read_bufr_file -if __name__ == "__main__": + +def main(): parser = argparse.ArgumentParser("BUFR to CSV converter") parser.add_argument("path", type=Path) args = parser.parse_args() print(read_bufr_file(args.path).to_csv()) + + +if __name__ == "__main__": + main() diff --git a/src/pypromice/postprocess/bufr_utilities.py b/src/pypromice/postprocess/bufr_utilities.py index e189c4fe..8293ac86 100644 --- a/src/pypromice/postprocess/bufr_utilities.py 
+++ b/src/pypromice/postprocess/bufr_utilities.py @@ -487,5 +487,6 @@ def read_bufr_file(path: PathLike) -> pd.DataFrame: message_vars = read_bufr_message(fp) if message_vars is None: break - lines.append(message_vars) - return pd.DataFrame(lines).rename_axis("message_index") + lines.append(message_vars.as_series()) + data_frame = pd.DataFrame(lines).set_index("wmo_id") + return data_frame From eb5ec232ce45702eac420478b1de86569835dcaf Mon Sep 17 00:00:00 2001 From: Mads Christian Lund Date: Tue, 11 Jun 2024 08:34:19 +0200 Subject: [PATCH 05/16] Added script to recreate bufr files * Added corresponding unit tests * Added flag to raise exceptions on errors * Added create_bufr_files.py to setup --- setup.py | 1 + .../postprocess/create_bufr_files.py | 152 ++++++++++++++ src/pypromice/postprocess/get_bufr.py | 6 + .../postprocess/real_time_utilities.py | 4 + .../bufr_export/test_create_bufr_files.py | 187 ++++++++++++++++++ 5 files changed, 350 insertions(+) create mode 100644 src/pypromice/postprocess/create_bufr_files.py create mode 100644 tests/unit/bufr_export/test_create_bufr_files.py diff --git a/setup.py b/setup.py index 3e39237b..db9f6365 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ 'get_l2tol3 = pypromice.process.get_l2tol3:main', 'get_watsontx = pypromice.tx.get_watsontx:get_watsontx', 'get_bufr = pypromice.postprocess.get_bufr:main', + 'create_bufr_files = pypromice.postprocess.create_bufr_files:main', 'bufr_to_csv = pypromice.postprocess.bufr_to_csv:main', 'get_msg = pypromice.tx.get_msg:get_msg' ], diff --git a/src/pypromice/postprocess/create_bufr_files.py b/src/pypromice/postprocess/create_bufr_files.py new file mode 100644 index 00000000..01a26770 --- /dev/null +++ b/src/pypromice/postprocess/create_bufr_files.py @@ -0,0 +1,152 @@ +import logging +from pathlib import Path +from typing import Sequence, List + +import pandas as pd + +from pypromice.postprocess.get_bufr import ( + get_bufr, + DEFAULT_LIN_REG_TIME_LIMIT, + 
DEFAULT_STATION_CONFIGURATION_PATH, + DEFAULT_POSITION_SEED_PATH, +) + +main_logger = logging.getLogger(__name__) + + +def create_bufr_files( + input_files: Sequence[Path], + period_start: str, + period_end: str, + output_root: Path, + override: bool, + break_on_error: bool = False, + output_filename_suffix: str = "geus_", +): + """ + Generate hourly bufr files from the for all input files + + :param input_files: Paths to csv l3 hourly data files + :param period_start: Datetime string for period start. Eg '2024-01-01T00:00' or '20240101 + :param period_end: Datetime string for period end + :param output_root: Output dir for both bufr files for individual stations and compiled. Organized in two sub directories. + :param override: If False: Skip a period if the compiled output file exists. + :param break_on_error: If True: Stop processing if an error occurs + :return: + """ + periods = pd.date_range(period_start, period_end, freq="H") + output_individual_root = output_root / "individual" + output_compiled_root = output_root / "compiled" + output_individual_root.mkdir(parents=True, exist_ok=True) + output_compiled_root.mkdir(parents=True, exist_ok=True) + + for period in periods: + period: pd.Timestamp + date_str = period.strftime("%Y%m%dT%H%M") + main_logger.info(f"Processing {date_str}") + output_dir_path = output_individual_root / f"{date_str}" + output_file_path = ( + output_compiled_root / f"{output_filename_suffix}{date_str}.bufr" + ) + + main_logger.info(f"{period}, {date_str}") + if override or not output_file_path.exists(): + get_bufr( + bufr_out=output_dir_path, + input_files=input_files, + store_positions=False, + positions_filepath=None, + time_limit=DEFAULT_LIN_REG_TIME_LIMIT, + timestamps_pickle_filepath=None, + now_timestamp=period, + station_configuration_path=DEFAULT_STATION_CONFIGURATION_PATH, + positions_seed_path=DEFAULT_POSITION_SEED_PATH, + break_on_error=break_on_error, + ) + + with output_file_path.open("wb") as fp_dst: + for src_path in 
output_dir_path.glob("*.bufr"): + with src_path.open("rb") as fp_src: + fp_dst.write(fp_src.read()) + else: + main_logger.info(f"Output file exists. Skipping {output_file_path}") + + +# %% + +def main(): + import argparse + import glob + import sys + + logger_format_string = "%(asctime)s; %(levelname)s; %(name)s; %(message)s" + logging.basicConfig( + level=logging.ERROR, + stream=sys.stdout, + format=logger_format_string, + ) + + main_handler = logging.StreamHandler(sys.stdout) + main_handler.setLevel(logging.INFO) + formatter = logging.Formatter(logger_format_string) + main_handler.setFormatter(formatter) + main_logger.addHandler(main_handler) + main_logger.setLevel(logging.INFO) + + parser = argparse.ArgumentParser( + "Create BUFR files from L3 tx .csv files." + ) + parser.add_argument( + "--input_files", + "--l3-filepath", + "-i", + type=Path, + nargs="+", + required=True, + help="Path to L3 tx .csv files. Can be direct paths or glob patterns", + ) + parser.add_argument( + "--period_start", + "-s", + required=True, + help="Datetime string for period start. Eg '2024-01-01T00:00' or '20240101", + ) + parser.add_argument( + "--period_end", "-e", required=True, help="Datetime string for period end" + ) + parser.add_argument( + "--output_root", + "-o", + required=True, + type=Path, + help="Output dir for both bufr files for individual stations and compiled. 
Organized in two sub directories.", + ) + parser.add_argument( + "--override", + "-f", + default=False, + action="store_true", + help="Recreate and overide existing output files", + ) + args = parser.parse_args() + + # Interpret all input file paths as glob patterns if they don't exist + input_files: List[Path] = list() + for path in args.input_files: + if path.exists(): + input_files.append(path) + else: + # The input path might be a glob pattern + input_files += map(Path, glob.glob(path.as_posix())) + + main_logger.info(f"Processing {len(input_files)} input files") + create_bufr_files( + input_files=input_files, + period_start=args.period_start, + period_end=args.period_end, + output_root=args.output_root, + override=args.override, + ) + +if __name__ == "__main__": + main() diff --git a/src/pypromice/postprocess/get_bufr.py b/src/pypromice/postprocess/get_bufr.py index 433f94e2..1c5fbe09 100644 --- a/src/pypromice/postprocess/get_bufr.py +++ b/src/pypromice/postprocess/get_bufr.py @@ -300,6 +300,7 @@ def get_bufr( earliest_timestamp: datetime = None, store_positions: bool = False, time_limit: str = "91d", + break_on_error: bool = False, ): """ Main function for generating BUFR files and determine latest positions from a sequence of csv files @@ -330,6 +331,8 @@ def get_bufr( Flag determine if latest positions are exported. time_limit Previous time to limit dataframe before applying linear regression. + break_on_error + If True, the function will raise an exception if an error occurs during processing. """ if now_timestamp is None: @@ -378,6 +381,7 @@ def get_bufr( # Iterate through csv files for file_path in input_files: + # TODO: This split is explicitly requiring the filename to have sampleate at suffix. 
This shuld be more robust stid = file_path.stem.rsplit("_", 1)[0] logger.info("####### Processing {} #######".format(stid)) @@ -405,6 +409,8 @@ def get_bufr( ) except Exception: logger.exception(f"Failed processing {stid}") + if break_on_error: + raise continue if station_position is None: diff --git a/src/pypromice/postprocess/real_time_utilities.py b/src/pypromice/postprocess/real_time_utilities.py index bbc0b337..17efac92 100644 --- a/src/pypromice/postprocess/real_time_utilities.py +++ b/src/pypromice/postprocess/real_time_utilities.py @@ -66,6 +66,10 @@ def get_latest_data( lin_reg_time_limit, ) + if last_valid_index not in df_limited.index: + logger.info("No valid data limited period") + return None + # Apply smoothing to z_boom_u # require at least 2 hourly obs? Sometimes seeing once/day data for z_boom_u df_limited = rolling_window(df_limited, "z_boom_u", "72H", 2, 1) diff --git a/tests/unit/bufr_export/test_create_bufr_files.py b/tests/unit/bufr_export/test_create_bufr_files.py new file mode 100644 index 00000000..0b209fbf --- /dev/null +++ b/tests/unit/bufr_export/test_create_bufr_files.py @@ -0,0 +1,187 @@ +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Optional +from unittest import TestCase + +from pypromice.postprocess.create_bufr_files import create_bufr_files + +DATA_DIR = Path(__file__).parent.absolute() + + +def create_data_file(path: Path, src_path: Optional[Path] = None): + if src_path is None: + src_path = Path("/dev/null") + + path.parent.mkdir(exist_ok=True, parents=True) + with src_path.open() as fp_src: + with path.open("w") as fp_out: + fp_out.write(fp_src.read()) + + +class TestCreateBufrFiles(TestCase): + def setUp(self): + self._temp_dir = TemporaryDirectory() + self.temp_dir = Path(self._temp_dir.name) + + def tearDown(self): + self._temp_dir.cleanup() + + def test_create_bufr_files(self): + """ + Teste the creation of bufr files and their output folder structure. 
+ It does not test the content of the bufr files. + """ + input_dir = self.temp_dir / "input" + output_dir = self.temp_dir / "output" + input_files = [ + input_dir / "THU_L2_hourly.csv", + input_dir / "KAN_Lv3_hourly.csv", + ] + # Use the same data for all input files + for input_file in input_files: + create_data_file( + input_file, + src_path=DATA_DIR.joinpath("tx_l3_test1.csv"), + ) + + create_bufr_files( + input_files=input_files, + period_start="2023-12-06T00:00", + period_end="2023-12-06T04:00", + output_root=output_dir, + override=True, + break_on_error=True, + ) + + compiled_output_dir = output_dir / "compiled" + individual_output_root = output_dir / "individual" + self.assertTrue(compiled_output_dir.exists()) + self.assertTrue(individual_output_root.exists()) + expected_output_timestamps = [ + "20231206T0000", + "20231206T0100", + "20231206T0200", + "20231206T0300", + "20231206T0400", + ] + compiled_output_files = sorted(compiled_output_dir.glob("*.bufr")) + expected_output_file_names = sorted( + [ + f"geus_{timestamp_str}.bufr" + for timestamp_str in expected_output_timestamps + ] + ) + self.assertListEqual( + expected_output_file_names, [p.name for p in compiled_output_files] + ) + individual_output_dirs = sorted(individual_output_root.glob("*")) + self.assertListEqual( + expected_output_timestamps, [p.stem for p in individual_output_dirs] + ) + for dir in individual_output_dirs: + # There should be a bufr file for each station + self.assertTrue((dir / "THU_L2.bufr").exists()) + self.assertTrue((dir / "KAN_Lv3.bufr").exists()) + + def test_get_bufr_from_empty_data_file_raises_error(self): + input_dir = self.temp_dir / "input" + output_dir = self.temp_dir / "output" + input_file = input_dir / "THU_L2_hourly.csv" + create_data_file(input_file, src_path=None) + + with self.assertRaises(ValueError): + create_bufr_files( + input_files=[input_file], + period_start="2023-12-06T00:00", + period_end="2023-12-06T04:00", + output_root=output_dir, + override=True, + 
break_on_error=True, + ) + + def test_get_gufr_continues_when_break_on_error_is_false(self): + input_dir = self.temp_dir / "input" + output_dir = self.temp_dir / "output" + input_file_without_data = input_dir / "THU_L2_hourly.csv" + create_data_file(input_file_without_data, src_path=None) + input_file_with_data = input_dir / "KAN_Lv3_hourly.csv" + create_data_file( + input_file_with_data, src_path=DATA_DIR.joinpath("tx_l3_test1.csv") + ) + compiled_output_dir = output_dir / "compiled" + individual_output_root = output_dir / "individual" + expected_compiled_output_file = compiled_output_dir / "geus_20231206T0000.bufr" + expected_individual_output_dir = individual_output_root / "20231206T0000" + expected_individual_output_file = ( + expected_individual_output_dir / "KAN_Lv3.bufr" + ) + + create_bufr_files( + input_files=[ + input_file_without_data, + input_file_with_data, + ], + period_start="2023-12-06T00:00", + period_end="2023-12-06T00:00", + output_root=output_dir, + override=True, + break_on_error=False, + ) + + self.assertTrue(expected_compiled_output_file.exists()) + # There should only be a single output file since the first input file is empty + self.assertEqual(1, len(list(expected_individual_output_dir.glob("*")))) + self.assertTrue(expected_individual_output_file.exists()) + individual_data = expected_individual_output_file.read_bytes() + compiled_data = expected_compiled_output_file.read_bytes() + self.assertEqual( + individual_data, + compiled_data, + ) + + def test_get_bufr_where_period_does_not_exist(self): + input_dir = self.temp_dir / "input" + output_dir = self.temp_dir / "output" + input_file = input_dir / "THU_L2_hourly.csv" + create_data_file(input_file, src_path=DATA_DIR.joinpath("tx_l3_test1.csv")) + + create_bufr_files( + input_files=[input_file], + period_start="2025-12-06T00:00", + period_end="2025-12-06T04:00", + output_root=output_dir, + override=True, + break_on_error=True, + ) + + compiled_output_dir = output_dir / "compiled" + 
individual_output_root = output_dir / "individual" + self.assertTrue(compiled_output_dir.exists()) + self.assertTrue(individual_output_root.exists()) + expected_output_timestamps = [ + "20251206T0000", + "20251206T0100", + "20251206T0200", + "20251206T0300", + "20251206T0400", + ] + compiled_output_files = sorted(compiled_output_dir.glob("*.bufr")) + expected_output_file_names = sorted( + [ + f"geus_{timestamp_str}.bufr" + for timestamp_str in expected_output_timestamps + ] + ) + self.assertListEqual( + expected_output_file_names, [p.name for p in compiled_output_files] + ) + for file in compiled_output_files: + # The compiled bufr files should be empty + self.assertEqual(0, file.stat().st_size) + individual_output_dirs = sorted(individual_output_root.glob("*")) + self.assertListEqual( + expected_output_timestamps, [p.stem for p in individual_output_dirs] + ) + for dir in individual_output_dirs: + # There should be no bufr files in the individual directories + self.assertEqual(0, len(list(dir.glob("*.bufr")))) From 40547e8031355c1e06b1937b5530ba333c5b846f Mon Sep 17 00:00:00 2001 From: Mads Christian Lund Date: Tue, 11 Jun 2024 15:28:54 +0200 Subject: [PATCH 06/16] Updated tests parameters Updated station config: * Added sonic_ranger_from_gps * Changed height_of_gps_from_station_ground from 0 to 1 --- .../bufr_export/test_get_bufr_integration.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/tests/unit/bufr_export/test_get_bufr_integration.py b/tests/unit/bufr_export/test_get_bufr_integration.py index 541962cd..f7b3cde4 100644 --- a/tests/unit/bufr_export/test_get_bufr_integration.py +++ b/tests/unit/bufr_export/test_get_bufr_integration.py @@ -96,7 +96,6 @@ def run_get_bufr( return read_bufr_message(fp) - class PreRefactoringBufrTestCase(TestCase): @staticmethod def get_station_configuration_mapping( @@ -107,7 +106,8 @@ def get_station_configuration_mapping( barometer_from_gps: float = 0.0, anemometer_from_sonic_ranger: 
float = 0.4, temperature_from_sonic_ranger: float = -0.1, - height_of_gps_from_station_ground: float = 0.0, + height_of_gps_from_station_ground: float = 1.0, + sonic_ranger_from_gps: float = 1.5, skipped_variables=(), comment=None, export_bufr=True, @@ -122,6 +122,7 @@ def get_station_configuration_mapping( barometer_from_gps=barometer_from_gps, anemometer_from_sonic_ranger=anemometer_from_sonic_ranger, temperature_from_sonic_ranger=temperature_from_sonic_ranger, + sonic_ranger_from_gps=sonic_ranger_from_gps, height_of_gps_from_station_ground=height_of_gps_from_station_ground, skipped_variables=skipped_variables, comment=comment, @@ -159,7 +160,7 @@ def test_get_bufr_has_new_data(self): windSpeed=14.9, latitude=66.48249, longitude=-46.29427, - heightOfStationGroundAboveMeanSeaLevel=2124.7, + heightOfStationGroundAboveMeanSeaLevel=2123.7, heightOfBarometerAboveMeanSeaLevel=2124.7, heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, @@ -169,7 +170,6 @@ def test_get_bufr_has_new_data(self): expected_bufr_variables.as_series(), ) - def test_get_bufr_has_new_data_dont_store_position(self): l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") l3_src = pd.read_csv(l3_src_filepath) @@ -199,7 +199,7 @@ def test_get_bufr_has_new_data_dont_store_position(self): windSpeed=14.9, latitude=66.48249, longitude=-46.29427, - heightOfStationGroundAboveMeanSeaLevel=2124.7, + heightOfStationGroundAboveMeanSeaLevel=2123.7, heightOfBarometerAboveMeanSeaLevel=2124.7, heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, @@ -209,7 +209,6 @@ def test_get_bufr_has_new_data_dont_store_position(self): expected_bufr_variables.as_series(), ) - def test_get_bufr_stid_to_skip(self): l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") l3_src = pd.read_csv(l3_src_filepath) @@ -280,7 +279,7 @@ def 
test_get_bufr_includes_datasets_not_in_latests_timestamps(self): windSpeed=14.9, latitude=66.48249, longitude=-46.29427, - heightOfStationGroundAboveMeanSeaLevel=2124.7, + heightOfStationGroundAboveMeanSeaLevel=2123.7, heightOfBarometerAboveMeanSeaLevel=2124.7, heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, @@ -341,7 +340,7 @@ def test_invalid_value_at_last_index(self): windSpeed=14.9, latitude=66.48249, longitude=-46.29427, - heightOfStationGroundAboveMeanSeaLevel=2124.7, + heightOfStationGroundAboveMeanSeaLevel=2123.7, heightOfBarometerAboveMeanSeaLevel=2124.7, heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, @@ -479,7 +478,7 @@ def test_ignore_newer_data_than_now_input(self): windSpeed=10.4, latitude=66.48249, longitude=-46.29426, - heightOfStationGroundAboveMeanSeaLevel=2124.3, + heightOfStationGroundAboveMeanSeaLevel=2123.3, heightOfBarometerAboveMeanSeaLevel=2124.3, heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, @@ -520,7 +519,7 @@ def test_land_station_export(self): windSpeed=14.9, latitude=66.48249, longitude=-46.29427, - heightOfStationGroundAboveMeanSeaLevel=2124.7, + heightOfStationGroundAboveMeanSeaLevel=2123.7, heightOfBarometerAboveMeanSeaLevel=2124.7, heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, From b37fbf46ab660f5ceb168181845c34f92bf02f81 Mon Sep 17 00:00:00 2001 From: Mads Christian Lund Date: Fri, 21 Jun 2024 13:27:27 +0200 Subject: [PATCH 07/16] Added test for missing data in get_bufr - Ensure get_bufr_variables raises AttributeError when station dimensions are missing --- src/pypromice/postprocess/get_bufr.py | 33 ++++++++++++--------- tests/unit/bufr_export/test_get_bufr.py | 39 +++++++------------------ 2 files changed, 30 
insertions(+), 42 deletions(-) diff --git a/src/pypromice/postprocess/get_bufr.py b/src/pypromice/postprocess/get_bufr.py index 1c5fbe09..e6e248f1 100644 --- a/src/pypromice/postprocess/get_bufr.py +++ b/src/pypromice/postprocess/get_bufr.py @@ -492,7 +492,13 @@ def get_bufr_variables( station_configuration: StationConfiguration, ) -> BUFRVariables: """ - Helper function for converting our variables to the variables needed for bufr export. + Helper function for converting our variables to the variables needed for bufr export. + + Raises AttributeError if station_configuration dont have the minimum dimension fields since they are required to determine barometer heights. + * height_of_gps_from_station_ground + * barometer_from_gps + + Parameters ---------- @@ -506,13 +512,21 @@ def get_bufr_variables( BUFRVariables used by bufr_utilities """ + if station_configuration.height_of_gps_from_station_ground is None: - heightOfStationGroundAboveMeanSeaLevel = np.nan - else: - heightOfStationGroundAboveMeanSeaLevel = ( - data["gps_alt_fit"] - - station_configuration.height_of_gps_from_station_ground + raise AttributeError( + "height_of_gps_from_station_ground is required for BUFR export" ) + if station_configuration.barometer_from_gps is None: + raise AttributeError("barometer_from_gps is required for BUFR export") + + heightOfStationGroundAboveMeanSeaLevel = ( + data["gps_alt_fit"] - station_configuration.height_of_gps_from_station_ground + ) + + heightOfBarometerAboveMeanSeaLevel = ( + data["gps_alt_fit"] + station_configuration.barometer_from_gps + ) if station_configuration.temperature_from_sonic_ranger is None: heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = np.nan @@ -529,13 +543,6 @@ def get_bufr_variables( data["z_boom_u_smooth"] + station_configuration.anemometer_from_sonic_ranger ) - if station_configuration.barometer_from_gps is None: - heightOfBarometerAboveMeanSeaLevel = np.nan - else: - heightOfBarometerAboveMeanSeaLevel = ( - data["gps_alt_fit"] + 
station_configuration.barometer_from_gps - ) - output_row = BUFRVariables( wmo_id=station_configuration.wmo_id, station_type=station_configuration.station_type, diff --git a/tests/unit/bufr_export/test_get_bufr.py b/tests/unit/bufr_export/test_get_bufr.py index 5095c2d8..a987607b 100644 --- a/tests/unit/bufr_export/test_get_bufr.py +++ b/tests/unit/bufr_export/test_get_bufr.py @@ -199,7 +199,10 @@ def test_bufr_variables_promice_v3(self): heightOfBarometerAboveMeanSeaLevel=2126, ) - def test_none_values_in_config(self): + def test_fails_on_missing_dimension_values(self): + """ + Test that get_bufr_variables raises an AttributeError if the data is missing + """ timestamp = datetime.datetime.now() data = pd.Series( data=dict( @@ -220,36 +223,14 @@ def test_none_values_in_config(self): stid="A_STID", station_type="land", wmo_id="4201", - barometer_from_gps=0.2, - anemometer_from_sonic_ranger=0.1, - temperature_from_sonic_ranger=1.3, - height_of_gps_from_station_ground=2.1, - ) - - output = get_bufr_variables( - data, - station_configuration=station_config, + export_bufr=True, ) - self.assertEqual( - BUFRVariables( - wmo_id=station_config.wmo_id, - station_type=station_config.station_type, - timestamp=timestamp, - relativeHumidity=1.0, - airTemperature=252.2, # Converted to kelvin - pressure=199300.0, - windDirection=32.0, - windSpeed=5.3, - latitude=66.0, - longitude=-46.0, - heightOfStationGroundAboveMeanSeaLevel=1091.9, - heightOfBarometerAboveMeanSeaLevel=1094.2, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=3.4, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=2.2, - ), - output, - ) + with self.assertRaises(AttributeError) as context: + get_bufr_variables( + data, + station_configuration=station_config, + ) @mock.patch("pypromice.postprocess.get_bufr.write_bufr_message") def _test_bufr_variables( From 7049c0e2577c912476be753d95f07ce00d9cc6b9 Mon Sep 17 00:00:00 2001 From: Mads Christian Lund Date: Fri, 21 Jun 2024 11:31:46 +0200 Subject: 
[PATCH 08/16] Updated get_bufr to support static GPS heights. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Bedrock stations shouldn’t depend on the noisy GPS signal for elevation. * Added station dimension values for WEG_B * Added corresponding unittest --- src/pypromice/postprocess/get_bufr.py | 13 +++- .../postprocess/station_configurations.toml | 6 ++ tests/unit/bufr_export/test_get_bufr.py | 62 +++++++++++++++++++ 3 files changed, 79 insertions(+), 2 deletions(-) diff --git a/src/pypromice/postprocess/get_bufr.py b/src/pypromice/postprocess/get_bufr.py index e6e248f1..00125050 100644 --- a/src/pypromice/postprocess/get_bufr.py +++ b/src/pypromice/postprocess/get_bufr.py @@ -144,6 +144,7 @@ class StationConfiguration: temperature_from_sonic_ranger: Optional[float] = None height_of_gps_from_station_ground: Optional[float] = None sonic_ranger_from_gps: Optional[float] = None + static_height_of_gps_from_mean_sea_level: Optional[float] = None # The station data will be exported to BUFR if True. 
Otherwise, it will only export latest position export_bufr: bool = False @@ -520,14 +521,22 @@ def get_bufr_variables( if station_configuration.barometer_from_gps is None: raise AttributeError("barometer_from_gps is required for BUFR export") + if station_configuration.static_height_of_gps_from_mean_sea_level is None: + height_of_gps_above_mean_sea_level = data["gps_alt_fit"] + else: + height_of_gps_above_mean_sea_level = ( + station_configuration.static_height_of_gps_from_mean_sea_level + ) + heightOfStationGroundAboveMeanSeaLevel = ( - data["gps_alt_fit"] - station_configuration.height_of_gps_from_station_ground + height_of_gps_above_mean_sea_level - station_configuration.height_of_gps_from_station_ground ) heightOfBarometerAboveMeanSeaLevel = ( - data["gps_alt_fit"] + station_configuration.barometer_from_gps + height_of_gps_above_mean_sea_level + station_configuration.barometer_from_gps ) + if station_configuration.temperature_from_sonic_ranger is None: heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = np.nan else: diff --git a/src/pypromice/postprocess/station_configurations.toml b/src/pypromice/postprocess/station_configurations.toml index 99bca21e..b6897327 100644 --- a/src/pypromice/postprocess/station_configurations.toml +++ b/src/pypromice/postprocess/station_configurations.toml @@ -514,6 +514,12 @@ station_site = "NUK_U" project = "Wegener" station_type = "land" wmo_id = "460" +barometer_from_gps = 1.3 +anemometer_from_sonic_ranger = 0.4 +temperature_from_sonic_ranger = 0.0 +height_of_gps_from_station_ground = 0.9 +sonic_ranger_from_gps = 1.3 +static_height_of_gps_from_mean_sea_level = 17.5 export_bufr = true skipped_variables = [] positions_update_timestamp_only = false diff --git a/tests/unit/bufr_export/test_get_bufr.py b/tests/unit/bufr_export/test_get_bufr.py index a987607b..d9ffa078 100644 --- a/tests/unit/bufr_export/test_get_bufr.py +++ b/tests/unit/bufr_export/test_get_bufr.py @@ -199,6 +199,68 @@ def 
test_bufr_variables_promice_v3(self): heightOfBarometerAboveMeanSeaLevel=2126, ) + def test_bufr_variables_static_gps_elevation(self): + timestamp = datetime.datetime.now() + data = pd.Series( + data=dict( + rh_i=0.93, + t_i=-21, + name="", + p_i=993, + wdir_i=32.1, + wspd_i=5.3, + gps_lon_fit=-46.0, + gps_lat_fit=66.0, + # This is a erroneous value that should be overridden by the static value + gps_alt_fit=142.1, + z_boom_u_smooth=2.1, + ), + name=timestamp, + ) + station_config = StationConfiguration( + stid="A_STID", + station_type="land", + wmo_id="4201", + export_bufr=True, + barometer_from_gps=1.3, + height_of_gps_from_station_ground=0.9, + static_height_of_gps_from_mean_sea_level=17.5, + anemometer_from_sonic_ranger=None, + temperature_from_sonic_ranger=None, + sonic_ranger_from_gps=None, + ) + # The elevations should be determined from the static variable + expected_station_ground_elevation = 17.5 - 0.9 + expected_barometer_elevation = 17.5 + 1.3 + + expected_bufr_variables = BUFRVariables( + wmo_id=station_config.wmo_id, + station_type=station_config.station_type, + timestamp=timestamp, + relativeHumidity=1.0, + airTemperature=252.2, # Converted to kelvin + pressure=199300.0, + windDirection=32.0, + windSpeed=5.3, + latitude=66.0, + longitude=-46.0, + heightOfStationGroundAboveMeanSeaLevel=expected_station_ground_elevation, + heightOfBarometerAboveMeanSeaLevel=expected_barometer_elevation, + # The sensor heights are ignored since the necessary dimension values are missing + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=np.nan, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=np.nan, + ) + + output = get_bufr_variables( + data, + station_configuration=station_config, + ) + + self.assertEqual( + expected_bufr_variables, + output, + ) + def test_fails_on_missing_dimension_values(self): """ Test that get_bufr_variables raises an AttributeError if the data is missing From f1285274172f1b7a844098256fc641eb859325e0 Mon Sep 17 00:00:00 2001 
From: Mads Christian Lund Date: Fri, 28 Jun 2024 14:23:51 +0200 Subject: [PATCH 09/16] Updated github/workflow to run unittests Added eccodes installation --- .github/workflows/unit_test.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml index 14aa27e8..e0ebc350 100644 --- a/.github/workflows/unit_test.yml +++ b/.github/workflows/unit_test.yml @@ -4,7 +4,7 @@ on: workflow_dispatch: jobs: - build: + test: name: unit_test runs-on: ubuntu-latest strategy: @@ -19,6 +19,9 @@ jobs: uses: actions/checkout@v3 with: token: ${{ secrets.GITHUB_TOKEN }} + - name: Install eccodes + run : | + sudo apt-get install -y libeccodes-dev - name: Install dependencies shell: bash run: | @@ -30,4 +33,4 @@ jobs: - name: Run unit tests shell: bash run: | - python3 -m unittest discover tests.e2e + python3 -m unittest discover tests From 745f85b439cddef99f1c422a5b160f531ca737ec Mon Sep 17 00:00:00 2001 From: Mads Christian Lund Date: Mon, 8 Jul 2024 12:53:02 +0200 Subject: [PATCH 10/16] Updated get_bufr to support station config files in folder * Removed station_configurations.toml from repository * Updated bufr_utilities.set_station to validate wmo id * Implemented StationConfig io tests * Extracted StationConfiguration utils from get_bufr * Added support for loading multiple station configuration files Other * Made ArgumentParser instantiation inline --- setup.py | 2 +- src/pypromice/postprocess/bufr_utilities.py | 5 + .../postprocess/create_bufr_files.py | 15 +- src/pypromice/postprocess/get_bufr.py | 268 ++---- .../postprocess/station_configurations.toml | 768 ------------------ src/pypromice/station_configuration.py | 99 +++ .../bufr_export/test_create_bufr_files.py | 45 +- tests/unit/bufr_export/test_get_bufr.py | 208 ++--- .../bufr_export/test_get_bufr_integration.py | 21 +- tests/unit/test_station_config.py | 143 ++++ tests/utilities.py | 61 ++ 11 files changed, 514 insertions(+), 1121 
deletions(-) delete mode 100644 src/pypromice/postprocess/station_configurations.toml create mode 100644 src/pypromice/station_configuration.py create mode 100644 tests/unit/test_station_config.py create mode 100644 tests/utilities.py diff --git a/setup.py b/setup.py index db9f6365..08b72656 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ package_data={ "pypromice.tx": ["payload_formats.csv", "payload_types.csv"], "pypromice.qc.percentiles": ["thresholds.csv"], - "pypromice.postprocess": ["station_configurations.toml", "positions_seed.csv"], + "pypromice.postprocess": ["positions_seed.csv"], }, install_requires=['numpy~=1.23', 'pandas>=1.5.0', 'xarray>=2022.6.0', 'toml', 'scipy>=1.9.0', 'Bottleneck', 'netcdf4', 'pyDataverse==0.3.1', 'eccodes', 'scikit-learn>=1.1.0'], # extras_require={'postprocess': ['eccodes','scikit-learn>=1.1.0']}, diff --git a/src/pypromice/postprocess/bufr_utilities.py b/src/pypromice/postprocess/bufr_utilities.py index 8293ac86..00e036d8 100644 --- a/src/pypromice/postprocess/bufr_utilities.py +++ b/src/pypromice/postprocess/bufr_utilities.py @@ -248,6 +248,11 @@ def set_station(ibufr, station_type: str, wmo_id: str): elif station_type == "land": # StationNumber for land stations are integeres wmo_id_int = int(wmo_id) + if wmo_id_int >= 1024: + raise ValueError( + f"Invalid WMO ID {wmo_id}. Land station number must be less than 1024." 
+ "See https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/001002" + ) station_config = dict(stationNumber=wmo_id_int) else: raise Exception(f"Unsupported station station type {station_type}") diff --git a/src/pypromice/postprocess/create_bufr_files.py b/src/pypromice/postprocess/create_bufr_files.py index 01a26770..2b9925c0 100644 --- a/src/pypromice/postprocess/create_bufr_files.py +++ b/src/pypromice/postprocess/create_bufr_files.py @@ -3,11 +3,11 @@ from typing import Sequence, List import pandas as pd +from pypromice.station_configuration import load_station_configuration_mapping from pypromice.postprocess.get_bufr import ( get_bufr, DEFAULT_LIN_REG_TIME_LIMIT, - DEFAULT_STATION_CONFIGURATION_PATH, DEFAULT_POSITION_SEED_PATH, ) @@ -16,6 +16,7 @@ def create_bufr_files( input_files: Sequence[Path], + station_configuration_root: Path, period_start: str, period_end: str, output_root: Path, @@ -40,6 +41,8 @@ def create_bufr_files( output_individual_root.mkdir(parents=True, exist_ok=True) output_compiled_root.mkdir(parents=True, exist_ok=True) + station_configuration_mapping = load_station_configuration_mapping(station_configuration_root) + for period in periods: period: pd.Timestamp date_str = period.strftime("%Y%m%dT%H%M") @@ -59,7 +62,7 @@ def create_bufr_files( time_limit=DEFAULT_LIN_REG_TIME_LIMIT, timestamps_pickle_filepath=None, now_timestamp=period, - station_configuration_path=DEFAULT_STATION_CONFIGURATION_PATH, + station_configuration_mapping=station_configuration_mapping, positions_seed_path=DEFAULT_POSITION_SEED_PATH, break_on_error=break_on_error, ) @@ -121,6 +124,13 @@ def main(): type=Path, help="Output dir for both bufr files for individual stations and compiled. 
Organized in two sub directories.", ) + parser.add_argument( + "--station_configuration_root", + "-c", + required=True, + type=Path, + help="Root directory containing station configuration toml files", + ) parser.add_argument( "--override", "-f", @@ -146,6 +156,7 @@ def main(): period_end=args.period_end, output_root=args.output_root, override=args.override, + station_configuration_root=args.station_configuration_root, ) if __name__ == "__main__": diff --git a/src/pypromice/postprocess/get_bufr.py b/src/pypromice/postprocess/get_bufr.py index 00125050..ab198236 100644 --- a/src/pypromice/postprocess/get_bufr.py +++ b/src/pypromice/postprocess/get_bufr.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python - """ Command-line script for running BUFR file generation @@ -13,12 +11,10 @@ import sys from datetime import datetime, timedelta from pathlib import Path -from typing import List, Dict, Mapping, Optional, Collection, Sequence, Union, TextIO +from typing import List, Dict, Optional, Collection, Sequence, Mapping -import attrs import numpy as np import pandas as pd -import toml from pypromice.postprocess.bufr_utilities import write_bufr_message, BUFRVariables from pypromice.postprocess.real_time_utilities import get_latest_data @@ -26,184 +22,21 @@ __all__ = [ "get_bufr", "main", - "DEFAULT_STATION_CONFIGURATION_PATH", "DEFAULT_POSITION_SEED_PATH", "DEFAULT_LIN_REG_TIME_LIMIT", ] +from pypromice.station_configuration import ( + StationConfiguration, + load_station_configuration_mapping, +) + logger = logging.getLogger(__name__) -DEFAULT_STATION_CONFIGURATION_PATH = Path(__file__).parent.joinpath( - "station_configurations.toml" -) DEFAULT_POSITION_SEED_PATH = Path(__file__).parent.joinpath("positions_seed.csv") DEFAULT_LIN_REG_TIME_LIMIT = "91d" -def parse_arguments_bufr() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser() - - parser.add_argument( - "--store_positions", - "--positions", - action="store_true", - required=False, - default=False, - help="If included 
(True), make a positions dict and output AWS_latest_locations.csv file.", - ) - - parser.add_argument( - "--positions-filepath", - "-p", - type=Path, - required=False, - help="Path to write AWS_latest_locations.csv file.", - ) - - parser.add_argument( - "--time-limit", - default=DEFAULT_LIN_REG_TIME_LIMIT, - type=str, - required=False, - help="Previous time to limit dataframe before applying linear regression.", - ) - - parser.add_argument( - "--input_files", - "--l3-filepath", - "-i", - type=Path, - nargs="+", - required=True, - help="Path to L3 tx .csv files. Can be direct paths or glob patterns", - ) - - parser.add_argument( - "--bufr-out", - "-o", - type=Path, - required=True, - help="Path to the BUFR out directory.", - ) - - parser.add_argument( - "--timestamps-pickle-filepath", - type=Path, - required=False, - help="Path to the latest_timestamps.pickle file.", - ) - - parser.add_argument( - "--station_configuration_mapping", - default=DEFAULT_STATION_CONFIGURATION_PATH, - type=Path, - required=False, - help="Path to csv file with station meta data and BUFR export configuration", - ) - - parser.add_argument( - "--position_seed", - default=DEFAULT_POSITION_SEED_PATH, - type=Path, - required=False, - help="Path to csv file with seed values for output positions.", - ) - - parser.add_argument( - "--latest_timestamp", - default=datetime.utcnow(), - type=pd.Timestamp, - help="Timestamp used to determine latest data. 
Default utcnow.", - ) - - parser.add_argument("--verbose", "-v", default=False, action="store_true") - - return parser - - -@attrs.define -class StationConfiguration: - """ - Helper class for storing station specific configurations with respect to - - * Installation specific distance measurements such as height differences between instruments - * Reference strings such as stid, station_site and wmo_id - * BUFR export specific parameters - - # TODO: The station related meta data should be fetched from a station specific configuration files in the future or - # from header data in data source. - """ - - stid: str - station_site: str = None - project: Optional[str] = None - station_type: Optional[str] = None - wmo_id: Optional[str] = None - barometer_from_gps: Optional[float] = None - anemometer_from_sonic_ranger: Optional[float] = None - temperature_from_sonic_ranger: Optional[float] = None - height_of_gps_from_station_ground: Optional[float] = None - sonic_ranger_from_gps: Optional[float] = None - static_height_of_gps_from_mean_sea_level: Optional[float] = None - - # The station data will be exported to BUFR if True. Otherwise, it will only export latest position - export_bufr: bool = False - comment: Optional[str] = None - - # skip specific variables for stations - # If a variable has known bad data, use this collection to skip the variable - # Note that if a station is not reporting both air temp and pressure it will be skipped, - # as currently implemented in csv2bufr.min_data_check(). 
- # ['p_i'], # EXAMPLE - skipped_variables: List[str] = attrs.field(factory=list) - - positions_update_timestamp_only: bool = False - - def as_dict(self) -> Dict: - return attrs.asdict(self) - - -def load_station_configuration_mapping( - fp: Union[str, Path, TextIO] -) -> Mapping[str, StationConfiguration]: - """ - Read station configurations from toml file - - Parameters - ---------- - fp : - Path to or open toml file - - Returns - ------- - Mapping from stid to StationConfiguration - - """ - return { - stid: StationConfiguration(**config_dict) - for stid, config_dict in toml.load(fp).items() - } - - -def write_station_configuration_mapping( - config_mapping: Mapping[str, StationConfiguration], fp: TextIO -): - """ - Write station configuration to toml file - - Parameters - ---------- - config_mapping - Mapping from stid to StationConfiguration - fp - open writable TextIO - """ - config_mapping = { - config.stid: config.as_dict() for config in config_mapping.values() - } - toml.dump(config_mapping, fp) - - def process_station( file_path: Path, output_path: Path, @@ -295,7 +128,7 @@ def get_bufr( input_files: Sequence[Path], positions_filepath: Optional[Path], timestamps_pickle_filepath: Optional[Path], - station_configuration_path: Optional[Path], + station_configuration_mapping: Mapping[str, StationConfiguration], now_timestamp: Optional[datetime] = None, positions_seed_path: Optional[Path] = None, earliest_timestamp: datetime = None, @@ -320,8 +153,8 @@ def get_bufr( Path to write latest positions. Used to retrieve a static set of positions to register stations with DMI/WMO timestamps_pickle_filepath Path to pickle file used for storing latest timestamp - station_configuration_path - Path to toml file with configuration entries for each station + station_configuration_mapping + Mapping of station id to StationConfiguration object now_timestamp get_bufr will export the latest data before now_timestamp. 
Default datetime.utcnow() positions_seed_path @@ -351,14 +184,6 @@ def get_bufr( logger.info(f"Seed positions for {positions_seed.keys()}") positions.update(positions_seed) - # Prepare station configurations - if station_configuration_path is None: - station_configuration_mapping = dict() - else: - station_configuration_mapping = load_station_configuration_mapping( - station_configuration_path - ) - # Prepare bufr output dir bufr_out.mkdir(parents=True, exist_ok=True) @@ -529,7 +354,8 @@ def get_bufr_variables( ) heightOfStationGroundAboveMeanSeaLevel = ( - height_of_gps_above_mean_sea_level - station_configuration.height_of_gps_from_station_ground + height_of_gps_above_mean_sea_level + - station_configuration.height_of_gps_from_station_ground ) heightOfBarometerAboveMeanSeaLevel = ( @@ -630,7 +456,73 @@ def min_data_check(s): def main(): - args = parse_arguments_bufr().parse_args() + parser = argparse.ArgumentParser() + parser.add_argument( + "--store_positions", + "--positions", + action="store_true", + required=False, + default=False, + help="If included (True), make a positions dict and output AWS_latest_locations.csv file.", + ) + parser.add_argument( + "--positions-filepath", + "-p", + type=Path, + required=False, + help="Path to write AWS_latest_locations.csv file.", + ) + parser.add_argument( + "--time-limit", + default=DEFAULT_LIN_REG_TIME_LIMIT, + type=str, + required=False, + help="Previous time to limit dataframe before applying linear regression.", + ) + parser.add_argument( + "--input_files", + "--l3-filepath", + "-i", + type=Path, + nargs="+", + required=True, + help="Path to L3 tx .csv files. 
Can be direct paths or glob patterns", + ) + parser.add_argument( + "--bufr-out", + "-o", + type=Path, + required=True, + help="Path to the BUFR out directory.", + ) + parser.add_argument( + "--timestamps-pickle-filepath", + type=Path, + required=False, + help="Path to the latest_timestamps.pickle file.", + ) + parser.add_argument( + "--station_configuration_root", + type=Path, + required=True, + help="Path to root directory containing station configuration toml files", + ) + parser.add_argument( + "--position_seed", + default=DEFAULT_POSITION_SEED_PATH, + type=Path, + required=False, + help="Path to csv file with seed values for output positions.", + ) + parser.add_argument( + "--latest_timestamp", + default=datetime.utcnow(), + type=pd.Timestamp, + help="Timestamp used to determine latest data. Default utcnow.", + ) + parser.add_argument("--verbose", "-v", default=False, action="store_true") + + args = parser.parse_args() log_level = logging.INFO if args.verbose: @@ -650,6 +542,8 @@ def main(): # The input path might be a glob pattern input_files += map(Path, glob.glob(path.as_posix())) + station_configuration_mapping = load_station_configuration_mapping(args.station_configuration_root) + get_bufr( bufr_out=args.bufr_out, input_files=input_files, @@ -658,7 +552,7 @@ def main(): time_limit=args.time_limit, timestamps_pickle_filepath=args.timestamps_pickle_filepath, now_timestamp=args.latest_timestamp, - station_configuration_path=args.station_configuration_mapping, + station_configuration_mapping=args.station_configuration_mapping, positions_seed_path=args.position_seed, ) diff --git a/src/pypromice/postprocess/station_configurations.toml b/src/pypromice/postprocess/station_configurations.toml deleted file mode 100644 index b6897327..00000000 --- a/src/pypromice/postprocess/station_configurations.toml +++ /dev/null @@ -1,768 +0,0 @@ -[CEN2] -stid = "CEN2" -station_site = "CEN" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04407" -barometer_from_gps = 
0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[CP1] -stid = "CP1" -station_site = "CP1" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04442" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[DY2] -stid = "DY2" -station_site = "DY2" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04464" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[EGP] -stid = "EGP" -station_site = "EGP" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04451" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[HUM] -stid = "HUM" -station_site = "HUM" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04432" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[NAE] -stid = "NAE" -station_site = "NAE" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04420" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true 
-skipped_variables = [] -positions_update_timestamp_only = false - -[NAU] -stid = "NAU" -station_site = "NAU" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04443" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[NEM] -stid = "NEM" -station_site = "NEM" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04436" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[NSE] -stid = "NSE" -station_site = "NSE" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04488" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[SDL] -stid = "SDL" -station_site = "SDL" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04485" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[SDM] -stid = "SDM" -station_site = "SDM" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04492" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[TUN] -stid = "TUN" -station_site = "TUN" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04425" 
-barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[KAN_M] -stid = "KAN_M" -station_site = "KAN_M" -project = "Promice" -station_type = "mobile" -wmo_id = "04411" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[KAN_U] -stid = "KAN_U" -station_site = "KAN_U" -project = "Promice" -station_type = "mobile" -wmo_id = "04409" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[LYN_L] -stid = "LYN_L" -station_site = "LYN_L" -project = "Disko" -station_type = "mobile" -wmo_id = "04450" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[LYN_T] -stid = "LYN_T" -station_site = "LYN_T" -project = "Disko" -station_type = "mobile" -wmo_id = "04429" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[MIT] -stid = "MIT" -station_site = "MIT" -project = "Promice" -station_type = "mobile" -wmo_id = "04430" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 
-sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[NUK_K] -stid = "NUK_K" -station_site = "NUK_K" -project = "Promice" -station_type = "mobile" -wmo_id = "04437" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[NUK_L] -stid = "NUK_L" -station_site = "NUK_L" -project = "Promice" -station_type = "mobile" -wmo_id = "04403" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[SCO_L] -stid = "SCO_L" -station_site = "SCO_L" -project = "Promice" -station_type = "mobile" -wmo_id = "04413" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[SCO_U] -stid = "SCO_U" -station_site = "SCO_U" -project = "Promice" -station_type = "mobile" -wmo_id = "04421" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[TAS_A] -stid = "TAS_A" -station_site = "TAS_A" -project = "Promice" -station_type = "mobile" -wmo_id = "04408" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[TAS_L] -stid = "TAS_L" 
-station_site = "TAS_L" -project = "Promice" -station_type = "mobile" -wmo_id = "04404" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[THU_L] -stid = "THU_L" -station_site = "THU_L" -project = "Promice" -station_type = "mobile" -wmo_id = "04424" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[THU_L2] -stid = "THU_L2" -station_site = "THU_L2" -project = "Promice" -station_type = "mobile" -wmo_id = "04453" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[UPE_L] -stid = "UPE_L" -station_site = "UPE_L" -project = "Promice" -station_type = "mobile" -wmo_id = "04423" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[UPE_U] -stid = "UPE_U" -station_site = "UPE_U" -project = "Promice" -station_type = "mobile" -wmo_id = "04422" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[KAN_Lv3] -stid = "KAN_Lv3" -station_site = "KAN_L" -project = "Promice" -station_type = "mobile" -wmo_id = "04412" -barometer_from_gps = 1.3 
-anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[NUK_Uv3] -stid = "NUK_Uv3" -station_site = "NUK_U" -project = "Promice" -station_type = "mobile" -wmo_id = "04439" -barometer_from_gps = 1.3 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[QAS_Lv3] -stid = "QAS_Lv3" -station_site = "QAS_L" -project = "Promice" -station_type = "mobile" -wmo_id = "04401" -barometer_from_gps = 1.3 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[QAS_Mv3] -stid = "QAS_Mv3" -station_site = "QAS_M" -project = "Promice" -station_type = "mobile" -wmo_id = "04441" -barometer_from_gps = 1.3 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[QAS_Uv3] -stid = "QAS_Uv3" -station_site = "QAS_U" -project = "Promice" -station_type = "mobile" -wmo_id = "04402" -barometer_from_gps = 1.3 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[ZAK_Lv3] -stid = "ZAK_Lv3" -station_site = "ZAK_L" -project = "GlacioBasis" -station_type = "mobile" -wmo_id = "04461" -barometer_from_gps = 1.3 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 
-sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[ZAK_Uv3] -stid = "ZAK_Uv3" -station_site = "ZAK_U" -project = "GlacioBasis" -station_type = "mobile" -wmo_id = "04462" -barometer_from_gps = 1.3 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[WEG_B] -stid = "WEG_B" -station_site = "NUK_U" -project = "Wegener" -station_type = "land" -wmo_id = "460" -barometer_from_gps = 1.3 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -static_height_of_gps_from_mean_sea_level = 17.5 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[KAN_B] -stid = "KAN_B" -station_site = "KAN_B" -project = "Promice" -station_type = "land" -wmo_id = "445" -export_bufr = false -comment = "no_instantaneous" -skipped_variables = [] -positions_update_timestamp_only = true - -[CEN1] -stid = "CEN1" -station_site = "CEN1" -project = "GC-Net" -export_bufr = false -comment = "discontinued" -skipped_variables = [] -positions_update_timestamp_only = false - -[JAR_O] -stid = "JAR_O" -station_site = "JAR" -project = "GC-Net" -wmo_id = "04452" -export_bufr = false -comment = "discontinued" -skipped_variables = [] -positions_update_timestamp_only = false - -[KAN_L] -stid = "KAN_L" -station_site = "KAN_L" -project = "Promice" -wmo_id = "04412" -export_bufr = false -comment = "use_v3" -skipped_variables = [] -positions_update_timestamp_only = false - -[KPC_Lv3] -stid = "KPC_Lv3" -station_site = "KPC_L" -project = "Promice" -station_type = "mobile" -wmo_id = "04428" -export_bufr = false -comment = "v3_bad" -skipped_variables = [] -positions_update_timestamp_only = false - -[NUK_N] -stid = "NUK_N" -station_site = "NUK_N" -export_bufr = 
false -comment = "discontinued" -skipped_variables = [] -positions_update_timestamp_only = false - -[NUK_U] -stid = "NUK_U" -station_site = "NUK_U" -project = "Promice" -wmo_id = "04439" -export_bufr = false -comment = "use_v3" -skipped_variables = [] -positions_update_timestamp_only = false - -[QAS_A] -stid = "QAS_A" -station_site = "QAS_A" -project = "Promice" -export_bufr = false -comment = "discontinued" -skipped_variables = [] -positions_update_timestamp_only = false - -[QAS_L] -stid = "QAS_L" -station_site = "QAS_L" -project = "Promice" -wmo_id = "04401" -export_bufr = false -comment = "use_v3" -skipped_variables = [] -positions_update_timestamp_only = false - -[QAS_M] -stid = "QAS_M" -station_site = "QAS_M" -project = "Promice" -wmo_id = "04441" -export_bufr = false -comment = "use_v3" -skipped_variables = [] -positions_update_timestamp_only = false - -[QAS_U] -stid = "QAS_U" -station_site = "QAS_U" -project = "Promice" -wmo_id = "04402" -export_bufr = false -comment = "use_v3" -skipped_variables = [] -positions_update_timestamp_only = false - -[SWC_O] -stid = "SWC_O" -station_site = "SWC" -project = "GC-Net" -wmo_id = "04458" -export_bufr = false -comment = "discontinued" -skipped_variables = [] -positions_update_timestamp_only = false - -[TAS_U] -stid = "TAS_U" -station_site = "TAS_U" -project = "Promice" -export_bufr = false -comment = "discontinued" -skipped_variables = [] -positions_update_timestamp_only = false - -[THU_U] -stid = "THU_U" -station_site = "THU_U" -project = "Promice" -station_type = "mobile" -wmo_id = "04454" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = false -comment = "discontinued" -skipped_variables = [] -positions_update_timestamp_only = false - -[UWN] -stid = "UWN" -station_site = "UWN" -export_bufr = false -comment = "not_registered" -skipped_variables = [] -positions_update_timestamp_only = 
false - -[WEG_L] -stid = "WEG_L" -station_site = "WEG_L" -project = "Wegener" -export_bufr = false -comment = "not_registered" -skipped_variables = [] -positions_update_timestamp_only = false - -[XXX] -stid = "XXX" -station_site = "XXX" -export_bufr = false -comment = "test" -skipped_variables = [] -positions_update_timestamp_only = false - -[ZAK_A] -stid = "ZAK_A" -station_site = "ZAK_A" -project = "GlacioBasis" -export_bufr = false -comment = "not_registered" -skipped_variables = [] -positions_update_timestamp_only = false - -[ZAK_L] -stid = "ZAK_L" -station_site = "ZAK_L" -project = "GlacioBasis" -wmo_id = "04461" -export_bufr = false -comment = "use_v3,no_instantaneous" -skipped_variables = [] -positions_update_timestamp_only = false - -[ZAK_U] -stid = "ZAK_U" -station_site = "ZAK_U" -project = "GlacioBasis" -wmo_id = "04462" -export_bufr = false -comment = "use_v3,no_instantaneous" -skipped_variables = [] -positions_update_timestamp_only = false - -[KPC_Uv3] -stid = "KPC_Uv3" -station_site = "KPC_U" -project = "Promice" -station_type = "mobile" -wmo_id = "04427" -export_bufr = false -comment = "v3_bad" -skipped_variables = [] -positions_update_timestamp_only = false - -[KPC_L] -stid = "KPC_L" -station_site = "KPC_L" -project = "Promice" -wmo_id = "04428" -export_bufr = false -comment = "use_v3" -skipped_variables = [] -positions_update_timestamp_only = false - -[KPC_U] -stid = "KPC_U" -station_site = "KPC_U" -project = "Promice" -wmo_id = "04427" -export_bufr = false -comment = "use_v3" -skipped_variables = [] -positions_update_timestamp_only = false - -[THU_U2] -stid = "THU_U2" -station_site = "THU_U" -project = "Promice" -station_type = "mobile" -wmo_id = "04454" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false diff --git 
a/src/pypromice/station_configuration.py b/src/pypromice/station_configuration.py new file mode 100644 index 00000000..34e85ff3 --- /dev/null +++ b/src/pypromice/station_configuration.py @@ -0,0 +1,99 @@ +from pathlib import Path +from typing import Optional, Dict, Mapping, Sequence + +import attrs +import toml + + +@attrs.define +class StationConfiguration: + """ + Helper class for storing station specific configurations with respect to + + * Installation specific distance measurements such as height differences between instruments + * Reference strings such as stid, station_site and wmo_id + * BUFR export specific parameters + + # TODO: The station related meta data should be fetched from a station specific configuration files in the future or + # from header data in data source. + """ + + stid: str + station_site: str = None + project: Optional[str] = None + station_type: Optional[str] = None + wmo_id: Optional[str] = None + barometer_from_gps: Optional[float] = None + anemometer_from_sonic_ranger: Optional[float] = None + temperature_from_sonic_ranger: Optional[float] = None + height_of_gps_from_station_ground: Optional[float] = None + sonic_ranger_from_gps: Optional[float] = None + static_height_of_gps_from_mean_sea_level: Optional[float] = None + station_relocation: Sequence[str] = attrs.field(factory=list) + + # The station data will be exported to BUFR if True. Otherwise, it will only export latest position + export_bufr: bool = False + comment: Optional[str] = None + + # skip specific variables for stations + # If a variable has known bad data, use this collection to skip the variable + # Note that if a station is not reporting both air temp and pressure it will be skipped, + # as currently implemented in csv2bufr.min_data_check(). 
+ # ['p_i'], # EXAMPLE + skipped_variables: Sequence[str] = attrs.field(factory=list) + + positions_update_timestamp_only: bool = False + + @classmethod + def load_toml(cls, path): + return cls(**toml.load(path)) + + def dump_toml(self, path: Path): + with path.open("w") as fp: + toml.dump(self.as_dict(), fp) + + def as_dict(self) -> Dict: + return attrs.asdict(self) + + +def load_station_configuration_mapping( + configuration_root_dir: Path, +) -> Mapping[str, StationConfiguration]: + """ + Load station configurations from toml files in configuration_root_dir + + Parameters + ---------- + configuration_root_dir + Root directory containing toml files + + Returns + ------- + Mapping from stid to StationConfiguration + + """ + return { + config_file.stem: StationConfiguration(**toml.load(config_file)) + for config_file in configuration_root_dir.glob("*.toml") + } + + +def write_station_configuration_mapping( + station_configurations: Mapping[str, StationConfiguration], + configuration_root_dir: Path, +) -> None: + """ + Write station configurations to toml files in configuration_root_dir + + Parameters + ---------- + station_configurations + Mapping from stid to StationConfiguration + configuration_root_dir + Output directory + + """ + configuration_root_dir.mkdir(parents=True, exist_ok=True) + for stid, station_configuration in station_configurations.items(): + with (configuration_root_dir / f"{stid}.toml").open("w") as fp: + toml.dump(station_configuration.as_dict(), fp) diff --git a/tests/unit/bufr_export/test_create_bufr_files.py b/tests/unit/bufr_export/test_create_bufr_files.py index 0b209fbf..f9cf935e 100644 --- a/tests/unit/bufr_export/test_create_bufr_files.py +++ b/tests/unit/bufr_export/test_create_bufr_files.py @@ -3,7 +3,11 @@ from typing import Optional from unittest import TestCase +import toml +from pypromice.station_configuration import write_station_configuration_mapping + from pypromice.postprocess.create_bufr_files import create_bufr_files +from 
tests.utilities import get_station_configuration DATA_DIR = Path(__file__).parent.absolute() @@ -44,6 +48,17 @@ def test_create_bufr_files(self): src_path=DATA_DIR.joinpath("tx_l3_test1.csv"), ) + station_configuration_root = self.temp_dir / "station_configuration" + station_configuration_root.mkdir(parents=True, exist_ok=True) + station_configuration_mapping = { + "THU_L2": get_station_configuration(stid="THU_L2", export_bufr=True), + "KAN_Lv3": get_station_configuration(stid="KAN_Lv3", export_bufr=True), + } + write_station_configuration_mapping( + station_configurations=station_configuration_mapping, + configuration_root_dir=station_configuration_root, + ) + create_bufr_files( input_files=input_files, period_start="2023-12-06T00:00", @@ -51,6 +66,7 @@ def test_create_bufr_files(self): output_root=output_dir, override=True, break_on_error=True, + station_configuration_root=station_configuration_root, ) compiled_output_dir = output_dir / "compiled" @@ -88,6 +104,14 @@ def test_get_bufr_from_empty_data_file_raises_error(self): output_dir = self.temp_dir / "output" input_file = input_dir / "THU_L2_hourly.csv" create_data_file(input_file, src_path=None) + station_configuration_root = self.temp_dir / "station_configuration" + station_configuration = get_station_configuration( + stid="KAN_Lv3", export_bufr=True + ) + write_station_configuration_mapping( + station_configurations={station_configuration.stid: station_configuration}, + configuration_root_dir=station_configuration_root, + ) with self.assertRaises(ValueError): create_bufr_files( @@ -97,9 +121,10 @@ def test_get_bufr_from_empty_data_file_raises_error(self): output_root=output_dir, override=True, break_on_error=True, + station_configuration_root=station_configuration_root, ) - def test_get_gufr_continues_when_break_on_error_is_false(self): + def test_get_bufr_continues_when_break_on_error_is_false(self): input_dir = self.temp_dir / "input" output_dir = self.temp_dir / "output" input_file_without_data = 
input_dir / "THU_L2_hourly.csv" @@ -110,6 +135,14 @@ def test_get_gufr_continues_when_break_on_error_is_false(self): ) compiled_output_dir = output_dir / "compiled" individual_output_root = output_dir / "individual" + station_configuration_root = self.temp_dir / "station_configuration" + write_station_configuration_mapping( + station_configurations={ + "THU_L2": get_station_configuration(stid="THU_L2", export_bufr=True), + "KAN_Lv3": get_station_configuration(stid="KAN_Lv3", export_bufr=True), + }, + configuration_root_dir=station_configuration_root, + ) expected_compiled_output_file = compiled_output_dir / "geus_20231206T0000.bufr" expected_individual_output_dir = individual_output_root / "20231206T0000" expected_individual_output_file = ( @@ -126,6 +159,7 @@ def test_get_gufr_continues_when_break_on_error_is_false(self): output_root=output_dir, override=True, break_on_error=False, + station_configuration_root=station_configuration_root, ) self.assertTrue(expected_compiled_output_file.exists()) @@ -144,6 +178,14 @@ def test_get_bufr_where_period_does_not_exist(self): output_dir = self.temp_dir / "output" input_file = input_dir / "THU_L2_hourly.csv" create_data_file(input_file, src_path=DATA_DIR.joinpath("tx_l3_test1.csv")) + station_configuration_root = self.temp_dir / "station_configuration" + station_configuration = get_station_configuration( + stid="THU_L2", export_bufr=True + ) + write_station_configuration_mapping( + station_configurations={station_configuration.stid: station_configuration}, + configuration_root_dir=station_configuration_root, + ) create_bufr_files( input_files=[input_file], @@ -152,6 +194,7 @@ def test_get_bufr_where_period_does_not_exist(self): output_root=output_dir, override=True, break_on_error=True, + station_configuration_root=station_configuration_root, ) compiled_output_dir = output_dir / "compiled" diff --git a/tests/unit/bufr_export/test_get_bufr.py b/tests/unit/bufr_export/test_get_bufr.py index d9ffa078..2af06c4c 100644 --- 
a/tests/unit/bufr_export/test_get_bufr.py +++ b/tests/unit/bufr_export/test_get_bufr.py @@ -4,7 +4,6 @@ import sys import unittest import uuid -from io import StringIO from pathlib import Path from tempfile import TemporaryDirectory from unittest import TestCase, mock @@ -15,11 +14,11 @@ from pypromice.postprocess.bufr_utilities import BUFRVariables from pypromice.postprocess.get_bufr import ( process_station, - StationConfiguration, get_bufr, get_bufr_variables, - write_station_configuration_mapping, - load_station_configuration_mapping, +) +from pypromice.station_configuration import ( + StationConfiguration, ) from tests.unit.bufr_export.test_get_bufr_integration import ( DATA_DIR, @@ -35,121 +34,25 @@ MOCK_BASE_STR = "pypromice.postprocess.get_bufr.{}" -class StationConfigurationTestCase(TestCase): - def test_read(self): - source_lines = [ - "[UPE_L]\n", - 'stid = "UPE_L"\n', - 'station_site = "UPE_L"\n', - 'project = "Promice"\n', - 'station_type = "mobile"\n', - 'wmo_id = "04423"\n', - "barometer_from_gps = -0.25\n", - "anemometer_from_sonic_ranger = 0.4\n", - "temperature_from_sonic_ranger = 0.0\n", - "height_of_gps_from_station_ground = 0.9\n", - "sonic_ranger_from_gps = 1.3\n", - "export_bufr = true\n", - "skipped_variables = []\n", - "positions_update_timestamp_only = false\n", - ] - source_io = StringIO() - source_io.writelines(source_lines) - source_io.seek(0) - expected_configuration_mapping = { - "UPE_L": StationConfiguration( - stid="UPE_L", - station_site="UPE_L", - project="Promice", - station_type="mobile", - wmo_id="04423", - barometer_from_gps=-0.25, - anemometer_from_sonic_ranger=0.4, - temperature_from_sonic_ranger=0.0, - height_of_gps_from_station_ground=0.9, - sonic_ranger_from_gps=1.3, - export_bufr=True, - comment=None, - skipped_variables=[], - positions_update_timestamp_only=False, - ) - } - - station_configuration_mapping = load_station_configuration_mapping(source_io) - - self.assertDictEqual( - expected_configuration_mapping, - 
station_configuration_mapping, - ) - - def test_write_read(self): - station_config = StationConfiguration( - stid="UPE_L", - station_site="UPE_L", - project="Promice", +class BufrVariablesTestCase(TestCase): + def test_bufr_variables_gcnet(self): + station_configuration = StationConfiguration( + stid="DY2", + station_site="DY2", + project="GC-Net", + wmo_id="04464", station_type="mobile", - wmo_id="04423", - barometer_from_gps=-0.25, + barometer_from_gps=0.55, anemometer_from_sonic_ranger=0.4, - temperature_from_sonic_ranger=0.0, - height_of_gps_from_station_ground=0.9, - sonic_ranger_from_gps=1.3, + temperature_from_sonic_ranger=0.4, + height_of_gps_from_station_ground=1.5, + sonic_ranger_from_gps=0.15, export_bufr=True, - comment=None, - skipped_variables=[], - positions_update_timestamp_only=False, - ) - config_mapping = {station_config.stid: station_config} - source_io = StringIO() - - write_station_configuration_mapping(config_mapping, source_io) - source_io.seek(0) - read_mapping = load_station_configuration_mapping(source_io) - - self.assertDictEqual( - config_mapping, - read_mapping, ) - def test_write_read_minimal_config(self): - station_config = StationConfiguration(stid="UPE_L") - config_mapping = {station_config.stid: station_config} - source_io = StringIO() - - write_station_configuration_mapping(config_mapping, source_io) - source_io.seek(0) - read_mapping = load_station_configuration_mapping(source_io) - - self.maxDiff = None - self.assertEqual( - station_config, - config_mapping[station_config.stid], - ) - self.assertDictEqual( - config_mapping, - read_mapping, - ) - - def test_write_read_empty_mapping(self): - config_mapping = {} - source_io = StringIO() - - write_station_configuration_mapping(config_mapping, source_io) - source_io.seek(0) - read_mapping = load_station_configuration_mapping(source_io) - - self.assertDictEqual( - config_mapping, - read_mapping, - ) - - -class BufrVariablesTestCase(TestCase): - def test_bufr_variables_gcnet(self): 
self._test_bufr_variables( - stid="DY2", - wmo_id="04464", - station_type="mobile", + stid=station_configuration.stid, + station_configuration=station_configuration, relativeHumidity=69.0, airTemperature=256.0, pressure=77300.0, @@ -164,10 +67,22 @@ def test_bufr_variables_gcnet(self): ) def test_bufr_variables_promice_v2(self): - self._test_bufr_variables( + station_configuration = StationConfiguration( stid="NUK_L", - wmo_id="04403", + station_site="NUK_L", + project="Promice", station_type="mobile", + wmo_id="04403", + barometer_from_gps=-0.25, + anemometer_from_sonic_ranger=0.4, + temperature_from_sonic_ranger=0.0, + height_of_gps_from_station_ground=0.9, + sonic_ranger_from_gps=1.3, + export_bufr=True, + ) + self._test_bufr_variables( + stid=station_configuration.stid, + station_configuration=station_configuration, relativeHumidity=69.0, airTemperature=256.0, pressure=77300.0, @@ -182,10 +97,22 @@ def test_bufr_variables_promice_v2(self): ) def test_bufr_variables_promice_v3(self): - self._test_bufr_variables( + station_configuration = StationConfiguration( stid="QAS_Mv3", - wmo_id="04441", + station_site="QAS_M", + project="Promice", station_type="mobile", + wmo_id="04441", + barometer_from_gps=1.3, + anemometer_from_sonic_ranger=0.4, + temperature_from_sonic_ranger=0.0, + height_of_gps_from_station_ground=0.9, + sonic_ranger_from_gps=1.3, + export_bufr=True, + ) + self._test_bufr_variables( + stid=station_configuration.stid, + station_configuration=station_configuration, relativeHumidity=69.0, airTemperature=256.0, pressure=77300.0, @@ -299,7 +226,7 @@ def _test_bufr_variables( self, write_bufr_message_mock: mock.MagicMock, stid: str, - wmo_id: str, + station_configuration: StationConfiguration, relativeHumidity: float, airTemperature: float, pressure: float, @@ -311,7 +238,6 @@ def _test_bufr_variables( heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH: float, heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD: float, 
heightOfBarometerAboveMeanSeaLevel: float, - station_type: str, ): l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") l3_src = pd.read_csv(l3_src_filepath) @@ -325,14 +251,17 @@ def _test_bufr_variables( stid=stid, store_positions=True, time_limit="91d", + station_configuration_mapping={ + station_configuration.stid: station_configuration + }, ) write_bufr_message_mock.assert_called_once() call = write_bufr_message_mock.call_args_list[0] expected_time = datetime.datetime(year=2023, month=12, day=7, hour=23) expected_bufr_variables = BUFRVariables( - wmo_id=wmo_id, - station_type=station_type, + wmo_id=station_configuration.wmo_id, + station_type=station_configuration.station_type, timestamp=expected_time, relativeHumidity=relativeHumidity, airTemperature=airTemperature, @@ -719,7 +648,7 @@ def test_process_station_raises_exception( bufr_out=self.bufr_root, input_files=[input_file_path], positions_filepath=self.positions_file_path, - station_configuration_path=None, + station_configuration_mapping=dict(), timestamps_pickle_filepath=timestamps_pickle_filepath, now_timestamp=now_timestamp, ) @@ -753,23 +682,18 @@ def test_multiple_stations(self, process_station_mock: mock.MagicMock): self.root_path / f"{station_config02.stid}_hourly.csv", self.root_path / f"{station_config03.stid}_hourly.csv", ] - station_configs = { + station_config_mapping = { station_config01.stid: station_config01, station_config02.stid: station_config02, station_config03.stid: station_config03, } - with self.station_configuration_path.open("w") as fp: - write_station_configuration_mapping( - station_configs, - fp, - ) get_bufr( store_positions=True, bufr_out=self.bufr_root, input_files=input_files, positions_filepath=self.positions_file_path, - station_configuration_path=self.station_configuration_path, + station_configuration_mapping=station_config_mapping, timestamps_pickle_filepath=self.timestamps_pickle_filepath, positions_seed_path=None, now_timestamp=datetime.datetime.now(), @@ -794,7 
+718,7 @@ def test_no_stations(self): bufr_out=self.bufr_root, input_files=(), positions_filepath=self.positions_file_path, - station_configuration_path=None, + station_configuration_mapping=dict(), timestamps_pickle_filepath=self.timestamps_pickle_filepath, now_timestamp=now_timestamp, ) @@ -816,11 +740,9 @@ def test_single_station(self, process_station_mock: mock.MagicMock): stid = "THE_STID_FOR_A_STATION" input_file_path = self.root_path / f"{stid}_hourly.csv" station_configuration = StationConfiguration(stid=stid, export_bufr=True) - with self.station_configuration_path.open("w") as fp: - write_station_configuration_mapping( - dict(stid=station_configuration), - fp, - ) + station_configuration_mapping = { + stid: station_configuration, + } expected_output_path = self.bufr_root / f"{stid}.bufr" expected_latest_timestamp = now_timestamp - datetime.timedelta(days=2) expected_station_configuration = StationConfiguration( @@ -832,7 +754,7 @@ def test_single_station(self, process_station_mock: mock.MagicMock): bufr_out=self.bufr_root, input_files=[input_file_path], positions_filepath=self.positions_file_path, - station_configuration_path=self.station_configuration_path, + station_configuration_mapping=station_configuration_mapping, timestamps_pickle_filepath=self.timestamps_pickle_filepath, positions_seed_path=None, now_timestamp=now_timestamp, @@ -861,7 +783,7 @@ def test_station_without_configuration(self, process_station_mock: mock.MagicMoc bufr_out=self.bufr_root, input_files=[input_file_path], positions_filepath=self.positions_file_path, - station_configuration_path=None, + station_configuration_mapping=dict(), timestamps_pickle_filepath=self.timestamps_pickle_filepath, positions_seed_path=None, now_timestamp=now_timestamp, @@ -892,7 +814,7 @@ def test_latest_timestamp(self, process_station_mock: mock.MagicMock): bufr_out=self.bufr_root, input_files=[input_file_path], positions_filepath=self.positions_file_path, - station_configuration_path=None, + 
station_configuration_mapping=dict(), timestamps_pickle_filepath=self.timestamps_pickle_filepath, positions_seed_path=None, now_timestamp=now_timestamp, @@ -915,11 +837,7 @@ def test_update_timestamp_only(self, process_station_mock: mock.MagicMock): station_config = StationConfiguration( stid=stid, positions_update_timestamp_only=True ) - with self.station_configuration_path.open("w") as fp: - write_station_configuration_mapping( - config_mapping={station_config.stid: station_config}, - fp=fp, - ) + config_mapping = {station_config.stid: station_config} input_file_path = self.root_path / f"{stid}_hourly.csv" seed_timestamp = datetime.datetime(2021, 10, 2, 10, 0) now_timestamp = datetime.datetime(2023, 3, 3, 5, 0) @@ -946,7 +864,7 @@ def test_update_timestamp_only(self, process_station_mock: mock.MagicMock): bufr_out=self.bufr_root, input_files=[input_file_path], positions_filepath=self.positions_file_path, - station_configuration_path=self.station_configuration_path, + station_configuration_mapping=config_mapping, timestamps_pickle_filepath=self.timestamps_pickle_filepath, positions_seed_path=self.positions_seed_path, now_timestamp=now_timestamp, @@ -977,7 +895,7 @@ def test_position_seed(self): bufr_out=self.bufr_root, input_files=(), positions_filepath=self.positions_file_path, - station_configuration_path=None, + station_configuration_mapping=dict(), timestamps_pickle_filepath=self.timestamps_pickle_filepath, positions_seed_path=self.positions_seed_path, now_timestamp=datetime.datetime.now(), diff --git a/tests/unit/bufr_export/test_get_bufr_integration.py b/tests/unit/bufr_export/test_get_bufr_integration.py index f7b3cde4..64e76f74 100644 --- a/tests/unit/bufr_export/test_get_bufr_integration.py +++ b/tests/unit/bufr_export/test_get_bufr_integration.py @@ -17,11 +17,11 @@ from pypromice.postprocess import get_bufr from pypromice.postprocess.bufr_utilities import read_bufr_message, BUFRVariables -from pypromice.postprocess.get_bufr import ( - 
DEFAULT_STATION_CONFIGURATION_PATH, +from pypromice.station_configuration import ( StationConfiguration, write_station_configuration_mapping, ) +from tests.utilities import get_station_configuration logging.basicConfig( stream=sys.stdout, @@ -36,7 +36,7 @@ def run_get_bufr( l3_data: pd.DataFrame, stid: str, latest_timestamps: Optional[Dict[str, datetime.datetime]], - station_configuration_mapping=None, + station_configuration_mapping: Dict[str, StationConfiguration], **get_bufr_kwargs, ) -> Optional[BUFRVariables]: """ @@ -59,22 +59,9 @@ def run_get_bufr( bufr_out = output_path.joinpath("BUFR_out") timestamps_pickle_filepath = output_path.joinpath("latest_timestamps.pickle") positions_filepath = output_path.joinpath("AWS_latest_locations.csv") - station_configuration_path = output_path.joinpath("station_configuration.toml") l3_filepath = output_path.joinpath(f"{stid}_hour.csv") l3_data.to_csv(l3_filepath) - if station_configuration_mapping is None: - shutil.copy( - DEFAULT_STATION_CONFIGURATION_PATH, - station_configuration_path, - ) - else: - with station_configuration_path.open("w") as fp: - write_station_configuration_mapping( - station_configuration_mapping, - fp, - ) - if latest_timestamps is not None: with timestamps_pickle_filepath.open("wb") as fp: pickle.dump(latest_timestamps, fp) @@ -84,7 +71,7 @@ def run_get_bufr( input_files=[l3_filepath], timestamps_pickle_filepath=timestamps_pickle_filepath, positions_filepath=positions_filepath, - station_configuration_path=station_configuration_path, + station_configuration_mapping=station_configuration_mapping, **get_bufr_kwargs, ) diff --git a/tests/unit/test_station_config.py b/tests/unit/test_station_config.py new file mode 100644 index 00000000..a2b117fd --- /dev/null +++ b/tests/unit/test_station_config.py @@ -0,0 +1,143 @@ +from pathlib import Path +from unittest import TestCase +from tempfile import TemporaryDirectory + +from pypromice.station_configuration import ( + StationConfiguration, + 
load_station_configuration_mapping, + write_station_configuration_mapping, +) +from tests.utilities import get_station_configuration + + +class StationConfigurationTestCase(TestCase): + def test_read_toml(self): + with TemporaryDirectory() as temp_dir: + source_path = Path(temp_dir) / "UPE_L.toml" + source_str = """ + stid = "UPE_L" + station_site = "UPE_L" + project = "Promice" + station_type = "mobile" + wmo_id = "04423" + barometer_from_gps = -0.25 + anemometer_from_sonic_ranger = 0.4 + temperature_from_sonic_ranger = 0.0 + height_of_gps_from_station_ground = 0.9 + sonic_ranger_from_gps = 1.3 + export_bufr = true + skipped_variables = [] + positions_update_timestamp_only = false + """ + with source_path.open("w") as source_io: + source_io.writelines(source_str) + + expected_configuration = StationConfiguration( + stid="UPE_L", + station_site="UPE_L", + project="Promice", + station_type="mobile", + wmo_id="04423", + barometer_from_gps=-0.25, + anemometer_from_sonic_ranger=0.4, + temperature_from_sonic_ranger=0.0, + height_of_gps_from_station_ground=0.9, + sonic_ranger_from_gps=1.3, + export_bufr=True, + comment=None, + skipped_variables=[], + positions_update_timestamp_only=False, + ) + + station_configuration = StationConfiguration.load_toml(source_path) + self.assertEqual( + expected_configuration, + station_configuration, + ) + + def test_write_read(self): + with TemporaryDirectory() as temp_dir: + output_path = Path(temp_dir) / "UPE_L.toml" + src_station_config = StationConfiguration( + stid="UPE_L", + station_site="UPE_L", + project="Promice", + station_type="mobile", + wmo_id="04423", + barometer_from_gps=-0.25, + anemometer_from_sonic_ranger=0.4, + temperature_from_sonic_ranger=0.0, + height_of_gps_from_station_ground=0.9, + sonic_ranger_from_gps=1.3, + export_bufr=True, + comment=None, + skipped_variables=[], + positions_update_timestamp_only=False, + ) + src_station_config.dump_toml(output_path) + + read_station_config = 
StationConfiguration.load_toml(output_path) + self.assertEqual( + src_station_config, + read_station_config, + ) + + def test_read_station_config_mapping(self): + with TemporaryDirectory() as temp_dir: + station_config_root = Path(temp_dir) / "station_configs" + station_config_root.mkdir() + source_mapping = { + "UPE_L": get_station_configuration(stid="UPE_L"), + "UPE_R": get_station_configuration(stid="UPE_R"), + } + for stid, station_config in source_mapping.items(): + station_config.dump_toml(station_config_root / f"{stid}.toml") + + read_mapping = load_station_configuration_mapping(station_config_root) + self.assertDictEqual( + source_mapping, + read_mapping, + ) + + def test_write_station_config_mapping(self): + with TemporaryDirectory() as temp_dir: + station_config_root = Path(temp_dir) / "station_configs" + station_config_root.mkdir() + source_mapping = { + "UPE_L": get_station_configuration(stid="UPE_L"), + "UPE_R": get_station_configuration(stid="UPE_R"), + } + + write_station_configuration_mapping(source_mapping, station_config_root) + + read_mapping = load_station_configuration_mapping(station_config_root) + self.assertDictEqual( + source_mapping, + read_mapping, + ) + + def test_read_station_config_mapping_empty(self): + with TemporaryDirectory() as temp_dir: + station_config_root = Path(temp_dir) / "station_configs" + station_config_root.mkdir() + + read_mapping = load_station_configuration_mapping(station_config_root) + self.assertDictEqual( + {}, + read_mapping, + ) + + def test_read_station_config_mapping_ingore_filenames(self): + def test_read_station_config_mapping(self): + with TemporaryDirectory() as temp_dir: + station_config_root = Path(temp_dir) / "station_configs" + station_config_root.mkdir() + station_config = get_station_configuration(stid="UPE_L") + station_config.dump_toml(station_config_root / "a_custom_filename.toml") + expected_station_config_mapping = {station_config.stid: station_config} + + read_mapping = 
load_station_configuration_mapping(station_config_root) + self.assertDictEqual( + expected_station_config_mapping, + read_mapping, + ) diff --git a/tests/utilities.py b/tests/utilities.py new file mode 100644 index 00000000..742f6861 --- /dev/null +++ b/tests/utilities.py @@ -0,0 +1,61 @@ +import random +import uuid + +from pypromice.postprocess.bufr_utilities import BUFR_TEMPLATES +from pypromice.station_configuration import StationConfiguration + +STATION_TYPE_STRINGS = tuple(BUFR_TEMPLATES.keys()) + + +def get_station_configuration(**kwargs) -> StationConfiguration: + """ + Create a StationConfiguration object with random values. + + Parameters + ---------- + kwargs : dict + Keyword arguments to providie explicit values for the StationConfiguration object. + Returns + ------- + """ + stid = kwargs.get("stid", str(uuid.uuid4())) + station_site = kwargs.get("station_site", str(uuid.uuid4())) + project = kwargs.get("project", str(uuid.uuid4())) + station_type = kwargs.get("station_type", random.choice(STATION_TYPE_STRINGS)) + # WMO Station number <1024 for land stations + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/001002 + wmo_id = kwargs.get("wmo_id", "{:05}".format(random.randint(0, 1023))) + barometer_from_gps = kwargs.get("barometer_from_gps", random.random() * 3) + anemometer_from_sonic_ranger = kwargs.get( + "anemometer_from_sonic_ranger", random.random() * 3 + ) + temperature_from_sonic_ranger = kwargs.get( + "temperature_from_sonic_ranger", random.random() * 3 + ) + height_of_gps_from_station_ground = kwargs.get( + "height_of_gps_from_station_ground", random.random() * 3 + ) + sonic_ranger_from_gps = kwargs.get("sonic_ranger_from_gps", random.random() * 3) + export_bufr = kwargs.get("export_bufr", random.random() > 0.5) + skipped_variables = kwargs.get("skipped_variables", []) + positions_update_timestamp_only = kwargs.get( + "positions_update_timestamp_only", random.random() > 0.5 + ) + station_relocation = 
kwargs.get("station_relocation", []) + + return StationConfiguration( + stid=stid, + station_site=station_site, + project=project, + station_type=station_type, + wmo_id=wmo_id, + barometer_from_gps=barometer_from_gps, + anemometer_from_sonic_ranger=anemometer_from_sonic_ranger, + temperature_from_sonic_ranger=temperature_from_sonic_ranger, + height_of_gps_from_station_ground=height_of_gps_from_station_ground, + sonic_ranger_from_gps=sonic_ranger_from_gps, + export_bufr=export_bufr, + skipped_variables=skipped_variables, + positions_update_timestamp_only=positions_update_timestamp_only, + station_relocation=station_relocation, + ) From 828cba29d3b18c767f074ec1a71d5ffce4f3c491 Mon Sep 17 00:00:00 2001 From: Mads Christian Lund Date: Tue, 9 Jul 2024 09:38:33 +0200 Subject: [PATCH 11/16] Updated BUFRVariables with scales and descriptions * Added detailed descriptions with references to the attributes in BUFRVariables * Changed the attribute order to align with the exported schema * Changed variable roundings to align with the scales defined in the BUFR schemas: * Latitude and longitude are set to 5. Was 6 * heightOfStationGroundAboveMeanSeaLevel is set to 1. Was 2 * heightOfBarometerAboveMeanSeaLevel is set to 1. Was 2 * pressure is set to -1. Was 1. Note: The BUFRVariable unit is Pa and not hPa * airTemperature is set to 2. Was 1. * heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH is set to 2. Was 4 * heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD is set to 2. 
Was 4 * Added unit tests to test the roundings * Updated existing unit tests to align with corrected precision --- src/pypromice/postprocess/bufr_utilities.py | 81 ++++++++++++++++--- .../unit/bufr_export/test_bufr_utilitites.py | 34 ++++++++ .../bufr_export/test_create_bufr_files.py | 2 +- tests/unit/bufr_export/test_get_bufr.py | 8 +- .../bufr_export/test_get_bufr_integration.py | 12 +-- 5 files changed, 113 insertions(+), 24 deletions(-) diff --git a/src/pypromice/postprocess/bufr_utilities.py b/src/pypromice/postprocess/bufr_utilities.py index 00e036d8..8537e7f2 100644 --- a/src/pypromice/postprocess/bufr_utilities.py +++ b/src/pypromice/postprocess/bufr_utilities.py @@ -66,28 +66,81 @@ class BUFRVariables: """ - wmo_id: str + # Station type: "mobile" or "land" + # =============================== + # Fixed land station schema: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/307080 + # Mobile station schema: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/307090 + station_type: str + + # WMO station identifier + # Land stations: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/301090 + # Mobile stations: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/301092 + # ====================================================================================================== + wmo_id: str timestamp: datetime.datetime - relativeHumidity: float = attrs.field(converter=round_converter(0)) - airTemperature: float = attrs.field(converter=round_converter(1)) - pressure: float = attrs.field(converter=round_converter(1)) - windDirection: float = attrs.field(converter=round_converter(0)) - windSpeed: float = attrs.field(converter=round_converter(1)) - latitude: float = attrs.field(converter=round_converter(6)) - longitude: float = attrs.field(converter=round_converter(6)) + + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/005001 + # Scale: 5, unit: degrees + 
# TODO: Test if eccodes does the rounding as well. The rounding is was 6 which is larger that the scale. + latitude: float = attrs.field(converter=round_converter(5)) + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/006001 + # Scale: 5, unit: degrees + longitude: float = attrs.field(converter=round_converter(5)) + + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/007030 + # Scale: 1, unit: m heightOfStationGroundAboveMeanSeaLevel: float = attrs.field( - converter=round_converter(2) + converter=round_converter(1) ) - # + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/007031 + # Scale: 1, unit: m heightOfBarometerAboveMeanSeaLevel: float = attrs.field( - converter=round_converter(2), + converter=round_converter(1), ) + + # Pressure information + # ==================== + # Definition table: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/302031 + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/007004 + # Scale: -1, unit: Pa + pressure: float = attrs.field(converter=round_converter(-1)) + # There are two other pressure variables in the template: 302001 and 010062. + + # Basic synoptic "instantaneous" data + # =================================== + # Definition table: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/302035 + # This section only include the temperature and humidity data (302032). + # Precipitation and cloud data are currently ignored. + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/007032 + # Scale: 2, unit: m + # This is the first appearance of this variable id. 
heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH: float = attrs.field( - converter=round_converter(4), + converter=round_converter(2), ) + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/012101 + # Scale: 2, unit: K + airTemperature: float = attrs.field(converter=round_converter(2)) + # There is also a Dewpoint temperature in this template: 012103 which is currently unused. + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/012103 + # Scale: 0, unit: % + relativeHumidity: float = attrs.field(converter=round_converter(0)) + + # Basic synoptic "period" data + # ============================ + # Definition table: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/302043 + # Wind data: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/302042 + # Wind direction: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/011001 + # Scale: 0, unit: degrees + windDirection: float = attrs.field(converter=round_converter(0)) + # Wind speed: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/011002 + # Scale: 1, unit: m/s + windSpeed: float = attrs.field(converter=round_converter(1)) + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/007032 + # Scale: 2, unit: m + # This is the 7th appearance of this variable id. 
heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD: float = attrs.field( - converter=round_converter(4) + converter=round_converter(2) ) def as_series(self) -> pd.Series: @@ -131,6 +184,7 @@ def __eq__(self, other: "BUFRVariables"): BUFR_TEMPLATES = { "mobile": { + # Template definition: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/307090 "unexpandedDescriptors": (307090), # message template, "synopMobil" "edition": 4, # latest edition "masterTableNumber": 0, @@ -146,6 +200,7 @@ def __eq__(self, other: "BUFRVariables"): "compressedData": 0, }, "land": { + # Template definition: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/307080 "unexpandedDescriptors": (307080), # message template, "synopLand" "edition": 4, # latest edition "masterTableNumber": 0, diff --git a/tests/unit/bufr_export/test_bufr_utilitites.py b/tests/unit/bufr_export/test_bufr_utilitites.py index 2b9a19b1..49ecc203 100644 --- a/tests/unit/bufr_export/test_bufr_utilitites.py +++ b/tests/unit/bufr_export/test_bufr_utilitites.py @@ -181,3 +181,37 @@ def test_nan_value_serialization(self): variables_src, variables_read, ) + + def test_precision(self): + """ + Test if the BUFRVariable rounding configurations aligns with the BUFR format. + + Use np.random.random() to generate high precision random values. 
+ """ + variables_src = BUFRVariables( + wmo_id="04464", + station_type="mobile", + timestamp=datetime.datetime(2023, 12, 19, 10, 0), + relativeHumidity=np.random.random(), + airTemperature=np.random.random(), + pressure=1000*np.random.random(), + windDirection=np.random.random(), + windSpeed=np.random.random(), + latitude=np.random.random(), + longitude=np.random.random(), + heightOfStationGroundAboveMeanSeaLevel=np.random.random(), + heightOfBarometerAboveMeanSeaLevel=np.random.random(), + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=np.random.random(), + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=np.random.random(), + ) + with tempfile.TemporaryFile("w+b") as fp: + write_bufr_message(variables=variables_src, file=fp) + fp.seek(0) + variables_read = read_bufr_message( + fp=fp, + ) + + self.assertEqual( + variables_src, + variables_read, + ) diff --git a/tests/unit/bufr_export/test_create_bufr_files.py b/tests/unit/bufr_export/test_create_bufr_files.py index f9cf935e..2cb25afc 100644 --- a/tests/unit/bufr_export/test_create_bufr_files.py +++ b/tests/unit/bufr_export/test_create_bufr_files.py @@ -32,7 +32,7 @@ def tearDown(self): def test_create_bufr_files(self): """ - Teste the creation of bufr files and their output folder structure. + Test the creation of bufr files and their output folder structure. It does not test the content of the bufr files. 
""" input_dir = self.temp_dir / "input" diff --git a/tests/unit/bufr_export/test_get_bufr.py b/tests/unit/bufr_export/test_get_bufr.py index 2af06c4c..edb93b5b 100644 --- a/tests/unit/bufr_export/test_get_bufr.py +++ b/tests/unit/bufr_export/test_get_bufr.py @@ -54,7 +54,7 @@ def test_bufr_variables_gcnet(self): stid=station_configuration.stid, station_configuration=station_configuration, relativeHumidity=69.0, - airTemperature=256.0, + airTemperature=255.95, pressure=77300.0, windDirection=149.0, windSpeed=14.9, @@ -84,7 +84,7 @@ def test_bufr_variables_promice_v2(self): stid=station_configuration.stid, station_configuration=station_configuration, relativeHumidity=69.0, - airTemperature=256.0, + airTemperature=255.95, pressure=77300.0, windDirection=149.0, windSpeed=14.9, @@ -114,7 +114,7 @@ def test_bufr_variables_promice_v3(self): stid=station_configuration.stid, station_configuration=station_configuration, relativeHumidity=69.0, - airTemperature=256.0, + airTemperature=255.95, pressure=77300.0, windDirection=149.0, windSpeed=14.9, @@ -165,7 +165,7 @@ def test_bufr_variables_static_gps_elevation(self): station_type=station_config.station_type, timestamp=timestamp, relativeHumidity=1.0, - airTemperature=252.2, # Converted to kelvin + airTemperature=252.15, # Converted to kelvin pressure=199300.0, windDirection=32.0, windSpeed=5.3, diff --git a/tests/unit/bufr_export/test_get_bufr_integration.py b/tests/unit/bufr_export/test_get_bufr_integration.py index 64e76f74..fd755fd9 100644 --- a/tests/unit/bufr_export/test_get_bufr_integration.py +++ b/tests/unit/bufr_export/test_get_bufr_integration.py @@ -141,7 +141,7 @@ def test_get_bufr_has_new_data(self): # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 timestamp=datetime.datetime(2023, 12, 7, 23, 00), relativeHumidity=69, - airTemperature=256.0, + airTemperature=255.95, pressure=77300.0, windDirection=149, windSpeed=14.9, @@ -180,7 +180,7 @@ def test_get_bufr_has_new_data_dont_store_position(self): # 
Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 timestamp=datetime.datetime(2023, 12, 7, 23, 00), relativeHumidity=69, - airTemperature=256.0, + airTemperature=255.95, pressure=77300.0, windDirection=149, windSpeed=14.9, @@ -260,7 +260,7 @@ def test_get_bufr_includes_datasets_not_in_latests_timestamps(self): # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 timestamp=datetime.datetime(2023, 12, 7, 23, 00), relativeHumidity=69, - airTemperature=256.0, + airTemperature=255.95, pressure=77300.0, windDirection=149, windSpeed=14.9, @@ -321,7 +321,7 @@ def test_invalid_value_at_last_index(self): # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 timestamp=datetime.datetime(2023, 12, 7, 23, 00), relativeHumidity=69, - airTemperature=256.0, + airTemperature=255.95, pressure=np.nan, windDirection=149, windSpeed=14.9, @@ -459,7 +459,7 @@ def test_ignore_newer_data_than_now_input(self): # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 but now_timestamp is 2023-12-06 timestamp=datetime.datetime(2023, 12, 6, 0, 0), relativeHumidity=82, - airTemperature=250.8, + airTemperature=250.85, pressure=77370.0, windDirection=153, windSpeed=10.4, @@ -500,7 +500,7 @@ def test_land_station_export(self): # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 timestamp=datetime.datetime(2023, 12, 7, 23, 00), relativeHumidity=69, - airTemperature=256.0, + airTemperature=255.95, pressure=77300.0, windDirection=149, windSpeed=14.9, From c6d15ea9b0bf2057e776c6f8c492a31797c585bb Mon Sep 17 00:00:00 2001 From: Mads Christian Lund Date: Tue, 9 Jul 2024 11:31:45 +0200 Subject: [PATCH 12/16] Increased the real_time_utilities rounding precisions --- .../postprocess/real_time_utilities.py | 12 +++++----- tests/unit/bufr_export/test_get_bufr.py | 16 ++++++------- .../bufr_export/test_get_bufr_integration.py | 24 +++++++++---------- .../bufr_export/test_realtime_utilitites.py | 24 +++++++++---------- 4 files changed, 38 insertions(+), 38 deletions(-) diff 
--git a/src/pypromice/postprocess/real_time_utilities.py b/src/pypromice/postprocess/real_time_utilities.py index 17efac92..2352f30d 100644 --- a/src/pypromice/postprocess/real_time_utilities.py +++ b/src/pypromice/postprocess/real_time_utilities.py @@ -72,7 +72,7 @@ def get_latest_data( # Apply smoothing to z_boom_u # require at least 2 hourly obs? Sometimes seeing once/day data for z_boom_u - df_limited = rolling_window(df_limited, "z_boom_u", "72H", 2, 1) + df_limited = rolling_window(df_limited, "z_boom_u", "72H", 2, 3) # limit to single most recent valid row (convert to series) s_current = df_limited.loc[last_valid_index] @@ -149,9 +149,9 @@ def find_positions(df, time_limit): logger.info(f"last transmission: {df_limited.index.max()}") # Extrapolate recommended for altitude, optional for lat and lon. - df_limited, lat_valid = linear_fit(df_limited, "gps_lat", 6) - df_limited, lon_valid = linear_fit(df_limited, "gps_lon", 6) - df_limited, alt_valid = linear_fit(df_limited, "gps_alt", 1) + df_limited, lat_valid = linear_fit(df_limited, "gps_lat", 7) + df_limited, lon_valid = linear_fit(df_limited, "gps_lon", 7) + df_limited, alt_valid = linear_fit(df_limited, "gps_alt", 4) # If we have no valid lat, lon or alt data in the df_limited window, then interpolate # using full tx dataset. 
@@ -162,9 +162,9 @@ def find_positions(df, time_limit): logger.info(f"----> Using full history for linear extrapolation: {k}") logger.info(f"first transmission: {df.index.min()}") if k == "gps_alt": - df, valid = linear_fit(df, k, 1) + df, valid = linear_fit(df, k, 2) else: - df, valid = linear_fit(df, k, 6) + df, valid = linear_fit(df, k, 7) check_valid_again[k] = valid if check_valid_again[k] is True: df_limited[f"{k}_fit"] = df.loc[df_limited.index, f"{k}_fit"] diff --git a/tests/unit/bufr_export/test_get_bufr.py b/tests/unit/bufr_export/test_get_bufr.py index edb93b5b..a100cd55 100644 --- a/tests/unit/bufr_export/test_get_bufr.py +++ b/tests/unit/bufr_export/test_get_bufr.py @@ -61,9 +61,9 @@ def test_bufr_variables_gcnet(self): latitude=66.482488, longitude=-46.294266, heightOfStationGroundAboveMeanSeaLevel=2123.2, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.6, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, - heightOfBarometerAboveMeanSeaLevel=2125.25, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.59, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, + heightOfBarometerAboveMeanSeaLevel=2125.3, ) def test_bufr_variables_promice_v2(self): @@ -91,9 +91,9 @@ def test_bufr_variables_promice_v2(self): latitude=66.482488, longitude=-46.294266, heightOfStationGroundAboveMeanSeaLevel=2123.8, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.2, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, - heightOfBarometerAboveMeanSeaLevel=2124.45, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.19, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, + heightOfBarometerAboveMeanSeaLevel=2124.5, ) def test_bufr_variables_promice_v3(self): @@ -121,8 +121,8 @@ def test_bufr_variables_promice_v3(self): latitude=66.482488, longitude=-46.294266, heightOfStationGroundAboveMeanSeaLevel=2123.8, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.2, - 
heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.19, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, heightOfBarometerAboveMeanSeaLevel=2126, ) diff --git a/tests/unit/bufr_export/test_get_bufr_integration.py b/tests/unit/bufr_export/test_get_bufr_integration.py index fd755fd9..1a21b3ee 100644 --- a/tests/unit/bufr_export/test_get_bufr_integration.py +++ b/tests/unit/bufr_export/test_get_bufr_integration.py @@ -149,8 +149,8 @@ def test_get_bufr_has_new_data(self): longitude=-46.29427, heightOfStationGroundAboveMeanSeaLevel=2123.7, heightOfBarometerAboveMeanSeaLevel=2124.7, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.09, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, ) pd.testing.assert_series_equal( bufr_data.as_series(), @@ -188,8 +188,8 @@ def test_get_bufr_has_new_data_dont_store_position(self): longitude=-46.29427, heightOfStationGroundAboveMeanSeaLevel=2123.7, heightOfBarometerAboveMeanSeaLevel=2124.7, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.09, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, ) pd.testing.assert_series_equal( bufr_data.as_series(), @@ -268,8 +268,8 @@ def test_get_bufr_includes_datasets_not_in_latests_timestamps(self): longitude=-46.29427, heightOfStationGroundAboveMeanSeaLevel=2123.7, heightOfBarometerAboveMeanSeaLevel=2124.7, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.09, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, ) pd.testing.assert_series_equal( 
bufr_data.as_series(), @@ -329,8 +329,8 @@ def test_invalid_value_at_last_index(self): longitude=-46.29427, heightOfStationGroundAboveMeanSeaLevel=2123.7, heightOfBarometerAboveMeanSeaLevel=2124.7, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.09, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, ) pd.testing.assert_series_equal( bufr_data.as_series(), @@ -467,8 +467,8 @@ def test_ignore_newer_data_than_now_input(self): longitude=-46.29426, heightOfStationGroundAboveMeanSeaLevel=2123.3, heightOfBarometerAboveMeanSeaLevel=2124.3, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.09, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, ) pd.testing.assert_series_equal( bufr_data.as_series(), @@ -508,8 +508,8 @@ def test_land_station_export(self): longitude=-46.29427, heightOfStationGroundAboveMeanSeaLevel=2123.7, heightOfBarometerAboveMeanSeaLevel=2124.7, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.1, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.09, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, ) pd.testing.assert_series_equal( bufr_data.as_series(), diff --git a/tests/unit/bufr_export/test_realtime_utilitites.py b/tests/unit/bufr_export/test_realtime_utilitites.py index a7306a3f..1acdb5b7 100644 --- a/tests/unit/bufr_export/test_realtime_utilitites.py +++ b/tests/unit/bufr_export/test_realtime_utilitites.py @@ -50,10 +50,10 @@ def test_1(self): "gps_lon": -46.294232, "gps_alt": 2116.0, "z_boom_u": 4.1901, - "gps_lat_fit": 66.482479, - "gps_lon_fit": -46.294269, - "gps_alt_fit": 2121.4, - "z_boom_u_smooth": 4.2, + "gps_lat_fit": 
66.4824788, + "gps_lon_fit": -46.2942685, + "gps_alt_fit": 2121.4118, + "z_boom_u_smooth": 4.188, }, name=datetime.datetime(2023, 12, 7, 6), ) @@ -94,10 +94,10 @@ def test_latest_data_row_is_invalid(self): "gps_lon": -46.294335, "gps_alt": 2125.0, "z_boom_u": 4.1844, - "gps_lat_fit": 66.482483, - "gps_lon_fit": -46.294275, - "gps_alt_fit": 2123.3, - "z_boom_u_smooth": 4.2, + "gps_lat_fit": 66.4824828, + "gps_lon_fit": -46.2942753, + "gps_alt_fit": 2123.3088, + "z_boom_u_smooth": 4.187, }, name=expected_output_timestamp, ) @@ -127,10 +127,10 @@ def test_latest_data_has_some_invalid_values(self): "gps_lon": -46.294232, "gps_alt": 2116.0, "z_boom_u": 4.1901, - "gps_lat_fit": 66.482479, - "gps_lon_fit": -46.294269, - "gps_alt_fit": 2121.4, - "z_boom_u_smooth": 4.2, + "gps_lat_fit": 66.4824788, + "gps_lon_fit": -46.2942685, + "gps_alt_fit": 2121.4118, + "z_boom_u_smooth": 4.188, }, name=datetime.datetime(2023, 12, 7, 6), ) From fb7c69270d05026eca29eaca2f132bd334fe8823 Mon Sep 17 00:00:00 2001 From: Mads Christian Lund Date: Thu, 18 Jul 2024 14:25:38 +0200 Subject: [PATCH 13/16] Updated get_bufr to separate station position from bufr * The station position determination (AWS_latest_locations) is separated from the bufr file export * Updated the unit tests Corrected minimum data check to allow p_i or t_i to be nan Renamed process_station parameters for readability * Rename now_timestamp -> target_timestamp * Rename time_limit -> linear_regression_time_limit Applied black --- .../postprocess/create_bufr_files.py | 4 +- src/pypromice/postprocess/get_bufr.py | 328 ++-- .../postprocess/real_time_utilities.py | 30 +- src/pypromice/station_configuration.py | 18 +- .../unit/bufr_export/test_bufr_utilitites.py | 2 +- .../bufr_export/test_create_bufr_files.py | 8 +- tests/unit/bufr_export/test_get_bufr.py | 1392 ++++++++--------- .../bufr_export/test_get_bufr_integration.py | 104 +- .../bufr_export/test_realtime_utilitites.py | 32 + tests/utilities.py | 2 +- 10 files changed, 940 
insertions(+), 980 deletions(-) diff --git a/src/pypromice/postprocess/create_bufr_files.py b/src/pypromice/postprocess/create_bufr_files.py index 2b9925c0..1b6b4b78 100644 --- a/src/pypromice/postprocess/create_bufr_files.py +++ b/src/pypromice/postprocess/create_bufr_files.py @@ -59,9 +59,9 @@ def create_bufr_files( input_files=input_files, store_positions=False, positions_filepath=None, - time_limit=DEFAULT_LIN_REG_TIME_LIMIT, + linear_regression_time_limit=DEFAULT_LIN_REG_TIME_LIMIT, timestamps_pickle_filepath=None, - now_timestamp=period, + target_timestamp=period, station_configuration_mapping=station_configuration_mapping, positions_seed_path=DEFAULT_POSITION_SEED_PATH, break_on_error=break_on_error, diff --git a/src/pypromice/postprocess/get_bufr.py b/src/pypromice/postprocess/get_bufr.py index ab198236..c08b6b95 100644 --- a/src/pypromice/postprocess/get_bufr.py +++ b/src/pypromice/postprocess/get_bufr.py @@ -4,6 +4,13 @@ Post-processing functions for AWS station data, such as converting PROMICE and GC-Net data files to WMO-compliant BUFR files """ +__all__ = [ + "get_bufr", + "main", + "DEFAULT_POSITION_SEED_PATH", + "DEFAULT_LIN_REG_TIME_LIMIT", +] + import argparse import glob import logging @@ -11,7 +18,7 @@ import sys from datetime import datetime, timedelta from pathlib import Path -from typing import List, Dict, Optional, Collection, Sequence, Mapping +from typing import List, Dict, Optional, Collection, Sequence, Mapping, BinaryIO import numpy as np import pandas as pd @@ -19,12 +26,6 @@ from pypromice.postprocess.bufr_utilities import write_bufr_message, BUFRVariables from pypromice.postprocess.real_time_utilities import get_latest_data -__all__ = [ - "get_bufr", - "main", - "DEFAULT_POSITION_SEED_PATH", - "DEFAULT_LIN_REG_TIME_LIMIT", -] from pypromice.station_configuration import ( StationConfiguration, @@ -35,79 +36,27 @@ DEFAULT_POSITION_SEED_PATH = Path(__file__).parent.joinpath("positions_seed.csv") DEFAULT_LIN_REG_TIME_LIMIT = "91d" 
+REQUIRED_KEYS = ( + "t_i", + "p_i", + "rh_i", + "wdir_i", + "wspd_i", + "gps_lat_fit", + "gps_lon_fit", + "gps_alt_fit", + "z_boom_u_smooth", +) -def process_station( - file_path: Path, - output_path: Path, - now_timestamp: datetime, - latest_timestamp: Optional[datetime], - time_limit: str, - stid: str, - station_configuration: StationConfiguration, -) -> Optional[Dict]: - df = load_data(file_path, now_timestamp) - - # Select current data - latest_data = get_latest_data( - df, - lin_reg_time_limit=time_limit, - ) - - if latest_data is None: - logger.info("No valid instantaneous timestamps!") - return None - - latest_data = filter_skipped_variables( - latest_data, vars_to_skip=station_configuration.skipped_variables - ) - - # Check that we have minimum required valid data - sufficient_wx_data, sufficient_position_data = min_data_check(latest_data) - - station_position = dict() - station_position["timestamp"] = latest_data.name - if sufficient_position_data: - station_position["lon"] = latest_data.get("gps_lon_fit") - station_position["lat"] = latest_data.get("gps_lat_fit") - station_position["alt"] = latest_data.get("gps_alt_fit") - else: - logger.warning("Insufficient position data") - # Don't use any position attributes from latest_data - station_position["lon"] = None - station_position["lat"] = None - station_position["alt"] = None - return station_position - - if station_configuration.export_bufr: - if not sufficient_wx_data: - logger.warning(f"Failed min data wx {stid}") - return station_position - - # Store current timest - if latest_data.name <= latest_timestamp: - logger.info(f"No new data {latest_data.name} <= {latest_timestamp}") - return station_position - - # Construct and export BUFR file - bufr_variables = get_bufr_variables( - data=latest_data, - station_configuration=station_configuration, - ) - with output_path.open("bw") as fp: - write_bufr_message(variables=bufr_variables, file=fp) - - return station_position - - -def load_data(file_path: Path, 
now_timestamp: datetime) -> pd.DataFrame: +def load_data(file_path: Path, latest_timestamp: datetime) -> pd.DataFrame: """ - Read AWS data from csv file using time as index and filter all rows after now_timestamp + Read AWS data from csv file using time as index and filter all rows after latest_timestamp Parameters ---------- file_path - now_timestamp + latest_timestamp Returns ------- @@ -119,7 +68,7 @@ def load_data(file_path: Path, now_timestamp: datetime) -> pd.DataFrame: .set_index("time") .sort_index() ) - df = df[:now_timestamp] + df = df[:latest_timestamp] return df @@ -129,11 +78,11 @@ def get_bufr( positions_filepath: Optional[Path], timestamps_pickle_filepath: Optional[Path], station_configuration_mapping: Mapping[str, StationConfiguration], - now_timestamp: Optional[datetime] = None, + target_timestamp: Optional[datetime] = None, positions_seed_path: Optional[Path] = None, - earliest_timestamp: datetime = None, + time_window_length: timedelta = timedelta(days=2), store_positions: bool = False, - time_limit: str = "91d", + linear_regression_time_limit: str = "91d", break_on_error: bool = False, ): """ @@ -148,38 +97,41 @@ def get_bufr( bufr_out Path to the BUFR out directory. input_files - List of L3 csv file paths. + List of csv file paths. positions_filepath Path to write latest positions. Used to retrieve a static set of positions to register stations with DMI/WMO timestamps_pickle_filepath Path to pickle file used for storing latest timestamp station_configuration_mapping Mapping of station id to StationConfiguration object - now_timestamp - get_bufr will export the latest data before now_timestamp. Default datetime.utcnow() + target_timestamp + get_bufr will export the latest data before target_timestamp. Default datetime.utcnow() positions_seed_path Path to csv file with position data used as default values for the output position. - earliest_timestamp - The earliest allowed timestamp for data to be included in the output. 
Default now_timestamp - 2 days + time_window_length + The length of the time window to consider for the latest data. Default 2 days store_positions Flag determine if latest positions are exported. - time_limit + linear_regression_time_limit Previous time to limit dataframe before applying linear regression. break_on_error If True, the function will raise an exception if an error occurs during processing. """ - if now_timestamp is None: - now_timestamp = datetime.utcnow() + if target_timestamp is None: + target_timestamp = datetime.utcnow() - if earliest_timestamp is None: - earliest_timestamp = now_timestamp - timedelta(days=2) + # if earliest_timestamp is None: + # earliest_timestamp = now_timestamp - timedelta(days=2) # Prepare (latest) positions positions = dict() if positions_seed_path: positions_seed = pd.read_csv( - positions_seed_path, index_col=0, delimiter=",", parse_dates=["timestamp"] + positions_seed_path, + index_col="stid", + delimiter=",", + parse_dates=["timestamp"], ).to_dict(orient="index") logger.info(f"Seed positions for {positions_seed.keys()}") positions.update(positions_seed) @@ -195,9 +147,6 @@ def get_bufr( logger.info("latest_timestamps.pickle not found!") latest_timestamps = {} - # Initiate a new dict for current timestamps - current_timestamps = {} - # Setup diagnostic lists (logger.info at end) skipped = [] no_recent_data = [] @@ -220,42 +169,60 @@ def get_bufr( output_path = bufr_out / f"{stid}.bufr" logger.info(f"Generating {output_path} from {file_path}") - latest_timestamp = latest_timestamps.get(stid, earliest_timestamp) - latest_timestamp = max(earliest_timestamp, latest_timestamp) + + time_window_start = target_timestamp - time_window_length + # Use only newer data than the latest timestamp + if stid in latest_timestamps: + time_window_start = max(latest_timestamps[stid], time_window_start) try: - station_position = process_station( - file_path=file_path, - output_path=output_path, - now_timestamp=now_timestamp, - 
latest_timestamp=latest_timestamp, - time_limit=time_limit, - stid=stid, - station_configuration=station_configuration, - ) - except Exception: - logger.exception(f"Failed processing {stid}") - if break_on_error: - raise - continue + input_data = load_data(file_path, target_timestamp) - if station_position is None: - logger.warning(f"No position information available for {stid}") + # Select current data + latest_data = get_latest_data( + input_data, + lin_reg_time_limit=linear_regression_time_limit, + vars_to_skip=station_configuration.skipped_variables, + ) + if latest_data is None: + logger.info("No valid instantaneous timestamps!") + continue - else: + # Create station positions + station_position = get_station_positions(latest_data) if stid not in positions: positions[stid] = dict() - if station_configuration.positions_update_timestamp_only: positions[stid]["timestamp"] = station_position["timestamp"] else: positions[stid].update(station_position) + # Create BUFR File + if ( + station_configuration.export_bufr + and latest_data.name > time_window_start + ): + latest_timestamps[stid] = latest_data.name + bufr_variables = get_bufr_variables(latest_data, station_configuration) + if bufr_variables: + with output_path.open("bw") as output_file: + write_bufr_message(bufr_variables, output_file) + else: + logger.info(f"No new data {latest_data.name} <= {time_window_start}") + + except Exception: + logger.exception(f"Failed processing {stid}") + if output_path.exists(): + output_path.unlink() + if break_on_error: + raise + continue + # Write the most recent timestamps back to the pickle on disk logger.info(f"writing latest_timestamps to {timestamps_pickle_filepath}") if timestamps_pickle_filepath: with timestamps_pickle_filepath.open("wb") as handle: - pickle.dump(current_timestamps, handle, protocol=pickle.HIGHEST_PROTOCOL) + pickle.dump(latest_timestamps, handle, protocol=pickle.HIGHEST_PROTOCOL) if store_positions: positions_df = pd.DataFrame.from_dict( @@ -289,38 
+256,34 @@ def get_bufr( logger.info("--------------------------------") -def filter_skipped_variables( - row: pd.Series, vars_to_skip: Collection[str] -) -> pd.Series: - """ - Mutate input series by setting var_to_skip to np.nan - - Parameters - ---------- - row - vars_to_skip - List of variable names to be skipped - - Returns - ------- - Input series - - """ - vars_to_skip = set(row.keys()) & set(vars_to_skip) - for var_key in vars_to_skip: - row[var_key] = np.nan - logger.info("----> Skipping var: {}".format(var_key)) - return row +def get_station_positions(latest_data: pd.Series) -> Dict: + station_position = dict() + station_position["timestamp"] = latest_data.name + station_position["lat"] = latest_data["gps_lat_fit"] + station_position["lon"] = latest_data["gps_lon_fit"] + station_position["alt"] = latest_data["gps_alt_fit"] + if any( + [ + pd.isna(station_position["lat"]), + pd.isna(station_position["lon"]), + pd.isna(station_position["alt"]), + ] + ): + logger.warning("Insufficient position data") + station_position["lat"] = None + station_position["lon"] = None + station_position["alt"] = None + return station_position def get_bufr_variables( data: pd.Series, station_configuration: StationConfiguration, -) -> BUFRVariables: +) -> Optional[BUFRVariables]: """ Helper function for converting our variables to the variables needed for bufr export. - Raises AttributeError if station_configuration dont have the minimum dimension fields since they are required to determine barometer heights. + Raises AttributeError if station_configuration don't have the minimum dimension fields since they are required to determine barometer heights. 
* height_of_gps_from_station_ground * barometer_from_gps @@ -329,7 +292,7 @@ def get_bufr_variables( Parameters ---------- data - Series with processed l3 variables from get_latest_datas + Series with processed variables from get_latest_datas station_configuration @@ -339,6 +302,24 @@ def get_bufr_variables( """ + if not all(key in data.index for key in REQUIRED_KEYS): + raise ValueError( + f"Failed to process BUFRVariables. Missing required keys: {REQUIRED_KEYS}" + ) + + # Check that we have minimum required fields to proceed with writing to BUFR + # Always require minimum a valid air temp or a valid pressure. + # If both air temp and pressure are nan, do not submit. + # This will allow the case of having only one or the other. + if data[["t_i", "p_i"]].isna().all(): + logger.warning("Failed to process BUFRVariables - insufficient data") + return None + + # Always require a valid position data + if data[["gps_lat_fit", "gps_lon_fit", "gps_alt_fit"]].isna().any(): + logger.warning("Failed to process BUFRVariables - insufficient position data") + return None + if station_configuration.height_of_gps_from_station_ground is None: raise AttributeError( "height_of_gps_from_station_ground is required for BUFR export" @@ -362,7 +343,6 @@ def get_bufr_variables( height_of_gps_above_mean_sea_level + station_configuration.barometer_from_gps ) - if station_configuration.temperature_from_sonic_ranger is None: heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = np.nan else: @@ -402,59 +382,6 @@ def get_bufr_variables( return output_row -def min_data_check(s): - """Check that we have minimum required fields to proceed with writing to BUFR - For wx vars, we currently require both air temp and pressure to be non-NaN. - If you know a specific var is reporting bad data, you can ignore just that var - using the vars_to_skip dict in wmo_config. 
- - Parameters - ---------- - s : pandas series - The current obset we are working with (for BUFR submission) - - Returns - ------- - min_data_wx_result : bool - True (default), the test for min wx data passed. False, the test failed. - min_data_pos_result : bool - True (default), the test for min position data passed. False, the test failed. - """ - min_data_wx_result = True - min_data_pos_result = True - - # Can use pd.isna() or math.isnan() below... - - # Always require valid air temp and valid pressure (both must be non-nan) - # if (pd.isna(s['t_i']) is False) and (pd.isna(s['p_i']) is False): - # pass - # else: - # print('----> Failed min_data_check for air temp and pressure!') - # min_data_wx_result = False - - # If both air temp and pressure are nan, do not submit. - # This will allow the case of having only one or the other. - if (pd.isna(s["t_i"]) is True) and (pd.isna(s["p_i"]) is True): - logger.warning("----> Failed min_data_check for air temp and pressure!") - min_data_wx_result = False - - # Missing just elevation OK - # if (pd.isna(s['gps_lat_fit']) is False) and (pd.isna(s['gps_lon_fit']) is False): - # pass - # Require all three: lat, lon, elev - if ( - (pd.isna(s["gps_lat_fit"]) is False) - and (pd.isna(s["gps_lon_fit"]) is False) - and (pd.isna(s["gps_alt_fit"]) is False) - ): - pass - else: - logger.warning("----> Failed min_data_check for position!") - min_data_pos_result = False - - return min_data_wx_result, min_data_pos_result - - def main(): parser = argparse.ArgumentParser() parser.add_argument( @@ -473,6 +400,7 @@ def main(): help="Path to write AWS_latest_locations.csv file.", ) parser.add_argument( + "--linear_regression_time_limit", "--time-limit", default=DEFAULT_LIN_REG_TIME_LIMIT, type=str, @@ -481,12 +409,11 @@ def main(): ) parser.add_argument( "--input_files", - "--l3-filepath", "-i", type=Path, nargs="+", required=True, - help="Path to L3 tx .csv files. 
Can be direct paths or glob patterns", + help="Path to input files .csv files. Can be direct paths or glob patterns", ) parser.add_argument( "--bufr-out", @@ -502,7 +429,7 @@ def main(): help="Path to the latest_timestamps.pickle file.", ) parser.add_argument( - "--station_configuration_root", + "--station_configurations_root", type=Path, required=True, help="Path to root directory containing station configuration toml files", @@ -515,7 +442,8 @@ def main(): help="Path to csv file with seed values for output positions.", ) parser.add_argument( - "--latest_timestamp", + "--target_timestamp", + "--now-timestamp", default=datetime.utcnow(), type=pd.Timestamp, help="Timestamp used to determine latest data. Default utcnow.", @@ -542,17 +470,19 @@ def main(): # The input path might be a glob pattern input_files += map(Path, glob.glob(path.as_posix())) - station_configuration_mapping = load_station_configuration_mapping(args.station_configuration_root) + station_configuration_mapping = load_station_configuration_mapping( + args.station_configurations_root + ) get_bufr( bufr_out=args.bufr_out, input_files=input_files, store_positions=args.store_positions, positions_filepath=args.positions_filepath, - time_limit=args.time_limit, + linear_regression_time_limit=args.linear_regression_time_limit, timestamps_pickle_filepath=args.timestamps_pickle_filepath, - now_timestamp=args.latest_timestamp, - station_configuration_mapping=args.station_configuration_mapping, + target_timestamp=args.target_timestamp, + station_configuration_mapping=station_configuration_mapping, positions_seed_path=args.position_seed, ) diff --git a/src/pypromice/postprocess/real_time_utilities.py b/src/pypromice/postprocess/real_time_utilities.py index 2352f30d..f79f9ca0 100644 --- a/src/pypromice/postprocess/real_time_utilities.py +++ b/src/pypromice/postprocess/real_time_utilities.py @@ -7,7 +7,7 @@ """ import logging -from typing import Optional +from typing import Optional, Collection import numpy as np 
import pandas as pd @@ -22,6 +22,7 @@ def get_latest_data( df: pd.DataFrame, lin_reg_time_limit: str, + vars_to_skip: Optional[Collection[str]] = None, ) -> Optional[pd.Series]: """ Determine instantaneous values for the latest valid timestamp in the input dataframe @@ -77,9 +78,36 @@ def get_latest_data( # limit to single most recent valid row (convert to series) s_current = df_limited.loc[last_valid_index] + if vars_to_skip is not None: + s_current = filter_skipped_variables(s_current, vars_to_skip) + return s_current +def filter_skipped_variables( + row: pd.Series, vars_to_skip: Collection[str] +) -> pd.Series: + """ + Mutate input series by setting var_to_skip to np.nan + + Parameters + ---------- + row + vars_to_skip + List of variable names to be skipped + + Returns + ------- + Input series + + """ + vars_to_skip = set(row.keys()) & set(vars_to_skip) + for var_key in vars_to_skip: + row[var_key] = np.nan + logger.info("----> Skipping var: {}".format(var_key)) + return row + + def rolling_window(df, column, window, min_periods, decimals) -> pd.DataFrame: """Apply a rolling window (smoothing) to the input column diff --git a/src/pypromice/station_configuration.py b/src/pypromice/station_configuration.py index 34e85ff3..fb8d5439 100644 --- a/src/pypromice/station_configuration.py +++ b/src/pypromice/station_configuration.py @@ -57,14 +57,14 @@ def as_dict(self) -> Dict: def load_station_configuration_mapping( - configuration_root_dir: Path, + configurations_root_dir: Path, ) -> Mapping[str, StationConfiguration]: """ - Load station configurations from toml files in configuration_root_dir + Load station configurations from toml files in configurations_root_dir Parameters ---------- - configuration_root_dir + configurations_root_dir Root directory containing toml files Returns @@ -74,26 +74,26 @@ def load_station_configuration_mapping( """ return { config_file.stem: StationConfiguration(**toml.load(config_file)) - for config_file in 
configuration_root_dir.glob("*.toml") + for config_file in configurations_root_dir.glob("*.toml") } def write_station_configuration_mapping( station_configurations: Mapping[str, StationConfiguration], - configuration_root_dir: Path, + configurations_root_dir: Path, ) -> None: """ - Write station configurations to toml files in configuration_root_dir + Write station configurations to toml files in configurations_root_dir Parameters ---------- station_configurations Mapping from stid to StationConfiguration - configuration_root_dir + configurations_root_dir Output directory """ - configuration_root_dir.mkdir(parents=True, exist_ok=True) + configurations_root_dir.mkdir(parents=True, exist_ok=True) for stid, station_configuration in station_configurations.items(): - with (configuration_root_dir / f"{stid}.toml").open("w") as fp: + with (configurations_root_dir / f"{stid}.toml").open("w") as fp: toml.dump(station_configuration.as_dict(), fp) diff --git a/tests/unit/bufr_export/test_bufr_utilitites.py b/tests/unit/bufr_export/test_bufr_utilitites.py index 49ecc203..bd9ec586 100644 --- a/tests/unit/bufr_export/test_bufr_utilitites.py +++ b/tests/unit/bufr_export/test_bufr_utilitites.py @@ -194,7 +194,7 @@ def test_precision(self): timestamp=datetime.datetime(2023, 12, 19, 10, 0), relativeHumidity=np.random.random(), airTemperature=np.random.random(), - pressure=1000*np.random.random(), + pressure=1000 * np.random.random(), windDirection=np.random.random(), windSpeed=np.random.random(), latitude=np.random.random(), diff --git a/tests/unit/bufr_export/test_create_bufr_files.py b/tests/unit/bufr_export/test_create_bufr_files.py index 2cb25afc..1b79b421 100644 --- a/tests/unit/bufr_export/test_create_bufr_files.py +++ b/tests/unit/bufr_export/test_create_bufr_files.py @@ -56,7 +56,7 @@ def test_create_bufr_files(self): } write_station_configuration_mapping( station_configurations=station_configuration_mapping, - configuration_root_dir=station_configuration_root, + 
configurations_root_dir=station_configuration_root, ) create_bufr_files( @@ -110,7 +110,7 @@ def test_get_bufr_from_empty_data_file_raises_error(self): ) write_station_configuration_mapping( station_configurations={station_configuration.stid: station_configuration}, - configuration_root_dir=station_configuration_root, + configurations_root_dir=station_configuration_root, ) with self.assertRaises(ValueError): @@ -141,7 +141,7 @@ def test_get_bufr_continues_when_break_on_error_is_false(self): "THU_L2": get_station_configuration(stid="THU_L2", export_bufr=True), "KAN_Lv3": get_station_configuration(stid="KAN_Lv3", export_bufr=True), }, - configuration_root_dir=station_configuration_root, + configurations_root_dir=station_configuration_root, ) expected_compiled_output_file = compiled_output_dir / "geus_20231206T0000.bufr" expected_individual_output_dir = individual_output_root / "20231206T0000" @@ -184,7 +184,7 @@ def test_get_bufr_where_period_does_not_exist(self): ) write_station_configuration_mapping( station_configurations={station_configuration.stid: station_configuration}, - configuration_root_dir=station_configuration_root, + configurations_root_dir=station_configuration_root, ) create_bufr_files( diff --git a/tests/unit/bufr_export/test_get_bufr.py b/tests/unit/bufr_export/test_get_bufr.py index a100cd55..83b650b3 100644 --- a/tests/unit/bufr_export/test_get_bufr.py +++ b/tests/unit/bufr_export/test_get_bufr.py @@ -1,29 +1,23 @@ import datetime import logging -import pickle +import random import sys -import unittest -import uuid +import tempfile +from io import BufferedWriter from pathlib import Path -from tempfile import TemporaryDirectory from unittest import TestCase, mock -import numpy as np import pandas as pd from pypromice.postprocess.bufr_utilities import BUFRVariables from pypromice.postprocess.get_bufr import ( - process_station, - get_bufr, + get_station_positions, get_bufr_variables, + REQUIRED_KEYS, + get_bufr, ) -from 
pypromice.station_configuration import ( - StationConfiguration, -) -from tests.unit.bufr_export.test_get_bufr_integration import ( - DATA_DIR, - run_get_bufr, -) +from pypromice.station_configuration import StationConfiguration +from tests.utilities import get_station_configuration logging.basicConfig( stream=sys.stdout, @@ -31,12 +25,106 @@ level=logging.WARNING, ) -MOCK_BASE_STR = "pypromice.postprocess.get_bufr.{}" +class GetStationPositionsTestCase(TestCase): + def test_all_data_available(self): + """ + Test the get_station_positions function + """ + timestamp = pd.to_datetime("2024-03-01 00:00:00") + latest_data = pd.Series( + name=timestamp, + data={ + "gps_lat_fit": 78.52901, + "gps_lon_fit": -56.8450358, + "gps_alt_fit": 1968.561, + }, + ) + + positions = get_station_positions(latest_data=latest_data) + + self.assertDictEqual( + positions, + dict( + timestamp=timestamp, + lat=78.52901, + lon=-56.8450358, + alt=1968.561, + ), + ) + + def test_missing_data(self): + """ + Test the get_station_positions function with missing data + """ + timestamp = pd.to_datetime("2024-03-01 00:00:00") + latest_data = pd.Series( + name=timestamp, + data={ + "gps_lat_fit": 78.52901, + "gps_lon_fit": -56.8450358, + }, + ) + + with self.assertRaises(KeyError): + get_station_positions(latest_data=latest_data) + + def test_nan_latitude(self): + """ + get_station_positions shall discard all position data if latitude is NaN + """ + timestamp = pd.to_datetime("2024-03-01 00:00:00") + latest_data = pd.Series( + name=timestamp, + data={ + "gps_lat_fit": float("nan"), + "gps_lon_fit": -56.8450358, + "gps_alt_fit": 1968.561, + }, + ) -class BufrVariablesTestCase(TestCase): + positions = get_station_positions(latest_data=latest_data) + + self.assertDictEqual( + positions, + dict( + timestamp=timestamp, + lat=None, + lon=None, + alt=None, + ), + ) + + def test_nan_altitude(self): + """ + get_station_positions shall discard all position data if altitude is NaN + """ + timestamp = 
pd.to_datetime("2024-03-01 00:00:00") + latest_data = pd.Series( + name=timestamp, + data={ + "gps_lat_fit": 78.52901, + "gps_lon_fit": -56.8450358, + "gps_alt_fit": float("nan"), + }, + ) + + positions = get_station_positions(latest_data=latest_data) + + self.assertDictEqual( + positions, + dict( + timestamp=timestamp, + lat=None, + lon=None, + alt=None, + ), + ) + + +class TestGetBufrVariablesTestCase(TestCase): def test_bufr_variables_gcnet(self): - station_configuration = StationConfiguration( + config = StationConfiguration( stid="DY2", station_site="DY2", project="GC-Net", @@ -49,85 +137,54 @@ def test_bufr_variables_gcnet(self): sonic_ranger_from_gps=0.15, export_bufr=True, ) - - self._test_bufr_variables( - stid=station_configuration.stid, - station_configuration=station_configuration, - relativeHumidity=69.0, - airTemperature=255.95, - pressure=77300.0, - windDirection=149.0, - windSpeed=14.9, - latitude=66.482488, - longitude=-46.294266, - heightOfStationGroundAboveMeanSeaLevel=2123.2, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.59, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, - heightOfBarometerAboveMeanSeaLevel=2125.3, - ) - - def test_bufr_variables_promice_v2(self): - station_configuration = StationConfiguration( - stid="NUK_L", - station_site="NUK_L", - project="Promice", - station_type="mobile", - wmo_id="04403", - barometer_from_gps=-0.25, - anemometer_from_sonic_ranger=0.4, - temperature_from_sonic_ranger=0.0, - height_of_gps_from_station_ground=0.9, - sonic_ranger_from_gps=1.3, - export_bufr=True, + timestamp = pd.to_datetime("2024-03-01 00:00:00") + data = pd.Series( + name=timestamp, + data={ + "t_i": -12.5, + "p_i": 3.1, + "rh_i": 0.5, + "wspd_i": 2.5, + "wdir_i": 182.1, + "z_boom_u_smooth": 1.6, + "gps_lat_fit": 78.52901, + "gps_lon_fit": -56.8450358, + "gps_alt_fit": 1968.561, + }, ) - self._test_bufr_variables( - stid=station_configuration.stid, - station_configuration=station_configuration, - 
relativeHumidity=69.0, - airTemperature=255.95, - pressure=77300.0, - windDirection=149.0, - windSpeed=14.9, - latitude=66.482488, - longitude=-46.294266, - heightOfStationGroundAboveMeanSeaLevel=2123.8, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.19, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, - heightOfBarometerAboveMeanSeaLevel=2124.5, + expected_bufr_variables = BUFRVariables( + wmo_id=config.wmo_id, + station_type=config.station_type, + timestamp=timestamp, + relativeHumidity=data.rh_i, + airTemperature=data.t_i + 273.15, + pressure=100310, + windDirection=data.wdir_i, + windSpeed=data.wspd_i, + latitude=data.gps_lat_fit, + longitude=data.gps_lon_fit, + heightOfStationGroundAboveMeanSeaLevel=data.gps_alt_fit + - config.height_of_gps_from_station_ground, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=data.z_boom_u_smooth + + config.temperature_from_sonic_ranger, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=data.z_boom_u_smooth + + config.anemometer_from_sonic_ranger, + heightOfBarometerAboveMeanSeaLevel=data.gps_alt_fit + + config.barometer_from_gps, + ) + + bufr_variables = get_bufr_variables( + data=data, + station_configuration=config, ) - def test_bufr_variables_promice_v3(self): - station_configuration = StationConfiguration( - stid="QAS_Mv3", - station_site="QAS_M", - project="Promice", - station_type="mobile", - wmo_id="04441", - barometer_from_gps=1.3, - anemometer_from_sonic_ranger=0.4, - temperature_from_sonic_ranger=0.0, - height_of_gps_from_station_ground=0.9, - sonic_ranger_from_gps=1.3, - export_bufr=True, - ) - self._test_bufr_variables( - stid=station_configuration.stid, - station_configuration=station_configuration, - relativeHumidity=69.0, - airTemperature=255.95, - pressure=77300.0, - windDirection=149.0, - windSpeed=14.9, - latitude=66.482488, - longitude=-46.294266, - heightOfStationGroundAboveMeanSeaLevel=2123.8, - 
heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.19, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, - heightOfBarometerAboveMeanSeaLevel=2126, + pd.testing.assert_series_equal( + bufr_variables.as_series(), + expected_bufr_variables.as_series(), ) def test_bufr_variables_static_gps_elevation(self): - timestamp = datetime.datetime.now() + timestamp = pd.to_datetime("2024-03-01 00:00:00") data = pd.Series( data=dict( rh_i=0.93, @@ -144,7 +201,7 @@ def test_bufr_variables_static_gps_elevation(self): ), name=timestamp, ) - station_config = StationConfiguration( + config = StationConfiguration( stid="A_STID", station_type="land", wmo_id="4201", @@ -159,10 +216,9 @@ def test_bufr_variables_static_gps_elevation(self): # The elevations should be determined from the static variable expected_station_ground_elevation = 17.5 - 0.9 expected_barometer_elevation = 17.5 + 1.3 - expected_bufr_variables = BUFRVariables( - wmo_id=station_config.wmo_id, - station_type=station_config.station_type, + wmo_id=config.wmo_id, + station_type=config.station_type, timestamp=timestamp, relativeHumidity=1.0, airTemperature=252.15, # Converted to kelvin @@ -174,25 +230,25 @@ def test_bufr_variables_static_gps_elevation(self): heightOfStationGroundAboveMeanSeaLevel=expected_station_ground_elevation, heightOfBarometerAboveMeanSeaLevel=expected_barometer_elevation, # The sensor heights are ignored since the necessary dimension values are missing - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=np.nan, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=np.nan, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=float("nan"), + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=float("nan"), ) - output = get_bufr_variables( - data, - station_configuration=station_config, + bufr_variables = get_bufr_variables( + data=data, + station_configuration=config, ) - self.assertEqual( - expected_bufr_variables, - output, + 
pd.testing.assert_series_equal( + bufr_variables.as_series(), + expected_bufr_variables.as_series(), ) def test_fails_on_missing_dimension_values(self): """ Test that get_bufr_variables raises an AttributeError if the data is missing """ - timestamp = datetime.datetime.now() + timestamp = pd.to_datetime("2024-03-01 00:00:00") data = pd.Series( data=dict( rh_i=0.93, @@ -208,701 +264,597 @@ def test_fails_on_missing_dimension_values(self): ), name=timestamp, ) - station_config = StationConfiguration( + config = StationConfiguration( stid="A_STID", station_type="land", wmo_id="4201", export_bufr=True, ) - with self.assertRaises(AttributeError) as context: + with self.assertRaises(AttributeError): get_bufr_variables( data, - station_configuration=station_config, + station_configuration=config, ) - @mock.patch("pypromice.postprocess.get_bufr.write_bufr_message") - def _test_bufr_variables( - self, - write_bufr_message_mock: mock.MagicMock, - stid: str, - station_configuration: StationConfiguration, - relativeHumidity: float, - airTemperature: float, - pressure: float, - windDirection: float, - windSpeed: float, - latitude: float, - longitude: float, - heightOfStationGroundAboveMeanSeaLevel: float, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH: float, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD: float, - heightOfBarometerAboveMeanSeaLevel: float, - ): - l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") - l3_src = pd.read_csv(l3_src_filepath) - now_timestamp = datetime.datetime(2023, 12, 8) - - timestamps = {} - run_get_bufr( - l3_data=l3_src, - now_timestamp=now_timestamp, - latest_timestamps=timestamps, - stid=stid, - store_positions=True, - time_limit="91d", - station_configuration_mapping={ - station_configuration.stid: station_configuration + def test_nan_location_yields_none(self): + config = get_station_configuration(export_bufr=True) + data = pd.Series( + name=pd.to_datetime("2024-03-01 00:00:00"), + data={ + "t_i": -12.5, + 
"p_i": 1003.1, + "rh_i": 0.5, + "wspd_i": 2.5, + "wdir_i": 182.1, + "z_boom_u_smooth": 1.6, + "gps_lat_fit": 78.52901, + "gps_lon_fit": float("nan"), + "gps_alt_fit": 1968.561, }, ) - write_bufr_message_mock.assert_called_once() - call = write_bufr_message_mock.call_args_list[0] - expected_time = datetime.datetime(year=2023, month=12, day=7, hour=23) - expected_bufr_variables = BUFRVariables( - wmo_id=station_configuration.wmo_id, - station_type=station_configuration.station_type, - timestamp=expected_time, - relativeHumidity=relativeHumidity, - airTemperature=airTemperature, - pressure=pressure, - windDirection=windDirection, - windSpeed=windSpeed, - latitude=latitude, - longitude=longitude, - heightOfStationGroundAboveMeanSeaLevel=heightOfStationGroundAboveMeanSeaLevel, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD, - heightOfBarometerAboveMeanSeaLevel=heightOfBarometerAboveMeanSeaLevel, + return_value = get_bufr_variables( + data, + station_configuration=config, ) - pd.testing.assert_series_equal( - pd.Series(expected_bufr_variables), - pd.Series(call.kwargs["variables"]), + + self.assertIsNone(return_value) + + def test_nan_t_i_and_p_i_yields_none(self): + config = get_station_configuration(export_bufr=True) + data = pd.Series( + name=pd.to_datetime("2024-03-01 00:00:00"), + data={ + "t_i": float("nan"), + "p_i": float("nan"), + "rh_i": 0.5, + "wspd_i": 2.5, + "wdir_i": 182.1, + "z_boom_u_smooth": 1.6, + "gps_lat_fit": 78.52901, + "gps_lon_fit": -56.8450358, + "gps_alt_fit": 1968.561, + }, ) + return_value = get_bufr_variables( + data, + station_configuration=config, + ) + + self.assertIsNone(return_value) + + def test_missing_keys(self): + config = get_station_configuration(export_bufr=True) + for key in REQUIRED_KEYS: + data = pd.Series( + name=pd.to_datetime("2024-03-01 
00:00:00"), + data={ + "t_i": -12.5, + "p_i": 1003.1, + "rh_i": 0.5, + "wspd_i": 2.5, + "wdir_i": 182.1, + "z_boom_u_smooth": 1.6, + "gps_lat_fit": 78.52901, + "gps_lon_fit": -56.8450358, + "gps_alt_fit": 1968.561, + }, + ) + del data[key] + + with self.assertRaises(ValueError, msg=f"Key: {key}"): + get_bufr_variables( + data=data, + station_configuration=config, + ) + + +MOCK_BASE_STR = "pypromice.postprocess.get_bufr.{}" + +@mock.patch(MOCK_BASE_STR.format("get_station_positions")) @mock.patch(MOCK_BASE_STR.format("get_bufr_variables")) @mock.patch(MOCK_BASE_STR.format("write_bufr_message")) @mock.patch(MOCK_BASE_STR.format("get_latest_data")) @mock.patch(MOCK_BASE_STR.format("load_data")) -class ProcessStationTestCase(unittest.TestCase): - def setUp(self) -> None: - self.file_path = mock.create_autospec(Path) - self.output_path = mock.create_autospec(Path) - self.now_timestamp = mock.create_autospec(datetime.datetime) - self.time_limit = mock.create_autospec(str) - self.stid = str(uuid.uuid4()) - self.station_configuration = mock.MagicMock() - self.earliest_timestamp = mock.MagicMock() - - def test_process_station_no_new_data( +class TestGetBufrTestCase(TestCase): + def test_has_new_data( self, load_data_mock: mock.MagicMock, get_latest_data_mock: mock.MagicMock, write_bufr_message_mock: mock.MagicMock, get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, ): - self.earliest_timestamp = datetime.datetime(2023, 10, 3) - latest_data_datetime = datetime.datetime(2023, 10, 3) - get_latest_data_mock.return_value = pd.Series( - data={ - "p_i": -227.1, - "t_i": -16.7, - "rh_i": 84.6, - "wspd_i": 14.83, - "wdir_i": 142.2, - "gps_lat": 66.482469, - "gps_lon": -46.294232, - "gps_alt": 2116.0, - "z_boom_u": 4.1901, - "gps_lat_fit": 66.482474, - "gps_lon_fit": -46.294261, - "gps_alt_fit": 2119.6, - "z_boom_u_smooth": 4.2, - }, - name=latest_data_datetime, - ) - expected_output = { - "timestamp": latest_data_datetime, - "lat": 66.482474, - 
"lon": -46.294261, - "alt": 2119.6, - } - - output = process_station( - file_path=self.file_path, - output_path=self.output_path, - now_timestamp=self.now_timestamp, - latest_timestamp=self.earliest_timestamp, - time_limit=self.time_limit, - stid=self.stid, - station_configuration=self.station_configuration, - ) + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + output_path = root_path / "bufr_out" + station_config = get_station_configuration( + export_bufr=True, + positions_update_timestamp_only=False, + ) + input_file = root_path / "input" / f"{station_config.stid}_hour.csv" + positions_filepath = root_path / "positions.csv" + timestamps_pickle_filepath = root_path / "timestamps.pickle" + now_timestamp = pd.to_datetime("2024-03-01 00:12:00") + latest_timestamp = pd.to_datetime("2024-03-01 00:01:00") + get_latest_data_mock.return_value.name = latest_timestamp + get_station_positions_mock.return_value = dict( + timestamp=latest_timestamp, + lat=78.52901, + lon=-56.8450358, + alt=1968.561, + ) - self.assertDictEqual( - output, - expected_output, - ) - get_bufr_variables_mock.assert_not_called() - write_bufr_message_mock.assert_not_called() + get_bufr( + input_files=[input_file], + station_configuration_mapping={station_config.stid: station_config}, + break_on_error=True, + bufr_out=output_path, + target_timestamp=now_timestamp, + positions_filepath=positions_filepath, + store_positions=True, + timestamps_pickle_filepath=timestamps_pickle_filepath, + ) - def test_process_station_has_new_data( + load_data_mock.assert_called_once_with(input_file, now_timestamp) + get_latest_data_mock.assert_called_once_with( + load_data_mock.return_value, + lin_reg_time_limit="91d", + vars_to_skip=station_config.skipped_variables, + ) + get_station_positions_mock.assert_called_once_with( + get_latest_data_mock.return_value + ) + get_bufr_variables_mock.assert_called_once_with( + get_latest_data_mock.return_value, + station_config, + ) + 
write_bufr_message_mock.assert_called_once_with( + get_bufr_variables_mock.return_value, + mock.ANY, + ) + # Write bufr is invoked with an open file object. It is therefore necessary to check the path of the file + expected_output_file_path = output_path / f"{station_config.stid}.bufr" + output_file = write_bufr_message_mock.call_args[0][1] + self.assertIsInstance(output_file, BufferedWriter) + self.assertEqual(Path(output_file.name), expected_output_file_path) + written_positions = pd.read_csv( + positions_filepath, index_col=0, parse_dates=["timestamp"] + ) + self.assertDictEqual( + get_station_positions_mock.return_value, + dict(written_positions.loc[station_config.stid]), + ) + self.assertTrue(timestamps_pickle_filepath.exists()) + timestamps = pd.read_pickle(timestamps_pickle_filepath) + self.assertDictEqual( + timestamps, + {station_config.stid: latest_timestamp}, + ) + + def test_no_new_data( self, load_data_mock: mock.MagicMock, get_latest_data_mock: mock.MagicMock, write_bufr_message_mock: mock.MagicMock, get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, ): - self.earliest_timestamp = datetime.datetime(2023, 10, 2) - latest_data_datetime = datetime.datetime(2023, 10, 3) - get_latest_data_mock.return_value = pd.Series( - data={ - "p_i": -227.1, - "t_i": -16.7, - "rh_i": 84.6, - "wspd_i": 14.83, - "wdir_i": 142.2, - "gps_lat": 66.482469, - "gps_lon": -46.294232, - "gps_alt": 2116.0, - "z_boom_u": 4.1901, - "gps_lat_fit": 66.482474, - "gps_lon_fit": -46.294261, - "gps_alt_fit": 2119.6, - "z_boom_u_smooth": 4.2, - }, - name=latest_data_datetime, - ) - expected_output = { - "timestamp": latest_data_datetime, - "lat": 66.482474, - "lon": -46.294261, - "alt": 2119.6, - } - - output = process_station( - file_path=self.file_path, - output_path=self.output_path, - now_timestamp=self.now_timestamp, - latest_timestamp=self.earliest_timestamp, - time_limit=self.time_limit, - stid=self.stid, - 
station_configuration=self.station_configuration, - ) + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + output_path = root_path / "bur_out" + station_config = get_station_configuration( + export_bufr=True, + positions_update_timestamp_only=False, + ) + input_file = root_path / "input" / f"{station_config.stid}_hour.csv" + positions_filepath = root_path / "positions.csv" + now_timestamp = pd.to_datetime("2024-03-01 00:12:00") + # The latest data is two month old + latest_timestamp = pd.to_datetime("2024-01-01 00:12:00") + get_latest_data_mock.return_value.name = latest_timestamp + get_station_positions_mock.return_value = dict( + timestamp=latest_timestamp, + lat=78.52901, + lon=-56.8450358, + alt=1968.561, + ) - self.assertDictEqual( - output, - expected_output, - ) - get_bufr_variables_mock.assert_called_once_with( - data=get_latest_data_mock.return_value, - station_configuration=self.station_configuration, - ) - write_bufr_message_mock.assert_called_once_with( - variables=get_bufr_variables_mock.return_value, - file=self.output_path.open().__enter__(), - ) + get_bufr( + input_files=[input_file], + station_configuration_mapping={station_config.stid: station_config}, + break_on_error=True, + bufr_out=output_path, + target_timestamp=now_timestamp, + positions_filepath=positions_filepath, + store_positions=True, + timestamps_pickle_filepath=None, + time_window_length=pd.to_timedelta("2d"), + ) + + get_latest_data_mock.assert_called_once_with( + load_data_mock.return_value, + lin_reg_time_limit="91d", + vars_to_skip=station_config.skipped_variables, + ) + get_station_positions_mock.assert_called_once_with( + get_latest_data_mock.return_value + ) + get_bufr_variables_mock.assert_not_called() + write_bufr_message_mock.assert_not_called() + written_positions = pd.read_csv( + positions_filepath, index_col=0, parse_dates=["timestamp"] + ) + self.assertDictEqual( + get_station_positions_mock.return_value, + 
dict(written_positions.loc[station_config.stid]), + ) - def test_min_data_wx_failed( + def test_position_seed( self, load_data_mock: mock.MagicMock, get_latest_data_mock: mock.MagicMock, write_bufr_message_mock: mock.MagicMock, get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, ): - self.earliest_timestamp = datetime.datetime(2023, 10, 2) - latest_data_datetime = datetime.datetime(2023, 10, 3) - get_latest_data_mock.return_value = pd.Series( - data={ - "p_i": np.nan, - "t_i": np.nan, - "rh_i": 84.6, - "wspd_i": 14.83, - "wdir_i": 142.2, - "gps_lat": 66.482469, - "gps_lon": -46.294232, - "gps_alt": 2116.0, - "z_boom_u": 4.1901, - "gps_lat_fit": 66.482474, - "gps_lon_fit": -46.294261, - "gps_alt_fit": 2119.6, - "z_boom_u_smooth": 4.2, - }, - name=latest_data_datetime, - ) - expected_output = { - "timestamp": latest_data_datetime, - "lat": 66.482474, - "lon": -46.294261, - "alt": 2119.6, - } - - output = process_station( - file_path=self.file_path, - output_path=self.output_path, - now_timestamp=self.now_timestamp, - latest_timestamp=self.earliest_timestamp, - time_limit=self.time_limit, - stid=self.stid, - station_configuration=self.station_configuration, - ) + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + positions_filepath = root_path / "positions.csv" + positions_seed_path = root_path / "positions_seed.csv" + positions_seed = pd.DataFrame( + columns=["stid", "timestamp", "lat", "lon", "alt"], + data=[ + ["STATION_A", datetime.datetime(2021, 10, 2), 65.0, -40.0, 800], + ["STATION_B", datetime.datetime(2023, 11, 12), 66.0, -50.0, 1100], + ], + ).set_index("stid") + positions_seed.to_csv(positions_seed_path, index=True) + + get_bufr( + input_files=[], + station_configuration_mapping=dict(), + break_on_error=True, + bufr_out=mock.create_autospec(Path), + target_timestamp=mock.create_autospec(datetime.timedelta), + positions_filepath=positions_filepath, + positions_seed_path=positions_seed_path, + 
store_positions=True, + timestamps_pickle_filepath=None, + time_window_length=pd.to_timedelta("2d"), + ) - # The BUFR export step shall be skipped - get_bufr_variables_mock.assert_not_called() - write_bufr_message_mock.assert_not_called() - self.assertDictEqual( - output, - expected_output, - ) + written_positions = pd.read_csv( + positions_filepath, index_col="stid", parse_dates=["timestamp"] + ) + pd.testing.assert_frame_equal( + positions_seed, + written_positions, + ) - def test_min_data_pos_failed( + def test_no_input_paths( self, load_data_mock: mock.MagicMock, get_latest_data_mock: mock.MagicMock, write_bufr_message_mock: mock.MagicMock, get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, ): - self.earliest_timestamp = datetime.datetime(2023, 10, 2) - latest_data_datetime = datetime.datetime(2023, 10, 3) - get_latest_data_mock.return_value = pd.Series( - data={ - "p_i": -227.1, - "t_i": -16.7, - "rh_i": 84.6, - "wspd_i": 14.83, - "wdir_i": 142.2, - "gps_lat": 66.482469, - "gps_lon": -46.294232, - "gps_alt": 2116.0, - "z_boom_u": 4.1901, - "gps_lat_fit": 66.482474, - "gps_lon_fit": -46.294261, - "gps_alt_fit": np.nan, - "z_boom_u_smooth": 4.2, - }, - name=latest_data_datetime, - ) - expected_output = { - "timestamp": latest_data_datetime, - "lat": None, - "lon": None, - "alt": None, - } - - output = process_station( - file_path=self.file_path, - output_path=self.output_path, - now_timestamp=self.now_timestamp, - latest_timestamp=self.earliest_timestamp, - time_limit=self.time_limit, - stid=self.stid, - station_configuration=self.station_configuration, - ) + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + positions_filepath = root_path / "positions.csv" + get_bufr( + input_files=[], + station_configuration_mapping=dict(), + break_on_error=True, + bufr_out=mock.create_autospec(Path), + target_timestamp=mock.create_autospec(datetime.timedelta), + positions_filepath=positions_filepath, + 
store_positions=True, + timestamps_pickle_filepath=None, + time_window_length=pd.to_timedelta("2d"), + ) - # The BUFR export step shall be skipped - get_bufr_variables_mock.assert_not_called() - write_bufr_message_mock.assert_not_called() - self.assertDictEqual( - output, - expected_output, - ) + load_data_mock.assert_not_called() + get_latest_data_mock.assert_not_called() + get_station_positions_mock.assert_not_called() + get_bufr_variables_mock.assert_not_called() + write_bufr_message_mock.assert_not_called() + # The positions file should be created, but empty + self.assertTrue(positions_filepath.exists()) + written_positions = pd.read_csv( + positions_filepath, index_col=0, parse_dates=["timestamp"] + ) + self.assertEqual(0, len(written_positions)) - def test_no_valid_data( + def test_get_latest_data_fails( self, load_data_mock: mock.MagicMock, get_latest_data_mock: mock.MagicMock, write_bufr_message_mock: mock.MagicMock, get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, ): - get_latest_data_mock.return_value = None - - output = process_station( - file_path=self.file_path, - output_path=self.output_path, - now_timestamp=self.now_timestamp, - latest_timestamp=self.earliest_timestamp, - time_limit=self.time_limit, - stid=self.stid, - station_configuration=self.station_configuration, - ) + """ + get_latest_data returns None when there are no valid data available for the staiton + """ + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + positions_filepath = root_path / "positions.csv" + station_config = get_station_configuration( + export_bufr=True, + positions_update_timestamp_only=False, + ) + input_file = root_path / "input" / f"{station_config.stid}_hour.csv" + target_timestamp = mock.create_autospec(datetime.timedelta) + get_latest_data_mock.return_value = None + get_bufr( + input_files=[input_file], + station_configuration_mapping={station_config.stid: station_config}, + break_on_error=True, + 
bufr_out=mock.create_autospec(Path), + target_timestamp=target_timestamp, + positions_filepath=positions_filepath, + store_positions=True, + timestamps_pickle_filepath=None, + time_window_length=pd.to_timedelta("2d"), + ) - load_data_mock.assert_called_once() - get_latest_data_mock.assert_called_once() - write_bufr_message_mock.assert_not_called() - get_bufr_variables_mock.assert_not_called() - self.assertIsNone(output) + load_data_mock.assert_called_once_with(input_file, target_timestamp) + get_latest_data_mock.assert_called_once_with( + load_data_mock.return_value, + lin_reg_time_limit="91d", + vars_to_skip=station_config.skipped_variables, + ) + get_station_positions_mock.assert_not_called() + get_bufr_variables_mock.assert_not_called() + write_bufr_message_mock.assert_not_called() + self.assertTrue(positions_filepath.exists()) - def test_skipped_variables( + def test_already_existing_in_latest_timestamps( self, load_data_mock: mock.MagicMock, get_latest_data_mock: mock.MagicMock, write_bufr_message_mock: mock.MagicMock, get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, ): - self.earliest_timestamp = datetime.datetime(2023, 10, 2) - latest_data_datetime = datetime.datetime(2023, 10, 3) - original_p_i = 42.0 - get_latest_data_mock.return_value = pd.Series( - data={ - "p_i": original_p_i, - "t_i": -16.7, - "rh_i": 84.6, - "wspd_i": 14.83, - "wdir_i": 142.2, - "gps_lat": 66.482469, - "gps_lon": -46.294232, - "gps_alt": 2116.0, - "z_boom_u": 4.1901, - "gps_lat_fit": 66.482474, - "gps_lon_fit": -46.294261, - "gps_alt_fit": 2119.6, - "z_boom_u_smooth": 4.2, - }, - name=latest_data_datetime, - ) - self.station_configuration = StationConfiguration( - stid="A_STID", - station_site="A_STATION_SITE", - station_type="mobile", - wmo_id="04242", - skipped_variables=["p_i"], - height_of_gps_from_station_ground=1.4, - barometer_from_gps=0.1, - anemometer_from_sonic_ranger=0.1, - temperature_from_sonic_ranger=0.2, - export_bufr=True, - ) - 
expected_output = { - "timestamp": latest_data_datetime, - "lat": 66.482474, - "lon": -46.294261, - "alt": 2119.6, - } - self.assertEqual( - original_p_i, - get_latest_data_mock.return_value["p_i"], - ) - - output = process_station( - file_path=self.file_path, - output_path=self.output_path, - now_timestamp=self.now_timestamp, - latest_timestamp=self.earliest_timestamp, - time_limit=self.time_limit, - stid=self.stid, - station_configuration=self.station_configuration, - ) - - self.assertTrue( - np.isnan(get_latest_data_mock.return_value["p_i"]), - "p_i shall be set to nan since it is in skipped_variables", - ) - self.assertDictEqual( - output, - expected_output, - ) - get_bufr_variables_mock.assert_called_once_with( - data=get_latest_data_mock.return_value, - station_configuration=self.station_configuration, - ) - write_bufr_message_mock.assert_called_once_with( - variables=get_bufr_variables_mock.return_value, - file=self.output_path.open().__enter__(), - ) - - -class GetBufrTestCase(unittest.TestCase): - def setUp(self) -> None: - self.temporary_root = TemporaryDirectory() - self.root_path = Path(self.temporary_root.name) - self.l3_data_root = self.root_path / "l3" - self.l3_data_root.mkdir() - self.bufr_root = self.root_path / "bufr" - self.bufr_root.mkdir() + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + output_path = root_path / "bufr_out" + positions_filepath = root_path / "positions.csv" + station_config = get_station_configuration( + export_bufr=True, + positions_update_timestamp_only=False, + ) + now_timestamp = pd.to_datetime("2024-03-01 00:12:00") + latest_timestamp = pd.to_datetime("2024-03-01 00:01:00") + input_file = root_path / "input" / f"{station_config.stid}_hour.csv" + timestamps_pickle_filepath = root_path / "timestamps.pickle" + latest_timestamps = {station_config.stid: latest_timestamp} + with timestamps_pickle_filepath.open("wb") as f: + pd.to_pickle(latest_timestamps, f) + 
get_latest_data_mock.return_value.name = latest_timestamp + get_station_positions_mock.return_value = dict( + timestamp=latest_timestamp, + lat=78.52901, + lon=-56.8450358, + alt=1968.561, + ) - self.positions_file_path = self.root_path / "positions.csv" - self.positions_seed_path = self.root_path / "positions_seed.csv" - self.timestamps_pickle_filepath = self.root_path / "latest_timestamps.pickle" - self.station_configuration_path = self.root_path / "station_configuration.toml" + get_bufr( + input_files=[input_file], + station_configuration_mapping={station_config.stid: station_config}, + break_on_error=True, + bufr_out=output_path, + target_timestamp=now_timestamp, + positions_filepath=positions_filepath, + store_positions=True, + timestamps_pickle_filepath=timestamps_pickle_filepath, + ) - def tearDown(self) -> None: - self.temporary_root.cleanup() + get_station_positions_mock.assert_called_once() + # The BUFR export should be skipped since the latest timestamp is already in the timestamps + get_bufr_variables_mock.assert_not_called() + write_bufr_message_mock.assert_not_called() + self.assertTrue(positions_filepath.exists()) - @mock.patch(MOCK_BASE_STR.format("process_station")) - def test_process_station_raises_exception( - self, process_station_mock: mock.MagicMock + def test_no_station_configuration( + self, + load_data_mock: mock.MagicMock, + get_latest_data_mock: mock.MagicMock, + write_bufr_message_mock: mock.MagicMock, + get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, ): - """ - get_bufr should skip stations where process_station raises exception - """ - timestamps_pickle_filepath = self.root_path / "timestamps.pickle" - stid = "THE_STID_FOR_A_STATION" - input_file_path = self.root_path / f"{stid}_hourly.csv" - process_station_mock.side_effect = Exception("Test exception") - now_timestamp = datetime.datetime.now() - self.assertFalse(self.positions_file_path.exists()) - 
self.assertFalse(timestamps_pickle_filepath.exists()) - - get_bufr( - store_positions=True, - bufr_out=self.bufr_root, - input_files=[input_file_path], - positions_filepath=self.positions_file_path, - station_configuration_mapping=dict(), - timestamps_pickle_filepath=timestamps_pickle_filepath, - now_timestamp=now_timestamp, - ) - - self.assertTrue(self.positions_file_path.exists()) - self.assertTrue(timestamps_pickle_filepath.exists()) - - @mock.patch(MOCK_BASE_STR.format("process_station")) - def test_multiple_stations(self, process_station_mock: mock.MagicMock): - station_config01 = StationConfiguration(stid="station_01", export_bufr=True) - station_config02 = StationConfiguration(stid="station_02", export_bufr=True) - station_config03 = StationConfiguration(stid="station_03", export_bufr=False) - process_station_return_values = { - station_config01.stid: dict( - timestamp=datetime.datetime(2023, 2, 1, 10), lat=1, lon=3, alt=31 - ), - station_config02.stid: dict( - timestamp=datetime.datetime(2023, 2, 1, 10), lat=2, lon=3, alt=31 - ), - station_config03.stid: dict( - timestamp=datetime.datetime(2023, 2, 1, 10), lat=3, lon=3, alt=31 - ), - } - process_station_mock.side_effect = ( - lambda **kwargs: process_station_return_values[ - kwargs["station_configuration"].stid - ] - ) - input_files = [ - self.root_path / f"{station_config01.stid}_hourly.csv", - self.root_path / f"{station_config02.stid}_hourly.csv", - self.root_path / f"{station_config03.stid}_hourly.csv", - ] - station_config_mapping = { - station_config01.stid: station_config01, - station_config02.stid: station_config02, - station_config03.stid: station_config03, - } - - get_bufr( - store_positions=True, - bufr_out=self.bufr_root, - input_files=input_files, - positions_filepath=self.positions_file_path, - station_configuration_mapping=station_config_mapping, - timestamps_pickle_filepath=self.timestamps_pickle_filepath, - positions_seed_path=None, - now_timestamp=datetime.datetime.now(), - ) - - 
self.assertEqual(3, process_station_mock.call_count) - read_positions = pd.read_csv( - self.positions_file_path, index_col=0, parse_dates=["timestamp"] - ).to_dict(orient="index") - self.assertDictEqual( - read_positions, - process_station_return_values, - ) - - def test_no_stations(self): - now_timestamp = datetime.datetime.now() - self.assertFalse(self.positions_file_path.exists()) - self.assertFalse(self.timestamps_pickle_filepath.exists()) - - get_bufr( - store_positions=True, - bufr_out=self.bufr_root, - input_files=(), - positions_filepath=self.positions_file_path, - station_configuration_mapping=dict(), - timestamps_pickle_filepath=self.timestamps_pickle_filepath, - now_timestamp=now_timestamp, - ) - - self.assertTrue(self.positions_file_path.exists()) - self.assertTrue(self.timestamps_pickle_filepath.exists()) - positions = pd.read_csv(self.positions_file_path) - pd.testing.assert_frame_equal( - positions, - pd.DataFrame(columns=["stid", "timestamp", "lat", "lon", "alt"], data=[]), - ) - with self.timestamps_pickle_filepath.open("br") as fp: - timestamps = pickle.load(fp) - self.assertDictEqual(dict(), timestamps) - - @mock.patch(MOCK_BASE_STR.format("process_station")) - def test_single_station(self, process_station_mock: mock.MagicMock): - now_timestamp = datetime.datetime.now() - stid = "THE_STID_FOR_A_STATION" - input_file_path = self.root_path / f"{stid}_hourly.csv" - station_configuration = StationConfiguration(stid=stid, export_bufr=True) - station_configuration_mapping = { - stid: station_configuration, - } - expected_output_path = self.bufr_root / f"{stid}.bufr" - expected_latest_timestamp = now_timestamp - datetime.timedelta(days=2) - expected_station_configuration = StationConfiguration( - stid=stid, export_bufr=True - ) - - get_bufr( - store_positions=True, - bufr_out=self.bufr_root, - input_files=[input_file_path], - positions_filepath=self.positions_file_path, - station_configuration_mapping=station_configuration_mapping, - 
timestamps_pickle_filepath=self.timestamps_pickle_filepath, - positions_seed_path=None, - now_timestamp=now_timestamp, - ) + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + output_path = root_path / "bufr_out" + positions_filepath = root_path / "positions.csv" + station_id = "A_STID" + now_timestamp = pd.to_datetime("2024-03-01 00:12:00") + latest_timestamp = pd.to_datetime("2024-03-01 00:01:00") + input_file = root_path / "input" / f"{station_id}_hour.csv" + get_latest_data_mock.return_value.name = latest_timestamp + get_station_positions_mock.return_value = dict( + timestamp=latest_timestamp, + lat=78.52901, + lon=-56.8450358, + alt=1968.561, + ) - process_station_mock.assert_called_once_with( - file_path=input_file_path, - output_path=expected_output_path, - now_timestamp=now_timestamp, - latest_timestamp=expected_latest_timestamp, - time_limit="91d", - stid=stid, - station_configuration=expected_station_configuration, - ) + get_bufr( + input_files=[input_file], + station_configuration_mapping=dict(), + break_on_error=True, + bufr_out=output_path, + target_timestamp=now_timestamp, + positions_filepath=positions_filepath, + store_positions=True, + timestamps_pickle_filepath=None, + ) - @mock.patch(MOCK_BASE_STR.format("process_station")) - def test_station_without_configuration(self, process_station_mock: mock.MagicMock): - now_timestamp = datetime.datetime.now() - stid = "STATION_ID" - input_file_path = self.root_path / f"{stid}_hourly.csv" - expected_station_configuration = StationConfiguration(stid=stid) - expected_output_path = self.bufr_root / f"{stid}.bufr" - - get_bufr( - store_positions=True, - bufr_out=self.bufr_root, - input_files=[input_file_path], - positions_filepath=self.positions_file_path, - station_configuration_mapping=dict(), - timestamps_pickle_filepath=self.timestamps_pickle_filepath, - positions_seed_path=None, - now_timestamp=now_timestamp, - ) + get_station_positions_mock.assert_called_once() + 
get_bufr_variables_mock.assert_not_called() + write_bufr_message_mock.assert_not_called() + self.assertTrue(positions_filepath.exists()) - process_station_mock.assert_called_once_with( - file_path=input_file_path, - output_path=expected_output_path, - now_timestamp=now_timestamp, - latest_timestamp=now_timestamp - datetime.timedelta(days=2), - time_limit="91d", - stid=stid, - station_configuration=expected_station_configuration, - ) + def test_update_timestamps_only( + self, + load_data_mock: mock.MagicMock, + get_latest_data_mock: mock.MagicMock, + write_bufr_message_mock: mock.MagicMock, + get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, + ): + pass - @mock.patch(MOCK_BASE_STR.format("process_station")) - def test_latest_timestamp(self, process_station_mock: mock.MagicMock): - stid = "STATION_ID" - now_timestamp = datetime.datetime(2022, 1, 5, 10, 21) - latest_timestamp = datetime.datetime(2022, 1, 5, 10, 0) - # Save latest timestamp to pickle file - with self.timestamps_pickle_filepath.open("wb") as fp: - pickle.dump({stid: latest_timestamp}, fp) - input_file_path = self.root_path / f"{stid}_hourly.csv" - - get_bufr( - store_positions=True, - bufr_out=self.bufr_root, - input_files=[input_file_path], - positions_filepath=self.positions_file_path, - station_configuration_mapping=dict(), - timestamps_pickle_filepath=self.timestamps_pickle_filepath, - positions_seed_path=None, - now_timestamp=now_timestamp, - ) + def test_cleans_up_when_on_exception( + self, + load_data_mock: mock.MagicMock, + get_latest_data_mock: mock.MagicMock, + write_bufr_message_mock: mock.MagicMock, + get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, + ): + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + positions_filepath = root_path / "positions.csv" + station_config = get_station_configuration( + export_bufr=True, + positions_update_timestamp_only=False, + ) + input_file = root_path / 
"input" / f"{station_config.stid}_hour.csv" + target_timestamp = mock.create_autospec(datetime.timedelta) + get_latest_data_mock.side_effect = Exception("Test exception") + + get_bufr( + input_files=[input_file], + station_configuration_mapping={station_config.stid: station_config}, + break_on_error=False, + bufr_out=mock.create_autospec(Path), + target_timestamp=target_timestamp, + positions_filepath=positions_filepath, + store_positions=True, + timestamps_pickle_filepath=None, + time_window_length=pd.to_timedelta("2d"), + ) - process_station_mock.assert_called_once_with( - file_path=input_file_path, - output_path=self.bufr_root / f"{stid}.bufr", - now_timestamp=now_timestamp, - latest_timestamp=latest_timestamp, - time_limit="91d", - stid=stid, - station_configuration=StationConfiguration(stid=stid), - ) + load_data_mock.assert_called_once_with(input_file, target_timestamp) + get_latest_data_mock.assert_called_once_with( + load_data_mock.return_value, + lin_reg_time_limit="91d", + vars_to_skip=station_config.skipped_variables, + ) + get_station_positions_mock.assert_not_called() + get_bufr_variables_mock.assert_not_called() + write_bufr_message_mock.assert_not_called() + self.assertTrue(positions_filepath.exists()) - @mock.patch(MOCK_BASE_STR.format("process_station")) - def test_update_timestamp_only(self, process_station_mock: mock.MagicMock): - stid = "STATION_ID" - # Prepare station config - station_config = StationConfiguration( - stid=stid, positions_update_timestamp_only=True - ) - config_mapping = {station_config.stid: station_config} - input_file_path = self.root_path / f"{stid}_hourly.csv" - seed_timestamp = datetime.datetime(2021, 10, 2, 10, 0) - now_timestamp = datetime.datetime(2023, 3, 3, 5, 0) - positions_seed = pd.DataFrame( - columns=["stid", "timestamp", "lat", "lon", "alt"], - data=[ - [stid, seed_timestamp, 65.0, -40.0, 800], - ], - ) - positions_seed.to_csv(self.positions_seed_path, index=False) - process_station_mock.return_value = { - 
"timestamp": now_timestamp, - # All position values should be ignored - "lat": None, - "lot": np.nan, - "alt": 2414.0, - } - # Only timestamp should be updated - expected_positions = positions_seed.copy() - expected_positions["timestamp"] = now_timestamp - - get_bufr( - store_positions=True, - bufr_out=self.bufr_root, - input_files=[input_file_path], - positions_filepath=self.positions_file_path, - station_configuration_mapping=config_mapping, - timestamps_pickle_filepath=self.timestamps_pickle_filepath, - positions_seed_path=self.positions_seed_path, - now_timestamp=now_timestamp, - ) + def test_multiple_stations( + self, + load_data_mock: mock.MagicMock, + get_latest_data_mock: mock.MagicMock, + write_bufr_message_mock: mock.MagicMock, + get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, + ): + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + positions_filepath = root_path / "positions.csv" + output_path = root_path / "bufr_out" + station_config1 = StationConfiguration(stid="station_01", export_bufr=True) + station_config2 = StationConfiguration(stid="station_02", export_bufr=True) + station_config3 = StationConfiguration(stid="station_03", export_bufr=False) + station_configs = [station_config1, station_config2, station_config3] + station_configuration_mapping = { + config.stid: config for config in station_configs + } + input_files = [ + root_path / "input" / f"{config.stid}_hour.csv" + for config in station_configs + ] + target_timestamp = pd.to_datetime("2024-03-01 00:12:00") + latest_timestamp = pd.to_datetime("2024-03-01 00:01:00") + get_latest_data_mock.return_value.name = latest_timestamp + station_positions = [ + dict( + timestamp=latest_timestamp, + lat=random.random() * 180 - 90, + lon=random.random() * 360 - 180, + alt=2000 * random.random(), + ) + for _ in range(3) + ] + get_station_positions_mock.side_effect = station_positions + + get_bufr( + input_files=input_files, + 
station_configuration_mapping=station_configuration_mapping, + break_on_error=True, + bufr_out=output_path, + target_timestamp=target_timestamp, + positions_filepath=positions_filepath, + store_positions=True, + timestamps_pickle_filepath=None, + time_window_length=pd.to_timedelta("2d"), + ) - positions = pd.read_csv(self.positions_file_path, parse_dates=["timestamp"]) - self.assertEqual(1, len(positions)) - pd.testing.assert_series_equal( - positions.iloc[0], - expected_positions.iloc[0], - ) + self.assertTrue(positions_filepath.exists()) + self.assertEqual(3, get_station_positions_mock.call_count) + self.assertEqual(2, write_bufr_message_mock.call_count) + written_positions = pd.read_csv( + positions_filepath, index_col=0, parse_dates=["timestamp"] + ) + self.assertSetEqual( + set(written_positions.index), + {config.stid for config in station_configs}, + ) - def test_position_seed(self): - """ - There are no data files available. get_bufr should use the position_seed for output positions. 
- """ - positions_seed = pd.DataFrame( - columns=["stid", "timestamp", "lat", "lon", "alt"], - data=[ - ["STATION_A", datetime.datetime(2021, 10, 2), 65.0, -40.0, 800], - ["STATION_B", datetime.datetime(2023, 11, 12), 66.0, -50.0, 1100], - ], - ) - positions_seed.to_csv(self.positions_seed_path, index=False) - - get_bufr( - store_positions=True, - bufr_out=self.bufr_root, - input_files=(), - positions_filepath=self.positions_file_path, - station_configuration_mapping=dict(), - timestamps_pickle_filepath=self.timestamps_pickle_filepath, - positions_seed_path=self.positions_seed_path, - now_timestamp=datetime.datetime.now(), - ) + def test_station_without_configuration( + self, + load_data_mock: mock.MagicMock, + get_latest_data_mock: mock.MagicMock, + write_bufr_message_mock: mock.MagicMock, + get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, + ): + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + positions_filepath = root_path / "positions.csv" + output_path = root_path / "bufr_out" + target_timestamp = datetime.datetime.now() + stid = "STATION_ID" + input_file_path = root_path / f"{stid}_hourly.csv" + get_station_positions_mock.return_value = dict( + timestamp=target_timestamp, + lat=78.52901, + lon=-56.8450358, + alt=1968.561, + ) - for p in self.root_path.glob("*"): - print(p) + get_bufr( + input_files=[input_file_path], + station_configuration_mapping={}, + break_on_error=True, + bufr_out=output_path, + target_timestamp=target_timestamp, + positions_filepath=positions_filepath, + store_positions=True, + timestamps_pickle_filepath=None, + time_window_length=pd.to_timedelta("2d"), + ) - positions = pd.read_csv(self.positions_file_path, parse_dates=["timestamp"]) - pd.testing.assert_frame_equal(positions, positions_seed) + get_latest_data_mock.assert_called_once() + get_station_positions_mock.assert_called_once() + get_bufr_variables_mock.assert_not_called() + written_positions = pd.read_csv( + 
positions_filepath, index_col=0, parse_dates=["timestamp"] + ) + self.assertDictEqual( + get_station_positions_mock.return_value, + dict(written_positions.loc[stid]), + ) diff --git a/tests/unit/bufr_export/test_get_bufr_integration.py b/tests/unit/bufr_export/test_get_bufr_integration.py index 1a21b3ee..f03a60a5 100644 --- a/tests/unit/bufr_export/test_get_bufr_integration.py +++ b/tests/unit/bufr_export/test_get_bufr_integration.py @@ -5,7 +5,6 @@ import datetime import logging import pickle -import shutil import sys from pathlib import Path from tempfile import TemporaryDirectory @@ -19,9 +18,7 @@ from pypromice.postprocess.bufr_utilities import read_bufr_message, BUFRVariables from pypromice.station_configuration import ( StationConfiguration, - write_station_configuration_mapping, ) -from tests.utilities import get_station_configuration logging.basicConfig( stream=sys.stdout, @@ -124,15 +121,15 @@ def test_get_bufr_has_new_data(self): stid = "DY2" # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"DY2": datetime.datetime(2023, 12, 1)} - now_timestamp = datetime.datetime(2023, 12, 8) + target_timestamp = datetime.datetime(2023, 12, 8) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) expected_bufr_variables = BUFRVariables( @@ -163,15 +160,15 @@ def test_get_bufr_has_new_data_dont_store_position(self): stid = "DY2" # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"DY2": datetime.datetime(2023, 12, 1)} - now_timestamp = datetime.datetime(2023, 12, 8) + target_timestamp = datetime.datetime(2023, 12, 8) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") bufr_data = run_get_bufr( l3_data=l3_src, - 
now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=False, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) expected_bufr_variables = BUFRVariables( @@ -202,17 +199,17 @@ def test_get_bufr_stid_to_skip(self): stid = "DY2" # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"DY2": datetime.datetime(2023, 12, 1)} - now_timestamp = datetime.datetime(2023, 12, 6) + target_timestamp = datetime.datetime(2023, 12, 6) mapping = self.get_station_configuration_mapping( stid, wmo_id="04464", export_bufr=False ) bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) self.assertIsNone(bufr_data) @@ -223,16 +220,16 @@ def test_get_bufr_has_no_data_newer_than_latests_timestamps(self): stid = "DY2" # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {stid: datetime.datetime(2023, 12, 7, 23, 00)} - now_timestamp = datetime.datetime(2023, 12, 8) + target_timestamp = datetime.datetime(2023, 12, 8) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) self.assertIsNone(bufr_data) @@ -242,15 +239,15 @@ def test_get_bufr_includes_datasets_not_in_latests_timestamps(self): l3_src = pd.read_csv(l3_src_filepath) stid = "DY2" latest_timestamps = {} - now_timestamp = datetime.datetime(2023, 12, 8) + target_timestamp = datetime.datetime(2023, 12, 8) mapping = self.get_station_configuration_mapping(stid, 
wmo_id="04464") bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) @@ -282,16 +279,16 @@ def test_get_bufr_has_old_data_compared_to_now(self): l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") l3_src = pd.read_csv(l3_src_filepath) latest_timestamps = {stid: datetime.datetime(2023, 12, 6)} - now_timestamp = datetime.datetime(2023, 12, 20) + target_timestamp = datetime.datetime(2023, 12, 20) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) self.assertIsNone(bufr_data) @@ -304,15 +301,15 @@ def test_invalid_value_at_last_index(self): # Set some of instantanous values to nan l3_src.loc[140:, "p_i"] = np.nan latest_timestamps = {stid: datetime.datetime(2023, 12, 1)} - now_timestamp = datetime.datetime(2023, 12, 8) + target_timestamp = datetime.datetime(2023, 12, 8) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) expected_bufr_variables = BUFRVariables( @@ -337,6 +334,27 @@ def test_invalid_value_at_last_index(self): expected_bufr_variables.as_series(), ) + def test_invalid_position_data(self): + stid = "DY2" + # Newest measurement in DY2_hour: 2023-12-07 23:00:00 + l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") + l3_src = 
pd.read_csv(l3_src_filepath) + # Set some of instantanous values to nan + l3_src.loc[:, "gps_lat"] = np.nan + latest_timestamps = {stid: datetime.datetime(2023, 12, 1)} + target_timestamp = datetime.datetime(2023, 12, 8) + mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") + bufr_data = run_get_bufr( + l3_data=l3_src, + target_timestamp=target_timestamp, + latest_timestamps=latest_timestamps, + stid=stid, + store_positions=True, + linear_regression_time_limit="91d", + station_configuration_mapping=mapping, + ) + self.assertIsNone(bufr_data) + def test_multiple_last_valid_indices_all_instantaneous_timestamps_are_none(self): stid = "DY2" # Newest measurement in DY2_hour: 2023-12-07 23:00:00 @@ -354,15 +372,15 @@ def test_multiple_last_valid_indices_all_instantaneous_timestamps_are_none(self) ], ] = np.nan latest_timestamps = {stid: datetime.datetime(2023, 12, 1)} - now_timestamp = datetime.datetime(2023, 12, 6) + target_timestamp = datetime.datetime(2023, 12, 6) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) @@ -376,16 +394,16 @@ def test_multiple_last_valid_indices_all_older_than_2days(self): # Set some of instantanous values to nan l3_src.loc[140:, "p_i"] = np.nan latest_timestamps = {stid: datetime.datetime(2023, 12, 1)} - now_timestamp = datetime.datetime(2023, 12, 10) + target_timestamp = datetime.datetime(2023, 12, 10) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", 
station_configuration_mapping=mapping, ) self.assertIsNone(bufr_data) @@ -397,15 +415,15 @@ def test_min_data_wx_failed(self): stid = "DY2" # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"DY2": datetime.datetime(2023, 12, 1)} - now_timestamp = datetime.datetime(2023, 12, 6) + target_timestamp = datetime.datetime(2023, 12, 6) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) @@ -418,15 +436,15 @@ def test_min_data_pos_failed(self): stid = "DY2" # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"DY2": datetime.datetime(2023, 12, 1)} - now_timestamp = datetime.datetime(2023, 12, 6) + target_timestamp = datetime.datetime(2023, 12, 6) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) self.assertIsNone(bufr_data) @@ -438,7 +456,7 @@ def test_ignore_newer_data_than_now_input(self): # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {stid: datetime.datetime(2023, 12, 1)} # New is before the latest data - now_timestamp = datetime.datetime( + target_timestamp = datetime.datetime( 2023, 12, 6, @@ -446,17 +464,17 @@ def test_ignore_newer_data_than_now_input(self): mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - 
time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) expected_bufr_variables = BUFRVariables( wmo_id="04464", station_type="mobile", - # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 but now_timestamp is 2023-12-06 + # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 but target_timestamp is 2023-12-06 timestamp=datetime.datetime(2023, 12, 6, 0, 0), relativeHumidity=82, airTemperature=250.85, @@ -481,17 +499,17 @@ def test_land_station_export(self): stid = "WEG_B" # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"WEG_B": datetime.datetime(2023, 12, 1)} - now_timestamp = datetime.datetime(2023, 12, 8) + target_timestamp = datetime.datetime(2023, 12, 8) mapping = self.get_station_configuration_mapping( stid, wmo_id="460", station_type="land" ) bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) expected_bufr_variables = BUFRVariables( diff --git a/tests/unit/bufr_export/test_realtime_utilitites.py b/tests/unit/bufr_export/test_realtime_utilitites.py index 1acdb5b7..557ffe5c 100644 --- a/tests/unit/bufr_export/test_realtime_utilitites.py +++ b/tests/unit/bufr_export/test_realtime_utilitites.py @@ -173,3 +173,35 @@ def test_auxiliary_input_data(self): ) self.assertEqual(expected_output, latest_data["auxiliary_data"]) + + def test_skipped_variables(self): + """ + Test that the variables in vars_to_skip are set to nan if they are present in the input data. 
+ """ + data = self.get_data() + expected_output = pd.Series( + data={ + "p_i": float("nan"), + "t_i": -16.7, + "rh_i": 84.6, + "wspd_i": 14.83, + "wdir_i": 142.2, + "gps_lat": 66.482469, + "gps_lon": -46.294232, + "gps_alt": 2116.0, + "z_boom_u": 4.1901, + "gps_lat_fit": 66.4824788, + "gps_lon_fit": -46.2942685, + "gps_alt_fit": 2121.4118, + "z_boom_u_smooth": 4.188, + }, + name=datetime.datetime(2023, 12, 7, 6), + ) + + latest_data = get_latest_data( + df=data, + lin_reg_time_limit="1w", + vars_to_skip=["p_i", "a_non_existing_variable"], + ) + + pd.testing.assert_series_equal(latest_data, expected_output, rtol=1e-8) diff --git a/tests/utilities.py b/tests/utilities.py index 742f6861..a56301d4 100644 --- a/tests/utilities.py +++ b/tests/utilities.py @@ -13,7 +13,7 @@ def get_station_configuration(**kwargs) -> StationConfiguration: Parameters ---------- - kwargs : dict + kwargs Keyword arguments to providie explicit values for the StationConfiguration object. Returns ------- From 805e6b365991389c20ed7e1c8d24ec793c7acf0a Mon Sep 17 00:00:00 2001 From: Mads Christian Lund Date: Wed, 7 Aug 2024 09:54:56 +0200 Subject: [PATCH 14/16] Minor cleanup --- src/pypromice/postprocess/get_bufr.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/src/pypromice/postprocess/get_bufr.py b/src/pypromice/postprocess/get_bufr.py index c08b6b95..2da014d8 100644 --- a/src/pypromice/postprocess/get_bufr.py +++ b/src/pypromice/postprocess/get_bufr.py @@ -18,7 +18,7 @@ import sys from datetime import datetime, timedelta from pathlib import Path -from typing import List, Dict, Optional, Collection, Sequence, Mapping, BinaryIO +from typing import List, Dict, Optional, Sequence, Mapping import numpy as np import pandas as pd @@ -121,9 +121,6 @@ def get_bufr( if target_timestamp is None: target_timestamp = datetime.utcnow() - # if earliest_timestamp is None: - # earliest_timestamp = now_timestamp - timedelta(days=2) - # Prepare (latest) positions positions = 
dict() if positions_seed_path: @@ -150,9 +147,6 @@ def get_bufr( # Setup diagnostic lists (logger.info at end) skipped = [] no_recent_data = [] - no_entry_latest_timestamps = [] - failed_min_data_wx = [] - failed_min_data_pos = [] # Iterate through csv files for file_path in input_files: @@ -186,6 +180,7 @@ def get_bufr( ) if latest_data is None: logger.info("No valid instantaneous timestamps!") + skipped.append(stid) continue # Create station positions @@ -209,6 +204,7 @@ def get_bufr( write_bufr_message(bufr_variables, output_file) else: logger.info(f"No new data {latest_data.name} <= {time_window_start}") + no_recent_data.append(stid) except Exception: logger.exception(f"Failed processing {stid}") @@ -216,6 +212,7 @@ def get_bufr( output_path.unlink() if break_on_error: raise + skipped.append(stid) continue # Write the most recent timestamps back to the pickle on disk @@ -235,12 +232,9 @@ def get_bufr( positions_df.to_csv(positions_filepath, index_label="stid") logger.info("--------------------------------") - not_processed_wx_pos = set(failed_min_data_wx + failed_min_data_pos) not_processed_count = ( len(skipped) + len(no_recent_data) - + len(no_entry_latest_timestamps) - + len(not_processed_wx_pos) ) logger.info( "BUFR exported for {} of {} fpaths.".format( @@ -250,9 +244,6 @@ def get_bufr( logger.info("") logger.info("skipped: {}".format(skipped)) logger.info("no_recent_data: {}".format(no_recent_data)) - logger.info("no_entry_latest_timestamps: {}".format(no_entry_latest_timestamps)) - logger.info("failed_min_data_wx: {}".format(failed_min_data_wx)) - logger.info("failed_min_data_pos: {}".format(failed_min_data_pos)) logger.info("--------------------------------") From f94a7ff178e6c411d724f77cb0109a4a6bf1e322 Mon Sep 17 00:00:00 2001 From: Mads Christian Lund Date: Wed, 7 Aug 2024 14:36:43 +0200 Subject: [PATCH 15/16] Updated StationConfiguration IO to handle unknown attributes from input --- .../postprocess/create_bufr_files.py | 7 ++- 
src/pypromice/postprocess/get_bufr.py | 3 +- src/pypromice/station_configuration.py | 25 +++++++-- tests/unit/test_station_config.py | 51 +++++++++++++++++++ 4 files changed, 81 insertions(+), 5 deletions(-) diff --git a/src/pypromice/postprocess/create_bufr_files.py b/src/pypromice/postprocess/create_bufr_files.py index 1b6b4b78..f542a8d0 100644 --- a/src/pypromice/postprocess/create_bufr_files.py +++ b/src/pypromice/postprocess/create_bufr_files.py @@ -28,11 +28,13 @@ def create_bufr_files( Generate hourly bufr files from the for all input files :param input_files: Paths to csv l3 hourly data files + :param station_configuration_root: Root directory containing station configuration toml files :param period_start: Datetime string for period start. Eg '2024-01-01T00:00' or '20240101 :param period_end: Datetime string for period end :param output_root: Output dir for both bufr files for individual stations and compiled. Organized in two sub directories. :param override: If False: Skip a period if the compiled output file exists. 
:param break_on_error: If True: Stop processing if an error occurs + :param output_filename_suffix: Suffix for the compiled output file :return: """ periods = pd.date_range(period_start, period_end, freq="H") @@ -41,7 +43,10 @@ def create_bufr_files( output_individual_root.mkdir(parents=True, exist_ok=True) output_compiled_root.mkdir(parents=True, exist_ok=True) - station_configuration_mapping = load_station_configuration_mapping(station_configuration_root) + station_configuration_mapping = load_station_configuration_mapping( + station_configuration_root, + skip_unexpected_fields=True, + ) for period in periods: period: pd.Timestamp diff --git a/src/pypromice/postprocess/get_bufr.py b/src/pypromice/postprocess/get_bufr.py index 2da014d8..48efa656 100644 --- a/src/pypromice/postprocess/get_bufr.py +++ b/src/pypromice/postprocess/get_bufr.py @@ -462,7 +462,8 @@ def main(): input_files += map(Path, glob.glob(path.as_posix())) station_configuration_mapping = load_station_configuration_mapping( - args.station_configurations_root + args.station_configurations_root, + skip_unexpected_fields=True, ) get_bufr( diff --git a/src/pypromice/station_configuration.py b/src/pypromice/station_configuration.py index fb8d5439..4ec4baec 100644 --- a/src/pypromice/station_configuration.py +++ b/src/pypromice/station_configuration.py @@ -1,3 +1,4 @@ +import logging from pathlib import Path from typing import Optional, Dict, Mapping, Sequence @@ -30,6 +31,7 @@ class StationConfiguration: sonic_ranger_from_gps: Optional[float] = None static_height_of_gps_from_mean_sea_level: Optional[float] = None station_relocation: Sequence[str] = attrs.field(factory=list) + location_type: Optional[str] = None # The station data will be exported to BUFR if True. 
Otherwise, it will only export latest position export_bufr: bool = False @@ -45,8 +47,22 @@ class StationConfiguration: positions_update_timestamp_only: bool = False @classmethod - def load_toml(cls, path): - return cls(**toml.load(path)) + def load_toml(cls, path, skip_unexpected_fields=False): + config_fields = {field.name for field in attrs.fields(cls)} + input_dict = toml.load(path) + unexpected_fields = set(input_dict.keys()) - config_fields + if unexpected_fields: + if skip_unexpected_fields: + logging.info( + f"Skipping unexpected fields in toml file {path}: " + + ", ".join(unexpected_fields) + ) + for field in unexpected_fields: + input_dict.pop(field) + else: + raise ValueError(f"Unexpected fields: {unexpected_fields}") + + return cls(**input_dict) def dump_toml(self, path: Path): with path.open("w") as fp: @@ -58,6 +74,7 @@ def as_dict(self) -> Dict: def load_station_configuration_mapping( configurations_root_dir: Path, + **kwargs, ) -> Mapping[str, StationConfiguration]: """ Load station configurations from toml files in configurations_root_dir @@ -66,6 +83,8 @@ def load_station_configuration_mapping( ---------- configurations_root_dir Root directory containing toml files + kwargs + Additional arguments to pass to StationConfiguration.load_toml Returns ------- @@ -73,7 +92,7 @@ def load_station_configuration_mapping( """ return { - config_file.stem: StationConfiguration(**toml.load(config_file)) + config_file.stem: StationConfiguration.load_toml(config_file, **kwargs) for config_file in configurations_root_dir.glob("*.toml") } diff --git a/tests/unit/test_station_config.py b/tests/unit/test_station_config.py index a2b117fd..4788d019 100644 --- a/tests/unit/test_station_config.py +++ b/tests/unit/test_station_config.py @@ -55,6 +55,57 @@ def test_read_toml(self): station_configuration, ) + def test_read_toml_with_unexpected_field(self): + with TemporaryDirectory() as temp_dir: + source_path = Path(temp_dir) / "UPE_L.toml" + source_str = """ + stid = 
"UPE_L" + station_site = "UPE_L" + project = "Promice" + station_type = "mobile" + wmo_id = "04423" + barometer_from_gps = -0.25 + anemometer_from_sonic_ranger = 0.4 + temperature_from_sonic_ranger = 0.0 + height_of_gps_from_station_ground = 0.9 + sonic_ranger_from_gps = 1.3 + export_bufr = true + skipped_variables = [] + positions_update_timestamp_only = false + an_unexpected_field = 42 + """ + with source_path.open("w") as source_io: + source_io.writelines(source_str) + + expected_configuration = StationConfiguration( + stid="UPE_L", + station_site="UPE_L", + project="Promice", + station_type="mobile", + wmo_id="04423", + barometer_from_gps=-0.25, + anemometer_from_sonic_ranger=0.4, + temperature_from_sonic_ranger=0.0, + height_of_gps_from_station_ground=0.9, + sonic_ranger_from_gps=1.3, + export_bufr=True, + comment=None, + skipped_variables=[], + positions_update_timestamp_only=False, + ) + + with self.assertRaises(ValueError): + StationConfiguration.load_toml(source_path) + + station_configuration = StationConfiguration.load_toml(source_path, skip_unexpected_fields=True) + + self.assertEqual( + expected_configuration, + station_configuration, + ) + + + def test_write_read(self): with TemporaryDirectory() as temp_dir: output_path = Path(temp_dir) / "UPE_L.toml" From 99659fa6c931b136a5209e2517fba615dd991c95 Mon Sep 17 00:00:00 2001 From: Mads Christian Lund Date: Mon, 12 Aug 2024 15:55:57 +0200 Subject: [PATCH 16/16] Updated docstring in create_bufr_files.py --- .../postprocess/create_bufr_files.py | 34 ++++++++++++------- src/pypromice/postprocess/get_bufr.py | 5 +-- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/src/pypromice/postprocess/create_bufr_files.py b/src/pypromice/postprocess/create_bufr_files.py index f542a8d0..a6cb7842 100644 --- a/src/pypromice/postprocess/create_bufr_files.py +++ b/src/pypromice/postprocess/create_bufr_files.py @@ -27,15 +27,25 @@ def create_bufr_files( """ Generate hourly bufr files from the for all input files 
- :param input_files: Paths to csv l3 hourly data files - :param station_configuration_root: Root directory containing station configuration toml files - :param period_start: Datetime string for period start. Eg '2024-01-01T00:00' or '20240101 - :param period_end: Datetime string for period end - :param output_root: Output dir for both bufr files for individual stations and compiled. Organized in two sub directories. - :param override: If False: Skip a period if the compiled output file exists. - :param break_on_error: If True: Stop processing if an error occurs - :param output_filename_suffix: Suffix for the compiled output file - :return: + Parameters + ---------- + input_files + Paths to csv l3 hourly data files + station_configuration_root + Root directory containing station configuration toml files + period_start + Datetime string for period start. Eg '2024-01-01T00:00' or '20240101' + period_end + Datetime string for period end + output_root + Output dir for both bufr files for individual stations and compiled. Organized in two sub directories. + override + If False: Skip a period if the compiled output file exists. + break_on_error + If True: Stop processing if an error occurs + output_filename_suffix + Suffix for the compiled output file + """ periods = pd.date_range(period_start, period_end, freq="H") output_individual_root = output_root / "individual" @@ -82,6 +92,7 @@ def create_bufr_files( # %% + def main(): import argparse import glob @@ -101,9 +112,7 @@ def main(): main_logger.addHandler(main_handler) main_logger.setLevel(logging.INFO) - parser = argparse.ArgumentParser( - "Create BUFR files from L3 tx .csv files." 
- ) + parser = argparse.ArgumentParser("Create BUFR files from L3 tx .csv files.") parser.add_argument( "--input_files", "--l3-filepath", @@ -164,5 +173,6 @@ def main(): station_configuration_root=args.station_configuration_root, ) + if __name__ == "__main__": main() diff --git a/src/pypromice/postprocess/get_bufr.py b/src/pypromice/postprocess/get_bufr.py index 48efa656..c59553e6 100644 --- a/src/pypromice/postprocess/get_bufr.py +++ b/src/pypromice/postprocess/get_bufr.py @@ -232,10 +232,7 @@ def get_bufr( positions_df.to_csv(positions_filepath, index_label="stid") logger.info("--------------------------------") - not_processed_count = ( - len(skipped) - + len(no_recent_data) - ) + not_processed_count = len(skipped) + len(no_recent_data) logger.info( "BUFR exported for {} of {} fpaths.".format( (len(input_files) - not_processed_count), len(input_files)