diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml index 14aa27e8..e0ebc350 100644 --- a/.github/workflows/unit_test.yml +++ b/.github/workflows/unit_test.yml @@ -4,7 +4,7 @@ on: workflow_dispatch: jobs: - build: + test: name: unit_test runs-on: ubuntu-latest strategy: @@ -19,6 +19,9 @@ jobs: uses: actions/checkout@v3 with: token: ${{ secrets.GITHUB_TOKEN }} + - name: Install eccodes + run : | + sudo apt-get install -y libeccodes-dev - name: Install dependencies shell: bash run: | @@ -30,4 +33,4 @@ jobs: - name: Run unit tests shell: bash run: | - python3 -m unittest discover tests.e2e + python3 -m unittest discover tests diff --git a/setup.py b/setup.py index 52a9b216..08b72656 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ package_data={ "pypromice.tx": ["payload_formats.csv", "payload_types.csv"], "pypromice.qc.percentiles": ["thresholds.csv"], - "pypromice.postprocess": ["station_configurations.toml", "positions_seed.csv"], + "pypromice.postprocess": ["positions_seed.csv"], }, install_requires=['numpy~=1.23', 'pandas>=1.5.0', 'xarray>=2022.6.0', 'toml', 'scipy>=1.9.0', 'Bottleneck', 'netcdf4', 'pyDataverse==0.3.1', 'eccodes', 'scikit-learn>=1.1.0'], # extras_require={'postprocess': ['eccodes','scikit-learn>=1.1.0']}, @@ -47,6 +47,8 @@ 'get_l2tol3 = pypromice.process.get_l2tol3:main', 'get_watsontx = pypromice.tx.get_watsontx:get_watsontx', 'get_bufr = pypromice.postprocess.get_bufr:main', + 'create_bufr_files = pypromice.postprocess.create_bufr_files:main', + 'bufr_to_csv = pypromice.postprocess.bufr_to_csv:main', 'get_msg = pypromice.tx.get_msg:get_msg' ], }, diff --git a/src/pypromice/postprocess/bufr_to_csv.py b/src/pypromice/postprocess/bufr_to_csv.py index 788aef39..d80f99a3 100644 --- a/src/pypromice/postprocess/bufr_to_csv.py +++ b/src/pypromice/postprocess/bufr_to_csv.py @@ -3,9 +3,14 @@ from pypromice.postprocess.bufr_utilities import read_bufr_file -if __name__ == "__main__": + +def main(): parser = 
argparse.ArgumentParser("BUFR to CSV converter") parser.add_argument("path", type=Path) args = parser.parse_args() print(read_bufr_file(args.path).to_csv()) + + +if __name__ == "__main__": + main() diff --git a/src/pypromice/postprocess/bufr_utilities.py b/src/pypromice/postprocess/bufr_utilities.py index d53f2733..8537e7f2 100644 --- a/src/pypromice/postprocess/bufr_utilities.py +++ b/src/pypromice/postprocess/bufr_utilities.py @@ -45,6 +45,7 @@ def round(value: float): return round + # Enforce precision # Note the sensor accuracies listed here: # https://essd.copernicus.org/articles/13/3819/2021/#section8 @@ -64,28 +65,82 @@ class BUFRVariables: * heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD: Corresponds to "#7#heightOfSensorAboveLocalGroundOrDeckOfMarinePlatform" which is height if anemometer relative to ground or deck of marine platform. """ - wmo_id: str + + # Station type: "mobile" or "land" + # =============================== + # Fixed land station schema: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/307080 + # Mobile station schema: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/307090 + station_type: str + + # WMO station identifier + # Land stations: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/301090 + # Mobile stations: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/301092 + # ====================================================================================================== + wmo_id: str timestamp: datetime.datetime - relativeHumidity: float = attrs.field(converter=round_converter(0)) - airTemperature: float = attrs.field(converter=round_converter(1)) - pressure: float = attrs.field(converter=round_converter(1)) - windDirection: float = attrs.field(converter=round_converter(0)) - windSpeed: float = attrs.field(converter=round_converter(1)) - latitude: float = attrs.field(converter=round_converter(6)) - longitude: float = 
attrs.field(converter=round_converter(6)) + + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/005001 + # Scale: 5, unit: degrees + # TODO: Test if eccodes does the rounding as well. The rounding is was 6 which is larger that the scale. + latitude: float = attrs.field(converter=round_converter(5)) + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/006001 + # Scale: 5, unit: degrees + longitude: float = attrs.field(converter=round_converter(5)) + + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/007030 + # Scale: 1, unit: m heightOfStationGroundAboveMeanSeaLevel: float = attrs.field( - converter=round_converter(2) + converter=round_converter(1) ) - # + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/007031 + # Scale: 1, unit: m heightOfBarometerAboveMeanSeaLevel: float = attrs.field( - converter=round_converter(2), + converter=round_converter(1), ) + + # Pressure information + # ==================== + # Definition table: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/302031 + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/007004 + # Scale: -1, unit: Pa + pressure: float = attrs.field(converter=round_converter(-1)) + # There are two other pressure variables in the template: 302001 and 010062. + + # Basic synoptic "instantaneous" data + # =================================== + # Definition table: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/302035 + # This section only include the temperature and humidity data (302032). + # Precipitation and cloud data are currently ignored. + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/007032 + # Scale: 2, unit: m + # This is the first appearance of this variable id. 
heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH: float = attrs.field( - converter=round_converter(4), + converter=round_converter(2), ) + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/012101 + # Scale: 2, unit: K + airTemperature: float = attrs.field(converter=round_converter(2)) + # There is also a Dewpoint temperature in this template: 012103 which is currently unused. + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/012103 + # Scale: 0, unit: % + relativeHumidity: float = attrs.field(converter=round_converter(0)) + + # Basic synoptic "period" data + # ============================ + # Definition table: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/302043 + # Wind data: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/302042 + # Wind direction: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/011001 + # Scale: 0, unit: degrees + windDirection: float = attrs.field(converter=round_converter(0)) + # Wind speed: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/011002 + # Scale: 1, unit: m/s + windSpeed: float = attrs.field(converter=round_converter(1)) + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/007032 + # Scale: 2, unit: m + # This is the 7th appearance of this variable id. 
heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD: float = attrs.field( - converter=round_converter(4) + converter=round_converter(2) ) def as_series(self) -> pd.Series: @@ -129,6 +184,7 @@ def __eq__(self, other: "BUFRVariables"): BUFR_TEMPLATES = { "mobile": { + # Template definition: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/307090 "unexpandedDescriptors": (307090), # message template, "synopMobil" "edition": 4, # latest edition "masterTableNumber": 0, @@ -144,6 +200,7 @@ def __eq__(self, other: "BUFRVariables"): "compressedData": 0, }, "land": { + # Template definition: https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_D/307080 "unexpandedDescriptors": (307080), # message template, "synopLand" "edition": 4, # latest edition "masterTableNumber": 0, @@ -246,6 +303,11 @@ def set_station(ibufr, station_type: str, wmo_id: str): elif station_type == "land": # StationNumber for land stations are integeres wmo_id_int = int(wmo_id) + if wmo_id_int >= 1024: + raise ValueError( + f"Invalid WMO ID {wmo_id}. Land station number must be less than 1024." 
+ "See https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/001002" + ) station_config = dict(stationNumber=wmo_id_int) else: raise Exception(f"Unsupported station station type {station_type}") @@ -485,5 +547,6 @@ def read_bufr_file(path: PathLike) -> pd.DataFrame: message_vars = read_bufr_message(fp) if message_vars is None: break - lines.append(message_vars) - return pd.DataFrame(lines).rename_axis("message_index") + lines.append(message_vars.as_series()) + data_frame = pd.DataFrame(lines).set_index("wmo_id") + return data_frame diff --git a/src/pypromice/postprocess/create_bufr_files.py b/src/pypromice/postprocess/create_bufr_files.py new file mode 100644 index 00000000..a6cb7842 --- /dev/null +++ b/src/pypromice/postprocess/create_bufr_files.py @@ -0,0 +1,178 @@ +import logging +from pathlib import Path +from typing import Sequence, List + +import pandas as pd +from pypromice.station_configuration import load_station_configuration_mapping + +from pypromice.postprocess.get_bufr import ( + get_bufr, + DEFAULT_LIN_REG_TIME_LIMIT, + DEFAULT_POSITION_SEED_PATH, +) + +main_logger = logging.getLogger(__name__) + + +def create_bufr_files( + input_files: Sequence[Path], + station_configuration_root: Path, + period_start: str, + period_end: str, + output_root: Path, + override: bool, + break_on_error: bool = False, + output_filename_suffix: str = "geus_", +): + """ + Generate hourly bufr files from the for all input files + + Parameters + ---------- + input_files + Paths to csv l3 hourly data files + station_configuration_root + Root directory containing station configuration toml files + period_start + Datetime string for period start. Eg '2024-01-01T00:00' or '20240101 + period_end + Datetime string for period end + output_root + Output dir for both bufr files for individual stations and compiled. Organized in two sub directories. + override + If False: Skip a period if the compiled output file exists. 
+ break_on_error + If True: Stop processing if an error occurs + output_filename_suffix + Suffix for the compiled output file + + """ + periods = pd.date_range(period_start, period_end, freq="H") + output_individual_root = output_root / "individual" + output_compiled_root = output_root / "compiled" + output_individual_root.mkdir(parents=True, exist_ok=True) + output_compiled_root.mkdir(parents=True, exist_ok=True) + + station_configuration_mapping = load_station_configuration_mapping( + station_configuration_root, + skip_unexpected_fields=True, + ) + + for period in periods: + period: pd.Timestamp + date_str = period.strftime("%Y%m%dT%H%M") + main_logger.info(f"Processing {date_str}") + output_dir_path = output_individual_root / f"{date_str}" + output_file_path = ( + output_compiled_root / f"{output_filename_suffix}{date_str}.bufr" + ) + + main_logger.info(f"{period}, {date_str}") + if override or not output_file_path.exists(): + get_bufr( + bufr_out=output_dir_path, + input_files=input_files, + store_positions=False, + positions_filepath=None, + linear_regression_time_limit=DEFAULT_LIN_REG_TIME_LIMIT, + timestamps_pickle_filepath=None, + target_timestamp=period, + station_configuration_mapping=station_configuration_mapping, + positions_seed_path=DEFAULT_POSITION_SEED_PATH, + break_on_error=break_on_error, + ) + + with output_file_path.open("wb") as fp_dst: + for src_path in output_dir_path.glob("*.bufr"): + with src_path.open("rb") as fp_src: + fp_dst.write(fp_src.read()) + else: + main_logger.info(f"Output file exists. 
Skipping {output_file_path}") + + +# %% + + +def main(): + import argparse + import glob + import sys + + logger_format_string = "%(asctime)s; %(levelname)s; %(name)s; %(message)s" + logging.basicConfig( + level=logging.ERROR, + stream=sys.stdout, + format=logger_format_string, + ) + + main_handler = logging.StreamHandler(sys.stdout) + main_handler.setLevel(logging.INFO) + formatter = logging.Formatter(logger_format_string) + main_handler.setFormatter(formatter) + main_logger.addHandler(main_handler) + main_logger.setLevel(logging.INFO) + + parser = argparse.ArgumentParser("Create BUFR files from L3 tx .csv files.") + parser.add_argument( + "--input_files", + "--l3-filepath", + "-i", + type=Path, + nargs="+", + required=True, + help="Path to L3 tx .csv files. Can be direct paths or glob patterns", + ) + parser.add_argument( + "--period_start", + "-s", + required=True, + help="Datetime string for period start. Eg '2024-01-01T00:00' or '20240101", + ) + parser.add_argument( + "--period_end", "-e", required=True, help="Datetime string for period end" + ) + parser.add_argument( + "--output_root", + "-o", + required=True, + type=Path, + help="Output dir for both bufr files for individual stations and compiled. 
Organized in two sub directories.", + ) + parser.add_argument( + "--station_configuration_root", + "-c", + required=True, + type=Path, + help="Root directory containing station configuration toml files", + ) + parser.add_argument( + "--override", + "-f", + default=False, + action="store_true", + help="Recreate and overide existing output files", + ) + args = parser.parse_args() + + # Interpret all input file paths as glob patterns if they don't exist + input_files: List[Path] = list() + for path in args.input_files: + if path.exists(): + input_files.append(path) + else: + # The input path might be a glob pattern + input_files += map(Path, glob.glob(path.as_posix())) + + main_logger.info(f"Processing {len(input_files)} input files") + create_bufr_files( + input_files=input_files, + period_start=args.period_start, + period_end=args.period_end, + output_root=args.output_root, + override=args.override, + station_configuration_root=args.station_configuration_root, + ) + + +if __name__ == "__main__": + main() diff --git a/src/pypromice/postprocess/get_bufr.py b/src/pypromice/postprocess/get_bufr.py index 6b7bc217..c59553e6 100644 --- a/src/pypromice/postprocess/get_bufr.py +++ b/src/pypromice/postprocess/get_bufr.py @@ -1,11 +1,16 @@ -#!/usr/bin/env python - """ Command-line script for running BUFR file generation Post-processing functions for AWS station data, such as converting PROMICE and GC-Net data files to WMO-compliant BUFR files """ +__all__ = [ + "get_bufr", + "main", + "DEFAULT_POSITION_SEED_PATH", + "DEFAULT_LIN_REG_TIME_LIMIT", +] + import argparse import glob import logging @@ -13,258 +18,45 @@ import sys from datetime import datetime, timedelta from pathlib import Path -from typing import List, Dict, Mapping, Optional, Collection, Sequence, Union, TextIO +from typing import List, Dict, Optional, Sequence, Mapping -import attrs import numpy as np import pandas as pd -import toml from pypromice.postprocess.bufr_utilities import write_bufr_message, 
BUFRVariables from pypromice.postprocess.real_time_utilities import get_latest_data -logger = logging.getLogger(__name__) -DEFAULT_STATION_CONFIGURATION_PATH = Path(__file__).parent.joinpath( - "station_configurations.toml" +from pypromice.station_configuration import ( + StationConfiguration, + load_station_configuration_mapping, ) -DEFAULT_POSITION_SEED_PATH = Path(__file__).parent.joinpath("positions_seed.csv") -DEFAULT_LIN_REG_TIME_LIMIT = "91d" - -def parse_arguments_bufr() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser() - - parser.add_argument( - "--store_positions", - "--positions", - action="store_true", - required=False, - default=False, - help="If included (True), make a positions dict and output AWS_latest_locations.csv file.", - ) - - parser.add_argument( - "--positions-filepath", - "-p", - type=Path, - required=False, - help="Path to write AWS_latest_locations.csv file.", - ) - - parser.add_argument( - "--time-limit", - default=DEFAULT_LIN_REG_TIME_LIMIT, - type=str, - required=False, - help="Previous time to limit dataframe before applying linear regression.", - ) - - parser.add_argument( - "--input_files", - "--l3-filepath", - "-i", - type=Path, - nargs="+", - required=True, - help="Path to L3 tx .csv files. 
Can be direct paths or glob patterns", - ) - - parser.add_argument( - "--bufr-out", - "-o", - type=Path, - required=True, - help="Path to the BUFR out directory.", - ) - - parser.add_argument( - "--timestamps-pickle-filepath", - type=Path, - required=False, - help="Path to the latest_timestamps.pickle file.", - ) - - parser.add_argument( - "--station_configuration_mapping", - default=DEFAULT_STATION_CONFIGURATION_PATH, - type=Path, - required=False, - help="Path to csv file with station meta data and BUFR export configuration", - ) - - parser.add_argument( - "--position_seed", - default=DEFAULT_POSITION_SEED_PATH, - type=Path, - required=False, - help="Path to csv file with seed values for output positions.", - ) - - parser.add_argument( - '--latest_timestamp', - default=datetime.utcnow(), - type=pd.Timestamp, - help="Timestamp used to determine latest data. Default utcnow." - ) - - parser.add_argument("--verbose", "-v", default=False, action="store_true") - - return parser - - -@attrs.define -class StationConfiguration: - """ - Helper class for storing station specific configurations with respect to - - * Installation specific distance measurements such as height differences between instruments - * Reference strings such as stid, station_site and wmo_id - * BUFR export specific parameters - - # TODO: The station related meta data should be fetched from a station specific configuration files in the future or - # from header data in data source. - """ - stid: str - station_site: str = None - project: Optional[str] = None - station_type: Optional[str] = None - wmo_id: Optional[str] = None - barometer_from_gps: Optional[float] = None - anemometer_from_sonic_ranger: Optional[float] = None - temperature_from_sonic_ranger: Optional[float] = None - height_of_gps_from_station_ground: Optional[float] = None - sonic_ranger_from_gps: Optional[float] = None - - # The station data will be exported to BUFR if True. 
Otherwise, it will only export latest position - export_bufr: bool = False - comment: Optional[str] = None - - # skip specific variables for stations - # If a variable has known bad data, use this collection to skip the variable - # Note that if a station is not reporting both air temp and pressure it will be skipped, - # as currently implemented in csv2bufr.min_data_check(). - # ['p_i'], # EXAMPLE - skipped_variables: List[str] = attrs.field(factory=list) - - positions_update_timestamp_only: bool = False - - def as_dict(self) -> Dict: - return attrs.asdict(self) - - -def load_station_configuration_mapping( - fp: Union[str, Path, TextIO] -) -> Mapping[str, StationConfiguration]: - """ - Read station configurations from toml file - - Parameters - ---------- - fp : - Path to or open toml file - - Returns - ------- - Mapping from stid to StationConfiguration - - """ - return { - stid: StationConfiguration(**config_dict) - for stid, config_dict in toml.load(fp).items() - } - - -def write_station_configuration_mapping( - config_mapping: Mapping[str, StationConfiguration], fp: TextIO -): - """ - Write station configuration to toml file - - Parameters - ---------- - config_mapping - Mapping from stid to StationConfiguration - fp - open writable TextIO - """ - config_mapping = { - config.stid: config.as_dict() for config in config_mapping.values() - } - toml.dump(config_mapping, fp) - - -def process_station( - file_path: Path, - output_path: Path, - now_timestamp: datetime, - latest_timestamp: Optional[datetime], - time_limit: str, - stid: str, - station_configuration: StationConfiguration, -) -> Optional[Dict]: - df = load_data(file_path, now_timestamp) - - # Select current data - latest_data = get_latest_data( - df, - lin_reg_time_limit=time_limit, - ) - - if latest_data is None: - logger.info("No valid instantaneous timestamps!") - return None - - latest_data = filter_skipped_variables( - latest_data, vars_to_skip=station_configuration.skipped_variables - ) - - # Check 
that we have minimum required valid data - sufficient_wx_data, sufficient_position_data = min_data_check(latest_data) - - station_position = dict() - station_position["timestamp"] = latest_data.name - if sufficient_position_data: - station_position["lon"] = latest_data.get("gps_lon_fit") - station_position["lat"] = latest_data.get("gps_lat_fit") - station_position["alt"] = latest_data.get("gps_alt_fit") - else: - logger.warning("Insufficient position data") - # Don't use any position attributes from latest_data - station_position["lon"] = None - station_position["lat"] = None - station_position["alt"] = None - return station_position - - if station_configuration.export_bufr: - if not sufficient_wx_data: - logger.warning(f"Failed min data wx {stid}") - return station_position - - # Store current timest - if latest_data.name <= latest_timestamp: - logger.info(f"No new data {latest_data.name} <= {latest_timestamp}") - return station_position - - # Construct and export BUFR file - bufr_variables = get_bufr_variables( - data=latest_data, - station_configuration=station_configuration, - ) - with output_path.open("bw") as fp: - write_bufr_message(variables=bufr_variables, file=fp) +logger = logging.getLogger(__name__) - return station_position +DEFAULT_POSITION_SEED_PATH = Path(__file__).parent.joinpath("positions_seed.csv") +DEFAULT_LIN_REG_TIME_LIMIT = "91d" +REQUIRED_KEYS = ( + "t_i", + "p_i", + "rh_i", + "wdir_i", + "wspd_i", + "gps_lat_fit", + "gps_lon_fit", + "gps_alt_fit", + "z_boom_u_smooth", +) -def load_data(file_path: Path, now_timestamp: datetime) -> pd.DataFrame: +def load_data(file_path: Path, latest_timestamp: datetime) -> pd.DataFrame: """ - Read AWS data from csv file using time as index and filter all rows after now_timestamp + Read AWS data from csv file using time as index and filter all rows after latest_timestamp Parameters ---------- file_path - now_timestamp + latest_timestamp Returns ------- @@ -276,7 +68,7 @@ def load_data(file_path: Path, 
now_timestamp: datetime) -> pd.DataFrame: .set_index("time") .sort_index() ) - df = df[:now_timestamp] + df = df[:latest_timestamp] return df @@ -285,12 +77,13 @@ def get_bufr( input_files: Sequence[Path], positions_filepath: Optional[Path], timestamps_pickle_filepath: Optional[Path], - station_configuration_path: Optional[Path], - now_timestamp: Optional[datetime] = None, + station_configuration_mapping: Mapping[str, StationConfiguration], + target_timestamp: Optional[datetime] = None, positions_seed_path: Optional[Path] = None, - earliest_timestamp: datetime = None, + time_window_length: timedelta = timedelta(days=2), store_positions: bool = False, - time_limit: str = "91d", + linear_regression_time_limit: str = "91d", + break_on_error: bool = False, ): """ Main function for generating BUFR files and determine latest positions from a sequence of csv files @@ -304,48 +97,42 @@ def get_bufr( bufr_out Path to the BUFR out directory. input_files - List of L3 csv file paths. + List of csv file paths. positions_filepath Path to write latest positions. Used to retrieve a static set of positions to register stations with DMI/WMO timestamps_pickle_filepath Path to pickle file used for storing latest timestamp - station_configuration_path - Path to toml file with configuration entries for each station - now_timestamp - get_bufr will export the latest data before now_timestamp. Default datetime.utcnow() + station_configuration_mapping + Mapping of station id to StationConfiguration object + target_timestamp + get_bufr will export the latest data before target_timestamp. Default datetime.utcnow() positions_seed_path Path to csv file with position data used as default values for the output position. - earliest_timestamp - The earliest allowed timestamp for data to be included in the output. Default now_timestamp - 2 days + time_window_length + The length of the time window to consider for the latest data. 
Default 2 days store_positions Flag determine if latest positions are exported. - time_limit + linear_regression_time_limit Previous time to limit dataframe before applying linear regression. + break_on_error + If True, the function will raise an exception if an error occurs during processing. """ - if now_timestamp is None: - now_timestamp = datetime.utcnow() - - if earliest_timestamp is None: - earliest_timestamp = now_timestamp - timedelta(days=2) + if target_timestamp is None: + target_timestamp = datetime.utcnow() # Prepare (latest) positions positions = dict() if positions_seed_path: positions_seed = pd.read_csv( - positions_seed_path, index_col=0, delimiter=",", parse_dates=["timestamp"] + positions_seed_path, + index_col="stid", + delimiter=",", + parse_dates=["timestamp"], ).to_dict(orient="index") logger.info(f"Seed positions for {positions_seed.keys()}") positions.update(positions_seed) - # Prepare station configurations - if station_configuration_path is None: - station_configuration_mapping = dict() - else: - station_configuration_mapping = load_station_configuration_mapping( - station_configuration_path - ) - # Prepare bufr output dir bufr_out.mkdir(parents=True, exist_ok=True) @@ -357,18 +144,13 @@ def get_bufr( logger.info("latest_timestamps.pickle not found!") latest_timestamps = {} - # Initiate a new dict for current timestamps - current_timestamps = {} - # Setup diagnostic lists (logger.info at end) skipped = [] no_recent_data = [] - no_entry_latest_timestamps = [] - failed_min_data_wx = [] - failed_min_data_pos = [] # Iterate through csv files for file_path in input_files: + # TODO: This split is explicitly requiring the filename to have sampleate at suffix. 
This shuld be more robust stid = file_path.stem.rsplit("_", 1)[0] logger.info("####### Processing {} #######".format(stid)) @@ -381,40 +163,63 @@ def get_bufr( output_path = bufr_out / f"{stid}.bufr" logger.info(f"Generating {output_path} from {file_path}") - latest_timestamp = latest_timestamps.get(stid, earliest_timestamp) - latest_timestamp = max(earliest_timestamp, latest_timestamp) + + time_window_start = target_timestamp - time_window_length + # Use only newer data than the latest timestamp + if stid in latest_timestamps: + time_window_start = max(latest_timestamps[stid], time_window_start) try: - station_position = process_station( - file_path=file_path, - output_path=output_path, - now_timestamp=now_timestamp, - latest_timestamp=latest_timestamp, - time_limit=time_limit, - stid=stid, - station_configuration=station_configuration, - ) - except Exception: - logger.exception(f"Failed processing {stid}") - continue + input_data = load_data(file_path, target_timestamp) - if station_position is None: - logger.warning(f"No position information available for {stid}") + # Select current data + latest_data = get_latest_data( + input_data, + lin_reg_time_limit=linear_regression_time_limit, + vars_to_skip=station_configuration.skipped_variables, + ) + if latest_data is None: + logger.info("No valid instantaneous timestamps!") + skipped.append(stid) + continue - else: + # Create station positions + station_position = get_station_positions(latest_data) if stid not in positions: positions[stid] = dict() - if station_configuration.positions_update_timestamp_only: positions[stid]["timestamp"] = station_position["timestamp"] else: positions[stid].update(station_position) + # Create BUFR File + if ( + station_configuration.export_bufr + and latest_data.name > time_window_start + ): + latest_timestamps[stid] = latest_data.name + bufr_variables = get_bufr_variables(latest_data, station_configuration) + if bufr_variables: + with output_path.open("bw") as output_file: + 
write_bufr_message(bufr_variables, output_file) + else: + logger.info(f"No new data {latest_data.name} <= {time_window_start}") + no_recent_data.append(stid) + + except Exception: + logger.exception(f"Failed processing {stid}") + if output_path.exists(): + output_path.unlink() + if break_on_error: + raise + skipped.append(stid) + continue + # Write the most recent timestamps back to the pickle on disk logger.info(f"writing latest_timestamps to {timestamps_pickle_filepath}") if timestamps_pickle_filepath: with timestamps_pickle_filepath.open("wb") as handle: - pickle.dump(current_timestamps, handle, protocol=pickle.HIGHEST_PROTOCOL) + pickle.dump(latest_timestamps, handle, protocol=pickle.HIGHEST_PROTOCOL) if store_positions: positions_df = pd.DataFrame.from_dict( @@ -427,13 +232,7 @@ def get_bufr( positions_df.to_csv(positions_filepath, index_label="stid") logger.info("--------------------------------") - not_processed_wx_pos = set(failed_min_data_wx + failed_min_data_pos) - not_processed_count = ( - len(skipped) - + len(no_recent_data) - + len(no_entry_latest_timestamps) - + len(not_processed_wx_pos) - ) + not_processed_count = len(skipped) + len(no_recent_data) logger.info( "BUFR exported for {} of {} fpaths.".format( (len(input_files) - not_processed_count), len(input_files) @@ -442,47 +241,46 @@ def get_bufr( logger.info("") logger.info("skipped: {}".format(skipped)) logger.info("no_recent_data: {}".format(no_recent_data)) - logger.info("no_entry_latest_timestamps: {}".format(no_entry_latest_timestamps)) - logger.info("failed_min_data_wx: {}".format(failed_min_data_wx)) - logger.info("failed_min_data_pos: {}".format(failed_min_data_pos)) logger.info("--------------------------------") -def filter_skipped_variables( - row: pd.Series, vars_to_skip: Collection[str] -) -> pd.Series: - """ - Mutate input series by setting var_to_skip to np.nan - - Parameters - ---------- - row - vars_to_skip - List of variable names to be skipped - - Returns - ------- - Input series 
- - """ - vars_to_skip = set(row.keys()) & set(vars_to_skip) - for var_key in vars_to_skip: - row[var_key] = np.nan - logger.info("----> Skipping var: {}".format(var_key)) - return row +def get_station_positions(latest_data: pd.Series) -> Dict: + station_position = dict() + station_position["timestamp"] = latest_data.name + station_position["lat"] = latest_data["gps_lat_fit"] + station_position["lon"] = latest_data["gps_lon_fit"] + station_position["alt"] = latest_data["gps_alt_fit"] + if any( + [ + pd.isna(station_position["lat"]), + pd.isna(station_position["lon"]), + pd.isna(station_position["alt"]), + ] + ): + logger.warning("Insufficient position data") + station_position["lat"] = None + station_position["lon"] = None + station_position["alt"] = None + return station_position def get_bufr_variables( data: pd.Series, station_configuration: StationConfiguration, -) -> BUFRVariables: +) -> Optional[BUFRVariables]: """ - Helper function for converting our variables to the variables needed for bufr export. + Helper function for converting our variables to the variables needed for bufr export. + + Raises AttributeError if station_configuration don't have the minimum dimension fields since they are required to determine barometer heights. + * height_of_gps_from_station_ground + * barometer_from_gps + + Parameters ---------- data - Series with processed l3 variables from get_latest_datas + Series with processed variables from get_latest_datas station_configuration @@ -491,30 +289,62 @@ def get_bufr_variables( BUFRVariables used by bufr_utilities """ - heightOfStationGroundAboveMeanSeaLevel = np.nan - if isinstance(station_configuration.height_of_gps_from_station_ground, float): - heightOfStationGroundAboveMeanSeaLevel = ( - data["gps_alt_fit"] - station_configuration.height_of_gps_from_station_ground + + if not all(key in data.index for key in REQUIRED_KEYS): + raise ValueError( + f"Failed to process BUFRVariables. 
Missing required keys: {REQUIRED_KEYS}" ) - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = np.nan - if isinstance(station_configuration.temperature_from_sonic_ranger, float): - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = ( - data["z_boom_u_smooth"]+ station_configuration.temperature_from_sonic_ranger + # Check that we have minimum required fields to proceed with writing to BUFR + # Always require minimum a valid air temp or a valid pressure. + # If both air temp and pressure are nan, do not submit. + # This will allow the case of having only one or the other. + if data[["t_i", "p_i"]].isna().all(): + logger.warning("Failed to process BUFRVariables - insufficient data") + return None + + # Always require a valid position data + if data[["gps_lat_fit", "gps_lon_fit", "gps_alt_fit"]].isna().any(): + logger.warning("Failed to process BUFRVariables - insufficient position data") + return None + + if station_configuration.height_of_gps_from_station_ground is None: + raise AttributeError( + "height_of_gps_from_station_ground is required for BUFR export" ) + if station_configuration.barometer_from_gps is None: + raise AttributeError("barometer_from_gps is required for BUFR export") - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = np.nan - if isinstance(station_configuration.anemometer_from_sonic_ranger, float): - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = ( - data["z_boom_u_smooth"] + station_configuration.anemometer_from_sonic_ranger + if station_configuration.static_height_of_gps_from_mean_sea_level is None: + height_of_gps_above_mean_sea_level = data["gps_alt_fit"] + else: + height_of_gps_above_mean_sea_level = ( + station_configuration.static_height_of_gps_from_mean_sea_level ) - heightOfBarometerAboveMeanSeaLevel = np.nan - if isinstance(station_configuration.barometer_from_gps, float): - heightOfBarometerAboveMeanSeaLevel = ( - data["gps_alt_fit"] + station_configuration.barometer_from_gps + 
heightOfStationGroundAboveMeanSeaLevel = ( + height_of_gps_above_mean_sea_level + - station_configuration.height_of_gps_from_station_ground + ) + + heightOfBarometerAboveMeanSeaLevel = ( + height_of_gps_above_mean_sea_level + station_configuration.barometer_from_gps + ) + + if station_configuration.temperature_from_sonic_ranger is None: + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = np.nan + else: + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = ( + data["z_boom_u_smooth"] + + station_configuration.temperature_from_sonic_ranger ) + if station_configuration.anemometer_from_sonic_ranger is None: + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = np.nan + else: + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = ( + data["z_boom_u_smooth"] + station_configuration.anemometer_from_sonic_ranger + ) output_row = BUFRVariables( wmo_id=station_configuration.wmo_id, @@ -540,60 +370,75 @@ def get_bufr_variables( return output_row -def min_data_check(s): - """Check that we have minimum required fields to proceed with writing to BUFR - For wx vars, we currently require both air temp and pressure to be non-NaN. - If you know a specific var is reporting bad data, you can ignore just that var - using the vars_to_skip dict in wmo_config. - - Parameters - ---------- - s : pandas series - The current obset we are working with (for BUFR submission) - - Returns - ------- - min_data_wx_result : bool - True (default), the test for min wx data passed. False, the test failed. - min_data_pos_result : bool - True (default), the test for min position data passed. False, the test failed. - """ - min_data_wx_result = True - min_data_pos_result = True - - # Can use pd.isna() or math.isnan() below... 
- - # Always require valid air temp and valid pressure (both must be non-nan) - # if (pd.isna(s['t_i']) is False) and (pd.isna(s['p_i']) is False): - # pass - # else: - # print('----> Failed min_data_check for air temp and pressure!') - # min_data_wx_result = False - - # If both air temp and pressure are nan, do not submit. - # This will allow the case of having only one or the other. - if (pd.isna(s["t_i"]) is True) and (pd.isna(s["p_i"]) is True): - logger.warning("----> Failed min_data_check for air temp and pressure!") - min_data_wx_result = False - - # Missing just elevation OK - # if (pd.isna(s['gps_lat_fit']) is False) and (pd.isna(s['gps_lon_fit']) is False): - # pass - # Require all three: lat, lon, elev - if ( - (pd.isna(s["gps_lat_fit"]) is False) - and (pd.isna(s["gps_lon_fit"]) is False) - and (pd.isna(s["gps_alt_fit"]) is False) - ): - pass - else: - logger.warning("----> Failed min_data_check for position!") - min_data_pos_result = False - - return min_data_wx_result, min_data_pos_result - def main(): - args = parse_arguments_bufr().parse_args() + parser = argparse.ArgumentParser() + parser.add_argument( + "--store_positions", + "--positions", + action="store_true", + required=False, + default=False, + help="If included (True), make a positions dict and output AWS_latest_locations.csv file.", + ) + parser.add_argument( + "--positions-filepath", + "-p", + type=Path, + required=False, + help="Path to write AWS_latest_locations.csv file.", + ) + parser.add_argument( + "--linear_regression_time_limit", + "--time-limit", + default=DEFAULT_LIN_REG_TIME_LIMIT, + type=str, + required=False, + help="Previous time to limit dataframe before applying linear regression.", + ) + parser.add_argument( + "--input_files", + "-i", + type=Path, + nargs="+", + required=True, + help="Path to input files .csv files. 
Can be direct paths or glob patterns", + ) + parser.add_argument( + "--bufr-out", + "-o", + type=Path, + required=True, + help="Path to the BUFR out directory.", + ) + parser.add_argument( + "--timestamps-pickle-filepath", + type=Path, + required=False, + help="Path to the latest_timestamps.pickle file.", + ) + parser.add_argument( + "--station_configurations_root", + type=Path, + required=True, + help="Path to root directory containing station configuration toml files", + ) + parser.add_argument( + "--position_seed", + default=DEFAULT_POSITION_SEED_PATH, + type=Path, + required=False, + help="Path to csv file with seed values for output positions.", + ) + parser.add_argument( + "--target_timestamp", + "--now-timestamp", + default=datetime.utcnow(), + type=pd.Timestamp, + help="Timestamp used to determine latest data. Default utcnow.", + ) + parser.add_argument("--verbose", "-v", default=False, action="store_true") + + args = parser.parse_args() log_level = logging.INFO if args.verbose: @@ -613,17 +458,23 @@ def main(): # The input path might be a glob pattern input_files += map(Path, glob.glob(path.as_posix())) + station_configuration_mapping = load_station_configuration_mapping( + args.station_configurations_root, + skip_unexpected_fields=True, + ) + get_bufr( bufr_out=args.bufr_out, input_files=input_files, store_positions=args.store_positions, positions_filepath=args.positions_filepath, - time_limit=args.time_limit, + linear_regression_time_limit=args.linear_regression_time_limit, timestamps_pickle_filepath=args.timestamps_pickle_filepath, - now_timestamp=args.latest_timestamp, - station_configuration_path=args.station_configuration_mapping, + target_timestamp=args.target_timestamp, + station_configuration_mapping=station_configuration_mapping, positions_seed_path=args.position_seed, ) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/pypromice/postprocess/real_time_utilities.py 
b/src/pypromice/postprocess/real_time_utilities.py index 952a69d8..f79f9ca0 100644 --- a/src/pypromice/postprocess/real_time_utilities.py +++ b/src/pypromice/postprocess/real_time_utilities.py @@ -7,7 +7,7 @@ """ import logging -from typing import Optional +from typing import Optional, Collection import numpy as np import pandas as pd @@ -22,6 +22,7 @@ def get_latest_data( df: pd.DataFrame, lin_reg_time_limit: str, + vars_to_skip: Optional[Collection[str]] = None, ) -> Optional[pd.Series]: """ Determine instantaneous values for the latest valid timestamp in the input dataframe @@ -66,16 +67,47 @@ def get_latest_data( lin_reg_time_limit, ) + if last_valid_index not in df_limited.index: + logger.info("No valid data limited period") + return None + # Apply smoothing to z_boom_u # require at least 2 hourly obs? Sometimes seeing once/day data for z_boom_u - df_limited = rolling_window(df_limited, "z_boom_u", "72H", 2, 1) + df_limited = rolling_window(df_limited, "z_boom_u", "72H", 2, 3) # limit to single most recent valid row (convert to series) s_current = df_limited.loc[last_valid_index] + if vars_to_skip is not None: + s_current = filter_skipped_variables(s_current, vars_to_skip) + return s_current +def filter_skipped_variables( + row: pd.Series, vars_to_skip: Collection[str] +) -> pd.Series: + """ + Mutate input series by setting var_to_skip to np.nan + + Parameters + ---------- + row + vars_to_skip + List of variable names to be skipped + + Returns + ------- + Input series + + """ + vars_to_skip = set(row.keys()) & set(vars_to_skip) + for var_key in vars_to_skip: + row[var_key] = np.nan + logger.info("----> Skipping var: {}".format(var_key)) + return row + + def rolling_window(df, column, window, min_periods, decimals) -> pd.DataFrame: """Apply a rolling window (smoothing) to the input column @@ -145,9 +177,9 @@ def find_positions(df, time_limit): logger.info(f"last transmission: {df_limited.index.max()}") # Extrapolate recommended for altitude, optional for lat 
and lon. - df_limited, lat_valid = linear_fit(df_limited, "gps_lat", 6) - df_limited, lon_valid = linear_fit(df_limited, "gps_lon", 6) - df_limited, alt_valid = linear_fit(df_limited, "gps_alt", 1) + df_limited, lat_valid = linear_fit(df_limited, "gps_lat", 7) + df_limited, lon_valid = linear_fit(df_limited, "gps_lon", 7) + df_limited, alt_valid = linear_fit(df_limited, "gps_alt", 4) # If we have no valid lat, lon or alt data in the df_limited window, then interpolate # using full tx dataset. @@ -158,17 +190,15 @@ def find_positions(df, time_limit): logger.info(f"----> Using full history for linear extrapolation: {k}") logger.info(f"first transmission: {df.index.min()}") if k == "gps_alt": - df, valid = linear_fit(df, k, 1) + df, valid = linear_fit(df, k, 2) else: - df, valid = linear_fit(df, k, 6) + df, valid = linear_fit(df, k, 7) check_valid_again[k] = valid if check_valid_again[k] is True: df_limited[f"{k}_fit"] = df.loc[df_limited.index, f"{k}_fit"] else: logger.info(f"----> No data exists for {k}. 
Stubbing out with NaN.") - df_limited[f"{k}_fit"] = pd.Series( - np.nan, index=df_limited.index - ) + df_limited[f"{k}_fit"] = pd.Series(np.nan, index=df_limited.index) return df_limited diff --git a/src/pypromice/postprocess/station_configurations.toml b/src/pypromice/postprocess/station_configurations.toml deleted file mode 100644 index 99bca21e..00000000 --- a/src/pypromice/postprocess/station_configurations.toml +++ /dev/null @@ -1,762 +0,0 @@ -[CEN2] -stid = "CEN2" -station_site = "CEN" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04407" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[CP1] -stid = "CP1" -station_site = "CP1" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04442" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[DY2] -stid = "DY2" -station_site = "DY2" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04464" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[EGP] -stid = "EGP" -station_site = "EGP" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04451" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[HUM] -stid = "HUM" -station_site = "HUM" -project = "GC-Net" -station_type = "mobile" 
-wmo_id = "04432" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[NAE] -stid = "NAE" -station_site = "NAE" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04420" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[NAU] -stid = "NAU" -station_site = "NAU" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04443" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[NEM] -stid = "NEM" -station_site = "NEM" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04436" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[NSE] -stid = "NSE" -station_site = "NSE" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04488" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[SDL] -stid = "SDL" -station_site = "SDL" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04485" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 
-sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[SDM] -stid = "SDM" -station_site = "SDM" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04492" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[TUN] -stid = "TUN" -station_site = "TUN" -project = "GC-Net" -station_type = "mobile" -wmo_id = "04425" -barometer_from_gps = 0.55 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.4 -height_of_gps_from_station_ground = 1.5 -sonic_ranger_from_gps = 0.15 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[KAN_M] -stid = "KAN_M" -station_site = "KAN_M" -project = "Promice" -station_type = "mobile" -wmo_id = "04411" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[KAN_U] -stid = "KAN_U" -station_site = "KAN_U" -project = "Promice" -station_type = "mobile" -wmo_id = "04409" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[LYN_L] -stid = "LYN_L" -station_site = "LYN_L" -project = "Disko" -station_type = "mobile" -wmo_id = "04450" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[LYN_T] -stid = "LYN_T" -station_site = 
"LYN_T" -project = "Disko" -station_type = "mobile" -wmo_id = "04429" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[MIT] -stid = "MIT" -station_site = "MIT" -project = "Promice" -station_type = "mobile" -wmo_id = "04430" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[NUK_K] -stid = "NUK_K" -station_site = "NUK_K" -project = "Promice" -station_type = "mobile" -wmo_id = "04437" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[NUK_L] -stid = "NUK_L" -station_site = "NUK_L" -project = "Promice" -station_type = "mobile" -wmo_id = "04403" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[SCO_L] -stid = "SCO_L" -station_site = "SCO_L" -project = "Promice" -station_type = "mobile" -wmo_id = "04413" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[SCO_U] -stid = "SCO_U" -station_site = "SCO_U" -project = "Promice" -station_type = "mobile" -wmo_id = "04421" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 
-temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[TAS_A] -stid = "TAS_A" -station_site = "TAS_A" -project = "Promice" -station_type = "mobile" -wmo_id = "04408" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[TAS_L] -stid = "TAS_L" -station_site = "TAS_L" -project = "Promice" -station_type = "mobile" -wmo_id = "04404" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[THU_L] -stid = "THU_L" -station_site = "THU_L" -project = "Promice" -station_type = "mobile" -wmo_id = "04424" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[THU_L2] -stid = "THU_L2" -station_site = "THU_L2" -project = "Promice" -station_type = "mobile" -wmo_id = "04453" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[UPE_L] -stid = "UPE_L" -station_site = "UPE_L" -project = "Promice" -station_type = "mobile" -wmo_id = "04423" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true 
-skipped_variables = [] -positions_update_timestamp_only = false - -[UPE_U] -stid = "UPE_U" -station_site = "UPE_U" -project = "Promice" -station_type = "mobile" -wmo_id = "04422" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[KAN_Lv3] -stid = "KAN_Lv3" -station_site = "KAN_L" -project = "Promice" -station_type = "mobile" -wmo_id = "04412" -barometer_from_gps = 1.3 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[NUK_Uv3] -stid = "NUK_Uv3" -station_site = "NUK_U" -project = "Promice" -station_type = "mobile" -wmo_id = "04439" -barometer_from_gps = 1.3 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[QAS_Lv3] -stid = "QAS_Lv3" -station_site = "QAS_L" -project = "Promice" -station_type = "mobile" -wmo_id = "04401" -barometer_from_gps = 1.3 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[QAS_Mv3] -stid = "QAS_Mv3" -station_site = "QAS_M" -project = "Promice" -station_type = "mobile" -wmo_id = "04441" -barometer_from_gps = 1.3 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[QAS_Uv3] -stid = "QAS_Uv3" -station_site = "QAS_U" -project = 
"Promice" -station_type = "mobile" -wmo_id = "04402" -barometer_from_gps = 1.3 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[ZAK_Lv3] -stid = "ZAK_Lv3" -station_site = "ZAK_L" -project = "GlacioBasis" -station_type = "mobile" -wmo_id = "04461" -barometer_from_gps = 1.3 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[ZAK_Uv3] -stid = "ZAK_Uv3" -station_site = "ZAK_U" -project = "GlacioBasis" -station_type = "mobile" -wmo_id = "04462" -barometer_from_gps = 1.3 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[WEG_B] -stid = "WEG_B" -station_site = "NUK_U" -project = "Wegener" -station_type = "land" -wmo_id = "460" -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false - -[KAN_B] -stid = "KAN_B" -station_site = "KAN_B" -project = "Promice" -station_type = "land" -wmo_id = "445" -export_bufr = false -comment = "no_instantaneous" -skipped_variables = [] -positions_update_timestamp_only = true - -[CEN1] -stid = "CEN1" -station_site = "CEN1" -project = "GC-Net" -export_bufr = false -comment = "discontinued" -skipped_variables = [] -positions_update_timestamp_only = false - -[JAR_O] -stid = "JAR_O" -station_site = "JAR" -project = "GC-Net" -wmo_id = "04452" -export_bufr = false -comment = "discontinued" -skipped_variables = [] -positions_update_timestamp_only = false - -[KAN_L] -stid = "KAN_L" -station_site = "KAN_L" -project = "Promice" -wmo_id = "04412" -export_bufr = false -comment = "use_v3" 
-skipped_variables = [] -positions_update_timestamp_only = false - -[KPC_Lv3] -stid = "KPC_Lv3" -station_site = "KPC_L" -project = "Promice" -station_type = "mobile" -wmo_id = "04428" -export_bufr = false -comment = "v3_bad" -skipped_variables = [] -positions_update_timestamp_only = false - -[NUK_N] -stid = "NUK_N" -station_site = "NUK_N" -export_bufr = false -comment = "discontinued" -skipped_variables = [] -positions_update_timestamp_only = false - -[NUK_U] -stid = "NUK_U" -station_site = "NUK_U" -project = "Promice" -wmo_id = "04439" -export_bufr = false -comment = "use_v3" -skipped_variables = [] -positions_update_timestamp_only = false - -[QAS_A] -stid = "QAS_A" -station_site = "QAS_A" -project = "Promice" -export_bufr = false -comment = "discontinued" -skipped_variables = [] -positions_update_timestamp_only = false - -[QAS_L] -stid = "QAS_L" -station_site = "QAS_L" -project = "Promice" -wmo_id = "04401" -export_bufr = false -comment = "use_v3" -skipped_variables = [] -positions_update_timestamp_only = false - -[QAS_M] -stid = "QAS_M" -station_site = "QAS_M" -project = "Promice" -wmo_id = "04441" -export_bufr = false -comment = "use_v3" -skipped_variables = [] -positions_update_timestamp_only = false - -[QAS_U] -stid = "QAS_U" -station_site = "QAS_U" -project = "Promice" -wmo_id = "04402" -export_bufr = false -comment = "use_v3" -skipped_variables = [] -positions_update_timestamp_only = false - -[SWC_O] -stid = "SWC_O" -station_site = "SWC" -project = "GC-Net" -wmo_id = "04458" -export_bufr = false -comment = "discontinued" -skipped_variables = [] -positions_update_timestamp_only = false - -[TAS_U] -stid = "TAS_U" -station_site = "TAS_U" -project = "Promice" -export_bufr = false -comment = "discontinued" -skipped_variables = [] -positions_update_timestamp_only = false - -[THU_U] -stid = "THU_U" -station_site = "THU_U" -project = "Promice" -station_type = "mobile" -wmo_id = "04454" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 
-temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = false -comment = "discontinued" -skipped_variables = [] -positions_update_timestamp_only = false - -[UWN] -stid = "UWN" -station_site = "UWN" -export_bufr = false -comment = "not_registered" -skipped_variables = [] -positions_update_timestamp_only = false - -[WEG_L] -stid = "WEG_L" -station_site = "WEG_L" -project = "Wegener" -export_bufr = false -comment = "not_registered" -skipped_variables = [] -positions_update_timestamp_only = false - -[XXX] -stid = "XXX" -station_site = "XXX" -export_bufr = false -comment = "test" -skipped_variables = [] -positions_update_timestamp_only = false - -[ZAK_A] -stid = "ZAK_A" -station_site = "ZAK_A" -project = "GlacioBasis" -export_bufr = false -comment = "not_registered" -skipped_variables = [] -positions_update_timestamp_only = false - -[ZAK_L] -stid = "ZAK_L" -station_site = "ZAK_L" -project = "GlacioBasis" -wmo_id = "04461" -export_bufr = false -comment = "use_v3,no_instantaneous" -skipped_variables = [] -positions_update_timestamp_only = false - -[ZAK_U] -stid = "ZAK_U" -station_site = "ZAK_U" -project = "GlacioBasis" -wmo_id = "04462" -export_bufr = false -comment = "use_v3,no_instantaneous" -skipped_variables = [] -positions_update_timestamp_only = false - -[KPC_Uv3] -stid = "KPC_Uv3" -station_site = "KPC_U" -project = "Promice" -station_type = "mobile" -wmo_id = "04427" -export_bufr = false -comment = "v3_bad" -skipped_variables = [] -positions_update_timestamp_only = false - -[KPC_L] -stid = "KPC_L" -station_site = "KPC_L" -project = "Promice" -wmo_id = "04428" -export_bufr = false -comment = "use_v3" -skipped_variables = [] -positions_update_timestamp_only = false - -[KPC_U] -stid = "KPC_U" -station_site = "KPC_U" -project = "Promice" -wmo_id = "04427" -export_bufr = false -comment = "use_v3" -skipped_variables = [] -positions_update_timestamp_only = false - -[THU_U2] -stid = "THU_U2" -station_site = 
"THU_U" -project = "Promice" -station_type = "mobile" -wmo_id = "04454" -barometer_from_gps = -0.25 -anemometer_from_sonic_ranger = 0.4 -temperature_from_sonic_ranger = 0.0 -height_of_gps_from_station_ground = 0.9 -sonic_ranger_from_gps = 1.3 -export_bufr = true -skipped_variables = [] -positions_update_timestamp_only = false diff --git a/src/pypromice/station_configuration.py b/src/pypromice/station_configuration.py new file mode 100644 index 00000000..4ec4baec --- /dev/null +++ b/src/pypromice/station_configuration.py @@ -0,0 +1,118 @@ +import logging +from pathlib import Path +from typing import Optional, Dict, Mapping, Sequence + +import attrs +import toml + + +@attrs.define +class StationConfiguration: + """ + Helper class for storing station specific configurations with respect to + + * Installation specific distance measurements such as height differences between instruments + * Reference strings such as stid, station_site and wmo_id + * BUFR export specific parameters + + # TODO: The station related meta data should be fetched from a station specific configuration files in the future or + # from header data in data source. + """ + + stid: str + station_site: str = None + project: Optional[str] = None + station_type: Optional[str] = None + wmo_id: Optional[str] = None + barometer_from_gps: Optional[float] = None + anemometer_from_sonic_ranger: Optional[float] = None + temperature_from_sonic_ranger: Optional[float] = None + height_of_gps_from_station_ground: Optional[float] = None + sonic_ranger_from_gps: Optional[float] = None + static_height_of_gps_from_mean_sea_level: Optional[float] = None + station_relocation: Sequence[str] = attrs.field(factory=list) + location_type: Optional[str] = None + + # The station data will be exported to BUFR if True. 
Otherwise, it will only export latest position + export_bufr: bool = False + comment: Optional[str] = None + + # skip specific variables for stations + # If a variable has known bad data, use this collection to skip the variable + # Note that if a station is not reporting both air temp and pressure it will be skipped, + # as currently implemented in csv2bufr.min_data_check(). + # ['p_i'], # EXAMPLE + skipped_variables: Sequence[str] = attrs.field(factory=list) + + positions_update_timestamp_only: bool = False + + @classmethod + def load_toml(cls, path, skip_unexpected_fields=False): + config_fields = {field.name for field in attrs.fields(cls)} + input_dict = toml.load(path) + unexpected_fields = set(input_dict.keys()) - config_fields + if unexpected_fields: + if skip_unexpected_fields: + logging.info( + f"Skipping unexpected fields in toml file {path}: " + + ", ".join(unexpected_fields) + ) + for field in unexpected_fields: + input_dict.pop(field) + else: + raise ValueError(f"Unexpected fields: {unexpected_fields}") + + return cls(**input_dict) + + def dump_toml(self, path: Path): + with path.open("w") as fp: + toml.dump(self.as_dict(), fp) + + def as_dict(self) -> Dict: + return attrs.asdict(self) + + +def load_station_configuration_mapping( + configurations_root_dir: Path, + **kwargs, +) -> Mapping[str, StationConfiguration]: + """ + Load station configurations from toml files in configurations_root_dir + + Parameters + ---------- + configurations_root_dir + Root directory containing toml files + kwargs + Additional arguments to pass to StationConfiguration.load_toml + + Returns + ------- + Mapping from stid to StationConfiguration + + """ + return { + config_file.stem: StationConfiguration.load_toml(config_file, **kwargs) + for config_file in configurations_root_dir.glob("*.toml") + } + + +def write_station_configuration_mapping( + station_configurations: Mapping[str, StationConfiguration], + configurations_root_dir: Path, +) -> None: + """ + Write station 
configurations to toml files in configurations_root_dir + + Parameters + ---------- + station_configurations + Mapping from stid to StationConfiguration + configurations_root_dir + Output directory + + """ + configurations_root_dir.mkdir(parents=True, exist_ok=True) + for stid, station_configuration in station_configurations.items(): + with (configurations_root_dir / f"{stid}.toml").open("w") as fp: + toml.dump(station_configuration.as_dict(), fp) diff --git a/tests/unit/bufr_export/test_bufr_utilitites.py b/tests/unit/bufr_export/test_bufr_utilitites.py index 2b9a19b1..bd9ec586 100644 --- a/tests/unit/bufr_export/test_bufr_utilitites.py +++ b/tests/unit/bufr_export/test_bufr_utilitites.py @@ -181,3 +181,37 @@ def test_nan_value_serialization(self): variables_src, variables_read, ) + + def test_precision(self): + """ + Test if the BUFRVariable rounding configurations aligns with the BUFR format. + + Use np.random.random() to generate high precision random values. + """ + variables_src = BUFRVariables( + wmo_id="04464", + station_type="mobile", + timestamp=datetime.datetime(2023, 12, 19, 10, 0), + relativeHumidity=np.random.random(), + airTemperature=np.random.random(), + pressure=1000 * np.random.random(), + windDirection=np.random.random(), + windSpeed=np.random.random(), + latitude=np.random.random(), + longitude=np.random.random(), + heightOfStationGroundAboveMeanSeaLevel=np.random.random(), + heightOfBarometerAboveMeanSeaLevel=np.random.random(), + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=np.random.random(), + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=np.random.random(), + ) + with tempfile.TemporaryFile("w+b") as fp: + write_bufr_message(variables=variables_src, file=fp) + fp.seek(0) + variables_read = read_bufr_message( + fp=fp, + ) + + self.assertEqual( + variables_src, + variables_read, + ) diff --git a/tests/unit/bufr_export/test_create_bufr_files.py b/tests/unit/bufr_export/test_create_bufr_files.py new file mode 100644 
index 00000000..1b79b421 --- /dev/null +++ b/tests/unit/bufr_export/test_create_bufr_files.py @@ -0,0 +1,230 @@ +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Optional +from unittest import TestCase + +import toml +from pypromice.station_configuration import write_station_configuration_mapping + +from pypromice.postprocess.create_bufr_files import create_bufr_files +from tests.utilities import get_station_configuration + +DATA_DIR = Path(__file__).parent.absolute() + + +def create_data_file(path: Path, src_path: Optional[Path] = None): + if src_path is None: + src_path = Path("/dev/null") + + path.parent.mkdir(exist_ok=True, parents=True) + with src_path.open() as fp_src: + with path.open("w") as fp_out: + fp_out.write(fp_src.read()) + + +class TestCreateBufrFiles(TestCase): + def setUp(self): + self._temp_dir = TemporaryDirectory() + self.temp_dir = Path(self._temp_dir.name) + + def tearDown(self): + self._temp_dir.cleanup() + + def test_create_bufr_files(self): + """ + Test the creation of bufr files and their output folder structure. + It does not test the content of the bufr files. 
+ """ + input_dir = self.temp_dir / "input" + output_dir = self.temp_dir / "output" + input_files = [ + input_dir / "THU_L2_hourly.csv", + input_dir / "KAN_Lv3_hourly.csv", + ] + # Use the same data for all input files + for input_file in input_files: + create_data_file( + input_file, + src_path=DATA_DIR.joinpath("tx_l3_test1.csv"), + ) + + station_configuration_root = self.temp_dir / "station_configuration" + station_configuration_root.mkdir(parents=True, exist_ok=True) + station_configuration_mapping = { + "THU_L2": get_station_configuration(stid="THU_L2", export_bufr=True), + "KAN_Lv3": get_station_configuration(stid="KAN_Lv3", export_bufr=True), + } + write_station_configuration_mapping( + station_configurations=station_configuration_mapping, + configurations_root_dir=station_configuration_root, + ) + + create_bufr_files( + input_files=input_files, + period_start="2023-12-06T00:00", + period_end="2023-12-06T04:00", + output_root=output_dir, + override=True, + break_on_error=True, + station_configuration_root=station_configuration_root, + ) + + compiled_output_dir = output_dir / "compiled" + individual_output_root = output_dir / "individual" + self.assertTrue(compiled_output_dir.exists()) + self.assertTrue(individual_output_root.exists()) + expected_output_timestamps = [ + "20231206T0000", + "20231206T0100", + "20231206T0200", + "20231206T0300", + "20231206T0400", + ] + compiled_output_files = sorted(compiled_output_dir.glob("*.bufr")) + expected_output_file_names = sorted( + [ + f"geus_{timestamp_str}.bufr" + for timestamp_str in expected_output_timestamps + ] + ) + self.assertListEqual( + expected_output_file_names, [p.name for p in compiled_output_files] + ) + individual_output_dirs = sorted(individual_output_root.glob("*")) + self.assertListEqual( + expected_output_timestamps, [p.stem for p in individual_output_dirs] + ) + for dir in individual_output_dirs: + # There should be a bufr file for each station + self.assertTrue((dir / "THU_L2.bufr").exists()) + 
self.assertTrue((dir / "KAN_Lv3.bufr").exists()) + + def test_get_bufr_from_empty_data_file_raises_error(self): + input_dir = self.temp_dir / "input" + output_dir = self.temp_dir / "output" + input_file = input_dir / "THU_L2_hourly.csv" + create_data_file(input_file, src_path=None) + station_configuration_root = self.temp_dir / "station_configuration" + station_configuration = get_station_configuration( + stid="KAN_Lv3", export_bufr=True + ) + write_station_configuration_mapping( + station_configurations={station_configuration.stid: station_configuration}, + configurations_root_dir=station_configuration_root, + ) + + with self.assertRaises(ValueError): + create_bufr_files( + input_files=[input_file], + period_start="2023-12-06T00:00", + period_end="2023-12-06T04:00", + output_root=output_dir, + override=True, + break_on_error=True, + station_configuration_root=station_configuration_root, + ) + + def test_get_bufr_continues_when_break_on_error_is_false(self): + input_dir = self.temp_dir / "input" + output_dir = self.temp_dir / "output" + input_file_without_data = input_dir / "THU_L2_hourly.csv" + create_data_file(input_file_without_data, src_path=None) + input_file_with_data = input_dir / "KAN_Lv3_hourly.csv" + create_data_file( + input_file_with_data, src_path=DATA_DIR.joinpath("tx_l3_test1.csv") + ) + compiled_output_dir = output_dir / "compiled" + individual_output_root = output_dir / "individual" + station_configuration_root = self.temp_dir / "station_configuration" + write_station_configuration_mapping( + station_configurations={ + "THU_L2": get_station_configuration(stid="THU_L2", export_bufr=True), + "KAN_Lv3": get_station_configuration(stid="KAN_Lv3", export_bufr=True), + }, + configurations_root_dir=station_configuration_root, + ) + expected_compiled_output_file = compiled_output_dir / "geus_20231206T0000.bufr" + expected_individual_output_dir = individual_output_root / "20231206T0000" + expected_individual_output_file = ( + expected_individual_output_dir / 
"KAN_Lv3.bufr" + ) + + create_bufr_files( + input_files=[ + input_file_without_data, + input_file_with_data, + ], + period_start="2023-12-06T00:00", + period_end="2023-12-06T00:00", + output_root=output_dir, + override=True, + break_on_error=False, + station_configuration_root=station_configuration_root, + ) + + self.assertTrue(expected_compiled_output_file.exists()) + # There should only be a single output file since the first input file is empty + self.assertEqual(1, len(list(expected_individual_output_dir.glob("*")))) + self.assertTrue(expected_individual_output_file.exists()) + individual_data = expected_individual_output_file.read_bytes() + compiled_data = expected_compiled_output_file.read_bytes() + self.assertEqual( + individual_data, + compiled_data, + ) + + def test_get_bufr_where_period_does_not_exist(self): + input_dir = self.temp_dir / "input" + output_dir = self.temp_dir / "output" + input_file = input_dir / "THU_L2_hourly.csv" + create_data_file(input_file, src_path=DATA_DIR.joinpath("tx_l3_test1.csv")) + station_configuration_root = self.temp_dir / "station_configuration" + station_configuration = get_station_configuration( + stid="THU_L2", export_bufr=True + ) + write_station_configuration_mapping( + station_configurations={station_configuration.stid: station_configuration}, + configurations_root_dir=station_configuration_root, + ) + + create_bufr_files( + input_files=[input_file], + period_start="2025-12-06T00:00", + period_end="2025-12-06T04:00", + output_root=output_dir, + override=True, + break_on_error=True, + station_configuration_root=station_configuration_root, + ) + + compiled_output_dir = output_dir / "compiled" + individual_output_root = output_dir / "individual" + self.assertTrue(compiled_output_dir.exists()) + self.assertTrue(individual_output_root.exists()) + expected_output_timestamps = [ + "20251206T0000", + "20251206T0100", + "20251206T0200", + "20251206T0300", + "20251206T0400", + ] + compiled_output_files = 
sorted(compiled_output_dir.glob("*.bufr")) + expected_output_file_names = sorted( + [ + f"geus_{timestamp_str}.bufr" + for timestamp_str in expected_output_timestamps + ] + ) + self.assertListEqual( + expected_output_file_names, [p.name for p in compiled_output_files] + ) + for file in compiled_output_files: + # The compiled bufr files should be empty + self.assertEqual(0, file.stat().st_size) + individual_output_dirs = sorted(individual_output_root.glob("*")) + self.assertListEqual( + expected_output_timestamps, [p.stem for p in individual_output_dirs] + ) + for dir in individual_output_dirs: + # There should be no bufr files in the individual directories + self.assertEqual(0, len(list(dir.glob("*.bufr")))) diff --git a/tests/unit/bufr_export/test_get_bufr.py b/tests/unit/bufr_export/test_get_bufr.py index 5095c2d8..83b650b3 100644 --- a/tests/unit/bufr_export/test_get_bufr.py +++ b/tests/unit/bufr_export/test_get_bufr.py @@ -1,30 +1,23 @@ import datetime import logging -import pickle +import random import sys -import unittest -import uuid -from io import StringIO +import tempfile +from io import BufferedWriter from pathlib import Path -from tempfile import TemporaryDirectory from unittest import TestCase, mock -import numpy as np import pandas as pd from pypromice.postprocess.bufr_utilities import BUFRVariables from pypromice.postprocess.get_bufr import ( - process_station, - StationConfiguration, - get_bufr, + get_station_positions, get_bufr_variables, - write_station_configuration_mapping, - load_station_configuration_mapping, -) -from tests.unit.bufr_export.test_get_bufr_integration import ( - DATA_DIR, - run_get_bufr, + REQUIRED_KEYS, + get_bufr, ) +from pypromice.station_configuration import StationConfiguration +from tests.utilities import get_station_configuration logging.basicConfig( stream=sys.stdout, @@ -32,175 +25,166 @@ level=logging.WARNING, ) -MOCK_BASE_STR = "pypromice.postprocess.get_bufr.{}" - -class StationConfigurationTestCase(TestCase): - def 
test_read(self): - source_lines = [ - "[UPE_L]\n", - 'stid = "UPE_L"\n', - 'station_site = "UPE_L"\n', - 'project = "Promice"\n', - 'station_type = "mobile"\n', - 'wmo_id = "04423"\n', - "barometer_from_gps = -0.25\n", - "anemometer_from_sonic_ranger = 0.4\n", - "temperature_from_sonic_ranger = 0.0\n", - "height_of_gps_from_station_ground = 0.9\n", - "sonic_ranger_from_gps = 1.3\n", - "export_bufr = true\n", - "skipped_variables = []\n", - "positions_update_timestamp_only = false\n", - ] - source_io = StringIO() - source_io.writelines(source_lines) - source_io.seek(0) - expected_configuration_mapping = { - "UPE_L": StationConfiguration( - stid="UPE_L", - station_site="UPE_L", - project="Promice", - station_type="mobile", - wmo_id="04423", - barometer_from_gps=-0.25, - anemometer_from_sonic_ranger=0.4, - temperature_from_sonic_ranger=0.0, - height_of_gps_from_station_ground=0.9, - sonic_ranger_from_gps=1.3, - export_bufr=True, - comment=None, - skipped_variables=[], - positions_update_timestamp_only=False, - ) - } +class GetStationPositionsTestCase(TestCase): + def test_all_data_available(self): + """ + Test the get_station_positions function + """ + timestamp = pd.to_datetime("2024-03-01 00:00:00") + latest_data = pd.Series( + name=timestamp, + data={ + "gps_lat_fit": 78.52901, + "gps_lon_fit": -56.8450358, + "gps_alt_fit": 1968.561, + }, + ) - station_configuration_mapping = load_station_configuration_mapping(source_io) + positions = get_station_positions(latest_data=latest_data) self.assertDictEqual( - expected_configuration_mapping, - station_configuration_mapping, + positions, + dict( + timestamp=timestamp, + lat=78.52901, + lon=-56.8450358, + alt=1968.561, + ), ) - def test_write_read(self): - station_config = StationConfiguration( - stid="UPE_L", - station_site="UPE_L", - project="Promice", - station_type="mobile", - wmo_id="04423", - barometer_from_gps=-0.25, - anemometer_from_sonic_ranger=0.4, - temperature_from_sonic_ranger=0.0, - 
height_of_gps_from_station_ground=0.9, - sonic_ranger_from_gps=1.3, - export_bufr=True, - comment=None, - skipped_variables=[], - positions_update_timestamp_only=False, + def test_missing_data(self): + """ + Test the get_station_positions function with missing data + """ + timestamp = pd.to_datetime("2024-03-01 00:00:00") + latest_data = pd.Series( + name=timestamp, + data={ + "gps_lat_fit": 78.52901, + "gps_lon_fit": -56.8450358, + }, ) - config_mapping = {station_config.stid: station_config} - source_io = StringIO() - write_station_configuration_mapping(config_mapping, source_io) - source_io.seek(0) - read_mapping = load_station_configuration_mapping(source_io) + with self.assertRaises(KeyError): + get_station_positions(latest_data=latest_data) - self.assertDictEqual( - config_mapping, - read_mapping, + def test_nan_latitude(self): + """ + get_station_positions shall discard all position data if latitude is NaN + """ + timestamp = pd.to_datetime("2024-03-01 00:00:00") + latest_data = pd.Series( + name=timestamp, + data={ + "gps_lat_fit": float("nan"), + "gps_lon_fit": -56.8450358, + "gps_alt_fit": 1968.561, + }, ) - def test_write_read_minimal_config(self): - station_config = StationConfiguration(stid="UPE_L") - config_mapping = {station_config.stid: station_config} - source_io = StringIO() - - write_station_configuration_mapping(config_mapping, source_io) - source_io.seek(0) - read_mapping = load_station_configuration_mapping(source_io) + positions = get_station_positions(latest_data=latest_data) - self.maxDiff = None - self.assertEqual( - station_config, - config_mapping[station_config.stid], - ) self.assertDictEqual( - config_mapping, - read_mapping, + positions, + dict( + timestamp=timestamp, + lat=None, + lon=None, + alt=None, + ), ) - def test_write_read_empty_mapping(self): - config_mapping = {} - source_io = StringIO() + def test_nan_altitude(self): + """ + get_station_positions shall discard all position data if altitude is NaN + """ + timestamp = 
pd.to_datetime("2024-03-01 00:00:00") + latest_data = pd.Series( + name=timestamp, + data={ + "gps_lat_fit": 78.52901, + "gps_lon_fit": -56.8450358, + "gps_alt_fit": float("nan"), + }, + ) - write_station_configuration_mapping(config_mapping, source_io) - source_io.seek(0) - read_mapping = load_station_configuration_mapping(source_io) + positions = get_station_positions(latest_data=latest_data) self.assertDictEqual( - config_mapping, - read_mapping, + positions, + dict( + timestamp=timestamp, + lat=None, + lon=None, + alt=None, + ), ) -class BufrVariablesTestCase(TestCase): +class TestGetBufrVariablesTestCase(TestCase): def test_bufr_variables_gcnet(self): - self._test_bufr_variables( + config = StationConfiguration( stid="DY2", + station_site="DY2", + project="GC-Net", wmo_id="04464", station_type="mobile", - relativeHumidity=69.0, - airTemperature=256.0, - pressure=77300.0, - windDirection=149.0, - windSpeed=14.9, - latitude=66.482488, - longitude=-46.294266, - heightOfStationGroundAboveMeanSeaLevel=2123.2, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.6, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, - heightOfBarometerAboveMeanSeaLevel=2125.25, + barometer_from_gps=0.55, + anemometer_from_sonic_ranger=0.4, + temperature_from_sonic_ranger=0.4, + height_of_gps_from_station_ground=1.5, + sonic_ranger_from_gps=0.15, + export_bufr=True, ) - - def test_bufr_variables_promice_v2(self): - self._test_bufr_variables( - stid="NUK_L", - wmo_id="04403", - station_type="mobile", - relativeHumidity=69.0, - airTemperature=256.0, - pressure=77300.0, - windDirection=149.0, - windSpeed=14.9, - latitude=66.482488, - longitude=-46.294266, - heightOfStationGroundAboveMeanSeaLevel=2123.8, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.2, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, - heightOfBarometerAboveMeanSeaLevel=2124.45, + timestamp = pd.to_datetime("2024-03-01 00:00:00") + data = pd.Series( + name=timestamp, 
+ data={ + "t_i": -12.5, + "p_i": 3.1, + "rh_i": 0.5, + "wspd_i": 2.5, + "wdir_i": 182.1, + "z_boom_u_smooth": 1.6, + "gps_lat_fit": 78.52901, + "gps_lon_fit": -56.8450358, + "gps_alt_fit": 1968.561, + }, + ) + expected_bufr_variables = BUFRVariables( + wmo_id=config.wmo_id, + station_type=config.station_type, + timestamp=timestamp, + relativeHumidity=data.rh_i, + airTemperature=data.t_i + 273.15, + pressure=100310, + windDirection=data.wdir_i, + windSpeed=data.wspd_i, + latitude=data.gps_lat_fit, + longitude=data.gps_lon_fit, + heightOfStationGroundAboveMeanSeaLevel=data.gps_alt_fit + - config.height_of_gps_from_station_ground, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=data.z_boom_u_smooth + + config.temperature_from_sonic_ranger, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=data.z_boom_u_smooth + + config.anemometer_from_sonic_ranger, + heightOfBarometerAboveMeanSeaLevel=data.gps_alt_fit + + config.barometer_from_gps, + ) + + bufr_variables = get_bufr_variables( + data=data, + station_configuration=config, ) - def test_bufr_variables_promice_v3(self): - self._test_bufr_variables( - stid="QAS_Mv3", - wmo_id="04441", - station_type="mobile", - relativeHumidity=69.0, - airTemperature=256.0, - pressure=77300.0, - windDirection=149.0, - windSpeed=14.9, - latitude=66.482488, - longitude=-46.294266, - heightOfStationGroundAboveMeanSeaLevel=2123.8, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.2, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.6, - heightOfBarometerAboveMeanSeaLevel=2126, + pd.testing.assert_series_equal( + bufr_variables.as_series(), + expected_bufr_variables.as_series(), ) - def test_none_values_in_config(self): - timestamp = datetime.datetime.now() + def test_bufr_variables_static_gps_elevation(self): + timestamp = pd.to_datetime("2024-03-01 00:00:00") data = pd.Series( data=dict( rh_i=0.93, @@ -209,739 +193,668 @@ def test_none_values_in_config(self): p_i=993, wdir_i=32.1, wspd_i=5.3, - 
gps_lat_fit=66.0, gps_lon_fit=-46.0, - gps_alt_fit=1094, + gps_lat_fit=66.0, + # This is an erroneous value that should be overridden by the static value + gps_alt_fit=142.1, z_boom_u_smooth=2.1, ), name=timestamp, ) - station_config = StationConfiguration( + config = StationConfiguration( stid="A_STID", station_type="land", wmo_id="4201", - barometer_from_gps=0.2, - anemometer_from_sonic_ranger=0.1, - temperature_from_sonic_ranger=1.3, - height_of_gps_from_station_ground=2.1, + export_bufr=True, + barometer_from_gps=1.3, + height_of_gps_from_station_ground=0.9, + static_height_of_gps_from_mean_sea_level=17.5, + anemometer_from_sonic_ranger=None, + temperature_from_sonic_ranger=None, + sonic_ranger_from_gps=None, + ) + # The elevations should be determined from the static variable + expected_station_ground_elevation = 17.5 - 0.9 + expected_barometer_elevation = 17.5 + 1.3 + expected_bufr_variables = BUFRVariables( + wmo_id=config.wmo_id, + station_type=config.station_type, + timestamp=timestamp, + relativeHumidity=1.0, + airTemperature=252.15, # Converted to kelvin + pressure=199300.0, + windDirection=32.0, + windSpeed=5.3, + latitude=66.0, + longitude=-46.0, + heightOfStationGroundAboveMeanSeaLevel=expected_station_ground_elevation, + heightOfBarometerAboveMeanSeaLevel=expected_barometer_elevation, + # The sensor heights are ignored since the necessary dimension values are missing + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=float("nan"), + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=float("nan"), + ) + + bufr_variables = get_bufr_variables( + data=data, + station_configuration=config, )
airTemperature=252.2, # Converted to kelvin - pressure=199300.0, - windDirection=32.0, - windSpeed=5.3, - latitude=66.0, - longitude=-46.0, - heightOfStationGroundAboveMeanSeaLevel=1091.9, - heightOfBarometerAboveMeanSeaLevel=1094.2, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=3.4, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=2.2, + def test_fails_on_missing_dimension_values(self): + """ + Test that get_bufr_variables raises an AttributeError if the data is missing + """ + timestamp = pd.to_datetime("2024-03-01 00:00:00") + data = pd.Series( + data=dict( + rh_i=0.93, + t_i=-21, + name="", + p_i=993, + wdir_i=32.1, + wspd_i=5.3, + gps_lat_fit=66.0, + gps_lon_fit=-46.0, + gps_alt_fit=1094, + z_boom_u_smooth=2.1, ), - output, + name=timestamp, + ) + config = StationConfiguration( + stid="A_STID", + station_type="land", + wmo_id="4201", + export_bufr=True, ) - @mock.patch("pypromice.postprocess.get_bufr.write_bufr_message") - def _test_bufr_variables( - self, - write_bufr_message_mock: mock.MagicMock, - stid: str, - wmo_id: str, - relativeHumidity: float, - airTemperature: float, - pressure: float, - windDirection: float, - windSpeed: float, - latitude: float, - longitude: float, - heightOfStationGroundAboveMeanSeaLevel: float, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH: float, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD: float, - heightOfBarometerAboveMeanSeaLevel: float, - station_type: str, - ): - l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") - l3_src = pd.read_csv(l3_src_filepath) - now_timestamp = datetime.datetime(2023, 12, 8) - - timestamps = {} - run_get_bufr( - l3_data=l3_src, - now_timestamp=now_timestamp, - latest_timestamps=timestamps, - stid=stid, - store_positions=True, - time_limit="91d", + with self.assertRaises(AttributeError): + get_bufr_variables( + data, + station_configuration=config, + ) + + def test_nan_location_yields_none(self): + config = 
get_station_configuration(export_bufr=True) + data = pd.Series( + name=pd.to_datetime("2024-03-01 00:00:00"), + data={ + "t_i": -12.5, + "p_i": 1003.1, + "rh_i": 0.5, + "wspd_i": 2.5, + "wdir_i": 182.1, + "z_boom_u_smooth": 1.6, + "gps_lat_fit": 78.52901, + "gps_lon_fit": float("nan"), + "gps_alt_fit": 1968.561, + }, ) - write_bufr_message_mock.assert_called_once() - call = write_bufr_message_mock.call_args_list[0] - expected_time = datetime.datetime(year=2023, month=12, day=7, hour=23) - expected_bufr_variables = BUFRVariables( - wmo_id=wmo_id, - station_type=station_type, - timestamp=expected_time, - relativeHumidity=relativeHumidity, - airTemperature=airTemperature, - pressure=pressure, - windDirection=windDirection, - windSpeed=windSpeed, - latitude=latitude, - longitude=longitude, - heightOfStationGroundAboveMeanSeaLevel=heightOfStationGroundAboveMeanSeaLevel, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH, - heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD, - heightOfBarometerAboveMeanSeaLevel=heightOfBarometerAboveMeanSeaLevel, + return_value = get_bufr_variables( + data, + station_configuration=config, ) - pd.testing.assert_series_equal( - pd.Series(expected_bufr_variables), - pd.Series(call.kwargs["variables"]), + + self.assertIsNone(return_value) + + def test_nan_t_i_and_p_i_yields_none(self): + config = get_station_configuration(export_bufr=True) + data = pd.Series( + name=pd.to_datetime("2024-03-01 00:00:00"), + data={ + "t_i": float("nan"), + "p_i": float("nan"), + "rh_i": 0.5, + "wspd_i": 2.5, + "wdir_i": 182.1, + "z_boom_u_smooth": 1.6, + "gps_lat_fit": 78.52901, + "gps_lon_fit": -56.8450358, + "gps_alt_fit": 1968.561, + }, ) + return_value = get_bufr_variables( + data, + station_configuration=config, + ) + + self.assertIsNone(return_value) + + def test_missing_keys(self): + config = 
get_station_configuration(export_bufr=True) + for key in REQUIRED_KEYS: + data = pd.Series( + name=pd.to_datetime("2024-03-01 00:00:00"), + data={ + "t_i": -12.5, + "p_i": 1003.1, + "rh_i": 0.5, + "wspd_i": 2.5, + "wdir_i": 182.1, + "z_boom_u_smooth": 1.6, + "gps_lat_fit": 78.52901, + "gps_lon_fit": -56.8450358, + "gps_alt_fit": 1968.561, + }, + ) + del data[key] + + with self.assertRaises(ValueError, msg=f"Key: {key}"): + get_bufr_variables( + data=data, + station_configuration=config, + ) + + +MOCK_BASE_STR = "pypromice.postprocess.get_bufr.{}" + +@mock.patch(MOCK_BASE_STR.format("get_station_positions")) @mock.patch(MOCK_BASE_STR.format("get_bufr_variables")) @mock.patch(MOCK_BASE_STR.format("write_bufr_message")) @mock.patch(MOCK_BASE_STR.format("get_latest_data")) @mock.patch(MOCK_BASE_STR.format("load_data")) -class ProcessStationTestCase(unittest.TestCase): - def setUp(self) -> None: - self.file_path = mock.create_autospec(Path) - self.output_path = mock.create_autospec(Path) - self.now_timestamp = mock.create_autospec(datetime.datetime) - self.time_limit = mock.create_autospec(str) - self.stid = str(uuid.uuid4()) - self.station_configuration = mock.MagicMock() - self.earliest_timestamp = mock.MagicMock() - - def test_process_station_no_new_data( +class TestGetBufrTestCase(TestCase): + def test_has_new_data( self, load_data_mock: mock.MagicMock, get_latest_data_mock: mock.MagicMock, write_bufr_message_mock: mock.MagicMock, get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, ): - self.earliest_timestamp = datetime.datetime(2023, 10, 3) - latest_data_datetime = datetime.datetime(2023, 10, 3) - get_latest_data_mock.return_value = pd.Series( - data={ - "p_i": -227.1, - "t_i": -16.7, - "rh_i": 84.6, - "wspd_i": 14.83, - "wdir_i": 142.2, - "gps_lat": 66.482469, - "gps_lon": -46.294232, - "gps_alt": 2116.0, - "z_boom_u": 4.1901, - "gps_lat_fit": 66.482474, - "gps_lon_fit": -46.294261, - "gps_alt_fit": 2119.6, - "z_boom_u_smooth": 
4.2, - }, - name=latest_data_datetime, - ) - expected_output = { - "timestamp": latest_data_datetime, - "lat": 66.482474, - "lon": -46.294261, - "alt": 2119.6, - } - - output = process_station( - file_path=self.file_path, - output_path=self.output_path, - now_timestamp=self.now_timestamp, - latest_timestamp=self.earliest_timestamp, - time_limit=self.time_limit, - stid=self.stid, - station_configuration=self.station_configuration, - ) + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + output_path = root_path / "bufr_out" + station_config = get_station_configuration( + export_bufr=True, + positions_update_timestamp_only=False, + ) + input_file = root_path / "input" / f"{station_config.stid}_hour.csv" + positions_filepath = root_path / "positions.csv" + timestamps_pickle_filepath = root_path / "timestamps.pickle" + now_timestamp = pd.to_datetime("2024-03-01 00:12:00") + latest_timestamp = pd.to_datetime("2024-03-01 00:01:00") + get_latest_data_mock.return_value.name = latest_timestamp + get_station_positions_mock.return_value = dict( + timestamp=latest_timestamp, + lat=78.52901, + lon=-56.8450358, + alt=1968.561, + ) - self.assertDictEqual( - output, - expected_output, - ) - get_bufr_variables_mock.assert_not_called() - write_bufr_message_mock.assert_not_called() + get_bufr( + input_files=[input_file], + station_configuration_mapping={station_config.stid: station_config}, + break_on_error=True, + bufr_out=output_path, + target_timestamp=now_timestamp, + positions_filepath=positions_filepath, + store_positions=True, + timestamps_pickle_filepath=timestamps_pickle_filepath, + ) - def test_process_station_has_new_data( + load_data_mock.assert_called_once_with(input_file, now_timestamp) + get_latest_data_mock.assert_called_once_with( + load_data_mock.return_value, + lin_reg_time_limit="91d", + vars_to_skip=station_config.skipped_variables, + ) + get_station_positions_mock.assert_called_once_with( + get_latest_data_mock.return_value + ) + 
get_bufr_variables_mock.assert_called_once_with( + get_latest_data_mock.return_value, + station_config, + ) + write_bufr_message_mock.assert_called_once_with( + get_bufr_variables_mock.return_value, + mock.ANY, + ) + # Write bufr is invoked with an open file object. It is therefore necessary to check the path of the file + expected_output_file_path = output_path / f"{station_config.stid}.bufr" + output_file = write_bufr_message_mock.call_args[0][1] + self.assertIsInstance(output_file, BufferedWriter) + self.assertEqual(Path(output_file.name), expected_output_file_path) + written_positions = pd.read_csv( + positions_filepath, index_col=0, parse_dates=["timestamp"] + ) + self.assertDictEqual( + get_station_positions_mock.return_value, + dict(written_positions.loc[station_config.stid]), + ) + self.assertTrue(timestamps_pickle_filepath.exists()) + timestamps = pd.read_pickle(timestamps_pickle_filepath) + self.assertDictEqual( + timestamps, + {station_config.stid: latest_timestamp}, + ) + + def test_no_new_data( self, load_data_mock: mock.MagicMock, get_latest_data_mock: mock.MagicMock, write_bufr_message_mock: mock.MagicMock, get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, ): - self.earliest_timestamp = datetime.datetime(2023, 10, 2) - latest_data_datetime = datetime.datetime(2023, 10, 3) - get_latest_data_mock.return_value = pd.Series( - data={ - "p_i": -227.1, - "t_i": -16.7, - "rh_i": 84.6, - "wspd_i": 14.83, - "wdir_i": 142.2, - "gps_lat": 66.482469, - "gps_lon": -46.294232, - "gps_alt": 2116.0, - "z_boom_u": 4.1901, - "gps_lat_fit": 66.482474, - "gps_lon_fit": -46.294261, - "gps_alt_fit": 2119.6, - "z_boom_u_smooth": 4.2, - }, - name=latest_data_datetime, - ) - expected_output = { - "timestamp": latest_data_datetime, - "lat": 66.482474, - "lon": -46.294261, - "alt": 2119.6, - } - - output = process_station( - file_path=self.file_path, - output_path=self.output_path, - now_timestamp=self.now_timestamp, - 
latest_timestamp=self.earliest_timestamp, - time_limit=self.time_limit, - stid=self.stid, - station_configuration=self.station_configuration, - ) + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + output_path = root_path / "bur_out" + station_config = get_station_configuration( + export_bufr=True, + positions_update_timestamp_only=False, + ) + input_file = root_path / "input" / f"{station_config.stid}_hour.csv" + positions_filepath = root_path / "positions.csv" + now_timestamp = pd.to_datetime("2024-03-01 00:12:00") + # The latest data is two month old + latest_timestamp = pd.to_datetime("2024-01-01 00:12:00") + get_latest_data_mock.return_value.name = latest_timestamp + get_station_positions_mock.return_value = dict( + timestamp=latest_timestamp, + lat=78.52901, + lon=-56.8450358, + alt=1968.561, + ) - self.assertDictEqual( - output, - expected_output, - ) - get_bufr_variables_mock.assert_called_once_with( - data=get_latest_data_mock.return_value, - station_configuration=self.station_configuration, - ) - write_bufr_message_mock.assert_called_once_with( - variables=get_bufr_variables_mock.return_value, - file=self.output_path.open().__enter__(), - ) + get_bufr( + input_files=[input_file], + station_configuration_mapping={station_config.stid: station_config}, + break_on_error=True, + bufr_out=output_path, + target_timestamp=now_timestamp, + positions_filepath=positions_filepath, + store_positions=True, + timestamps_pickle_filepath=None, + time_window_length=pd.to_timedelta("2d"), + ) + + get_latest_data_mock.assert_called_once_with( + load_data_mock.return_value, + lin_reg_time_limit="91d", + vars_to_skip=station_config.skipped_variables, + ) + get_station_positions_mock.assert_called_once_with( + get_latest_data_mock.return_value + ) + get_bufr_variables_mock.assert_not_called() + write_bufr_message_mock.assert_not_called() + written_positions = pd.read_csv( + positions_filepath, index_col=0, parse_dates=["timestamp"] + ) + 
self.assertDictEqual( + get_station_positions_mock.return_value, + dict(written_positions.loc[station_config.stid]), + ) - def test_min_data_wx_failed( + def test_position_seed( self, load_data_mock: mock.MagicMock, get_latest_data_mock: mock.MagicMock, write_bufr_message_mock: mock.MagicMock, get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, ): - self.earliest_timestamp = datetime.datetime(2023, 10, 2) - latest_data_datetime = datetime.datetime(2023, 10, 3) - get_latest_data_mock.return_value = pd.Series( - data={ - "p_i": np.nan, - "t_i": np.nan, - "rh_i": 84.6, - "wspd_i": 14.83, - "wdir_i": 142.2, - "gps_lat": 66.482469, - "gps_lon": -46.294232, - "gps_alt": 2116.0, - "z_boom_u": 4.1901, - "gps_lat_fit": 66.482474, - "gps_lon_fit": -46.294261, - "gps_alt_fit": 2119.6, - "z_boom_u_smooth": 4.2, - }, - name=latest_data_datetime, - ) - expected_output = { - "timestamp": latest_data_datetime, - "lat": 66.482474, - "lon": -46.294261, - "alt": 2119.6, - } - - output = process_station( - file_path=self.file_path, - output_path=self.output_path, - now_timestamp=self.now_timestamp, - latest_timestamp=self.earliest_timestamp, - time_limit=self.time_limit, - stid=self.stid, - station_configuration=self.station_configuration, - ) + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + positions_filepath = root_path / "positions.csv" + positions_seed_path = root_path / "positions_seed.csv" + positions_seed = pd.DataFrame( + columns=["stid", "timestamp", "lat", "lon", "alt"], + data=[ + ["STATION_A", datetime.datetime(2021, 10, 2), 65.0, -40.0, 800], + ["STATION_B", datetime.datetime(2023, 11, 12), 66.0, -50.0, 1100], + ], + ).set_index("stid") + positions_seed.to_csv(positions_seed_path, index=True) + + get_bufr( + input_files=[], + station_configuration_mapping=dict(), + break_on_error=True, + bufr_out=mock.create_autospec(Path), + target_timestamp=mock.create_autospec(datetime.timedelta), + 
positions_filepath=positions_filepath, + positions_seed_path=positions_seed_path, + store_positions=True, + timestamps_pickle_filepath=None, + time_window_length=pd.to_timedelta("2d"), + ) - # The BUFR export step shall be skipped - get_bufr_variables_mock.assert_not_called() - write_bufr_message_mock.assert_not_called() - self.assertDictEqual( - output, - expected_output, - ) + written_positions = pd.read_csv( + positions_filepath, index_col="stid", parse_dates=["timestamp"] + ) + pd.testing.assert_frame_equal( + positions_seed, + written_positions, + ) - def test_min_data_pos_failed( + def test_no_input_paths( self, load_data_mock: mock.MagicMock, get_latest_data_mock: mock.MagicMock, write_bufr_message_mock: mock.MagicMock, get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, ): - self.earliest_timestamp = datetime.datetime(2023, 10, 2) - latest_data_datetime = datetime.datetime(2023, 10, 3) - get_latest_data_mock.return_value = pd.Series( - data={ - "p_i": -227.1, - "t_i": -16.7, - "rh_i": 84.6, - "wspd_i": 14.83, - "wdir_i": 142.2, - "gps_lat": 66.482469, - "gps_lon": -46.294232, - "gps_alt": 2116.0, - "z_boom_u": 4.1901, - "gps_lat_fit": 66.482474, - "gps_lon_fit": -46.294261, - "gps_alt_fit": np.nan, - "z_boom_u_smooth": 4.2, - }, - name=latest_data_datetime, - ) - expected_output = { - "timestamp": latest_data_datetime, - "lat": None, - "lon": None, - "alt": None, - } - - output = process_station( - file_path=self.file_path, - output_path=self.output_path, - now_timestamp=self.now_timestamp, - latest_timestamp=self.earliest_timestamp, - time_limit=self.time_limit, - stid=self.stid, - station_configuration=self.station_configuration, - ) + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + positions_filepath = root_path / "positions.csv" + get_bufr( + input_files=[], + station_configuration_mapping=dict(), + break_on_error=True, + bufr_out=mock.create_autospec(Path), + 
target_timestamp=mock.create_autospec(datetime.timedelta), + positions_filepath=positions_filepath, + store_positions=True, + timestamps_pickle_filepath=None, + time_window_length=pd.to_timedelta("2d"), + ) - # The BUFR export step shall be skipped - get_bufr_variables_mock.assert_not_called() - write_bufr_message_mock.assert_not_called() - self.assertDictEqual( - output, - expected_output, - ) + load_data_mock.assert_not_called() + get_latest_data_mock.assert_not_called() + get_station_positions_mock.assert_not_called() + get_bufr_variables_mock.assert_not_called() + write_bufr_message_mock.assert_not_called() + # The positions file should be created, but empty + self.assertTrue(positions_filepath.exists()) + written_positions = pd.read_csv( + positions_filepath, index_col=0, parse_dates=["timestamp"] + ) + self.assertEqual(0, len(written_positions)) - def test_no_valid_data( + def test_get_latest_data_fails( self, load_data_mock: mock.MagicMock, get_latest_data_mock: mock.MagicMock, write_bufr_message_mock: mock.MagicMock, get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, ): - get_latest_data_mock.return_value = None - - output = process_station( - file_path=self.file_path, - output_path=self.output_path, - now_timestamp=self.now_timestamp, - latest_timestamp=self.earliest_timestamp, - time_limit=self.time_limit, - stid=self.stid, - station_configuration=self.station_configuration, - ) + """ + get_latest_data returns None when there are no valid data available for the station + """ + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + positions_filepath = root_path / "positions.csv" + station_config = get_station_configuration( + export_bufr=True, + positions_update_timestamp_only=False, + ) + input_file = root_path / "input" / f"{station_config.stid}_hour.csv" + target_timestamp = mock.create_autospec(datetime.timedelta) + get_latest_data_mock.return_value = None + get_bufr( + 
input_files=[input_file], + station_configuration_mapping={station_config.stid: station_config}, + break_on_error=True, + bufr_out=mock.create_autospec(Path), + target_timestamp=target_timestamp, + positions_filepath=positions_filepath, + store_positions=True, + timestamps_pickle_filepath=None, + time_window_length=pd.to_timedelta("2d"), + ) - load_data_mock.assert_called_once() - get_latest_data_mock.assert_called_once() - write_bufr_message_mock.assert_not_called() - get_bufr_variables_mock.assert_not_called() - self.assertIsNone(output) + load_data_mock.assert_called_once_with(input_file, target_timestamp) + get_latest_data_mock.assert_called_once_with( + load_data_mock.return_value, + lin_reg_time_limit="91d", + vars_to_skip=station_config.skipped_variables, + ) + get_station_positions_mock.assert_not_called() + get_bufr_variables_mock.assert_not_called() + write_bufr_message_mock.assert_not_called() + self.assertTrue(positions_filepath.exists()) - def test_skipped_variables( + def test_already_existing_in_latest_timestamps( self, load_data_mock: mock.MagicMock, get_latest_data_mock: mock.MagicMock, write_bufr_message_mock: mock.MagicMock, get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, ): - self.earliest_timestamp = datetime.datetime(2023, 10, 2) - latest_data_datetime = datetime.datetime(2023, 10, 3) - original_p_i = 42.0 - get_latest_data_mock.return_value = pd.Series( - data={ - "p_i": original_p_i, - "t_i": -16.7, - "rh_i": 84.6, - "wspd_i": 14.83, - "wdir_i": 142.2, - "gps_lat": 66.482469, - "gps_lon": -46.294232, - "gps_alt": 2116.0, - "z_boom_u": 4.1901, - "gps_lat_fit": 66.482474, - "gps_lon_fit": -46.294261, - "gps_alt_fit": 2119.6, - "z_boom_u_smooth": 4.2, - }, - name=latest_data_datetime, - ) - self.station_configuration = StationConfiguration( - stid="A_STID", - station_site="A_STATION_SITE", - station_type="mobile", - wmo_id="04242", - skipped_variables=["p_i"], - height_of_gps_from_station_ground=1.4, - 
barometer_from_gps=0.1, - anemometer_from_sonic_ranger=0.1, - temperature_from_sonic_ranger=0.2, - export_bufr=True, - ) - expected_output = { - "timestamp": latest_data_datetime, - "lat": 66.482474, - "lon": -46.294261, - "alt": 2119.6, - } - self.assertEqual( - original_p_i, - get_latest_data_mock.return_value["p_i"], - ) - - output = process_station( - file_path=self.file_path, - output_path=self.output_path, - now_timestamp=self.now_timestamp, - latest_timestamp=self.earliest_timestamp, - time_limit=self.time_limit, - stid=self.stid, - station_configuration=self.station_configuration, - ) - - self.assertTrue( - np.isnan(get_latest_data_mock.return_value["p_i"]), - "p_i shall be set to nan since it is in skipped_variables", - ) - self.assertDictEqual( - output, - expected_output, - ) - get_bufr_variables_mock.assert_called_once_with( - data=get_latest_data_mock.return_value, - station_configuration=self.station_configuration, - ) - write_bufr_message_mock.assert_called_once_with( - variables=get_bufr_variables_mock.return_value, - file=self.output_path.open().__enter__(), - ) - - -class GetBufrTestCase(unittest.TestCase): - def setUp(self) -> None: - self.temporary_root = TemporaryDirectory() - self.root_path = Path(self.temporary_root.name) - self.l3_data_root = self.root_path / "l3" - self.l3_data_root.mkdir() - self.bufr_root = self.root_path / "bufr" - self.bufr_root.mkdir() + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + output_path = root_path / "bufr_out" + positions_filepath = root_path / "positions.csv" + station_config = get_station_configuration( + export_bufr=True, + positions_update_timestamp_only=False, + ) + now_timestamp = pd.to_datetime("2024-03-01 00:12:00") + latest_timestamp = pd.to_datetime("2024-03-01 00:01:00") + input_file = root_path / "input" / f"{station_config.stid}_hour.csv" + timestamps_pickle_filepath = root_path / "timestamps.pickle" + latest_timestamps = {station_config.stid: latest_timestamp} + 
with timestamps_pickle_filepath.open("wb") as f: + pd.to_pickle(latest_timestamps, f) + get_latest_data_mock.return_value.name = latest_timestamp + get_station_positions_mock.return_value = dict( + timestamp=latest_timestamp, + lat=78.52901, + lon=-56.8450358, + alt=1968.561, + ) - self.positions_file_path = self.root_path / "positions.csv" - self.positions_seed_path = self.root_path / "positions_seed.csv" - self.timestamps_pickle_filepath = self.root_path / "latest_timestamps.pickle" - self.station_configuration_path = self.root_path / "station_configuration.toml" + get_bufr( + input_files=[input_file], + station_configuration_mapping={station_config.stid: station_config}, + break_on_error=True, + bufr_out=output_path, + target_timestamp=now_timestamp, + positions_filepath=positions_filepath, + store_positions=True, + timestamps_pickle_filepath=timestamps_pickle_filepath, + ) - def tearDown(self) -> None: - self.temporary_root.cleanup() + get_station_positions_mock.assert_called_once() + # The BUFR export should be skipped since the latest timestamp is already in the timestamps + get_bufr_variables_mock.assert_not_called() + write_bufr_message_mock.assert_not_called() + self.assertTrue(positions_filepath.exists()) - @mock.patch(MOCK_BASE_STR.format("process_station")) - def test_process_station_raises_exception( - self, process_station_mock: mock.MagicMock + def test_no_station_configuration( + self, + load_data_mock: mock.MagicMock, + get_latest_data_mock: mock.MagicMock, + write_bufr_message_mock: mock.MagicMock, + get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, ): - """ - get_bufr should skip stations where process_station raises exception - """ - timestamps_pickle_filepath = self.root_path / "timestamps.pickle" - stid = "THE_STID_FOR_A_STATION" - input_file_path = self.root_path / f"{stid}_hourly.csv" - process_station_mock.side_effect = Exception("Test exception") - now_timestamp = datetime.datetime.now() - 
self.assertFalse(self.positions_file_path.exists()) - self.assertFalse(timestamps_pickle_filepath.exists()) - - get_bufr( - store_positions=True, - bufr_out=self.bufr_root, - input_files=[input_file_path], - positions_filepath=self.positions_file_path, - station_configuration_path=None, - timestamps_pickle_filepath=timestamps_pickle_filepath, - now_timestamp=now_timestamp, - ) - - self.assertTrue(self.positions_file_path.exists()) - self.assertTrue(timestamps_pickle_filepath.exists()) - - @mock.patch(MOCK_BASE_STR.format("process_station")) - def test_multiple_stations(self, process_station_mock: mock.MagicMock): - station_config01 = StationConfiguration(stid="station_01", export_bufr=True) - station_config02 = StationConfiguration(stid="station_02", export_bufr=True) - station_config03 = StationConfiguration(stid="station_03", export_bufr=False) - process_station_return_values = { - station_config01.stid: dict( - timestamp=datetime.datetime(2023, 2, 1, 10), lat=1, lon=3, alt=31 - ), - station_config02.stid: dict( - timestamp=datetime.datetime(2023, 2, 1, 10), lat=2, lon=3, alt=31 - ), - station_config03.stid: dict( - timestamp=datetime.datetime(2023, 2, 1, 10), lat=3, lon=3, alt=31 - ), - } - process_station_mock.side_effect = ( - lambda **kwargs: process_station_return_values[ - kwargs["station_configuration"].stid - ] - ) - input_files = [ - self.root_path / f"{station_config01.stid}_hourly.csv", - self.root_path / f"{station_config02.stid}_hourly.csv", - self.root_path / f"{station_config03.stid}_hourly.csv", - ] - station_configs = { - station_config01.stid: station_config01, - station_config02.stid: station_config02, - station_config03.stid: station_config03, - } - with self.station_configuration_path.open("w") as fp: - write_station_configuration_mapping( - station_configs, - fp, - ) - - get_bufr( - store_positions=True, - bufr_out=self.bufr_root, - input_files=input_files, - positions_filepath=self.positions_file_path, - 
station_configuration_path=self.station_configuration_path, - timestamps_pickle_filepath=self.timestamps_pickle_filepath, - positions_seed_path=None, - now_timestamp=datetime.datetime.now(), - ) - - self.assertEqual(3, process_station_mock.call_count) - read_positions = pd.read_csv( - self.positions_file_path, index_col=0, parse_dates=["timestamp"] - ).to_dict(orient="index") - self.assertDictEqual( - read_positions, - process_station_return_values, - ) - - def test_no_stations(self): - now_timestamp = datetime.datetime.now() - self.assertFalse(self.positions_file_path.exists()) - self.assertFalse(self.timestamps_pickle_filepath.exists()) - - get_bufr( - store_positions=True, - bufr_out=self.bufr_root, - input_files=(), - positions_filepath=self.positions_file_path, - station_configuration_path=None, - timestamps_pickle_filepath=self.timestamps_pickle_filepath, - now_timestamp=now_timestamp, - ) - - self.assertTrue(self.positions_file_path.exists()) - self.assertTrue(self.timestamps_pickle_filepath.exists()) - positions = pd.read_csv(self.positions_file_path) - pd.testing.assert_frame_equal( - positions, - pd.DataFrame(columns=["stid", "timestamp", "lat", "lon", "alt"], data=[]), - ) - with self.timestamps_pickle_filepath.open("br") as fp: - timestamps = pickle.load(fp) - self.assertDictEqual(dict(), timestamps) - - @mock.patch(MOCK_BASE_STR.format("process_station")) - def test_single_station(self, process_station_mock: mock.MagicMock): - now_timestamp = datetime.datetime.now() - stid = "THE_STID_FOR_A_STATION" - input_file_path = self.root_path / f"{stid}_hourly.csv" - station_configuration = StationConfiguration(stid=stid, export_bufr=True) - with self.station_configuration_path.open("w") as fp: - write_station_configuration_mapping( - dict(stid=station_configuration), - fp, - ) - expected_output_path = self.bufr_root / f"{stid}.bufr" - expected_latest_timestamp = now_timestamp - datetime.timedelta(days=2) - expected_station_configuration = StationConfiguration( 
- stid=stid, export_bufr=True - ) - - get_bufr( - store_positions=True, - bufr_out=self.bufr_root, - input_files=[input_file_path], - positions_filepath=self.positions_file_path, - station_configuration_path=self.station_configuration_path, - timestamps_pickle_filepath=self.timestamps_pickle_filepath, - positions_seed_path=None, - now_timestamp=now_timestamp, - ) + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + output_path = root_path / "bufr_out" + positions_filepath = root_path / "positions.csv" + station_id = "A_STID" + now_timestamp = pd.to_datetime("2024-03-01 00:12:00") + latest_timestamp = pd.to_datetime("2024-03-01 00:01:00") + input_file = root_path / "input" / f"{station_id}_hour.csv" + get_latest_data_mock.return_value.name = latest_timestamp + get_station_positions_mock.return_value = dict( + timestamp=latest_timestamp, + lat=78.52901, + lon=-56.8450358, + alt=1968.561, + ) - process_station_mock.assert_called_once_with( - file_path=input_file_path, - output_path=expected_output_path, - now_timestamp=now_timestamp, - latest_timestamp=expected_latest_timestamp, - time_limit="91d", - stid=stid, - station_configuration=expected_station_configuration, - ) + get_bufr( + input_files=[input_file], + station_configuration_mapping=dict(), + break_on_error=True, + bufr_out=output_path, + target_timestamp=now_timestamp, + positions_filepath=positions_filepath, + store_positions=True, + timestamps_pickle_filepath=None, + ) - @mock.patch(MOCK_BASE_STR.format("process_station")) - def test_station_without_configuration(self, process_station_mock: mock.MagicMock): - now_timestamp = datetime.datetime.now() - stid = "STATION_ID" - input_file_path = self.root_path / f"{stid}_hourly.csv" - expected_station_configuration = StationConfiguration(stid=stid) - expected_output_path = self.bufr_root / f"{stid}.bufr" - - get_bufr( - store_positions=True, - bufr_out=self.bufr_root, - input_files=[input_file_path], - 
positions_filepath=self.positions_file_path, - station_configuration_path=None, - timestamps_pickle_filepath=self.timestamps_pickle_filepath, - positions_seed_path=None, - now_timestamp=now_timestamp, - ) + get_station_positions_mock.assert_called_once() + get_bufr_variables_mock.assert_not_called() + write_bufr_message_mock.assert_not_called() + self.assertTrue(positions_filepath.exists()) - process_station_mock.assert_called_once_with( - file_path=input_file_path, - output_path=expected_output_path, - now_timestamp=now_timestamp, - latest_timestamp=now_timestamp - datetime.timedelta(days=2), - time_limit="91d", - stid=stid, - station_configuration=expected_station_configuration, - ) + def test_update_timestamps_only( + self, + load_data_mock: mock.MagicMock, + get_latest_data_mock: mock.MagicMock, + write_bufr_message_mock: mock.MagicMock, + get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, + ): + pass - @mock.patch(MOCK_BASE_STR.format("process_station")) - def test_latest_timestamp(self, process_station_mock: mock.MagicMock): - stid = "STATION_ID" - now_timestamp = datetime.datetime(2022, 1, 5, 10, 21) - latest_timestamp = datetime.datetime(2022, 1, 5, 10, 0) - # Save latest timestamp to pickle file - with self.timestamps_pickle_filepath.open("wb") as fp: - pickle.dump({stid: latest_timestamp}, fp) - input_file_path = self.root_path / f"{stid}_hourly.csv" - - get_bufr( - store_positions=True, - bufr_out=self.bufr_root, - input_files=[input_file_path], - positions_filepath=self.positions_file_path, - station_configuration_path=None, - timestamps_pickle_filepath=self.timestamps_pickle_filepath, - positions_seed_path=None, - now_timestamp=now_timestamp, - ) + def test_cleans_up_when_on_exception( + self, + load_data_mock: mock.MagicMock, + get_latest_data_mock: mock.MagicMock, + write_bufr_message_mock: mock.MagicMock, + get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, + ): + with 
tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + positions_filepath = root_path / "positions.csv" + station_config = get_station_configuration( + export_bufr=True, + positions_update_timestamp_only=False, + ) + input_file = root_path / "input" / f"{station_config.stid}_hour.csv" + target_timestamp = mock.create_autospec(datetime.timedelta) + get_latest_data_mock.side_effect = Exception("Test exception") + + get_bufr( + input_files=[input_file], + station_configuration_mapping={station_config.stid: station_config}, + break_on_error=False, + bufr_out=mock.create_autospec(Path), + target_timestamp=target_timestamp, + positions_filepath=positions_filepath, + store_positions=True, + timestamps_pickle_filepath=None, + time_window_length=pd.to_timedelta("2d"), + ) - process_station_mock.assert_called_once_with( - file_path=input_file_path, - output_path=self.bufr_root / f"{stid}.bufr", - now_timestamp=now_timestamp, - latest_timestamp=latest_timestamp, - time_limit="91d", - stid=stid, - station_configuration=StationConfiguration(stid=stid), - ) + load_data_mock.assert_called_once_with(input_file, target_timestamp) + get_latest_data_mock.assert_called_once_with( + load_data_mock.return_value, + lin_reg_time_limit="91d", + vars_to_skip=station_config.skipped_variables, + ) + get_station_positions_mock.assert_not_called() + get_bufr_variables_mock.assert_not_called() + write_bufr_message_mock.assert_not_called() + self.assertTrue(positions_filepath.exists()) - @mock.patch(MOCK_BASE_STR.format("process_station")) - def test_update_timestamp_only(self, process_station_mock: mock.MagicMock): - stid = "STATION_ID" - # Prepare station config - station_config = StationConfiguration( - stid=stid, positions_update_timestamp_only=True - ) - with self.station_configuration_path.open("w") as fp: - write_station_configuration_mapping( - config_mapping={station_config.stid: station_config}, - fp=fp, - ) - input_file_path = self.root_path / f"{stid}_hourly.csv" - 
seed_timestamp = datetime.datetime(2021, 10, 2, 10, 0) - now_timestamp = datetime.datetime(2023, 3, 3, 5, 0) - positions_seed = pd.DataFrame( - columns=["stid", "timestamp", "lat", "lon", "alt"], - data=[ - [stid, seed_timestamp, 65.0, -40.0, 800], - ], - ) - positions_seed.to_csv(self.positions_seed_path, index=False) - process_station_mock.return_value = { - "timestamp": now_timestamp, - # All position values should be ignored - "lat": None, - "lot": np.nan, - "alt": 2414.0, - } - # Only timestamp should be updated - expected_positions = positions_seed.copy() - expected_positions["timestamp"] = now_timestamp - - get_bufr( - store_positions=True, - bufr_out=self.bufr_root, - input_files=[input_file_path], - positions_filepath=self.positions_file_path, - station_configuration_path=self.station_configuration_path, - timestamps_pickle_filepath=self.timestamps_pickle_filepath, - positions_seed_path=self.positions_seed_path, - now_timestamp=now_timestamp, - ) + def test_multiple_stations( + self, + load_data_mock: mock.MagicMock, + get_latest_data_mock: mock.MagicMock, + write_bufr_message_mock: mock.MagicMock, + get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, + ): + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + positions_filepath = root_path / "positions.csv" + output_path = root_path / "bufr_out" + station_config1 = StationConfiguration(stid="station_01", export_bufr=True) + station_config2 = StationConfiguration(stid="station_02", export_bufr=True) + station_config3 = StationConfiguration(stid="station_03", export_bufr=False) + station_configs = [station_config1, station_config2, station_config3] + station_configuration_mapping = { + config.stid: config for config in station_configs + } + input_files = [ + root_path / "input" / f"{config.stid}_hour.csv" + for config in station_configs + ] + target_timestamp = pd.to_datetime("2024-03-01 00:12:00") + latest_timestamp = pd.to_datetime("2024-03-01 
00:01:00") + get_latest_data_mock.return_value.name = latest_timestamp + station_positions = [ + dict( + timestamp=latest_timestamp, + lat=random.random() * 180 - 90, + lon=random.random() * 360 - 180, + alt=2000 * random.random(), + ) + for _ in range(3) + ] + get_station_positions_mock.side_effect = station_positions + + get_bufr( + input_files=input_files, + station_configuration_mapping=station_configuration_mapping, + break_on_error=True, + bufr_out=output_path, + target_timestamp=target_timestamp, + positions_filepath=positions_filepath, + store_positions=True, + timestamps_pickle_filepath=None, + time_window_length=pd.to_timedelta("2d"), + ) - positions = pd.read_csv(self.positions_file_path, parse_dates=["timestamp"]) - self.assertEqual(1, len(positions)) - pd.testing.assert_series_equal( - positions.iloc[0], - expected_positions.iloc[0], - ) + self.assertTrue(positions_filepath.exists()) + self.assertEqual(3, get_station_positions_mock.call_count) + self.assertEqual(2, write_bufr_message_mock.call_count) + written_positions = pd.read_csv( + positions_filepath, index_col=0, parse_dates=["timestamp"] + ) + self.assertSetEqual( + set(written_positions.index), + {config.stid for config in station_configs}, + ) - def test_position_seed(self): - """ - There are no data files available. get_bufr should use the position_seed for output positions. 
- """ - positions_seed = pd.DataFrame( - columns=["stid", "timestamp", "lat", "lon", "alt"], - data=[ - ["STATION_A", datetime.datetime(2021, 10, 2), 65.0, -40.0, 800], - ["STATION_B", datetime.datetime(2023, 11, 12), 66.0, -50.0, 1100], - ], - ) - positions_seed.to_csv(self.positions_seed_path, index=False) - - get_bufr( - store_positions=True, - bufr_out=self.bufr_root, - input_files=(), - positions_filepath=self.positions_file_path, - station_configuration_path=None, - timestamps_pickle_filepath=self.timestamps_pickle_filepath, - positions_seed_path=self.positions_seed_path, - now_timestamp=datetime.datetime.now(), - ) + def test_station_without_configuration( + self, + load_data_mock: mock.MagicMock, + get_latest_data_mock: mock.MagicMock, + write_bufr_message_mock: mock.MagicMock, + get_bufr_variables_mock: mock.MagicMock, + get_station_positions_mock: mock.MagicMock, + ): + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) + positions_filepath = root_path / "positions.csv" + output_path = root_path / "bufr_out" + target_timestamp = datetime.datetime.now() + stid = "STATION_ID" + input_file_path = root_path / f"{stid}_hourly.csv" + get_station_positions_mock.return_value = dict( + timestamp=target_timestamp, + lat=78.52901, + lon=-56.8450358, + alt=1968.561, + ) - for p in self.root_path.glob("*"): - print(p) + get_bufr( + input_files=[input_file_path], + station_configuration_mapping={}, + break_on_error=True, + bufr_out=output_path, + target_timestamp=target_timestamp, + positions_filepath=positions_filepath, + store_positions=True, + timestamps_pickle_filepath=None, + time_window_length=pd.to_timedelta("2d"), + ) - positions = pd.read_csv(self.positions_file_path, parse_dates=["timestamp"]) - pd.testing.assert_frame_equal(positions, positions_seed) + get_latest_data_mock.assert_called_once() + get_station_positions_mock.assert_called_once() + get_bufr_variables_mock.assert_not_called() + written_positions = pd.read_csv( + 
positions_filepath, index_col=0, parse_dates=["timestamp"] + ) + self.assertDictEqual( + get_station_positions_mock.return_value, + dict(written_positions.loc[stid]), + ) diff --git a/tests/unit/bufr_export/test_get_bufr_integration.py b/tests/unit/bufr_export/test_get_bufr_integration.py index b60d235a..f03a60a5 100644 --- a/tests/unit/bufr_export/test_get_bufr_integration.py +++ b/tests/unit/bufr_export/test_get_bufr_integration.py @@ -3,10 +3,8 @@ """ import datetime -import hashlib import logging import pickle -import shutil import sys from pathlib import Path from tempfile import TemporaryDirectory @@ -17,10 +15,9 @@ import pandas as pd from pypromice.postprocess import get_bufr -from pypromice.postprocess.get_bufr import ( - DEFAULT_STATION_CONFIGURATION_PATH, +from pypromice.postprocess.bufr_utilities import read_bufr_message, BUFRVariables +from pypromice.station_configuration import ( StationConfiguration, - write_station_configuration_mapping, ) logging.basicConfig( @@ -36,12 +33,11 @@ def run_get_bufr( l3_data: pd.DataFrame, stid: str, latest_timestamps: Optional[Dict[str, datetime.datetime]], - station_configuration_mapping=None, + station_configuration_mapping: Dict[str, StationConfiguration], **get_bufr_kwargs, -) -> Dict[str, str]: +) -> Optional[BUFRVariables]: """ Run get_bufr using a temporary folder structure for input and output data - The output bufr files can be verified using the sha256 hashes. 
Parameters ---------- @@ -51,7 +47,8 @@ def run_get_bufr( Returns ------- - mapping from station id to sha256 hashes + Optional[BUFRVariables] + BUFR variables if the output file was generated successfully """ with TemporaryDirectory() as output_path: @@ -59,22 +56,9 @@ def run_get_bufr( bufr_out = output_path.joinpath("BUFR_out") timestamps_pickle_filepath = output_path.joinpath("latest_timestamps.pickle") positions_filepath = output_path.joinpath("AWS_latest_locations.csv") - station_configuration_path = output_path.joinpath("station_configuration.toml") l3_filepath = output_path.joinpath(f"{stid}_hour.csv") l3_data.to_csv(l3_filepath) - if station_configuration_mapping is None: - shutil.copy( - DEFAULT_STATION_CONFIGURATION_PATH, - station_configuration_path, - ) - else: - with station_configuration_path.open("w") as fp: - write_station_configuration_mapping( - station_configuration_mapping, - fp, - ) - if latest_timestamps is not None: with timestamps_pickle_filepath.open("wb") as fp: pickle.dump(latest_timestamps, fp) @@ -84,17 +68,16 @@ def run_get_bufr( input_files=[l3_filepath], timestamps_pickle_filepath=timestamps_pickle_filepath, positions_filepath=positions_filepath, - station_configuration_path=station_configuration_path, + station_configuration_mapping=station_configuration_mapping, **get_bufr_kwargs, ) - output_bufr_files = bufr_out.glob("*.bufr") - file_hashes = dict() - for p in output_bufr_files: - with p.open("rb") as fp: - file_hashes[p.stem] = hashlib.sha256(fp.read()).hexdigest() + output_path = bufr_out.joinpath(f"{stid}.bufr") + if not output_path.exists(): + return None - return file_hashes + with output_path.open("rb") as fp: + return read_bufr_message(fp) class PreRefactoringBufrTestCase(TestCase): @@ -104,10 +87,11 @@ def get_station_configuration_mapping( wmo_id: str, station_site: Optional[str] = None, station_type: str = "mobile", - barometer_from_gps: float = 0, + barometer_from_gps: float = 0.0, anemometer_from_sonic_ranger: float = 
0.4, temperature_from_sonic_ranger: float = -0.1, - height_of_gps_from_station_ground: float = 0, + height_of_gps_from_station_ground: float = 1.0, + sonic_ranger_from_gps: float = 1.5, skipped_variables=(), comment=None, export_bufr=True, @@ -122,6 +106,7 @@ def get_station_configuration_mapping( barometer_from_gps=barometer_from_gps, anemometer_from_sonic_ranger=anemometer_from_sonic_ranger, temperature_from_sonic_ranger=temperature_from_sonic_ranger, + sonic_ranger_from_gps=sonic_ranger_from_gps, height_of_gps_from_station_ground=height_of_gps_from_station_ground, skipped_variables=skipped_variables, comment=comment, @@ -136,23 +121,37 @@ def test_get_bufr_has_new_data(self): stid = "DY2" # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"DY2": datetime.datetime(2023, 12, 1)} - now_timestamp = datetime.datetime(2023, 12, 8) - expected_file_hashes = { - stid: "2b94d2ef611cfddb6dd537ca63d0ec4fb5d8e880943f81a6d5e724c042ac8971" - } + target_timestamp = datetime.datetime(2023, 12, 8) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, + expected_bufr_variables = BUFRVariables( + wmo_id="04464", + station_type="mobile", + # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 + timestamp=datetime.datetime(2023, 12, 7, 23, 00), + relativeHumidity=69, + airTemperature=255.95, + pressure=77300.0, + windDirection=149, + windSpeed=14.9, + latitude=66.48249, + longitude=-46.29427, + heightOfStationGroundAboveMeanSeaLevel=2123.7, + heightOfBarometerAboveMeanSeaLevel=2124.7, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.09, + 
heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, + ) + pd.testing.assert_series_equal( + bufr_data.as_series(), + expected_bufr_variables.as_series(), ) def test_get_bufr_has_new_data_dont_store_position(self): @@ -161,23 +160,37 @@ def test_get_bufr_has_new_data_dont_store_position(self): stid = "DY2" # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"DY2": datetime.datetime(2023, 12, 1)} - now_timestamp = datetime.datetime(2023, 12, 8) - expected_file_hashes = { - stid: "2b94d2ef611cfddb6dd537ca63d0ec4fb5d8e880943f81a6d5e724c042ac8971" - } + target_timestamp = datetime.datetime(2023, 12, 8) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=False, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, + expected_bufr_variables = BUFRVariables( + wmo_id="04464", + station_type="mobile", + # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 + timestamp=datetime.datetime(2023, 12, 7, 23, 00), + relativeHumidity=69, + airTemperature=255.95, + pressure=77300.0, + windDirection=149, + windSpeed=14.9, + latitude=66.48249, + longitude=-46.29427, + heightOfStationGroundAboveMeanSeaLevel=2123.7, + heightOfBarometerAboveMeanSeaLevel=2124.7, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.09, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, + ) + pd.testing.assert_series_equal( + bufr_data.as_series(), + expected_bufr_variables.as_series(), ) def test_get_bufr_stid_to_skip(self): @@ -186,24 +199,20 @@ def test_get_bufr_stid_to_skip(self): stid = "DY2" # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"DY2": datetime.datetime(2023, 12, 1)} - 
now_timestamp = datetime.datetime(2023, 12, 6) - expected_file_hashes = {} + target_timestamp = datetime.datetime(2023, 12, 6) mapping = self.get_station_configuration_mapping( stid, wmo_id="04464", export_bufr=False ) - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, - ) + self.assertIsNone(bufr_data) def test_get_bufr_has_no_data_newer_than_latests_timestamps(self): l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") @@ -211,47 +220,57 @@ def test_get_bufr_has_no_data_newer_than_latests_timestamps(self): stid = "DY2" # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {stid: datetime.datetime(2023, 12, 7, 23, 00)} - now_timestamp = datetime.datetime(2023, 12, 8) - expected_file_hashes = {} + target_timestamp = datetime.datetime(2023, 12, 8) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, - ) + self.assertIsNone(bufr_data) def test_get_bufr_includes_datasets_not_in_latests_timestamps(self): l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") l3_src = pd.read_csv(l3_src_filepath) stid = "DY2" latest_timestamps = {} - now_timestamp = datetime.datetime(2023, 12, 8) - expected_file_hashes = { - stid: "2b94d2ef611cfddb6dd537ca63d0ec4fb5d8e880943f81a6d5e724c042ac8971" - } - + target_timestamp = datetime.datetime(2023, 12, 8) mapping = 
self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, + + expected_bufr_variables = BUFRVariables( + wmo_id="04464", + station_type="mobile", + # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 + timestamp=datetime.datetime(2023, 12, 7, 23, 00), + relativeHumidity=69, + airTemperature=255.95, + pressure=77300.0, + windDirection=149, + windSpeed=14.9, + latitude=66.48249, + longitude=-46.29427, + heightOfStationGroundAboveMeanSeaLevel=2123.7, + heightOfBarometerAboveMeanSeaLevel=2124.7, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.09, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, + ) + pd.testing.assert_series_equal( + bufr_data.as_series(), + expected_bufr_variables.as_series(), ) def test_get_bufr_has_old_data_compared_to_now(self): @@ -260,23 +279,19 @@ def test_get_bufr_has_old_data_compared_to_now(self): l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") l3_src = pd.read_csv(l3_src_filepath) latest_timestamps = {stid: datetime.datetime(2023, 12, 6)} - now_timestamp = datetime.datetime(2023, 12, 20) - expected_file_hashes = {} + target_timestamp = datetime.datetime(2023, 12, 20) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, - ) + 
self.assertIsNone(bufr_data) def test_invalid_value_at_last_index(self): stid = "DY2" @@ -286,26 +301,60 @@ def test_invalid_value_at_last_index(self): # Set some of instantanous values to nan l3_src.loc[140:, "p_i"] = np.nan latest_timestamps = {stid: datetime.datetime(2023, 12, 1)} - now_timestamp = datetime.datetime(2023, 12, 8) - expected_file_hashes = { - stid: "bb951e0245ce3f6fe656b9bb5c85f097753a6969cc60b2cf8b34e0764495e627" - } - + target_timestamp = datetime.datetime(2023, 12, 8) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, + expected_bufr_variables = BUFRVariables( + wmo_id="04464", + station_type="mobile", + # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 + timestamp=datetime.datetime(2023, 12, 7, 23, 00), + relativeHumidity=69, + airTemperature=255.95, + pressure=np.nan, + windDirection=149, + windSpeed=14.9, + latitude=66.48249, + longitude=-46.29427, + heightOfStationGroundAboveMeanSeaLevel=2123.7, + heightOfBarometerAboveMeanSeaLevel=2124.7, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.09, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, + ) + pd.testing.assert_series_equal( + bufr_data.as_series(), + expected_bufr_variables.as_series(), ) + def test_invalid_position_data(self): + stid = "DY2" + # Newest measurement in DY2_hour: 2023-12-07 23:00:00 + l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") + l3_src = pd.read_csv(l3_src_filepath) + # Set some of instantanous values to nan + l3_src.loc[:, "gps_lat"] = np.nan + latest_timestamps = {stid: datetime.datetime(2023, 12, 1)} + target_timestamp = 
datetime.datetime(2023, 12, 8) + mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") + bufr_data = run_get_bufr( + l3_data=l3_src, + target_timestamp=target_timestamp, + latest_timestamps=latest_timestamps, + stid=stid, + store_positions=True, + linear_regression_time_limit="91d", + station_configuration_mapping=mapping, + ) + self.assertIsNone(bufr_data) + def test_multiple_last_valid_indices_all_instantaneous_timestamps_are_none(self): stid = "DY2" # Newest measurement in DY2_hour: 2023-12-07 23:00:00 @@ -323,23 +372,19 @@ def test_multiple_last_valid_indices_all_instantaneous_timestamps_are_none(self) ], ] = np.nan latest_timestamps = {stid: datetime.datetime(2023, 12, 1)} - now_timestamp = datetime.datetime(2023, 12, 6) - expected_file_hashes = {} - + target_timestamp = datetime.datetime(2023, 12, 6) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, - ) + + self.assertIsNone(bufr_data) def test_multiple_last_valid_indices_all_older_than_2days(self): stid = "DY2" @@ -349,23 +394,19 @@ def test_multiple_last_valid_indices_all_older_than_2days(self): # Set some of instantanous values to nan l3_src.loc[140:, "p_i"] = np.nan latest_timestamps = {stid: datetime.datetime(2023, 12, 1)} - now_timestamp = datetime.datetime(2023, 12, 10) - expected_file_hashes = {} + target_timestamp = datetime.datetime(2023, 12, 10) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, 
stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, - ) + self.assertIsNone(bufr_data) def test_min_data_wx_failed(self): l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") @@ -374,22 +415,19 @@ def test_min_data_wx_failed(self): stid = "DY2" # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"DY2": datetime.datetime(2023, 12, 1)} - now_timestamp = datetime.datetime(2023, 12, 6) - expected_file_hashes = {} + target_timestamp = datetime.datetime(2023, 12, 6) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, - ) + + self.assertIsNone(bufr_data) def test_min_data_pos_failed(self): l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") @@ -398,22 +436,18 @@ def test_min_data_pos_failed(self): stid = "DY2" # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"DY2": datetime.datetime(2023, 12, 1)} - now_timestamp = datetime.datetime(2023, 12, 6) - expected_file_hashes = {} + target_timestamp = datetime.datetime(2023, 12, 6) mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, - ) + self.assertIsNone(bufr_data) 
def test_ignore_newer_data_than_now_input(self): l3_src_filepath = DATA_DIR.joinpath("tx_l3_test1.csv") @@ -422,27 +456,41 @@ def test_ignore_newer_data_than_now_input(self): # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {stid: datetime.datetime(2023, 12, 1)} # New is before the latest data - now_timestamp = datetime.datetime( + target_timestamp = datetime.datetime( 2023, 12, 6, ) - expected_file_hashes = { - stid: "976a24edef2d0e6e2f29fa13d6242419fa05b24905db715fe351c19a1aa1d577" - } mapping = self.get_station_configuration_mapping(stid, wmo_id="04464") - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, + expected_bufr_variables = BUFRVariables( + wmo_id="04464", + station_type="mobile", + # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 but target_timestamp is 2023-12-06 + timestamp=datetime.datetime(2023, 12, 6, 0, 0), + relativeHumidity=82, + airTemperature=250.85, + pressure=77370.0, + windDirection=153, + windSpeed=10.4, + latitude=66.48249, + longitude=-46.29426, + heightOfStationGroundAboveMeanSeaLevel=2123.3, + heightOfBarometerAboveMeanSeaLevel=2124.3, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.09, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, + ) + pd.testing.assert_series_equal( + bufr_data.as_series(), + expected_bufr_variables.as_series(), ) def test_land_station_export(self): @@ -451,24 +499,37 @@ def test_land_station_export(self): stid = "WEG_B" # Newest measurement in DY2_hour: 2023-12-07 23:00:00 latest_timestamps = {"WEG_B": datetime.datetime(2023, 12, 1)} - now_timestamp = datetime.datetime(2023, 12, 8) - expected_file_hashes = { - stid: 
"eb42044f38326a295bcd18bd42fba5ed88800c5a688f885b87147aacaa5f5001" - } - + target_timestamp = datetime.datetime(2023, 12, 8) mapping = self.get_station_configuration_mapping( stid, wmo_id="460", station_type="land" ) - file_hashes = run_get_bufr( + bufr_data = run_get_bufr( l3_data=l3_src, - now_timestamp=now_timestamp, + target_timestamp=target_timestamp, latest_timestamps=latest_timestamps, stid=stid, store_positions=True, - time_limit="91d", + linear_regression_time_limit="91d", station_configuration_mapping=mapping, ) - self.assertDictEqual( - expected_file_hashes, - file_hashes, + expected_bufr_variables = BUFRVariables( + wmo_id="460", + station_type="land", + # Newest measurement in tx_l3_test1.csv: 2023-12-07 23:00:00 + timestamp=datetime.datetime(2023, 12, 7, 23, 00), + relativeHumidity=69, + airTemperature=255.95, + pressure=77300.0, + windDirection=149, + windSpeed=14.9, + latitude=66.48249, + longitude=-46.29427, + heightOfStationGroundAboveMeanSeaLevel=2123.7, + heightOfBarometerAboveMeanSeaLevel=2124.7, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=4.09, + heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=4.59, + ) + pd.testing.assert_series_equal( + bufr_data.as_series(), + expected_bufr_variables.as_series(), ) diff --git a/tests/unit/bufr_export/test_realtime_utilitites.py b/tests/unit/bufr_export/test_realtime_utilitites.py index a7306a3f..557ffe5c 100644 --- a/tests/unit/bufr_export/test_realtime_utilitites.py +++ b/tests/unit/bufr_export/test_realtime_utilitites.py @@ -50,10 +50,10 @@ def test_1(self): "gps_lon": -46.294232, "gps_alt": 2116.0, "z_boom_u": 4.1901, - "gps_lat_fit": 66.482479, - "gps_lon_fit": -46.294269, - "gps_alt_fit": 2121.4, - "z_boom_u_smooth": 4.2, + "gps_lat_fit": 66.4824788, + "gps_lon_fit": -46.2942685, + "gps_alt_fit": 2121.4118, + "z_boom_u_smooth": 4.188, }, name=datetime.datetime(2023, 12, 7, 6), ) @@ -94,10 +94,10 @@ def test_latest_data_row_is_invalid(self): "gps_lon": -46.294335, "gps_alt": 
2125.0, "z_boom_u": 4.1844, - "gps_lat_fit": 66.482483, - "gps_lon_fit": -46.294275, - "gps_alt_fit": 2123.3, - "z_boom_u_smooth": 4.2, + "gps_lat_fit": 66.4824828, + "gps_lon_fit": -46.2942753, + "gps_alt_fit": 2123.3088, + "z_boom_u_smooth": 4.187, }, name=expected_output_timestamp, ) @@ -127,10 +127,10 @@ def test_latest_data_has_some_invalid_values(self): "gps_lon": -46.294232, "gps_alt": 2116.0, "z_boom_u": 4.1901, - "gps_lat_fit": 66.482479, - "gps_lon_fit": -46.294269, - "gps_alt_fit": 2121.4, - "z_boom_u_smooth": 4.2, + "gps_lat_fit": 66.4824788, + "gps_lon_fit": -46.2942685, + "gps_alt_fit": 2121.4118, + "z_boom_u_smooth": 4.188, }, name=datetime.datetime(2023, 12, 7, 6), ) @@ -173,3 +173,35 @@ def test_auxiliary_input_data(self): ) self.assertEqual(expected_output, latest_data["auxiliary_data"]) + + def test_skipped_variables(self): + """ + Test that the variables in vars_to_skip are set to nan if they are present in the input data. + """ + data = self.get_data() + expected_output = pd.Series( + data={ + "p_i": float("nan"), + "t_i": -16.7, + "rh_i": 84.6, + "wspd_i": 14.83, + "wdir_i": 142.2, + "gps_lat": 66.482469, + "gps_lon": -46.294232, + "gps_alt": 2116.0, + "z_boom_u": 4.1901, + "gps_lat_fit": 66.4824788, + "gps_lon_fit": -46.2942685, + "gps_alt_fit": 2121.4118, + "z_boom_u_smooth": 4.188, + }, + name=datetime.datetime(2023, 12, 7, 6), + ) + + latest_data = get_latest_data( + df=data, + lin_reg_time_limit="1w", + vars_to_skip=["p_i", "a_non_existing_variable"], + ) + + pd.testing.assert_series_equal(latest_data, expected_output, rtol=1e-8) diff --git a/tests/unit/test_station_config.py b/tests/unit/test_station_config.py new file mode 100644 index 00000000..4788d019 --- /dev/null +++ b/tests/unit/test_station_config.py @@ -0,0 +1,194 @@ +from pathlib import Path +from unittest import TestCase +from tempfile import TemporaryDirectory + +from pypromice.station_configuration import ( + StationConfiguration, + load_station_configuration_mapping, + 
write_station_configuration_mapping, +) +from tests.utilities import get_station_configuration + + +class StationConfigurationTestCase(TestCase): + def test_read_toml(self): + with TemporaryDirectory() as temp_dir: + source_path = Path(temp_dir) / "UPE_L.toml" + source_str = """ + stid = "UPE_L" + station_site = "UPE_L" + project = "Promice" + station_type = "mobile" + wmo_id = "04423" + barometer_from_gps = -0.25 + anemometer_from_sonic_ranger = 0.4 + temperature_from_sonic_ranger = 0.0 + height_of_gps_from_station_ground = 0.9 + sonic_ranger_from_gps = 1.3 + export_bufr = true + skipped_variables = [] + positions_update_timestamp_only = false + """ + with source_path.open("w") as source_io: + source_io.writelines(source_str) + + expected_configuration = StationConfiguration( + stid="UPE_L", + station_site="UPE_L", + project="Promice", + station_type="mobile", + wmo_id="04423", + barometer_from_gps=-0.25, + anemometer_from_sonic_ranger=0.4, + temperature_from_sonic_ranger=0.0, + height_of_gps_from_station_ground=0.9, + sonic_ranger_from_gps=1.3, + export_bufr=True, + comment=None, + skipped_variables=[], + positions_update_timestamp_only=False, + ) + + station_configuration = StationConfiguration.load_toml(source_path) + self.assertEqual( + expected_configuration, + station_configuration, + ) + + def test_read_toml_with_unexpected_field(self): + with TemporaryDirectory() as temp_dir: + source_path = Path(temp_dir) / "UPE_L.toml" + source_str = """ + stid = "UPE_L" + station_site = "UPE_L" + project = "Promice" + station_type = "mobile" + wmo_id = "04423" + barometer_from_gps = -0.25 + anemometer_from_sonic_ranger = 0.4 + temperature_from_sonic_ranger = 0.0 + height_of_gps_from_station_ground = 0.9 + sonic_ranger_from_gps = 1.3 + export_bufr = true + skipped_variables = [] + positions_update_timestamp_only = false + an_unexpected_field = 42 + """ + with source_path.open("w") as source_io: + source_io.writelines(source_str) + + expected_configuration = 
StationConfiguration( + stid="UPE_L", + station_site="UPE_L", + project="Promice", + station_type="mobile", + wmo_id="04423", + barometer_from_gps=-0.25, + anemometer_from_sonic_ranger=0.4, + temperature_from_sonic_ranger=0.0, + height_of_gps_from_station_ground=0.9, + sonic_ranger_from_gps=1.3, + export_bufr=True, + comment=None, + skipped_variables=[], + positions_update_timestamp_only=False, + ) + + with self.assertRaises(ValueError): + StationConfiguration.load_toml(source_path) + + station_configuration = StationConfiguration.load_toml(source_path, skip_unexpected_fields=True) + + self.assertEqual( + expected_configuration, + station_configuration, + ) + + + + def test_write_read(self): + with TemporaryDirectory() as temp_dir: + output_path = Path(temp_dir) / "UPE_L.toml" + src_station_config = StationConfiguration( + stid="UPE_L", + station_site="UPE_L", + project="Promice", + station_type="mobile", + wmo_id="04423", + barometer_from_gps=-0.25, + anemometer_from_sonic_ranger=0.4, + temperature_from_sonic_ranger=0.0, + height_of_gps_from_station_ground=0.9, + sonic_ranger_from_gps=1.3, + export_bufr=True, + comment=None, + skipped_variables=[], + positions_update_timestamp_only=False, + ) + src_station_config.dump_toml(output_path) + + read_station_config = StationConfiguration.load_toml(output_path) + self.assertEqual( + src_station_config, + read_station_config, + ) + + def test_read_station_config_mapping(self): + with TemporaryDirectory() as temp_dir: + station_config_root = Path(temp_dir) / "station_configs" + station_config_root.mkdir() + source_mapping = { + "UPE_L": get_station_configuration(stid="UPE_L"), + "UPE_R": get_station_configuration(stid="UPE_R"), + } + for stid, station_config in source_mapping.items(): + station_config.dump_toml(station_config_root / f"{stid}.toml") + + read_mapping = load_station_configuration_mapping(station_config_root) + self.assertDictEqual( + source_mapping, + read_mapping, + ) + + def 
test_write_station_config_mapping(self): + with TemporaryDirectory() as temp_dir: + station_config_root = Path(temp_dir) / "station_configs" + station_config_root.mkdir() + source_mapping = { + "UPE_L": get_station_configuration(stid="UPE_L"), + "UPE_R": get_station_configuration(stid="UPE_R"), + } + + write_station_configuration_mapping(source_mapping, station_config_root) + + read_mapping = load_station_configuration_mapping(station_config_root) + self.assertDictEqual( + source_mapping, + read_mapping, + ) + + def test_read_station_config_mapping_empty(self): + with TemporaryDirectory() as temp_dir: + station_config_root = Path(temp_dir) / "station_configs" + station_config_root.mkdir() + + read_mapping = load_station_configuration_mapping(station_config_root) + self.assertDictEqual( + {}, + read_mapping, + ) + + def test_read_station_config_mapping_ingore_filenames(self): + def test_read_station_config_mapping(self): + with TemporaryDirectory() as temp_dir: + station_config_root = Path(temp_dir) / "station_configs" + station_config_root.mkdir() + station_config = get_station_configuration(stid="UPE_L") + station_config.dump_toml(station_config_root / "a_custom_filename.toml") + expected_station_config_mapping = {station_config.stid: station_config} + + read_mapping = load_station_configuration_mapping(station_config_root) + self.assertDictEqual( + expected_station_config_mapping, + read_mapping, + ) diff --git a/tests/utilities.py b/tests/utilities.py new file mode 100644 index 00000000..a56301d4 --- /dev/null +++ b/tests/utilities.py @@ -0,0 +1,61 @@ +import random +import uuid + +from pypromice.postprocess.bufr_utilities import BUFR_TEMPLATES +from pypromice.station_configuration import StationConfiguration + +STATION_TYPE_STRINGS = tuple(BUFR_TEMPLATES.keys()) + + +def get_station_configuration(**kwargs) -> StationConfiguration: + """ + Create a StationConfiguration object with random values. 
+ + Parameters + ---------- + kwargs + Keyword arguments to providie explicit values for the StationConfiguration object. + Returns + ------- + """ + stid = kwargs.get("stid", str(uuid.uuid4())) + station_site = kwargs.get("station_site", str(uuid.uuid4())) + project = kwargs.get("project", str(uuid.uuid4())) + station_type = kwargs.get("station_type", random.choice(STATION_TYPE_STRINGS)) + # WMO Station number <1024 for land stations + # https://vocabulary-manager.eumetsat.int/vocabularies/BUFR/WMO/32/TABLE_B/001002 + wmo_id = kwargs.get("wmo_id", "{:05}".format(random.randint(0, 1023))) + barometer_from_gps = kwargs.get("barometer_from_gps", random.random() * 3) + anemometer_from_sonic_ranger = kwargs.get( + "anemometer_from_sonic_ranger", random.random() * 3 + ) + temperature_from_sonic_ranger = kwargs.get( + "temperature_from_sonic_ranger", random.random() * 3 + ) + height_of_gps_from_station_ground = kwargs.get( + "height_of_gps_from_station_ground", random.random() * 3 + ) + sonic_ranger_from_gps = kwargs.get("sonic_ranger_from_gps", random.random() * 3) + export_bufr = kwargs.get("export_bufr", random.random() > 0.5) + skipped_variables = kwargs.get("skipped_variables", []) + positions_update_timestamp_only = kwargs.get( + "positions_update_timestamp_only", random.random() > 0.5 + ) + station_relocation = kwargs.get("station_relocation", []) + + return StationConfiguration( + stid=stid, + station_site=station_site, + project=project, + station_type=station_type, + wmo_id=wmo_id, + barometer_from_gps=barometer_from_gps, + anemometer_from_sonic_ranger=anemometer_from_sonic_ranger, + temperature_from_sonic_ranger=temperature_from_sonic_ranger, + height_of_gps_from_station_ground=height_of_gps_from_station_ground, + sonic_ranger_from_gps=sonic_ranger_from_gps, + export_bufr=export_bufr, + skipped_variables=skipped_variables, + positions_update_timestamp_only=positions_update_timestamp_only, + station_relocation=station_relocation, + )