From c3f1ef029fa2334a4c780eedbc0216e2c17ef7cd Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Fri, 8 Apr 2022 12:07:25 +0100 Subject: [PATCH 1/3] assert the types we want of pv data - TDD --- tests/data_sources/test_pv_data_source.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/data_sources/test_pv_data_source.py b/tests/data_sources/test_pv_data_source.py index 23b5b3a3..aeac1f5e 100644 --- a/tests/data_sources/test_pv_data_source.py +++ b/tests/data_sources/test_pv_data_source.py @@ -64,6 +64,9 @@ def test_get_example_and_batch(): # noqa: D103 # start at 6, to avoid some nans batch = pv_data_source.get_batch(locations=locations[6:16]) assert batch.power_mw.shape == (10, 19, DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE) + assert str(batch.x_osgb.dtype) == "float32" + assert str(batch.y_osgb.dtype) == "float32" + assert str(batch.id.dtype) == "int64" def test_drop_pv_systems_which_produce_overnight(): # noqa: D103 From fc40c60db458811145855431e9767017abed37b3 Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Fri, 8 Apr 2022 12:18:55 +0100 Subject: [PATCH 2/3] change data types to float32 or int32 --- nowcasting_dataset/data_sources/pv/pv_data_source.py | 7 +++++-- nowcasting_dataset/dataset/xr_utils.py | 11 ++++++++--- tests/data_sources/test_pv_data_source.py | 5 ++++- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/nowcasting_dataset/data_sources/pv/pv_data_source.py b/nowcasting_dataset/data_sources/pv/pv_data_source.py index 30f525d9..7d25d14d 100644 --- a/nowcasting_dataset/data_sources/pv/pv_data_source.py +++ b/nowcasting_dataset/data_sources/pv/pv_data_source.py @@ -342,14 +342,17 @@ def get_example(self, location: SpaceTimeLocation) -> xr.Dataset: data=pv_system_row_number, dims=["id"], ) - pv["x_osgb"] = x_coords - pv["y_osgb"] = y_coords + pv["x_osgb"] = x_coords.astype("float32") + pv["y_osgb"] = y_coords.astype("float32") pv["pv_system_row_number"] = pv_system_row_number # pad out so that there are always n_pv_systems_per_example, pad with zeros pad_n = self.n_pv_systems_per_example - len(pv.id) pv = pv.pad(id=(0, pad_n), power_mw=((0, 0), (0, pad_n)), constant_values=0) + # format id + pv.__setitem__("id", pv.id.astype("int32")) + return pv def get_locations(self, t0_datetimes_utc: pd.DatetimeIndex) -> List[SpaceTimeLocation]: diff --git a/nowcasting_dataset/dataset/xr_utils.py b/nowcasting_dataset/dataset/xr_utils.py index 7b9fab6d..f8b700a5 100644 --- a/nowcasting_dataset/dataset/xr_utils.py +++ b/nowcasting_dataset/dataset/xr_utils.py @@ -17,7 +17,12 @@ def join_list_dataset_to_batch_dataset(datasets: list[xr.Dataset]) -> xr.Dataset new_dataset = dataset.expand_dims(dim="example").assign_coords(example=("example", [i])) new_datasets.append(new_dataset) - return xr.concat(new_datasets, dim="example") + joined_dataset = xr.concat(new_datasets, dim="example") + + # format example index + joined_dataset.__setitem__("example", joined_dataset.example.astype("int32")) + + return joined_dataset def convert_coordinates_to_indexes_for_list_datasets( @@ -43,9 +48,9 @@ def convert_coordinates_to_indexes(dataset: xr.Dataset) -> xr.Dataset: for original_dim_name in original_dim_names: original_coords = dataset[original_dim_name] - new_index_coords = np.arange(len(original_coords)) + new_index_coords = np.arange(len(original_coords)).astype("int32") new_index_dim_name = f"{original_dim_name}_index" - dataset[original_dim_name] = new_index_coords + dataset[original_dim_name] = new_index_coords.astype("int32") dataset = dataset.rename({original_dim_name: new_index_dim_name}) # Save the original_coords back into dataset, but this time it won't be used as # coords for the variables payload in the dataset. diff --git a/tests/data_sources/test_pv_data_source.py b/tests/data_sources/test_pv_data_source.py index aeac1f5e..3c8a1300 100644 --- a/tests/data_sources/test_pv_data_source.py +++ b/tests/data_sources/test_pv_data_source.py @@ -66,7 +66,10 @@ def test_get_example_and_batch(): # noqa: D103 assert batch.power_mw.shape == (10, 19, DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE) assert str(batch.x_osgb.dtype) == "float32" assert str(batch.y_osgb.dtype) == "float32" - assert str(batch.id.dtype) == "int64" + assert str(batch.id.dtype) == "int32" + assert str(batch.example.dtype) == "int32" + assert str(batch.id_index.dtype) == "int32" + assert str(batch.time_index.dtype) == "int32" def test_drop_pv_systems_which_produce_overnight(): # noqa: D103 From c3d5bc737f3d1646ed909b58855bac4e7e89257c Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Fri, 8 Apr 2022 12:19:35 +0100 Subject: [PATCH 3/3] tidy --- nowcasting_dataset/dataset/xr_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nowcasting_dataset/dataset/xr_utils.py b/nowcasting_dataset/dataset/xr_utils.py index f8b700a5..d0ad97aa 100644 --- a/nowcasting_dataset/dataset/xr_utils.py +++ b/nowcasting_dataset/dataset/xr_utils.py @@ -50,7 +50,7 @@ def convert_coordinates_to_indexes(dataset: xr.Dataset) -> xr.Dataset: original_coords = dataset[original_dim_name] new_index_coords = np.arange(len(original_coords)).astype("int32") new_index_dim_name = f"{original_dim_name}_index" - dataset[original_dim_name] = new_index_coords.astype("int32") + dataset[original_dim_name] = new_index_coords dataset = dataset.rename({original_dim_name: new_index_dim_name}) # Save the original_coords back into dataset, but this time it won't be used as # coords for the variables payload in the dataset.