From 0d5c5264d31ec51ff51ac9657552b402afa2ee4a Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Tue, 9 Jul 2024 21:12:08 +0100 Subject: [PATCH] remove no distinct forecast values last seven days --- nowcasting_datamodel/save/save.py | 31 ++++------- nowcasting_datamodel/save/update.py | 83 ++++++++++++++++++++++++++++- tests/save/test_save.py | 4 ++ 3 files changed, 96 insertions(+), 22 deletions(-) diff --git a/nowcasting_datamodel/save/save.py b/nowcasting_datamodel/save/save.py index a96f3545..1e4f45cb 100644 --- a/nowcasting_datamodel/save/save.py +++ b/nowcasting_datamodel/save/save.py @@ -25,7 +25,7 @@ def save( update_gsp: Optional[bool] = True, apply_adjuster: Optional[bool] = True, save_to_last_seven_days: Optional[bool] = True, - save_distinct_last_seven_days: bool = True, + remove_non_distinct_last_seven_days: bool = True, ): """ Save forecast to database @@ -41,7 +41,7 @@ def save( :param update_gsp: Optional (default true), to update all the GSP forecasts :param apply_adjuster: Optional (default true), to apply the adjuster :param save_to_last_seven_days: Optional (default true), to save to the last seven days table - :param save_distinct_last_seven_days: Optional (default True), to only save distinct + :param remove_non_distinct_last_seven_days: Optional (default True), to only keep distinct forecast values in the forecast_value_last_seven_days table """ @@ -71,7 +71,9 @@ def save( if save_to_last_seven_days: logger.debug("Saving to last seven days table") save_all_forecast_values_seven_days( - session=session, forecasts=forecasts, save_distinct=save_distinct_last_seven_days + session=session, + forecasts=forecasts, + remove_non_distinct=remove_non_distinct_last_seven_days, ) session.commit() @@ -117,14 +119,14 @@ def save_pv_system(session: Session, pv_system: PVSystem) -> PVSystemSQL: def save_all_forecast_values_seven_days( - session: Session, forecasts: List[ForecastSQL], save_distinct: bool = True + session: Session, forecasts: List[ForecastSQL], remove_non_distinct: bool = True ): """ Save all the forecast values in the last seven days table :param session: database sessions :param forecasts: list of forecasts - :param save_distinct: Optional (default True), to only save distinct forecast values + :param remove_non_distinct: Optional (default True), to only keep distinct forecast values If the last saved forecast value is the same as the current one, it will not be saved """ @@ -135,24 +137,13 @@ def save_all_forecast_values_seven_days( forecast_value_last_7_days = change_forecast_value_to_forecast_last_7_days( forecast_value ) - - if save_distinct: - # only keep it if its unique - # this does take extra time, but it keeps the database smaller - forecast_is_new = is_new_forecast_values_distinct( - forecast_value=forecast_value_last_7_days, - session=session, - location_id=forecast.location_id, - model_name=forecast.model.name, - ) - if forecast_is_new: - forecast_values_last_7_days.append(forecast_value_last_7_days) - else: - forecast_values_last_7_days.append(forecast_value_last_7_days) + forecast_values_last_7_days.append(forecast_value_last_7_days) # add them to the database add_forecast_last_7_days_and_remove_old_data( - session=session, forecast_values=forecast_values_last_7_days + session=session, + forecast_values=forecast_values_last_7_days, + remove_non_distinct=remove_non_distinct, ) diff --git a/nowcasting_datamodel/save/update.py b/nowcasting_datamodel/save/update.py index 1083e281..743d9883 100644 --- a/nowcasting_datamodel/save/update.py +++ b/nowcasting_datamodel/save/update.py @@ -9,7 +9,7 @@ from sqlalchemy.orm.session import Session from nowcasting_datamodel import N_GSP -from nowcasting_datamodel.models import ForecastValueSevenDaysSQL +from nowcasting_datamodel.models import ForecastValueSevenDaysSQL, LocationSQL, MLModelSQL from nowcasting_datamodel.models.forecast import ( ForecastSQL, ForecastValueLatestSQL, @@ -296,14 +296,84 @@ def change_forecast_value_to_forecast_last_7_days( return forecast_new +def remove_non_distinct_forecast_values( + session: Session, start_datetime: datetime, end_datetime: datetime +): + """ + Remove any non-distinct forecast values + + We remove any non-distinct values from ForecastValueSevenDaysSQL. The values are distinct on + - target_time + - location_id + - model_name + - expected_power_generation_megawatts + + We keep the first value, and remove and values from later on. + + :param session: database session + :param start_datetime: start datetime + :param end_datetime: end_dateimte + """ + + logger.debug(f"Removing non distinct forecast values for {start_datetime} to {end_datetime}") + + # 1. sub query, these are the values we want to keep + sub_query = session.query(ForecastValueSevenDaysSQL.uuid) + + # distinct + sub_query = sub_query.distinct( + ForecastValueSevenDaysSQL.target_time, + ForecastSQL.location_id, + MLModelSQL.name, + ForecastValueSevenDaysSQL.expected_power_generation_megawatts, + ) + + # join + sub_query = sub_query.join(ForecastSQL) + sub_query = sub_query.join(LocationSQL) + sub_query = sub_query.join(MLModelSQL) + + # filter on time + sub_query = sub_query.filter(ForecastValueSevenDaysSQL.target_time >= start_datetime) + sub_query = sub_query.filter(ForecastValueSevenDaysSQL.target_time < end_datetime) + + # order by + sub_query = sub_query.order_by( + ForecastValueSevenDaysSQL.target_time, + ForecastSQL.location_id, + MLModelSQL.name, + ForecastValueSevenDaysSQL.expected_power_generation_megawatts, + ForecastValueSevenDaysSQL.created_utc, # get the first one + ) + + sub_query = sub_query.subquery() + + # 2. main query + query = session.query(ForecastValueSevenDaysSQL) + + # filter on tiem + query = query.filter(ForecastValueSevenDaysSQL.target_time >= start_datetime) + query = query.filter(ForecastValueSevenDaysSQL.target_time < end_datetime) + + # select uuid not in subquery + query = query.filter(ForecastValueSevenDaysSQL.uuid.notin_(sub_query)) + + # delete all results + query.delete() + + def add_forecast_last_7_days_and_remove_old_data( - forecast_values: List[ForecastValueSevenDaysSQL], session: Session + forecast_values: List[ForecastValueSevenDaysSQL], + session: Session, + remove_non_distinct: bool = True, ): """ Add forecast values and delete old values :param forecast_values: :param session: + :param remove_non_distinct: Optional (default True), to only keep distinct forecast values + If the last saved forecast value is the same as the current one, it will not be saved :return: """ @@ -313,7 +383,16 @@ def add_forecast_last_7_days_and_remove_old_data( # remove old data now_minus_7_days = datetime.now(tz=timezone.utc) - timedelta(days=7) + # remove any duplicate forecast values + if remove_non_distinct: + for i in range(0, 24 * 10): + start_datetime = now_minus_7_days + timedelta(hours=i) + end_datetime = start_datetime + timedelta(hours=1) + remove_non_distinct_forecast_values(session, start_datetime, end_datetime) + logger.debug(f"Removing data before {now_minus_7_days}") query = session.query(ForecastValueSevenDaysSQL) query = query.where(ForecastValueSevenDaysSQL.target_time < now_minus_7_days) query.delete() + + session.commit() diff --git a/tests/save/test_save.py b/tests/save/test_save.py index 28ed6b6f..3954a919 100644 --- a/tests/save/test_save.py +++ b/tests/save/test_save.py @@ -116,6 +116,10 @@ def test_save_all_forecast_values_seven_days(db_session): now = datetime.now(tz=timezone.utc) forecasts = make_fake_forecasts(gsp_ids=range(0, 3), session=db_session, t0_datetime_utc=now) + # need to commit the forecasts to the database + db_session.add_all(forecasts) + db_session.commit() + save_all_forecast_values_seven_days(session=db_session, forecasts=forecasts) assert len(db_session.query(ForecastValueSevenDaysSQL).all()) == 3 * 112