From c4d84b66fa820b7346776f46b3f688315354bc13 Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Fri, 22 Dec 2023 15:09:38 +0000 Subject: [PATCH] remove old database: major change --- pvconsumer/app.py | 100 +++++++----------- pvconsumer/pv_systems.py | 150 ++++++++++++++++----------- pvconsumer/save.py | 38 +++---- pvconsumer/solar_sheffield_passiv.py | 12 +-- pvconsumer/utils.py | 71 ++++--------- tests/conftest.py | 13 +-- tests/intergration/test_app.py | 45 +++----- tests/intergration/test_ss_passiv.py | 2 +- tests/unittest/test_pv_systems.py | 109 ++++++++----------- tests/unittest/test_utils.py | 74 +++++-------- 10 files changed, 254 insertions(+), 360 deletions(-) diff --git a/pvconsumer/app.py b/pvconsumer/app.py index e5fc2de..f8f058c 100644 --- a/pvconsumer/app.py +++ b/pvconsumer/app.py @@ -12,19 +12,16 @@ from typing import List, Optional, Tuple import click -from nowcasting_datamodel.connection import DatabaseConnection -from nowcasting_datamodel.models.base import Base_Forecast, Base_PV -from nowcasting_datamodel.models.pv import PVSystemSQL -from nowcasting_datamodel.read.read import update_latest_input_data_last_updated -from pvoutput import PVOutput -from pvsite_datamodel.connection import DatabaseConnection as PVSiteDatabaseConnection -from sqlalchemy.orm import Session - +import pandas as pd import pvconsumer from pvconsumer.pv_systems import filter_pv_systems_which_have_new_data, get_pv_systems -from pvconsumer.save import save_to_database, save_to_pv_site_database +from pvconsumer.save import save_to_pv_site_database from pvconsumer.solar_sheffield_passiv import get_all_latest_pv_yield_from_solar_sheffield -from pvconsumer.utils import FakeDatabaseConnection, format_pv_data +from pvconsumer.utils import format_pv_data +from pvoutput import PVOutput +from pvsite_datamodel.connection import DatabaseConnection +from pvsite_datamodel.sqlmodels import SiteSQL +from sqlalchemy.orm import Session logging.basicConfig( level=getattr(logging, os.getenv("LOGLEVEL", "INFO")), @@ -37,20 +34,6 @@ @click.option( "--db-url", default=None, - envvar="DB_URL", - help="The Database URL where pv data will be saved", - type=click.STRING, -) -@click.option( - "--db-url-forecast", - default=None, - envvar="DB_URL_FORECAST", - help="The Database URL where update latest data will be saved", - type=click.STRING, -) -@click.option( - "--db-url-pv-site", - default=None, envvar="DB_URL_PV_SITE", help="The PV site Database URL where update latest data will be saved", type=click.STRING, @@ -70,9 +53,7 @@ type=click.STRING, ) def app( - db_url: str, - db_url_forecast: str, - db_url_pv_site: Optional[str] = None, + db_url: Optional[str] = None, filename: Optional[str] = None, provider: str = "pvoutput.org", ): @@ -87,15 +68,9 @@ def app( logger.info(f"Running PV Consumer app ({pvconsumer.__version__})") - connection = DatabaseConnection(url=db_url, base=Base_PV, echo=False) - connection_forecast = DatabaseConnection(url=db_url_forecast, base=Base_Forecast, echo=False) + connection = DatabaseConnection(url=db_url, echo=False) - if db_url_pv_site is not None: - connection_pv_site = PVSiteDatabaseConnection(url=db_url_pv_site, echo=False) - else: - connection_pv_site = FakeDatabaseConnection() - - with connection.get_session() as session, connection_pv_site.get_session() as session_pv_site: + with connection.get_session() as session: # 1. Read list of PV systems (from local file) # and get their refresh times (refresh times can also be stored locally) logger.debug("Read list of PV systems (from local file)") @@ -107,27 +82,21 @@ def app( "Find most recent entered data (for each PV system) in OCF database," "and filter pv systems depending on refresh rate" ) - pv_systems = filter_pv_systems_which_have_new_data(pv_systems=pv_systems) + pv_systems = filter_pv_systems_which_have_new_data(pv_systems=pv_systems, session=session) # 3. Pull data pull_data_and_save( pv_systems=pv_systems, session=session, provider=provider, - session_pv_site=session_pv_site, ) - # update latest data - with connection_forecast.get_session() as session_forecast: - update_latest_input_data_last_updated(session=session_forecast, component="pv") - def pull_data_and_save( - pv_systems: List[PVSystemSQL], + pv_systems: List[SiteSQL], session: Session, provider: str, datetime_utc: Optional[None] = None, - session_pv_site: Optional[Session] = None, ): """ Pull the pv ield data and save to database @@ -157,15 +126,16 @@ def pull_data_and_save( n_pv_systems_per_batch = 50 pv_system_chunks = chunks(original_list=pv_systems, n=n_pv_systems_per_batch) - all_pv_yields_sql = [] + all_pv_yields_df = [] for pv_system_chunk in pv_system_chunks: - # get all the pv system ids from a a group of pv systems - pv_system_ids = [pv_system_id.pv_system_id for pv_system_id in pv_system_chunk] if provider == "pvoutput.org": # set up pv output.org pv_output = PVOutput() + # get all the pv system ids from a a group of pv systems + pv_system_ids = [pv_system.client_site_id for pv_system in pv_system_chunk] + logger.debug(f"Getting data from {provider}") # lets take the date of the datetime now. @@ -187,43 +157,43 @@ def pull_data_and_save( for i, pv_system in enumerate(pv_system_chunk): logger.debug( - f"Processing {i}th pv system ({pv_system.pv_system_id=}), " + f"Processing {i}th pv system ({pv_system.client_site_id=}), " f"out of {len(pv_systems)}" ) # take only the data we need for system id pv_yield_df = all_pv_yield_df[ - all_pv_yield_df["system_id"].astype(int) == pv_system.pv_system_id + all_pv_yield_df["system_id"].astype(int) == pv_system.client_site_id ] + pv_yield_df["site_uuid"] = pv_system.site_uuid logger.debug( f"Got {len(pv_yield_df)} pv yield for " - f"pv systems {pv_system.pv_system_id} before filtering" + f"pv systems {pv_system.client_site_id} before filtering" ) if len(pv_yield_df) == 0: - logger.warning(f"Did not find any data for {pv_system.pv_system_id} for {date}") + logger.warning(f"Did not find any data for {pv_system.client_site_id} for {date}") else: - pv_yields_sql = format_pv_data(pv_system=pv_system, pv_yield_df=pv_yield_df) - all_pv_yields_sql = all_pv_yields_sql + pv_yields_sql + # filter out which is in our database and a funny 0 bug + pv_yield_df = format_pv_data( + pv_system=pv_system, pv_yield_df=pv_yield_df, session=session + ) + + if len(all_pv_yields_df) == 0: + all_pv_yields_df = pv_yield_df + else: + all_pv_yields_df = pd.concat([all_pv_yields_df, pv_yield_df]) - if len(all_pv_yields_sql) > 100: + if len(all_pv_yields_df) > 100: # 4. Save to database - perhaps check no duplicate data. (for each PV system) - save_to_database(session=session, pv_yields=all_pv_yields_sql) - all_pv_yields_sql = [] - - # 5. save to pv sites database - if session_pv_site is not None: - # TODO we are current doing this every round, - # we might find we have to batch it like the other save method - save_to_pv_site_database( - session=session_pv_site, pv_system=pv_system, pv_yield_df=pv_yield_df - ) - session_pv_site.commit() + save_to_pv_site_database(session=session, pv_yield_df=all_pv_yields_df) + all_pv_yields_df = [] # 4. Save to database - perhaps check no duplicate data. (for each PV system) - save_to_database(session=session, pv_yields=all_pv_yields_sql) + logger.debug(all_pv_yields_df) + save_to_pv_site_database(session=session, pv_yield_df=all_pv_yields_df) def chunks(original_list: List, n: int) -> Tuple[List]: diff --git a/pvconsumer/pv_systems.py b/pvconsumer/pv_systems.py index a8d7683..66b7e68 100644 --- a/pvconsumer/pv_systems.py +++ b/pvconsumer/pv_systems.py @@ -1,24 +1,27 @@ """ PV system functions """ import logging import os -from datetime import datetime, timedelta, timezone +from datetime import datetime, timedelta from typing import List, Optional import pandas as pd -from nowcasting_datamodel.models.pv import PVSystem, PVSystemSQL, pv_output, solar_sheffield_passiv -from nowcasting_datamodel.read.read_pv import get_latest_pv_yield -from nowcasting_datamodel.read.read_pv import get_pv_systems as get_pv_systems_from_db +from pvconsumer.utils import pv_output, solar_sheffield_passiv from pvoutput import PVOutput from sqlalchemy.orm import Session +from sqlalchemy import func + +from pvsite_datamodel.sqlmodels import SiteSQL, GenerationSQL +from pvsite_datamodel.read import get_all_sites import pvconsumer from pvconsumer.solar_sheffield_passiv import get_all_systems_from_solar_sheffield -from pvconsumer.utils import df_to_list_pv_system, list_pv_system_to_df + +# from pvconsumer.utils import df_to_list_pv_system, list_pv_system_to_df logger = logging.getLogger(__name__) -def load_pv_systems(provider: str = pv_output, filename: Optional[str] = None) -> List[PVSystem]: +def load_pv_systems(provider: str = pv_output, filename: Optional[str] = None) -> pd.DataFrame: """ Load pv systems from file @@ -36,25 +39,23 @@ def load_pv_systems(provider: str = pv_output, filename: Optional[str] = None) - logger.debug(f"Loading local pv systems from {filename}") - pv_capacity = pd.read_csv(filename) + pv_systems_df = pd.read_csv(filename, index_col=0) - pv_systems = df_to_list_pv_system(pv_systems_df=pv_capacity) - - return pv_systems + return pv_systems_df def find_missing_pv_systems( - pv_systems_local: List[PVSystem], - pv_systems_db: List[PVSystem], + pv_systems_local: pd.DataFrame, + pv_systems_db: pd.DataFrame, provider: str, -) -> List[PVSystem]: +) -> pd.DataFrame: """ Find missing pv systems Gte the pv systems that are in local file, but not in the database Args: - pv_systems_local: list of pv systems stored locally - pv_systems_db: list of pv systems from the database + pv_systems_local: dataframe with "pv_system_id" from local file + pv_systems_db: dataframe with "pv_system_id" from local db Returns: list of pv systems that are not in the database @@ -65,9 +66,11 @@ def find_missing_pv_systems( if len(pv_systems_db) == 0: return pv_systems_local - # change to dataframes - pv_systems_db = list_pv_system_to_df(pv_systems=pv_systems_db)[["pv_system_id"]] - pv_systems_local = list_pv_system_to_df(pv_systems=pv_systems_local)[["pv_system_id"]] + # get system ids + if "pv_system_id" not in pv_systems_db.columns: + pv_systems_db["pv_system_id"] = pv_systems_db["client_site_id"] + pv_systems_db = pv_systems_db[["pv_system_id"]] + pv_systems_local = pv_systems_local[["pv_system_id"]] # https://stackoverflow.com/questions/28901683/pandas-get-rows-which-are-not-in-other-dataframe # merge together @@ -79,12 +82,12 @@ def find_missing_pv_systems( pv_systems_missing = df_all[missing].copy() pv_systems_missing["provider"] = provider - return df_to_list_pv_system(pv_systems_missing) + return pv_systems_missing def get_pv_systems( session: Session, provider: str, filename: Optional[str] = None -) -> List[PVSystemSQL]: +) -> List[SiteSQL]: """ Get PV systems @@ -99,19 +102,19 @@ def get_pv_systems( """ # load all pv systems in database - pv_systems_sql_db: List[PVSystemSQL] = get_pv_systems_from_db( - provider=provider, session=session - ) + pv_systems_sql_db: List[SiteSQL] = get_all_sites(session=session) - pv_systems_db = [PVSystem.from_orm(pv_system) for pv_system in pv_systems_sql_db] + # convert to sql objects to Pandas datafraome + pv_systems_db_df = pd.DataFrame([pv_system.__dict__ for pv_system in pv_systems_sql_db]) # load master file - pv_system_local = load_pv_systems(filename=filename, provider=provider) + pv_system_local_df = load_pv_systems(filename=filename, provider=provider) # get missing pv systems missing_pv_system = find_missing_pv_systems( - pv_systems_local=pv_system_local, pv_systems_db=pv_systems_db, provider=provider + pv_systems_local=pv_system_local_df, pv_systems_db=pv_systems_db_df, provider=provider ) + logger.debug(missing_pv_system) logger.debug(f"There are {len(missing_pv_system)} pv systems to add to the database") @@ -123,7 +126,10 @@ def get_pv_systems( pv_systems = get_all_systems_from_solar_sheffield() else: raise Exception(f"Can not use provider {provider}") - for i, pv_system in enumerate(missing_pv_system): + + logger.debug(missing_pv_system) + for i, pv_system in missing_pv_system.iterrows(): + logger.debug(pv_system) # get metadata if provider == pv_output: metadata = pv_output_data.get_metadata( @@ -145,33 +151,35 @@ def get_pv_systems( else: raise Exception(f"Can not use provider {provider}") - # validate - _ = PVSystem.from_orm(pv_system) + # get the current max ml id, small chance this could lead to a raise condition + max_ml_id = session.query(func.max(SiteSQL.ml_id)).scalar() + if max_ml_id is None: + max_ml_id = 0 + + site = SiteSQL( + client_site_id=pv_system.pv_system_id, + client_site_name=f"{provider}_pv_system.pv_system_name", + latitude=pv_system.latitude, + longitude=pv_system.longitude, + capacity_kw=pv_system.capacity_kw, + ml_id=max_ml_id + 1, + ) # add to database logger.debug(f"Adding pv system {pv_system.pv_system_id} to database") - session.add(pv_system.to_orm()) + session.add(site) # The first time we do this, we might hit a rate limit of 900, # therefore its good to save this on the go session.commit() - pv_systems = get_pv_systems_from_db(provider=provider, session=session) - - yesterday = datetime.now(timezone.utc).date() - timedelta(days=1) - pv_systems = get_latest_pv_yield( - session=session, - append_to_pv_systems=True, - pv_systems=pv_systems, - start_datetime_utc=yesterday, - start_created_utc=yesterday, - ) + pv_systems_sql_db: List[SiteSQL] = get_all_sites(session=session) - return pv_systems + return pv_systems_sql_db def filter_pv_systems_which_have_new_data( - pv_systems: List[PVSystemSQL], datetime_utc: Optional[datetime] = None + session: Session, pv_systems: List[SiteSQL], datetime_utc: Optional[datetime] = None ): """ Filter pv systems which have new data available @@ -199,43 +207,61 @@ def filter_pv_systems_which_have_new_data( if datetime_utc is None: datetime_utc = datetime.utcnow() # add timezone + site_uuids = [pv_system.site_uuid for pv_system in pv_systems] + + # pull the latest data from the database + query = ( + session.query(SiteSQL.site_uuid, GenerationSQL.start_utc) + .distinct( + GenerationSQL.site_uuid, + GenerationSQL.start_utc, + ) + .join(SiteSQL) + .filter( + GenerationSQL.start_utc <= datetime_utc, + GenerationSQL.site_uuid.in_(site_uuids), + ) + .order_by( + GenerationSQL.site_uuid, + GenerationSQL.start_utc, + GenerationSQL.created_utc.desc(), + ) + ) + last_generations = query.all() + last_generations = {row[0]: row[1] for row in last_generations} + keep_pv_systems = [] for i, pv_system in enumerate(pv_systems): logger.debug(f"Looking at {i}th pv system, out of {len(pv_systems)} pv systems") - last_pv_yield = pv_system.last_pv_yield + if pv_system.site_uuid in last_generations: + last_datetime = last_generations[pv_system.site_uuid] + else: + last_datetime = None - if pv_system.status_interval_minutes is None: - # don't know the status interval refresh time, so lets keep it - logger.debug( - f"We dont know the refresh time for pv systems {pv_system.pv_system_id}, " - f"so will be getting data " - ) - keep_pv_systems.append(pv_system) - elif last_pv_yield is None: + if last_datetime is None: # there is no pv yield data for this pv system, so lets keep it logger.debug( - f"There is no pv yield data for pv systems {pv_system.pv_system_id}, " + f"There is no pv yield data for pv systems {pv_system.site_uuid}, " f"so will be getting data " ) keep_pv_systems.append(pv_system) else: - next_datetime_data_available = ( - timedelta(minutes=pv_system.status_interval_minutes) + last_pv_yield.datetime_utc - ) + next_datetime_data_available = timedelta(minutes=5) + last_datetime + logger.debug(next_datetime_data_available) if next_datetime_data_available < datetime_utc: logger.debug( - f"For pv system {pv_system.pv_system_id} as " - f"last pv yield datetime is {last_pv_yield.datetime_utc}," - f"refresh interval is {pv_system.status_interval_minutes}, " - f"so will be getting data" + f"For pv system {pv_system.site_uuid} as " + f"last pv yield datetime is {last_datetime}," + f"refresh interval is 5 minutes, " + f"so will be getting data, {next_datetime_data_available=}" ) keep_pv_systems.append(pv_system) else: logger.debug( - f"Not keeping pv system {pv_system.pv_system_id} as " - f"last pv yield datetime is {last_pv_yield.datetime_utc}," - f"refresh interval is {pv_system.status_interval_minutes}" + f"Not keeping pv system {pv_system.site_uuid} as " + f"last pv yield datetime is {last_datetime}," + f"refresh interval is 5 minutes" ) return keep_pv_systems diff --git a/pvconsumer/save.py b/pvconsumer/save.py index 8a546c1..0ba567d 100644 --- a/pvconsumer/save.py +++ b/pvconsumer/save.py @@ -3,28 +3,17 @@ from typing import List import pandas as pd -from nowcasting_datamodel.models import PVSystem, PVYield + +# from nowcasting_datamodel.models import PVSystem, PVYield from pvsite_datamodel.read.site import get_site_by_client_site_id +from pvsite_datamodel.sqlmodels import SiteSQL from pvsite_datamodel.write.generation import insert_generation_values from sqlalchemy.orm import Session logger = logging.getLogger(__name__) -def save_to_database(session: Session, pv_yields: List[PVYield]): - """ - Save pv data to database - - :param session: database session - :param pv_yields: list of pv data - """ - logger.debug(f"Will be adding {len(pv_yields)} pv yield object to database") - - session.add_all(pv_yields) - session.commit() - - -def save_to_pv_site_database(session: Session, pv_system: PVSystem, pv_yield_df: pd.DataFrame): +def save_to_pv_site_database(session: Session, pv_yield_df: pd.DataFrame): """ Save to pv site database @@ -41,15 +30,17 @@ def save_to_pv_site_database(session: Session, pv_system: PVSystem, pv_yield_df: if len(pv_yield_df) == 0: return - # get site from the pv_system - site = get_site_by_client_site_id( - session=session, - client_name=pv_system.provider, - client_site_id=pv_system.pv_system_id, - ) - # format dataframe - pv_yield_df["site_uuid"] = site.site_uuid + if "instantaneous_power_gen_W" in pv_yield_df.columns: + pv_yield_df["solar_generation_kw"] = pv_yield_df["instantaneous_power_gen_W"] / 1000 + if "datetime_utc" not in pv_yield_df.columns: + pv_yield_df.rename( + columns={ + "datetime": "datetime_utc", + }, + inplace=True, + ) + pv_yield_df["power_kw"] = pv_yield_df["solar_generation_kw"] pv_yield_df["end_utc"] = pv_yield_df["datetime_utc"] # TODO this is hard coded for Sheffield Solar Passiv @@ -58,3 +49,4 @@ def save_to_pv_site_database(session: Session, pv_system: PVSystem, pv_yield_df: # save to database logger.debug(f"Inserting {len(pv_yield_df)} records to pv site database") insert_generation_values(session, pv_yield_df) + session.commit() diff --git a/pvconsumer/solar_sheffield_passiv.py b/pvconsumer/solar_sheffield_passiv.py index 4ef5a5c..e2ef8ba 100644 --- a/pvconsumer/solar_sheffield_passiv.py +++ b/pvconsumer/solar_sheffield_passiv.py @@ -7,9 +7,6 @@ import pandas as pd import requests -from nowcasting_datamodel.models.pv import PVSystem - -from pvconsumer.utils import df_to_list_pv_system logger = logging.getLogger(__name__) @@ -31,7 +28,7 @@ def raw_to_dataframe(response): return pd.DataFrame(data=data, columns=columns) -def get_all_systems_from_solar_sheffield(pv_system_ids: List[int] = None) -> List[PVSystem]: +def get_all_systems_from_solar_sheffield(pv_system_ids: List[int] = None) -> pd.DataFrame: """ Get the pv systesm from solar sheffield @@ -71,9 +68,7 @@ def get_all_systems_from_solar_sheffield(pv_system_ids: List[int] = None) -> Lis data_df = data_df[data_df["pv_system_id"].isin(pv_system_ids)] # reformat - pv_systems = df_to_list_pv_system(data_df) - - return pv_systems + return data_df def get_all_latest_pv_yield_from_solar_sheffield() -> pd.DataFrame: @@ -114,4 +109,7 @@ def get_all_latest_pv_yield_from_solar_sheffield() -> pd.DataFrame: # add timestamp UTC data_df["datetime_utc"] = data_df["datetime_utc"].dt.tz_localize("UTC") + # only take Passiv data + data_df = data_df[data_df["owner_name"] == "Passiv"] + return data_df diff --git a/pvconsumer/utils.py b/pvconsumer/utils.py index 5aeb2d9..8cd0da5 100644 --- a/pvconsumer/utils.py +++ b/pvconsumer/utils.py @@ -1,49 +1,22 @@ -""" Utils functions """ +# """ Utils functions """ import logging from datetime import timezone -from typing import List import pandas as pd -from nowcasting_datamodel.models import PVSystem, PVSystemSQL, PVYield, PVYieldSQL from sqlalchemy.orm import Session -logger = logging.getLogger(__name__) - - -def list_pv_system_to_df(pv_systems: List[PVSystem]) -> pd.DataFrame: - """ - Change list of pv systems to dataframe - - Args: - pv_systems: list of pv systems (pdyantic objects) - - Returns: dataframe with columns the same as the pv systems pydantic object - - """ - return pd.DataFrame([pv_system.dict() for pv_system in pv_systems]) - - -def df_to_list_pv_system(pv_systems_df=pd.DataFrame) -> List[PVSystem]: - """ - Change dataframe to lsit of pv systems - - Args: - pv_systems_df: dataframe with columns the same as the pv systems pydantic object - - Returns: list of pv systems +from pvsite_datamodel.sqlmodels import SiteSQL, GenerationSQL - """ - return [PVSystem(**row) for row in pv_systems_df.to_dict(orient="records")] +# +logger = logging.getLogger(__name__) -def format_pv_data(pv_system: PVSystemSQL, pv_yield_df: pd.DataFrame) -> List[PVYieldSQL]: +def format_pv_data(pv_system: SiteSQL, pv_yield_df: pd.DataFrame, session: Session) -> pd.DataFrame: """ Format the pv data 1. get rid of 0 bug 2. remove data if already in our database - 3. format in to PVYield objects - 4. convert to SQL objects :param pv_system: the pv system this data is about :param pv_yield_df: the pv yield data with columns 'instantaneous_power_gen_W' and 'datetime' @@ -73,14 +46,22 @@ def format_pv_data(pv_system: PVSystemSQL, pv_yield_df: pd.DataFrame) -> List[PV ): logger.debug( f"Dropping last row of pv data for " - f"{pv_system.pv_system_id} " + f"{pv_system.client_site_id} " f"as last row is 0, but the second to last row is not." ) pv_yield_df.drop(pv_yield_df.tail(1).index, inplace=True) # 2. filter by last - if pv_system.last_pv_yield is not None: - last_pv_yield_datetime = pv_system.last_pv_yield.datetime_utc.replace(tzinfo=timezone.utc) + last_pv_generation = ( + session.query(GenerationSQL) + .join(SiteSQL) + .filter(SiteSQL.site_uuid == pv_system.site_uuid) + .order_by(GenerationSQL.created_utc.desc()) + .first() + ) + + if last_pv_generation is not None: + last_pv_yield_datetime = last_pv_generation.start_utc.replace(tzinfo=timezone.utc) pv_yield_df = pv_yield_df[pv_yield_df["datetime_utc"] > last_pv_yield_datetime] @@ -92,22 +73,10 @@ def format_pv_data(pv_system: PVSystemSQL, pv_yield_df: pd.DataFrame) -> List[PV logger.debug(pv_yield_df) else: logger.debug( - f"This is the first lot pv yield data for " f"pv system {(pv_system.pv_system_id)}" + f"This is the first lot pv yield data for pv system {(pv_system.client_site_id)}" ) - # 3. format in to PVYield objects - # need columns datetime_utc, solar_generation_kw - pv_yield_df = pv_yield_df[["solar_generation_kw", "datetime_utc"]] - - # change to list of pydantic objects - pv_yields = [PVYield(**row) for row in pv_yield_df.to_dict(orient="records")] - # 4. change to sqlalamcy objects and add pv systems - pv_yields_sql = [pv_yield.to_orm() for pv_yield in pv_yields] - for pv_yield_sql in pv_yields_sql: - pv_yield_sql.pv_system = pv_system - logger.debug(f"Found {len(pv_yields_sql)} pv yield for pv systems {pv_system.pv_system_id}") - - return pv_yields_sql + return pv_yield_df class FakeDatabaseConnection: @@ -137,3 +106,7 @@ def __exit__(self, type, value, traceback): # noqa def get_session(self) -> Session: """Get sqlalamcy session""" return self.Session() + + +pv_output = "pvoutput.org" +solar_sheffield_passiv = "solar_sheffield_passiv" diff --git a/tests/conftest.py b/tests/conftest.py index 749779c..36662ab 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,9 +3,10 @@ import pandas as pd import pytest -from nowcasting_datamodel.connection import DatabaseConnection -from nowcasting_datamodel.models.base import Base_Forecast, Base_PV + +# from nowcasting_datamodel.connection import DatabaseConnection from pvsite_datamodel.sqlmodels import Base, GenerationSQL, SiteSQL +from pvsite_datamodel.connection import DatabaseConnection from testcontainers.postgres import PostgresContainer import pvconsumer @@ -34,13 +35,7 @@ def postgresql_database(bases: list): @pytest.fixture def db_connection(scope="session"): - with postgresql_database([Base_PV, Base]) as db_conn: - yield db_conn - - -@pytest.fixture(scope="session") -def db_connection_forecast(): - with postgresql_database([Base_Forecast]) as db_conn: + with postgresql_database([Base]) as db_conn: yield db_conn diff --git a/tests/intergration/test_app.py b/tests/intergration/test_app.py index de4df8c..0089b9f 100644 --- a/tests/intergration/test_app.py +++ b/tests/intergration/test_app.py @@ -1,35 +1,25 @@ from click.testing import CliRunner -from nowcasting_datamodel.models.pv import PVSystem, PVSystemSQL, PVYieldSQL -from pvsite_datamodel.sqlmodels import GenerationSQL - from pvconsumer.app import app, pull_data_and_save +from pvsite_datamodel.sqlmodels import GenerationSQL, SiteSQL def test_pull_data(db_session, sites): - pv_systems = [ - PVSystem(pv_system_id=10020, provider="pvoutput.org").to_orm(), - ] - pv_systems[0].last_pv_yield = None - pull_data_and_save(pv_systems=pv_systems, session=db_session, provider="pvoutput.org") + pull_data_and_save(pv_systems=sites, session=db_session, provider="pvoutput.org") - pv_yields = db_session.query(PVYieldSQL).all() + pv_yields = db_session.query(GenerationSQL).all() assert len(pv_yields) > 0 def test_pull_data_solar_sheffield(db_session, sites): - pv_systems = [ - PVSystem(pv_system_id=4383, provider="solar_sheffield_passiv").to_orm(), - ] - pv_systems[0].last_pv_yield = None - pull_data_and_save(pv_systems=pv_systems, session=db_session, provider="solar_sheffield_passiv") + pull_data_and_save(pv_systems=sites, session=db_session, provider="solar_sheffield_passiv") - pv_yields = db_session.query(PVYieldSQL).all() + pv_yields = db_session.query(GenerationSQL).all() assert len(pv_yields) > 0 -def test_app(db_connection, db_connection_forecast, filename, sites): +def test_app(db_connection, filename, sites): runner = CliRunner() response = runner.invoke( app, @@ -38,24 +28,21 @@ def test_app(db_connection, db_connection_forecast, filename, sites): db_connection.url, "--filename", filename, - "--db-url-forecast", - db_connection_forecast.url, ], ) assert response.exit_code == 0, response.exception with db_connection.get_session() as session: - pv_systems = session.query(PVSystemSQL).all() - _ = PVSystem.from_orm(pv_systems[0]) - assert len(pv_systems) == 20 + pv_systems = session.query(SiteSQL).all() + assert len(pv_systems) == 30 - pv_yields = session.query(PVYieldSQL).all() + pv_yields = session.query(GenerationSQL).all() assert len(pv_yields) > 7 # the app gets multiple values for each pv system. # There is a chance this will fail in the early morning when no data is available -def test_app_ss(db_connection, db_connection_forecast, filename_solar_sheffield, sites): +def test_app_ss(db_connection, filename_solar_sheffield, sites): runner = CliRunner() response = runner.invoke( app, @@ -64,10 +51,6 @@ def test_app_ss(db_connection, db_connection_forecast, filename_solar_sheffield, db_connection.url, "--filename", filename_solar_sheffield, - "--db-url-forecast", - db_connection_forecast.url, - "--db-url-pv-site", - db_connection.url, "--provider", "solar_sheffield_passiv", ], @@ -75,12 +58,8 @@ def test_app_ss(db_connection, db_connection_forecast, filename_solar_sheffield, assert response.exit_code == 0, response.exception with db_connection.get_session() as session: - pv_systems = session.query(PVSystemSQL).all() - _ = PVSystem.from_orm(pv_systems[0]) - assert len(pv_systems) == 10 - - pv_yields = session.query(PVYieldSQL).all() - assert len(pv_yields) >= 9 + pv_systems = session.query(SiteSQL).all() + assert len(pv_systems) == 30 # the app gets multiple values for each pv system. # There is a chance this will fail in the early morning when no data is available diff --git a/tests/intergration/test_ss_passiv.py b/tests/intergration/test_ss_passiv.py index c4cb5bf..263871e 100644 --- a/tests/intergration/test_ss_passiv.py +++ b/tests/intergration/test_ss_passiv.py @@ -12,7 +12,7 @@ def test_get_all_systems(): # these numbers seem to change over time assert len(pv_systems) >= 56824 assert len(pv_systems) <= 57200 - assert pv_systems[0].installed_capacity_kw is not None + assert pv_systems.iloc[0].installed_capacity_kw is not None def test_get_all_systems_filter(): diff --git a/tests/unittest/test_pv_systems.py b/tests/unittest/test_pv_systems.py index b6fd999..bc7e6db 100644 --- a/tests/unittest/test_pv_systems.py +++ b/tests/unittest/test_pv_systems.py @@ -1,14 +1,14 @@ -from datetime import datetime, timezone -from typing import List - -from nowcasting_datamodel.models.pv import PVSystem, PVSystemSQL, PVYield, solar_sheffield_passiv -from nowcasting_datamodel.read.read_pv import get_latest_pv_yield +import uuid +from datetime import datetime +import pandas as pd from pvconsumer.pv_systems import ( filter_pv_systems_which_have_new_data, find_missing_pv_systems, load_pv_systems, ) +from pvconsumer.utils import solar_sheffield_passiv +from pvsite_datamodel.sqlmodels import SiteSQL, GenerationSQL def test_load_pv_systems(): @@ -20,15 +20,19 @@ def test_load_pv_systems_passiv(): def test_find_missing_pv_systems(): - pv_systems_local = [ - PVSystem(pv_system_id=1, provider="pvoutput.org"), - PVSystem(pv_system_id=2, provider="pvoutput.org"), - PVSystem(pv_system_id=3, provider="pvoutput.org"), - ] + pv_systems_local = pd.DataFrame( + [ + dict(pv_system_id=1, provider="pvoutput.org"), + dict(pv_system_id=2, provider="pvoutput.org"), + dict(pv_system_id=3, provider="pvoutput.org"), + ] + ) - pv_systems_db = [ - PVSystem(pv_system_id=1, provider="pvoutput.org"), - ] + pv_systems_db = pd.DataFrame( + [ + dict(client_site_id=1, provider="pvoutput.org"), + ] + ) pv_systems_missing = find_missing_pv_systems( pv_systems_local=pv_systems_local, @@ -39,72 +43,49 @@ def test_find_missing_pv_systems(): assert len(pv_systems_missing) == 2 -def test_filter_pv_systems_which_have_new_data_no_refresh_interval(db_session): +def test_filter_pv_systems_which_have_no_datal(db_session): pv_systems = [ - PVSystem(pv_system_id=1, provider="pvoutput.org").to_orm(), - PVSystem(pv_system_id=2, provider="pvoutput.org").to_orm(), - PVSystem(pv_system_id=3, provider="pvoutput.org").to_orm(), + SiteSQL(site_uuid=uuid.uuid4()), + SiteSQL(site_uuid=uuid.uuid4()), + SiteSQL(site_uuid=uuid.uuid4()), ] - pv_systems = get_latest_pv_yield( - session=db_session, pv_systems=pv_systems, append_to_pv_systems=True + pv_systems_keep = filter_pv_systems_which_have_new_data( + pv_systems=pv_systems, session=db_session ) - pv_systems_keep = filter_pv_systems_which_have_new_data(pv_systems=pv_systems) - assert len(pv_systems_keep) == 3 -def test_filter_pv_systems_which_have_new_data_no_data(db_session): - pv_systems = [ - PVSystem(pv_system_id=1, provider="pvoutput.org", status_interval_minutes=5).to_orm(), - PVSystem(pv_system_id=2, provider="pvoutput.org", status_interval_minutes=5).to_orm(), - PVSystem(pv_system_id=3, provider="pvoutput.org", status_interval_minutes=5).to_orm(), - ] - - pv_systems = get_latest_pv_yield( - session=db_session, pv_systems=pv_systems, append_to_pv_systems=True +def test_filter_pv_systems_which_have_new_data(db_session, sites): + pv_yield_0 = GenerationSQL( + start_utc=datetime(2022, 1, 1), end_utc=datetime(2022, 1, 1, 0, 5), generation_power_kw=1 + ) + pv_yield_1 = GenerationSQL( + start_utc=datetime(2022, 1, 1, 0, 4), + end_utc=datetime(2022, 1, 1, 0, 5), + generation_power_kw=3, ) - pv_systems_keep = filter_pv_systems_which_have_new_data(pv_systems=pv_systems) - - assert len(pv_systems_keep) == 3 - - -def test_filter_pv_systems_which_have_new_data(db_session): - pv_yield_0 = PVYield(datetime_utc=datetime(2022, 1, 1), solar_generation_kw=1).to_orm() - pv_yield_1 = PVYield(datetime_utc=datetime(2022, 1, 1), solar_generation_kw=2).to_orm() - pv_yield_2 = PVYield(datetime_utc=datetime(2022, 1, 1, 0, 4), solar_generation_kw=3).to_orm() - - pv_systems = [ - PVSystem(pv_system_id=1, provider="pvoutput.org", status_interval_minutes=4).to_orm(), - PVSystem(pv_system_id=2, provider="pvoutput.org", status_interval_minutes=1).to_orm(), - PVSystem(pv_system_id=3, provider="pvoutput.org", status_interval_minutes=5).to_orm(), - ] - - pv_yield_0.pv_system = pv_systems[0] - pv_yield_1.pv_system = pv_systems[1] - pv_yield_2.pv_system = pv_systems[2] + pv_yield_0.site = sites[0] + pv_yield_1.site = sites[1] - db_session.add_all([pv_yield_0, pv_yield_1, pv_yield_2]) - db_session.add_all(pv_systems) + db_session.add_all([pv_yield_0, pv_yield_1]) + db_session.commit() - pv_systems: List[PVSystemSQL] = db_session.query(PVSystemSQL).all() - pv_systems = get_latest_pv_yield( - session=db_session, pv_systems=pv_systems, append_to_pv_systems=True - ) + assert len(sites) == 30 # - # | last data | refresh | keep? - # 1 | 5 mins | 4 mins | True - # 2 | 5 mins | 1 mins | True - # 3 | 1 mins | 5 mins | False + # | last data | keep? + # 1 | 6 mins | True + # 2 | 2 mins | False pv_systems_keep = filter_pv_systems_which_have_new_data( - pv_systems=pv_systems, - datetime_utc=datetime(2022, 1, 1, 0, 5, tzinfo=timezone.utc), + pv_systems=sites, + session=db_session, + datetime_utc=datetime(2022, 1, 1, 0, 6), ) - assert len(pv_systems_keep) == 2 - assert pv_systems_keep[0].id == 1 - assert pv_systems_keep[1].id == 2 + assert len(pv_systems_keep) == 29 + assert pv_systems_keep[0].site_uuid == sites[0].site_uuid + assert pv_systems_keep[1].site_uuid == sites[2].site_uuid diff --git a/tests/unittest/test_utils.py b/tests/unittest/test_utils.py index a8cef87..870d5a3 100644 --- a/tests/unittest/test_utils.py +++ b/tests/unittest/test_utils.py @@ -1,57 +1,35 @@ from datetime import datetime, timezone import pandas as pd -from nowcasting_datamodel.fake import make_fake_pv_system -from nowcasting_datamodel.models.pv import PVSystem, PVYield +from pvconsumer.utils import format_pv_data +from pvsite_datamodel.sqlmodels import GenerationSQL -from pvconsumer.utils import df_to_list_pv_system, format_pv_data, list_pv_system_to_df - -def test_list_pv_system_to_df(): - pv_systems_1 = PVSystem.from_orm(make_fake_pv_system()) - pv_systems_2 = PVSystem.from_orm(make_fake_pv_system()) - - _ = list_pv_system_to_df([pv_systems_1, pv_systems_2]) - - -def test_df_to_list_pv_system(): - pv_systems_1 = PVSystem.from_orm(make_fake_pv_system()) - pv_systems_2 = PVSystem.from_orm(make_fake_pv_system()) - - df = list_pv_system_to_df([pv_systems_1, pv_systems_2]) - _ = df_to_list_pv_system(df) - - -def test_pv_yield_df_no_data(): - pv_systems = [ - PVSystem(pv_system_id=10020, provider="pvoutput.org").to_orm(), - ] - pv_systems[0].last_pv_yield = None +def test_pv_yield_df_no_data(db_session, sites): pv_yield_df = pd.DataFrame(columns=["instantaneous_power_gen_W", "datetime"]) - pv_yields = format_pv_data(pv_system=pv_systems[0], pv_yield_df=pv_yield_df) + pv_yields = format_pv_data(pv_system=sites[0], pv_yield_df=pv_yield_df, session=db_session) assert len(pv_yields) == 0 -def test_pv_yield_df(): - pv_system = PVSystem(pv_system_id=10020, provider="pvoutput.org").to_orm() - pv_system.last_pv_yield = None +def test_pv_yield_df(sites, db_session): pv_yield_df = pd.DataFrame( columns=["instantaneous_power_gen_W", "datetime"], data=[[1, datetime(2022, 1, 1)]] ) - pv_yields = format_pv_data(pv_system=pv_system, pv_yield_df=pv_yield_df) + pv_yields = format_pv_data(pv_system=sites[0], pv_yield_df=pv_yield_df, session=db_session) assert len(pv_yields) == 1 - assert pv_yields[0].solar_generation_kw == 1 / 1000 - + assert pv_yields.iloc[0].solar_generation_kw == 1 / 1000 -def test_pv_yield_df_last_pv_yield(): - pv_system = PVSystem(pv_system_id=10020, provider="pvoutput.org").to_orm() - last_pv_yield = PVYield(datetime_utc=datetime(2022, 1, 1), solar_generation_kw=10) - pv_system.last_pv_yield = last_pv_yield +def test_pv_yield_df_last_pv_yield(sites, db_session): + last_pv_yield = GenerationSQL( + start_utc=datetime(2022, 1, 1), end_utc=datetime(2022, 1, 1), generation_power_kw=10 + ) + last_pv_yield.site = sites[0] + db_session.add(last_pv_yield) pv_yield_df = pd.DataFrame( columns=["instantaneous_power_gen_W", "datetime"], @@ -61,14 +39,18 @@ def test_pv_yield_df_last_pv_yield(): ], ) - pv_yields = format_pv_data(pv_system=pv_system, pv_yield_df=pv_yield_df) + pv_yields = format_pv_data(pv_system=sites[0], pv_yield_df=pv_yield_df, session=db_session) assert len(pv_yields) == 1 - assert pv_yields[0].solar_generation_kw == 2 / 1000 + assert pv_yields.iloc[0].solar_generation_kw == 2 / 1000 -def test_pv_yield_df_0_bug(): - pv_system = PVSystem(pv_system_id=10020, provider="pvoutput.org").to_orm() - pv_system.last_pv_yield = None +# +def test_pv_yield_df_0_bug(sites, db_session): + last_pv_yield = GenerationSQL( + start_utc=datetime(2021, 1, 1), end_utc=datetime(2021, 1, 1), generation_power_kw=10 + ) + last_pv_yield.site = sites[0] + db_session.add(last_pv_yield) pv_yield_df = pd.DataFrame( columns=["instantaneous_power_gen_W", "datetime"], @@ -78,14 +60,12 @@ def test_pv_yield_df_0_bug(): ], ) - pv_yields = format_pv_data(pv_system=pv_system, pv_yield_df=pv_yield_df) + pv_yields = format_pv_data(pv_system=sites[0], pv_yield_df=pv_yield_df, session=db_session) assert len(pv_yields) == 1 - assert pv_yields[0].solar_generation_kw == 1 / 1000 + assert pv_yields.iloc[0].solar_generation_kw == 1 / 1000 -def test_pv_yield_df_zeros(): - pv_system = PVSystem(pv_system_id=10020, provider="pvoutput.org").to_orm() - pv_system.last_pv_yield = None +def test_pv_yield_df_zeros(sites, db_session): pv_yield_df = pd.DataFrame( columns=["instantaneous_power_gen_W", "datetime"], @@ -95,6 +75,6 @@ def test_pv_yield_df_zeros(): ], ) - pv_yields = format_pv_data(pv_system=pv_system, pv_yield_df=pv_yield_df) + pv_yields = format_pv_data(pv_system=sites[0], pv_yield_df=pv_yield_df, session=db_session) assert len(pv_yields) == 2 - assert pv_yields[0].solar_generation_kw == 0 + assert pv_yields.iloc[0].solar_generation_kw == 0