From 6d7c7cd89f04aed7e6ecd1be29278361ed03408e Mon Sep 17 00:00:00 2001 From: lujzi05 <36622811+lujzi05@users.noreply.github.com> Date: Sun, 23 Jun 2024 13:00:16 +0200 Subject: [PATCH] the zero baseline model --- models/hazel_rabbit/README.md | 1 + .../hazel_rabbit/configs/config_deployment.py | 16 ++++ .../configs/config_hyperparameters.py | 18 +++++ .../hazel_rabbit/configs/config_input_data.py | 25 ++++++ models/hazel_rabbit/configs/config_meta.py | 17 ++++ models/hazel_rabbit/configs/config_sweep.py | 27 +++++++ models/hazel_rabbit/main.py | 60 ++++++++++++++ models/hazel_rabbit/requirements.txt | 1 + .../src/forecasting/generate_forecast.py | 63 +++++++++++++++ .../src/management/execute_model_runs.py | 44 ++++++++++ .../src/management/execute_model_tasks.py | 80 +++++++++++++++++++ .../src/offline_evaluation/evaluate_model.py | 60 ++++++++++++++ models/hazel_rabbit/src/utils/utils.py | 74 +++++++++++++++++ models/hazel_rabbit/src/utils/utils_wandb.py | 34 ++++++++ 14 files changed, 520 insertions(+) create mode 100644 models/hazel_rabbit/README.md create mode 100644 models/hazel_rabbit/configs/config_deployment.py create mode 100644 models/hazel_rabbit/configs/config_hyperparameters.py create mode 100644 models/hazel_rabbit/configs/config_input_data.py create mode 100644 models/hazel_rabbit/configs/config_meta.py create mode 100644 models/hazel_rabbit/configs/config_sweep.py create mode 100644 models/hazel_rabbit/main.py create mode 100644 models/hazel_rabbit/requirements.txt create mode 100644 models/hazel_rabbit/src/forecasting/generate_forecast.py create mode 100644 models/hazel_rabbit/src/management/execute_model_runs.py create mode 100644 models/hazel_rabbit/src/management/execute_model_tasks.py create mode 100644 models/hazel_rabbit/src/offline_evaluation/evaluate_model.py create mode 100644 models/hazel_rabbit/src/utils/utils.py create mode 100644 models/hazel_rabbit/src/utils/utils_wandb.py diff --git a/models/hazel_rabbit/README.md 
# --- models/hazel_rabbit/README.md ---
# Model README


# --- models/hazel_rabbit/configs/config_deployment.py ---

def get_deployment_config():
    """
    Return the deployment configuration of the model.

    This configuration is "behavioral": modifying it affects the model's
    runtime behavior and its integration into the deployment system.

    Returns:
    - deployment_config (dict): Deployment settings. Currently only
      "deployment_status" is defined.
    """

    # More deployment settings can/will be added here
    deployment_config = {
        "deployment_status": "baseline",  # shadow, deployed, baseline, or deprecated
    }

    return deployment_config


# --- models/hazel_rabbit/configs/config_hyperparameters.py ---

def get_hp_config():
    """
    Return the hyperparameter configuration of the model.

    This configuration is "operational": the values are read by the
    evaluation and forecasting code at run time.

    Returns:
    - hyperparameters (dict): Settings with the keys 'sweep', 'partitioner',
      'save_generated' and 'time_steps'.
    """

    hyperparameters = {
        'sweep': False,  # no sweep for the zero baseline model
        'partitioner': False,  # True: use the hardcoded months from set_partition.py, False: max month - time_steps
        'save_generated': True,  # save evaluation results in the generated folder
        'time_steps': 36,  # number of monthly steps per grid cell -- TODO confirm that 36 is the intended value
    }
    return hyperparameters


# --- models/hazel_rabbit/configs/config_input_data.py ---
# `Queryset` and `Column` come from that file's `from viewser import Queryset, Column`.

def get_input_data_config():
    """
    Return the viewser queryset that describes the model's input data.

    This configuration is "behavioral": there is no guarantee that the model
    works if the queryset is changed without changing the model settings
    accordingly.

    Returns:
        queryset_base (Queryset): Queryset named "hazel_rabbit" at the
        "priogrid_month" level of analysis with three log-transformed
        count columns plus month, year, country, column and row identifiers.
    """

    # VIEWSER 6
    queryset_base = (Queryset("hazel_rabbit", "priogrid_month")
        # ln-transformed counts with missing values replaced
        .with_column(Column("ln_sb_best", from_loa="priogrid_month", from_column="ged_sb_best_count_nokgi").transform.ops.ln().transform.missing.replace_na())
        .with_column(Column("ln_ns_best", from_loa="priogrid_month", from_column="ged_ns_best_count_nokgi").transform.ops.ln().transform.missing.replace_na())
        .with_column(Column("ln_os_best", from_loa="priogrid_month", from_column="ged_os_best_count_nokgi").transform.ops.ln().transform.missing.replace_na())
        # identifiers
        .with_column(Column("month", from_loa="month", from_column="month"))
        .with_column(Column("year_id", from_loa="country_year", from_column="year_id"))
        .with_column(Column("c_id", from_loa="country_year", from_column="country_id"))
        .with_column(Column("col", from_loa="priogrid", from_column="col"))
        .with_column(Column("row", from_loa="priogrid", from_column="row")))

    return queryset_base


# --- models/hazel_rabbit/configs/config_meta.py ---

def get_meta_config():
    """
    Return the meta data of the model (name, algorithm, targets, queryset,
    level of analysis and creator).

    This config is for documentation purposes only.

    Returns:
    - meta_config (dict): A dictionary containing the model meta configuration.
    """
    meta_config = {
        "name": "hazel_rabbit",
        "algorithm": "zero baseline",
        "target(S)": ["ln_sb_best", "ln_ns_best", "ln_os_best", "ln_sb_best_binarized", "ln_ns_best_binarized", "ln_os_best_binarized"],
        "queryset": "hazel_rabbit",
        "level": "pgm",
        "creator": "Borbala"
    }
    return meta_config


# --- models/hazel_rabbit/configs/config_sweep.py ---

def get_sweep_config():
    """
    Return the configuration for hyperparameter sweeps using WandB.

    The zero baseline model has nothing to tune, so the metric and the
    parameter dictionaries are intentionally empty.

    Returns:
    - sweep_config (dict): Dictionary with the keys 'method', 'metric' and
      'parameters'.
    """

    sweep_config = {
        'method': 'grid',
        'metric': {},
        'parameters': {},
    }

    return sweep_config


# Backward-compatible alias: the function was originally published under this
# misspelled name, so existing imports keep working.
get_swep_config = get_sweep_config


# --- models/hazel_rabbit/main.py ---
# Relies on that file's imports (time, wandb, parse_args, validate_arguments,
# execute_sweep_run, execute_single_run) and its sys.path bootstrap.

if __name__ == "__main__":

    # Parse the command line and validate that the arguments conform to the
    # required logic and combinations.
    args = parse_args()
    validate_arguments(args)

    # wandb login
    wandb.login()

    start_t = time.time()

    # First check whether a sweep is requested, because the sweep overrides
    # the train and evaluate flags. Truthiness is used instead of
    # `== True` / `== False` so that exactly one of the two branches always runs.
    if args.sweep:
        execute_sweep_run(args)
    else:
        execute_single_run(args)

    end_t = time.time()
    minutes = (end_t - start_t) / 60
    print(f'Done. Runtime: {minutes:.3f} minutes')

    # notes on stepshifted models:
    # There will be some thinking here in regards to how we store, denote (naming convention), and retrieve the model artifacts from stepshifted models.
# (main.py, notes on stepshifted models, continued)
# It is not a big issue, but it is something to consider so we don't do something headless.
# A possible format could be: <run_type>_model_s<step>_<timestamp>.pt, for example
# calibration_model_s00_20210831_123456.pt, calibration_model_s01_20210831_123456.pt, etc.
# The rest of the code would then be made to handle this naming convention. Could be a simple fix.
# Alternatively, we could store the model artifacts in a subfolder for each stepshifted model. This would make it
# easier to handle the artifacts, but harder to retrieve the latest artifact for a given run type.
# Lastly, the solution Xiaolong is working on might allow us to store multiple models (steps) in one artifact,
# which would make this whole discussion obsolete and be the best solution.

# --- models/hazel_rabbit/requirements.txt ---
# Requirements

# --- models/hazel_rabbit/src/forecasting/generate_forecast.py ---
# Relies on that file's imports: pandas as pd, get_partitioner_dict,
# create_model_time_stamp and save_generated_pred.

def forecast_with_model_artifact(config, views_raw):
    """
    Create forecasts using the zero baseline model.

    Args:
        config : Configuration object; `run_type` and `save_generated` are
            read here, and a time stamp is added by `create_model_time_stamp`.
        views_raw : DataFrame containing the raw data. It must provide the
            columns 'month_id', 'pg_id', 'month', 'year_id' and 'c_id'.

    Returns:
        DataFrame with the zero predictions built by `generate_forecast`.
    """

    partitioner_dict = get_partitioner_dict(config.run_type)

    # the forecast window is taken as-is from the partitioner's 'predict' entry
    first_month, last_month = partitioner_dict['predict'][0], partitioner_dict['predict'][1]

    # keep only the identifier columns
    views_raw = views_raw[['month_id', 'pg_id', 'month', 'year_id', 'c_id']]

    views_res = generate_forecast(config, views_raw, first_month, last_month)

    # add timestamp
    config = create_model_time_stamp(config)

    # save the DataFrame of model outputs
    if config.save_generated:
        save_generated_pred(config, views_res)

    return views_res


def generate_forecast(config, views_raw, first_month, last_month):
    """
    Build an all-zero prediction for every grid cell and forecast month.

    The forecast months are `range(first_month, last_month)`, so `last_month`
    itself is excluded, exactly as before. The number of rows per grid cell is
    taken from that range rather than from `config.time_steps`: the two
    columns previously had different lengths whenever the window was not
    exactly `config.time_steps` months long, which made the DataFrame
    construction fail.

    Args:
        config : Configuration object. Kept for interface compatibility; it
            is not read here.
        views_raw : DataFrame with a 'pg_id' column.
        first_month : First month id of the forecast window (inclusive).
        last_month : End of the forecast window (exclusive).

    Returns:
        DataFrame with the columns 'pg_id', 'month_id', 'out_sample_months'
        (1, 2, ... within each grid cell) and 'y_pred' (always 0).
    """
    # unique grid ids, in order of first appearance
    unique_grids = views_raw['pg_id'].unique()

    months = list(range(first_month, last_month))
    n_months = len(months)
    n_grids = len(unique_grids)

    # one row per (grid, month): every grid id repeated over the whole window
    next_months = pd.DataFrame({
        'pg_id': unique_grids.repeat(n_months),
        'month_id': months * n_grids,
    })

    # running number of the forecast month within each grid cell, starting at 1
    next_months['out_sample_months'] = list(range(1, n_months + 1)) * n_grids
    next_months['y_pred'] = 0

    return next_months
# --- models/hazel_rabbit/src/management/execute_model_runs.py ---
# Relies on that file's imports: get_hp_config and execute_model_tasks.

def execute_sweep_run(args):
    """
    Placeholder for sweep runs: sweeps are not implemented for this model.

    Args:
        args : Parsed command line arguments (not used).
    """
    print('Running sweep...')
    print('Sweep run is not implemented. Exiting...')


def execute_single_run(args):
    """
    Run a single (non-sweep) model run for the requested run type.

    Args:
        args : Parsed command line arguments; `run_type`, `train` and
            `evaluate` are read.

    Raises:
        ValueError: If `args.run_type` is not 'calibration', 'testing' or
            'forecasting'.
    """
    # get config
    config = get_hp_config()
    config['run_type'] = args.run_type

    # get run type and denoting project name - check convention!
    project = f"hazel_rabbit_{args.run_type}"

    if args.run_type in ('calibration', 'testing'):
        execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, forecast=False)
    elif args.run_type == 'forecasting':
        execute_model_tasks(config=config, project=project, train=False, eval=False, forecast=True)
    else:
        raise ValueError(f"Invalid run type: {args.run_type}")


# --- models/hazel_rabbit/src/management/execute_model_tasks.py ---
# Relies on that file's imports: wandb, PATH, setup_artifacts_paths,
# get_raw_data, add_wandb_monthly_metrics, evaluate_model_artifact and
# forecast_with_model_artifact.

def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None):
    """
    Execute the requested model tasks inside a WandB run.

    Args:
        config: Configuration passed to `wandb.init`; inside the run it is
            replaced by `wandb.config`.
        project: The WandB project name.
        train: Flag to indicate if the model should be trained (a no-op for
            the zero baseline model).
        eval: Flag to indicate if the model should be evaluated.
        forecast: Flag to indicate if forecasting should be performed.
    """

    # The returned artifacts path is not needed by the zero baseline model.
    # NOTE(review): the call is kept in case it has side effects -- confirm.
    setup_artifacts_paths(PATH)

    # Initialize WandB
    with wandb.init(project=project, entity="views_pipeline", config=config):  # project and config ignored when running a sweep

        # add the monthly metrics to WandB
        add_wandb_monthly_metrics()

        # Update config from WandB initialization above
        config = wandb.config

        # Retrieve raw data (partition) based on the configuration
        views_raw = get_raw_data(config)

        # Sweep runs: nothing to do for this model
        if config.sweep:
            pass

        # Training: only a message is printed, execution continues below
        if train:
            print('No need to train the zero baseline model. Exiting...')

        # Evaluation of the zero baseline
        if eval:
            evaluate_model_artifact(config, views_raw)

        # Forecasting with the zero baseline
        if forecast:
            forecast_with_model_artifact(config, views_raw)


# --- models/hazel_rabbit/src/offline_evaluation/evaluate_model.py ---
# Relies on that file's imports: get_partitioner_dict,
# create_model_time_stamp and save_generated_pred.

def evaluate_model_artifact(config, views_raw):
    """
    Create predictions using the zero baseline model.

    Args:
        config : Configuration object; `run_type`, `partitioner`,
            `time_steps` and `save_generated` are read.
        views_raw : DataFrame containing the raw data with the columns
            'pg_id' and 'month_id'.

    Returns:
        DataFrame holding, per grid cell, the rows selected by
        `create_months_index` plus a 'y_pred' column that is always 0.
    """

    partitioner_dict = get_partitioner_dict(config.run_type)

    # get the months for the predictions
    # NOTE(review): first_month / last_month are computed but not used below;
    # the rows are selected per grid by create_months_index instead.
    last_month = partitioner_dict['predict'][1]
    first_month = partitioner_dict['predict'][0] if config.partitioner else last_month - config.time_steps

    # apply the function to each grid group
    views_res = views_raw.groupby('pg_id').apply(create_months_index, config=config).reset_index(drop=True)

    # add 0 prediction
    views_res['y_pred'] = 0

    # add timestamp
    config = create_model_time_stamp(config)

    # save the DataFrame of model outputs
    if config.save_generated:
        save_generated_pred(config, views_res)

    return views_res


def create_months_index(group_df, config):
    """
    Keep the last `config.time_steps` months of one grid cell and number them.

    The rows are sorted by 'month_id' and the numbering in the new column
    'out_sample_months' starts at 1. If the group holds fewer than
    `config.time_steps` rows, all of them are kept and numbered
    1..len(group); previously this case raised a length-mismatch error.

    Args:
        group_df : DataFrame of a single grid id ('pg_id').
        config : Configuration object; `time_steps` is read.

    Returns:
        A new DataFrame with the selected rows and the added column.
    """

    # copy so that the column assignment never touches the caller's frame
    selected = group_df.sort_values(by='month_id').tail(config.time_steps).copy()
    selected['out_sample_months'] = range(1, len(selected) + 1)

    return selected


# --- models/hazel_rabbit/src/utils/utils.py ---
# Relies on that file's imports: numpy as np, datetime, pickle, PATH and
# setup_data_paths.

def get_raw_data(config):
    """
    Load and return the raw data for the configured run type.

    Args:
        config : Configuration object; `run_type` is read.

    Returns:
        The object stored in '<run_type>_viewser_df.pkl' in the raw data folder.
    """

    PATH_RAW, _, _ = setup_data_paths(PATH)
    run_type = config.run_type
    file_name = f'/{run_type}_viewser_df.pkl'
    print(f'Loading {run_type} data from {file_name}...')
    # NOTE(security): allow_pickle=True unpickles the file content, so only
    # trusted, locally generated files must be loaded here.
    views_raw = np.load(str(PATH_RAW) + file_name, allow_pickle=True)

    return views_raw


def create_model_time_stamp(config):
    """
    Store the current local time on the config as `model_time_stamp`.

    Args:
        config : Configuration object; it is modified in place.

    Returns:
        The same config object, now carrying `model_time_stamp` in the
        format YYYYMMDD_HHMMSS.
    """

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    print(f"timestamp: {timestamp}")
    # add to config for logging and conciseness
    config.model_time_stamp = timestamp

    return config


def save_generated_pred(config, views_res):
    """
    Save the predictions as a pickle file in the generated data folder.

    Args:
        config : Configuration object; `time_steps`, `run_type` and
            `model_time_stamp` are used in the file name.
        views_res : DataFrame containing the predictions.
    """

    _, _, PATH_GENERATED = setup_data_paths(PATH)
    outputs_path = f'{PATH_GENERATED}/df_sb_os_ns_output_{config.time_steps}_{config.run_type}_{config.model_time_stamp}.pkl'
    with open(outputs_path, 'wb') as file:
        pickle.dump(views_res, file)


# --- models/hazel_rabbit/src/utils/utils_wandb.py ---
# Relies on that file's imports: numpy as np and wandb.
# there are things in other utils that should be here...

def add_wandb_monthly_metrics():
    """
    Define the monthly metrics for WandB logging.

    All metrics matching "monthly/*" use "monthly/out_sample_month" as their
    step metric.
    """
    wandb.define_metric("monthly/out_sample_month")
    wandb.define_metric("monthly/*", step_metric="monthly/out_sample_month")


def log_wandb_monthly_metrics(config, mse_list, ap_list, auc_list, brier_list):
    """
    Log the mean of each monthly evaluation metric to WandB.

    Args:
        config : Configuration object; `time_steps` prefixes the metric names.
        mse_list : List of monthly mean squared errors.
        ap_list : List of monthly average precision scores.
        auc_list : List of monthly ROC AUC scores.
        brier_list : List of monthly Brier scores.
    """

    monthly_metrics = (
        ("mean_squared_error", mse_list),
        ("average_precision_score", ap_list),
        ("roc_auc_score", auc_list),
        ("brier_score_loss", brier_list),
    )

    # one wandb.log call per metric, in the same order as before
    for metric_name, monthly_values in monthly_metrics:
        wandb.log({f"{config.time_steps}month_{metric_name}": np.mean(monthly_values)})