From 6d7c7cd89f04aed7e6ecd1be29278361ed03408e Mon Sep 17 00:00:00 2001
From: lujzi05 <36622811+lujzi05@users.noreply.github.com>
Date: Sun, 23 Jun 2024 13:00:16 +0200
Subject: [PATCH] the zero baseline model

---
 models/hazel_rabbit/README.md                 |  1 +
 .../hazel_rabbit/configs/config_deployment.py | 16 ++++
 .../configs/config_hyperparameters.py         | 18 +++++
 .../hazel_rabbit/configs/config_input_data.py | 25 ++++++
 models/hazel_rabbit/configs/config_meta.py    | 17 ++++
 models/hazel_rabbit/configs/config_sweep.py   | 27 +++++++
 models/hazel_rabbit/main.py                   | 60 ++++++++++++++
 models/hazel_rabbit/requirements.txt          |  1 +
 .../src/forecasting/generate_forecast.py      | 63 +++++++++++++++
 .../src/management/execute_model_runs.py      | 44 ++++++++++
 .../src/management/execute_model_tasks.py     | 80 +++++++++++++++++++
 .../src/offline_evaluation/evaluate_model.py  | 60 ++++++++++++++
 models/hazel_rabbit/src/utils/utils.py        | 74 +++++++++++++++++
 models/hazel_rabbit/src/utils/utils_wandb.py  | 34 ++++++++
 14 files changed, 520 insertions(+)
 create mode 100644 models/hazel_rabbit/README.md
 create mode 100644 models/hazel_rabbit/configs/config_deployment.py
 create mode 100644 models/hazel_rabbit/configs/config_hyperparameters.py
 create mode 100644 models/hazel_rabbit/configs/config_input_data.py
 create mode 100644 models/hazel_rabbit/configs/config_meta.py
 create mode 100644 models/hazel_rabbit/configs/config_sweep.py
 create mode 100644 models/hazel_rabbit/main.py
 create mode 100644 models/hazel_rabbit/requirements.txt
 create mode 100644 models/hazel_rabbit/src/forecasting/generate_forecast.py
 create mode 100644 models/hazel_rabbit/src/management/execute_model_runs.py
 create mode 100644 models/hazel_rabbit/src/management/execute_model_tasks.py
 create mode 100644 models/hazel_rabbit/src/offline_evaluation/evaluate_model.py
 create mode 100644 models/hazel_rabbit/src/utils/utils.py
 create mode 100644 models/hazel_rabbit/src/utils/utils_wandb.py

diff --git a/models/hazel_rabbit/README.md b/models/hazel_rabbit/README.md
new file mode 100644
index 00000000..72869b65
--- /dev/null
+++ b/models/hazel_rabbit/README.md
@@ -0,0 +1 @@
+# Model README
diff --git a/models/hazel_rabbit/configs/config_deployment.py b/models/hazel_rabbit/configs/config_deployment.py
new file mode 100644
index 00000000..65b15f3e
--- /dev/null
+++ b/models/hazel_rabbit/configs/config_deployment.py
@@ -0,0 +1,16 @@
+def get_deployment_config():
+    """
+    Return the deployment settings of the model.
+
+    The configuration is "behavioral": changing it alters the model's runtime
+    behavior and how it is integrated into the deployment system.
+
+    Returns:
+    - deployment_config (dict): Deployment settings; currently only the
+      deployment status of the model.
+    """
+    # Valid statuses: shadow, deployed, baseline, or deprecated.
+    # More deployment settings can/will be added here.
+    status = "baseline"
+
+    return {"deployment_status": status}
\ No newline at end of file
diff --git a/models/hazel_rabbit/configs/config_hyperparameters.py b/models/hazel_rabbit/configs/config_hyperparameters.py
new file mode 100644
index 00000000..a0c303ff
--- /dev/null
+++ b/models/hazel_rabbit/configs/config_hyperparameters.py
@@ -0,0 +1,18 @@
+    
+def get_hp_config():
+    """
+    Return the hyperparameters of the zero baseline model.
+
+    This configuration is "operational": the settings below steer how the
+    model is run and which months are predicted.
+
+    Returns:
+    - hyperparameters (dict): Hyperparameter names mapped to their values.
+    """
+    return dict(
+        sweep=False,  # no sweep for the zero baseline model
+        # True: if hardcoded months from set_partitioner.py are used, False: max months - time_steps
+        partitioner=False,
+        save_generated=True,  # save evaluation results in the generated folder
+        time_steps=36,  # number of out-of-sample months per grid cell
+    )
diff --git a/models/hazel_rabbit/configs/config_input_data.py b/models/hazel_rabbit/configs/config_input_data.py
new file mode 100644
index 00000000..d4105438
--- /dev/null
+++ b/models/hazel_rabbit/configs/config_input_data.py
@@ -0,0 +1,25 @@
+from viewser import Queryset, Column
+
+def get_input_data_config():
+
+    """
+    Build the viewser queryset that defines the input data of the model.
+    This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system.
+    There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and architecture accordingly.
+
+    Returns:
+    queryset_base (Queryset): A queryset containing the base data for the model.
+    """
+    # Three GED count columns (ln transform, then replace_na) plus month, year, country, and grid col/row identifiers.
+    # VIEWSER 6
+    queryset_base = (Queryset("hazel_rabbit", "priogrid_month")
+        .with_column(Column("ln_sb_best", from_loa = "priogrid_month", from_column = "ged_sb_best_count_nokgi").transform.ops.ln().transform.missing.replace_na())
+        .with_column(Column("ln_ns_best", from_loa = "priogrid_month", from_column = "ged_ns_best_count_nokgi").transform.ops.ln().transform.missing.replace_na())
+        .with_column(Column("ln_os_best", from_loa = "priogrid_month", from_column = "ged_os_best_count_nokgi").transform.ops.ln().transform.missing.replace_na())
+        .with_column(Column("month", from_loa = "month", from_column = "month"))
+        .with_column(Column("year_id", from_loa = "country_year", from_column = "year_id"))
+        .with_column(Column("c_id", from_loa = "country_year", from_column = "country_id"))
+        .with_column(Column("col", from_loa = "priogrid", from_column = "col"))
+        .with_column(Column("row", from_loa = "priogrid", from_column = "row")))
+
+    return queryset_base
\ No newline at end of file
diff --git a/models/hazel_rabbit/configs/config_meta.py b/models/hazel_rabbit/configs/config_meta.py
new file mode 100644
index 00000000..7d7eb6aa
--- /dev/null
+++ b/models/hazel_rabbit/configs/config_meta.py
@@ -0,0 +1,17 @@
+def get_meta_config():
+    """
+    Return the meta data of the model: name, algorithm, targets, queryset, level of analysis, and creator.
+    This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation.
+
+    Returns:
+    - meta_config (dict): A dictionary containing model meta configuration.
+    """
+    outcomes = ["ln_sb_best", "ln_ns_best", "ln_os_best"]
+    return {
+        "name": "hazel_rabbit",
+        "algorithm": "zero baseline",
+        "target(S)": outcomes + [f"{outcome}_binarized" for outcome in outcomes],
+        "queryset": "hazel_rabbit",
+        "level": "pgm",
+        "creator": "Borbala",
+    }
\ No newline at end of file
diff --git a/models/hazel_rabbit/configs/config_sweep.py b/models/hazel_rabbit/configs/config_sweep.py
new file mode 100644
index 00000000..9c71d6d5
--- /dev/null
+++ b/models/hazel_rabbit/configs/config_sweep.py
@@ -0,0 +1,27 @@
+def get_swep_config():
+    """
+    Return the configuration for hyperparameter sweeps using WandB.
+
+    This configuration is "operational": it defines the search strategy, the
+    metric, and the parameter ranges of a sweep. For this model the metric
+    and the parameter ranges are empty.
+
+    Returns:
+    - sweep_config (dict): A dictionary with the keys 'method', 'metric',
+      and 'parameters', in that order.
+    """
+
+    # search strategy of the sweep
+    search_method = 'grid'
+
+    # placeholders: no metric and no parameter ranges are defined
+    metric = {}
+    parameters_dict = {}
+
+    sweep_config = dict(
+        method=search_method,
+        metric=metric,
+        parameters=parameters_dict,
+    )
+
+    return sweep_config
diff --git a/models/hazel_rabbit/main.py b/models/hazel_rabbit/main.py
new file mode 100644
index 00000000..e025dd21
--- /dev/null
+++ b/models/hazel_rabbit/main.py
@@ -0,0 +1,60 @@
+import time
+
+import wandb
+
+import sys
+from pathlib import Path
+# Make common_utils importable: it is located via the first "views_pipeline" component of this file's path (ValueError if there is none).
+PATH = Path(__file__)
+sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS  
+from set_path import setup_project_paths, setup_artifacts_paths
+setup_project_paths(PATH)
+
+from cli_parser_utils import parse_args, validate_arguments
+#from artifacts_utils import get_latest_model_artifact
+
+#from model_run_handlers import handle_sweep_run, handle_single_run
+from execute_model_runs import execute_sweep_run, execute_single_run
+
+#from mode_run_manager import model_run_manager
+
+if __name__ == "__main__":
+
+    # Parse the command line arguments.
+    args = parse_args()
+    #print(args)
+
+    # Validate the parsed arguments to ensure they conform to the required logic and combinations.
+    validate_arguments(args)
+
+    # Log in to WandB before any run is started.
+    wandb.login()
+
+    start_t = time.time()
+
+    # TODO: Test if and why a model_metadata_dict.py was saved in the artifacts folder.
+
+    # Check first whether a sweep is requested, because the sweep will overwrite the train and evaluate flags.
+    if args.sweep == True:
+
+        #handle_sweep_run(args)
+        execute_sweep_run(args)
+
+    elif args.sweep == False:
+        
+        #handle_single_run(args)
+        execute_single_run(args)
+
+    end_t = time.time()
+    minutes = (end_t - start_t)/60
+    print(f'Done. Runtime: {minutes:.3f} minutes')
+
+    # Notes on stepshifted models:
+    # There will be some thinking here in regards to how we store, denote (naming convention), and retrieve the model artifacts from stepshifted models.
+    # It is not a big issue, but it is something to consider so we don't do something headless.
+    # A possible format could be: <run_type>_model_s<step>_<timestamp>.pt example: calibration_model_s00_20210831_123456.pt, calibration_model_s01_20210831_123456.pt, etc.
+    # And the rest of the code would be made in a way to handle this naming convention without any issues. Could be a simple fix.
+    # Alternatively, we could store the model artifacts in a subfolder for each stepshifted model. This would make it easier to handle the artifacts, but it would also make it harder to retrieve the latest artifact for a given run type.
+    # Lastly, the solution Xiaolong is working on might allow us to store multiple models (steps) in one artifact, which would make this whole discussion obsolete and be the best solution.
+
+
diff --git a/models/hazel_rabbit/requirements.txt b/models/hazel_rabbit/requirements.txt
new file mode 100644
index 00000000..1fa9034a
--- /dev/null
+++ b/models/hazel_rabbit/requirements.txt
@@ -0,0 +1 @@
+# Requirements
diff --git a/models/hazel_rabbit/src/forecasting/generate_forecast.py b/models/hazel_rabbit/src/forecasting/generate_forecast.py
new file mode 100644
index 00000000..2ebd993c
--- /dev/null
+++ b/models/hazel_rabbit/src/forecasting/generate_forecast.py
@@ -0,0 +1,63 @@
+
+import sys
+from pathlib import Path
+
+import pandas as pd
+
+# Make common_utils importable: it is located via the first "views_pipeline" component of this file's path (ValueError if there is none).
+PATH = Path(__file__)
+sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS  
+from set_path import setup_project_paths, setup_data_paths
+setup_project_paths(PATH)
+
+# Project-local imports come after the path setup, so this module no longer depends on another module having configured sys.path first.
+from set_partition import get_partitioner_dict
+from utils import get_raw_data, create_model_time_stamp, save_generated_pred
+
+
+def forecast_with_model_artifact(config, views_raw):
+    """
+    Produce the zero baseline forecasts for the prediction months of the run type.
+
+    The config is stamped with the current time and, when
+    config.save_generated is True, the forecasts are passed to save_generated_pred.
+
+    Args:
+        config : Configuration object containing parameters and settings.
+        views_raw : DataFrame containing the raw data
+
+    Returns:
+        DataFrame with the forecasts created by generate_forecast.
+    """
+    # the prediction window of the partition that belongs to this run type
+    predict_window = get_partitioner_dict(config.run_type)['predict']
+    first_month, last_month = predict_window[0], predict_window[1]
+
+    # only the identifier columns are carried over from the raw data
+    id_columns = ['month_id', 'pg_id', 'month', 'year_id', 'c_id']
+    views_res = generate_forecast(config, views_raw[id_columns], first_month, last_month)
+
+    # add timestamp, then save the DataFrame of model outputs if requested
+    config = create_model_time_stamp(config)
+    if config.save_generated == True:
+        save_generated_pred(config, views_res)
+    return views_res
+
+
+
+
+def generate_forecast(config, views_raw, first_month, last_month):
+    """Return a zero prediction ('y_pred') for every grid cell in views_raw['pg_id'] and every month in [first_month, last_month); config is kept for interface compatibility."""
+    unique_grids = views_raw['pg_id'].unique()
+    months = list(range(first_month, last_month))
+
+    # Size both columns from the month window itself (not from config.time_steps),
+    # so they always have equal length; the months of each grid stay contiguous.
+    next_months = pd.DataFrame({
+        'pg_id': unique_grids.repeat(len(months)),
+        'month_id': months * len(unique_grids),
+    })
+    # 1-based position of each month within the forecast window
+    next_months['out_sample_months'] = next_months.groupby('pg_id').cumcount() + 1
+    next_months['y_pred'] = 0
+    return next_months
\ No newline at end of file
diff --git a/models/hazel_rabbit/src/management/execute_model_runs.py b/models/hazel_rabbit/src/management/execute_model_runs.py
new file mode 100644
index 00000000..65848d56
--- /dev/null
+++ b/models/hazel_rabbit/src/management/execute_model_runs.py
@@ -0,0 +1,44 @@
+import sys
+from pathlib import Path
+# Make common_utils importable: it is located via the first "views_pipeline" component of this file's path (ValueError if there is none).
+PATH = Path(__file__)
+sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS  
+from set_path import setup_project_paths, setup_artifacts_paths
+setup_project_paths(PATH)
+
+#from config_sweep import get_swep_config
+from config_hyperparameters import get_hp_config
+#from model_run_manager import model_run_manager
+from execute_model_tasks import execute_model_tasks
+
+
+def execute_sweep_run(args):
+    """Report that sweep runs are not implemented for this model; args is accepted but not used."""
+    print('Running sweep...')
+
+    # A sweep would use a WandB project such as "hazel_rabbit_sweep" (check naming convention).
+    print('Sweep run is not implemented. Exiting...')
+
+
+def execute_single_run(args):
+    """
+    Run the zero baseline model once for the run type given in the arguments.
+
+    Args:
+        args : Parsed command line arguments; run_type, train, and evaluate are read.
+
+    Raises:
+        ValueError: If args.run_type is not 'calibration', 'testing', or 'forecasting'.
+    """
+    run_type = args.run_type
+    config = {**get_hp_config(), 'run_type': run_type}
+    project = f"hazel_rabbit_{run_type}"  # WandB project name - check convention!
+
+    if run_type in ('calibration', 'testing'):
+        execute_model_tasks(config = config, project = project, train = args.train, eval = args.evaluate, forecast = False)
+    elif run_type == 'forecasting':
+        execute_model_tasks(config = config, project = project, train = False, eval = False, forecast=True)
+    else:
+        raise ValueError(f"Invalid run type: {args.run_type}")
+
+
diff --git a/models/hazel_rabbit/src/management/execute_model_tasks.py b/models/hazel_rabbit/src/management/execute_model_tasks.py
new file mode 100644
index 00000000..58f99693
--- /dev/null
+++ b/models/hazel_rabbit/src/management/execute_model_tasks.py
@@ -0,0 +1,80 @@
+
+import wandb
+
+import sys
+from pathlib import Path
+# Make common_utils importable: it is located via the first "views_pipeline" component of this file's path (ValueError if there is none).
+PATH = Path(__file__)
+sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS  
+from set_path import setup_project_paths, setup_artifacts_paths, setup_data_paths
+
+from ingester3.ViewsMonth import ViewsMonth
+setup_project_paths(PATH)
+# NOTE(review): the imports below presumably rely on the paths registered by setup_project_paths(PATH) - confirm.
+from utils import get_raw_data
+from utils_wandb import add_wandb_monthly_metrics
+
+
+from evaluate_model import evaluate_model_artifact 
+from generate_forecast import forecast_with_model_artifact
+
+
+
+def execute_model_tasks(config = None, project = None, train = None, eval = None, forecast = None):
+
+    """
+    Executes the requested model tasks (training, evaluation, forecasting) inside one WandB run.
+
+    The run is initialized with the given project and config, the raw data of the
+    run type is loaded once, and each task whose flag is truthy is executed in turn.
+    Training only prints a message for the zero baseline model.
+
+    Args:
+        config: Configuration object containing parameters and settings.
+        project: The WandB project name.
+        train: Flag to indicate if the model should be trained.
+        eval: Flag to indicate if the model should be evaluated (note: the name shadows the builtin eval).
+        forecast: Flag to indicate if forecasting should be performed.
+    """
+
+    # Define the path for the artifacts (not used further in this function)
+    PATH_ARTIFACTS = setup_artifacts_paths(PATH)
+
+    #device = setup_device()
+
+    # Initialize WandB
+    with wandb.init(project=project, entity="views_pipeline", config=config): # project and config ignored when running a sweep 
+        
+        # add the monthly metrics to WandB
+        add_wandb_monthly_metrics() 
+
+        # Update config from WandB initialization above
+        config = wandb.config
+
+        # Retrieve raw data (partition) based on the configuration
+        views_raw = get_raw_data(config) 
+        
+
+        # Sweep runs: nothing is done here for this model
+        if config.sweep:  
+
+            pass
+
+        # Training: only a message is printed; despite its wording, execution continues with the remaining tasks
+        if train:
+        
+            print('No need to train the zero baseline model. Exiting...')
+            pass
+
+        # Evaluation: delegate to evaluate_model_artifact with the loaded raw data
+        if eval:
+            #handle_evaluation(config, device, views_vol, PATH_ARTIFACTS, artifact_name)
+            evaluate_model_artifact(config, views_raw)
+
+
+        # Forecasting: delegate to forecast_with_model_artifact with the loaded raw data
+        if forecast:
+            #handle_forecasting(config, device, views_vol, PATH_ARTIFACTS, artifact_name)
+            forecast_with_model_artifact(config, views_raw)
+
+            
diff --git a/models/hazel_rabbit/src/offline_evaluation/evaluate_model.py b/models/hazel_rabbit/src/offline_evaluation/evaluate_model.py
new file mode 100644
index 00000000..81d09ba6
--- /dev/null
+++ b/models/hazel_rabbit/src/offline_evaluation/evaluate_model.py
@@ -0,0 +1,60 @@
+
+import sys
+from pathlib import Path
+
+# Make common_utils importable: it is located via the first "views_pipeline" component of this file's path (ValueError if there is none).
+PATH = Path(__file__)
+sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS  
+from set_path import setup_project_paths, setup_data_paths
+setup_project_paths(PATH)
+
+# Project-local imports come after the path setup, so this module no longer depends on another module having configured sys.path first.
+from set_partition import get_partitioner_dict
+from utils import get_raw_data, create_model_time_stamp, save_generated_pred
+
+
+def evaluate_model_artifact(config, views_raw):
+    """
+    Produce the zero baseline predictions for the evaluation of a run type.
+
+    Every grid cell ('pg_id') of the raw data is passed through
+    create_months_index and given a prediction of 0. The config is stamped with
+    the current time and the result is saved when config.save_generated is True.
+
+    Args:
+        config : Configuration object containing parameters and settings.
+        views_raw : DataFrame containing the raw data
+
+    Returns:
+        DataFrame with the predictions.
+    """
+    # NOTE(review): the prediction window is derived here but never used below - confirm whether it should filter views_raw.
+    predict_window = get_partitioner_dict(config.run_type)['predict']
+    last_month = predict_window[1]
+    first_month = predict_window[0] if config.partitioner==True else last_month-config.time_steps
+
+    # per grid cell: build the month index, then add the 0 prediction
+    per_grid = views_raw.groupby('pg_id').apply(create_months_index, config=config)
+    views_res = per_grid.reset_index(drop=True)
+    views_res['y_pred'] = 0
+
+    config = create_model_time_stamp(config)
+    if config.save_generated == True:
+        save_generated_pred(config, views_res)
+    return views_res
+
+
+
+def create_months_index(group_df, config):
+    """
+    Keep the last config.time_steps months of one grid cell and number them from 1 in the new column 'out_sample_months'.
+    Grid cells with fewer rows than config.time_steps keep all their rows (numbered 1..n) instead of raising a length mismatch error.
+
+    Args:
+        group_df : DataFrame of a single grid id ('pg_id'); must contain the column 'month_id'.
+        config : Configuration object containing parameters and settings.
+    """
+    # copy so that the new column is added to an independent frame, not to a slice of the sorted one
+    group_df = group_df.sort_values(by='month_id').tail(config.time_steps).copy()
+    group_df['out_sample_months'] = range(1, len(group_df) + 1)
+    return group_df
\ No newline at end of file
diff --git a/models/hazel_rabbit/src/utils/utils.py b/models/hazel_rabbit/src/utils/utils.py
new file mode 100644
index 00000000..7eff232f
--- /dev/null
+++ b/models/hazel_rabbit/src/utils/utils.py
@@ -0,0 +1,74 @@
+import numpy as np
+from datetime import datetime
+import pickle
+
+
+
+
+import sys
+from pathlib import Path
+# Make common_utils importable: it is located via the first "views_pipeline" component of this file's path (ValueError if there is none).
+PATH = Path(__file__)
+sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS  
+from set_path import setup_project_paths, setup_data_paths
+setup_project_paths(PATH)
+
+
+
+
+def get_raw_data(config):
+    """
+    Load the raw viewser data of the run type.
+
+    Args:
+        config : Configuration object containing parameters and settings; run_type is read.
+
+    Returns:
+        The unpickled content of '<run_type>_viewser_df.pkl' in the first path returned by setup_data_paths.
+    """
+    raw_dir, _, _ = setup_data_paths(PATH)
+    file_name = f'/{config.run_type}_viewser_df.pkl'
+    print(f'Loading {config.run_type} data from {file_name}...')
+    # allow_pickle=True unpickles the file, so it must come from a trusted source
+    return np.load(f'{raw_dir}{file_name}', allow_pickle=True)
+
+
+
+
+def create_model_time_stamp(config):
+    """
+    Stamp the config with the current local time, formatted as YYYYMMDD_HHMMSS.
+
+    Args:
+        config : Configuration object containing parameters and settings.
+
+    Returns:
+        The same config object, with model_time_stamp set.
+    """
+    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
+    print(f"timestamp: {stamp}")
+    config.model_time_stamp = stamp  # kept on the config for logging and conciseness
+    return config
+
+
+def save_generated_pred(config, views_res):
+    """
+    Pickle the predictions into the third path returned by setup_data_paths.
+
+    The file name encodes config.time_steps, config.run_type, and config.model_time_stamp.
+
+    Args:
+        config : Configuration object containing parameters and settings.
+        views_res : DataFrame containing the predictions.
+    """
+    _, _, generated_dir = setup_data_paths(PATH)
+    target = f'{generated_dir}/df_sb_os_ns_output_{config.time_steps}_{config.run_type}_{config.model_time_stamp}.pkl'
+    # binary mode: pickle writes bytes
+    with open(target, 'wb') as out_file:
+        pickle.dump(views_res, out_file)
+
+
+
+
+
+
diff --git a/models/hazel_rabbit/src/utils/utils_wandb.py b/models/hazel_rabbit/src/utils/utils_wandb.py
new file mode 100644
index 00000000..df82c326
--- /dev/null
+++ b/models/hazel_rabbit/src/utils/utils_wandb.py
@@ -0,0 +1,34 @@
+import numpy as np
+from sklearn.metrics import mean_squared_error, average_precision_score, roc_auc_score, brier_score_loss
+import wandb
+
+# there are things in other utils that should be here...
+
+def add_wandb_monthly_metrics():
+    """Define "monthly/out_sample_month" in WandB and set it as the step metric of every "monthly/*" metric."""
+    step_key = "monthly/out_sample_month"
+    wandb.define_metric(step_key)
+    wandb.define_metric("monthly/*", step_metric=step_key)
+
+
+def log_wandb_monthly_metrics(config, mse_list, ap_list, auc_list, brier_list):
+    """
+    Log the mean of each monthly evaluation metric to WandB.
+
+    Every metric is logged with its own wandb.log call, under a key prefixed with "<config.time_steps>month_".
+
+    Args:
+        config : Configuration object containing parameters and settings.
+        mse_list : List of monthly mean squared errors.
+        ap_list : List of monthly average precision scores.
+        auc_list : List of monthly ROC AUC scores.
+        brier_list : List of monthly Brier scores.
+    """
+    monthly_scores = (
+        ("mean_squared_error", mse_list),
+        ("average_precision_score", ap_list),
+        ("roc_auc_score", auc_list),
+        ("brier_score_loss", brier_list),
+    )
+    for metric_name, scores in monthly_scores:
+        wandb.log({f"{config.time_steps}month_{metric_name}": np.mean(scores)})
\ No newline at end of file