the zero baseline model #38

Open · wants to merge 1 commit into base: production
1 change: 1 addition & 0 deletions models/hazel_rabbit/README.md
@@ -0,0 +1 @@
# Model README
16 changes: 16 additions & 0 deletions models/hazel_rabbit/configs/config_deployment.py
@@ -0,0 +1,16 @@
def get_deployment_config():
    """
    Contains the configuration for deploying the model into different environments.
    This configuration is "behavioral", so modifying it will affect the model's runtime behavior and its integration into the deployment system.

    Returns:
    - deployment_config (dict): A dictionary of deployment settings determining how the model is deployed, including status, endpoints, and resource allocation.
    """

    # More deployment settings can/will be added here
    deployment_config = {
        "deployment_status": "baseline", # shadow, deployed, baseline, or deprecated
    }

    return deployment_config
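
For illustration, a minimal sketch of how a deployment system might act on deployment_status; the gating helper below is hypothetical and not part of this PR:

# Hypothetical sketch: gate serving on the deployment status flag.
VALID_STATUSES = {"shadow", "deployed", "baseline", "deprecated"}

def should_serve_predictions(deployment_config: dict) -> bool:
    status = deployment_config["deployment_status"]
    if status not in VALID_STATUSES:
        raise ValueError(f"Unknown deployment_status: {status}")
    # a baseline model produces reference predictions but is not served as the main model
    return status in {"deployed", "shadow"}

# should_serve_predictions(get_deployment_config()) -> False for "baseline"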
18 changes: 18 additions & 0 deletions models/hazel_rabbit/configs/config_hyperparameters.py
@@ -0,0 +1,18 @@

def get_hp_config():
    """
    Contains the hyperparameter configurations for model training.
    This configuration is "operational", so modifying these settings will impact the model's behavior during training.

    Returns:
    - hyperparameters (dict): A dictionary of hyperparameters for training the model, which determine the model's behavior during the training phase.
    """

    hyperparameters = {
        'sweep': False,          # no sweep for the zero baseline model
        'partitioner': False,    # True: use the hardcoded months from set_partitioner.py; False: max months - time_steps
        'save_generated': True,  # save evaluation results in the generated folder
        'time_steps': 36,        # 36 right?
    }
    return hyperparameters
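
To make the partitioner: False behavior above concrete — the prediction window is derived as the maximum available month minus time_steps — here is a minimal sketch with a hypothetical helper:

# Hypothetical sketch of the "max months - time_steps" fallback described above.
def derive_predict_window(last_available_month: int, time_steps: int) -> tuple:
    first_month = last_available_month - time_steps
    return first_month, last_available_month

# e.g. derive_predict_window(540, 36) -> (504, 540), i.e. a 36-month window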
25 changes: 25 additions & 0 deletions models/hazel_rabbit/configs/config_input_data.py
@@ -0,0 +1,25 @@
from viewser import Queryset, Column

def get_input_data_config():
    """
    Contains the configuration for the input data in the form of a viewser queryset, i.e. the data from viewser used to train the model.
    This configuration is "behavioral", so modifying it will affect the model's runtime behavior and its integration into the deployment system.
    There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and architecture accordingly.

    Returns:
    - queryset_base (Queryset): A queryset containing the base data for the model training.
    """

    # VIEWSER 6
    queryset_base = (Queryset("hazel_rabbit", "priogrid_month")
        .with_column(Column("ln_sb_best", from_loa="priogrid_month", from_column="ged_sb_best_count_nokgi").transform.ops.ln().transform.missing.replace_na())
        .with_column(Column("ln_ns_best", from_loa="priogrid_month", from_column="ged_ns_best_count_nokgi").transform.ops.ln().transform.missing.replace_na())
        .with_column(Column("ln_os_best", from_loa="priogrid_month", from_column="ged_os_best_count_nokgi").transform.ops.ln().transform.missing.replace_na())
        .with_column(Column("month", from_loa="month", from_column="month"))
        .with_column(Column("year_id", from_loa="country_year", from_column="year_id"))
        .with_column(Column("c_id", from_loa="country_year", from_column="country_id"))
        .with_column(Column("col", from_loa="priogrid", from_column="col"))
        .with_column(Column("row", from_loa="priogrid", from_column="row")))

    return queryset_base
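
For context, materializing such a queryset as a pandas DataFrame typically follows the viewser publish/fetch pattern; a minimal sketch, assuming that pattern applies here:

# Sketch: fetch the queryset defined above (assumes the standard viewser
# publish/fetch pattern; the pipeline's actual fetch site may differ).
qs = get_input_data_config()
df = qs.publish().fetch() # pandas DataFrame, typically indexed by month and priogrid id
print(df.head())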
17 changes: 17 additions & 0 deletions models/hazel_rabbit/configs/config_meta.py
@@ -0,0 +1,17 @@
def get_meta_config():
    """
    Contains the metadata for the model (model architecture, name, target variable, and level of analysis).
    This config is for documentation purposes only; modifying it will not affect the model, the training, or the evaluation.

    Returns:
    - meta_config (dict): A dictionary containing model meta configuration.
    """
    meta_config = {
        "name": "hazel_rabbit",
        "algorithm": "zero baseline",
        "target(S)": ["ln_sb_best", "ln_ns_best", "ln_os_best", "ln_sb_best_binarized", "ln_ns_best_binarized", "ln_os_best_binarized"],

Reviewer comment: Just out of curiosity: is there a reason for the capital S in "target(S)"?

"queryset": "hazel_rabbit",
"level": "pgm",
"creator": "Borbala"
}
return meta_config
27 changes: 27 additions & 0 deletions models/hazel_rabbit/configs/config_sweep.py
@@ -0,0 +1,27 @@
def get_swep_config():
Reviewer comment (Collaborator): small typo (sweep)


"""
Contains the configuration for hyperparameter sweeps using WandB.
This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance.

Returns:
- sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters.
"""

sweep_config = {
'method': 'grid'
}

metric = {

}

sweep_config['metric'] = metric

parameters_dict = {

}

sweep_config['parameters'] = parameters_dict

return sweep_config
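
If a sweep were ever enabled for this model, the config would typically be handed to WandB as below; a minimal sketch of the standard wandb sweep pattern, where train_fn is a hypothetical training entry point (the PR leaves sweeps unimplemented):

import wandb

# Sketch of standard WandB sweep usage; train_fn is hypothetical.
sweep_config = get_swep_config()
sweep_id = wandb.sweep(sweep_config, project="hazel_rabbit_sweep", entity="views_pipeline")
wandb.agent(sweep_id, function=train_fn)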
60 changes: 60 additions & 0 deletions models/hazel_rabbit/main.py
@@ -0,0 +1,60 @@
import time

import wandb

import sys
from pathlib import Path

PATH = Path(__file__)
sys.path.insert(0, str(Path(*PATH.parts[:PATH.parts.index("views_pipeline")+1]) / "common_utils")) # PATH_COMMON_UTILS
from set_path import setup_project_paths, setup_artifacts_paths
setup_project_paths(PATH)

from cli_parser_utils import parse_args, validate_arguments
#from artifacts_utils import get_latest_model_artifact

#from model_run_handlers import handle_sweep_run, handle_single_run
from execute_model_runs import execute_sweep_run, execute_single_run

#from mode_run_manager import model_run_manager

if __name__ == "__main__":

    # new argparse solution
    args = parse_args()
    #print(args)

    # Validate the parsed arguments to ensure they conform to the required logic and combinations.
    validate_arguments(args)

    # wandb login
    wandb.login()

    start_t = time.time()

    # Test if and why a model_metadata_dict.py was saved in the artifacts folder.

    # Check first whether this is a sweep run, since a sweep overrides the train and evaluate flags
    if args.sweep:
        #handle_sweep_run(args)
        execute_sweep_run(args)

    else:
        #handle_single_run(args)
        execute_single_run(args)

    end_t = time.time()
    minutes = (end_t - start_t) / 60
    print(f'Done. Runtime: {minutes:.3f} minutes')

# notes on stepshifted models:
# There will be some thinking here in regards to how we store, denote (naming convention), and retrieve the model artifacts from stepshifted models.
# It is not a big issue, but it is something to consider so we don't do something heedless.
# A possible format could be: <run_type>_model_s<step>_<timestamp>.pt, e.g. calibration_model_s00_20210831_123456.pt, calibration_model_s01_20210831_123456.pt, etc.
# The rest of the code would then be made to handle this naming convention without any issues. Could be a simple fix.
# Alternatively, we could store the model artifacts in a subfolder for each stepshifted model. This would make it easier to handle the artifacts, but it would also make it harder to retrieve the latest artifact for a given run type.
# Lastly, the solution Xiaolong is working on might allow us to store multiple models (steps) in one artifact, which would make this whole discussion obsolete and be the best solution.
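
To make the proposed naming convention concrete, a minimal sketch of formatting and parsing <run_type>_model_s<step>_<timestamp>.pt names; both helpers are hypothetical:

from datetime import datetime

# Hypothetical helpers for the proposed artifact naming convention.
def make_artifact_name(run_type: str, step: int) -> str:
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{run_type}_model_s{step:02d}_{timestamp}.pt"

def parse_artifact_name(name: str) -> dict:
    run_type, _, step, date, clock = name.removesuffix(".pt").split("_")
    return {"run_type": run_type, "step": int(step.lstrip("s")), "timestamp": f"{date}_{clock}"}

# make_artifact_name("calibration", 0) -> e.g. "calibration_model_s00_20210831_123456.pt"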


1 change: 1 addition & 0 deletions models/hazel_rabbit/requirements.txt
@@ -0,0 +1 @@
# Requirements
63 changes: 63 additions & 0 deletions models/hazel_rabbit/src/forecasting/generate_forecast.py
@@ -0,0 +1,63 @@

import sys
from pathlib import Path

import pandas as pd

PATH = Path(__file__)
sys.path.insert(0, str(Path(*PATH.parts[:PATH.parts.index("views_pipeline")+1]) / "common_utils")) # PATH_COMMON_UTILS
from set_path import setup_project_paths, setup_data_paths
setup_project_paths(PATH)

# imports from common_utils must come after the path setup above
from set_partition import get_partitioner_dict
from utils import get_raw_data, create_model_time_stamp, save_generated_pred


def forecast_with_model_artifact(config, views_raw):
    """
    Create forecasts using the zero baseline model. Return a DataFrame with the predictions.

    Args:
        config : Configuration object containing parameters and settings.
        views_raw : DataFrame containing the raw data.
    """

    partitioner_dict = get_partitioner_dict(config.run_type)

    # get the months for the predictions
    first_month = partitioner_dict['predict'][0] #if config.partitioner==True else partitioner_dict['predict'][1]-config.time_steps
    last_month = partitioner_dict['predict'][1]

    views_raw = views_raw[['month_id', 'pg_id', 'month', 'year_id', 'c_id']]

    views_res = generate_forecast(config, views_raw, first_month, last_month)

    # add timestamp
    config = create_model_time_stamp(config)

    # save the DataFrame of model outputs
    if config.save_generated:
        save_generated_pred(config, views_res)

    return views_res




def generate_forecast(config, views_raw, first_month, last_month):
    # get the unique grids as an array
    unique_grids = views_raw['pg_id'].unique()

    # create the next config.time_steps months for these grids;
    # note: this assumes last_month - first_month == config.time_steps,
    # otherwise the repeated pg_ids and the month range will not line up
    next_months = pd.DataFrame({
        'pg_id': unique_grids.repeat(config.time_steps),
        'month_id': [month for _ in unique_grids for month in range(first_month, last_month)]
    })

    # assign the sequence from 1 to config.time_steps for the new months
    next_months['out_sample_months'] = next_months.groupby('pg_id').cumcount() + 1
    next_months['y_pred'] = 0

    return next_months
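
A quick illustration of what generate_forecast returns for the zero baseline, using synthetic inputs and a SimpleNamespace as a stand-in config (all values hypothetical):

import pandas as pd
from types import SimpleNamespace

# Sketch: exercise generate_forecast with tiny synthetic data.
config = SimpleNamespace(time_steps=3)
views_raw = pd.DataFrame({'pg_id': [101, 101, 202, 202], 'month_id': [500, 501, 500, 501]})

preds = generate_forecast(config, views_raw, first_month=502, last_month=505)
# -> 2 grids x 3 months = 6 rows; out_sample_months runs 1..3 per grid; y_pred is all zeros
print(preds)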
44 changes: 44 additions & 0 deletions models/hazel_rabbit/src/management/execute_model_runs.py
@@ -0,0 +1,44 @@
import sys
from pathlib import Path

PATH = Path(__file__)
sys.path.insert(0, str(Path(*PATH.parts[:PATH.parts.index("views_pipeline")+1]) / "common_utils")) # PATH_COMMON_UTILS
from set_path import setup_project_paths, setup_artifacts_paths
setup_project_paths(PATH)

#from config_sweep import get_swep_config
from config_hyperparameters import get_hp_config
#from model_run_manager import model_run_manager
from execute_model_tasks import execute_model_tasks


def execute_sweep_run(args):
    print('Running sweep...')

    project = "hazel_rabbit_sweep" # check naming convention

    print('Sweep run is not implemented. Exiting...')


def execute_single_run(args):

    # get config
    config = get_hp_config()
    config['run_type'] = args.run_type

    # derive the project name from the run type - check convention!
    project = f"hazel_rabbit_{args.run_type}"

    if args.run_type in ('calibration', 'testing'):
        execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, forecast=False)

    elif args.run_type == 'forecasting':
        execute_model_tasks(config=config, project=project, train=False, eval=False, forecast=True)
Reviewer comment (Collaborator): train is not necessarily False. We might also train the model based on the forecasting partition.


    else:
        raise ValueError(f"Invalid run type: {args.run_type}")
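
For orientation, the branches above map onto CLI invocations roughly as follows; the flag names are assumptions inferred from the args fields used here (run_type, train, evaluate, sweep) and are actually defined in cli_parser_utils:

# Assumed invocations (flag names inferred, see cli_parser_utils):
# python main.py --run_type calibration --train --evaluate
# python main.py --run_type testing --evaluate
# python main.py --run_type forecasting
# python main.py --sweep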


80 changes: 80 additions & 0 deletions models/hazel_rabbit/src/management/execute_model_tasks.py
@@ -0,0 +1,80 @@

import wandb

import sys
from pathlib import Path

PATH = Path(__file__)
sys.path.insert(0, str(Path(*PATH.parts[:PATH.parts.index("views_pipeline")+1]) / "common_utils")) # PATH_COMMON_UTILS
from set_path import setup_project_paths, setup_artifacts_paths, setup_data_paths
setup_project_paths(PATH)

from ingester3.ViewsMonth import ViewsMonth

from utils import get_raw_data
from utils_wandb import add_wandb_monthly_metrics

from evaluate_model import evaluate_model_artifact
from generate_forecast import forecast_with_model_artifact



def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None):
    """
    Executes various model-related tasks including training, evaluation, and forecasting.

    This function manages the execution of different tasks such as training the model,
    evaluating an existing model, or performing forecasting.
    It also initializes the WandB project.

    Args:
        config: Configuration object containing parameters and settings.
        project: The WandB project name.
        train: Flag to indicate if the model should be trained.
        eval: Flag to indicate if the model should be evaluated.
        forecast: Flag to indicate if forecasting should be performed.
    """

    # Define the path for the artifacts
    PATH_ARTIFACTS = setup_artifacts_paths(PATH)

    #device = setup_device()

    # Initialize WandB
    with wandb.init(project=project, entity="views_pipeline", config=config): # project and config are ignored when running a sweep

        # add the monthly metrics to WandB
        add_wandb_monthly_metrics()

        # Update config from the WandB initialization above
        config = wandb.config

        # Retrieve raw data (partition) based on the configuration
        views_raw = get_raw_data(config)

        # Handle the sweep runs
        if config.sweep:
            pass

        # Handle the single model runs: train and save the model as an artifact
        if train:
            print('No need to train the zero baseline model. Exiting...')

        # Handle the single model runs: evaluate a trained model (artifact)
        if eval:
            #handle_evaluation(config, device, views_vol, PATH_ARTIFACTS, artifact_name)
            evaluate_model_artifact(config, views_raw)

        if forecast:
            #handle_forecasting(config, device, views_vol, PATH_ARTIFACTS, artifact_name)
            forecast_with_model_artifact(config, views_raw)

