Commit
Merge branch 'development' into update_generel_logger
Showing 11 changed files with 589 additions and 69 deletions.
@@ -0,0 +1,76 @@
from utils import utils_script_gen
from pathlib import Path


def generate(script_dir: Path) -> bool:
    """
    Generates a Python script with a predefined template and saves it to the specified directory.

    The generated script includes a function to evaluate a model artifact. It handles loading the model,
    making predictions, standardizing the data, generating evaluation metrics, and saving the outputs.
    It also logs relevant information using Weights & Biases (wandb).

    Args:
        script_dir (Path): The directory where the generated script will be saved.

    Returns:
        bool: True if the script was successfully saved, False otherwise.
    """
    code = f"""from datetime import datetime
import pandas as pd
import logging

from model_path import ModelPath
from utils_log_files import create_log_file, read_log_file
from utils_outputs import save_model_outputs, save_predictions
from utils_run import get_standardized_df
from utils_artifacts import get_latest_model_artifact
from utils_evaluation_metrics import generate_metric_dict
from utils_model_outputs import generate_output_dict
from utils_wandb import log_wandb_log_dict
from views_forecasts.extensions import *

logger = logging.getLogger(__name__)


def evaluate_model_artifact(config, artifact_name):
    model_path = ModelPath(config["name"])
    path_raw = model_path.data_raw
    path_generated = model_path.data_generated
    path_artifacts = model_path.artifacts
    run_type = config["run_type"]

    # If an artifact name is provided through the CLI, use it.
    # Otherwise, get the latest model artifact based on the run type.
    if artifact_name:
        logger.info(f"Using (non-default) artifact: {{artifact_name}}")
        if not artifact_name.endswith(".pkl"):
            artifact_name += ".pkl"
        PATH_ARTIFACT = path_artifacts / artifact_name
    else:
        # Use the latest model artifact based on the run type.
        logger.info(f"Using latest (default) run type ({{run_type}}) specific artifact")
        PATH_ARTIFACT = get_latest_model_artifact(path_artifacts, run_type)

    config["timestamp"] = PATH_ARTIFACT.stem[-15:]
    df_viewser = pd.read_pickle(path_raw / f"{{run_type}}_viewser_df.pkl")

    try:
        stepshift_model = pd.read_pickle(PATH_ARTIFACT)
    except FileNotFoundError:
        logger.exception(f"Model artifact not found at {{PATH_ARTIFACT}}")
        raise  # re-raise: without the artifact there is nothing to evaluate

    df = stepshift_model.predict(run_type, df_viewser)
    df = get_standardized_df(df, config)
    data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    date_fetch_timestamp = read_log_file(path_raw / f"{{run_type}}_data_fetch_log.txt").get("Data Fetch Timestamp", None)

    _, df_output = generate_output_dict(df, config)
    evaluation, df_evaluation = generate_metric_dict(df, config)
    log_wandb_log_dict(config, evaluation)

    save_model_outputs(df_evaluation, df_output, path_generated, config)
    save_predictions(df, path_generated, config)
    create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp)
"""
    return utils_script_gen.save_script(script_dir, code)
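
For context, a minimal sketch of how a generator like this might be driven from the meta_tools scaffolding; the module name template_evaluate_model and the src/offline_evaluation target directory are illustrative assumptions, and only generate(script_dir) with its boolean return value comes from the code above.

from pathlib import Path

import template_evaluate_model  # hypothetical module name for the generator shown above


def build_evaluation_script(model_dir: Path) -> bool:
    # Assumed layout: the generated evaluate_model script lives under the model's src tree.
    script_dir = model_dir / "src" / "offline_evaluation"
    script_dir.mkdir(parents=True, exist_ok=True)
    # generate() renders the template and returns True if save_script succeeded.
    return template_evaluate_model.generate(script_dir)


if __name__ == "__main__":
    print(build_evaluation_script(Path("models") / "example_model"))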
@@ -0,0 +1,49 @@
from utils import utils_script_gen
from pathlib import Path


def generate(script_dir: Path) -> bool:
    """
    Generates a Python script with a predefined template and saves it to the specified directory.

    The generated script includes a function to evaluate a sweep of model runs. It handles loading the model,
    making predictions, standardizing the data, calculating the mean squared error (MSE), generating evaluation
    metrics, and logging the results using Weights & Biases (wandb).

    Args:
        script_dir (Path): The directory where the generated script will be saved.

    Returns:
        bool: True if the script was successfully saved, False otherwise.
    """
    code = f"""import pandas as pd
import wandb
from sklearn.metrics import mean_squared_error

from model_path import ModelPath
from utils_run import get_standardized_df
from utils_wandb import log_wandb_log_dict
from utils_evaluation_metrics import generate_metric_dict


def evaluate_sweep(config, stepshift_model):
    model_path = ModelPath(config["name"])
    path_raw = model_path.data_raw
    run_type = config["run_type"]
    steps = config["steps"]

    df_viewser = pd.read_pickle(path_raw / f"{{run_type}}_viewser_df.pkl")
    df = stepshift_model.predict(run_type, df_viewser)
    df = get_standardized_df(df, config)

    # Temporarily keep this because the metric to minimize is MSE
    pred_cols = [f"step_pred_{{str(i)}}" for i in steps]
    df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * 36,
                                                        [row[col] for col in pred_cols]), axis=1)
    wandb.log({{"MSE": df["mse"].mean()}})

    evaluation, _ = generate_metric_dict(df, config)
    log_wandb_log_dict(config, evaluation)
"""
    return utils_script_gen.save_script(script_dir, code)
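
To make the per-row MSE explicit: the generated snippet repeats the observed dependent variable once per forecast step (hard-coded to 36 in the template) and scores it against the step predictions. A toy version with three steps and made-up column names:

import pandas as pd
from sklearn.metrics import mean_squared_error

config = {"depvar": "ged_sb", "steps": [1, 2, 3]}  # illustrative names, not from the template
df = pd.DataFrame({
    "ged_sb": [0.0, 2.0],
    "step_pred_1": [0.0, 1.0],
    "step_pred_2": [0.5, 2.0],
    "step_pred_3": [0.0, 3.0],
})

pred_cols = [f"step_pred_{i}" for i in config["steps"]]
# Each row's observed value is repeated once per step and compared against that row's step predictions.
df["mse"] = df.apply(
    lambda row: mean_squared_error([row[config["depvar"]]] * len(pred_cols),
                                   [row[col] for col in pred_cols]),
    axis=1,
)
print(df["mse"].tolist())  # approx [0.083, 0.667]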
@@ -0,0 +1,59 @@
from utils import utils_script_gen
from pathlib import Path


def generate(script_dir: Path) -> bool:
    """
    Generates a Python script with a predefined template and saves it to the specified directory.

    The generated script includes functions to execute model runs, either as a sweep or a single run.
    It uses configurations for deployment, hyperparameters, meta, and sweep, and integrates with
    Weights & Biases (wandb) for experiment tracking.

    Args:
        script_dir (Path): The directory where the generated script will be saved.

    Returns:
        bool: True if the script was successfully saved, False otherwise.
    """
    code = f"""import wandb

from config_deployment import get_deployment_config
from config_hyperparameters import get_hp_config
from config_meta import get_meta_config
from config_sweep import get_sweep_config
from execute_model_tasks import execute_model_tasks
from get_data import get_data
from utils_run import update_config, update_sweep_config


def execute_sweep_run(args):
    sweep_config = get_sweep_config()
    meta_config = get_meta_config()
    update_sweep_config(sweep_config, args, meta_config)

    get_data(args, sweep_config["name"])

    project = f"{{sweep_config['name']}}_sweep"  # we can name the sweep in the config file
    sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline")
    wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline")


def execute_single_run(args):
    hp_config = get_hp_config()
    meta_config = get_meta_config()
    dp_config = get_deployment_config()
    config = update_config(hp_config, meta_config, dp_config, args)

    get_data(args, config["name"])

    project = f"{{config['name']}}_{{args.run_type}}"

    if args.run_type == "calibration" or args.run_type == "testing":
        execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate,
                            forecast=False, artifact_name=args.artifact_name)
    elif args.run_type == "forecasting":
        execute_model_tasks(config=config, project=project, train=args.train, eval=False,
                            forecast=args.forecast, artifact_name=args.artifact_name)
"""
    return utils_script_gen.save_script(script_dir, code)
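
As a usage sketch, the generated runner only relies on an args object exposing run_type, train, evaluate, forecast, and artifact_name; the argparse wiring below is an assumption about how such an object could be built (the --sweep flag and the defaults are illustrative, not taken from the code above).

import argparse

parser = argparse.ArgumentParser(description="Drive execute_single_run / execute_sweep_run")
parser.add_argument("--run_type", choices=["calibration", "testing", "forecasting"], default="calibration")
parser.add_argument("--sweep", action="store_true")
parser.add_argument("--train", action="store_true")
parser.add_argument("--evaluate", action="store_true")
parser.add_argument("--forecast", action="store_true")
parser.add_argument("--artifact_name", default=None)
args = parser.parse_args(["--run_type", "calibration", "--train", "--evaluate"])

# With these flags, execute_single_run(args) would train a calibration model and then evaluate
# the resulting artifact; execute_sweep_run(args) would be chosen when args.sweep is set.
print(args)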
meta_tools/templates/model/template_execute_model_tasks.py (91 additions, 0 deletions)
@@ -0,0 +1,91 @@
from utils import utils_script_gen
from pathlib import Path


def generate(script_dir: Path) -> bool:
    """
    Generates a Python script with a predefined template and saves it to the specified directory.

    The generated script includes a function to execute various model-related tasks such as training,
    evaluation, and forecasting. It integrates with Weights & Biases (wandb) for experiment tracking
    and logging.

    Args:
        script_dir (Path): The directory where the generated script will be saved.

    Returns:
        bool: True if the script was successfully saved, False otherwise.
    """
    code = f"""import wandb
import logging
import time

from evaluate_model import evaluate_model_artifact
from evaluate_sweep import evaluate_sweep
from generate_forecast import forecast_model_artifact
from train_model import train_model_artifact
from utils_run import split_hurdle_parameters
from utils_wandb import add_wandb_monthly_metrics

logger = logging.getLogger(__name__)


def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None):
    \"""
    Executes various model-related tasks including training, evaluation, and forecasting.

    This function manages the execution of different tasks such as training the model,
    evaluating an existing model, or performing forecasting.
    It also initializes the WandB project.

    Args:
        config: Configuration object containing parameters and settings.
        project: The WandB project name.
        train: Flag to indicate if the model should be trained.
        eval: Flag to indicate if the model should be evaluated.
        forecast: Flag to indicate if forecasting should be performed.
        artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting.
    \"""
    start_t = time.time()

    # Initialize WandB
    with wandb.init(project=project, entity="views_pipeline",
                    config=config):  # project and config ignored when running a sweep

        # Add the monthly metrics to WandB
        add_wandb_monthly_metrics()

        # Update config from WandB initialization above
        config = wandb.config

        # W&B does not directly support nested dictionaries for hyperparameters.
        # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs.
        if config["sweep"] and config["algorithm"] == "HurdleRegression":
            config["parameters"] = {{}}
            config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config)

        if config["sweep"]:
            logger.info(f"Sweeping model {{config['name']}}...")
            stepshift_model = train_model_artifact(config)
            logger.info(f"Evaluating model {{config['name']}}...")
            evaluate_sweep(config, stepshift_model)

        # Handle the single model runs: train and save the model as an artifact
        if train:
            logger.info(f"Training model {{config['name']}}...")
            train_model_artifact(config)

        # Handle the single model runs: evaluate a trained model (artifact)
        if eval:
            logger.info(f"Evaluating model {{config['name']}}...")
            evaluate_model_artifact(config, artifact_name)

        if forecast:
            logger.info(f"Forecasting model {{config['name']}}...")
            forecast_model_artifact(config, artifact_name)

        end_t = time.time()
        minutes = (end_t - start_t) / 60
        logger.info(f"Done. Runtime: {{minutes:.3f}} minutes.\\n")
"""
    return utils_script_gen.save_script(script_dir, code)
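
The nested-hyperparameter comment suggests that sweep configs flatten the HurdleRegression classifier and regressor settings into prefixed keys. A plausible sketch of what split_hurdle_parameters could look like under that assumption; the real helper lives in utils_run and may differ.

def split_hurdle_parameters(config):
    # Assumed convention: flattened sweep keys such as "clf_n_estimators" or "reg_max_depth"
    # are regrouped into separate dicts for the classification and regression stages.
    clf_params, reg_params = {}, {}
    for key, value in dict(config).items():
        if key.startswith("clf_"):
            clf_params[key[len("clf_"):]] = value
        elif key.startswith("reg_"):
            reg_params[key[len("reg_"):]] = value
    return clf_params, reg_params


# Example: {"clf_n_estimators": 200, "reg_max_depth": 6} -> ({"n_estimators": 200}, {"max_depth": 6})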