From d304d21128e9397affe8cc4fad2ff223ad6bec1d Mon Sep 17 00:00:00 2001 From: Dylan <52908667+smellycloud@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:46:00 +0100 Subject: [PATCH 1/4] add new templates for darts models --- common_querysets/queryset_meow_meow.py | 41 ------- meta_tools/model_scaffold_builder.py | 39 ++++++- .../model/template_evaluate_model.py | 77 +++++++++++++ .../model/template_evaluate_sweep.py | 50 +++++++++ .../model/template_execute_model_runs.py | 60 ++++++++++ .../model/template_execute_model_tasks.py | 92 ++++++++++++++++ .../model/template_generate_forecast.py | 68 ++++++++++++ .../templates/model/template_get_data.py | 36 ++++++ meta_tools/templates/model/template_main.py | 44 ++++---- .../templates/model/template_train_model.py | 53 +++++++++ .../templates/model/template_utils_run.py | 104 ++++++++++++++++++ 11 files changed, 595 insertions(+), 69 deletions(-) delete mode 100644 common_querysets/queryset_meow_meow.py create mode 100644 meta_tools/templates/model/template_evaluate_model.py create mode 100644 meta_tools/templates/model/template_evaluate_sweep.py create mode 100644 meta_tools/templates/model/template_execute_model_runs.py create mode 100644 meta_tools/templates/model/template_execute_model_tasks.py create mode 100644 meta_tools/templates/model/template_generate_forecast.py create mode 100644 meta_tools/templates/model/template_get_data.py create mode 100644 meta_tools/templates/model/template_train_model.py create mode 100644 meta_tools/templates/model/template_utils_run.py diff --git a/common_querysets/queryset_meow_meow.py b/common_querysets/queryset_meow_meow.py deleted file mode 100644 index 02b00ecd..00000000 --- a/common_querysets/queryset_meow_meow.py +++ /dev/null @@ -1,41 +0,0 @@ -from viewser import Queryset, Column - -def generate(): - """ - Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. - This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. - There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. - - Returns: - - queryset_base (Queryset): A queryset containing the base data for the model training. - """ - - # VIEWSER 6, Example configuration. Modify as needed. 
- - queryset_base = (Queryset("meow_meow", "priogrid_month") - # Create a new column 'ln_sb_best' using data from 'priogrid_month' and 'ged_sb_best_count_nokgi' column - # Apply logarithmic transformation, handle missing values by replacing them with NA - .with_column(Column("ln_sb_best", from_loa="priogrid_month", from_column="ged_sb_best_count_nokgi") - .transform.ops.ln().transform.missing.replace_na()) - - # Create a new column 'ln_ns_best' using data from 'priogrid_month' and 'ged_ns_best_count_nokgi' column - # Apply logarithmic transformation, handle missing values by replacing them with NA - .with_column(Column("ln_ns_best", from_loa="priogrid_month", from_column="ged_ns_best_count_nokgi") - .transform.ops.ln().transform.missing.replace_na()) - - # Create a new column 'ln_os_best' using data from 'priogrid_month' and 'ged_os_best_count_nokgi' column - # Apply logarithmic transformation, handle missing values by replacing them with NA - .with_column(Column("ln_os_best", from_loa="priogrid_month", from_column="ged_os_best_count_nokgi") - .transform.ops.ln().transform.missing.replace_na()) - - # Create columns for month and year_id - .with_column(Column("month", from_loa="month", from_column="month")) - .with_column(Column("year_id", from_loa="country_year", from_column="year_id")) - - # Create columns for country_id, col, and row - .with_column(Column("c_id", from_loa="country_year", from_column="country_id")) - .with_column(Column("col", from_loa="priogrid", from_column="col")) - .with_column(Column("row", from_loa="priogrid", from_column="row")) - ) - - return queryset_base diff --git a/meta_tools/model_scaffold_builder.py b/meta_tools/model_scaffold_builder.py index 6fa48bb1..0d69e0f0 100644 --- a/meta_tools/model_scaffold_builder.py +++ b/meta_tools/model_scaffold_builder.py @@ -25,6 +25,14 @@ template_config_meta, template_config_sweep, template_main, + template_get_data, + template_execute_model_runs, + template_execute_model_tasks, + template_evaluate_model, + template_evaluate_sweep, + template_train_model, + template_utils_run, + template_generate_forecast ) logging.basicConfig(level=logging.INFO) @@ -152,7 +160,7 @@ def build_model_scripts(self): f"Model directory {self._model.model_dir} does not exist. Please call build_model_directory() first. Aborting script generation." 
) template_config_deployment.generate( - script_dir=self._model.model_dir / "configs/config_deployment.py" + script_dir=self._model.configs / "config_deployment.py" ) self._model_algorithm = str( input( @@ -160,7 +168,7 @@ def build_model_scripts(self): ) ) template_config_hyperparameters.generate( - script_dir=self._model.model_dir / "configs/config_hyperparameters.py", + script_dir=self._model.configs / "config_hyperparameters.py", model_algorithm=self._model_algorithm, ) template_config_input_data.generate( @@ -169,15 +177,38 @@ def build_model_scripts(self): model_name=self._model.model_name, ) template_config_meta.generate( - script_dir=self._model.model_dir / "configs/config_meta.py", + script_dir=self._model.configs / "config_meta.py", model_name=self._model.model_name, model_algorithm=self._model_algorithm, ) template_config_sweep.generate( - script_dir=self._model.model_dir / "configs/config_sweep.py", + script_dir=self._model.configs / "config_sweep.py", model_algorithm=self._model_algorithm, ) template_main.generate(script_dir=self._model.model_dir / "main.py") + template_get_data.generate(script_dir=self._model.dataloaders / "get_data.py") + template_generate_forecast.generate( + script_dir=self._model.forecasting / "generate_forecast.py" + ) + template_execute_model_runs.generate( + script_dir=self._model.management / "execute_model_runs.py") + template_execute_model_tasks.generate( + script_dir=self._model.management / "execute_model_tasks.py" + ) + template_evaluate_model.generate( + script_dir=self._model.offline_evaluation / "evaluate_model.py" + ) + template_evaluate_sweep.generate( + script_dir=self._model.offline_evaluation / "evaluate_sweep.py" + ) + template_train_model.generate( + script_dir=self._model.training / "train_model.py" + ) + template_utils_run.generate( + script_dir=self._model.utils / "utils_run.py" + ) + # INFO: utils_outputs.py was not templated because it will probably be moved to common_utils in the future. + def assess_model_directory(self) -> dict: """ diff --git a/meta_tools/templates/model/template_evaluate_model.py b/meta_tools/templates/model/template_evaluate_model.py new file mode 100644 index 00000000..bdfdd47f --- /dev/null +++ b/meta_tools/templates/model/template_evaluate_model.py @@ -0,0 +1,77 @@ +from utils import utils_script_gen +from pathlib import Path + + +def generate(script_dir: Path) -> bool: + """ + Generates a Python script with a predefined template and saves it to the specified directory. + + The generated script includes a function to evaluate a model artifact. It handles loading the model, + making predictions, standardizing the data, generating evaluation metrics, and saving the outputs. + It also logs relevant information using Weights & Biases (wandb). + + Args: + script_dir (Path): The directory where the generated script will be saved. + + Returns: + bool: True if the script was successfully saved, False otherwise. 
+ """ + + code = f""" +from datetime import datetime +import pandas as pd +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_outputs import save_model_outputs, save_predictions +from utils_run import get_standardized_df +from utils_artifacts import get_latest_model_artifact +from utils_evaluation_metrics import generate_metric_dict +from utils_model_outputs import generate_output_dict +from utils_wandb import log_wandb_log_dict +from views_forecasts.extensions import * + +logger = logging.getLogger(__name__) + +def evaluate_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. + # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {{artifact_name}}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + PATH_ARTIFACT = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({{run_type}}) specific artifact") + PATH_ARTIFACT = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = PATH_ARTIFACT.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{{run_type}}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(PATH_ARTIFACT) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {{PATH_ARTIFACT}}") + + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{{run_type}}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + _, df_output = generate_output_dict(df, config) + evaluation, df_evaluation = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) + + save_model_outputs(df_evaluation, df_output, path_generated, config) + save_predictions(df, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) +""" + return utils_script_gen.save_script(script_dir, code) \ No newline at end of file diff --git a/meta_tools/templates/model/template_evaluate_sweep.py b/meta_tools/templates/model/template_evaluate_sweep.py new file mode 100644 index 00000000..d4c254c3 --- /dev/null +++ b/meta_tools/templates/model/template_evaluate_sweep.py @@ -0,0 +1,50 @@ +from utils import utils_script_gen +from pathlib import Path + + +def generate(script_dir: Path) -> bool: + """ + Generates a Python script with a predefined template and saves it to the specified directory. + + The generated script includes a function to evaluate a sweep of model runs. It handles loading the model, + making predictions, standardizing the data, calculating the mean squared error (MSE), generating evaluation metrics, + and logging the results using Weights & Biases (wandb). + + Args: + script_dir (Path): The directory where the generated script will be saved. + + Returns: + bool: True if the script was successfully saved, False otherwise. 
+ """ + + code = f""" +import pandas as pd +import wandb +from sklearn.metrics import mean_squared_error +from model_path import ModelPath +from utils_run import get_standardized_df +from utils_wandb import log_wandb_log_dict +from utils_evaluation_metrics import generate_metric_dict + + +def evaluate_sweep(config, stepshift_model): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + run_type = config["run_type"] + steps = config["steps"] + + df_viewser = pd.read_pickle(path_raw / f"{{{{run_type}}}}_viewser_df.pkl") + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + + # Temporarily keep this because the metric to minimize is MSE + pred_cols = [f"step_pred_{{{{str(i)}}}}" for i in steps] + df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * 36, + [row[col] for col in pred_cols]), axis=1) + + wandb.log({{"MSE": df["mse"].mean()}}) + + evaluation, _ = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) +""" + return utils_script_gen.save_script(script_dir, code) \ No newline at end of file diff --git a/meta_tools/templates/model/template_execute_model_runs.py b/meta_tools/templates/model/template_execute_model_runs.py new file mode 100644 index 00000000..95a142cd --- /dev/null +++ b/meta_tools/templates/model/template_execute_model_runs.py @@ -0,0 +1,60 @@ +from utils import utils_script_gen +from pathlib import Path + + +def generate(script_dir: Path) -> bool: + """ + Generates a Python script with a predefined template and saves it to the specified directory. + + The generated script includes functions to execute model runs, either as a sweep or a single run. + It uses configurations for deployment, hyperparameters, meta, and sweep, and integrates with Weights & Biases (wandb) for experiment tracking. + + Args: + script_dir (Path): The directory where the generated script will be saved. + + Returns: + bool: True if the script was successfully saved, False otherwise. 
+ """ + + code = f""" +import wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + update_sweep_config(sweep_config, args, meta_config) + + get_data(args, sweep_config["name"]) + + project = f"{{sweep_config['name']}}_sweep" # we can name the sweep in the config file + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + get_data(args, config["name"]) + + project = f"{{config['name']}}_{{args.run_type}}" + + if args.run_type == "calibration" or args.run_type == "testing": + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) +""" + return utils_script_gen.save_script(script_dir, code) \ No newline at end of file diff --git a/meta_tools/templates/model/template_execute_model_tasks.py b/meta_tools/templates/model/template_execute_model_tasks.py new file mode 100644 index 00000000..a62cf057 --- /dev/null +++ b/meta_tools/templates/model/template_execute_model_tasks.py @@ -0,0 +1,92 @@ +from utils import utils_script_gen +from pathlib import Path + + +def generate(script_dir: Path) -> bool: + """ + Generates a Python script with a predefined template and saves it to the specified directory. + + The generated script includes a function to execute various model-related tasks such as training, + evaluation, and forecasting. It integrates with Weights & Biases (wandb) for experiment tracking + and logging. + + Args: + script_dir (Path): The directory where the generated script will be saved. + + Returns: + bool: True if the script was successfully saved, False otherwise. + """ + + code = f""" +import wandb +import logging +import time +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + +def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None): + \""" + Executes various model-related tasks including training, evaluation, and forecasting. + + This function manages the execution of different tasks such as training the model, + evaluating an existing model, or performing forecasting. + It also initializes the WandB project. + + Args: + config: Configuration object containing parameters and settings. + project: The WandB project name. + train: Flag to indicate if the model should be trained. + eval: Flag to indicate if the model should be evaluated. + forecast: Flag to indicate if forecasting should be performed. 
+ artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting. + \""" + + start_t = time.time() + + # Initialize WandB + with wandb.init(project=project, entity="views_pipeline", + config=config): # project and config ignored when running a sweep + + # add the monthly metrics to WandB + add_wandb_monthly_metrics() + + # Update config from WandB initialization above + config = wandb.config + + # W&B does not directly support nested dictionaries for hyperparameters + # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs + if config["sweep"] and config["algorithm"] == "HurdleRegression": + config["parameters"] = {{}} + config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config) + + if config["sweep"]: + logger.info(f"Sweeping model {{config['name']}}...") + stepshift_model = train_model_artifact(config) + logger.info(f"Evaluating model {{config['name']}}...") + evaluate_sweep(config, stepshift_model) + + # Handle the single model runs: train and save the model as an artifact + if train: + logger.info(f"Training model {{config['name']}}...") + train_model_artifact(config) + + # Handle the single model runs: evaluate a trained model (artifact) + if eval: + logger.info(f"Evaluating model {{config['name']}}...") + evaluate_model_artifact(config, artifact_name) + + if forecast: + logger.info(f"Forecasting model {{config['name']}}...") + forecast_model_artifact(config, artifact_name) + + end_t = time.time() + minutes = (end_t - start_t) / 60 + logger.info(f"Done. Runtime: {{minutes:.3f}} minutes.\\n") +""" + return utils_script_gen.save_script(script_dir, code) \ No newline at end of file diff --git a/meta_tools/templates/model/template_generate_forecast.py b/meta_tools/templates/model/template_generate_forecast.py new file mode 100644 index 00000000..cf513bc0 --- /dev/null +++ b/meta_tools/templates/model/template_generate_forecast.py @@ -0,0 +1,68 @@ +from utils import utils_script_gen +from pathlib import Path + + +def generate(script_dir: Path) -> bool: + """ + Generates a Python script with a predefined template and saves it to the specified directory. + + The generated script includes a function to forecast using a model artifact. It handles loading the model, + making predictions, standardizing the data, and saving the predictions and log files. + + Args: + script_dir (Path): The directory where the generated script will be saved. + + Returns: + bool: True if the script was successfully saved, False otherwise. + """ + + code = f""" +import pandas as pd +from datetime import datetime +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_standardized_df +from utils_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. 
+ # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {{{{artifact_name}}}}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + path_artifact = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({{{{run_type}}}}) specific artifact") + path_artifact = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = path_artifact.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{{{{run_type}}}}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(path_artifact) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {{{{path_artifact}}}}") + + df_predictions = stepshift_model.predict(run_type, df_viewser) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{{{{run_type}}}}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + save_predictions(df_predictions, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) +""" + return utils_script_gen.save_script(script_dir, code) \ No newline at end of file diff --git a/meta_tools/templates/model/template_get_data.py b/meta_tools/templates/model/template_get_data.py new file mode 100644 index 00000000..d2cf4bc8 --- /dev/null +++ b/meta_tools/templates/model/template_get_data.py @@ -0,0 +1,36 @@ +from utils import utils_script_gen +from pathlib import Path + + +def generate(script_dir: Path) -> bool: + """ + Generates a Python script with a predefined template and saves it to the specified directory. + + Args: + script_dir (Path): The directory where the generated script will be saved. + + Returns: + bool: True if the script was successfully saved, False otherwise. + """ + + code = f""" +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name): + model_path = ModelPath(model_name) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, use_saved=args.saved) + logger.debug(f"DataFrame shape: {{data.shape if data is not None else 'None'}}") + + for ialert, alert in enumerate(str(alerts).strip('[').strip(']').split('Input')): + if 'offender' in alert: + logger.warning({{f"{{args.run_type}} data alert {{ialert}}": str(alert)}}) + + return data +""" + return utils_script_gen.save_script(script_dir, code) \ No newline at end of file diff --git a/meta_tools/templates/model/template_main.py b/meta_tools/templates/model/template_main.py index c41e5eed..4706a03a 100644 --- a/meta_tools/templates/model/template_main.py +++ b/meta_tools/templates/model/template_main.py @@ -33,44 +33,40 @@ def generate(script_dir: Path) -> bool: specified script directory. - The generated script is designed to be executed as a standalone Python script. 
""" - code = """import time -import wandb + code = """import wandb import sys -import logging -logging.basicConfig(filename='run.log', encoding='utf-8', level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) +import warnings + from pathlib import Path -# Set up the path to include common_utils module PATH = Path(__file__) sys.path.insert(0, str(Path( *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS -# Import necessary functions for project setup and model execution from set_path import setup_project_paths setup_project_paths(PATH) + from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging from execute_model_runs import execute_sweep_run, execute_single_run +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + GlobalCache["current_model"] = ModelPath.get_model_name_from_path(Path(__file__)) +except Exception: + pass +logger = setup_logging("run.log") + + if __name__ == "__main__": - # Parse command-line arguments - args = parse_args() + wandb.login() - # Validate the arguments to ensure they are correct + args = parse_args() validate_arguments(args) - # Log in to Weights & Biases (wandb) - wandb.login() - # Record the start time - start_t = time.time() - # Execute the model run based on the sweep flag + if args.sweep: - execute_sweep_run(args) # Execute sweep run + execute_sweep_run(args) else: - execute_single_run(args) # Execute single run - # Record the end time - end_t = time.time() - - # Calculate and print the runtime in minutes - minutes = (end_t - start_t) / 60 - logger.info(f'Done. Runtime: {minutes:.3f} minutes') + execute_single_run(args) """ return utils_script_gen.save_script(script_dir, code) diff --git a/meta_tools/templates/model/template_train_model.py b/meta_tools/templates/model/template_train_model.py new file mode 100644 index 00000000..2be82037 --- /dev/null +++ b/meta_tools/templates/model/template_train_model.py @@ -0,0 +1,53 @@ +from utils import utils_script_gen +from pathlib import Path + + +def generate(script_dir: Path) -> bool: + """ + Generates a Python script with a predefined template and saves it to the specified directory. + + The generated script includes functions to train a model artifact. It handles loading the data, + training the model, saving the model artifact, and creating log files. + + Args: + script_dir (Path): The directory where the generated script will be saved. + + Returns: + bool: True if the script was successfully saved, False otherwise. 
+ """ + + code = f"""from datetime import datetime +import pandas as pd +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_model +from set_partition import get_partitioner_dict +from views_forecasts.extensions import * + + +def train_model_artifact(config): + # print(config) + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + df_viewser = pd.read_pickle(path_raw / f"{{{{run_type}}}}_viewser_df.pkl") + + stepshift_model = stepshift_training(config, run_type, df_viewser) + if not config["sweep"]: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + model_filename = f"{{{{run_type}}}}_model_{{{{timestamp}}}}.pkl" + stepshift_model.save(path_artifacts / model_filename) + date_fetch_timestamp = read_log_file(path_raw / f"{{{{run_type}}}}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) + return stepshift_model + + +def stepshift_training(config, partition_name, dataset): + partitioner_dict = get_partitioner_dict(partition_name) + stepshift_model = get_model(config, partitioner_dict) + stepshift_model.fit(dataset) + return stepshift_model +""" + return utils_script_gen.save_script(script_dir, code) \ No newline at end of file diff --git a/meta_tools/templates/model/template_utils_run.py b/meta_tools/templates/model/template_utils_run.py new file mode 100644 index 00000000..65c8d12e --- /dev/null +++ b/meta_tools/templates/model/template_utils_run.py @@ -0,0 +1,104 @@ +from utils import utils_script_gen +from pathlib import Path + + +def generate(script_dir: Path) -> bool: + """ + Generates a Python script with a predefined template and saves it to the specified directory. + + The generated script includes functions to get a model based on the configuration, standardize a DataFrame, + split hurdle parameters, and update configurations for both single runs and sweeps. + + Args: + script_dir (Path): The directory where the generated script will be saved. + + Returns: + bool: True if the script was successfully saved, False otherwise. 
+ """ + + code = f""" +import numpy as np +from views_stepshifter_darts.stepshifter import StepshifterModel +from views_stepshifter_darts.hurdle_model import HurdleModel +from views_forecasts.extensions import * + + +def get_model(config, partitioner_dict): + \""" + Get the model based on the algorithm specified in the config + \""" + + if config["algorithm"] == "HurdleRegression": + model = HurdleModel(config, partitioner_dict) + else: + config["model_reg"] = config["algorithm"] + model = StepshifterModel(config, partitioner_dict) + + return model + + +def get_standardized_df(df, config): + \""" + Standardize the DataFrame based on the run type + \""" + + run_type = config["run_type"] + steps = config["steps"] + depvar = config["depvar"] + + # choose the columns to keep based on the run type and replace negative values with 0 + if run_type in ["calibration", "testing"]: + cols = [depvar] + df.forecasts.prediction_columns + elif run_type == "forecasting": + cols = [f"step_pred_{{{{i}}}}" for i in steps] + df = df.replace([np.inf, -np.inf], 0)[cols] + df = df.mask(df < 0, 0) + return df + + +def split_hurdle_parameters(parameters_dict): + \""" + Split the parameters dictionary into two separate dictionaries, one for the + classification model and one for the regression model. + \""" + + cls_dict = {{}} + reg_dict = {{}} + + for key, value in parameters_dict.items(): + if key.startswith("cls_"): + cls_key = key.replace("cls_", "") + cls_dict[cls_key] = value + elif key.startswith("reg_"): + reg_key = key.replace("reg_", "") + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config["run_type"] = args.run_type + config["sweep"] = False + config["name"] = meta_config["name"] + config["depvar"] = meta_config["depvar"] + config["algorithm"] = meta_config["algorithm"] + if meta_config["algorithm"] == "HurdleRegression": + config["model_clf"] = meta_config["model_clf"] + config["model_reg"] = meta_config["model_reg"] + config["deployment_status"] = dp_config["deployment_status"] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config["parameters"]["run_type"] = {{"value": args.run_type}} + sweep_config["parameters"]["sweep"] = {{"value": True}} + sweep_config["parameters"]["name"] = {{"value": meta_config["name"]}} + sweep_config["parameters"]["depvar"] = {{"value": meta_config["depvar"]}} + sweep_config["parameters"]["algorithm"] = {{"value": meta_config["algorithm"]}} + if meta_config["algorithm"] == "HurdleRegression": + sweep_config["parameters"]["model_clf"] = {{"value": meta_config["model_clf"]}} + sweep_config["parameters"]["model_reg"] = {{"value": meta_config["model_reg"]}} +""" + return utils_script_gen.save_script(script_dir, code) \ No newline at end of file From 4c65b6982aa13addb99e307928db9437691899f6 Mon Sep 17 00:00:00 2001 From: Dylan <52908667+smellycloud@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:51:21 +0100 Subject: [PATCH 2/4] cleanup --- meta_tools/model_scaffold_builder.py | 1 + meta_tools/templates/model/template_evaluate_model.py | 3 +-- meta_tools/templates/model/template_evaluate_sweep.py | 3 +-- meta_tools/templates/model/template_execute_model_runs.py | 3 +-- meta_tools/templates/model/template_execute_model_tasks.py | 3 +-- meta_tools/templates/model/template_generate_forecast.py | 3 +-- meta_tools/templates/model/template_get_data.py | 3 +-- meta_tools/templates/model/template_utils_run.py | 3 +-- 8 files changed, 8 
insertions(+), 14 deletions(-) diff --git a/meta_tools/model_scaffold_builder.py b/meta_tools/model_scaffold_builder.py index 0d69e0f0..4cc8d7d7 100644 --- a/meta_tools/model_scaffold_builder.py +++ b/meta_tools/model_scaffold_builder.py @@ -208,6 +208,7 @@ def build_model_scripts(self): script_dir=self._model.utils / "utils_run.py" ) # INFO: utils_outputs.py was not templated because it will probably be moved to common_utils in the future. + logging.info(f"Remember to update the queryset file at {self._model.queryset_path}!") def assess_model_directory(self) -> dict: diff --git a/meta_tools/templates/model/template_evaluate_model.py b/meta_tools/templates/model/template_evaluate_model.py index bdfdd47f..e0ce4a3d 100644 --- a/meta_tools/templates/model/template_evaluate_model.py +++ b/meta_tools/templates/model/template_evaluate_model.py @@ -17,8 +17,7 @@ def generate(script_dir: Path) -> bool: bool: True if the script was successfully saved, False otherwise. """ - code = f""" -from datetime import datetime + code = f"""from datetime import datetime import pandas as pd import logging from model_path import ModelPath diff --git a/meta_tools/templates/model/template_evaluate_sweep.py b/meta_tools/templates/model/template_evaluate_sweep.py index d4c254c3..805181a8 100644 --- a/meta_tools/templates/model/template_evaluate_sweep.py +++ b/meta_tools/templates/model/template_evaluate_sweep.py @@ -17,8 +17,7 @@ def generate(script_dir: Path) -> bool: bool: True if the script was successfully saved, False otherwise. """ - code = f""" -import pandas as pd + code = f"""import pandas as pd import wandb from sklearn.metrics import mean_squared_error from model_path import ModelPath diff --git a/meta_tools/templates/model/template_execute_model_runs.py b/meta_tools/templates/model/template_execute_model_runs.py index 95a142cd..1a59dddb 100644 --- a/meta_tools/templates/model/template_execute_model_runs.py +++ b/meta_tools/templates/model/template_execute_model_runs.py @@ -16,8 +16,7 @@ def generate(script_dir: Path) -> bool: bool: True if the script was successfully saved, False otherwise. """ - code = f""" -import wandb + code = f"""import wandb from config_deployment import get_deployment_config from config_hyperparameters import get_hp_config from config_meta import get_meta_config diff --git a/meta_tools/templates/model/template_execute_model_tasks.py b/meta_tools/templates/model/template_execute_model_tasks.py index a62cf057..67f9e212 100644 --- a/meta_tools/templates/model/template_execute_model_tasks.py +++ b/meta_tools/templates/model/template_execute_model_tasks.py @@ -17,8 +17,7 @@ def generate(script_dir: Path) -> bool: bool: True if the script was successfully saved, False otherwise. """ - code = f""" -import wandb + code = f"""import wandb import logging import time from evaluate_model import evaluate_model_artifact diff --git a/meta_tools/templates/model/template_generate_forecast.py b/meta_tools/templates/model/template_generate_forecast.py index cf513bc0..bd49882e 100644 --- a/meta_tools/templates/model/template_generate_forecast.py +++ b/meta_tools/templates/model/template_generate_forecast.py @@ -16,8 +16,7 @@ def generate(script_dir: Path) -> bool: bool: True if the script was successfully saved, False otherwise. 
""" - code = f""" -import pandas as pd + code = f"""import pandas as pd from datetime import datetime import logging from model_path import ModelPath diff --git a/meta_tools/templates/model/template_get_data.py b/meta_tools/templates/model/template_get_data.py index d2cf4bc8..fea091a5 100644 --- a/meta_tools/templates/model/template_get_data.py +++ b/meta_tools/templates/model/template_get_data.py @@ -13,8 +13,7 @@ def generate(script_dir: Path) -> bool: bool: True if the script was successfully saved, False otherwise. """ - code = f""" -import logging + code = f"""import logging from model_path import ModelPath from utils_dataloaders import fetch_or_load_views_df diff --git a/meta_tools/templates/model/template_utils_run.py b/meta_tools/templates/model/template_utils_run.py index 65c8d12e..2b84ed64 100644 --- a/meta_tools/templates/model/template_utils_run.py +++ b/meta_tools/templates/model/template_utils_run.py @@ -16,8 +16,7 @@ def generate(script_dir: Path) -> bool: bool: True if the script was successfully saved, False otherwise. """ - code = f""" -import numpy as np + code = f"""import numpy as np from views_stepshifter_darts.stepshifter import StepshifterModel from views_stepshifter_darts.hurdle_model import HurdleModel from views_forecasts.extensions import * From 3733aa73aad927f3232f495d07996de9b388e62b Mon Sep 17 00:00:00 2001 From: Dylan <52908667+smellycloud@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:46:00 +0100 Subject: [PATCH 3/4] add new templates for darts models --- common_querysets/queryset_meow_meow.py | 41 ------- meta_tools/model_scaffold_builder.py | 39 ++++++- .../model/template_evaluate_model.py | 77 +++++++++++++ .../model/template_evaluate_sweep.py | 50 +++++++++ .../model/template_execute_model_runs.py | 60 ++++++++++ .../model/template_execute_model_tasks.py | 92 ++++++++++++++++ .../model/template_generate_forecast.py | 68 ++++++++++++ .../templates/model/template_get_data.py | 36 ++++++ meta_tools/templates/model/template_main.py | 44 ++++---- .../templates/model/template_train_model.py | 53 +++++++++ .../templates/model/template_utils_run.py | 104 ++++++++++++++++++ 11 files changed, 595 insertions(+), 69 deletions(-) delete mode 100644 common_querysets/queryset_meow_meow.py create mode 100644 meta_tools/templates/model/template_evaluate_model.py create mode 100644 meta_tools/templates/model/template_evaluate_sweep.py create mode 100644 meta_tools/templates/model/template_execute_model_runs.py create mode 100644 meta_tools/templates/model/template_execute_model_tasks.py create mode 100644 meta_tools/templates/model/template_generate_forecast.py create mode 100644 meta_tools/templates/model/template_get_data.py create mode 100644 meta_tools/templates/model/template_train_model.py create mode 100644 meta_tools/templates/model/template_utils_run.py diff --git a/common_querysets/queryset_meow_meow.py b/common_querysets/queryset_meow_meow.py deleted file mode 100644 index 02b00ecd..00000000 --- a/common_querysets/queryset_meow_meow.py +++ /dev/null @@ -1,41 +0,0 @@ -from viewser import Queryset, Column - -def generate(): - """ - Contains the configuration for the input data in the form of a viewser queryset. That is the data from viewser that is used to train the model. - This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. 
- There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly. - - Returns: - - queryset_base (Queryset): A queryset containing the base data for the model training. - """ - - # VIEWSER 6, Example configuration. Modify as needed. - - queryset_base = (Queryset("meow_meow", "priogrid_month") - # Create a new column 'ln_sb_best' using data from 'priogrid_month' and 'ged_sb_best_count_nokgi' column - # Apply logarithmic transformation, handle missing values by replacing them with NA - .with_column(Column("ln_sb_best", from_loa="priogrid_month", from_column="ged_sb_best_count_nokgi") - .transform.ops.ln().transform.missing.replace_na()) - - # Create a new column 'ln_ns_best' using data from 'priogrid_month' and 'ged_ns_best_count_nokgi' column - # Apply logarithmic transformation, handle missing values by replacing them with NA - .with_column(Column("ln_ns_best", from_loa="priogrid_month", from_column="ged_ns_best_count_nokgi") - .transform.ops.ln().transform.missing.replace_na()) - - # Create a new column 'ln_os_best' using data from 'priogrid_month' and 'ged_os_best_count_nokgi' column - # Apply logarithmic transformation, handle missing values by replacing them with NA - .with_column(Column("ln_os_best", from_loa="priogrid_month", from_column="ged_os_best_count_nokgi") - .transform.ops.ln().transform.missing.replace_na()) - - # Create columns for month and year_id - .with_column(Column("month", from_loa="month", from_column="month")) - .with_column(Column("year_id", from_loa="country_year", from_column="year_id")) - - # Create columns for country_id, col, and row - .with_column(Column("c_id", from_loa="country_year", from_column="country_id")) - .with_column(Column("col", from_loa="priogrid", from_column="col")) - .with_column(Column("row", from_loa="priogrid", from_column="row")) - ) - - return queryset_base diff --git a/meta_tools/model_scaffold_builder.py b/meta_tools/model_scaffold_builder.py index 6fa48bb1..0d69e0f0 100644 --- a/meta_tools/model_scaffold_builder.py +++ b/meta_tools/model_scaffold_builder.py @@ -25,6 +25,14 @@ template_config_meta, template_config_sweep, template_main, + template_get_data, + template_execute_model_runs, + template_execute_model_tasks, + template_evaluate_model, + template_evaluate_sweep, + template_train_model, + template_utils_run, + template_generate_forecast ) logging.basicConfig(level=logging.INFO) @@ -152,7 +160,7 @@ def build_model_scripts(self): f"Model directory {self._model.model_dir} does not exist. Please call build_model_directory() first. Aborting script generation." 
) template_config_deployment.generate( - script_dir=self._model.model_dir / "configs/config_deployment.py" + script_dir=self._model.configs / "config_deployment.py" ) self._model_algorithm = str( input( @@ -160,7 +168,7 @@ def build_model_scripts(self): ) ) template_config_hyperparameters.generate( - script_dir=self._model.model_dir / "configs/config_hyperparameters.py", + script_dir=self._model.configs / "config_hyperparameters.py", model_algorithm=self._model_algorithm, ) template_config_input_data.generate( @@ -169,15 +177,38 @@ def build_model_scripts(self): model_name=self._model.model_name, ) template_config_meta.generate( - script_dir=self._model.model_dir / "configs/config_meta.py", + script_dir=self._model.configs / "config_meta.py", model_name=self._model.model_name, model_algorithm=self._model_algorithm, ) template_config_sweep.generate( - script_dir=self._model.model_dir / "configs/config_sweep.py", + script_dir=self._model.configs / "config_sweep.py", model_algorithm=self._model_algorithm, ) template_main.generate(script_dir=self._model.model_dir / "main.py") + template_get_data.generate(script_dir=self._model.dataloaders / "get_data.py") + template_generate_forecast.generate( + script_dir=self._model.forecasting / "generate_forecast.py" + ) + template_execute_model_runs.generate( + script_dir=self._model.management / "execute_model_runs.py") + template_execute_model_tasks.generate( + script_dir=self._model.management / "execute_model_tasks.py" + ) + template_evaluate_model.generate( + script_dir=self._model.offline_evaluation / "evaluate_model.py" + ) + template_evaluate_sweep.generate( + script_dir=self._model.offline_evaluation / "evaluate_sweep.py" + ) + template_train_model.generate( + script_dir=self._model.training / "train_model.py" + ) + template_utils_run.generate( + script_dir=self._model.utils / "utils_run.py" + ) + # INFO: utils_outputs.py was not templated because it will probably be moved to common_utils in the future. + def assess_model_directory(self) -> dict: """ diff --git a/meta_tools/templates/model/template_evaluate_model.py b/meta_tools/templates/model/template_evaluate_model.py new file mode 100644 index 00000000..bdfdd47f --- /dev/null +++ b/meta_tools/templates/model/template_evaluate_model.py @@ -0,0 +1,77 @@ +from utils import utils_script_gen +from pathlib import Path + + +def generate(script_dir: Path) -> bool: + """ + Generates a Python script with a predefined template and saves it to the specified directory. + + The generated script includes a function to evaluate a model artifact. It handles loading the model, + making predictions, standardizing the data, generating evaluation metrics, and saving the outputs. + It also logs relevant information using Weights & Biases (wandb). + + Args: + script_dir (Path): The directory where the generated script will be saved. + + Returns: + bool: True if the script was successfully saved, False otherwise. 
+ """ + + code = f""" +from datetime import datetime +import pandas as pd +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_outputs import save_model_outputs, save_predictions +from utils_run import get_standardized_df +from utils_artifacts import get_latest_model_artifact +from utils_evaluation_metrics import generate_metric_dict +from utils_model_outputs import generate_output_dict +from utils_wandb import log_wandb_log_dict +from views_forecasts.extensions import * + +logger = logging.getLogger(__name__) + +def evaluate_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. + # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {{artifact_name}}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + PATH_ARTIFACT = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({{run_type}}) specific artifact") + PATH_ARTIFACT = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = PATH_ARTIFACT.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{{run_type}}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(PATH_ARTIFACT) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {{PATH_ARTIFACT}}") + + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{{run_type}}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + _, df_output = generate_output_dict(df, config) + evaluation, df_evaluation = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) + + save_model_outputs(df_evaluation, df_output, path_generated, config) + save_predictions(df, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) +""" + return utils_script_gen.save_script(script_dir, code) \ No newline at end of file diff --git a/meta_tools/templates/model/template_evaluate_sweep.py b/meta_tools/templates/model/template_evaluate_sweep.py new file mode 100644 index 00000000..d4c254c3 --- /dev/null +++ b/meta_tools/templates/model/template_evaluate_sweep.py @@ -0,0 +1,50 @@ +from utils import utils_script_gen +from pathlib import Path + + +def generate(script_dir: Path) -> bool: + """ + Generates a Python script with a predefined template and saves it to the specified directory. + + The generated script includes a function to evaluate a sweep of model runs. It handles loading the model, + making predictions, standardizing the data, calculating the mean squared error (MSE), generating evaluation metrics, + and logging the results using Weights & Biases (wandb). + + Args: + script_dir (Path): The directory where the generated script will be saved. + + Returns: + bool: True if the script was successfully saved, False otherwise. 
+ """ + + code = f""" +import pandas as pd +import wandb +from sklearn.metrics import mean_squared_error +from model_path import ModelPath +from utils_run import get_standardized_df +from utils_wandb import log_wandb_log_dict +from utils_evaluation_metrics import generate_metric_dict + + +def evaluate_sweep(config, stepshift_model): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + run_type = config["run_type"] + steps = config["steps"] + + df_viewser = pd.read_pickle(path_raw / f"{{{{run_type}}}}_viewser_df.pkl") + df = stepshift_model.predict(run_type, df_viewser) + df = get_standardized_df(df, config) + + # Temporarily keep this because the metric to minimize is MSE + pred_cols = [f"step_pred_{{{{str(i)}}}}" for i in steps] + df["mse"] = df.apply(lambda row: mean_squared_error([row[config["depvar"]]] * 36, + [row[col] for col in pred_cols]), axis=1) + + wandb.log({{"MSE": df["mse"].mean()}}) + + evaluation, _ = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) +""" + return utils_script_gen.save_script(script_dir, code) \ No newline at end of file diff --git a/meta_tools/templates/model/template_execute_model_runs.py b/meta_tools/templates/model/template_execute_model_runs.py new file mode 100644 index 00000000..95a142cd --- /dev/null +++ b/meta_tools/templates/model/template_execute_model_runs.py @@ -0,0 +1,60 @@ +from utils import utils_script_gen +from pathlib import Path + + +def generate(script_dir: Path) -> bool: + """ + Generates a Python script with a predefined template and saves it to the specified directory. + + The generated script includes functions to execute model runs, either as a sweep or a single run. + It uses configurations for deployment, hyperparameters, meta, and sweep, and integrates with Weights & Biases (wandb) for experiment tracking. + + Args: + script_dir (Path): The directory where the generated script will be saved. + + Returns: + bool: True if the script was successfully saved, False otherwise. 
+ """ + + code = f""" +import wandb +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + sweep_config = get_sweep_config() + meta_config = get_meta_config() + update_sweep_config(sweep_config, args, meta_config) + + get_data(args, sweep_config["name"]) + + project = f"{{sweep_config['name']}}_sweep" # we can name the sweep in the config file + sweep_id = wandb.sweep(sweep_config, project=project, entity="views_pipeline") + wandb.agent(sweep_id, execute_model_tasks, entity="views_pipeline") + + +def execute_single_run(args): + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + get_data(args, config["name"]) + + project = f"{{config['name']}}_{{args.run_type}}" + + if args.run_type == "calibration" or args.run_type == "testing": + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == "forecasting": + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) +""" + return utils_script_gen.save_script(script_dir, code) \ No newline at end of file diff --git a/meta_tools/templates/model/template_execute_model_tasks.py b/meta_tools/templates/model/template_execute_model_tasks.py new file mode 100644 index 00000000..a62cf057 --- /dev/null +++ b/meta_tools/templates/model/template_execute_model_tasks.py @@ -0,0 +1,92 @@ +from utils import utils_script_gen +from pathlib import Path + + +def generate(script_dir: Path) -> bool: + """ + Generates a Python script with a predefined template and saves it to the specified directory. + + The generated script includes a function to execute various model-related tasks such as training, + evaluation, and forecasting. It integrates with Weights & Biases (wandb) for experiment tracking + and logging. + + Args: + script_dir (Path): The directory where the generated script will be saved. + + Returns: + bool: True if the script was successfully saved, False otherwise. + """ + + code = f""" +import wandb +import logging +import time +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + +def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None): + \""" + Executes various model-related tasks including training, evaluation, and forecasting. + + This function manages the execution of different tasks such as training the model, + evaluating an existing model, or performing forecasting. + It also initializes the WandB project. + + Args: + config: Configuration object containing parameters and settings. + project: The WandB project name. + train: Flag to indicate if the model should be trained. + eval: Flag to indicate if the model should be evaluated. + forecast: Flag to indicate if forecasting should be performed. 
+ artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting. + \""" + + start_t = time.time() + + # Initialize WandB + with wandb.init(project=project, entity="views_pipeline", + config=config): # project and config ignored when running a sweep + + # add the monthly metrics to WandB + add_wandb_monthly_metrics() + + # Update config from WandB initialization above + config = wandb.config + + # W&B does not directly support nested dictionaries for hyperparameters + # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs + if config["sweep"] and config["algorithm"] == "HurdleRegression": + config["parameters"] = {{}} + config["parameters"]["clf"], config["parameters"]["reg"] = split_hurdle_parameters(config) + + if config["sweep"]: + logger.info(f"Sweeping model {{config['name']}}...") + stepshift_model = train_model_artifact(config) + logger.info(f"Evaluating model {{config['name']}}...") + evaluate_sweep(config, stepshift_model) + + # Handle the single model runs: train and save the model as an artifact + if train: + logger.info(f"Training model {{config['name']}}...") + train_model_artifact(config) + + # Handle the single model runs: evaluate a trained model (artifact) + if eval: + logger.info(f"Evaluating model {{config['name']}}...") + evaluate_model_artifact(config, artifact_name) + + if forecast: + logger.info(f"Forecasting model {{config['name']}}...") + forecast_model_artifact(config, artifact_name) + + end_t = time.time() + minutes = (end_t - start_t) / 60 + logger.info(f"Done. Runtime: {{minutes:.3f}} minutes.\\n") +""" + return utils_script_gen.save_script(script_dir, code) \ No newline at end of file diff --git a/meta_tools/templates/model/template_generate_forecast.py b/meta_tools/templates/model/template_generate_forecast.py new file mode 100644 index 00000000..cf513bc0 --- /dev/null +++ b/meta_tools/templates/model/template_generate_forecast.py @@ -0,0 +1,68 @@ +from utils import utils_script_gen +from pathlib import Path + + +def generate(script_dir: Path) -> bool: + """ + Generates a Python script with a predefined template and saves it to the specified directory. + + The generated script includes a function to forecast using a model artifact. It handles loading the model, + making predictions, standardizing the data, and saving the predictions and log files. + + Args: + script_dir (Path): The directory where the generated script will be saved. + + Returns: + bool: True if the script was successfully saved, False otherwise. + """ + + code = f""" +import pandas as pd +from datetime import datetime +import logging +from model_path import ModelPath +from utils_log_files import create_log_file, read_log_file +from utils_run import get_standardized_df +from utils_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + model_path = ModelPath(config["name"]) + path_raw = model_path.data_raw + path_generated = model_path.data_generated + path_artifacts = model_path.artifacts + run_type = config["run_type"] + + # if an artifact name is provided through the CLI, use it. 
+ # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {{{{artifact_name}}}}") + + if not artifact_name.endswith(".pkl"): + artifact_name += ".pkl" + path_artifact = path_artifacts / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({{{{run_type}}}}) specific artifact") + path_artifact = get_latest_model_artifact(path_artifacts, run_type) + + config["timestamp"] = path_artifact.stem[-15:] + df_viewser = pd.read_pickle(path_raw / f"{{{{run_type}}}}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(path_artifact) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {{{{path_artifact}}}}") + + df_predictions = stepshift_model.predict(run_type, df_viewser) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{{{{run_type}}}}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + + save_predictions(df_predictions, path_generated, config) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) +""" + return utils_script_gen.save_script(script_dir, code) \ No newline at end of file diff --git a/meta_tools/templates/model/template_get_data.py b/meta_tools/templates/model/template_get_data.py new file mode 100644 index 00000000..d2cf4bc8 --- /dev/null +++ b/meta_tools/templates/model/template_get_data.py @@ -0,0 +1,36 @@ +from utils import utils_script_gen +from pathlib import Path + + +def generate(script_dir: Path) -> bool: + """ + Generates a Python script with a predefined template and saves it to the specified directory. + + Args: + script_dir (Path): The directory where the generated script will be saved. + + Returns: + bool: True if the script was successfully saved, False otherwise. + """ + + code = f""" +import logging +from model_path import ModelPath +from utils_dataloaders import fetch_or_load_views_df + +logger = logging.getLogger(__name__) + +def get_data(args, model_name): + model_path = ModelPath(model_name) + path_raw = model_path.data_raw + + data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, use_saved=args.saved) + logger.debug(f"DataFrame shape: {{data.shape if data is not None else 'None'}}") + + for ialert, alert in enumerate(str(alerts).strip('[').strip(']').split('Input')): + if 'offender' in alert: + logger.warning({{f"{{args.run_type}} data alert {{ialert}}": str(alert)}}) + + return data +""" + return utils_script_gen.save_script(script_dir, code) \ No newline at end of file diff --git a/meta_tools/templates/model/template_main.py b/meta_tools/templates/model/template_main.py index c41e5eed..4706a03a 100644 --- a/meta_tools/templates/model/template_main.py +++ b/meta_tools/templates/model/template_main.py @@ -33,44 +33,40 @@ def generate(script_dir: Path) -> bool: specified script directory. - The generated script is designed to be executed as a standalone Python script. 
""" - code = """import time -import wandb + code = """import wandb import sys -import logging -logging.basicConfig(filename='run.log', encoding='utf-8', level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) +import warnings + from pathlib import Path -# Set up the path to include common_utils module PATH = Path(__file__) sys.path.insert(0, str(Path( *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS -# Import necessary functions for project setup and model execution from set_path import setup_project_paths setup_project_paths(PATH) + from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging from execute_model_runs import execute_sweep_run, execute_single_run +warnings.filterwarnings("ignore") +try: + from common_utils.model_path import ModelPath + from common_utils.global_cache import GlobalCache + GlobalCache["current_model"] = ModelPath.get_model_name_from_path(Path(__file__)) +except Exception: + pass +logger = setup_logging("run.log") + + if __name__ == "__main__": - # Parse command-line arguments - args = parse_args() + wandb.login() - # Validate the arguments to ensure they are correct + args = parse_args() validate_arguments(args) - # Log in to Weights & Biases (wandb) - wandb.login() - # Record the start time - start_t = time.time() - # Execute the model run based on the sweep flag + if args.sweep: - execute_sweep_run(args) # Execute sweep run + execute_sweep_run(args) else: - execute_single_run(args) # Execute single run - # Record the end time - end_t = time.time() - - # Calculate and print the runtime in minutes - minutes = (end_t - start_t) / 60 - logger.info(f'Done. Runtime: {minutes:.3f} minutes') + execute_single_run(args) """ return utils_script_gen.save_script(script_dir, code) diff --git a/meta_tools/templates/model/template_train_model.py b/meta_tools/templates/model/template_train_model.py new file mode 100644 index 00000000..2be82037 --- /dev/null +++ b/meta_tools/templates/model/template_train_model.py @@ -0,0 +1,53 @@ +from utils import utils_script_gen +from pathlib import Path + + +def generate(script_dir: Path) -> bool: + """ + Generates a Python script with a predefined template and saves it to the specified directory. + + The generated script includes functions to train a model artifact. It handles loading the data, + training the model, saving the model artifact, and creating log files. + + Args: + script_dir (Path): The directory where the generated script will be saved. + + Returns: + bool: True if the script was successfully saved, False otherwise. 
+    """
+
+    code = f"""from datetime import datetime
+import pandas as pd
+from model_path import ModelPath
+from utils_log_files import create_log_file, read_log_file
+from utils_run import get_model
+from set_partition import get_partitioner_dict
+from views_forecasts.extensions import *
+
+
+def train_model_artifact(config):
+    # print(config)
+    model_path = ModelPath(config["name"])
+    path_raw = model_path.data_raw
+    path_generated = model_path.data_generated
+    path_artifacts = model_path.artifacts
+    run_type = config["run_type"]
+    df_viewser = pd.read_pickle(path_raw / f"{{run_type}}_viewser_df.pkl")
+
+    stepshift_model = stepshift_training(config, run_type, df_viewser)
+    if not config["sweep"]:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        model_filename = f"{{run_type}}_model_{{timestamp}}.pkl"
+        stepshift_model.save(path_artifacts / model_filename)
+        date_fetch_timestamp = read_log_file(path_raw / f"{{run_type}}_data_fetch_log.txt").get("Data Fetch Timestamp", None)
+        create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp)
+    return stepshift_model
+
+
+def stepshift_training(config, partition_name, dataset):
+    partitioner_dict = get_partitioner_dict(partition_name)
+    stepshift_model = get_model(config, partitioner_dict)
+    stepshift_model.fit(dataset)
+    return stepshift_model
+"""
+    return utils_script_gen.save_script(script_dir, code)
\ No newline at end of file
diff --git a/meta_tools/templates/model/template_utils_run.py b/meta_tools/templates/model/template_utils_run.py
new file mode 100644
index 00000000..65c8d12e
--- /dev/null
+++ b/meta_tools/templates/model/template_utils_run.py
@@ -0,0 +1,104 @@
+from utils import utils_script_gen
+from pathlib import Path
+
+
+def generate(script_dir: Path) -> bool:
+    """
+    Generates a Python script with a predefined template and saves it to the specified directory.
+
+    The generated script includes functions to get a model based on the configuration, standardize a DataFrame,
+    split hurdle parameters, and update configurations for both single runs and sweeps.
+
+    Args:
+        script_dir (Path): The directory where the generated script will be saved.
+
+    Returns:
+        bool: True if the script was successfully saved, False otherwise.
+    """
+
+    code = f"""
+import numpy as np
+from views_stepshifter_darts.stepshifter import StepshifterModel
+from views_stepshifter_darts.hurdle_model import HurdleModel
+from views_forecasts.extensions import *
+
+
+def get_model(config, partitioner_dict):
+    \"""
+    Get the model based on the algorithm specified in the config
+    \"""
+
+    if config["algorithm"] == "HurdleRegression":
+        model = HurdleModel(config, partitioner_dict)
+    else:
+        config["model_reg"] = config["algorithm"]
+        model = StepshifterModel(config, partitioner_dict)
+
+    return model
+
+
+def get_standardized_df(df, config):
+    \"""
+    Standardize the DataFrame based on the run type
+    \"""
+
+    run_type = config["run_type"]
+    steps = config["steps"]
+    depvar = config["depvar"]
+
+    # choose the columns to keep based on the run type and replace negative values with 0
+    if run_type in ["calibration", "testing"]:
+        cols = [depvar] + df.forecasts.prediction_columns
+    elif run_type == "forecasting":
+        cols = [f"step_pred_{{i}}" for i in steps]
+    df = df.replace([np.inf, -np.inf], 0)[cols]
+    df = df.mask(df < 0, 0)
+    return df
+
+
+def split_hurdle_parameters(parameters_dict):
+    \"""
+    Split the parameters dictionary into two separate dictionaries, one for the
+    classification model and one for the regression model.
+    \"""
+
+    cls_dict = {{}}
+    reg_dict = {{}}
+
+    for key, value in parameters_dict.items():
+        if key.startswith("cls_"):
+            cls_key = key.replace("cls_", "")
+            cls_dict[cls_key] = value
+        elif key.startswith("reg_"):
+            reg_key = key.replace("reg_", "")
+            reg_dict[reg_key] = value
+
+    return cls_dict, reg_dict
+
+
+def update_config(hp_config, meta_config, dp_config, args):
+    config = hp_config.copy()
+    config["run_type"] = args.run_type
+    config["sweep"] = False
+    config["name"] = meta_config["name"]
+    config["depvar"] = meta_config["depvar"]
+    config["algorithm"] = meta_config["algorithm"]
+    if meta_config["algorithm"] == "HurdleRegression":
+        config["model_clf"] = meta_config["model_clf"]
+        config["model_reg"] = meta_config["model_reg"]
+    config["deployment_status"] = dp_config["deployment_status"]
+
+    return config
+
+
+def update_sweep_config(sweep_config, args, meta_config):
+    sweep_config["parameters"]["run_type"] = {{"value": args.run_type}}
+    sweep_config["parameters"]["sweep"] = {{"value": True}}
+    sweep_config["parameters"]["name"] = {{"value": meta_config["name"]}}
+    sweep_config["parameters"]["depvar"] = {{"value": meta_config["depvar"]}}
+    sweep_config["parameters"]["algorithm"] = {{"value": meta_config["algorithm"]}}
+    if meta_config["algorithm"] == "HurdleRegression":
+        sweep_config["parameters"]["model_clf"] = {{"value": meta_config["model_clf"]}}
+        sweep_config["parameters"]["model_reg"] = {{"value": meta_config["model_reg"]}}
+"""
+    return utils_script_gen.save_script(script_dir, code)
\ No newline at end of file
From 73b1c094ad77cebb973ddf0def3f98651cb0171f Mon Sep 17 00:00:00 2001
From: Dylan <52908667+smellycloud@users.noreply.github.com>
Date: Wed, 30 Oct 2024 14:51:21 +0100
Subject: [PATCH 4/4] cleanup

---
 meta_tools/model_scaffold_builder.py | 1 +
 meta_tools/templates/model/template_evaluate_model.py | 3 +--
 meta_tools/templates/model/template_evaluate_sweep.py | 3 +--
 meta_tools/templates/model/template_execute_model_runs.py | 3 +--
 meta_tools/templates/model/template_execute_model_tasks.py | 3 +--
 meta_tools/templates/model/template_generate_forecast.py | 3 +--
 meta_tools/templates/model/template_get_data.py | 3 +--
 meta_tools/templates/model/template_utils_run.py | 3 +--
 8 files changed, 8
insertions(+), 14 deletions(-) diff --git a/meta_tools/model_scaffold_builder.py b/meta_tools/model_scaffold_builder.py index 0d69e0f0..4cc8d7d7 100644 --- a/meta_tools/model_scaffold_builder.py +++ b/meta_tools/model_scaffold_builder.py @@ -208,6 +208,7 @@ def build_model_scripts(self): script_dir=self._model.utils / "utils_run.py" ) # INFO: utils_outputs.py was not templated because it will probably be moved to common_utils in the future. + logging.info(f"Remember to update the queryset file at {self._model.queryset_path}!") def assess_model_directory(self) -> dict: diff --git a/meta_tools/templates/model/template_evaluate_model.py b/meta_tools/templates/model/template_evaluate_model.py index bdfdd47f..e0ce4a3d 100644 --- a/meta_tools/templates/model/template_evaluate_model.py +++ b/meta_tools/templates/model/template_evaluate_model.py @@ -17,8 +17,7 @@ def generate(script_dir: Path) -> bool: bool: True if the script was successfully saved, False otherwise. """ - code = f""" -from datetime import datetime + code = f"""from datetime import datetime import pandas as pd import logging from model_path import ModelPath diff --git a/meta_tools/templates/model/template_evaluate_sweep.py b/meta_tools/templates/model/template_evaluate_sweep.py index d4c254c3..805181a8 100644 --- a/meta_tools/templates/model/template_evaluate_sweep.py +++ b/meta_tools/templates/model/template_evaluate_sweep.py @@ -17,8 +17,7 @@ def generate(script_dir: Path) -> bool: bool: True if the script was successfully saved, False otherwise. """ - code = f""" -import pandas as pd + code = f"""import pandas as pd import wandb from sklearn.metrics import mean_squared_error from model_path import ModelPath diff --git a/meta_tools/templates/model/template_execute_model_runs.py b/meta_tools/templates/model/template_execute_model_runs.py index 95a142cd..1a59dddb 100644 --- a/meta_tools/templates/model/template_execute_model_runs.py +++ b/meta_tools/templates/model/template_execute_model_runs.py @@ -16,8 +16,7 @@ def generate(script_dir: Path) -> bool: bool: True if the script was successfully saved, False otherwise. """ - code = f""" -import wandb + code = f"""import wandb from config_deployment import get_deployment_config from config_hyperparameters import get_hp_config from config_meta import get_meta_config diff --git a/meta_tools/templates/model/template_execute_model_tasks.py b/meta_tools/templates/model/template_execute_model_tasks.py index a62cf057..67f9e212 100644 --- a/meta_tools/templates/model/template_execute_model_tasks.py +++ b/meta_tools/templates/model/template_execute_model_tasks.py @@ -17,8 +17,7 @@ def generate(script_dir: Path) -> bool: bool: True if the script was successfully saved, False otherwise. """ - code = f""" -import wandb + code = f"""import wandb import logging import time from evaluate_model import evaluate_model_artifact diff --git a/meta_tools/templates/model/template_generate_forecast.py b/meta_tools/templates/model/template_generate_forecast.py index cf513bc0..bd49882e 100644 --- a/meta_tools/templates/model/template_generate_forecast.py +++ b/meta_tools/templates/model/template_generate_forecast.py @@ -16,8 +16,7 @@ def generate(script_dir: Path) -> bool: bool: True if the script was successfully saved, False otherwise. 
""" - code = f""" -import pandas as pd + code = f"""import pandas as pd from datetime import datetime import logging from model_path import ModelPath diff --git a/meta_tools/templates/model/template_get_data.py b/meta_tools/templates/model/template_get_data.py index d2cf4bc8..fea091a5 100644 --- a/meta_tools/templates/model/template_get_data.py +++ b/meta_tools/templates/model/template_get_data.py @@ -13,8 +13,7 @@ def generate(script_dir: Path) -> bool: bool: True if the script was successfully saved, False otherwise. """ - code = f""" -import logging + code = f"""import logging from model_path import ModelPath from utils_dataloaders import fetch_or_load_views_df diff --git a/meta_tools/templates/model/template_utils_run.py b/meta_tools/templates/model/template_utils_run.py index 65c8d12e..2b84ed64 100644 --- a/meta_tools/templates/model/template_utils_run.py +++ b/meta_tools/templates/model/template_utils_run.py @@ -16,8 +16,7 @@ def generate(script_dir: Path) -> bool: bool: True if the script was successfully saved, False otherwise. """ - code = f""" -import numpy as np + code = f"""import numpy as np from views_stepshifter_darts.stepshifter import StepshifterModel from views_stepshifter_darts.hurdle_model import HurdleModel from views_forecasts.extensions import *