From eacf039f27ff03672eb7ba047483d4883171dc63 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Tue, 29 Oct 2024 15:59:40 +0100 Subject: [PATCH 1/4] Remove validate=False --- ensembles/cruel_summer/src/utils/utils_checks.py | 2 +- ensembles/cruel_summer/src/utils/utils_run.py | 2 +- ensembles/white_mustang/src/utils/utils_checks.py | 2 +- ensembles/white_mustang/src/utils/utils_run.py | 2 +- models/blank_space/src/dataloaders/get_data.py | 2 +- models/blank_space/src/offline_evaluation/evaluate_sweep.py | 2 +- models/electric_relaxation/src/dataloaders/get_data.py | 2 +- .../src/offline_evaluation/evaluate_sweep.py | 2 +- models/lavender_haze/src/dataloaders/get_data.py | 2 +- models/lavender_haze/src/offline_evaluation/evaluate_sweep.py | 2 +- models/old_money/src/dataloaders/get_data.py | 2 +- models/old_money/src/offline_evaluation/evaluate_sweep.py | 2 +- models/orange_pasta/src/dataloaders/get_data.py | 2 +- models/orange_pasta/src/offline_evaluation/evaluate_sweep.py | 2 +- models/wildest_dream/src/dataloaders/get_data.py | 2 +- models/wildest_dream/src/offline_evaluation/evaluate_sweep.py | 2 +- models/yellow_pikachu/src/dataloaders/get_data.py | 2 +- models/yellow_pikachu/src/offline_evaluation/evaluate_sweep.py | 2 +- 18 files changed, 18 insertions(+), 18 deletions(-) diff --git a/ensembles/cruel_summer/src/utils/utils_checks.py b/ensembles/cruel_summer/src/utils/utils_checks.py index 264ceac4..87a8ac97 100644 --- a/ensembles/cruel_summer/src/utils/utils_checks.py +++ b/ensembles/cruel_summer/src/utils/utils_checks.py @@ -111,7 +111,7 @@ def ensemble_model_check(config): """ for model_name in config["models"]: - model_path = ModelPath(model_name, validate=False) + model_path = ModelPath(model_name) path_generated = model_path.data_generated if ( diff --git a/ensembles/cruel_summer/src/utils/utils_run.py b/ensembles/cruel_summer/src/utils/utils_run.py index df9b07a4..093191e7 100644 --- 
a/ensembles/cruel_summer/src/utils/utils_run.py +++ b/ensembles/cruel_summer/src/utils/utils_run.py @@ -65,7 +65,7 @@ def update_config(hp_config, meta_config, dp_config, args): def get_single_model_config(model_name): - model_path = ModelPath(model_name, validate=False) + model_path = ModelPath(model_name) hp_config = runpy.run_path(model_path.configs / "config_hyperparameters.py")["get_hp_config"]() meta_config = runpy.run_path(model_path.configs / "config_meta.py")["get_meta_config"]() dp_config = runpy.run_path(model_path.configs / "config_deployment.py")["get_deployment_config"]() diff --git a/ensembles/white_mustang/src/utils/utils_checks.py b/ensembles/white_mustang/src/utils/utils_checks.py index 264ceac4..87a8ac97 100644 --- a/ensembles/white_mustang/src/utils/utils_checks.py +++ b/ensembles/white_mustang/src/utils/utils_checks.py @@ -111,7 +111,7 @@ def ensemble_model_check(config): """ for model_name in config["models"]: - model_path = ModelPath(model_name, validate=False) + model_path = ModelPath(model_name) path_generated = model_path.data_generated if ( diff --git a/ensembles/white_mustang/src/utils/utils_run.py b/ensembles/white_mustang/src/utils/utils_run.py index df9b07a4..093191e7 100644 --- a/ensembles/white_mustang/src/utils/utils_run.py +++ b/ensembles/white_mustang/src/utils/utils_run.py @@ -65,7 +65,7 @@ def update_config(hp_config, meta_config, dp_config, args): def get_single_model_config(model_name): - model_path = ModelPath(model_name, validate=False) + model_path = ModelPath(model_name) hp_config = runpy.run_path(model_path.configs / "config_hyperparameters.py")["get_hp_config"]() meta_config = runpy.run_path(model_path.configs / "config_meta.py")["get_meta_config"]() dp_config = runpy.run_path(model_path.configs / "config_deployment.py")["get_deployment_config"]() diff --git a/models/blank_space/src/dataloaders/get_data.py b/models/blank_space/src/dataloaders/get_data.py index 9f7e248f..34e47eb0 100644 --- 
a/models/blank_space/src/dataloaders/get_data.py +++ b/models/blank_space/src/dataloaders/get_data.py @@ -6,7 +6,7 @@ def get_data(args, model_name): - model_path = ModelPath(model_name, validate=False) + model_path = ModelPath(model_name) path_raw = model_path.data_raw data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, use_saved=args.saved) diff --git a/models/blank_space/src/offline_evaluation/evaluate_sweep.py b/models/blank_space/src/offline_evaluation/evaluate_sweep.py index 029a3496..681fe506 100644 --- a/models/blank_space/src/offline_evaluation/evaluate_sweep.py +++ b/models/blank_space/src/offline_evaluation/evaluate_sweep.py @@ -8,7 +8,7 @@ def evaluate_sweep(config, stepshift_model): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw run_type = config["run_type"] steps = config["steps"] diff --git a/models/electric_relaxation/src/dataloaders/get_data.py b/models/electric_relaxation/src/dataloaders/get_data.py index 9f7e248f..34e47eb0 100644 --- a/models/electric_relaxation/src/dataloaders/get_data.py +++ b/models/electric_relaxation/src/dataloaders/get_data.py @@ -6,7 +6,7 @@ def get_data(args, model_name): - model_path = ModelPath(model_name, validate=False) + model_path = ModelPath(model_name) path_raw = model_path.data_raw data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, use_saved=args.saved) diff --git a/models/electric_relaxation/src/offline_evaluation/evaluate_sweep.py b/models/electric_relaxation/src/offline_evaluation/evaluate_sweep.py index 029a3496..681fe506 100644 --- a/models/electric_relaxation/src/offline_evaluation/evaluate_sweep.py +++ b/models/electric_relaxation/src/offline_evaluation/evaluate_sweep.py @@ -8,7 +8,7 @@ def evaluate_sweep(config, stepshift_model): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw run_type = 
config["run_type"] steps = config["steps"] diff --git a/models/lavender_haze/src/dataloaders/get_data.py b/models/lavender_haze/src/dataloaders/get_data.py index 9f7e248f..34e47eb0 100644 --- a/models/lavender_haze/src/dataloaders/get_data.py +++ b/models/lavender_haze/src/dataloaders/get_data.py @@ -6,7 +6,7 @@ def get_data(args, model_name): - model_path = ModelPath(model_name, validate=False) + model_path = ModelPath(model_name) path_raw = model_path.data_raw data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, use_saved=args.saved) diff --git a/models/lavender_haze/src/offline_evaluation/evaluate_sweep.py b/models/lavender_haze/src/offline_evaluation/evaluate_sweep.py index 029a3496..681fe506 100644 --- a/models/lavender_haze/src/offline_evaluation/evaluate_sweep.py +++ b/models/lavender_haze/src/offline_evaluation/evaluate_sweep.py @@ -8,7 +8,7 @@ def evaluate_sweep(config, stepshift_model): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw run_type = config["run_type"] steps = config["steps"] diff --git a/models/old_money/src/dataloaders/get_data.py b/models/old_money/src/dataloaders/get_data.py index 9f7e248f..34e47eb0 100644 --- a/models/old_money/src/dataloaders/get_data.py +++ b/models/old_money/src/dataloaders/get_data.py @@ -6,7 +6,7 @@ def get_data(args, model_name): - model_path = ModelPath(model_name, validate=False) + model_path = ModelPath(model_name) path_raw = model_path.data_raw data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, use_saved=args.saved) diff --git a/models/old_money/src/offline_evaluation/evaluate_sweep.py b/models/old_money/src/offline_evaluation/evaluate_sweep.py index fc926a12..d6726cf0 100644 --- a/models/old_money/src/offline_evaluation/evaluate_sweep.py +++ b/models/old_money/src/offline_evaluation/evaluate_sweep.py @@ -8,7 +8,7 @@ def evaluate_sweep(config, stepshift_model): - model_path = 
ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw run_type = config["run_type"] steps = config["steps"] diff --git a/models/orange_pasta/src/dataloaders/get_data.py b/models/orange_pasta/src/dataloaders/get_data.py index 9f7e248f..34e47eb0 100644 --- a/models/orange_pasta/src/dataloaders/get_data.py +++ b/models/orange_pasta/src/dataloaders/get_data.py @@ -6,7 +6,7 @@ def get_data(args, model_name): - model_path = ModelPath(model_name, validate=False) + model_path = ModelPath(model_name) path_raw = model_path.data_raw data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, use_saved=args.saved) diff --git a/models/orange_pasta/src/offline_evaluation/evaluate_sweep.py b/models/orange_pasta/src/offline_evaluation/evaluate_sweep.py index fc926a12..d6726cf0 100644 --- a/models/orange_pasta/src/offline_evaluation/evaluate_sweep.py +++ b/models/orange_pasta/src/offline_evaluation/evaluate_sweep.py @@ -8,7 +8,7 @@ def evaluate_sweep(config, stepshift_model): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw run_type = config["run_type"] steps = config["steps"] diff --git a/models/wildest_dream/src/dataloaders/get_data.py b/models/wildest_dream/src/dataloaders/get_data.py index 9f7e248f..34e47eb0 100644 --- a/models/wildest_dream/src/dataloaders/get_data.py +++ b/models/wildest_dream/src/dataloaders/get_data.py @@ -6,7 +6,7 @@ def get_data(args, model_name): - model_path = ModelPath(model_name, validate=False) + model_path = ModelPath(model_name) path_raw = model_path.data_raw data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, use_saved=args.saved) diff --git a/models/wildest_dream/src/offline_evaluation/evaluate_sweep.py b/models/wildest_dream/src/offline_evaluation/evaluate_sweep.py index 029a3496..681fe506 100644 --- a/models/wildest_dream/src/offline_evaluation/evaluate_sweep.py +++ 
b/models/wildest_dream/src/offline_evaluation/evaluate_sweep.py @@ -8,7 +8,7 @@ def evaluate_sweep(config, stepshift_model): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw run_type = config["run_type"] steps = config["steps"] diff --git a/models/yellow_pikachu/src/dataloaders/get_data.py b/models/yellow_pikachu/src/dataloaders/get_data.py index 9f7e248f..34e47eb0 100644 --- a/models/yellow_pikachu/src/dataloaders/get_data.py +++ b/models/yellow_pikachu/src/dataloaders/get_data.py @@ -6,7 +6,7 @@ def get_data(args, model_name): - model_path = ModelPath(model_name, validate=False) + model_path = ModelPath(model_name) path_raw = model_path.data_raw data, alerts = fetch_or_load_views_df(model_name, args.run_type, path_raw, use_saved=args.saved) diff --git a/models/yellow_pikachu/src/offline_evaluation/evaluate_sweep.py b/models/yellow_pikachu/src/offline_evaluation/evaluate_sweep.py index 029a3496..681fe506 100644 --- a/models/yellow_pikachu/src/offline_evaluation/evaluate_sweep.py +++ b/models/yellow_pikachu/src/offline_evaluation/evaluate_sweep.py @@ -8,7 +8,7 @@ def evaluate_sweep(config, stepshift_model): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw run_type = config["run_type"] steps = config["steps"] From 5b3eed3183f762c21ec6919647956ef5eb0e3b7c Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Tue, 29 Oct 2024 16:04:39 +0100 Subject: [PATCH 2/4] log data fetch timestamp --- common_utils/utils_dataloaders.py | 8 +++- common_utils/utils_log_files.py | 43 ++++++++++++++----- .../src/forecasting/generate_forecast.py | 13 +++--- .../offline_evaluation/evaluate_ensemble.py | 15 ++++--- .../src/training/train_ensemble.py | 7 +-- .../src/forecasting/generate_forecast.py | 13 +++--- .../offline_evaluation/evaluate_ensemble.py | 15 ++++--- 
.../src/training/train_ensemble.py | 7 +-- .../src/forecasting/generate_forecast.py | 7 +-- .../src/offline_evaluation/evaluate_model.py | 7 +-- .../blank_space/src/training/train_model.py | 7 +-- .../src/forecasting/generate_forecast.py | 7 +-- .../src/offline_evaluation/evaluate_model.py | 7 +-- .../src/training/train_model.py | 7 +-- .../src/forecasting/generate_forecast.py | 7 +-- .../src/offline_evaluation/evaluate_model.py | 7 +-- .../lavender_haze/src/training/train_model.py | 7 +-- .../src/forecasting/generate_forecast.py | 7 +-- .../src/offline_evaluation/evaluate_model.py | 7 +-- models/old_money/src/training/train_model.py | 7 +-- .../src/forecasting/generate_forecast.py | 7 +-- .../src/offline_evaluation/evaluate_model.py | 7 +-- .../orange_pasta/src/training/train_model.py | 7 +-- .../src/forecasting/generate_forecast.py | 7 +-- .../src/offline_evaluation/evaluate_model.py | 7 +-- .../wildest_dream/src/training/train_model.py | 7 +-- .../src/forecasting/generate_forecast.py | 7 +-- .../src/offline_evaluation/evaluate_model.py | 7 +-- .../src/training/train_model.py | 7 +-- 29 files changed, 162 insertions(+), 106 deletions(-) diff --git a/common_utils/utils_dataloaders.py b/common_utils/utils_dataloaders.py index 60171709..cd1a8799 100644 --- a/common_utils/utils_dataloaders.py +++ b/common_utils/utils_dataloaders.py @@ -4,10 +4,12 @@ import pandas as pd import sys import logging +from datetime import datetime from set_partition import get_partitioner_dict from common_configs import config_drift_detection from utils_df_to_vol_conversion import df_to_vol +from utils_log_files import create_data_fetch_log_file from viewser import Queryset, Column import logging @@ -26,7 +28,7 @@ def fetch_data_from_viewser(model_name, month_first, month_last, drift_config_di pd.DataFrame: The prepared DataFrame with initial processing done. 
""" logger.info(f'Beginning file download through viewser with month range {month_first},{month_last}') - model_path = ModelPath(model_name, validate=True) + model_path = ModelPath(model_name) queryset_base = model_path.get_queryset() # just used here.. if queryset_base is None: raise RuntimeError(f'Could not find queryset for {model_path.model_name} in common_querysets') @@ -224,6 +226,10 @@ def fetch_or_load_views_df(model_name, partition, PATH_RAW, self_test=False, use else: logger.info(f'Fetching data...') df, alerts = get_views_df(model_name, partition, override_month, self_test) # which is then used here + + data_fetch_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + create_data_fetch_log_file(PATH_RAW, partition, model_name, data_fetch_timestamp) + logger.info(f'Saving data to {path_viewser_df}') df.to_pickle(path_viewser_df) diff --git a/common_utils/utils_log_files.py b/common_utils/utils_log_files.py index 9aafc25c..9bf3a1f0 100644 --- a/common_utils/utils_log_files.py +++ b/common_utils/utils_log_files.py @@ -1,6 +1,5 @@ import logging from pathlib import Path -from set_path import setup_data_paths from model_path import ModelPath logger = logging.getLogger(__name__) @@ -11,11 +10,12 @@ def read_log_file(log_file_path): Reads the log file and returns a dictionary with the relevant information. Args: - - log_file_path (str): The path to the log file. + - log_file_path (str or Path): The path to the log file. Returns: - dict: A dictionary containing the model name, model timestamp, data generation timestamp, and data fetch timestamp. 
""" + log_data = {} with open(log_file_path, "r") as file: for line in file: @@ -28,7 +28,31 @@ def read_log_file(log_file_path): key, value = line.split(": ", 1) # There are duplicated keys for ensemble models, but it's not a problem bc these keys are not used log_data[key] = value + return log_data + + +def create_data_fetch_log_file(path_raw, + run_type, + model_name, + data_fetch_timestamp): + """ + Creates a log file in the specified single model folder with details about the data fetch. + + Args: + - path_raw (Path): The path to the folder where the log file will be created. + - run_type (str): The type of run. + - model_name (str): The name of the model. + - data_fetch_timestamp (str): The timestamp when the raw data used was fetched from VIEWS. + """ + + data_fetch_log_file_path = f"{path_raw}/{run_type}_data_fetch_log.txt" + + with open(data_fetch_log_file_path, "w") as log_file: + log_file.write(f"Single Model Name: {model_name}\n") + log_file.write(f"Data Fetch Timestamp: {data_fetch_timestamp}\n\n") + + logger.info(f"Data fetch log file created at {data_fetch_log_file_path}") def create_specific_log_file(path_generated, @@ -36,12 +60,12 @@ def create_specific_log_file(path_generated, model_name, deployment_status, model_timestamp, - data_generation_timestamp=None, - data_fetch_timestamp=None, + data_generation_timestamp, + data_fetch_timestamp, model_type="single", mode="w",): """ - Creates a log file in the specified model-specific folder with details about the generated data. + Creates a log file in the specified model folder with details about the generated data. Args: - path_generated (Path): The path to the folder where the log file will be created. @@ -55,7 +79,6 @@ def create_specific_log_file(path_generated, - mode (str, optional): The mode in which the file will be opened. Default is "w". 
""" - Path(path_generated).mkdir(parents=True, exist_ok=True) log_file_path = f"{path_generated}/{run_type}_log.txt" # Capitalize the first letter of the model type @@ -72,8 +95,8 @@ def create_specific_log_file(path_generated, def create_log_file(path_generated, model_config, model_timestamp, - data_generation_timestamp=None, - data_fetch_timestamp=None, + data_generation_timestamp, + data_fetch_timestamp, model_type="single", models=None): @@ -85,8 +108,8 @@ def create_log_file(path_generated, model_timestamp, data_generation_timestamp, data_fetch_timestamp, model_type) if models: for m_name in models: - model_path = ModelPath(m_name, validate=False).model_dir - _, _, model_path_generated = setup_data_paths(model_path) + model_path = ModelPath(m_name) + model_path_generated = model_path.data_generated log_data = read_log_file(model_path_generated / f"{run_type}_log.txt") create_specific_log_file(path_generated, run_type, m_name, log_data["Deployment Status"], log_data["Single Model Timestamp"], log_data["Data Generation Timestamp"], log_data["Data Fetch Timestamp"], mode="a") diff --git a/ensembles/cruel_summer/src/forecasting/generate_forecast.py b/ensembles/cruel_summer/src/forecasting/generate_forecast.py index e111c182..92877a86 100644 --- a/ensembles/cruel_summer/src/forecasting/generate_forecast.py +++ b/ensembles/cruel_summer/src/forecasting/generate_forecast.py @@ -6,7 +6,7 @@ from model_path import ModelPath from ensemble_path import EnsemblePath from set_partition import get_partitioner_dict -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_outputs import save_predictions from utils_run import get_standardized_df, get_aggregated_df, get_single_model_config from utils_artifacts import get_latest_model_artifact @@ -15,7 +15,7 @@ def forecast_ensemble(config): - ensemble_path = EnsemblePath(config["name"], validate=False) + ensemble_path = EnsemblePath(config["name"]) path_generated_e = 
ensemble_path.data_generated run_type = config["run_type"] steps = config["steps"] @@ -25,7 +25,7 @@ def forecast_ensemble(config): for model_name in config["models"]: logger.info(f"Forecasting single model {model_name}...") - model_path = ModelPath(model_name, validate=False) + model_path = ModelPath(model_name) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -56,19 +56,20 @@ def forecast_ensemble(config): df = get_standardized_df(df, model_config) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) save_predictions(df, path_generated, model_config) - create_log_file(path_generated, model_config, ts, data_generation_timestamp) + create_log_file(path_generated, model_config, ts, data_generation_timestamp, date_fetch_timestamp) dfs.append(df) df_prediction = get_aggregated_df(dfs, config["aggregation"]) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - # I don"t think current timestamp is useful here because timestamp of single models is more important. + # Timestamp of single models is more important than ensemble model timestamp config["timestamp"] = timestamp[:-1] save_predictions(df_prediction, path_generated_e, config) # How to define an ensemble model timestamp? Currently set as data_generation_timestamp. 
- create_log_file(path_generated_e, config, data_generation_timestamp, data_generation_timestamp, + create_log_file(path_generated_e, config, data_generation_timestamp, data_generation_timestamp, data_fetch_timestamp=None, model_type="ensemble", models=config["models"]) \ No newline at end of file diff --git a/ensembles/cruel_summer/src/offline_evaluation/evaluate_ensemble.py b/ensembles/cruel_summer/src/offline_evaluation/evaluate_ensemble.py index 7be6d21b..9d17aa21 100644 --- a/ensembles/cruel_summer/src/offline_evaluation/evaluate_ensemble.py +++ b/ensembles/cruel_summer/src/offline_evaluation/evaluate_ensemble.py @@ -3,7 +3,7 @@ from pathlib import Path from model_path import ModelPath from ensemble_path import EnsemblePath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_outputs import save_model_outputs, save_predictions from utils_run import get_standardized_df, get_aggregated_df, get_single_model_config from utils_evaluation_metrics import generate_metric_dict @@ -16,7 +16,7 @@ def evaluate_ensemble(config): - ensemble_path = EnsemblePath(config["name"], validate=False) + ensemble_path = EnsemblePath(config["name"]) path_generated_e = ensemble_path.data_generated run_type = config["run_type"] steps = config["steps"] @@ -26,7 +26,7 @@ def evaluate_ensemble(config): for model_name in config["models"]: logger.info(f"Evaluating single model {model_name}...") - model_path = ModelPath(model_name, validate=False) + model_path = ModelPath(model_name) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -55,28 +55,29 @@ def evaluate_ensemble(config): df = stepshift_model.predict(run_type, "predict", df_viewser) df = get_standardized_df(df, model_config) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) 
_, df_output = generate_output_dict(df, model_config) evaluation, df_evaluation = generate_metric_dict(df, model_config) save_model_outputs(df_evaluation, df_output, path_generated, model_config) save_predictions(df, path_generated, model_config) - create_log_file(path_generated, model_config, ts, data_generation_timestamp) + create_log_file(path_generated, model_config, ts, data_generation_timestamp, date_fetch_timestamp) dfs.append(df) df_agg = get_aggregated_df(dfs, config["aggregation"]) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + _, df_output = generate_output_dict(df_agg, config) evaluation, df_evaluation = generate_metric_dict(df_agg, config) log_wandb_log_dict(config, evaluation) - # I don"t think current timestamp is useful here. - # Timestamp of single models is more important but how should we register them in ensemble config? + # Timestamp of single models is more important than ensemble model timestamp config["timestamp"] = timestamp[:-1] save_model_outputs(df_evaluation, df_output, path_generated_e, config) save_predictions(df_agg, path_generated_e, config) # How to define an ensemble model timestamp? Currently set as data_generation_timestamp. 
- create_log_file(path_generated_e, config, data_generation_timestamp, data_generation_timestamp, + create_log_file(path_generated_e, config, data_generation_timestamp, data_generation_timestamp, data_fetch_timestamp=None, model_type="ensemble", models=config["models"]) diff --git a/ensembles/cruel_summer/src/training/train_ensemble.py b/ensembles/cruel_summer/src/training/train_ensemble.py index f61787f4..c93e367f 100644 --- a/ensembles/cruel_summer/src/training/train_ensemble.py +++ b/ensembles/cruel_summer/src/training/train_ensemble.py @@ -2,7 +2,7 @@ from datetime import datetime from model_path import ModelPath from set_partition import get_partitioner_dict -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_run import get_model, get_single_model_config from views_stepshift.run import ViewsRun from stepshift.views import StepshiftedModels @@ -19,7 +19,7 @@ def train_ensemble(config): for model_name in config["models"]: logger.info(f"Training single model {model_name}...") - model_path = ModelPath(model_name, validate=False) + model_path = ModelPath(model_name) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -32,7 +32,8 @@ def train_ensemble(config): timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") model_filename = f"{run_type}_model_{timestamp}.pkl" stepshift_model.save(path_artifacts / model_filename) - create_log_file(path_generated, model_config, timestamp) + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) def stepshift_training(config, partition_name, model, dataset): diff --git a/ensembles/white_mustang/src/forecasting/generate_forecast.py b/ensembles/white_mustang/src/forecasting/generate_forecast.py index e111c182..92877a86 100644 --- 
a/ensembles/white_mustang/src/forecasting/generate_forecast.py +++ b/ensembles/white_mustang/src/forecasting/generate_forecast.py @@ -6,7 +6,7 @@ from model_path import ModelPath from ensemble_path import EnsemblePath from set_partition import get_partitioner_dict -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_outputs import save_predictions from utils_run import get_standardized_df, get_aggregated_df, get_single_model_config from utils_artifacts import get_latest_model_artifact @@ -15,7 +15,7 @@ def forecast_ensemble(config): - ensemble_path = EnsemblePath(config["name"], validate=False) + ensemble_path = EnsemblePath(config["name"]) path_generated_e = ensemble_path.data_generated run_type = config["run_type"] steps = config["steps"] @@ -25,7 +25,7 @@ def forecast_ensemble(config): for model_name in config["models"]: logger.info(f"Forecasting single model {model_name}...") - model_path = ModelPath(model_name, validate=False) + model_path = ModelPath(model_name) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -56,19 +56,20 @@ def forecast_ensemble(config): df = get_standardized_df(df, model_config) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) save_predictions(df, path_generated, model_config) - create_log_file(path_generated, model_config, ts, data_generation_timestamp) + create_log_file(path_generated, model_config, ts, data_generation_timestamp, date_fetch_timestamp) dfs.append(df) df_prediction = get_aggregated_df(dfs, config["aggregation"]) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - # I don"t think current timestamp is useful here because timestamp of single models is more important. 
+ # Timestamp of single models is more important than ensemble model timestamp config["timestamp"] = timestamp[:-1] save_predictions(df_prediction, path_generated_e, config) # How to define an ensemble model timestamp? Currently set as data_generation_timestamp. - create_log_file(path_generated_e, config, data_generation_timestamp, data_generation_timestamp, + create_log_file(path_generated_e, config, data_generation_timestamp, data_generation_timestamp, data_fetch_timestamp=None, model_type="ensemble", models=config["models"]) \ No newline at end of file diff --git a/ensembles/white_mustang/src/offline_evaluation/evaluate_ensemble.py b/ensembles/white_mustang/src/offline_evaluation/evaluate_ensemble.py index 7be6d21b..9d17aa21 100644 --- a/ensembles/white_mustang/src/offline_evaluation/evaluate_ensemble.py +++ b/ensembles/white_mustang/src/offline_evaluation/evaluate_ensemble.py @@ -3,7 +3,7 @@ from pathlib import Path from model_path import ModelPath from ensemble_path import EnsemblePath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_outputs import save_model_outputs, save_predictions from utils_run import get_standardized_df, get_aggregated_df, get_single_model_config from utils_evaluation_metrics import generate_metric_dict @@ -16,7 +16,7 @@ def evaluate_ensemble(config): - ensemble_path = EnsemblePath(config["name"], validate=False) + ensemble_path = EnsemblePath(config["name"]) path_generated_e = ensemble_path.data_generated run_type = config["run_type"] steps = config["steps"] @@ -26,7 +26,7 @@ def evaluate_ensemble(config): for model_name in config["models"]: logger.info(f"Evaluating single model {model_name}...") - model_path = ModelPath(model_name, validate=False) + model_path = ModelPath(model_name) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -55,28 +55,29 @@ def evaluate_ensemble(config): df = 
stepshift_model.predict(run_type, "predict", df_viewser) df = get_standardized_df(df, model_config) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) _, df_output = generate_output_dict(df, model_config) evaluation, df_evaluation = generate_metric_dict(df, model_config) save_model_outputs(df_evaluation, df_output, path_generated, model_config) save_predictions(df, path_generated, model_config) - create_log_file(path_generated, model_config, ts, data_generation_timestamp) + create_log_file(path_generated, model_config, ts, data_generation_timestamp, date_fetch_timestamp) dfs.append(df) df_agg = get_aggregated_df(dfs, config["aggregation"]) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + _, df_output = generate_output_dict(df_agg, config) evaluation, df_evaluation = generate_metric_dict(df_agg, config) log_wandb_log_dict(config, evaluation) - # I don"t think current timestamp is useful here. - # Timestamp of single models is more important but how should we register them in ensemble config? + # Timestamp of single models is more important than ensemble model timestamp config["timestamp"] = timestamp[:-1] save_model_outputs(df_evaluation, df_output, path_generated_e, config) save_predictions(df_agg, path_generated_e, config) # How to define an ensemble model timestamp? Currently set as data_generation_timestamp. 
- create_log_file(path_generated_e, config, data_generation_timestamp, data_generation_timestamp, + create_log_file(path_generated_e, config, data_generation_timestamp, data_generation_timestamp, data_fetch_timestamp=None, model_type="ensemble", models=config["models"]) diff --git a/ensembles/white_mustang/src/training/train_ensemble.py b/ensembles/white_mustang/src/training/train_ensemble.py index f61787f4..c93e367f 100644 --- a/ensembles/white_mustang/src/training/train_ensemble.py +++ b/ensembles/white_mustang/src/training/train_ensemble.py @@ -2,7 +2,7 @@ from datetime import datetime from model_path import ModelPath from set_partition import get_partitioner_dict -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_run import get_model, get_single_model_config from views_stepshift.run import ViewsRun from stepshift.views import StepshiftedModels @@ -19,7 +19,7 @@ def train_ensemble(config): for model_name in config["models"]: logger.info(f"Training single model {model_name}...") - model_path = ModelPath(model_name, validate=False) + model_path = ModelPath(model_name) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -32,7 +32,8 @@ def train_ensemble(config): timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") model_filename = f"{run_type}_model_{timestamp}.pkl" stepshift_model.save(path_artifacts / model_filename) - create_log_file(path_generated, model_config, timestamp) + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) def stepshift_training(config, partition_name, model, dataset): diff --git a/models/blank_space/src/forecasting/generate_forecast.py b/models/blank_space/src/forecasting/generate_forecast.py index 84e7de75..96a5cc77 100644 --- 
a/models/blank_space/src/forecasting/generate_forecast.py +++ b/models/blank_space/src/forecasting/generate_forecast.py @@ -3,7 +3,7 @@ import logging from model_path import ModelPath from set_partition import get_partitioner_dict -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_run import get_standardized_df from utils_outputs import save_predictions from utils_artifacts import get_latest_model_artifact @@ -12,7 +12,7 @@ def forecast_model_artifact(config, artifact_name): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -43,6 +43,7 @@ def forecast_model_artifact(config, artifact_name): df_predictions = stepshift_model.future_point_predict(partition[0] - 1, df_viewser, keep_specific=True) df_predictions = get_standardized_df(df_predictions, config) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) save_predictions(df_predictions, path_generated, config) - create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/blank_space/src/offline_evaluation/evaluate_model.py b/models/blank_space/src/offline_evaluation/evaluate_model.py index fa748276..750b3725 100644 --- a/models/blank_space/src/offline_evaluation/evaluate_model.py +++ b/models/blank_space/src/offline_evaluation/evaluate_model.py @@ -2,7 +2,7 @@ import pandas as pd import logging from model_path import ModelPath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_outputs import save_model_outputs, save_predictions from 
utils_run import get_standardized_df from utils_artifacts import get_latest_model_artifact @@ -15,7 +15,7 @@ def evaluate_model_artifact(config, artifact_name): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -45,6 +45,7 @@ def evaluate_model_artifact(config, artifact_name): df = stepshift_model.predict(run_type, "predict", df_viewser) df = get_standardized_df(df, config) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) _, df_output = generate_output_dict(df, config) evaluation, df_evaluation = generate_metric_dict(df, config) @@ -52,4 +53,4 @@ def evaluate_model_artifact(config, artifact_name): save_model_outputs(df_evaluation, df_output, path_generated, config) save_predictions(df, path_generated, config) - create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/blank_space/src/training/train_model.py b/models/blank_space/src/training/train_model.py index 39c66a62..f21f4ead 100644 --- a/models/blank_space/src/training/train_model.py +++ b/models/blank_space/src/training/train_model.py @@ -1,7 +1,7 @@ from datetime import datetime import pandas as pd from model_path import ModelPath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from set_partition import get_partitioner_dict from views_stepshift.run import ViewsRun from stepshift.views import StepshiftedModels @@ -11,7 +11,7 @@ def train_model_artifact(config, model): # print(config) - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = 
model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -23,7 +23,8 @@ def train_model_artifact(config, model): timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") model_filename = f"{run_type}_model_{timestamp}.pkl" stepshift_model.save(path_artifacts / model_filename) - create_log_file(path_generated, config, timestamp) + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) return stepshift_model diff --git a/models/electric_relaxation/src/forecasting/generate_forecast.py b/models/electric_relaxation/src/forecasting/generate_forecast.py index 84e7de75..96a5cc77 100644 --- a/models/electric_relaxation/src/forecasting/generate_forecast.py +++ b/models/electric_relaxation/src/forecasting/generate_forecast.py @@ -3,7 +3,7 @@ import logging from model_path import ModelPath from set_partition import get_partitioner_dict -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_run import get_standardized_df from utils_outputs import save_predictions from utils_artifacts import get_latest_model_artifact @@ -12,7 +12,7 @@ def forecast_model_artifact(config, artifact_name): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -43,6 +43,7 @@ def forecast_model_artifact(config, artifact_name): df_predictions = stepshift_model.future_point_predict(partition[0] - 1, df_viewser, keep_specific=True) df_predictions = get_standardized_df(df_predictions, config) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) save_predictions(df_predictions, 
path_generated, config) - create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/electric_relaxation/src/offline_evaluation/evaluate_model.py b/models/electric_relaxation/src/offline_evaluation/evaluate_model.py index fa748276..750b3725 100644 --- a/models/electric_relaxation/src/offline_evaluation/evaluate_model.py +++ b/models/electric_relaxation/src/offline_evaluation/evaluate_model.py @@ -2,7 +2,7 @@ import pandas as pd import logging from model_path import ModelPath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_outputs import save_model_outputs, save_predictions from utils_run import get_standardized_df from utils_artifacts import get_latest_model_artifact @@ -15,7 +15,7 @@ def evaluate_model_artifact(config, artifact_name): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -45,6 +45,7 @@ def evaluate_model_artifact(config, artifact_name): df = stepshift_model.predict(run_type, "predict", df_viewser) df = get_standardized_df(df, config) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) _, df_output = generate_output_dict(df, config) evaluation, df_evaluation = generate_metric_dict(df, config) @@ -52,4 +53,4 @@ def evaluate_model_artifact(config, artifact_name): save_model_outputs(df_evaluation, df_output, path_generated, config) save_predictions(df, path_generated, config) - create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp) + create_log_file(path_generated, config, config["timestamp"], 
data_generation_timestamp, date_fetch_timestamp) diff --git a/models/electric_relaxation/src/training/train_model.py b/models/electric_relaxation/src/training/train_model.py index 39c66a62..f21f4ead 100644 --- a/models/electric_relaxation/src/training/train_model.py +++ b/models/electric_relaxation/src/training/train_model.py @@ -1,7 +1,7 @@ from datetime import datetime import pandas as pd from model_path import ModelPath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from set_partition import get_partitioner_dict from views_stepshift.run import ViewsRun from stepshift.views import StepshiftedModels @@ -11,7 +11,7 @@ def train_model_artifact(config, model): # print(config) - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -23,7 +23,8 @@ def train_model_artifact(config, model): timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") model_filename = f"{run_type}_model_{timestamp}.pkl" stepshift_model.save(path_artifacts / model_filename) - create_log_file(path_generated, config, timestamp) + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) return stepshift_model diff --git a/models/lavender_haze/src/forecasting/generate_forecast.py b/models/lavender_haze/src/forecasting/generate_forecast.py index 84e7de75..af625d3f 100644 --- a/models/lavender_haze/src/forecasting/generate_forecast.py +++ b/models/lavender_haze/src/forecasting/generate_forecast.py @@ -3,7 +3,7 @@ import logging from model_path import ModelPath from set_partition import get_partitioner_dict -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_run import get_standardized_df from 
utils_outputs import save_predictions from utils_artifacts import get_latest_model_artifact @@ -12,7 +12,7 @@ def forecast_model_artifact(config, artifact_name): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -43,6 +43,7 @@ def forecast_model_artifact(config, artifact_name): df_predictions = stepshift_model.future_point_predict(partition[0] - 1, df_viewser, keep_specific=True) df_predictions = get_standardized_df(df_predictions, config) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + data_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) save_predictions(df_predictions, path_generated, config) - create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, data_fetch_timestamp) diff --git a/models/lavender_haze/src/offline_evaluation/evaluate_model.py b/models/lavender_haze/src/offline_evaluation/evaluate_model.py index fa748276..750b3725 100644 --- a/models/lavender_haze/src/offline_evaluation/evaluate_model.py +++ b/models/lavender_haze/src/offline_evaluation/evaluate_model.py @@ -2,7 +2,7 @@ import pandas as pd import logging from model_path import ModelPath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_outputs import save_model_outputs, save_predictions from utils_run import get_standardized_df from utils_artifacts import get_latest_model_artifact @@ -15,7 +15,7 @@ def evaluate_model_artifact(config, artifact_name): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -45,6 
+45,7 @@ def evaluate_model_artifact(config, artifact_name): df = stepshift_model.predict(run_type, "predict", df_viewser) df = get_standardized_df(df, config) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) _, df_output = generate_output_dict(df, config) evaluation, df_evaluation = generate_metric_dict(df, config) @@ -52,4 +53,4 @@ def evaluate_model_artifact(config, artifact_name): save_model_outputs(df_evaluation, df_output, path_generated, config) save_predictions(df, path_generated, config) - create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/lavender_haze/src/training/train_model.py b/models/lavender_haze/src/training/train_model.py index 39c66a62..f21f4ead 100644 --- a/models/lavender_haze/src/training/train_model.py +++ b/models/lavender_haze/src/training/train_model.py @@ -1,7 +1,7 @@ from datetime import datetime import pandas as pd from model_path import ModelPath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from set_partition import get_partitioner_dict from views_stepshift.run import ViewsRun from stepshift.views import StepshiftedModels @@ -11,7 +11,7 @@ def train_model_artifact(config, model): # print(config) - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -23,7 +23,8 @@ def train_model_artifact(config, model): timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") model_filename = f"{run_type}_model_{timestamp}.pkl" stepshift_model.save(path_artifacts / model_filename) - create_log_file(path_generated, config, timestamp) + 
date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) return stepshift_model diff --git a/models/old_money/src/forecasting/generate_forecast.py b/models/old_money/src/forecasting/generate_forecast.py index d7db7333..969dda9f 100644 --- a/models/old_money/src/forecasting/generate_forecast.py +++ b/models/old_money/src/forecasting/generate_forecast.py @@ -2,7 +2,7 @@ from datetime import datetime import logging from model_path import ModelPath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_run import get_standardized_df from utils_outputs import save_predictions from utils_artifacts import get_latest_model_artifact @@ -11,7 +11,7 @@ def forecast_model_artifact(config, artifact_name): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -41,6 +41,7 @@ def forecast_model_artifact(config, artifact_name): df_predictions = stepshift_model.predict(run_type, df_viewser) df_predictions = get_standardized_df(df_predictions, config) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) save_predictions(df_predictions, path_generated, config) - create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/old_money/src/offline_evaluation/evaluate_model.py b/models/old_money/src/offline_evaluation/evaluate_model.py index e4334dd7..2b8d036d 100644 --- a/models/old_money/src/offline_evaluation/evaluate_model.py +++ 
b/models/old_money/src/offline_evaluation/evaluate_model.py @@ -2,7 +2,7 @@ import pandas as pd import logging from model_path import ModelPath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_outputs import save_model_outputs, save_predictions from utils_run import get_standardized_df from utils_artifacts import get_latest_model_artifact @@ -15,7 +15,7 @@ def evaluate_model_artifact(config, artifact_name): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -45,6 +45,7 @@ def evaluate_model_artifact(config, artifact_name): df = stepshift_model.predict(run_type, df_viewser) df = get_standardized_df(df, config) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) _, df_output = generate_output_dict(df, config) evaluation, df_evaluation = generate_metric_dict(df, config) @@ -52,4 +53,4 @@ def evaluate_model_artifact(config, artifact_name): save_model_outputs(df_evaluation, df_output, path_generated, config) save_predictions(df, path_generated, config) - create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/old_money/src/training/train_model.py b/models/old_money/src/training/train_model.py index bfdffd4c..f2342912 100644 --- a/models/old_money/src/training/train_model.py +++ b/models/old_money/src/training/train_model.py @@ -1,7 +1,7 @@ from datetime import datetime import pandas as pd from model_path import ModelPath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_run import 
get_model from set_partition import get_partitioner_dict from views_forecasts.extensions import * @@ -9,7 +9,7 @@ def train_model_artifact(config): # print(config) - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -21,7 +21,8 @@ def train_model_artifact(config): timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") model_filename = f"{run_type}_model_{timestamp}.pkl" stepshift_model.save(path_artifacts / model_filename) - create_log_file(path_generated, config, timestamp) + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) return stepshift_model diff --git a/models/orange_pasta/src/forecasting/generate_forecast.py b/models/orange_pasta/src/forecasting/generate_forecast.py index d7db7333..969dda9f 100644 --- a/models/orange_pasta/src/forecasting/generate_forecast.py +++ b/models/orange_pasta/src/forecasting/generate_forecast.py @@ -2,7 +2,7 @@ from datetime import datetime import logging from model_path import ModelPath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_run import get_standardized_df from utils_outputs import save_predictions from utils_artifacts import get_latest_model_artifact @@ -11,7 +11,7 @@ def forecast_model_artifact(config, artifact_name): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -41,6 +41,7 @@ def forecast_model_artifact(config, artifact_name): df_predictions = stepshift_model.predict(run_type, df_viewser) df_predictions = get_standardized_df(df_predictions, config) data_generation_timestamp = 
datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) save_predictions(df_predictions, path_generated, config) - create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/orange_pasta/src/offline_evaluation/evaluate_model.py b/models/orange_pasta/src/offline_evaluation/evaluate_model.py index e4334dd7..2b8d036d 100644 --- a/models/orange_pasta/src/offline_evaluation/evaluate_model.py +++ b/models/orange_pasta/src/offline_evaluation/evaluate_model.py @@ -2,7 +2,7 @@ import pandas as pd import logging from model_path import ModelPath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_outputs import save_model_outputs, save_predictions from utils_run import get_standardized_df from utils_artifacts import get_latest_model_artifact @@ -15,7 +15,7 @@ def evaluate_model_artifact(config, artifact_name): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -45,6 +45,7 @@ def evaluate_model_artifact(config, artifact_name): df = stepshift_model.predict(run_type, df_viewser) df = get_standardized_df(df, config) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) _, df_output = generate_output_dict(df, config) evaluation, df_evaluation = generate_metric_dict(df, config) @@ -52,4 +53,4 @@ def evaluate_model_artifact(config, artifact_name): save_model_outputs(df_evaluation, df_output, path_generated, config) save_predictions(df, path_generated, config) - 
create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/orange_pasta/src/training/train_model.py b/models/orange_pasta/src/training/train_model.py index bfdffd4c..f2342912 100644 --- a/models/orange_pasta/src/training/train_model.py +++ b/models/orange_pasta/src/training/train_model.py @@ -1,7 +1,7 @@ from datetime import datetime import pandas as pd from model_path import ModelPath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_run import get_model from set_partition import get_partitioner_dict from views_forecasts.extensions import * @@ -9,7 +9,7 @@ def train_model_artifact(config): # print(config) - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -21,7 +21,8 @@ def train_model_artifact(config): timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") model_filename = f"{run_type}_model_{timestamp}.pkl" stepshift_model.save(path_artifacts / model_filename) - create_log_file(path_generated, config, timestamp) + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) return stepshift_model diff --git a/models/wildest_dream/src/forecasting/generate_forecast.py b/models/wildest_dream/src/forecasting/generate_forecast.py index 84e7de75..96a5cc77 100644 --- a/models/wildest_dream/src/forecasting/generate_forecast.py +++ b/models/wildest_dream/src/forecasting/generate_forecast.py @@ -3,7 +3,7 @@ import logging from model_path import ModelPath from set_partition import get_partitioner_dict -from utils_log_files import create_log_file +from 
utils_log_files import create_log_file, read_log_file from utils_run import get_standardized_df from utils_outputs import save_predictions from utils_artifacts import get_latest_model_artifact @@ -12,7 +12,7 @@ def forecast_model_artifact(config, artifact_name): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -43,6 +43,7 @@ def forecast_model_artifact(config, artifact_name): df_predictions = stepshift_model.future_point_predict(partition[0] - 1, df_viewser, keep_specific=True) df_predictions = get_standardized_df(df_predictions, config) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) save_predictions(df_predictions, path_generated, config) - create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/wildest_dream/src/offline_evaluation/evaluate_model.py b/models/wildest_dream/src/offline_evaluation/evaluate_model.py index fa748276..750b3725 100644 --- a/models/wildest_dream/src/offline_evaluation/evaluate_model.py +++ b/models/wildest_dream/src/offline_evaluation/evaluate_model.py @@ -2,7 +2,7 @@ import pandas as pd import logging from model_path import ModelPath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_outputs import save_model_outputs, save_predictions from utils_run import get_standardized_df from utils_artifacts import get_latest_model_artifact @@ -15,7 +15,7 @@ def evaluate_model_artifact(config, artifact_name): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = 
model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -45,6 +45,7 @@ def evaluate_model_artifact(config, artifact_name): df = stepshift_model.predict(run_type, "predict", df_viewser) df = get_standardized_df(df, config) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) _, df_output = generate_output_dict(df, config) evaluation, df_evaluation = generate_metric_dict(df, config) @@ -52,4 +53,4 @@ def evaluate_model_artifact(config, artifact_name): save_model_outputs(df_evaluation, df_output, path_generated, config) save_predictions(df, path_generated, config) - create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/wildest_dream/src/training/train_model.py b/models/wildest_dream/src/training/train_model.py index 39c66a62..f21f4ead 100644 --- a/models/wildest_dream/src/training/train_model.py +++ b/models/wildest_dream/src/training/train_model.py @@ -1,7 +1,7 @@ from datetime import datetime import pandas as pd from model_path import ModelPath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from set_partition import get_partitioner_dict from views_stepshift.run import ViewsRun from stepshift.views import StepshiftedModels @@ -11,7 +11,7 @@ def train_model_artifact(config, model): # print(config) - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -23,7 +23,8 @@ def train_model_artifact(config, model): timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") model_filename = f"{run_type}_model_{timestamp}.pkl" 
stepshift_model.save(path_artifacts / model_filename) - create_log_file(path_generated, config, timestamp) + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) return stepshift_model diff --git a/models/yellow_pikachu/src/forecasting/generate_forecast.py b/models/yellow_pikachu/src/forecasting/generate_forecast.py index 84e7de75..96a5cc77 100644 --- a/models/yellow_pikachu/src/forecasting/generate_forecast.py +++ b/models/yellow_pikachu/src/forecasting/generate_forecast.py @@ -3,7 +3,7 @@ import logging from model_path import ModelPath from set_partition import get_partitioner_dict -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_run import get_standardized_df from utils_outputs import save_predictions from utils_artifacts import get_latest_model_artifact @@ -12,7 +12,7 @@ def forecast_model_artifact(config, artifact_name): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -43,6 +43,7 @@ def forecast_model_artifact(config, artifact_name): df_predictions = stepshift_model.future_point_predict(partition[0] - 1, df_viewser, keep_specific=True) df_predictions = get_standardized_df(df_predictions, config) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) save_predictions(df_predictions, path_generated, config) - create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git 
a/models/yellow_pikachu/src/offline_evaluation/evaluate_model.py b/models/yellow_pikachu/src/offline_evaluation/evaluate_model.py index fa748276..750b3725 100644 --- a/models/yellow_pikachu/src/offline_evaluation/evaluate_model.py +++ b/models/yellow_pikachu/src/offline_evaluation/evaluate_model.py @@ -2,7 +2,7 @@ import pandas as pd import logging from model_path import ModelPath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from utils_outputs import save_model_outputs, save_predictions from utils_run import get_standardized_df from utils_artifacts import get_latest_model_artifact @@ -15,7 +15,7 @@ def evaluate_model_artifact(config, artifact_name): - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -45,6 +45,7 @@ def evaluate_model_artifact(config, artifact_name): df = stepshift_model.predict(run_type, "predict", df_viewser) df = get_standardized_df(df, config) data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) _, df_output = generate_output_dict(df, config) evaluation, df_evaluation = generate_metric_dict(df, config) @@ -52,4 +53,4 @@ def evaluate_model_artifact(config, artifact_name): save_model_outputs(df_evaluation, df_output, path_generated, config) save_predictions(df, path_generated, config) - create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp) + create_log_file(path_generated, config, config["timestamp"], data_generation_timestamp, date_fetch_timestamp) diff --git a/models/yellow_pikachu/src/training/train_model.py b/models/yellow_pikachu/src/training/train_model.py index 39c66a62..f21f4ead 100644 --- a/models/yellow_pikachu/src/training/train_model.py +++ 
b/models/yellow_pikachu/src/training/train_model.py @@ -1,7 +1,7 @@ from datetime import datetime import pandas as pd from model_path import ModelPath -from utils_log_files import create_log_file +from utils_log_files import create_log_file, read_log_file from set_partition import get_partitioner_dict from views_stepshift.run import ViewsRun from stepshift.views import StepshiftedModels @@ -11,7 +11,7 @@ def train_model_artifact(config, model): # print(config) - model_path = ModelPath(config["name"], validate=False) + model_path = ModelPath(config["name"]) path_raw = model_path.data_raw path_generated = model_path.data_generated path_artifacts = model_path.artifacts @@ -23,7 +23,8 @@ def train_model_artifact(config, model): timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") model_filename = f"{run_type}_model_{timestamp}.pkl" stepshift_model.save(path_artifacts / model_filename) - create_log_file(path_generated, config, timestamp) + date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None) + create_log_file(path_generated, config, timestamp, None, date_fetch_timestamp) return stepshift_model From daed0ca4635eba37558f3842f91e53f5a5e5c8aa Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Tue, 29 Oct 2024 16:06:02 +0100 Subject: [PATCH 3/4] New validate arguments: if --train is not set, you should only use saved data --- common_utils/utils_cli_parser.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/common_utils/utils_cli_parser.py b/common_utils/utils_cli_parser.py index 54b05fe9..a68fde7c 100644 --- a/common_utils/utils_cli_parser.py +++ b/common_utils/utils_cli_parser.py @@ -128,3 +128,11 @@ def validate_arguments(args): "Error: --aggregation flag cannot be used with --sweep. Exiting." 
) sys.exit(1) + + if not args.train and not args.saved: + # if not training, then we need to use saved data + print( + "Error: if --train is not set, you should only use --saved flag. Exiting." + ) + print("To fix: Add --train or --saved flag.") + sys.exit(1) From 9efca3837cce6d74e93ab9776cda40d5b84dbfa4 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Tue, 29 Oct 2024 16:06:16 +0100 Subject: [PATCH 4/4] Update .gitignore to ignore all text files --- .gitignore | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 873afdaa..182b2063 100644 --- a/.gitignore +++ b/.gitignore @@ -149,9 +149,6 @@ venv.bak/ # Global cache .global_cache.pkl -# Generated calibration logs -*calibration_log.txt - # mypy .mypy_cache/ .dmypy.json @@ -214,6 +211,4 @@ cython_debug/ *.bak # txt logs -calibration_log.txt -testing_log.txt -forecasting_log.txt \ No newline at end of file +*.txt \ No newline at end of file