diff --git a/common_utils/set_path.py b/common_utils/set_path.py
index 9713d0c3..b949bffb 100644
--- a/common_utils/set_path.py
+++ b/common_utils/set_path.py
@@ -118,10 +118,10 @@ def setup_data_paths(PATH) -> Path:
 
     PATH_DATA = PATH_MODEL / "data"
     PATH_RAW = PATH_DATA / "raw"
-    PATH_PROCCEDS = PATH_DATA / "processed"
+    PATH_PROCESSED = PATH_DATA / "processed"
     PATH_GENERATED = PATH_DATA / "generated"
 
-    return PATH_RAW, PATH_PROCCEDS, PATH_GENERATED # added in accordance with Sara's escwa branch
+    return PATH_RAW, PATH_PROCESSED, PATH_GENERATED # added in accordance with Sara's escwa branch
 
 
 def setup_artifacts_paths(PATH) -> Path:
diff --git a/models/electric_relaxation/artifacts/evaluation_metrics.py b/models/electric_relaxation/artifacts/evaluation_metrics.py
index 0e471c34..07a4e175 100644
--- a/models/electric_relaxation/artifacts/evaluation_metrics.py
+++ b/models/electric_relaxation/artifacts/evaluation_metrics.py
@@ -1 +1 @@
-evaluation_metrics = {'Mean Mean Squared Error': 0.002929262727152805, 'Mean Average Precision': 0.07515270506108203, 'Mean Brier Score': 0.002929262727152805}
\ No newline at end of file
+evaluation_metrics = {'Mean Mean Squared Error': 0.0029154554168954083, 'Mean Average Precision': 0.07515270506108203, 'Mean Brier Score': 0.0029154554168954083}
\ No newline at end of file
diff --git a/models/electric_relaxation/configs/config_model.py b/models/electric_relaxation/configs/config_model.py
index 5af67917..83ba6159 100644
--- a/models/electric_relaxation/configs/config_model.py
+++ b/models/electric_relaxation/configs/config_model.py
@@ -1,14 +1,16 @@
-def get_model_config():
+from sklearn.ensemble import RandomForestClassifier
+
+def get_meta_config():
     """
     Contains the common configuration settings for the model (model architecture, name, target variable, level of analysis and deployment status).
 
     Returns:
     - model_config (dict): A dictionary containing model configuration settings.
     """
-    model_config = {
+    meta_config = {
         "name": "electric_relaxation",
-        "algorithm": "RandomForestClassifier",
-        "depvar": "ged_sb_dep", #or target?
+        "algorithm": RandomForestClassifier,
+        "target": "ged_sb_dep", #or depvar
         "queryset": "escwa001_cflong",
         "level": "cm",
         "sweep": False,
@@ -17,4 +19,4 @@ def get_model_config():
         "deployment_status": "shadow", #unsure
         "creator": "Sara" #new addition, could be useful for managing maintenance & transfer of ownership
     }
-    return model_config #formerly common_config
\ No newline at end of file
+    return meta_config #formerly common_config and model_config
\ No newline at end of file
diff --git a/models/electric_relaxation/notebooks/paths.ipynb b/models/electric_relaxation/notebooks/paths.ipynb
new file mode 100644
index 00000000..3f6ac7be
--- /dev/null
+++ b/models/electric_relaxation/notebooks/paths.ipynb
@@ -0,0 +1,101 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "from pathlib import Path\n",
+    "import pandas as pd\n",
+    "import pickle\n",
+    "\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "\n",
+    "from stepshift.views import StepshiftedModels\n",
+    "from views_runs import DataPartitioner, ViewsRun\n",
+    "\n",
+    "PATH = Path.cwd() \n",
+    "sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index(\"views_pipeline\")+1]]) / \"common_utils\")) # PATH_COMMON_UTILS\n",
+    "from set_path import setup_project_paths, setup_artifacts_paths, setup_data_paths\n",
+    "setup_project_paths(PATH) #adds all necessary paths to sys.path\n",
+    "\n",
+    "from config_data_partitions import get_data_partitions #change to common_utils/set_partition.py\n",
+    "from config_hyperparameters import get_hp_config\n",
+    "from config_model import get_model_config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def train(model_config, hp_config, data_partitions):\n",
+    "    print(\"Training...\")\n",
+    "\n",
+    "    # Define the artifacts path manually or according to your notebook structure\n",
+    "    artifacts_path = Path(\"your_path_to_artifacts_directory\")\n",
+    "\n",
+    "    calib_pickle_path = artifacts_path / \"model_calibration_partition.pkl\"\n",
+    "    future_pickle_path = artifacts_path / \"model_future_partition.pkl\"\n",
+    "\n",
+    "    if calib_pickle_path.exists() and future_pickle_path.exists():\n",
+    "        print(\"Pickle files already exist. Loading models from pickle files...\")\n",
+    "        with open(calib_pickle_path, 'rb') as file:\n",
+    "            model_calibration_partition = pickle.load(file)\n",
+    "        with open(future_pickle_path, 'rb') as file:\n",
+    "            model_future_partition = pickle.load(file)\n",
+    "\n",
+    "    else:\n",
+    "        # Assuming you have loaded the dataset before calling this function\n",
+    "        dataset = \"models/electric_relaxation/data/raw/raw.parquet\" # Load your dataset here\n",
+    "\n",
+    "        calib_partition = DataPartitioner({'calib': data_partitions[\"calib_partitioner_dict\"]})\n",
+    "        future_partition = DataPartitioner({'future': data_partitions[\"future_partitioner_dict\"]})\n",
+    "\n",
+    "        base_model = RandomForestClassifier(n_estimators=hp_config[\"n_estimators\"], n_jobs=hp_config[\"n_jobs\"])\n",
+    "        stepshifter_def = StepshiftedModels(base_model, model_config[\"steps\"], model_config[\"depvar\"])\n",
+    "\n",
+    "        model_calibration_partition = ViewsRun(calib_partition, stepshifter_def)\n",
+    "        model_calibration_partition.fit('calib', 'train', dataset)\n",
+    "\n",
+    "        model_future_partition = ViewsRun(future_partition, stepshifter_def)\n",
+    "        model_future_partition.fit('future', 'train', dataset)\n",
+    "\n",
+    "        assert model_calibration_partition is not None and model_future_partition is not None, \"Model training failed.\"\n",
+    "\n",
+    "        with open(calib_pickle_path, 'wb') as file:\n",
+    "            pickle.dump(model_calibration_partition, file)\n",
+    "        with open(future_pickle_path, 'wb') as file:\n",
+    "            pickle.dump(model_future_partition, file)\n",
+    "\n",
+    "        print(\"Models trained and saved in artifacts folder!\")\n",
+    "\n",
+    "    return model_calibration_partition, model_future_partition\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "viewser",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/models/electric_relaxation/src/dataloaders/get_calibration_data.py b/models/electric_relaxation/src/dataloaders/get_calibration_data.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/models/electric_relaxation/src/dataloaders/get_forecasting_data.py b/models/electric_relaxation/src/dataloaders/get_forecasting_data.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/models/electric_relaxation/src/dataloaders/get_testing_data.py b/models/electric_relaxation/src/dataloaders/get_testing_data.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/models/electric_relaxation/src/forecasting/generate_forecast.py b/models/electric_relaxation/src/forecasting/generate_forecast.py
index 6dc0849f..e92e1fc0 100644
--- a/models/electric_relaxation/src/forecasting/generate_forecast.py
+++ b/models/electric_relaxation/src/forecasting/generate_forecast.py
@@ -4,15 +4,16 @@
 
 from views_runs import DataPartitioner
 
-model_path = Path(__file__).resolve().parents[2]
-sys.path.append(str(model_path))
-print(sys.path)
+PATH = Path(__file__)
+sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS
+from set_path import setup_project_paths, setup_data_paths, setup_artifacts_paths, setup_generated_data_path
+setup_project_paths(PATH)
 
-from configs.config_data_partitions import get_data_partitions
-from configs.config_hyperparameters import get_hp_config
-from configs.config_model import get_model_config
-from src.training.train_model import train
-from src.utils.set_paths import get_data_path, get_generated_data_path
+from config_data_partitions import get_data_partitions
+from config_hyperparameters import get_hp_config
+from config_model import get_model_config
+from train_model import train
+#from src.utils.set_paths import get_data_path, get_generated_data_path
 
 def forecast(data_partitions, model_calibration_partition, model_future_partition):
     """
@@ -31,18 +32,20 @@ def forecast(data_partitions, model_calibration_partition, model_future_partitio
 
     print("Generating forecasts...")
 
-    data = pd.read_parquet(get_data_path("raw"))
+    PATH_RAW, _, PATH_GENERATED = setup_data_paths(PATH)
+    PATH_ARTIFACTS = setup_artifacts_paths(PATH)
+    data = pd.read_parquet(PATH_RAW / 'raw.parquet')
 
     future_partitioner_dict = data_partitions["future_partitioner_dict"]
 
     calib_predictions = model_calibration_partition.predict('calib','predict',data, proba=True)
 
     future_partition = DataPartitioner({'future':future_partitioner_dict}) #is this being used? we don't define an equivalent for calib_predictions
 
     future_predictions = model_future_partition.future_predict('future','predict',data)
 
-    future_point_predictions = model_future_partition.future_point_predict(time=529, data=data, proba=True)
-
-    calib_predictions.to_parquet(get_generated_data_path("calibration"))
-    future_predictions.to_parquet(get_generated_data_path("future"))
-    future_point_predictions.to_parquet(get_generated_data_path("future_point"))
+    future_point_predictions = model_future_partition.future_point_predict(time=future_partitioner_dict['future_start'], data=data, proba=True)
+
+    calib_predictions.to_parquet(setup_generated_data_path(PATH, "calibration"))
+    future_predictions.to_parquet(setup_generated_data_path(PATH, "future"))
+    future_point_predictions.to_parquet(setup_generated_data_path(PATH, "future_point"))
 
     print("Forecasts generated and saved in data/generated!")
diff --git a/models/electric_relaxation/src/offline_evaluation/evaluate_model.py b/models/electric_relaxation/src/offline_evaluation/evaluate_model.py
index b147a3cf..e26ea549 100644
--- a/models/electric_relaxation/src/offline_evaluation/evaluate_model.py
+++ b/models/electric_relaxation/src/offline_evaluation/evaluate_model.py
@@ -6,9 +6,12 @@
 
 from sklearn.metrics import mean_squared_error, average_precision_score, roc_auc_score, brier_score_loss
 
-model_path = Path(__file__).resolve().parents[2]
-sys.path.append(str(model_path))
-from configs.config_model import get_model_config
+PATH = Path(__file__)
+sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS
+from set_path import setup_project_paths, setup_artifacts_paths, setup_data_paths
+setup_project_paths(PATH) #adds all necessary paths to sys.path
+
+from config_model import get_model_config
 
 
 def evaluate_model(model_config):
@@ -34,7 +37,10 @@
     """
     print("Evaluating...")
 
-    df_calib = pd.read_parquet(model_path/"data"/"generated"/"calibration_predictions.parquet")
+    PATH_MODEL, PATH_RAW, PATH_PROCESSED, PATH_GENERATED = setup_data_paths(PATH)
+
+    #df_calib = pd.read_parquet(model_path/"data"/"generated"/"calibration_predictions.parquet")
+    df_calib = pd.read_parquet(PATH_GENERATED / "calibration_predictions.parquet")
 
     steps = model_config["steps"]
     depvar = [model_config["depvar"]] #formerly stepcols, changed to depvar to also use in true_values
@@ -61,7 +67,7 @@ def evaluate_model(model_config):
         [row[col] for col in pred_cols]), axis=1)
     mean_brier_score = df_calib["brier_score"].mean()
 
-    metrics_dict_path = model_path / "artifacts" / "evaluation_metrics.py"
+    metrics_dict_path = PATH_MODEL / "artifacts" / "evaluation_metrics.py"
 
     evaluation_metrics_calib = {
         "Mean Mean Squared Error": mean_mse,
diff --git a/models/electric_relaxation/src/training/train_calibration_model.py b/models/electric_relaxation/src/training/train_calibration_model.py
deleted file mode 100644
index 8b137891..00000000
--- a/models/electric_relaxation/src/training/train_calibration_model.py
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/models/electric_relaxation/src/training/train_forecasting_model.py b/models/electric_relaxation/src/training/train_forecasting_model.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/models/electric_relaxation/src/training/train_model.py b/models/electric_relaxation/src/training/train_model.py
index d7397929..c2fe6ac5 100644
--- a/models/electric_relaxation/src/training/train_model.py
+++ b/models/electric_relaxation/src/training/train_model.py
@@ -3,20 +3,20 @@
 import pandas as pd
 import pickle
 
-from sklearn.ensemble import RandomForestClassifier
+#from sklearn.ensemble import RandomForestClassifier
 from stepshift.views import StepshiftedModels
 from views_runs import DataPartitioner, ViewsRun
 
-model_path = Path(__file__).resolve().parents[2]
-sys.path.append(str(model_path))
-print(sys.path)
+PATH = Path(__file__)
+sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS
+from set_path import setup_project_paths, setup_artifacts_paths, setup_data_paths
+setup_project_paths(PATH) #adds all necessary paths to sys.path
 
-from configs.config_data_partitions import get_data_partitions
-from configs.config_hyperparameters import get_hp_config
-from configs.config_model import get_model_config
-#from configs.config_sweep import get_sweep_config
-from src.utils.set_paths import get_data_path, get_artifacts_path
+from config_data_partitions import get_data_partitions #change to common_utils/set_partition.py
+from config_hyperparameters import get_hp_config
+from config_model import get_model_config
+#from config_sweep import get_sweep_config
 
 def train(model_config, hp_config, data_partitions):
     """
@@ -33,16 +33,17 @@
 
     Returns:
     - tuple: Trained models for calibration and future partitions.
+
+    Note:
+    - The 'artifacts' directory must exist in the system path for saving and loading pickle files.
+    - Ensure that the raw dataset is successfully loaded before proceeding with model training.
     """
 
    print("Training...")
 
-    #calib_pickle_path = get_artifacts_path("calibration") #not sure why code doesn't run well with these
-    #future_pickle_path = get_artifacts_path("forecast")
-    calib_pickle_path = model_path / "artifacts" / "model_calibration_partition.pkl"
-    future_pickle_path = model_path / "artifacts" / "model_future_partition.pkl"
-    print(calib_pickle_path)
-    print(future_pickle_path)
+    artifacts_path = setup_artifacts_paths(PATH)
+    calib_pickle_path = artifacts_path / "model_calibration_partition.pkl"
+    future_pickle_path = artifacts_path / "model_future_partition.pkl"
 
     if calib_pickle_path.exists() and future_pickle_path.exists():
         print("Pickle files already exist. Loading models from pickle files...")
@@ -52,12 +53,14 @@
             model_future_partition = pickle.load(file)
 
     else:
-        dataset = pd.read_parquet(get_data_path("raw"))
+        PATH_RAW, _, _ = setup_data_paths(PATH)
+        dataset = pd.read_parquet(PATH_RAW / 'raw.parquet')
         assert not dataset.empty, "Data loading failed."
 
         calib_partition = DataPartitioner({'calib': data_partitions["calib_partitioner_dict"]})
         future_partition = DataPartitioner({'future': data_partitions["future_partitioner_dict"]})
 
-        base_model = [model_config["algorithm"]](n_estimators=hp_config["n_estimators"], n_jobs=hp_config["n_jobs"])
+        #base_model = [model_config["algorithm"]](n_estimators=hp_config["n_estimators"], n_jobs=hp_config["n_jobs"])
+        base_model = model_config["algorithm"](n_estimators=hp_config["n_estimators"], n_jobs=hp_config["n_jobs"])
         stepshifter_def = StepshiftedModels(base_model, model_config["steps"], model_config["depvar"])
 
         model_calibration_partition = ViewsRun(calib_partition, stepshifter_def)
diff --git a/models/electric_relaxation/src/training/train_testing_model.py b/models/electric_relaxation/src/training/train_testing_model.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/models/electric_relaxation/src/utils/set_paths.py b/models/electric_relaxation/src/utils/set_paths.py
deleted file mode 100644
index 326453a3..00000000
--- a/models/electric_relaxation/src/utils/set_paths.py
+++ /dev/null
@@ -1,63 +0,0 @@
-#TBD: move to root/common_utils
-
-import sys
-from pathlib import Path
-
-
-def get_artifacts_path(partition_name):
-    '''
-    The artifacts are saved in src/artifacts/model_{partition_name}.pkl
-    '''
-
-    return Path(__file__).parent.parent.parent / "artifacts" / f"model_{partition_name}_partition.pkl"
-
-
-def get_data_path(data_name):
-    '''
-    E.g., The data is saved in data/raw/raw.parquet
-    '''
-
-    return Path(__file__).parent.parent.parent / "data" / f"{data_name}" / f"{data_name}.parquet"
-
-def get_generated_data_path(partition_name):
-    '''
-    The data is saved in data/generated/{partition_name}_predictions.parquet
-    '''
-
-    return Path(__file__).parent.parent.parent / "data" / "generated" / f"{partition_name}_predictions.parquet"
-
-def set_paths(): #not using this yet in code
-
-    """
-    Set the paths for various directories for the model, independently of (Mac, Linux) machine.
-    This structure assumes that this python script is located in root/models/example_model/src/utils.
-
-    Next development: implement this from root. Not sure how to make the selected model path work there, though.
-
-    Returns:
-    dict_values: A view object containing the values (paths) of the dictionary.
-    """
-
-    # Set the path to the root of the repo (for common configurations) and model (for model-specific configurations)
-    root_path = Path(__file__).resolve().parents[4] #4 folders up from this file (i.e., utils > src > model > models > root)
-    model_path = Path(__file__).resolve().parents[2] #2 folders up from this file (i.e., utils > src > model)
-
-    # Define relative paths in a dictionary
-    paths = {
-        'common_utils': root_path / 'common_utils',
-        'artifacts': model_path / 'src/artifacts',
-        'configs': model_path / 'src/configs',
-        'raw_data': model_path / 'src/data/raw',
-        'processed_data': model_path / 'src/data/processed',
-        'generated_data': model_path / 'src/data/generated',
-        'dataloaders': model_path / 'src/dataloaders',
-        'forecasting': model_path / 'src/forecasting',
-        'offline_evaluation': model_path / 'src/offline_evaluation',
-        'online_evaluation': model_path / 'src/online_evaluation',
-        'training': model_path / 'src/training',
-        'utils': model_path / 'src/utils',
-        'visualization': model_path / 'src/visualization',
-    }
-
-    return root_path, model_path, paths.values()
-
\ No newline at end of file