From 261cbc583fe5d94f7160d4e3ae18c880a93b52ad Mon Sep 17 00:00:00 2001 From: Sara Kallis Date: Thu, 28 Mar 2024 15:29:53 +0100 Subject: [PATCH 01/13] New path solution WIP --- common_utils/set_path.py | 25 +++++++- .../src/forecasting/generate_forecast.py | 25 ++++---- .../src/training/train_model.py | 29 +++++---- .../src/training/train_testing_model.py | 15 +++++ .../src/utils/set_paths.py | 63 ------------------- 5 files changed, 67 insertions(+), 90 deletions(-) delete mode 100644 models/electric_relaxation/src/utils/set_paths.py diff --git a/common_utils/set_path.py b/common_utils/set_path.py index 6ba2957d..e23229dd 100644 --- a/common_utils/set_path.py +++ b/common_utils/set_path.py @@ -53,7 +53,7 @@ def setup_project_paths(PATH) -> None: PATH_OFFLINE_EVALUATION = PATH_SRC / "offline_evaluation" PATH_DATALOADERS = PATH_SRC / "dataloaders" - paths_to_add = [PATH_ROOT, PATH_COMMON_UTILS, PATH_COMMON_CONFIGS, PATH_CONFIGS, PATH_UTILS, PATH_ARCHITECTURES, PATH_TRAINING, PATH_FORECASTING, PATH_OFFLINE_EVALUATION, PATH_DATALOADERS] + paths_to_add = [PATH_ROOT, PATH_COMMON_UTILS, PATH_COMMON_CONFIGS, PATH_CONFIGS, PATH_UTILS, PATH_ARCHITECTURES, PATH_TRAINING, PATH_FORECASTING, PATH_OFFLINE_EVALUATION, PATH_DATALOADERS, PATH_SRC] for path in paths_to_add: path_str = str(path) @@ -77,10 +77,10 @@ def setup_data_paths(PATH) -> None: PATH_DATA = PATH_MODEL / "data" PATH_RAW = PATH_DATA / "raw" - PATH_PROCCEDS = PATH_DATA / "processed" + PATH_PROCESSED = PATH_DATA / "processed" PATH_GENERATED = PATH_DATA / "generated" - return PATH_RAW, PATH_PROCCEDS, PATH_GENERATED + return PATH_RAW, PATH_PROCESSED, PATH_GENERATED def setup_artifacts_paths(PATH) -> None: @@ -99,3 +99,22 @@ def setup_artifacts_paths(PATH) -> None: PATH_ARTIFACTS = PATH_MODEL / "artifacts" # print(f"Artifacts path: {PATH_ARTIFACTS}") return PATH_ARTIFACTS + +def setup_generated_data_path(partition_name) -> Path: + """ + Set up the path to the generated data for a specific partition. + + Args: + partition_name (str): The name of the data partition. + + Returns: + Path: The path to the generated data file for the specified partition. + + Note: + - Temporary fix? 
+ """ + PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models")+2]]) # The +2 is to include the "models" and the individual model name in the path + + PATH_DATA_GENERATED = Path(PATH_MODEL / "data" / "generated" / f"{partition_name}_predictions.parquet") + print(f"Generated data path: {PATH_DATA_GENERATED}") + return PATH_DATA_GENERATED diff --git a/models/electric_relaxation/src/forecasting/generate_forecast.py b/models/electric_relaxation/src/forecasting/generate_forecast.py index 6dc0849f..fa270c08 100644 --- a/models/electric_relaxation/src/forecasting/generate_forecast.py +++ b/models/electric_relaxation/src/forecasting/generate_forecast.py @@ -4,15 +4,18 @@ from views_runs import DataPartitioner -model_path = Path(__file__).resolve().parents[2] -sys.path.append(str(model_path)) +PATH = Path(__file__) +print(PATH) +sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS print(sys.path) +from set_path import setup_project_paths, setup_data_paths, setup_generated_data_path +setup_project_paths(PATH) -from configs.config_data_partitions import get_data_partitions -from configs.config_hyperparameters import get_hp_config -from configs.config_model import get_model_config -from src.training.train_model import train -from src.utils.set_paths import get_data_path, get_generated_data_path +from config_data_partitions import get_data_partitions +from config_hyperparameters import get_hp_config +from config_model import get_model_config +from training.train_model import train +#from src.utils.set_paths import get_data_path, get_generated_data_path def forecast(data_partitions, model_calibration_partition, model_future_partition): """ @@ -31,7 +34,7 @@ def forecast(data_partitions, model_calibration_partition, model_future_partitio print("Generating forecasts...") - data = pd.read_parquet(get_data_path("raw")) + data = pd.read_parquet(setup_data_paths("raw")) #formerly get_data_path("raw") future_partitioner_dict = data_partitions["future_partitioner_dict"] calib_predictions = model_calibration_partition.predict('calib','predict',data, proba=True) @@ -40,9 +43,9 @@ def forecast(data_partitions, model_calibration_partition, model_future_partitio future_predictions = model_future_partition.future_predict('future','predict',data) future_point_predictions = model_future_partition.future_point_predict(time=529, data=data, proba=True) - calib_predictions.to_parquet(get_generated_data_path("calibration")) - future_predictions.to_parquet(get_generated_data_path("future")) - future_point_predictions.to_parquet(get_generated_data_path("future_point")) + calib_predictions.to_parquet(setup_generated_data_path("calibration")) + future_predictions.to_parquet(setup_generated_data_path("future")) + future_point_predictions.to_parquet(setup_generated_data_path("future_point")) print("Forecasts generated and saved in data/generated!") diff --git a/models/electric_relaxation/src/training/train_model.py b/models/electric_relaxation/src/training/train_model.py index d7397929..e146848a 100644 --- a/models/electric_relaxation/src/training/train_model.py +++ b/models/electric_relaxation/src/training/train_model.py @@ -8,15 +8,17 @@ from stepshift.views import StepshiftedModels from views_runs import DataPartitioner, ViewsRun -model_path = Path(__file__).resolve().parents[2] -sys.path.append(str(model_path)) +PATH = Path(__file__) +print(PATH) +sys.path.insert(0, str(Path(*[i for i in 
PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS print(sys.path) +from set_path import setup_project_paths, setup_data_paths +setup_project_paths(PATH) -from configs.config_data_partitions import get_data_partitions -from configs.config_hyperparameters import get_hp_config -from configs.config_model import get_model_config +from config_data_partitions import get_data_partitions +from config_hyperparameters import get_hp_config +from config_model import get_model_config #from configs.config_sweep import get_sweep_config -from src.utils.set_paths import get_data_path, get_artifacts_path def train(model_config, hp_config, data_partitions): """ @@ -33,16 +35,17 @@ def train(model_config, hp_config, data_partitions): Returns: - tuple: Trained models for calibration and future partitions. + + Note: + - The 'artifacts' directory must exist in the system path for saving and loading pickle files. + - Ensure that the raw dataset is successfully loaded before proceeding with model training. """ print("Training...") - #calib_pickle_path = get_artifacts_path("calibration") #not sure why code doesn't run well with these - #future_pickle_path = get_artifacts_path("forecast") - calib_pickle_path = model_path / "artifacts" / "model_calibration_partition.pkl" - future_pickle_path = model_path / "artifacts" / "model_future_partition.pkl" - print(calib_pickle_path) - print(future_pickle_path) + PATH_ARTIFACTS = [i for i in sys.path if "artifacts" in i][0] + calib_pickle_path = Path(PATH_ARTIFACTS) / "model_calibration_partition.pkl" + future_pickle_path = Path(PATH_ARTIFACTS) / "model_future_partition.pkl" if calib_pickle_path.exists() and future_pickle_path.exists(): print("Pickle files already exist. Loading models from pickle files...") @@ -52,7 +55,7 @@ def train(model_config, hp_config, data_partitions): model_future_partition = pickle.load(file) else: - dataset = pd.read_parquet(get_data_path("raw")) + dataset = pd.read_parquet(setup_data_paths("raw")) #formerly get_data_path("raw") assert not dataset.empty, "Data loading failed." 
calib_partition = DataPartitioner({'calib': data_partitions["calib_partitioner_dict"]}) diff --git a/models/electric_relaxation/src/training/train_testing_model.py b/models/electric_relaxation/src/training/train_testing_model.py index e69de29b..74a88526 100644 --- a/models/electric_relaxation/src/training/train_testing_model.py +++ b/models/electric_relaxation/src/training/train_testing_model.py @@ -0,0 +1,15 @@ +import sys +from pathlib import Path + + +PATH = Path(__file__) +sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS +#'/Users/sarakallis/Documents/PRIO Local/views_pipeline/common_utils', '/Users/sarakallis/Documents/PRIO Local/views_pipeline/models/electric_relaxation/src/training' + +from set_path import setup_project_paths +setup_project_paths(PATH) + +PATH_ARTIFACTS = [i for i in sys.path if "artifacts" in i][0] # this is a list with one element (a str), so I can just index it with 0 + +calib_pickle_path = PATH_ARTIFACTS + "/model_calibration_partition.pkl" +print(calib_pickle_path) \ No newline at end of file diff --git a/models/electric_relaxation/src/utils/set_paths.py b/models/electric_relaxation/src/utils/set_paths.py deleted file mode 100644 index 326453a3..00000000 --- a/models/electric_relaxation/src/utils/set_paths.py +++ /dev/null @@ -1,63 +0,0 @@ -#TBD: move to root/common_utils - -import sys -from pathlib import Path - - -def get_artifacts_path(partition_name): - ''' - The artifacts are saved in src/artifacts/model_{partition_name}.pkl - ''' - - return Path(__file__).parent.parent.parent / "artifacts" / f"model_{partition_name}_partition.pkl" - - -def get_data_path(data_name): - ''' - E.g., The data is saved in data/raw/raw.parquet - ''' - - return Path(__file__).parent.parent.parent / "data" / f"{data_name}" / f"{data_name}.parquet" - -def get_generated_data_path(partition_name): - ''' - The data is saved in data/generated/{partition_name}_predictions.parquet - ''' - - return Path(__file__).parent.parent.parent / "data" / "generated" / f"{partition_name}_predictions.parquet" - -def set_paths(): #not using this yet in code - - """ - Set the paths for various directories for the model, independently of (Mac, Linux) machine. - This structure assumes that this python script is located in root/models/example_model/src/utils. - - Next development: implement this from root. Not sure how to make the selected model path work there, though. - - Returns: - dict_values: A view object containing the values (paths) of the dictionary. 
- """ - - # Set the path to the root of the repo (for common configurations) and model (for model-specific configurations) - root_path = Path(__file__).resolve().parents[4] #4 folders up from this file (i.e., utils > src > model > models > root) - model_path = Path(__file__).resolve().parents[2] #2 folders up from this file (i.e., utils > src > model) - - # Define relative paths in a dictionary - paths = { - 'common_utils': root_path / 'common_utils', - 'artifacts': model_path / 'src/artifacts', - 'configs': model_path / 'src/configs', - 'raw_data': model_path / 'src/data/raw', - 'processed_data': model_path / 'src/data/processed', - 'generated_data': model_path / 'src/data/generated', - 'dataloaders': model_path / 'src/dataloaders', - 'forecasting': model_path / 'src/forecasting', - 'offline_evaluation': model_path / 'src/offline_evaluation', - 'online_evaluation': model_path / 'src/online_evaluation', - 'training': model_path / 'src/training', - 'utils': model_path / 'src/utils', - 'visualization': model_path / 'src/visualization', - } - - return root_path, model_path, paths.values() - \ No newline at end of file From a95438e98649c45838db9e46df2fbabd75b42d49 Mon Sep 17 00:00:00 2001 From: Sara Kallis Date: Mon, 8 Apr 2024 08:03:42 +0200 Subject: [PATCH 02/13] Start new paths solution --- common_utils/set_path.py | 6 +- .../electric_relaxation/notebooks/paths.ipynb | 101 ++++++++++++++++++ .../src/dataloaders/get_calibration_data.py | 0 .../src/dataloaders/get_forecasting_data.py | 0 .../src/dataloaders/get_testing_data.py | 0 .../src/training/train_model.py | 19 ++-- .../src/training/train_testing_model.py | 15 --- 7 files changed, 112 insertions(+), 29 deletions(-) create mode 100644 models/electric_relaxation/notebooks/paths.ipynb delete mode 100644 models/electric_relaxation/src/dataloaders/get_calibration_data.py delete mode 100644 models/electric_relaxation/src/dataloaders/get_forecasting_data.py delete mode 100644 models/electric_relaxation/src/dataloaders/get_testing_data.py diff --git a/common_utils/set_path.py b/common_utils/set_path.py index e23229dd..025ea3cd 100644 --- a/common_utils/set_path.py +++ b/common_utils/set_path.py @@ -92,10 +92,8 @@ def setup_artifacts_paths(PATH) -> None: PATH (Path): The base path, typically the path of the script invoking this function (i.e., `Path(__file__)`). config (str): The model configuration file. 
- """ - - PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models")+2]]) # The +2 is to include the "models" and the individual model name in the path - + """ + PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models")+2]]) PATH_ARTIFACTS = PATH_MODEL / "artifacts" # print(f"Artifacts path: {PATH_ARTIFACTS}") return PATH_ARTIFACTS diff --git a/models/electric_relaxation/notebooks/paths.ipynb b/models/electric_relaxation/notebooks/paths.ipynb new file mode 100644 index 00000000..3f6ac7be --- /dev/null +++ b/models/electric_relaxation/notebooks/paths.ipynb @@ -0,0 +1,101 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "import pandas as pd\n", + "import pickle\n", + "\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "from stepshift.views import StepshiftedModels\n", + "from views_runs import DataPartitioner, ViewsRun\n", + "\n", + "PATH = Path.cwd() \n", + "sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index(\"views_pipeline\")+1]]) / \"common_utils\")) # PATH_COMMON_UTILS\n", + "from set_path import setup_project_paths, setup_artifacts_paths, setup_data_paths\n", + "setup_project_paths(PATH) #adds all necessary paths to sys.path\n", + "\n", + "from config_data_partitions import get_data_partitions #change to common_utils/set_partition.py\n", + "from config_hyperparameters import get_hp_config\n", + "from config_model import get_model_config" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def train(model_config, hp_config, data_partitions):\n", + " print(\"Training...\")\n", + "\n", + " # Define the artifacts path manually or according to your notebook structure\n", + " artifacts_path = Path(\"your_path_to_artifacts_directory\")\n", + "\n", + " calib_pickle_path = artifacts_path / \"model_calibration_partition.pkl\"\n", + " future_pickle_path = artifacts_path / \"model_future_partition.pkl\"\n", + "\n", + " if calib_pickle_path.exists() and future_pickle_path.exists():\n", + " print(\"Pickle files already exist. 
Loading models from pickle files...\")\n", + " with open(calib_pickle_path, 'rb') as file:\n", + " model_calibration_partition = pickle.load(file)\n", + " with open(future_pickle_path, 'rb') as file:\n", + " model_future_partition = pickle.load(file)\n", + "\n", + " else:\n", + " # Assuming you have loaded the dataset before calling this function\n", + " dataset = \"models/electric_relaxation/data/raw/raw.parquet\" # Load your dataset here\n", + "\n", + " calib_partition = DataPartitioner({'calib': data_partitions[\"calib_partitioner_dict\"]})\n", + " future_partition = DataPartitioner({'future': data_partitions[\"future_partitioner_dict\"]})\n", + "\n", + " base_model = RandomForestClassifier(n_estimators=hp_config[\"n_estimators\"], n_jobs=hp_config[\"n_jobs\"])\n", + " stepshifter_def = StepshiftedModels(base_model, model_config[\"steps\"], model_config[\"depvar\"])\n", + "\n", + " model_calibration_partition = ViewsRun(calib_partition, stepshifter_def)\n", + " model_calibration_partition.fit('calib', 'train', dataset)\n", + "\n", + " model_future_partition = ViewsRun(future_partition, stepshifter_def)\n", + " model_future_partition.fit('future', 'train', dataset)\n", + "\n", + " assert model_calibration_partition is not None and model_future_partition is not None, \"Model training failed.\"\n", + "\n", + " with open(calib_pickle_path, 'wb') as file:\n", + " pickle.dump(model_calibration_partition, file)\n", + " with open(future_pickle_path, 'wb') as file:\n", + " pickle.dump(model_future_partition, file)\n", + "\n", + " print(\"Models trained and saved in artifacts folder!\")\n", + "\n", + " return model_calibration_partition, model_future_partition\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "viewser", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/models/electric_relaxation/src/dataloaders/get_calibration_data.py b/models/electric_relaxation/src/dataloaders/get_calibration_data.py deleted file mode 100644 index e69de29b..00000000 diff --git a/models/electric_relaxation/src/dataloaders/get_forecasting_data.py b/models/electric_relaxation/src/dataloaders/get_forecasting_data.py deleted file mode 100644 index e69de29b..00000000 diff --git a/models/electric_relaxation/src/dataloaders/get_testing_data.py b/models/electric_relaxation/src/dataloaders/get_testing_data.py deleted file mode 100644 index e69de29b..00000000 diff --git a/models/electric_relaxation/src/training/train_model.py b/models/electric_relaxation/src/training/train_model.py index e146848a..19a81329 100644 --- a/models/electric_relaxation/src/training/train_model.py +++ b/models/electric_relaxation/src/training/train_model.py @@ -9,16 +9,14 @@ from views_runs import DataPartitioner, ViewsRun PATH = Path(__file__) -print(PATH) sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS -print(sys.path) -from set_path import setup_project_paths, setup_data_paths -setup_project_paths(PATH) +from set_path import setup_project_paths, setup_artifacts_paths, setup_data_paths +setup_project_paths(PATH) #adds all necessary paths to sys.path -from config_data_partitions import get_data_partitions +from config_data_partitions 
import get_data_partitions #change to common_utils/set_partition.py from config_hyperparameters import get_hp_config from config_model import get_model_config -#from configs.config_sweep import get_sweep_config +#from config_sweep import get_sweep_config def train(model_config, hp_config, data_partitions): """ @@ -43,9 +41,9 @@ def train(model_config, hp_config, data_partitions): print("Training...") - PATH_ARTIFACTS = [i for i in sys.path if "artifacts" in i][0] - calib_pickle_path = Path(PATH_ARTIFACTS) / "model_calibration_partition.pkl" - future_pickle_path = Path(PATH_ARTIFACTS) / "model_future_partition.pkl" + artifacts_path = setup_artifacts_paths() + calib_pickle_path = artifacts_path / "model_calibration_partition.pkl" + future_pickle_path = artifacts_path / "model_future_partition.pkl" if calib_pickle_path.exists() and future_pickle_path.exists(): print("Pickle files already exist. Loading models from pickle files...") @@ -55,7 +53,8 @@ def train(model_config, hp_config, data_partitions): model_future_partition = pickle.load(file) else: - dataset = pd.read_parquet(setup_data_paths("raw")) #formerly get_data_path("raw") + setup_data_paths(Path(__file__)) + dataset = pd.read_parquet("raw") # Load from raw data path assert not dataset.empty, "Data loading failed." calib_partition = DataPartitioner({'calib': data_partitions["calib_partitioner_dict"]}) diff --git a/models/electric_relaxation/src/training/train_testing_model.py b/models/electric_relaxation/src/training/train_testing_model.py index 74a88526..e69de29b 100644 --- a/models/electric_relaxation/src/training/train_testing_model.py +++ b/models/electric_relaxation/src/training/train_testing_model.py @@ -1,15 +0,0 @@ -import sys -from pathlib import Path - - -PATH = Path(__file__) -sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS -#'/Users/sarakallis/Documents/PRIO Local/views_pipeline/common_utils', '/Users/sarakallis/Documents/PRIO Local/views_pipeline/models/electric_relaxation/src/training' - -from set_path import setup_project_paths -setup_project_paths(PATH) - -PATH_ARTIFACTS = [i for i in sys.path if "artifacts" in i][0] # this is a list with one element (a str), so I can just index it with 0 - -calib_pickle_path = PATH_ARTIFACTS + "/model_calibration_partition.pkl" -print(calib_pickle_path) \ No newline at end of file From 77bc3a7f020cf1a516acf96d830743991a511c8a Mon Sep 17 00:00:00 2001 From: Sara Kallis Date: Fri, 12 Apr 2024 16:41:19 +0200 Subject: [PATCH 03/13] Fix training script --- .../electric_relaxation/src/training/train_calibration_model.py | 1 - .../electric_relaxation/src/training/train_forecasting_model.py | 0 models/electric_relaxation/src/training/train_model.py | 2 +- models/electric_relaxation/src/training/train_testing_model.py | 0 4 files changed, 1 insertion(+), 2 deletions(-) delete mode 100644 models/electric_relaxation/src/training/train_calibration_model.py delete mode 100644 models/electric_relaxation/src/training/train_forecasting_model.py delete mode 100644 models/electric_relaxation/src/training/train_testing_model.py diff --git a/models/electric_relaxation/src/training/train_calibration_model.py b/models/electric_relaxation/src/training/train_calibration_model.py deleted file mode 100644 index 8b137891..00000000 --- a/models/electric_relaxation/src/training/train_calibration_model.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/models/electric_relaxation/src/training/train_forecasting_model.py 
b/models/electric_relaxation/src/training/train_forecasting_model.py deleted file mode 100644 index e69de29b..00000000 diff --git a/models/electric_relaxation/src/training/train_model.py b/models/electric_relaxation/src/training/train_model.py index 19a81329..4fd3122f 100644 --- a/models/electric_relaxation/src/training/train_model.py +++ b/models/electric_relaxation/src/training/train_model.py @@ -41,7 +41,7 @@ def train(model_config, hp_config, data_partitions): print("Training...") - artifacts_path = setup_artifacts_paths() + artifacts_path = setup_artifacts_paths(PATH) calib_pickle_path = artifacts_path / "model_calibration_partition.pkl" future_pickle_path = artifacts_path / "model_future_partition.pkl" diff --git a/models/electric_relaxation/src/training/train_testing_model.py b/models/electric_relaxation/src/training/train_testing_model.py deleted file mode 100644 index e69de29b..00000000 From 69ede05d1ffa467046f9030f674cef22223de737 Mon Sep 17 00:00:00 2001 From: Sara Kallis Date: Fri, 12 Apr 2024 17:21:03 +0200 Subject: [PATCH 04/13] Update set_path.py --- common_utils/set_path.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/common_utils/set_path.py b/common_utils/set_path.py index 025ea3cd..431c7375 100644 --- a/common_utils/set_path.py +++ b/common_utils/set_path.py @@ -98,18 +98,26 @@ def setup_artifacts_paths(PATH) -> None: # print(f"Artifacts path: {PATH_ARTIFACTS}") return PATH_ARTIFACTS -def setup_generated_data_path(partition_name) -> Path: +def setup_generated_data_path(PATH, partition_name) -> Path: """ Set up the path to the generated data for a specific partition. Args: partition_name (str): The name of the data partition. + PATH (Path): The base path, typically the path of the script invoking this function (i.e., `Path(__file__)`). Returns: Path: The path to the generated data file for the specified partition. - Note: - - Temporary fix? + Usage Example: + ```python + PATH = Path(__file__) + sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS + from set_path import setup_project_paths, setup_generated_data_path + setup_project_paths(PATH) + setup_generated_data_path(PATH, "calibration") + ``` + """ PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models")+2]]) # The +2 is to include the "models" and the individual model name in the path From 067f6e92ef5a002b47cbc74236cd894496a9a265 Mon Sep 17 00:00:00 2001 From: Sara Kallis Date: Fri, 12 Apr 2024 17:21:10 +0200 Subject: [PATCH 05/13] Update config_model.py --- models/electric_relaxation/configs/config_model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/models/electric_relaxation/configs/config_model.py b/models/electric_relaxation/configs/config_model.py index 5af67917..99a29bc9 100644 --- a/models/electric_relaxation/configs/config_model.py +++ b/models/electric_relaxation/configs/config_model.py @@ -1,3 +1,5 @@ +from sklearn.ensemble import RandomForestClassifier + def get_model_config(): """ Contains the common configuration settings for the model (model architecture, name, target variable, level of analysis and deployment status). @@ -7,7 +9,7 @@ def get_model_config(): """ model_config = { "name": "electric_relaxation", - "algorithm": "RandomForestClassifier", + "algorithm": RandomForestClassifier, "depvar": "ged_sb_dep", #or target? 
"queryset": "escwa001_cflong", "level": "cm", From cb09a903e8cc263fc65cdc3bbc7bdb15873cf582 Mon Sep 17 00:00:00 2001 From: Sara Kallis Date: Fri, 12 Apr 2024 17:21:20 +0200 Subject: [PATCH 06/13] Update train_model.py --- models/electric_relaxation/src/training/train_model.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/models/electric_relaxation/src/training/train_model.py b/models/electric_relaxation/src/training/train_model.py index 4fd3122f..c2fe6ac5 100644 --- a/models/electric_relaxation/src/training/train_model.py +++ b/models/electric_relaxation/src/training/train_model.py @@ -3,7 +3,7 @@ import pandas as pd import pickle -from sklearn.ensemble import RandomForestClassifier +#from sklearn.ensemble import RandomForestClassifier from stepshift.views import StepshiftedModels from views_runs import DataPartitioner, ViewsRun @@ -53,13 +53,14 @@ def train(model_config, hp_config, data_partitions): model_future_partition = pickle.load(file) else: - setup_data_paths(Path(__file__)) - dataset = pd.read_parquet("raw") # Load from raw data path + PATH_RAW, _, _ = setup_data_paths(PATH) + dataset = pd.read_parquet(PATH_RAW / 'raw.parquet') assert not dataset.empty, "Data loading failed." calib_partition = DataPartitioner({'calib': data_partitions["calib_partitioner_dict"]}) future_partition = DataPartitioner({'future': data_partitions["future_partitioner_dict"]}) - base_model = [model_config["algorithm"]](n_estimators=hp_config["n_estimators"], n_jobs=hp_config["n_jobs"]) + #base_model = [model_config["algorithm"]](n_estimators=hp_config["n_estimators"], n_jobs=hp_config["n_jobs"]) + base_model = model_config["algorithm"](n_estimators=hp_config["n_estimators"], n_jobs=hp_config["n_jobs"]) stepshifter_def = StepshiftedModels(base_model, model_config["steps"], model_config["depvar"]) model_calibration_partition = ViewsRun(calib_partition, stepshifter_def) From 1ac2ec417f533387910ec626ae3a958adb70025f Mon Sep 17 00:00:00 2001 From: Sara Kallis Date: Fri, 12 Apr 2024 17:21:34 +0200 Subject: [PATCH 07/13] Update generate_forecast.py --- .../src/forecasting/generate_forecast.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/models/electric_relaxation/src/forecasting/generate_forecast.py b/models/electric_relaxation/src/forecasting/generate_forecast.py index fa270c08..8260be13 100644 --- a/models/electric_relaxation/src/forecasting/generate_forecast.py +++ b/models/electric_relaxation/src/forecasting/generate_forecast.py @@ -5,16 +5,14 @@ from views_runs import DataPartitioner PATH = Path(__file__) -print(PATH) sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS -print(sys.path) -from set_path import setup_project_paths, setup_data_paths, setup_generated_data_path +from set_path import setup_project_paths, setup_data_paths, setup_artifacts_paths, setup_generated_data_path setup_project_paths(PATH) from config_data_partitions import get_data_partitions from config_hyperparameters import get_hp_config from config_model import get_model_config -from training.train_model import train +from train_model import train #from src.utils.set_paths import get_data_path, get_generated_data_path def forecast(data_partitions, model_calibration_partition, model_future_partition): @@ -34,7 +32,9 @@ def forecast(data_partitions, model_calibration_partition, model_future_partitio print("Generating forecasts...") - data = pd.read_parquet(setup_data_paths("raw")) #formerly 
get_data_path("raw") + PATH_RAW, _, PATH_GENERATED = setup_data_paths(PATH) + PATH_ARTIFACTS = setup_artifacts_paths(PATH) + data = pd.read_parquet(PATH_RAW / 'raw.parquet') future_partitioner_dict = data_partitions["future_partitioner_dict"] calib_predictions = model_calibration_partition.predict('calib','predict',data, proba=True) @@ -43,9 +43,9 @@ def forecast(data_partitions, model_calibration_partition, model_future_partitio future_predictions = model_future_partition.future_predict('future','predict',data) future_point_predictions = model_future_partition.future_point_predict(time=529, data=data, proba=True) - calib_predictions.to_parquet(setup_generated_data_path("calibration")) - future_predictions.to_parquet(setup_generated_data_path("future")) - future_point_predictions.to_parquet(setup_generated_data_path("future_point")) + calib_predictions.to_parquet(setup_generated_data_path(PATH, "calibration")) + future_predictions.to_parquet(setup_generated_data_path(PATH, "future")) + future_point_predictions.to_parquet(setup_generated_data_path(PATH, "future_point")) print("Forecasts generated and saved in data/generated!") From 8223931601aa9e20fdf8de7049aad12e154aa4c9 Mon Sep 17 00:00:00 2001 From: Sara Kallis Date: Fri, 12 Apr 2024 17:28:39 +0200 Subject: [PATCH 08/13] Add PATH_MODEL to output --- common_utils/set_path.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_utils/set_path.py b/common_utils/set_path.py index 431c7375..5e49c9a4 100644 --- a/common_utils/set_path.py +++ b/common_utils/set_path.py @@ -80,7 +80,7 @@ def setup_data_paths(PATH) -> None: PATH_PROCESSED = PATH_DATA / "processed" PATH_GENERATED = PATH_DATA / "generated" - return PATH_RAW, PATH_PROCESSED, PATH_GENERATED + return PATH_MODEL, PATH_RAW, PATH_PROCESSED, PATH_GENERATED def setup_artifacts_paths(PATH) -> None: From c109228e1c32eb92ff27d0771b0e72b2ee074fab Mon Sep 17 00:00:00 2001 From: Sara Kallis Date: Fri, 12 Apr 2024 17:28:55 +0200 Subject: [PATCH 09/13] Update evaluate_model.py --- .../src/offline_evaluation/evaluate_model.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/models/electric_relaxation/src/offline_evaluation/evaluate_model.py b/models/electric_relaxation/src/offline_evaluation/evaluate_model.py index b147a3cf..e26ea549 100644 --- a/models/electric_relaxation/src/offline_evaluation/evaluate_model.py +++ b/models/electric_relaxation/src/offline_evaluation/evaluate_model.py @@ -6,9 +6,12 @@ from sklearn.metrics import mean_squared_error, average_precision_score, roc_auc_score, brier_score_loss -model_path = Path(__file__).resolve().parents[2] -sys.path.append(str(model_path)) -from configs.config_model import get_model_config +PATH = Path(__file__) +sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths, setup_artifacts_paths, setup_data_paths +setup_project_paths(PATH) #adds all necessary paths to sys.path + +from config_model import get_model_config def evaluate_model(model_config): @@ -34,7 +37,10 @@ def evaluate_model(model_config): """ print("Evaluating...") - df_calib = pd.read_parquet(model_path/"data"/"generated"/"calibration_predictions.parquet") + PATH_MODEL, PATH_RAW, PATH_PROCESSED, PATH_GENERATED = setup_data_paths(PATH) + + #df_calib = pd.read_parquet(model_path/"data"/"generated"/"calibration_predictions.parquet") + df_calib = pd.read_parquet(PATH_GENERATED / "calibration_predictions.parquet") steps = 
model_config["steps"] depvar = [model_config["depvar"]] #formerly stepcols, changed to depvar to also use in true_values @@ -61,7 +67,7 @@ def evaluate_model(model_config): [row[col] for col in pred_cols]), axis=1) mean_brier_score = df_calib["brier_score"].mean() - metrics_dict_path = model_path / "artifacts" / "evaluation_metrics.py" + metrics_dict_path = PATH_MODEL / "artifacts" / "evaluation_metrics.py" evaluation_metrics_calib = { "Mean Mean Squared Error": mean_mse, From fdb3f2c00a04c527890e7c621590346814778b39 Mon Sep 17 00:00:00 2001 From: Sara Kallis Date: Fri, 12 Apr 2024 17:28:58 +0200 Subject: [PATCH 10/13] Update evaluation_metrics.py --- models/electric_relaxation/artifacts/evaluation_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/electric_relaxation/artifacts/evaluation_metrics.py b/models/electric_relaxation/artifacts/evaluation_metrics.py index 0e471c34..07a4e175 100644 --- a/models/electric_relaxation/artifacts/evaluation_metrics.py +++ b/models/electric_relaxation/artifacts/evaluation_metrics.py @@ -1 +1 @@ -evaluation_metrics = {'Mean Mean Squared Error': 0.002929262727152805, 'Mean Average Precision': 0.07515270506108203, 'Mean Brier Score': 0.002929262727152805} \ No newline at end of file +evaluation_metrics = {'Mean Mean Squared Error': 0.0029154554168954083, 'Mean Average Precision': 0.07515270506108203, 'Mean Brier Score': 0.0029154554168954083} \ No newline at end of file From 2031c08715786ceb1ce4fab87e86070c5b8af235 Mon Sep 17 00:00:00 2001 From: Sara Kallis Date: Tue, 16 Apr 2024 10:34:44 +0200 Subject: [PATCH 11/13] Remove hardcoded month Thanks to @xiaolong0728 for pointing out --- .../electric_relaxation/src/forecasting/generate_forecast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/electric_relaxation/src/forecasting/generate_forecast.py b/models/electric_relaxation/src/forecasting/generate_forecast.py index 8260be13..e92e1fc0 100644 --- a/models/electric_relaxation/src/forecasting/generate_forecast.py +++ b/models/electric_relaxation/src/forecasting/generate_forecast.py @@ -41,8 +41,8 @@ def forecast(data_partitions, model_calibration_partition, model_future_partitio future_partition = DataPartitioner({'future':future_partitioner_dict}) #is this being used? 
we don't define an equivalent for calib_predictions future_predictions = model_future_partition.future_predict('future','predict',data) - future_point_predictions = model_future_partition.future_point_predict(time=529, data=data, proba=True) - + future_point_predictions = model_future_partition.future_point_predict(time=future_partitioner_dict['future_start'], data=data, proba=True) + calib_predictions.to_parquet(setup_generated_data_path(PATH, "calibration")) future_predictions.to_parquet(setup_generated_data_path(PATH, "future")) future_point_predictions.to_parquet(setup_generated_data_path(PATH, "future_point")) From a8f05f0aaea513fb1fa355bb01f8c6c41450d9dc Mon Sep 17 00:00:00 2001 From: Sara Kallis Date: Thu, 20 Jun 2024 16:39:18 +0200 Subject: [PATCH 12/13] Typo --- common_utils/set_path.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common_utils/set_path.py b/common_utils/set_path.py index 9713d0c3..b949bffb 100644 --- a/common_utils/set_path.py +++ b/common_utils/set_path.py @@ -118,10 +118,10 @@ def setup_data_paths(PATH) -> Path: PATH_DATA = PATH_MODEL / "data" PATH_RAW = PATH_DATA / "raw" - PATH_PROCCEDS = PATH_DATA / "processed" + PATH_PROCESSED = PATH_DATA / "processed" PATH_GENERATED = PATH_DATA / "generated" - return PATH_RAW, PATH_PROCCEDS, PATH_GENERATED # added in accordance with Sara's escwa branch + return PATH_RAW, PATH_PROCESSED, PATH_GENERATED # added in accordance with Sara's escwa branch def setup_artifacts_paths(PATH) -> Path: From c9011f13a1be97acd4ccfdee67cd55f46072264b Mon Sep 17 00:00:00 2001 From: Sara Kallis Date: Thu, 20 Jun 2024 16:41:39 +0200 Subject: [PATCH 13/13] Meta_config --- models/electric_relaxation/configs/config_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/models/electric_relaxation/configs/config_model.py b/models/electric_relaxation/configs/config_model.py index 99a29bc9..83ba6159 100644 --- a/models/electric_relaxation/configs/config_model.py +++ b/models/electric_relaxation/configs/config_model.py @@ -1,16 +1,16 @@ from sklearn.ensemble import RandomForestClassifier -def get_model_config(): +def get_meta_config(): """ Contains the common configuration settings for the model (model architecture, name, target variable, level of analysis and deployment status). Returns: - model_config (dict): A dictionary containing model configuration settings. """ - model_config = { + meta_config = { "name": "electric_relaxation", "algorithm": RandomForestClassifier, - "depvar": "ged_sb_dep", #or target? + "target": "ged_sb_dep", #or depvar "queryset": "escwa001_cflong", "level": "cm", "sweep": False, @@ -19,4 +19,4 @@ def get_model_config(): "deployment_status": "shadow", #unsure "creator": "Sara" #new addition, could be useful for managing maintenance & transfer of ownership } - return model_config #formerly common_config \ No newline at end of file + return meta_config #formerly common_config and model_config \ No newline at end of file
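
---

Taken together, the series replaces the per-model `src/utils/set_paths.py` helpers with the shared `common_utils/set_path.py` module. Below is a minimal sketch of how an entry-point script is expected to wire up the consolidated helpers once these patches are applied; it is illustrative rather than part of the series. The script location and the "calibration" partition name are placeholders, and the unpacking of `setup_data_paths` assumes the three-value return shown in the final patch (patches 08 and 09 unpack four values, so verify the merged state of `set_path.py`).

```python
# Usage sketch (not part of the patch series): wiring a model script to the
# consolidated path helpers in common_utils/set_path.py.
import sys
from pathlib import Path

PATH = Path(__file__)  # assumes the script lives somewhere under a "views_pipeline" directory

# Make common_utils importable by walking up to the views_pipeline root,
# mirroring the insert repeated at the top of each script in this series.
sys.path.insert(0, str(Path(*PATH.parts[:PATH.parts.index("views_pipeline") + 1]) / "common_utils"))

from set_path import (
    setup_project_paths,
    setup_data_paths,
    setup_artifacts_paths,
    setup_generated_data_path,
)

setup_project_paths(PATH)  # adds configs/, training/, forecasting/, etc. to sys.path

# Three-value return per the last patch; earlier patches in the series return
# PATH_MODEL as a fourth element, so adjust the unpacking to the merged version.
PATH_RAW, PATH_PROCESSED, PATH_GENERATED = setup_data_paths(PATH)
PATH_ARTIFACTS = setup_artifacts_paths(PATH)

# e.g. .../data/generated/calibration_predictions.parquet
calib_predictions_path = setup_generated_data_path(PATH, "calibration")
```

The initial `sys.path.insert` has to be repeated at the top of every script because `common_utils` is not an installed package: `set_path` itself must be reachable on `sys.path` before `setup_project_paths` can add the remaining model directories.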