Add new Path solution to ESCWA model #23

Open · wants to merge 17 commits into base: production
39 changes: 32 additions & 7 deletions common_utils/set_path.py
@@ -53,7 +53,7 @@ def setup_project_paths(PATH) -> None:
PATH_OFFLINE_EVALUATION = PATH_SRC / "offline_evaluation"
PATH_DATALOADERS = PATH_SRC / "dataloaders"

paths_to_add = [PATH_ROOT, PATH_COMMON_UTILS, PATH_COMMON_CONFIGS, PATH_CONFIGS, PATH_UTILS, PATH_ARCHITECTURES, PATH_TRAINING, PATH_FORECASTING, PATH_OFFLINE_EVALUATION, PATH_DATALOADERS]
paths_to_add = [PATH_ROOT, PATH_COMMON_UTILS, PATH_COMMON_CONFIGS, PATH_CONFIGS, PATH_UTILS, PATH_ARCHITECTURES, PATH_TRAINING, PATH_FORECASTING, PATH_OFFLINE_EVALUATION, PATH_DATALOADERS, PATH_SRC]

for path in paths_to_add:
path_str = str(path)
@@ -77,10 +77,10 @@ def setup_data_paths(PATH) -> None:

PATH_DATA = PATH_MODEL / "data"
PATH_RAW = PATH_DATA / "raw"
PATH_PROCCEDS = PATH_DATA / "processed"
PATH_PROCESSED = PATH_DATA / "processed"
PATH_GENERATED = PATH_DATA / "generated"

return PATH_RAW, PATH_PROCCEDS, PATH_GENERATED
return PATH_MODEL, PATH_RAW, PATH_PROCESSED, PATH_GENERATED
Collaborator:
I have kept this as purely a data-path helper. PATH_MODEL can be defined using setup_model_paths(), but given the way it is used below, you might be better off just using setup_artifacts_paths().
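For reference, a minimal sketch of the suggested pattern — setup_model_paths() is mentioned in review but not shown in this diff, so its import and signature are assumed to match the other set_path helpers:

```python
from pathlib import Path

# setup_model_paths is referenced in review but not part of this diff;
# its signature is assumed to match the other set_path helpers.
from set_path import setup_model_paths, setup_artifacts_paths

PATH = Path(__file__)

PATH_MODEL = setup_model_paths(PATH)          # e.g. <repo>/models/<model_name>
PATH_ARTIFACTS = setup_artifacts_paths(PATH)  # <model_root>/artifacts
```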

Collaborator:
So, I'll push the latest changes to main tomorrow. Just running a sweep to check that everything is as it should be after incorporating your last comments. After that you should merge from main into this branch.



def setup_artifacts_paths(PATH) -> None:
@@ -92,10 +92,35 @@ def setup_artifacts_paths(PATH) -> None:
PATH (Path): The base path, typically the path of the script invoking this function (i.e., `Path(__file__)`).
config (str): The model configuration file.

"""

PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models")+2]]) # The +2 is to include the "models" and the individual model name in the path

"""
PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models")+2]])
PATH_ARTIFACTS = PATH_MODEL / "artifacts"
# print(f"Artifacts path: {PATH_ARTIFACTS}")
return PATH_ARTIFACTS

def setup_generated_data_path(PATH, partition_name) -> Path:
"""
Set up the path to the generated data for a specific partition.

Args:
partition_name (str): The name of the data partition.
PATH (Path): The base path, typically the path of the script invoking this function (i.e., `Path(__file__)`).

Returns:
Path: The path to the generated data file for the specified partition.

Usage Example:
```python
PATH = Path(__file__)
sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS
from set_path import setup_project_paths, setup_generated_data_path
setup_project_paths(PATH)
setup_generated_data_path(PATH, "calibration")
```

"""
PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models")+2]]) # The +2 is to include the "models" and the individual model name in the path

PATH_DATA_GENERATED = Path(PATH_MODEL / "data" / "generated" / f"{partition_name}_predictions.parquet")
print(f"Generated data path: {PATH_DATA_GENERATED}")
return PATH_DATA_GENERATED
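To make the new helper's behaviour concrete, here is what it resolves for a hypothetical checkout (the absolute prefix is illustrative only):

```python
from pathlib import Path

# Hypothetical script location inside the repository:
PATH = Path("/home/user/views_pipeline/models/electric_relaxation/src/forecasting/generate_forecast.py")

# setup_generated_data_path(PATH, "calibration") trims PATH.parts just past "models"
# (keeping the model directory) and appends the partition-specific file name, giving:
# /home/user/views_pipeline/models/electric_relaxation/data/generated/calibration_predictions.parquet
```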
2 changes: 1 addition & 1 deletion models/electric_relaxation/artifacts/evaluation_metrics.py
@@ -1 +1 @@
evaluation_metrics = {'Mean Mean Squared Error': 0.002929262727152805, 'Mean Average Precision': 0.07515270506108203, 'Mean Brier Score': 0.002929262727152805}
evaluation_metrics = {'Mean Mean Squared Error': 0.0029154554168954083, 'Mean Average Precision': 0.07515270506108203, 'Mean Brier Score': 0.0029154554168954083}
4 changes: 3 additions & 1 deletion models/electric_relaxation/configs/config_model.py
@@ -1,3 +1,5 @@
from sklearn.ensemble import RandomForestClassifier

def get_model_config():
"""
Contains the common configuration settings for the model (model architecture, name, target variable, level of analysis and deployment status).
@@ -7,7 +9,7 @@ def get_model_config():
"""
model_config = {
"name": "electric_relaxation",
"algorithm": "RandomForestClassifier",
"algorithm": RandomForestClassifier,
"depvar": "ged_sb_dep", #or target?
"queryset": "escwa001_cflong",
"level": "cm",
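Storing the estimator class itself, rather than the string "RandomForestClassifier", lets callers instantiate it directly from the config with their hyperparameters, as train_model.py now does further down. A minimal sketch of the pattern (hyperparameter values here are placeholders):

```python
from sklearn.ensemble import RandomForestClassifier

model_config = {"algorithm": RandomForestClassifier}
hp_config = {"n_estimators": 100, "n_jobs": -1}  # placeholder values

# The class object is callable, so no string-to-class lookup is needed.
base_model = model_config["algorithm"](
    n_estimators=hp_config["n_estimators"],
    n_jobs=hp_config["n_jobs"],
)
```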
101 changes: 101 additions & 0 deletions models/electric_relaxation/notebooks/paths.ipynb
@@ -0,0 +1,101 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"from pathlib import Path\n",
"import pandas as pd\n",
"import pickle\n",
"\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"from stepshift.views import StepshiftedModels\n",
"from views_runs import DataPartitioner, ViewsRun\n",
"\n",
"PATH = Path.cwd() \n",
"sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index(\"views_pipeline\")+1]]) / \"common_utils\")) # PATH_COMMON_UTILS\n",
"from set_path import setup_project_paths, setup_artifacts_paths, setup_data_paths\n",
"setup_project_paths(PATH) #adds all necessary paths to sys.path\n",
"\n",
"from config_data_partitions import get_data_partitions #change to common_utils/set_partition.py\n",
"from config_hyperparameters import get_hp_config\n",
"from config_model import get_model_config"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def train(model_config, hp_config, data_partitions):\n",
" print(\"Training...\")\n",
"\n",
" # Define the artifacts path manually or according to your notebook structure\n",
" artifacts_path = Path(\"your_path_to_artifacts_directory\")\n",
"\n",
" calib_pickle_path = artifacts_path / \"model_calibration_partition.pkl\"\n",
" future_pickle_path = artifacts_path / \"model_future_partition.pkl\"\n",
"\n",
" if calib_pickle_path.exists() and future_pickle_path.exists():\n",
" print(\"Pickle files already exist. Loading models from pickle files...\")\n",
" with open(calib_pickle_path, 'rb') as file:\n",
" model_calibration_partition = pickle.load(file)\n",
" with open(future_pickle_path, 'rb') as file:\n",
" model_future_partition = pickle.load(file)\n",
"\n",
" else:\n",
" # Assuming you have loaded the dataset before calling this function\n",
" dataset = \"models/electric_relaxation/data/raw/raw.parquet\" # Load your dataset here\n",
"\n",
" calib_partition = DataPartitioner({'calib': data_partitions[\"calib_partitioner_dict\"]})\n",
" future_partition = DataPartitioner({'future': data_partitions[\"future_partitioner_dict\"]})\n",
"\n",
" base_model = RandomForestClassifier(n_estimators=hp_config[\"n_estimators\"], n_jobs=hp_config[\"n_jobs\"])\n",
" stepshifter_def = StepshiftedModels(base_model, model_config[\"steps\"], model_config[\"depvar\"])\n",
"\n",
" model_calibration_partition = ViewsRun(calib_partition, stepshifter_def)\n",
" model_calibration_partition.fit('calib', 'train', dataset)\n",
"\n",
" model_future_partition = ViewsRun(future_partition, stepshifter_def)\n",
" model_future_partition.fit('future', 'train', dataset)\n",
"\n",
" assert model_calibration_partition is not None and model_future_partition is not None, \"Model training failed.\"\n",
"\n",
" with open(calib_pickle_path, 'wb') as file:\n",
" pickle.dump(model_calibration_partition, file)\n",
" with open(future_pickle_path, 'wb') as file:\n",
" pickle.dump(model_future_partition, file)\n",
"\n",
" print(\"Models trained and saved in artifacts folder!\")\n",
"\n",
" return model_calibration_partition, model_future_partition\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "viewser",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
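Note that the notebook hard-codes a placeholder artifacts directory inside train(). The scripts below route the same lookup through the shared helper; a notebook could do the same, assuming it is run from somewhere inside models/<model_name> (notebooks have no __file__, hence Path.cwd()):

```python
import sys
from pathlib import Path

# Notebooks have no __file__, so drive the helpers from the working directory,
# which must sit somewhere inside models/<model_name> for the lookup to resolve.
PATH = Path.cwd()
sys.path.insert(0, str(Path(*PATH.parts[:PATH.parts.index("views_pipeline") + 1]) / "common_utils"))
from set_path import setup_artifacts_paths

artifacts_path = setup_artifacts_paths(PATH)  # replaces the hard-coded placeholder
```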
Empty file.
Empty file.
Empty file.
31 changes: 17 additions & 14 deletions models/electric_relaxation/src/forecasting/generate_forecast.py
@@ -4,15 +4,16 @@

from views_runs import DataPartitioner

model_path = Path(__file__).resolve().parents[2]
sys.path.append(str(model_path))
print(sys.path)
PATH = Path(__file__)
sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS
from set_path import setup_project_paths, setup_data_paths, setup_artifacts_paths, setup_generated_data_path
setup_project_paths(PATH)

from configs.config_data_partitions import get_data_partitions
from configs.config_hyperparameters import get_hp_config
from configs.config_model import get_model_config
from src.training.train_model import train
from src.utils.set_paths import get_data_path, get_generated_data_path
from config_data_partitions import get_data_partitions
from config_hyperparameters import get_hp_config
from config_model import get_model_config
from train_model import train
#from src.utils.set_paths import get_data_path, get_generated_data_path

def forecast(data_partitions, model_calibration_partition, model_future_partition):
"""
@@ -31,18 +32,20 @@ def forecast(data_partitions, model_calibration_partition, model_future_partition):

print("Generating forecasts...")

data = pd.read_parquet(get_data_path("raw"))
PATH_RAW, _, PATH_GENERATED = setup_data_paths(PATH)
PATH_ARTIFACTS = setup_artifacts_paths(PATH)
data = pd.read_parquet(PATH_RAW / 'raw.parquet')
future_partitioner_dict = data_partitions["future_partitioner_dict"]

calib_predictions = model_calibration_partition.predict('calib','predict',data, proba=True)

future_partition = DataPartitioner({'future':future_partitioner_dict}) #is this being used? we don't define an equivalent for calib_predictions
future_predictions = model_future_partition.future_predict('future','predict',data)
future_point_predictions = model_future_partition.future_point_predict(time=529, data=data, proba=True)

calib_predictions.to_parquet(get_generated_data_path("calibration"))
future_predictions.to_parquet(get_generated_data_path("future"))
future_point_predictions.to_parquet(get_generated_data_path("future_point"))
future_point_predictions = model_future_partition.future_point_predict(time=future_partitioner_dict['future_start'], data=data, proba=True)
calib_predictions.to_parquet(setup_generated_data_path(PATH, "calibration"))
future_predictions.to_parquet(setup_generated_data_path(PATH, "future"))
future_point_predictions.to_parquet(setup_generated_data_path(PATH, "future_point"))

print("Forecasts generated and saved in data/generated!")

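One note on the future_point_predict() change above: the start time now comes from the partition configuration instead of the hard-coded 529, so the forecast start stays in sync with the partitioner. A minimal sketch, assuming the partition dict carries a 'future_start' key as in the diff:

```python
# Hypothetical partition configuration carrying the key used in the diff above.
future_partitioner_dict = {"future_start": 529}

# Derived rather than hard-coded: changing the partition moves the forecast start.
forecast_start = future_partitioner_dict["future_start"]
```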
@@ -6,9 +6,12 @@

from sklearn.metrics import mean_squared_error, average_precision_score, roc_auc_score, brier_score_loss

model_path = Path(__file__).resolve().parents[2]
sys.path.append(str(model_path))
from configs.config_model import get_model_config
PATH = Path(__file__)
sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS
from set_path import setup_project_paths, setup_artifacts_paths, setup_data_paths
setup_project_paths(PATH) #adds all necessary paths to sys.path

from config_model import get_model_config


def evaluate_model(model_config):
@@ -34,7 +37,10 @@ def evaluate_model(model_config):
"""
print("Evaluating...")

df_calib = pd.read_parquet(model_path/"data"/"generated"/"calibration_predictions.parquet")
PATH_MODEL, PATH_RAW, PATH_PROCESSED, PATH_GENERATED = setup_data_paths(PATH)
Collaborator:

With the new implementation you should not get PATH_MODEL from here. You can get it from setup_model_paths(), but given the use below I think you should just use setup_artifacts_paths().


#df_calib = pd.read_parquet(model_path/"data"/"generated"/"calibration_predictions.parquet")
df_calib = pd.read_parquet(PATH_GENERATED / "calibration_predictions.parquet")

steps = model_config["steps"]
depvar = [model_config["depvar"]] #formerly stepcols, changed to depvar to also use in true_values
@@ -61,7 +67,7 @@
[row[col] for col in pred_cols]), axis=1)
mean_brier_score = df_calib["brier_score"].mean()

metrics_dict_path = model_path / "artifacts" / "evaluation_metrics.py"
metrics_dict_path = PATH_MODEL / "artifacts" / "evaluation_metrics.py"
Collaborator:

Then you can also simplify this a bit.
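A sketch of the suggested simplification, using the artifacts helper shown earlier in this diff:

```python
# Resolve the artifacts directory directly instead of deriving PATH_MODEL
# from setup_data_paths() and appending "artifacts" by hand.
PATH_ARTIFACTS = setup_artifacts_paths(PATH)
metrics_dict_path = PATH_ARTIFACTS / "evaluation_metrics.py"
```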


evaluation_metrics_calib = {
"Mean Mean Squared Error": mean_mse,

This file was deleted.

Empty file.
37 changes: 20 additions & 17 deletions models/electric_relaxation/src/training/train_model.py
@@ -3,20 +3,20 @@
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestClassifier
#from sklearn.ensemble import RandomForestClassifier

from stepshift.views import StepshiftedModels
from views_runs import DataPartitioner, ViewsRun

model_path = Path(__file__).resolve().parents[2]
sys.path.append(str(model_path))
print(sys.path)
PATH = Path(__file__)
sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS
from set_path import setup_project_paths, setup_artifacts_paths, setup_data_paths
setup_project_paths(PATH) #adds all necessary paths to sys.path

from configs.config_data_partitions import get_data_partitions
from configs.config_hyperparameters import get_hp_config
from configs.config_model import get_model_config
#from configs.config_sweep import get_sweep_config
from src.utils.set_paths import get_data_path, get_artifacts_path
from config_data_partitions import get_data_partitions #change to common_utils/set_partition.py
from config_hyperparameters import get_hp_config
from config_model import get_model_config
#from config_sweep import get_sweep_config

def train(model_config, hp_config, data_partitions):
"""
@@ -33,16 +33,17 @@

Returns:
- tuple: Trained models for calibration and future partitions.

Note:
- The 'artifacts' directory must exist in the system path for saving and loading pickle files.
- Ensure that the raw dataset is successfully loaded before proceeding with model training.
"""

print("Training...")

#calib_pickle_path = get_artifacts_path("calibration") #not sure why code doesn't run well with these
#future_pickle_path = get_artifacts_path("forecast")
calib_pickle_path = model_path / "artifacts" / "model_calibration_partition.pkl"
future_pickle_path = model_path / "artifacts" / "model_future_partition.pkl"
print(calib_pickle_path)
print(future_pickle_path)
artifacts_path = setup_artifacts_paths(PATH)
calib_pickle_path = artifacts_path / "model_calibration_partition.pkl"
future_pickle_path = artifacts_path / "model_future_partition.pkl"

if calib_pickle_path.exists() and future_pickle_path.exists():
print("Pickle files already exist. Loading models from pickle files...")
@@ -52,12 +53,14 @@
model_future_partition = pickle.load(file)

else:
dataset = pd.read_parquet(get_data_path("raw"))
PATH_RAW, _, _ = setup_data_paths(PATH)
dataset = pd.read_parquet(PATH_RAW / 'raw.parquet')
assert not dataset.empty, "Data loading failed."

calib_partition = DataPartitioner({'calib': data_partitions["calib_partitioner_dict"]})
future_partition = DataPartitioner({'future': data_partitions["future_partitioner_dict"]})
base_model = [model_config["algorithm"]](n_estimators=hp_config["n_estimators"], n_jobs=hp_config["n_jobs"])
#base_model = [model_config["algorithm"]](n_estimators=hp_config["n_estimators"], n_jobs=hp_config["n_jobs"])
base_model = model_config["algorithm"](n_estimators=hp_config["n_estimators"], n_jobs=hp_config["n_jobs"])
stepshifter_def = StepshiftedModels(base_model, model_config["steps"], model_config["depvar"])

model_calibration_partition = ViewsRun(calib_partition, stepshifter_def)
Empty file.