Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new Path solution to ESCWA model #23

Open
wants to merge 17 commits into
base: production
Choose a base branch
from
Open
4 changes: 2 additions & 2 deletions common_utils/set_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,10 +118,10 @@ def setup_data_paths(PATH) -> Path:

PATH_DATA = PATH_MODEL / "data"
PATH_RAW = PATH_DATA / "raw"
PATH_PROCCEDS = PATH_DATA / "processed"
PATH_PROCESSED = PATH_DATA / "processed"
PATH_GENERATED = PATH_DATA / "generated"

return PATH_RAW, PATH_PROCCEDS, PATH_GENERATED # added in accordance with Sara's escwa branch
return PATH_RAW, PATH_PROCESSED, PATH_GENERATED # added in accordance with Sara's escwa branch
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed a typo (PATH_PROCCEDS → PATH_PROCESSED), FYI @Polichinel.



def setup_artifacts_paths(PATH) -> Path:
Expand Down
2 changes: 1 addition & 1 deletion models/electric_relaxation/artifacts/evaluation_metrics.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
evaluation_metrics = {'Mean Mean Squared Error': 0.002929262727152805, 'Mean Average Precision': 0.07515270506108203, 'Mean Brier Score': 0.002929262727152805}
evaluation_metrics = {'Mean Mean Squared Error': 0.0029154554168954083, 'Mean Average Precision': 0.07515270506108203, 'Mean Brier Score': 0.0029154554168954083}
4 changes: 3 additions & 1 deletion models/electric_relaxation/configs/config_model.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from sklearn.ensemble import RandomForestClassifier

def get_model_config():
"""
Contains the common configuration settings for the model (model architecture, name, target variable, level of analysis and deployment status).
Expand All @@ -7,7 +9,7 @@ def get_model_config():
"""
model_config = {
"name": "electric_relaxation",
"algorithm": "RandomForestClassifier",
"algorithm": RandomForestClassifier,
"depvar": "ged_sb_dep", #or target?
sarakallis marked this conversation as resolved.
Show resolved Hide resolved
"queryset": "escwa001_cflong",
"level": "cm",
Expand Down
101 changes: 101 additions & 0 deletions models/electric_relaxation/notebooks/paths.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"from pathlib import Path\n",
"import pandas as pd\n",
"import pickle\n",
"\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"from stepshift.views import StepshiftedModels\n",
"from views_runs import DataPartitioner, ViewsRun\n",
"\n",
"PATH = Path.cwd() \n",
"sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index(\"views_pipeline\")+1]]) / \"common_utils\")) # PATH_COMMON_UTILS\n",
"from set_path import setup_project_paths, setup_artifacts_paths, setup_data_paths\n",
"setup_project_paths(PATH) #adds all necessary paths to sys.path\n",
"\n",
"from config_data_partitions import get_data_partitions #change to common_utils/set_partition.py\n",
"from config_hyperparameters import get_hp_config\n",
"from config_model import get_model_config"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def train(model_config, hp_config, data_partitions):\n",
" print(\"Training...\")\n",
"\n",
" # Define the artifacts path manually or according to your notebook structure\n",
" artifacts_path = Path(\"your_path_to_artifacts_directory\")\n",
"\n",
" calib_pickle_path = artifacts_path / \"model_calibration_partition.pkl\"\n",
" future_pickle_path = artifacts_path / \"model_future_partition.pkl\"\n",
"\n",
" if calib_pickle_path.exists() and future_pickle_path.exists():\n",
" print(\"Pickle files already exist. Loading models from pickle files...\")\n",
" with open(calib_pickle_path, 'rb') as file:\n",
" model_calibration_partition = pickle.load(file)\n",
" with open(future_pickle_path, 'rb') as file:\n",
" model_future_partition = pickle.load(file)\n",
"\n",
" else:\n",
" # Assuming you have loaded the dataset before calling this function\n",
" dataset = \"models/electric_relaxation/data/raw/raw.parquet\" # Load your dataset here\n",
"\n",
" calib_partition = DataPartitioner({'calib': data_partitions[\"calib_partitioner_dict\"]})\n",
" future_partition = DataPartitioner({'future': data_partitions[\"future_partitioner_dict\"]})\n",
"\n",
" base_model = RandomForestClassifier(n_estimators=hp_config[\"n_estimators\"], n_jobs=hp_config[\"n_jobs\"])\n",
" stepshifter_def = StepshiftedModels(base_model, model_config[\"steps\"], model_config[\"depvar\"])\n",
"\n",
" model_calibration_partition = ViewsRun(calib_partition, stepshifter_def)\n",
" model_calibration_partition.fit('calib', 'train', dataset)\n",
"\n",
" model_future_partition = ViewsRun(future_partition, stepshifter_def)\n",
" model_future_partition.fit('future', 'train', dataset)\n",
"\n",
" assert model_calibration_partition is not None and model_future_partition is not None, \"Model training failed.\"\n",
"\n",
" with open(calib_pickle_path, 'wb') as file:\n",
" pickle.dump(model_calibration_partition, file)\n",
" with open(future_pickle_path, 'wb') as file:\n",
" pickle.dump(model_future_partition, file)\n",
"\n",
" print(\"Models trained and saved in artifacts folder!\")\n",
"\n",
" return model_calibration_partition, model_future_partition\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "viewser",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Empty file.
Empty file.
Empty file.
31 changes: 17 additions & 14 deletions models/electric_relaxation/src/forecasting/generate_forecast.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@

from views_runs import DataPartitioner

model_path = Path(__file__).resolve().parents[2]
sys.path.append(str(model_path))
print(sys.path)
PATH = Path(__file__)
sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS
from set_path import setup_project_paths, setup_data_paths, setup_artifacts_paths, setup_generated_data_path
setup_project_paths(PATH)

from configs.config_data_partitions import get_data_partitions
from configs.config_hyperparameters import get_hp_config
from configs.config_model import get_model_config
from src.training.train_model import train
from src.utils.set_paths import get_data_path, get_generated_data_path
from config_data_partitions import get_data_partitions
from config_hyperparameters import get_hp_config
from config_model import get_model_config
from train_model import train
#from src.utils.set_paths import get_data_path, get_generated_data_path

def forecast(data_partitions, model_calibration_partition, model_future_partition):
"""
Expand All @@ -31,18 +32,20 @@ def forecast(data_partitions, model_calibration_partition, model_future_partitio

print("Generating forecasts...")

data = pd.read_parquet(get_data_path("raw"))
PATH_RAW, _, PATH_GENERATED = setup_data_paths(PATH)
PATH_ARTIFACTS = setup_artifacts_paths(PATH)
data = pd.read_parquet(PATH_RAW / 'raw.parquet')
future_partitioner_dict = data_partitions["future_partitioner_dict"]

calib_predictions = model_calibration_partition.predict('calib','predict',data, proba=True)

future_partition = DataPartitioner({'future':future_partitioner_dict}) #is this being used? we don't define an equivalent for calib_predictions
future_predictions = model_future_partition.future_predict('future','predict',data)
future_point_predictions = model_future_partition.future_point_predict(time=529, data=data, proba=True)

calib_predictions.to_parquet(get_generated_data_path("calibration"))
future_predictions.to_parquet(get_generated_data_path("future"))
future_point_predictions.to_parquet(get_generated_data_path("future_point"))
future_point_predictions = model_future_partition.future_point_predict(time=future_partitioner_dict['future_start'], data=data, proba=True)
calib_predictions.to_parquet(setup_generated_data_path(PATH, "calibration"))
future_predictions.to_parquet(setup_generated_data_path(PATH, "future"))
future_point_predictions.to_parquet(setup_generated_data_path(PATH, "future_point"))

print("Forecasts generated and saved in data/generated!")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@

from sklearn.metrics import mean_squared_error, average_precision_score, roc_auc_score, brier_score_loss

model_path = Path(__file__).resolve().parents[2]
sys.path.append(str(model_path))
from configs.config_model import get_model_config
PATH = Path(__file__)
sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS
from set_path import setup_project_paths, setup_artifacts_paths, setup_data_paths
setup_project_paths(PATH) #adds all necessary paths to sys.path

from config_model import get_model_config


def evaluate_model(model_config):
Expand All @@ -34,7 +37,10 @@ def evaluate_model(model_config):
"""
print("Evaluating...")

df_calib = pd.read_parquet(model_path/"data"/"generated"/"calibration_predictions.parquet")
PATH_MODEL, PATH_RAW, PATH_PROCESSED, PATH_GENERATED = setup_data_paths(PATH)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With the new implementation, you should not get PATH_MODEL from here. You can get it from setup_model_path, but given the use below I think you should just use setup_artifacts_paths.


#df_calib = pd.read_parquet(model_path/"data"/"generated"/"calibration_predictions.parquet")
df_calib = pd.read_parquet(PATH_GENERATED / "calibration_predictions.parquet")

steps = model_config["steps"]
depvar = [model_config["depvar"]] #formerly stepcols, changed to depvar to also use in true_values
Expand All @@ -61,7 +67,7 @@ def evaluate_model(model_config):
[row[col] for col in pred_cols]), axis=1)
mean_brier_score = df_calib["brier_score"].mean()

metrics_dict_path = model_path / "artifacts" / "evaluation_metrics.py"
metrics_dict_path = PATH_MODEL / "artifacts" / "evaluation_metrics.py"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

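Then you can also simplify this a bit: build the path directly from the artifacts path, i.e. setup_artifacts_paths(PATH) / "evaluation_metrics.py".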


evaluation_metrics_calib = {
"Mean Mean Squared Error": mean_mse,
Expand Down

This file was deleted.

Empty file.
37 changes: 20 additions & 17 deletions models/electric_relaxation/src/training/train_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,20 @@
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestClassifier
#from sklearn.ensemble import RandomForestClassifier

from stepshift.views import StepshiftedModels
from views_runs import DataPartitioner, ViewsRun

model_path = Path(__file__).resolve().parents[2]
sys.path.append(str(model_path))
print(sys.path)
PATH = Path(__file__)
sys.path.insert(0, str(Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) / "common_utils")) # PATH_COMMON_UTILS
from set_path import setup_project_paths, setup_artifacts_paths, setup_data_paths
setup_project_paths(PATH) #adds all necessary paths to sys.path

from configs.config_data_partitions import get_data_partitions
from configs.config_hyperparameters import get_hp_config
from configs.config_model import get_model_config
#from configs.config_sweep import get_sweep_config
from src.utils.set_paths import get_data_path, get_artifacts_path
from config_data_partitions import get_data_partitions #change to common_utils/set_partition.py
from config_hyperparameters import get_hp_config
from config_model import get_model_config
#from config_sweep import get_sweep_config

def train(model_config, hp_config, data_partitions):
"""
Expand All @@ -33,16 +33,17 @@ def train(model_config, hp_config, data_partitions):

Returns:
- tuple: Trained models for calibration and future partitions.

Note:
- The 'artifacts' directory must exist in the system path for saving and loading pickle files.
- Ensure that the raw dataset is successfully loaded before proceeding with model training.
"""

print("Training...")

#calib_pickle_path = get_artifacts_path("calibration") #not sure why code doesn't run well with these
#future_pickle_path = get_artifacts_path("forecast")
calib_pickle_path = model_path / "artifacts" / "model_calibration_partition.pkl"
future_pickle_path = model_path / "artifacts" / "model_future_partition.pkl"
print(calib_pickle_path)
print(future_pickle_path)
artifacts_path = setup_artifacts_paths(PATH)
calib_pickle_path = artifacts_path / "model_calibration_partition.pkl"
future_pickle_path = artifacts_path / "model_future_partition.pkl"

if calib_pickle_path.exists() and future_pickle_path.exists():
print("Pickle files already exist. Loading models from pickle files...")
Expand All @@ -52,12 +53,14 @@ def train(model_config, hp_config, data_partitions):
model_future_partition = pickle.load(file)

else:
dataset = pd.read_parquet(get_data_path("raw"))
PATH_RAW, _, _ = setup_data_paths(PATH)
dataset = pd.read_parquet(PATH_RAW / 'raw.parquet')
assert not dataset.empty, "Data loading failed."

calib_partition = DataPartitioner({'calib': data_partitions["calib_partitioner_dict"]})
future_partition = DataPartitioner({'future': data_partitions["future_partitioner_dict"]})
base_model = [model_config["algorithm"]](n_estimators=hp_config["n_estimators"], n_jobs=hp_config["n_jobs"])
#base_model = [model_config["algorithm"]](n_estimators=hp_config["n_estimators"], n_jobs=hp_config["n_jobs"])
base_model = model_config["algorithm"](n_estimators=hp_config["n_estimators"], n_jobs=hp_config["n_jobs"])
stepshifter_def = StepshiftedModels(base_model, model_config["steps"], model_config["depvar"])

model_calibration_partition = ViewsRun(calib_partition, stepshifter_def)
Expand Down
Empty file.
63 changes: 0 additions & 63 deletions models/electric_relaxation/src/utils/set_paths.py

This file was deleted.