diff --git a/.github/workflows/check_if_new_model_added.yml b/.github/workflows/check_if_new_model_added.yml index d6b9dce1..41957cef 100644 --- a/.github/workflows/check_if_new_model_added.yml +++ b/.github/workflows/check_if_new_model_added.yml @@ -16,7 +16,7 @@ jobs: with: fetch-depth: 0 - - name: Check for new directories + - name: Check for new directories and generate catalog if models directory has changed run: | git status @@ -41,8 +41,7 @@ jobs: git checkout create_pgm_catalog_01 git status - - name: Generate catalog if models directory has changed - run: | + # generate catalog if models directory has changed if [ -n "$new_dirs" ] || [ -n "$removed_dirs" ]; then python documentation/catalogs/generate_model_catalog.py echo "Model catalog is updated." diff --git a/models/lavender_haze/.DS_Store b/models/lavender_haze/.DS_Store new file mode 100644 index 00000000..7224dcaa Binary files /dev/null and b/models/lavender_haze/.DS_Store differ diff --git a/models/lavender_haze/README.md b/models/lavender_haze/README.md new file mode 100644 index 00000000..7306bc97 --- /dev/null +++ b/models/lavender_haze/README.md @@ -0,0 +1,85 @@ +# Lavender Haze Model +## Overview +This folder contains code for Lavender Haze model, a machine learning model designed for predicting fatalities. + +The model utilizes Hurdle Model (LGBMClassifier+LGBMRegressor) for its predictions and is on pgm level of analysis. + +The model uses log fatalities. + +## Repository Structure +``` + +lavender_haze/ # should follow the naming convention adjective_noun +|-- README.md +|-- requirements.txt +| +|-- artifacts/ # ensemble stepshifter models +| |-- model_metadata_dict.py # the standard meta data dict for models +| +|-- configs/ # ... +| |-- config_deployment.py # configuration for deploying the model into different environments +| |-- config_hyperparameters.py # hyperparameters for the model +| |-- config_input_data.py # defined queryset as the input data +| |-- config_meta # metadata for the model (model architecture, name, target variable, and level of analysis) +| |-- config_sweep # sweeping parameters for weights & biases +| +|-- data/ # all input, processed, output data +| |-- generated/ # Data generated - i.e. forecast/ evaluation +| |-- processed/ # Data processed +| |-- raw/ # Data directly from VIEiWSER +| +|-- notebooks/ +| +|-- reports/ # dissemination material - internal and external +| |-- figures/ # figures for papers, reports, newsletters, and slides +| |-- papers/ # working papers, white papers, articles ect. +| |-- plots/ # plots for papers, reports, newsletters, and slides +| |-- slides/ # slides, presentation and similar +| |-- timelapse/ # plots to create timelapse and the timelapse +| +|-- src/ # all source code needed to train, test, and forecast + | + |-- dataloaders/ + | |-- get_data.py # script to get data from VIEWSER (and input drift detection) + | + |-- forecasting/ + | |-- generate_forecast.py # script to genereate true-future fc + | + |-- management/ + | |-- execute_model_runs.py # execute a single run + | |-- execute_model_tasks.py # execute various model-related tasks + | + |-- offline_evaluation/ # aka offline quality assurance + | |-- evaluate_model.py # script to evaluate a single model + | |-- evaluate_sweep.py # script to evaluate a model during sweeping + | + |-- online_evaluation/ + | + |-- training/ + | |-- train_model.py # script to train a single model + | + |-- utils/ # functions and classes + | |-- utils.py # a general utils function + | |-- utils_wandb.py # a w&b specific utils function + | + |-- visualization/ # scripts to create visualizations + |-- visual.py + + +``` + +## Setup Instructions +Clone the repository. + +Install dependencies. + +## Usage +Modify configurations in configs/. + +Run main.py. + +``` +python main.py -r calibration -t -e +``` + +Monitor progress and results using [Weights & Biases](https://wandb.ai/views_pipeline/lavender_haze). \ No newline at end of file diff --git a/models/lavender_haze/artifacts/model_metadata_dict.py b/models/lavender_haze/artifacts/model_metadata_dict.py new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/configs/config_deployment.py b/models/lavender_haze/configs/config_deployment.py new file mode 100644 index 00000000..e1d56586 --- /dev/null +++ b/models/lavender_haze/configs/config_deployment.py @@ -0,0 +1,16 @@ +def get_deployment_config(): + + """ + Contains the configuration for deploying the model into different environments. + This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. + + Returns: + - deployment_config (dict): A dictionary containing deployment settings, determining how the model is deployed, including status, endpoints, and resource allocation. + """ + + # More deployment settings can/will be added here + deployment_config = { + "deployment_status": "shadow", # shadow, deployed, baseline, or deprecated + } + + return deployment_config \ No newline at end of file diff --git a/models/lavender_haze/configs/config_hyperparameters.py b/models/lavender_haze/configs/config_hyperparameters.py new file mode 100644 index 00000000..11675b1f --- /dev/null +++ b/models/lavender_haze/configs/config_hyperparameters.py @@ -0,0 +1,17 @@ +def get_hp_config(): + hp_config = { + "steps": [*range(1, 36 + 1, 1)], + "parameters": { + "clf":{ + "learning_rate": 0.05, + "n_estimators": 100, + "n_jobs": 12 + }, + "reg":{ + "learning_rate": 0.05, + "n_estimators": 100, + "n_jobs": 12 + } + } + } + return hp_config \ No newline at end of file diff --git a/models/lavender_haze/configs/config_input_data.py b/models/lavender_haze/configs/config_input_data.py new file mode 100644 index 00000000..28f790c6 --- /dev/null +++ b/models/lavender_haze/configs/config_input_data.py @@ -0,0 +1,167 @@ +import numpy as np +from viewser import Queryset, Column + +def get_input_data_config(): + + thetacrit_spatial = 0.7 + return_values = 'distances' + n_nearest = 1 + power = 0.0 + + qs_broad = (Queryset("fatalities003_pgm_broad", "priogrid_month") + + # target variable + .with_column(Column("ln_ged_sb_dep", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") + .transform.missing.replace_na() + .transform.ops.ln() + ) + + # timelags 0 of conflict variables, ged_best versions + + .with_column(Column("ged_sb", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column("ged_os", from_loa="priogrid_month", from_column="ged_os_best_sum_nokgi") + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column("ged_ns", from_loa="priogrid_month", from_column="ged_ns_best_sum_nokgi") + .transform.missing.fill() + .transform.missing.replace_na() + ) + + # Spatial lag + .with_column(Column("splag_1_1_sb_1", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") + .transform.missing.replace_na() + .transform.bool.gte(1) + .transform.temporal.time_since() + .transform.temporal.decay(24) + .transform.spatial.lag(1, 1, 0, 0) + .transform.missing.replace_na() + ) + + # Decay functions + # sb + .with_column(Column("decay_ged_sb_5", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(12) + .transform.missing.replace_na() + ) + # os + .with_column(Column("decay_ged_os_5", from_loa="priogrid_month", from_column="ged_os_best_sum_nokgi") + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(12) + .transform.missing.replace_na() + ) + + # ns + .with_column(Column("decay_ged_ns_5", from_loa="priogrid_month", from_column="ged_ns_best_sum_nokgi") + .transform.missing.replace_na() + .transform.bool.gte(5) + .transform.temporal.time_since() + .transform.temporal.decay(12) + .transform.missing.replace_na() + ) + + # Trees + + .with_column(Column("treelag_1_sb", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") + .transform.missing.replace_na() + .transform.spatial.treelag(thetacrit_spatial, 1) + ) + + .with_column(Column("treelag_2_sb", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") + .transform.missing.replace_na() + .transform.spatial.treelag(thetacrit_spatial, 2) + ) + # sptime + + # continuous, sptime_dist, nu=1 + .with_column(Column("sptime_dist_k1_1_ged_sb", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") + .transform.missing.replace_na() + .transform.spatial.sptime_dist(return_values, n_nearest, 1.0, power) + ) + + .with_column(Column("sptime_dist_k1_2_ged_sb", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") + .transform.missing.replace_na() + .transform.spatial.sptime_dist(return_values, n_nearest, 10.0, power) + ) + + .with_column(Column("sptime_dist_k1_3_ged_sb", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") + .transform.missing.replace_na() + .transform.spatial.sptime_dist(return_values, n_nearest, 0.01, power) + ) + + # From natsoc + .with_column(Column("ln_ttime_mean", from_loa="priogrid_year", from_column="ttime_mean") + .transform.ops.ln() + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column("ln_bdist3", from_loa="priogrid_year", from_column="bdist3") + .transform.ops.ln() + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column("ln_capdist", from_loa="priogrid_year", from_column="capdist") + .transform.ops.ln() + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column("dist_diamsec", from_loa="priogrid", from_column="dist_diamsec_s_wgs") + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_column(Column("imr_mean", from_loa="priogrid_year", from_column="imr_mean") + .transform.missing.fill() + .transform.missing.replace_na() + ) + + # From drought + .with_column(Column("tlag1_dr_mod_gs", from_loa="priogrid_month", + from_column="tlag1_dr_mod_gs") + .transform.missing.replace_na(0) + ) + + .with_column(Column("spei1_gs_prev10_anom", from_loa="priogrid_month", + from_column="spei1_gs_prev10_anom") + .transform.missing.replace_na(0) + ) + + .with_column(Column("tlag_12_crop_sum", from_loa="priogrid_month", + from_column="tlag_12_crop_sum") + .transform.missing.replace_na(0) + ) + + .with_column(Column("spei1gsy_lowermedian_count", from_loa="priogrid_month", + from_column="spei1gsy_lowermedian_count") + .transform.missing.replace_na(0) + ) + + # Log population as control + .with_column(Column("ln_pop_gpw_sum", from_loa="priogrid_year", from_column="pop_gpw_sum") + .transform.ops.ln() + .transform.missing.fill() + .transform.missing.replace_na() + ) + + .with_theme("fatalities") + .describe("""fatalities broad model, pgm level + + Predicting ln(ged_best_sb), broad model + + """) + ) + + return qs_broad \ No newline at end of file diff --git a/models/lavender_haze/configs/config_meta.py b/models/lavender_haze/configs/config_meta.py new file mode 100644 index 00000000..e7ca1fc5 --- /dev/null +++ b/models/lavender_haze/configs/config_meta.py @@ -0,0 +1,19 @@ +def get_meta_config(): + """ + Contains the meta data for the model (model architecture, name, target variable, and level of analysis). + This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. + + Returns: + - meta_config (dict): A dictionary containing model meta configuration. + """ + meta_config = { + "name": "lavender_haze", + "algorithm": "HurdleRegression", + "model_clf": "LGBMClassifier", + "model_reg": "LGBMRegressor", + "depvar": "ln_ged_sb_dep", # IMPORTANT! The current stepshift only takes one target variable! Not compatiable with Simon's code! + "queryset": "fatalities003_pgm_broad", + "level": "pgm", + "creator": "Xiaolong" + } + return meta_config \ No newline at end of file diff --git a/models/lavender_haze/configs/config_sweep.py b/models/lavender_haze/configs/config_sweep.py new file mode 100644 index 00000000..36ebf61c --- /dev/null +++ b/models/lavender_haze/configs/config_sweep.py @@ -0,0 +1,26 @@ +def get_sweep_config(): + sweep_config = { + "name": "lavender_haze", + "method": "grid" + } + + metric = { + "name": "MSE", + "goal": "minimize" + } + + sweep_config["metric"] = metric + + parameters_dict = { + "steps": {"values": [[*range(1, 36 + 1, 1)]]}, + "cls_n_estimators": {"values": [100, 200]}, + "cls_learning_rate": {"values": [0.05]}, + "cls_n_jobs": {"values": [12]}, + "reg_n_estimators": {"values": [100, 200]}, + "reg_learning_rate": {"values": [0.05]}, + "reg_n_jobs": {"values": [12]} + } + + sweep_config["parameters"] = parameters_dict + + return sweep_config \ No newline at end of file diff --git a/models/lavender_haze/data/generated/.gitkeep b/models/lavender_haze/data/generated/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/data/processed/.gitkeep b/models/lavender_haze/data/processed/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/data/raw/.gitkeep b/models/lavender_haze/data/raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/main.py b/models/lavender_haze/main.py new file mode 100644 index 00000000..c199227c --- /dev/null +++ b/models/lavender_haze/main.py @@ -0,0 +1,30 @@ +import wandb +import sys +import warnings + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths, setup_root_paths +setup_project_paths(PATH) + +from utils_cli_parser import parse_args, validate_arguments +from utils_logger import setup_logging +from execute_model_runs import execute_sweep_run, execute_single_run + +warnings.filterwarnings("ignore") + +logger = setup_logging('run.log') + + +if __name__ == "__main__": + wandb.login() + + args = parse_args() + validate_arguments(args) + + if args.sweep: + execute_sweep_run(args) + else: + execute_single_run(args) diff --git a/models/lavender_haze/notebooks/notebook001.ipynb b/models/lavender_haze/notebooks/notebook001.ipynb new file mode 100644 index 00000000..4d25bd82 --- /dev/null +++ b/models/lavender_haze/notebooks/notebook001.ipynb @@ -0,0 +1,696 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Optional, Union\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from sklearn.linear_model import LinearRegression, LogisticRegression\n", + "from sklearn.base import BaseEstimator\n", + "from sklearn.utils.estimator_checks import check_estimator\n", + "from sklearn.utils.validation import check_X_y, check_array, check_is_fitted\n", + "from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.ensemble import HistGradientBoostingRegressor\n", + "from sklearn.ensemble import HistGradientBoostingClassifier\n", + "from xgboost import XGBRegressor\n", + "from xgboost import XGBClassifier\n", + "from xgboost import XGBRFRegressor, XGBRFClassifier\n", + "from lightgbm import LGBMClassifier, LGBMRegressor\n", + "\n", + "#from lightgbm import LGBMClassifier, LGBMRegressor\n", + "\n", + "\n", + "class HurdleRegression(BaseEstimator):\n", + " \"\"\" Regression model which handles excessive zeros by fitting a two-part model and combining predictions:\n", + " 1) binary classifier\n", + " 2) continuous regression\n", + " Implementeted as a valid sklearn estimator, so it can be used in pipelines and GridSearch objects.\n", + " Args:\n", + " clf_name: currently supports either 'logistic' or 'LGBMClassifier'\n", + " reg_name: currently supports either 'linear' or 'LGBMRegressor'\n", + " clf_params: dict of parameters to pass to classifier sub-model when initialized\n", + " reg_params: dict of parameters to pass to regression sub-model when initialized\n", + " \"\"\"\n", + "\n", + " def __init__(self,\n", + " clf_name: str = 'logistic',\n", + " reg_name: str = 'linear',\n", + " clf_params: Optional[dict] = None,\n", + " reg_params: Optional[dict] = None):\n", + "\n", + " self.clf_name = clf_name\n", + " self.reg_name = reg_name\n", + " self.clf_params = clf_params\n", + " self.reg_params = reg_params\n", + " self.clf_fi = []\n", + " self.reg_fi = []\n", + "\n", + " @staticmethod\n", + " def _resolve_estimator(func_name: str):\n", + " \"\"\" Lookup table for supported estimators.\n", + " This is necessary because sklearn estimator default arguments\n", + " must pass equality test, and instantiated sub-estimators are not equal. \"\"\"\n", + "\n", + " funcs = {'linear': LinearRegression(),\n", + " 'logistic': LogisticRegression(solver='liblinear'),\n", + " 'LGBMRegressor': LGBMRegressor(n_estimators=250),\n", + " 'LGBMClassifier': LGBMClassifier(n_estimators=250),\n", + " 'RFRegressor': XGBRFRegressor(n_estimators=250,n_jobs=-2),\n", + " 'RFClassifier': XGBRFClassifier(n_estimators=250,n_jobs=-2),\n", + " 'GBMRegressor': GradientBoostingRegressor(n_estimators=200),\n", + " 'GBMClassifier': GradientBoostingClassifier(n_estimators=200),\n", + " 'XGBRegressor': XGBRegressor(n_estimators=100,learning_rate=0.05,n_jobs=-2),\n", + " 'XGBClassifier': XGBClassifier(n_estimators=100,learning_rate=0.05,n_jobs=-2),\n", + " 'HGBRegressor': HistGradientBoostingRegressor(max_iter=200),\n", + " 'HGBClassifier': HistGradientBoostingClassifier(max_iter=200),\n", + " }\n", + "\n", + " return funcs[func_name]\n", + "\n", + " def fit(self,\n", + " X: Union[np.ndarray, pd.DataFrame],\n", + " y: Union[np.ndarray, pd.Series]):\n", + " X, y = check_X_y(X, y, dtype=None,\n", + " accept_sparse=False,\n", + " accept_large_sparse=False,\n", + " force_all_finite='allow-nan')\n", + "\n", + " if X.shape[1] < 2:\n", + " raise ValueError('Cannot fit model when n_features = 1')\n", + "\n", + " self.clf_ = self._resolve_estimator(self.clf_name)\n", + " if self.clf_params:\n", + " self.clf_.set_params(**self.clf_params)\n", + " self.clf_.fit(X, y > 0)\n", + " self.clf_fi = self.clf_.feature_importances_\n", + "\n", + " self.reg_ = self._resolve_estimator(self.reg_name)\n", + " if self.reg_params:\n", + " self.reg_.set_params(**self.reg_params)\n", + " self.reg_.fit(X[y > 0], y[y > 0])\n", + " self.reg_fi = self.reg_.feature_importances_\n", + "\n", + " self.is_fitted_ = True\n", + " return self\n", + "\n", + "\n", + " def predict_bck(self, X: Union[np.ndarray, pd.DataFrame]):\n", + " \"\"\" Predict combined response using binary classification outcome \"\"\"\n", + " X = check_array(X, accept_sparse=False, accept_large_sparse=False)\n", + " check_is_fitted(self, 'is_fitted_')\n", + " return self.clf_.predict(X) * self.reg_.predict(X)\n", + "\n", + " def predict(self, X: Union[np.ndarray, pd.DataFrame]):\n", + " \"\"\" Predict combined response using probabilistic classification outcome \"\"\"\n", + " X = check_array(X, accept_sparse=False, accept_large_sparse=False)\n", + " check_is_fitted(self, 'is_fitted_')\n", + " return self.clf_.predict_proba(X)[:, 1] * self.reg_.predict(X)\n", + "\n", + " \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "hp_config = {\n", + " \"clf\":{\n", + " \"learning_rate\": 0.05,\n", + " \"n_estimators\": 100,\n", + " \"n_jobs\": 12\n", + " },\n", + " \"reg\":{\n", + " \"learning_rate\": 0.05,\n", + " \"n_estimators\": 100,\n", + " \"n_jobs\": 12\n", + " }\n", + "}\n", + "common_config = {\n", + " \"name\": \"lavender_haze\",\n", + " \"algorithm\": \"HurdleRegression\",\n", + " \"clf_name\":\"LGBMClassifier\",\n", + " \"reg_name\":\"LGBMRegressor\",\n", + " \"depvar\": \"ged_sb_dep\",\n", + " \"queryset\": \"fatalities003_pgm_broad\",\n", + " \"data_train\": \"baseline\",\n", + " \"level\": \"pgm\",\n", + " 'steps': [*range(1, 36 + 1, 1)],\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "cls_model = HurdleRegression(clf_name=common_config['clf_name'], reg_name=common_config['reg_name'], clf_params=hp_config['clf'], reg_params=hp_config['reg'])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
HurdleRegression(clf_name='LGBMClassifier',\n",
+       "                 clf_params={'learning_rate': 0.05, 'n_estimators': 100,\n",
+       "                             'n_jobs': 12},\n",
+       "                 reg_name='LGBMRegressor',\n",
+       "                 reg_params={'learning_rate': 0.05, 'n_estimators': 100,\n",
+       "                             'n_jobs': 12})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "HurdleRegression(clf_name='LGBMClassifier',\n", + " clf_params={'learning_rate': 0.05, 'n_estimators': 100,\n", + " 'n_jobs': 12},\n", + " reg_name='LGBMRegressor',\n", + " reg_params={'learning_rate': 0.05, 'n_estimators': 100,\n", + " 'n_jobs': 12})" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cls_model" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tlag1_dr_mod_gsspei1_gs_prev10_anomtlag_12_crop_sumspei1gsy_lowermedian_countln_ged_sb_depged_sbged_osged_nstreelag_1_sbtreelag_2_sb...dist_diamsecimr_meanln_ttime_meanln_bdist3ln_capdistln_pop_gpw_sumdecay_ged_sb_5decay_ged_os_5decay_ged_ns_5splag_1_1_sb_1
month_idpriogrid_gid
1623560.00.00.00.00.00.00.00.00.00.0...19.2353840.07.9894642.2639007.8174370.0000000.000000e+000.000000e+000.000000e+000.000000e+00
795990.00.00.00.00.00.00.00.00.00.0...3.640055100.05.2510892.9619987.1879348.2664450.000000e+000.000000e+000.000000e+000.000000e+00
796000.00.00.00.00.00.00.00.00.00.0...3.807887100.05.6565250.3649527.1643957.8052370.000000e+000.000000e+000.000000e+000.000000e+00
796010.00.00.00.00.00.00.00.00.00.0...4.031129100.05.4656522.3793257.1411389.3351590.000000e+000.000000e+000.000000e+000.000000e+00
803170.00.00.00.00.00.00.00.00.00.0...3.000000100.03.4099152.5209817.20801512.6544270.000000e+000.000000e+000.000000e+000.000000e+00
.....................................................................
8521904960.00.00.00.00.00.00.00.00.00.0...10.295630150.05.6872430.4939025.91006010.4086264.487001e-224.487001e-224.487001e-228.473017e-11
1905070.00.00.00.00.00.00.00.00.00.0...6.103278419.05.3359343.3175415.5644566.6472834.487001e-224.487001e-224.487001e-228.473017e-11
1905080.00.00.00.00.00.00.00.00.00.0...5.830952419.00.0000003.4339055.5964574.5621024.487001e-224.487001e-224.487001e-228.473017e-11
1905100.00.00.00.00.00.00.00.00.00.0...5.385165419.05.9048223.2404685.7160547.6195764.487001e-224.487001e-224.487001e-228.473017e-11
1905110.00.00.00.00.00.00.00.00.00.0...5.220153419.05.4791703.2879235.7919367.5960844.487001e-224.487001e-224.487001e-228.473017e-11
\n", + "

11169720 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " tlag1_dr_mod_gs spei1_gs_prev10_anom \\\n", + "month_id priogrid_gid \n", + "1 62356 0.0 0.0 \n", + " 79599 0.0 0.0 \n", + " 79600 0.0 0.0 \n", + " 79601 0.0 0.0 \n", + " 80317 0.0 0.0 \n", + "... ... ... \n", + "852 190496 0.0 0.0 \n", + " 190507 0.0 0.0 \n", + " 190508 0.0 0.0 \n", + " 190510 0.0 0.0 \n", + " 190511 0.0 0.0 \n", + "\n", + " tlag_12_crop_sum spei1gsy_lowermedian_count \\\n", + "month_id priogrid_gid \n", + "1 62356 0.0 0.0 \n", + " 79599 0.0 0.0 \n", + " 79600 0.0 0.0 \n", + " 79601 0.0 0.0 \n", + " 80317 0.0 0.0 \n", + "... ... ... \n", + "852 190496 0.0 0.0 \n", + " 190507 0.0 0.0 \n", + " 190508 0.0 0.0 \n", + " 190510 0.0 0.0 \n", + " 190511 0.0 0.0 \n", + "\n", + " ln_ged_sb_dep ged_sb ged_os ged_ns treelag_1_sb \\\n", + "month_id priogrid_gid \n", + "1 62356 0.0 0.0 0.0 0.0 0.0 \n", + " 79599 0.0 0.0 0.0 0.0 0.0 \n", + " 79600 0.0 0.0 0.0 0.0 0.0 \n", + " 79601 0.0 0.0 0.0 0.0 0.0 \n", + " 80317 0.0 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... ... \n", + "852 190496 0.0 0.0 0.0 0.0 0.0 \n", + " 190507 0.0 0.0 0.0 0.0 0.0 \n", + " 190508 0.0 0.0 0.0 0.0 0.0 \n", + " 190510 0.0 0.0 0.0 0.0 0.0 \n", + " 190511 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + " treelag_2_sb ... dist_diamsec imr_mean \\\n", + "month_id priogrid_gid ... \n", + "1 62356 0.0 ... 19.235384 0.0 \n", + " 79599 0.0 ... 3.640055 100.0 \n", + " 79600 0.0 ... 3.807887 100.0 \n", + " 79601 0.0 ... 4.031129 100.0 \n", + " 80317 0.0 ... 3.000000 100.0 \n", + "... ... ... ... ... \n", + "852 190496 0.0 ... 10.295630 150.0 \n", + " 190507 0.0 ... 6.103278 419.0 \n", + " 190508 0.0 ... 5.830952 419.0 \n", + " 190510 0.0 ... 5.385165 419.0 \n", + " 190511 0.0 ... 5.220153 419.0 \n", + "\n", + " ln_ttime_mean ln_bdist3 ln_capdist ln_pop_gpw_sum \\\n", + "month_id priogrid_gid \n", + "1 62356 7.989464 2.263900 7.817437 0.000000 \n", + " 79599 5.251089 2.961998 7.187934 8.266445 \n", + " 79600 5.656525 0.364952 7.164395 7.805237 \n", + " 79601 5.465652 2.379325 7.141138 9.335159 \n", + " 80317 3.409915 2.520981 7.208015 12.654427 \n", + "... ... ... ... ... \n", + "852 190496 5.687243 0.493902 5.910060 10.408626 \n", + " 190507 5.335934 3.317541 5.564456 6.647283 \n", + " 190508 0.000000 3.433905 5.596457 4.562102 \n", + " 190510 5.904822 3.240468 5.716054 7.619576 \n", + " 190511 5.479170 3.287923 5.791936 7.596084 \n", + "\n", + " decay_ged_sb_5 decay_ged_os_5 decay_ged_ns_5 \\\n", + "month_id priogrid_gid \n", + "1 62356 0.000000e+00 0.000000e+00 0.000000e+00 \n", + " 79599 0.000000e+00 0.000000e+00 0.000000e+00 \n", + " 79600 0.000000e+00 0.000000e+00 0.000000e+00 \n", + " 79601 0.000000e+00 0.000000e+00 0.000000e+00 \n", + " 80317 0.000000e+00 0.000000e+00 0.000000e+00 \n", + "... ... ... ... \n", + "852 190496 4.487001e-22 4.487001e-22 4.487001e-22 \n", + " 190507 4.487001e-22 4.487001e-22 4.487001e-22 \n", + " 190508 4.487001e-22 4.487001e-22 4.487001e-22 \n", + " 190510 4.487001e-22 4.487001e-22 4.487001e-22 \n", + " 190511 4.487001e-22 4.487001e-22 4.487001e-22 \n", + "\n", + " splag_1_1_sb_1 \n", + "month_id priogrid_gid \n", + "1 62356 0.000000e+00 \n", + " 79599 0.000000e+00 \n", + " 79600 0.000000e+00 \n", + " 79601 0.000000e+00 \n", + " 80317 0.000000e+00 \n", + "... ... \n", + "852 190496 8.473017e-11 \n", + " 190507 8.473017e-11 \n", + " 190508 8.473017e-11 \n", + " 190510 8.473017e-11 \n", + " 190511 8.473017e-11 \n", + "\n", + "[11169720 rows x 23 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_parquet(\"../data/raw/raw.parquet\")\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "viewser", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/models/lavender_haze/reports/.DS_Store b/models/lavender_haze/reports/.DS_Store new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/reports/figures/.gitkeep b/models/lavender_haze/reports/figures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/reports/papers/.gitkeep b/models/lavender_haze/reports/papers/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/reports/plots/.gitkeep b/models/lavender_haze/reports/plots/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/reports/slides/.gitkeep b/models/lavender_haze/reports/slides/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/reports/timelapse/.gitkeep b/models/lavender_haze/reports/timelapse/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/requirements.txt b/models/lavender_haze/requirements.txt new file mode 100644 index 00000000..1fa9034a --- /dev/null +++ b/models/lavender_haze/requirements.txt @@ -0,0 +1 @@ +# Requirements diff --git a/models/lavender_haze/src/.DS_Store b/models/lavender_haze/src/.DS_Store new file mode 100644 index 00000000..a952ae4a Binary files /dev/null and b/models/lavender_haze/src/.DS_Store differ diff --git a/models/lavender_haze/src/architectures/.gitkeep b/models/lavender_haze/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/src/dataloaders/get_data.py b/models/lavender_haze/src/dataloaders/get_data.py new file mode 100644 index 00000000..cc332719 --- /dev/null +++ b/models/lavender_haze/src/dataloaders/get_data.py @@ -0,0 +1,27 @@ +import sys +import logging + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths, setup_data_paths +setup_project_paths(PATH) + +from utils_dataloaders import fetch_or_load_views_df, create_or_load_views_vol, get_alert_help_string + +logger = logging.getLogger(__name__) + + +def get_data(args): + logger.info("Getting data...") + PATH_RAW, _, _ = setup_data_paths(PATH) + + data, alerts = fetch_or_load_views_df(args.run_type, PATH_RAW, args.saved) + logger.info(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + for ialert, alert in enumerate(str(alerts).strip('[').strip(']').split('Input')): + if 'offender' in alert: + logger.warning({f"{args.run_type} data alert {ialert}": str(alert)}) + + return data diff --git a/models/lavender_haze/src/forecasting/generate_forecast.py b/models/lavender_haze/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..13fef6f0 --- /dev/null +++ b/models/lavender_haze/src/forecasting/generate_forecast.py @@ -0,0 +1,54 @@ +import sys +import pandas as pd +from datetime import datetime +import logging + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths, setup_data_paths, setup_artifacts_paths +setup_project_paths(PATH) + +from set_partition import get_partitioner_dict +from utils_log_files import create_log_file +from utils_run import get_standardized_df +from utils_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + PATH_RAW, _, PATH_GENERATED = setup_data_paths(PATH) + PATH_ARTIFACTS = setup_artifacts_paths(PATH) + run_type = config['run_type'] + + # if an artifact name is provided through the CLI, use it. + # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith('.pkl'): + artifact_name += '.pkl' + PATH_ARTIFACT = PATH_ARTIFACTS / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + PATH_ARTIFACT = get_latest_model_artifact(PATH_ARTIFACTS, run_type) + + config["timestamp"] = PATH_ARTIFACT.stem[-15:] + df_viewser = pd.read_pickle(PATH_RAW / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(PATH_ARTIFACT) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {PATH_ARTIFACT}") + + partition = get_partitioner_dict(run_type)['predict'] + df_predictions = stepshift_model.future_point_predict(partition[0] - 1, df_viewser, keep_specific=True) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + save_predictions(df_predictions, PATH_GENERATED, config) + create_log_file(PATH_GENERATED, config, config["timestamp"], data_generation_timestamp) diff --git a/models/lavender_haze/src/management/execute_model_runs.py b/models/lavender_haze/src/management/execute_model_runs.py new file mode 100644 index 00000000..48fa198f --- /dev/null +++ b/models/lavender_haze/src/management/execute_model_runs.py @@ -0,0 +1,48 @@ +import sys +import wandb + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + get_data(args) + + sweep_config = get_sweep_config() + meta_config = get_meta_config() + update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + sweep_id = wandb.sweep(sweep_config, project=project, entity='views_pipeline') + wandb.agent(sweep_id, execute_model_tasks, entity='views_pipeline') + + +def execute_single_run(args): + get_data(args) + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + if args.run_type == 'calibration' or args.run_type == 'testing': + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == 'forecasting': + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) diff --git a/models/lavender_haze/src/management/execute_model_tasks.py b/models/lavender_haze/src/management/execute_model_tasks.py new file mode 100644 index 00000000..51c04d0b --- /dev/null +++ b/models/lavender_haze/src/management/execute_model_tasks.py @@ -0,0 +1,83 @@ +import sys +import wandb +import logging +import time + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run import get_model, split_hurdle_parameters +from utils_wandb import add_wandb_monthly_metrics + +logger = logging.getLogger(__name__) + + +def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None): + """ + Executes various model-related tasks including training, evaluation, and forecasting. + + This function manages the execution of different tasks such as training the model, + evaluating an existing model, or performing forecasting. + It also initializes the WandB project. + + Args: + config: Configuration object containing parameters and settings. + project: The WandB project name. + train: Flag to indicate if the model should be trained. + eval: Flag to indicate if the model should be evaluated. + forecast: Flag to indicate if forecasting should be performed. + artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting. + """ + + start_t = time.time() + + # Initialize WandB + with wandb.init(project=project, entity="views_pipeline", + config=config): # project and config ignored when running a sweep + + # add the monthly metrics to WandB + add_wandb_monthly_metrics() + + # Update config from WandB initialization above + config = wandb.config + + # W&B does not directly support nested dictionaries for hyperparameters + # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs + if config['sweep'] and config['algorithm'] == "HurdleRegression": + config['parameters'] = {} + config['parameters']['clf'], config['parameters']['reg'] = split_hurdle_parameters(config) + + model = get_model(config) + # logger.info(model) + + if config['sweep']: + logger.info(f"Sweeping model {config['name']}...") + stepshift_model = train_model_artifact(config, model) + logger.info(f"Evaluating model {config['name']}...") + evaluate_sweep(config, stepshift_model) + + # Handle the single model runs: train and save the model as an artifact + if train: + logger.info(f"Training model {config['name']}...") + train_model_artifact(config, model) + + # Handle the single model runs: evaluate a trained model (artifact) + if eval: + logger.info(f"Evaluating model {config['name']}...") + evaluate_model_artifact(config, artifact_name) + + if forecast: + logger.info(f"Forecasting model {config['name']}...") + forecast_model_artifact(config, artifact_name) + + end_t = time.time() + minutes = (end_t - start_t) / 60 + logger.info(f'Done. Runtime: {minutes:.3f} minutes.\n') diff --git a/models/lavender_haze/src/offline_evaluation/evaluate_model.py b/models/lavender_haze/src/offline_evaluation/evaluate_model.py new file mode 100644 index 00000000..7e0a17a2 --- /dev/null +++ b/models/lavender_haze/src/offline_evaluation/evaluate_model.py @@ -0,0 +1,60 @@ +import sys +from datetime import datetime +import pandas as pd +import logging + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths, setup_data_paths, setup_artifacts_paths +setup_project_paths(PATH) + +from utils_log_files import create_log_file +from utils_outputs import save_model_outputs +from utils_run import get_standardized_df +from utils_artifacts import get_latest_model_artifact +from utils_evaluation_metrics import generate_metric_dict +from utils_model_outputs import generate_output_dict +from utils_wandb import log_wandb_log_dict +from views_forecasts.extensions import * + +logger = logging.getLogger(__name__) + + +def evaluate_model_artifact(config, artifact_name): + PATH_RAW, _, PATH_GENERATED = setup_data_paths(PATH) + PATH_ARTIFACTS = setup_artifacts_paths(PATH) + run_type = config['run_type'] + + # if an artifact name is provided through the CLI, use it. + # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith('.pkl'): + artifact_name += '.pkl' + PATH_ARTIFACT = PATH_ARTIFACTS / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + PATH_ARTIFACT = get_latest_model_artifact(PATH_ARTIFACTS, run_type) + + config["timestamp"] = PATH_ARTIFACT.stem[-15:] + df_viewser = pd.read_pickle(PATH_RAW / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(PATH_ARTIFACT) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {PATH_ARTIFACT}") + + df = stepshift_model.predict(run_type, "predict", df_viewser) + df = get_standardized_df(df, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + output, df_output = generate_output_dict(df, config) + evaluation, df_evaluation = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) + + save_model_outputs(df_evaluation, df_output, PATH_GENERATED, config) + create_log_file(PATH_GENERATED, config, config["timestamp"], data_generation_timestamp) diff --git a/models/lavender_haze/src/offline_evaluation/evaluate_sweep.py b/models/lavender_haze/src/offline_evaluation/evaluate_sweep.py new file mode 100644 index 00000000..1f2647f3 --- /dev/null +++ b/models/lavender_haze/src/offline_evaluation/evaluate_sweep.py @@ -0,0 +1,36 @@ +import sys +import pandas as pd +import wandb +from sklearn.metrics import mean_squared_error + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths, setup_data_paths +setup_project_paths(PATH) + +from utils_run import get_standardized_df +from utils_wandb import log_wandb_log_dict +from utils_evaluation_metrics import generate_metric_dict + + +def evaluate_sweep(config, stepshift_model): + PATH_RAW, _, _ = setup_data_paths(PATH) + run_type = config['run_type'] + steps = config['steps'] + + df_viewser = pd.read_pickle(PATH_RAW / f"{run_type}_viewser_df.pkl") + + df = stepshift_model.predict(run_type, "predict", df_viewser) + df = get_standardized_df(df, config) + + # Temporarily keep this because the metric to minimize is MSE + pred_cols = [f"step_pred_{str(i)}" for i in steps] + df["mse"] = df.apply(lambda row: mean_squared_error([row[config['depvar']]] * 36, + [row[col] for col in pred_cols]), axis=1) + + wandb.log({'MSE': df['mse'].mean()}) + + evaluation, df_evaluation = generate_metric_dict(df, config) + log_wandb_log_dict(config, evaluation) diff --git a/models/lavender_haze/src/online_evaluation/evaluate_forecast.py b/models/lavender_haze/src/online_evaluation/evaluate_forecast.py new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/src/training/train_model.py b/models/lavender_haze/src/training/train_model.py new file mode 100644 index 00000000..faaac741 --- /dev/null +++ b/models/lavender_haze/src/training/train_model.py @@ -0,0 +1,40 @@ +from datetime import datetime +import pandas as pd +from pathlib import Path +PATH = Path(__file__) +from set_path import setup_project_paths, setup_data_paths, setup_artifacts_paths +setup_project_paths(PATH) + +from utils_log_files import create_log_file +from set_partition import get_partitioner_dict +from stepshift.views import StepshiftedModels +from views_forecasts.extensions import * +from views_partitioning.data_partitioner import DataPartitioner +from views_stepshift.run import ViewsRun + + +def train_model_artifact(config, model): + # print(config) + PATH_RAW, _, PATH_GENERATED = setup_data_paths(PATH) + PATH_ARTIFACTS = setup_artifacts_paths(PATH) + run_type = config['run_type'] + df_viewser = pd.read_pickle(PATH_RAW / f"{run_type}_viewser_df.pkl") + + stepshift_model = stepshift_training(config, run_type, model, df_viewser) + if not config["sweep"]: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + model_filename = f"{run_type}_model_{timestamp}.pkl" + stepshift_model.save(PATH_ARTIFACTS / model_filename) + create_log_file(PATH_GENERATED, config, timestamp) + return stepshift_model + + +def stepshift_training(config, partition_name, model, dataset): + steps = config["steps"] + target = config["depvar"] + partitioner_dict = get_partitioner_dict(partition_name) + partition = DataPartitioner({partition_name: partitioner_dict}) + stepshift_def = StepshiftedModels(model, steps, target) + stepshift_model = ViewsRun(partition, stepshift_def) + stepshift_model.fit(partition_name, "train", dataset) + return stepshift_model diff --git a/models/lavender_haze/src/utils/utils_log_files.py b/models/lavender_haze/src/utils/utils_log_files.py new file mode 100644 index 00000000..b5d58a9d --- /dev/null +++ b/models/lavender_haze/src/utils/utils_log_files.py @@ -0,0 +1,33 @@ +from pathlib import Path +import logging + +logger = logging.getLogger(__name__) + + +def create_log_file(PATH_GENERATED, + config, + model_timestamp, + data_generation_timestamp=None, + data_fetch_timestamp=None): + """ + Creates a log file in the specified model-specific folder with details about the generated data. + + Args: + - PATH_GENERATED (Path): The path to the folder where the log file will be created. + - config (dict): The configuration dictionary containing the model details. + - model_timestamp (str): The timestamp when the model was trained. + - data_generation_timestamp (str): The timestamp when the data was generated. + - data_fetch_timestamp (str, optional): The timestamp when the raw data used was fetched from VIEWS. + """ + + Path(PATH_GENERATED).mkdir(parents=True, exist_ok=True) + log_file_path = f"{PATH_GENERATED}/{config['run_type']}_log.txt" + + with open(log_file_path, 'w') as log_file: + log_file.write(f"Model Name: {config['name']}\n") + log_file.write(f"Model Timestamp: {model_timestamp}\n") + log_file.write(f"Data Generation Timestamp: {data_generation_timestamp}\n") + log_file.write(f"Data Fetch Timestamp: {data_fetch_timestamp}\n") + log_file.write(f"Deployment Status: {config['deployment_status']}\n") + + logger.info(f"Model log file created at: {log_file_path}") diff --git a/models/lavender_haze/src/utils/utils_outputs.py b/models/lavender_haze/src/utils/utils_outputs.py new file mode 100644 index 00000000..b88cf9a2 --- /dev/null +++ b/models/lavender_haze/src/utils/utils_outputs.py @@ -0,0 +1,30 @@ +from pathlib import Path +import pickle +import logging + +logger = logging.getLogger(__name__) + + +def save_model_outputs(df_evaluation, df_output, PATH_GENERATED, config): + Path(PATH_GENERATED).mkdir(parents=True, exist_ok=True) + + # Save the DataFrame of model outputs + outputs_path = f"{PATH_GENERATED}/output_{config['steps'][-1]}_{config['run_type']}_{config['timestamp']}.pkl" + with open(outputs_path, 'wb') as file: + pickle.dump(df_output, file) + logger.info(f"Model outputs saved at: {outputs_path}") + + # Save the DataFrame of evaluation metrics + evaluation_path = f"{PATH_GENERATED}/evaluation_{config['steps'][-1]}_{config['run_type']}_{config['timestamp']}.pkl" + with open(evaluation_path, 'wb') as file: + pickle.dump(df_evaluation, file) + logger.info(f"Evaluation metrics saved at: {evaluation_path}") + + +def save_predictions(df_predictions, PATH_GENERATED, config): + Path(PATH_GENERATED).mkdir(parents=True, exist_ok=True) + + predictions_path = f"{PATH_GENERATED}/predictions_{config['steps'][-1]}_{config['run_type']}_{config['timestamp']}.pkl" + with open(predictions_path, 'wb') as file: + pickle.dump(df_predictions, file) + logger.info(f"Predictions saved at: {predictions_path}") diff --git a/models/lavender_haze/src/utils/utils_run.py b/models/lavender_haze/src/utils/utils_run.py new file mode 100644 index 00000000..f8955a3a --- /dev/null +++ b/models/lavender_haze/src/utils/utils_run.py @@ -0,0 +1,110 @@ +import sys +import numpy as np +from lightgbm import LGBMRegressor +from xgboost import XGBRegressor +from sklearn.ensemble import RandomForestClassifier + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from hurdle_model import HurdleRegression +from views_forecasts.extensions import * + + +def get_model(config): + """ + Get the model based on the algorithm specified in the config + """ + + if config["algorithm"] == "HurdleRegression": + model = HurdleRegression(clf_name=config["model_clf"], reg_name=config["model_reg"], + clf_params=config["parameters"]["clf"], reg_params=config["parameters"]["reg"]) + else: + parameters = get_parameters(config) + model = globals()[config["algorithm"]](**parameters) + + return model + + +def get_parameters(config): + """ + Get the parameters from the config file. + If not sweep, then get directly from the config file, otherwise have to remove some parameters. + """ + + if config["sweep"]: + keys_to_remove = ["algorithm", "depvar", "steps", "sweep", "run_type", "model_cls", "model_reg"] + parameters = {k: v for k, v in config.items() if k not in keys_to_remove} + else: + parameters = config["parameters"] + + return parameters + + +def get_standardized_df(df, config): + """ + Standardize the DataFrame based on the run type + """ + + run_type = config['run_type'] + steps = config['steps'] + depvar = config['depvar'] + + # choose the columns to keep based on the run type and replace negative values with 0 + if run_type in ['calibration', 'testing']: + cols = [depvar] + df.forecasts.prediction_columns + elif run_type == "forecasting": + cols = [f'step_pred_{i}' for i in steps] + df = df.replace([np.inf, -np.inf], 0)[cols] + df = df.mask(df < 0, 0) + return df + + +def split_hurdle_parameters(parameters_dict): + """ + Split the parameters dictionary into two separate dictionaries, one for the + classification model and one for the regression model. + """ + + cls_dict = {} + reg_dict = {} + + for key, value in parameters_dict.items(): + if key.startswith('cls_'): + cls_key = key.replace('cls_', '') + cls_dict[cls_key] = value + elif key.startswith('reg_'): + reg_key = key.replace('reg_', '') + reg_dict[reg_key] = value + + return cls_dict, reg_dict + + +def update_config(hp_config, meta_config, dp_config, args): + config = hp_config.copy() + config['run_type'] = args.run_type + config['sweep'] = False + config['name'] = meta_config['name'] + config['depvar'] = meta_config['depvar'] + config['algorithm'] = meta_config['algorithm'] + if meta_config['algorithm'] == 'HurdleRegression': + config['model_clf'] = meta_config['model_clf'] + config['model_reg'] = meta_config['model_reg'] + config['deployment_status'] = dp_config['deployment_status'] + + return config + + +def update_sweep_config(sweep_config, args, meta_config): + sweep_config['parameters']['run_type'] = {'value': args.run_type} + sweep_config['parameters']['sweep'] = {'value': True} + sweep_config['parameters']['name'] = {'value': meta_config['name']} + sweep_config['parameters']['depvar'] = {'value': meta_config['depvar']} + sweep_config['parameters']['algorithm'] = {'value': meta_config['algorithm']} + if meta_config['algorithm'] == 'HurdleRegression': + sweep_config['parameters']['model_clf'] = {'value': meta_config['model_clf']} + sweep_config['parameters']['model_reg'] = {'value': meta_config['model_reg']} diff --git a/models/lavender_haze/src/visualization/visual.py b/models/lavender_haze/src/visualization/visual.py new file mode 100644 index 00000000..e69de29b