From 2f283d1e237f668bd8b53b29c31603cdbe4da9c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Borb=C3=A1la=20Farkas?= Date: Mon, 28 Oct 2024 16:55:03 +0100 Subject: [PATCH] changing model directories for testing --- .../workflows/check_if_new_model_added.yml | 5 +- models/lavender_haze/.DS_Store | Bin 0 -> 6148 bytes models/lavender_haze/README.md | 85 +++ .../artifacts/model_metadata_dict.py | 0 .../configs/config_deployment.py | 16 + .../configs/config_hyperparameters.py | 17 + .../configs/config_input_data.py | 167 +++++ models/lavender_haze/configs/config_meta.py | 19 + models/lavender_haze/configs/config_sweep.py | 26 + models/lavender_haze/data/generated/.gitkeep | 0 models/lavender_haze/data/processed/.gitkeep | 0 models/lavender_haze/data/raw/.gitkeep | 0 models/lavender_haze/main.py | 30 + .../lavender_haze/notebooks/notebook001.ipynb | 696 ++++++++++++++++++ models/lavender_haze/reports/.DS_Store | 0 models/lavender_haze/reports/figures/.gitkeep | 0 models/lavender_haze/reports/papers/.gitkeep | 0 models/lavender_haze/reports/plots/.gitkeep | 0 models/lavender_haze/reports/slides/.gitkeep | 0 .../lavender_haze/reports/timelapse/.gitkeep | 0 models/lavender_haze/requirements.txt | 1 + models/lavender_haze/src/.DS_Store | Bin 0 -> 6148 bytes .../lavender_haze/src/architectures/.gitkeep | 0 .../lavender_haze/src/dataloaders/get_data.py | 27 + .../src/forecasting/generate_forecast.py | 54 ++ .../src/management/execute_model_runs.py | 48 ++ .../src/management/execute_model_tasks.py | 83 +++ .../src/offline_evaluation/evaluate_model.py | 60 ++ .../src/offline_evaluation/evaluate_sweep.py | 36 + .../online_evaluation/evaluate_forecast.py | 0 .../lavender_haze/src/training/train_model.py | 40 + .../src/utils/utils_log_files.py | 33 + .../lavender_haze/src/utils/utils_outputs.py | 30 + models/lavender_haze/src/utils/utils_run.py | 110 +++ .../lavender_haze/src/visualization/visual.py | 0 35 files changed, 1580 insertions(+), 3 deletions(-) create mode 100644 
models/lavender_haze/.DS_Store create mode 100644 models/lavender_haze/README.md create mode 100644 models/lavender_haze/artifacts/model_metadata_dict.py create mode 100644 models/lavender_haze/configs/config_deployment.py create mode 100644 models/lavender_haze/configs/config_hyperparameters.py create mode 100644 models/lavender_haze/configs/config_input_data.py create mode 100644 models/lavender_haze/configs/config_meta.py create mode 100644 models/lavender_haze/configs/config_sweep.py create mode 100644 models/lavender_haze/data/generated/.gitkeep create mode 100644 models/lavender_haze/data/processed/.gitkeep create mode 100644 models/lavender_haze/data/raw/.gitkeep create mode 100644 models/lavender_haze/main.py create mode 100644 models/lavender_haze/notebooks/notebook001.ipynb create mode 100644 models/lavender_haze/reports/.DS_Store create mode 100644 models/lavender_haze/reports/figures/.gitkeep create mode 100644 models/lavender_haze/reports/papers/.gitkeep create mode 100644 models/lavender_haze/reports/plots/.gitkeep create mode 100644 models/lavender_haze/reports/slides/.gitkeep create mode 100644 models/lavender_haze/reports/timelapse/.gitkeep create mode 100644 models/lavender_haze/requirements.txt create mode 100644 models/lavender_haze/src/.DS_Store create mode 100644 models/lavender_haze/src/architectures/.gitkeep create mode 100644 models/lavender_haze/src/dataloaders/get_data.py create mode 100644 models/lavender_haze/src/forecasting/generate_forecast.py create mode 100644 models/lavender_haze/src/management/execute_model_runs.py create mode 100644 models/lavender_haze/src/management/execute_model_tasks.py create mode 100644 models/lavender_haze/src/offline_evaluation/evaluate_model.py create mode 100644 models/lavender_haze/src/offline_evaluation/evaluate_sweep.py create mode 100644 models/lavender_haze/src/online_evaluation/evaluate_forecast.py create mode 100644 models/lavender_haze/src/training/train_model.py create mode 100644 
models/lavender_haze/src/utils/utils_log_files.py create mode 100644 models/lavender_haze/src/utils/utils_outputs.py create mode 100644 models/lavender_haze/src/utils/utils_run.py create mode 100644 models/lavender_haze/src/visualization/visual.py diff --git a/.github/workflows/check_if_new_model_added.yml b/.github/workflows/check_if_new_model_added.yml index d6b9dce1..41957cef 100644 --- a/.github/workflows/check_if_new_model_added.yml +++ b/.github/workflows/check_if_new_model_added.yml @@ -16,7 +16,7 @@ jobs: with: fetch-depth: 0 - - name: Check for new directories + - name: Check for new directories and generate catalog if models directory has changed run: | git status @@ -41,8 +41,7 @@ jobs: git checkout create_pgm_catalog_01 git status - - name: Generate catalog if models directory has changed - run: | + # generate catalog if models directory has changed if [ -n "$new_dirs" ] || [ -n "$removed_dirs" ]; then python documentation/catalogs/generate_model_catalog.py echo "Model catalog is updated." 
diff --git a/models/lavender_haze/.DS_Store b/models/lavender_haze/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..7224dcaa19e238197507ed8e3f4b3c72a381797a GIT binary patch literal 6148 zcmeHK%TB{U3>?!+wBphu$NdHVU{#eb=np{CA|Wb*8o1}iZ+HBtXb4<6P+Rg&*7j_Y zIm9slS$=HqfCYdVT@jzUj7{_EBfE;olqhz_6CTmv9o@DYRsS4N?wqUxS 0)\n", + " self.clf_fi = self.clf_.feature_importances_\n", + "\n", + " self.reg_ = self._resolve_estimator(self.reg_name)\n", + " if self.reg_params:\n", + " self.reg_.set_params(**self.reg_params)\n", + " self.reg_.fit(X[y > 0], y[y > 0])\n", + " self.reg_fi = self.reg_.feature_importances_\n", + "\n", + " self.is_fitted_ = True\n", + " return self\n", + "\n", + "\n", + " def predict_bck(self, X: Union[np.ndarray, pd.DataFrame]):\n", + " \"\"\" Predict combined response using binary classification outcome \"\"\"\n", + " X = check_array(X, accept_sparse=False, accept_large_sparse=False)\n", + " check_is_fitted(self, 'is_fitted_')\n", + " return self.clf_.predict(X) * self.reg_.predict(X)\n", + "\n", + " def predict(self, X: Union[np.ndarray, pd.DataFrame]):\n", + " \"\"\" Predict combined response using probabilistic classification outcome \"\"\"\n", + " X = check_array(X, accept_sparse=False, accept_large_sparse=False)\n", + " check_is_fitted(self, 'is_fitted_')\n", + " return self.clf_.predict_proba(X)[:, 1] * self.reg_.predict(X)\n", + "\n", + " \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "hp_config = {\n", + " \"clf\":{\n", + " \"learning_rate\": 0.05,\n", + " \"n_estimators\": 100,\n", + " \"n_jobs\": 12\n", + " },\n", + " \"reg\":{\n", + " \"learning_rate\": 0.05,\n", + " \"n_estimators\": 100,\n", + " \"n_jobs\": 12\n", + " }\n", + "}\n", + "common_config = {\n", + " \"name\": \"lavender_haze\",\n", + " \"algorithm\": \"HurdleRegression\",\n", + " \"clf_name\":\"LGBMClassifier\",\n", + " \"reg_name\":\"LGBMRegressor\",\n", + " \"depvar\": 
\"ged_sb_dep\",\n", + " \"queryset\": \"fatalities003_pgm_broad\",\n", + " \"data_train\": \"baseline\",\n", + " \"level\": \"pgm\",\n", + " 'steps': [*range(1, 36 + 1, 1)],\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "cls_model = HurdleRegression(clf_name=common_config['clf_name'], reg_name=common_config['reg_name'], clf_params=hp_config['clf'], reg_params=hp_config['reg'])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
HurdleRegression(clf_name='LGBMClassifier',\n",
+       "                 clf_params={'learning_rate': 0.05, 'n_estimators': 100,\n",
+       "                             'n_jobs': 12},\n",
+       "                 reg_name='LGBMRegressor',\n",
+       "                 reg_params={'learning_rate': 0.05, 'n_estimators': 100,\n",
+       "                             'n_jobs': 12})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "HurdleRegression(clf_name='LGBMClassifier',\n", + " clf_params={'learning_rate': 0.05, 'n_estimators': 100,\n", + " 'n_jobs': 12},\n", + " reg_name='LGBMRegressor',\n", + " reg_params={'learning_rate': 0.05, 'n_estimators': 100,\n", + " 'n_jobs': 12})" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cls_model" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tlag1_dr_mod_gsspei1_gs_prev10_anomtlag_12_crop_sumspei1gsy_lowermedian_countln_ged_sb_depged_sbged_osged_nstreelag_1_sbtreelag_2_sb...dist_diamsecimr_meanln_ttime_meanln_bdist3ln_capdistln_pop_gpw_sumdecay_ged_sb_5decay_ged_os_5decay_ged_ns_5splag_1_1_sb_1
month_idpriogrid_gid
1623560.00.00.00.00.00.00.00.00.00.0...19.2353840.07.9894642.2639007.8174370.0000000.000000e+000.000000e+000.000000e+000.000000e+00
795990.00.00.00.00.00.00.00.00.00.0...3.640055100.05.2510892.9619987.1879348.2664450.000000e+000.000000e+000.000000e+000.000000e+00
796000.00.00.00.00.00.00.00.00.00.0...3.807887100.05.6565250.3649527.1643957.8052370.000000e+000.000000e+000.000000e+000.000000e+00
796010.00.00.00.00.00.00.00.00.00.0...4.031129100.05.4656522.3793257.1411389.3351590.000000e+000.000000e+000.000000e+000.000000e+00
803170.00.00.00.00.00.00.00.00.00.0...3.000000100.03.4099152.5209817.20801512.6544270.000000e+000.000000e+000.000000e+000.000000e+00
.....................................................................
8521904960.00.00.00.00.00.00.00.00.00.0...10.295630150.05.6872430.4939025.91006010.4086264.487001e-224.487001e-224.487001e-228.473017e-11
1905070.00.00.00.00.00.00.00.00.00.0...6.103278419.05.3359343.3175415.5644566.6472834.487001e-224.487001e-224.487001e-228.473017e-11
1905080.00.00.00.00.00.00.00.00.00.0...5.830952419.00.0000003.4339055.5964574.5621024.487001e-224.487001e-224.487001e-228.473017e-11
1905100.00.00.00.00.00.00.00.00.00.0...5.385165419.05.9048223.2404685.7160547.6195764.487001e-224.487001e-224.487001e-228.473017e-11
1905110.00.00.00.00.00.00.00.00.00.0...5.220153419.05.4791703.2879235.7919367.5960844.487001e-224.487001e-224.487001e-228.473017e-11
\n", + "

11169720 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " tlag1_dr_mod_gs spei1_gs_prev10_anom \\\n", + "month_id priogrid_gid \n", + "1 62356 0.0 0.0 \n", + " 79599 0.0 0.0 \n", + " 79600 0.0 0.0 \n", + " 79601 0.0 0.0 \n", + " 80317 0.0 0.0 \n", + "... ... ... \n", + "852 190496 0.0 0.0 \n", + " 190507 0.0 0.0 \n", + " 190508 0.0 0.0 \n", + " 190510 0.0 0.0 \n", + " 190511 0.0 0.0 \n", + "\n", + " tlag_12_crop_sum spei1gsy_lowermedian_count \\\n", + "month_id priogrid_gid \n", + "1 62356 0.0 0.0 \n", + " 79599 0.0 0.0 \n", + " 79600 0.0 0.0 \n", + " 79601 0.0 0.0 \n", + " 80317 0.0 0.0 \n", + "... ... ... \n", + "852 190496 0.0 0.0 \n", + " 190507 0.0 0.0 \n", + " 190508 0.0 0.0 \n", + " 190510 0.0 0.0 \n", + " 190511 0.0 0.0 \n", + "\n", + " ln_ged_sb_dep ged_sb ged_os ged_ns treelag_1_sb \\\n", + "month_id priogrid_gid \n", + "1 62356 0.0 0.0 0.0 0.0 0.0 \n", + " 79599 0.0 0.0 0.0 0.0 0.0 \n", + " 79600 0.0 0.0 0.0 0.0 0.0 \n", + " 79601 0.0 0.0 0.0 0.0 0.0 \n", + " 80317 0.0 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... ... \n", + "852 190496 0.0 0.0 0.0 0.0 0.0 \n", + " 190507 0.0 0.0 0.0 0.0 0.0 \n", + " 190508 0.0 0.0 0.0 0.0 0.0 \n", + " 190510 0.0 0.0 0.0 0.0 0.0 \n", + " 190511 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + " treelag_2_sb ... dist_diamsec imr_mean \\\n", + "month_id priogrid_gid ... \n", + "1 62356 0.0 ... 19.235384 0.0 \n", + " 79599 0.0 ... 3.640055 100.0 \n", + " 79600 0.0 ... 3.807887 100.0 \n", + " 79601 0.0 ... 4.031129 100.0 \n", + " 80317 0.0 ... 3.000000 100.0 \n", + "... ... ... ... ... \n", + "852 190496 0.0 ... 10.295630 150.0 \n", + " 190507 0.0 ... 6.103278 419.0 \n", + " 190508 0.0 ... 5.830952 419.0 \n", + " 190510 0.0 ... 5.385165 419.0 \n", + " 190511 0.0 ... 
5.220153 419.0 \n", + "\n", + " ln_ttime_mean ln_bdist3 ln_capdist ln_pop_gpw_sum \\\n", + "month_id priogrid_gid \n", + "1 62356 7.989464 2.263900 7.817437 0.000000 \n", + " 79599 5.251089 2.961998 7.187934 8.266445 \n", + " 79600 5.656525 0.364952 7.164395 7.805237 \n", + " 79601 5.465652 2.379325 7.141138 9.335159 \n", + " 80317 3.409915 2.520981 7.208015 12.654427 \n", + "... ... ... ... ... \n", + "852 190496 5.687243 0.493902 5.910060 10.408626 \n", + " 190507 5.335934 3.317541 5.564456 6.647283 \n", + " 190508 0.000000 3.433905 5.596457 4.562102 \n", + " 190510 5.904822 3.240468 5.716054 7.619576 \n", + " 190511 5.479170 3.287923 5.791936 7.596084 \n", + "\n", + " decay_ged_sb_5 decay_ged_os_5 decay_ged_ns_5 \\\n", + "month_id priogrid_gid \n", + "1 62356 0.000000e+00 0.000000e+00 0.000000e+00 \n", + " 79599 0.000000e+00 0.000000e+00 0.000000e+00 \n", + " 79600 0.000000e+00 0.000000e+00 0.000000e+00 \n", + " 79601 0.000000e+00 0.000000e+00 0.000000e+00 \n", + " 80317 0.000000e+00 0.000000e+00 0.000000e+00 \n", + "... ... ... ... \n", + "852 190496 4.487001e-22 4.487001e-22 4.487001e-22 \n", + " 190507 4.487001e-22 4.487001e-22 4.487001e-22 \n", + " 190508 4.487001e-22 4.487001e-22 4.487001e-22 \n", + " 190510 4.487001e-22 4.487001e-22 4.487001e-22 \n", + " 190511 4.487001e-22 4.487001e-22 4.487001e-22 \n", + "\n", + " splag_1_1_sb_1 \n", + "month_id priogrid_gid \n", + "1 62356 0.000000e+00 \n", + " 79599 0.000000e+00 \n", + " 79600 0.000000e+00 \n", + " 79601 0.000000e+00 \n", + " 80317 0.000000e+00 \n", + "... ... 
\n", + "852 190496 8.473017e-11 \n", + " 190507 8.473017e-11 \n", + " 190508 8.473017e-11 \n", + " 190510 8.473017e-11 \n", + " 190511 8.473017e-11 \n", + "\n", + "[11169720 rows x 23 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_parquet(\"../data/raw/raw.parquet\")\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "viewser", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/models/lavender_haze/reports/.DS_Store b/models/lavender_haze/reports/.DS_Store new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/reports/figures/.gitkeep b/models/lavender_haze/reports/figures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/reports/papers/.gitkeep b/models/lavender_haze/reports/papers/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/reports/plots/.gitkeep b/models/lavender_haze/reports/plots/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/reports/slides/.gitkeep b/models/lavender_haze/reports/slides/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/reports/timelapse/.gitkeep b/models/lavender_haze/reports/timelapse/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/requirements.txt b/models/lavender_haze/requirements.txt new file mode 100644 index 00000000..1fa9034a --- /dev/null +++ b/models/lavender_haze/requirements.txt @@ 
-0,0 +1 @@ +# Requirements diff --git a/models/lavender_haze/src/.DS_Store b/models/lavender_haze/src/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..a952ae4a1860eb8fa7a66c54539fb80750a23fb0 GIT binary patch literal 6148 zcmeHKOKQVF4ArD5xP(C3RhPa(Z(u@tf*znBJ82Cj20QMeD<8e@jYP2P6iOE*6e&p0 z)<`q*8)IceB&@sQPGlt_36wayf;k}^r%q&M9u9^$M=1x{N)FpiqTO+h3~=w3@+=K@ z?M;UJ*I*~xqRp$9x~^FILJ9;Ppq13H1p=C{Av+8)GeS*(6cGYLeZ@=;0!neM+W$PNTI~cuvd(q4w!fZ0M6l#f@_{7n3Ec2hP@(2AVE`s zni}kg5i}kC)Z#M3UQyE}*x@7C%E3-3A+C=3(-Kqp@Id^J NK%~KqGw`Pjd;$Y~P$d8W literal 0 HcmV?d00001 diff --git a/models/lavender_haze/src/architectures/.gitkeep b/models/lavender_haze/src/architectures/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/models/lavender_haze/src/dataloaders/get_data.py b/models/lavender_haze/src/dataloaders/get_data.py new file mode 100644 index 00000000..cc332719 --- /dev/null +++ b/models/lavender_haze/src/dataloaders/get_data.py @@ -0,0 +1,27 @@ +import sys +import logging + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths, setup_data_paths +setup_project_paths(PATH) + +from utils_dataloaders import fetch_or_load_views_df, create_or_load_views_vol, get_alert_help_string + +logger = logging.getLogger(__name__) + + +def get_data(args): + logger.info("Getting data...") + PATH_RAW, _, _ = setup_data_paths(PATH) + + data, alerts = fetch_or_load_views_df(args.run_type, PATH_RAW, args.saved) + logger.info(f"DataFrame shape: {data.shape if data is not None else 'None'}") + + for ialert, alert in enumerate(str(alerts).strip('[').strip(']').split('Input')): + if 'offender' in alert: + logger.warning({f"{args.run_type} data alert {ialert}": str(alert)}) + + return data diff --git a/models/lavender_haze/src/forecasting/generate_forecast.py 
b/models/lavender_haze/src/forecasting/generate_forecast.py new file mode 100644 index 00000000..13fef6f0 --- /dev/null +++ b/models/lavender_haze/src/forecasting/generate_forecast.py @@ -0,0 +1,54 @@ +import sys +import pandas as pd +from datetime import datetime +import logging + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths, setup_data_paths, setup_artifacts_paths +setup_project_paths(PATH) + +from set_partition import get_partitioner_dict +from utils_log_files import create_log_file +from utils_run import get_standardized_df +from utils_outputs import save_predictions +from utils_artifacts import get_latest_model_artifact + +logger = logging.getLogger(__name__) + + +def forecast_model_artifact(config, artifact_name): + PATH_RAW, _, PATH_GENERATED = setup_data_paths(PATH) + PATH_ARTIFACTS = setup_artifacts_paths(PATH) + run_type = config['run_type'] + + # if an artifact name is provided through the CLI, use it. 
+ # Otherwise, get the latest model artifact based on the run type + if artifact_name: + logger.info(f"Using (non-default) artifact: {artifact_name}") + + if not artifact_name.endswith('.pkl'): + artifact_name += '.pkl' + PATH_ARTIFACT = PATH_ARTIFACTS / artifact_name + else: + # use the latest model artifact based on the run type + logger.info(f"Using latest (default) run type ({run_type}) specific artifact") + PATH_ARTIFACT = get_latest_model_artifact(PATH_ARTIFACTS, run_type) + + config["timestamp"] = PATH_ARTIFACT.stem[-15:] + df_viewser = pd.read_pickle(PATH_RAW / f"{run_type}_viewser_df.pkl") + + try: + stepshift_model = pd.read_pickle(PATH_ARTIFACT) + except FileNotFoundError: + logger.exception(f"Model artifact not found at {PATH_ARTIFACT}") + + partition = get_partitioner_dict(run_type)['predict'] + df_predictions = stepshift_model.future_point_predict(partition[0] - 1, df_viewser, keep_specific=True) + df_predictions = get_standardized_df(df_predictions, config) + data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + save_predictions(df_predictions, PATH_GENERATED, config) + create_log_file(PATH_GENERATED, config, config["timestamp"], data_generation_timestamp) diff --git a/models/lavender_haze/src/management/execute_model_runs.py b/models/lavender_haze/src/management/execute_model_runs.py new file mode 100644 index 00000000..48fa198f --- /dev/null +++ b/models/lavender_haze/src/management/execute_model_runs.py @@ -0,0 +1,48 @@ +import sys +import wandb + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from config_deployment import get_deployment_config +from config_hyperparameters import get_hp_config +from config_meta import get_meta_config +from config_sweep import get_sweep_config +from execute_model_tasks import 
execute_model_tasks +from get_data import get_data +from utils_run import update_config, update_sweep_config + + +def execute_sweep_run(args): + get_data(args) + + sweep_config = get_sweep_config() + meta_config = get_meta_config() + update_sweep_config(sweep_config, args, meta_config) + + project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file + sweep_id = wandb.sweep(sweep_config, project=project, entity='views_pipeline') + wandb.agent(sweep_id, execute_model_tasks, entity='views_pipeline') + + +def execute_single_run(args): + get_data(args) + + hp_config = get_hp_config() + meta_config = get_meta_config() + dp_config = get_deployment_config() + config = update_config(hp_config, meta_config, dp_config, args) + + project = f"{config['name']}_{args.run_type}" + + if args.run_type == 'calibration' or args.run_type == 'testing': + execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, + forecast=False, artifact_name=args.artifact_name) + + elif args.run_type == 'forecasting': + execute_model_tasks(config=config, project=project, train=args.train, eval=False, + forecast=args.forecast, artifact_name=args.artifact_name) diff --git a/models/lavender_haze/src/management/execute_model_tasks.py b/models/lavender_haze/src/management/execute_model_tasks.py new file mode 100644 index 00000000..51c04d0b --- /dev/null +++ b/models/lavender_haze/src/management/execute_model_tasks.py @@ -0,0 +1,83 @@ +import sys +import wandb +import logging +import time + +from pathlib import Path +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +from set_path import setup_project_paths +setup_project_paths(PATH) + +from evaluate_model import evaluate_model_artifact +from evaluate_sweep import evaluate_sweep +from generate_forecast import forecast_model_artifact +from train_model import train_model_artifact +from utils_run 
def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None):
    """
    Executes various model-related tasks including training, evaluation, and forecasting.

    This function manages the execution of different tasks such as training the model,
    evaluating an existing model, or performing forecasting.
    It also initializes the WandB project.

    Args:
        config (dict, optional): Configuration object containing parameters and settings.
        project (str, optional): The WandB project name.
        train (bool, optional): Flag to indicate if the model should be trained.
        eval (bool, optional): Flag to indicate if the model should be evaluated.
            NOTE(review): this parameter shadows the builtin `eval`.
        forecast (bool, optional): Flag to indicate if forecasting should be performed.
        artifact_name (str, optional): Specific name of the model artifact to load for
            evaluation or forecasting.
    """

    start_t = time.time()

    # Initialize WandB
    with wandb.init(project=project, entity="views_pipeline",
                    config=config):  # project and config ignored when running a sweep

        # add the monthly metrics to WandB
        add_wandb_monthly_metrics()

        # Update config from WandB initialization above
        # (in a sweep, wandb.config carries the swept hyperparameter values)
        config = wandb.config

        # W&B does not directly support nested dictionaries for hyperparameters
        # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs
        if config['sweep'] and config['algorithm'] == "HurdleRegression":
            # rebuild the nested clf/reg parameter dicts from the flat sweep config
            config['parameters'] = {}
            config['parameters']['clf'], config['parameters']['reg'] = split_hurdle_parameters(config)

        model = get_model(config)
        # logger.info(model)

        # Sweep runs always train and evaluate in one pass
        if config['sweep']:
            logger.info(f"Sweeping model {config['name']}...")
            stepshift_model = train_model_artifact(config, model)
            logger.info(f"Evaluating model {config['name']}...")
            evaluate_sweep(config, stepshift_model)

        # Handle the single model runs: train and save the model as an artifact
        if train:
            logger.info(f"Training model {config['name']}...")
            train_model_artifact(config, model)

        # Handle the single model runs: evaluate a trained model (artifact)
        if eval:
            logger.info(f"Evaluating model {config['name']}...")
            evaluate_model_artifact(config, artifact_name)

        if forecast:
            logger.info(f"Forecasting model {config['name']}...")
            forecast_model_artifact(config, artifact_name)

        # runtime is logged while the wandb run is still open
        end_t = time.time()
        minutes = (end_t - start_t) / 60
        logger.info(f'Done. Runtime: {minutes:.3f} minutes.\n')
def evaluate_model_artifact(config, artifact_name):
    """
    Evaluate a trained stepshifted model artifact and persist its outputs.

    Loads the model artifact named explicitly on the CLI or, when none is
    given, the latest artifact matching the run type. Predictions on the
    run-type viewser DataFrame are standardized, turned into output and
    evaluation dictionaries, logged to WandB, and saved to the generated-data
    folder together with a log file.

    Args:
        config (dict): Run configuration; must contain 'run_type'. The model
            timestamp parsed from the artifact filename is written back into
            config["timestamp"].
        artifact_name (str or None): Artifact file name, with or without the
            '.pkl' suffix; None selects the latest run-type-specific artifact.

    Raises:
        FileNotFoundError: If the resolved model artifact does not exist.
    """
    PATH_RAW, _, PATH_GENERATED = setup_data_paths(PATH)
    PATH_ARTIFACTS = setup_artifacts_paths(PATH)
    run_type = config['run_type']

    # if an artifact name is provided through the CLI, use it.
    # Otherwise, get the latest model artifact based on the run type
    if artifact_name:
        logger.info(f"Using (non-default) artifact: {artifact_name}")

        if not artifact_name.endswith('.pkl'):
            artifact_name += '.pkl'
        PATH_ARTIFACT = PATH_ARTIFACTS / artifact_name
    else:
        # use the latest model artifact based on the run type
        logger.info(f"Using latest (default) run type ({run_type}) specific artifact")
        PATH_ARTIFACT = get_latest_model_artifact(PATH_ARTIFACTS, run_type)

    # artifact filenames end with a %Y%m%d_%H%M%S timestamp (15 characters)
    config["timestamp"] = PATH_ARTIFACT.stem[-15:]
    df_viewser = pd.read_pickle(PATH_RAW / f"{run_type}_viewser_df.pkl")

    try:
        stepshift_model = pd.read_pickle(PATH_ARTIFACT)
    except FileNotFoundError:
        logger.exception(f"Model artifact not found at {PATH_ARTIFACT}")
        # BUG FIX: the exception used to be swallowed here, after which the
        # undefined `stepshift_model` raised a confusing NameError below.
        raise

    df = stepshift_model.predict(run_type, "predict", df_viewser)
    df = get_standardized_df(df, config)
    data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    output, df_output = generate_output_dict(df, config)
    evaluation, df_evaluation = generate_metric_dict(df, config)
    log_wandb_log_dict(config, evaluation)

    save_model_outputs(df_evaluation, df_output, PATH_GENERATED, config)
    create_log_file(PATH_GENERATED, config, config["timestamp"], data_generation_timestamp)
def evaluate_sweep(config, stepshift_model):
    """
    Evaluate a sweep-trained stepshifted model and log metrics to WandB.

    Predicts on the run-type viewser DataFrame, standardizes the predictions,
    computes a per-row MSE between the observed dependent variable and all
    step predictions (the sweep's minimization metric), and logs both the mean
    MSE and the full metric dictionary to WandB.

    Args:
        config (dict): Run configuration; must contain 'run_type', 'steps'
            and 'depvar'.
        stepshift_model: Trained stepshifted model exposing
            predict(run_type, "predict", df).
    """
    PATH_RAW, _, _ = setup_data_paths(PATH)
    run_type = config['run_type']
    steps = config['steps']

    df_viewser = pd.read_pickle(PATH_RAW / f"{run_type}_viewser_df.pkl")

    df = stepshift_model.predict(run_type, "predict", df_viewser)
    df = get_standardized_df(df, config)

    # Temporarily keep this because the metric to minimize is MSE
    pred_cols = [f"step_pred_{str(i)}" for i in steps]
    # BUG FIX: the actual-value vector was hard-coded to length 36, which
    # crashes mean_squared_error whenever len(steps) != 36; size it to the
    # number of prediction columns instead.
    df["mse"] = df.apply(lambda row: mean_squared_error([row[config['depvar']]] * len(pred_cols),
                                                        [row[col] for col in pred_cols]), axis=1)

    wandb.log({'MSE': df['mse'].mean()})

    evaluation, df_evaluation = generate_metric_dict(df, config)
    log_wandb_log_dict(config, evaluation)
def stepshift_training(config, partition_name, model, dataset):
    """
    Fit a stepshifted model on one partition of the dataset.

    Builds a DataPartitioner for the named partition, wraps the base model in
    a StepshiftedModels definition over the configured steps and dependent
    variable, fits the resulting ViewsRun on the partition's train split, and
    returns it.

    Args:
        config (dict): Must contain "steps" and "depvar".
        partition_name (str): Partition key, also used to look up the
            partitioner dict.
        model: The base estimator to be stepshifted.
        dataset: Input data passed to ViewsRun.fit.

    Returns:
        ViewsRun: The fitted stepshifted run object.
    """
    partition = DataPartitioner({partition_name: get_partitioner_dict(partition_name)})
    shifted_models = StepshiftedModels(model, config["steps"], config["depvar"])
    run = ViewsRun(partition, shifted_models)
    run.fit(partition_name, "train", dataset)
    return run
def save_model_outputs(df_evaluation, df_output, PATH_GENERATED, config):
    """
    Pickle the model-output and evaluation DataFrames to the generated folder.

    File names encode the last step, the run type, and the model timestamp
    taken from the config. The target folder is created if missing.

    Args:
        df_evaluation: Evaluation-metrics DataFrame (or any picklable object).
        df_output: Model-outputs DataFrame (or any picklable object).
        PATH_GENERATED: Folder to write the pickle files into.
        config (dict): Must contain 'steps', 'run_type' and 'timestamp'.
    """
    log = logging.getLogger(__name__)
    Path(PATH_GENERATED).mkdir(parents=True, exist_ok=True)

    # shared filename suffix: <last step>_<run type>_<model timestamp>
    suffix = f"{config['steps'][-1]}_{config['run_type']}_{config['timestamp']}"

    def _dump(obj, path):
        # small helper so both artifacts are written the same way
        with open(path, 'wb') as fh:
            pickle.dump(obj, fh)

    outputs_path = f"{PATH_GENERATED}/output_{suffix}.pkl"
    _dump(df_output, outputs_path)
    log.info(f"Model outputs saved at: {outputs_path}")

    evaluation_path = f"{PATH_GENERATED}/evaluation_{suffix}.pkl"
    _dump(df_evaluation, evaluation_path)
    log.info(f"Evaluation metrics saved at: {evaluation_path}")
def get_model(config):
    """
    Instantiate the model specified by config["algorithm"].

    For "HurdleRegression" the clf/reg sub-model names and their parameter
    dicts come from the config; for any other algorithm the class is looked up
    by name in this module's globals (e.g. LGBMRegressor, XGBRegressor) and
    called with the parameters returned by get_parameters().

    Args:
        config (dict): Run configuration with at least "algorithm" and, for
            hurdle models, "model_clf", "model_reg" and nested "parameters".

    Returns:
        An estimator instance ready to be fitted.
    """
    if config["algorithm"] == "HurdleRegression":
        model = HurdleRegression(clf_name=config["model_clf"], reg_name=config["model_reg"],
                                 clf_params=config["parameters"]["clf"], reg_params=config["parameters"]["reg"])
    else:
        parameters = get_parameters(config)
        model = globals()[config["algorithm"]](**parameters)

    return model


def get_parameters(config):
    """
    Extract the hyperparameters to pass to the model constructor.

    In a sweep the flat wandb config mixes run metadata with hyperparameters,
    so the known metadata keys are filtered out; in a single run the
    hyperparameters live under config["parameters"].

    Args:
        config (dict): Run configuration.

    Returns:
        dict: Keyword arguments for the model constructor.
    """
    if config["sweep"]:
        # BUG FIX: "model_cls" was a typo for "model_clf" (the key actually set
        # by update_sweep_config), so the clf model name was never filtered out.
        # NOTE(review): update_sweep_config also injects 'name', which is not
        # filtered here — confirm whether it should be removed as well.
        keys_to_remove = ["algorithm", "depvar", "steps", "sweep", "run_type", "model_clf", "model_reg"]
        parameters = {k: v for k, v in config.items() if k not in keys_to_remove}
    else:
        parameters = config["parameters"]

    return parameters


def get_standardized_df(df, config):
    """
    Standardize the prediction DataFrame based on the run type.

    Keeps the dependent variable plus prediction columns for calibration and
    testing runs, or only the step-prediction columns for forecasting runs,
    then replaces +/-inf with 0 and clips negative values to 0.

    Args:
        df (pd.DataFrame): Raw prediction frame.
        config (dict): Must contain 'run_type', 'steps' and 'depvar'.

    Returns:
        pd.DataFrame: The standardized frame restricted to the chosen columns.

    Raises:
        ValueError: If config['run_type'] is not a known run type.
    """
    run_type = config['run_type']
    steps = config['steps']
    depvar = config['depvar']

    # choose the columns to keep based on the run type and replace negative values with 0
    if run_type in ['calibration', 'testing']:
        cols = [depvar] + df.forecasts.prediction_columns
    elif run_type == "forecasting":
        cols = [f'step_pred_{i}' for i in steps]
    else:
        # BUG FIX: an unknown run type previously fell through and crashed
        # below with an opaque NameError on `cols`.
        raise ValueError(f"Unknown run type: {run_type}")

    df = df.replace([np.inf, -np.inf], 0)[cols]
    df = df.mask(df < 0, 0)
    return df


def split_hurdle_parameters(parameters_dict):
    """
    Split a flat sweep-parameter dict into clf and reg parameter dicts.

    Keys prefixed 'cls_' go to the classification dict and keys prefixed
    'reg_' to the regression dict, with the prefix stripped; all other keys
    are ignored.

    Args:
        parameters_dict (dict): Flat parameter mapping, e.g. a wandb config.

    Returns:
        tuple[dict, dict]: (classification params, regression params).
    """
    cls_dict = {}
    reg_dict = {}

    for key, value in parameters_dict.items():
        # BUG FIX: str.replace() could also mangle a second occurrence of the
        # prefix inside the key (e.g. 'cls_my_cls_param'); slice the prefix off.
        if key.startswith('cls_'):
            cls_dict[key[len('cls_'):]] = value
        elif key.startswith('reg_'):
            reg_dict[key[len('reg_'):]] = value

    return cls_dict, reg_dict
def update_config(hp_config, meta_config, dp_config, args):
    """
    Assemble the single-run configuration from the three config sources.

    Starts from a shallow copy of the hyperparameter config and layers in the
    run type, metadata (name, depvar, algorithm, and the hurdle sub-model
    names when applicable), and the deployment status. 'sweep' is forced to
    False for single runs.

    Args:
        hp_config (dict): Hyperparameter configuration (not mutated).
        meta_config (dict): Model metadata configuration.
        dp_config (dict): Deployment configuration.
        args: Parsed CLI namespace providing run_type.

    Returns:
        dict: The merged run configuration.
    """
    config = dict(hp_config)
    config.update({
        'run_type': args.run_type,
        'sweep': False,
        'name': meta_config['name'],
        'depvar': meta_config['depvar'],
        'algorithm': meta_config['algorithm'],
    })
    if meta_config['algorithm'] == 'HurdleRegression':
        config['model_clf'] = meta_config['model_clf']
        config['model_reg'] = meta_config['model_reg']
    config['deployment_status'] = dp_config['deployment_status']

    return config


def update_sweep_config(sweep_config, args, meta_config):
    """
    Inject run metadata into a WandB sweep configuration in place.

    Each value is wrapped in the {'value': ...} form WandB expects for fixed
    sweep parameters; the hurdle sub-model names are only added for the
    HurdleRegression algorithm.

    Args:
        sweep_config (dict): Sweep configuration with a 'parameters' mapping;
            mutated in place.
        args: Parsed CLI namespace providing run_type.
        meta_config (dict): Model metadata configuration.
    """
    params = sweep_config['parameters']
    params['run_type'] = {'value': args.run_type}
    params['sweep'] = {'value': True}
    for key in ('name', 'depvar', 'algorithm'):
        params[key] = {'value': meta_config[key]}
    if meta_config['algorithm'] == 'HurdleRegression':
        for key in ('model_clf', 'model_reg'):
            params[key] = {'value': meta_config[key]}