diff --git a/common_querysets/queryset_caring_fish.py b/common_querysets/queryset_caring_fish.py
new file mode 100644
index 00000000..65f8b0f7
--- /dev/null
+++ b/common_querysets/queryset_caring_fish.py
@@ -0,0 +1,41 @@
+from viewser import Queryset, Column
+
+def generate():
+    """
+    Contains the configuration for the input data in the form of a viewser queryset, i.e. the data from viewser that is used to train the model.
+    This configuration is "behavioral", so modifying it will affect the model's runtime behavior and integration into the deployment system.
+    There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and algorithm accordingly.
+
+    Returns:
+    - queryset_base (Queryset): A queryset containing the base data for model training.
+    """
+
+    # VIEWSER 6, example configuration. Modify as needed.
+
+    queryset_base = (Queryset("caring_fish", "priogrid_month")
+                     # Create a new column 'ln_sb_best' from the 'ged_sb_best_count_nokgi' column at the 'priogrid_month' level of analysis
+                     # Apply a logarithmic transformation, then replace missing values (NA)
+                     .with_column(Column("ln_sb_best", from_loa="priogrid_month", from_column="ged_sb_best_count_nokgi")
+                                  .transform.ops.ln().transform.missing.replace_na())
+
+                     # Create a new column 'ln_ns_best' from the 'ged_ns_best_count_nokgi' column at the 'priogrid_month' level of analysis
+                     # Apply a logarithmic transformation, then replace missing values (NA)
+                     .with_column(Column("ln_ns_best", from_loa="priogrid_month", from_column="ged_ns_best_count_nokgi")
+                                  .transform.ops.ln().transform.missing.replace_na())
+
+                     # Create a new column 'ln_os_best' from the 'ged_os_best_count_nokgi' column at the 'priogrid_month' level of analysis
+                     # Apply a logarithmic transformation, then replace missing values (NA)
+                     .with_column(Column("ln_os_best", from_loa="priogrid_month", from_column="ged_os_best_count_nokgi")
+                                  .transform.ops.ln().transform.missing.replace_na())
+
+                     # Create columns for month and year_id
+                     .with_column(Column("month", from_loa="month", from_column="month"))
+                     .with_column(Column("year_id", from_loa="country_year", from_column="year_id"))
+
+                     # Create columns for country_id (c_id), col, and row
+                     .with_column(Column("c_id", from_loa="country_year", from_column="country_id"))
+                     .with_column(Column("col", from_loa="priogrid", from_column="col"))
+                     .with_column(Column("row", from_loa="priogrid", from_column="row"))
+                     )
+
+    return queryset_base
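For context, a queryset defined this way is normally materialized by publishing it to the viewser service and fetching the result as a pandas DataFrame. A minimal sketch of that usage, assuming the usual viewser `publish().fetch()` pattern and that the repository root is on `sys.path`:

```python
# Hedged usage sketch: materialize the caring_fish queryset as a DataFrame.
from common_querysets.queryset_caring_fish import generate

queryset = generate()
# publish() registers the queryset definition with the viewser service;
# fetch() then returns a pandas DataFrame indexed by (month_id, priogrid_gid),
# matching the index seen in the notebook output further down in this diff.
df = queryset.publish().fetch()
print(df.shape)
```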
diff --git a/models/caring_fish/README.md b/models/caring_fish/README.md
new file mode 100644
index 00000000..bfca3432
--- /dev/null
+++ b/models/caring_fish/README.md
@@ -0,0 +1,3 @@
+# Model README
+## Model name: caring_fish
+## Created on: 2024-10-28 16:45:11.931747
\ No newline at end of file
diff --git a/models/caring_fish/configs/config_deployment.py b/models/caring_fish/configs/config_deployment.py
new file mode 100644
index 00000000..9e45b735
--- /dev/null
+++ b/models/caring_fish/configs/config_deployment.py
@@ -0,0 +1,20 @@
+"""
+Deployment Configuration Script
+
+This script defines the deployment configuration settings for the application.
+It includes the deployment status and any additional settings specified.
+
+Deployment Status:
+- shadow: The deployment is shadowed and not yet active.
+- deployed: The deployment is active and in use.
+- baseline: The deployment is in a baseline state, for reference or comparison.
+- deprecated: The deployment is deprecated and no longer supported.
+
+Additional settings can be included in the configuration dictionary as needed.
+
+"""
+
+def get_deployment_config():
+    # Deployment settings
+    deployment_config = {'deployment_status': 'shadow'}
+    return deployment_config
diff --git a/models/caring_fish/configs/config_hyperparameters.py b/models/caring_fish/configs/config_hyperparameters.py
new file mode 100644
index 00000000..8dc75e49
--- /dev/null
+++ b/models/caring_fish/configs/config_hyperparameters.py
@@ -0,0 +1,14 @@
+def get_hp_config():
+    """
+    Contains the hyperparameter configuration for model training.
+    This configuration is "operational", so modifying these settings will impact the model's behavior during training.
+
+    Returns:
+    - hyperparameters (dict): A dictionary of hyperparameters that determine the model's behavior during the training phase.
+    """
+
+    hyperparameters = {
+        'model': 'LightGBM',  # The model algorithm used. E.g. "LSTM", "CNN", "Transformer"
+        # Add more hyperparameters as needed
+    }
+    return hyperparameters
diff --git a/models/caring_fish/configs/config_meta.py b/models/caring_fish/configs/config_meta.py
new file mode 100644
index 00000000..2a2d7ab3
--- /dev/null
+++ b/models/caring_fish/configs/config_meta.py
@@ -0,0 +1,19 @@
+def get_meta_config():
+    """
+    Contains the metadata for the model (model algorithm, name, target variable, and level of analysis).
+    This config is for documentation purposes only; modifying it will not affect the model, the training, or the evaluation.
+
+    Returns:
+    - meta_config (dict): A dictionary containing the model metadata.
+    """
+
+    meta_config = {
+        "name": "caring_fish",  # E.g. happy_kitten
+        "algorithm": "LightGBM",  # E.g. "LSTM", "CNN", "Transformer"
+        # Uncomment and modify the following lines as needed for additional metadata:
+        # "targets": ["ln_sb_best", "ln_ns_best", "ln_os_best", "ln_sb_best_binarized", "ln_ns_best_binarized", "ln_os_best_binarized"],
+        "queryset": "escwa001_cflong",
+        # "level": "pgm",
+        # "creator": "Your name here"
+    }
+    return meta_config
diff --git a/models/caring_fish/configs/config_sweep.py b/models/caring_fish/configs/config_sweep.py
new file mode 100644
index 00000000..f9997c4f
--- /dev/null
+++ b/models/caring_fish/configs/config_sweep.py
@@ -0,0 +1,29 @@
+def get_sweep_config():
+    """
+    Contains the configuration for hyperparameter sweeps using WandB.
+    This configuration is "operational", so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance.
+
+    Returns:
+    - sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters.
+    """
+
+    sweep_config = {
+        'method': 'grid',
+    }
+
+    # Example metric setup:
+    metric = {
+        'name': 'MSE',
+        'goal': 'minimize'
+    }
+    sweep_config['metric'] = metric
+
+    # Example parameters setup:
+    parameters_dict = {
+        'model': {
+            'value': 'LightGBM'  # E.g. "LSTM", "CNN", "Transformer"
+        },
+    }
+    sweep_config['parameters'] = parameters_dict
+
+    return sweep_config
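For context, a sweep config like this is handed to W&B by `execute_model_runs.py` (the lavender_haze version of that script appears further down in this diff and uses the same `wandb.sweep` / `wandb.agent` calls). A self-contained sketch of that hand-off, with a hypothetical `train` function standing in for `execute_model_tasks`:

```python
# Hedged sketch: launching a grid sweep from a config like the one above.
import wandb

def train():
    # wandb.agent invokes this once per grid point; the sampled
    # hyperparameters are available on the run's config.
    with wandb.init() as run:
        print(run.config['model'])      # 'LightGBM'
        run.log({'MSE': 0.0})           # the sweep's target metric must be logged by the training code

sweep_config = {
    'method': 'grid',
    'metric': {'name': 'MSE', 'goal': 'minimize'},
    'parameters': {'model': {'value': 'LightGBM'}},
}
sweep_id = wandb.sweep(sweep_config, project="caring_fish_sweep")  # hypothetical project name
wandb.agent(sweep_id, function=train, count=1)
```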
"LSTM", "CNN", "Transformer" + }, + } + sweep_config['parameters'] = parameters_dict + + return sweep_config diff --git a/models/caring_fish/main.py b/models/caring_fish/main.py new file mode 100644 index 00000000..576958b2 --- /dev/null +++ b/models/caring_fish/main.py @@ -0,0 +1,39 @@ +import time +import wandb +import sys +import logging +logging.basicConfig(filename='run.log', encoding='utf-8', level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) +from pathlib import Path +# Set up the path to include common_utils module +PATH = Path(__file__) +sys.path.insert(0, str(Path( + *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS +# Import necessary functions for project setup and model execution +from set_path import setup_project_paths +setup_project_paths(PATH) +from utils_cli_parser import parse_args, validate_arguments +from execute_model_runs import execute_sweep_run, execute_single_run + +if __name__ == "__main__": + # Parse command-line arguments + args = parse_args() + + # Validate the arguments to ensure they are correct + validate_arguments(args) + # Log in to Weights & Biases (wandb) + wandb.login() + # Record the start time + start_t = time.time() + # Execute the model run based on the sweep flag + if args.sweep: + execute_sweep_run(args) # Execute sweep run + else: + execute_single_run(args) # Execute single run + # Record the end time + end_t = time.time() + + # Calculate and print the runtime in minutes + minutes = (end_t - start_t) / 60 + logger.info(f'Done. Runtime: {minutes:.3f} minutes') diff --git a/models/lavender_haze/requirements.txt b/models/caring_fish/requirements.txt similarity index 100% rename from models/lavender_haze/requirements.txt rename to models/caring_fish/requirements.txt diff --git a/models/lavender_haze/.DS_Store b/models/lavender_haze/.DS_Store deleted file mode 100644 index 7224dcaa..00000000 Binary files a/models/lavender_haze/.DS_Store and /dev/null differ diff --git a/models/lavender_haze/README.md b/models/lavender_haze/README.md deleted file mode 100644 index 7306bc97..00000000 --- a/models/lavender_haze/README.md +++ /dev/null @@ -1,85 +0,0 @@ -# Lavender Haze Model -## Overview -This folder contains code for Lavender Haze model, a machine learning model designed for predicting fatalities. - -The model utilizes Hurdle Model (LGBMClassifier+LGBMRegressor) for its predictions and is on pgm level of analysis. - -The model uses log fatalities. - -## Repository Structure -``` - -lavender_haze/ # should follow the naming convention adjective_noun -|-- README.md -|-- requirements.txt -| -|-- artifacts/ # ensemble stepshifter models -| |-- model_metadata_dict.py # the standard meta data dict for models -| -|-- configs/ # ... -| |-- config_deployment.py # configuration for deploying the model into different environments -| |-- config_hyperparameters.py # hyperparameters for the model -| |-- config_input_data.py # defined queryset as the input data -| |-- config_meta # metadata for the model (model architecture, name, target variable, and level of analysis) -| |-- config_sweep # sweeping parameters for weights & biases -| -|-- data/ # all input, processed, output data -| |-- generated/ # Data generated - i.e. 
forecast/ evaluation -| |-- processed/ # Data processed -| |-- raw/ # Data directly from VIEiWSER -| -|-- notebooks/ -| -|-- reports/ # dissemination material - internal and external -| |-- figures/ # figures for papers, reports, newsletters, and slides -| |-- papers/ # working papers, white papers, articles ect. -| |-- plots/ # plots for papers, reports, newsletters, and slides -| |-- slides/ # slides, presentation and similar -| |-- timelapse/ # plots to create timelapse and the timelapse -| -|-- src/ # all source code needed to train, test, and forecast - | - |-- dataloaders/ - | |-- get_data.py # script to get data from VIEWSER (and input drift detection) - | - |-- forecasting/ - | |-- generate_forecast.py # script to genereate true-future fc - | - |-- management/ - | |-- execute_model_runs.py # execute a single run - | |-- execute_model_tasks.py # execute various model-related tasks - | - |-- offline_evaluation/ # aka offline quality assurance - | |-- evaluate_model.py # script to evaluate a single model - | |-- evaluate_sweep.py # script to evaluate a model during sweeping - | - |-- online_evaluation/ - | - |-- training/ - | |-- train_model.py # script to train a single model - | - |-- utils/ # functions and classes - | |-- utils.py # a general utils function - | |-- utils_wandb.py # a w&b specific utils function - | - |-- visualization/ # scripts to create visualizations - |-- visual.py - - -``` - -## Setup Instructions -Clone the repository. - -Install dependencies. - -## Usage -Modify configurations in configs/. - -Run main.py. - -``` -python main.py -r calibration -t -e -``` - -Monitor progress and results using [Weights & Biases](https://wandb.ai/views_pipeline/lavender_haze). \ No newline at end of file diff --git a/models/lavender_haze/artifacts/model_metadata_dict.py b/models/lavender_haze/artifacts/model_metadata_dict.py deleted file mode 100644 index e69de29b..00000000 diff --git a/models/lavender_haze/configs/config_deployment.py b/models/lavender_haze/configs/config_deployment.py deleted file mode 100644 index e1d56586..00000000 --- a/models/lavender_haze/configs/config_deployment.py +++ /dev/null @@ -1,16 +0,0 @@ -def get_deployment_config(): - - """ - Contains the configuration for deploying the model into different environments. - This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system. - - Returns: - - deployment_config (dict): A dictionary containing deployment settings, determining how the model is deployed, including status, endpoints, and resource allocation. 
- """ - - # More deployment settings can/will be added here - deployment_config = { - "deployment_status": "shadow", # shadow, deployed, baseline, or deprecated - } - - return deployment_config \ No newline at end of file diff --git a/models/lavender_haze/configs/config_hyperparameters.py b/models/lavender_haze/configs/config_hyperparameters.py deleted file mode 100644 index 11675b1f..00000000 --- a/models/lavender_haze/configs/config_hyperparameters.py +++ /dev/null @@ -1,17 +0,0 @@ -def get_hp_config(): - hp_config = { - "steps": [*range(1, 36 + 1, 1)], - "parameters": { - "clf":{ - "learning_rate": 0.05, - "n_estimators": 100, - "n_jobs": 12 - }, - "reg":{ - "learning_rate": 0.05, - "n_estimators": 100, - "n_jobs": 12 - } - } - } - return hp_config \ No newline at end of file diff --git a/models/lavender_haze/configs/config_input_data.py b/models/lavender_haze/configs/config_input_data.py deleted file mode 100644 index 28f790c6..00000000 --- a/models/lavender_haze/configs/config_input_data.py +++ /dev/null @@ -1,167 +0,0 @@ -import numpy as np -from viewser import Queryset, Column - -def get_input_data_config(): - - thetacrit_spatial = 0.7 - return_values = 'distances' - n_nearest = 1 - power = 0.0 - - qs_broad = (Queryset("fatalities003_pgm_broad", "priogrid_month") - - # target variable - .with_column(Column("ln_ged_sb_dep", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") - .transform.missing.replace_na() - .transform.ops.ln() - ) - - # timelags 0 of conflict variables, ged_best versions - - .with_column(Column("ged_sb", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") - .transform.missing.fill() - .transform.missing.replace_na() - ) - - .with_column(Column("ged_os", from_loa="priogrid_month", from_column="ged_os_best_sum_nokgi") - .transform.missing.fill() - .transform.missing.replace_na() - ) - - .with_column(Column("ged_ns", from_loa="priogrid_month", from_column="ged_ns_best_sum_nokgi") - .transform.missing.fill() - .transform.missing.replace_na() - ) - - # Spatial lag - .with_column(Column("splag_1_1_sb_1", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") - .transform.missing.replace_na() - .transform.bool.gte(1) - .transform.temporal.time_since() - .transform.temporal.decay(24) - .transform.spatial.lag(1, 1, 0, 0) - .transform.missing.replace_na() - ) - - # Decay functions - # sb - .with_column(Column("decay_ged_sb_5", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") - .transform.missing.replace_na() - .transform.bool.gte(5) - .transform.temporal.time_since() - .transform.temporal.decay(12) - .transform.missing.replace_na() - ) - # os - .with_column(Column("decay_ged_os_5", from_loa="priogrid_month", from_column="ged_os_best_sum_nokgi") - .transform.missing.replace_na() - .transform.bool.gte(5) - .transform.temporal.time_since() - .transform.temporal.decay(12) - .transform.missing.replace_na() - ) - - # ns - .with_column(Column("decay_ged_ns_5", from_loa="priogrid_month", from_column="ged_ns_best_sum_nokgi") - .transform.missing.replace_na() - .transform.bool.gte(5) - .transform.temporal.time_since() - .transform.temporal.decay(12) - .transform.missing.replace_na() - ) - - # Trees - - .with_column(Column("treelag_1_sb", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") - .transform.missing.replace_na() - .transform.spatial.treelag(thetacrit_spatial, 1) - ) - - .with_column(Column("treelag_2_sb", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") - .transform.missing.replace_na() - 
.transform.spatial.treelag(thetacrit_spatial, 2) - ) - # sptime - - # continuous, sptime_dist, nu=1 - .with_column(Column("sptime_dist_k1_1_ged_sb", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") - .transform.missing.replace_na() - .transform.spatial.sptime_dist(return_values, n_nearest, 1.0, power) - ) - - .with_column(Column("sptime_dist_k1_2_ged_sb", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") - .transform.missing.replace_na() - .transform.spatial.sptime_dist(return_values, n_nearest, 10.0, power) - ) - - .with_column(Column("sptime_dist_k1_3_ged_sb", from_loa="priogrid_month", from_column="ged_sb_best_sum_nokgi") - .transform.missing.replace_na() - .transform.spatial.sptime_dist(return_values, n_nearest, 0.01, power) - ) - - # From natsoc - .with_column(Column("ln_ttime_mean", from_loa="priogrid_year", from_column="ttime_mean") - .transform.ops.ln() - .transform.missing.fill() - .transform.missing.replace_na() - ) - - .with_column(Column("ln_bdist3", from_loa="priogrid_year", from_column="bdist3") - .transform.ops.ln() - .transform.missing.fill() - .transform.missing.replace_na() - ) - - .with_column(Column("ln_capdist", from_loa="priogrid_year", from_column="capdist") - .transform.ops.ln() - .transform.missing.fill() - .transform.missing.replace_na() - ) - - .with_column(Column("dist_diamsec", from_loa="priogrid", from_column="dist_diamsec_s_wgs") - .transform.missing.fill() - .transform.missing.replace_na() - ) - - .with_column(Column("imr_mean", from_loa="priogrid_year", from_column="imr_mean") - .transform.missing.fill() - .transform.missing.replace_na() - ) - - # From drought - .with_column(Column("tlag1_dr_mod_gs", from_loa="priogrid_month", - from_column="tlag1_dr_mod_gs") - .transform.missing.replace_na(0) - ) - - .with_column(Column("spei1_gs_prev10_anom", from_loa="priogrid_month", - from_column="spei1_gs_prev10_anom") - .transform.missing.replace_na(0) - ) - - .with_column(Column("tlag_12_crop_sum", from_loa="priogrid_month", - from_column="tlag_12_crop_sum") - .transform.missing.replace_na(0) - ) - - .with_column(Column("spei1gsy_lowermedian_count", from_loa="priogrid_month", - from_column="spei1gsy_lowermedian_count") - .transform.missing.replace_na(0) - ) - - # Log population as control - .with_column(Column("ln_pop_gpw_sum", from_loa="priogrid_year", from_column="pop_gpw_sum") - .transform.ops.ln() - .transform.missing.fill() - .transform.missing.replace_na() - ) - - .with_theme("fatalities") - .describe("""fatalities broad model, pgm level - - Predicting ln(ged_best_sb), broad model - - """) - ) - - return qs_broad \ No newline at end of file diff --git a/models/lavender_haze/configs/config_meta.py b/models/lavender_haze/configs/config_meta.py deleted file mode 100644 index e7ca1fc5..00000000 --- a/models/lavender_haze/configs/config_meta.py +++ /dev/null @@ -1,19 +0,0 @@ -def get_meta_config(): - """ - Contains the meta data for the model (model architecture, name, target variable, and level of analysis). - This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation. - - Returns: - - meta_config (dict): A dictionary containing model meta configuration. - """ - meta_config = { - "name": "lavender_haze", - "algorithm": "HurdleRegression", - "model_clf": "LGBMClassifier", - "model_reg": "LGBMRegressor", - "depvar": "ln_ged_sb_dep", # IMPORTANT! The current stepshift only takes one target variable! Not compatiable with Simon's code! 
- "queryset": "fatalities003_pgm_broad", - "level": "pgm", - "creator": "Xiaolong" - } - return meta_config \ No newline at end of file diff --git a/models/lavender_haze/configs/config_sweep.py b/models/lavender_haze/configs/config_sweep.py deleted file mode 100644 index 36ebf61c..00000000 --- a/models/lavender_haze/configs/config_sweep.py +++ /dev/null @@ -1,26 +0,0 @@ -def get_sweep_config(): - sweep_config = { - "name": "lavender_haze", - "method": "grid" - } - - metric = { - "name": "MSE", - "goal": "minimize" - } - - sweep_config["metric"] = metric - - parameters_dict = { - "steps": {"values": [[*range(1, 36 + 1, 1)]]}, - "cls_n_estimators": {"values": [100, 200]}, - "cls_learning_rate": {"values": [0.05]}, - "cls_n_jobs": {"values": [12]}, - "reg_n_estimators": {"values": [100, 200]}, - "reg_learning_rate": {"values": [0.05]}, - "reg_n_jobs": {"values": [12]} - } - - sweep_config["parameters"] = parameters_dict - - return sweep_config \ No newline at end of file diff --git a/models/lavender_haze/data/generated/.gitkeep b/models/lavender_haze/data/generated/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/models/lavender_haze/data/processed/.gitkeep b/models/lavender_haze/data/processed/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/models/lavender_haze/data/raw/.gitkeep b/models/lavender_haze/data/raw/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/models/lavender_haze/main.py b/models/lavender_haze/main.py deleted file mode 100644 index c199227c..00000000 --- a/models/lavender_haze/main.py +++ /dev/null @@ -1,30 +0,0 @@ -import wandb -import sys -import warnings - -from pathlib import Path -PATH = Path(__file__) -sys.path.insert(0, str(Path( - *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS -from set_path import setup_project_paths, setup_root_paths -setup_project_paths(PATH) - -from utils_cli_parser import parse_args, validate_arguments -from utils_logger import setup_logging -from execute_model_runs import execute_sweep_run, execute_single_run - -warnings.filterwarnings("ignore") - -logger = setup_logging('run.log') - - -if __name__ == "__main__": - wandb.login() - - args = parse_args() - validate_arguments(args) - - if args.sweep: - execute_sweep_run(args) - else: - execute_single_run(args) diff --git a/models/lavender_haze/notebooks/notebook001.ipynb b/models/lavender_haze/notebooks/notebook001.ipynb deleted file mode 100644 index 4d25bd82..00000000 --- a/models/lavender_haze/notebooks/notebook001.ipynb +++ /dev/null @@ -1,696 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Optional, Union\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from sklearn.linear_model import LinearRegression, LogisticRegression\n", - "from sklearn.base import BaseEstimator\n", - "from sklearn.utils.estimator_checks import check_estimator\n", - "from sklearn.utils.validation import check_X_y, check_array, check_is_fitted\n", - "from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor\n", - "from sklearn.ensemble import RandomForestRegressor\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.ensemble import HistGradientBoostingRegressor\n", - "from sklearn.ensemble import HistGradientBoostingClassifier\n", - "from xgboost import XGBRegressor\n", - "from xgboost import XGBClassifier\n", - "from xgboost 
import XGBRFRegressor, XGBRFClassifier\n", - "from lightgbm import LGBMClassifier, LGBMRegressor\n", - "\n", - "#from lightgbm import LGBMClassifier, LGBMRegressor\n", - "\n", - "\n", - "class HurdleRegression(BaseEstimator):\n", - " \"\"\" Regression model which handles excessive zeros by fitting a two-part model and combining predictions:\n", - " 1) binary classifier\n", - " 2) continuous regression\n", - " Implementeted as a valid sklearn estimator, so it can be used in pipelines and GridSearch objects.\n", - " Args:\n", - " clf_name: currently supports either 'logistic' or 'LGBMClassifier'\n", - " reg_name: currently supports either 'linear' or 'LGBMRegressor'\n", - " clf_params: dict of parameters to pass to classifier sub-model when initialized\n", - " reg_params: dict of parameters to pass to regression sub-model when initialized\n", - " \"\"\"\n", - "\n", - " def __init__(self,\n", - " clf_name: str = 'logistic',\n", - " reg_name: str = 'linear',\n", - " clf_params: Optional[dict] = None,\n", - " reg_params: Optional[dict] = None):\n", - "\n", - " self.clf_name = clf_name\n", - " self.reg_name = reg_name\n", - " self.clf_params = clf_params\n", - " self.reg_params = reg_params\n", - " self.clf_fi = []\n", - " self.reg_fi = []\n", - "\n", - " @staticmethod\n", - " def _resolve_estimator(func_name: str):\n", - " \"\"\" Lookup table for supported estimators.\n", - " This is necessary because sklearn estimator default arguments\n", - " must pass equality test, and instantiated sub-estimators are not equal. \"\"\"\n", - "\n", - " funcs = {'linear': LinearRegression(),\n", - " 'logistic': LogisticRegression(solver='liblinear'),\n", - " 'LGBMRegressor': LGBMRegressor(n_estimators=250),\n", - " 'LGBMClassifier': LGBMClassifier(n_estimators=250),\n", - " 'RFRegressor': XGBRFRegressor(n_estimators=250,n_jobs=-2),\n", - " 'RFClassifier': XGBRFClassifier(n_estimators=250,n_jobs=-2),\n", - " 'GBMRegressor': GradientBoostingRegressor(n_estimators=200),\n", - " 'GBMClassifier': GradientBoostingClassifier(n_estimators=200),\n", - " 'XGBRegressor': XGBRegressor(n_estimators=100,learning_rate=0.05,n_jobs=-2),\n", - " 'XGBClassifier': XGBClassifier(n_estimators=100,learning_rate=0.05,n_jobs=-2),\n", - " 'HGBRegressor': HistGradientBoostingRegressor(max_iter=200),\n", - " 'HGBClassifier': HistGradientBoostingClassifier(max_iter=200),\n", - " }\n", - "\n", - " return funcs[func_name]\n", - "\n", - " def fit(self,\n", - " X: Union[np.ndarray, pd.DataFrame],\n", - " y: Union[np.ndarray, pd.Series]):\n", - " X, y = check_X_y(X, y, dtype=None,\n", - " accept_sparse=False,\n", - " accept_large_sparse=False,\n", - " force_all_finite='allow-nan')\n", - "\n", - " if X.shape[1] < 2:\n", - " raise ValueError('Cannot fit model when n_features = 1')\n", - "\n", - " self.clf_ = self._resolve_estimator(self.clf_name)\n", - " if self.clf_params:\n", - " self.clf_.set_params(**self.clf_params)\n", - " self.clf_.fit(X, y > 0)\n", - " self.clf_fi = self.clf_.feature_importances_\n", - "\n", - " self.reg_ = self._resolve_estimator(self.reg_name)\n", - " if self.reg_params:\n", - " self.reg_.set_params(**self.reg_params)\n", - " self.reg_.fit(X[y > 0], y[y > 0])\n", - " self.reg_fi = self.reg_.feature_importances_\n", - "\n", - " self.is_fitted_ = True\n", - " return self\n", - "\n", - "\n", - " def predict_bck(self, X: Union[np.ndarray, pd.DataFrame]):\n", - " \"\"\" Predict combined response using binary classification outcome \"\"\"\n", - " X = check_array(X, accept_sparse=False, accept_large_sparse=False)\n", - " 
check_is_fitted(self, 'is_fitted_')\n", - " return self.clf_.predict(X) * self.reg_.predict(X)\n", - "\n", - " def predict(self, X: Union[np.ndarray, pd.DataFrame]):\n", - " \"\"\" Predict combined response using probabilistic classification outcome \"\"\"\n", - " X = check_array(X, accept_sparse=False, accept_large_sparse=False)\n", - " check_is_fitted(self, 'is_fitted_')\n", - " return self.clf_.predict_proba(X)[:, 1] * self.reg_.predict(X)\n", - "\n", - " \n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "hp_config = {\n", - " \"clf\":{\n", - " \"learning_rate\": 0.05,\n", - " \"n_estimators\": 100,\n", - " \"n_jobs\": 12\n", - " },\n", - " \"reg\":{\n", - " \"learning_rate\": 0.05,\n", - " \"n_estimators\": 100,\n", - " \"n_jobs\": 12\n", - " }\n", - "}\n", - "common_config = {\n", - " \"name\": \"lavender_haze\",\n", - " \"algorithm\": \"HurdleRegression\",\n", - " \"clf_name\":\"LGBMClassifier\",\n", - " \"reg_name\":\"LGBMRegressor\",\n", - " \"depvar\": \"ged_sb_dep\",\n", - " \"queryset\": \"fatalities003_pgm_broad\",\n", - " \"data_train\": \"baseline\",\n", - " \"level\": \"pgm\",\n", - " 'steps': [*range(1, 36 + 1, 1)],\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "cls_model = HurdleRegression(clf_name=common_config['clf_name'], reg_name=common_config['reg_name'], clf_params=hp_config['clf'], reg_params=hp_config['reg'])" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[sklearn HTML estimator widget stripped during extraction; the equivalent text/plain rendering follows]
" - ], - "text/plain": [ - "HurdleRegression(clf_name='LGBMClassifier',\n", - " clf_params={'learning_rate': 0.05, 'n_estimators': 100,\n", - " 'n_jobs': 12},\n", - " reg_name='LGBMRegressor',\n", - " reg_params={'learning_rate': 0.05, 'n_estimators': 100,\n", - " 'n_jobs': 12})" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cls_model" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tlag1_dr_mod_gsspei1_gs_prev10_anomtlag_12_crop_sumspei1gsy_lowermedian_countln_ged_sb_depged_sbged_osged_nstreelag_1_sbtreelag_2_sb...dist_diamsecimr_meanln_ttime_meanln_bdist3ln_capdistln_pop_gpw_sumdecay_ged_sb_5decay_ged_os_5decay_ged_ns_5splag_1_1_sb_1
month_idpriogrid_gid
1623560.00.00.00.00.00.00.00.00.00.0...19.2353840.07.9894642.2639007.8174370.0000000.000000e+000.000000e+000.000000e+000.000000e+00
795990.00.00.00.00.00.00.00.00.00.0...3.640055100.05.2510892.9619987.1879348.2664450.000000e+000.000000e+000.000000e+000.000000e+00
796000.00.00.00.00.00.00.00.00.00.0...3.807887100.05.6565250.3649527.1643957.8052370.000000e+000.000000e+000.000000e+000.000000e+00
796010.00.00.00.00.00.00.00.00.00.0...4.031129100.05.4656522.3793257.1411389.3351590.000000e+000.000000e+000.000000e+000.000000e+00
803170.00.00.00.00.00.00.00.00.00.0...3.000000100.03.4099152.5209817.20801512.6544270.000000e+000.000000e+000.000000e+000.000000e+00
.....................................................................
8521904960.00.00.00.00.00.00.00.00.00.0...10.295630150.05.6872430.4939025.91006010.4086264.487001e-224.487001e-224.487001e-228.473017e-11
1905070.00.00.00.00.00.00.00.00.00.0...6.103278419.05.3359343.3175415.5644566.6472834.487001e-224.487001e-224.487001e-228.473017e-11
1905080.00.00.00.00.00.00.00.00.00.0...5.830952419.00.0000003.4339055.5964574.5621024.487001e-224.487001e-224.487001e-228.473017e-11
1905100.00.00.00.00.00.00.00.00.00.0...5.385165419.05.9048223.2404685.7160547.6195764.487001e-224.487001e-224.487001e-228.473017e-11
1905110.00.00.00.00.00.00.00.00.00.0...5.220153419.05.4791703.2879235.7919367.5960844.487001e-224.487001e-224.487001e-228.473017e-11
\n", - "

11169720 rows × 23 columns

\n", - "
" - ], - "text/plain": [ - " tlag1_dr_mod_gs spei1_gs_prev10_anom \\\n", - "month_id priogrid_gid \n", - "1 62356 0.0 0.0 \n", - " 79599 0.0 0.0 \n", - " 79600 0.0 0.0 \n", - " 79601 0.0 0.0 \n", - " 80317 0.0 0.0 \n", - "... ... ... \n", - "852 190496 0.0 0.0 \n", - " 190507 0.0 0.0 \n", - " 190508 0.0 0.0 \n", - " 190510 0.0 0.0 \n", - " 190511 0.0 0.0 \n", - "\n", - " tlag_12_crop_sum spei1gsy_lowermedian_count \\\n", - "month_id priogrid_gid \n", - "1 62356 0.0 0.0 \n", - " 79599 0.0 0.0 \n", - " 79600 0.0 0.0 \n", - " 79601 0.0 0.0 \n", - " 80317 0.0 0.0 \n", - "... ... ... \n", - "852 190496 0.0 0.0 \n", - " 190507 0.0 0.0 \n", - " 190508 0.0 0.0 \n", - " 190510 0.0 0.0 \n", - " 190511 0.0 0.0 \n", - "\n", - " ln_ged_sb_dep ged_sb ged_os ged_ns treelag_1_sb \\\n", - "month_id priogrid_gid \n", - "1 62356 0.0 0.0 0.0 0.0 0.0 \n", - " 79599 0.0 0.0 0.0 0.0 0.0 \n", - " 79600 0.0 0.0 0.0 0.0 0.0 \n", - " 79601 0.0 0.0 0.0 0.0 0.0 \n", - " 80317 0.0 0.0 0.0 0.0 0.0 \n", - "... ... ... ... ... ... \n", - "852 190496 0.0 0.0 0.0 0.0 0.0 \n", - " 190507 0.0 0.0 0.0 0.0 0.0 \n", - " 190508 0.0 0.0 0.0 0.0 0.0 \n", - " 190510 0.0 0.0 0.0 0.0 0.0 \n", - " 190511 0.0 0.0 0.0 0.0 0.0 \n", - "\n", - " treelag_2_sb ... dist_diamsec imr_mean \\\n", - "month_id priogrid_gid ... \n", - "1 62356 0.0 ... 19.235384 0.0 \n", - " 79599 0.0 ... 3.640055 100.0 \n", - " 79600 0.0 ... 3.807887 100.0 \n", - " 79601 0.0 ... 4.031129 100.0 \n", - " 80317 0.0 ... 3.000000 100.0 \n", - "... ... ... ... ... \n", - "852 190496 0.0 ... 10.295630 150.0 \n", - " 190507 0.0 ... 6.103278 419.0 \n", - " 190508 0.0 ... 5.830952 419.0 \n", - " 190510 0.0 ... 5.385165 419.0 \n", - " 190511 0.0 ... 5.220153 419.0 \n", - "\n", - " ln_ttime_mean ln_bdist3 ln_capdist ln_pop_gpw_sum \\\n", - "month_id priogrid_gid \n", - "1 62356 7.989464 2.263900 7.817437 0.000000 \n", - " 79599 5.251089 2.961998 7.187934 8.266445 \n", - " 79600 5.656525 0.364952 7.164395 7.805237 \n", - " 79601 5.465652 2.379325 7.141138 9.335159 \n", - " 80317 3.409915 2.520981 7.208015 12.654427 \n", - "... ... ... ... ... \n", - "852 190496 5.687243 0.493902 5.910060 10.408626 \n", - " 190507 5.335934 3.317541 5.564456 6.647283 \n", - " 190508 0.000000 3.433905 5.596457 4.562102 \n", - " 190510 5.904822 3.240468 5.716054 7.619576 \n", - " 190511 5.479170 3.287923 5.791936 7.596084 \n", - "\n", - " decay_ged_sb_5 decay_ged_os_5 decay_ged_ns_5 \\\n", - "month_id priogrid_gid \n", - "1 62356 0.000000e+00 0.000000e+00 0.000000e+00 \n", - " 79599 0.000000e+00 0.000000e+00 0.000000e+00 \n", - " 79600 0.000000e+00 0.000000e+00 0.000000e+00 \n", - " 79601 0.000000e+00 0.000000e+00 0.000000e+00 \n", - " 80317 0.000000e+00 0.000000e+00 0.000000e+00 \n", - "... ... ... ... \n", - "852 190496 4.487001e-22 4.487001e-22 4.487001e-22 \n", - " 190507 4.487001e-22 4.487001e-22 4.487001e-22 \n", - " 190508 4.487001e-22 4.487001e-22 4.487001e-22 \n", - " 190510 4.487001e-22 4.487001e-22 4.487001e-22 \n", - " 190511 4.487001e-22 4.487001e-22 4.487001e-22 \n", - "\n", - " splag_1_1_sb_1 \n", - "month_id priogrid_gid \n", - "1 62356 0.000000e+00 \n", - " 79599 0.000000e+00 \n", - " 79600 0.000000e+00 \n", - " 79601 0.000000e+00 \n", - " 80317 0.000000e+00 \n", - "... ... 
\n", - "852 190496 8.473017e-11 \n", - " 190507 8.473017e-11 \n", - " 190508 8.473017e-11 \n", - " 190510 8.473017e-11 \n", - " 190511 8.473017e-11 \n", - "\n", - "[11169720 rows x 23 columns]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.read_parquet(\"../data/raw/raw.parquet\")\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "viewser", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/models/lavender_haze/reports/.DS_Store b/models/lavender_haze/reports/.DS_Store deleted file mode 100644 index e69de29b..00000000 diff --git a/models/lavender_haze/reports/figures/.gitkeep b/models/lavender_haze/reports/figures/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/models/lavender_haze/reports/papers/.gitkeep b/models/lavender_haze/reports/papers/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/models/lavender_haze/reports/plots/.gitkeep b/models/lavender_haze/reports/plots/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/models/lavender_haze/reports/slides/.gitkeep b/models/lavender_haze/reports/slides/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/models/lavender_haze/reports/timelapse/.gitkeep b/models/lavender_haze/reports/timelapse/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/models/lavender_haze/src/.DS_Store b/models/lavender_haze/src/.DS_Store deleted file mode 100644 index a952ae4a..00000000 Binary files a/models/lavender_haze/src/.DS_Store and /dev/null differ diff --git a/models/lavender_haze/src/architectures/.gitkeep b/models/lavender_haze/src/architectures/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/models/lavender_haze/src/dataloaders/get_data.py b/models/lavender_haze/src/dataloaders/get_data.py deleted file mode 100644 index cc332719..00000000 --- a/models/lavender_haze/src/dataloaders/get_data.py +++ /dev/null @@ -1,27 +0,0 @@ -import sys -import logging - -from pathlib import Path -PATH = Path(__file__) -sys.path.insert(0, str(Path( - *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS -from set_path import setup_project_paths, setup_data_paths -setup_project_paths(PATH) - -from utils_dataloaders import fetch_or_load_views_df, create_or_load_views_vol, get_alert_help_string - -logger = logging.getLogger(__name__) - - -def get_data(args): - logger.info("Getting data...") - PATH_RAW, _, _ = setup_data_paths(PATH) - - data, alerts = fetch_or_load_views_df(args.run_type, PATH_RAW, args.saved) - logger.info(f"DataFrame shape: {data.shape if data is not None else 'None'}") - - for ialert, alert in enumerate(str(alerts).strip('[').strip(']').split('Input')): - if 'offender' in alert: - logger.warning({f"{args.run_type} data alert {ialert}": str(alert)}) - - return data diff --git a/models/lavender_haze/src/forecasting/generate_forecast.py b/models/lavender_haze/src/forecasting/generate_forecast.py deleted file mode 100644 index 13fef6f0..00000000 --- 
a/models/lavender_haze/src/forecasting/generate_forecast.py +++ /dev/null @@ -1,54 +0,0 @@ -import sys -import pandas as pd -from datetime import datetime -import logging - -from pathlib import Path -PATH = Path(__file__) -sys.path.insert(0, str(Path( - *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS -from set_path import setup_project_paths, setup_data_paths, setup_artifacts_paths -setup_project_paths(PATH) - -from set_partition import get_partitioner_dict -from utils_log_files import create_log_file -from utils_run import get_standardized_df -from utils_outputs import save_predictions -from utils_artifacts import get_latest_model_artifact - -logger = logging.getLogger(__name__) - - -def forecast_model_artifact(config, artifact_name): - PATH_RAW, _, PATH_GENERATED = setup_data_paths(PATH) - PATH_ARTIFACTS = setup_artifacts_paths(PATH) - run_type = config['run_type'] - - # if an artifact name is provided through the CLI, use it. - # Otherwise, get the latest model artifact based on the run type - if artifact_name: - logger.info(f"Using (non-default) artifact: {artifact_name}") - - if not artifact_name.endswith('.pkl'): - artifact_name += '.pkl' - PATH_ARTIFACT = PATH_ARTIFACTS / artifact_name - else: - # use the latest model artifact based on the run type - logger.info(f"Using latest (default) run type ({run_type}) specific artifact") - PATH_ARTIFACT = get_latest_model_artifact(PATH_ARTIFACTS, run_type) - - config["timestamp"] = PATH_ARTIFACT.stem[-15:] - df_viewser = pd.read_pickle(PATH_RAW / f"{run_type}_viewser_df.pkl") - - try: - stepshift_model = pd.read_pickle(PATH_ARTIFACT) - except FileNotFoundError: - logger.exception(f"Model artifact not found at {PATH_ARTIFACT}") - - partition = get_partitioner_dict(run_type)['predict'] - df_predictions = stepshift_model.future_point_predict(partition[0] - 1, df_viewser, keep_specific=True) - df_predictions = get_standardized_df(df_predictions, config) - data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - - save_predictions(df_predictions, PATH_GENERATED, config) - create_log_file(PATH_GENERATED, config, config["timestamp"], data_generation_timestamp) diff --git a/models/lavender_haze/src/management/execute_model_runs.py b/models/lavender_haze/src/management/execute_model_runs.py deleted file mode 100644 index 48fa198f..00000000 --- a/models/lavender_haze/src/management/execute_model_runs.py +++ /dev/null @@ -1,48 +0,0 @@ -import sys -import wandb - -from pathlib import Path -PATH = Path(__file__) -sys.path.insert(0, str(Path( - *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS -from set_path import setup_project_paths -setup_project_paths(PATH) - -from config_deployment import get_deployment_config -from config_hyperparameters import get_hp_config -from config_meta import get_meta_config -from config_sweep import get_sweep_config -from execute_model_tasks import execute_model_tasks -from get_data import get_data -from utils_run import update_config, update_sweep_config - - -def execute_sweep_run(args): - get_data(args) - - sweep_config = get_sweep_config() - meta_config = get_meta_config() - update_sweep_config(sweep_config, args, meta_config) - - project = f"{sweep_config['name']}_sweep" # we can name the sweep in the config file - sweep_id = wandb.sweep(sweep_config, project=project, entity='views_pipeline') - wandb.agent(sweep_id, execute_model_tasks, entity='views_pipeline') - - -def execute_single_run(args): 
- get_data(args) - - hp_config = get_hp_config() - meta_config = get_meta_config() - dp_config = get_deployment_config() - config = update_config(hp_config, meta_config, dp_config, args) - - project = f"{config['name']}_{args.run_type}" - - if args.run_type == 'calibration' or args.run_type == 'testing': - execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, - forecast=False, artifact_name=args.artifact_name) - - elif args.run_type == 'forecasting': - execute_model_tasks(config=config, project=project, train=args.train, eval=False, - forecast=args.forecast, artifact_name=args.artifact_name) diff --git a/models/lavender_haze/src/management/execute_model_tasks.py b/models/lavender_haze/src/management/execute_model_tasks.py deleted file mode 100644 index 51c04d0b..00000000 --- a/models/lavender_haze/src/management/execute_model_tasks.py +++ /dev/null @@ -1,83 +0,0 @@ -import sys -import wandb -import logging -import time - -from pathlib import Path -PATH = Path(__file__) -sys.path.insert(0, str(Path( - *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS -from set_path import setup_project_paths -setup_project_paths(PATH) - -from evaluate_model import evaluate_model_artifact -from evaluate_sweep import evaluate_sweep -from generate_forecast import forecast_model_artifact -from train_model import train_model_artifact -from utils_run import get_model, split_hurdle_parameters -from utils_wandb import add_wandb_monthly_metrics - -logger = logging.getLogger(__name__) - - -def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None, artifact_name=None): - """ - Executes various model-related tasks including training, evaluation, and forecasting. - - This function manages the execution of different tasks such as training the model, - evaluating an existing model, or performing forecasting. - It also initializes the WandB project. - - Args: - config: Configuration object containing parameters and settings. - project: The WandB project name. - train: Flag to indicate if the model should be trained. - eval: Flag to indicate if the model should be evaluated. - forecast: Flag to indicate if forecasting should be performed. - artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting. 
- """ - - start_t = time.time() - - # Initialize WandB - with wandb.init(project=project, entity="views_pipeline", - config=config): # project and config ignored when running a sweep - - # add the monthly metrics to WandB - add_wandb_monthly_metrics() - - # Update config from WandB initialization above - config = wandb.config - - # W&B does not directly support nested dictionaries for hyperparameters - # This will make the sweep config super ugly, but we don't have to distinguish between sweep and single runs - if config['sweep'] and config['algorithm'] == "HurdleRegression": - config['parameters'] = {} - config['parameters']['clf'], config['parameters']['reg'] = split_hurdle_parameters(config) - - model = get_model(config) - # logger.info(model) - - if config['sweep']: - logger.info(f"Sweeping model {config['name']}...") - stepshift_model = train_model_artifact(config, model) - logger.info(f"Evaluating model {config['name']}...") - evaluate_sweep(config, stepshift_model) - - # Handle the single model runs: train and save the model as an artifact - if train: - logger.info(f"Training model {config['name']}...") - train_model_artifact(config, model) - - # Handle the single model runs: evaluate a trained model (artifact) - if eval: - logger.info(f"Evaluating model {config['name']}...") - evaluate_model_artifact(config, artifact_name) - - if forecast: - logger.info(f"Forecasting model {config['name']}...") - forecast_model_artifact(config, artifact_name) - - end_t = time.time() - minutes = (end_t - start_t) / 60 - logger.info(f'Done. Runtime: {minutes:.3f} minutes.\n') diff --git a/models/lavender_haze/src/offline_evaluation/evaluate_model.py b/models/lavender_haze/src/offline_evaluation/evaluate_model.py deleted file mode 100644 index 7e0a17a2..00000000 --- a/models/lavender_haze/src/offline_evaluation/evaluate_model.py +++ /dev/null @@ -1,60 +0,0 @@ -import sys -from datetime import datetime -import pandas as pd -import logging - -from pathlib import Path -PATH = Path(__file__) -sys.path.insert(0, str(Path( - *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS -from set_path import setup_project_paths, setup_data_paths, setup_artifacts_paths -setup_project_paths(PATH) - -from utils_log_files import create_log_file -from utils_outputs import save_model_outputs -from utils_run import get_standardized_df -from utils_artifacts import get_latest_model_artifact -from utils_evaluation_metrics import generate_metric_dict -from utils_model_outputs import generate_output_dict -from utils_wandb import log_wandb_log_dict -from views_forecasts.extensions import * - -logger = logging.getLogger(__name__) - - -def evaluate_model_artifact(config, artifact_name): - PATH_RAW, _, PATH_GENERATED = setup_data_paths(PATH) - PATH_ARTIFACTS = setup_artifacts_paths(PATH) - run_type = config['run_type'] - - # if an artifact name is provided through the CLI, use it. 
- # Otherwise, get the latest model artifact based on the run type - if artifact_name: - logger.info(f"Using (non-default) artifact: {artifact_name}") - - if not artifact_name.endswith('.pkl'): - artifact_name += '.pkl' - PATH_ARTIFACT = PATH_ARTIFACTS / artifact_name - else: - # use the latest model artifact based on the run type - logger.info(f"Using latest (default) run type ({run_type}) specific artifact") - PATH_ARTIFACT = get_latest_model_artifact(PATH_ARTIFACTS, run_type) - - config["timestamp"] = PATH_ARTIFACT.stem[-15:] - df_viewser = pd.read_pickle(PATH_RAW / f"{run_type}_viewser_df.pkl") - - try: - stepshift_model = pd.read_pickle(PATH_ARTIFACT) - except FileNotFoundError: - logger.exception(f"Model artifact not found at {PATH_ARTIFACT}") - - df = stepshift_model.predict(run_type, "predict", df_viewser) - df = get_standardized_df(df, config) - data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - - output, df_output = generate_output_dict(df, config) - evaluation, df_evaluation = generate_metric_dict(df, config) - log_wandb_log_dict(config, evaluation) - - save_model_outputs(df_evaluation, df_output, PATH_GENERATED, config) - create_log_file(PATH_GENERATED, config, config["timestamp"], data_generation_timestamp) diff --git a/models/lavender_haze/src/offline_evaluation/evaluate_sweep.py b/models/lavender_haze/src/offline_evaluation/evaluate_sweep.py deleted file mode 100644 index 1f2647f3..00000000 --- a/models/lavender_haze/src/offline_evaluation/evaluate_sweep.py +++ /dev/null @@ -1,36 +0,0 @@ -import sys -import pandas as pd -import wandb -from sklearn.metrics import mean_squared_error - -from pathlib import Path -PATH = Path(__file__) -sys.path.insert(0, str(Path( - *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS -from set_path import setup_project_paths, setup_data_paths -setup_project_paths(PATH) - -from utils_run import get_standardized_df -from utils_wandb import log_wandb_log_dict -from utils_evaluation_metrics import generate_metric_dict - - -def evaluate_sweep(config, stepshift_model): - PATH_RAW, _, _ = setup_data_paths(PATH) - run_type = config['run_type'] - steps = config['steps'] - - df_viewser = pd.read_pickle(PATH_RAW / f"{run_type}_viewser_df.pkl") - - df = stepshift_model.predict(run_type, "predict", df_viewser) - df = get_standardized_df(df, config) - - # Temporarily keep this because the metric to minimize is MSE - pred_cols = [f"step_pred_{str(i)}" for i in steps] - df["mse"] = df.apply(lambda row: mean_squared_error([row[config['depvar']]] * 36, - [row[col] for col in pred_cols]), axis=1) - - wandb.log({'MSE': df['mse'].mean()}) - - evaluation, df_evaluation = generate_metric_dict(df, config) - log_wandb_log_dict(config, evaluation) diff --git a/models/lavender_haze/src/online_evaluation/evaluate_forecast.py b/models/lavender_haze/src/online_evaluation/evaluate_forecast.py deleted file mode 100644 index e69de29b..00000000 diff --git a/models/lavender_haze/src/training/train_model.py b/models/lavender_haze/src/training/train_model.py deleted file mode 100644 index faaac741..00000000 --- a/models/lavender_haze/src/training/train_model.py +++ /dev/null @@ -1,40 +0,0 @@ -from datetime import datetime -import pandas as pd -from pathlib import Path -PATH = Path(__file__) -from set_path import setup_project_paths, setup_data_paths, setup_artifacts_paths -setup_project_paths(PATH) - -from utils_log_files import create_log_file -from set_partition import get_partitioner_dict -from 
stepshift.views import StepshiftedModels -from views_forecasts.extensions import * -from views_partitioning.data_partitioner import DataPartitioner -from views_stepshift.run import ViewsRun - - -def train_model_artifact(config, model): - # print(config) - PATH_RAW, _, PATH_GENERATED = setup_data_paths(PATH) - PATH_ARTIFACTS = setup_artifacts_paths(PATH) - run_type = config['run_type'] - df_viewser = pd.read_pickle(PATH_RAW / f"{run_type}_viewser_df.pkl") - - stepshift_model = stepshift_training(config, run_type, model, df_viewser) - if not config["sweep"]: - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - model_filename = f"{run_type}_model_{timestamp}.pkl" - stepshift_model.save(PATH_ARTIFACTS / model_filename) - create_log_file(PATH_GENERATED, config, timestamp) - return stepshift_model - - -def stepshift_training(config, partition_name, model, dataset): - steps = config["steps"] - target = config["depvar"] - partitioner_dict = get_partitioner_dict(partition_name) - partition = DataPartitioner({partition_name: partitioner_dict}) - stepshift_def = StepshiftedModels(model, steps, target) - stepshift_model = ViewsRun(partition, stepshift_def) - stepshift_model.fit(partition_name, "train", dataset) - return stepshift_model diff --git a/models/lavender_haze/src/utils/utils_log_files.py b/models/lavender_haze/src/utils/utils_log_files.py deleted file mode 100644 index b5d58a9d..00000000 --- a/models/lavender_haze/src/utils/utils_log_files.py +++ /dev/null @@ -1,33 +0,0 @@ -from pathlib import Path -import logging - -logger = logging.getLogger(__name__) - - -def create_log_file(PATH_GENERATED, - config, - model_timestamp, - data_generation_timestamp=None, - data_fetch_timestamp=None): - """ - Creates a log file in the specified model-specific folder with details about the generated data. - - Args: - - PATH_GENERATED (Path): The path to the folder where the log file will be created. - - config (dict): The configuration dictionary containing the model details. - - model_timestamp (str): The timestamp when the model was trained. - - data_generation_timestamp (str): The timestamp when the data was generated. - - data_fetch_timestamp (str, optional): The timestamp when the raw data used was fetched from VIEWS. 
- """ - - Path(PATH_GENERATED).mkdir(parents=True, exist_ok=True) - log_file_path = f"{PATH_GENERATED}/{config['run_type']}_log.txt" - - with open(log_file_path, 'w') as log_file: - log_file.write(f"Model Name: {config['name']}\n") - log_file.write(f"Model Timestamp: {model_timestamp}\n") - log_file.write(f"Data Generation Timestamp: {data_generation_timestamp}\n") - log_file.write(f"Data Fetch Timestamp: {data_fetch_timestamp}\n") - log_file.write(f"Deployment Status: {config['deployment_status']}\n") - - logger.info(f"Model log file created at: {log_file_path}") diff --git a/models/lavender_haze/src/utils/utils_outputs.py b/models/lavender_haze/src/utils/utils_outputs.py deleted file mode 100644 index b88cf9a2..00000000 --- a/models/lavender_haze/src/utils/utils_outputs.py +++ /dev/null @@ -1,30 +0,0 @@ -from pathlib import Path -import pickle -import logging - -logger = logging.getLogger(__name__) - - -def save_model_outputs(df_evaluation, df_output, PATH_GENERATED, config): - Path(PATH_GENERATED).mkdir(parents=True, exist_ok=True) - - # Save the DataFrame of model outputs - outputs_path = f"{PATH_GENERATED}/output_{config['steps'][-1]}_{config['run_type']}_{config['timestamp']}.pkl" - with open(outputs_path, 'wb') as file: - pickle.dump(df_output, file) - logger.info(f"Model outputs saved at: {outputs_path}") - - # Save the DataFrame of evaluation metrics - evaluation_path = f"{PATH_GENERATED}/evaluation_{config['steps'][-1]}_{config['run_type']}_{config['timestamp']}.pkl" - with open(evaluation_path, 'wb') as file: - pickle.dump(df_evaluation, file) - logger.info(f"Evaluation metrics saved at: {evaluation_path}") - - -def save_predictions(df_predictions, PATH_GENERATED, config): - Path(PATH_GENERATED).mkdir(parents=True, exist_ok=True) - - predictions_path = f"{PATH_GENERATED}/predictions_{config['steps'][-1]}_{config['run_type']}_{config['timestamp']}.pkl" - with open(predictions_path, 'wb') as file: - pickle.dump(df_predictions, file) - logger.info(f"Predictions saved at: {predictions_path}") diff --git a/models/lavender_haze/src/utils/utils_run.py b/models/lavender_haze/src/utils/utils_run.py deleted file mode 100644 index f8955a3a..00000000 --- a/models/lavender_haze/src/utils/utils_run.py +++ /dev/null @@ -1,110 +0,0 @@ -import sys -import numpy as np -from lightgbm import LGBMRegressor -from xgboost import XGBRegressor -from sklearn.ensemble import RandomForestClassifier - -from pathlib import Path -PATH = Path(__file__) -sys.path.insert(0, str(Path( - *[i for i in PATH.parts[:PATH.parts.index("views_pipeline") + 1]]) / "common_utils")) # PATH_COMMON_UTILS -from set_path import setup_project_paths -setup_project_paths(PATH) - -from hurdle_model import HurdleRegression -from views_forecasts.extensions import * - - -def get_model(config): - """ - Get the model based on the algorithm specified in the config - """ - - if config["algorithm"] == "HurdleRegression": - model = HurdleRegression(clf_name=config["model_clf"], reg_name=config["model_reg"], - clf_params=config["parameters"]["clf"], reg_params=config["parameters"]["reg"]) - else: - parameters = get_parameters(config) - model = globals()[config["algorithm"]](**parameters) - - return model - - -def get_parameters(config): - """ - Get the parameters from the config file. - If not sweep, then get directly from the config file, otherwise have to remove some parameters. 
- """ - - if config["sweep"]: - keys_to_remove = ["algorithm", "depvar", "steps", "sweep", "run_type", "model_cls", "model_reg"] - parameters = {k: v for k, v in config.items() if k not in keys_to_remove} - else: - parameters = config["parameters"] - - return parameters - - -def get_standardized_df(df, config): - """ - Standardize the DataFrame based on the run type - """ - - run_type = config['run_type'] - steps = config['steps'] - depvar = config['depvar'] - - # choose the columns to keep based on the run type and replace negative values with 0 - if run_type in ['calibration', 'testing']: - cols = [depvar] + df.forecasts.prediction_columns - elif run_type == "forecasting": - cols = [f'step_pred_{i}' for i in steps] - df = df.replace([np.inf, -np.inf], 0)[cols] - df = df.mask(df < 0, 0) - return df - - -def split_hurdle_parameters(parameters_dict): - """ - Split the parameters dictionary into two separate dictionaries, one for the - classification model and one for the regression model. - """ - - cls_dict = {} - reg_dict = {} - - for key, value in parameters_dict.items(): - if key.startswith('cls_'): - cls_key = key.replace('cls_', '') - cls_dict[cls_key] = value - elif key.startswith('reg_'): - reg_key = key.replace('reg_', '') - reg_dict[reg_key] = value - - return cls_dict, reg_dict - - -def update_config(hp_config, meta_config, dp_config, args): - config = hp_config.copy() - config['run_type'] = args.run_type - config['sweep'] = False - config['name'] = meta_config['name'] - config['depvar'] = meta_config['depvar'] - config['algorithm'] = meta_config['algorithm'] - if meta_config['algorithm'] == 'HurdleRegression': - config['model_clf'] = meta_config['model_clf'] - config['model_reg'] = meta_config['model_reg'] - config['deployment_status'] = dp_config['deployment_status'] - - return config - - -def update_sweep_config(sweep_config, args, meta_config): - sweep_config['parameters']['run_type'] = {'value': args.run_type} - sweep_config['parameters']['sweep'] = {'value': True} - sweep_config['parameters']['name'] = {'value': meta_config['name']} - sweep_config['parameters']['depvar'] = {'value': meta_config['depvar']} - sweep_config['parameters']['algorithm'] = {'value': meta_config['algorithm']} - if meta_config['algorithm'] == 'HurdleRegression': - sweep_config['parameters']['model_clf'] = {'value': meta_config['model_clf']} - sweep_config['parameters']['model_reg'] = {'value': meta_config['model_reg']} diff --git a/models/lavender_haze/src/visualization/visual.py b/models/lavender_haze/src/visualization/visual.py deleted file mode 100644 index e69de29b..00000000