the zero baseline model #38

Open · wants to merge 1 commit into base: production
1 change: 1 addition & 0 deletions models/hazel_rabbit/README.md
@@ -0,0 +1 @@
# Model README
16 changes: 16 additions & 0 deletions models/hazel_rabbit/configs/config_deployment.py
@@ -0,0 +1,16 @@
def get_deployment_config():
    """
    Contains the configuration for deploying the model into different environments.
    This configuration is "behavioral", so modifying it will affect the model's runtime behavior and its integration into the deployment system.

    Returns:
    - deployment_config (dict): A dictionary of deployment settings determining how the model is deployed, including status, endpoints, and resource allocation.
    """

    # More deployment settings can/will be added here
    deployment_config = {
        "deployment_status": "baseline", # shadow, deployed, baseline, or deprecated
    }

    return deployment_config
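
For illustration, a minimal sketch of how a deployment system might act on deployment_status; the gating helper below is hypothetical and not part of this PR:

# Hypothetical sketch: gate serving on the deployment status flag.
VALID_STATUSES = {"shadow", "deployed", "baseline", "deprecated"}

def should_serve_predictions(deployment_config: dict) -> bool:
    status = deployment_config["deployment_status"]
    if status not in VALID_STATUSES:
        raise ValueError(f"Unknown deployment_status: {status}")
    # a baseline model produces reference predictions but is not served as the main model
    return status in {"deployed", "shadow"}

# should_serve_predictions(get_deployment_config()) -> False for "baseline"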
18 changes: 18 additions & 0 deletions models/hazel_rabbit/configs/config_hyperparameters.py
@@ -0,0 +1,18 @@

def get_hp_config():
    """
    Contains the hyperparameter configurations for model training.
    This configuration is "operational", so modifying these settings will impact the model's behavior during training.

    Returns:
    - hyperparameters (dict): A dictionary of hyperparameters for training the model, which determine the model's behavior during the training phase.
    """

    hyperparameters = {
        'sweep': False,          # no sweep for the zero baseline model
        'partitioner': False,    # True: use the hardcoded months from set_partitioner.py; False: max months - time_steps
        'save_generated': True,  # save evaluation results in the generated folder
        'time_steps': 36,        # 36 right?
    }
    return hyperparameters
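
To make the partitioner: False behavior above concrete — the prediction window is derived as the maximum available month minus time_steps — here is a minimal sketch with a hypothetical helper:

# Hypothetical sketch of the "max months - time_steps" fallback described above.
def derive_predict_window(last_available_month: int, time_steps: int) -> tuple:
    first_month = last_available_month - time_steps
    return first_month, last_available_month

# e.g. derive_predict_window(540, 36) -> (504, 540), i.e. a 36-month window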
25 changes: 25 additions & 0 deletions models/hazel_rabbit/configs/config_input_data.py
@@ -0,0 +1,25 @@
from viewser import Queryset, Column

def get_input_data_config():
    """
    Contains the configuration for the input data in the form of a viewser queryset, i.e. the data from viewser used to train the model.
    This configuration is "behavioral", so modifying it will affect the model's runtime behavior and its integration into the deployment system.
    There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and architecture accordingly.

    Returns:
    - queryset_base (Queryset): A queryset containing the base data for the model training.
    """

    # VIEWSER 6
    queryset_base = (Queryset("hazel_rabbit", "priogrid_month")
        .with_column(Column("ln_sb_best", from_loa="priogrid_month", from_column="ged_sb_best_count_nokgi").transform.ops.ln().transform.missing.replace_na())
        .with_column(Column("ln_ns_best", from_loa="priogrid_month", from_column="ged_ns_best_count_nokgi").transform.ops.ln().transform.missing.replace_na())
        .with_column(Column("ln_os_best", from_loa="priogrid_month", from_column="ged_os_best_count_nokgi").transform.ops.ln().transform.missing.replace_na())
        .with_column(Column("month", from_loa="month", from_column="month"))
        .with_column(Column("year_id", from_loa="country_year", from_column="year_id"))
        .with_column(Column("c_id", from_loa="country_year", from_column="country_id"))
        .with_column(Column("col", from_loa="priogrid", from_column="col"))
        .with_column(Column("row", from_loa="priogrid", from_column="row")))

    return queryset_base
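
For context, materializing such a queryset as a pandas DataFrame typically follows the viewser publish/fetch pattern; a minimal sketch, assuming that pattern applies here:

# Sketch: fetch the queryset defined above (assumes the standard viewser
# publish/fetch pattern; the pipeline's actual fetch site may differ).
qs = get_input_data_config()
df = qs.publish().fetch() # pandas DataFrame, typically indexed by month and priogrid id
print(df.head())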
17 changes: 17 additions & 0 deletions models/hazel_rabbit/configs/config_meta.py
@@ -0,0 +1,17 @@
def get_meta_config():
    """
    Contains the metadata for the model (model architecture, name, target variable, and level of analysis).
    This config is for documentation purposes only; modifying it will not affect the model, the training, or the evaluation.

    Returns:
    - meta_config (dict): A dictionary containing model meta configuration.
    """
    meta_config = {
        "name": "hazel_rabbit",
        "algorithm": "zero baseline",
        "target(S)": ["ln_sb_best", "ln_ns_best", "ln_os_best", "ln_sb_best_binarized", "ln_ns_best_binarized", "ln_os_best_binarized"],

Reviewer comment: Just out of curiosity: is there a reason for the capital S in "target(S)"?

"queryset": "hazel_rabbit",
"level": "pgm",
"creator": "Borbala"
}
return meta_config
27 changes: 27 additions & 0 deletions models/hazel_rabbit/configs/config_sweep.py
@@ -0,0 +1,27 @@
def get_swep_config():
Reviewer comment (Collaborator): small typo (sweep)


"""
Contains the configuration for hyperparameter sweeps using WandB.
This configuration is "operational" so modifying it will change the search strategy, parameter ranges, and other settings for hyperparameter tuning aimed at optimizing model performance.

Returns:
- sweep_config (dict): A dictionary containing the configuration for hyperparameter sweeps, defining the methods and parameter ranges used to search for optimal hyperparameters.
"""

sweep_config = {
'method': 'grid'
}

metric = {

}

sweep_config['metric'] = metric

parameters_dict = {

}

sweep_config['parameters'] = parameters_dict

return sweep_config
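
If a sweep were ever enabled for this model, the config would typically be handed to WandB as below; a minimal sketch of the standard wandb sweep pattern, where train_fn is a hypothetical training entry point (the PR leaves sweeps unimplemented):

import wandb

# Sketch of standard WandB sweep usage; train_fn is hypothetical.
sweep_config = get_swep_config()
sweep_id = wandb.sweep(sweep_config, project="hazel_rabbit_sweep", entity="views_pipeline")
wandb.agent(sweep_id, function=train_fn)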
60 changes: 60 additions & 0 deletions models/hazel_rabbit/main.py
@@ -0,0 +1,60 @@
import time

import wandb

import sys
from pathlib import Path

PATH = Path(__file__)
sys.path.insert(0, str(Path(*PATH.parts[:PATH.parts.index("views_pipeline")+1]) / "common_utils")) # PATH_COMMON_UTILS
from set_path import setup_project_paths, setup_artifacts_paths
setup_project_paths(PATH)

from cli_parser_utils import parse_args, validate_arguments
#from artifacts_utils import get_latest_model_artifact

#from model_run_handlers import handle_sweep_run, handle_single_run
from execute_model_runs import execute_sweep_run, execute_single_run

#from mode_run_manager import model_run_manager

if __name__ == "__main__":

    # new argparse solution
    args = parse_args()
    #print(args)

    # Validate the parsed arguments to ensure they conform to the required logic and combinations.
    validate_arguments(args)

    # wandb login
    wandb.login()

    start_t = time.time()

    # Test if and why a model_metadata_dict.py was saved in the artifacts folder.

    # Check first whether this is a sweep run, since a sweep overrides the train and evaluate flags
    if args.sweep:
        #handle_sweep_run(args)
        execute_sweep_run(args)

    else:
        #handle_single_run(args)
        execute_single_run(args)

    end_t = time.time()
    minutes = (end_t - start_t) / 60
    print(f'Done. Runtime: {minutes:.3f} minutes')

# notes on stepshifted models:
# There will be some thinking here in regards to how we store, denote (naming convention), and retrieve the model artifacts from stepshifted models.
# It is not a big issue, but it is something to consider so we don't do something heedless.
# A possible format could be: <run_type>_model_s<step>_<timestamp>.pt, e.g. calibration_model_s00_20210831_123456.pt, calibration_model_s01_20210831_123456.pt, etc.
# The rest of the code would then be made to handle this naming convention without any issues. Could be a simple fix.
# Alternatively, we could store the model artifacts in a subfolder for each stepshifted model. This would make it easier to handle the artifacts, but it would also make it harder to retrieve the latest artifact for a given run type.
# Lastly, the solution Xiaolong is working on might allow us to store multiple models (steps) in one artifact, which would make this whole discussion obsolete and be the best solution.
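
To make the proposed naming convention concrete, a minimal sketch of formatting and parsing <run_type>_model_s<step>_<timestamp>.pt names; both helpers are hypothetical:

from datetime import datetime

# Hypothetical helpers for the proposed artifact naming convention.
def make_artifact_name(run_type: str, step: int) -> str:
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{run_type}_model_s{step:02d}_{timestamp}.pt"

def parse_artifact_name(name: str) -> dict:
    run_type, _, step, date, clock = name.removesuffix(".pt").split("_")
    return {"run_type": run_type, "step": int(step.lstrip("s")), "timestamp": f"{date}_{clock}"}

# make_artifact_name("calibration", 0) -> e.g. "calibration_model_s00_20210831_123456.pt"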


1 change: 1 addition & 0 deletions models/hazel_rabbit/requirements.txt
@@ -0,0 +1 @@
# Requirements
63 changes: 63 additions & 0 deletions models/hazel_rabbit/src/forecasting/generate_forecast.py
@@ -0,0 +1,63 @@

import sys
from pathlib import Path

import pandas as pd

PATH = Path(__file__)
sys.path.insert(0, str(Path(*PATH.parts[:PATH.parts.index("views_pipeline")+1]) / "common_utils")) # PATH_COMMON_UTILS
from set_path import setup_project_paths, setup_data_paths
setup_project_paths(PATH)

# imports from common_utils must come after the path setup above
from set_partition import get_partitioner_dict
from utils import get_raw_data, create_model_time_stamp, save_generated_pred


def forecast_with_model_artifact(config, views_raw):
    """
    Create forecasts using the zero baseline model. Return a DataFrame with the predictions.

    Args:
        config : Configuration object containing parameters and settings.
        views_raw : DataFrame containing the raw data.
    """

    partitioner_dict = get_partitioner_dict(config.run_type)

    # get the months for the predictions
    first_month = partitioner_dict['predict'][0] #if config.partitioner==True else partitioner_dict['predict'][1]-config.time_steps
    last_month = partitioner_dict['predict'][1]

    views_raw = views_raw[['month_id', 'pg_id', 'month', 'year_id', 'c_id']]

    views_res = generate_forecast(config, views_raw, first_month, last_month)

    # add timestamp
    config = create_model_time_stamp(config)

    # save the DataFrame of model outputs
    if config.save_generated:
        save_generated_pred(config, views_res)

    return views_res




def generate_forecast(config, views_raw, first_month, last_month):
    # get the unique grids as an array
    unique_grids = views_raw['pg_id'].unique()

    # create the next config.time_steps months for these grids;
    # note: this assumes last_month - first_month == config.time_steps,
    # otherwise the repeated pg_ids and the month range will not line up
    next_months = pd.DataFrame({
        'pg_id': unique_grids.repeat(config.time_steps),
        'month_id': [month for _ in unique_grids for month in range(first_month, last_month)]
    })

    # assign the sequence from 1 to config.time_steps for the new months
    next_months['out_sample_months'] = next_months.groupby('pg_id').cumcount() + 1
    next_months['y_pred'] = 0

    return next_months
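
A quick illustration of what generate_forecast returns for the zero baseline, using synthetic inputs and a SimpleNamespace as a stand-in config (all values hypothetical):

import pandas as pd
from types import SimpleNamespace

# Sketch: exercise generate_forecast with tiny synthetic data.
config = SimpleNamespace(time_steps=3)
views_raw = pd.DataFrame({'pg_id': [101, 101, 202, 202], 'month_id': [500, 501, 500, 501]})

preds = generate_forecast(config, views_raw, first_month=502, last_month=505)
# -> 2 grids x 3 months = 6 rows; out_sample_months runs 1..3 per grid; y_pred is all zeros
print(preds)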
44 changes: 44 additions & 0 deletions models/hazel_rabbit/src/management/execute_model_runs.py
@@ -0,0 +1,44 @@
import sys
from pathlib import Path

PATH = Path(__file__)
sys.path.insert(0, str(Path(*PATH.parts[:PATH.parts.index("views_pipeline")+1]) / "common_utils")) # PATH_COMMON_UTILS
from set_path import setup_project_paths, setup_artifacts_paths
setup_project_paths(PATH)

#from config_sweep import get_swep_config
from config_hyperparameters import get_hp_config
#from model_run_manager import model_run_manager
from execute_model_tasks import execute_model_tasks


def execute_sweep_run(args):
    print('Running sweep...')

    project = "hazel_rabbit_sweep" # check naming convention

    print('Sweep run is not implemented. Exiting...')


def execute_single_run(args):

    # get config
    config = get_hp_config()
    config['run_type'] = args.run_type

    # derive the project name from the run type - check convention!
    project = f"hazel_rabbit_{args.run_type}"

    if args.run_type in ('calibration', 'testing'):
        execute_model_tasks(config=config, project=project, train=args.train, eval=args.evaluate, forecast=False)

    elif args.run_type == 'forecasting':
        execute_model_tasks(config=config, project=project, train=False, eval=False, forecast=True)
Reviewer comment (Collaborator): train is not necessarily False. We might also train the model based on the forecasting partition.


    else:
        raise ValueError(f"Invalid run type: {args.run_type}")
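
For orientation, the branches above map onto CLI invocations roughly as follows; the flag names are assumptions inferred from the args fields used here (run_type, train, evaluate, sweep) and are actually defined in cli_parser_utils:

# Assumed invocations (flag names inferred, see cli_parser_utils):
# python main.py --run_type calibration --train --evaluate
# python main.py --run_type testing --evaluate
# python main.py --run_type forecasting
# python main.py --sweep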


80 changes: 80 additions & 0 deletions models/hazel_rabbit/src/management/execute_model_tasks.py
@@ -0,0 +1,80 @@

import wandb

import sys
from pathlib import Path

PATH = Path(__file__)
sys.path.insert(0, str(Path(*PATH.parts[:PATH.parts.index("views_pipeline")+1]) / "common_utils")) # PATH_COMMON_UTILS
from set_path import setup_project_paths, setup_artifacts_paths, setup_data_paths
setup_project_paths(PATH)

from ingester3.ViewsMonth import ViewsMonth

from utils import get_raw_data
from utils_wandb import add_wandb_monthly_metrics

from evaluate_model import evaluate_model_artifact
from generate_forecast import forecast_with_model_artifact



def execute_model_tasks(config=None, project=None, train=None, eval=None, forecast=None):
    """
    Executes various model-related tasks including training, evaluation, and forecasting.

    This function manages the execution of different tasks such as training the model,
    evaluating an existing model, or performing forecasting.
    It also initializes the WandB project.

    Args:
        config: Configuration object containing parameters and settings.
        project: The WandB project name.
        train: Flag to indicate if the model should be trained.
        eval: Flag to indicate if the model should be evaluated.
        forecast: Flag to indicate if forecasting should be performed.
    """

    # Define the path for the artifacts
    PATH_ARTIFACTS = setup_artifacts_paths(PATH)

    #device = setup_device()

    # Initialize WandB
    with wandb.init(project=project, entity="views_pipeline", config=config): # project and config are ignored when running a sweep

        # add the monthly metrics to WandB
        add_wandb_monthly_metrics()

        # Update config from the WandB initialization above
        config = wandb.config

        # Retrieve raw data (partition) based on the configuration
        views_raw = get_raw_data(config)

        # Handle the sweep runs
        if config.sweep:
            pass

        # Handle the single model runs: train and save the model as an artifact
        if train:
            print('No need to train the zero baseline model. Exiting...')

        # Handle the single model runs: evaluate a trained model (artifact)
        if eval:
            #handle_evaluation(config, device, views_vol, PATH_ARTIFACTS, artifact_name)
            evaluate_model_artifact(config, views_raw)

        if forecast:
            #handle_forecasting(config, device, views_vol, PATH_ARTIFACTS, artifact_name)
            forecast_with_model_artifact(config, views_raw)

