
Create hydranet main #26

Merged: 137 commits, merged Jun 12, 2024
Commits
7c7890e
debug test run
Polichinel May 23, 2024
19ac021
added setup_artifact_path
Polichinel May 23, 2024
5348675
new path_art..
Polichinel May 23, 2024
d840fc7
removed comments
Polichinel May 23, 2024
e8c55c2
first main for P-A
Polichinel May 23, 2024
6392cef
model_type to run_type
Polichinel May 23, 2024
0e14f93
nornir to viewspipeline
Polichinel May 23, 2024
c582bdc
nornir to views_pipeline
Polichinel May 23, 2024
9bd3c25
for debug
Polichinel May 23, 2024
eb700e1
added get_data to import
Polichinel May 23, 2024
41c8c4b
new run_type_dict
Polichinel May 23, 2024
e3b2e22
argparse solution
Polichinel May 24, 2024
9348e4b
starting on sweep
Polichinel May 24, 2024
321ee62
sweep back in
Polichinel May 24, 2024
7825fe7
added time_steps here
Polichinel May 24, 2024
76673b0
added get_posterior
Polichinel May 24, 2024
ed2e070
posterior...
Polichinel May 24, 2024
c012f18
fix?
Polichinel May 24, 2024
1114d81
fix??
Polichinel May 24, 2024
e0bd460
unet -> model
Polichinel May 24, 2024
1a63be0
validate args
Polichinel May 24, 2024
5046b36
better help and warnings
Polichinel May 24, 2024
11e45fe
new parser script
Polichinel May 24, 2024
6db4f8c
fix?
Polichinel May 24, 2024
ae4bbe4
import sys
Polichinel May 24, 2024
8ab6d81
removed comments...
Polichinel May 24, 2024
78ad92c
extended logic
Polichinel May 24, 2024
02cae7c
sweep right now?
Polichinel May 24, 2024
3a66151
debug...
Polichinel May 24, 2024
8db0a55
now with action
Polichinel May 24, 2024
a28dbd2
move start time
Polichinel May 24, 2024
f6f748c
forecastin place holder
Polichinel May 24, 2024
a4f1be4
full sweeps test
Polichinel May 24, 2024
6ef1607
now with eval
Polichinel May 28, 2024
7f8278f
no forcing of t or e for s
Polichinel May 28, 2024
77c1252
utils to find the last art
Polichinel May 28, 2024
76bbd54
time stamped arts in mp
Polichinel May 28, 2024
f4aa7b4
better print for debug
Polichinel May 28, 2024
523a43f
artifact name can now be passed
Polichinel May 28, 2024
7ea89b7
can now pass art name
Polichinel May 28, 2024
f8c425f
notes on stepshifted models
Polichinel May 28, 2024
d579870
fix typo...
Polichinel May 28, 2024
9b8a4a1
debug prints
Polichinel May 28, 2024
01b652e
if/if not sweep
Polichinel May 28, 2024
7ab48e6
test sweep
Polichinel May 28, 2024
6077bea
correct path now?
Polichinel May 28, 2024
1c478c9
fixed loop?
Polichinel May 28, 2024
f1f9938
added /
Polichinel May 29, 2024
bf109f8
more generel for single and sweep
Polichinel May 29, 2024
22ade30
use evalution.py
Polichinel May 29, 2024
357df2b
timedtapm to pickle
Polichinel May 29, 2024
17c57bb
fixed?
Polichinel May 29, 2024
3b3bc95
full run single model
Polichinel May 29, 2024
e3ec2ab
note on one script
Polichinel May 29, 2024
51579ae
some comments
Polichinel May 29, 2024
3831e49
new (old) name
Polichinel May 29, 2024
8fca699
sweep enabled again
Polichinel May 29, 2024
02db210
fixed?
Polichinel May 29, 2024
d54df88
test run
Polichinel May 29, 2024
5b71050
renamed test_tensor to full
Polichinel May 29, 2024
0ee00c7
test_tensor to full tensor
Polichinel May 29, 2024
514412b
test_tensoer to full_tensor
Polichinel May 29, 2024
2646ab0
test_tensor to full
Polichinel May 29, 2024
c998959
changed print
Polichinel May 29, 2024
20422b5
hold_out setting
Polichinel May 30, 2024
b09b963
debugging print
Polichinel May 30, 2024
064e299
checking
Polichinel May 30, 2024
d575001
test the new solution
Polichinel May 30, 2024
5882ff2
dump print shit
Polichinel May 30, 2024
032f719
just a test:w
Polichinel May 30, 2024
75a06f3
moved pred stuf to new utils
Polichinel May 30, 2024
a2760bc
better printing?
Polichinel May 30, 2024
1afaebb
better printing
Polichinel May 30, 2024
525b2c2
get_posterior to evaluate_posterior
Polichinel May 30, 2024
93880be
first commit
Polichinel May 30, 2024
8882742
thinking about forecastng
Polichinel May 30, 2024
34a2219
full sweep test
Polichinel May 30, 2024
28d4e4c
much improved modularity - see if works
Polichinel May 30, 2024
a54c636
fixed a typo...
Polichinel May 30, 2024
d20b857
Better now?
Polichinel May 30, 2024
abeb7d9
now mayhaps?
Polichinel May 30, 2024
a1e1ba4
now?
Polichinel May 30, 2024
55a8b67
forecastin error
Polichinel May 30, 2024
a3d24c3
sweep to see if error also there...
Polichinel May 30, 2024
7b27e6e
added debug print
Polichinel May 30, 2024
f018555
larger test...
Polichinel May 30, 2024
8b325ed
full run
Polichinel May 31, 2024
496351a
added handle_training.py
Polichinel May 31, 2024
fcd44f9
added handle_evaluation
Polichinel May 31, 2024
1b7c3f1
moved handle_forecast here
Polichinel May 31, 2024
8608b31
moved handle functions
Polichinel May 31, 2024
bf04fdb
migrate code co modular scripts
Polichinel May 31, 2024
9a09b0a
imported handlers
Polichinel May 31, 2024
3068ce3
corrected imprt
Polichinel May 31, 2024
f9dd01d
corrected import
Polichinel May 31, 2024
5a9fc94
corrected script name
Polichinel May 31, 2024
1bd47fa
full run
Polichinel Jun 1, 2024
bf6d241
added function
Polichinel Jun 1, 2024
9b3d51c
removed comments
Polichinel Jun 3, 2024
a5b94a5
get_data comment?
Polichinel Jun 3, 2024
0eb9afb
added comment
Polichinel Jun 3, 2024
0b2c6b4
fixed time_stamp?
Polichinel Jun 3, 2024
3abd39b
added not on pickled files being overwritten...
Polichinel Jun 3, 2024
835c6d7
note on print statement
Polichinel Jun 3, 2024
c3a4645
Merge branch 'main' into create_hydranet_main
Polichinel Jun 3, 2024
bd8a362
model file extensions for model
Polichinel Jun 10, 2024
91f1a21
abstracted out model and root path
Polichinel Jun 10, 2024
bc9d599
better naming 01
Polichinel Jun 10, 2024
05cd666
doc strings
Polichinel Jun 10, 2024
dcee74f
add management to paths
Polichinel Jun 10, 2024
9e6cdd1
change name 02 and location
Polichinel Jun 10, 2024
4bab662
more renaming 03
Polichinel Jun 10, 2024
2ed7b53
fixed a thing...
Polichinel Jun 10, 2024
f6e7c99
log_monthly_metric in w&b utils
Polichinel Jun 10, 2024
800c938
fixed print?
Polichinel Jun 10, 2024
d4617ab
os to pathlib
Polichinel Jun 10, 2024
81f258c
better docstrings
Polichinel Jun 10, 2024
d58701a
os -> pathlib
Polichinel Jun 10, 2024
8c799a8
fixed print?
Polichinel Jun 10, 2024
09462b7
print better now?
Polichinel Jun 10, 2024
6014f71
300 run
Polichinel Jun 11, 2024
5b717b9
new combined dataloader
Polichinel Jun 11, 2024
8a7af6d
updated for the new single dataloader
Polichinel Jun 11, 2024
707313c
new config_setup
Polichinel Jun 11, 2024
a05d118
removed double stuff
Polichinel Jun 11, 2024
2fe4a34
added comment regarding stuff
Polichinel Jun 11, 2024
d07d4c7
config_input_data added
Polichinel Jun 11, 2024
481a1c4
naive first viewers 6 test...
Polichinel Jun 11, 2024
93ebc19
better help
Polichinel Jun 11, 2024
fc8f71f
better help
Polichinel Jun 11, 2024
05b0346
better help
Polichinel Jun 11, 2024
ef68903
fixe typo
Polichinel Jun 11, 2024
ee8b818
right loa now?
Polichinel Jun 11, 2024
ca2a714
seems correct
Polichinel Jun 11, 2024
1c66fa3
small comment
Polichinel Jun 11, 2024
2252551
one dataloader to rule them all
Polichinel Jun 11, 2024
1b9b9cd
set entity for sweep - I think
Polichinel Jun 12, 2024
74 changes: 74 additions & 0 deletions common_utils/artifacts_utils.py
@@ -0,0 +1,74 @@
import os
from pathlib import Path

def get_artifact_files(PATH, run_type):
"""
Retrieve artifact files from a directory that match the given run type and common extensions.

Args:
PATH (Path): The directory (pathlib Path) where model files are stored.
run_type (str): The type of run (e.g., calibration, testing).

Returns:
list: A list of matching model file paths (pathlib Path objects).
"""
# Define the common model file extensions - more can be added as needed
common_extensions = ['.pt', '.pth', '.h5', '.hdf5', '.pkl', '.json', '.bst', '.txt', '.bin', '.cbm', '.onnx']

# Retrieve files that start with run_type and end with any of the common extensions
# artifact_files = [f for f in os.listdir(PATH) if f.startswith(f"{run_type}_model_") and any(f.endswith(ext) for ext in common_extensions)]

# pathlib alternative
artifact_files = [f for f in PATH.iterdir() if f.is_file() and f.stem.startswith(f"{run_type}_model_") and f.suffix in common_extensions]


return artifact_files


def get_latest_model_artifact(PATH, run_type):
"""
Retrieve the path (pathlib Path object) of the latest model artifact for a given run type, based on the timestamp embedded in the filename.

Args:
PATH (Path): The model-specific directory path where artifacts are stored,
i.e. PATH_ARTIFACTS = setup_artifacts_paths(PATH) executed in the model-specific main.py script,
with PATH = Path(__file__).

run_type (str): The type of run (e.g., calibration, testing, forecasting).

Returns:
The path (pathlib Path object) to the latest model artifact for the given run type.

Raises:
FileNotFoundError: If no model artifacts are found for the given run type.
"""

# List all model files for the given specific run_type with the expected filename pattern
model_files = get_artifact_files(PATH, run_type) #[f for f in os.listdir(path) if f.startswith(f"{run_type}_model_") and f.endswith('.pt')]

if not model_files:
raise FileNotFoundError(f"No model artifacts found for run type '{run_type}' in path '{PATH}'")

# Sort the files by the timestamp embedded in the filename (format %Y%m%d_%H%M%S, e.g. '20210831_123456.pt'), so a reverse lexical sort puts the newest first
model_files.sort(reverse=True)

# print statements for debugging
print(f"artifacts available: {model_files}")
print(f"artifact used: {model_files[0]}")

# Return the latest model file
#PATH_MODEL_ARTIFACT = os.path.join(path, model_files[0])

# pathlib alternative (note: PATH.iterdir() already yields full paths, so joining with PATH here simply returns model_files[0])
PATH_MODEL_ARTIFACT = Path(PATH) / model_files[0]

return PATH_MODEL_ARTIFACT

# notes on stepshifted models:
# There will be some thinking to do regarding how we store, denote (naming convention), and retrieve the model artifacts from stepshifted models.
# It is not a big issue, but it is something to consider so we don't do something careless.
# A possible format could be: <run_type>_model_s<step>_<timestamp>.pt, for example: calibration_model_s00_20210831_123456.pt, calibration_model_s01_20210831_123456.pt, etc.
# The rest of the code would then be made to handle this naming convention without any issues. Could be a simple fix.
# Alternatively, we could store the model artifacts in a subfolder for each stepshifted model. This would make it easier to handle the artifacts, but it would also make it harder to retrieve the latest artifact for a given run type.
# Lastly, the solution Xiaolong is working on might allow us to store multiple models (steps) in one artifact, which would make this whole discussion obsolete and be the best solution.
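As a sketch of the retrieval logic above, the filename convention sorts chronologically because of the %Y%m%d_%H%M%S timestamp, so a reverse lexical sort yields the newest artifact. The helper `latest_artifact` and the temporary files below are illustrative, not part of the PR:

```python
from pathlib import Path
import tempfile

def latest_artifact(path: Path, run_type: str) -> Path:
    # Mirrors get_latest_model_artifact: filter by "<run_type>_model_" prefix
    # and a known extension, then sort newest-first on the filename timestamp.
    exts = {'.pt', '.pth', '.pkl'}
    files = [f for f in path.iterdir()
             if f.is_file() and f.stem.startswith(f"{run_type}_model_") and f.suffix in exts]
    if not files:
        raise FileNotFoundError(f"No artifacts for '{run_type}' in '{path}'")
    return sorted(files, reverse=True)[0]

with tempfile.TemporaryDirectory() as d:
    root = Path(d)
    for name in ("calibration_model_20240531_090000.pt",
                 "calibration_model_20240611_120000.pt",
                 "testing_model_20240612_080000.pt"):
        (root / name).touch()
    print(latest_artifact(root, "calibration").name)
    # → calibration_model_20240611_120000.pt
```

Because %Y%m%d_%H%M%S is fixed-width and zero-padded, string order equals chronological order, which is why no mtime lookup is needed.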

76 changes: 76 additions & 0 deletions common_utils/cli_parser_utils.py
@@ -0,0 +1,76 @@
import sys
import argparse

def parse_args():

"""
CLI parser for model specific main.py scripts.
"""

parser = argparse.ArgumentParser(description='Run model pipeline with specified run type.')

parser.add_argument('-r', '--run_type',
choices=['calibration', 'testing', 'forecasting'],
type=str,
default='calibration',
help='Choose the run type for the model: calibration, testing, or forecasting. Default is calibration. '
'Note: If --sweep is flagged, --run_type must be calibration.')

parser.add_argument('-s', '--sweep',
action='store_true',
help='Set flag to run the model pipeline as part of a sweep. No explicit flag means no sweep. '
'Note: If --sweep is flagged, --run_type must be calibration, and both training and evaluation are automatically implied.')

parser.add_argument('-t', '--train',
action='store_true',
help='Flag to indicate if a new model should be trained. '
'Note: If --sweep is flagged, --train will also automatically be flagged.')

parser.add_argument('-e', '--evaluate',
action='store_true',
help='Flag to indicate if the model should be evaluated. '
'Note: If --sweep is specified, --evaluate will also automatically be flagged. '
'Cannot be used with --run_type forecasting.')

parser.add_argument('-a', '--artifact_name',
type=str,
help='Specify the name of the model artifact to be used for evaluation. '
'The file extension will be added in the main script to fit the specific model algorithm. '
'The artifact name should be in the format: <run_type>_model_<timestamp>.pt, '
'where <run_type> is calibration, testing, or forecasting, and <timestamp> is in the format YYYYMMDD_HHMMSS. '
'If not provided, the latest artifact will be used by default.')

return parser.parse_args()

def validate_arguments(args):
if args.sweep:
if args.run_type != 'calibration':
print("Error: Sweep runs must have --run_type set to 'calibration'. Exiting.")
print("To fix: Use --run_type calibration when --sweep is flagged.")
sys.exit(1)

# Note: this check is subsumed by the one above (any --sweep run must already be calibration), but kept as an explicit guard
if args.run_type in ['testing', 'forecasting'] and args.sweep:
print("Error: Sweep cannot be performed with testing or forecasting run types. Exiting.")
print("To fix: Remove --sweep flag or set --run_type to 'calibration'.")
sys.exit(1)

if args.run_type == 'forecasting' and args.evaluate:
print("Error: Forecasting runs cannot evaluate. Exiting.")
print("To fix: Remove --evaluate flag when --run_type is 'forecasting'.")
sys.exit(1)

if args.run_type in ['calibration', 'testing'] and not args.train and not args.evaluate and not args.sweep:
print(f"Error: Run type is {args.run_type} but neither --train, --evaluate, nor --sweep flag is set. Nothing to do... Exiting.")
print("To fix: Add --train and/or --evaluate flag. Or use --sweep to run both training and evaluation in a W&B sweep loop.")
sys.exit(1)


# notes on stepshifted models:
# There will be some thinking to do regarding how we store, denote (naming convention), and retrieve the model artifacts from stepshifted models.
# It is not a big issue, but it is something to consider so we don't do something careless.
# A possible format could be: <run_type>_model_s<step>_<timestamp>.pt, for example: calibration_model_s00_20210831_123456.pt, calibration_model_s01_20210831_123456.pt, etc.
# The rest of the code would then be made to handle this naming convention without any issues. Could be a simple fix.
# Alternatively, we could store the model artifacts in a subfolder for each stepshifted model. This would make it easier to handle the artifacts, but it would also make it harder to retrieve the latest artifact for a given run type.
# Lastly, the solution Xiaolong is working on might allow us to store multiple models (steps) in one artifact, which would make this whole discussion obsolete and be the best solution.
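The flag logic above can be exercised without touching sys.argv by re-creating a minimal version of the parser and passing an explicit argv list. This is a sketch; `make_parser` is illustrative and not part of the PR:

```python
import argparse

def make_parser() -> argparse.ArgumentParser:
    # Minimal re-creation of the parser above, so flag combinations can be
    # checked in isolation (help texts omitted for brevity).
    p = argparse.ArgumentParser(description='Run model pipeline with specified run type.')
    p.add_argument('-r', '--run_type', choices=['calibration', 'testing', 'forecasting'],
                   type=str, default='calibration')
    p.add_argument('-s', '--sweep', action='store_true')
    p.add_argument('-t', '--train', action='store_true')
    p.add_argument('-e', '--evaluate', action='store_true')
    p.add_argument('-a', '--artifact_name', type=str)
    return p

args = make_parser().parse_args(['--run_type', 'testing', '--train', '--evaluate'])
print(args.run_type, args.train, args.evaluate, args.sweep)
# → testing True True False
```

Note how `action='store_true'` makes every flag default to False, which is what validate_arguments relies on when it rejects a run with nothing to do.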


11 changes: 10 additions & 1 deletion common_utils/set_partition.py
@@ -20,4 +20,13 @@ def get_partitioner_dict(partion, step=36):

print('partitioner_dict', partitioner_dict)

return partitioner_dict
return partitioner_dict

# currently these differ from the ones in the config_data_partitions.py file for the stepshifted models (see below). This needs to be sorted out asap.

# data_partitions = {
# 'calib_partitioner_dict': {"train": (121, 396), "predict": (409, 456)}, # Does not make sense that the eval set starts at 409, it should start at 397, no?
# 'test_partitioner_dict': {"train": (121, 456), "predict": (457, 504)},
# 'future_partitioner_dict': {"train": (121, 504), "predict": (529, 529)}, # NO HARD CODING THE FUTURE START DATE
# 'FutureStart': 529, # Jan 24 # THIS SHOULD NOT BE HARD CODED! Whatever the right partitions are for calibration and testing, the forecasting start should be automatically inferred from the current date.
# }
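The commented-out partition dict flags that FutureStart (529, i.e. January 2024) should not be hard-coded. A hedged sketch of inferring it from the current date, assuming the ViEWS month_id convention where month 1 is January 1980 (consistent with 529 mapping to Jan 2024):

```python
import datetime

def views_month_id(year: int, month: int) -> int:
    # Assumed ViEWS month_id convention: month 1 = January 1980,
    # so January 2024 -> (2024 - 1980) * 12 + 1 = 529, matching 'FutureStart': 529.
    return (year - 1980) * 12 + month

def infer_future_start(today=None) -> int:
    # Derive the forecast start month from the current date instead of hard-coding it.
    today = today or datetime.date.today()
    return views_month_id(today.year, today.month)

print(infer_future_start(datetime.date(2024, 1, 15)))
# → 529
```

If this convention holds, the calibration and testing partitions could be derived the same way, which is what the comment above is asking for.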
68 changes: 55 additions & 13 deletions common_utils/set_path.py
@@ -1,6 +1,42 @@
import sys
from pathlib import Path

def setup_root_paths(PATH) -> Path:

"""
Extracts and returns the root path (pathlib path object) up to and including the "views_pipeline" directory from any given path.
This function identifies the "views_pipeline" directory within the provided path and constructs a new path up to and including this directory.
This is useful for setting up root paths for project-wide resources and utilities.

Args:
PATH (Path): The base path, typically the path of the script invoking this function (e.g., `PATH = Path(__file__)`).

Returns:
PATH_ROOT: The root path (pathlib path object) including the "views_pipeline" directory.
"""

PATH_ROOT = Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) # The +1 is to include the "views_pipeline" part in the path
return PATH_ROOT


def setup_model_paths(PATH):

"""
Extracts and returns the model-specific path (pathlib Path object) including the "models" directory and its immediate subdirectory.
This function identifies the "models" directory within the provided path and constructs a new path up to and including its next subdirectory (e.g. purple_alien or orange_pasta).
This is useful for setting up paths specific to a model within the project.

Args:
PATH (Path): The base path, typically the path of the script invoking this function (e.g., `PATH = Path(__file__)`).

Returns:
PATH_MODEL: The path (pathlib Path object) including the "models" directory and its immediate subdirectory.
"""

PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models")+2]]) # The +2 is to include the "models" and the individual model name in the path
return PATH_MODEL


def setup_project_paths(PATH) -> None:

"""
@@ -30,9 +66,12 @@ def setup_project_paths(PATH) -> None:
Disclaimer: A solution that avoids the insertion of the code above would be preferred.
"""

PATH_ROOT = Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) # The +1 is to include the "views_pipeline" part in the path
PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models")+2]]) # The +2 is to include the "models" and the individual model name in the path

# PATH_ROOT = Path(*[i for i in PATH.parts[:PATH.parts.index("views_pipeline")+1]]) # The +1 is to include the "views_pipeline" part in the path
# PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models")+2]]) # The +2 is to include the "models" and the individual model name in the path

PATH_ROOT = setup_root_paths(PATH)
PATH_MODEL = setup_model_paths(PATH)

# print(f"Root path: {PATH_ROOT}") # debug
# print(f"Model path: {PATH_MODEL}") # debug

@@ -47,13 +86,14 @@ def setup_project_paths(PATH) -> None:
PATH_CONFIGS = PATH_MODEL / "configs"
PATH_SRC = PATH_MODEL / "src"
PATH_UTILS = PATH_SRC / "utils"
PATH_MANAGEMENT = PATH_SRC / "management" # added to keep the management scripts in a separate folder from the utils, according to Sara's point
PATH_ARCHITECTURES = PATH_SRC / "architectures"
PATH_TRAINING = PATH_SRC / "training"
PATH_FORECASTING = PATH_SRC / "forecasting"
PATH_OFFLINE_EVALUATION = PATH_SRC / "offline_evaluation"
PATH_DATALOADERS = PATH_SRC / "dataloaders"

paths_to_add = [PATH_ROOT, PATH_COMMON_UTILS, PATH_COMMON_CONFIGS, PATH_CONFIGS, PATH_UTILS, PATH_ARCHITECTURES, PATH_TRAINING, PATH_FORECASTING, PATH_OFFLINE_EVALUATION, PATH_DATALOADERS]
paths_to_add = [PATH_ROOT, PATH_COMMON_UTILS, PATH_COMMON_CONFIGS, PATH_CONFIGS, PATH_UTILS, PATH_MANAGEMENT, PATH_ARCHITECTURES, PATH_TRAINING, PATH_FORECASTING, PATH_OFFLINE_EVALUATION, PATH_DATALOADERS]

for path in paths_to_add:
path_str = str(path)
@@ -62,40 +102,42 @@ def setup_project_paths(PATH) -> None:
sys.path.insert(0, path_str)


def setup_data_paths(PATH) -> None:
def setup_data_paths(PATH) -> tuple:

"""
Returns the raw, processed, and generated data paths for the specified model.
Returns the raw, processed, and generated data paths (pathlib path object) for the specified model.

Args:
PATH (Path): The base path, typically the path of the script invoking this function (i.e., `Path(__file__)`).

"""

PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models")+2]]) # The +2 is to include the "models" and the individual model name in the path

#PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models")+2]]) # The +2 is to include the "models" and the individual model name in the path
PATH_MODEL = setup_model_paths(PATH)

PATH_DATA = PATH_MODEL / "data"
PATH_RAW = PATH_DATA / "raw"
PATH_PROCESSED = PATH_DATA / "processed"
PATH_GENERATED = PATH_DATA / "generated"

return PATH_RAW, PATH_PROCESSED, PATH_GENERATED
return PATH_RAW, PATH_PROCESSED, PATH_GENERATED # added in accordance with Sara's escwa branch


def setup_artifacts_paths(PATH) -> None:
def setup_artifacts_paths(PATH) -> Path:

"""
Returns the paths for the artifacts for the specified model.
Returns the paths (pathlib path object) for the artifacts for the specified model.

Args:
PATH (Path): The base path, typically the path of the script invoking this function (i.e., `Path(__file__)`).

"""

PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models")+2]]) # The +2 is to include the "models" and the individual model name in the path

#PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models")+2]]) # The +2 is to include the "models" and the individual model name in the path
PATH_MODEL = setup_model_paths(PATH)

PATH_ARTIFACTS = PATH_MODEL / "artifacts"
# print(f"Artifacts path: {PATH_ARTIFACTS}")
return PATH_ARTIFACTS
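The part-slicing used by setup_root_paths and setup_model_paths can be checked on a synthetic path. The helper `root_and_model` and the example path below are illustrative, not from the PR:

```python
from pathlib import Path

def root_and_model(path: Path) -> tuple:
    # Same slicing as setup_root_paths / setup_model_paths: keep parts up to
    # and including "views_pipeline", and up to one level below "models".
    parts = path.parts
    root = Path(*parts[:parts.index("views_pipeline") + 1])
    model = Path(*parts[:parts.index("models") + 2])
    return root, model

p = Path("/home/user/views_pipeline/models/purple_alien/src/training/train.py")
root, model = root_and_model(p)
print(root)   # → /home/user/views_pipeline
print(model)  # → /home/user/views_pipeline/models/purple_alien
```

Path.parts includes the root "/" as its first element, which is why reassembling the slice with `Path(*parts[...])` reproduces an absolute path; `parts.index(...)` raises ValueError if the script is run from outside the views_pipeline tree.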
16 changes: 16 additions & 0 deletions models/purple_alien/configs/config_deployment.py
@@ -0,0 +1,16 @@
def get_deployment_config():

"""
Contains the configuration for deploying the model into different environments.
This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system.

Returns:
- deployment_config (dict): A dictionary containing deployment settings, determining how the model is deployed, including status, endpoints, and resource allocation.
"""

# More deployment settings can/will be added here
deployment_config = {
"deployment_status": "shadow", # shadow, deployed, baseline, or deprecated
}

return deployment_config
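A hypothetical guard (not in the PR) could fail fast if deployment_status falls outside the four states listed in the comment above; `validate_deployment_config` is an illustrative name:

```python
def validate_deployment_config(config: dict) -> dict:
    # Reject any status outside the documented set: shadow, deployed,
    # baseline, or deprecated.
    allowed = {"shadow", "deployed", "baseline", "deprecated"}
    status = config.get("deployment_status")
    if status not in allowed:
        raise ValueError(f"deployment_status must be one of {sorted(allowed)}, got {status!r}")
    return config

print(validate_deployment_config({"deployment_status": "shadow"})["deployment_status"])
# → shadow
```

Since the config is "behavioral", a check like this would catch a typo before it silently changes how the model is integrated into the deployment system.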
14 changes: 11 additions & 3 deletions models/purple_alien/configs/config_hyperparameters.py
@@ -1,14 +1,22 @@

def get_hp_config():


"""
Contains the hyperparameter configurations for model training.
This configuration is "operational" so modifying these settings will impact the model's behavior during training.

Returns:
- hyperparameters (dict): A dictionary containing hyperparameters for training the model, which determine the model's behavior during the training phase.
"""

hyperparameters = {
'model' : 'HydraBNUNet06_LSTM4', #'BNUNet',
'weight_init' : 'xavier_norm',
'clip_grad_norm' : True,
'scheduler' : 'WarmupDecay', # 'CosineAnnealingLR' 'OneCycleLR'
'total_hidden_channels' : 32,
'min_events' : 5,
'samples': 600, # 10 just for debug
'samples': 300, # 600 for actual training, 10 for debug
'batch_size': 3,
'dropout_rate' : 0.125,
'learning_rate' : 0.001,
@@ -24,7 +32,7 @@ def get_hp_config():
'loss_reg': 'b',
'loss_reg_a' : 258,
'loss_reg_c' : 0.001, # 0.05 works...
'test_samples': 128,
'test_samples': 128, # 128 for actual testing, 10 for debug
'np_seed' : 4,
'torch_seed' : 4,
'window_dim' : 32,
25 changes: 25 additions & 0 deletions models/purple_alien/configs/config_input_data.py
@@ -0,0 +1,25 @@
from viewser import Queryset, Column

def get_input_data_config():

"""
Contains the configuration for the input data in the form of a viewser queryset, i.e. the data from viewser used to train the model.
This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system.
There is no guarantee that the model will work if the input data configuration is changed here without changing the model settings and architecture accordingly.

Returns:
queryset_base (Queryset): A queryset containing the base data for the model training.
"""

# VIEWSER 6
queryset_base = (Queryset("purple_alien", "priogrid_month")
.with_column(Column("ln_sb_best", from_loa = "priogrid_month", from_column = "ged_sb_best_count_nokgi").transform.ops.ln().transform.missing.replace_na())
.with_column(Column("ln_ns_best", from_loa = "priogrid_month", from_column = "ged_ns_best_count_nokgi").transform.ops.ln().transform.missing.replace_na())
.with_column(Column("ln_os_best", from_loa = "priogrid_month", from_column = "ged_os_best_count_nokgi").transform.ops.ln().transform.missing.replace_na())
.with_column(Column("month", from_loa = "month", from_column = "month"))
.with_column(Column("year_id", from_loa = "country_year", from_column = "year_id"))
.with_column(Column("c_id", from_loa = "country_year", from_column = "country_id"))
.with_column(Column("col", from_loa = "priogrid", from_column = "col"))
.with_column(Column("row", from_loa = "priogrid", from_column = "row")))

return queryset_base
17 changes: 17 additions & 0 deletions models/purple_alien/configs/config_meta.py
@@ -0,0 +1,17 @@
def get_meta_config():
"""
Contains the metadata for the model (model architecture, name, target variable, and level of analysis).
This config is for documentation purposes only, and modifying it will not affect the model, the training, or the evaluation.

Returns:
- meta_config (dict): A dictionary containing model meta configuration.
"""
meta_config = {
"name": "purple_alien",
"algorithm": "HydraNet",
"target(S)": ["ln_sb_best", "ln_ns_best", "ln_os_best", "ln_sb_best_binarized", "ln_ns_best_binarized", "ln_os_best_binarized"],
"queryset": "escwa001_cflong",
"level": "cm",
"creator": "Simon"
}
return meta_config