Commit

merging pgm_models into cm_models
lujzi05 committed Nov 8, 2024
2 parents 86cf1c6 + 8273771 commit b2c2605
Showing 129 changed files with 261 additions and 204 deletions.
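
Most of the queryset changes below are the same mechanical version bump, renaming querysets from the fatalities002 prefix to fatalities003. A minimal sketch of how such a bulk rename could be scripted; the script is illustrative and not part of this commit:

```python
from pathlib import Path

# Hypothetical helper, not from this commit: bump the queryset version
# prefix in every module under common_querysets/.
OLD, NEW = "fatalities002_pgm", "fatalities003_pgm"

for path in Path("common_querysets").glob("queryset_*.py"):
    text = path.read_text()
    if OLD in text:
        path.write_text(text.replace(OLD, NEW))
        print(f"updated {path}")
```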
2 changes: 1 addition & 1 deletion common_querysets/queryset_bad_blood.py
@@ -2,7 +2,7 @@

def generate():

-qs_natsoc = (Queryset('fatalities002_pgm_natsoc','priogrid_month')
+qs_natsoc = (Queryset('fatalities003_pgm_natsoc','priogrid_month')

.with_column(Column('ln_ged_sb_dep', from_loa='priogrid_month', from_column='ged_sb_best_sum_nokgi')
.transform.missing.replace_na()
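For reference, each of these queryset modules follows the same viewser pattern. A reconstructed sketch of the full generate() in queryset_bad_blood.py after this commit; only the lines visible in the diff are certain, while the import, the ln() transform, and the return statement are assumed from the sibling modules:

```python
from viewser import Queryset, Column

def generate():
    qs_natsoc = (Queryset('fatalities003_pgm_natsoc', 'priogrid_month')

        .with_column(Column('ln_ged_sb_dep', from_loa='priogrid_month',
                            from_column='ged_sb_best_sum_nokgi')
            .transform.missing.replace_na()
            .transform.ops.ln()  # assumed: log transform, as in queryset_caring_fish
        )
    )
    return qs_natsoc
```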
2 changes: 1 addition & 1 deletion common_querysets/queryset_blank_space.py
@@ -2,7 +2,7 @@

def generate():

-qs_natsoc = (Queryset('fatalities002_pgm_natsoc','priogrid_month')
+qs_natsoc = (Queryset('fatalities003_pgm_natsoc','priogrid_month')

.with_column(Column('ln_ged_sb_dep', from_loa='priogrid_month', from_column='ged_sb_best_sum_nokgi')
.transform.missing.replace_na()
2 changes: 1 addition & 1 deletion common_querysets/queryset_caring_fish.py
@@ -2,7 +2,7 @@

def generate():

-qs_conflict_history = (Queryset('fatalities002_pgm_conflict_history','priogrid_month')
+qs_conflict_history = (Queryset('fatalities003_pgm_conflict_history','priogrid_month')

.with_column(Column('ln_ged_sb_dep', from_loa='priogrid_month', from_column='ged_sb_best_sum_nokgi')
.transform.ops.ln()
2 changes: 1 addition & 1 deletion common_querysets/queryset_chunky_cat.py
@@ -2,7 +2,7 @@

def generate():

-qs_conflictlong = (Queryset('fatalities002_pgm_conflictlong','priogrid_month')
+qs_conflictlong = (Queryset('fatalities003_pgm_conflictlong','priogrid_month')
.with_column(Column('ln_ged_sb_dep', from_loa='priogrid_month', from_column='ged_sb_best_sum_nokgi')
.transform.missing.replace_na()
.transform.ops.ln()
2 changes: 1 addition & 1 deletion common_querysets/queryset_dark_paradise.py
@@ -2,7 +2,7 @@

def generate():

-qs_conflictlong = (Queryset('fatalities002_pgm_conflictlong','priogrid_month')
+qs_conflictlong = (Queryset('fatalities003_pgm_conflictlong','priogrid_month')

.with_column(Column('ln_ged_sb_dep', from_loa='priogrid_month', from_column='ged_sb_best_sum_nokgi')
.transform.missing.replace_na()
8 changes: 4 additions & 4 deletions common_querysets/queryset_invisible_string.py
@@ -2,7 +2,7 @@

def generate():

-qs_broad = (Queryset('fatalities002_pgm_broad','priogrid_month')
+qs_broad = (Queryset('fatalities003_pgm_broad','priogrid_month')

.with_column(Column('tlag1_dr_mod_gs', from_loa='priogrid_month', from_column='tlag1_dr_mod_gs')
.transform.missing.replace_na(0)
@@ -52,17 +52,17 @@ def generate():

.with_column(Column('sptime_dist_k1_ged_sb', from_loa='priogrid_month', from_column='ged_sb_best_sum_nokgi')
.transform.missing.replace_na()
-.transform.spatial.sptime_dist(distances,1,1.0,0.0)
+.transform.spatial.sptime_dist('distances',1,1.0,0.0)
)

.with_column(Column('sptime_dist_k10_ged_sb', from_loa='priogrid_month', from_column='ged_sb_best_sum_nokgi')
.transform.missing.replace_na()
-.transform.spatial.sptime_dist(distances,1,10.0,0.0)
+.transform.spatial.sptime_dist('distances',1,10.0,0.0)
)

.with_column(Column('sptime_dist_k001_ged_sb', from_loa='priogrid_month', from_column='ged_sb_best_sum_nokgi')
.transform.missing.replace_na()
-.transform.spatial.sptime_dist(distances,1,0.01,0.0)
+.transform.spatial.sptime_dist('distances',1,0.01,0.0)
)

.with_column(Column('dist_diamsec', from_loa='priogrid', from_column='dist_diamsec_s_wgs')
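The sptime_dist edits above are a genuine fix rather than a rename: a bare distances would be evaluated as a Python identifier, which would typically raise NameError when the module is imported, while the quoted 'distances' is passed to the transform as a string argument. A self-contained illustration of one corrected column, under the same viewser-import assumption as the sketch above:

```python
from viewser import Column

# The first argument to sptime_dist is now the string 'distances';
# the numeric arguments (1, 1.0, 0.0) are taken verbatim from the diff.
col = (Column('sptime_dist_k1_ged_sb', from_loa='priogrid_month',
              from_column='ged_sb_best_sum_nokgi')
       .transform.missing.replace_na()
       .transform.spatial.sptime_dist('distances', 1, 1.0, 0.0))
```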
2 changes: 1 addition & 1 deletion common_querysets/queryset_lavender_haze.py
@@ -2,7 +2,7 @@

def generate():

-qs_broad = (Queryset('fatalities002_pgm_broad','priogrid_month')
+qs_broad = (Queryset('fatalities003_pgm_broad','priogrid_month')

.with_column(Column('tlag1_dr_mod_gs', from_loa='priogrid_month', from_column='tlag1_dr_mod_gs')
.transform.missing.replace_na(0)
2 changes: 1 addition & 1 deletion common_querysets/queryset_midnight_rain.py
@@ -10,7 +10,7 @@ def generate():
- queryset_base (Queryset): A queryset containing the base data for the model training.
"""

-qs_escwa_drought = (Queryset('fatalities002_pgm_escwa_drought','priogrid_month')
+qs_escwa_drought = (Queryset('fatalities003_pgm_escwa_drought','priogrid_month')

.with_column(Column('pgd_nlights_calib_mean', from_loa='priogrid_year', from_column='nlights_calib_mean')
.transform.missing.replace_na(0)
2 changes: 1 addition & 1 deletion common_querysets/queryset_old_money.py
@@ -2,7 +2,7 @@

def generate():

-qs_escwa_drought = (Queryset('fatalities002_pgm_escwa_drought','priogrid_month')
+qs_escwa_drought = (Queryset('fatalities003_pgm_escwa_drought','priogrid_month')

.with_column(Column('pgd_nlights_calib_mean', from_loa='priogrid_year', from_column='nlights_calib_mean')
.transform.missing.replace_na(0)
2 changes: 1 addition & 1 deletion common_querysets/queryset_orange_pasta.py
@@ -2,7 +2,7 @@

def generate():

-qs_baseline = (Queryset('fatalities002_pgm_baseline','priogrid_month')
+qs_baseline = (Queryset('fatalities003_pgm_baseline','priogrid_month')

.with_column(Column('ln_ged_sb_dep', from_loa='priogrid_month', from_column='ged_sb_best_sum_nokgi')
.transform.missing.replace_na()
2 changes: 1 addition & 1 deletion common_querysets/queryset_wildest_dream.py
@@ -3,7 +3,7 @@
def generate():


-qs_sptime_dist = (Queryset('fatalities002_pgm_conflict_sptime_dist','priogrid_month')
+qs_sptime_dist = (Queryset('fatalities003_pgm_conflict_sptime_dist','priogrid_month')

.with_column(Column('ged_gte_1', from_loa='priogrid_month', from_column='ged_sb_best_sum_nokgi')
.transform.bool.gte(1)
2 changes: 1 addition & 1 deletion common_querysets/queryset_yellow_pikachu.py
@@ -2,7 +2,7 @@

def generate():

-qs_treelag = (Queryset('fatalities002_pgm_conflict_treelag','priogrid_month')
+qs_treelag = (Queryset('fatalities003_pgm_conflict_treelag','priogrid_month')

.with_column(Column('ged_gte_1', from_loa='priogrid_month', from_column='ged_sb_best_sum_nokgi')
.transform.bool.gte(1)
20 changes: 10 additions & 10 deletions common_utils/model_path.py
@@ -178,24 +178,24 @@ def get_model_name_from_path(path: Union[Path, str]) -> str:
ValueError: If the model name is not found in the provided path.
"""
path = Path(path)
logger.info(f"Extracting model name from Path: {path}")
logger.debug(f"Extracting model name from Path: {path}")
if "models" in path.parts and "ensembles" not in path.parts:
model_idx = path.parts.index("models")
model_name = path.parts[model_idx + 1]
if utils_model_naming.validate_model_name(model_name):
logger.info(f"Valid model name {model_name} found in path {path}")
logger.debug(f"Valid model name {model_name} found in path {path}")
return str(model_name)
else:
logger.info(f"No valid model name found in path {path}")
logger.debug(f"No valid model name found in path {path}")
return None
if "ensembles" in path.parts and "models" not in path.parts:
model_idx = path.parts.index("ensembles")
model_name = path.parts[model_idx + 1]
if utils_model_naming.validate_model_name(model_name):
logger.info(f"Valid ensemble name {model_name} found in path {path}")
logger.debug(f"Valid ensemble name {model_name} found in path {path}")
return str(model_name)
else:
logger.info(f"No valid ensemble name found in path {path}")
logger.debug(f"No valid ensemble name found in path {path}")
return None
return None

@@ -319,7 +319,7 @@ def _handle_global_cache(self) -> None:

cached_instance = GlobalCache[self._instance_hash]
if cached_instance and not self._force_cache_overwrite:
-logger.info(
+logger.debug(
f"ModelPath instance {self.model_name} found in GlobalCache. Using cached instance."
)
return cached_instance
@@ -337,13 +337,13 @@ def _write_to_global_cache(self) -> None:
from global_cache import GlobalCache

if GlobalCache[self._instance_hash] is None:
-logger.info(
+logger.debug(
f"Writing {self.target.title}Path object to cache for model {self.model_name}."
)
GlobalCache[self._instance_hash] = self
else:
if self._force_cache_overwrite:
-logger.info(
+logger.debug(
f"Overwriting {self.target.title}Path object in cache for model {self.model_name}. (_force_cache_overwrite is set to True)"
)
GlobalCache[self._instance_hash] = self
@@ -481,7 +481,7 @@ def get_queryset(self) -> Optional[Dict[str, str]]:
logger.error(f"Error importing queryset: {e}")
self._queryset = None
else:
logger.info(f"Queryset {self.queryset_path} imported successfully.")
logger.debug(f"Queryset {self.queryset_path} imported successfully.")
return self._queryset.generate() if self._queryset else None
else:
logger.warning(
@@ -569,7 +569,7 @@ def add_paths_to_sys(self) -> List[str]:
)
return
if model_name == self.model_name:
-logger.info(
+logger.debug(
f"Path {str(path)} for '{model_name}' is already added to sys.path. Skipping..."
)
if self._sys_paths is None:
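The model_path.py changes uniformly demote routine path and cache messages from info to debug, so they disappear at the default log level. A minimal standard-library sketch of the effect; the logger name and configuration are illustrative:

```python
import logging

logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
logger = logging.getLogger("model_path")

logger.info("Still visible at the default INFO level.")
logger.debug("Extracting model name from Path: ...")  # suppressed after this commit

# Opt back in to the verbose messages when troubleshooting:
logger.setLevel(logging.DEBUG)
logger.debug("Valid model name found in path ...")  # now emitted
```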
4 changes: 0 additions & 4 deletions ensembles/cruel_summer/README.md
@@ -2,10 +2,6 @@
## Overview
This folder contains code for Cruel Summer model, an ensemble machine learning model designed for predicting fatalities.

-The model utilizes **latest** Lavender Haze (Hurdle Model LGBMClassifier+LGBMRegressor), **latest** Blank Space
-(Hurdle Model LGBMClassifier+LGBMRegressor) and **latest** Wildest Dream (Hurdle Model XGBClassifier+XGBRegressor)
-for its predictions and is on pgm level of analysis.
-
The model uses log fatalities.

## Repository Structure
28 changes: 16 additions & 12 deletions ensembles/cruel_summer/configs/config_deployment.py
@@ -1,16 +1,20 @@
def get_deployment_config():
    """
-    Deployment Configuration Script
-    This script defines the deployment configuration settings for the application.
-    It includes the deployment status and any additional settings specified.
-    """
+    Contains the configuration for deploying the model into different environments.
+    This configuration is "behavioral" so modifying it will affect the model's runtime behavior and integration into the deployment system.
+
+    Deployment Status:
+    - shadow: The deployment is shadowed and not yet active.
+    - deployed: The deployment is active and in use.
+    - baseline: The deployment is in a baseline state, for reference or comparison.
+    - deprecated: The deployment is deprecated and no longer supported.
+
+    Returns:
+    - deployment_config (dict): A dictionary containing deployment settings, determining how the model is deployed, including status, endpoints, and resource allocation.
+    """
-    Additional settings can be included in the configuration dictionary as needed.
+    # More deployment settings can/will be added here
+    deployment_config = {
+        "deployment_status": "shadow", # shadow, deployed, baseline, or deprecated
+    }
-    """

    return deployment_config
-def get_deployment_config():
-    # Deployment settings
-    deployment_config = {'deployment_status': 'shadow'}
-    return deployment_config
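
A short usage sketch for the reworked config; the branching caller is hypothetical, and only get_deployment_config and the four documented status values come from the file above:

```python
status = get_deployment_config()["deployment_status"]

if status == "deprecated":
    raise RuntimeError("Model is deprecated and should not be run.")
serve_externally = status == "deployed"  # shadow/baseline runs stay internal
```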
2 changes: 1 addition & 1 deletion ensembles/cruel_summer/configs/config_hyperparameters.py
@@ -2,4 +2,4 @@ def get_hp_config():
hp_config = {
"steps": [*range(1, 36 + 1, 1)]
}
-return hp_config
\ No newline at end of file
+return hp_config
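
The splat-range idiom above defines the forecast horizons; a quick check of what it expands to (reading the 36 steps as months ahead is an assumption based on the pgm models' monthly resolution):

```python
steps = [*range(1, 36 + 1, 1)]
assert steps == list(range(1, 37))
print(len(steps), steps[0], steps[-1])  # 36 horizons: 1 through 36
```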
12 changes: 6 additions & 6 deletions ensembles/cruel_summer/configs/config_meta.py
@@ -8,10 +8,10 @@ def get_meta_config():
"""
meta_config = {
"name": "cruel_summer",
"models": ["lavender_haze", "blank_space", "wildest_dream"],
"depvar": "ln_ged_sb_dep", # Double-check the target variables of each model
"level": "pgm",
"aggregation": "median",
"creator": "Xiaolong"
"models": ["chunky_cat", "bad_blood"],
"depvar": "ln_ged_sb_dep",
"level": "pgm",
"aggregation": "median",
"creator": "Xiaolong"
}
-return meta_config
\ No newline at end of file
+return meta_config
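
Downstream code consumes this dictionary to assemble the ensemble; a hypothetical sketch of how the constituent-model list is typically iterated (the loop is illustrative, not from this commit):

```python
meta = get_meta_config()

# After this commit the ensemble aggregates chunky_cat and bad_blood
# (previously lavender_haze, blank_space, and wildest_dream).
for model_name in meta["models"]:
    print(f"collect {meta['depvar']} predictions from {model_name} ({meta['level']})")
print(f"combine with aggregation: {meta['aggregation']}")
```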
14 changes: 14 additions & 0 deletions ensembles/cruel_summer/src/dataloaders/get_data.py
@@ -0,0 +1,14 @@
+import logging
+from model_path import ModelPath
+from utils_dataloaders import fetch_or_load_views_df
+
+logger = logging.getLogger(__name__)
+
+def get_data(model_name, run_type, use_saved, self_test):
+    model_path = ModelPath(model_name, validate=False)
+    path_raw = model_path.data_raw
+
+    data, alerts = fetch_or_load_views_df(model_name, run_type, path_raw, self_test, use_saved)
+    logger.debug(f"DataFrame shape: {data.shape if data is not None else 'None'}")
+
+    return data
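
A hypothetical invocation of the new loader; the argument values are illustrative, and fetch_or_load_views_df / ModelPath come from the pipeline's shared utilities:

```python
# Illustrative call only; run_type and the flags are not prescribed by this file.
df = get_data(model_name="cruel_summer", run_type="forecasting",
              use_saved=True, self_test=False)
print(df.shape if df is not None else "no data returned")
```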
11 changes: 4 additions & 7 deletions ensembles/cruel_summer/src/forecasting/generate_forecast.py
@@ -5,7 +5,6 @@
from pathlib import Path
from model_path import ModelPath
from ensemble_path import EnsemblePath
-from set_partition import get_partitioner_dict
from utils_log_files import create_log_file, read_log_file
from utils_save_outputs import save_predictions
from utils_run import get_standardized_df, get_aggregated_df, get_single_model_config
@@ -50,14 +49,13 @@ def forecast_ensemble(config):
except FileNotFoundError:
logger.exception(f"Model artifact not found at {path_artifact}")

-partition = get_partitioner_dict(run_type)["predict"]
-df = stepshift_model.future_point_predict(partition[0]-1, df_viewser, keep_specific=True)
+df = stepshift_model.predict(run_type, df_viewser)
df = get_standardized_df(df, model_config)

data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)
+data_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)
save_predictions(df, path_generated, model_config)
-create_log_file(path_generated, model_config, ts, data_generation_timestamp, date_fetch_timestamp)
+create_log_file(path_generated, model_config, ts, data_generation_timestamp, data_fetch_timestamp)

dfs.append(df)

@@ -69,7 +67,6 @@ def forecast_ensemble(config):
save_predictions(df_prediction, path_generated_e, config)

# How to define an ensemble model timestamp? Currently set as data_generation_timestamp.
-
-create_log_file(path_generated_e, config, data_generation_timestamp, data_generation_timestamp, date_fetch_timestamp=None,
+create_log_file(path_generated_e, config, data_generation_timestamp, data_generation_timestamp, data_fetch_timestamp=None,
model_type="ensemble", models=config["models"])

@@ -53,16 +53,16 @@ def evaluate_ensemble(config):
except FileNotFoundError:
logger.exception(f"Model artifact not found at {path_artifact}")

df = stepshift_model.predict(run_type, "predict", df_viewser)
df = stepshift_model.predict(run_type, df_viewser)
df = get_standardized_df(df, model_config)
data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-date_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)
+data_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)

_, df_output = generate_output_dict(df, model_config)
evaluation, df_evaluation = generate_metric_dict(df, model_config)
save_model_outputs(df_evaluation, df_output, path_generated, model_config)
save_predictions(df, path_generated, model_config)
-create_log_file(path_generated, model_config, ts, data_generation_timestamp, date_fetch_timestamp)
+create_log_file(path_generated, model_config, ts, data_generation_timestamp, data_fetch_timestamp)

dfs.append(df)

[Diff truncated: the remaining changed files are not shown.]
