Minor fixes

Removed redundancies and unnecessary code segments.
e-mission · Nov 2, 2023 · 1d7be5a · 1d7be5a
1 parent bf7f406
commit 1d7be5a
Show file tree

Hide file tree

Showing 4 changed files with 81 additions and 101 deletions.
diff --git a/emission/analysis/modelling/trip_model/config copy.py b/emission/analysis/modelling/trip_model/config copy.py
@@ -0,0 +1,79 @@
+import json
+import re
+from this import d
+from typing import Optional
+import logging
+from numpy import isin
+
+import emission.analysis.modelling.trip_model.model_storage as eamums
+import emission.analysis.modelling.trip_model.model_type as eamumt
+
+config_filename = ""
+
+def load_config():
+    global config_filename
+    try:
+        config_filename = 'conf/analysis/trip_model.conf.json'
+        config_file = open(config_filename)
+    except:
+        print("analysis.trip_model.conf.json not configured, falling back to sample, default configuration")
+        config_filename = 'conf/analysis/trip_model.conf.json.sample'
+        config_file = open('conf/analysis/trip_model.conf.json.sample')
+    ret_val = json.load(config_file)
+    config_file.close()
+    return ret_val
+
+config_data = load_config()
+
+def reload_config():
+    global config_data
+    config_data = load_config()
+
+def get_config():
+    return config_data
+
+def get_optional_config_value(key) -> Optional[str]:
+    """
+    get a config value at the provided path/key
+
+    :param key: a key name or a dot-delimited path to some key within the config object
+    :return: the value at the key, or, None if not found
+    """
+    cursor = config_data
+    path = key.split(".")
+    for k in path:
+        cursor = cursor.get(k)
+        if cursor is None:
+            return None
+    return cursor
+
+def get_config_value_or_raise(key):
+    logging.debug(f'getting key {key} in config')
+    value = get_optional_config_value(key)
+    if value is None:
+        logging.debug('config object:')
+        logging.debug(json.dumps(config_data, indent=2))
+        msg = f"expected config key {key} not found in config file {config_filename}"
+        raise KeyError(msg)
+    else:
+        return value
+
+def get_model_type():
+    model_type_str = get_config_value_or_raise('model_type')
+    model_type = eamumt.ModelType.from_str(model_type_str)
+    return model_type
+
+def get_model_storage():
+    model_storage_str = get_config_value_or_raise('model_storage')
+    model_storage = eamums.ModelStorage.from_str(model_storage_str)
+    return model_storage
+
+def get_minimum_trips():
+    minimum_trips = get_config_value_or_raise('minimum_trips')
+    if not isinstance(minimum_trips, int):
+        msg = f"config key 'minimum_trips' not an integer in config file {config_filename}"
+        raise TypeError(msg)
+    return minimum_trips
+
+
+
diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py
@@ -133,7 +133,6 @@ def fit(self, trips: List[ecwc.Confirmedtrip]):
         corresponds to a label at the matching index of the label input
 
         :param trips: 2D array of features to train from
-        :param tripsdf: trips data in dataframe format
         """
 
         logging.debug(f'fit called with {len(trips)} trips')

diff --git a/emission/analysis/modelling/trip_model/run_model.py b/emission/analysis/modelling/trip_model/run_model.py
@@ -72,8 +72,8 @@ def update_trip_model(
             epq.mark_trip_model_failed(user_id)
         else:
 
-            # train and store the model. pass both List of event and dataframe time data
-            # that both standard( which mostly work on df) and self implemented models can use.
+            # train and store the model. pass only the list of trips and convert
+            # to dataframe type data wherever required.
             model.fit(trips)
             model_data_next = model.to_dict()
 

diff --git a/emission/analysis/modelling/trip_model/util.py b/emission/analysis/modelling/trip_model/util.py
@@ -3,12 +3,6 @@
 import numpy as np
 import pandas as pd
 from numpy.linalg import norm
-import copy
-
-from sklearn.preprocessing import OneHotEncoder
-from sklearn.pipeline import make_pipeline
-from sklearn.impute import SimpleImputer
-
 
 def find_knee_point(values: List[float]) -> Tuple[float, int]:
     """for a list of values, find the value which represents the cut-off point
@@ -76,95 +70,3 @@ def single_cluster_purity(points_in_cluster, label_col='purpose_confirm'):
     purity = len(points_in_cluster[points_in_cluster[label_col] ==
                                    most_freq_label]) / len(points_in_cluster)
     return purity
-
-
-class OneHotWrapper():
-    """ Helper class to streamline one-hot encoding. 
-    
-        Args: 
-            impute_missing (bool): whether or not to impute np.nan values. 
-            sparse (bool): whether or not to return a sparse matrix. 
-            handle_unknown (str): specifies the way unknown categories are 
-                handled during transform.
-    """
-
-    def __init__(
-        self,
-        impute_missing=False,
-        sparse=False,
-        handle_unknown='ignore',
-    ):
-        self.impute_missing = impute_missing
-        if self.impute_missing:
-            self.encoder = make_pipeline(
-                SimpleImputer(missing_values=np.nan,
-                              strategy='constant',
-                              fill_value='missing'),
-                OneHotEncoder(sparse=False, handle_unknown=handle_unknown))
-        else:
-            self.encoder = OneHotEncoder(sparse=sparse,
-                                         handle_unknown=handle_unknown)
-
-    def fit_transform(self, train_df, output_col_prefix=None):
-        """ Establish one-hot encoded variables. 
-        
-            Args: 
-                train_df (DataFrame): DataFrame containing train trips. 
-                output_col_prefix (str): only if train_df is a single column
-        """
-        # TODO: handle pd series
-
-        train_df = train_df.copy()  # to avoid SettingWithCopyWarning
-
-        # if imputing, the dtype of each column must be string/object and not
-        # numerical, otherwise the SimpleImputer will fail
-        if self.impute_missing:
-            for col in train_df.columns:
-                train_df[col] = train_df[col].astype(object)
-        onehot_encoding = self.encoder.fit_transform(train_df)
-        self.onehot_encoding_cols_all = []
-        for col in train_df.columns:
-            if train_df.shape[1] > 1 or output_col_prefix is None:
-                output_col_prefix = col
-            self.onehot_encoding_cols_all += [
-                f'{output_col_prefix}_{val}'
-                for val in np.sort(train_df[col].dropna().unique())
-            ]
-            # we handle np.nan separately because it is of type float, and may
-            # cause issues with np.sort if the rest of the unique values are
-            # strings
-            if any((train_df[col].isna())):
-                self.onehot_encoding_cols_all += [f'{output_col_prefix}_nan']
-
-        onehot_encoding_df = pd.DataFrame(
-            onehot_encoding,
-            columns=self.onehot_encoding_cols_all).set_index(train_df.index)
-
-        # ignore the encoded columns for missing entries
-        self.onehot_encoding_cols = copy.deepcopy(self.onehot_encoding_cols_all)
-        for col in self.onehot_encoding_cols_all:
-            if col.endswith('_nan'):
-                onehot_encoding_df = onehot_encoding_df.drop(columns=[col])
-                self.onehot_encoding_cols.remove(col)
-
-        return onehot_encoding_df.astype(int)
-
-    def transform(self, test_df):
-        """ One-hot encoded features in accordance with features seen in the 
-            train set. 
-        
-            Args: 
-                test_df (DataFrame): DataFrame of trips. 
-        """
-        # TODO: rename test_df, this one doesn't necessarily need to be a df
-        onehot_encoding = self.encoder.transform(test_df)
-        onehot_encoding_df = pd.DataFrame(
-            onehot_encoding,
-            columns=self.onehot_encoding_cols_all).set_index(test_df.index)
-
-        # ignore the encoded columns for missing entries
-        for col in self.onehot_encoding_cols_all:
-            if col.endswith('_nan'):
-                onehot_encoding_df = onehot_encoding_df.drop(columns=[col])
-
-        return onehot_encoding_df.astype(int)