Skip to content

Commit

Permalink
Minor fixes
Browse files Browse the repository at this point in the history
Removed redundancies and unnecessary code segments.
  • Loading branch information
humbleOldSage committed Nov 2, 2023
1 parent bf7f406 commit 1d7be5a
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 101 deletions.
79 changes: 79 additions & 0 deletions emission/analysis/modelling/trip_model/config copy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import json
import re
from this import d
from typing import Optional
import logging
from numpy import isin

import emission.analysis.modelling.trip_model.model_storage as eamums
import emission.analysis.modelling.trip_model.model_type as eamumt

# Path of the config file that load_config() most recently tried to read.
# It is assigned there via `global` and quoted in this module's error messages.
config_filename = ""

def load_config():
    """
    Load the trip model configuration from disk.

    Tries 'conf/analysis/trip_model.conf.json' first and falls back to the
    '.sample' file beside it when the primary file cannot be opened. The path
    that was actually read is recorded in the module-level `config_filename`.

    :return: the parsed JSON content of the config file
    :raises OSError: if neither the primary nor the sample file can be opened
    :raises json.JSONDecodeError: if the opened file is not valid JSON
    """
    global config_filename
    config_filename = 'conf/analysis/trip_model.conf.json'
    try:
        config_file = open(config_filename)
    except OSError:
        # only a failure to open the file triggers the fallback; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate
        print("analysis.trip_model.conf.json not configured, falling back to sample, default configuration")
        config_filename = 'conf/analysis/trip_model.conf.json.sample'
        config_file = open(config_filename)
    # the context manager closes the handle even when parsing fails
    with config_file:
        return json.load(config_file)

# Parsed configuration, loaded once when this module is imported and
# replaced whenever reload_config() is called.
config_data = load_config()

def reload_config():
    """
    re-read the configuration from disk and replace the module-level
    config_data with the freshly loaded content
    """
    global config_data
    config_data = load_config()

def get_config():
    """
    :return: the module-level config object as currently loaded (the same
             object, not a copy)
    """
    return config_data

def get_optional_config_value(key) -> Optional[str]:
    """
    get a config value at the provided path/key

    NOTE: values are returned exactly as stored in the config, so despite the
    annotation they are not necessarily strings (get_minimum_trips, for
    example, expects an integer back).

    :param key: a key name or a dot-delimited path to some key within the config object
    :return: the value at the key, or, None if not found; a path that tries to
             descend through a non-object value (e.g. "model_type.foo" when
             model_type is a string) is also reported as not found
    """
    cursor = config_data
    for k in key.split("."):
        # a scalar or list part-way down the path has no named children;
        # treat that as "not found" instead of failing on a missing .get()
        if not isinstance(cursor, dict):
            return None
        cursor = cursor.get(k)
        if cursor is None:
            return None
    return cursor

def get_config_value_or_raise(key):
    """
    get a config value that is required to be present

    :param key: a key name or a dot-delimited path to some key within the config object
    :return: the value found at the key
    :raises KeyError: if the lookup yields None
    """
    logging.debug(f'getting key {key} in config')
    found = get_optional_config_value(key)
    if found is not None:
        return found

    # log the whole config at debug level to help diagnose the missing key
    logging.debug('config object:')
    logging.debug(json.dumps(config_data, indent=2))
    raise KeyError(
        f"expected config key {key} not found in config file {config_filename}")

def get_model_type():
    """
    read the required 'model_type' config entry and convert it with
    ModelType.from_str

    :return: whatever ModelType.from_str returns for the configured string
    :raises KeyError: if 'model_type' is missing from the config
    """
    return eamumt.ModelType.from_str(get_config_value_or_raise('model_type'))

def get_model_storage():
    """
    read the required 'model_storage' config entry and convert it with
    ModelStorage.from_str

    :return: whatever ModelStorage.from_str returns for the configured string
    :raises KeyError: if 'model_storage' is missing from the config
    """
    return eamums.ModelStorage.from_str(get_config_value_or_raise('model_storage'))

def get_minimum_trips():
    """
    read the required 'minimum_trips' config entry

    :return: the configured value, guaranteed to be an int
    :raises KeyError: if 'minimum_trips' is missing from the config
    :raises TypeError: if the configured value is not an integer
    """
    minimum_trips = get_config_value_or_raise('minimum_trips')
    # bool is a subclass of int, so a JSON true/false would otherwise slip
    # through the isinstance check and be used as 1/0
    if isinstance(minimum_trips, bool) or not isinstance(minimum_trips, int):
        msg = f"config key 'minimum_trips' not an integer in config file {config_filename}"
        raise TypeError(msg)
    return minimum_trips



Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,6 @@ def fit(self, trips: List[ecwc.Confirmedtrip]):
corresponds to a label at the matching index of the label input
:param trips: 2D array of features to train from
:param tripsdf: trips data in dataframe format
"""

logging.debug(f'fit called with {len(trips)} trips')
Expand Down
4 changes: 2 additions & 2 deletions emission/analysis/modelling/trip_model/run_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ def update_trip_model(
epq.mark_trip_model_failed(user_id)
else:

# train and store the model. pass both List of event and dataframe time data
# that both standard( which mostly work on df) and self implemented models can use.
# train and store the model. pass only the List of events, and convert it
# to dataframe-type data only where required.
model.fit(trips)
model_data_next = model.to_dict()

Expand Down
98 changes: 0 additions & 98 deletions emission/analysis/modelling/trip_model/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,6 @@
import numpy as np
import pandas as pd
from numpy.linalg import norm
import copy

from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer


def find_knee_point(values: List[float]) -> Tuple[float, int]:
"""for a list of values, find the value which represents the cut-off point
Expand Down Expand Up @@ -76,95 +70,3 @@ def single_cluster_purity(points_in_cluster, label_col='purpose_confirm'):
purity = len(points_in_cluster[points_in_cluster[label_col] ==
most_freq_label]) / len(points_in_cluster)
return purity


class OneHotWrapper():
    """ Convenience layer around a one-hot encoder that returns DataFrames.
    Args:
        impute_missing (bool): when True, np.nan entries are replaced by the
            constant string 'missing' before encoding.
        sparse (bool): passed to OneHotEncoder when not imputing; the
            imputing pipeline always builds its encoder with sparse=False.
        handle_unknown (str): passed to OneHotEncoder; specifies the way
            unknown categories are handled during transform.
    """

    def __init__(
        self,
        impute_missing=False,
        sparse=False,
        handle_unknown='ignore',
    ):
        self.impute_missing = impute_missing
        if not self.impute_missing:
            self.encoder = OneHotEncoder(sparse=sparse,
                                         handle_unknown=handle_unknown)
            return

        # fill np.nan with a constant first, then encode the filled values
        imputer = SimpleImputer(missing_values=np.nan,
                                strategy='constant',
                                fill_value='missing')
        dense_encoder = OneHotEncoder(sparse=False,
                                      handle_unknown=handle_unknown)
        self.encoder = make_pipeline(imputer, dense_encoder)

    def fit_transform(self, train_df, output_col_prefix=None):
        """ Fit the encoder and return the one-hot encoded training data.
        Sets self.onehot_encoding_cols_all (every generated column name) and
        self.onehot_encoding_cols (the same names without the '_nan' ones).
        Args:
            train_df (DataFrame): DataFrame containing train trips.
            output_col_prefix (str): only honoured if train_df is a single
                column; otherwise each column's own name is the prefix.
        Returns:
            DataFrame of ints indexed like train_df, without '_nan' columns.
        """
        # TODO: handle pd series

        frame = train_df.copy()  # to avoid SettingWithCopyWarning

        # if imputing, the dtype of each column must be string/object and not
        # numerical, otherwise the SimpleImputer will fail
        if self.impute_missing:
            for name in frame.columns:
                frame[name] = frame[name].astype(object)
        encoded = self.encoder.fit_transform(frame)

        # the caller's prefix is only used for a single-column frame
        name_by_column = frame.shape[1] > 1 or output_col_prefix is None

        # NOTE(review): the names below list the sorted non-null values and
        # then '_nan' last; when imputing, the encoder sees 'missing' instead
        # of np.nan - confirm the encoded column order still matches.
        self.onehot_encoding_cols_all = []
        for name in frame.columns:
            prefix = name if name_by_column else output_col_prefix
            values = frame[name]
            self.onehot_encoding_cols_all += [
                f'{prefix}_{val}' for val in np.sort(values.dropna().unique())
            ]
            # we handle np.nan separately because it is of type float, and may
            # cause issues with np.sort if the rest of the unique values are
            # strings
            if any(values.isna()):
                self.onehot_encoding_cols_all += [f'{prefix}_nan']

        encoded_df = pd.DataFrame(encoded,
                                  columns=self.onehot_encoding_cols_all)
        encoded_df = encoded_df.set_index(frame.index)

        # ignore the encoded columns for missing entries
        self.onehot_encoding_cols = copy.deepcopy(self.onehot_encoding_cols_all)
        nan_labels = [
            label for label in self.onehot_encoding_cols_all
            if label.endswith('_nan')
        ]
        for label in nan_labels:
            encoded_df = encoded_df.drop(columns=[label])
            self.onehot_encoding_cols.remove(label)

        return encoded_df.astype(int)

    def transform(self, test_df):
        """ One-hot encode features using the columns established by
        fit_transform.
        Args:
            test_df (DataFrame): DataFrame of trips.
        Returns:
            DataFrame of ints indexed like test_df, without '_nan' columns.
        """
        # TODO: rename test_df, this one doesn't necessarily need to be a df
        encoded = self.encoder.transform(test_df)
        encoded_df = pd.DataFrame(encoded,
                                  columns=self.onehot_encoding_cols_all)
        encoded_df = encoded_df.set_index(test_df.index)

        # ignore the encoded columns for missing entries
        nan_labels = [
            label for label in self.onehot_encoding_cols_all
            if label.endswith('_nan')
        ]
        for label in nan_labels:
            encoded_df = encoded_df.drop(columns=[label])

        return encoded_df.astype(int)

0 comments on commit 1d7be5a

Please sign in to comment.