Skip to content

Commit

Permalink
Import format changed + Whitespace / Newline fixes
Browse files Browse the repository at this point in the history
1. Newline fixes - reverted removal of line
a. util.py
e-mission#938 (comment)

b. run_model.py
e-mission#938 (comment)

2. Import format changed to "import X as Y" instead of "from X import Y"
Files: clustering.py, models.py, TestForestModelIntegration.py, TestForestModelLoadandSave.py, TestRunForestMode.py
e-mission#938 (comment)
  • Loading branch information
Mahadik, Mukul Chandrakant authored and Mahadik, Mukul Chandrakant committed Nov 18, 2024
1 parent ad968de commit 6d4cc3a
Show file tree
Hide file tree
Showing 7 changed files with 66 additions and 62 deletions.
30 changes: 15 additions & 15 deletions emission/analysis/modelling/trip_model/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
import logging

# import clustering algorithms
import sklearn.metrics.pairwise as smp
import sklearn.cluster as sc
from sklearn import metrics
from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import sklearn.metrics.pairwise as sklmp
import sklearn.cluster as sklc
import sklearn.metrics.cluster as sklmc
import sklearn.svm as skls
import sklearn.pipeline as sklpl
import sklearn.preprocessing as sklpp

# our imports
# NOTE: this requires changing the branch of e-mission-server to
Expand Down Expand Up @@ -102,7 +102,7 @@ def add_loc_clusters(
dist_matrix_meters = get_distance_matrix(loc_df, loc_type)

for r in radii:
model = sc.DBSCAN(r, metric="precomputed",
model = sklc.DBSCAN(r, metric="precomputed",
min_samples=min_samples).fit(dist_matrix_meters)
labels = model.labels_
# print(model.n_features_in_)
Expand Down Expand Up @@ -150,7 +150,7 @@ def add_loc_clusters(
dist_matrix_meters = get_distance_matrix(loc_df, loc_type)

for r in radii:
labels = sc.OPTICS(
labels = sklc.OPTICS(
min_samples=optics_min_samples,
max_eps=r,
xi=optics_xi,
Expand Down Expand Up @@ -178,7 +178,7 @@ def add_loc_clusters(
dist_matrix_meters = get_distance_matrix(p_loc_df, loc_type)

for r in radii:
labels = sc.DBSCAN(
labels = sklc.DBSCAN(
r, metric="precomputed",
min_samples=min_samples).fit(dist_matrix_meters).labels_

Expand Down Expand Up @@ -231,7 +231,7 @@ def add_loc_clusters(
# what the bandwidth roughly corresponds to in the real world/make
# the value a little more interpretable.
LATLON_TO_M = 1 / 111139
labels = sc.MeanShift(
labels = sklc.MeanShift(
bandwidth=LATLON_TO_M * r,
min_bin_freq=min_samples,
cluster_all=False,
Expand Down Expand Up @@ -325,9 +325,9 @@ def add_loc_SVM(loc_df,
]]
y_train = labeled_points_in_cluster.purpose_confirm.to_list()

labels = make_pipeline(
StandardScaler(),
svm.SVC(
labels = sklpl.make_pipeline(
sklpp.StandardScaler(),
skls.SVC(
kernel='rbf',
gamma=svm_gamma,
C=svm_C,
Expand Down Expand Up @@ -381,7 +381,7 @@ def get_distance_matrix(loc_df, loc_type):
radians_lat_lon = np.radians(loc_df[[loc_type + "_lat", loc_type + "_lon"]])

dist_matrix_meters = pd.DataFrame(
smp.haversine_distances(radians_lat_lon, radians_lat_lon) *
sklmp.haversine_distances(radians_lat_lon, radians_lat_lon) *
EARTH_RADIUS)
return dist_matrix_meters

Expand All @@ -404,7 +404,7 @@ def single_cluster_purity(points_in_cluster, label_col='purpose_confirm'):


def purity_score(y_true, y_pred):
contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
contingency_matrix = sklmc.contingency_matrix(y_true, y_pred)
purity = np.sum(np.amax(contingency_matrix,
axis=0)) / np.sum(contingency_matrix)
return purity
73 changes: 36 additions & 37 deletions emission/analysis/modelling/trip_model/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,18 @@
import copy

# sklearn imports
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics.pairwise import haversine_distances
from sklearn.cluster import DBSCAN
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.exceptions import NotFittedError
import sklearn.pipeline as sklpl
import sklearn.preprocessing as sklpp
import sklearn.impute as skli
import sklearn.metrics.pairwise as sklmp
import sklearn.cluster as sklc
import sklearn.svm as skls
import sklearn.ensemble as sklen
import sklearn.tree as sklt
import sklearn.exceptions as sklex

# our imports
from emission.analysis.modelling.trip_model.clustering import get_distance_matrix, single_cluster_purity
import emission.analysis.modelling.trip_model.clustering as eamtcl
import emission.analysis.modelling.trip_model.data_wrangling as eamtd
import emission.storage.decorations.trip_queries as esdtq
import emission.analysis.classification.inference.labels.inferrers as eacili
Expand All @@ -28,7 +28,6 @@
import emission.analysis.modelling.trip_model.run_model as eamur


import emission.analysis.modelling.trip_model.clustering as eamtc
# NOTE: tour_model_extended.similarity is on the
# eval-private-data-compatibility branch in e-mission-server

Expand Down Expand Up @@ -334,7 +333,7 @@ def fit(self, unused,train_entry_list=None):

# fit the bins
self.sim_model= eamtg.GreedySimilarityBinning(model_config)
cleaned_trip_entry= eamtc.cleanEntryTypeData(self.train_df,train_entry_list)
cleaned_trip_entry= eamtcl.cleanEntryTypeData(self.train_df,train_entry_list)
self.sim_model.fit(cleaned_trip_entry)

labels = [int(l) for l in self.sim_model.tripLabels]
Expand Down Expand Up @@ -526,8 +525,8 @@ def fit(self, train_df,unused=None):
#########################
### get base clusters ###
#########################
dist_matrix_meters = get_distance_matrix(self.train_df, self.loc_type)
self.base_model = DBSCAN(self.radius,
dist_matrix_meters = eamtcl.get_distance_matrix(self.train_df, self.loc_type)
self.base_model = sklc.DBSCAN(self.radius,
metric="precomputed",
min_samples=1).fit(dist_matrix_meters)
base_clusters = self.base_model.labels_
Expand Down Expand Up @@ -557,17 +556,17 @@ def fit(self, train_df,unused=None):
continue

# only do SVM if purity is below threshold
purity = single_cluster_purity(points_in_cluster,
purity = eamtcl.single_cluster_purity(points_in_cluster,
label_col='purpose_true')
if purity < self.purity_thresh:
X = points_in_cluster[[
f"{self.loc_type}_lon", f"{self.loc_type}_lat"
]]
y = points_in_cluster.purpose_true.to_list()

svm_model = make_pipeline(
StandardScaler(),
svm.SVC(
svm_model = sklpl.make_pipeline(
sklpp.StandardScaler(),
skls.SVC(
kernel='rbf',
gamma=self.gamma,
C=self.C,
Expand Down Expand Up @@ -655,7 +654,7 @@ def _NN_predict(self, test_df):
new_loc_radians = np.radians(
row[[self.loc_type + "_lat", self.loc_type + "_lon"]].to_list())
new_loc_radians = np.reshape(new_loc_radians, (1, 2))
dist_matrix_meters = haversine_distances(
dist_matrix_meters = sklmp.haversine_distances(
new_loc_radians, train_radians) * EARTH_RADIUS

shortest_dist_idx = np.argmin(dist_matrix_meters)
Expand Down Expand Up @@ -1259,7 +1258,7 @@ def predict_proba(self, test_df):

mode_proba, replaced_proba = self._try_predict_proba_mode_replaced()

except NotFittedError as e:
except sklex.NotFittedError as e:
# if we can't predict purpose, we can still try to predict mode and
# replaced-mode without one-hot encoding the purpose

Expand All @@ -1277,7 +1276,7 @@ def predict_proba(self, test_df):
and replaced_pred.dtype == np.float64):
# this indicates that all the predictions are np.nan so none of the
# random forest classifiers were fitted
raise NotFittedError
raise sklex.NotFittedError

# TODO: move this to a Mixin for cluster-based predictors and use the
# 'cluster' column of the proba_df outputs
Expand Down Expand Up @@ -1379,7 +1378,7 @@ class probabilities for mode and replaced-mode respectively
[self.X_test_for_mode, onehot_mode_df], axis=1)
replaced_proba = self._try_predict_proba_replaced()

except NotFittedError as e:
except sklex.NotFittedError as e:
mode_proba_raw = np.full((len(self.X_test_for_mode), 1), 0)
mode_proba = pd.DataFrame(mode_proba_raw, columns=[np.nan])

Expand Down Expand Up @@ -1409,7 +1408,7 @@ def _try_predict_proba_replaced(self):
replaced_proba = pd.DataFrame(
replaced_proba_raw, columns=self.replaced_predictor.classes_)

except NotFittedError as e:
except sklex.NotFittedError as e:
replaced_proba_raw = np.full((len(self.X_test_for_replaced), 1), 0)
replaced_proba = pd.DataFrame(replaced_proba_raw, columns=[np.nan])

Expand All @@ -1436,7 +1435,7 @@ def _clusterable(self, test_df):
# haversine distance, so we have to reimplement it ourselves
new_loc_radians = np.radians(row[["end_lat", "end_lon"]].to_list())
new_loc_radians = np.reshape(new_loc_radians, (1, 2))
dist_matrix_meters = haversine_distances(
dist_matrix_meters = sklmp.haversine_distances(
new_loc_radians, train_radians) * EARTH_RADIUS

shortest_dist = np.min(dist_matrix_meters)
Expand Down Expand Up @@ -1569,7 +1568,7 @@ def __init__(
handle_unknown='error')

# ensemble classifiers for each label category
self.purpose_predictor = RandomForestClassifier(
self.purpose_predictor = sklen.RandomForestClassifier(
n_estimators=self.n_estimators,
criterion=self.criterion,
max_depth=self.max_depth,
Expand All @@ -1578,7 +1577,7 @@ def __init__(
max_features=self.max_features,
bootstrap=self.bootstrap,
random_state=self.random_state)
self.mode_predictor = RandomForestClassifier(
self.mode_predictor = sklen.RandomForestClassifier(
n_estimators=self.n_estimators,
criterion=self.criterion,
max_depth=self.max_depth,
Expand All @@ -1587,7 +1586,7 @@ def __init__(
max_features=self.max_features,
bootstrap=self.bootstrap,
random_state=self.random_state)
self.replaced_predictor = RandomForestClassifier(
self.replaced_predictor = sklen.RandomForestClassifier(
n_estimators=self.n_estimators,
criterion=self.criterion,
max_depth=self.max_depth,
Expand Down Expand Up @@ -1842,33 +1841,33 @@ def __init__(
sparse=False,
handle_unknown='error')

self.purpose_predictor = AdaBoostClassifier(
self.purpose_predictor = sklen.AdaBoostClassifier(
n_estimators=self.n_estimators,
learning_rate=self.learning_rate,
random_state=self.random_state,
base_estimator=DecisionTreeClassifier(
base_estimator=sklt.DecisionTreeClassifier(
criterion=self.criterion,
max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
min_samples_leaf=self.min_samples_leaf,
max_features=self.max_features,
random_state=self.random_state))
self.mode_predictor = AdaBoostClassifier(
self.mode_predictor = sklen.AdaBoostClassifier(
n_estimators=self.n_estimators,
learning_rate=self.learning_rate,
random_state=self.random_state,
base_estimator=DecisionTreeClassifier(
base_estimator=sklt.DecisionTreeClassifier(
criterion=self.criterion,
max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
min_samples_leaf=self.min_samples_leaf,
max_features=self.max_features,
random_state=self.random_state))
self.replaced_predictor = AdaBoostClassifier(
self.replaced_predictor = sklen.AdaBoostClassifier(
n_estimators=self.n_estimators,
learning_rate=self.learning_rate,
random_state=self.random_state,
base_estimator=DecisionTreeClassifier(
base_estimator=sklt.DecisionTreeClassifier(
criterion=self.criterion,
max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
Expand Down Expand Up @@ -2024,13 +2023,13 @@ def __init__(
):
self.impute_missing = impute_missing
if self.impute_missing:
self.encoder = make_pipeline(
SimpleImputer(missing_values=np.nan,
self.encoder = sklpl.make_pipeline(
skli.SimpleImputer(missing_values=np.nan,
strategy='constant',
fill_value='missing'),
OneHotEncoder(sparse=False, handle_unknown=handle_unknown))
sklpp.OneHotEncoder(sparse=False, handle_unknown=handle_unknown))
else:
self.encoder = OneHotEncoder(sparse=sparse,
self.encoder = sklpp.OneHotEncoder(sparse=sparse,
handle_unknown=handle_unknown)

def fit_transform(self, train_df, output_col_prefix=None):
Expand Down
2 changes: 2 additions & 0 deletions emission/analysis/modelling/trip_model/run_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ def update_trip_model(
time_query = time_query_from_pipeline if model.is_incremental else None
logging.debug(f'model type {model_type.name} is incremental? {model.is_incremental}')
logging.debug(f'time query for training data collection: {time_query}')

trips = _get_training_data(user_id, time_query)

# don't start training for a user that doesn't have at least $trips many trips
# (assume if a stored model exists for the user, that they met this requirement previously)
if len(trips) == 0:
Expand Down
3 changes: 3 additions & 0 deletions emission/analysis/modelling/trip_model/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@
import pandas as pd
from numpy.linalg import norm


def find_knee_point(values: List[float]) -> Tuple[float, int]:
"""for a list of values, find the value which represents the cut-off point
or "elbow" in the function when values are sorted.
copied from original similarity algorithm. permalink:
[https://github.com/e-mission/e-mission-server/blob/5b9e608154de15e32df4f70a07a5b95477e7dbf5/emission/analysis/modelling/tour_model/similarity.py#L256]
with `y` passed in as `values`
based on this stack overflow answer: https://stackoverflow.com/a/2022348/4803266
And summarized by the statement: "A quick way of finding the elbow is to draw a
Expand Down
6 changes: 3 additions & 3 deletions emission/tests/modellingTests/TestForestModelIntegration.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import emission.tests.common as etc
import emission.pipeline.intake_stage as epi
import logging
from bson.objectid import ObjectId
import bson.objectid as boi

import emission.analysis.modelling.trip_model.config as eamtc

Expand Down Expand Up @@ -60,8 +60,8 @@ def setUp(self):
for result_entry in train:
result_entry['data']['start_local_dt']=result_entry['metadata']['write_local_dt']
result_entry['data']['end_local_dt']=result_entry['metadata']['write_local_dt']
result_entry['data']['start_place']=ObjectId()
result_entry['data']['end_place']=ObjectId()
result_entry['data']['start_place']=boi.ObjectId()
result_entry['data']['end_place']=boi.ObjectId()
ts.bulk_insert(train)
# confirm data write did not fail
check_data = esda.get_entries(key="analysis/confirmed_trip", user_id=self.testUUID, time_query=None)
Expand Down
6 changes: 3 additions & 3 deletions emission/tests/modellingTests/TestForestModelLoadandSave.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import ByteString
import unittest
import logging
from unittest.mock import patch
import unittest.mock as um
import emission.analysis.modelling.trip_model.run_model as eamur
import emission.analysis.modelling.trip_model.model_type as eamumt
import emission.analysis.modelling.trip_model.model_storage as eamums
Expand Down Expand Up @@ -220,7 +220,7 @@ def mock_dump(*args,**kwargs):
# side_effect, which is set to mock_dump, is called instead of
# real joblib.dump function when 'to_dict' is invoked

with patch('joblib.dump',side_effect=mock_dump):
with um.patch('joblib.dump',side_effect=mock_dump):
with self.assertRaises(RuntimeError):
model.to_dict()

Expand Down Expand Up @@ -260,7 +260,7 @@ def mock_load(*args,**kwargs):
# side_effect, which is set to mock_load, is called instead of
# real joblib.load function when 'to_dict' is invoked

with patch('joblib.load',side_effect=mock_load):
with um.patch('joblib.load',side_effect=mock_load):
with self.assertRaises(RuntimeError):
deserialized_model.from_dict(model_data)

Expand Down
8 changes: 4 additions & 4 deletions emission/tests/modellingTests/TestRunForestModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import emission.storage.pipeline_queries as epq
import emission.core.wrapper.pipelinestate as ecwp
import emission.analysis.modelling.trip_model.forest_classifier as eamtf
from sklearn.ensemble import RandomForestClassifier
import sklearn.ensemble as sklen

class TestRunForestModel(unittest.TestCase):
"""
Expand Down Expand Up @@ -98,9 +98,9 @@ def testBuildForestModelFromConfig(self):
"""

built_model = eamumt.ModelType.RANDOM_FOREST_CLASSIFIER.build()
attributes={'purpose_predictor': RandomForestClassifier ,
'mode_predictor' :RandomForestClassifier,
'replaced_predictor':RandomForestClassifier,
attributes={'purpose_predictor': sklen.RandomForestClassifier ,
'mode_predictor' :sklen.RandomForestClassifier,
'replaced_predictor':sklen.RandomForestClassifier,
'purpose_enc' : eamtm.OneHotWrapper,
'mode_enc':eamtm.OneHotWrapper
}
Expand Down

0 comments on commit 6d4cc3a

Please sign in to comment.