Skip to content

Commit

Permalink
Using train / test data split + Added value-check tests + Reduced instance variables
Browse files Browse the repository at this point in the history

1. Split up mock trips data into train / test data.
- Saw that this was being done in one of the tests in TestForestModelLoadandSave.py itself as well as in TestGreedySimilarityBinning.py
- Hence added it for all tests in forest model tests for uniformity.

2. Reduced the number of instance variables, since they were used only inside setUp().
This addresses the review comment originally made on TestForestModelIntegration:
e-mission#938 (comment)

3. Cleaned up TestForestModelIntegration.py
- Added equality tests that check for prediction values generated in pipeline.
Addresses review comment:
e-mission#938 (comment)

- Added train / test data split.

- Removed check for empty data in setUp()
Addresses review comment:
e-mission#938 (comment)
  • Loading branch information
Mahadik, Mukul Chandrakant authored and Mahadik, Mukul Chandrakant committed Nov 22, 2024
1 parent e7f5d21 commit 6daf0b8
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 83 deletions.
102 changes: 60 additions & 42 deletions emission/tests/modellingTests/TestForestModelIntegration.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# This tests the label inference pipeline. It uses real data and placeholder inference algorithms
import unittest
import numpy as np
import time
import logging
import bson.objectid as boi
import emission.analysis.classification.inference.labels.pipeline as eacilp
import emission.analysis.classification.inference.labels.inferrers as eacili
import emission.core.wrapper.labelprediction as ecwl
Expand All @@ -11,30 +12,29 @@
import emission.core.get_database as edb
import emission.tests.common as etc
import emission.pipeline.intake_stage as epi
import logging
import bson.objectid as boi

import emission.analysis.modelling.trip_model.config as eamtc

import emission.analysis.modelling.trip_model.run_model as eamur
import emission.analysis.modelling.trip_model.model_type as eamumt
import emission.analysis.modelling.trip_model.model_storage as eamums
import emission.tests.modellingTests.modellingTestAssets as etmm
import emission.storage.timeseries.abstract_timeseries as esta


class TestForestModelIntegration(unittest.TestCase):
# Test if the forest model for label prediction is smoothly integrated with the inference pipeline.
# In the initial setup, build a dummy forest model. Then run the pipeline on real example data.
# Finally in the test, assert the type of label predictions expected.

"""
This tests the label inference pipeline. It uses real data and placeholder inference algorithms.
Test if the forest model for label prediction is smoothly integrated with the inference pipeline.
In the initial setup, build a dummy forest model. Then run the pipeline on real example data.
Finally in the test, assert the type of label predictions expected.
The label_data dict and mock_trip_data are copied over from TestRunGreedyModel.py
"""
def setUp(self):
np.random.seed(91)
self.test_algorithms = eacilp.primary_algorithms
forest_model_config = eamtc.get_config_value_or_raise('model_parameters.forest')

etc.setupRealExample(self, "emission/tests/data/real_examples/shankari_2015-07-22") ##maybe use a different file
ts = esta.TimeSeries.get_time_series(self.testUUID)

# Generate labels with a known sample weight that we can rely on in the test
label_data = {
"mode_confirm": ['ebike', 'bike'],
"purpose_confirm": ['happy-hour', 'dog-park'],
Expand All @@ -43,11 +43,10 @@ def setUp(self):
"purpose_weights": [0.1, 0.9]
}

self.total_trips=100
## generate mock trips
train = etmm.generate_mock_trips(
# Configuration values for randomly-generated test data copied over from TestRunGreedyModel.py
mock_trip_data = etmm.generate_mock_trips(
user_id=self.testUUID,
trips=self.total_trips,
trips=100,
origin=(-105.1705977, 39.7402654),
destination=(-105.1755606, 39.7673075),
trip_part='od',
Expand All @@ -56,61 +55,80 @@ def setUp(self):
threshold=0.004, # ~400m
has_label_p=0.9
)
## Required for Forest model inference
for result_entry in train:

# Required for Forest model inference
for result_entry in mock_trip_data:
result_entry['data']['start_local_dt']=result_entry['metadata']['write_local_dt']
result_entry['data']['end_local_dt']=result_entry['metadata']['write_local_dt']
result_entry['data']['start_place']=boi.ObjectId()
result_entry['data']['end_place']=boi.ObjectId()
ts.bulk_insert(train)
# confirm data write did not fail
check_data = esda.get_entries(key="analysis/confirmed_trip", user_id=self.testUUID, time_query=None)
if len(check_data) != self.total_trips:
logging.debug(f'test invariant failed after generating test data')
self.fail()
else:
logging.debug(f'found {self.total_trips} trips in database')
## Build an already existing model or a new model

split = int(len(mock_trip_data)*0.7)
mock_train_data = mock_trip_data[:split]
self.mock_test_data = mock_trip_data[split:]

ts.bulk_insert(mock_train_data)

# Build and train model
logging.debug(f'(TRAIN) creating a model based on trips in database')
eamur.update_trip_model(
user_id=self.testUUID,
model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER,
model_storage=eamums.ModelStorage.DOCUMENT_DATABASE,
min_trips=4,
min_trips=14,
model_config=forest_model_config
)
## run inference pipeline

# Run inference pipeline
self.run_pipeline(self.test_algorithms)
time_range = estt.TimeQuery("metadata.write_ts", None, time.time())
self.inferred_trips = esda.get_entries(esda.INFERRED_TRIP_KEY, self.testUUID, time_query=time_range)

def tearDown(self):
self.reset_all()
etc.dropAllCollections(edb._get_current_db())

def run_pipeline(self, algorithms):
default_primary_algorithms = eacilp.primary_algorithms
eacilp.primary_algorithms = algorithms
epi.run_intake_pipeline_for_user(self.testUUID,skip_if_no_new_data = False)
eacilp.primary_algorithms = default_primary_algorithms

def reset_all(self):
edb.get_analysis_timeseries_db().delete_many({'user_id': self.testUUID})
edb.get_model_db().delete_many({'user_id': self.testUUID})
edb.get_pipeline_state_db().delete_many({'user_id': self.testUUID})


# Tests that forest algorithm being tested runs successfully
def testForestAlgorithm(self):
'''
Tests that forest algorithm runs successfully when called from the analysis pipeline
The tests are based on the existing tests in TestLabelInferencePipeline.py
'''
valid_modes = ['ebike', 'bike']
valid_purposes = ['happy-hour', 'dog-park']

for trip in self.inferred_trips:
entries = esdt.get_sections_for_trip("inference/labels", self.testUUID, trip.get_id())
self.assertEqual(len(entries), len(self.test_algorithms))
for entry in entries:
self.assertGreater(len(entry["data"]["prediction"]), 0)
# Test 1: Check that non-empty prediction list is generated
self.assertGreater(len(entry["data"]["prediction"]), 0, "Prediction list should not be empty - model failed to generate any predictions")

# Test 2: Check for equality of trip inferred labels and prediction value in entry
self.assertEqual(trip["data"]["inferred_labels"], entry["data"]["prediction"])

# Test 3: Check that prediction value in entry is equal to the prediction generated by the algorithm
this_algorithm = ecwl.AlgorithmTypes(entry["data"]["algorithm_id"])
self.assertIn(this_algorithm, self.test_algorithms)
self.assertEqual(entry["data"]["prediction"], self.test_algorithms[this_algorithm]([trip])[0])

for singleprediction in entry["data"]["prediction"]:
self.assertIsInstance(singleprediction, dict, " should be an instance of the dictionary class")
self.assertIsInstance(singleprediction['labels'], dict, " should be an instance of the dictionary class")
self.assertIn('mode_confirm',singleprediction['labels'].keys())
self.assertIn('replaced_mode',singleprediction['labels'].keys())
self.assertIn('purpose_confirm',singleprediction['labels'].keys())
# Test 4: Check that the prediction is a dictionary
self.assertIsInstance(singleprediction, dict, "should be an instance of the dictionary class")
self.assertIsInstance(singleprediction['labels'], dict, "should be an instance of the dictionary class")

# Test 5: Check that the prediction dictionary contains the required keys
self.assertIn('mode_confirm', singleprediction['labels'].keys())
self.assertIn('replaced_mode', singleprediction['labels'].keys())
self.assertIn('purpose_confirm', singleprediction['labels'].keys())

# Test 6: Check that the prediction dictionary contains the correct values
self.assertIn(singleprediction['labels']['mode_confirm'], valid_modes)
self.assertIn(singleprediction['labels']['purpose_confirm'], valid_purposes)

def main():
etc.configLogging()
Expand Down
62 changes: 25 additions & 37 deletions emission/tests/modellingTests/TestForestModelLoadandSave.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,27 +17,14 @@
class TestForestModelLoadandSave(unittest.TestCase):
"""
Tests to make sure the model load and save properly
The label_data dict and mock_trip_data are copied over from TestRunGreedyModel.py
"""
def setUp(self):
"""
sets up the end-to-end run model test with Confirmedtrip data
"""
# configuration for randomly-generated test data
self.user_id = user_id = 'TestForestModelLoadAndSave-TestData'
self.origin = (-105.1705977, 39.7402654,)
self.destination = (-105.1755606, 39.7673075)
self.min_trips = 14
self.total_trips = 100
self.clustered_trips = 33 # must have at least self.min_trips similar trips by default
self.has_label_percent = 0.9 # let's make a few that don't have a label, but invariant
# $clustered_trips * $has_label_percent > self.min_trips
# must be correct or else this test could fail under some random test cases.

def setUp(self):
self.user_id = 'TestForestModelLoadAndSave-TestData'
self.unused_user_id = 'asdjfkl;asdfjkl;asd08234ur13fi4jhf2103mkl'
ts = esta.TimeSeries.get_time_series(self.user_id)

ts = esta.TimeSeries.get_time_series(user_id)

# generate labels with a known sample weight that we can rely on in the test
# Generate labels with a known sample weight that we can rely on in the test
label_data = {
"mode_confirm": ['ebike', 'bike'],
"purpose_confirm": ['happy-hour', 'dog-park'],
Expand All @@ -46,24 +33,29 @@ def setUp(self):
"purpose_weights": [0.1, 0.9]
}

# generate test data for the database
test_data = etmm.generate_mock_trips(
user_id=user_id,
trips=self.total_trips,
origin=self.origin,
destination=self.destination,
# Configuration values for randomly-generated test data copied over from TestRunGreedyModel.py
mock_trip_data = etmm.generate_mock_trips(
user_id=self.user_id,
trips=100,
origin=(-105.1705977, 39.7402654,),
destination=(-105.1755606, 39.7673075),
trip_part='od',
label_data=label_data,
within_threshold=self.clustered_trips,
within_threshold=33,
threshold=0.004, # ~400m
has_label_p=self.has_label_percent
has_label_p=0.9
)

for result_entry in test_data:
# Required for Forest model inference
for result_entry in mock_trip_data:
result_entry['data']['start_local_dt']=result_entry['metadata']['write_local_dt']
result_entry['data']['end_local_dt']=result_entry['metadata']['write_local_dt']

ts.bulk_insert(test_data)
split = int(len(mock_trip_data)*0.7)
mock_train_data = mock_trip_data[:split]
self.mock_test_data = mock_trip_data[split:]

ts.bulk_insert(mock_train_data)

self.forest_model_config= eamtc.get_config_value_or_raise('model_parameters.forest')

Expand All @@ -73,7 +65,7 @@ def setUp(self):
user_id=self.user_id,
model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER,
model_storage=eamums.ModelStorage.DOCUMENT_DATABASE,
min_trips=self.min_trips,
min_trips=14,
model_config=self.forest_model_config
)

Expand All @@ -98,10 +90,8 @@ def testForestModelPredictionsEquality(self):
The type of deserialized model attributes and the predictions of this must match
those of initial model.
"""
test_trip_data = esda.get_entries(key=esda.CONFIRMED_TRIP_KEY, user_id=self.user_id, time_query=None)

predictions_list = eamur.predict_labels_with_n(
trip_list = test_trip_data,
trip_list = self.mock_test_data,
model=self.model
)

Expand All @@ -111,7 +101,7 @@ def testForestModelPredictionsEquality(self):
deserialized_model.from_dict(model_data)

predictions_deserialized_model_list = eamur.predict_labels_with_n(
trip_list = test_trip_data,
trip_list = self.mock_test_data,
model=deserialized_model
)

Expand All @@ -130,10 +120,8 @@ def testForestModelConsistency(self):
ConsistencyTest : To Verify that the serialization and deserialization process
is consistent across multiple executions
"""
test_trip_data = esda.get_entries(key=esda.CONFIRMED_TRIP_KEY, user_id=self.user_id, time_query=None)

predictions_list_model1 = eamur.predict_labels_with_n(
trip_list = test_trip_data,
trip_list = self.mock_test_data,
model=self.model
)

Expand All @@ -145,7 +133,7 @@ def testForestModelConsistency(self):
)

predictions_list_model2 = eamur.predict_labels_with_n(
trip_list = test_trip_data,
trip_list = self.mock_test_data,
model=model_iter2
)

Expand Down
8 changes: 4 additions & 4 deletions emission/tests/modellingTests/TestRunForestModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def testTrainForestModelWithZeroTrips(self):
"pipeline should not have a current timestamp for the test user")


def test1RoundPredictForestModel(self):
def testRoundPredictForestModel(self):
"""
forest model takes config arguments via the constructor for testing
purposes but will load from a file in /conf/analysis/ which is tested here
Expand Down Expand Up @@ -204,11 +204,11 @@ def test1RoundPredictForestModel(self):
)
for prediction, n in predictions_list:
[logging.debug(p) for p in sorted(prediction, key=lambda r: r['p'], reverse=True)]
self.assertNotEqual(len(prediction), 0, "should have a prediction")
self.assertNotEqual(len(prediction), 0, "Prediction list should not be empty - model failed to generate any predictions")
self.assertIn('labels',prediction[0].keys())
self.assertIn('p',prediction[0].keys())
self.assertIsInstance(prediction[0], dict, " should be an instance of the dictionary class")
self.assertIsInstance(prediction[0]['labels'], dict, " should be an instance of the dictionary class")
self.assertIsInstance(prediction[0], dict, "should be an instance of the dictionary class")
self.assertIsInstance(prediction[0]['labels'], dict, "should be an instance of the dictionary class")
self.assertIn('mode_confirm',prediction[0]['labels'].keys())
self.assertIn('replaced_mode',prediction[0]['labels'].keys())
self.assertIn('purpose_confirm',prediction[0]['labels'].keys())

0 comments on commit 6daf0b8

Please sign in to comment.