diff --git a/emission/tests/modellingTests/TestForestModelIntegration.py b/emission/tests/modellingTests/TestForestModelIntegration.py
index 6677221a3..e08345f5d 100644
--- a/emission/tests/modellingTests/TestForestModelIntegration.py
+++ b/emission/tests/modellingTests/TestForestModelIntegration.py
@@ -1,7 +1,8 @@
-# This tests the label inference pipeline. It uses real data and placeholder inference algorithms
 import unittest
 import numpy as np
 import time
+import logging
+import bson.objectid as boi
 import emission.analysis.classification.inference.labels.pipeline as eacilp
 import emission.analysis.classification.inference.labels.inferrers as eacili
 import emission.core.wrapper.labelprediction as ecwl
@@ -11,30 +12,29 @@
 import emission.core.get_database as edb
 import emission.tests.common as etc
 import emission.pipeline.intake_stage as epi
-import logging
-import bson.objectid as boi
-
 import emission.analysis.modelling.trip_model.config as eamtc
-
 import emission.analysis.modelling.trip_model.run_model as eamur
 import emission.analysis.modelling.trip_model.model_type as eamumt
 import emission.analysis.modelling.trip_model.model_storage as eamums
 import emission.tests.modellingTests.modellingTestAssets as etmm
 import emission.storage.timeseries.abstract_timeseries as esta
-
 class TestForestModelIntegration(unittest.TestCase):
-    # Test if the forest model for label prediction is smoothly integrated with the inference pipeline.
-    # In the initial setup, build a dummy forest model. Then run the pipeline on real example data.
-    # Finally in the test, assert the type of label predictions expected.
-
+    """
+    This tests the label inference pipeline. It uses real data and placeholder inference algorithms.
+    Test if the forest model for label prediction is smoothly integrated with the inference pipeline.
+    In the initial setup, build a dummy forest model. Then run the pipeline on real example data.
+    Finally, in the test, assert the expected type of the label predictions.
+    The label_data dict and mock_trip_data are copied over from TestRunGreedyModel.py
+    """
     def setUp(self):
         np.random.seed(91)
         self.test_algorithms = eacilp.primary_algorithms
         forest_model_config = eamtc.get_config_value_or_raise('model_parameters.forest')
-
         etc.setupRealExample(self, "emission/tests/data/real_examples/shankari_2015-07-22") ##maybe use a different file
         ts = esta.TimeSeries.get_time_series(self.testUUID)
+
+        # Generate labels with a known sample weight that we can rely on in the test
         label_data = {
             "mode_confirm": ['ebike', 'bike'],
             "purpose_confirm": ['happy-hour', 'dog-park'],
@@ -43,11 +43,10 @@ def setUp(self):
             "purpose_weights": [0.1, 0.9]
         }
 
-        self.total_trips=100
-        ## generate mock trips
-        train = etmm.generate_mock_trips(
+        # Configuration values for randomly-generated test data copied over from TestRunGreedyModel.py
+        mock_trip_data = etmm.generate_mock_trips(
             user_id=self.testUUID,
-            trips=self.total_trips,
+            trips=100,
             origin=(-105.1705977, 39.7402654),
             destination=(-105.1755606, 39.7673075),
             trip_part='od',
@@ -56,35 +55,37 @@ def setUp(self):
             threshold=0.004, # ~400m
             has_label_p=0.9
         )
-        ## Required for Forest model inference
-        for result_entry in train:
+
+        # Required for Forest model inference
+        for result_entry in mock_trip_data:
             result_entry['data']['start_local_dt']=result_entry['metadata']['write_local_dt']
             result_entry['data']['end_local_dt']=result_entry['metadata']['write_local_dt']
             result_entry['data']['start_place']=boi.ObjectId()
             result_entry['data']['end_place']=boi.ObjectId()
-        ts.bulk_insert(train)
-        # confirm data write did not fail
-        check_data = esda.get_entries(key="analysis/confirmed_trip", user_id=self.testUUID, time_query=None)
-        if len(check_data) != self.total_trips:
-            logging.debug(f'test invariant failed after generating test data')
-            self.fail()
-        else:
-            logging.debug(f'found {self.total_trips} trips in database')
-        ## Build an already existing model or a new model
+
+        split = int(len(mock_trip_data)*0.7)
+        mock_train_data = mock_trip_data[:split]
+        self.mock_test_data = mock_trip_data[split:]
+
+        ts.bulk_insert(mock_train_data)
+
+        # Build and train model
+        logging.debug(f'(TRAIN) creating a model based on trips in database')
         eamur.update_trip_model(
             user_id=self.testUUID,
             model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER,
             model_storage=eamums.ModelStorage.DOCUMENT_DATABASE,
-            min_trips=4,
+            min_trips=14,
             model_config=forest_model_config
         )
-        ## run inference pipeline
+
+        # Run inference pipeline
        self.run_pipeline(self.test_algorithms)
         time_range = estt.TimeQuery("metadata.write_ts", None, time.time())
         self.inferred_trips = esda.get_entries(esda.INFERRED_TRIP_KEY, self.testUUID, time_query=time_range)
 
     def tearDown(self):
-        self.reset_all()
+        etc.dropAllCollections(edb._get_current_db())
 
     def run_pipeline(self, algorithms):
         default_primary_algorithms = eacilp.primary_algorithms
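The new 70/30 split and the bumped min_trips=14 interact: training only sees the first 70 of the 100 generated trips, so the threshold must stay below that. A minimal, self-contained sketch of the arithmetic (list contents are stand-ins; only the literals 100, 0.7, and 14 come from the hunk above):

    # Stand-in for the list returned by etmm.generate_mock_trips(...)
    mock_trip_data = [f"trip_{i}" for i in range(100)]
    split = int(len(mock_trip_data) * 0.7)    # 70
    mock_train_data = mock_trip_data[:split]  # 70 trips written to the DB for training
    mock_test_data = mock_trip_data[split:]   # 30 trips held out
    assert len(mock_train_data) >= 14         # clears the min_trips threshold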
@@ -92,25 +93,42 @@ def run_pipeline(self, algorithms):
         epi.run_intake_pipeline_for_user(self.testUUID,skip_if_no_new_data = False)
         eacilp.primary_algorithms = default_primary_algorithms
 
-    def reset_all(self):
-        edb.get_analysis_timeseries_db().delete_many({'user_id': self.testUUID})
-        edb.get_model_db().delete_many({'user_id': self.testUUID})
-        edb.get_pipeline_state_db().delete_many({'user_id': self.testUUID})
-
-
-    # Tests that forest algorithm being tested runs successfully
     def testForestAlgorithm(self):
+        '''
+        Tests that the forest algorithm runs successfully when called from the analysis pipeline.
+        The tests are based on the existing tests in TestLabelInferencePipeline.py
+        '''
+        valid_modes = ['ebike', 'bike']
+        valid_purposes = ['happy-hour', 'dog-park']
+
         for trip in self.inferred_trips:
             entries = esdt.get_sections_for_trip("inference/labels", self.testUUID, trip.get_id())
             self.assertEqual(len(entries), len(self.test_algorithms))
             for entry in entries:
-                self.assertGreater(len(entry["data"]["prediction"]), 0)
+                # Test 1: Check that a non-empty prediction list is generated
+                self.assertGreater(len(entry["data"]["prediction"]), 0, "Prediction list should not be empty - model failed to generate any predictions")
+
+                # Test 2: Check that the trip's inferred labels match the prediction value in the entry
+                self.assertEqual(trip["data"]["inferred_labels"], entry["data"]["prediction"])
+
+                # Test 3: Check that the prediction value in the entry equals the prediction generated by the algorithm
+                this_algorithm = ecwl.AlgorithmTypes(entry["data"]["algorithm_id"])
+                self.assertIn(this_algorithm, self.test_algorithms)
+                self.assertEqual(entry["data"]["prediction"], self.test_algorithms[this_algorithm]([trip])[0])
+
                 for singleprediction in entry["data"]["prediction"]:
-                    self.assertIsInstance(singleprediction, dict, " should be an instance of the dictionary class")
-                    self.assertIsInstance(singleprediction['labels'], dict, " should be an instance of the dictionary class")
-                    self.assertIn('mode_confirm',singleprediction['labels'].keys())
-                    self.assertIn('replaced_mode',singleprediction['labels'].keys())
-                    self.assertIn('purpose_confirm',singleprediction['labels'].keys())
+                    # Test 4: Check that the prediction is a dictionary
+                    self.assertIsInstance(singleprediction, dict, "should be an instance of the dictionary class")
+                    self.assertIsInstance(singleprediction['labels'], dict, "should be an instance of the dictionary class")
+
+                    # Test 5: Check that the prediction dictionary contains the required keys
+                    self.assertIn('mode_confirm', singleprediction['labels'].keys())
+                    self.assertIn('replaced_mode', singleprediction['labels'].keys())
+                    self.assertIn('purpose_confirm', singleprediction['labels'].keys())
+
+                    # Test 6: Check that the prediction dictionary contains the expected values
+                    self.assertIn(singleprediction['labels']['mode_confirm'], valid_modes)
+                    self.assertIn(singleprediction['labels']['purpose_confirm'], valid_purposes)
 
 def main():
     etc.configLogging()
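Tests 4 through 6 above pin down the shape of each element of entry["data"]["prediction"]. A hypothetical element that satisfies them (the label values are invented for illustration; only the keys and the valid_modes/valid_purposes constraints come from the test):

    singleprediction = {
        "labels": {
            "mode_confirm": "ebike",        # must be in valid_modes
            "replaced_mode": "walk",        # key must be present; value is not constrained here
            "purpose_confirm": "dog-park",  # must be in valid_purposes
        },
        "p": 0.9,                           # confidence, used for ranking elsewhere
    }
    assert isinstance(singleprediction["labels"], dict)
    assert {"mode_confirm", "replaced_mode", "purpose_confirm"} <= singleprediction["labels"].keys()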
diff --git a/emission/tests/modellingTests/TestForestModelLoadandSave.py b/emission/tests/modellingTests/TestForestModelLoadandSave.py
index 431b9ddb3..e92d4273a 100644
--- a/emission/tests/modellingTests/TestForestModelLoadandSave.py
+++ b/emission/tests/modellingTests/TestForestModelLoadandSave.py
@@ -17,27 +17,14 @@
 class TestForestModelLoadandSave(unittest.TestCase):
     """
     Tests to make sure the model load and save properly
+    The label_data dict and mock_trip_data are copied over from TestRunGreedyModel.py
     """
-    def setUp(self):
-        """
-        sets up the end-to-end run model test with Confirmedtrip data
-        """
-        # configuration for randomly-generated test data
-        self.user_id = user_id = 'TestForestModelLoadAndSave-TestData'
-        self.origin = (-105.1705977, 39.7402654,)
-        self.destination = (-105.1755606, 39.7673075)
-        self.min_trips = 14
-        self.total_trips = 100
-        self.clustered_trips = 33    # must have at least self.min_trips similar trips by default
-        self.has_label_percent = 0.9 # let's make a few that don't have a label, but invariant
-                                     # $clustered_trips * $has_label_percent > self.min_trips
-                                     # must be correct or else this test could fail under some random test cases.
-
+    def setUp(self):
+        self.user_id = 'TestForestModelLoadAndSave-TestData'
         self.unused_user_id = 'asdjfkl;asdfjkl;asd08234ur13fi4jhf2103mkl'
+        ts = esta.TimeSeries.get_time_series(self.user_id)
 
-        ts = esta.TimeSeries.get_time_series(user_id)
-
-        # generate labels with a known sample weight that we can rely on in the test
+        # Generate labels with a known sample weight that we can rely on in the test
         label_data = {
             "mode_confirm": ['ebike', 'bike'],
             "purpose_confirm": ['happy-hour', 'dog-park'],
@@ -46,24 +33,29 @@ def setUp(self):
             "purpose_weights": [0.1, 0.9]
         }
 
-        # generate test data for the database
-        test_data = etmm.generate_mock_trips(
-            user_id=user_id,
-            trips=self.total_trips,
-            origin=self.origin,
-            destination=self.destination,
+        # Configuration values for randomly-generated test data copied over from TestRunGreedyModel.py
+        mock_trip_data = etmm.generate_mock_trips(
+            user_id=self.user_id,
+            trips=100,
+            origin=(-105.1705977, 39.7402654,),
+            destination=(-105.1755606, 39.7673075),
             trip_part='od',
             label_data=label_data,
-            within_threshold=self.clustered_trips,
+            within_threshold=33,
             threshold=0.004, # ~400m
-            has_label_p=self.has_label_percent
+            has_label_p=0.9
         )
 
-        for result_entry in test_data:
+        # Required for Forest model inference
+        for result_entry in mock_trip_data:
             result_entry['data']['start_local_dt']=result_entry['metadata']['write_local_dt']
             result_entry['data']['end_local_dt']=result_entry['metadata']['write_local_dt']
 
-        ts.bulk_insert(test_data)
+        split = int(len(mock_trip_data)*0.7)
+        mock_train_data = mock_trip_data[:split]
+        self.mock_test_data = mock_trip_data[split:]
+
+        ts.bulk_insert(mock_train_data)
 
         self.forest_model_config= eamtc.get_config_value_or_raise('model_parameters.forest')
@@ -73,7 +65,7 @@ def setUp(self):
             user_id=self.user_id,
             model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER,
             model_storage=eamums.ModelStorage.DOCUMENT_DATABASE,
-            min_trips=self.min_trips,
+            min_trips=14,
             model_config=self.forest_model_config
         )
 
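The deleted setUp comments documented an invariant between the old instance attributes; it still holds for the hard-coded literals that replace them (33 for within_threshold, 0.9 for has_label_p, 14 for min_trips), as this executable restatement shows:

    clustered_trips = 33       # within_threshold in generate_mock_trips above
    has_label_percent = 0.9    # has_label_p above
    min_trips = 14             # threshold passed to update_trip_model
    # enough of the clustered trips must carry labels for training to proceed
    assert clustered_trips * has_label_percent > min_trips   # 29.7 > 14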
@@ -98,10 +90,8 @@ def testForestModelPredictionsEquality(self):
         The type of deserialized model attributes and the predictions of this
         must match those of initial model.
         """
-        test_trip_data = esda.get_entries(key=esda.CONFIRMED_TRIP_KEY, user_id=self.user_id, time_query=None)
-
         predictions_list = eamur.predict_labels_with_n(
-            trip_list = test_trip_data,
+            trip_list = self.mock_test_data,
             model=self.model
         )
 
@@ -111,7 +101,7 @@ def testForestModelPredictionsEquality(self):
         deserialized_model.from_dict(model_data)
 
         predictions_deserialized_model_list = eamur.predict_labels_with_n(
-            trip_list = test_trip_data,
+            trip_list = self.mock_test_data,
             model=deserialized_model
         )
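testForestModelPredictionsEquality exercises a round-trip property: serialize with to_dict, rebuild with from_dict, and require identical predictions. A toy, self-contained stand-in for that property (TinyModel and its parameter are invented for illustration; this is not the forest model class):

    class TinyModel:
        # Minimal object with the to_dict/from_dict hooks the test relies on.
        def __init__(self):
            self.params = {}

        def to_dict(self):
            return {"params": dict(self.params)}

        def from_dict(self, d):
            self.params = dict(d["params"])

    m1 = TinyModel()
    m1.params = {"n_estimators": 100}    # invented parameter for illustration
    m2 = TinyModel()
    m2.from_dict(m1.to_dict())           # serialize, then rebuild
    assert m1.to_dict() == m2.to_dict()  # the round trip preserves state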
""" - test_trip_data = esda.get_entries(key=esda.CONFIRMED_TRIP_KEY, user_id=self.user_id, time_query=None) - predictions_list = eamur.predict_labels_with_n( - trip_list = test_trip_data, + trip_list = self.mock_test_data, model=self.model ) @@ -111,7 +101,7 @@ def testForestModelPredictionsEquality(self): deserialized_model.from_dict(model_data) predictions_deserialized_model_list = eamur.predict_labels_with_n( - trip_list = test_trip_data, + trip_list = self.mock_test_data, model=deserialized_model ) @@ -130,10 +120,8 @@ def testForestModelConsistency(self): ConsistencyTest : To Verify that the serialization and deserialization process is consistent across multiple executions """ - test_trip_data = esda.get_entries(key=esda.CONFIRMED_TRIP_KEY, user_id=self.user_id, time_query=None) - predictions_list_model1 = eamur.predict_labels_with_n( - trip_list = test_trip_data, + trip_list = self.mock_test_data, model=self.model ) @@ -145,7 +133,7 @@ def testForestModelConsistency(self): ) predictions_list_model2 = eamur.predict_labels_with_n( - trip_list = test_trip_data, + trip_list = self.mock_test_data, model=model_iter2 ) diff --git a/emission/tests/modellingTests/TestRunForestModel.py b/emission/tests/modellingTests/TestRunForestModel.py index 672775483..6ecad60a5 100644 --- a/emission/tests/modellingTests/TestRunForestModel.py +++ b/emission/tests/modellingTests/TestRunForestModel.py @@ -155,7 +155,7 @@ def testTrainForestModelWithZeroTrips(self): "pipeline should not have a current timestamp for the test user") - def test1RoundPredictForestModel(self): + def testRoundPredictForestModel(self): """ forest model takes config arguments via the constructor for testing purposes but will load from a file in /conf/analysis/ which is tested here @@ -204,11 +204,11 @@ def test1RoundPredictForestModel(self): ) for prediction, n in predictions_list: [logging.debug(p) for p in sorted(prediction, key=lambda r: r['p'], reverse=True)] - self.assertNotEqual(len(prediction), 0, "should have a prediction") + self.assertNotEqual(len(prediction), 0, "Prediction list should not be empty - model failed to generate any predictions") self.assertIn('labels',prediction[0].keys()) self.assertIn('p',prediction[0].keys()) - self.assertIsInstance(prediction[0], dict, " should be an instance of the dictionary class") - self.assertIsInstance(prediction[0]['labels'], dict, " should be an instance of the dictionary class") + self.assertIsInstance(prediction[0], dict, "should be an instance of the dictionary class") + self.assertIsInstance(prediction[0]['labels'], dict, "should be an instance of the dictionary class") self.assertIn('mode_confirm',prediction[0]['labels'].keys()) self.assertIn('replaced_mode',prediction[0]['labels'].keys()) self.assertIn('purpose_confirm',prediction[0]['labels'].keys()) \ No newline at end of file