From c1d2aee01cfbd14e5a32eb94f03f98129064f876 Mon Sep 17 00:00:00 2001
From: "Mahadik, Mukul Chandrakant"
Date: Mon, 5 Feb 2024 13:26:33 -0700
Subject: [PATCH] Addressed latest review comments

- Improved logging statements to include whether the model count has changed.
- Skipped the check for saved test data in the test file since the related collections are cleared in tearDown().
---
 .../modifiable/builtin_model_storage.py    | 19 +++---
 .../tests/storageTests/TestModelStorage.py | 65 +++++++++----------
 2 files changed, 39 insertions(+), 45 deletions(-)

diff --git a/emission/storage/modifiable/builtin_model_storage.py b/emission/storage/modifiable/builtin_model_storage.py
index 781d6133b..36f494d8a 100644
--- a/emission/storage/modifiable/builtin_model_storage.py
+++ b/emission/storage/modifiable/builtin_model_storage.py
@@ -23,7 +23,6 @@ def upsert_model(self, key:str, model: ecwb.WrapperBase):
         """
         logging.debug("upsert_doc called with key %s" % key)
         entry = ecwe.Entry.create_entry(self.user_id, key, model)
-        ## TODO: Cleanup old/obsolete models
         # Cleaning up older models, before inserting new model
         self.trim_model_entries(key)
         logging.debug("Inserting entry %s into model DB" % entry)
@@ -61,13 +60,13 @@ def trim_model_entries(self, key:str):
         The flow of model insertion function calls is:
         eamur.update_trip_model() -> eamums.save_model() -> esma.upsert_model() -> esma.trim_model_entries()
         """
-        current_model_count = edb.get_model_db().count_documents({"user_id": self.user_id})
-        logging.debug("Before trimming, model count for user %s = %s" % (self.user_id, current_model_count))
+        old_model_count = edb.get_model_db().count_documents({"user_id": self.user_id})
+        deleted_model_count = 0
         find_query = {"user_id": self.user_id, "metadata.key": key}
         result_it = edb.get_model_db().find(find_query).sort("metadata.write_ts", -1)
         result_list = list(result_it)
         maximum_stored_model_count = eamtc.get_maximum_stored_model_count()
-        if current_model_count >= maximum_stored_model_count:
+        if old_model_count >= maximum_stored_model_count:
             # Specify the last or minimum timestamp of Kth model entry
             write_ts_limit = result_list[maximum_stored_model_count - 1]['metadata']['write_ts']
             logging.debug(f"Write ts limit = {write_ts_limit}")
@@ -77,9 +76,11 @@ def trim_model_entries(self, key:str):
                 "metadata.write_ts" : { "$lte" : write_ts_limit }
             }
             models_to_delete = edb.get_model_db().delete_many(filter_clause)
-            if models_to_delete.deleted_count > 0:
-                logging.debug(f"{models_to_delete.deleted_count} documents deleted successfully\n")
-            else:
-                logging.debug("No documents found or none deleted\n")
+            deleted_model_count = models_to_delete.deleted_count
         new_model_count = edb.get_model_db().count_documents({"user_id": self.user_id})
-        logging.debug("After trimming, model count for user %s = %s" % (self.user_id, new_model_count))
\ No newline at end of file
+        if deleted_model_count > 0:
+            logging.debug(f"{deleted_model_count} models deleted successfully")
+            logging.debug("Model count for user %s has changed %s -> %s" % (self.user_id, old_model_count, new_model_count))
+        else:
+            logging.debug("No models found or none deleted")
+            logging.debug("Model count for user %s unchanged %s -> %s" % (self.user_id, old_model_count, new_model_count))
\ No newline at end of file
diff --git a/emission/tests/storageTests/TestModelStorage.py b/emission/tests/storageTests/TestModelStorage.py
index 6a3414439..09150382b 100644
--- a/emission/tests/storageTests/TestModelStorage.py
+++ b/emission/tests/storageTests/TestModelStorage.py
@@ -47,46 +47,39 @@
     def setUp(self):
         # $clustered_trips * $has_label_percent > self.min_trips
         # must be correct or else this test could fail under some random test cases.
-        # for a negative test, below
-        self.unused_user_id = 'asdjfkl;asdfjkl;asd08234ur13fi4jhf2103mkl'
-
-        # test data can be saved between test invocations, check if data exists before generating
         ts = esta.TimeSeries.get_time_series(user_id)
-        test_data = list(ts.find_entries(["analysis/confirmed_trip"]))
-        if len(test_data) == 0:
-            # generate test data for the database
-            logging.debug(f"inserting mock Confirmedtrips into database")
-
-            # generate labels with a known sample weight that we can rely on in the test
-            label_data = {
-                "mode_confirm": ['ebike', 'bike'],
-                "purpose_confirm": ['happy-hour', 'dog-park'],
-                "replaced_mode": ['walk'],
-                "mode_weights": [0.9, 0.1],
-                "purpose_weights": [0.1, 0.9]
-            }
+        logging.debug(f"inserting mock Confirmedtrips into database")
+
+        # generate labels with a known sample weight that we can rely on in the test
+        label_data = {
+            "mode_confirm": ['ebike', 'bike'],
+            "purpose_confirm": ['happy-hour', 'dog-park'],
+            "replaced_mode": ['walk'],
+            "mode_weights": [0.9, 0.1],
+            "purpose_weights": [0.1, 0.9]
+        }
 
-            train = etmm.generate_mock_trips(
-                user_id=user_id,
-                trips=self.total_trips,
-                origin=self.origin,
-                destination=self.destination,
-                trip_part='od',
-                label_data=label_data,
-                within_threshold=self.clustered_trips,
-                threshold=0.004, # ~400m
-                has_label_p=self.has_label_percent
-            )
+        train = etmm.generate_mock_trips(
+            user_id=user_id,
+            trips=self.total_trips,
+            origin=self.origin,
+            destination=self.destination,
+            trip_part='od',
+            label_data=label_data,
+            within_threshold=self.clustered_trips,
+            threshold=0.004, # ~400m
+            has_label_p=self.has_label_percent
+        )
 
-            ts.bulk_insert(train)
+        ts.bulk_insert(train)
 
-            # confirm data write did not fail
-            test_data = esda.get_entries(key="analysis/confirmed_trip", user_id=user_id, time_query=None)
-            if len(test_data) != self.total_trips:
-                logging.debug(f'test invariant failed after generating test data')
-                self.fail()
-            else:
-                logging.debug(f'found {self.total_trips} trips in database')
+        # confirm data write did not fail
+        test_data = esda.get_entries(key="analysis/confirmed_trip", user_id=user_id, time_query=None)
+        if len(test_data) != self.total_trips:
+            logging.debug(f'test invariant failed after generating test data')
+            self.fail()
+        else:
+            logging.debug(f'found {self.total_trips} trips in database')
 
     def tearDown(self):
         edb.get_analysis_timeseries_db().delete_many({'user_id': self.user_id})
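
As a quick reference for reviewers, the sketch below is a minimal, self-contained illustration (not part of the patch) of the trimming rule that trim_model_entries() applies above: keep the K newest models per user by metadata.write_ts, drop everything at or below the K-th newest timestamp, and log whether the count changed. It works on plain in-memory dicts rather than edb.get_model_db(), ignores the per-key filter, and the names trim_models and MAX_STORED_MODELS are illustrative stand-ins, not repository code.

import logging
from typing import Dict, List

logging.basicConfig(level=logging.DEBUG)

MAX_STORED_MODELS = 3  # illustrative stand-in for eamtc.get_maximum_stored_model_count()

def trim_models(models: List[Dict], max_count: int = MAX_STORED_MODELS) -> List[Dict]:
    # Sort newest first by write_ts, mirroring the sort on "metadata.write_ts" in the patch
    old_count = len(models)
    newest_first = sorted(models, key=lambda m: m["metadata"]["write_ts"], reverse=True)
    if old_count >= max_count:
        # Timestamp of the K-th newest entry; entries at or below it are dropped,
        # leaving K-1 entries so the insert that follows brings the total back to K
        write_ts_limit = newest_first[max_count - 1]["metadata"]["write_ts"]
        kept = [m for m in newest_first if m["metadata"]["write_ts"] > write_ts_limit]
    else:
        kept = newest_first
    deleted_count = old_count - len(kept)
    if deleted_count > 0:
        logging.debug("%d models deleted, count changed %d -> %d", deleted_count, old_count, len(kept))
    else:
        logging.debug("No models deleted, count unchanged at %d", old_count)
    return kept

if __name__ == "__main__":
    fake_models = [{"metadata": {"write_ts": ts}} for ts in (100, 200, 300, 400, 500)]
    remaining = trim_models(fake_models)
    assert len(remaining) == MAX_STORED_MODELS - 1

Assuming distinct write timestamps, trimming leaves K-1 entries, so the insert performed afterwards in upsert_model() brings the stored count back up to the configured maximum.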