def get_maximum_stored_model_count():
    """Return the configured upper bound on stored models.

    Looks up the 'maximum_stored_model_count' key through
    get_config_value_or_raise and validates the value before returning it.

    :return: the configured maximum stored model count, a positive integer
    :raises TypeError: if the configured value is not an integer
        (booleans are rejected as well)
    :raises ValueError: if the configured value is smaller than 1
    """
    maximum_stored_model_count = get_config_value_or_raise('maximum_stored_model_count')
    # bool is a subclass of int in Python, so a JSON `true`/`false` would
    # slip through a plain isinstance(..., int) check; reject it explicitly.
    if isinstance(maximum_stored_model_count, bool) or not isinstance(maximum_stored_model_count, int):
        msg = f"config key 'maximum_stored_model_count' not an integer in config file {config_filename}"
        raise TypeError(msg)
    # The model storage trimming code indexes a list with
    # `maximum_stored_model_count - 1`; zero or a negative value would wrap
    # around to the wrong entry instead of failing, so refuse it here.
    if maximum_stored_model_count < 1:
        msg = f"config key 'maximum_stored_model_count' must be at least 1 in config file {config_filename}"
        raise ValueError(msg)
    return maximum_stored_model_count
key:str): The flow of model insertion function calls is: eamur.update_trip_model() -> eamums.save_model() -> esma.upsert_model() -> esma.trim_model_entries() """ - current_model_count = edb.get_model_db().count_documents({"user_id": self.user_id}) logging.debug("Before trimming, model count for user %s = %s" % (self.user_id, current_model_count)) find_query = {"user_id": self.user_id, "metadata.key": key} result_it = edb.get_model_db().find(find_query).sort("metadata.write_ts", -1) result_list = list(result_it) - - if current_model_count >= self.K_MODEL_COUNT: + maximum_stored_model_count = eamtc.get_maximum_stored_model_count() + if current_model_count >= maximum_stored_model_count: # Specify the last or minimum timestamp of Kth model entry - write_ts_limit = result_list[self.K_MODEL_COUNT - 1]['metadata']['write_ts'] + write_ts_limit = result_list[maximum_stored_model_count - 1]['metadata']['write_ts'] logging.debug(f"Write ts limit = {write_ts_limit}") - filter_clause = { "user_id" : self.user_id, "metadata.key" : key, "metadata.write_ts" : { "$lte" : write_ts_limit } } - models_to_delete = edb.get_model_db().delete_many(filter_clause) - if models_to_delete.deleted_count > 0: logging.debug(f"{models_to_delete.deleted_count} documents deleted successfully\n") else: logging.debug("No documents found or none deleted\n") - new_model_count = edb.get_model_db().count_documents({"user_id": self.user_id}) logging.debug("After trimming, model count for user %s = %s" % (self.user_id, new_model_count)) \ No newline at end of file diff --git a/emission/tests/storageTests/TestModelStorage.py b/emission/tests/storageTests/TestModelStorage.py index ae9a2a195..6a3414439 100644 --- a/emission/tests/storageTests/TestModelStorage.py +++ b/emission/tests/storageTests/TestModelStorage.py @@ -21,7 +21,7 @@ import emission.analysis.modelling.trip_model.run_model as eamur import emission.storage.timeseries.abstract_timeseries as esta import 
emission.tests.modellingTests.modellingTestAssets as etmm -from emission.storage.modifiable.builtin_model_storage import BuiltinModelStorage as esmb +import emission.analysis.modelling.trip_model.config as eamtc # Test imports import emission.tests.common as etc @@ -98,7 +98,7 @@ def testTrimModelEntries(self): Took this code from emission.tests.modellingTests.TestRunGreedyModel.py with the objective of inserting multiple models into the model_db. The test involves building and inserting 20 models, which is greater than - the K_MODEL_COUNT (= 10) limit defined in emission.storage.modifiable.builtin_model_storage.py + the maximum_stored_model_count (= 3) limit defined in conf/analysis/trip_model.conf.json.sample train a model, save it, load it, and use it for prediction, using the high-level training/testing API provided via @@ -107,7 +107,6 @@ def testTrimModelEntries(self): for clustering, use the default greedy similarity binning model """ - # pass along debug model configuration greedy_model_config = { "metric": "od_similarity", @@ -116,7 +115,7 @@ def testTrimModelEntries(self): "clustering_way": 'origin-destination', "incremental_evaluation": False } - + maximum_stored_model_count = eamtc.get_maximum_stored_model_count() logging.debug(f'(TRAIN) creating a model based on trips in database') for i in range(20): logging.debug(f"Creating dummy model no. {i}") @@ -128,10 +127,10 @@ def testTrimModelEntries(self): model_config=greedy_model_config ) current_model_count = edb.get_model_db().count_documents({"user_id": self.user_id}) - if i <= (esmb.K_MODEL_COUNT - 1): + if i <= (maximum_stored_model_count - 1): self.assertEqual(current_model_count, i+1) else: - self.assertEqual(current_model_count, esmb.K_MODEL_COUNT) + self.assertEqual(current_model_count, maximum_stored_model_count) if __name__ == '__main__': import emission.tests.common as etc