Skip to content

Commit

Permalink
Addressed latest review comments
Browse files Browse the repository at this point in the history
- Improved logging statements to included whether model count has changed.
- Skipped check for saved test data in Test file as we clear teardown() related collections in the DB.
  • Loading branch information
Mahadik, Mukul Chandrakant authored and Mahadik, Mukul Chandrakant committed Feb 5, 2024
1 parent 6f4ac50 commit c1d2aee
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 45 deletions.
19 changes: 10 additions & 9 deletions emission/storage/modifiable/builtin_model_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ def upsert_model(self, key:str, model: ecwb.WrapperBase):
"""
logging.debug("upsert_doc called with key %s" % key)
entry = ecwe.Entry.create_entry(self.user_id, key, model)
## TODO: Cleanup old/obsolete models
# Cleaning up older models, before inserting new model
self.trim_model_entries(key)
logging.debug("Inserting entry %s into model DB" % entry)
Expand Down Expand Up @@ -61,13 +60,13 @@ def trim_model_entries(self, key:str):
The flow of model insertion function calls is:
eamur.update_trip_model() -> eamums.save_model() -> esma.upsert_model() -> esma.trim_model_entries()
"""
current_model_count = edb.get_model_db().count_documents({"user_id": self.user_id})
logging.debug("Before trimming, model count for user %s = %s" % (self.user_id, current_model_count))
old_model_count = edb.get_model_db().count_documents({"user_id": self.user_id})
deleted_model_count = 0
find_query = {"user_id": self.user_id, "metadata.key": key}
result_it = edb.get_model_db().find(find_query).sort("metadata.write_ts", -1)
result_list = list(result_it)
maximum_stored_model_count = eamtc.get_maximum_stored_model_count()
if current_model_count >= maximum_stored_model_count:
if old_model_count >= maximum_stored_model_count:
# Specify the last or minimum timestamp of Kth model entry
write_ts_limit = result_list[maximum_stored_model_count - 1]['metadata']['write_ts']
logging.debug(f"Write ts limit = {write_ts_limit}")
Expand All @@ -77,9 +76,11 @@ def trim_model_entries(self, key:str):
"metadata.write_ts" : { "$lte" : write_ts_limit }
}
models_to_delete = edb.get_model_db().delete_many(filter_clause)
if models_to_delete.deleted_count > 0:
logging.debug(f"{models_to_delete.deleted_count} documents deleted successfully\n")
else:
logging.debug("No documents found or none deleted\n")
deleted_model_count = models_to_delete.deleted_count
new_model_count = edb.get_model_db().count_documents({"user_id": self.user_id})
logging.debug("After trimming, model count for user %s = %s" % (self.user_id, new_model_count))
if deleted_model_count > 0:
logging.debug(f"{deleted_model_count} models deleted successfully")
logging.debug("Model count for user %s has changed %s -> %s" % (self.user_id, old_model_count, new_model_count))
else:
logging.debug("No models found or none deleted")
logging.debug("Model count for user %s unchanged %s -> %s" % (self.user_id, old_model_count, new_model_count))
65 changes: 29 additions & 36 deletions emission/tests/storageTests/TestModelStorage.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,46 +47,39 @@ def setUp(self):
# $clustered_trips * $has_label_percent > self.min_trips
# must be correct or else this test could fail under some random test cases.

# for a negative test, below
self.unused_user_id = 'asdjfkl;asdfjkl;asd08234ur13fi4jhf2103mkl'

# test data can be saved between test invocations, check if data exists before generating
ts = esta.TimeSeries.get_time_series(user_id)
test_data = list(ts.find_entries(["analysis/confirmed_trip"]))
if len(test_data) == 0:
# generate test data for the database
logging.debug(f"inserting mock Confirmedtrips into database")

# generate labels with a known sample weight that we can rely on in the test
label_data = {
"mode_confirm": ['ebike', 'bike'],
"purpose_confirm": ['happy-hour', 'dog-park'],
"replaced_mode": ['walk'],
"mode_weights": [0.9, 0.1],
"purpose_weights": [0.1, 0.9]
}
logging.debug(f"inserting mock Confirmedtrips into database")

# generate labels with a known sample weight that we can rely on in the test
label_data = {
"mode_confirm": ['ebike', 'bike'],
"purpose_confirm": ['happy-hour', 'dog-park'],
"replaced_mode": ['walk'],
"mode_weights": [0.9, 0.1],
"purpose_weights": [0.1, 0.9]
}

train = etmm.generate_mock_trips(
user_id=user_id,
trips=self.total_trips,
origin=self.origin,
destination=self.destination,
trip_part='od',
label_data=label_data,
within_threshold=self.clustered_trips,
threshold=0.004, # ~400m
has_label_p=self.has_label_percent
)
train = etmm.generate_mock_trips(
user_id=user_id,
trips=self.total_trips,
origin=self.origin,
destination=self.destination,
trip_part='od',
label_data=label_data,
within_threshold=self.clustered_trips,
threshold=0.004, # ~400m
has_label_p=self.has_label_percent
)

ts.bulk_insert(train)
ts.bulk_insert(train)

# confirm data write did not fail
test_data = esda.get_entries(key="analysis/confirmed_trip", user_id=user_id, time_query=None)
if len(test_data) != self.total_trips:
logging.debug(f'test invariant failed after generating test data')
self.fail()
else:
logging.debug(f'found {self.total_trips} trips in database')
# confirm data write did not fail
test_data = esda.get_entries(key="analysis/confirmed_trip", user_id=user_id, time_query=None)
if len(test_data) != self.total_trips:
logging.debug(f'test invariant failed after generating test data')
self.fail()
else:
logging.debug(f'found {self.total_trips} trips in database')

def tearDown(self):
edb.get_analysis_timeseries_db().delete_many({'user_id': self.user_id})
Expand Down

0 comments on commit c1d2aee

Please sign in to comment.