From f5944ccef1140db347211c93b851b8864c0362ee Mon Sep 17 00:00:00 2001 From: $aTyam Date: Tue, 12 Sep 2023 12:27:47 -0400 Subject: [PATCH] [TESTED] Explicit clustering method, Improved mock trip generation `od_similarity.py` 1. Explicitly passing 'origin', 'destination' or 'origin-destination' for the similarity check in `similarity`. `similarity_metric.py` 2. Passing the clustering_way parameter. `greedy_similarity_binning.py` 3. Since this decision making has moved downstream to `similarity`, removing it from here. `modellingTestAssets.py` 4. Removing both two-line wrappers (setModelConfig, setTripConfig) from this file, since the tests were parametrised using subTest two commits back. 5. Removed calDistanceTest. This was introduced to keep the tests' calDistance separate from the calDistance used by `greedy_similarity_binning`. Unnecessary. 6. Using ref. coordinates whenever provided to generate trip coordinates. If not, use randomly generated coordinates as reference points. 7. Receiving and passing origin and destination ref. points in `generate_mock_trips`. `TestGreedySimilarityBinning.py` 8. Removed wrappers for trip and model generation. 9. Using a single threshold both for generating trips and for binning; removed the two separate thresholds. `TestSimilarityMetric.py` 10. Removing the implicitness used in binning by passing the clustering way as a parameter.
--- .../modelling/similarity/od_similarity.py | 34 ++-- .../modelling/similarity/similarity_metric.py | 12 +- .../trip_model/greedy_similarity_binning.py | 9 +- .../TestGreedySimilarityBinning.py | 109 ++++++++++--- .../modellingTests/TestSimilarityMetric.py | 26 ++-- .../modellingTests/modellingTestAssets.py | 145 +++++------------- 6 files changed, 168 insertions(+), 167 deletions(-) diff --git a/emission/analysis/modelling/similarity/od_similarity.py b/emission/analysis/modelling/similarity/od_similarity.py index 9a6a49d0d..056c721a3 100644 --- a/emission/analysis/modelling/similarity/od_similarity.py +++ b/emission/analysis/modelling/similarity/od_similarity.py @@ -15,24 +15,28 @@ class OriginDestinationSimilarity(eamss.SimilarityMetric): def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]: return ctfe.od_features(trip) - def similarity(self, a: List[float], b: List[float]) -> List[float]: + def similarity(self, a: List[float], b: List[float], clustering_way='origin-destination') -> List[float]: """ - a : a list of point features that can take either of two forms - 1. [point1_latitude,point1_longitude] - 2. [point1_latitude,point1_longitude,point2_latitude,point2_longitude] + a : a list of point features that takes the forms + [point1_longitude,point1_latitude,point2_longitude,point2_latitude] - b : a list of point features that can take either of two forms - 1. [point3_latitude,point3_longitude] - 2. [point3_latitude,point3_longitude,point4_latitude,point4_longitude] - - It'll always take the same form as parameter a. - + b : a list of point features that takes the forms + [point1_longitude,point1_latitude,point2_longitude,point2_latitude] + + clustering_way : takes one among 'origin', 'destination', 'origin-destination' as value. + tells the part of the trip to be used for binning trips together if that + part lies within threshold. 
+ return: a list of size 1 ([distance between point1-point3]) if a and b take form 1 or of size 2 ([distance between point1-point3, distance between point2-point4]) if a and b take form 2. """ - - point_dist = [ecc.calDistance(a[i:i+2], b[i:i+2]) - for i in range (0,len(a),2)] - - return point_dist \ No newline at end of file + origin_dist = ecc.calDistance(a[0:2], b[0:2]) + destination_dist=ecc.calDistance(a[2:4], b[2:4]) + + if clustering_way == 'origin-destination': + return [origin_dist,destination_dist] + elif clustering_way == 'origin': + return [origin_dist] + else: + return [destination_dist] \ No newline at end of file diff --git a/emission/analysis/modelling/similarity/similarity_metric.py b/emission/analysis/modelling/similarity/similarity_metric.py index 1b520318f..c009be9e9 100644 --- a/emission/analysis/modelling/similarity/similarity_metric.py +++ b/emission/analysis/modelling/similarity/similarity_metric.py @@ -17,26 +17,32 @@ def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]: pass @abstractmethod - def similarity(self, a: List[float], b: List[float]) -> List[float]: + def similarity(self, a: List[float], b: List[float], clustering_way = 'origin-destination') -> List[float]: """compares the features, producing their similarity as computed by this similarity metric :param a: features for a trip :param b: features for another trip + :param clustering_way : takes one among 'origin', 'destination', 'origin-destination' as value. + tells the part of the trip to be used for binning trips together if that + part lies within a threshold. 
:return: for each feature, the similarity of these features """ pass - def similar(self, a: List[float], b: List[float], thresh: float) -> bool: + def similar(self, a: List[float], b: List[float], thresh: float, clustering_way= 'origin-destination') -> bool: """compares the features, returning true if they are similar within some threshold :param a: features for a trip :param b: features for another trip :param thresh: threshold for similarity + :param clustering_way : takes one among 'origin', 'destination', 'origin-destination' as value. + tells the part of the trip to be used for binning trips together if that + part lies within a threshold. :return: true if the feature similarity is within some threshold """ - similarity_values = self.similarity(a, b) + similarity_values = self.similarity(a, b, clustering_way) is_similar = all(sim <= thresh for sim in similarity_values) return is_similar diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index efcce4f02..226fdefb5 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -212,14 +212,7 @@ def _find_matching_bin_id(self, trip_features: List[float]) -> Optional[str]: :return: the id of a bin if a match was found, otherwise None """ for bin_id, bin_record in self.bins.items(): - if self.clusteringWay == 'origin': - start,end=0,2 #since first two features in trip_features are for origin - elif self.clusteringWay == 'destination': - start,end=2,4 #third and fourth values intrip_features are for destination - elif self.clusteringWay == 'origin-destination': - start,end=0,4 #when clusteromgWay is 'origin-destination',we pass all four features - - matches_bin = all([self.metric.similar(trip_features[start:end], bin_sample[start:end], self.sim_thresh) + matches_bin = all([self.metric.similar(trip_features, bin_sample, 
self.sim_thresh,self.clusteringWay) for bin_sample in bin_record['feature_rows']]) if matches_bin: return bin_id diff --git a/emission/tests/modellingTests/TestGreedySimilarityBinning.py b/emission/tests/modellingTests/TestGreedySimilarityBinning.py index 3e1cd78c2..937effc94 100644 --- a/emission/tests/modellingTests/TestGreedySimilarityBinning.py +++ b/emission/tests/modellingTests/TestGreedySimilarityBinning.py @@ -1,6 +1,7 @@ import unittest import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg -import emission.tests.modellingTests.utilities as etmu +import emission.tests.modellingTests.modellingTestAssets as etmm + import logging @@ -21,11 +22,29 @@ def testNoBinning(self): """ # generate $n trips. - n = 20 - + n = 20 + binning_threshold=500 #this generates 20 trips one-by-one, where each trip's respective origin and destination # points are more than 500m away. - trips = [ etmu.setTripConfig(1, (i, i), (i+1, i+1), 'od', 1)[0] for i in range(n)] + + + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive'] + } + + + trips =etmm.generate_mock_trips( + user_id="joe", + trips=n, + trip_part='__', + label_data=label_data, + within_threshold=1, + threshold=binning_threshold, + origin=(0,0), + destination=(1,1) + ) # parameters passed for testing. 
A list, where each element is one way of clustering clustering_ways_paramters= ["origin","destination","origin-destination"] @@ -34,7 +53,14 @@ def testNoBinning(self): for cw in clustering_ways_paramters: with self.subTest(clustering_way=cw): #initialise the binning model and fit with previously generated trips - model = etmu.setModelConfig("od_similarity", 500, False, cw, False) + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": binning_threshold, # meters, + "apply_cutoff": False, + "clustering_way": cw, + "incremental_evaluation": False + } + model= eamtg.GreedySimilarityBinning(model_config) model.fit(trips) #check each bins for no of trips no_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model.bins.values())) @@ -53,6 +79,12 @@ def testBinning(self): # within a radius that should have them binned. n = 20 m = 5 + binning_threshold=500 + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive'] + } # parameters passed for testing. 
A list, where each element of this list takes the form # [trip part to be sampled within mentioned threshold , clustering way used to check similarity] @@ -60,10 +92,25 @@ def testBinning(self): for tp,cw in parameters: with self.subTest(trip_part=tp,clustering_way=cw): #generate random trips using utilities - trips = etmu.setTripConfig(trips=n, org=(0, 0), dest=(1, 1), - trip_part=tp, within_thr=m) + trips =etmm.generate_mock_trips( + user_id="joe", + trips=n, + trip_part=tp, + label_data=label_data, + within_threshold=m, + threshold=binning_threshold, + origin=(0,0), + destination=(1,1) + ) #initialise the binning model and fit with previously generated trips - model = etmu.setModelConfig("od_similarity", 500, False, cw, False) + model_config = { + "metric": "od_similarity" , + "similarity_threshold_meters": binning_threshold, # meters, + "apply_cutoff": False, + "clustering_way": cw, + "incremental_evaluation": False + } + model = eamtg.GreedySimilarityBinning(model_config) model.fit(trips) #check each bins for no of trips at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model.bins.values())) @@ -81,11 +128,24 @@ def testPrediction(self): } n = 6 - trips = etmu.setTripConfig(trips=n, org=(0, 0), dest=(1, 1), - trip_part='od', label_data=label_data, - ) - model = etmu.setModelConfig("od_similarity", 500, False, "origin-destination", False) - + trips =etmm.generate_mock_trips( + user_id="joe", + trips=n, + trip_part='od', + label_data=label_data, + within_threshold=n, + threshold=500, + origin=(0,0), + destination=(1,1) + ) + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 500, # meters, + "apply_cutoff": False, + "clustering_way": 'origin_destination', + "incremental_evaluation": False + } + model= eamtg.GreedySimilarityBinning(model_config) train = trips[0:5] test = trips[5] @@ -105,16 +165,25 @@ def testNoPrediction(self): "replaced_mode": ['crabwalking'] } n = 5 - - train = etmu.setTripConfig(trips=n, 
org=(39.7645187, -104.9951944), # Denver, CO - dest=(39.7435206, -105.2369292), # Golden, CO - trip_part='od', label_data=label_data + binning_threshold = 500 + train = etmm.generate_mock_trips( user_id="joe",trips=n, origin=(39.7645187, -104.9951944), # Denver, CO + destination=(39.7435206, -105.2369292), # Golden, CO + trip_part='od', label_data=label_data, + threshold=binning_threshold, within_threshold=n ) - test = etmu.setTripConfig(trips=n, org=(61.1042262, -150.5611644), # Denver, CO - dest=(62.2721466, -150.3233046), # Golden, CO + test = etmm.generate_mock_trips( user_id="amanda",trips=n, origin=(61.1042262, -150.5611644), # Denver, CO + destination=(62.2721466, -150.3233046), # Golden, CO trip_part='od', label_data=label_data, + threshold=binning_threshold, within_threshold=n ) - model = etmu.setModelConfig("od_similarity", 500, False, "origin-destination", False) + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 500, # meters, + "apply_cutoff": False, + "clustering_way": 'origin_destination', + "incremental_evaluation": False + } + model= eamtg.GreedySimilarityBinning(model_config) model.fit(train) results, n = model.predict(test[0]) diff --git a/emission/tests/modellingTests/TestSimilarityMetric.py b/emission/tests/modellingTests/TestSimilarityMetric.py index cbe500b23..fe038be4e 100644 --- a/emission/tests/modellingTests/TestSimilarityMetric.py +++ b/emission/tests/modellingTests/TestSimilarityMetric.py @@ -1,11 +1,9 @@ import unittest import emission.analysis.modelling.similarity.od_similarity as eamso -import emission.tests.modellingTests.utilities as etmu - +import emission.tests.modellingTests.modellingTestAssets as etmm class TestSimilarityMetric(unittest.TestCase): def testODsAreSimilar(self): - generate_points_thresh = 0.001 # approx. 
111 meters similarity_threshold = 500 # in meters metric = eamso.OriginDestinationSimilarity() @@ -16,17 +14,17 @@ def testODsAreSimilar(self): # a.origin, we pass first two values of this list,i.e. from 0 till before 2 index # b.destination, we pas last two values of this list,i.e. from 2 till before 4 index # c.origin-destination, we pass the entire list , i.e. from 0 till before 4 index - parameters= [["od",(0,4)],["_d",(2,4)],["o_",(0,2)]] + parameters= [["o_",'origin'],["_d",'destination'],["od",'origin-destination']] - for tp,(coord_start,coord_end) in parameters: + for tp,cw in parameters: with self.subTest(trip_part=tp): #generate 2 trips with parameter values - trips = etmu.setTripConfig(2, [0, 0], [1, 1], trip_part=tp,threshold=generate_points_thresh) + trips = etmm.generate_mock_trips('joe',2, threshold=similarity_threshold,origin=[0, 0], destination=[1, 1], within_threshold=2,trip_part=tp) # depending on the parametrs, extract the relevant coordinates - trip0_coords = metric.extract_features(trips[0])[coord_start:coord_end] - trip1_coords = metric.extract_features(trips[1])[coord_start:coord_end] + trip0_coords = metric.extract_features(trips[0]) + trip1_coords = metric.extract_features(trips[1]) #check for similarity using relevant coordinates - similarOD = metric.similar(trip0_coords,trip1_coords, similarity_threshold) + similarOD = metric.similar(trip0_coords,trip1_coords, similarity_threshold,cw) # Since both origin and destination poitns lie within threshold limits,they should be similar # when we check by just origin or just destination or both origin-and-destination self.assertTrue(similarOD) @@ -42,17 +40,17 @@ def testODsAreNotSimilar(self): # a.origin, we pass first two values of this list,i.e. from 0 till before 2 index # b.destination, we pas last two values of this list,i.e. from 2 till before 4 index # c.origin-destination, we pass the entire list , i.e. 
from 0 till before 4 index - parameters= [(0,2),(2,4),[0,4]] + parameters= ['origin','destination','origin-destination'] n=2 #this generates 2 trips one-by-one, where each trip's respective origin and destination # points are more than 500m away. - trips = [etmu.setTripConfig(1, (i, i), (i+1, i+1), 'od', 1)[0] for i in range(n)] + trips = [ etmm.generate_mock_trips('joe',2, origin=[i, i], destination=[i+1, i+1], trip_part= 'od', within_threshold=1,threshold=500)[0] for i in range(n)] trip0_coord = metric.extract_features(trips[0]) trip1_coord = metric.extract_features(trips[1]) - for (coord_start,coord_end) in parameters: - with self.subTest(coordinates=(coord_start,coord_end)): - IsSimilar = metric.similar(trip0_coord[coord_start:coord_end],trip1_coord[coord_start:coord_end], similarity_threshold) + for cw in parameters: + with self.subTest(clustering_way=cw): + IsSimilar = metric.similar(trip0_coord,trip1_coord, similarity_threshold,cw) # Two trips with neither origin nor destination coordinates within the threshold # must not be similar by any configuration of similarity testing. self.assertFalse(IsSimilar) diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py index f98736048..9ad662fe3 100644 --- a/emission/tests/modellingTests/modellingTestAssets.py +++ b/emission/tests/modellingTests/modellingTestAssets.py @@ -2,122 +2,39 @@ from typing import Optional, Tuple, List, Dict from uuid import UUID import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg -import emission.tests.modellingTests.modellingTestAssets as etmm import emission.core.wrapper.confirmedtrip as ecwc - +import emission.core.common as ecc import emission.core.wrapper.entry as ecwe import time import math -def setModelConfig(metric,threshold,cutoff,clustering_way,incrementalevaluation): - """ - TODO: Write about each parameter to the function - pass in a test configuration to the binning algorithm. 
- - clustering_way : Part of the trip used for checking pairwise proximity. - Can take one of the three values: - - 1. 'origin' -> using origin of the trip to check if 2 points - lie within the mentioned similarity_threshold_meters - 2. 'destination' -> using destination of the trip to check if 2 points - lie within the mentioned similarity_threshold_meters - 3. 'origin-destination' -> both origin and destination of the trip to check - if 2 points lie within the mentioned - similarity_threshold_meters - """ - model_config = { - "metric": metric, - "similarity_threshold_meters": threshold, # meters, - "apply_cutoff": cutoff, - "clustering_way": clustering_way, - "incremental_evaluation": incrementalevaluation - } - - return eamtg.GreedySimilarityBinning(model_config) - def generate_random_point(): - """Generate a completetly random point valid WGS84 latitiude and longtidude""" + """Generate a completetly random point valid WGS84 latitiude and longtidude. + CAUTION : In order to save trips, GeoJSON requires points in [lon,lat] format""" lat=random.uniform(-90,90) lon=random.uniform(-180,180) - return [lat,lon] + return [lon,lat] def generate_nearby_random_points(ref_coords,threshold): """ Generate valid WGS84 latitiude and longtidude in threshold(m) proximity to - ref coordinates + ref coordinates. """ - + #convert given threshold in m to approx WGS84 coord dist. thresholdInWGS84 = threshold* (0.000001/0.11) + + #generate a random coordinate in threshold's limit around the ref points. 
dx=random.uniform(-thresholdInWGS84/2,thresholdInWGS84/2) dy=random.uniform(-thresholdInWGS84/2,thresholdInWGS84/2) - return [ref_coords[0] +dx , ref_coords[1] +dy] - -def calDistanceTest(point1, point2, coordinates=False): - """haversine distance - - :param point1: a coordinate in degrees WGS84 - :param point2: another coordinate in degrees WGS84 - :param coordinates: if false, expect a list of coordinates, defaults to False - :return: distance approximately in meters - """ - earthRadius = 6371000 # meters - if coordinates: - dLat = math.radians(point1.lat-point2.lat) - dLon = math.radians(point1.lon-point2.lon) - lat1 = math.radians(point1.lat) - lat2 = math.radians(point2.lat) - else: - dLat = math.radians(point1[1]-point2[1]) - dLon = math.radians(point1[0]-point2[0]) - lat1 = math.radians(point1[1]) - lat2 = math.radians(point2[1]) - - a = (math.sin(dLat/2) ** 2) + ((math.sin(dLon/2) ** 2) * math.cos(lat1) * math.cos(lat2)) - c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a)) - d = earthRadius * c - - return d - -def setTripConfig(trips,trip_part,threshold,within_thr=None,label_data=None): - """ - TODO: Write about each parameter to the function - trip_part: when mock trips are generated, coordinates of this part of - m trips will be within the threshold. trip_part can take one - among the four values: - - 1. '__' ->(None, meaning NEITHER origin nor destination of any trip will lie - within the mentioned threshold when trips are generated), - - 2. 'o_' ->(origin, meaning ONLY origin of m trips will lie within the mentioned - threshold when trips are generated), - - 3. '_d' ->(destination),meaning ONLY destination of m trips will lie within the - mentioned threshold when trips are generated) - - 4. 
'od' ->(origin and destination,meaning BOTH origin and destination of m trips - will lie within the mentioned threshold when trips are generated) - """ - if label_data == None: - label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive'] - } - - trip =etmm.generate_mock_trips( - user_id="joe", - trips=trips, - trip_part=trip_part, - label_data=label_data, - within_threshold=within_thr, - threshold=threshold, - ) - return trip + #This basically gives a way to sample a point from within a square of length thresholdInWGS84 + # around the ref. point. + return [ref_coords[0] +dx , ref_coords[1] +dy] def generate_trip_coordinates( - points_list: list[float], - within_threshold: bool, + points_list: list[float], + ref_coords, + InsideThreshold: bool, threshold_meters: float, ) -> Tuple[float, float]: """generates trip coordinate data to use when mocking a set of trip data.i @@ -132,12 +49,22 @@ def generate_trip_coordinates( :return: generated coordinate pairs sampled in a circle from some coordinates up to some threshold """ - - if within_threshold and points_list: - new_point = generate_nearby_random_points(random.choice(points_list), threshold_meters) - else: - new_point = generate_random_point() - while not all(calDistanceTest(new_point, pt) > threshold_meters for pt in points_list): + # if the point is to be generated within a threshold and it's not the first point + if InsideThreshold and points_list: + # if no ref. coordinates are provided, use any previously accepted point as ref. + if ref_coords == None: + ref_coords=random.choice(points_list) + # generate a new point in threshold proximity to ref. 
point + new_point = generate_nearby_random_points(ref_coords, threshold_meters) + else: # If point need not be in the threshold OR if its the first point we are generating, then + #Generate random coordinates if no reference coords were provided + if ref_coords == None: + new_point = generate_random_point() + else: + # if ref coordinate are provided, use them as the starting point and iterate till required + # condition is satisfied + new_point = ref_coords + while not all(ecc.calDistance(new_point, pt) > threshold_meters for pt in points_list): new_point = generate_random_point() return new_point @@ -241,6 +168,8 @@ def generate_mock_trips( trips, threshold, trip_part='od', + origin=None, + destination=None, label_data = None, within_threshold = None, start_ts: None = None, @@ -276,6 +205,8 @@ def generate_mock_trips( mentioned threshold when trips are generated) 4. 'od' ->(origin and destination,meaning BOTH origin and destination of m trips will lie within the mentioned threshold when trips are generated) + :param origin : reference point for trip origin generally + :param destination : reference point for trip origin generally :param label_data: dictionary of label data, see above, defaults to None :param within_threshold: number of trips that should fall within the provided distance threshold in m @@ -292,11 +223,11 @@ def generate_mock_trips( origin_points=[] destination_points=[] - # generate trip number of points based on which among 'o' ,'d' or 'od' should be in threshold - # proximity to each other. + # generate 'trip' number of points based on which among 'o' (Origin) ,'d' (Destination) or + # 'od' (Origin-Destination) or '__' (None) should be in threshold proximity to each other. 
for within in trips_within_threshold: - origin_points.append(generate_trip_coordinates(origin_points, (trip_part[0] == 'o' and within), threshold)) - destination_points.append(generate_trip_coordinates(destination_points, (trip_part[1] == 'd' and within), threshold)) + origin_points.append(generate_trip_coordinates(origin_points, origin, InsideThreshold= (trip_part[0] == 'o' and within), threshold_meters= threshold)) + destination_points.append(generate_trip_coordinates(destination_points, destination, InsideThreshold=(trip_part[1] == 'd' and within), threshold_meters=threshold)) for o,d in zip(origin_points,destination_points): labels = {} if label_data is None or random.random() > has_label_p \