[TESTED] Explicit clustering method, Improved mock trip generation

`od_similarity.py`: 1. Explicitly passing 'origin', 'destination', or 'origin-destination' for the similarity check in `similarity`. `similarity_metric.py`: 2. Passing the `clustering_way` parameter. `greedy_similarity_binning.py`: 3. Since this decision-making has moved downstream to `similarity`, removing it from here. `modellingTestAssets.py`: 4. Removing both two-line wrappers (`setModelConfig`, `setTripConfig`) from this file, since this was parametrised using subTest two commits back. 5. Removed CalDistanceTest. This was introduced to keep the calDistance used by the tests separate from the calDistance used by `GreedySimilarityBinning`. Unnecessary. 6. Using reference coordinates, whenever provided, to generate trip coordinates. If not provided, randomly generated coordinates are used as reference points. 7. Receiving and passing origin and destination reference points in `generate_mock_trips`. `TestGreedySimilarityBinning.py`: 8. Removed wrappers for trip and model generation. 9. Using just a single threshold for generating trips and for binning. Removed the two separate thresholds. `TestSimilarityMetric.py`: 10. Removing the implicitness used in binning by passing the clustering way as an explicit parameter.
e-mission · Sep 12, 2023 · f5944cc · f5944cc
1 parent c35b7c1
commit f5944cc
Show file tree

Hide file tree

Showing 6 changed files with 168 additions and 167 deletions.
diff --git a/emission/analysis/modelling/similarity/od_similarity.py b/emission/analysis/modelling/similarity/od_similarity.py
@@ -15,24 +15,28 @@ class OriginDestinationSimilarity(eamss.SimilarityMetric):
     def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]:
         return ctfe.od_features(trip)
 
-    def similarity(self, a: List[float], b: List[float]) -> List[float]:
+    def similarity(self, a: List[float], b: List[float], clustering_way='origin-destination') -> List[float]:
         """
-        a : a list of point features that can take either of two forms
-                    1. [point1_latitude,point1_longitude]  
-                    2. [point1_latitude,point1_longitude,point2_latitude,point2_longitude] 
+        a : a list of point features that takes the forms
+          [point1_longitude,point1_latitude,point2_longitude,point2_latitude] 
                     
-        b : a list of point features that can take either of two forms
-                    1. [point3_latitude,point3_longitude]  
-                    2. [point3_latitude,point3_longitude,point4_latitude,point4_longitude] 
-            
-            It'll always take the same form as parameter a.
-
+        b : a list of point features that takes the forms
+          [point1_longitude,point1_latitude,point2_longitude,point2_latitude] 
+        
+        clustering_way : takes one among 'origin', 'destination', 'origin-destination' as value.
+                         tells the part of the trip to be used for binning trips together if that 
+                         part lies within threshold.
+                                                  
         return: a list of size 2 ([distance between origins, distance between destinations])
                 if clustering_way is 'origin-destination', a list of size 1 ([distance between
                 origins]) if it is 'origin', or ([distance between destinations]) for any other value.
         """
-
-        point_dist = [ecc.calDistance(a[i:i+2], b[i:i+2]) 
-                      for i in range (0,len(a),2)] 
-
-        return point_dist
+        origin_dist = ecc.calDistance(a[0:2], b[0:2])
+        destination_dist=ecc.calDistance(a[2:4], b[2:4])
+
+        if clustering_way == 'origin-destination':
+            return [origin_dist,destination_dist]
+        elif clustering_way == 'origin':
+            return [origin_dist]
+        else:
+            return [destination_dist]
diff --git a/emission/analysis/modelling/similarity/similarity_metric.py b/emission/analysis/modelling/similarity/similarity_metric.py
@@ -17,26 +17,32 @@ def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]:
         pass
 
     @abstractmethod
-    def similarity(self, a: List[float], b: List[float]) -> List[float]:
+    def similarity(self, a: List[float], b: List[float], clustering_way = 'origin-destination') -> List[float]:
         """compares the features, producing their similarity
         as computed by this similarity metric
 
         :param a: features for a trip
         :param b: features for another trip
+        :param clustering_way : takes one among 'origin', 'destination', 'origin-destination' as value.
+                                tells the part of the trip to be used for binning trips together if that 
+                                part lies within a threshold.
         :return: for each feature, the similarity of these features
         """
         pass
 
-    def similar(self, a: List[float], b: List[float], thresh: float) -> bool:
+    def similar(self, a: List[float], b: List[float], thresh: float, clustering_way= 'origin-destination') -> bool:
         """compares the features, returning true if they are similar
         within some threshold
 
         :param a: features for a trip 
         :param b: features for another trip
         :param thresh: threshold for similarity
+        :param clustering_way : takes one among 'origin', 'destination', 'origin-destination' as value.
+                                tells the part of the trip to be used for binning trips together if that 
+                                part lies within a threshold.
         :return: true if the feature similarity is within some threshold
         """
-        similarity_values = self.similarity(a, b)
+        similarity_values = self.similarity(a, b, clustering_way)
         is_similar = all(sim <= thresh for sim in similarity_values)
 
         return is_similar
diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py
@@ -212,14 +212,7 @@ def _find_matching_bin_id(self, trip_features: List[float]) -> Optional[str]:
         :return: the id of a bin if a match was found, otherwise None
         """
         for bin_id, bin_record in self.bins.items():
-            if self.clusteringWay == 'origin':
-                start,end=0,2  #since first two features in trip_features are for origin
-            elif self.clusteringWay == 'destination':
-                start,end=2,4  #third and fourth values intrip_features are for destination
-            elif self.clusteringWay == 'origin-destination':
-                start,end=0,4  #when clusteromgWay is 'origin-destination',we pass all four features
-
-            matches_bin = all([self.metric.similar(trip_features[start:end], bin_sample[start:end], self.sim_thresh)
+            matches_bin = all([self.metric.similar(trip_features, bin_sample, self.sim_thresh,self.clusteringWay)
                 for bin_sample in bin_record['feature_rows']])
             if matches_bin:
                 return bin_id

diff --git a/emission/tests/modellingTests/TestGreedySimilarityBinning.py b/emission/tests/modellingTests/TestGreedySimilarityBinning.py
@@ -1,6 +1,7 @@
 import unittest
 import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg
-import emission.tests.modellingTests.utilities as etmu
+import emission.tests.modellingTests.modellingTestAssets as etmm
+
 import logging
 
 
@@ -21,11 +22,29 @@ def testNoBinning(self):
         """
 
         # generate $n trips.
-        n = 20     
-
+        n = 20   
+        binning_threshold=500
         #this generates 20 trips one-by-one, where each trip's respective origin and destination 
         # points are more than 500m away.
-        trips = [ etmu.setTripConfig(1, (i, i), (i+1, i+1), 'od', 1)[0] for i in range(n)]    
+
+
+        label_data = {
+            "mode_confirm": ['walk', 'bike', 'transit'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive']
+        }         
+
+
+        trips =etmm.generate_mock_trips(
+                user_id="joe", 
+                trips=n, 
+                trip_part='__',
+                label_data=label_data, 
+                within_threshold=1, 
+                threshold=binning_threshold,
+                origin=(0,0),
+                destination=(1,1)
+            )
 
         # parameters passed for testing. A list, where each element is one way of clustering
         clustering_ways_paramters= ["origin","destination","origin-destination"]
@@ -34,7 +53,14 @@ def testNoBinning(self):
         for cw in clustering_ways_paramters:
             with self.subTest(clustering_way=cw):
                 #initialise the binning model and fit with previously generated trips
-                model = etmu.setModelConfig("od_similarity",  500,  False, cw, False)
+                model_config = {
+                                    "metric": "od_similarity",
+                                    "similarity_threshold_meters": binning_threshold,  # meters,
+                                    "apply_cutoff": False,
+                                    "clustering_way": cw,  
+                                    "incremental_evaluation": False
+                                }
+                model= eamtg.GreedySimilarityBinning(model_config)
                 model.fit(trips)
                 #check each bins for no of trips
                 no_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model.bins.values()))
@@ -53,17 +79,38 @@ def testBinning(self):
         # within a radius that should have them binned.
         n = 20
         m = 5
+        binning_threshold=500
+        label_data = {
+            "mode_confirm": ['walk', 'bike', 'transit'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive']
+        }
 
         # parameters passed for testing. A list, where each element of this list takes the form 
         # [trip part to be sampled within mentioned threshold , clustering way used to check similarity]
         parameters= [["o_",'origin'],["_d",'destination'],["od",'origin-destination']]
         for tp,cw in parameters:
             with self.subTest(trip_part=tp,clustering_way=cw):
                 #generate random trips using utilities
-                trips = etmu.setTripConfig(trips=n, org=(0, 0), dest=(1, 1),
-                                trip_part=tp, within_thr=m)
+                trips =etmm.generate_mock_trips(
+                    user_id="joe", 
+                    trips=n, 
+                    trip_part=tp,
+                    label_data=label_data, 
+                    within_threshold=m, 
+                    threshold=binning_threshold,
+                    origin=(0,0),
+                    destination=(1,1)
+                )
                 #initialise the binning model and fit with previously generated trips
-                model = etmu.setModelConfig("od_similarity",  500,  False, cw, False)
+                model_config = {
+                            "metric": "od_similarity" ,
+                            "similarity_threshold_meters": binning_threshold,  # meters,
+                            "apply_cutoff": False,
+                            "clustering_way": cw,  
+                            "incremental_evaluation": False
+                 }
+                model = eamtg.GreedySimilarityBinning(model_config)
                 model.fit(trips)
                 #check each bins for no of trips
                 at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model.bins.values()))
@@ -81,11 +128,24 @@ def testPrediction(self):
         }
 
         n = 6
-        trips = etmu.setTripConfig(trips=n, org=(0, 0), dest=(1, 1),
-                                   trip_part='od', label_data=label_data,                                   
-        )
-        model = etmu.setModelConfig("od_similarity",  500,  False, "origin-destination", False)
-
+        trips =etmm.generate_mock_trips(
+                user_id="joe", 
+                trips=n, 
+                trip_part='od',
+                label_data=label_data, 
+                within_threshold=n, 
+                threshold=500,
+                origin=(0,0),
+                destination=(1,1)
+            )
+        model_config = {
+                    "metric": "od_similarity",
+                    "similarity_threshold_meters": 500,  # meters,
+                    "apply_cutoff": False,
+                    "clustering_way": 'origin_destination',  
+                    "incremental_evaluation": False
+                                }
+        model= eamtg.GreedySimilarityBinning(model_config)
         train = trips[0:5]
         test = trips[5]
 
@@ -105,16 +165,25 @@ def testNoPrediction(self):
             "replaced_mode": ['crabwalking']
         }
         n = 5
-
-        train = etmu.setTripConfig(trips=n, org=(39.7645187, -104.9951944), # Denver, CO
-                                   dest=(39.7435206, -105.2369292),  # Golden, CO
-                                   trip_part='od', label_data=label_data                                 
+        binning_threshold = 500
+        train = etmm.generate_mock_trips( user_id="joe",trips=n, origin=(39.7645187, -104.9951944), # Denver, CO
+                                   destination=(39.7435206, -105.2369292),  # Golden, CO
+                                   trip_part='od', label_data=label_data,
+                                   threshold=binning_threshold, within_threshold=n
         )
-        test = etmu.setTripConfig(trips=n, org=(61.1042262, -150.5611644), # Denver, CO
-                                   dest=(62.2721466, -150.3233046),  # Golden, CO
+        test = etmm.generate_mock_trips( user_id="amanda",trips=n, origin=(61.1042262, -150.5611644), # Denver, CO
+                                   destination=(62.2721466, -150.3233046),  # Golden, CO
                                    trip_part='od', label_data=label_data,                                   
+                                    threshold=binning_threshold, within_threshold=n
         )
-        model = etmu.setModelConfig("od_similarity",  500,  False, "origin-destination", False)
+        model_config = {
+                    "metric": "od_similarity",
+                    "similarity_threshold_meters": 500,  # meters,
+                    "apply_cutoff": False,
+                    "clustering_way": 'origin_destination',  
+                    "incremental_evaluation": False
+                                }
+        model= eamtg.GreedySimilarityBinning(model_config)
         model.fit(train)
         results, n = model.predict(test[0])
 

diff --git a/emission/tests/modellingTests/TestSimilarityMetric.py b/emission/tests/modellingTests/TestSimilarityMetric.py
@@ -1,11 +1,9 @@
 import unittest
 import emission.analysis.modelling.similarity.od_similarity as eamso
-import emission.tests.modellingTests.utilities as etmu
-
+import emission.tests.modellingTests.modellingTestAssets as etmm
 class TestSimilarityMetric(unittest.TestCase):
 
     def testODsAreSimilar(self):
-        generate_points_thresh = 0.001  # approx. 111 meters
         similarity_threshold = 500  # in meters
         metric = eamso.OriginDestinationSimilarity()
 
@@ -16,17 +14,17 @@ def testODsAreSimilar(self):
         #   a.origin, we pass first two values of this list,i.e. from 0 till before 2 index
         #   b.destination, we pass last two values of this list, i.e. from 2 till before 4 index
         #   c.origin-destination, we pass the entire list , i.e. from 0 till before 4 index
-        parameters= [["od",(0,4)],["_d",(2,4)],["o_",(0,2)]]
+        parameters= [["o_",'origin'],["_d",'destination'],["od",'origin-destination']]
 
-        for tp,(coord_start,coord_end) in parameters:
+        for tp,cw in parameters:
             with self.subTest(trip_part=tp):
                 #generate 2 trips with parameter values
-                trips = etmu.setTripConfig(2, [0, 0], [1, 1], trip_part=tp,threshold=generate_points_thresh) 
+                trips = etmm.generate_mock_trips('joe',2, threshold=similarity_threshold,origin=[0, 0], destination=[1, 1], within_threshold=2,trip_part=tp) 
                 # depending on the parameters, extract the relevant coordinates
-                trip0_coords = metric.extract_features(trips[0])[coord_start:coord_end]
-                trip1_coords = metric.extract_features(trips[1])[coord_start:coord_end]
+                trip0_coords = metric.extract_features(trips[0])
+                trip1_coords = metric.extract_features(trips[1])
                 #check for similarity using relevant coordinates
-                similarOD = metric.similar(trip0_coords,trip1_coords, similarity_threshold)
+                similarOD = metric.similar(trip0_coords,trip1_coords, similarity_threshold,cw)
                 # Since both origin and destination points lie within threshold limits, they should be similar
                 # when we check by just origin or just destination or both origin-and-destination
                 self.assertTrue(similarOD)
@@ -42,17 +40,17 @@ def testODsAreNotSimilar(self):
         #   a.origin, we pass first two values of this list,i.e. from 0 till before 2 index
         #   b.destination, we pass last two values of this list, i.e. from 2 till before 4 index
         #   c.origin-destination, we pass the entire list , i.e. from 0 till before 4 index
-        parameters= [(0,2),(2,4),[0,4]]
+        parameters= ['origin','destination','origin-destination']
         n=2
         #this generates 2 trips one-by-one, where each trip's respective origin and destination 
         # points are more than 500m away.
-        trips = [etmu.setTripConfig(1, (i, i), (i+1, i+1), 'od', 1)[0] for i in range(n)]    
+        trips = [ etmm.generate_mock_trips('joe',2, origin=[i, i], destination=[i+1, i+1], trip_part= 'od', within_threshold=1,threshold=500)[0] for i in range(n)]    
         trip0_coord = metric.extract_features(trips[0])
         trip1_coord = metric.extract_features(trips[1])
 
-        for (coord_start,coord_end) in parameters:
-            with self.subTest(coordinates=(coord_start,coord_end)):      
-                IsSimilar = metric.similar(trip0_coord[coord_start:coord_end],trip1_coord[coord_start:coord_end], similarity_threshold)
+        for cw in parameters:
+            with self.subTest(clustering_way=cw):      
+                IsSimilar = metric.similar(trip0_coord,trip1_coord, similarity_threshold,cw)
                 # Two trips with neither origin nor destination coordinates within the threshold
                 # must not be similar by any configuration of similarity testing.
                 self.assertFalse(IsSimilar)