Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Moving dependence from custom branch's tour_model to master's trip_model #933

Merged
merged 10 commits into from
Sep 14, 2023
29 changes: 25 additions & 4 deletions emission/analysis/modelling/similarity/od_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,28 @@ class OriginDestinationSimilarity(eamss.SimilarityMetric):
def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]:
    """Build the feature row this metric uses for a single trip.

    The work is delegated entirely to ``ctfe.od_features``.

    :param trip: the confirmed trip to turn into features
    :return: the list produced by ``ctfe.od_features`` for this trip
        (presumably origin and destination coordinates -- confirm
        against ``od_features``)
    """
    od_feature_row = ctfe.od_features(trip)
    return od_feature_row

def similarity(self, a: List[float], b: List[float]) -> List[float]:
    """Return the origin and destination distances between two trips.

    :param a: feature row of one trip; elements 0-1 are compared as the
        origin point and elements 2-3 as the destination point
    :param b: feature row of the other trip, same layout as ``a``
    :return: ``[origin distance, destination distance]`` as computed by
        ``ecc.calDistance``
    """
    # Origins first, then destinations, matching the order of the result.
    origin_a, origin_b = [a[0], a[1]], [b[0], b[1]]
    origin_gap = ecc.calDistance(origin_a, origin_b)

    dest_a, dest_b = [a[2], a[3]], [b[2], b[3]]
    dest_gap = ecc.calDistance(dest_a, dest_b)

    return [origin_gap, dest_gap]
def similarity(self, a: List[float], b: List[float], clustering_way='origin-destination') -> List[float]:
    """
    Compute the distance(s) between the origins and/or destinations of two trips.

    a : a list of point features for the first trip that takes the form
        [point1_longitude,point1_latitude,point2_longitude,point2_latitude]
        where point1 (elements 0-1) is compared as the origin and point2
        (elements 2-3) as the destination.

    b : a list of point features for the second trip, same layout as a.

    clustering_way : takes one among 'origin', 'destination', 'origin-destination' as value.
        tells the part of the trip to be used for binning trips together if that
        part lies within threshold.
        Any other value (e.g. the easy-to-make typo 'origin_destination') is
        handled like 'destination', as before, but a warning is now logged so
        the misconfiguration does not go unnoticed.

    return: the distances computed by ecc.calDistance, as a list:
        [origin distance]                       for 'origin'
        [destination distance]                  for 'destination'
        [origin distance, destination distance] for 'origin-destination'
    """
    # Local import: only the lines of this method are visible here, so do not
    # rely on the module already importing logging.
    import logging

    origin_dist = ecc.calDistance(a[0:2], b[0:2])
    destination_dist = ecc.calDistance(a[2:4], b[2:4])

    if clustering_way == 'origin-destination':
        return [origin_dist, destination_dist]
    if clustering_way == 'origin':
        return [origin_dist]

    if clustering_way != 'destination':
        # Keep the historical fallback (destination only) so existing callers
        # get the same result, but make the unexpected value visible.
        logging.warning(
            "unknown clustering_way %r; expected 'origin', 'destination' or "
            "'origin-destination'. Falling back to 'destination'",
            clustering_way)
    return [destination_dist]
17 changes: 12 additions & 5 deletions emission/analysis/modelling/similarity/similarity_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,25 +17,32 @@ def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]:
pass

@abstractmethod
def similarity(self, a: List[float], b: List[float]) -> List[float]:
def similarity(self, a: List[float], b: List[float], clustering_way = 'origin-destination') -> List[float]:
"""compares the features, producing their similarity
as computed by this similarity metric

:param a: features for a trip
:param b: features for another trip
:param clustering_way : takes one among 'origin', 'destination', 'origin-destination' as value.
tells the part of the trip to be used for binning trips together if that
part lies within a threshold.
:return: for each feature, the similarity of these features
"""
pass

def similar(self, a: List[float], b: List[float], thresh: float) -> bool:
def similar(self, a: List[float], b: List[float], thresh: float, clustering_way= 'origin-destination') -> bool:
"""compares the features, returning true if they are similar
within some threshold

:param a: features for a trip
:param a: features for a trip
:param b: features for another trip
:param thresh: threshold for similarity
:param clustering_way : takes one among 'origin', 'destination', 'origin-destination' as value.
tells the part of the trip to be used for binning trips together if that
part lies within a threshold.
:return: true if the feature similarity is within some threshold
"""
similarity_values = self.similarity(a, b)
is_similar = all(map(lambda sim: sim <= thresh, similarity_values))
shankari marked this conversation as resolved.
Show resolved Hide resolved
similarity_values = self.similarity(a, b, clustering_way)
is_similar = all(sim <= thresh for sim in similarity_values)
shankari marked this conversation as resolved.
Show resolved Hide resolved

return is_similar
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,11 @@ class label to apply:
self.sim_thresh = config['similarity_threshold_meters']
self.apply_cutoff = config['apply_cutoff']
self.is_incremental = config['incremental_evaluation']
if config.get('clustering_way') is None:
shankari marked this conversation as resolved.
Show resolved Hide resolved
self.clusteringWay='origin-destination' # previous default
else:
self.clusteringWay= config['clustering_way']
self.tripLabels=[]

self.bins: Dict[str, Dict] = {}

Expand Down Expand Up @@ -184,9 +189,11 @@ def _assign_bins(self, trips: List[ecwc.Confirmedtrip]):
logging.debug(f"adding trip to bin {bin_id} with features {trip_features}")
self.bins[bin_id]['feature_rows'].append(trip_features)
self.bins[bin_id]['labels'].append(trip_labels)
self.tripLabels.append(bin_id)
else:
# create new bin
new_bin_id = str(len(self.bins))
self.tripLabels.append(new_bin_id)
new_bin_record = {
'feature_rows': [trip_features],
'labels': [trip_labels],
Expand All @@ -200,14 +207,15 @@ def _find_matching_bin_id(self, trip_features: List[float]) -> Optional[str]:
finds an existing bin where all bin features are "similar" to the incoming
trip features.

:param trip_features: feature row for the incoming trip
:param trip_features: feature row for the incoming trip.
takes the form [orig_lat, orig_lon, dest_lat, dest_lon]
:return: the id of a bin if a match was found, otherwise None
"""
for bin_id, bin_record in self.bins.items():
matches_bin = all([self.metric.similar(trip_features, bin_sample, self.sim_thresh)
for bin_sample in bin_record['feature_rows']])
if matches_bin:
return bin_id
matches_bin = all([self.metric.similar(trip_features, bin_sample, self.sim_thresh,self.clusteringWay)
for bin_sample in bin_record['feature_rows']])
if matches_bin:
return bin_id
return None

def _nearest_bin(self, trip: ecwc.Confirmedtrip) -> Tuple[Optional[int], Optional[Dict]]:
Expand Down
5 changes: 5 additions & 0 deletions emission/tests/modellingTests/TestBackwardsCompat.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def testAnyVsAllWhilePredicting(self):
"metric": "od_similarity",
"similarity_threshold_meters": 16000, # meters,
"apply_cutoff": False,
"clustering_way": 'origin-destination',
"incremental_evaluation": False
}
new_builder = eamtg.GreedySimilarityBinning(model_config)
Expand Down Expand Up @@ -96,6 +97,7 @@ def testRandomTripsWithinTheSameThreshold(self):
trips=n,
origin=(0, 0),
destination=(1, 1),
trip_part='od',
label_data=label_data,
threshold=0.001, # ~ 111 meters in degrees WGS84
)
Expand All @@ -113,6 +115,7 @@ def testRandomTripsWithinTheSameThreshold(self):
"metric": "od_similarity",
"similarity_threshold_meters": 500, # meters,
"apply_cutoff": False,
"clustering_way": 'origin-destination',
"incremental_evaluation": False
}
new_model = eamtg.GreedySimilarityBinning(model_config)
Expand Down Expand Up @@ -156,6 +159,7 @@ def testRandomTripsOutsideTheSameThreshold(self):
trips=n,
origin=(0, 0),
destination=(1, 1),
trip_part='od',
label_data=label_data,
threshold=0.1, # Much bigger than the 500m threshold, so we will get multiple bins
)
Expand All @@ -173,6 +177,7 @@ def testRandomTripsOutsideTheSameThreshold(self):
"metric": "od_similarity",
"similarity_threshold_meters": 500, # meters,
"apply_cutoff": False,
"clustering_way": 'origin-destination',
"incremental_evaluation": False
}
new_model = eamtg.GreedySimilarityBinning(model_config)
Expand Down
176 changes: 119 additions & 57 deletions emission/tests/modellingTests/TestGreedySimilarityBinning.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import unittest
import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg
import emission.tests.modellingTests.modellingTestAssets as etmm

import logging


Expand All @@ -10,44 +11,111 @@ def setUp(self) -> None:
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
level=logging.DEBUG)

def testBinning(self):
def testNoBinning(self):
"""
when $should_be_grouped trips are the same, they should appear in a bin
Tests the three (origin, destination and origin-destination based)
binning configuration for trips.

When the origin and destination points of trips are outside a threshold
limit, none of the trips should be binned with the other in any of the three
configs (origin, destination or origin-and-destination based).
"""

# generate $n trips.
n = 20
binning_threshold=500
#this generates 20 trips one-by-one, where each trip's respective origin and destination
# points are more than 500m away.


label_data = {
"mode_confirm": ['walk', 'bike', 'transit'],
"purpose_confirm": ['work', 'home', 'school'],
"replaced_mode": ['drive']
}
}


trips =etmm.generate_mock_trips(
user_id="joe",
trips=n,
trip_part='__',
label_data=label_data,
within_threshold=1,
threshold=binning_threshold,
origin=(0,0),
destination=(1,1)
)

# parameters passed for testing. A list, where each element is one way of clustering
clustering_ways_paramters= ["origin","destination","origin-destination"]

#Testing each of the three clustering_ways by passing them as parameters
for cw in clustering_ways_paramters:
with self.subTest(clustering_way=cw):
#initialise the binning model and fit with previously generated trips
model_config = {
"metric": "od_similarity",
"similarity_threshold_meters": binning_threshold, # meters,
"apply_cutoff": False,
"clustering_way": cw,
"incremental_evaluation": False
}
model= eamtg.GreedySimilarityBinning(model_config)
model.fit(trips)
#check each bins for no of trips
no_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model.bins.values()))
#Since all trips were sampled outside the threshold, there should be no bin
# with more than 1 trip
self.assertTrue(no_large_bin,"no bin should have more than 1 features in it")

# generate $n trips. $m of them should have origin and destinations sampled
def testBinning(self):
"""
Tests the three (origin, destination and origin-destination based)
binning configuration for trips.

When the points lie within threshold ,the trips are binned together.
"""
# generate $n trips. $m of them should have origin sampled
# within a radius that should have them binned.
n = 20
m = 5
trips = etmm.generate_mock_trips(
user_id="joe",
trips=n,
origin=(0, 0),
destination=(1, 1),
label_data=label_data,
within_threshold=m,
threshold=0.001, # ~ 111 meters in degrees WGS84
)

# pass in a test configuration to the binning algorithm
model_config = {
"metric": "od_similarity",
"similarity_threshold_meters": 500, # meters,
"apply_cutoff": False,
"incremental_evaluation": False
binning_threshold=500
label_data = {
"mode_confirm": ['walk', 'bike', 'transit'],
"purpose_confirm": ['work', 'home', 'school'],
"replaced_mode": ['drive']
}
model = eamtg.GreedySimilarityBinning(model_config)

model.fit(trips)

# $m trip features should appear together in one bin
at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model.bins.values()))
self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it")
# parameters passed for testing. A list, where each element of this list takes the form
# [trip part to be sampled within mentioned threshold , clustering way used to check similarity]
parameters= [["o_",'origin'],["_d",'destination'],["od",'origin-destination']]
for tp,cw in parameters:
with self.subTest(trip_part=tp,clustering_way=cw):
shankari marked this conversation as resolved.
Show resolved Hide resolved
#generate random trips using utilities
trips =etmm.generate_mock_trips(
user_id="joe",
trips=n,
trip_part=tp,
label_data=label_data,
within_threshold=m,
threshold=binning_threshold,
origin=(0,0),
destination=(1,1)
)
#initialise the binning model and fit with previously generated trips
model_config = {
"metric": "od_similarity" ,
"similarity_threshold_meters": binning_threshold, # meters,
"apply_cutoff": False,
"clustering_way": cw,
"incremental_evaluation": False
}
model = eamtg.GreedySimilarityBinning(model_config)
model.fit(trips)
#check each bins for no of trips
at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model.bins.values()))
#Since 5 trips were sampled within the threshold, there should be one bin with 5 trips
self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it")

def testPrediction(self):
"""
Expand All @@ -60,23 +128,24 @@ def testPrediction(self):
}

n = 6
trips = etmm.generate_mock_trips(
user_id="joe",
trips=n,
origin=(0, 0),
destination=(1, 1),
label_data=label_data,
threshold=0.001, # ~ 111 meters in degrees WGS84
)

trips =etmm.generate_mock_trips(
user_id="joe",
trips=n,
trip_part='od',
label_data=label_data,
within_threshold=n,
threshold=500,
origin=(0,0),
destination=(1,1)
)
model_config = {
"metric": "od_similarity",
"similarity_threshold_meters": 500, # meters,
"similarity_threshold_meters": 500, # meters,
"apply_cutoff": False,
"clustering_way": 'origin_destination',
"incremental_evaluation": False
}
model = eamtg.GreedySimilarityBinning(model_config)

model= eamtg.GreedySimilarityBinning(model_config)
train = trips[0:5]
test = trips[5]

Expand All @@ -95,33 +164,26 @@ def testNoPrediction(self):
"purpose_confirm": ['pizza_party'],
"replaced_mode": ['crabwalking']
}

n = 5
train = etmm.generate_mock_trips(
user_id="joe",
trips=n,
origin=(39.7645187, -104.9951944), # Denver, CO
destination=(39.7435206, -105.2369292), # Golden, CO
label_data=label_data,
threshold=0.001, # ~ 111 meters in degrees WGS84
binning_threshold = 500
train = etmm.generate_mock_trips( user_id="joe",trips=n, origin=(39.7645187, -104.9951944), # Denver, CO
destination=(39.7435206, -105.2369292), # Golden, CO
trip_part='od', label_data=label_data,
threshold=binning_threshold, within_threshold=n
)
test = etmm.generate_mock_trips(
user_id="joe",
trips=1,
origin=(61.1042262, -150.5611644), # Anchorage, AK
destination=(62.2721466, -150.3233046), # Talkeetna, AK
label_data=label_data,
threshold=0.001, # ~ 111 meters in degrees WGS84
test = etmm.generate_mock_trips( user_id="amanda",trips=n, origin=(61.1042262, -150.5611644), # Anchorage, AK
destination=(62.2721466, -150.3233046), # Talkeetna, AK
trip_part='od', label_data=label_data,
threshold=binning_threshold, within_threshold=n
)

model_config = {
"metric": "od_similarity",
"similarity_threshold_meters": 500, # meters,
"similarity_threshold_meters": 500, # meters,
"apply_cutoff": False,
"clustering_way": 'origin_destination',
"incremental_evaluation": False
}
model = eamtg.GreedySimilarityBinning(model_config)

model= eamtg.GreedySimilarityBinning(model_config)
model.fit(train)
results, n = model.predict(test[0])

Expand Down
Loading