Dist Filter

fixed dist name
e-mission · Nov 8, 2024 · c48a655 · c48a655
1 parent 0382abf
commit c48a655
Showing 1 changed file with 178 additions and 82 deletions.
diff --git a/.../analysis/intake/segmentation/trip_segmentation_methods/dwell_segmentation_dist_filter.py b/.../analysis/intake/segmentation/trip_segmentation_methods/dwell_segmentation_dist_filter.py
@@ -12,6 +12,7 @@
 import attrdict as ad
 import numpy as np
 import datetime as pydt
+import time
 
 # Our imports
 import emission.analysis.point_features as pf
@@ -20,6 +21,9 @@
 
 import emission.analysis.intake.segmentation.restart_checking as eaisr
 import emission.analysis.intake.segmentation.trip_segmentation_methods.trip_end_detection_corner_cases as eaistc
+import emission.storage.decorations.stats_queries as esds
+import emission.core.timer as ect
+import emission.core.wrapper.pipelinestate as ecwp
 
 class DwellSegmentationDistFilter(eaist.TripSegmentationMethod):
     def __init__(self, time_threshold, point_threshold, distance_threshold):
@@ -46,9 +50,34 @@ def segment_into_trips(self, timeseries, time_query):
         data that they want from the sensor streams in order to determine the
         segmentation points.
         """
-        self.filtered_points_df = timeseries.get_data_df("background/filtered_location", time_query)
-        self.filtered_points_df.loc[:,"valid"] = True
-        self.transition_df = timeseries.get_data_df("statemachine/transition", time_query)
+        with ect.Timer() as t_get_filtered_points:
+            self.filtered_points_df = timeseries.get_data_df("background/filtered_location", time_query)
+            user_id = self.filtered_points_df["user_id"].iloc[0]
+        esds.store_pipeline_time(
+            user_id,
+            ecwp.PipelineStages.TRIP_SEGMENTATION.name + "/segment_into_trips_dist/get_filtered_points_df",
+            time.time(),
+            t_get_filtered_points.elapsed
+        )
+
+        with ect.Timer() as t_mark_valid:
+            self.filtered_points_df.loc[:, "valid"] = True
+        esds.store_pipeline_time(
+            user_id,
+            ecwp.PipelineStages.TRIP_SEGMENTATION.name + "/segment_into_trips_dist/mark_valid",
+            time.time(),
+            t_mark_valid.elapsed
+        )
+
+        with ect.Timer() as t_get_transition_df:
+            self.transition_df = timeseries.get_data_df("statemachine/transition", time_query)
+        esds.store_pipeline_time(
+            user_id,
+            ecwp.PipelineStages.TRIP_SEGMENTATION.name + "/segment_into_trips_dist/get_transition_df",
+            time.time(),
+            t_get_transition_df.elapsed
+        )
+
         if len(self.transition_df) > 0:
             logging.debug("self.transition_df = %s" % self.transition_df[["fmt_time", "transition"]])
         else:
@@ -62,86 +91,153 @@ def segment_into_trips(self, timeseries, time_query):
         last_trip_end_point = None
         curr_trip_start_point = None
         just_ended = True
-        for idx, row in self.filtered_points_df.iterrows():
-            currPoint = ad.AttrDict(row)
-            currPoint.update({"idx": idx})
-            logging.debug("-" * 30 + str(currPoint.fmt_time) + "-" * 30)
-            if curr_trip_start_point is None:
-                logging.debug("Appending currPoint because the current start point is None")
-                # segmentation_points.append(currPoint)
-
-            if just_ended:
-                if self.continue_just_ended(idx, currPoint, self.filtered_points_df):
-                    # We have "processed" the currPoint by deciding to glom it
-                    self.last_ts_processed = currPoint.metadata_write_ts
-                    continue
-                # else: 
-                # Here's where we deal with the start trip. At this point, the
-                # distance is greater than the filter. 
-                sel_point = currPoint
-                logging.debug("Setting new trip start point %s with idx %s" % (sel_point, sel_point.idx))
-                curr_trip_start_point = sel_point
-                just_ended = False
-            else:
-                # Using .loc here causes problems if we have filtered out some points and so the index is non-consecutive.
-                # Using .iloc just ends up including points after this one.
-                # So we reset_index upstream and use it here.
-                last10Points_df = self.filtered_points_df.iloc[max(idx-self.point_threshold, curr_trip_start_point.idx):idx+1]
-                lastPoint = self.find_last_valid_point(idx)
-                if self.has_trip_ended(lastPoint, currPoint, timeseries):
-                    last_trip_end_point = lastPoint
-                    logging.debug("Appending last_trip_end_point %s with index %s " %
-                        (last_trip_end_point, idx-1))
-                    segmentation_points.append((curr_trip_start_point, last_trip_end_point))
-                    logging.info("Found trip end at %s" % last_trip_end_point.fmt_time)
-                    # We have processed everything up to the trip end by marking it as a completed trip
-                    self.last_ts_processed = currPoint.metadata_write_ts
-                    just_ended = True
-                    # Now, we have finished processing the previous point as a trip
-                    # end or not. But we still need to process this point by seeing
-                    # whether it should represent a new trip start, or a glom to the
-                    # previous trip
-                    if not self.continue_just_ended(idx, currPoint, self.filtered_points_df):
-                        sel_point = currPoint
-                        logging.debug("Setting new trip start point %s with idx %s" % (sel_point, sel_point.idx))
+
+        with ect.Timer() as t_loop:
+            for idx, row in self.filtered_points_df.iterrows():
+                currPoint = ad.AttrDict(row)
+                currPoint.update({"idx": idx})
+                logging.debug("-" * 30 + str(currPoint.fmt_time) + "-" * 30)
+
+                if curr_trip_start_point is None:
+                    logging.debug("Appending currPoint because the current start point is None")
+                    # segmentation_points.append(currPoint)
+
+                if just_ended:
+                    with ect.Timer() as t_continue_just_ended:
+                        continue_flag = self.continue_just_ended(idx, currPoint, self.filtered_points_df)
+                    esds.store_pipeline_time(
+                        user_id,
+                        ecwp.PipelineStages.TRIP_SEGMENTATION.name + "/segment_into_trips_dist/continue_just_ended",
+                        time.time(),
+                        t_continue_just_ended.elapsed
+                    )
+
+                    if continue_flag:
+                        # We have "processed" the currPoint by deciding to glom it
+                        self.last_ts_processed = currPoint.metadata_write_ts
+                        continue
+                    # else:
+                    sel_point = currPoint
+                    logging.debug("Setting new trip start point %s with idx %s" % (sel_point, sel_point.idx))
+                    with ect.Timer() as t_set_start_point:
                         curr_trip_start_point = sel_point
-                        just_ended = False
-
-        # Since we only end a trip when we start a new trip, this means that
-        # the last trip that was pushed is ignored. Consider the example of
-        # 2016-02-22 when I took kids to karate. We arrived shortly after 4pm,
-        # so during that remote push, a trip end was not detected. And we got
-        # back home shortly after 5pm, so the trip end was only detected on the
-        # phone at 6pm. At that time, the following points were pushed:
-        # ..., 2016-02-22T16:04:02, 2016-02-22T16:49:34, 2016-02-22T16:49:50,
-        # ..., 2016-02-22T16:57:04
-        # Then, on the server, while iterating through the points, we detected
-        # a trip end at 16:04, and a new trip start at 16:49. But we did not
-        # detect the trip end at 16:57, because we didn't start a new point.
-        # This has two issues:
-        # - we won't see this trip until the next trip start, which may be on the next day
-        # - we won't see this trip at all, because when we run the pipeline the
-        # next time, we will only look at points from that time onwards. These
-        # points have been marked as "processed", so they won't even be considered.
-
-        # There are multiple potential fixes:
-        # - we can mark only the completed trips as processed. This will solve (2) above, but not (1)
-        # - we can mark a trip end based on the fact that we only push data
-        # when a trip ends, so if we have data, it means that the trip has been
-        # detected as ended on the phone.
-        # This seems a bit fragile - what if we start pushing incomplete trip
-        # data for efficiency reasons? Therefore, we also check to see if there
-        # is a trip_end_detected in this timeframe after the last point. If so,
-        # then we end the trip at the last point that we have.
-        if not just_ended and len(self.transition_df) > 0:
-            stopped_moving_after_last = self.transition_df[(self.transition_df.ts > currPoint.ts) & (self.transition_df.transition == 2)]
-            logging.debug("stopped_moving_after_last = %s" % stopped_moving_after_last[["fmt_time", "transition"]])
-            if len(stopped_moving_after_last) > 0:
-                logging.debug("Found %d transitions after last point, ending trip..." % len(stopped_moving_after_last))
-                segmentation_points.append((curr_trip_start_point, currPoint))
-                self.last_ts_processed = currPoint.metadata_write_ts
-            else:
-                logging.debug("Found %d transitions after last point, not ending trip..." % len(stopped_moving_after_last))
+                    esds.store_pipeline_time(
+                        user_id,
+                        ecwp.PipelineStages.TRIP_SEGMENTATION.name + "/segment_into_trips_dist/set_new_trip_start_point",
+                        time.time(),
+                        t_set_start_point.elapsed
+                    )
+                    just_ended = False
+                else:
+                    with ect.Timer() as t_process_trip:
+                        # Using .loc here causes problems if we have filtered out some points and so the index is non-consecutive.
+                        # Using .iloc just ends up including points after this one.
+                        # So we reset_index upstream and use it here.
+                        last10Points_df = self.filtered_points_df.iloc[
+                            max(idx - self.point_threshold, curr_trip_start_point.idx):idx + 1
+                        ]
+                        lastPoint = self.find_last_valid_point(idx)
+                        with ect.Timer() as t_has_trip_ended:
+                            trip_ended = self.has_trip_ended(lastPoint, currPoint, timeseries)
+                        esds.store_pipeline_time(
+                            user_id,
+                            ecwp.PipelineStages.TRIP_SEGMENTATION.name + "/segment_into_trips_dist/has_trip_ended",
+                            time.time(),
+                            t_has_trip_ended.elapsed
+                        )
+
+                        if trip_ended:
+                            with ect.Timer() as t_get_last_trip_end_point:
+                                last_trip_end_point = lastPoint
+                                logging.debug("Appending last_trip_end_point %s with index %s " %
+                                            (last_trip_end_point, idx - 1))
+                                segmentation_points.append((curr_trip_start_point, last_trip_end_point))
+                                logging.info("Found trip end at %s" % last_trip_end_point.fmt_time)
+                                # We have processed everything up to the trip end by marking it as a completed trip
+                                self.last_ts_processed = currPoint.metadata_write_ts
+                            esds.store_pipeline_time(
+                                user_id,
+                                ecwp.PipelineStages.TRIP_SEGMENTATION.name + "/segment_into_trips_dist/get_last_trip_end_point",
+                                time.time(),
+                                t_get_last_trip_end_point.elapsed
+                            )
+
+                            with ect.Timer() as t_handle_trip_end:
+                                just_ended = True
+                                # Now, we have finished processing the previous point as a trip
+                                # end or not. But we still need to process this point by seeing
+                                # whether it should represent a new trip start, or a glom to the
+                                # previous trip
+                                if not self.continue_just_ended(idx, currPoint, self.filtered_points_df):
+                                    sel_point = currPoint
+                                    logging.debug("Setting new trip start point %s with idx %s" % (sel_point, sel_point.idx))
+                                    curr_trip_start_point = sel_point
+                                    just_ended = False
+                            esds.store_pipeline_time(
+                                user_id,
+                                ecwp.PipelineStages.TRIP_SEGMENTATION.name + "/segment_into_trips_dist/handle_trip_end",
+                                time.time(),
+                                t_handle_trip_end.elapsed
+                            )
+        esds.store_pipeline_time(
+            user_id,
+            ecwp.PipelineStages.TRIP_SEGMENTATION.name + "/segment_into_trips_dist/loop",
+            time.time(),
+            t_loop.elapsed
+        )
+
+        with ect.Timer() as t_post_loop:
+            # Since we only end a trip when we start a new trip, this means that
+            # the last trip that was pushed is ignored. Consider the example of
+            # 2016-02-22 when I took kids to karate. We arrived shortly after 4pm,
+            # so during that remote push, a trip end was not detected. And we got
+            # back home shortly after 5pm, so the trip end was only detected on the
+            # phone at 6pm. At that time, the following points were pushed:
+            # ..., 2016-02-22T16:04:02, 2016-02-22T16:49:34, 2016-02-22T16:49:50,
+            # ..., 2016-02-22T16:57:04
+            # Then, on the server, while iterating through the points, we detected
+            # a trip end at 16:04, and a new trip start at 16:49. But we did not
+            # detect the trip end at 16:57, because we didn't start a new point.
+            # This has two issues:
+            # - we won't see this trip until the next trip start, which may be on the next day
+            # - we won't see this trip at all, because when we run the pipeline the
+            # next time, we will only look at points from that time onwards. These
+            # points have been marked as "processed", so they won't even be considered.
+
+            # There are multiple potential fixes:
+            # - we can mark only the completed trips as processed. This will solve (2) above, but not (1)
+            # - we can mark a trip end based on the fact that we only push data
+            # when a trip ends, so if we have data, it means that the trip has been
+            # detected as ended on the phone.
+            # This seems a bit fragile - what if we start pushing incomplete trip
+            # data for efficiency reasons? Therefore, we also check to see if there
+            # is a trip_end_detected in this timeframe after the last point. If so,
+            # then we end the trip at the last point that we have.
+            if not just_ended and len(self.transition_df) > 0:
+                with ect.Timer() as t_check_transitions:
+                    stopped_moving_after_last = self.transition_df[
+                        (self.transition_df.ts > currPoint.ts) & (self.transition_df.transition == 2)
+                    ]
+                    logging.debug("stopped_moving_after_last = %s" % stopped_moving_after_last[["fmt_time", "transition"]])
+                    if len(stopped_moving_after_last) > 0:
+                        logging.debug("Found %d transitions after last point, ending trip..." % len(stopped_moving_after_last))
+                        segmentation_points.append((curr_trip_start_point, currPoint))
+                        self.last_ts_processed = currPoint.metadata_write_ts
+                    else:
+                        logging.debug("Found %d transitions after last point, not ending trip..." % len(stopped_moving_after_last))
+                esds.store_pipeline_time(
+                    user_id,
+                    ecwp.PipelineStages.TRIP_SEGMENTATION.name + "/segment_into_trips_dist/check_transitions_post_loop",
+                    time.time(),
+                    t_check_transitions.elapsed
+                )
+        esds.store_pipeline_time(
+            user_id,
+            ecwp.PipelineStages.TRIP_SEGMENTATION.name + "/segment_into_trips_dist/post_loop",
+            time.time(),
+            t_post_loop.elapsed
+        )
+
         return segmentation_points
 
     def has_trip_ended(self, lastPoint, currPoint, timeseries):