Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/Segment trip time page #61

Merged
merged 10 commits into from
Mar 19, 2024
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@ specified in the following sections.
- `map_bubble`: User can view the bubble map in the Map page.
- `map_trip_lines`: User can view the trip lines map in the Map page.

### Segment Trip Time Page
- `segment_trip_time`: User can view this page. (default `true`)
- `segment_trip_time_full_trips`: User can see the table containing non-aggregated data (default `true`)
- `segment_trip_time_min_users`: Minimum number of distinct users required in the data before anything is displayed (value is a number, default `0`).

### Push Notification Page
- `push_send`: User can send push notifications in the Push Notification page.

Expand Down
10 changes: 10 additions & 0 deletions app_sidebar_collapsible.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,15 @@
href=dash.get_relative_path("/map"),
active="exact",
),
dbc.NavLink(
[
html.I(className="fas fa-solid fa-hourglass me-2"),
html.Span("Segment trip time"),
],
href=dash.get_relative_path("/segment_trip_time"),
active="exact",
style={'display': 'block' if has_permission('segment_trip_time') else 'none'},
),
dbc.NavLink(
[
html.I(className="fas fa-solid fa-envelope-open-text me-2"),
Expand Down Expand Up @@ -186,6 +195,7 @@ def update_store_uuids(start_date, end_date):
return store


# Note: this triggers twice on load, not great with a slow db
shankari marked this conversation as resolved.
Show resolved Hide resolved
@app.callback(
Output("store-trips", "data"),
Input('date-picker', 'start_date'),
Expand Down
2 changes: 1 addition & 1 deletion pages/home.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def compute_sign_up_trend(uuid_df):


def compute_trips_trend(trips_df, date_col):
trips_df[date_col] = pd.to_datetime(trips_df[date_col], utc=True)
trips_df[date_col] = pd.to_datetime(trips_df[date_col], utc=True, format='ISO8601')
trips_df[date_col] = pd.DatetimeIndex(trips_df[date_col]).date
res_df = (
trips_df
Expand Down
284 changes: 284 additions & 0 deletions pages/segment_trip_time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,284 @@
from dash import dcc, html, Input, Output, State, callback, register_page, dash_table
import dash_bootstrap_components as dbc
import dash_leaflet as dl
shankari marked this conversation as resolved.
Show resolved Hide resolved
import pandas as pd

import logging
import json

from utils.permissions import has_permission, permissions
from utils import db_utils

register_page(__name__, path="/segment_trip_time")

# Markdown shown at the top of the page: explains how to draw the start/end
# zones on the two maps and what the computed statistics mean.
intro = """
## Segment average trip time
This page displays some statistics on average trip duration between two selected zones.

### Usage
Using the polygon or square tools on the maps' menu, draw the start (left map) and end (right map) zones to consider.

Data will then be fetched for trips crossing the start zone and then the end zone.

Here are some tips on how to draw zones:
* Zones shouldn't cover more than one parallel road; otherwise, it is unclear which path the user took.
* A bigger zone will give more results, at the cost of lower accuracy in trip durations (the start point could be anywhere in the zone).
* For exhaustivity, zone length should somewhat match the distance a vehicle can cross at the maximum allowed speed in 30 seconds (sample rate).
* A smaller zone will give more accurate time results, but the number of trips might be too low to be significant.
* Zones can be moved and edited using the Edit layer menu, and they can be deleted with the Delete layer button.
* Please be advised that only the last added zone will be considered on each map. It is thus advised to delete existing zones before creating new ones.
"""


# Message shown when the query returns no rows (either no matching segments,
# or the anonymity guard suppressed them). The minimum-user threshold is read
# once at import time from the permissions config
# (key: 'segment_trip_time_min_users', default 0).
not_enough_data_message = f"""
Not enough segments could be found between endpoints. This means that the number of recorded trips going from start to end point is too low.
* There could be data, but on an insufficient number of users, breaking anonymity (minimum number of users is currently set to {permissions.get('segment_trip_time_min_users', 0)})
* You could try to increase the zone sizes, or chose different start and end points.
"""

# Initial viewport shared by both maps — presumably the program's deployment
# area; TODO confirm whether this should be configurable.
initial_maps_center = [32.7, -96.8]
initial_maps_zoom = 5
# Page layout: two side-by-side draw-enabled maps (start zone / end zone)
# above a 'message' div that the main callback fills with result tables.
layout = html.Div(
    [
        # Stores holding the latest drawn GeoJSON for each map, serialized as
        # JSON strings; initialized to empty FeatureCollections so the main
        # callback can always json.loads() them.
        dcc.Store(id='link-trip-time-start', data=json.dumps({"features": []})),
        dcc.Store(id='link-trip-time-end', data=json.dumps({"features": []})),
        dcc.Markdown(intro),
        dbc.Row(
            [
                dbc.Col(
                    [
                        html.H4('Start zone selection'),
                        dl.Map(
                            [
                                dl.TileLayer(),
                                dl.FeatureGroup([
                                    dl.EditControl(
                                        id="stt-edit-control-start",
                                        # Only area shapes (polygon/rectangle) make
                                        # sense for a zone; disable point/line tools.
                                        draw=dict(circle=False, marker=False, polyline=False, circlemarker=False)
                                    )
                                ])
                            ],
                            #[dl.TileLayer(), dl.LayerGroup(id='stt-trip-layer-start')],
                            id='stt-trip-map-start',
                            style={'height': '500px'},
                            center=initial_maps_center,
                            zoom=initial_maps_zoom
                        ),
                    ]
                ),
                dbc.Col(
                    [
                        html.H4('End zone selection'),
                        dl.Map(
                            [
                                dl.TileLayer(),
                                dl.FeatureGroup([
                                    dl.EditControl(
                                        id="stt-edit-control-end",
                                        draw=dict(circle=False, marker=False, polyline=False, circlemarker=False)
                                    )
                                ])
                            ],
                            id='stt-trip-map-end',
                            style={'height': '500px'},
                            center=initial_maps_center,
                            zoom=initial_maps_zoom
                        ),
                    ]
                ),
            ]
        ),
        # Results area, populated by generate_content_on_endpoints_change.
        dbc.Row(
            html.Div(id='message'),
        ),
    ]
)



@callback(
    Output('link-trip-time-start', 'data'),
    Input('stt-edit-control-start', 'geojson'),
    prevent_initial_call=True,
)
def map_start_draw(drawn_geojson):
    """Mirror the start-map EditControl's GeoJSON into its dcc.Store.

    The Store holds a JSON string, so the drawn FeatureCollection is
    serialized before being returned.
    """
    serialized = json.dumps(drawn_geojson)
    return serialized

@callback(
    Output('link-trip-time-end', 'data'),
    Input('stt-edit-control-end', 'geojson'),
    prevent_initial_call=True,
)
def map_end_draw(drawn_geojson):
    """Mirror the end-map EditControl's GeoJSON into its dcc.Store.

    The Store holds a JSON string, so the drawn FeatureCollection is
    serialized before being returned.
    """
    serialized = json.dumps(drawn_geojson)
    return serialized



def format_duration_df(df, time_column_name='Time sample'):
    """Turn a grouped duration/count frame into dash_table-ready records.

    ``df`` is expected to be the result of a
    ``groupby(...).agg({'duration': 'median', 'section': 'count'})`` call,
    whose index level(s) may include ``'mode'`` and/or a time bucket named
    ``'start_fmt_time'``. The index is flattened, columns are renamed to
    human-readable headers, and the columns are reordered as
    Mode / time / seconds / minutes / Count (skipping whichever of Mode or
    the time column is absent).

    Returns a list of row dicts (``to_dict('records')``).
    """
    df['Median time (minutes)'] = df.duration / 60  # seconds -> minutes
    renamed = df.reset_index().rename(
        columns={
            'start_fmt_time': time_column_name,
            'duration': 'Median time (seconds)',
            'section': 'Count',
            'mode': 'Mode',
        }
    )
    if time_column_name in renamed:
        # Optional leading Mode column, then the time bucket and the metrics.
        ordered_cols = (['Mode'] if 'Mode' in renamed else []) + [
            time_column_name,
            'Median time (seconds)',
            'Median time (minutes)',
            'Count',
        ]
    else:
        # Mode-only grouping: no time bucket in the index.
        ordered_cols = ['Mode', 'Median time (seconds)', 'Median time (minutes)', 'Count']
    return renamed[ordered_cols].to_dict('records')


@callback(
    Output('message', 'children'),
    Input('link-trip-time-start', 'data'),
    Input('link-trip-time-end', 'data'),
    prevent_initial_call=True,
)
def generate_content_on_endpoints_change(link_trip_time_start_str, link_trip_time_end_str):
    """Rebuild the results panel whenever either drawn zone changes.

    Both inputs are JSON-serialized GeoJSON FeatureCollections written by the
    map-draw callbacks. Returns '' while either zone is still empty, a
    "not enough data" message when the query yields no rows, and otherwise a
    two-column row: aggregated duration tables on the left, the raw
    (permission-gated) trip table on the right.
    """
    link_trip_time_start = json.loads(link_trip_time_start_str)
    link_trip_time_end = json.loads(link_trip_time_end_str)
    if len(link_trip_time_end["features"]) == 0 or len(link_trip_time_start["features"]) == 0:
        return ''

    # Warning: This is a database call, look here if there is a performance hog.
    # From initial tests, this seems to be performing well, without the need
    # to do geoqueries in memory.
    # Only the most recently drawn zone on each map is considered (as stated
    # in the page intro).
    df = db_utils.query_segments_crossing_endpoints(
        link_trip_time_start["features"][-1],
        link_trip_time_end["features"][-1],
    )
    total_nb_trips = df.shape[0]
    if total_nb_trips > 0:
        # Warning: Another db call here.
        # In theory, we could load all inferred_section modes in memory at start time,
        # instead of fetching them every time. However, when testing it, the operation
        # is quite heavy on the db and on ram. Querying only the sections we're
        # interested in keeps page load decent, especially when the section count is low.
        mode_by_section_id = db_utils.query_inferred_sections_modes(
            df[['section', 'user_id']].to_dict('records')
        )
        df['mode'] = df['section'].apply(
            lambda section_id: mode_by_section_id[str(section_id)].name
        )
        median_trip_time = df['duration'].median()
        # NOTE(review): hour/month buckets are derived in UTC; local time would
        # be friendlier (the upstream data has a data.start_local_dt field) —
        # TODO: switch to local time once it is surfaced by the query.
        times = pd.to_datetime(df['start_fmt_time'], errors='coerce', utc=True)
        duration_per_hour = format_duration_df(
            df.groupby(times.dt.hour).agg({'duration': 'median', 'section': 'count'}),
            time_column_name='Hour',
        )
        duration_per_mode = format_duration_df(
            df.groupby('mode').agg({'duration': 'median', 'section': 'count'})
        )
        duration_per_mode_per_hour = format_duration_df(
            df.groupby(['mode', times.dt.hour]).agg(
                {'duration': 'median', 'section': 'count'}
            ),
            time_column_name='Hour',
        )
        duration_per_mode_per_month = format_duration_df(
            df.groupby(['mode', times.dt.month]).agg(
                {'duration': 'median', 'section': 'count'}
            ),
            time_column_name='Month',
        )
        return dbc.Row(
            [
                dbc.Col(
                    [
                        html.Br(),
                        html.H3('Results'),
                        html.Div(
                            f'Computed median segment duration is {median_trip_time} seconds, {total_nb_trips} trips considered'
                        ),
                        html.Br(),
                        html.H4('Median segment duration by mode of transport'),
                        dash_table.DataTable(
                            id='duration_per_mode',
                            data=duration_per_mode,
                            sort_action='native',
                            sort_mode='multi',
                            export_format='csv',
                        ),
                        html.Br(),
                        html.H4(
                            'Median segment duration by hour of the day (UTC)'
                        ),
                        dash_table.DataTable(
                            id='duration_per_hour',
                            data=duration_per_hour,
                            sort_action='native',
                            sort_mode='multi',
                            export_format='csv',
                        ),
                        html.Br(),
                        html.H4(
                            'Median segment duration by mode and hour of the day (UTC)'
                        ),
                        dash_table.DataTable(
                            id='duration_per_mode_per_hour',
                            data=duration_per_mode_per_hour,
                            sort_action='native',
                            sort_mode='multi',
                            export_format='csv',
                        ),
                        html.Br(),
                        html.H4('Median segment duration by mode and month'),
                        dash_table.DataTable(
                            id='duration_per_mode_per_month',
                            data=duration_per_mode_per_month,
                            sort_action='native',
                            sort_mode='multi',
                            export_format='csv',
                        ),
                    ],
                    xs=6,
                ),
                dbc.Col(
                    [
                        html.Br(),
                        html.H3('Trips Data'),
                        dash_table.DataTable(
                            id='trips_data',
                            data=df[
                                ['start_fmt_time', 'end_fmt_time', 'mode', 'duration']
                            ].to_dict('records'),
                            page_size=15,
                            sort_action='native',
                            sort_mode='multi',
                            export_format='csv',
                        ),
                    ],
                    xs=6,
                    # Raw, non-aggregated trips are permission-gated.
                    style={
                        'display': 'block'
                        if has_permission('segment_trip_time_full_trips')
                        else 'none'
                    },
                ),
            ]
        )
    return [html.H3('Results'), dcc.Markdown(not_enough_data_message)]
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ python-jose==3.3.0
flask==2.2.5
flask-talisman==1.0.0
dash_auth==2.0.0
dash-leaflet==1.0.7
44 changes: 42 additions & 2 deletions utils/db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@

import emission.core.get_database as edb
import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.timeseries.aggregate_timeseries as estag
import emission.storage.timeseries.timequery as estt
import emission.storage.timeseries.geoquery as estg
import emission.storage.decorations.section_queries as esds
import emission.core.wrapper.modeprediction as ecwm

from utils import constants
from utils import permissions as perm_utils
Expand Down Expand Up @@ -47,7 +51,7 @@ def query_uuids(start_date, end_date):
return df

def query_confirmed_trips(start_date, end_date):
start_ts, end_ts = None, datetime.max.timestamp()
start_ts, end_ts = None, datetime.max.replace(tzinfo=timezone.utc).timestamp()
if start_date is not None:
start_ts = datetime.combine(start_date, datetime.min.time()).timestamp()

Expand Down Expand Up @@ -158,4 +162,40 @@ def add_user_stats(user_data):
if last_call != -1:
user['last_call'] = arrow.get(last_call).format(time_format)

return user_data
return user_data

def query_segments_crossing_endpoints(poly_region_start, poly_region_end):
    """Find trip segments that cross the start polygon and later the end polygon.

    Args:
        poly_region_start: GeoJSON feature for the start zone.
        poly_region_end: GeoJSON feature for the end zone.

    Returns:
        A DataFrame with one row per matching section, carrying at least
        'section', 'duration' (seconds between the two zone crossings),
        'mode', 'start_fmt_time', 'end_fmt_time' and 'user_id'. An empty
        DataFrame is returned when either zone matches no locations, or when
        fewer distinct users than the 'segment_trip_time_min_users' permission
        threshold are represented (anonymity guard).
    """
    agg_ts = estag.AggregateTimeSeries().get_aggregate_time_series()

    locs_matching_start = agg_ts.get_data_df("analysis/recreated_location", geo_query = estg.GeoQuery(['data.loc'], poly_region_start))
    # One location per section is enough to know the section crossed the zone.
    locs_matching_start = locs_matching_start.drop_duplicates(subset=['section'])
    if locs_matching_start.empty:
        return locs_matching_start

    locs_matching_end = agg_ts.get_data_df("analysis/recreated_location", geo_query = estg.GeoQuery(['data.loc'], poly_region_end))
    locs_matching_end = locs_matching_end.drop_duplicates(subset=['section'])
    if locs_matching_end.empty:
        return locs_matching_end

    # Inner merge keeps only sections seen in BOTH zones. This is equivalent
    # to the previous how='outer' merge because the NaN-padded rows produced
    # by an outer merge always fail the idx_x < idx_y comparison below, but
    # it avoids materializing them at all.
    merged = locs_matching_start.merge(locs_matching_end, how='inner', on=['section'])
    # Keep only trips that crossed the start zone BEFORE the end zone.
    filtered = merged.loc[merged['idx_x']<merged['idx_y']].copy()
    filtered['duration'] = filtered['ts_y'] - filtered['ts_x']
    filtered['mode'] = filtered['mode_x']
    filtered['start_fmt_time'] = filtered['fmt_time_x']
    filtered['end_fmt_time'] = filtered['fmt_time_y']
    filtered['user_id'] = filtered['user_id_y']

    number_user_seen = filtered.user_id_x.nunique()

    # Anonymity guard: suppress results built from too few distinct users.
    if perm_utils.permissions.get("segment_trip_time_min_users", 0) <= number_user_seen:
        return filtered
    return pd.DataFrame()

# The following query can be called multiple times, let's open db only once
# NOTE(review): this handle is not referenced anywhere in this module's
# visible code — presumably esds manages its own connection; verify before
# removing.
analysis_timeseries_db = edb.get_analysis_timeseries_db()

# Fetches sensed_mode for each section in a list
# sections format example: [{'section': ObjectId('648d02b227fd2bb6635414a0'), 'user_id': UUID('6d7edf29-8b3f-451b-8d66-984cb8dd8906')}]
def query_inferred_sections_modes(sections):
    """Return the inferred mode for each given section.

    Thin wrapper around esds.cleaned2inferred_section_list. Callers index the
    result by str(section_id) and read the `.name` attribute of each value.
    """
    return esds.cleaned2inferred_section_list(sections)