Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/Segment trip time page #61

Merged
merged 10 commits into from
Mar 19, 2024
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@ specified in the following sections.
- `map_bubble`: User can view the bubble map in the Map page.
- `map_trip_lines`: User can view the trip lines map in the Map page.

### Segment Trip Time Page
- `segment_trip_time`: User can view this page. (default `true`)
- `segment_trip_time_full_trips`: User can see the table containing non-aggregated data (default `true`)
- `segment_trip_time_min_users`: Minimum number of distinct users required in the data before anything is displayed (value is a number, default `0`).

### Push Notification Page
- `push_send`: User can send push notifications in the Push Notification page.

Expand Down
10 changes: 10 additions & 0 deletions app_sidebar_collapsible.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,15 @@
href=dash.get_relative_path("/map"),
active="exact",
),
dbc.NavLink(
[
html.I(className="fas fa-solid fa-hourglass me-2"),
html.Span("Segment trip time"),
],
href=dash.get_relative_path("/segment_trip_time"),
active="exact",
style={'display': 'block' if has_permission('segment_trip_time') else 'none'},
),
dbc.NavLink(
[
html.I(className="fas fa-solid fa-envelope-open-text me-2"),
Expand Down Expand Up @@ -186,6 +195,7 @@ def update_store_uuids(start_date, end_date):
return store


# Note: this triggers twice on load, not great with a slow db
shankari marked this conversation as resolved.
Show resolved Hide resolved
@app.callback(
Output("store-trips", "data"),
Input('date-picker', 'start_date'),
Expand Down
2 changes: 1 addition & 1 deletion pages/home.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def compute_sign_up_trend(uuid_df):


def compute_trips_trend(trips_df, date_col):
trips_df[date_col] = pd.to_datetime(trips_df[date_col], utc=True)
trips_df[date_col] = pd.to_datetime(trips_df[date_col], utc=True, format='ISO8601')
trips_df[date_col] = pd.DatetimeIndex(trips_df[date_col]).date
res_df = (
trips_df
Expand Down
284 changes: 284 additions & 0 deletions pages/segment_trip_time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,284 @@
from dash import dcc, html, Input, Output, State, callback, register_page, dash_table
import dash_bootstrap_components as dbc
import dash_leaflet as dl
shankari marked this conversation as resolved.
Show resolved Hide resolved
import pandas as pd

import logging
import json

from utils.permissions import has_permission, permissions
from utils import db_utils

register_page(__name__, path="/segment_trip_time")

# Markdown shown at the top of the page: explains how to draw the start/end
# zones on the two maps and what the computed statistics mean.
intro = """
## Segment average trip time
This page displays some statistics on average trip duration between two selected zones.

### Usage
Using the polygon or square tools on the maps' menu, draw the start (left map) and end (right map) zones to consider.

Data will then be fetched for trips crossing the start zone and then the end zone.

Here are some tips on how to draw zones:
* Zones shouldn't cover more than one parallel road; otherwise, it is unclear which path the user took.
* A bigger zone will give more results, at the cost of lower accuracy in trip durations (the start point could be anywhere in the zone).
* For exhaustivity, zone length should somewhat match the distance a vehicle can cross at the maximum allowed speed in 30 seconds (sample rate).
* A smaller zone will give more accurate time results, but the number of trips might be too low to be significant.
* Zones can be moved and edited using the Edit layer menu, and they can be deleted with the Delete layer button.
* Please be advised that only the last added zone will be considered on each map. It is thus advised to delete existing zones before creating new ones.
"""


# Message shown when the query returns no rows (either no matching segments,
# or the anonymity guard suppressed them). The minimum-user threshold is read
# once at import time from the permissions config
# (key: 'segment_trip_time_min_users', default 0).
not_enough_data_message = f"""
Not enough segments could be found between endpoints. This means that the number of recorded trips going from start to end point is too low.
* There could be data, but on an insufficient number of users, breaking anonymity (minimum number of users is currently set to {permissions.get('segment_trip_time_min_users', 0)})
* You could try to increase the zone sizes, or chose different start and end points.
"""

# Initial viewport shared by both maps — presumably the program's deployment
# area; TODO confirm whether this should be configurable.
initial_maps_center = [32.7, -96.8]
initial_maps_zoom = 5
# Page layout: two side-by-side draw-enabled maps (start zone / end zone)
# above a 'message' div that the main callback fills with result tables.
layout = html.Div(
    [
        # Stores holding the latest drawn GeoJSON for each map, serialized as
        # JSON strings; initialized to empty FeatureCollections so the main
        # callback can always json.loads() them.
        dcc.Store(id='link-trip-time-start', data=json.dumps({"features": []})),
        dcc.Store(id='link-trip-time-end', data=json.dumps({"features": []})),
        dcc.Markdown(intro),
        dbc.Row(
            [
                dbc.Col(
                    [
                        html.H4('Start zone selection'),
                        dl.Map(
                            [
                                dl.TileLayer(),
                                dl.FeatureGroup([
                                    dl.EditControl(
                                        id="stt-edit-control-start",
                                        # Only area shapes (polygon/rectangle) make
                                        # sense for a zone; disable point/line tools.
                                        draw=dict(circle=False, marker=False, polyline=False, circlemarker=False)
                                    )
                                ])
                            ],
                            #[dl.TileLayer(), dl.LayerGroup(id='stt-trip-layer-start')],
                            id='stt-trip-map-start',
                            style={'height': '500px'},
                            center=initial_maps_center,
                            zoom=initial_maps_zoom
                        ),
                    ]
                ),
                dbc.Col(
                    [
                        html.H4('End zone selection'),
                        dl.Map(
                            [
                                dl.TileLayer(),
                                dl.FeatureGroup([
                                    dl.EditControl(
                                        id="stt-edit-control-end",
                                        draw=dict(circle=False, marker=False, polyline=False, circlemarker=False)
                                    )
                                ])
                            ],
                            id='stt-trip-map-end',
                            style={'height': '500px'},
                            center=initial_maps_center,
                            zoom=initial_maps_zoom
                        ),
                    ]
                ),
            ]
        ),
        # Results area, populated by generate_content_on_endpoints_change.
        dbc.Row(
            html.Div(id='message'),
        ),
    ]
)



@callback(
    Output('link-trip-time-start', 'data'),
    Input('stt-edit-control-start', 'geojson'),
    prevent_initial_call=True,
)
def map_start_draw(drawn_geojson):
    """Mirror the start-map EditControl's GeoJSON into its dcc.Store.

    The Store holds a JSON string, so the drawn FeatureCollection is
    serialized before being returned.
    """
    serialized = json.dumps(drawn_geojson)
    return serialized

@callback(
    Output('link-trip-time-end', 'data'),
    Input('stt-edit-control-end', 'geojson'),
    prevent_initial_call=True,
)
def map_end_draw(drawn_geojson):
    """Mirror the end-map EditControl's GeoJSON into its dcc.Store.

    The Store holds a JSON string, so the drawn FeatureCollection is
    serialized before being returned.
    """
    serialized = json.dumps(drawn_geojson)
    return serialized



def format_duration_df(df, time_column_name='Time sample'):
    """Turn a grouped duration/count frame into dash_table-ready records.

    ``df`` is expected to be the result of a
    ``groupby(...).agg({'duration': 'median', 'section': 'count'})`` call,
    whose index level(s) may include ``'mode'`` and/or a time bucket named
    ``'start_fmt_time'``. The index is flattened, columns are renamed to
    human-readable headers, and the columns are reordered as
    Mode / time / seconds / minutes / Count (skipping whichever of Mode or
    the time column is absent).

    Returns a list of row dicts (``to_dict('records')``).
    """
    df['Median time (minutes)'] = df.duration / 60  # seconds -> minutes
    renamed = df.reset_index().rename(
        columns={
            'start_fmt_time': time_column_name,
            'duration': 'Median time (seconds)',
            'section': 'Count',
            'mode': 'Mode',
        }
    )
    if time_column_name in renamed:
        # Optional leading Mode column, then the time bucket and the metrics.
        ordered_cols = (['Mode'] if 'Mode' in renamed else []) + [
            time_column_name,
            'Median time (seconds)',
            'Median time (minutes)',
            'Count',
        ]
    else:
        # Mode-only grouping: no time bucket in the index.
        ordered_cols = ['Mode', 'Median time (seconds)', 'Median time (minutes)', 'Count']
    return renamed[ordered_cols].to_dict('records')


@callback(
    Output('message', 'children'),
    Input('link-trip-time-start', 'data'),
    Input('link-trip-time-end', 'data'),
    prevent_initial_call=True,
)
def generate_content_on_endpoints_change(link_trip_time_start_str, link_trip_time_end_str):
    """Rebuild the results panel whenever either drawn zone changes.

    Both inputs are JSON-serialized GeoJSON FeatureCollections written by the
    map-draw callbacks. Returns '' while either zone is still empty, a
    "not enough data" message when the query yields no rows, and otherwise a
    two-column row: aggregated duration tables on the left, the raw
    (permission-gated) trip table on the right.
    """
    link_trip_time_start = json.loads(link_trip_time_start_str)
    link_trip_time_end = json.loads(link_trip_time_end_str)
    if len(link_trip_time_end["features"]) == 0 or len(link_trip_time_start["features"]) == 0:
        return ''

    # Warning: This is a database call, look here if there is a performance hog.
    # From initial tests, this seems to be performing well, without the need
    # to do geoqueries in memory.
    # Only the most recently drawn zone on each map is considered (as stated
    # in the page intro).
    df = db_utils.query_segments_crossing_endpoints(
        link_trip_time_start["features"][-1],
        link_trip_time_end["features"][-1],
    )
    total_nb_trips = df.shape[0]
    if total_nb_trips > 0:
        # Warning: Another db call here.
        # In theory, we could load all inferred_section modes in memory at start time,
        # instead of fetching them every time. However, when testing it, the operation
        # is quite heavy on the db and on ram. Querying only the sections we're
        # interested in keeps page load decent, especially when the section count is low.
        mode_by_section_id = db_utils.query_inferred_sections_modes(
            df[['section', 'user_id']].to_dict('records')
        )
        df['mode'] = df['section'].apply(
            lambda section_id: mode_by_section_id[str(section_id)].name
        )
        median_trip_time = df['duration'].median()
        # NOTE(review): hour/month buckets are derived in UTC; local time would
        # be friendlier (the upstream data has a data.start_local_dt field) —
        # TODO: switch to local time once it is surfaced by the query.
        times = pd.to_datetime(df['start_fmt_time'], errors='coerce', utc=True)
        duration_per_hour = format_duration_df(
            df.groupby(times.dt.hour).agg({'duration': 'median', 'section': 'count'}),
            time_column_name='Hour',
        )
        duration_per_mode = format_duration_df(
            df.groupby('mode').agg({'duration': 'median', 'section': 'count'})
        )
        duration_per_mode_per_hour = format_duration_df(
            df.groupby(['mode', times.dt.hour]).agg(
                {'duration': 'median', 'section': 'count'}
            ),
            time_column_name='Hour',
        )
        duration_per_mode_per_month = format_duration_df(
            df.groupby(['mode', times.dt.month]).agg(
                {'duration': 'median', 'section': 'count'}
            ),
            time_column_name='Month',
        )
        return dbc.Row(
            [
                dbc.Col(
                    [
                        html.Br(),
                        html.H3('Results'),
                        html.Div(
                            f'Computed median segment duration is {median_trip_time} seconds, {total_nb_trips} trips considered'
                        ),
                        html.Br(),
                        html.H4('Median segment duration by mode of transport'),
                        dash_table.DataTable(
                            id='duration_per_mode',
                            data=duration_per_mode,
                            sort_action='native',
                            sort_mode='multi',
                            export_format='csv',
                        ),
                        html.Br(),
                        html.H4(
                            'Median segment duration by hour of the day (UTC)'
                        ),
                        dash_table.DataTable(
                            id='duration_per_hour',
                            data=duration_per_hour,
                            sort_action='native',
                            sort_mode='multi',
                            export_format='csv',
                        ),
                        html.Br(),
                        html.H4(
                            'Median segment duration by mode and hour of the day (UTC)'
                        ),
                        dash_table.DataTable(
                            id='duration_per_mode_per_hour',
                            data=duration_per_mode_per_hour,
                            sort_action='native',
                            sort_mode='multi',
                            export_format='csv',
                        ),
                        html.Br(),
                        html.H4('Median segment duration by mode and month'),
                        dash_table.DataTable(
                            id='duration_per_mode_per_month',
                            data=duration_per_mode_per_month,
                            sort_action='native',
                            sort_mode='multi',
                            export_format='csv',
                        ),
                    ],
                    xs=6,
                ),
                dbc.Col(
                    [
                        html.Br(),
                        html.H3('Trips Data'),
                        dash_table.DataTable(
                            id='trips_data',
                            data=df[
                                ['start_fmt_time', 'end_fmt_time', 'mode', 'duration']
                            ].to_dict('records'),
                            page_size=15,
                            sort_action='native',
                            sort_mode='multi',
                            export_format='csv',
                        ),
                    ],
                    xs=6,
                    # Raw, non-aggregated trips are permission-gated.
                    style={
                        'display': 'block'
                        if has_permission('segment_trip_time_full_trips')
                        else 'none'
                    },
                ),
            ]
        )
    return [html.H3('Results'), dcc.Markdown(not_enough_data_message)]
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ python-jose==3.3.0
flask==2.2.5
flask-talisman==1.0.0
dash_auth==2.0.0
dash-leaflet==1.0.7
44 changes: 42 additions & 2 deletions utils/db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@

import emission.core.get_database as edb
import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.timeseries.aggregate_timeseries as estag
import emission.storage.timeseries.timequery as estt
import emission.storage.timeseries.geoquery as estg
import emission.storage.decorations.section_queries as esds
import emission.core.wrapper.modeprediction as ecwm

from utils import constants
from utils import permissions as perm_utils
Expand Down Expand Up @@ -47,7 +51,7 @@ def query_uuids(start_date, end_date):
return df

def query_confirmed_trips(start_date, end_date):
start_ts, end_ts = None, datetime.max.timestamp()
start_ts, end_ts = None, datetime.max.replace(tzinfo=timezone.utc).timestamp()
if start_date is not None:
start_ts = datetime.combine(start_date, datetime.min.time()).timestamp()

Expand Down Expand Up @@ -158,4 +162,40 @@ def add_user_stats(user_data):
if last_call != -1:
user['last_call'] = arrow.get(last_call).format(time_format)

return user_data
return user_data

def query_segments_crossing_endpoints(poly_region_start, poly_region_end):
    """Find trip segments that cross the start polygon and later the end polygon.

    Args:
        poly_region_start: GeoJSON feature for the start zone.
        poly_region_end: GeoJSON feature for the end zone.

    Returns:
        A DataFrame with one row per matching section, carrying at least
        'section', 'duration' (seconds between the two zone crossings),
        'mode', 'start_fmt_time', 'end_fmt_time' and 'user_id'. An empty
        DataFrame is returned when either zone matches no locations, or when
        fewer distinct users than the 'segment_trip_time_min_users' permission
        threshold are represented (anonymity guard).
    """
    agg_ts = estag.AggregateTimeSeries().get_aggregate_time_series()

    locs_matching_start = agg_ts.get_data_df("analysis/recreated_location", geo_query = estg.GeoQuery(['data.loc'], poly_region_start))
    # One location per section is enough to know the section crossed the zone.
    locs_matching_start = locs_matching_start.drop_duplicates(subset=['section'])
    if locs_matching_start.empty:
        return locs_matching_start

    locs_matching_end = agg_ts.get_data_df("analysis/recreated_location", geo_query = estg.GeoQuery(['data.loc'], poly_region_end))
    locs_matching_end = locs_matching_end.drop_duplicates(subset=['section'])
    if locs_matching_end.empty:
        return locs_matching_end

    # Inner merge keeps only sections seen in BOTH zones. This is equivalent
    # to the previous how='outer' merge because the NaN-padded rows produced
    # by an outer merge always fail the idx_x < idx_y comparison below, but
    # it avoids materializing them at all.
    merged = locs_matching_start.merge(locs_matching_end, how='inner', on=['section'])
    # Keep only trips that crossed the start zone BEFORE the end zone.
    filtered = merged.loc[merged['idx_x']<merged['idx_y']].copy()
    filtered['duration'] = filtered['ts_y'] - filtered['ts_x']
    filtered['mode'] = filtered['mode_x']
    filtered['start_fmt_time'] = filtered['fmt_time_x']
    filtered['end_fmt_time'] = filtered['fmt_time_y']
    filtered['user_id'] = filtered['user_id_y']

    number_user_seen = filtered.user_id_x.nunique()

    # Anonymity guard: suppress results built from too few distinct users.
    if perm_utils.permissions.get("segment_trip_time_min_users", 0) <= number_user_seen:
        return filtered
    return pd.DataFrame()

# The following query can be called multiple times, let's open db only once
# NOTE(review): this handle is not referenced anywhere in this module's
# visible code — presumably esds manages its own connection; verify before
# removing.
analysis_timeseries_db = edb.get_analysis_timeseries_db()

# Fetches sensed_mode for each section in a list
# sections format example: [{'section': ObjectId('648d02b227fd2bb6635414a0'), 'user_id': UUID('6d7edf29-8b3f-451b-8d66-984cb8dd8906')}]
def query_inferred_sections_modes(sections):
    """Return the inferred mode for each given section.

    Thin wrapper around esds.cleaned2inferred_section_list. Callers index the
    result by str(section_id) and read the `.name` attribute of each value.
    """
    return esds.cleaned2inferred_section_list(sections)