Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix SHN segments #1291

Merged
merged 2 commits into from
Nov 14, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 119 additions & 40 deletions _shared_utils/shared_utils/shared_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely
from calitp_data_analysis import geography_utils, utils
from calitp_data_analysis.sql import to_snakecase
from shared_utils import arcgis_query, geo_utils
from shared_utils import arcgis_query

GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/shared_data/"
COMPILED_CACHED_GCS = "gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/"
Expand Down Expand Up @@ -104,7 +105,47 @@ def export_shn_postmiles():
return


def segment_highway_lines_by_postmile(gdf: gpd.GeoDataFrame, group_cols: list) -> gpd.GeoDataFrame:
def scaled_proportion(x: float, odometer_min: float, odometer_max: float) -> float:
"""
Scale highway postmile start and end to 0-1.
Ex: if a highway line has odometer value of 0 - 1_000,
and the postmile segment we want to cut is 400-600, we need to know
proportionally which subset of coords to take.
Use this along with shapely.project to find out the distance
each coord is from the origin.
"""
return (x - odometer_min) / (odometer_max - odometer_min)


def get_segment_geom(
shn_line_geom: shapely.LineString, projected_coords: list, start_dist: float, end_dist: float
) -> shapely.LineString:
"""
Unpack SHN linestring coordinates as distances from origin
of linestring, and find where postmiles start/end.
The segment we're interested in is this line:
(postmile start point, coordinates on SHN line in between, and postmile end point).
"""
# Turn list into array so we can subset it
projected_coords = np.asarray(projected_coords)

# The postmile's bodometer is start_dist; postmile's eodometer is end_dist
# Convert these to point geometries (these are our vertices)
start_geom = shn_line_geom.interpolate(start_dist, normalized=True)
end_geom = shn_line_geom.interpolate(end_dist, normalized=True)

# Valid indices are all the coords in the SHN line that are between the 2 postmiles
valid_indices = ((projected_coords >= start_dist) & (projected_coords <= end_dist)).nonzero()[0]
valid_coords = list(np.asarray([shapely.Point(p) for p in shn_line_geom.coords])[valid_indices])

# Create our segment based on our postmile start/end points + all the coordinates in between
# found on the SHN linestring
segment_geom = shapely.LineString([start_geom, *valid_coords, end_geom])

return segment_geom


def segment_highway_lines_by_postmile(gdf: gpd.GeoDataFrame):
"""
Use the current postmile as the
starting geometry / segment beginning
Expand All @@ -113,31 +154,36 @@ def segment_highway_lines_by_postmile(gdf: gpd.GeoDataFrame, group_cols: list) -

Segment goes from current to next postmile.
"""
# For this postmile, snap it to the highway line and find the nearest index
# for a linestring with 10 points, an index value of 2 means it's the 3rd coordinate
nearest_idx_series = np.vectorize(geo_utils.nearest_snap)(gdf.line_geometry, gdf.geometry, 1)

gdf["idx"] = nearest_idx_series

# The segment will be index into the nearest point for a postmile
# until the index of the subsequent postmile
# Ex: idx=1 and subseq_idx=5 means we want to grab hwy_coords[1:6] as our segment
gdf = gdf.assign(
subseq_idx=(gdf.sort_values(group_cols + ["odometer"]).groupby(group_cols).idx.shift(-1).astype("Int64")),
eodometer=(gdf.sort_values(group_cols + ["odometer"]).groupby(group_cols).odometer.shift(-1)),
).rename(columns={"odometer": "bodometer"})
# follow the convention of b for begin odometer and e for end odometer
# Take the postmile's bodometer/eodometer and
# find the scaled version relative to the highway's bodometer/eodometer.
# hwy_bodometer/eodometer usually are floats like [3.5, 4.8], and we want to know
# proportionally where a value like 4.2 will fall on a scale of 0-1
start_dist = np.vectorize(scaled_proportion)(gdf.bodometer, gdf.hwy_bodometer, gdf.hwy_eodometer)
end_dist = np.vectorize(scaled_proportion)(gdf.eodometer, gdf.hwy_bodometer, gdf.hwy_eodometer)

segment_geom = np.vectorize(get_segment_geom)(gdf.line_geometry, gdf.projected_coords, start_dist, end_dist)

drop_cols = ["projected_coords", "hwy_bodometer", "hwy_eodometer", "line_geometry"]

# Assign segment geometry and overwrite the postmile geometry column
gdf2 = (
gdf.assign(geometry=gpd.GeoSeries(segment_geom, crs=geography_utils.CA_NAD83Albers))
.drop(columns=drop_cols)
.set_geometry("geometry")
)

# Drop NaNs because for 3 points, we can draw 2 segments
gdf2 = gdf.dropna(subset="subseq_idx").reset_index(drop=True)
return gdf2

segment_geom = np.vectorize(geo_utils.segmentize_by_indices)(gdf2.line_geometry, gdf2.idx, gdf2.subseq_idx)

gdf3 = gdf2.assign(
geometry=gpd.GeoSeries(segment_geom).set_crs(geography_utils.WGS84),
).drop(columns=["line_geometry", "idx", "subseq_idx"])
def round_odometer_values(df: pd.DataFrame, cols: list = ["odometer"], num_decimals: int = 3) -> pd.DataFrame:
"""
Round odometer columns, which can be named
odometer, bodometer (begin odometer), eodometer (end odometer)
to 3 decimal places.
"""
df[cols] = df[cols].round(num_decimals)

return gdf3
return df


def create_postmile_segments(
Expand All @@ -151,35 +197,66 @@ def create_postmile_segments(
"""
# We need multilinestrings to become linestrings (use gdf.explode)
# and the columns we select do uniquely tag lines (multilinestrings are 1 item)
hwy_lines = gpd.read_parquet(
f"{GCS_FILE_PATH}state_highway_network_raw.parquet",
columns=group_cols + ["bodometer", "eodometer", "geometry"],
).explode("geometry")
hwy_lines = (
gpd.read_parquet(
f"{GCS_FILE_PATH}state_highway_network_raw.parquet",
columns=group_cols + ["bodometer", "eodometer", "geometry"],
)
.explode("geometry")
.reset_index(drop=True)
.pipe(round_odometer_values, ["bodometer", "eodometer"], num_decimals=3)
.to_crs(geography_utils.CA_NAD83Albers)
)

hwy_postmiles = gpd.read_parquet(
f"{GCS_FILE_PATH}state_highway_network_postmiles.parquet", columns=group_cols + ["odometer", "geometry"]
# Have a list accompany the geometry
# linestring has many coords, shapely.project each point and calculate distance from origin
# and normalize it all between 0-1
hwy_lines = hwy_lines.assign(
projected_coords=hwy_lines.apply(
lambda x: [x.geometry.project(shapely.Point(p), normalized=True) for p in x.geometry.coords], axis=1
)
)

hwy_postmiles = (
gpd.read_parquet(
f"{GCS_FILE_PATH}state_highway_network_postmiles.parquet", columns=group_cols + ["odometer", "geometry"]
)
.pipe(round_odometer_values, ["odometer"], num_decimals=3)
.to_crs(geography_utils.CA_NAD83Albers)
)
# Round to 3 digits for odometer. When there are more decimal places, it makes our cutoffs iffy
# when we use this condition below: odometer >= bodometer & odometer <= eodometer

# follow the convention of b for begin odometer and e for end odometer
hwy_postmiles = (
hwy_postmiles.assign(
eodometer=(hwy_postmiles.sort_values(group_cols + ["odometer"]).groupby(group_cols).odometer.shift(-1)),
)
.rename(columns={"odometer": "bodometer"})
.dropna(subset="eodometer")
.reset_index(drop=True)
)

# Merge hwy points with the lines we want to cut segments from
gdf = (
pd.merge(hwy_postmiles, hwy_lines.rename(columns={"geometry": "line_geometry"}), on=group_cols, how="inner")
pd.merge(
hwy_postmiles,
hwy_lines.rename(
columns={"bodometer": "hwy_bodometer", "eodometer": "hwy_eodometer", "geometry": "line_geometry"}
),
on=group_cols,
how="inner",
)
.query(
# make sure that the postmile point falls between
# the beginning and ending odometer
# once we check this, we don't need b/e odometer.
"odometer >= bodometer & odometer <= eodometer"
"bodometer >= hwy_bodometer & eodometer <= hwy_eodometer"
)
.sort_values(group_cols + ["odometer"])
.sort_values(group_cols + ["bodometer"])
.reset_index(drop=True)
.drop(columns=["bodometer", "eodometer"])
)

gdf2 = segment_highway_lines_by_postmile(gdf, group_cols)

# TODO: there are rows with empty geometry because their indexed value is the same for current and subseq
# so no line was drawn
# check if it's ok for these to exist
# gdf2[gdf2.geometry.is_empty] shows about 57k rows that didn't get cut
gdf2 = segment_highway_lines_by_postmile(gdf).to_crs(geography_utils.WGS84)

utils.geoparquet_gcs_export(gdf2, GCS_FILE_PATH, "state_highway_network_postmile_segments")

Expand Down Expand Up @@ -270,6 +347,8 @@ def make_transit_operators_to_legislative_district_crosswalk(date_list: list) ->
# State Highway Network
make_clean_state_highway_network()
export_shn_postmiles()

# This takes 24 min to run, so if there's a way to optimize in the future, we should
create_postmile_segments(["district", "county", "routetype", "route", "direction", "routes", "pmrouteid"])

# Legislative Districts
Expand Down
Loading