diff --git a/_shared_utils/setup.py b/_shared_utils/setup.py
index e88635a02..4ca979af3 100644
--- a/_shared_utils/setup.py
+++ b/_shared_utils/setup.py
@@ -4,7 +4,7 @@
setup(
name="shared_utils",
packages=find_packages(),
- version="2.8",
+ version="3.0",
description="Shared utility functions for data analyses",
author="Cal-ITP",
license="Apache",
diff --git a/_shared_utils/shared_utils/__init__.py b/_shared_utils/shared_utils/__init__.py
index c95975451..a0de7ba01 100644
--- a/_shared_utils/shared_utils/__init__.py
+++ b/_shared_utils/shared_utils/__init__.py
@@ -2,22 +2,26 @@
arcgis_query,
catalog_utils,
dask_utils,
+ geo_utils,
gtfs_utils_v2,
portfolio_utils,
publish_utils,
rt_dates,
rt_utils,
schedule_rt_utils,
+ time_helpers,
)
__all__ = [
"arcgis_query",
"catalog_utils",
"dask_utils",
+ "geo_utils",
"gtfs_utils_v2",
"portfolio_utils",
"publish_utils",
"rt_dates",
"rt_utils",
"schedule_rt_utils",
+ "time_helpers",
]
diff --git a/_shared_utils/shared_utils/geo_utils.py b/_shared_utils/shared_utils/geo_utils.py
new file mode 100644
index 000000000..c030f1fd7
--- /dev/null
+++ b/_shared_utils/shared_utils/geo_utils.py
@@ -0,0 +1,177 @@
+"""
+Geospatial utility functions
+"""
+import geopandas as gpd
+import numpy as np
+import pandas as pd
+import shapely
+from calitp_data_analysis import geography_utils
+from scipy.spatial import KDTree
+from shared_utils import rt_utils
+
+# Could we use distance to filter for nearest neighbors?
+# A distance threshold makes the length of results unpredictable,
+# so we stick with k_neighbors and keep the nearest k, which keeps
+# the arrays returned more consistent in length.
+geo_const_meters = 6_371_000 * np.pi / 180
+geo_const_miles = 3_959_000 * np.pi / 180
+
+
+def nearest_snap(line: shapely.LineString, point: shapely.Point, k_neighbors: int = 1) -> np.ndarray:
+ """
+    Based on this function, but we return the index value
+    rather than the point itself:
+    https://github.com/UTEL-UIUC/gtfs_segments/blob/main/gtfs_segments/geom_utils.py
+ """
+ line = np.asarray(line.coords)
+ point = np.asarray(point.coords)
+ tree = KDTree(line)
+
+    # np_dist is the array of distances to the result (not returned)
+    # np_inds is the array of indices into the line's coordinates
+ _, np_inds = tree.query(
+ point,
+ workers=-1,
+ k=k_neighbors,
+ )
+
+ return np_inds.squeeze()
+
+
+def vp_as_gdf(vp: pd.DataFrame, crs: str = "EPSG:3310") -> gpd.GeoDataFrame:
+ """
+    Turn vp into a gdf and project to the given CRS (default EPSG:3310).
+ """
+ vp_gdf = (
+ geography_utils.create_point_geometry(vp, longitude_col="x", latitude_col="y", crs=geography_utils.WGS84)
+ .to_crs(crs)
+ .drop(columns=["x", "y"])
+ )
+
+ return vp_gdf
+
+
+def add_arrowized_geometry(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
+ """
+ Add a column where the segment is arrowized.
+ """
+ segment_geom = gpd.GeoSeries(gdf.geometry)
+ CRS = gdf.crs.to_epsg()
+
+    # TODO: parallel_offset is deprecated in shapely 2.0; offset_curve is its replacement
+ geom_parallel = gpd.GeoSeries([rt_utils.try_parallel(i) for i in segment_geom], crs=CRS)
+ # geom_parallel = gpd.GeoSeries(
+ # [i.offset_curve(30) for i in segment_geom],
+ # crs=CRS
+ # )
+
+ geom_arrowized = rt_utils.arrowize_segment(geom_parallel, buffer_distance=20)
+
+ gdf = gdf.assign(geometry_arrowized=geom_arrowized)
+
+ return gdf
+
+
+def get_direction_vector(start: shapely.geometry.Point, end: shapely.geometry.Point) -> tuple:
+ """
+    Given 2 points (in a projected CRS, not WGS84), return a
+    tuple of (delta_x, delta_y).
+
+ https://www.varsitytutors.com/precalculus-help/find-a-direction-vector-when-given-two-points
+ https://stackoverflow.com/questions/17332759/finding-vectors-with-2-points
+
+ """
+ return ((end.x - start.x), (end.y - start.y))
+
+
+def distill_array_into_direction_vector(array: np.ndarray) -> tuple:
+ """
+    Given an array of n points, take the start and end of it.
+    From start/end, we turn 2 coordinate points into 1 direction vector.
+    The direction vector is a tuple equal to (delta_x, delta_y).
+ """
+ origin = array[0]
+ destination = array[-1]
+ return get_direction_vector(origin, destination)
+
+
+def get_vector_norm(vector: tuple) -> float:
+ """
+    Get the length (via the Pythagorean Theorem) by summing
+    the squares of the components and taking the square root.
+
+    Dividing a vector by this length yields the unit/normalized
+    vector; this function computes the denominator needed for
+    that normalization.
+ """
+ return np.sqrt(vector[0] ** 2 + vector[1] ** 2)
+
+
+def get_normalized_vector(vector: tuple) -> tuple:
+ """
+    Normalize the vector: divide each component by its norm (Pythagorean Theorem).
+ https://stackoverflow.com/questions/21030391/how-to-normalize-a-numpy-array-to-a-unit-vector
+ """
+ x_norm = vector[0] / get_vector_norm(vector)
+ y_norm = vector[1] / get_vector_norm(vector)
+
+ return (x_norm, y_norm)
+
+
+def dot_product(vec1: tuple, vec2: tuple) -> float:
+ """
+    Take the dot product: multiply the x components, multiply the
+    y components, and sum the products.
+ """
+ return vec1[0] * vec2[0] + vec1[1] * vec2[1]
+
+
+def segmentize_by_indices(line_geometry: shapely.LineString, start_idx: int, end_idx: int) -> shapely.LineString:
+ """
+ Cut a line according to index values.
+    Similar in spirit to shapely.segmentize, which works with
+    distances along a line.
+    Here, we don't have specified distances; instead, we customize
+    where to segment the line by coordinate index.
+ """
+ all_coords = shapely.get_coordinates(line_geometry)
+
+    if end_idx + 1 > all_coords.shape[0]:  # shape[0] = number of coordinates; .size counts x and y separately
+ subset_coords = all_coords[start_idx:end_idx]
+ else:
+ subset_coords = all_coords[start_idx : end_idx + 1]
+
+ if len(subset_coords) < 2:
+ return shapely.LineString()
+ else:
+ return shapely.LineString([shapely.Point(i) for i in subset_coords])
+
+
+def draw_line_between_points(gdf: gpd.GeoDataFrame, group_cols: list) -> gpd.GeoDataFrame:
+ """
+ Use the current postmile as the
+ starting geometry / segment beginning
+ and the subsequent postmile (based on odometer)
+ as the ending geometry / segment end.
+
+ Segment goes from current to next postmile.
+ """
+ # Grab the subsequent point geometry
+ # We can drop whenever the last point is missing within
+ # a group. If we have 3 points, we can draw 2 lines.
+ gdf = gdf.assign(end_geometry=(gdf.groupby(group_cols, group_keys=False).geometry.shift(-1))).dropna(
+ subset="end_geometry"
+ )
+
+ # Construct linestring with 2 point coordinates
+ gdf = (
+ gdf.assign(
+ line_geometry=gdf.apply(lambda x: shapely.LineString([x.geometry, x.end_geometry]), axis=1).set_crs(
+ geography_utils.WGS84
+ )
+ )
+ .drop(columns=["geometry", "end_geometry"])
+ .rename(columns={"line_geometry": "geometry"})
+ )
+
+ return gdf
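
For orientation, here is a minimal sketch (not part of the diff; the linestring and points are made up) of how the new geo_utils functions compose:

```python
import shapely
from shared_utils import geo_utils

# A simple 5-point line along the x-axis
line = shapely.LineString([(0, 0), (1, 0), (2, 0), (3, 0), (4, 0)])

# Snap a point to the index of its nearest coordinate on the line
idx = geo_utils.nearest_snap(line, shapely.Point(1.2, 0.3), k_neighbors=1)
# idx -> 1, since coordinate (1, 0) is closest

# Cut the line between two coordinate indices (end index inclusive)
segment = geo_utils.segmentize_by_indices(line, 1, 3)
# segment -> LineString([(1, 0), (2, 0), (3, 0)])

# Direction vector helpers used downstream
vec = geo_utils.get_direction_vector(shapely.Point(0, 0), shapely.Point(3, 4))
geo_utils.get_vector_norm(vec)                 # 5.0
geo_utils.get_normalized_vector(vec)           # (0.6, 0.8)
geo_utils.dot_product((0.6, 0.8), (0.6, 0.8))  # 1.0 for identical directions
```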
diff --git a/_shared_utils/shared_utils/schedule_rt_utils.py b/_shared_utils/shared_utils/schedule_rt_utils.py
index be78dae39..7b3c3bf7b 100644
--- a/_shared_utils/shared_utils/schedule_rt_utils.py
+++ b/_shared_utils/shared_utils/schedule_rt_utils.py
@@ -13,6 +13,14 @@
from siuba import *
PACIFIC_TIMEZONE = "US/Pacific"
+RENAME_DISTRICT_DICT = {
+ "Marysville / Sacramento": "Marysville", # D3
+ "Bay Area / Oakland": "Oakland", # D4
+ "San Luis Obispo / Santa Barbara": "San Luis Obispo", # D5
+ "Fresno / Bakersfield": "Fresno", # D6
+ "San Bernardino / Riverside": "San Bernardino", # D8
+ "Orange County": "Irvine", # D12
+}
def localize_timestamp_col(df: dd.DataFrame, timestamp_col: Union[str, list]) -> dd.DataFrame:
@@ -84,7 +92,10 @@ def filter_dim_gtfs_datasets(
custom_filtering: dict = None,
get_df: bool = True,
) -> Union[pd.DataFrame, siuba.sql.verbs.LazyTbl]:
- """ """
+ """
+ Filter mart_transit_database.dim_gtfs_dataset table
+ and keep only the valid rows that passed data quality checks.
+ """
if "key" not in keep_cols:
raise KeyError("Include key in keep_cols list")
@@ -164,9 +175,73 @@ def get_organization_id(
return df2
+def filter_dim_county_geography(
+ date: str,
+ keep_cols: list[str] = ["caltrans_district"],
+) -> pd.DataFrame:
+ """
+ Merge mart_transit_database.dim_county_geography with
+ mart_transit_database.bridge_organizations_x_headquarters_county_geography.
+ Both tables are at organization-county-feed_period grain.
+
+ dim_county_geography holds additional geography columns like
+ MSA, FIPS, etc.
+
+ Use this merge to get caltrans_district.
+    Organizations belong to a county, and counties are assigned to districts.
+ """
+ bridge_orgs_county_geog = (
+ tbls.mart_transit_database.bridge_organizations_x_headquarters_county_geography()
+ >> gtfs_utils_v2.subset_cols([_.organization_name, _.county_geography_key, _._valid_from, _._valid_to])
+ >> collect()
+ )
+
+ keep_cols2 = list(set(keep_cols + ["county_geography_key", "caltrans_district_name"]))
+
+ dim_county_geography = (
+ tbls.mart_transit_database.dim_county_geography()
+ >> rename(county_geography_key=_.key)
+ >> gtfs_utils_v2.subset_cols(keep_cols2)
+ >> collect()
+ )
+
+ # Several caltrans_district values in mart_transit_database
+ # now contain slashes.
+    # Use the dict to standardize these to how previous versions were named
+ dim_county_geography = dim_county_geography.assign(
+ caltrans_district_name=dim_county_geography.apply(
+ lambda x: RENAME_DISTRICT_DICT[x.caltrans_district_name]
+ if x.caltrans_district_name in RENAME_DISTRICT_DICT.keys()
+ else x.caltrans_district_name,
+ axis=1,
+ )
+ )
+
+ bridge_orgs_county_geog = localize_timestamp_col(bridge_orgs_county_geog, ["_valid_from", "_valid_to"])
+
+ bridge_orgs_county_geog2 = bridge_orgs_county_geog >> filter(
+ _._valid_from_local <= pd.to_datetime(date), _._valid_to_local >= pd.to_datetime(date)
+ )
+
+ # Merge organization-county with caltrans_district info
+    # This appears to be a 1:1 merge; we checked whether an organization can
+    # belong to multiple districts, and that doesn't appear to happen
+ df = pd.merge(bridge_orgs_county_geog2, dim_county_geography, on="county_geography_key", how="inner")
+
+ df2 = (
+ df.assign(caltrans_district=df.caltrans_district.astype(str).str.zfill(2) + " - " + df.caltrans_district_name)[
+ ["organization_name"] + keep_cols
+ ]
+ .drop_duplicates()
+ .reset_index(drop=True)
+ )
+
+ return df2
+
+
def filter_dim_organizations(
date: str,
- keep_cols: list[str] = ["source_record_id", "caltrans_district"],
+ keep_cols: list[str] = ["source_record_id"],
custom_filtering: dict = None,
get_df: bool = True,
) -> Union[pd.DataFrame, siuba.sql.verbs.LazyTbl]:
@@ -201,7 +276,8 @@ def sample_gtfs_dataset_key_to_organization_crosswalk(
"base64_url",
"uri",
],
- dim_organization_cols: list[str] = ["source_record_id", "name", "caltrans_district"],
+ dim_organization_cols: list[str] = ["source_record_id", "name"],
+ dim_county_geography_cols: list[str] = ["caltrans_district"],
) -> pd.DataFrame:
"""
Get crosswalk from gtfs_dataset_key to certain quartet data identifiers
@@ -243,11 +319,17 @@ def sample_gtfs_dataset_key_to_organization_crosswalk(
feeds_with_org_id = get_organization_id(feeds_with_quartet_info, date, merge_cols=merge_cols)
- # (4) Merge in dim_orgs to get caltrans_district
+ # (4) Merge in dim_orgs to get organization info - everything except caltrans_district is found here
ORG_RENAME_DICT = {"source_record_id": "organization_source_record_id", "name": "organization_name"}
orgs = filter_dim_organizations(date, keep_cols=dim_organization_cols, get_df=True).rename(columns=ORG_RENAME_DICT)
- feeds_with_district = pd.merge(feeds_with_org_id, orgs, on="organization_source_record_id")
+ feeds_with_org_info = pd.merge(feeds_with_org_id, orgs, on="organization_source_record_id")
+
+ # (5) Merge in dim_county_geography to get caltrans_district
+ # https://github.com/cal-itp/data-analyses/issues/1282
+ district = filter_dim_county_geography(date, dim_county_geography_cols)
+
+ feeds_with_district = pd.merge(feeds_with_org_info, district, on="organization_name")
return feeds_with_district
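
To make the district standardization concrete, here is a toy example (made-up rows, not the warehouse tables) of the renaming and labeling done in filter_dim_county_geography. RENAME_DISTRICT_DICT is redefined with a subset of entries so the snippet is self-contained; note that pandas' Series.replace is an equivalent, simpler alternative to the row-wise apply used in the diff.

```python
import pandas as pd

RENAME_DISTRICT_DICT = {
    "Bay Area / Oakland": "Oakland",      # D4
    "Fresno / Bakersfield": "Fresno",     # D6
}

df = pd.DataFrame(
    {
        "organization_name": ["Org A", "Org B"],
        "caltrans_district": [4, 6],
        "caltrans_district_name": ["Bay Area / Oakland", "Fresno / Bakersfield"],
    }
)

# Map known slashed names back to their previous single-city names,
# leaving any other values untouched
df["caltrans_district_name"] = df.caltrans_district_name.replace(RENAME_DISTRICT_DICT)

# Zero-pad the district number and append the standardized name
df["caltrans_district"] = (
    df.caltrans_district.astype(str).str.zfill(2) + " - " + df.caltrans_district_name
)
# caltrans_district -> "04 - Oakland", "06 - Fresno"
```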
diff --git a/_shared_utils/shared_utils/shared_data.py b/_shared_utils/shared_utils/shared_data.py
index aa0bceb03..f5a12e781 100644
--- a/_shared_utils/shared_utils/shared_data.py
+++ b/_shared_utils/shared_utils/shared_data.py
@@ -2,11 +2,11 @@
One-off functions, run once, save datasets for shared use.
"""
import geopandas as gpd
+import numpy as np
import pandas as pd
-import shapely
from calitp_data_analysis import geography_utils, utils
from calitp_data_analysis.sql import to_snakecase
-from shared_utils.arcgis_query import query_arcgis_feature_server
+from shared_utils import arcgis_query, geo_utils
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/shared_data/"
COMPILED_CACHED_GCS = "gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/"
@@ -67,6 +67,11 @@ def make_clean_state_highway_network():
gdf = gpd.read_file(URL)
+ # Save a raw, undissolved version
+ utils.geoparquet_gcs_export(
+ gdf.drop(columns=["Shape_Length", "OBJECTID"]).pipe(to_snakecase), GCS_FILE_PATH, "state_highway_network_raw"
+ )
+
keep_cols = ["Route", "County", "District", "RouteType", "Direction", "geometry"]
gdf = gdf[keep_cols]
@@ -90,7 +95,7 @@ def export_shn_postmiles():
"""
URL = "https://caltrans-gis.dot.ca.gov/arcgis/rest/services/" "CHhighway/SHN_Postmiles_Tenth/" "FeatureServer/0/"
- gdf = query_arcgis_feature_server(URL)
+ gdf = arcgis_query.query_arcgis_feature_server(URL)
gdf2 = to_snakecase(gdf).drop(columns="objectid")
@@ -99,7 +104,7 @@ def export_shn_postmiles():
return
-def draw_line_between_points(gdf: gpd.GeoDataFrame, group_cols: list) -> gpd.GeoDataFrame:
+def segment_highway_lines_by_postmile(gdf: gpd.GeoDataFrame, group_cols: list) -> gpd.GeoDataFrame:
"""
Use the current postmile as the
starting geometry / segment beginning
@@ -108,53 +113,75 @@ def draw_line_between_points(gdf: gpd.GeoDataFrame, group_cols: list) -> gpd.Geo
Segment goes from current to next postmile.
"""
- # Grab the subsequent point geometry
- # We can drop whenever the last point is missing within
- # a group. If we have 3 points, we can draw 2 lines.
- gdf = gdf.assign(end_geometry=(gdf.groupby(group_cols, group_keys=False).geometry.shift(-1))).dropna(
- subset="end_geometry"
- )
+    # For each postmile, snap it to the highway line and find the nearest index.
+    # For a linestring with 10 points, an index value of 2 means the 3rd coordinate.
+ nearest_idx_series = np.vectorize(geo_utils.nearest_snap)(gdf.line_geometry, gdf.geometry, 1)
- # Construct linestring with 2 point coordinates
- gdf = (
- gdf.assign(
- line_geometry=gdf.apply(lambda x: shapely.LineString([x.geometry, x.end_geometry]), axis=1).set_crs(
- geography_utils.WGS84
- )
- )
- .drop(columns=["geometry", "end_geometry"])
- .rename(columns={"line_geometry": "geometry"})
- )
+ gdf["idx"] = nearest_idx_series
+
+    # The segment runs from the index of the nearest point for this postmile
+    # to the index of the subsequent postmile.
+    # Ex: idx=1 and subseq_idx=5 means we grab hwy_coords[1:6] as our segment
+ gdf = gdf.assign(
+ subseq_idx=(gdf.sort_values(group_cols + ["odometer"]).groupby(group_cols).idx.shift(-1).astype("Int64")),
+ eodometer=(gdf.sort_values(group_cols + ["odometer"]).groupby(group_cols).odometer.shift(-1)),
+ ).rename(columns={"odometer": "bodometer"})
+    # Follow the convention of b for beginning odometer and e for ending odometer
+
+    # Drop NaNs: given 3 points, we can only draw 2 segments (the last point has no subsequent index)
+ gdf2 = gdf.dropna(subset="subseq_idx").reset_index(drop=True)
+
+ segment_geom = np.vectorize(geo_utils.segmentize_by_indices)(gdf2.line_geometry, gdf2.idx, gdf2.subseq_idx)
+
+ gdf3 = gdf2.assign(
+ geometry=gpd.GeoSeries(segment_geom).set_crs(geography_utils.WGS84),
+ ).drop(columns=["line_geometry", "idx", "subseq_idx"])
- return gdf
+ return gdf3
-def create_postmile_segments(group_cols: list) -> gpd.GeoDataFrame:
+def create_postmile_segments(
+ group_cols: list = ["county", "routetype", "route", "direction", "routes", "pmrouteid"]
+) -> gpd.GeoDataFrame:
"""
Take the SHN postmiles gdf, group by highway / odometer
and convert the points into lines.
We'll lose the last postmile for each highway-direction.
Segment goes from current postmile point to subseq postmile point.
"""
- gdf = gpd.read_parquet(
- f"{GCS_FILE_PATH}state_highway_network_postmiles.parquet",
- columns=["route", "direction", "odometer", "geometry"],
+    # We need multilinestrings to become linestrings (use gdf.explode),
+    # and the columns we select uniquely identify each line (each multilinestring is 1 row)
+ hwy_lines = gpd.read_parquet(
+ f"{GCS_FILE_PATH}state_highway_network_raw.parquet",
+ columns=group_cols + ["bodometer", "eodometer", "geometry"],
+ ).explode("geometry")
+
+ hwy_postmiles = gpd.read_parquet(
+ f"{GCS_FILE_PATH}state_highway_network_postmiles.parquet", columns=group_cols + ["odometer", "geometry"]
)
- # If there are duplicates with highway-direction and odometer
- # (where pm or other column differs slightly),
- # we'll drop and cut as long of a segment we can
- # There may be differences in postmile (relative to county start)
- # and odometer (relative to line's origin).
- gdf2 = (
- gdf.sort_values(group_cols + ["odometer"])
- .drop_duplicates(subset=group_cols + ["odometer"])
+ # Merge hwy points with the lines we want to cut segments from
+ gdf = (
+ pd.merge(hwy_postmiles, hwy_lines.rename(columns={"geometry": "line_geometry"}), on=group_cols, how="inner")
+ .query(
+ # make sure that the postmile point falls between
+ # the beginning and ending odometer
+ # once we check this, we don't need b/e odometer.
+ "odometer >= bodometer & odometer <= eodometer"
+ )
+ .sort_values(group_cols + ["odometer"])
.reset_index(drop=True)
+ .drop(columns=["bodometer", "eodometer"])
)
- gdf3 = draw_line_between_points(gdf2, group_cols)
+ gdf2 = segment_highway_lines_by_postmile(gdf, group_cols)
+
+    # TODO: there are rows with empty geometry because the nearest index is the same
+    # for the current and subsequent postmile, so no line was drawn.
+    # Check whether it's ok for these to exist;
+    # gdf2[gdf2.geometry.is_empty] shows about 57k rows that didn't get cut
- utils.geoparquet_gcs_export(gdf3, GCS_FILE_PATH, "state_highway_network_postmile_segments")
+ utils.geoparquet_gcs_export(gdf2, GCS_FILE_PATH, "state_highway_network_postmile_segments")
return
@@ -243,7 +270,7 @@ def make_transit_operators_to_legislative_district_crosswalk(date_list: list) ->
# State Highway Network
make_clean_state_highway_network()
export_shn_postmiles()
- create_postmile_segments(["route", "direction"])
+ create_postmile_segments(["district", "county", "routetype", "route", "direction", "routes", "pmrouteid"])
# Legislative Districts
export_combined_legislative_districts()
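
The postmile segmentation can be hard to picture from the diff alone. Below is a self-contained sketch with made-up geometry showing how the snap indices and the subsequent-postmile shift in segment_highway_lines_by_postmile combine into segments:

```python
import geopandas as gpd
import numpy as np
import shapely
from shared_utils import geo_utils

hwy = shapely.LineString([(i, 0) for i in range(6)])  # 6-coordinate highway line

gdf = gpd.GeoDataFrame(
    {
        "route": [1, 1, 1],
        "odometer": [0.0, 2.1, 4.0],
        "line_geometry": [hwy] * 3,
    },
    geometry=[shapely.Point(0, 0.1), shapely.Point(2, -0.1), shapely.Point(4, 0.2)],
    crs="EPSG:4326",
)

# Snap each postmile point to its nearest coordinate index on the line
gdf["idx"] = np.vectorize(geo_utils.nearest_snap)(gdf.line_geometry, gdf.geometry, 1)

# Each segment spans this postmile's index up to the next postmile's index
gdf["subseq_idx"] = gdf.groupby("route")["idx"].shift(-1).astype("Int64")
gdf2 = gdf.dropna(subset="subseq_idx")

segments = np.vectorize(geo_utils.segmentize_by_indices)(
    gdf2.line_geometry, gdf2.idx, gdf2.subseq_idx
)
# 2 segments: hwy coords [0..2] and [2..4]; the last postmile draws no segment
```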
diff --git a/_shared_utils/shared_utils/time_helpers.py b/_shared_utils/shared_utils/time_helpers.py
new file mode 100644
index 000000000..29b3d0c1e
--- /dev/null
+++ b/_shared_utils/shared_utils/time_helpers.py
@@ -0,0 +1,78 @@
+"""
+Helpers for defining peak vs offpeak periods and
+weekday vs weekend days so we can aggregate our
+existing time-of-day bins.
+"""
+import datetime
+
+import pandas as pd
+
+PEAK_PERIODS = ["AM Peak", "PM Peak"]
+
+HOURS_BY_TIME_OF_DAY = {
+ "Owl": 4, # [0, 3]
+ "Early AM": 3, # [4, 6]
+ "AM Peak": 3, # [7, 9]
+ "Midday": 5, # [10, 14]
+ "PM Peak": 5, # [15, 19]
+ "Evening": 4, # [20, 23]
+}
+
+TIME_OF_DAY_DICT = {
+ **{k: "peak" for k, v in HOURS_BY_TIME_OF_DAY.items() if k in PEAK_PERIODS},
+ **{k: "offpeak" for k, v in HOURS_BY_TIME_OF_DAY.items() if k not in PEAK_PERIODS},
+}
+
+DAY_TYPE_DICT = {
+ 1: "Sunday",
+ 2: "Monday",
+ 3: "Tuesday",
+ 4: "Wednesday",
+ 5: "Thursday",
+ 6: "Friday",
+ 7: "Saturday",
+}
+
+WEEKDAY_DICT = {
+ **{k: "weekday" for k in ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]},
+ **{k: "weekend" for k in ["Saturday", "Sunday"]},
+}
+
+
+def time_span_labeling(date_list: list) -> tuple:
+ """
+ If we grab a week's worth of trips, we'll
+    use this week's average to stand in for the entire month.
+ Label with month and year.
+ """
+ time_span_str = list(set([datetime.datetime.strptime(d, "%Y-%m-%d").strftime("%b%Y").lower() for d in date_list]))
+
+ time_span_num = list(set([datetime.datetime.strptime(d, "%Y-%m-%d").strftime("%m_%Y").lower() for d in date_list]))
+
+ if len(time_span_str) == 1:
+ return time_span_str[0], time_span_num[0]
+
+ else:
+ print(f"multiple months: {time_span_str}")
+ return time_span_str, time_span_num
+
+
+def add_time_span_columns(df: pd.DataFrame, time_span_num: str) -> pd.DataFrame:
+ """
+ Add columns for month / year, use when we have aggregated time-series.
+ """
+ month = int(time_span_num.split("_")[0])
+ year = int(time_span_num.split("_")[1])
+
+ # Downgrade some dtypes for public bucket
+ df = df.assign(
+ month=month,
+ year=year,
+ ).astype(
+ {
+ "month": "int16",
+ "year": "int16",
+ }
+ )
+
+ return df
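
Since time_helpers now lives in shared_utils, here is a short usage sketch (the dates and dataframe are illustrative):

```python
import pandas as pd
from shared_utils import time_helpers

dates = ["2024-04-15", "2024-04-16", "2024-04-17"]

time_span_str, time_span_num = time_helpers.time_span_labeling(dates)
# time_span_str -> "apr2024", time_span_num -> "04_2024"

df = pd.DataFrame({"route_id": ["A", "B"]})
df = time_helpers.add_time_span_columns(df, time_span_num)
# adds month=4 and year=2024 as int16 columns

# The module constants handle the peak/offpeak and weekday/weekend rollups
time_helpers.TIME_OF_DAY_DICT["AM Peak"]  # "peak"
time_helpers.WEEKDAY_DICT["Saturday"]     # "weekend"
```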
diff --git a/gtfs_funnel/concatenate_monthly_scheduled_service.py b/gtfs_funnel/concatenate_monthly_scheduled_service.py
index da2a9f615..b736c0e49 100644
--- a/gtfs_funnel/concatenate_monthly_scheduled_service.py
+++ b/gtfs_funnel/concatenate_monthly_scheduled_service.py
@@ -6,8 +6,8 @@
schedule_gtfs_dataset_key.
"""
import pandas as pd
-from segment_speed_utils import helpers, time_helpers, time_series_utils
-from shared_utils import rt_dates
+from segment_speed_utils import helpers, time_series_utils
+from shared_utils import rt_dates, time_helpers
from update_vars import GTFS_DATA_DICT, SCHED_GCS
def parse_service_date(df: pd.DataFrame) -> pd.DataFrame:
diff --git a/gtfs_funnel/stop_arrivals_in_roads.py b/gtfs_funnel/stop_arrivals_in_roads.py
index 488435c00..6d623d57b 100644
--- a/gtfs_funnel/stop_arrivals_in_roads.py
+++ b/gtfs_funnel/stop_arrivals_in_roads.py
@@ -8,10 +8,9 @@
from segment_speed_utils import (helpers,
gtfs_schedule_wrangling,
- time_helpers
)
from segment_speed_utils.project_vars import PROJECT_CRS
-from shared_utils import rt_dates
+from shared_utils import rt_dates, time_helpers
from update_vars import SHARED_GCS, SCHED_GCS
road_cols = ["linearid", "mtfcc", "fullname"]
diff --git a/gtfs_funnel/stop_times_with_direction.py b/gtfs_funnel/stop_times_with_direction.py
index 4e79e5604..284651297 100644
--- a/gtfs_funnel/stop_times_with_direction.py
+++ b/gtfs_funnel/stop_times_with_direction.py
@@ -2,7 +2,6 @@
Create a schedule stop_times table with direction of travel
between stops.
"""
-import dask.dataframe as dd
import datetime
import geopandas as gpd
import numpy as np
@@ -10,7 +9,7 @@
from calitp_data_analysis import utils
from shared_utils import rt_utils
-from segment_speed_utils import helpers, wrangle_shapes
+from segment_speed_utils import helpers
from segment_speed_utils.project_vars import PROJECT_CRS
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS
diff --git a/gtfs_funnel/vp_condenser.py b/gtfs_funnel/vp_condenser.py
index 10ccb0221..2ec815b24 100644
--- a/gtfs_funnel/vp_condenser.py
+++ b/gtfs_funnel/vp_condenser.py
@@ -11,7 +11,8 @@
from calitp_data_analysis.geography_utils import WGS84
from calitp_data_analysis import utils
-from segment_speed_utils import vp_transform, wrangle_shapes
+from segment_speed_utils import vp_transform
+from shared_utils import geo_utils
from update_vars import GTFS_DATA_DICT, SEGMENT_GCS
def condense_vp_to_linestring(
@@ -34,7 +35,7 @@ def condense_vp_to_linestring(
"location_timestamp_local",
"moving_timestamp_local",
],
- ).pipe(wrangle_shapes.vp_as_gdf, crs = WGS84)
+ ).pipe(geo_utils.vp_as_gdf, crs = WGS84)
vp_condensed = delayed(vp_transform.condense_point_geom_to_line)(
vp,
@@ -80,7 +81,7 @@ def prepare_vp_for_all_directions(
dfs = [
delayed(vp_transform.combine_valid_vp_for_direction)(
vp, direction)
- for direction in wrangle_shapes.ALL_DIRECTIONS
+ for direction in vp_transform.ALL_DIRECTIONS
]
results = [compute(i)[0] for i in dfs]
diff --git a/gtfs_funnel/vp_direction.py b/gtfs_funnel/vp_direction.py
index 87986fef1..21bf97145 100644
--- a/gtfs_funnel/vp_direction.py
+++ b/gtfs_funnel/vp_direction.py
@@ -18,7 +18,7 @@
from loguru import logger
from calitp_data_analysis.geography_utils import WGS84
-from segment_speed_utils import segment_calcs, wrangle_shapes
+from segment_speed_utils import segment_calcs
from segment_speed_utils.project_vars import PROJECT_CRS
from shared_utils import publish_utils, rt_utils
from update_vars import GTFS_DATA_DICT, SEGMENT_GCS
diff --git a/open_data/data_dictionary.yml b/open_data/data_dictionary.yml
index b8ff9e42b..e3e009814 100644
--- a/open_data/data_dictionary.yml
+++ b/open_data/data_dictionary.yml
@@ -60,7 +60,7 @@ common-fields:
offpeak - offpeak hours are 12am-6:59am inclusive, 10am-2:59pm inclusive, and 8pm-11:59pm inclusive.
peak - peak hours are 7am-9:59am inclusive and 3pm-7:59pm inclusive.
all_day
- definition_source: "https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/segment_speed_utils/time_helpers.py"
+ definition_source: "https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/time_helpers.py"
- &hqta_type
definition: |-
Indicates the type of High Quality Transit Area as defined in the Public Resources Code that a stop is and/or falls within. Valid options are: hq_corridor_bus (PRC 21155), major_stop_bus (PRC 21064.3), major_stop_rail (PRC 21064.3), major_stop_ferry (PRC 21064.3), major_stop_brt (PRC 21064.3, 21060.2)
diff --git a/open_data/xml/speeds_by_route_time_of_day_fgdc.xml b/open_data/xml/speeds_by_route_time_of_day_fgdc.xml
index a50f6fc57..9adee1a3a 100644
--- a/open_data/xml/speeds_by_route_time_of_day_fgdc.xml
+++ b/open_data/xml/speeds_by_route_time_of_day_fgdc.xml
@@ -96,7 +96,7 @@
offpeak - offpeak hours are 12am-6:59am inclusive, 10am-2:59pm inclusive, and 8pm-11:59pm inclusive.
peak - peak hours are 7am-9:59am inclusive and 3pm-7:59pm inclusive.
all_day
- https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/segment_speed_utils/time_helpers.py
+ https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/time_helpers.py
speed_mph
diff --git a/open_data/xml/speeds_by_stop_segments_fgdc.xml b/open_data/xml/speeds_by_stop_segments_fgdc.xml
index 6cf551592..4f2d5f246 100644
--- a/open_data/xml/speeds_by_stop_segments_fgdc.xml
+++ b/open_data/xml/speeds_by_stop_segments_fgdc.xml
@@ -115,7 +115,7 @@
offpeak - offpeak hours are 12am-6:59am inclusive, 10am-2:59pm inclusive, and 8pm-11:59pm inclusive.
peak - peak hours are 7am-9:59am inclusive and 3pm-7:59pm inclusive.
all_day
- https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/segment_speed_utils/time_helpers.py
+ https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/time_helpers.py
base64_url
diff --git a/rt_segment_speeds/scripts/average_segment_speeds.py b/rt_segment_speeds/scripts/average_segment_speeds.py
index c7b0b6368..ad9e68170 100644
--- a/rt_segment_speeds/scripts/average_segment_speeds.py
+++ b/rt_segment_speeds/scripts/average_segment_speeds.py
@@ -16,12 +16,13 @@
helpers,
metrics,
segment_calcs,
- time_helpers,
time_series_utils
)
+from shared_utils import time_helpers
from update_vars import GTFS_DATA_DICT, SEGMENT_GCS
from segment_speed_utils.project_vars import SEGMENT_TYPES
+
OPERATOR_COLS = [
"schedule_gtfs_dataset_key",
]
diff --git a/rt_segment_speeds/scripts/average_summary_speeds.py b/rt_segment_speeds/scripts/average_summary_speeds.py
index 7ddb66a7b..d3c3ad25e 100644
--- a/rt_segment_speeds/scripts/average_summary_speeds.py
+++ b/rt_segment_speeds/scripts/average_summary_speeds.py
@@ -14,8 +14,8 @@
from calitp_data_analysis import utils
from segment_speed_utils import (gtfs_schedule_wrangling,
metrics,
- time_helpers,
)
+from shared_utils import time_helpers
from segment_speed_utils.project_vars import SEGMENT_TYPES
from update_vars import SEGMENT_GCS, GTFS_DATA_DICT
from average_segment_speeds import (OPERATOR_COLS, CROSSWALK_COLS,
diff --git a/rt_segment_speeds/scripts/interpolate_stop_arrival.py b/rt_segment_speeds/scripts/interpolate_stop_arrival.py
index 80fa10261..a155b74c1 100644
--- a/rt_segment_speeds/scripts/interpolate_stop_arrival.py
+++ b/rt_segment_speeds/scripts/interpolate_stop_arrival.py
@@ -12,7 +12,7 @@
from typing import Literal, Optional
from segment_speed_utils import (array_utils, helpers,
- segment_calcs, wrangle_shapes)
+ segment_calcs)
from update_vars import SEGMENT_GCS, GTFS_DATA_DICT
from segment_speed_utils.project_vars import PROJECT_CRS, SEGMENT_TYPES
from shared_utils import rt_dates
@@ -138,7 +138,7 @@ def add_arrival_time(
])
- interpolated_arrival = wrangle_shapes.interpolate_stop_arrival_time(
+ interpolated_arrival = segment_calcs.interpolate_stop_arrival_time(
stop_position, projected_points, timestamp_arr)
arrival_time_series.append(interpolated_arrival)
@@ -187,7 +187,7 @@ def stop_and_arrival_time_arrays_by_trip(
# Use correct values to fill in the missing arrival times
df2 = df2.assign(
arrival_time = df2.apply(
- lambda x: wrangle_shapes.interpolate_stop_arrival_time(
+ lambda x: segment_calcs.interpolate_stop_arrival_time(
x.stop_meters, x.stop_meters_arr, x.arrival_time_arr
), axis=1
)
diff --git a/rt_segment_speeds/scripts/nearest_vp_to_road.py b/rt_segment_speeds/scripts/nearest_vp_to_road.py
index 4d5952d78..d92798aaf 100644
--- a/rt_segment_speeds/scripts/nearest_vp_to_road.py
+++ b/rt_segment_speeds/scripts/nearest_vp_to_road.py
@@ -7,7 +7,7 @@
from dask import delayed, compute
-from segment_speed_utils import helpers, neighbor, segment_calcs, wrangle_shapes
+from segment_speed_utils import helpers, neighbor, segment_calcs
from segment_speed_utils.project_vars import SEGMENT_GCS, SHARED_GCS, PROJECT_CRS
import interpolate_stop_arrival
diff --git a/rt_segment_speeds/scripts/vp_around_stops.py b/rt_segment_speeds/scripts/vp_around_stops.py
index 904896139..9f76378e0 100644
--- a/rt_segment_speeds/scripts/vp_around_stops.py
+++ b/rt_segment_speeds/scripts/vp_around_stops.py
@@ -14,7 +14,8 @@
from pathlib import Path
from typing import Literal, Optional
-from segment_speed_utils import helpers, wrangle_shapes
+from segment_speed_utils import helpers
+from shared_utils import geo_utils
from update_vars import SEGMENT_GCS, GTFS_DATA_DICT
from segment_speed_utils.project_vars import SEGMENT_TYPES, PROJECT_CRS
@@ -115,7 +116,7 @@ def get_vp_projected_against_shape(
f"{SEGMENT_GCS}{input_file}_{analysis_date}",
columns = ["trip_instance_key", "vp_idx", "x", "y"],
**kwargs
- ).pipe(wrangle_shapes.vp_as_gdf, crs = PROJECT_CRS)
+ ).pipe(geo_utils.vp_as_gdf, crs = PROJECT_CRS)
# Merge all together so we can project vp point goem
# against shape line geom
diff --git a/rt_segment_speeds/segment_speed_utils/__init__.py b/rt_segment_speeds/segment_speed_utils/__init__.py
index b7e75b05e..4d10f7d0e 100644
--- a/rt_segment_speeds/segment_speed_utils/__init__.py
+++ b/rt_segment_speeds/segment_speed_utils/__init__.py
@@ -7,10 +7,8 @@
parallel_corridors,
project_vars,
segment_calcs,
- time_helpers,
time_series_utils,
vp_transform,
- wrangle_shapes,
)
__all__ = [
@@ -22,8 +20,6 @@
"parallel_corridors",
"project_vars",
"segment_calcs",
- "time_helpers",
"time_series_utils",
"vp_transform",
- "wrangle_shapes",
]
\ No newline at end of file
diff --git a/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py b/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py
index d73c378bd..da2ac2b2c 100644
--- a/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py
+++ b/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py
@@ -1,14 +1,14 @@
"""
All kinds of GTFS schedule table wrangling.
"""
+import dask.dataframe as dd
import geopandas as gpd
import pandas as pd
-import dask.dataframe as dd
from typing import Literal, Union
-from segment_speed_utils import helpers, time_helpers
-from shared_utils import portfolio_utils, rt_utils
+from segment_speed_utils import helpers
+from shared_utils import portfolio_utils, rt_utils, time_helpers
from segment_speed_utils.project_vars import SEGMENT_GCS
sched_rt_category_dict = {
diff --git a/rt_segment_speeds/segment_speed_utils/neighbor.py b/rt_segment_speeds/segment_speed_utils/neighbor.py
index ec18fe074..54ee5d8d4 100644
--- a/rt_segment_speeds/segment_speed_utils/neighbor.py
+++ b/rt_segment_speeds/segment_speed_utils/neighbor.py
@@ -6,41 +6,11 @@
import pandas as pd
import shapely
-from scipy.spatial import KDTree
-
from calitp_data_analysis.geography_utils import WGS84
-from segment_speed_utils import gtfs_schedule_wrangling, wrangle_shapes
+from segment_speed_utils import gtfs_schedule_wrangling
from segment_speed_utils.project_vars import SEGMENT_GCS, GTFS_DATA_DICT
+from shared_utils import geo_utils
-# Could we use distance to filter for nearest neighbor?
-# It can make the length of results more unpredictable...maybe we stick to
-# k_neighbors and keep the nearest k, so that we can at least be
-# more consistent with the arrays returned
-geo_const_meters = 6_371_000 * np.pi / 180
-geo_const_miles = 3_959_000 * np.pi / 180
-
-def nearest_snap(
- line: shapely.LineString,
- point: shapely.Point,
- k_neighbors: int = 1
-) -> np.ndarray:
- """
- Based off of this function,
- but we want to return the index value, rather than the point.
- https://github.com/UTEL-UIUC/gtfs_segments/blob/main/gtfs_segments/geom_utils.py
- """
- line = np.asarray(line.coords)
- point = np.asarray(point.coords)
- tree = KDTree(line)
-
- # np_dist is array of distances of result (let's not return it)
- # np_inds is array of indices of result
- _, np_inds = tree.query(
- point, workers=-1, k=k_neighbors,
- )
-
- return np_inds.squeeze()
-
def add_nearest_vp_idx(
vp_linestring: shapely.LineString,
@@ -51,7 +21,7 @@ def add_nearest_vp_idx(
Index into where the nearest vp is to the stop,
and return that vp_idx value from the vp_idx array.
"""
- idx = nearest_snap(vp_linestring, stop, k_neighbors=1)
+ idx = geo_utils.nearest_snap(vp_linestring, stop, k_neighbors=1)
return vp_idx_arr[idx]
@@ -105,7 +75,7 @@ def add_nearest_neighbor_result_array(
stop_geometry = getattr(row, "stop_geometry")
vp_idx_arr = getattr(row, "vp_idx")
- np_inds = nearest_snap(
+ np_inds = geo_utils.nearest_snap(
vp_coords_line, stop_geometry, N_NEAREST_POINTS
)
diff --git a/rt_segment_speeds/segment_speed_utils/segment_calcs.py b/rt_segment_speeds/segment_speed_utils/segment_calcs.py
index 46f33e047..9bfac4613 100644
--- a/rt_segment_speeds/segment_speed_utils/segment_calcs.py
+++ b/rt_segment_speeds/segment_speed_utils/segment_calcs.py
@@ -134,3 +134,19 @@ def get_usable_vp_bounds_by_trip(df: dd.DataFrame) -> pd.DataFrame:
).reset_index(drop=True).compute()
return df2
+
+
+def interpolate_stop_arrival_time(
+ stop_position: float,
+ shape_meters_arr: np.ndarray,
+ timestamp_arr: np.ndarray
+) -> float:
+ """
+ Interpolate the arrival time given the stop meters position.
+ Cast datetimes into floats and cast back as datetime.
+ """
+ timestamp_arr = np.asarray(timestamp_arr).astype("datetime64[s]").astype("float64")
+
+ return np.interp(
+ stop_position, np.asarray(shape_meters_arr), timestamp_arr
+ ).astype("datetime64[s]")
\ No newline at end of file
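
interpolate_stop_arrival_time moved here from the deleted wrangle_shapes module. As a quick sanity check of the datetime casting, a made-up example:

```python
import numpy as np
from segment_speed_utils import segment_calcs

shape_meters_arr = np.array([0.0, 100.0, 200.0])
timestamp_arr = np.array(
    ["2024-04-17T08:00:00", "2024-04-17T08:01:00", "2024-04-17T08:04:00"],
    dtype="datetime64[s]",
)

# A stop 150 meters along the shape is halfway between the 2nd and 3rd
# timestamps, so the interpolated arrival is 08:02:30
arrival = segment_calcs.interpolate_stop_arrival_time(
    150.0, shape_meters_arr, timestamp_arr
)
# arrival -> numpy.datetime64('2024-04-17T08:02:30')
```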
diff --git a/rt_segment_speeds/segment_speed_utils/time_helpers.py b/rt_segment_speeds/segment_speed_utils/time_helpers.py
deleted file mode 100644
index af74842ae..000000000
--- a/rt_segment_speeds/segment_speed_utils/time_helpers.py
+++ /dev/null
@@ -1,84 +0,0 @@
-"""
-Helpers for defining peak vs offpeak periods and
-weekend and weekends so we can aggregate our
-existing time-of-day bins.
-"""
-import datetime
-import pandas as pd
-
-PEAK_PERIODS = ["AM Peak", "PM Peak"]
-
-HOURS_BY_TIME_OF_DAY = {
- "Owl": 4, #[0, 3]
- "Early AM": 3, #[4, 6]
- "AM Peak": 3, #[7, 9]
- "Midday": 5, #[10, 14]
- "PM Peak": 5, #[15, 19]
- "Evening": 4 #[20, 23]
-}
-
-TIME_OF_DAY_DICT = {
- **{k: "peak" for k, v in HOURS_BY_TIME_OF_DAY.items()
- if k in PEAK_PERIODS},
- **{k: "offpeak" for k, v in HOURS_BY_TIME_OF_DAY.items()
- if k not in PEAK_PERIODS}
-}
-
-DAY_TYPE_DICT = {
- 1: "Sunday",
- 2: "Monday",
- 3: "Tuesday",
- 4: "Wednesday",
- 5: "Thursday",
- 6: "Friday",
- 7: "Saturday",
-}
-
-WEEKDAY_DICT = {
- **{k: "weekday" for k in ["Monday", "Tuesday", "Wednesday",
- "Thursday", "Friday"]},
- **{k: "weekend" for k in ["Saturday", "Sunday"]}
-}
-
-def time_span_labeling(date_list: list) -> tuple[str]:
- """
- If we grab a week's worth of trips, we'll
- use this week's average to stand-in for the entire month.
- Label with month and year.
- """
- time_span_str = list(set(
- [datetime.datetime.strptime(d, "%Y-%m-%d").strftime("%b%Y").lower()
- for d in date_list]
- ))
-
- time_span_num = list(set(
- [datetime.datetime.strptime(d, "%Y-%m-%d").strftime("%m_%Y").lower()
- for d in date_list]
- ))
-
- if len(time_span_str) == 1:
- return time_span_str[0], time_span_num[0]
-
- else:
- print(f"multiple months: {time_span_str}")
- return time_span_str, time_span_num
-
-
-def add_time_span_columns(
- df: pd.DataFrame,
- time_span_num: str
-) -> pd.DataFrame:
-
- month = int(time_span_num.split('_')[0])
- year = int(time_span_num.split('_')[1])
-
- # Downgrade some dtypes for public bucket
- df = df.assign(
- month = month,
- year = year,
- ).astype({
- "month": "int16",
- "year": "int16",
- })
-
- return df
\ No newline at end of file
diff --git a/rt_segment_speeds/segment_speed_utils/vp_transform.py b/rt_segment_speeds/segment_speed_utils/vp_transform.py
index 32a157a63..154a5b040 100644
--- a/rt_segment_speeds/segment_speed_utils/vp_transform.py
+++ b/rt_segment_speeds/segment_speed_utils/vp_transform.py
@@ -4,7 +4,16 @@
import shapely
from calitp_data_analysis.geography_utils import WGS84
-from segment_speed_utils import wrangle_shapes
+
+ALL_DIRECTIONS = ["Northbound", "Southbound", "Eastbound", "Westbound"]
+
+OPPOSITE_DIRECTIONS = {
+ "Northbound": "Southbound",
+ "Southbound": "Northbound",
+ "Eastbound": "Westbound",
+ "Westbound": "Eastbound",
+ "Unknown": "",
+}
def condense_point_geom_to_line(
df: pd.DataFrame,
@@ -67,7 +76,7 @@ def combine_valid_vp_for_direction(
direction: str
) -> gpd.GeoDataFrame:
- opposite_direction = wrangle_shapes.OPPOSITE_DIRECTIONS[direction]
+ opposite_direction = OPPOSITE_DIRECTIONS[direction]
coords_series = []
vp_idx_series = []
diff --git a/rt_segment_speeds/segment_speed_utils/wrangle_shapes.py b/rt_segment_speeds/segment_speed_utils/wrangle_shapes.py
deleted file mode 100644
index 6becb9703..000000000
--- a/rt_segment_speeds/segment_speed_utils/wrangle_shapes.py
+++ /dev/null
@@ -1,195 +0,0 @@
-"""
-Functions for applying shapely project and interpolation.
-Move our shapes (linestrings) and stops (points) from coordinates
-to numpy arrays with numeric values (shape_meters) and vice versa.
-
-References:
-* Tried method 4: https://gis.stackexchange.com/questions/203048/split-lines-at-points-using-shapely -- debug because we lost curves
-* https://stackoverflow.com/questions/31072945/shapely-cut-a-piece-from-a-linestring-at-two-cutting-points
-* https://gis.stackexchange.com/questions/210220/break-a-shapely-linestring-at-multiple-points
-* https://gis.stackexchange.com/questions/416284/splitting-multiline-or-linestring-into-equal-segments-of-particular-length-using
-* https://stackoverflow.com/questions/62053253/how-to-split-a-linestring-to-segments
-"""
-import geopandas as gpd
-import numpy as np
-import pandas as pd
-import shapely
-
-from typing import Literal
-
-from calitp_data_analysis import geography_utils
-from shared_utils import rt_utils
-from segment_speed_utils.project_vars import PROJECT_CRS
-
-ALL_DIRECTIONS = ["Northbound", "Southbound", "Eastbound", "Westbound"]
-
-OPPOSITE_DIRECTIONS = {
- "Northbound": "Southbound",
- "Southbound": "Northbound",
- "Eastbound": "Westbound",
- "Westbound": "Eastbound",
- "Unknown": "",
-}
-
-def interpolate_projected_points(
- shape_geometry: shapely.geometry.LineString,
- projected_list: list
-):
- return [shape_geometry.interpolate(i) for i in projected_list]
-
-
-def project_list_of_coords(
- shape_geometry: shapely.geometry.LineString,
- point_geom_list: list = [],
- use_shapely_coords: bool = False
-) -> np.ndarray:
- if use_shapely_coords:
- # https://stackoverflow.com/questions/49330030/remove-a-duplicate-point-from-polygon-in-shapely
- # use simplify(0) to remove any points that might be duplicates
- return np.asarray(
- [shape_geometry.project(shapely.geometry.Point(p))
- for p in shape_geometry.simplify(0).coords])
- else:
- return np.asarray(
- [shape_geometry.project(i) for i in point_geom_list])
-
-
-def add_arrowized_geometry(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
- """
- Add a column where the segment is arrowized.
- """
-
- segment_geom = gpd.GeoSeries(gdf.geometry)
- CRS = gdf.crs.to_epsg()
-
- #TODO: parallel_offset is going to be deprecated? offset_curve is the new one
- geom_parallel = gpd.GeoSeries(
- [rt_utils.try_parallel(i) for i in segment_geom], crs=CRS)
- #geom_parallel = gpd.GeoSeries(
- # [i.offset_curve(30) for i in segment_geom],
- # crs=CRS
- #)
-
- geom_arrowized = rt_utils.arrowize_segment(
- geom_parallel,
- buffer_distance = 20
- )
-
- gdf = gdf.assign(
- geometry_arrowized = geom_arrowized
- )
-
- return gdf
-
-
-def array_to_geoseries(
- array: np.ndarray,
- geom_type: Literal["point", "line", "polygon"],
- crs: str = "EPSG:3310"
-)-> gpd.GeoSeries:
- """
- Turn array back into geoseries.
- """
- if geom_type == "point":
- gdf = gpd.GeoSeries(array, crs=crs)
-
- elif geom_type == "line":
- gdf = gpd.GeoSeries(
- shapely.geometry.LineString(array),
- crs=crs)
-
- elif geom_type == "polygon":
- gdf = gpd.GeoSeries(
- shapely.geometry.Polygon(array),
- crs = crs)
-
- return gdf
-
-
-def get_direction_vector(
- start: shapely.geometry.Point,
- end: shapely.geometry.Point
-) -> tuple:
- """
- Given 2 points (in a projected CRS...not WGS84), return a
- tuple that shows (delta_x, delta_y).
-
- https://www.varsitytutors.com/precalculus-help/find-a-direction-vector-when-given-two-points
- https://stackoverflow.com/questions/17332759/finding-vectors-with-2-points
-
- """
- return ((end.x - start.x), (end.y - start.y))
-
-
-def distill_array_into_direction_vector(array: np.ndarray) -> tuple:
- """
- Given an array of n items, let's take the start/end of that.
- From start/end, we can turn 2 coordinate points into 1 distance vector.
- Distance vector is a tuple that equals (delta_x, delta_y).
- """
- origin = array[0]
- destination = array[-1]
- return get_direction_vector(origin, destination)
-
-
-def get_vector_norm(vector: tuple) -> float:
- """
- Get the length (off of Pythagorean Theorem) by summing up
- the squares of the components and then taking square root.
-
- Use Pythagorean Theorem to get unit vector. Divide the vector
- by the length of the vector to get unit/normalized vector.
- This equation tells us what we need to divide by.
- """
- return np.sqrt(vector[0]**2 + vector[1]**2)
-
-
-def get_normalized_vector(vector: tuple) -> tuple:
- """
- Apply Pythagorean Theorem and normalize the vector of distances.
- https://stackoverflow.com/questions/21030391/how-to-normalize-a-numpy-array-to-a-unit-vector
- """
- x_norm = vector[0] / get_vector_norm(vector)
- y_norm = vector[1] / get_vector_norm(vector)
-
- return (x_norm, y_norm)
-
-
-def dot_product(vec1: tuple, vec2: tuple) -> float:
- """
- Take the dot product. Multiply the x components, the y components, and
- sum it up.
- """
- return vec1[0]*vec2[0] + vec1[1]*vec2[1]
-
-
-def vp_as_gdf(
- vp: pd.DataFrame,
- crs: str = PROJECT_CRS
-) -> gpd.GeoDataFrame:
- """
- Turn vp as a gdf and project to EPSG:3310.
- """
- vp_gdf = geography_utils.create_point_geometry(
- vp,
- longitude_col = "x", latitude_col = "y",
- crs = geography_utils.WGS84
- ).to_crs(crs).drop(columns = ["x", "y"])
-
- return vp_gdf
-
-
-def interpolate_stop_arrival_time(
- stop_position: float,
- shape_meters_arr: np.ndarray,
- timestamp_arr: np.ndarray
-) -> float:
- """
- Interpolate the arrival time given the stop meters position.
- Cast datetimes into floats and cast back as datetime.
- """
- timestamp_arr = np.asarray(timestamp_arr).astype("datetime64[s]").astype("float64")
-
- return np.interp(
- stop_position, np.asarray(shape_meters_arr), timestamp_arr
- ).astype("datetime64[s]")
\ No newline at end of file
diff --git a/rt_segment_speeds/setup.py b/rt_segment_speeds/setup.py
index 00030a4e9..be6555fa4 100644
--- a/rt_segment_speeds/setup.py
+++ b/rt_segment_speeds/setup.py
@@ -3,7 +3,7 @@
setup(
name="segment_speed_utils",
packages=find_packages(),
- version="1.5",
+ version="1.6",
description="Utility functions for GTFS RT segment speeds",
author="Cal-ITP",
license="Apache",