diff --git a/Makefile b/Makefile index 04b3e5e5c..41b98acf8 100644 --- a/Makefile +++ b/Makefile @@ -59,12 +59,25 @@ build_district_digest: make build_portfolio_site git add portfolio/$(site)/district_*/*.ipynb +build_starterkit_ha: + $(eval export site = ha_starterkit_district) + pip install -r portfolio/requirements.txt + make build_portfolio_site + git add portfolio/$(site)/district_*/ portfolio/$(site)/*.yml portfolio/$(site)/*.md + python portfolio/portfolio.py index --deploy --prod + +build_starterkit_LASTNAME: + $(eval export site = YOUR_SITE_NAME) + pip install -r portfolio/requirements.txt + make build_portfolio_site + git add portfolio/$(site)/district_*/ portfolio/$(site)/*.yml portfolio/$(site)/*.md + python portfolio/portfolio.py index --deploy --prod + add_precommit: pip install pre-commit pre-commit install #pre-commit run --all-files - # Add to _.bash_profile outside of data-analyses #alias go='cd ~/data-analyses/portfolio && pip install -r requirements.txt && cd #../_shared_utils && make setup_env && cd ..' diff --git a/ha_portfolio/README.md b/ha_portfolio/README.md new file mode 100644 index 000000000..825c171fc --- /dev/null +++ b/ha_portfolio/README.md @@ -0,0 +1,11 @@ +# Starter Kit Portfolio +I am revamping some of our exercises and one exercise will teach future analysts how to make a portfolio. Yay! + +## Who We Are +We want our audience to understand who we are and why our expertise and research should be trusted. Here is a blurb you can lift. + +This website was created by the [California Department of Transportation](https://dot.ca.gov/)'s Division of Data and Digital Services. We are a group of data analysts and scientists who analyze transportation data, such as General Transit Feed Specification (GTFS) data, or data from funding programs such as the Active Transportation Program. Our goal is to transform messy and indecipherable original datasets into usable, customer-friendly products to better the transportation landscape. 
For more of our work, visit our [portfolio](https://analysis.calitp.org/). + +Alt text Alt text + +
Caltrans®, the California Department of Transportation® and the Caltrans logo are registered service marks of the California Department of Transportation and may not be copied, distributed, displayed, reproduced or transmitted in any form without prior written permission from the California Department of Transportation. diff --git a/ha_portfolio/_starterkit_utils.py b/ha_portfolio/_starterkit_utils.py new file mode 100644 index 000000000..11869fb63 --- /dev/null +++ b/ha_portfolio/_starterkit_utils.py @@ -0,0 +1,195 @@ +import pandas as pd +import numpy as np +import altair as alt +from calitp_data_analysis import calitp_color_palette +from IPython.display import HTML, Image, Markdown, display, display_html + +def reverse_snakecase(df:pd.DataFrame)->pd.DataFrame: + """ + Clean up columns to remove underscores and spaces. + """ + df.columns = df.columns.str.replace("_", " ").str.strip().str.title() + + df.columns = (df.columns.str.replace("Dac", "DAC") + .str.replace("Vmt", "VMT") + .str.replace("Zev", "ZEV") + .str.replace("Lu", "Landuse") + .str.replace("Ct", "CalTrans") + ) + return df + +def load_dataset()->pd.DataFrame: + """ + Load the final dataframe. + """ + GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/starter_kit/" + FILE = "starter_kit_example_categorized.parquet" + + # Read dataframe in + df = pd.read_parquet(f"{GCS_FILE_PATH}{FILE}") + + # Capitalize the Scope of Work column again since it is all lowercase + df.scope_of_work = df.scope_of_work.str.capitalize() + + # Clean up the column names + df = reverse_snakecase(df) + return df + +def aggregate_by_category(df: pd.DataFrame) -> pd.DataFrame: + """ + Find the median overall score and project cost + and total unique projects by category. 
+ """ + agg1 = ( + df.groupby(["Category"]) + .aggregate( + { + "Overall Score": "median", + "Project Cost": "median", + "Project Name": "nunique", + } + ) + .reset_index() + .rename( + columns={ + "Overall Score": "Median Score", + "Project Cost": "Median Project Cost", + "Project Name": "Total Projects", + } + ) + ) + + # Format the Cost column properly + agg1['Median Project Cost'] = agg1['Median Project Cost'].apply(lambda x: '${:,.0f}'.format(x)) + + return agg1 + +def wide_to_long(df:pd.DataFrame)->pd.DataFrame: + """ + Change the dataframe from wide to long based on the project name and + Caltrans District. + """ + df2 = pd.melt( + df, + id_vars=["CalTrans District","Project Name"], + value_vars=[ + "Accessibility Score", + "DAC Accessibility Score", + "DAC Traffic Impacts Score", + "Freight Efficiency Score", + "Freight Sustainability Score", + "Mode Shift Score", + "Landuse Natural Resources Score", + "Safety Score", + "VMT Score", + "ZEV Score", + "Public Engagement Score", + "Climate Resilience Score", + "Program Fit Score", + ]) + + df2 = df2.rename(columns = {'variable':'Metric', + 'value':'Score'}) + return df2 + +def style_df(df: pd.DataFrame): + """ + Styles a dataframe and displays it. + """ + display( + df.style.hide(axis="index") + .format(precision=0) # Display only 2 decimal points + .set_properties(**{ + "background-color": "white", + "text-align": "center" + }) + ) + +def create_metric_chart(df: pd.DataFrame) -> alt.Chart: + """ + Create a chart that displays metric scores + for each project. 
+ """ + # Create dropdown + metrics_list = df["Metric"].unique().tolist() + + metrics_dropdown = alt.binding_select( + options=metrics_list, + name="Metrics: ", + ) + # Column that controls the bar charts + xcol_param = alt.selection_point( + fields=["Metric"], value=metrics_list[0], bind=metrics_dropdown + ) + + chart = ( + alt.Chart(df, title="Metric by Categories") + .mark_circle(size=200) + .encode( + x=alt.X("Score", scale=alt.Scale(domain=[0, 10])), + y=alt.Y("Project Name"), + color=alt.Color( + "Score", + scale=alt.Scale( + range=calitp_color_palette.CALITP_CATEGORY_BRIGHT_COLORS + ), + ), + tooltip=list(df.columns), + ) + .properties(width=400, height=250) + ) + + chart = chart.add_params(xcol_param).transform_filter(xcol_param) + + return chart + +def create_district_summary(df: pd.DataFrame, caltrans_district: int): + """ + Create a summary of CSIS metrics for one Caltrans District. + """ + filtered_df = df.loc[df["CalTrans District"] == caltrans_district].reset_index( + drop=True + ) + # Finding the values referenced in the narrative + median_score = filtered_df["Overall Score"].median() + total_projects = filtered_df["Project Name"].nunique() + max_project = filtered_df["Project Cost"].max() + max_project = f"${max_project:,.2f}" + + # Aggregate the dataframe + aggregated_df = aggregate_by_category(filtered_df) + + # Change the dataframe from wide to long + df2 = wide_to_long(filtered_df) + + # Create narrative + display( + Markdown( + f"""The median score for projects in District {caltrans_district} is {median_score}
+ The total number of projects is {total_projects}
+ The most expensive project costs {max_project} + """ + ) + ) + display( + Markdown( + f"""

Metrics aggregated by Categories

+ """ + ) + ) + style_df(aggregated_df) + + display( + Markdown( + f"""

Overview of Projects

+ """ + ) + ) + style_df(filtered_df[["Project Name", "Overall Score", "Scope Of Work"]]) + display( + Markdown( + f"""

Metric Scores by Project

+ """ + ) + ) + display(create_metric_chart(df2)) \ No newline at end of file diff --git a/ha_portfolio/ha_portfolio.ipynb b/ha_portfolio/ha_portfolio.ipynb new file mode 100644 index 000000000..b7cc25e14 --- /dev/null +++ b/ha_portfolio/ha_portfolio.ipynb @@ -0,0 +1,111 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "260ba8f3-dd02-4fdc-945d-450db01d188e", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "import calitp_data_analysis.magics\n", + "\n", + "# All your other packages go here\n", + "# Here I just want pandas and my own utils.\n", + "import pandas as pd\n", + "import _starterkit_utils " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a2996fd-29d0-4a19-ac48-a6957d9f8140", + "metadata": {}, + "outputs": [], + "source": [ + "pd.options.display.max_columns = 100\n", + "pd.options.display.float_format = \"{:.2f}\".format\n", + "pd.set_option(\"display.max_rows\", None)\n", + "pd.set_option(\"display.max_colwidth\", None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d82c9a8-6f8f-485b-ace5-957f1b80c2f3", + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "# district = 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43a07a8c-567d-471d-be10-a547cd0b3a13", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture_parameters\n", + "district" + ] + }, + { + "cell_type": "markdown", + "id": "cb5a0cc4-3e7e-4aea-81f2-c5e858fb315b", + "metadata": {}, + "source": [ + "# District {district} Analysis " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c91049e1-107d-47d9-9cda-63aa4fbf554b", + "metadata": {}, + "outputs": [], + "source": [ + "df = _starterkit_utils.load_dataset()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd1509c0-b435-456e-ad1c-b583a991f1e2", + "metadata": 
{}, + "outputs": [], + "source": [ + "_starterkit_utils.create_district_summary(df, district)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/portfolio/ha_starterkit_district/README.md b/portfolio/ha_starterkit_district/README.md new file mode 100644 index 000000000..825c171fc --- /dev/null +++ b/portfolio/ha_starterkit_district/README.md @@ -0,0 +1,11 @@ +# Starter Kit Portfolio +I am revamping some of our exercises and one exercise will teach future analysts how to make a portfolio. Yay! + +## Who We Are +We want our audience to understand who we are and why our expertise and research should be trusted. Here is a blurb you can lift. + +This website was created by the [California Department of Transportation](https://dot.ca.gov/)'s Division of Data and Digital Services. We are a group of data analysts and scientists who analyze transportation data, such as General Transit Feed Specification (GTFS) data, or data from funding programs such as the Active Transportation Program. Our goal is to transform messy and indecipherable original datasets into usable, customer-friendly products to better the transportation landscape. For more of our work, visit our [portfolio](https://analysis.calitp.org/). + +Alt text Alt text + +
Caltrans®, the California Department of Transportation® and the Caltrans logo are registered service marks of the California Department of Transportation and may not be copied, distributed, displayed, reproduced or transmitted in any form without prior written permission from the California Department of Transportation. diff --git a/portfolio/ha_starterkit_district/_config.yml b/portfolio/ha_starterkit_district/_config.yml new file mode 100644 index 000000000..47dea21ce --- /dev/null +++ b/portfolio/ha_starterkit_district/_config.yml @@ -0,0 +1,43 @@ +# Book settings +# Learn more at https://jupyterbook.org/customize/config.html + +title: Testing a Portfolio +author: Cal-ITP +copyright: "2024" +#logo: calitp_logo_MAIN.png + +# Force re-execution of notebooks on each build. +# See https://jupyterbook.org/content/execute.html +execute: + execute_notebooks: 'off' + allow_errors: false + timeout: -1 + +# Define the name of the latex output file for PDF builds +latex: + latex_documents: + targetname: book.tex + +launch_buttons: + binderhub_url: "https://mybinder.org" + jupyterhub_url: "https://hubtest.k8s.calitp.jarv.us" + thebe: true + +repository: + url: https://github.com/cal-itp/data-analyses/ # Online location of your book +# path_to_book: docs # Optional path to your book, relative to the repository root + path_to_book: ha_portfolio + branch: main # Which branch of the repository should be used when creating links (optional) + +# Add GitHub buttons to your book +# See https://jupyterbook.org/customize/config.html#add-a-link-to-your-repository +html: + use_issues_button: true + use_repository_button: true + use_edit_page_button: true + google_analytics_id: 'G-JCX3Z8JZJC' + +sphinx: + config: + html_js_files: + - https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js \ No newline at end of file diff --git a/portfolio/ha_starterkit_district/_toc.yml b/portfolio/ha_starterkit_district/_toc.yml new file mode 100644 index 000000000..957f7035e --- 
/dev/null +++ b/portfolio/ha_starterkit_district/_toc.yml @@ -0,0 +1,17 @@ +format: jb-book +parts: +- caption: null + chapters: + - file: district_1/00__ha_portfolio__district_1.ipynb + - file: district_2/00__ha_portfolio__district_2.ipynb + - file: district_3/00__ha_portfolio__district_3.ipynb + - file: district_4/00__ha_portfolio__district_4.ipynb + - file: district_5/00__ha_portfolio__district_5.ipynb + - file: district_6/00__ha_portfolio__district_6.ipynb + - file: district_7/00__ha_portfolio__district_7.ipynb + - file: district_8/00__ha_portfolio__district_8.ipynb + - file: district_9/00__ha_portfolio__district_9.ipynb + - file: district_10/00__ha_portfolio__district_10.ipynb + - file: district_11/00__ha_portfolio__district_11.ipynb + - file: district_12/00__ha_portfolio__district_12.ipynb +root: README diff --git a/portfolio/ha_starterkit_district/district_1/00__ha_portfolio__district_1.ipynb b/portfolio/ha_starterkit_district/district_1/00__ha_portfolio__district_1.ipynb new file mode 100644 index 000000000..4f7fb7932 --- /dev/null +++ b/portfolio/ha_starterkit_district/district_1/00__ha_portfolio__district_1.ipynb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d628df9d431f7cad06accab072a4d319b0169bb9b99b511f8551a81a63f3a810 +size 18087 diff --git a/portfolio/ha_starterkit_district/district_10/00__ha_portfolio__district_10.ipynb b/portfolio/ha_starterkit_district/district_10/00__ha_portfolio__district_10.ipynb new file mode 100644 index 000000000..1fe836308 --- /dev/null +++ b/portfolio/ha_starterkit_district/district_10/00__ha_portfolio__district_10.ipynb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4bf26deea732b1c78fa9da3703ba63f832c241f1396280642b2c0e6b8d16d1a +size 20922 diff --git a/portfolio/ha_starterkit_district/district_11/00__ha_portfolio__district_11.ipynb b/portfolio/ha_starterkit_district/district_11/00__ha_portfolio__district_11.ipynb new file mode 100644 index 000000000..6094b0e20 --- 
/dev/null +++ b/portfolio/ha_starterkit_district/district_11/00__ha_portfolio__district_11.ipynb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df47ac5ae53e2a6f6e7c797db582fc828c5f1ff8928fd29e5a0edd77b704c738 +size 29005 diff --git a/portfolio/ha_starterkit_district/district_12/00__ha_portfolio__district_12.ipynb b/portfolio/ha_starterkit_district/district_12/00__ha_portfolio__district_12.ipynb new file mode 100644 index 000000000..163365a1e --- /dev/null +++ b/portfolio/ha_starterkit_district/district_12/00__ha_portfolio__district_12.ipynb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5b5ea17fe11171d4ced9c109bc53f70ef2b67ccec04288bcda856f3464c5e4f +size 25791 diff --git a/portfolio/ha_starterkit_district/district_2/00__ha_portfolio__district_2.ipynb b/portfolio/ha_starterkit_district/district_2/00__ha_portfolio__district_2.ipynb new file mode 100644 index 000000000..9e8d86659 --- /dev/null +++ b/portfolio/ha_starterkit_district/district_2/00__ha_portfolio__district_2.ipynb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bd66c6c71557ad082090fe960b7c1900bd3c4701ad6ed61e4aa1fefb0086618 +size 20941 diff --git a/portfolio/ha_starterkit_district/district_3/00__ha_portfolio__district_3.ipynb b/portfolio/ha_starterkit_district/district_3/00__ha_portfolio__district_3.ipynb new file mode 100644 index 000000000..334ded8a2 --- /dev/null +++ b/portfolio/ha_starterkit_district/district_3/00__ha_portfolio__district_3.ipynb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9136e8a5da205dbc13e53c3fda3733e32ffa7606c71f74076d485b8c2ec7fa3d +size 30296 diff --git a/portfolio/ha_starterkit_district/district_4/00__ha_portfolio__district_4.ipynb b/portfolio/ha_starterkit_district/district_4/00__ha_portfolio__district_4.ipynb new file mode 100644 index 000000000..80653b809 --- /dev/null +++ b/portfolio/ha_starterkit_district/district_4/00__ha_portfolio__district_4.ipynb @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:d1d8630e50736c232bc3664db0a264bd1a20d378f04def5ffbb9848eef5dfa73 +size 30891 diff --git a/portfolio/ha_starterkit_district/district_5/00__ha_portfolio__district_5.ipynb b/portfolio/ha_starterkit_district/district_5/00__ha_portfolio__district_5.ipynb new file mode 100644 index 000000000..104bf22f2 --- /dev/null +++ b/portfolio/ha_starterkit_district/district_5/00__ha_portfolio__district_5.ipynb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b387a497223ee90b0f281ea989fe9d65016f26434370bce798a30badb8b8a429 +size 26311 diff --git a/portfolio/ha_starterkit_district/district_6/00__ha_portfolio__district_6.ipynb b/portfolio/ha_starterkit_district/district_6/00__ha_portfolio__district_6.ipynb new file mode 100644 index 000000000..0ad26e968 --- /dev/null +++ b/portfolio/ha_starterkit_district/district_6/00__ha_portfolio__district_6.ipynb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff782ee23e9c5a27c332d8c00c6c3fa118bb022917e403a577183452c616a5b0 +size 23686 diff --git a/portfolio/ha_starterkit_district/district_7/00__ha_portfolio__district_7.ipynb b/portfolio/ha_starterkit_district/district_7/00__ha_portfolio__district_7.ipynb new file mode 100644 index 000000000..fc28ea0fe --- /dev/null +++ b/portfolio/ha_starterkit_district/district_7/00__ha_portfolio__district_7.ipynb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9b8c61b76270fbc128a0ab303ab6b518a8ecdc9ebf897a8ded23cb03491c63b +size 23053 diff --git a/portfolio/ha_starterkit_district/district_8/00__ha_portfolio__district_8.ipynb b/portfolio/ha_starterkit_district/district_8/00__ha_portfolio__district_8.ipynb new file mode 100644 index 000000000..ab7c46989 --- /dev/null +++ b/portfolio/ha_starterkit_district/district_8/00__ha_portfolio__district_8.ipynb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9adc67a6b90cf4e9697ce30f6339bf8fdf17559fe09f0b6dd26d85f684b77839 
+size 27411 diff --git a/portfolio/ha_starterkit_district/district_9/00__ha_portfolio__district_9.ipynb b/portfolio/ha_starterkit_district/district_9/00__ha_portfolio__district_9.ipynb new file mode 100644 index 000000000..f0b3a913f --- /dev/null +++ b/portfolio/ha_starterkit_district/district_9/00__ha_portfolio__district_9.ipynb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2960ce8d6abb645e17252eb5425cb2b0a69736bc0a73a570b87ebabec3738189 +size 23228 diff --git a/portfolio/sites/ha_starterkit_district.yml b/portfolio/sites/ha_starterkit_district.yml new file mode 100644 index 000000000..dbb2e23d1 --- /dev/null +++ b/portfolio/sites/ha_starterkit_district.yml @@ -0,0 +1,31 @@ +directory: ./ha_portfolio/ +notebook: ./ha_portfolio/ha_portfolio.ipynb +parts: +- caption: Introduction +- chapters: + - params: + district: 1 + - params: + district: 2 + - params: + district: 3 + - params: + district: 4 + - params: + district: 5 + - params: + district: 6 + - params: + district: 7 + - params: + district: 8 + - params: + district: 9 + - params: + district: 10 + - params: + district: 11 + - params: + district: 12 +readme: ./ha_portfolio/README.md +title: Testing a Portfolio diff --git a/starter_kit/19319_en_1.jpg b/starter_kit/19319_en_1.jpg new file mode 100644 index 000000000..eb30bbba4 Binary files /dev/null and b/starter_kit/19319_en_1.jpg differ diff --git a/starter_kit/2024_basics_01.ipynb b/starter_kit/2024_basics_01.ipynb new file mode 100644 index 000000000..3431b92ed --- /dev/null +++ b/starter_kit/2024_basics_01.ipynb @@ -0,0 +1,1151 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "247e773f-0e29-4ed6-ab4d-5856325611b4", + "metadata": {}, + "source": [ + "# Exercise 1: Familiarize yourself with `pandas` and `python`\n", + "If you are new to Python, there are many resources!\n", + "* There are introductory Python courses available through [Caltrans's LinkedIn Learning 
Library](https://www.linkedin.com/learning/search?keywords=python&u=36029164).\n", + "* [Practical Python for Data Science](https://www.practicalpythonfordatascience.com/00_python_crash_course) is an incredibly helpful book, and material from this resource is linked throughout.\n", + "\n", + "## Skills \n", + "* `pandas` is one of the base Python packages for working with tabular data.\n", + "* F-strings\n", + "* Export to Google Cloud Storage\n", + "* Practice committing on GitHub\n", + "\n", + "## How to use these tutorials\n", + "* The tutorials are divided by skills/concepts we are going to learn.\n", + "* There are hints and instructions on the top.\n", + "* There are links to references. **It is highly recommended to read through them and practice them in this notebook, in addition to these exercises.**\n", + "\n", + "## What are we working with today? \n", + "* Today we will be working on the Caltrans System Investment Strategy (CSIS). Per this [description](https://dot.ca.gov/programs/transportation-planning/division-of-transportation-planning/corridor-and-system-planning/csis):\n", + "> The California Department of Transportation (Caltrans) is committed to leading climate action and advancing social equity in the transportation sector set forth by the California State Transportation Agency (CalSTA) Climate Action Plan for Transportation Infrastructure (CAPTI, 2021)...Caltrans is in a significant leadership role to carry out meaningful measures that advance state’s goals and priorities through the development and implementation of the Caltrans System Investment Strategy (CSIS). The CSIS, which implements one of CAPTI’s key actions, is envisioned to be an investment framework through a data and performance-driven approach that guides transportation investments and decisions.\n", + "* DDS is working on CSIS by automating the scoring of projects using Python. 
We score each project based on how well it does in various categories (aka metrics) such as Zero Emission Vehicles, Vehicle Miles Traveled, and more. \n", + "* While the values we are working with today are all fake, the exercise is based on actual datasets and assignments. " + ] + }, + { + "cell_type": "markdown", + "id": "4dd32eed-55a4-4fd1-874b-02f9b4bd94a7", + "metadata": {}, + "source": [ + "## Import Packages\n", + "* Before doing some data cleaning and analyzing, we need to equip ourselves with the right tools to get started.\n", + "* Part of our \"toolbox\" are packages. \n", + "\n", + "* **Resource**: [Importing Dependencies via Practical Python for Data Science](https://www.practicalpythonfordatascience.com/05_data_exploration.html?highlight=dependencies#importing-our-dependencies)\n", + "\n", + "### `Pandas`\n", + "* You are importing the package `pandas` that is the backbone of the majority of our data analysis work. \n", + "* You can import countless packages. \n", + "* We commonly use `geopandas` for geospatial data work. We use `altair` for making charts." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "50199af7-04a8-43c5-ba1b-4127940749bd", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "19b42c5d-4f2b-4d66-a7a7-98ab74a6591e", + "metadata": {}, + "source": [ + "* This block of code below adjusts the notebook.\n", + "* I am setting the maximum number of columns to be displayed to be 100.\n", + "* I want any `float` columns to be rounded to 2 decimal places.\n", + "* I want all of the rows in the dataframe to display. \n", + "* I don't want my columns to be truncated.\n", + " * If you have a column with `strings` that is very long, it will automatically cut off.\n", + " * Example: The California Department of Transportation (Caltrans) is committed to leading climate action and advancing social equity... 
would be displayed something like this The California Department of Transportation (Caltrans) is... without this line of code.\n", + "* Adjust some of these settings if you wish " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "8e18d8d7-2cce-4854-b6c4-56a7e7bdf636", + "metadata": {}, + "outputs": [], + "source": [ + "pd.options.display.max_columns = 100\n", + "pd.options.display.float_format = \"{:.2f}\".format\n", + "pd.set_option(\"display.max_rows\", None)\n", + "pd.set_option(\"display.max_colwidth\", None)" + ] + }, + { + "cell_type": "markdown", + "id": "f14077a3-2882-46eb-8cd2-27c08e4705a9", + "metadata": {}, + "source": [ + "### `calitp_data_analysis`\n", + "* DDS also has our own [internal library of functions](https://docs.calitp.org/data-infra/analytics_tools/python_libraries.html#calitp-data-analysis).\n", + "* You can check out all the functions [here](https://github.com/cal-itp/data-infra/tree/main/packages/calitp-data-analysis/calitp_data_analysis).\n", + "* Below, we are importing only one function called `to_snakecase` from the python submodule `sql` in our package `calitp_data_analysis`. `to_snakecase` allows us to change the column names of our dataset from something like `Project Description` to `project_description`. \n", + "* By turning the column names to lower case and replacing the spaces with underscores, this makes referencing specific columns much easier." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "bd388d88-d2d6-4dd6-9870-22c14db7a44a", + "metadata": {}, + "outputs": [], + "source": [ + "from calitp_data_analysis.sql import to_snakecase" + ] + }, + { + "cell_type": "markdown", + "id": "ff74b143-6ff2-46e9-ae88-4a208155e990", + "metadata": {}, + "source": [ + "## Jupyter Notebook\n", + "* You're using a Jupyter Notebook right now.\n", + "* There are many benefits listed here in our [DDS Docs](https://docs.calitp.org/data-infra/analytics_new_analysts/04-notebooks.html).\n", + "* Take some time to get used to this interface. There are many tutorials available on YouTube that show tips and tricks; just skip the installation portion. \n", + " * [This one looks promising](https://youtu.be/LW2Rye_l8L0?si=B8kojobCe3OIF3xg)." + ] + }, + { + "cell_type": "markdown", + "id": "cc30cb7d-77d3-465b-9831-8810096af9b1", + "metadata": {}, + "source": [ + "## Check out the data \n", + "* Download the Excel workbook containing all the CSIS data from Google Cloud Storage [here](https://console.cloud.google.com/storage/browser/_details/calitp-analytics-data/data-analyses/starter_kit/starter_kit_csis_scoring_workbook.xlsx;tab=live_object?project=cal-itp-data-infra). \n", + " * Open it up in Excel and take a look at how many sheets there are and how the data is structured.\n", + "### Read in the data\n", + "* We are reading our Excel Workbook into a Pandas dataframe.\n", + "* While there is a very [technical definition](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) of what a dataframe is, you can think of it as an Excel sheet that holds your data. 
\n", + "* Resource: [This page of the Practical Python for Data Science](https://www.practicalpythonfordatascience.com/02_loading_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "5950cb87-75ab-4871-ab4b-a8f1c41f0a4a", + "metadata": {}, + "outputs": [], + "source": [ + "url = \"gs://calitp-analytics-data/data-analyses/starter_kit/starter_kit_csis_scoring_workbook.xlsx\"" + ] + }, + { + "cell_type": "markdown", + "id": "88d79cea-c017-454e-a2aa-85c0bf511d85", + "metadata": {}, + "source": [ + "* Read in the dataframe without `to_snakecase()` first to see what happens." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "67ba9264-65d9-453b-a800-a91bd365e43e", + "metadata": {}, + "outputs": [], + "source": [ + "df_no_snakecase = (pd.read_excel(url))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "e2d886b4-c207-41e5-8325-7275619b60e6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ct_districtproject_nameScope of WorkProject Costlead agency
01Meadow Magic Multi-Use PathA 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife.5245734Meadow Bunny Public Transportation (MBPT)
14Bunny Hop Bike BoulevardA Class II bike lane with charming streetlights, benches, and bike racks designed to resemble carrot sticks, connecting residential neighborhoods to local schools and parks.6929368Unicorn Fairy Express Bus (UFX)
\n", + "
" + ], + "text/plain": [ + " ct_district project_name \\\n", + "0 1 Meadow Magic Multi-Use Path \n", + "1 4 Bunny Hop Bike Boulevard \n", + "\n", + " Scope of Work \\\n", + "0 A 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife. \n", + "1 A Class II bike lane with charming streetlights, benches, and bike racks designed to resemble carrot sticks, connecting residential neighborhoods to local schools and parks. \n", + "\n", + " Project Cost lead agency \n", + "0 5245734 Meadow Bunny Public Transportation (MBPT) \n", + "1 6929368 Unicorn Fairy Express Bus (UFX) " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_no_snakecase.head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "f959563e-7fa2-444a-b2b3-6c539dce802b", + "metadata": {}, + "source": [ + "* Read in the dataframe with `to_snakecase()` now and compare the difference between the column names. " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "e09456e0-dfd2-4388-85de-eb9e95f983fa", + "metadata": {}, + "outputs": [], + "source": [ + "df = to_snakecase(pd.read_excel(url))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "54c718b3-eeff-4ec5-b012-1cc612543c60", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ct_districtproject_namescope_of_workproject_costlead_agency
01Meadow Magic Multi-Use PathA 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife.5245734Meadow Bunny Public Transportation (MBPT)
14Bunny Hop Bike BoulevardA Class II bike lane with charming streetlights, benches, and bike racks designed to resemble carrot sticks, connecting residential neighborhoods to local schools and parks.6929368Unicorn Fairy Express Bus (UFX)
\n", + "
" + ], + "text/plain": [ + " ct_district project_name \\\n", + "0 1 Meadow Magic Multi-Use Path \n", + "1 4 Bunny Hop Bike Boulevard \n", + "\n", + " scope_of_work \\\n", + "0 A 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife. \n", + "1 A Class II bike lane with charming streetlights, benches, and bike racks designed to resemble carrot sticks, connecting residential neighborhoods to local schools and parks. \n", + "\n", + " project_cost lead_agency \n", + "0 5245734 Meadow Bunny Public Transportation (MBPT) \n", + "1 6929368 Unicorn Fairy Express Bus (UFX) " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "179960a3-6c9b-42af-a8f1-d6156c4be2d2", + "metadata": {}, + "source": [ + "### Previewing Data \n", + "* Often, you want to get a sneak preview of your data. \n", + "* Thankfully, Python provides many methods for you to do so. \n", + "* Below are a couple of very common methods we use. \n", + " * `.head()` shows the first five rows, while `.tail()` shows the last five.\n", + " * `.sample()` shows you a random row.\n", + " * Want to see or less than five? Specify it in the parantheses: `.head(10)` allows you to see the first 10 rows and `.head(2)` allows you to see the first 2.\n", + "* Try everything yourself below.\n", + "* **Resource**: [Practical Python for Data Science: Data Inspection](https://www.practicalpythonfordatascience.com/02_loading_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "5e966250-47b1-4f14-802b-c795e44330dd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ct_districtproject_namescope_of_workproject_costlead_agency
01Meadow Magic Multi-Use PathA 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife.5245734Meadow Bunny Public Transportation (MBPT)
14Bunny Hop Bike BoulevardA Class II bike lane with charming streetlights, benches, and bike racks designed to resemble carrot sticks, connecting residential neighborhoods to local schools and parks.6929368Unicorn Fairy Express Bus (UFX)
\n", + "
" + ], + "text/plain": [ + " ct_district project_name \\\n", + "0 1 Meadow Magic Multi-Use Path \n", + "1 4 Bunny Hop Bike Boulevard \n", + "\n", + " scope_of_work \\\n", + "0 A 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife. \n", + "1 A Class II bike lane with charming streetlights, benches, and bike racks designed to resemble carrot sticks, connecting residential neighborhoods to local schools and parks. \n", + "\n", + " project_cost lead_agency \n", + "0 5245734 Meadow Bunny Public Transportation (MBPT) \n", + "1 6929368 Unicorn Fairy Express Bus (UFX) " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "3386e9d8-15cd-48bc-8b1f-cf6f95512ad5", + "metadata": {}, + "source": [ + "### More Methods!\n", + "* `df.shape` gives you the number of rows and columns in your dataset.\n", + "* `df.columns` returns all of the column names.\n", + "* `df.info()` per the [pandas docs](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.info.html#pandas.DataFrame.info) prints information about a DataFrame including the index dtype and columns, non-null values and memory usage.\n", + "* Experiment below. \n", + "* More food for thought:\n", + " * `Dtype` is critical. There are integers, objects, booleans, floats...\n", + " * Does the `dtype` of each column below make sense to you? \n", + " * The `dtype` of `object` is a catchall term." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "7f55b33e-d402-473b-815a-92ad935d35d7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 44 entries, 0 to 43\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 ct_district 44 non-null int64 \n", + " 1 project_name 44 non-null object\n", + " 2 scope_of_work 44 non-null object\n", + " 3 project_cost 44 non-null int64 \n", + " 4 lead_agency 44 non-null object\n", + "dtypes: int64(2), object(3)\n", + "memory usage: 1.8+ KB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "markdown", + "id": "d117908f-af05-4e95-8042-39a3e0557d6f", + "metadata": {}, + "source": [ + "### Deeper Dive\n", + "* We now know a good amount about our dataset, but the # of rows and columns are not always so thrilling. \n", + "* Let's take a closer look at some columns.\n", + "* `.value_counts()` helps you see how many times the same value appears. 
" + ] + }, + { + "cell_type": "markdown", + "id": "55cece73-c3d5-4cd7-8896-f97d43fc1114", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "63f21ab5-0920-4310-afce-2ea657556912", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4 6\n", + "3 6\n", + "8 5\n", + "11 5\n", + "12 4\n", + "5 4\n", + "9 3\n", + "6 3\n", + "7 3\n", + "2 2\n", + "10 2\n", + "1 1\n", + "Name: ct_district, dtype: int64" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.ct_district.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "55baf38e-3776-4448-b375-9e124030bae2", + "metadata": {}, + "source": [ + "* `.nunique()` displays the number of distinct values in your column\n", + " * This is useful because often the number of unique values of a column should match the number of rows of your dataset exactly.\n", + " * In our case, our dataframe has 44 rows and we should have 44 unique project names and scope of work descriptions." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "1d832308-a425-404d-83a0-53ce8bfae279", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "44" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.project_name.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "55d2140f-feab-496b-b9b1-90bbe5701a9a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(44, 5)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "markdown", + "id": "7c0c499e-fa7b-4f01-a357-db7b0ec41416", + "metadata": {}, + "source": [ + "* You can preview a column with brackets [] as well with the column name encased in quotation marks." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "4e232324-f75f-46a0-962d-76ed9273dac7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "44" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"scope_of_work\"].nunique()" + ] + }, + { + "cell_type": "markdown", + "id": "06ee15f6-ee2e-4e3e-91b2-115875292042", + "metadata": {}, + "source": [ + "## Something missing? \n", + "* Open up our dataset using Excel. \n", + "* Take a look at the bottom: how many sheets are there in the Excel workbook? \n", + "* Which sheet is loaded into `df` above? " + ] + }, + { + "cell_type": "markdown", + "id": "5302dd99-acb2-40d7-b00d-4f0493ee5e09", + "metadata": {}, + "source": [ + "### Lists: An Introduction\n", + "* We can load in all of the sheets in an Excel workbook using a list\n", + "* Per [Practical Python for Data Science](https://www.practicalpythonfordatascience.com/00_python_crash_course_datatypes.html?highlight=dictionary#list): \"lists represent a collection of objects and are constructed with square brackets, separating items with commas. A list can contain a collection of one datatype...It can also contain a collection of mixed datatypes\".\n", + " * **Play around with some of the examples in the link above in this notebook.**\n", + " * You will be using lists often in your work, so it is best to be familiar with this datatype." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7f41842-853e-4ad0-ae9e-0da0955d4352", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "21a32ab4-bfb2-4e7a-b90a-6fa05b7ceb89", + "metadata": {}, + "source": [ + "* I am placing all of the sheets in our Excel Workbook in a list.\n", + "* Notice that the items in this list are strings. 
\n", + " * Read about strings [here](https://www.practicalpythonfordatascience.com/00_python_crash_course_datatypes.html?highlight=dictionary#string).\n", + "* You can access each element of the list using an index.\n", + " * An index represents the location of an element with a number.\n", + " * The index always starts at 0. What we consider the first item is not index \"1\", it's index \"0\"." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "02380fb6-c55b-477f-acfb-8b483e83beac", + "metadata": {}, + "outputs": [], + "source": [ + "my_sheets = [\"projects_auto\",\n", + " \"overall_score\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "8a9a1a3e-e10d-4447-96dd-92ecb2fe6357", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(my_sheets)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "a3be037d-b21b-4192-9099-25bfcb660f01", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'projects_auto'" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Index\n", + "my_sheets[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "ebf91535-a466-446a-9f7a-606503d78b6a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'overall_score'" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_sheets[1]" + ] + }, + { + "cell_type": "markdown", + "id": "75df89d0-92fb-4e4e-aaa3-54f4944c55c3", + "metadata": {}, + "source": [ + "* Read the Excel workbook into a dataframe.\n", + "* Using the argument `sheet_name` you can open up a specific sheet in an Excel workbook or multiple sheets that are held in a list." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "2e2578bc-db1f-41f5-bc07-3cb82998420e", + "metadata": {}, + "outputs": [], + "source": [ + "df2 = pd.read_excel(\n", + " url,\n", + " sheet_name=my_sheets,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6059f491-3966-4343-b000-0830fa3559d6", + "metadata": {}, + "source": [ + "### Specificity is beautiful.\n", + "* Grab out each individual sheet into its own dataframe using `df2.get(my_sheets[enter in the index number])`. \n", + "* Make sure your `dataframe` is titled descriptively.\n", + "* `df` is not exactly very telling. " + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "4c6f8fdb-33d3-4c44-bb00-6d1447d49feb", + "metadata": {}, + "outputs": [], + "source": [ + "projects_df = to_snakecase(df2.get(my_sheets[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "167af2f1-b09d-476d-87b4-b9374ad445c2", + "metadata": {}, + "outputs": [], + "source": [ + "scores_df = to_snakecase(df2.get(my_sheets[1]))" + ] + }, + { + "cell_type": "markdown", + "id": "cd0d51ea-b7da-41d0-bb03-5432b4de1a1b", + "metadata": {}, + "source": [ + "## Add a new column\n", + "* Oops! We analysts were so wrapped up in scoring, we forgot to total up all the metrics to find the overall_score for the project. \n", + "* Sum up all the metric columns into a column called `overall_score`\n", + "* There are a couple of ways to do this: experiment! \n", + "* Here are some resources:\n", + " * [Stackoverflow](https://stackoverflow.com/questions/22342285/summing-two-columns-in-a-pandas-dataframe)\n", + " * [Statology](https://www.statology.org/pandas-sum-specific-columns/)\n", + "* Food for thought:\n", + " * What does `axis = 1` mean?\n", + " * What happens if you do `.sum(axis=0)`?\n", + " * You don't always have to save everything into a dataframe. You can do something like `df.sum(axis=0)` just to see what happens. 
\n", + " * Just make sure your dataframe isn't too large or else you will run out of memory!\n", + " * What happens when you create a new column with `scores_df.overall_score` instead of `scores_df[\"overall_score\"]`? " + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "e9321f90-8c99-46fb-9d50-8571f3d94fc8", + "metadata": {}, + "outputs": [], + "source": [ + "scores_df[\"overall_score\"] = scores_df.select_dtypes(include=['int64', 'float64']).sum(axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "246437eb-f284-49b8-960d-d601a66f6362", + "metadata": { + "tags": [] + }, + "source": [ + "## Subsetting\n", + "* Your manager asks for the `overall_score` for each project. \n", + "* They do not want to see the other metrics, only the project's name and its `overall_score`\n", + "* Subset the dataframe and save it into a new dataframe.\n", + "* Again, there are many ways to do the same thing in Python. \n", + "* Method 1: Enter in all the columns you want to keep in a list and place the list in another set of brackets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "4e6d8e70-ae57-46c5-a5aa-9972be77f415", + "metadata": {}, + "outputs": [], + "source": [ + "# Enter in the columns you want to keep\n", + "columns_to_keep = [\"project_name\",\"overall_score\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "48ee899b-3db9-464f-802f-d431189176b7", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "subsetted_df1 = scores_df[columns_to_keep]" + ] + }, + { + "cell_type": "markdown", + "id": "56865911-994c-4fb5-afe4-1fdc1d752d8b", + "metadata": {}, + "source": [ + "* Method 2: You can enter in all the columns in a list you want to drop and use `.drop()`" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "2c64cdcf-9598-4f4a-b077-5caec0cfe264", + "metadata": {}, + "outputs": [], + "source": [ + "# Enter in the columns you want to drop\n", + "columns_to_drop = []" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "47a96b86-e5d1-4fcd-ba73-7db5badae28b", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "\n", + "# subsetted_df2 = scores_df.drop(columns = columns_to_drop)" + ] + }, + { + "cell_type": "markdown", + "id": "e641185d-295d-4c42-ace1-16d33f2da0fa", + "metadata": {}, + "source": [ + "## F-Strings\n", + "* Save your subsetted dataframe from above back into the `starter_kit` folder. \n", + " * The file path should be something like this `\"gs://calitp-analytics-data/data-analyses/starter_kit/aggregated_csis.xlsx\"`.\n", + "* However, remember our original Excel workbook's file path? It was `\"gs://calitp-analytics-data/data-analyses/starter_kit/starter_kit_csis_scoring_workbook.xlsx\"`\n", + "* Essentially, the **only** difference between these two file paths is the file name (`aggregated_csis.xlsx` versus `starter_kit_csis_scoring_workbook.xlsx`) because the folder path `gs://calitp-analytics-data/data-analyses/starter_kit/` remains the same. 
\n", + "* This is where f-strings come in. Read more about them [here](https://realpython.com/python-f-strings/#f-strings-a-new-and-improved-way-to-format-strings-in-python).\n", + "> Python f-strings provide a quick way to interpolate and format strings. They’re readable, concise, and less prone to error than traditional string interpolation and formatting tools...\n", + "* Let's practice !\n", + " * My file_path is always going to be `gs://calitp-analytics-data/data-analyses/starter_kit/`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "4c9c53a5-dbf3-4dc0-aea0-832f3a91414d", + "metadata": {}, + "outputs": [], + "source": [ + "GCS_FILE_PATH = \"gs://calitp-analytics-data/data-analyses/starter_kit/\"" + ] + }, + { + "cell_type": "markdown", + "id": "11a088a5-e8e2-4a12-9736-44ae46c2d771", + "metadata": {}, + "source": [ + "* However the file is going to change.\n", + "* Save the file name in a variable called `FILE`." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "db111f34-08b8-42f9-96fe-6852c4af50ad", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "FILE = \"starter_kit_example_final_scores.xlsx\"" + ] + }, + { + "cell_type": "markdown", + "id": "bf96d0cf-7225-4a44-9955-988d982a0f7f", + "metadata": {}, + "source": [ + "* Using a `f-string`, combine `GCS_FILE_PATH` and `FILE` together." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "edff403c-ef37-48d8-8c7a-60b388752a51", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'gs://calitp-analytics-data/data-analyses/starter_kit/starter_kit_example_final_scores.xlsx'" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Put them together using a f-string\n", + "f\"{GCS_FILE_PATH}{FILE}\"" + ] + }, + { + "cell_type": "markdown", + "id": "5504c416-b65b-4c74-a2ba-95688cf8e77a", + "metadata": {}, + "source": [ + "* Now go open up your new Excel workbook and see if it's what you expect.\n", + " * Hint: you will probably get a very annoying extra column! \n", + " * Try out some of the arguments [here](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html#pandas.DataFrame.to_excel) and save your file again." + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "bf37fc2d-ac6c-4134-94de-79a9a4141ffc", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "scores_df[[\"project_name\",\"overall_score\"]].to_excel(f\"{GCS_FILE_PATH}{FILE}\")" + ] + }, + { + "cell_type": "markdown", + "id": "17c17adb-404e-4e54-bdb4-c3295e0e2be2", + "metadata": {}, + "source": [ + "* Export the entire (not subsetted) dataframe with the new `overall_score` column using `df.to_parquet()`. \n", + " * We typically prefer saving to `parquets`. Why? Read below. Text taken from [here](https://docs.calitp.org/data-infra/analytics_new_analysts/03-data-management.html#parquet).\n", + " * Parquet is an “open source columnar storage format for use in data analysis systems.” Columnar storage is more efficient as it is easily compressed and the data is more homogenous. CSV files utilize a row-based storage format which is harder to compress, a reason why Parquets files are preferable for larger datasets. 
Parquet files are faster to read than CSVs, as they have a higher querying speed and preserve datatypes (i.e. Number, Timestamps, Points). They are best for intermediate data storage and large datasets (1GB+) on most any on-disk storage. This file format is also good for passing dataframes between Python and R. A similar option is feather.\n", + "* Reference\n", + " * [DDS Docs: Saving Code](https://docs.calitp.org/data-infra/analytics_tools/saving_code.html)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "22562f2f-8359-4e44-951c-25e5ac033282", + "metadata": {}, + "outputs": [], + "source": [ + "scores_df.to_parquet(f\"{GCS_FILE_PATH}starter_kit_example_final_scores.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "9bc1a3cb-85e2-4203-bdd4-e45bb6c20ba4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_nameaccessibility_scoredac_accessibility_scoredac_traffic_impacts_scorefreight_efficiency_scorefreight_sustainability_scoremode_shift_scorelu_natural_resources_scoresafety_scorevmt_scorezev_scorepublic_engagement_scoreclimate_resilience_scoreprogram_fit_scoreoverall_score
0Meadow Magic Multi-Use Path28810235327661072
1Bunny Hop Bike Boulevard3976763221026568
\n", + "
" + ], + "text/plain": [ + " project_name accessibility_score dac_accessibility_score \\\n", + "0 Meadow Magic Multi-Use Path 2 8 \n", + "1 Bunny Hop Bike Boulevard 3 9 \n", + "\n", + " dac_traffic_impacts_score freight_efficiency_score \\\n", + "0 8 10 \n", + "1 7 6 \n", + "\n", + " freight_sustainability_score mode_shift_score lu_natural_resources_score \\\n", + "0 2 3 5 \n", + "1 7 6 3 \n", + "\n", + " safety_score vmt_score zev_score public_engagement_score \\\n", + "0 3 2 7 6 \n", + "1 2 2 10 2 \n", + "\n", + " climate_resilience_score program_fit_score overall_score \n", + "0 6 10 72 \n", + "1 6 5 68 " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scores_df.head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "69d211b4-89f0-4b2c-9093-1118114ba649", + "metadata": {}, + "source": [ + "## You're almost done!\n", + "* Name this notebook `YOURNAME_exercise1.ipynb`\n", + " * You can't right click and rename the file, since this notebook is tracked with Git. \n", + " * Rename it using `git mv OLDNAME.ipynb NEWNAME.ipynb`. \n", + " * The `mv` stands for move, and renaming a file is basically \"moving\" its path. \n", + " * Doing it this way retains the git history associated with the notebook. If you rename directly with right click, rename, you destroy the git history.\n", + "* Use a descriptive commit message (ex: adding chart, etc). GitHub already tracks who makes the commit, the date, the timestamp of it, the files being affected, so your commit message should be more descriptive than the metadata already stored." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/starter_kit/2024_basics_02.ipynb b/starter_kit/2024_basics_02.ipynb new file mode 100644 index 000000000..f9fc0551e --- /dev/null +++ b/starter_kit/2024_basics_02.ipynb @@ -0,0 +1,2040 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "685c09c1-4d11-42a8-a213-8267137eede8", + "metadata": {}, + "source": [ + "# Exercise 2: Merging, Aggregating, Filtering, and Visualizing" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6cbbfb96-1e9e-400a-9884-72f08d1191f3", + "metadata": {}, + "outputs": [], + "source": [ + "import altair as alt\n", + "import pandas as pd\n", + "from calitp_data_analysis.sql import to_snakecase" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3da62b06-24b4-4791-a073-185ee3765152", + "metadata": {}, + "outputs": [], + "source": [ + "pd.options.display.max_columns = 100\n", + "pd.options.display.float_format = \"{:.2f}\".format\n", + "pd.set_option(\"display.max_rows\", None)\n", + "pd.set_option(\"display.max_colwidth\", None)" + ] + }, + { + "cell_type": "markdown", + "id": "616f1aed-d082-4e49-8eae-5c3acf87155f", + "metadata": {}, + "source": [ + "* Read back in the `parquet` file with the `overall_score` you created from exercise 1.\n", + "* Read the Excel sheet containing the project information (scope of work, district, and project name).\n", + "* **Use f-strings.**" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e7e4cafe-eb24-477b-a45c-88bfcaff37f3", + "metadata": {}, + "outputs": [], + "source": [ + "GCS_FILE_PATH = 
\"gs://calitp-analytics-data/data-analyses/starter_kit/\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2c4af22f-91ac-4e03-8b80-2121adc9a348", + "metadata": {}, + "outputs": [], + "source": [ + "EXCEL_FILE = \"starter_kit_csis_scoring_workbook.xlsx\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "873bfb72-9b47-472c-a18b-248be7f8c694", + "metadata": {}, + "outputs": [], + "source": [ + "OVERALL_SCORE_FILE = \"starter_kit_example_final_scores.parquet\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6cf0c667-b81a-430f-afb8-68f4e0f0a147", + "metadata": {}, + "outputs": [], + "source": [ + "projects_df = to_snakecase(pd.read_excel(f\"{GCS_FILE_PATH}{EXCEL_FILE}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7de4e3b1-15bb-4f37-a392-36c3c0d3e39d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ct_districtproject_namescope_of_workproject_costlead_agency
01Meadow Magic Multi-Use PathA 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife.5245734Meadow Bunny Public Transportation (MBPT)
14Bunny Hop Bike BoulevardA Class II bike lane with charming streetlights, benches, and bike racks designed to resemble carrot sticks, connecting residential neighborhoods to local schools and parks.6929368Unicorn Fairy Express Bus (UFX)
\n", + "
" + ], + "text/plain": [ + " ct_district project_name \\\n", + "0 1 Meadow Magic Multi-Use Path \n", + "1 4 Bunny Hop Bike Boulevard \n", + "\n", + " scope_of_work \\\n", + "0 A 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife. \n", + "1 A Class II bike lane with charming streetlights, benches, and bike racks designed to resemble carrot sticks, connecting residential neighborhoods to local schools and parks. \n", + "\n", + " project_cost lead_agency \n", + "0 5245734 Meadow Bunny Public Transportation (MBPT) \n", + "1 6929368 Unicorn Fairy Express Bus (UFX) " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "projects_df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "8a5e10d5-f978-408d-87d9-05f930038a47", + "metadata": {}, + "outputs": [], + "source": [ + "overall_scores_df = pd.read_parquet(f\"{GCS_FILE_PATH}{OVERALL_SCORE_FILE}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "898592ba-7655-41c9-a982-251491bd9083", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_nameaccessibility_scoredac_accessibility_scoredac_traffic_impacts_scorefreight_efficiency_scorefreight_sustainability_scoremode_shift_scorelu_natural_resources_scoresafety_scorevmt_scorezev_scorepublic_engagement_scoreclimate_resilience_scoreprogram_fit_scoreoverall_score
0Meadow Magic Multi-Use Path28810235327661072
1Bunny Hop Bike Boulevard3976763221026568
\n", + "
" + ], + "text/plain": [ + " project_name accessibility_score dac_accessibility_score \\\n", + "0 Meadow Magic Multi-Use Path 2 8 \n", + "1 Bunny Hop Bike Boulevard 3 9 \n", + "\n", + " dac_traffic_impacts_score freight_efficiency_score \\\n", + "0 8 10 \n", + "1 7 6 \n", + "\n", + " freight_sustainability_score mode_shift_score lu_natural_resources_score \\\n", + "0 2 3 5 \n", + "1 7 6 3 \n", + "\n", + " safety_score vmt_score zev_score public_engagement_score \\\n", + "0 3 2 7 6 \n", + "1 2 2 10 2 \n", + "\n", + " climate_resilience_score program_fit_score overall_score \n", + "0 6 10 72 \n", + "1 6 5 68 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "overall_scores_df.head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "4c2dd160-ec10-41ce-b5c0-a9be5934d6ee", + "metadata": {}, + "source": [ + "## Merging \n", + "* **Goal**: Your manager asks you to aggregate the dataframe by the Caltrans District grain to find\n", + " * Median overall score\n", + " * Max overall score \n", + " * Min overall score\n", + " * Number of unique projects\n", + "* Annoyingly enough, the `overall_score` column and the `ct_district` are in two different dataframes. \n", + "* You'll have to merge it on the common column(s) the two dataframes share.\n", + "* Welcome to DDS! This will happen to you all the time starting now. \n", + "\n", + "### Relevant Resources\n", + "* Read about and practice merges before diving in. 
\n", + " * [Resource #1 is a great tutorial for beginners](https://www.practicalpythonfordatascience.com/03_cleaning_data.html?highlight=merge#merging-dataframes-together).\n", + " * [Resource #2 is written by our own Tiffany Ku, but it contains some geospatial references so it's a bit more to digest](https://docs.calitp.org/data-infra/analytics_new_analysts/01-data-analysis-intro.html#merge-tabular-and-geospatial-data-for-data-analysis).\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2d356494-a12a-4f67-beb3-b6ba92c8135f", + "metadata": {}, + "outputs": [], + "source": [ + "# Practice Here" + ] + }, + { + "cell_type": "markdown", + "id": "e60cf03f-6de0-4b30-b879-080c9ab7a22f", + "metadata": {}, + "source": [ + "### Now merge your two CSIS dataframes\n", + "**Food for Thought**\n", + "* Which columns do the two dataframes have in common?\n", + "* What type of merge will achieve my goal?\n", + " * Inner, outer, left, or right\n", + "* What do I expect out of the merge?\n", + " * Do I expect all the values of the merge keys to be 1:1? Or m:1? \n", + " * Do I expect a project to correspond with multiple districts? Maybe, projects can and do cross multiple boundaries.\n", + " * Do I expect a project to correspond with only one total cost estimate value? Yes, there shouldn't be multiple cost estimates for the same project!\n", + "* How do I go about checking the data after the merge?\n", + " * Which arguments are available to help me per the [docs](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html)?" + ] + }, + { + "cell_type": "markdown", + "id": "8ddf0077-061c-4d67-8881-36c9792d6e62", + "metadata": {}, + "source": [ + "### Double Checking\n", + "* How many rows do you expect?\n", + "* How many unique projects are there? 
\n", + "* Hint: check your original dataframes as well" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ad4962ca-ed83-48a3-b1e6-79e5d5b1042b", + "metadata": {}, + "outputs": [], + "source": [ + "m1 = pd.merge(projects_df, overall_scores_df, on=[\"project_name\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e820d7af-17d4-4b2a-8007-5d958a3f7d9e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(41, 19)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m1.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "3642de14-3bf4-47c0-bd80-3502819ea14d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "41" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m1.project_name.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "4b5be67a-f579-4f22-97cb-b6b31d7b8433", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "44" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "projects_df.project_name.nunique()" + ] + }, + { + "cell_type": "markdown", + "id": "94e866f0-bc46-43d3-92b7-dce71dc31c02", + "metadata": {}, + "source": [ + "### The Beauty of Outer Joins \n", + "* As you have noticed, we are missing a couple of projects.\n", + "* This is where `outer` joins are very useful.\n", + "* Merge your dataframes again using an `outer` join and with `indicator = True` on.\n", + "* Using `.value_counts()` check out how many rows are found in both dataframes, the left only, and the right only" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "98e92c3f-ccd4-45f8-b6a6-523ddcb4a7ac", + "metadata": {}, + "outputs": [], + "source": [ + "m2 = pd.merge(\n", + " projects_df, overall_scores_df, on=[\"project_name\"], indicator=True, 
how=\"outer\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "f134cddf-5220-44f9-9e15-1c5171cbedfd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "both 41\n", + "left_only 3\n", + "right_only 3\n", + "Name: _merge, dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m2._merge.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "aa04599b-805e-4813-ab1a-c4b9fe77cc9e", + "metadata": {}, + "source": [ + "### Filtering\n", + "* Filter out for only the `left_only` and `right_only` values.\n", + " * `!=` means does not equal to. \n", + " * `==` means equal to." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "4dd07bab-4d1b-41a0-954e-4c2d59584e57", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_name_merge
10Rainbow Rush hot Lanesleft_only
12Bunny Lane HOV+2 heavenleft_only
26main street muffin topleft_only
44Rainbow Rush HOT Lanesright_only
45Bunny Lane HOV+2 Havenright_only
46Main Street Muffin Top Revitalizationright_only
\n", + "
" + ], + "text/plain": [ + " project_name _merge\n", + "10 Rainbow Rush hot Lanes left_only\n", + "12 Bunny Lane HOV+2 heaven left_only\n", + "26 main street muffin top left_only\n", + "44 Rainbow Rush HOT Lanes right_only\n", + "45 Bunny Lane HOV+2 Haven right_only\n", + "46 Main Street Muffin Top Revitalization right_only" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m2.loc[m2._merge != \"both\"][[\"project_name\", \"_merge\"]]" + ] + }, + { + "cell_type": "markdown", + "id": "044330d9-8562-4510-ae62-268f240ec3bc", + "metadata": {}, + "source": [ + "* You could also use `isin([list of elements you want to keep])`" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "c47ef38d-6db5-4bf1-bd87-62b7d84943b6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_name_merge
10Rainbow Rush hot Lanesleft_only
12Bunny Lane HOV+2 heavenleft_only
26main street muffin topleft_only
44Rainbow Rush HOT Lanesright_only
45Bunny Lane HOV+2 Havenright_only
46Main Street Muffin Top Revitalizationright_only
\n", + "
" + ], + "text/plain": [ + " project_name _merge\n", + "10 Rainbow Rush hot Lanes left_only\n", + "12 Bunny Lane HOV+2 heaven left_only\n", + "26 main street muffin top left_only\n", + "44 Rainbow Rush HOT Lanes right_only\n", + "45 Bunny Lane HOV+2 Haven right_only\n", + "46 Main Street Muffin Top Revitalization right_only" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m2.loc[m2._merge.isin([\"left_only\",\"right_only\"])][[\"project_name\", \"_merge\"]]" + ] + }, + { + "cell_type": "markdown", + "id": "1c2b7437-f767-497c-a53a-26aeef9a3b0f", + "metadata": {}, + "source": [ + "* If you want to filter out multiple elements use `~df.column.isin([list of elements you don't want to keep])`" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "09ff3055-29ee-4ea1-a164-5d4796aa1807", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(41, 20)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m2.loc[~m2._merge.isin([\"left_only\",\"right_only\"])].shape" + ] + }, + { + "cell_type": "markdown", + "id": "e0a6bd58-734e-4f96-b150-9a53bef7d1aa", + "metadata": {}, + "source": [ + "### Dictionaries\n", + "* String data is often entered in many different ways. \n", + " * BART can be entered in as bart, Bay Area Rapid Transit, BaRT, and more. \n", + "* Often, strings are the reason why your dataframe is not merging properly.\n", + "* In Excel, it's easy to go in and manually tweak everything. However, that is not reproducible and time consuming. \n", + "* Luckily with Python we can automate this. \n", + "* Since there are a couple of names to replace, we can do it using a dictionary.\n", + "\n", + "#### What is a dictionary?\n", + "* Per Practical Python for Data Science, a dictionary is Dictionaries are used to store data values in key:value pairs. Similar to the list, a dictionary is a collection of objects. 
It is also mutable, meaning that you can add, remove, change values inside of it... With the list, we access elements using the index. With the dictionary, we access elements using keys.\n", + "* Dictionaries are very important. \n", + "* Read more [here](https://www.practicalpythonfordatascience.com/00_python_crash_course_datatypes.html?highlight=dictionary#dictionary) and **follow its example in the cells below.**\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "df6fa95e-cc25-4142-8c2b-ee254863e609", + "metadata": {}, + "outputs": [], + "source": [ + "# Practice Here" + ] + }, + { + "cell_type": "markdown", + "id": "76e42f11-fdcb-48f3-8951-2f2cea0384c0", + "metadata": {}, + "source": [ + "#### Replacing Values\n", + "* [Resource](https://www.practicalpythonfordatascience.com/03_cleaning_data#recoding-column-values)\n", + "* **Step 1**: Filter for the rows that didn't merge. Find the unique values of the `project_name` column using `.unique()`\n", + "* Take a look at the elements and check for differences in: \n", + " * Trailing white spaces\n", + " * Capitalization\n", + " * Spelling\n", + " * Symbols" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "5601fd36-d221-41da-ab76-b88c616e5e62", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Rainbow Rush hot Lanes', 'Bunny Lane HOV+2 heaven',\n", + " 'main street muffin top ', 'Rainbow Rush HOT Lanes',\n", + " 'Bunny Lane HOV+2 Haven', 'Main Street Muffin Top Revitalization'],\n", + " dtype=object)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m2.loc[m2._merge.isin([\"left_only\",\"right_only\"])].project_name.unique()" + ] + }, + { + "cell_type": "markdown", + "id": "21f38614-3b49-45fd-97f6-7161a59ab367", + "metadata": {}, + "source": [ + "* **Step 2:** Decide whether you want to rename the values in the left dataframe or the right one. 
\n", + "* **Step 3:** The keys are the values you want to replace. The values are what you want to replace them with. " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "9dad92fe-87a6-434d-a62f-d269f3ad1054", + "metadata": {}, + "outputs": [], + "source": [ + "new_names = {\n", + " \"main street muffin top \": \"Main Street Muffin Top Revitalization\",\n", + " \"Bunny Lane HOV+2 heaven\": \"Bunny Lane HOV+2 Haven\",\n", + " \"Rainbow Rush hot Lanes\": \"Rainbow Rush HOT Lanes\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "abe24864-66bd-4ce4-bb46-38a13c8bb64a", + "metadata": {}, + "source": [ + "* **Step 4**: Use your dictionary in `.replace()` to recode the values." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "d8532992-771e-446a-b419-55ad757ff45f", + "metadata": {}, + "outputs": [], + "source": [ + "projects_df.project_name = projects_df.project_name.replace(new_names)" + ] + }, + { + "cell_type": "markdown", + "id": "68562b10-b9bd-4892-8780-a66cad1a06d4", + "metadata": {}, + "source": [ + "#### Merge your dataframes again. This time the number of unique project names should match the number of rows of the merged dataframe perfectly." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "db09aa04-7a94-4b94-9ade-10b1a987e006", + "metadata": {}, + "outputs": [], + "source": [ + "final_m = pd.merge(projects_df, overall_scores_df, how=\"inner\", on=\"project_name\")" + ] + }, + { + "cell_type": "markdown", + "id": "144aa8a8-df59-418a-a0c7-4dbc3537c68f", + "metadata": {}, + "source": [ + "* You can check if two values are equal using `==`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "04f4f6d8-55b6-460c-8a52-8626dcfd1cb9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(final_m) == final_m.project_name.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "39d74f54-a72b-4acc-91b0-b3dcb4539a92", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ct_districtproject_namescope_of_workproject_costlead_agencyaccessibility_scoredac_accessibility_scoredac_traffic_impacts_scorefreight_efficiency_scorefreight_sustainability_scoremode_shift_scorelu_natural_resources_scoresafety_scorevmt_scorezev_scorepublic_engagement_scoreclimate_resilience_scoreprogram_fit_scoreoverall_score
01Meadow Magic Multi-Use PathA 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife.5245734Meadow Bunny Public Transportation (MBPT)28810235327661072
\n", + "
" + ], + "text/plain": [ + " ct_district project_name \\\n", + "0 1 Meadow Magic Multi-Use Path \n", + "\n", + " scope_of_work \\\n", + "0 A 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife. \n", + "\n", + " project_cost lead_agency \\\n", + "0 5245734 Meadow Bunny Public Transportation (MBPT) \n", + "\n", + " accessibility_score dac_accessibility_score dac_traffic_impacts_score \\\n", + "0 2 8 8 \n", + "\n", + " freight_efficiency_score freight_sustainability_score mode_shift_score \\\n", + "0 10 2 3 \n", + "\n", + " lu_natural_resources_score safety_score vmt_score zev_score \\\n", + "0 5 3 2 7 \n", + "\n", + " public_engagement_score climate_resilience_score program_fit_score \\\n", + "0 6 6 10 \n", + "\n", + " overall_score \n", + "0 72 " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_m.head(1)" + ] + }, + { + "cell_type": "markdown", + "id": "3965dc2d-9603-4a95-a1fd-ef0b7a80eaaa", + "metadata": {}, + "source": [ + "#### Save this dataframe as a parquet to GCS under a new name\n", + "* Use a `f-string`" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "70410a43-62c9-467c-b777-3415f22abe01", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "final_m.to_parquet(f\"{GCS_FILE_PATH}starter_kit_example_merge.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "8231c525-ec3a-4e91-b378-1ca51a5f4de8", + "metadata": {}, + "source": [ + "## Groupby\n", + "* You're done merging...Oh wait, that wasn't even part of your manager's request. You still need to aggregate. 
\n", + "* To refresh your memory, you need to aggregate by Caltrans District to find\n", + " * Median overall score\n", + " * Max overall score \n", + " * Min overall score\n", + " * Number of unique projects\n", + "* There are many options. Some are `groupby / agg`, `pivot_table`, `groupby / transform`\n", + "* Resource: Use the space below to explore this example.\n", + " * [DDS Docs](https://docs.calitp.org/data-infra/analytics_new_analysts/01-data-analysis-intro.html#aggregating)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "02f34bdf-3c17-4674-bdf0-f9982e7fac0a", + "metadata": {}, + "outputs": [], + "source": [ + "# Practice tutorial here" + ] + }, + { + "cell_type": "markdown", + "id": "849f8fc8-b356-4169-9398-dafd72956afe", + "metadata": {}, + "source": [ + "### Apply your new knowledge to the prompt above.\n", + "* Hint: After aggregating, your column name will no longer be relevant. For example, if you use `scope_of_work` to count the number of projects, this column no longer represents `scope_of_work`. 
It should be renamed something like `n_projects`.\n", + " * Rename your columns using this `df.rename(columns={\"old_column_name\":\"new_column_name\"})`" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "7328fcf2-ea52-46b8-8624-a7f3f39428df", + "metadata": {}, + "outputs": [], + "source": [ + "final_m[\"min_score\"] = final_m.overall_score" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "8dc4063c-1150-4b67-a125-16f245f4b9c4", + "metadata": {}, + "outputs": [], + "source": [ + "final_m[\"max_score\"] = final_m.overall_score" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "0892a805-7d7f-47cf-b086-f5e320c5361c", + "metadata": {}, + "outputs": [], + "source": [ + "agg1 = (\n", + " final_m.groupby([\"ct_district\"])\n", + " .agg(\n", + " {\n", + " \"overall_score\": \"median\",\n", + " \"min_score\": \"min\",\n", + " \"max_score\": \"max\",\n", + " \"project_name\": \"nunique\",\n", + " }\n", + " )\n", + " .reset_index()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "94c178b1-ff70-4d63-8820-aef101928c75", + "metadata": {}, + "outputs": [], + "source": [ + "agg1 = agg1.rename(\n", + " columns={\"overall_score\": \"median_score\", \"project_name\": \"n_projects\"}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "70178e81-0d11-4d19-9001-96e466d6dced", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ct_districtmedian_scoremin_scoremax_scoren_projects
0172.0072721
1261.5060632
2380.5054976
3470.5060976
4577.0058984
5672.0063773
6782.0079943
7873.0066855
8975.0067873
91072.5059862
101175.0055895
111272.5060974
\n", + "
" + ], + "text/plain": [ + " ct_district median_score min_score max_score n_projects\n", + "0 1 72.00 72 72 1\n", + "1 2 61.50 60 63 2\n", + "2 3 80.50 54 97 6\n", + "3 4 70.50 60 97 6\n", + "4 5 77.00 58 98 4\n", + "5 6 72.00 63 77 3\n", + "6 7 82.00 79 94 3\n", + "7 8 73.00 66 85 5\n", + "8 9 75.00 67 87 3\n", + "9 10 72.50 59 86 2\n", + "10 11 75.00 55 89 5\n", + "11 12 72.50 60 97 4" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agg1" + ] + }, + { + "cell_type": "markdown", + "id": "452b850a-7e15-473f-93d8-0133d496fa96", + "metadata": {}, + "source": [ + "## Visualizing \n", + "* You're done aggregating, but the dataframe looks objectively plain.\n", + "* Let's explore a couple of ways to present your data." + ] + }, + { + "cell_type": "markdown", + "id": "ba703a21-6e54-4667-8607-d4b8900f6371", + "metadata": {}, + "source": [ + "### Styling a Dataframe\n", + "* `pandas` has quite a few options that allow you to style your dataframe.\n", + "* [This tutorial](https://betterdatascience.com/style-pandas-dataframes/) offers some great ways to jazz up your dataframe.\n", + "* You can always read the [pandas documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html) for more ideas.\n", + "* Some ideas:\n", + " * Change the font\n", + " * Turn off the index\n", + " * Use colors to indicate low-high values\n", + " * Change the alignment of the values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0d8b97b-0f34-495a-8dfb-61642c44879a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "c251617a-1936-4df1-b461-cc63f4be5e37", + "metadata": {}, + "source": [ + "### Altair\n", + "* While a table is great, sometimes a chart is a better way to display an insight.\n", + "* Our preferred visualization library is `Altair`.\n", + " * Their docs page is [here](https://altair-viz.github.io/).\n", + "* The code to create a 
simple bar chart goes something like this. \n", + " * `alt.Chart(source).mark_bar().encode(x='a',y='b')`\n", + " * `source` is the dataframe you want to use for your chart.\n", + " * `x` denotes the column you are plotting on the X-axis. Make sure your column name has quotation marks around it. \n", + " * `y` denotes the column you are plotting on the Y-axis. \n", + "* Make your first chart below." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "fdcece32-d053-4b32-9e76-0f5ffed9ff52", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "alt.Chart(agg1).mark_bar().encode(x=\"ct_district\", y=\"n_projects\")" + ] + }, + { + "cell_type": "markdown", + "id": "ff3cf65e-174c-4ee2-bdc3-ca07f3bb951f", + "metadata": {}, + "source": [ + "#### Customizing\n", + "* `altair` offers endless ways to amp up the personality of your chart.\n", + "* Additionally, the chart above without a title and legend is a data visualization \"taboo\" and the dull blue is uninspiring. \n", + "\n", + "##### Add a title\n", + "* You can do so within `.Chart()`" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "88e1dff9-0188-49c9-b6cc-599610aca9a7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "alt.Chart(agg1, title=\"your_title_here\").mark_bar().encode(\n", + " x=\"ct_district\", y=\"n_projects\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "03fa6313-cbf7-4c3a-af76-8012b0a927ef", + "metadata": {}, + "source": [ + "#### Different Charts\n", + "* If you want something that isn't a bar chart, simply swap out `.mark_bar()` for `.mark_line` or `mark_circle`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "d43cae4f-1faf-48fb-8c21-559feb5243b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "alt.Chart(agg1, title=\"your_title_here\").mark_circle().encode(\n", + " x=\"ct_district\", y=\"n_projects\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "9b94c3f2-af01-43d4-9f17-98cd863511a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "alt.Chart(agg1, title=\"your_title_here\").mark_line().encode(\n", + " x=\"ct_district\", y=\"n_projects\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1d7073ac-6528-4999-9c9c-94c8147c0ac6", + "metadata": {}, + "source": [ + "#### Add some color/DDS's Python Library\n", + "* We have some default color palettes that are already in our [internal library of functions](https://docs.calitp.org/data-infra/analytics_tools/python_libraries.html#calitp-data-analysis)." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "cb8df2a3-bf37-4fe4-833e-1259a6ad7f15", + "metadata": {}, + "outputs": [], + "source": [ + "# Import the color palettes\n", + "from calitp_data_analysis import calitp_color_palette" + ] + }, + { + "cell_type": "markdown", + "id": "102dac58-605a-4bfb-88f8-b9a11ea86b83", + "metadata": {}, + "source": [ + "* To see what is inside a module, put two question marks behind it.\n", + "* From here, you can choose another color palette." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "aa21d088-3360-4d3e-811c-8cc5bdb2d3a8", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0;31mType:\u001b[0m module\n", + "\u001b[0;31mString form:\u001b[0m \n", + "\u001b[0;31mFile:\u001b[0m /opt/conda/lib/python3.9/site-packages/calitp_data_analysis/calitp_color_palette.py\n", + "\u001b[0;31mSource:\u001b[0m \n", + "\u001b[0;31m# --------------------------------------------------------------#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# Cal-ITP style guide\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# Google Drive > Cal-ITP Team > Project Resources >\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# Branded Resources and External Comms Guidelines > Branded Resources > Style Guide\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# --------------------------------------------------------------#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0mCALITP_CATEGORY_BRIGHT_COLORS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#2EA8CE\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# darker blue\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#EB9F3C\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# orange\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#F4D837\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# yellow\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#51BF9D\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# green\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#8CBCCB\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# lighter blue\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#9487C0\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# 
purple\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0mCALITP_CATEGORY_BOLD_COLORS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#136C97\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# darker blue\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#E16B26\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# orange\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#F6BF16\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# yellow\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#00896B\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# green\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#7790A3\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# lighter blue\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#5B559C\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# purple\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0mCALITP_DIVERGING_COLORS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#E16B26\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#EB9F3C\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# oranges\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#f6e7e1\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# linen\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#8CBCCB\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#2EA8CE\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m 
\u001b[0;34m\"#136C97\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# blues\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0mCALITP_SEQUENTIAL_COLORS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#B9D6DF\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# light blue (lightest)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#8CBCCB\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# lighter blue bright\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#2EA8CE\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# darker blue bright\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#136C97\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# darker blue bold\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"#0B405B\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# indigo dye (darkest)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "calitp_color_palette??" + ] + }, + { + "cell_type": "markdown", + "id": "f3971ba8-1c8f-4003-8e34-c3fd31f3f585", + "metadata": {}, + "source": [ + "* Place your color palette in the `scale` argument `scale=alt.Scale(range=your_color_palette)`.\n", + "* If I'm using a palette from `calitp_color_palette`, I would write `scale=alt.Scale(range=calitp_color_palette.CALITP_DIVERGING_COLORS)`." + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "c629f242-9b1b-49d1-b4b0-1bb956782d69", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "alt.Chart(agg1, title=\"your_title_here\").mark_bar().encode(\n", + " x=\"ct_district\",\n", + " y=\"n_projects\",\n", + " color=alt.Color(\n", + " \"n_projects\", # This is the column you want the color of your bar to be based on\n", + " title=\"legend_title_here\", # This is the legend of your title\n", + " scale=alt.Scale(\n", + " range=calitp_color_palette.CALITP_DIVERGING_COLORS # This is where you can customize the colors,\n", + " ), \n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "565ba059-6e2c-4d2b-99a4-665e39c0a0e5", + "metadata": {}, + "source": [ + "#### Adjusting the Axis\n", + "* Sometimes, we want to adjust the axis to have a min and max value.\n", + "* You do so using the `scale=alt.Scale(domain=[min_value, max_value])` argument behind the X and Y axis.\n", + "* `alt.X()` and `alt.Y()` give you many more customization options." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9832f9fc-53a3-4c5e-ba87-4b346d6f6985", + "metadata": {}, + "outputs": [], + "source": [ + "alt.Chart(agg1, title=\"your_title_here\").mark_bar().encode(\n", + " x=alt.X(\"ct_district\", scale=alt.Scale(domain=[1, 12])),\n", + " y=alt.Y(\"n_projects\", scale=alt.Scale(domain=[0, 10])),\n", + " color=alt.Color(\n", + " \"n_projects\",\n", + " title=\"legend_title_here\",\n", + " scale=alt.Scale(range=calitp_color_palette.CALITP_DIVERGING_COLORS),\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1cf6657c-f0ab-4c6f-9f83-f5cf16f84e9e", + "metadata": {}, + "source": [ + "### Finishing Touches \n", + "* `.properties(width=400, height=250)` adjusts the size of your chart. 
\n", + "* `tooltip=[columns you want]` gives you additional details on the columns you specify when you hover over each bar/circle/etc.\n", + "* `.mark_bar(size=10)` adjusts the size of the bar/circle/etc." + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "8b85dd29-88cb-4b4b-b3b7-20ee1851335e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "alt.Chart(agg1, title=\"your_title_here\").mark_bar(size = 10).encode(\n", + " x=alt.X(\"ct_district\", scale=alt.Scale(domain=[1, 12])),\n", + " y=alt.Y(\"n_projects\", scale=alt.Scale(domain=[0, 10])),\n", + " color=alt.Color(\n", + " \"n_projects\",\n", + " title=\"legend_title_here\",\n", + " scale=alt.Scale(range=calitp_color_palette.CALITP_DIVERGING_COLORS),\n", + " ),\n", + " tooltip=[\"ct_district\", \"n_projects\"]\n", + ").properties(width=400, height=250)" + ] + }, + { + "cell_type": "markdown", + "id": "281e37d9-8ece-471d-abc2-38e4ad9f9e83", + "metadata": {}, + "source": [ + "### We have only visualized one column of data. \n", + "* We have only visualized one column of data, but we have a couple of columns above. \n", + "* Try to customize your graph. If you can dream it, you can probably do it with Altair. \n", + " * You can turn off the grid lines, rotate the axis labels by various degrees, label the bars, add a dropdown menu to change the axis, and more. 
\n", + "* Make a few other charts in different styles.\n", + "* Inspiration\n", + " * Altair's [gallery](https://altair-viz.github.io/gallery/index.html)\n", + " * DDS's [portfolio](https://analysis.calitp.org/)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/starter_kit/2024_basics_03.ipynb b/starter_kit/2024_basics_03.ipynb new file mode 100644 index 000000000..85796cd3d --- /dev/null +++ b/starter_kit/2024_basics_03.ipynb @@ -0,0 +1,2075 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3f74a524-f90a-4ad5-8d98-368afc398b46", + "metadata": {}, + "source": [ + "# Exercise 3: Strings, Functions, If Else, For Loops" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ba8a0d90-9d57-4d01-9eb4-0b255970995e", + "metadata": {}, + "outputs": [], + "source": [ + "import altair as alt\n", + "import numpy as np\n", + "import pandas as pd\n", + "from calitp_data_analysis import calitp_color_palette" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ddcdbbc1-2e1b-4797-bd34-07d9a1999cb6", + "metadata": {}, + "outputs": [], + "source": [ + "pd.options.display.max_columns = 100\n", + "pd.options.display.float_format = \"{:.2f}\".format\n", + "pd.set_option(\"display.max_rows\", None)\n", + "pd.set_option(\"display.max_colwidth\", None)" + ] + }, + { + "cell_type": "markdown", + "id": "8eec9257-7578-422c-b6d1-afe496e8ca70", + "metadata": {}, + "source": [ + "* Using an `f-string`, load in your merged dataframe from the previous exercise." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7c52b09e-90b5-4a5d-8fda-ca19cb8fe3cd", + "metadata": {}, + "outputs": [], + "source": [ + "GCS_FILE_PATH = \"gs://calitp-analytics-data/data-analyses/starter_kit/\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e0222b8c-0996-47bb-8639-fc703cfbd249", + "metadata": {}, + "outputs": [], + "source": [ + "FILE = \"starter_kit_example_merge.parquet\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "36bbc1d2-4285-4399-a0fd-1e02c5e5d5a1", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_parquet(f\"{GCS_FILE_PATH}{FILE}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c97f0ec6-bea0-401a-bb27-f37984a762eb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ct_districtproject_namescope_of_workproject_costlead_agencyaccessibility_scoredac_accessibility_scoredac_traffic_impacts_scorefreight_efficiency_scorefreight_sustainability_scoremode_shift_scorelu_natural_resources_scoresafety_scorevmt_scorezev_scorepublic_engagement_scoreclimate_resilience_scoreprogram_fit_scoreoverall_score
01Meadow Magic Multi-Use PathA 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife.5245734Meadow Bunny Public Transportation (MBPT)28810235327661072
\n", + "
" + ], + "text/plain": [ + " ct_district project_name \\\n", + "0 1 Meadow Magic Multi-Use Path \n", + "\n", + " scope_of_work \\\n", + "0 A 2-mile Class I bike lane and multi-use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife. \n", + "\n", + " project_cost lead_agency \\\n", + "0 5245734 Meadow Bunny Public Transportation (MBPT) \n", + "\n", + " accessibility_score dac_accessibility_score dac_traffic_impacts_score \\\n", + "0 2 8 8 \n", + "\n", + " freight_efficiency_score freight_sustainability_score mode_shift_score \\\n", + "0 10 2 3 \n", + "\n", + " lu_natural_resources_score safety_score vmt_score zev_score \\\n", + "0 5 3 2 7 \n", + "\n", + " public_engagement_score climate_resilience_score program_fit_score \\\n", + "0 6 6 10 \n", + "\n", + " overall_score \n", + "0 72 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(1)" + ] + }, + { + "cell_type": "markdown", + "id": "673fa239-dc06-4ef8-9513-ee167e80898e", + "metadata": {}, + "source": [ + "## Categorizing\n", + "* There are 40+ projects. They all vary in themes, some are transit oriented while others are focused on Active Transportation (ATP).\n", + "* Categorizing data is an important part of data cleaning and analyzing so we can present the data on a more succinct, broader level. \n", + "* Let's organize projects into three categories.\n", + " * ATP\n", + " * Transit\n", + " * General Lanes" + ] + }, + { + "cell_type": "markdown", + "id": "49486dc6-a686-47fa-8cef-e252d7ec349d", + "metadata": {}, + "source": [ + "### Task 1: Strings\n", + "* Below are some of the common keywords that fall into the categories detailed above. They are held in a `list`.\n", + "* Feel free to add other terms you think are relevant. \n", + "* We are going to search the `Scope of Work` column for these keywords. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6a6b817f-15e2-4d1c-aeae-5d7e9661a6f0", + "metadata": {}, + "outputs": [], + "source": [ + "transit = [\"transit\", \"passenger rail\", \"bus\", \"ferry\"]\n", + "atp = [\"bike\", \"pedestrian\", \"bicycle\", \"sidewalk\", \"path\"]\n", + "general_lanes = [\"general\", \"auxiliary\", \"highway\"]" + ] + }, + { + "cell_type": "markdown", + "id": "6caf3a84-fcd7-4531-befe-11e76c01c8f1", + "metadata": {}, + "source": [ + "#### Step 1: Cleaning\n", + "* Remember in Exercise 2 some of the project names didn't merge between the two dataframes?\n", + "* In the real world, you won't have the bandwidth and time to replace each individual string value with a dictionary.\n", + "* An easy way to clean most of the values up is by lowercasing, stripping the white spaces, and replacing characters.\n", + "* In our goal of categorizing values, we can search through it easier when we clean up the string values." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ea4a4df7-61ec-430b-a827-302704857318", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2254/3600759827.py:2: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n", + " df.scope_of_work.str.lower()\n" + ] + } + ], + "source": [ + "df.scope_of_work = (\n", + " df.scope_of_work.str.lower()\n", + " .str.strip()\n", + " .str.replace(\"-\", \" \")\n", + " .str.replace(\"+\", \" \")\n", + " .str.replace(\"_\", \" \")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c3da188c-2afe-49f4-bbbd-8fecd8dfe10f", + "metadata": {}, + "source": [ + "* `str.contains()` allows you to search through the column. \n", + "* Let's search for projects that have \"transit\" in their descriptions. 
\n", + "* There are many modifications you can make to `str.contains()`. Try them out and see what happens.\n", + " * `df.loc[df.scope_of_work.str.contains(\"transit\", case=False)]` \n", + " * Will search through your column without matching the case. It'll return rows with both \"Transit\" and \"transit\".\n", + " * `df.scope_of_work.str.contains(\"transit\", case=False, regex=False) `\n", + " * Will return any matches that include `transit` rather than an exact match. It'll return rows with values like \"transit\" and \"Transitory\"." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "be843d6a-b751-4e9f-8820-b521089914d3", + "metadata": {}, + "outputs": [], + "source": [ + "transit_only_projects = df.loc[df.scope_of_work.str.contains(\"transit\")]" + ] + }, + { + "cell_type": "markdown", + "id": "ec68f286-cdeb-4b7b-86ef-6d35c8ee9587", + "metadata": {}, + "source": [ + "* Let's see how many transit projects are in this dataset.\n", + "* Let's read through the Scope of Work to make sure it's what we expect.\n", + "* Tip\n", + " * The data we work with tends to be pretty wide. Scrolling horizontally gets tiresome.\n", + " * Placing all the columns you want to temporarily work within a `list` like `preview_subset` below is a good idea. " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0d9a6259-8748-41fe-a549-01bdf0e9c273", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(transit_only_projects)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "315228d8-a72e-4f18-a0e7-2a254c87cc23", + "metadata": {}, + "outputs": [], + "source": [ + "preview_subset = [\"project_name\", \"scope_of_work\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "6789307c-5808-4501-a1a6-5a14a12b0219", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_namescope_of_work
11Greenway Gables Managed Lanesmanaged lanes prioritizing carpools, clean vehicles, and public transit, featuring real time traffic updates and incentives for sustainable transportation choices.
16Sparkle City Smart Streets Initiativean intelligent transportation system integrating traffic management, real time transit information, and smart parking solutions to enhance mobility and reduce congestion.
19Rolling Renaissance Rabbit Expressnew, eco friendly rolling stock for public transit, incorporating advanced propulsion systems, comfortable seating, and onboard amenities.
20Transit Treasure Transit Oasistransit supportive features, including shelters, wi fi, and real time information displays, prioritizing passenger convenience and accessibility.
25Trail of Treats and Transit Huba multi use path connecting to public transit, featuring public art installations, wayfinding signage, and amenities like bike storage and repair stations.
27Park and Ride Petal Paradisean attractive park and ride facility with amenities like ev charging, wi fi, and convenient access to nearby transit options.
43Brookside Bus Blossom Laneprioritize public transportation and enhance air quality by dedicating lanes to buses and hovs on brookside boulevard, integrating smart traffic signals and real time transit information inspired by the ancient elves.
\n", + "
" + ], + "text/plain": [ + " project_name \\\n", + "11 Greenway Gables Managed Lanes \n", + "16 Sparkle City Smart Streets Initiative \n", + "19 Rolling Renaissance Rabbit Express \n", + "20 Transit Treasure Transit Oasis \n", + "25 Trail of Treats and Transit Hub \n", + "27 Park and Ride Petal Paradise \n", + "43 Brookside Bus Blossom Lane \n", + "\n", + " scope_of_work \n", + "11 managed lanes prioritizing carpools, clean vehicles, and public transit, featuring real time traffic updates and incentives for sustainable transportation choices. \n", + "16 an intelligent transportation system integrating traffic management, real time transit information, and smart parking solutions to enhance mobility and reduce congestion. \n", + "19 new, eco friendly rolling stock for public transit, incorporating advanced propulsion systems, comfortable seating, and onboard amenities. \n", + "20 transit supportive features, including shelters, wi fi, and real time information displays, prioritizing passenger convenience and accessibility. \n", + "25 a multi use path connecting to public transit, featuring public art installations, wayfinding signage, and amenities like bike storage and repair stations. \n", + "27 an attractive park and ride facility with amenities like ev charging, wi fi, and convenient access to nearby transit options. \n", + "43 prioritize public transportation and enhance air quality by dedicating lanes to buses and hovs on brookside boulevard, integrating smart traffic signals and real time transit information inspired by the ancient elves. " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transit_only_projects[preview_subset]" + ] + }, + { + "cell_type": "markdown", + "id": "d3adfb74-5a24-47f8-88da-92fe5591821a", + "metadata": {}, + "source": [ + "#### Step 2: Filtering\n", + "* We've found all the projects that says \"transit\" somewhere in its description. 
\n", + "* Now there are just many more elements to go. We forgot about bikes, buses, rail, and so on.\n", + "* The method above leaves us with multiple dataframes. We actually just want our one original dataframe tagged with categories. \n", + "* A faster way: join all the keywords you want into one large string.\n", + " * | designates \"or\".\n", + " * You can read `transit_keywords` as \"I want projects that contain the word transit or passenger rail or bus or ferry\"" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c2575f75-44ac-46ba-a334-fdf984546cd3", + "metadata": {}, + "outputs": [], + "source": [ + "transit_keywords = f\"({'|'.join(transit)})\"" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "f6a2a521-c0ae-4c2d-830d-4020a13855f2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'(transit|passenger rail|bus|ferry)'" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Print it out\n", + "transit_keywords" + ] + }, + { + "cell_type": "markdown", + "id": "937913db-407e-415c-aabb-31d3f511ef0b", + "metadata": {}, + "source": [ + "* Filter again - notice the .loc after df and how there are brackets around `df`?\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "e5e23b6f-98b8-4219-bc52-d847ea39d121", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2254/1070197006.py:1: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " df.loc[df.scope_of_work.str.contains(transit_keywords)][preview_subset]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_namescope_of_work
11Greenway Gables Managed Lanesmanaged lanes prioritizing carpools, clean vehicles, and public transit, featuring real time traffic updates and incentives for sustainable transportation choices.
16Sparkle City Smart Streets Initiativean intelligent transportation system integrating traffic management, real time transit information, and smart parking solutions to enhance mobility and reduce congestion.
18Coastal Commuter Carousela 30 mile passenger rail line connecting coastal towns, featuring modern train sets, enhanced station amenities, and scenic viewing cars.
19Rolling Renaissance Rabbit Expressnew, eco friendly rolling stock for public transit, incorporating advanced propulsion systems, comfortable seating, and onboard amenities.
20Transit Treasure Transit Oasistransit supportive features, including shelters, wi fi, and real time information displays, prioritizing passenger convenience and accessibility.
21Berry Best Bus Rapid Transitdedicated bus lanes with comfortable stops, featuring off board fare payment, priority traffic signals, and enhanced passenger amenities.
25Trail of Treats and Transit Huba multi use path connecting to public transit, featuring public art installations, wayfinding signage, and amenities like bike storage and repair stations.
27Park and Ride Petal Paradisean attractive park and ride facility with amenities like ev charging, wi fi, and convenient access to nearby transit options.
43Brookside Bus Blossom Laneprioritize public transportation and enhance air quality by dedicating lanes to buses and hovs on brookside boulevard, integrating smart traffic signals and real time transit information inspired by the ancient elves.
\n", + "
" + ], + "text/plain": [ + " project_name \\\n", + "11 Greenway Gables Managed Lanes \n", + "16 Sparkle City Smart Streets Initiative \n", + "18 Coastal Commuter Carousel \n", + "19 Rolling Renaissance Rabbit Express \n", + "20 Transit Treasure Transit Oasis \n", + "21 Berry Best Bus Rapid Transit \n", + "25 Trail of Treats and Transit Hub \n", + "27 Park and Ride Petal Paradise \n", + "43 Brookside Bus Blossom Lane \n", + "\n", + " scope_of_work \n", + "11 managed lanes prioritizing carpools, clean vehicles, and public transit, featuring real time traffic updates and incentives for sustainable transportation choices. \n", + "16 an intelligent transportation system integrating traffic management, real time transit information, and smart parking solutions to enhance mobility and reduce congestion. \n", + "18 a 30 mile passenger rail line connecting coastal towns, featuring modern train sets, enhanced station amenities, and scenic viewing cars. \n", + "19 new, eco friendly rolling stock for public transit, incorporating advanced propulsion systems, comfortable seating, and onboard amenities. \n", + "20 transit supportive features, including shelters, wi fi, and real time information displays, prioritizing passenger convenience and accessibility. \n", + "21 dedicated bus lanes with comfortable stops, featuring off board fare payment, priority traffic signals, and enhanced passenger amenities. \n", + "25 a multi use path connecting to public transit, featuring public art installations, wayfinding signage, and amenities like bike storage and repair stations. \n", + "27 an attractive park and ride facility with amenities like ev charging, wi fi, and convenient access to nearby transit options. \n", + "43 prioritize public transportation and enhance air quality by dedicating lanes to buses and hovs on brookside boulevard, integrating smart traffic signals and real time transit information inspired by the ancient elves. 
" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[df.scope_of_work.str.contains(transit_keywords)][preview_subset]" + ] + }, + { + "cell_type": "markdown", + "id": "c82ef0b7-d2c9-48d1-a53f-625fb083e196", + "metadata": {}, + "source": [ + "* Notice how many more projects appear when we filter for 3 additional transit related keywords, compared to only transit?" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "7b62f28d-7b28-4258-8efa-74d1f9a41d04", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7\n", + "9\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2254/2770509021.py:2: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " print(len(df.loc[df.scope_of_work.str.contains(transit_keywords)]))\n" + ] + } + ], + "source": [ + "print(len(transit_only_projects))\n", + "print(len(df.loc[df.scope_of_work.str.contains(transit_keywords)]))" + ] + }, + { + "cell_type": "markdown", + "id": "7c6717f8-4088-4c1f-9ec6-b9959fd6d283", + "metadata": {}, + "source": [ + "\n", + "* Let's put this all together. \n", + "* I want any project that contains a transit component to be tagged as \"Y\" in a column called \"Transit\". \n", + "* If a project doesn't have a transit component, it gets tagged as a \"N\"." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "47afb269-672f-44c1-8ab5-d70921c6e703", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2254/653877654.py:2: UserWarning: This pattern is interpreted as a regular expression, and has match groups. 
To actually get the groups, use str.extract.\n", + " (df.scope_of_work.str.contains(transit_keywords)),\n" + ] + } + ], + "source": [ + "df[\"Transit\"] = np.where(\n", + " (df.scope_of_work.str.contains(transit_keywords)),\n", + " \"Y\",\n", + " \"N\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "dfe862f0-f77e-4bf5-8710-888d3a8d7a4c", + "metadata": {}, + "source": [ + "* Using `value_counts()` we can see the total of transit related vs non-transit related projects." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "c63f2ff8-3d2f-41c6-96d1-36d35159aef8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "N 35\n", + "Y 9\n", + "Name: Transit, dtype: int64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.Transit.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "f18b2040-37f0-4e1a-b7ab-484eea69f1f9", + "metadata": { + "tags": [] + }, + "source": [ + "### Task 2: Functions \n", + "* It looks like only the 9 transit projects were categorized.\n", + "* We are missing the other 2 categories: ATP and General Lane related projects.\n", + "* We could repeat the steps above or we can use a function.\n", + " * You can think of a function as a piece of code you write only once but reuse more than once.\n", + " * In the long run, functions save you work and look neater when you present your work.\n", + "* You may not have realized this but you've been using functions this whole time.\n", + " * When you are taking the `len()` you are using a built-in function to find the number of rows in a dataframe."
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "8c62fef2-8215-4983-a4e6-c671177b822f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "44" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "markdown", + "id": "c2180f69-6b3d-465c-8dda-a067e24f4ed1", + "metadata": {}, + "source": [ + "* `type` too is a built-in function that tells you what type of variable you are looking at. " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "0659a036-76ad-4251-80a1-323a0a04c912", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.core.frame.DataFrame" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "2985ec16-35e1-4eae-b2c5-facb354ce4e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "str" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(GCS_FILE_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "65c6b0c7-a314-434f-8304-10afd6c84514", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "list" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(transit)" + ] + }, + { + "cell_type": "markdown", + "id": "b2a5b9d7-1b39-419e-892a-fe44da7a4cf0", + "metadata": { + "tags": [] + }, + "source": [ + "### Practice with outside resources\n", + "* Functions are incredibly important as such, **please spend more time than usual on this section and practice the tutorials linked.**\n", + "* [Tutorial #1 Practical Python for Data Science.](https://www.practicalpythonfordatascience.com/00_python_crash_course_functions)\n", + "* [DDS 
Functions.](https://docs.calitp.org/data-infra/analytics_new_analysts/01-data-analysis-intro.html#functions)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "00ead246-8879-4075-a632-d0ded58df558", + "metadata": {}, + "outputs": [], + "source": [ + "# Practice here" + ] + }, + { + "cell_type": "markdown", + "id": "463e13cf-7ba1-4499-bcd0-465a6457f856", + "metadata": { + "tags": [] + }, + "source": [ + "#### Let's build a function together.\n", + "* This will be repetitive after the tutorials, but you will use functions all the time at DDS and this is a concept we would like to drive home.\n", + "* Start your function with `def` and the name you'd like, followed by `():`." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "97e597a2-8625-4f2b-8646-760c0c011208", + "metadata": {}, + "outputs": [], + "source": [ + "# def categorize():" + ] + }, + { + "cell_type": "markdown", + "id": "06ccd282-cf21-462b-8930-9a3148671ff1", + "metadata": {}, + "source": [ + "* Now let's think of what are the two elements that we will repeat.\n", + "* We merely want to substitute `transit_keywords` with ATP or General Lane related keywords.\n", + "* Instead of the `df[\"Transit\"]`, we want to create two new columns called something like `df[\"ATP\"]` and `df[\"General_Lanes\"]` to hold our yes/no results.\n", + "* Add the two elements that need to be substituted into the argument of your function.\n", + " * It's good practice to specify what exactly the parameter should be: a string/list/dataframe/etc. \n", + " * Including this detail makes it easier for your coworkers to read and use your code."
+ ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "61973dc6-d99b-48f0-842f-a3c8fe74f064", + "metadata": {}, + "outputs": [], + "source": [ + "# def categorize(df:pd.DataFrame, keywords:list, new_column:str):" + ] + }, + { + "cell_type": "markdown", + "id": "ae178f6d-0f76-419c-aab2-9924ba294605", + "metadata": {}, + "source": [ + "* It's also a nice idea to document what your function will return.\n", + "* In our case, it's a Pandas dataframe. " + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "a794693a-3bf2-48ba-b0a7-1ca3a41e03af", + "metadata": {}, + "outputs": [], + "source": [ + "# def categorize(df:pd.DataFrame, keywords:list, new_column:str)->pd.DataFrame:" + ] + }, + { + "cell_type": "markdown", + "id": "be820c1a-a0d2-4b2f-bf01-70e753603291", + "metadata": {}, + "source": [ + "* Think about the steps we took to categorize transit only.\n", + "* Add the sections of the code we will be reusing and sub in the original variables for the arguments.\n", + " * First, we joined the keywords from a list into a big string.\n", + " * Second, we searched through the Scope of Work column for the keywords.\n", + " * Third, if we find the keyword, we will tag the project as \"Y\" in the column \"new_column\". 
If the keyword isn't found, the project is tagged as \"N\".\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "4721b564-726a-4e05-9d27-8035609b5fcf", + "metadata": {}, + "outputs": [], + "source": [ + "def categorize(df: pd.DataFrame, keywords: list, new_column: str) -> pd.DataFrame:\n", + " \n", + " # Remember this used to be the list called transit_keywords, but it must be changed into a long string\n", + " joined_keywords = f\"({'|'.join(keywords)})\" \n", + "\n", + " # We are now creating a new column: notice how parameters has no quotation marks.\n", + " df[new_column] = np.where((df.scope_of_work.str.contains(joined_keywords)), \n", + " \"Y\",\n", + " \"N\",\n", + " )\n", + "\n", + " # We are returning the updated dataframe from this function\n", + " return df" + ] + }, + { + "cell_type": "markdown", + "id": "81bbb109-beef-452c-b8d9-eb13e7b9ee03", + "metadata": {}, + "source": [ + "* Now let's use your function" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "23e31c98-17b3-41e2-883a-14dae9d6da7e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2254/2245515441.py:7: UserWarning: This pattern is interpreted as a regular expression, and has match groups. 
To actually get the groups, use str.extract.\n", + " df[new_column] = np.where((df.scope_of_work.str.contains(joined_keywords)),\n" + ] + } + ], + "source": [ + "df = categorize(df, atp, \"ATP\")" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "d5ec64cf-432c-45e2-b14d-f4ea7ca3de2a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "N 30\n", + "Y 14\n", + "Name: ATP, dtype: int64" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.ATP.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "882a02a6-ce39-4da2-b2be-7e91322624e4", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2254/2245515441.py:7: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " df[new_column] = np.where((df.scope_of_work.str.contains(joined_keywords)),\n" + ] + } + ], + "source": [ + "df = categorize(df, transit, \"Transit\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "ee56ee97-307c-44a4-a2d4-b02eff954f87", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2254/2245515441.py:7: UserWarning: This pattern is interpreted as a regular expression, and has match groups. 
To actually get the groups, use str.extract.\n", + " df[new_column] = np.where((df.scope_of_work.str.contains(joined_keywords)),\n" + ] + } + ], + "source": [ + "df = categorize(df, general_lanes, \"General_Lanes\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "96f2efba-4179-4a8c-b969-fd2990f8a129", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "N 35\n", + "Y 9\n", + "Name: General_Lanes, dtype: int64" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.General_Lanes.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "405aac8e-4488-47fa-bbb1-a12121ed8d15", + "metadata": {}, + "source": [ + "#### Check out your results\n", + "* Use the `groupby` technique from Exercise 2 to get some descriptive statistics for these 3 new columns\n", + "* Use `.reset_index()` after `aggregate()` to see what happens.\n", + "* Try `.reset_index(drop = True)` as well. " + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "62115dcb-ea34-4bb1-9bd1-e678ec015b8c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
General_LanesTransitATPproject_nameoverall_score
0NNN1573.00
1NNY1172.00
2NYN875.00
3NYY175.00
4YNN773.00
5YNY282.00
\n", + "
" + ], + "text/plain": [ + " General_Lanes Transit ATP project_name overall_score\n", + "0 N N N 15 73.00\n", + "1 N N Y 11 72.00\n", + "2 N Y N 8 75.00\n", + "3 N Y Y 1 75.00\n", + "4 Y N N 7 73.00\n", + "5 Y N Y 2 82.00" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby([\"General_Lanes\", \"Transit\", \"ATP\"]).aggregate(\n", + " {\"project_name\": \"nunique\", \"overall_score\": \"median\"}\n", + ").reset_index()" + ] + }, + { + "cell_type": "markdown", + "id": "e17c3e18-5f55-4a00-9919-b1f0c826b77f", + "metadata": {}, + "source": [ + "## Function + If-Else\n", + "* There are many cases in which we want to categorize our columns to create broader groups for summarizing and aggregating.\n", + "* Using a function with an If-Else clause will help us accomplish this goal.\n", + "* **Resources:**\n", + " * [DDS Apply Docs](https://docs.calitp.org/data-infra/analytics_new_analysts/01-data-analysis-intro.html#functions)\n", + " * [DDS If-Else Tutorial](https://docs.calitp.org/data-infra/analytics_new_analysts/01-data-analysis-intro.html#if-else-statements)\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "6d824c18-4c2b-41c9-950b-866e567ab7f5", + "metadata": {}, + "outputs": [], + "source": [ + "# Practice here." + ] + }, + { + "cell_type": "markdown", + "id": "212570f5-e8ed-4151-be24-dd0994304334", + "metadata": {}, + "source": [ + "Goal: \n", + "* We are going to write an If-Else function that categorizes projects by whether it scored low, medium, or high based on its `overall_score` and percentiles.\n", + "* For example, if a project scores below the 25% percentile, it is a \"low scoring project\". If a project scores above the 25% percentile but below the 75% percentile, it is a \"medium scoring project\". 
Anything above the 75% percentile is \"high scoring\".\n", + "* Use the values you find from .describe() as reference.\n", + "* You aren't limited to only the 25th, 50th, and 75th percentiles. You can categorize low, medium, and high based on other percentile ranges. \n", + " * You can do so by specifying within `describe` like `.describe(percentiles=[0.05, 0.1, 0.9, 0.95])`.\n", + "* In Data Science, we like to save our work into variables.\n", + " * If new projects are added, then the values that define the different percentiles will likely change.\n", + " * As such, you can save whatever percentile you like using `p75 = df.overall_score.quantile(0.75).astype(float)` which will change along with the dataset when you load in the new data." + ] + }, + { + "cell_type": "markdown", + "id": "d91c41b1-76c4-4673-b16f-ef9990d66270", + "metadata": {}, + "source": [ + "### Practice #2\n", + "* Goal:\n", + " * Above, we can see all types of combinations of categories a project can fall into. \n", + " * Let's do away with these \"Y\" and \"N\" columns and create actual categories in an actual column called `category`.\n", + " * If a project has \"N\" for all 3 of the General Lane, Transit, and ATP columns, it should be `Other`. 
\n", + " * If a project has \"Y\" for all 3, it should be categorized as \"General Lane, Transit, and ATP\".\n", + " * If a project has \"Y\" for only ATP and Transit, it should be categorized as \"Transit and ATP\".\n", + " * Yes this will be very tedious given all the combinations!\n", + "* Resource:\n", + " * [Geeks for Geeks: if-else with multiple conditions](https://www.geeksforgeeks.org/check-multiple-conditions-in-if-statement-python/)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "d560dad0-de03-4469-99f8-5fadd9b198dc", + "metadata": {}, + "outputs": [], + "source": [ + "def categorize(row):\n", + " if (row.General_Lanes == \"N\") & (row.Transit == \"N\") & (row.ATP == \"N\"):\n", + " return \"Other\"\n", + " elif (row.General_Lanes == \"N\") & (row.Transit == \"N\") & (row.ATP == \"Y\"):\n", + " return \"ATP\"\n", + " elif (row.General_Lanes == \"N\") & (row.Transit == \"Y\") & (row.ATP == \"N\"):\n", + " return \"Transit\"\n", + " elif (row.General_Lanes == \"N\") & (row.Transit == \"Y\") & (row.ATP == \"Y\"):\n", + " return \"Transit and ATP\"\n", + " elif (row.General_Lanes == \"Y\") & (row.Transit == \"N\") & (row.ATP == \"N\"):\n", + " return \"General Lanes\"\n", + " elif (row.General_Lanes == \"Y\") & (row.Transit == \"N\") & (row.ATP == \"Y\"):\n", + " return \"General Lanes and ATP\"\n", + " else:\n", + " return \"Transit, General Lanes, and ATP\"" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "f8b7d946-c724-43cb-9a93-d1003f7f024f", + "metadata": {}, + "outputs": [], + "source": [ + "# Apply your function\n", + "df[\"category\"] = df.apply(categorize, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "df815f56-c2ed-43ff-9180-147beddcffe0", + "metadata": {}, + "source": [ + "### Please export your output as a `.parquet` to GCS before moving onto the next step" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "f18a7754-907c-46fa-ad77-4a09abb03206", + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ct_districtproject_namescope_of_workproject_costlead_agencyaccessibility_scoredac_accessibility_scoredac_traffic_impacts_scorefreight_efficiency_scorefreight_sustainability_scoremode_shift_scorelu_natural_resources_scoresafety_scorevmt_scorezev_scorepublic_engagement_scoreclimate_resilience_scoreprogram_fit_scoreoverall_scoreTransitATPGeneral_Lanescategory
01Meadow Magic Multi-Use Patha 2 mile class i bike lane and multi use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife.5245734Meadow Bunny Public Transportation (MBPT)28810235327661072NYNATP
\n", + "
" + ], + "text/plain": [ + " ct_district project_name \\\n", + "0 1 Meadow Magic Multi-Use Path \n", + "\n", + " scope_of_work \\\n", + "0 a 2 mile class i bike lane and multi use path through a scenic meadow, featuring wildflower plantings, public art installations, and educational signage highlighting local wildlife. \n", + "\n", + " project_cost lead_agency \\\n", + "0 5245734 Meadow Bunny Public Transportation (MBPT) \n", + "\n", + " accessibility_score dac_accessibility_score dac_traffic_impacts_score \\\n", + "0 2 8 8 \n", + "\n", + " freight_efficiency_score freight_sustainability_score mode_shift_score \\\n", + "0 10 2 3 \n", + "\n", + " lu_natural_resources_score safety_score vmt_score zev_score \\\n", + "0 5 3 2 7 \n", + "\n", + " public_engagement_score climate_resilience_score program_fit_score \\\n", + "0 6 6 10 \n", + "\n", + " overall_score Transit ATP General_Lanes category \n", + "0 72 N Y N ATP " + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "2245ce0c-97fb-4f08-9791-9fb6b28b49c7", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "df.to_parquet(f\"{GCS_FILE_PATH}starter_kit_example_categorized.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "14ba020e-e2b3-4447-89e2-abdc0579fc6b", + "metadata": {}, + "source": [ + "## For Loops \n", + "* For Loops are one of the greatest gifts of Python. 
\n", + "* Below is a simple for loop that prints out all the numbers in range of 10.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "48495a9f-e29c-41eb-b3e7-de6371fbd182", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n", + "1\n", + "2\n", + "3\n", + "4\n", + "5\n", + "6\n", + "7\n", + "8\n", + "9\n" + ] + } + ], + "source": [ + "for i in range(10):\n", + " print(i)" + ] + }, + { + "cell_type": "markdown", + "id": "a8cdfc33-359c-4687-be4a-7f758c028640", + "metadata": {}, + "source": [ + "* Here, I'm looping over a couple of columns in my dataframe and printing some descriptive statistics about it.\n", + "* Notice how I have to use `print` and `display` to show the results.\n", + " * Try this same block of code without `print` and `display` to see the difference." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "fca9e430-a906-4d0e-8046-36a0687b0636", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Statistics for zev_score\n" + ] + }, + { + "data": { + "text/plain": [ + "count 44.00\n", + "mean 6.00\n", + "std 2.96\n", + "min 1.00\n", + "25% 3.75\n", + "50% 6.50\n", + "75% 8.00\n", + "max 10.00\n", + "Name: zev_score, dtype: float64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Statistics for vmt_score\n" + ] + }, + { + "data": { + "text/plain": [ + "count 44.00\n", + "mean 4.52\n", + "std 2.73\n", + "min 1.00\n", + "25% 2.00\n", + "50% 4.00\n", + "75% 6.00\n", + "max 10.00\n", + "Name: vmt_score, dtype: float64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Statistics for accessibility_score\n" + ] + }, + { + "data": { + "text/plain": [ + "count 44.00\n", + "mean 5.14\n", + "std 2.66\n", + "min 1.00\n", + "25% 3.00\n", + "50% 5.00\n", + "75% 7.00\n", 
+ "max 10.00\n", + "Name: accessibility_score, dtype: float64" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for column in [\"zev_score\", \"vmt_score\", \"accessibility_score\"]:\n", + " print(f\"Statistics for {column}\")\n", + " display(df[column].describe())" + ] + }, + { + "cell_type": "markdown", + "id": "ded54884-4bad-46ae-a82f-2a67936c57dd", + "metadata": {}, + "source": [ + "### Practice using a for loop\n", + "* Below, I have already aggregated the dataframe for you." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "5b414d3f-71a4-4078-9d98-b9082114e2c5", + "metadata": {}, + "outputs": [], + "source": [ + "agg1 = (\n", + " df.groupby([\"category\"])\n", + " .aggregate(\n", + " {\"overall_score\": \"median\", \"project_cost\": \"median\", \"project_name\": \"nunique\"}\n", + " )\n", + " .reset_index()\n", + " .rename(\n", + " columns={\n", + " \"overall_score\": \"median_score\",\n", + " \"project_cost\": \"median_project_cost\",\n", + " \"project_name\": \"total_projects\",\n", + " }\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "1698fe9c-6d1f-412b-a632-826aae1ffc65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categorymedian_scoremedian_project_costtotal_projects
0ATP72.004991255.0011
1General Lanes73.007487963.007
2General Lanes and ATP82.005672550.502
3Other73.003708858.0015
4Transit75.004399886.008
5Transit and ATP75.002069143.001
\n", + "
" + ], + "text/plain": [ + " category median_score median_project_cost total_projects\n", + "0 ATP 72.00 4991255.00 11\n", + "1 General Lanes 73.00 7487963.00 7\n", + "2 General Lanes and ATP 82.00 5672550.50 2\n", + "3 Other 73.00 3708858.00 15\n", + "4 Transit 75.00 4399886.00 8\n", + "5 Transit and ATP 75.00 2069143.00 1" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agg1" + ] + }, + { + "cell_type": "markdown", + "id": "345a87ee-0f09-43f2-ad3e-70debb7ab25c", + "metadata": {}, + "source": [ + "* I have also prepared an Altair chart function. " + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "320bd91e-b9ed-4423-80d4-c1a1aa5ba59f", + "metadata": {}, + "outputs": [], + "source": [ + "def create_chart(df: pd.DataFrame, column: str) -> alt.Chart:\n", + " title = column.replace(\"_\", \" \").title()\n", + " chart = (\n", + " alt.Chart(df, title=f\"{title} by Categories\")\n", + " .mark_bar(size=20)\n", + " .encode(\n", + " x=alt.X(column),\n", + " y=alt.Y(\"category\"),\n", + " color=alt.Color(\n", + " \"category\",\n", + " scale=alt.Scale(\n", + " range=calitp_color_palette.CALITP_CATEGORY_BRIGHT_COLORS\n", + " ),\n", + " ),\n", + " tooltip=list(df.columns),\n", + " )\n", + " .properties(width=400, height=250)\n", + " )\n", + " return chart" + ] + }, + { + "cell_type": "markdown", + "id": "a47dc93c-ab8b-4be7-a90d-3ca941e94050", + "metadata": {}, + "source": [ + "* Use the function to create a chart out of the aggregated dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "a6103703-8131-4ed8-9482-314c7895c279", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "create_chart(agg1, \"median_score\")" + ] + }, + { + "cell_type": "markdown", + "id": "eff3b0be-7091-4995-b2b8-63d62bf9b6c4", + "metadata": {}, + "source": [ + "* We have a couple of other columns left that still need to be visualized. \n", + "* This is the perfect case for using a for loop, since all we want to do is replace the column above with the two remaining columns. \n", + "* Try this below! \n", + " * Hint: you'll have to wrap the function with `display()` to get your results." + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "ca8659f1-0842-4bb5-a544-9a2a5fb93c02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for column in [\"median_score\", \"median_project_cost\", \"total_projects\"]:\n", + " display(create_chart(agg1, column))" + ] + }, + { + "cell_type": "markdown", + "id": "0f77dcf4-7b19-4e58-b20b-ae59721deb9c", + "metadata": {}, + "source": [ + "### Try it out yourself\n", + "* Think of some other use cases for a for loop and try them out here." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/starter_kit/2024_basics_04.ipynb b/starter_kit/2024_basics_04.ipynb new file mode 100644 index 000000000..c36cc6a54 --- /dev/null +++ b/starter_kit/2024_basics_04.ipynb @@ -0,0 +1,2317 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "05dd29e6-ec3f-4f9d-a595-d28b578c74e3", + "metadata": {}, + "source": [ + "# Exercise 4: Python Scripts, Concept of Grains, Display, and Markdown\n", + "* Cleaning and analyzing data takes a lot of time, patience, and skill.\n", + "* However, presenting the data to stakeholders is also equally important.\n", + "* At DDS, we often present our work in a Jupyter Notebook.\n", + "* This exercise will walk you through how we do so. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1d4e2cdf-a5b9-4ebb-aa2f-c7abe897a683", + "metadata": {}, + "outputs": [], + "source": [ + "import _starterkit_utils\n", + "import altair as alt\n", + "import numpy as np\n", + "import pandas as pd\n", + "from calitp_data_analysis import calitp_color_palette" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a0403b50-d81c-4499-9b69-e164eb38f8cd", + "metadata": {}, + "outputs": [], + "source": [ + "pd.options.display.max_columns = 100\n", + "pd.options.display.float_format = \"{:.2f}\".format\n", + "pd.set_option(\"display.max_rows\", None)\n", + "pd.set_option(\"display.max_colwidth\", None)" + ] + }, + { + "cell_type": "markdown", + "id": "20bbcce9-b48c-4ab3-ae05-7229b97c141b", + "metadata": {}, + "source": [ + "## Python Scripts\n", + "* Up until now, we have been placing all of our code in the Jupyter Notebook.\n", + "* While this is convenient, it's not the best practice. \n", + "* A notebook full of code isn't easy for viewers - it gets chaotic, quickly! \n", + "* Jupyter notebooks are also very difficult for Git to version control. \n", + "* **The best solution is to move the bulk of your code when you have reached a stopping point to a Python Script.**\n", + " * Read all about the benefits of scripts [here in our DDS docs](https://docs.calitp.org/data-infra/analytics_tools/scripts.html).\n", + " * Summary points from the docs page above:\n", + " * Python scripts (.py) are plain text files. Git tracks plain text changes easily.\n", + " * Scripts are robust to scaling and reproducing work.\n", + " * Break out scripts by concepts / stages\n", + " * All functions used in scripts should have docstrings. 
Type hints are encouraged!\n", + "* Making Python scripts is an art and not straight forward.\n", + "* I have already populated a `.py` file called `_starterkit_utils` with some sample functions.\n", + "* I imported my Python Script just like how I imported my other dependencies (Pandas, Altair, Numpy)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "68d8980b-e857-491e-b03a-4648c5f4c5f3", + "metadata": {}, + "outputs": [], + "source": [ + "import _starterkit_utils" + ] + }, + { + "cell_type": "markdown", + "id": "6f37fc46-a49e-45b4-92bf-d5b3910b2325", + "metadata": {}, + "source": [ + "### Breakdown of a Script.\n", + "#### Function 1\n", + "* You can also preview what a function does by writing `script_name.function_name??`\n", + "\n", + "* Following what the DDS docs says, I am creating a new function every time I am processing the data in another stage.\n", + "* I have one function that loads in my dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3454fecc-0b6b-4f1f-b74d-17792165f990", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0;31mSignature:\u001b[0m \u001b[0m_starterkit_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_dataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mSource:\u001b[0m \n", + "\u001b[0;32mdef\u001b[0m \u001b[0mload_dataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m->\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"\"\"\u001b[0m\n", + "\u001b[0;34m Load the final dataframe.\u001b[0m\n", + "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", + 
"\u001b[0;34m\u001b[0m \u001b[0mGCS_FILE_PATH\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"gs://calitp-analytics-data/data-analyses/starter_kit/\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mFILE\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"starter_kit_example_categorized.parquet\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# Read dataframe in\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_parquet\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{GCS_FILE_PATH}{FILE}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# Capitalize the Scope of Work column again since it is all lowercase\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscope_of_work\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscope_of_work\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcapitalize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# Clean up the column names\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreverse_snakecase\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mFile:\u001b[0m ~/data-analyses/starter_kit/_starterkit_utils.py\n", + "\u001b[0;31mType:\u001b[0m function" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + 
"_starterkit_utils.load_dataset??" + ] + }, + { + "cell_type": "markdown", + "id": "a5effa3c-cdb4-4aa4-870f-75f78e8461ad", + "metadata": {}, + "source": [ + "\n", + "* To use a function in a Script, write `name_of_your_script.name_of_the_function(whatever arguments)`\n", + "* Take a look at the column names: they are no longer in `snakecase` because I applied a function that capitalizes it properly." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "44467ccf-599f-4662-8164-8a58fac85711", + "metadata": {}, + "outputs": [], + "source": [ + "df = _starterkit_utils.load_dataset()" + ] + }, + { + "cell_type": "markdown", + "id": "422c3b29-822b-4957-bc5a-b9d0c55fa34c", + "metadata": {}, + "source": [ + "#### Function 2:\n", + "* After loading in the dataset from GCS, I am entering my second stage of processing the data.\n", + "* I am aggregating my dataframe by category. " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0d82c825-d789-469e-8ae2-c69a94984511", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0;31mSignature:\u001b[0m \u001b[0m_starterkit_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maggregate_by_category\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mSource:\u001b[0m \n", + "\u001b[0;32mdef\u001b[0m \u001b[0maggregate_by_category\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m 
\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"\"\"\u001b[0m\n", + "\u001b[0;34m Find the median overall score and project cost \u001b[0m\n", + "\u001b[0;34m and total unique projects by category.\u001b[0m\n", + "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0magg1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Category\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0maggregate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"Overall Score\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"median\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"Project Cost\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"median\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"Project Name\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"nunique\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mrename\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m 
\u001b[0;34m\"Overall Score\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"Median Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"Project Cost\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"Median Project Cost\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"Project Name\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"Total Projects\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# Format the Cost column properly\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0magg1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Median Project Cost'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0magg1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Median Project Cost'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'${:,.0f}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0magg1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mFile:\u001b[0m ~/data-analyses/starter_kit/_starterkit_utils.py\n", + "\u001b[0;31mType:\u001b[0m function" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "_starterkit_utils.aggregate_by_category??" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f9635fe8-a6c7-4813-9f25-7ba555ce9726", + "metadata": {}, + "outputs": [], + "source": [ + "aggregated_df = _starterkit_utils.aggregate_by_category(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2bdcb3e6-2add-4af6-a20a-9072b7ba075c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategoryMedian ScoreMedian Project CostTotal Projects
0ATP72.00$4,991,25511
1General Lanes73.00$7,487,9637
2General Lanes and ATP82.00$5,672,5502
3Other73.00$3,708,85815
4Transit75.00$4,399,8868
5Transit and ATP75.00$2,069,1431
\n", + "
" + ], + "text/plain": [ + " Category Median Score Median Project Cost Total Projects\n", + "0 ATP 72.00 $4,991,255 11\n", + "1 General Lanes 73.00 $7,487,963 7\n", + "2 General Lanes and ATP 82.00 $5,672,550 2\n", + "3 Other 73.00 $3,708,858 15\n", + "4 Transit 75.00 $4,399,886 8\n", + "5 Transit and ATP 75.00 $2,069,143 1" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "aggregated_df" + ] + }, + { + "cell_type": "markdown", + "id": "c5567fc1-0f13-4913-8744-5568d85942f7", + "metadata": {}, + "source": [ + "#### Function 3\n", + "* I want to swap my dataframe from wide to long. \n", + "* [Read about wide to long.](https://www.statology.org/long-vs-wide-data/)\n", + "* [Pandas doc on melt](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.melt.html)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2b0231f0-eb97-46d4-9541-aee43b138755", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0;31mSignature:\u001b[0m \u001b[0m_starterkit_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwide_to_long\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mSource:\u001b[0m \n", + "\u001b[0;32mdef\u001b[0m 
\u001b[0mwide_to_long\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m->\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"\"\"\u001b[0m\n", + "\u001b[0;34m Change the dataframe from wide to long based on the project name and\u001b[0m\n", + "\u001b[0;34m Caltrans District.\u001b[0m\n", + "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdf2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmelt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mid_vars\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"CalTrans District\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"Project Name\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mvalue_vars\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"Accessibility Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"DAC Accessibility Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"DAC Traffic Impacts Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"Freight Efficiency Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"Freight Sustainability Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"Mode Shift Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m 
\u001b[0;34m\"Landuse Natural Resources Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"Safety Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"VMT Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"ZEV Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"Public Engagement Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"Climate Resilience Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"Program Fit Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdf2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrename\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m'variable'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m'Metric'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m'value'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m'Score'\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mFile:\u001b[0m ~/data-analyses/starter_kit/_starterkit_utils.py\n", + "\u001b[0;31mType:\u001b[0m function" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "_starterkit_utils.wide_to_long??" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "82172952-3d59-436e-b08c-7096454b6e04", + "metadata": {}, + "outputs": [], + "source": [ + "df2 = _starterkit_utils.wide_to_long(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "1bcac91b-b0a1-4efd-8a73-f019c376d030", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CalTrans DistrictProject NameMetricScore
01Meadow Magic Multi-Use PathAccessibility Score2
14Bunny Hop Bike BoulevardAccessibility Score3
\n", + "
" + ], + "text/plain": [ + " CalTrans District Project Name Metric Score\n", + "0 1 Meadow Magic Multi-Use Path Accessibility Score 2\n", + "1 4 Bunny Hop Bike Boulevard Accessibility Score 3" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "55622831-2e94-4101-b531-611ff864a1a7", + "metadata": {}, + "source": [ + "#### Function 4\n", + "* Now that I have my aggregated data, I want to visualize my results,\n", + "* `style_df` takes my pandas dataframe and makes it look a bit sleeker." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d69a4f91-4e37-4207-93e0-2eaa18f998ff", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategoryMedian ScoreMedian Project CostTotal Projects
ATP72$4,991,25511
General Lanes73$7,487,9637
General Lanes and ATP82$5,672,5502
Other73$3,708,85815
Transit75$4,399,8868
Transit and ATP75$2,069,1431
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "_starterkit_utils.style_df(aggregated_df)" + ] + }, + { + "cell_type": "markdown", + "id": "f9836712-aecb-4d5e-ae50-895fdb3d427f", + "metadata": {}, + "source": [ + "#### Function 5 \n", + "* This is function that creates a chart that shows the scores by metric for each project." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "1e2ec6b7-b494-4db5-a863-91882c77a7a8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0;31mSignature:\u001b[0m \u001b[0m_starterkit_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate_metric_chart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0maltair\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvegalite\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mv5\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mChart\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mSource:\u001b[0m \n", + "\u001b[0;32mdef\u001b[0m \u001b[0mcreate_metric_chart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mChart\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"\"\"\u001b[0m\n", + "\u001b[0;34m Create a chart that displays metric scores\u001b[0m\n", + "\u001b[0;34m for each project.\u001b[0m\n", + "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# Create dropdown\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m 
\u001b[0mmetrics_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Metric\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munique\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtolist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmetrics_dropdown\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbinding_select\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0moptions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmetrics_list\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"Metrics: \"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# Column that controls the bar charts\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mxcol_param\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselection_point\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Metric\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmetrics_list\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbind\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmetrics_dropdown\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mchart\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m 
\u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mChart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtitle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"Metric by Categories\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mmark_circle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m200\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Score\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscale\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mScale\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdomain\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mY\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Project Name\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mcolor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mColor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"Score\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mscale\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0malt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mScale\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m 
\u001b[0mrange\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcalitp_color_palette\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mCALITP_CATEGORY_BRIGHT_COLORS\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mtooltip\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mproperties\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwidth\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m400\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m250\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mchart\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mchart\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_params\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxcol_param\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform_filter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxcol_param\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mchart\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mFile:\u001b[0m ~/data-analyses/starter_kit/_starterkit_utils.py\n", + "\u001b[0;31mType:\u001b[0m function" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "_starterkit_utils.create_metric_chart??" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7f39ca4e-9fb9-497d-bee6-22be385a9d34", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "_starterkit_utils.create_metric_chart(df2)" + ] + }, + { + "cell_type": "markdown", + "id": "e268d5f7-30f6-4b36-bfa3-1391dfa772f9", + "metadata": {}, + "source": [ + "## Grains\n", + "* This is a light introduction to the concept of grains.\n", + "* Grain means the level your dataset is presented at.\n", + "* You can think of it as: what does each row represent?\n", + "* The original dataset is presented on the project-level grain because each row represents a unique project. \n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "bb228391-f907-4d76-a2b5-45b7fd188d21", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Project NameOverall Score
0Meadow Magic Multi-Use Path72
1Bunny Hop Bike Boulevard68
2Strawberry Shortcake Sidewalks87
3River Ramble Rabbit Trail75
4Lilac Lane Dream Complete Street72
\n", + "
" + ], + "text/plain": [ + " Project Name Overall Score\n", + "0 Meadow Magic Multi-Use Path 72\n", + "1 Bunny Hop Bike Boulevard 68\n", + "2 Strawberry Shortcake Sidewalks 87\n", + "3 River Ramble Rabbit Trail 75\n", + "4 Lilac Lane Dream Complete Street 72" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[[\"Project Name\", \"Overall Score\"]].head()" + ] + }, + { + "cell_type": "markdown", + "id": "69b70b73-4dba-4280-a385-99d0c2d06018", + "metadata": {}, + "source": [ + "* If we aggregate the dataset using Caltrans District, then this dataset would be on the district grain." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "9a332009-4ac5-4eca-9632-6d45c03765a4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CalTrans DistrictTotal Projects
011
122
236
346
454
563
673
785
893
9102
10115
11124
\n", + "
" + ], + "text/plain": [ + " CalTrans District Total Projects\n", + "0 1 1\n", + "1 2 2\n", + "2 3 6\n", + "3 4 6\n", + "4 5 4\n", + "5 6 3\n", + "6 7 3\n", + "7 8 5\n", + "8 9 3\n", + "9 10 2\n", + "10 11 5\n", + "11 12 4" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby([\"CalTrans District\"]).agg({\"Project Name\": \"nunique\"}).reset_index().rename(\n", + " columns={\"Project Name\": \"Total Projects\"}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "da0849c1-5f7e-417c-b321-e289fb46b262", + "metadata": {}, + "source": [ + "* If we aggregate the dataset by lead agency, then this dataset would be on the agency grain." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "c0f00432-60ca-4e8a-9c2a-45bba234dbd7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Lead AgencyTotal Projects
0Bunny's Meadow Hop Transportation (BMHT)3
1Cherry Metro Services (CMS)1
2Dewdrop Ride Transit2
3Elf's Efficient Transportation (EET)3
4Fairy Creek Public Transit (FCPT)5
5Gnome Valley Rail Link (GVRL)3
6Meadow Bunny Public Transportation (MBPT)4
7Morning Dewdrop Transit (MDT)4
8Mushroom Metro Transit Agency (MMTA)5
9Rainbow Mushroom Transportation Corporation (RMTC)5
10Shining Sparkle Transit Systems (SSTS)4
11Strawberry Rainbow Transit Systems (SRTS)4
12Unicorn Fairy Express Bus (UFX)1
\n", + "
" + ], + "text/plain": [ + " Lead Agency Total Projects\n", + "0 Bunny's Meadow Hop Transportation (BMHT) 3\n", + "1 Cherry Metro Services (CMS) 1\n", + "2 Dewdrop Ride Transit 2\n", + "3 Elf's Efficient Transportation (EET) 3\n", + "4 Fairy Creek Public Transit (FCPT) 5\n", + "5 Gnome Valley Rail Link (GVRL) 3\n", + "6 Meadow Bunny Public Transportation (MBPT) 4\n", + "7 Morning Dewdrop Transit (MDT) 4\n", + "8 Mushroom Metro Transit Agency (MMTA) 5\n", + "9 Rainbow Mushroom Transportation Corporation (RMTC) 5\n", + "10 Shining Sparkle Transit Systems (SSTS) 4\n", + "11 Strawberry Rainbow Transit Systems (SRTS) 4\n", + "12 Unicorn Fairy Express Bus (UFX) 1" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby([\"Lead Agency\"]).agg({\"Project Name\": \"nunique\"}).reset_index().rename(\n", + " columns={\"Project Name\": \"Total Projects\"}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "55137a34-1624-4d8a-8ee8-33c773868cde", + "metadata": {}, + "source": [ + "* Grains can get very complicated. The one below is Lead Agency and Category Grain. " + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "7892454f-5f70-4237-9f04-560405cf1775", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Lead AgencyCategoryTotal Projects
0Bunny's Meadow Hop Transportation (BMHT)Other2
1Bunny's Meadow Hop Transportation (BMHT)Transit1
2Cherry Metro Services (CMS)Other1
3Dewdrop Ride TransitATP1
4Dewdrop Ride TransitOther1
5Elf's Efficient Transportation (EET)ATP1
6Elf's Efficient Transportation (EET)General Lanes1
7Elf's Efficient Transportation (EET)Transit1
8Fairy Creek Public Transit (FCPT)ATP1
9Fairy Creek Public Transit (FCPT)Other2
10Fairy Creek Public Transit (FCPT)Transit2
11Gnome Valley Rail Link (GVRL)ATP1
12Gnome Valley Rail Link (GVRL)Other1
13Gnome Valley Rail Link (GVRL)Transit and ATP1
14Meadow Bunny Public Transportation (MBPT)ATP1
15Meadow Bunny Public Transportation (MBPT)General Lanes and ATP1
16Meadow Bunny Public Transportation (MBPT)Other1
17Meadow Bunny Public Transportation (MBPT)Transit1
18Morning Dewdrop Transit (MDT)General Lanes2
19Morning Dewdrop Transit (MDT)Other1
20Morning Dewdrop Transit (MDT)Transit1
21Mushroom Metro Transit Agency (MMTA)General Lanes1
22Mushroom Metro Transit Agency (MMTA)Other3
23Mushroom Metro Transit Agency (MMTA)Transit1
24Rainbow Mushroom Transportation Corporation (RMTC)ATP2
25Rainbow Mushroom Transportation Corporation (RMTC)General Lanes1
26Rainbow Mushroom Transportation Corporation (RMTC)Other1
27Rainbow Mushroom Transportation Corporation (RMTC)Transit1
28Shining Sparkle Transit Systems (SSTS)ATP1
29Shining Sparkle Transit Systems (SSTS)General Lanes1
30Shining Sparkle Transit Systems (SSTS)General Lanes and ATP1
31Shining Sparkle Transit Systems (SSTS)Other1
32Strawberry Rainbow Transit Systems (SRTS)ATP2
33Strawberry Rainbow Transit Systems (SRTS)General Lanes1
34Strawberry Rainbow Transit Systems (SRTS)Other1
35Unicorn Fairy Express Bus (UFX)ATP1
\n", + "
" + ], + "text/plain": [ + " Lead Agency Category \\\n", + "0 Bunny's Meadow Hop Transportation (BMHT) Other \n", + "1 Bunny's Meadow Hop Transportation (BMHT) Transit \n", + "2 Cherry Metro Services (CMS) Other \n", + "3 Dewdrop Ride Transit ATP \n", + "4 Dewdrop Ride Transit Other \n", + "5 Elf's Efficient Transportation (EET) ATP \n", + "6 Elf's Efficient Transportation (EET) General Lanes \n", + "7 Elf's Efficient Transportation (EET) Transit \n", + "8 Fairy Creek Public Transit (FCPT) ATP \n", + "9 Fairy Creek Public Transit (FCPT) Other \n", + "10 Fairy Creek Public Transit (FCPT) Transit \n", + "11 Gnome Valley Rail Link (GVRL) ATP \n", + "12 Gnome Valley Rail Link (GVRL) Other \n", + "13 Gnome Valley Rail Link (GVRL) Transit and ATP \n", + "14 Meadow Bunny Public Transportation (MBPT) ATP \n", + "15 Meadow Bunny Public Transportation (MBPT) General Lanes and ATP \n", + "16 Meadow Bunny Public Transportation (MBPT) Other \n", + "17 Meadow Bunny Public Transportation (MBPT) Transit \n", + "18 Morning Dewdrop Transit (MDT) General Lanes \n", + "19 Morning Dewdrop Transit (MDT) Other \n", + "20 Morning Dewdrop Transit (MDT) Transit \n", + "21 Mushroom Metro Transit Agency (MMTA) General Lanes \n", + "22 Mushroom Metro Transit Agency (MMTA) Other \n", + "23 Mushroom Metro Transit Agency (MMTA) Transit \n", + "24 Rainbow Mushroom Transportation Corporation (RMTC) ATP \n", + "25 Rainbow Mushroom Transportation Corporation (RMTC) General Lanes \n", + "26 Rainbow Mushroom Transportation Corporation (RMTC) Other \n", + "27 Rainbow Mushroom Transportation Corporation (RMTC) Transit \n", + "28 Shining Sparkle Transit Systems (SSTS) ATP \n", + "29 Shining Sparkle Transit Systems (SSTS) General Lanes \n", + "30 Shining Sparkle Transit Systems (SSTS) General Lanes and ATP \n", + "31 Shining Sparkle Transit Systems (SSTS) Other \n", + "32 Strawberry Rainbow Transit Systems (SRTS) ATP \n", + "33 Strawberry Rainbow Transit Systems (SRTS) General Lanes \n", + "34 Strawberry 
Rainbow Transit Systems (SRTS) Other \n", + "35 Unicorn Fairy Express Bus (UFX) ATP \n", + "\n", + " Total Projects \n", + "0 2 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "5 1 \n", + "6 1 \n", + "7 1 \n", + "8 1 \n", + "9 2 \n", + "10 2 \n", + "11 1 \n", + "12 1 \n", + "13 1 \n", + "14 1 \n", + "15 1 \n", + "16 1 \n", + "17 1 \n", + "18 2 \n", + "19 1 \n", + "20 1 \n", + "21 1 \n", + "22 3 \n", + "23 1 \n", + "24 2 \n", + "25 1 \n", + "26 1 \n", + "27 1 \n", + "28 1 \n", + "29 1 \n", + "30 1 \n", + "31 1 \n", + "32 2 \n", + "33 1 \n", + "34 1 \n", + "35 1 " + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby([\"Lead Agency\", \"Category\"]).agg({\"Project Name\": \"nunique\"}).reset_index().rename(\n", + " columns={\"Project Name\": \"Total Projects\"}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6fc393e2-bfcf-40fc-b6f1-5aa90e0c9715", + "metadata": {}, + "source": [ + "## Create your own Script\n", + "* **Make sure your functions make sense for the district grain.**\n", + "* You will be using these functions for Exercise 5. \n", + "* Make sure to separate out functions by theme. \n", + " * One function that loads the dataset and does some light cleaning.\n", + " * One (or more) functions that transform your dataframe.\n", + " * `melt()`, `.T`, `.groupby()` are just some of the many options available through `pandas`. \n", + " * One (or more) functions that visualize your dataframe.\n", + " * Could be a chart, a styled dataframe, a wordcloud. \n", + "* Other things to consider\n", + " * Our [DDS Docs](https://docs.calitp.org/data-infra/publishing/sections/4_notebooks_styling.html#narrative) has a great guide on what \"checkboxes\" need to be \"checked\" when presenting data. 
The first 3 sections are the most relevant.\n", + " * To summarize the docs, double check:\n", + " * Are the currency columns formatted with $ and commas?\n", + " * Are all the scores formatted with the same number of decimals?\n", + " * Are the string columns formatted with the right punctuation and capitalization?\n", + " * Are the column names formatted properly? While `snake_case` is very handy when we are analyzing the dataframe, it is unsightly when presenting the data. We typically reverse the `snake_case` back to something like `Project Name`.\n", + " * [CalTrans Districts are currently integers, but they have actual names that can be mapped.](https://cwwp2.dot.ca.gov/documentation/district-map-county-chart.htm) \n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "7e1ff2d3-ae53-4682-b540-cb8a3c11e076", + "metadata": {}, + "source": [ + "## Markdown/Display\n", + "* Although our code is now neatly stored in a Python script, a Jupyter Notebook on its own is a bit plain, even when we have beautiful charts. \n", + "* There are many ways to jazz it up.\n", + "* **Resource**: [Data Camp](https://www.datacamp.com/tutorial/markdown-in-jupyter-notebook)" + ] + }, + { + "cell_type": "markdown", + "id": "ed396a2f-c3f1-40be-aad2-64835be8431b", + "metadata": {}, + "source": [ + "#### Images\n", + "* You can add an image in a markdown cell\n", + "``

\n", + "\n", + "* You can add an image in a code cell if you import the packages below." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "4ec41786-491f-46ad-963e-f380d8095ade", + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import HTML, Image, Markdown, display, display_html" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "8d0a6849-f178-46fc-919a-f45b5436c423", + "metadata": {}, + "outputs": [ + { + "data": { + "image/jpeg": "", + "text/plain": [ + "" + ] + }, + "metadata": { + "image/jpeg": { + "height": 600, + "width": 960 + } + }, + "output_type": "display_data" + } + ], + "source": [ + "display(Image(filename=\"./19319_en_1.jpg\", retina=True))" + ] + }, + { + "cell_type": "markdown", + "id": "2b26a0da-b23c-436a-b79a-9749b33ef554", + "metadata": {}, + "source": [ + "### Display\n", + "* Of course, you can write your narratives in a Markdown cell like what I'm doing right now.\n", + "* However, what if you want to incorporate values from your dataframe into the narrative?\n", + "* Writing out the values manually in markdown locks you in. If the values change, you'll have to rewrite your narrative.\n", + "* The best way is to use `display` and `Markdown` from `IPython.display`\n", + "* We are using District 3 as an example" + ] + }, + { + "cell_type": "markdown", + "id": "3ebd21f4-0779-48ea-9cfd-eb912d5fda96", + "metadata": {}, + "source": [ + "#### No hard coding\n", + "* Save out your desired value into a new variable if you are manipulating it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "caecab58-2d26-4604-a3f1-ab4a11400038", + "metadata": {}, + "outputs": [], + "source": [ + "# Filter for D3\n", + "d3_df = df.loc[df[\"CalTrans District\"] == 3].reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "995eb899-0397-4f60-b587-18fcf8a4cb0e", + "metadata": {}, + "outputs": [], + "source": [ + "# Find the median overall score\n", + "d3_median_score = d3_df[\"Overall Score\"].median()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "bc73cc66-911a-44bf-9710-920328b40609", + "metadata": {}, + "outputs": [], + "source": [ + "# Find total projects\n", + "d3_total_projects = d3_df[\"Project Name\"].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "8a5dafba-c901-412c-bf53-e418dc558787", + "metadata": {}, + "outputs": [], + "source": [ + "# Find the most expensive project\n", + "d3_max_project = d3_df[\"Project Cost\"].max()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "e183e629-7f79-45e3-810b-294851ca9abf", + "metadata": {}, + "outputs": [], + "source": [ + "# Format the cost so it's something like $1,000,000 instead of 1000000\n", + "d3_max_project = f\"${d3_max_project:,.2f}\"" + ] + }, + { + "cell_type": "markdown", + "id": "241607df-d133-4fb1-ac13-f1a32454b815", + "metadata": {}, + "source": [ + "#### Long F-String + Headers\n", + "* The f-string has multiple quotation marks. This allows you to write a f-string that goes over multiple lines.\n", + "*

and

displays District 3 in a header. Headers vary in size, 1 being the largest. \n", + "* `` bolds the text. \n", + " * ` italicizes the text.\n", + "* Notice that you always have to **close** your HTML with `District 3\n", + " The median score for projects in District 3 is 80.5
\n", + " The total number of projects is 6
\n", + " The most expensive project costs $9,448,022.00\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(\n", + " Markdown(\n", + " f\"\"\"

District 3

\n", + " The median score for projects in District 3 is {d3_median_score}
\n", + " The total number of projects is {d3_total_projects}
\n", + " The most expensive project costs {d3_max_project}\n", + " \"\"\"\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "4aa41900-7dbf-4c61-b927-4f1e42f6b8da", + "metadata": {}, + "source": [ + "* You can code in this cell. I'm filtering out for district 3 values.\n", + "* Notice the header went from `

` to `

`. " + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "38d84ed3-9626-4f91-9aea-e2449aef4cf8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "

Metric Scores

\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(\n", + " Markdown(\n", + " f\"\"\"

Metric Scores

\n", + " \"\"\"\n", + " )\n", + ")\n", + "display(_starterkit_utils.create_metric_chart(df2))" + ] + }, + { + "cell_type": "markdown", + "id": "fe45d252-1d46-4d34-98f4-7118afd96406", + "metadata": {}, + "source": [ + "### This can be a function too\n", + "* What if I wanted to generate these narratives for every district?\n", + "* I can simply turn this into a function.\n", + "* I only want to print out a couple of districts or else this notebook will become too large" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "8875a82f-2df3-4777-a115-87ba84ea96a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0;31mSignature:\u001b[0m\n", + "\u001b[0m_starterkit_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate_district_summary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mcaltrans_district\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mSource:\u001b[0m \n", + "\u001b[0;32mdef\u001b[0m \u001b[0mcreate_district_summary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcaltrans_district\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"\"\"\u001b[0m\n", + "\u001b[0;34m Create a summary of CSIS metrics for one Caltrans District.\u001b[0m\n", + "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", + 
"\u001b[0;34m\u001b[0m \u001b[0mfiltered_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"CalTrans District\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mcaltrans_district\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdrop\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# Finding the values referenced in the narrative\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmedian_score\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfiltered_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Overall Score\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmedian\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mtotal_projects\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfiltered_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Project Name\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnunique\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmax_project\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfiltered_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Project Cost\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmax_project\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mf\"${max_project:,.2f}\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# Aggregate the dataframe\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m 
\u001b[0maggregated_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maggregate_by_category\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfiltered_df\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# Change the dataframe from wide to long\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdf2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwide_to_long\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfiltered_df\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# Create narrative\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mMarkdown\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34mf\"\"\"The median score for projects in District {caltrans_district} is {median_score}
\u001b[0m\n", + "\u001b[0;34m The total number of projects is {total_projects}
\u001b[0m\n", + "\u001b[0;34m The most expensive project costs {max_project}\u001b[0m\n", + "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mMarkdown\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34mf\"\"\"

Metrics aggregated by Categories

\u001b[0m\n", + "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mstyle_df\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maggregated_df\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mMarkdown\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34mf\"\"\"

Overview of Projects

\u001b[0m\n", + "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mstyle_df\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfiltered_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Project Name\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Overall Score\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Scope Of Work\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mMarkdown\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34mf\"\"\"

Metric Scores by Project

\u001b[0m\n", + "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcreate_metric_chart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mFile:\u001b[0m ~/data-analyses/starter_kit/_starterkit_utils.py\n", + "\u001b[0;31mType:\u001b[0m function" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "_starterkit_utils.create_district_summary??" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "6d6f524a-d49b-4729-801f-ccc4bd800149", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "The median score for projects in District 10 is 72.5
\n", + " The total number of projects is 2
\n", + " The most expensive project costs $7,160,933.00\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "

Metrics aggregated by Categories

\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategoryMedian ScoreMedian Project CostTotal Projects
Other59$816,5691
Transit86$7,160,9331
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "

Overview of Projects

\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Project NameOverall ScoreScope Of Work
Countryside Clover Rail Connector59A 20 mile rail improvement project for freight transportation, upgrading track infrastructure, and implementing advanced safety features to reduce derailment risk.
Brookside Bus Blossom Lane86Prioritize public transportation and enhance air quality by dedicating lanes to buses and hovs on brookside boulevard, integrating smart traffic signals and real time transit information inspired by the ancient elves.
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "

Metric Scores by Project

\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "The median score for projects in District 11 is 75.0
\n", + " The total number of projects is 5
\n", + " The most expensive project costs $8,956,026.00\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "

Metrics aggregated by Categories

\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategoryMedian ScoreMedian Project CostTotal Projects
ATP79$8,956,0261
General Lanes89$1,557,7511
Other75$5,796,4771
Transit55$5,425,7841
Transit and ATP75$2,069,1431
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "

Overview of Projects

\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Project NameOverall ScoreScope Of Work
Berry Best Bus Rapid Transit55Dedicated bus lanes with comfortable stops, featuring off board fare payment, priority traffic signals, and enhanced passenger amenities.
Trail of Treats and Transit Hub75A multi use path connecting to public transit, featuring public art installations, wayfinding signage, and amenities like bike storage and repair stations.
Fairy Glen Boulevard79Welcome travelers to our enchanted town with a refreshed fairy glen boulevard, featuring sparkling streetlights, lush wildflower medians, and meandering pedestrian paths
Parkside Pixie Carpool Lane75Encourage sustainable transportation and reduce traffic congestion by constructing high occupancy vehicle (hov) lanes along parkside drive, adorned with fairy inspired artwork.
Ridgewood Ride-Share Rainbow Lane89Support environmentally friendly commuting options by building hov lanes on ridgewood highway, featuring designated ride share pickup and drop off zones, and a touch of magic from the meadow.
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "

Metric Scores by Project

\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for district in range(10, 12):\n", + " _starterkit_utils.create_district_summary(df, district)" + ] + }, + { + "cell_type": "markdown", + "id": "c5082a4a-2b6c-4e72-8e2d-267305ad06a4", + "metadata": {}, + "source": [ + "## Your turn to combine all your functions into one function\n", + "* Take some inspiration from ` _starterkit_utils.create_district_summary(df, district).`\n", + "* Incorporate concepts from `markdown` and `display`. " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/starter_kit/2024_basics_05.ipynb b/starter_kit/2024_basics_05.ipynb new file mode 100644 index 000000000..40e31e42d --- /dev/null +++ b/starter_kit/2024_basics_05.ipynb @@ -0,0 +1,320 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "36c5c03c-164a-4530-9fc6-43f5d1abbf7e", + "metadata": { + "tags": [] + }, + "source": [ + "**Portfolio**\n", + "* You might have seen DDS's [portfolio](https://analysis.calitp.org/).\n", + "* We often present our work on our portfolio because it retains the interactivity of the `Altair` charts and `Geopandas` maps we make.\n", + "* Additionally, it is very streamlined to update our work when it needs to be updated. \n", + "* Spend some time exploring our portfolio above. \n", + "\n", + "**How does the portfolio work?**\n", + "* For the majority of the sites on the portfolio are using a single notebook essentially as a template that is looped one or more variables. 
\n", + " * This [National Transit Dataset Monthly Ridership by Regional Transit Planning Authority (RTPA) portfolio](https://ntd-monthly-ridership--cal-itp-data-analyses.netlify.app/readme) takes [this notebook](https://github.com/cal-itp/data-analyses/blob/main/ntd/monthly_ridership_report.ipynb) and reruns it for every \n", + "RTPA in this [yml file](https://github.com/cal-itp/data-analyses/blob/main/portfolio/sites/ntd_monthly_ridership.yml). \n", + " * This process of looping over variables to generate new notebooks is called parameterizing a notebook.\n", + " \n", + "**Resources**\n", + " * [Preparing notebooks for the portfolio](https://docs.calitp.org/data-infra/publishing/sections/4_notebooks_styling.html)\n", + " * [Publishing to the portfolio](https://docs.calitp.org/data-infra/publishing/sections/5_analytics_portfolio_site.html)\n", + "\n", + "**Let's make a portfolio**\n", + "* Feel free to delete all the instructions off once you're done. \n", + "* Spoiler alert! Your end result will look something like [this](https://ha-starterkit-district--cal-itp-data-analyses.netlify.app/readme)." + ] + }, + { + "cell_type": "markdown", + "id": "4dc7d9d1-5722-467f-8e2f-d1e2b8d7e566", + "metadata": {}, + "source": [ + "**Step 1: Move this notebook**\n", + "* Create a new folder in the `data-analyses` repo called `lastname_portfolio`.\n", + "* Right click -> copy to move this notebook to the new folder.\n", + "* Right click -> rename this notebook as `lastname_portfolio.ipynb`\n", + "* Use `git mv` to move the Python file that holds your functions to the `lastname_portfolio`.\n", + "* Right click -> copy the `starterkit_district.yml` file to the folder `data-analyses/portfolio/sites`. 
Rename `starterkit_district.yml` to `lastname_starterkit_district.yml`\n", + "* Close this original `2024_basics_05.ipynb` and begin working on your new `lastname_portfolio.ipynb`" + ] + }, + { + "cell_type": "markdown", + "id": "8cb15ca0-580e-4a0d-9dbc-accb761d77d1", + "metadata": {}, + "source": [ + "**Step 2: Netlify Setup**\n", + "* Follow the instructions [here](https://docs.calitp.org/data-infra/publishing/sections/5_analytics_portfolio_site.html#netlify-setup).\n", + "* You only need to do this step **once** for the entirety of your career at DDS. \n", + "* Once you have your key set up, you can publish countless portfolios." + ] + }, + { + "cell_type": "markdown", + "id": "89858742-9a67-4f10-9f65-29770f955075", + "metadata": {}, + "source": [ + "**Step 3: Create a `README.md`**\n", + "* When you go to each site on our [portfolio](https://analysis.calitp.org/), you'll always go to the introduction.\n", + "* Every portfolio must have a `README.md` file or else it won't build. \n", + "* It also serves as our page to discuss our methodology, the datasets we used, and other details to give our viewers some context into what they are looking at. \n", + "* We have a template for you to populate [here](https://github.com/cal-itp/data-analyses/blob/main/portfolio/template_README.md). \n", + " * Make sure to rename `template_README.md` as `README.md` in your folder. \n", + " * You cannot deviate from `README.md` such as `README_intro.md` because the portfolio will not build.\n", + "* **Further Reading**: [DDS Docs](https://docs.calitp.org/data-infra/publishing/sections/5_analytics_portfolio_site.html#file-setup)" + ] + }, + { + "cell_type": "markdown", + "id": "2d62df5f-1608-4cf9-8e66-2686d4b9f5da", + "metadata": {}, + "source": [ + "**Step 4: Update `starterkit_district.yml`**\n", + "* You can think of this yml file as a \"Table of Contents.\"\n", + "* We are taking this notebook you're currently reading and re-running it for every element that is listed in the yml file. 
After re-running, a new notebook is generated for that element and published.\n", + "* In the `starterkit_district.yml` please replace text in all caps, such as REPLACE_WITH_YOUR_FOLDER_NAME, with the proper file/folder/notebook. \n", + "* **Further Reading**: [DDS Docs on YML](https://docs.calitp.org/data-infra/publishing/sections/5_analytics_portfolio_site.html#yml)" + ] + }, + { + "cell_type": "markdown", + "id": "2986f064-d922-491a-846b-1d4f5e4ea9e4", + "metadata": {}, + "source": [ + "**Step 6: Importing the right packages**\n", + "* Making a parameterized notebook is extremely finicky.\n", + "* For every notebook you make, **you must copy and paste this block of code below in this exact order.** Otherwise, your notebook won't work.\n", + "* What am I importing?\n", + " * `%%capture`: Captures the parameter/yml parts.\n", + " * `import warnings warnings.filterwarnings('ignore')`: Sometimes when you are analyzing data, warnings pop up. These warnings are quite unattractive and we don't want them to be displayed in a portfolio so we turn off these warnings. You don't want to turn off the warnings if you are still analyzing your data! 
\n", + " * `import calitp_data_analysis.magics`: the library that makes the parameterization magic happen.\n", + "* **Resource**: [DDS Getting Notebooks Ready for Parameterization](https://docs.calitp.org/data-infra/publishing/sections/4_notebooks_styling.html#getting-ready-for-parameterization)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "260ba8f3-dd02-4fdc-945d-450db01d188e", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "import calitp_data_analysis.magics\n", + "\n", + "# All your other packages go here\n", + "# Here I just want pandas and my own utils.\n", + "import pandas as pd\n", + "import _starterkit_utils " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a2996fd-29d0-4a19-ac48-a6957d9f8140", + "metadata": {}, + "outputs": [], + "source": [ + "pd.options.display.max_columns = 100\n", + "pd.options.display.float_format = \"{:.2f}\".format\n", + "pd.set_option(\"display.max_rows\", None)\n", + "pd.set_option(\"display.max_colwidth\", None)" + ] + }, + { + "cell_type": "markdown", + "id": "b07ecdbc-a3a9-4183-80dc-57e71cf61fe6", + "metadata": {}, + "source": [ + "**Step 7: Setting your parameters**\n", + "* While these steps have already been done for you, it would still benefit you to re-do these steps and refer to the resource below. \n", + "* **Resource**: [DDS Docs Capturing Parameters](https://docs.calitp.org/data-infra/publishing/sections/4_notebooks_styling.html#capturing-parameters).\n", + "* **Parameter #1:** Set a cell that is commented out with your parameter. 
Turn on the parameter tag.\n", + " * To turn on the parameter tag: go to the code cell, then go to the upper right-hand corner -> click on the gears -> go to “Cell Tags” -> Add Tag + -> add a tag called “parameters” -> click on the new “parameters” tag to ensure a checkmark shows up and it turns dark gray" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d82c9a8-6f8f-485b-ace5-957f1b80c2f3", + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "# district = 1" + ] + }, + { + "cell_type": "markdown", + "id": "2fca6082-1964-43d7-bcd8-8668c39afaac", + "metadata": {}, + "source": [ + "**Parameter #2:** This second cell replaces each district as the notebook loops over each parameter in the `starterkit_district.yml` file.\n", + "* `%%capture_parameters` must be the first line of code in this block or else your notebook will fail to parameterize." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43a07a8c-567d-471d-be10-a547cd0b3a13", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture_parameters\n", + "district" + ] + }, + { + "cell_type": "markdown", + "id": "f9285e93-924d-4681-b526-97b8e46643b1", + "metadata": {}, + "source": [ + "* **Parameter #3:** The first markdown cell must include parameters to inject. This line below generates the title District 1 Analysis when it is creating the notebook for District 1. Likewise, it'll say District 2 Analysis for District 2's page. \n", + "* Feel free to change this to anything you wish, but make sure this stays a markdown cell.\n", + "* This cell is extremely important; read why [here](https://docs.calitp.org/data-infra/publishing/sections/4_notebooks_styling.html#header)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "cb5a0cc4-3e7e-4aea-81f2-c5e858fb315b", + "metadata": {}, + "source": [ + "# District {district} Analysis " + ] + }, + { + "cell_type": "markdown", + "id": "64615b63-3848-45b7-af2e-19c7b7346997", + "metadata": {}, + "source": [ + "**Step 8: Input your functions**\n", + "* I am loading my dataset first.\n", + "* Then I am adding in my dataset and the district parameter into `_starterkit_utils.create_district_summary`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c91049e1-107d-47d9-9cda-63aa4fbf554b", + "metadata": {}, + "outputs": [], + "source": [ + "df = _starterkit_utils.load_dataset()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd1509c0-b435-456e-ad1c-b583a991f1e2", + "metadata": {}, + "outputs": [], + "source": [ + "_starterkit_utils.create_district_summary(df, district)" + ] + }, + { + "cell_type": "markdown", + "id": "76052780-3c27-405c-9379-13e82130eec7", + "metadata": {}, + "source": [ + "**Step 9: Download the right packages**\n", + "* Navigate back to the root of your repo which is `~/data-analyses`.\n", + "* Once there, install the portfolio requirements using `pip install -r portfolio/requirements.txt`. 
This will take a bit.\n", + "* **Resource**: [DDS Deploying Portfolio](https://docs.calitp.org/data-infra/publishing/sections/5_analytics_portfolio_site.html#building-and-deploying-your-report)" + ] + }, + { + "cell_type": "markdown", + "id": "c8fce487-015d-4e24-8444-d5c07ec17890", + "metadata": {}, + "source": [ + "**Step 10: Build your portfolio**\n", + "* Double check you are at the root of your repo.\n", + "* Replace `REPLACE_YML_NAME` with just the name of your `yml` file without the `.yml` extension into the command below.\n", + "* Run `python portfolio/portfolio.py build REPLACE_YML_NAME --deploy` to build your portfolio.\n", + " * Example: My yml is called `ha_starterkit_district.yml` so I would run `python portfolio/portfolio.py build ha_starterkit_district --deploy`." + ] + }, + { + "cell_type": "markdown", + "id": "36567eba-a5d4-4a38-b67e-ffbf8fe74035", + "metadata": {}, + "source": [ + "**Step 11: View**\n", + "* Your portfolio should be up and running. \n", + "* You can view your portfolio using the draft URL. It'll look something like this: `https://your-site-name--cal-itp-data-analyses.netlify.app`.\n", + "* If everything looks great, commit your work. \n", + " * Parameterizing a notebook creates a lot of new files. Make sure you've committed everything.\n", + " * This is tedious and will involve many directory changes." + ] + }, + { + "cell_type": "markdown", + "id": "41c923e8-4d79-424b-9e28-50ae4924cc24", + "metadata": {}, + "source": [ + "**Step 12: Something not right?**\n", + "* What if something is a little off? After updating your code, rerun this line of code to redo your portfolio. You must always `clean` your portfolio before regenerating new notebooks. 
\n", + "` python portfolio/portfolio.py clean REPLACE_YML_NAME && python portfolio/portfolio.py build REPLACE_YML_NAME --deploy`\n", + "* There are many other specifications you can add to `python portfolio/portfolio.py build` and they are all detailed on [DDS Other Specifications](https://docs.calitp.org/data-infra/publishing/sections/5_analytics_portfolio_site.html#other-specifications). " + ] + }, + { + "cell_type": "markdown", + "id": "9c8f91a0-e0b5-465d-8f95-be8e3fc036ea", + "metadata": {}, + "source": [ + "**Step 13: Run a Makefile**\n", + "* You can generate all 12 of your notebooks in one swift line of code instead of running the same couple of lines over and over again using a `Makefile`. \n", + "* You can think of a `Makefile` as a coffee machine that does the same thing day in and day out. \n", + " * You always install the same packages.\n", + " * You always clean out the repo.\n", + " * You generally will rerun the notebook in its entirety.\n", + " * You always add the `md,yml,ipynb` and other files that the parameterization process creates.\n", + "* Makefiles are great for automating tasks and saving time. \n", + "\n", + "**Instructions** \n", + "* Make sure you are still at the root of our repo `~/data-analyses`.\n", + "* Under `data-analyses` you'll see a file called `Makefile`.\n", + "* Open up the `Makefile`. Scroll down to lines 68-72. \n", + "* Copy and paste the entire block of 68-72. \n", + "* Replace LASTNAME in `build_starterkit_LASTNAME:` with your name.\n", + "* Replace YOUR_SITE_NAME with the name of your .yml file in `/portfolio/sites` in `$(eval export site = YOUR_SITE_NAME)`\n", + " * My `yml` is named `ha_starterkit_district.yml` so my line is `$(eval export site = ha_starterkit_district)`\n", + "* Make sure you retain all the `\t` spaces! 
\n", + "* At the root of the repo run `Make build_starterkit_LASTNAME`.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/starter_kit/_starterkit_utils.py b/starter_kit/_starterkit_utils.py new file mode 100644 index 000000000..11869fb63 --- /dev/null +++ b/starter_kit/_starterkit_utils.py @@ -0,0 +1,195 @@ +import pandas as pd +import numpy as np +import altair as alt +from calitp_data_analysis import calitp_color_palette +from IPython.display import HTML, Image, Markdown, display, display_html + +def reverse_snakecase(df:pd.DataFrame)->pd.DataFrame: + """ + Clean up columns to remove underscores and spaces. + """ + df.columns = df.columns.str.replace("_", " ").str.strip().str.title() + + df.columns = (df.columns.str.replace("Dac", "DAC") + .str.replace("Vmt", "VMT") + .str.replace("Zev", "ZEV") + .str.replace("Lu", "Landuse") + .str.replace("Ct", "CalTrans") + ) + return df + +def load_dataset()->pd.DataFrame: + """ + Load the final dataframe. + """ + GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/starter_kit/" + FILE = "starter_kit_example_categorized.parquet" + + # Read dataframe in + df = pd.read_parquet(f"{GCS_FILE_PATH}{FILE}") + + # Capitalize the Scope of Work column again since it is all lowercase + df.scope_of_work = df.scope_of_work.str.capitalize() + + # Clean up the column names + df = reverse_snakecase(df) + return df + +def aggregate_by_category(df: pd.DataFrame) -> pd.DataFrame: + """ + Find the median overall score and project cost + and total unique projects by category. 
+ """ + agg1 = ( + df.groupby(["Category"]) + .aggregate( + { + "Overall Score": "median", + "Project Cost": "median", + "Project Name": "nunique", + } + ) + .reset_index() + .rename( + columns={ + "Overall Score": "Median Score", + "Project Cost": "Median Project Cost", + "Project Name": "Total Projects", + } + ) + ) + + # Format the Cost column properly + agg1['Median Project Cost'] = agg1['Median Project Cost'].apply(lambda x: '${:,.0f}'.format(x)) + + return agg1 + +def wide_to_long(df:pd.DataFrame)->pd.DataFrame: + """ + Change the dataframe from wide to long based on the project name and + Caltrans District. + """ + df2 = pd.melt( + df, + id_vars=["CalTrans District","Project Name"], + value_vars=[ + "Accessibility Score", + "DAC Accessibility Score", + "DAC Traffic Impacts Score", + "Freight Efficiency Score", + "Freight Sustainability Score", + "Mode Shift Score", + "Landuse Natural Resources Score", + "Safety Score", + "VMT Score", + "ZEV Score", + "Public Engagement Score", + "Climate Resilience Score", + "Program Fit Score", + ]) + + df2 = df2.rename(columns = {'variable':'Metric', + 'value':'Score'}) + return df2 + +def style_df(df: pd.DataFrame): + """ + Styles a dataframe and displays it. + """ + display( + df.style.hide(axis="index") + .format(precision=0) # Display only 2 decimal points + .set_properties(**{ + "background-color": "white", + "text-align": "center" + }) + ) + +def create_metric_chart(df: pd.DataFrame) -> alt.Chart: + """ + Create a chart that displays metric scores + for each project. 
+ """ + # Create dropdown + metrics_list = df["Metric"].unique().tolist() + + metrics_dropdown = alt.binding_select( + options=metrics_list, + name="Metrics: ", + ) + # Column that controls the bar charts + xcol_param = alt.selection_point( + fields=["Metric"], value=metrics_list[0], bind=metrics_dropdown + ) + + chart = ( + alt.Chart(df, title="Metric by Categories") + .mark_circle(size=200) + .encode( + x=alt.X("Score", scale=alt.Scale(domain=[0, 10])), + y=alt.Y("Project Name"), + color=alt.Color( + "Score", + scale=alt.Scale( + range=calitp_color_palette.CALITP_CATEGORY_BRIGHT_COLORS + ), + ), + tooltip=list(df.columns), + ) + .properties(width=400, height=250) + ) + + chart = chart.add_params(xcol_param).transform_filter(xcol_param) + + return chart + +def create_district_summary(df: pd.DataFrame, caltrans_district: int): + """ + Create a summary of CSIS metrics for one Caltrans District. + """ + filtered_df = df.loc[df["CalTrans District"] == caltrans_district].reset_index( + drop=True + ) + # Finding the values referenced in the narrative + median_score = filtered_df["Overall Score"].median() + total_projects = filtered_df["Project Name"].nunique() + max_project = filtered_df["Project Cost"].max() + max_project = f"${max_project:,.2f}" + + # Aggregate the dataframe + aggregated_df = aggregate_by_category(filtered_df) + + # Change the dataframe from wide to long + df2 = wide_to_long(filtered_df) + + # Create narrative + display( + Markdown( + f"""The median score for projects in District {caltrans_district} is {median_score}
+ The total number of projects is {total_projects}
+ The most expensive project costs {max_project} + """ + ) + ) + display( + Markdown( + f"""

Metrics aggregated by Categories

+ """ + ) + ) + style_df(aggregated_df) + + display( + Markdown( + f"""

Overview of Projects

+ """ + ) + ) + style_df(filtered_df[["Project Name", "Overall Score", "Scope Of Work"]]) + display( + Markdown( + f"""

Metric Scores by Project

+ """ + ) + ) + display(create_metric_chart(df2)) \ No newline at end of file diff --git a/starter_kit/starterkit_district.yml b/starter_kit/starterkit_district.yml new file mode 100644 index 000000000..a373713e8 --- /dev/null +++ b/starter_kit/starterkit_district.yml @@ -0,0 +1,31 @@ +directory: ./ha_portfolio/ +notebook: ./ha_portfolio/ha_portfolio.ipynb +parts: +- caption: Introduction +- chapters: + - params: + district: 1 + - params: + district: 2 + - params: + district: 3 + - params: + district: 4 + - params: + district: 5 + - params: + district: 6 + - params: + district: 7 + - params: + district: 8 + - params: + district: 9 + - params: + district: 10 + - params: + district: 11 + - params: + district: 12 +readme: ./ha_portfolio/README.md +title: Starter Kit Portfolio