Skip to content

Commit

Permalink
Merge branch 'main' into curriculum_docs_update
Browse files Browse the repository at this point in the history
  • Loading branch information
shweta487 authored Jan 13, 2025
2 parents 4dd18e4 + c9f7a2d commit a7c861e
Show file tree
Hide file tree
Showing 210 changed files with 8,391 additions and 3,054 deletions.
2 changes: 1 addition & 1 deletion .github/pull_request_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Resolves #\[issue\]

_Include commands/logs/screenshots as relevant._

_If making changes to dbt models, please run the command `poetry run dbt run -s CHANGED_MODEL` and include the output in this section of the PR._
_If making changes to dbt models, please run the command `poetry run dbt run -s CHANGED_MODEL` and `poetry run dbt test -s CHANGED_MODEL`, then include the output in this section of the PR._

## Post-merge follow-ups

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/publish-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:

- name: Build jupyter book
run: jb build docs --warningiserror --keep-going # set doc to fail on any sphinx warning
- uses: actions/upload-artifact@v2
- uses: actions/upload-artifact@v3
if: always()
with:
name: docs-build
Expand Down
6 changes: 4 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@ repos:
rev: 6.0.0
hooks:
- id: flake8
args: ["--ignore=E501,W503"] # line too long and line before binary operator (black is ok with these)
args: ["--ignore=E501,W503,E231"] # line too long and line before binary operator (black is ok with these) and explicitly ignore the whitespace after colon error
types:
- python
# Suppress SyntaxWarning about invalid escape sequence from calitp-data-infra dependency without modifying source
entry: env PYTHONWARNINGS="ignore::SyntaxWarning" flake8
- repo: https://github.com/psf/black
rev: 23.1.0
hooks:
Expand Down Expand Up @@ -71,6 +73,6 @@ repos:
exclude: 'README.md|warehouse/.*'
args: ["--number"]
additional_dependencies:
- mdformat-gfm
- mdformat-gfm==0.3.5
- mdformat-frontmatter
- mdformat-footnote
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# External table definition: exposes the gzipped JSONL objects matched by
# source_objects in gs://calitp-ntd-xlsx-products-clean as
# external_ntd__annual_reporting.2022__annual_database_agency_information.
operator: operators.ExternalTable
bucket: gs://calitp-ntd-xlsx-products-clean
prefix_bucket: false
# Runs a one-row SELECT against the destination table; presumably a smoke
# test that the table is queryable after creation -- confirm against the operator.
post_hook: |
SELECT *
FROM `{{ get_project_id() }}`.external_ntd__annual_reporting.2022__annual_database_agency_information
LIMIT 1;
source_objects:
- "annual_database_agency_information/2022/_2022_agency_information/*.jsonl.gz"
destination_project_dataset_table: "external_ntd__annual_reporting.2022__annual_database_agency_information"
source_format: NEWLINE_DELIMITED_JSON
use_bq_client: true
# Hive partitioning options. source_uri_prefix is the directory portion of the
# source_objects glob above; with mode AUTO the partition keys are presumably
# inferred from the object paths below that prefix -- TODO confirm.
hive_options:
mode: AUTO
require_partition_filter: false
source_uri_prefix: "annual_database_agency_information/2022/_2022_agency_information/"
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# External table definition: exposes the gzipped JSONL objects matched by
# source_objects in gs://calitp-ntd-xlsx-products-clean as
# external_ntd__annual_reporting.2023__annual_database_agency_information.
operator: operators.ExternalTable
bucket: gs://calitp-ntd-xlsx-products-clean
prefix_bucket: false
# Runs a one-row SELECT against the destination table; presumably a smoke
# test that the table is queryable after creation -- confirm against the operator.
post_hook: |
SELECT *
FROM `{{ get_project_id() }}`.external_ntd__annual_reporting.2023__annual_database_agency_information
LIMIT 1;
source_objects:
- "annual_database_agency_information/2023/agency_information/*.jsonl.gz"
destination_project_dataset_table: "external_ntd__annual_reporting.2023__annual_database_agency_information"
source_format: NEWLINE_DELIMITED_JSON
use_bq_client: true
# Hive partitioning options. source_uri_prefix is the directory portion of the
# source_objects glob above; with mode AUTO the partition keys are presumably
# inferred from the object paths below that prefix -- TODO confirm.
hive_options:
mode: AUTO
require_partition_filter: false
source_uri_prefix: "annual_database_agency_information/2023/agency_information/"
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# External table definition: exposes the gzipped JSONL objects matched by
# source_objects in gs://calitp-ntd-xlsx-products-clean as
# external_ntd__annual_reporting.2023__annual_database_contractual_relationships.
operator: operators.ExternalTable
bucket: gs://calitp-ntd-xlsx-products-clean
prefix_bucket: false
# Runs a one-row SELECT against the destination table; presumably a smoke
# test that the table is queryable after creation -- confirm against the operator.
post_hook: |
SELECT *
FROM `{{ get_project_id() }}`.external_ntd__annual_reporting.2023__annual_database_contractual_relationships
LIMIT 1;
# NOTE(review): the leading path segment is singular
# ("annual_database_contractual_relationship") while the table name and the
# sub-directory are plural ("..._relationships") -- confirm this matches the
# actual object layout in the bucket.
source_objects:
- "annual_database_contractual_relationship/2023/contractual_relationships/*.jsonl.gz"
destination_project_dataset_table: "external_ntd__annual_reporting.2023__annual_database_contractual_relationships"
source_format: NEWLINE_DELIMITED_JSON
use_bq_client: true
# Hive partitioning options. source_uri_prefix is the directory portion of the
# source_objects glob above; with mode AUTO the partition keys are presumably
# inferred from the object paths below that prefix -- TODO confirm.
hive_options:
mode: AUTO
require_partition_filter: false
source_uri_prefix: "annual_database_contractual_relationship/2023/contractual_relationships/"
Original file line number Diff line number Diff line change
Expand Up @@ -16,125 +16,88 @@ hive_options:
source_uri_prefix: "annual-database-agency-information/{dt:DATE}/{ts:TIMESTAMP}/{year:INTEGER}/"
schema_fields:
- name: number_of_state_counties
type: FLOAT
mode: NULLABLE
type: NUMERIC
- name: tam_tier
type: STRING
mode: NULLABLE
- name: personal_vehicles
type: FLOAT
mode: NULLABLE
type: NUMERIC
- name: density
type: FLOAT
mode: NULLABLE
- name: uza_name
type: STRING
mode: NULLABLE
- name: tribal_area_name
type: STRING
mode: NULLABLE
- name: service_area_sq_miles
type: FLOAT
mode: NULLABLE
type: NUMERIC
- name: total_voms
type: FLOAT
mode: NULLABLE
type: NUMERIC
- name: city
type: STRING
mode: NULLABLE
- name: fta_recipient_id
type: FLOAT
mode: NULLABLE
type: NUMERIC
- name: region
type: FLOAT
mode: NULLABLE
type: NUMERIC
- name: state_admin_funds_expended
type: FLOAT
mode: NULLABLE
type: NUMERIC
- name: zip_code_ext
type: FLOAT
mode: NULLABLE
type: STRING
- name: zip_code
type: FLOAT
mode: NULLABLE
type: STRING
- name: ueid
type: STRING
mode: NULLABLE
- name: division_department
type: STRING
- name: state_parent_ntd_id
type: STRING
- name: address_line_2
type: STRING
mode: NULLABLE
- name: number_of_counties_with_service
type: FLOAT
mode: NULLABLE
type: NUMERIC
- name: reporter_acronym
type: STRING
mode: NULLABLE
- name: original_due_date
type: INTEGER
mode: NULLABLE
type: STRING
- name: sq_miles
type: FLOAT
mode: NULLABLE
type: NUMERIC
- name: address_line_1
type: STRING
mode: NULLABLE
- name: p_o__box
type: STRING
mode: NULLABLE
- name: fy_end_date
type: INTEGER
mode: NULLABLE
type: STRING
- name: reported_by_ntd_id
type: STRING
mode: NULLABLE
- name: population
type: FLOAT
mode: NULLABLE
type: NUMERIC
- name: reporting_module
type: STRING
mode: NULLABLE
- name: service_area_pop
type: FLOAT
mode: NULLABLE
type: NUMERIC
- name: subrecipient_type
type: STRING
mode: NULLABLE
- name: state
type: STRING
mode: NULLABLE
- name: volunteer_drivers
type: FLOAT
mode: NULLABLE
type: NUMERIC
- name: primary_uza
type: FLOAT
mode: NULLABLE
type: NUMERIC
- name: doing_business_as
type: STRING
mode: NULLABLE
- name: reporter_type
type: STRING
mode: NULLABLE
- name: legacy_ntd_id
type: STRING
mode: NULLABLE
- name: voms_do
type: FLOAT
mode: NULLABLE
type: NUMERIC
- name: url
type: STRING
mode: NULLABLE
- name: reported_by_name
type: STRING
mode: NULLABLE
- name: voms_pt
type: FLOAT
mode: NULLABLE
type: NUMERIC
- name: organization_type
type: STRING
mode: NULLABLE
- name: agency_name
type: STRING
mode: NULLABLE
- name: ntd_id
type: STRING
mode: NULLABLE
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# External table definition: exposes the gzipped JSONL objects under
# fra_regulated_mode_major_security_events/historical/ in
# gs://calitp-ntd-api-products as
# external_ntd__safety_and_security.historical__fra_regulated_mode_major_security_events.
operator: operators.ExternalTable
bucket: gs://calitp-ntd-api-products
source_objects:
- "fra_regulated_mode_major_security_events/historical/*.jsonl.gz"
source_format: NEWLINE_DELIMITED_JSON
use_bq_client: true
# Hive partitioning options in CUSTOM mode: source_uri_prefix declares the
# partition keys and their types (dt as DATE, execution_ts as TIMESTAMP).
# Queries are not required to filter on them (require_partition_filter: false).
hive_options:
mode: CUSTOM
require_partition_filter: false
source_uri_prefix: "fra_regulated_mode_major_security_events/historical/{dt:DATE}/{execution_ts:TIMESTAMP}"
destination_project_dataset_table: "external_ntd__safety_and_security.historical__fra_regulated_mode_major_security_events"
prefix_bucket: false
# Runs a one-row SELECT against the destination table; presumably a smoke
# test that the table is queryable after creation -- confirm against the operator.
post_hook: SELECT * FROM `{{ get_project_id() }}`.external_ntd__safety_and_security.historical__fra_regulated_mode_major_security_events LIMIT 1;
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# External table definition: exposes the gzipped JSONL objects under
# major_safety_events/historical/ in gs://calitp-ntd-api-products as
# external_ntd__safety_and_security.historical__major_safety_events.
operator: operators.ExternalTable
bucket: gs://calitp-ntd-api-products
source_objects:
- "major_safety_events/historical/*.jsonl.gz"
source_format: NEWLINE_DELIMITED_JSON
use_bq_client: true
# Hive partitioning options in CUSTOM mode: source_uri_prefix declares the
# partition keys and their types (dt as DATE, execution_ts as TIMESTAMP).
# Queries are not required to filter on them (require_partition_filter: false).
hive_options:
mode: CUSTOM
require_partition_filter: false
source_uri_prefix: "major_safety_events/historical/{dt:DATE}/{execution_ts:TIMESTAMP}"
destination_project_dataset_table: "external_ntd__safety_and_security.historical__major_safety_events"
prefix_bucket: false
# Runs a one-row SELECT against the destination table; presumably a smoke
# test that the table is queryable after creation -- confirm against the operator.
post_hook: SELECT * FROM `{{ get_project_id() }}`.external_ntd__safety_and_security.historical__major_safety_events LIMIT 1;
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# External table definition: exposes the gzipped JSONL objects under
# monthly_modal_time_series_safety_and_service/historical/ in
# gs://calitp-ntd-api-products as
# external_ntd__safety_and_security.historical__monthly_modal_time_series_safety_and_service.
operator: operators.ExternalTable
bucket: gs://calitp-ntd-api-products
source_objects:
- "monthly_modal_time_series_safety_and_service/historical/*.jsonl.gz"
source_format: NEWLINE_DELIMITED_JSON
use_bq_client: true
# Hive partitioning options in CUSTOM mode: source_uri_prefix declares the
# partition keys and their types (dt as DATE, execution_ts as TIMESTAMP).
# Queries are not required to filter on them (require_partition_filter: false).
hive_options:
mode: CUSTOM
require_partition_filter: false
source_uri_prefix: "monthly_modal_time_series_safety_and_service/historical/{dt:DATE}/{execution_ts:TIMESTAMP}"
destination_project_dataset_table: "external_ntd__safety_and_security.historical__monthly_modal_time_series_safety_and_service"
prefix_bucket: false
# Runs a one-row SELECT against the destination table; presumably a smoke
# test that the table is queryable after creation -- confirm against the operator.
post_hook: SELECT * FROM `{{ get_project_id() }}`.external_ntd__safety_and_security.historical__monthly_modal_time_series_safety_and_service LIMIT 1;
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# External table definition: exposes the gzipped JSONL objects under
# nonmajor_safety_and_security_events/historical/ in
# gs://calitp-ntd-api-products as
# external_ntd__safety_and_security.historical__nonmajor_safety_and_security_events.
operator: operators.ExternalTable
bucket: gs://calitp-ntd-api-products
source_objects:
- "nonmajor_safety_and_security_events/historical/*.jsonl.gz"
source_format: NEWLINE_DELIMITED_JSON
use_bq_client: true
# Hive partitioning options in CUSTOM mode: source_uri_prefix declares the
# partition keys and their types (dt as DATE, execution_ts as TIMESTAMP).
# Queries are not required to filter on them (require_partition_filter: false).
hive_options:
mode: CUSTOM
require_partition_filter: false
source_uri_prefix: "nonmajor_safety_and_security_events/historical/{dt:DATE}/{execution_ts:TIMESTAMP}"
destination_project_dataset_table: "external_ntd__safety_and_security.historical__nonmajor_safety_and_security_events"
prefix_bucket: false
# Runs a one-row SELECT against the destination table; presumably a smoke
# test that the table is queryable after creation -- confirm against the operator.
post_hook: SELECT * FROM `{{ get_project_id() }}`.external_ntd__safety_and_security.historical__nonmajor_safety_and_security_events LIMIT 1;
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# External table definition: exposes the gzipped JSONL objects under
# state_highway_network_geodata/ in gs://calitp-state-geoportal-scrape as
# external_state_geoportal.state_highway_network, with an explicit column
# schema (schema_fields at the end of this file) instead of an inferred one.
operator: operators.ExternalTable
bucket: gs://calitp-state-geoportal-scrape
source_objects:
- "state_highway_network_geodata/*.jsonl.gz"
source_format: NEWLINE_DELIMITED_JSON
use_bq_client: true
# Hive partitioning options in CUSTOM mode: source_uri_prefix declares the
# partition keys and their types (dt as DATE, execution_ts as TIMESTAMP).
# Queries are not required to filter on them (require_partition_filter: false).
hive_options:
mode: CUSTOM
require_partition_filter: false
source_uri_prefix: "state_highway_network_geodata/{dt:DATE}/{execution_ts:TIMESTAMP}/"
destination_project_dataset_table: "external_state_geoportal.state_highway_network"
prefix_bucket: false
# Runs a one-row SELECT against the destination table; presumably a smoke
# test that the table is queryable after creation -- confirm against the operator.
post_hook: |
SELECT *
FROM `{{ get_project_id() }}`.external_state_geoportal.state_highway_network
LIMIT 1;
schema_fields:
- name: Route
type: INTEGER
- name: County
type: STRING
- name: District
type: INTEGER
- name: RouteType
type: STRING
- name: Direction
type: STRING
# NOTE(review): the column name suggests the source rows carry WKT text that
# is read as GEOGRAPHY -- confirm the scraper's output format.
- name: wkt_coordinates
type: GEOGRAPHY
19 changes: 19 additions & 0 deletions airflow/dags/scrape_state_geoportal/METADATA.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# DAG-level settings for the scrape_state_geoportal DAG: description,
# schedule, tags, and the default arguments applied to its tasks.
description: "Scrape State Highway Network from State Geoportal"
schedule_interval: "0 4 1 * *" # 4am UTC first day of every month
tags:
- all_gusty_features
default_args:
owner: airflow
depends_on_past: False
# catchup is off; presumably intervals between start_date and the first
# deployment are not backfilled -- confirm against the scheduler behaviour.
catchup: False
start_date: "2024-09-15"
# Failure notifications go to the address below; retries do not send mail.
email:
- "[email protected]"
email_on_failure: True
email_on_retry: False
# One retry, attempted two minutes after the failure.
retries: 1
retry_delay: !timedelta 'minutes: 2'
concurrency: 50
#sla: !timedelta 'hours: 2'
wait_for_defaults:
timeout: 3600
7 changes: 7 additions & 0 deletions airflow/dags/scrape_state_geoportal/state_highway_network.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Task definition: scrape the State Highway Network layer from the Caltrans
# GIS REST services using operators.StateGeoportalAPIOperator.
operator: operators.StateGeoportalAPIOperator

# Name under which the scraped product is identified.
product: "state_highway_network"

# Location of the layer to request. How these three values are combined into
# a request URL is decided by the operator -- see its implementation.
root_url: "https://caltrans-gis.dot.ca.gov/arcgis/rest/services/"
service: "CHhighway/SHN_Lines"
layer: "0"

# NOTE(review): presumably the number of records requested per call --
# confirm against the operator.
resultRecordCount: 2000
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ operator: operators.NtdDataProductAPIOperator
year: 'historical'
product: 'major_safety_events'
root_url: 'https://data.transportation.gov/resource/'
endpoint_id: '9ivb-8ae9'
endpoint_id: 'urir-txqm'
file_format: '.json'
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ operator: operators.NtdDataProductAPIOperator
year: 'historical'
product: 'monthly_modal_time_series_safety_and_service'
root_url: 'https://data.transportation.gov/resource/'
endpoint_id: '65fa-qbkf'
endpoint_id: '5ti2-5uiv'
file_format: '.json'
5 changes: 3 additions & 2 deletions airflow/dags/sync_ntd_data_xlsx/METADATA.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
description: "Scrape tables from DOT Ridership XLSX file daily"
schedule_interval: "0 10 * * *" # 10am UTC every day
description: "Scrape tables from DOT Ridership XLSX file weekly"
schedule_interval: "0 10 * * 1" # 10am UTC every Monday
tags:
- all_gusty_features
default_args:
Expand All @@ -15,5 +15,6 @@ default_args:
retry_delay: !timedelta 'minutes: 2'
concurrency: 50
#sla: !timedelta 'hours: 2'
provide_context: True
wait_for_defaults:
timeout: 3600
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Task definition: fetch the 2022 "annual database agency information" NTD
# data product with operators.NtdDataProductXLSXOperator.
operator: operators.NtdDataProductXLSXOperator

# Runs only after the URL-scraping task has completed.
dependencies: [scrape_ntd_xlsx_urls]

product: "annual_database_agency_information"

# One of: 'historical' (long history), 'multi-year' (select history),
# or a specific year (ex: 2022).
year: "2022"

# Placeholder for the scraped URL produced by the scrape_ntd_xlsx_urls task.
xlsx_file_url: "https://www.transit.dot.gov/ntd/data-product/2022-annual-database-agency-information"
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Task definition: fetch the 2023 "annual database agency information" NTD
# data product with operators.NtdDataProductXLSXOperator.
operator: operators.NtdDataProductXLSXOperator

# Runs only after the URL-scraping task has completed.
dependencies: [scrape_ntd_xlsx_urls]

product: "annual_database_agency_information"

# One of: 'historical' (long history), 'multi-year' (select history),
# or a specific year (ex: 2022).
year: "2023"

# Placeholder for the scraped URL produced by the scrape_ntd_xlsx_urls task.
xlsx_file_url: "https://www.transit.dot.gov/ntd/data-product/2023-annual-database-agency-information"
Loading

0 comments on commit a7c861e

Please sign in to comment.