From 77a8d6eae7a1de4cbf82f67416c9ab7b930f5baa Mon Sep 17 00:00:00 2001 From: Geoff Boeing Date: Mon, 8 Jan 2024 11:41:03 -0800 Subject: [PATCH] updates --- .github/workflows/tests.yml | 19 +- .gitignore | 1 + .pre-commit-config.yaml | 50 + README.md | 4 - assignments/assignment2.md | 6 +- assignments/assignment4.md | 6 +- assignments/final-project.md | 26 +- assignments/mini-lecture.md | 22 +- environment.yml | 19 +- format.sh | 6 - modules/01-introduction/readme.md | 19 +- modules/03-python-data-science/lecture.ipynb | 165 ++-- .../lecture.ipynb | 194 ++-- modules/05-geocoding-apis/README.md | 12 +- modules/05-geocoding-apis/lecture.ipynb | 250 ++--- modules/06-spatial-data/lecture.ipynb | 191 ++-- .../06-spatial-data/raster-crop-bbox.ipynb | 28 +- modules/07-urban-networks-i/lecture.ipynb | 98 +- modules/08-urban-networks-ii/lecture.ipynb | 217 ++--- .../08-urban-networks-ii/process-lodes.ipynb | 63 +- modules/09-spatial-analysis/lecture.ipynb | 197 ++-- .../difference-in-differences.ipynb | 70 +- modules/10-spatial-models/lecture.ipynb | 212 +++-- modules/11-supervised-learning/lecture.ipynb | 83 +- .../12-unsupervised-learning/lecture.ipynb | 248 +++-- .../lecture.ipynb | 865 ------------------ modules/14-computer-vision/readme.md | 3 + .../README.md | 90 +- software/docker/Dockerfile | 4 +- software/docker/requirements.txt | 19 +- software/readme.md | 18 +- syllabus/readme.md | 158 ++-- 32 files changed, 1312 insertions(+), 2051 deletions(-) create mode 100644 .pre-commit-config.yaml delete mode 100644 format.sh delete mode 100644 modules/13-natural-language-processing/lecture.ipynb create mode 100644 modules/14-computer-vision/readme.md rename modules/{14-next-steps => 15-next-steps}/README.md (69%) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9c837c8..cafecf4 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -7,9 +7,7 @@ on: branches: [main] jobs: - build: - name: ${{ matrix.os }} runs-on: ${{ 
matrix.os }} strategy: @@ -19,24 +17,21 @@ jobs: defaults: run: - shell: bash -l {0} + shell: bash -elo pipefail {0} steps: - - name: Checkout repo uses: actions/checkout@v3 with: fetch-depth: 2 - - name: Setup Conda environment with Micromamba - uses: mamba-org/provision-with-micromamba@v14 + - name: Create environment with Micromamba + uses: mamba-org/setup-micromamba@v1 with: cache-downloads: true - cache-env: true - channels: conda-forge - channel-priority: strict + cache-environment: true environment-file: environment.yml - environment-name: ppde642 + post-cleanup: none - name: Test environment run: | @@ -45,3 +40,7 @@ jobs: conda info --all jupyter kernelspec list ipython -c "import osmnx; print('OSMnx version', osmnx.__version__)" + + - name: Lint + run: | + SKIP=no-commit-to-branch pre-commit run --all-files diff --git a/.gitignore b/.gitignore index e4a0c31..e0d0305 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ data/* modules/*/*.gal modules/*/*.png +modules/*/cache/* modules/*/keys.py syllabus/pdf/*.pdf diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..f78e52e --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,50 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: "v4.5.0" + hooks: + - id: check-added-large-files + args: [--maxkb=50] + - id: check-ast + - id: check-builtin-literals + - id: check-case-conflict + - id: check-docstring-first + - id: check-json + - id: check-merge-conflict + args: [--assume-in-merge] + - id: check-yaml + - id: debug-statements + - id: detect-private-key + - id: end-of-file-fixer + - id: fix-byte-order-marker + - id: mixed-line-ending + - id: no-commit-to-branch + args: [--branch, main] + - id: trailing-whitespace + + - repo: https://github.com/pre-commit/mirrors-prettier + rev: "v3.0.3" + hooks: + - id: prettier + types_or: [markdown, yaml] + + - repo: https://github.com/nbQA-dev/nbQA + rev: "1.7.1" + hooks: + - id: nbqa-isort + 
additional_dependencies: [isort] + args: [--line-length=100, --sl] + - id: nbqa-black + additional_dependencies: [black] + args: [--line-length=100] + - id: nbqa-flake8 + additional_dependencies: [flake8] + args: [--max-line-length=100] + + - repo: local + hooks: + - id: nbconvert + name: clear notebook output + entry: jupyter nbconvert + language: system + types: [jupyter] + args: ["--clear-output", "--inplace"] diff --git a/README.md b/README.md index 9c35bdf..c31e941 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,12 @@ [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/gboeing/ppde642/main?urlpath=lab) [![Build Status](https://github.com/gboeing/ppde642/workflows/tests/badge.svg?branch=main)](https://github.com/gboeing/ppde642/actions?query=workflow%3A%22tests%22) - # PPDE642: Advanced Urban Analytics This is the second part of a two-course series on **urban data science** that I teach at the **University of Southern California**'s Department of Urban Planning and Spatial Analysis. This course series takes a computational social science approach to working with urban data. It uses Python and Jupyter notebooks to introduce coding and statistical methods that students can reproduce and experiment with in the cloud. The series as a whole presumes no prior knowledge as it introduces coding, stats, spatial analysis, and applied machine learning from the ground up, but PPDE642 assumes you have completed [PPD534](https://github.com/gboeing/ppd534) or its equivalent. - ## Urban Data Science course series ### PPD534: Data, Evidence, and Communication for the Public Good @@ -17,14 +15,12 @@ The first course in the series, **PPD534**, starts with the basics of coding wit **PPD534**'s lecture materials are available on [GitHub](https://github.com/gboeing/ppd534) and interactively on [Binder](https://mybinder.org/v2/gh/gboeing/ppd534/main). 
- ### PPDE642: Advanced Urban Analytics The second course, **PPDE642**, assumes you have completed PPD534 (or its equivalent) and builds on its topics. It introduces spatial analysis, network analysis, spatial models, and applied machine learning. It also digs deeper into the tools and workflows of urban data science in both research and practice. **PPDE642**'s lecture materials are available in this repo and interactively on [Binder](https://mybinder.org/v2/gh/gboeing/ppde642/main). - ## Not a USC student? Did you discover this course on GitHub? Come study with us: [consider applying](https://geoffboeing.com/lab/) to the urban planning master's or PhD programs at USC. diff --git a/assignments/assignment2.md b/assignments/assignment2.md index 35439ce..590e323 100644 --- a/assignments/assignment2.md +++ b/assignments/assignment2.md @@ -6,9 +6,9 @@ You will clean, organize, describe, and visualize the data you downloaded in Ass Create a new Jupyter notebook. The first cell of your notebook should be markdown explaining what your research question and hypotheses are, where you found your data set, and what it contains. Given your proposed project: - 1. Load your data set and clean/process it as needed. - 1. Identify at least two variables of interest and calculate relevant descriptive statistics. - 1. Using the techniques we learned in class, visualize interesting aspects of your data set. Create at least 4 visualizations using at least 3 different visualization types (e.g., scatterplots, barplots, maps, etc). +1. Load your data set and clean/process it as needed. +1. Identify at least two variables of interest and calculate relevant descriptive statistics. +1. Using the techniques we learned in class, visualize interesting aspects of your data set. Create at least 4 visualizations using at least 3 different visualization types (e.g., scatterplots, barplots, maps, etc). Make sure your code is well-commented throughout for explanatory clarity. 
Your notebook should be well-organized into high-level sections using markdown headers representing the steps above, plus subheaders as needed. Each visualization should be followed by a markdown cell that explains what you are visualizing, why it is interesting, and why you made your specific graphical design decisions. What story does each visual tell? How does it enrich, confirm, or contradict the descriptive statistics you calculated earlier? diff --git a/assignments/assignment4.md b/assignments/assignment4.md index 8113735..3f83cdc 100644 --- a/assignments/assignment4.md +++ b/assignments/assignment4.md @@ -6,9 +6,9 @@ You will conduct a spatial analysis using a spatial dataset (ideally the same on Create a new Jupyter notebook. The first cell of your notebook should be markdown explaining what your research question and hypotheses are, where you found your data set, and what it contains. Use geopandas to load your data set and clean/process it as needed. Make sure your code is well-commented throughout for explanatory clarity. Using the techniques we learned in class, do the following: - 1. conduct a spatial analysis to look for hot/cold spots and assess spatial autocorrelation - 1. compute spatial diagnostics to pick an appropriate spatial regression model - 1. estimate and interpret a spatial regression model +1. conduct a spatial analysis to look for hot/cold spots and assess spatial autocorrelation +1. compute spatial diagnostics to pick an appropriate spatial regression model +1. estimate and interpret a spatial regression model Your notebook should be separated into high-level sections using markdown headers representing the steps above. Each section should conclude with a markdown cell that succinctly explains your analysis/visuals, why you set it up the way you did, and how you interpret its results. 
Your notebook should conclude with a markdown cell that explains 1) what evidence does this analysis provide for your research question and hypothesis, 2) what is the "big picture" story, and 3) how can planners or policymakers use this finding. diff --git a/assignments/final-project.md b/assignments/final-project.md index d0aa569..8ca4203 100644 --- a/assignments/final-project.md +++ b/assignments/final-project.md @@ -8,27 +8,27 @@ The final project is a cumulative assignment that requires you to use the skills Identify a conference of interest and familiarize yourself with their paper submission requirements. You might consider the following conferences, among others: - - Transportation Research Board (TRB) - - Association of Collegiate Schools of Planning (ACSP) - - American Planning Association's National Planning Conference (APA) - - American Association of Geographers (AAG) - - Urban Affairs Association (UAA) +- Transportation Research Board (TRB) +- Association of Collegiate Schools of Planning (ACSP) +- American Planning Association's National Planning Conference (APA) +- American Association of Geographers (AAG) +- Urban Affairs Association (UAA) Develop an urban research question that fits with the themes of your chosen conference. Develop a research design to answer this question, then collect data, clean and organize it, visualize it, and analyze it. Write a conference paper organized into five sections: - 1. introduction: provide a short (3 paragraph) summary of the study's importance, methods, and findings/implications (1 paragraph each) - 2. background: explain the context of your study and provide a short lit review of relevant related work to establish what is known and what urgent open questions remain - 3. methods: present your data and your analysis methods with sufficient detail that a reader could reproduce your study - 4. results: present your findings and include supporting visuals - 5. 
discussion: circle back to your research question, interpret your findings, and discuss their importance and how planners or policymakers could use them to improve some aspect of urban living +1. introduction: provide a short (3 paragraph) summary of the study's importance, methods, and findings/implications (1 paragraph each) +2. background: explain the context of your study and provide a short lit review of relevant related work to establish what is known and what urgent open questions remain +3. methods: present your data and your analysis methods with sufficient detail that a reader could reproduce your study +4. results: present your findings and include supporting visuals +5. discussion: circle back to your research question, interpret your findings, and discuss their importance and how planners or policymakers could use them to improve some aspect of urban living Format your paper according to the conference's guidelines. For the purposes of this course, your paper must be at least 3000 words in length (not including tables, figures, captions, or references). It must include the following, at a minimum: - - a table of descriptive statistics - - a table of spatial regression or machine learning model results - - 4 aesthetically-pleasing figures containing data visualizations including at least 1 map +- a table of descriptive statistics +- a table of spatial regression or machine learning model results +- 4 aesthetically-pleasing figures containing data visualizations including at least 1 map You are strongly encouraged, but not required, to actually submit this paper to the conference. diff --git a/assignments/mini-lecture.md b/assignments/mini-lecture.md index 359d877..e6d4859 100644 --- a/assignments/mini-lecture.md +++ b/assignments/mini-lecture.md @@ -8,21 +8,21 @@ This exercise is intended to be informal and an opportunity for self-discovery. Instructions: - - Pick a method listed in the syllabus for those weeks or covered in the reading material. 
- - Learn how the method works by reading the week's reading material. - - Practice the method in your own notebook on your own data. - - Google for additional usage examples and further information. - - Prepare a mini-lecture notebook that would take 8-10 minutes to present that 1) briefly introduces why someone would use the method and how it works (~2 minutes), 2) demonstrates in code how to use the method for a simple data analysis (~5 minutes), 3) summarizes what the analysis revealed (~2 minutes). +- Pick a method listed in the syllabus for those weeks or covered in the reading material. +- Learn how the method works by reading the week's reading material. +- Practice the method in your own notebook on your own data. +- Google for additional usage examples and further information. +- Prepare a mini-lecture notebook that would take 8-10 minutes to present that 1) briefly introduces why someone would use the method and how it works (~2 minutes), 2) demonstrates in code how to use the method for a simple data analysis (~5 minutes), 3) summarizes what the analysis revealed (~2 minutes). -8 minutes is not a lot of time, so keep your lecture notebook simple and brief. Have a clean dataset ready to go at the beginning of your lecture. Do not show us a lot of preparatory steps setting things up in your notebook. Jump right into the analysis that demonstrates your method. +8 minutes is not a lot of time, so keep your lecture notebook simple and brief. Have a clean dataset ready to go at the beginning of your lecture. Do not show us a lot of preparatory steps setting things up in your notebook. Jump right into the analysis that demonstrates your method. You will be graded according to the following. 
In your notebook, did you: - - summarize why someone would use this method and how it works, at a high-level - - demonstrate the method with a simple data analysis - - summarize what your analysis revealed - - keep it all succinct +- summarize why someone would use this method and how it works, at a high-level +- demonstrate the method with a simple data analysis +- summarize what your analysis revealed +- keep it all succinct Make sure your notebook runs from the top without any errors (i.e., restart the kernel and run all cells) and that all the output can be seen inline without me having to re-run your notebook. Via Blackboard, submit your notebook and data files, all zipped as a single file, named `LastName_FirstName_Lecture.zip`. If your submission file exceeds Blackboard's maximum upload size limit, you may provide a Google Drive link to your zipped data in the comment field when you submit. -Note that if you pick a supervised learning method, your assignment is due prior to class in module 11. If you pick an unsupervised learning method, your assignment is due prior to class in module 12. The "presentation" is pretend: you are just creating the lecture notebook you would have presented, and submitting it via Blackboard *before that module's class session* begins. +Note that if you pick a supervised learning method, your assignment is due prior to class in module 11. If you pick an unsupervised learning method, your assignment is due prior to class in module 12. The "presentation" is pretend: you are just creating the lecture notebook you would have presented, and submitting it via Blackboard _before that module's class session_ begins. 
diff --git a/environment.yml b/environment.yml index 556473c..2763cc9 100644 --- a/environment.yml +++ b/environment.yml @@ -5,32 +5,33 @@ channels: dependencies: - beautifulsoup4 - - black - cartopy - cenpy - - conda - contextily - - dill - - flake8 - folium - - gensim - geopandas - - isort - jupyterlab - mapclassify - osmnx=1.8.1 - - nbqa - - nltk - pandana - pandas + - pre-commit - pysal - python=3.11.* - rasterio - - rtree - seaborn - scikit-learn - scipy - statsmodels + + # computer vision and NLP + - gensim + - nltk + - pillow + - pytorch + - torchvision + + # others (unused) # bokeh # datashader # holoviews diff --git a/format.sh b/format.sh deleted file mode 100644 index 3e33f48..0000000 --- a/format.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -e -nbqa isort ./modules/*/*.ipynb --nbqa-mutate --line-length 100 --sl -# nbqa black ./modules/*/*.ipynb --nbqa-mutate --line-length 100 -# nbqa flake8 ./modules/*/*.ipynb --max-line-length 140 -jupyter nbconvert --ClearOutputPreprocessor.enabled=True --inplace ./modules/*/*.ipynb diff --git a/modules/01-introduction/readme.md b/modules/01-introduction/readme.md index 7102319..ef8e4df 100644 --- a/modules/01-introduction/readme.md +++ b/modules/01-introduction/readme.md @@ -2,29 +2,24 @@ In this module, we introduce the course, the syllabus, the semester's expectations and schedule, and set up the computing environment for coursework. Then we introduce the foundational tools underlying much of the modern data science world: package management, version control, and computational notebooks. - ## Syllabus The syllabus is in the [syllabus](../../syllabus) folder. - ## Computing environment Make sure that you have already completed the course's initial [software](../../software) setup before proceeding. - ## Package management A Python **module** is a file of Python code containing variables, classes, functions, etc. A Python **package** is a collection of modules, kind of like a folder of files and subfolders. 
A package can be thought of as a computer program. **Package management** is the process of installing, uninstalling, configuring, and upgrading packages on a computer. A **package manager** is a software tool for package management, retrieving information and installing packages from a software repository. The most common Python package managers are `conda` and `pip`. These tools are typically used in the terminal. - ### pip `pip` installs Python packages from [PyPI](https://pypi.org/) in the form of wheels or source code. The latter often requires that you have library dependencies and compatible compilers already installed on your system to install the Python package. This often requires some expertise when installing complicated toolkits, such as the Python geospatial data science ecosystem. For that reason, I recommend using `conda` unless you have to use `pip`. - ### conda `conda` installs packages from Anaconda's software repositories. These packages are binaries, so no compilation is required of the user, and they are multi-language: a package could include Python, C, C++, R, Julia, or other languages. Anaconda software repositories are organized by **channel**. Beyond the "default" channel, the [conda-forge](https://conda-forge.org/) channel includes thousands of community-led packages. `conda` is the recommended way to get started with the Python geospatial data science ecosystem. @@ -46,25 +41,23 @@ conda env remove -n ox Read the `conda` [documentation](https://conda.io/) for more details. - ## Urban data science in a computational notebook During the course's initial software setup, you created a conda environment with all the required packages. The required packages are defined in the course's [environment file](../../environment.yml). These are the tools we will use all semester. All of the lectures and coursework will utilize Jupyter notebooks. 
These notebooks provide an interactive environment for working with code and have become standard in the data science world. [Read more](https://doi.org/10.22224/gistbok/2021.1.2). - ## Version control Distributed version control is central to modern analytics work in both research and practice. It allows (multiple) people to collaboratively develop source code while tracking changes. Today, git is the standard tool for version control and source code management. Sites like GitHub provide hosting for git repositories. GitHub Guides provides an excellent [introduction](https://guides.github.com/) to distributed version control with git, so I will not duplicate it here. Take some time to work through their lessons. You need to understand, at a minimum, how to: - - fork a repo - - clone a repo - - work with branches - - add/commit changes - - push and pull to/from a remote repo - - merge a feature branch into the main branch +- fork a repo +- clone a repo +- work with branches +- add/commit changes +- push and pull to/from a remote repo +- merge a feature branch into the main branch Start with their guides on the Git Handbook, Understanding the GitHub flow, Forking Projects, Mastering Markdown, and then explore from there. 
diff --git a/modules/03-python-data-science/lecture.ipynb b/modules/03-python-data-science/lecture.ipynb index 8186153..e4bbe8c 100644 --- a/modules/03-python-data-science/lecture.ipynb +++ b/modules/03-python-data-science/lecture.ipynb @@ -24,7 +24,6 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", "import pandas as pd" ] }, @@ -66,7 +65,7 @@ "outputs": [], "source": [ "# sequence of characters (str)\n", - "x = 'Los Angeles, CA 90089'\n", + "x = \"Los Angeles, CA 90089\"\n", "len(x)" ] }, @@ -77,7 +76,7 @@ "outputs": [], "source": [ "# list of items\n", - "x = [1, 2, 3, 'USC']\n", + "x = [1, 2, 3, \"USC\"]\n", "len(x)" ] }, @@ -121,7 +120,7 @@ "outputs": [], "source": [ "# dictionary of key:value pairs\n", - "person = {'first_name': 'Geoff', 'last_name': 'Boeing', 'employer': 'USC'}\n", + "person = {\"first_name\": \"Geoff\", \"last_name\": \"Boeing\", \"employer\": \"USC\"}\n", "type(person)" ] }, @@ -132,7 +131,7 @@ "outputs": [], "source": [ "# you can convert types\n", - "x = '100'\n", + "x = \"100\"\n", "print(type(x))\n", "y = int(x)\n", "print(type(y))" @@ -146,7 +145,7 @@ "source": [ "# you can loop through an iterable, such as a list or tuple\n", "for coord in latlng:\n", - " print('Current coordinate is:', coord)" + " print(\"Current coordinate is:\", coord)" ] }, { @@ -190,11 +189,11 @@ "# if, elif, else for conditional branching execution\n", "x = 101\n", "if x > 100:\n", - " print('Value is greater than 100.')\n", + " print(\"Value is greater than 100.\")\n", "elif x < 100:\n", - " print('Value is less than 100.')\n", + " print(\"Value is less than 100.\")\n", "else:\n", - " print('Value is 100.')" + " print(\"Value is 100.\")" ] }, { @@ -209,8 +208,9 @@ " new_list = [new_type(item) for item in my_list]\n", " return new_list\n", "\n", - "l = [1, 2, 3, 4]\n", - "convert_items(l)" + "\n", + "numbers_list = [1, 2, 3, 4]\n", + "convert_items(numbers_list)" ] }, { @@ -291,8 +291,10 @@ "outputs": [], "source": [ "# a dict can contain 
multiple lists and label them\n", - "my_dict = {'hh_income' : [75125, 22075, 31950, 115400],\n", - " 'home_value' : [525000, 275000, 395000, 985000]}\n", + "my_dict = {\n", + " \"hh_income\": [75125, 22075, 31950, 115400],\n", + " \"home_value\": [525000, 275000, 395000, 985000],\n", + "}\n", "my_dict" ] }, @@ -366,7 +368,7 @@ "# load a data file\n", "# note the relative filepath! where is this file located?\n", "# use dtype argument if you don't want pandas to guess your data types\n", - "df = pd.read_csv('../../data/world_cities.csv')" + "df = pd.read_csv(\"../../data/world_cities.csv\")" ] }, { @@ -424,7 +426,7 @@ "source": [ "# CHEAT SHEET OF COMMON TASKS\n", "# Operation Syntax Result\n", - "#------------------------------------------------------------\n", + "# ------------------------------------------------------------\n", "# Select column by name df[col] Series\n", "# Select columns by name df[col_list] DataFrame\n", "# Select row by label df.loc[label] Series\n", @@ -448,7 +450,7 @@ "source": [ "# select a single column by column name\n", "# this is a pandas series\n", - "df['resident_pop']" + "df[\"resident_pop\"]" ] }, { @@ -459,7 +461,7 @@ "source": [ "# select multiple columns by a list of column names\n", "# this is a pandas dataframe that is a subset of the original\n", - "df[['resident_pop', 'built_up_area']]" + "df[[\"resident_pop\", \"built_up_area\"]]" ] }, { @@ -469,13 +471,13 @@ "outputs": [], "source": [ "# create a new column by assigning df['new_col'] to some values\n", - "df['pop_density'] = df['resident_pop'] / df['built_up_area']\n", + "df[\"pop_density\"] = df[\"resident_pop\"] / df[\"built_up_area\"]\n", "\n", "# you can do vectorized math operations on any numeric columns\n", - "df['pop_density_1000s'] = df['pop_density'] / 1000\n", + "df[\"pop_density_1000s\"] = df[\"pop_density\"] / 1000\n", "\n", "# inspect the results\n", - "df[['resident_pop', 'built_up_area', 'pop_density', 'pop_density_1000s']].head()" + "df[[\"resident_pop\", 
\"built_up_area\", \"pop_density\", \"pop_density_1000s\"]].head()" ] }, { @@ -503,7 +505,7 @@ "outputs": [], "source": [ "# use .loc to select single value by row label, column name\n", - "df.loc[0, 'resident_pop']" + "df.loc[0, \"resident_pop\"]" ] }, { @@ -525,7 +527,7 @@ "source": [ "# slice of rows from label 5 to label 7, inclusive\n", "# slice of columns from uc_names to world_subregion, inclusive\n", - "df.loc[1:3, 'uc_names':'world_subregion']" + "df.loc[1:3, \"uc_names\":\"world_subregion\"]" ] }, { @@ -536,7 +538,7 @@ "source": [ "# subset of rows from with labels in list\n", "# subset of columns with names in list\n", - "df.loc[[1, 3], ['uc_names', 'world_subregion']]" + "df.loc[[1, 3], [\"uc_names\", \"world_subregion\"]]" ] }, { @@ -547,7 +549,7 @@ "source": [ "# you can use a column of identifiers as the index (indices do not *need* to be unique)\n", "# uc_id values uniquely identify each row (but verify!)\n", - "df = df.set_index('uc_id')\n", + "df = df.set_index(\"uc_id\")\n", "df.index.is_unique" ] }, @@ -569,8 +571,8 @@ "# .loc works by label, not by position in the dataframe\n", "try:\n", " df.loc[0]\n", - "except KeyError as e:\n", - " print('label not found')" + "except KeyError:\n", + " print(\"label not found\")" ] }, { @@ -638,7 +640,7 @@ "outputs": [], "source": [ "# filter the dataframe by urban areas with more than 25 million residents\n", - "df[df['resident_pop'] > 25000000]" + "df[df[\"resident_pop\"] > 25000000]" ] }, { @@ -648,7 +650,7 @@ "outputs": [], "source": [ "# what exactly did that do? 
let's break it out.\n", - "df['resident_pop'] > 25000000" + "df[\"resident_pop\"] > 25000000" ] }, { @@ -658,7 +660,7 @@ "outputs": [], "source": [ "# essentially a true/false mask that filters by value\n", - "mask = df['resident_pop'] > 25000000\n", + "mask = df[\"resident_pop\"] > 25000000\n", "df[mask]" ] }, @@ -671,7 +673,7 @@ "# you can chain multiple conditions together\n", "# pandas logical operators are: | for or, & for and, ~ for not\n", "# these must be grouped by using parentheses due to order of operations\n", - "mask = (df['resident_pop'] > 25000000) & (df['built_up_area'] > 2000)\n", + "mask = (df[\"resident_pop\"] > 25000000) & (df[\"built_up_area\"] > 2000)\n", "df[mask]" ] }, @@ -682,9 +684,9 @@ "outputs": [], "source": [ "# which urban areas have 25 million residents and either 2000 km2 area or >200 meter avg elevation?\n", - "mask1 = df['resident_pop'] > 25000000\n", - "mask2 = df['built_up_area'] > 2000\n", - "mask3 = df['avg_elevation'] > 200\n", + "mask1 = df[\"resident_pop\"] > 25000000\n", + "mask2 = df[\"built_up_area\"] > 2000\n", + "mask3 = df[\"avg_elevation\"] > 200\n", "mask = mask1 & (mask2 | mask3)\n", "df[mask]" ] @@ -716,8 +718,8 @@ "outputs": [], "source": [ "# which urban areas contain a city with \"New \" in its name?\n", - "mask = df['uc_names'].str.contains('New ')\n", - "df.loc[mask, ['uc_names', 'country']]" + "mask = df[\"uc_names\"].str.contains(\"New \")\n", + "df.loc[mask, [\"uc_names\", \"country\"]]" ] }, { @@ -754,8 +756,14 @@ "outputs": [], "source": [ "# create a subset dataframe with climate related variables\n", - "climate_cols = ['core_city', 'avg_elevation', 'avg_precipitation', 'avg_temperature', 'climate_classes']\n", - "df_climate = df[climate_cols].sample(2000).sort_values('avg_temperature', ascending=True)\n", + "climate_cols = [\n", + " \"core_city\",\n", + " \"avg_elevation\",\n", + " \"avg_precipitation\",\n", + " \"avg_temperature\",\n", + " \"climate_classes\",\n", + "]\n", + "df_climate = 
df[climate_cols].sample(2000).sort_values(\"avg_temperature\", ascending=True)\n", "df_climate.head()" ] }, @@ -766,8 +774,8 @@ "outputs": [], "source": [ "# create a subset dataframe with economic variables\n", - "econ_cols = ['core_city', 'gdp_ppp', 'night_light_em', 'un_income_class']\n", - "df_econ = df[econ_cols].sample(2000).sort_values('gdp_ppp', ascending=False)\n", + "econ_cols = [\"core_city\", \"gdp_ppp\", \"night_light_em\", \"un_income_class\"]\n", + "df_econ = df[econ_cols].sample(2000).sort_values(\"gdp_ppp\", ascending=False)\n", "df_econ.head()" ] }, @@ -778,7 +786,7 @@ "outputs": [], "source": [ "# merge them together, aligning rows based on their labels in the index\n", - "df_merged = pd.merge(left=df_econ, right=df_climate, how='inner', left_index=True, right_index=True)\n", + "df_merged = pd.merge(left=df_econ, right=df_climate, how=\"inner\", left_index=True, right_index=True)\n", "df_merged.head()" ] }, @@ -800,7 +808,7 @@ "outputs": [], "source": [ "# merge them together, aligning rows based on left's column values and right's index labels\n", - "df_merged = pd.merge(left=df_econ, right=df_climate, how='inner', left_on='uc_id', right_index=True)\n", + "df_merged = pd.merge(left=df_econ, right=df_climate, how=\"inner\", left_on=\"uc_id\", right_index=True)\n", "df_merged" ] }, @@ -832,8 +840,8 @@ "outputs": [], "source": [ "# create two subset dataframes\n", - "df_us = df[df['country'] == 'united_states']\n", - "df_uk = df[df['country'] == 'united_kingdom']" + "df_us = df[df[\"country\"] == \"united_states\"]\n", + "df_uk = df[df[\"country\"] == \"united_kingdom\"]" ] }, { @@ -861,8 +869,8 @@ "outputs": [], "source": [ "# calculate per capita GDP then group the rows by region\n", - "df['gdp_percap'] = df['gdp_ppp'] / df['resident_pop']\n", - "groups = df.groupby('world_subregion')" + "df[\"gdp_percap\"] = df[\"gdp_ppp\"] / df[\"resident_pop\"]\n", + "groups = df.groupby(\"world_subregion\")" ] }, { @@ -872,7 +880,7 @@ "outputs": [], "source": 
[ "# what is the median per capita GDP across the urban areas in each region?\n", - "groups['gdp_percap'].median().sort_values(ascending=False)" + "groups[\"gdp_percap\"].median().sort_values(ascending=False)" ] }, { @@ -882,7 +890,7 @@ "outputs": [], "source": [ "# look at several columns' medians by region\n", - "groups[['gdp_percap', 'avg_temperature', 'pop_density']].median()" + "groups[[\"gdp_percap\", \"avg_temperature\", \"pop_density\"]].median()" ] }, { @@ -892,7 +900,8 @@ "outputs": [], "source": [ "# now it's your turn\n", - "# re-group the urban areas by country and find the highest then lowest urban area avg elevation in each country" + "# re-group the urban areas by country\n", + "# and find the highest then lowest urban area avg elevation in each country" ] }, { @@ -905,9 +914,8 @@ "\n", " - avoid .iterrows() always\n", " - use vectorization wherever possible\n", - " - .map() applies a function element-wise on a Series\n", + " - .map() applies a function element-wise on a Series or DataFrame\n", " - .apply() works on a row or column basis on a DataFrame (specify the axis!), or element-wise on a Series\n", - " - .applymap() works element-wise on an entire DataFrame\n", "\n", "Let's see what that means in practice with some examples." 
] @@ -919,7 +927,7 @@ "outputs": [], "source": [ "# calculate resident population z-scores, vectorized\n", - "z = (df['resident_pop'] - df['resident_pop'].mean()) / df['resident_pop'].std()\n", + "z = (df[\"resident_pop\"] - df[\"resident_pop\"].mean()) / df[\"resident_pop\"].std()\n", "z" ] }, @@ -936,13 +944,16 @@ "metadata": {}, "outputs": [], "source": [ - "mean = df['resident_pop'].mean()\n", - "std = df['resident_pop'].std()\n", + "mean = df[\"resident_pop\"].mean()\n", + "std = df[\"resident_pop\"].std()\n", + "\n", + "\n", "def calculate_zscore(x, mean=mean, std=std):\n", " return (x - mean) / std\n", - " \n", + "\n", + "\n", "# map the function to the series\n", - "z = df['resident_pop'].map(calculate_zscore)" + "z = df[\"resident_pop\"].map(calculate_zscore)" ] }, { @@ -958,11 +969,11 @@ "metadata": {}, "outputs": [], "source": [ - "mean = df['resident_pop'].mean()\n", - "std = df['resident_pop'].std()\n", + "mean = df[\"resident_pop\"].mean()\n", + "std = df[\"resident_pop\"].std()\n", "\n", "# map a lambda function to the series\n", - "z = df['resident_pop'].map(lambda x: (x - mean) / std)" + "z = df[\"resident_pop\"].map(lambda x: (x - mean) / std)" ] }, { @@ -973,7 +984,7 @@ "source": [ "%%timeit\n", "# check timings of vectorized vs map\n", - "z = (df['resident_pop'] - df['resident_pop'].mean()) / df['resident_pop'].std()" + "z = (df[\"resident_pop\"] - df[\"resident_pop\"].mean()) / df[\"resident_pop\"].std()" ] }, { @@ -983,9 +994,9 @@ "outputs": [], "source": [ "%%timeit\n", - "mean = df['resident_pop'].mean()\n", - "std = df['resident_pop'].std()\n", - "z = df['resident_pop'].map(lambda x: (x - mean) / std)" + "mean = df[\"resident_pop\"].mean()\n", + "std = df[\"resident_pop\"].std()\n", + "z = df[\"resident_pop\"].map(lambda x: (x - mean) / std)" ] }, { @@ -1002,8 +1013,8 @@ "outputs": [], "source": [ "# find the difference between the min and max values in each column (ie, row-wise)\n", - "df_subset = df[['area', 'built_up_area', 
'avg_elevation']]\n", - "df_subset.apply(lambda col: col.max() - col.min(), axis='rows')" + "df_subset = df[[\"area\", \"built_up_area\", \"avg_elevation\"]]\n", + "df_subset.apply(lambda col: col.max() - col.min(), axis=\"rows\")" ] }, { @@ -1013,7 +1024,7 @@ "outputs": [], "source": [ "# find the difference between the min and max values in each row (ie, column-wise)\n", - "df_subset.apply(lambda row: row.max() - row.min(), axis='columns')" + "df_subset.apply(lambda row: row.max() - row.min(), axis=\"columns\")" ] }, { @@ -1031,7 +1042,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`.applymap()` works element-wise on an entire DataFrame. This is like doing a `.map()` to each column in the DataFrame." + "`.map()` also works element-wise on an entire DataFrame. This is like doing a `.map()` to each column in the DataFrame." ] }, { @@ -1040,10 +1051,10 @@ "metadata": {}, "outputs": [], "source": [ - "# this uses applymap, but you could (should) vectorize it\n", + "# this uses map, but you could (should) vectorize it\n", "# we'll see that next week\n", - "df_subset = df[['country', 'world_region', 'world_subregion']]\n", - "df_subset.applymap(lambda x: x.upper().replace('_', ' '))" + "df_subset = df[[\"country\", \"world_region\", \"world_subregion\"]]\n", + "df_subset.map(lambda x: x.upper().replace(\"_\", \" \"))" ] }, { @@ -1063,7 +1074,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = df.reset_index().set_index(['country', 'core_city']).sort_index()\n", + "df = df.reset_index().set_index([\"country\", \"core_city\"]).sort_index()\n", "df" ] }, @@ -1092,7 +1103,7 @@ "outputs": [], "source": [ "# select all urban areas in china (ie, first index level)\n", - "df.loc['china']" + "df.loc[\"china\"]" ] }, { @@ -1103,7 +1114,7 @@ "source": [ "# or select rows by multiple index levels\n", "# lots of unnamed core cities in china in this dataset\n", - "df.loc[('china', 'unnamed')]" + "df.loc[(\"china\", \"unnamed\")]" ] }, { @@ -1115,7 +1126,7 @@ "# 
select every row with an unnamed core city in the dataset\n", "# that is, only select by the 2nd level of the index\n", "# the first : slices everything in the first index level, and the trailing : slices all columns\n", - "df.loc[pd.IndexSlice[:, ['unnamed']], :]" + "df.loc[pd.IndexSlice[:, [\"unnamed\"]], :]" ] }, { @@ -1125,7 +1136,7 @@ "outputs": [], "source": [ "# select every row in argentina or spain with core city of cordoba\n", - "df.loc[(['argentina', 'spain'], ['cordoba']), :]" + "df.loc[([\"argentina\", \"spain\"], [\"cordoba\"]), :]" ] }, { @@ -1156,9 +1167,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python (ppd599)", + "display_name": "Python (ppde642)", "language": "python", - "name": "ppd599" + "name": "ppde642" }, "language_info": { "codemirror_mode": { @@ -1170,7 +1181,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/modules/04-data-cleaning-exploration/lecture.ipynb b/modules/04-data-cleaning-exploration/lecture.ipynb index 4188d40..1f9317e 100644 --- a/modules/04-data-cleaning-exploration/lecture.ipynb +++ b/modules/04-data-cleaning-exploration/lecture.ipynb @@ -47,7 +47,7 @@ "outputs": [], "source": [ "# load the data\n", - "df = pd.read_csv('../../data/LA_County_Covid19_CSA_case_death_table.csv')\n", + "df = pd.read_csv(\"../../data/LA_County_Covid19_CSA_case_death_table.csv\")\n", "df.shape" ] }, @@ -78,7 +78,7 @@ "outputs": [], "source": [ "# drop the duplicate IDs and rename the place column to something meaningful\n", - "df = df.drop(columns=['Unnamed: 0']).rename(columns={'geo_merge':'place_name'})\n", + "df = df.drop(columns=[\"Unnamed: 0\"]).rename(columns={\"geo_merge\": \"place_name\"})\n", "df" ] }, @@ -89,8 +89,13 @@ "outputs": [], "source": [ "# clean up place names\n", - "df['place_name'] = df['place_name'].str.replace('City of ', '').str.replace('Unincorporated - ', '').str.replace('Los Angeles - ', 
'')\n", - "df.sort_values('place_name')" + "df[\"place_name\"] = (\n", + " df[\"place_name\"]\n", + " .str.replace(\"City of \", \"\")\n", + " .str.replace(\"Unincorporated - \", \"\")\n", + " .str.replace(\"Los Angeles - \", \"\")\n", + ")\n", + "df.sort_values(\"place_name\")" ] }, { @@ -128,7 +133,7 @@ "outputs": [], "source": [ "# load the data\n", - "df = pd.read_csv('../../data/Top_County_Earners.csv')\n", + "df = pd.read_csv(\"../../data/Top_County_Earners.csv\")\n", "df.shape" ] }, @@ -169,7 +174,7 @@ "outputs": [], "source": [ "# rename the total earnings column to something that won't trip you up\n", - "df = df.rename(columns={' Total Earnings':'Total Earnings'})" + "df = df.rename(columns={\" Total Earnings\": \"Total Earnings\"})" ] }, { @@ -180,11 +185,11 @@ "source": [ "# convert the float columns to ints: a couple ways you could do it (either works)...\n", "# OPTION 1: use IndexSlice from last week's lecture\n", - "slicer = pd.IndexSlice[:, 'Base Earnings':'Total Compensation']\n", + "slicer = pd.IndexSlice[:, \"Base Earnings\":\"Total Compensation\"]\n", "df.loc[slicer] = df.loc[slicer].astype(int)\n", "\n", "# OPTION 2: select columns where type is float64\n", - "float_cols = df.columns[df.dtypes=='float64']\n", + "float_cols = df.columns[df.dtypes == \"float64\"]\n", "df[float_cols] = df[float_cols].astype(int)" ] }, @@ -207,7 +212,7 @@ "outputs": [], "source": [ "# convert from USD to 1000s of USD\n", - "df['Total Compensation 1000s'] = df['Total Compensation'] / 1000" + "df[\"Total Compensation 1000s\"] = df[\"Total Compensation\"] / 1000" ] }, { @@ -217,8 +222,8 @@ "outputs": [], "source": [ "# improve the capitalization (note, only Series can do vectorized str methods)\n", - "slicer = pd.IndexSlice[:, 'Employee Name':'Department']\n", - "df.loc[slicer] = df.loc[slicer].apply(lambda col: col.str.title(), axis='rows')\n", + "slicer = pd.IndexSlice[:, \"Employee Name\":\"Department\"]\n", + "df.loc[slicer] = df.loc[slicer].apply(lambda col: 
col.str.title(), axis=\"rows\")\n", "df" ] }, @@ -266,7 +271,7 @@ "outputs": [], "source": [ "# load the data\n", - "df = pd.read_csv('../../data/Listing_of_Active_Businesses.csv')\n", + "df = pd.read_csv(\"../../data/Listing_of_Active_Businesses.csv\")\n", "df.shape" ] }, @@ -299,7 +304,7 @@ "# you have to make a decision: NAICS should be int, but it contains nulls\n", "# you could drop nulls then convert to int, or just leave it as float\n", "# OR in recent versions of pandas, you could cast to type pd.Int64Dtype() which allows nulls\n", - "pd.isnull(df['NAICS']).sum()" + "pd.isnull(df[\"NAICS\"]).sum()" ] }, { @@ -309,8 +314,8 @@ "outputs": [], "source": [ "# make sure end dates are all null, then drop that column\n", - "assert pd.isnull(df['LOCATION END DATE']).all()\n", - "df = df.drop(columns=['LOCATION END DATE'])" + "assert pd.isnull(df[\"LOCATION END DATE\"]).all()\n", + "df = df.drop(columns=[\"LOCATION END DATE\"])" ] }, { @@ -320,7 +325,7 @@ "outputs": [], "source": [ "# make the column names lower case and without spaces or hash signs\n", - "cols = df.columns.str.lower().str.replace(' ', '_').str.strip('_#')\n", + "cols = df.columns.str.lower().str.replace(\" \", \"_\").str.strip(\"_#\")\n", "df.columns = cols" ] }, @@ -331,8 +336,8 @@ "outputs": [], "source": [ "# make sure account numbers are unique, then set as index and sort index\n", - "assert df['location_account'].is_unique\n", - "df = df.set_index('location_account').sort_index()\n", + "assert df[\"location_account\"].is_unique\n", + "df = df.set_index(\"location_account\").sort_index()\n", "df" ] }, @@ -343,7 +348,7 @@ "outputs": [], "source": [ "# convert the start date from strings to datetimes\n", - "df['location_start_date'] = pd.to_datetime(df['location_start_date'])" + "df[\"location_start_date\"] = pd.to_datetime(df[\"location_start_date\"])" ] }, { @@ -353,8 +358,8 @@ "outputs": [], "source": [ "# improve the capitalization\n", - "slicer = pd.IndexSlice[:, 
'business_name':'mailing_city']\n", - "df.loc[slicer] = df.loc[slicer].apply(lambda col: col.str.title(), axis='rows')\n", + "slicer = pd.IndexSlice[:, \"business_name\":\"mailing_city\"]\n", + "df.loc[slicer] = df.loc[slicer].apply(lambda col: col.str.title(), axis=\"rows\")\n", "df" ] }, @@ -365,7 +370,7 @@ "outputs": [], "source": [ "# what's going on with those location coordinates?\n", - "df['location'].iloc[0]" + "df[\"location\"].iloc[0]" ] }, { @@ -386,12 +391,12 @@ "metadata": {}, "outputs": [], "source": [ - "mask = pd.notnull(df['location'])\n", - "latlng = df.loc[mask, 'location'].map(ast.literal_eval)\n", - "df.loc[mask, ['lat', 'lng']] = pd.DataFrame(latlng.to_list(),\n", - " index=latlng.index,\n", - " columns=['lat', 'lng'])\n", - "df = df.drop(columns=['location'])\n", + "mask = pd.notnull(df[\"location\"])\n", + "latlng = df.loc[mask, \"location\"].map(ast.literal_eval)\n", + "df.loc[mask, [\"lat\", \"lng\"]] = pd.DataFrame(\n", + " latlng.to_list(), index=latlng.index, columns=[\"lat\", \"lng\"]\n", + ")\n", + "df = df.drop(columns=[\"location\"])\n", "df" ] }, @@ -441,8 +446,8 @@ "outputs": [], "source": [ "# configure seaborn's style for subsequent use\n", - "sns.set_style('whitegrid') #visual styles\n", - "sns.set_context('paper') #presets for scaling figure element sizes" + "sns.set_style(\"whitegrid\") # visual styles\n", + "sns.set_context(\"paper\") # presets for scaling figure element sizes" ] }, { @@ -472,7 +477,7 @@ "source": [ "# quick descriptive stats for some variable\n", "# but... 
looking across the whole population obscures between-group heterogeneity\n", - "df_earnings['Total Compensation 1000s'].describe()" + "df_earnings[\"Total Compensation 1000s\"].describe()" ] }, { @@ -482,7 +487,7 @@ "outputs": [], "source": [ "# which departments have the most employees in the data set?\n", - "dept_counts = df_earnings['Department'].value_counts().head()\n", + "dept_counts = df_earnings[\"Department\"].value_counts().head()\n", "dept_counts" ] }, @@ -494,8 +499,8 @@ "source": [ "# recall grouping and summarizing from last week\n", "# look at compensation distribution across the 5 largest departments\n", - "mask = df_earnings['Department'].isin(dept_counts.index)\n", - "df_earnings.loc[mask].groupby('Department')['Total Compensation 1000s'].describe().astype(int)" + "mask = df_earnings[\"Department\"].isin(dept_counts.index)\n", + "df_earnings.loc[mask].groupby(\"Department\")[\"Total Compensation 1000s\"].describe().astype(int)" ] }, { @@ -514,21 +519,21 @@ "outputs": [], "source": [ "# visualize compensation distribution across the 5 largest departments\n", - "x = df_earnings.loc[mask, 'Total Compensation 1000s']\n", - "y = df_earnings.loc[mask, 'Department']\n", + "x = df_earnings.loc[mask, \"Total Compensation 1000s\"]\n", + "y = df_earnings.loc[mask, \"Department\"]\n", "\n", "# fliersize changes the size of the outlier dots\n", "# boxprops lets you set more configs with a dict, such as alpha (which means opacity)\n", - "ax = sns.boxplot(x=x, y=y, fliersize=0.3, boxprops={'alpha':0.7})\n", + "ax = sns.boxplot(x=x, y=y, fliersize=0.3, boxprops={\"alpha\": 0.7})\n", "\n", "# set the x-axis limit, the figure title, and x/y axis labels\n", "ax.set_xlim(left=0)\n", - "ax.set_title('Total compensation by department')\n", - "ax.set_xlabel('Total compensation (USD, 1000s)')\n", - "ax.set_ylabel('')\n", + "ax.set_title(\"Total compensation by department\")\n", + "ax.set_xlabel(\"Total compensation (USD, 1000s)\")\n", + "ax.set_ylabel(\"\")\n", "\n", "# 
save figure to disk at 300 dpi and with a tight bounding box\n", - "ax.get_figure().savefig('boxplot-earnings.png', dpi=300, bbox_inches='tight')" + "ax.get_figure().savefig(\"boxplot-earnings.png\", dpi=300, bbox_inches=\"tight\")" ] }, { @@ -569,7 +574,7 @@ "source": [ "# manually change the plot's size/dimension by adjusting its figure's size\n", "fig = ax.get_figure()\n", - "fig.set_size_inches(16, 4) #width, height in inches\n", + "fig.set_size_inches(16, 4) # width, height in inches\n", "fig" ] }, @@ -587,7 +592,7 @@ "outputs": [], "source": [ "# histplot visualizes the variable's distribution as a histogram and optionally a KDE\n", - "ax = sns.histplot(df_earnings['Total Compensation 1000s'].dropna(), kde=False, bins=30)\n", + "ax = sns.histplot(df_earnings[\"Total Compensation 1000s\"].dropna(), kde=False, bins=30)\n", "_ = ax.set_xlim(left=0)" ] }, @@ -605,7 +610,9 @@ "outputs": [], "source": [ "# typical LASD employee earns more than the typical regional planner :(\n", - "df_earnings.groupby('Department')['Total Compensation 1000s'].median().sort_values(ascending=False).head(10)" + "df_earnings.groupby(\"Department\")[\"Total Compensation 1000s\"].median().sort_values(\n", + " ascending=False\n", + ").head(10)" ] }, { @@ -615,17 +622,15 @@ "outputs": [], "source": [ "# visually compare sheriff and social services dept subsets\n", - "mask = df_earnings['Department'].isin(['Public Social Services Dept', 'Sheriff'])\n", - "ax = sns.histplot(data=df_earnings.loc[mask],\n", - " x='Total Compensation 1000s',\n", - " hue='Department',\n", - " bins=50,\n", - " kde=False)\n", + "mask = df_earnings[\"Department\"].isin([\"Public Social Services Dept\", \"Sheriff\"])\n", + "ax = sns.histplot(\n", + " data=df_earnings.loc[mask], x=\"Total Compensation 1000s\", hue=\"Department\", bins=50, kde=False\n", + ")\n", "\n", "ax.set_xlim(0, 400)\n", - "ax.set_xlabel('Total compensation (USD, 1000s)')\n", - "ax.set_title('Employee Compensation: LASD vs Social Services')\n", - 
"ax.get_figure().savefig('boxplot-hists.png', dpi=300, bbox_inches='tight')" + "ax.set_xlabel(\"Total compensation (USD, 1000s)\")\n", + "ax.set_title(\"Employee Compensation: LASD vs Social Services\")\n", + "ax.get_figure().savefig(\"boxplot-hists.png\", dpi=300, bbox_inches=\"tight\")" ] }, { @@ -642,9 +647,11 @@ "outputs": [], "source": [ "# difference-in-means: compute difference, t-statistic, and p-value\n", - "group1 = df_earnings[df_earnings['Department']=='Public Social Services Dept']['Total Compensation 1000s']\n", - "group2 = df_earnings[df_earnings['Department']=='Sheriff']['Total Compensation 1000s']\n", - "t, p = stats.ttest_ind(group1, group2, equal_var=False, nan_policy='omit')\n", + "group1 = df_earnings[df_earnings[\"Department\"] == \"Public Social Services Dept\"][\n", + " \"Total Compensation 1000s\"\n", + "]\n", + "group2 = df_earnings[df_earnings[\"Department\"] == \"Sheriff\"][\"Total Compensation 1000s\"]\n", + "t, p = stats.ttest_ind(group1, group2, equal_var=False, nan_policy=\"omit\")\n", "print(group1.mean() - group2.mean(), t, p)" ] }, @@ -664,8 +671,8 @@ "outputs": [], "source": [ "# the big reveal... 
who (individually) had the highest earnings?\n", - "cols = ['Employee Name', 'Position Title', 'Department', 'Total Compensation 1000s']\n", - "df_earnings[cols].sort_values('Total Compensation 1000s', ascending=False).head(10)" + "cols = [\"Employee Name\", \"Position Title\", \"Department\", \"Total Compensation 1000s\"]\n", + "df_earnings[cols].sort_values(\"Total Compensation 1000s\", ascending=False).head(10)" ] }, { @@ -703,11 +710,10 @@ "outputs": [], "source": [ "# use seaborn to scatter-plot two variables\n", - "ax = sns.scatterplot(x=df_covid['cases_final'],\n", - " y=df_covid['deaths_final'])\n", + "ax = sns.scatterplot(x=df_covid[\"cases_final\"], y=df_covid[\"deaths_final\"])\n", "ax.set_xlim(left=0)\n", "ax.set_ylim(bottom=0)\n", - "ax.get_figure().set_size_inches(5, 5) #make it square" + "ax.get_figure().set_size_inches(5, 5) # make it square" ] }, { @@ -717,7 +723,7 @@ "outputs": [], "source": [ "# show a pair plot of these SF tracts across these 4 variables\n", - "cols = ['cases_final', 'deaths_final', 'population']\n", + "cols = [\"cases_final\", \"deaths_final\", \"population\"]\n", "ax = sns.pairplot(df_covid[cols].dropna())" ] }, @@ -735,7 +741,7 @@ "outputs": [], "source": [ "# calculate correlation (and significance) between two variables\n", - "r, p = stats.pearsonr(x=df_covid['population'], y=df_covid['cases_final'])\n", + "r, p = stats.pearsonr(x=df_covid[\"population\"], y=df_covid[\"cases_final\"])\n", "print(round(r, 3), round(p, 3))" ] }, @@ -758,8 +764,9 @@ "source": [ "# visual correlation matrix via seaborn heatmap\n", "# use vmin, vmax, center to set colorbar scale properly\n", - "ax = sns.heatmap(correlations, vmin=-1, vmax=1, center=0,\n", - " cmap='coolwarm', square=True, linewidths=1)" + "ax = sns.heatmap(\n", + " correlations, vmin=-1, vmax=1, center=0, cmap=\"coolwarm\", square=True, linewidths=1\n", + ")" ] }, { @@ -780,7 +787,7 @@ "outputs": [], "source": [ "# regress one variable on another: a change in x is associated 
with what change in y?\n", - "m, b, r, p, se = stats.linregress(x=df_covid['population'], y=df_covid['cases_final'])\n", + "m, b, r, p, se = stats.linregress(x=df_covid[\"population\"], y=df_covid[\"cases_final\"])\n", "print(m, b, r, p, se)" ] }, @@ -791,7 +798,7 @@ "outputs": [], "source": [ "# a linear (regression) trend line + confidence interval\n", - "ax = sns.regplot(x=df_covid['population'], y=df_covid['cases_final'])\n", + "ax = sns.regplot(x=df_covid[\"population\"], y=df_covid[\"cases_final\"])\n", "ax.get_figure().set_size_inches(5, 5)" ] }, @@ -823,7 +830,7 @@ "outputs": [], "source": [ "# extract the two-digit sector code from each NAICS classification\n", - "sectors = df_business['naics'].dropna().astype(int).astype(str).str.slice(0, 2)\n", + "sectors = df_business[\"naics\"].dropna().astype(int).astype(str).str.slice(0, 2)\n", "sectors" ] }, @@ -835,10 +842,10 @@ "source": [ "# count plot: like a histogram counting observations across categorical instead of continuous data\n", "order = sectors.value_counts().index\n", - "ax = sns.countplot(x=sectors, order=order, alpha=0.9, palette='plasma')\n", - "ax.set_xlabel('NAICS Sector')\n", - "ax.set_ylabel('Number of businesses')\n", - "ax.get_figure().savefig('countplot-naics.png', dpi=300, bbox_inches='tight')" + "ax = sns.countplot(x=sectors, order=order, alpha=0.9, palette=\"plasma\")\n", + "ax.set_xlabel(\"NAICS Sector\")\n", + "ax.set_ylabel(\"Number of businesses\")\n", + "ax.get_figure().savefig(\"countplot-naics.png\", dpi=300, bbox_inches=\"tight\")" ] }, { @@ -855,16 +862,23 @@ "outputs": [], "source": [ "# bar plot: estimate mean total compensation per dept + 95% confidence interval\n", - "order = df_earnings.groupby('Department')['Total Compensation 1000s'].mean().sort_values(ascending=False).index\n", - "ax = sns.barplot(x=df_earnings['Total Compensation 1000s'],\n", - " y=df_earnings['Department'],\n", - " estimator=np.mean,\n", - " errorbar=('ci', 95),\n", - " order=order,\n", - " 
alpha=0.9)\n", + "order = (\n", + " df_earnings.groupby(\"Department\")[\"Total Compensation 1000s\"]\n", + " .mean()\n", + " .sort_values(ascending=False)\n", + " .index\n", + ")\n", + "ax = sns.barplot(\n", + " x=df_earnings[\"Total Compensation 1000s\"],\n", + " y=df_earnings[\"Department\"],\n", + " estimator=np.mean,\n", + " errorbar=(\"ci\", 95),\n", + " order=order,\n", + " alpha=0.9,\n", + ")\n", "\n", - "ax.set_xlabel('Mean Total Compensation (USD, 1000s)')\n", - "ax.set_ylabel('')\n", + "ax.set_xlabel(\"Mean Total Compensation (USD, 1000s)\")\n", + "ax.set_ylabel(\"\")\n", "ax.get_figure().set_size_inches(4, 12)" ] }, @@ -894,7 +908,7 @@ "outputs": [], "source": [ "# extract years from each start date then count their appearances\n", - "years = df_business['location_start_date'].dropna().dt.year.value_counts().sort_index()\n", + "years = df_business[\"location_start_date\"].dropna().dt.year.value_counts().sort_index()\n", "years" ] }, @@ -920,14 +934,14 @@ "ax = sns.lineplot(data=years.loc[1980:2020])\n", "\n", "# rotate the tick labels\n", - "ax.tick_params(axis='x', labelrotation=45)\n", + "ax.tick_params(axis=\"x\", labelrotation=45)\n", "\n", "ax.set_xlim(1980, 2020)\n", "ax.set_ylim(bottom=0)\n", - "ax.set_xlabel('Year')\n", - "ax.set_ylabel('Count')\n", - "ax.set_title('Business Location Starts by Year')\n", - "ax.get_figure().savefig('lineplot-businesses.png', dpi=300, bbox_inches='tight')" + "ax.set_xlabel(\"Year\")\n", + "ax.set_ylabel(\"Count\")\n", + "ax.set_title(\"Business Location Starts by Year\")\n", + "ax.get_figure().savefig(\"lineplot-businesses.png\", dpi=300, bbox_inches=\"tight\")" ] }, { @@ -951,9 +965,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python (ppd599)", + "display_name": "Python (ppde642)", "language": "python", - "name": "ppd599" + "name": "ppde642" }, "language_info": { "codemirror_mode": { @@ -965,7 +979,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": 
"3.10.8" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/modules/05-geocoding-apis/README.md b/modules/05-geocoding-apis/README.md index d15fd1d..ce59dbf 100644 --- a/modules/05-geocoding-apis/README.md +++ b/modules/05-geocoding-apis/README.md @@ -6,9 +6,9 @@ You'll need a Google API key to use the Google Maps Geocoding API and the Google Complete the following steps before the class session: - 1. Go to the [Google API console](https://console.developers.google.com/). - 1. Sign in with your personal account (not a work or school Google account), create a new project for class, then click enable APIs. - 1. Enable the Google Maps Geocoding API and then the Google Places API. - 1. Go to billing and set up a credit card (you will not be billed for the usage in class, but these APIs require a card on file). - 1. Go to credentials, create an API key, then copy it. - 1. Rename the file `keys-example.py` (in this folder) to `keys.py`, open it, and replace the example API key with the one you copied above. +1. Go to the [Google API console](https://console.developers.google.com/). +1. Sign in with your personal account (not a work or school Google account), create a new project for class, then click enable APIs. +1. Enable the Google Maps Geocoding API and then the Google Places API. +1. Go to billing and set up a credit card (you will not be billed for the usage in class, but these APIs require a card on file). +1. Go to credentials, create an API key, then copy it. +1. Rename the file `keys-example.py` (in this folder) to `keys.py`, open it, and replace the example API key with the one you copied above. 
diff --git a/modules/05-geocoding-apis/lecture.ipynb b/modules/05-geocoding-apis/lecture.ipynb index 5df132b..365334c 100644 --- a/modules/05-geocoding-apis/lecture.ipynb +++ b/modules/05-geocoding-apis/lecture.ipynb @@ -29,7 +29,6 @@ "\n", "import folium\n", "import geopandas as gpd\n", - "import matplotlib.pyplot as plt\n", "import osmnx as ox\n", "import pandas as pd\n", "import requests\n", @@ -58,7 +57,7 @@ "outputs": [], "source": [ "# what is your current public IP address?\n", - "url = 'https://api.ipify.org?format=json'\n", + "url = \"https://api.ipify.org?format=json\"\n", "data = requests.get(url).json()\n", "data" ] @@ -70,7 +69,7 @@ "outputs": [], "source": [ "# and what is the location of that IP address?\n", - "url = 'http://ip-api.com/json/{}'.format(data['ip'])\n", + "url = \"http://ip-api.com/json/{}\".format(data[\"ip\"])\n", "requests.get(url).json()" ] }, @@ -88,16 +87,16 @@ "outputs": [], "source": [ "# query for the forecast url for a pair of lat-lng coords\n", - "location = '34.019268,-118.283554'\n", - "url = 'https://api.weather.gov/points/{}'.format(location)\n", + "location = \"34.019268,-118.283554\"\n", + "url = \"https://api.weather.gov/points/{}\".format(location)\n", "data = requests.get(url).json()\n", "\n", "# extract the forecast url and retrieve it\n", - "forecast_url = data['properties']['forecast']\n", + "forecast_url = data[\"properties\"][\"forecast\"]\n", "forecast = requests.get(forecast_url).json()\n", "\n", "# convert the forecast to a dataframe\n", - "pd.DataFrame(forecast['properties']['periods']).head()" + "pd.DataFrame(forecast[\"properties\"][\"periods\"]).head()" ] }, { @@ -117,10 +116,10 @@ "source": [ "# median household income by tract in santa monica\n", "# https://api.census.gov/data/2020/acs/acs5/variables/B19013_001E.json\n", - "sm = products.ACS(2017).from_place(place='Santa Monica, CA',\n", - " level='tract',\n", - " variables=['B19013_001E'])\n", - "ax = sm.dropna(subset=['B19013_001E'], 
axis=0).plot('B19013_001E', cmap='plasma')" + "sm = products.ACS(2017).from_place(\n", + " place=\"Santa Monica, CA\", level=\"tract\", variables=[\"B19013_001E\"]\n", + ")\n", + "ax = sm.dropna(subset=[\"B19013_001E\"], axis=0).plot(\"B19013_001E\", cmap=\"plasma\")" ] }, { @@ -143,7 +142,7 @@ "outputs": [], "source": [ "# geocode a place name to lat-lng\n", - "place = 'University of Southern California'\n", + "place = \"University of Southern California\"\n", "latlng = ox.geocode(place)\n", "latlng" ] @@ -155,11 +154,15 @@ "outputs": [], "source": [ "# geocode a series of place names to lat-lng\n", - "places = pd.Series(['San Diego, California',\n", - " 'Los Angeles, California',\n", - " 'San Francisco, California',\n", - " 'Seattle, Washington',\n", - " 'Vancouver, British Columbia'])\n", + "places = pd.Series(\n", + " [\n", + " \"San Diego, California\",\n", + " \"Los Angeles, California\",\n", + " \"San Francisco, California\",\n", + " \"Seattle, Washington\",\n", + " \"Vancouver, British Columbia\",\n", + " ]\n", + ")\n", "coords = places.map(ox.geocode)" ] }, @@ -170,9 +173,9 @@ "outputs": [], "source": [ "# parse out lats and lngs to individual columns in a dataframe\n", - "pd.DataFrame({'place': places,\n", - " 'lat': coords.map(lambda x: x[0]),\n", - " 'lng': coords.map(lambda x: x[1])})" + "pd.DataFrame(\n", + " {\"place\": places, \"lat\": coords.map(lambda x: x[0]), \"lng\": coords.map(lambda x: x[1])}\n", + ")" ] }, { @@ -213,7 +216,7 @@ "outputs": [], "source": [ "# extract the value from row 0's geometry column\n", - "polygon = gdf['geometry'].iloc[0]\n", + "polygon = gdf[\"geometry\"].iloc[0]\n", "polygon" ] }, @@ -231,7 +234,7 @@ "outputs": [], "source": [ "# get all the buildings within that polygon\n", - "tags = {'building': True}\n", + "tags = {\"building\": True}\n", "gdf_bldg = ox.geometries_from_polygon(polygon, tags)\n", "gdf_bldg.shape" ] @@ -272,7 +275,7 @@ "outputs": [], "source": [ "# geocode an address to lat-lng\n", - "address = '704 
S Alvarado St, Los Angeles, California'\n", + "address = \"704 S Alvarado St, Los Angeles, California\"\n", "latlng = ox.geocode(address)\n", "latlng" ] @@ -294,9 +297,14 @@ "metadata": {}, "outputs": [], "source": [ - "locations = pd.DataFrame(['704 S Alvarado St, Los Angeles, CA',\n", - " '100 Larkin St, San Francisco, CA',\n", - " '350 5th Ave, New York, NY'], columns=['address'])\n", + "locations = pd.DataFrame(\n", + " [\n", + " \"704 S Alvarado St, Los Angeles, CA\",\n", + " \"100 Larkin St, San Francisco, CA\",\n", + " \"350 5th Ave, New York, NY\",\n", + " ],\n", + " columns=[\"address\"],\n", + ")\n", "locations" ] }, @@ -307,26 +315,28 @@ "outputs": [], "source": [ "# function accepts an address string, sends it to Google API, returns lat-lng result\n", + "\n", + "\n", "def geocode(address, print_url=False):\n", - " \n", " # pause for some duration before each request, to not hammer their server\n", " time.sleep(pause)\n", - " \n", + "\n", " # api url with placeholders to fill in with variables' values\n", - " url_template = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'\n", + " url_template = \"https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}\"\n", " url = url_template.format(address, google_api_key)\n", - " if print_url: print(url)\n", - " \n", + " if print_url:\n", + " print(url)\n", + "\n", " # send request to server, get response, and convert json string to dict\n", " data = requests.get(url).json()\n", - " \n", + "\n", " # if results were returned, extract lat-lng from top result\n", - " if len(data['results']) > 0:\n", - " lat = data['results'][0]['geometry']['location']['lat']\n", - " lng = data['results'][0]['geometry']['location']['lng']\n", - " \n", + " if len(data[\"results\"]) > 0:\n", + " lat = data[\"results\"][0][\"geometry\"][\"location\"][\"lat\"]\n", + " lng = data[\"results\"][0][\"geometry\"][\"location\"][\"lng\"]\n", + "\n", " # return lat-lng as a string\n", - " return '{},{}'.format(lat, 
lng)" + " return \"{},{}\".format(lat, lng)" ] }, { @@ -336,7 +346,7 @@ "outputs": [], "source": [ "# test the function\n", - "geocode('350 5th Ave, New York, NY')" + "geocode(\"350 5th Ave, New York, NY\")" ] }, { @@ -346,7 +356,7 @@ "outputs": [], "source": [ "# for each value in the address column, geocode it, save results as new column\n", - "locations['latlng'] = locations['address'].map(geocode)\n", + "locations[\"latlng\"] = locations[\"address\"].map(geocode)\n", "locations" ] }, @@ -357,7 +367,7 @@ "outputs": [], "source": [ "# parse the result into separate lat and lng columns, if desired\n", - "locations[['lat', 'lng']] = pd.DataFrame(data=locations['latlng'].str.split(',').to_list())\n", + "locations[[\"lat\", \"lng\"]] = pd.DataFrame(data=locations[\"latlng\"].str.split(\",\").to_list())\n", "locations" ] }, @@ -391,16 +401,18 @@ "outputs": [], "source": [ "# google places API URL, with placeholders\n", - "url_template = 'https://maps.googleapis.com/maps/api/place/search/json?keyword={}&location={}&radius={}&key={}'\n", + "url_template = (\n", + " \"https://maps.googleapis.com/maps/api/place/search/json?keyword={}&location={}&radius={}&key={}\"\n", + ")\n", "\n", "# what keyword to search for\n", - "keyword = 'restaurant'\n", + "keyword = \"restaurant\"\n", "\n", "# define the radius (in meters) for the search\n", "radius = 500\n", "\n", "# define the location coordinates\n", - "location = '34.019268,-118.283554'" + "location = \"34.019268,-118.283554\"" ] }, { @@ -422,7 +434,7 @@ "outputs": [], "source": [ "# how many results did we get?\n", - "len(data['results'])" + "len(data[\"results\"])" ] }, { @@ -432,7 +444,7 @@ "outputs": [], "source": [ "# inspect a result\n", - "data['results'][0]" + "data[\"results\"][0]" ] }, { @@ -442,8 +454,7 @@ "outputs": [], "source": [ "# turn the results into a dataframe of places\n", - "places = pd.DataFrame(data=data['results'],\n", - " columns=['name', 'geometry', 'rating', 'vicinity'])\n", + "places = 
pd.DataFrame(data=data[\"results\"], columns=[\"name\", \"geometry\", \"rating\", \"vicinity\"])\n", "places.head()" ] }, @@ -455,14 +466,17 @@ "source": [ "# parse out lat-long and return it as a series\n", "# this creates a dataframe of all the results when you .apply()\n", + "\n", + "\n", "def parse_coords(geometry):\n", " if isinstance(geometry, dict):\n", - " lng = geometry['location']['lng']\n", - " lat = geometry['location']['lat']\n", - " return pd.Series({'lat':lat, 'lng':lng})\n", - " \n", + " lng = geometry[\"location\"][\"lng\"]\n", + " lat = geometry[\"location\"][\"lat\"]\n", + " return pd.Series({\"lat\": lat, \"lng\": lng})\n", + "\n", + "\n", "# test our function\n", - "places['geometry'].head().apply(parse_coords)" + "places[\"geometry\"].head().apply(parse_coords)" ] }, { @@ -472,8 +486,8 @@ "outputs": [], "source": [ "# now run our function on the whole dataframe and save the output to 2 new dataframe columns\n", - "places[['lat', 'lng']] = places['geometry'].apply(parse_coords)\n", - "places_clean = places.drop('geometry', axis='columns')" + "places[[\"lat\", \"lng\"]] = places[\"geometry\"].apply(parse_coords)\n", + "places_clean = places.drop(\"geometry\", axis=\"columns\")" ] }, { @@ -483,7 +497,7 @@ "outputs": [], "source": [ "# sort the places by rating\n", - "places_clean = places_clean.sort_values(by='rating', ascending=False)\n", + "places_clean = places_clean.sort_values(by=\"rating\", ascending=False)\n", "places_clean.head(10)" ] }, @@ -518,7 +532,7 @@ "outputs": [], "source": [ "# we'll use the points from the Places API, but you could use any point data here\n", - "points = places_clean[['lat', 'lng']].head()\n", + "points = places_clean[[\"lat\", \"lng\"]].head()\n", "points" ] }, @@ -529,7 +543,7 @@ "outputs": [], "source": [ "# create a column to put lat-lng into the format google likes\n", - "points['latlng'] = points.apply(lambda row: '{},{}'.format(row['lat'], row['lng']), axis='columns')\n", + "points[\"latlng\"] = 
points.apply(lambda row: \"{},{}\".format(row[\"lat\"], row[\"lng\"]), axis=\"columns\")\n", "points.head()" ] }, @@ -540,6 +554,8 @@ "outputs": [], "source": [ "# tell geopy to reverse geocode using Google's API and return address\n", + "\n", + "\n", "def reverse_geopy(latlng):\n", " time.sleep(pause)\n", " geocoder = GoogleV3(api_key=google_api_key)\n", @@ -554,7 +570,7 @@ "outputs": [], "source": [ "# now reverse-geocode the points to addresses\n", - "points['address'] = points['latlng'].map(reverse_geopy)\n", + "points[\"address\"] = points[\"latlng\"].map(reverse_geopy)\n", "points.head()" ] }, @@ -573,16 +589,19 @@ "outputs": [], "source": [ "# pass the Google API latlng data to reverse geocode it\n", + "\n", + "\n", "def reverse_geocode(latlng):\n", " time.sleep(pause)\n", - " url_template = 'https://maps.googleapis.com/maps/api/geocode/json?latlng={}&key={}'\n", + " url_template = \"https://maps.googleapis.com/maps/api/geocode/json?latlng={}&key={}\"\n", " url = url_template.format(latlng, google_api_key)\n", " response = requests.get(url)\n", " data = response.json()\n", - " if len(data['results']) > 0:\n", - " return data['results'][0]\n", - " \n", - "geocode_results = points['latlng'].map(reverse_geocode)" + " if len(data[\"results\"]) > 0:\n", + " return data[\"results\"][0]\n", + "\n", + "\n", + "geocode_results = points[\"latlng\"].map(reverse_geocode)" ] }, { @@ -608,16 +627,17 @@ "outputs": [], "source": [ "def get_city(geocode_result):\n", - " if 'address_components' in geocode_result:\n", - " for address_component in geocode_result['address_components']:\n", - " if 'locality' in address_component['types']:\n", - " return address_component['long_name']\n", - " \n", + " if \"address_components\" in geocode_result:\n", + " for address_component in geocode_result[\"address_components\"]:\n", + " if \"locality\" in address_component[\"types\"]:\n", + " return address_component[\"long_name\"]\n", + "\n", + "\n", "def get_state(geocode_result):\n", - " 
if 'address_components' in geocode_result:\n", - " for address_component in geocode_result['address_components']:\n", - " if 'administrative_area_level_1' in address_component['types']:\n", - " return address_component['long_name']" + " if \"address_components\" in geocode_result:\n", + " for address_component in geocode_result[\"address_components\"]:\n", + " if \"administrative_area_level_1\" in address_component[\"types\"]:\n", + " return address_component[\"long_name\"]" ] }, { @@ -627,8 +647,8 @@ "outputs": [], "source": [ "# now map our functions to extract city and state names\n", - "points['city'] = geocode_results.map(get_city) \n", - "points['state'] = geocode_results.map(get_state)\n", + "points[\"city\"] = geocode_results.map(get_city)\n", + "points[\"state\"] = geocode_results.map(get_state)\n", "points.head()" ] }, @@ -661,7 +681,7 @@ "metadata": {}, "outputs": [], "source": [ - "url = 'https://en.wikipedia.org/wiki/List_of_National_Basketball_Association_arenas'\n", + "url = \"https://en.wikipedia.org/wiki/List_of_National_Basketball_Association_arenas\"\n", "response = requests.get(url)\n", "html = response.text" ] @@ -683,8 +703,8 @@ "outputs": [], "source": [ "# parse the html\n", - "soup = BeautifulSoup(html, features='html.parser')\n", - "#soup" + "soup = BeautifulSoup(html, features=\"html.parser\")\n", + "# soup" ] }, { @@ -693,9 +713,9 @@ "metadata": {}, "outputs": [], "source": [ - "rows = soup.find('tbody').findAll('tr')\n", - "#rows\n", - "#rows[1]" + "rows = soup.find(\"tbody\").findAll(\"tr\")\n", + "# rows\n", + "# rows[1]" ] }, { @@ -706,8 +726,8 @@ "source": [ "data = []\n", "for row in rows[1:]:\n", - " cells = row.findAll('td')\n", - " d = [cell.text.strip('\\n') for cell in cells[1:-1]]\n", + " cells = row.findAll(\"td\")\n", + " d = [cell.text.strip(\"\\n\") for cell in cells[1:-1]]\n", " data.append(d)" ] }, @@ -717,7 +737,7 @@ "metadata": {}, "outputs": [], "source": [ - "cols = ['arena', 'city', 'team', 'capacity', 
'opened']\n", + "cols = [\"arena\", \"city\", \"team\", \"capacity\", \"opened\"]\n", "df = pd.DataFrame(data=data, columns=cols).dropna()\n", "df" ] @@ -729,7 +749,7 @@ "outputs": [], "source": [ "# strip out all the wikipedia notes in square brackets\n", - "df = df.applymap(lambda x: re.sub(r'\\[.\\]', '', x))\n", + "df = df.applymap(lambda x: re.sub(r\"\\[.\\]\", \"\", x))\n", "df" ] }, @@ -740,8 +760,8 @@ "outputs": [], "source": [ "# convert capacity and opened to integer\n", - "df['capacity'] = df['capacity'].str.replace(',', '')\n", - "df[['capacity', 'opened']] = df[['capacity', 'opened']].astype(int)" + "df[\"capacity\"] = df[\"capacity\"].str.replace(\",\", \"\")\n", + "df[[\"capacity\", \"opened\"]] = df[[\"capacity\", \"opened\"]].astype(int)" ] }, { @@ -750,7 +770,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.sort_values('capacity', ascending=False)" + "df.sort_values(\"capacity\", ascending=False)" ] }, { @@ -773,7 +793,7 @@ "outputs": [], "source": [ "# define API endpoint\n", - "url = 'https://data.lacity.org/resource/e7h6-4a3e.json'\n", + "url = \"https://data.lacity.org/resource/e7h6-4a3e.json\"\n", "\n", "# request the URL and download its response\n", "response = requests.get(url)\n", @@ -830,14 +850,14 @@ "outputs": [], "source": [ "# define API endpoint\n", - "url = 'https://opendata.arcgis.com/datasets/723c00530ea441deaa35f25e53d098a8_16.geojson'\n", + "url = \"https://opendata.arcgis.com/datasets/723c00530ea441deaa35f25e53d098a8_16.geojson\"\n", "\n", "# request the URL and download its response\n", "response = requests.get(url)\n", "\n", "# parse the json string into a Python dict\n", "data = response.json()\n", - "len(data['features'])" + "len(data[\"features\"])" ] }, { @@ -881,7 +901,7 @@ "outputs": [], "source": [ "# now merge sensor locations with current occupancy status\n", - "parking = pd.merge(left=gdf, right=df, left_on='SENSOR_UNIQUE_ID', right_on='spaceid', how='inner')\n", + "parking = pd.merge(left=gdf, right=df, 
left_on=\"SENSOR_UNIQUE_ID\", right_on=\"spaceid\", how=\"inner\")\n", "parking.shape" ] }, @@ -891,11 +911,11 @@ "metadata": {}, "outputs": [], "source": [ - "parking = parking[['occupancystate', 'geometry', 'ADDRESS_SPACE']]\n", + "parking = parking[[\"occupancystate\", \"geometry\", \"ADDRESS_SPACE\"]]\n", "\n", "# extract lat and lon from geometry column\n", - "parking['lon'] = parking['geometry'].x\n", - "parking['lat'] = parking['geometry'].y\n", + "parking[\"lon\"] = parking[\"geometry\"].x\n", + "parking[\"lat\"] = parking[\"geometry\"].y\n", "\n", "parking" ] @@ -907,7 +927,7 @@ "outputs": [], "source": [ "# how many vacant vs occupied spots are there right now?\n", - "parking['occupancystate'].value_counts()" + "parking[\"occupancystate\"].value_counts()" ] }, { @@ -917,11 +937,11 @@ "outputs": [], "source": [ "# map it\n", - "vacant = parking[parking['occupancystate'] == 'VACANT']\n", - "ax = vacant.plot(c='b', markersize=1, alpha=0.5)\n", + "vacant = parking[parking[\"occupancystate\"] == \"VACANT\"]\n", + "ax = vacant.plot(c=\"b\", markersize=1, alpha=0.5)\n", "\n", - "occupied = parking[parking['occupancystate'] == 'OCCUPIED']\n", - "ax = vacant.plot(ax=ax, c='r', markersize=1, alpha=0.5)" + "occupied = parking[parking[\"occupancystate\"] == \"OCCUPIED\"]\n", + "ax = vacant.plot(ax=ax, c=\"r\", markersize=1, alpha=0.5)" ] }, { @@ -938,18 +958,30 @@ "outputs": [], "source": [ "# create leaflet web map centered/zoomed to downtown LA\n", - "m = folium.Map(location=(34.05, -118.25), zoom_start=15, tiles='cartodbpositron')\n", + "m = folium.Map(location=(34.05, -118.25), zoom_start=15, tiles=\"cartodbpositron\")\n", "\n", "# add blue markers for each vacant spot\n", - "cols = ['lat', 'lon', 'ADDRESS_SPACE']\n", + "cols = [\"lat\", \"lon\", \"ADDRESS_SPACE\"]\n", "for lat, lng, address in vacant[cols].values:\n", - " folium.CircleMarker(location=(lat, lng), radius=5, color='#3186cc',\n", - " fill=True, fill_color='#3186cc', tooltip=address).add_to(m)\n", + 
" folium.CircleMarker(\n", + " location=(lat, lng),\n", + " radius=5,\n", + " color=\"#3186cc\",\n", + " fill=True,\n", + " fill_color=\"#3186cc\",\n", + " tooltip=address,\n", + " ).add_to(m)\n", "\n", "# add red markers for each occupied spot\n", - "for lat, lng, address in occupied[cols].values: \n", - " folium.CircleMarker(location=(lat, lng), radius=5, color='#dc143c',\n", - " fill=True, fill_color='#dc143c', tooltip=address).add_to(m)" + "for lat, lng, address in occupied[cols].values:\n", + " folium.CircleMarker(\n", + " location=(lat, lng),\n", + " radius=5,\n", + " color=\"#dc143c\",\n", + " fill=True,\n", + " fill_color=\"#dc143c\",\n", + " tooltip=address,\n", + " ).add_to(m)" ] }, { @@ -977,9 +1009,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python (ppd599)", + "display_name": "Python (ppde642)", "language": "python", - "name": "ppd599" + "name": "ppde642" }, "language_info": { "codemirror_mode": { @@ -991,7 +1023,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/modules/06-spatial-data/lecture.ipynb b/modules/06-spatial-data/lecture.ipynb index 48d8f97..51ae935 100644 --- a/modules/06-spatial-data/lecture.ipynb +++ b/modules/06-spatial-data/lecture.ipynb @@ -23,6 +23,7 @@ "outputs": [], "source": [ "import ast\n", + "\n", "import contextily as cx\n", "import folium\n", "import geopandas as gpd\n", @@ -46,9 +47,9 @@ "metadata": {}, "outputs": [], "source": [ - "# tell geopandas to read a shapefile with its read_file() function, passing in the shapefile folder\n", + "# tell geopandas to read shapefile with its read_file() function, passing in shapefile folder\n", "# this produces a GeoDataFrame\n", - "gdf_tracts = gpd.read_file('../../data/tl_2020_06_tract/')\n", + "gdf_tracts = gpd.read_file(\"../../data/tl_2020_06_tract/\")\n", "gdf_tracts.shape" ] }, @@ -92,7 +93,7 @@ "outputs": [], "source": [ "# loading a GeoPackage works the 
same way\n", - "gdf_stations = gpd.read_file('../../data/rail_stations.gpkg')\n", + "gdf_stations = gpd.read_file(\"../../data/rail_stations.gpkg\")\n", "gdf_stations.shape" ] }, @@ -123,7 +124,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false, "jupyter": { "outputs_hidden": false } @@ -131,7 +131,7 @@ "outputs": [], "source": [ "# load business location data as a regular pandas dataframe\n", - "df = pd.read_csv('../../data/Listing_of_Active_Businesses.csv')\n", + "df = pd.read_csv(\"../../data/Listing_of_Active_Businesses.csv\")\n", "df.shape" ] }, @@ -142,15 +142,17 @@ "outputs": [], "source": [ "# clean up the data (same code from the data cleaning lecture)\n", - "df.columns = df.columns.str.lower().str.replace(' ', '_').str.strip('_#')\n", - "df = df.set_index('location_account').sort_index()\n", - "df['location_start_date'] = pd.to_datetime(df['location_start_date'])\n", - "slicer = pd.IndexSlice[:, 'business_name':'mailing_city']\n", - "df.loc[slicer] = df.loc[slicer].apply(lambda col: col.str.title(), axis='rows')\n", - "mask = pd.notnull(df['location'])\n", - "latlng = df.loc[mask, 'location'].map(ast.literal_eval)\n", - "df.loc[mask, ['lat', 'lng']] = pd.DataFrame(latlng.to_list(), index=latlng.index, columns=['lat', 'lng'])\n", - "df = df.drop(columns=['location']).dropna(subset=['lat', 'lng'])" + "df.columns = df.columns.str.lower().str.replace(\" \", \"_\").str.strip(\"_#\")\n", + "df = df.set_index(\"location_account\").sort_index()\n", + "df[\"location_start_date\"] = pd.to_datetime(df[\"location_start_date\"])\n", + "slicer = pd.IndexSlice[:, \"business_name\":\"mailing_city\"]\n", + "df.loc[slicer] = df.loc[slicer].apply(lambda col: col.str.title(), axis=\"rows\")\n", + "mask = pd.notnull(df[\"location\"])\n", + "latlng = df.loc[mask, \"location\"].map(ast.literal_eval)\n", + "df.loc[mask, [\"lat\", \"lng\"]] = pd.DataFrame(\n", + " latlng.to_list(), index=latlng.index, columns=[\"lat\", \"lng\"]\n", + ")\n", + 
"df = df.drop(columns=[\"location\"]).dropna(subset=[\"lat\", \"lng\"])" ] }, { @@ -183,9 +185,8 @@ "# create a geometry column to contain shapely geometry for geopandas to use\n", "# notice the shapely points are lng, lat so that they are equivalent to x, y\n", "# also notice that we set the CRS explicitly\n", - "gdf_business['geometry'] = gpd.points_from_xy(x=gdf_business['lng'],\n", - " y=gdf_business['lat'])\n", - "gdf_business.crs = 'epsg:4326'\n", + "gdf_business[\"geometry\"] = gpd.points_from_xy(x=gdf_business[\"lng\"], y=gdf_business[\"lat\"])\n", + "gdf_business.crs = \"epsg:4326\"\n", "gdf_business.shape" ] }, @@ -238,8 +239,8 @@ "metadata": {}, "outputs": [], "source": [ - "# load the raster file and view its band count, pixel width and height, null value, and geographic bounds\n", - "raster = rasterio.open('../../data/la-elevation.tif')\n", + "# load raster file and view its band count, pixel width/height, null value, and geographic bounds\n", + "raster = rasterio.open(\"../../data/la-elevation.tif\")\n", "print(raster.count, raster.width, raster.height)\n", "print(raster.nodata)\n", "print(raster.bounds)" @@ -263,7 +264,7 @@ "outputs": [], "source": [ "# histogram of elevations (meters above sea level) around downtown LA\n", - "ax = df[df!=raster.nodata].stack().hist(bins=50)" + "ax = df[df != raster.nodata].stack().hist(bins=50)" ] }, { @@ -274,8 +275,7 @@ "source": [ "# get shapes representing groups of adjacent pixels with same values\n", "# affine transformation maps pixel row/col -> spatial x/y\n", - "shapes = rasterio.features.shapes(source=raster.read(1),\n", - " transform=raster.transform)" + "shapes = rasterio.features.shapes(source=raster.read(1), transform=raster.transform)" ] }, { @@ -286,11 +286,11 @@ "source": [ "# convert raster to GeoJSON-like vector features and create a gdf from them\n", "# pro-tip: use generator comprehension for memory efficiency\n", - "features = ({'geometry': polygon, 'properties': {'elevation': value}} for 
polygon, value in shapes)\n", + "features = ({\"geometry\": polygon, \"properties\": {\"elevation\": value}} for polygon, value in shapes)\n", "gdf_srtm = gpd.GeoDataFrame.from_features(features, crs=raster.crs)\n", "\n", "# drop any null rows\n", - "gdf_srtm = gdf_srtm[gdf_srtm['elevation']!=raster.nodata]\n", + "gdf_srtm = gdf_srtm[gdf_srtm[\"elevation\"] != raster.nodata]\n", "gdf_srtm.shape" ] }, @@ -321,10 +321,10 @@ "outputs": [], "source": [ "# plot the elevation pixels and identify pershing square\n", - "fig, ax = plt.subplots(facecolor='#111111')\n", - "ax = gdf_srtm.plot(ax=ax, column='elevation', cmap='inferno')\n", - "_ = ax.axis('off')\n", - "_ = ax.scatter(y=34.048097, x=-118.253233, c='w', marker='x', s=100)" + "fig, ax = plt.subplots(facecolor=\"#111111\")\n", + "ax = gdf_srtm.plot(ax=ax, column=\"elevation\", cmap=\"inferno\")\n", + "_ = ax.axis(\"off\")\n", + "_ = ax.scatter(y=34.048097, x=-118.253233, c=\"w\", marker=\"x\", s=100)" ] }, { @@ -334,7 +334,7 @@ "outputs": [], "source": [ "# now it's your turn\n", - "# change the colors and also show the location of city hall on the map\n" + "# change the colors and also show the location of city hall on the map" ] }, { @@ -365,7 +365,7 @@ "outputs": [], "source": [ "# project them all to UTM zone 11N (see http://epsg.io/32611)\n", - "utm_crs = 'epsg:32611'\n", + "utm_crs = \"epsg:32611\"\n", "gdf_tracts = gdf_tracts.to_crs(utm_crs)\n", "gdf_stations = gdf_stations.to_crs(utm_crs)\n", "gdf_business = gdf_business.to_crs(utm_crs)\n", @@ -396,7 +396,7 @@ "outputs": [], "source": [ "# now it's your turn\n", - "# pick a different CRS and re-project the data to it\n" + "# pick a different CRS and re-project the data to it" ] }, { @@ -418,7 +418,7 @@ "# takes a few seconds...\n", "# dissolve lets you aggregate (merge geometries together) by shared attribute values\n", "# this is the spatial equivalent of pandas's groupby function\n", - "gdf_counties = gdf_tracts.dissolve(by='COUNTYFP', aggfunc=np.sum)" + 
"gdf_counties = gdf_tracts.dissolve(by=\"COUNTYFP\", aggfunc=np.sum)" ] }, { @@ -429,9 +429,9 @@ "source": [ "# now that we've dissolved tracts -> counties and summed their attributes,\n", "# plot the counties by land area\n", - "fig, ax = plt.subplots(facecolor='#111111')\n", - "ax = gdf_counties.plot(ax=ax, column='ALAND', cmap='Blues_r')\n", - "_ = ax.axis('off')" + "fig, ax = plt.subplots(facecolor=\"#111111\")\n", + "ax = gdf_counties.plot(ax=ax, column=\"ALAND\", cmap=\"Blues_r\")\n", + "_ = ax.axis(\"off\")" ] }, { @@ -442,7 +442,7 @@ "source": [ "# just like in regular pandas, we can filter and subset the GeoDataFrame\n", "# retain only tracts in LA county (FIPS code 037)\n", - "mask = gdf_tracts['COUNTYFP'] == '037'\n", + "mask = gdf_tracts[\"COUNTYFP\"] == \"037\"\n", "gdf_tracts_la = gdf_tracts[mask]\n", "ax = gdf_tracts_la.plot()" ] @@ -542,7 +542,7 @@ "source": [ "# euclidean buffers let you analyze the area around features (use projected CRS!)\n", "# buffer the rail stations by a half km (5-10 minute walk)\n", - "gdf_stations['geometry'] = gdf_stations.buffer(500)" + "gdf_stations[\"geometry\"] = gdf_stations.buffer(500)" ] }, { @@ -551,11 +551,13 @@ "metadata": {}, "outputs": [], "source": [ - "fig, ax = plt.subplots(figsize=(8, 8), facecolor='#111111')\n", - "ax = gdf_tracts_dtla.plot(ax=ax, color='k')\n", - "ax = gdf_stations.plot(ax=ax, color='w', alpha=0.3)\n", - "ax = gdf_business_dtla.plot(ax=ax, color='#ffff66', marker='.', linewidth=0, markersize=20, alpha=0.05)\n", - "_ = ax.axis('off')" + "fig, ax = plt.subplots(figsize=(8, 8), facecolor=\"#111111\")\n", + "ax = gdf_tracts_dtla.plot(ax=ax, color=\"k\")\n", + "ax = gdf_stations.plot(ax=ax, color=\"w\", alpha=0.3)\n", + "ax = gdf_business_dtla.plot(\n", + " ax=ax, color=\"#ffff66\", marker=\".\", linewidth=0, markersize=20, alpha=0.05\n", + ")\n", + "_ = ax.axis(\"off\")" ] }, { @@ -566,7 +568,7 @@ "source": [ "# you can do set operations like union, intersection, and difference\n", "# get 
all the portions of tracts >0.5km from a rail station\n", - "gdf_diff = gpd.overlay(gdf_tracts_dtla, gdf_stations, how='difference')\n", + "gdf_diff = gpd.overlay(gdf_tracts_dtla, gdf_stations, how=\"difference\")\n", "ax = gdf_diff.plot()" ] }, @@ -603,7 +605,7 @@ "outputs": [], "source": [ "# join stations to elevation data\n", - "gdf = gpd.sjoin(gdf_srtm, gdf_stations, how='inner', predicate='intersects')" + "gdf = gpd.sjoin(gdf_srtm, gdf_stations, how=\"inner\", predicate=\"intersects\")" ] }, { @@ -613,7 +615,7 @@ "outputs": [], "source": [ "# counts vary because these aren't elevation pixels, but regions of same value\n", - "gdf_elev_desc = gdf.groupby('name')['elevation'].describe().astype(int)\n", + "gdf_elev_desc = gdf.groupby(\"name\")[\"elevation\"].describe().astype(int)\n", "gdf_elev_desc" ] }, @@ -623,7 +625,7 @@ "metadata": {}, "outputs": [], "source": [ - "gdf_stations_elev = gdf_stations.merge(gdf_elev_desc, left_on='name', right_index=True)\n", + "gdf_stations_elev = gdf_stations.merge(gdf_elev_desc, left_on=\"name\", right_index=True)\n", "gdf_stations_elev.head()" ] }, @@ -634,7 +636,7 @@ "outputs": [], "source": [ "# which stations have the greatest elevation variation around them?\n", - "ax = gdf_stations_elev.plot(column='std')" + "ax = gdf_stations_elev.plot(column=\"std\")" ] }, { @@ -644,7 +646,7 @@ "outputs": [], "source": [ "# now it's your turn\n", - "# which station buffer covers the largest elevation range?\n" + "# which station buffer covers the largest elevation range?" 
] }, { @@ -661,7 +663,7 @@ "outputs": [], "source": [ "# join stations to businesses data\n", - "gdf = gpd.sjoin(gdf_business, gdf_stations, how='inner', predicate='intersects')" + "gdf = gpd.sjoin(gdf_business, gdf_stations, how=\"inner\", predicate=\"intersects\")" ] }, { @@ -671,8 +673,8 @@ "outputs": [], "source": [ "# counts vary because these aren't elevation pixels, but regions of same value\n", - "gdf_business_desc = gdf.groupby('name').size().sort_values(ascending=False)\n", - "gdf_business_desc.name = 'count'\n", + "gdf_business_desc = gdf.groupby(\"name\").size().sort_values(ascending=False)\n", + "gdf_business_desc.name = \"count\"\n", "gdf_business_desc" ] }, @@ -690,7 +692,8 @@ "outputs": [], "source": [ "# now it's your turn\n", - "# change earlier parts of the notebook to make sure our station buffers capture all the businesses within them\n" + "# change earlier parts of the notebook to make sure\n", + "# our station buffers capture all the businesses within them" ] }, { @@ -699,7 +702,7 @@ "metadata": {}, "outputs": [], "source": [ - "gdf_stations_business = gdf_stations.merge(gdf_business_desc, left_on='name', right_index=True)" + "gdf_stations_business = gdf_stations.merge(gdf_business_desc, left_on=\"name\", right_index=True)" ] }, { @@ -709,7 +712,7 @@ "outputs": [], "source": [ "# which stations have the most businesses around them?\n", - "ax = gdf_stations_business.plot(column='count')" + "ax = gdf_stations_business.plot(column=\"count\")" ] }, { @@ -728,7 +731,7 @@ "outputs": [], "source": [ "# join tracts to business data\n", - "gdf = gpd.sjoin(gdf_business, gdf_tracts_dtla, how='inner', predicate='intersects')" + "gdf = gpd.sjoin(gdf_business, gdf_tracts_dtla, how=\"inner\", predicate=\"intersects\")" ] }, { @@ -738,8 +741,8 @@ "outputs": [], "source": [ "# count businesses per tract\n", - "counts = gdf.groupby('GEOID').size()\n", - "counts.name = 'count'" + "counts = gdf.groupby(\"GEOID\").size()\n", + "counts.name = \"count\"" ] }, { @@ 
-749,8 +752,10 @@ "outputs": [], "source": [ "# merge in the counts then calculate density (businesses per km^2)\n", - "gdf_tracts_dtla_business = gdf_tracts_dtla.merge(counts, left_on='GEOID', right_index=True)\n", - "gdf_tracts_dtla_business['density'] = gdf_tracts_dtla_business['count'] / gdf_tracts_dtla_business['ALAND'] * 1e6" + "gdf_tracts_dtla_business = gdf_tracts_dtla.merge(counts, left_on=\"GEOID\", right_index=True)\n", + "gdf_tracts_dtla_business[\"density\"] = (\n", + " gdf_tracts_dtla_business[\"count\"] / gdf_tracts_dtla_business[\"ALAND\"] * 1e6\n", + ")" ] }, { @@ -760,10 +765,10 @@ "outputs": [], "source": [ "# plot tracts as choropleth plus station buffers\n", - "fig, ax = plt.subplots(figsize=(8, 8), facecolor='#111111')\n", - "ax = gdf_tracts_dtla_business.plot(ax=ax, column='density', cmap='viridis')\n", - "ax = gdf_stations.plot(ax=ax, alpha=0.2, linewidth=3, edgecolor='w', color='none')\n", - "_ = ax.axis('off')" + "fig, ax = plt.subplots(figsize=(8, 8), facecolor=\"#111111\")\n", + "ax = gdf_tracts_dtla_business.plot(ax=ax, column=\"density\", cmap=\"viridis\")\n", + "ax = gdf_stations.plot(ax=ax, alpha=0.2, linewidth=3, edgecolor=\"w\", color=\"none\")\n", + "_ = ax.axis(\"off\")" ] }, { @@ -773,18 +778,18 @@ "outputs": [], "source": [ "# this time, let's add a basemap for context\n", - "fig, ax = plt.subplots(figsize=(8, 8), facecolor='#111111')\n", - "ax = gdf_tracts_dtla_business.plot(ax=ax, column='density', cmap='viridis',\n", - " alpha=0.7, linewidth=0.3, edgecolor='k')\n", - "ax = gdf_stations.plot(ax=ax, alpha=0.3, linewidth=3, edgecolor='w', color='none')\n", - "_ = ax.axis('off')\n", + "fig, ax = plt.subplots(figsize=(8, 8), facecolor=\"#111111\")\n", + "ax = gdf_tracts_dtla_business.plot(\n", + " ax=ax, column=\"density\", cmap=\"viridis\", alpha=0.7, linewidth=0.3, edgecolor=\"k\"\n", + ")\n", + "ax = gdf_stations.plot(ax=ax, alpha=0.3, linewidth=3, edgecolor=\"w\", color=\"none\")\n", + "_ = ax.axis(\"off\")\n", "\n", "# add 
the basemap with contextily, choosing a tile provider\n", "# or try cx.providers.Stamen.TonerBackground, etc\n", - "cx.add_basemap(ax, crs=gdf_stations.crs.to_string(),\n", - " source=cx.providers.CartoDB.DarkMatter)\n", + "cx.add_basemap(ax, crs=gdf_stations.crs.to_string(), source=cx.providers.CartoDB.DarkMatter)\n", "\n", - "ax.figure.savefig('map.png', dpi=600, bbox_inches='tight')" + "ax.figure.savefig(\"map.png\", dpi=600, bbox_inches=\"tight\")" ] }, { @@ -794,7 +799,7 @@ "outputs": [], "source": [ "# now it's your turn\n", - "# change the tile provider, the tract colors, the alphas, etc to find a plot your like\n" + "# change the tile provider, the tract colors, the alphas, etc to find a plot your like" ] }, { @@ -811,24 +816,26 @@ "outputs": [], "source": [ "# optionally bin the data into quintiles\n", - "bins = list(gdf_tracts_dtla_business['density'].quantile([0, 0.2, 0.4, 0.6, 0.8, 1]))\n", + "bins = list(gdf_tracts_dtla_business[\"density\"].quantile([0, 0.2, 0.4, 0.6, 0.8, 1]))\n", "\n", "# create leaflet choropleth web map\n", - "m = folium.Map(location=(34.047223, -118.253555), zoom_start=15, tiles='cartodbdark_matter')\n", - "c = folium.Choropleth(geo_data=gdf_tracts_dtla_business,\n", - " data=gdf_tracts_dtla_business,\n", - " #bins=bins,\n", - " columns=['GEOID', 'density'],\n", - " key_on='feature.properties.GEOID', \n", - " highlight=True,\n", - " fill_color='YlOrRd_r', \n", - " legend_name='Businesses per square km').add_to(m)\n", + "m = folium.Map(location=(34.047223, -118.253555), zoom_start=15, tiles=\"cartodbdark_matter\")\n", + "c = folium.Choropleth(\n", + " geo_data=gdf_tracts_dtla_business,\n", + " data=gdf_tracts_dtla_business,\n", + " # bins=bins,\n", + " columns=[\"GEOID\", \"density\"],\n", + " key_on=\"feature.properties.GEOID\",\n", + " highlight=True,\n", + " fill_color=\"YlOrRd_r\",\n", + " legend_name=\"Businesses per square km\",\n", + ").add_to(m)\n", "\n", "# add mouseover tooltip to the countries\n", - 
"c.geojson.add_child(folium.features.GeoJsonTooltip(['GEOID', 'density']))\n", + "c.geojson.add_child(folium.features.GeoJsonTooltip([\"GEOID\", \"density\"]))\n", "\n", "# save web map to disk\n", - "m.save('webmap.html')" + "m.save(\"webmap.html\")" ] }, { @@ -848,7 +855,7 @@ "source": [ "# now it's your turn\n", "# try binning the data in different ways. how would you do it?\n", - "# try changing the colors, basemap, and what variable you're visualizing\n" + "# try changing the colors, basemap, and what variable you're visualizing" ] }, { @@ -965,10 +972,10 @@ "outputs": [], "source": [ "# visualize the precise matches vs the false positives\n", - "ax = gdf_stations.plot(color='gray')\n", - "ax = false_positives.plot(ax=ax, c='r', markersize=0.1)\n", - "ax = precise_matches.plot(ax=ax, c='b', markersize=0.1)\n", - "_ = ax.axis('off')" + "ax = gdf_stations.plot(color=\"gray\")\n", + "ax = false_positives.plot(ax=ax, c=\"r\", markersize=0.1)\n", + "ax = precise_matches.plot(ax=ax, c=\"b\", markersize=0.1)\n", + "_ = ax.axis(\"off\")" ] }, { @@ -981,9 +988,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python (ppde642)", "language": "python", - "name": "python3" + "name": "ppde642" }, "language_info": { "codemirror_mode": { @@ -995,7 +1002,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/modules/06-spatial-data/raster-crop-bbox.ipynb b/modules/06-spatial-data/raster-crop-bbox.ipynb index e6bcd6b..55f93c6 100644 --- a/modules/06-spatial-data/raster-crop-bbox.ipynb +++ b/modules/06-spatial-data/raster-crop-bbox.ipynb @@ -15,7 +15,6 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", "import rasterio.mask\n", "from shapely.wkt import loads" ] @@ -28,7 +27,7 @@ "outputs": [], "source": [ "# load the SRTM raster\n", - "raster_path = '../../data/N34W119.hgt'\n", + "raster_path = 
\"../../data/N34W119.hgt\"\n", "raster = rasterio.open(raster_path)" ] }, @@ -40,7 +39,10 @@ "outputs": [], "source": [ "# define a bounding box to crop raster to\n", - "wkt = 'POLYGON((-118.2863 34.0171, -118.2863 34.0711, -118.2212 34.0711, -118.2212 34.0171, -118.2863 34.0171))'\n", + "wkt = (\n", + " \"POLYGON((-118.2863 34.0171, -118.2863 34.0711, -118.2212 34.0711, \"\n", + " \"-118.2212 34.0171, -118.2863 34.0171))\"\n", + ")\n", "bbox = loads(wkt)" ] }, @@ -54,10 +56,14 @@ "# crop the raster to the bounding box\n", "out_image, out_transform = rasterio.mask.mask(raster, [bbox], crop=True)\n", "out_meta = raster.meta\n", - "out_meta.update({'driver': 'GTiff',\n", - " 'height': out_image.shape[1],\n", - " 'width': out_image.shape[2],\n", - " 'transform': out_transform})" + "out_meta.update(\n", + " {\n", + " \"driver\": \"GTiff\",\n", + " \"height\": out_image.shape[1],\n", + " \"width\": out_image.shape[2],\n", + " \"transform\": out_transform,\n", + " }\n", + ")" ] }, { @@ -68,7 +74,7 @@ "outputs": [], "source": [ "# save the cropped raster as a tif file\n", - "with rasterio.open('../../data/la-elevation.tif', 'w', **out_meta) as f:\n", + "with rasterio.open(\"../../data/la-elevation.tif\", \"w\", **out_meta) as f:\n", " f.write(out_image)" ] }, @@ -83,9 +89,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python (ppd599)", + "display_name": "Python (ppde642)", "language": "python", - "name": "ppd599" + "name": "ppde642" }, "language_info": { "codemirror_mode": { @@ -97,7 +103,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/modules/07-urban-networks-i/lecture.ipynb b/modules/07-urban-networks-i/lecture.ipynb index 9feb207..8672e6b 100644 --- a/modules/07-urban-networks-i/lecture.ipynb +++ b/modules/07-urban-networks-i/lecture.ipynb @@ -30,7 +30,7 @@ "\n", "# configure OSMnx\n", "ox.settings.log_console = True\n", - 
"ox.settings.cache_folder = '../../data/cache'" + "ox.settings.cache_folder = \"../../data/cache\"" ] }, { @@ -105,8 +105,8 @@ "source": [ "# assign random ages to each person in the network\n", "randoms = np.random.randint(low=18, high=90, size=len(G.nodes))\n", - "ages = {node:age for node, age in zip(G.nodes, randoms)}\n", - "nx.set_node_attributes(G, values=ages, name='age')" + "ages = {node: age for node, age in zip(G.nodes, randoms)}\n", + "nx.set_node_attributes(G, values=ages, name=\"age\")" ] }, { @@ -119,7 +119,7 @@ "# social distance is the inverse of how often they hang out each year\n", "hangout_counts = np.random.randint(low=1, high=100, size=len(G.edges))\n", "distances = {edge: 1 / hangout_count for edge, hangout_count in zip(G.edges, hangout_counts)}\n", - "nx.set_edge_attributes(G, values=distances, name='distance')" + "nx.set_edge_attributes(G, values=distances, name=\"distance\")" ] }, { @@ -129,7 +129,7 @@ "outputs": [], "source": [ "# view the nodes and optionally show their attribute data\n", - "G.nodes#(data=True)" + "G.nodes # (data=True)" ] }, { @@ -140,7 +140,7 @@ "source": [ "# view the edges and optionally show their attribute data\n", "# these are undirected edges, and there cannot be parallel edges\n", - "G.edges#(data=True)" + "G.edges # (data=True)" ] }, { @@ -161,7 +161,7 @@ "outputs": [], "source": [ "# calculate the shortest weighted path between two nodes\n", - "path2 = nx.shortest_path(G, source=0, target=50, weight='distance')\n", + "path2 = nx.shortest_path(G, source=0, target=50, weight=\"distance\")\n", "path2" ] }, @@ -172,7 +172,7 @@ "outputs": [], "source": [ "# calculate node betweenness centrality across the network\n", - "bc = nx.betweenness_centrality(G, weight='distance')\n", + "bc = nx.betweenness_centrality(G, weight=\"distance\")\n", "pd.Series(bc).describe()" ] }, @@ -183,7 +183,7 @@ "outputs": [], "source": [ "# now it's your turn\n", - "# try changing the social distance between our people, then recompute a 
shortest path\n" + "# try changing the social distance between our people, then recompute a shortest path" ] }, { @@ -228,8 +228,8 @@ "outputs": [], "source": [ "# download/model a street network for some city then visualize it\n", - "place = 'Piedmont, California, USA'\n", - "G = ox.graph_from_place(place, network_type='drive')\n", + "place = \"Piedmont, California, USA\"\n", + "G = ox.graph_from_place(place, network_type=\"drive\")\n", "fig, ax = ox.plot_graph(G)" ] }, @@ -326,7 +326,7 @@ "outputs": [], "source": [ "# now it's your turn\n", - "# download a graph of a different (small-ish) town, then plot it\n" + "# download a graph of a different (small-ish) town, then plot it" ] }, { @@ -345,7 +345,7 @@ "# get our study site's geometry\n", "gdf = ox.geocode_to_gdf(place)\n", "gdf_proj = ox.project_gdf(gdf)\n", - "geom_proj = gdf_proj['geometry'].iloc[0]\n", + "geom_proj = gdf_proj[\"geometry\"].iloc[0]\n", "geom_proj" ] }, @@ -368,7 +368,7 @@ "source": [ "# project the graph (automatically) then check its new CRS\n", "G_proj = ox.project_graph(G)\n", - "G_proj.graph['crs']" + "G_proj.graph[\"crs\"]" ] }, { @@ -395,8 +395,8 @@ "outputs": [], "source": [ "# save graph to disk as geopackage (for GIS) or GraphML file (for Gephi etc)\n", - "ox.save_graph_geopackage(G, filepath='./data/mynetwork.gpkg')\n", - "ox.save_graphml(G, filepath='./data/mynetwork.graphml')" + "ox.save_graph_geopackage(G, filepath=\"./data/mynetwork.gpkg\")\n", + "ox.save_graphml(G, filepath=\"./data/mynetwork.graphml\")" ] }, { @@ -416,7 +416,7 @@ "source": [ "# convert graph to line graph so edges become nodes and vice versa\n", "edge_centrality = nx.closeness_centrality(nx.line_graph(G))\n", - "nx.set_edge_attributes(G, edge_centrality, 'edge_centrality')" + "nx.set_edge_attributes(G, edge_centrality, \"edge_centrality\")" ] }, { @@ -426,7 +426,7 @@ "outputs": [], "source": [ "# color edges in original graph with centralities from line graph\n", - "ec = ox.plot.get_edge_colors_by_attr(G, 
'edge_centrality', cmap='inferno')\n", + "ec = ox.plot.get_edge_colors_by_attr(G, \"edge_centrality\", cmap=\"inferno\")\n", "fig, ax = ox.plot_graph(G, edge_color=ec, edge_linewidth=2, node_size=0)" ] }, @@ -466,7 +466,7 @@ "outputs": [], "source": [ "# find the shortest path between these nodes, minimizing travel time, then plot it\n", - "route = ox.shortest_path(G, orig, dest, weight='travel_time')\n", + "route = ox.shortest_path(G, orig, dest, weight=\"travel_time\")\n", "fig, ax = ox.plot_graph_route(G, route, node_size=0)" ] }, @@ -477,7 +477,7 @@ "outputs": [], "source": [ "# how long is our route in meters?\n", - "edge_lengths = ox.utils_graph.get_route_edge_attributes(G, route, 'length')\n", + "edge_lengths = ox.utils_graph.get_route_edge_attributes(G, route, \"length\")\n", "sum(edge_lengths)" ] }, @@ -488,8 +488,9 @@ "outputs": [], "source": [ "# how far is it between these two nodes as the crow flies (haversine)?\n", - "ox.distance.great_circle_vec(G.nodes[orig]['y'], G.nodes[orig]['x'],\n", - " G.nodes[dest]['y'], G.nodes[dest]['x'])" + "ox.distance.great_circle_vec(\n", + " G.nodes[orig][\"y\"], G.nodes[orig][\"x\"], G.nodes[dest][\"y\"], G.nodes[dest][\"x\"]\n", + ")" ] }, { @@ -500,7 +501,7 @@ "source": [ "# now it's your turn\n", "# how circuitous is this route?\n", - "# try plotting it differently: change the colors and node/edge sizes\n" + "# try plotting it differently: change the colors and node/edge sizes" ] }, { @@ -519,11 +520,9 @@ "outputs": [], "source": [ "# you can make query an unambiguous dict to help the geocoder find it\n", - "place = {'city' : 'San Francisco',\n", - " 'state' : 'California',\n", - " 'country': 'USA'}\n", - "G = ox.graph_from_place(place, network_type='drive', truncate_by_edge=True)\n", - "fig, ax = ox.plot_graph(G, figsize=(10, 10), node_size=0, edge_color='y', edge_linewidth=0.2)" + "place = {\"city\": \"San Francisco\", \"state\": \"California\", \"country\": \"USA\"}\n", + "G = ox.graph_from_place(place, 
network_type=\"drive\", truncate_by_edge=True)\n", + "fig, ax = ox.plot_graph(G, figsize=(10, 10), node_size=0, edge_color=\"y\", edge_linewidth=0.2)" ] }, { @@ -533,7 +532,7 @@ "outputs": [], "source": [ "# you can get networks anywhere in the world\n", - "G = ox.graph_from_place('Sinalunga, Italy', network_type='all')\n", + "G = ox.graph_from_place(\"Sinalunga, Italy\", network_type=\"all\")\n", "fig, ax = ox.plot_graph(G, node_size=0, edge_linewidth=0.5)" ] }, @@ -546,8 +545,8 @@ "# or get network by address, coordinates, bounding box, or any custom polygon\n", "# ...useful when OSM just doesn't already have a polygon for the place you want\n", "lewis_hall = (34.019267, -118.283566)\n", - "one_mile = 1609 #meters\n", - "G = ox.graph_from_point(lewis_hall, dist=one_mile, network_type='drive')\n", + "one_mile = 1609 # meters\n", + "G = ox.graph_from_point(lewis_hall, dist=one_mile, network_type=\"drive\")\n", "fig, ax = ox.plot_graph(G, node_size=0)" ] }, @@ -558,7 +557,8 @@ "outputs": [], "source": [ "# now it's your turn\n", - "# create a graph of your hometown then calculate the shortest path between two points of your choice" + "# create a graph of your hometown\n", + "# then calculate the shortest path between two points of your choice" ] }, { @@ -577,11 +577,15 @@ "outputs": [], "source": [ "# get NY subway rail network\n", - "G = ox.graph_from_place('New York City, New York, USA',\n", - " retain_all=False, truncate_by_edge=True, simplify=True,\n", - " custom_filter='[\"railway\"~\"subway\"]')\n", + "G = ox.graph_from_place(\n", + " \"New York City, New York, USA\",\n", + " retain_all=False,\n", + " truncate_by_edge=True,\n", + " simplify=True,\n", + " custom_filter='[\"railway\"~\"subway\"]',\n", + ")\n", "\n", - "fig, ax = ox.plot_graph(G, node_size=0, edge_color='c', edge_linewidth=0.2)" + "fig, ax = ox.plot_graph(G, node_size=0, edge_color=\"c\", edge_linewidth=0.2)" ] }, { @@ -600,8 +604,8 @@ "outputs": [], "source": [ "# get all building footprints in 
some neighborhood\n", - "place = 'Civic Center, Los Angeles, California'\n", - "tags = {'building': True}\n", + "place = \"Civic Center, Los Angeles, California\"\n", + "tags = {\"building\": True}\n", "gdf = ox.geometries_from_place(place, tags)\n", "gdf.shape" ] @@ -622,8 +626,7 @@ "outputs": [], "source": [ "# get all parks and bus stops in some neighborhood\n", - "tags = {'leisure': 'park',\n", - " 'highway': 'bus_stop'}\n", + "tags = {\"leisure\": \"park\", \"highway\": \"bus_stop\"}\n", "gdf = ox.geometries_from_place(place, tags)\n", "gdf.shape" ] @@ -635,10 +638,10 @@ "outputs": [], "source": [ "# restaurants near the empire state buildings\n", - "address = '350 5th Ave, New York, NY 10001'\n", - "tags = {'amenity': 'restaurant'}\n", + "address = \"350 5th Ave, New York, NY 10001\"\n", + "tags = {\"amenity\": \"restaurant\"}\n", "gdf = ox.geometries_from_address(address, tags=tags, dist=500)\n", - "gdf[['name', 'cuisine', 'geometry']].dropna().head()" + "gdf[[\"name\", \"cuisine\", \"geometry\"]].dropna().head()" ] }, { @@ -649,7 +652,8 @@ "source": [ "# now it's your turn\n", "# find all the rail stations around downtown LA\n", - "# hint, the tag is railway and the value is station: https://wiki.openstreetmap.org/wiki/Tag:railway%3Dstation" + "# hint, the tag is railway and the value is station:\n", + "# https://wiki.openstreetmap.org/wiki/Tag:railway%3Dstation" ] }, { @@ -671,9 +675,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python (ppd599)", + "display_name": "Python (ppde642)", "language": "python", - "name": "ppd599" + "name": "ppde642" }, "language_info": { "codemirror_mode": { @@ -685,7 +689,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/modules/08-urban-networks-ii/lecture.ipynb b/modules/08-urban-networks-ii/lecture.ipynb index b232b0f..7a07f6e 100644 --- a/modules/08-urban-networks-ii/lecture.ipynb +++ 
b/modules/08-urban-networks-ii/lecture.ipynb @@ -23,7 +23,6 @@ "source": [ "import geopandas as gpd\n", "import matplotlib.pyplot as plt\n", - "import networkx as nx\n", "import numpy as np\n", "import osmnx as ox\n", "import pandana\n", @@ -36,7 +35,7 @@ "# configure OSMnx\n", "ox.settings.log_console = True\n", "ox.settings.use_cache = True\n", - "ox.settings.cache_folder = '../../data/cache2'" + "ox.settings.cache_folder = \"../../data/cache2\"" ] }, { @@ -56,7 +55,7 @@ "source": [ "# create a study site: geocode city hall, convert coords to shapely geometry,\n", "# project geometry to UTM, buffer by 5km, project back to lat-lng\n", - "latlng_coords = ox.geocode('Los Angeles City Hall')\n", + "latlng_coords = ox.geocode(\"Los Angeles City Hall\")\n", "latlng_point = Point(latlng_coords[1], latlng_coords[0])\n", "latlng_point_proj, crs = ox.projection.project_geometry(latlng_point)\n", "polygon_proj = latlng_point_proj.buffer(5000)\n", @@ -72,8 +71,8 @@ "source": [ "# model the street network within study site\n", "# your parameterization makes assumptions about your interests here\n", - "G = ox.graph_from_polygon(polygon, network_type='drive', truncate_by_edge=True)\n", - "fig, ax = ox.plot_graph(G, node_size=0, edge_color='w', edge_linewidth=0.3)" + "G = ox.graph_from_polygon(polygon, network_type=\"drive\", truncate_by_edge=True)\n", + "fig, ax = ox.plot_graph(G, node_size=0, edge_color=\"w\", edge_linewidth=0.3)" ] }, { @@ -104,7 +103,7 @@ "outputs": [], "source": [ "# how many intersections does it contain?\n", - "street_counts = pd.Series(dict(G.nodes(data='street_count')))\n", + "street_counts = pd.Series(dict(G.nodes(data=\"street_count\")))\n", "intersect_count = len(street_counts[street_counts > 2])\n", "intersect_count" ] @@ -126,9 +125,9 @@ "outputs": [], "source": [ "# now clean up the intersections and re-calculate\n", - "clean_intersects = ox.consolidate_intersections(ox.project_graph(G),\n", - " rebuild_graph=False,\n", - " tolerance=10)\n", + 
"clean_intersects = ox.consolidate_intersections(\n", + " ox.project_graph(G), rebuild_graph=False, tolerance=10\n", + ")\n", "clean_intersect_count = len(clean_intersects)\n", "clean_intersect_count" ] @@ -158,7 +157,7 @@ "metadata": {}, "outputs": [], "source": [ - "od = pd.read_csv('../../data/od.csv').sample(1000)\n", + "od = pd.read_csv(\"../../data/od.csv\").sample(1000)\n", "od.shape" ] }, @@ -178,8 +177,8 @@ "outputs": [], "source": [ "# get home/work network nodes\n", - "home_nodes = ox.nearest_nodes(G, X=od['home_lng'], Y=od['home_lat'])\n", - "work_nodes = ox.nearest_nodes(G, X=od['work_lng'], Y=od['work_lat'])" + "home_nodes = ox.nearest_nodes(G, X=od[\"home_lng\"], Y=od[\"home_lat\"])\n", + "work_nodes = ox.nearest_nodes(G, X=od[\"work_lng\"], Y=od[\"work_lat\"])" ] }, { @@ -191,7 +190,10 @@ "outputs": [], "source": [ "# calculate each shortest path\n", - "paths = [ox.shortest_path(G, orig, dest, weight='travel_time') for orig, dest in zip(home_nodes, work_nodes)]\n", + "paths = [\n", + " ox.shortest_path(G, orig, dest, weight=\"travel_time\")\n", + " for orig, dest in zip(home_nodes, work_nodes)\n", + "]\n", "len(paths)" ] }, @@ -213,14 +215,16 @@ "outputs": [], "source": [ "# plot 100 routes\n", - "fig, ax = ox.plot_graph_routes(G,\n", - " routes=paths[0:100],\n", - " node_size=0,\n", - " edge_linewidth=0.2,\n", - " orig_dest_size=0,\n", - " route_colors='c',\n", - " route_linewidth=2,\n", - " route_alpha=0.2)" + "fig, ax = ox.plot_graph_routes(\n", + " G,\n", + " routes=paths[0:100],\n", + " node_size=0,\n", + " edge_linewidth=0.2,\n", + " orig_dest_size=0,\n", + " route_colors=\"c\",\n", + " route_linewidth=2,\n", + " route_alpha=0.2,\n", + ")" ] }, { @@ -249,18 +253,19 @@ "metadata": {}, "outputs": [], "source": [ - "def calc_efficiency(G, route, attr='length'):\n", + "def calc_efficiency(G, route, attr=\"length\"):\n", " # sum the edge lengths in the route\n", - " trip_distance = sum(ox.utils_graph.get_route_edge_attributes(G,\n", - " 
route=route,\n", - " attribute=attr))\n", + " trip_distance = sum(ox.utils_graph.get_route_edge_attributes(G, route=route, attribute=attr))\n", " # fast vectorized great-circle distance calculator\n", - " gc_distance = ox.distance.great_circle_vec(lat1=G.nodes[route[0]]['y'],\n", - " lng1=G.nodes[route[0]]['x'],\n", - " lat2=G.nodes[route[-1]]['y'],\n", - " lng2=G.nodes[route[-1]]['x'])\n", + " gc_distance = ox.distance.great_circle_vec(\n", + " lat1=G.nodes[route[0]][\"y\"],\n", + " lng1=G.nodes[route[0]][\"x\"],\n", + " lat2=G.nodes[route[-1]][\"y\"],\n", + " lng2=G.nodes[route[-1]][\"x\"],\n", + " )\n", " return gc_distance / trip_distance\n", "\n", + "\n", "# calculate each trip's efficiency and make a pandas series\n", "trip_efficiency = pd.Series([calc_efficiency(G, path) for path in paths])" ] @@ -326,9 +331,12 @@ "outputs": [], "source": [ "# get home/work network nodes again, calculate routes, drop nulls\n", - "home_nodes_per = ox.nearest_nodes(G_per, X=od['home_lng'], Y=od['home_lat'])\n", - "work_nodes_per = ox.nearest_nodes(G_per, X=od['work_lng'], Y=od['work_lat'])\n", - "paths_per = [ox.shortest_path(G_per, orig, dest, weight='travel_time') for orig, dest in zip(home_nodes_per, work_nodes_per)]\n", + "home_nodes_per = ox.nearest_nodes(G_per, X=od[\"home_lng\"], Y=od[\"home_lat\"])\n", + "work_nodes_per = ox.nearest_nodes(G_per, X=od[\"work_lng\"], Y=od[\"work_lat\"])\n", + "paths_per = [\n", + " ox.shortest_path(G_per, orig, dest, weight=\"travel_time\")\n", + " for orig, dest in zip(home_nodes_per, work_nodes_per)\n", + "]\n", "paths_per = [path for path in paths_per if path is not None]\n", "len(paths_per)" ] @@ -380,19 +388,21 @@ "# plot n routes apiece, before (cyan) and after (yellow) perturbation\n", "n = 100\n", "all_paths = paths[:n] + paths_per[:n]\n", - "colors = ['c'] * n + ['y'] * n\n", + "colors = [\"c\"] * n + [\"y\"] * n\n", "\n", "# shuffle the order, so you don't just plot new atop old\n", - "paths_colors = pd.DataFrame({'path': 
all_paths, 'color': colors}).sample(frac=1)\n", + "paths_colors = pd.DataFrame({\"path\": all_paths, \"color\": colors}).sample(frac=1)\n", "\n", - "fig, ax = ox.plot_graph_routes(G,\n", - " routes=paths_colors['path'],\n", - " node_size=0,\n", - " edge_linewidth=0.2,\n", - " orig_dest_size=0,\n", - " route_colors=paths_colors['color'],\n", - " route_linewidth=2,\n", - " route_alpha=0.3)" + "fig, ax = ox.plot_graph_routes(\n", + " G,\n", + " routes=paths_colors[\"path\"],\n", + " node_size=0,\n", + " edge_linewidth=0.2,\n", + " orig_dest_size=0,\n", + " route_colors=paths_colors[\"color\"],\n", + " route_linewidth=2,\n", + " route_alpha=0.3,\n", + ")" ] }, { @@ -435,7 +445,7 @@ "outputs": [], "source": [ "# study area within 1/2 mile of SF Civic Center\n", - "latlng_coords = ox.geocode('Civic Center, San Francisco, CA, USA')\n", + "latlng_coords = ox.geocode(\"Civic Center, San Francisco, CA, USA\")\n", "latlng_point = Point(latlng_coords[1], latlng_coords[0])\n", "latlng_point_proj, crs = ox.projection.project_geometry(latlng_point)\n", "polygon_proj = latlng_point_proj.buffer(800)\n", @@ -449,9 +459,9 @@ "outputs": [], "source": [ "# get the tracts that intersect the study area polygon\n", - "tracts = gpd.read_file('../../data/tl_2020_06_tract/').set_index('GEOID')\n", + "tracts = gpd.read_file(\"../../data/tl_2020_06_tract/\").set_index(\"GEOID\")\n", "mask = tracts.intersects(sf_polygon)\n", - "cols = ['ALAND', 'geometry']\n", + "cols = [\"ALAND\", \"geometry\"]\n", "sf_tracts = tracts.loc[mask, cols]\n", "sf_tracts.shape" ] @@ -508,10 +518,9 @@ "%%time\n", "# calculate clean intersection counts per tract\n", "intersect_counts = {}\n", - "for label, geom in zip(sf_tracts.index, sf_tracts['geometry']):\n", + "for label, geom in zip(sf_tracts.index, sf_tracts[\"geometry\"]):\n", " G_tmp = ox.graph_from_polygon(geom, custom_filter=custom_filter)\n", - " clean_intersects = ox.consolidate_intersections(ox.project_graph(G_tmp),\n", - " rebuild_graph=False)\n", + " 
clean_intersects = ox.consolidate_intersections(ox.project_graph(G_tmp), rebuild_graph=False)\n", " intersect_counts[label] = len(clean_intersects)" ] }, @@ -522,9 +531,9 @@ "outputs": [], "source": [ "# calculate intersection density per km^2\n", - "sf_tracts['intersect_count'] = pd.Series(intersect_counts)\n", - "sf_tracts['intersect_density'] = sf_tracts['intersect_count'] / (sf_tracts['ALAND'] / 1e6)\n", - "sf_tracts['intersect_density'].describe()" + "sf_tracts[\"intersect_count\"] = pd.Series(intersect_counts)\n", + "sf_tracts[\"intersect_density\"] = sf_tracts[\"intersect_count\"] / (sf_tracts[\"ALAND\"] / 1e6)\n", + "sf_tracts[\"intersect_density\"].describe()" ] }, { @@ -534,14 +543,15 @@ "outputs": [], "source": [ "# plot the tracts and the network\n", - "plt.style.use('dark_background')\n", + "plt.style.use(\"dark_background\")\n", "fig, ax = plt.subplots(figsize=(6, 6))\n", - "ax.axis('off')\n", - "ax.set_title('Intersection density (per km2)')\n", - "ax = sf_tracts.plot(ax=ax, column='intersect_density', cmap='Reds_r',\n", - " legend=True, legend_kwds={'shrink': 0.8})\n", - "fig, ax = ox.plot_graph(G_all, ax=ax, node_size=0, edge_color='#111111')\n", - "fig.savefig('map.png', dpi=300, facecolor='#111111', bbox_inches='tight')" + "ax.axis(\"off\")\n", + "ax.set_title(\"Intersection density (per km2)\")\n", + "ax = sf_tracts.plot(\n", + " ax=ax, column=\"intersect_density\", cmap=\"Reds_r\", legend=True, legend_kwds={\"shrink\": 0.8}\n", + ")\n", + "fig, ax = ox.plot_graph(G_all, ax=ax, node_size=0, edge_color=\"#111111\")\n", + "fig.savefig(\"map.png\", dpi=300, facecolor=\"#111111\", bbox_inches=\"tight\")" ] }, { @@ -593,8 +603,8 @@ "outputs": [], "source": [ "# model the walkable network within our original study site\n", - "G_walk = ox.graph_from_polygon(polygon, network_type='walk')\n", - "fig, ax = ox.plot_graph(G_walk, node_size=0, edge_color='w', edge_linewidth=0.3)" + "G_walk = ox.graph_from_polygon(polygon, network_type=\"walk\")\n", + "fig, 
ax = ox.plot_graph(G_walk, node_size=0, edge_color=\"w\", edge_linewidth=0.3)" ] }, { @@ -605,7 +615,7 @@ "source": [ "# set a uniform walking speed on every edge\n", "for u, v, data in G_walk.edges(data=True):\n", - " data['speed_kph'] = walk_speed\n", + " data[\"speed_kph\"] = walk_speed\n", "G_walk = ox.add_edge_travel_times(G_walk)" ] }, @@ -616,8 +626,8 @@ "outputs": [], "source": [ "# extract node/edge GeoDataFrames, retaining only necessary columns (for pandana)\n", - "nodes = ox.graph_to_gdfs(G_walk, edges=False)[['x', 'y']]\n", - "edges = ox.graph_to_gdfs(G_walk, nodes=False).reset_index()[['u', 'v', 'travel_time']]" + "nodes = ox.graph_to_gdfs(G_walk, edges=False)[[\"x\", \"y\"]]\n", + "edges = ox.graph_to_gdfs(G_walk, nodes=False).reset_index()[[\"u\", \"v\", \"travel_time\"]]" ] }, { @@ -628,12 +638,14 @@ "source": [ "# get all the \"fresh food\" stores on OSM within the study site\n", "# you could load any amenities DataFrame, but we'll get ours from OSM\n", - "tags = {'shop': ['grocery', 'greengrocer', 'supermarket']}\n", - "amenities = ox.geometries_from_bbox(north=nodes['y'].max(),\n", - " south=nodes['y'].min(),\n", - " east=nodes['x'].min(),\n", - " west=nodes['x'].max(),\n", - " tags=tags)\n", + "tags = {\"shop\": [\"grocery\", \"greengrocer\", \"supermarket\"]}\n", + "amenities = ox.geometries_from_bbox(\n", + " north=nodes[\"y\"].max(),\n", + " south=nodes[\"y\"].min(),\n", + " east=nodes[\"x\"].min(),\n", + " west=nodes[\"x\"].max(),\n", + " tags=tags,\n", + ")\n", "amenities.shape" ] }, @@ -644,11 +656,13 @@ "outputs": [], "source": [ "# construct the pandana network model\n", - "network = pandana.Network(node_x=nodes['x'],\n", - " node_y=nodes['y'], \n", - " edge_from=edges['u'],\n", - " edge_to=edges['v'],\n", - " edge_weights=edges[['travel_time']])" + "network = pandana.Network(\n", + " node_x=nodes[\"x\"],\n", + " node_y=nodes[\"y\"],\n", + " edge_from=edges[\"u\"],\n", + " edge_to=edges[\"v\"],\n", + " 
edge_weights=edges[[\"travel_time\"]],\n", + ")" ] }, { @@ -670,11 +684,9 @@ "# specify a max travel distance for this analysis\n", "# then set the amenities' locations on the network\n", "maxdist = walk_time * 60 # minutes -> seconds, to match travel_time units\n", - "network.set_pois(category='grocery',\n", - " maxdist=maxdist,\n", - " maxitems=3,\n", - " x_col=centroids.x, \n", - " y_col=centroids.y)" + "network.set_pois(\n", + " category=\"grocery\", maxdist=maxdist, maxitems=3, x_col=centroids.x, y_col=centroids.y\n", + ")" ] }, { @@ -684,9 +696,7 @@ "outputs": [], "source": [ "# calculate travel time to nearest amenity from each node in network\n", - "distances = network.nearest_pois(distance=maxdist,\n", - " category='grocery',\n", - " num_pois=3)\n", + "distances = network.nearest_pois(distance=maxdist, category=\"grocery\", num_pois=3)\n", "distances.astype(int).head()" ] }, @@ -697,17 +707,14 @@ "outputs": [], "source": [ "# plot distance to nearest amenity\n", - "fig, ax = ox.plot_graph(G_walk, node_size=0, edge_linewidth=0.1,\n", - " edge_color='gray', show=False, close=False)\n", + "fig, ax = ox.plot_graph(\n", + " G_walk, node_size=0, edge_linewidth=0.1, edge_color=\"gray\", show=False, close=False\n", + ")\n", "\n", - "sc = ax.scatter(x=nodes['x'],\n", - " y=nodes['y'], \n", - " c=distances[1],\n", - " s=1,\n", - " cmap='inferno_r')\n", + "sc = ax.scatter(x=nodes[\"x\"], y=nodes[\"y\"], c=distances[1], s=1, cmap=\"inferno_r\")\n", "\n", - "ax.set_title(f'Walking time to nearest grocery store')\n", - "plt.colorbar(sc, shrink=0.7).outline.set_edgecolor('none')" + "ax.set_title(\"Walking time to nearest grocery store\")\n", + "plt.colorbar(sc, shrink=0.7).outline.set_edgecolor(\"none\")" ] }, { @@ -725,14 +732,11 @@ "source": [ "# set a variable on the network, using the amenities' nodes\n", "node_ids = network.get_node_ids(centroids.x, centroids.y)\n", - "network.set(node_ids, name='grocery')\n", + "network.set(node_ids, name=\"grocery\")\n", "\n", "# 
aggregate the variable to all the nodes in the network\n", "# when counting, the decay doesn't matter (but would for summing)\n", - "access = network.aggregate(distance=maxdist,\n", - " type='count',\n", - " decay='linear',\n", - " name='grocery')\n", + "access = network.aggregate(distance=maxdist, type=\"count\", decay=\"linear\", name=\"grocery\")\n", "\n", "# let's cap it at 5, assuming no further utility from a larger choice set\n", "access = access.clip(upper=5)\n", @@ -746,17 +750,14 @@ "outputs": [], "source": [ "# plot amenity count within your walking horizon\n", - "fig, ax = ox.plot_graph(G_walk, node_size=0, edge_linewidth=0.1,\n", - " edge_color='gray', show=False, close=False)\n", + "fig, ax = ox.plot_graph(\n", + " G_walk, node_size=0, edge_linewidth=0.1, edge_color=\"gray\", show=False, close=False\n", + ")\n", "\n", - "sc = ax.scatter(x=nodes['x'],\n", - " y=nodes['y'], \n", - " c=access,\n", - " s=1,\n", - " cmap='inferno')\n", + "sc = ax.scatter(x=nodes[\"x\"], y=nodes[\"y\"], c=access, s=1, cmap=\"inferno\")\n", "\n", - "ax.set_title(f'Grocery stores within a {walk_time} minute walk')\n", - "plt.colorbar(sc, shrink=0.7).outline.set_edgecolor('none')" + "ax.set_title(f\"Grocery stores within a {walk_time} minute walk\")\n", + "plt.colorbar(sc, shrink=0.7).outline.set_edgecolor(\"none\")" ] }, { @@ -776,9 +777,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python (ppd599)", + "display_name": "Python (ppde642)", "language": "python", - "name": "ppd599" + "name": "ppde642" }, "language_info": { "codemirror_mode": { @@ -790,7 +791,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/modules/08-urban-networks-ii/process-lodes.ipynb b/modules/08-urban-networks-ii/process-lodes.ipynb index 3f1c222..f4360d5 100644 --- a/modules/08-urban-networks-ii/process-lodes.ipynb +++ b/modules/08-urban-networks-ii/process-lodes.ipynb @@ -41,7 +41,7 
@@ "outputs": [], "source": [ "# create a study site (copied code from lecture notebook)\n", - "latlng_coords = ox.geocode('Los Angeles City Hall')\n", + "latlng_coords = ox.geocode(\"Los Angeles City Hall\")\n", "latlng_point = Point(latlng_coords[1], latlng_coords[0])\n", "latlng_point_proj, crs = ox.projection.project_geometry(latlng_point)\n", "polygon_proj = latlng_point_proj.buffer(5000)\n", @@ -56,7 +56,7 @@ "outputs": [], "source": [ "# load CA census blocks shapefile\n", - "gdf_blocks = gpd.read_file('../../data/tl_2020_06_tabblock20/').set_index('GEOID20')\n", + "gdf_blocks = gpd.read_file(\"../../data/tl_2020_06_tabblock20/\").set_index(\"GEOID20\")\n", "gdf_blocks.shape" ] }, @@ -88,8 +88,9 @@ "outputs": [], "source": [ "# load CA LODES\n", - "df = pd.read_csv('../../data/lodes/ca_od_main_JT00_2018.csv',\n", - " dtype={'w_geocode': str, 'h_geocode': str})\n", + "df = pd.read_csv(\n", + " \"../../data/lodes/ca_od_main_JT00_2018.csv\", dtype={\"w_geocode\": str, \"h_geocode\": str}\n", + ")\n", "df.shape" ] }, @@ -101,8 +102,8 @@ "outputs": [], "source": [ "# get all LODES rows in study site blocks\n", - "mask_h = df['h_geocode'].isin(study_site_blocks.index)\n", - "mask_w = df['w_geocode'].isin(study_site_blocks.index)\n", + "mask_h = df[\"h_geocode\"].isin(study_site_blocks.index)\n", + "mask_w = df[\"w_geocode\"].isin(study_site_blocks.index)\n", "study_site_lodes = df[mask_h & mask_w]\n", "study_site_lodes.shape" ] @@ -123,8 +124,8 @@ "outputs": [], "source": [ "# retain the columns we want\n", - "block_trips = study_site_lodes[['h_geocode', 'w_geocode', 'S000']]\n", - "block_latlng = study_site_blocks[['INTPTLAT20', 'INTPTLON20']].astype(float)" + "block_trips = study_site_lodes[[\"h_geocode\", \"w_geocode\", \"S000\"]]\n", + "block_latlng = study_site_blocks[[\"INTPTLAT20\", \"INTPTLON20\"]].astype(float)" ] }, { @@ -135,19 +136,19 @@ "outputs": [], "source": [ "# merge in home lat-lng\n", - "od = pd.merge(left=block_trips,\n", - " 
right=block_latlng,\n", - " left_on='h_geocode',\n", - " right_index=True,\n", - " how='inner')\n", + "od = pd.merge(\n", + " left=block_trips, right=block_latlng, left_on=\"h_geocode\", right_index=True, how=\"inner\"\n", + ")\n", "\n", "# merge in work lat-lng\n", - "od = pd.merge(left=od,\n", - " right=block_latlng,\n", - " left_on='w_geocode',\n", - " right_index=True,\n", - " suffixes=['_home', '_work'],\n", - " how='inner')\n", + "od = pd.merge(\n", + " left=od,\n", + " right=block_latlng,\n", + " left_on=\"w_geocode\",\n", + " right_index=True,\n", + " suffixes=[\"_home\", \"_work\"],\n", + " how=\"inner\",\n", + ")\n", "od.shape" ] }, @@ -159,16 +160,18 @@ "outputs": [], "source": [ "# rename columns then save to disk\n", - "cols = {'h_geocode': 'home_block',\n", - " 'w_geocode': 'work_block',\n", - " 'S000': 'job_count',\n", - " 'INTPTLAT20_home': 'home_lat',\n", - " 'INTPTLON20_home': 'home_lng',\n", - " 'INTPTLAT20_work': 'work_lat',\n", - " 'INTPTLON20_work': 'work_lng'}\n", + "cols = {\n", + " \"h_geocode\": \"home_block\",\n", + " \"w_geocode\": \"work_block\",\n", + " \"S000\": \"job_count\",\n", + " \"INTPTLAT20_home\": \"home_lat\",\n", + " \"INTPTLON20_home\": \"home_lng\",\n", + " \"INTPTLAT20_work\": \"work_lat\",\n", + " \"INTPTLON20_work\": \"work_lng\",\n", + "}\n", "\n", "od = od.rename(columns=cols)\n", - "od.to_csv('../../data/od.csv', index=False)" + "od.to_csv(\"../../data/od.csv\", index=False)" ] }, { @@ -185,9 +188,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python (ppd599)", + "display_name": "Python (ppde642)", "language": "python", - "name": "ppd599" + "name": "ppde642" }, "language_info": { "codemirror_mode": { @@ -199,7 +202,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.10" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/modules/09-spatial-analysis/lecture.ipynb b/modules/09-spatial-analysis/lecture.ipynb index db6ea1d..d6ef038 100644 --- 
a/modules/09-spatial-analysis/lecture.ipynb +++ b/modules/09-spatial-analysis/lecture.ipynb @@ -61,8 +61,8 @@ "outputs": [], "source": [ "# load CA tracts, display shape\n", - "tracts_ca = gpd.read_file('../../data/tl_2017_06_tract/')\n", - "tracts_ca = tracts_ca.set_index('GEOID')\n", + "tracts_ca = gpd.read_file(\"../../data/tl_2017_06_tract/\")\n", + "tracts_ca = tracts_ca.set_index(\"GEOID\")\n", "tracts_ca.shape" ] }, @@ -93,7 +93,7 @@ "outputs": [], "source": [ "# retain LA county only (and drop channel island tracts)\n", - "tracts_ca = tracts_ca[tracts_ca['COUNTYFP']=='037'].drop(index=['06037599100', '06037599000'])\n", + "tracts_ca = tracts_ca[tracts_ca[\"COUNTYFP\"] == \"037\"].drop(index=[\"06037599100\", \"06037599000\"])\n", "tracts_ca.shape" ] }, @@ -104,7 +104,7 @@ "outputs": [], "source": [ "# project spatial geometries to a meter-based projection for SoCal\n", - "crs = '+proj=utm +zone=11 +ellps=WGS84 +datum=WGS84 +units=m +no_defs'\n", + "crs = \"+proj=utm +zone=11 +ellps=WGS84 +datum=WGS84 +units=m +no_defs\"\n", "tracts_ca = tracts_ca.to_crs(crs)" ] }, @@ -122,7 +122,9 @@ "outputs": [], "source": [ "# load CA tract-level census variables\n", - "df_census = pd.read_csv('../../data/census_tracts_data_ca.csv', dtype={'GEOID10':str}).set_index('GEOID10')\n", + "df_census = pd.read_csv(\"../../data/census_tracts_data_ca.csv\", dtype={\"GEOID10\": str}).set_index(\n", + " \"GEOID10\"\n", + ")\n", "df_census.shape" ] }, @@ -158,7 +160,7 @@ "outputs": [], "source": [ "# merge tract geometries with census variables\n", - "tracts = tracts_ca.merge(df_census, left_index=True, right_index=True, how='left')\n", + "tracts = tracts_ca.merge(df_census, left_index=True, right_index=True, how=\"left\")\n", "tracts.shape" ] }, @@ -170,7 +172,7 @@ "source": [ "# calculate pop density in persons per sq km\n", "# turn any infinities into nulls\n", - "tracts['pop_density'] = tracts['total_pop'] / (tracts['ALAND'] / 1e6)\n", + "tracts[\"pop_density\"] = 
tracts[\"total_pop\"] / (tracts[\"ALAND\"] / 1e6)\n", "tracts = tracts.replace([np.inf, -np.inf], np.nan)" ] }, @@ -199,7 +201,7 @@ "outputs": [], "source": [ "# descriptive stats\n", - "tracts['med_household_income'].describe()" + "tracts[\"med_household_income\"].describe()" ] }, { @@ -209,7 +211,7 @@ "outputs": [], "source": [ "# descriptive stats\n", - "tracts['pop_density'].describe()" + "tracts[\"pop_density\"].describe()" ] }, { @@ -220,8 +222,8 @@ "source": [ "# inspect these variables' statistical distributions\n", "fig, axes = plt.subplots(1, 2, figsize=(12, 2))\n", - "ax1 = sns.boxplot(ax=axes[0], x=tracts['med_household_income'])\n", - "ax2 = sns.boxplot(ax=axes[1], x=tracts['pop_density'])" + "ax1 = sns.boxplot(ax=axes[0], x=tracts[\"med_household_income\"])\n", + "ax2 = sns.boxplot(ax=axes[1], x=tracts[\"pop_density\"])" ] }, { @@ -231,16 +233,18 @@ "outputs": [], "source": [ "# map a couple variables to inspect their spatial distributions\n", - "cols = ['pop_density', 'med_household_income']\n", + "cols = [\"pop_density\", \"med_household_income\"]\n", "for col in cols:\n", - " ax = tracts.dropna(subset=[col]).plot(column=col,\n", - " scheme='NaturalBreaks',\n", - " cmap='plasma',\n", - " figsize=(4, 4),\n", - " legend=True,\n", - " legend_kwds={'bbox_to_anchor': (1.7, 1)})\n", + " ax = tracts.dropna(subset=[col]).plot(\n", + " column=col,\n", + " scheme=\"NaturalBreaks\",\n", + " cmap=\"plasma\",\n", + " figsize=(4, 4),\n", + " legend=True,\n", + " legend_kwds={\"bbox_to_anchor\": (1.7, 1)},\n", + " )\n", " ax.set_title(col)\n", - " _ = ax.axis('off')" + " _ = ax.axis(\"off\")" ] }, { @@ -259,10 +263,9 @@ "outputs": [], "source": [ "# calculate correlation coefficient and p-value\n", - "subset = tracts.dropna(subset=['pop_density', 'med_household_income'])\n", - "r, p = stats.pearsonr(x=subset['pop_density'],\n", - " y=subset['med_household_income'])\n", - "print('r={:.4f}, p={:.4f}'.format(r, p))" + "subset = tracts.dropna(subset=[\"pop_density\", 
\"med_household_income\"])\n", + "r, p = stats.pearsonr(x=subset[\"pop_density\"], y=subset[\"med_household_income\"])\n", + "print(\"r={:.4f}, p={:.4f}\".format(r, p))" ] }, { @@ -273,9 +276,7 @@ "source": [ "# quick and dirty scatter plot with matplotlib\n", "fig, ax = plt.subplots()\n", - "sc = ax.scatter(x=subset['pop_density'],\n", - " y=subset['med_household_income'],\n", - " s=1)" + "sc = ax.scatter(x=subset[\"pop_density\"], y=subset[\"med_household_income\"], s=1)" ] }, { @@ -286,9 +287,8 @@ "source": [ "# estimate a simple linear regression model with scipy\n", "# what if you log transform your variables first?\n", - "m, b, r, p, se = stats.linregress(x=subset['pop_density'],\n", - " y=subset['med_household_income'])\n", - "print(f'm={m:.4f}, b={b:.4f}, r^2={r**2:.4f}, p={p:.4f}')" + "m, b, r, p, se = stats.linregress(x=subset[\"pop_density\"], y=subset[\"med_household_income\"])\n", + "print(f\"m={m:.4f}, b={b:.4f}, r^2={r**2:.4f}, p={p:.4f}\")" ] }, { @@ -461,24 +461,24 @@ "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(6, 6))\n", - "tracts.plot(ax=ax, facecolor='#666666', edgecolor='w', linewidth=0.5)\n", + "tracts.plot(ax=ax, facecolor=\"#666666\", edgecolor=\"w\", linewidth=0.5)\n", "\n", "# plot some tract of interest in red\n", "tract = tracts.loc[[label]]\n", - "tract.plot(ax=ax, facecolor='#ff0000', edgecolor='w', linewidth=2)\n", + "tract.plot(ax=ax, facecolor=\"#ff0000\", edgecolor=\"w\", linewidth=2)\n", "\n", "# plot the neighbors in blue\n", "neighbors = tracts.loc[w_queen[label].keys()]\n", - "neighbors.plot(ax=ax, facecolor='#0033cc', edgecolor='w', linewidth=2)\n", + "neighbors.plot(ax=ax, facecolor=\"#0033cc\", edgecolor=\"w\", linewidth=2)\n", "\n", "# zoom to area of interest\n", "xmin, ymin, xmax, ymax = neighbors.unary_union.bounds\n", - "ax.axis('equal')\n", - "ax.set_xlim(xmin-100, xmax+100) # +/- 100 meters\n", + "ax.axis(\"equal\")\n", + "ax.set_xlim(xmin - 100, xmax + 100) # +/- 100 meters\n", "ax.set_ylim(ymin, 
ymax)\n", "\n", - "ax.set_title('Neighbors of tract {}'.format(label))\n", - "_ = ax.axis('off')" + "ax.set_title(\"Neighbors of tract {}\".format(label))\n", + "_ = ax.axis(\"off\")" ] }, { @@ -489,17 +489,17 @@ "source": [ "%%time\n", "# draw a queen-contiguity graph of the tracts\n", - "fig, ax = plt.subplots(figsize=(12, 12), facecolor='#111111')\n", - "tracts.plot(ax=ax, facecolor='#333333', edgecolor='k', linewidth=0.3)\n", + "fig, ax = plt.subplots(figsize=(12, 12), facecolor=\"#111111\")\n", + "tracts.plot(ax=ax, facecolor=\"#333333\", edgecolor=\"k\", linewidth=0.3)\n", "\n", "# extract centroids of tract and its neighbors, then draw lines between them\n", "for tract, neighbors in w_queen:\n", - " tract_centroid = tracts.loc[tract, 'geometry'].centroid\n", - " for neighbor_centroid in tracts.loc[neighbors.keys(), 'geometry'].centroid:\n", + " tract_centroid = tracts.loc[tract, \"geometry\"].centroid\n", + " for neighbor_centroid in tracts.loc[neighbors.keys(), \"geometry\"].centroid:\n", " Xs = [tract_centroid.x, neighbor_centroid.x]\n", " Ys = [tract_centroid.y, neighbor_centroid.y]\n", - " ax.plot(Xs, Ys, color='r', linewidth=0.3)\n", - "_ = ax.axis('off')" + " ax.plot(Xs, Ys, color=\"r\", linewidth=0.3)\n", + "_ = ax.axis(\"off\")" ] }, { @@ -564,10 +564,9 @@ "outputs": [], "source": [ "# calculate linear decay continuous weights\n", - "w_dist = weights.distance.DistanceBand.from_dataframe(tracts,\n", - " threshold=threshold,\n", - " binary=False,\n", - " alpha=-1)" + "w_dist = weights.distance.DistanceBand.from_dataframe(\n", + " tracts, threshold=threshold, binary=False, alpha=-1\n", + ")" ] }, { @@ -588,31 +587,33 @@ "source": [ "# map the neighbors, colored by weight from nearest to furthest\n", "fig, ax = plt.subplots(figsize=(6, 6))\n", - "tracts.plot(ax=ax, facecolor='#333333', edgecolor='gray', linewidth=0.1)\n", + "tracts.plot(ax=ax, facecolor=\"#333333\", edgecolor=\"gray\", linewidth=0.1)\n", "\n", "# get the tract of interest and its 
neighbors/weights\n", "tract = tracts.loc[[label]]\n", "nweights = pd.Series(w_dist[label])\n", - "neighbors = tracts.loc[nweights.index, ['geometry']]\n", - "neighbors['weights_scaled'] = nweights\n", + "neighbors = tracts.loc[nweights.index, [\"geometry\"]]\n", + "neighbors[\"weights_scaled\"] = nweights\n", "\n", "# plot the tract's neighbors in blues by weight\n", - "neighbors.plot(ax=ax,\n", - " column='weights_scaled',\n", - " cmap='Blues_r',\n", - " edgecolor='gray',\n", - " linewidth=0.3,\n", - " scheme='NaturalBreaks')\n", + "neighbors.plot(\n", + " ax=ax,\n", + " column=\"weights_scaled\",\n", + " cmap=\"Blues_r\",\n", + " edgecolor=\"gray\",\n", + " linewidth=0.3,\n", + " scheme=\"NaturalBreaks\",\n", + ")\n", "\n", "# plot the tract of interest in red\n", - "tract.plot(ax=ax, facecolor='r', edgecolor='r', linewidth=0.1)\n", + "tract.plot(ax=ax, facecolor=\"r\", edgecolor=\"r\", linewidth=0.1)\n", "\n", "# zoom to area of interest\n", "xmin, ymin, xmax, ymax = neighbors.unary_union.bounds\n", "ax.set_xlim(xmin, xmax)\n", "ax.set_ylim(ymin, ymax)\n", - "ax.set_title('Neighbors of tract {}'.format(label))\n", - "_ = ax.axis('off')" + "ax.set_title(\"Neighbors of tract {}\".format(label))\n", + "_ = ax.axis(\"off\")" ] }, { @@ -662,7 +663,7 @@ "outputs": [], "source": [ "# transform the queen weights\n", - "w_queen.set_transform('R')\n", + "w_queen.set_transform(\"R\")\n", "w_queen[label]" ] }, @@ -673,8 +674,8 @@ "outputs": [], "source": [ "# transform the linear-decay distance-based weights\n", - "w_dist.set_transform('R')\n", - "#w_dist[label]" + "w_dist.set_transform(\"R\")\n", + "# w_dist[label]" ] }, { @@ -698,12 +699,12 @@ "outputs": [], "source": [ "# save your matrix to disk\n", - "f = io.open('tracts_queen.gal', 'w')\n", + "f = io.open(\"tracts_queen.gal\", \"w\")\n", "f.write(w_queen)\n", "f.close()\n", "\n", "# read a matrix from disk (notice its transformation)\n", - "w_queen = io.open('tracts_queen.gal', 'r').read()\n", + "w_queen = 
io.open(\"tracts_queen.gal\", \"r\").read()\n", "w_queen[label]" ] }, @@ -730,7 +731,7 @@ "outputs": [], "source": [ "# how many tracts are missing values for this variable?\n", - "col = 'med_household_income'\n", + "col = \"med_household_income\"\n", "nulls = tracts[pd.isnull(tracts[col])].index\n", "len(nulls)" ] @@ -812,8 +813,8 @@ "outputs": [], "source": [ "# pick a variable to investigate and drop null rows\n", - "col = 'med_household_income'\n", - "tracts_not_null = tracts[[col, 'geometry']].dropna()\n", + "col = \"med_household_income\"\n", + "tracts_not_null = tracts[[col, \"geometry\"]].dropna()\n", "y = tracts_not_null[col]" ] }, @@ -825,7 +826,7 @@ "source": [ "# recompute spatial weights for just these observations then row-standardize\n", "w_queen = weights.Queen.from_dataframe(tracts_not_null)\n", - "w_queen.set_transform('R')" + "w_queen.set_transform(\"R\")" ] }, { @@ -845,8 +846,8 @@ "outputs": [], "source": [ "# is a tract's med income similar to those of its neighbors?\n", - "col_lag = f'{col}_lag'\n", - "data_lag = pd.DataFrame(data={col:y, col_lag:y_lag}).astype(int)\n", + "col_lag = f\"{col}_lag\"\n", + "data_lag = pd.DataFrame(data={col: y, col_lag: y_lag}).astype(int)\n", "data_lag" ] }, @@ -950,7 +951,7 @@ "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(6, 6))\n", - "sns.regplot(x=col, y=col_lag, data=data_lag, scatter_kws={'s':1, 'color':'gray'})\n", + "sns.regplot(x=col, y=col_lag, data=data_lag, scatter_kws={\"s\": 1, \"color\": \"gray\"})\n", "plt.show()" ] }, @@ -981,9 +982,7 @@ "outputs": [], "source": [ "# compute spatial lag of standardized values and save as series with same index\n", - "y_std_lag = pd.Series(weights.lag_spatial(w_queen, y_std),\n", - " index=y_std.index,\n", - " name=col_lag)\n", + "y_std_lag = pd.Series(weights.lag_spatial(w_queen, y_std), index=y_std.index, name=col_lag)\n", "y_std_lag" ] }, @@ -995,7 +994,7 @@ "source": [ "# estimate a simple linear regression model\n", "m, b, r, p, se = 
stats.linregress(x=y_std, y=y_std_lag)\n", - "print('m={:.4f}, b={:.4f}, r^2={:.4f}, p={:.4f}'.format(m, b, r ** 2, p))" + "print(\"m={:.4f}, b={:.4f}, r^2={:.4f}, p={:.4f}\".format(m, b, r**2, p))" ] }, { @@ -1016,11 +1015,11 @@ "source": [ "# standardized moran's plot\n", "fig, ax = plt.subplots(figsize=(4, 4))\n", - "ax.scatter(x=y_std, y=y_std_lag, s=1, color='gray')\n", + "ax.scatter(x=y_std, y=y_std_lag, s=1, color=\"gray\")\n", "\n", "# draw quadrants and ignore outliers beyond 3 std devs (99.7% of distribution)\n", - "plt.axvline(0, c='k', alpha=0.5)\n", - "plt.axhline(0, c='k', alpha=0.5)\n", + "plt.axvline(0, c=\"k\", alpha=0.5)\n", + "plt.axhline(0, c=\"k\", alpha=0.5)\n", "ax.set_xlim(-3, 3)\n", "ax.set_ylim(-3, 3)\n", "\n", @@ -1066,19 +1065,19 @@ "source": [ "# standardized moran's plot again, from above, but labeled this time\n", "fig, ax = plt.subplots(figsize=(6, 6))\n", - "ax.scatter(x=y_std, y=y_std_lag, s=1, color='gray')\n", + "ax.scatter(x=y_std, y=y_std_lag, s=1, color=\"gray\")\n", "\n", "# draw quadrants and ignore outliers beyond 3 std devs\n", - "plt.axvline(0, c='k', alpha=0.5)\n", - "plt.axhline(0, c='k', alpha=0.5)\n", + "plt.axvline(0, c=\"k\", alpha=0.5)\n", + "plt.axhline(0, c=\"k\", alpha=0.5)\n", "ax.set_xlim(-3, 3)\n", "ax.set_ylim(-3, 3)\n", "\n", "# label the quadrants\n", - "ax.text(1.25, 1.25, 'HH', fontsize=30)\n", - "ax.text(1.25, -1.75, 'HL', fontsize=30)\n", - "ax.text(-1.75, 1.25, 'LH', fontsize=30)\n", - "ax.text(-1.75, -1.75, 'LL', fontsize=30)\n", + "ax.text(1.25, 1.25, \"HH\", fontsize=30)\n", + "ax.text(1.25, -1.75, \"HL\", fontsize=30)\n", + "ax.text(-1.75, 1.25, \"LH\", fontsize=30)\n", + "ax.text(-1.75, -1.75, \"LL\", fontsize=30)\n", "\n", "# draw a line with moran's I as the slope\n", "Xs = pd.Series([-3, 3])\n", @@ -1114,8 +1113,8 @@ "source": [ "# identify whether each observation is significant or not\n", "# p-value interpretation same as earlier with moran's I\n", - "data_lag['significant'] = lisa.p_sim < 
alpha\n", - "data_lag['significant'].value_counts()" + "data_lag[\"significant\"] = lisa.p_sim < alpha\n", + "data_lag[\"significant\"].value_counts()" ] }, { @@ -1125,9 +1124,9 @@ "outputs": [], "source": [ "# identify the quadrant each observation belongs to\n", - "data_lag['quadrant'] = lisa.q\n", - "data_lag['quadrant'] = data_lag['quadrant'].replace({1:'HH', 2:'LH', 3:'LL', 4:'HL'})\n", - "data_lag['quadrant'].sort_values().value_counts()" + "data_lag[\"quadrant\"] = lisa.q\n", + "data_lag[\"quadrant\"] = data_lag[\"quadrant\"].replace({1: \"HH\", 2: \"LH\", 3: \"LL\", 4: \"HL\"})\n", + "data_lag[\"quadrant\"].sort_values().value_counts()" ] }, { @@ -1156,18 +1155,18 @@ "fig, ax = plt.subplots(figsize=(9, 9))\n", "\n", "# merge original tracts and LISA quadrants data together, plot tracts basemap\n", - "tracts_lisa = tracts.merge(data_lag, how='left', left_index=True, right_index=True)\n", - "tracts_lisa.plot(ax=ax, facecolor='#999999', edgecolor='k', linewidth=0.1)\n", + "tracts_lisa = tracts.merge(data_lag, how=\"left\", left_index=True, right_index=True)\n", + "tracts_lisa.plot(ax=ax, facecolor=\"#999999\", edgecolor=\"k\", linewidth=0.1)\n", "\n", "# plot each quandrant's tracts (if significant LISA) in a different color\n", - "quadrant_colors = {'HH':'r', 'LL':'b', 'LH':'skyblue', 'HL':'pink'}\n", + "quadrant_colors = {\"HH\": \"r\", \"LL\": \"b\", \"LH\": \"skyblue\", \"HL\": \"pink\"}\n", "for q, c in quadrant_colors.items():\n", - " mask = tracts_lisa['significant'] & (tracts_lisa['quadrant']==q)\n", + " mask = tracts_lisa[\"significant\"] & (tracts_lisa[\"quadrant\"] == q)\n", " rows = tracts_lisa.loc[mask]\n", - " rows.plot(ax=ax, color=c, edgecolor='k', linewidth=0.1)\n", + " rows.plot(ax=ax, color=c, edgecolor=\"k\", linewidth=0.1)\n", "\n", - "ax.axis('off')\n", - "fig.savefig('clusters.png', dpi=600, bbox_inches='tight')" + "ax.axis(\"off\")\n", + "fig.savefig(\"clusters.png\", dpi=600, bbox_inches=\"tight\")" ] }, { @@ -1204,9 +1203,9 @@ ], 
"metadata": { "kernelspec": { - "display_name": "Python (ppd599)", + "display_name": "Python (ppde642)", "language": "python", - "name": "ppd599" + "name": "ppde642" }, "language_info": { "codemirror_mode": { @@ -1218,7 +1217,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/modules/10-spatial-models/difference-in-differences.ipynb b/modules/10-spatial-models/difference-in-differences.ipynb index 6e47435..488cdcb 100644 --- a/modules/10-spatial-models/difference-in-differences.ipynb +++ b/modules/10-spatial-models/difference-in-differences.ipynb @@ -23,7 +23,7 @@ "import pandas as pd\n", "import statsmodels.api as sm\n", "\n", - "np.random.seed(0) #for recomputability" + "np.random.seed(0) # for recomputability" ] }, { @@ -79,22 +79,18 @@ "metadata": {}, "outputs": [], "source": [ - "df_treat_pre = pd.DataFrame(data={'tract_id': treat_tract_ids,\n", - " 'tract_median_rent_sqft': treat_pre,\n", - " 'time': 0,\n", - " 'group': 1})\n", - "df_treat_pst = pd.DataFrame(data={'tract_id': treat_tract_ids,\n", - " 'tract_median_rent_sqft': treat_pst,\n", - " 'time': 1,\n", - " 'group': 1})\n", - "df_cntrl_pre = pd.DataFrame(data={'tract_id': cntrl_tract_ids,\n", - " 'tract_median_rent_sqft': cntrl_pre,\n", - " 'time': 0,\n", - " 'group': 0})\n", - "df_cntrl_pst = pd.DataFrame(data={'tract_id': cntrl_tract_ids,\n", - " 'tract_median_rent_sqft': cntrl_pst,\n", - " 'time': 1,\n", - " 'group': 0})\n", + "df_treat_pre = pd.DataFrame(\n", + " data={\"tract_id\": treat_tract_ids, \"tract_median_rent_sqft\": treat_pre, \"time\": 0, \"group\": 1}\n", + ")\n", + "df_treat_pst = pd.DataFrame(\n", + " data={\"tract_id\": treat_tract_ids, \"tract_median_rent_sqft\": treat_pst, \"time\": 1, \"group\": 1}\n", + ")\n", + "df_cntrl_pre = pd.DataFrame(\n", + " data={\"tract_id\": cntrl_tract_ids, \"tract_median_rent_sqft\": cntrl_pre, \"time\": 0, \"group\": 0}\n", + ")\n", + 
"df_cntrl_pst = pd.DataFrame(\n", + " data={\"tract_id\": cntrl_tract_ids, \"tract_median_rent_sqft\": cntrl_pst, \"time\": 1, \"group\": 0}\n", + ")\n", "df = pd.concat([df_treat_pre, df_treat_pst, df_cntrl_pre, df_cntrl_pst]).reset_index(drop=True)" ] }, @@ -106,11 +102,11 @@ "source": [ "# create our DiD interaction dummy variable of interest:\n", "# 1 if is treatment group AND post-event, otherwise 0\n", - "df['post_treatment'] = df['time'] * df['group']\n", + "df[\"post_treatment\"] = df[\"time\"] * df[\"group\"]\n", "\n", "# add a couple random covariates\n", - "df['num_bedrooms'] = np.random.normal(loc=2, scale=0.3, size=len(df))\n", - "df['dist_to_transit'] = np.random.normal(loc=500, scale=200, size=len(df))" + "df[\"num_bedrooms\"] = np.random.normal(loc=2, scale=0.3, size=len(df))\n", + "df[\"dist_to_transit\"] = np.random.normal(loc=500, scale=200, size=len(df))" ] }, { @@ -140,11 +136,11 @@ "outputs": [], "source": [ "# choose a response and predictors\n", - "response = 'tract_median_rent_sqft'\n", - "predictors = ['time', 'group', 'post_treatment']\n", + "response = \"tract_median_rent_sqft\"\n", + "predictors = [\"time\", \"group\", \"post_treatment\"]\n", "\n", "# filter full dataset to retain only these columns and only rows without nulls in these columns\n", - "data = df[[response] + predictors].dropna()\n", + "data = df[[response] + predictors].dropna()\n", "\n", "# create design matrix and response vector\n", "X = data[predictors]\n", @@ -170,14 +166,16 @@ "outputs": [], "source": [ "# slice the dataset up into the 4 groups (pre/post, treatment/control)\n", - "pre_treat = ~df['time'].astype(bool) & df['group'].astype(bool)\n", - "pst_treat = df['time'].astype(bool) & df['group'].astype(bool)\n", - "pre_cntrl = ~df['time'].astype(bool) & ~df['group'].astype(bool)\n", - "pst_cntrl = df['time'].astype(bool) & ~df['group'].astype(bool)\n", + "pre_treat = ~df[\"time\"].astype(bool) & df[\"group\"].astype(bool)\n", + "pst_treat = 
df[\"time\"].astype(bool) & df[\"group\"].astype(bool)\n", + "pre_cntrl = ~df[\"time\"].astype(bool) & ~df[\"group\"].astype(bool)\n", + "pst_cntrl = df[\"time\"].astype(bool) & ~df[\"group\"].astype(bool)\n", "\n", "# then subtract their means\n", - "col = df['tract_median_rent_sqft']\n", - "did = (col[pst_treat].mean() - col[pre_treat].mean()) - (col[pst_cntrl].mean() - col[pre_cntrl].mean())\n", + "col = df[\"tract_median_rent_sqft\"]\n", + "did = (col[pst_treat].mean() - col[pre_treat].mean()) - (\n", + " col[pst_cntrl].mean() - col[pre_cntrl].mean()\n", + ")\n", "round(did, 4)" ] }, @@ -199,12 +197,12 @@ "outputs": [], "source": [ "# choose a response and predictors\n", - "response = 'tract_median_rent_sqft'\n", - "covariates = ['num_bedrooms', 'dist_to_transit']\n", - "predictors = ['time', 'group', 'post_treatment'] + covariates\n", + "response = \"tract_median_rent_sqft\"\n", + "covariates = [\"num_bedrooms\", \"dist_to_transit\"]\n", + "predictors = [\"time\", \"group\", \"post_treatment\"] + covariates\n", "\n", "# filter full dataset to retain only these columns and only rows without nulls in these columns\n", - "data = df[[response] + predictors].dropna()\n", + "data = df[[response] + predictors].dropna()\n", "\n", "# create design matrix and response vector\n", "X = data[predictors]\n", @@ -226,9 +224,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python (ppd599)", + "display_name": "Python (ppde642)", "language": "python", - "name": "ppd599" + "name": "ppde642" }, "language_info": { "codemirror_mode": { @@ -240,7 +238,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/modules/10-spatial-models/lecture.ipynb b/modules/10-spatial-models/lecture.ipynb index 347df5a..eddcc4f 100644 --- a/modules/10-spatial-models/lecture.ipynb +++ b/modules/10-spatial-models/lecture.ipynb @@ -93,14 +93,14 @@ "outputs": [], "source": [ "# load CA 
tracts\n", - "tracts_ca = gpd.read_file('../../data/tl_2017_06_tract/').set_index('GEOID')\n", + "tracts_ca = gpd.read_file(\"../../data/tl_2017_06_tract/\").set_index(\"GEOID\")\n", "\n", "# keep LA, ventura, orange counties only (and drop offshore island tracts)\n", - "to_drop = ['06037599100', '06037599000', '06111980000', '06111990100', '06111003612']\n", - "tracts_ca = tracts_ca[tracts_ca['COUNTYFP'].isin(['037', '059', '111'])].drop(index=to_drop)\n", + "to_drop = [\"06037599100\", \"06037599000\", \"06111980000\", \"06111990100\", \"06111003612\"]\n", + "tracts_ca = tracts_ca[tracts_ca[\"COUNTYFP\"].isin([\"037\", \"059\", \"111\"])].drop(index=to_drop)\n", "\n", "# project tracts\n", - "crs = '+proj=utm +zone=11 +ellps=WGS84 +datum=WGS84 +units=m +no_defs'\n", + "crs = \"+proj=utm +zone=11 +ellps=WGS84 +datum=WGS84 +units=m +no_defs\"\n", "tracts_ca = tracts_ca.to_crs(crs)\n", "tracts_ca.shape" ] @@ -112,7 +112,9 @@ "outputs": [], "source": [ "# load CA tract-level census variables\n", - "df_census = pd.read_csv('../../data/census_tracts_data_ca.csv', dtype={'GEOID10':str}).set_index('GEOID10')\n", + "df_census = pd.read_csv(\"../../data/census_tracts_data_ca.csv\", dtype={\"GEOID10\": str}).set_index(\n", + " \"GEOID10\"\n", + ")\n", "df_census.shape" ] }, @@ -123,8 +125,8 @@ "outputs": [], "source": [ "# merge tract geometries with census variables and create med home value 1000s\n", - "tracts = tracts_ca.merge(df_census, left_index=True, right_index=True, how='left')\n", - "tracts['med_home_value_k'] = tracts['med_home_value'] / 1000\n", + "tracts = tracts_ca.merge(df_census, left_index=True, right_index=True, how=\"left\")\n", + "tracts[\"med_home_value_k\"] = tracts[\"med_home_value\"] / 1000\n", "tracts.shape" ] }, @@ -144,10 +146,10 @@ "outputs": [], "source": [ "# choose which variables to use as predictors\n", - "predictors = ['pct_white', 'pct_built_before_1940', 'med_rooms_per_home', 'pct_bachelors_degree']\n", + "predictors = [\"pct_white\", 
\"pct_built_before_1940\", \"med_rooms_per_home\", \"pct_bachelors_degree\"]\n", "\n", "# choose a response variable and drop any rows in which it is null\n", - "response = 'med_home_value_k'\n", + "response = \"med_home_value_k\"\n", "tracts = tracts.dropna(subset=[response])\n", "tracts.shape" ] @@ -180,11 +182,9 @@ "outputs": [], "source": [ "# estimate linear regression model with OLS\n", - "ols = spreg.OLS(y=Y.values,\n", - " x=X.values,\n", - " name_x=X.columns.tolist(),\n", - " name_y=response,\n", - " name_ds='tracts')\n", + "ols = spreg.OLS(\n", + " y=Y.values, x=X.values, name_x=X.columns.tolist(), name_y=response, name_ds=\"tracts\"\n", + ")\n", "print(ols.summary)" ] }, @@ -217,9 +217,9 @@ "outputs": [], "source": [ "# create a new dummy variable for each county, with 1 if tract is in this county and 0 if not\n", - "for county in tracts['COUNTYFP'].unique():\n", - " new_col = f'dummy_county_{county}'\n", - " tracts[new_col] = (tracts['COUNTYFP'] == county).astype(int)" + "for county in tracts[\"COUNTYFP\"].unique():\n", + " new_col = f\"dummy_county_{county}\"\n", + " tracts[new_col] = (tracts[\"COUNTYFP\"] == county).astype(int)" ] }, { @@ -230,7 +230,7 @@ "source": [ "# leave out one dummy variable to prevent perfect collinearity\n", "# ie, a subset of predictors sums to 1 (which full set of dummies will do)\n", - "county_dummies = [f'dummy_county_{county}' for county in tracts['COUNTYFP'].unique()]\n", + "county_dummies = [f\"dummy_county_{county}\" for county in tracts[\"COUNTYFP\"].unique()]\n", "county_dummies = county_dummies[:-1]\n", "county_dummies" ] @@ -253,11 +253,9 @@ "outputs": [], "source": [ "# estimate linear regression model with spatial fixed effects\n", - "ols = spreg.OLS(y=Y.values,\n", - " x=X.values,\n", - " name_x=X.columns.tolist(),\n", - " name_y=response,\n", - " name_ds='tracts')\n", + "ols = spreg.OLS(\n", + " y=Y.values, x=X.values, name_x=X.columns.tolist(), name_y=response, name_ds=\"tracts\"\n", + ")\n", 
"print(ols.summary)" ] }, @@ -290,7 +288,7 @@ "# create design matrix of predictors (drop nulls), response matrix, and regimes vector\n", "X = tracts[predictors].dropna()\n", "Y = tracts.loc[X.index][[response]]\n", - "regimes = tracts.loc[X.index]['COUNTYFP']\n", + "regimes = tracts.loc[X.index][\"COUNTYFP\"]\n", "regimes.sample(5)" ] }, @@ -301,13 +299,15 @@ "outputs": [], "source": [ "# estimate spatial regimes model with OLS\n", - "olsr = spreg.OLS_Regimes(y=Y.values,\n", - " x=X.values,\n", - " regimes=regimes.values,\n", - " name_regimes='county',\n", - " name_x=X.columns.tolist(),\n", - " name_y=response,\n", - " name_ds='tracts')\n", + "olsr = spreg.OLS_Regimes(\n", + " y=Y.values,\n", + " x=X.values,\n", + " regimes=regimes.values,\n", + " name_regimes=\"county\",\n", + " name_x=X.columns.tolist(),\n", + " name_y=response,\n", + " name_ds=\"tracts\",\n", + ")\n", "print(olsr.summary)" ] }, @@ -342,9 +342,9 @@ "outputs": [], "source": [ "fixed_kernel = False\n", - "spatial_kernel = 'gaussian'\n", - "search = 'golden_section'\n", - "criterion = 'AICc'" + "spatial_kernel = \"gaussian\"\n", + "search = \"golden_section\"\n", + "criterion = \"AICc\"" ] }, { @@ -357,11 +357,9 @@ "# select an adaptive (NN) bandwidth for our GWR model, given the data\n", "centroids = tracts.loc[X.index].centroid\n", "coords = list(zip(centroids.x, centroids.y))\n", - "sel = mgwr.sel_bw.Sel_BW(coords=coords,\n", - " y=Y.values,\n", - " X_loc=X.values,\n", - " kernel=spatial_kernel,\n", - " fixed=fixed_kernel)\n", + "sel = mgwr.sel_bw.Sel_BW(\n", + " coords=coords, y=Y.values, X_loc=X.values, kernel=spatial_kernel, fixed=fixed_kernel\n", + ")\n", "nn = sel.search(search_method=search, criterion=criterion)" ] }, @@ -385,12 +383,9 @@ "%%time\n", "# estimate the GWR model parameters\n", "# pass fixed=False to treat bw as number of NNs (adaptive kernel)\n", - "model = mgwr.gwr.GWR(coords=coords,\n", - " y=Y.values,\n", - " X=X.values,\n", - " bw=nn,\n", - " kernel=spatial_kernel,\n", - 
" fixed=fixed_kernel)\n", + "model = mgwr.gwr.GWR(\n", + " coords=coords, y=Y.values, X=X.values, bw=nn, kernel=spatial_kernel, fixed=fixed_kernel\n", + ")\n", "gwr = model.fit()" ] }, @@ -418,7 +413,7 @@ "outputs": [], "source": [ "# a constant was added, so we'll add it to our predictors\n", - "cols = ['constant'] + predictors\n", + "cols = [\"constant\"] + predictors\n", "cols" ] }, @@ -430,7 +425,7 @@ "source": [ "# turn GWR local parameter estimates into a GeoDataFrame with tract geometries\n", "params = pd.DataFrame(gwr.params, columns=cols, index=X.index)\n", - "params = tracts[['geometry']].merge(params, left_index=True, right_index=True, how='right')\n", + "params = tracts[[\"geometry\"]].merge(params, left_index=True, right_index=True, how=\"right\")\n", "params.head()" ] }, @@ -450,10 +445,13 @@ "outputs": [], "source": [ "# helper function to generate colormaps for GWR plots\n", - "def get_cmap(values, cmap_name='coolwarm', n=256):\n", + "\n", + "\n", + "def get_cmap(values, cmap_name=\"coolwarm\", n=256):\n", " import numpy as np\n", " from matplotlib.colors import LinearSegmentedColormap as lsc\n", - " name = f'{cmap_name}_new'\n", + "\n", + " name = f\"{cmap_name}_new\"\n", " cmap = plt.cm.get_cmap(cmap_name)\n", " vmin = values.min()\n", " vmax = values.max()\n", @@ -484,15 +482,13 @@ "# set nrows, ncols to match your number of predictors!\n", "fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))\n", "for col, ax in zip(predictors, axes.flat):\n", - " ax.set_aspect('equal')\n", - " ax.axis('off')\n", - " ax.set_title(f'Local {col} coefficients')\n", - " gdf = params.dropna(subset=[col], axis='rows')\n", - " ax = gdf.plot(ax=ax,\n", - " column=col,\n", - " cmap=get_cmap(gdf[col]),\n", - " legend=True,\n", - " legend_kwds={'shrink': 0.6})\n", + " ax.set_aspect(\"equal\")\n", + " ax.axis(\"off\")\n", + " ax.set_title(f\"Local {col} coefficients\")\n", + " gdf = params.dropna(subset=[col], axis=\"rows\")\n", + " ax = gdf.plot(\n", + " ax=ax, 
column=col, cmap=get_cmap(gdf[col]), legend=True, legend_kwds={\"shrink\": 0.6}\n", + " )\n", "fig.tight_layout()" ] }, @@ -513,20 +509,18 @@ "# set t-values below significance threshold to zero then clip to ± 4\n", "# p<.05 corresponds to |t|>1.96, and |t|>4 corresponds to p<.0001\n", "tvals = pd.DataFrame(gwr.filter_tvals(alpha=0.05), columns=cols, index=X.index).clip(-4, 4)\n", - "tvals = tracts[['geometry']].merge(tvals, left_index=True, right_index=True, how='right')\n", + "tvals = tracts[[\"geometry\"]].merge(tvals, left_index=True, right_index=True, how=\"right\")\n", "\n", "# plot the spatial distribution of local t-values\n", "fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))\n", "for col, ax in zip(predictors, axes.flat):\n", - " ax.set_aspect('equal')\n", - " ax.axis('off')\n", - " ax.set_title(f'Local {col} $t$ values')\n", - " gdf = tvals.dropna(subset=[col], axis='rows')\n", - " ax = gdf.plot(ax=ax,\n", - " column=col,\n", - " cmap=get_cmap(gdf[col]),\n", - " legend=True,\n", - " legend_kwds={'shrink': 0.6})\n", + " ax.set_aspect(\"equal\")\n", + " ax.axis(\"off\")\n", + " ax.set_title(f\"Local {col} $t$ values\")\n", + " gdf = tvals.dropna(subset=[col], axis=\"rows\")\n", + " ax = gdf.plot(\n", + " ax=ax, column=col, cmap=get_cmap(gdf[col]), legend=True, legend_kwds={\"shrink\": 0.6}\n", + " )\n", "fig.tight_layout()" ] }, @@ -544,21 +538,17 @@ "outputs": [], "source": [ "# turn GWR local R-squared values into a GeoDataFrame with tract geometries\n", - "col = 'Local $R^2$ values'\n", + "col = \"Local $R^2$ values\"\n", "r_squared = pd.DataFrame(gwr.localR2, index=X.index, columns=[col])\n", - "r_squared = tracts[['geometry']].merge(r_squared, left_index=True, right_index=True, how='right')\n", + "r_squared = tracts[[\"geometry\"]].merge(r_squared, left_index=True, right_index=True, how=\"right\")\n", "\n", "# plot the spatial distribution of local R-squared values\n", "fig, ax = plt.subplots(figsize=(5, 5))\n", - 
"ax.set_aspect('equal')\n", - "ax.axis('off')\n", + "ax.set_aspect(\"equal\")\n", + "ax.axis(\"off\")\n", "ax.set_title(col)\n", - "gdf = r_squared.dropna(subset=[col], axis='rows')\n", - "ax = gdf.plot(ax=ax,\n", - " column=col,\n", - " cmap='Reds',\n", - " legend=True,\n", - " legend_kwds={'shrink': 0.6})\n", + "gdf = r_squared.dropna(subset=[col], axis=\"rows\")\n", + "ax = gdf.plot(ax=ax, column=col, cmap=\"Reds\", legend=True, legend_kwds={\"shrink\": 0.6})\n", "fig.tight_layout()" ] }, @@ -590,7 +580,7 @@ "source": [ "# compute spatial weights for only those tracts that appear in design matrix\n", "W = weights.Queen.from_dataframe(tracts.loc[X.index])\n", - "W.transform = 'r'" + "W.transform = \"r\"" ] }, { @@ -600,11 +590,7 @@ "outputs": [], "source": [ "# compute OLS spatial diagnostics to check the nature of spatial dependence\n", - "ols = spreg.OLS(y=Y.values,\n", - " x=X.values,\n", - " w=W,\n", - " spat_diag=True,\n", - " moran=True)" + "ols = spreg.OLS(y=Y.values, x=X.values, w=W, spat_diag=True, moran=True)" ] }, { @@ -718,14 +704,16 @@ "outputs": [], "source": [ "# maximum-likelihood estimation with full matrix expression\n", - "mll = spreg.ML_Lag(y=Y.values,\n", - " x=X.values,\n", - " w=W,\n", - " method='full',\n", - " name_w='queen',\n", - " name_x=X.columns.tolist(),\n", - " name_y=response,\n", - " name_ds='tracts')\n", + "mll = spreg.ML_Lag(\n", + " y=Y.values,\n", + " x=X.values,\n", + " w=W,\n", + " method=\"full\",\n", + " name_w=\"queen\",\n", + " name_x=X.columns.tolist(),\n", + " name_y=response,\n", + " name_ds=\"tracts\",\n", + ")\n", "print(mll.summary)" ] }, @@ -771,14 +759,16 @@ "outputs": [], "source": [ "# maximum-likelihood estimation with full matrix expression\n", - "mle = spreg.ML_Error(y=Y.values,\n", - " x=X.values,\n", - " w=W,\n", - " method='full',\n", - " name_w='queen',\n", - " name_x=X.columns.tolist(),\n", - " name_y=response,\n", - " name_ds='tracts')\n", + "mle = spreg.ML_Error(\n", + " y=Y.values,\n", + " 
x=X.values,\n", + " w=W,\n", + " method=\"full\",\n", + " name_w=\"queen\",\n", + " name_x=X.columns.tolist(),\n", + " name_y=response,\n", + " name_ds=\"tracts\",\n", + ")\n", "print(mle.summary)" ] }, @@ -830,13 +820,15 @@ "metadata": {}, "outputs": [], "source": [ - "gmc = spreg.GM_Combo_Het(y=Y.values,\n", - " x=X.values,\n", - " w=W,\n", - " name_w='queen',\n", - " name_ds='tracts',\n", - " name_x=X.columns.tolist(),\n", - " name_y=response)\n", + "gmc = spreg.GM_Combo_Het(\n", + " y=Y.values,\n", + " x=X.values,\n", + " w=W,\n", + " name_w=\"queen\",\n", + " name_ds=\"tracts\",\n", + " name_x=X.columns.tolist(),\n", + " name_y=response,\n", + ")\n", "print(gmc.summary)" ] }, @@ -861,9 +853,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python (ppd599)", + "display_name": "Python (ppde642)", "language": "python", - "name": "ppd599" + "name": "ppde642" }, "language_info": { "codemirror_mode": { @@ -875,7 +867,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/modules/11-supervised-learning/lecture.ipynb b/modules/11-supervised-learning/lecture.ipynb index a3ced55..6672813 100644 --- a/modules/11-supervised-learning/lecture.ipynb +++ b/modules/11-supervised-learning/lecture.ipynb @@ -78,7 +78,9 @@ "outputs": [], "source": [ "# load CA tract-level census variables\n", - "df = pd.read_csv('../../data/census_tracts_data_ca.csv', dtype={'GEOID10':str}).set_index('GEOID10')\n", + "df = pd.read_csv(\"../../data/census_tracts_data_ca.csv\", dtype={\"GEOID10\": str}).set_index(\n", + " \"GEOID10\"\n", + ")\n", "df.shape" ] }, @@ -118,8 +120,8 @@ "outputs": [], "source": [ "# classify tracts into high poverty vs not\n", - "df['poverty'] = (df['pct_below_poverty'] > 20).astype(int)\n", - "df['poverty'].value_counts().sort_index()" + "df[\"poverty\"] = (df[\"pct_below_poverty\"] > 20).astype(int)\n", + "df[\"poverty\"].value_counts().sort_index()" ] }, { 
@@ -130,8 +132,8 @@ "outputs": [], "source": [ "# feature selection: which features are important for predicting our categories?\n", - "response = 'poverty'\n", - "predictors = ['median_age', 'pct_renting', 'pct_bachelors_degree', 'pct_english_only']\n", + "response = \"poverty\"\n", + "predictors = [\"median_age\", \"pct_renting\", \"pct_bachelors_degree\", \"pct_english_only\"]\n", "data = df[[response] + predictors].dropna()\n", "y = data[response]\n", "X = data[predictors]" @@ -182,8 +184,8 @@ "# inspect the probabilities\n", "probs = blr.predict_proba(X_test)\n", "df_probs = pd.DataFrame(probs, columns=blr.classes_)\n", - "df_probs['pred'] = y_pred\n", - "df_probs['actual'] = y_test.values\n", + "df_probs[\"pred\"] = y_pred\n", + "df_probs[\"actual\"] = y_test.values\n", "df_probs.head()" ] }, @@ -240,17 +242,17 @@ "source": [ "# create a poverty classification variable\n", "# by default, set all as mid poverty tracts\n", - "df['poverty'] = 'mid'\n", + "df[\"poverty\"] = \"mid\"\n", "\n", "# identify all low poverty tracts\n", - "mask_low = df['pct_below_poverty'] <= 5\n", - "df.loc[mask_low, 'poverty'] = 'low'\n", + "mask_low = df[\"pct_below_poverty\"] <= 5\n", + "df.loc[mask_low, \"poverty\"] = \"low\"\n", "\n", "# identify all high poverty tracts\n", - "mask_high = df['pct_below_poverty'] >= 25\n", - "df.loc[mask_high, 'poverty'] = 'high'\n", + "mask_high = df[\"pct_below_poverty\"] >= 25\n", + "df.loc[mask_high, \"poverty\"] = \"high\"\n", "\n", - "df['poverty'].value_counts().sort_index()" + "df[\"poverty\"].value_counts().sort_index()" ] }, { @@ -261,8 +263,8 @@ "outputs": [], "source": [ "# feature selection\n", - "response = 'poverty'\n", - "predictors = ['median_age', 'pct_renting', 'pct_bachelors_degree', 'pct_english_only']\n", + "response = \"poverty\"\n", + "predictors = [\"median_age\", \"pct_renting\", \"pct_bachelors_degree\", \"pct_english_only\"]\n", "data = df[[response] + predictors].dropna()\n", "y = data[response]\n", "X = 
data[predictors]" @@ -308,7 +310,7 @@ "outputs": [], "source": [ "# train model on training data then use it to make predictions with test data\n", - "mlr = LogisticRegression(multi_class='multinomial', C=1)\n", + "mlr = LogisticRegression(multi_class=\"multinomial\", C=1)\n", "y_pred = mlr.fit(X_train, y_train).predict(X_test)" ] }, @@ -333,8 +335,8 @@ "source": [ "probs = mlr.predict_proba(X_test)\n", "df_probs = pd.DataFrame(probs, columns=mlr.classes_)\n", - "df_probs['pred'] = y_pred\n", - "df_probs['actual'] = y_test.values\n", + "df_probs[\"pred\"] = y_pred\n", + "df_probs[\"actual\"] = y_test.values\n", "df_probs.head()" ] }, @@ -387,8 +389,8 @@ "source": [ "# calculate the odds ratio for some class and some predictor\n", "# a 1-unit increase in predictor k increases the odds of class c by what %\n", - "k = 'pct_english_only'\n", - "c = 'low'\n", + "k = \"pct_english_only\"\n", + "c = \"low\"\n", "B_ck = df_coeffs.loc[c, k]\n", "odds_ratio = np.exp(B_ck)\n", "odds_ratio" @@ -419,13 +421,13 @@ "# calculate the logit of class c if nothing changes, then convert to odds\n", "x0 = X_test[i]\n", "log_odds0 = np.dot(mlr.coef_, X_test[i]) + mlr.intercept_\n", - "odds0 = np.exp(log_odds0[c]) # convert log-odds to odds\n", + "odds0 = np.exp(log_odds0[c]) # convert log-odds to odds\n", "\n", "# calculate the logit of class c if we increase k by 1, then convert to odds\n", "x1 = x0.copy()\n", "x1[k] = x1[k] + 1\n", "log_odds1 = np.dot(mlr.coef_, x1) + mlr.intercept_\n", - "odds1 = np.exp(log_odds1[c]) # convert log-odds to odds\n", + "odds1 = np.exp(log_odds1[c]) # convert log-odds to odds\n", "\n", "# calculate the odds ratio\n", "odds_ratio = odds1 / odds0\n", @@ -506,27 +508,28 @@ "source": [ "# helper function to visualize the model's decision surface\n", "# fits model pairwise to just 2 features at a time and plots them\n", + "\n", + "\n", "def plot_decision(X, y, feature_names, classifier):\n", - " \n", - " class_colors = {'high': 'r', 'mid': 'y', 'low': 'b'}\n", 
- " class_ints = {'high': 0, 'mid': 1, 'low': 2}\n", + " class_colors = {\"high\": \"r\", \"mid\": \"y\", \"low\": \"b\"}\n", + " class_ints = {\"high\": 0, \"mid\": 1, \"low\": 2}\n", " pairs = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]\n", " fig, axes = plt.subplots(2, 3, figsize=(9, 6))\n", " for ax, pair in zip(axes.flat, pairs):\n", - " \n", " # take the two corresponding features\n", " Xp = X[:, pair]\n", " x_min, x_max = Xp[:, 0].min() - 1, Xp[:, 0].max() + 1\n", " y_min, y_max = Xp[:, 1].min() - 1, Xp[:, 1].max() + 1\n", - " xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),\n", - " np.arange(y_min, y_max, 0.02))\n", - " \n", + " xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))\n", + "\n", " # fit model to the two features, predict for meshgrid points, then plot\n", " Z = classifier.fit(Xp, y).predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)\n", " for cat, i in class_ints.items():\n", - " Z[np.where(Z==cat)] = i\n", - " cs = ax.contourf(xx.astype(float), yy.astype(float), Z.astype(float), cmap=plt.cm.RdYlBu, alpha=0.7)\n", - " \n", + " Z[np.where(Z == cat)] = i\n", + " _ = ax.contourf(\n", + " xx.astype(float), yy.astype(float), Z.astype(float), cmap=plt.cm.RdYlBu, alpha=0.7\n", + " )\n", + "\n", " # scatter plot each class in same color as corresponding contour\n", " for cat, color in class_colors.items():\n", " idx = np.where(y == cat)\n", @@ -534,7 +537,7 @@ "\n", " ax.set_xlabel(feature_names[pair[0]])\n", " ax.set_ylabel(feature_names[pair[1]])\n", - " ax.figure.tight_layout() \n", + " ax.figure.tight_layout()\n", " plt.legend()" ] }, @@ -679,7 +682,7 @@ "outputs": [], "source": [ "# train model on training data then use it to make predictions with test data\n", - "knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')\n", + "knn = KNeighborsClassifier(n_neighbors=5, p=2, metric=\"minkowski\")\n", "y_pred = knn.fit(X_train, y_train).predict(X_test)" ] }, @@ -816,7 +819,7 @@ "source": [ "# 
train model on training data then use it to make predictions with test data\n", "# train the linear SVM (namely, support vector classification)\n", - "svc = SVC(kernel='linear', C=1)\n", + "svc = SVC(kernel=\"linear\", C=1)\n", "y_pred = svc.fit(X_train, y_train).predict(X_test)" ] }, @@ -849,7 +852,7 @@ "outputs": [], "source": [ "# train model on training data then use it to make predictions with test data\n", - "svc_kt = SVC(kernel='rbf', gamma=0.2, C=1)\n", + "svc_kt = SVC(kernel=\"rbf\", gamma=0.2, C=1)\n", "y_pred = svc_kt.fit(X_train, y_train).predict(X_test)" ] }, @@ -902,7 +905,7 @@ "outputs": [], "source": [ "# train model on training data then use it to make predictions with test data\n", - "svc_kt2 = SVC(kernel='rbf', gamma=10, C=1)\n", + "svc_kt2 = SVC(kernel=\"rbf\", gamma=10, C=1)\n", "y_pred = svc_kt2.fit(X_train, y_train).predict(X_test)" ] }, @@ -951,9 +954,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python (ppd599)", + "display_name": "Python (ppde642)", "language": "python", - "name": "ppd599" + "name": "ppde642" }, "language_info": { "codemirror_mode": { @@ -965,7 +968,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/modules/12-unsupervised-learning/lecture.ipynb b/modules/12-unsupervised-learning/lecture.ipynb index 6824c24..55f078b 100644 --- a/modules/12-unsupervised-learning/lecture.ipynb +++ b/modules/12-unsupervised-learning/lecture.ipynb @@ -58,7 +58,9 @@ "outputs": [], "source": [ "# load CA tract-level census variables\n", - "df = pd.read_csv('../../data/census_tracts_data_ca.csv', dtype={'GEOID10':str}).set_index('GEOID10')\n", + "df = pd.read_csv(\"../../data/census_tracts_data_ca.csv\", dtype={\"GEOID10\": str}).set_index(\n", + " \"GEOID10\"\n", + ")\n", "df.shape" ] }, @@ -94,12 +96,31 @@ "outputs": [], "source": [ "# choose response and predictors\n", - "response = 'county_name'\n", - "features = 
['median_age', 'pct_hispanic', 'pct_white', 'pct_black', 'pct_asian', 'pct_male',\n", - " 'pct_single_family_home', 'med_home_value', 'med_rooms_per_home', 'pct_built_before_1940',\n", - " 'pct_renting', 'rental_vacancy_rate', 'avg_renter_household_size', 'med_household_income',\n", - " 'mean_commute_time', 'pct_commute_drive_alone', 'pct_below_poverty', 'pct_college_grad_student',\n", - " 'pct_same_residence_year_ago', 'pct_bachelors_degree', 'pct_english_only', 'pct_foreign_born']" + "response = \"county_name\"\n", + "features = [\n", + " \"median_age\",\n", + " \"pct_hispanic\",\n", + " \"pct_white\",\n", + " \"pct_black\",\n", + " \"pct_asian\",\n", + " \"pct_male\",\n", + " \"pct_single_family_home\",\n", + " \"med_home_value\",\n", + " \"med_rooms_per_home\",\n", + " \"pct_built_before_1940\",\n", + " \"pct_renting\",\n", + " \"rental_vacancy_rate\",\n", + " \"avg_renter_household_size\",\n", + " \"med_household_income\",\n", + " \"mean_commute_time\",\n", + " \"pct_commute_drive_alone\",\n", + " \"pct_below_poverty\",\n", + " \"pct_college_grad_student\",\n", + " \"pct_same_residence_year_ago\",\n", + " \"pct_bachelors_degree\",\n", + " \"pct_english_only\",\n", + " \"pct_foreign_born\",\n", + "]" ] }, { @@ -109,8 +130,8 @@ "metadata": {}, "outputs": [], "source": [ - "counties = ['Los Angeles', 'Orange', 'Riverside']\n", - "mask = df['county_name'].isin(counties)\n", + "counties = [\"Los Angeles\", \"Orange\", \"Riverside\"]\n", + "mask = df[\"county_name\"].isin(counties)\n", "subset = features + [response]\n", "data = df.loc[mask].dropna(subset=subset)\n", "y = data[response]\n", @@ -150,16 +171,12 @@ "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(6, 6))\n", - "for county_name in data['county_name'].unique():\n", + "for county_name in data[\"county_name\"].unique():\n", " mask = y == county_name\n", - " ax.scatter(x=X_reduced[mask, 0],\n", - " y=X_reduced[mask, 1],\n", - " alpha=0.5,\n", - " s=3,\n", - " label=county_name)\n", - 
"ax.set_aspect('equal')\n", - "ax.legend(loc='best', scatterpoints=4)\n", - "_ = ax.set_title('')" + " ax.scatter(x=X_reduced[mask, 0], y=X_reduced[mask, 1], alpha=0.5, s=3, label=county_name)\n", + "ax.set_aspect(\"equal\")\n", + "ax.legend(loc=\"best\", scatterpoints=4)\n", + "_ = ax.set_title(\"\")" ] }, { @@ -231,12 +248,31 @@ "source": [ "# this is unsupervised, so we don't need a response variable, but we will\n", "# define one just so we can build a simple regression model when we're done\n", - "response = 'med_gross_rent'\n", - "features = ['median_age', 'pct_hispanic', 'pct_white', 'pct_black', 'pct_asian', 'pct_male',\n", - " 'pct_single_family_home', 'med_home_value', 'med_rooms_per_home', 'pct_built_before_1940',\n", - " 'pct_renting', 'rental_vacancy_rate', 'avg_renter_household_size', 'med_household_income',\n", - " 'mean_commute_time', 'pct_commute_drive_alone', 'pct_below_poverty', 'pct_college_grad_student',\n", - " 'pct_same_residence_year_ago', 'pct_bachelors_degree', 'pct_english_only', 'pct_foreign_born']" + "response = \"med_gross_rent\"\n", + "features = [\n", + " \"median_age\",\n", + " \"pct_hispanic\",\n", + " \"pct_white\",\n", + " \"pct_black\",\n", + " \"pct_asian\",\n", + " \"pct_male\",\n", + " \"pct_single_family_home\",\n", + " \"med_home_value\",\n", + " \"med_rooms_per_home\",\n", + " \"pct_built_before_1940\",\n", + " \"pct_renting\",\n", + " \"rental_vacancy_rate\",\n", + " \"avg_renter_household_size\",\n", + " \"med_household_income\",\n", + " \"mean_commute_time\",\n", + " \"pct_commute_drive_alone\",\n", + " \"pct_below_poverty\",\n", + " \"pct_college_grad_student\",\n", + " \"pct_same_residence_year_ago\",\n", + " \"pct_bachelors_degree\",\n", + " \"pct_english_only\",\n", + " \"pct_foreign_born\",\n", + "]" ] }, { @@ -311,13 +347,13 @@ "# create a variance-explained plot\n", "xpos = range(1, len(features) + 1)\n", "fig, ax = plt.subplots(figsize=(5, 5))\n", - "ax.plot(xpos, pve, marker='o', markersize=5, 
label='Individual')\n", - "ax.plot(xpos, np.cumsum(pve), marker='o', markersize=5, label='Cumulative')\n", - "ax.set_ylabel('Proportion of variance explained')\n", - "ax.set_xlabel('Principal component')\n", + "ax.plot(xpos, pve, marker=\"o\", markersize=5, label=\"Individual\")\n", + "ax.plot(xpos, np.cumsum(pve), marker=\"o\", markersize=5, label=\"Cumulative\")\n", + "ax.set_ylabel(\"Proportion of variance explained\")\n", + "ax.set_xlabel(\"Principal component\")\n", "ax.set_xlim(0, len(features) + 1)\n", "ax.set_ylim(0, 1)\n", - "ax.grid(True, ls='--')\n", + "ax.grid(True, ls=\"--\")\n", "_ = ax.legend()" ] }, @@ -398,8 +434,8 @@ "loadings = eigenvectors * np.sqrt(eigenvalues)\n", "\n", "# turn into a DataFrame with column names and row labels\n", - "cols = [f'PC{i}' for i in range(1, pca.n_components_ + 1)]\n", - "pd.DataFrame(loadings, index=features, columns=cols).sort_values('PC1')" + "cols = [f\"PC{i}\" for i in range(1, pca.n_components_ + 1)]\n", + "pd.DataFrame(loadings, index=features, columns=cols).sort_values(\"PC1\")" ] }, { @@ -427,11 +463,12 @@ "source": [ "# plot the points on their first 2 PCs, and color by the response variable\n", "fig, ax = plt.subplots(figsize=(6, 6))\n", - "ax = sns.scatterplot(ax=ax, x=X_reduced[:, 0], y=X_reduced[:, 1],\n", - " hue=y, palette='plasma_r', s=5, edgecolor='none')\n", - "ax.set_xlabel('PC1')\n", - "ax.set_ylabel('PC2')\n", - "_ = ax.set_aspect('equal')" + "ax = sns.scatterplot(\n", + " ax=ax, x=X_reduced[:, 0], y=X_reduced[:, 1], hue=y, palette=\"plasma_r\", s=5, edgecolor=\"none\"\n", + ")\n", + "ax.set_xlabel(\"PC1\")\n", + "ax.set_ylabel(\"PC2\")\n", + "_ = ax.set_aspect(\"equal\")" ] }, { @@ -467,11 +504,31 @@ "metadata": {}, "outputs": [], "source": [ - "features = ['median_age', 'pct_hispanic', 'pct_white', 'pct_black', 'pct_asian', 'pct_male', 'med_gross_rent',\n", - " 'pct_single_family_home', 'med_home_value', 'med_rooms_per_home', 'pct_built_before_1940',\n", - " 'pct_renting', 
'rental_vacancy_rate', 'avg_renter_household_size', 'med_household_income',\n", - " 'mean_commute_time', 'pct_commute_drive_alone', 'pct_below_poverty', 'pct_college_grad_student',\n", - " 'pct_same_residence_year_ago', 'pct_bachelors_degree', 'pct_english_only', 'pct_foreign_born']" + "features = [\n", + " \"median_age\",\n", + " \"pct_hispanic\",\n", + " \"pct_white\",\n", + " \"pct_black\",\n", + " \"pct_asian\",\n", + " \"pct_male\",\n", + " \"med_gross_rent\",\n", + " \"pct_single_family_home\",\n", + " \"med_home_value\",\n", + " \"med_rooms_per_home\",\n", + " \"pct_built_before_1940\",\n", + " \"pct_renting\",\n", + " \"rental_vacancy_rate\",\n", + " \"avg_renter_household_size\",\n", + " \"med_household_income\",\n", + " \"mean_commute_time\",\n", + " \"pct_commute_drive_alone\",\n", + " \"pct_below_poverty\",\n", + " \"pct_college_grad_student\",\n", + " \"pct_same_residence_year_ago\",\n", + " \"pct_bachelors_degree\",\n", + " \"pct_english_only\",\n", + " \"pct_foreign_born\",\n", + "]" ] }, { @@ -482,7 +539,7 @@ "outputs": [], "source": [ "# calculate then standardize median values across counties\n", - "counties = df.groupby('county_name').median()\n", + "counties = df.groupby(\"county_name\").median()\n", "X = counties[features].dropna()\n", "X = StandardScaler().fit_transform(X)\n", "X.shape" @@ -522,7 +579,7 @@ "cluster_labels = km.labels_\n", "unique_labels = set(cluster_labels)\n", "num_clusters = len(unique_labels)\n", - "print(f'Number of clusters: {num_clusters}')" + "print(f\"Number of clusters: {num_clusters}\")" ] }, { @@ -544,11 +601,18 @@ "source": [ "# scatterplot points on first two PCs and color by cluster\n", "fig, ax = plt.subplots(figsize=(4, 4))\n", - "ax = sns.scatterplot(ax=ax, x=X_reduced[:, 0], y=X_reduced[:, 1],\n", - " hue=cluster_labels, palette='Set1', s=20, edgecolor='none')\n", - "ax.set_xlabel('PC1')\n", - "ax.set_ylabel('PC2')\n", - "_ = ax.set_aspect('equal')" + "ax = sns.scatterplot(\n", + " ax=ax,\n", + " 
x=X_reduced[:, 0],\n", + " y=X_reduced[:, 1],\n", + " hue=cluster_labels,\n", + " palette=\"Set1\",\n", + " s=20,\n", + " edgecolor=\"none\",\n", + ")\n", + "ax.set_xlabel(\"PC1\")\n", + "ax.set_ylabel(\"PC2\")\n", + "_ = ax.set_aspect(\"equal\")" ] }, { @@ -584,14 +648,14 @@ "source": [ "# create an elbow plot\n", "fig, ax = plt.subplots()\n", - "ax.set_xlabel('Number of clusters')\n", - "ax.set_ylabel('Distortion')\n", + "ax.set_xlabel(\"Number of clusters\")\n", + "ax.set_ylabel(\"Distortion\")\n", "kvals = range(1, 15)\n", "distortions = []\n", "for k in kvals:\n", " km = KMeans(n_clusters=k).fit(X_reduced)\n", " distortions.append(km.inertia_)\n", - "ax.plot(kvals, distortions, marker='o')\n", + "ax.plot(kvals, distortions, marker=\"o\")\n", "_ = ax.grid(True)" ] }, @@ -630,7 +694,7 @@ "source": [ "# cluster the data (in two dimensions again)\n", "X_reduced = PCA(n_components=2).fit_transform(X)\n", - "db = DBSCAN(eps=1, min_samples=3, metric='euclidean').fit(X_reduced)" + "db = DBSCAN(eps=1, min_samples=3, metric=\"euclidean\").fit(X_reduced)" ] }, { @@ -644,7 +708,7 @@ "cluster_labels = db.labels_\n", "unique_labels = set(cluster_labels)\n", "num_clusters = len(unique_labels)\n", - "print(f'Number of clusters: {num_clusters}')" + "print(f\"Number of clusters: {num_clusters}\")" ] }, { @@ -657,11 +721,18 @@ "# scatterplot points on first two PCs and color by cluster\n", "# cluster label -1 means noise\n", "fig, ax = plt.subplots(figsize=(4, 4))\n", - "ax = sns.scatterplot(ax=ax, x=X_reduced[:, 0], y=X_reduced[:, 1],\n", - " hue=cluster_labels, palette='Set1', s=20, edgecolor='none')\n", - "ax.set_xlabel('PC1')\n", - "ax.set_ylabel('PC2')\n", - "_ = ax.set_aspect('equal')" + "ax = sns.scatterplot(\n", + " ax=ax,\n", + " x=X_reduced[:, 0],\n", + " y=X_reduced[:, 1],\n", + " hue=cluster_labels,\n", + " palette=\"Set1\",\n", + " s=20,\n", + " edgecolor=\"none\",\n", + ")\n", + "ax.set_xlabel(\"PC1\")\n", + "ax.set_ylabel(\"PC2\")\n", + "_ = 
ax.set_aspect(\"equal\")" ] }, { @@ -725,7 +796,7 @@ "source": [ "# calculate distance matrix then linkage matrix, choosing a method (algorithm)\n", "distances = pdist(X_reduced)\n", - "Z = hierarchy.linkage(distances, method='complete', optimal_ordering=True)" + "Z = hierarchy.linkage(distances, method=\"complete\", optimal_ordering=True)" ] }, { @@ -752,17 +823,19 @@ "\n", "# plot the dendrogram, colored by clusters below the cut point\n", "fig, ax = plt.subplots(figsize=(5, 11))\n", - "ax.set_xlabel('Euclidean distance')\n", - "with plt.rc_context({'lines.linewidth': 1}):\n", - " R = hierarchy.dendrogram(Z=Z,\n", - " orientation='right',\n", - " labels=counties.index,\n", - " color_threshold=cut_point,\n", - " distance_sort='descending',\n", - " show_leaf_counts=False,\n", - " ax=ax)\n", - "plt.axvline(cut_point, c='k')\n", - "fig.savefig('dendrogram.png', dpi=600, facecolor='w', bbox_inches='tight')" + "ax.set_xlabel(\"Euclidean distance\")\n", + "with plt.rc_context({\"lines.linewidth\": 1}):\n", + " R = hierarchy.dendrogram(\n", + " Z=Z,\n", + " orientation=\"right\",\n", + " labels=counties.index,\n", + " color_threshold=cut_point,\n", + " distance_sort=\"descending\",\n", + " show_leaf_counts=False,\n", + " ax=ax,\n", + " )\n", + "plt.axvline(cut_point, c=\"k\")\n", + "fig.savefig(\"dendrogram.png\", dpi=600, facecolor=\"w\", bbox_inches=\"tight\")" ] }, { @@ -775,7 +848,7 @@ "# assign k cluster labels to the observations, based on where you cut tree\n", "# k = number of clusters = how many horizontal lines you intersected above\n", "k = 8\n", - "cluster_labels = hierarchy.fcluster(Z, t=k, criterion='maxclust')\n", + "cluster_labels = hierarchy.fcluster(Z, t=k, criterion=\"maxclust\")\n", "pd.Series(cluster_labels).value_counts().sort_index()" ] }, @@ -814,9 +887,7 @@ "source": [ "# t-SNE with two dimensions, then project features onto this space\n", "tsne = TSNE(n_components=2, n_iter=10000, random_state=0)\n", - "X_reduced = 
pd.DataFrame(data=tsne.fit_transform(X),\n", - " index=counties.index,\n", - " columns=['TC1', 'TC2'])" + "X_reduced = pd.DataFrame(data=tsne.fit_transform(X), index=counties.index, columns=[\"TC1\", \"TC2\"])" ] }, { @@ -828,16 +899,23 @@ "source": [ "# plot the colored clusters projected onto the two t-SNE dimensions\n", "fig, ax = plt.subplots(figsize=(4, 4))\n", - "ax.set_xlabel('t-SNE 1')\n", - "ax.set_ylabel('t-SNE 2')\n", - "X_reduced['color'] = pd.Series(dict(zip(R['ivl'], R['leaves_color_list'])))\n", - "ax.scatter(x=X_reduced['TC1'], y=X_reduced['TC2'], c=X_reduced['color'], s=10)\n", + "ax.set_xlabel(\"t-SNE 1\")\n", + "ax.set_ylabel(\"t-SNE 2\")\n", + "X_reduced[\"color\"] = pd.Series(dict(zip(R[\"ivl\"], R[\"leaves_color_list\"])))\n", + "ax.scatter(x=X_reduced[\"TC1\"], y=X_reduced[\"TC2\"], c=X_reduced[\"color\"], s=10)\n", "\n", "# identify a county of interest in the plot\n", - "county = 'San Francisco'\n", - "_ = ax.scatter(x=X_reduced.loc[county, 'TC1'],\n", - " y=X_reduced.loc[county, 'TC2'],\n", - " alpha=1, marker='o', s=300, linewidth=2, color='none', ec='k')" + "county = \"San Francisco\"\n", + "_ = ax.scatter(\n", + " x=X_reduced.loc[county, \"TC1\"],\n", + " y=X_reduced.loc[county, \"TC2\"],\n", + " alpha=1,\n", + " marker=\"o\",\n", + " s=300,\n", + " linewidth=2,\n", + " color=\"none\",\n", + " ec=\"k\",\n", + ")" ] }, { @@ -856,9 +934,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python (ppd599)", + "display_name": "Python (ppde642)", "language": "python", - "name": "ppd599" + "name": "ppde642" }, "language_info": { "codemirror_mode": { @@ -870,7 +948,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/modules/13-natural-language-processing/lecture.ipynb b/modules/13-natural-language-processing/lecture.ipynb deleted file mode 100644 index 595edd1..0000000 --- a/modules/13-natural-language-processing/lecture.ipynb +++ 
/dev/null @@ -1,865 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "bcd90f55", - "metadata": {}, - "source": [ - "## NLP Lecture for PPD 599\n", - "\n", - "In this lecture, you will be introduced to using Natural Language Processing (NLP) in urban analytics." - ] - }, - { - "cell_type": "markdown", - "id": "4e9e3b10", - "metadata": {}, - "source": [ - "Objectives for this lecture:\n", - "\n", - "1. Understand and use common NLP python packages.\n", - "2. Find and visualize patterns in language topics. \n", - "3. Relate language and topics to the underlying urban landscape." - ] - }, - { - "cell_type": "markdown", - "id": "7677d9ce", - "metadata": {}, - "source": [ - "### What is NLP and how can you use it?\n", - "\n", - "NLP is ability to process text or spoken word based data with a computer in order to efficiently deal with large, potentially unruly or unstructured, data. \n", - "\n", - "In urban analytics, the uses of NLP are boundless! You can now handle large amounts of data coming from plans themselves, online open response questionaires, social media postings, transcripts from interviews or meetings, and more. Each of these datasets can illuminate important themes that may be difficult or time consuming to find by hand.\n", - "\n", - "The NLP processing chain is most often:\n", - "1. Preprocess data to make text as uniform as possible.\n", - "2. Decide what each \"document\" should be - whole body, paragraph, sentence, few words, etc.\n", - "3. Turn each document into vector.\n", - "4. Utilize various existing tools with vectorized data.\n", - "5. Analyze results!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5586ffd2", - "metadata": {}, - "outputs": [], - "source": [ - "import re\n", - "import string\n", - "import nltk \n", - "import gensim\n", - "from gensim import corpora\n", - "from collections import Counter\n", - "from itertools import chain\n", - "import pandas as pd\n", - "import geopandas as gpd\n", - "import matplotlib.pyplot as plt\n", - "import gensim.downloader as api\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n", - "\n", - "nltk.download('stopwords')\n", - "nltk.download('wordnet')\n", - "nltk.download('averaged_perceptron_tagger')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2fa07527", - "metadata": {}, - "outputs": [], - "source": [ - "# Let's work with some example data from Zillow \n", - "data = pd.read_csv('../../data/newyork_housing.csv')" - ] - }, - { - "cell_type": "markdown", - "id": "9e9aa632", - "metadata": {}, - "source": [ - "### 1. Introduction to nltk" - ] - }, - { - "cell_type": "markdown", - "id": "415b2b60", - "metadata": {}, - "source": [ - "One of the most powerful tools in Python for NLP is the natural langauge toolkit (nltk) (https://www.nltk.org/). It is rich with processes and easy to use. Often, this package is used for the preprocessing stage where your text data may undergo any of the following:\n", - "\n", - "#### - forcing lowercase and removing unwanted symbols\n", - "Ultimately, you are working with one string composed of different symbols (letters and numbers), so creating uniformity however possible is helpful. You want the computer to recognize \"T\" and \"t\" as the same symbol. You might not also want your application to care about \"*\" or \"|\". It all depends on what you want to pick up on." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b1fe9c8a", - "metadata": {}, - "outputs": [], - "source": [ - "#isolate the text column\n", - "bodytext = data['description']\n", - "#make all letters lowercase\n", - "bodytext = bodytext.str.lower()\n", - "#remove non alphabetic characters \n", - "bodytext = bodytext.apply(lambda x: re.sub(\"[^A-Za-z']+\", ' ', str(x)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ca0109a", - "metadata": {}, - "outputs": [], - "source": [ - "#view the before\n", - "print(data['description'].iloc[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "03590280", - "metadata": {}, - "outputs": [], - "source": [ - "#view the after\n", - "print(bodytext.iloc[0])" - ] - }, - { - "cell_type": "markdown", - "id": "202a65f1", - "metadata": {}, - "source": [ - "#### - removing \"stopwords\"\n", - "\n", - "Stopwords are very common, usually insignificant words that you want filtered out before you do any processing." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6602343", - "metadata": {}, - "outputs": [], - "source": [ - "#Take a look at some of the \"stopwords\"\n", - "nltk.corpus.stopwords.words('english')[0:10]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bf8970b7", - "metadata": {}, - "outputs": [], - "source": [ - "#remove these from each document\n", - "bodytext = bodytext.apply(lambda x: x.split(\" \"))\n", - "no_stopwords = bodytext.apply(lambda x: sorted(set(x) - set(nltk.corpus.stopwords.words('english')), key=x.index))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "40264876", - "metadata": {}, - "outputs": [], - "source": [ - "#now view our sample text without any stopwords \n", - "print(no_stopwords.iloc[0])" - ] - }, - { - "cell_type": "markdown", - "id": "fc0a35ad", - "metadata": {}, - "source": [ - "#### - stemming or lemmatizing \n", - "\n", - "Stemming is the process of taking a word down to its root. Lemmatizing is the process of changing a word to its base format. Either step is usually performed in order to help your model capture variations in how people might represent words. For example, if you wanted to know how often people were talking about change in a system, you would want to capture whenever people say \"change\", \"changing\", \"changes\", or \"changed\". 
You can see how this would happen for stemming vs lemmatizing below.\n", - "\n", - "| Stemming | Lemmatizing |\n", - "| --- | --- | \n", - "| change $\\rightarrow$ chang | change $\\rightarrow$ change | \n", - "| changes $\\rightarrow$ chang | changes $\\rightarrow$ change | \n", - "| changing $\\rightarrow$ chang | changing $\\rightarrow$ change | \n", - "| changed $\\rightarrow$ chang | changed $\\rightarrow$ change | " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d2a3702b", - "metadata": {}, - "outputs": [], - "source": [ - "#stem each word \n", - "#initialze Stemmer\n", - "stemmer = nltk.stem.PorterStemmer()\n", - "#apply to each word in each document\n", - "bodytext_stemmed = bodytext.apply(lambda x: [stemmer.stem(i) for i in x])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a8543578", - "metadata": {}, - "outputs": [], - "source": [ - "#view our sample text after being stemmed\n", - "print(bodytext_stemmed.iloc[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12929c73", - "metadata": {}, - "outputs": [], - "source": [ - "#lemmatize each word\n", - "#initialize Lemmatizer\n", - "wnl = nltk.stem.WordNetLemmatizer()\n", - "#apply to each word in each document\n", - "bodytext_lemm = bodytext.apply(lambda x: [wnl.lemmatize(i) for i in x])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d593850c", - "metadata": {}, - "outputs": [], - "source": [ - "#view our sample text after being lemmatized\n", - "print(bodytext_lemm.iloc[0])" - ] - }, - { - "cell_type": "markdown", - "id": "de5b3f05", - "metadata": {}, - "source": [ - "#### NLTK has other powerful accessories!\n", - "\n", - "nltk can help identify the part of speech to isolate nouns, verbs, adjectives, etc. 
It can also identify groupings of words that most often occur together!\n", - "\n", - "The nltk POS codes are: \n", - "\n", - "| Code | Part of Speech || Code | Part of Speech |\n", - "| --- | --- || --- | --- | \n", - "|CC:| conjunction, coordinating ||PDT:| pre-determiner |\n", - "|CD:| numeral, cardinal ||POS:| genitive marker |\n", - "|DT:| determiner ||PRP:| pronoun, personal |\n", - "|EX:| existential there ||RB:| adverb |\n", - "|IN:| preposition or conjunction, subordinating ||RP:| particle |\n", - "|JJ:| adjective or numeral, ordinal ||TO:| \"to\" as preposition or infinitive marker |\n", - "|JJR:| adjective, comparative ||UH:| interjection |\n", - "|JJS:| adjective, superlative ||VB:| verb, base form |\n", - "|LS:| list item marker ||VBD:| verb, past tense |\n", - "|MD:| modal auxiliary ||VBG:| verb, present participle or gerund |\n", - "|NN:| noun, common, singular or mass ||VBN:| verb, past participle |\n", - "|NNP:| noun, proper, singular || WDT:| WH-determiner |\n", - "|NNS:| noun, common, plural|" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d7a50fae", - "metadata": {}, - "outputs": [], - "source": [ - "# Identify the part of speech and isolate adjectives, nouns, etc.\n", - "example_sentence = bodytext.iloc[0]\n", - "print(nltk.pos_tag(example_sentence)[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22e0bb05", - "metadata": {}, - "outputs": [], - "source": [ - "#look at all of the adjectives for the postings\n", - "def keep_pos(x,pos=['JJ','JJS','JJR']):\n", - " tagged = nltk.pos_tag(x)\n", - " words_to_keep = [t[0] for t in tagged if t[1] in pos]\n", - " return words_to_keep\n", - "\n", - "keep_pos(example_sentence, pos=['JJ','JJS','JJR'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5476f3b2", - "metadata": {}, - "outputs": [], - "source": [ - "# Identify words that often appear together\n", - "number_of_words = 2\n", - "ngrams = no_stopwords.apply(lambda x: 
list(nltk.ngrams(x,number_of_words)))\n", - "count = Counter(list(chain.from_iterable(list(ngrams.values))))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "69ef366d", - "metadata": {}, - "outputs": [], - "source": [ - "count.most_common(15)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4ae0615", - "metadata": {}, - "outputs": [], - "source": [ - "#Now your turn\n", - "#Identify the most common words across the whole dataset at each stage to see how the list changes\n", - "#With the original data, with lowercasing, with removing stopwords, with stemming\n" - ] - }, - { - "cell_type": "markdown", - "id": "ed4edc86", - "metadata": {}, - "source": [ - "### 2. Introduction to TFIDF" - ] - }, - { - "cell_type": "markdown", - "id": "d1c679bb", - "metadata": {}, - "source": [ - "Step 2 of the NLP process is determining what your \"document\" will be. This can be the whole text as one, each sentence individually, or even bi- or tri-grams of words. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea13c557", - "metadata": {}, - "outputs": [], - "source": [ - "#split by sentence\n", - "def split_by_sent(text, split_criteria=[' ','.', '!', '?','\\n']):\n", - " for x in split_criteria:\n", - " text = str(text).replace(x, '*')\n", - " bodylist = str(text).split('*')\n", - " bodylist = [w for w in bodylist if w != '']\n", - " return bodylist \n", - " \n", - "sentences = data['description'].str.lower().apply(lambda x: split_by_sent(x))\n", - "sentencedf = sentences.explode()\n", - "sentencedf = sentencedf[~sentencedf.isna()]\n", - "print(sentences.iloc[0])\n", - "print('\\n', sentencedf.iloc[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "71192ec1", - "metadata": {}, - "outputs": [], - "source": [ - "#split by bigram\n", - "bigrams = no_stopwords.apply(lambda x: list(nltk.ngrams(x,2)))\n", - "bigramdf = bigrams.explode()\n", - "print(bigrams.iloc[0])\n", - "print('\\n', bigramdf.iloc[0])" - ] - }, - { - "cell_type": "markdown", - "id": "5a438f02", - "metadata": {}, - "source": [ - "One method of performing step 3, turning each document into a vector, is through Term Frequency-Inverse Document Frequency (TFIDF). TF-IDF measures how important each word is to each document. \n", - "\n", - "Term Frequency (tf) refers to how often a word occurs in a document, ranging from 0 to 1. Inverse document frequency (idf) refers to how often a word occurs in _any_ of the documents, where closer to 0 represents more common words (think: and, the, it) and closer to 1 represents rarer words (think: quire, ulotrichous).\n", - "\n", - "The goal is to have a vector for each document that is 1 x n (n being the total number of words in the dataset dictionary) with values describing the tf * idf scores for each word." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bdfd8bc6", - "metadata": {}, - "outputs": [], - "source": [ - "#First, we need a vector that shows the counts of each word in each document. Most of it will be 0.\n", - "documents = bodytext.apply(lambda x: ' '.join(x))\n", - "count_vect = CountVectorizer()\n", - "data_counts = count_vect.fit_transform(documents)\n", - "#Then, we can create the tf-idf matrix\n", - "tfidf_transformer = TfidfTransformer()\n", - "data_tfidf = tfidf_transformer.fit_transform(data_counts)\n", - "#Inspect the shape of the matrix\n", - "print(data_counts.shape)\n", - "print(data_tfidf.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5fc77953", - "metadata": {}, - "outputs": [], - "source": [ - "#Now with the sentence dataframe\n", - "#First, we need a vector that shows the counts of each word in each document. Most of it will be 0.\n", - "count_vect_sent = CountVectorizer()\n", - "data_counts_sentences = count_vect_sent.fit_transform(sentencedf)\n", - "#Then, we can create the tf-idf matrix\n", - "tfidf_transformer_sent = TfidfTransformer()\n", - "data_tfidf_sentences = tfidf_transformer_sent.fit_transform(data_counts_sentences)\n", - "#Inspect the shape of the matrix\n", - "print(data_counts_sentences.shape)\n", - "print(data_tfidf_sentences.shape)" - ] - }, - { - "cell_type": "markdown", - "id": "63d78217", - "metadata": {}, - "source": [ - "### 3. 
Introduction to word2vec" - ] - }, - { - "cell_type": "markdown", - "id": "81483807", - "metadata": {}, - "source": [ - "Another method of performing step 3, turning a document into a vector, is through a \"word2vec\" model, which as you might have guessed, turns words in2 vectors!\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e9b02052", - "metadata": {}, - "outputs": [], - "source": [ - "#First, load a model pretrained on Google News articles\n", - "wv = api.load('word2vec-google-news-300')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c28e1a77", - "metadata": {}, - "outputs": [], - "source": [ - "#see how the word \"house\" is embedded in the vector space\n", - "vector = wv['house'] \n", - "vector" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a4e4516", - "metadata": {}, - "outputs": [], - "source": [ - "#You can see the most similar words in the corpus\n", - "wv.most_similar('house', topn=15)" - ] - }, - { - "cell_type": "markdown", - "id": "0cd1e818", - "metadata": {}, - "source": [ - "### 4. Topic Modeling" - ] - }, - { - "cell_type": "markdown", - "id": "aa10e296", - "metadata": {}, - "source": [ - "Now that we have our documents represented as a matrix (m documents x n words in dictionary OR m documents x n features in word2vec vector), we want to understand what topics are present " - ] - }, - { - "cell_type": "markdown", - "id": "62eff652", - "metadata": {}, - "source": [ - "#### Latent Dirichlet Allocation (LDA)\n", - "\n", - "LDA is an unsupervised topic modeling technique. We can use this technique to create clusters, or topics, that are commonly occuring across all of the documents. Then, we can understand what words describe those topics. Finally, we can trace the topics back to our documents (remember, this can be the full ad or a single sentence) and see what topics appear in each document. There can be more than one topic per document! 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b1b734ab", - "metadata": {}, - "outputs": [], - "source": [ - "#create a dictionary \n", - "documents = sentencedf.apply(lambda x: x.split(\" \"))\n", - "documents = documents.apply(lambda x: sorted(set(x) - set(nltk.corpus.stopwords.words('english')), key=x.index))\n", - "all_text = list(documents)\n", - "all_dict = corpora.Dictionary(all_text)\n", - "doc_term_matrix = [all_dict.doc2bow(i) for i in all_text]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7512f714", - "metadata": {}, - "outputs": [], - "source": [ - "#choose number of topics and create model\n", - "num_topics = 12\n", - "ldamodel = gensim.models.ldamodel.LdaModel(corpus=doc_term_matrix,\n", - " id2word=all_dict,\n", - " num_topics=num_topics,\n", - " eval_every=None,\n", - " passes=1,\n", - " random_state=0)\n", - "\n", - "#save the top num_words for each topic \n", - "num_words = 15\n", - "print_topics = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0cb079b7", - "metadata": {}, - "outputs": [], - "source": [ - "for topic in print_topics:\n", - " print('Topic {}'.format(topic[0]))\n", - " topwords = topic[1].split('\"')[1::2]\n", - " print(\", \".join(topwords))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0a5648f3", - "metadata": {}, - "outputs": [], - "source": [ - "doc_top_topics = []\n", - "for i in range(len(documents)):\n", - " topic_probs = ldamodel[doc_term_matrix[i]]\n", - " max_score = 0\n", - " top_topic = num_topics\n", - " for topic, prob in topic_probs:\n", - " if prob > max_score:\n", - " max_score = prob\n", - " top_topic = topic\n", - " doc_top_topics.append(top_topic)\n", - "\n", - "\n", - "sentencedf2 = pd.DataFrame({'adindex': sentencedf.index, \n", - " 'sentence': sentencedf.values, \n", - " 'top_topic': doc_top_topics, \n", - " 'sent_len': 
documents.apply(len)})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d1f9f9e8", - "metadata": {}, - "outputs": [], - "source": [ - "#calculate what percentage of the ad is dedicated to each topic \n", - "import numpy as np\n", - "percentages = np.zeros((len(data),num_topics))\n", - "#groupby the ad and the topic of the sentence. Sum the number of words per ad per topic\n", - "groupeddf = sentencedf2.groupby(['adindex', 'top_topic']).sent_len.sum()\n", - "#Put into a matrix\n", - "for idx in groupeddf.index:\n", - " percentages[idx] = groupeddf[idx]\n", - "percentages = np.transpose(np.transpose(percentages)/percentages.sum(axis=1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4307df56", - "metadata": {}, - "outputs": [], - "source": [ - "#plot the percentage of the ads dedicated to each topic \n", - "pd.DataFrame(data=percentages, columns = range(num_topics)).boxplot()\n" - ] - }, - { - "cell_type": "markdown", - "id": "9c7da9dc", - "metadata": {}, - "source": [ - "#### Similar to keywords using word2vec model\n", - "\n", - "We can also use our word2vec model to find how similar our document is to a predetermined keyword or topic! We can do this by testing how similar all of the words within the document are to the keyword." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c556d7a8", - "metadata": {}, - "outputs": [], - "source": [ - "#set of words to compare ad words to \n", - "testwords = ['transit'] \n", - "\n", - "#get similarity scores of each word in each ad to keywords in testwords list\n", - "sims = []\n", - "for row in no_stopwords.values:\n", - " tmp= [], \n", - " for w in row:\n", - " #look at the similarity score between each word and the testwords\n", - " try:\n", - " tmp.append(wv.similarity(testwords[0], w))\n", - " #not all words are in our corpus defined under the wv model\n", - " except:\n", - " continue\n", - " sims.append(tmp)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6bfe3c2a", - "metadata": {}, - "outputs": [], - "source": [ - "#calculate mean of similarity scores for each keyword / document pair\n", - "means = np.zeros((len(sims), 1))\n", - "means[:,0] = list(map(lambda x: np.mean(x), sims))\n", - "\n", - "#plot distributions of keyword similarity scores\n", - "plt.close()\n", - "fig, ax = plt.subplots()\n", - "plt.rcParams['savefig.dpi'] = 300\n", - "ax.patch.set_alpha(0)\n", - "sns.distplot(means[:,0])\n", - "plt.xlabel('Mean Similarity Score of Words in Body Text')\n", - "plt.ylabel('Density')\n", - "plt.xlim(0.05,0.2)\n", - "plt.tight_layout()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "997d4351", - "metadata": {}, - "outputs": [], - "source": [ - "#Now your turn\n", - "#Look at the topics determined by our LDA method when using the whole ad \n", - "#Or try out the similarity scores for another keyword\n", - "#Or try to use our tf-idf vectors in another clustering method you know (k-means, dbscan, etc.)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "640acf58", - "metadata": {}, - "source": [ - "### 5. 
Putting it Together with Spatial Analysis" - ] - }, - { - "cell_type": "markdown", - "id": "76782912", - "metadata": {}, - "source": [ - "Once you have performed your text analysis, often you will end up with quantitative variables which can then be analyzed spatially as with any other data. \n", - "\n", - "You might have now integer values representing the most prominent topic for each document, the percent of the text dedicated to a word or topic, or even simply the boolean presence of a word or topic. If the documents contain some sort of spatial information (e.g., location of the Zillow ad), you can now perform your spatial analysis!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "640dbcbc", - "metadata": {}, - "outputs": [], - "source": [ - "import folium\n", - "import branca.colormap as cm\n", - "\n", - "#add the topic percentages to the original dataframe\n", - "data_new = data.join(pd.DataFrame(data=percentages, columns = range(num_topics)).fillna(0)) \n", - "data_new = data_new[~data_new.latitude.isna()]\n", - "\n", - "#create the map\n", - "centerlat = (data_new['latitude'].max() + data_new['latitude'].min()) / 2\n", - "centerlong = (data_new['longitude'].max() + data_new['longitude'].min()) / 2\n", - "center = (centerlat, centerlong)\n", - "colormap = cm.LinearColormap(colors=['green', 'yellow', 'red'], vmin=0, vmax=1)\n", - "map_nyc = folium.Map(location=center, zoom_start=10, tiles='Stamen Toner')\n", - "\n", - "#topic_data1\n", - "topic_number1 = 0\n", - "for i in range(len(data_new)):\n", - " folium.Circle(\n", - " location=[data_new.iloc[i]['latitude'], data_new.iloc[i]['longitude']],\n", - " radius=10,\n", - " fill=True,\n", - " color=colormap(data_new.iloc[i][topic_number1]),\n", - " fill_opacity=0.2\n", - " ).add_to(map_nyc)\n", - "\n", - "# the following line adds the scale directly to our map\n", - "map_nyc.add_child(colormap)\n", - "\n", - "map_nyc" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"7fc802a9", - "metadata": {}, - "outputs": [], - "source": [ - "map_nyc2 = folium.Map(location=center, zoom_start=10, tiles='Stamen Toner')\n", - "\n", - "#topic_data2\n", - "topic_number2 = 10\n", - "for i in range(len(data_new)):\n", - " folium.Circle(\n", - " location=[data_new.iloc[i]['latitude'], data_new.iloc[i]['longitude']],\n", - " radius=10,\n", - " fill=True,\n", - " color=colormap(data_new.iloc[i][topic_number2]),\n", - " fill_opacity=0.2\n", - " ).add_to(map_nyc2)\n", - "\n", - "# the following line adds the scale directly to our map\n", - "map_nyc2.add_child(colormap)\n", - "\n", - "map_nyc2" - ] - }, - { - "cell_type": "markdown", - "id": "94d4af73", - "metadata": {}, - "source": [ - "### 6. Your Turn" - ] - }, - { - "cell_type": "markdown", - "id": "6a5c5464", - "metadata": {}, - "source": [ - "Work through the above examples to identify a pattern of your choosing. \n", - "Separate the data initially and see how your topics vary. \n", - "\n", - "For example, what LDA topics emerge when you separate on listing price? on number of bedrooms? on square footage?\n", - "\n", - "What keywords can you search on for similarity with the word2vec model? How do the distributions change across the above separations?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b0d50150", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "c53dec24", - "metadata": {}, - "source": [ - "### 7. 
BONUS - Working with Different Languages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "667d9ba7", - "metadata": {}, - "outputs": [], - "source": [ - "#detect the language(s) of your text along with a confidence score\n", - "from googletrans import Translator\n", - "def detect_lang(text):\n", - " translator = Translator()\n", - " detection=translator.detect(text)\n", - " return detection.confidence" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3f1e11e0", - "metadata": {}, - "outputs": [], - "source": [ - "detect_lang(bodytext.iloc[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dd8397fe-6b6d-44e3-a2ac-c986f0f98f73", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/modules/14-computer-vision/readme.md b/modules/14-computer-vision/readme.md new file mode 100644 index 0000000..bb89a60 --- /dev/null +++ b/modules/14-computer-vision/readme.md @@ -0,0 +1,3 @@ +# Module 14: Computer vision + +We introduce, from an applied perspective, applications of computer vision. diff --git a/modules/14-next-steps/README.md b/modules/15-next-steps/README.md similarity index 69% rename from modules/14-next-steps/README.md rename to modules/15-next-steps/README.md index d66b0be..d8e3e3e 100644 --- a/modules/14-next-steps/README.md +++ b/modules/15-next-steps/README.md @@ -2,7 +2,6 @@ You've made it to the end of the semester. Where do you go from here as you continue along your path in urban analytics and geospatial data science? 
Today we'll talk about some best practices and next steps to take. - ## More skills People often ask me "what should I do to keep learning more after this course ends?" Here are some suggestions. @@ -10,13 +9,13 @@ People often ask me "what should I do to keep learning more after this course en My first and best piece of advice is code a lot. Work on your own projects. Contribute to others' open source projects. Code is an ultimate example of "learning by doing" and your skill level will reflect the hours you've put into struggling (often miserably) with it. Other skills you should learn: - - cloud computing, such as with AWS - - working with databases, especially Postgres/PostGIS - - more machine learning, such as the resources [gathered here](https://github.com/hangtwenty/dive-into-machine-learning) - - bayesian stats, such as the resources [here](https://camdavidsonpilon.github.io/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/) and [here](https://github.com/markdregan/Bayesian-Modelling-in-Python) -Consider learning a little web development, particularly the core skills of HTML, CSS, and JavaScript. They can help you present your analytics as an interactive app online, as well as become a stronger web scraper. Check out Leaflet and Carto for web mapping. As you already know Python, consider also learning a Python web development framework like Django or Flask. +- cloud computing, such as with AWS +- working with databases, especially Postgres/PostGIS +- more machine learning, such as the resources [gathered here](https://github.com/hangtwenty/dive-into-machine-learning) +- bayesian stats, such as the resources [here](https://camdavidsonpilon.github.io/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/) and [here](https://github.com/markdregan/Bayesian-Modelling-in-Python) +Consider learning a little web development, particularly the core skills of HTML, CSS, and JavaScript. 
They can help you present your analytics as an interactive app online, as well as become a stronger web scraper. Check out Leaflet and Carto for web mapping. As you already know Python, consider also learning a Python web development framework like Django or Flask. ## Better code @@ -32,7 +31,6 @@ Use a **code formatter**. Common Python formatters are [black](https://pypi.org/ **Continuous integration**: [CI](https://docs.github.com/en/actions/guides/about-continuous-integration) is the standard workflow for continuously testing and merging your new features and fixes into your mainline branch. It can integrate code format checks, linters, unit tests, and coverage reports prior to merge. Commit often, push often, merge often. See the [OSMnx repo](https://github.com/gboeing/osmnx) for an example. - ## Documentation Good documentation is central to open science and essential for reusability, for both yourself and others. There are right and wrong ways to create documentation, but good documentation is clear, precise, and up-to-date. @@ -41,7 +39,6 @@ Good documentation is central to open science and essential for reusability, for Use **docstrings** to automatically document your modules and functions. Numpy-style docstring [standards](https://numpydoc.readthedocs.io/en/latest/format.html#docstring-standard) are common. See the [OSMnx repo](https://github.com/gboeing/osmnx) for an example. Consider a service like [readthedocs](https://osmnx.readthedocs.io/) for building and hosting your automated documentation. - ## Meaningful analytics **Read a lot**. Understand current knowledge, how different perspectives relate to one another, and their intellectual lineage and trajectory. Understand state-of-the-art methods and their strengths and trade-offs. Understand the big problems, real-world challenges, and applied/practice context of your discipline. 
Then you can identify a knowledge gap with real-world importance and the appropriate methods to contribute new knowledge to advance theory. @@ -51,12 +48,12 @@ Use **docstrings** to automatically document your modules and functions. Numpy-s **Let the methods follow the question**. Don't try to shoehorn a question into the method you want to use just because it's trendy or you're familiar with it. Pick the right tool for the job, and if it doesn't exist, create it and share it with the world. Make sure your work is - - reproducible: methods are sufficiently well described so they can be repeated by others - - replicable: others get approximately the same results when they reproduce your methods - - well-documented: see documentation notes above -Open science: share your code and data and avoid point-and-click software. Contribute to the open commons! +- reproducible: methods are sufficiently well described so they can be repeated by others +- replicable: others get approximately the same results when they reproduce your methods +- well-documented: see documentation notes above +Open science: share your code and data and avoid point-and-click software. Contribute to the open commons! ## Communicate well @@ -76,7 +73,7 @@ Write directly and succinctly. Don't use two words when one will do. Excise redu ### Show, don't tell -Be assertive and precise to make it clear what you mean. Prefer specific words to general ones. Prefer nouns to pronouns. Prefer verbs to nouns. Avoid nominalizations (verbs converted into nouns). Example: "a 1% increase in fuel prices caused a 0.2% *reduction* in VMT." Better: "a 1% increase in fuel prices *reduced* VMT by 0.2%." Write positively: avoid using "not" to obscure your meaning. That is, tell us what did happen rather than what didn't. +Be assertive and precise to make it clear what you mean. Prefer specific words to general ones. Prefer nouns to pronouns. Prefer verbs to nouns. Avoid nominalizations (verbs converted into nouns). 
Example: "a 1% increase in fuel prices caused a 0.2% _reduction_ in VMT." Better: "a 1% increase in fuel prices _reduced_ VMT by 0.2%." Write positively: avoid using "not" to obscure your meaning. That is, tell us what did happen rather than what didn't. ### Use active language @@ -90,43 +87,44 @@ Too many prepositions signal a circuitous sentence and often indicate other issu Before you draft a paper to share the results of an analytics project, make sure you can articulate your interrelated research question, argument, and significance. The **question** is simply what your project sets out to answer. The **argument** is your paper's single "big idea" in 1-2 sentences, motivated by an existing knowledge gap, and around which you organize your paper and your evidence to persuade the reader. One paper, one idea. The **significance** is the "so what?" Why should we read your paper? What will we do differently in research or practice now that we've read it? - - **Introduction**: your audience doesn't care about your work and you have three paragraphs to change their mind - - introduce your topic and its overall context and importance (persuade us to care) - - introduce your research question and argument - - summarize your methods, findings, and significance - - **Background**: what is currently known and unknown - - explain the context of your study - - review relevant previous work to establish what is currently known - - identify what important open question remains (gap in the literature or unmet need in practice) - - **Methods**: what did you do and how did you do it - - state your research question and hypotheses - - data subsection: describe your data collection, variables, and any relevant processing - - analysis subsection: describe your analysis with enough detail that an expert could replicate it - - **Findings**: what did your analysis reveal - - organize your results around your argument but present them objectively with limited interpretation - - include 
supporting tables/figures - - do not mix in any methods - - **Discussion**: answer your question, advance your argument, and demonstrate significance - - return to research question and interpret (don't repeat) your findings as evidence for/against it - - tell a story: link your findings together to persuasively advance your argument - - significance: discuss specific implications for research and practice ("so what") - - acknowledge study's limitations and alternative interpretations of your evidence - - **Conclusion**: succinctly wrap up - - summarize your topic, question, and argument - - summarize what you did, what you found, what it means, and why it matters +- **Introduction**: your audience doesn't care about your work and you have three paragraphs to change their mind + - introduce your topic and its overall context and importance (persuade us to care) + - introduce your research question and argument + - summarize your methods, findings, and significance +- **Background**: what is currently known and unknown + - explain the context of your study + - review relevant previous work to establish what is currently known + - identify what important open question remains (gap in the literature or unmet need in practice) +- **Methods**: what did you do and how did you do it + - state your research question and hypotheses + - data subsection: describe your data collection, variables, and any relevant processing + - analysis subsection: describe your analysis with enough detail that an expert could replicate it +- **Findings**: what did your analysis reveal + - organize your results around your argument but present them objectively with limited interpretation + - include supporting tables/figures + - do not mix in any methods +- **Discussion**: answer your question, advance your argument, and demonstrate significance + - return to research question and interpret (don't repeat) your findings as evidence for/against it + - tell a story: link your findings together to 
persuasively advance your argument + - significance: discuss specific implications for research and practice ("so what") + - acknowledge study's limitations and alternative interpretations of your evidence +- **Conclusion**: succinctly wrap up + - summarize your topic, question, and argument + - summarize what you did, what you found, what it means, and why it matters Sometimes the introduction and background are merged into a single section. Sometimes the discussion and conclusion are merged into a single section. But this is the basic structure of an effective paper, be it a technical report or a scholarly article. The structure represents a loose symmetry. The introduction and conclusion roughly mirror each other by explaining the topic's importance, the research question, how you answered it and what you found, and its meaning/significance in the real world. The background and discussion roughly mirror each other by explaining what is known about the topic before (to motivate your specific study) and after your study (to advance our knowledge and impact the real world). The methods and findings sections roughly mirror each other by laying out what you did and then presenting what you found when you did so. In general, people struggle most with writing an effective introduction and discussion. Perversely, these are also the most critical sections of your paper for persuading your readers (including peer reviewers). A strong **introduction** in four paragraphs: - 1. Lay out the topic's context and background using anecdotes or facts to illustrate its importance. Show, don't tell. Orient the reader toward your paper's subject, motivate why this should interest us in the first place, and establish your perspective on approaching it. - 1. What is the problem and its significance? What is the open research question that follows from it and what argument do you develop over the course of this paper? Why is it important from a real-world planning or policy perspective? 
- 1. How did you answer your question and what did you find? Summarize your data and methods in 1-2 sentences. Then summarize your findings specifically and precisely in 1-2 sentences, citing specific takeaways like "a 1% increase in fuel prices decreases VMT by 0.2%". Conclude with 1 sentence on "who cares?" That is, how do these findings impact real-world policy? What should a practitioner do differently after reading this study? Why is the world better off for having discovered what you have discovered? - 1. Optional roadmap: signpost the organization of the remaining sections so the reader knows what's coming and how you've laid things out. +1. Lay out the topic's context and background using anecdotes or facts to illustrate its importance. Show, don't tell. Orient the reader toward your paper's subject, motivate why this should interest us in the first place, and establish your perspective on approaching it. +1. What is the problem and its significance? What is the open research question that follows from it and what argument do you develop over the course of this paper? Why is it important from a real-world planning or policy perspective? +1. How did you answer your question and what did you find? Summarize your data and methods in 1-2 sentences. Then summarize your findings specifically and precisely in 1-2 sentences, citing specific takeaways like "a 1% increase in fuel prices decreases VMT by 0.2%". Conclude with 1 sentence on "who cares?" That is, how do these findings impact real-world policy? What should a practitioner do differently after reading this study? Why is the world better off for having discovered what you have discovered? +1. Optional roadmap: signpost the organization of the remaining sections so the reader knows what's coming and how you've laid things out. An **abstract** (when required) is written last and summarizes everything into a succinct paragraph. Consider using a five-sentence structure: - 1. 
what we currently know and don't know - 1. what is your research question - 1. how did you answer that question - 1. what did you find - 1. significance: how are these findings important and useful + +1. what we currently know and don't know +1. what is your research question +1. how did you answer that question +1. what did you find +1. significance: how are these findings important and useful diff --git a/software/docker/Dockerfile b/software/docker/Dockerfile index 1960f79..0f47e3a 100644 --- a/software/docker/Dockerfile +++ b/software/docker/Dockerfile @@ -9,7 +9,7 @@ LABEL maintainer="Geoff Boeing " LABEL url="https://github.com/gboeing/ppde642" LABEL description="USC PPDE642 course image" -COPY requirements.txt /tmp/ +COPY --chmod=0755 requirements.txt /tmp # install packages in one RUN to keep image tidy RUN mamba update --yes -c conda-forge --strict-channel-priority --no-banner -n base mamba && \ @@ -23,7 +23,7 @@ RUN mamba update --yes -c conda-forge --strict-channel-priority --no-banner -n b ipython -c "import osmnx; print('OSMnx version', osmnx.__version__)" # copy default jupyterlab settings, then set jupyter working directory to map to mounted volume -COPY overrides.json /opt/conda/share/jupyter/lab/settings/ +COPY --chmod=0755 overrides.json /opt/conda/share/jupyter/lab/settings/ WORKDIR /home/jovyan/work # set default command to launch when container is run diff --git a/software/docker/requirements.txt b/software/docker/requirements.txt index 2e75171..78b7add 100644 --- a/software/docker/requirements.txt +++ b/software/docker/requirements.txt @@ -1,27 +1,24 @@ beautifulsoup4 -black cartopy cenpy -conda contextily -dill -flake8 folium -gensim geopandas -isort jupyterlab mapclassify -osmnx=1.3.0 -nbqa -nltk +osmnx=1.8.1 pandana pandas +pre-commit pysal -python=3.10.* +python=3.11.* rasterio -rtree seaborn scikit-learn scipy statsmodels +gensim +nltk +pillow +pytorch +torchvision diff --git a/software/readme.md b/software/readme.md index 
80258ef..88625f5 100644 --- a/software/readme.md +++ b/software/readme.md @@ -2,19 +2,16 @@ You need a laptop for this course, but all the required software is free and open-source. - ## Get to know your terminal Your computer comes with a "terminal" app that lets you type commands for your computer to run (on Windows, the terminal is called "command prompt"). Before you complete the initial software setup below, make sure you have read Module 1's assigned readings on how to use the terminal. - ## Initial software setup At the beginning of the semester, you need to do a one-time setup process to install and configure the course software on your computer. The two important pieces of software that you need to install are Conda and Git. Conda is a package manager we will use to set up your Python environment. Git is a tool that lets you pull in the latest versions of the course files each week. Install and configure the software by following the steps below. - ### Step 1: Git Download and install [git](https://git-scm.com/downloads). Then, on your computer, open a terminal window, change directories to your desktop, and clone the course repo to your desktop by running the following command in your terminal: @@ -25,7 +22,6 @@ git clone https://github.com/gboeing/ppde642.git You now have a `ppde642` folder on your desktop containing the course repo. - ### Step 2: Conda Download and install [mambaforge](https://mamba.readthedocs.io/en/latest/installation.html). 
Then open a terminal window (or Anaconda command prompt if on Windows), change directories to the `ppde642` folder on your desktop, and run the following commands, one at a time: @@ -34,7 +30,7 @@ Download and install [mambaforge](https://mamba.readthedocs.io/en/latest/install git pull conda config --prepend channels conda-forge conda config --set channel_priority strict -mamba clean --all --yes +conda clean --all --yes mamba env create --file environment.yml --force conda activate ppde642 python -m ipykernel install --sys-prefix --name ppde642 --display-name "Python (ppde642)" @@ -42,22 +38,20 @@ python -m ipykernel install --sys-prefix --name ppde642 --display-name "Python ( You now have a conda environment with all the packages needed for this course, and a Jupyter kernel installed in the environment. - ## How to run Jupyter First make sure you've completed the "initial software setup" instructions above. When you come into class each day, before the lecture begins, do the following steps (takes <1 minute): - 1. Make sure you have the latest data from the course's Google Drive data folder - 1. Open a terminal, change directories to the `ppde642` folder on your desktop that you created in the initial software setup - 1. Run `git pull` to bring your local clone of the course repo up to date with the remote - 1. Run `conda activate ppde642` then `jupyter lab` to start your Jupyter server - 1. In your web browser, visit http://localhost:8888 +1. Make sure you have the latest data from the course's Google Drive data folder +1. Open a terminal, change directories to the `ppde642` folder on your desktop that you created in the initial software setup +1. Run `git pull` to bring your local clone of the course repo up to date with the remote +1. Run `conda activate ppde642` then `jupyter lab` to start your Jupyter server +1. In your web browser, visit http://localhost:8888 When you're all done using Jupyter at the end of a session, in the menu click File > Shut Down. 
Do not just close your browser tab or terminal window without stopping Jupyter first. Note that you can only type commands into a terminal window when its cursor is blinking. Otherwise it's busy. - ## Troubleshooting If you run into software problems down the road, close all open programs, restart your computer, then try your task again. If the problem persists, uninstall Conda, uninstall Git, restart your computer, then re-do the "initial software setup" instructions above. For further troubleshooting, Google and StackOverflow are your friends! diff --git a/syllabus/readme.md b/syllabus/readme.md index ef56834..7ed6a0e 100644 --- a/syllabus/readme.md +++ b/syllabus/readme.md @@ -2,8 +2,6 @@ USC PPDE642 / Spring 2024 / 4 units - - # Instructor Info [Prof. Geoff Boeing](https://geoffboeing.com) @@ -14,32 +12,26 @@ Office hours: TBD Classroom location and meeting times are [listed online](https://classes.usc.edu/) - - # Course Description This course provides you with a modern toolkit and skills for urban data science, covering both professional and scholarly use cases. It teaches coding for spatial analysis, network analysis, spatial models, and applied machine learning. The course takes a computational social science approach to working with data. It uses Python and Jupyter notebooks to demonstrate coding and statistical methods that you can reproduce and experiment with in real-time in the classroom. 
Students will be expected to: - - Complete and be prepared to discuss all assigned readings - - Attend the lecture - - Complete and submit assignments - - Present a mini-lecture on an applied method - - Prepare a short conference paper and presentation +- Complete and be prepared to discuss all assigned readings +- Attend the lecture +- Complete and submit assignments +- Present a mini-lecture on an applied method +- Prepare a short conference paper and presentation This course has prerequisites: students are expected to have taken [PPD534](https://github.com/gboeing/ppd534) or an equivalent introductory course on Python, stats, and spatial concepts. These prerequisites will not be covered from the ground-up as you are expected to be already familiar with them. This course requires patience and practice: learning to code will take lots of trial-and-error, self-direction, repetition, and experimentation on your part. You will get out of it what you are willing to put into it. Please note that this syllabus is a living document and may be updated by the instructor during the semester as needed. - - # Learning Objectives - - Understand the evolution and applications of urban analytics and urban science - - Write efficient code to collect, organize, analyze, and visualize urban data - - Model and analyze urban networks and flows - - Train and interpret various kinds of urban models - - +- Understand the evolution and applications of urban analytics and urban science +- Write efficient code to collect, organize, analyze, and visualize urban data +- Model and analyze urban networks and flows +- Train and interpret various kinds of urban models # Questions and Assistance @@ -47,47 +39,41 @@ I am available if you need help throughout the semester and am happy to answer y Given the nature of this course, I do expect a few things of you before you seek assistance with coding/data questions: - 1. Close all open programs, restart your computer, then try your task again - 2. 
Search Google and StackOverflow for the topic/problem (for example, the name of the function you're struggling with or the error message you are seeing) - 3. Go back through the relevant lecture materials to look for any insights - 4. Go back through the assigned reading materials to look for any insights +1. Close all open programs, restart your computer, then try your task again +2. Search Google and StackOverflow for the topic/problem (for example, the name of the function you're struggling with or the error message you are seeing) +3. Go back through the relevant lecture materials to look for any insights +4. Go back through the assigned reading materials to look for any insights If the above steps haven't solved your problem, post on Slack (or attend office hours) and include the following information: - 1. A detailed description of what you're trying to do, why, and how - 2. A complete [minimal reproducible example](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) of your code so far (never send screenshots of code/tracebacks) - 3. What you've already tried to do to solve your problem and what you have learned from it so far (specifically, explain the results of steps 1-4 above, including relevant links from StackOverflow etc) +1. A detailed description of what you're trying to do, why, and how +2. A complete [minimal reproducible example](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) of your code so far (never send screenshots of code/tracebacks) +3. What you've already tried to do to solve your problem and what you have learned from it so far (specifically, explain the results of steps 1-4 above, including relevant links from StackOverflow etc) I do not ask for this to be pedantic. Rather, I need it to be able to help you solve your problem. - - # Materials Coursework will be based on free open-source software. You will need a laptop (non-Chromebook) to install and run the course software. 
Copyrighted course reading materials are available via Blackboard for enrolled students to download. The course lectures assume that you have read the assigned readings prior to the class session and are now reasonably fluent in their contents and ready to discuss/debate them in class. Lectures are supplemental to the assigned reading and are of little value if you haven't taken the time to prepare in advance. So, before class, make sure you have completed the assigned reading, taken thorough notes, and prepared any questions you may have about the material. - - # Assignments and Evaluation The following sections outline the work you agree to do by enrolling in this course. Per USC guidelines, for each course unit the university expects 2 hours of out-of-class student work per week. This is a 4 unit course. Therefore, you should expect an average of 8 hours of out-of-class work each week: please budget your time accordingly. See the schedule below for assignment due dates and see the "assignments" folder for instructions. Active participation is expected (and graded) in the classroom. Final grades will be weighted as follows: - - 50%: four analytics assignments - - 10%: mini-lecture - - 10%: final presentation - - 20%: final project - - 10%: active participation in classroom and on Slack +- 50%: four analytics assignments +- 10%: mini-lecture +- 10%: final presentation +- 20%: final project +- 10%: active participation in classroom and on Slack See the schedule below for due dates. Submit everything via Blackboard. Assignments are due by 23:59:00 pacific time on their stated due dates. Late submissions are deducted 25% per day late. Please submit early to avoid any last-minute headaches such as slow uploads, weak internet connections, or temporary system outages. If you have any technical issues when submitting, contact USC IT well before it is due. 
It is solely the student's responsibility to ensure that all submissions have gone through on time, so please doubly confirm complete successful submission in the system. The timestamp in the submission system is our official record: if it says you're late, you're late. Note that late final project submissions will not be accepted or graded. Grades in this course reflect how you have demonstrated and communicated your understanding of and competency in the material. They account for your adherence to the assignments' instructions and your peers' performance given the same instructions and time. Grades do not necessarily reflect the amount of time or effort you put in, because I cannot observe you 24 hours a day to monitor your time and effort. If your demonstrated understanding of and competency in the material are not where you want them to be, start coming to office hours, refocus your effort onto more effective study strategies, and consider seeking tutoring. Grades are non-negotiable. You can expect them to be posted within two weeks of the assignment's due date. - - # Schedule ## Module 1 @@ -98,11 +84,9 @@ We introduce the course, the syllabus, the semester's expectations and schedule, Readings to be completed prior to class: - - Boeing and Arribas-Bel. 2021. GIS and Computational Notebooks. In: The Geographic Information Science & Technology Body of Knowledge, edited by J.P. Wilson. [Direct link](https://doi.org/10.22224/gistbok/2021.1.2). - - If you're on Windows, read [this guide](http://dosprompt.info/)'s "basics" and "folders" sections. If you're on Mac, read [this guide](https://computers.tutsplus.com/tutorials/navigating-the-terminal-a-gentle-introduction--mac-3855). - - Conda [user guide](https://docs.conda.io/projects/conda/en/latest/user-guide/index.html) - - +- Boeing and Arribas-Bel. 2021. GIS and Computational Notebooks. In: The Geographic Information Science & Technology Body of Knowledge, edited by J.P. Wilson. 
[Direct link](https://doi.org/10.22224/gistbok/2021.1.2). +- If you're on Windows, read [this guide](http://dosprompt.info/)'s "basics" and "folders" sections. If you're on Mac, read [this guide](https://computers.tutsplus.com/tutorials/navigating-the-terminal-a-gentle-introduction--mac-3855). +- Conda [user guide](https://docs.conda.io/projects/conda/en/latest/user-guide/index.html) ## Module 2 @@ -112,11 +96,9 @@ We introduce ourselves (see Google Doc) and the fields of urban analytics and ur Readings to be completed prior to class: - - Kontokosta. 2018. Urban Informatics in the Science and Practice of Planning. Journal of Planning Education and Research. [Direct link](https://doi.org/10.1177/0739456X18793716). [USC link](https://libproxy.usc.edu/login?url=https://doi.org/10.1177/0739456X18793716). - - Kitchin. 2020. Urban Science: Prospect and Critique. The Routledge Companion to Smart Cities. [Direct link](https://www.taylorfrancis.com/chapters/edit/10.4324/9781315178387-4/urban-science-rob-kitchin). [USC link](https://libproxy.usc.edu/login?url=https://www.taylorfrancis.com/chapters/edit/10.4324/9781315178387-4/urban-science-rob-kitchin). - - Mattern. 2013. Methodolatry and the Art of Measure: The New Wave of Urban Data Science. Places. [Direct link](https://doi.org/10.22269/131105). - - +- Kontokosta. 2018. Urban Informatics in the Science and Practice of Planning. Journal of Planning Education and Research. [Direct link](https://doi.org/10.1177/0739456X18793716). [USC link](https://libproxy.usc.edu/login?url=https://doi.org/10.1177/0739456X18793716). +- Kitchin. 2020. Urban Science: Prospect and Critique. The Routledge Companion to Smart Cities. [Direct link](https://www.taylorfrancis.com/chapters/edit/10.4324/9781315178387-4/urban-science-rob-kitchin). [USC link](https://libproxy.usc.edu/login?url=https://www.taylorfrancis.com/chapters/edit/10.4324/9781315178387-4/urban-science-rob-kitchin). +- Mattern. 2013. 
Methodolatry and the Art of Measure: The New Wave of Urban Data Science. Places. [Direct link](https://doi.org/10.22269/131105). ## Module 3 @@ -126,11 +108,9 @@ We do a quick refresher on Python and pandas for (urban) data science. This cour Readings to be completed prior to class: - - McKinney. 2017. Python for Data Analysis, 2nd Ed. (Ch 5-8) - - *Optional but important*: if any of the preceding reading feels unfamiliar, first read Ch 1-4 - - *Optional if you haven't used JupyterLab before*: [JupyterLab](https://jupyterlab.readthedocs.io/) user guide "interface" and "notebooks" sections - - +- McKinney. 2017. Python for Data Analysis, 2nd Ed. (Ch 5-8) +- _Optional but important_: if any of the preceding reading feels unfamiliar, first read Ch 1-4 +- _Optional if you haven't used JupyterLab before_: [JupyterLab](https://jupyterlab.readthedocs.io/) user guide "interface" and "notebooks" sections ## Module 4 @@ -140,13 +120,11 @@ We explore advanced data wrangling, cleaning, and feature engineering to prepare Readings to be completed prior to class: - - McKinney. 2017. Python for Data Analysis, 2nd Ed. (Ch 9-10) - - *Optional linear algebra refresher, if needed*: Kolter and Do. 2008. Linear Algebra Review and Reference. +- McKinney. 2017. Python for Data Analysis, 2nd Ed. (Ch 9-10) +- _Optional linear algebra refresher, if needed_: Kolter and Do. 2008. Linear Algebra Review and Reference. Assignment 1 due the following Tuesday. - - ## Module 5 **Feb 7 - APIs and scraping** @@ -155,10 +133,8 @@ We introduce working with APIs, geocoding, and scraping. Readings to be completed prior to class: - - Wu. 2020. Web Scraping Basics. Towards Data Science. [Direct link](https://towardsdatascience.com/web-scraping-basics-82f8b5acd45c). - - Park. 2019. How Do APIs Work? Tray.io. [Direct link](https://tray.io/blog/how-do-apis-work). - - +- Wu. 2020. Web Scraping Basics. Towards Data Science. [Direct link](https://towardsdatascience.com/web-scraping-basics-82f8b5acd45c). +- Park. 
2019. How Do APIs Work? Tray.io. [Direct link](https://tray.io/blog/how-do-apis-work). ## Module 6 @@ -168,14 +144,12 @@ We explore advanced methods for working with spatial data. Readings to be completed prior to class: - - GeoPandas [user guide](https://geopandas.org/) - - Shapely [user manual](https://shapely.readthedocs.io/) - - *Optional* Introduction to [PostGIS](https://postgis.net/workshops/postgis-intro/) +- GeoPandas [user guide](https://geopandas.org/) +- Shapely [user manual](https://shapely.readthedocs.io/) +- _Optional_ Introduction to [PostGIS](https://postgis.net/workshops/postgis-intro/) Assignment 2 due the following Tuesday. - - ## Module 7 **Feb 21 - Urban network analysis I** @@ -184,11 +158,9 @@ We introduce the theory and methods of computational spatial network analysis, i Readings to be completed prior to class: - - O'Sullivan. 2014. Spatial Network Analysis. Handbook of Regional Science, edited by Fischer and Nijkamp. [Direct link](https://doi.org/10.1007/978-3-642-23430-9_67). [USC link](https://libproxy.usc.edu/login?url=https://doi.org/10.1007/978-3-642-23430-9_67). - - NetworkX [tutorial](https://networkx.org/documentation/stable/tutorial.html) - - OSMnx [documentation](https://osmnx.readthedocs.io/) - - +- O'Sullivan. 2014. Spatial Network Analysis. Handbook of Regional Science, edited by Fischer and Nijkamp. [Direct link](https://doi.org/10.1007/978-3-642-23430-9_67). [USC link](https://libproxy.usc.edu/login?url=https://doi.org/10.1007/978-3-642-23430-9_67). +- NetworkX [tutorial](https://networkx.org/documentation/stable/tutorial.html) +- OSMnx [documentation](https://osmnx.readthedocs.io/) ## Module 8 @@ -196,13 +168,11 @@ Readings to be completed prior to class: We build on the theory and methods introduced in the prior module to explore applications of spatial network analysis in the science and practice of urban planning, including routing, accessibility, and network design. - - Boeing. 2021. Off the Grid... and Back Again? 
The Recent Evolution of American Street Network Planning and Design. Journal of the American Planning Association. [Direct link](https://doi.org/10.1080/01944363.2020.1819382). [USC link](https://libproxy.usc.edu/login?url=https://doi.org/10.1080/01944363.2020.1819382). - - Boeing et al. 2022. Using Open Data and Open-Source Software to Develop Spatial Indicators of Urban Design and Transport Features for Achieving Healthy and Sustainable Cities. The Lancet Global Health. [Direct link](https://doi.org/10.1016/S2214-109X(22)00072-9). +- Boeing. 2021. Off the Grid... and Back Again? The Recent Evolution of American Street Network Planning and Design. Journal of the American Planning Association. [Direct link](https://doi.org/10.1080/01944363.2020.1819382). [USC link](https://libproxy.usc.edu/login?url=https://doi.org/10.1080/01944363.2020.1819382). +- Boeing et al. 2022. Using Open Data and Open-Source Software to Develop Spatial Indicators of Urban Design and Transport Features for Achieving Healthy and Sustainable Cities. The Lancet Global Health. [Direct link](https://doi.org/10.1016/S2214-109X(22)00072-9). Assignment 3 due the following Tuesday. - - ## Module 9 **Mar 6 - Spatial analysis** @@ -211,18 +181,14 @@ We cover a coding approach to exploratory spatial data analysis, including weigh Readings to be completed prior to class: - - Burt et al. 2009. Elementary Statistics for Geographers, 3rd Ed. (ch 14.1-14.3) - - PySAL [documentation](https://pysal.org/) - - *Optional* 3Blue1Brown's (excellent) Essence of Linear Algebra [video series](https://www.youtube.com/playlist?list=PLZHQObOWTQDPD3MizzM2xVFitgF8hE_ab) - - +- Burt et al. 2009. Elementary Statistics for Geographers, 3rd Ed.
(ch 14.1-14.3) +- PySAL [documentation](https://pysal.org/) +- _Optional_ 3Blue1Brown's (excellent) Essence of Linear Algebra [video series](https://www.youtube.com/playlist?list=PLZHQObOWTQDPD3MizzM2xVFitgF8hE_ab) ## Spring Break **Mar 13 - No class** - - ## Module 10 **Mar 20 - Spatial models** @@ -231,13 +197,11 @@ We introduce explicitly spatial models in the regression framework, including sp Readings to be completed prior to class: - - Burt et al. 2009. Elementary Statistics for Geographers, 3rd Ed. (ch 12-13 and 14.4-14.6) - - Antonakis et al. 2010. On Making Causal Claims. The Leadership Quarterly. [Direct link](https://doi.org/10.1016/j.leaqua.2010.10.010). [USC link](https://libproxy.usc.edu/login?url=https://doi.org/10.1016/j.leaqua.2010.10.010). +- Burt et al. 2009. Elementary Statistics for Geographers, 3rd Ed. (ch 12-13 and 14.4-14.6) +- Antonakis et al. 2010. On Making Causal Claims. The Leadership Quarterly. [Direct link](https://doi.org/10.1016/j.leaqua.2010.10.010). [USC link](https://libproxy.usc.edu/login?url=https://doi.org/10.1016/j.leaqua.2010.10.010). Assignment 4 due the following Friday (Mar 31). - - ## Module 11 **Mar 27 - Supervised learning** @@ -246,14 +210,12 @@ We introduce, from an applied perspective, machine learning theory and methods. Readings to be completed prior to class: - - Raschka. 2019. Python Machine Learning, 3rd Ed. (Ch 1, 3, 4, 6) - - *Optional* Raschka. 2020. Model Evaluation, Model Selection, and Algorithm Selection in Machine Learning. [Direct link](https://arxiv.org/abs/1811.12808v2). - - *Optional* Troy. 2017. Machine Learning Intuition: Cost Function Optimization (and the two following posts in this series. [Direct link](https://chelseatroy.com/2017/02/07/machine-learning-intuition-cost-function-optimization/). +- Raschka. 2019. Python Machine Learning, 3rd Ed. (Ch 1, 3, 4, 6) +- _Optional_ Raschka. 2020. Model Evaluation, Model Selection, and Algorithm Selection in Machine Learning. 
[Direct link](https://arxiv.org/abs/1811.12808v2). +- _Optional_ Troy. 2017. Machine Learning Intuition: Cost Function Optimization (and the two following posts in this series). [Direct link](https://chelseatroy.com/2017/02/07/machine-learning-intuition-cost-function-optimization/). Mini-lectures, part 1: if you are doing a supervised learning topic, you must submit prior to our class session. - - ## Module 12 **Apr 3 - Unsupervised learning** @@ -262,51 +224,41 @@ We introduce, from an applied perspective, unsupervised learning through cluster Readings to be completed prior to class: - - Raschka. 2019. Python Machine Learning, 3rd Ed. (Ch 5, 11) - - Domingos. 2012. A Few Useful Things to Know about Machine Learning. Communications of the ACM. - - *Optional* Boeing. 2019. Urban Spatial Order: Street Network Orientation, Configuration, and Entropy. Applied Network Science. [Direct link](https://doi.org/10.1007/s41109-019-0189-1). +- Raschka. 2019. Python Machine Learning, 3rd Ed. (Ch 5, 11) +- Domingos. 2012. A Few Useful Things to Know about Machine Learning. Communications of the ACM. +- _Optional_ Boeing. 2019. Urban Spatial Order: Street Network Orientation, Configuration, and Entropy. Applied Network Science. [Direct link](https://doi.org/10.1007/s41109-019-0189-1). Mini-lectures, part 2: if you are doing an unsupervised learning topic, you must submit prior to our class session. - - ## Module 13 **Apr 10 - Natural language processing** Guest lecture T.B.D. - - ## Module 14 **Apr 17 - Computer vision** Guest lecture T.B.D. - - ## Module 15 **Apr 24 - Final Presentations** Present your final projects to the group and engage in a discussion around each. - - ## Exam Week **Final Projects** due Apr 30. - - # Academic Conduct and Support ## Accommodations and Extensions USC welcomes students with disabilities into all of the University's educational programs.
The Office of Student Accessibility Services (OSAS) is responsible for the determination of appropriate accommodations for students who encounter disability-related barriers. Once a student has completed the OSAS process (registration, initial appointment, and submitted documentation) and accommodations are determined to be reasonable and appropriate, a Letter of Accommodation (LOA) will be available to generate for each course. The LOA must be given to each course instructor by the student and followed up with a discussion. This should be done as early in the semester as possible as accommodations are not retroactive. More information can be found at OSAS's [web site](https://osas.usc.edu/). You may contact OSAS at (213) 740-0776 or via email at osasfrontdesk@usc.edu. -To maintain fairness and equality for all students, extensions to due dates are only granted in accordance with official LOAs. If you need to request a *one-time emergency* extension to an assignment (e.g., due to a major illness requiring hospitalization or due to a death in the family) you must do the following proactively *prior to its due date*: 1) provide written documentation, such as an official doctor's note, explaining why you are unable to complete the assignment by its due date and 2) work out an extension with the instructor. +To maintain fairness and equality for all students, extensions to due dates are only granted in accordance with official LOAs. If you need to request a _one-time emergency_ extension to an assignment (e.g., due to a major illness requiring hospitalization or due to a death in the family) you must do the following proactively _prior to its due date_: 1) provide written documentation, such as an official doctor's note, explaining why you are unable to complete the assignment by its due date and 2) work out an extension with the instructor. Course content and classroom discussion may deal with topics that you find difficult or upsetting but are relevant to the course. 
It is your responsibility to review the syllabus to be aware of upcoming content so you can prepare for it adequately. Contact OSAS for an LOA if you require an accommodation. @@ -318,7 +270,7 @@ The University of Southern California is foremost a learning community committed This course will follow the expectations for academic integrity as stated in the USC Student Handbook. All students are expected to submit assignments that are original work and prepared specifically for the course/section in this academic term. You may not submit work written by others or "recycle" work prepared for other courses without obtaining written permission from the instructor(s). Students suspected of engaging in academic misconduct will be reported to the Office of Academic Integrity. Other violations of academic misconduct include, but are not limited to, cheating, plagiarism, fabrication (e.g., falsifying data), knowingly assisting others in acts of academic dishonesty, and any act that gains or is intended to gain an unfair academic advantage. The impact of academic dishonesty is far-reaching and is considered a serious offense against the university and could result in outcomes such as failure on the assignment, failure in the course, suspension, or even expulsion from the university. For more information about academic integrity see the student handbook or the Office of Academic Integrity's website, and university policies on Research and Scholarship Misconduct. -Make sure you review the student handbook for expectations on academic integrity, and never commit [plagiarism](https://apastyle.apa.org/style-grammar-guidelines/citations/plagiarism). It is serious academic misconduct. In all your assignments, make sure you do not copy/paste any words, images, code, or other content written by another author (including the author of the piece to which you are responding) without quote marks and citation. 
If you use someone else's words, you must always use *quote* marks and *cite* them. If you refer to their ideas in your own words, you must *cite* them to make it clear whose ideas you're referring to. In a reading response, citing the reading's author inline is sufficient for us to understand the citation. In other contexts, use a formal reference to make your citation clear. +Make sure you review the student handbook for expectations on academic integrity, and never commit [plagiarism](https://apastyle.apa.org/style-grammar-guidelines/citations/plagiarism). It is serious academic misconduct. In all your assignments, make sure you do not copy/paste any words, images, code, or other content written by another author (including the author of the piece to which you are responding) without quote marks and citation. If you use someone else's words, you must always use _quote_ marks and _cite_ them. If you refer to their ideas in your own words, you must _cite_ them to make it clear whose ideas you're referring to. In a reading response, citing the reading's author inline is sufficient for us to understand the citation. In other contexts, use a formal reference to make your citation clear. Content generated from AI, machine learning, or similar algorithmic tools cannot be submitted in this course. A violation of this policy constitutes academic misconduct.