Skip to content

Commit

Permalink
Merge pull request #3 from bobaekang/master
Browse files Browse the repository at this point in the history
Updating the fp repo
  • Loading branch information
bobaekang authored Nov 29, 2016
2 parents 6b4d5e6 + 28e955a commit fed1166
Show file tree
Hide file tree
Showing 8 changed files with 12,275 additions and 1 deletion.
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
# Final project

# Instruction---------------------------------------#
This is the repository for your final project. It is up to you to decide how to structure your files and directories. The final result will be a website created using [R Markdown](http://rmarkdown.rstudio.com/rmarkdown_websites.html).

You can see the rendered example site [here](https://uc-cfss.github.io/fp-template/).
You can see the rendered example site [here](https://uc-cfss.github.io/fp-template/).
#---------------------------------------------------#

# Summary of my project-----------------------------#
Summary of my project in words will be provided here.
#---------------------------------------------------#

# Scripts-------------------------------------------#
A brief explanation of each script file will be provided here.
#---------------------------------------------------#

536 changes: 536 additions & 0 deletions data/Divvy_Trips_2016_Q1Q2/Divvy_Stations_2016_Q1Q2.csv

Large diffs are not rendered by default.

11,521 changes: 11,521 additions & 0 deletions data/stops.txt

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions fp-00_download-data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Download the raw data
# Load library
library(downloader)
library(stringr)

# Fetch the raw archives into data/ ----
# Divvy trip data
divvy_url <- "https://s3.amazonaws.com/divvy-data/tripdata/Divvy_Trips_2016_Q1Q2.zip"
download(url = divvy_url, destfile = "data/Divvy_Trips_2016_Q1Q2.zip")

# CTA data: stops and schedule
cta_url <- "http://www.transitchicago.com/downloads/sch_data/google_transit.zip"
download(url = cta_url, destfile = "data/google_transit.zip")

# Extract every zip archive found in data/ back into data/ ----
datazip <- list.files("data", pattern = "\\.zip$")

for (zip_path in file.path("data", datazip)) {
  unzip(zip_path, exdir = "data")
}

89 changes: 89 additions & 0 deletions fp-01_tidy_data1.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
##---------------------------------------------------------------##
## This script reads Divvy and CTA data into R and tidies them, ##
## in order to create a tidy data frame for Divvy trips with a ##
## spatial variable, which classifies all trips into four ##
## different groups, based on the stations from and to which ##
## each trip was made and on whether those stations are in ##
## proximity to any CTA stop. ##
##---------------------------------------------------------------##

# Load libraries
library(tidyverse)
library(feather)

## READ AND TRANSFORM THE CTA DATA
## Read the two CTA datasets on public transit stops and combine them.
# One table of stops and one table of stop times, both keyed by stop_id
CTAStops <- read_csv("data/stops.txt")
CTAStopTimes <- read_csv("data/stop_times.txt")

# Attach the stop attributes to every stop-time row, then drop the
# less meaningful variables
CTAStopTimeLocation <- CTAStopTimes %>%
  left_join(CTAStops, by = "stop_id") %>%
  select(
    -stop_sequence,
    -stop_headsign,
    -shape_dist_traveled,
    -stop_code,
    -stop_desc,
    -wheelchair_boarding
  )

# Store the combined table as a feather file
write_feather(CTAStopTimeLocation, "data/CTA_Stop_time_location.feather")


## READ AND TRANSFORM THE DIVVY DATA
## Read two Divvy datasets: 1) Divvy trips and 2) locations of Divvy stations.
DivvyStationFile <- "data/Divvy_Trips_2016_Q1Q2/Divvy_Stations_2016_Q1Q2.csv"

# List every CSV in the Divvy folder, then keep all of them except the station
# file. Selecting by name instead of by position (previously `[2:5]`) means the
# result does not depend on how the directory listing happens to be ordered.
DivvyAllFiles <- list.files(
  "data/Divvy_Trips_2016_Q1Q2",
  pattern = "\\.csv$",
  full.names = TRUE
)
DivvyTripFiles <- DivvyAllFiles[
  basename(DivvyAllFiles) != basename(DivvyStationFile)
]
if (length(DivvyTripFiles) == 0) {
  stop(
    "No Divvy trip files found in data/Divvy_Trips_2016_Q1Q2",
    call. = FALSE
  )
}

# Read every trip file and stack them into a single data frame
DivvyTrip <- DivvyTripFiles %>%
  map(read_csv) %>%
  bind_rows()

# Read the station file and give its columns short names.
# NOTE(review): the rename is positional, so it assumes the file's six columns
# arrive in the order id, name, latitude, longitude, capacity, online date --
# confirm against the raw CSV header.
DivvyStation <- read_csv(DivvyStationFile)
colnames(DivvyStation) <- c("id", "name", "lat", "lon", "dpcapacity", "online_date")

# Add a proximity variable to the station data
# Coordinate matrices, one row per station/stop, in (longitude, latitude) order
Divvy_m <- cbind(DivvyStation$lon, DivvyStation$lat)
CTA_m <- cbind(CTAStops$stop_lon, CTAStops$stop_lat)

# Pairwise distances: rows are Divvy stations, columns are CTA stops.
# geosphere is not attached by this script, so its functions are called
# through the namespace instead of relying on a missing library() call.
distance_m <- geosphere::distm(Divvy_m, CTA_m, fun = geosphere::distHaversine)

# TRUE where a station/stop pair is <= 150 meters (approximately 0.1 mile) apart
distance150 <- distance_m <= 150
# 1 if a Divvy station is <= 150 m from any CTA stop; otherwise 0
proximity150 <- (rowSums(distance150) > 0) * 1
DivvyStation$proximity <- proximity150

# (row, col) positions of every station/stop pair within 150 m
near_idx <- which(distance150, arr.ind = TRUE)

# Translate matrix positions into Divvy station ids and CTA stop ids for ALL
# pairs at once. The previous loop iterated over `range(1, ncol(...))`, which
# is just c(1, 2), so only the first two pairs were ever converted.
# Column names match those in `DivvyStation` and `CTAStops`.
index150 <- tibble(
  id = DivvyStation$id[near_idx[, 1]],
  stop_id = CTAStops$stop_id[near_idx[, 2]]
)

# Inspect the result: one row per (station, nearby stop) pair; stations with
# no nearby stop get a missing stop_id
test <- left_join(DivvyStation, index150, by = "id")
print(test, n = 30)

# Add spatial variables: latitude, longitude and CTA proximity of the
# "from" and "to" stations of every trip
# Station lookup tables, renamed so the columns line up with the trip data
FromStation <- DivvyStation %>%
  select(
    from_station_id = id,
    from_lon = lon,
    from_lat = lat,
    from_prox = proximity
  )

ToStation <- DivvyStation %>%
  select(
    to_station_id = id,
    to_lon = lon,
    to_lat = lat,
    to_prox = proximity
  )

# Join on the station id explicitly, so the key cannot silently change if
# another column name ever happens to be shared between the two tables
DivvyData_from <- left_join(DivvyTrip, FromStation, by = "from_station_id")
DivvyData <- left_join(DivvyData_from, ToStation, by = "to_station_id")

# Parse starttime and stoptime (month/day/year hour:minute) as date-times in
# the America/Chicago time zone
DivvyData$starttime <- as.POSIXct(
  DivvyData$starttime,
  format = "%m/%d/%Y %H:%M",
  tz = "America/Chicago"
)
DivvyData$stoptime <- as.POSIXct(
  DivvyData$stoptime,
  format = "%m/%d/%Y %H:%M",
  tz = "America/Chicago"
)

# Separate dates and times for the starttime and stoptime variables.
# The date-times are first written out with a fixed "date time" layout:
# splitting a POSIXct column directly relies on its implicit character
# conversion, which may drop the seconds or the whole time part and would
# leave the *_hms columns incomplete.
DivvyData <- DivvyData %>%
  mutate(
    starttime = format(starttime, "%Y-%m-%d %H:%M:%S"),
    stoptime = format(stoptime, "%Y-%m-%d %H:%M:%S")
  ) %>%
  separate(starttime, c("starttime_ymd", "starttime_hms"), sep = " ") %>%
  separate(stoptime, c("stoptime_ymd", "stoptime_hms"), sep = " ")

# Write the outcomes into feather files and store them
write_feather(DivvyData, "data/Divvy_clean.feather")
write_feather(DivvyStation, "data/Divvy_station.feather")
37 changes: 37 additions & 0 deletions fp-02_tidy-data2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
##---------------------------------------------------------------##
## This script further transforms Divvy and CTA data in order to ##
## add the temporal variable, which classifies all Divvy trips ##
## into two different groups: ones likely to be multi-modal and ##
## the others that are not. ##
##---------------------------------------------------------------##

# Load libraries
library(tidyverse)
library(feather)
library(lubridate)

# Load the stored feather files
DivvyData <- read_feather("data/Divvy_clean.feather")
CTAStopTimeLocation <- read_feather("data/CTA_Stop_time_location.feather")

# Trips that STARTED at a station in proximity to a CTA stop:
# drop the stop-side columns, keep from_prox == 1, then parse the start
# date and start time
DivvyData_from_prox <- DivvyData %>%
  select(
    -stoptime_ymd, -stoptime_hms,
    -to_station_id, -to_station_name,
    -to_lon, -to_lat, -to_prox
  ) %>%
  filter(from_prox == 1) %>%
  mutate(
    starttime_ymd = ymd(starttime_ymd),
    starttime_hms = hms(starttime_hms)
  )

# Trips that STOPPED at a station in proximity to a CTA stop:
# drop the start-side columns, keep to_prox == 1, then parse the stop
# date and stop time
DivvyData_to_prox <- DivvyData %>%
  select(
    -starttime_ymd, -starttime_hms,
    -from_station_id, -from_station_name,
    -from_lon, -from_lat, -from_prox
  ) %>%
  filter(to_prox == 1) %>%
  mutate(
    stoptime_ymd = ymd(stoptime_ymd),
    stoptime_hms = hms(stoptime_hms)
  )

# CTA stop times split into two tables: one keeping only the arrival time,
# one keeping only the departure time, each parsed with hms()
CTAStop_arr <- CTAStopTimeLocation %>%
  select(-departure_time) %>%
  mutate(arrival_time = hms(arrival_time))

CTAStop_dep <- CTAStopTimeLocation %>%
  select(-arrival_time) %>%
  mutate(departure_time = hms(departure_time))
38 changes: 38 additions & 0 deletions fp-issue02.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
##---------------------------------------------------------------##
## This script reads Divvy station and CTA stop data into R, in ##
## order to create a tidy Divvy station data frame with a ##
## spatial variable. This is a mini script for Issue 02. ##
##---------------------------------------------------------------##

library(tidyverse)
library(feather)

# CTA stop file
CTAStops <- read_csv("data/stops.txt")

# Divvy station file, with its six columns renamed by position
DivvyStation <- setNames(
  read_csv("data/Divvy_Trips_2016_Q1Q2/Divvy_Stations_2016_Q1Q2.csv"),
  c("id", "name", "lat", "lon", "dpcapacity", "online_date")
)

# Add a proximity variable to the station data
# Coordinate matrices, one row per station/stop, in (longitude, latitude) order
Divvy_m <- cbind(DivvyStation$lon, DivvyStation$lat)
CTA_m <- cbind(CTAStops$stop_lon, CTAStops$stop_lat)

# Pairwise distances: rows are Divvy stations, columns are CTA stops.
# geosphere is not attached by this script, so its functions are called
# through the namespace instead of relying on a missing library() call.
distance_m <- geosphere::distm(Divvy_m, CTA_m, fun = geosphere::distHaversine)

# TRUE where a station/stop pair is <= 150 meters (approximately 0.1 mile) apart
distance150 <- distance_m <= 150
# 1 if a Divvy station is <= 150 m from any CTA stop; otherwise 0
proximity150 <- (rowSums(distance150) > 0) * 1
DivvyStation$proximity <- proximity150

# (row, col) positions of every station/stop pair within 150 m
near_idx <- which(distance150, arr.ind = TRUE)

# Translate matrix positions into Divvy station ids and CTA stop ids for ALL
# pairs at once. The previous loop iterated over `range(1, ncol(...))`, which
# is just c(1, 2), so only the first two pairs were ever converted.
# Column names match those in `DivvyStation` and `CTAStops`.
index150 <- tibble(
  id = DivvyStation$id[near_idx[, 1]],
  stop_id = CTAStops$stop_id[near_idx[, 2]]
)

# Inspect the result: one row per (station, nearby stop) pair; stations with
# no nearby stop get a missing stop_id
test <- left_join(DivvyStation, index150, by = "id")
print(test, n = 30)
20 changes: 20 additions & 0 deletions fp-runfile.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
##---------------------------------------------------------------##
## This script creates the necessary directories to store key ##
## outputs of the project and runs all the scripts in order so ##
## that they download, transform, and analyse the data as well ##
## as visualize the analysis in a presentable format. ##
##---------------------------------------------------------------##

## clean out any previous work
# Each output folder is deleted with everything in it and recreated empty, so
# every run starts from a clean slate (data/ is refilled by the download script)
paths <- c("data", "graphics", "output")

for (path in paths) {
  unlink(path, recursive = TRUE) # delete folder and contents
  dir.create(path)               # create empty folder
}

## run my scripts
# File names must match the scripts in this repository exactly; the previous
# "fb-..." names (and "tidy-data1") did not exist, so source() failed
source("fp-00_download-data.R")
source("fp-01_tidy_data1.R")
# rmarkdown::render("index.Rmd")
# rmarkdown::render("about.Rmd")

0 comments on commit fed1166

Please sign in to comment.