Merge pull request #3 from bobaekang/master

Updating the fp repo
cfss-old · Nov 29, 2016 · fed1166 · fed1166
2 parents 6b4d5e6 + 28e955a
commit fed1166
Show file tree

Hide file tree

Showing 8 changed files with 12,275 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -1,5 +1,16 @@
 # Final project
 
+# Instruction---------------------------------------#
 This is the repository for your final project. It is up to you to decide how to structure your files and directories. The final result will be a website created using [R Markdown](http://rmarkdown.rstudio.com/rmarkdown_websites.html).
 
-You can see the rendered example site [here](https://uc-cfss.github.io/fp-template/).
+You can see the rendered example site [here](https://uc-cfss.github.io/fp-template/).
+#---------------------------------------------------#
+
+# Summary of my project-----------------------------#
+Summary of my project in words will be provided here.
+#---------------------------------------------------#
+
+# Scripts-------------------------------------------#
+A brief explanation of each script file will be provided here.
+#---------------------------------------------------#
+
diff --git a/data/Divvy_Trips_2016_Q1Q2/Divvy_Stations_2016_Q1Q2.csv b/data/Divvy_Trips_2016_Q1Q2/Divvy_Stations_2016_Q1Q2.csv
diff --git a/data/stops.txt b/data/stops.txt
diff --git a/fp-00_download-data.R b/fp-00_download-data.R
@@ -0,0 +1,22 @@
+# Download the raw data
+# Load library
+library(downloader)
+library(stringr)
+
+# Download the data
+# Make sure the destination folder exists, so that this script also works
+# when it is run on its own (dir.create() only warns if the folder exists,
+# and that warning is silenced here).
+dir.create("data", showWarnings = FALSE)
+
+# Divvy data
+# mode = "wb" is forwarded to download.file() so the zip archive is always
+# transferred as a binary file.
+download(url = "https://s3.amazonaws.com/divvy-data/tripdata/Divvy_Trips_2016_Q1Q2.zip",
+         destfile = "data/Divvy_Trips_2016_Q1Q2.zip",
+         mode = "wb")
+
+# CTA data: stops and schedule
+download(url = "http://www.transitchicago.com/downloads/sch_data/google_transit.zip",
+         destfile = "data/google_transit.zip",
+         mode = "wb")
+
+# Unzip every archive found in the data folder, extracting into that folder
+datazip <- list.files("data", pattern = "\\.zip$")
+
+for (zipfile in datazip) {
+  filepath <- str_c("data/", zipfile)
+  unzip(filepath, exdir = "data")
+}
+
diff --git a/fp-01_tidy_data1.R b/fp-01_tidy_data1.R
@@ -0,0 +1,89 @@
+##---------------------------------------------------------------##
+## This script reads Divvy and CTA data into R and tidies them   ##
+## in order to create a tidy data frame for Divvy trips with a   ##
+## spatial variable, which classifies all trips into four        ##
+## different groups, based on the stations from and to which     ##
+## each trip was made and on whether those stations are in       ##
+## proximity to any CTA stop.                                    ##
+##---------------------------------------------------------------##
+
+# Load libraries
+library(tidyverse)
+library(feather)
+
+## READ AND TRANSFORM THE CTA DATA
+## The following code reads the two CTA datasets (stops and stop times) and
+## combines them into one data frame.
+# Load both source files
+CTAStops <- read_csv("data/stops.txt")
+CTAStopTimes <- read_csv("data/stop_times.txt")
+
+# Attach the stop information to every stop time (matched on stop_id), then
+# drop the less meaningful variables
+CTAStopTimeLocation <- CTAStopTimes %>%
+  left_join(CTAStops, by = "stop_id") %>%
+  select(-c(stop_sequence, stop_headsign, shape_dist_traveled,
+            stop_code, stop_desc, wheelchair_boarding))
+
+# Store the combined data as a feather file
+write_feather(CTAStopTimeLocation, "data/CTA_Stop_time_location.feather")
+
+
+## READ AND TRANSFORM THE DIVVY DATA
+## The following code reads two Divvy datasets: 1) Divvy trips and
+## 2) locations of Divvy stations.
+# Path of the station file; it lives in the same folder as the trip files
+DivvyStationFile <- "data/Divvy_Trips_2016_Q1Q2/Divvy_Stations_2016_Q1Q2.csv"
+
+# Create a vector of the names of the Divvy trip files.
+# The trip files are selected by excluding the station file by name rather
+# than by position, so the result does not depend on the listing order.
+# NOTE(review): assumes every other CSV in this folder is a trip file - confirm.
+DivvyAllFiles <- list.files("data/Divvy_Trips_2016_Q1Q2", pattern = "\\.csv$", full.names = TRUE)
+DivvyTripFiles <- DivvyAllFiles[basename(DivvyAllFiles) != basename(DivvyStationFile)]
+stopifnot(length(DivvyTripFiles) > 0) # fail early if the data were not downloaded
+
+# Use map function to read all trip files and bind them into one data frame
+DivvyTrip <- DivvyTripFiles %>%
+  map(read_csv) %>%
+  bind_rows()
+
+# Read the station file and give its six columns short names (by position)
+DivvyStation <- read_csv(DivvyStationFile)
+colnames(DivvyStation) <- c("id", "name", "lat", "lon", "dpcapacity", "online_date")
+
+# adding a proximity variable to station data
+# distm() and distHaversine() come from the geosphere package, which this
+# script does not attach, so they are called with an explicit namespace.
+Divvy_m <- cbind(DivvyStation$lon, DivvyStation$lat)
+CTA_m <- cbind(CTAStops$stop_lon, CTAStops$stop_lat)
+# distance matrix: one row per Divvy station, one column per CTA stop
+distance_m <- geosphere::distm(Divvy_m, CTA_m, fun = geosphere::distHaversine)
+
+distance150 <- distance_m <= 150 # check if the distance is <=150 meters or approximately 0.1 mile
+proximity150 <- (rowSums(distance150) > 0) * 1 # a Divvy station is <=150m from any CTA stop, 1; otherwise, 0
+DivvyStation$proximity <- proximity150
+
+# (row, column) positions of every station/stop pair that is <= 150 m apart
+near_pairs <- which(distance150, arr.ind = TRUE)
+# Switch the positions to id numbers for ALL pairs at once; column names match
+# those in `DivvyStation` and `CTAStops`. (Looping over
+# range(1, ncol(...)) would only visit rows 1 and 2.)
+index150 <- tibble(
+  id = DivvyStation$id[near_pairs[, 1]],
+  stop_id = CTAStops$stop_id[near_pairs[, 2]]
+)
+# One row per station/stop pair; stations without a nearby stop get NA stop_id
+test <- left_join(DivvyStation, index150, by = "id")
+print(test, n = 30)
+
+# Adding spatial variables: latitude and longitude of from and to stations
+# Two renamed copies of the station table, one per trip end
+FromStation <- DivvyStation %>%
+  select(from_station_id = id, from_lon = lon, from_lat = lat, from_prox = proximity)
+
+ToStation <- DivvyStation %>%
+  select(to_station_id = id, to_lon = lon, to_lat = lat, to_prox = proximity)
+
+# Join keys are spelled out so the match never depends on which column names
+# the two tables happen to share
+DivvyData_from <- left_join(DivvyTrip, FromStation, by = "from_station_id")
+DivvyData <- left_join(DivvyData_from, ToStation, by = "to_station_id")
+
+# make starttime and stoptime variables time data, then
+# separate dates and time for starttime and stoptime variables.
+# The parsed date-time is written back with an explicit format() so that the
+# text split by separate() always holds both a date and a time part; implicit
+# character conversion of date-times can omit the time part (e.g. at midnight).
+DivvyTimeIn <- "%m/%d/%Y %H:%M"      # layout of the raw time stamps
+DivvyTimeOut <- "%Y-%m-%d %H:%M:%S"  # layout handed to separate()
+DivvyData <- DivvyData %>%
+  mutate(
+    starttime = format(as.POSIXct(starttime, format = DivvyTimeIn, tz = "America/Chicago"),
+                       DivvyTimeOut),
+    stoptime = format(as.POSIXct(stoptime, format = DivvyTimeIn, tz = "America/Chicago"),
+                      DivvyTimeOut)
+  ) %>%
+  separate(starttime, c("starttime_ymd", "starttime_hms"), " ") %>%
+  separate(stoptime, c("stoptime_ymd", "stoptime_hms"), " ")
+
+# Write the outcome into feather file and store
+write_feather(DivvyData, "data/Divvy_clean.feather")
+write_feather(DivvyStation, "data/Divvy_station.feather")
diff --git a/fp-02_tidy-data2.R b/fp-02_tidy-data2.R
@@ -0,0 +1,37 @@
+##---------------------------------------------------------------##
+## This script further transforms Divvy and CTA data in order to ##
+## add the temporal variable, which classifies all Divvy trips   ##
+## into two different groups; ones likely to be multi-modal and  ##
+## the others that are not.                                      ##
+##---------------------------------------------------------------##
+
+# Load libraries
+library(tidyverse)
+library(feather)
+library(lubridate)
+
+# Read in the two feather files used in this script
+DivvyData <- read_feather("data/Divvy_clean.feather")
+CTAStopTimeLocation <- read_feather("data/CTA_Stop_time_location.feather")
+
+# Divvy trips that started at stations in proximity with CTA stops
+# (the stop-time and to-station columns are dropped)
+DivvyData_from_prox <- DivvyData %>%
+  filter(from_prox == 1) %>%
+  select(-c(stoptime_ymd, stoptime_hms, to_station_id, to_station_name,
+            to_lon, to_lat, to_prox))
+DivvyData_from_prox[["starttime_ymd"]] <- ymd(DivvyData_from_prox[["starttime_ymd"]])
+DivvyData_from_prox[["starttime_hms"]] <- hms(DivvyData_from_prox[["starttime_hms"]])
+
+# Divvy trips that stopped at stations in proximity with CTA stops
+# (the start-time and from-station columns are dropped)
+DivvyData_to_prox <- DivvyData %>%
+  filter(to_prox == 1) %>%
+  select(-c(starttime_ymd, starttime_hms, from_station_id, from_station_name,
+            from_lon, from_lat, from_prox))
+DivvyData_to_prox[["stoptime_ymd"]] <- ymd(DivvyData_to_prox[["stoptime_ymd"]])
+DivvyData_to_prox[["stoptime_hms"]] <- hms(DivvyData_to_prox[["stoptime_hms"]])
+
+# CTA trip times, divided into arrivals and departures; each copy keeps only
+# its own time column, converted with hms()
+CTAStop_arr <- select(CTAStopTimeLocation, -departure_time)
+CTAStop_arr[["arrival_time"]] <- hms(CTAStop_arr[["arrival_time"]])
+CTAStop_dep <- select(CTAStopTimeLocation, -arrival_time)
+CTAStop_dep[["departure_time"]] <- hms(CTAStop_dep[["departure_time"]])
diff --git a/fp-issue02.R b/fp-issue02.R
@@ -0,0 +1,38 @@
+##---------------------------------------------------------------##
+## This script reads Divvy station and CTA stop data into R, in  ##
+## order to create a tidy Divvy station data frame with a        ##
+## spatial variable. This is a mini script for Issue 02.         ##
+##---------------------------------------------------------------##
+
+library(tidyverse)
+library(feather)
+
+# Read the CTA stop file
+CTAStops <- read_csv("data/stops.txt")
+
+# Read the Divvy station file and give its six columns short names (by position)
+DivvyStation <- read_csv("data/Divvy_Trips_2016_Q1Q2/Divvy_Stations_2016_Q1Q2.csv")
+colnames(DivvyStation) <- c("id", "name", "lat", "lon", "dpcapacity", "online_date")
+
+# adding a proximity variable to station data
+# distm() and distHaversine() come from the geosphere package, which this
+# script does not attach, so they are called with an explicit namespace.
+Divvy_m <- cbind(DivvyStation$lon, DivvyStation$lat)
+CTA_m <- cbind(CTAStops$stop_lon, CTAStops$stop_lat)
+# distance matrix: one row per Divvy station, one column per CTA stop
+distance_m <- geosphere::distm(Divvy_m, CTA_m, fun = geosphere::distHaversine)
+
+distance150 <- distance_m <= 150 # check if the distance is <=150 meters or approximately 0.1 mile
+proximity150 <- (rowSums(distance150) > 0) * 1 # a Divvy station is <=150m from any CTA stop, 1; otherwise, 0
+DivvyStation$proximity <- proximity150
+
+# (row, column) positions of every station/stop pair that is <= 150 m apart
+near_pairs <- which(distance150, arr.ind = TRUE)
+# Switch the positions to id numbers for ALL pairs at once; column names match
+# those in `DivvyStation` and `CTAStops`. (Looping over
+# range(1, ncol(...)) would only visit rows 1 and 2.)
+index150 <- tibble(
+  id = DivvyStation$id[near_pairs[, 1]],
+  stop_id = CTAStops$stop_id[near_pairs[, 2]]
+)
+# One row per station/stop pair; stations without a nearby stop get NA stop_id
+test <- left_join(DivvyStation, index150, by = "id")
+print(test, n = 30)
diff --git a/fp-runfile.R b/fp-runfile.R
@@ -0,0 +1,20 @@
+##---------------------------------------------------------------##
+## This script creates the necessary directories to store key    ##
+## outputs of the project and runs all the scripts in order so   ##
+## that they download, transform, and analyse the data as well   ##
+## as visualize the analysis in a presentable format.            ##
+##---------------------------------------------------------------##
+
+## clean out any previous work
+## NOTE: this removes the three folders below together with everything in them
+paths <- c("data", "graphics", "output")
+
+for (path in paths) {
+  unlink(path, recursive = TRUE)    # delete folder and contents
+  dir.create(path)                  # create empty folder
+}
+
+## run my scripts
+## (the names must match the script files exactly: "fp-" prefix, and an
+## underscore in "tidy_data1")
+source("fp-00_download-data.R")
+source("fp-01_tidy_data1.R")
+# rmarkdown::render("index.Rmd")
+# rmarkdown::render("about.Rmd")