Strava Ottawa Temporal Patterns

Colin Ferster and Vanessa Brum-Bastos September 23, 2019

The purpose of this document is to explore temporal patterns in Ottawa Strava Metro data.

# load packages
library(rgdal)
library(raster)
library(timeDate)
library(RColorBrewer)
library(dplyr)
library(magrittr)
library(chron)
library(ggplot2)
library(dtw)
library(tidyr)
library(parallelDist)
library(fpc)
library(pracma)
library(prettymapr)

1. Preprocess Strava Metro data

Get hourly means for weekdays in non-winter months

# 1 minute core street level Strava Metro ride data for edges in the study area
data <- read.csv("edges_ride_2016_studyArea.csv")

# Strava edges (street segments) clipped to the study area (spatial data)
edges <- readOGR(".", "studyArea_edges")

## OGR data source with driver: ESRI Shapefile 
## Source: "D:\strava_temporalPatterns\ottawa", layer: "studyArea_edges"
## with 4041 features
## It has 34 fields
## Integer64 fields read as strings:  GID RCSTATUT RCTYPTRON RCPROPRIO RCSENSUNIQ RCHIERARCH RCVITESS RRTYPSENT RRPROPRIO RRSTATUT RRROUTVERT

edges_utm <- spTransform(edges, CRS("+init=epsg:32618")) # project to UTM 11N NAD83 for Ottawa

# format Strava dates, get weekdays and season
data$date <- strptime(paste0(data$day,
                             " ",
                             data$year, " ",
                             data$hour, " ",
                             data$minute),
                             "%j %Y %H %M")

data$date_day <- format(data$date, "%Y-%m-%d")

# business days from Toronto Stock Exchange (TSX) calendar
data$weekday <- isBizday(as.timeDate(data$date_day),
                         holidayTSX(as.integer(unique(data$year))))

# season
# winter months with minimum daily temp less than zero degrees Celsius
# source: http://climate.weather.gc.ca/climate_normals/results_1981_2010_e.html
# Ottawa CDA (weather station nearest the study area)
winter <- c("11", "12", "01", "02", "03") 
data$month <- format.Date(data$date, format="%m")
data$winter <- data$month %in% winter

# select weekdays not in winter
data.summer.weekday <- data[which(data$weekday == T & data$winter == F),]

# sum Strava activity counts by hour for each day
data.summer.weekday.hourly <- aggregate(total_activity_count
                        ~edge_id + hour + date_day,
                        data=data.summer.weekday, 
                        FUN="sum")

# get hourly mean of Strava activity counts 
data.summer.weekday.hourly.means <- aggregate(total_activity_count ~ edge_id + hour,
                                            FUN = "mean",
                                            data=data.summer.weekday.hourly)

# change column names to indicate that it is the calculated mean of Strava activity counts
names(data.summer.weekday.hourly.means) <- c("edge_id", "hour", "mean_activity_count")

2. Descriptive plots

# create a matrix with Strava counts for each edge at each hour
unique_edges = unique(data.summer.weekday.hourly.means$edge_id)

# sequence of 24 hours, counting from zero
unique_hours = seq(0, 23, 1)

matrix <- data.frame(unique_edges)

for (i in 1:length(unique_hours)) {
  matrix[, i+1] <- NA
}

names(matrix) <- c("edge_id", paste0(unique_hours))

# populate the matrix with Strava counts
for (h in unique_hours){
  
  subs <- subset(data.summer.weekday.hourly.means, hour==h)
  un_edges = unique(subs$edge_id)
  
  for (id in un_edges){
    sub2 <- subset(subs, edge_id == id)
    row1 <- which(matrix$edge_id == id)
    # index + 2: 
    # +1 to skip edge_id 
    # +1 because we count hours from zero, yet the index starts at 1
    matrix[row1,h+2] <- sub2$mean_activity_count 
    
  }
  
}

# Assign 0 to NA (NA indicates a zero count)
matrix[is.na(matrix)] <- 0

# plot 24 hrs of aggregated Strava data
hourRange <- 1:24 # names of the hours
hourIndex <- 2:25 # index of the hours in the matrix

# get the range of Strava counts (find the highest hourly mean)
maxRange <- c(0, max(c(max(matrix[, hourIndex]))) +2) # + 2 is to increase the margin at the top of the plot

# matrix of average Strava counts by hour
matrix.hours <- data.matrix(matrix[1:nrow(matrix), hourIndex]) 

# labels for the tickmarks every 3 hours
threeHourTicks <- c(1, 4, 7, 10, 13, 16, 19, 22) # tickmark locations
threeHourLabels <- c("0", "3", "6", "9", "12", "15", "18", "21") # tickmark labels

# adjust margins
par(mar=c(3.9, 3.8, 0.5, 0.7))

{  
    matplot(t(matrix.hours), type="l", 
        ylim = c(0,maxRange[2]),
        ylab = 'Average count', 
        xlab = 'Hour', 
        axes = F, 
        xaxs = "i",  yaxs="i", 
        cex = 0.7)
  
  # x axis ticks and labels
  axis(1, at = 1:length(hourRange), 
       labels = NA, cex = 0.7) # ticks for each hour
  axis(1, at = threeHourTicks, 
       labels = threeHourLabels, cex = 0.7) # label every 3 hours
  
  # y axis ticks and labels
  axis(2, at = seq(0, maxRange[2], 5), 
       labels = seq(0, maxRange[2], 5), 
       cex = 0.7)
  axis(2, at = 0, labels = 0, 
       cex = 0.7) # make sure there is a tick at 0
}

3. Calculate dynamic time warped (DTW) distance matrix and cluster into groups

# calculate statistical distance between all edges using dynamic time warping (DTW), 
# Use a +-1 hour window using a sakoechiba band
alignment <- parDist(matrix.hours, 
                   method = "dtw", 
                   window.size = 1, 
                   window.type ='sakoechiba')

# heirarchical clustering using Ward's d
hclust_avg <- hclust(alignment, method = 'ward.D')

# produce a tree plot of the clusters
plot(hclust_avg)

# calculate the Calinski-Harabasz index for clusters ranging from 1 to 50
cl <- seq(from = 1, to = 50, by = 1)
x <- character(0)
y <- character(0)

# caculate the Calinski-Harabasz index for each level of pruning
for (n in cl){
  cl1.4 <- cutree(hclust_avg, k= n)
  ind <- calinhara(matrix.hours, cl1.4, cn = max(cl1.4))
  x <- c(x, n)
  y <- c(y, ind)
}

# plot of Calinski-Harabasz index for levels of pruning
{
plot(x, y, type='l', axes = FALSE, xlim=c(1, 50), ylim = c(0, 4500))
axis(side = 1, at = seq(50, 1))
axis(side = 2, at = seq(0, 4500, 100))
}

# inflection point at 6, so we pick 6 clusters
groups <- cutree(hclust_avg, k = 6)

4. Plot the clusters

# assign the cluster numbers to the edges
seeds_df_cl <- matrix
seeds_df_cl$cluster <- groups
count(seeds_df_cl, cluster)

## # A tibble: 6 x 2
##   cluster     n
##     <int> <int>
## 1       1  1131
## 2       2  1448
## 3       3   217
## 4       4    79
## 5       5   701
## 6       6   304

# get the unique cluster names
clusters <- unique(seeds_df_cl$cluster)

#plot the time series for each cluster
{
  par(mfrow = c(3, 2))
  
  # loop through each cluster and plot all of the time series
  for (n in clusters){
    
    # get the time series in each cluster
    c <- seeds_df_cl[which(seeds_df_cl$cluster == n), ]
    
    {
    title= paste('Cluster', as.character(n), '- n =', as.character(nrow(c)))
    matplot(t(c[, -which(names(c) %in% c("edge_id", "peaks", "cluster"))]), 
            type="l", ylab="average activities", axes = F, ylim = c(0, maxRange[2]),
            main = title)
    # x axis ticks and labels
    axis(1, at = 1:length(hourRange), labels = paste(min(hourRange):max(hourRange), 'h'), cex.axis=0.7)
    
    # y axis ticks and labels
    axis(2, at =seq(0,maxRange[2], 5), labels =seq(0, maxRange[2],5))
    }
  }
}

# plot means on the same graph
seeds_df_cl_means <- seeds_df_cl

# add an "X" to variable names to make it easier to use formulas
names(seeds_df_cl_means) <- paste0("X", names(seeds_df_cl_means))

# calculate the mean time series for each cluster
meanCurves <- aggregate(cbind(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10,
                              X11, X12, X13, X14, X15, X16, X17, X18, X19,
                              X20, X21, X22, X23) ~ Xcluster, 
                        data = seeds_df_cl_means,
                        FUN = "mean")

# 6 colour scheme
cols <- brewer.pal(6, "Dark2")
par(mfrow = c(1, 1))

{
plot(x = as.character(hourRange),
     y = meanCurves[1, 2:ncol(meanCurves)], 
     ylim = c(0, max(meanCurves)), pch = NA, # empty plot - add lines later
     xlab = "hour", ylab = "mean count", main = "mean curves for each cluster")
  
  # plot each mean curve
  for(i in 1:6){
  lines(x = as.character(hourRange),
        y = meanCurves[i, 2:ncol(meanCurves)], 
        col = cols[i])
  }

  # add legend  
  legend("topright",col = cols, lty= 1, legend = paste0(1:6))
}

# map the clusters

# merge the spatial data
edges_classes <- merge(edges_utm, seeds_df_cl, by.x = "GID", by.y ="edge_id", all.x =T)

# make sure the order in the legend is correct
edges_classes$class <- as.factor(edges_classes$cluster)
class_labels <- levels(edges_classes$class)

# assign colours to each class
mapColors <- cols[as.integer(edges_classes$class)]

# add in black for zero count edges
mapColors[is.na(mapColors)] <- "#000000"
cols <- c(cols, "#000000")

# get labels for legend
legendLabels <- levels(edges_classes$class)
legendLabels <- c(legendLabels, "Zero count")

{
  plot(edges_classes, col = mapColors)
  legend("topright", lty = 1,
       col = cols,
       legend = legendLabels)
}

# There are n = 161 edges with zero counts. 
# They are mostly private service roads, highway ramps, or poorly 
# connected residential streets that are not high priorities for 
# bike count programs.

5. Interpret and name the clusters

# name the clusters
seeds_df_cl$class <- ifelse(seeds_df_cl$cluster==1,'Low use commute',
                          ifelse(seeds_df_cl$cluster==2,'Low use - connecting streets',
                          ifelse(seeds_df_cl$cluster==3,'Medium use commute',       
                          ifelse(seeds_df_cl$cluster==4,'High use commute',    
                          ifelse(seeds_df_cl$cluster==5,'Low use - major roads',
                          ifelse(seeds_df_cl$cluster==6,'Low-medium use commute',
                          "NA"))))))

# make class a factor variable
seeds_df_cl$class<- as.factor(seeds_df_cl$class)

# order the clusters by counts: highest to lowest
levels(seeds_df_cl$class)

## [1] "High use commute"             "Low-medium use commute"      
## [3] "Low use - connecting streets" "Low use - major roads"       
## [5] "Low use commute"              "Medium use commute"

seeds_df_cl$class = factor(seeds_df_cl$class, levels(seeds_df_cl$class)[c(1, 6, 2, 5, 3, 4)])

# names of the classes
classes<-unique(seeds_df_cl$class)

# match the above order (by counts: highest to lowest)
classes <- classes[c(4, 3, 6, 1, 2, 5)]

{
  # layout with two panels
  par(mfrow = c(3,2))
  
  # plot out the time series for all edges in each cluster
  for (n in classes){
    c<-seeds_df_cl[which(seeds_df_cl$class==n),]
    {
      # adjust margins
      par(mar=c(3.9, 3.8, 3.9, 0.7))
      title= paste(as.character(n),'- n =', as.character(nrow(c)))
        {  
            matplot(t(c[, -which(names(c) %in% c("edge_id", "cluster", "class"))]), type="l", 
                    ylim=c(0, maxRange[2]),
                    ylab='Average count', 
                    xlab='Hour', 
                    main = title,
                    axes=F, xaxs="i", yaxs="i", cex = 0.7)
              # x axis ticks
              axis(1, at = 1:length(hourRange), labels = NA, cex = 0.7)
              
              # x axis labels (every 3 hours)
              axis(1, at = threeHourTicks, labels = threeHourLabels, cex = 0.7)
              
              # y axis ticks and labels
              axis(2, at =seq(0,maxRange[2],5), labels =seq(0,maxRange[2],5), cex = 0.7)
        }
    }
  }
}

# plot means on the same graph
seeds_df_cl_means <- seeds_df_cl

# add an x to variable names because some start with numbers - hard to work with in formula
names(seeds_df_cl_means) <- paste0("X", names(seeds_df_cl_means))

# calculate the mean time series for each cluster
meanCurves <- aggregate(cbind(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10,
                              X11, X12, X13, X14, X15, X16, X17, X18, X19,
                              X20, X21, X22, X23) ~ Xclass, 
                        data = seeds_df_cl_means,
                        FUN = "mean")

# get the class names
classes <- levels(as.factor(seeds_df_cl_means$Xclass))

# get the count of classes
nclass <- length(classes)

# create a sequential colour scheme
colsCommute <- brewer.pal(5,"BuPu")

# add colours for low use classes
cols <- c(rev(colsCommute)[-5],"grey","#8dd3c7")

# 2 panel layout
par(mfrow = c(1,2))
{
par(mar=c(4.1, 4.1, 4.1, 0.5))

plot(x = as.character(hourRange),
     y=meanCurves[1, 2:ncol(meanCurves)], 
     ylim = c(0, max(meanCurves[,c(2:ncol(meanCurves))])),
     pch = NA, # empty plot, add lines later
     xlab = "Hour", ylab = "Mean count", main = "Mean temporal profile", 
     axes = F, xaxs="i", yaxs="i")
  
  # x axis ticks
  axis(1, at = 1:length(hourRange), labels = NA, cex = 0.7)
  # x axis labels
  axis(1, at = threeHourTicks, labels = threeHourLabels, cex = 0.7)

  # y axis ticks and labels
  axis(2, at =seq(0, maxRange[2],5), labels =seq(0, maxRange[2], 5), cex = 0.7)

  # add lines for mean time series for each cluster
  for(i in 1:nclass){
    lines(x = as.character(hourRange),
          y=meanCurves[i,2:ncol(meanCurves)], 
          col=cols[i], 
          lwd = 2)
  } 

}

# map
edges_classes <- merge(edges_utm, seeds_df_cl, by.x = "GID", by.y ="edge_id", all.x =T)

# make sure the order in the legend is correct
edges_classes$class <- as.factor(edges_classes$class)

# get class labels
class_labels <- levels(edges_classes$class)

# assign colours
mapColors <- cols[as.integer(edges_classes$class)]

# adjust margins
par(mar=c(1, 0.5, 0.5, 0.5))
  
{
  # plot map
  plot(edges_classes, col = mapColors, lwd = 2)
  legend(x = edges_classes@bbox[1, 1],
         y = edges_classes@bbox[2, 2],
         lty = 1,
         col = cols,
         legend = levels(edges_classes$class),
         lwd = 4,
         cex = 0.75)
  # add map elements
  # add scalebar
  scalebar(1000, label = "1 km")
  # add north arrow
  addnortharrow(scale = 0.6)
}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

stravaPatterns_ottawa.md

stravaPatterns_ottawa.md

Strava Ottawa Temporal Patterns

1. Preprocess Strava Metro data

2. Descriptive plots

3. Calculate dynamic time warped (DTW) distance matrix and cluster into groups

4. Plot the clusters

5. Interpret and name the clusters

Files

stravaPatterns_ottawa.md

Latest commit

History

stravaPatterns_ottawa.md

File metadata and controls

Strava Ottawa Temporal Patterns

1. Preprocess Strava Metro data

2. Descriptive plots

3. Calculate dynamic time warped (DTW) distance matrix and cluster into groups

4. Plot the clusters

5. Interpret and name the clusters