## Rcode_2019_Prediction-No2.R

## This file runs the same program that was created for the year 2020.
## Here the model is fitted on data up to the end of 2018 and used to
## predict the first 16 weeks of 2019, as was done before for the 2020
## data set, for the NO2 pollutant.

# Remove every (non-hidden) object from the current environment before the
# rest of the script runs.
rm(list = ls())


# Infix complement of `%in%`: TRUE for each element of `x` that does not
# occur in `table`.
`%notin%` <- function(x, table) !(x %in% table)


library(readxl)
library(tidyverse)
library(boot)
library(ggthemes)
library(grid)
library(gridExtra)
library(forecast)
library(data.table)
library(cowplot)

# Daily NO2 observations; the `date` column is converted to class Date.
dfpoll_orig <- read.csv("data_alltimeno2.csv")
dfpoll_orig$date <- as.Date(dfpoll_orig$date)

# State policy table, without the District of Columbia row and the summary
# ("Total ...") row.
state_policy <- filter(
  read.csv("state_policy_changes_1.csv"),
  State %notin% c("District of Columbia", "Total with each policy (out of 51 with DC)")
)

# Daily confounders (the columns tmmx, pr and rmax are used further down);
# the `date` column is converted to class Date.
confounders_daily <- read.csv("confounders_all.csv")
confounders_daily$date <- as.Date(confounders_daily$date)

#state.abb = unique(dfpoll_orig2$state)

# Abbreviations of the jurisdictions excluded from the analysis.
state.abb.delete <- c(
  "AK", "AL", "AR", "DC", "DE", "HI", "IL", "LA",
  "NE", "NH", "NY", "SD", "TN", "TX", "WV"
)

# Full names of the excluded states. "District of Columbia" is not listed
# here; this script drops it from the policy table when that table is read.
state.fullname.delete <- c(
  "Alabama", "Alaska", "Arkansas", "Delaware", "Hawaii", "Illinois",
  "Louisiana", "New Hampshire", "New York", "South Dakota", "Tennessee",
  "Texas", "Nebraska", "West Virginia"
)


### Drop the excluded states from each table, then rebuild the state column
### as a factor so that it only carries the levels that remain.

keep_poll <- !(dfpoll_orig$state %in% state.abb.delete)
dfpoll_orig2 <- dfpoll_orig[keep_poll, , drop = FALSE]
dfpoll_orig2$state <- factor(dfpoll_orig2$state)

keep_conf <- !(confounders_daily$stateabbr %in% state.abb.delete)
confounders_daily2 <- confounders_daily[keep_conf, , drop = FALSE]
confounders_daily2$stateabbr <- factor(confounders_daily2$stateabbr)

keep_policy <- !(state_policy$State %in% state.fullname.delete)
state_policy2 <- state_policy[keep_policy, , drop = FALSE]
state_policy2$State <- factor(state_policy2$State)

#################################################
## Cut-off date: only observations dated strictly
## before `maxdate` (2019-04-29) are kept when the
## per-state series are built
#################################################

maxdate <- "2019-04-29"

#################################################
## INPUT PARAMETERS
#################################################


## Prediction window: weeks starting on or after `ldate` and before `udate`
## are predicted; weeks starting before `ldate` are used for training.
nweekspred <- 16                  # number of weeks to predict
ldate <- as.Date("2019-01-01")    # first day of the prediction window
udate <- ldate + nweekspred * 7   # first day after the prediction window


#################################################
## LOOP OVER STATES
#################################################

## For every retained state this loop:
##   1. builds a gap-filled daily NO2 series and averages it into 7-day blocks,
##   2. does the same for the confounders (tmmx, pr, rmax),
##   3. fits auto.arima (with the confounders as xreg) to each bootstrap
##      replicate of the training series and forecasts the prediction window,
##   4. stores observed-minus-predicted differences and a plot of them.
## Results:
##   dfs_tosave[[i]] - data frame with columns mean_diff, sd_pred, date
##   p[[i]]          - ggplot of mean_diff with +/- 1.96 * sd_pred error bars
## `ldate`, `udate`, `nweekspred` and `maxdate` are taken from the parameter
## section of this script.

states <- as.character(unique(state_policy2$State))
dfs_tosave <- vector("list", length(states))
p <- vector("list", length(states))

n <- 7 ## number of daily rows averaged into one weekly value

# number of bootstraps
#  num_resamples=1000
num_resamples <- 10 # small number for testing

start.time <- Sys.time()

for (i in seq_along(states)) {
  state_fullname <- states[i]

  # Abbreviation of the state, looked up in the built-in `datasets` vectors.
  # The explicit namespace keeps a same-named global from shadowing them.
  state_name <- datasets::state.abb[match(state_fullname, datasets::state.name)]
  if (is.na(state_name)) {
    stop("No state abbreviation found for '", state_fullname, "'", call. = FALSE)
  }

  # Daily NO2: mean over rows sharing a date, completed to a gap-free daily
  # grid, gaps filled with the previous value, dates >= maxdate dropped.
  dfpoll <- dfpoll_orig2 %>%
    filter(state == state_name) %>%
    group_by(date) %>%
    summarise(no2 = mean(no2)) %>%
    complete(date = seq.Date(min(date), max(date), by = "day")) %>%
    fill("no2") %>%
    filter(date < as.Date(maxdate))

  cat("State = ", state_fullname, "  ")
  print(nrow(dfpoll))

  # Daily confounders for the same state, treated the same way.
  conf_state <- confounders_daily2 %>%
    filter(stateabbr == state_name) %>%
    filter(date < as.Date(maxdate)) %>%
    complete(date = seq.Date(min(date), max(date), by = "day")) %>%
    fill("tmmx", "pr", "rmax")

  # Use only complete blocks of n rows; seq_len() stays empty when m is 0.
  m <- (nrow(dfpoll) %/% n) * n
  week_rows <- seq_len(m)

  # Average every n consecutive rows. Subtracting the recycled offsets
  # 0:(n - 1) maps each row of a block to the block's first date, which is
  # used as the grouping key (returned in a column named `date`).
  # NOTE(review): conf_state is indexed with the row count of dfpoll, so this
  # assumes both daily series start on the same date and that conf_state has
  # at least m rows - confirm against the input files.
  dfweek <- setDT(dfpoll[week_rows, ])[, .(no2 = mean(no2)), by = date - 0:(n - 1)]
  xregs <- setDT(conf_state[week_rows, ])[
    , .(temp = mean(tmmx), ppt = mean(pr), hum = mean(rmax)),
    by = date - 0:(n - 1)
  ]

  # Training weeks start before ldate; test weeks start in [ldate, udate).
  train <- dfweek[date < ldate]
  test <- dfweek[date >= ldate & date < udate]
  xregs_train <- as.matrix(xregs[date < ldate, .(temp, ppt, hum)])
  xregs_test <- as.matrix(xregs[date >= ldate & date < udate, .(temp, ppt, hum)])

  # The forecasts are later subtracted from test$no2 element by element, so
  # the two test sets must describe the same weeks.
  if (nrow(test) != nrow(xregs_test)) {
    stop(
      "Pollutant and confounder test sets differ in length for ", state_fullname,
      " (", nrow(test), " vs ", nrow(xregs_test), " weeks)",
      call. = FALSE
    )
  }

  # Bootstrap replicates of the training series; one forecast per replicate.
  # NOTE(review): no RNG seed is set in this script, so the replicates (and
  # therefore the results) differ between runs - consider set.seed().
  no2_train <- ts(train$no2)
  sim <- bld.mbb.bootstrap(no2_train, num_resamples)
  preds <- vector("list", length(sim))

  for (j in seq_along(sim)) {
    model <- auto.arima(
      sim[[j]],
      xreg = xregs_train,
      max.p = 100, max.q = 100, max.P = 100, max.Q = 100
    )
    fc <- forecast(model, h = nweekspred, xreg = xregs_test, level = 0.95)

    preds[[j]] <- as.numeric(fc$mean)
    print(j)
  }

  # Rows = forecast weeks, columns = bootstrap replicates.
  pred_mat <- do.call(cbind, preds)
  sd_pred <- apply(pred_mat, 1, sd)
  mean_pred <- apply(pred_mat, 1, mean)

  # Observed minus mean predicted NO2 for each test week.
  mean_diff <- test$no2 - mean_pred

  df_diff <- data.frame(mean_diff = mean_diff, sd_pred = sd_pred)
  df_diff$date <- as.Date(test$date)

  ### plot with error bars
  # `color = "red"` sits inside aes(), so it is mapped as a constant category
  # and drawn with ggplot's default palette rather than literal red; it is
  # left as is so the figure's appearance does not change.
  p[[i]] <- ggplot(df_diff, aes(x = date, y = mean_diff, color = "red")) +
    geom_line(linetype = "solid", size = 1.5) +
    geom_hline(yintercept = 0) +
    geom_point(size = 3) +
    ## to show months as first letter only
    scale_x_date(
      "Date",
      breaks = seq(from = as.Date("2019-01-01"), to = as.Date("2019-04-30"), by = "month"),
      labels = c("J", "F", "M", "A")
    ) +
    geom_errorbar(
      aes(ymin = mean_diff - 1.96 * sd_pred, ymax = mean_diff + 1.96 * sd_pred),
      width = 1, color = "black", position = position_dodge(0.05), size = 1
    ) +
    ggtitle(state_name) +
    theme_classic() +
    theme(
      plot.title = element_text(hjust = 0.5),
      axis.text = element_text(size = 14),
      axis.title = element_text(size = 14),
      legend.position = "none"
    )

  dfs_tosave[[i]] <- df_diff

  cat("Finished state", i, "of", length(states), "\n")
}


end.time <- Sys.time()


################################################
## PLOTS
################################################


#ps <- paste('p[[',1:length(p),']]', sep='', collapse=',')


# Give every panel the same appearance: no axis titles (a shared y label is
# attached to the whole grid below) and y-axis breaks at -30, 0 and 30.
p <- lapply(p, function(panel) {
  panel +
    theme(axis.title = element_blank()) +
    scale_y_continuous(breaks = seq(-30, 30, 30))
})

# Lay out all panels in one grid. Passing the list through `plotlist` works
# for any number of states instead of a fixed count of panels.
plot <- plot_grid(plotlist = p)
y.grob <- textGrob(
  expression(paste("Difference between actual and predicted ", NO[2], " concentrations (ppb)")),
  gp = gpar(fontface = "bold", fontsize = 15), rot = 90
)

# Make sure the output directory exists before the PNG device is opened.
dir.create("figures", showWarnings = FALSE)

png("figures/NO2_figure_2019_01_04.png", height = 1000, width = 2000)
grid.arrange(arrangeGrob(plot, left = y.grob))
dev.off()

################################################
## TO MAKE BOXPLOTS
################################################

# Stack the per-state result tables into one long table. The state label of
# each table is read from the title of the matching plot in `p`.
stopifnot(length(dfs_tosave) > 0, length(dfs_tosave) == length(p))

# vapply() fails loudly if a plot has no single character title, instead of
# silently producing rows without a state label.
state_labels <- vapply(p, function(panel) panel$labels$title, character(1))

labelled <- Map(
  function(df, state_label) {
    df$state <- state_label
    df
  },
  dfs_tosave,
  state_labels
)

# Bind once rather than growing the data frame inside a loop; this also
# works when there is only a single state.
df_box <- do.call(rbind, unname(labelled))

df_box_NO2_2019_Jan_Apr <- df_box
df_box_NO2_2019_Jan_Apr$Year <- 2019

fwrite(df_box_NO2_2019_Jan_Apr, "df_box_NO2_2019_Jan_Apr.csv")

###EOF