# Rcode_regression_NEI_WLS.R

rm(list=ls())

# Infix complement of `%in%`: TRUE for each left-hand element that is absent
# from the right-hand set.
`%notin%` <- function(x, table) !(x %in% table)
library(tidyverse)
library(Hmisc)
library(usmap)
library(car)
library(data.table)

########
# read data frames
########

# PM2.5 change table: `diff` is the before-minus-after value for each row.
cat("load pm25\n")
df_change_pm25 <- read.csv("df_change_pm25.csv") %>%
  mutate(diff = before - after)

# NO2 change table, built the same way.
cat("load NO2\n")
df_change_no2 <- read.csv("df_change_no2.csv") %>%
  mutate(diff = before - after)

# Population-density table: translate full state names to postal
# abbreviations, then drop every row that contains an NA (this includes rows
# whose name is not one of the 50 entries in `state.name`).
cat("load pop density\n")
df_pop <- read.csv("pop_density_census2010.csv", encoding = "us-ascii")
df_pop[["state"]] <- state.abb[match(df_pop[["state"]], state.name)]
df_pop <- drop_na(df_pop)

# Regions table: strip surrounding whitespace from the state name before
# translating it to the postal abbreviation. Unmatched names become NA and
# are kept.
df_regions <- read.csv("df_regions.csv")
df_regions[["state"]] <- trimws(df_regions[["state"]], which = "both")
df_regions[["state"]] <- state.abb[match(df_regions[["state"]], state.name)]

nei <- read.csv("NEI_sector_report.txt")

## Keep only the PM2.5 and NOx rows, and discard the areas that are not
## among the 50 states.
neif <- nei %>%
  select(STATE, MAJOR_SOURCE_TYPE, EMISSION_TONS, POLLUTANT) %>%
  filter(
    POLLUTANT %in% c("PM2.5", "Nitrogen Oxides"),
    !(STATE %in% c("Puerto Rico", "Virgin Islands", "Tribal Land",
                   "District Of Columbia"))
  )


## Total emission (tons) per state and major source type, one column per
## source type; combinations that do not occur are set to 0.
## NOTE(review): POLLUTANT is not a grouping key, so PM2.5 and NOx tons are
## added together here - confirm that is intended.
nei_grp_source <- neif %>%
  group_by(STATE, MAJOR_SOURCE_TYPE) %>%
  summarise(tot_emission = sum(EMISSION_TONS)) %>%
  arrange(STATE, tot_emission) %>%
  spread(key = MAJOR_SOURCE_TYPE, value = tot_emission) %>%
  replace(is.na(.), 0)

## Postal abbreviation used as the join key further down.
nei_grp_source[["state"]] <-
  state.abb[match(nei_grp_source[["STATE"]], state.name)]


## Share (in percent) of each major source type in a state's total emission,
## one column per source type; combinations that do not occur are set to 0.
## summarise() peels off the last grouping level, so the data are still
## grouped by STATE when mutate() runs and sum(tot_emission) is the state total.
nei_grp_source_perc <- neif %>%
  group_by(STATE, MAJOR_SOURCE_TYPE) %>%
  summarise(tot_emission = sum(EMISSION_TONS)) %>%
  arrange(STATE, tot_emission) %>%
  mutate(emission_perc = 100 * tot_emission / sum(tot_emission)) %>%
  # filter(MAJOR_SOURCE_TYPE %in% c("Mobile Sources", 'Stationary Sources')) %>%
  select(-tot_emission) %>%
  spread(key = MAJOR_SOURCE_TYPE, value = emission_perc) %>%
  replace(is.na(.), 0)

## Postal abbreviation used as the join key further down.
nei_grp_source_perc[["state"]] <-
  state.abb[match(nei_grp_source_perc[["STATE"]], state.name)]



## merge with change in pollutants

# The percentage table can still be grouped by STATE, and select() always
# re-adds grouping columns, so STATE would silently survive the final
# select(). Ungroup first so the column is really dropped.
df_all <- nei_grp_source_perc %>%
  ungroup() %>%
  left_join(df_change_no2, by = "state") %>%
  rename(no2_change = diff) %>%
  select(-c("before", "after")) %>%
  left_join(df_change_pm25, by = "state") %>%
  rename(pm25_change = diff) %>%
  # NOTE(review): dropping a bare `X` assumes only one of the two change
  # tables carries an `X` column (otherwise the join would suffix them) -
  # confirm against the CSV files.
  select(-c("before", "after", "STATE", "X"))

# Make the source-type column names syntactic ("Mobile Sources" ->
# "Mobile.Sources"). gsub() replaces every space, not just the first one.
names(df_all) <- gsub(" ", ".", names(df_all), fixed = TRUE)


###############################
## ADD CONFOUNDERS
## Attach the population-density table and the regions table, both keyed
## by the state abbreviation. States missing from either table get NA.
###############################

df_all <- left_join(df_all, df_pop, by = "state")
df_all <- left_join(df_all, df_regions, by = "state")


################################
# SCALED PREDICTORS
# Centre and scale (scale() defaults) the three emission-share columns;
# every other column is carried over unchanged.
################################

cols_to_scale <- c("Fire.Sources", "Mobile.Sources", "Stationary.Sources")

df_all_scaled <- df_all
df_all_scaled[, cols_to_scale] <- scale(df_all_scaled[, cols_to_scale])


#############
## WLS
#############
### interaction model (NO2)
#
# Two-stage weighted least squares:
#   1. OLS fit of the model with all main effects and two-way interactions.
#   2. Regress |OLS residuals| on the same terms; the fitted values are used
#      as per-observation standard-deviation estimates.
#   3. Refit the model with weights = 1 / SD^2.

# Keep only rows where the response and every model variable are observed.
# lm() silently drops incomplete rows, which would make the residual and
# weight vectors shorter than the data frame they are attached to below.
df_no2 <- df_all_scaled %>%
  drop_na(no2_change, Fire.Sources, Mobile.Sources, Stationary.Sources,
          pop_densitypermile2, region)

model_no2_I <- lm(no2_change ~ (Fire.Sources + Mobile.Sources +
                                  Stationary.Sources +
                                  pop_densitypermile2 + as.factor(region))^2,
                  data = df_no2)

df_no2$resids.ols.no2.I <- residuals(model_no2_I)

fit.SDfunc2 <- lm(abs(resids.ols.no2.I) ~ (Fire.Sources + Mobile.Sources +
                                             Stationary.Sources +
                                             pop_densitypermile2 + as.factor(region))^2,
                  data = df_no2)

fitted.SDs2 <- fitted(fit.SDfunc2) # Use fitted standard deviation estimates

# A linear model for |residuals| is not constrained to be positive. Squaring
# hides a negative "SD" and a value near zero yields an enormous weight.
if (any(fitted.SDs2 <= 0)) {
  warning("NO2 variance model produced non-positive fitted SDs; ",
          "the corresponding WLS weights are not meaningful.",
          call. = FALSE)
}
weights.d.2 <- 1 / fitted.SDs2^2 # Weights are inverse variances

fit.WLS.I.no2 <- lm(no2_change ~ (Fire.Sources + Mobile.Sources +
                                    Stationary.Sources +
                                    pop_densitypermile2 + as.factor(region))^2,
                    data = df_no2, weights = weights.d.2)


summary(fit.WLS.I.no2)

## pm2.5
### Interaction Model
#
# Two-stage weighted least squares:
#   1. OLS fit of the model with all main effects and two-way interactions.
#   2. Regress |OLS residuals| on the same terms; the fitted values are used
#      as per-observation standard-deviation estimates.
#   3. Refit the model with weights = 1 / SD^2.

# Keep only rows where the response and every model variable are observed.
# lm() silently drops incomplete rows, which would make the residual and
# weight vectors shorter than the data frame they are attached to below.
df_pm25 <- df_all_scaled %>%
  drop_na(pm25_change, Fire.Sources, Mobile.Sources, Stationary.Sources,
          pop_densitypermile2, region)

model_pm25_I <- lm(pm25_change ~ (Fire.Sources + Mobile.Sources +
                                    Stationary.Sources +
                                    pop_densitypermile2 + as.factor(region))^2,
                   data = df_pm25)


df_pm25$resids.ols.pm25.I <- residuals(model_pm25_I)

fit.SDfunc2pm25.2 <- lm(abs(resids.ols.pm25.I) ~ (Fire.Sources + Mobile.Sources +
                                                    Stationary.Sources +
                                                    pop_densitypermile2 + as.factor(region))^2,
                        data = df_pm25)

fitted.SDs2pm25.2 <- fitted(fit.SDfunc2pm25.2) # Use fitted standard deviation estimates

# A linear model for |residuals| is not constrained to be positive. Squaring
# hides a negative "SD" and a value near zero yields an enormous weight.
if (any(fitted.SDs2pm25.2 <= 0)) {
  warning("PM2.5 variance model produced non-positive fitted SDs; ",
          "the corresponding WLS weights are not meaningful.",
          call. = FALSE)
}
weights.d.pm25.2 <- 1 / fitted.SDs2pm25.2^2 # Weights are inverse variances

fit.WLS.I.pm25 <- lm(pm25_change ~ (Fire.Sources + Mobile.Sources +
                                      Stationary.Sources +
                                      pop_densitypermile2 + as.factor(region))^2,
                     data = df_pm25, weights = weights.d.pm25.2)


summary(fit.WLS.I.pm25)