-
Notifications
You must be signed in to change notification settings - Fork 0
/
IDP.R
304 lines (248 loc) · 15.1 KB
/
IDP.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
#IDP
library(readxl)
library(dplyr)
library(ggplot2)
library(lubridate)
library(janitor)
library(anchors) #replace.value()
library(cowplot)
library(stats)
library(MASS)
# shelter_type <- read_excel("C:/Users/dms228/OneDrive - University of Exeter/PhD/data/Yemen/dtm_displacement_2019.xlsx",
# sheet = "Shelter Type")
yemen_popDens <- readRDS("C:/Users/dms228/github/cholera-in-yemen/saved_data/yemen_popDensData.RDS")
WHO_weekly <- readRDS("C:/Users/dms228/github/cholera-in-yemen/saved_data/WHO_weekly_dist.RDS")
popDens_byGov <- readRDS("C:/Users/dms228/github/cholera-in-yemen/saved_data/yemen_popDens_govAgg.RDS")
WHO_weekly_per <- readRDS("C:/Users/dms228/github/cholera-in-yemen/saved_data/WHO_weekly_per.RDS")
WHO_monthly_per <- readRDS("C:/Users/dms228/github/cholera-in-yemen/saved_data/WHO_monthly_per.RDS")
#WHO_monthly <- readRDS("C:/Users/dms228/OneDrive - University of Exeter/R Scripts/saved_data/WHO_monthly.RDS")
#WHO_monthly_gov <- readRDS("C:/Users/dms228/OneDrive - University of Exeter/R Scripts/saved_data/WHO_monthly_gov.RDS")
WHO_weeklyGov_per <- readRDS("C:/Users/dms228/github/cholera-in-yemen/saved_data/WHO_weeklyGov_per.RDS")
WHO_weeklyGov <- readRDS("C:/Users/dms228/github/cholera-in-yemen/saved_data/WHO_weekly_gov.RDS")
WHO19_monthly_gov <- WHO_monthly %>% filter(Year == 2019)
WHO19_monthly_distPer <- WHO_monthly_per %>% filter(year == 2019)
WHO19_monthlyGov_per <- readRDS("C:/Users/dms228/github/cholera-in-yemen/saved_data/WHO_monthlyGov_per.RDS") %>% filter(year == 2019)
#popDens_byGov <- readRDS("C:/Users/dms228/OneDrive - University of Exeter/R Scripts/saved_data/yemen_popDens_gov.RDS")
IDPs <- read_excel("C:/Users/dms228/OneDrive - University of Exeter/PhD/data/Yemen/dtm_displacement_2019.xlsx",
sheet = "RDT-IDPs", skip = 1)[-1,] %>% dplyr::select(-c(2,3,5,6,7,8,11,13,16,33))
IDPs <- IDPs[IDPs$`Total # of HH` > 0,]
#C:/Users/dms228/github/cholera-in-yemen/saved_data/
WHO_weekly$Date <- WHO_weekly$Date %>% as.numeric() %>% as.POSIXct.Date()
WHO_weekly$Year <- WHO_weekly$Date %>% year()
##WHO total cases in 2019
WHO19_distTot <- WHO_weekly %>% filter(year(Date) == 2019) %>% group_by(Governorate, District) %>% dplyr::summarise(across(S.Cases:Under5, sum))
WHO19_govTot <- WHO_weeklyGov %>% filter(year(Date) == 2019) %>% group_by(Governorate) %>% dplyr::summarise(across(S.Cases:Under5, sum, na.rm = TRUE))
#####IDP Cleaning, cleaning and adding some helpful dates
#Change spellings to fit new data
IDPs[c(2,4,5)] <- gsub("[^[:alnum:]]", "", as.matrix(IDPs[c(2,4,5)]))
IDPs <- replace.value(IDPs, names = c("Governorate", "Governorate of Origin"), from = "Hadramaut", to = "Hadramawt")
IDPs <- replace.value(IDPs, names = c("District of Origin"), from = "AlMawasit", to = "AlMawasid")
IDPs <- replace.value(IDPs, names = c("District of Origin"), from = "MerkhahAsSufla", to = "MarkhahAssufla")
IDPs <- replace.value(IDPs, names = c("District of Origin"), from = "AlMakhad", to = "AlMakhadir")
IDPs <- replace.value(IDPs, names = c("District of Origin"), from = "AsSaddahgf", to = "AsSaddah")
IDPs <- replace.value(IDPs, names = c("District of Origin"), from = "Bidbadah", to = "BidBidah")
IDPs <- replace.value(IDPs, names = c("District of Origin"), from = "Medghal", to = "Madkhal")
IDPs$`Displacement Date` <- IDPs$`Displacement Date` %>% as.numeric() %>% as.Date(origin = '1900-01-01') %>% as.POSIXct()
IDPs$`Total # of HH` <- IDPs$`Total # of HH` %>% as.numeric()
IDPs$month <- month(IDPs$`Displacement Date`)
HH_leaving_dist <- aggregate(IDPs$`Total # of HH`, by = list(Gov = IDPs$`Governorate of Origin`, Dis = IDPs$`District of Origin`), FUN = sum)
HH_leaving_gov <- aggregate(IDPs$`Total # of HH`, by = list(Gov = IDPs$`Governorate of Origin`), FUN = sum)
HH_arriv <- aggregate(IDPs$`Total # of HH`, by = list(Gov = IDPs$`Governorate`), FUN = sum)
#Total number of households leaving each district in 2019
tot_idp_dist <- merge(HH_leaving_dist, WHO19_distTot, by.x = c("Gov","Dis"), by.y = c("Governorate","District"),
all.y = TRUE) %>% dplyr::rename(HH_leaving = x)
tot_idp_dist$HH_leaving[is.na(tot_idp_dist$HH_leaving)] <- 0 #Missing HH_leaving data is assumed to be zero
tot_idp_dist <- na.omit(tot_idp_dist) #District (mostly 'blank') with no cholera case data are removed.
#Total number of households leaving or arriving at each Governorate in 2019
tot_idp_gov <- merge(HH_leaving_gov, HH_arriv, by.x = c("Gov"), by.y = "Gov") %>% merge(WHO19_govTot, by.x = "Gov", by.y = "Governorate") %>%
dplyr::rename(HH_leaving = x.x, HH_arrive = x.y) %>% merge(popDens_byGov, by.x = "Gov", by.y = "Governorate") %>%
dplyr::select(-c(`2017`,`2018`,`2020`)) %>% dplyr::rename(PopDensity = `2019`)
#saveRDS(tot_idp_gov,"C:/Users/dms228/github/cholera-in-yemen/saved_data/tot_idp_gov.RDS")
#saveRDS(tot_idp_dist, "C:/Users/dms228/github/cholera-in-yemen/saved_data/tot_idp_dist.RDS")
#Linear regression (log) plot of comparing the No.Households fleeing a district with cholera suspected cases in the same district
png(file = "C:/Users/dms228/github/cholera-in-yemen/plots/idp/lgHH_fleeingVsCases_dist.png", width = 600, height = 400)
ggplot(data = tot_idp_dist, aes(x = log(HH_leaving), y = S.Cases)) +
geom_point() +
labs(x = "Number of Households Fleeing District (log)", y = "Number of Suspected Cases") +
theme_bw()
dev.off()
png(file = "C:/Users/dms228/github/cholera-in-yemen/plots/idp/HH_fleeingVsCases_dist.png", width = 600, height = 400)
ggplot(data = tot_idp_dist, aes(x = HH_leaving, y = S.Cases)) +
geom_point() +
labs(x = "Number of Households Fleeing District", y = "Number of Suspected Cases") +
theme_bw()
dev.off()
png(file = "C:/Users/dms228/github/cholera-in-yemen/plots/idp/HH_fleeingVsDeaths_gov.png", width = 600, height = 400)
ggplot(data = tot_idp_gov, aes(x = log(HH_leaving), y = log(Deaths))) +
geom_point() +
labs(x = "Number of Households Fleeing Governorate (log)", y = "Number of Deaths in Governorate (log)") +
theme_bw()
dev.off()
png(file = "C:/Users/dms228/github/cholera-in-yemen/plots/idp/HH_fleeingVsCases_gov.png", width = 600, height = 400)
ggplot(data = tot_idp_gov, aes(x = log(HH_leaving), y = log(S.Cases))) +
geom_point() +
labs(x = "Number of Households Fleeing Governorate (log)", y = "Number of Suspected Cases in Governorate (log)") +
theme_bw()
dev.off()
#Just a side not
ggplot(data = tot_idp_gov, aes(x = PopDensity, y = S.Cases)) +
geom_point() +
labs(x = "Population Density", y = "Total Number of Suspected Cases in 2019") +
theme_bw()
##Negative Binomial regression
leaving_deaths_nb <- glm.nb(Deaths ~ HH_leaving, data = tot_idp_gov)
summary(leaving_deaths_nb)
leaving_deathsPopDens_nb <- glm.nb(Deaths ~ HH_leaving + PopDensity, data = tot_idp_gov)
summary(leaving_deathsPopDens_nb)
leaving_cases_nb <- glm.nb(S.Cases ~ HH_leaving, data = tot_idp_gov)
summary(leaving_cases_nb)
leaving_casesPopDens_nb <- glm.nb(S.Cases ~ HH_leaving + PopDensity, data = tot_idp_gov)
summary(leaving_casesPopDens_nb)
m2 <- glm.nb(Deaths ~ HH_leaving + PopDensity, data = tot_idp_gov)
summary(m2)
colnames(tot_idp) <- c("gov", "HH_leaving", "HH_arriving", "totCases_19")
tot_idp$totCases_19 <- as.numeric(tot_idp$totCases_19)
#Plot of people fleeing a Governorate vs. Total 2019 cases (WHO)
ggplot(data = tot_idp_gov, aes(x = log(HH_leaving), y = S.Cases)) +
geom_point()
ggplot(data = tot_idp_gov, aes(x = log(HH_leaving), y = log(Deaths))) +
geom_point()
ggplot(data = tot_idp, aes(x = HH_arriving, y = totCases_19)) +
geom_point()
#Estimated Imported Infections for each governorate
tot_idp_gov <- readRDS("C:/Users/dms228/github/cholera-in-yemen/saved_data/tot_idp_gov.RDS")
#Initialise df
EstImpInf_byGov <- data.frame()
for(i in 1:21){
gov <- tot_idp_gov$Gov[i]
gov_idp <- IDPs %>% filter(Governorate == gov) #Filter people going to `gov`
gov_idp <- gov_idp[which(gov_idp$`Total # of HH` > 0),] #Remove negative HH numbers (assume data error)
idp_agg1 <- aggregate(gov_idp$`Total # of HH`, by = list(gov_idp$`Governorate of Origin`, gov_idp$month),
FUN = sum) #Ignore districts. Aggregate (sum) items with matching origin Governorate and month
colnames(idp_agg1) <- c("gov_orig", "month", "HH")
idp_agg1$origCholProb <- rep(0, nrow(idp_agg1))
for(j in 1:nrow(idp_agg1)){ #The probability that any given person coming from the district is likely to have cholera
gov_month <- WHO19_monthlyGov_per %>%
filter(month == idp_agg1$month[j]) %>%
filter(Governorate == idp_agg1$gov_orig[j])
tryCatch(idp_agg1$origCholProb[j] <- (gov_month[4] %>% pull()) / 100000,
error = function(e) {print(paste("Error! Gov_ent:", gov,
"Month:",idp_agg1$month[j],
"Gov_orig:",idp_agg1$gov_orig[j],
"ij = ",i,j))})
}
idp_agg1$estInf <- idp_agg1$HH * idp_agg1$origCholProb * 6.7 #Average persons per HH = 6.7
idp_agg2 <- aggregate(idp_agg1$estInf, by = list(idp_agg1$month), FUN = sum)
colnames(idp_agg2) <- c("month", "estInf")
idp_agg2$gov <- rep(gov, nrow(idp_agg2))
EstImpInf_byGov <- rbind(EstImpInf_byGov, idp_agg2)
}
#Estimated Imported Infections for each District
#Initialise df
EstImpInf_byDist <- data.frame()
i = 4
j = 1
for(i in 1:20){
gov <- tot_idp_gov$Gov[i] # The receiving governorate
gov_idp <- IDPs %>% filter(Governorate == gov)
gov_idp <- gov_idp[gov_idp$`Total # of HH`>0,] #Remove all negative numbers. Assume error in dataset.
idp_distMonthAgg <- aggregate(gov_idp$`Total # of HH`, by = list(gov_idp$`Governorate of Origin`,
gov_idp$`District of Origin`, gov_idp$month),
FUN = sum) # Aggregate (sum) items by origin Governorate, origin district and month for HH entering Governorate [i]
colnames(idp_distMonthAgg) <- c("gov_orig","dist_orig", "month", "HH")
idp_distMonthAgg <- cbind(gov_rec = gov, idp_distMonthAgg)
idp_distMonthAgg$origCholProb <- rep(0, nrow(idp_distMonthAgg))
for(j in 1:nrow(idp_distMonthAgg)){ #The probability that any given person coming from the country is likely to have cholera
orig_dist_month <- WHO19_monthly_distPer %>%
filter(month == idp_distMonthAgg$month[j]) %>%
filter(Governorate == idp_distMonthAgg$gov_orig[j]) %>%
filter(District == idp_distMonthAgg$dist_orig[j])
tryCatch(idp_distMonthAgg$origCholProb[j] <- (orig_dist_month[5] %>% pull()) / 100000,
error = function(e) {print(paste("Error! Gov_ent:", gov,
"Month:",idp_distMonthAgg$month[j],
" Gov_orig:",idp_distMonthAgg$gov_orig[j],
" Dist_orig:",idp_distMonthAgg$dist_orig[j],
"ij = ",i,j))})
}
idp_distMonthAgg$estInf <- idp_distMonthAgg$HH * idp_distMonthAgg$origCholProb * 6.7 #Average persons per HH = 6.7
idp_distMonthAgg <- idp_distMonthAgg[!is.na(idp_distMonthAgg$origCholProb),] #Remove NaNs. Not quite sure why they're there. I think difference in names
idp_MonthAgg <- aggregate(idp_distMonthAgg$estInf, by = list(idp_distMonthAgg$month), FUN = sum)
colnames(idp_MonthAgg) <- c("month", "estInf")
idp_MonthAgg$gov <- rep(gov, nrow(idp_MonthAgg))
EstImpInf_byDist <- rbind(EstImpInf_byDist, idp_MonthAgg)
}
#Combine EstImpInf_byGov and EstImpInf_byDist to compare
EstImpInf_both <- merge(EstImpInf_byGov, EstImpInf_byDist, by.x = c("month","gov"), by.y = c("month","gov"))
colnames(EstImpInf_both) <- c("Month","Governorate","byGov","byDist")
ggplot(data = EstImpInf_both) +
geom_line(aes(x = Month, y = byGov, col = Governorate))
ggplot(data = EstImpInf_both) +
geom_line(aes(x = Month, y = byDist, col = Governorate))
#example AlHudaydah
AlHud_estImp <- EstImpInf_both %>% filter(Governorate == "AlHudaydah")
ggplot(data = AlHud_estImp, aes(x = Month)) +
geom_line(aes(y = byGov, col = "byGov")) +
geom_line(aes(y = byDist, col = "byDist"))
#example Ibb
Ibb_estImp <- EstImpInf_both %>% filter(Governorate == "Ibb")
ggplot(data = Ibb_estImp, aes(x = Month)) +
geom_line(aes(y = byGov, col = "byGov")) +
geom_line(aes(y = byDist, col = "byDist"))
#example Ibb
Ibb_estImp <- EstImpInf_both %>% filter(Governorate == "Ibb")
ggplot(data = Ibb_estImp, aes(x = Month)) +
geom_line(aes(y = byGov, col = "byGov")) +
geom_line(aes(y = byDist, col = "byDist"))
#example Hajjah
Hajjah_estImp <- EstImpInf_both %>% filter(Governorate == "Hajjah")
ggplot(data = Hajjah_estImp, aes(x = Month)) +
geom_line(aes(y = byGov, col = "byGov")) +
geom_line(aes(y = byDist, col = "byDist"))
##Merge cholera data and EstImpInf_byGov
EstImpInf_annual <- EstImpInf_byGov %>% group_by(gov) %>% summarise(ImpCases = sum(estInf))
EstImpInfxCases <- merge(WHO19_govTot, EstImpInf_annual, by.x = "Governorate", by.y = "gov")
EstImpInfxCases$S.Cases <- EstImpInfxCases$S.Cases %>% as.integer
EstImpInfxCases$ImpCases <- EstImpInfxCases$ImpCases %>% as.integer
png(file = "C:/Users/dms228/github/cholera-in-yemen/plots/idp/EstImpVsCases_gov.png", width = 600, height = 400)
ggplot(data = EstImpInfxCases, aes(x = ImpCases, y = S.Cases)) +
geom_point() +
labs(x = "Estimated Imported Cases", y = "Suspected Cases") +
theme_bw()
dev.off()
negbin <- glm.nb(S.Cases ~ ImpCases, data = EstImpInfxCases)
lm <- lm(S.Cases ~ ImpCases, data = EstImpInfxCases)
summary(negbin)
summary(lm)
##Merge cholera data and EstImpInf_byDist
EstImpInf_annual.dist <- EstImpInf_byDist %>% group_by(gov) %>% summarise(ImpCases = sum(estInf))
EstImpInfxCases.dist <- merge(WHO19_govTot, EstImpInf_annual.dist, by.x = "Governorate", by.y = "gov")
EstImpInfxCases.dist$S.Cases <- EstImpInfxCases.dist$S.Cases %>% as.integer
EstImpInfxCases.dist$ImpCases <- EstImpInfxCases.dist$ImpCases %>% as.integer
ggplot(data = EstImpInfxCases.dist, aes(x = ImpCases, y = S.Cases)) +
geom_point() +
labs(x = "Estimated Imported Cases", y = "Suspected Cases") +
theme_bw()
dev.off()
negbin.dist <- glm.nb(S.Cases ~ ImpCases, data = EstImpInfxCases.dist)
lm.dist <- lm(S.Cases ~ ImpCases, data = EstImpInfxCases)
summary(negbin.dist)
summary(lm.dist)
###To conclude, aggregating by district is less helpful than aggregating just by Gov. So we'll now ignore aggregating by district
##Just to check it was useful, we'll do suspected cases vs. incoming idps.
png(file = "C:/Users/dms228/github/cholera-in-yemen/plots/idp/HHAriveVsCases_gov.png", width = 600, height = 400)
ggplot(data = tot_idp_gov, aes(x = HH_arrive, y = S.Cases)) +
geom_point() +
labs(x = "Households arriving in Governorate", y = "Suspected Cases") +
theme_bw()
dev.off()
png(file = "C:/Users/dms228/github/cholera-in-yemen/plots/idp/HHAriveVsDeaths_gov.png", width = 600, height = 400)
ggplot(data = tot_idp_gov, aes(x = HH_arrive, y = Deaths)) +
geom_point() +
labs(x = "Households arriving in Governorate", y = "Cholera deaths") +
theme_bw()
dev.off()
lm.HH <- lm(S.Cases ~ HH_arrive, data = tot_idp_gov)
lm.HH.deaths <- lm(Deaths ~ HH_arrive, data = tot_idp_gov)
summary(lm.HH)
summary(lm.HH.deaths)
#I'm not actually adding any new code here. Just a comment to test GitHub