-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocessing_hd.R
137 lines (111 loc) · 5.4 KB
/
processing_hd.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# Load libraries
library(tidyverse)
library(arrow)
# Set seed for reproducibility
set.seed(2024)
# Read cycleIntervention health output as an example
### Ali to bring in the fix dataset
# hd <- read_csv("C:/Users/ajnab/RMIT University/JIBE working group - General/manchester/simulationResults/ForUrbanTransition/cycleIntervention/health/04_death_and_disease/pp_healthDiseaseTracker_2029.csv")
### Ali to add cycling intervention health data.
## Belen folder
directory <- "C:/Users/mbzd2/OneDrive - RMIT University/JIBE/JIBE-WP6/healthmicrosim/"
health_base <- read_csv(paste0(directory, "manchester/simulationResults/ForUrbanTransition/reference/health/04_death_and_disease/pp_healthDiseaseTracker_2039.csv")) # %>%
# left_join(person_base)
# health_intervention <- read_csv(paste0(directory,"manchester/simulationResults/ForUrbanTransition/cycleIntervention/health/04_death_and_disease/pp_healthDiseaseTracker_2029.csv"))
# Same for quicker run
# shd <- hd |> slice_sample(n = 100)
# ####### Referecen data #######
#
# # Loop through each row
# for (i in 1:nrow(health_base)){
# # Create a long df for each row
# ur <- pivot_longer(health_base[i,], cols = -id)
# # Get a table for each disease state with the first year when it was appeared
# fv <- ur |> group_by(value) |> summarise(fy = first(name)) |> arrange(fy)
# # Loop through the first value dataset with first year for the disease appearance
# for (j in 1:nrow(fv)){
# # Ignore healty and null states
# if (!fv$value[j] %in% c("healthy", "null"))
# {
# # Do not carry forward the healthy state to any other state
# if (fv$value[j - 1] != "healthy")
# # Join previous state with the current one with a | as a separator
# fv$value[j] <- paste(fv$value[j], fv$value[j - 1], sep = "|")
# }
# }
# # Update the long format for each ID with the new states
# for (k in 1:nrow(fv)){
# if (!fv$value[k] %in% c("healthy", "null"))
# ur[(ur$name >= as.numeric(fv$fy[k]) & ur$name < (as.numeric(fv$fy[k + 1]))),]$value <- fv$value[k]
# }
#
# # Update the states
# health_base <- rows_update(health_base, pivot_wider(ur, id_cols=id))
# }
#
#
#
# ####### Scenario data #######
#
# # Loop through each row
# for (i in 1:nrow(health_intervention)){
# # Create a long df for each row
# ur <- pivot_longer(health_intervention[i,], cols = -id)
# # Get a table for each disease state with the first year when it was appeared
# fv <- ur |> group_by(value) |> summarise(fy = first(name)) |> arrange(fy)
# # Loop through the first value dataset with first year for the disease appearance
# for (j in 1:nrow(fv)){
# # Ignore healty and null states
# if (!fv$value[j] %in% c("healthy", "null"))
# {
# # Do not carry forward the healthy state to any other state
# if (fv$value[j - 1] != "healthy")
# # Join previous state with the current one with a | as a separator
# fv$value[j] <- paste(fv$value[j], fv$value[j - 1], sep = "|")
# }
# }
# # Update the long format for each ID with the new states
# for (k in 1:nrow(fv)){
# if (!fv$value[k] %in% c("healthy", "null"))
# ur[(ur$name >= as.numeric(fv$fy[k]) & ur$name < (as.numeric(fv$fy[k + 1]))),]$value <- fv$value[k]
# }
#
# # Update the states
# health_intervention <- rows_update(health_intervention, pivot_wider(ur, id_cols=id))
# }
### Optimised version
library(data.table)
library(dplyr)
# Convert health_base to data.table for fast processing
health_base_dt <- as.data.table(health_base)
# Melt (pivot_longer) the entire dataset once to create the long format
ur <- melt(health_base_dt, id.vars = "id", variable.name = "year", value.name = "state")
# For each id, determine the first year each disease state appears, excluding "healthy" and "null"
fv <- ur[!(state %in% c("healthy", "null")), .(first_year = min(as.numeric(year))), by = .(id, state)]
setorder(fv, id, first_year)
# Concatenate previous states cumulatively within each id
fv[, cumulative_state := Reduce(function(x, y) paste(y, x, sep = "|"), state, accumulate = TRUE), by = id]
# Merge cumulative states back to the main dataset
ur <- merge(ur, fv[, .(id, state, cumulative_state, first_year)], by = c("id", "state"), all.x = TRUE)
# Create a final state column preserving "healthy" where applicable and setting "dead" to "null"
ur[, final_state := ifelse(
state == "healthy" | is.na(cumulative_state),
state, # Keep "healthy" or original if cumulative_state is NA
cumulative_state # Otherwise use cumulative state
), by = id]
# Change any state that includes "dead" to "null"
ur[, final_state := ifelse(grepl("dead", final_state), "null", final_state)]
# Use dplyr::fill to carry forward cumulative states within each id
ur <- ur %>%
group_by(id) %>%
arrange(id, year) %>%
mutate(final_state = ifelse(final_state == "null", NA, final_state)) %>% # Treat "null" as NA
fill(final_state, .direction = "down") %>%
replace_na(list(final_state = "healthy")) %>% # Replace any remaining NA with "healthy"
ungroup()
# Reshape back to wide format, retaining the modified states
health_base_updated <- dcast(setDT(ur), id ~ year, value.var = "final_state")
# Ensure the original "healthy" and "null" states are preserved where they belong
health_base_updated[is.na(health_base_updated)] <- health_base[is.na(health_base_updated)]
# Assign the result back to the original health_base variable if desired
health_base_plot <- as.data.frame(health_base_updated)