-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCodeS2_step1_specify_covariate.r
145 lines (121 loc) · 5.31 KB
/
CodeS2_step1_specify_covariate.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# Start #####################################################################
# Title: Estimating Costs Associated with Disease Model States Using Generalized Linear Models: A Tutorial
# Step 1: Preparation of dataset
# > Specify covariate
# Author: Junwen Zhou
# Date: 1 August 2023
# Setup ----
# NOTE: the workspace is deliberately NOT cleared with `rm(list = ls())`.
# That call deletes the caller's objects whenever this script is sourced, yet
# it neither detaches packages nor resets options, so it does not provide a
# clean session. Restart R (or run the script with `Rscript`) for a fresh start.

# Set path
# setwd("C:\\XXX\\XXX") # Optional: if not set, the current working directory is used
path_output <- getwd()

# Source package
library(tidyverse)
# Bind `select` to the dplyr version explicitly
# (presumably to guard against masking by another attached package)
select <- dplyr::select

# Import data ----
# Read "step0_ana.rds" from `path_output`
dat <- readRDS(file.path(path_output, "step0_ana.rds"))
# Specification ----
# Build `tp1` from `dat`: re-centre / re-scale the continuous covariates and
# convert the discrete covariates to factors with an explicit level order
# (the first level listed is the one R uses as reference by default).
tp1 <- dat %>%
  # Initial specification of covariates
  mutate(
    # Standardize continuous covariates
    cur_age = (cur_age - 60) / 10,
    ldl = (ldl - 3.6) / 1,
    hdl = log(hdl),
    creatinine = (log(creatinine) - 4.4) / 0.2,
    sbp = (sbp - 140) / 20,
    dbp = (dbp - 80) / 10,
    # Set reference level for discrete covariates.
    # `levels` is spelled out in full: the previous `level =` only worked
    # through partial argument matching. Values that are not listed in
    # `levels` become NA.
    male = factor(male, levels = c("0", "1")),
    race = factor(race, levels = c("white", "black", "asian", "other")),
    townsend = factor(townsend, levels = str_c("q", c(3, 1, 2, 4, 5))),
    smoke = factor(smoke, levels = c("none", "former", "current")),
    pa = factor(pa, levels = c("moderate", "low", "high")),
    unhealthy_diet = factor(unhealthy_diet, levels = c("0", "1")),
    bmi = factor(bmi, levels = c("normal", "underweight", "overweight",
                                 "obesity1", "obesity2", "obesity3")),
    atht = factor(atht, levels = c("0", "1")),
    db = factor(db, levels = c("0", "1")),
    cancer = factor(cancer, levels = c("0", "1")),
    mental = factor(mental, levels = c("0", "1")),
    mi = factor(mi, levels = as.character(0:10)),
    stroke = factor(stroke, levels = as.character(0:10)),
    vd = factor(vd, levels = c("0", "1")),
    nvd = factor(nvd, levels = c("0", "1"))
  ) %>%
  # Further specify some covariates
  mutate(
    # Combine categories for temporal history covariates:
    # levels "4" to "10" are merged into the single level "4"
    mi = fct_collapse(mi, "4" = as.character(4:10)),
    stroke = fct_collapse(stroke, "4" = as.character(4:10))
  )
# Save the specified dataset next to the input file
saveRDS(tp1, file = file.path(path_output, "step1_ana.rds"))
# Summary baseline characteristics ----
# Describe one covariate, depending on whether it is continuous.
#
# Arguments:
#   cov    - vector holding the values of the covariate.
#   is_con - TRUE to describe `cov` as continuous, FALSE as discrete.
#   digits - number of decimal places used when rounding (default 1).
#   na.rm  - passed to mean() and sd() in the continuous branch (default
#            FALSE). The discrete branch uses table(), whose default already
#            leaves NA out of the counts and percentages.
#
# Returns a tibble with the columns `term` and `value`:
#   continuous - one row, term "Z", value "mean (sd)";
#   discrete   - one row per category, term = category, value "n (percent)".
f_des_cov <- function(cov, is_con = TRUE, digits = 1, na.rm = FALSE) {
  if (is_con) {
    # Describe mean and sd for continuous covariate
    cov_mean <- mean(cov, na.rm = na.rm)
    cov_sd <- sd(cov, na.rm = na.rm)
    output <- tibble(
      term = "Z", # There is no category for a continuous covariate, so use Z
      value = str_c(round(cov_mean, digits), " (", round(cov_sd, digits), ")")
    )
  } else {
    # Describe total number of each category and their proportion (in percent)
    counts <- table(cov)
    percents <- prop.table(counts) * 100
    output <- tibble(
      term = names(counts),
      value = str_c(counts, " (", round(percents, digits), ")")
    )
  }
  output
}
# Covariates reported in the baseline table
cov_dbl <- c("cur_age", "ldl", "hdl", "creatinine", "sbp", "dbp") # Continuous covariate
cov_cat <- c(
  "male", "race", "townsend", "smoke", "pa", "unhealthy_diet",
  "bmi", "atht", "db", "cancer", "mental"
) # Discrete covariate

# Describe every covariate of `data` among the rows with year == 1.
# Continuous covariates come first, followed by the discrete ones; the
# covariate name is stored in the column `cov`.
f_des_year1 <- function(data) {
  year1 <- filter(data, year == 1)
  des_dbl <- map_df(
    select(year1, all_of(cov_dbl)),
    function(x) f_des_cov(x, is_con = TRUE),
    .id = "cov"
  )
  des_cat <- map_df(
    select(year1, all_of(cov_cat)),
    function(x) f_des_cov(x, is_con = FALSE),
    .id = "cov"
  )
  bind_rows(des_dbl, des_cat)
}

# Original dataset
tmp1 <- f_des_year1(dat)
# Dataset after specification
tmp2 <- f_des_year1(tp1)
# Covariate and term
# > Continuous covariate: a single term "Z"; `term2` spells out the
#   transformation, listed in the same order as `cov_dbl`
term_dbl <- tibble(cov = cov_dbl, term = "Z") %>%
  mutate(
    term2 = c(
      "(Z - 60) / 10",
      "(Z - 3.6) / 1",
      "Ln(Z)",
      "(Ln(Z) - 4.4) / 0.2",
      "(Z - 140) / 20",
      "(Z - 80) / 10"
    )
  )

# > Discrete covariate: one term per factor level. The levels of pa, bmi and
#   townsend are re-ordered for display so that "low", "underweight" and
#   "q1", "q2" come first.
lev_cat <- tp1 %>%
  select(all_of(cov_cat)) %>%
  mutate(
    pa = fct_relevel(pa, "low"),
    bmi = fct_relevel(bmi, "underweight"),
    townsend = fct_relevel(townsend, "q1", "q2")
  ) %>%
  map(levels)
# No difference in the term before and after specification except the
# reference level, so `term2` repeats `term`
term_cat <- map_df(
  lev_cat,
  function(lev) tibble(term = lev, term2 = lev),
  .id = "cov"
)

# Order in which the covariates are listed
cov_order <- c(
  "cur_age",
  "male", "race", "townsend", "smoke", "pa", "unhealthy_diet", "bmi",
  "ldl", "hdl", "creatinine", "sbp", "dbp",
  "atht", "db", "cancer", "mental"
)

# Final term
tmp3 <- bind_rows(term_dbl, term_cat)
tmp3$cov <- factor(tmp3$cov, levels = cov_order)
tmp3 <- arrange(tmp3, cov)
# Final summary data
# Each covariate/term row gets the description of the original dataset
# (`value`, from tmp1) and of the specified dataset (`value2`, from tmp2)
output <- tmp3 %>%
  left_join(tmp1, by = c("cov", "term")) %>%
  left_join(
    tmp2 %>% rename(value2 = value),
    by = c("cov", "term")
  ) %>%
  relocate(cov, term, value, term2, value2)

# Export the table as a CSV file in `path_output`
write.csv(
  output,
  file = file.path(path_output, "step1_tbl_cov_specification.csv")
)