-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathObserveTo.Dysgly.R
277 lines (243 loc) · 11.9 KB
/
ObserveTo.Dysgly.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
library(tidyverse)
library(openxlsx)
library(readxl)
library(dplyr)
library(lubridate)
setwd("E:/Molecular Epidemiology/GLYCEMIA/GLY")
all <- read_excel("all.xlsx")
str(all)
####1.定位到治疗半年以上的数据####
#####1.1找到治疗半年以上的病载记录,去除半年以前的记录#####
TreatMore6ms <- all %>% group_by(ID) %>% arrange(ID, TestingDate) %>%
mutate(TimefromART.m = round(interval(start=ARTinitialDate, end=VLDate) / months(1), digits = 2)) %>%
mutate(ART6months = ifelse(!is.na(VL) & TimefromART.m > 6, cumsum(!is.na(VL) & TimefromART.m > 6),NA)) %>%
mutate(first_occurrence = if_else(any(ART6months == 1), which(ART6months == 1)[1], NA_integer_)) %>%
filter(is.na(first_occurrence) | row_number() >= first_occurrence) %>%
select(-first_occurrence) %>%
filter(!all(is.na(ART6months))) %>%
mutate(ART6months = ifelse(is.na(ART6months), -1, ART6months))
#####1.2对治疗半年以上只有一次记录的ID,若不是VF则去除,因为两次记录才能判定LLV和VS#####
ids.1record <- TreatMore6ms %>% group_by(ID) %>% filter(!any(ART6months == 2))
ids.1record.removed <- ids.1record %>% group_by(ID) %>% filter(any(ART6months == 1 & VL< 1000))
TreatMore6ms <- anti_join(TreatMore6ms, ids.1record.removed, by = "ID")
write.xlsx(TreatMore6ms, "TreatMore6ms.xlsx")
####2.分VS,LLV,VF组####
#####2.1.提取出只有一条记录的ID,分组为VF#####
ids.1record.keep <- ids.1record %>% anti_join(ids.1record.removed, by = "ID")
ids.1record.keep <- ids.1record.keep %>% mutate(ART6months = ifelse(ART6months== -1, NA, ART6months))
ids.1record.keep <- ids.1record.keep %>% mutate(VLgroup = "VF")
#####2.2两条以上记录的,依据条件分VS,LLV,VF组#####
ids.more2record <- TreatMore6ms %>% anti_join(ids.1record.keep, by = "ID")
ids.2record <- ids.more2record %>% group_by(ID) %>% filter(ART6months %in% c(1, 2))
ids.2record <- ids.2record %>% group_by(ID) %>%
mutate(
VLgroup = case_when(
all(VL <= 50) ~ "VS",
first(VL) >= 1000 |
(first(VL) < 1000 & lead(VL) >= 1000) |
(first(VL) >= 200 & lead(VL) >= 200) ~ "VF",
TRUE ~ NA_character_
)) %>%
group_by(ID) %>%
mutate(VLgroup = max(VLgroup, na.rm = TRUE)) %>%
mutate(VLgroup = ifelse(is.na(VLgroup), "LLV", VLgroup))
#####2.3提取单个ID和对应的VL分组#####
ids.2record.select <- ids.2record %>% select(1, 11) %>% distinct(ID, .keep_all = TRUE)
ids.1record.keep.select <- ids.1record.keep %>% select(1, 11) %>% distinct(ID, .keep_all = TRUE)
ids.select.group <- bind_rows(ids.2record.select, ids.1record.keep.select)
#####2.4完成VL分组#####
VLgroup.vs.llv.vf <- TreatMore6ms %>% left_join(ids.select.group, by = "ID")
#####2.5计算入组时间#####
#第一次记录VL不是大于等于1000则删除,只有VF可以通过一次判断入组,其余都要观察两次,根据第二次值入组
VLgroup.vs.llv.vf.follow <- VLgroup.vs.llv.vf %>% group_by(ID) %>% arrange(ID, TestingDate) %>%
mutate(row_number = row_number()) %>%
filter(!(VLgroup == "VF" & row_number == 1 & VL < 1000)) %>%
filter(!(VLgroup == "VS" & row_number == 1)) %>%
filter(!(VLgroup == "LLV" & row_number == 1)) %>%
select(-row_number)%>%
mutate(StartfollowDate = first(VLDate[!is.na(VLDate)]))%>%
mutate(FollowTime.m = round(interval(start=StartfollowDate, end=TestingDate) / months(1), digits = 2)) %>%
mutate(ART6months = ifelse(ART6months== -1, NA, ART6months)) %>%
filter(FollowTime.m >= 0)
####3.入组后开始观察####
#####3.1血糖#####
#去除血糖缺失值
Gly <- VLgroup.vs.llv.vf.follow %>% filter(!is.na(Gly))
#识别基线血糖异常的ID
abnormalGLY <- Gly %>% group_by(ID) %>% arrange(ID, TestingDate) %>%
slice(1) %>% filter(Gly >= 7 | Gly <= 2.8) %>% pull(ID)
#剔除基线血糖异常的ID,对随访的血糖异常值进行剔除
Gly <- Gly %>% filter(!ID %in% abnormalGLY) %>% filter(Gly>2.8&Gly<12) %>% filter(n()>2)
write.xlsx(Gly, "Gly.xlsx")
# hist(data.3.3.3$Gly, breaks = 120)
# boxplot(data.3.3.3$Gly)
# summary(data.3.3.3$Gly)
#IQR <- Q3 - Q1
#upper.IQR <- Q3 + 1.5 * IQR
#hist(data.hlme$Gly, breaks = 30)
#####3.2检测时间间隔#####
#去除检测时间间隔小于等于30天的数据,剔除检测时间间隔大于180天往后的数据
Gly.TestInterval.30.180 <- Gly %>% group_by(ID) %>% arrange(ID, TestingDate) %>%
mutate(Testintervalday = c(0, as.numeric(diff(TestingDate)))) %>%
filter(row_number() == 1 & Testintervalday == 0 | Testintervalday > 30) %>% filter(n()>2) %>%
mutate(Testintervalday = c(0, as.numeric(diff(TestingDate)))) %>%
filter(row_number() == 1 & Testintervalday == 0 | Testintervalday > 30) %>% filter(n()>2) %>%
mutate(Testintervalday = c(0, as.numeric(diff(TestingDate)))) %>%
mutate(Testintervalday = c(0, as.numeric(diff(TestingDate)))) %>%
mutate(FollowStatus = ifelse(Testintervalday > 180, "N", "Y")) %>%
mutate(cumsum_status = cumsum(FollowStatus == "N")) %>%
filter(cumsum_status == 0) %>%
select(-cumsum_status) %>%
filter(n()>2)
####4.观察至第一次发生血糖异常截至####
Gly.Observe.to.abnormal <- Gly.TestInterval.30.180 %>% group_by(ID) %>% arrange(ID, TestingDate) %>%
filter((cumsum(Gly >= 7) == 0 | (Gly >= 7 & cumsum(Gly >= 7) == 1))) %>%
filter(FollowTime.m <= 84) %>% filter(n()>2)
To.abnormal.glygroup <- Gly.Observe.to.abnormal %>% group_by(ID)%>%mutate(
Glygroup = case_when(any(Gly >= 7) ~ 1, TRUE ~ 0))
write.xlsx(To.abnormal.glygroup, "To.abnormal.glygroup.xlsx")
#####4.1制作VLgroup与Glygroup的统计交叉表
#简化数据框
vl.gly.group <- To.abnormal.hlme %>% ungroup(ID) %>% select(1,3,5) %>% distinct(ID, .keep_all = TRUE)
str(vl.gly.group)
vl.gly.group$Glygroup <- as.factor(vl.gly.group$Glygroup)
#制交叉表
table.vl.gly <- table(vl.gly.group$VLgroup, vl.gly.group$Glygroup)
percent <- prop.table(table.vl.gly, margin = 1) * 100
#将values转化为data
data.vl.gly <- as.data.frame(table.vl.gly)
data.vl.gly$Var1 <- as.character(data.vl.gly$Var1)
#循环统计频数及其百分比表格
Freq.percent <- table.vl.gly
for (i in 1:nrow(table.vl.gly)) {
for (j in 1:ncol(table.vl.gly)) {
Freq.percent[i, j] <- paste0(table.vl.gly[i, j], " (", round(percent[i, j], 2), "%)")
}
}
#将values转化为data
Freq.percent <- as.data.frame.matrix(Freq.percent)
#添加Total
rowtotals <- rowSums(table.vl.gly)
coltotals <- colSums(table.vl.gly)
Freq.percent$Total <- rowtotals
Freq.percent <- rbind(Freq.percent, Total = c(coltotals, sum(coltotals)))
Freq.percent <- Freq.percent %>% select(`1`, `0`,`Total`) #重新排列顺序
print(Freq.percent)
#总体卡方检验
chisq.result <- chisq.test(table.vl.gly)
print(chisq.result)
#计算相关系数
library(vcd)
assocstats(table.vl.gly)
#Contingency coefficient (列联系数)
#Cramer’s V (Cramer V系数)
library(jmv)
#两两比较:P值和关联强度
#P值
data.llv.vf <- data.vl.gly %>% filter(Var1 !="VS")
data.llv.vs <- data.vl.gly %>% filter(Var1 !="VF")
data.vf.vs <- data.vl.gly %>% filter(Var1 !="LLV")
p.llv.vf <- contTables(formula = Freq ~ Var1:Var2, data = data.llv.vf, contCoef = TRUE, phiCra = TRUE)
p.llv.vs <- contTables(formula = Freq ~ Var1:Var2, data = data.llv.vs, contCoef = TRUE, phiCra = TRUE)
p.vf.vs <- contTables(formula = Freq ~ Var1:Var2, data = data.vf.vs, contCoef = TRUE, phiCra = TRUE)
print(p.llv.vf)
print(p.llv.vs)
print(p.vf.vs)
#OR值
library(epiDisplay)
tab.llv.vf <- xtabs(Freq~Var1+Var2, data = data.llv.vf)
tab.llv.vf <- tab.llv.vf[, c("1", "0")]#重新更改列的位置,把1放在前面,因为OR值计算是(a/b)/(c/d)=ad/bc
tab.llv.vs <- xtabs(Freq~Var1+Var2, data = data.llv.vs)
tab.llv.vs <- tab.llv.vs[, c("1", "0")]
tab.vf.vs <- xtabs(Freq~Var1+Var2, data = data.vf.vs)
tab.vf.vs <- tab.vf.vs[, c("1", "0")]
cci(cctable = tab.llv.vf)
cci(cctable = tab.llv.vs)
cci(cctable = tab.vf.vs)
####5.生存分析####
library(survival)
library(survminer)
#生成SurvivalTime
Glygroup.surv <- To.abnormal.glygroup %>% group_by(ID) %>%
mutate(Survivaltime = max(FollowTime.m))%>%
select(1, 6, 11, 16,17) %>%
distinct(ID, .keep_all = TRUE)
write.xlsx(Glygroup.surv, "Glygroup.surv.xlsx")
#拟合生存函数
fit <- survfit(Surv(Survivaltime,Glygroup) ~ VLgroup,
data = Glygroup.surv)
fit
#绘制风险函数图
ggsurvplot(
fit,
data = Glygroup.surv,
fun = "cumhaz",#将生存曲线转化为风险曲线
linetype = 1, # 根据分层更改线型c(0,1) or c("solid", "dashed") or "strata"
#surv.median.line = "hv", # 同时显示垂直和水平参考线 即增加中位生存时间 可选 "none"、"hv"、"h"、"v"
palette = "lancet" ,#定义颜色 可选调色板有 "hue" "grey","npg","aaas","lancet","jco", "ucscgb","uchicago","simpsons"和"rickandmorty".
xlab = "Time(months)",
ylab = "Cumulative Hazard of Hyperglycemia",
title = "",
legend = "right", # 指定图例位置 "top"(默认),"left","right","none","bottom"
legend.title = "",
legend.labs = c("VS", "LLV", "VF") ,
xlim = c(0, 90),
ylim = c(0, 1),
break.x.by = 12,
break.y.by = .25,
axes.offset = F, # 逻辑词,默认为TRUE。FALSE则生存曲线图的坐标轴从原点开始
#conf.int = TRUE,#增加置信区间
#conf.int.alpha = .3,# 数值,指定置信区间填充颜色的透明度; # 数值在0-1之间,0为完全透明,1为不透明
pval = TRUE, #log 秩检验
pval.size = 5,# 指定p值文本大小的数字,默认为 5
pval.coord = c(1,0.24),# 长度为2的数字向量,指定p值位置x、y,如pval.coord=c(x,y)
censor = T, # 逻辑词,默认为TRUE,在图上绘制删失点。
censor.shape = 3, # 数值或字符,用于指定删失点的形状;默认为"+"(3), 可选"|"(124)
censor.size = 1.5,# 指定删失点形状的大小,默认为4.5
# risk.table = "absolute", #"absolute"、"percentage"、"abs_pct"", #绝对人数、百分比和危险之中
# #risk.table.pos =in,#分线表放在图内
# risk.table.col = "strata", #按组更改风险表颜色
# risk.table.y.text.col = TRUE, #颜色风险表文本注释(按层)
# risk.table.y.text = FALSE, #在风险表图例中的文本注释中显示条形而不是名称
# risk.table.height = 0.2,
# risk.table.title="Number at the time",
# fontsize=4,#风险表字体
#ggtheme = theme_bw(), #使用主题自定义情节和风险表。
ggtheme = theme_survminer()+
theme(
panel.grid.major = element_line(color = "gray", size = 0.5), # 设置主要网格线
panel.grid.minor = element_line(color = "gray", linetype = "dotted", size = 0.25)) # 设置次要网格线
)
#观察结果统计
To.abnormal.glygroup.uniqueID <- To.abnormal.glygroup %>% distinct(ID, .keep_all = TRUE)
table(To.abnormal.glygroup.uniqueID$Glygroup)
table(To.abnormal.glygroup.uniqueID$VLgroup)
####6.lcmm####
library(lcmm)
To.abnormal.hlme <- To.abnormal.glygroup %>% select( 1, 6, 11, 13, 16) %>% group_by(ID) %>%
arrange(ID, FollowTime.m)%>%
mutate(Subject = cur_group_id())
model.1 <- hlme(fixed = Gly ~ 1 + FollowTime.m + I(FollowTime.m^2),
random = ~ 1 + FollowTime.m,
ng = 1,
data = To.abnormal.hlme,
subject = "Subject")
summary(model.1)
model.2 <- hlme(fixed = Gly ~ 1 + FollowTime.m + I(FollowTime.m^2),
mixture = ~ 1 + FollowTime.m + I(FollowTime.m^2),
random = ~ 1 + FollowTime.m,
ng = 2,
nwg = TRUE,
B = model.1,
idiag = FALSE,
maxiter = 50,
data = To.abnormal.hlme,
subject = "Subject")
summary(model.2)
postprob(model.2)
windows(width=10, height=8)
datnew <- data.frame(FollowTime.m = seq(0,84, length = 1000))
plotpred <- predictY(model.3, datnew, var.time ="FollowTime.m")
plot(plotpred, lty = 1, lwd = 5,marg=FALSE, shades=T,
xlab="FollowTime(Months)", ylab="GLY",
legend.loc = "topleft", cex=0.75)