forked from HeatherARobinson/EHR-data-processing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSIR study subject selection
194 lines (153 loc) · 8.34 KB
/
SIR study subject selection
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
memory.size(100000) #Assign sufficient memory to R
load ("sir.data.rda") #Load lng format input file
library(zoo)
library(plyr)
library(tidyverse)
library(zoo)
library(data.table)
library(survival)
library(lubridate)
#READ CONDITION FILES- suffix names with a common suffix to import together
temp = list.files(pattern="*1.csv")
for (i in 1:length(temp)) assign(temp[i], read.csv(temp[i]))
#######################################################################################
#SELECT PATIENTS 18 OR ABOVE AT HEART FAILURE DIAGNOSIS
hf<-sir.data[sir.data$ReadCode %in% HeartFailure1.csv$ReadCode,] #Create subset of HF patients
#length(unique(as.factor(hf$PatientID))) #7254 total heart failure patients identified with confirmed Read Codes (V2)
hf$Age<-(as.numeric(year(strptime(hf$EntryDate, format="%Y%m%d"))))-hf$BirthYear
hf$hfage<-hf$Age
smalltab<-hf[,c("PatientID","hfage")]
smalltab$PatientID<-as.factor(smalltab$PatientID)
smalltab<-smalltab[!is.na(smalltab$hfage),]
first<-smalltab %>% group_by(PatientID) %>%
slice(which.min(hfage))
ungroup(first)
head(first)
sir.data<-merge(sir.data[sir.data$PatientID %in% hf$PatientID,],as.data.frame(first),all.x=TRUE)
sir.data<-sir.data[sir.data$hfage>=18,]
#length(unique(as.factor(sir.data$PatientID))) #number of patients 18 or over at first diagnosis
save(sir.data,file="sirdatahfonly.rda")
####################################################################################
#SELECT PATIENTS WITH CREATININE DATA
sir.data[sir.data$ReadCode=="44J3.",]->crea #Based on Read Codes v2- adapt as required
crea <- droplevels(crea)
#CHECK FOR CASES VIABLE VALUE IS ENTERED IN THE 'UNITS' COLUMN BY MISTAKE
temp<-ifelse(as.numeric(as.character(crea$CodeUnits))>0 & as.numeric(as.character(crea$CodeUnits))<1000 & !is.na(crea$CodeValue),crea$CodeUnits,NA)
table(temp)
crea$CodeValue<-ifelse(as.numeric(as.character(crea$CodeUnits))>0 &
as.numeric(as.character(crea$CodeUnits))<1000 &
is.na(crea$CodeValue),crea$CodeUnits,crea$CodeValue)
crea$CodeUnits<-ifelse(!is.na(temp),paste(""),paste(crea$CodeUnits))
crea$CodeValue<-as.numeric(as.character(crea$CodeValue))
crea<-crea[!is.na(crea$CodeValue),]
length(unique(as.factor(crea$PatientID))) #number of hf patients over 18 at diagnosis with creatinine data
###########################################################################
#SENSITIVITY TESTS- HOW MANY ZERO CR VALUES AND HOW MANY CR VALUES UNDER 20
lowcr<-crea[crea$CodeValue<18 & !is.na(crea$CodeValue),]
length(lowcr$PatientID) #How many values are present but <20
levels(as.factor(lowcr$PatientID)) #How many patients are affected
crea<-crea[crea$CodeValue>0,] #Remove rows with no numeric creatinine value
#Convert values in mmol to umol
crea$CodeUnits<-as.factor(crea$CodeUnits)
levels(crea$CodeUnits)
#[1] "" "%" "g/L" "g/mol" "L" "micmol/l"
#[7] "mmol" "mmol/L" "umol/l" "None" "umol/L"
levels(crea$CodeUnits)[c(1,6,9)]<-"umol/L"
levels(crea$CodeUnits)[c(6)]<-"mmol/L"
levels(crea$CodeUnits)[c(2:5)]<-"NA"
crea<-crea[!is.na(crea$CodeUnits),]
crea$CodeValue<-ifelse(crea$CodeUnits=="mmol/L" & as.numeric(crea$CodeValue)<50,(as.numeric(crea$CodeValue)*1000),as.numeric(crea$CodeValue))
crea$CodeUnits<-"umol/L"
crea<-crea[as.numeric(crea$CodeValue)>=20 & !is.na(crea$CodeValue),]
#Appy upper limit here if required
save(crea,file="crearecleaned.rda")
###############################################################################
#DATA CLEANING
#load("sirdatahfonly.rda") #full patient records from all adult hf patients from all years
#load("crearecleaned.rda") # creatinine data table including tests from selected patients
#REMOVE SAME DAY CREATININE ENTRIES IF THE SOURCE LOCATION CODE DIFFERS.
#(OCCURS IN SIR DURING TRANSFER BETWEEN PRIMARY AND SECONDARY CARE EHF SYSTEMS)
crea<-crea[order(crea$PatientID,crea$CodeValue, rev(crea$Source)),]
crea2<-crea[(duplicated(crea[,c(1,4,6)])&!duplicated(crea[,7])),] #Find duplicates of value and patient ID with different sources
length(crea.2$CodeValue)
crea3<-crea[(duplicated(crea[,c(1,4,6)])),] #Find duplicates of value, date and patient ID
length(crea3$CodeValue)
crea<-crea[!rownames(crea) %in% rownames(crea2),]
#REMOVE DELAYED CREATININE ENTRIES FROM SAME CALENDAR MONTH IF THE SOURCE LOCATION CODE DIFFERS.
crea$event.date<-as.Date(as.character(crea$EntryDate),format="%Y%m%d")
(as.numeric(year(strptime(crea$event.date, format="%Y-%m-%d")))) -> year
(as.numeric(month(strptime(crea$event.date, format="%Y-%m-%d")))) -> month
crea$EntryPeriod<-paste(month,year)
crea$Source<-ifelse(crea$Source=="salfordt",paste("2"),paste("1")) #Simplify source codes to '2' for hospital, '1' for GP
crea$Source<-ifelse(is.na(crea$Source),paste("2"),crea$Source)
crea<-crea[order(crea[,1], -(crea[,4]),(crea[,7])),]
crea4<-crea[(duplicated(crea[,c(1,6,14])&!duplicated(crea[,7])),] #DELAYED DUPLICATES (SAME PATIENT, MONTH, VALUE)
length(crea4$PatientID) #Only 1 of the remaining potential duplicate records outside of the same day window come from different locations
crea<-crea[!rownames(crea) %in% rownames(crea4),]
#SELECT MEAN DAILY CREATININE IF MULTIPLE ENTRIES AFTER REMOVING DELAYED DUPLICATES AND OUT OF RANGE VALUES
smalltab<-crea[,c("PatientID","CodeValue", "EntryDate")]
xcrea<-smalltab %>% group_by(PatientID, EntryDate) %>%
summarize(Creatinine = mean(as.numeric(as.character(CodeValue))))
ungroup(xcrea)
crea<-merge(crea,as.data.frame(xcrea),all.x=TRUE)
######################################################
#Add demographic variables from lookup tables
#LSOA and age should be added here if not already present
#ASSIGN AGE (BIRTH DATE GIVEN TO NEAREST MONTH IN SIR DATA)
crea$Age<-(as.numeric(year(strptime(crea$event.date, format="%Y-%m-%d"))))-crea$BirthYear
#CODE ETHNICITY
ethnic.data<-read.table("ethnic.data.csv",header=TRUE,sep=",") #Call data from lookup table (see Open Source Resources repo)
ethnic.data$Category<-floor(ethnic.data$Category)
crea<-merge(crea,ethnic.data,by.x="Ethnicity",by.y="ClinCode2",all.x=TRUE, all.y=FALSE)
crea<-subset(crea, select=-c(Ethnicity,ClinCode1,EntryPeriod,Rubric))
colnames(crea)[which(names(crea) == "Category")] <- "Ethnicity"
#ADD LSOA
imd<-read.csv("IMD2010.csv") #Call data from lookup table (see Open Source Resources repo)
imd<-imd[,c("LSOA","IMD_Decile2010")]
crea<-merge(crea,imd,all.x=TRUE)
#LIMIT TO PATIENTS WITH AT LEAST 2 POST 2008 CREATININE TEST VALUES
crea<-crea[as.numeric(year(strptime(crea$event.date, format="%Y-%m-%d")))>=2008,]
table(crea$PatientID) < 2 -> rare
rownames(as.matrix(rare)) -> ids
crea[!(crea$PatientID %in% ids[rare]),] -> crea.rep
sir.data<-sir.data[sir.data$PatientID %in% crea.rep$PatientID,]
#Breakpoint
#######################################################
save(crea.rep, file = "crea.rephf2tests.Rdata")
save(sir.data, file = "sir.datahf2tests.Rdata")
#######################################################
#SELECT PATIENTS WHICH HAVE AT LEAST 2 TESTS, OPTION FOR TIME RANGE RESTRICTION
sir.data$event.date<-as.Date(as.character(sir.data$EntryDate),format="%Y%m%d") #Ensure dates are in date format
aggregate(as.numeric(year(strptime(sir.data$event.date, format="%Y-%m-%d"))), list(sir.data$PatientID), range) -> ranges
ranges$x[,2] - ranges$x[,1] -> ranges$range
ranges[(which(ranges$range<2)),1] -> range_short_ids # define exclusion range as 2 years
crea.rep[-which(crea.rep$PatientID %in% range_short_ids),]->crea.rep
#ADD ON PRACTISE INFO TO CREA.REP
prac<-read.csv("PractiseKeySIR.csv")
pats<-merge(sir.data,prac,all.x=TRUE)
pats<-pats[!is.na(pats$Source),]
pats<-pats[!pats$Source=="",]
pats1<-pats[pats$Source=="salfordt",]
pats1<-pats1[,c(1,2,4)]
pats2<-pats[!pats$Source=="salfordt",]
pats2<-pats2[!duplicated(pats2[,c(2,4)],fromLast=TRUE),]
pats2<-pats2[,c(1,2,4,5,7)]
colnames(pats2)[1]<-"Source2"
pats3<-merge(pats1,pats2,all=TRUE)
pats3$Source2<-ifelse(is.na(pats3$Source2),paste(pats3$Source),paste(pats3$Source2))
pats3<-pats3[,c(1,2,4:6)]
colnames(pats3)[3]<-"Source"
pats3<-unique(pats3)
table(pats3$Source)
length(pats3$Source)
crea.rep<-merge(crea.rep,pats3,all.x=TRUE)
#Add a random practise ID for splitting the data
Source<-unique(crea.rep$Source[!crea.rep$Source=="salfordt"])
Source<-Source[c(1:38,40:57)] #remove NA row (hospital)
s1<-as.data.frame(Source)
randompracID<-sample(1:56, 56)
s2<-as.data.frame(randompracID)
s3<-cbind(s1,s2)
crea.rep<-merge(crea.rep,s3,all.x=TRUE)
crea.rep$randompracID<-ifelse(crea.rep$Source=="salfordt",0,crea.rep$randompracID)
save(crea.rep,file="crea.rep2yrsall.rda")