forked from vynguyen92/publish_nhanes_data_1988_2018
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathf - derive_a_variable_from_two_variables.R
143 lines (108 loc) · 4.93 KB
/
f - derive_a_variable_from_two_variables.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#' Derive harmonized variables from a questionnaire/value pair of columns
#'
#' Every codename in `df_doc_cleaning` whose `codename_note` contains
#' "Derive" is processed in turn. The note is parsed as
#' "Derive from <A> and <B>"; the source codename ending in "G" is used as the
#' questionnaire (category) column and the one ending in "Q" as the value
#' column. For the survey cycles (`SDDSRVYR`) listed for that codename, the
#' derived column of `df_unclean` is filled in three steps:
#'   1. copy the questionnaire column;
#'   2. where the copied value equals 1, substitute the value column;
#'   3. recode questionnaire categories according to the fix-categories
#'      documentation (`categories_num` -> `new_categories`).
#' Afterwards the observed category mapping is compared with the documented
#' one, and diagnostics are printed when the two differ.
#'
#' @param df_doc_cleaning Data frame of cleaning documentation. The columns
#'   `variable_codename_use`, `codename_note` and `SDDSRVYR` are read.
#' @param name_fix_categories_of_df Name of the element of
#'   `list_cleaning_documentation` that holds the category fixes.
#' @param list_cleaning_documentation List of documentation data frames. The
#'   selected element is read through its columns `new_codename`,
#'   `codename_original`, `categories_num` and `new_categories`.
#' @param df_unclean Data frame to update. It must contain `SDDSRVYR` and the
#'   source columns named in the "Derive from ..." notes.
#'
#' @return `df_unclean` with the derived columns filled in. When no
#'   documentation row is flagged with "Derive", the input is returned
#'   unchanged.
derive_a_variable_from_two_variables <- function(df_doc_cleaning,
                                                 name_fix_categories_of_df,
                                                 list_cleaning_documentation,
                                                 df_unclean) {
  # Documentation rows describing variables that have to be derived.
  subset_variables_derived <- df_doc_cleaning %>%
    filter(grepl("Derive", codename_note))

  codenames_derived <- subset_variables_derived %>%
    pull(variable_codename_use) %>%
    unique()

  dataset_fix_categories <-
    list_cleaning_documentation[[name_fix_categories_of_df]]

  # seq_along() yields an empty sequence when nothing is flagged; seq(0) would
  # iterate over c(1, 0) and fail on an NA codename.
  for (i in seq_along(codenames_derived)) {
    corrected_codename_i <- codenames_derived[i]

    subset_derived_i <- subset_variables_derived %>%
      filter(variable_codename_use == corrected_codename_i)

    # Survey cycles in which this variable has to be derived.
    unique_cycles <- subset_derived_i %>%
      pull(SDDSRVYR)

    index_affected_cycles <- which(df_unclean$SDDSRVYR %in% unique_cycles)

    codename_note_i <- subset_derived_i %>%
      pull(codename_note) %>%
      unique()

    # "Derive from <A> and <B>" -> c("<A>", "<B>")
    codenames_used_for_deriving <- unlist(
      strsplit(gsub("Derive from ", "", codename_note_i), split = " and ")
    )

    # The "G" codename holds the categories, the "Q" codename the values.
    # NOTE(review): assumes exactly one source codename matches each suffix;
    # confirm against the documentation files.
    questionnaire_codename <- codenames_used_for_deriving[
      grepl("G$", codenames_used_for_deriving)
    ]
    value_codename <- codenames_used_for_deriving[
      grepl("Q$", codenames_used_for_deriving)
    ]

    # Step 1: start from the questionnaire categories.
    df_unclean[index_affected_cycles, corrected_codename_i] <-
      df_unclean[index_affected_cycles, questionnaire_codename]

    # Step 2: where the category is 1, take the value column instead.
    index_questionnaire_is_1 <- which(
      df_unclean[, corrected_codename_i] == 1 &
        df_unclean$SDDSRVYR %in% unique_cycles
    )
    df_unclean[index_questionnaire_is_1, corrected_codename_i] <-
      df_unclean[index_questionnaire_is_1, value_codename]

    # Step 3: recode the remaining categories as documented.
    subset_fix_categories <- dataset_fix_categories %>%
      filter(new_codename == corrected_codename_i) %>%
      filter(codename_original == questionnaire_codename)

    for (j in seq_len(nrow(subset_fix_categories))) {
      problematic_category_j <- subset_fix_categories[["categories_num"]][j]
      corrected_category_j <-
        as.numeric(subset_fix_categories[["new_categories"]][j])

      # NOTE(review): this match is not restricted to the affected cycles, so
      # rows of other cycles with the same questionnaire category are recoded
      # too. Kept as in the original; confirm that this is intended.
      index_problematic_category_j <- which(
        df_unclean[, questionnaire_codename] == problematic_category_j
      )
      df_unclean[index_problematic_category_j, corrected_codename_i] <-
        corrected_category_j
    }

    # Consistency check: distinct combinations actually present in the data.
    df_checking <- df_unclean[
      index_affected_cycles,
      c(corrected_codename_i, questionnaire_codename, value_codename, "SDDSRVYR")
    ] %>%
      unique() %>%
      drop_na(all_of(corrected_codename_i))

    # Rows without a value come from recoded categories; reduce them to the
    # observed (new category, original category) mapping.
    df_harmonized_values <- df_checking %>%
      filter(is.na(!!sym(value_codename))) %>%
      select(-SDDSRVYR) %>%
      unique() %>%
      select(where(~ !all(is.na(.)))) %>%
      arrange(!!sym(corrected_codename_i)) %>%
      mutate(across(everything(), as.numeric))

    # The same mapping as written in the documentation.
    df_documented_values <- subset_fix_categories %>%
      select(new_categories, categories_num)
    colnames(df_documented_values) <- c(
      corrected_codename_i,
      questionnaire_codename
    )
    df_documented_values <- df_documented_values %>%
      arrange(!!sym(corrected_codename_i)) %>%
      mutate(across(everything(), as.numeric)) %>%
      as.data.frame()

    are_categories_the_same <- identical(
      df_harmonized_values,
      df_documented_values
    )

    # Print diagnostics when data and documentation disagree.
    if (!are_categories_the_same) {
      print(corrected_codename_i)
      print(df_checking)
      print(df_harmonized_values)
      print(df_documented_values)
    }
  }

  df_unclean
}