Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Previous years wave #66

Merged
merged 8 commits into from
Nov 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ export(w2_enforce_streaming)
export(w2_rename_cols)
export(w3_enforce_streaming)
export(w3_rename_cols)
export(w4_check_skip_logic)
export(w4_clean_departments)
export(w4_enforce_skip_logic)
export(w4_enforce_streaming)
export(w4_rename_cols)
export(wrap_outputs)
importFrom(dplyr,across)
importFrom(dplyr,all_of)
Expand Down
15 changes: 12 additions & 3 deletions R/ingest.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ get_tidy_data_file <- function(...) {

#' @return the exported data as a dataframe

ingest <- function(survey = "1167489",
ingest <- function(survey = "1376897",
token = Sys.getenv("CARS_TOKEN"),
secret = Sys.getenv("CARS_SECRET"),
proxies = Sys.getenv("alt_proxy"),
Expand Down Expand Up @@ -199,10 +199,12 @@ get_all_waves <- function(mode = c("api", "file")) {

if (mode == "api") {
data <- get_tidy_data_api()
w4_data <- get_tidy_data_api(survey = "1167489")
w3_data <- get_tidy_data_api(survey = "961613")
w2_data <- get_tidy_data_api(survey = "790800")
} else if (mode == "file") {
data <- read_file("2022_data.csv")
w4_data <- read_file("2022_data.csv")
w3_data <- read_file("2021_data.csv")
w2_data <- read_file("2020_data.csv")
}
Expand All @@ -213,7 +215,14 @@ get_all_waves <- function(mode = c("api", "file")) {
apply_skip_logic() %>%
clean_departments() %>%
derive_vars()
data$year <- 2022
data$year <- 2023

w4_data <- w4_data %>%
tidy_colnames() %>%
w4_rename_cols() %>%
w4_enforce_streaming() %>%
w4_clean_departments()
w4_data$year <- 2022

w3_data <- w3_data %>%
tidy_colnames() %>%
Expand All @@ -227,7 +236,7 @@ get_all_waves <- function(mode = c("api", "file")) {
w2_enforce_streaming()
w2_data$year <- 2020

data <- dplyr::bind_rows(data, w3_data, w2_data)
data <- dplyr::bind_rows(data, w4_data, w3_data, w2_data)

return(data)
}
4 changes: 2 additions & 2 deletions R/wave_2_preprocessing.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#' Rename columns (wave 3)
#' Rename columns (wave 2)
#'
#' @description add meaningful column names to dataset ingested from smartsurvey API.
#'
#' @param data CARS wave 3 (2021) survey data (data.frame).
#' @param data CARS wave 2 (2020) survey data (data.frame).
#'
#' @return data.frame
#'
Expand Down
253 changes: 253 additions & 0 deletions R/wave_4_preprocessing.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@

#' @title Rename columns
#'
#' @description Renames columns and removes unnecessary columns
#'
#' @param data tidy CARS dataset
#'
#' @return data.frame
#'
#' @export

w4_rename_cols <- function(data) {

if (class(data) != "data.frame") {
stop("Unexpected input: data is not a data.frame.")
}
if (ncol(data) != 112) {
stop("Unexpected input: incorrect number of columns. Please use the 2022 CARS dataset.")
}

colnames(data)[c(1, 7:ncol(data))] <- c(
"ID",
"started",
"ended",
"tracking_link",
"workplace",
"CS_grade",
"department",
"other_department_name",
"prof_DS",
"prof_DDAT",
"prof_GAD",
"prof_GES",
"prof_geog",
"prof_GORS",
"prof_GSR",
"prof_GSG",
"prof_CS_none",
"prof_CS_other",
"ONS_directorate",
"highest_qualification",
"qual_1_subject",
"qual_1_level",
"qual_1_learn_code",
"qual_2_subject",
"qual_2_level",
"qual_2_learn_code",
"qual_3_subject",
"qual_3_level",
"qual_3_learn_code",
"code_freq",
"management",
"ops_analysis",
"ops_cleaning",
"ops_linking",
"ops_transfer_migration",
"ops_vis",
"ops_machine_learning",
"ops_modelling",
"ops_QA",
"ops_other",
"ops_other_name",
"knowledge_R",
"access_R",
"knowledge_SQL",
"access_SQL",
"knowledge_SAS",
"access_SAS",
"knowledge_VBA",
"access_VBA",
"knowledge_python",
"access_python",
"knowledge_SPSS",
"access_SPSS",
"knowledge_stata",
"access_stata",
"knowledge_JS",
"access_JS",
"knowledge_java",
"access_java",
"knowledge_C",
"access_C",
"knowledge_matlab",
"access_matlab",
"knowledge_access_other",
"knowledge_git",
"access_git",
"other_coding_experience",
"coding_ability_change",
"prev_coding_experience",
"first_learned",
"heard_of_RAP",
"know_RAP_champ",
"strategy_knowledge",
"RAP_confident",
"RAP_supported",
"RAP_resources",
"RAP_components",
"RAP_important",
"RAP_implementing",
"RAP_planning",
"RAP_comments",
"prac_use_open_source",
"prac_open_source_own",
"prac_version_control",
"prac_review",
"prac_functions",
"prac_unit_test",
"prac_package",
"prac_dir_structure",
"prac_style",
"prac_automated_QA",
"prac_AQUA_book",
"doc_comments",
"doc_functions",
"doc_readme",
"doc_desk_notes",
"doc_registers",
"doc_AQA_logs",
"doc_flow_charts",
"doc_other",
"CI",
"dep_management",
"reproducible_workflow",
"misc_coding",
"misc_support",
"misc_additional_data",
"misc_other"
)

data <- data[!colnames(data) %in% c("UserNo", "Name", "Email", "IP.Address", "Unique.ID")]

return(data)
}

#' @title Clean department data
#'
#' @description add NHS to department list and merge departments where needed.
#'
#' @param data cleaned CARS dataset
#'
#' @return CARS dataset
#' @export

w4_clean_departments <- function(data) {

data$department[grepl("forest research", tolower(data$other_department_name))] <- "Forestry Commission"

data$department[data$workplace == "NHS"] <- "NHS"

defra_orgs <- c(
"Department for Environment, Food and Rural Affairs (excl. agencies)",
"Forestry Commission",
"Animal and Plant Health Agency",
"Centre for Environment, Fisheries and Aquaculture Science",
"Rural Payments Agency",
"Environment Agency",
"Marine Management Organisation",
"Natural England"
)

data$defra <- data$department %in% defra_orgs

return(data)

}



#' @title Apply skip logic
#'
#' @description Iteratively applies enforce_skip_logic to the necessary fields in the data.
#'
#' @param data data.frame
#'
#' @return cleaned data.frame
#'
#' @export

w4_enforce_streaming <- function(data) {

conditions <- list(data$workplace %in% c("Civil service, including devolved administations", "test"),
data$department %in% c("Office for National Statistics", "test"),
data$highest_qualification != "Any other qualification",
data$code_freq != "Never",
data$other_coding_experience != "No",
data$prev_coding_experience != "No",
data$heard_of_RAP != "No")

skipped_cols <- list(colnames(data)[which(colnames(data) == "CS_grade"):which(colnames(data) == "ONS_directorate")],
colnames(data)[which(colnames(data) == "ONS_directorate")],
colnames(data)[which(colnames(data) == "qual_1_subject"):which(colnames(data) == "qual_3_learn_code")],
colnames(data)[which(colnames(data) == "prac_use_open_source"):which(colnames(data) == "misc_coding")],
colnames(data)[which(colnames(data) == "coding_ability_change"):which(colnames(data) == "first_learned")],
colnames(data)[which(colnames(data) == "first_learned")],
colnames(data)[which(colnames(data) == "know_RAP_champ"):which(colnames(data) == "RAP_comments")])

for(i in 1:length(conditions)){
data <- w4_enforce_skip_logic(data, conditions[[i]], skipped_cols[[i]])
}

return(data)

}


#' @title Check skip logic
#'
#' @description Checks whether the skip logic was followed correctly. Backtracking while filling the survey can result in inconsistent response sets.
#' This check returns row numbers where questions which should have been skipped contain anything other than NA.
#'
#' @param data data.frame
#' @param condition logical vector. Example: data$row == "skip response"
#' @param skipped_cols character. questions that should have been skipped if condition != TRUE
#'
#' @return list of rows failing the check
#'
#' @export

w4_check_skip_logic <- function(data, condition, skipped_cols) {

condition_failed <- !condition & !is.na(data[skipped_cols])

row_failed <- as.logical(rowSums(condition_failed))


return(
which(row_failed)
)

}

#' @title enforce skip logic
#'
#' @description Replaces values in rows with NAs where check_skip_logic has identified backtracking.
#'
#' @param data data.frame
#' @param condition logical vector. Example: data$row == "skip response"
#' @param skipped_cols character. questions that should have been skipped if condition != TRUE
#'
#' @return data.frame with rows failing the check replaced with NAs
#'
#' @export

w4_enforce_skip_logic <- function(data, condition, skipped_cols) {

row_index <- w4_check_skip_logic(data, condition, skipped_cols)

data[row_index, skipped_cols] <- NA

return(data)

}
2 changes: 1 addition & 1 deletion man/derive_language_status.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/ingest.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/w2_rename_cols.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 22 additions & 0 deletions man/w4_check_skip_logic.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/w4_clean_departments.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading