diff --git a/NAMESPACE b/NAMESPACE index df6995a..caef0a0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -43,6 +43,11 @@ export(w2_enforce_streaming) export(w2_rename_cols) export(w3_enforce_streaming) export(w3_rename_cols) +export(w4_check_skip_logic) +export(w4_clean_departments) +export(w4_enforce_skip_logic) +export(w4_enforce_streaming) +export(w4_rename_cols) export(wrap_outputs) importFrom(dplyr,across) importFrom(dplyr,all_of) diff --git a/R/ingest.R b/R/ingest.R index ca0ab0b..6076bce 100644 --- a/R/ingest.R +++ b/R/ingest.R @@ -52,7 +52,7 @@ get_tidy_data_file <- function(...) { #' @return the exported data as a dataframe -ingest <- function(survey = "1167489", +ingest <- function(survey = "1376897", token = Sys.getenv("CARS_TOKEN"), secret = Sys.getenv("CARS_SECRET"), proxies = Sys.getenv("alt_proxy"), @@ -199,10 +199,12 @@ get_all_waves <- function(mode = c("api", "file")) { if (mode == "api") { data <- get_tidy_data_api() + w4_data <- get_tidy_data_api(survey = "1167489") w3_data <- get_tidy_data_api(survey = "961613") w2_data <- get_tidy_data_api(survey = "790800") } else if (mode == "file") { data <- read_file("2022_data.csv") + w4_data <- read_file("2022_data.csv") w3_data <- read_file("2021_data.csv") w2_data <- read_file("2020_data.csv") } @@ -213,7 +215,14 @@ get_all_waves <- function(mode = c("api", "file")) { apply_skip_logic() %>% clean_departments() %>% derive_vars() - data$year <- 2022 + data$year <- 2023 + + w4_data <- w4_data %>% + tidy_colnames() %>% + w4_rename_cols() %>% + w4_enforce_streaming() %>% + w4_clean_departments() + w4_data$year <- 2022 w3_data <- w3_data %>% tidy_colnames() %>% @@ -227,7 +236,7 @@ get_all_waves <- function(mode = c("api", "file")) { w2_enforce_streaming() w2_data$year <- 2020 - data <- dplyr::bind_rows(data, w3_data, w2_data) + data <- dplyr::bind_rows(data, w4_data, w3_data, w2_data) return(data) } diff --git a/R/wave_2_preprocessing.R b/R/wave_2_preprocessing.R index 988ed2d..0c96682 100644 --- a/R/wave_2_preprocessing.R +++ b/R/wave_2_preprocessing.R @@ -1,8 +1,8 @@ -#' Rename columns (wave 3) +#' Rename columns (wave 2) #' #' @description add meaningful column names to dataset ingested from smartsurvey API. #' -#' @param data CARS wave 3 (2021) survey data (data.frame). +#' @param data CARS wave 2 (2020) survey data (data.frame). #' #' @return data.frame #' diff --git a/R/wave_4_preprocessing.R b/R/wave_4_preprocessing.R new file mode 100644 index 0000000..e3673fe --- /dev/null +++ b/R/wave_4_preprocessing.R @@ -0,0 +1,253 @@ + +#' @title Rename columns +#' +#' @description Renames columns and removes unnecessary columns +#' +#' @param data tidy CARS dataset +#' +#' @return data.frame +#' +#' @export + +w4_rename_cols <- function(data) { + + if (class(data) != "data.frame") { + stop("Unexpected input: data is not a data.frame.") + } + if (ncol(data) != 112) { + stop("Unexpected input: incorrect number of columns. Please use the 2022 CARS dataset.") + } + + colnames(data)[c(1, 7:ncol(data))] <- c( + "ID", + "started", + "ended", + "tracking_link", + "workplace", + "CS_grade", + "department", + "other_department_name", + "prof_DS", + "prof_DDAT", + "prof_GAD", + "prof_GES", + "prof_geog", + "prof_GORS", + "prof_GSR", + "prof_GSG", + "prof_CS_none", + "prof_CS_other", + "ONS_directorate", + "highest_qualification", + "qual_1_subject", + "qual_1_level", + "qual_1_learn_code", + "qual_2_subject", + "qual_2_level", + "qual_2_learn_code", + "qual_3_subject", + "qual_3_level", + "qual_3_learn_code", + "code_freq", + "management", + "ops_analysis", + "ops_cleaning", + "ops_linking", + "ops_transfer_migration", + "ops_vis", + "ops_machine_learning", + "ops_modelling", + "ops_QA", + "ops_other", + "ops_other_name", + "knowledge_R", + "access_R", + "knowledge_SQL", + "access_SQL", + "knowledge_SAS", + "access_SAS", + "knowledge_VBA", + "access_VBA", + "knowledge_python", + "access_python", + "knowledge_SPSS", + "access_SPSS", + "knowledge_stata", + "access_stata", + "knowledge_JS", + "access_JS", + "knowledge_java", + "access_java", + "knowledge_C", + "access_C", + "knowledge_matlab", + "access_matlab", + "knowledge_access_other", + "knowledge_git", + "access_git", + "other_coding_experience", + "coding_ability_change", + "prev_coding_experience", + "first_learned", + "heard_of_RAP", + "know_RAP_champ", + "strategy_knowledge", + "RAP_confident", + "RAP_supported", + "RAP_resources", + "RAP_components", + "RAP_important", + "RAP_implementing", + "RAP_planning", + "RAP_comments", + "prac_use_open_source", + "prac_open_source_own", + "prac_version_control", + "prac_review", + "prac_functions", + "prac_unit_test", + "prac_package", + "prac_dir_structure", + "prac_style", + "prac_automated_QA", + "prac_AQUA_book", + "doc_comments", + "doc_functions", + "doc_readme", + "doc_desk_notes", + "doc_registers", + "doc_AQA_logs", + "doc_flow_charts", + "doc_other", + "CI", + "dep_management", + "reproducible_workflow", + "misc_coding", + "misc_support", + "misc_additional_data", + "misc_other" + ) + + data <- data[!colnames(data) %in% c("UserNo", "Name", "Email", "IP.Address", "Unique.ID")] + + return(data) +} + +#' @title Clean department data +#' +#' @description add NHS to department list and merge departments where needed. +#' +#' @param data cleaned CARS dataset +#' +#' @return CARS dataset +#' @export + +w4_clean_departments <- function(data) { + + data$department[grepl("forest research", tolower(data$other_department_name))] <- "Forestry Commission" + + data$department[data$workplace == "NHS"] <- "NHS" + + defra_orgs <- c( + "Department for Environment, Food and Rural Affairs (excl. agencies)", + "Forestry Commission", + "Animal and Plant Health Agency", + "Centre for Environment, Fisheries and Aquaculture Science", + "Rural Payments Agency", + "Environment Agency", + "Marine Management Organisation", + "Natural England" + ) + + data$defra <- data$department %in% defra_orgs + + return(data) + +} + + + +#' @title Apply skip logic +#' +#' @description Iteratively applies enforce_skip_logic to the necessary fields in the data. +#' +#' @param data data.frame +#' +#' @return cleaned data.frame +#' +#' @export + +w4_enforce_streaming <- function(data) { + + conditions <- list(data$workplace %in% c("Civil service, including devolved administations", "test"), + data$department %in% c("Office for National Statistics", "test"), + data$highest_qualification != "Any other qualification", + data$code_freq != "Never", + data$other_coding_experience != "No", + data$prev_coding_experience != "No", + data$heard_of_RAP != "No") + + skipped_cols <- list(colnames(data)[which(colnames(data) == "CS_grade"):which(colnames(data) == "ONS_directorate")], + colnames(data)[which(colnames(data) == "ONS_directorate")], + colnames(data)[which(colnames(data) == "qual_1_subject"):which(colnames(data) == "qual_3_learn_code")], + colnames(data)[which(colnames(data) == "prac_use_open_source"):which(colnames(data) == "misc_coding")], + colnames(data)[which(colnames(data) == "coding_ability_change"):which(colnames(data) == "first_learned")], + colnames(data)[which(colnames(data) == "first_learned")], + colnames(data)[which(colnames(data) == "know_RAP_champ"):which(colnames(data) == "RAP_comments")]) + + for(i in 1:length(conditions)){ + data <- w4_enforce_skip_logic(data, conditions[[i]], skipped_cols[[i]]) + } + + return(data) + +} + + +#' @title Check skip logic +#' +#' @description Checks whether the skip logic was followed correctly. Backtracking while filling the survey can result in inconsistent response sets. +#' This check returns row numbers where questions which should have been skipped contain anything other than NA. +#' +#' @param data data.frame +#' @param condition logical vector. Example: data$row == "skip response" +#' @param skipped_cols character. questions that should have been skipped if condition != TRUE +#' +#' @return list of rows failing the check +#' +#' @export + +w4_check_skip_logic <- function(data, condition, skipped_cols) { + + condition_failed <- !condition & !is.na(data[skipped_cols]) + + row_failed <- as.logical(rowSums(condition_failed)) + + + return( + which(row_failed) + ) + +} + +#' @title enforce skip logic +#' +#' @description Replaces values in rows with NAs where check_skip_logic has identified backtracking. +#' +#' @param data data.frame +#' @param condition logical vector. Example: data$row == "skip response" +#' @param skipped_cols character. questions that should have been skipped if condition != TRUE +#' +#' @return data.frame with rows failing the check replaced with NAs +#' +#' @export + +w4_enforce_skip_logic <- function(data, condition, skipped_cols) { + + row_index <- w4_check_skip_logic(data, condition, skipped_cols) + + data[row_index, skipped_cols] <- NA + + return(data) + +} diff --git a/man/derive_language_status.Rd b/man/derive_language_status.Rd index 8477a56..fc313d9 100644 --- a/man/derive_language_status.Rd +++ b/man/derive_language_status.Rd @@ -13,5 +13,5 @@ derive_language_status(data) data.frame } \description{ -Derve the status of each programmming language as "access" (access only), "knowledge" (knowledge only), "both" or "neither". +Derive the status of each programmming language as "access" (access only), "knowledge" (knowledge only), "both" or "neither". } diff --git a/man/ingest.Rd b/man/ingest.Rd index 9be5435..b970a63 100644 --- a/man/ingest.Rd +++ b/man/ingest.Rd @@ -5,7 +5,7 @@ \title{Ingest smartsurvey data} \usage{ ingest( - survey = "1167489", + survey = "1376897", token = Sys.getenv("CARS_TOKEN"), secret = Sys.getenv("CARS_SECRET"), proxies = Sys.getenv("alt_proxy"), diff --git a/man/w2_rename_cols.Rd b/man/w2_rename_cols.Rd index 00f6611..571830d 100644 --- a/man/w2_rename_cols.Rd +++ b/man/w2_rename_cols.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/wave_2_preprocessing.R \name{w2_rename_cols} \alias{w2_rename_cols} -\title{Rename columns (wave 3)} +\title{Rename columns (wave 2)} \usage{ w2_rename_cols(data) } \arguments{ -\item{data}{CARS wave 3 (2021) survey data (data.frame).} +\item{data}{CARS wave 2 (2020) survey data (data.frame).} } \value{ data.frame diff --git a/man/w4_check_skip_logic.Rd b/man/w4_check_skip_logic.Rd new file mode 100644 index 0000000..002bab7 --- /dev/null +++ b/man/w4_check_skip_logic.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/wave_4_preprocessing.R +\name{w4_check_skip_logic} +\alias{w4_check_skip_logic} +\title{Check skip logic} +\usage{ +w4_check_skip_logic(data, condition, skipped_cols) +} +\arguments{ +\item{data}{data.frame} + +\item{condition}{logical vector. Example: data$row == "skip response"} + +\item{skipped_cols}{character. questions that should have been skipped if condition != TRUE} +} +\value{ +list of rows failing the check +} +\description{ +Checks whether the skip logic was followed correctly. Backtracking while filling the survey can result in inconsistent response sets. +This check returns row numbers where questions which should have been skipped contain anything other than NA. +} diff --git a/man/w4_clean_departments.Rd b/man/w4_clean_departments.Rd new file mode 100644 index 0000000..f29fa47 --- /dev/null +++ b/man/w4_clean_departments.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/wave_4_preprocessing.R +\name{w4_clean_departments} +\alias{w4_clean_departments} +\title{Clean department data} +\usage{ +w4_clean_departments(data) +} +\arguments{ +\item{data}{cleaned CARS dataset} +} +\value{ +CARS dataset +} +\description{ +add NHS to department list and merge departments where needed. +} diff --git a/man/w4_enforce_skip_logic.Rd b/man/w4_enforce_skip_logic.Rd new file mode 100644 index 0000000..ba1943a --- /dev/null +++ b/man/w4_enforce_skip_logic.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/wave_4_preprocessing.R +\name{w4_enforce_skip_logic} +\alias{w4_enforce_skip_logic} +\title{enforce skip logic} +\usage{ +w4_enforce_skip_logic(data, condition, skipped_cols) +} +\arguments{ +\item{data}{data.frame} + +\item{condition}{logical vector. Example: data$row == "skip response"} + +\item{skipped_cols}{character. questions that should have been skipped if condition != TRUE} +} +\value{ +data.frame with rows failing the check replaced with NAs +} +\description{ +Replaces values in rows with NAs where check_skip_logic has identified backtracking. +} diff --git a/man/w4_enforce_streaming.Rd b/man/w4_enforce_streaming.Rd new file mode 100644 index 0000000..170dac7 --- /dev/null +++ b/man/w4_enforce_streaming.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/wave_4_preprocessing.R +\name{w4_enforce_streaming} +\alias{w4_enforce_streaming} +\title{Apply skip logic} +\usage{ +w4_enforce_streaming(data) +} +\arguments{ +\item{data}{data.frame} +} +\value{ +cleaned data.frame +} +\description{ +Iteratively applies enforce_skip_logic to the necessary fields in the data. +} diff --git a/man/w4_rename_cols.Rd b/man/w4_rename_cols.Rd new file mode 100644 index 0000000..5cd74df --- /dev/null +++ b/man/w4_rename_cols.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/wave_4_preprocessing.R +\name{w4_rename_cols} +\alias{w4_rename_cols} +\title{Rename columns} +\usage{ +w4_rename_cols(data) +} +\arguments{ +\item{data}{tidy CARS dataset} +} +\value{ +data.frame +} +\description{ +Renames columns and removes unnecessary columns +}