Skip to content

Commit

Permalink
Merge pull request #66 from best-practice-and-impact/previous-years-wave
Browse files Browse the repository at this point in the history
Previous years wave
  • Loading branch information
CHCRowley authored Nov 27, 2023
2 parents cf0da38 + 48b48fc commit b49755e
Show file tree
Hide file tree
Showing 12 changed files with 370 additions and 9 deletions.
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ export(w2_enforce_streaming)
export(w2_rename_cols)
export(w3_enforce_streaming)
export(w3_rename_cols)
export(w4_check_skip_logic)
export(w4_clean_departments)
export(w4_enforce_skip_logic)
export(w4_enforce_streaming)
export(w4_rename_cols)
export(wrap_outputs)
importFrom(dplyr,across)
importFrom(dplyr,all_of)
Expand Down
15 changes: 12 additions & 3 deletions R/ingest.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ get_tidy_data_file <- function(...) {

#' @return the exported data as a dataframe

ingest <- function(survey = "1167489",
ingest <- function(survey = "1376897",
token = Sys.getenv("CARS_TOKEN"),
secret = Sys.getenv("CARS_SECRET"),
proxies = Sys.getenv("alt_proxy"),
Expand Down Expand Up @@ -199,10 +199,12 @@ get_all_waves <- function(mode = c("api", "file")) {

if (mode == "api") {
data <- get_tidy_data_api()
w4_data <- get_tidy_data_api(survey = "1167489")
w3_data <- get_tidy_data_api(survey = "961613")
w2_data <- get_tidy_data_api(survey = "790800")
} else if (mode == "file") {
data <- read_file("2022_data.csv")
w4_data <- read_file("2022_data.csv")
w3_data <- read_file("2021_data.csv")
w2_data <- read_file("2020_data.csv")
}
Expand All @@ -213,7 +215,14 @@ get_all_waves <- function(mode = c("api", "file")) {
apply_skip_logic() %>%
clean_departments() %>%
derive_vars()
data$year <- 2022
data$year <- 2023

w4_data <- w4_data %>%
tidy_colnames() %>%
w4_rename_cols() %>%
w4_enforce_streaming() %>%
w4_clean_departments()
w4_data$year <- 2022

w3_data <- w3_data %>%
tidy_colnames() %>%
Expand All @@ -227,7 +236,7 @@ get_all_waves <- function(mode = c("api", "file")) {
w2_enforce_streaming()
w2_data$year <- 2020

data <- dplyr::bind_rows(data, w3_data, w2_data)
data <- dplyr::bind_rows(data, w4_data, w3_data, w2_data)

return(data)
}
4 changes: 2 additions & 2 deletions R/wave_2_preprocessing.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#' Rename columns (wave 3)
#' Rename columns (wave 2)
#'
#' @description add meaningful column names to dataset ingested from smartsurvey API.
#'
#' @param data CARS wave 3 (2021) survey data (data.frame).
#' @param data CARS wave 2 (2020) survey data (data.frame).
#'
#' @return data.frame
#'
Expand Down
253 changes: 253 additions & 0 deletions R/wave_4_preprocessing.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@

#' @title Rename columns
#'
#' @description Renames columns and removes unnecessary columns
#'
#' @param data tidy CARS dataset
#'
#' @return data.frame
#'
#' @export

w4_rename_cols <- function(data) {

if (class(data) != "data.frame") {
stop("Unexpected input: data is not a data.frame.")
}
if (ncol(data) != 112) {
stop("Unexpected input: incorrect number of columns. Please use the 2022 CARS dataset.")
}

colnames(data)[c(1, 7:ncol(data))] <- c(
"ID",
"started",
"ended",
"tracking_link",
"workplace",
"CS_grade",
"department",
"other_department_name",
"prof_DS",
"prof_DDAT",
"prof_GAD",
"prof_GES",
"prof_geog",
"prof_GORS",
"prof_GSR",
"prof_GSG",
"prof_CS_none",
"prof_CS_other",
"ONS_directorate",
"highest_qualification",
"qual_1_subject",
"qual_1_level",
"qual_1_learn_code",
"qual_2_subject",
"qual_2_level",
"qual_2_learn_code",
"qual_3_subject",
"qual_3_level",
"qual_3_learn_code",
"code_freq",
"management",
"ops_analysis",
"ops_cleaning",
"ops_linking",
"ops_transfer_migration",
"ops_vis",
"ops_machine_learning",
"ops_modelling",
"ops_QA",
"ops_other",
"ops_other_name",
"knowledge_R",
"access_R",
"knowledge_SQL",
"access_SQL",
"knowledge_SAS",
"access_SAS",
"knowledge_VBA",
"access_VBA",
"knowledge_python",
"access_python",
"knowledge_SPSS",
"access_SPSS",
"knowledge_stata",
"access_stata",
"knowledge_JS",
"access_JS",
"knowledge_java",
"access_java",
"knowledge_C",
"access_C",
"knowledge_matlab",
"access_matlab",
"knowledge_access_other",
"knowledge_git",
"access_git",
"other_coding_experience",
"coding_ability_change",
"prev_coding_experience",
"first_learned",
"heard_of_RAP",
"know_RAP_champ",
"strategy_knowledge",
"RAP_confident",
"RAP_supported",
"RAP_resources",
"RAP_components",
"RAP_important",
"RAP_implementing",
"RAP_planning",
"RAP_comments",
"prac_use_open_source",
"prac_open_source_own",
"prac_version_control",
"prac_review",
"prac_functions",
"prac_unit_test",
"prac_package",
"prac_dir_structure",
"prac_style",
"prac_automated_QA",
"prac_AQUA_book",
"doc_comments",
"doc_functions",
"doc_readme",
"doc_desk_notes",
"doc_registers",
"doc_AQA_logs",
"doc_flow_charts",
"doc_other",
"CI",
"dep_management",
"reproducible_workflow",
"misc_coding",
"misc_support",
"misc_additional_data",
"misc_other"
)

data <- data[!colnames(data) %in% c("UserNo", "Name", "Email", "IP.Address", "Unique.ID")]

return(data)
}

#' @title Clean department data
#'
#' @description add NHS to department list and merge departments where needed.
#'
#' @param data cleaned CARS dataset
#'
#' @return CARS dataset
#' @export

w4_clean_departments <- function(data) {

data$department[grepl("forest research", tolower(data$other_department_name))] <- "Forestry Commission"

data$department[data$workplace == "NHS"] <- "NHS"

defra_orgs <- c(
"Department for Environment, Food and Rural Affairs (excl. agencies)",
"Forestry Commission",
"Animal and Plant Health Agency",
"Centre for Environment, Fisheries and Aquaculture Science",
"Rural Payments Agency",
"Environment Agency",
"Marine Management Organisation",
"Natural England"
)

data$defra <- data$department %in% defra_orgs

return(data)

}



#' @title Apply skip logic
#'
#' @description Iteratively applies enforce_skip_logic to the necessary fields in the data.
#'
#' @param data data.frame
#'
#' @return cleaned data.frame
#'
#' @export

w4_enforce_streaming <- function(data) {

conditions <- list(data$workplace %in% c("Civil service, including devolved administations", "test"),
data$department %in% c("Office for National Statistics", "test"),
data$highest_qualification != "Any other qualification",
data$code_freq != "Never",
data$other_coding_experience != "No",
data$prev_coding_experience != "No",
data$heard_of_RAP != "No")

skipped_cols <- list(colnames(data)[which(colnames(data) == "CS_grade"):which(colnames(data) == "ONS_directorate")],
colnames(data)[which(colnames(data) == "ONS_directorate")],
colnames(data)[which(colnames(data) == "qual_1_subject"):which(colnames(data) == "qual_3_learn_code")],
colnames(data)[which(colnames(data) == "prac_use_open_source"):which(colnames(data) == "misc_coding")],
colnames(data)[which(colnames(data) == "coding_ability_change"):which(colnames(data) == "first_learned")],
colnames(data)[which(colnames(data) == "first_learned")],
colnames(data)[which(colnames(data) == "know_RAP_champ"):which(colnames(data) == "RAP_comments")])

for(i in 1:length(conditions)){
data <- w4_enforce_skip_logic(data, conditions[[i]], skipped_cols[[i]])
}

return(data)

}


#' @title Check skip logic
#'
#' @description Checks whether the skip logic was followed correctly. Backtracking while filling the survey can result in inconsistent response sets.
#' This check returns row numbers where questions which should have been skipped contain anything other than NA.
#'
#' @param data data.frame
#' @param condition logical vector. Example: data$row == "skip response"
#' @param skipped_cols character. questions that should have been skipped if condition != TRUE
#'
#' @return list of rows failing the check
#'
#' @export

w4_check_skip_logic <- function(data, condition, skipped_cols) {

condition_failed <- !condition & !is.na(data[skipped_cols])

row_failed <- as.logical(rowSums(condition_failed))


return(
which(row_failed)
)

}

#' @title enforce skip logic
#'
#' @description Replaces values in rows with NAs where check_skip_logic has identified backtracking.
#'
#' @param data data.frame
#' @param condition logical vector. Example: data$row == "skip response"
#' @param skipped_cols character. questions that should have been skipped if condition != TRUE
#'
#' @return data.frame with rows failing the check replaced with NAs
#'
#' @export

w4_enforce_skip_logic <- function(data, condition, skipped_cols) {

row_index <- w4_check_skip_logic(data, condition, skipped_cols)

data[row_index, skipped_cols] <- NA

return(data)

}
2 changes: 1 addition & 1 deletion man/derive_language_status.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/ingest.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/w2_rename_cols.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 22 additions & 0 deletions man/w4_check_skip_logic.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/w4_clean_departments.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit b49755e

Please sign in to comment.