diff --git a/.Rbuildignore b/.Rbuildignore index 8ecf470..b75bf30 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -7,4 +7,5 @@ ^quarto/main$ ^R/test\.R$ ^main\.R$ -^.*.quarto \ No newline at end of file +^.*.quarto +^data-raw$ diff --git a/.gitignore b/.gitignore index f323229..595ee10 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,6 @@ temp/ *.csv *.rda exploratory_scripts/ +docs/ +quarto/qa/summary_qa.html +quarto/summary_qa_files/ diff --git a/DESCRIPTION b/DESCRIPTION index 2288909..e958836 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,3 +33,5 @@ LazyData: true Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 Config/testthat/edition: 3 +Depends: + R (>= 2.10) diff --git a/NAMESPACE b/NAMESPACE index df6995a..7640c9f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,7 +5,10 @@ export(apply_skip_logic) export(break_q_names) export(calculate_freqs) export(check_skip_logic) +export(clean_data) export(clean_departments) +export(clean_first_learned) +export(clean_workplace) export(compare_models) export(create_filtered_pages) export(create_y_lab) @@ -43,6 +46,11 @@ export(w2_enforce_streaming) export(w2_rename_cols) export(w3_enforce_streaming) export(w3_rename_cols) +export(w4_check_skip_logic) +export(w4_clean_departments) +export(w4_enforce_skip_logic) +export(w4_enforce_streaming) +export(w4_rename_cols) export(wrap_outputs) importFrom(dplyr,across) importFrom(dplyr,all_of) diff --git a/R/data_cleaning.R b/R/data_cleaning.R index 2716ae7..e713deb 100644 --- a/R/data_cleaning.R +++ b/R/data_cleaning.R @@ -11,7 +11,7 @@ rename_cols <- function(data) { if (ncol(data) != 112) { - stop("Unexpected input: incorrect number of columns. Please use the 2022 CARS dataset.") + stop("Unexpected input: incorrect number of columns. 
Please use the 2023 CARS dataset.") } colnames(data)[c(1, 7:ncol(data))] <- c( @@ -23,6 +23,7 @@ rename_cols <- function(data) { "CS_grade", "department", "other_department_name", + "prof_DE", "prof_DS", "prof_DDAT", "prof_GAD", @@ -34,6 +35,14 @@ rename_cols <- function(data) { "prof_CS_none", "prof_CS_other", "ONS_directorate", + "pay_band", + "NHS_band", + "NJC_grade", + "primary_work_country", + "England_NHS_organisation", + "Scotland_NHS_organisation", + "Wales_NHS_organisation", + "Northern_Ireland_NHS_organisation", "highest_qualification", "qual_1_subject", "qual_1_level", @@ -46,46 +55,35 @@ rename_cols <- function(data) { "qual_3_learn_code", "code_freq", "management", - "ops_analysis", - "ops_cleaning", - "ops_linking", - "ops_transfer_migration", - "ops_vis", - "ops_machine_learning", - "ops_modelling", - "ops_QA", - "ops_other", - "ops_other_name", - "knowledge_R", + "access_matlab", + "access_python", "access_R", - "knowledge_SQL", - "access_SQL", - "knowledge_SAS", "access_SAS", - "knowledge_VBA", + "access_SPSS", + "access_SQL", + "access_stata", "access_VBA", + "access_open_source_other", + "access_licensed_other", + "access_other_specified", + "knowledge_matlab", "knowledge_python", - "access_python", + "knowledge_R", + "knowledge_SAS", "knowledge_SPSS", - "access_SPSS", + "knowledge_SQL", "knowledge_stata", - "access_stata", - "knowledge_JS", - "access_JS", - "knowledge_java", - "access_java", - "knowledge_C", - "access_C", - "knowledge_matlab", - "access_matlab", - "knowledge_access_other", + "knowledge_VBA", + "knowledge_licensed_other", + "knowledge_open_source_other", + "knowledge_other_specified", "knowledge_git", "access_git", "other_coding_experience", - "coding_ability_change", - "prev_coding_experience", "first_learned", + "coding_ability_change", "heard_of_RAP", + "have_RAP_champ", "know_RAP_champ", "strategy_knowledge", "RAP_confident", @@ -102,11 +100,13 @@ rename_cols <- function(data) { "prac_review", "prac_functions", 
"prac_unit_test", + "prac_other_automated", "prac_package", "prac_dir_structure", "prac_style", "prac_automated_QA", - "prac_AQUA_book", + "prac_development_QA", + "prac_proportionate_QA", "doc_comments", "doc_functions", "doc_readme", @@ -129,6 +129,25 @@ rename_cols <- function(data) { return(data) } +#' @title Clean data +#' +#' @description Recategorise department, workplace and first_learned data +#' +#' @param data cleaned CARS dataset +#' +#' @return CARS dataset +#' @export + +clean_data <- function(data){ + + data %>% + clean_departments() %>% + clean_workplace() %>% + clean_first_learned() + +} + + #' @title Clean department data #' #' @description add NHS to department list and merge departments where needed. @@ -140,13 +159,29 @@ rename_cols <- function(data) { clean_departments <- function(data) { - data$department[grepl("forest research", tolower(data$other_department_name))] <- "Forestry Commission" + data$department[data$department == "Foreign, Commonwealth & Development Office (excl. agencies)"] <- "Foreign, Commonwealth and Development Office (excl. agencies)" data$department[data$workplace == "NHS"] <- "NHS" + data$department[data$other_department_name == "Office for National Statistics"] <- "Office for National Statistics" + + data$department[data$other_department_name == "Data Science Campus"] <- "Office for National Statistics" + + data$department[data$other_department_name == "Welsh Revenue Authority"] <- "Welsh Government" + + data$department[data$other_department_name == "Equality Hub, Cabinet Office"] <- "Cabinet Office (excl. agencies)" + + data$department[data$other_department_name == "Natural England"] <- "Natural England" + + data$department[data$other_department_name == "Department for Communities"] <- "Northern Ireland Executive" + + data$department[data$other_department_name == "Department of Education Northern Ireland"] <- "Northern Ireland Executive" + defra_orgs <- c( "Department for Environment, Food and Rural Affairs (excl. 
agencies)", "Forestry Commission", + "Forest Research", + "Forestry England", "Animal and Plant Health Agency", "Centre for Environment, Fisheries and Aquaculture Science", "Rural Payments Agency", @@ -161,3 +196,60 @@ clean_departments <- function(data) { } +#' @title Clean workplace data +#' +#' @description reclassify 'other' text responses into CS/NHS +#' +#' @param data cleaned CARS dataset +#' +#' @return CARS dataset +#' @export + +clean_workplace <- function(data) { + + data$workplace[data$workplace == "MOD"] <- "Civil service, including devolved administrations" + + data$workplace[data$workplace == "HMRC"] <- "Civil service, including devolved administrations" + + data$workplace[data$workplace == "The Pensions Regulator"] <- "Civil service, including devolved administrations" + + data$workplace[data$workplace == "Scottish Funding Council"] <- "Civil service, including devolved administrations" + + data$workplace[data$workplace == "Office for Students"] <- "Civil service, including devolved administrations" + + data$workplace[data$workplace == "Office for students"] <- "Civil service, including devolved administrations" + + data$workplace[data$workplace == "OfS"] <- "Civil service, including devolved administrations" + + data$workplace[data$workplace == "Dstl"] <- "Civil service, including devolved administrations" + + return(data) + +} + +#' @title Clean first learned data +#' +#' @description reclassify 'other' free text responses into self-taught based on common terms used +#' +#' @param data cleaned CARS dataset +#' +#' @return CARS dataset +#' @export + +clean_first_learned <- function(data) { + + matches <- c("self", + "hobby", + "personal", + "independ", + "home", + "for fun", + "free time", + "spare time", + "childhood") + + data$first_learned[stringr::str_detect(tolower(data$first_learned), stringr::str_c(matches, collapse = "|"))] <- "Self-taught" + + return(data) + +} diff --git a/R/derive_vars.R b/R/derive_vars.R index 158ad5e..23e3353 100644 
--- a/R/derive_vars.R +++ b/R/derive_vars.R @@ -11,7 +11,8 @@ derive_vars <- function(data) { data <- data %>% derive_language_status() %>% - derive_rap_score() + derive_rap_score() %>% + derive_rap_champ_status() return(data) } @@ -20,7 +21,7 @@ derive_vars <- function(data) { #' @title Derive language status #' -#' @description Derve the status of each programmming language as "access" (access only), "knowledge" (knowledge only), "both" or "neither". +#' @description Derive the status of each programmming language as "access" (access only), "knowledge" (knowledge only), "both" or "neither". #' #' @param data tidied CARS wave 3 data (data.frame). #' @@ -85,7 +86,7 @@ derive_basic_rap_scores <- function(data) { "prac_open_source_own", "prac_version_control", "prac_review", - "prac_AQUA_book", + "prac_proportionate_QA", "doc_comments", "doc_readme") @@ -102,7 +103,7 @@ derive_basic_rap_scores <- function(data) { "open_code_score", "version_control_score", "peer_review_score", - "AQUA_book_score", + "proportionate_QA_score", "doc_score") high_vals <- c("Regularly", "All the time") @@ -115,8 +116,8 @@ derive_basic_rap_scores <- function(data) { .x %in% high_vals ~ 1, TRUE ~ 0), .names = "{.col}_score")) %>% - mutate(doc_score = as.integer(.data$doc_comments_score & .data$doc_readme_score)) %>% - select(-c(.data$doc_comments_score, .data$doc_readme_score)) %>% + mutate(doc_score = as.integer(doc_comments_score & doc_readme_score)) %>% + select(-c(doc_comments_score, doc_readme_score)) %>% rename_with(~ score_col_names[which(paste0(prac_cols, "_score") == .x)], .cols = paste0(prac_cols, "_score")) %>% @@ -181,3 +182,26 @@ derive_advanced_rap_scores <- function(data) { return(data) } + + +#' @title Derive RAP Champion status +#' +#' @description Derive RAP Champion status column from existing variables and add to the dataframe. 
+#' +#' @param data a date frame containing cleaned CARS wave 5 data +#' +#' @return dataframe containing the additional RAP Champion status columns +#' +#' @importFrom dplyr mutate case_when +derive_rap_champ_status <- function(data){ + + data <- data %>% + mutate(RAP_champ_status = case_when(have_RAP_champ == "Yes" & know_RAP_champ == "Yes, and I am a RAP Champion" ~ "Yes, and I am a RAP Champion", + have_RAP_champ == "Yes" & know_RAP_champ == "Yes" ~ "Yes, and I know who the RAP Champion is", + have_RAP_champ == "Yes" & know_RAP_champ == "No" ~ "Yes, but I don't know who the RAP Champion is", + have_RAP_champ == "No" ~ "No", + have_RAP_champ == "Don't know" ~ "I don't know")) + +} + + diff --git a/R/frequency-tables.R b/R/frequency-tables.R index 974a6e6..d6110ab 100644 --- a/R/frequency-tables.R +++ b/R/frequency-tables.R @@ -4,49 +4,50 @@ #' #' @param data full CARS dataset after pre-processing #' @param all_tables logical: whether to produce all summary output tables. Defaults to FALSE. +#' @param sample additionally returns count and sample size for selected tables for QA. 
FALSE by default #' #' @return list of frequency tables #' #' @export -summarise_all <- function(data, all_tables = FALSE) { +summarise_all <- function(data, all_tables = FALSE, sample = FALSE) { output_list <- list( - code_freq = summarise_code_freq(data), - operations = summarise_operations(data), - knowledge = summarise_coding_tools(data, "knowledge"), - access = summarise_coding_tools(data, "access"), + code_freq = summarise_code_freq(data, sample = sample), + knowledge = summarise_coding_tools(data, "knowledge", sample = sample), + access = summarise_coding_tools(data, "access", sample = sample), language_status = summarise_language_status(data), - where_learned = summarise_where_learned_code(data), - ability_change = summarise_ability_change(data), - coding_practices = summarise_coding_practices(data), - doc = summarise_doc(data), - rap_knowledge = summarise_rap_knowledge(data), - rap_opinions = summarise_rap_opinions(data), + where_learned = summarise_where_learned_code(data, sample = sample), + ability_change = summarise_ability_change(data, sample = sample), + coding_practices = summarise_coding_practices(data, sample = sample), + doc = summarise_doc(data, sample = sample), + rap_knowledge = summarise_rap_knowledge(data, sample = sample), + rap_champ_status = summarise_rap_champ_status(data, sample = sample), + rap_opinions = summarise_rap_opinions(data, sample = sample), basic_rap_scores = summarise_rap_basic(data), advanced_rap_scores = summarise_rap_advanced(data), - rap_components = summarise_rap_comp(data), + rap_components = summarise_rap_comp(data, sample = sample), ci = summarise_ci(data), dependency_management = summarise_dep_man(data), rep_workflow = summarise_rep_workflow(data), line_manage = summarise_line_manage(data), git_knowledge = summarise_knowledge_git(data), - git_access = summarise_access_git(data), - strategy_knowledge = summarise_strategy_knowledge(data) + git_access = summarise_access_git(data, sample = sample), + strategy_knowledge 
= summarise_strategy_knowledge(data, sample = sample) ) if (all_tables) { output_list <- c(output_list, list( - capability_change_by_freq = summarise_cap_change_by_freq(data), + capability_change_by_freq = summarise_cap_change_by_freq(data, sample = sample), capability_change_by_line_manage = summarise_cap_change_by_line_manage(data), capability_change_by_CS_grade = summarise_cap_change_by_CS_grade(data), basic_score_by_implementation = summarise_basic_score_by_imp(data), adv_score_by_implementation = summarise_adv_score_by_imp(data), basic_score_by_understanding = summarise_basic_score_by_understanding(data), adv_score_by_understanding = summarise_adv_score_by_understanding(data), - languages_by_prof = summarise_languages_by_prof(data), + languages_by_prof = summarise_languages_by_prof(data, sample = sample), open_source_by_prof = summarise_open_source_by_prof(data), heard_of_RAP_by_prof = summarise_heard_of_RAP_by_prof(data) )) @@ -63,15 +64,27 @@ summarise_all <- function(data, all_tables = FALSE) { #' @return list of sample sizes #' #' @export +#' sample_sizes <- function(data) { list( all = nrow(data), code_at_work = sum(!is.na(data$code_freq) & data$code_freq != "Never"), - can_code = sum(data$code_freq != "Never" | (data$other_coding_experience == "Yes" & data$prev_coding_experience != "No")), - other_code_experience = sum(!is.na(data$other_coding_experience ) & data$other_coding_experience == "Yes"), - heard_of_RAP = sum(!is.na(data$heard_of_RAP) & data$heard_of_RAP == "Yes"), - not_RAP_champ = sum(is.na(data$know_RAP_champ) | data$know_RAP_champ != "I am a RAP champion") + other_code_experience = sum(!is.na(data$code_freq) & data$code_freq != "Never" & data$other_coding_experience == "Yes" & data$first_learned != "Current employment"), + heard_of_RAP = sum(!is.na(data$code_freq) & data$code_freq != "Never" & data$heard_of_RAP == "Yes"), + not_RAP_champ = sum(is.na(data$know_RAP_champ) | data$know_RAP_champ != "I am a RAP champion"), + + profs = 
sapply(c("prof_DE", "prof_DS", "prof_DDAT", "prof_GAD", "prof_GES", + "prof_geog", "prof_GORS", "prof_GSR", "prof_GSG"), + function(prof) { + prof_count <- sum(data[prof] == "Yes", na.rm = TRUE) + if (prof_count > 0) { + prof_sample <- paste0(prof_count, " (", substring(prof, 6), ")") + return(prof_sample) + } + } + ) + ) } @@ -81,10 +94,11 @@ sample_sizes <- function(data) { #' @description calculate frequency table for coding frequency. #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return frequency table (data.frame) -summarise_code_freq <- function(data) { +summarise_code_freq <- function(data, sample = FALSE) { # Validation checks if (!"code_freq" %in% colnames(data)) { @@ -99,36 +113,9 @@ summarise_code_freq <- function(data) { "Regularly", "All the time") - frequencies <- calculate_freqs(data, questions, levels) - - return(frequencies) -} - - -#' @title Summarise data operations -#' -#' @description calculate frequency table for data operations -#' -#' @param data full CARS dataset after pre-processing -#' -#' @return frequency table (data.frame) - -summarise_operations <- function(data) { - - questions <- c("ops_analysis", "ops_cleaning", "ops_linking", - "ops_transfer_migration", "ops_vis", "ops_machine_learning", - "ops_modelling", "ops_QA") - - levels <- c("I do some or all of this by coding", "I do this without coding") - - labels <- c("Data analysis", "Data cleaning", "Data linking", - "Data transfer / migration", "Data visualisation", - "Machine learning", "Modelling", "Quality assurance") - - frequencies <- calculate_freqs(data, questions, levels, labels) + frequencies <- calculate_freqs(data, questions, levels, sample = sample) return(frequencies) - } @@ -139,44 +126,50 @@ summarise_operations <- function(data) { #' @param data full CARS dataset after pre-processing #' @param type type of table (knowledge or access) #' @param prop whether to return 
proportion data (0-1). TRUE by default. Assumes mutually exclusive response options. +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return frequency table (data.frame) -summarise_coding_tools <- function(data, type = list("knowledge", "access"), prop = TRUE) { +summarise_coding_tools <- function(data, type = list("knowledge", "access"), prop = TRUE, sample = FALSE) { questions <- c("knowledge_R", "access_R", "knowledge_SQL", "access_SQL", "knowledge_SAS", "access_SAS", "knowledge_VBA", "access_VBA", "knowledge_python", "access_python", "knowledge_SPSS", - "access_SPSS", "knowledge_stata", "access_stata", "knowledge_JS", - "access_JS", "knowledge_java", "access_java", "knowledge_C", - "access_C", "knowledge_matlab", "access_matlab") + "access_SPSS", "knowledge_stata", "access_stata", + "knowledge_matlab", "access_matlab") - levels <- c("Yes", "Don't Know", "No") + if (type == "knowledge") { + levels <- c("Yes", "No", "Not required for my work") + } else { + levels <- c("Yes", "No", "Don't know") + } - labels <- c("R", "SQL", "SAS", "VBA", "Python", "SPSS", "Stata", - "Javascript / Typescript", "Java / Scala", "C++ / C#", "Matlab") + labels <- c("R", "SQL", "SAS", "VBA", "Python", "SPSS", "Stata", "Matlab") type <- match.arg(type, several.ok = TRUE) questions <- questions[grepl(paste0(type, "_"), questions)] - frequencies <- calculate_freqs(data, questions, levels, labels, prop = prop) + frequencies <- calculate_freqs(data, questions, levels, labels, prop = prop, sample = sample) %>% + dplyr::arrange(match(name, c("Python", "R", "SQL", "Matlab", "SAS", "SPSS", "Stata", "VBA"))) return(frequencies) } + #' @title Summarise where respondents learned to code #' #' @description calculate frequency table of where respondents learned to code #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. 
FALSE by default #' #' @return frequency table (data.frame) #' #' @importFrom dplyr select mutate case_when -summarise_where_learned_code <- function(data){ +summarise_where_learned_code <- function(data, sample = FALSE){ # Validation checks if (!"first_learned" %in% colnames(data)) { @@ -188,31 +181,25 @@ summarise_where_learned_code <- function(data){ if (!"other_coding_experience" %in% colnames(data)) { stop("unexpected_input: no column called 'other_coding_experience'") } - if (!"prev_coding_experience" %in% colnames(data)) { - stop("unexpected_input: no column called 'prev_coding_experience'") - } questions <- "first_learned" - levels <- c("In current role", - "In education", - "In private sector employment", - "In public sector employment", + levels <- c("Current employment", + "Education", + "Previous private sector employment", + "Previous public sector employment", "Self-taught", "Other") data <- data %>% - select(first_learned, prev_coding_experience, code_freq) %>% + select(first_learned, code_freq) %>% mutate( - first_learned = case_when((data$other_coding_experience == "No" | - data$prev_coding_experience == "No") & - data$code_freq != "Never" ~ "In current role", + first_learned = case_when((data$other_coding_experience == "No") & + data$code_freq != "Never" ~ "Current employment", !is.na(data$first_learned) & !(data$first_learned %in% levels) ~ "Other", TRUE ~ first_learned)) - data$prev_coding_experience[data$other_coding_experience == "No"] <- "No" - - frequencies <- calculate_freqs(data, questions, levels) + frequencies <- calculate_freqs(data, questions, levels, sample = sample) return(frequencies) } @@ -223,15 +210,17 @@ summarise_where_learned_code <- function(data){ #' @description calculate frequency table for data practices #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. 
FALSE by default #' #' @return frequency table (data.frame) -summarise_coding_practices <- function(data) { +summarise_coding_practices <- function(data, sample = FALSE) { questions <- c("prac_use_open_source", "prac_open_source_own", "prac_version_control", "prac_review", "prac_functions", "prac_unit_test", "prac_package", "prac_dir_structure", - "prac_style", "prac_automated_QA", "prac_AQUA_book") + "prac_style", "prac_automated_QA", "prac_development_QA", + "prac_proportionate_QA") levels <- c("I don't understand this question", "Never", "Rarely", "Sometimes", "Regularly", "All the time") @@ -246,9 +235,10 @@ summarise_coding_practices <- function(data) { "Standard directory structure", "Coding guidelines / Style guides", "Automated data quality assurance", - "Apply AQUA book principles with analysis code") + "Quality assurance throughout development", + "Proportionate quality assurance") - frequencies <- calculate_freqs(data, questions, levels, labels) + frequencies <- calculate_freqs(data, questions, levels, labels, sample = sample) return(frequencies) @@ -308,40 +298,59 @@ summarise_rap_advanced <- function(data){ #' @description Create a frequency table of knowledge of RAP #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. 
FALSE by default #' #' @return frequency table (data.frame) -summarise_rap_knowledge <- function(data){ +summarise_rap_knowledge <- function(data, sample = FALSE){ - # Validation checks - if (!"heard_of_RAP" %in% colnames(data)) { - stop("unexpected_input: no column called 'heard_of_RAP'") - } + data <- data[data$code_freq != "Never", ] - questions <- "know_RAP_champ" + questions <- "heard_of_RAP" - levels <- c("Have not heard of RAP", - "I don't know what a RAP champion is", - "I know what a RAP champion is but don't know who the RAP champion in my department is", - "I know what a RAP champion is and there is no RAP champion in my department", - "I know who the RAP champion in my department is") + levels <- c("Yes", + "No") - data$know_RAP_champ[data$heard_of_RAP == "No"] <- "Have not heard of RAP" + frequencies <- calculate_freqs(data, questions, levels, sample = sample) + return(frequencies) +} - frequencies <- calculate_freqs(data, questions, levels) + +#' @title Knowledge of RAP Champions +#' +#' @description Create a frequency table of knowledge of RAP Champions +#' +#' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. FALSE by default +#' +#' @return frequency table (data.frame) + +summarise_rap_champ_status <- function(data, sample = FALSE){ + + questions <- "RAP_champ_status" + + levels <- c("Yes, and I am a RAP Champion", + "Yes, and I know who the RAP Champion is", + "Yes, but I don't know who the RAP Champion is", + "No", + "I don't know") + + frequencies <- calculate_freqs(data, questions, levels, sample = sample) return(frequencies) } + #' @title Opinions of RAP #' #' @description Create frequency table of opinions of RAP #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. 
FALSE by default #' #' @return frequency table (data.frame) -summarise_rap_opinions <- function(data) { +summarise_rap_opinions <- function(data, sample = FALSE) { # Validation checks if (!"heard_of_RAP" %in% colnames(data)) { @@ -373,7 +382,7 @@ summarise_rap_opinions <- function(data) { "I or my team are planning on implementing RAP in the next 12 months") - frequencies <- calculate_freqs(opinion_rap_data, questions, levels, labels) + frequencies <- calculate_freqs(opinion_rap_data, questions, levels, labels, sample = sample) return(frequencies) @@ -385,10 +394,11 @@ summarise_rap_opinions <- function(data) { #' @description Create frequency table of documentation use #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return frequency table (data.frame) -summarise_doc <- function(data) { +summarise_doc <- function(data, sample = FALSE) { # Validation checks if (!"code_freq" %in% colnames(data)) { @@ -421,7 +431,7 @@ summarise_doc <- function(data) { "Flow charts") - frequencies <- calculate_freqs(documentation_data, questions, levels, labels) + frequencies <- calculate_freqs(documentation_data, questions, levels, labels, sample = sample) return(frequencies) @@ -432,18 +442,19 @@ summarise_doc <- function(data) { #' @description Create frequency table of basic and advanced RAP score components #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. 
FALSE by default #' #' @return frequency table (data.frame) #' #' @importFrom dplyr mutate arrange -summarise_rap_comp <- function(data) { +summarise_rap_comp <- function(data, sample = FALSE) { labels <- c("Use open source software", "Team open source code", "Version control", "Peer review", - "AQUA book guidance", + "Proportionate QA", "Documentation", "Functions", "Unit testing", @@ -457,7 +468,7 @@ summarise_rap_comp <- function(data) { "open_code_score", "version_control_score", "peer_review_score", - "AQUA_book_score", + "proportionate_QA_score", "doc_score", "function_score", "unit_test_score", @@ -475,10 +486,17 @@ summarise_rap_comp <- function(data) { mutate(name = factor(name, levels = labels)) %>% arrange(name) %>% mutate(value = c(rep("Basic", 6), rep("Advanced", 7))) %>% - mutate(n = colSums(data[questions], na.rm = TRUE) / sum(data$code_freq != "Never")) + mutate(n = colSums(data[questions], na.rm = TRUE) / sum(data$code_freq != "Never", na.rm = TRUE)) names(components$n) <- NULL + if (sample == TRUE) { + components <- components %>% + mutate(count = colSums(data[questions], na.rm = TRUE)) + + components$sample <- sum(data$code_freq != "Never", na.rm = TRUE) + } + return(components) } @@ -573,25 +591,35 @@ summarise_rep_workflow <- function(data) { #' @description calculate frequency table for ability change #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. 
FALSE by default #' #' @return frequency table (data.frame) -summarise_ability_change <- function(data) { +summarise_ability_change <- function(data, sample = FALSE) { # Validation checks if (!"coding_ability_change" %in% colnames(data)) { stop("unexpected_input: no column called 'coding_ability_change'") } + data <- data[data$first_learned != "Current employment", ] + questions <- "coding_ability_change" - levels <- c("Significantly worse", - "Slightly worse", - "No change", - "Slightly better", - "Significantly better") + levels <- c("It has become significantly worse", + "It has become slightly worse", + "It has stayed the same", + "It has become slightly better", + "It has become significantly better") - frequencies <- calculate_freqs(data, questions, levels) + frequencies <- calculate_freqs(data, questions, levels, sample = sample) + + frequencies$value <- frequencies$value %>% + dplyr::recode_factor("It has become significantly worse" = "Significantly worse", + "It has become slightly worse" = "Slightly worse", + "It has stayed the same" = "Stayed the same", + "It has become slightly better" = "Slightly better", + "It has become significantly better" = "Significantly better") return(frequencies) @@ -617,9 +645,6 @@ summarise_language_status <- function(data) { "status_python", "status_SPSS", "status_stata", - "status_JS", - "status_java", - "status_C", "status_matlab") levels <- c("Access Only", "Both", "Knowledge Only") @@ -631,9 +656,6 @@ summarise_language_status <- function(data) { "Python", "SPSS", "Stata", - "Javascript / Typescript", - "Java / Scala", - "C++ / C#", "Matlab") frequencies <- calculate_freqs(data, questions, levels, labels) @@ -699,10 +721,11 @@ summarise_knowledge_git <- function(data){ #' @description calculate frequency table for if someone has access to git #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. 
FALSE by default #' #' @return frequency table (data.frame) -summarise_access_git <- function(data){ +summarise_access_git <- function(data, sample = FALSE){ # Validation checks if (!"access_git" %in% colnames(data)) { @@ -715,7 +738,7 @@ summarise_access_git <- function(data){ "No", "I don't know") - frequencies <- calculate_freqs(data, questions, levels) + frequencies <- calculate_freqs(data, questions, levels, sample = sample) return(frequencies) @@ -727,10 +750,11 @@ summarise_access_git <- function(data){ #' @description calculate frequency table for if someone heard of or read the RAP strategy #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return frequency table (data.frame) -summarise_strategy_knowledge <- function(data){ +summarise_strategy_knowledge <- function(data, sample = FALSE){ # Validation checks if (!"strategy_knowledge" %in% colnames(data)) { @@ -744,11 +768,11 @@ summarise_strategy_knowledge <- function(data){ questions <- "strategy_knowledge" - levels <- c("I have not heard of the RAP strategy", - "I have heard of the RAP strategy, but I haven't read it", - "I have read the RAP strategy") + levels <- c("Yes", + "Yes, but I haven't read it", + "No") - frequencies <- calculate_freqs(data, questions, levels) + frequencies <- calculate_freqs(data, questions, levels, sample = sample) return(frequencies) @@ -760,30 +784,32 @@ summarise_strategy_knowledge <- function(data){ #' @description calculate the cross tab of coding frequency by capability change #' #' @param data full CARS dataset after pre-processing +#' @param sample returns proportion, count and, group size and sample size. 
FALSE by default #' #' @return frequency table (data.frame) -summarise_cap_change_by_freq <- function(data){ +summarise_cap_change_by_freq <- function(data, sample = FALSE){ col1 <- "code_freq" col2 <- "coding_ability_change" + data <- dplyr::filter(data, (code_freq != "Never" & other_coding_experience == "Yes" & data$first_learned != "Current employment")) + levels1 <- c( - "Never", "Rarely", "Sometimes", "Regularly", "All the time") levels2 <- c( - "Significantly worse", - "Slightly worse", - "No change", - "Slightly better", - "Significantly better") + "It has become significantly worse", + "It has become slightly worse", + "It has stayed the same", + "It has become slightly better", + "It has become significantly better") - frequencies <- calculate_multi_table_freqs(data, col1, col2, levels1, levels2) + frequencies <- calculate_multi_table_freqs(data, col1, col2, levels1, levels2, sample = sample) return(frequencies) @@ -792,7 +818,7 @@ summarise_cap_change_by_freq <- function(data){ #' @title Summarise capability change by management responsibility #' -#' @description calculate the cross tab of capability change by management responsibilty +#' @description calculate the cross tab of capability change by management responsibility #' #' @param data full CARS dataset after pre-processing #' @@ -809,11 +835,11 @@ summarise_cap_change_by_line_manage <- function(data){ "No - I don't line manage anyone") levels2 <- c( - "Significantly worse", - "Slightly worse", - "No change", - "Slightly better", - "Significantly better") + "It has become significantly worse", + "It has become slightly worse", + "It has stayed the same", + "It has become slightly better", + "It has become significantly better") frequencies <- calculate_multi_table_freqs(data, col1, col2, levels1, levels2) @@ -841,11 +867,11 @@ summarise_cap_change_by_CS_grade <- function(data){ "Grade 6 and 7") levels2 <- c( - "Significantly worse", - "Slightly worse", - "No change", - "Slightly better", - 
"Significantly better") + "It has become significantly worse", + "It has become slightly worse", + "It has stayed the same", + "It has become slightly better", + "It has become significantly better") selected_data <- data %>% dplyr::select(CS_grade, coding_ability_change) %>% @@ -985,17 +1011,19 @@ summarise_adv_score_by_understanding <- function(data){ #' @description only used the main summary page. Needs to be turned into wide data for html table. #' #' @param data CARS data (pre-processed) +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return data.frame #' #' @importFrom dplyr recode -summarise_languages_by_prof <- function(data) { +summarise_languages_by_prof <- function(data, sample = FALSE) { - profs <- c("prof_DS", "prof_DDAT", "prof_GAD", "prof_GES", "prof_geog", + profs <- c("prof_DE", "prof_DS", "prof_DDAT", "prof_GAD", "prof_GES", "prof_geog", "prof_GORS", "prof_GSR", "prof_GSG") - prof_names <- c("Data scientists", + prof_names <- c("Data engineers", + "Data scientists", "Digital and data (DDAT)", "Actuaries", "Economists (GES)", @@ -1007,11 +1035,11 @@ summarise_languages_by_prof <- function(data) { names(prof_names) <- profs outputs <- lapply(profs, function(prof) { - filtered_data <- data[data[prof] == "Yes", ] + filtered_data <- dplyr::filter(data, get(prof) == "Yes") if(nrow(filtered_data) > 0) { - output <- summarise_coding_tools(filtered_data, "knowledge") + output <- summarise_coding_tools(filtered_data, "knowledge", sample = sample) # Retain frequencies for "Yes" responses only output <- output[output[[2]] == "Yes", ] @@ -1045,10 +1073,11 @@ summarise_languages_by_prof <- function(data) { summarise_open_source_by_prof <- function(data) { - profs <- c("prof_DS", "prof_DDAT", "prof_GAD", "prof_GES", "prof_geog", + profs <- c("prof_DE", "prof_DS", "prof_DDAT", "prof_GAD", "prof_GES", "prof_geog", "prof_GORS", "prof_GSR", "prof_GSG") - prof_names <- c("Data scientists", + prof_names <- c("Data engineers", + 
"Data scientists", "Digital and data (DDAT)", "Actuaries", "Economists (GES)", @@ -1060,7 +1089,7 @@ summarise_open_source_by_prof <- function(data) { names(prof_names) <- profs outputs <- lapply(profs, function(prof) { - filtered_data <- data[data[prof] == "Yes", ] + filtered_data <- dplyr::filter(data, get(prof) == "Yes") if(nrow(filtered_data) > 0) { @@ -1096,15 +1125,16 @@ summarise_open_source_by_prof <- function(data) { summarise_heard_of_RAP_by_prof <- function(data) { - filtered_data <- dplyr::filter(data, workplace == "Civil service, including devolved administations") + filtered_data <- dplyr::filter(data, workplace == "Civil service, including devolved administrations") filtered_RAP_data <- dplyr::filter(filtered_data, heard_of_RAP == "Yes") questions <- c("heard_of_RAP") - profs <- c("prof_DS", "prof_DDAT", "prof_GAD", "prof_GES", "prof_geog", + profs <- c("prof_DE", "prof_DS", "prof_DDAT", "prof_GAD", "prof_GES", "prof_geog", "prof_GORS", "prof_GSR", "prof_GSG") - prof_names <- c("Data scientists", + prof_names <- c("Data engineers", + "Data scientists", "Digital and data (DDAT)", "Actuaries", "Economists (GES)", @@ -1125,6 +1155,9 @@ summarise_heard_of_RAP_by_prof <- function(data) { 1)) rownames(frequencies) <- NULL + names(frequencies$n) <- NULL + + frequencies$value <- recode(frequencies$value, !!!prof_names) return(frequencies) @@ -1141,6 +1174,8 @@ summarise_heard_of_RAP_by_prof <- function(data) { #' @export summarise_os_vs_prop <- function(data) { + + data$open_source_lang_knowledge <- ifelse( data$knowledge_python == "Yes" | data$knowledge_R == "Yes", TRUE, FALSE @@ -1159,7 +1194,7 @@ summarise_os_vs_prop <- function(data) { data.frame %>% get_ci(freq_col = 2, n_col = 3) - os_freqs <- cbind(lang_type = "open source", os_freqs) + os_freqs <- cbind(lang_type = "Open Source", os_freqs) prop_freqs <- data %>% dplyr::group_by(year) %>% @@ -1167,11 +1202,11 @@ summarise_os_vs_prop <- function(data) { data.frame %>% get_ci(freq_col = 2, n_col = 3) - 
prop_freqs <- cbind(lang_type = "proprietary", prop_freqs) + prop_freqs <- cbind(lang_type = "Proprietary", prop_freqs) grouped_lang_freqs <- rbind(os_freqs, prop_freqs) grouped_lang_freqs$year <- as.character(grouped_lang_freqs$year) - grouped_lang_freqs$lang_type <- factor(grouped_lang_freqs$lang_type, levels = c("open source", "proprietary")) + grouped_lang_freqs$lang_type <- factor(grouped_lang_freqs$lang_type, levels = c("Open Source", "Proprietary")) return(grouped_lang_freqs) } @@ -1185,6 +1220,8 @@ summarise_os_vs_prop <- function(data) { summarise_rap_awareness_over_time <- function(data) { + data <- data[data$code_freq != "Never", ] + RAP_awareness <- table(data$heard_of_RAP, data$year) %>% data.frame %>% dplyr::group_by(Var2) %>% @@ -1205,6 +1242,7 @@ summarise_rap_awareness_over_time <- function(data) { #' @param levels all possible factor values in the filtered columns #' @param labels labels to rename the column headers. Only needed for multi-column frequencies #' @param prop whether to return proportion data (0-1). TRUE by default. Assumes mutually exclusive response options. +#' @param sample additionally returns count and sample size. 
FALSE by default #' #' @return data.frame #' @@ -1213,7 +1251,7 @@ summarise_rap_awareness_over_time <- function(data) { #' @importFrom dplyr select all_of group_by count mutate recode arrange #' @importFrom tidyr pivot_longer drop_na -calculate_freqs <- function(data, questions, levels, labels = NULL, prop = TRUE){ +calculate_freqs <- function(data, questions, levels, labels = NULL, prop = TRUE, sample = FALSE){ if (!is.null(labels)) { labels_list <- as.list(labels) @@ -1232,6 +1270,13 @@ calculate_freqs <- function(data, questions, levels, labels = NULL, prop = TRUE) colnames(frequencies) <- c("value", "n") + if (sample == TRUE) { + frequencies <- frequencies %>% + mutate(count = n) + + frequencies$sample <- sum(!is.na(selected_data[1])) + } + if (prop) { frequencies$n <- frequencies$n / ifelse(sum(frequencies$n, na.rm = TRUE)==0, @@ -1253,6 +1298,13 @@ calculate_freqs <- function(data, questions, levels, labels = NULL, prop = TRUE) colnames(frequencies) <- c("name", "value", "n") + if (sample == TRUE) { + frequencies <- frequencies %>% + mutate(count = n) + + frequencies$sample <- sum(!is.na(selected_data[1])) + } + if (prop) { frequencies <- prop_by_group(frequencies) } @@ -1261,6 +1313,7 @@ calculate_freqs <- function(data, questions, levels, labels = NULL, prop = TRUE) return(frequencies) } + #' @title Create tidy cross table #' #' @description Returns a cross table in tidy data format. @@ -1271,12 +1324,13 @@ calculate_freqs <- function(data, questions, levels, labels = NULL, prop = TRUE) #' @param levels1 factor levels for col1 #' @param levels2 factor levels for col2 #' @param prop whether to return proportion data (0-1). TRUE by default. Assumes mutually exclusive response options. +#' @param sample returns proportion, count and, group size and sample size. 
FALSE by default #' #' @return data.frame #' #' @importFrom dplyr all_of across -calculate_multi_table_freqs <- function(data, col1, col2, levels1, levels2, prop = TRUE){ +calculate_multi_table_freqs <- function(data, col1, col2, levels1, levels2, prop = TRUE, sample = FALSE){ selected_data <- data %>% dplyr::select(all_of(c(col1, col2))) @@ -1285,18 +1339,30 @@ calculate_multi_table_freqs <- function(data, col1, col2, levels1, levels2, prop selected_data[col2] <- factor(selected_data[[col2]], levels = levels2) frequencies <- selected_data %>% - count(across(c(col1, col2)), .drop=FALSE) %>% + count(across(all_of(c(col1, col2))), .drop=FALSE) %>% drop_na() %>% data.frame() + if (sample == TRUE) { + frequencies <- frequencies %>% + group_by_at(1) %>% + mutate(count = n, + group_size = sum(n)) + + frequencies$sample <- sum(!is.na(selected_data[1])) + } + if(prop){ frequencies <- prop_by_group(frequencies) } + return(frequencies) } + + #' @title Convert frequencies to proportions #' #' @param data frequency table with three columns (can be of any name): name, value and count diff --git a/R/ingest.R b/R/ingest.R index ca0ab0b..62069a8 100644 --- a/R/ingest.R +++ b/R/ingest.R @@ -52,7 +52,7 @@ get_tidy_data_file <- function(...) 
{ #' @return the exported data as a dataframe -ingest <- function(survey = "1167489", +ingest <- function(survey = "1376897", token = Sys.getenv("CARS_TOKEN"), secret = Sys.getenv("CARS_SECRET"), proxies = Sys.getenv("alt_proxy"), @@ -199,10 +199,12 @@ get_all_waves <- function(mode = c("api", "file")) { if (mode == "api") { data <- get_tidy_data_api() + w4_data <- get_tidy_data_api(survey = "1167489") w3_data <- get_tidy_data_api(survey = "961613") w2_data <- get_tidy_data_api(survey = "790800") } else if (mode == "file") { - data <- read_file("2022_data.csv") + data <- read_file("2023_data.csv") + w4_data <- read_file("2022_data.csv") w3_data <- read_file("2021_data.csv") w2_data <- read_file("2020_data.csv") } @@ -211,9 +213,16 @@ get_all_waves <- function(mode = c("api", "file")) { tidy_colnames() %>% rename_cols() %>% apply_skip_logic() %>% - clean_departments() %>% + clean_data() %>% derive_vars() - data$year <- 2022 + data$year <- 2023 + + w4_data <- w4_data %>% + tidy_colnames() %>% + w4_rename_cols() %>% + w4_enforce_streaming() %>% + w4_clean_departments() + w4_data$year <- 2022 w3_data <- w3_data %>% tidy_colnames() %>% @@ -227,7 +236,7 @@ get_all_waves <- function(mode = c("api", "file")) { w2_enforce_streaming() w2_data$year <- 2020 - data <- dplyr::bind_rows(data, w3_data, w2_data) + data <- dplyr::bind_rows(data, w4_data, w3_data, w2_data) return(data) } diff --git a/R/plot.R b/R/plot.R index 6d1f7d1..224dc47 100644 --- a/R/plot.R +++ b/R/plot.R @@ -22,7 +22,8 @@ #' #' @export -freq_subplots <- function(data, xlab, ylab, height, width, bar_colour, nrows = 3, y_margin = .1, x_margin = .1, font_size = 12, orientation = "h") { +freq_subplots <- function(data, xlab, ylab, height, width, bar_colour, nrows = 3, + y_margin = .1, x_margin = .1, font_size = 12, orientation = "h") { if (nrows == 1) { stop("Unexpected input: n_rows should be 2 or greater.") @@ -132,7 +133,10 @@ freq_subplots <- function(data, xlab, ylab, height, width, bar_colour, nrows = 3 #' 
#' @export -plot_freqs <- function(data, n, colour, break_q_names_col, type = c("bar", "line"), max_lines = 2, xlab = "", ylab = "", font_size = 12, orientation = c("v", "h"), ...) { + +plot_freqs <- function(data, n, colour, break_q_names_col, type = c("bar", "line"), + max_lines = 2, xlab = "", ylab = "", font_size = 12, + orientation = c("v", "h"), ...) { # Set default bar colour if (missing(colour)) { @@ -262,7 +266,7 @@ plot_stacked <- function(data, n, break_q_names_col, type = c("bar", "line"), ma if (!is.data.frame(data)) { stop("Unexpected input - data is not a data.frame.") } else if (ncol(data) != 3) { - stop("Unexpected input - data should have three columns") + stop("Unexpected input - data should have three columns.") } # Validate labels @@ -448,6 +452,7 @@ plot_grouped <- function(data, n, break_q_names_col, max_lines = 2, xlab = "", y x_axis <- axes$cat_axis y_axis <- axes$scale_axis legend = list(traceorder = 'normal') + hovertext <- paste0(data[[1]], ": ", round(abs(y_vals) * 100, 1), "%", " ") } else if (orientation == "h") { data[[1]] <- factor(rev(data[[1]]), levels = rev(unique(data[[1]]))) data[[2]] <- factor(rev(data[[2]]), levels = rev(unique(data[[2]]))) @@ -458,14 +463,13 @@ plot_grouped <- function(data, n, break_q_names_col, max_lines = 2, xlab = "", y y_axis <- axes$cat_axis legend = list(traceorder = 'reversed') ylab <- xlab + hovertext <- paste0(data[[1]], ": ", round(abs(x_vals) * 100, 1), "%", " ") } y_axis$title <- "" sample <- ifelse(!missing(n), paste0("Sample size = ", n), "") - hovertext <- paste0(data[[1]], ": ", round(abs(x_vals) * 100, 1), "%", " ") - fig <- plotly::plot_ly( x = x_vals, y = y_vals, @@ -536,20 +540,20 @@ plot_likert <- function(data, mid, n, break_q_names_col, max_lines = 2, xlab = " n_questions <- length(unique(data[[1]])) n_answers <- length(unique(data[[2]])) + # Validate neutral mid + if (!is.logical(neutral_mid)) { + stop("Unexpected input - neutral_mid is not logical.") + } + # Validate mid if 
(!is.numeric(mid)) { stop("Unexpected input - mid is not numeric.") } else if (mid < 2) { - stop("Unexpected inout - mid is smaller than 2.") - } else if (neutral_mid & mid > n_answers) { + stop("Unexpected input - mid is smaller than 2.") + } else if (neutral_mid & mid >= n_answers) { stop("Unexpected input - mid >= the number of answers.") } - # Validate neutral mid - if (!is.logical(neutral_mid)) { - stop("Unexpected input - mid is not logical (TRUE/FALSE)") - } - # Apply break_q_names to a column if(!missing(break_q_names_col)) { data[[break_q_names_col]] <- break_q_names(data[[break_q_names_col]], max_lines) @@ -682,7 +686,10 @@ calculate_bases <- function(data, mid, neutral_mid) { } } - negative_bases <- data %>% dplyr::group_by_at(1) %>% dplyr::mutate_at(3, get_neg_bases, mid = mid, neutral_mid = neutral_mid) %>% data.frame + negative_bases <- data %>% + dplyr::group_by_at(1) %>% + dplyr::mutate_at(3, get_neg_bases, mid = mid, neutral_mid = neutral_mid) %>% + data.frame() bases <- bases - negative_bases[[3]] diff --git a/R/question-routing.R b/R/question-routing.R index 28c20c6..b1f4a1d 100644 --- a/R/question-routing.R +++ b/R/question-routing.R @@ -11,21 +11,43 @@ apply_skip_logic <- function(data) { - conditions <- list(data$workplace %in% c("Civil service, including devolved administations", "test"), - data$department %in% c("Office for National Statistics", "test"), - data$highest_qualification != "Any other qualification", - data$code_freq != "Never", - data$other_coding_experience != "No", - data$prev_coding_experience != "No", - data$heard_of_RAP != "No") - - skipped_cols <- list(colnames(data)[which(colnames(data) == "CS_grade"):which(colnames(data) == "ONS_directorate")], - colnames(data)[which(colnames(data) == "ONS_directorate")], - colnames(data)[which(colnames(data) == "qual_1_subject"):which(colnames(data) == "qual_3_learn_code")], - colnames(data)[which(colnames(data) == "prac_use_open_source"):which(colnames(data) == "misc_coding")], - 
colnames(data)[which(colnames(data) == "coding_ability_change"):which(colnames(data) == "first_learned")], - colnames(data)[which(colnames(data) == "first_learned")], - colnames(data)[which(colnames(data) == "know_RAP_champ"):which(colnames(data) == "RAP_comments")]) + conditions <- list(data$workplace == "NHS or local healthcare service", + !data$workplace %in% c("Civil service, including devolved administrations", "NHS or local healthcare service") & !is.na(data$workplace) & data$workplace != "test", + data$department != "Office for National Statistics" & !is.na(data$department) & data$department != "test", + !is.na(data$ONS_directorate) & data$ONS_directorate != "test", + data$pay_band == "Local Authority or NJC", + data$pay_band == "Other / Not sure", + !is.na(data$NHS_band) & data$NHS_band != "test", + data$primary_work_country == "Scotland", + data$primary_work_country == "Wales", + data$primary_work_country == "Northern Ireland", + !is.na(data$England_NHS_organisation) & data$England_NHS_organisation != "test", + !is.na(data$Scotland_NHS_organisation) & data$Scotland_NHS_organisation != "test", + !is.na(data$Wales_NHS_organisation) & data$Wales_NHS_organisation != "test", + data$highest_qualification == "Any other qualification", + data$code_freq == "Never", + data$other_coding_experience == "No", + data$heard_of_RAP == "No", + data$have_RAP_champ != "Yes" & !is.na(data$have_RAP_champ) & data$have_RAP_champ != "test") + + skipped_cols <- list(colnames(dplyr::select(data, "CS_grade":"ONS_directorate")), + colnames(dplyr::select(data, "CS_grade":"Northern_Ireland_NHS_organisation")), + colnames(dplyr::select(data, "ONS_directorate":"Northern_Ireland_NHS_organisation")), + colnames(dplyr::select(data, "pay_band":"Northern_Ireland_NHS_organisation")), + colnames(dplyr::select(data, "NHS_band")), + colnames(dplyr::select(data, "NHS_band":"NJC_grade")), + colnames(dplyr::select(data, "NJC_grade")), + colnames(dplyr::select(data, "England_NHS_organisation")), + 
colnames(dplyr::select(data, "England_NHS_organisation":"Scotland_NHS_organisation")), + colnames(dplyr::select(data, "England_NHS_organisation":"Wales_NHS_organisation")), + colnames(dplyr::select(data, "Scotland_NHS_organisation":"Northern_Ireland_NHS_organisation")), + colnames(dplyr::select(data, "Wales_NHS_organisation":"Northern_Ireland_NHS_organisation")), + colnames(dplyr::select(data, "Northern_Ireland_NHS_organisation")), + colnames(dplyr::select(data, "qual_1_subject":"qual_3_learn_code")), + colnames(dplyr::select(data, "other_coding_experience":"reproducible_workflow")), + colnames(dplyr::select(data, "first_learned":"coding_ability_change")), + colnames(dplyr::select(data, "have_RAP_champ":"RAP_comments")), + colnames(dplyr::select(data, "know_RAP_champ"))) for(i in 1:length(conditions)){ data <- enforce_skip_logic(data, conditions[[i]], skipped_cols[[i]]) @@ -51,10 +73,9 @@ apply_skip_logic <- function(data) { check_skip_logic <- function(data, condition, skipped_cols) { - condition_failed <- !condition & !is.na(data[skipped_cols]) - - row_failed <- as.logical(rowSums(condition_failed)) + condition_met <- condition & !is.na(data[skipped_cols]) + row_failed <- as.logical(rowSums(condition_met)) return( which(row_failed) diff --git a/R/render.R b/R/render.R index 0cfeac8..16628a0 100644 --- a/R/render.R +++ b/R/render.R @@ -40,38 +40,39 @@ create_filtered_pages <- function(data, type = c("professions", "departments"), dir.create(filtered_pages_path) if (type == "professions") { - prof_cols <- c( - "prof_DS", - "prof_DDAT", - "prof_GAD", - "prof_GES", - "prof_geog", - "prof_GORS", - "prof_GSR", - "prof_GSG" + prof_ref <- data.frame(prof_cols = grep("prof", colnames(data), value = TRUE), + prof_names = c("government data engineers", + "government data scientists", + "digital and data profession (DDAT)", + "government actuary's department (GAD)", + "government economic service (GES)", + "government geography profession", + "government operational research 
(GORS)", + "government social research (GSR)", + "government statistician group (GSG)", + "no government profession", + "other government profession"), + filenames = c("data-engineers.qmd", + "data-scientists.qmd", + "digital-and-data.qmd", + "government-actuarys-department.qmd", + "government-economic-service.qmd", + "government-geography.qmd", + "government-operational-research.qmd", + "government-social-research.qmd", + "government-statistician-group.qmd", + "no-government-profession.qmd", + "other-government-profession.qmd" + ) ) - prof_names <- c( - "government data scientists", - "digital and data profession (DDAT)", - "government actuary's department (GAD)", - "government economic service (GES)", - "government geography profession", - "government operational research (GORS)", - "government social research (GSR)", - "government statistician group (GSG)" - ) + prof_cols <- data %>% + dplyr::select(dplyr::contains("prof") & !dplyr::contains("none")) %>% + dplyr::select_if(~ any(. == "Yes")) %>% + colnames() - filenames <- c( - "data-scientists.qmd", - "digital-and-data.qmd", - "government-actuarys-department.qmd", - "government-economic-service.qmd", - "government-geography.qmd", - "government-operational-research.qmd", - "government-social-research.qmd", - "government-statician-group.qmd" - ) + prof_names <- prof_ref$prof_names[prof_ref$prof_cols %in% prof_cols] + filenames <- prof_ref$filenames[prof_ref$prof_cols %in% prof_cols] n_pages <- length(prof_cols) } else if (type == "departments") { @@ -112,7 +113,7 @@ create_filtered_pages <- function(data, type = c("professions", "departments"), title <- paste0("Department summary: ", dep_list[i]) } - # Custom open and close tags are used here to avoid clashes with quarto syntax + # Custom open and close tags are used here to avoid clashes with quarto syntax contents <- glue::glue(template, .open = "{{{", .close = "}}}") %>% as.character() path <- paste0(filtered_pages_path, "/", filenames[[i]]) @@ -143,7 +144,6 
@@ create_filtered_pages <- function(data, type = c("professions", "departments"), } - #' @title Display programming languages filtered by profession #' #' @param table frequency table (languages_by_prof, see frequency table functions). diff --git a/R/sysdata.rda b/R/sysdata.rda new file mode 100644 index 0000000..7744305 Binary files /dev/null and b/R/sysdata.rda differ diff --git a/R/wave_2_preprocessing.R b/R/wave_2_preprocessing.R index 988ed2d..0c96682 100644 --- a/R/wave_2_preprocessing.R +++ b/R/wave_2_preprocessing.R @@ -1,8 +1,8 @@ -#' Rename columns (wave 3) +#' Rename columns (wave 2) #' #' @description add meaningful column names to dataset ingested from smartsurvey API. #' -#' @param data CARS wave 3 (2021) survey data (data.frame). +#' @param data CARS wave 2 (2020) survey data (data.frame). #' #' @return data.frame #' diff --git a/R/wave_4_preprocessing.R b/R/wave_4_preprocessing.R new file mode 100644 index 0000000..e3673fe --- /dev/null +++ b/R/wave_4_preprocessing.R @@ -0,0 +1,253 @@ + +#' @title Rename columns +#' +#' @description Renames columns and removes unnecessary columns +#' +#' @param data tidy CARS dataset +#' +#' @return data.frame +#' +#' @export + +w4_rename_cols <- function(data) { + + if (class(data) != "data.frame") { + stop("Unexpected input: data is not a data.frame.") + } + if (ncol(data) != 112) { + stop("Unexpected input: incorrect number of columns. 
Please use the 2022 CARS dataset.") + } + + colnames(data)[c(1, 7:ncol(data))] <- c( + "ID", + "started", + "ended", + "tracking_link", + "workplace", + "CS_grade", + "department", + "other_department_name", + "prof_DS", + "prof_DDAT", + "prof_GAD", + "prof_GES", + "prof_geog", + "prof_GORS", + "prof_GSR", + "prof_GSG", + "prof_CS_none", + "prof_CS_other", + "ONS_directorate", + "highest_qualification", + "qual_1_subject", + "qual_1_level", + "qual_1_learn_code", + "qual_2_subject", + "qual_2_level", + "qual_2_learn_code", + "qual_3_subject", + "qual_3_level", + "qual_3_learn_code", + "code_freq", + "management", + "ops_analysis", + "ops_cleaning", + "ops_linking", + "ops_transfer_migration", + "ops_vis", + "ops_machine_learning", + "ops_modelling", + "ops_QA", + "ops_other", + "ops_other_name", + "knowledge_R", + "access_R", + "knowledge_SQL", + "access_SQL", + "knowledge_SAS", + "access_SAS", + "knowledge_VBA", + "access_VBA", + "knowledge_python", + "access_python", + "knowledge_SPSS", + "access_SPSS", + "knowledge_stata", + "access_stata", + "knowledge_JS", + "access_JS", + "knowledge_java", + "access_java", + "knowledge_C", + "access_C", + "knowledge_matlab", + "access_matlab", + "knowledge_access_other", + "knowledge_git", + "access_git", + "other_coding_experience", + "coding_ability_change", + "prev_coding_experience", + "first_learned", + "heard_of_RAP", + "know_RAP_champ", + "strategy_knowledge", + "RAP_confident", + "RAP_supported", + "RAP_resources", + "RAP_components", + "RAP_important", + "RAP_implementing", + "RAP_planning", + "RAP_comments", + "prac_use_open_source", + "prac_open_source_own", + "prac_version_control", + "prac_review", + "prac_functions", + "prac_unit_test", + "prac_package", + "prac_dir_structure", + "prac_style", + "prac_automated_QA", + "prac_AQUA_book", + "doc_comments", + "doc_functions", + "doc_readme", + "doc_desk_notes", + "doc_registers", + "doc_AQA_logs", + "doc_flow_charts", + "doc_other", + "CI", + "dep_management", + 
"reproducible_workflow", + "misc_coding", + "misc_support", + "misc_additional_data", + "misc_other" + ) + + data <- data[!colnames(data) %in% c("UserNo", "Name", "Email", "IP.Address", "Unique.ID")] + + return(data) +} + +#' @title Clean department data +#' +#' @description add NHS to department list and merge departments where needed. +#' +#' @param data cleaned CARS dataset +#' +#' @return CARS dataset +#' @export + +w4_clean_departments <- function(data) { + + data$department[grepl("forest research", tolower(data$other_department_name))] <- "Forestry Commission" + + data$department[data$workplace == "NHS"] <- "NHS" + + defra_orgs <- c( + "Department for Environment, Food and Rural Affairs (excl. agencies)", + "Forestry Commission", + "Animal and Plant Health Agency", + "Centre for Environment, Fisheries and Aquaculture Science", + "Rural Payments Agency", + "Environment Agency", + "Marine Management Organisation", + "Natural England" + ) + + data$defra <- data$department %in% defra_orgs + + return(data) + +} + + + +#' @title Apply skip logic +#' +#' @description Iteratively applies enforce_skip_logic to the necessary fields in the data. 
+#' +#' @param data data.frame +#' +#' @return cleaned data.frame +#' +#' @export + +w4_enforce_streaming <- function(data) { + + conditions <- list(data$workplace %in% c("Civil service, including devolved administations", "test"), + data$department %in% c("Office for National Statistics", "test"), + data$highest_qualification != "Any other qualification", + data$code_freq != "Never", + data$other_coding_experience != "No", + data$prev_coding_experience != "No", + data$heard_of_RAP != "No") + + skipped_cols <- list(colnames(data)[which(colnames(data) == "CS_grade"):which(colnames(data) == "ONS_directorate")], + colnames(data)[which(colnames(data) == "ONS_directorate")], + colnames(data)[which(colnames(data) == "qual_1_subject"):which(colnames(data) == "qual_3_learn_code")], + colnames(data)[which(colnames(data) == "prac_use_open_source"):which(colnames(data) == "misc_coding")], + colnames(data)[which(colnames(data) == "coding_ability_change"):which(colnames(data) == "first_learned")], + colnames(data)[which(colnames(data) == "first_learned")], + colnames(data)[which(colnames(data) == "know_RAP_champ"):which(colnames(data) == "RAP_comments")]) + + for(i in 1:length(conditions)){ + data <- w4_enforce_skip_logic(data, conditions[[i]], skipped_cols[[i]]) + } + + return(data) + +} + + +#' @title Check skip logic +#' +#' @description Checks whether the skip logic was followed correctly. Backtracking while filling the survey can result in inconsistent response sets. +#' This check returns row numbers where questions which should have been skipped contain anything other than NA. +#' +#' @param data data.frame +#' @param condition logical vector. Example: data$row == "skip response" +#' @param skipped_cols character. 
questions that should have been skipped if condition != TRUE +#' +#' @return list of rows failing the check +#' +#' @export + +w4_check_skip_logic <- function(data, condition, skipped_cols) { + + condition_failed <- !condition & !is.na(data[skipped_cols]) + + row_failed <- as.logical(rowSums(condition_failed)) + + + return( + which(row_failed) + ) + +} + +#' @title enforce skip logic +#' +#' @description Replaces values in rows with NAs where check_skip_logic has identified backtracking. +#' +#' @param data data.frame +#' @param condition logical vector. Example: data$row == "skip response" +#' @param skipped_cols character. questions that should have been skipped if condition != TRUE +#' +#' @return data.frame with rows failing the check replaced with NAs +#' +#' @export + +w4_enforce_skip_logic <- function(data, condition, skipped_cols) { + + row_index <- w4_check_skip_logic(data, condition, skipped_cols) + + data[row_index, skipped_cols] <- NA + + return(data) + +} diff --git a/data-raw/generate_dummy_data.R b/data-raw/generate_dummy_data.R new file mode 100644 index 0000000..04a25ae --- /dev/null +++ b/data-raw/generate_dummy_data.R @@ -0,0 +1,503 @@ +## code to prepare `generate_dummy_data` dataset goes here + +library(dplyr) +library(purrr) +library(usethis) + +sample_replace <- purrr::partial(sample, size = 502, replace = TRUE) + +sample_replace_500 <- purrr::partial(sample, size = 500, replace = TRUE) + +tracking_list = c("Default Web Link", + "Web Link 2", + "Web link 3", + "Web link 4", + "Web link 5") + +q1_options = c("Civil service, including devolved administrations", + "NHS or local healthcare service", + "Other") + +q2_options = c("SCS Pay Band 1 (or equivalent)", + "Grade 6 (or equivalent)", + "Grade 7 (or equivalent)", + "Senior Executive Officer (or equivalent)", + "Higher Executive Officer (or equivalent)", + "Executive Officer (or equivalent)", + "Administrative Officer (or equivalent)", + "Fast Stream", + "Other") + +q3_options = c( "Attorney 
General's Office", + "Cabinet Office (excl. agencies)", + "Department for Business & Trade (excl. agencies)", + "Department for Culture, Media & Sport", + "Department for Education (excl. agencies)", + "Department for Energy Security & Net Zero", + "Department for Environment Food & Rural Affairs (excl. agencies)", + "Department for Levelling Up, Housing & Communities (excl. agencies)", + "Department for Science, Innovation & Technology (excl. agencies)", + "Department for Transport (excl. agencies)", + "Department for Work & Pensions", + "Department of Health & Social Care (excl. agencies)", + "Foreign, Commonwealth & Development Office (excl. agencies)", + "HM Treasury (excl. agencies)", + "Home Office", + "Ministry of Defence (excl. agencies)", + "Ministry of Justice (excl. agencies)", + "Northern Ireland Office", + "Office of the Advocate General for Scotland", + "Office of the Leader of the House of Commons", + "Office of the Leader of the House of Lords", + "Office of the Secretary of State for Scotland", + "Office of the Secretary of State for Wales", + "UK Export Finance", + "The Charity Commission", + "Competition and Markets Authority", + "Crown Prosecution Service", + "Food Standards Agency", + "Forestry Commission (excl. agencies)", + "Government Actuary's Department", + "Government Legal Department", + "HM Land Registry", + "HM Revenue & Customs (excl. 
agencies)", + "NS&I", + "The National Archives", + "National Crime Agency", + "Office of Rail and Road", + "Ofgem", + "Ofqual", + "Ofsted", + "Serious Fraud Office", + "Supreme Court of the United Kingdom", + "UK Statistics Authority", + "The Water Services Regulation Authority", + "Prime Minister's Office, 10 Downing Street", + "Crown Commercial Service", + "Government Property Agency", + "Companies House", + "The Insolvency Service", + "Building Digital UK", + "Intellectual Property Office", + "Met Office", + "UK Space Agency", + "Planning Inspectorate", + "Queen Elizabeth II Conference Centre", + "Education and Skills Funding Agency", + "Teaching Regulation Agency", + "Standards and Testing Agency", + "Animal and Plant Health Agency", + "Centre for Environment, Fisheries and Aquaculture Science", + "Rural Payments Agency", + "Veterinary Medicines Directorate", + "Active Travel England", + "Driver and Vehicle Licensing Agency", + "Driver and Vehicle Standards Agency", + "Maritime and Coastguard Agency", + "Vehicle Certification Agency", + "Medicines and Healthcare products Regulatory Agency", + "UK Health Security Agency", + "Wilton Park", + "Government Internal Audit Agency", + "National Infrastructure Commission", + "UK Debt Management Office", + "Defence Electronics and Components Agency", + "Defence Equipment and Support", + "Defence Science and Technology Laboratory", + "UK Hydrographic Office", + "Submarine Delivery Agency", + "Criminal Injuries Compensation Authority", + "HM Courts & Tribunals Service", + "HM Prison and Probation Service", + "Legal Aid Agency", + "Office of the Public Guardian", + "Forest Research", + "Forestry England", + "Valuation Office Agency", + "Office for National Statistics", + "Northern Ireland Executive", + "Scottish Government (excl. 
agencies)", + "Welsh Government", + "Accountant in Bankruptcy", + "Disclosure Scotland", + "Education Scotland", + "Forestry and Land Scotland", + "Scottish Forestry", + "Scottish Prison Service", + "Scottish Public Pensions Agency", + "Social Security Scotland", + "Student Awards Agency for Scotland", + "Transport Scotland", + "-") + + + +q5_options = c("Directorate 1", + "Directorate 2", + "Directorate 3", + "Directorate 4", + "Directorate 5", + "Directorate 6") + +q6_options = c("NHS", + "Local Authority or NJC", + "Other / Not sure") + +q7_options = c("Band 1", + "Band 2", + "Band 3", + "Band 4", + "Band 5", + "Band 6") + +q9_options = c("England", + "Scotland", + "Wales", + "Northern Ireland") + +q10_options = c("NHS England", + "NHS Trust", + "Other") + +q11_options = c("NHS Scotland", + "Public Health Scotland", + "Regional Health Board", + "Special Health Board", + "Other") + +q12_options = c("Public Health Wales", + "Local Health Board", + "NHS Trust", + "Other") + +q13_options = c("Public Health Agency", + "Health and Social Care Trust", + "Other") + +q14_options = c("Doctoral degree (or equivalent)", + "Master's degree (or equivalent)", + "Bachelor's degree (or equivalent)", + "Any other qualification") + +subject_options = c("English", + "Mathematics", + "Science", + "Other") + +level_options = c("BSc", + "MSc", + "PhD", + "Other") + + +q25_options = c("Yes", + "No - I manage people who do not write code", + "No - I don't line manage anyone") + + +q31_options = c("Education", + "Current employment", + "Previous public sector employment", + "Previous private sector employment", + "Other: some text") + +q32_options = c("It has become significantly better", + "It has become slightly better", + "It has stayed the same", + "It has become slightly worse", + "It has become significantly worse" +) + +q35_options = c("Yes, and I am a RAP Champion", + "Yes", + "No") + +q36_options = c("Yes", + "Yes, but I haven't read it", + "No") + +q38_options = c("I don't 
understand this question",
+                "Never",
+                "Rarely",
+                "Sometimes",
+                "Regularly",
+                "All the time")
+
+yes_no = c("Yes",
+           "No")
+
+yes_no_dk = c("Yes",
+              "No",
+              "I don't know")
+
+yes_no_notreq = c("Yes",
+                  "No",
+                  "Not required for my work")
+
+likert = c("Strongly Disagree",
+           "Disagree",
+           "Neutral",
+           "Agree",
+           "Strongly Agree")
+
+freq_options = c("Never",
+                 "Rarely",
+                 "Sometimes",
+                 "Regularly",
+                 "All the time")
+
+# Build the dummy CARS survey data set.
+#
+# The first two rows act as header rows: row 1 carries the sub-question labels
+# of multi-part questions (e.g. "Q26.1. Matlab") and row 2 is NA throughout.
+# Rows 3-502 hold 500 simulated respondents. Routed questions are only
+# populated when the answer to the routing question allows it.
+#
+# type: "test" additionally replaces missing values in rows 3-23 of the
+#   selected question columns with "test"; "clean" returns the data as
+#   generated. Defaults to "test" when not supplied.
+# Returns a tibble with one column per survey field.
+#
+# The option vectors and the sample_replace() / sample_replace_500() helpers
+# used below are defined elsewhere in this script.
+create_dummy_data <- function(type = c("test", "clean")) {
+
+  # Resolve the default vector to a single value and reject unknown choices.
+  # Without this, `if (type == "test")` below is evaluated on a length-2
+  # vector whenever `type` is omitted, which is an error in R >= 4.2.
+  type <- match.arg(type)
+
+  # Fixed seed so the generated data are reproducible between runs.
+  set.seed(100)
+
+  cars_dummy_data <- tibble(
+    UserID = c(NA,
+               NA,
+               100000000:100000499),
+    UserNo = c(NA,
+               NA,
+               1:500),
+    Name = c(NA),
+    Email = c(NA),
+    IP = c(NA,
+           NA,
+           rep("100.00.000.00", 500)),
+    ID = c(NA),
+    Started = c(NA,
+                NA,
+                sample(seq(as.POSIXct('2023/10/16'),
+                           as.POSIXct('2023/11/16'),
+                           by = "15 mins"),
+                       500)),
+    Ended = c(NA,
+              NA,
+              sample(seq(as.POSIXct('2023/10/16'),
+                         as.POSIXct('2023/11/16'),
+                         by = "15 mins"),
+                     500)),
+    Tracking = c(NA,
+                 NA,
+                 sample(tracking_list,
+                        500,
+                        replace = TRUE)),
+    Q1 = c(NA,
+           NA,
+           sample(q1_options,
+                  500,
+                  prob = (c(0.8, 0.19, 0.01)),
+                  replace = TRUE))
+  ) %>%
+    mutate(
+      Q2 = case_when(
+        Q1 == "Civil service, including devolved administrations" ~ sample_replace(q2_options),
+        TRUE ~ NA),
+      Q3 = case_when(
+        row_number() == 1 ~ "Answer",
+        Q1 == "Civil service, including devolved administrations" ~ sample_replace(q3_options),
+        TRUE ~ NA),
+      # `Q3 == NA` is always NA, so the free-text branch could never match.
+      # Use is.na(), restricted to respondent rows so header row 2 stays NA
+      # like every other column.
+      Q3.1 = case_when(
+        row_number() > 2 & is.na(Q3) ~ "some text",
+        Q3 == "Answer" ~ "Other (please specify)",
+        TRUE ~ NA),
+      Q4.1 = case_when(
+        row_number() == 1 ~ "Q4.1. Data Engineer (any profession)",
+        Q1 == "Civil service, including devolved administrations" ~ sample_replace(yes_no)),
+      Q4.2 = case_when(
+        row_number() == 1 ~ "Q4.2. Data Scientist (any profession)",
+        Q1 == "Civil service, including devolved administrations" ~ sample_replace(yes_no)),
+      Q4.3 = case_when(
+        row_number() == 1 ~ "Q4.3. Digital, Data and Technology Profession",
+        Q1 == "Civil service, including devolved administrations" ~ sample_replace(yes_no)),
+      Q4.4 = case_when(
+        row_number() == 1 ~ "Q4.4. Government Actuary's Department",
+        Q1 == "Civil service, including devolved administrations" ~ sample_replace(yes_no)),
+      Q4.5 = case_when(
+        row_number() == 1 ~ "Q4.5. Government Economic Service",
+        Q1 == "Civil service, including devolved administrations" ~ sample_replace(yes_no)),
+      Q4.6 = case_when(
+        row_number() == 1 ~ "Q4.6. Government Geography Profession",
+        Q1 == "Civil service, including devolved administrations" ~ sample_replace(yes_no)),
+      Q4.7 = case_when(
+        row_number() == 1 ~ "Q4.7. Government Operational Research Service",
+        Q1 == "Civil service, including devolved administrations" ~ sample_replace(yes_no)),
+      Q4.8 = case_when(
+        row_number() == 1 ~ "Q4.8. Government Social Research",
+        Q1 == "Civil service, including devolved administrations" ~ sample_replace(yes_no)),
+      Q4.9 = case_when(
+        row_number() == 1 ~ "Q4.9. Government Statistician Group",
+        Q1 == "Civil service, including devolved administrations" ~ sample_replace(yes_no)),
+      Q4.10 = case_when(
+        row_number() == 1 ~ "Q4.10. Civil Service, no profession membership",
+        Q1 == "Civil service, including devolved administrations" ~ sample_replace(yes_no)),
+      Q4.11 = case_when(
+        row_number() == 1 ~ "Q4.11. Other Civil Service profession",
+        Q1 == "Civil service, including devolved administrations" ~ sample_replace(yes_no)),
+      Q5 = case_when(Q3 == "Office for National Statistics" ~ sample_replace(q5_options)),
+      Q6 = case_when(
+        Q1 == "NHS or local healthcare service" ~ sample_replace(q6_options, prob = c(0.9, 0.09, 0.01)),
+        TRUE ~ NA),
+      Q7 = case_when(Q6 == "NHS" ~ sample_replace(q7_options),
+                     TRUE ~ NA),
+      Q8 = case_when(Q6 == "Local Authority or NJC" ~ sample_replace(q7_options),
+                     TRUE ~ NA),
+      Q9 = case_when(Q1 == "NHS or local healthcare service" ~ sample_replace(q9_options),
+                     TRUE ~ NA),
+      Q10 = case_when(Q9 == "England" ~ sample_replace(q10_options),
+                      TRUE ~ NA),
+      Q11 = case_when(Q9 == "Scotland" ~ sample_replace(q11_options),
+                      TRUE ~ NA),
+      Q12 = case_when(Q9 == "Wales" ~ sample_replace(q12_options),
+                      TRUE ~ NA),
+      Q13 = case_when(Q9 == "Northern Ireland" ~ sample_replace(q13_options),
+                      TRUE ~ NA),
+      Q14 = c(NA, NA, sample(q14_options,
+                             500,
+                             replace = TRUE)),
+      Q15 = case_when(Q14 != "Any other qualification" ~ sample_replace(subject_options),
+                      TRUE ~ NA),
+      Q16 = case_when(!is.na(Q15) ~ sample_replace(level_options),
+                      TRUE ~ NA),
+      Q17 = case_when(!is.na(Q15) ~ sample_replace(yes_no),
+                      TRUE ~ NA),
+      Q18 = case_when(Q14 != "Any other qualification" ~ sample_replace(c(subject_options, NA)),
+                      TRUE ~ NA),
+      Q19 = case_when(!is.na(Q18) ~ sample_replace(level_options),
+                      TRUE ~ NA),
+      Q20 = case_when(!is.na(Q18) ~ sample_replace(yes_no),
+                      TRUE ~ NA),
+      Q21 = case_when(
+        Q14 != "Any other qualification" & !is.na(Q18) ~ sample_replace(c(subject_options, NA)),
+        TRUE ~ NA),
+      Q22 = case_when(!is.na(Q21) ~ sample_replace(level_options),
+                      TRUE ~ NA),
+      Q23 = case_when(!is.na(Q21) ~ sample_replace(yes_no),
+                      TRUE ~ NA),
+      Q24 = c(NA, NA, sample(freq_options,
+                             500,
+                             replace = TRUE)),
+      Q25 = c(NA, NA, sample(q25_options,
+                             500,
+                             replace = TRUE)),
+      Q26.1 = c("Q26.1. Matlab",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q26.2 = c("Q26.2. Python",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q26.3 = c("Q26.3. R",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q26.4 = c("Q26.4. SAS",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q26.5 = c("Q26.5. SPSS",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q26.6 = c("Q26.6. SQL",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q26.7 = c("Q26.7. Stata",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q26.8 = c("Q26.8. VBA",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q26.9 = c("Q26.9. Other open source tool (please specify)",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q26.10 = c("Q26.10. Other license/closed source tool (please specify)",
+                 NA,
+                 sample_replace_500(yes_no_dk)),
+      Q26.11 = c("Q26.11. Other (please specify the tool and if it available to use for your work):",
+                 NA,
+                 sample_replace_500(c(NA,
+                                      "some text"),
+                                    prob = c(0.8, 0.2))),
+      Q27.1 = c("Q27.1. Matlab",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q27.2 = c("Q27.2. Python",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q27.3 = c("Q27.3. R",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q27.4 = c("Q27.4. SAS",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q27.5 = c("Q27.5. SPSS",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q27.6 = c("Q27.6. SQL",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q27.7 = c("Q27.7. Stata",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q27.8 = c("Q27.8. VBA",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q27.9 = c("Q27.9. Other open source tool (please specify)",
+                NA,
+                sample_replace_500(yes_no_dk)),
+      Q27.10 = c("Q27.10. Other license/closed source tool (please specify)",
+                 NA,
+                 sample_replace_500(yes_no_dk)),
+      Q27.11 = c("Q27.11. Other (please specify the tool and if it available to use for your work):",
+                 NA,
+                 sample_replace_500(c(NA,
+                                      "some text"),
+                                    prob = c(0.8, 0.2))),
+      Q28 = c(NA, NA, sample_replace_500(yes_no_dk)),
+      Q29 = c(NA, NA, sample_replace_500(yes_no_dk)),
+      Q30 = case_when(Q24 != "Never" ~ sample_replace(yes_no)),
+      Q31 = case_when(Q24 != "Never" & Q30 == "Yes" ~ sample_replace(q31_options)),
+      Q32 = case_when(Q24 != "Never" & Q30 == "Yes" ~ sample_replace(q32_options)),
+      Q33 = case_when(Q24 != "Never" ~ sample_replace(yes_no)),
+      Q34 = case_when(Q33 != "No" ~ sample_replace(yes_no_dk)),
+      Q35 = case_when(Q34 == "Yes" ~ sample_replace(q35_options)),
+      Q36 = case_when(Q33 != "No" ~ sample_replace(q36_options))
+    )
+
+  # Create the grid-question columns as NA first so that across() can select
+  # them by name below.
+  q37 <- paste0("Q", seq(37.1, 37.8, 0.1))
+  cars_dummy_data[, q37] <- NA
+
+  q38 <- paste0("Q", seq(38.01, 38.13, 0.01))
+  cars_dummy_data[, q38] <- NA
+
+  q39 <- paste0("Q", seq(39.1, 39.8, 0.1))
+  cars_dummy_data[, q39] <- NA
+
+  q40 <- paste0("Q", 40:42)
+  cars_dummy_data[, q40] <- NA
+
+  cars_dummy_data <- cars_dummy_data %>%
+    mutate(across(all_of(q37),
+                  ~ case_when(Q24 != "Never" & Q33 == "Yes" ~ sample_replace(likert),
+                              TRUE ~ NA))
+    ) %>%
+    mutate(across(all_of(q38),
+                  ~ case_when(Q24 != "Never" ~ sample_replace(q38_options),
+                              TRUE ~ NA))
+    ) %>%
+    mutate(across(all_of(q39),
+                  ~ case_when(Q24 != "Never" ~ sample_replace(q38_options),
+                              TRUE ~ NA))
+    ) %>%
+    mutate(across(all_of(q40),
+                  ~ case_when(Q24 != "Never" ~ sample_replace(yes_no_dk),
+                              TRUE ~ NA))
+    )
+
+  # Free-text questions: "some text" or NA for every respondent.
+  q43 <- paste0("Q", 43:46)
+  cars_dummy_data[, q43] <- c(NA,
+                              NA,
+                              sample(c("some text", NA),
+                                     500,
+                                     replace = TRUE))
+
+
+  if (type == "test") {
+    # Fill gaps in rows 3-23 with "test" for the selected question columns.
+    # NOTE(review): the Q38 columns are created as Q38.01, Q38.02, ..., so the
+    # range Q38.1:Q39.7 appears to start at the tenth Q38 column - confirm
+    # that leaving out Q38.01-Q38.09 is intended.
+    cars_dummy_data <- cars_dummy_data %>%
+      mutate(across(c(Q1:Q3,
+                      Q4.1:Q17,
+                      Q26.1:Q26.10,
+                      Q27.1:Q27.10,
+                      Q28:Q37.7,
+                      Q38.1:Q39.7,
+                      Q40:Q42),
+                    ~ case_when(row_number() %in% c(3:23) ~ tidyr::replace_na(., "test"),
+                                TRUE ~ .)))
+  }
+
+  return(cars_dummy_data)
+}
+
+
+
+
+
+
+
+cars_dummy_data <- create_dummy_data(type = "test")
+cars_dummy_data_clean <- create_dummy_data(type = "clean") + +usethis::use_data(cars_dummy_data, cars_dummy_data_clean, internal = TRUE, overwrite = TRUE) + + diff --git a/main.R b/main.R index b728366..02b0fb8 100644 --- a/main.R +++ b/main.R @@ -1,11 +1,11 @@ library(magrittr) -data <- CARS::get_tidy_data_file("2022_data.csv") %>% +data <- CARS::get_tidy_data_file("2023_data.csv") %>% CARS::rename_cols() %>% CARS::apply_skip_logic() %>% - CARS::clean_departments() %>% + CARS::clean_data() %>% CARS::derive_vars() CARS::create_filtered_pages(data, type = "departments") -CARS::create_filtered_pages(type = "professions") +CARS::create_filtered_pages(data, type = "professions") CARS::render_site() diff --git a/man/calculate_freqs.Rd b/man/calculate_freqs.Rd index 53294a4..42ce78a 100644 --- a/man/calculate_freqs.Rd +++ b/man/calculate_freqs.Rd @@ -4,7 +4,14 @@ \alias{calculate_freqs} \title{Calculate frequencies} \usage{ -calculate_freqs(data, questions, levels, labels = NULL, prop = TRUE) +calculate_freqs( + data, + questions, + levels, + labels = NULL, + prop = TRUE, + sample = FALSE +) } \arguments{ \item{data}{full CARS data frame after pre-processing} @@ -16,6 +23,8 @@ calculate_freqs(data, questions, levels, labels = NULL, prop = TRUE) \item{labels}{labels to rename the column headers. Only needed for multi-column frequencies} \item{prop}{whether to return proportion data (0-1). TRUE by default. Assumes mutually exclusive response options.} + +\item{sample}{additionally returns count and sample size. 
FALSE by default} } \value{ data.frame diff --git a/man/calculate_multi_table_freqs.Rd b/man/calculate_multi_table_freqs.Rd index ff62baa..991ad10 100644 --- a/man/calculate_multi_table_freqs.Rd +++ b/man/calculate_multi_table_freqs.Rd @@ -4,7 +4,15 @@ \alias{calculate_multi_table_freqs} \title{Create tidy cross table} \usage{ -calculate_multi_table_freqs(data, col1, col2, levels1, levels2, prop = TRUE) +calculate_multi_table_freqs( + data, + col1, + col2, + levels1, + levels2, + prop = TRUE, + sample = FALSE +) } \arguments{ \item{data}{pre-processed CARS data set} @@ -18,6 +26,8 @@ calculate_multi_table_freqs(data, col1, col2, levels1, levels2, prop = TRUE) \item{levels2}{factor levels for col2} \item{prop}{whether to return proportion data (0-1). TRUE by default. Assumes mutually exclusive response options.} + +\item{sample}{returns proportion, count, group size and sample size. FALSE by default} } \value{ data.frame diff --git a/man/clean_data.Rd b/man/clean_data.Rd new file mode 100644 index 0000000..45d510f --- /dev/null +++ b/man/clean_data.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_cleaning.R +\name{clean_data} +\alias{clean_data} +\title{Clean data} +\usage{ +clean_data(data) +} +\arguments{ +\item{data}{cleaned CARS dataset} +} +\value{ +CARS dataset +} +\description{ +Recategorise department, workplace and first_learned data +} diff --git a/man/clean_first_learned.Rd b/man/clean_first_learned.Rd new file mode 100644 index 0000000..bb09ffe --- /dev/null +++ b/man/clean_first_learned.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_cleaning.R +\name{clean_first_learned} +\alias{clean_first_learned} +\title{Clean first learned data} +\usage{ +clean_first_learned(data) +} +\arguments{ +\item{data}{cleaned CARS dataset} +} +\value{ +CARS dataset +} +\description{ +reclassify 'other' free text responses into self-taught based on common 
terms used +} diff --git a/man/clean_workplace.Rd b/man/clean_workplace.Rd new file mode 100644 index 0000000..e670f74 --- /dev/null +++ b/man/clean_workplace.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_cleaning.R +\name{clean_workplace} +\alias{clean_workplace} +\title{Clean workplace data} +\usage{ +clean_workplace(data) +} +\arguments{ +\item{data}{cleaned CARS dataset} +} +\value{ +CARS dataset +} +\description{ +reclassify 'other' text responses into CS/NHS +} diff --git a/man/derive_language_status.Rd b/man/derive_language_status.Rd index 8477a56..fc313d9 100644 --- a/man/derive_language_status.Rd +++ b/man/derive_language_status.Rd @@ -13,5 +13,5 @@ derive_language_status(data) data.frame } \description{ -Derve the status of each programmming language as "access" (access only), "knowledge" (knowledge only), "both" or "neither". +Derive the status of each programming language as "access" (access only), "knowledge" (knowledge only), "both" or "neither". } diff --git a/man/derive_rap_champ_status.Rd b/man/derive_rap_champ_status.Rd new file mode 100644 index 0000000..28a73a1 --- /dev/null +++ b/man/derive_rap_champ_status.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/derive_vars.R +\name{derive_rap_champ_status} +\alias{derive_rap_champ_status} +\title{Derive RAP Champion status} +\usage{ +derive_rap_champ_status(data) +} +\arguments{ +\item{data}{a data frame containing cleaned CARS wave 5 data} +} +\value{ +dataframe containing the additional RAP Champion status columns +} +\description{ +Derive RAP Champion status column from existing variables and add to the dataframe. 
+} diff --git a/man/ingest.Rd b/man/ingest.Rd index 9be5435..b970a63 100644 --- a/man/ingest.Rd +++ b/man/ingest.Rd @@ -5,7 +5,7 @@ \title{Ingest smartsurvey data} \usage{ ingest( - survey = "1167489", + survey = "1376897", token = Sys.getenv("CARS_TOKEN"), secret = Sys.getenv("CARS_SECRET"), proxies = Sys.getenv("alt_proxy"), diff --git a/man/summarise_ability_change.Rd b/man/summarise_ability_change.Rd index 9c9304d..3d2d7db 100644 --- a/man/summarise_ability_change.Rd +++ b/man/summarise_ability_change.Rd @@ -4,10 +4,12 @@ \alias{summarise_ability_change} \title{Summarise ability change frequency} \usage{ -summarise_ability_change(data) +summarise_ability_change(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_access_git.Rd b/man/summarise_access_git.Rd index ed5d029..32b05fb 100644 --- a/man/summarise_access_git.Rd +++ b/man/summarise_access_git.Rd @@ -4,10 +4,12 @@ \alias{summarise_access_git} \title{Summarise access to git} \usage{ -summarise_access_git(data) +summarise_access_git(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_all.Rd b/man/summarise_all.Rd index c0cdd8c..8173096 100644 --- a/man/summarise_all.Rd +++ b/man/summarise_all.Rd @@ -4,12 +4,14 @@ \alias{summarise_all} \title{Summarise all} \usage{ -summarise_all(data, all_tables = FALSE) +summarise_all(data, all_tables = FALSE, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} \item{all_tables}{logical: whether to produce all summary output tables. Defaults to FALSE.} + +\item{sample}{additionally returns count and sample size for selected tables for QA. 
FALSE by default} } \value{ list of frequency tables diff --git a/man/summarise_cap_change_by_freq.Rd b/man/summarise_cap_change_by_freq.Rd index 380d32a..44f329f 100644 --- a/man/summarise_cap_change_by_freq.Rd +++ b/man/summarise_cap_change_by_freq.Rd @@ -4,10 +4,12 @@ \alias{summarise_cap_change_by_freq} \title{Summarise capability change by coding frequency} \usage{ -summarise_cap_change_by_freq(data) +summarise_cap_change_by_freq(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{returns proportion, count, group size and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_cap_change_by_line_manage.Rd b/man/summarise_cap_change_by_line_manage.Rd index c4a3fc9..c61d131 100644 --- a/man/summarise_cap_change_by_line_manage.Rd +++ b/man/summarise_cap_change_by_line_manage.Rd @@ -13,5 +13,5 @@ summarise_cap_change_by_line_manage(data) frequency table (data.frame) } \description{ -calculate the cross tab of capability change by management responsibilty +calculate the cross tab of capability change by management responsibility } diff --git a/man/summarise_code_freq.Rd b/man/summarise_code_freq.Rd index 128f279..76fd66d 100644 --- a/man/summarise_code_freq.Rd +++ b/man/summarise_code_freq.Rd @@ -4,10 +4,12 @@ \alias{summarise_code_freq} \title{Summarise coding frequency} \usage{ -summarise_code_freq(data) +summarise_code_freq(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. 
FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_coding_practices.Rd b/man/summarise_coding_practices.Rd index 74f90be..d4894ef 100644 --- a/man/summarise_coding_practices.Rd +++ b/man/summarise_coding_practices.Rd @@ -4,10 +4,12 @@ \alias{summarise_coding_practices} \title{Summarise data practices questions} \usage{ -summarise_coding_practices(data) +summarise_coding_practices(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_coding_tools.Rd b/man/summarise_coding_tools.Rd index e8a0d39..cdcce48 100644 --- a/man/summarise_coding_tools.Rd +++ b/man/summarise_coding_tools.Rd @@ -4,7 +4,12 @@ \alias{summarise_coding_tools} \title{Summarise coding tools} \usage{ -summarise_coding_tools(data, type = list("knowledge", "access"), prop = TRUE) +summarise_coding_tools( + data, + type = list("knowledge", "access"), + prop = TRUE, + sample = FALSE +) } \arguments{ \item{data}{full CARS dataset after pre-processing} @@ -12,6 +17,8 @@ summarise_coding_tools(data, type = list("knowledge", "access"), prop = TRUE) \item{type}{type of table (knowledge or access)} \item{prop}{whether to return proportion data (0-1). TRUE by default. Assumes mutually exclusive response options.} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_doc.Rd b/man/summarise_doc.Rd index efe5600..1e34459 100644 --- a/man/summarise_doc.Rd +++ b/man/summarise_doc.Rd @@ -4,10 +4,12 @@ \alias{summarise_doc} \title{Frequency of documentation use} \usage{ -summarise_doc(data) +summarise_doc(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. 
FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_languages_by_prof.Rd b/man/summarise_languages_by_prof.Rd index 8ab1e2f..00d9d15 100644 --- a/man/summarise_languages_by_prof.Rd +++ b/man/summarise_languages_by_prof.Rd @@ -4,10 +4,12 @@ \alias{summarise_languages_by_prof} \title{Summarise programming language knowledge by profession} \usage{ -summarise_languages_by_prof(data) +summarise_languages_by_prof(data, sample = FALSE) } \arguments{ \item{data}{CARS data (pre-processed)} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ data.frame diff --git a/man/summarise_operations.Rd b/man/summarise_operations.Rd deleted file mode 100644 index 34205f1..0000000 --- a/man/summarise_operations.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/frequency-tables.R -\name{summarise_operations} -\alias{summarise_operations} -\title{Summarise data operations} -\usage{ -summarise_operations(data) -} -\arguments{ -\item{data}{full CARS dataset after pre-processing} -} -\value{ -frequency table (data.frame) -} -\description{ -calculate frequency table for data operations -} diff --git a/man/summarise_rap_champ_status.Rd b/man/summarise_rap_champ_status.Rd new file mode 100644 index 0000000..b0c33f0 --- /dev/null +++ b/man/summarise_rap_champ_status.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/frequency-tables.R +\name{summarise_rap_champ_status} +\alias{summarise_rap_champ_status} +\title{Knowledge of RAP Champions} +\usage{ +summarise_rap_champ_status(data, sample = FALSE) +} +\arguments{ +\item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. 
FALSE by default} +} +\value{ +frequency table (data.frame) +} +\description{ +Create a frequency table of knowledge of RAP Champions +} diff --git a/man/summarise_rap_comp.Rd b/man/summarise_rap_comp.Rd index 685ccd0..a660c48 100644 --- a/man/summarise_rap_comp.Rd +++ b/man/summarise_rap_comp.Rd @@ -4,10 +4,12 @@ \alias{summarise_rap_comp} \title{RAP score components} \usage{ -summarise_rap_comp(data) +summarise_rap_comp(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_rap_knowledge.Rd b/man/summarise_rap_knowledge.Rd index 18b617c..a355ca9 100644 --- a/man/summarise_rap_knowledge.Rd +++ b/man/summarise_rap_knowledge.Rd @@ -4,10 +4,12 @@ \alias{summarise_rap_knowledge} \title{Knowledge of RAP} \usage{ -summarise_rap_knowledge(data) +summarise_rap_knowledge(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_rap_opinions.Rd b/man/summarise_rap_opinions.Rd index cc64d5b..f9ad05c 100644 --- a/man/summarise_rap_opinions.Rd +++ b/man/summarise_rap_opinions.Rd @@ -4,10 +4,12 @@ \alias{summarise_rap_opinions} \title{Opinions of RAP} \usage{ -summarise_rap_opinions(data) +summarise_rap_opinions(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. 
FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_strategy_knowledge.Rd b/man/summarise_strategy_knowledge.Rd index ce082d6..82341df 100644 --- a/man/summarise_strategy_knowledge.Rd +++ b/man/summarise_strategy_knowledge.Rd @@ -4,10 +4,12 @@ \alias{summarise_strategy_knowledge} \title{Summarise Analysis Function RAP strategy knowledge} \usage{ -summarise_strategy_knowledge(data) +summarise_strategy_knowledge(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_where_learned_code.Rd b/man/summarise_where_learned_code.Rd index 793f33f..2754cf0 100644 --- a/man/summarise_where_learned_code.Rd +++ b/man/summarise_where_learned_code.Rd @@ -4,10 +4,12 @@ \alias{summarise_where_learned_code} \title{Summarise where respondents learned to code} \usage{ -summarise_where_learned_code(data) +summarise_where_learned_code(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. 
FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/w2_rename_cols.Rd b/man/w2_rename_cols.Rd index 00f6611..571830d 100644 --- a/man/w2_rename_cols.Rd +++ b/man/w2_rename_cols.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/wave_2_preprocessing.R \name{w2_rename_cols} \alias{w2_rename_cols} -\title{Rename columns (wave 3)} +\title{Rename columns (wave 2)} \usage{ w2_rename_cols(data) } \arguments{ -\item{data}{CARS wave 3 (2021) survey data (data.frame).} +\item{data}{CARS wave 2 (2020) survey data (data.frame).} } \value{ data.frame diff --git a/man/w4_check_skip_logic.Rd b/man/w4_check_skip_logic.Rd new file mode 100644 index 0000000..002bab7 --- /dev/null +++ b/man/w4_check_skip_logic.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/wave_4_preprocessing.R +\name{w4_check_skip_logic} +\alias{w4_check_skip_logic} +\title{Check skip logic} +\usage{ +w4_check_skip_logic(data, condition, skipped_cols) +} +\arguments{ +\item{data}{data.frame} + +\item{condition}{logical vector. Example: data$row == "skip response"} + +\item{skipped_cols}{character. questions that should have been skipped if condition != TRUE} +} +\value{ +list of rows failing the check +} +\description{ +Checks whether the skip logic was followed correctly. Backtracking while filling the survey can result in inconsistent response sets. +This check returns row numbers where questions which should have been skipped contain anything other than NA. 
+} diff --git a/man/w4_clean_departments.Rd b/man/w4_clean_departments.Rd new file mode 100644 index 0000000..f29fa47 --- /dev/null +++ b/man/w4_clean_departments.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/wave_4_preprocessing.R +\name{w4_clean_departments} +\alias{w4_clean_departments} +\title{Clean department data} +\usage{ +w4_clean_departments(data) +} +\arguments{ +\item{data}{cleaned CARS dataset} +} +\value{ +CARS dataset +} +\description{ +add NHS to department list and merge departments where needed. +} diff --git a/man/w4_enforce_skip_logic.Rd b/man/w4_enforce_skip_logic.Rd new file mode 100644 index 0000000..ba1943a --- /dev/null +++ b/man/w4_enforce_skip_logic.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/wave_4_preprocessing.R +\name{w4_enforce_skip_logic} +\alias{w4_enforce_skip_logic} +\title{enforce skip logic} +\usage{ +w4_enforce_skip_logic(data, condition, skipped_cols) +} +\arguments{ +\item{data}{data.frame} + +\item{condition}{logical vector. Example: data$row == "skip response"} + +\item{skipped_cols}{character. questions that should have been skipped if condition != TRUE} +} +\value{ +data.frame with rows failing the check replaced with NAs +} +\description{ +Replaces values in rows with NAs where check_skip_logic has identified backtracking. +} diff --git a/man/w4_enforce_streaming.Rd b/man/w4_enforce_streaming.Rd new file mode 100644 index 0000000..170dac7 --- /dev/null +++ b/man/w4_enforce_streaming.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/wave_4_preprocessing.R +\name{w4_enforce_streaming} +\alias{w4_enforce_streaming} +\title{Apply skip logic} +\usage{ +w4_enforce_streaming(data) +} +\arguments{ +\item{data}{data.frame} +} +\value{ +cleaned data.frame +} +\description{ +Iteratively applies enforce_skip_logic to the necessary fields in the data. 
+} diff --git a/man/w4_rename_cols.Rd b/man/w4_rename_cols.Rd new file mode 100644 index 0000000..5cd74df --- /dev/null +++ b/man/w4_rename_cols.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/wave_4_preprocessing.R +\name{w4_rename_cols} +\alias{w4_rename_cols} +\title{Rename columns} +\usage{ +w4_rename_cols(data) +} +\arguments{ +\item{data}{tidy CARS dataset} +} +\value{ +data.frame +} +\description{ +Renames columns and removes unnecessary columns +} diff --git a/quarto/QA/summary_qa.qmd b/quarto/QA/summary_qa.qmd new file mode 100644 index 0000000..6fce9cb --- /dev/null +++ b/quarto/QA/summary_qa.qmd @@ -0,0 +1,332 @@ +--- +title: "Summary QA" +execute: + echo: false +output: + html_document +--- + +```{r output = FALSE} +library(magrittr) + +# loads CARS from local repo to include recent changes +devtools::load_all() + +data <- CARS::get_tidy_data_file("2023_data.csv") %>% + CARS::rename_cols() %>% + CARS::apply_skip_logic() %>% + CARS::clean_data() %>% + CARS::derive_vars() + + +raw_data <- CARS::get_tidy_data_file("2023_data.csv") %>% + CARS::rename_cols() %>% + CARS::clean_data() %>% + CARS::derive_vars() + +all_wave_data <- CARS::get_all_waves(mode = "file") + +tables <- CARS::summarise_all(data, all_tables = TRUE, sample = TRUE) + +exp_samples <- CARS::sample_sizes(raw_data) + +``` + +### QA checklist: + +* Spelling, grammar and readability +* All charts and tables are present +* All charts have titles, legends and axis labels +* All links work as expected + +In addition, this document can be used to QA the data underlying each of the frequency tables and charts. Denominator checks take the expected sample size based on the raw data following the logic rules of the sample_sizes function, as an additional check for question routing. The expected sample size will vary for each question depending on question streaming rules. 
Other checks include raw data tables used for percentage calculations, which can be used to cross-check calculations are correct. + +The datasets used in this document are: + +* data: data as used in the final publication, with question skip logic applied +* raw_data: data without question skip logic applied, used to determine the expected sample sizes based on question streaming logic +* all_wave_data: data as used in the final publication for each year, with question skip logic applied + +## Coding frequency and tools +#### Summarise coding frequency +Check data against figure +```{r echo = FALSE} +knitr::kable(tables$code_freq) +``` + +Denominator check: +```{r echo = FALSE} +if(tables$code_freq$sample[1] != exp_samples$all) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$all)) +} +``` +#### Coding frequency over time +Sample size should be the total response for each year. Percentages are calculated within the summary.qmd code. 
+```{r echo = FALSE} + +all_wave_data$code_freq <- factor(all_wave_data$code_freq, levels = c( + "Never", + "Rarely", + "Sometimes", + "Regularly", + "All the time" +)) + +table(all_wave_data$year, all_wave_data$code_freq) %>% + data.frame %>% + dplyr::group_by(Var1) %>% + dplyr::summarise(sample = sum(Freq)) %>% + knitr::kable() + +``` + +### Access to and knowledge of programming languages +#### Access +Check data against figure, check proportions are correct +```{r echo = FALSE} +knitr::kable(tables$access) +``` +Denominator check: +```{r echo = FALSE} + +if(tables$access$sample[1] != exp_samples$all) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$all)) + +} +``` + +#### Knowledge +Check data against figure, check proportions are correct +```{r echo = FALSE} +knitr::kable(tables$knowledge) +``` +Denominator check: +```{r echo = FALSE} + +if(tables$knowledge$sample[1] != exp_samples$all) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$all)) + +} +``` + +#### Open source capability +Check percentages are correct from the data in the table: +```{r echo = FALSE} +knitr::kable(CARS::summarise_os_vs_prop(all_wave_data)) +``` + +#### Different professions have capability in different tools +Check percentages are correct from the data in the table (final column = group sample size): +```{r echo = FALSE} +knitr::kable(tables$languages_by_prof) +``` +Denominator check - numbers of respondents in each profession, cross check with above: +```{r echo = FALSE} +raw_data %>% + tidyr::pivot_longer(contains("prof"), names_to = "prof", values_to = "value") %>% + dplyr::group_by(prof) %>% + dplyr::summarise(n = sum(value == "Yes")) %>% + knitr::kable() +``` +#### Access to git +Check data against figure, check proportions are correct +```{r echo = FALSE} +knitr::kable(tables$git_access) +``` +Denominator check: +```{r echo = FALSE} + 
+if(tables$git_access$sample[1] != exp_samples$all) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$all)) +} +``` + +## Capability +#### First learned +Check data against figure, check proportions are correct +```{r echo = FALSE} +knitr::kable(tables$where_learned) +``` +Denominator check: +```{r echo = FALSE} + +if(tables$where_learned$sample[1] != exp_samples$code_at_work) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$code_at_work)) + +} +``` + +#### Ability change +Check data against figure, check proportions are correct +```{r echo = FALSE} +knitr::kable(tables$ability_change) +``` +Denominator check: +```{r echo = FALSE} + +if(tables$ability_change$sample[1] != exp_samples$other_code_experience) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$other_code_experience)) +} +``` + +#### Ability change by frequency +Check data against figure, check proportions are correct +```{r echo = FALSE} +knitr::kable(tables$capability_change_by_freq) +``` + +Sample size check: +```{r echo = FALSE} + +if(tables$capability_change_by_freq$sample[1] != exp_samples$other_code_experience) { + warning("Sample size different from expected") + print(paste0("Expected: ", exp_samples$other_code_experience)) + print(paste0("Actual: ", tables$capability_change_by_freq$sample[1])) +} else { + print(paste0("Sample size as expected: ", exp_samples$other_code_experience)) +} +``` + +## RAP +#### Awareness of RAP +Check that the percentages in the chart and the figures in the text are correct +```{r echo = FALSE} +knitr::kable(CARS::summarise_rap_awareness_over_time(all_wave_data)) +``` + +#### RAP knowledge +Check data against figure, check proportions are correct +```{r echo = FALSE} +knitr::kable(tables$rap_knowledge) +``` + +Denominator check: +```{r echo = FALSE} + 
+if(tables$rap_knowledge$sample[1] != exp_samples$code_at_work) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$code_at_work)) +} +``` + + +#### RAP champs +Check data against figure, check proportions are correct +```{r echo = FALSE} +knitr::kable(tables$rap_champ_status) +``` + +Denominator check: +```{r echo = FALSE} + +if(tables$rap_champ_status$sample[1] != exp_samples$heard_of_RAP) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$heard_of_RAP)) +} +``` + + +#### RAP strategy knowledge +Check data against figure, check proportions are correct +```{r echo = FALSE} +knitr::kable(tables$strategy_knowledge) +``` + +Denominator check: +```{r echo = FALSE} + +if(tables$strategy_knowledge$sample[1] != exp_samples$heard_of_RAP) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$heard_of_RAP)) +} +``` + + +#### RAP opinions +Check data against figure, check proportions are correct +```{r} +knitr::kable(tables$rap_opinions) +``` + +Denominator check: +```{r echo = FALSE} + +if(tables$rap_opinions$sample[1] != exp_samples$heard_of_RAP) { + warning("Denominator different from expected") + print(paste0("Expected: ", exp_samples$heard_of_RAP)) + print(paste0("Actual: ", tables$rap_opinions$sample[1])) +} else { + print(paste0("Denominator as expected: ", exp_samples$heard_of_RAP)) +} +``` + +### Coding practices +Check data against figure, check proportions are correct +```{r} +knitr::kable(tables$rap_components) +``` + +Denominator check: +In this function, denominator is derived directly from data based on logic rules as below +```{r echo = FALSE} + +if(sum(data$code_freq != "Never", na.rm = TRUE) != exp_samples$code_at_work) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$code_at_work)) + +} +``` + +#### Coding
practices: frequency +Check data against figure, check proportions are correct +```{r} +knitr::kable(tables$coding_practices) +``` + +Denominator check: +```{r} + +if(tables$coding_practices$sample[1] != exp_samples$code_at_work) { + warning("Denominator different from expected") + print(paste0("Expected: ", exp_samples$code_at_work)) + print(paste0("Actual: ", tables$coding_practices$sample[1])) +} else { + print(paste0("Denominator as expected: ", exp_samples$code_at_work)) +} + +``` + +#### Documentation +Check data against figure, check proportions are correct +```{r} +knitr::kable(tables$doc) +``` + +Denominator check: +```{r} + +if(tables$doc$sample[1] != exp_samples$code_at_work) { + warning("Denominator different from expected") + print(paste0("Expected: ", exp_samples$code_at_work)) + print(paste0("Actual: ", tables$doc$sample[1])) +} else { + print(paste0("Denominator as expected: ", exp_samples$code_at_work)) +} + +``` diff --git a/quarto/main/_quarto.yml b/quarto/main/_quarto.yml index b0da09e..4d67527 100644 --- a/quarto/main/_quarto.yml +++ b/quarto/main/_quarto.yml @@ -3,7 +3,7 @@ project: output-dir: ../../docs/ website: - title: Coding in Analysis and Research Survey 2022 + title: Draft Coding in Analysis and Research Survey 2023 navbar: background: primary left: diff --git a/quarto/main/data_collection.qmd b/quarto/main/data_collection.qmd index b32ca4d..bfd13b6 100644 --- a/quarto/main/data_collection.qmd +++ b/quarto/main/data_collection.qmd @@ -8,27 +8,33 @@ library(magrittr) # Setup all_wave_data <- CARS::get_all_waves(mode = "file") -data <- CARS::get_tidy_data_file("2022_data.csv") %>% +data <- CARS::get_tidy_data_file("2023_data.csv") %>% CARS::rename_cols() %>% CARS::apply_skip_logic() %>% - CARS::clean_departments() %>% + CARS::clean_data() %>% CARS::derive_vars() ``` +:::{.callout-note} +Please note, these are the initial summary statistics for CARS 2023 and further analysis will follow. 
+We advise linking directly to this document when distributing to ensure the most up to date information. +::: + + # How we collect data -The Coding in Analysis and Research Survey (CARS) data collection takes place for approximately one month, every autumn. The survey is self-selecting and participation is voluntary. Launch dates vary slightly by year to maximise response rate, for example by avoiding clashes with other internal surveys. In 2022, data collection took place from 3 October to 11 November. +The Coding in Analysis and Research Survey (CARS) data collection takes place for approximately one month, every autumn. The survey is self-selecting and participation is voluntary. Launch dates vary slightly by year to maximise response rate, for example by avoiding clashes with other internal surveys. In 2023, data collection took place from 16 October to 4 December. -We invite analysts to participate in the survey using a variety of online channels, mailing lists, networks and newsletters. For the past three years, the most common source of data was through departmental Reproducible Analytical Pipeline (RAP) champions, who promote the survey in their organisations. We rely on various champion networks, Heads of Profession (HoPs) for analysis and Departmental Directors of Analysis (DDans) to promote the survey in their departments and encourage their analytical communities to participate. This means the response rate and any selection bias will vary across organisations. +We invite analysts to participate in the survey using a variety of online channels, mailing lists, networks and newsletters. For the past four years, the most common source of data has been through departmental Reproducible Analytical Pipeline (RAP) champions, who promote the survey in their organisations. 
We rely on various champion networks, Heads of Profession (HoPs) for analysis and Departmental Directors of Analysis (DDans) to promote the survey and encourage their analytical communities to participate. This means the response rate and any selection bias will vary across organisations. -Our promotional materials make it clear that we are interested in responses from all analysts, whether or not they use coding in their work. However, it may be the case that the survey attracts a disproportionate number of respondents who have an interest in coding and RAP. We advise against making strong inferences about differences between professions and departments or attempting to estimate real frequencies from the data because of these potential limitations. +Our promotional materials make it clear that we are interested in responses from all analysts, whether or not they use coding in their work. The survey may however attract a disproportionate number of respondents who have an interest in coding and RAP. We advise against making strong inferences about differences between professions and departments or attempting to estimate real frequencies from the data because of these potential limitations. -Lastly, while the survey is open to all public sector analysts, the vast majority of responses come from the UK and devolved Civil Service (`r round(sum(data$workplace == "Civil service, including devolved administations") / nrow(data) * 100, 1)`%). As such, follow-up questions on grade and profession applied only to civil servants. +Lastly, while the survey is open to all public sector analysts, the vast majority of responses come from the UK and devolved Civil Service (`r round(sum(data$workplace == "Civil service, including devolved administrations") / nrow(data) * 100, 1)`% in 2023). As such, follow-up questions on grade and profession applied only to civil servants. ## Where our data comes from -Link tracking allows us to see where responses are coming from. 
Links promoted by RAP champions were the most commonly used for the past three waves, but consistently account for fewer than half of responses. +Link tracking allows us to see where responses are coming from. Links promoted by RAP champions were the most commonly used for the past three waves, and accounted for over half of responses in 2023. ```{r} rename_list <- list( @@ -70,8 +76,19 @@ rename_list <- list( "Government digital DS slack" = "Slack", "GSS slack" = "Slack", "RAS mailing list/newsletter" = "ONS RAS mailing list", - "RAS mailing list" = "ONS RAS mailing" + "RAS mailing list" = "ONS RAS mailing", + "HoPs managers support network + GSG Teams Channel" = "HoP/DDan mailing list", + "HoPs weekly email" = "HoP/DDan mailing list", + "RAS newsletter" = "ONS RAS mailing list", + "AF newsletter" = "Profession newsletters/mailing lists", + "DDaT newsletter" = "Profession newsletters/mailing lists", + "GSR Friday Bulletin" = "Profession newsletters/mailing lists", + "GORS Newsletter" = "Profession newsletters/mailing lists", + "GSS Newsletter" = "Profession newsletters/mailing lists", + "RAP Champions Network" = "RAP champions", + "DATA SCIENCE SLACK" = "Slack" ) + all_wave_data$tracking_link %<>% dplyr::recode(!!!rename_list) links <- table(all_wave_data$tracking_link) @@ -86,12 +103,12 @@ tracking_link_freqs <- table(all_wave_data$year, all_wave_data$tracking_link) %> data.frame() # Reorder by 2022 frequencies -# As the dataset is ordered by year, the code below works out the correct order for the 2022 "block" and applies it to all three +# As the dataset is ordered by year, the code below works out the correct order for the 2023 "block" and applies it to all four -order <- rev(order(tracking_link_freqs$percent[17:24])) -tracking_link_freqs <- tracking_link_freqs[c(order, order+8, order+16) ,] +order <- rev(order(tracking_link_freqs$percent[25:32])) +tracking_link_freqs <- tracking_link_freqs[c(order, order+8, order+16, order+24) ,]
-CARS::df_to_table(tracking_link_freqs[c(2,1,5)], column_headers = c("Tracking link", "2020", "2021", "2022"), crosstab = T) +CARS::df_to_table(tracking_link_freqs[c(2,1,5)], column_headers = c("Tracking link", "2020", "2021", "2022", "2023"), crosstab = T) ``` @@ -134,7 +151,7 @@ code_freqs_by_year <- table(all_wave_data$year, all_wave_data$code_freq) %>% plot <- CARS::plot_stacked(code_freqs_by_year[c(1,2,5)], orientation = "v", type = "bar", font_size = 14, xlab = "Year") %>% plotly::layout(legend = list(traceorder = "reversed")) -table <- CARS::df_to_table(code_freqs_by_year[c(2,1,5)], crosstab = T, column_headers = c("In your current role, how often do you write code to complete your work objectives?", "2020", "2021", "2022")) +table <- CARS::df_to_table(code_freqs_by_year[c(2,1,5)], crosstab = T, column_headers = c("In your current role, how often do you write code to complete your work objectives?", "2020", "2021", "2022", "2023")) CARS::wrap_outputs("code-freq", plot, table) @@ -142,7 +159,7 @@ CARS::wrap_outputs("code-freq", plot, table) ## Grade -Across all waves, over 80% of Civil Service respondents reported that they are at H, S or Grade 7 grades. While this will be representative of the grade distribution of analysts in some government organisations, it may not be the case for all organisations. +Across all years, over 80% of Civil Service respondents reported that they are at H, S or Grade 7 grades. While this will be representative of the grade distribution of analysts in some government organisations, it may not be the case for all organisations. 
```{r} all_wave_data$CS_grade[all_wave_data$CS_grade == "Research Officer"] <- "Higher Executive Officer (or equivalent)" @@ -150,16 +167,16 @@ all_wave_data$CS_grade[all_wave_data$CS_grade == "Research Officer"] <- "Higher all_wave_data$CS_grade <- gsub(" \\(or equivalent\\)", "", all_wave_data$CS_grade) recode_list <- list( - "Administrative Officer" = "Administrative officer or executive officer", - "Executive Officer" = "Administrative officer or executive officer", + "Administrative Officer" = "Administrative officer or Executive officer", + "Executive Officer" = "Administrative officer or Executive officer", "Grade 6" = "Grade 6 or above", - "SCS Pay Band 1" = "Grade 6 or above" + "SCS Pay Band 1" = "Grade 6 or above" ) all_wave_data$CS_grade <- dplyr::recode(all_wave_data$CS_grade, !!!recode_list) all_wave_data$CS_grade <- factor(all_wave_data$CS_grade, levels = c( - "Administrative officer or executive officer", + "Administrative officer or Executive officer", "Higher Executive Officer", "Senior Executive Officer", "Grade 7", @@ -174,14 +191,14 @@ grade_freqs <- table(all_wave_data$year, all_wave_data$CS_grade) %>% data.frame( plot <- CARS::plot_stacked(grade_freqs, orientation = "v", font_size = 14, xlab = "Year", colour_scale = "gradient") %>% plotly::layout(legend = list(traceorder = "reversed")) -table <- CARS::df_to_table(grade_freqs[c(2,1,3)], crosstab = T, column_headers = c("Grade", "2020", "2021", "2022")) +table <- CARS::df_to_table(grade_freqs[c(2,1,3)], crosstab = T, column_headers = c("Grade", "2020", "2021", "2022", "2023")) CARS::wrap_outputs("grades-by-year", plot, table) ``` ## Profession -Below is a breakdown of the proportion of respondents in different Civil Service professions. These cover the [Analysis Function professions](https://analysisfunction.civilservice.gov.uk/about-us/frequently-asked-questions/) and do not apply outside of the civil service. 
The exception to this are data scientists who do not have an official government profession. They are included separately here to avoid skewing the data for other professions. Note that respondents can be members of more than one analytical profession. Profession data is difficult to compare across waves as these questions have changed in line with changes to the Analysis Function. +Below is a breakdown of the proportion of respondents in different Civil Service professions. These cover the [Analysis Function professions](https://analysisfunction.civilservice.gov.uk/about-us/frequently-asked-questions/) and do not apply outside of the civil service. The exceptions to this are data scientists and data engineers, who do not have an official government profession. They are included separately here to avoid skewing the data for other professions. Note that respondents can be members of more than one analytical profession. Profession data is difficult to compare across years as these questions have changed in line with changes to the Analysis Function. The CARS sample has high representation from statisticians compared with other professions. This again may be representative of some organisations but not all.
@@ -198,13 +215,14 @@ recode_vals <- c( "prof_GSG" = "Statisticians", "prof_DS" = "Data scientists", "prof_GSR" = "Social researchers", - "prof_CS_none" = "civil servant - no profession membership", + "prof_CS_none" = "Civil servant - no profession membership", "prof_GORS" = "Operational researchers", "prof_GES" = "Economists", "prof_DDAT" = "Digital, data and technology profession", "prof_CS_other" = "Civil servant - other profession", "prof_GAD" = "Actuaries", - "prof_geog" = "Georgraphers" + "prof_geog" = "Geographers", + "prof_DE" = "Data engineers" ) frequencies$Profession <- dplyr::recode(frequencies$Profession, !!!recode_vals) diff --git a/quarto/main/departments.qmd b/quarto/main/departments.qmd index ced2a62..3aba847 100644 --- a/quarto/main/departments.qmd +++ b/quarto/main/departments.qmd @@ -7,50 +7,41 @@ output: [comment]: <> (This document makes use of custom open and close tags for use with glue::glue() (" and ", respectively)) [comment]: <> (This document should only be edited as a template: quarto/templates/departments.qmd) -Below are links to profession-specific reports These contain summary statistics, filtered by department. Only departments with 20 or more respondents are included. +:::{.callout-note} +Please note, these are the initial summary statistics for CARS 2023 and further analysis will follow. +We advise linking directly to this document when distributing to ensure the most up to date information. +::: -- [Centre for Environment, Fisheries and Aquaculture Science](departments/centre-for-environment-fisheries-and-aquaculture-science.html) +Below are links to department-specific reports. These contain summary statistics, filtered by department. Only departments with 20 or more respondents are included. -- [Department for Business, Energy and Industrial Strategy (excl.
agencies)](departments/department-for-business-energy-and-industrial-strategy-excl-agencies-.html) +- [Department for Business & Trade](departments/department-for-business--trade.html) -- [Department for Digital, Culture, Media and Sport](departments/department-for-digital-culture-media-and-sport.html) +- [Department for Education (excl. agencies)](departments/department-for-education-excl-agencies-.html) -- [Department for Education](departments/department-for-education.html) +- [Department for Energy Security & Net Zero](departments/department-for-energy-security--net-zero.html) - [Department for Environment, Food and Rural Affairs (excl. agencies)](departments/department-for-environment-food-and-rural-affairs-excl-agencies-.html) - [Department for Environment, Food and Rural Affairs (including agencies)](departments/department-for-environment-food-and-rural-affairs-including-agencies-.html) -- [Department for International Trade](departments/department-for-international-trade.html) - -- [Department for Levelling Up, Housing and Communities](departments/department-for-levelling-up-housing-and-communities.html) +- [Department for Levelling Up, Housing & Communities (excl. agencies)](departments/department-for-levelling-up-housing--communities-excl-agencies-.html) - [Department for Transport (excl. agencies)](departments/department-for-transport-excl-agencies-.html) -- [Department for Work and Pensions](departments/department-for-work-and-pensions.html) - -- [Department of Health and Social Care (excl. agencies)](departments/department-of-health-and-social-care-excl-agencies-.html) +- [HM Land Registry](departments/hm-land-registry.html) -- [Forestry Commission](departments/forestry-commission.html) +- [HM Revenue & Customs (excl. agencies)](departments/hm-revenue--customs-excl-agencies-.html) -- [Government Actuary's Department](departments/government-actuary-s-department.html) +- [Home Office (excl. 
agencies)](departments/home-office-excl-agencies-.html) -- [HM Revenue and Customs](departments/hm-revenue-and-customs.html) - -- [Ministry of Defence](departments/ministry-of-defence.html) +- [Ministry of Defence (excl. agencies)](departments/ministry-of-defence-excl-agencies-.html) - [Ministry of Justice (excl. agencies)](departments/ministry-of-justice-excl-agencies-.html) -- [National Records of Scotland](departments/national-records-of-scotland.html) - -- [NHS](departments/nhs.html) - - [Northern Ireland Statistics and Research Agency](departments/northern-ireland-statistics-and-research-agency.html) - [Office for National Statistics](departments/office-for-national-statistics.html) -- [Public Health Scotland](departments/public-health-scotland.html) - - [Scottish Government (excl. agencies)](departments/scottish-government-excl-agencies-.html) - [UK Health Security Agency](departments/uk-health-security-agency.html) diff --git a/quarto/main/index.qmd b/quarto/main/index.qmd index 222087b..9fa5878 100644 --- a/quarto/main/index.qmd +++ b/quarto/main/index.qmd @@ -1,12 +1,17 @@ --- title: "Home" --- +:::{.callout-note} +Please note, these are the initial summary statistics for CARS 2023 and further analysis will follow. +We advise linking directly to this document when distributing to ensure the most up to date information. +::: ## How do analysts use programming and software engineering in the UK Public Sector? The Coding in Analysis and Research Survey (CARS) was developed to understand better how government analysts use programming. Most questions focused on whether and how analysts use coding in their work. The survey is also concerned with [Reproducible Analytical Pipelines (RAP)](#rap). -The survey was distributed to government analysts and researchers in the autumn of 2022. The results presented summarise the key findings from 1322 respondents from over 50 public sector organisations. 
You can find the 2021 results in [last year's publication](https://best-practice-and-impact.github.io/CARS-3/). +The survey was distributed to government analysts and researchers in the autumn of 2023. The results presented summarise the key findings from 1297 respondents from over 50 public sector organisations. + ## Who is this research for? @@ -16,7 +21,7 @@ This research was made with senior analysts and leaders in mind. It is intended ## How to use this research -Responding to CARS is voluntary. The results presented here are from a self-selecting sample of government analysts. We are unable to provide details about response rates by department or profession. Because respondents are self-selecting, the results we present only reflect the views of the analysts who participated. +Responding to CARS is voluntary. The results presented here are from a self-selecting sample of government analysts. Because respondents are self-selecting, the results we present only reflect the views of the analysts who participated. We cannot draw wider conclusions about the general population of government analysts from these results because the survey is not based on a random sample. @@ -34,14 +39,13 @@ For more detail, [see the data collection page](data_collection.qmd). ## More information on reproducible analytical pipelines {#rap} -[Reproducible Analytical Pipelines (RAP) MVP](https://github.com/best-practice-and-impact/rap_mvp_maturity_guidance/blob/master/Reproducible-Analytical-Pipelines-MVP.md) +[The Reproducible Analytical Pipelines Minimum Viable Product](https://github.com/best-practice-and-impact/rap_mvp_maturity_guidance/blob/master/Reproducible-Analytical-Pipelines-MVP.md) sets out the minimum expectations for what analysts must do for work to qualify as a RAP. 
[Quality assurance for coding in analysis and research](https://best-practice-and-impact.github.io/qa-of-code-guidance/) -[The Reproducible Analytical Pipelines Minimum Viable Product](https://github.com/best-practice-and-impact/rap_mvp_maturity_guidance/blob/master/Reproducible-Analytical-Pipelines-MVP.md) sets out the minimum expectations for what analysts must do for work to qualify as a RAP. - [RAP champions network page](https://analysisfunction.civilservice.gov.uk/support/reproducible-analytical-pipelines/reproducible-analytical-pipeline-rap-champions/) + ## Accessibility statement This accessibility statement applies to the Coding in Analysis and Research Survey report. Please note that this does not include third-party content that is referenced from this site. diff --git a/quarto/main/professions.qmd b/quarto/main/professions.qmd index dfe02cd..f00a2e2 100644 --- a/quarto/main/professions.qmd +++ b/quarto/main/professions.qmd @@ -7,14 +7,19 @@ output: [comment]: <> (This document makes use of custom open and close tags for use with glue::glue() (" and ", respectively)) [comment]: <> (This document should only be edited as a template: quarto/templates/professions.qmd) +:::{.callout-note} +Please note, these are the initial summary statistics for CARS 2023 and further analysis will follow. +We advise linking directly to this document when distributing to ensure the most up to date information. +::: + Below are links to profession-specific reports These contain summary statistics, filtered by profession. Only professions with 20 or more respondents are included. 
+- [Government data engineers](professions/data-engineers.html) + - [Government data scientists](professions/data-scientists.html) - [Digital and data profession (DDAT)](professions/digital-and-data.html) -- [Government actuary's department (GAD)](professions/government-actuarys-department.html) - - [Government economic service (GES)](professions/government-economic-service.html) - [Government geography profession](professions/government-geography.html) @@ -23,6 +28,6 @@ Below are links to profession-specific reports These contain summary statistics, - [Government social research (GSR)](professions/government-social-research.html) -- [Government statistician group (GSG)](professions/government-statician-group.html) +- [Government statistician group (GSG)](professions/government-statistician-group.html) diff --git a/quarto/main/summary.qmd b/quarto/main/summary.qmd index 551f2e7..93a1513 100644 --- a/quarto/main/summary.qmd +++ b/quarto/main/summary.qmd @@ -1,5 +1,5 @@ --- -title: "The state of UK public sector analysis code: 2022" +title: "The state of UK public sector analysis code: 2023" output: html: self-contained: true @@ -9,10 +9,10 @@ output: library(magrittr) -data <- CARS::get_tidy_data_file("2022_data.csv") %>% +data <- CARS::get_tidy_data_file("2023_data.csv") %>% CARS::rename_cols() %>% CARS::apply_skip_logic() %>% - CARS::clean_departments() %>% + CARS::clean_data() %>% CARS::derive_vars() all_wave_data <- CARS::get_all_waves(mode = "file") @@ -22,20 +22,23 @@ tables <- CARS::summarise_all(data, all_tables = TRUE) samples <- CARS::sample_sizes(data) ``` +:::{.callout-note} +Please note, these are the initial summary statistics for CARS 2023 and further analysis will follow. +We advise linking directly to this document when distributing to ensure the most up to date information. +::: + ## How to use this research -Responding to CARS is voluntary. The results presented here are from a self-selecting sample of government analysts. 
We are unable to provide details about response rates by department or profession. Because respondents are self-selecting, the results we present reflect the views of the analysts who participated. +Responding to CARS is voluntary. The results presented here are from a self-selecting sample of government analysts. Because respondents are self-selecting, the results we present reflect the views of the analysts who participated. For more detail, [see the data collection page](data_collection.qmd). ## Coding frequency and tools -### Most respondents regularly use code at work - -Over the past three years, most respondents reported coding regularly or all the time to complete work objectives. This may be in part due to respondent bias, where those with an interest in coding are more likely to respond to the survey. However, we can conclude that coding is now, and has been for at least a few years, a normal aspect of analysis work in the public sector for many analysts. +We asked all respondents "In your current role, how often do you write code to complete your work objectives?" 
-#### 2022 data +#### 2023 data ```{r} @@ -68,7 +71,7 @@ code_freqs_by_year <- table(all_wave_data$year, all_wave_data$code_freq) %>% plot <- CARS::plot_stacked(code_freqs_by_year[c(1,2,5)], orientation = "v", type = "bar", font_size = 14, xlab = "Year") %>% plotly::layout(legend = list(traceorder = "reversed")) -table <- CARS::df_to_table(code_freqs_by_year[c(2,1,5)], crosstab = T, column_headers = c("In your current role, how often do you write code to complete your work objectives?", "2020", "2021", "2022")) +table <- CARS::df_to_table(code_freqs_by_year[c(2,1,5)], crosstab = T, column_headers = c("In your current role, how often do you write code to complete your work objectives?", "2020", "2021", "2022", "2023")) CARS::wrap_outputs("code-freq", plot, table) @@ -76,44 +79,39 @@ CARS::wrap_outputs("code-freq", plot, table) ### Access to and knowledge of programming languages -Given a list of programming tools, we asked respondents to answer "Yes", "No" or "Don't know" for the following questions; - -- Do you know how to program with this tool to a level suitable for your work? -- Is this tool available to use for your work? +Given a list of programming tools, we asked all respondents if the tool was available to use for their work. Access to tools does not necessarily refer to official policy. Some analysts may have access to tools others cannot access within the same organisation. -Please note that capability in programming languages is self-reported here and was not objectively defined or tested - -#### Most respondents have access to open source tools - -More respondents reported having access to R and Python than any other programming language listed here. R, Python and SQL are the most accessible programming languages across government, ahead of well established licensed tools such as SPSS, SAS and Stata. 
- +### Access to coding tools ```{r} plot <- CARS::plot_stacked(tables$access, n = samples$all, xlab = "Programming tool", colour_scale = "3scale", font_size = 14) -table <- CARS::df_to_table(tables$access, n = samples$all, column_headers = c("Programming tool", "Yes", "Don't Know", "No"), crosstab = TRUE) +table <- CARS::df_to_table(tables$access, n = samples$all, column_headers = c("Programming tool", "Yes", "No", "Don't know"), crosstab = TRUE) CARS::wrap_outputs("access", plot, table) ``` -#### Open source tools have overtaken proprietary tools in capability -More respondents reported having the knowledge to use R, Python and SQL at work than any other coding tools, with SPSS being the most popular proprietary software. This shift towards open source tooling is in line with the [RAP strategy](https://analysisfunction.civilservice.gov.uk/policy-store/reproducible-analytical-pipelines-strategy/) and cross-government RAP standards. +Given the same list of programming tools, all respondents were asked if they knew how to program with the tool to a level suitable for their work, answering "Yes", "No" or "Not required for my work". + +Please note that capability in programming languages is self-reported here and was not objectively defined or tested. The statement "not required for my work" was similarly not defined. 
+### Knowledge of coding tools ```{r} plot <- CARS::plot_stacked(tables$knowledge, n = samples$all, xlab = "Programming tool", colour_scale = "3scale", font_size = 14) -table <- CARS::df_to_table(tables$knowledge, n = samples$all, column_headers = c("Programming tool", "Yes", "Don't Know", "No"), crosstab = TRUE) +table <- CARS::df_to_table(tables$knowledge, n = samples$all, column_headers = c("Programming tool", "Yes", "No", "Not required for my work"), crosstab = TRUE) CARS::wrap_outputs("knowledge", plot, table) ``` -#### Open source capability is increasing over time +### Open source capability over time -The proportion of respondents who report having the capability to use R and Python has been increasing over the past three waves of CARS. In contrast, the proportion who are able to use SAS, SPSS or Stata has been decreasing during this time. R and Python has risen dramatically in popularity in recent years due to their use in data science. Python in particular is now along the most used and [most popular programming languages globally](https://survey.stackoverflow.co/2022/), while both R and Python are in the top three most [popular programming languages for data science](https://www.kaggle.com/kaggle-survey-2022). +The proportion of respondents who report having the capability to use R and Python is shown alongside the proportion who are able to use SAS, SPSS or Stata, for the past four years of the survey. ```{r} @@ -139,11 +137,11 @@ CARS::wrap_outputs("tools-over-time", plot, table) ``` -#### Different professions have capability in different tools +### Professions' capability in different tools -Differences in preferred languages may lead to silos between analytical professions. For digital and data professionals, operational researchers, data scientists and geographers capability is highest in R, SQL and python. R is among the two top languages for capability for every analytical profession.
+Differences in preferred languages may lead to silos between analytical professions. Here we show the percentage of respondents reporting capability in different tools, within the different analytical professions. -However, proprietary tools tend to be more profession-specific. For example, economists have much higher stata capability than any other profession, while social researchers have the highest SPSS capability. Open source tooling offers better cross-profession as well as cross-department collaboration due to easier access to tools. +Please note that respondents might be members of more than one profession, and may report capability in more than one tool. ```{R} colnames(tables$languages_by_prof)[2] <- "Profession" @@ -152,13 +150,17 @@ tables$languages_by_prof[c(2,1,3)] %>% CARS::df_to_table(crosstab = T, proportio ``` -### Most respondents have access to git and know how to use it +### Access to and knowledge of git + -Access to git is generally high across government. However, many have access to git but do not have the capability to use it, meaning there is more work to do to ensure analysts are able to learn these skills. However, some departments have no access, or limited access to git (see department pages). +We asked respondents to answer "Yes", "No" or "Don't know" for the following questions: + +- Is git available to use in your work? +- Do you know how to use git to version-control your work? Please note these outputs include people who do not code at work. 
-#### Access to git +### Access to git ```{r} plot <- CARS::plot_freqs(tables$git_access, font_size = 14, n = samples$all, xlab = "Access to git") %>% CARS::set_axis_range(0, 1, axis = "y") @@ -169,19 +171,33 @@ CARS::wrap_outputs("git-access", plot, table) ``` -## Capability +### Knowledge of git +```{r} +plot <- CARS::plot_freqs(tables$git_knowledge, font_size = 14, n = samples$all, xlab = "Knowledge of git") %>% CARS::set_axis_range(0, 1, axis = "y") -### Most respondents first learned to code in education +table <- CARS::df_to_table(tables$git_knowledge, n = samples$all, column_headers = c("Response", "Percent")) + +CARS::wrap_outputs("git-knowledge", plot, table) + +``` + + +## Coding capability and change + +### Where respondents first learned to code + +Respondents with coding experience outside their current role were asked where they first learned to code. Those analysts who code in their current role but reported no other coding experience are included as having learned 'In current role'. Those who reported first learning to code outside of a work or educational environment were categorised as 'self-taught' based on free-text responses. + +These data only show where people first learned to code. They do not show all the settings in which they had learned to code, to what extent, or how long ago. -Half of respondents learned to code for the first time in education. Nevertheless, nearly a third reported learning code for the first time in public sector employment, mostly in their current role. These data show analysts are actively up-skilling in their roles and the organisation is able to draw experienced programmers from other parts of government, as well as those who leave education with some coding abilities. 
```{r} -plot <- CARS::plot_freqs(tables$where_learned, n = samples$can_code, +plot <- CARS::plot_freqs(tables$where_learned, n = samples$code_at_work, xlab = "Where learned", font_size = 14, orientation = "h") table <- CARS::df_to_table(tables$where_learned, - n = samples$can_code, + n = samples$code_at_work, column_headers = c("Where learned", "Percent")) @@ -189,9 +205,11 @@ CARS::wrap_outputs("where-learned", plot, table) ``` -### Most analysts' coding capabilities are actively improving +### Change in coding ability during current role + +We asked "Has your coding ability changed during your current role?" -Most respondents with prior coding experience reported that their coding capability has improved while in their current role. Again, this shows the Analysis Function is building capability in-house as well as recruiting analysts with existing capability. However, results also show a minority of respondents who feel they are losing their capability. As more analysts gain capability, it is important that existing skills are retained. +This question was only asked of respondents with coding experience outside of their current role. This means analysts who first learned to code in their current role are not included in the data. ```{r} @@ -208,34 +226,6 @@ CARS::wrap_outputs("ability-change", plot, table) ``` -### How often people write code is a strong predictor of capability change - -Management responsibility and civil service grade are both negatively correlated with improvements to coding capability. In other words, more senior analysts are more likely to report that their coding abilities are becoming worse. - -We used cross-government data to create an ordinal regression model. How often people write code was a very strong predictor at all levels (p \< 0.001) of whether respondents report improvement to their coding capability. 
Analysts who use code at work at least some of the time were 4.5 times more likely to report more improvement/less decline to their coding abilities than those who rarely or never wrote code at work. The effect was even greater for those who reported writing code regularly or all the time. - -Civil service grade had a smaller effect, with those at grade 7 or 6 reporting less improvement to their coding abilities (p \< .001), but no significant difference between those at HEO and SEO grades (p \> .05). Line management responsibility and civil service grade had no significant effect when the model includes coding frequency regardless of whether it involved managing others who code (p \> .05). - -These findings show that allowing analysts to continue coding is key to retaining and building on existing skills. While seniority is less predictive of capability change, it is correlated with how often people write code at work, meaning more senior analysts are more likely to lose their capability and less likely to build capability. 
- -```{r} -plot <- CARS::plot_likert(tables$capability_change_by_freq, mid = 3, - neutral_mid = TRUE, font_size = 14, - height = 600, width = "100%") - -table <- CARS::df_to_table(tables$capability_change_by_freq, - column_headers = c("Coding frequency", - "Significantly worse", - "Slightly worse", - "No change", - "Slightly better", - "Significantly better"), - crosstab = TRUE) - -CARS::wrap_outputs("capability-change-by-freq", plot, table) - -``` - ## Reproducible analytical pipelines (RAP) @@ -246,14 +236,14 @@ The following links contain more resources on RAP: * you can find minimum RAP standards in the [RAP MVP](rap_mvp_maturity_guidance/Reproducible-Analytical-Pipelines-MVP.md%20at%20master%20·%20best-practice-and-impact/rap_mvp_maturity_guidance%20·%20GitHub) * you can find guidance on quality assuring code in the [Duck Book](https://best-practice-and-impact.github.io/qa-of-code-guidance/intro.html) -### Awareness of RAP across government has increased +### Awareness of RAP over time + +We asked respondents who used coding at work, if they had heard of RAP. ```{r} freqs <- CARS::summarise_rap_awareness_over_time(all_wave_data) ``` -Over the past three years, awareness of RAP has increased year on year across the Analysis Function. In 2022, `r round(freqs[3, 5]*100, digits = 0) %>% paste0("%")` of respondents had heard of RAP, a `r round((freqs[3, 5]-freqs[1, 5])/freqs[1, 5]*100, digits = 0) %>% paste0("%")` increase. 
- ```{r} plot <- CARS::plot_freqs(freqs[c(2, 5)], type = "line", xlab = "Year", font_size = 14, error_y = CARS::set_error_bars(freqs$lower_ci, freqs$upper_ci)) %>% CARS::set_axis_range(0, 1, axis = "y") @@ -267,24 +257,30 @@ table <- CARS::df_to_table(freqs[c(2,5:7)], CARS::wrap_outputs("rap-awareness-over-time", plot, table) ``` -### Most respondents have heard of RAP champions -[RAP champions](https://analysisfunction.civilservice.gov.uk/support/reproducible-analytical-pipelines/reproducible-analytical-pipeline-rap-champions/) support and promote the use of RAP across government. Although most respondents who have heard of RAP had heard of the RAP champions' network, most did not know who their RAP champions are. More work is needed to increase awareness of the support available across government, including who its RAP champions are. +### RAP Champions + +We asked respondents who had heard of RAP, if their department has a RAP champion and if they know who it is. + +[RAP champions](https://analysisfunction.civilservice.gov.uk/support/reproducible-analytical-pipelines/reproducible-analytical-pipeline-rap-champions/) support and promote the use of RAP across government. Please [contact the analysis standards and pipelines team](mailto:asap@ons.gov.uk) for any enquiries about RAP or the champions network. 
+ ```{r} -plot <- CARS::plot_freqs(tables$rap_knowledge, n = samples$not_RAP_champ, break_q_names_col = "value", max_lines = 3, xlab = "Knowledge", font_size = 14, orientation = "h") -table <- CARS::df_to_table(tables$rap_knowledge, n = samples$not_RAP_champ, column_headers = c("Knowledge", "Percent")) +plot <- CARS::plot_freqs(tables$rap_champ_status, n = samples$heard_of_RAP, break_q_names_col = "value", max_lines = 2, xlab = "Department RAP champions?", font_size = 14, orientation = "h") +table <- CARS::df_to_table(tables$rap_champ_status, n = samples$heard_of_RAP, column_headers = c("Knowledge", "Percent")) -CARS::wrap_outputs("rap-knowledge", plot, table) +CARS::wrap_outputs("rap-champ-status", plot, table) ``` -### Most respondents have heard of the RAP strategy +### Awareness of RAP strategy + +We asked respondents who had heard of RAP, if they had heard of the RAP strategy. -The [Analysis Function RAP strategy](https://analysisfunction.civilservice.gov.uk/policy-store/reproducible-analytical-pipelines-strategy/) was released in June 2022 and sets out plans for adopting RAP across government. Although most respondents had not read the strategy, 77% of those who had heard of RAP were also aware of the RAP strategy. +The [Analysis Function RAP strategy](https://analysisfunction.civilservice.gov.uk/policy-store/reproducible-analytical-pipelines-strategy/) was released in June 2022 and sets out plans for adopting RAP across government. ```{r} plot <- CARS::plot_freqs(tables$strategy_knowledge, break_q_names_col = 1, max_lines = 3, font_size = 14, orientation = "v", n = samples$heard_of_RAP) @@ -299,13 +295,6 @@ CARS::wrap_outputs("RAP-strat", plot, table) We asked respondents who had heard of RAP whether they agreed with a series of statements. - -### Respondents see the benefits of RAP but most are not currently implementing it - -We asked respondents who had heard of RAP a series of questions about their opinions on RAP. 
The majority agreed with the statement "I think it is important to implement RAP in my work" (`r round(sum(data$RAP_important %in% c("Agree", "Strongly Agree")) / samples$heard_of_RAP * 100, 1) %>% paste0("%")`), but only only `r round(sum(data$RAP_implementing %in% c("Agree", "Strongly Agree")) / samples$heard_of_RAP * 100, 1) %>% paste0("%")` agreed they are currently implementing RAP. - -Similarly , around half of the respondents agreed on various statements on understanding RAP and having the support and resources to implement it. While awareness and buy-in is high, this highlights the need to ensure analysts are aware of the resources currently available, and for additional resources to be made available where gaps currently exist. - ```{r} plot <- CARS::plot_likert(tables$rap_opinions, @@ -332,13 +321,11 @@ CARS::wrap_outputs("rap-opinions", plot, table) ## Good coding practices -We asked respondents who reported writing code at work about the good practices they apply when writing code at work. These questions cover many of the coding practices recommended in the quality assurance of code for analysis and research guidance, as well as the [minimum RAP standards](rap_mvp_maturity_guidance/Reproducible-Analytical-Pipelines-MVP.md%20at%20master%20·%20best-practice-and-impact/rap_mvp_maturity_guidance%20·%20GitHub) set by the cross-government [RAP champions network](https://analysisfunction.civilservice.gov.uk/support/reproducible-analytical-pipelines/reproducible-analytical-pipeline-rap-champions/). +We asked respondents who reported writing code at work about the good practices they apply when writing code at work. 
These questions cover many of the coding practices recommended in the quality assurance of code for analysis and research guidance, as well as the [minimum RAP standards](https://github.com/best-practice-and-impact/rap_mvp_maturity_guidance/blob/master/Reproducible-Analytical-Pipelines-MVP.md) set by the cross-government [RAP champions network](https://analysisfunction.civilservice.gov.uk/support/reproducible-analytical-pipelines/reproducible-analytical-pipeline-rap-champions/). -### Respondents do not consistently apply RAP practices +Coding practices have been classified as either 'Basic' or 'Advanced'. Basic practices are those that make up the minimum RAP standards, while Advanced practices help improve reproducibility. The percentage of respondents who reported applying these practices either 'Regularly' or 'All the time' is shown below. -While many respondents make use of RAP practices, these are very inconsistently applied. The chart below present the frequency of respondents reporting that they apply these practices "regularly" or "all the time". For documentation, writing readme files was considered a minimum requirement. For dependency management and continuous integration respondents were only asked whether they use these at all. - -While most respondents who write code use open source software and peer review code regularly, this is not the case for other practices. Basic RAP practices, defined by the RAP champions network as being part of the [minimum RAP standards](rap_mvp_maturity_guidance/Reproducible-Analytical-Pipelines-MVP.md%20at%20master%20·%20best-practice-and-impact/rap_mvp_maturity_guidance%20·%20GitHub) are presented in blue. Among these, open sourcing code is particularly uncommon, despite being part of the digital service standard. 
+Open sourcing was defined as 'making code freely available to be modified and redistributed'. ```{r} @@ -363,9 +350,10 @@ CARS::wrap_outputs("rap-comp", plot, table) ``` -### Many have the capability to apply good practices, but do not always do so +### Consistency of good coding practices + +We asked respondents who reported writing code at work how frequently they apply good coding practices when writing code at work. -As the detailed breakdowns below show, analysts often apply these good practices some of the time. However, in most cases fewer than half responded "regularly" or "all the time", meaning they often do not use these despite having the capability to do so. ```{r} @@ -393,9 +381,11 @@ CARS::wrap_outputs("good-practices", plot, table) ``` -### Analysts rely primarily on code comments for documentation +### Code documentation + +We asked respondents who reported writing code at work how frequently they write different forms of documentation when programming in their current role. -Many analysts do not regularly document code in any form other than code comments. While code comments are useful, other forms of documentation are needed to ensure the code is easy to review and work with, and truly reproducible. +[Embedded documentation](https://best-practice-and-impact.github.io/qa-of-code-guidance/code_documentation.html) is one of the components which make up a RAP minimum viable product. Documentation is important to help others be clear on how to use the product and what the code is intended to do. ```{r} @@ -423,9 +413,4 @@ CARS::wrap_outputs("doc", plot, table) ``` -## Summary - -The findings above show that the Analysis Function has made great strides towards RAP adoption. Using code for analysis is widespread. Open source coding tools such as R, Python, SQL and git are widely available and used (although this varies by organisation). RAP awareness has increased dramatically and many analysts feel supported to apply RAP in their work. 
Capability is generally increasing, but there is work to do to ensure analysts can retain these coding skills, especially as they move to more senior positions. - -However, there is still much work to be done before RAP becomes the default way of working. While many analysts write code, few consistently apply RAP principles while doing so. Although most respondents see the value of RAP in their work and are actively implementing it, the data on good practices suggest that teams across the Analysis Function still have a work to do before consistently meeting the minimum RAP standards. diff --git a/quarto/templates/departments.qmd b/quarto/templates/departments.qmd index 9431da5..261c183 100644 --- a/quarto/templates/departments.qmd +++ b/quarto/templates/departments.qmd @@ -7,6 +7,11 @@ output: [comment]: <> (This document makes use of custom open and close tags for use with glue::glue() ("{{{" and "}}}", respectively)) [comment]: <> (This document should only be edited as a template: quarto/templates/departments.qmd) +:::{.callout-note} +Please note, these are the initial summary statistics for CARS 2023 and further analysis will follow. +We advise linking directly to this document when distributing to ensure the most up to date information. +::: + Below are links to profession-specific reports These contain summary statistics, filtered by department. Only departments with 20 or more respondents are included. 
{{{links}}} diff --git a/quarto/templates/professions.qmd b/quarto/templates/professions.qmd index 7de966b..2867a5f 100644 --- a/quarto/templates/professions.qmd +++ b/quarto/templates/professions.qmd @@ -7,6 +7,11 @@ output: [comment]: <> (This document makes use of custom open and close tags for use with glue::glue() ("{{{" and "}}}", respectively)) [comment]: <> (This document should only be edited as a template: quarto/templates/professions.qmd) +:::{.callout-note} +Please note, these are the initial summary statistics for CARS 2023 and further analysis will follow. +We advise linking directly to this document when distributing to ensure the most up to date information. +::: + Below are links to profession-specific reports These contain summary statistics, filtered by profession. Only professions with 20 or more respondents are included. {{{links}}} diff --git a/quarto/templates/summary.qmd b/quarto/templates/summary.qmd index cdf7232..2648cd3 100644 --- a/quarto/templates/summary.qmd +++ b/quarto/templates/summary.qmd @@ -11,10 +11,10 @@ output: library(magrittr) -data <- CARS::get_tidy_data_file("2022_data.csv") %>% +data <- CARS::get_tidy_data_file("2023_data.csv") %>% CARS::rename_cols() %>% CARS::apply_skip_logic() %>% - CARS::clean_departments() %>% + CARS::clean_data() %>% CARS::derive_vars() data <- {{{filter}}} @@ -25,9 +25,14 @@ samples <- CARS::sample_sizes(data) ``` +:::{.callout-note} +Please note, these are the initial summary statistics for CARS 2023 and further analysis will follow. +We advise linking directly to this document when distributing to ensure the most up to date information. +::: + ## How to use this research -Responding to CARS is voluntary. The results presented here are from a self-selecting sample of government analysts. We are unable to provide details about response rates by department or profession. Because respondents are self-selecting, the results we present reflect the views of the analysts who participated. 
+Responding to CARS is voluntary. The results presented here are from a self-selecting sample of government analysts. Because respondents are self-selecting, the results we present reflect the views of the analysts who participated. For more detail, [see the data collection page](../data_collection.qmd). @@ -37,6 +42,7 @@ For more detail, [see the data collection page](../data_collection.qmd). We asked respondents "In your current role, how often do you write code to complete your work objectives?" + ```{r} plot <- CARS::plot_freqs(tables$code_freq, n = samples$all, xlab = "Coding frequency", font_size = 14) @@ -46,50 +52,33 @@ CARS::wrap_outputs("coding-freq", plot, table) ``` - -### What code is being used for - -We asked respondents what data operations they carry out in their work, and whether they use code to do them. Please note, we did not ask how much of each data operation is done with code or how often. - -Respondents who don't do the operation at all have been removed. - -```{r} - -plot <- CARS::plot_stacked(tables$operations, xlab = "Operation", font_size = 14) -table <- CARS::df_to_table(tables$operations, column_headers = c("Operation", "I do some or all of this by coding (%)", "I do this without coding (%)"), crosstab = TRUE) - -CARS::wrap_outputs("operations", plot, table) - -``` - ### Access to and knowledge of programming languages -Given a list of programming tools, we asked respondents to answer "Yes", "No" or "Don't know" for the following questions; - -- Is this tool available to use for your work? -- Do you know how to program with this tool to a level suitable for your work? +Given a list of programming tools, we asked all respondents if the tool was available to use for their work. Access to tools does not necessarily refer to official policy. Some analysts may have access to tools others cannot access within the same organisation. 
-Please note that capability in programming languages is self-reported here and was not objectively defined or tested - ### Access to coding tools ```{r} plot <- CARS::plot_stacked(tables$access, n = samples$all, xlab = "Programming tool", colour_scale = "3scale", font_size = 14) -table <- CARS::df_to_table(tables$access, n = samples$all, column_headers = c("Programming tool", "Yes", "Don't Know", "No"), crosstab = TRUE) +table <- CARS::df_to_table(tables$access, n = samples$all, column_headers = c("Programming tool", "Yes", "No", "Don't Know"), crosstab = TRUE) CARS::wrap_outputs("access", plot, table) ``` -### Coding tool knowledge +Given the same list of programming tools, all respondents were asked if they knew how to program with the tool to a level suitable for their work, answering "Yes", "No" or "Not required for my work". + +Please note that capability in programming languages is self-reported here and was not objectively defined or tested. The statement "not required for my work" was similarly not defined. + +### Knowledge of coding tools ```{r} plot <- CARS::plot_stacked(tables$knowledge, n = samples$all, xlab = "Programming tool", colour_scale = "3scale", font_size = 14) -table <- CARS::df_to_table(tables$knowledge, n = samples$all, column_headers = c("Programming tool", "Yes", "Don't Know", "No"), crosstab = TRUE) +table <- CARS::df_to_table(tables$knowledge, n = samples$all, column_headers = c("Programming tool", "Yes", "No", "Not required for my work"), crosstab = TRUE) CARS::wrap_outputs("knowledge", plot, table) @@ -97,62 +86,64 @@ CARS::wrap_outputs("knowledge", plot, table) ### Access to and knowledge of git + We asked respondents to answer "Yes", "No" or "Don't know" for the following questions: - Is git available to use in your work? - Do you know how to use git to version-control your work? +Please note these outputs include people who do not code at work. 
+ ### Access to git ```{r} +plot <- CARS::plot_freqs(tables$git_access, font_size = 14, n = samples$all, xlab = "Access to git") %>% CARS::set_axis_range(0, 1, axis = "y") -plot <- CARS::plot_freqs(tables$git_access, n = samples$all, xlab = "git access", font_size = 14) -table <- CARS::df_to_table(tables$git_access, n = samples$all, column_headers = c("git access", "Percent")) +table <- CARS::df_to_table(tables$git_access, n = samples$all, column_headers = c("Response", "Percent")) CARS::wrap_outputs("git-access", plot, table) ``` -### Git knowledge - +### Knowledge of git ```{r} +plot <- CARS::plot_freqs(tables$git_knowledge, font_size = 14, n = samples$all, xlab = "Knowledge of git") %>% CARS::set_axis_range(0, 1, axis = "y") -plot <- CARS::plot_freqs(tables$git_knowledge, n = samples$all, xlab = "git knowledge", font_size = 14) -table <- CARS::df_to_table(tables$git_knowledge, n = samples$all, column_headers = c("git knowledge", "Percent")) +table <- CARS::df_to_table(tables$git_knowledge, n = samples$all, column_headers = c("Response", "Percent")) CARS::wrap_outputs("git-knowledge", plot, table) ``` -## Coding capability +## Coding capability and change -### Change in coding ability during current role +### Where respondents first learned to code -We asked "Has your coding ability changed during your current role?" +Respondents with coding experience outside their current role were asked where they first learned to code. Those analysts who code in their current role but reported no other coding experience are included as having learned 'In current role'. Those who reported first learning to code outside of a work or educational environment were categorised as 'self-taught' based on free-text responses. -This question was only asked of respondents with coding experience outside of their current role. This means analysts who first learned to code in their current role are not included in the data. +These data only show where people first learned to code. 
They do not show all the settings in which they had learned to code, to what extent, or how long ago. ```{r} -plot <- CARS::plot_freqs(tables$ability_change, n = samples$other_code_experience, xlab = "Ability change", font_size = 14) -table <- CARS::df_to_table(tables$ability_change, n = samples$other_code_experience, column_headers = c("Ability change", "Percent")) +plot <- CARS::plot_freqs(tables$where_learned, n = samples$code_at_work, xlab = "Where learned", font_size = 14, orientation = "h") +table <- CARS::df_to_table(tables$where_learned, n = samples$code_at_work, column_headers = c("Where learned", "Percent")) -CARS::wrap_outputs("ability-change", plot, table) +CARS::wrap_outputs("where-learned", plot, table) ``` -### Where respondents first learned to code +### Change in coding ability during current role -Respondents with coding experience outside their current role were asked where they first learned to code. Those analysts who code in their current role but reported no other coding experience, are included as having learned 'In current role'. +We asked "Has your coding ability changed during your current role?" -These data only show where people first learned to code. They do not show all the settings in which they had learned to code, to what extent, or how long ago. +This question was only asked of respondents with coding experience outside of their current role. This means analysts who first learned to code in their current role are not included in the data. 
```{r} -plot <- CARS::plot_freqs(tables$where_learned, n = samples$can_code, xlab = "Where learned", font_size = 14, orientation = "h") -table <- CARS::df_to_table(tables$where_learned, n = samples$can_code, column_headers = c("Where learned", "Percent")) +plot <- CARS::plot_freqs(tables$ability_change, n = samples$other_code_experience, xlab = "Ability change", font_size = 14) +table <- CARS::df_to_table(tables$ability_change, n = samples$other_code_experience, column_headers = c("Ability change", "Percent")) -CARS::wrap_outputs("where-learned", plot, table) +CARS::wrap_outputs("ability-change", plot, table) ``` @@ -161,8 +152,9 @@ CARS::wrap_outputs("where-learned", plot, table) We asked respondents who said they currently use code in their work, how often they carry out various coding practices. For more information on the practices presented below, please read our guidance on [Quality Assurance of Code for Analysis and Research](https://best-practice-and-impact.github.io/qa-of-code-guidance/intro.html) -### Good analysis coding practices +Open sourcing was defined as 'making code freely available to be modified and redistributed'. +### Consistency of good coding practices ```{r} plot <- CARS::plot_likert(tables$coding_practices, n = samples$code_at_work, @@ -187,6 +179,8 @@ CARS::wrap_outputs("good-practices", plot, table) ### Documentation +We asked respondents who reported writing code at work how frequently they write different forms of documentation when programming in their current role. + [Embedded documentation](https://best-practice-and-impact.github.io/qa-of-code-guidance/code_documentation.html) is one of the components which make up a RAP minimum viable product. Documentation is important to help others be clear on how to use the product and what the code is intended to do. 
```{r} @@ -211,6 +205,8 @@ CARS::wrap_outputs("doc", plot, table) ### Dependency Management +We asked respondents who reported writing code at work if they manage dependencies for their projects. + We provided examples of tools that may be used for dependency management: - Requirements files, e.g. python requirements.txt or R DESCRIPTION files @@ -228,7 +224,9 @@ CARS::wrap_outputs("dependency-management", plot, table) ### Continuous integration -As above, respondents were provided with examples of continuous integration technologies: +We asked respondents who reported writing code at work if they use continuous integration. + +We provided some examples of continuous integration technologies: - GitHub actions - Jenkins @@ -244,47 +242,62 @@ CARS::wrap_outputs("ci", plot, table) ``` ### Reproducible workflow packages -Respondents were asked whether they use reproducible workflow packages. Respondents were provided with the following examples: + +We asked respondents who reported writing code at work whether they use reproducible workflow packages. + +We provided some examples of packages: - drake - make -- pymake +- pymake +- targets ```{r} plot <- CARS::plot_freqs(tables$rep_workflow, n = samples$code_at_work, break_q_names_col = "value", xlab = "Use reproduciable workflow packages", font_size = 14) -table <- CARS::df_to_table(tables$rep_workflow, n = samples$code_at_work, column_headers = c("Use reproduciable workflow packages", "Percent")) +table <- CARS::df_to_table(tables$rep_workflow, n = samples$code_at_work, column_headers = c("Use reproducible workflow packages", "Percent")) CARS::wrap_outputs("rep-workflow", plot, table) ``` -## RAP knowledge and opinions +## Reproducible analytical pipelines (RAP) -We asked respondents about their knowledge of and opinions on [reproducible analytical pipelines (RAP)](https://gss.civilservice.gov.uk/reproducible-analytical-pipelines/). 
RAP refers to the use of practices from software engineering to make analysis more reproducible. These practices build on the advantages of writing analysis as code by ensuring increased quality, trust, efficiency, business continuity and knowledge management. The [RAP champions](https://analysisfunction.civilservice.gov.uk/support/reproducible-analytical-pipelines/reproducible-analytical-pipeline-rap-champions/) are a network of analysts across government who promote and support RAP development in their departments. +We asked respondents about their knowledge of and opinions on [reproducible analytical pipelines (RAP)](https://gss.civilservice.gov.uk/reproducible-analytical-pipelines/). RAP refers to the use of practices from software engineering to make analysis more reproducible. These practices build on the advantages of writing analysis as code by ensuring increased quality, trust, efficiency, business continuity and knowledge management. -### Knowledge of RAP +The [RAP champions](https://analysisfunction.civilservice.gov.uk/support/reproducible-analytical-pipelines/reproducible-analytical-pipeline-rap-champions/) are a network of analysts across government who promote and support RAP development in their departments. Please [contact the analysis standards and pipelines team](mailto:asap@ons.gov.uk) for any enquiries about RAP or the champions network. -We asked respondents: +The [Analysis Function RAP strategy](https://analysisfunction.civilservice.gov.uk/policy-store/reproducible-analytical-pipelines-strategy/) was released in June 2022 and sets out plans for adopting RAP across government. -- Have you heard of RAP? -- Do you know what a RAP champion is? -- Do you know who the RAP champion in your department is? +### Knowledge of RAP -Respondents who have neither access nor knowledge have been removed. +We asked respondents who reported writing code at work, if they had heard of RAP. 
```{r} -plot <- CARS::plot_freqs(tables$rap_knowledge, n = samples$not_RAP_champ, break_q_names_col = "value", max_lines = 3, xlab = "Knowledge", font_size = 14, orientation = "h") -table <- CARS::df_to_table(tables$rap_knowledge, n = samples$not_RAP_champ, column_headers = c("Knowledge", "Percent")) +plot <- CARS::plot_freqs(tables$rap_knowledge, n = samples$code_at_work, xlab = "Heard of RAP?", font_size = 14, orientation = "v") +table <- CARS::df_to_table(tables$rap_knowledge, n = samples$code_at_work, column_headers = c("Knowledge", "Percent")) CARS::wrap_outputs("rap-knowledge", plot, table) ``` -### Knowledge of RAP strategy +### RAP Champions + +We asked respondents who had heard of RAP, if their department has a RAP champion and if they know who it is. + +```{r} + +plot <- CARS::plot_freqs(tables$rap_champ_status, n = samples$heard_of_RAP, break_q_names_col = "value", max_lines = 2, xlab = "Heard of RAP champions?", font_size = 14, orientation = "h") +table <- CARS::df_to_table(tables$rap_champ_status, n = samples$heard_of_RAP, column_headers = c("Knowledge", "Percent")) + +CARS::wrap_outputs("rap-champ-status", plot, table) + +``` + +### Awareness of RAP strategy -We asked the respondents "What is your familiarity with the Analysis Function reproducible analytical pipelines (RAP) strategy?". +We asked respondents who had heard of RAP, if they had heard of the RAP strategy. ```{r} @@ -326,9 +339,7 @@ In this section we present RAP components and RAP scores. For each RAP component a percent positive was calculated. Positive responses were recorded where an answer of "regularly" or "all the time" was given. For documentation, a positive response was recorded if both code comments and README files questions received positive responses. For the continuous integration and dependency management components, responses of "yes" were recorded as positive. -RAP scores are then calculated for each respondent as the total of their positive responses. 
A score of 3 suggests that a respondent is implementing 3 components of RAP at least regularly. - -"Basic components" are the components which make up the [RAP MVP](https://github.com/best-practice-and-impact/rap_mvp_maturity_guidance/blob/master/Reproducible-Analytical-Pipelines-MVP.md). "Advanced components" are components which help improve reproducibility, but are not considered part of the minimum standard. +"Basic" components are the components which make up the [RAP MVP](https://github.com/best-practice-and-impact/rap_mvp_maturity_guidance/blob/master/Reproducible-Analytical-Pipelines-MVP.md). "Advanced" components are components which help improve reproducibility, but are not considered part of the minimum standard. ### RAP components @@ -353,27 +364,3 @@ table <- CARS::df_to_table(tables$rap_components, CARS::wrap_outputs("rap-comp", plot, table) ``` - -### Basic RAP scores - -```{r} - -bar_colour <- CARS::get_2colour_scale(2)[2][[1]] - -plot <- CARS::plot_freqs(tables$basic_rap_scores, n = samples$code_at_work, bar_colour = bar_colour, xlab = "Basic RAP scores", font_size = 14) -table <- CARS::df_to_table(tables$basic_rap_scores, n = samples$code_at_work, column_headers = c("Basic RAP scores", "Percent")) - -CARS::wrap_outputs("basic-rap-scores", plot, table) - -``` - -### Advanced RAP scores - -```{r} - -plot <- CARS::plot_freqs(tables$advanced_rap_scores, n = samples$code_at_work, xlab = "Advanced RAP scores", font_size = 14) -table <- CARS::df_to_table(tables$advanced_rap_scores, n = samples$code_at_work, column_headers = c("Advanced RAP scores", "Percent")) - -CARS::wrap_outputs("advanced-rap-scores", plot, table) - -``` diff --git a/tests/testthat/test-calculate_freqs.R b/tests/testthat/test-calculate_freqs.R index adc943f..3bdcc5f 100644 --- a/tests/testthat/test-calculate_freqs.R +++ b/tests/testthat/test-calculate_freqs.R @@ -17,6 +17,15 @@ expected <- data.frame(name = rep(c("Question 1", levels = c("test1", "test2", "test3")), n = c(1, 0, 0, 0, 
1, 0, 0.5, 0, 0.5)) +test_that("create_tidy_freq_table validation works", { + + expect_error(calculate_freqs(data = dummy_data, + questions = questions, + levels = levels), + "Missing input: labels needed for mutli-column frequencies.") + +}) + test_that("create_tidy_freq_table missing data is handled correctly", { got <- calculate_freqs(data = dummy_data, @@ -52,3 +61,36 @@ test_that("create_tidy_freq_table count output is as expected", { expect_equal(got, expected) }) + +test_that("create_tidy_freq_table single question proportion output is as expected", { + + questions <- "Q1" + + got <- calculate_freqs(data = dummy_data, + questions = questions, + levels = levels) + + expected <- data.frame(value = factor(c("test1", "test2", "test3"), + levels = c("test1", "test2", "test3")), + n = c(1, 0, 0)) + + expect_equal(got, expected) + +}) + +test_that("create_tidy_freq_table single question count output is as expected", { + + questions <- "Q1" + + got <- calculate_freqs(data = dummy_data, + questions = questions, + levels = levels, + prop = FALSE) + + expected <- data.frame(value = factor(c("test1", "test2", "test3"), + levels = c("test1", "test2", "test3")), + n = c(1, 0, 0)) + + expect_equal(got, expected) + +}) diff --git a/tests/testthat/test-clean_departments.R b/tests/testthat/test-clean_departments.R index d498f73..f487cba 100644 --- a/tests/testthat/test-clean_departments.R +++ b/tests/testthat/test-clean_departments.R @@ -3,32 +3,36 @@ test_that("clean_departments output is as expected", { dummy_data <- data.frame(department = c(NA, "test", - "test", + "Foreign, Commonwealth & Development Office (excl. agencies)", "Department for Environment, Food and Rural Affairs (excl. 
agencies)", + "Forestry Commission", + "Forest Research", + "Forestry England", "Animal and Plant Health Agency", "Centre for Environment, Fisheries and Aquaculture Science", "Rural Payments Agency", "Environment Agency", "Marine Management Organisation", "Natural England"), - other_department_name = c(NA, "Forest research", rep("test", 8)), - workplace = c(NA, "test", "NHS", rep("test", 7))) + workplace = c(NA, "NHS", rep("test", 11))) got <- clean_departments(dummy_data) expected <- data.frame(department = c(NA, - "Forestry Commission", "NHS", + "Foreign, Commonwealth and Development Office (excl. agencies)", "Department for Environment, Food and Rural Affairs (excl. agencies)", + "Forestry Commission", + "Forest Research", + "Forestry England", "Animal and Plant Health Agency", "Centre for Environment, Fisheries and Aquaculture Science", "Rural Payments Agency", "Environment Agency", "Marine Management Organisation", "Natural England"), - other_department_name = c(NA, "Forest research", rep("test", 8)), - workplace = c(NA, "test", "NHS", rep("test", 7)), - defra = c(FALSE, TRUE, FALSE, rep(TRUE, 7))) + workplace = c(NA, "NHS", rep("test", 11)), + defra = c(rep(FALSE, 3), rep(TRUE, 10))) expect_equal(got, expected) diff --git a/tests/testthat/test-derive_basic_rap_scores.R b/tests/testthat/test-derive_basic_rap_scores.R index de4ae7e..f5b5883 100644 --- a/tests/testthat/test-derive_basic_rap_scores.R +++ b/tests/testthat/test-derive_basic_rap_scores.R @@ -3,7 +3,7 @@ dummy_data <- data.frame(code_freq = c("Never", rep("Sometimes", 4)), prac_open_source_own = c(NA, "All the time", "Regularly", "Sometimes", "Rarely"), prac_version_control = c(NA, "Never" ,"All the time", "Regularly", "Sometimes"), prac_review = c(NA, "Rarely", "Never", "All the time", "Regularly"), - prac_AQUA_book = c(NA, "Sometimes", "Rarely", "Never", "All the time"), + prac_proportionate_QA = c(NA, "Sometimes", "Rarely", "Never", "All the time"), doc_readme = c(NA, "Regularly", "Sometimes", 
"Rarely", "Never"), doc_comments = c(NA, "All the time", "Regularly", "Sometimes", "Rarely")) @@ -11,7 +11,7 @@ test_that("derive_basic_rap_scores validation works", { dummy_data <- data.frame() - expect_error(derive_basic_rap_scores(dummy_data), "Unexpected input - missing column names: code_freq\nprac_use_open_source\nprac_open_source_own\nprac_version_control\nprac_review\nprac_AQUA_book\ndoc_comments\ndoc_readme") + expect_error(derive_basic_rap_scores(dummy_data), "Unexpected input - missing column names: code_freq\nprac_use_open_source\nprac_open_source_own\nprac_version_control\nprac_review\nprac_proportionate_QA\ndoc_comments\ndoc_readme") }) @@ -24,14 +24,14 @@ test_that("derive_basic_rap_scores output is as expected", { prac_open_source_own = c(NA, "All the time", "Regularly", "Sometimes", "Rarely"), prac_version_control = c(NA, "Never" ,"All the time", "Regularly", "Sometimes"), prac_review = c(NA, "Rarely", "Never", "All the time", "Regularly"), - prac_AQUA_book = c(NA, "Sometimes", "Rarely", "Never", "All the time"), + prac_proportionate_QA = c(NA, "Sometimes", "Rarely", "Never", "All the time"), doc_readme = c(NA, "Regularly", "Sometimes", "Rarely", "Never"), doc_comments = c(NA, "All the time", "Regularly", "Sometimes", "Rarely"), use_open_source_score = c(NA, 1, 0, 0, 0), open_code_score = c(NA, 1, 1, 0, 0), version_control_score = c(NA, 0, 1, 1, 0), peer_review_score = c(NA, 0, 0, 1, 1), - AQUA_book_score = c(NA, 0, 0, 0, 1), + proportionate_QA_score = c(NA, 0, 0, 0, 1), doc_score = c(NA, 1, 0, 0, 0), basic_rap_score = c(NA, 3, 2, 2, 2)) diff --git a/tests/testthat/test-freq_subplots.R b/tests/testthat/test-freq_subplots.R new file mode 100644 index 0000000..e468488 --- /dev/null +++ b/tests/testthat/test-freq_subplots.R @@ -0,0 +1,70 @@ +levels1 <- c("test1", "test2", "test3") +levels2 <- c(1, 2, 3) + +dummy_data <- data.frame(Q1 = factor(c(rep("test1", 3), + rep("test2", 3), + rep("test3", 3)), + levels = levels1), + Q2 = factor(c(rep(c(1,2,3), 
3)), + levels = levels2), + n = c(0, 0, 1, 1, 0, 0, 0, 1, 0)) + + +testthat::test_that("validity check works", + { + testthat::expect_error( + freq_subplots(dummy_data, xlab = "x", ylab = "y", + height = 20, width = 20, nrows = 1), + "Unexpected input: n_rows should be 2 or greater.") + }) + + +got <- freq_subplots(dummy_data, xlab = "x", ylab = "y", + height = 500, width = 300, nrows = 3, + y_margin = .3, x_margin = .3, orientation = "v") + +for(i in 1:length(unique(dummy_data[[2]]))){ + j = 2*i - 1 + testthat::test_that("expected outputs for vertical chart achieved", + { + # x values + testthat::expect_equal(c(got$x$data[[j]]$x), factor(levels1, levels = levels1)) + + # y values + testthat::expect_equal(c(got$x$data[[j]]$y), dummy_data[dummy_data$Q2 == i, "n"]) + + # Bar colors + testthat::expect_equal(got$x$data[[j]]$marker$color, "#004556") + + # Plot orientation + testthat::expect_equal(got$x$data[[j]]$orientation, "v") + + # Title + testthat::expect_equal(got$x$data[[j]]$title, factor(i, levels = c(1, 2, 3))) + }) +} + +got <- freq_subplots(dummy_data, xlab = "x", ylab = "y", + height = 500, width = 300, nrows = 3, + y_margin = .3, x_margin = .3, orientation = "h") + +for(i in 1:length(unique(dummy_data[[2]]))){ + j = 2*i - 1 + testthat::test_that("expected outputs for horizontal chart achieved", + { + # x values + testthat::expect_equal(c(got$x$data[[j]]$y), factor(levels1, levels = rev(levels1))) + + # y values + testthat::expect_equal(c(got$x$data[[j]]$x), dummy_data[dummy_data$Q2 == i, "n"]) + + # Bar colors + testthat::expect_equal(got$x$data[[j]]$marker$color, "#004556") + + # Plot orientation + testthat::expect_equal(got$x$data[[j]]$orientation, "h") + + # Title + testthat::expect_equal(got$x$data[[j]]$title, factor(i, levels = c(1, 2, 3))) + }) +} diff --git a/tests/testthat/test-plot_freqs.R b/tests/testthat/test-plot_freqs.R new file mode 100644 index 0000000..39e842d --- /dev/null +++ b/tests/testthat/test-plot_freqs.R @@ -0,0 +1,91 @@ + 
+dummy_data <- data.frame(Q1 = factor(c("This is test number one", "This is test 2", "test3"), + levels = c("This is test number one", "This is test 2", "test3")), + n = c(0.5, 0.2, 0.8)) + + +testthat::test_that("validity checks work", + { + testthat::expect_error(plot_freqs(dummy_data, colour = c("blue", "green")), "Unexpected input - colour should be a single colour name.") + testthat::expect_error(plot_freqs(dummy_data, colour = 1), "Unexpected input - colour should be a single colour name.") + testthat::expect_error(plot_freqs(as.list(dummy_data)), "Unexpected input - data is not a data.frame.") + testthat::expect_error(plot_freqs(dplyr::mutate(dummy_data, Q2 = Q1)), "Unexpected input - data does not contain two columns.") + testthat::expect_error(plot_freqs(dplyr::mutate(dummy_data, n = as.character(n))), "Unexpected input - data column 2 is not numeric.") + testthat::expect_error(plot_freqs(dummy_data, xlab = 1), "Unexpected input - labels should be single character strings.") + testthat::expect_error(plot_freqs(dummy_data, ylab = 1), "Unexpected input - labels should be single character strings.") + testthat::expect_error(plot_freqs(dummy_data, xlab = c("1", "2")), "Unexpected input - labels should be single character strings.") + testthat::expect_error(plot_freqs(dummy_data, ylab = c("1", "2")), "Unexpected input - labels should be single character strings.") + testthat::expect_error(plot_freqs(dummy_data, font_size = "x"), "Unexpected input - font_size is not numeric.") + }) + +testthat::test_that("expected outputs for vertical chart achieved", + { + + got <- plot_freqs(dummy_data, n = 100, xlab = "x", ylab = "y", + break_q_names_col = TRUE, + type = "bar", + orientation = "v") + + # x and y values + testthat::expect_equal(c(got$x$attrs[[1]]$x), factor(c("This is test
number one", "This is test
2", "test3"), + levels = c("This is test
number one", "This is test
2", "test3"))) + testthat::expect_equal(c(got$x$attrs[[1]]$y), c(0.5, 0.2, 0.8)) + + # Bar colors + testthat::expect_equal(got$x$attrs[[1]]$marker$color, "#004556") + + # Plot orientation + testthat::expect_equal(got$x$attrs[[1]]$orientation, "v") + + # Sample size + testthat::expect_equal(got$x$layoutAttrs[[1]]$annotations$text, "Sample size = 100") + + # Axis labels + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$title, "x") + testthat::expect_equal(got$x$layoutAttrs[[2]]$annotations$text, "y") + + # Font sizes + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$titlefont$size, 14.4) + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$tickfont$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$yaxis$titlefont$size, 14.4) + testthat::expect_equal(got$x$layoutAttrs[[1]]$yaxis$tickfont$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$hoverlabel$font$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$annotations$font$size, 12) + + }) + +testthat::test_that("expected outputs for horizontal chart achieved", + { + + got <- plot_freqs(dummy_data, n = 100, xlab = "x", ylab = "y", + break_q_names_col = TRUE, + type = "line", + orientation = "h") + + # x and y values + testthat::expect_equal(c(got$x$attrs[[1]]$y), factor(c("This is test
number one", "This is test
2", "test3"), + levels = rev(c("This is test
number one", "This is test
2", "test3")))) + testthat::expect_equal(c(got$x$attrs[[1]]$x), c(0.5, 0.2, 0.8)) + + # Bar colors + testthat::expect_equal(got$x$attrs[[1]]$marker$color, "#004556") + + # Plot orientation + testthat::expect_equal(got$x$attrs[[1]]$orientation, "h") + + # Sample size + testthat::expect_equal(got$x$layoutAttrs[[1]]$annotations$text, "Sample size = 100") + + # Axis labels + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$title, "y") + testthat::expect_equal(got$x$layoutAttrs[[2]]$annotations$text, "x") + + # Font sizes + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$titlefont$size, 14.4) + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$tickfont$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$yaxis$titlefont$size, 14.4) + testthat::expect_equal(got$x$layoutAttrs[[1]]$yaxis$tickfont$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$hoverlabel$font$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$annotations$font$size, 12) + + }) diff --git a/tests/testthat/test-plot_grouped.R b/tests/testthat/test-plot_grouped.R new file mode 100644 index 0000000..71ffd11 --- /dev/null +++ b/tests/testthat/test-plot_grouped.R @@ -0,0 +1,83 @@ +levels1 <- c("This is test number one", "This is test 2", "test3", "test4", "test5") + +dummy_data <- data.frame(Q1 = factor(levels1, + levels = levels1), + Q2 = c(1, 1, 1, 2, 2), + n = c(0.2, 0.5, 0.3, 0.8, 0.6)) + + +testthat::test_that("validity checks work", + { + testthat::expect_error(plot_grouped(as.list(dummy_data)), "Unexpected input - data is not a data.frame.") + testthat::expect_error(plot_grouped(dplyr::mutate(dummy_data, Q3 = Q1)), "Unexpected input - data does not contain 3 columns.") + testthat::expect_error(plot_grouped(dplyr::mutate(dummy_data, n = as.character(n))), "Unexpected input - data column 3 is not numeric.") + testthat::expect_error(plot_grouped(dummy_data, xlab = 1), "Unexpected input - labels should be single character strings.") + 
testthat::expect_error(plot_grouped(dummy_data, ylab = 1), "Unexpected input - labels should be single character strings.") + testthat::expect_error(plot_grouped(dummy_data, xlab = c("1", "2")), "Unexpected input - labels should be single character strings.") + testthat::expect_error(plot_grouped(dummy_data, ylab = c("1", "2")), "Unexpected input - labels should be single character strings.") + testthat::expect_error(plot_grouped(dummy_data, font_size = "x"), "Unexpected input - font_size is not numeric.") + }) + +testthat::test_that("expected outputs for vertical chart achieved", + { + + got <- plot_grouped(dummy_data, n = 100, xlab = "x", ylab = "y", + break_q_names_col = TRUE, + orientation = "v") + + # x and y values + testthat::expect_equal(c(got$x$attrs[[1]]$x), factor(c("This is test
number one", "This is test
2", "test3", "test4", "test5"), + levels = c("This is test
number one", "This is test
2", "test3", "test4", "test5"))) + testthat::expect_equal(c(got$x$attrs[[1]]$y), c(0.2, 0.5, 0.3, 0.8, 0.6)) + + # Bar colors + testthat::expect_equal(got$x$attrs[[1]]$marker$color, c("#FF6900", "#FF6900", "#FF6900", "#004556", "#004556")) + + # Sample size + testthat::expect_equal(got$x$layoutAttrs[[1]]$annotations$text, "Sample size = 100") + + # Axis labels + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$title, "x") + testthat::expect_equal(got$x$layoutAttrs[[2]]$annotations$text, "y") + + # Font sizes + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$titlefont$size, 14.4) + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$tickfont$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$yaxis$titlefont$size, 14.4) + testthat::expect_equal(got$x$layoutAttrs[[1]]$yaxis$tickfont$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$hoverlabel$font$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$annotations$font$size, 12) + + }) + +testthat::test_that("expected outputs for horizontal chart achieved", + { + + got <- plot_grouped(dummy_data, n = 100, xlab = "x", ylab = "y", + break_q_names_col = TRUE, + orientation = "h") + + # x and y values + testthat::expect_equal(c(got$x$attrs[[1]]$y), factor(rev(c("This is test
number one", "This is test
2", "test3", "test4", "test5")), + levels = rev(c("This is test
number one", "This is test
2", "test3", "test4", "test5")))) + testthat::expect_equal(c(got$x$attrs[[1]]$x), rev(c(0.2, 0.5, 0.3, 0.8, 0.6))) + + # Bar colors + testthat::expect_equal(got$x$attrs[[1]]$marker$color, rev(c("#FF6900", "#FF6900", "#FF6900", "#004556", "#004556"))) + + # Sample size + testthat::expect_equal(got$x$layoutAttrs[[1]]$annotations$text, "Sample size = 100") + + # Axis labels + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$title, "y") + testthat::expect_equal(got$x$layoutAttrs[[2]]$annotations$text, "x") + + # Font sizes + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$titlefont$size, 14.4) + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$tickfont$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$yaxis$titlefont$size, 14.4) + testthat::expect_equal(got$x$layoutAttrs[[1]]$yaxis$tickfont$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$hoverlabel$font$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$annotations$font$size, 12) + + }) diff --git a/tests/testthat/test-plot_likert.R b/tests/testthat/test-plot_likert.R new file mode 100644 index 0000000..ca13a44 --- /dev/null +++ b/tests/testthat/test-plot_likert.R @@ -0,0 +1,55 @@ + +dummy_data <- data.frame(Q1 = rep(c("test1", + "test2", + "test3"), each = 3), + Q2 = factor(rep(c(1, 2, 3), 3), + levels = c(1, 2, 3)), + n = c(0.1, 0.3, 0.6, 0.5, 0.25, 0.25, 0.33, 0.33, 0.34)) + +testthat::test_that("validity checks work", + { + testthat::expect_error(plot_likert(as.list(dummy_data)), "Unexpected input - data is not a data.frame.") + testthat::expect_error(plot_likert(dplyr::mutate(dummy_data, Q3 = Q1)), "Unexpected input - data should have at three columns.") + testthat::expect_error(plot_likert(dummy_data, xlab = 1), "Unexpected input - labels should be single character strings.") + testthat::expect_error(plot_likert(dummy_data, ylab = 1), "Unexpected input - labels should be single character strings.") + testthat::expect_error(plot_likert(dummy_data, xlab = c("1", "2")), "Unexpected 
input - labels should be single character strings.") + testthat::expect_error(plot_likert(dummy_data, ylab = c("1", "2")), "Unexpected input - labels should be single character strings.") + testthat::expect_error(plot_likert(dummy_data, font_size = "x"), "Unexpected input - font_size is not numeric.") + testthat::expect_error(plot_likert(dummy_data, neutral_mid = "x"), "Unexpected input - neutral_mid is not logical.") + testthat::expect_error(plot_likert(dummy_data, mid = "2"), "Unexpected input - mid is not numeric.") + testthat::expect_error(plot_likert(dummy_data, mid = 1), "Unexpected input - mid is smaller than 2.") + testthat::expect_error(plot_likert(dummy_data, mid = 3), "Unexpected input - mid >= the number of answers.") + }) + +got <- plot_likert(dummy_data, mid = 2, n = 100, xlab = "x", ylab = "y") + + +testthat::test_that("expected outputs achieved", + { + # x and y values + testthat::expect_equal(c(got$x$attrs[[1]]$y), factor(rep(c("test1", "test2", "test3"), each = 3), + levels = c("test3", "test2", "test1"))) + testthat::expect_equal(c(got$x$attrs[[1]]$x), c(0.1, 0.3, 0.6, 0.5, 0.25, 0.25, 0.33, 0.33, 0.34)) + + # Bar colors + testthat::expect_equal(got$x$attrs[[1]]$marker$color, c("#004556", "#004556", "#004556", "#AFAFAF", "#AFAFAF", "#AFAFAF", "#FF6900", "#FF6900", "#FF6900")) + + # Plot orientation + testthat::expect_equal(got$x$attrs[[1]]$orientation, "h") + + # Sample size + testthat::expect_equal(got$x$layoutAttrs[[1]]$annotations$text, "Sample size = 100") + + # Axis labels + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$title, "x") + testthat::expect_equal(got$x$layoutAttrs[[2]]$annotations$text, "y") + + # Font sizes + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$titlefont$size, 14.4) + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$tickfont$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$yaxis$titlefont$size, 14.4) + testthat::expect_equal(got$x$layoutAttrs[[1]]$yaxis$tickfont$size, 12) + 
testthat::expect_equal(got$x$layoutAttrs[[1]]$hoverlabel$font$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$annotations$font$size, 12) + + }) diff --git a/tests/testthat/test-plot_stacked.R b/tests/testthat/test-plot_stacked.R new file mode 100644 index 0000000..f45ae7c --- /dev/null +++ b/tests/testthat/test-plot_stacked.R @@ -0,0 +1,93 @@ + +dummy_data <- data.frame(Q1 = rep(c("This is test number one", + "This is test 2", + "test3"), each = 3), + Q2 = factor(rep(c(1, 2, 3), 3), + levels = c(1, 2, 3)), + n = c(0.1, 0.3, 0.6, 0.5, 0.25, 0.25, 0.33, 0.33, 0.34)) + +testthat::test_that("validity checks work", + { + testthat::expect_error(plot_stacked(as.list(dummy_data)), "Unexpected input - data is not a data.frame.") + testthat::expect_error(plot_stacked(dplyr::mutate(dummy_data, Q3 = Q1)), "Unexpected input - data should have three columns.") + testthat::expect_error(plot_stacked(dummy_data, xlab = 1), "Unexpected input - labels should be single character strings.") + testthat::expect_error(plot_stacked(dummy_data, ylab = 1), "Unexpected input - labels should be single character strings.") + testthat::expect_error(plot_stacked(dummy_data, xlab = c("1", "2")), "Unexpected input - labels should be single character strings.") + testthat::expect_error(plot_stacked(dummy_data, ylab = c("1", "2")), "Unexpected input - labels should be single character strings.") + testthat::expect_error(plot_stacked(dummy_data, font_size = "x"), "Unexpected input - font_size is not numeric.") + }) + +testthat::test_that("expected outputs for horizontal chart achieved", + { + + got <- plot_stacked(dummy_data, n = 100, xlab = "x", ylab = "y", + orientation = "h", + break_q_names_col = TRUE,) + + # x and y values + testthat::expect_equal(c(got$x$attrs[[1]]$y), factor(rep(c("This is test
number one", "This is test
2", "test3"), each = 3), + levels = c("test3", "This is test
2", "This is test
number one"))) + testthat::expect_equal(c(got$x$attrs[[1]]$x), c(0.1, 0.3, 0.6, 0.5, 0.25, 0.25, 0.33, 0.33, 0.34)) + + # Bar colors + testthat::expect_equal(got$x$attrs[[1]]$marker$color, c("#004556", "#004556", "#004556", "#AFAFAF", "#AFAFAF", "#AFAFAF", "#FF6900", "#FF6900", "#FF6900")) + + # Plot orientation + testthat::expect_equal(got$x$attrs[[1]]$orientation, "h") + + # Sample size + testthat::expect_equal(got$x$layoutAttrs[[1]]$annotations$text, "Sample size = 100") + + # Axis labels + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$title, "y") + testthat::expect_equal(got$x$layoutAttrs[[2]]$annotations$text, "x") + + # Font sizes + testthat::expect_equal(got$x$layoutAttrs[[1]]$legend$font$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$titlefont$size, 14.4) + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$tickfont$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$yaxis$titlefont$size, 14.4) + testthat::expect_equal(got$x$layoutAttrs[[1]]$yaxis$tickfont$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$hoverlabel$font$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$annotations$font$size, 12) + + }) + + + +testthat::test_that("expected outputs for vertical chart achieved", + { + + got <- plot_stacked(dummy_data, n = 100, xlab = "x", ylab = "y", + orientation = "v", + type = "line", + break_q_names_col = TRUE,) + + # x and y values + testthat::expect_equal(c(got$x$attrs[[1]]$x), factor(rep(c("This is test
number one", "This is test
2", "test3"), each = 3), + levels = rev(c("test3", "This is test
2", "This is test
number one")))) + testthat::expect_equal(c(got$x$attrs[[1]]$y), c(0.1, 0.3, 0.6, 0.5, 0.25, 0.25, 0.33, 0.33, 0.34)) + + # Bar colors + testthat::expect_equal(got$x$attrs[[1]]$marker$color, c("#004556", "#004556", "#004556", "#AFAFAF", "#AFAFAF", "#AFAFAF", "#FF6900", "#FF6900", "#FF6900")) + + # Plot orientation + testthat::expect_equal(got$x$attrs[[1]]$orientation, "v") + + # Sample size + testthat::expect_equal(got$x$layoutAttrs[[1]]$annotations$text, "Sample size = 100") + + # Axis labels + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$title, "x") + testthat::expect_equal(got$x$layoutAttrs[[2]]$annotations$text, "y") + + # Font sizes + testthat::expect_equal(got$x$layoutAttrs[[1]]$legend$font$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$titlefont$size, 14.4) + testthat::expect_equal(got$x$layoutAttrs[[1]]$xaxis$tickfont$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$yaxis$titlefont$size, 14.4) + testthat::expect_equal(got$x$layoutAttrs[[1]]$yaxis$tickfont$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$hoverlabel$font$size, 12) + testthat::expect_equal(got$x$layoutAttrs[[1]]$annotations$font$size, 12) + + }) diff --git a/tests/testthat/test-plot_utils.R b/tests/testthat/test-plot_utils.R new file mode 100644 index 0000000..370f815 --- /dev/null +++ b/tests/testthat/test-plot_utils.R @@ -0,0 +1,68 @@ + + +testthat::test_that("expected outputs achieved", { + + dummy_data <- data.frame(Q1 = rep(c("test1", + "test2", + "test3"), each = 3), + Q2 = factor(rep(c(1, 2, 3), 3), + levels = c(1, 2, 3)), + n = c(0.1, 0.3, 0.6, 0.5, 0.25, + 0.25, 0.33, 0.33, 0.34)) + + testthat::expect_equal(calculate_bases(dummy_data, mid = 2, neutral_mid = TRUE), + c(-0.250, -0.150, 0.150, -0.625, -0.125, + 0.125, -0.495, -0.165, 0.165)) + +}) + + +testthat::test_that("expected outputs achieved", { + + dummy_data <- data.frame(Q1 = factor(c("test1", "test2", "test3"), + levels = c("test1", "test2", "test3")), + n = c(0.5, 0.2, 0.8)) + + got <- 
plot_freqs(dummy_data, n = 100, xlab = "x", ylab = "y") + + axis_got <- set_axis_range(got, min = 0.1, max = 0.8, axis = "y") + + testthat::expect_equal(axis_got$x$layoutAttrs[[3]]$yaxis$range[[1]], 0.1) + testthat::expect_equal(axis_got$x$layoutAttrs[[3]]$yaxis$range[[2]], 0.8) + +}) + + +testthat::test_that("expected outputs achieved", { + testthat::expect_equal(create_y_lab("y", 12), + list(text = "y", + y = 1, + x = "min", + showarrow = FALSE, + yshift = 24, + xref = "paper", + yref = "paper", + font = list(size = 12 * 1.2))) + +}) + + +testthat::test_that("expected outputs achieved", { + testthat::expect_equal(axis_settings("x", "y", 12), + list( + scale_axis = list( + title = "y", + tickfont = list(size = 12), + titlefont = list(size = 12 * 1.2), + tickformat = ".0%", + title = "Percent" + ), + cat_axis = list( + title = "x", + tickfont = list(size = 12), + titlefont = list(size = 12 * 1.2) + ) + ) + ) + +}) diff --git a/tests/testthat/test-question-routing.R b/tests/testthat/test-question-routing.R index 2bff9bc..80e2da6 100644 --- a/tests/testthat/test-question-routing.R +++ b/tests/testthat/test-question-routing.R @@ -1,5 +1,5 @@ test_that("check_skip_logic returns failing rows from one skipped column", { - dummy_data <- data.frame(cond_col = c(T, T, F, F), skipped_col = c("test", NA, "test", NA)) + dummy_data <- data.frame(cond_col = c(F, F, T, T), skipped_col = c("test", NA, "test", NA)) got <- check_skip_logic(dummy_data, dummy_data$cond, "skipped_col") @@ -9,7 +9,7 @@ test_that("check_skip_logic returns failing rows from one skipped column", { }) test_that("check_skip_logic returns failing rows from multiple skipped columns", { - dummy_data <- data.frame(cond_col = c(T, T, T, F, F, F), + dummy_data <- data.frame(cond_col = c(F, F, F, T, T, T), skipped_col1 = c(NA, NA, NA, "test", NA, NA), skipped_col2 = c(NA, NA, NA, NA, "test", NA), skipped_col3 = c(NA, NA, NA, NA, NA, "test")) @@ -23,26 +23,26 @@ test_that("check_skip_logic returns failing rows from 
multiple skipped columns", test_that("enforce_skip_logic replaces failing rows from one skipped column", { - dummy_data <- data.frame(cond_col = c(T, T, F, F), + dummy_data <- data.frame(cond_col = c(F, F, T, T), skipped_col = c("test", NA, "test", NA)) got <- enforce_skip_logic(dummy_data, dummy_data$cond, "skipped_col") - expected <- data.frame(cond_col = c(T, T, F, F), + expected <- data.frame(cond_col = c(F, F, T, T), skipped_col = c("test", NA, NA, NA)) expect_equal(expected, got) }) test_that("enforce_skip_logic replaces failing rows from multiple skipped columns", { - dummy_data <- data.frame(cond_col = c(T, T, T, F, F, F), + dummy_data <- data.frame(cond_col = c(F, F, F, T, T, T), skipped_col1 = c(NA, "test", "test", NA, "test", "test"), skipped_col2 = c(NA, NA, "test", NA, NA, "test"), skipped_col3 = c(NA, NA, NA, NA, NA, NA)) got <- enforce_skip_logic(dummy_data, dummy_data$cond, c("skipped_col1", "skipped_col2", "skipped_col3")) - expected <- data.frame(cond_col = c(T, T, T, F, F, F), + expected <- data.frame(cond_col = c(F, F, F, T, T, T), skipped_col1 = c(NA, "test", "test", NA, NA, NA), skipped_col2 = c(NA, NA, "test", NA, NA, NA), skipped_col3 = c(NA, NA, NA, NA, NA, NA)) @@ -50,112 +50,12 @@ test_that("enforce_skip_logic replaces failing rows from multiple skipped column expect_equal(expected, got) }) +test_that("apply_skip_logic replaces all relevant columns with NAs", { + dummy_data <- rename_cols(cars_dummy_data) -questions = c("workplace", - "department", - "highest_qualification", - "code_freq", - "other_coding_experience", - "prev_coding_experience", - "heard_of_RAP") - -conditions = c("NHS", - "Companies House", - "Any other qualification", - "Never", - "No", - "No", - "No") - -data_template <- data.frame(workplace = rep("test", 7), - CS_grade = rep("test", 7), - department = rep("test", 7), - other_department_name = rep("test", 7), - prof_DS = rep("test", 7), - prof_DDAT = rep("test", 7), - prof_GAD = rep("test", 7), - prof_GES = rep("test", 
7), - prof_geog = rep("test", 7), - prof_GORS = rep("test", 7), - prof_GSR = rep("test", 7), - prof_GSG = rep("test", 7), - prof_CS_none = rep("test", 7), - prof_CS_other = rep("test", 7), - ONS_directorate = rep("test", 7), - highest_qualification = rep("test", 7), - qual_1_subject = rep("test", 7), - qual_1_level = rep("test", 7), - qual_1_learn_code = rep("test", 7), - qual_2_subject = rep("test", 7), - qual_2_level = rep("test", 7), - qual_2_learn_code = rep("test", 7), - qual_3_subject = rep("test", 7), - qual_3_level = rep("test", 7), - qual_3_learn_code = rep("test", 7), - code_freq = rep("test", 7), - other_coding_experience = rep("test", 7), - coding_ability_change = rep("test", 7), - prev_coding_experience = rep("test", 7), - first_learned = rep("test", 7), - heard_of_RAP = rep("test", 7), - know_RAP_champ = rep("test", 7), - strategy_knowledge = rep("test", 7), - RAP_confident = rep("test", 7), - RAP_supported = rep("test", 7), - RAP_resources = rep("test", 7), - RAP_components = rep("test", 7), - RAP_important = rep("test", 7), - RAP_implementing = rep("test", 7), - RAP_planning = rep("test", 7), - RAP_comments = rep("test", 7), - prac_use_open_source = rep("test", 7), - prac_open_source_own = rep("test", 7), - prac_version_control = rep("test", 7), - prac_review = rep("test", 7), - prac_functions = rep("test", 7), - prac_unit_test = rep("test", 7), - prac_package = rep("test", 7), - prac_dir_structure = rep("test", 7), - prac_style = rep("test", 7), - prac_automated_QA = rep("test", 7), - prac_AQUA_book = rep("test", 7), - doc_comments = rep("test", 7), - doc_functions = rep("test", 7), - doc_readme = rep("test", 7), - doc_desk_notes = rep("test", 7), - doc_registers = rep("test", 7), - doc_AQA_logs = rep("test", 7), - doc_flow_charts = rep("test", 7), - doc_other = rep("test", 7), - CI = rep("test", 7), - dep_management = rep("test", 7), - reproducible_workflow = rep("test", 7), - misc_coding = rep("test", 7)) - -skipped_cols <- 
list(colnames(data_template)[which(colnames(data_template) == "CS_grade"):which(colnames(data_template) == "ONS_directorate")], - "ONS_directorate", - colnames(data_template)[which(colnames(data_template) == "qual_1_subject"):which(colnames(data_template) == "qual_3_learn_code")], - colnames(data_template)[which(colnames(data_template) == "prac_use_open_source"):which(colnames(data_template) == "misc_coding")], - colnames(data_template)[which(colnames(data_template) == "coding_ability_change"):which(colnames(data_template) == "first_learned")], - "first_learned", - colnames(data_template)[which(colnames(data_template) == "know_RAP_champ"):which(colnames(data_template) == "RAP_comments")]) - -dummy_data <- data_template -expected <- data_template - -for(i in 1:length(questions)) { - - dummy_data[i, questions[i]] <- conditions[i] - expected[i, questions[i]] <- conditions[i] - - expected[i, skipped_cols[[i]]] <- NA - - test_that(sprintf("apply_skip_logic replaces relevant skipped question enteries with NAs where %s is '%s.'", questions[i], conditions[i]), { - - got <- apply_skip_logic(data = dummy_data) - - expect_equal(got, expected) - - }) - -} + got <- apply_skip_logic(dummy_data) + + expected <- rename_cols(cars_dummy_data_clean) + + expect_equal(expected, got) +}) diff --git a/tests/testthat/test-summarise_ability_change.R b/tests/testthat/test-summarise_ability_change.R index 32e2b01..781d75d 100644 --- a/tests/testthat/test-summarise_ability_change.R +++ b/tests/testthat/test-summarise_ability_change.R @@ -1,10 +1,19 @@ -dummy_data <- data.frame(coding_ability_change = c(NA, - rep("Significantly worse", 2), - rep("Slightly worse", 3), - rep("No change", 4), - rep("Slightly better", 5), - rep("Significantly better", 6))) +dummy_data <- data.frame(first_learned =rep(c(NA, + "Current employment", + "Education", + "Previous public sector employment", + "Previous private sector employment", + "Other"), + times = 6), + coding_ability_change = rep(c(NA, + "It has 
become significantly worse", + "It has become slightly worse", + "It has stayed the same", + "It has become slightly better", + "It has become significantly better"), + each = 6) +) test_that("summarise_ability_change validation works", { @@ -28,15 +37,15 @@ test_that("summarise_ability_change output is as expected", { expected <- data.frame(value = factor(c("Significantly worse", "Slightly worse", - "No change", + "Stayed the same", "Slightly better", "Significantly better"), levels = c("Significantly worse", "Slightly worse", - "No change", + "Stayed the same", "Slightly better", "Significantly better")), - n=c(0.10, 0.15, 0.20, 0.25, 0.30)) + n=c(0.2, 0.2, 0.2, 0.2, 0.2)) expect_equal(got, expected) }) diff --git a/tests/testthat/test-summarise_cap_change_by_CS_grade.R b/tests/testthat/test-summarise_cap_change_by_CS_grade.R new file mode 100644 index 0000000..a6bbcc7 --- /dev/null +++ b/tests/testthat/test-summarise_cap_change_by_CS_grade.R @@ -0,0 +1,64 @@ +dummy_data <- data.frame( + CS_grade = c( + NA, + rep("Higher Executive Officer (or equivalent)", 10), + rep("Senior Executive Officer (or equivalent)", 10), + rep("Grade 7 (or equivalent)", 5), + rep("Grade 6 (or equivalent)", 5)), + coding_ability_change = c( + NA, + rep(c( + "It has become significantly worse", + "It has become slightly worse", + "It has stayed the same", + "It has become slightly better", + "It has become significantly better"), + times = 6)) + ) + +test_that("summarise_cap_change_by_CS_grade missing data is handled correctly", { + + got <- summarise_cap_change_by_CS_grade(dummy_data) + + expect_false(any(is.na.data.frame(got))) + +}) + +test_that("summarise_cap_change_by_CS_grade output is as expected", { + + got <- summarise_cap_change_by_CS_grade(dummy_data) + + expected <- data.frame( + CS_grade = factor(rep(c( + "Higher Executive Officer (or equivalent)", + "Senior Executive Officer (or equivalent)", + "Grade 6 and 7"), + each = 5), + levels = c( + "Higher Executive Officer (or 
equivalent)", + "Senior Executive Officer (or equivalent)", + "Grade 6 and 7")), + coding_ability_change = factor(rep(c( + "It has become significantly worse", + "It has become slightly worse", + "It has stayed the same", + "It has become slightly better", + "It has become significantly better"), + times = 3), + levels = c( + "It has become significantly worse", + "It has become slightly worse", + "It has stayed the same", + "It has become slightly better", + "It has become significantly better")), + n = c( + 0.20, 0.20, 0.20, 0.20, 0.20, + 0.20, 0.20, 0.20, 0.20, 0.20, + 0.20, 0.20, 0.20, 0.20, 0.20) + ) + + + expect_equal(got, expected) + + +}) diff --git a/tests/testthat/test-summarise_cap_change_by_freq.R b/tests/testthat/test-summarise_cap_change_by_freq.R index 49afbbb..2af697d 100644 --- a/tests/testthat/test-summarise_cap_change_by_freq.R +++ b/tests/testthat/test-summarise_cap_change_by_freq.R @@ -1,16 +1,33 @@ -dummy_data <- data.frame(coding_ability_change = c(NA, - rep("Significantly worse", 2), - rep("Slightly worse", 3), - rep("No change", 4), - rep("Slightly better", 5), - rep("Significantly better", 6)), - code_freq = c(NA, - rep("Sometimes", 6), - rep("All the time", 5), - rep("Never", 3), - rep("Rarely", 4), - rep("Regularly", 2))) +dummy_data <- data.frame( + coding_ability_change = rep(c( + NA, + "It has become significantly worse", + "It has become slightly worse", + "It has stayed the same", + "It has become slightly better", + "It has become significantly better"), + each = 90), + code_freq = rep(c( + NA, + "Sometimes", + "All the time", + "Rarely", + "Regularly"), + times = 108), + other_coding_experience = rep(c( + NA, + "Yes", + "No"), + times = 180), + first_learned = rep(c(NA, + "Current employment", + "Education", + "Previous public sector employment", + "Previous private sector employment", + "Other"), + times =90) + ) test_that("summarise_cap_change_by_freq missing data is handled correctly", { @@ -24,31 +41,37 @@ 
test_that("summarise_cap_change_by_freq output is as expected", { got <- summarise_cap_change_by_freq(dummy_data) - expected <- data.frame(code_freq = factor(c(rep("Never", 5), - rep("Rarely", 5), - rep("Sometimes", 5), - rep("Regularly", 5), - rep("All the time", 5)), - levels = c("Never", - "Rarely", - "Sometimes", - "Regularly", - "All the time")), - coding_ability_change = factor(rep(c("Significantly worse", - "Slightly worse", - "No change", - "Slightly better", - "Significantly better"), 5), - levels = c("Significantly worse", - "Slightly worse", - "No change", - "Slightly better", - "Significantly better")), - n = c(0.00, 0.00, 0.00, 1.00, 0.00, - 0.00, 0.00, 0.00, 0.00, 1.00, - 1/3, 0.50, 1/6, 0.00, 0.00, - 0.00, 0.00, 0.00, 0.00, 1.00, - 0.00, 0.00, 0.60, 0.40, 0.00)) + expected <- data.frame( + + code_freq = factor(rep(c( + "Rarely", + "Sometimes", + "Regularly", + "All the time"), + each = 5), + levels = c( + "Rarely", + "Sometimes", + "Regularly", + "All the time")), + + coding_ability_change = factor(rep(c( + "It has become significantly worse", + "It has become slightly worse", + "It has stayed the same", + "It has become slightly better", + "It has become significantly better"), + times = 4), + levels = c( + "It has become significantly worse", + "It has become slightly worse", + "It has stayed the same", + "It has become slightly better", + "It has become significantly better")), + + n = rep(1/5, times = 20) + + ) expect_equal(got, expected) diff --git a/tests/testthat/test-summarise_cap_change_by_line_manage.R b/tests/testthat/test-summarise_cap_change_by_line_manage.R new file mode 100644 index 0000000..1f7ab8f --- /dev/null +++ b/tests/testthat/test-summarise_cap_change_by_line_manage.R @@ -0,0 +1,66 @@ + +dummy_data <- data.frame( + + management = rep(c( + NA, + "Yes", + "No - I manage people who do not write code", + "No - I don't line manage anyone"), + each = 6), + + coding_ability_change = rep(c( + NA, + "It has become significantly worse", 
+ "It has become slightly worse", + "It has stayed the same", + "It has become slightly better", + "It has become significantly better"), + times=4) + +) + +test_that("summarise_cap_change_by_line_manage missing data is handled correctly", { + + got <- summarise_cap_change_by_line_manage(dummy_data) + + expect_false(any(is.na.data.frame(got))) + +}) + +test_that("summarise_cap_change_by_line_manage output is as expected", { + + got <- summarise_cap_change_by_line_manage(dummy_data) + + expected <- data.frame( + + management = factor(rep(c( + "Yes", + "No - I manage people who do not write code", + "No - I don't line manage anyone"), + each=5), + levels = c( + "Yes", + "No - I manage people who do not write code", + "No - I don't line manage anyone")), + + coding_ability_change = factor(rep(c( + "It has become significantly worse", + "It has become slightly worse", + "It has stayed the same", + "It has become slightly better", + "It has become significantly better"), + times = 3), + levels = c( + "It has become significantly worse", + "It has become slightly worse", + "It has stayed the same", + "It has become slightly better", + "It has become significantly better")), + + n = rep(0.2, times = 15) + + ) + + expect_equal(got, expected) + +}) diff --git a/tests/testthat/test-summarise_coding_practices.R b/tests/testthat/test-summarise_coding_practices.R index 21dbf8e..6963ff6 100644 --- a/tests/testthat/test-summarise_coding_practices.R +++ b/tests/testthat/test-summarise_coding_practices.R @@ -1,18 +1,23 @@ -dummy_data <- data.frame(prac_use_open_source = c(rep("Never", 3), rep("Sometimes", 2), rep(NA, 1)), - prac_open_source_own = c(rep("Sometimes", 3), rep("I don't understand this question", 2), rep("All the time", 1)), - prac_version_control = c(rep("Rarely", 3), rep("All the time", 2), rep("Never", 1)), - prac_review = c(rep("Regularly", 3), rep("All the time", 2), rep("Never", 1)), - prac_functions = c(rep("I don't understand this question", 3), rep("Never", 2), 
rep("Rarely", 1)), - prac_unit_test = c(rep("All the time", 3), rep("Rarely", 2), rep("Never", 1)), - prac_package = c(rep("Never", 3), rep("Sometimes", 2), rep("Rarely", 1)), - prac_dir_structure = c(rep("Sometimes", 3), rep("Rarely", 2), rep("Never", 1)), - prac_style = c(rep("Rarely", 3), rep("Never", 2), rep("Sometimes", 1)), - prac_automated_QA = c(rep("Regularly", 3), rep("Sometimes", 2), rep("Never", 1)), - prac_AQUA_book = c(rep("I don't understand this question", 3), rep("Never", 2), rep("Sometimes", 1))) +answers <- c("I don't understand this question", "Never", "Rarely", "Sometimes", "Regularly", "All the time") + +dummy_data <- data.frame(prac_use_open_source = answers, + prac_open_source_own = answers, + prac_version_control = answers, + prac_review = answers, + prac_functions = answers, + prac_unit_test = answers, + prac_package = answers, + prac_dir_structure = answers, + prac_style = answers, + prac_automated_QA = answers, + prac_development_QA = answers, + prac_proportionate_QA = answers) test_that("summarise_coding_practises missing data is handled correctly", { + dummy_data[1,1] <- NA + got <- summarise_coding_practices(dummy_data) expect_false(any(is.na.data.frame(got))) @@ -33,30 +38,11 @@ test_that("summarise_coding_practises output is as expected", { "Standard directory structure", "Coding guidelines / Style guides", "Automated data quality assurance", - "Apply AQUA book principles with analysis code")), each=6), - value = factor(rep(c("I don't understand this question", - "Never", - "Rarely", - "Sometimes", - "Regularly", - "All the time"), 11), - levels = c("I don't understand this question", - "Never", - "Rarely", - "Sometimes", - "Regularly", - "All the time")), - n = c(1/2, 1/3, 0, 1/6, 0, 0, - 0, 1/6, 0, 1/3, 1/2, 0, - 0, 1/6, 0, 0, 1/2, 1/3, - 0, 1/3, 1/2, 1/6, 0, 0, - 1/2, 1/3, 1/6, 0, 0, 0, - 1/3, 0, 0, 1/2, 0, 1/6, - 0, 1/2, 1/6, 1/3, 0, 0, - 0, 1/6, 1/3, 1/2, 0, 0, - 0, 1/6, 1/3, 0, 0, 1/2, - 0, 3/5, 0, 2/5, 0, 0, - 0, 1/6, 1/2, 0, 
0, 1/3)) + "Quality assurance throughout development", + "Proportionate quality assurance")), each=6), + value = factor(rep(answers, 12), + levels = answers), + n = rep(1/6, times=72)) expect_equal(got, expected) diff --git a/tests/testthat/test-summarise_coding_tools.R b/tests/testthat/test-summarise_coding_tools.R index fb45143..4288e4b 100644 --- a/tests/testthat/test-summarise_coding_tools.R +++ b/tests/testthat/test-summarise_coding_tools.R @@ -1,36 +1,29 @@ # Coding tools frequency tables (access or knowledge) dummy_data <- data.frame( - knowledge_R = c("Yes", rep("No", 2), rep("Don't Know", 3)), - access_R = c(rep("Yes", 2), "No", rep("Don't Know", 3)), - knowledge_SQL = c(rep("Yes", 3), rep("No", 2), "Don't Know"), - access_SQL = c("Yes", rep("No", 3), rep("Don't Know", 2)), - knowledge_SAS = c(rep("Yes", 2), rep("No", 3), "Don't Know"), - access_SAS = c(rep("Yes", 3), "No", rep("Don't Know", 2)), - knowledge_VBA = c("Yes", rep("No", 2), rep("Don't Know", 3)), - access_VBA = c(rep("Yes", 2), "No", rep("Don't Know", 3)), - knowledge_python = c(rep("Yes", 3), rep("No", 2), "Don't Know"), - access_python = c("Yes", rep("No", 3), rep("Don't Know", 2)), - knowledge_SPSS = c(rep("Yes", 2), rep("No", 3), "Don't Know"), - access_SPSS = c(rep("Yes", 3), "No", rep("Don't Know", 2)), - knowledge_stata = c("Yes", rep("No", 2), rep("Don't Know", 3)), - access_stata = c(rep("Yes", 2), "No", rep("Don't Know", 3)), - knowledge_JS = c(rep("Yes", 3), rep("No", 2), "Don't Know"), - access_JS = c("Yes", rep("No", 3), rep("Don't Know", 2)), - knowledge_java = c(rep("Yes", 2), rep("No", 3), "Don't Know"), - access_java = c(rep("Yes", 3), "No", rep("Don't Know", 2)), - knowledge_C = c("Yes", rep("No", 2), rep("Don't Know", 3)), - access_C = c(rep("Yes", 2), "No", rep("Don't Know", 3)), - knowledge_matlab = c(rep("Yes", 3), rep("No", 2), "Don't Know"), - access_matlab = c("Yes", rep("No", 5), rep("Don't Know", 0)) # Used to check zero counts aren't missing + knowledge_R = c("Yes", 
"No", "Not required for my work"), + access_R = c("Yes", "No", "Don't know"), + knowledge_SQL = c("Yes", "No", "Not required for my work"), + access_SQL = c("Yes", "No", "Don't know"), + knowledge_SAS = c("Yes", "No", "Not required for my work"), + access_SAS = c("Yes", "No", "Don't know"), + knowledge_VBA = c("Yes", "No", "Not required for my work"), + access_VBA = c("Yes", "No", "Don't know"), + knowledge_python = c("Yes", "No", "Not required for my work"), + access_python = c("Yes", "No", "Don't know"), + knowledge_SPSS = c("Yes", "No", "Not required for my work"), + access_SPSS = c("Yes", "No", "Don't know"), + knowledge_stata = c("Yes", "No", "Not required for my work"), + access_stata = c("Yes", "No", "Don't know"), + knowledge_matlab = c("Yes", "No", "No"), + access_matlab = c("Yes", "No", "No") # Used to check zero counts aren't missing ) test_that("summarise_coding_tools missing data is handled correctly", { - dummy_data_missing_values <- dummy_data - dummy_data_missing_values[1,] <- NA + dummy_data[1,] <- NA - got_missing <- summarise_coding_tools(dummy_data_missing_values, "knowledge") + got_missing <- summarise_coding_tools(dummy_data, "knowledge") expect_false(any(is.na(got_missing))) @@ -40,22 +33,18 @@ test_that("summarise_coding_tools knowledge output is as expected", { got_knowledge <- summarise_coding_tools(dummy_data, "knowledge") - expected_knowledge <- data.frame("name" = rep(c("C++ / C#", - "Java / Scala", - "Javascript / Typescript", - "Matlab", - "Python", - "R", - "SAS", - "SPSS", - "SQL", - "Stata", - "VBA"), each=3), - "value" = factor(rep(c("Yes", "Don't Know", "No"), 11), - levels = c("Yes", "Don't Know", "No")), - "n" = c(1/6, 1/2, 1/3, 1/3, 1/6, 1/2, 1/2, 1/6, 1/3, 1/2, 1/6, - 1/3, 1/2, 1/6, 1/3, 1/6, 1/2, 1/3, 1/3, 1/6, 1/2, 1/3, - 1/6, 1/2, 1/2, 1/6, 1/3, 1/6, 1/2, 1/3, 1/6, 1/2, 1/3)) + expected_knowledge <- data.frame(name = rep(c("Python", + "R", + "SQL", + "Matlab", + "SAS", + "SPSS", + "Stata", + "VBA"), each=3), + value = 
factor(rep(c("Yes", "No", "Not required for my work"), 8), + levels = c("Yes", "No", "Not required for my work")), + n = c(rep(1/3, times=9), 1/3, 2/3, 0, rep(1/3, times=12)) + ) expect_equal(got_knowledge, expected_knowledge) @@ -65,22 +54,18 @@ test_that("summarise_coding_tools access output is as expected", { got_access <- summarise_coding_tools(dummy_data, "access") - expected_access <- data.frame("name" = rep(c("C++ / C#", - "Java / Scala", - "Javascript / Typescript", - "Matlab", - "Python", - "R", - "SAS", - "SPSS", - "SQL", - "Stata", - "VBA"), each=3), - "value" = factor(rep(c("Yes", "Don't Know", "No"), 11), - levels = c("Yes", "Don't Know", "No")), - "n" = c(1/3, 1/2, 1/6, 1/2, 1/3, 1/6, 1/6, 1/3, 1/2, 1/6, 0, - 5/6, 1/6, 1/3, 1/2, 1/3, 1/2, 1/6, 1/2, 1/3, 1/6, 1/2, - 1/3, 1/6, 1/6, 1/3, 1/2, 1/3, 1/2, 1/6, 1/3, 1/2, 1/6)) + expected_access <- data.frame(name = rep(c("Python", + "R", + "SQL", + "Matlab", + "SAS", + "SPSS", + "Stata", + "VBA"), each=3), + value = factor(rep(c("Yes", "No", "Don't know"), 8), + levels = c("Yes", "No", "Don't know")), + n = c(rep(1/3, times=9), 1/3, 2/3, 0, rep(1/3, times=12)) + ) expect_equal(got_access, expected_access) diff --git a/tests/testthat/test-summarise_heard_of_RAP_by_prof.R b/tests/testthat/test-summarise_heard_of_RAP_by_prof.R new file mode 100644 index 0000000..7e1e366 --- /dev/null +++ b/tests/testthat/test-summarise_heard_of_RAP_by_prof.R @@ -0,0 +1,64 @@ +# Coding tools frequency tables (access or knowledge) + +dummy_data <- data.frame( + workplace = rep(c( + "Civil service, including devolved administrations", + "Office for Students", "NHS"), + each=4), + heard_of_RAP = rep(c("Yes", "Yes", "No", "No"), 3), + prof_DE = rep(c("Yes", "No"), 6), + prof_DS = rep(c("Yes", "No"), 6), + prof_DDAT = rep(c("Yes", "No"), 6), + prof_GAD = rep(c("Yes", "No"), 6), + prof_GES = rep(c("Yes", "No"), 6), + prof_geog = rep(c("Yes", "No"), 6), + prof_GORS = rep(c("Yes", "No"), 6), + prof_GSR = rep(c("Yes", "No"), 6), + 
prof_GSG = rep(c("Yes", "No"), 6) + +) + +test_that("summarise_heard_of_RAP_by_prof missing data is handled correctly", { + + dummy_data[1, ] <- NA + + got <- summarise_heard_of_RAP_by_prof(dummy_data) + + expect_false(any(is.na.data.frame(got))) + +}) + +test_that("summarise_heard_of_RAP_by_prof output is as expected", { + + got <- summarise_heard_of_RAP_by_prof(dummy_data) + + expected <- data.frame( + + value = factor(c( + "Data engineers", + "Data scientists", + "Digital and data (DDAT)", + "Actuaries", + "Economists (GES)", + "Geographers", + "Operational researchers (GORS)", + "Social researchers (GSR)", + "Statisticians (GSG)"), + levels = c( + "Data engineers", + "Data scientists", + "Digital and data (DDAT)", + "Actuaries", + "Economists (GES)", + "Geographers", + "Operational researchers (GORS)", + "Social researchers (GSR)", + "Statisticians (GSG)")), + + n = rep(1/2, times = 9) + + ) + + expect_equal(got, expected) + +}) diff --git a/tests/testthat/test-summarise_language_status.R b/tests/testthat/test-summarise_language_status.R index bf6f83c..dd5bd2a 100644 --- a/tests/testthat/test-summarise_language_status.R +++ b/tests/testthat/test-summarise_language_status.R @@ -16,12 +16,6 @@ dummy_data <- data.frame( access_SPSS = c("No", "Yes"), knowledge_stata = c("Don't know", "No"), access_stata = c("Yes", "No"), - knowledge_JS = c("Don't know", "Yes"), - access_JS = c("No", "Yes"), - knowledge_java = c("Don't know", "Yes"), - access_java = c("Don't know", "No"), - knowledge_C = c("Yes", "Don't know"), - access_C = c("No", "Yes"), knowledge_matlab = c("Yes", "No"), access_matlab = c("Yes", "No") ) @@ -32,10 +26,7 @@ test_that("summarise_language_status output is as expected", { got <- summarise_language_status(dummy_data) - expected <- data.frame(name = rep(c("C++ / C#", - "Java / Scala", - "Javascript / Typescript", - "Matlab", + expected <- data.frame(name = rep(c("Matlab", "Python", "R", "SAS", @@ -46,13 +37,13 @@ test_that("summarise_language_status 
output is as expected", { value = factor(rep(c("Access Only", "Both", "Knowledge Only"), - 11), + 8), levels=c("Access Only", "Both", "Knowledge Only")), - n = c(0.50, 0.00, 0.50, 0.00, 0.00, 1.00, 0.00, 1.00, 0.00, 0.00, 1.00, - 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 1.00, 0.00, 1.00, 0.00, 1.00, - 0.00, 0.00, 0.00, 0.00, 1.00, 1.00, 0.00, 0.00, 1.00, 0.00, 0.00)) + n = c(0.00, 1.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, + 1.00, 0.00, 1.00, 0.00, 1.00, 0.00, 0.00, 0.00, + 0.00, 1.00, 1.00, 0.00, 0.00, 1.00, 0.00, 0.00)) expect_equal(got, expected) diff --git a/tests/testthat/test-summarise_languages_by_prof.R b/tests/testthat/test-summarise_languages_by_prof.R index 6463c47..41393c4 100644 --- a/tests/testthat/test-summarise_languages_by_prof.R +++ b/tests/testthat/test-summarise_languages_by_prof.R @@ -1,26 +1,49 @@ # Coding tools frequency tables (access or knowledge) +knowledge_response <- rep(c( + NA, + "Yes", + "No", + "Not required for my work"), + each = 3, times = 6) + +prof_response <- rep(c( + NA, + "Yes", + "No"), + times = 24) + dummy_data <- data.frame( - code_freq = c(rep("Sometimes", 2), rep("All the time", 2), "Never", NA), - knowledge_R = c("Yes", rep("No", 2), rep("Don't Know", 3)), - knowledge_SQL = c(rep("Yes", 3), rep("No", 2), "Don't Know"), - knowledge_SAS = c(rep("Yes", 2), rep("No", 3), "Don't Know"), - knowledge_VBA = c("Yes", rep("No", 2), rep("Don't Know", 3)), - knowledge_python = c(rep("Yes", 3), rep("No", 2), "Don't Know"), - knowledge_SPSS = c(rep("Yes", 2), rep("No", 3), "Don't Know"), - knowledge_stata = c("Yes", rep("No", 2), rep("Don't Know", 3)), - knowledge_JS = c(rep("Yes", 3), rep("No", 2), "Don't Know"), - knowledge_java = c(rep("Yes", 2), rep("No", 3), "Don't Know"), - knowledge_C = c("Yes", rep("No", 2), rep("Don't Know", 3)), - knowledge_matlab = c(rep("Yes", 3), rep("No", 2), "Don't Know"), - prof_DS = c(rep("Yes", 3), rep("No", 3)), - prof_DDAT = c("No", rep("Yes", 4), rep("No", 1)), - prof_GAD = c(rep("No", 2), rep("Yes", 
4)), - prof_GES = c(rep("No", 2), rep("Yes", 2), rep("No", 2)), - prof_geog = c(rep("No", 3), rep("Yes", 3)), - prof_GORS = c(rep("No", 1), rep("Yes", 5)), - prof_GSR = c(rep("Yes", 1), rep("No", 5)), - prof_GSG = c(rep("Yes", 6))) + + code_freq = rep(c( + NA, + "Never", + "Rarely", + "Sometimes", + "Regularly", + "All the time"), + each = 12), + + knowledge_R = knowledge_response, + knowledge_SQL = knowledge_response, + knowledge_SAS = knowledge_response, + knowledge_VBA = knowledge_response, + knowledge_python = knowledge_response, + knowledge_SPSS = knowledge_response, + knowledge_stata = knowledge_response, + knowledge_matlab = knowledge_response, + + prof_DE = prof_response, + prof_DS = prof_response, + prof_DDAT = prof_response, + prof_GAD = prof_response, + prof_GES = prof_response, + prof_geog = prof_response, + prof_GORS = prof_response, + prof_GSR = prof_response, + prof_GSG = prof_response + +) test_that("summarise_languages_by_prof missing data is handled correctly", { @@ -35,33 +58,34 @@ test_that("summarise_languages_by_prof output is as expected", { got <- summarise_languages_by_prof(dummy_data) - expected <- data.frame(lang = rep(c("C++ / C#", - "Java / Scala", - "Javascript / Typescript", - "Matlab", - "Python", - "R", - "SAS", - "SPSS", - "SQL", - "Stata", - "VBA"), 8), - prof = c(rep("Data scientists", 11), - rep("Digital and data (DDAT)", 11), - rep("Actuaries", 11), - rep("Economists (GES)", 11), - rep("Geographers", 11), - rep("Operational researchers (GORS)", 11), - rep("Social researchers (GSR)", 11), - rep("Statisticians (GSG)", 11)), - n = c(1/3, 2/3, 1, 1, 1, 1/3, 2/3, 2/3, 1, 1/3, 1/3, - 0, 1/4, 1/2, 1/2, 1/2, 0, 1/4, 1/4, 1/2, 0, 0, - 0, 0, 1/4, 1/4, 1/4, 0, 0, 0, 1/4, 0, 0, - 0, 0, 1/2, 1/2, 1/2, 0, 0, 0, 1/2, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1/5, 2/5, 2/5, 2/5, 0, 1/5, 1/5, 2/5, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1/6, 1/3, 1/2, 1/2, 1/2, 1/6, 1/3, 1/3, 1/2, 1/6, 1/6)) + expected <- data.frame( + + lang = rep(c( + 
"Python", + "R", + "SQL", + "Matlab", + "SAS", + "SPSS", + "Stata", + "VBA"), + times = 9), + + prof = rep(c( + "Data engineers", + "Data scientists", + "Digital and data (DDAT)", + "Actuaries", + "Economists (GES)", + "Geographers", + "Operational researchers (GORS)", + "Social researchers (GSR)", + "Statisticians (GSG)"), + each = 8), + + n = rep(1/3, times = 72) + + ) expect_equal(got, expected) diff --git a/tests/testthat/test-summarise_open_source_by_prof.R b/tests/testthat/test-summarise_open_source_by_prof.R new file mode 100644 index 0000000..bdf6460 --- /dev/null +++ b/tests/testthat/test-summarise_open_source_by_prof.R @@ -0,0 +1,94 @@ +# Coding tools frequency tables (access or knowledge) + +prac_responses <- rep(c( + NA, + "I don't understand this question", + "Never", + "Rarely", + "Sometimes", + "Regularly", + "All the time"), + each = 3) + +prof_responses <- rep(c( + NA, + "Yes", + "No"), + times = 7) + +dummy_data <- data.frame( + + prac_use_open_source = prac_responses, + prac_open_source_own = prac_responses, + prac_version_control = prac_responses, + prac_review = prac_responses, + prac_functions = prac_responses, + prac_unit_test = prac_responses, + prac_package = prac_responses, + prac_dir_structure = prac_responses, + prac_style = prac_responses, + prac_automated_QA = prac_responses, + prac_development_QA = prac_responses, + prac_proportionate_QA = prac_responses, + + prof_DE = prof_responses, + prof_DS = prof_responses, + prof_DDAT = prof_responses, + prof_GAD = prof_responses, + prof_GES = prof_responses, + prof_geog = prof_responses, + prof_GORS = prof_responses, + prof_GSR = prof_responses, + prof_GSG = prof_responses + +) + +test_that("summarise_open_source_by_prof missing data is handled correctly", { + + got <- summarise_open_source_by_prof(dummy_data) + + + expect_false(any(is.na.data.frame(got))) + +}) + +test_that("summarise_open_source_by_prof output is as expected", { + + got <- summarise_open_source_by_prof(dummy_data) + + 
expected <- data.frame( + + name = rep(c( + "Data engineers", + "Data scientists", + "Digital and data (DDAT)", + "Actuaries", + "Economists (GES)", + "Geographers", + "Operational researchers (GORS)", + "Social researchers (GSR)", + "Statisticians (GSG)"), + each=6), + + value = factor(rep(c( + "I don't understand this question", + "Never", + "Rarely", + "Sometimes", + "Regularly", + "All the time"), 9), + levels = c( + "I don't understand this question", + "Never", + "Rarely", + "Sometimes", + "Regularly", + "All the time")), + + n = rep(1/6, times = 54) + + ) + + expect_equal(got, expected) + +}) diff --git a/tests/testthat/test-summarise_operations.R b/tests/testthat/test-summarise_operations.R deleted file mode 100644 index d5368c3..0000000 --- a/tests/testthat/test-summarise_operations.R +++ /dev/null @@ -1,43 +0,0 @@ -# Data operations table -# Frequency table should not include missing values and should include counts of 0 - -dummy_data <- data.frame(ops_analysis = c(rep("I do some or all of this by coding", 3), rep("I don't do this", 2), "I do this without coding", NA), - ops_cleaning = c(rep("I do some or all of this by coding", 2), rep("I don't do this", 3), rep("I do this without coding", 2)), - ops_linking = c(rep("I do some or all of this by coding", 1), rep("I don't do this", 2), rep("I do this without coding", 4)), - ops_transfer_migration = c(rep("I do some or all of this by coding", 4), rep("I don't do this", 3)), - ops_vis = c(rep("I do some or all of this by coding", 3), rep("I don't do this", 2), rep("I do this without coding", 2)), - ops_machine_learning = c(rep("I do some or all of this by coding", 2), rep("I don't do this", 3), rep("I do this without coding", 2)), - ops_modelling = c(rep("I do some or all of this by coding", 3), rep("I don't do this", 2), rep("I do this without coding", 2)), - ops_QA = c(rep("I do some or all of this by coding", 1), rep("I don't do this", 3), rep("I do this without coding", 3)) -) - 
-test_that("summarise_operations missing data is handled correctly", { - - got <- summarise_operations(dummy_data) - - expect_false(any(is.na.data.frame(got))) - -}) - -test_that("summarise_operations output is as expected", { - - got <- summarise_operations(dummy_data) - - expected <- data.frame("name" = rep(c("Data analysis", - "Data cleaning", - "Data linking", - "Data transfer / migration", - "Data visualisation", - "Machine learning", - "Modelling", - "Quality assurance"), each=2), - "value" = factor(rep(c("I do some or all of this by coding", - "I do this without coding"), 8), - levels = c("I do some or all of this by coding", - "I do this without coding")), - "n" = c(3/4, 1/4, 1/2, 1/2, 1/5, 4/5, 1, 0, - 3/5, 2/5, 1/2, 1/2, 3/5, 2/5, 1/4, 3/4)) - - expect_equal(got, expected) - -}) diff --git a/tests/testthat/test-summarise_os_vs_prop.R b/tests/testthat/test-summarise_os_vs_prop.R new file mode 100644 index 0000000..d1bad83 --- /dev/null +++ b/tests/testthat/test-summarise_os_vs_prop.R @@ -0,0 +1,33 @@ + +n=192 +dummy_data <- data.frame(year = rep(c(2020, 2021, 2022), each=n/3), + knowledge_python = rep(c("Yes", "No"), each=n/12, times=n/16), + knowledge_R = rep(c("Yes", "No"), each=n/24, times=n/8), + knowledge_SAS = rep(c("Yes", "No"), each=n/48, times=n/4), + knowledge_SPSS = rep(c("Yes", "No"), each=n/96, times=n/2), + knowledge_stata = rep(c("Yes", "No"), each=n/192, times=n)) + +test_that("summarise_os_vs_prop missing data is handled correctly", { + + dummy_data[1,2] <- NA + + got <- summarise_os_vs_prop(dummy_data) + + expect_false(any(is.na.data.frame(got))) + +}) + +test_that("summarise_os_vs_prop output is as expected", { + + got <- summarise_os_vs_prop(dummy_data) + + expected <- data.frame(lang_type = factor(rep(c("Open Source", "Proprietary"), each=3), + levels = c("Open Source", "Proprietary")), + year = rep(c("2020", "2021", "2022"), times=2), + Freq = rep(c(96, 112), each=3), + n = rep(128, times=6)) %>% + get_ci(freq_col = 3, n_col = 4) + + 
expect_equal(got, expected) + +}) diff --git a/tests/testthat/test-summarise_rap_awareness_over_time.R b/tests/testthat/test-summarise_rap_awareness_over_time.R new file mode 100644 index 0000000..0921ca9 --- /dev/null +++ b/tests/testthat/test-summarise_rap_awareness_over_time.R @@ -0,0 +1,52 @@ +heard_of_RAP <- c(NA, + rep(c("Yes", + "No"), + times = 15)) + +year <- c(NA, + rep(c("2020", + "2021", + "2022"), + each = 10)) + +code_freq <- c(NA, + rep(c("Never", + "Rarely", + "Sometimes", + "Regularly", + "All the time"), + times = 6)) + +dummy_data <- data.frame(heard_of_RAP, year, code_freq) + +test_that("summarise_rap_awareness_over_time missing data is handled correctly", { + + got <- summarise_rap_awareness_over_time(dummy_data) + + expect_false(any(is.na.data.frame(got))) + +}) + +test_that("summarise_rap_awareness_over_time output is as expected", { + + got <- summarise_rap_awareness_over_time(dummy_data) + + expected <- data.frame(Var1 = factor(rep("Yes", 3), + levels = c("No", "Yes")), + Var2 = factor(c("2020", "2021", "2022"), + levels = c("2020", "2021", "2022")), + Freq = c(4), + n = c(8), + percent = c(0.5), + lower = c(0.21521606), + upper = c(0.78478394), + lower_ci = c(0.28478394), + upper_ci = c(0.28478394)) + + expect_equal(got, expected) + + +}) + + + diff --git a/tests/testthat/test-summarise_rap_champ_status.R b/tests/testthat/test-summarise_rap_champ_status.R new file mode 100644 index 0000000..f557a08 --- /dev/null +++ b/tests/testthat/test-summarise_rap_champ_status.R @@ -0,0 +1,47 @@ + +dummy_data <- data.frame( + + RAP_champ_status = c( + NA, + "Yes, and I am a RAP Champion", + "Yes, and I know who the RAP Champion is", + "Yes, but I don't know who the RAP Champion is", + "No", + "I don't know") + +) + +test_that("summarise_rap_champ_status missing data is handled correctly", { + + got <- summarise_rap_champ_status(dummy_data) + + expect_false(any(is.na.data.frame(got))) + +}) + +test_that("summarise_rap_knowledge output is as expected", { 
+ + got <- summarise_rap_champ_status(dummy_data) + + expected <- data.frame( + + value = factor(c( + "Yes, and I am a RAP Champion", + "Yes, and I know who the RAP Champion is", + "Yes, but I don't know who the RAP Champion is", + "No", + "I don't know"), + levels = c( + "Yes, and I am a RAP Champion", + "Yes, and I know who the RAP Champion is", + "Yes, but I don't know who the RAP Champion is", + "No", + "I don't know")), + + n = rep(1/5, times = 5) + + ) + + expect_equal(got, expected) + +}) diff --git a/tests/testthat/test-summarise_rap_comp.R b/tests/testthat/test-summarise_rap_comp.R index d45b328..5355423 100644 --- a/tests/testthat/test-summarise_rap_comp.R +++ b/tests/testthat/test-summarise_rap_comp.R @@ -1,20 +1,31 @@ -dummy_data <- data.frame(code_freq = c(rep("Somtimes", 4)), - use_open_source_score = c(1, NA, 1, 0), - open_code_score = c(1, 1, 1, 1), - version_control_score = c(0, 0, 0, 0), - peer_review_score = c(1, 1, 1, 0), - AQUA_book_score = c(0, 1, 0, 1), - doc_score = c(1, 1, 1, 0), - basic_rap_score = c(1, 1, 0, 1), - function_score = c(1, 0, 1, 0), - unit_test_score = c(1, 1, 1, 0), - function_doc_score = c(0, 1, 1, 0), - package_score = c(1, 1, 1, 0), - code_style_score = c(0, 1, 1, 0), - cont_integration_score = c(1, 1, 1, 0), - dep_management_score = c(0, 1, 1, 1), - advanced_rap_score = c(1, 0, 1, 1)) +dummy_data <- data.frame( + + code_freq = rep(c( + "Never", + "Rarely", + "Sometimes", + "Regularly", + "All the time"), + each = 3), + + use_open_source_score = rep(c(NA, 1, 0), times = 5), + open_code_score = rep(c(NA, 1, 0), times = 5), + version_control_score = rep(c(NA, 1, 0), times = 5), + peer_review_score = rep(c(NA, 1, 0), times = 5), + proportionate_QA_score = rep(c(NA, 1, 0), times = 5), + doc_score = rep(c(NA, 1, 0), times = 5), + basic_rap_score = rep(c(NA, 1, 0), times = 5), + function_score = rep(c(NA, 1, 0), times = 5), + unit_test_score = rep(c(NA, 1, 0), times = 5), + function_doc_score = rep(c(NA, 1, 0), times = 5), + 
package_score = rep(c(NA, 1, 0), times = 5), + code_style_score = rep(c(NA, 1, 0), times = 5), + cont_integration_score = rep(c(NA, 1, 0), times = 5), + dep_management_score = rep(c(NA, 1, 0), times = 5), + advanced_rap_score =rep(c(NA, 1, 0), times = 5) + +) test_that("summarise_rap_comp missing data is handled correctly", { @@ -30,38 +41,43 @@ test_that("summarise_rap_comp output is as expected", { got <-summarise_rap_comp(dummy_data) - expected <- data.frame(name = factor(c("Use open source software", - "Team open source code", - "Version control", - "Peer review", - "AQUA book guidance", - "Documentation", - "Functions", - "Unit testing", - "Function documentation", - "Code packages", - "Follow code style guidelines", - "Continuous integration", - "Dependency management"), - levels = c("Use open source software", - "Team open source code", - "Version control", - "Peer review", - "AQUA book guidance", - "Documentation", - "Functions", - "Unit testing", - "Function documentation", - "Code packages", - "Follow code style guidelines", - "Continuous integration", - "Dependency management")), - - value = c("Basic", "Basic", "Basic", "Basic", "Basic", "Basic", - "Advanced", "Advanced", "Advanced", "Advanced", "Advanced", "Advanced", "Advanced"), - - n = c(1/2, 1, 0, 3/4, 1/2, 3/4, 1/2, - 3/4, 1/2, 3/4, 1/2, 3/4, 3/4) + expected <- data.frame( + + name = factor(c( + "Use open source software", + "Team open source code", + "Version control", + "Peer review", + "Proportionate QA", + "Documentation", + "Functions", + "Unit testing", + "Function documentation", + "Code packages", + "Follow code style guidelines", + "Continuous integration", + "Dependency management"), + levels = c( + "Use open source software", + "Team open source code", + "Version control", + "Peer review", + "Proportionate QA", + "Documentation", + "Functions", + "Unit testing", + "Function documentation", + "Code packages", + "Follow code style guidelines", + "Continuous integration", + "Dependency 
management")), + + value = c( + rep("Basic", times = 6), + rep("Advanced", times = 7)), + + n = rep(5/12, times = 13) + ) expect_equal(got, expected) diff --git a/tests/testthat/test-summarise_rap_knowledge.R b/tests/testthat/test-summarise_rap_knowledge.R index 04961fd..f6f1d5f 100644 --- a/tests/testthat/test-summarise_rap_knowledge.R +++ b/tests/testthat/test-summarise_rap_knowledge.R @@ -1,15 +1,18 @@ -dummy_data <- data.frame(heard_of_RAP = c("No", rep("Yes", 13)), - know_RAP_champ = c(rep("I don't know what a RAP champion is", 2), - rep("I know what a RAP champion is but don't know who the RAP champion in my department is", 3), - rep("I know what a RAP champion is and there is no RAP champion in my department", 4), - rep("I know who the RAP champion in my department is", 5)) -) +dummy_data <- data.frame(heard_of_RAP = rep(c(NA, + "Yes", + "No"), + each = 6), + code_freq = rep(c(NA, + "Never", + "Rarely", + "Sometimes", + "Regularly", + "All the time"), + times = 3)) test_that("summarise_rap_knowledge missing data is handled correctly", { - dummy_data[5, 1] <- NA - got <- summarise_rap_knowledge(dummy_data) expect_false(any(is.na.data.frame(got))) @@ -20,26 +23,19 @@ test_that("summarise_rap_knowledge output is as expected", { got <- summarise_rap_knowledge(dummy_data) - expected <- data.frame(value = factor(c("Have not heard of RAP", - "I don't know what a RAP champion is", - "I know what a RAP champion is but don't know who the RAP champion in my department is", - "I know what a RAP champion is and there is no RAP champion in my department", - "I know who the RAP champion in my department is"), - levels = c("Have not heard of RAP", - "I don't know what a RAP champion is", - "I know what a RAP champion is but don't know who the RAP champion in my department is", - "I know what a RAP champion is and there is no RAP champion in my department", - "I know who the RAP champion in my department is")), - n = c(1/14, 1/14, 3/14, 4/14, 5/14)) + expected <- data.frame( 
- expect_equal(got, expected) + value = factor(c( + "Yes", + "No"), + levels = c( + "Yes", + "No")), -}) + n = rep(1/2, times = 2) -test_that("summarise_rap_knowledge validation works", { + ) - dummy_data <- data.frame(Test = c("test1", "test2")) - - expect_error(summarise_rap_knowledge(dummy_data), "unexpected_input: no column called 'heard_of_RAP'") + expect_equal(got, expected) }) diff --git a/tests/testthat/test-summarise_rep_workflow.R b/tests/testthat/test-summarise_rep_workflow.R index c297506..6a653b7 100644 --- a/tests/testthat/test-summarise_rep_workflow.R +++ b/tests/testthat/test-summarise_rep_workflow.R @@ -1,7 +1,12 @@ -dummy_data <- data.frame(reproducible_workflow = c(NA, - rep("Yes", 2), - rep("No", 3), - rep("I don't know what reproducible workflows are", 4))) +dummy_data <- data.frame( + + reproducible_workflow = c( + NA, + "Yes", + "No", + "I don't know what reproducible workflows are") + +) test_that("summarise_rep_workflow missing data is handled correctly", { @@ -15,13 +20,19 @@ test_that("summarise_rep_workflow output is as expected", { got <- summarise_rep_workflow(dummy_data) - expected <- data.frame(value = factor(c("Yes", - "No", - "I don't know what reproducible workflows are"), - levels = c("Yes", - "No", - "I don't know what reproducible workflows are")), - n = c(2/9, 1/3, 4/9)) + expected <- data.frame( + + value = factor(c( + "Yes", + "No", + "I don't know what reproducible workflows are"), + levels = c("Yes", + "No", + "I don't know what reproducible workflows are")), + + n = rep(1/3, times = 3) + + ) expect_equal(got, expected) @@ -32,6 +43,7 @@ test_that("summarise_rep_workflow validation works", { dummy_data <- data.frame(Test = c("test1", "test2")) - expect_error(summarise_rep_workflow(dummy_data), "unexpected_input: no column called 'reproducible_workflow'") + expect_error(summarise_rep_workflow(dummy_data), + "unexpected_input: no column called 'reproducible_workflow'") }) diff --git 
a/tests/testthat/test-summarise_strategy_knowledge.R b/tests/testthat/test-summarise_strategy_knowledge.R index a2d8ec2..d31e2be 100644 --- a/tests/testthat/test-summarise_strategy_knowledge.R +++ b/tests/testthat/test-summarise_strategy_knowledge.R @@ -1,9 +1,20 @@ -dummy_data <- data.frame(heard_of_RAP = c("No", rep("Yes", 9)), - strategy_knowledge = c(NA, - rep("I have not heard of the RAP strategy", 2), - rep("I have heard of the RAP strategy, but I haven't read it", 3), - rep("I have read the RAP strategy", 4))) +dummy_data <- data.frame( + + heard_of_RAP = rep(c( + NA, + "No", + "Yes"), + each = 4), + + strategy_knowledge = rep(c( + NA, + "Yes", + "Yes, but I haven't read it", + "No"), + times = 3) + +) test_that("summarise_strategy_knowledge validation works", { @@ -29,13 +40,20 @@ test_that("summarise_strategy_knowledge output is as expected", { got <- summarise_strategy_knowledge(dummy_data) - expected <- data.frame(value = factor(c("I have not heard of the RAP strategy", - "I have heard of the RAP strategy, but I haven't read it", - "I have read the RAP strategy"), - levels = c("I have not heard of the RAP strategy", - "I have heard of the RAP strategy, but I haven't read it", - "I have read the RAP strategy")), - n = c(2/9, 1/3, 4/9)) + expected <- data.frame( + + value = factor(c( + "Yes", + "Yes, but I haven't read it", + "No"), + levels = c( + "Yes", + "Yes, but I haven't read it", + "No")), + + n = rep(1/3, times = 3) + + ) expect_equal(got, expected) diff --git a/tests/testthat/test-summarise_where_learned_code.R b/tests/testthat/test-summarise_where_learned_code.R index 047da89..736dbd2 100644 --- a/tests/testthat/test-summarise_where_learned_code.R +++ b/tests/testthat/test-summarise_where_learned_code.R @@ -1,10 +1,32 @@ -dummy_data <- data.frame(code_freq = c(rep("Never", 8), "Sometimes", "Regularly"), - other_coding_experience = c(rep("Yes", 8), "No", "Yes"), - prev_coding_experience = c(rep("Yes", 8), NA, "No"), - first_learned = 
c(rep("Self-taught" , 3), - rep( "In public sector employment", 3), - rep("other" , 1), - rep(NA , 3))) +dummy_data <- data.frame( + + code_freq = rep(c( + NA, + "Never", + "Rarely", + "Sometimes", + "Regularly", + "All the time"), + each = 21), + + other_coding_experience = rep(c( + NA, + "Yes", + "No"), + times = 6, + each = 7), + + first_learned = rep(c( + NA, + "Current employment", + "Education", + "Previous private sector employment", + "Previous public sector employment", + "Self-taught", + "Other"), + times = 18) + +) test_that("summarise_where_learned_code missing data is handled correctly", { @@ -18,58 +40,65 @@ test_that("summarise_where_learned_code output is as expected", { got <- summarise_where_learned_code(dummy_data) - expected <- data.frame(value = factor(c("In current role", - "In education", - "In private sector employment", - "In public sector employment", - "Self-taught", - "Other"), - levels = c("In current role", - "In education", - "In private sector employment", - "In public sector employment", - "Self-taught", - "Other")), - n = c(2/9, 0, 0, 1/3, 1/3, 1/9)) + expected <- data.frame( - expect_equal(got, expected) - -}) - - -test_that("summarise_where_learned_code validation works", { - - dummy_data_1 <- data.frame(code_freq = c(rep("Never", 8), "Sometimes", "Regularly"), - other_coding_experience = c(rep("Yes", 8), "No", "Yes"), - prev_coding_experience = c(rep("Yes", 8), NA, "No")) + value = factor(c( + "Current employment", + "Education", + "Previous private sector employment", + "Previous public sector employment", + "Self-taught", + "Other"), + levels = c( + "Current employment", + "Education", + "Previous private sector employment", + "Previous public sector employment", + "Self-taught", + "Other")), - expect_error(summarise_where_learned_code(dummy_data_1), "unexpected_input: no column called 'first_learned'") + n = c(24/64, rep(8/64, times=5)) - dummy_data_2 <- data.frame(code_freq = c(rep("Never", 8), "Sometimes", "Regularly"), - 
prev_coding_experience = c(rep("Yes", 8), NA, "No"), - first_learned = c(rep("Self-taught" , 3), - rep( "In public sector employment", 3), - rep("other" , 1), - rep(NA , 3))) + ) - expect_error(summarise_where_learned_code(dummy_data_2), "unexpected_input: no column called 'other_coding_experience'") + expect_equal(got, expected) - dummy_data_3 <- data.frame(other_coding_experience = c(rep("Yes", 8), "No", "Yes"), - prev_coding_experience = c(rep("Yes", 8), NA, "No"), - first_learned = c(rep("Self-taught" , 3), - rep( "In public sector employment", 3), - rep("other" , 1), - rep(NA , 3))) +}) - expect_error(summarise_where_learned_code(dummy_data_3), "unexpected_input: no column called 'code_freq'") - dummy_data_4 <- data.frame(code_freq = c(rep("Never", 8), "Sometimes", "Regularly"), - other_coding_experience = c(rep("Yes", 8), "No", "Yes"), - first_learned = c(rep("Self-taught" , 3), - rep( "In public sector employment", 3), - rep("other" , 1), - rep(NA , 3))) +test_that("summarise_where_learned_code validation works", { - expect_error(summarise_where_learned_code(dummy_data_4), "unexpected_input: no column called 'prev_coding_experience'") + dummy_data_1 <- data.frame( + code_freq = c(rep("Never", 8), "Sometimes", "Regularly"), + other_coding_experience = c(rep("Yes", 8), "No", "Yes"), + prev_coding_experience = c(rep("Yes", 8), NA, "No") + ) + + expect_error(summarise_where_learned_code(dummy_data_1), + "unexpected_input: no column called 'first_learned'") + + dummy_data_2 <- data.frame( + code_freq = c(rep("Never", 8), "Sometimes", "Regularly"), + prev_coding_experience = c(rep("Yes", 8), NA, "No"), + first_learned = c(rep("Self-taught" , 3), + rep( "In public sector employment", 3), + rep("other" , 1), + rep(NA , 3)) + ) + + expect_error(summarise_where_learned_code(dummy_data_2), + "unexpected_input: no column called 'other_coding_experience'") + + dummy_data_3 <- data.frame( + other_coding_experience = c(rep("Yes", 8), "No", "Yes"), + prev_coding_experience 
= c(rep("Yes", 8), NA, "No"), + first_learned = c(rep("Self-taught" , 3), + rep( "In public sector employment", 3), + rep("other" , 1), + rep(NA , 3)) + ) + + expect_error(summarise_where_learned_code(dummy_data_3), + "unexpected_input: no column called 'code_freq'") })