diff --git a/.gitignore b/.gitignore index 920ae27..8b19f43 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ temp/ *.rda exploratory_scripts/ docs/ +docs/summary_qa.html \ No newline at end of file diff --git a/R/frequency-tables.R b/R/frequency-tables.R index 31f32e6..2341f7f 100644 --- a/R/frequency-tables.R +++ b/R/frequency-tables.R @@ -4,49 +4,50 @@ #' #' @param data full CARS dataset after pre-processing #' @param all_tables logical: whether to produce all summary output tables. Defaults to FALSE. +#' @param sample additionally returns count and sample size for selected tables for QA. FALSE by default #' #' @return list of frequency tables #' #' @export -summarise_all <- function(data, all_tables = FALSE) { +summarise_all <- function(data, all_tables = FALSE, sample = FALSE) { output_list <- list( - code_freq = summarise_code_freq(data), - knowledge = summarise_coding_tools(data, "knowledge"), - access = summarise_coding_tools(data, "access"), + code_freq = summarise_code_freq(data, sample = sample), + knowledge = summarise_coding_tools(data, "knowledge", sample = sample), + access = summarise_coding_tools(data, "access", sample = sample), language_status = summarise_language_status(data), - where_learned = summarise_where_learned_code(data), - ability_change = summarise_ability_change(data), - coding_practices = summarise_coding_practices(data), - doc = summarise_doc(data), - rap_knowledge = summarise_rap_knowledge(data), - rap_champ_status = summarise_rap_champ_status(data), - rap_opinions = summarise_rap_opinions(data), + where_learned = summarise_where_learned_code(data, sample = sample), + ability_change = summarise_ability_change(data, sample = sample), + coding_practices = summarise_coding_practices(data, sample = sample), + doc = summarise_doc(data, sample = sample), + rap_knowledge = summarise_rap_knowledge(data, sample = sample), + rap_champ_status = summarise_rap_champ_status(data, sample = sample), + rap_opinions = summarise_rap_opinions(data, sample = sample), basic_rap_scores = summarise_rap_basic(data), advanced_rap_scores = summarise_rap_advanced(data), - rap_components = summarise_rap_comp(data), + rap_components = summarise_rap_comp(data, sample = sample), ci = summarise_ci(data), dependency_management = summarise_dep_man(data), rep_workflow = summarise_rep_workflow(data), line_manage = summarise_line_manage(data), git_knowledge = summarise_knowledge_git(data), - git_access = summarise_access_git(data), - strategy_knowledge = summarise_strategy_knowledge(data) + git_access = summarise_access_git(data, sample = sample), + strategy_knowledge = summarise_strategy_knowledge(data, sample = sample) ) if (all_tables) { output_list <- c(output_list, list( - capability_change_by_freq = summarise_cap_change_by_freq(data), + capability_change_by_freq = summarise_cap_change_by_freq(data, sample = sample), capability_change_by_line_manage = summarise_cap_change_by_line_manage(data), capability_change_by_CS_grade = summarise_cap_change_by_CS_grade(data), basic_score_by_implementation = summarise_basic_score_by_imp(data), adv_score_by_implementation = summarise_adv_score_by_imp(data), basic_score_by_understanding = summarise_basic_score_by_understanding(data), adv_score_by_understanding = summarise_adv_score_by_understanding(data), - languages_by_prof = summarise_languages_by_prof(data), + languages_by_prof = summarise_languages_by_prof(data, sample = sample), open_source_by_prof = summarise_open_source_by_prof(data), heard_of_RAP_by_prof = summarise_heard_of_RAP_by_prof(data) )) @@ -63,13 +64,14 @@ summarise_all <- function(data, all_tables = FALSE) { #' @return list of sample sizes #' #' @export +#' sample_sizes <- function(data) { list( all = nrow(data), code_at_work = sum(!is.na(data$code_freq) & data$code_freq != "Never"), - other_code_experience = sum(!is.na(data$other_coding_experience ) & data$other_coding_experience == "Yes"), - heard_of_RAP = sum(!is.na(data$heard_of_RAP) & data$heard_of_RAP == "Yes"), + other_code_experience = sum(!is.na(data$code_freq) & data$code_freq != "Never" & data$other_coding_experience == "Yes"), + heard_of_RAP = sum(!is.na(data$code_freq) & data$code_freq != "Never" & data$heard_of_RAP == "Yes"), not_RAP_champ = sum(is.na(data$know_RAP_champ) | data$know_RAP_champ != "I am a RAP champion"), profs = sapply(c("prof_DE", "prof_DS", "prof_DDAT", "prof_GAD", "prof_GES", @@ -82,6 +84,7 @@ sample_sizes <- function(data) { } } ) + ) } @@ -91,10 +94,11 @@ sample_sizes <- function(data) { #' @description calculate frequency table for coding frequency. #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return frequency table (data.frame) -summarise_code_freq <- function(data) { +summarise_code_freq <- function(data, sample = FALSE) { # Validation checks if (!"code_freq" %in% colnames(data)) { @@ -109,7 +113,7 @@ summarise_code_freq <- function(data) { "Regularly", "All the time") - frequencies <- calculate_freqs(data, questions, levels) + frequencies <- calculate_freqs(data, questions, levels, sample = sample) return(frequencies) } @@ -122,10 +126,11 @@ summarise_code_freq <- function(data) { #' @param data full CARS dataset after pre-processing #' @param type type of table (knowledge or access) #' @param prop whether to return proportion data (0-1). TRUE by default. Assumes mutually exclusive response options. +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return frequency table (data.frame) -summarise_coding_tools <- function(data, type = list("knowledge", "access"), prop = TRUE) { +summarise_coding_tools <- function(data, type = list("knowledge", "access"), prop = TRUE, sample = FALSE) { questions <- c("knowledge_R", "access_R", "knowledge_SQL", "access_SQL", "knowledge_SAS", "access_SAS", "knowledge_VBA", "access_VBA", @@ -145,7 +150,7 @@ summarise_coding_tools <- function(data, type = list("knowledge", "access"), pro questions <- questions[grepl(paste0(type, "_"), questions)] - frequencies <- calculate_freqs(data, questions, levels, labels, prop = prop) %>% + frequencies <- calculate_freqs(data, questions, levels, labels, prop = prop, sample = sample) %>% dplyr::arrange(match(name, c("Python", "R", "SQL", "Matlab", "SAS", "SPSS", "Stata", "VBA"))) return(frequencies) @@ -158,12 +163,13 @@ summarise_coding_tools <- function(data, type = list("knowledge", "access"), pro #' @description calculate frequency table of where respondents learned to code #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return frequency table (data.frame) #' #' @importFrom dplyr select mutate case_when -summarise_where_learned_code <- function(data){ +summarise_where_learned_code <- function(data, sample = FALSE){ # Validation checks if (!"first_learned" %in% colnames(data)) { @@ -193,7 +199,7 @@ summarise_where_learned_code <- function(data){ !is.na(data$first_learned) & !(data$first_learned %in% levels) ~ "Other", TRUE ~ first_learned)) - frequencies <- calculate_freqs(data, questions, levels) + frequencies <- calculate_freqs(data, questions, levels, sample = sample) return(frequencies) } @@ -204,10 +210,11 @@ summarise_where_learned_code <- function(data){ #' @description calculate frequency table for data practices #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return frequency table (data.frame) -summarise_coding_practices <- function(data) { +summarise_coding_practices <- function(data, sample = FALSE) { questions <- c("prac_use_open_source", "prac_open_source_own", "prac_version_control", "prac_review", "prac_functions", @@ -231,7 +238,7 @@ summarise_coding_practices <- function(data) { "Quality assurance throughout development", "Proportionate quality assurance") - frequencies <- calculate_freqs(data, questions, levels, labels) + frequencies <- calculate_freqs(data, questions, levels, labels, sample = sample) return(frequencies) @@ -291,17 +298,18 @@ summarise_rap_advanced <- function(data){ #' @description Create a frequency table of knowledge of RAP #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return frequency table (data.frame) -summarise_rap_knowledge <- function(data){ +summarise_rap_knowledge <- function(data, sample = FALSE){ questions <- "heard_of_RAP" levels <- c("Yes", "No") - frequencies <- calculate_freqs(data, questions, levels) + frequencies <- calculate_freqs(data, questions, levels, sample = sample) return(frequencies) } @@ -311,10 +319,11 @@ summarise_rap_knowledge <- function(data){ #' @description Create a frequency table of knowledge of RAP Champions #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return frequency table (data.frame) -summarise_rap_champ_status <- function(data){ +summarise_rap_champ_status <- function(data, sample = FALSE){ questions <- "RAP_champ_status" @@ -324,7 +333,7 @@ summarise_rap_champ_status <- function(data){ "No", "I don't know") - frequencies <- calculate_freqs(data, questions, levels) + frequencies <- calculate_freqs(data, questions, levels, sample = sample) return(frequencies) } @@ -335,10 +344,11 @@ summarise_rap_champ_status <- function(data){ #' @description Create frequency table of opinions of RAP #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return frequency table (data.frame) -summarise_rap_opinions <- function(data) { +summarise_rap_opinions <- function(data, sample = FALSE) { # Validation checks if (!"heard_of_RAP" %in% colnames(data)) { @@ -370,7 +380,7 @@ summarise_rap_opinions <- function(data) { "I or my team are planning on implementing RAP in the next 12 months") - frequencies <- calculate_freqs(opinion_rap_data, questions, levels, labels) + frequencies <- calculate_freqs(opinion_rap_data, questions, levels, labels, sample = sample) return(frequencies) @@ -382,10 +392,11 @@ summarise_rap_opinions <- function(data) { #' @description Create frequency table of documentation use #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return frequency table (data.frame) -summarise_doc <- function(data) { +summarise_doc <- function(data, sample = FALSE) { # Validation checks if (!"code_freq" %in% colnames(data)) { @@ -418,7 +429,7 @@ summarise_doc <- function(data) { "Flow charts") - frequencies <- calculate_freqs(documentation_data, questions, levels, labels) + frequencies <- calculate_freqs(documentation_data, questions, levels, labels, sample = sample) return(frequencies) @@ -429,12 +440,13 @@ summarise_doc <- function(data) { #' @description Create frequency table of basic and advanced RAP score components #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return frequency table (data.frame) #' #' @importFrom dplyr mutate arrange -summarise_rap_comp <- function(data) { +summarise_rap_comp <- function(data, sample = FALSE) { labels <- c("Use open source software", "Team open source code", @@ -476,6 +488,13 @@ summarise_rap_comp <- function(data) { names(components$n) <- NULL + if (sample == TRUE) { + components <- components %>% + mutate(count = colSums(data[questions], na.rm = TRUE)) + + components$sample <- sum(data$code_freq != "Never", na.rm = TRUE) + } + return(components) } @@ -570,10 +589,11 @@ summarise_rep_workflow <- function(data) { #' @description calculate frequency table for ability change #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return frequency table (data.frame) -summarise_ability_change <- function(data) { +summarise_ability_change <- function(data, sample = FALSE) { # Validation checks if (!"coding_ability_change" %in% colnames(data)) { @@ -588,7 +608,7 @@ summarise_ability_change <- function(data) { "It has become slightly better", "It has become significantly better") - frequencies <- calculate_freqs(data, questions, levels) + frequencies <- calculate_freqs(data, questions, levels, sample = sample) frequencies$value <- frequencies$value %>% dplyr::recode_factor("It has become significantly worse" = "Significantly worse", @@ -697,10 +717,11 @@ summarise_knowledge_git <- function(data){ #' @description calculate frequency table for if someone has access to git #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return frequency table (data.frame) -summarise_access_git <- function(data){ +summarise_access_git <- function(data, sample = FALSE){ # Validation checks if (!"access_git" %in% colnames(data)) { @@ -713,7 +734,7 @@ summarise_access_git <- function(data){ "No", "I don't know") - frequencies <- calculate_freqs(data, questions, levels) + frequencies <- calculate_freqs(data, questions, levels, sample = sample) return(frequencies) @@ -725,10 +746,11 @@ summarise_access_git <- function(data){ #' @description calculate frequency table for if someone heard of or read the RAP strategy #' #' @param data full CARS dataset after pre-processing +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return frequency table (data.frame) -summarise_strategy_knowledge <- function(data){ +summarise_strategy_knowledge <- function(data, sample = FALSE){ # Validation checks if (!"strategy_knowledge" %in% colnames(data)) { @@ -746,7 +768,7 @@ summarise_strategy_knowledge <- function(data){ "Yes, but I haven't read it", "No") - frequencies <- calculate_freqs(data, questions, levels) + frequencies <- calculate_freqs(data, questions, levels, sample = sample) return(frequencies) @@ -758,16 +780,17 @@ summarise_strategy_knowledge <- function(data){ #' @description calculate the cross tab of coding frequency by capability change #' #' @param data full CARS dataset after pre-processing +#' @param sample returns proportion, count and, group size and sample size. FALSE by default #' #' @return frequency table (data.frame) -summarise_cap_change_by_freq <- function(data){ +summarise_cap_change_by_freq <- function(data, sample = FALSE){ col1 <- "code_freq" col2 <- "coding_ability_change" - dplyr::filter(data, code_freq != "Never") + data <- dplyr::filter(data, (code_freq != "Never" & other_coding_experience == "Yes")) levels1 <- c( "Rarely", @@ -782,7 +805,7 @@ summarise_cap_change_by_freq <- function(data){ "It has become slightly better", "It has become significantly better") - frequencies <- calculate_multi_table_freqs(data, col1, col2, levels1, levels2) + frequencies <- calculate_multi_table_freqs(data, col1, col2, levels1, levels2, sample = sample) return(frequencies) @@ -791,7 +814,7 @@ summarise_cap_change_by_freq <- function(data){ #' @title Summarise capability change by management responsibility #' -#' @description calculate the cross tab of capability change by management responsibilty +#' @description calculate the cross tab of capability change by management responsibility #' #' @param data full CARS dataset after pre-processing #' @@ -984,12 +1007,13 @@ summarise_adv_score_by_understanding <- function(data){ #' @description only used the main summary page. Needs to be turned into wide data for html table. #' #' @param data CARS data (pre-processed) +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return data.frame #' #' @importFrom dplyr recode -summarise_languages_by_prof <- function(data) { +summarise_languages_by_prof <- function(data, sample = FALSE) { profs <- c("prof_DE", "prof_DS", "prof_DDAT", "prof_GAD", "prof_GES", "prof_geog", "prof_GORS", "prof_GSR", "prof_GSG") @@ -1011,7 +1035,7 @@ summarise_languages_by_prof <- function(data) { if(nrow(filtered_data) > 0) { - output <- summarise_coding_tools(filtered_data, "knowledge") + output <- summarise_coding_tools(filtered_data, "knowledge", sample = sample) # Retain frequencies for "Yes" responses only output <- output[output[[2]] == "Yes", ] @@ -1213,6 +1237,7 @@ summarise_rap_awareness_over_time <- function(data) { #' @param levels all possible factor values in the filtered columns #' @param labels labels to rename the column headers. Only needed for multi-column frequencies #' @param prop whether to return proportion data (0-1). TRUE by default. Assumes mutually exclusive response options. +#' @param sample additionally returns count and sample size. FALSE by default #' #' @return data.frame #' @@ -1221,7 +1246,7 @@ summarise_rap_awareness_over_time <- function(data) { #' @importFrom dplyr select all_of group_by count mutate recode arrange #' @importFrom tidyr pivot_longer drop_na -calculate_freqs <- function(data, questions, levels, labels = NULL, prop = TRUE){ +calculate_freqs <- function(data, questions, levels, labels = NULL, prop = TRUE, sample = FALSE){ if (!is.null(labels)) { labels_list <- as.list(labels) @@ -1240,6 +1265,13 @@ calculate_freqs <- function(data, questions, levels, labels = NULL, prop = TRUE) colnames(frequencies) <- c("value", "n") + if (sample == TRUE) { + frequencies <- frequencies %>% + mutate(count = n) + + frequencies$sample <- sum(!is.na(selected_data[1])) + } + if (prop) { frequencies$n <- frequencies$n / ifelse(sum(frequencies$n, na.rm = TRUE)==0, @@ -1261,6 +1293,13 @@ calculate_freqs <- function(data, questions, levels, labels = NULL, prop = TRUE) colnames(frequencies) <- c("name", "value", "n") + if (sample == TRUE) { + frequencies <- frequencies %>% + mutate(count = n) + + frequencies$sample <- sum(!is.na(selected_data[1])) + } + if (prop) { frequencies <- prop_by_group(frequencies) } @@ -1269,6 +1308,7 @@ calculate_freqs <- function(data, questions, levels, labels = NULL, prop = TRUE) return(frequencies) } + #' @title Create tidy cross table #' #' @description Returns a cross table in tidy data format. @@ -1279,12 +1319,13 @@ calculate_freqs <- function(data, questions, levels, labels = NULL, prop = TRUE) #' @param levels1 factor levels for col1 #' @param levels2 factor levels for col2 #' @param prop whether to return proportion data (0-1). TRUE by default. Assumes mutually exclusive response options. +#' @param sample returns proportion, count and, group size and sample size. FALSE by default #' #' @return data.frame #' #' @importFrom dplyr all_of across -calculate_multi_table_freqs <- function(data, col1, col2, levels1, levels2, prop = TRUE){ +calculate_multi_table_freqs <- function(data, col1, col2, levels1, levels2, prop = TRUE, sample = FALSE){ selected_data <- data %>% dplyr::select(all_of(c(col1, col2))) @@ -1297,14 +1338,26 @@ calculate_multi_table_freqs <- function(data, col1, col2, levels1, levels2, prop drop_na() %>% data.frame() + if (sample == TRUE) { + frequencies <- frequencies %>% + group_by_at(1) %>% + mutate(count = n, + group_size = sum(n)) + + frequencies$sample <- sum(!is.na(selected_data[1])) + } + if(prop){ frequencies <- prop_by_group(frequencies) } + return(frequencies) } + + #' @title Convert frequencies to proportions #' #' @param data frequency table with three columns (can be of any name): name, value and count diff --git a/R/ingest.R b/R/ingest.R index d5c1a00..62069a8 100644 --- a/R/ingest.R +++ b/R/ingest.R @@ -213,7 +213,7 @@ get_all_waves <- function(mode = c("api", "file")) { tidy_colnames() %>% rename_cols() %>% apply_skip_logic() %>% - clean_departments() %>% + clean_data() %>% derive_vars() data$year <- 2023 diff --git a/man/calculate_freqs.Rd b/man/calculate_freqs.Rd index 53294a4..42ce78a 100644 --- a/man/calculate_freqs.Rd +++ b/man/calculate_freqs.Rd @@ -4,7 +4,14 @@ \alias{calculate_freqs} \title{Calculate frequencies} \usage{ -calculate_freqs(data, questions, levels, labels = NULL, prop = TRUE) +calculate_freqs( + data, + questions, + levels, + labels = NULL, + prop = TRUE, + sample = FALSE +) } \arguments{ \item{data}{full CARS data frame after pre-processing} @@ -16,6 +23,8 @@ calculate_freqs(data, questions, levels, labels = NULL, prop = TRUE) \item{labels}{labels to rename the column headers. Only needed for multi-column frequencies} \item{prop}{whether to return proportion data (0-1). TRUE by default. Assumes mutually exclusive response options.} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ data.frame diff --git a/man/calculate_multi_table_freqs.Rd b/man/calculate_multi_table_freqs.Rd index ff62baa..991ad10 100644 --- a/man/calculate_multi_table_freqs.Rd +++ b/man/calculate_multi_table_freqs.Rd @@ -4,7 +4,15 @@ \alias{calculate_multi_table_freqs} \title{Create tidy cross table} \usage{ -calculate_multi_table_freqs(data, col1, col2, levels1, levels2, prop = TRUE) +calculate_multi_table_freqs( + data, + col1, + col2, + levels1, + levels2, + prop = TRUE, + sample = FALSE +) } \arguments{ \item{data}{pre-processed CARS data set} @@ -18,6 +26,8 @@ calculate_multi_table_freqs(data, col1, col2, levels1, levels2, prop = TRUE) \item{levels2}{factor levels for col2} \item{prop}{whether to return proportion data (0-1). TRUE by default. Assumes mutually exclusive response options.} + +\item{sample}{returns proportion, count and, group size and sample size. FALSE by default} } \value{ data.frame diff --git a/man/summarise_ability_change.Rd b/man/summarise_ability_change.Rd index 9c9304d..3d2d7db 100644 --- a/man/summarise_ability_change.Rd +++ b/man/summarise_ability_change.Rd @@ -4,10 +4,12 @@ \alias{summarise_ability_change} \title{Summarise ability change frequency} \usage{ -summarise_ability_change(data) +summarise_ability_change(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_access_git.Rd b/man/summarise_access_git.Rd index ed5d029..32b05fb 100644 --- a/man/summarise_access_git.Rd +++ b/man/summarise_access_git.Rd @@ -4,10 +4,12 @@ \alias{summarise_access_git} \title{Summarise access to git} \usage{ -summarise_access_git(data) +summarise_access_git(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_all.Rd b/man/summarise_all.Rd index c0cdd8c..8173096 100644 --- a/man/summarise_all.Rd +++ b/man/summarise_all.Rd @@ -4,12 +4,14 @@ \alias{summarise_all} \title{Summarise all} \usage{ -summarise_all(data, all_tables = FALSE) +summarise_all(data, all_tables = FALSE, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} \item{all_tables}{logical: whether to produce all summary output tables. Defaults to FALSE.} + +\item{sample}{additionally returns count and sample size for selected tables for QA. FALSE by default} } \value{ list of frequency tables diff --git a/man/summarise_cap_change_by_freq.Rd b/man/summarise_cap_change_by_freq.Rd index 380d32a..44f329f 100644 --- a/man/summarise_cap_change_by_freq.Rd +++ b/man/summarise_cap_change_by_freq.Rd @@ -4,10 +4,12 @@ \alias{summarise_cap_change_by_freq} \title{Summarise capability change by coding frequency} \usage{ -summarise_cap_change_by_freq(data) +summarise_cap_change_by_freq(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{returns proportion, count and, group size and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_cap_change_by_line_manage.Rd b/man/summarise_cap_change_by_line_manage.Rd index c4a3fc9..c61d131 100644 --- a/man/summarise_cap_change_by_line_manage.Rd +++ b/man/summarise_cap_change_by_line_manage.Rd @@ -13,5 +13,5 @@ summarise_cap_change_by_line_manage(data) frequency table (data.frame) } \description{ -calculate the cross tab of capability change by management responsibilty +calculate the cross tab of capability change by management responsibility } diff --git a/man/summarise_code_freq.Rd b/man/summarise_code_freq.Rd index 128f279..76fd66d 100644 --- a/man/summarise_code_freq.Rd +++ b/man/summarise_code_freq.Rd @@ -4,10 +4,12 @@ \alias{summarise_code_freq} \title{Summarise coding frequency} \usage{ -summarise_code_freq(data) +summarise_code_freq(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_coding_practices.Rd b/man/summarise_coding_practices.Rd index 74f90be..d4894ef 100644 --- a/man/summarise_coding_practices.Rd +++ b/man/summarise_coding_practices.Rd @@ -4,10 +4,12 @@ \alias{summarise_coding_practices} \title{Summarise data practices questions} \usage{ -summarise_coding_practices(data) +summarise_coding_practices(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_coding_tools.Rd b/man/summarise_coding_tools.Rd index e8a0d39..cdcce48 100644 --- a/man/summarise_coding_tools.Rd +++ b/man/summarise_coding_tools.Rd @@ -4,7 +4,12 @@ \alias{summarise_coding_tools} \title{Summarise coding tools} \usage{ -summarise_coding_tools(data, type = list("knowledge", "access"), prop = TRUE) +summarise_coding_tools( + data, + type = list("knowledge", "access"), + prop = TRUE, + sample = FALSE +) } \arguments{ \item{data}{full CARS dataset after pre-processing} @@ -12,6 +17,8 @@ summarise_coding_tools(data, type = list("knowledge", "access"), prop = TRUE) \item{type}{type of table (knowledge or access)} \item{prop}{whether to return proportion data (0-1). TRUE by default. Assumes mutually exclusive response options.} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_doc.Rd b/man/summarise_doc.Rd index efe5600..1e34459 100644 --- a/man/summarise_doc.Rd +++ b/man/summarise_doc.Rd @@ -4,10 +4,12 @@ \alias{summarise_doc} \title{Frequency of documentation use} \usage{ -summarise_doc(data) +summarise_doc(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_languages_by_prof.Rd b/man/summarise_languages_by_prof.Rd index 8ab1e2f..00d9d15 100644 --- a/man/summarise_languages_by_prof.Rd +++ b/man/summarise_languages_by_prof.Rd @@ -4,10 +4,12 @@ \alias{summarise_languages_by_prof} \title{Summarise programming language knowledge by profession} \usage{ -summarise_languages_by_prof(data) +summarise_languages_by_prof(data, sample = FALSE) } \arguments{ \item{data}{CARS data (pre-processed)} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ data.frame diff --git a/man/summarise_rap_champ_status.Rd b/man/summarise_rap_champ_status.Rd index 483aa86..b0c33f0 100644 --- a/man/summarise_rap_champ_status.Rd +++ b/man/summarise_rap_champ_status.Rd @@ -4,10 +4,12 @@ \alias{summarise_rap_champ_status} \title{Knowledge of RAP Champions} \usage{ -summarise_rap_champ_status(data) +summarise_rap_champ_status(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_rap_comp.Rd b/man/summarise_rap_comp.Rd index 685ccd0..a660c48 100644 --- a/man/summarise_rap_comp.Rd +++ b/man/summarise_rap_comp.Rd @@ -4,10 +4,12 @@ \alias{summarise_rap_comp} \title{RAP score components} \usage{ -summarise_rap_comp(data) +summarise_rap_comp(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_rap_knowledge.Rd b/man/summarise_rap_knowledge.Rd index 18b617c..a355ca9 100644 --- a/man/summarise_rap_knowledge.Rd +++ b/man/summarise_rap_knowledge.Rd @@ -4,10 +4,12 @@ \alias{summarise_rap_knowledge} \title{Knowledge of RAP} \usage{ -summarise_rap_knowledge(data) +summarise_rap_knowledge(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_rap_opinions.Rd b/man/summarise_rap_opinions.Rd index cc64d5b..f9ad05c 100644 --- a/man/summarise_rap_opinions.Rd +++ b/man/summarise_rap_opinions.Rd @@ -4,10 +4,12 @@ \alias{summarise_rap_opinions} \title{Opinions of RAP} \usage{ -summarise_rap_opinions(data) +summarise_rap_opinions(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_strategy_knowledge.Rd b/man/summarise_strategy_knowledge.Rd index ce082d6..82341df 100644 --- a/man/summarise_strategy_knowledge.Rd +++ b/man/summarise_strategy_knowledge.Rd @@ -4,10 +4,12 @@ \alias{summarise_strategy_knowledge} \title{Summarise Analysis Function RAP strategy knowledge} \usage{ -summarise_strategy_knowledge(data) +summarise_strategy_knowledge(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/man/summarise_where_learned_code.Rd b/man/summarise_where_learned_code.Rd index 793f33f..2754cf0 100644 --- a/man/summarise_where_learned_code.Rd +++ b/man/summarise_where_learned_code.Rd @@ -4,10 +4,12 @@ \alias{summarise_where_learned_code} \title{Summarise where respondents learned to code} \usage{ -summarise_where_learned_code(data) +summarise_where_learned_code(data, sample = FALSE) } \arguments{ \item{data}{full CARS dataset after pre-processing} + +\item{sample}{additionally returns count and sample size. FALSE by default} } \value{ frequency table (data.frame) diff --git a/quarto/main/data_collection.qmd b/quarto/main/data_collection.qmd index e180438..b10031b 100644 --- a/quarto/main/data_collection.qmd +++ b/quarto/main/data_collection.qmd @@ -11,8 +11,7 @@ all_wave_data <- CARS::get_all_waves(mode = "file") data <- CARS::get_tidy_data_file("2023_data.csv") %>% CARS::rename_cols() %>% CARS::apply_skip_logic() %>% - CARS::clean_workplace() %>% - CARS::clean_departments() %>% + CARS::clean_data() %>% CARS::derive_vars() ``` diff --git a/quarto/main/summary.qmd b/quarto/main/summary.qmd index db963fa..9603333 100644 --- a/quarto/main/summary.qmd +++ b/quarto/main/summary.qmd @@ -12,8 +12,7 @@ library(magrittr) data <- CARS::get_tidy_data_file("2023_data.csv") %>% CARS::rename_cols() %>% CARS::apply_skip_logic() %>% - CARS::clean_workplace() %>% - CARS::clean_departments() %>% + CARS::clean_data() %>% CARS::derive_vars() all_wave_data <- CARS::get_all_waves(mode = "file") diff --git a/quarto/main/summary_qa.qmd b/quarto/main/summary_qa.qmd new file mode 100644 index 0000000..6cd91d0 --- /dev/null +++ b/quarto/main/summary_qa.qmd @@ -0,0 +1,328 @@ +--- +title: "Summary QA" +output: + html: + self-contained: true +--- + +```{r echo=FALSE} +library(magrittr) + +data <- CARS::get_tidy_data_file("2023_data.csv") %>% + CARS::rename_cols() %>% + CARS::apply_skip_logic() %>% + CARS::clean_data() %>% + CARS::derive_vars() + + +raw_data <- CARS::get_tidy_data_file("2023_data.csv") %>% + CARS::rename_cols() %>% + CARS::clean_data() %>% + CARS::derive_vars() + +all_wave_data <- CARS::get_all_waves(mode = "file") + +tables <- CARS::summarise_all(data, all_tables = TRUE, sample = TRUE) + +exp_samples <- CARS::sample_sizes(raw_data) + +``` + +### QA checklist: + +* Spelling, grammar and readability +* All charts and tables are present +* All charts have titles, legends and axis labels +* All links work as expected + +In addition, this document can be used to QA the data underlying each of the frequency tables and charts. Denominator checks take the expected sample size based on the raw data following the logic rules of the sample_sizes function, as an additional check for question routing. The expected sample size will vary for each question depending on question streaming rules. Other checks include raw data tables used for percentage calculations, which can be used to cross-check calculations are correct. + +The datasets used in this document are: + +* data: data as used in the final publication, with question skip logic applied +* raw_data: data without question skip logic applied, used to determine the expected sample sizes based on question streaming logic +* all_wave_data: data as used in the final publication for each year, with question skip logic applied + +## Coding frequency and tools +#### Summarise coding frequency +Check data against figure +```{r echo = FALSE} +knitr::kable(tables$code_freq) +``` + +Denominator check: +```{r echo = FALSE} +if(tables$code_freq$sample[1] != exp_samples$all) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$all)) +} +``` +#### Coding frequency over time +Sample size should be the total response for each year. Percentages are calculated within the summary.qmd code. +```{r echo = FALSE} + +all_wave_data$code_freq <- factor(all_wave_data$code_freq, levels = c( + "Never", + "Rarely", + "Sometimes", + "Regularly", + "All the time" +)) + +table(all_wave_data$year, all_wave_data$code_freq) %>% + data.frame %>% + dplyr::group_by(Var1) %>% + dplyr::summarise(sample = sum(Freq)) %>% + knitr::kable() + +``` + +### Access to and knowledge of programming languages +#### Access +Check data against figure, check proportions are correct +```{r echo = FALSE} +knitr::kable(tables$access) +``` +Denominator check: +```{r echo = FALSE} + +if(tables$access$sample[1] != exp_samples$all) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$all)) + +} +``` + +#### Knowledge +Check data against figure, check proportions are correct +```{r echo = FALSE} +knitr::kable(tables$knowledge) +``` +Denominator check: +```{r echo = FALSE} + +if(tables$knowledge$sample[1] != exp_samples$all) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$all)) + +} +``` + +#### Open source capability +Check percentages are correct from the data in the table: +```{r} +knitr::kable(CARS::summarise_os_vs_prop(all_wave_data)) +``` + +#### Different professions have capability in different tools +Check percentages are correct from the data in the table (final column = group sample size): +```{r echo = FALSE} +knitr::kable(tables$languages_by_prof) +``` +Denominator check - numbers of respondents in each profession, cross check with above: +```{r} +raw_data %>% + tidyr::pivot_longer(contains("prof"), names_to = "prof", values_to = "value") %>% + dplyr::group_by(prof) %>% + dplyr::summarise(n = sum(value == "Yes")) %>% + knitr::kable() +``` +#### Access to git +Check data against figure, check proportions are correct +```{r} +knitr::kable(tables$git_access) +``` +Denominator check: +```{r echo = FALSE} + +if(tables$git_access$sample[1] != exp_samples$all) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$all)) +} +``` + +## Capability +#### First learned +Check data against figure, check proportions are correct +```{r echo = FALSE} +knitr::kable(tables$where_learned) +``` +Denominator check: +```{r echo = FALSE} + +if(tables$where_learned$sample[1] != exp_samples$code_at_work) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$code_at_work)) + +} +``` + +#### Ability change +Check data against figure, check proportions are correct +```{r echo = FALSE} +knitr::kable(tables$ability_change) +``` +Denominator check: +```{r echo = FALSE} + +if(tables$ability_change$sample[1] != exp_samples$other_code_experience) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$other_code_experience)) +} +``` + +#### Ability change by frequency +Check data against figure, check proportions are correct +```{r} +knitr::kable(tables$capability_change_by_freq) +``` + +Sample size check: +```{r echo = FALSE} + +if(tables$capability_change_by_freq$sample[1] != exp_samples$other_code_experience) { + warning("Sample size different from expected") + print(paste0("Expected: ", exp_samples$other_code_experience)) + print(paste0("Actual: ", tables$capability_change_by_freq$sample[1])) +} else { + print(paste0("Sample size as expected: ", exp_samples$other_code_experience)) +} +``` + +## RAP +#### Awareness of RAP +Check that the percentages in the chart and the figures in the text are correct +```{r} +knitr::kable(CARS::summarise_rap_awareness_over_time(all_wave_data)) +``` + +#### RAP knowledge +Check data against figure, check proportions are correct +```{r} +knitr::kable(tables$rap_knowledge) +``` + +Denominator check: +```{r echo = FALSE} + +if(tables$rap_knowledge$sample[1] != exp_samples$code_at_work) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$code_at_work)) +} +``` + + +#### RAP champs +Check data against figure, check proportions are correct +```{r} +knitr::kable(tables$rap_champ_status) +``` + +Denominator check: +```{r echo = FALSE} + +if(tables$rap_champ_status$sample[1] != exp_samples$heard_of_RAP) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$heard_of_RAP)) +} +``` + + +#### RAP strategy knowledge +Check data against figure, check proportions are correct +```{r} +knitr::kable(tables$strategy_knowledge) +``` + +Denominator check: +```{r echo = FALSE} + +if(tables$strategy_knowledge$sample[1] != exp_samples$heard_of_RAP) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$heard_of_RAP)) +} +``` + + +#### RAP opinions +Check data against figure, check proportions are correct +```{r} +knitr::kable(tables$rap_opinions) +``` + +Denominator check: +```{r echo = FALSE} + +if(tables$rap_opinions$sample[1] != exp_samples$heard_of_RAP) { + warning("Denominator different from expected") + print(paste0("Expected: ", exp_samples$heard_of_RAP)) + print(paste0("Actual: ", tables$rap_opinions[1])) +} else { + print(paste0("Denominator as expected: ", exp_samples$heard_of_RAP)) +} +``` + +### Coding practices +Check data against figure, check proportions are correct +```{r} +knitr::kable(tables$rap_components) +``` + +Denominator check: +In this function, denominator is derived directly from data based on logic rules as below +```{r echo = FALSE} + +if(sum(data$code_freq != "Never", na.rm = TRUE) != exp_samples$code_at_work) { + warning("Denominator different from expected") +} else { + print(paste0("Denominator as expected: ", exp_samples$code_at_work)) + +} +``` + +#### Coding practices: frequency +Check data against figure, check proportions are correct +```{r} +knitr::kable(tables$coding_practices) +``` + +Denominator check: +```{r} + +if(tables$coding_practices$sample[1] != exp_samples$code_at_work) { + warning("Denominator different from expected") + print(paste0("Expected: ", exp_samples$code_at_work)) + print(paste0("Actual: ", tables$coding_practices$sample[1])) +} else { + print(paste0("Denominator as expected: ", exp_samples$code_at_work)) +} + +``` + +#### Documentation +Check data against figure, check proportions are correct +```{r} +knitr::kable(tables$doc) +``` + +Denominator check: +```{r} + +if(tables$doc$sample[1] != exp_samples$code_at_work) { + warning("Denominator different from expected") + print(paste0("Expected: ", exp_samples$code_at_work)) + print(paste0("Actual: ", tables$doc$sample[1])) +} else { + print(paste0("Denominator as expected: ", exp_samples$code_at_work)) +} + +``` diff --git a/quarto/templates/summary.qmd b/quarto/templates/summary.qmd index 7fdd740..c065135 100644 --- a/quarto/templates/summary.qmd +++ b/quarto/templates/summary.qmd @@ -14,8 +14,7 @@ library(magrittr) data <- CARS::get_tidy_data_file("2023_data.csv") %>% CARS::rename_cols() %>% CARS::apply_skip_logic() %>% - CARS::clean_workplace() %>% - CARS::clean_departments() %>% + CARS::clean_data() %>% CARS::derive_vars() data <- {{{filter}}} diff --git a/tests/testthat/test-summarise_cap_change_by_freq.R b/tests/testthat/test-summarise_cap_change_by_freq.R index 04ee53e..7ed025c 100644 --- a/tests/testthat/test-summarise_cap_change_by_freq.R +++ b/tests/testthat/test-summarise_cap_change_by_freq.R @@ -7,14 +7,20 @@ dummy_data <- data.frame( "It has stayed the same", "It has become slightly better", "It has become significantly better"), - each = 5), + each = 15), code_freq = rep(c( NA, "Sometimes", "All the time", "Rarely", "Regularly"), - times = 6) + times = 18), + other_coding_experience = rep(c( + NA, + "Yes", + "No"), + times = 30 + ) ) test_that("summarise_cap_change_by_freq missing data is handled correctly", {