best-practice-and-impact · ldavies99 · Jan 3, 2024 · Jan 2, 2024
diff --git a/NAMESPACE b/NAMESPACE
@@ -5,7 +5,9 @@ export(apply_skip_logic)
 export(break_q_names)
 export(calculate_freqs)
 export(check_skip_logic)
+export(clean_data)
 export(clean_departments)
+export(clean_first_learned)
 export(clean_workplace)
 export(compare_models)
 export(create_filtered_pages)

diff --git a/R/data_cleaning.R b/R/data_cleaning.R
@@ -129,6 +129,25 @@ rename_cols <- function(data) {
   return(data)
 }
 
+#' @title Clean data
+#'
+#' @description Runs the department, workplace and first_learned cleaning
+#' steps in sequence: clean_departments(), clean_workplace(), then
+#' clean_first_learned().
+#'
+#' @param data CARS dataset, passed first to clean_departments()
+#'
+#' @return CARS dataset returned by the final step, clean_first_learned()
+#' @export
+
+clean_data <- function(data) {
+  departments_cleaned <- clean_departments(data)
+  workplace_cleaned <- clean_workplace(departments_cleaned)
+
+  clean_first_learned(workplace_cleaned)
+}
+
+
 #' @title Clean department data
 #'
 #' @description add NHS to department list and merge departments where needed.
@@ -207,3 +226,30 @@ clean_workplace <- function(data) {
   return(data)
 
 }
+
+#' @title Clean first learned data
+#'
+#' @description Recode first_learned responses to "Self-taught" when their
+#' lower-cased text contains any of a fixed set of terms (e.g. "self",
+#' "hobby", "spare time"). Responses without a match are left unchanged.
+#'
+#' @param data CARS dataset with a first_learned column
+#'
+#' @return data with matching first_learned values set to "Self-taught"
+#' @export
+
+clean_first_learned <- function(data) {
+  # Terms indicating self-taught coding (matched on lower-cased text).
+  self_taught_terms <- c(
+    "self", "hobby", "personal", "independ", "home",
+    "for fun", "free time", "spare time", "childhood"
+  )
+  # One alternation regex: a response matches if it contains any term.
+  term_pattern <- stringr::str_c(self_taught_terms, collapse = "|")
+
+  responses <- tolower(data$first_learned)
+  is_self_taught <- stringr::str_detect(responses, term_pattern)
+  data$first_learned[is_self_taught] <- "Self-taught"
+
+  data
+}
diff --git a/R/frequency-tables.R b/R/frequency-tables.R
@@ -167,6 +167,7 @@ summarise_where_learned_code <- function(data){
               "Education",
               "Previous private sector employment",
               "Previous public sector employment",
+              "Self-taught",
               "Other")
 
   data <- data %>%

diff --git a/main.R b/main.R
@@ -3,8 +3,7 @@ library(magrittr)
 data <- CARS::get_tidy_data_file("2023_data.csv") %>%
   CARS::rename_cols() %>%
   CARS::apply_skip_logic() %>%
-  CARS::clean_workplace() %>%
-  CARS::clean_departments() %>%
+  CARS::clean_data() %>%
   CARS::derive_vars()
 
 CARS::create_filtered_pages(data, type = "departments")

diff --git a/man/clean_data.Rd b/man/clean_data.Rd
diff --git a/man/clean_first_learned.Rd b/man/clean_first_learned.Rd
diff --git a/tests/testthat/test-summarise_where_learned_code.R b/tests/testthat/test-summarise_where_learned_code.R
@@ -7,21 +7,22 @@ dummy_data <- data.frame(
     "Sometimes",
     "Regularly",
     "All the time"),
-    each=18),
+    each = 21),
 
   other_coding_experience = rep(c(
     NA,
     "Yes",
     "No"),
     times = 6,
-    each = 6),
+    each = 7),
 
   first_learned = rep(c(
     NA,
     "Current employment",
     "Education",
     "Previous private sector employment",
     "Previous public sector employment",
+    "Self-taught",
     "Other"),
     times = 18)
 
@@ -46,15 +47,17 @@ test_that("summarise_where_learned_code output is as expected", {
       "Education",
       "Previous private sector employment",
       "Previous public sector employment",
+      "Self-taught",
       "Other"),
       levels = c(
         "Current employment",
         "Education",
         "Previous private sector employment",
         "Previous public sector employment",
+        "Self-taught",
         "Other")),
 
-    n = c(19/47, rep(7/47, times=4))
+    n = c(24/64, rep(8/64, times=5))
 
   )