Merge pull request #133 from dfe-analytical-services/sen-standards-check

Standardised col_name check
dfe-analytical-services · Sep 6, 2024 · 0d23027 · 0d23027
2 parents a9570b1 + 20042cb
commit 0d23027
Show file tree

Hide file tree

Showing 43 changed files with 633 additions and 1,842 deletions.
diff --git a/R/knownVariables.r b/R/knownVariables.r
@@ -117,3 +117,5 @@ acceptable_indicatorunits <- c("%", "pp", "£", "£m")
 
 # Harmonised values ===================================================================================================
 ethnicity_standard_values <- suppressMessages(read_csv("data/ethnicity.csv"))
+
+harmonised_col_names <- suppressMessages(read_csv("data/harmonised_col_names.csv"))
diff --git a/R/mainTests.r b/R/mainTests.r
@@ -58,7 +58,7 @@ mainTests <- function(data_character, meta_character, datafile, metafile) {
       indicator_dp(metafile), # active test
       indicator_dp_validation(metafile), # active test
       indicator_dp_completed(metafile), # active test
-      ethnicity_headers(metafile), # active test
+      standard_filter_headers(metafile), # active test
       ethnicity_values(datafile), # active test
       ethnicity_characteristic_group(datafile), # active test
       ethnicity_characteristic_values(datafile), # active test
@@ -2556,36 +2556,45 @@ indicator_dp_completed <- function(meta) {
   return(output)
 }
 
-ethnicity_headers <- function(meta) {
-  # First find any ethnicity type columns that don't have the standard col_names
-  ethnicity_standard_headers <- c("ethnicity_major", "ethnicity_minor", "ethnicity_detailed", "minority_ethnic")
-  ethnicity_columns <- meta %>%
+standard_filter_headers <- function(meta) {
+  # Collapse search terms for bad column names into regex term
+  search_string <- harmonised_col_names %>%
+    pull(col_name_search_string) %>%
+    unique() %>%
+    paste(
+      collapse = "|"
+    )
+  # Pivot the metadata so that col_name and filter_grouping_column values sit
+  # in a single column, then filter for possible non-standard filter names.
+  standard_col_names <- harmonised_col_names %>%
+    pull(col_name_harmonised) %>%
+    unique()
+  bad_col_names <- meta %>%
+    select(col_name, filter_grouping_column) %>%
+    pivot_longer(
+      c(col_name, filter_grouping_column),
+      values_to = "col_name"
+    ) %>%
     filter(
-      grepl("ethnic", tolower(col_name)),
-      !(col_name %in% ethnicity_standard_headers)
+      grepl(search_string, tolower(col_name)),
+      !(col_name %in% standard_col_names)
     ) %>%
     pull(col_name)
-  if (length(ethnicity_columns) == 0) {
+  if (length(bad_col_names) == 0) {
     output <- list(
-      "message" = "No ethnicity header issues found.",
+      "message" = "No standardised col_name issues found.",
       "result" = "PASS"
     )
-  } else if (length(ethnicity_columns) == 1) {
-    output <- list(
-      "message" = paste0(
-        paste(ethnicity_columns, collapse = "', '"), " appears to relate to ethnicity data, but does not conform to the standard col_name conventions: ",
-        paste(ethnicity_standard_headers, collapse = ", "),
-        "."
-      ),
-      "result" = "FAIL"
-    )
   } else {
     output <- list(
       "message" = paste0(
-        "The following columns appear to relate to ethnicity data, but do not conform to the standard col_name conventions: <br> - '",
-        paste(ethnicity_columns, collapse = "', '"), "'. <br> - These should take the form of one of the following: ",
-        paste(ethnicity_standard_headers, collapse = ", "),
-        "."
+        "The column(s) '",
+        paste(bad_col_names, collapse = "', '"), "' appear to relate to ",
+        "contexts that fall under the harmonised data standards. Please verify",
+        " your column headers against the data standards in the <a href=",
+        "'https://dfe-analytical-services.github.io/analysts-guide/",
+        "statistics-production/ud.html#common-harmonised-variables'",
+        ">DfE harmonised data guidance</a>."
       ),
       "result" = "FAIL"
     )

diff --git a/data/harmonised_col_names.csv b/data/harmonised_col_names.csv
@@ -0,0 +1,17 @@
+col_name_search_string,col_name_harmonised
+sen_,sen_status
+sen_,sen_primary_need
+sen_,sen_secondary_need
+sen_,sen_provision
+primary_need,sen_primary_need
+secondary_need,sen_secondary_need
+establishment_type,establishment_type
+establishment_type,establishment_type_group
+provider_type,provider_type
+school_type,establishment_type
+school_type,establishment_type_group
+phase,education_phase
+ethnic,ethnicity_major
+ethnic,ethnicity_minor
+ethnic,ethnicity_detailed
+ethnic,minority_ethnic
diff --git a/data/sen.csv b/data/sen.csv
@@ -7,7 +7,7 @@ sen_provision,SEN provision,SEN support / SEN without an EHC plan,sen_status,Any
 sen_provision,SEN provision,No SEN provision,sen_status,No identified special educational need,
 sen_provision,SEN provision,Total,sen_status,Total,
 sen_primary_need,SEN primary need,All primary need,,,
-sen_primary_need,SEN primary need,"Autistic spectrum disorder	",,,ASD
+sen_primary_need,SEN primary need,"Autistic spectrum disorder",,,ASD
 sen_primary_need,SEN primary need,Hearing impairment,,,HI
 sen_primary_need,SEN primary need,Not reported,,,
 sen_primary_need,SEN primary need,Moderate learning difficulty,,,MLD
@@ -24,7 +24,7 @@ sen_primary_need,SEN primary need,Vision impairment,,,VI
 sen_primary_need,SEN primary need,No primary need,,,
 sen_primary_need,SEN primary need,Unknown,,,
 sen_secondary_need,SEN secondary need,All secondary need,,,
-sen_secondary_need,SEN secondary need,"Autistic spectrum disorder	",,,ASD
+sen_secondary_need,SEN secondary need,"Autistic spectrum disorder",,,ASD
 sen_secondary_need,SEN secondary need,Hearing impairment,,,HI
 sen_secondary_need,SEN secondary need,Not reported,,,
 sen_secondary_need,SEN secondary need,Moderate learning difficulty,,,MLD
Original file line number	Diff line number	Diff line change
Expand Up		@@ -117,3 +117,5 @@ acceptable_indicatorunits <- c("%", "pp", "£", "£m")

		# Harmonised values ===================================================================================================
		ethnicity_standard_values <- suppressMessages(read_csv("data/ethnicity.csv"))

		harmonised_col_names <- suppressMessages(read_csv("data/harmonised_col_names.csv"))