subject summary outcomes too

The outcomes are now all doubles. That' why the 'subject-survey' csv looks renamed to git, but the RDS can tell the difference. See #35
nlsy-links · Jan 6, 2018 · 5eebf31 · 5eebf31
1 parent 55219e3
commit 5eebf31
Show file tree

Hide file tree

Showing 9 changed files with 20,011 additions and 125 deletions.
diff --git a/config.yml b/config.yml
@@ -1,4 +1,6 @@
 default:
   trials: 5
-  outcomes-79-csv: "./data-public/derived/outcomes-79-v%i.csv"
-  outcomes-79-rds: "./data-public/derived/outcomes-79.rds"
+  outcomes-79-subject-csv:        "./data-public/derived/outcomes-79-subject-v%i.csv"
+  outcomes-79-subject-rds:        "./data-public/derived/outcomes-79-subject.rds"
+  outcomes-79-subject-survey-csv: "./data-public/derived/outcomes-79-subject-survey-v%i.csv"
+  outcomes-79-subject-survey-rds: "./data-public/derived/outcomes-79-subject-survey.rds"
diff --git a/dal/outcomes/outcomes-79.R b/dal/outcomes/outcomes-79.R
@@ -47,39 +47,79 @@ sql_survey_time <- "
     LEFT JOIN Process.tblSubject s      ON t.SubjectTag = s.SubjectTag
   ORDER BY t.SubjectTag, t.SurveyYear
 "
+sql_subject_generation <- "
+  SELECT
+    SubjectTag    AS subject_tag,
+    Generation    AS generation
+  FROM Process.tblSubject
+"
 
 sql_algorithm_version_max <- "SELECT MAX(AlgorithmVersion) as version FROM Archive.tblRelatedValuesArchive"
-path_out_csv_raw          <- config::get("outcomes-79-csv")
-path_out_rds              <- config::get("outcomes-79-rds")
+path_out_subject_csv_raw                <- config::get("outcomes-79-subject-csv")
+path_out_subject_rds                    <- config::get("outcomes-79-subject-rds")
+path_out_subject_survey_csv_raw         <- config::get("outcomes-79-subject-survey-csv")
+path_out_subject_survey_rds             <- config::get("outcomes-79-subject-survey-rds")
 
 # ---- load-data ---------------------------------------------------------------
 
 channel                   <- open_dsn_channel_odbc()
 ds_outcome                <- DBI::dbGetQuery(channel, sql_outcome    )
 ds_survey_time            <- DBI::dbGetQuery(channel, sql_survey_time)
+ds_subject_generation     <- DBI::dbGetQuery(channel, sql_subject_generation)
 ds_algorithm_version      <- DBI::dbGetQuery(channel, sql_algorithm_version_max)
-DBI::dbDisconnect(channel); rm(channel, sql_outcome, sql_survey_time, sql_algorithm_version_max)
+DBI::dbDisconnect(channel); rm(channel, sql_outcome, sql_survey_time, sql_subject_generation, sql_algorithm_version_max)
 
 # ---- tweak-data --------------------------------------------------------------
 # OuhscMunge::column_rename_headstart(ds_county) #Spit out columns to help write call ato `dplyr::rename()`.
 dim(ds_survey_time)
 dim(ds_outcome)
+dim(ds_subject_generation)
 ds_algorithm_version
 
-path_out_csv <- sprintf(path_out_csv_raw, ds_algorithm_version$version)
+path_out_subject_csv        <- sprintf(path_out_subject_csv_raw         , ds_algorithm_version$version)
+path_out_subject_survey_csv <- sprintf(path_out_subject_survey_csv_raw  , ds_algorithm_version$version)
 
 ds_survey_time <- ds_survey_time %>%
   tibble::as_tibble()
 
-ds <- ds_outcome %>%
+ds_subject_generation <- ds_subject_generation %>%
+  tibble::as_tibble()
+
+ds_outcome <- ds_outcome %>%
   tibble::as_tibble() %>%
   dplyr::mutate(
-    item_label    = OuhscMunge::snake_case(item_label)
+    item_label          = OuhscMunge::snake_case(item_label),
+    value               = dplyr::if_else(item_label=="gen1_afqt_scaled3_decimals", value / 1000, as.numeric(value)), # If the conversion is here, then the other variables must be double-precision floating points.
+    item_label          = dplyr::recode(item_label, "gen1_afqt_scaled3_decimals"="afqt_scaled_gen1") # If the conversion is here, then the other variables must be double-precision floating points.
+  )
+
+# table(ds_outcome$item_label)
+
+# ---- groom-subject -----------------------------------------------------------
+ds_subject <- ds_outcome %>%
+  dplyr::group_by(subject_tag, item_label) %>%
+  dplyr::summarize(
+    value  = OuhscMunge::first_nonmissing(value)
   ) %>%
+  dplyr::ungroup() %>%
+  tidyr::spread(key=item_label, value=value) %>%
+  dplyr::left_join(ds_subject_generation, by="subject_tag") %>%
+  dplyr::select(
+    subject_tag,
+    generation,
+    height_inches   = gen1_height_inches,
+    weight_pounds   = gen1_weight_pounds,
+    afqt_scaled_gen1,
+    father_alive    = gen2_c_father_alive
+  )
+
+
+# ---- groom-subject-year ------------------------------------------------------
+ds_subject_survey <- ds_outcome %>%
   tidyr::spread(key=item_label, value=value) %>%
   dplyr::left_join(ds_survey_time, by=c("subject_tag", "survey_year")) %>%
   dplyr::mutate(
-    afqt_scaled_gen1   = gen1_afqt_scaled3_decimals / 1000
+    # afqt_scaled_gen1   = gen1_afqt_scaled3_decimals / 1000
   ) %>%
   dplyr::select(
     subject_tag,
@@ -92,34 +132,63 @@ ds <- ds_outcome %>%
     father_alive    = gen2_c_father_alive
   )
 
-ds
-# ---- verify-values -----------------------------------------------------------
+ds_subject_survey
+
+# ---- verify-values-ds_subject -----------------------------------------------------------
+# Sniff out problems
+summary(ds_subject)
+checkmate::assert_integer(ds_subject$subject_tag          , lower=         100L   , upper=1268600L, any.missing=F, unique=F)
+checkmate::assert_integer(ds_subject$generation           , lower=           1L   , upper=      2L, any.missing=F)
+checkmate::assert_numeric(ds_subject$height_inches        , lower=          48L   , upper=     83L, any.missing=T)
+checkmate::assert_numeric(ds_subject$weight_pounds        , lower=          47L   , upper=    400L, any.missing=T)
+checkmate::assert_numeric(ds_subject$afqt_scaled_gen1     , lower=           0    , upper=    100 , any.missing=T)
+checkmate::assert_numeric(ds_subject$father_alive         , lower=          -3L   , upper=      1L, any.missing=T)
+
+# ---- verify-values-ds_subject_survey -----------------------------------------------------------
 # Sniff out problems
-summary(ds)
-checkmate::assert_integer(ds$subject_tag          , lower=         100L   , upper=1268600L, any.missing=F, unique=F)
-checkmate::assert_integer(ds$survey_year          , lower=        1979L   , upper=   2016L, any.missing=F, unique=F)
-checkmate::assert_integer(ds$generation           , lower=           1L   , upper=      2L, any.missing=F)
-checkmate::assert_numeric(ds$age                  , lower=           0    , upper=     40 , any.missing=T)
-checkmate::assert_integer(ds$height_inches        , lower=          48L   , upper=     83L, any.missing=T)
-checkmate::assert_integer(ds$weight_pounds        , lower=          47L   , upper=    400L, any.missing=T)
-checkmate::assert_numeric(ds$afqt_scaled_gen1     , lower=           0    , upper=    100 , any.missing=T)
-checkmate::assert_integer(ds$father_alive         , lower=          -3L   , upper=      1L, any.missing=T)
-
-# ---- specify-columns-to-upload -----------------------------------------------
-# dput(colnames(ds)) # Print colnames for line below.
+summary(ds_subject_survey)
+checkmate::assert_integer(ds_subject_survey$subject_tag          , lower=         100L   , upper=1268600L, any.missing=F, unique=F)
+checkmate::assert_integer(ds_subject_survey$survey_year          , lower=        1979L   , upper=   2016L, any.missing=F, unique=F)
+checkmate::assert_integer(ds_subject_survey$generation           , lower=           1L   , upper=      2L, any.missing=F)
+checkmate::assert_numeric(ds_subject_survey$age                  , lower=           0    , upper=     40 , any.missing=T)
+checkmate::assert_numeric(ds_subject_survey$height_inches        , lower=          48L   , upper=     83L, any.missing=T)
+checkmate::assert_numeric(ds_subject_survey$weight_pounds        , lower=          47L   , upper=    400L, any.missing=T)
+checkmate::assert_numeric(ds_subject_survey$afqt_scaled_gen1     , lower=           0    , upper=    100 , any.missing=T)
+checkmate::assert_numeric(ds_subject_survey$father_alive         , lower=          -3L   , upper=      1L, any.missing=T)
+
+# ---- specify-columns-to-upload-ds_subject -----------------------------------------------
+# dput(colnames(ds_subject_survey)) # Print colnames for line below.
+columns_to_write <- c(
+  "subject_tag", "generation",
+  "height_inches", "weight_pounds", "afqt_scaled_gen1",
+  "father_alive"
+)
+ds_slim_subject <- ds_subject %>%
+  # dplyr::slice(1:100) %>%
+  dplyr::select(!!columns_to_write)
+ds_slim_subject
+
+rm(columns_to_write)
+
+# ---- specify-columns-to-upload-ds_subject_survey -----------------------------------------------
+# dput(colnames(ds_subject_survey)) # Print colnames for line below.
 columns_to_write <- c(
   "subject_tag", "survey_year", "generation", "age",
   "height_inches", "weight_pounds", "afqt_scaled_gen1",
   "father_alive"
-  )
-ds_slim <- ds %>%
+)
+ds_slim_subject_survey <- ds_subject_survey %>%
   # dplyr::slice(1:100) %>%
   dplyr::select(!!columns_to_write)
-ds_slim
+ds_slim_subject_survey
 
 rm(columns_to_write)
 
 # ---- save-to-disk ------------------------------------------------------------
 # The content of these two datasets are identical.  The first one is a plain-text csv.  The second one is a native R object.
-readr::write_csv(ds, path_out_csv)
-readr::write_rds(ds, path_out_rds, compress="xz")
+readr::write_csv(ds_slim_subject, path_out_subject_csv)
+readr::write_rds(ds_slim_subject, path_out_subject_rds, compress="xz")
+
+# The content of these two datasets are identical.  The first one is a plain-text csv.  The second one is a native R object.
+readr::write_csv(ds_slim_subject_survey, path_out_subject_survey_csv)
+readr::write_rds(ds_slim_subject_survey, path_out_subject_survey_rds, compress="xz")
diff --git a/data-public/derived/outcomes-79-v86.csv → ...erived/outcomes-79-subject-survey-v86.csv b/data-public/derived/outcomes-79-v86.csv → ...erived/outcomes-79-subject-survey-v86.csv
diff --git a/data-public/derived/outcomes-79-subject-survey.rds b/data-public/derived/outcomes-79-subject-survey.rds