Make everything cleaner

debruine · May 3, 2019 · 74d8d42 · 74d8d42
1 parent ea2a335
commit 74d8d42
Show file tree

Hide file tree

Showing 51 changed files with 1,450 additions and 445 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -19,7 +19,7 @@ Imports:
     ggplot2
 License: MIT + file LICENSE
 Suggests:
-    testthat,
+    testthat (>= 2.1.0),
     knitr,
     rmarkdown
 VignetteBuilder: knitr

diff --git a/NAMESPACE b/NAMESPACE
@@ -5,6 +5,7 @@ export(check_design)
 export(check_sim_stats)
 export(cormat)
 export(cormat_from_triangle)
+export(get_design_long)
 export(is_pos_def)
 export(long2wide)
 export(make_id)

diff --git a/NEWS.md b/NEWS.md
@@ -5,4 +5,9 @@
 
 # faux 0.0.0.9005
 
-* Bug fixes for `sim_design()` (failed when within or between factor number was 0)
+* Bug fixes for `sim_design()` (failed when within or between factor number was 0)
+
+# faux 0.0.0.9006
+
+* Changes to argument order and names (more consistent, but may break old scripts)
+* Updated vignettes
diff --git a/R/check_design.R b/R/check_design.R
@@ -15,7 +15,7 @@
 #' 
 #' within <- list(time = c("day", "night"))
 #' between <- list(pet = c("dog", "cat"))
-#' design <- check_design(within, between)
+#' check_design(within, between)
 #' 
 #' @export
 #' 
@@ -116,12 +116,12 @@ check_design <- function(within = list(), between = list(),
     between_factors = between_factors,
     within_labels = within_labels,
     between_labels = between_labels,
+    cells_w = cells_w,
+    cells_b = cells_b,
     cell_n = cell_n,
     cell_mu = cell_mu,
     cell_sd = cell_sd,
     cell_r = cell_r,
-    cells_w = cells_w,
-    cells_b = cells_b,
     sub_id = sub_id
   )
 }
@@ -209,3 +209,115 @@ convert_param <- function (param, cells_w, cells_b, type = "this parameter") {
 
   dd
 }
+
+
+#' Get design from long data
+#' 
+#' Makes a best guess at the design of a long-format data frame. 
+#' Finds all columns that contain a single value per unit of analysis (between factors), 
+#' all columns that contain the same values per unit of analysis (within factors), and 
+#' all columns that differ over units of analysis (dv, continuous factors)
+#' 
+#' @param .data the data frame (in long format)
+#' @param id the column name(s) that identify a unit of analysis
+#' @param dv the column name that identifies the DV
+#' 
+#' @return the data frame in long format
+#' 
+#' @export
+#'
+get_design_long <- function(.data, id = "sub_id", dv = "val") {
+  between_factors <- .data %>%
+    dplyr::group_by_at(dplyr::vars(tidyselect::one_of(id))) %>%
+    dplyr::summarise_all(dplyr::n_distinct) %>%
+    dplyr::ungroup() %>%
+    dplyr::select(-tidyselect::one_of(id)) %>%
+    dplyr::summarise_all(max) %>%
+    dplyr::select_if(~ . == 1) %>%
+    names()
+
+  within_factors <- .data %>%
+    dplyr::select(-tidyselect::one_of(between_factors)) %>%
+    dplyr::group_by_at(dplyr::vars(tidyselect::one_of(id))) %>%
+    dplyr::summarise_all(paste0, collapse = ",") %>%
+    dplyr::ungroup() %>%
+    dplyr::select(-tidyselect::one_of(id)) %>%
+    dplyr::summarise_all(dplyr::n_distinct) %>%
+    dplyr::select_if(~ . == 1) %>%
+    names()
+
+  within <- .data %>%
+    dplyr::select(tidyselect::one_of(within_factors)) %>%
+    dplyr::mutate_all(as.factor) %>%
+    dplyr::summarise_all(~levels(.) %>% paste0(collapse = ".|.")) %>%
+    as.list() %>%
+    sapply(strsplit, split=".|.", fixed = TRUE)
+
+  between <- .data %>%
+    dplyr::select(tidyselect::one_of(between_factors)) %>%
+    dplyr::mutate_all(as.factor) %>%
+    dplyr::summarise_all(~levels(.) %>% paste0(collapse = ".|.")) %>%
+    as.list() %>%
+    sapply(strsplit, split=".|.", fixed = TRUE)
+
+  between_labels <- purrr::map(between, fix_name_labels)
+  within_labels <- purrr::map(within, fix_name_labels)
+
+  cells_b <- do.call(expand.grid, between) %>%
+    tidyr::unite("b", 1:ncol(.)) %>% dplyr::pull("b")
+
+  cells_w <- do.call(expand.grid, within) %>%
+    tidyr::unite("b", 1:ncol(.)) %>% dplyr::pull("b")
+
+  # get n, mu, sd, r per cell
+  chk <- check_sim_stats(.data, between_factors, within_factors, dv, id)
+
+  n <- chk %>%
+    tidyr::unite(".within", tidyselect::one_of(between_factors)) %>%
+    dplyr::select(.within, var, n) %>%
+    tidyr::spread(var, n) %>%
+    tibble::column_to_rownames(".within")
+
+  mu <- chk %>%
+    tidyr::unite(".within", tidyselect::one_of(between_factors)) %>%
+    dplyr::select(.within, var, mean) %>%
+    tidyr::spread(var, mean) %>%
+    tibble::column_to_rownames(".within")
+
+  sd <- chk %>%
+    tidyr::unite(".within", tidyselect::one_of(between_factors)) %>%
+    dplyr::select(.within, var, sd) %>%
+    tidyr::spread(var, sd) %>%
+    tibble::column_to_rownames(".within")
+
+  cors <- chk %>%
+    tidyr::unite(".between", tidyselect::one_of(between_factors)) %>%
+    dplyr::select(tidyselect::one_of(c(".between", "var", cells_w))) %>%
+    dplyr::mutate(var = forcats::fct_relevel(var, cells_w)) %>%
+    dplyr::arrange(var) %>%
+    dplyr::group_by(.between) %>%
+    tidyr::nest(.key = "r") %>%
+    as.list() 
+
+  r <- purrr::map(cors$r, ~tibble::column_to_rownames(., "var"))
+  names(r) <- cors$.between
+
+  design <- list(
+    within = within,
+    between = between,
+    within_factors = within_factors,
+    between_factors = between_factors,
+    within_labels = within_labels,
+    between_labels = between_labels,
+    cells_w = cells_w,
+    cells_b = cells_b,
+    cell_n = n,
+    cell_mu = mu,
+    cell_sd = sd,
+    cell_r = r,
+    sub_id = id
+  )
+
+  design
+}
+
diff --git a/R/check_sim_stats.R b/R/check_sim_stats.R
@@ -2,30 +2,30 @@
 #'
 #' \code{check_sim_stats} Generates a table of the correlations and means of numeric columns in a data frame
 #'
-#' @param dat the existing dataframe
+#' @param .data the existing tbl
 #' @param between a vector of column names for between-subject factors
 #' @param within a vector of column names for within-subject factors (if data is long)
 #' @param dv the column name of the dv (if data is long)
 #' @param id the column name(s) of the subject ID (if data is long)
 #' @param digits how many digits to round to (default = 2)
 #' @param usekable logical. If TRUE, output with knitr::kable
 #' 
-#' @return tibble or kable
+#' @return a tbl or kable
 #' @examples
 #' check_sim_stats(iris, "Species")
 #' @export
 #' 
 
-check_sim_stats <- function(dat, between = c(), within = c(), dv = c(), id = c(),
+check_sim_stats <- function(.data, between = c(), within = c(), dv = c(), id = c(),
                             digits = 2, usekable = FALSE) {
 
   if (length(within) && length(dv) && length(id)) {
     # convert long to wide
-    dat <- long2wide(dat, within, between, dv, id) %>%
+    .data <- long2wide(.data, within, between, dv, id) %>%
       dplyr::select(-tidyselect::one_of(id))
   }
 
-  grpdat <- select_num_grp(dat, between)
+  grpdat <- select_num_grp(.data, between)
   grpvars <- dplyr::group_vars(grpdat)
   numvars <- names(grpdat)[!names(grpdat) %in% grpvars]
 

diff --git a/R/long2wide.R b/R/long2wide.R
@@ -2,18 +2,22 @@
 #' 
 #' Converts data from long format to wide
 #' 
-#' @param dat the long data frame to convert
+#' @param .data the tbl in long format
 #' @param within the names of the within column(s)
 #' @param between the names of between column(s) (optional)
 #' @param dv the name of the DV (value) column
 #' @param id the names of the column(s) for grouping observations
 #' 
-#' @return the data frame in wide format
+#' @return a tbl in wide format
+#' 
+#' @examples 
+#' df_long <- sim_design(2, 2, long = TRUE)
+#' long2wide(df_long, "A", "B", "val", "sub_id")
 #' 
 #' @export
 #' 
-long2wide <- function(dat, within = c(), between = c(), dv = c(), id = c()) {
-  dat %>%
+long2wide <- function(.data, within = c(), between = c(), dv = "val", id = "sub_id") {
+  .data %>%
     dplyr::select(tidyselect::one_of(c(id, between, within, dv))) %>%
     tidyr::unite(".tmpwithin.", tidyselect::one_of(within))  %>%
     dplyr::group_by_at(dplyr::vars(tidyselect::one_of(between))) %>%
@@ -25,17 +29,25 @@ long2wide <- function(dat, within = c(), between = c(), dv = c(), id = c()) {
 #' 
 #' Converts data from wide format to long
 #' 
-#' @param dat the wide data frame to convert
-#' @param within the names of the within factors
-#' @param dv the names of the DV (value) columns
-#' @param sep Separator between columns (see tidyr::separate)
+#' @param .data the tbl in wide format
+#' @param within_factors the names of the within factors
+#' @param within_cols the names (or indices) of the within-subject (value) columns
+#' @param sep Separator for within-columns (see tidyr::separate)
+#' 
+#' @return a tbl in long format
 #' 
-#' @return the data frame in long format
+#' @examples 
+#' wide2long(iris, c("Feature", "Measure"), 1:4)
 #' 
 #' @export
 #' 
-wide2long <- function(dat, within_factors = c(), within_cols = c(), sep = "[^[:alnum:]]+") {
-  dat %>%
+wide2long <- function(.data, within_factors = c(), within_cols = c(), sep = "[^[:alnum:]]+") {
+  if (is.numeric(within_cols)) {
+    within_cols <- names(.data)[within_cols]
+  }
+
+  .data %>%
     tidyr::gather(".tmpwithin.", "val", tidyselect::one_of(within_cols)) %>%
     tidyr::separate(".tmpwithin.", within_factors, sep = sep)
-}
+}
+
diff --git a/R/make_id.R b/R/make_id.R
@@ -1,17 +1,27 @@
 #' Make ID
 #' 
 #' Make IDs with fixed length and a letter prefix for random effects (e.g., S001, S002, ..., S100).
-#' @param n the number of IDs to generate
+#' @param n the number of IDs to generate (or a vector of numbers)
 #' @param prefix the letter prefix to the number
+#' @param digits the number of digits to use for the numeric part. Only used if this is larger than the number of digits in n.
 #' 
 #' @return a vector of IDs
 #' @export
 #' 
 #' @examples 
 #' 
 #' make_id(20, "SUBJECT_")
+#' make_id(10:30, digits = 3)
 #' 
-make_id <- function(n = 100, prefix = "S") {
-  max_digits <- floor(log10(n))+1
-  paste0(prefix, formatC(1:n, width = max_digits, flag = "0"))
-}
+make_id <- function(n = 100, prefix = "S", digits = 0) {
+  # set max digits to the larger of digits in `n`` or `digits`
+  if (length(n) == 1) {
+    max_n <- n
+    n <- 1:max_n
+  } else {
+    max_n <- max(n)
+  }
+
+  max_digits <- max(floor(log10(max_n))+1, digits)
+  paste0(prefix, formatC(n, width = max_digits, flag = "0"))
+}
diff --git a/R/rnorm_multi.R b/R/rnorm_multi.R
@@ -1,6 +1,6 @@
 #' Multiple Normally Distributed Vectors
 #'
-#' \code{rnorm_multi} makes multiple normally distributed vectors with specified relationships
+#' \code{rnorm_multi()} makes multiple normally distributed vectors with specified relationships.
 #'
 #' @param n the number of samples required
 #' @param vars the number of variables to return
@@ -12,10 +12,12 @@
 #' @param as.matrix logical. If true, returns a matrix
 #' @param cors (deprecated; use r)
 #' 
-#' @return dataframe of vars vectors
+#' @return a tbl of vars vectors
+#' 
 #' @examples
-#' rnorm_multi(100, 3, c(0.2, 0.4, 0.5), varnames=c("A", "B", "C"))
-#' rnorm_multi(100, 3, c(1, 0.2, -0.5, 0.2, 1, 0.5, -0.5, 0.5, 1), varnames=c("A", "B", "C"))
+#' rnorm_multi(100, 3, 0, 1, c(0.2, 0.4, 0.5), varnames=c("A", "B", "C"))
+#' rnorm_multi(100, 3, 0, 1, c(1, 0.2, -0.5, 0.2, 1, 0.5, -0.5, 0.5, 1), varnames=c("A", "B", "C"))
+#' 
 #' @export
 
 rnorm_multi <- function(n, vars = 3, mu = 0, sd = 1, r = 0,

diff --git a/R/select_num_grp.R b/R/select_num_grp.R
@@ -2,39 +2,39 @@
 #'
 #' \code{select_num_grp} Select grouping and (optionally specified) numeric columns and group
 #'
-#' @param dat the existing dataframe
+#' @param .data the existing tbl
 #' @param between an optional list of column names to group by
 #' @param cols an optional list of column names to return (default of NULL returns all numeric columns)
 #' 
-#' @return tibble
+#' @return a tbl
 #' @examples
 #' select_num_grp(iris, "Species")
 #' @export
 
-select_num_grp <- function(dat, between = c(), cols = NULL) {
+select_num_grp <- function(.data, between = c(), cols = NULL) {
   # error checking -----------------
-  if (is.matrix(dat)) {
-    dat = as.data.frame(dat)
-  } else if (!is.data.frame(dat)) {
-    stop("dat must be a data frame or matrix")
+  if (is.matrix(.data)) {
+    .data = as.data.frame(.data)
+  } else if (!is.data.frame(.data)) {
+    stop(".data must be a data frame or matrix")
   }
 
   # select only grouping and numeric columns -----------------
   if (is.null(between)) {
     # no grouping, so select all numeric columns
-    numdat <- dplyr::select_if(dat, is.numeric)
+    numdat <- dplyr::select_if(.data, is.numeric)
     grpdat <- numdat
   } else if (is.numeric(between) || is.character(between)) {
     # get grouping column names if specified by index
-    if (is.numeric(between)) between <- names(dat)[between]
+    if (is.numeric(between)) between <- names(.data)[between]
 
     # numeric columns, excluding grouping columns
-    numdat <- dat %>%
+    numdat <- .data %>%
       dplyr::select(-tidyselect::one_of(between)) %>%
       dplyr::select_if(is.numeric)
 
     # get grouping columns, add remaining numeric columns, and group
-    grpdat <- dat %>%
+    grpdat <- .data %>%
       dplyr::select(tidyselect::one_of(between)) %>%
       dplyr::bind_cols(numdat) %>%
       dplyr::group_by_at(dplyr::vars(tidyselect::one_of(between)))
@@ -44,7 +44,7 @@ select_num_grp <- function(dat, between = c(), cols = NULL) {
 
   if (!is.null(cols)) {
     # return only grouping and cols
-    if (is.numeric(cols)) cols <- names(dat)[cols]
+    if (is.numeric(cols)) cols <- names(.data)[cols]
 
     grpdat <- grpdat %>%
       dplyr::select(tidyselect::one_of(c(between, cols)))