
Commit

Merge branch 'develop'
Alex Knudson committed Jun 12, 2020
2 parents fc86d13 + d6fe6e3 commit b0f7850
Showing 25 changed files with 509 additions and 600 deletions.
2 changes: 1 addition & 1 deletion .Rprofile
@@ -1 +1 @@
# reticulate::use_condaenv("bigsimr")
reticulate::use_condaenv("bigsimr-cpu")
5 changes: 3 additions & 2 deletions DESCRIPTION
@@ -1,6 +1,6 @@
Package: bigsimr
Title: An R Package for Generating High-Dimensional Random Vectors
Version: 0.5.0
Version: 0.6.0
Authors@R:
person(given = "Alexander",
family = "Knudson",
@@ -16,7 +16,8 @@ Imports:
parallel,
purrr,
coop,
pcaPP
pcaPP,
rlang
Remotes:
douglasgscofield/fastrank
License: GNU GPLv3
5 changes: 1 addition & 4 deletions NAMESPACE
@@ -1,16 +1,13 @@
# Generated by roxygen2: do not edit by hand

export(adjustForDiscrete)
export(all_corInBounds)
export(computeCorBounds)
export(convertCor)
export(fastCor)
export(install_bigsimr)
export(rcor)
export(rmvn)
export(rnbinom_params)
export(rmvu)
export(rvec)
export(which_corInBounds)
importFrom(reticulate,conda_binary)
importFrom(reticulate,py_available)
importFrom(reticulate,py_module_available)
236 changes: 50 additions & 186 deletions R/correlation_utils.R
@@ -20,89 +20,47 @@ convertCor <- function(rho,
function(r) r
)

Matrix::nearPD(A(rho), corr = TRUE)$mat
}


#' Adjust the correlation matrix when there are discrete distributions present
#'
#' @param rho The input correlation matrix
#' @param params The parameters of the marginals.
#' @param nSigmas The number of standard deviations from the mean
#' @export
adjustForDiscrete <- function(rho, params, nSigmas) {
upper_bound <- lapply(params, function(param) {
prob <- param[["prob"]]
size <- param[["size"]]

mu <- size * (1 - prob) / prob
sigma2 <- size * (1 - prob) / prob^2

ceiling(mu + nSigmas * sqrt(sigma2))
})
upper_bound <- max(unlist(upper_bound))

adj <- lapply(params, function(param) {
if (param[[1]] %in% discrete_dists) {
pmf_x <- do.call(paste0("d", param[[1]]),
c(list(x = 0:upper_bound), param[-1]))
return(sqrt(1 - sum(pmf_x^3)))
} else {
return(1)
}
})
adj <- unlist(adj)

idx_mat <- combn(x = length(params), m = 2)

rho_adj <- apply(idx_mat, 2, function(pair) {
adj_factor <- adj[pair[1]] * adj[pair[2]]
rho_tmp <- rho[pair[1], pair[2]]
min(rho_tmp / adj_factor, 1)
})

rho[lower.tri(rho)] <- rho_adj
rho[upper.tri(rho)] <- t(rho)[upper.tri(rho)]
diag(rho) <- 1.0
rho
Matrix::nearPD(A(rho), ensureSymmetry = FALSE, corr = TRUE)$mat
}
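
For context, a minimal standalone sketch of the kind of mapping convertCor() applies, assuming the standard bivariate-normal identity between Spearman and Pearson correlation (the exact set of conversions in the package may differ); the nearPD() call mirrors the projection added above, and the example matrix is illustrative only.

# Convert a Spearman correlation matrix to the Pearson scale and project the
# result back to the nearest valid correlation matrix.
rho_s <- matrix(c(1.0, 0.5, 0.3,
                  0.5, 1.0, 0.7,
                  0.3, 0.7, 1.0), nrow = 3)

# Bivariate-normal identity: rho_pearson = 2 * sin((pi / 6) * rho_spearman)
rho_p <- 2 * sin(pi / 6 * rho_s)

# The converted matrix need not be positive definite, so project it while
# keeping the unit diagonal (corr = TRUE).
rho_p <- as.matrix(Matrix::nearPD(rho_p, ensureSymmetry = FALSE, corr = TRUE)$mat)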


#' Computes the theoretical upper and lower bounds of possible correlations
#' given a set of marginals
#'
#' @param params The parameters of the marginals.
#' @param margins The parameters of the marginals.
#' @param cores The number of cores to utilize.
#' @param type The type of correlation matrix that is being passed.
#' @return A list containing the theoretical upper and lower bounds
#' @export
computeCorBounds <- function(params,
cores = 1,
computeCorBounds <- function(margins,
type = c("pearson", "kendall", "spearman"),
cores = 1,
reps = 1e5){

d <- length(params)
type <- match.arg(type)
d <- length(margins)
index_mat <- combn(x = d, m = 2)


# Replace the quantile function with the RNG function (e.g. qnorm -> rnorm)
q2r <- function(x) {
s <- rlang::as_string(x)
substr(s, 1, 1) <- "r" # replace 'q' with 'r'
rlang::ensym(s)
}


# Generate random samples for each margin and sort the vectors
# The sorted vectors are used for computing the bounds
if (cores == 1) {

sim_data <- sapply(1:d, function(i) {
sort(do.call(what = paste0("r", unlist(params[[i]][1])),
args = c(list(n = reps), params[[i]][-1])))
# Replace the quantile function with the RNG function (e.g. qnorm -> rnorm)
margins[[i]][[1]] <- q2r(margins[[i]][[1]])
margins[[i]]$n <- quote(reps)
eval( rlang::call2("sort", margins[[i]]) )
})
} else {
`%dopar%` <- foreach::`%dopar%`
cl <- parallel::makeCluster(cores, type = "FORK")
doParallel::registerDoParallel(cl)
sim_data <- foreach::foreach(i = 1:d, .combine = "cbind") %dopar% {
sort(do.call(what = paste0("r", unlist(params[[i]][1])),
args = c(list(n = reps), params[[i]][-1])))
}
}

index_mat <- combn(x = d, m = 2)

if (cores == 1) {
# Upper bounds
rho_upper <- fastCor(sim_data, method = type)

@@ -113,13 +71,25 @@ computeCorBounds <- function(params,
method = type)
}, data = sim_data, method = type)

rho_lower <- diag(x = 1, nrow = d, ncol = d)
rho_lower[lower.tri(rho_lower)] <- rho_lower_values
rho_lower <- t(rho_lower)
rho_lower <- matrix(0, d, d)
diag(rho_lower) <- 0.5
rho_lower[lower.tri(rho_lower)] <- rho_lower_values
rho_lower <- rho_lower + t(rho_lower)

} else {
# Upper bounds

# Set up the parallel computing environment
`%dopar%` <- foreach::`%dopar%`
cl <- parallel::makeCluster(cores, type = "FORK")
doParallel::registerDoParallel(cl)

sim_data <- foreach::foreach(i = 1:d, .combine = "cbind") %dopar% {
margins[[i]][[1]] <- q2r(margins[[i]][[1]])
margins[[i]]$n <- quote(reps)
eval( rlang::call2("sort", margins[[i]]) )
}

# Upper bounds (parallelized)
rho_upper_values <- parallel::parApply(
cl = cl,
X = index_mat,
@@ -128,13 +98,12 @@
fastCor(x = data[, index[1]],
y = data[, index[2]],
method = type)
}, data = sim_data, method = type)
}, data = sim_data, method = type)

# Ensure that the result is a valid correlation matrix
rho_upper <- diag(x = 1, nrow = d, ncol = d)
rho_upper[lower.tri(rho_upper)] <- rho_upper_values
rho_upper <- t(rho_upper)
rho_upper <- matrix(0, d, d)
diag(rho_upper) <- 0.5
rho_upper[lower.tri(rho_upper)] <- rho_upper_values
rho_upper <- rho_upper + t(rho_upper)

# Lower bounds
rho_lower_values <- parallel::parApply(
@@ -146,88 +115,19 @@
y = rev(data[, index[2]]),
method=type)
}, data = sim_data, method = type)
parallel::stopCluster(cl)

# Ensure that the result is a valid correlation matrix
rho_lower <- diag(x = 1, nrow = d, ncol = d)
rho_lower[lower.tri(rho_lower)] <- rho_lower_values
rho_lower <- t(rho_lower)
rho_lower <- matrix(0, d, d)
diag(rho_lower) <- 0.5
rho_lower[lower.tri(rho_lower)] <- rho_lower_values
}

list(upper = rho_upper,
lower = rho_lower)
}
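
As an aside, the sort-based construction above follows the Fréchet-Hoeffding idea: for a pair of margins, the largest attainable correlation comes from a comonotonic (both sorted) sample and the smallest from an antimonotonic (one sorted, one reverse-sorted) sample. A minimal sketch for a single pair of margins, chosen here only for illustration:

set.seed(1)
reps <- 1e5

# Draw from each margin and sort
x <- sort(rgamma(reps, shape = 2, rate = 1))
y <- sort(rnbinom(reps, size = 10, prob = 0.3))

upper <- cor(x, y,      method = "pearson")  # comonotonic pair
lower <- cor(x, rev(y), method = "pearson")  # antimonotonic pair
c(lower = lower, upper = upper)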


#' Constrains a correlation matrix to the theoretical upper and lower bounds
#'
#' @param rho The input correlation matrix.
#' @param rho_bounds A list containing the theoretical upper and lower bounds
#' @return The constrained correlation matrix
constrainRho <- function(rho, rho_bounds){
rho_tmp <- pmin(rho_bounds[["upper"]], rho)
pmax(rho_bounds[["lower"]], rho_tmp)
}


#' Returns a logical matrix of which correlations are in the feasible region
#'
#' @param rho The input correlation matrix.
#' @param rho_bounds A list containing the theoretical upper and lower bounds
#' @param negate Should the logical values be negated in order to identify
#' values that are outside the feasible region.
#' @export
which_corInBounds <- function(rho, rho_bounds, negate = FALSE){

tooSmall <- rho < rho_bounds[["lower"]]
tooLarge <- rho > rho_bounds[["upper"]]
rho_lower <- rho_lower + t(rho_lower)

outOfBounds <- tooSmall | tooLarge
inBounds <- !outOfBounds
# Shut down the parallel cluster
parallel::stopCluster(cl)

# Return either the in bounds or out of bounds matrix
if (negate) {
outOfBounds
} else {
inBounds
}
}


#' Returns TRUE if all correlation pairs are within the theoretical bounds
#'
#' @param rho The input correlation matrix.
#' @param params The parameters of the marginals.
#' @param cores The number of cores to utilize.
#' @param type The type of correlation matrix that is being passed.
#' @param rho_bounds Pre-computed upper and lower correlation bounds
#' @param ... Other arguments passed to `computeCorBounds()`
#' @return Logical. TRUE if all correlation pairs are within the theoretical
#' bounds, FALSE otherwise.
#' @export
all_corInBounds <- function(rho,
params,
cores = 1,
type = c("pearson", "spearman", "kendall"),
rho_bounds = NULL, ...){

if (is.null(rho_bounds)){
rho_bounds <- computeCorBounds(params = params,
cores = cores,
type = type,
...)
}

outOfBounds <- which_corInBounds(rho, rho_bounds, negate = TRUE)

if (any(outOfBounds)) {
warning(paste0("At least one of the specified correlations is either ",
"too large or too small. Please use 'which_corInBounds()' ",
"to see which correlations are valid."))
return(FALSE)
}
TRUE
list(upper = rho_upper,
lower = rho_lower)
}
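
The q2r() helper above swaps the leading "q" of a quantile function for "r" so that each margin's stored call can be reused to draw random samples. A minimal sketch of the same substitution, assuming a margin is stored as a quoted call such as quote(qnorm(mean = 0, sd = 1)) (the package's actual margin representation may differ); rlang::sym() turns the modified name back into a symbol:

margin <- quote(qnorm(mean = 0, sd = 1))
reps   <- 1e4

# Replace the quantile function with its RNG counterpart: qnorm -> rnorm
fn_name <- rlang::as_string(margin[[1]])
substr(fn_name, 1, 1) <- "r"
margin[[1]] <- rlang::sym(fn_name)

# Add the sample size and evaluate, sorting as computeCorBounds() does
margin$n <- reps
sorted_draws <- sort(eval(margin))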


@@ -239,6 +139,8 @@ all_corInBounds <- function(rho,
#' @export
fastCor <- function(x, y = NULL, method = c("pearson", "kendall", "spearman")) {

method <- match.arg(method)

stopifnot(method %in% c("pearson", "kendall", "spearman"))

if (is.data.frame(x)) {
@@ -270,41 +172,3 @@ fastCor <- function(x, y = NULL, method = c("pearson", "kendall", "spearman")) {
}

}
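
For illustration, a rough sketch of the kind of dispatch a fast correlation wrapper like this can perform, assuming coop::pcor() for Pearson, rank-transformed coop::pcor() for Spearman, and pcaPP::cor.fk() for Kendall (the actual branches inside fastCor() may differ):

fast_cor_sketch <- function(x, method = c("pearson", "kendall", "spearman")) {
  method <- match.arg(method)
  x <- as.matrix(x)
  switch(method,
         pearson  = coop::pcor(x),
         # Spearman = Pearson correlation of the column ranks
         spearman = coop::pcor(apply(x, 2, rank)),
         # O(n log n) Kendall's tau
         kendall  = pcaPP::cor.fk(x))
}

fast_cor_sketch(matrix(rnorm(300), ncol = 3), method = "kendall")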


#' Return the nearest positive definite correlation matrix
validcorr <- function(A) {
eye <- function(d) {
A <- matrix(0, d, d)
diag(A) <- 1.0
A
}

Ps <- function(A) {
eig <- eigen(A)
Q <- eig$vectors
D <- eig$values
D <- diag(ifelse(D < 0, 0, D))
Q %*% D %*% t(Q)
}

Pu <- function(X) {
X - diag(diag(X)) + eye(ncol(X))
}

d <- dim(A)
stopifnot(length(d) == 2, d[1] == d[2])

S <- matrix(0, d[1], d[2])
Y <- A

for (k in 1:(ncol(A)*100)) {
R <- Y - S
X <- Ps(R)
S <- X - R
Y <- Pu(X)
}

Pu(X)
}
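
The deleted validcorr() above is an alternating-projections scheme in the spirit of Higham (2002): Ps() projects onto the positive semidefinite cone, Pu() restores the unit diagonal, and S carries the Dykstra-style correction. With its removal, the same repair comes from Matrix::nearPD(), as in the updated convertCor(). A small example with an indefinite pseudo-correlation matrix chosen only for illustration:

A <- matrix(c( 1.0,  0.9,  0.7,
               0.9,  1.0, -0.9,
               0.7, -0.9,  1.0), nrow = 3)
eigen(A)$values          # at least one eigenvalue is negative

A_fixed <- as.matrix(Matrix::nearPD(A, corr = TRUE)$mat)
eigen(A_fixed)$values    # all eigenvalues non-negative (within tolerance)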

9 changes: 0 additions & 9 deletions R/normal2margin.R

This file was deleted.

