#' many columns as there are terms in the vocabulary.
#'
#' @export
jsPCA <- function(phi) {
  # stats::cmdscale() below is called with k = 2 and requires k <= n - 1,
  # so fewer than three rows can never be laid out in two dimensions.
  # Fail early with an explicit message rather than an opaque error raised
  # from inside cmdscale().
  n.topics <- nrow(phi)
  if (!is.null(n.topics) && n.topics < 3L) {
    stop("jsPCA() needs at least 3 topics (rows of phi) to compute a ",
         "2-dimensional layout; got ", n.topics, ".", call. = FALSE)
  }

  # first, we compute a pairwise distance between topic distributions
  # using a symmetric version of KL-divergence
  # http://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence
  jensenShannon <- function(x, y) {
    # Small epsilon added inside the logarithms so that log() never receives
    # an exact zero (e.g. when the midpoint underflows).
    eps <- 1e-10
    m <- 0.5 * (x + y)
    # One half of the divergence: sum-ready terms p * log(p / m), with
    # entries where p == 0 contributing exactly 0.
    klTerm <- function(p) {
      ifelse(p == 0, 0, p * (log(p + eps) - log(m + eps)))
    }
    0.5 * sum(klTerm(x)) + 0.5 * sum(klTerm(y))
  }
  dist.mat <- proxy::dist(x = phi, method = jensenShannon)

  # then, we reduce the K by K proximity matrix down to K by 2 using PCA
  pca.fit <- stats::cmdscale(dist.mat, k = 2)
  data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
}