From 5cf8a5f3bffdd2eaf7ee9eefe541ffda74334f14 Mon Sep 17 00:00:00 2001
From: Jesse Powell
Date: Wed, 14 Jun 2023 16:37:13 -0400
Subject: [PATCH 1/2] Refactor Jensen-Shannon divergence calculation to handle
 small or zero probabilities

Adjusted the Jensen-Shannon divergence calculation in the jensenShannon
function to address potential issues with small or zero probabilities.
Introduced a small epsilon value (eps = 1e-10) to prevent taking the
logarithm of zero and division by zero. This adjustment ensures stability
and avoids errors in cases where probabilities are close to zero.
---
 R/createJSON.R | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/R/createJSON.R b/R/createJSON.R
index 33f77e4..3b4363e 100644
--- a/R/createJSON.R
+++ b/R/createJSON.R
@@ -299,13 +299,14 @@ jsPCA <- function(phi) {
   # using a symmetric version of KL-divergence
   # http://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence
   jensenShannon <- function(x, y) {
-    m <- 0.5 * (x + y)
-    lhs <- ifelse(x == 0, 0, x * (log(x) - log(m)))
-    rhs <- ifelse(y == 0, 0, y * (log(y) - log(m)))
-    0.5 * sum(lhs) + 0.5 * sum(rhs)
-  }
+    eps <- 1e-10 # Small epsilon value
+    m <- 0.5 * (x + y)
+    lhs <- ifelse(x == 0, 0, x * (log(x + eps) - log(m + eps)))
+    rhs <- ifelse(y == 0, 0, y * (log(y + eps) - log(m + eps)))
+    0.5 * sum(lhs) + 0.5 * sum(rhs)
+}
   dist.mat <- proxy::dist(x = phi, method = jensenShannon)
   # then, we reduce the K by K proximity matrix down to K by 2 using PCA
   pca.fit <- stats::cmdscale(dist.mat, k = 2)
   data.frame(x = pca.fit[,1], y = pca.fit[,2])
-}
+

From 2c69cdbe7941f353c092a5c4c82a0166781d6dbf Mon Sep 17 00:00:00 2001
From: Jesse Powell
Date: Wed, 14 Jun 2023 16:42:02 -0400
Subject: [PATCH 2/2] added missing bracket

added missing curly brace.
---
 R/createJSON.R | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/R/createJSON.R b/R/createJSON.R
index 3b4363e..ed470aa 100644
--- a/R/createJSON.R
+++ b/R/createJSON.R
@@ -294,6 +294,7 @@ createJSON <- function(phi = matrix(), theta = matrix(), doc.length = integer(),
 #' many columns as there are terms in the vocabulary.
 #'
 #' @export
+
 jsPCA <- function(phi) {
   # first, we compute a pairwise distance between topic distributions
   # using a symmetric version of KL-divergence
@@ -309,4 +310,6 @@
   # then, we reduce the K by K proximity matrix down to K by 2 using PCA
   pca.fit <- stats::cmdscale(dist.mat, k = 2)
   data.frame(x = pca.fit[,1], y = pca.fit[,2])
+}
+
 
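For anyone sanity-checking the series locally, here is a minimal sketch (not part of either patch) that exercises the patched jsPCA() on a small made-up topic-term matrix containing exact zeros, the situation the eps guard in jensenShannon() is meant to keep numerically stable. It assumes the LDAvis source with both patches applied and the proxy package installed; the phi values below are illustrative only.

# Toy topic-term matrix: 4 topics over 4 terms, rows sum to 1, with zero entries.
phi <- rbind(
  c(0.50, 0.50, 0.00, 0.00),
  c(0.00, 0.20, 0.30, 0.50),
  c(0.25, 0.25, 0.25, 0.25),
  c(0.10, 0.00, 0.80, 0.10)
)

# With the eps term, log() never sees an exact zero inside jensenShannon(),
# so the pairwise distances and the 2-D coordinates returned by jsPCA()
# stay finite.
coords <- jsPCA(phi)
print(coords)  # a 4-row data.frame with columns x and y
stopifnot(all(is.finite(as.matrix(coords))))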