From 6467636e952859d18dfe8bb3a924368a845af43e Mon Sep 17 00:00:00 2001
From: Kate Isaac <41767733+kweav@users.noreply.github.com>
Date: Fri, 22 Mar 2024 09:57:42 -0400
Subject: [PATCH] fix cpm calculations

Using `apply(counts, 2, function(x) (x/new_data$counts_per_sample)*1e6)` meant that every calculation was using the first sample count from `new_data$counts_per_sample` as the divisor. When this code was compared to the original, the use of `new_data$counts_per_sample` led to only 34641 of the 165850 values matching between the two repos. Replacing the pre-calculated sums with `sum(x)` brought them into alignment.

For the log2 cpm calculation, `log2(new_data$cpm +1)` leads to all zeros because there is no data in `new_data$cpm`. Replacing it with `new_data$transformed_data$cpm` points to the correct information and fixes this problem.

These changes were originally made on my qc branch in early February but were buried under the rest of the changes there, so I am suggesting them here separately so that they can be incorporated now.
---
 R/00-setup_data.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/00-setup_data.R b/R/00-setup_data.R
index a8e2f81..c556888 100644
--- a/R/00-setup_data.R
+++ b/R/00-setup_data.R
@@ -59,8 +59,8 @@ setup_data <- function(counts = NULL, pg_metadata = NULL, sample_metadata = NULL
 
   # Transform the data
   new_data$transformed_data$count_norm <- apply(counts, 2, function(x) -log10((x+1)/sum(x)))
-  new_data$transformed_data$cpm <- apply(counts, 2, function(x) (x/new_data$counts_per_sample)*1e6)
-  new_data$transformed_data$log2_cpm <- log2(new_data$cpm +1)
+  new_data$transformed_data$cpm <- apply(counts, 2, function(x) (x/sum(x))*1e6)
+  new_data$transformed_data$log2_cpm <- log2(new_data$transformed_data$cpm +1)
 
   return(new_data)
 }