Merge branch 'messyForest' into Multiplex_Major_Patch

GoekeLab · Oct 25, 2024 · 657e2b5 · 657e2b5
2 parents 77cae13 + 773d88c
commit 657e2b5
Show file tree

Hide file tree

Showing 14 changed files with 943 additions and 280 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -86,7 +86,8 @@ Imports:
     Rsamtools,
     methods,
     Rcpp,
-    xgboost
+    xgboost,
+    Matrix
 VignetteBuilder: 
     knitr
 LazyData: true
diff --git a/NAMESPACE b/NAMESPACE
@@ -78,4 +78,5 @@ import(SummarizedExperiment)
 import(S4Vectors, except=c(rename, setequal, setdiff, intersect,cor))
 useDynLib(bambu, .registration = TRUE)
 import(xgboost)
+import(Matrix)
 import(BSgenome)
diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R
@@ -0,0 +1,83 @@
+#' Create equivilence classes and assign to transcripts
+#' @inheritParams bambu
+#' @import data.table
+#' @noRd
+assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParameters, verbose, demultiplexed, spatial) {
+    metadata(readClassList)$readClassDist <- calculateDistTable(readClassList, annotations, isoreParameters, verbose)
+    readClassList = splitReadClassFiles(readClassList)
+    readClassDt <- genEquiRCs(metadata(readClassList)$readClassDist, annotations, verbose) 
+    readClassDt$eqClass.match = match(readClassDt$eqClassById,metadata(readClassList)$eqClassById)
+    readClassDt <- simplifyNames(readClassDt)
+    readClassDt = readClassDt %>% group_by(eqClassId, gene_sid) %>% 
+        mutate(multi_align = length(unique(txid))>1) %>% ungroup() %>% mutate(aval = 1) %>%
+        data.table()
+
+    #return non-em counts
+    ColData = generateColData(colnames(metadata(readClassList)$countMatrix), clusters = NULL, demultiplexed, spatial)
+    quantData <- SummarizedExperiment(assays = SimpleList(
+        counts = generateUniqueCounts(readClassDt, metadata(readClassList)$countMatrix, annotations)),
+        rowRanges = annotations,
+        colData = ColData)
+    colnames(quantData) = ColData$id
+    if(sum(metadata(readClassList)$incompatibleCountMatrix)==0){
+        metadata(quantData)$incompatibleCounts = NULL
+    } else{
+        metadata(quantData)$incompatibleCounts = generateIncompatibleCounts(metadata(readClassList)$incompatibleCountMatrix, annotations)       
+    }
+    metadata(quantData)$nonuniqueCounts = generateNonUniqueCounts(readClassDt, metadata(readClassList)$countMatrix, annotations)
+    metadata(quantData)$readClassDt = readClassDt
+    metadata(quantData)$countMatrix = metadata(readClassList)$countMatrix
+    metadata(quantData)$incompatibleCountMatrix  = metadata(readClassList)$incompatibleCountMatrix 
+    metadata(quantData)$sampleNames = metadata(readClassList)$sampleNames 
+    return(quantData)     
+
+}
+
+
+generateUniqueCounts <- function(readClassDt, countMatrix, annotations){
+    x = readClassDt %>% filter(!multi_align & !is.na(eqClass.match))
+    uniqueCounts = countMatrix[x$eqClass.match,]
+    uniqueCounts.tx = sparse.model.matrix(~ factor(x$txid) - 1)
+    uniqueCounts = t(uniqueCounts.tx) %*% uniqueCounts
+    rownames(uniqueCounts) = names(annotations)[match(as.numeric(levels(factor(x$txid))),mcols(annotations)$txid)]
+    counts = sparseMatrix(length(annotations), ncol(uniqueCounts), x = 0)
+    rownames(counts) = names(annotations)
+    counts[rownames(uniqueCounts),] = uniqueCounts
+    return(counts)
+
+    counts.total = colSums(countMatrix) + colSums(incompatibleCountMatrix)
+    counts.total[counts.total==0] = 1
+    counts.CPM = counts/counts.total * 10^6
+
+}
+
+generateIncompatibleCounts <- function(incompatibleCountMatrix, annotations){
+    genes = levels(factor(unique(mcols(annotations)$GENEID)))
+    rownames(incompatibleCountMatrix) = genes[as.numeric(rownames(incompatibleCountMatrix))]
+    geneMat = sparseMatrix(length(genes), ncol(incompatibleCountMatrix), x = 0)
+    rownames(geneMat) = genes
+    geneMat[rownames(incompatibleCountMatrix),] = incompatibleCountMatrix
+    return(geneMat)
+}
+
+generateNonUniqueCounts <- function(readClassDt, countMatrix, annotations){
+    #fuse multi align RCs by gene
+    x = readClassDt %>% filter(multi_align & !is.na(eqClass.match))
+    x = x %>% distinct(eqClassId, .keep_all = TRUE)
+    nonuniqueCounts = countMatrix[x$eqClass.match,, drop = FALSE]
+    if(nrow(x)>1){
+    nonuniqueCounts.gene = sparse.model.matrix(~ factor(x$gene_sid) - 1)
+    nonuniqueCounts = t(nonuniqueCounts.gene) %*% nonuniqueCounts
+    }
+    #covert ids into gene ids
+    geneids = as.numeric(levels(factor(x$gene_sid)))
+    geneids = x$txid[match(geneids, x$gene_sid)]
+    geneids = mcols(annotations)$GENEID[as.numeric(geneids)]
+    rownames(nonuniqueCounts) = geneids
+    #create matrix for all annotated genes
+    genes = levels(factor(unique(mcols(annotations)$GENEID)))
+    geneMat = sparseMatrix(length(genes), ncol(nonuniqueCounts), x = 0)
+    rownames(geneMat) = genes
+    geneMat[rownames(nonuniqueCounts),] = nonuniqueCounts
+    return(geneMat)
+}
diff --git a/R/bambu-extendAnnotations-utilityExtend.R b/R/bambu-extendAnnotations-utilityExtend.R
@@ -67,7 +67,8 @@ filterTranscripts <- function(combinedTranscripts, min.sampleNumber){
         combinedTranscripts$NSampleTxScore >= min.sampleNumber) & (
         combinedTranscripts$NSampleReadProp >= min.sampleNumber)
   }
-  combinedTranscripts = combinedTranscripts[filterSet,]
+  #combinedTranscripts = combinedTranscripts[filterSet,]
+  combinedTranscripts$maxTxScore[!filterSet] = 0
   return(combinedTranscripts)
 }
 
@@ -95,8 +96,8 @@ filterTranscriptsByAnnotation <- function(rowDataCombined, annotationGrangesList
     subsetTranscripts <- combindRowDataWithRanges(
         rowDataCombined[!notCompatibleIds,], 
         exonRangesCombined[!notCompatibleIds])
-    exonRangesCombined <- exonRangesCombined[notCompatibleIds]
-    rowDataCombined <- rowDataCombined[notCompatibleIds,]
+    rowDataCombined$maxTxScore[grepl("compatible", rowDataCombined$readClassType) &
+        rowDataCombined$readClassType != "equal:compatible"]=0
   }
   #(2) remove transcripts below NDR threshold/identical junctions to annotations
   rowDataCombined = calculateNDROnTranscripts(rowDataCombined, 
@@ -122,7 +123,7 @@ filterTranscriptsByAnnotation <- function(rowDataCombined, annotationGrangesList
     exonRangesCombined <- exonRangesCombined[filterSet]
     rowDataCombined <- rowDataCombined[filterSet,]
   }
-  if(sum(filterSet==0) & length(annotationGrangesList)==0) stop(
+  if(sum(filterSet)==0 & length(annotationGrangesList)==0) stop(
     "WARNING - No annotations were provided. Please increase NDR threshold to use novel transcripts")
   if(sum(filterSet)==0) message("WARNING - No novel transcripts meet the given thresholds. Try a higher NDR.")
   # (3) combine novel transcripts with annotations
@@ -218,6 +219,7 @@ calculateNDROnTranscripts <- function(combinedTranscripts, useTxScore = FALSE){
             "for NDR precision stabilization.")
           message("NDR will be approximated as: (1 - Transcript Model Prediction Score)")
     } else combinedTranscripts$NDR = calculateNDR(combinedTranscripts$maxTxScore, equal)
+    combinedTranscripts$NDR[combinedTranscripts$maxTxScore==0] = 1
     return(combinedTranscripts)
 }
 
@@ -750,10 +752,11 @@ isore.estimateDistanceToAnnotations <- function(seReadClass,
                                          primarySecondaryDistStartEnd = min.primarySecondaryDistStartEnd,
                                          ignore.strand = FALSE)
   distTable$readCount <- assays(seReadClass)$counts[distTable$readClassId, ] 
-  if (additionalFiltering) 
-    distTable <- left_join(distTable, select(readClassTable,
-                                             readClassId, confidenceType), by = "readClassId") %>%
-    mutate(relativeReadCount = readCount / txNumberFiltered)
+#   if (additionalFiltering) 
+#     distTable <- left_join(distTable, select(readClassTable,
+#                                              readClassId, confidenceType), by = "readClassId") %>%
+#     mutate(relativeReadCount = readCount / txNumberFiltered)
+
   distTable <- dplyr::select(distTable, annotationTxId, txid, readClassId,
       readCount, compatible, equal,dist)
   distTable <- left_join(distTable, as_tibble(mcols(annotationGrangesList)[, c("txid", "GENEID")]),