Jing templates and fixtures

NIDAP-Community · Dec 5, 2022 · 08532bf · 08532bf
1 parent 9f16588
commit 08532bf
Show file tree

Hide file tree

Showing 15 changed files with 1,658 additions and 0 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,4 @@
 .nfs*
 *.png
 Rcheck.txt
+.Rproj.user
diff --git a/R/Color_by_Genes_Automatic.R b/R/Color_by_Genes_Automatic.R
@@ -0,0 +1,233 @@
+# This code comes from NIDAP 'Color by Genes Automatic [scRNA-seq][CCBR]' code template
+# Template Manager https://nidap.nih.gov/workspace/vector/templates/ri.vector.main.template.d71ed4e6-a25d-4f66-a186-27c00a50a703
+# Documentation https://nidap.nih.gov/workspace/notepad/view/ri.notepad.main.notepad.8101be48-85a5-40ec-8fa8-9fe28e5d31cb
+
+#' @title Create gene expression annotated dimension plots based on user inputted table
+#' @description Returns a panel of dimension plots colored by individual gene expression
+#' @details Takes in a list of genes inputted by the user, displays gene expression information on tsne, umap, or pca
+#' 
+#' @param SO Seurat-class object
+#' @param samples_to_include List of samples to subset the data by
+#' @param samples_to_display List of samples to depict on dimension plot, samples not in the list would be colored gray in the background
+#' @param marker_list Table of marker genes for each celltype (column names of the table), append "_prot" or "_neg" for proteins or negative markers
+#' @param cells_of_interest Vector of celltypes from geneset_dataframe to screen for
+#' @param protein_presence Set to TRUE if there are CITE-seq markers in geneset_dataframe
+#' @param assay Name of the assay to extract gene expression data from
+#' @param reduction_type Choose among tsne, umap, and pca
+#' @param point_transparency Set to lower value for more see through points on dimension plot
+#' @param point_shape Change the shape of points for between visualization
+#' @param number_of_rows Set the number of rows to be displayed on the compiled gene expression dimension plot
+#' @param doCiteSeq Set to TRUE if using Cite-seq data
+#' 
+#' @import Seurat 
+#' @import tidyverse
+#' @import gridExtra
+#' @import ggpubr
+#' @import ggplot2
+#'   
+#' @export
+
+#' @return compiled dimension plots of markers, in the same layout as the user-inputted marker table
+
+color_by_genes <- function(SO, 
+                             samples_to_include,
+                             samples_to_display,
+                             marker_list,
+                             cells_of_interest,
+                             protein_presence = FALSE,
+                             assay = "SCT",
+                             reduction_type = "umap",
+                             point_transparency = 0.5,
+                             point_shape = 16,
+                             number_of_rows = 0,
+                             doCiteSeq = FALSE){
+
+  ## --------- ##
+  ## Functions ##
+  ## --------- ##
+
+  plotMarkers <- function(Markers){
+    if (is.na(Markers) == TRUE) {
+      g <- ggplot() + theme_void()
+      return(g)
+    } else {
+      Markers.mat=SO.sub[[assay]]@scale.data[Markers,]
+      Markers.quant=quantile(Markers.mat[Markers.mat>1],probs=c(.1,.5,.90))
+      Markers.mat[Markers.mat>Markers.quant[3]]=Markers.quant[3]
+      Markers.mat[Markers.mat<Markers.quant[1]]=0
+
+      if (!(doCiteSeq)) {
+        if(reduction_type == "tsne"){
+          p1 <- DimPlot(SO.sub, reduction = "tsne", group.by = "ident")
+          clusmat=data.frame(umap1=p1$data$tSNE_1,umap2=p1$data$tSNE_2, Markers=Markers.mat, ident = as.factor(p1$data$ident))
+        }
+        else if(reduction_type == "umap"){
+          p1 <- DimPlot(SO.sub, reduction = "umap", group.by = "ident")
+          clusmat=data.frame(umap1=p1$data$UMAP_1,umap2=p1$data$UMAP_2, Markers=Markers.mat,ident = as.factor(p1$data$ident))
+        }
+        else{
+          p1 <- DimPlot(SO.sub, reduction = "pca", group.by = "ident")
+          clusmat=data.frame(umap1=p1$data$PC_1,umap2=p1$data$PC_2, Markers=Markers.mat,ident = as.factor(p1$data$ident))
+        } #if CITEseq is chosen then:
+      } else {
+        if(reduction_type == "tsne"){
+          p1 <- DimPlot(SO.sub, reduction = "protein_tsne", group.by = "ident")
+          clusmat=data.frame(umap1=p1$data$protein_tsne_1,umap2=p1$data$protein_tsne_2, Markers=Markers.mat,ident = as.factor(p1$data$ident))
+        }
+        else if(reduction_type == "umap"){
+          p1 <- DimPlot(SO.sub, reduction = "protein_umap", group.by = "ident")
+          clusmat=data.frame(umap1=p1$data$protein_umap_1,umap2=p1$data$protein_umap_2, Markers=Markers.mat,ident = as.factor(p1$data$ident))
+        }
+        else{
+          p1 <- DimPlot(SO.sub, reduction = "protein_pca", group.by = "ident")
+          clusmat=data.frame(umap1=p1$data$protein_pca_1,umap2=p1$data$protein_pca_2, Markers=Markers.mat,ident = as.factor(p1$data$ident))
+        }
+      }
+
+      # Samples caption
+      samples_displayed_message <- paste(samples_to_display, sep = "", collapse = "\n")
+      final_caption <- paste("Samples Displayed: ", samples_displayed_message, sep = "", collapse = "\n")
+
+      clusmat <- mutate(clusmat, sample_Markers = clusmat$Markers * grepl(paste(samples_to_display, collapse = "|"), clusmat$ident))
+
+      clusmat %>% dplyr::arrange(sample_Markers) -> clusmat
+      if (grepl("_neg",Markers) == TRUE){
+
+        clusmat %>% dplyr::arrange(desc(sample_Markers)) -> clusmat
+        g <- ggplot(clusmat, aes(x=umap1, y=umap2, group=ident)) +
+          theme_bw() +
+          theme(legend.title=element_blank()) +
+          ggtitle(Markers) +
+          geom_point(aes(color=sample_Markers, shape=ident),alpha=point_transparency,shape=point_shape, size=1) +
+          theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
+                panel.background = element_blank(),legend.text=element_text(size=rel(0.5)) )+
+          scale_color_gradient(limits = c(0, Markers.quant[3]),low = "lightgrey", high = "red") +
+          xlab("umap-1") + ylab("umap-2")
+        return(g)
+      } else {
+        clusmat %>% dplyr::arrange(sample_Markers) -> clusmat
+        g <- ggplot(clusmat, aes(x=umap1, y=umap2, group=ident)) +
+          theme_bw() +
+          theme(legend.title=element_blank()) +
+          ggtitle(Markers) +
+          geom_point(aes(color=sample_Markers, shape=ident),alpha=point_transparency,shape=point_shape, size=1) +
+          theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
+                panel.background = element_blank(),legend.text=element_text(size=rel(0.5)) )+
+          scale_color_gradient(limits = c(0, Markers.quant[3]),low = "lightgrey", high = "red") +
+          xlab("umap-1") + ylab("umap-2")
+        return(g)}
+    }
+  }
+
+  ## --------------- ##
+  ## Main Code Block ##
+  ## --------------- ##
+
+  if (length(samples_to_include) == 0) {
+    samples_to_include = unique(SO@meta.data$sample_name)
+  }
+
+  if("active.ident" %in% slotNames(SO)){
+    sample_name = as.factor(SO@meta.data$orig.ident)
+    names(sample_name)=names(SO@active.ident)
+    SO@active.ident <- as.factor(vector())
+    SO@active.ident <- sample_name
+    SO.sub = subset(SO, ident = samples_to_include)
+  } else {
+    sample_name = as.factor(SO@meta.data$orig.ident)
+    names(sample_name)=names(SO@active.ident)
+    SO@active.ident <- as.factor(vector())
+    SO@active.ident <- sample_name
+    SO.sub = subset(SO, ident = samples_to_include)
+  }
+
+  marker_list <- marker_list[cells_of_interest]
+
+  # Remove columns with all missing values
+  Present_Markers_ls <- list()
+
+  for (celltype in colnames(marker_list)) {
+    print(names(marker_list[celltype]))
+    present=lapply(marker_list[[celltype]], function(x) x %in% rownames(SO.sub$SCT@scale.data)) 
+    absentgenes = unlist(marker_list[[celltype]])[present==FALSE]
+    presentgenes = unlist(marker_list[[celltype]])[present==TRUE]
+    print(paste0("Genes not present: ",paste0(absentgenes,collapse=",")))
+    print(paste0("Genes present: ",paste0(presentgenes,collapse=",")))
+
+    if(length(presentgenes) == 0){
+      print(paste0(names(marker_list[celltype]), " genes were not found in SO and will not be analyzed"))
+    } else {Present_Markers_ls[[celltype]] <- presentgenes}
+  }
+
+  # Padd processed list containing only the present genes
+  padded_list <- lapply(Present_Markers_ls, `length<-`, max(lengths(Present_Markers_ls)))
+  Markers_from_list <- as.data.frame.matrix(do.call(cbind, padded_list))
+
+  # Recognize any markers designated as proteins and insert protein expression data into slot where plots are created
+  if (protein_presence){
+    protein_markers <- Markers_from_list[grepl("_prot",Markers_from_list)]
+
+    protein_orig_markers <- gsub("_prot.*","",protein_markers)
+
+    protein_markers_name <- paste(protein_orig_markers,
+                                  "_prot", sep = "")
+
+    i = 0
+    protein_array <- list()
+    for (i in seq_along(protein_orig_markers)){
+      protein_array[[i]] <- SO.sub@assays$Protein[protein_orig_markers[i],]
+      rownames(protein_array[[i]]) <- protein_markers_name[i]
+    }
+    protein_array_comp <- do.call(rbind,protein_array)
+    SO.sub@assays$SCT@scale.data <- rbind(SO.sub@assays$SCT@scale.data,protein_array_comp)
+  }
+
+  # Add negative/low identifiers
+  neg_markers_names <- Markers_from_list[grepl("_neg",Markers_from_list)]
+  orig_markers <- gsub("_neg.*","",neg_markers_names)
+
+  # Append neg_markers_names to rownames of SO.sub
+  neg_markers_list <- list()
+
+  # Calculate adjusted expression for negative markers
+  for (i in seq_along(orig_markers)){
+    if (orig_markers[i] %in% rownames(SO.sub@assays$SCT@scale.data)){
+      # Format the data so that it can rbinded with [email protected]
+      neg_markers_list[[i]] <- t(matrix(max(SO.sub@assays$SCT@scale.data[orig_markers[i],]) - SO.sub@assays$SCT@scale.data[orig_markers[i],]))
+      row.names(neg_markers_list[[i]]) <- neg_markers_names[i]
+      colnames(neg_markers_list[[i]]) <- colnames(SO.sub@assays$SCT@scale.data)
+
+      # Append new Negative/low marker (w Expression Count) to SO slot
+      SO.sub@assays$SCT@scale.data <- rbind(SO.sub@assays$SCT@scale.data, neg_markers_list[[i]]) 
+    } else {
+      print(paste(orig_markers[i], " is not found in Seurat, cannot calculate negative expression", sep = ""))
+    }} 
+
+  Markers_present = unlist(Markers_from_list)
+
+  if(!length(Markers_present)>0){
+    print("No Markers found in dataset")
+    return(NULL)
+  }
+
+  # Create list for storing color by gene plots of each celltype column
+  gg_storage <- list()
+
+  for (cell in colnames(Markers_from_list)){
+
+    title <- cell
+
+    markers_to_analyze <- as.character(Markers_from_list[[cell]])
+
+    grob <- lapply(markers_to_analyze,function(x) plotMarkers(x))
+    gg_storage[[cell]] <- gridExtra::arrangeGrob(grobs=grob,ncol = 1,newpage=F, as.table = F, top = text_grob(title,size = 15, face = "bold"))
+
+  }
+
+  #pdf("Color_by_Genes.pdf", 12, 15)
+  final_figures <- do.call(grid.arrange, c(gg_storage, ncol = ncol(Markers_from_list)))
+  #dev.off()
+
+  return(final_figures)
+}
+
diff --git a/R/Harmony_Batch_Correction.R b/R/Harmony_Batch_Correction.R
@@ -0,0 +1,141 @@
+# This code comes from NIDAP 'Harmony Batch Correction [scRNA-seq][CCBR]' code template
+# Template Manager https://nidap.nih.gov/workspace/vector/templates/ri.vector.main.template.a097334e-21ac-466c-b467-c761b2c25f02
+# Documentation https://nidap.nih.gov/workspace/notepad/view/ri.notepad.main.notepad.facd9ed0-90fc-4c55-bbb8-0153f06858d4
+
+#' @title Performs batch correction on Seurat-class object
+#' @description Returns a Seurat-class object with adjusted cell embeddings and gene expression data to account for variations in sample collection
+#' @details Takes in a list of genes inputted by the user, displays gene expression information in particular slot-assay with (optional) outliers removed
+#' 
+#' @param seurat_object Seurat-class object
+#' @param variable_features Number of most variable genes to subset the gene expression data by
+#' @param genes_to_add Add genes that might not be found among variably expressed genes
+#' @param group.by.vars which variable should be accountted for when running batch correction
+
+#' @import Seurat 
+#' @import harmony
+#' @import gridExtra
+#' @import RColorBrewer
+#' @import ggplot2
+#'   
+#' @export
+
+#' @return Seurat-class Object with Harmony-adjusted gene expression (SCT slot) and tSNE cell embedding
+
+
+harmony_batch_correct <- function(so, 
+                                  variable_features = 2000, 
+                                  genes_to_add,
+                                  group.by.vars) {
+
+  ## --------------- ##
+  ## Main Code Block ##
+  ## --------------- ##
+
+  seur.SCT <- so@assays$SCT@scale.data
+  seur.SCT[1:10, 1:10]
+
+  genes_of_interest <- genes_to_add[genes_to_add %in% rownames(so@assays$SCT@scale.data)]
+
+  # Add more genes to analyze
+  VariableFeatures(so) <- c(VariableFeatures(so), genes_of_interest) 
+  mvf <- VariableFeatures(so)[1:(variable_features + length(genes_of_interest))] 
+  print(dim(so@assays$SCT@scale.data))
+  seur.SCT <- seur.SCT[mvf,]
+  seur.SCT <- t(seur.SCT) 
+  # need the original loadings and embeddings to compare with my SVD
+  seur.loads <- so@reductions$pca@feature.loadings
+  seur.pca <- so@reductions$pca@cell.embeddings
+  sm.pca <- seur.pca[1:10, 1:10]
+  sm.seur.ldngs <- seur.loads[1:10, 1:10]
+
+  # SVD on scaled counts
+
+  variable_features <- length(mvf)
+  Sys.time()
+  pppca <- svd(seur.SCT) 
+  Sys.time() 
+  ppembed <- pppca$u %*% diag(pppca$d)
+  pcnames <- vector(mode = "character")
+  for (i in 1:dim(ppembed)[2])pcnames[i] <- paste("PC", i, sep = "_")
+  colnames(ppembed) <- pcnames
+  rownames(ppembed) <- rownames(seur.SCT)
+  sm.ppembed <- ppembed[1:10, 1:10]
+  ppldngs <- pppca$v
+  colnames(ppldngs) <- pcnames
+  rownames(ppldngs) <- mvf 
+  sm.ppldngs <- ppldngs[1:10, 1:10]
+  ppembed[1:10, 1:10]
+  ppldngs[1:10, 1:10]
+
+  so@reductions$pca@cell.embeddings <- ppembed
+  so@reductions$pca@feature.loadings <- ppldngs
+  so@reductions$pca@stdev <- pppca$d
+  print(Sys.time())
+
+  so <- RunHarmony(so, group.by.vars,
+                   do_pca=FALSE,
+                   assay.use = "SCT",
+                   plot_convergence = TRUE,
+                   return_object=TRUE)
+
+  head(so@reductions$harmony@cell.embeddings)
+  head(so@reductions$harmony@feature.loadings)
+
+  so <- RunUMAP(so, reduction = "harmony",dims=1:4)
+  so <- RunTSNE(so, reduction = "harmony",dims=1:4)
+
+  sdat <- data.frame(as.vector(so@reductions$tsne@cell.embeddings[,1]),
+                     as.vector(so@reductions$tsne@cell.embeddings[,2]),
+                     so@meta.data[eval(parse(text = "group.by.vars"))])
+  names(sdat) <- c("TSNE1","TSNE2","ident")
+
+  n <- 2e3
+  set.seed(10)
+  ourColorSpace <- colorspace::RGB(runif(n), runif(n), runif(n))
+  ourColorSpace <- as(ourColorSpace, "LAB")
+
+
+  distinctColorPalette <-function(k=1) {
+    currentColorSpace <- ourColorSpace@coords
+    # Set iter.max to 20 to avoid convergence warnings.
+    set.seed(1)
+    km <- kmeans(currentColorSpace, k, iter.max=20)
+    colors <- unname(hex(LAB(km$centers)))
+    return(colors)
+  }   
+
+  n <- 60
+  qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
+  qual_col_pals = qual_col_pals[c(7,6,2,1,8,3,4,5),]
+  cols = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
+
+  g1 <- ggplot(sdat, aes(x=TSNE1, y=TSNE2)) +
+    theme_bw() +
+    theme(legend.title=element_blank()) +
+    geom_point(aes(colour=ident),size=1) +
+    scale_color_manual(values=cols) +
+    theme(panel.grid.major = element_blank(), 
+          panel.grid.minor = element_blank(), 
+          legend.position="top",
+          panel.background = element_blank(),
+          legend.text=element_text(size=rel(0.5))) +
+    guides(colour = guide_legend(override.aes = list(size=5, alpha = 1))) 
+
+  solist <- list(so, ppldngs)
+
+  # Calculate adjusted gene expression from embeddings
+  so <- solist[[1]]
+  ppldngs <- solist[[2]]
+  seur.SCT <- t(so@assays$SCT@scale.data)
+  harm.embeds <- so@reductions$harmony@cell.embeddings
+  #print(dim(harm.embeds))
+  harm.lvl.backcalc <- harm.embeds %*% t(ppldngs)
+
+  sm.harmexpr <- harm.lvl.backcalc[1:10, 1:4]
+  sm.comp.scld <- (seur.SCT[rownames(sm.harmexpr), colnames(sm.harmexpr)])
+
+  # Convert to ggplot that includes color by samples for higher resolution of batch correction, use for loop to screen for different genes
+  so@assays$SCT@scale.data <- t(harm.lvl.backcalc)
+
+  return(so)
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,3 +2,4 @@ @@
     .nfs*
     *.png
     Rcheck.txt
+    .Rproj.user