Skip to content

Commit

Permalink
Jing templates and fixtures
Browse files Browse the repository at this point in the history
  • Loading branch information
bianjh-cloud committed Dec 5, 2022
1 parent 9f16588 commit 08532bf
Show file tree
Hide file tree
Showing 15 changed files with 1,658 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
.nfs*
*.png
Rcheck.txt
.Rproj.user
233 changes: 233 additions & 0 deletions R/Color_by_Genes_Automatic.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
# This code comes from NIDAP 'Color by Genes Automatic [scRNA-seq][CCBR]' code template
# Template Manager https://nidap.nih.gov/workspace/vector/templates/ri.vector.main.template.d71ed4e6-a25d-4f66-a186-27c00a50a703
# Documentation https://nidap.nih.gov/workspace/notepad/view/ri.notepad.main.notepad.8101be48-85a5-40ec-8fa8-9fe28e5d31cb

#' @title Create gene expression annotated dimension plots based on user inputted table
#' @description Returns a panel of dimension plots colored by individual gene expression
#' @details Takes in a list of genes inputted by the user, displays gene expression information on tsne, umap, or pca
#'
#' @param SO Seurat-class object
#' @param samples_to_include List of samples to subset the data by
#' @param samples_to_display List of samples to depict on dimension plot, samples not in the list would be colored gray in the background
#' @param marker_list Table of marker genes for each celltype (column names of the table), append "_prot" or "_neg" for proteins or negative markers
#' @param cells_of_interest Vector of celltypes from geneset_dataframe to screen for
#' @param protein_presence Set to TRUE if there are CITE-seq markers in geneset_dataframe
#' @param assay Name of the assay to extract gene expression data from
#' @param reduction_type Choose among tsne, umap, and pca
#' @param point_transparency Set to lower value for more see through points on dimension plot
#' @param point_shape Change the shape of points for between visualization
#' @param number_of_rows Set the number of rows to be displayed on the compiled gene expression dimension plot
#' @param doCiteSeq Set to TRUE if using Cite-seq data
#'
#' @import Seurat
#' @import tidyverse
#' @import gridExtra
#' @import ggpubr
#' @import ggplot2
#'
#' @export

#' @return compiled dimension plots of markers, in the same layout as the user-inputted marker table

color_by_genes <- function(SO,
samples_to_include,
samples_to_display,
marker_list,
cells_of_interest,
protein_presence = FALSE,
assay = "SCT",
reduction_type = "umap",
point_transparency = 0.5,
point_shape = 16,
number_of_rows = 0,
doCiteSeq = FALSE){

## --------- ##
## Functions ##
## --------- ##

plotMarkers <- function(Markers){
if (is.na(Markers) == TRUE) {
g <- ggplot() + theme_void()
return(g)
} else {
Markers.mat=SO.sub[[assay]]@scale.data[Markers,]
Markers.quant=quantile(Markers.mat[Markers.mat>1],probs=c(.1,.5,.90))
Markers.mat[Markers.mat>Markers.quant[3]]=Markers.quant[3]
Markers.mat[Markers.mat<Markers.quant[1]]=0

if (!(doCiteSeq)) {
if(reduction_type == "tsne"){
p1 <- DimPlot(SO.sub, reduction = "tsne", group.by = "ident")
clusmat=data.frame(umap1=p1$data$tSNE_1,umap2=p1$data$tSNE_2, Markers=Markers.mat, ident = as.factor(p1$data$ident))
}
else if(reduction_type == "umap"){
p1 <- DimPlot(SO.sub, reduction = "umap", group.by = "ident")
clusmat=data.frame(umap1=p1$data$UMAP_1,umap2=p1$data$UMAP_2, Markers=Markers.mat,ident = as.factor(p1$data$ident))
}
else{
p1 <- DimPlot(SO.sub, reduction = "pca", group.by = "ident")
clusmat=data.frame(umap1=p1$data$PC_1,umap2=p1$data$PC_2, Markers=Markers.mat,ident = as.factor(p1$data$ident))
} #if CITEseq is chosen then:
} else {
if(reduction_type == "tsne"){
p1 <- DimPlot(SO.sub, reduction = "protein_tsne", group.by = "ident")
clusmat=data.frame(umap1=p1$data$protein_tsne_1,umap2=p1$data$protein_tsne_2, Markers=Markers.mat,ident = as.factor(p1$data$ident))
}
else if(reduction_type == "umap"){
p1 <- DimPlot(SO.sub, reduction = "protein_umap", group.by = "ident")
clusmat=data.frame(umap1=p1$data$protein_umap_1,umap2=p1$data$protein_umap_2, Markers=Markers.mat,ident = as.factor(p1$data$ident))
}
else{
p1 <- DimPlot(SO.sub, reduction = "protein_pca", group.by = "ident")
clusmat=data.frame(umap1=p1$data$protein_pca_1,umap2=p1$data$protein_pca_2, Markers=Markers.mat,ident = as.factor(p1$data$ident))
}
}

# Samples caption
samples_displayed_message <- paste(samples_to_display, sep = "", collapse = "\n")
final_caption <- paste("Samples Displayed: ", samples_displayed_message, sep = "", collapse = "\n")

clusmat <- mutate(clusmat, sample_Markers = clusmat$Markers * grepl(paste(samples_to_display, collapse = "|"), clusmat$ident))

clusmat %>% dplyr::arrange(sample_Markers) -> clusmat
if (grepl("_neg",Markers) == TRUE){

clusmat %>% dplyr::arrange(desc(sample_Markers)) -> clusmat
g <- ggplot(clusmat, aes(x=umap1, y=umap2, group=ident)) +
theme_bw() +
theme(legend.title=element_blank()) +
ggtitle(Markers) +
geom_point(aes(color=sample_Markers, shape=ident),alpha=point_transparency,shape=point_shape, size=1) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(),legend.text=element_text(size=rel(0.5)) )+
scale_color_gradient(limits = c(0, Markers.quant[3]),low = "lightgrey", high = "red") +
xlab("umap-1") + ylab("umap-2")
return(g)
} else {
clusmat %>% dplyr::arrange(sample_Markers) -> clusmat
g <- ggplot(clusmat, aes(x=umap1, y=umap2, group=ident)) +
theme_bw() +
theme(legend.title=element_blank()) +
ggtitle(Markers) +
geom_point(aes(color=sample_Markers, shape=ident),alpha=point_transparency,shape=point_shape, size=1) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(),legend.text=element_text(size=rel(0.5)) )+
scale_color_gradient(limits = c(0, Markers.quant[3]),low = "lightgrey", high = "red") +
xlab("umap-1") + ylab("umap-2")
return(g)}
}
}

## --------------- ##
## Main Code Block ##
## --------------- ##

if (length(samples_to_include) == 0) {
samples_to_include = unique(SO@meta.data$sample_name)
}

if("active.ident" %in% slotNames(SO)){
sample_name = as.factor(SO@meta.data$orig.ident)
names(sample_name)=names(SO@active.ident)
SO@active.ident <- as.factor(vector())
SO@active.ident <- sample_name
SO.sub = subset(SO, ident = samples_to_include)
} else {
sample_name = as.factor(SO@meta.data$orig.ident)
names(sample_name)=names(SO@active.ident)
SO@active.ident <- as.factor(vector())
SO@active.ident <- sample_name
SO.sub = subset(SO, ident = samples_to_include)
}

marker_list <- marker_list[cells_of_interest]

# Remove columns with all missing values
Present_Markers_ls <- list()

for (celltype in colnames(marker_list)) {
print(names(marker_list[celltype]))
present=lapply(marker_list[[celltype]], function(x) x %in% rownames(SO.sub$SCT@scale.data))
absentgenes = unlist(marker_list[[celltype]])[present==FALSE]
presentgenes = unlist(marker_list[[celltype]])[present==TRUE]
print(paste0("Genes not present: ",paste0(absentgenes,collapse=",")))
print(paste0("Genes present: ",paste0(presentgenes,collapse=",")))

if(length(presentgenes) == 0){
print(paste0(names(marker_list[celltype]), " genes were not found in SO and will not be analyzed"))
} else {Present_Markers_ls[[celltype]] <- presentgenes}
}

# Padd processed list containing only the present genes
padded_list <- lapply(Present_Markers_ls, `length<-`, max(lengths(Present_Markers_ls)))
Markers_from_list <- as.data.frame.matrix(do.call(cbind, padded_list))

# Recognize any markers designated as proteins and insert protein expression data into slot where plots are created
if (protein_presence){
protein_markers <- Markers_from_list[grepl("_prot",Markers_from_list)]

protein_orig_markers <- gsub("_prot.*","",protein_markers)

protein_markers_name <- paste(protein_orig_markers,
"_prot", sep = "")

i = 0
protein_array <- list()
for (i in seq_along(protein_orig_markers)){
protein_array[[i]] <- SO.sub@assays$Protein[protein_orig_markers[i],]
rownames(protein_array[[i]]) <- protein_markers_name[i]
}
protein_array_comp <- do.call(rbind,protein_array)
SO.sub@assays$SCT@scale.data <- rbind(SO.sub@assays$SCT@scale.data,protein_array_comp)
}

# Add negative/low identifiers
neg_markers_names <- Markers_from_list[grepl("_neg",Markers_from_list)]
orig_markers <- gsub("_neg.*","",neg_markers_names)

# Append neg_markers_names to rownames of SO.sub
neg_markers_list <- list()

# Calculate adjusted expression for negative markers
for (i in seq_along(orig_markers)){
if (orig_markers[i] %in% rownames(SO.sub@assays$SCT@scale.data)){
# Format the data so that it can rbinded with [email protected]
neg_markers_list[[i]] <- t(matrix(max(SO.sub@assays$SCT@scale.data[orig_markers[i],]) - SO.sub@assays$SCT@scale.data[orig_markers[i],]))
row.names(neg_markers_list[[i]]) <- neg_markers_names[i]
colnames(neg_markers_list[[i]]) <- colnames(SO.sub@assays$SCT@scale.data)

# Append new Negative/low marker (w Expression Count) to SO slot
SO.sub@assays$SCT@scale.data <- rbind(SO.sub@assays$SCT@scale.data, neg_markers_list[[i]])
} else {
print(paste(orig_markers[i], " is not found in Seurat, cannot calculate negative expression", sep = ""))
}}

Markers_present = unlist(Markers_from_list)

if(!length(Markers_present)>0){
print("No Markers found in dataset")
return(NULL)
}

# Create list for storing color by gene plots of each celltype column
gg_storage <- list()

for (cell in colnames(Markers_from_list)){

title <- cell

markers_to_analyze <- as.character(Markers_from_list[[cell]])

grob <- lapply(markers_to_analyze,function(x) plotMarkers(x))
gg_storage[[cell]] <- gridExtra::arrangeGrob(grobs=grob,ncol = 1,newpage=F, as.table = F, top = text_grob(title,size = 15, face = "bold"))

}

#pdf("Color_by_Genes.pdf", 12, 15)
final_figures <- do.call(grid.arrange, c(gg_storage, ncol = ncol(Markers_from_list)))
#dev.off()

return(final_figures)
}

141 changes: 141 additions & 0 deletions R/Harmony_Batch_Correction.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# This code comes from NIDAP 'Harmony Batch Correction [scRNA-seq][CCBR]' code template
# Template Manager https://nidap.nih.gov/workspace/vector/templates/ri.vector.main.template.a097334e-21ac-466c-b467-c761b2c25f02
# Documentation https://nidap.nih.gov/workspace/notepad/view/ri.notepad.main.notepad.facd9ed0-90fc-4c55-bbb8-0153f06858d4

#' @title Performs batch correction on Seurat-class object
#' @description Returns a Seurat-class object with adjusted cell embeddings and gene expression data to account for variations in sample collection
#' @details Takes in a list of genes inputted by the user, displays gene expression information in particular slot-assay with (optional) outliers removed
#'
#' @param seurat_object Seurat-class object
#' @param variable_features Number of most variable genes to subset the gene expression data by
#' @param genes_to_add Add genes that might not be found among variably expressed genes
#' @param group.by.vars which variable should be accountted for when running batch correction

#' @import Seurat
#' @import harmony
#' @import gridExtra
#' @import RColorBrewer
#' @import ggplot2
#'
#' @export

#' @return Seurat-class Object with Harmony-adjusted gene expression (SCT slot) and tSNE cell embedding


harmony_batch_correct <- function(so,
variable_features = 2000,
genes_to_add,
group.by.vars) {

## --------------- ##
## Main Code Block ##
## --------------- ##

seur.SCT <- so@assays$SCT@scale.data
seur.SCT[1:10, 1:10]

genes_of_interest <- genes_to_add[genes_to_add %in% rownames(so@assays$SCT@scale.data)]

# Add more genes to analyze
VariableFeatures(so) <- c(VariableFeatures(so), genes_of_interest)
mvf <- VariableFeatures(so)[1:(variable_features + length(genes_of_interest))]
print(dim(so@assays$SCT@scale.data))
seur.SCT <- seur.SCT[mvf,]
seur.SCT <- t(seur.SCT)
# need the original loadings and embeddings to compare with my SVD
seur.loads <- so@reductions$pca@feature.loadings
seur.pca <- so@reductions$pca@cell.embeddings
sm.pca <- seur.pca[1:10, 1:10]
sm.seur.ldngs <- seur.loads[1:10, 1:10]

# SVD on scaled counts

variable_features <- length(mvf)
Sys.time()
pppca <- svd(seur.SCT)
Sys.time()
ppembed <- pppca$u %*% diag(pppca$d)
pcnames <- vector(mode = "character")
for (i in 1:dim(ppembed)[2])pcnames[i] <- paste("PC", i, sep = "_")
colnames(ppembed) <- pcnames
rownames(ppembed) <- rownames(seur.SCT)
sm.ppembed <- ppembed[1:10, 1:10]
ppldngs <- pppca$v
colnames(ppldngs) <- pcnames
rownames(ppldngs) <- mvf
sm.ppldngs <- ppldngs[1:10, 1:10]
ppembed[1:10, 1:10]
ppldngs[1:10, 1:10]

so@reductions$pca@cell.embeddings <- ppembed
so@reductions$pca@feature.loadings <- ppldngs
so@reductions$pca@stdev <- pppca$d
print(Sys.time())

so <- RunHarmony(so, group.by.vars,
do_pca=FALSE,
assay.use = "SCT",
plot_convergence = TRUE,
return_object=TRUE)

head(so@reductions$harmony@cell.embeddings)
head(so@reductions$harmony@feature.loadings)

so <- RunUMAP(so, reduction = "harmony",dims=1:4)
so <- RunTSNE(so, reduction = "harmony",dims=1:4)

sdat <- data.frame(as.vector(so@reductions$tsne@cell.embeddings[,1]),
as.vector(so@reductions$tsne@cell.embeddings[,2]),
so@meta.data[eval(parse(text = "group.by.vars"))])
names(sdat) <- c("TSNE1","TSNE2","ident")

n <- 2e3
set.seed(10)
ourColorSpace <- colorspace::RGB(runif(n), runif(n), runif(n))
ourColorSpace <- as(ourColorSpace, "LAB")


distinctColorPalette <-function(k=1) {
currentColorSpace <- ourColorSpace@coords
# Set iter.max to 20 to avoid convergence warnings.
set.seed(1)
km <- kmeans(currentColorSpace, k, iter.max=20)
colors <- unname(hex(LAB(km$centers)))
return(colors)
}

n <- 60
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
qual_col_pals = qual_col_pals[c(7,6,2,1,8,3,4,5),]
cols = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))

g1 <- ggplot(sdat, aes(x=TSNE1, y=TSNE2)) +
theme_bw() +
theme(legend.title=element_blank()) +
geom_point(aes(colour=ident),size=1) +
scale_color_manual(values=cols) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position="top",
panel.background = element_blank(),
legend.text=element_text(size=rel(0.5))) +
guides(colour = guide_legend(override.aes = list(size=5, alpha = 1)))

solist <- list(so, ppldngs)

# Calculate adjusted gene expression from embeddings
so <- solist[[1]]
ppldngs <- solist[[2]]
seur.SCT <- t(so@assays$SCT@scale.data)
harm.embeds <- so@reductions$harmony@cell.embeddings
#print(dim(harm.embeds))
harm.lvl.backcalc <- harm.embeds %*% t(ppldngs)

sm.harmexpr <- harm.lvl.backcalc[1:10, 1:4]
sm.comp.scld <- (seur.SCT[rownames(sm.harmexpr), colnames(sm.harmexpr)])

# Convert to ggplot that includes color by samples for higher resolution of batch correction, use for loop to screen for different genes
so@assays$SCT@scale.data <- t(harm.lvl.backcalc)

return(so)
}
Loading

0 comments on commit 08532bf

Please sign in to comment.