diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..91114bf --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,2 @@ +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/.gitignore b/.gitignore index d54dfbc..6020180 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .nfs* *.png Rcheck.txt +.Rproj.user diff --git a/R/Color_by_Genes_Automatic.R b/R/Color_by_Genes_Automatic.R new file mode 100644 index 0000000..d97aa01 --- /dev/null +++ b/R/Color_by_Genes_Automatic.R @@ -0,0 +1,233 @@ +# This code comes from NIDAP 'Color by Genes Automatic [scRNA-seq][CCBR]' code template +# Template Manager https://nidap.nih.gov/workspace/vector/templates/ri.vector.main.template.d71ed4e6-a25d-4f66-a186-27c00a50a703 +# Documentation https://nidap.nih.gov/workspace/notepad/view/ri.notepad.main.notepad.8101be48-85a5-40ec-8fa8-9fe28e5d31cb + +#' @title Create gene expression annotated dimension plots based on user inputted table +#' @description Returns a panel of dimension plots colored by individual gene expression +#' @details Takes in a list of genes inputted by the user, displays gene expression information on tsne, umap, or pca +#' +#' @param SO Seurat-class object +#' @param samples_to_include List of samples to subset the data by +#' @param samples_to_display List of samples to depict on dimension plot, samples not in the list would be colored gray in the background +#' @param marker_list Table of marker genes for each celltype (column names of the table), append "_prot" or "_neg" for proteins or negative markers +#' @param cells_of_interest Vector of celltypes from geneset_dataframe to screen for +#' @param protein_presence Set to TRUE if there are CITE-seq markers in geneset_dataframe +#' @param assay Name of the assay to extract gene expression data from +#' @param reduction_type Choose among tsne, umap, and pca +#' @param point_transparency Set to lower value for more see through points on dimension plot +#' @param point_shape Change the shape of points for between visualization +#' @param number_of_rows Set the number of rows to be displayed on the compiled gene expression dimension plot +#' @param doCiteSeq Set to TRUE if using Cite-seq data +#' +#' @import Seurat +#' @import tidyverse +#' @import gridExtra +#' @import ggpubr +#' @import ggplot2 +#' +#' @export + +#' @return compiled dimension plots of markers, in the same layout as the user-inputted marker table + +color_by_genes <- function(SO, + samples_to_include, + samples_to_display, + marker_list, + cells_of_interest, + protein_presence = FALSE, + assay = "SCT", + reduction_type = "umap", + point_transparency = 0.5, + point_shape = 16, + number_of_rows = 0, + doCiteSeq = FALSE){ + + ## --------- ## + ## Functions ## + ## --------- ## + + plotMarkers <- function(Markers){ + if (is.na(Markers) == TRUE) { + g <- ggplot() + theme_void() + return(g) + } else { + Markers.mat=SO.sub[[assay]]@scale.data[Markers,] + Markers.quant=quantile(Markers.mat[Markers.mat>1],probs=c(.1,.5,.90)) + Markers.mat[Markers.mat>Markers.quant[3]]=Markers.quant[3] + Markers.mat[Markers.mat% dplyr::arrange(sample_Markers) -> clusmat + if (grepl("_neg",Markers) == TRUE){ + + clusmat %>% dplyr::arrange(desc(sample_Markers)) -> clusmat + g <- ggplot(clusmat, aes(x=umap1, y=umap2, group=ident)) + + theme_bw() + + theme(legend.title=element_blank()) + + ggtitle(Markers) + + geom_point(aes(color=sample_Markers, shape=ident),alpha=point_transparency,shape=point_shape, size=1) + + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(),legend.text=element_text(size=rel(0.5)) )+ + scale_color_gradient(limits = c(0, Markers.quant[3]),low = "lightgrey", high = "red") + + xlab("umap-1") + ylab("umap-2") + return(g) + } else { + clusmat %>% dplyr::arrange(sample_Markers) -> clusmat + g <- ggplot(clusmat, aes(x=umap1, y=umap2, group=ident)) + + theme_bw() + + theme(legend.title=element_blank()) + + ggtitle(Markers) + + geom_point(aes(color=sample_Markers, shape=ident),alpha=point_transparency,shape=point_shape, size=1) + + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(),legend.text=element_text(size=rel(0.5)) )+ + scale_color_gradient(limits = c(0, Markers.quant[3]),low = "lightgrey", high = "red") + + xlab("umap-1") + ylab("umap-2") + return(g)} + } + } + + ## --------------- ## + ## Main Code Block ## + ## --------------- ## + + if (length(samples_to_include) == 0) { + samples_to_include = unique(SO@meta.data$sample_name) + } + + if("active.ident" %in% slotNames(SO)){ + sample_name = as.factor(SO@meta.data$orig.ident) + names(sample_name)=names(SO@active.ident) + SO@active.ident <- as.factor(vector()) + SO@active.ident <- sample_name + SO.sub = subset(SO, ident = samples_to_include) + } else { + sample_name = as.factor(SO@meta.data$orig.ident) + names(sample_name)=names(SO@active.ident) + SO@active.ident <- as.factor(vector()) + SO@active.ident <- sample_name + SO.sub = subset(SO, ident = samples_to_include) + } + + marker_list <- marker_list[cells_of_interest] + + # Remove columns with all missing values + Present_Markers_ls <- list() + + for (celltype in colnames(marker_list)) { + print(names(marker_list[celltype])) + present=lapply(marker_list[[celltype]], function(x) x %in% rownames(SO.sub$SCT@scale.data)) + absentgenes = unlist(marker_list[[celltype]])[present==FALSE] + presentgenes = unlist(marker_list[[celltype]])[present==TRUE] + print(paste0("Genes not present: ",paste0(absentgenes,collapse=","))) + print(paste0("Genes present: ",paste0(presentgenes,collapse=","))) + + if(length(presentgenes) == 0){ + print(paste0(names(marker_list[celltype]), " genes were not found in SO and will not be analyzed")) + } else {Present_Markers_ls[[celltype]] <- presentgenes} + } + + # Padd processed list containing only the present genes + padded_list <- lapply(Present_Markers_ls, `length<-`, max(lengths(Present_Markers_ls))) + Markers_from_list <- as.data.frame.matrix(do.call(cbind, padded_list)) + + # Recognize any markers designated as proteins and insert protein expression data into slot where plots are created + if (protein_presence){ + protein_markers <- Markers_from_list[grepl("_prot",Markers_from_list)] + + protein_orig_markers <- gsub("_prot.*","",protein_markers) + + protein_markers_name <- paste(protein_orig_markers, + "_prot", sep = "") + + i = 0 + protein_array <- list() + for (i in seq_along(protein_orig_markers)){ + protein_array[[i]] <- SO.sub@assays$Protein[protein_orig_markers[i],] + rownames(protein_array[[i]]) <- protein_markers_name[i] + } + protein_array_comp <- do.call(rbind,protein_array) + SO.sub@assays$SCT@scale.data <- rbind(SO.sub@assays$SCT@scale.data,protein_array_comp) + } + + # Add negative/low identifiers + neg_markers_names <- Markers_from_list[grepl("_neg",Markers_from_list)] + orig_markers <- gsub("_neg.*","",neg_markers_names) + + # Append neg_markers_names to rownames of SO.sub + neg_markers_list <- list() + + # Calculate adjusted expression for negative markers + for (i in seq_along(orig_markers)){ + if (orig_markers[i] %in% rownames(SO.sub@assays$SCT@scale.data)){ + # Format the data so that it can rbinded with SO$SCT@scale.data + neg_markers_list[[i]] <- t(matrix(max(SO.sub@assays$SCT@scale.data[orig_markers[i],]) - SO.sub@assays$SCT@scale.data[orig_markers[i],])) + row.names(neg_markers_list[[i]]) <- neg_markers_names[i] + colnames(neg_markers_list[[i]]) <- colnames(SO.sub@assays$SCT@scale.data) + + # Append new Negative/low marker (w Expression Count) to SO slot + SO.sub@assays$SCT@scale.data <- rbind(SO.sub@assays$SCT@scale.data, neg_markers_list[[i]]) + } else { + print(paste(orig_markers[i], " is not found in Seurat, cannot calculate negative expression", sep = "")) + }} + + Markers_present = unlist(Markers_from_list) + + if(!length(Markers_present)>0){ + print("No Markers found in dataset") + return(NULL) + } + + # Create list for storing color by gene plots of each celltype column + gg_storage <- list() + + for (cell in colnames(Markers_from_list)){ + + title <- cell + + markers_to_analyze <- as.character(Markers_from_list[[cell]]) + + grob <- lapply(markers_to_analyze,function(x) plotMarkers(x)) + gg_storage[[cell]] <- gridExtra::arrangeGrob(grobs=grob,ncol = 1,newpage=F, as.table = F, top = text_grob(title,size = 15, face = "bold")) + + } + + #pdf("Color_by_Genes.pdf", 12, 15) + final_figures <- do.call(grid.arrange, c(gg_storage, ncol = ncol(Markers_from_list))) + #dev.off() + + return(final_figures) +} + \ No newline at end of file diff --git a/R/Harmony_Batch_Correction.R b/R/Harmony_Batch_Correction.R new file mode 100644 index 0000000..4961de6 --- /dev/null +++ b/R/Harmony_Batch_Correction.R @@ -0,0 +1,141 @@ +# This code comes from NIDAP 'Harmony Batch Correction [scRNA-seq][CCBR]' code template +# Template Manager https://nidap.nih.gov/workspace/vector/templates/ri.vector.main.template.a097334e-21ac-466c-b467-c761b2c25f02 +# Documentation https://nidap.nih.gov/workspace/notepad/view/ri.notepad.main.notepad.facd9ed0-90fc-4c55-bbb8-0153f06858d4 + +#' @title Performs batch correction on Seurat-class object +#' @description Returns a Seurat-class object with adjusted cell embeddings and gene expression data to account for variations in sample collection +#' @details Takes in a list of genes inputted by the user, displays gene expression information in particular slot-assay with (optional) outliers removed +#' +#' @param seurat_object Seurat-class object +#' @param variable_features Number of most variable genes to subset the gene expression data by +#' @param genes_to_add Add genes that might not be found among variably expressed genes +#' @param group.by.vars which variable should be accountted for when running batch correction + +#' @import Seurat +#' @import harmony +#' @import gridExtra +#' @import RColorBrewer +#' @import ggplot2 +#' +#' @export + +#' @return Seurat-class Object with Harmony-adjusted gene expression (SCT slot) and tSNE cell embedding + + +harmony_batch_correct <- function(so, + variable_features = 2000, + genes_to_add, + group.by.vars) { + + ## --------------- ## + ## Main Code Block ## + ## --------------- ## + + seur.SCT <- so@assays$SCT@scale.data + seur.SCT[1:10, 1:10] + + genes_of_interest <- genes_to_add[genes_to_add %in% rownames(so@assays$SCT@scale.data)] + + # Add more genes to analyze + VariableFeatures(so) <- c(VariableFeatures(so), genes_of_interest) + mvf <- VariableFeatures(so)[1:(variable_features + length(genes_of_interest))] + print(dim(so@assays$SCT@scale.data)) + seur.SCT <- seur.SCT[mvf,] + seur.SCT <- t(seur.SCT) + # need the original loadings and embeddings to compare with my SVD + seur.loads <- so@reductions$pca@feature.loadings + seur.pca <- so@reductions$pca@cell.embeddings + sm.pca <- seur.pca[1:10, 1:10] + sm.seur.ldngs <- seur.loads[1:10, 1:10] + + # SVD on scaled counts + + variable_features <- length(mvf) + Sys.time() + pppca <- svd(seur.SCT) + Sys.time() + ppembed <- pppca$u %*% diag(pppca$d) + pcnames <- vector(mode = "character") + for (i in 1:dim(ppembed)[2])pcnames[i] <- paste("PC", i, sep = "_") + colnames(ppembed) <- pcnames + rownames(ppembed) <- rownames(seur.SCT) + sm.ppembed <- ppembed[1:10, 1:10] + ppldngs <- pppca$v + colnames(ppldngs) <- pcnames + rownames(ppldngs) <- mvf + sm.ppldngs <- ppldngs[1:10, 1:10] + ppembed[1:10, 1:10] + ppldngs[1:10, 1:10] + + so@reductions$pca@cell.embeddings <- ppembed + so@reductions$pca@feature.loadings <- ppldngs + so@reductions$pca@stdev <- pppca$d + print(Sys.time()) + + so <- RunHarmony(so, group.by.vars, + do_pca=FALSE, + assay.use = "SCT", + plot_convergence = TRUE, + return_object=TRUE) + + head(so@reductions$harmony@cell.embeddings) + head(so@reductions$harmony@feature.loadings) + + so <- RunUMAP(so, reduction = "harmony",dims=1:4) + so <- RunTSNE(so, reduction = "harmony",dims=1:4) + + sdat <- data.frame(as.vector(so@reductions$tsne@cell.embeddings[,1]), + as.vector(so@reductions$tsne@cell.embeddings[,2]), + so@meta.data[eval(parse(text = "group.by.vars"))]) + names(sdat) <- c("TSNE1","TSNE2","ident") + + n <- 2e3 + set.seed(10) + ourColorSpace <- colorspace::RGB(runif(n), runif(n), runif(n)) + ourColorSpace <- as(ourColorSpace, "LAB") + + + distinctColorPalette <-function(k=1) { + currentColorSpace <- ourColorSpace@coords + # Set iter.max to 20 to avoid convergence warnings. + set.seed(1) + km <- kmeans(currentColorSpace, k, iter.max=20) + colors <- unname(hex(LAB(km$centers))) + return(colors) + } + + n <- 60 + qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',] + qual_col_pals = qual_col_pals[c(7,6,2,1,8,3,4,5),] + cols = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals))) + + g1 <- ggplot(sdat, aes(x=TSNE1, y=TSNE2)) + + theme_bw() + + theme(legend.title=element_blank()) + + geom_point(aes(colour=ident),size=1) + + scale_color_manual(values=cols) + + theme(panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + legend.position="top", + panel.background = element_blank(), + legend.text=element_text(size=rel(0.5))) + + guides(colour = guide_legend(override.aes = list(size=5, alpha = 1))) + + solist <- list(so, ppldngs) + + # Calculate adjusted gene expression from embeddings + so <- solist[[1]] + ppldngs <- solist[[2]] + seur.SCT <- t(so@assays$SCT@scale.data) + harm.embeds <- so@reductions$harmony@cell.embeddings + #print(dim(harm.embeds)) + harm.lvl.backcalc <- harm.embeds %*% t(ppldngs) + + sm.harmexpr <- harm.lvl.backcalc[1:10, 1:4] + sm.comp.scld <- (seur.SCT[rownames(sm.harmexpr), colnames(sm.harmexpr)]) + + # Convert to ggplot that includes color by samples for higher resolution of batch correction, use for loop to screen for different genes + so@assays$SCT@scale.data <- t(harm.lvl.backcalc) + + return(so) +} diff --git a/R/ModuleScore.R b/R/ModuleScore.R new file mode 100644 index 0000000..f52f46c --- /dev/null +++ b/R/ModuleScore.R @@ -0,0 +1,356 @@ + # This code comes from NIDAP 'ModuleScore [scRNA-seq][CCBR]' code template + # Template Manager https://nidap.nih.gov/workspace/vector/templates/ri.vector.main.template.10cf059e-0bd0-4a1c-9a3a-65b8688dab23 + # Documentation https://nidap.nih.gov/workspace/notepad/view/ri.notepad.main.notepad.1d5957fa-ef90-4473-b627-989b9620278f + + #' @title Calculate average gene expression (subtracted by aggregated control feature set) and label cells according to ModuleScores + #' @description Returns Seurat-class object with metadata containing ModuleScores and Likely_CellType calls + #' @details Analyzed features are binned based on averaged expression; control features are randomly selected from each bin. + #' + #' @param SO Seurat-class object + #' @param sample_names List of samples to subset the data by + #' @param sample_to_display List of samples to depict on dimension plot, samples not in the list would be colored gray in the background + #' @param geneset_dataframe Table of marker genes for each celltype (column names of the table), append "_prot" or "_neg" for proteins or negative markers + #' @param proteins_presence Set to TRUE if there are CITE-seq markers in geneset_dataframe + #' @param celltypes_to_analyze Vector of celltypes from geneset_dataframe to screen for + #' @param manual_threshold Specify bimodal thresholds for cell classification, should be of the same length as celltypes_to_analyze vector + #' @param general_class Base population of cells to classify + #' @param multi_level_class Toggle to TRUE if there are subpopulations of cells you want to screen for + #' @param levels_dataframe Table of subpopulation levels (column names of the table) and parent-child information (e.g. Tcells-CD4, Tcells-CD8) + #' @param reduction Choose among tsne, umap, and pca + #' @param nbins Number of bins for storing control features and analyzing average expression + #' @param gradient_density_font_size Set size of axis labels on gradient density plot of ModuleScore distribution + #' @param violinplot_font_size Set size of axis labels on violin plot of ModuleScore distribution + #' @param step_size Set step size of distribution plots + #' @param image_width Set Image width (pixel units) + #' @param image_height Set Image height (pixel units) + #' @param image_type Choose between png and svg + #' + #' @import Seurat + #' @import tidyverse + #' @import gridExtra + #' @import quantmod + #' @import grid + #' @import data.table + #' @import utils + #' + #' @export + + #' @return Annotated dimension plot with ModuleScore distribution of cell marker gene, Seurat Object with cell classification metadata + +ModuleScore <- function(SO, + sample_names, + sample_to_display, + geneset_dataframe, + proteins_presence = FALSE, + celltypes_to_analyze, + manual_threshold = c(0), + general_class, + multi_level_class, + levels_dataframe, + reduction = "tsne", + nbins = 24, + gradient_density_font_size = 6, + violinplot_font_size = 6, + step_size = 0.1 + ){ + + ## ---------------- ## + ## Helper Functions ## + ## ---------------- ## + + Predict_Cell_from_ModScore <- function(ModScore_Metadata,manual_threshold,rejection){ + + thres_ls <- list() + for (i in 1:ncol(ModScore_Metadata)){ + thres_ls[[i]]<- rep(manual_threshold[i],nrow(ModScore_Metadata)) + } + thres_df <- data.frame(matrix(unlist(thres_ls),nrow = nrow(ModScore_Metadata))) + + thres_filter <- ModScore_Metadata > thres_df + ModScore_Metadata_post_thres_filter <- ModScore_Metadata * thres_filter + + # Find column number with highest modscore + max_col_vector <- max.col(ModScore_Metadata_post_thres_filter) + + # If a row contains all zeroes, they will be labeled with unknown + all_zero_filter <- as.integer(!apply(ModScore_Metadata_post_thres_filter, 1, function(find_zero_rows) all(find_zero_rows == 0))) + + # Final filtering: + final_filter <- (max_col_vector * all_zero_filter) + 1 + + # Original names appended to "unknown" classification for cells with ModScores below threshold + appended_names <- c(rejection, names(ModScore_Metadata)) + + # Added the names into a Likely_CellType Column + dupl_data <- ModScore_Metadata + dupl_data[,"Likely_CellType"] <- appended_names[final_filter] + return(dupl_data) + } + + ## --------------- ## + ## Main Code Block ## + ## --------------- ## + + geneset_for_module_scores <- unlist(geneset_dataframe) + + # Create a Barcode column if none is detected - required for matching cell calls in downstream steps + if (!"Barcode" %in% colnames(SO@meta.data)){ + SO@meta.data$Barcode <- rownames(SO@meta.data) + } + + if (length(sample_names) == 0) { + sample_names = unique(SO@meta.data$sample_name) + } + + colnames(SO@meta.data) <- gsub("orig_ident","orig.ident",colnames(SO@meta.data)) + + if("active.ident" %in% slotNames(SO)){ + sample_name = as.factor(SO@meta.data$orig.ident) + names(sample_name)=names(SO@active.ident) + SO@active.ident <- as.factor(vector()) + SO@active.ident <- sample_name + SO.sub = subset(SO, ident = sample_names) + } else { + sample_name = as.factor(SO@meta.data$orig.ident) + names(sample_name)=names(SO@active.ident) + SO@active.ident <- as.factor(vector()) + SO@active.ident <- sample_name + SO.sub = subset(SO, ident = sample_names) + } + + # Remove original unprocessed SO + rm(SO) + + # Adding protein marker expression + if (proteins_presence){ + protein_markers <- geneset_for_module_scores[grepl("_prot",geneset_for_module_scores)] + + protein_orig_markers <- gsub("_prot.*","",protein_markers) + + protein_markers_name <- paste(protein_orig_markers, + "_prot", sep = "") + + prot_indx = 0 + protein_array <- list() + for (prot_indx in seq_along(protein_orig_markers)){ + protein_array[[prot_indx]] <- SO.sub@assays$Protein[protein_orig_markers[prot_indx],] + rownames(protein_array[[prot_indx]]) <- protein_markers_name[prot_indx] + } + protein_array_comp <- do.call(rbind,protein_array) + SO.sub@assays$SCT@data <- rbind(SO.sub@assays$SCT@data,protein_array_comp) + } + + # Recognize any negative markers in marker list + neg_markers_names <- geneset_for_module_scores[grepl("_neg",geneset_for_module_scores)] + orig_markers <- gsub("_neg.*","",neg_markers_names) + + # Retrieve markers found in counts data + orig_markers <- orig_markers[orig_markers %in% rownames(SO.sub@assays$SCT@data)] + + neg_markers_list <- list() + + # Calculate adjusted expression for negative markers + for (neg_indx in seq_along(orig_markers)){ + + # Format the data so that it can rbinded with SO$SCT@scale.data + neg_markers_list[[neg_indx]] <- t(matrix(max(SO.sub@assays$SCT@data[orig_markers[neg_indx],]) - SO.sub@assays$SCT@data[orig_markers[neg_indx],])) + row.names(neg_markers_list[[neg_indx]]) <- neg_markers_names[neg_indx] + colnames(neg_markers_list[[neg_indx]]) <- colnames(SO.sub@assays$SCT@data) + + # Append new Negative/low marker (w Expression Count) to SO slot + SO.sub@assays$SCT@data <- rbind(SO.sub@assays$SCT@data, neg_markers_list[[neg_indx]]) + } + + # Retrive markers from list + marker = select(geneset_dataframe, celltypes_to_analyze) + marker.list = as.list(marker) + + # Error checking and messages for when threshold vector is zero or of non-matching length to number of celltypes to analyze + if (sum(unlist(marker.list) %in% rownames(SO.sub@assays$SCT@data)) == 0){ + stop("No genes from list was found in data") + } + + if (length(manual_threshold) != length(celltypes_to_analyze)){ + if (sum(manual_threshold) == 0){ + manual_threshold <- rep(0, length(celltypes_to_analyze)) + print("Manual threshold set to zero - outputing preliminary data") + } else { + stop("Manual threshold length does not match number of celltypes to analyze - please check manual thresholds") + }} + + names(manual_threshold) <- celltypes_to_analyze + + # Check if all markers for particular celltype is present, if none are present - corresponding celltype will not be analyzed + figures <- list() + exclude_cells <- c() + + h = 0 + j = 1 + + for (h in seq_along(marker.list)) { + print(names(marker.list[h])) + present=lapply(marker.list[[h]], function(x) x %in% rownames(SO.sub)) # apply function(x) x %in% rownames(SO.sub) to each element of marker.list + absentgenes = unlist(marker.list[[h]])[present==FALSE]; absentgenes=absentgenes[is.na(absentgenes)==F] + presentgenes = unlist(marker.list[[h]])[present==TRUE];presentgenes=presentgenes[is.na(presentgenes)==F] + print(paste0("Genes not present: ",paste0(absentgenes,collapse=","))) + print(paste0("Genes present: ",paste0(presentgenes,collapse=","))) + + if(length(presentgenes) == 0){ + print(paste0(names(marker.list[h]), " genes were not found in SO and will not be analyzed")) + exclude_cells[j] <- h + j = j + 1 + }} + + if (length(exclude_cells) > 0){ + marker.list <- marker.list[-exclude_cells]} else { + marker.list <- marker.list + } + + # Calculate aggregate expression of markers with Seurat's Module Score function + for (i in seq_along(marker.list)) { + SO.sub=AddModuleScore(SO.sub,marker.list[i],name = names(marker.list[i]), + nbin = nbins) + + m = 0 + + m = paste0(names(marker.list[i]),"1") + SO.sub@meta.data[[m]] <- scales::rescale(SO.sub@meta.data[[m]], to=c(0,1)) + + clusid = SO.sub@meta.data[[m]] + + # Calculate density data for ModScore vs Number of cells + d <- density(clusid) + + # Calculate dimension reduction + if(reduction=="tsne"){ + p1 <- DimPlot(SO.sub, reduction = "tsne", group.by = "ident") + } else if(reduction=="umap"){ + p1 <- DimPlot(SO.sub, reduction = "umap", group.by = "ident") + } else { + p1 <- DimPlot(SO.sub, reduction = "pca", group.by = "ident") + } + + if(reduction=="tsne"){ + clusmat=data.frame(ident=p1$data$ident,umap1=p1$data$tSNE_1,umap2=p1$data$tSNE_2, clusid=as.numeric(SO.sub@meta.data[[m]])) + } else if(reduction=="umap"){ + clusmat=data.frame(ident=p1$data$ident,umap1=p1$data$UMAP_1,umap2=p1$data$UMAP_2, clusid=as.numeric(SO.sub@meta.data[[m]])) + } else { + clusmat=data.frame(ident=p1$data$ident,umap1=p1$data$PC_1,umap2=p1$data$PC_2, clusid=as.numeric(SO.sub@meta.data[[m]])) + } + clusmat <- mutate(clusmat, sample_clusid = clusmat$clusid * grepl(paste(sample_to_display, collapse = "|"), clusmat$ident)) + + clusmat %>% group_by(clusid) %>% summarise(umap1.mean=mean(umap1), umap2.mean=mean(umap2)) -> umap.pos + title=as.character(m) + clusmat %>% dplyr::arrange(clusid) -> clusmat + + # Plot data onto colored dimension reduction plot + clusid.df <- data.frame(id=SO.sub@meta.data$orig.ident,ModuleScore=SO.sub@meta.data[[m]]) + + g <- ggplot(clusmat, aes(x=umap1, y=umap2)) + + theme_bw() + + theme(legend.title=element_blank()) + + geom_point(aes(colour=sample_clusid),alpha=0.5,shape = 20,size=1) + + scale_color_gradientn(colours = c("blue4","lightgrey", "red"), values = scales::rescale(c(0,manual_threshold[i]/2,manual_threshold[i],(manual_threshold[i]+1)/2,1), limits = c(0, 1))) + guides(colour = guide_legend(override.aes = list(size=5, alpha = 1))) + + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank()) + + xlab("tsne-1") + ylab("tsne-2") + + g1 = RidgePlot(SO.sub,features=m,group.by="orig.ident") + theme(legend.position = "none", title = element_blank(), axis.text.x = element_text(size = gradient_density_font_size)) + geom_vline(xintercept = manual_threshold[i], linetype = "dashed", color = "red3") + scale_x_continuous(breaks = seq(0,1,step_size)) + + # Violin Plot + g2 = ggplot(clusid.df,aes(x=id,y=ModuleScore)) + geom_violin(aes(fill=id)) + theme_classic() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),legend.title = element_blank(), panel.background = element_blank(), axis.text.x=element_blank(),legend.text=element_text(size=rel(0.8)),legend.position="top", axis.text.y = element_text(size = violinplot_font_size)) + guides(colour = guide_legend(override.aes = list(size=5, alpha = 1))) + geom_hline(yintercept = manual_threshold[i], linetype = "dashed", color = "red3") + scale_y_continuous(breaks = seq(0,1,step_size)) + + # Color gradient density plot + g3 = ggplot(data.frame(x = d$x, y = d$y), aes(x, y)) + xlab("ModuleScore") + ylab("Density") + geom_line() + + geom_segment(aes(xend = d$x, yend = 0, colour = x)) + scale_y_log10() + + scale_color_gradientn(colours = c("blue4","lightgrey", "red"), values = scales::rescale(c(0,manual_threshold[i]/2,manual_threshold[i],(manual_threshold[i]+1)/2,1), limits = c(0, 1))) + geom_vline(xintercept = manual_threshold[i], linetype = "dashed", color = "red3") + geom_vline(xintercept = manual_threshold[i], linetype = "dashed", color = "red3") + scale_x_continuous(breaks = seq(0,1,step_size)) + theme(legend.title = element_blank(), axis.text.x = element_text(size = 6)) + + # Set title for grid.arranged final figure + figures[[i]] = grid.arrange(g,g1,g2,g3, ncol=2, top=textGrob(names(marker.list[i]), gp = gpar(fontsize = 14, fontface = "bold"))) + } + + # Get rid of "1" at the end of MS columns + colnames(SO.sub@meta.data)[colnames(SO.sub@meta.data) %in% paste0(names(marker.list),1)] <- names(marker.list) + + # Classification of general cell class + + # Analyze and plot only the class of cells found in the metadata + general_class <- general_class[general_class %in% colnames(SO.sub@meta.data)] + + # Subset the columns of the metadata containing module scores only + SO_Trunc_Metadata_General <- SO.sub@meta.data[general_class] + + General_thres_vec <- manual_threshold[general_class] + + # See if elements in each ModScore column exceeds CellType threshold, set elements below threshold to zero. Keep values of elements above threshold + storage_list_MS_calls <- list() + + Calls_output <- Predict_Cell_from_ModScore(SO_Trunc_Metadata_General,General_thres_vec,rejection = "unknown") + Calls_output$Barcode <- rownames(Calls_output) + + if (multi_level_class){ + + for (k in 1:ncol(levels_dataframe)){ + + # Initialize list for temporarily keeping results from subpopulation calls + Sub_class_calls <- list() + + ## Subclass Identification + # Remove any NAs from comparisons + Sub_class_storage <- levels_dataframe[[k]][!is.na(levels_dataframe[[k]])] + + parent_class <- unique(gsub("(.*)-(.*)","\\1",Sub_class_storage)) + + for (parent in parent_class){ + Sub_class <- Sub_class_storage[grepl(parent,Sub_class_storage)] + children_class <- gsub("(.*)-(.*)","\\2",Sub_class) + + # Subset out cells predicted to be parents. Child cells will be screened from this population + parents <- Calls_output$Barcode[Calls_output$Likely_CellType == parent] + SO_Trunc_Metadata_parents <- SO.sub@meta.data[parents,] %>% select(children_class) + + # Stores a new density plot containing MS information of children cells within parent population + for (children in children_class){ + + plot_title <- paste("Density plot for",children,"Module Scores within", parent,"population", sep = " ") + adjusted_density_plot <- ggplot(SO_Trunc_Metadata_parents, aes_string(x = children)) + geom_density() + ggtitle(plot_title) + geom_vline(xintercept = manual_threshold[children], linetype = "dashed", color = "red3") + + figures[[length(figures) + 1]] <- adjusted_density_plot + } + + # Create output table without information of parent population, will be the table with updated cell calls will be reinserted + SO_Trunc_Metadata_no_parents <- Calls_output[!Calls_output$Likely_CellType == parent,] + non_parents <- rownames(SO_Trunc_Metadata_no_parents) + + gen_annot_table <- data.frame(cells = non_parents, identity = SO_Trunc_Metadata_no_parents$Likely_CellType) + + # Repeat Module Score Comparison and Cell Prediction with Child Subset: + children_thres_vec <- manual_threshold[children_class] + + Sub_class_calls[[match(parent,parent_class)]] <- Predict_Cell_from_ModScore(SO_Trunc_Metadata_parents,children_thres_vec,rejection = parent) %>% select(Likely_CellType) + } + + # Reappend subclassification results back to general output for higher level classification + Sub_class_calls <- do.call(rbind,Sub_class_calls) + Sub_class_calls$Barcode <- rownames(Sub_class_calls) + + # Continuously update Likely_CellType column in Calls Output with new results from each level subclassification(s) + Calls_output$Likely_CellType_updated <- Sub_class_calls$Likely_CellType[match(Calls_output$Barcode,Sub_class_calls$Barcode)] + + Calls_output <- Calls_output %>% mutate(Likely_CellType = case_when( + is.na(Likely_CellType_updated) ~ Likely_CellType, + TRUE ~ Likely_CellType_updated + )) + + # Remove Likely_CellType_updated + Calls_output$Likely_CellType_updated <- NULL + }} + + ## Updating CellType(s) in metadata with subclass calls + SO.sub@meta.data$Likely_CellType <- Calls_output$Likely_CellType[match(SO.sub@meta.data$Barcode,Calls_output$Barcode)] + + modscore_res <- list(Arranged_figures <- do.call("grid.arrange", c(figures)), + Seurat_Object = SO.sub) + + return(modscore_res) + } + \ No newline at end of file diff --git a/R/Pseudobulk_DEG.R b/R/Pseudobulk_DEG.R new file mode 100644 index 0000000..ddab0ea --- /dev/null +++ b/R/Pseudobulk_DEG.R @@ -0,0 +1,549 @@ +# This code comes from NIDAP 'Pseudobulk DEG [scRNA-seq][CCBR]' code template +# Template Manager https://nidap.nih.gov/workspace/vector/templates/ri.vector.main.template.07c5b878-ada6-4cd8-9f61-e312f8491904 +# Documentation https://nidap.nih.gov/workspace/notepad/view/ri.notepad.main.notepad.422102e6-4d67-4ec9-baf7-e4f1fd78e0a8 + +#' @title Pseudobulk Differential Gene Expression Analysis +#' @description Performs Linear Models for Microarray Data on single-cell data +#' @details Aggregate gene expression amongst replicates before performing limma regression +#' +#' @param so Seurat-class object +#' @param contrasts Choose the groups (group1-group2) to compare transcription profiles against, positive Fold Change values indicate upregulation in group1 +#' @param replicate Column of metadata containing replicate information, gene expression will first be aggregated based on this column before differential analysis is ran +#' @param comparison_level Name of metadata column containing information on the sub-levels you want to divide your data into. +#' @param label Name of metadata column containing information on the groups (contrasts) you want to compare. +#' @param min_num_cells Filters a comparison level based on the minimum number of cells at that level. +#' @param min_num_reps Filters a comparison level based on the minimum number of replicates at that level. +#' @param min_num_features Filters out genes if they are expressed in fewer than X number of cells. +#' @param subdivide_data Toggle to TRUE if you want to make comparisons across different levels within your data. +#' +#' @import Seurat +#' @import edgeR +#' @import purrr +#' @import magrittr +#' @import tidyverse +#' +#' @export + +#' @return data.frame of differentially expressed genes across comparisons + +Pseudobulk_DEG <- function(so, + contrasts, + replicate, + comparison_level, + label, + min_num_cells = 3, + min_num_reps = 1, + min_num_features = 0, + subdivide_data = FALSE) { + + ## ---------------- ## + ## Helper Functions ## + ## ---------------- ## + + check_inputs = function(input, + meta = meta, + replicate_col = 'Replicate', + cell_type_col = 'Comparison', + label_col = 'Label') { + + # extract cell types and label from metadata + if ("Seurat" %in% class(input)) { + # confirm Seurat is installed + if (!requireNamespace("Seurat", quietly = TRUE)) { + stop("install \"Seurat\" R package for Augur compatibility with ", + "input Seurat object", call. = FALSE) + } + meta = input@meta.data %>% + droplevels() + if (!is.null(replicate_col)) + replicates = as.character(meta[[replicate_col]]) + if (!is.factor(meta[[label_col]])) { + labels = meta[[label_col]] + } else { + labels = as.character(meta[[label_col]]) + } + cell_types = as.character(meta[[cell_type_col]]) + expr = Seurat::GetAssayData(input, slot = 'counts') + } else if ("cell_data_set" %in% class(input)) { + # confirm monocle3 is installed + if (!requireNamespace("monocle3", quietly = TRUE)) { + stop("install \"monocle3\" R package for Augur compatibility with ", + "input monocle3 object", call. = FALSE) + } + meta = monocle3::pData(input) %>% + droplevels() %>% + as.data.frame() + if (!is.null(replicate_col)) + replicates = as.character(meta[[replicate_col]]) + if (!is.factor(meta[[label_col]])) { + labels = meta[[label_col]] + } else { + labels = as.character(meta[[label_col]]) + } + cell_types = as.character(meta[[cell_type_col]]) + expr = monocle3::exprs(input) + } else if ("SingleCellExperiment" %in% class(input)){ + # confirm SingleCellExperiment is installed + if (!requireNamespace("SingleCellExperiment", quietly = TRUE)) { + stop("install \"SingleCellExperiment\" R package for Augur ", + "compatibility with input SingleCellExperiment object", + call. = FALSE) + } + meta = SummarizedExperiment::colData(input) %>% + droplevels() %>% + as.data.frame() + if (!is.null(replicate_col)) + replicates = as.character(meta[[replicate_col]]) + if (!is.factor(meta[[label_col]])) { + labels = meta[[label_col]] + } else { + labels = as.character(meta[[label_col]]) + } + cell_types = as.character(meta[[cell_type_col]]) + expr = SummarizedExperiment::assay(input) + } else { + # check if input is sparse matrix or numberic matrix/df + valid_input = is(input, 'sparseMatrix') || + is_numeric_matrix(input) || + is_numeric_dataframe(input) + if (!valid_input) + stop("input must be Seurat, monocle, sparse matrix, numeric matrix, or ", + "numeric data frame") + if (is.null(meta)) + stop("input matrix must be accompanied by a metadata table") + expr = input + if (!is.null(replicate_col)) + replicates = as.character(meta[[replicate_col]]) + labels = as.character(meta[[label_col]]) + cell_types = as.character(meta[[cell_type_col]]) + } + + # check dimensions are non-zero + if (length(dim(expr)) != 2 || !all(dim(expr) > 0)) { + stop("expression matrix has at least one dimension of size zero") + } + + # check dimensions match + n_cells1 = nrow(meta) + n_cells2 = ncol(expr) + if (n_cells1 != n_cells2) { + stop("number of cells in metadata (", n_cells1, ") does not match number ", + "of cells in expression (", n_cells2, ")") + } + + # check at least two labels + if (n_distinct(labels) == 1) { + stop("only one label provided: ", unique(labels)) + } + + # check for missing labels or cell types + if (any(is.na(labels))) { + stop("labels contain ", sum(is.na(labels)), "missing values") + } + if (any(is.na(cell_types))) { + stop("cell types contain ", sum(is.na(cell_types)), "missing values") + } + if (!is.null(replicate_col) && any(is.na(replicates))) { + stop("replicates contain ", sum(is.na(replicates)), "missing values") + } + + # check for missing replicates + if (!is.null(replicate_col) && is.null(replicates)) { + stop("metadata does not contain replicate information") + } + + # remove missing values + missing = is.na(expr) + if (any(missing)) { + stop("matrix contains ", sum(missing), "missing values") + } + + # clean up the meta data + if (!is.null(replicate_col)) { + meta <- meta %>% as.data.frame() %>% + mutate(cell_barcode = rownames(meta), + replicate = meta[[replicate_col]], + cell_type = meta[[cell_type_col]], + label = meta[[label_col]]) %>% + mutate_at(vars(replicate, cell_type, label), as.factor) + } else { + meta <- meta %>% as.data.frame() %>% + mutate(cell_barcode = rownames(meta), + cell_type = meta[[cell_type_col]], + label = meta[[label_col]]) %>% + mutate_at(vars(cell_type, label), as.factor) + } + + # make sure meta contains row names and is a data frame + rownames(meta) = colnames(expr) + meta = as.data.frame(meta) + to_return = list( + expr = expr, + meta = meta + ) + return(to_return) + } + + # Create a pseudobulk matrix + # Convert a single-cell expression matrix (i.e., genes by cells) + # to a pseudobulk matrix by summarizing counts within biological replicates + + to_pseudobulk = function(input, + meta = NULL, + replicate_col = 'Replicate', + cell_type_col = 'Comparison', + label_col = 'Label', + min_cells = 3, + min_reps = 2, + min_features = 0, + external = T, + subdivide = T) { + if (external) { + # first, make sure inputs are correct + inputs = check_inputs( + input, + meta = meta, + replicate_col = replicate_col, + cell_type_col = cell_type_col, + label_col = label_col) + expr = inputs$expr + meta = inputs$meta + } else { + expr = input + } + + # convert to characters + meta$replicate <- as.character(meta[,eval(parse(text = "replicate"))]) + meta$cell_type <- as.character(meta[,eval(parse(text = "comparison_level"))]) + meta$label <- as.character(meta[,eval(parse(text = "label"))]) + + ## Code from v20 ~ not working when using variable to subset + #meta %<>% mutate(replicate = as.character(meta[,replicate]), + # cell_type = as.character(meta[,comparison_level]), + # label = as.character(meta[,label])) + + # keep only cell types with enough cells + keep = meta %>% + dplyr::count(cell_type, label) %>% + group_by(cell_type) %>% + filter(all(n >= min_cells)) %>% + pull(cell_type) %>% + unique() + + # Need to make rownames = barcode otherwise, downstream matrix multiplication won't work + rownames(meta) <- meta$Barcode + + if (subdivide == TRUE){ + # process data into gene x replicate x cell_type matrices + pseudobulks = keep %>% + map( ~ { + print(.) + cell_type = . + meta0 = meta %>% filter(cell_type == !!cell_type) + expr0 = expr %>% magrittr::extract(, meta0$cell_barcode) + # catch cell types without replicates or conditions + if (n_distinct(meta0$label) < 2) + return(NA) + replicate_counts = distinct(meta0, label, replicate) %>% + group_by(label) %>% + summarise(replicates = n_distinct(replicate)) %>% + pull(replicates) + if (any(replicate_counts < min_reps)) + return(NA) + + # process data into gene X replicate X cell_type matrice + mm = model.matrix(~ 0 + replicate:label, data = meta0) + + # Test # + #mm = model.matrix(~ replicate:label, data = meta0) + # Test # + + mat_mm = expr0 %*% mm + + ### Original code - start ### + #keep_genes = rowSums(mat_mm > 0) > min_features + ### Original code - end ### + + ### New code - start ### + keep_genes = rowSums(as.array(mat_mm > 0)) > min_features + ### New code - end ### + + mat_mm = mat_mm[keep_genes, ] %>% as.data.frame() + mat_mm %<>% as.data.frame() + colnames(mat_mm) = gsub("replicate|label", "", colnames(mat_mm)) + # drop empty columns + keep_samples = colSums(mat_mm) > 0 + mat_mm %<>% magrittr::extract(, keep_samples) + return(mat_mm) + }) %>% + setNames(keep) + } else { + keep = "all" + pseudobulks = keep %>% + map( ~ { + print(.) + cell_type = . + meta0 = meta + expr0 = expr + # catch cell types without replicates or conditions + if (n_distinct(meta0$label) < 2) + return(NA) + replicate_counts = distinct(meta0, label, replicate) %>% + group_by(label) %>% + summarise(replicates = n_distinct(replicate)) %>% + pull(replicates) + if (any(replicate_counts < min_reps)) + return(NA) + + # process data into gene X replicate X cell_type matrice + mm = model.matrix(~ 0 + replicate:label, data = meta0) + + # Test # + #mm = model.matrix(~ replicate:label, data = meta0) + # Test # + + mat_mm = expr0 %*% mm + + ### Original code - start ### + #keep_genes = rowSums(mat_mm > 0) > min_features + ### Original code - end ### + + ### New code - start ### + keep_genes = rowSums(as.array(mat_mm > 0)) > min_features + ### New code - end ### + + mat_mm = mat_mm[keep_genes, ] %>% as.data.frame() + mat_mm %<>% as.data.frame() + colnames(mat_mm) = gsub("replicate|label", "", colnames(mat_mm)) + # drop empty columns + keep_samples = colSums(mat_mm) > 0 + mat_mm %<>% magrittr::extract(, keep_samples) + return(mat_mm) + }) %>% + setNames(keep) + } + + # drop NAs + pseudobulks %<>% magrittr::extract(!is.na(.)) + + # also filter out cell types with no retained genes + min_dim = map(pseudobulks, as.data.frame) %>% map(nrow) + pseudobulks %<>% magrittr::extract(min_dim > 1) + + # also filter out types without replicates + min_repl = map_int(pseudobulks, ~ { + # make sure we have a data frame a not a vector + tmp = as.data.frame(.) + targets = data.frame(group_sample = colnames(tmp)) %>% + mutate(group = gsub(".*\\:", "", group_sample)) + if (n_distinct(targets$group) == 1) + return(as.integer(0)) + min(table(targets$group)) + }) + pseudobulks %<>% magrittr::extract(min_repl >= min_reps) + return(pseudobulks) + } + + # Run a DE analysis within each cell type using Seurat + + pseudobulk_de = function(input, + meta = NULL, + replicate_col = replicate, + cell_type_col = comparison_level, + label_col = label, + min_cells = min_num_cells, + min_reps = min_num_reps, + min_features = min_num_features, + de_family = 'pseudobulk', + de_method = 'limma', + de_type = 'LRT', + subdivide = subdivide_data) { + # check args + if (de_method == 'limma') { + if (de_type != 'voom') { + # change default type to use + de_type = 'trend' + } + } + + # get the pseudobulks list + pseudobulks = to_pseudobulk( + input = input, + meta = meta, + replicate_col = replicate_col, + cell_type_col = cell_type_col, + label_col = label_col, + min_cells = min_cells, + min_reps = min_reps, + min_features = min_features, + external = T, + subdivide = subdivide + ) + + results = map(pseudobulks, function(x) { + # create targets matrix + targets = data.frame(group_sample = colnames(x)) %>% + mutate(group = gsub(".*\\:", "", group_sample)) + ## optionally, carry over factor levels from entire dataset + if (is.factor(meta$label)) { + targets$group %<>% factor(levels = levels(meta$label)) + } + # Need to set to higher values if more than 2 contrasts variables + if (n_distinct(targets$group) > 100000) + return(NULL) + # create design + + ### Possible Code for Design Formula (dm.formula) ### + dm.formula <- as.formula(paste("~0", paste(c("group")), sep="+", collapse="+")) + #design = model.matrix(~ group, data = targets) + design = model.matrix(dm.formula, data = targets) + + DE = switch(de_method, + edgeR = { + tryCatch({ + y = DGEList(counts = x, group = targets$group) %>% + calcNormFactors(method = 'TMM') %>% + estimateDisp(design) + test = switch(de_type, + QLF = { + fit = glmQLFit(y, design) + test = glmQLFTest(fit, coef = -1) + }, + LRT = { + fit = glmFit(y, design = design) + test = glmLRT(fit) + }) + res = topTags(test, n = Inf) %>% + as.data.frame() %>% + rownames_to_column('gene') %>% + # flag metrics in results + mutate(de_family = 'pseudobulk', + de_method = de_method, + de_type = de_type) + }, error = function(e) { + message(e) + data.frame() + }) + }, + DESeq2 = { + tryCatch({ + dds = DESeqDataSetFromMatrix(countData = x, + colData = targets, + design = ~ group) + dds = switch(de_type, + Wald = { + dds = try(DESeq(dds, + test = 'Wald', + fitType = 'parametric', + sfType = 'poscounts', + betaPrior = F)) + }, + LRT = { + dds = try(DESeq(dds, + test = 'LRT', + reduced = ~ 1, + fitType = 'parametric', + sfType = 'poscounts', + betaPrior = F)) + } + ) + res = results(dds) + # write + res = as.data.frame(res) %>% + mutate(gene = rownames(x)) %>% + # flag metrics in results + mutate(de_family = 'pseudobulk', + de_method = de_method, + de_type = de_type) + }, error = function(e) { + message(e) + data.frame() + }) + }, + limma = { + tryCatch({ + x = switch(de_type, + trend = { + trend_bool = T + dge = DGEList(as.matrix(x), group = targets$group) + dge = calcNormFactors(dge) + x = new("EList") + x$E = cpm(dge, log = TRUE, prior.count = 3) + x + }, + voom = { + counts = all(as.matrix(x) %% 1 == 0) + if (counts) { + trend_bool = F + x = voom(as.matrix(x), design) + x + } + }) + + # get fit + #Run Contrasts + + contrasts <- unlist(contrasts %>% strsplit("-") %>% + lapply(function (x) paste("group",x,sep="")) %>% + lapply(paste, collapse = "-")) + + cm <- makeContrasts(contrasts = contrasts, levels=design) + + fit2 = lmFit(x, design) %>% contrasts.fit(cm) %>% + eBayes(trend = trend_bool, robust = trend_bool) + + logFC = fit2$coefficients + colnames(logFC)=paste(colnames(logFC),"logFC",sep="_") + tstat = fit2$t + colnames(tstat)=paste(colnames(tstat),"tstat",sep="_") + FC = 2^fit2$coefficients + FC = ifelse(FC<1,-1/FC,FC) + colnames(FC)=paste(colnames(FC),"FC",sep="_") + pvalall=fit2$p.value + colnames(pvalall)=paste(colnames(pvalall),"pval",sep="_") + pvaladjall=apply(pvalall,2,function(x) p.adjust(x,"BH")) + colnames(pvaladjall)=paste(colnames(fit2$coefficients),"adjpval",sep="_") + + ### Original Code - Start ### + + ## format the results + #res = fit2 %>% + ## extract all coefs except intercept + #topTable(number = Inf, coef = NULL) %>% + #rownames_to_column('gene') %>% + ## flag metrics in results + #mutate( + # de_family = 'pseudobulk', + # de_method = de_method, + # de_type = de_type) + + ### Original Code - End ### + + res=as.data.frame(cbind(FC, logFC, tstat, pvalall, pvaladjall)) + + res$Gene <- rownames(res) + + res <- res %>% mutate( + de_family = 'pseudobulk', + de_method = de_method, + de_type = de_type) + + }, error = function(e) { + message(e) + data.frame() + }) + } + ) + }) + results %<>% bind_rows(.id = 'cell_type') + results <- results %>% dplyr::select("Gene", everything()) + } + + ## --------------- ## + ## Main Code Block ## + ## --------------- ## + + DE <- pseudobulk_de(so) + + return(DE) +} diff --git a/R/Violin_Plots_by_Metadata.R b/R/Violin_Plots_by_Metadata.R new file mode 100644 index 0000000..7d78faf --- /dev/null +++ b/R/Violin_Plots_by_Metadata.R @@ -0,0 +1,268 @@ +# This code comes from NIDAP 'Violin Plots by Metadata Column [scRNA-seq][CCBR]' code template +# Template Manager https://nidap.nih.gov/workspace/vector/templates/ri.vector.main.template.1011399a-426d-4aa1-bb3f-4430e97ea9ec +# Documentation https://nidap.nih.gov/workspace/notepad/view/ri.notepad.main.notepad.fbef08da-93ba-4301-aac7-2545631aad4f +# splitFacet helper function comes from https://stackoverflow.com/questions/30510898/split-facet-plot-into-list-of-plots/52225543 + +#' @title Violin Plot by Metadata +#' @description Returns a violin plot of gene expression data that is partitioned by user defined metadata column +#' @details Takes in a list of genes inputted by the user, displays gene expression information in particular slot-assay with (optional) outliers removed +#' +#' @param seurat_object Seurat-class object +#' @param assay Name of the assay to extract gene expression data from +#' @param slot Name of the slot within the assay to extract gene expression data from +#' @param ident_of_interest Split violin plot based on a column from the seurat object metadata +#' @param groups_of_interest Include only a specific subset from ident_of_interest +#' @param genes_of_interest Genes to visualize on the violin plot +#' @param filter_outliers Filter outliers from the data (TRUE/FALSE) +#' @param scale_data Scale data from 0 to 1 (TRUE/FALSE) +#' @param log_scale_data Transform data onto a log10 scale (TRUE/FALSE) +#' @param reorder_ident Numeric data will be ordered naturally by default. Toggling this option will order the groups to match the group list if non-numeric, and will have no effect if otherwise. +#' @param rename_ident Give alternative names to ident_of_interest displayed on the graph +#' @param ylimit Y-axis limit +#' @param plot_style Choose between grid, labeled, and row +#' @param outlier_low_lim Filter lower bound outliers (default = bottom 10 percentile) +#' @param outlier_up_lim Filter upper bound outliers (default = bottom 90 percentile) +#' @param jitter_points Scatter points on the plot (TRUE/FALSE) +#' @param jitter_width Set spread of jittered points +#' @param jitter_dot_size Set size of individual points +#' @param print_outliers Print outliers as points in your graph that may be redundant to jitter + +#' @import Seurat +#' @import reshape2 +#' @import cowplot +#' @import rlang +#' @import ggplot2 +#' +#' @export + +#' @return violin ggplot2 object + +ViolinPlot <- function(so, + assay = "SCT", + slot = "scale.data", + ident_of_interest, + groups_of_interest, + genes_of_interest, + filter_outliers = FALSE, + scale_data = TRUE, + log_scale_data = FALSE, + reorder_ident = TRUE, + rename_ident = "", + ylimit = 0, + plot_style = "grid", + outlier_low_lim = 0.1, + outlier_up_lim = 0.9, + jitter_points = FALSE, + jitter_width = 0.05, + jitter_dot_size = 0.5, + print_outliers = TRUE){ + + ## ---------------## + ## Error Messages ## + ## ---------------## + + gene_filter <- genes_of_interest %in% rownames(x = GetAssayData(object = so, slot = slot, assay = assay)) + genes_of_interest <- genes_of_interest[gene_filter] + missing_genes <- genes_of_interest[!gene_filter] + if(length(missing_genes) > 0){ + cat("The following genes are missing from the dataset:\n") + print(missing_genes) + } + + if(length(genes_of_interest) == 0){ + stop("No query genes were found in the dataset.") + } + + if(!ident_of_interest %in% colnames(so@meta.data)){ + colnames(so@meta.data) <- gsub("\\.","_",colnames(so@meta.data)) + if(!ident_of_interest %in% colnames(so@meta.data)){ + stop("Unable to find ident of interest in metadata.") + } + } + + group_filter <- groups_of_interest %in% so@meta.data[[ident_of_interest]] + groups_of_interest <- groups_of_interest[group_filter] + missing_groups <- groups_of_interest[!group_filter] + if(length(missing_groups) > 0){ + cat("The following groups are missing from the selected ident:\n") + print(missing_groups) + } + + if(length(groups_of_interest) == 0){ + stop("No groups were found in the selected ident.") + } + + if(rename_ident %in% c("Gene","Expression","scaled")){ + stop("New ident name cannot be one of Gene, Expression, or scaled.") + } + + ## ---------------- ## + ## Helper Functions ## + ## ---------------- ## + + splitFacet <- function(x){ + facet_vars <- names(x$facet$params$facets) # 1 + x$facet <- ggplot2::ggplot()$facet # 2 + datasets <- split(x$data, x$data[facet_vars]) # 3 + new_plots <- lapply(datasets,function(new_data) { # 4 + x$data <- new_data + x}) + } + + # from rapportools + is.empty <- function(x, trim = TRUE, ...) { + if (length(x) <= 1) { + if (is.null(x)) + return (TRUE) + if (length(x) == 0) + return (TRUE) + if (is.na(x) || is.nan(x)) + return (TRUE) + if (is.character(x) && nchar(ifelse(trim, trim.space(x), x)) == 0) + return (TRUE) + if (is.logical(x) && !isTRUE(x)) + return (TRUE) + if (is.numeric(x) && x == 0) + return (TRUE) + return (FALSE) + } else + sapply(x, is.empty, trim = trim, ...) + } + + # from rapportools + trim.space <- function(x, what = c('both', 'leading', 'trailing', 'none'), space.regex = '[:space:]', ...){ + if (missing(x)) + stop('nothing to trim spaces to =(') + re <- switch(match.arg(what), + both = sprintf('^[%s]+|[%s]+$', space.regex, space.regex), + leading = sprintf('^[%s]+', space.regex), + trailing = sprintf('[%s]+$', space.regex), + none = { + return (x) + }) + vgsub(re, '', x, ...) + } + + # from rapportools + vgsub <- function(pattern, replacement, x, ...){ + for(i in 1:length(pattern)) + x <- gsub(pattern[i], replacement[i], x, ...) + x + } + + remove_outliers_func <- function(x, na.rm = TRUE){ + qnt <- quantile(x, probs=c(outlier_low_lim,outlier_up_lim), na.rm = na.rm) + H <- 1.5*IQR(x, na.rm = na.rm) + y <- x + y[x < (qnt[1] - H)] <- NA + y[x > (qnt[2] + H)] <- NA + y + } + + ## --------------- ## + ## Main Code Block ## + ## --------------- ## + + # deal with limits + if(ylimit == 0){ + ylimit <- NULL + } + + #set idents + if("active.ident" %in% slotNames(so)){ + new_idents = as.factor(so@meta.data[[ident_of_interest]]) + names(new_idents) = names(so@active.ident) + so@active.ident <- as.factor(vector()) + so@active.ident <- new_idents + so.sub = subset(so, ident = groups_of_interest) + } else { + new_idents = as.factor(so@meta.data[[ident_of_interest]]) + names(sample_name)=so@meta.data[["Barcode"]] + so@active.ident <- as.factor(vector()) + so@active.ident <- new_idents + so.sub = subset(so, ident = groups_of_interest) + } + + DefaultAssay(object = so) <- assay + data <- FetchData(object = so.sub, vars = genes_of_interest, slot = slot) + + data[[ident_of_interest]] <- so.sub@meta.data[row.names(data),ident_of_interest] + + df.melt <- melt(data) + + if(!is.empty(rename_ident)){ + ident_of_interest <- rename_ident + } + colnames(df.melt) <- c(ident_of_interest,"Gene","Expression") + + #check to see if ident of interest looks numeric + if(suppressWarnings(all(!is.na(as.numeric(as.character(df.melt[[ident_of_interest]])))))){ + ident.values <- strtoi(df.melt[[ident_of_interest]]) + ident.levels <- unique(ident.values)[order(unique(ident.values))] + df.melt[[ident_of_interest]] <- factor(ident.values, levels = ident.levels) + }else if(reorder_ident){ + # if non-numeric, place in order of groups of interests + df.melt[[ident_of_interest]] <- factor(df.melt[[ident_of_interest]], levels = groups_of_interest) + } + + # Filter outliers + if(filter_outliers){ + for(gene in genes_of_interest){ + for(group in groups_of_interest){ + current.ind <- which(df.melt[["Gene"]] == gene & df.melt[[ident_of_interest]] == group) + df.melt[current.ind,"Expression"] <- remove_outliers_func(df.melt[current.ind,"Expression", drop = TRUE]) + } + } + } + + # Scale data to y limit + if(scale_data){ + expression_data = "scaled" + axis.title.y = "Expression (scaled)" + ylimit <- ylimit %||% 1 + df.melt <- df.melt %>% group_by(Gene) %>% mutate(scaled = scales::rescale(Expression, to=c(0,ylimit))) + }else{ + expression_data <- axis.title.y <- "Expression" + } + + g <- ggplot(df.melt, aes_string(x=ident_of_interest, y=expression_data)) + + geom_violin(aes_string(fill = ident_of_interest), scale="width", trim = FALSE, show.legend = FALSE) + + #geom_jitter(height = 0, width = 0.05, size=0.1) + + theme_classic() + + #scale_fill_manual(values=cols) + + labs(y=axis.title.y) + + theme(strip.text.y = element_text( + color="blue", face="bold.italic", angle = -90)) + + + if(!is.null(ylimit)){ + g <- g + ylim(0,ylimit) + } + + if(jitter_points){ + g <- g + geom_jitter(height = 0, width = jitter_width, size=jitter_dot_size) + } + + if(log_scale_data){ + g <- g + scale_y_log10() + } + + # Plot after jitter if wanted + g <- g + geom_boxplot(width=0.1, fill="white", outlier.shape = ifelse(print_outliers,19,NA)) + + # Plot styles + ncol = ceiling(length(unique(df.melt$Gene))^0.5) + nrow = ceiling(length(unique(df.melt$Gene)) / ncol) + if(plot_style == "rows"){ + g <- g + facet_grid(rows = vars(Gene)) + }else{ + g <- g + facet_wrap(~Gene, nrow = nrow, ncol = ncol) + if(plot_style == "labeled"){ + plots <- splitFacet(g) + plots <- lapply(seq_along(plots), function(i) plots[[i]] + ggtitle(genes_of_interest[i]) + theme(plot.title = element_text(hjust = 0.5)) ) + g <- plot_grid(plotlist = plots, nrow = nrow, ncol = ncol, labels = LETTERS[seq( from = 1, to = length(plots) )]) + } + } + + return(g) +} diff --git a/SCWorkflow.Rproj b/SCWorkflow.Rproj new file mode 100644 index 0000000..21a4da0 --- /dev/null +++ b/SCWorkflow.Rproj @@ -0,0 +1,17 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source diff --git a/tests/testthat/fixtures/MS_Levels_demo.csv b/tests/testthat/fixtures/MS_Levels_demo.csv new file mode 100644 index 0000000..c066f09 --- /dev/null +++ b/tests/testthat/fixtures/MS_Levels_demo.csv @@ -0,0 +1,3 @@ +level_1,level_2 +Pan_Tcells-CD4,CD4-Tregs +Pan_Tcells-CD8, diff --git a/tests/testthat/fixtures/Marker_Table_demo.csv b/tests/testthat/fixtures/Marker_Table_demo.csv new file mode 100644 index 0000000..793ce1f --- /dev/null +++ b/tests/testthat/fixtures/Marker_Table_demo.csv @@ -0,0 +1,19 @@ +CD8_T,CD4_T,Tregs,Macrophages,M1,M2,Neutrophils,Monocytes,cDCs,pDC,B_cells,NKs +Cd3e,Cd3e,Cd3e,Itgam,Cd38,Cd38_neg,Ly6g,Itgam,Ly6g_neg,Ccr9,Cd79a,Cd3e_neg +Cd8a,Cd4,Cd4,Ly6g_neg,Mrc1_neg,Mrc1,Ly6c1_neg,Ly6c1,Adgre1_neg,Siglech,Fcmr,Ncr1 +Cd3d,Foxp3_neg,Foxp3,Adgre1,Nos2,Cd163,Adgre1_neg,Ly6g_neg,Itgax,Cox6a2,,Ifng +Cd3g,Cd3d,Cd3d,Cd68,Sell_neg,,S100a9,Vcan,Cd209a,Klk1,,Klrb1c +Cd4_neg,Cd3g,Cd8a_neg,Siglec1,Cd40,,S100a8,Fn1,Cd24a,,, +Foxp3_neg,Cd8a_neg,Il2ra,Csf1r,Cd86,,Il1b,Ccr2,H2-Ab1,,, +Gzmk,Prf1_neg,,Apoe,,,G0s2,Csf1r,,,, +,,,Mafb,,,Csf3r,Mafb,,,, +,,,Cx3cr1,,,,Nr4a1,,,, +,,,Nr4a1_neg,,,,Lgals3,,,, +,,,F13a1,,,,,,,, +,,,Lgals3,,,,,,,, +,,,Cxcl9,,,,,,,, +,,,Cxcl10,,,,,,,, +,,,Sell_neg,,,,,,,, +,,,Arg1,,,,,,,, +,,,Mmp13,,,,,,,, +,,,Mmp12,,,,,,,, diff --git a/tests/testthat/test-Color_by_Genes_Automatic.R b/tests/testthat/test-Color_by_Genes_Automatic.R new file mode 100644 index 0000000..2a3a810 --- /dev/null +++ b/tests/testthat/test-Color_by_Genes_Automatic.R @@ -0,0 +1,14 @@ +test_that("Color by Genes Automatic returns correct class of figures", { + + so_demo <- readRDS(test_path("fixtures", "SO_moduleScore.rds")) + marker_tab_demo <- read.csv(test_path("fixtures", "Marker_Table_demo.csv")) + + Cbg_demo <- color_by_genes(SO = so_demo, + samples_to_include = c("1_E13","2_E15","3_Newborn","4_Adult"), + samples_to_display = c("1_E13","2_E15","3_Newborn","4_Adult"), + marker_list = marker_tab_demo, + cells_of_interest = c("CD8_T","CD4_T","Tregs","Macrophages")) + + expected_elements <- c("gtable", "gTree", "grob", "gDesc") + expect_setequal(class(Cbg_demo), expected_elements) +}) diff --git a/tests/testthat/test-Harmony_Batch_Correction.R b/tests/testthat/test-Harmony_Batch_Correction.R new file mode 100644 index 0000000..0ec15fe --- /dev/null +++ b/tests/testthat/test-Harmony_Batch_Correction.R @@ -0,0 +1,11 @@ +test_that("Harmony returns seurat object with adjusted embeddings", { + + so_demo <- readRDS(test_path("fixtures", "SO_moduleScore.rds")) + + so_harmonized <- harmony_batch_correct(so = so_demo, + genes_to_add = c("Epcam"), + group.by.vars = c("orig.ident")) + + expected_elements = c("pca","umap", "tsne", "harmony") + expect_setequal(names(so_harmonized@reductions), expected_elements) +}) diff --git a/tests/testthat/test-ModuleScore.R b/tests/testthat/test-ModuleScore.R new file mode 100644 index 0000000..8807da5 --- /dev/null +++ b/tests/testthat/test-ModuleScore.R @@ -0,0 +1,21 @@ +test_that("ModuleScore returns metadata with scores and cell calls", { + + so_demo <- readRDS(test_path("fixtures", "SO_moduleScore.rds")) + marker_tab_demo <- read.csv(test_path("fixtures", "Marker_Table_demo.csv")) + levels_df_demo = read.csv(test_path("fixtures", "MS_Levels_demo.csv")) + + modscore_demo <- ModuleScore(SO = so_demo, + sample_names = c("1_E13","2_E15","3_Newborn","4_Adult"), + sample_to_display <- c("1_E13","2_E15","3_Newborn","4_Adult"), + geneset_dataframe = marker_tab_demo, + proteins_presence <- FALSE, + celltypes_to_analyze <- c("CD8_T","CD4_T","Tregs","Macrophages","M1","M2","Neutrophils","Monocytes","cDCs","pDC","B_cells","NKs"), + manual_threshold <- c(0), + general_class <- c("CD8_T","CD4_T","Tregs","Macrophages","M1","M2","Neutrophils","Monocytes","cDCs","pDC","B_cells","NKs"), + multi_level_class <- FALSE, + levels_dataframe = levels_df_demo, + nbins = 10) + + expected_elements <- c("Likely_CellType","CD8_T","CD4_T","Tregs","Macrophages","M1","Neutrophils","Monocytes","cDCs","pDC") + expect_setequal(colnames(modscore_demo[[2]]@meta.data)[30:39], expected_elements) +}) diff --git a/tests/testthat/test-Pseudobulk_DEG.R b/tests/testthat/test-Pseudobulk_DEG.R new file mode 100644 index 0000000..ae57e65 --- /dev/null +++ b/tests/testthat/test-Pseudobulk_DEG.R @@ -0,0 +1,11 @@ +test_that("Pseudobulk DEG returns table of differentially expressed genes", { + so_demo <- readRDS(test_path("fixtures", "SO_moduleScore.rds")) + + pseudo_res <- Pseudobulk_DEG(so = so_demo, contrasts = c("CD4_T-Tregs"), replicate = 'orig.ident', comparison_level = 'Likely_CellType', + label = "Likely_CellType") + + expected_elements = c("Gene","cell_type","groupCD4_T-groupTregs_FC","groupCD4_T-groupTregs_logFC", + "groupCD4_T-groupTregs_tstat","groupCD4_T-groupTregs_pval", + "groupCD4_T-groupTregs_adjpval","de_family","de_method","de_type") + expect_setequal(colnames(pseudo_res), expected_elements) +}) diff --git a/tests/testthat/test-Violin_Plots_by_Metadata.R b/tests/testthat/test-Violin_Plots_by_Metadata.R new file mode 100644 index 0000000..901d8a0 --- /dev/null +++ b/tests/testthat/test-Violin_Plots_by_Metadata.R @@ -0,0 +1,12 @@ +test_that("Violin plot returns ggplot object", { + + so_demo <- readRDS(test_path("fixtures", "SO_moduleScore.rds")) + genes_demo <- read.csv(test_path("fixtures", "Marker_Table_demo.csv"))[,1][1:4] + + violin_res <- ViolinPlot(so = so_demo, ident_of_interest = "orig_ident", + groups_of_interest = c("1_E13","2_E15","3_Newborn","4_Adult"), + genes_of_interest = genes_demo) + + expected_elements = c("gg","ggplot") + expect_setequal(class(violin_res), expected_elements) +})