Skip to content

Commit

Permalink
v0.2.1; checked usage cross OS
Browse files Browse the repository at this point in the history
  • Loading branch information
josef0731 committed Feb 1, 2023
1 parent 77f57b4 commit 4197ebd
Show file tree
Hide file tree
Showing 60 changed files with 1,229 additions and 619 deletions.
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: sciCSR
Type: Package
Title: Single-cell Inference of Class Switch Recombination
Version: 0.2.0
Version: 0.2.1
Authors@R: c( person( given = "Joseph", family = "Ng",
email = "[email protected]", role = c( "aut", "cre" ) ) )
Author: Joseph Ng [aut, cre]
Expand All @@ -10,7 +10,7 @@ Description: Package for inferring the dynamics of B cell state transitions usin
License: GPL-3 + file LICENSE
VignetteBuilder: utils
Depends: R (>= 4.0),
Matrix,
Matrix (>= 1.5-3),
S4Vectors,
reticulate
Imports: GenomeInfoDb,
Expand All @@ -24,7 +24,8 @@ Imports: GenomeInfoDb,
SeuratDisk (>= 0.0.0.9020),
cowplot,
fs,
igraph,
igraph,
harmony,
nnls,
philentropy,
plyr,
Expand Down
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
export(AddCellMetaToVDJ)
export(annotatePairing)
export(collapseBCR)
export(collapseIntoMetagenes)
export(combineBCR)
export(combineLoomFiles)
export(compareTransitionMatrices)
Expand All @@ -15,6 +16,7 @@ export(getIGHreadType)
export(getSHM)
export(guessBarcodes)
export(mergeVelocytoWithGEX)
export(normalise_dimreduce)
export(plotFluxMatrix)
export(plotStationaryDistribution)
export(plot_arrows)
Expand All @@ -25,6 +27,8 @@ export(run_scVelo)
export(scanBam)
export(splitAnnData)
export(summariseIGHreads)
import(Matrix)
import(Seurat)
import(ggplot2)
import(hdf5r)
import(markovchain)
Expand Down Expand Up @@ -53,6 +57,7 @@ importFrom(SeuratDisk,SaveH5Seurat)
importFrom(cowplot,theme_cowplot)
importFrom(grid,grid.newpage)
importFrom(grid,grid.raster)
importFrom(harmony,RunHarmony)
importFrom(hdf5r,H5File)
importFrom(hdf5r,list.datasets)
importFrom(igraph,graph_from_adjacency_matrix)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# sciCSR 0.2.1 (Feb 1, 2023)
* fixed issues re distributed computing in python across MacOS and Windows.
* added functions for basic scRNA-seq data preprocessing (`normalise_dimreduce`) which includes customisable pruning of variably expressed genes, collapse VDJ (or other genes) into metagenes etc (`collapseIntoMetaGenes`).

# sciCSR 0.2.0 (Jan 30, 2023)
* removed velocyto.R dependency to avoid installation problems.
* fixed minor issues for cross-platform (MacOS / Linux) installation and use.
Expand Down
130 changes: 130 additions & 0 deletions R/data_preprocessing.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#' Normalisation and dimensionality reduction of scRNAseq gene counts
#'
#' @description
#' \code{normalise_dimreduce} is a wrapper function around the Seurat basic data processing workflow to generate normalised gene count data, its dimensionality reduction, and derivation of cell clusters.
#'
#' @details
#' \code{normalise_dimreduce} is a wrapper function around the major basic Seurat data processing workflow and performs the following steps:
#' \itemize{
#' \item{Calculation of \% mitochondrial transcripts}{ (\code{PercentageFeatureSet}) and subsetting to remove those beyond the cutoff given by \code{mt.percent}.}
#' \item{Gene count normalisation}{, using either \code{SCTransform} or \code{NormalizeData}}
#' \item{Pruning variably expressed features}{. All genes with names matching the vector of regular expression given in the argument \code{features_exclude} will be removed from this list to avoid them influencing the downstream dimensionality reduction and clustering steps. This is particularly relevant for avoiding clusters of B cells grouped by their isotypes/VDJ expression.}
#' \item{Principal component analysis (PCA)}{ (\code{Seurat::RunPCA} function)}
#' \item{Batch correction using \code{Harmony}}{: covariates given in \code{harmony_vars} will be removed in the Harmony regression step. (Optional, if \code{run.harmony == TRUE})}
#' \item{UMAP dimensionality reduction}{: \code{Seurat::RunUMAP}, retaining the top principal components, each of which explain at least \code{var_explained_lim} of the variance.}
#' \item{k-neighbor network (kNN) construction}{ (\code{Seurat::FindNeighbors})}
#' \item{Define cell clusters based on kNN graph}{ (\code{Seurat::FindClusters})}
#' }
#'
#' @param obj Seurat object with the gene counts unnormalised.
#' @param var_explained_lim numeric, the minimum proportion of variance explained cutoff for a principal component to be included in the dimensionality reduction and clustering steps (default: 0.015, i.e. 1.5\%)
#' @param run.harmony should the package \code{Harmony} be used on the data? (Default: FALSE)
#' @param harmony_vars vector of parameters to be included in the regression step in \code{Harmony}. Variations specific to these parameters will be removed during the \code{Harmony} run.
#' @param SCT Should the \code{SCTransform} pipeline be used? If not it will follow the standard Seurat normalisation workflow (\code{NormalizeData}, \code{FindVariableFeatures}, \code{ScaleData})
#' @param mt.pattern the regular expression used to identify mitochondrial transcripts (Default: ^MT-", i.e. all gene names beginning with "MT-")
#' @param mt.percent the cutoff for mitochondrial transcript percentage, above which cells will be removed from the Seurat project as part of quality control (Default: 10, i.e. cells with more than 10\% of counts mapped to mitochondrial transcripts will be removed from the Seurat object)
#' @param features_exclude a vector of regular expressions to select genes to be IGNORED during dimensionality reduction and clustering. By default the following features were included in this list: IgH, K, L V/D/JC genes, TRA/TRB V/C genes, AC233755.1 (which encodes a V-gene-like product), IGLL, JCHAIN)
#' @param ... Arguments to be passed to various Seurat functions (\code{SCTransform}, \code{NormalizeData}, \code{FindVariableFeatures}, \code{ScaleData}, \code{RunPCA}, \code{RunUMAP}, \code{FindNeighbors}, \code{FindClusters})
#'
#' @return A Seurat object with normalised gene count data, dimensionality reduction and clustering done
#'
#' @import Seurat
#' @importFrom harmony RunHarmony
#' @export normalise_dimreduce
normalise_dimreduce <- function(obj, var_explained_lim = 0.015,
run.harmony = FALSE, harmony_vars = NULL,
SCT = FALSE, mt.pattern = "^MT-", mt.percent = 10,
features_exclude = c("^IGH[MDE]", "^IGHG[1-4]", "^IGHA[1-2]",
"^IG[HKL][VDJ]", "^IGKC", "^IGLC[1-7]",
"^TR[ABGD][CV]", "^AC233755.1", "^IGLL",
"^JCHAIN"), ...)
{
if( ! inherits(obj, "Seurat") )
stop("'obj' should be a Seurat object.")
if( ! is.numeric( var_explained_lim) )
stop("'var_explained_lim' should be a numeric.")
if( length( var_explained_lim ) != 1 )
stop("'var_explained_lim' should be a numeric of length 1.")
if( var_explained_lim < 0 | var_explained_lim > 1)
stop("'var_explained_lim' should be a numericb between 0 and 1")
if( ! mt.percent | is.numeric(mt.percent) ){
obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = mt.pattern)
if( ! mt.percent ) mt.percent <- 10
subset_cells <- Cells(obj)[which(obj$percent.mt < mt.percent)]
obj <- subset(obj, cells = subset_cells)
}
if(SCT){
obj <- SCTransform(obj, ...)
assay.use <- "SCT"
} else {
obj <- NormalizeData(obj, ...)
obj <- FindVariableFeatures(obj, ...)
all.genes <- rownames(obj)
obj <- ScaleData(obj, features = all.genes, ...)
assay.use <- "RNA"
}
varfeat <- VariableFeatures(obj)
varfeat <- varfeat[!grepl("^IGH[MDE]|^IGHG[1-4]|^IGHA[1-2]|^IG[HKL][VDJ]|^IGKC|^IGLC[1-7]|^TR[ABGD][CV]|^AC233755.1|^IGLL|^JCHAIN", varfeat)]
obj <- RunPCA(obj, features = varfeat, ...)
if(run.harmony){
obj <- harmony::RunHarmony(obj, assay.use = assay.use,
group.by.vars = harmony_vars,
verbose = FALSE)
}
var_explained <- (obj@reductions$pca@stdev)^2 / sum((obj@reductions$pca@stdev)^2 )
dim <- max(which(var_explained > var_explained_lim))
cat(paste0("Top ", dim, " PCs explain variance >= ",
var_explained_lim, ".\n"))
if(run.harmony){
obj <- RunUMAP(obj, reduction = "harmony", dims = 1:dim, ...)
} else {
obj <- RunUMAP(obj, reduction = "pca", dims = 1:dim, ...)
}
obj <- FindNeighbors(obj, dims = 1:dim, ...)
obj <- FindClusters(obj, ...)
obj
}

#' Group gene counts into metagenes to minimise individual differences
#'
#' @description
#' \code{collapseIntoMetagenes} defines metagenes which sum over the gene counts mapped to a group of genes, in a single-cell gene expression count matrix.
#'
#' @details
#' \code{collapseIntoMetagenes} can be used to group transcript counts into metagenes, to remove the effect of e.g. individual variations which leads to preference of specific genes.
#' One example is the immunoglobulin VDJ genes whose expression is specific to each B cell and is indicative of clonotype rather than cell state. By summing over all individual VDJ genes into one metagene, this avoids those individual genes to influence the downstream dimensionality projection and clustering results.
#'
#' @param countmat sparse matrix containing single-cell gene expression data. Output from \code{Seurat::Read10X} or equivalent.
#' @param metagenes_definitions a vector containing regular expressions to match gene names in the row names of \code{countmat}. For each regular expression, matched genes will be summarised into one metagene (see Details). (Default: individual metagenes for ribosomal, HLA I-major, HLA I-minor, HLA II and Ig VDJ transcripts.)
#'
#' @return a sparse matrix containing count data where all the matched genes are collapsed into metagenes with names given by the names of each element in \code{metagenes_definitions}.
#'
#' @import Matrix
#' @export collapseIntoMetagenes
collapseIntoMetagenes <- function(countmat,
metagenes_definitions = c("RIBO" = "^RP[LS]|^MRP[LS]",
"HLA-Imaj" = "^HLA-[ABC]$",
"HLA-Imin" = "^HLA-[EFG]$",
"HLA-II" = "^HLA-D",
"VDJ" = "^IG[HKL][VDJ][0-9]"))
{
metagenes <- lapply(metagenes_definitions, function(x){
colSums(as.matrix(
countmat[
rownames(countmat)[grepl(x, rownames(countmat))],
]
))
})
names(metagenes) <- names(metagenes_definitions)
for(item in names(metagenes_definitions)){
n <- item
item <- metagenes_definitions[item]
# Remove individual genes from the matrix
countmat <- countmat[-(which(grepl(item, rownames(countmat)))), ]
# Append the metagene to the matrix
countmat <- rbind(countmat, metagenes[[n]])
}
# Rename the added rows
rownames(countmat[tail(1:nrow(countmat), length(metagenes_definitions))]) <- names(metagenes_definitions)
return( countmat )
}
7 changes: 2 additions & 5 deletions R/prepare_sciCSR.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,8 @@ prepare_sciCSR <- function()
conda_install(envname = 'scicsr', packages = c('scanpy', 'deeptime'), channel = 'conda-forge')
conda_install(envname = 'scicsr', packages = 'h5py', pip = TRUE, pip_options = "--force-reinstall")
conda_install(envname = 'scicsr', packages = 'scvelo', pip = TRUE, pip_options = "-U")
conda_install(envname = 'scicsr', packages = 'jupyter', channel = 'conda-forge')
conda_install(envname = 'scicsr', packages = c('python-igraph', 'louvain'),
pip = TRUE)
conda_install(envname = 'scicsr', packages = 'cellrank',
channel = c('conda-forge', 'bioconda'))
conda_install(envname = 'scicsr', packages = c('python-igraph', 'louvain'), pip = TRUE)
conda_install(envname = 'scicsr', packages = 'cellrank', pip = TRUE)
conda_install(envname = 'scicsr', packages = 'networkx==2.8.2')
conda_install(envname = 'scicsr', packages = 'multiprocess')
}
Expand Down
Loading

0 comments on commit 4197ebd

Please sign in to comment.