In this example, we will be integrating two datasets from adult mouse brain generated using 10X and snATAC-seq. All the datasets used in this example can be downloaded from here.
- Step 0. Download data
- Step 1. Create snap object
- Step 2. Select barcode
- Step 3. Add cell-by-bin matrix
- Step 4. Combine snap objects
- Step 5. Filter bins
- Step 6. Dimensionality reduction
- Step 7. Determine significant components
- Step 8. Remove batch effect
- Step 9. Graph-based cluster
- Step 10. Visualization
$ wget http://renlab.sdsc.edu/r3fang/share/github/Mouse_Brain_10X_snATAC/CEMBA180305_2B.snap
$ wget http://renlab.sdsc.edu/r3fang/share/github/Mouse_Brain_10X_snATAC/CEMBA180305_2B.barcode.txt
$ wget http://renlab.sdsc.edu/r3fang/share/github/Mouse_Brain_10X_snATAC/atac_v1_adult_brain_fresh_5k.snap
$ wget http://renlab.sdsc.edu/r3fang/share/github/Mouse_Brain_10X_snATAC/atac_v1_adult_brain_fresh_5k.barcode.txt
Step 1. Create snap object
In this example, we will create a list of snap objects that contains two datasets.
> library(SnapATAC);
> file.list = c("CEMBA180305_2B.snap", "atac_v1_adult_brain_fresh_5k.snap");
> sample.list = c("snATAC", "10X");
> x.sp.ls = lapply(seq(file.list), function(i){
x.sp = createSnap(file=file.list[i], sample=sample.list[i]);
x.sp
})
> names(x.sp.ls) = sample.list;
> sample.list
## $snATAC
## number of barcodes: 15136
## number of bins: 0
## number of genes: 0
## number of peaks: 0
## number of motifs: 0
##
## $`10X`
## number of barcodes: 20000
## number of bins: 0
## number of genes: 0
## number of peaks: 0
## number of motifs: 0
Step 2. Select barcode
Next, we read the barcode list which contains the high-quality barcodes. See here for barcode selection.
> barcode.file.list = c("CEMBA180305_2B.barcode.txt", "atac_v1_adult_brain_fresh_5k.barcode.txt");
> barcode.list = lapply(barcode.file.list, function(file){
read.table(file)[,1];
})
> x.sp.list = lapply(seq(x.sp.ls), function(i){
x.sp = x.sp.ls[[i]];
x.sp = x.sp[x.sp@barcode %in% barcode.list[[i]],];
})
> names(x.sp.list) = sample.list;
## $snATAC
## number of barcodes: 9646
## number of bins: 0
## number of genes: 0
## number of peaks: 0
## number of motifs: 0
##
## $`10X`
## number of barcodes: 4100
## number of bins: 0
## number of genes: 0
## number of peaks: 0
## number of motifs: 0
Step 3. Add cell-by-bin matrix
> x.sp.list = lapply(seq(x.sp.list), function(i){
x.sp = addBmatToSnap(x.sp.list[[i]], bin.size=5000);
x.sp
})
> x.sp.list
## $snATAC
## number of barcodes: 9646
## number of bins: 545118
## number of genes: 0
## number of peaks: 0
## number of motifs: 0
##
## $`10X`
## number of barcodes: 4100
## number of bins: 546206
## number of genes: 0
## number of peaks: 0
## number of motifs: 0
These two snap objects have different number of bins because they were generated seperately using slightly different reference genome.
Step 4. Combine snap objects
To combine these two snap objects, common bins are selected.
> bin.shared = Reduce(intersect, lapply(x.sp.list, function(x.sp) x.sp@feature$name));
> x.sp.list <- lapply(x.sp.list, function(x.sp){
idy = match(bin.shared, x.sp@feature$name);
x.sp[,idy, mat="bmat"];
})
> x.sp = Reduce(snapRbind, x.sp.list);
> rm(x.sp.list); # free memory
> gc();
> table(x.sp@sample);
## 10X snATAC
## 4100 9646
> x.sp = makeBinary(x.sp, mat="bmat");
Step 6. Filter bins
First, we filter out any bins overlapping with the ENCODE blacklist to prevent from potential artifacts.
> system("wget http://mitra.stanford.edu/kundaje/akundaje/release/blacklists/mm10-mouse/mm10.blacklist.bed.gz");
> library(GenomicRanges);
> black_list = read.table("mm10.blacklist.bed.gz");
> black_list.gr = GRanges(
black_list[,1],
IRanges(black_list[,2], black_list[,3])
);
> idy = queryHits(findOverlaps(x.sp@feature, black_list.gr));
> if(length(idy) > 0){x.sp = x.sp[,-idy, mat="bmat"]};
> x.sp
## number of barcodes: 13746
## number of bins: 545015
## number of genes: 0
## number of peaks: 0
## number of motifs: 0
Second, we remove unwanted chromosomes.
> chr.exclude = seqlevels(x.sp@feature)[grep("random|chrM", seqlevels(x.sp@feature))];
> idy = grep(paste(chr.exclude, collapse="|"), x.sp@feature);
> if(length(idy) > 0){x.sp = x.sp[,-idy, mat="bmat"]};
> x.sp
## number of barcodes: 13746
## number of bins: 545011
## number of genes: 0
## number of peaks: 0
## number of motifs: 0
Third, the bin coverage roughly obeys a log normal distribution. We remove the top 5% bins that overlap with invariant features such as promoters of the house keeping genes.
> bin.cov = log10(Matrix::colSums(x.sp@bmat)+1);
> bin.cutoff = quantile(bin.cov[bin.cov > 0], 0.95);
> idy = which(bin.cov <= bin.cutoff & bin.cov > 0);
> x.sp = x.sp[, idy, mat="bmat"];
> x.sp
## number of barcodes: 13746
## number of bins: 479127
## number of genes: 0
## number of peaks: 0
## number of motifs: 0
Step 7. Reduce dimensionality
We compute landmark diffusion maps for dimentionality reduction. We first randomly sample 10,000 cells as landmarks and project the remaining cells (query) onto the diffusion maps embedding.
> row.covs = log10(Matrix::rowSums(x.sp@bmat)+1);
> row.covs.dens = density(
x = row.covs,
bw = 'nrd', adjust = 1
);
> sampling_prob = 1 / (approx(x = row.covs.dens$x, y = row.covs.dens$y, xout = row.covs)$y + .Machine$double.eps);
> set.seed(1);
> idx.landmark.ds = sort(sample(x = seq(nrow(x.sp)), size = 10000, prob = sampling_prob));
> x.landmark.sp = x.sp[idx.landmark.ds,];
> x.query.sp = x.sp[-idx.landmark.ds,];
> x.landmark.sp = runDiffusionMaps(
obj= x.landmark.sp,
input.mat="bmat",
num.eigs=50
);
> x.query.sp = runDiffusionMapsExtension(
obj1=x.landmark.sp,
obj2=x.query.sp,
input.mat="bmat"
);
> x.landmark.sp@metaData$landmark = 1;
> x.query.sp@metaData$landmark = 0;
> x.sp = snapRbind(x.landmark.sp, x.query.sp);
## combine landmarks and query cells;
> x.sp = x.sp[order(x.sp@sample),]; # IMPORTANT
> rm(x.landmark.sp, x.query.sp); # free memory
Step 8. Determine significant components
> plotDimReductPW(
obj=x.sp,
eigs.dims=1:50,
point.size=0.3,
point.color="grey",
point.shape=19,
point.alpha=0.6,
down.sample=5000,
pdf.file.name=NULL,
pdf.height=7,
pdf.width=7
);
> library(harmony);
> x.after.sp = runHarmony(
obj=x.sp,
eigs.dims=1:22,
meta_data=x.sp@sample # sample index
);
> x.after.sp = runKNN(
obj= x.after.sp,
eigs.dim=1:22,
k=15
);
> x.after.sp = runCluster(
obj=x.after.sp,
tmp.folder=tempdir(),
louvain.lib="R-igraph",
path.to.snaptools=NULL,
seed.use=10
);
> x.after.sp@metaData$cluster = x.after.sp@cluster;
> x.sp = runViz(
obj=x.sp,
tmp.folder=tempdir(),
dims=2,
eigs.dims=1:22,
method="Rtsne",
seed.use=10
);
> x.after.sp = runViz(
obj=x.after.sp,
tmp.folder=tempdir(),
dims=2,
eigs.dims=1:22,
method="Rtsne",
seed.use=10
);
> par(mfrow = c(2, 3));
> plotViz(
obj=x.sp,
method="tsne",
main="Before Harmony",
point.color=x.sp@sample,
point.size=0.1,
text.add= FALSE,
down.sample=10000,
legend.add=TRUE
);
> plotViz(
obj=x.after.sp,
method="tsne",
main="After Harmony",
point.color=x.sp@sample,
point.size=0.1,
text.add=FALSE,
down.sample=10000,
legend.add=TRUE
);
> plotViz(
obj=x.after.sp,
method="tsne",
main="Cluster",
point.color=x.after.sp@cluster,
point.size=0.1,
text.add=TRUE,
text.size=1,
text.color="black",
text.halo.add=TRUE,
text.halo.color="white",
text.halo.width=0.2,
down.sample=10000,
legend.add=FALSE
);