From 05c6469393fca1a62f05c93b4f4497e73f936892 Mon Sep 17 00:00:00 2001 From: Elizabeth Tseng Date: Tue, 12 Dec 2023 08:53:30 -0800 Subject: [PATCH] Update examples.md (#56) Updated example + data to Kinnex data releases --- docs/clustering/examples.md | 109 +++++++++++++++++------------------- 1 file changed, 50 insertions(+), 59 deletions(-) diff --git a/docs/clustering/examples.md b/docs/clustering/examples.md index db4f52d..bae4caa 100644 --- a/docs/clustering/examples.md +++ b/docs/clustering/examples.md @@ -8,76 +8,67 @@ nav_order: 4 ## Real-world example ### Single sample -This is an example of an end-to-end cmd-line-only workflow to get from -HiFi reads to transcripts. It's a 1% subsampled Alzheimer dataset. -You can download the HiFi reads generated by CCS v4.2: - # Download the pre-computed HiFi reads - $ wget https://downloads.pacbcloud.com/public/dataset/IsoSeq_sandbox/2020_Alzheimer8M_subset/alz.1perc.ccs.bam + This is a toy dataset consisting of ~80k segmented reads (S-reads) from a Kinnex full-length RNA library. The original HiFi reads have already been segmented using Skera, so we can directly go into primer removal using lima and such. 
+ + # Download toy S-read dataset + $ wget https://downloads.pacbcloud.com/public/dataset/IsoSeq_sandbox/human_80k_Sreads.segmented.bam - $ cat primers.fasta - >NEB_5p - GCAATGAAGTCGCAGGGTTGGGG - >Clontech_5p - AAGCAGTGGTATCAACGCAGAGTACATGGGG - >NEB_Clontech_3p - GTACTCTGCGTTGATACCACTGCTT + # Download the Iso-Seq v2 cDNA primers (from Iso-Seq express 2.0 kit) + $ wget https://downloads.pacbcloud.com/public/dataset/Kinnex-full-length-RNA/REF-primers/IsoSeq_v2_primers_12.fasta $ lima --version - lima 1.11.0 (commit v1.11.0) - - $ lima alz.1perc.ccs.bam primers.fasta alz.fl.bam --isoseq --peek-guess - - $ ls alz.fl* - alz.fl.json alz.fl.lima.summary - alz.fl.lima.clips alz.fl.NEB_5p--NEB_Clontech_3p.bam - alz.fl.lima.counts alz.fl.NEB_5p--NEB_Clontech_3p.bam.pbi - alz.fl.lima.guess alz.fl.NEB_5p--NEB_Clontech_3p.subreadset.xml - alz.fl.lima.report - - $ isoseq refine alz.fl.NEB_5p--NEB_Clontech_3p.bam primers.fasta alz.flnc.bam - - $ ls alz.flnc.* - alz.flnc.bam alz.flnc.filter_summary.json - alz.flnc.bam.pbi alz.flnc.report.csv - alz.flnc.consensusreadset.xml - - $ isoseq cluster alz.flnc.bam clustered.bam --verbose --use-qvs - Read BAM : (37648) 1s 235ms - Convert to reads : 589ms 797us - Sort Reads : 8ms 409us - Aligning Linear : 23s 63ms - Read to clusters : 861ms 287us - Aligning Linear : 20s 279ms - Merge by mapping : 7s 242ms - Consensus : 4s 663ms - Merge by mapping : 980ms 742us - Consensus : 103ms 913us - Write output : 1s 799ms - - $ ls clustered* - clustered.bam clustered.hq.fasta.gz - clustered.bam.pbi clustered.lq.bam - clustered.cluster clustered.lq.bam.pbi - clustered.cluster_report.csv clustered.lq.fasta.gz - clustered.hq.bam clustered.transcriptset.xml - clustered.hq.bam.pbi - + lima 2.9.0 + + $ lima human_80k_Sreads.segmented.bam IsoSeq_v2_primers_12.fasta output.bam --isoseq --peek-guess + + $ ls output.* + output.IsoSeqX_bc10_5p--IsoSeqX_3p.bam + output.IsoSeqX_bc10_5p--IsoSeqX_3p.bam.pbi + output.IsoSeqX_bc10_5p--IsoSeqX_3p.consensusreadset.xml + 
output.json + output.lima.clips + output.lima.counts + output.lima.guess + output.lima.report + output.lima.summary + + + $ isoseq refine output.IsoSeqX_bc10_5p--IsoSeqX_3p.bam IsoSeq_v2_primers_12.fasta flnc.bam --require-polya + + $ ls flnc.* + flnc.bam + flnc.bam.pbi + flnc.consensusreadset.xml + flnc.filter_summary.report.json + flnc.report.csv + + $ isoseq cluster2 flnc.bam transcripts.bam + + + $ ls transcripts* + transcripts.bam + transcripts.bam.pbi + transcripts.cluster_report.csv + ### Multiplexed samples - # Download HiFi reads - $ wget https://downloads.pacbcloud.com/public/dataset/IsoSeq_sandbox/2020_MultiplexIsoSeq_toy/m54363_190223_194117.ccs.bam + This is a 12-plex regular Iso-Seq (non-Kinnex) run on a Sequel II system consisting of ~3 million HiFi reads. + + # Download HiFi reads from a non-Kinnex (regular Iso-Seq) BAM file + $ wget https://downloads.pacbcloud.com/public/dataset/Kinnex-full-length-RNA/DATA-SQ2-UHRR-Monomer/1-CCS/m64307e_230628_025302.hifi_reads.bam + $ wget https://downloads.pacbcloud.com/public/dataset/Kinnex-full-length-RNA/DATA-SQ2-UHRR-Monomer/1-CCS/m64307e_230628_025302.hifi_reads.bam.pbi - # Download barcoded primers - $ wget https://downloads.pacbcloud.com/public/dataset/IsoSeq_sandbox/2020_MultiplexIsoSeq_toy/NEB_barcode16.fasta + # Download the Iso-Seq v2 cDNA primers (from Iso-Seq express 2.0 kit) + $ wget https://downloads.pacbcloud.com/public/dataset/Kinnex-full-length-RNA/REF-primers/IsoSeq_v2_primers_12.fasta # Demux and primer removal - $ lima m54363_190223_194117.ccs.bam NEB_barcode16.fasta fl.bam --isoseq --peek-guess + $ lima --isoseq --peek-guess m64307e_230628_025302.hifi_reads.bam IsoSeq_v2_primers_12.fasta output.bam # Combine inputs - $ ls fl.bc1001_5p--bc1001_3p.bam fl.bc1002_5p--bc1002_3p.bam > all.fofn + $ ls output.IsoSeqX*bam > all.fofn # Remove poly(A) tails and concatemer - $ isoseq refine all.fofn NEB_barcode16.fasta flnc.bam --require-polya + $ isoseq refine all.fofn IsoSeq_v2_primers_12.fasta flnc.bam 
--require-polya - $ isoseq cluster flnc.bam clustered.bam --use-qvs --verbose + $ isoseq cluster2 flnc.bam transcripts.bam