From 05c6469393fca1a62f05c93b4f4497e73f936892 Mon Sep 17 00:00:00 2001
From: Elizabeth Tseng <magdoll@gmail.com>
Date: Tue, 12 Dec 2023 08:53:30 -0800
Subject: [PATCH] Update examples.md (#56)

Updated example + data to Kinnex data releases
---
 docs/clustering/examples.md | 109 +++++++++++++++++-------------------
 1 file changed, 50 insertions(+), 59 deletions(-)

diff --git a/docs/clustering/examples.md b/docs/clustering/examples.md
index db4f52d..bae4caa 100644
--- a/docs/clustering/examples.md
+++ b/docs/clustering/examples.md
@@ -8,76 +8,67 @@ nav_order: 4
 ## Real-world example
 
 ### Single sample
-This is an example of an end-to-end cmd-line-only workflow to get from
-HiFi reads to transcripts. It's a 1% subsampled Alzheimer dataset.
-You can download the HiFi reads generated by CCS v4.2:
 
-    # Download the pre-computed HiFi reads
-    $ wget https://downloads.pacbcloud.com/public/dataset/IsoSeq_sandbox/2020_Alzheimer8M_subset/alz.1perc.ccs.bam
+This is a toy dataset consisting of ~80k segmented reads (S-reads) from a Kinnex full-length RNA library. The original HiFi reads have already been segmented using Skera, so we can proceed directly to primer removal with lima.
+    
+    # Download toy S-read dataset
+    $ wget https://downloads.pacbcloud.com/public/dataset/IsoSeq_sandbox/human_80k_Sreads.segmented.bam
 
-    $ cat primers.fasta
-    >NEB_5p
-    GCAATGAAGTCGCAGGGTTGGGG
-    >Clontech_5p
-    AAGCAGTGGTATCAACGCAGAGTACATGGGG
-    >NEB_Clontech_3p
-    GTACTCTGCGTTGATACCACTGCTT
+    # Download the Iso-Seq v2 cDNA primers (from the Iso-Seq express 2.0 kit)
+    $ wget https://downloads.pacbcloud.com/public/dataset/Kinnex-full-length-RNA/REF-primers/IsoSeq_v2_primers_12.fasta
 
     $ lima --version
-    lima 1.11.0 (commit v1.11.0)
-
-    $ lima alz.1perc.ccs.bam primers.fasta alz.fl.bam --isoseq --peek-guess
-
-    $ ls alz.fl*
-    alz.fl.json         alz.fl.lima.summary
-    alz.fl.lima.clips   alz.fl.NEB_5p--NEB_Clontech_3p.bam
-    alz.fl.lima.counts  alz.fl.NEB_5p--NEB_Clontech_3p.bam.pbi
-    alz.fl.lima.guess   alz.fl.NEB_5p--NEB_Clontech_3p.subreadset.xml
-    alz.fl.lima.report
-
-    $ isoseq refine alz.fl.NEB_5p--NEB_Clontech_3p.bam primers.fasta alz.flnc.bam
-
-    $ ls alz.flnc.*
-    alz.flnc.bam                   alz.flnc.filter_summary.json
-    alz.flnc.bam.pbi               alz.flnc.report.csv
-    alz.flnc.consensusreadset.xml
-
-    $ isoseq cluster alz.flnc.bam clustered.bam --verbose --use-qvs
-    Read BAM                 : (37648) 1s 235ms
-    Convert to reads         : 589ms 797us
-    Sort Reads               : 8ms 409us
-    Aligning Linear          : 23s 63ms
-    Read to clusters         : 861ms 287us
-    Aligning Linear          : 20s 279ms
-    Merge by mapping         : 7s 242ms
-    Consensus                : 4s 663ms
-    Merge by mapping         : 980ms 742us
-    Consensus                : 103ms 913us
-    Write output             : 1s 799ms
-
-    $ ls clustered*
-    clustered.bam                 clustered.hq.fasta.gz
-    clustered.bam.pbi             clustered.lq.bam
-    clustered.cluster             clustered.lq.bam.pbi
-    clustered.cluster_report.csv  clustered.lq.fasta.gz
-    clustered.hq.bam              clustered.transcriptset.xml
-    clustered.hq.bam.pbi
-
+    lima 2.9.0
+
+    $ lima human_80k_Sreads.segmented.bam IsoSeq_v2_primers_12.fasta output.bam --isoseq --peek-guess
+
+    $ ls output.*
+    output.IsoSeqX_bc10_5p--IsoSeqX_3p.bam
+    output.IsoSeqX_bc10_5p--IsoSeqX_3p.bam.pbi
+    output.IsoSeqX_bc10_5p--IsoSeqX_3p.consensusreadset.xml
+    output.json
+    output.lima.clips
+    output.lima.counts
+    output.lima.guess
+    output.lima.report
+    output.lima.summary
+
+
+    $ isoseq refine output.IsoSeqX_bc10_5p--IsoSeqX_3p.bam IsoSeq_v2_primers_12.fasta flnc.bam --require-polya
+
+    $ ls flnc.*
+    flnc.bam
+    flnc.bam.pbi
+    flnc.consensusreadset.xml
+    flnc.filter_summary.report.json
+    flnc.report.csv
+
+    $ isoseq cluster2 flnc.bam transcripts.bam
+    
+
+    $ ls transcripts*
+    transcripts.bam
+    transcripts.bam.pbi
+    transcripts.cluster_report.csv
+    
 ### Multiplexed samples
 
-    # Download HiFi reads
-    $ wget https://downloads.pacbcloud.com/public/dataset/IsoSeq_sandbox/2020_MultiplexIsoSeq_toy/m54363_190223_194117.ccs.bam
+This is a 12-plex regular Iso-Seq (non-Kinnex) run on a Sequel II system, consisting of ~3 million HiFi reads.
+
+    # Download HiFi reads from a non-Kinnex (regular Iso-Seq) BAM file
+    $ wget https://downloads.pacbcloud.com/public/dataset/Kinnex-full-length-RNA/DATA-SQ2-UHRR-Monomer/1-CCS/m64307e_230628_025302.hifi_reads.bam
+    $ wget https://downloads.pacbcloud.com/public/dataset/Kinnex-full-length-RNA/DATA-SQ2-UHRR-Monomer/1-CCS/m64307e_230628_025302.hifi_reads.bam.pbi
 
-    # Download barcoded primers
-    $ wget https://downloads.pacbcloud.com/public/dataset/IsoSeq_sandbox/2020_MultiplexIsoSeq_toy/NEB_barcode16.fasta
+    # Download the Iso-Seq v2 cDNA primers (from the Iso-Seq express 2.0 kit)
+    $ wget https://downloads.pacbcloud.com/public/dataset/Kinnex-full-length-RNA/REF-primers/IsoSeq_v2_primers_12.fasta
 
     # Demux and primer removal
-    $ lima m54363_190223_194117.ccs.bam NEB_barcode16.fasta fl.bam --isoseq --peek-guess
+    $ lima --isoseq --peek-guess m64307e_230628_025302.hifi_reads.bam IsoSeq_v2_primers_12.fasta output.bam
 
     # Combine inputs
-    $ ls fl.bc1001_5p--bc1001_3p.bam fl.bc1002_5p--bc1002_3p.bam > all.fofn
+    $ ls output.IsoSeqX*bam > all.fofn
 
     # Remove poly(A) tails and concatemer
-    $ isoseq refine all.fofn NEB_barcode16.fasta flnc.bam --require-polya
+    $ isoseq refine all.fofn IsoSeq_v2_primers_12.fasta flnc.bam --require-polya
 
-    $ isoseq cluster flnc.bam clustered.bam --use-qvs --verbose
+    $ isoseq cluster2 flnc.bam transcripts.bam