compbiocore · Nando1014 · Jan 22, 2021 · Jan 22, 2021 · Jan 22, 2021 · Jan 22, 2021
diff --git a/docs/tutorials/download_sra_doc.md b/docs/tutorials/download_sra_doc.md
@@ -0,0 +1,124 @@
+# Downloading SRA Files
+
+QCDB is currently built to use the outputs generated by Bioflows \([https://compbiocore.github.io/bioflows/](https://compbiocore.github.io/bioflows/)\), a package designed to automate running bioinformatics workflows. To run bioflows:
+
+* Connect to Oscar: `ssh <username>@ssh.ccv.brown.edu` and enter your password when prompted
+* Switch to the cbcollab group: `newgrp cbcollab`
+* Create a YAML file to download reads from SRA and run them in bioflows -- here is an example that downloads human transcriptome data, runs some QC on the reads, aligns them to the genome, and runs some alignment QC.
+  * This example has dummy values in place for `sample_manifest:sra:entrez_email` and `run_parms:ssh_user` -- these should be edited before attempting to use this example.
+  * To run bioflows on a different SRA sample, edit the fields for `bioproject:` and `sample_manifest:sra:id`. You may also need to edit `run_parms:gtf_file:` and `run_parms:reference_fasta_path` to reflect the appropriate reference and annotations, as well as `workflow_sequence:gsnap` if gsnap is not the appropriate alignment tool for your data.
+
+```text
+bioproject: PRJNA240916
+experiment: sra_test
+sample_manifest:
+  sra:  
+      id: SRS594907
+      entrez_email: user@example.com
+      downloads: False
+run_parms:
+  conda_command: source /gpfs/runtime/cbc_conda/bin/activate_cbc_conda
+  work_dir: /gpfs/data/cbc/qcdb_populate
+  log_dir: logs
+  paired_end: False
+  local_targets: False
+  saga_host: localhost
+  ssh_user: username
+  saga_scheduler: slurm
+  gtf_file: /gpfs/data/cbc/cbcollab/ref_tools/Ensembl_hg_GRCh37_rel87/Ensembl_Homo_sapiens.GRCh37.87.gtf
+  reference_fasta_path: /gpfs/data/cbc/cbcollab/ref_tools/Ensembl_hg_GRCh37_rel87/Ensembl_Homo_sapiens.GRCh37.dna.primary_assembly.fa
+workflow_sequence:
+  - fastqc: default
+  - gsnap:
+      options:
+        -d: Ensembl_Homo_sapiens_GRCh37
+        -s: /gpfs/data/cbc/cbcollab/cbc_ref/gmapdb_2017.01.14/Ensembl_Homo_sapiens_GRCh37/Ensembl_Homo_sapiens_GRCh37.maps/Ensembl_Homo_sapiens.GRCh37.87.splicesites.iit
+      job_params:
+        ncpus: 42
+        mem: 128000
+        time: 1400
+      suffix:
+        output: ".sam"
+  - samtools:
+      subcommand: view
+      suffix:
+        input: ".sam"
+        output: ".bam"
+      options:
+        -Sbh:
+      job_params:
+        time: 2000
+        ncpus: 8
+        mem: 128000
+  - samtools:
+      subcommand: view
+      suffix:
+        input: ".bam"
+        output: ".mapped.bam"
+      options:
+        -bh:
+        -F: "0x4"
+      job_params:
+        time: 2000
+        ncpus: 8
+        mem: 65000
+  - samtools:
+      subcommand: sort
+      suffix:
+        input: ".mapped.bam"
+        output: ".srtd.bam"
+      job_params:
+        time: 2000
+        ncpus: 8
+        mem: 175000
+  - samtools:
+      subcommand: index
+      suffix:
+        input: ".srtd.bam"
+      job_params:
+        time: 2000
+        ncpus: 8
+        mem: 65000
+  - bammarkduplicates2:
+      suffix:
+        input: ".srtd.bam"
+        output: ".dup.srtd.bam" 
+      job_params:
+        time: 2000
+        ncpus: 8
+        mem: 65000 
+  - picard:
+      subcommand: AddOrReplaceReadGroups
+  - picard:
+      subcommand: CollectAlignmentSummaryMetrics
+      suffix:
+        input: ".rg.srtd.bam"
+      options:
+        VALIDATION_STRINGENCY=LENIENT:
+  - picard:
+      subcommand: CollectInsertSizeMetrics
+      suffix:
+        input: ".rg.srtd.bam"
+      options:
+        VALIDATION_STRINGENCY=LENIENT:
+  - picard:
+      subcommand: CollectGcBiasMetrics
+      suffix:
+        input: ".rg.srtd.bam"
+      options:
+        VALIDATION_STRINGENCY=LENIENT:
+```
+
+* After saving your yaml file, start a screen session -- here is an example of how to start a screen session named `bioflows_qcdb`:
+
+`screen -S bioflows_qcdb`
+
+* Activate the conda environment:
+
+`source /gpfs/runtime/cbc_conda/bin/activate_cbc_conda`
+
+* Run bioflows using the yaml you just created -- in this example, we are assuming the yaml is called `bioflows_qcdb.yaml`:
+
+`bioflows-run bioflows_qcdb.yaml`
+
+* Bioflows will create a series of folders inside the working directory you specified in your yaml file. The outputs that will be useful for QCDB will be located in the directory called `qc`.