📝 updated readme

🔨 added ref to rnaseq wf 🔨 replaced default ref in annofuse
kids-first · Jan 25, 2023 · c5e853e · c5e853e
1 parent 233fa65
commit c5e853e
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -197,7 +197,8 @@ Kids First favors setting/overriding defaults with "arriba-heavy" specified in [
 ### annoFuse:
 ```yaml
   sample_name: {type: 'string?', doc: "Sample ID of the input reads. If not provided, will use reads1 file basename."}
-  annofuse_col_num: {type: 'int?', doc: "column number in file of fusion name."}
+  annofuse_col_num: {type: 'int?', doc: "0-based column number in file of fusion name."}
+  fusion_annotator_ref: { type: 'File', doc: "Tar ball with fusion_annot_lib.idx and blast_pairs.idx from STAR-Fusion CTAT Genome lib. Can be same as FusionGenome, but only two files needed from that package", "sbg:suggestedValue": { class: 'File', path: '63cff818facdd82011c8d6fe', name: 'GRCh38_v39_fusion_annot_custom.tar.gz' }}
 ```
 ### rmats
 ```yaml
@@ -296,6 +297,7 @@ groups"`. See the STAR documentation on `outSAMattrRGline` for complete details.
 
 ## Reference build notes:
  - STAR-Fusion reference built with command `/usr/local/STAR-Fusion/ctat-genome-lib-builder/prep_genome_lib.pl --gtf gencode.v39.primary_assembly.annotation.gtf --annot_filter_rule ../AnnotFilterRule.pm --CPU 36 --fusion_annot_lib ../fusion_lib.Mar2021.dat.gz --genome_fa ../GRCh38.primary_assembly.genome.fa --output_dir GRCh38_v39_CTAT_lib_Mar242022.CUSTOM --human_gencode_filter --pfam_db current --dfam_db human 2> build.errs > build.out &`
+ - fusion_annotator_ref built by placing GRCh38_v39_CTAT_lib_Mar242022.CUSTOM/fusion_annot_lib.idx and GRCh38_v39_CTAT_lib_Mar242022.CUSTOM/blast_pairs.idx into its own tar ball
  - kallisto index built using RSEM `RSEM_GENCODE39.transcripts.fa` file as transcriptome fasta, using command: `kallisto index -i RSEM_GENCODE39.transcripts.kallisto.idx RSEM_GENCODE39.transcripts.fa`
  - RNA-SEQc reference built using [collapse gtf script](https://github.com/broadinstitute/gtex-pipeline/blob/master/gene_model/collapse_annotation.py)
    - Two references needed if data are stranded vs. unstranded

diff --git a/workflow/kfdrc_RNAseq_workflow.cwl b/workflow/kfdrc_RNAseq_workflow.cwl
@@ -202,7 +202,8 @@ doc: |
   ### annoFuse:
   ```yaml
     sample_name: {type: 'string?', doc: "Sample ID of the input reads. If not provided, will use reads1 file basename."}
-    annofuse_col_num: {type: 'int?', doc: "column number in file of fusion name."}
+    annofuse_col_num: {type: 'int?', doc: "0-based column number in file of fusion name."}
+    fusion_annotator_ref: { type: 'File', doc: "Tar ball with fusion_annot_lib.idx and blast_pairs.idx from STAR-Fusion CTAT Genome lib. Can be same as FusionGenome, but only two files needed from that package", "sbg:suggestedValue": { class: 'File', path: '63cff818facdd82011c8d6fe', name: 'GRCh38_v39_fusion_annot_custom.tar.gz' }}
   ```
   ### rmats
   ```yaml
@@ -301,6 +302,7 @@ doc: |
 
   ## Reference build notes:
    - STAR-Fusion reference built with command `/usr/local/STAR-Fusion/ctat-genome-lib-builder/prep_genome_lib.pl --gtf gencode.v39.primary_assembly.annotation.gtf --annot_filter_rule ../AnnotFilterRule.pm --CPU 36 --fusion_annot_lib ../fusion_lib.Mar2021.dat.gz --genome_fa ../GRCh38.primary_assembly.genome.fa --output_dir GRCh38_v39_CTAT_lib_Mar242022.CUSTOM --human_gencode_filter --pfam_db current --dfam_db human 2> build.errs > build.out &`
+   - fusion_annotator_ref built by placing GRCh38_v39_CTAT_lib_Mar242022.CUSTOM/fusion_annot_lib.idx and GRCh38_v39_CTAT_lib_Mar242022.CUSTOM/blast_pairs.idx into its own tar ball
    - kallisto index built using RSEM `RSEM_GENCODE39.transcripts.fa` file as transcriptome fasta, using command: `kallisto index -i RSEM_GENCODE39.transcripts.kallisto.idx RSEM_GENCODE39.transcripts.fa`
    - RNA-SEQc reference built using [collapse gtf script](https://github.com/broadinstitute/gtex-pipeline/blob/master/gene_model/collapse_annotation.py)
      - Two references needed if data are stranded vs. unstranded
@@ -523,7 +525,12 @@ inputs:
   # annoFuse
   sample_name: {type: 'string?', doc: "Sample ID of the input reads. If not provided,\
       \ will use reads1 file basename."}
-  annofuse_col_num: {type: 'int?', doc: "column number in file of fusion name.", default: 30}
+  annofuse_col_num: {type: 'int?', doc: "0-based column number in file of fusion name.",
+    default: 30}
+  fusion_annotator_ref: {type: 'File', doc: "Tar ball with fusion_annot_lib.idx and\
+      \ blast_pairs.idx from STAR-Fusion CTAT Genome lib. Can be same as FusionGenome,\
+      \ but only two files needed from that package", "sbg:suggestedValue": {class: 'File',
+      path: '63cff818facdd82011c8d6fe', name: 'GRCh38_v39_fusion_annot_custom.tar.gz'}}
   # rmats
   rmats_read_length: {type: 'int', doc: "Input read length for sample reads."}
   rmats_variable_read_length: {type: 'boolean?', doc: "Allow reads with lengths that\
@@ -818,7 +825,7 @@ steps:
     run: ../workflow/kfdrc_annoFuse_wf.cwl
     in:
       sample_name: basename_picker/outsample
-      FusionGenome: FusionGenome
+      FusionGenome: fusion_annotator_ref
       genome_untar_path: star_fusion_genome_untar_path
       rsem_expr_file: rsem/gene_out
       arriba_output_file: arriba_fusion_2-2-1/arriba_fusions
@@ -865,5 +872,5 @@ hints:
 - SE
 - STAR
 "sbg:links":
-- id: 'https://github.com/kids-first/kf-rnaseq-workflow/releases/tag/v4.2.2'
+- id: 'https://github.com/kids-first/kf-rnaseq-workflow/releases/tag/v4.3.0'
   label: github-release
diff --git a/workflow/kfdrc_annoFuse_wf.cwl b/workflow/kfdrc_annoFuse_wf.cwl
@@ -6,11 +6,11 @@ requirements:
 
 inputs:
   sample_name: { type: 'string', doc: "Sample name to apply. Ought to be one from some kind of clinical database" }
-  FusionGenome: { type: 'File', doc: "GRCh38_v27_CTAT_lib_Feb092018.plug-n-play.tar.gz", "sbg:suggestedValue": { class: 'File', path: '62853e7ad63f7c6d8d7ae5a8', name: 'GRCh38_v39_CTAT_lib_Mar242022.CUSTOM.tar.gz' }}
+  FusionGenome: { type: 'File', doc: "Tar ball with fusion_annot_lib.idx and blast_pairs.idx from STAR-Fusion CTAT Genome lib", "sbg:suggestedValue": { class: 'File', path: '63cff818facdd82011c8d6fe', name: 'GRCh38_v39_fusion_annot_custom.tar.gz' } }
   genome_untar_path: { type: 'string?', doc: "This is what the path will be when genome_tar is unpackaged", default: "GRCh38_v39_CTAT_lib_Mar242022.CUSTOM" }
   rsem_expr_file: { type: 'File', doc: "gzipped rsem gene expression file" }
   arriba_output_file: { type: 'File', doc: "Output from arriba, usually extension arriba.fusions.tsv" }
-  col_num: { type: 'int?', doc: "column number in file of fusion name, use 24 for arriba v1.1, 30 for v2", default: 30 }
+  col_num: { type: 'int?', doc: "0-based column number in file of fusion name, use 24 for arriba v1.1, 30 for v2", default: 30 }
   star_fusion_output_file: { type: 'File', doc: "Output from STAR Fusion, usually extension STAR.fusion_predictions.abridged.coding_effect.tsv" }
   output_basename: { type: 'string', doc: "String to use as basename for outputs" }
 
@@ -39,9 +39,6 @@ steps:
       [formatted_fusion_tsv]
 
   annotate_arriba:
-    hints:
-      - class: 'sbg:AWSInstanceType'
-        value: c5.2xlarge
     run: ../tools/fusion_annotator.cwl
     in:
       input_fusion_file: format_arriba_output/formatted_fusion_tsv
@@ -65,3 +62,8 @@ steps:
 
 $namespaces:
   sbg: https://sevenbridges.com
+
+hints:
+  - class: 'sbg:AWSInstanceType'
+    value: c5.2xlarge;ebs-gp2;400
+    doc: "Chosen for speed and lower cost"