Skip to content

Commit

Permalink
improve prepare_genome subworkflow
Browse files Browse the repository at this point in the history
  • Loading branch information
maxulysse committed Sep 19, 2023
1 parent 519f412 commit b3427df
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 117 deletions.
3 changes: 2 additions & 1 deletion modules/local/gtf2bed/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@ process GTF2BED {
'biocontainers/r-base:3.5.0'}"

input:
path gtf
tuple val(meta), path(gtf)
val feature_type

output:
path '*.bed' , emit: bed
path "versions.yml", emit: versions
Expand Down
131 changes: 22 additions & 109 deletions subworkflows/local/prepare_genome/main.nf
Original file line number Diff line number Diff line change
@@ -1,136 +1,49 @@
//
// Uncompress and prepare reference genome files
// Prepare reference genome files
//

include { BEDTOOLS_MERGE } from '../../../modules/nf-core/bedtools/merge/main'
include { BEDTOOLS_SORT } from '../../../modules/nf-core/bedtools/sort/main'
include { GATK4_CREATESEQUENCEDICTIONARY } from '../../../modules/nf-core/gatk4/createsequencedictionary/main'
include { GFFREAD } from '../../../modules/nf-core/gffread/main'
include { GTF2BED } from '../../../modules/local/gtf2bed'
include { GUNZIP as GUNZIP_FASTA } from '../../../modules/nf-core/gunzip/main'
include { GUNZIP as GUNZIP_GENE_BED } from '../../../modules/nf-core/gunzip/main'
include { GUNZIP as GUNZIP_GFF } from '../../../modules/nf-core/gunzip/main'
include { GUNZIP as GUNZIP_GTF } from '../../../modules/nf-core/gunzip/main'
include { SAMTOOLS_FAIDX } from '../../../modules/nf-core/samtools/faidx/main'
include { STAR_GENOMEGENERATE } from '../../../modules/nf-core/star/genomegenerate/main'
include { UNTAR as UNTAR_STAR_INDEX } from '../../../modules/nf-core/untar/main'

workflow PREPARE_GENOME {
take:
fasta // file: /path/to/genome.fasta
// gtf // file: /path/to/genome.gtf
// gff // file: /path/to/genome.gff
// exon_bed // file: /path/to/gene.bed
ch_exon_bed // file: /path/to/gene.bed
ch_fasta // file: /path/to/genome.fasta
ch_gff // file: /path/to/genome.gff
ch_gtf // file: /path/to/genome.gtf
feature_type
// prepare_tool_indices
// feature_type

main:
ch_versions = Channel.empty()

ch_fasta = fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] }

//
// Uncompress genome fasta file if required
//
// if (fasta.endsWith('.gz')) {
// ch_fasta = GUNZIP_FASTA([[:], fasta]).gunzip.map{ meta, fasta -> fasta }
// ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions)
// } else {
// ch_fasta = Channel.value(file(fasta))
// }

//
// Uncompress GTF annotation file or create from GFF3 if required
//
// if (gtf) {
// if (gtf.endsWith('.gz')) {
// ch_gtf = GUNZIP_GTF([[:], gtf]).gunzip.map{ meta, gtf -> gtf }
// ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions)
// } else ch_gtf = Channel.value(file(gtf))
// } else if (gff) {
// if (gff.endsWith('.gz')) {
// ch_gff = GUNZIP_GFF([[:], gff]).gunzip.map{ meta, gff -> gff }
// ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions)
// } else ch_gff = Channel.value(file(gff))
// ch_gtf = GFFREAD(ch_gff).gtf
// ch_versions = ch_versions.mix(GFFREAD.out.versions)
// }

//
// Uncompress exon BED annotation file or create from GTF if required
//
// if (exon_bed) {
// if (exon_bed.endsWith('.gz')) {
// GUNZIP_GENE_BED(
// Channel.fromPath(exon_bed).map{ it -> [[id:it[0].baseName], it] }
// )
// ch_exon_bed = GUNZIP_GENE_BED.out.gunzip.map{ meta, bed -> [bed] }.collect()
// ch_versions = ch_versions.mix(GUNZIP_GENE_BED.out.versions)
// } else {
// ch_exon_bed = Channel.fromPath(exon_bed).collect()
// }
// } else {
// ch_exon_bed = GTF2BED( ch_gtf , feature_type).bed.collect()
// ch_versions = ch_versions.mix(GTF2BED.out.versions)
// }

//ch_exon_bed.view()
//ch_exon_bed.map{ it -> [[id:'exome'], it] }
//ch_exon_bed.view()
// Bedtools sort
// ch_bedtools_sort = BEDTOOLS_SORT(ch_exon_bed.map{ it -> [[id:'exome'], it] }, 'sorted').sorted.collect()
// ch_versions = ch_versions.mix(BEDTOOLS_SORT.out.versions)


// // Bedtools merge
// ch_bedtools_merge = BEDTOOLS_MERGE(ch_bedtools_sort).bed
// ch_versions = ch_versions.mix(BEDTOOLS_MERGE.out.versions)


// Index the genome fasta
GATK4_CREATESEQUENCEDICTIONARY(ch_fasta)
GFFREAD(ch_gff)
SAMTOOLS_FAIDX(ch_fasta, [['id':null], []])

ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions)
ch_gtf = ch_gtf.mix(GFFREAD.out.gtf)

// Create dictionary file for the genome fasta
// ch_fasta_dict = Channel.empty()
// if (params.dict) ch_fasta_dict = Channel.fromPath(params.dict).collect()
// else ch_fasta_dict = GATK4_CREATESEQUENCEDICTIONARY(ch_fasta).dict

//
// Uncompress STAR index or generate from scratch if required
//
// ch_star_index = Channel.empty()
// if ('star' in prepare_tool_indices) {
// if (params.star_index) {
// if (params.star_index.endsWith('.tar.gz')) {
// UNTAR_STAR_INDEX(
// Channel.fromPath(params.star_index).map{ it -> [[id:it[0].baseName], it] }
// )
// ch_star_index = UNTAR_STAR_INDEX.out.untar.map{ meta, star_index -> [star_index] }.collect()
// ch_versions = ch_versions.mix(UNTAR_STAR_INDEX.out.versions)
// } else {
// ch_star_index = Channel.fromPath(params.star_index).collect()
// }
// }
// else {
// STAR_GENOMEGENERATE(
// ch_fasta,ch_gtf
// )
// .index
// .set { ch_star_index }
// ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions)
// }
// }
GTF2BED(ch_gtf, feature_type)
STAR_GENOMEGENERATE(ch_fasta, ch_gtf)

ch_versions = ch_versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions)
ch_versions = ch_versions.mix(GFFREAD.out.versions)
ch_versions = ch_versions.mix(GTF2BED.out.versions)
ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions)
ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions)

emit:
// fasta = ch_fasta // path: genome.fasta
fasta_fai = SAMTOOLS_FAIDX.out.fai.map{ meta, fai -> [fai] } // path: genome.fasta.fai
dict = ch_fasta_dict // path: genome.fasta.dict
// gtf = ch_gtf // path: genome.gtf
// exon_bed = ch_exon_bed // path: exon.bed
dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict // path: genome.fasta.dict
exon_bed = GTF2BED.out.bed.collect() // path: exon.bed
fasta_fai = SAMTOOLS_FAIDX.out.fai.map{ meta, fai -> [fai] } // path: genome.fasta.fai
gtf = ch_gtf // path: genome.gtf
star_index = STAR_GENOMEGENERATE.out.index // path: star/index/
versions = ch_versions // channel: [ versions.yml ]
// bedtools_sort = ch_bedtools_sort // path: sort.bed
// bedtools_merge = ch_bedtools_merge // path: merge.bed
// star_index = ch_star_index // path: star/index/
versions = ch_versions // channel: [ versions.yml ]
}
18 changes: 11 additions & 7 deletions workflows/rnavar.nf
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,10 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/


// Initialize file channels based on params, defined in the params.genomes[params.genome] scope
ch_fasta = params.fasta ? Channel.fromPath(params.fasta).first() : Channel.empty()
ch_exon_bed = params.exon_bed ? Channel.fromPath(params.exon_bed) : Channel.empty()
ch_fasta = params.fasta ? Channel.fromPath(params.fasta).map{ fasta -> [ [ id:fasta.baseName ], fasta ] }.first() : Channel.empty()
ch_gff = params.gff ? Channel.fromPath(params.gff).first() : Channel.empty()
ch_gtf = params.gtf ? Channel.fromPath(params.gtf).map{ gtf -> [ [ id:gtf.baseName ], gtf ] }.first() : Channel.empty()

/*
========================================================================================
Expand All @@ -129,20 +132,21 @@ workflow RNAVAR {

// To gather all QC reports for MultiQC
ch_reports = Channel.empty()

// To gather used softwares versions for MultiQC
ch_versions = Channel.empty()

//
// SUBWORKFLOW: Uncompress and prepare reference genome files
// Prepare reference genome files
//

PREPARE_GENOME(
ch_fasta
// params.gtf,
// params.gff,
// params.gene_bed,
ch_exon_bed,
ch_fasta,
ch_gff,
ch_gtf,
// params.aligner,
// params.feature_type
params.feature_type
)

// ch_genome_bed = Channel.from([id:'genome.bed']).combine(PREPARE_GENOME.out.exon_bed)
Expand Down

0 comments on commit b3427df

Please sign in to comment.