From b3427dfc391c3040c3fbda1baa8e4d999975890f Mon Sep 17 00:00:00 2001 From: maxulysse Date: Tue, 19 Sep 2023 16:45:57 +0200 Subject: [PATCH] improve prepare_genome subworkflow --- modules/local/gtf2bed/main.nf | 3 +- subworkflows/local/prepare_genome/main.nf | 131 ++++------------------ workflows/rnavar.nf | 18 +-- 3 files changed, 35 insertions(+), 117 deletions(-) diff --git a/modules/local/gtf2bed/main.nf b/modules/local/gtf2bed/main.nf index b0c7e42a..b18a63d4 100755 --- a/modules/local/gtf2bed/main.nf +++ b/modules/local/gtf2bed/main.nf @@ -8,8 +8,9 @@ process GTF2BED { 'biocontainers/r-base:3.5.0'}" input: - path gtf + tuple val(meta), path(gtf) val feature_type + output: path '*.bed' , emit: bed path "versions.yml", emit: versions diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index 598dc4e4..d3cde79e 100755 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -1,5 +1,5 @@ // -// Uncompress and prepare reference genome files +// Prepare reference genome files // include { BEDTOOLS_MERGE } from '../../../modules/nf-core/bedtools/merge/main' @@ -7,130 +7,43 @@ include { BEDTOOLS_SORT } from '../../../modules/nf-core/bedtoo include { GATK4_CREATESEQUENCEDICTIONARY } from '../../../modules/nf-core/gatk4/createsequencedictionary/main' include { GFFREAD } from '../../../modules/nf-core/gffread/main' include { GTF2BED } from '../../../modules/local/gtf2bed' -include { GUNZIP as GUNZIP_FASTA } from '../../../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_GENE_BED } from '../../../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_GFF } from '../../../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_GTF } from '../../../modules/nf-core/gunzip/main' include { SAMTOOLS_FAIDX } from '../../../modules/nf-core/samtools/faidx/main' include { STAR_GENOMEGENERATE } from '../../../modules/nf-core/star/genomegenerate/main' -include { UNTAR as UNTAR_STAR_INDEX } from '../../../modules/nf-core/untar/main' workflow PREPARE_GENOME { take: - fasta // file: /path/to/genome.fasta - // gtf // file: /path/to/genome.gtf - // gff // file: /path/to/genome.gff - // exon_bed // file: /path/to/gene.bed + ch_exon_bed // file: /path/to/gene.bed + ch_fasta // file: /path/to/genome.fasta + ch_gff // file: /path/to/genome.gff + ch_gtf // file: /path/to/genome.gtf + feature_type // prepare_tool_indices - // feature_type main: ch_versions = Channel.empty() - ch_fasta = fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] } - - // - // Uncompress genome fasta file if required - // - // if (fasta.endsWith('.gz')) { - // ch_fasta = GUNZIP_FASTA([[:], fasta]).gunzip.map{ meta, fasta -> fasta } - // ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions) - // } else { - // ch_fasta = Channel.value(file(fasta)) - // } - - // - // Uncompress GTF annotation file or create from GFF3 if required - // - // if (gtf) { - // if (gtf.endsWith('.gz')) { - // ch_gtf = GUNZIP_GTF([[:], gtf]).gunzip.map{ meta, gtf -> gtf } - // ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) - // } else ch_gtf = Channel.value(file(gtf)) - // } else if (gff) { - // if (gff.endsWith('.gz')) { - // ch_gff = GUNZIP_GFF([[:], gff]).gunzip.map{ meta, gff -> gff } - // ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions) - // } else ch_gff = Channel.value(file(gff)) - // ch_gtf = GFFREAD(ch_gff).gtf - // ch_versions = ch_versions.mix(GFFREAD.out.versions) - // } - - // - // Uncompress exon BED annotation file or create from GTF if required - // - // if (exon_bed) { - // if (exon_bed.endsWith('.gz')) { - // exonGENE_BED( - // Channel.fromPath(exon_bed).map{ it -> [[id:it[0].baseName], it] } - // ) - // ch_exon_bed = GUNZIP_GENE_BED.out.gunzip.map{ meta, bed -> [bed] }.collect() - // ch_versions = ch_versions.mix(GUNZIP_GENE_BED.out.versions) - // } else { - // ch_exon_bed = Channel.fromPath(exon_bed).collect() - // } - // } else { - // ch_exon_bed = GTF2BED( ch_gtf , feature_type).bed.collect() - // ch_versions = ch_versions.mix(GTF2BED.out.versions) - // } - - //ch_exon_bed.view() - //ch_exon_bed.map{ it -> [[id:'exome'], it] } - //ch_exon_bed.view() - // Bedtools sort - // ch_bedtools_sort = BEDTOOLS_SORT(ch_exon_bed.map{ it -> [[id:'exome'], it] }, 'sorted').sorted.collect() - // ch_versions = ch_versions.mix(BEDTOOLS_SORT.out.versions) - - - // // Bedtools merge - // ch_bedtools_merge = BEDTOOLS_MERGE(ch_bedtools_sort).bed - // ch_versions = ch_versions.mix(BEDTOOLS_MERGE.out.versions) - - - // Index the genome fasta + GATK4_CREATESEQUENCEDICTIONARY(ch_fasta) + GFFREAD(ch_gff) SAMTOOLS_FAIDX(ch_fasta, [['id':null], []]) - ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions) + ch_gtf = ch_gtf.mix(GFFREAD.out.gtf) - // Create dictionary file for the genome fasta - // ch_fasta_dict = Channel.empty() - // if (params.dict) ch_fasta_dict = Channel.fromPath(params.dict).collect() - // else ch_fasta_dict = GATK4_CREATESEQUENCEDICTIONARY(ch_fasta).dict - - // - // Uncompress STAR index or generate from scratch if required - // - // ch_star_index = Channel.empty() - // if ('star' in prepare_tool_indices) { - // if (params.star_index) { - // if (params.star_index.endsWith('.tar.gz')) { - // UNTAR_STAR_INDEX( - // Channel.fromPath(params.star_index).map{ it -> [[id:it[0].baseName], it] } - // ) - // ch_star_index = UNTAR_STAR_INDEX.out.untar.map{ meta, star_index -> [star_index] }.collect() - // ch_versions = ch_versions.mix(UNTAR_STAR_INDEX.out.versions) - // } else { - // ch_star_index = Channel.fromPath(params.star_index).collect() - // } - // } - // else { - // STAR_GENOMEGENERATE( - // ch_fasta,ch_gtf - // ) - // .index - // .set { ch_star_index } - // ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions) - // } - // } + GTF2BED(ch_gtf, feature_type) + STAR_GENOMEGENERATE(ch_fasta, ch_gtf) + ch_versions = ch_versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions) + ch_versions = ch_versions.mix(GFFREAD.out.versions) + ch_versions = ch_versions.mix(GTF2BED.out.versions) + ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions) + ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions) emit: - // fasta = ch_fasta // path: genome.fasta - fasta_fai = SAMTOOLS_FAIDX.out.fai.map{ meta, fai -> [fai] } // path: genome.fasta.fai dict = ch_fasta_dict // path: genome.fasta.dict - // gtf = ch_gtf // path: genome.gtf - // exon_bed = ch_exon_bed // path: exon.bed + dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict // path: genome.fasta.dict + exon_bed = GTF2BED.out.bed.collect() // path: exon.bed + fasta_fai = SAMTOOLS_FAIDX.out.fai.map{ meta, fai -> [fai] } // path: genome.fasta.fai + gtf = ch_gtf // path: genome.gtf + star_index = STAR_GENOMEGENERATE.out.index // path: star/index/ + versions = ch_versions // channel: [ versions.yml ] // bedtools_sort = ch_bedtools_sort // path: sort.bed // bedtools_merge = ch_bedtools_merge // path: merge.bed - // star_index = ch_star_index // path: star/index/ - versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/rnavar.nf b/workflows/rnavar.nf index 64264b44..b2796ae2 100755 --- a/workflows/rnavar.nf +++ b/workflows/rnavar.nf @@ -117,7 +117,10 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/ // Initialize file channels based on params, defined in the params.genomes[params.genome] scope -ch_fasta = params.fasta ? Channel.fromPath(params.fasta).first() : Channel.empty() +ch_exon_bed = params.exon_bed ? Channel.fromPath(params.exon_bed) : Channel.empty() +ch_fasta = params.fasta ? Channel.fromPath(params.fasta).map{ fasta -> [ [ id:fasta.baseName ], fasta ] }.first() : Channel.empty() +ch_gff = params.gff ? Channel.fromPath(params.gff).first() : Channel.empty() +ch_gtf = params.gtf ? Channel.fromPath(params.gtf).map{ gtf -> [ [ id:gtf.baseName ], gtf ] }.first() : Channel.empty() /* ======================================================================================== @@ -129,20 +132,21 @@ workflow RNAVAR { // To gather all QC reports for MultiQC ch_reports = Channel.empty() + // To gather used softwares versions for MultiQC ch_versions = Channel.empty() // - // SUBWORKFLOW: Uncompress and prepare reference genome files + // Prepare reference genome files // PREPARE_GENOME( - ch_fasta - // params.gtf, - // params.gff, - // params.gene_bed, + ch_exon_bed, + ch_fasta, + ch_gff, + ch_gtf, // params.aligner, - // params.feature_type + params.feature_type ) // ch_genome_bed = Channel.from([id:'genome.bed']).combine(PREPARE_GENOME.out.exon_bed)