diff --git a/.github/workflows/sample_sheet_all.csv b/.github/workflows/sample_sheet_all.csv index e4c745e..a3f2105 100755 --- a/.github/workflows/sample_sheet_all.csv +++ b/.github/workflows/sample_sheet_all.csv @@ -1,3 +1,3 @@ sample,fastq,fastq_1,fastq_2 -test,test_files/test_nanopore.fastq.gz,test_files/test_illumina_1.fastq.gz,test_files/test_illumina_1.fastq.gz +test,test_files/test_nanopore.fastq.gz,test_files/test_illumina_1.fastq.gz,test_files/test_illumina_2.fastq.gz test2,test_files/test_nanopore_only.fastq.gz,, \ No newline at end of file diff --git a/.github/workflows/sample_sheet_both.csv b/.github/workflows/sample_sheet_both.csv index 00c93ed..495a45d 100755 --- a/.github/workflows/sample_sheet_both.csv +++ b/.github/workflows/sample_sheet_both.csv @@ -1,2 +1,2 @@ sample,fastq,fastq_1,fastq_2 -test,test_files/test_nanopore.fastq.gz,test_files/test_illumina_1.fastq.gz,test_files/test_illumina_1.fastq.gz \ No newline at end of file +test,test_files/test_nanopore.fastq.gz,test_files/test_illumina_1.fastq.gz,test_files/test_illumina_2.fastq.gz \ No newline at end of file diff --git a/donut_falls.nf b/donut_falls.nf index c7c5d46..75b8cbf 100755 --- a/donut_falls.nf +++ b/donut_falls.nf @@ -98,9 +98,9 @@ if (params.sample_sheet) { .map { it -> meta = [id:it.sample] tuple( meta, - "${it.fastq}", + file("${it.fastq}", checkIfExists: true), "${it.fastq_1}", - "${it.fastq_2}" ) + "${it.fastq_2}") } .set{ ch_input_files } } else { @@ -110,7 +110,7 @@ if (params.sample_sheet) { // channel for illumina files (paired-end only) ch_input_files .filter { it[2] != it[3] } - .map { it -> tuple (it[0], [file(it[2], checkIfExists: true), file(it[3], checkIfExists: true)])} + .map { it -> tuple(it[0], [file(it[2], checkIfExists: true), file(it[3], checkIfExists: true)])} .set { ch_illumina_input } // channel for nanopore files @@ -323,7 +323,6 @@ process copy { line = line.strip() if line.startswith('>'): contig = str(line.replace('>','').split()[0]) - print(gfa_dict[contig]) circular = gfa_dict[contig]['Is circular'].replace('Y','true').replace('N','false') length = gfa_dict[contig]['Total segment length'] gc_per = gfa_dict[contig]['GC content %'] @@ -441,10 +440,10 @@ process fastp { time '10m' input: - tuple val(meta), file(reads), val(type) + tuple val(meta), file(reads) output: - tuple val(meta), file("fastp/*_fastp*.fastq.gz"), val(type), emit: fastq + tuple val(meta), file("fastp/*_fastp*.fastq.gz"), emit: fastq path "fastp/*", emit: everything path "fastp/*_fastp*.json", emit: summary path "versions.yml", emit: versions @@ -456,7 +455,7 @@ process fastp { def args = task.ext.args ?: '' def lrargs = task.ext.lrargs ?: '--qualified_quality_phred 12 --length_required 1000' def prefix = task.ext.prefix ?: "${meta.id}" - if (type == "illumina"){ + if (reads.toList().size() > 1 ){ """ mkdir -p fastp @@ -962,8 +961,8 @@ process nanoplot { output: path "nanoplot/*", emit: everything - path "nanoplot/${meta.id}_NanoStats.txt", emit: stats - path "nanoplot/${meta.id}_NanoStats.csv", emit: summary + path "nanoplot/${meta.id}*_NanoStats.txt", emit: stats + path "nanoplot/${meta.id}*_NanoStats.csv", emit: summary path "versions.yml", emit: versions when: @@ -972,19 +971,19 @@ process nanoplot { shell: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + def readtype = fastq.toList().size() > 1 ? '_illumina' : '' """ mkdir -p nanoplot NanoPlot ${args} \ - --fastq ${fastq} \ + --fastq ${fastq.join(' ')} \ --threads ${task.cpus} \ + --prefix ${prefix}${readtype}_ \ --tsv_stats \ --outdir nanoplot - cp nanoplot/NanoStats.txt nanoplot/${prefix}_NanoStats.txt - - echo "sample,\$( cut -f 1 nanoplot/${prefix}_NanoStats.txt | tr '\\n' ',' )" > nanoplot/${prefix}_NanoStats.csv - echo "${prefix},\$(cut -f 2 nanoplot/${prefix}_NanoStats.txt | tr '\\n' ',' )" >> nanoplot/${prefix}_NanoStats.csv + echo "sample,\$( cut -f 1 nanoplot/${prefix}${readtype}_NanoStats.txt | tr '\\n' ',' )" > nanoplot/${prefix}${readtype}_NanoStats.csv + echo "${prefix}${readtype},\$(cut -f 2 nanoplot/${prefix}${readtype}_NanoStats.txt | tr '\\n' ',' )" >> nanoplot/${prefix}${readtype}_NanoStats.csv cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -996,7 +995,7 @@ process nanoplot { process png { tag "${meta.id}" label "process_low" - publishDir "${params.outdir}/${meta.id}", mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + publishDir "${params.outdir}/${meta.id}/bandage", mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } container 'staphb/multiqc:1.19' time '10m' @@ -1161,7 +1160,7 @@ process rasusa { tag "${meta.id}" label "process_medium" publishDir "${params.outdir}/${meta.id}", mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - container 'staphb/rasusa:0.8.0' + container 'staphb/rasusa:2.0.0' time '10m' input: @@ -1180,9 +1179,11 @@ process rasusa { """ mkdir -p rasusa - rasusa ${args} \ - --input ${fastq} \ - --output rasusa/${prefix}_rasusa.fastq.gz + rasusa \ + reads \ + ${args} \ + --output rasusa/${prefix}_rasusa.fastq.gz \ + ${fastq} cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -1291,18 +1292,6 @@ process summary { dict[sample_analysis]['unmapped_illumina'] = unmapped_illumina return dict - def fastp_results(): - dict = {} - files = glob.glob('*_fastp_sr.json') - for file in files: - sample = file.replace('_fastp_sr.json', '') - with open(file, 'r') as f: - data = json.load(f) - total_reads = data['summary']['before_filtering']['total_reads'] - dict[sample] = total_reads - - return dict - def file_to_dict(file, header, delim): dict = {} with open(file, mode='r', newline='') as file: @@ -1399,8 +1388,6 @@ process summary { else: gfastats_dict = {} - fastp_dict = fastp_results() - busco_dict = busco_results() circulocov_dict = circulocov_results() @@ -1408,102 +1395,105 @@ process summary { final_results = {} assemblers = ['dragonflye', 'flye', 'hybracter', 'raven', 'unicycler'] for key in nanoplot_dict.keys(): - final_results[key] = {} - final_results[key]['name'] = key - - # from nanostas - final_results[key]['number_of_reads'] = int(nanoplot_dict[key]['number_of_reads']) - final_results[key]['mean_read_length'] = float(nanoplot_dict[key]['mean_read_length']) - final_results[key]['mean_qual'] = float(nanoplot_dict[key]['mean_qual']) - # from fastp (runs on non-unicycler runs) - if key in fastp_dict.keys(): - final_results[key]['total_illumina_reads'] = int(fastp_dict[key]) + if key.endswith('_illumina'): + actual_key = key.replace('_illumina', '') + if actual_key not in final_results.keys(): + final_results[actual_key] = {} + final_results[actual_key]['total_illumina_reads'] = int(nanoplot_dict[key]['number_of_reads']) else: - final_results[key]['total_illumina_reads'] = 0 - - # from mash - if key in mash_dict.keys(): - final_results[key]['nanopore_illumina_mash_distance'] = float(mash_dict[key]['dist']) - else: - final_results[key]['nanopore_illumina_mash_distance'] = 'NF' - - final_results[key]['assemblers'] = '${params.assembler}' - - # for each assembler - for assembler in assemblers: - if key + '_' + assembler in gfastats_dict.keys(): - final_results[key][assembler] = {} - final_results[key][assembler]['assembler'] = assembler - - # gfastats results - total_length = 0 - num_circular = 0 - for contig in gfastats_dict[key + '_' + assembler].keys(): - total_length = total_length + int(gfastats_dict[key + '_' + assembler][contig]['Total segment length']) - if gfastats_dict[key + '_' + assembler][contig]['Is circular'] == 'Y': - num_circular = num_circular + 1 - - final_results[key][assembler]['total_length'] = total_length - final_results[key][assembler]['num_contigs'] = len(gfastats_dict[key + '_' + assembler].keys()) - final_results[key][assembler]['circ_contigs'] = num_circular - - # circulocov results - if key + '_' + assembler in circulocov_dict.keys(): - if 'coverage' in circulocov_dict[key + '_' + assembler].keys(): - final_results[key][assembler]['coverage'] = float(circulocov_dict[key + '_' + assembler]['coverage']) - else: - final_results[key][assembler]['coverage'] = 'NF' - - if 'unmapped_nanopore' in circulocov_dict[key + '_' + assembler].keys(): - final_results[key][assembler]['unmapped_nanopore'] = int(circulocov_dict[key + '_' + assembler]['unmapped_nanopore']) - final_results[key][assembler]['unmapped_nanopore_pc'] = float('{:.2f}'.format(int(final_results[key][assembler]['unmapped_nanopore']) / int(nanoplot_dict[key]['number_of_reads']) * 100)) - else: - final_results[key][assembler]['unmapped_nanopore'] = 'NF' - final_results[key][assembler]['unmapped_nanopore_pc'] = 'NF' - - if final_results[key]['total_illumina_reads'] == 0 and 'illumina_numreads' in circulocov_dict[key + '_' + assembler].keys(): - final_results[key]['total_illumina_reads'] = int(circulocov_dict[key + '_' + assembler]['illumina_numreads']) - - if 'unmapped_illumina' in circulocov_dict[key + '_' + assembler].keys(): - final_results[key][assembler]['unmapped_illumina'] = int(circulocov_dict[key + '_' + assembler]['unmapped_illumina']) - if 'total_illumina_reads' in final_results[key].keys() and final_results[key]['total_illumina_reads'] > 0: - final_results[key][assembler]['unmapped_illumina_pc'] = float('{:.2f}'.format(int(final_results[key][assembler]['unmapped_illumina']) / int(final_results[key]['total_illumina_reads']) * 100 )) + if key not in final_results.keys(): + final_results[key] = {} + + if 'total_illumina_reads' not in final_results[key].keys(): + final_results[key]['total_illumina_reads'] = 0 + + final_results[key]['name'] = key + + # from nanostas + final_results[key]['number_of_reads'] = int(nanoplot_dict[key]['number_of_reads']) + final_results[key]['mean_read_length'] = float(nanoplot_dict[key]['mean_read_length']) + final_results[key]['mean_qual'] = float(nanoplot_dict[key]['mean_qual']) + + # from mash + if key in mash_dict.keys(): + final_results[key]['nanopore_illumina_mash_distance'] = float(mash_dict[key]['dist']) + else: + final_results[key]['nanopore_illumina_mash_distance'] = 'NF' + + final_results[key]['assemblers'] = '${params.assembler}' + + # for each assembler + for assembler in assemblers: + if key + '_' + assembler in gfastats_dict.keys(): + final_results[key][assembler] = {} + final_results[key][assembler]['assembler'] = assembler + + # gfastats results + total_length = 0 + num_circular = 0 + for contig in gfastats_dict[key + '_' + assembler].keys(): + total_length = total_length + int(gfastats_dict[key + '_' + assembler][contig]['Total segment length']) + if gfastats_dict[key + '_' + assembler][contig]['Is circular'] == 'Y': + num_circular = num_circular + 1 + + final_results[key][assembler]['total_length'] = total_length + final_results[key][assembler]['num_contigs'] = len(gfastats_dict[key + '_' + assembler].keys()) + final_results[key][assembler]['circ_contigs'] = num_circular + + # circulocov results + if key + '_' + assembler in circulocov_dict.keys(): + if 'coverage' in circulocov_dict[key + '_' + assembler].keys(): + final_results[key][assembler]['coverage'] = float(circulocov_dict[key + '_' + assembler]['coverage']) else: - final_results[key][assembler]['unmapped_illumina_pc'] = 0.0 - else: - final_results[key][assembler]['unmapped_illumina'] = 'NF' - final_results[key][assembler]['unmapped_illumina_pc'] = 'NF' - - # busco results - if key + '_' + assembler in busco_dict.keys(): - final_results[key][assembler]['busco'] = busco_dict[key + '_' + assembler] - elif key + '_' + assembler + '_reoriented' in busco_dict.keys(): - final_results[key][assembler]['busco'] = busco_dict[key + '_' + assembler + '_reoriented'] - else: - final_results[key][assembler]['busco'] = 'NF' + final_results[key][assembler]['coverage'] = 'NF' - if assembler != 'unicycler': - for step in ['polypolish', 'pypolca', 'medaka']: - if key + '_' + assembler + '_' + step in busco_dict.keys(): - final_results[key][assembler]['busco_' + step ] = busco_dict[key + '_' + assembler + '_' + step] + if 'unmapped_nanopore' in circulocov_dict[key + '_' + assembler].keys(): + final_results[key][assembler]['unmapped_nanopore'] = int(circulocov_dict[key + '_' + assembler]['unmapped_nanopore']) + final_results[key][assembler]['unmapped_nanopore_pc'] = float('{:.2f}'.format(int(final_results[key][assembler]['unmapped_nanopore']) / int(nanoplot_dict[key]['number_of_reads']) * 100)) else: - final_results[key][assembler]['busco_' + step ] = 'NF' - - # pypolca results - if key + '_' + assembler in pypolca_dict.keys(): - if 'Consensus_Quality_Before_Polishing' in pypolca_dict[key + '_' + assembler].keys(): - final_results[key][assembler]['Consensus_Quality_Before_Polishing'] = float(pypolca_dict[key + '_' + assembler]['Consensus_Quality_Before_Polishing']) - else: - final_results[key][assembler]['Consensus_Quality_Before_Polishing'] = 'NF' - if 'Consensus_QV_Before_Polishing' in pypolca_dict[key + '_' + assembler].keys(): - final_results[key][assembler]['Consensus_QV_Before_Polishing'] = float(pypolca_dict[key + '_' + assembler]['Consensus_QV_Before_Polishing']) + final_results[key][assembler]['unmapped_nanopore'] = 'NF' + final_results[key][assembler]['unmapped_nanopore_pc'] = 'NF' + + if 'unmapped_illumina' in circulocov_dict[key + '_' + assembler].keys(): + final_results[key][assembler]['unmapped_illumina'] = int(circulocov_dict[key + '_' + assembler]['unmapped_illumina']) + if 'total_illumina_reads' in final_results[key].keys() and final_results[key]['total_illumina_reads'] > 0: + final_results[key][assembler]['unmapped_illumina_pc'] = float('{:.2f}'.format(int(final_results[key][assembler]['unmapped_illumina']) / int(final_results[key]['total_illumina_reads']) * 100 )) + else: + final_results[key][assembler]['unmapped_illumina_pc'] = 0.0 + else: + final_results[key][assembler]['unmapped_illumina'] = 'NF' + final_results[key][assembler]['unmapped_illumina_pc'] = 'NF' + + # busco results + if key + '_' + assembler in busco_dict.keys(): + final_results[key][assembler]['busco'] = busco_dict[key + '_' + assembler] + elif key + '_' + assembler + '_reoriented' in busco_dict.keys(): + final_results[key][assembler]['busco'] = busco_dict[key + '_' + assembler + '_reoriented'] else: - final_results[key][assembler]['Consensus_QV_Before_Polishing'] = 'NF' + final_results[key][assembler]['busco'] = 'NF' + + if assembler != 'unicycler': + for step in ['polypolish', 'pypolca', 'medaka']: + if key + '_' + assembler + '_' + step in busco_dict.keys(): + final_results[key][assembler]['busco_' + step ] = busco_dict[key + '_' + assembler + '_' + step] + else: + final_results[key][assembler]['busco_' + step ] = 'NF' + + # pypolca results + if key + '_' + assembler in pypolca_dict.keys(): + if 'Consensus_Quality_Before_Polishing' in pypolca_dict[key + '_' + assembler].keys(): + final_results[key][assembler]['Consensus_Quality_Before_Polishing'] = float(pypolca_dict[key + '_' + assembler]['Consensus_Quality_Before_Polishing']) + else: + final_results[key][assembler]['Consensus_Quality_Before_Polishing'] = 'NF' + if 'Consensus_QV_Before_Polishing' in pypolca_dict[key + '_' + assembler].keys(): + final_results[key][assembler]['Consensus_QV_Before_Polishing'] = float(pypolca_dict[key + '_' + assembler]['Consensus_QV_Before_Polishing']) + else: + final_results[key][assembler]['Consensus_QV_Before_Polishing'] = 'NF' - elif assembler != 'unicycler': - final_results[key][assembler]['Consensus_Quality_Before_Polishing'] = 0.0 - final_results[key][assembler]['Consensus_QV_Before_Polishing'] = 0.0 + elif assembler != 'unicycler': + final_results[key][assembler]['Consensus_Quality_Before_Polishing'] = 0.0 + final_results[key][assembler]['Consensus_QV_Before_Polishing'] = 0.0 final_file(final_results) tsv_file(final_results) @@ -1537,8 +1527,6 @@ process unicycler { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ - mkdir -p unicycler - unicycler ${args} \ -1 ${illumina[0]} \ -2 ${illumina[1]} \ @@ -1721,7 +1709,7 @@ workflow DONUT_FALLS { ch_illumina_input .join(mash.out.dist, by: 0) - .filter{it[2] as float < 0.05} + .filter{it[2] as float < 0.5} .map{it -> tuple(it[0], it[1])} .set {ch_dist_filter} @@ -1737,8 +1725,8 @@ workflow DONUT_FALLS { if (params.assembler.replaceAll('dragonflye','dragon') =~ /flye/ || params.assembler =~ /raven/ ) { // quality filter ch_dist_filter - .map { it -> [it[0], it[1], "illumina"]} - .mix(ch_nanopore_input.map { it -> [it[0], it[1], "nanopore"]}) + .map { it -> [it[0], it[1]]} + .mix(ch_nanopore_input.map { it -> [it[0], it[1]]}) .filter{it[0]} .set { ch_input } @@ -1748,14 +1736,15 @@ workflow DONUT_FALLS { ch_summary = ch_summary.mix(fastp.out.summary) fastp.out.fastq - .filter { it[1].size() > 200 } .branch { it -> - nanopore: it[2] == 'nanopore' - illumina: it[2] == 'illumina' + illumina: it[1].toList().size() == 2 + nanopore: it[1].size() > 200 } .set { ch_filter } - rasusa(ch_filter.nanopore.map {it -> tuple(it[0], it[1])}) + ch_filter.nanopore.view{it[1].toList().size()} + + rasusa(ch_filter.nanopore) ch_versions = ch_versions.mix(rasusa.out.versions) @@ -1870,7 +1859,7 @@ workflow DONUT_FALLS { ch_consensus = ch_consensus.mix(dnaapler.out.fasta).mix(medaka.out.fasta).mix(polypolish.out.fasta).mix(pypolca.out.fasta) } - nanoplot(ch_nanopore_input) + nanoplot(ch_nanopore_input.mix(ch_illumina_input.filter{it[1]})) nanoplot.out.summary .collectFile(name: "nanoplot_summary.csv", diff --git a/nextflow.config b/nextflow.config index 6f5f572..a2f2775 100644 --- a/nextflow.config +++ b/nextflow.config @@ -4,7 +4,7 @@ manifest { author = 'Erin Young' homePage = 'https://github.com/UPHL-BioNGS/Donut_Falls' description = "De novo assembly of long-reads" - version = '1.7.24198' + version = '1.8.24207' nextflowVersion = '!>=22.10.1' defaultBranch = 'main' }