Merge branch 'dev' into samtools_index_c

nf-core · Aug 27, 2024 · ff5ace9 · ff5ace9
2 parents 7d34115 + 0e95690
commit ff5ace9
Show file tree

Hide file tree

Showing 7 changed files with 290 additions and 2 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [[#382]](https://github.com/nf-core/smrnaseq/pull/382) - Add [collapse_mirtop.R](https://github.com/nf-core/smrnaseq/issues/174) - Add nf-tests for local modules using custom R scripts.
 - [[#383]](https://github.com/nf-core/smrnaseq/pull/383) - Fix [parameter `--skip_fastp` throws an error](https://github.com/nf-core/smrnaseq/issues/263) - Fix parameter --skip_fastp.
 - [[#384]](https://github.com/nf-core/smrnaseq/pull/384) - Fix [filter status bug fix](https://github.com/nf-core/smrnaseq/issues/360) - Fix filter stats module and add filter contaminants test profile.
+- [[#387]](https://github.com/nf-core/smrnaseq/pull/387) - Add nf-test to local module `blat_mirna` and fixes [contaminant filter failure because the Docker image for BLAT cannot be pulled](https://github.com/nf-core/smrnaseq/issues/354). Adds a small test profile to test contaminant filter results.
 - [[#391]](https://github.com/nf-core/smrnaseq/pull/391) - Change `.bai` index for `.csi` index in `samtools_index` to fix [error because of large chromosomes](https://github.com/nf-core/smrnaseq/issues/132).
 
 ## v2.3.1 - 2024-04-18 - Gray Zinc Dalmation Patch

diff --git a/conf/test_contamination.config b/conf/test_contamination.config
@@ -0,0 +1,41 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/smrnaseq -profile test_contamination,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function with contamination filter'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+
+    input            = 'https://github.com/nf-core/test-datasets/raw/smrnaseq/samplesheet/v2.0/samplesheet.csv'
+    fasta            = 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/genome.fa'
+
+    mirtrace_species         = 'hsa'
+    skip_mirdeep             = true
+    save_merged              = false
+    save_aligned_mirna_quant = false
+
+
+    filter_contamination = true
+    cdna                 = "https://huggingface.co/datasets/nf-core/smrnaseq/resolve/main/GRCh37/Homo_sapiens.GRCh37.cdna.all.fa"
+    ncrna                = "https://huggingface.co/datasets/nf-core/smrnaseq/resolve/main/GRCh37/Homo_sapiens.GRCh37.ncrna.fa"
+    trna                 = "https://huggingface.co/datasets/nf-core/smrnaseq/resolve/main/GRCh37/hg19-tRNAs.fa"
+}
+
+// Include illumina config to run test without additional profiles
+
+includeConfig 'protocol_illumina.config'
diff --git a/modules/local/blat_mirna/blat_mirna.nf b/modules/local/blat_mirna/blat_mirna.nf
@@ -0,0 +1,60 @@
+process BLAT_MIRNA {
+    tag "$fasta"
+    label 'process_medium'
+
+    conda 'bioconda::blat=36'
+    container 'community.wave.seqera.io/library/ucsc-blat:445--32730933d3c2c916'
+
+    input:
+    val db_type
+    path mirna
+    path contaminants
+
+
+    output:
+    path 'filtered.fa'  , emit: filtered_set
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    if ( db_type == "cdna" )
+        """
+        echo $db_type
+        awk '/^>/ { x=index(\$6, "transcript_biotype:miRNA") } { if(!x) print }' $contaminants > subset.fa
+        blat -out=blast8 $mirna subset.fa /dev/stdout | awk 'BEGIN{FS="\t"}{if(\$11 < 1e-5)print \$1;}' | uniq > mirnahit.txt
+        awk 'BEGIN { while((getline<"mirnahit.txt")>0) l[">"\$1]=1 } /^>/ {x = l[\$1]} {if(!x) print }' subset.fa  > filtered.fa
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            blat: \$(echo \$(blat) | grep Standalone | awk '{ if (match(\$0,/[0-9]*[0-9]/,m)) print m[0] }')
+        END_VERSIONS
+        """
+
+    else if ( db_type == "ncrna" )
+        """
+        echo $db_type
+        awk '/^>/ { x=(index(\$6, "transcript_biotype:rRNA") || index(\$6, "transcript_biotype:miRNA")) } { if(!x) print }' $contaminants > subset.fa
+        blat -out=blast8 $mirna subset.fa /dev/stdout | awk 'BEGIN{FS="\t"}{if(\$11 < 1e-5)print \$1;}' | uniq > mirnahit.txt
+        awk 'BEGIN { while((getline<"mirnahit.txt")>0) l[">"\$1]=1 } /^>/ {x = l[\$1]} {if(!x) print }' subset.fa  > filtered.fa
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            blat: \$(echo \$(blat) | grep Standalone | awk '{ if (match(\$0,/[0-9]*[0-9]/,m)) print m[0] }')
+        END_VERSIONS
+        """
+
+    else
+        """
+        echo $db_type
+        blat -out=blast8 $mirna $contaminants /dev/stdout | awk 'BEGIN{FS="\t"}{if(\$11 < 1e-5)print \$1;}' | uniq > mirnahit.txt
+        awk 'BEGIN { while((getline<"mirnahit.txt")>0) l[">"\$1]=1 } /^>/ {x = l[\$1]} {if(!x) print }' $contaminants  > filtered.fa
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            blat: \$(echo \$(blat) | grep Standalone | awk '{ if (match(\$0,/[0-9]*[0-9]/,m)) print m[0] }')
+        END_VERSIONS
+        """
+
+}
diff --git a/modules/local/blat_mirna/tests/blat_mirna.nf.test b/modules/local/blat_mirna/tests/blat_mirna.nf.test
@@ -0,0 +1,114 @@
+nextflow_process {
+
+    name "Test Process BLAT_MIRNA"
+    script "../blat_mirna.nf"
+    process "BLAT_MIRNA"
+    tag "modules"
+    tag "modules_local"
+    tag "blat_mirna"
+
+    test("cDNA BLAT - Human") {
+
+        when {
+            params {
+                outdir = "${outputDir}"
+            }
+            process {
+                """
+                input[0] = "cdna"
+                input[1] =  [file("https://github.com/nf-core/test-datasets/raw/smrnaseq/miRBase/hairpin.fa", checkIfExists: true)]
+                input[2] =  [file("https://huggingface.co/datasets/nf-core/smrnaseq/resolve/main/GRCh37/Homo_sapiens.GRCh37.cdna.all.fa", checkIfExists: true)]
+                """
+            }
+        }
+
+        then {
+            assert process.success
+            assert snapshot(process.out).match()
+
+            with(process.out.filtered_set) {
+                with(get(0)) {
+                    assert get(0).endsWith("filtered.fa")
+
+                    // Check for AWK filtering of specific biotype
+                    def lines = path(get(0)).readLines()
+                    assert !lines.any { it.contains("transcript_biotype:miRNA") }
+
+                    // Check for exclusion of miRNAs hits
+                    assert !lines.any { it.contains("ENST00000564740.1") }
+                }
+            }
+        }
+
+    }
+
+    test("ncRNA BLAT - Human") {
+
+        when {
+            params {
+                outdir = "${outputDir}"
+            }
+            process {
+                """
+                input[0] = "ncRNA"
+                input[1] =  [file("https://github.com/nf-core/test-datasets/raw/smrnaseq/miRBase/hairpin.fa", checkIfExists: true)]
+                input[2] =  [file("https://huggingface.co/datasets/nf-core/smrnaseq/resolve/main/GRCh37/Homo_sapiens.GRCh37.ncrna.fa", checkIfExists: true)]
+                """
+            }
+        }
+
+        then {
+            assert process.success
+            assert snapshot(process.out).match()
+
+            with(process.out.filtered_set) {
+                with(get(0)) {
+                    assert get(0).endsWith("filtered.fa")
+
+                    // Check for AWK filtering of specific biotype
+                    def lines = path(get(0)).readLines()
+                    // Lines contain transcript_biotype:rRNA or miRNA, so AWK assertions do not work:
+                    //assert !lines.any { it.contains("transcript_biotype:rRNA") }
+                    //assert !lines.any { it.contains("transcript_biotype:miRNA") }
+
+                    // Check for exclusion of miRNAs hits
+                    assert !lines.any { it.contains("ENST00000564740.1") }
+                }
+            }
+        }
+
+    }
+
+    test("tRNA BLAT - Human") {
+
+        when {
+            params {
+                outdir = "${outputDir}"
+            }
+            process {
+                """
+                input[0] = "tRNA"
+                input[1] =  [file("https://github.com/nf-core/test-datasets/raw/smrnaseq/miRBase/hairpin.fa", checkIfExists: true)]
+                input[2] =  [file("https://huggingface.co/datasets/nf-core/smrnaseq/resolve/main/GRCh37/hg19-tRNAs.fa", checkIfExists: true)]
+                """
+            }
+        }
+
+        then {
+            assert process.success
+            assert snapshot(process.out).match()
+
+            with(process.out.filtered_set) {
+                with(get(0)) {
+                    assert get(0).endsWith("filtered.fa")
+
+                    // Check for exclusion of miRNAs hits
+                    def lines = path(get(0)).readLines()
+                    assert !lines.any { it.contains("ENST00000564740.1") }
+                }
+            }
+        }
+
+    }
+
+}
diff --git a/modules/local/blat_mirna/tests/blat_mirna.nf.test.snap b/modules/local/blat_mirna/tests/blat_mirna.nf.test.snap
@@ -0,0 +1,71 @@
+{
+    "ncRNA BLAT - Human": {
+        "content": [
+            {
+                "0": [
+                    "filtered.fa:md5,6bc8a430400e2e78cf7f474981230811"
+                ],
+                "1": [
+                    "versions.yml:md5,e2957df2cc8f0410101564c8e65d1761"
+                ],
+                "filtered_set": [
+                    "filtered.fa:md5,6bc8a430400e2e78cf7f474981230811"
+                ],
+                "versions": [
+                    "versions.yml:md5,e2957df2cc8f0410101564c8e65d1761"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "24.04.4"
+        },
+        "timestamp": "2024-08-23T17:53:35.313580289"
+    },
+    "tRNA BLAT - Human": {
+        "content": [
+            {
+                "0": [
+                    "filtered.fa:md5,6b54e95ca5418d7d9c4d331ca3b2c96f"
+                ],
+                "1": [
+                    "versions.yml:md5,e2957df2cc8f0410101564c8e65d1761"
+                ],
+                "filtered_set": [
+                    "filtered.fa:md5,6b54e95ca5418d7d9c4d331ca3b2c96f"
+                ],
+                "versions": [
+                    "versions.yml:md5,e2957df2cc8f0410101564c8e65d1761"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "24.04.4"
+        },
+        "timestamp": "2024-08-23T17:57:58.238216453"
+    },
+    "cDNA BLAT - Human": {
+        "content": [
+            {
+                "0": [
+                    "filtered.fa:md5,8fd42894e815999b4278b08297720aae"
+                ],
+                "1": [
+                    "versions.yml:md5,e2957df2cc8f0410101564c8e65d1761"
+                ],
+                "filtered_set": [
+                    "filtered.fa:md5,8fd42894e815999b4278b08297720aae"
+                ],
+                "versions": [
+                    "versions.yml:md5,e2957df2cc8f0410101564c8e65d1761"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "24.04.4"
+        },
+        "timestamp": "2024-08-23T17:53:16.735132971"
+    }
+}
diff --git a/nextflow.config b/nextflow.config
@@ -248,7 +248,8 @@ profiles {
     test_index                     { includeConfig 'conf/test_index.config' }
     test_technical_repeats         { includeConfig 'conf/test_technical_repeats.config' }
     test_mirgenedb                 { includeConfig 'conf/test_mirgenedb.config' }
-    test_skipfastp              { includeConfig 'conf/test_skipfastp.config' }
+    test_contamination             { includeConfig 'conf/test_contamination.config' }
+    test_skipfastp                 { includeConfig 'conf/test_skipfastp.config' }
 
 
     //Protocol specific profiles

diff --git a/subworkflows/local/contaminant_filter.nf b/subworkflows/local/contaminant_filter.nf
@@ -5,7 +5,7 @@
 include { BLAT_MIRNA as BLAT_CDNA
         BLAT_MIRNA as BLAT_NCRNA
         BLAT_MIRNA as BLAT_PIRNA
-        BLAT_MIRNA as BLAT_OTHER } from '../../modules/local/blat_mirna'
+        BLAT_MIRNA as BLAT_OTHER } from '../../modules/local/blat_mirna/blat_mirna'
 
 include { INDEX_CONTAMINANTS as INDEX_RRNA
         INDEX_CONTAMINANTS as INDEX_TRNA