Merge pull request #1953 from nickeener/mummer4

Add Mummer4 tools
galaxyproject · Dec 4, 2018 · 9c835e6 · 9c835e6
2 parents 5b6dc96 + ab9c3b9
commit 9c835e6
Show file tree

Hide file tree

Showing 28 changed files with 1,459 additions and 0 deletions.
diff --git a/tools/mummer4/.shed.yml b/tools/mummer4/.shed.yml
@@ -0,0 +1,17 @@
+name: mummer4
+owner: iuc
+homepage_url: https://github.com/mummer4/mummer
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/mummer4
+description: Mummer4 Tools
+long_description: |
+  MUMmer is a system for rapidly aligning entire genomes. The current version (release 4.x) can find all 20 base pair maximal exact matches between two bacterial genomes of ~5 million base pairs each in 20 seconds, using 90 MB of memory, on a typical 1.8 GHz Linux desktop computer.
+categories:
+  - Sequence Analysis
+type: unrestricted
+auto_tool_repositories:
+  name_template: "{{ tool_id }}"
+  description_template: "Wrapper for the MUMmer4 tool: {{ tool_name }}."
+suite:
+  name: "suite_mummer4"
+  description: "A suite of MUMmer4 tools."
+  type: repository_suite_definition  # nested under suite so it does not collide with the top-level type key
diff --git a/tools/mummer4/delta-filter.xml b/tools/mummer4/delta-filter.xml
@@ -0,0 +1,81 @@
+<tool id="mummer_delta_filter" name="Delta-Filter" version="@MUMMER_VERSION@">
+    <description>Filters alignment (delta) file from nucmer</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <command detect_errors="exit_code">
+        <![CDATA[
+        delta-filter
+            $alignment
+            -i '$min_identity'
+            -l '$min_length'
+            $overlap
+            -u '$min_uniqueness'
+            -o '$max_overlap'
+            '$delta' > '$output'
+        ]]>
+    </command>
+    <inputs>
+        <param name="delta" type="data" format="tabular" label="Match file from nucmer" />
+        <param name="alignment" type="select" label="Alignment Strategy" >
+            <option value="-m">Use default [Many-to-many alignment allowing for rearrangements] (-m)</option>
+            <option value="-1">1-to-1 alignment allowing for rearrangements (-1)</option>
+            <option value="-g">1-to-1 global alignment not allowing rearrangements (-g)</option>
+        </param>
+        <param name="min_identity" type="float" argument="-i" value="0" min="0" max="100" label="Minimum Identity" help="Set the minimum alignment identity. (-i)" />
+        <param name="min_length" type="integer" argument="-l" value="0" min="0" label="Minimum Length" help="Set the minimum alignment length. (-l)" />
+        <param name="overlap" type="select" label="Overlaps" help="Maps each position of each query/reference to its best hit in the reference/query, allowing for reference/query overlaps." >
+            <option value="-q">Reference overlaps (-q)</option>
+            <option value="-r">Query overlaps (-r)</option>
+        </param>
+        <param name="min_uniqueness" type="float" argument="-u" value="0" min="0" max="100" label="Minimum Alignment Uniqueness"
+            help="Set the minimum alignment uniqueness, i.e. percent of the alignment matching to unique reference AND query sequence. (-u)" />
+        <param name="max_overlap" type="float" argument="-o" value="100" min="0" max="100" label="Maximum Alignment Overlap"
+            help="Set the maximum alignment overlap for overlap options as a percent of the alignment length. (-o)" />
+    </inputs>
+    <!-- The command redirects delta-filter's stdout straight into the output dataset, so no from_work_dir is used. -->
+    <outputs>
+        <data name="output" format="tabular" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="delta" ftype="tabular" value="nucmer.txt" />
+            <output name="output" ftype="tabular" compare="diff" lines_diff="2" value="delta-filter.txt" />
+        </test>
+    </tests>
+    <help><![CDATA[
+This program filters the alignment file produced by nucmer, leaving only the desired alignments. Its primary function is the LIS algorithm which calculates the longest increasing subset of alignments. This allows for the calculation of a global set of alignments (i.e. 1-to-1 and mutually consistent order) with the 1-1 global option or locally consistent with 1-1 with rearrangements or many-to-many alignment. Reference sequences can be mapped to query sequences with the reference option of the Overlaps parameter, or queries to references with the Query option. This allows the user to exclude chance and repeat induced alignments, leaving only the "best" alignments between the two data sets. Filtering can also be performed on length, identity, and uniqueness.
+
+An important distinction between the alignment options is that 1-1 global requires the alignments to be mutually consistent in their order, while the other options are not required to be mutually consistent and therefore tolerate translocations, inversions, etc.
+
+In general cases, the many-to-many option is the best choice, however 1-1 alignment allowing for rearrangements can be handy for applications such as SNP finding which require a 1-to-1 mapping.
+
+Finally, for mapping query contigs, or sequencing reads, to a reference genome, use the query option for the Overlaps parameter.
+
+**Options**::
+
+    -m      Many-to-many alignment allowing for rearrangements
+
+    -1      1-to-1 alignment allowing for rearrangements
+
+    -g      1-to-1 global alignment not allowing rearrangements
+
+    -i      Set the minimum alignment identity [0, 100], default 0
+
+    -l      Set the minimum alignment length, default 0
+
+    -q      Maps each position of each query to its best hit in the reference, allowing for reference
+            overlaps
+
+    -r      Maps each position of each reference to its best hit in the query, allowing for query overlaps
+
+    -u      Set the minimum alignment uniqueness, i.e. percent of the alignment matching to unique reference
+            AND query sequence [0,100], default 0
+
+    -o      Set the maximum alignment overlap for -r and -q options as a percent of the alignment length
+            [0, 100], default 100
+
+    ]]></help>
+    <expand macro="citation" />
+</tool>
diff --git a/tools/mummer4/dnadiff.xml b/tools/mummer4/dnadiff.xml
@@ -0,0 +1,98 @@
+<tool id="mummer_dnadiff" name="DNAdiff" version="@MUMMER_VERSION@">
+    <description>Evaluate similarities/differences between two sequences</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <!-- Inputs are symlinked to fixed names; dnadiff runs without a prefix option and its out.* files are collected below. -->
+    <command detect_errors="exit_code">
+        <![CDATA[
+        ln -s '$reference_sequence' reference.fa &&
+        ln -s '$query_sequence' query.fa &&
+        dnadiff
+            'reference.fa' 'query.fa'
+        ]]>
+    </command>
+    <inputs>
+        <param name="reference_sequence" type="data" format="fasta" label="Reference Sequence" help="FastA or multi-FastA" />
+        <param name="query_sequence" type="data" format="fasta" label="Query Sequence(s)" help="FastA or multi-FastA" />
+        <param name="report_only" type="select" label="Output only the general report file?" help="Select no to output all output files">
+            <option value="yes">YES</option>
+            <option value="no">NO</option>
+        </param>
+    </inputs>
+    <!-- The report is always collected; every other out.* file is kept only when report_only is 'no'. -->
+    <outputs>
+        <data name="report" format="txt" from_work_dir="out.report" label="${tool.name} on ${on_string}: report" />
+        <data name="delta" format="tabular" from_work_dir="out.delta" label="${tool.name} on ${on_string}: delta">
+            <filter>report_only == 'no'</filter>
+        </data>
+        <data name="1delta" format="tabular" from_work_dir="out.1delta" label="${tool.name} on ${on_string}: 1delta" >
+            <filter>report_only == 'no'</filter>
+        </data>
+        <data name="mdelta" format="tabular" from_work_dir="out.mdelta" label="${tool.name} on ${on_string}: mdelta" >
+            <filter>report_only == 'no'</filter>
+        </data>
+        <data name="1coords" format="tabular" from_work_dir="out.1coords" label="${tool.name} on ${on_string}: 1coords" >
+            <filter>report_only == 'no'</filter>
+            <actions>
+                <action name="column_names" type="metadata" default="[S1], [E1], [S2], [E2], [LEN 1], [LEN 2], [% IDY], [LEN R], [LEN Q], [COV R], [COV Q], [REF TAG], [QUERY TAG]" />
+            </actions>
+        </data>
+        <data name="mcoords" format="tabular" from_work_dir="out.mcoords" label="${tool.name} on ${on_string}: mcoords" >
+            <filter>report_only == 'no'</filter>
+            <actions>
+                <action name="column_names" type="metadata" default="[S1], [E1], [S2], [E2], [LEN 1], [LEN 2], [% IDY], [LEN R], [LEN Q], [COV R], [COV Q], [REF TAG], [QUERY TAG]" />
+            </actions>
+        </data>
+        <data name="snps" format="tabular" from_work_dir="out.snps" label="${tool.name} on ${on_string}: snps" >
+            <filter>report_only == 'no'</filter>
+            <actions>
+                <action name="column_names" type="metadata" default="[P1], [REF SUB], [QUERY SUB], [P2], [BUFF], [DIST], [LEN REF], [LEN QUERY], [REF FRAME], [QUERY FRAME], [REF TAG], [QUERY TAG]" />
+            </actions>
+        </data>
+        <data name="rdiff" format="tabular" from_work_dir="out.rdiff" label="${tool.name} on ${on_string}: rdiff" >
+            <filter>report_only == 'no'</filter>
+            <actions>
+                <action name="column_names" type="metadata" default="Seq ID, Feature Type, Feature Start, Feature End, Feature Length" />
+            </actions>
+        </data>
+        <data name="qdiff" format="tabular" from_work_dir="out.qdiff" label="${tool.name} on ${on_string}: qdiff" >
+            <filter>report_only == 'no'</filter>
+            <actions>
+                <action name="column_names" type="metadata" default="Seq ID, Feature Type, Feature Start, Feature End, Feature Length" />
+            </actions>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="reference_sequence" ftype="fasta" value="human_aqp3.fasta" />
+            <param name="query_sequence" ftype="fasta" value="mouse_aqp3.fasta"/>
+            <param name="report_only" value="no" />
+            <output name="report" ftype="txt" compare="diff" sort="true" lines_diff="2" value="report.txt"/>
+            <output name="delta" ftype="tabular" compare="diff" lines_diff="2" value="delta.txt"/>
+            <output name="1delta" ftype="tabular" compare="diff" lines_diff="2" value="1delta.txt"/>
+            <output name="mdelta" ftype="tabular" compare="diff" lines_diff="2" value="mdelta.txt"/>
+            <output name="1coords" ftype="tabular" compare="diff" value="1coords.txt"/>
+            <output name="mcoords" ftype="tabular" compare="diff" value="mcoords.txt"/>
+            <output name="snps" ftype="tabular" compare="diff" value="snps.txt"/>
+            <output name="rdiff" ftype="tabular" compare="diff" value="rdiff.txt"/>
+            <output name="qdiff" ftype="tabular" compare="diff" value="qdiff.txt"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+This script is a wrapper around nucmer that builds an alignment using default parameters, and runs many of nucmer's helper scripts to process the output and report alignment statistics, SNPs, breakpoints, etc. It is designed for evaluating the sequence and structural similarity of two highly similar sequence sets. E.g. comparing two different assemblies of the same organism, or comparing two strains of the same species.
+
+**Output files:**
+    * report: Summary of alignments, differences and SNPs
+    * delta: Standard nucmer alignment output
+    * 1delta: 1-to-1 alignment from delta-filter -1
+    * mdelta: M-to-M alignment from delta-filter -m
+    * 1coords: 1-to-1 coordinates from show-coords -THrcl .1delta
+    * mcoords: M-to-M coordinates from show-coords -THrcl .mdelta
+    * snps: SNPs from show-snps -rlTHC .1delta
+    * rdiff: Classified ref breakpoints from show-diff -rH .mdelta
+    * qdiff: Classified qry breakpoints from show-diff -qH .mdelta
+    ]]></help>
+    <expand macro="citation" />
+</tool>
diff --git a/tools/mummer4/macros.xml b/tools/mummer4/macros.xml
@@ -0,0 +1,69 @@
+<macros>
+    <xml name="citation">
+        <citations>
+            <citation type="bibtex">
+                @misc{githubmummer,
+                author = {Art Delcher and Stefan Kurtz and Adam Phillippy and Steven Salzberg},
+                year = {2012},
+                title = {mummer4},
+                publisher = {GitHub},
+                journal = {GitHub repository},
+                url = {https://github.com/mummer4/mummer},
+            }</citation>
+        </citations>
+    </xml>
+    <token name="@MUMMER_VERSION@">4.0.0beta2</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@MUMMER_VERSION@">mummer4</requirement>
+            <yield />
+        </requirements>
+    </xml>
+    <!-- Shared plotting parameters for tools that expand this macro; tool-specific inputs are inserted first via yield. -->
+    <xml name="mumplot_input" >
+        <yield />
+        <param name="breaklen" type="integer" argument="-b" value="20" label="Break Length" help="Highlight alignments with breakpoints further than breaklen nucleotides from the nearest sequence end. (-b)" />
+        <!-- NOTE(review): the "No color" option passes -color; mummerplot's usage text describes a [no]color switch, so confirm this flag really turns color off. -->
+        <param name="color" type="select" label="Color" help="Color plot lines with a percent similarity gradient or turn off all plot color." >
+            <option value="">Color</option>
+            <option value="-color">No color (-color)</option>
+        </param>
+        <param name="coverage" type="select" label="Coverage Plot" help="Generate a reference coverage plot (default for .tiling) or the default dotplot." >
+            <option value="">Dotplot</option>
+            <option value="-c">Coverage Plot (-c)</option>
+        </param>
+        <param name="filter" type="boolean" argument="--filter" truevalue="--filter" falsevalue="" label="Filter" help="Only display .delta alignments which represent the 'best' hit to any particular spot on either sequence, i.e. a one-to-one mapping of reference and query subsequences. (--filter)" />
+        <param name="fat" type="boolean" argument="--fat" truevalue="--fat" falsevalue="" label="Layout sequences using fattest alignment only" help="(--fat)" />
+        <conditional name="labels" >
+            <param name="IDs" type="select" label="Plot a particular reference or query sequence?" help="For alignments that used more than one reference/query." >
+                <option value="no">NO</option>
+                <option value="yes">YES</option>
+            </param>
+            <when value="yes" >
+                <param name="ref_id" type="text" value="ref_id" label="Reference sequence ID" help="(-IdR)" />
+                <param name="query_id" type="text" value="query_id" label="Query sequence ID" help="(-IdQ)" />
+            </when>
+            <when value="no" />
+        </conditional>
+        <param name="size" type="select" label="Plot Size" help="Set the output size to small, medium or large. (-s)" >
+            <option value="small">Small</option>
+            <option value="medium">Medium</option>
+            <option value="large">Large</option>
+        </param>
+        <param name="snp" type="boolean" argument="--SNP" truevalue="--SNP" falsevalue="" label="SNPs" help="Highlight SNP locations in each alignment. (--SNP)" />
+        <param name="title" type="text" argument="-title" value="Title" label="Plot Title" help="(-title)" />
+        <conditional name="range" >
+            <param name="custom" type="select" label="Choose custom X and Y axis ranges?" >
+                <option value="no">NO</option>
+                <option value="yes">YES</option>
+            </param>
+            <when value="yes" >
+                <param name="min_x" type="integer" argument="-x" value="0" label="Minimum X-axis range" help="(-x)" />
+                <param name="max_x" type="integer" argument="-x" value="100" label="Maximum X-axis range" help="(-x)" />
+                <param name="min_y" type="integer" argument="-y" value="0" label="Minimum Y-axis range" help="(-y)" />
+                <param name="max_y" type="integer" argument="-y" value="100" label="Maximum Y-axis range" help="(-y)" />
+            </when>
+            <when value="no" />
+        </conditional>
+    </xml>
+</macros>