merge dev

geraldinepascal · May 19, 2021 · 85ff718 · 85ff718
2 parents 6e7d406 + d3013af
commit 85ff718
Show file tree

Hide file tree

Showing 52 changed files with 273 additions and 55 deletions.
diff --git a/INSTALL_from_source.md b/INSTALL_from_source.md
@@ -7,7 +7,7 @@ It has been tested on a Xubuntu 16.04 virtual machine.
 Here we suppose to install dependencies in the same directory as FROGS.
 
 ```bash
-version=3.2.2
+version=3.2.3
 DIR=`pwd`
 BIN_DIR=$DIR/bin
 mkdir -p $BIN_DIR

diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@
 
 Visit our web site : http://frogs.toulouse.inrae.fr/
 
-[![Release](https://img.shields.io/badge/release-3.2.2-blue.svg)![Date](https://img.shields.io/badge/date-April%202021-red.svg)](https://github.com/geraldinepascal/FROGS-wrappers/releases) [<img src="https://www.podcastscience.fm/wp-content/uploads/2017/12/deezer.png" width="5%" style="display: block; margin: auto;"/>](https://www.deezer.com/fr/playlist/5233843102?utm_source=deezer&utm_content=playlist-5233843102&utm_term=18632989_1545296531&utm_medium=web)
+[![Release](https://img.shields.io/badge/release-3.2.3-blue.svg)![Date](https://img.shields.io/badge/date-May%202021-red.svg)](https://github.com/geraldinepascal/FROGS-wrappers/releases) [<img src="https://www.podcastscience.fm/wp-content/uploads/2017/12/deezer.png" width="5%" style="display: block; margin: auto;"/>](https://www.deezer.com/fr/playlist/5233843102?utm_source=deezer&utm_content=playlist-5233843102&utm_term=18632989_1545296531&utm_medium=web)
 
 
 
@@ -132,9 +132,9 @@ FROGS is now available on bioconda (https://anaconda.org/bioconda/frogs).
   * to create a specific environment for a specific FROGS version
 
 ```
-conda env create --name frogs@3.2.2 --file frogs-conda-requirements.yaml
+conda env create --name frogs@3.2.3 --file frogs-conda-requirements.yaml
 # to use FROGS, first you need to activate your environment
-conda activate frogs@3.2.2
+conda activate frogs@3.2.3
 ```
 
 ### From source
@@ -146,7 +146,7 @@ see [INSTALL_from_source.md](INSTALL_from_source.md)
 To check your installation you can type:
 ```
 cd <FROGS_PATH>/test
-# when using conda FROGS_PATH=<conda_env_dir>/frogs@3.2.2/share/FROGS_3.2.2
+# when using conda FROGS_PATH=<conda_env_dir>/frogs@3.2.3/share/FROGS_3.2.3
 
 sh test.sh <FROGS_PATH> <NB_CPU> <JAVA_MEM> <OUT_FOLDER>
 ```

diff --git a/RELEASES_NOTES.md b/RELEASES_NOTES.md
@@ -1,3 +1,9 @@
+# v3.2.3 [2021-05]
+
+### Bug fixed
+
+* DESeq2 visualisation : correctly identify name of reference condition
+
 # v3.2.2 [2021-04]
 
 ### Modifications

diff --git a/frogs-conda-requirements.yaml b/frogs-conda-requirements.yaml
@@ -3,7 +3,7 @@ channels:
   - bioconda
 dependencies:
 # bioconda
-  - frogs =3.2.2
+  - frogs =3.2.3
   - emboss =6.6
   - flash =1.2
   # need to be >=2.8

diff --git a/libexec/fasta2RDP.py b/libexec/fasta2RDP.py
@@ -0,0 +1,212 @@
+#!/usr/bin/env python3.7
+#
+# Copyright (C) 2014 INRA
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+#-*-coding:utf-8-*-
+__author__ = 'Frederic Escudie - Plateforme bioinformatique Toulouse'
+__copyright__ = 'Copyright (C) 2015 INRA'
+__license__ = 'GNU General Public License'
+__version__ = '0.1.0'
+__email__ = '[email protected]'
+__status__ = 'dev'
+
+
+import sys,os
+import random
+import argparse
+import numpy as np
+import unicodedata
+from frogsNode import *
+from frogsSequenceIO import *
+
+
+##################################################################################################################################################
+#
+# FUNCTIONS
+#
+##################################################################################################################################################
+def get_taxonomy( node ):
+    """
+    @summary: Returns the taxonomy of the node.
+    @param node: [Node] The node processed.
+    @return: [str] The taxonomy built from getCleanName() of each retained ancestor followed by the node itself. Each level is separated by ';'.
+    """
+    # The first element returned by get_ancestors() is skipped
+    # (presumably the root node - TODO confirm against frogsNode).
+    parent_taxonomy = ";".join([getCleanName(ancestor) for ancestor in node.get_ancestors()[1:]])
+    # No retained ancestor: the taxonomy is reduced to the node's own clean name.
+    if parent_taxonomy == "":
+        return getCleanName(node)
+    return parent_taxonomy + ";" + getCleanName(node)
+
+def get_cleaned_sp( species_name ):
+    """
+    @summary: Returns the species name without strain information.
+    @param species_name: [str] The standard name of the species (example: last field in taxonomy extract from the header fasta).
+    @return: [str] The cleaned species name: 'unknown species' for uncultured/undefined/unidentified/incertae sedis names, otherwise the name truncated by the first matching strain pattern, otherwise the name unchanged.
+    """
+    # NOTE(review): `re` is not imported explicitly in the visible import lines of this file;
+    # presumably it is provided by one of the star imports - confirm.
+    # NOTE(review): the patterns below are regular (non-raw) string literals containing
+    # backslash escapes; raw strings would be the usual form - confirm before changing.
+    new_species_name = species_name
+    # The "uncultured" prefix test is case-sensitive, whereas the exact-name test is done on the lower-cased name.
+    if species_name.startswith("uncultured") or species_name.lower() in ["undefined", "unidentified", "incertae sedis"]:
+        new_species_name = "unknown species"
+    else:
+        # Remove strain information
+        # Case 1: "<word> sp." followed by at least one more character -> keep only "<word> sp."
+        pattern = re.compile( '^([^\s]+ sp\.).+' )
+        matches = pattern.match( new_species_name )
+        if matches is not None:
+            new_species_name = matches.group(1)
+        else:
+            # Case 2: "<word> <word> subsp." -> keep only the two leading words
+            pattern = re.compile( '^([^\s]+ [^\s]+) subsp\.' )
+            matches = pattern.match( new_species_name )
+            if matches is not None:
+                new_species_name = matches.group(1)
+            else:
+                # Case 3: name ending with " DSM <digits>" or " DSMY <digits>" -> keep what precedes it
+                pattern = re.compile( '(.+) DSMY? \d+$' )
+                matches = pattern.match( new_species_name )
+                if matches is not None:
+                    new_species_name = matches.group(1)
+    return new_species_name
+
+
+def getCleanName(node):
+    """
+    @summary: Returns the display name of the node, made unique by appending its identifier.
+    @param node: [Node] The node processed. Its 'name' attribute and metadata["id"] are read.
+    @return: [str] The node name followed by ' [id: <id>]', or the bare name when the name is 'root' (case insensitive).
+    """
+    clean_name = node.name
+    # The root keeps its bare name; every other node gets its id appended.
+    if node.name.lower() != "root":
+        clean_name = node.name + " [id: " + str(node.metadata["id"]) + "]"
+    return clean_name
+
+
+def writeRDPTax( FH_tax, node ):
+    """
+    @summary: Writes the node and all its descendants, one line per node, parent before children. The output format is RDP training database taxonomy (RDPTools/classifier.jar train).
+    @param FH_tax: [File] The file handle on output file.
+    @param node: [Node] The node to write.
+    """
+    node_depth = node.get_depth()
+    parent_id = None
+    # A node at depth 0 has no parent to reference: -1 is written as its parent id.
+    if node_depth == 0:
+        parent_id = -1
+    else:
+        parent_id = node.parent.metadata["id"]
+    # Line format: taxid*taxon name*parent taxid*depth*rank
+    FH_tax.write( str(node.metadata["id"]) + '*' + getCleanName(node) + '*' + str(parent_id) + '*' + str(node_depth) + '*' + str(node.metadata["rank"]) + "\n" )
+    # Recurse on each child so the whole subtree is written.
+    for child in node.get_children():
+        writeRDPTax( FH_tax, child )
+
+
+def treeFromFasta( in_fasta, ranks ):
+    """
+    @summary: Builds a taxonomy tree from the descriptions of the records of a FASTA file.
+    @param in_fasta: [str] The FASTA file read with FastaIO. Each record description is split on ';' to get its taxonomy.
+    @param ranks: [list] The rank name of each taxonomy depth, or None to use the first character of each taxon as its rank.
+    @return: [Node] The root of the tree (name 'Root', id 0, rank 'rootrank').
+    """
+    tree = Node("Root", None, None, {"id":0, "rank":"rootrank"})
+    # Identifiers are assigned incrementally, in order of first appearance of each taxon.
+    current_id = 1
+    # NOTE(review): the handle is not closed if an exception is raised inside the loop.
+    FH_databank = FastaIO(in_fasta)
+    for record in FH_databank:
+        # Decompose characters (NFD) then drop everything that is not ASCII.
+        desc = unicodedata.normalize('NFD', record.description).encode('ascii', 'ignore').decode('utf-8')
+        # A trailing ';' would otherwise produce an empty last taxon.
+        if desc.endswith(";"):
+            desc=desc[:-1]
+        taxonomy = [taxa.strip() for taxa in desc.split(";")]
+        parent = tree
+        for rank_depth, taxa in enumerate(taxonomy):
+            if not ranks is None:
+                # NOTE(review): raises IndexError when a description has more levels than len(ranks).
+                rank = ranks[rank_depth]
+            else :
+                # NOTE(review): raises IndexError when the taxon is an empty string.
+                rank=taxa[0]
+            if not parent.has_child(taxa): 
+            # NOTE (original author, translated): problem at species level, because different unknown species of the same genus are given the same name
+                # taxa_node is never read afterwards; the node is retrieved below through parent.get_child(),
+                # so the Node constructor presumably registers itself on its parent - TODO confirm in frogsNode.
+                taxa_node = Node(taxa, parent, None, {"id":current_id, "rank":rank})
+                current_id += 1
+            parent = parent.get_child(taxa)
+    FH_databank.close()
+    return tree
+
+
+def mergeFastaTax( in_tax, in_fasta, out_fasta ):
+    """
+    @summary: Writes a copy of the FASTA file in which each record description is replaced by the taxonomy found for its ID in the taxonomy file.
+    @param in_tax: [str] The taxonomy file. Each line is split on its first run of whitespace into sequence ID and taxonomy.
+    @param in_fasta: [str] The FASTA file read with FastaIO.
+    @param out_fasta: [str] The FASTA file written with FastaIO.
+    """
+    tax_by_id = dict()
+    # Get taxonomies by sequence ID
+    # NOTE(review): no explicit encoding is given to open(); the platform default is used.
+    FH_tax = open(in_tax)
+    for line in FH_tax:
+        # NOTE(review): a line holding fewer than two fields (e.g. an empty line) raises ValueError here.
+        seq_id, taxonomy = line.strip().split(None, 1)
+        # Decompose characters (NFD) then drop everything that is not ASCII.
+        tax = unicodedata.normalize('NFD', taxonomy).encode('ascii', 'ignore').decode('utf-8')
+        # The taxonomy is stored without its trailing ';' if any.
+        if tax.endswith(";"):
+            tax_by_id[seq_id] = tax[:-1]
+        else:
+            tax_by_id[seq_id] = tax
+    FH_tax.close()
+    # Write fasta with taxonomy
+    FH_in_db = FastaIO(in_fasta)
+    FH_out_db = FastaIO(out_fasta, "wt")
+    for record in FH_in_db:
+        # NOTE(review): raises KeyError when a sequence ID is absent from the taxonomy file.
+        record.description = tax_by_id[record.id]
+        FH_out_db.write(record)
+    FH_in_db.close()
+    FH_out_db.close()
+
+
+def writeRDPFasta(tree, in_fasta, out_fasta):
+    """
+    @summary: Writes a copy of the FASTA file in which each record description is replaced by the ';'-joined clean names (see getCleanName) of the root and of each node on the record's taxonomy path.
+    @param tree: [Node] The root of the taxonomy tree. It is expected to contain every taxonomy path found in in_fasta (presumably the tree returned by treeFromFasta on the same file - confirm against caller).
+    @param in_fasta: [str] The FASTA file read with FastaIO. Each record description is split on ';' to get its taxonomy.
+    @param out_fasta: [str] The FASTA file written with FastaIO.
+    """
+    FH_in_db = FastaIO(in_fasta)
+    FH_out_db = FastaIO(out_fasta, "wt")
+    for record in FH_in_db:
+        # Same clean-up as the one applied when the tree is built: ASCII only, no trailing ';'.
+        desc = unicodedata.normalize('NFD', record.description).encode('ascii', 'ignore').decode('utf-8')
+        if desc.endswith(";"):
+            desc=desc[:-1]
+        taxonomy = [taxa.strip() for taxa in desc.split(";")]
+        # The new description always starts with the root name.
+        clean_taxonomy = [getCleanName(tree)]
+        node = tree
+        for taxa in taxonomy:
+            # NOTE(review): desc is already reduced to ASCII above, so this second normalization looks redundant - confirm.
+            tax = unicodedata.normalize('NFD', taxa).encode('ascii', 'ignore').decode('utf-8')
+            # Walk down the tree one level per taxon.
+            node = node.get_child(tax)
+            clean_taxonomy.append(getCleanName(node))
+        record.description = ";".join(clean_taxonomy)
+        FH_out_db.write(record)
+    FH_in_db.close()
+    FH_out_db.close()
+
+
+##################################################################################################################################################
+#
+# MAIN
+#
+##################################################################################################################################################
+if __name__ == "__main__":
+    # Manage parameters
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=("Prepare file for RDP classifier trainings. (NB : input file(s) must be utf-8 encoding)")
+    )
+    parser.add_argument( '-v', '--version', action='version', version=__version__ )
+    # With nargs='*' the ranks are collected in a list; args.ranks stays None when the option is omitted.
+    parser.add_argument( '-r', '--ranks' , nargs='*', help='The ordered ranks levels used in the metadata taxonomy. [Default: first letter of each taxon (usefull for Greengenes)]' ) 
+    group_input = parser.add_argument_group( 'Inputs' ) # Inputs
+    group_input.add_argument( '-d', '--databank', required=True, help='The reference databank (format: FASTA). Each sequence must have the same number of tacxonomy level and the header must have this format: "ID<TAB>TAX_LVL1;TAX_LVL2". or provide the taxonomy with --taxonomy option' )
+    group_input.add_argument( '-t', '--taxonomy', required=False, help='The reference databank (format: TSV). Each sequence must have the same number of taxonomy level and the header must have this format: "ID<TAB>TAX_LVL1;TAX_LVL2".' )
+    group_output = parser.add_argument_group( 'Outputs' ) # Outputs
+    # NOTE(review): both output options carry the same help text; --rdp-fasta receives the FASTA written by writeRDPFasta.
+    group_output.add_argument( '--rdp-taxonomy', required=True, help='The selected sequences (format: RDPTax).' )
+    group_output.add_argument( '--rdp-fasta', required=True, help='The selected sequences (format: RDPTax).' )
+    args = parser.parse_args()
+
+    # Pre-process
+    # When the taxonomy comes from a separate file, it is first merged into a temporary FASTA
+    # written next to the databank ("<databank>.tmp"); otherwise the databank is used directly.
+    db_with_tax = args.databank
+    if args.taxonomy is not None:
+        db_with_tax = args.databank+".tmp" #################################################
+        mergeFastaTax(args.taxonomy, args.databank, db_with_tax)
+
+    # Build tree
+    databank_tree = treeFromFasta(db_with_tax, args.ranks)
+
+    # Write RDP tax
+    FH_RDPTax = open(args.rdp_taxonomy, "wt")
+    writeRDPTax(FH_RDPTax, databank_tree)
+    FH_RDPTax.close()
+
+    # Write RDP fasta
+    writeRDPFasta(databank_tree, db_with_tax, args.rdp_fasta)
+
+	# remove tmp file
+    # NOTE(review): the temporary file is left on disk if one of the steps above raises.
+    if args.taxonomy is not None:
+        os.remove(args.databank+".tmp")
+
diff --git a/libexec/select_inclusive_amplicon.py b/libexec/select_inclusive_amplicon.py
@@ -143,7 +143,7 @@ def process(params) :
     for observation in biom_in.get_observations():
         nb_obs += 1
         # reduce multiaffiliations list
-        if len(observation['metadata']["blast_affiliations"]) > 1:
+        if observation['metadata']["blast_affiliations"] is not None and len(observation['metadata']["blast_affiliations"]) > 1:
             nb_obs_multi_affi +=1
             new_blast_affi = select_smallest(observation['metadata']['blast_affiliations'], ref_size)
             if len(new_blast_affi) < len(observation['metadata']['blast_affiliations']):

diff --git a/test/data/test_dataset.tar.gz b/test/data/test_dataset.tar.gz
diff --git a/test/test.sh b/test/test.sh
@@ -434,7 +434,7 @@ echo "Step phyloseq_clustering `date`"
 phyloseq_clustering.py  \
  --varExp EnvType \
  --rdata $out_dir/16-phylo_import.Rdata --distance-matrix $out_dir/unifrac.tsv \
- --html $out_dir/21-phylo_clutering.nb.html \
+ --html $out_dir/21-phylo_clustering.nb.html \
  --log-file $out_dir/21-phylo_clustering.log
 
 

diff --git a/test/test_dependancies.sh b/test/test_dependancies.sh
@@ -820,7 +820,7 @@ then
 	phyloseq_clustering.py  \
 	 --varExp EnvType \
 	 --rdata $expected_dir/16-phylo_import.Rdata --distance-matrix $expected_dir/unifrac.tsv \
-	 --html $out_dir/21-phylo_clutering.nb.html \
+	 --html $out_dir/21-phylo_clustering.nb.html \
 	 --log-file $out_dir/21-phylo_clustering.log
 
 

diff --git a/tools/affiliation_OTU/affiliation_OTU.py b/tools/affiliation_OTU/affiliation_OTU.py
@@ -19,7 +19,7 @@
 __author__ = 'Maria Bernard INRA - SIGENAE AND Frederic Escudie - Plateforme bioinformatique Toulouse'
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
-__version__ = '3.2.2'
+__version__ = '3.2.3'
 __email__ = '[email protected]'
 __status__ = 'prod'
 

diff --git a/tools/affiliation_OTU/affiliation_OTU_tpl.html b/tools/affiliation_OTU/affiliation_OTU_tpl.html
@@ -19,7 +19,7 @@
 	<head>
 		<title>FROGS Affiliation</title>
 		<meta charset="UTF-8">
-		<meta name="version" content="3.2.2">
+		<meta name="version" content="3.2.3">
 		<!-- CSS -->
 		<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.1.1/css/bootstrap.css"></link>
 		<link rel="stylesheet" href="https://cdn.datatables.net/1.10.19/css/dataTables.bootstrap4.min.css"></link>

diff --git a/tools/affiliation_filters/affiliation_filters.py b/tools/affiliation_filters/affiliation_filters.py
@@ -19,7 +19,7 @@
 __author__ = 'Katia Vidal - Team NED Toulouse AND Frederic Escudie - Plateforme bioinformatique Toulouse AND Maria Bernard - Sigenae Jouy en Josas'
 __copyright__ = 'Copyright (C) 2020 INRAE'
 __license__ = 'GNU General Public License'
-__version__ = '3.2.2'
+__version__ = '3.2.3'
 __email__ = '[email protected]'
 __status__ = 'prod'
 

diff --git a/tools/affiliation_filters/affiliation_filters_tpl.html b/tools/affiliation_filters/affiliation_filters_tpl.html
@@ -19,7 +19,7 @@
 	<head>
 		<title>FROGS Affiliation Filters</title>
 		<meta charset="UTF-8">
-		<meta name="version" content="3.2.2">
+		<meta name="version" content="3.2.3">
 		<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.1.1/css/bootstrap.css"></link>
 		<link rel="stylesheet" href="https://cdn.datatables.net/1.10.19/css/dataTables.bootstrap4.min.css"></link>
 		<link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet">

diff --git a/tools/affiliation_postprocess/affiliation_postprocess.py b/tools/affiliation_postprocess/affiliation_postprocess.py
@@ -19,7 +19,7 @@
 __author__ = 'Maria Bernard INRA - SIGENAE'
 __copyright__ = 'Copyright (C) 2018 INRA'
 __license__ = 'GNU General Public License'
-__version__ = '3.2.2'
+__version__ = '3.2.3'
 __email__ = '[email protected]'
 __status__ = 'prod'
 

diff --git a/tools/affiliations_stat/affiliations_stat.py b/tools/affiliations_stat/affiliations_stat.py
@@ -19,7 +19,7 @@
 __author__ = 'Frederic Escudie - Plateforme bioinformatique Toulouse'
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
-__version__ = '3.2.2'
+__version__ = '3.2.3'
 __email__ = '[email protected]'
 __status__ = 'prod'
 

diff --git a/tools/affiliations_stat/affiliations_stat_tpl.html b/tools/affiliations_stat/affiliations_stat_tpl.html
@@ -19,7 +19,7 @@
 	<head>
 		<title>FROGS Affiliations stat</title>
 		<meta charset="UTF-8">
-		<meta name="version" content="3.2.2">
+		<meta name="version" content="3.2.3">
 		<!-- CSS -->
 		<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.1.1/css/bootstrap.css"></link>
 		<link rel="stylesheet" href="https://cdn.datatables.net/1.10.19/css/dataTables.bootstrap4.min.css"></link>

diff --git a/tools/biom_to_stdBiom/biom_to_stdBiom.py b/tools/biom_to_stdBiom/biom_to_stdBiom.py
@@ -19,7 +19,7 @@
 __author__ = 'Frederic Escudie - Plateforme bioinformatique Toulouse'
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
-__version__ = '3.2.2'
+__version__ = '3.2.3'
 __email__ = '[email protected]'
 __status__ = 'prod'
 

diff --git a/tools/biom_to_tsv/biom_to_tsv.py b/tools/biom_to_tsv/biom_to_tsv.py
@@ -19,7 +19,7 @@
 __author__ = 'Frederic Escudie - Plateforme bioinformatique Toulouse and Maria Bernard - Sigenae Jouy en Josas'
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
-__version__ = '3.2.2'
+__version__ = '3.2.3'
 __email__ = '[email protected]'
 __status__ = 'prod'
 

diff --git a/tools/clustering/clustering.py b/tools/clustering/clustering.py
@@ -19,7 +19,7 @@
 __author__ = 'Maria Bernard - SIGENAE AND Frederic Escudie - Plateforme bioinformatique Toulouse'
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
-__version__ = '3.2.2'
+__version__ = '3.2.3'
 __email__ = '[email protected]'
 __status__ = 'prod'
 

diff --git a/tools/clusters_stat/clusters_stat.py b/tools/clusters_stat/clusters_stat.py
@@ -19,7 +19,7 @@
 __author__ = 'Frederic Escudie - Plateforme bioinformatique Toulouse'
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
-__version__ = '3.2.2'
+__version__ = '3.2.3'
 __email__ = '[email protected]'
 __status__ = 'prod'