merge branch dev

geraldinepascal · Aug 8, 2017 · fa5b2e0 · fa5b2e0
2 parents 204f187 + 9c6d2df
commit fa5b2e0
Show file tree

Hide file tree

Showing 80 changed files with 1,093 additions and 422 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,5 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
+FROGS.Rproj
diff --git a/README.md b/README.md
@@ -168,19 +168,26 @@
     RScript
         Version : >= 3.3.0
         Named as : RScript
-        Tools : all FROGS_Phyloseq and FROGS_DESeq
+        Tools : all FROGS_Phyloseq
         Download : https://cran.r-project.org/
 
+    Phangorn R package
+        Version : depending on your R version
+        Tools : FROGS_Tree
+        Installation in R session : # https://cran.r-project.org/web/packages/phangorn/index.html
+                                    install.packages("phangorn")
+        Test in R session : library(phangorn)
+
     Rmarkdown R package
         Version : depending on your R version
-        Tools : all FROGS_Phyloseq and FROGS_DESeq
+        Tools : all FROGS_Phyloseq
         Install in R session : # https://cran.r-project.org/web/packages/rmarkdown/index.html
                                install.packages("rmarkdown")
 
     Pandoc
         Version : >= 1.12.3
         Named as : pandoc
-        Tools : all FROGS_Phyloseq and FROGS_DESeq
+        Tools : all FROGS_Phyloseq
         Download/Installation : # http://pandoc.org/installing.html#linux or simply soft-link pandoc binary from RStudio path (if you have Rstudio installed)
 
     Phyloseq R package
@@ -204,6 +211,7 @@
                                     install.packages("gridExtra")
         Test in R session : library(gridExtra)
 
+
 ### 4. Check installation
     To check your installation you can type:
         cd <FROGS_PATH>/test
@@ -259,6 +267,7 @@
     Example:
         ...
         <section id="FROGS_wrappers" name="FROGS">
+        <label text="OTUs reconstruction" id="FROGS_OTU" />
             <tool file="FROGS/app/upload_tar.xml" />
             <tool file="FROGS/app/demultiplex.xml" />
             <tool file="FROGS/app/preprocess.xml" />
@@ -272,6 +281,15 @@
             <tool file="FROGS/app/biom_to_tsv.xml" />
             <tool file="FROGS/app/tsv_to_biom.xml" />
             <tool file="FROGS/app/normalisation.xml" />
+            <tool file="FROGS/app/tree.xml" />
+        <label text="OTUs structure and composition analysis" id="FROGS_Phyloseq" />
+            <tool file="FROGS/app/r_import_data.xml" />
+            <tool file="FROGS/app/r_composition.xml" />
+            <tool file="FROGS/app/r_alpha_diversity.xml" />
+            <tool file="FROGS/app/r_beta_diversity.xml" />
+            <tool file="FROGS/app/r_structure.xml" />
+            <tool file="FROGS/app/r_clustering.xml" />
+            <tool file="FROGS/app/r_manova.xml" />
         </section>
         ...
     Note: 

diff --git a/RELEASES_NOTES.md b/RELEASES_NOTES.md
@@ -9,8 +9,12 @@
   * FROGS Phyloseq Clustering
   * FROGS Phyloseq Manova 
 
+### libexec program added:
+  * rooted_tree.R : R script that roots the FastTree tree (used by Tree).
+
 ### Bug fixes:
   * Preprocess : min overlap at least equal to 1
+  * biom2tsv : not working with stdBiom containing RDP affiliation
 
 ### Functions added:
   * Preprocess: add Flash mismatch rate option

diff --git a/assessment/bin/assessRealMock.py b/assessment/bin/assessRealMock.py
@@ -21,7 +21,7 @@
 __license__ = 'GNU General Public License'
 __version__ = '1.0.0'
 __email__ = '[email protected]'
-__status__ = 'dev'
+__status__ = 'prod'
 
 
 import re
@@ -138,15 +138,15 @@ def get_expected( abund_file ):
     expected_by_depth = dict()
     FH_expected = open(abund_file)
     for line in FH_expected:
-        taxonomy, count = line.strip().split()
+        taxonomy, count = line.strip().split("\t")
         clean_taxonomy = getCleanedTaxonomy(taxonomy)
         for rank_depth in range(len(clean_taxonomy)):
             rank_taxonomy = ";".join(clean_taxonomy[:rank_depth + 1])
             if rank_depth not in expected_by_depth:
                 expected_by_depth[rank_depth] = dict()
             if rank_taxonomy not in expected_by_depth[rank_depth]:
                 expected_by_depth[rank_depth][rank_taxonomy] = 0
-            expected_by_depth[rank_depth][rank_taxonomy] += int(count)
+            expected_by_depth[rank_depth][rank_taxonomy] += float(count)
     FH_expected.close()
     return expected_by_depth
 
@@ -155,7 +155,7 @@ def get_checked( abund_file, checked_sample, taxonomy_key, expected_by_depth ):
     checked_by_depth = dict()
     biom = BiomIO.from_json(abund_file)
     for current_obs in biom.get_observations():
-        clean_taxonomy = getCleanedTaxonomy(current_obs["metadata"][taxonomy_key])
+        clean_taxonomy = getCleanedTaxonomy(current_obs["metadata"][taxonomy_key]) if current_obs["metadata"][taxonomy_key] is not None else ["unknown_taxa"]*len(expected_by_depth)
         count = biom.get_count(current_obs["id"], checked_sample)
         if count > 0:
             if clean_taxonomy[len(clean_taxonomy)-1] == "Multi-affiliation":

diff --git a/assessment/bin/assessment_4real.py b/assessment/bin/assessment_4real.py
@@ -19,7 +19,7 @@
 __author__ = 'Plateforme bioinformatique Toulouse - Sigenae  Jouy en Josas'
 __copyright__ = 'Copyright (C) 2016 INRA'
 __license__ = 'GNU General Public License'
-__version__ = '1.1.1'
+__version__ = '1.2.1'
 __email__ = '[email protected]'
 __status__ = 'prod'
 
@@ -163,7 +163,7 @@ def uparse(udb_databank, reads_directory, out_biom, out_fasta, min_length, max_l
     )########################################################## Problem threads > 1
 
 
-def mothur(affiliation_databank, affiliation_taxonomy, mothur_databank, mothur_taxonomy, reads_directory, out_biom, out_fasta, min_length, max_length, pcr_start, pcr_end, kept_start, kept_end, nb_cpus):
+def mothur(affiliation_databank, affiliation_taxonomy, mothur_databank, mothur_taxonomy, reads_directory, out_biom, out_fasta, min_length, max_length, pcr_start, pcr_end, kept_start, kept_end, diffs, nb_cpus):
     """
     @summary: Launch mothur pipeline.
     @param affiliation_databank: [str] Path to the databank used in affiliation. If affiliation_databank is None the affiliation step is skipped.
@@ -179,6 +179,7 @@ def mothur(affiliation_databank, affiliation_taxonomy, mothur_databank, mothur_t
     @param pcr_end: [int] End position for amplicon region. This value speedup pipeline by databank restriction.
     @param kept_start: [int] In PCR region the start position kept. All sequences must have same size.
     @param kept_end: [int] In PCR region the end position kept. All sequences must have same size.
+    @param diffs: [int] Number of mismatches allowed when pre-clustering sequences with pre.cluster (1 difference for every 100 bp of sequence).
     @param nb_cpus: [int] Number of used CPUs.
     """
     exec_cmd(
@@ -190,6 +191,7 @@ def mothur(affiliation_databank, affiliation_taxonomy, mothur_databank, mothur_t
         + " --pcr-end " + str(pcr_end) \
         + " --kept-start " + str(kept_start) \
         + " --kept-end " + str(kept_end) \
+        + " --preclusters-difference " + str(diffs) \
         + (" --affiliation-databank-fasta " + affiliation_databank if affiliation_databank is not None else "") \
         + (" --affiliation-databank-tax " + affiliation_taxonomy if affiliation_taxonomy is not None else "") \
         + " --restriction-databank-fasta " + mothur_databank \
@@ -293,6 +295,15 @@ def frogs_affiliation(fasta_databank, in_biom, in_fasta, output_biom, nb_cpus):
             "pcr_end": 26000,
             "kept_start": 1862,
             "kept_end": 10588
+        },
+        "V4V4_forward100": {
+            "min_length": 50,
+            "max_length": 150,
+            "pcr_start": 12000,
+            "pcr_end": 26000,
+            "kept_start": 1862,
+            "kept_end": 4307,
+            "diffs" : 1
         }
     }
 
@@ -309,6 +320,10 @@ def frogs_affiliation(fasta_databank, in_biom, in_fasta, output_biom, nb_cpus):
         pcr_end = primers_param[current_primers]["pcr_end"]
         kept_start = primers_param[current_primers]["kept_start"]
         kept_end = primers_param[current_primers]["kept_end"]
+        if "diffs" in primers_param[current_primers]:
+            diffs = primers_param[current_primers]["diffs"]
+        else:
+            diffs = 2
         for current_nb_sp in args.nb_sp:
             for dataset_idx in args.datasets:
                 for current_distribution in args.distribution_laws:
@@ -369,7 +384,7 @@ def frogs_affiliation(fasta_databank, in_biom, in_fasta, output_biom, nb_cpus):
                         mothur_assess_affi = os.path.join(mothur_out_dir, "mothur_affiResults.txt")
                         mothur_assess_clst = os.path.join(mothur_out_dir, "mothur_OTUResults.txt")
                         #    Execution
-                        mothur(args.affiliation_databank_fasta, args.affiliation_databank_tax, args.mothur_databank, args.mothur_taxonomy, reads_directory, mothur_biom, mothur_fasta, min_length, max_length, pcr_start, pcr_end, kept_start, kept_end, args.nb_cpus)
+                        mothur(args.affiliation_databank_fasta, args.affiliation_databank_tax, args.mothur_databank, args.mothur_taxonomy, reads_directory, mothur_biom, mothur_fasta, min_length, max_length, pcr_start, pcr_end, kept_start, kept_end, diffs, args.nb_cpus)
 
                     # QIIME
                     if "qiime" in args.pipelines:

diff --git a/assessment/bin/qiime_4real.py b/assessment/bin/qiime_4real.py
@@ -72,22 +72,21 @@ def exec_cmd( cmd, output=None ):
     file_list=",".join([os.path.join(args.input_folder,f) for f in os.listdir(args.input_folder) ] )
     sample_name_list=",".join([f.split("-")[0] for f in os.listdir(args.input_folder)])
 
-    exec_cmd("qiime; split_libraries_fastq.py -i " + file_list \
+    exec_cmd("split_libraries_fastq.py -i " + file_list \
         + " --sample_ids " + sample_name_list \
         +" -o " + os.path.join(working_path_prefix, "qiime_preprocess") \
-        +" --barcode_type 'not-barcoded' " \
-        +" --phred_offset 33")
+        +" --barcode_type 'not-barcoded' " )
 
     merge_fasta=os.path.join(working_path_prefix,"qiime_preprocess","seqs.fna")
 
     # Launch chimera identification (in Qiime)
-    exec_cmd("qiime; identify_chimeric_seqs.py -i "+ merge_fasta \
+    exec_cmd("identify_chimeric_seqs.py -i "+ merge_fasta \
         + " -m usearch61 --suppress_usearch61_ref " \
         + " -o " + os.path.join(working_path_prefix,"usearch61_chimeras") )
 
     # Remove chimera
     qiime_input_fasta=os.path.join(working_path_prefix,"usearch61_chimeras","seqs_chimeras_filtered.fna")
-    exec_cmd("qiime; filter_fasta.py -f " + merge_fasta \
+    exec_cmd("filter_fasta.py -f " + merge_fasta \
         + " -o " + qiime_input_fasta \
         + " -s " + os.path.join(working_path_prefix,"usearch61_chimeras","chimeras.txt") \
         + " -n")
@@ -97,7 +96,7 @@ def exec_cmd( cmd, output=None ):
     if args.nb_cpus > 1 :
         cpus_opt = " -aO "+str(args.nb_cpus)
 
-    qiime_command ="qiime; pick_open_reference_otus.py -i " + qiime_input_fasta\
+    qiime_command ="pick_open_reference_otus.py -i " + qiime_input_fasta\
         + cpus_opt \
         + " -o "+ os.path.join(working_path_prefix, "pick_open_reference_otus") \
         + " -r "+ args.ref_fasta \
@@ -107,7 +106,7 @@ def exec_cmd( cmd, output=None ):
 
     qiime_fasta=os.path.join(working_path_prefix, "pick_open_reference_otus","rep_set.fna")
     if args.ref_tax is not None:
-        exec_cmd("qiime; assign_taxonomy.py -o " + os.path.join(working_path_prefix,"uclust_assigned_taxonomy") \
+        exec_cmd("assign_taxonomy.py -o " + os.path.join(working_path_prefix,"uclust_assigned_taxonomy") \
             + " -i " + qiime_fasta \
             + " -t " + args.ref_tax \
             + " -r " + args.ref_fasta )

diff --git a/lib/frogsBiom.py b/lib/frogsBiom.py
@@ -19,7 +19,7 @@
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
 __version__ = '0.13.0'
-__email__ = 'frogs@toulouse.inra.fr'
+__email__ = 'frogs@inra.fr'
 __status__ = 'prod'
 
 import re

diff --git a/lib/frogsNode.py b/lib/frogsNode.py
@@ -19,7 +19,7 @@
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
 __version__ = '0.2.1'
-__email__ = 'frogs@toulouse.inra.fr'
+__email__ = 'frogs@inra.fr'
 __status__ = 'dev'
 
 

diff --git a/lib/frogsSequenceIO.py b/lib/frogsSequenceIO.py
@@ -19,7 +19,7 @@
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
 __version__ = '1.0.2'
-__email__ = 'frogs@toulouse.inra.fr'
+__email__ = 'frogs@inra.fr'
 __status__ = 'prod'
 
 import gzip

diff --git a/lib/frogsUtils.py b/lib/frogsUtils.py
@@ -19,7 +19,7 @@
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
 __version__ = '0.2.0'
-__email__ = 'frogs@toulouse.inra.fr'
+__email__ = 'frogs@inra.fr'
 __status__ = 'prod'
 
 import os
@@ -133,15 +133,15 @@ def submit(self, log_file=None):
         if log_file is not None:
             FH_log = Logger( log_file )
             FH_log.write( '# ' + self.description + ' (' + os.path.basename(self.program) + ' version : ' + self.get_version() + ')\n' )
-            FH_log.write( 'Command:\n\t' + self.get_cmd() + '\n' )
+            FH_log.write( 'Command:\n\t' + self.get_cmd() + '\n\n' )
             FH_log.write( 'Execution:\n\tstart: ' + time.strftime("%d %b %Y %H:%M:%S", time.localtime()) + '\n' )
             FH_log.close()
         # Process
         subprocess.check_output( self.get_cmd(), shell=True )
         # Log
         if log_file is not None:
             FH_log = Logger( log_file )
-            FH_log.write( '\tend:   ' + time.strftime("%d %b %Y %H:%M:%S", time.localtime()) + '\n' )
+            FH_log.write( '\tend:   ' + time.strftime("%d %b %Y %H:%M:%S", time.localtime()) + '\n\n' )
             FH_log.close()
             # Post-process results
             self.parser(log_file)

diff --git a/libexec/addAffiliation2biom.py b/libexec/addAffiliation2biom.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
 __version__ = '2.3.0'
-__email__ = 'frogs@toulouse.inra.fr'
+__email__ = 'frogs@inra.fr'
 __status__ = 'prod'
 
 import os

diff --git a/libexec/biom2tsv.py b/libexec/biom2tsv.py
@@ -19,8 +19,8 @@
 __author__ = 'Maria Bernard - Sigenae AND Frederic Escudie - Plateforme bioinformatique Toulouse'
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
-__version__ = '1.4.0'
-__email__ = 'frogs@toulouse.inra.fr'
+__version__ = '1.4.1'
+__email__ = 'frogs@inra.fr'
 __status__ = 'prod'
 
 import os
@@ -62,12 +62,18 @@ def observation_line_parts( observation, count_by_sample, fields, list_separator
         elif current_field == '@observation_sum':
             line.append( str(sum(count_by_sample)) )
         elif current_field == "@rdp_tax_and_bootstrap":
-                rdp_and_bootstrap = ""
+            rdp_and_bootstrap = ""
+            if issubclass(observation['metadata']["rdp_taxonomy"].__class__, list) :
                 rdp_taxonomy = observation['metadata']["rdp_taxonomy"]
                 rdp_bootstrap = observation['metadata']["rdp_bootstrap"]
                 for i, tax in enumerate(rdp_taxonomy):
                     rdp_and_bootstrap += tax + ";(" + str(rdp_bootstrap[i]) + ");" # tax1;(boots1);tax2;(boots2);
-                line.append(str(rdp_and_bootstrap))
+            else:
+                rdp_taxonomy = observation['metadata']["rdp_taxonomy"].split(";")
+                rdp_bootstrap = observation['metadata']["rdp_bootstrap"].split(";")
+                for i, tax in enumerate(rdp_taxonomy):
+                    rdp_and_bootstrap += tax + ";(" + str(rdp_bootstrap[i]) + ");" # tax1;(boots1);tax2;(boots2);
+            line.append(str(rdp_and_bootstrap))
         elif current_field == "@blast_perc_identity":
             if len(observation['metadata']["blast_affiliations"]) > 0:
                 line.append( str(uniq(observation['metadata']["blast_affiliations"], "perc_identity", "multi-identity")) )

diff --git a/libexec/biomFastaUpdate.py b/libexec/biomFastaUpdate.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
 __version__ = '1.0.1'
-__email__ = 'frogs@toulouse.inra.fr'
+__email__ = 'frogs@inra.fr'
 __status__ = 'prod'
 
 import os

diff --git a/libexec/biomTools.py b/libexec/biomTools.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
 __version__ = '0.10.1'
-__email__ = 'frogs@toulouse.inra.fr'
+__email__ = 'frogs@inra.fr'
 __status__ = 'beta'
 
 import os

diff --git a/libexec/derepSamples.py b/libexec/derepSamples.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
 __version__ = '1.5.0'
-__email__ = 'frogs@toulouse.inra.fr'
+__email__ = 'frogs@inra.fr'
 __status__ = 'prod'
 
 import os

diff --git a/libexec/extractSwarmsFasta.py b/libexec/extractSwarmsFasta.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
 __version__ = '1.3.1'
-__email__ = 'frogs@toulouse.inra.fr'
+__email__ = 'frogs@inra.fr'
 __status__ = 'prod'
 
 import os

diff --git a/libexec/filterSeq.py b/libexec/filterSeq.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
 __version__ = '1.4.0'
-__email__ = 'frogs@toulouse.inra.fr'
+__email__ = 'frogs@inra.fr'
 __status__ = 'prod'
 
 import os

diff --git a/libexec/multiAffiFromBiom.py b/libexec/multiAffiFromBiom.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
 __version__ = '1.3.0'
-__email__ = 'frogs@toulouse.inra.fr'
+__email__ = 'frogs@inra.fr'
 __status__ = 'prod'
 
 import os

diff --git a/libexec/parallelChimera.py b/libexec/parallelChimera.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
 __version__ = '0.7.1'
-__email__ = 'frogs@toulouse.inra.fr'
+__email__ = 'frogs@inra.fr'
 __status__ = 'prod'
 
 import os

diff --git a/libexec/remove454Adapt.py b/libexec/remove454Adapt.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (C) 2015 INRA'
 __license__ = 'GNU General Public License'
 __version__ = '0.5.0'
-__email__ = 'frogs@toulouse.inra.fr'
+__email__ = 'frogs@inra.fr'
 __status__ = 'prod'
 
 import os