Merge pull request #683 from metagenome-atlas/subset-vamb
Enable running cobinning on different bingroups
SilasK authored Aug 17, 2023
2 parents 1daf261 + fbe054b commit 5caa6ca
Showing 24 changed files with 690 additions and 254 deletions.
10 changes: 8 additions & 2 deletions .circleci/config.yml
@@ -59,6 +59,12 @@ jobs:
command: |
source activate ./atlasenv
test/dryrun.sh
- run:
name: Test_init
command: |
ls -l test
source activate ./atlasenv
./test/test_init_many_samples.sh
- persist_to_workspace:
root: /root/project/
@@ -264,8 +270,8 @@ jobs:
path: wd/logs
destination: binning_logs
- store_artifacts:
path: wd/Streptococcus/logs
destination: binning_logs/sample
path: wd/sample1/logs
destination: sample_logs

#
#
8 changes: 1 addition & 7 deletions .gitignore
@@ -35,15 +35,9 @@ databases
*.manifest
*.spec
databases
test/*
.test/*
test/*
!test/dryrun.sh
!test/test_ci.sh
!test/test_assembly.sh
!test/test_binning.sh
!test/test_sra.sh
!test/test_external_genomes.sh
!test/*.sh
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
6 changes: 3 additions & 3 deletions atlas/atlas.py
@@ -8,8 +8,8 @@


from snakemake.io import load_configfile
from .make_config import make_config, validate_config
from .init.atlas_init import run_init, run_init_sra
from .make_config import validate_config
from .init.atlas_init import run_init#, run_init_sra

from .__init__ import __version__

@@ -66,7 +66,7 @@ def cli(obj):
cli.add_command(run_init)


cli.add_command(run_init_sra)
#cli.add_command(run_init_sra)


def get_snakefile(file="workflow/Snakefile"):
58 changes: 44 additions & 14 deletions atlas/init/atlas_init.py
@@ -1,20 +1,16 @@
import os, sys
from ..color_logger import logger
import multiprocessing
import tempfile
from snakemake import utils
from snakemake.io import load_configfile
import pandas as pd
import numpy as np
from collections import defaultdict
import click
from pathlib import Path

from ..make_config import make_config, validate_config
from .create_sample_table import get_samples_from_fastq, simplify_sample_names
from ..sample_table import (
validate_sample_table,
load_sample_table,
validate_bingroup_size_cobinning,
validate_bingroup_size_metabat,
BinGroupSizeError,
ADDITIONAL_SAMPLEFILE_HEADERS,
)

@@ -64,7 +60,14 @@ def prepare_sample_table_for_atlas(
for h in Headers:
sample_table[h] = np.nan

sample_table["BinGroup"] = sample_table.index
sample_table["BinGroup"] = "All"

if sample_table.shape[0] >= 50:
logger.warning(
"You have more than 50 samples in your sample table. "
"You should consider splitting your samples into multiple BinGroups. "
"To do so, modify the 'BinGroup' column in your sample table."
)

validate_sample_table(sample_table)

@@ -146,21 +149,48 @@ def run_init(
os.makedirs(working_dir, exist_ok=True)
os.makedirs(db_dir, exist_ok=True)

sample_table = get_samples_from_fastq(path_to_fastq)

prepare_sample_table_for_atlas(
sample_table,
reads_are_QC=skip_qc,
outfile=os.path.join(working_dir, "samples.tsv"),
)

# Set default binner depending on number of samples
n_samples = sample_table.shape[0]
if n_samples <= 7:
logger.info("You don't have many samples in your dataset. "
"I set 'metabat' as binner"
)
binner = "metabat"

try:
validate_bingroup_size_metabat(sample_table, logger)
except BinGroupSizeError:
pass


else:
binner = "vamb"
try:
validate_bingroup_size_cobinning(sample_table, logger)

except BinGroupSizeError:
pass



make_config(
db_dir,
threads,
assembler,
data_type,
interleaved_fastq,
os.path.join(working_dir, "config.yaml"),
binner=binner,
)
sample_table = get_samples_from_fastq(path_to_fastq)

prepare_sample_table_for_atlas(
sample_table,
reads_are_QC=skip_qc,
outfile=os.path.join(working_dir, "samples.tsv"),
)


########### Public init download data from SRA ##############
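
A minimal sketch of the binner auto-selection this commit adds to run_init, using a hypothetical four-sample table (the 7-sample threshold and the "All" default BinGroup come from the diff above; the sample names are invented):

import pandas as pd

# Hypothetical sample table, shaped like the one prepare_sample_table_for_atlas
# writes: every sample starts in the single BinGroup "All".
sample_table = pd.DataFrame(
    {"BinGroup": "All"},
    index=["sample1", "sample2", "sample3", "sample4"],
)

# Mirrors the selection logic added above: small datasets default to metabat,
# larger ones to the co-binning default vamb.
binner = "metabat" if sample_table.shape[0] <= 7 else "vamb"
print(binner)  # -> metabat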
6 changes: 3 additions & 3 deletions atlas/init/parse_sra.py
@@ -96,7 +96,7 @@ def filter_runinfo(RunTable, ignore_paired=False):
RunTable = RunTable.query("LibraryLayout == 'SINGLE'")

else:
logger.warn(f"I drop {N_library_layout['SINGLE']} single end libraries")
logger.warning(f"I drop {N_library_layout['SINGLE']} single end libraries")

RunTable = RunTable.query("LibraryLayout == 'PAIRED'")

@@ -105,7 +105,7 @@ def filter_runinfo(RunTable, ignore_paired=False):
if not RunTable.Platform.isin(["ILLUMINA"]).all():
Platforms = ", ".join(RunTable.Platform.unique())

logger.warn(
logger.warning(
f"Your samples are sequenced on the folowing platform: {Platforms}\n"
"I don't know how well Atlas handles non-illumina reads.\n"
"If you have long-reads, specify them via a the longreads, column in the sample table."
@@ -160,7 +160,7 @@ def validate_merging_runinfo(path):
else:
problematic_samples_list = " ".join(problematic_samples)

logger.warn(
logger.warning(
"You attemt to merge runs from the same sample. "
f"But for {len(problematic_samples)} samples the runs have different {key}: {problematic_samples_list}\n"
f"You can modify the table {path} and rerun the command.\n"
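
For context (only the logger.warn to logger.warning renames are part of this commit), a small sketch of the filtering filter_runinfo performs, on a hypothetical run table:

import pandas as pd

# Hypothetical SRA run info with mixed library layouts.
RunTable = pd.DataFrame(
    {
        "LibraryLayout": ["PAIRED", "SINGLE", "PAIRED"],
        "Platform": ["ILLUMINA", "ILLUMINA", "ILLUMINA"],
    },
    index=["SRR000001", "SRR000002", "SRR000003"],
)

# Count and drop single-end libraries, as the code above logs.
n_single = (RunTable.LibraryLayout == "SINGLE").sum()
if n_single > 0:
    print(f"I drop {n_single} single end libraries")
RunTable = RunTable.query("LibraryLayout == 'PAIRED'")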
6 changes: 5 additions & 1 deletion atlas/make_config.py
@@ -155,7 +155,6 @@ def make_default_config():

config["cobining_min_contig_length"] = 2000
config["cobining_min_bin_size"] = 200 * 1000
config["semibin_options"] = " --max-node 1 --max-edges 200 "
config["cobinning_separator"] = ":"

config["annotations"] = ["gtdb_taxonomy", "checkm_taxonomy", "gtdb_tree"]
@@ -173,6 +172,7 @@ def make_config(
data_type="metagenome",
interleaved_fastq=False,
config="config.yaml",
binner="vamb",
):
"""
Reads template config file with comments from ../workflow/config/template_config.yaml
@@ -221,6 +221,8 @@
# conf["refseq_tree"] = os.path.join(database_dir, "refseq.tree")
# conf["diamond_db"] = os.path.join(database_dir, "refseq.dmnd")

conf["final_binner"] = binner

if os.path.exists(config):
logger.warning(
f"Config file {config} already exists, I didn't dare to overwrite it. continue..."
@@ -234,6 +236,8 @@
)




def validate_config(config, workflow):
conf = load_configfile(config)

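
A hedged usage sketch of the extended make_config signature; the positional arguments follow the call site in run_init above, the values are placeholders, and the import assumes the atlas package is installed:

from atlas.make_config import make_config

make_config(
    "databases",       # database directory (placeholder)
    8,                 # threads
    "spades",          # assembler
    "metagenome",      # data_type
    False,             # interleaved_fastq
    "config.yaml",     # output config file
    binner="metabat",  # new keyword in this commit; defaults to "vamb"
)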
113 changes: 109 additions & 4 deletions atlas/sample_table.py
@@ -25,21 +25,126 @@ def validate_sample_table(sampleTable):
exit(1)

if sampleTable.index.str.match("^\d").any():
logger.warning(
logger.error(
f"Sample names shouldn't start with a digit. This can lead to incompatibilities.\n {list(sampleTable.index)}"
)
exit(1)

if sampleTable.index.str.contains("_").any():
logger.warning(
logger.error(
f"Sample names shouldn't contain underscores. This can lead to incompatibilities. \n {list(sampleTable.index)}"
)
exit(1)

if sampleTable.index.str.count("-").max() > 1:
logger.warning(
f"Sample names shouldn't have more than one hypon '-'. This can lead to incompatibilities.\n {list(sampleTable.index)}"
logger.error(
f"Sample names shouldn't have more than one hypo '-'. This can lead to incompatibilities.\n {list(sampleTable.index)}"
)
exit(1)

### Validate BinGroup

if sampleTable.BinGroup.isnull().any():
logger.warning(f"Found empty values in the sample table column 'BinGroup'")

if sampleTable.BinGroup.str.contains("_").any():
logger.error(
f"BinGroup names shouldn't contain underscores. This can lead to incompatibilities. \n {list(sampleTable.BinGroup)}"
)
exit(1)

if sampleTable.BinGroup.str.contains("-").any():
logger.error(
f"BinGroup names shouldn't contain hypos '-'. This can lead to incompatibilities.\n {list(sampleTable.BinGroup)}"
)
exit(1)


def load_sample_table(sample_table="samples.tsv"):
sampleTable = pd.read_csv(sample_table, index_col=0, sep="\t")
validate_sample_table(sampleTable)
return sampleTable


class BinGroupSizeError(Exception):
"""
Exception raised when a BinGroup is too small or too large.
"""

def __init__(self, message):
super(BinGroupSizeError, self).__init__(message)


def validate_bingroup_size_cobinning(sampleTable, logger):
"""
Validate that the bingroups are not too large, nor too small for co-binning.
e.g. vamb and SemiBin
"""

bin_group_sizes = sampleTable.BinGroup.value_counts()

if bin_group_sizes.max() > 180:
logger.warning(
f"Found a bin group with more than 180 samples. This might lead to memory issues. \n {bin_group_sizes}"
)

if bin_group_sizes.min() < 10:
logger.error(
"If you want to use co-binning, you should have at least 5-10 samples per bin group. \n"
)
BinGroupSizeError("BinGroup too small")


def validate_bingroup_size_metabat(sampleTable, logger):
bin_group_sizes = sampleTable.BinGroup.value_counts()

max_bin_group_size = bin_group_sizes.max()

warn_message = (
"Co-binning with metabat uses cross-mapping, which scales quadratically. "
f"You have a bingroup with {max_bin_group_size} samples, which already leads to {max_bin_group_size * max_bin_group_size} cross-mappings."
)

if max_bin_group_size > 50:
logger.error(
warn_message
+ "This is too much for metabat. Please use vamb, or SemiBin or split your samples into smaller groups."
)
BinGroupSizeError("BinGroup too large")

if max_bin_group_size > 10:
logger.warning(
warn_message
+ "This might be too much for metabat. Consider using vamb, or SemiBin or split your samples into smaller groups."
)

elif max_bin_group_size == 1:
logger.warning(
"You have only one sample per bingroup. This doesn't use the co-abundance information."
)


def validate_bingroup_size(sampleTable, config, logger):
if config["final_binner"] == "DASTool":
binners = config["binner"]

logger.info(f"DASTool uses the folowing binners: {binners}")

if ("vamb" in binners) or ("SemiBin" in binners):
validate_bingroup_size_cobinning(sampleTable, logger)

if "metabat" in binners:
validate_bingroup_size_metabat(sampleTable, logger)

elif config["final_binner"] == "metabat":
validate_bingroup_size_metabat(sampleTable, logger)

elif config["final_binner"] in ["vamb", "SemiBin"]:
validate_bingroup_size_cobinning(sampleTable, logger)

elif config["final_binner"] == "maxbin":
logger.warning("maxbin Doesn't use coabundance for binning.")

else:
Exception(f"Unknown final binner: {config['final_binner']}")
11 changes: 10 additions & 1 deletion config/default_config.yaml
@@ -22,9 +22,18 @@ genome_aligner: "minimap"

bin_quality_asesser: checkm2 # [checkm2, busco, checkm]


semibin_options: ""
semibin_train_extra: ""


filter_chimieric_bins: true
gunc_database: "progenomes" # progenomes or gtdb


binner: # If DASTool is used as final_binner, use the predictions of these binners
- metabat
- maxbin
# - vamb


cobinning_readmapping_id: 0.95 # when mapping reads to contigs from a different sample, use a less stringent alignment identity threshold
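
These defaults can be read back with snakemake's load_configfile, the same helper the code above already imports; the path is a placeholder and the .get fallbacks assume the defaults shown in this diff:

from snakemake.io import load_configfile

# Placeholder path; assumes a config generated by `atlas init`.
conf = load_configfile("config.yaml")
print(conf.get("final_binner", "vamb"))
print(conf.get("cobinning_readmapping_id", 0.95))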
7 changes: 2 additions & 5 deletions config/template_config.yaml
@@ -156,13 +156,10 @@ minimum_map_quality: 0
# Binning
########################

final_binner: DASTool # [SemiBin, DASTool, vamb, maxbin]
final_binner: vamb # [SemiBin, vamb, metabat, DASTool]

binner: # If DASTool is used as final_binner, use predictions of this binners
- metabat
- maxbin
# - vamb

semibin_options: ""

metabat:
sensitivity: sensitive
