Merge pull request #683 from metagenome-atlas/subset-vamb
Enable running cobinning on different bingroups
SilasK authored Aug 17, 2023
2 parents 1daf261 + fbe054b commit 5caa6ca
Showing 24 changed files with 690 additions and 254 deletions.
10 changes: 8 additions & 2 deletions .circleci/config.yml
@@ -59,6 +59,12 @@ jobs:
command: |
source activate ./atlasenv
test/dryrun.sh
- run:
name: Test_init
command: |
ls -l test
source activate ./atlasenv
./test/test_init_many_samples.sh
- persist_to_workspace:
root: /root/project/
@@ -264,8 +270,8 @@ jobs:
path: wd/logs
destination: binning_logs
- store_artifacts:
path: wd/Streptococcus/logs
destination: binning_logs/sample
path: wd/sample1/logs
destination: sample_logs

#
#
8 changes: 1 addition & 7 deletions .gitignore
@@ -35,15 +35,9 @@ databases
*.manifest
*.spec
databases
test/*
.test/*
test/*
!test/dryrun.sh
!test/test_ci.sh
!test/test_assembly.sh
!test/test_binning.sh
!test/test_sra.sh
!test/test_external_genomes.sh
!test/*.sh
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
6 changes: 3 additions & 3 deletions atlas/atlas.py
@@ -8,8 +8,8 @@


from snakemake.io import load_configfile
from .make_config import make_config, validate_config
from .init.atlas_init import run_init, run_init_sra
from .make_config import validate_config
from .init.atlas_init import run_init#, run_init_sra

from .__init__ import __version__

@@ -66,7 +66,7 @@ def cli(obj):
cli.add_command(run_init)


cli.add_command(run_init_sra)
#cli.add_command(run_init_sra)


def get_snakefile(file="workflow/Snakefile"):
58 changes: 44 additions & 14 deletions atlas/init/atlas_init.py
@@ -1,20 +1,16 @@
import os, sys
from ..color_logger import logger
import multiprocessing
import tempfile
from snakemake import utils
from snakemake.io import load_configfile
import pandas as pd
import numpy as np
from collections import defaultdict
import click
from pathlib import Path

from ..make_config import make_config, validate_config
from .create_sample_table import get_samples_from_fastq, simplify_sample_names
from ..sample_table import (
validate_sample_table,
load_sample_table,
validate_bingroup_size_cobinning,
validate_bingroup_size_metabat,
BinGroupSizeError,
ADDITIONAL_SAMPLEFILE_HEADERS,
)

@@ -64,7 +60,14 @@ def prepare_sample_table_for_atlas(
for h in Headers:
sample_table[h] = np.nan

sample_table["BinGroup"] = sample_table.index
sample_table["BinGroup"] = "All"

if sample_table.shape[0] >= 50:
logger.warning(
"You have more than 50 samples in your sample table. "
"You should consider splitting your samples into multiple BinGroups. "
"To do so, modify the 'BinGroup' column in your sample table."
)

validate_sample_table(sample_table)

@@ -146,21 +149,48 @@ def run_init(
os.makedirs(working_dir, exist_ok=True)
os.makedirs(db_dir, exist_ok=True)

sample_table = get_samples_from_fastq(path_to_fastq)

prepare_sample_table_for_atlas(
sample_table,
reads_are_QC=skip_qc,
outfile=os.path.join(working_dir, "samples.tsv"),
)

# Set default binner depending on number of samples
n_samples = sample_table.shape[0]
if n_samples <= 7:
logger.info("You don't have many samples in your dataset. "
"I set 'metabat' as binner"
)
binner = "metabat"

try:
validate_bingroup_size_metabat(sample_table, logger)
except BinGroupSizeError:
pass


else:
binner = "vamb"
try:
validate_bingroup_size_cobinning(sample_table, logger)

except BinGroupSizeError:
pass



make_config(
db_dir,
threads,
assembler,
data_type,
interleaved_fastq,
os.path.join(working_dir, "config.yaml"),
binner=binner,
)
sample_table = get_samples_from_fastq(path_to_fastq)

prepare_sample_table_for_atlas(
sample_table,
reads_are_QC=skip_qc,
outfile=os.path.join(working_dir, "samples.tsv"),
)


########### Public init download data from SRA ##############
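
A minimal sketch of the binner auto-selection this commit adds to run_init, using a hypothetical four-sample table (the 7-sample threshold and the "All" default BinGroup come from the diff above; the sample names are invented):

import pandas as pd

# Hypothetical sample table, shaped like the one prepare_sample_table_for_atlas
# writes: every sample starts in the single BinGroup "All".
sample_table = pd.DataFrame(
    {"BinGroup": "All"},
    index=["sample1", "sample2", "sample3", "sample4"],
)

# Mirrors the selection logic added above: small datasets default to metabat,
# larger ones to the co-binning default vamb.
binner = "metabat" if sample_table.shape[0] <= 7 else "vamb"
print(binner)  # -> metabat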
6 changes: 3 additions & 3 deletions atlas/init/parse_sra.py
@@ -96,7 +96,7 @@ def filter_runinfo(RunTable, ignore_paired=False):
RunTable = RunTable.query("LibraryLayout == 'SINGLE'")

else:
logger.warn(f"I drop {N_library_layout['SINGLE']} single end libraries")
logger.warning(f"I drop {N_library_layout['SINGLE']} single end libraries")

RunTable = RunTable.query("LibraryLayout == 'PAIRED'")

@@ -105,7 +105,7 @@ def filter_runinfo(RunTable, ignore_paired=False):
if not RunTable.Platform.isin(["ILLUMINA"]).all():
Platforms = ", ".join(RunTable.Platform.unique())

logger.warn(
logger.warning(
f"Your samples are sequenced on the folowing platform: {Platforms}\n"
"I don't know how well Atlas handles non-illumina reads.\n"
"If you have long-reads, specify them via a the longreads, column in the sample table."
@@ -160,7 +160,7 @@ def validate_merging_runinfo(path):
else:
problematic_samples_list = " ".join(problematic_samples)

logger.warn(
logger.warning(
"You attemt to merge runs from the same sample. "
f"But for {len(problematic_samples)} samples the runs have different {key}: {problematic_samples_list}\n"
f"You can modify the table {path} and rerun the command.\n"
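
For context (only the logger.warn to logger.warning renames are part of this commit), a small sketch of the filtering filter_runinfo performs, on a hypothetical run table:

import pandas as pd

# Hypothetical SRA run info with mixed library layouts.
RunTable = pd.DataFrame(
    {
        "LibraryLayout": ["PAIRED", "SINGLE", "PAIRED"],
        "Platform": ["ILLUMINA", "ILLUMINA", "ILLUMINA"],
    },
    index=["SRR000001", "SRR000002", "SRR000003"],
)

# Count and drop single-end libraries, as the code above logs.
n_single = (RunTable.LibraryLayout == "SINGLE").sum()
if n_single > 0:
    print(f"I drop {n_single} single end libraries")
RunTable = RunTable.query("LibraryLayout == 'PAIRED'")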
6 changes: 5 additions & 1 deletion atlas/make_config.py
@@ -155,7 +155,6 @@ def make_default_config():

config["cobining_min_contig_length"] = 2000
config["cobining_min_bin_size"] = 200 * 1000
config["semibin_options"] = " --max-node 1 --max-edges 200 "
config["cobinning_separator"] = ":"

config["annotations"] = ["gtdb_taxonomy", "checkm_taxonomy", "gtdb_tree"]
@@ -173,6 +172,7 @@ def make_config(
data_type="metagenome",
interleaved_fastq=False,
config="config.yaml",
binner="vamb",
):
"""
Reads template config file with comments from ../workflow/config/template_config.yaml
@@ -221,6 +221,8 @@
# conf["refseq_tree"] = os.path.join(database_dir, "refseq.tree")
# conf["diamond_db"] = os.path.join(database_dir, "refseq.dmnd")

conf["final_binner"] = binner

if os.path.exists(config):
logger.warning(
f"Config file {config} already exists, I didn't dare to overwrite it. continue..."
@@ -234,6 +236,8 @@
)




def validate_config(config, workflow):
conf = load_configfile(config)

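
A hedged usage sketch of the extended make_config signature; the positional arguments follow the call site in run_init above, the values are placeholders, and the import assumes the atlas package is installed:

from atlas.make_config import make_config

make_config(
    "databases",       # database directory (placeholder)
    8,                 # threads
    "spades",          # assembler
    "metagenome",      # data_type
    False,             # interleaved_fastq
    "config.yaml",     # output config file
    binner="metabat",  # new keyword in this commit; defaults to "vamb"
)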
113 changes: 109 additions & 4 deletions atlas/sample_table.py
@@ -25,21 +25,126 @@ def validate_sample_table(sampleTable):
exit(1)

if sampleTable.index.str.match("^\d").any():
logger.warning(
logger.error(
f"Sample names shouldn't start with a digit. This can lead to incompatibilities.\n {list(sampleTable.index)}"
)
exit(1)

if sampleTable.index.str.contains("_").any():
logger.warning(
logger.error(
f"Sample names shouldn't contain underscores. This can lead to incompatibilities. \n {list(sampleTable.index)}"
)
exit(1)

if sampleTable.index.str.count("-").max() > 1:
logger.warning(
f"Sample names shouldn't have more than one hypon '-'. This can lead to incompatibilities.\n {list(sampleTable.index)}"
logger.error(
f"Sample names shouldn't have more than one hypo '-'. This can lead to incompatibilities.\n {list(sampleTable.index)}"
)
exit(1)

### Validate BinGroup

if sampleTable.BinGroup.isnull().any():
logger.warning(f"Found empty values in the sample table column 'BinGroup'")

if sampleTable.BinGroup.str.contains("_").any():
logger.error(
f"BinGroup names shouldn't contain underscores. This can lead to incompatibilities. \n {list(sampleTable.BinGroup)}"
)
exit(1)

if sampleTable.BinGroup.str.contains("-").any():
logger.error(
f"BinGroup names shouldn't contain hypos '-'. This can lead to incompatibilities.\n {list(sampleTable.BinGroup)}"
)
exit(1)


def load_sample_table(sample_table="samples.tsv"):
sampleTable = pd.read_csv(sample_table, index_col=0, sep="\t")
validate_sample_table(sampleTable)
return sampleTable


class BinGroupSizeError(Exception):
"""
Exception raised when a BinGroup is too small or too large.
"""

def __init__(self, message):
super(BinGroupSizeError, self).__init__(message)


def validate_bingroup_size_cobinning(sampleTable, logger):
"""
Validate that the bingroups are not too large, nor too small for co-binning.
e.g. vamb and SemiBin
"""

bin_group_sizes = sampleTable.BinGroup.value_counts()

if bin_group_sizes.max() > 180:
logger.warning(
f"Found a bin group with more than 180 samples. This might lead to memory issues. \n {bin_group_sizes}"
)

if bin_group_sizes.min() < 10:
logger.error(
"If you want to use co-binning, you should have at least 5-10 samples per bin group. \n"
)
BinGroupSizeError("BinGroup too small")


def validate_bingroup_size_metabat(sampleTable, logger):
bin_group_sizes = sampleTable.BinGroup.value_counts()

max_bin_group_size = bin_group_sizes.max()

warn_message = (
"Co-binning with metabat uses cross-mapping, which scales quadratically. "
f"You have a bingroup with {max_bin_group_size} samples, which already leads to {max_bin_group_size * max_bin_group_size} cross-mappings."
)

if max_bin_group_size > 50:
logger.error(
warn_message
+ "This is too much for metabat. Please use vamb, or SemiBin or split your samples into smaller groups."
)
BinGroupSizeError("BinGroup too large")

if max_bin_group_size > 10:
logger.warning(
warn_message
+ "This might be too much for metabat. Consider using vamb, or SemiBin or split your samples into smaller groups."
)

elif max_bin_group_size == 1:
logger.warning(
"You have only one sample per bingroup. This doesn't use the co-abundance information."
)


def validate_bingroup_size(sampleTable, config, logger):
if config["final_binner"] == "DASTool":
binners = config["binner"]

logger.info(f"DASTool uses the folowing binners: {binners}")

if ("vamb" in binners) or ("SemiBin" in binners):
validate_bingroup_size_cobinning(sampleTable, logger)

if "metabat" in binners:
validate_bingroup_size_metabat(sampleTable, logger)

elif config["final_binner"] == "metabat":
validate_bingroup_size_metabat(sampleTable, logger)

elif config["final_binner"] in ["vamb", "SemiBin"]:
validate_bingroup_size_cobinning(sampleTable, logger)

elif config["final_binner"] == "maxbin":
logger.warning("maxbin Doesn't use coabundance for binning.")

else:
Exception(f"Unknown final binner: {config['final_binner']}")
11 changes: 10 additions & 1 deletion config/default_config.yaml
@@ -22,9 +22,18 @@ genome_aligner: "minimap"

bin_quality_asesser: checkm2 # [checkm2, busco, checkm]


semibin_options: ""
semibin_train_extra: ""


filter_chimieric_bins: true
gunc_database: "progenomes" # progenomes or gtdb


binner: # If DASTool is used as final_binner, use the predictions of these binners
- metabat
- maxbin
# - vamb


cobinning_readmapping_id: 0.95 # when mapping reads to contigs from a different sample, use a less stringent alignment identity threshold
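
These defaults can be read back with snakemake's load_configfile, the same helper the code above already imports; the path is a placeholder and the .get fallbacks assume the defaults shown in this diff:

from snakemake.io import load_configfile

# Placeholder path; assumes a config generated by `atlas init`.
conf = load_configfile("config.yaml")
print(conf.get("final_binner", "vamb"))
print(conf.get("cobinning_readmapping_id", 0.95))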
7 changes: 2 additions & 5 deletions config/template_config.yaml
@@ -156,13 +156,10 @@ minimum_map_quality: 0
# Binning
########################

final_binner: DASTool # [SemiBin, DASTool, vamb, maxbin]
final_binner: vamb # [SemiBin, vamb, metabat, DASTool]

binner: # If DASTool is used as final_binner, use predictions of this binners
- metabat
- maxbin
# - vamb

semibin_options: ""

metabat:
sensitivity: sensitive
