New scripts for binning
alneberg committed Mar 23, 2018
1 parent 6a9865e commit 7368f0f
Showing 4 changed files with 287 additions and 0 deletions.
85 changes: 85 additions & 0 deletions scripts/concoct/divide_approved_per_taxonomy.py
@@ -0,0 +1,85 @@
# coding: utf-8
import argparse
from collections import defaultdict

import pandas as pd

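# NOTE: fix_strange_metaxa_vals() is called in main() but was not defined in
# this commit. The function below is an assumed minimal placeholder so that
# the script runs; the actual cleanup rules for odd Metaxa values are unknown.
def fix_strange_metaxa_vals(metaxa_val):
    """Placeholder (assumption): normalise odd Metaxa taxonomy values."""
    return [v.strip() for v in metaxa_val]
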
def main(args):
clustering = pd.read_table(args.clustering_file, sep=',', names=['contig_id', 'cluster_id'], index_col=0)
taxonomy_df = pd.read_table(args.taxonomy_file, header=None, index_col=0, names=["contig_id", "taxonomy", "bla", "bla1", "bla2"])
all_approved = pd.read_table(args.all_approved_file, header=None, names=["contig_id"], index_col=0)
checkm_taxonomy = pd.read_table(args.checkm_taxonomy_file, index_col=0)

all_approved_set = set(all_approved.index.values)
unapproved_rrna = defaultdict(int)
approved_rrna = {}
levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
for rrna_contig in taxonomy_df.index.values:
if rrna_contig in clustering.index:
cluster_id = clustering.loc[rrna_contig]['cluster_id']
if cluster_id in all_approved_set:
checkm_val = checkm_taxonomy.loc[cluster_id]['Taxonomy'].split(';')
metaxa_val = taxonomy_df.loc[rrna_contig]['taxonomy'].split(';')

metaxa_val = fix_strange_metaxa_vals(metaxa_val)
matched_level = None
for i, level in enumerate(levels):
checkm_level_val, metaxa_level_val = None, None
if len(checkm_val) > i and len(metaxa_val) > i:
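                        # CheckM taxonomy values carry a rank prefix (e.g. 'k__'); strip it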
checkm_level_val = checkm_val[i][3:]
metaxa_level_val = metaxa_val[i]

if level == 'species':
metaxa_level_val = metaxa_val[i].replace(' ', '_')
if checkm_level_val == metaxa_level_val:
matched_level = i
else:
break
else:
matched_level = i-1
break
if cluster_id not in approved_rrna:
approved_rrna[cluster_id] = {'matching': 0, 'not matching': 0}

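                # levels[3] is 'order', so this requires the checkm and metaxa
                # assignments to agree at least down to the order level.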
                if matched_level is not None and matched_level >= 3:
approved_rrna[cluster_id]['matching'] += 1
else:
approved_rrna[cluster_id]['not matching'] += 1
else:
unapproved_rrna[cluster_id] += 1

for cluster_id in all_approved_set:
if cluster_id not in approved_rrna:
approved_rrna[cluster_id] = {'matching': 0, 'not matching': 0}

approved_stats_df = pd.DataFrame.from_dict(approved_rrna, orient='index')

unapproved_stats_df = pd.DataFrame.from_dict(unapproved_rrna, orient='index')
unapproved_stats_df.columns = ['nr_rrna']

print(approved_stats_df)
print(unapproved_stats_df)

# Things to output:
#
# Number of approved genomes with matching rrna
# Number of approved genomes with unmatching rrna
# Number of rrna genes per bin
#
# Number of approved genomes with > 0 matching rrna and < 2 unmatching rrna
# Matching is counted at order level
#
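    # A minimal sketch of the check described in the notes above (assumed to
    # be the intended criterion): > 0 matching and < 2 non-matching rRNAs.
    n_good = ((approved_stats_df['matching'] > 0) &
              (approved_stats_df['not matching'] < 2)).sum()
    print("Approved genomes with > 0 matching and < 2 non-matching rRNAs: {}".format(n_good))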


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--clustering_file', help="CONCOCT clustering file, csv with contig_id,cluster_id rows.")
    parser.add_argument('--taxonomy_file', help="Taxonomy assignments for the rRNA contigs (e.g. Metaxa output).")
    parser.add_argument('--approved_bin_dir', help="Directory where bins can be found; will be copied to directories decided on taxonomy.")
    parser.add_argument('--all_approved_file', help="e.g. ../../Data/test_binning_and_16s_combo/list_of_all_approved_bins_nocutup.tsv")
    parser.add_argument('--checkm_taxonomy_file', help="e.g. ../../Data/test_binning_and_16s_combo/checkm_tree_qa.tsv")

args = parser.parse_args()

main(args)
63 changes: 63 additions & 0 deletions scripts/concoct/dnadiff_per_fastani_cluster.py
@@ -0,0 +1,63 @@
#!/usr/bin/env python
"""
Given a preliminary coarse FastANI clustering, run the CONCOCT
dnadiff script on each cluster separately.
@author: alneberg
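
Example (hypothetical paths; CONCOCT's dnadiff_dist_matrix.py script accepts
the flags used here):
    dnadiff_per_fastani_cluster.py clustering.csv bins/ dnadiff_out/ dnadiff_dist_matrix.py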
"""
import sys
import os
import argparse
import pandas as pd
from subprocess import call

def main(args):
# Read fastani clustering
fastani_df = pd.read_table(args.fastani_clustering, sep=',')

# convert clusters to fasta file paths
def bin_to_path(bin_id):
return os.path.join(args.input_dir, "{}.fa".format(bin_id))

fastani_df['filepath'] = fastani_df['bin_id'].apply(bin_to_path)

# groupby fastani cluster
for cluster, group_df in fastani_df.groupby('clustering'):
if len(group_df) == 1:
print("Cluster {} is a singleton, continuing".format(cluster))
continue
# Create dnadiff output folder
cluster_outdir = os.path.join(args.output_dir, "dnadiff_{}".format(cluster))
if os.path.isdir(cluster_outdir):
if args.skip_existing:
print("Cluster directory {} exists, continuing".format(cluster_outdir))
continue
else:
os.mkdir(cluster_outdir)

# run dnadiff
print("Running dnadiff for cluster {}, with {} genomes".format(cluster, len(group_df)))
fasta_name_file = os.path.join(args.output_dir, 'fasta_names.txt')
with open(fasta_name_file, 'w') as ofh:
for fasta_name in list(group_df['bin_id'].values):
ofh.write(fasta_name + '\n')
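
        # The assembled command has the shape (paths hypothetical):
        #   <dnadiff_script> <cluster_outdir> bin_1.fa bin_2.fa ... \
        #       --fasta_names fasta_names.txt --skip_plot [--skip_dnadiff]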
cmd = [args.dnadiff_script, cluster_outdir]
for filepath in list(group_df['filepath'].values):
cmd.append(filepath)

cmd += ["--fasta_names", fasta_name_file, "--skip_plot"]
if args.skip_dnadiff:
cmd.append("--skip_dnadiff")
call(cmd)

if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("fastani_clustering", help=("Coarse preliminary fastani clustering."))
parser.add_argument("input_dir", help="Directory where fasta files can be found")
parser.add_argument("output_dir", help="Directory where dnadiff directories where be placed")
parser.add_argument("dnadiff_script", help="The script that will be ran")
parser.add_argument("--skip_dnadiff", action="store_true", help="Skip running dnadiff, assumes it has already been run.")
parser.add_argument("--skip_existing", action="store_true", help="Skip cluster directories which exists.")
args = parser.parse_args()

main(args)
44 changes: 44 additions & 0 deletions scripts/concoct/long_but_unapproved_bins.py
@@ -0,0 +1,44 @@
#!/usr/bin/env python
"""
Copies bins that are long (at least 1 Mbp) but not among the approved bins
from the input directory to the output directory.
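
Example (hypothetical paths):
    long_but_unapproved_bins.py bins/ bin_sizes.csv approved_bins.tsv long_unapproved/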
@author: alneberg
"""
from __future__ import print_function
import os
import argparse
import pandas as pd
from shutil import copyfile


def main(args):
# Read bin sizes file
bin_sizes_df = pd.read_table(args.bin_sizes, sep=',', index_col=0)

if os.stat(args.approved_bins).st_size == 0:
approved_bins_df = None
else:
approved_bins_df = pd.read_table(args.approved_bins, header=None, index_col=0)

for bin_nr, row in bin_sizes_df.iterrows():
if float(row['nr_bases']) < 1000000:
continue

        # copy the bin unless it is approved
if (approved_bins_df is None) or (bin_nr not in approved_bins_df.index):
source_bin = os.path.join(args.input_dir, "{}.fa".format(bin_nr))
destination_bin = os.path.join(args.output_dir, "{}.fa".format(bin_nr))
copyfile(source_bin, destination_bin)

if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("input_dir")
parser.add_argument("bin_sizes")
parser.add_argument("approved_bins")
parser.add_argument("output_dir")

args = parser.parse_args()
main(args)
95 changes: 95 additions & 0 deletions scripts/concoct/sample_profile_for_mags.py
@@ -0,0 +1,95 @@
#!/usr/bin/env python
"""
With the CONCOCT input table for contigs cut up by cut_up_fasta.py as input,
computes a length-weighted coverage profile per approved bin (MAG) for one
sample and writes the result to a tab-separated output file.
@author: alneberg
"""
import sys
import os
import argparse
import pandas as pd

def original_contig_name(s):
"""Transform s to the original contig name"""
n = s.split(".")[-1]
    try:
        int(n)
    except ValueError:
        return s
# Only small integers are likely to be
# indicating a cutup part.
if int(n) < 1000:
return ".".join(s.split(".")[:-1])
else:
# A large n indicates that the integer
# was part of the original contig
return s


def main(args):

    # Check if any approved bins exist:
if os.stat(args.list_all_approved_bins).st_size == 0:
return

print("# Find all approved bins")
approved_bins = pd.read_table(args.list_all_approved_bins, header=None, index_col=0)

print("# Open clustering file (non-cutup but")
clustering_df = pd.read_table(args.clustering_nocutup, header=None, index_col=0, names=['contig_id', 'cluster_id'], sep=',')

print("# Read input table")
input_table = pd.read_table(args.concoct_inputtable, index_col=0)
original_columns = input_table.columns

print("# Read kallisto quant result for the contig length")
kallisto_df = pd.read_table(args.kallisto_quant, index_col=0)

print("# Create a new column with non-cutup contig id")
input_table['target_id'] = input_table.index
input_table['original_contig_id'] = input_table['target_id'].apply(original_contig_name)

print("# Subset clustering file to get all approved contigs.")
approved_contigs = clustering_df[clustering_df['cluster_id'].isin(approved_bins.index)]


print("# Subset input table to all contigs included in approved bins")
    approved_table = input_table[input_table['original_contig_id'].isin(approved_contigs.index)].copy()

print("# Add length for the relevant contigs")
approved_table['length'] = kallisto_df.loc[approved_table.index]['length']

def cluster_id_for_approved(or_id):
return approved_contigs.loc[or_id]['cluster_id']

print("# Assign cluster id to input table")
    approved_table['cluster_id'] = approved_table['original_contig_id'].apply(cluster_id_for_approved)


results_d = {}

# Group by cluster id
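    # For each approved bin, multiply each contig's per-sample coverage by the
    # contig length, sum over contigs and divide by the bin's total length,
    # i.e. a length-weighted mean coverage per sample.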
for cluster, cluster_df in approved_table.groupby('cluster_id'):
total_length = cluster_df['length'].sum()
cluster_name = "{}_bin{}".format(args.sample_name, cluster)
cluster_df[original_columns] = cluster_df[original_columns].mul(cluster_df['length'], axis=0)
result_s = cluster_df[original_columns].sum() / float(total_length)
result_s['length'] = int(total_length)
results_d[cluster_name] = result_s

pd.DataFrame.from_dict(results_d, orient='index').to_csv(args.output_file, sep='\t', index_label="MAG")

if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("list_all_approved_bins", help=("A file with all approved bins."))
parser.add_argument("clustering_nocutup", help="Clustering file for all no-cutup contigs")
parser.add_argument("concoct_inputtable", help="concoct inputtable for the cutup contigs")
parser.add_argument("kallisto_quant", help="quant result for one sample, to get contig length")
parser.add_argument("sample_name", help="Sample name")
parser.add_argument("output_file", help="output file")
args = parser.parse_args()

main(args)
