Showing 4 changed files with 287 additions and 0 deletions.
@@ -0,0 +1,85 @@
# coding: utf-8
import argparse
import pandas as pd
from collections import defaultdict

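
# NOTE: fix_strange_metaxa_vals() is called in main() below, but its
# definition is not part of this commit. The stub here is a minimal
# placeholder (an assumption, not the original implementation): it only
# strips surrounding whitespace from each taxonomic level so that the
# script can run end to end.
def fix_strange_metaxa_vals(metaxa_val):
    return [level_val.strip() for level_val in metaxa_val]
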
def main(args):
    clustering = pd.read_table(args.clustering_file, sep=',', names=['contig_id', 'cluster_id'], index_col=0)
    taxonomy_df = pd.read_table(args.taxonomy_file, header=None, index_col=0, names=["contig_id", "taxonomy", "bla", "bla1", "bla2"])
    all_approved = pd.read_table(args.all_approved_file, header=None, names=["contig_id"], index_col=0)
    checkm_taxonomy = pd.read_table(args.checkm_taxonomy_file, index_col=0)

    all_approved_set = set(all_approved.index.values)
    unapproved_rrna = defaultdict(int)
    approved_rrna = {}
    levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    for rrna_contig in taxonomy_df.index.values:
        if rrna_contig in clustering.index:
            cluster_id = clustering.loc[rrna_contig]['cluster_id']
            if cluster_id in all_approved_set:
                checkm_val = checkm_taxonomy.loc[cluster_id]['Taxonomy'].split(';')
                metaxa_val = taxonomy_df.loc[rrna_contig]['taxonomy'].split(';')

                metaxa_val = fix_strange_metaxa_vals(metaxa_val)

                # Find the most detailed taxonomic level at which CheckM and
                # Metaxa agree. CheckM values carry a rank prefix
                # (e.g. 'k__Bacteria'), hence the [3:] slice.
                matched_level = None
                for i, level in enumerate(levels):
                    checkm_level_val, metaxa_level_val = None, None
                    if len(checkm_val) > i and len(metaxa_val) > i:
                        checkm_level_val = checkm_val[i][3:]
                        metaxa_level_val = metaxa_val[i]

                        if level == 'species':
                            metaxa_level_val = metaxa_val[i].replace(' ', '_')
                        if checkm_level_val == metaxa_level_val:
                            matched_level = i
                        else:
                            break
                    else:
                        matched_level = i - 1
                        break
                if cluster_id not in approved_rrna:
                    approved_rrna[cluster_id] = {'matching': 0, 'not matching': 0}

                # Matching is counted at order level (index 3 in levels).
                if matched_level is not None and matched_level >= 3:
                    approved_rrna[cluster_id]['matching'] += 1
                else:
                    approved_rrna[cluster_id]['not matching'] += 1
            else:
                unapproved_rrna[cluster_id] += 1

    for cluster_id in all_approved_set:
        if cluster_id not in approved_rrna:
            approved_rrna[cluster_id] = {'matching': 0, 'not matching': 0}

    approved_stats_df = pd.DataFrame.from_dict(approved_rrna, orient='index')

    unapproved_stats_df = pd.DataFrame.from_dict(unapproved_rrna, orient='index')
    unapproved_stats_df.columns = ['nr_rrna']

    print(approved_stats_df)
    print(unapproved_stats_df)

    # Things to output:
    #
    # Number of approved genomes with matching rrna
    # Number of approved genomes with non-matching rrna
    # Number of rrna genes per bin
    #
    # Number of approved genomes with > 0 matching rrna and < 2 non-matching rrna
    # Matching is counted at order level


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--clustering_file', help="Clustering in csv format: contig_id,cluster_id")
    parser.add_argument('--taxonomy_file', help="Metaxa taxonomy output for the rRNA contigs")
    parser.add_argument('--all_approved_file', help="e.g. ../../Data/test_binning_and_16s_combo/list_of_all_approved_bins_nocutup.tsv")
    parser.add_argument('--checkm_taxonomy_file', help="e.g. ../../Data/test_binning_and_16s_combo/checkm_tree_qa.tsv")

    args = parser.parse_args()

    main(args)
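
To make the level matching above concrete, here is a minimal sketch with hypothetical lineage strings (the names are invented, not data from this repository). CheckM lineages carry rank prefixes such as 'k__' while Metaxa's do not, hence the [3:] slice before comparing:

# Hypothetical CheckM and Metaxa lineages agreeing down to order level:
checkm_val = "k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales".split(';')
metaxa_val = "Bacteria;Firmicutes;Bacilli;Lactobacillales".split(';')
matched_level = None
for i, (c, m) in enumerate(zip(checkm_val, metaxa_val)):
    if c[3:] == m:  # strip the rank prefix, e.g. 'k__'
        matched_level = i
    else:
        break
assert matched_level == 3  # order level (index 3) counts as matching
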
@@ -0,0 +1,63 @@
#!/usr/bin/env python
"""
Given a preliminary coarse fastani clustering, run the concoct
dnadiff script on each cluster separately.

@author: alneberg
"""
import os
import argparse
import pandas as pd
from subprocess import call

def main(args):
    # Read fastani clustering
    fastani_df = pd.read_table(args.fastani_clustering, sep=',')

    # Convert bin ids to fasta file paths
    def bin_to_path(bin_id):
        return os.path.join(args.input_dir, "{}.fa".format(bin_id))

    fastani_df['filepath'] = fastani_df['bin_id'].apply(bin_to_path)

    # Group by fastani cluster
    for cluster, group_df in fastani_df.groupby('clustering'):
        if len(group_df) == 1:
            print("Cluster {} is a singleton, continuing".format(cluster))
            continue
        # Create dnadiff output folder
        cluster_outdir = os.path.join(args.output_dir, "dnadiff_{}".format(cluster))
        if os.path.isdir(cluster_outdir):
            if args.skip_existing:
                print("Cluster directory {} exists, continuing".format(cluster_outdir))
                continue
        else:
            os.mkdir(cluster_outdir)

        # Run dnadiff
        print("Running dnadiff for cluster {}, with {} genomes".format(cluster, len(group_df)))
        fasta_name_file = os.path.join(args.output_dir, 'fasta_names.txt')
        with open(fasta_name_file, 'w') as ofh:
            for fasta_name in list(group_df['bin_id'].values):
                ofh.write(fasta_name + '\n')
        cmd = [args.dnadiff_script, cluster_outdir]
        for filepath in list(group_df['filepath'].values):
            cmd.append(filepath)

        cmd += ["--fasta_names", fasta_name_file, "--skip_plot"]
        if args.skip_dnadiff:
            cmd.append("--skip_dnadiff")
        call(cmd)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("fastani_clustering", help="Coarse preliminary fastani clustering.")
    parser.add_argument("input_dir", help="Directory where fasta files can be found")
    parser.add_argument("output_dir", help="Directory where the dnadiff directories will be placed")
    parser.add_argument("dnadiff_script", help="The script that will be run")
    parser.add_argument("--skip_dnadiff", action="store_true", help="Skip running dnadiff, assumes it has already been run.")
    parser.add_argument("--skip_existing", action="store_true", help="Skip cluster directories which already exist.")
    args = parser.parse_args()

    main(args)
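
For reference, a sketch of the expected input and the resulting call, with hypothetical names (the CSV layout is inferred from the 'bin_id' and 'clustering' columns used above, and the script path is only an example):

# Assumed fastani clustering CSV:
#   bin_id,clustering
#   sample1_bin3,0
#   sample2_bin7,0
# For cluster 0, the script above would then issue roughly:
cmd = ["scripts/dnadiff_dist_matrix.py", "out/dnadiff_0",
       "in/sample1_bin3.fa", "in/sample2_bin7.fa",
       "--fasta_names", "out/fasta_names.txt", "--skip_plot"]
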
@@ -0,0 +1,44 @@
#!/usr/bin/env python
"""
Copies each bin of at least 1 Mbp that is not in the list of approved
bins from the input directory to the output directory.

@author: alneberg
"""
from __future__ import print_function
import os
import argparse
import pandas as pd
from shutil import copyfile

def main(args):
    # Read bin sizes file
    bin_sizes_df = pd.read_table(args.bin_sizes, sep=',', index_col=0)

    # An empty file means no bins were approved
    if os.stat(args.approved_bins).st_size == 0:
        approved_bins_df = None
    else:
        approved_bins_df = pd.read_table(args.approved_bins, header=None, index_col=0)

    for bin_nr, row in bin_sizes_df.iterrows():
        # Skip bins smaller than 1 Mbp
        if float(row['nr_bases']) < 1000000:
            continue

        # Copy the bin unless it is approved
        if (approved_bins_df is None) or (bin_nr not in approved_bins_df.index):
            source_bin = os.path.join(args.input_dir, "{}.fa".format(bin_nr))
            destination_bin = os.path.join(args.output_dir, "{}.fa".format(bin_nr))
            copyfile(source_bin, destination_bin)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input_dir", help="Directory where the bin fasta files can be found")
    parser.add_argument("bin_sizes", help="CSV file indexed by bin id, with an 'nr_bases' column")
    parser.add_argument("approved_bins", help="File listing all approved bins (may be empty)")
    parser.add_argument("output_dir", help="Directory where the unapproved bins will be copied")

    args = parser.parse_args()
    main(args)
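
A sketch of the assumed bin_sizes layout, with invented numbers; only the index column and the 'nr_bases' column are read above:

import io
import pandas as pd

# Hypothetical bin_sizes content; bin 2 falls below the 1 Mbp cut-off.
bin_sizes_df = pd.read_table(io.StringIO(u"bin_id,nr_bases\n1,2543198\n2,401220\n"),
                             sep=',', index_col=0)
large = bin_sizes_df[bin_sizes_df['nr_bases'] >= 1000000]
assert list(large.index) == [1]
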
@@ -0,0 +1,95 @@
#!/usr/bin/env python
"""
With contigs cut up by cut_up_fasta.py as input, makes sure the consecutive
parts of each original contig are merged, and writes the resulting per-bin
coverage table to the given output file.

@author: alneberg
"""
import os
import argparse
import pandas as pd

def original_contig_name(s):
    """Transform s to the original contig name"""
    n = s.split(".")[-1]
    try:
        int(n)
    except ValueError:
        return s
    # Only small integers are likely to be
    # indicating a cutup part.
    if int(n) < 1000:
        return ".".join(s.split(".")[:-1])
    else:
        # A large n indicates that the integer
        # was part of the original contig name
        return s

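# A quick illustration of the heuristic above, with made-up contig ids
# (real ids will of course differ):
#   original_contig_name("contig_17.3")   -> "contig_17"     (".3" looks like a cutup part)
#   original_contig_name("contig.201869") -> "contig.201869" (large integer, part of the name)
#   original_contig_name("contig_17")     -> "contig_17"     (no integer suffix)
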

def main(args):

    # Check if any approved bins exist:
    if os.stat(args.list_all_approved_bins).st_size == 0:
        return

    print("# Find all approved bins")
    approved_bins = pd.read_table(args.list_all_approved_bins, header=None, index_col=0)

    print("# Open clustering file (non-cutup)")
    clustering_df = pd.read_table(args.clustering_nocutup, header=None, index_col=0, names=['contig_id', 'cluster_id'], sep=',')

    print("# Read input table")
    input_table = pd.read_table(args.concoct_inputtable, index_col=0)
    original_columns = input_table.columns

    print("# Read kallisto quant result for the contig length")
    kallisto_df = pd.read_table(args.kallisto_quant, index_col=0)

    print("# Create a new column with non-cutup contig id")
    input_table['target_id'] = input_table.index
    input_table['original_contig_id'] = input_table['target_id'].apply(original_contig_name)

    print("# Subset clustering file to get all approved contigs.")
    approved_contigs = clustering_df[clustering_df['cluster_id'].isin(approved_bins.index)]

    print("# Subset input table to all contigs included in approved bins")
    # .copy() so that the columns added below do not modify a view
    approved_table = input_table[input_table['original_contig_id'].isin(approved_contigs.index)].copy()

    print("# Add length for the relevant contigs")
    approved_table['length'] = kallisto_df.loc[approved_table.index]['length']

    def cluster_id_for_approved(or_id):
        return approved_contigs.loc[or_id]['cluster_id']

    print("# Assign cluster id to input table")
    approved_table['cluster_id'] = approved_table['original_contig_id'].apply(cluster_id_for_approved)

    results_d = {}

    # Group by cluster id and compute the length-weighted mean coverage per bin
    for cluster, cluster_df in approved_table.groupby('cluster_id'):
        total_length = cluster_df['length'].sum()
        cluster_name = "{}_bin{}".format(args.sample_name, cluster)
        weighted = cluster_df[original_columns].mul(cluster_df['length'], axis=0)
        result_s = weighted.sum() / float(total_length)
        result_s['length'] = int(total_length)
        results_d[cluster_name] = result_s

    pd.DataFrame.from_dict(results_d, orient='index').to_csv(args.output_file, sep='\t', index_label="MAG")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("list_all_approved_bins", help="A file with all approved bins.")
    parser.add_argument("clustering_nocutup", help="Clustering file for all non-cutup contigs")
    parser.add_argument("concoct_inputtable", help="concoct input table for the cutup contigs")
    parser.add_argument("kallisto_quant", help="kallisto quant result for one sample, to get contig lengths")
    parser.add_argument("sample_name", help="Sample name")
    parser.add_argument("output_file", help="Output file")
    args = parser.parse_args()

    main(args)
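
The per-bin values above are a length-weighted mean over a bin's contigs, i.e. sum(cov_i * len_i) / sum(len_i). A minimal sketch with invented numbers (two contigs, one coverage column):

# Hypothetical contigs in one bin: coverages 10.0 and 4.0, lengths 9000 and 1000.
covs = [10.0, 4.0]
lens = [9000, 1000]
total_length = sum(lens)
merged_cov = sum(c * l for c, l in zip(covs, lens)) / float(total_length)
assert merged_cov == 9.4  # the long contig dominates the merged coverage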