From 69044a249ac594011d370769a8d7e0b24b4eee3a Mon Sep 17 00:00:00 2001 From: alneberg Date: Thu, 15 Nov 2018 10:52:46 +0100 Subject: [PATCH] More scripts useful for concoct binning evaluation --- scripts/concoct/approve_bins_from_busco.py | 53 ++++++++++++ scripts/concoct/assign_metaxa_taxonomy.py | 66 ++++++++++++++ scripts/concoct/extract_16S_fasta.py | 85 +++++++++++++++++++ scripts/concoct/extract_stats_for_approved.py | 31 +++++++ .../dnadiff_eukaryote_reference_parsing.py | 71 ++++++++++++++++ scripts/parse_augustus_basic.py | 48 +++++++++++ 6 files changed, 354 insertions(+) create mode 100644 scripts/concoct/approve_bins_from_busco.py create mode 100644 scripts/concoct/assign_metaxa_taxonomy.py create mode 100644 scripts/concoct/extract_16S_fasta.py create mode 100644 scripts/concoct/extract_stats_for_approved.py create mode 100755 scripts/dnadiff_eukaryote_reference_parsing.py create mode 100644 scripts/parse_augustus_basic.py diff --git a/scripts/concoct/approve_bins_from_busco.py b/scripts/concoct/approve_bins_from_busco.py new file mode 100644 index 0000000..3b3391d --- /dev/null +++ b/scripts/concoct/approve_bins_from_busco.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +""" +Based on the busco results, approves bins according to the leves of contamination and completeness. + +Copies approved bins to output directory. + +@author: alneberg +""" +from __future__ import print_function +import sys +import os +import argparse +import pandas as pd +from shutil import copyfile + +def main(args): + # Read in the busco table + df = pd.read_table(args.busco_stats, index_col=0, sep=',') + df['completeness_float'] = df['completeness'].apply(lambda x: float(x.replace('%',''))) + df['duplicated_float'] = df['duplicated'].apply(lambda x: float(x.replace('%',''))) + # extract the ids for all rows that meet the requirements + filtered_df = df[(df['completeness_float'] >= args.min_completeness) & (df['duplicated_float'] <= args.max_contamination)] + + if len(filtered_df) < 1: + sys.stderr.write("\nApproved 0 bins\n\n") + return None + + approved_bins = list(filtered_df.index) + + # copy the approved bins to the new output directory + for approved_bin_int in approved_bins: + approved_bin = str(approved_bin_int) + bin_source = os.path.join(args.bin_directory, approved_bin) + bin_source += '.' + args.extension + bin_destination = os.path.join(args.output_directory) + bin_destination += '/' + os.path.basename(bin_source) + + sys.stderr.write("Copying approved bin {} from {} to {}\n".format(approved_bin, bin_source, bin_destination)) + copyfile(bin_source, bin_destination) + + sys.stderr.write("\nApproved {} bins\n\n".format(len(approved_bins))) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("bin_directory", help=("Input fasta files should be within directory.")) + parser.add_argument("busco_stats", help="Busco stats in csv format with header and bin id as index/first column") + parser.add_argument("output_directory", help="Directory where to put approved bins") + parser.add_argument("--min_completeness", default=30, type=float, help="default=30") + parser.add_argument("--max_contamination", default=10, type=float, help="default=10") + parser.add_argument("--extension", default='fa') + args = parser.parse_args() + + main(args) diff --git a/scripts/concoct/assign_metaxa_taxonomy.py b/scripts/concoct/assign_metaxa_taxonomy.py new file mode 100644 index 0000000..51f1f27 --- /dev/null +++ b/scripts/concoct/assign_metaxa_taxonomy.py @@ -0,0 +1,66 @@ +import pandas as pd +import argparse +from collections import defaultdict +import os +import math + +def main(args): + clustering = pd.read_table(args.clustering_file, sep=',', names=['contig_id', 'cluster_id'], index_col=0) + taxonomy_df = pd.read_table(args.taxonomy_file, header=None, index_col=0, names=["contig_id", "taxonomy", "identity", "aln_length", "reliability_score"]) + all_approved_prok = pd.read_table(args.all_approved_file, header=None, names=["contig_id"], index_col=0) + if args.all_approved_euk_file: + all_approved_euk = pd.read_table(args.all_approved_euk_file, header=None, names=["contig_id"], index_col=0) + + def pair_approved_and_metaxa(all_approved): + all_approved_set = set(all_approved.index.values) + unapproved_rrna = defaultdict(int) + all_clusters_found = set() + approved_rrna = [] + taxonomy_df['taxonomy'].fillna('', inplace=True) + for rrna_contig in taxonomy_df.index.values: + if rrna_contig in clustering.index: + cluster_id = clustering.loc[rrna_contig]['cluster_id'] + metaxa_val = taxonomy_df.loc[rrna_contig] + if cluster_id in all_approved_set: + tax_dict = metaxa_val.to_dict() + tax_dict['cluster_id'] = cluster_id + tax_dict['contig_id'] = rrna_contig + approved_rrna.append(tax_dict) + all_clusters_found.add(cluster_id) + + for cluster_id in all_approved_set: + if cluster_id not in all_clusters_found: + approved_rrna.append({'cluster_id': cluster_id, 'contig_id': '', 'taxonomy': '', 'identity': '', 'aln_length': '', 'reliability_score': ''}) + + return approved_rrna + + approved_rrna_prok = pair_approved_and_metaxa(all_approved_prok) + approved_stats_prok_df = pd.DataFrame(approved_rrna_prok) + + approved_rrna_euk = pair_approved_and_metaxa(all_approved_euk) + approved_stats_euk_df = pd.DataFrame(approved_rrna_euk) + + columns = ["cluster_id", "contig_id", "taxonomy", "identity", "aln_length", "reliability_score"] + with open(os.path.join(args.outdir, 'approved_prok.tsv'), 'w') as ofh: + if len(approved_stats_prok_df): + approved_stats_prok_df[columns].to_csv(ofh, sep='\t', index=False) + else: + ofh.write('\t'.join(columns) + '\n') + with open(os.path.join(args.outdir, 'approved_euk.tsv'), 'w') as ofh: + if len(approved_stats_euk_df): + approved_stats_euk_df[columns].to_csv(ofh, sep='\t', index=False) + else: + ofh.write('\t'.join(columns) + '\n') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--clustering_file', help="CONCOCT nocutup merged clustering clustering_nocutup.csv") + parser.add_argument('--taxonomy_file', help="Metaxa output all_contigs.taxonomy.txt") + parser.add_argument('--all_approved_file', help="list_of_all_approved_bins_nocutup.tsv") + parser.add_argument('--all_approved_euk_file', help="list_of_all_approved_bins_nocutup_eukaryotes.tsv") + parser.add_argument('--outdir', help="A directory for output files") + + args = parser.parse_args() + + main(args) diff --git a/scripts/concoct/extract_16S_fasta.py b/scripts/concoct/extract_16S_fasta.py new file mode 100644 index 0000000..ab030d0 --- /dev/null +++ b/scripts/concoct/extract_16S_fasta.py @@ -0,0 +1,85 @@ +import pandas as pd +import argparse +from collections import defaultdict +import os +import math +from Bio import SeqIO + +"""A script to extract rrna sequences for approved MAGs given the metaxa results. + +""" + +def main(args): + clustering = pd.read_table(args.clustering_file, sep=',', names=['contig_id', 'cluster_id'], index_col=0) + taxonomy_df = pd.read_table(args.taxonomy_file, header=None, index_col=0, names=["contig_id", "taxonomy", "identity", "aln_length", "reliability_score"]) + all_approved_prok = pd.read_table(args.all_approved_file, header=None, names=["contig_id"], index_col=0) + if args.all_approved_euk_file: + all_approved_euk = pd.read_table(args.all_approved_euk_file, header=None, names=["contig_id"], index_col=0) + + def pair_approved_and_metaxa(all_approved): + all_approved_set = set(all_approved.index.values) + contig_to_mag = {} + taxonomy_df['taxonomy'].fillna('', inplace=True) + for rrna_contig in taxonomy_df.index.values: + # Only long contigs are in clustering + if rrna_contig in clustering.index: + cluster_id = clustering.loc[rrna_contig]['cluster_id'] + + # Only report for approved bins + if cluster_id in all_approved_set: + contig_to_mag[rrna_contig] = cluster_id + + return contig_to_mag + + contig_to_mag_prok = pair_approved_and_metaxa(all_approved_prok) + + contig_to_mag_euk = pair_approved_and_metaxa(all_approved_euk) + + + def filter_seqs(contig_to_mag, input_fasta ): + # Keep track if a contig is present several times + count_per_contig = {} + + output_seqs = [] + # Read prok fasta file + for seq in SeqIO.parse(input_fasta, "fasta"): + if seq.id in count_per_contig: + count_per_contig[seq.id] += 1 + else: + count_per_contig[seq.id] = 1 + + # check which MAG it belonged to + if seq.id in contig_to_mag: + mag = contig_to_mag[seq.id] + new_mag_id = args.bin_prefix + str(mag) + new_seq_id = "{}_{}_{}".format(seq.id, count_per_contig[seq.id], new_mag_id) + seq.id = new_seq_id + + output_seqs.append(seq) + + return output_seqs + + prok_output_seqs = filter_seqs(contig_to_mag_prok, args.rrna_prok_fasta_file) + + with open(os.path.join(args.outdir, "prok_rRNA.fasta"), 'w') as ofh: + SeqIO.write(prok_output_seqs, ofh, 'fasta') + + euk_output_seqs = filter_seqs(contig_to_mag_euk, args.rrna_euk_fasta_file) + + with open(os.path.join(args.outdir, "euk_rRNA.fasta"), 'w') as ofh: + SeqIO.write(euk_output_seqs, ofh, 'fasta') + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--clustering_file', help="CONCOCT nocutup merged clustering clustering_nocutup.csv") + parser.add_argument('--taxonomy_file', help="Metaxa output all_contigs.taxonomy.txt") + parser.add_argument('--rrna_prok_fasta_file', help="Metaxa output all_contigs.bacteria.fasta and all_contigs.archaea.fasta combined") + parser.add_argument('--rrna_euk_fasta_file', help="Metaxa output all_contigs.eukaryota.fasta") + parser.add_argument('--all_approved_file', help="list_of_all_approved_bins_nocutup.tsv") + parser.add_argument('--all_approved_euk_file', help="list_of_all_approved_bins_nocutup_eukaryotes.tsv") + parser.add_argument('--bin_prefix', help="Prefix that will be added be fore the integer indicating cluster id") + parser.add_argument('--outdir', help="A directory for output files") + + args = parser.parse_args() + + main(args) diff --git a/scripts/concoct/extract_stats_for_approved.py b/scripts/concoct/extract_stats_for_approved.py new file mode 100644 index 0000000..4dc51fc --- /dev/null +++ b/scripts/concoct/extract_stats_for_approved.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +""" +Based on the checkm results, approves bins according to the leves of contamination and completeness. + +Prints the corresponding stats to stdout. + +@author: alneberg +""" +from __future__ import print_function +import sys +import os +import argparse +import pandas as pd +from shutil import copyfile + +def main(args): + # Read in the checkm table + df = pd.read_table(args.checkm_stats, index_col=0) + # extract the ids for all rows that meet the requirements + filtered_df = df[(df['Completeness'] >= args.min_completeness) & (df['Contamination'] <= args.max_contamination)] + + filtered_df.to_csv(sys.stdout, sep='\t') + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("checkm_stats", help="Checkm qa stats in tab_table format") + parser.add_argument("--min_completeness", default=85, type=float, help="default=85") + parser.add_argument("--max_contamination", default=5, type=float, help="default=5") + args = parser.parse_args() + + main(args) diff --git a/scripts/dnadiff_eukaryote_reference_parsing.py b/scripts/dnadiff_eukaryote_reference_parsing.py new file mode 100755 index 0000000..2dfdf8f --- /dev/null +++ b/scripts/dnadiff_eukaryote_reference_parsing.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +""" +Parses a large number of dnadiff reports to only report the most significant hits + +Only hits with more than 10% of the query bases aligned is reported. + +output_folder/fastaname1_vs_fastaname2/ +output_folder/fastaname1_vs_fastaname3/ + +etc + +""" +import argparse +import re +import os +from os.path import join as ospj + +class MUMmerReport(object): + """Represents .report file from MUMmer's dnadiff. Stores TotalBases and + AlignedBases (%) stats.""" + def __init__(self, report_file): + self.report_file = report_file + + with open(report_file) as f: + one_to_one_parsed = False + + for line in f: + if line.startswith("TotalBases"): + self.tot_bases = [int(b) for b in line.split()[1:]] + if line.startswith("AlignedBases"): + # store percentage + self.aligned_perc = [float(p) for p in + re.findall(r'\((.*?)\%\)', line)] + self.aligned_bases = [int(n) for n in + re.findall(r'\W([0-9]*)\(', line)] + if not one_to_one_parsed and line.startswith("AvgIdentity"): + self.avg_identity = [float(p) for p in line.split()[1:]][0] + one_to_one_parsed = True + self.aligned_perc_ref = self.aligned_perc[0] + self.aligned_perc_mag = self.aligned_perc[1] + self.aligned_bases_ref = self.aligned_bases[0] + self.aligned_bases_mag = self.aligned_bases[1] + +def main(args): + # Get fasta names + + # Get basename from fasta files and see if those are unique + fasta_names_ref = [".".join(os.path.basename(f).split(".")[0:-1]) for f in + args.fasta_files_ref] + fasta_names_mag = [os.path.basename(f).split(".")[0] for f in + args.fasta_files_mag] + + print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format("ref_fasta_name", "mag_fasta_name", "aligned_bases_ref", "aligned_perc_ref", "aligned_bases_mag", "aligned_perc_mag", "avg_identity")) + for ref_fasta_name in fasta_names_ref: + for mag_fasta_name in fasta_names_mag: + repfile = ospj(args.input_dir, "{fn1}_vs_{fn2}.report".format( + fn1=ref_fasta_name, fn2=mag_fasta_name)) + mumr = MUMmerReport(repfile) + if mumr.aligned_perc_mag >= args.min_coverage: + print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(ref_fasta_name, mag_fasta_name, mumr.aligned_bases_ref, mumr.aligned_perc_ref, mumr.aligned_bases_mag, mumr.aligned_perc_mag, mumr.avg_identity)) + +if __name__ == "__main__": + """Return input arguments using argparse""" + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("input_dir") + parser.add_argument("--fasta_files_ref", nargs='+') + parser.add_argument("--fasta_files_mag", nargs='+') + parser.add_argument("--min_coverage", type=float, default=10) + args = parser.parse_args() + main(args) diff --git a/scripts/parse_augustus_basic.py b/scripts/parse_augustus_basic.py new file mode 100644 index 0000000..6c7a1cf --- /dev/null +++ b/scripts/parse_augustus_basic.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python +""" +Reads an augustus output file and prints the amino_acid fasta file to stdout + +@author: alneberg +""" +import sys +import os +import argparse + +def to_fasta(s, gene_id, contig_id): + print('>{}_{}'.format(contig_id, gene_id)) + print(s) + + +def main(args): + with open(args.augustus_output) as ifh: + protein_str = None + for line in ifh: + line = line.strip() + # Check contig_id + if not line.startswith('#'): + contig_id = line.split('\t')[0] + # Check if protein starts + elif line.startswith('# protein sequence'): + line = line.replace('# protein sequence = [', '') + protein_str = line.replace(']','') + protein_str = line + elif protein_str: + # If protein ends + # Parse lines and output to stdout + if line.startswith('# end gene'): + line = line.replace('# end gene ', '') + gene_id = line + to_fasta(protein_str, gene_id, contig_id) + protein_str = None + else: + # add to protein lines + line = line.replace('# ', '') + line = line.replace(']', '') + protein_str += line + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("augustus_output", help=("Standard output format of augustus.")) + args = parser.parse_args() + + main(args)