From 41a1d4dc60c001969541eded62edb8449abffb03 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Fri, 15 Jun 2018 13:53:51 +0200 Subject: [PATCH 1/3] Added scripts for contamination and chimeric reads finding --- scripts/sag_vs_mag/check_status_in_mapping.py | 81 ++++++++++++++++ scripts/sag_vs_mag/filter_out_none_clipped.py | 62 +++++++++++++ .../find_candidate_chimeric_reads.py | 70 ++++++++++++++ scripts/sag_vs_mag/mag_no_mag_mapping.py | 92 +++++++++++++++++++ 4 files changed, 305 insertions(+) create mode 100755 scripts/sag_vs_mag/check_status_in_mapping.py create mode 100755 scripts/sag_vs_mag/filter_out_none_clipped.py create mode 100755 scripts/sag_vs_mag/find_candidate_chimeric_reads.py create mode 100755 scripts/sag_vs_mag/mag_no_mag_mapping.py diff --git a/scripts/sag_vs_mag/check_status_in_mapping.py b/scripts/sag_vs_mag/check_status_in_mapping.py new file mode 100755 index 0000000..a648db5 --- /dev/null +++ b/scripts/sag_vs_mag/check_status_in_mapping.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +usage="""Checks the status for a subset of reads within a bam file. + +Three categories: + Matching + Soft clipped + Not mapping +""" + +import argparse +import sys +import re + +soft_clip_re = re.compile('([0-9]+)S') +matched_re = re.compile('([0-9]+)M') + +nr_mismatched_re = re.compile('XM:i:([0-9]+)') + +def parse_cigar(cigar): + soft_clipped_matches = soft_clip_re.findall(cigar) + matched_matches = matched_re.findall(cigar) + soft_clipped_bases = sum([int(x) for x in soft_clipped_matches]) + matched_bases = sum([int(x) for x in matched_matches]) + + return soft_clipped_bases, matched_bases + +def main(args): + all_reads = set() + for line in open(args.subset_reads, 'r'): + read = line.strip() + all_reads.add(read) + + not_mapped = set() + soft_clipped_mapped = set() + other = set() + good_mapped = set() + + # Read the input sam files + for line in sys.stdin: + line = line.strip() + + # Check if in header + if line[0] != '@': + # Not in header + + line_split = line.split('\t') + if (int(line_split[1]) / 128) == 1: + read_id = line_split[0] + "_2" + else: + read_id = line_split[0] + "_1" + + # Ignore reads which are not in the subset + if read_id in all_reads: + + no_mapping_bit = (((((int(line_split[1]) % 128) % 64) % 32) % 16) % 8) / 4 + if no_mapping_bit == 1: + not_mapped.add(read_id) + else: + cigar = line_split[5] + + soft_clipped, matching = parse_cigar(cigar) + read_length = len(line_split[9]) + mismatches = int(nr_mismatched_re.findall(line)[0]) + + if soft_clipped > 20: + soft_clipped_mapped.add(read_id) + elif (matching > read_length*0.90): + good_mapped.add(read_id) + else: + other.add(read_id) + + print("Not Mapped\tSoft Clipped\tOther\tGood Mapped") + print("{}\t{}\t{}\t{}".format(len(not_mapped), len(soft_clipped_mapped), len(other), len(good_mapped))) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=usage) + parser.add_argument('subset_reads') + args = parser.parse_args() + + main(args) diff --git a/scripts/sag_vs_mag/filter_out_none_clipped.py b/scripts/sag_vs_mag/filter_out_none_clipped.py new file mode 100755 index 0000000..55b62ac --- /dev/null +++ b/scripts/sag_vs_mag/filter_out_none_clipped.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +usage="""Filter potential chimeric reads to remove correctly none-clipped. +""" + +import argparse +import sys +import re + +soft_clip_re = re.compile('([0-9]+)S') +matched_re = re.compile('([0-9]+)M') + +nr_mismatched_re = re.compile('XM:i:([0-9]+)') + +def parse_cigar(cigar): + soft_clipped_matches = soft_clip_re.findall(cigar) + matched_matches = matched_re.findall(cigar) + soft_clipped_bases = sum([int(x) for x in soft_clipped_matches]) + matched_bases = sum([int(x) for x in matched_matches]) + + return soft_clipped_bases, matched_bases + +def main(args): + all_reads = set() + for line in open(args.potential_chimera, 'r'): + read = line.strip() + all_reads.add(read) + + # Read the input sam files + for line in sys.stdin: + line = line.strip() + + # Check if in header + if line[0] != '@': + # Not in header + + # Ignore reads which are note potential chimeras + # Read is most likely not chimeric + line_split = line.split('\t') + if (int(line_split[1]) / 128) == 1: + read_id = line_split[0] + "_2" + else: + read_id = line_split[0] + "_1" + + if read_id in all_reads: + cigar = line_split[5] + # Matching min 95% of bases + soft_clipped, matching = parse_cigar(cigar) + + read_length = len(line_split[9]) + mismatches = int(nr_mismatched_re.findall(line)[0]) + + if (matching > read_length*0.95) and (int(mismatches) < 5): + all_reads.remove(read_id) + + print("\n".join(list(all_reads))) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=usage) + parser.add_argument('potential_chimera') + args = parser.parse_args() + + main(args) diff --git a/scripts/sag_vs_mag/find_candidate_chimeric_reads.py b/scripts/sag_vs_mag/find_candidate_chimeric_reads.py new file mode 100755 index 0000000..32da16c --- /dev/null +++ b/scripts/sag_vs_mag/find_candidate_chimeric_reads.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +usage="""Prints reads which are potential chimeras. + +These are determined as being soft clipped, not on the edge +of contigs and at leat 50% mapped with max 2 mismatches. + +Input SAM with header. +""" + +import argparse +import sys +import re + +soft_clip_re = re.compile('([0-9]+)S') +matched_re = re.compile('([0-9]+)M') + +nr_mismatched_re = re.compile('XM:i:([0-9]+)') + +def parse_cigar(cigar): + soft_clipped_matches = soft_clip_re.findall(cigar) + matched_matches = matched_re.findall(cigar) + soft_clipped_bases = sum([int(x) for x in soft_clipped_matches]) + matched_bases = sum([int(x) for x in matched_matches]) + + return soft_clipped_bases, matched_bases + +def main(): + contig_len_re = re.compile('SN:(.*) *LN:([0-9]*)$') + contig_lens = {} + + # Read the input sam file + for line in sys.stdin: + line = line.strip() + + # Check if in header + if line[0] == '@': + if line[0:3] == '@SQ': + contig, contig_len = contig_len_re.findall(line)[0] + contig_lens[contig] = contig_len + else: + # Not in header + line_split = line.split('\t') + cigar = line_split[5] + # Soft clipped at least 20 bases + if 'S' in cigar: + soft_clipped, matching = parse_cigar(cigar) + if soft_clipped > 20: + # Matching region should be > 50% of read + read_length = len(line_split[9]) + if matching > read_length/2.0: + contig, mapping_start_pos = line_split[2], line_split[7] + + # mapping should not be on the edges + if mapping_start_pos > 300 and int(contig_lens[contig]) - int(mapping_start_pos) > 300: + # check nr_mismatches + mismatches = int(nr_mismatched_re.findall(line)[0]) + if int(mismatches) < 2: + # check which read + if (int(line_split[1]) / 128) == 1: + read_nr = "2" + else: + read_nr = "1" + + print(line_split[0]+"_" + read_nr) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=usage) + parser.parse_args() + main() diff --git a/scripts/sag_vs_mag/mag_no_mag_mapping.py b/scripts/sag_vs_mag/mag_no_mag_mapping.py new file mode 100755 index 0000000..6c531f0 --- /dev/null +++ b/scripts/sag_vs_mag/mag_no_mag_mapping.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python +usage="""Checks the status for a subset of reads within a bam file. + +Three categories: + Not Mapped + Mapping to < 1000 bp contig + Mapping to > 1000 bp contig (not MAG) + Mapping to MAG contig +""" + +import argparse +import sys +import re + +soft_clip_re = re.compile('([0-9]+)S') +matched_re = re.compile('([0-9]+)M') + +nr_mismatched_re = re.compile('XM:i:([0-9]+)') + +def parse_cigar(cigar): + soft_clipped_matches = soft_clip_re.findall(cigar) + matched_matches = matched_re.findall(cigar) + soft_clipped_bases = sum([int(x) for x in soft_clipped_matches]) + matched_bases = sum([int(x) for x in matched_matches]) + + return soft_clipped_bases, matched_bases + +def main(args): + all_reads = set() + for line in open(args.subset_reads, 'r'): + read = line.strip() + all_reads.add(read) + + MAG_contigs = set() + for line in open(args.mag_contigs, 'r'): + contig = line.strip() + MAG_contigs.add(contig) + + not_mapped = set() + short_mapped = set() + long_mapped = set() + MAG_mapped = set() + + contig_len_re = re.compile('SN:(.*) *LN:([0-9]*)$') + contig_lens = {} + + # Read the input sam file + for line in sys.stdin: + line = line.strip() + + # Check if in header + if line[0] == '@': + if line[0:3] == '@SQ': + contig, contig_len = contig_len_re.findall(line)[0] + contig = contig.split('_')[0] + contig_lens[contig] = int(contig_len) + else: + # Not in header + line_split = line.split('\t') + if (int(line_split[1]) / 128) == 1: + read_id = line_split[0] + "_2" + else: + read_id = line_split[0] + "_1" + + # Ignore reads which are not in the subset + if read_id in all_reads: + + no_mapping_bit = (((((int(line_split[1]) % 128) % 64) % 32) % 16) % 8) / 4 + if no_mapping_bit == 1: + not_mapped.add(read_id) + else: + contig = line_split[2] + contig = contig.split('_')[0] + if contig in MAG_contigs: + MAG_mapped.add(read_id) + elif contig_lens[contig] >= 1000: + long_mapped.add(read_id) + else: + short_mapped.add(read_id) + + print("Not Mapped\tShort Mapped\tLong Mapped\tMAG mapped") + print("{}\t{}\t{}\t{}".format(len(not_mapped), len(short_mapped), + len(long_mapped), len(MAG_mapped))) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=usage) + parser.add_argument('subset_reads') + parser.add_argument('mag_contigs') + args = parser.parse_args() + + main(args) From b1935352ca087dffe2d595da5a1f9dc247d3ba79 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Tue, 19 Jun 2018 14:52:23 +0200 Subject: [PATCH 2/3] Changed what to count in the sag mag chimera mapping --- scripts/sag_vs_mag/check_status_in_mapping.py | 95 +++++++++++++------ 1 file changed, 65 insertions(+), 30 deletions(-) diff --git a/scripts/sag_vs_mag/check_status_in_mapping.py b/scripts/sag_vs_mag/check_status_in_mapping.py index a648db5..2e3cf45 100755 --- a/scripts/sag_vs_mag/check_status_in_mapping.py +++ b/scripts/sag_vs_mag/check_status_in_mapping.py @@ -14,6 +14,8 @@ soft_clip_re = re.compile('([0-9]+)S') matched_re = re.compile('([0-9]+)M') +left_clipped_re = re.compile('^([0-9]+)S') + nr_mismatched_re = re.compile('XM:i:([0-9]+)') def parse_cigar(cigar): @@ -22,25 +24,54 @@ def parse_cigar(cigar): soft_clipped_bases = sum([int(x) for x in soft_clipped_matches]) matched_bases = sum([int(x) for x in matched_matches]) - return soft_clipped_bases, matched_bases + left_clipped_matches = left_clipped_re.findall(cigar) + left_clipped_bases = sum([int(x) for x in left_clipped_matches]) + + return soft_clipped_bases, matched_bases, left_clipped_bases + + +""" +chimeric or not +soft clipped (> 20S) or not +overlapping edge or not +mapped or not + + soft clipped not soft clipped overlapping edge not overlapping edge overlapping edge and soft clipped mapped not mapped + chimeric: +Non-chimeric: + +""" + +def print_stats(name, soft_clipped_set, overlapping_set, mapped_set, complete_set): + "soft clipped not soft clipped overlapping edge not overlapping edge overlapping edge and soft clipped mapped not mapped" + + print("{},{},{},{},{},{},{},{}".format(name, len(soft_clipped_set), len(complete_set - soft_clipped_set), len(overlapping_set), len(complete_set - overlapping_set), len(overlapping_set & soft_clipped_set), len(mapped_set), len(complete_set - mapped_set))) + def main(args): - all_reads = set() + chim_reads = set() for line in open(args.subset_reads, 'r'): read = line.strip() - all_reads.add(read) + chim_reads.add(read) + + mapped_reads = set() + soft_clipped_reads = set() + overlapping_edge_reads = set() + all_reads = set() + contig_lens = {} - not_mapped = set() - soft_clipped_mapped = set() - other = set() - good_mapped = set() + contig_len_re = re.compile('SN:(.*) *LN:([0-9]*)$') # Read the input sam files for line in sys.stdin: line = line.strip() # Check if in header - if line[0] != '@': + if line[0] == '@': + if line[0:3] == '@SQ': + contig, contig_len = contig_len_re.findall(line)[0] + contig_lens[contig] = contig_len + else: # Not in header line_split = line.split('\t') @@ -49,28 +80,32 @@ def main(args): else: read_id = line_split[0] + "_1" - # Ignore reads which are not in the subset - if read_id in all_reads: - - no_mapping_bit = (((((int(line_split[1]) % 128) % 64) % 32) % 16) % 8) / 4 - if no_mapping_bit == 1: - not_mapped.add(read_id) - else: - cigar = line_split[5] - - soft_clipped, matching = parse_cigar(cigar) - read_length = len(line_split[9]) - mismatches = int(nr_mismatched_re.findall(line)[0]) - - if soft_clipped > 20: - soft_clipped_mapped.add(read_id) - elif (matching > read_length*0.90): - good_mapped.add(read_id) - else: - other.add(read_id) - - print("Not Mapped\tSoft Clipped\tOther\tGood Mapped") - print("{}\t{}\t{}\t{}".format(len(not_mapped), len(soft_clipped_mapped), len(other), len(good_mapped))) + all_reads.add(read_id) + + mapping_bit = (((((int(line_split[1]) % 128) % 64) % 32) % 16) % 8) / 4 + if mapping_bit != 1: + mapped_reads.add(read_id) + cigar = line_split[5] + + soft_clipped, matching, left_clipped_bases = parse_cigar(cigar) + if soft_clipped > 20: + soft_clipped_reads.add(read_id) + + read_length = len(line_split[9]) + + contig, mapping_start_pos = line_split[2], line_split[7] + + if int(mapping_start_pos) - left_clipped_bases < 0: + overlapping_edge_reads.add(read_id) + + if int(mapping_start_pos) + read_length > contig_lens[contig]: + overlapping_edge_reads.add(read_id) + + # Aim to print> + # "soft clipped not soft clipped overlapping edge not overlapping edge overlapping edge and soft clipped mapped not mapped" + print("name,soft clipped, not soft clipped,overlapping edge,not overlapping edge,overlapping edge and soft clipped,mapped,not mapped") + print_stats("chimeric_reads", soft_clipped_reads & chim_reads, overlapping_edge_reads & chim_reads, mapped_reads & chim_reads, chim_reads) + print_stats("other_reads", soft_clipped_reads - chim_reads, overlapping_edge_reads - chim_reads, mapped_reads - chim_reads, all_reads - chim_reads) if __name__ == "__main__": From 4ebf7f15b2906e8d9cbe65e992503d951ba8d226 Mon Sep 17 00:00:00 2001 From: alneberg Date: Fri, 29 Jun 2018 18:07:13 +0200 Subject: [PATCH 3/3] Some conveniece scripts for ena uploading --- .../construct_ena_sequencing_runs_table.py | 71 +++++++++++++++++++ scripts/convenience/upload_to_ena_ftp.sh | 14 ++++ 2 files changed, 85 insertions(+) create mode 100644 scripts/convenience/construct_ena_sequencing_runs_table.py create mode 100755 scripts/convenience/upload_to_ena_ftp.sh diff --git a/scripts/convenience/construct_ena_sequencing_runs_table.py b/scripts/convenience/construct_ena_sequencing_runs_table.py new file mode 100644 index 0000000..f676fb2 --- /dev/null +++ b/scripts/convenience/construct_ena_sequencing_runs_table.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +"""construct_ena_sequencing_runs_table.py + +Based on a folder with uploaded files and a template, construct table ready to be submitted. +This script is not general but is very niched to the NGI/Uppmax scenario. +Ideally the user is expected to copy this script and edit it to suit the users needs. +""" + +import argparse +import sys +import os +import glob +from os.path import join as opj +import pandas as pd +from collections import defaultdict +import gzip + +# Need to fetch file name and link it to sample id +# Fetch the md5sum already calculated + +def main(args): + md5sum_df = pd.read_table(args.md5_summary, sep=',', header=None, names=['file_name', 'md5sum'], index_col=0) + + insert_sizes = pd.read_table(args.insert_size, index_col=0) + info_d = {} + + for sample_dir in args.sample_dirs: + for R1_run_file in glob.glob(opj(sample_dir, "*", "*_R1*.fastq.gz")): + R1_file_name=os.path.basename(R1_run_file) + sample_name="_".join(R1_file_name.split('_')[0:2]) + run_id="_".join(R1_file_name.split('_')[0:4]) + run_info = {} + run_info['sample_accession'] = sample_name + run_info['library_name'] = run_id + is_series = insert_sizes.ix[sample_name]['Avg. FS'] + try: + run_info['insert_size'] = int(round(is_series)) + except TypeError: + run_info['insert_size'] = int(round(insert_sizes[insert_sizes['Lib QC'] == 'PASSED'].ix[sample_name]['Avg. FS'])) + + run_info['forward_file_name'] = R1_file_name + run_info['forward_file_md5'] = md5sum_df.loc[R1_file_name]['md5sum'] + R2_file_name = R1_file_name.replace("R1", "R2") + run_info['reverse_file_name'] = R2_file_name + run_info['reverse_file_md5'] = md5sum_df.loc[R2_file_name]['md5sum'] + run_info['library_source'] = 'METAGENOMIC' + run_info['library_selection'] = 'RANDOM' + run_info['library_strategy'] = 'WGS' + run_info['library_construction_protocol'] = 'Rubicon Thruplex' + run_info['instrument_model'] = 'Illumina HiSeq 2500' + run_info['file_type'] = 'fastq' + run_info['library_layout'] = 'PAIRED' + + + info_d[run_id] = run_info + all_columns_sorted = ['sample_accession', 'library_name', 'library_source', 'insert_size', \ + 'library_selection', 'library_strategy', 'library_construction_protocol', 'instrument_model', \ + 'file_type', 'library_layout', 'insert_size', \ + 'forward_file_name', 'forward_file_md5', 'reverse_file_name', 'reverse_file_md5'] + + df = pd.DataFrame.from_dict(info_d, orient='index') + df[all_columns_sorted].to_csv(sys.stdout, index=False, sep='\t', header=True) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("md5_summary", help="Table (csv) with md5sum values for all files") + parser.add_argument("insert_size", help="Table with insert size per sample") + parser.add_argument("sample_dirs", nargs='*', help="Directories where read files are located in subdirs") + args = parser.parse_args() + + main(args) diff --git a/scripts/convenience/upload_to_ena_ftp.sh b/scripts/convenience/upload_to_ena_ftp.sh new file mode 100755 index 0000000..e635e48 --- /dev/null +++ b/scripts/convenience/upload_to_ena_ftp.sh @@ -0,0 +1,14 @@ +# -n option disables auto-logon +FILE_SUFFIX="fastq.gz" +for upload_dir in "$@" +do +ftp -n webin.ebi.ac.uk <