From 41a1d4dc60c001969541eded62edb8449abffb03 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg <alneberg@rackham3.uppmax.uu.se>
Date: Fri, 15 Jun 2018 13:53:51 +0200
Subject: [PATCH 1/3] Added scripts for contamination and chimeric reads
 finding

---
 scripts/sag_vs_mag/check_status_in_mapping.py | 81 ++++++++++++++++
 scripts/sag_vs_mag/filter_out_none_clipped.py | 62 +++++++++++++
 .../find_candidate_chimeric_reads.py          | 70 ++++++++++++++
 scripts/sag_vs_mag/mag_no_mag_mapping.py      | 92 +++++++++++++++++++
 4 files changed, 305 insertions(+)
 create mode 100755 scripts/sag_vs_mag/check_status_in_mapping.py
 create mode 100755 scripts/sag_vs_mag/filter_out_none_clipped.py
 create mode 100755 scripts/sag_vs_mag/find_candidate_chimeric_reads.py
 create mode 100755 scripts/sag_vs_mag/mag_no_mag_mapping.py

diff --git a/scripts/sag_vs_mag/check_status_in_mapping.py b/scripts/sag_vs_mag/check_status_in_mapping.py
new file mode 100755
index 0000000..a648db5
--- /dev/null
+++ b/scripts/sag_vs_mag/check_status_in_mapping.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+usage="""Checks the status for a subset of reads within a bam file.
+
+Three categories:
+    Matching
+    Soft clipped
+    Not mapping
+"""
+
+import argparse
+import sys
+import re
+
+soft_clip_re = re.compile('([0-9]+)S')
+matched_re = re.compile('([0-9]+)M')
+
+nr_mismatched_re = re.compile('XM:i:([0-9]+)')
+
+def parse_cigar(cigar):
+    soft_clipped_matches = soft_clip_re.findall(cigar)
+    matched_matches = matched_re.findall(cigar)    
+    soft_clipped_bases = sum([int(x) for x in soft_clipped_matches])
+    matched_bases = sum([int(x) for x in matched_matches])
+
+    return soft_clipped_bases, matched_bases
+
+def main(args):
+    all_reads = set()
+    for line in open(args.subset_reads, 'r'):
+        read = line.strip()
+        all_reads.add(read)
+
+    not_mapped = set()
+    soft_clipped_mapped = set()
+    other = set()
+    good_mapped = set()
+
+    # Read the input sam files
+    for line in sys.stdin:
+        line = line.strip()
+
+        # Check if in header
+        if line[0] != '@':
+            # Not in header
+
+            line_split = line.split('\t')
+            if (int(line_split[1]) / 128) == 1:
+                read_id = line_split[0] + "_2"
+            else:
+                read_id = line_split[0] + "_1"
+
+            # Ignore reads which are not in the subset
+            if read_id in all_reads:
+
+                no_mapping_bit = (((((int(line_split[1]) % 128) % 64) % 32) % 16) % 8) / 4
+                if no_mapping_bit == 1:
+                    not_mapped.add(read_id)
+                else:
+                    cigar = line_split[5]
+
+                    soft_clipped, matching = parse_cigar(cigar)
+                    read_length = len(line_split[9])
+                    mismatches = int(nr_mismatched_re.findall(line)[0])
+                    
+                    if soft_clipped > 20:
+                        soft_clipped_mapped.add(read_id)
+                    elif (matching > read_length*0.90):
+                        good_mapped.add(read_id)
+                    else:
+                        other.add(read_id)
+
+    print("Not Mapped\tSoft Clipped\tOther\tGood Mapped")
+    print("{}\t{}\t{}\t{}".format(len(not_mapped), len(soft_clipped_mapped), len(other), len(good_mapped)))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=usage)
+    parser.add_argument('subset_reads')
+    args = parser.parse_args()
+
+    main(args)
diff --git a/scripts/sag_vs_mag/filter_out_none_clipped.py b/scripts/sag_vs_mag/filter_out_none_clipped.py
new file mode 100755
index 0000000..55b62ac
--- /dev/null
+++ b/scripts/sag_vs_mag/filter_out_none_clipped.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+usage="""Filter potential chimeric reads to remove correctly none-clipped. 
+"""
+
+import argparse
+import sys
+import re
+
+soft_clip_re = re.compile('([0-9]+)S')
+matched_re = re.compile('([0-9]+)M')
+
+nr_mismatched_re = re.compile('XM:i:([0-9]+)')
+
+def parse_cigar(cigar):
+    soft_clipped_matches = soft_clip_re.findall(cigar)
+    matched_matches = matched_re.findall(cigar)    
+    soft_clipped_bases = sum([int(x) for x in soft_clipped_matches])
+    matched_bases = sum([int(x) for x in matched_matches])
+
+    return soft_clipped_bases, matched_bases
+
+def main(args):
+    all_reads = set()
+    for line in open(args.potential_chimera, 'r'):
+        read = line.strip()
+        all_reads.add(read)
+
+    # Read the input sam files
+    for line in sys.stdin:
+        line = line.strip()
+
+        # Check if in header
+        if line[0] != '@':
+            # Not in header
+            
+            # Ignore reads which are note potential chimeras
+            # Read is most likely not chimeric
+            line_split = line.split('\t')
+            if (int(line_split[1]) / 128) == 1:
+                read_id = line_split[0] + "_2"
+            else:
+                read_id = line_split[0] + "_1"
+
+            if read_id in all_reads:
+                cigar = line_split[5]
+                # Matching min 95% of bases
+                soft_clipped, matching = parse_cigar(cigar)
+                    
+                read_length = len(line_split[9])
+                mismatches = int(nr_mismatched_re.findall(line)[0])
+                
+                if (matching > read_length*0.95) and (int(mismatches) < 5):
+                    all_reads.remove(read_id)
+
+    print("\n".join(list(all_reads)))
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=usage)
+    parser.add_argument('potential_chimera')
+    args = parser.parse_args()
+
+    main(args)
diff --git a/scripts/sag_vs_mag/find_candidate_chimeric_reads.py b/scripts/sag_vs_mag/find_candidate_chimeric_reads.py
new file mode 100755
index 0000000..32da16c
--- /dev/null
+++ b/scripts/sag_vs_mag/find_candidate_chimeric_reads.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+usage="""Prints reads which are potential chimeras.
+
+These are determined as being soft clipped, not on the edge 
+of contigs and at leat 50% mapped with max 2 mismatches.
+
+Input SAM with header.
+"""
+
+import argparse
+import sys
+import re
+
+soft_clip_re = re.compile('([0-9]+)S')
+matched_re = re.compile('([0-9]+)M')
+
+nr_mismatched_re = re.compile('XM:i:([0-9]+)')
+
+def parse_cigar(cigar):
+    soft_clipped_matches = soft_clip_re.findall(cigar)
+    matched_matches = matched_re.findall(cigar)    
+    soft_clipped_bases = sum([int(x) for x in soft_clipped_matches])
+    matched_bases = sum([int(x) for x in matched_matches])
+
+    return soft_clipped_bases, matched_bases
+
+def main():
+    contig_len_re = re.compile('SN:(.*)		*LN:([0-9]*)$')
+    contig_lens = {}
+
+    # Read the input sam file
+    for line in sys.stdin:
+        line = line.strip()
+
+        # Check if in header
+        if line[0] == '@':
+            if line[0:3] == '@SQ':
+                contig, contig_len = contig_len_re.findall(line)[0]
+                contig_lens[contig] = contig_len
+        else:
+            # Not in header
+            line_split = line.split('\t')
+            cigar = line_split[5]
+            # Soft clipped at least 20 bases
+            if 'S' in cigar:
+                soft_clipped, matching = parse_cigar(cigar)
+                if soft_clipped > 20:
+                    # Matching region should be > 50% of read
+                    read_length = len(line_split[9])
+                    if matching > read_length/2.0:
+                        contig, mapping_start_pos = line_split[2], line_split[7]
+
+                        # mapping should not be on the edges
+                        if mapping_start_pos > 300 and int(contig_lens[contig]) - int(mapping_start_pos) > 300:
+                            # check nr_mismatches
+                            mismatches = int(nr_mismatched_re.findall(line)[0])
+                            if int(mismatches) < 2:
+                                # check which read 
+                                if (int(line_split[1]) / 128) == 1:
+                                    read_nr = "2"
+                                else:
+                                    read_nr = "1"
+
+                                print(line_split[0]+"_" + read_nr)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=usage)
+    parser.parse_args()
+    main()
diff --git a/scripts/sag_vs_mag/mag_no_mag_mapping.py b/scripts/sag_vs_mag/mag_no_mag_mapping.py
new file mode 100755
index 0000000..6c531f0
--- /dev/null
+++ b/scripts/sag_vs_mag/mag_no_mag_mapping.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+usage="""Checks the status for a subset of reads within a bam file.
+
+Three categories:
+    Not Mapped
+    Mapping to < 1000 bp contig
+    Mapping to > 1000 bp contig (not MAG)
+    Mapping to MAG contig
+"""
+
+import argparse
+import sys
+import re
+
+soft_clip_re = re.compile('([0-9]+)S')
+matched_re = re.compile('([0-9]+)M')
+
+nr_mismatched_re = re.compile('XM:i:([0-9]+)')
+
+def parse_cigar(cigar):
+    soft_clipped_matches = soft_clip_re.findall(cigar)
+    matched_matches = matched_re.findall(cigar)    
+    soft_clipped_bases = sum([int(x) for x in soft_clipped_matches])
+    matched_bases = sum([int(x) for x in matched_matches])
+
+    return soft_clipped_bases, matched_bases
+
+def main(args):
+    all_reads = set()
+    for line in open(args.subset_reads, 'r'):
+        read = line.strip()
+        all_reads.add(read)
+
+    MAG_contigs = set()
+    for line in open(args.mag_contigs, 'r'):
+        contig = line.strip()
+        MAG_contigs.add(contig)
+
+    not_mapped = set()
+    short_mapped = set()
+    long_mapped = set()
+    MAG_mapped = set()
+
+    contig_len_re = re.compile('SN:(.*)		*LN:([0-9]*)$')
+    contig_lens = {}
+
+    # Read the input sam file
+    for line in sys.stdin:
+        line = line.strip()
+
+        # Check if in header
+        if line[0] == '@':
+            if line[0:3] == '@SQ':
+                contig, contig_len = contig_len_re.findall(line)[0]
+                contig = contig.split('_')[0]
+                contig_lens[contig] = int(contig_len)
+        else:
+            # Not in header
+            line_split = line.split('\t')
+            if (int(line_split[1]) / 128) == 1:
+                read_id = line_split[0] + "_2"
+            else:
+                read_id = line_split[0] + "_1"
+
+            # Ignore reads which are not in the subset
+            if read_id in all_reads:
+
+                no_mapping_bit = (((((int(line_split[1]) % 128) % 64) % 32) % 16) % 8) / 4
+                if no_mapping_bit == 1:
+                    not_mapped.add(read_id)
+                else:
+                    contig = line_split[2]
+                    contig = contig.split('_')[0]
+                    if contig in MAG_contigs:
+                        MAG_mapped.add(read_id)
+                    elif contig_lens[contig] >= 1000:
+                        long_mapped.add(read_id)
+                    else:
+                        short_mapped.add(read_id)
+
+    print("Not Mapped\tShort Mapped\tLong Mapped\tMAG mapped")
+    print("{}\t{}\t{}\t{}".format(len(not_mapped), len(short_mapped),
+        len(long_mapped), len(MAG_mapped)))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=usage)
+    parser.add_argument('subset_reads')
+    parser.add_argument('mag_contigs')
+    args = parser.parse_args()
+
+    main(args)

From b1935352ca087dffe2d595da5a1f9dc247d3ba79 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg <alneberg@rackham3.uppmax.uu.se>
Date: Tue, 19 Jun 2018 14:52:23 +0200
Subject: [PATCH 2/3] Changed what to count in the sag mag chimera mapping

---
 scripts/sag_vs_mag/check_status_in_mapping.py | 95 +++++++++++++------
 1 file changed, 65 insertions(+), 30 deletions(-)

diff --git a/scripts/sag_vs_mag/check_status_in_mapping.py b/scripts/sag_vs_mag/check_status_in_mapping.py
index a648db5..2e3cf45 100755
--- a/scripts/sag_vs_mag/check_status_in_mapping.py
+++ b/scripts/sag_vs_mag/check_status_in_mapping.py
@@ -14,6 +14,8 @@
 soft_clip_re = re.compile('([0-9]+)S')
 matched_re = re.compile('([0-9]+)M')
 
+left_clipped_re = re.compile('^([0-9]+)S')
+
 nr_mismatched_re = re.compile('XM:i:([0-9]+)')
 
 def parse_cigar(cigar):
@@ -22,25 +24,54 @@ def parse_cigar(cigar):
     soft_clipped_bases = sum([int(x) for x in soft_clipped_matches])
     matched_bases = sum([int(x) for x in matched_matches])
 
-    return soft_clipped_bases, matched_bases
+    left_clipped_matches = left_clipped_re.findall(cigar)
+    left_clipped_bases = sum([int(x) for x in left_clipped_matches])
+
+    return soft_clipped_bases, matched_bases, left_clipped_bases
+
+
+"""
+chimeric or not
+soft clipped (> 20S) or not 
+overlapping edge or not
+mapped or not
+
+              soft clipped   not soft clipped    overlapping edge    not overlapping edge   overlapping edge and soft clipped    mapped    not mapped
+    chimeric: 
+Non-chimeric:  
+
+"""
+
+def print_stats(name, soft_clipped_set, overlapping_set, mapped_set, complete_set):
+    "soft clipped   not soft clipped    overlapping edge    not overlapping edge   overlapping edge and soft clipped    mapped    not mapped"
+
+    print("{},{},{},{},{},{},{},{}".format(name, len(soft_clipped_set), len(complete_set - soft_clipped_set), len(overlapping_set), len(complete_set - overlapping_set), len(overlapping_set & soft_clipped_set), len(mapped_set), len(complete_set - mapped_set)))
+
 
 def main(args):
-    all_reads = set()
+    chim_reads = set()
     for line in open(args.subset_reads, 'r'):
         read = line.strip()
-        all_reads.add(read)
+        chim_reads.add(read)
+
+    mapped_reads = set()
+    soft_clipped_reads = set()
+    overlapping_edge_reads = set()
+    all_reads = set()
+    contig_lens = {}
 
-    not_mapped = set()
-    soft_clipped_mapped = set()
-    other = set()
-    good_mapped = set()
+    contig_len_re = re.compile('SN:(.*)		*LN:([0-9]*)$')
 
     # Read the input sam files
     for line in sys.stdin:
         line = line.strip()
 
         # Check if in header
-        if line[0] != '@':
+        if line[0] == '@':
+            if line[0:3] == '@SQ':
+                contig, contig_len = contig_len_re.findall(line)[0]
+                contig_lens[contig] = contig_len
+	else:
             # Not in header
 
             line_split = line.split('\t')
@@ -49,28 +80,32 @@ def main(args):
             else:
                 read_id = line_split[0] + "_1"
 
-            # Ignore reads which are not in the subset
-            if read_id in all_reads:
-
-                no_mapping_bit = (((((int(line_split[1]) % 128) % 64) % 32) % 16) % 8) / 4
-                if no_mapping_bit == 1:
-                    not_mapped.add(read_id)
-                else:
-                    cigar = line_split[5]
-
-                    soft_clipped, matching = parse_cigar(cigar)
-                    read_length = len(line_split[9])
-                    mismatches = int(nr_mismatched_re.findall(line)[0])
-                    
-                    if soft_clipped > 20:
-                        soft_clipped_mapped.add(read_id)
-                    elif (matching > read_length*0.90):
-                        good_mapped.add(read_id)
-                    else:
-                        other.add(read_id)
-
-    print("Not Mapped\tSoft Clipped\tOther\tGood Mapped")
-    print("{}\t{}\t{}\t{}".format(len(not_mapped), len(soft_clipped_mapped), len(other), len(good_mapped)))
+            all_reads.add(read_id)
+            
+            mapping_bit = (((((int(line_split[1]) % 128) % 64) % 32) % 16) % 8) / 4
+            if mapping_bit != 1:
+                mapped_reads.add(read_id)
+                cigar = line_split[5]
+
+                soft_clipped, matching, left_clipped_bases = parse_cigar(cigar)
+                if soft_clipped > 20:
+                    soft_clipped_reads.add(read_id)
+
+                read_length = len(line_split[9])
+                 
+                contig, mapping_start_pos = line_split[2], line_split[7]
+                
+                if int(mapping_start_pos) - left_clipped_bases < 0:
+                    overlapping_edge_reads.add(read_id)
+		   
+                if int(mapping_start_pos) + read_length > contig_lens[contig]:
+                    overlapping_edge_reads.add(read_id)
+
+    # Aim to print> 
+    # "soft clipped   not soft clipped    overlapping edge    not overlapping edge   overlapping edge and soft clipped    mapped    not mapped"
+    print("name,soft clipped, not soft clipped,overlapping edge,not overlapping edge,overlapping edge and soft clipped,mapped,not mapped")
+    print_stats("chimeric_reads", soft_clipped_reads & chim_reads, overlapping_edge_reads & chim_reads, mapped_reads & chim_reads, chim_reads)
+    print_stats("other_reads", soft_clipped_reads - chim_reads, overlapping_edge_reads - chim_reads, mapped_reads - chim_reads, all_reads - chim_reads) 
 
 
 if __name__ == "__main__":

From 4ebf7f15b2906e8d9cbe65e992503d951ba8d226 Mon Sep 17 00:00:00 2001
From: alneberg <alneberg@kth.se>
Date: Fri, 29 Jun 2018 18:07:13 +0200
Subject: [PATCH 3/3] Some conveniece scripts for ena uploading

---
 .../construct_ena_sequencing_runs_table.py    | 71 +++++++++++++++++++
 scripts/convenience/upload_to_ena_ftp.sh      | 14 ++++
 2 files changed, 85 insertions(+)
 create mode 100644 scripts/convenience/construct_ena_sequencing_runs_table.py
 create mode 100755 scripts/convenience/upload_to_ena_ftp.sh

diff --git a/scripts/convenience/construct_ena_sequencing_runs_table.py b/scripts/convenience/construct_ena_sequencing_runs_table.py
new file mode 100644
index 0000000..f676fb2
--- /dev/null
+++ b/scripts/convenience/construct_ena_sequencing_runs_table.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+"""construct_ena_sequencing_runs_table.py 
+
+Based on a folder with uploaded files and a template, construct table ready to be submitted.
+This script is not general but is very niched to the NGI/Uppmax scenario.
+Ideally the user is expected to copy this script and edit it to suit the users needs.
+"""
+
+import argparse
+import sys
+import os
+import glob
+from os.path import join as opj
+import pandas as pd
+from collections import defaultdict
+import gzip
+
+# Need to fetch file name and link it to sample id
+# Fetch the md5sum already calculated
+
+def main(args):
+    md5sum_df = pd.read_table(args.md5_summary, sep=',', header=None, names=['file_name', 'md5sum'], index_col=0)
+
+    insert_sizes = pd.read_table(args.insert_size, index_col=0)
+    info_d = {}
+
+    for sample_dir in args.sample_dirs:
+        for R1_run_file in glob.glob(opj(sample_dir, "*", "*_R1*.fastq.gz")):
+            R1_file_name=os.path.basename(R1_run_file)
+            sample_name="_".join(R1_file_name.split('_')[0:2])
+            run_id="_".join(R1_file_name.split('_')[0:4])
+            run_info = {}
+            run_info['sample_accession'] = sample_name
+            run_info['library_name'] = run_id
+            is_series = insert_sizes.ix[sample_name]['Avg. FS']
+            try:
+                run_info['insert_size'] = int(round(is_series))
+            except TypeError:
+                run_info['insert_size'] = int(round(insert_sizes[insert_sizes['Lib QC'] == 'PASSED'].ix[sample_name]['Avg. FS']))
+
+            run_info['forward_file_name'] = R1_file_name
+            run_info['forward_file_md5'] = md5sum_df.loc[R1_file_name]['md5sum']
+            R2_file_name = R1_file_name.replace("R1", "R2")
+            run_info['reverse_file_name'] = R2_file_name
+            run_info['reverse_file_md5'] = md5sum_df.loc[R2_file_name]['md5sum']
+            run_info['library_source'] = 'METAGENOMIC'
+            run_info['library_selection'] = 'RANDOM'
+            run_info['library_strategy'] = 'WGS'
+            run_info['library_construction_protocol'] = 'Rubicon Thruplex'
+            run_info['instrument_model'] = 'Illumina HiSeq 2500'
+            run_info['file_type'] = 'fastq'
+            run_info['library_layout'] = 'PAIRED'
+
+            
+            info_d[run_id] = run_info 
+    all_columns_sorted = ['sample_accession', 'library_name', 'library_source', 'insert_size', \
+            'library_selection', 'library_strategy', 'library_construction_protocol', 'instrument_model', \
+            'file_type', 'library_layout', 'insert_size', \
+            'forward_file_name', 'forward_file_md5', 'reverse_file_name', 'reverse_file_md5']
+
+    df = pd.DataFrame.from_dict(info_d, orient='index')
+    df[all_columns_sorted].to_csv(sys.stdout, index=False, sep='\t', header=True)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("md5_summary", help="Table (csv) with md5sum values for all files")
+    parser.add_argument("insert_size", help="Table with insert size per sample")
+    parser.add_argument("sample_dirs", nargs='*', help="Directories where read files are located in subdirs")
+    args = parser.parse_args()
+
+    main(args)
diff --git a/scripts/convenience/upload_to_ena_ftp.sh b/scripts/convenience/upload_to_ena_ftp.sh
new file mode 100755
index 0000000..e635e48
--- /dev/null
+++ b/scripts/convenience/upload_to_ena_ftp.sh
@@ -0,0 +1,14 @@
+# -n option disables auto-logon
+FILE_SUFFIX="fastq.gz"
+for upload_dir in "$@"
+do
+ftp -n webin.ebi.ac.uk <<End-Of-Session
+user $ENA_USER $ENA_PASS
+binary
+prompt
+lcd "$upload_dir"
+!ls *"$FILE_SUFFIX"
+mput *"$FILE_SUFFIX"
+bye
+End-Of-Session
+done