-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathAssemblytics_within_alignment.py
executable file
·105 lines (88 loc) · 4.97 KB
/
Assemblytics_within_alignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#! /usr/bin/env python
import argparse
import gzip
# Author: Maria Nattestad
# Email: [email protected]
# This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer.
def run(args):
filename = args.delta
minimum_variant_size = args.minimum_variant_size
f = open(filename)
header1 = f.readline()
if header1[0:2]=="\x1f\x8b":
f.close()
f = gzip.open(filename)
header1 = f.readline()
# Ignore the first two lines for now
f.readline()
linecounter = 0
current_reference_name = ""
current_reference_position = 0
current_query_name = ""
current_query_position = 0
variants = []
for line in f:
if line[0]==">":
# linecounter += 1
# if linecounter > 1:
# break
fields = line.strip().split()
current_reference_name = fields[0][1:]
current_query_name = fields[1]
else:
fields = line.strip().split()
if len(fields) > 4:
# current_reference_position = int(fields[0])
current_reference_position = min(int(fields[0]),int(fields[1]))
# fields[1] is the reference position at the end of the alignment
# current_query_position = int(fields[2])
current_query_position = min(int(fields[2]),int(fields[3]))
# fields[3] is the query position at the end of the alignment
else:
tick = int(fields[0])
if abs(tick) == 1: # then go back and edit the last entry to add 1 more to its size
report = variants[-1]
report[4] = report[4] + 1 # size
if tick > 0: # deletion, moves in reference
report[2] = report[2] + 1 # reference end position
report[7] = report[7] + 1 # reference gap size
current_reference_position += 1 # update reference position after deletion
elif tick < 0: # insertion, moves in query
report[8] = report[8] + 1 # query gap size
report[12] = report[12] + 1 # query end position
current_query_position += 1 # update query position after insertion
else: # report the last one and continue
current_reference_position += abs(tick) - 1
current_query_position += abs(tick) - 1
if tick > 0:
size = 1
# report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % (current_reference_name,current_reference_position,current_reference_position+size,len(variants)+1,size,"+","Deletion",size,0,current_query_name,"within_alignment")
report = [current_reference_name,current_reference_position,current_reference_position+size,"Assemblytics_w_"+str(len(variants)+1),size,"+","Deletion",size,0,current_query_name,"within_alignment",current_query_position,current_query_position]
current_reference_position += size # update reference position after deletion
variants.append(report)
elif tick < 0:
size = 1
# report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % (current_reference_name,current_reference_position,current_reference_position,len(variants)+1,size,"+","Insertion",0,size,current_query_name,"within_alignment")
report = [current_reference_name,current_reference_position,current_reference_position,"Assemblytics_w_"+str(len(variants)+1),size,"+","Insertion",0,size,current_query_name,"within_alignment",current_query_position,current_query_position+size]
current_query_position += size # update query position after insertion
variants.append(report)
# TESTING
# print line, report
f.close()
newcounter = 1
for line in variants:
# report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % line
if line[4] >= minimum_variant_size:
line[3] = "Assemblytics_w_%d" % (newcounter)
print ("\t".join(map(str,line[0:10])) + ":" + str(line[11]) + "-" + str(line[12]) + ":+\t" + line[10])
# print "\t".join(map(str,line))
newcounter += 1
def main():
parser=argparse.ArgumentParser(description="Outputs MUMmer coordinates annotated with length of unique sequence for each alignment")
parser.add_argument("--delta",help="delta file" ,dest="delta", type=str, required=True)
parser.add_argument("--min",help="Minimum size (bp) of variant to include, default = 50" ,dest="minimum_variant_size",type=int, default=50)
parser.set_defaults(func=run)
args=parser.parse_args()
args.func(args)
if __name__=="__main__":
main()