-
Notifications
You must be signed in to change notification settings - Fork 2
/
update_sv_hp_ps_group_1.py
188 lines (163 loc) · 9 KB
/
update_sv_hp_ps_group_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/usr/bin/env python3
"""
This script update vcf file to add both HP haplotag and PS phasing block info fields, It takes as input vcf file, hp, ps.
"""
import argparse
import sys, os
from operator import itemgetter
from collections import Counter
# Python program to print
# green text with red background
#
# from colorama import init
# from termcolor import colored
#
# init()
# Part that processing input arguments
def get_args():
parser = argparse.ArgumentParser(epilog="%(prog)s version 0.01. use command -h for info.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='Phase SVs Using haplotyped reads in tab format',
add_help=True, )
parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.01')
# parser.add_argument('input', help='Input file', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
# parser.add_argument('output', help='Output file', nargs="?", type=argparse.FileType('w'), default=sys.stdout)
parser.add_argument('input', nargs='?', help="Structural variant vcf file",
type=argparse.FileType('r'),
default=sys.stdin)
parser.add_argument('hp', nargs='?', help="tab delimeted read\thp\tps file",
type=argparse.FileType('r'))
parser.add_argument('output', nargs='?', help="Output file, PS and HP will be added.",
type=argparse.FileType('w+'),
default=sys.stdout)
parser.set_defaults(func=update_vcf)
# if not argument print help.
if len(sys.argv) == 1 and sys.stdin.isatty(): # sys.stdin.isatty() returns false if there's something in stdin
parser.print_help(sys.stderr)
sys.exit(1)
args = parser.parse_args()
if 'func' in args:
args.func(args)
else:
parser.print_help()
def update_vcf(args):
# check if the input from stdin
if not sys.stdin.isatty(): # there is nothing in the stdin
if args.input.name.endswith("gz"):
import gzip
myfile = gzip.open(args.input.name, 'rt') # t is not a must normally it is default.
else:
myfile = args.input
else:
myfile = args.input
# read the Haplotyped reads file as dictionary
hp_dic = {}
with args.hp as hp_in:
for line in hp_in:
id, hp, ps = line.split()
hp_dic[id] = [hp.rsplit(":", 1)[-1], ps.rsplit(":", 1)[-1]] # read -> [hp, ps]
with myfile as data_in, args.output as data_out:
for line in data_in:
reads = []
if line.startswith('##'):
data_out.write(line)
elif line.startswith("#"):
# data_out.write("##INFO=<ID=HP,Number=1,Type=Integer,Description=\"Haplotype identifier\">\n")
data_out.write("##INFO=<ID=CONFLICT,Number=.,Type=Integer,Description=\"The Phase is conflict or not\">\n")
data_out.write("##INFO=<ID=HP_RATIO,Number=.,Type=String,Description=\"Phase Ratio of 1\">\n")
data_out.write("##FORMAT=<ID=PS,Number=.,Type=Integer,Description=\"Phase set identifier\">\n")
data_out.write(line)
else:
line_split = line.split()
#1)Adding PS field to FORMAT column for this cases 1|0,0|1,1|1,0|0,.|. for all -1.YES
#2)Lets try to analyze homozygous variant to cover uniparental disomy cases
#3)REF_NO_CONFLICT,HMZ_NO_CONFLICT,HET_NO_CONFLICT,HET_SNP_ALLELE_CONFLICT,HET_SNP_MISSING????
#Binary number,00,01,02
#4)Calculate ratio for HET.Count of 1's/Total=HP_SV_READ_RATIO_1.Count of 2's/Total=HP_SV_READ_RATIO_2.YES
if line_split[-1].split(":", 1)[0] == "1/1" or line_split[-1].split(":", 1)[0] == "0/0" or line_split[-1].split(":", 1)[0] == "./.": # no gt to phase
reads = [i for i in line_split[7].split(";") if i.startswith("RNAMES")][0].split("=",1)[-1].split(",")
myvalues = list(map(hp_dic.get, reads))
if any(myvalues): # any value is not none
ps_dict = categorize_ps(myvalues)
if 0 in list(ps_dict.values()): # means that the hp is conflicting do not update anything and add flag that is is conflicting
ratios = collect_ratios(myvalues)
line_split[7] = "{info};CONFLICT={conflict};HP_RATIO={hp_ratio}".format(info=line_split[7], conflict=0, hp_ratio=ratios)
line_split[-2] = "{}:{}".format(line_split[-2], "PS")
line_split[-1] = "{}:{}".format(line_split[-1], "-1")
data_out.write("{}\n".format("\t".join(line_split)))
elif line_split[-1].split(":", 1)[0] in ["0/1", "1/0"]:
reads = [i for i in line_split[7].split(";") if i.startswith("RNAMES")][0].split("=",1)[-1].split(",")
myvalues = list(map(hp_dic.get, reads)) # list of lists first element id hp second is ps or None in case there are no reads with hp and ps to support this sv
# If any value not None
if any(myvalues): # any value is not none
ps_dict = categorize_ps(myvalues)
if 0 in list(ps_dict.values()): # means that the hp is conflicting do not update anything and add flag that is is conflicting
ratios = collect_ratios(myvalues)
line_split[7] = "{info};CONFLICT={conflict};HP_RATIO={hp_ratio}".format(info=line_split[7], conflict=1, hp_ratio=ratios)
line_split[-2] = "{}:{}".format(line_split[-2], "PS")
line_split[-1] = "{}:{}".format(line_split[-1], ",".join(ps_dict.keys()))
data_out.write("{}\n".format("\t".join(line_split)))
else: # update the gt field and ps to sv
ratios = collect_ratios(myvalues)
line_split[7] = "{info};CONFLICT={conflict};HP_RATIO={hp_ratio}".format(info=line_split[7], conflict=0, hp_ratio=ratios)
line_split[-2] = "{}:{}".format(line_split[-2], "PS")
# if values are negative then it is hp=1 1|0 else it is hp2 0|1
# line_split[-1] = line_split[-1].replace("/", "|")
hp_new_value = line_split[-1].split(':')
if list(ps_dict.values())[0] < 1: # haplotype 1
hp_new_value[0] = "1|0"
else:
hp_new_value[0] = "0|1"
hp_new_value = ":".join(hp_new_value)
line_split[-1] = "{}:{}".format(hp_new_value, ",".join(ps_dict.keys()))
data_out.write("{}\n".format("\t".join(line_split)))
else: # all are none
ratios="0,0,0"
line_split[7] = "{info};CONFLICT=2;HP_RATIO={hp_ratio}".format(info=line_split[7], conflict=1, hp_ratio=ratios)
line_split[-2] = "{}:{}".format(line_split[-2], "PS")
line_split[-1] = "{}:{}".format(line_split[-1], "-1")
data_out.write("{}\n".format("\t".join(line_split)))
# Test case [['1', '23200'], ['2', '23200'], ['2', '23200'], ['1', '23200'], ['2', '23200'], ['2', '23200'], ['1', '23200'], ['2', '23200'], ['1', '23200'], ['2', '23200'], ['1', '23200'], ['1', '23200'], ['2', '23200'], ['2', '23200'], ['2', '23200']]
def categorize_ps(myvalues):
ps_dict = {}
for i in myvalues:
if not i:
continue
ps = i[1]
hp = int(i[0])
if ps in ps_dict:
if hp == 1:
if ps_dict[ps] < 0 : # if we put = it will use the paralment decision
ps_dict[ps] = ps_dict[ps] - 1
else: #conflict
ps_dict[ps] = 0
else: # means that it is haplotype 2 hp=2
if ps_dict[ps] > 0: # if we put = it will use the paralment decision
ps_dict[ps] = ps_dict[ps] + 1
else: #conflict
ps_dict[ps] = 0
else:
if hp == 1:
ps_dict[ps] = -1
else:
ps_dict[ps] = 1
return ps_dict
def collect_ratios(myvalues):
count_of_1s = 0
count_of_2s = 0
count_of_None = 0
collected_ratios = 0,0,0
for i in myvalues:
if i is not None:
if i[0] == '1':
count_of_1s+=1
elif i[0] == '2':
count_of_2s+=1
else:
count_of_None=+1
collected_ratios = str(count_of_1s)+","+str(count_of_2s)+","+str(count_of_None)
return collected_ratios
def main():
args = get_args()
if __name__ == "__main__":
main()