forked from precimed/python_convert
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsumstats_utils.py
301 lines (282 loc) · 10.4 KB
/
sumstats_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
# Misc utils to deal with summary stat files.
# Some parts of the code in this file originates from https://github.com/bulik/ldsc/,
# which is licensed under GNU General Public License v3.0
# See https://github.com/bulik/ldsc/blob/master/LICENSE for complete license.
import sys, os, re, logging, datetime
import numpy as np
import pandas as pd
import gzip
import six
import itertools as it
from collections import namedtuple
COMPLEMENT_ALLELE = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
# bases
BASES = COMPLEMENT_ALLELE.keys()
# true iff strand ambiguous
STRAND_AMBIGUOUS = {''.join(x): x[0] == COMPLEMENT_ALLELE[x[1]]
for x in it.product(BASES, BASES)
if x[0] != x[1]}
# SNPS we want to keep (pairs of alleles)
VALID_SNPS = {x for x in map(lambda y: ''.join(y), it.product(BASES, BASES))
if x[0] != x[1] and not STRAND_AMBIGUOUS[x]}
# T iff SNP 1 has the same alleles as SNP 2 (allowing for strand or ref allele flip).
MATCH_ALLELES = {x for x in map(lambda y: ''.join(y), it.product(VALID_SNPS, VALID_SNPS))
# strand and ref match
if ((x[0] == x[2]) and (x[1] == x[3])) or
# ref match, strand flip
((x[0] == COMPLEMENT_ALLELE[x[2]]) and (x[1] == COMPLEMENT_ALLELE[x[3]])) or
# ref flip, strand match
((x[0] == x[3]) and (x[1] == x[2])) or
((x[0] == COMPLEMENT_ALLELE[x[3]]) and (x[1] == COMPLEMENT_ALLELE[x[2]]))} # strand and ref flip
# T iff SNP 1 has the same alleles as SNP 2 w/ ref allele flip.
FLIP_ALLELES = {''.join(x):
((x[0] == x[3]) and (x[1] == x[2])) or # strand match
# strand flip
((x[0] == COMPLEMENT_ALLELE[x[3]]) and (x[1] == COMPLEMENT_ALLELE[x[2]]))
for x in MATCH_ALLELES}
def filter_alleles(alleles):
'''Remove bad variants (mismatched alleles, non-SNPs, strand ambiguous).'''
ii = alleles.apply(lambda y: y in MATCH_ALLELES)
return ii
def align_alleles(z, alleles):
'''Align Z1 and Z2 to same choice of ref allele (allowing for strand flip).'''
try:
z *= (-1) ** alleles.apply(lambda y: FLIP_ALLELES[y])
except KeyError as e:
raise KeyError('Incompatible alleles. ')
return z
Cols = namedtuple('Cols', ['SNP', 'CHR', 'BP', 'PVAL', 'A1', 'A2', 'EA', 'N', 'NCASE', 'NCONTROL', 'Z', 'OR', 'BETA', 'LOGODDS', 'SE', 'INFO', 'FRQ', 'NSTUDY', 'CHRPOS', 'A1A2', 'CHRPOSA1A2', 'DIRECTION', 'ORL95', 'ORU95'])
cols = Cols._make( ['SNP', 'CHR', 'BP', 'PVAL', 'A1', 'A2', 'EA', 'N', 'NCASE', 'NCONTROL', 'Z', 'OR', 'BETA', 'LOGODDS', 'SE', 'INFO', 'FRQ', 'NSTUDY', 'CHRPOS', 'A1A2', 'CHRPOSA1A2', 'DIRECTION', 'ORL95', 'ORU95'])
cols_type_map = {'SNP':str, 'CHR':int, 'BP':int, 'PVAL':np.float64, 'A1':str, 'A2':str, 'EA':str, 'N':float, 'NCASE':float, 'NCONTROL':float, 'Z':float, 'OR':float, 'BETA':float,
'LOGODDS':float, 'SE':float, 'INFO':float, 'FRQ':float, 'NSTUDY':int, 'CHRPOS':str, 'A1A2':str, 'CHRPOSA1A2':str, 'DIRECTION':str, 'ORL95':float, 'ORU95':float}
cname_to_cleansumstats_map = {
'SNP': 'col_SNP',
'CHR': 'col_CHR',
'BP': 'col_POS',
'PVAL': 'col_P',
'A1': 'col_EffectAllele',
'A2': 'col_OtherAllele',
'N': 'col_N',
'NCASE': 'col_CaseN',
'NCONTROL': 'col_ControlN',
'Z': 'col_Z',
'OR': 'col_OR',
'BETA': 'col_BETA',
'SE': 'col_SE',
'LOGODDS': 'col_BETA',
'INFO': 'col_INFO',
'FRQ': 'col_EAF',
'DIRECTION': 'col_Direction',
'ORL95': 'col_ORL95',
'ORU95': 'col_ORU95',
# 'NSTUDY': 'col_studyN' - not supported by cleansumstats
# col_OAF, col_Notes - not supported by python_convert
# CHRPOSA1A2, CHRPOS - require special handling (see update_cleansumstats_cols in sumstats.py)
# A1A2 - incompatible
}
null_values = {
cols.LOGODDS: 0,
cols.BETA: 0,
cols.OR: 1,
cols.Z: 0
}
default_cnames = {
# RS NUMBER
'SNP': cols.SNP,
'MARKERNAME': cols.SNP,
'SNPID': cols.SNP,
'SNP_ID': cols.SNP,
'RS': cols.SNP,
'RSID': cols.SNP,
'RS_NUMBER': cols.SNP,
'RS_NUMBERS': cols.SNP,
# CHROMOSOME
'CHR': cols.CHR,
'CHROM': cols.CHR,
'CHROMSOME': cols.CHR,
'CHROMOSOME' : cols.CHR,
# POSITION
'POS': cols.BP,
'BP': cols.BP,
'BPOS': cols.BP,
'POSITION' : cols.BP,
# NUMBER OF STUDIES
'NSTUDY': cols.NSTUDY,
'N_STUDY': cols.NSTUDY,
'NSTUDIES': cols.NSTUDY,
'N_STUDIES': cols.NSTUDY,
# P-VALUE
'P': cols.PVAL,
'PVALUE': cols.PVAL,
'P_VALUE': cols.PVAL,
'PVAL': cols.PVAL,
'P_VAL': cols.PVAL,
'GC_PVALUE': cols.PVAL,
'MTAG_PVAL': cols.PVAL,
# ALLELE 1
'A1': cols.A1,
'ALLELE1': cols.A1,
'ALLELE_1': cols.A1,
'EFFECT_ALLELE': cols.A1,
'EFFECTALLELE': cols.A1,
'REFERENCE_ALLELE': cols.A1,
'INC_ALLELE': cols.A1,
'EA': cols.A1,
# ALLELE 2
'A2': cols.A2,
'ALLELE2': cols.A2,
'ALLELE_2': cols.A2,
'OTHER_ALLELE': cols.A2,
'OTHERALLELE': cols.A2,
'NON_EFFECT_ALLELE': cols.A2,
'NON_EFF_ALLELE': cols.A2,
'DEC_ALLELE': cols.A2,
'NEA': cols.A2,
# N
'N': cols.N,
'SAMPLESIZE': cols.N,
'WEIGHT': cols.N, # metal does this. possibly risky.
# NCASE
'NCASE': cols.NCASE,
'CASES_N': cols.NCASE,
'N_CASE': cols.NCASE,
'N_CASES': cols.NCASE,
'N_CAS': cols.NCASE,
'N_CASE': cols.NCASE,
'CASEN': cols.NCASE,
# NCONTROL
'N_CONTROLS': cols.NCONTROL,
'N_CON': cols.NCONTROL,
'CONTROLN': cols.NCONTROL,
'NCONTROL': cols.NCONTROL,
'CONTROLS_N': cols.NCONTROL,
'N_CONTROL': cols.NCONTROL,
# SIGNED STATISTICS
'ZSCORE': cols.Z,
'Z-SCORE': cols.Z,
'GC_ZSCORE': cols.Z,
'Z': cols.Z,
'MTAG_Z': cols.Z,
'OR': cols.OR,
'ORL95': cols.ORL95,
'ORU95': cols.ORU95,
'B': cols.BETA,
'BETA': cols.BETA,
'MTAG_BETA': cols.BETA,
'LOGODDS': cols.LOGODDS,
'EFFECTS': cols.BETA,
'EFFECT': cols.BETA,
'SIGNED_SUMSTAT': 'SIGNED_SUMSTAT',
# STANDARD ERROR
'SE' : cols.SE,
'STDERR' : cols.SE,
'MTAG_SE' : cols.SE,
# INFO
'INFO': cols.INFO,
# MAF
'EAF': cols.FRQ,
'FRQ': cols.FRQ,
'MAF': cols.FRQ,
'FRQ_U': cols.FRQ,
'F_U': cols.FRQ,
'FREQ': cols.FRQ,
# DIRECTION
'DIRECTION': cols.DIRECTION,
}
describe_cname = {
cols.SNP: 'Variant ID (e.g., rs number)',
cols.CHR: 'Chromosome number',
cols.BP: 'Base-pair position',
cols.PVAL: 'p-Value',
cols.A1: 'Allele 1, interpreted as ref allele for signed sumstat.',
cols.A2: 'Allele 2, interpreted as non-ref allele for signed sumstat.',
cols.EA: 'Effect Allele, interpreted as ref allele for signed sumstat (specific to MVP data to validate that A1 is the same as EA).',
cols.N: 'Sample size',
cols.NCASE: 'Number of cases',
cols.NCONTROL: 'Number of controls',
cols.Z: 'Z-score (0 --> no effect; above 0 --> A1 is trait/risk increasing)',
cols.OR: 'Odds ratio (1 --> no effect; above 1 --> A1 is risk increasing)',
cols.ORL95: 'Lower 95%% confidence bound of OR',
cols.ORU95: 'Upper 95%% confidence bound of OR',
cols.BETA: '[linear/logistic] regression coefficient (0 --> no effect; above 0 --> A1 is trait/risk increasing)',
cols.LOGODDS: 'Log odds ratio (0 --> no effect; above 0 --> A1 is risk increasing)',
cols.SE: 'standard error of the effect size',
cols.INFO: 'INFO score (imputation quality; higher --> better imputation)',
cols.FRQ: 'Allele frequency',
'SIGNED_SUMSTAT': 'Directional summary statistic as specified by --signed-sumstats.',
cols.NSTUDY: 'Number of studies in which the SNP was genotyped.',
'UNKNOWN': 'Unknown column type (will be skipped).',
cols.CHRPOS: 'chr:pos column with colon-separated information about Chromosome and Base-pair position',
cols.A1A2: 'A1/A2 column with slash-separated information about marker allles',
cols.CHRPOSA1A2: 'chr:pos:ref:alt column with colon-separated information about Chromosome, Base-pair position, Reference allele, Alternative allele',
cols.DIRECTION: 'METAL "direction" column, one char per substudy (+ or - indicate effect direction; ? indicate failed imputation or QC)'
}
def clean_header(header):
'''
For cleaning file headers.
- convert to uppercase
- replace dashes '-' with underscores '_'
- replace dots '.' (as in R) with underscores '_'
- remove newlines ('\n')
'''
return header.upper().replace('-', '_').replace('.', '_').replace('\n', '')
def format_chr(chrvec):
'''
Reformat chromosome names.
Input:
------
Vector of chromosome IDs
Output:
-------
Vector of cleaned chromosome IDs
Note:
* Remove "chr/Chr/CHR/MT/mt" strings in the name
* Change chrX to 23, ChrY to 24, PAR to 25, MT to 26
* (as in plink, https://www.cog-genomics.org/plink/1.9/input#allow_extra_chr)
'''
try:
tmpchrvec = chrvec.astype('str')
tmpchrvec = tmpchrvec.str.lower()
tmpchrvec = tmpchrvec.str.replace('chr', '')
tmpchrvec[tmpchrvec=='x'] = '23'
tmpchrvec[tmpchrvec=='y'] = '24'
tmpchrvec[tmpchrvec=='par'] = '25'
tmpchrvec[tmpchrvec=='m'] = '26'
tmpchrvec[tmpchrvec=='mt'] = '26'
tmpchrvec[tmpchrvec=='x_par1'] = '25'
tmpchrvec[tmpchrvec=='x_par2'] = '25'
tmpchrvec[tmpchrvec=='x_nonpar'] = '23'
# TO-DO: Bellow is anoying
tmpchrvec[tmpchrvec=='na'] = '-9'
tmpchrvec[tmpchrvec.isnull()] = '-9'
tmpchrvec[tmpchrvec=='nan'] = '-9'
tmpchrvec[tmpchrvec==' '] = '-9'
tmpchrvec = tmpchrvec.astype('float').astype('int')
return tmpchrvec
except:
raise
def get_header(fh, lines=5):
(openfunc, _) = get_compression(fh)
header = []
with openfunc(fh) as f:
for line in it.islice(f, lines):
line = line if isinstance(line, six.string_types) else line.decode('utf-8')
header.append(line.rstrip('\n'))
return header
def get_compression_and_open(fh):
(openfunc, _) = get_compression(fh)
return openfunc(fh)
def get_compression(fh):
'''
Read filename suffixes and figure out whether it is gzipped,bzip2'ed or not compressed
'''
if fh.endswith('gz'):
compression = 'gzip'
openfunc = gzip.open
elif fh.endswith('bz2'):
compression = 'bz2'
openfunc = bz2.BZ2File
else:
openfunc = open
compression = None
return openfunc, compression