forked from 23andMe/yhaplo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert2genos.py
executable file
·189 lines (148 loc) · 5.64 KB
/
convert2genos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#!/usr/bin/env python
#
# David Poznik
# 2016.6.30
# convert2genos.py
#----------------------------------------------------------------------
import argparse
import os
import sys
sys.path.append('..')
import utils
DESCRIPTION = '''
Converts data to .genos.txt format for yHaplogroup sofware.
Input format options:
1. .ped and .map
2. .23andMe.txt
Column 1: SNP identifier (ignored)
Column 2: Chromosome (row retained only if chromosome in CHROMOSOME_SET)
Column 3: Physical coordinate (GRCh37 assumed)
Column 4: Allele 1 (row retained only if allele 1 in ALLELE_SET)
Column 5: Allele 2 (if present)
'''
CHROMOSOME_SET = { '24', 'Y' }
ALLELE_SET = set('ACGTDI')
fnEnding2fnTypeDict = {
'.ped': 'ped',
'.23andMe.txt': 'ttam',
'.acom.txt': 'ttam',
}
outDir = 'converted'
utils.mkdirP(outDir)
#----------------------------------------------------------------------
# ped and map
def convertPed(pedFN, fnRoot, fnEnding):
'reads a .ped and a .map and converts to .genos.txt'
mapFN = pedFN.replace(fnEnding, '.map')
outFN = '%s/%s.genos.txt' % (outDir, fnRoot)
outFile = open(outFN, 'w')
indexList = readMap(mapFN, outFile)
processPed(pedFN, indexList, outFile)
print 'Output: %s\n' % outFN
outFile.close()
def readMap(mapFN, outFile):
'reads a .map file'
if not os.path.exists(mapFN):
sys.exit('ERROR. Expecting map file: %s' % mapFN)
print 'Map: %s\n' % mapFN
positionList = list()
indexList = list()
index = 0
with open(mapFN, 'r') as mapFile:
for line in mapFile:
chromosome, _, _, position = line.strip().split()
if chromosome in CHROMOSOME_SET:
positionList.append(position)
indexList.append(index)
index += 1
outFile.write('ID\t%s\n' % '\t'.join(positionList))
return indexList
def processPed(pedFN, indexList, outFile):
'process a .ped file'
diploidIndexList = [2 * i for i in indexList]
numIndividuals, numFemale = 0, 0
with open(pedFN, 'r') as inFile:
for line in inFile:
lineList = line.strip().split()
sex = lineList[4]
if sex == '2':
numFemale += 1
continue
diploidGenoList = lineList[6:]
haploidGenoList = list()
for i in diploidIndexList:
allele1, allele2 = diploidGenoList[i], diploidGenoList[i+1]
if allele1 in ALLELE_SET and allele1 == allele2:
haploidGenoList.append(allele1)
else:
haploidGenoList.append('.')
numIndividuals += 1
ID = '-'.join(lineList[:2])
outFile.write('%s\t%s\n' % (ID, '\t'.join(haploidGenoList)))
print '%5d females ignored' % numFemale
print '%5d individuals written' % numIndividuals
print '%5d markers\n' % len(indexList)
#----------------------------------------------------------------------
# 23andMe
def convertTTAM(inFN, ID):
'reads single-sample flat format and converts to .genos.txt'
outFN = '%s/%s.genos.txt' % (outDir, ID)
genoTupleList = list()
numNonY, numHetOrNoCall = 0, 0
with open(inFN, 'r') as inFile:
for line in inFile:
if line[0] == '#' or line[:4] == 'rsid':
continue
lineList = line.strip().split()
numFields = len(lineList)
chromosome, position, allele1 = lineList[1:4]
if numFields == 5:
allele2 = lineList[4]
elif numFields != 4:
sys.exit('ERROR. Encountered line with %d elements:\n%s' % \
(numFields, line))
if chromosome in CHROMOSOME_SET:
if allele1 in ALLELE_SET and (numFields == 4 or allele1 == allele2):
genoTupleList.append((position, allele1))
else:
numHetOrNoCall += 1
else:
numNonY += 1
with open(outFN, 'w') as outFile:
writeLineFromTupleList(0, genoTupleList, outFile, 'ID')
writeLineFromTupleList(1, genoTupleList, outFile, ID)
print '%6d non-Y genotypes ignored' % numNonY
print '%6d Y-chromosome genotypes ignored (het or no-call)' % numHetOrNoCall
print '%6d written\n' % len(genoTupleList)
print 'Output: %s\n' % outFN
def writeLineFromTupleList(index, tupleList, outFile, rowHeader=''):
'given a list of tuples, writes one line with the i-th element of each tuple'
outFile.write(rowHeader)
for myTuple in tupleList:
outFile.write('\t%s' % myTuple[index])
outFile.write('\n')
#----------------------------------------------------------------------
# main
def main():
parser = argparse.ArgumentParser(description=DESCRIPTION,
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('inFN', type=str, help='input file name')
args = parser.parse_args()
inFN = args.inFN
if not os.path.exists(inFN):
sys.exit('ERROR. Input file does not exist: %s' % inFN)
print 'Input: %s\n' % inFN
fnType = None
for fnEnding in fnEnding2fnTypeDict:
if inFN.endswith(fnEnding):
fnType = fnEnding2fnTypeDict[fnEnding]
fnRoot = os.path.basename(inFN).replace(fnEnding, '')
break
if fnType == 'ped':
convertPed(inFN, fnRoot, fnEnding)
elif fnType == 'ttam':
convertTTAM(inFN, ID=fnRoot)
else:
sys.exit('ERROR. Input file must .23andMe.txt or (.ped, .map)')
if __name__ == '__main__':
main()