gdc-maf-tool.py
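
# Example invocations (illustrative only; the project ID and file names below
# are placeholders, not taken from this repository):
#     python gdc-maf-tool.py -p TCGA-LUAD -o luad_combined.maf
#     python gdc-maf-tool.py -m gdc_manifest.txt -o combined.maf
# Exactly one of -m/--manifest or -p/--project must be given; -o/--output
# defaults to "outfile.maf".
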
import os
import re
import sys
import json
import csv
import requests
import hashlib
import gzip
import argparse
import datetime

def arg_parse():
    parser = argparse.ArgumentParser(
        description='----GDC MAF Concatenation Tool v1.0----',
        usage='python gdc-maf-tool.py <-m MANIFEST or -p PROJECT_ID>')
    parser.add_argument('-m', '--manifest', action="store",
                        help='Specify MAF files with GDC Manifest')
    parser.add_argument('-p', '--project', action="store",
                        help='Specify MAF files by project')
    parser.add_argument('-o', '--output', metavar='FILE_PREFIX',
                        action="store", dest='o', type=str, default="outfile.maf",
                        help='Designates a name for the output file')
    args = parser.parse_args()
    return args

def main(args):
    '''
    Retrieves and parses the arguments
    '''
    global use_manifest, output_file, manifest_path, project_string
    if args.manifest:
        use_manifest = True
        manifest_path = args.manifest
    if args.project:
        use_manifest = False
        project_string = args.project
    if args.o:
        output_file = args.o
    if args.manifest and args.project:
        error_parse("both_argue")
    if not args.manifest and not args.project:
        error_parse("no_argue")

def error_parse(code):
    '''
    Generates the error messages
    '''
    error = {
        "bad_manifest": "Input must be a valid GDC Manifest. "
                        "\n\tGo to https://portal.gdc.cancer.gov/ to download a manifest",
        "no_result": "Query produced no results",
        "no_argue": "No argument detected, please use the -p or -m flag",
        "both_argue": "Must choose either -p OR -m, not both.",
        "md5sum_mis": "Expected md5sum does not match the file's md5sum value",
        "max_retry": "Maximum retries exceeded"
    }
    print("ERROR: " + error[code])
    sys.exit(2)

def strip_maf_header(maf_file):
    '''
    Removes the MAF comment header (lines beginning with "#")
    '''
    maf_list = []
    for line in maf_file:
        if not line.startswith("#"):
            maf_list.append(line)
    return maf_list

def jsonify_maf(maf_file):
    '''
    Converts the MAF TSV lines to a list of dicts; requires that the comment
    header has already been stripped.
    '''
    master_dict = []
    keys = maf_file[0].strip().split("\t")
    for line in maf_file[1:]:
        split_line = line.strip().split("\t")
        one_line_dict = dict(zip(keys, split_line))
        master_dict.append(one_line_dict)
    return master_dict, keys
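
# For illustration only: jsonify_maf() returns each data row as a dict keyed by
# the MAF column headers (the column names and values below are assumed, not
# taken from a real file), e.g.
#     {"Hugo_Symbol": "TP53", "Chromosome": "chr17", "Start_Position": "7675000", ...}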

def back_to_tsv(full_dict, col_order, prefix):
    '''
    Converts the full concatenated list of dicts to TSV for writing out
    '''
    with open(prefix, "w", newline="") as out_fh:
        dict_writer = csv.DictWriter(out_fh, col_order, delimiter='\t')
        dict_writer.writeheader()
        dict_writer.writerows(full_dict)

def read_in_manifest(manifest_path):
    '''
    Reads in a GDC Manifest to parse out UUIDs
    '''
    with open(manifest_path, "r") as manifest_fh:
        manifest_file = manifest_fh.read().splitlines()
    id_list = []
    if manifest_file[0].strip().split("\t")[0] != "id":
        error_parse("bad_manifest")
    for line in manifest_file[1:]:
        id_list.append(line.strip().split("\t")[0])
    return id_list
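
# Note: the check above only requires that the first header column of the
# manifest is "id" and reads UUIDs from that column; other manifest columns
# (e.g. filename, md5, size), if present, are ignored here.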

def retrieve_ids_by_project(provided, project):
    '''
    Retrieves IDs when provided a project_id or list of UUIDs
    '''
    id_list = []
    endpt = "https://api.gdc.cancer.gov/files"
    filters = [
        ("files.data_format", ["MAF"]),
        ("files.data_type", ["Masked Somatic Mutation"])]
    if project:
        filters.append(("cases.project.project_id", provided.split(",")))
    else:
        filters.append(("files.file_id", provided))
    filters_gdc = {"op": "and", "content": []}
    for field, value in filters:
        filt_core = {"field": field, "value": value}
        single_filt = {"op": "in", "content": filt_core}
        filters_gdc["content"].append(single_filt)
    params = {
        "filters": json.dumps(filters_gdc),
        "fields": "file_id,md5sum,file_name",
        "format": "JSON",
        "size": "10000"
    }
    response = requests.get(endpt, params=params)
    out_hits = json.loads(response.content)["data"]["hits"]
    if len(out_hits) == 0:
        error_parse("no_result")
    for file_entry in out_hits:
        single_dict = dict(zip(["file_id", "md5sum", "file_name"],
                               [file_entry["file_id"], file_entry["md5sum"], file_entry["file_name"]]))
        id_list.append(single_dict)
    return id_list
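
# Sketch of the "filters" payload assembled above for a project query
# (<PROJECT_ID> is a placeholder):
#     {"op": "and", "content": [
#         {"op": "in", "content": {"field": "files.data_format", "value": ["MAF"]}},
#         {"op": "in", "content": {"field": "files.data_type", "value": ["Masked Somatic Mutation"]}},
#         {"op": "in", "content": {"field": "cases.project.project_id", "value": ["<PROJECT_ID>"]}}]}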

def download_maf(single_maf_dict, tmpdir):
    '''
    Downloads a single MAF file and stores it in the tmp directory
    '''
    file_id, exp_md5 = single_maf_dict["file_id"], single_maf_dict["md5sum"]
    retry = True
    retry_num = 0
    while retry and retry_num < 3:
        data_endpt = "https://api.gdc.cancer.gov/data/{}".format(file_id)
        print("> {} | Downloading File | {} |".format(datetime.datetime.now(), file_id))
        response = requests.get(data_endpt, headers={"Content-Type": "application/json"})
        if response.status_code == 200:
            retry = False
        else:
            retry_num += 1
            print("> -- Retrying Download...")
    if not retry:
        response_head_cd = response.headers["Content-Disposition"]
        file_name = re.findall("filename=(.+)", response_head_cd)[0]
        with open("/".join([tmpdir, file_name]), "wb") as out_fh:
            out_fh.write(response.content)
        check_md5sum(file_name, exp_md5, tmpdir)
    elif retry_num == 3:
        error_parse("max_retry")

def download_run(id_list):
    '''
    Runs the MAF download for each file and performs per-session tasks
    '''
    tmpdir = "tmpMAF_" + str(datetime.datetime.now()).split(" ")[0]
    if not os.path.exists(tmpdir):
        os.mkdir(tmpdir)
    for single_maf in id_list:
        download_maf(single_maf, tmpdir)
    print(">-- All MAF Downloads Complete")
    return id_list, tmpdir

def check_md5sum(file_name, exp_md5, tmpdir):
    '''
    Checks the MD5SUM matches the one in the GDC index
    '''
    hash_md5 = hashlib.md5()
    with open("/".join([tmpdir, file_name]), "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    if exp_md5 != hash_md5.hexdigest():
        error_parse("md5sum_mis")

def execute():
    main(arg_parse())
    cat_maf = []
    if use_manifest:
        maf_ids_only = read_in_manifest(manifest_path)
        maf_ids = retrieve_ids_by_project(maf_ids_only, False)
    else:
        maf_ids = retrieve_ids_by_project(project_string, True)
    id_list, tmpdir = download_run(maf_ids)
    for single_maf in id_list:
        # Open the gzipped MAF in text mode so downstream string handling works
        with gzip.open("/".join([tmpdir, single_maf["file_name"]]), "rt") as maf_fh:
            maf_list = strip_maf_header(maf_fh)
        jsonified, keys = jsonify_maf(maf_list)
        cat_maf += jsonified
    back_to_tsv(cat_maf, keys, output_file)


if __name__ == "__main__":
    execute()