forked from voichek/kmersGWAS
-
Notifications
You must be signed in to change notification settings - Fork 0
/
kmers_gwas.py
executable file
·278 lines (244 loc) · 16.8 KB
/
kmers_gwas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
##
## @file kmers_gwas.py
## @brief A pipeline to associate phenotype with k-mers and snps
##
## This script contains the logic of associating a phenotype with k-mers or SNPs
## and all the organization of parameters used in the different programs and
## scripts.
##
## @author Yoav Voichek (YV), [email protected]
##
## @internal
## Created 07/31/18
## Company Max Planck Institute for Developmental Biology Dep 6
## Copyright Copyright (c) 2018, Yoav Voichek
##
## This source code is released for free distribution under the terms of the
## GNU General Public License as published by the Free Software Foundation.
## =====================================================================================
## General parameters:
import subprocess
import time
import sys
## Save all the relevant directories and files in one dictionary
paths = {}
# scripts/ programs prefix
paths["base_path"] = sys.path[0] + "/"
# script with python functionalities
paths["gen_script"] = paths["base_path"] + "src/py/functions.py"
# program the associate kmers
paths["assoicate_kmers"] = paths["base_path"] + "bin/associate_kmers"
# program to associate snps
paths["associate_snps"] = paths["base_path"] + "bin/associate_snps"
# script to permute and transform phenotypes
paths["Permute_phenotype"] = paths["base_path"] + "src/R/transform_and_permute_phenotypes.R"
# script to intersect kinship matrix to the current samples
paths["kinship_intersect_script"] = paths["base_path"] + "src/py/align_kinship_phenotype.py"
# logic of parsing the user given params
paths["parser_script"] = paths["base_path"] + "src/py/pipeline_parser.py"
# average phenotypes if there are a few repeats of some accessions
paths["average_pheno_script"] = paths["base_path"] + "src/awk/average_phenotypes.awk"
####################################################################################################
execfile(paths["gen_script"])
execfile(paths["parser_script"])
####################################################################################################
def main():
## Read and parse user defined parameters
args = parser.parse_args()
## Creating the directory to work in
create_dir(args.outdir)
## open log file
paths["log_file"] = "%s/log_file" % args.outdir
f_log = file(paths["log_file"],"w",buffering=1)
f_log.writelines(str(args))
base_created_files = "pheno"
## Copying the phenotype data to the directory
paths["pheno_orig_fn"] = "%s/%s.original_phenotypes" % (args.outdir, base_created_files)
copy_phenotypes(get_file(args.fn_phenotype), paths["pheno_orig_fn"], paths["average_pheno_script"], f_log)
## Information on SNPs dataset
if args.snps_matrix is not None:
paths["snps_fam"] = args.snps_matrix + ".fam" # we use this to get the order of accessions
else: # patch (not secure, should change in the future)
# Create a file that will hold the order of the accessions of the kinship matrix from the k-mers
paths["snps_fam"] = "%s/%s_for_accessions_order.fam" % (args.outdir, base_created_files)
fout = file(paths["snps_fam"], "w")
acc_order = [x for x in file(args.kmers_table + ".names", "r").read().split("\n") if len(x)>0]
for (i,a) in enumerate(acc_order):
fout.write("{0} {0} 0 0 0 -9\n".format(a))
fout.close()
## Which kinship to use
if args.use_kinship_from_kmers or (args.snps_matrix is None):
paths["original_kinship_fn"] = args.kmers_table + ".kinship"
f_log.write("Using kinship calculated on k-mers")
else:
paths["original_kinship_fn"] = args.snps_matrix + ".kinship"
f_log.write("Using kinship calculated on SNPs")
## Creating kinship matrix specific to used accessions
paths["pheno_intersected_fn"] = "%s/%s.phenotypes" % (args.outdir, base_created_files)
paths["kinship_fn"] = "%s/%s.kinship" % (args.outdir, base_created_files)
cur_cmd = "python2.7 %s --pheno %s --fam_file %s --kinship_file %s --output_pheno %s " + \
"--output_kinship %s --DBs_list %s"
cur_cmd = cur_cmd % (paths["kinship_intersect_script"], paths["pheno_orig_fn"], paths["snps_fam"],
paths["original_kinship_fn"], paths["pheno_intersected_fn"], paths["kinship_fn"], args.kmers_table+".names")
run_and_log_command(cur_cmd, f_log)
## Transformation of the phenotypes and creating permutations of phenotype
paths["pheno_permuted_fn"] = "%s/%s.phenotypes_and_permutations" % (args.outdir, base_created_files)
paths["pheno_permuted_transformed_fn"] = "%s/%s.phenotypes_permuted_transformed" % (args.outdir, base_created_files)
paths["EMMA_perm_log_fn"] = "%s/EMMA_perm.log" % (args.outdir)
paths["log_R_permute"] = "%s/phenotypes_transformation_permutation.log" % (args.outdir)
cur_cmd = "Rscript %s %s %s %d %s %s %s > %s"
cur_cmd = cur_cmd % (paths["Permute_phenotype"] , paths["pheno_intersected_fn"], paths["kinship_fn"],\
args.n_permutations, paths["pheno_permuted_fn"], paths["pheno_permuted_transformed_fn"],
paths["EMMA_perm_log_fn"], paths["log_R_permute"])
run_and_log_command(cur_cmd, f_log)
phenotypes_names = file(paths["pheno_permuted_fn"] ,"r").read().split("\n")[0].split("\t")[1:]
## handles for all gemma runs (to monitor how many are running)
gemma_handles = []
##############################################################################################################
# Calculate affective minor allele frequency
n_accession = len(file(paths["pheno_intersected_fn"]).read().split("\n")[1:-1])
if n_accession < args.min_data_points:
err_msg = "Can't run with less than %d data points, there is only %d values here" % (args.min_data_points, n_accession)
f_log.write(err_msg + "\n")
f_log.close()
os.system("touch %s/NOT_ENOUGH_DATA" % args.outdir)
exit()
affective_maf = args.maf
maf_from_mac = float(args.mac) / float(n_accession)
if maf_from_mac > affective_maf:
affective_maf = maf_from_mac
print "affective MAF =", affective_maf
##############################################################################################################
######################################## k-mers associations #######################################
##############################################################################################################
if args.run_kmers:
paths["kmers_associations_dir"] = "%s/kmers" % args.outdir # Create relevant directory
run_and_log_command("mkdir %s" % paths["kmers_associations_dir"], f_log)
cur_cmd = "%s -p %s -b %s -o %s -n %d --parallel %d --kmers_table %s --kmer_len %d --maf %f --mac %d" %\
(paths["assoicate_kmers"], paths["pheno_permuted_transformed_fn"], base_created_files,
paths["kmers_associations_dir"], args.n_kmers, args.parallel,
args.kmers_table, args.kmers_len, args.maf, args.mac)
# Optional parameters for k-mers associations
if args.kmers_pattern_counter:
cur_cmd = cur_cmd + " --pattern_counter"
# Optional parameter to save more k-mers in the heap for the phenotype itself (and not the permutation of it)
if args.n_extra_phenotype_kmers is not None:
f_log.write("Saving a different number ({}) of k-mers in heap for the phenotype itseld\n".format(args.n_extra_phenotype_kmers))
cur_cmd = cur_cmd + " --first_phenotype_best " + str(args.n_extra_phenotype_kmers)
paths["log_kmers_associations"] = "%s/associate_kmers.log" % (args.outdir)
cur_cmd = cur_cmd + " 2> %s" % paths["log_kmers_associations"]
run_and_log_command(cur_cmd, f_log)
## Run GEMMA on the results from the k-mers associations to get the exact scoring
f_log.writelines("We have %d phenotypes\n" % len(phenotypes_names) )
for (p_ind, p_name) in enumerate(phenotypes_names):
fam_fn = get_file_type_in_dir(paths["kmers_associations_dir"], "%s.fam" % p_name)
## for all the fam files, move to fam.orig and create a new fam file with the non-transformed phenotype
run_and_log_command("mv %s %s.orig" % (fam_fn, fam_fn), f_log) # Saving the original fam (which has transformed values)
base_name = fam_fn[:-4]
# building the fam file to run gemma
cur_cmd = r"""cat %s | tail -n +2 | awk '{print $1 " " $1 " 0 0 0 " $%d}' > %s""" % \
(paths["pheno_permuted_fn"], p_ind +2, fam_fn)
run_and_log_command(cur_cmd, f_log)
# run gemma on best k-mers
cur_cmd = "%s -bfile %s -lmm 2 -k %s -outdir %s -o %s -maf %f -miss 0.5" % \
(args.gemma_path, base_name, paths["kinship_fn"], \
paths["kmers_associations_dir"] + "/output", p_name, affective_maf)
run_gemma_cmd(cur_cmd, args.parallel, gemma_handles, f_log)
##############################################################################################################
######################################## SNPs associations ########################################
##############################################################################################################
if args.run_one_step_snps and args.run_two_steps_snps:
print "The program can not run both snps approximation and exact model for SNPs"
print "You can choose up to one of this options"
exit()
# The two steps option is similiar to the method for k-mers, we first run GRAMMAR-Gamma for all SNPS, taking
# the best X and then run the full model only on this part.
# Notice that in any case, the two steps is only for the permutation, for the real values, we run the exact
# model on all SNPs
if args.run_one_step_snps or args.run_two_steps_snps:
f_log.write("\n\nStart associating snps\n")
# Create the directory for SNPs associations
paths["snps_associations_dir"] = "%s/snps" % args.outdir
run_and_log_command("mkdir %s" % paths["snps_associations_dir"], f_log)
# Make a link to the bed/bim of the full dataset
paths["snps_table_fn"] = "%s/snps.plink" % paths["snps_associations_dir"]
run_and_log_command("ln -s %s.bed %s.bed" % (args.snps_matrix, paths["snps_table_fn"]), f_log)
run_and_log_command("ln -s %s.bim %s.bim" % (args.snps_matrix, paths["snps_table_fn"]), f_log)
# building a fam file to run vs. all snps (so it will be expanded to the full format with missing values)
create_full_fam_file("%s.fam" % paths["snps_table_fn"],paths["snps_fam"],paths["pheno_permuted_fn"],args.n_permutations+1)
# Run GEMMA on the full phenotypes
cur_cmd = "%s -bfile %s -lmm 2 -k %s -outdir %s -o %s -n %d -maf %f -miss 0.5" % \
(args.gemma_path, paths["snps_table_fn"], paths["original_kinship_fn"], \
paths["snps_associations_dir"] + "/output", \
phenotypes_names[0], 1, affective_maf)
run_gemma_cmd(cur_cmd, args.parallel, gemma_handles, f_log)
if args.run_two_steps_snps:
cur_cmd = "%s %s %s %s %s %f %d" % \
(paths["associate_snps"],paths["pheno_permuted_transformed_fn"], args.snps_matrix, \
paths["snps_associations_dir"] + "/" + base_created_files, args.n_snps, affective_maf, args.mac)
run_and_log_command(cur_cmd, f_log)
for (p_ind, p_name) in enumerate(phenotypes_names[1:]):
cur_base_bedbim = paths["snps_associations_dir"] + "/" + base_created_files + "." + p_name
# Create the relevan fam file (just link the general one we created)
run_and_log_command("cp %s.fam %s.fam" % (paths["snps_table_fn"], cur_base_bedbim), f_log)
# Run GEMMA
cur_cmd = "%s -bfile %s -lmm 2 -k %s -outdir %s -o %s -n %d -maf %f -miss 0.5" % \
(args.gemma_path, cur_base_bedbim, paths["original_kinship_fn"], \
paths["snps_associations_dir"] + "/output", \
p_name, p_ind+2, affective_maf)
run_gemma_cmd(cur_cmd, args.parallel, gemma_handles, f_log)
if args.run_one_step_snps:
for (p_ind, p_name) in enumerate(phenotypes_names[1:]):
cur_cmd = "%s -bfile %s -lmm 2 -k %s -outdir %s -o %s -n %d -maf %f -miss 0.5" % \
(args.gemma_path, paths["snps_table_fn"], paths["original_kinship_fn"], \
paths["snps_associations_dir"] + "/output", \
p_name, p_ind+1, affective_maf)
run_gemma_cmd(cur_cmd, args.parallel, gemma_handles, f_log)
##############################################################################################################
################################## Accumulate general statistics ###################################
##############################################################################################################
while count_running_gemma(gemma_handles) != 0: #make sure all GEMMA finished running
time.sleep(30)
## Save the smallest p-values for k-mers associations in the permutations
if args.run_one_step_snps or args.run_two_steps_snps:
paths["best snps pvals"] = paths["snps_associations_dir"] + "/" + "best_pvals"
res = calc_best_pvals(paths["snps_associations_dir"] + "/output", paths["best snps pvals"])
th_5per = get_threshold_from_perm(res, "P", args.n_permutations, 0.05)
th_10per = get_threshold_from_perm(res, "P", args.n_permutations, 0.1)
run_and_log_command("echo %f > %s/threshold_5per" % (th_5per, paths["snps_associations_dir"]),f_log)
run_and_log_command("echo %f > %s/threshold_10per" % (th_10per, paths["snps_associations_dir"]),f_log)
run_and_log_command("cat %s/output/phenotype_value.assoc.txt | awk '(-log($9)/log(10)) > %f' > %s/pass_threshold_5per" %\
(paths["snps_associations_dir"], th_5per, paths["snps_associations_dir"]), f_log)
run_and_log_command("cat %s/output/phenotype_value.assoc.txt | awk '(-log($9)/log(10)) > %f' > %s/pass_threshold_10per" %\
(paths["snps_associations_dir"], th_10per, paths["snps_associations_dir"]), f_log)
if args.run_kmers:
paths["best kmers pvals"] = paths["kmers_associations_dir"] + "/" + "best_pvals"
res = calc_best_pvals(paths["kmers_associations_dir"] + "/output", paths["best kmers pvals"])
th_5per = get_threshold_from_perm(res, "P", args.n_permutations, 0.05)
th_10per = get_threshold_from_perm(res, "P", args.n_permutations, 0.1)
run_and_log_command("echo %f > %s/threshold_5per" % (th_5per, paths["kmers_associations_dir"]),f_log)
run_and_log_command("echo %f > %s/threshold_10per" % (th_10per, paths["kmers_associations_dir"]),f_log)
run_and_log_command("cat %s/output/phenotype_value.assoc.txt | awk '(-log($9)/log(10)) > %f' > %s/pass_threshold_5per" %\
(paths["kmers_associations_dir"], th_5per, paths["kmers_associations_dir"]), f_log)
run_and_log_command("cat %s/output/phenotype_value.assoc.txt | awk '(-log($9)/log(10)) > %f' > %s/pass_threshold_10per" %\
(paths["kmers_associations_dir"], th_10per, paths["kmers_associations_dir"]), f_log)
##############################################################################################################
####################################### Clean intermediate files #########################################
##############################################################################################################
if args.remove_intermediate:
for file_type in ["bed","bim","fam"]:
if args.run_kmers:
run_and_log_command("rm %s/%s.*.P*.%s" % (paths["kmers_associations_dir"],base_created_files, file_type), f_log)
if args.run_one_step_snps or args.run_two_steps_snps:
run_and_log_command("rm %s/%s.P*.%s" % (paths["snps_associations_dir"],base_created_files, file_type), f_log)
if args.run_one_step_snps or args.run_two_steps_snps:
run_and_log_command("rm %s/output/P*" % paths["snps_associations_dir"], f_log)
run_and_log_command("gzip %s/output/phenotype_value.assoc.txt" % paths["snps_associations_dir"], f_log)
if args.run_kmers:
run_and_log_command("rm %s/%s.*.P*.%s" % (paths["kmers_associations_dir"],base_created_files, "fam.orig"), f_log)
run_and_log_command("rm %s/output/P*" % paths["kmers_associations_dir"], f_log)
run_and_log_command("gzip %s/output/phenotype_value.assoc.txt" % paths["kmers_associations_dir"], f_log)
# Close log file
f_log.close()
if __name__ == "__main__":
main()