-
Notifications
You must be signed in to change notification settings - Fork 31
/
Copy pathking.py
executable file
·140 lines (101 loc) · 5.19 KB
/
king.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#! /usr/bin/env python3
"""Identity By Descent.

Driver script: submits a chain of dependent cluster jobs that
(1) convert a GDS file to PLINK BED, (2) run ``king --ibdseg`` for
initial kinship estimates, and (3) plot the estimates and convert the
KING output to a sparse matrix, ending with a post-analysis job.
Each stage is held on the job ID(s) of the stage(s) it depends on.
"""
import TopmedPipeline
import sys  # NOTE(review): imported but never used in this script
import os
from argparse import ArgumentParser
from copy import deepcopy
# Help text shown by --help (runtime string).
description = """
Identity by Descent with the following steps:
1) Convert GDS to BED for use with KING
2) KING --ibdseg to get initial kinship estimates
3) Convert KING output to sparse matrix
"""
parser = ArgumentParser(description=description)
parser.add_argument("config_file", help="configuration file")
parser.add_argument("--cluster_type", default="UW_Cluster",
help="type of compute cluster environment [default %(default)s]")
parser.add_argument("--cluster_file", default=None,
help="json file containing options to pass to the cluster")
parser.add_argument("--verbose", action="store_true", default=False,
help="enable verbose output to help debug")
parser.add_argument("-n", "--ncores", default="8",
help="number of cores to use [default %(default)s]")
parser.add_argument("-e", "--email", default=None,
help="email address for job reporting")
parser.add_argument("--print_only", action="store_true", default=False,
help="print qsub commands without submitting")
parser.add_argument("--version", action="version",
version="TopmedPipeline "+TopmedPipeline.__version__,
help="show the version number and exit")
args = parser.parse_args()
# Unpack CLI options into locals used throughout the script.
configfile = args.config_file
cluster_file = args.cluster_file
cluster_type = args.cluster_type
ncores = args.ncores  # kept as a string; passed through to king's --cpus
email = args.email
print_only = args.print_only
verbose = args.verbose
# Flag string forwarded to every R driver invocation below.
version = "--version " + TopmedPipeline.__version__
# Cluster abstraction: job submission, path lookup, and analysis bookkeeping.
cluster = TopmedPipeline.ClusterFactory.createCluster(cluster_type, cluster_file, verbose)
pipeline = cluster.getPipelinePath()
submitPath = cluster.getSubmitPath()
# runRscript.sh is the generic driver that runs an R script on the cluster
# (presumably wraps Rscript — defined outside this file).
driver = os.path.join(submitPath, "runRscript.sh")
# Read the user's config and create the standard output subdirectories;
# directorySetup also fills in the *_prefix keys used below — TODO confirm.
configdict = TopmedPipeline.readConfig(configfile)
configdict = TopmedPipeline.directorySetup(configdict, subdirs=["config", "data", "log", "plots"])
# analysis init (start-of-analysis bookkeeping, e.g. logging; honors print_only)
cluster.analysisInit(print_only=print_only)
# --- Stage 1: convert GDS to BED ---------------------------------------
job = "gds2bed"
rscript = os.path.join(pipeline, "R", job + ".R")
# include all samples in BED: override any sample filter with the sentinel "NA"
config = deepcopy(configdict)
config["sample_include_file"] = "NA"
# NOTE(review): `configfile` is reassigned here, shadowing the CLI config
# path read above; intentional, but easy to misread.
configfile = configdict["config_prefix"] + "_" + job + ".config"
TopmedPipeline.writeConfig(config, configfile)
jobid = cluster.submitJob(job_name=job, cmd=driver, args=[rscript, configfile, version], email=email, print_only=print_only)
## this swaps alleles to make KING --ibdseg happy about reading the file
# --- Stage 2: plink --make-bed in place on the new BED fileset ---------
job = "plink_make-bed"
bedprefix = configdict["bed_file"]
arglist = ["--bfile", bedprefix, "--make-bed", "--out", bedprefix]
plinkid = cluster.submitJob(binary=True, job_name=job, cmd="plink", args=arglist, holdid=[jobid], email=email, print_only=print_only)
## when input and output files have same name, plink renames input with "~"
# Clean up the "~" backups plink leaves behind.
# NOTE(review): the ".*~" glob only expands if submitJob runs the command
# through a shell — TODO confirm.
tmpid = cluster.submitJob(binary=True, job_name="rm_tmp_files", cmd="rm", args=[bedprefix + ".*~"], holdid=[plinkid], email=email, print_only=print_only)
# --- Stage 3: KING --ibdseg kinship estimation -------------------------
job = "king_ibdseg"
bedfile = configdict["bed_file"] + ".bed"
outprefix = configdict["data_prefix"] + "_king_ibdseg"
arglist = ["-b", bedfile, "--cpus", ncores, "--ibdseg", "--prefix", outprefix]
segid = cluster.submitJob(binary=True, job_name=job, cmd="king", args=arglist, holdid=[plinkid], request_cores=ncores, email=email, print_only=print_only)
# KING writes its segment output to <prefix>.seg
kingfile = outprefix + ".seg"
# gzip output; `segid` is rebound so later stages hold on the gzip job,
# and `kingfile` is rebound to the compressed name they will read.
segid = cluster.submitJob(binary=True, job_name="gzip", cmd="gzip", args=[kingfile], holdid=[segid], email=email, print_only=print_only)
kingfile = kingfile + ".gz"
# --- Stage 4a: kinship plots from the KING output ----------------------
job = "kinship_plots"
rscript = os.path.join(pipeline, "R", job + ".R")
config = deepcopy(configdict)
config["kinship_file"] = kingfile
config["kinship_method"] = "king_ibdseg"
config["out_file_all"] = configdict["plots_prefix"] + "_king_ibdseg_kinship_all.pdf"
config["out_file_cross"] = configdict["plots_prefix"] + "_king_ibdseg_kinship_cross.pdf"
config["out_file_study"] = configdict["plots_prefix"] + "_king_ibdseg_kinship_study.pdf"
configfile = configdict["config_prefix"] + "_" + job + "_ibdseg.config"
TopmedPipeline.writeConfig(config, configfile)
segplotid = cluster.submitJob(job_name=job, cmd=driver, args=[rscript, configfile, version], holdid=[segid], email=email, print_only=print_only)
# --- Stage 4b: convert KING output to a sparse kinship matrix ----------
# NOTE(review): this R script reads the input path from "king_file",
# unlike kinship_plots which uses "kinship_file" — presumably matching
# each script's expected config keys; verify against the R sources.
job = "king_to_matrix"
rscript = os.path.join(pipeline, "R", job + ".R")
config = deepcopy(configdict)
config["king_file"] = kingfile
config["kinship_method"] = "king_ibdseg"
config["out_prefix"] = configdict["data_prefix"] + "_king_ibdseg_Matrix"
configfile = configdict["config_prefix"] + "_" + job + "_ibdseg.config"
TopmedPipeline.writeConfig(config, configfile)
segmatid = cluster.submitJob(job_name=job, cmd=driver, args=[rscript, configfile, version], holdid=[segid], email=email, print_only=print_only)
# post analysis: runs after both the plots and matrix jobs complete,
# passing the analysis name/log/start-time for end-of-run bookkeeping.
bname = "post_analysis"
job = "king" + "_" + bname
jobpy = bname + ".py"
pcmd=os.path.join(submitPath, jobpy)
holdlist = [segplotid, segmatid]
argList = ["-a", cluster.getAnalysisName(), "-l", cluster.getAnalysisLog(),
"-s", cluster.getAnalysisStartSec()]
cluster.submitJob(binary=True, job_name=job, cmd=pcmd, args=argList,
holdid=holdlist, print_only=print_only)