-
Notifications
You must be signed in to change notification settings - Fork 0
/
docker_st.py
104 lines (86 loc) · 3.63 KB
/
docker_st.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import sys, json
import subprocess
from docker_dt import ExpRunner
from os.path import expanduser
from pssh.clients.native.single import SSHClient
class SingleNodeExp(ExpRunner):
    """Run a training script inside a single Docker container on this host.

    ``run`` rebuilds the ``~/autorun`` workspace, starts the training image
    locally, and launches the configured script over SSH to ``localhost``.
    Helpers used but not defined here (``_trans_docker_path``, ``_get_logs``,
    ``move_log``) are expected to be provided by ``ExpRunner``.
    """

    # Single source of truth for the image, used by both the pull and the run
    # command so the two cannot drift apart.
    DOCKER_IMAGE = "zarzen/horovod-mod:nccl-noSum-noSock"

    def __init__(self, config):
        """Store ``config`` and unpack the settings this runner needs.

        Args:
            config: Mapping loaded from the experiment's JSON config file.

        NOTE(review): the base-class initializer is not called here;
        presumably it sets up state a single-node run does not need --
        confirm against ``ExpRunner``.
        """
        self.config = config
        self._parse_config(config)
        self.host_nodes = []

    def _parse_config(self, config):
        """Copy the required entries of ``config`` onto the instance.

        Raises:
            KeyError: If any of the expected keys is missing from ``config``.
        """
        self.host_user_dir = config["host_user_dir"]
        self.docker_user_dir = config["docker_user_dir"]
        self.docker_user = config["docker_user"]
        self.docker_ssh_port = config["docker_ssh_port"]
        self.script_path = self._trans_docker_path(config["script_path"])
        self.script_args = config["script_args"]
        self.log_folder = config['log_folder']
        self.docker_key = config["docker_ssh_key"]
        # Fixed "ST" markers; presumably read by inherited log-handling code
        # to label single-node runs -- verify against ExpRunner.
        self.bw_limit = "ST"
        self.default_bw = "ST"

    def _start_containers(self):
        """Kill running containers, pull the image, and start one locally.

        The stop command kills every container ``docker ps -q`` lists, not
        only ones started by this script. The three commands run in order
        through the shell and their exit statuses are ignored (for example,
        the kill step reports an error when no container is running).
        """
        # Host directories under host_user_dir are mounted at the matching
        # paths under docker_user_dir inside the container.
        mounts = (
            "-v {host}/autorun/distributed-training:{docker}/distributed-training "
            "-v {host}/autorun/horovod_logs:{docker}/horovod_logs "
            "-v {host}/data:{docker}/data "
        ).format(host=self.host_user_dir, docker=self.docker_user_dir)

        stop_cmd = "docker kill $(docker ps -q)"
        pull_cmd = "docker pull " + self.DOCKER_IMAGE
        start_cmd = (
            "sudo docker run --gpus 1 --network=host --detach --ipc=host "
            + mounts
            + self.DOCKER_IMAGE
        )

        for shell_cmd in (stop_cmd, pull_cmd, start_cmd):
            subprocess.run(shell_cmd, shell=True)

    def _init_host_env(self):
        """Recreate the ``~/autorun`` workspace and clone the training repo.

        Any existing ``~/autorun`` directory is deleted first, so logs from
        earlier runs that were left there are lost.
        """
        check_cmd = (
            "rm -rf ~/autorun; mkdir ~/autorun; mkdir ~/autorun/horovod_logs; "
            "mkdir ~/autorun/horovod_logs/hooks; "
            "mkdir ~/autorun/horovod_logs/model_log; "
            "mkdir ~/autorun/horovod_logs/mpi_events; "
            "mkdir ~/autorun/logs/; "
            "mkdir ~/autorun/logs/net; mkdir ~/autorun/logs/cpu; mkdir ~/data "
        )
        subprocess.run(check_cmd, shell=True)

        clone_cmd = (
            "cd ~/autorun;"
            "git clone https://github.com/handar423/distributed-training.git"
        )
        subprocess.run(clone_cmd, shell=True)

    def run(self):
        """Execute the whole experiment: set up, train, then collect logs."""
        self._init_host_env()
        self._start_containers()
        self._init_docker_ssh()
        # Recorded before training starts; presumably lets move_log() tell
        # new logs from pre-existing ones -- verify against ExpRunner.
        self.exist_logs = self._get_logs()

        cmd = self.build_train_cmd()
        print('running command:', cmd)
        self._exe_cmd(self.contianer, cmd)
        print('End experiment')
        self.move_log()

    def build_train_cmd(self):
        """Return the shell command that launches the training script."""
        return "python3 {} {}".format(self.script_path, self.script_args)

    def _init_docker_ssh(self):
        """Open the SSH connection to the local container.

        The port comes from the ``docker_ssh_port`` config entry rather than
        a hard-coded value; ``int()`` accepts it whether the JSON stored it
        as a number or as a numeric string.
        """
        # The attribute name (including its spelling) is kept as-is because
        # other code may refer to it.
        self.contianer = SSHClient(
            "localhost",
            user=self.docker_user,
            port=int(self.docker_ssh_port),
            pkey=self.docker_key,
        )

    def _exe_cmd(self, client, cmd):
        """Run ``cmd`` through ``client`` and print its output.

        All stdout lines are printed first, followed by all stderr lines.

        Args:
            client: Object exposing ``run_command(cmd)`` whose result has
                iterable ``stdout`` and ``stderr`` attributes.
            cmd: Command line to execute.
        """
        ret = client.run_command(cmd)
        for line in ret.stdout:
            print(line)
        for line in ret.stderr:
            print(line)
def main():
    """Load the JSON config named on the command line and run the experiment.

    The config file path is taken from the first command-line argument. When
    it is missing, a hint is printed and the process exits with status 1.

    Raises:
        SystemExit: If no config file path was supplied.
    """
    if len(sys.argv) < 2:
        print("Please specific config file")
        # Exit non-zero so calling scripts can detect the usage error.
        sys.exit(1)

    # Close the config file before the (potentially long) experiment starts.
    with open(sys.argv[1]) as config_file:
        config = json.load(config_file)

    exp = SingleNodeExp(config)
    exp.run()
# Script entry point: run the experiment only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()