Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve SGE system #446

Merged
merged 46 commits into from
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
11fe336
revise SGE
thangckt Mar 26, 2024
8140a43
Update pbs.py
thangckt Mar 26, 2024
b893544
Update pbs.py
thangckt Mar 26, 2024
82b7262
Update pbs.py
thangckt Mar 26, 2024
01ffd14
add sge_qe_name
thangckt Mar 26, 2024
56e3afe
Update pbs.py
thangckt Mar 26, 2024
703aa49
y
thangckt Mar 26, 2024
2c3b38a
Update submission.py
thangckt Mar 26, 2024
f73e0f9
u
thangckt Mar 26, 2024
957cf4a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 26, 2024
034c9d7
Update pbs.py
thangckt Mar 26, 2024
3f07742
Update pbs.py
thangckt Mar 26, 2024
499c691
Merge branch 'PR' into master
thangckt Mar 26, 2024
76150bf
Merge pull request #1 from thangckt/master
thangckt Mar 26, 2024
ef28a1b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 26, 2024
65dca91
u
thangckt Mar 27, 2024
ffe52e1
Merge pull request #2 from thangckt/master
thangckt Mar 27, 2024
8d7b566
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 27, 2024
8c50c4f
u
thangckt Mar 27, 2024
89d2a41
Merge branch 'PR' into master
thangckt Mar 27, 2024
2093336
Merge pull request #3 from thangckt/master
thangckt Mar 27, 2024
920984d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 27, 2024
c8f5657
y
thangckt Mar 27, 2024
ef6ebeb
Merge branch 'master' of https://github.com/thangckt/dpdispatcher
thangckt Mar 27, 2024
f56fa50
Merge branch 'PR' into master
thangckt Mar 27, 2024
e06c648
Merge pull request #4 from thangckt/master
thangckt Mar 27, 2024
6d6c973
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 27, 2024
dae7909
Update pbs.py
thangckt Mar 27, 2024
4ca179c
Merge pull request #5 from thangckt/master
thangckt Mar 27, 2024
83644c3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 27, 2024
b1758d9
Create _version.py
thangckt Apr 4, 2024
62a3459
Delete dpdispatcher/_version.py
thangckt Apr 4, 2024
774b130
Merge pull request #6 from deepmodeling/master
thangckt Apr 13, 2024
2ae2e62
Update pbs.py
thangckt Apr 13, 2024
ad6b971
Merge branch 'master' of https://github.com/deepmodeling/dpdispatcher…
thangckt May 2, 2024
d02adf6
Merge pull request #8 from deepmodeling/master
thangckt May 7, 2024
3aca952
Merge pull request #9 from deepmodeling/master
thangckt May 19, 2024
9fc0f84
Merge branch 'master' into PR
thangckt May 28, 2024
95b8aa5
Merge pull request #10 from deepmodeling/master
thangckt May 28, 2024
cc7efcd
Merge pull request #11 from thangckt/master
thangckt May 28, 2024
99589d3
u
thangckt May 29, 2024
711da1d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 29, 2024
1e0c21e
Update test_lsf_script_generation.py
thangckt May 29, 2024
10cf5d3
Merge branch 'PR' of https://github.com/thangckt/dpdispatcher into PR
thangckt May 29, 2024
e31a578
Update test_lsf_script_generation.py
thangckt May 29, 2024
09a17a8
Merge branch 'master' into PR
thangckt May 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions dpdispatcher/machine.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,8 +261,7 @@ def gen_script_env(self, job):

source_list = job.resources.source_list
for ii in source_list:
line = f"{{ source {ii}; }} \n"
source_files_part += line
source_files_part += f"source {ii}\n"

export_envs_part = ""
envs = job.resources.envs
Expand Down
30 changes: 23 additions & 7 deletions dpdispatcher/machines/pbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,10 @@ def gen_script_header(self, job):

sge_script_header_template = """
#!/bin/bash
#$ -N dpdispatcher_submit
{select_node_line}
#$ -S /bin/bash
#$ -cwd

#$ -N dp_job
{select_node_line}
"""


Expand All @@ -209,21 +209,37 @@ def __init__(
)

def gen_script_header(self, job):
### Ref: https://softpanorama.org/HPC/PBS_and_derivatives/Reference/pbs_command_vs_sge_commands.shtml
# resources.number_node is not used in SGE
resources = job.resources
sge_script_header_dict = {}
# resources.number_node is not used
sge_script_header_dict["select_node_line"] = (
f"#$ -pe mpi {resources.cpu_per_node} "
f"#$ -pe {resources.sge_pe_name} {resources.cpu_per_node}\n"
)
# resources.queue_name is not necessary
sge_script_header = sge_script_header_template.format(**sge_script_header_dict)
if resources.queue_name != "":
sge_script_header_dict["select_node_line"] += (
f"#$ -q {resources.queue_name}"
)
if (
resources["strategy"].get("customized_script_header_template_file")
is not None
):
file_name = resources["strategy"]["customized_script_header_template_file"]
sge_script_header = customized_script_header_template(file_name, resources)
else:
sge_script_header = sge_script_header_template.format(
**sge_script_header_dict
)
return sge_script_header

def do_submit(self, job):
script_file_name = job.script_file_name
script_str = self.gen_script(job)
job_id_name = job.job_hash + "_job_id"
self.context.write_file(fname=script_file_name, write_str=script_str)
script_run_str = self.gen_script_command(job)
script_run_file_name = f"{job.script_file_name}.run"
self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
script_file_dir = self.context.remote_root
stdin, stdout, stderr = self.context.block_checkcall(
"cd {} && {} {}".format(script_file_dir, "qsub", script_file_name)
Expand Down
8 changes: 8 additions & 0 deletions dpdispatcher/submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -961,6 +961,8 @@ class Resources:
The queue name of batch job scheduler system.
group_size : int
The number of `tasks` in a `job`.
sge_pe_name : str
thangckt marked this conversation as resolved.
Show resolved Hide resolved
The parallel environment name of SGE.
custom_flags : list of str
Extra lines passed to the header of the job submission script.
strategy : dict
Expand Down Expand Up @@ -991,6 +993,7 @@ def __init__(
queue_name,
group_size,
*,
sge_pe_name="mpi",
custom_flags=[],
strategy=default_strategy,
para_deg=1,
Expand All @@ -1011,6 +1014,7 @@ def __init__(
self.group_size = group_size

# self.extra_specification = extra_specification
self.sge_pe_name = sge_pe_name
self.custom_flags = custom_flags
self.strategy = strategy
self.para_deg = para_deg
Expand Down Expand Up @@ -1057,6 +1061,7 @@ def serialize(self):
resources_dict["queue_name"] = self.queue_name
resources_dict["group_size"] = self.group_size

resources_dict["sge_pe_name"] = self.sge_pe_name
resources_dict["custom_flags"] = self.custom_flags
resources_dict["strategy"] = self.strategy
resources_dict["para_deg"] = self.para_deg
Expand All @@ -1079,6 +1084,7 @@ def deserialize(cls, resources_dict):
gpu_per_node=resources_dict.get("gpu_per_node", 0),
queue_name=resources_dict.get("queue_name", ""),
group_size=resources_dict["group_size"],
sge_pe_name=resources_dict.get("sge_pe_name", "mpi"),
custom_flags=resources_dict.get("custom_flags", []),
strategy=resources_dict.get("strategy", default_strategy),
para_deg=resources_dict.get("para_deg", 1),
Expand Down Expand Up @@ -1127,6 +1133,7 @@ def arginfo(detail_kwargs=True):
doc_gpu_per_node = "gpu numbers of each node assigned to each job."
doc_queue_name = "The queue name of batch job scheduler system."
doc_group_size = "The number of `tasks` in a `job`. 0 means infinity."
doc_sge_pe_name = "The parallel environment name of SGE."
doc_custom_flags = "The extra lines pass to job submitting script header"
doc_para_deg = "Decide how many tasks will be run in parallel."
doc_source_list = "The env file to be sourced before the command execution."
Expand Down Expand Up @@ -1191,6 +1198,7 @@ def arginfo(detail_kwargs=True):
),
Argument("queue_name", str, optional=True, doc=doc_queue_name, default=""),
Argument("group_size", int, optional=False, doc=doc_group_size),
Argument("sge_pe_name", str, optional=True, doc=doc_sge_pe_name),
thangckt marked this conversation as resolved.
Show resolved Hide resolved
Argument("custom_flags", List[str], optional=True, doc=doc_custom_flags),
# Argument("strategy", dict, optional=True, doc=doc_strategy,default=default_strategy),
strategy_format,
Expand Down
2 changes: 2 additions & 0 deletions tests/sample_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def get_sample_resources(cls):
gpu_per_node=1,
queue_name="T4_4_15",
group_size=2,
sge_pe_name="mpi",
custom_flags=[],
strategy={"if_cuda_multi_devices": False},
para_deg=1,
Expand All @@ -43,6 +44,7 @@ def get_sample_resources_dict(cls):
"gpu_per_node": 1,
"queue_name": "T4_4_15",
"group_size": 2,
"sge_pe_name": "mpi",
"custom_flags": [],
"strategy": {"if_cuda_multi_devices": False, "ratio_unfinished": 0.0},
"para_deg": 1,
Expand Down
1 change: 1 addition & 0 deletions tests/test_argcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def test_resources_argcheck(self):
"para_deg": 1,
"prepend_script": [],
"queue_name": "haha",
"sge_pe_name": "mpi",
"source_list": [],
"strategy": {"if_cuda_multi_devices": False, "ratio_unfinished": 0.0},
"wait_time": 0,
Expand Down