Skip to content

Commit

Permalink
Merge pull request #138 from kbase/dev-service
Browse files Browse the repository at this point in the history
Share scratch with a configured NERSC group
  • Loading branch information
MrCreosote authored Dec 20, 2024
2 parents 3bf87fb + 89ebba4 commit 1fb9a4e
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 8 deletions.
7 changes: 5 additions & 2 deletions cdmtaskservice/app_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,12 @@ async def build_app(
logr.info("Done")
logr.info("Setting up NERSC manager and installing code at NERSC...")
# TODO MULTICLUSTER service won't start if perlmutter is down, need to make it more dynamic
remote_code_loc = Path(cfg.nersc_remote_code_dir) / VERSION
nerscman = await NERSCManager.create(
sfapi_client.get_client, remote_code_loc, cfg.jaws_token, cfg.jaws_group
sfapi_client.get_client,
Path(cfg.nersc_remote_code_dir) / VERSION,
cfg.nersc_file_group,
cfg.jaws_token,
cfg.jaws_group,
)
logr.info("Done")
logr.info("Initializing S3 client... ")
Expand Down
3 changes: 3 additions & 0 deletions cdmtaskservice/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class CDMTaskServiceConfig:
as the remaining lines.
sfapi_user: str - the user name of the user associated with the credentials.
nersc_remote_code_dir: str - the location at NERSC to upload remote code.
nersc_file_group: str - the NERSC group with which downloaded files must be shared.
jaws_token: str - the JAWS token used to run jobs.
jaws_group: str - the JAWS group used to run jobs.
s3_url: str - the URL of the S3 instance to use for data storage.
Expand Down Expand Up @@ -78,6 +79,7 @@ def __init__(self, config_file: BinaryIO):
self.sfapi_cred_path = _get_string_required(config, _SEC_NERSC, "sfapi_cred_path")
self.sfapi_user = _get_string_required(config, _SEC_NERSC, "sfapi_user")
self.nersc_remote_code_dir = _get_string_required(config, _SEC_NERSC, "remote_code_dir")
self.nersc_file_group = _get_string_required(config, _SEC_NERSC, "file_group")
self.jaws_token = _get_string_required(config, _SEC_JAWS, "token")
self.jaws_group = _get_string_required(config, _SEC_JAWS, "group")
self.s3_url = _get_string_required(config, _SEC_S3, "url")
Expand Down Expand Up @@ -116,6 +118,7 @@ def print_config(self, output: TextIO):
f"NERSC client credential path: {self.sfapi_cred_path}",
f"NERSC client user: {self.sfapi_user}",
f"NERSC remote code dir: {self.nersc_remote_code_dir}",
f"NERSC file group: {self.nersc_file_group}",
"JAWS token: REDACTED FOR THE NATIONAL SECURITY OF GONDWANALAND",
f"JAWS group: {self.jaws_group}",
f"S3 URL: {self.s3_url}",
Expand Down
26 changes: 20 additions & 6 deletions cdmtaskservice/nersc/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ async def create(
cls,
client_provider: Callable[[], AsyncClient],
nersc_code_path: Path,
file_group: str,
jaws_token: str,
jaws_group: str,
) -> Self:
Expand All @@ -127,11 +128,16 @@ async def create(
the user associated with the client does not change.
nersc_code_path - the path in which to store remote code at NERSC. It is advised to
include version information in the path to avoid code conflicts.
file_group - the group with which to share downloaded files at NERSC.
jaws_token - a token for the JGI JAWS system.
jaws_group - the group to use for running JAWS jobs.
"""
nm = NERSCManager(client_provider, nersc_code_path)
await nm._setup_remote_code(jaws_token, jaws_group)
await nm._setup_remote_code(
_require_string(file_group, "file_group"),
_require_string(jaws_token, "jaws_token"),
_require_string(jaws_group, "jaws_group"),
)
return nm

def __init__(
Expand All @@ -149,7 +155,7 @@ def _check_path(self, path: Path, name: str):
raise ValueError(f"{name} must be absolute to the NERSC root dir")
return path

async def _setup_remote_code(self, jaws_token: str, jaws_group: str):
async def _setup_remote_code(self, file_group: str, jaws_token: str, jaws_group: str):
# TODO RELIABILITY atomically write files. For these small ones probably doesn't matter?
cli = self._client_provider()
perlmutter = await cli.compute(Machine.perlmutter)
Expand Down Expand Up @@ -178,7 +184,7 @@ async def _setup_remote_code(self, jaws_token: str, jaws_group: str):
),
chmod = "600"
))
res = tg.create_task(dt.run(f"{_DT_WORKAROUND}; echo $SCRATCH"))
scratch = tg.create_task(self._set_up_dtn_scratch(cli, file_group))
if _PIP_DEPENDENCIES:
deps = " ".join(
# may need to do something else if module doesn't have __version__
Expand All @@ -192,11 +198,19 @@ async def _setup_remote_code(self, jaws_token: str, jaws_group: str):
+ f"pip install {deps}" # adding notapackage causes a failure
)
tg.create_task(dt.run(command))
scratch = res.result().strip()
self._dtn_scratch = scratch.result()

async def _set_up_dtn_scratch(self, client: AsyncClient, file_group: str) -> Path:
dt = await client.compute(_DT_TARGET)
scratch = await dt.run(f"{_DT_WORKAROUND}; echo $SCRATCH")
scratch = scratch.strip()
if not scratch:
raise ValueError("Unable to determine $SCRATCH variable for NERSC dtns")
self._dtn_scratch = Path(scratch)
logging.getLogger(__name__).info(f"NERSC DTN scratch path: {self._dtn_scratch}")
logging.getLogger(__name__).info(f"NERSC DTN scratch path: {scratch}")
await dt.run(
f"{_DT_WORKAROUND}; set -e; chgrp {file_group} {scratch}; chmod g+rs {scratch}"
)
return Path(scratch)

async def _run_command(self, client: AsyncClient, machine: Machine, exe: str):
# TODO ERRORHANDLING deal with errors
Expand Down
5 changes: 5 additions & 0 deletions cdmtaskservice_config.toml.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,18 @@ sfapi_cred_path = "{{ KBCTS_SFAPI_CRED_PATH or "" }}"
# The user associated with the client credentials. The user's default shell must be bash.
# If the client credentials are updated but the user doesn't match they will not be accepted.
# It is advised to create a collaboration user for the service.
# The user's scratch directory will be shared with the file_group below.
# The jaws.conf file will be created in the user's home directory on service startup, overwriting
# any extant file.
sfapi_user = "{{ KBCTS_SFAPI_USER or "" }}"

# Where to store remote code at NERSC. This must be writeable by the service account.
remote_code_dir = "{{ KBCTS_NERSC_REMOTE_CODE_DIR or "/global/cfs/cdirs/kbase/cdm_task_service" }}"

# The group with which to share downloaded data files. The JAWS user that will run jobs must
# be in the same group so it can read the input files.
file_group = "{{ KBCTS_NERSC_FILE_GROUP or "kbase" }}"

[JAWS]

# The JGI JAWS token to use to run jobs. This can be obtained from a JAWS representative.
Expand Down
1 change: 1 addition & 0 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ services:
- KBCTS_SFAPI_CRED_PATH=/creds/sfapi_creds
- KBCTS_SFAPI_USER=cdm_ts
- KBCTS_NERSC_REMOTE_CODE_DIR=/global/cfs/cdirs/kbase/cdm_task_service
- KBCTS_NERSC_FILE_GROUP=kbase
# Don't commit your token to github please
- KBCTS_JAWS_TOKEN=tokengoeshere
- KBCTS_JAWS_GROUP=kbase
Expand Down

0 comments on commit 1fb9a4e

Please sign in to comment.