From f8f43a9072cfd22e5e30f53e90478f5d8ec6e8e9 Mon Sep 17 00:00:00 2001 From: bloodearnest Date: Wed, 24 Apr 2024 15:56:59 +0100 Subject: [PATCH] Add a cli command to backfill manifests for all workspaces I have manually tested this using the db from the test backend. I have not included automated tests, as a) this is a one off migration command, b) they would be quite hard to write and c) this is an offline process that is not mission critical --- jobrunner/cli/manifests.py | 96 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 jobrunner/cli/manifests.py diff --git a/jobrunner/cli/manifests.py b/jobrunner/cli/manifests.py new file mode 100644 index 00000000..a419d190 --- /dev/null +++ b/jobrunner/cli/manifests.py @@ -0,0 +1,96 @@ +""" +Ops utility for backfilling manifest.json files from db +""" +import argparse + +from jobrunner.executors import local +from jobrunner.lib import database +from jobrunner.models import Job + + +def main(workspaces=None): + conn = database.get_connection() + workspaces = [ + w["workspace"] for w in conn.execute("SELECT DISTINCT(workspace) FROM job;") + ] + + n_workspaces = len(workspaces) + for i, workspace in enumerate(workspaces): + print(f"workspace {i+1}/{n_workspaces}: {workspace}") + + level4_dir = local.get_medium_privacy_workspace(workspace) + + sentinel = level4_dir / ".manifest-backfill" + if sentinel.exists(): + print(" - already done, skipping") + continue + + write_manifest(workspace) + + sentinel.touch() + + +def write_manifest(workspace): + conn = database.get_connection() + workspace_dir = local.get_high_privacy_workspace(workspace) + level4_dir = local.get_medium_privacy_workspace(workspace) + + job_ids = conn.execute( + """ + SELECT id FROM job + WHERE workspace = ? + AND outputs != '' + AND completed_at IS NOT NULL + AND state = 'succeeded' + ORDER BY completed_at DESC; + """, + (workspace,), + ) + + outputs = {} + + for row in job_ids: + job_id = row["id"] + job = database.find_one(Job, id=job_id) + + for output, level in job.outputs.items(): + if output in outputs: + # older version of the file, ingnore + continue + + abspath = workspace_dir / output + + # testing only + abspath.parent.mkdir(parents=True, exist_ok=True) + abspath.touch() + + # use presence of message file to detect excluded files + message_file = level4_dir / (output + ".txt") + excluded = message_file.exists() + + metadata = local.get_output_metadata( + abspath, + level, + job_id=job_id, + job_request=job.job_request_id, + action=job.action, + commit=job.commit, + excluded=excluded, + ) + + outputs[output] = metadata + + manifest = local.read_manifest_file(level4_dir, workspace) + manifest["outputs"] = outputs + print(f" - writing manifest for {workspace} with {len(outputs)} outputs") + local.write_manifest_file(level4_dir, manifest) + + +def run(): + parser = argparse.ArgumentParser(description=__doc__.partition("\n\n")[0]) + args = parser.parse_args() + main(**vars(args)) + + +if __name__ == "__main__": + run()