diff --git a/dags/map_reproducibility/aotc_reproducibility.py b/dags/map_reproducibility/aotc_reproducibility.py
index eff156ab..8c20f73c 100644
--- a/dags/map_reproducibility/aotc_reproducibility.py
+++ b/dags/map_reproducibility/aotc_reproducibility.py
@@ -24,12 +24,11 @@ def set_variables_cmds():
       "export CLUSTER_REGION=australia-southeast1",
       "NOW=$(date +%s)",
       "export BUCKET_NAME=regression-testing-xlml",
-      "export JOB_NAME=gpt3-xlml-$NOW-175b-nemo",
   )
   return set_variables
 
 
-def set_project_commands():
+def configure_project_and_cluster():
   set_project_command = (
       "gcloud config set project $PROJECT",
       "sudo chown -R airflow:airflow /home/airflow/composer_kube_config",
@@ -38,6 +37,26 @@ def set_project_commands():
   )
   return set_project_command
 
+# This is required to get auth to access
+# internal GoB repo
+def git_cookie_authdaemon():
+  auth_cmds = (
+      "git clone https://gerrit.googlesource.com/gcompute-tools",
+      "echo 'trying to run git-cookie-authdaemon'",
+      "./gcompute-tools/git-cookie-authdaemon",
+  )
+  return auth_cmds
+
+
+def clone_gob():
+  gob_clone_cmds = (
+      "echo 'trying to clone GoB repo from outside'",
+      "git clone https://ai-hypercomputer-benchmarks.googlesource.com/"
+      "reproducible-benchmark-recipes",
+      "cd reproducible-benchmark-recipes/projects",
+      "cd gpu-recipes",
+  )
+  return gob_clone_cmds
 
 def install_helm_cmds():
   install_helm_cmd = (
@@ -57,10 +76,20 @@ def namespace_cmds():
   namespace = (
       "kubectl config view | grep namespace",
       "kubectl config set-context --current --namespace=default",
-      "kubectl config set-context heml --namespace=default",
+      "kubectl config set-context helm --namespace=default",
   )
   return namespace
 
+def clone_gob_cmds():
+  gob_cmds = (
+      # "git clone https://gerrit.googlesource.com/gcompute-tools",
+      # "./gcompute-tools/git-cookie-authdaemon",
+      "sudo apt install git-remote-google",
+      "sudo apt-get install git-remote-google",
+      "git clone sso://ai-hypercomputer-benchmarks/reproducible-benchmark-recipes",
+  )
+  return gob_cmds
+
 
 def wait_for_jobs_cmds():
   wait_for_job = (
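Note on the helpers added above: each one returns a plain tuple of shell commands, so ordering is expressed by tuple concatenation and the result is later joined into a single `bash -c` string (see `nemo_gpt3.py` below). A minimal sketch of that contract; `preview_cmds` is a hypothetical illustration, not a function in this patch:

```python
from dags.map_reproducibility.aotc_reproducibility import (
    clone_gob,
    git_cookie_authdaemon,
)


def preview_cmds():
  # Hypothetical helper: concatenation order matters here, since the
  # cookie daemon must be running before the GoB clone is attempted.
  cmds = git_cookie_authdaemon() + clone_gob()
  # The DAG joins the commands with ";" into one bash invocation.
  return ";".join(cmds)


print(preview_cmds())
```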
"$REPO_ROOT/training/a3mega/gpt3-175b/nemo-pretraining-gke", - "cd $RECIPE_ROOT", - ) + gpu_recipe_cmd = ( + "export REPO_ROOT=`pwd`", + "export RECIPE_ROOT=" + "$REPO_ROOT/training/a3mega/gpt3-175b/nemo-pretraining-gke", + "cd $RECIPE_ROOT", + ) - helm_cmds = ( - "CONFIG_FILE=$REPO_ROOT/src/frameworks" - "/nemo-configs/gpt3-175b-256gpus-fp8.yaml", - " helm install -f values.yaml " - "--namespace default " - "--set namespace=default" - " --set-file nemo_config" - "=$CONFIG_FILE" - " --set workload.image" - "=us-central1-docker.pkg.dev/" - "supercomputer-testing/gunjanjalori/nemo_test/nemo_workload:24.07" - " --set workload.gcsBucketForDataCataPath=$BUCKET_NAME" - " $JOB_NAME $REPO_ROOT/src/helm-charts/nemo-training", - ) + helm_cmds = ( + "CONFIG_FILE=$REPO_ROOT/src/frameworks" + "/nemo-configs/gpt3-175b-256gpus-fp8.yaml", + "export JOB_NAME=gpt3-xlml-$NOW-175b-nemo", + " helm install -f values.yaml " + "--namespace default " + "--set namespace=default" + " --set-file nemo_config" + "=$CONFIG_FILE" + " --set workload.image" + "=us-central1-docker.pkg.dev/" + "supercomputer-testing/gunjanjalori/nemo_test/nemo_workload:24.07" + " --set workload.gcsBucketForDataCataPath=$BUCKET_NAME" + " $JOB_NAME $REPO_ROOT/src/helm-charts/nemo-training", + ) - hook = SubprocessHook() - result = hook.run_command( - [ - "bash", - "-c", - ";".join( - set_variables_cmds() - + set_project_commands() - + gpu_recipe_cmd - + install_helm_cmds() - + namespace_cmds() - + helm_cmds - + wait_for_jobs_cmds() - + copy_bucket_cmds() - + get_metrics_cmds() - + cleanup_cmds() - ), - ], - ) - assert result.exit_code == 0, f"Command failed with code {result.exit_code}" + hook = SubprocessHook() + result = hook.run_command( + [ + "bash", + "-c", + ";".join( + set_variables_cmds() + + configure_project_and_cluster() + + git_cookie_authdaemon() + + clone_gob() + + gpu_recipe_cmd + + install_helm_cmds() + + namespace_cmds() + + helm_cmds + + wait_for_jobs_cmds() + + copy_bucket_cmds() + + get_metrics_cmds() + + cleanup_cmds() + ), + ], + ) + assert result.exit_code == 0, f"Command failed with code {result.exit_code}" with models.DAG( @@ -94,4 +97,4 @@ def run_aotc_workload(): start_date=datetime.datetime(2024, 11, 15), catchup=False, ) as dag: - run_aotc_workload() + run_aotc_workload()