Skip to content

Commit

Permalink
add GoB cloning commands
Browse files Browse the repository at this point in the history
  • Loading branch information
gunjanj007 committed Nov 21, 2024
1 parent 12a6b80 commit 3db020e
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 47 deletions.
35 changes: 32 additions & 3 deletions dags/map_reproducibility/aotc_reproducibility.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,11 @@ def set_variables_cmds():
"export CLUSTER_REGION=australia-southeast1",
"NOW=$(date +%s)",
"export BUCKET_NAME=regression-testing-xlml",
"export JOB_NAME=gpt3-xlml-$NOW-175b-nemo",
)
return set_variables


def set_project_commands():
def configure_project_and_cluster():
set_project_command = (
"gcloud config set project $PROJECT",
"sudo chown -R airflow:airflow /home/airflow/composer_kube_config",
Expand All @@ -38,6 +37,26 @@ def set_project_commands():
)
return set_project_command

def git_cookie_authdaemon():
  """Return the shell commands that start git-cookie-authdaemon.

  The commands clone the gcompute-tools repository and run its
  git-cookie-authdaemon script; this is required to get auth to access
  the internal GoB repo.

  Returns:
    A tuple of shell command strings, in execution order.
  """
  return (
      # Fetch the tooling repo that ships the auth daemon script.
      "git clone https://gerrit.googlesource.com/gcompute-tools",
      "echo 'trying to run git-cookie-authdaemon'",
      # Path is relative to the directory the clone ran in.
      "./gcompute-tools/git-cookie-authdaemon",
  )

def clone_gob():
  """Return the shell commands that clone the GoB recipes repository.

  After cloning, the commands change into
  reproducible-benchmark-recipes/projects/gpu-recipes, so commands that
  follow in the same shell session start from that directory.

  Returns:
    A tuple of shell command strings, in execution order.
  """
  repo_name = "reproducible-benchmark-recipes"
  repo_url = (
      "https://ai-hypercomputer-benchmarks.googlesource.com/" + repo_name
  )
  return (
      "echo 'trying to clone GoB repo from outside'",
      f"git clone {repo_url}",
      # Two separate `cd` steps, exactly as in the original command list.
      f"cd {repo_name}/projects",
      "cd gpu-recipes",
  )


def install_helm_cmds():
install_helm_cmd = (
Expand All @@ -57,10 +76,20 @@ def namespace_cmds():
namespace = (
"kubectl config view | grep namespace",
"kubectl config set-context --current --namespace=default",
"kubectl config set-context heml --namespace=default",
"kubectl config set-context helm --namespace=default",
)
return namespace

def clone_gob_cmds():
  """Return the shell commands that clone the recipes repo over sso://.

  The commands install the git-remote-google package and then clone
  reproducible-benchmark-recipes through an sso:// URL.
  NOTE(review): presumably git-remote-google is what makes the sso://
  scheme usable by git -- confirm against the package documentation.

  Returns:
    A tuple of shell command strings, in execution order.
  """
  return (
      # Install once (the original ran both `apt install` and
      # `apt-get install` for the same package). `-y` keeps the install
      # from blocking on a confirmation prompt when run unattended.
      "sudo apt-get install -y git-remote-google",
      "git clone sso://ai-hypercomputer-benchmarks/reproducible-benchmark-recipes",
  )


def wait_for_jobs_cmds():
wait_for_job = (
Expand Down
91 changes: 47 additions & 44 deletions dags/map_reproducibility/nemo_gpt3.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,63 +21,66 @@
from dags import composer_env
from dags.map_reproducibility.aotc_reproducibility import get_metrics_cmds
from dags.map_reproducibility.aotc_reproducibility import set_variables_cmds
from dags.map_reproducibility.aotc_reproducibility import set_project_commands
from dags.map_reproducibility.aotc_reproducibility import configure_project_and_cluster
from dags.map_reproducibility.aotc_reproducibility import install_helm_cmds
from dags.map_reproducibility.aotc_reproducibility import namespace_cmds
from dags.map_reproducibility.aotc_reproducibility import wait_for_jobs_cmds
from dags.map_reproducibility.aotc_reproducibility import copy_bucket_cmds
from dags.map_reproducibility.aotc_reproducibility import cleanup_cmds
from dags.map_reproducibility.aotc_reproducibility import git_cookie_authdaemon
from dags.map_reproducibility.aotc_reproducibility import clone_gob

# Run once a day at 2 pm UTC (6 am PST)
SCHEDULED_TIME = "0 14 * * *" if composer_env.is_prod_env() else None


@task
def run_aotc_workload():
  """Run the GPT3-175B NeMo recipe as one chained bash command.

  All command tuples are joined with ";" and executed through a single
  `bash -c` invocation, so environment variables and the working
  directory set by earlier commands are visible to later ones.

  Raises:
    AssertionError: if the combined bash command exits with a non-zero
      exit code.
  """
  # clone_gob() ends with `cd gpu-recipes`, so the current directory is
  # used as the repository root here.
  gpu_recipe_cmd = (
      "export REPO_ROOT=`pwd`",
      "export RECIPE_ROOT="
      "$REPO_ROOT/training/a3mega/gpt3-175b/nemo-pretraining-gke",
      "cd $RECIPE_ROOT",
  )

  helm_cmds = (
      "CONFIG_FILE=$REPO_ROOT/src/frameworks"
      "/nemo-configs/gpt3-175b-256gpus-fp8.yaml",
      # $NOW is expected to be set earlier in the same shell session by
      # set_variables_cmds().
      "export JOB_NAME=gpt3-xlml-$NOW-175b-nemo",
      " helm install -f values.yaml "
      "--namespace default "
      "--set namespace=default"
      " --set-file nemo_config"
      "=$CONFIG_FILE"
      " --set workload.image"
      "=us-central1-docker.pkg.dev/"
      "supercomputer-testing/gunjanjalori/nemo_test/nemo_workload:24.07"
      " --set workload.gcsBucketForDataCataPath=$BUCKET_NAME"
      " $JOB_NAME $REPO_ROOT/src/helm-charts/nemo-training",
  )

  # Order matters: later commands rely on variables and directories
  # established by earlier ones.
  all_cmds = (
      set_variables_cmds()
      + configure_project_and_cluster()
      + git_cookie_authdaemon()
      + clone_gob()
      + gpu_recipe_cmd
      + install_helm_cmds()
      + namespace_cmds()
      + helm_cmds
      + wait_for_jobs_cmds()
      + copy_bucket_cmds()
      + get_metrics_cmds()
      + cleanup_cmds()
  )

  hook = SubprocessHook()
  result = hook.run_command(["bash", "-c", ";".join(all_cmds)])
  assert result.exit_code == 0, f"Command failed with code {result.exit_code}"


with models.DAG(
Expand All @@ -94,4 +97,4 @@ def run_aotc_workload():
start_date=datetime.datetime(2024, 11, 15),
catchup=False,
) as dag:
run_aotc_workload()
run_aotc_workload()

0 comments on commit 3db020e

Please sign in to comment.