Adds Ray Workflow: Multiple Run Support, Distributed Hyperparameter Tuning, and Consistent Setup Across Local/Cloud #1301

Open
wants to merge 132 commits into base: main
Changes from 82 commits (132 commits total)
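The headline feature is distributed hyperparameter tuning driven by Ray Tune with an Optuna search algorithm and MLflow logging (see the isaac_ray_tune.py, optuna, and MLflow commits below). As a minimal, hypothetical sketch of the Ray Tune pattern the workflow builds on; the trainable, metric name, and search space here are illustrative placeholders, not the shipped configuration:

from ray import tune
from ray.tune.search.optuna import OptunaSearch

def train_policy(config: dict) -> dict:
    # In the real workflow each trial launches an Isaac Lab training run;
    # here a score is simply faked from the sampled hyperparameters.
    score = -((config["lr"] - 3e-4) ** 2) + config["num_envs"] * 1e-6
    return {"reward": score}

tuner = tune.Tuner(
    train_policy,
    param_space={
        "lr": tune.loguniform(1e-5, 1e-2),
        "num_envs": tune.choice([512, 1024, 2048]),
    },
    tune_config=tune.TuneConfig(
        search_alg=OptunaSearch(),
        metric="reward",
        mode="max",
        num_samples=8,
    ),
)
results = tuner.fit()
print(results.get_best_result().config)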
67122dc
start
garylvov Sep 27, 2024
2d207b5
add feature extraction
glvov-bdai Sep 24, 2024
1aa2832
blank
garylvov Sep 27, 2024
e4c395f
further
glvov-bdai Sep 27, 2024
c53d987
add args
glvov-bdai Sep 30, 2024
50862cc
Merge branch 'isaac-sim:main' into feature/hyperparam_tune
glvov-bdai Oct 1, 2024
905bed1
formatting
glvov-bdai Oct 1, 2024
6909501
tweaks
glvov-bdai Oct 2, 2024
2577827
fix
glvov-bdai Oct 3, 2024
c21b2f5
allow jobs to actually get scheduled
glvov-bdai Oct 4, 2024
ceba315
add dockerfile
glvov-bdai Oct 4, 2024
7771439
formatting
glvov-bdai Oct 4, 2024
6563e1f
tweaks
glvov-bdai Oct 4, 2024
b27092f
get gcp cluster working with ray, and isaac
glvov-bdai Oct 7, 2024
b94fe87
make bash command consistent
glvov-bdai Oct 7, 2024
9a525ec
tweaks
glvov-bdai Oct 7, 2024
e6e9f85
formatting
glvov-bdai Oct 8, 2024
1187ca0
formatting
glvov-bdai Oct 8, 2024
83cb89a
fix argparser
glvov-bdai Oct 8, 2024
1885d0b
formatting
glvov-bdai Oct 8, 2024
653b8ae
cleanup command
glvov-bdai Oct 8, 2024
dc9fb3f
start argparser
glvov-bdai Oct 8, 2024
5f9f0dd
sync
glvov-bdai Oct 8, 2024
3cde9e4
Merge branch 'isaac-sim:main' into feature/hyperparam_tune
glvov-bdai Oct 8, 2024
db975bc
formatting
glvov-bdai Oct 8, 2024
c80d278
cherrypick ResNet Cart from PR
glvov-bdai Oct 8, 2024
7fd0169
add extra point in readme
glvov-bdai Oct 8, 2024
4dd48b1
add note about saving
glvov-bdai Oct 8, 2024
873ea54
fixes
glvov-bdai Oct 8, 2024
93fbff3
Merge branch 'isaac-sim:main' into feature/hyperparam_tune
glvov-bdai Oct 9, 2024
db88054
improve grokking ;)
glvov-bdai Oct 9, 2024
79b7c83
fix
glvov-bdai Oct 9, 2024
c81c2a3
formatting
glvov-bdai Oct 10, 2024
05644a4
Merge branch 'isaac-sim:main' into feature/hyperparam_tune
garylvov Oct 11, 2024
805c80b
Update README.md
garylvov Oct 11, 2024
aaf4d85
Revert "add feature extraction" | Don't need this for core stuff
glvov-bdai Oct 11, 2024
a01c1f4
Merge branch 'feature/hyperparam_tune' of https://github.com/glvov-bd…
glvov-bdai Oct 11, 2024
dad9a12
a life of merge conflicts for me
glvov-bdai Oct 11, 2024
d9a95d8
more merge conflict fixes lol
glvov-bdai Oct 11, 2024
cb01f19
update
glvov-bdai Oct 12, 2024
9970433
improve grokking
glvov-bdai Oct 13, 2024
56a57b4
split out cfg
glvov-bdai Oct 15, 2024
506c4b1
tweaks
glvov-bdai Oct 15, 2024
01a00ca
Merge branch 'isaac-sim:main' into feature/hyperparam_tune
glvov-bdai Oct 15, 2024
bc1a96a
bare bones
glvov-bdai Oct 15, 2024
d4f36b6
Merge branch 'feature/hyperparam_tune' of https://github.com/glvov-bd…
glvov-bdai Oct 15, 2024
9f029dd
tune bare bones, not sure if actually works yet
glvov-bdai Oct 17, 2024
a7d5f77
add repeater
glvov-bdai Oct 17, 2024
44777ab
Merge branch 'isaac-sim:main' into feature/hyperparam_tune
glvov-bdai Oct 21, 2024
d63fc37
further
glvov-bdai Oct 21, 2024
bd578e6
parse unknown
glvov-bdai Oct 22, 2024
e2e9f8d
add variable argparser
glvov-bdai Oct 22, 2024
71be507
shape up resource alloc
glvov-bdai Oct 22, 2024
d11232b
formatting
glvov-bdai Oct 22, 2024
f1460d0
QOL tweaks
glvov-bdai Oct 22, 2024
b0e63fd
basic local works
glvov-bdai Oct 22, 2024
85cd04b
Add live logging
glvov-bdai Oct 22, 2024
33b6794
Merge branch 'isaac-sim:main' into feature/hyperparam_tune
glvov-bdai Oct 22, 2024
6566218
little cleanup
glvov-bdai Oct 22, 2024
b4203b0
Merge branch 'feature/hyperparam_tune' of https://github.com/glvov-bd…
glvov-bdai Oct 22, 2024
25610b6
formatting
glvov-bdai Oct 22, 2024
f18c492
readme cleanup
glvov-bdai Oct 23, 2024
3edeab2
clarification
glvov-bdai Oct 23, 2024
93d92ef
add multi-cluster submission example
glvov-bdai Oct 23, 2024
b25f374
tweaks
glvov-bdai Oct 23, 2024
c4fa948
add per trial resources
glvov-bdai Oct 23, 2024
e6fdc38
slight cleanup
glvov-bdai Oct 24, 2024
065c130
readme update
glvov-bdai Oct 24, 2024
e0da556
Merge branch 'isaac-sim:main' into feature/hyperparam_tune
glvov-bdai Oct 24, 2024
9799015
optuna'
glvov-bdai Oct 24, 2024
7177de1
basic functionality works
glvov-bdai Oct 25, 2024
15a98ed
IsaacRay-v0 to the moon ;-)
glvov-bdai Oct 25, 2024
46d38aa
Merge branch 'isaac-sim:main' into feature/hyperparam_tune
glvov-bdai Oct 25, 2024
ddb1ea3
start conversion from readme to rst
glvov-bdai Oct 25, 2024
6f4627b
improve docs
glvov-bdai Oct 26, 2024
50a7e00
add heterogeneous cluster support
glvov-bdai Oct 28, 2024
62f8d15
add heterogeneous cluster support
glvov-bdai Oct 28, 2024
abb7002
convert more from readme to rst
glvov-bdai Oct 28, 2024
4196c8e
formatting
glvov-bdai Oct 28, 2024
551003b
Merge branch 'isaac-sim:main' into feature/hyperparam_tune
glvov-bdai Oct 28, 2024
8159106
move testing to individual steps
glvov-bdai Oct 28, 2024
e01310f
finish converting readme to rst
glvov-bdai Oct 28, 2024
0c20d44
Update source/standalone/workflows/ray/cluster_configs/Dockerfile
glvov-bdai Oct 29, 2024
cac728b
formatting and placement group
glvov-bdai Oct 29, 2024
884e04c
updates
glvov-bdai Oct 30, 2024
f5e94a4
clean up
glvov-bdai Oct 30, 2024
9dbf3f7
tweaks don't remember what lol
glvov-bdai Oct 30, 2024
8f63eee
fix cnn config and start of experiments
glvov-bdai Oct 31, 2024
352bde8
substitute bad google bucket hack with awesome mlflow logging ;)
glvov-bdai Oct 31, 2024
770e834
update docstrings and condense documentation
glvov-bdai Oct 31, 2024
03f193a
formatting and cleanup
glvov-bdai Oct 31, 2024
8a9d434
formatting and script to download results
glvov-bdai Nov 1, 2024
0b7ccb7
Tuning works on remote, with MLFLow, can vary CNN, MLP, and env count…
glvov-bdai Nov 1, 2024
adc78e1
add comment about ranges
glvov-bdai Nov 1, 2024
ac760f6
Merge branch 'main' into feature/hyperparam_tune
glvov-bdai Nov 1, 2024
1cc9d1c
remove spaces from f strings (my local pre-commit refused to catch th…
glvov-bdai Nov 1, 2024
5b99aec
correct port
glvov-bdai Nov 1, 2024
5bf15b4
decrease MLP/CNN sizes in tune to prevent out of memory issues
glvov-bdai Nov 1, 2024
a42f708
formatting
glvov-bdai Nov 1, 2024
d08733e
Merge branch 'main' into feature/hyperparam_tune
garylvov Nov 1, 2024
369fbdc
Fix install cmd
garylvov Nov 2, 2024
7dd0916
fix cmd line arg example for tune
garylvov Nov 2, 2024
81aed36
fix local tune, generalize job cfg to several workflows
garylvov Nov 2, 2024
fef135f
disable explicit checkpointing for local
garylvov Nov 2, 2024
4612baa
allow for local parallel tuning jobs
garylvov Nov 2, 2024
40d7441
fix indent level on doc
glvov-bdai Nov 3, 2024
a754a23
Merge branch 'feature/hyperparam_tune' of https://github.com/glvov-bd…
glvov-bdai Nov 3, 2024
371940e
fix tensorboard cmd in docs
glvov-bdai Nov 3, 2024
bb17a37
fix code block for extra deps
glvov-bdai Nov 3, 2024
5452e9e
Fix dockerfile
garylvov Nov 4, 2024
f9b1f40
fix error in documentation discovered during tutorial vid
glvov-bdai Nov 4, 2024
9800132
Merge branch 'feature/hyperparam_tune' of https://github.com/glvov-bd…
glvov-bdai Nov 4, 2024
0f6c484
Update docs/source/features/ray.rst
garylvov Nov 5, 2024
2d5e14d
Update source/standalone/workflows/ray/isaac_ray_tune.py
garylvov Nov 5, 2024
7c56a6b
Update source/standalone/workflows/ray/isaac_ray_tune.py
garylvov Nov 5, 2024
acd03c1
Update source/standalone/workflows/ray/isaac_ray_tune.py
garylvov Nov 5, 2024
75c4f49
Update docs/source/features/ray.rst
garylvov Nov 5, 2024
8311bc6
Update source/standalone/workflows/ray/isaac_ray_tune.py
garylvov Nov 5, 2024
9302017
Update source/standalone/workflows/ray/isaac_ray_tune.py
garylvov Nov 5, 2024
34afd2a
address james' comments
garylvov Nov 5, 2024
70aa958
delete old file and fix imports
garylvov Nov 5, 2024
5ba620d
format
garylvov Nov 5, 2024
80b5df5
change top level to be caps'
garylvov Nov 5, 2024
5027656
fix docstrings and typos
garylvov Nov 5, 2024
b293b48
Merge branch 'main' into feature/hyperparam_tune
glvov-bdai Nov 5, 2024
aa92a9d
fix weird bolding thing
glvov-bdai Nov 5, 2024
34f0908
fix emphasize lines and included files in rst
glvov-bdai Nov 5, 2024
0856cd6
Merge branch 'main' into feature/hyperparam_tune
glvov-bdai Nov 7, 2024
7cc587a
Merge branch 'main' into feature/hyperparam_tune
glvov-bdai Nov 8, 2024
a38de8e
Merge branch 'main' into feature/hyperparam_tune
garylvov Nov 18, 2024
30a63ff
Merge branch 'main' into feature/hyperparam_tune
glvov-bdai Nov 22, 2024
ad8161d
Merge branch 'main' into feature/hyperparam_tune
glvov-bdai Nov 25, 2024
1 change: 1 addition & 0 deletions docs/index.rst
@@ -97,6 +97,7 @@ Table of Contents
source/features/hydra
source/features/multi_gpu
source/features/tiled_rendering
source/features/ray
source/features/reproducibility

.. toctree::
395 changes: 395 additions & 0 deletions docs/source/features/ray.rst

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions source/standalone/workflows/ray/cluster_configs/Dockerfile
@@ -0,0 +1,10 @@
# This dockerfile only works because of Felix Yu's help #TODO: Add Felix to contributors list
FROM isaac-lab-base:latest
ENV PATH="/usr/local/nvidia/bin:$PATH"
ENV LD_LIBRARY_PATH="/usr/local/nvidia/lib64"
RUN ln -sf /usr/local/nvidia/bin/nvidia* /usr/bin
RUN /workspace/isaaclab/_isaac_sim/python.sh -m pip install "ray[default, tune]"==2.31.0 && \
sed -i "1i $(echo "#!/workspace/isaaclab/_isaac_sim/python.sh")" \
/isaac-sim/kit/python/bin/ray && ln -s /isaac-sim/kit/python/bin/ray /usr/local/bin/ray
# The following is only needed for tuning
RUN /workspace/isaaclab/_isaac_sim/python.sh -m pip install optuna bayesian-optimization
@@ -0,0 +1,4 @@
# Ray on Google Cloud with Isaac Lab

For more info, see
https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/ray-on-gke/README.md
@@ -0,0 +1,177 @@
# Jinja is used for templating here because a full Helm setup would be excessive for this application
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
name: {{ name }}
namespace: {{ namespace }}
spec:
rayVersion: "2.8.0"
enableInTreeAutoscaling: true
autoscalerOptions:
upscalingMode: Default
idleTimeoutSeconds: 120
imagePullPolicy: Always
securityContext: {}
envFrom: []

headGroupSpec:
rayStartParams:
block: "true"
dashboard-host: 0.0.0.0
dashboard-port: "8265"
node-ip-address: "0.0.0.0"
port: "6379"
include-dashboard: "true"
ray-debugger-external: "true"
object-manager-port: "8076"
num-gpus: "0"
num-cpus: "0" # prevent scheduling jobs to the head node - workers only
headService:
apiVersion: v1
kind: Service
metadata:
name: head
spec:
type: LoadBalancer
template:
metadata:
labels:
app.kubernetes.io/instance: tuner
app.kubernetes.io/name: kuberay
cloud.google.com/gke-ray-node-type: head
spec:
serviceAccountName: {{ service_account_name }}
affinity: {}
securityContext:
fsGroup: 100
containers:
- env:
- name: GOOGLE_APPLICATION_CREDENTIALS
value: "/var/secrets/{{secret_name}}/key.json"
image: {{ image }}
imagePullPolicy: Always
name: head
resources:
limits:
cpu: "{{ num_head_cpu }}"
memory: {{ head_ram_gb }}G
nvidia.com/gpu: "0"
requests:
cpu: "{{ num_head_cpu }}"
memory: {{ head_ram_gb }}G
nvidia.com/gpu: "0"
securityContext: {}
volumeMounts:
- mountPath: /tmp/ray
name: ray-logs
- mountPath: /var/secrets/{{secret_name}}
name: {{secret_name}}
readOnly: true
command: ["/bin/bash", "-c", "ray start --head --port=6379 --object-manager-port=8076 --dashboard-host=0.0.0.0 --dashboard-port=8265 --include-dashboard=true && tail -f /dev/null"]
- image: fluent/fluent-bit:1.9.6
name: fluentbit
resources:
limits:
cpu: 100m
memory: 128Mi
requests:
cpu: 100m
memory: 128Mi
volumeMounts:
- mountPath: /tmp/ray
name: ray-logs
imagePullSecrets: []
nodeSelector:
iam.gke.io/gke-metadata-server-enabled: "true"
volumes:
- configMap:
name: fluentbit-config
name: fluentbit-config
- name: ray-logs
emptyDir: {}
- name: {{secret_name}}
secret:
secretName: {{secret_name}}

workerGroupSpecs:
{% for it in range(gpu_per_worker|length) %}
- groupName: "{{ worker_accelerator[it] }}x{{ gpu_per_worker[it] }}-cpu-{{ cpu_per_worker[it] }}-ram-gb-{{ ram_gb_per_worker[it] }}"
replicas: {{ num_workers[it] }}
maxReplicas: {{ num_workers[it] }}
minReplicas: {{ num_workers[it] }}
rayStartParams:
block: "true"
ray-debugger-external: "true"
replicas: "{{num_workers[it]}}"
template:
metadata:
annotations: {}
labels:
app.kubernetes.io/instance: tuner
app.kubernetes.io/name: kuberay
cloud.google.com/gke-ray-node-type: worker
spec:
serviceAccountName: {{ service_account_name }}
affinity: {}
securityContext:
fsGroup: 100
containers:
- env:
- name: GOOGLE_APPLICATION_CREDENTIALS
value: "/var/secrets/{{secret_name}}/key.json"
- name: NVIDIA_VISIBLE_DEVICES
value: "all"
- name: NVIDIA_DRIVER_CAPABILITIES
value: "compute,utility"

image: {{ image }}
imagePullPolicy: Always
name: ray-worker
resources:
limits:
cpu: "{{ cpu_per_worker[it] }}"
memory: {{ ram_gb_per_worker[it] }}G
nvidia.com/gpu: "{{ gpu_per_worker[it] }}"
requests:
cpu: "{{ cpu_per_worker[it] }}"
memory: {{ ram_gb_per_worker[it] }}G
nvidia.com/gpu: "{{ gpu_per_worker[it] }}"
securityContext: {}
volumeMounts:
- mountPath: /tmp/ray
name: ray-logs
- mountPath: /var/secrets/{{secret_name}}
name: {{secret_name}}
readOnly: true
command: ["/bin/bash", "-c", "ray start --address=head.{{ namespace }}.svc.cluster.local:6379 && tail -f /dev/null"]
- image: fluent/fluent-bit:1.9.6
name: fluentbit
resources:
limits:
cpu: 100m
memory: 128Mi
requests:
cpu: 100m
memory: 128Mi
volumeMounts:
- mountPath: /tmp/ray
name: ray-logs

imagePullSecrets: []
nodeSelector:
cloud.google.com/gke-accelerator: {{ worker_accelerator[it] }}
iam.gke.io/gke-metadata-server-enabled: "true"
tolerations:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"
volumes:
- configMap:
name: fluentbit-config
name: fluentbit-config
- name: ray-logs
emptyDir: {}
- name: {{secret_name}}
secret:
secretName: {{secret_name}}
{% endfor %}
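The template above is parameterized with plain Jinja variables: cluster name, namespace, container image, service account, secret, head-node sizing, and per-worker-group lists for accelerator type, GPU/CPU/RAM counts, and replica counts. A minimal sketch of rendering it with jinja2 follows; the file name and parameter values are illustrative assumptions, and the PR provides its own launcher script for this step.

# Hypothetical rendering sketch; the values below are placeholders, not the PR's defaults.
import jinja2

env = jinja2.Environment(loader=jinja2.FileSystemLoader("."), undefined=jinja2.StrictUndefined)
template = env.get_template("kuberay.yaml.jinja")  # assumed local copy of the template above

manifest = template.render(
    name="isaacray",
    namespace="default",
    image="us-docker.pkg.dev/my-project/isaac/isaac-lab-ray:latest",
    service_account_name="isaac-ray-sa",
    secret_name="bucket-access",
    num_head_cpu=8,
    head_ram_gb=16,
    # one entry per heterogeneous worker group
    worker_accelerator=["nvidia-l4"],
    gpu_per_worker=[1],
    cpu_per_worker=[8],
    ram_gb_per_worker=[32],
    num_workers=[2],
)
with open("isaacray-cluster.yaml", "w") as f:
    f.write(manifest)
# then: kubectl apply -f isaacray-cluster.yaml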
180 changes: 180 additions & 0 deletions source/standalone/workflows/ray/grok_cluster_with_kubectl.py
@@ -0,0 +1,180 @@
# Copyright (c) 2022-2024, The Isaac Lab Project Developers.
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause

import argparse
import os
import re
import subprocess
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

"""
This script requires that kubectl is installed and KubeRay was used to create the cluster.

Creates a config file containing ``name: <NAME> address: http://<IP>:<PORT>`` on
a new line for each cluster.

Usage:

.. code-block:: bash


./isaaclab.sh -p source/standalone/workflows/ray/grok_cluster_with_kubectl.py

# For options, supply -h arg
"""


def get_pods(namespace: str = "default") -> list[tuple]:
cmd = ["kubectl", "get", "pods", "-n", namespace, "--no-headers"]
output = subprocess.check_output(cmd).decode()
pods = []
for line in output.strip().split("\n"):
fields = line.split()
pod_name = fields[0]
status = fields[2]
pods.append((pod_name, status))
return pods


def get_clusters(pods: list, cluster_name_prefix: str) -> list[str]:
clusters = set()
# Modify regex pattern to match the entire structure including `-head` or `-worker`
for pod_name, _ in pods:
match = re.match(r"(" + re.escape(cluster_name_prefix) + r"[-\w]+)", pod_name)
if match:
clusters.add(match.group(1).split("-head")[0].split("-worker")[0])
return sorted(clusters)


def check_clusters_running(pods: list, clusters: set) -> bool:
clusters_running = True
for cluster in clusters:
cluster_pods = [p for p in pods if p[0].startswith(cluster)]
total_pods = len(cluster_pods)
running_pods = len([p for p in cluster_pods if p[1] == "Running"])
if running_pods != total_pods:
clusters_running = False
break
return clusters_running


def get_ray_address(head_pod: str, namespace: str = "default", ray_head_name: str = "head") -> str | None:
cmd = ["kubectl", "logs", head_pod, "-c", ray_head_name, "-n", namespace]
try:
output = subprocess.check_output(cmd).decode()
except subprocess.CalledProcessError as e:
raise ValueError(
f"Could not enter head container with cmd {cmd}: {e}Perhaps try a different namespace or ray head name."
)
match = re.search(r"RAY_ADDRESS='([^']+)'", output)
if match:
return match.group(1)
else:
return None


def process_cluster(cluster_info: dict, ray_head_name: str = "head") -> str:
cluster, pods, namespace = cluster_info
head_pod = None
for pod_name, status in pods:
if pod_name.startswith(cluster + "-head"):
head_pod = pod_name
break
if not head_pod:
return f"Error: Could not find head pod for cluster {cluster}\n"

# Get RAY_ADDRESS and status
ray_address = get_ray_address(head_pod, namespace=namespace, ray_head_name=ray_head_name)
if not ray_address:
return f"Error: Could not find RAY_ADDRESS for cluster {cluster}\n"
output_line = ( # num_cpu: {num_cpu} num_gpu: {num_gpu} ram_gb: {ram_gb} total_workers: {total_workers}\n"
f"name: {cluster} address: {ray_address} \n"
)
return output_line


def main():
# Parse command-line arguments
parser = argparse.ArgumentParser(description="Process Ray clusters and save their specifications.")
parser.add_argument("--prefix", default="isaacray", help="The prefix for the cluster names.")
parser.add_argument("--output", default="~/.cluster_config", help="The file to save cluster specifications.")
parser.add_argument("--ray_head_name", default="head", help="The metadata name for the ray head container")
args = parser.parse_args()

CLUSTER_NAME_PREFIX = args.prefix
# Expand user directory for output file
CLUSTER_SPEC_FILE = os.path.expanduser(args.output)

# Get current namespace
try:
CURRENT_NAMESPACE = (
subprocess.check_output(["kubectl", "config", "view", "--minify", "--output", "jsonpath={..namespace}"])
.decode()
.strip()
)
if not CURRENT_NAMESPACE:
CURRENT_NAMESPACE = "default"
except subprocess.CalledProcessError:
CURRENT_NAMESPACE = "default"
print(f"Using namespace: {CURRENT_NAMESPACE}")

# Get all pods
pods = get_pods(namespace=CURRENT_NAMESPACE)

# Get clusters
clusters = get_clusters(pods, CLUSTER_NAME_PREFIX)
if not clusters:
print(f"No clusters found with prefix {CLUSTER_NAME_PREFIX}")
return

# Wait for clusters to be running
while True:
pods = get_pods(namespace=CURRENT_NAMESPACE) # Refresh pods list inside loop
if check_clusters_running(pods, clusters):
break
print("Waiting for all clusters to spin up...")
time.sleep(5)

# Prepare cluster info for parallel processing
cluster_infos = []
for cluster in clusters:
cluster_pods = [p for p in pods if p[0].startswith(cluster)]
cluster_infos.append((cluster, cluster_pods, CURRENT_NAMESPACE))

# Use ThreadPoolExecutor to process clusters in parallel
results = []
results_lock = threading.Lock() # Create a lock for thread-safe results collection

with ThreadPoolExecutor() as executor:
future_to_cluster = {
executor.submit(process_cluster, info, args.ray_head_name): info[0] for info in cluster_infos
}
for future in as_completed(future_to_cluster):
cluster_name = future_to_cluster[future]
try:
result = future.result()
with results_lock:
results.append(result)
except Exception as exc:
print(f"{cluster_name} generated an exception: {exc}")

# Sort results alphabetically by cluster name
results.sort()

# Write sorted results to the output file
with open(CLUSTER_SPEC_FILE, "w") as f:
for result in results:
f.write(result)

print(f"Cluster spec information saved to {CLUSTER_SPEC_FILE}")
# Display the contents of the config file
with open(CLUSTER_SPEC_FILE) as f:
print(f.read())


if __name__ == "__main__":
main()
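Each line of the generated config pairs a cluster name with its Ray dashboard address, so downstream submission steps can target every cluster. A minimal sketch (an assumption, not part of the PR) of consuming that file:

import os
import re

with open(os.path.expanduser("~/.cluster_config")) as f:
    for line in f:
        match = re.match(r"name: (\S+) address: (\S+)", line)
        if match:
            name, address = match.groups()
            print(f"cluster {name} -> RAY_ADDRESS={address}")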