From 46906d81c0be602119ad1241333afc9753444986 Mon Sep 17 00:00:00 2001 From: Aaron Siddhartha Mondal Date: Wed, 6 Dec 2023 16:53:59 +0100 Subject: [PATCH] Add Kubernetes example This example starts a fairly complete Kubernetes cluster and showcases perfectly reproducible remote execution via the local remote execution toolchain containers. This example uses a three-layer setup process: 1. The infra layer is a kind cluster with Cilium and MetalLB. This layer is built to be easily swappable with more "production grade" clusters. 2. The operations layer deploys a few standard applications that are not inherently required for NativeLink, but are solid deployments that one would likely want running in a cluster. This includes monitoring and handling image availability. 3. The application layer is a straightforward `kubectl apply -k .` which deploys a NativeLink CAS, Worker and Scheduler. This deployment differs from the Docker Compose setup in that it does not make use of any system paths and doesn't allow visibility "outside" of the node itself. That is, it's a hard requirement that the worker image is self-contained. Storage is fully ephemeral in this example and a `kubectl delete -k .` will destroy the cache for quick iterations and cache testing. 
--- .bazelrc | 3 + .github/workflows/lre.yaml | 59 ++++++++ .gitignore | 1 + deployment-examples/kubernetes/.gitignore | 1 + deployment-examples/kubernetes/00_infra.sh | 127 ++++++++++++++++++ .../kubernetes/01_operations.sh | 28 ++++ .../kubernetes/02_application.sh | 15 +++ .../kubernetes/03_delete_application.sh | 11 ++ deployment-examples/kubernetes/README.md | 76 +++++++++++ deployment-examples/kubernetes/cas.json | 114 ++++++++++++++++ deployment-examples/kubernetes/cas.yaml | 61 +++++++++ .../example-do-not-use-in-prod-key.pem | 52 +++++++ .../example-do-not-use-in-prod-rootca.crt | 29 ++++ deployment-examples/kubernetes/gateway.yaml | 24 ++++ .../kubernetes/kustomization.yaml | 23 ++++ deployment-examples/kubernetes/routes.yaml | 28 ++++ deployment-examples/kubernetes/scheduler.json | 74 ++++++++++ deployment-examples/kubernetes/scheduler.yaml | 51 +++++++ .../kubernetes/worker.json.template | 63 +++++++++ deployment-examples/kubernetes/worker.yaml | 35 +++++ flake.nix | 3 + local-remote-execution/generated/config/BUILD | 2 +- tools/pre-commit-hooks.nix | 3 +- 23 files changed, 881 insertions(+), 2 deletions(-) create mode 100644 deployment-examples/kubernetes/.gitignore create mode 100755 deployment-examples/kubernetes/00_infra.sh create mode 100755 deployment-examples/kubernetes/01_operations.sh create mode 100755 deployment-examples/kubernetes/02_application.sh create mode 100755 deployment-examples/kubernetes/03_delete_application.sh create mode 100644 deployment-examples/kubernetes/README.md create mode 100644 deployment-examples/kubernetes/cas.json create mode 100644 deployment-examples/kubernetes/cas.yaml create mode 100644 deployment-examples/kubernetes/example-do-not-use-in-prod-key.pem create mode 100644 deployment-examples/kubernetes/example-do-not-use-in-prod-rootca.crt create mode 100644 deployment-examples/kubernetes/gateway.yaml create mode 100644 deployment-examples/kubernetes/kustomization.yaml create mode 100644 
deployment-examples/kubernetes/routes.yaml create mode 100644 deployment-examples/kubernetes/scheduler.json create mode 100644 deployment-examples/kubernetes/scheduler.yaml create mode 100644 deployment-examples/kubernetes/worker.json.template create mode 100644 deployment-examples/kubernetes/worker.yaml diff --git a/.bazelrc b/.bazelrc index 427367190..b4d109ea9 100644 --- a/.bazelrc +++ b/.bazelrc @@ -68,3 +68,6 @@ build:lre --define=EXECUTOR=remote # See: https://github.com/bazelbuild/bazel/issues/19714#issuecomment-1745604978 build:lre --repo_env=BAZEL_DO_NOT_DETECT_CPP_TOOLCHAIN=1 + +# Allow user-side customization. +try-import %workspace%/.bazelrc.user diff --git a/.github/workflows/lre.yaml b/.github/workflows/lre.yaml index 5f41755ef..738097e62 100644 --- a/.github/workflows/lre.yaml +++ b/.github/workflows/lre.yaml @@ -38,3 +38,62 @@ jobs: --config=lre \ --verbose_failures \ //local-remote-execution/examples:hello_lre" + + remote: + strategy: + fail-fast: false + matrix: + os: [ubuntu-22.04] + name: Remote / ${{ matrix.os }} + runs-on: ${{ matrix.os }} + steps: + - name: Checkout + uses: >- # v4.1.1 + actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 + + - name: Install Nix + uses: >- #v7 + DeterminateSystems/nix-installer-action@5620eb4af6b562c53e4d4628c0b6e4f9d9ae8612 + + - name: Cache Nix derivations + uses: >- # Custom commit, last pinned at 2023-11-17. 
+ DeterminateSystems/magic-nix-cache-action@a04e6275a6bea232cd04fc6f3cbf20d4cb02a3e1 + + - name: Start Kubernetes cluster + run: > + nix develop --impure --command + bash -c "cd deployment-examples/kubernetes \ + && ./00_infra.sh \ + && ./01_operations.sh \ + && ./02_application.sh" + + - name: Get gateway IPs + id: gateway-ips + run: | + echo "cache_ip=$(kubectl get gtw cache -o=jsonpath='{.status.addresses[0].value}')" >> "$GITHUB_ENV" + echo "scheduler_ip=$(kubectl get gtw scheduler -o=jsonpath='{.status.addresses[0].value}')" >> "$GITHUB_ENV" + + - name: Print cluster state + run: | + kubectl get svc -A + kubectl get pod -A + kubectl get svc -A + kubectl get deployments -A + kubectl describe gtw + echo "cas" + kubectl logs -l app=nativelink-cas + echo "scheduler" + kubectl logs -l app=nativelink-scheduler + echo "worker" + kubectl logs -l app=nativelink-worker + + - name: Build hello_lre with LRE toolchain. + run: > + nix develop --impure --command + bash -c "bazel run \ + --config=lre \ + --remote_instance_name=main \ + --remote_cache=grpc://$cache_ip:50051 \ + --remote_executor=grpc://$scheduler_ip:50052 \ + --verbose_failures \ + //local-remote-execution/examples:hello_lre" diff --git a/.gitignore b/.gitignore index 3f90cbc3d..41c0178ff 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ __pycache__ .DS_Store .pre-commit-config.yaml result +.bazelrc.user diff --git a/deployment-examples/kubernetes/.gitignore b/deployment-examples/kubernetes/.gitignore new file mode 100644 index 000000000..b0c421b2d --- /dev/null +++ b/deployment-examples/kubernetes/.gitignore @@ -0,0 +1 @@ +worker.json diff --git a/deployment-examples/kubernetes/00_infra.sh b/deployment-examples/kubernetes/00_infra.sh new file mode 100755 index 000000000..df31b08a8 --- /dev/null +++ b/deployment-examples/kubernetes/00_infra.sh @@ -0,0 +1,127 @@ +# This script sets up a local development cluster. It's roughly equivalent to +# a managed K8s setup. 
+ +# For ease of development and to save disk space we pipe a local container +# registry through to kind. +# +# See https://kind.sigs.k8s.io/docs/user/local-registry/. + +reg_name='kind-registry' +reg_port='5001' +if [ "$(docker inspect -f '{{.State.Running}}' "${reg_name}" 2>/dev/null || true)" != 'true' ]; then + docker run \ + -d --restart=always -p "127.0.0.1:${reg_port}:5000" --network bridge --name "${reg_name}" \ + registry:2 +fi + +# Start a basic cluster. We use cilium's CNI and eBPF kube-proxy replacement. + +cat < "$KUSTOMIZE_DIR/worker.json" + +kubectl apply -k "$KUSTOMIZE_DIR" + +kubectl rollout status deploy/nativelink-cas +kubectl rollout status deploy/nativelink-scheduler +kubectl rollout status deploy/nativelink-worker diff --git a/deployment-examples/kubernetes/03_delete_application.sh b/deployment-examples/kubernetes/03_delete_application.sh new file mode 100755 index 000000000..9055ac480 --- /dev/null +++ b/deployment-examples/kubernetes/03_delete_application.sh @@ -0,0 +1,11 @@ +# Get the nix derivation hash from the toolchain container, substitute it for +# the `__NATIVELINK_TOOLCHAIN_TAG__` placeholder of `worker.json.template` to +# regenerate `worker.json` and delete the deployed configuration. + +KUSTOMIZE_DIR=$(git rev-parse --show-toplevel)/deployment-examples/kubernetes + +sed "s/__NATIVELINK_TOOLCHAIN_TAG__/$(nix eval .#lre.imageTag --raw)/g" \ + "$KUSTOMIZE_DIR/worker.json.template" \ + > "$KUSTOMIZE_DIR/worker.json" + +kubectl delete -k "$KUSTOMIZE_DIR" diff --git a/deployment-examples/kubernetes/README.md b/deployment-examples/kubernetes/README.md new file mode 100644 index 000000000..29f8a82b2 --- /dev/null +++ b/deployment-examples/kubernetes/README.md @@ -0,0 +1,76 @@ +# Kubernetes example + +This example sets up a 3-container deployment with separate CAS, scheduler +and worker. Don't use this example deployment in production. It's insecure. 
+ +In this example we're using `kind` to set up the cluster and `cilium` with +`metallb` to provide a `LoadBalancer` and `GatewayController`. + +First set up a local development cluster: + +```bash +./00_infra.sh +``` + +Next start a few standard deployments. This part also builds the remote +execution containers and makes them available to the cluster: + +```bash +./01_operations.sh +``` + +Finally deploy NativeLink: + +```bash +./02_application.sh +``` + +> [!TIP] +> You can use `./03_delete_application.sh` to remove just the nativelink +> deployments but leave the rest of the cluster intact. + +This demo setup creates two gateways to expose the `cas` and `scheduler` +deployments via your local docker network: + +```bash +CACHE=$(kubectl get gtw cache -o=jsonpath='{.status.addresses[0].value}') +SCHEDULER=$(kubectl get gtw scheduler -o=jsonpath='{.status.addresses[0].value}') + +echo "Cache IP: $CACHE" +echo "Scheduler IP: $SCHEDULER" + +# Prints something like: +# +# Cache IP: 172.20.255.200 +# Scheduler IP: 172.20.255.201 +``` + +You can now pass these IPs to your bazel invocation to use the remote cache and +executor: + +```bash +bazel test \ + --config=lre \ + --remote_instance_name=main \ + --remote_cache=grpc://$CACHE:50051 \ + --remote_executor=grpc://$SCHEDULER:50052 \ + //:dummy_test +``` + +> [!TIP] +> You can add these flags to a `.bazelrc.user` file in the workspace root. 
+> Note that you'll need to pass in explicit IPs as this file can't resolve +> environment variables: +> ```bash +> # .bazelrc.user +> build --config=lre +> build --remote_instance_name=main +> build --remote_cache=grpc://172.20.255.200:50051 +> build --remote_executor=grpc://172.20.255.201:50052 +> ``` + +When you're done testing, delete the cluster: + +```bash +kind delete cluster +``` diff --git a/deployment-examples/kubernetes/cas.json b/deployment-examples/kubernetes/cas.json new file mode 100644 index 000000000..2d742f14c --- /dev/null +++ b/deployment-examples/kubernetes/cas.json @@ -0,0 +1,114 @@ +// This configuration places objects in various directories in +// `~/.cache/nativelink`. When this location is mounted as a PersistentVolume +// it persists the cache across restarts. +{ + "stores": { + "CAS_MAIN_STORE": { + "existence_cache": { + "backend": { + "compression": { + "compression_algorithm": { + "lz4": {} + }, + "backend": { + "filesystem": { + "content_path": "~/.cache/nativelink/content_path-cas", + "temp_path": "~/.cache/nativelink/tmp_path-cas", + "eviction_policy": { + // 10gb. + "max_bytes": 10000000000, + } + } + } + } + } + } + }, + "AC_MAIN_STORE": { + "completeness_checking": { + "backend": { + "filesystem": { + "content_path": "~/.cache/nativelink/content_path-ac", + "temp_path": "~/.cache/nativelink/tmp_path-ac", + "eviction_policy": { + // 500mb. + "max_bytes": 500000000, + } + } + }, + "cas_store": { + "ref_store": { + "name": "CAS_MAIN_STORE" + } + } + } + } + }, + "servers": [{ + "listener": { + "http": { + "socket_address": "0.0.0.0:50051" + } + }, + "services": { + "cas": { + "main": { + "cas_store": "CAS_MAIN_STORE" + } + }, + "ac": { + "main": { + "ac_store": "AC_MAIN_STORE" + } + }, + "capabilities": {}, + "bytestream": { + "cas_stores": { + "main": "CAS_MAIN_STORE", + }, + } + } + }, + { + // Only publish metrics on a private port. 
+ "listener": { + "http": { + "socket_address": "0.0.0.0:50061" + } + }, + "services": { + "experimental_prometheus": { + "path": "/metrics" + } + } + }, + { + "listener": { + "http": { + "socket_address": "0.0.0.0:50071", + "tls": { + "cert_file": "/root/example-do-not-use-in-prod-rootca.crt", + "key_file": "/root/example-do-not-use-in-prod-key.pem" + } + } + }, + "services": { + "cas": { + "main": { + "cas_store": "CAS_MAIN_STORE" + } + }, + "ac": { + "main": { + "ac_store": "AC_MAIN_STORE" + } + }, + "capabilities": {}, + "bytestream": { + "cas_stores": { + "main": "CAS_MAIN_STORE", + } + } + } + }] +} diff --git a/deployment-examples/kubernetes/cas.yaml b/deployment-examples/kubernetes/cas.yaml new file mode 100644 index 000000000..96fe18331 --- /dev/null +++ b/deployment-examples/kubernetes/cas.yaml @@ -0,0 +1,61 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nativelink-cas +spec: + replicas: 1 + selector: + matchLabels: + app: nativelink-cas + template: + metadata: + labels: + app: nativelink-cas + spec: + containers: + - name: nativelink-cas + image: "localhost:5001/nativelink:local" + env: + - name: RUST_LOG + value: info + ports: + - containerPort: 50051 + - containerPort: 50061 + - containerPort: 50071 + volumeMounts: + - name: cas-config + mountPath: /cas.json + subPath: cas.json + - name: tls-volume + mountPath: /root + readOnly: true + args: ["/cas.json"] + volumes: + - name: cas-config + configMap: + name: cas + - name: tls-volume + secret: + secretName: tls-secret +--- +apiVersion: v1 +kind: Service +metadata: + name: nativelink-cas +spec: + selector: + app: nativelink-cas + ports: + - name: http + protocol: TCP + port: 50051 + targetPort: 50051 + - name: metrics + protocol: TCP + port: 50061 + targetPort: 50061 + - name: https + protocol: TCP + port: 50071 + targetPort: 50071 diff --git a/deployment-examples/kubernetes/example-do-not-use-in-prod-key.pem b/deployment-examples/kubernetes/example-do-not-use-in-prod-key.pem new file 
mode 100644 index 000000000..9ad6b97a0 --- /dev/null +++ b/deployment-examples/kubernetes/example-do-not-use-in-prod-key.pem @@ -0,0 +1,52 @@ +-----BEGIN PRIVATE KEY----- +MIIJQwIBADANBgkqhkiG9w0BAQEFAASCCS0wggkpAgEAAoICAQCWkpK9CeY3B1Q+ ++f5qTcavKEVPgJ4PqFdr9OJtOmboAcw9B9FcgM8QQ8FYkjwFYNE03ec0xCAUdPXG +gPXYxdU9ogwFYlTldV4Zhm9G7qbKLKWUE9rhXJHoiQZeLVHKUIn4UPSkMBr2UdQw +BVazG3RtJzRKTDfNpP6UAAmHlf/6MYWdQZwpK/Oelr4pCKofcavuiqmOyZmKEhpw +Nt0/6pFvLqERbaveuX2I+Ugvg0oY8voMueGAgT7qeE+ph+7BXQkaTBzYr+2p5TGy +aNXczXvkbL53KUxtLZ3bVlfLymGv8XpG8IVLroIOXbeQFSJpFiALilvZdtx2MnLk +hqiz193RNCVji9gL2l6JQDsL4L40XPY7iwmov+ar5fkjq3ysgIzpjRjZZUbt9DjJ +Ercmntfx3nDlXZni6/BnqQ9LxJC4Q1ZoONP6uXRJEC65vdRBusCXmRzLhijgdLLD +tSxk/1MQYztOq17X3HoOSUdlOrSMAyTiOysNTLCB3hWz53d0UAx2Chk+9Xd4LnVl +fOQH81OdK/QhAF6gGkhRM9UATe13v1nKLao+CS6U0xAjMNcA0mfd2/pVKXO8Ieuv +NEhI8UQOFvGX+DlJVXQtLAGrwGoNHzl2zbyI/8k1Lwp0bMzhrg7UfA9mb62rNVSH +3S9fu7XmeVuOi/Sb6iNFAGgMqXZm+QIDAQABAoICACMXC31kljMS0S5QAltxvrDJ +OMXbWnqZp63wBuifqKW+7YL3lHHzTs4ijCeBBN3X/0lLwXsJjdRWoYReP20CRby0 +TArhLJnA7WoTa1FjgpkxKHFG40aL1TEU5dgCxcHZXR3yd+DfmRj4XvPF+ruI1Wwu +q7/43TRelDmXfKENcvcynUnVpXfu1zqJNM3JSCRF0GSqCfPNxatopqKqFouztars +efrezSP9pGTMe2cfOKBx15Ypp4TUGf54hVAj7p8/eESZoktZTJDAagGC7vwjIxFL +ymNoy3fSHqykBZfnlTZ0lyN7IAVQKN0COCoq0rccweEOoHMggSZGgYICY8RttTKV +ZazKK0H+23oTj6zilgdHw0LNSV3+ByZF0zD0Ss2SZG7LHZJxJ8MS18i3U7XUG999 +LsWPacrIo5C1tK33WHWYLIIlJZyOqmJbvrRLleE3UImo26rwYVYP2I+KE6F3SH8b +lRCGYEqITXXKSBRDcAo63Ooohy9ftqeVRPzQtLzFVPODjnV3Ho3fCP+LdfjRCvED +9rk5f/zJR2SabW9OLKrvXoU0Z7+Oa7RvHiqH2Tdx1kUDuZnJgjcMF2pIeOPcAls7 +pEOrV5lIesLWSJQwRy2qsM1tr7/DFmpohGs8WnUbHwdLjWkFJXtkmTWcY7Yl3tm4 +aUD/8L4VNvsrGBtLHVmVAoIBAQDIjMe1umF/DgAWpuw0RYgELAyQ+MNc8GNalPV0 +FyAV2Sjzf08rfauJQ8PxJFAxkeI49uB2iGrcz1iuMC4l2sKdEHRbpYSArbmHcHsX +Hvq/n6TX2RH2GXv4k29c7dmj19phJMbTZCGvTpj+watJelwmK0cP/OmEWdLd3nud +dl6zrwsxxu88G330fg7KPBJIoVtMoczeOJ58lvlPXM+W729/lUhnItwIOU/AVu3u +xYe6skhVtX3YG0VWjNgFLK0KgaXJ2x+vNlrxOIOczpGiByfqpaCPoCG6vbVul/Bp 
+mpsp9iuFzHP8Ym938MMCsw/rrkQiZqcvgNrhHXT4XgUFN0qvAoIBAQDANFInrVOQ +LDs0G+s0nE+GSdjhSh2Qmf0rARs78XFxsRE/ZkHAzCfGAi9vggXqXtu3zzkC50KM +gAYgE2aQ6KoYJl5OU49o6nr7dWGguKZRVnfvnyVjq7h5ZYVHJQYry48+4SSkHzyY +NaGDNlnbKfehLxeQ3fzHrzajFCLsVMRiBb1wDEw+pbH1D7eSf6j7cOcgaUn0kfI9 +usPsjEZxxHAAuDYH8z6RHspucx3s+UxBIcyrljF+TNbEJ70ntow2172rAu/4O5d7 +NmTWBvX3YDsth52dHN199t+61cgz1/1gG15QEzvj8LHkGVPAu/8yQLskeJwMSSVJ +gOdHSvl9t7LXAoIBAQDIRkZ+BbWtcybFeEaz4Kf3OqIF+FborisZ313LY/inuOi7 +WeyPCv3Noz2+x7vJ9NyMdIYt+Qd2oR33PagoJ3Fn9nGsv/s1oceAKOTTbx2LVXsC +DwVv1X1G/L0eeWx5wWqUzmw72GgHMJ968Z8DTs1NVLSvk6sDf8wjwzWBRx9Fo02z +lO1+AhpjmatdbGX54CFTwtTbKoGi/AXWqmGoYk3fhA386QQAtnMdKfKMGvjziWzR +IuCceodg6mAjsrzPnC8bCNd4WviwofZVFri6rRjArExOgeSNHzxbSCJO45WGbw6B +nG5LoWWdynKEdJs2ih4CmK7msnBilM8l5IZw8gmDAoIBAQCte0wf9ejzu4igawas +EN4SlnsENdJjnyoMc84yF6ZOeQTZVaHJtDu+FGDeY9yVA5OL36VUwomlqTReJUSx +TN+iNpduWSubBfGFIBjDaYbs6YANr3ae9PLn18MpSPi99NjRZ4OcA3m85MNoXFlU +YRfay1eY8VTko3hMT7OJ3qT22Ll49hCnhwUN7WbC+yj58pka/w26izS7lOSckKxQ +qX5yl9Jk8J+resA6WvtK1mWGcEx9H26C0jYTDM9FlhYOtkHCpj8Uriz3EEyJhfTi +mGAxozOXCOO+e2LLD4TJjo1q/qjs27916N2XxWh8EPOxVw3TSG2JBUh7hLa7+ach +Wvo3AoIBAAM9Z9MOkAMECNwbSqsKt6/8j7RS+474cLPqXwRsmh1KPv5P4WkCTmxB +TDjqP/JkytL/0V3Zexc+2MUNwBwt1kUXRhi1rkbpi+fmZ/Sp8jh4fwriSRNj5BKz +0soGd/1dGF57EhpZHMQvts86y9A94tdKrVUb1Wosvsua22xAgeumbYGkc9gFKAkG +BiplOruOMkJhEQMR3gdrDUuyNL5HqBCGXP7v7+Lfgh99aZIh/ophex69T2QxTylw +sYDOH4+gMkJsYHU4H2UKm4JdGDZKRlxctwf//N1B+Ot7AULM/OPtHPgt/JcWxpjn +tLDdTkO8oICsLdyBDetlzNQ+h2hXfuM= +-----END PRIVATE KEY----- diff --git a/deployment-examples/kubernetes/example-do-not-use-in-prod-rootca.crt b/deployment-examples/kubernetes/example-do-not-use-in-prod-rootca.crt new file mode 100644 index 000000000..1e50c5e20 --- /dev/null +++ b/deployment-examples/kubernetes/example-do-not-use-in-prod-rootca.crt @@ -0,0 +1,29 @@ +-----BEGIN CERTIFICATE----- +MIIFCTCCAvGgAwIBAgIUBpZ3cJ8onptpeD/v2TVympU03sQwDQYJKoZIhvcNAQEL +BQAwFDESMBAGA1UEAwwJbG9jYWxob3N0MB4XDTIzMTAyMjA2MzMxNloXDTI0MTAy 
+MTA2MzMxNlowFDESMBAGA1UEAwwJbG9jYWxob3N0MIICIjANBgkqhkiG9w0BAQEF +AAOCAg8AMIICCgKCAgEAlpKSvQnmNwdUPvn+ak3GryhFT4CeD6hXa/TibTpm6AHM +PQfRXIDPEEPBWJI8BWDRNN3nNMQgFHT1xoD12MXVPaIMBWJU5XVeGYZvRu6myiyl +lBPa4VyR6IkGXi1RylCJ+FD0pDAa9lHUMAVWsxt0bSc0Skw3zaT+lAAJh5X/+jGF +nUGcKSvznpa+KQiqH3Gr7oqpjsmZihIacDbdP+qRby6hEW2r3rl9iPlIL4NKGPL6 +DLnhgIE+6nhPqYfuwV0JGkwc2K/tqeUxsmjV3M175Gy+dylMbS2d21ZXy8phr/F6 +RvCFS66CDl23kBUiaRYgC4pb2XbcdjJy5Iaos9fd0TQlY4vYC9peiUA7C+C+NFz2 +O4sJqL/mq+X5I6t8rICM6Y0Y2WVG7fQ4yRK3Jp7X8d5w5V2Z4uvwZ6kPS8SQuENW +aDjT+rl0SRAuub3UQbrAl5kcy4Yo4HSyw7UsZP9TEGM7Tqte19x6DklHZTq0jAMk +4jsrDUywgd4Vs+d3dFAMdgoZPvV3eC51ZXzkB/NTnSv0IQBeoBpIUTPVAE3td79Z +yi2qPgkulNMQIzDXANJn3dv6VSlzvCHrrzRISPFEDhbxl/g5SVV0LSwBq8BqDR85 +ds28iP/JNS8KdGzM4a4O1HwPZm+tqzVUh90vX7u15nlbjov0m+ojRQBoDKl2ZvkC +AwEAAaNTMFEwHQYDVR0OBBYEFL2SbUDfjD72r9B1/V2v33992AoWMB8GA1UdIwQY +MBaAFL2SbUDfjD72r9B1/V2v33992AoWMA8GA1UdEwEB/wQFMAMBAf8wDQYJKoZI +hvcNAQELBQADggIBAF6WiXP61VEfdclSpV24wZfg1bSDpHw9S6LG7fy+/0oizpFY +nQkc+M2x2i7LFE8BoJKop6l4VyJ0jgGf5wzRHfmHX4QVatHBmzfJKA6Vm471aRVx +NM4/VWFpVbflnqGjeYEZcStM+506lgKfJk2It7Kd6pCmVbNzYb8W5zlvuYeWtwaZ +mtSLrmgsXLBfIwZBJagOEdQiM0mrRMT5LkA+fjKs+mqjDQz/RaDLTsrWnCnQ354Z +SBycbrJoyfnfAf0YlOlX+s2sjOEeSUvcjyI6mcA0Osyzerw79bztyaF132rU0A6D +jQw22Yj8B4tdcuTjaThXjPrlvWNOfoWbIyiU8Fk7BEvy3YFeRJ9qW9nx2BOCn0/y +quvu4W5NreDeIGT6FSX836GJYkEdheUnVDiKsiZZmB8Xng2D52gkWv5LI2U7LB4D +sZqFkyYXBR2xwoWbV46j0WnbffloUvCTk9oKOJ7i4i3kCyoWccfVUp2kVCdRK3Ok +bq0cPkudq7srb8IsjJVaAISvmrPXQj1sByLVhTJZsNsMmACDV5+QK+4fQn5hpAhS +RIG803s/fdITWxTzE+j8IM6YOcEhmDQpzUxhNFKGdZATNv3FF/YZdg8dCs8hvdxq +y8nkIB354me8h7bchCTIpD7OlL4D+vwPmuVs7IHpiXZJXPzCP0gkMX8Ewt6W +-----END CERTIFICATE----- diff --git a/deployment-examples/kubernetes/gateway.yaml b/deployment-examples/kubernetes/gateway.yaml new file mode 100644 index 000000000..bc6bf5450 --- /dev/null +++ b/deployment-examples/kubernetes/gateway.yaml @@ -0,0 +1,24 @@ +# TODO(aaronmondal): There should just be a single gateway. 
But that's currently +# bugged: https://github.com/cilium/cilium/issues/29099 +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: cache +spec: + gatewayClassName: cilium + listeners: + - name: cache + protocol: HTTP + port: 50051 +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: scheduler +spec: + gatewayClassName: cilium + listeners: + - name: scheduler + protocol: HTTP + port: 50052 diff --git a/deployment-examples/kubernetes/kustomization.yaml b/deployment-examples/kubernetes/kustomization.yaml new file mode 100644 index 000000000..8df1712a0 --- /dev/null +++ b/deployment-examples/kubernetes/kustomization.yaml @@ -0,0 +1,23 @@ +--- +resources: + - cas.yaml + - scheduler.yaml + - worker.yaml + - routes.yaml + +configMapGenerator: + - name: cas + files: + - cas.json + - name: scheduler + files: + - scheduler.json + - name: worker + files: + - worker.json + +secretGenerator: + - name: tls-secret + files: + - example-do-not-use-in-prod-rootca.crt + - example-do-not-use-in-prod-key.pem diff --git a/deployment-examples/kubernetes/routes.yaml b/deployment-examples/kubernetes/routes.yaml new file mode 100644 index 000000000..e094f9d02 --- /dev/null +++ b/deployment-examples/kubernetes/routes.yaml @@ -0,0 +1,28 @@ +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute # TODO(aaronmondal): Use GRPCRoute after resolution of + # https://github.com/TraceMachina/nativelink/issues/481 +metadata: + name: cache-route +spec: + parentRefs: + - sectionName: cache + name: cache + rules: + - backendRefs: + - name: nativelink-cas + port: 50051 +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute # TODO(aaronmondal): Pure GRPC is unstable here. Find out why + # and migrate to a GRPCRoute. 
+metadata: + name: scheduler-route +spec: + parentRefs: + - sectionName: scheduler + name: scheduler + rules: + - backendRefs: + - name: nativelink-scheduler + port: 50052 diff --git a/deployment-examples/kubernetes/scheduler.json b/deployment-examples/kubernetes/scheduler.json new file mode 100644 index 000000000..d0c3c73ee --- /dev/null +++ b/deployment-examples/kubernetes/scheduler.json @@ -0,0 +1,74 @@ +{ + "stores": { + "GRPC_LOCAL_STORE": { + // Note: This file is used to test GRPC store. + "grpc": { + "instance_name": "main", + "endpoints": ["grpc://${CAS_ENDPOINT:-127.0.0.1}:50051"], + "store_type": "cas" + } + }, + "GRPC_LOCAL_AC_STORE": { + // Note: This file is used to test GRPC store. + "grpc": { + "instance_name": "main", + "endpoints": ["grpc://${CAS_ENDPOINT:-127.0.0.1}:50051"], + "store_type": "ac" + } + } + }, + "schedulers": { + "MAIN_SCHEDULER": { + "simple": { + "supported_platform_properties": { + "cpu_count": "minimum", + "OSFamily": "exact", + "container-image": "exact" + } + } + } + }, + "servers": [{ + "listener": { + "http": { + "socket_address": "0.0.0.0:50052" + } + }, + "services": { + "ac": { + "main": { + "ac_store": "GRPC_LOCAL_AC_STORE" + } + }, + "execution": { + "main": { + "cas_store": "GRPC_LOCAL_STORE", + "scheduler": "MAIN_SCHEDULER", + } + }, + "capabilities": { + "main": { + "remote_execution": { + "scheduler": "MAIN_SCHEDULER", + } + } + } + } + }, + { + "listener": { + "http": { + "socket_address": "0.0.0.0:50061", + } + }, + "services": { + // Note: This should be served on a different port, because it has + // a different permission set than the other services. + // In other words, this service is a backend api. The ones above + // are a frontend api. 
+ "worker_api": { + "scheduler": "MAIN_SCHEDULER", + } + } + }] +} diff --git a/deployment-examples/kubernetes/scheduler.yaml b/deployment-examples/kubernetes/scheduler.yaml new file mode 100644 index 000000000..02a6892ac --- /dev/null +++ b/deployment-examples/kubernetes/scheduler.yaml @@ -0,0 +1,51 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nativelink-scheduler +spec: + replicas: 1 + selector: + matchLabels: + app: nativelink-scheduler + template: + metadata: + labels: + app: nativelink-scheduler + spec: + containers: + - name: nativelink-scheduler + image: "localhost:5001/nativelink:local" + env: + - name: RUST_LOG + value: info + - name: CAS_ENDPOINT + value: nativelink-cas + ports: + - containerPort: 50052 + volumeMounts: + - name: scheduler-config + mountPath: /scheduler.json + subPath: scheduler.json + args: ["/scheduler.json"] + volumes: + - name: scheduler-config + configMap: + name: scheduler +--- +apiVersion: v1 +kind: Service +metadata: + name: nativelink-scheduler +spec: + selector: + app: nativelink-scheduler + ports: + - name: scheduler + protocol: TCP + port: 50052 + targetPort: 50052 + - name: worker-api + protocol: TCP + port: 50061 + targetPort: 50061 diff --git a/deployment-examples/kubernetes/worker.json.template b/deployment-examples/kubernetes/worker.json.template new file mode 100644 index 000000000..ad7ce83d6 --- /dev/null +++ b/deployment-examples/kubernetes/worker.json.template @@ -0,0 +1,63 @@ +{ + "stores": { + "GRPC_LOCAL_STORE": { + // Note: This file is used to test GRPC store. + "grpc": { + "instance_name": "main", + "endpoints": ["grpc://${CAS_ENDPOINT:-127.0.0.1}:50051"], + "store_type": "cas" + } + }, + "GRPC_LOCAL_AC_STORE": { + // Note: This file is used to test GRPC store. 
+ "grpc": { + "instance_name": "main", + "endpoints": ["grpc://${CAS_ENDPOINT:-127.0.0.1}:50051"], + "store_type": "ac" + } + }, + "WORKER_FAST_SLOW_STORE": { + "fast_slow": { + "fast": { + "filesystem": { + "content_path": "/root/.cache/nativelink/data-worker-test/content_path-cas", + "temp_path": "/root/.cache/nativelink/data-worker-test/tmp_path-cas", + "eviction_policy": { + // 10gb. + "max_bytes": 10000000000, + } + } + }, + "slow": { + "ref_store": { + "name": "GRPC_LOCAL_STORE", + } + } + } + } + }, + "workers": [{ + "local": { + "worker_api_endpoint": { + "uri": "grpc://${SCHEDULER_ENDPOINT:-127.0.0.1}:50061", + }, + "cas_fast_slow_store": "WORKER_FAST_SLOW_STORE", + "upload_action_result": { + "ac_store": "GRPC_LOCAL_AC_STORE", + }, + "work_directory": "/root/.cache/nativelink/work", + "platform_properties": { + "cpu_count": { + "query_cmd": "nproc" + }, + "OSFamily": { + "values": ["Linux"] + }, + "container-image": { + "values": ["docker://nativelink-toolchain:__NATIVELINK_TOOLCHAIN_TAG__"] + } + } + } + }], + "servers": [] +} diff --git a/deployment-examples/kubernetes/worker.yaml b/deployment-examples/kubernetes/worker.yaml new file mode 100644 index 000000000..66397a377 --- /dev/null +++ b/deployment-examples/kubernetes/worker.yaml @@ -0,0 +1,35 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nativelink-worker +spec: + replicas: 1 + selector: + matchLabels: + app: nativelink-worker + template: + metadata: + labels: + app: nativelink-worker + spec: + containers: + - name: nativelink-worker + image: "localhost:5001/nativelink-toolchain:local" + env: + - name: RUST_LOG + value: info + - name: CAS_ENDPOINT + value: nativelink-cas + - name: SCHEDULER_ENDPOINT + value: nativelink-scheduler + volumeMounts: + - name: worker-config + mountPath: /worker.json + subPath: worker.json + command: ["/bin/cas"] + args: ["/worker.json"] + volumes: + - name: worker-config + configMap: + name: worker diff --git a/flake.nix b/flake.nix index 
1a7adb7a9..d44eb520c 100644 --- a/flake.nix +++ b/flake.nix @@ -131,6 +131,9 @@ pkgs.skopeo pkgs.dive pkgs.cosign + pkgs.kubectl + pkgs.kubernetes-helm + pkgs.cilium-cli # Additional tools from within our development environment. local-image-test diff --git a/local-remote-execution/generated/config/BUILD b/local-remote-execution/generated/config/BUILD index a7320a7eb..76ade21d4 100755 --- a/local-remote-execution/generated/config/BUILD +++ b/local-remote-execution/generated/config/BUILD @@ -40,7 +40,7 @@ platform( "@bazel_tools//tools/cpp:clang", ], exec_properties = { - "container-image": "docker://nativelink-toolchain:h8andgczmivnnz4blabfqfya4bdqrnhg", + "container-image": "docker://nativelink-toolchain:lslbs7cb2pdf0lgp14vz6kj9r2xbby55", "OSFamily": "Linux", }, parents = ["@local_config_platform//:host"], diff --git a/tools/pre-commit-hooks.nix b/tools/pre-commit-hooks.nix index e59e1c902..a6d217966 100644 --- a/tools/pre-commit-hooks.nix +++ b/tools/pre-commit-hooks.nix @@ -44,8 +44,9 @@ in }; detect-private-key = { excludes = excludes ++ [ - # This is an integration testfile that is not intended for production. + # Integration testfiles not intended for production. "deployment-examples/docker-compose/example-do-not-use-in-prod-key.pem" + "deployment-examples/kubernetes/example-do-not-use-in-prod-key.pem" ]; enable = true; name = "detect-private-key";