Skip to content

Commit

Permalink
Fix: Update cedana binaries (#753)
Browse files Browse the repository at this point in the history
- Update cedana binaries to 0.9.234
- Remove container ID hotfix
- Add correct binaries to GitHub Action / worker Dockerfile
  • Loading branch information
luke-lombardi authored Dec 4, 2024
1 parent f0a1d1c commit 5b54136
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 27 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/release-worker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
id: login-ecr
with:
registry-type: public
mask-password: 'true'
mask-password: "true"

- name: Set version
id: set-version
Expand All @@ -49,3 +49,6 @@ jobs:
${{ steps.login-ecr.outputs.registry }}/n4e0e1y0/beta9-worker:latest
target: final
platforms: linux/amd64
build-args: |
CEDANA_TOKEN=${{ secrets.CEDANA_TOKEN }}
CEDANA_BASE_URL=${{ secrets.CEDANA_BASE_URL }}
15 changes: 11 additions & 4 deletions docker/Dockerfile.worker
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,6 @@ RUN apt-get update && \
apt-get install psmisc

RUN curl -L https://beam-runner-python-deps.s3.amazonaws.com/juicefs -o /usr/local/bin/juicefs && chmod +x /usr/local/bin/juicefs
RUN curl -L https://beam-runner-python-deps.s3.amazonaws.com/cedana-gpu-controller -o /usr/local/bin/cedana-gpu-controller && chmod +x /usr/local/bin/cedana-gpu-controller
RUN curl -L https://beam-runner-python-deps.s3.amazonaws.com/libcedana-gpu.so -o /usr/local/lib/libcedana-gpu.so && chmod +x /usr/local/lib/libcedana-gpu.so

RUN curl -fsSL https://tailscale.com/install.sh | sh
RUN apt-get install -y --no-install-recommends nvidia-container-toolkit-base nvidia-container-toolkit

Expand All @@ -107,7 +104,7 @@ if [ "$(uname -m)" = "x86_64" ]; then
fi
EOT

ARG CEDANA_VERSION=0.9.233
ARG CEDANA_VERSION=0.9.234
RUN <<EOT
set -eux
if [ "$(uname -m)" = "x86_64" ]; then
Expand All @@ -118,6 +115,16 @@ if [ "$(uname -m)" = "x86_64" ]; then
fi
EOT

ARG CEDANA_TOKEN
ARG CEDANA_BASE_URL

# Fetch the Cedana GPU controller and shared library from ${CEDANA_BASE_URL},
# authenticating with ${CEDANA_TOKEN}. The whole step is skipped when
# CEDANA_TOKEN is empty.
# curl runs with -f so an HTTP error (e.g. a rejected token) fails the build
# instead of saving the error response body as the "binary".
# NOTE(review): values passed as build args can be recovered from image
# history; consider a BuildKit secret mount for the token - confirm with the
# release workflow before changing how it is passed.
RUN if [ -n "${CEDANA_TOKEN}" ]; then \
    curl -fL -H "Authorization: Bearer ${CEDANA_TOKEN}" "${CEDANA_BASE_URL}/k8s/gpu/gpucontroller" -o /usr/local/bin/cedana-gpu-controller && \
    chmod +x /usr/local/bin/cedana-gpu-controller && \
    curl -fL -H "Authorization: Bearer ${CEDANA_TOKEN}" "${CEDANA_BASE_URL}/k8s/gpu/libcedana" -o /usr/local/lib/libcedana-gpu.so && \
    chmod +x /usr/local/lib/libcedana-gpu.so; \
    fi


ARG TARGETARCH

Expand Down
8 changes: 4 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ go 1.22.0
toolchain go1.22.4

require (
buf.build/gen/go/cedana/task/grpc/go v1.5.1-20241120213244-f9aa09c7b23a.1
buf.build/gen/go/cedana/task/protocolbuffers/go v1.35.2-20241120213244-f9aa09c7b23a.1
buf.build/gen/go/cedana/task/grpc/go v1.5.1-20241203191352-12c25eb032cd.1
buf.build/gen/go/cedana/task/protocolbuffers/go v1.35.2-20241203191352-12c25eb032cd.1
github.com/DATA-DOG/go-sqlmock v1.5.2
github.com/Masterminds/squirrel v1.5.4
github.com/alicebob/miniredis/v2 v2.30.5
Expand All @@ -21,7 +21,7 @@ require (
github.com/beam-cloud/clip v0.0.0-20240826223025-899feb184e88
github.com/beam-cloud/go-runc v0.0.0-20231222221338-b89899f33170
github.com/bsm/redislock v0.9.4
github.com/cedana/cedana v0.9.233
github.com/cedana/cedana v0.9.234
github.com/cenkalti/backoff v2.2.1+incompatible
github.com/cloudevents/sdk-go/v2 v2.15.1
github.com/containerd/console v1.0.4
Expand Down Expand Up @@ -88,7 +88,7 @@ require (
)

require (
buf.build/gen/go/cedana/gpu/protocolbuffers/go v1.35.2-20241120213244-06763032c670.1 // indirect
buf.build/gen/go/cedana/gpu/protocolbuffers/go v1.35.2-20241203191352-2167379de17d.1 // indirect
filippo.io/edwards25519 v1.1.0 // indirect
github.com/AdamKorcz/go-fuzz-headers v0.0.0-20210312213058-32f4d319f0d2 // indirect
github.com/akutz/memconn v0.1.0 // indirect
Expand Down
16 changes: 8 additions & 8 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
buf.build/gen/go/cedana/gpu/protocolbuffers/go v1.35.2-20241120213244-06763032c670.1 h1:8mJImDs0Q83s61/MN4V3aPJnip6YJRU2IneghwohXyw=
buf.build/gen/go/cedana/gpu/protocolbuffers/go v1.35.2-20241120213244-06763032c670.1/go.mod h1:14P29lBGPpmKBTOhetR/D806Xraxf+f7Zw9nwaWSOmo=
buf.build/gen/go/cedana/task/grpc/go v1.5.1-20241120213244-f9aa09c7b23a.1 h1:XhcTeOzrC5BP7zTGa7p43kLHk3O5vDxJmvW8hGQoNT0=
buf.build/gen/go/cedana/task/grpc/go v1.5.1-20241120213244-f9aa09c7b23a.1/go.mod h1:jdQDdjvoDWct/Uvdg/gjkeeSaZoDhTUKnqKfy3jLDUo=
buf.build/gen/go/cedana/task/protocolbuffers/go v1.35.2-20241120213244-f9aa09c7b23a.1 h1:CuKjhBdYSu99fmBwr10++U7uhEA649XQZgafr2F9t8o=
buf.build/gen/go/cedana/task/protocolbuffers/go v1.35.2-20241120213244-f9aa09c7b23a.1/go.mod h1:8rNH7TKUqnXZf521fY9srpS05tKL8dCwugkRKcmLgC8=
buf.build/gen/go/cedana/gpu/protocolbuffers/go v1.35.2-20241203191352-2167379de17d.1 h1:AxyPWWT4YGmLs8RduxavZi3acD+WGXXr2r7oJP2EBQQ=
buf.build/gen/go/cedana/gpu/protocolbuffers/go v1.35.2-20241203191352-2167379de17d.1/go.mod h1:14P29lBGPpmKBTOhetR/D806Xraxf+f7Zw9nwaWSOmo=
buf.build/gen/go/cedana/task/grpc/go v1.5.1-20241203191352-12c25eb032cd.1 h1:qTCCRcSFwS7IIKeBtSSOgaZVSsH3T7V0sSRKFAGrPds=
buf.build/gen/go/cedana/task/grpc/go v1.5.1-20241203191352-12c25eb032cd.1/go.mod h1:hvCGPsk+cZoAyajPC1bH9AksIXACSO+VTqKHGZ4AIqE=
buf.build/gen/go/cedana/task/protocolbuffers/go v1.35.2-20241203191352-12c25eb032cd.1 h1:6XRJodc0kzDvlBuZbq6/q1eOypH6NarL86rLD/QZkGA=
buf.build/gen/go/cedana/task/protocolbuffers/go v1.35.2-20241203191352-12c25eb032cd.1/go.mod h1:PrKt+vnu69Ffpw8c3G4gr8CqzL0uBbQ9QjwL4qEusCU=
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
Expand Down Expand Up @@ -107,8 +107,8 @@ github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
github.com/bsm/redislock v0.9.4 h1:X/Wse1DPpiQgHbVYRE9zv6m070UcKoOGekgvpNhiSvw=
github.com/bsm/redislock v0.9.4/go.mod h1:Epf7AJLiSFwLCiZcfi6pWFO/8eAYrYpQXFxEDPoDeAk=
github.com/cedana/cedana v0.9.233 h1:WsJQZU/aQ4Fh1XAFro7yo2kx00DdijziezV5v3P8JTE=
github.com/cedana/cedana v0.9.233/go.mod h1:2F2Es54G5jCaOhtCWDojTHup6yd8mbBmccwfbgzJm70=
github.com/cedana/cedana v0.9.234 h1:sH9F3unNC9fPl/TOzePFqlb05Pudr/wV0yTQgLGm7lc=
github.com/cedana/cedana v0.9.234/go.mod h1:dAynzssinqVeyy1GTw8WYDwEeIVTLOzQx0qA5/D0Tu4=
github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4=
github.com/cenkalti/backoff v2.2.1+incompatible/go.mod h1:90ReRw6GdpyfrHakVjL/QHaoyV4aDUVVkXQJJJ3NXXM=
github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
Expand Down
11 changes: 1 addition & 10 deletions pkg/worker/lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -511,21 +511,12 @@ func (s *Worker) spawn(request *types.ContainerRequest, spec *specs.Spec, output

// Handle checkpoint creation & restore if applicable
if s.IsCRIUAvailable() && request.CheckpointEnabled {
restored, restoredContainerId, err := s.attemptCheckpointOrRestore(ctx, request, consoleWriter, startedChan, configPath)
restored, _, err := s.attemptCheckpointOrRestore(ctx, request, consoleWriter, startedChan, configPath)
if err != nil {
log.Printf("<%s> - C/R failed: %v\n", containerId, err)
}

if restored {
// HOTFIX: If we restored from a checkpoint, we need to use the container ID of the restored container
// instead of the original container ID
containerInstance, exists := s.containerInstances.Get(request.ContainerId)
if exists {
containerInstance.Id = restoredContainerId
s.containerInstances.Set(containerId, containerInstance)
containerId = restoredContainerId
}

exitCode = s.waitForRestoredContainer(ctx, containerId, startedChan, outputChan, request, spec)
return
}
Expand Down

0 comments on commit 5b54136

Please sign in to comment.