Skip to content

Commit

Permalink
fix: Remove unwanted taints for neuron-device-plugin ds for stable diffusion and llama2 models (#479)
Browse files Browse the repository at this point in the history
  • Loading branch information
ratnopamc authored Mar 29, 2024
1 parent 1b9cc4b commit 2a1879c
Show file tree
Hide file tree
Showing 6 changed files with 86 additions and 106 deletions.
2 changes: 1 addition & 1 deletion ai-ml/jupyterhub/addons.tf
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ module "eks_data_addons" {
jupyter_single_user_sa_name = kubernetes_service_account_v1.jupyterhub_single_user_sa.metadata[0].name
region = var.region
})]
version = "3.2.1"
version = "3.2.1"
}

#---------------------------------------------------------------
Expand Down
3 changes: 0 additions & 3 deletions ai-ml/trainium-inferentia/addons.tf
Original file line number Diff line number Diff line change
Expand Up @@ -321,9 +321,6 @@ module "eks_data_addons" {
- provisionerType: Karpenter
- hub.jupyter.org/node-purpose: user
taints:
- key: aws.amazon.com/neuroncore
value: "true"
effect: "NoSchedule"
- key: aws.amazon.com/neuron
value: "true"
effect: "NoSchedule"
Expand Down
14 changes: 2 additions & 12 deletions ai-ml/trainium-inferentia/eks.tf
Original file line number Diff line number Diff line change
Expand Up @@ -529,12 +529,7 @@ module "eks" {
key = "aws.amazon.com/neuron",
value = "true",
effect = "NO_SCHEDULE"
},
{
key = "aws.amazon.com/neuroncore",
value = "true",
effect = "NO_SCHEDULE"
},
}
]

tags = merge(local.tags, {
Expand Down Expand Up @@ -586,12 +581,7 @@ module "eks" {
key = "aws.amazon.com/neuron",
value = true,
effect = "NO_SCHEDULE"
},
{
key = "aws.amazon.com/neuroncore",
value = true,
effect = "NO_SCHEDULE"
},
}
]

tags = merge(local.tags, {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
---
apiVersion: v1
kind: Namespace
metadata:
Expand All @@ -14,7 +13,7 @@ spec:
serviceUnhealthySecondThreshold: 900
deploymentUnhealthySecondThreshold: 300
serveConfig:
importPath: ray_serve_llama2:entrypoint # Specify the correct path to your Python script
importPath: ray_serve_llama2:entrypoint # Specify the correct path to your Python script
runtimeEnv: |
env_vars: {"MODEL_ID": "NousResearch/Llama-2-13b-chat-hf"} # Replace with the appropriate model ID
Expand All @@ -31,79 +30,76 @@ spec:
template:
spec:
containers:
- name: ray-head
image: public.ecr.aws/data-on-eks/ray-serve-inf2-llama2:latest # Image created using the Dockerfile attached in the folder
imagePullPolicy: Always # Ensure the image is always pulled when updated
lifecycle:
preStop:
exec:
command: [ "/bin/sh","-c","ray stop" ]
ports:
- containerPort: 6379
name: gcs
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
- containerPort: 8000
name: serve
volumeMounts:
- mountPath: /tmp/ray
name: ray-logs
resources:
limits:
cpu: 4
memory: 20Gi
requests:
cpu: 4
memory: 20Gi
- name: ray-head
image: public.ecr.aws/data-on-eks/ray-serve-inf2-llama2:latest # Image created using the Dockerfile attached in the folder
imagePullPolicy: Always # Ensure the image is always pulled when updated
lifecycle:
preStop:
exec:
command: ["/bin/sh", "-c", "ray stop"]
ports:
- containerPort: 6379
name: gcs
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
- containerPort: 8000
name: serve
volumeMounts:
- mountPath: /tmp/ray
name: ray-logs
resources:
limits:
cpu: 4
memory: 20Gi
requests:
cpu: 4
memory: 20Gi
nodeSelector: # This is using Karpenter Nodes with the provisioner label
instanceType: mixed-x86
provisionerType: Karpenter
workload: rayhead
volumes:
- name: ray-logs
emptyDir: {}
- name: ray-logs
emptyDir: {}

workerGroupSpecs:
- groupName: inf2-worker-group
replicas: 1
minReplicas: 1
maxReplicas: 1
rayStartParams: {}
template:
spec:
containers:
- name: ray-worker
image: public.ecr.aws/data-on-eks/ray-serve-inf2-llama2:latest
imagePullPolicy: Always # Ensure the image is always pulled when updated
lifecycle:
preStop:
exec:
command: [ "/bin/sh","-c","ray stop" ]
resources:
limits:
cpu: "180"
memory: "700G"
aws.amazon.com/neuron: "12"
requests:
cpu: "180"
memory: "700G"
aws.amazon.com/neuron: "12"
nodeSelector:
instanceType: inferentia-inf2
provisionerType: Karpenter
tolerations:
- key: "aws.amazon.com/neuroncore"
operator: "Exists"
effect: "NoSchedule"
- key: "aws.amazon.com/neuron"
operator: "Exists"
effect: "NoSchedule"
- key: "hub.jupyter.org/dedicated"
operator: "Equal"
value: "user"
effect: "NoSchedule"
- groupName: inf2-worker-group
replicas: 1
minReplicas: 1
maxReplicas: 1
rayStartParams: {}
template:
spec:
containers:
- name: ray-worker
image: public.ecr.aws/data-on-eks/ray-serve-inf2-llama2:latest
imagePullPolicy: Always # Ensure the image is always pulled when updated
lifecycle:
preStop:
exec:
command: ["/bin/sh", "-c", "ray stop"]
resources:
limits:
cpu: "180"
memory: "700G"
aws.amazon.com/neuron: "12"
requests:
cpu: "180"
memory: "700G"
aws.amazon.com/neuron: "12"
nodeSelector:
instanceType: inferentia-inf2
provisionerType: Karpenter
tolerations:
- key: "aws.amazon.com/neuron"
operator: "Exists"
effect: "NoSchedule"
- key: "hub.jupyter.org/dedicated"
operator: "Equal"
value: "user"
effect: "NoSchedule"


---
Expand All @@ -117,21 +113,21 @@ metadata:
spec:
ingressClassName: nginx
rules:
- http:
paths:
# Ray Dashboard
- path: /dashboard/(.*)
pathType: ImplementationSpecific
backend:
service:
name: llama2-service
port:
number: 8265
# Ray Serve
- path: /serve/(.*)
pathType: ImplementationSpecific
backend:
service:
name: llama2-service
port:
number: 8000
- http:
paths:
# Ray Dashboard
- path: /dashboard/(.*)
pathType: ImplementationSpecific
backend:
service:
name: llama2-service
port:
number: 8265
# Ray Serve
- path: /serve/(.*)
pathType: ImplementationSpecific
backend:
service:
name: llama2-service
port:
number: 8000
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,6 @@ spec:
instanceType: inferentia-inf2
provisionerType: Karpenter
tolerations:
- key: "aws.amazon.com/neuroncore"
operator: "Exists"
effect: "NoSchedule"
- key: "aws.amazon.com/neuron"
operator: "Exists"
effect: "NoSchedule"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5458,4 +5458,4 @@
"lastRetrieved": 1708463794770
}
}
}
}

0 comments on commit 2a1879c

Please sign in to comment.