diff --git a/helm-charts/nvidia-triton-server/templates/hpa.yaml b/helm-charts/nvidia-triton-server/templates/hpa.yaml
index 053ba9a..bd98d18 100644
--- a/helm-charts/nvidia-triton-server/templates/hpa.yaml
+++ b/helm-charts/nvidia-triton-server/templates/hpa.yaml
@@ -14,20 +14,11 @@ spec:
   minReplicas: {{ .Values.hpa.minReplicas }}
   maxReplicas: {{ .Values.hpa.maxReplicas }}
-  metrics:
-  {{- if .Values.hpa.targetCPUUtilizationPercentage }}
-  - type: Resource
-    resource:
-      name: cpu
-      target:
-        type: Utilization
-        averageUtilization: {{ .Values.hpa.targetCPUUtilizationPercentage }}
-  {{- end }}
-  {{- if .Values.hpa.targetMemoryUtilizationPercentage }}
-  - type: Resource
-    resource:
-      name: memory
-      target:
-        type: Utilization
-        averageUtilization: {{ .Values.hpa.targetMemoryUtilizationPercentage }}
-  {{- end }}
+  {{- with .Values.hpa.metrics }}
+  metrics:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+  {{- with .Values.hpa.behavior }}
+  behavior:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
 {{- end }}
diff --git a/helm-charts/nvidia-triton-server/values.yaml b/helm-charts/nvidia-triton-server/values.yaml
index 83b9d69..a73574e 100644
--- a/helm-charts/nvidia-triton-server/values.yaml
+++ b/helm-charts/nvidia-triton-server/values.yaml
@@ -31,35 +31,35 @@ ingress:
   enabled: true
   className: nginx
   annotations: {}
-  # kubernetes.io/ingress.class: nginx
-  # nginx.ingress.kubernetes.io/use-regex: "true"
-  # nginx.ingress.kubernetes.io/rewrite-target: "/$1"
-  # OR
-  # kubernetes.io/ingress.class: alb
-  # alb.ingress.kubernetes.io/scheme: internet-facing
-  # alb.ingress.kubernetes.io/target-type: ip
-  # alb.ingress.kubernetes.io/success-codes: "200-299"
-  # alb.ingress.kubernetes.io/healthcheck-path: "/v1/health/ready"
-  # alb.ingress.kubernetes.io/healthcheck-port: "8080"
+    # kubernetes.io/ingress.class: nginx
+    # nginx.ingress.kubernetes.io/use-regex: "true"
+    # nginx.ingress.kubernetes.io/rewrite-target: "/$1"
+    # OR
+    # kubernetes.io/ingress.class: alb
+    # alb.ingress.kubernetes.io/scheme: internet-facing
+    # alb.ingress.kubernetes.io/target-type: ip
+    # alb.ingress.kubernetes.io/success-codes: "200-299"
+    # alb.ingress.kubernetes.io/healthcheck-path: "/v1/health/ready"
+    # alb.ingress.kubernetes.io/healthcheck-port: "8080"
   hosts:
-  - host: "example.com"
-    paths:
-    - path: /
-      pathType: Prefix
-      service:
-        name:
-        port:
-          number: 8000
-    # - path: /serve/(.*)
-    #   pathType: ImplementationSpecific
-    #   service:
-    #     name:
-    #     port:
-    #       number: 8265
+    - host: "example.com"
+      paths:
+        - path: /
+          pathType: Prefix
+          service:
+            name:
+            port:
+              number: 8000
+        # - path: /serve/(.*)
+        #   pathType: ImplementationSpecific
+        #   service:
+        #     name:
+        #     port:
+        #       number: 8265
   tls: []
-  # - hosts:
-  #   - "example.com"
-  #   secretName: "example-tls"
+    # - hosts:
+    #     - "example.com"
+    #   secretName: "example-tls"
 
 selectorLabels:
   app: triton-inference-server
@@ -70,38 +70,38 @@ podSecurityContext:
   fsGroup: 1000
 
 securityContext: {}
-  # capabilities:
-  #   drop:
-  #     - ALL
-  # readOnlyRootFilesystem: true
-  # runAsNonRoot: true
-  # runAsUser: 1000
+# capabilities:
+#   drop:
+#     - ALL
+# readOnlyRootFilesystem: true
+# runAsNonRoot: true
+# runAsUser: 1000
 
 # Environment variables for Triton containers
 environment:
-  - name: "LD_PRELOAD"
-    value: ""
-  - name: "TRANSFORMERS_CACHE"
-    value: "/home/triton-server/.cache"
-  - name: "shm-size"
-    value: "5g"
-  - name: "NCCL_IGNORE_DISABLED_P2P"
-    value: "1"
+- name: "LD_PRELOAD"
+  value: ""
+- name: "TRANSFORMERS_CACHE"
+  value: "/home/triton-server/.cache"
+- name: "shm-size"
+  value: "5g"
+- name: "NCCL_IGNORE_DISABLED_P2P"
+  value: "1"
 # - name: "model_name"
 #   value: "meta-llama/Llama-2-7b-chat-hf"
 
 # Secret environment variables to authenticate with Hugging Face to load models
 secretEnvironment:
-  - name: "HUGGING_FACE_TOKEN"
-    secretName: "huggingface" # Name of the secret
-    key: "HF_TOKEN" # Key within the secret
+- name: "HUGGING_FACE_TOKEN"
+  secretName: "huggingface"  # Name of the secret
+  key: "HF_TOKEN"  # Key within the secret
 
 resources:
-  requests: # Minimum resource requests for each Triton pod
+  requests:  # Minimum resource requests for each Triton pod
     cpu: "100m"
     memory: "512Mi"
     nvidia.com/gpu: 1
-  limits: # Maximum resource limits
+  limits:  # Maximum resource limits
     cpu: "500m"
     memory: "2Gi"
     nvidia.com/gpu: 1
@@ -111,10 +111,37 @@ hpa:
   enabled: true
   minReplicas: 1
   maxReplicas: 5
-  targetCPUUtilizationPercentage: 80
-  targetMemoryUtilizationPercentage: 80
+  # List of autoscaling/v2 metric specs rendered verbatim into the HPA.
+  # Defaults preserve the chart's previous 80% CPU / 80% memory targets.
+  metrics:
+    - type: Resource
+      resource:
+        name: cpu
+        target:
+          type: Utilization
+          averageUtilization: 80
+    - type: Resource
+      resource:
+        name: memory
+        target:
+          type: Utilization
+          averageUtilization: 80
+  # autoscaling/v2 scaling behavior rendered verbatim into the HPA.
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180  # 3 minutes stabilization window
+      policies:
+        - type: Percent
+          value: 50  # Scale down by 50% at a time
+          periodSeconds: 60  # Check every 60 seconds
+    scaleUp:
+      stabilizationWindowSeconds: 60  # 1 minute stabilization window
+      policies:
+        - type: Percent
+          value: 100  # Scale up by 100% at a time
+          periodSeconds: 15
 
 # Advanced Configuration (If needed)
 nodeSelector: {} # Schedule pods on specific nodes
-tolerations: [] # Allow pods to be scheduled on nodes with 'taints'
-affinity: {} # Influence pod scheduling based on node or pod labels
+tolerations: []  # Allow pods to be scheduled on nodes with 'taints'
+affinity: {}  # Influence pod scheduling based on node or pod labels