diff --git a/PROJECT b/PROJECT index 1606c96..cd75a86 100644 --- a/PROJECT +++ b/PROJECT @@ -13,8 +13,8 @@ resources: namespaced: true controller: true domain: github.com - group: lm-eval-service - kind: EvalJob + group: foundation-model-stack.github.com + kind: LMEvalJob path: github.com/foundation-model-stack/fms-lm-eval-service/api/v1beta1 version: v1beta1 webhooks: diff --git a/api/v1beta1/groupversion_info.go b/api/v1beta1/groupversion_info.go index 7c7a1d0..61c87d2 100644 --- a/api/v1beta1/groupversion_info.go +++ b/api/v1beta1/groupversion_info.go @@ -14,9 +14,9 @@ See the License for the specific language governing permissions and limitations under the License. */ -// Package v1beta1 contains API Schema definitions for the lm-eval-service v1beta1 API group +// Package v1beta1 contains API Schema definitions for the foundation-model-stack.github.com.github.com v1beta1 API group // +kubebuilder:object:generate=true -// +groupName=lm-eval-service.github.com +// +groupName=foundation-model-stack.github.com.github.com package v1beta1 import ( @@ -25,15 +25,15 @@ import ( ) const ( - GroupName = "lm-eval-service.github.com" + GroupName = "foundation-model-stack.github.com.github.com" Version = "v1beta1" - KindName = "EvalJob" - FinalizerName = "lm-eval-service.github.com/finalizer" + KindName = "LMEvalJob" + FinalizerName = "lm-eval-service.foundation-model-stack.github.com.github.com/finalizer" ) var ( // GroupVersion is group version used to register these objects - GroupVersion = schema.GroupVersion{Group: "lm-eval-service.github.com", Version: "v1beta1"} + GroupVersion = schema.GroupVersion{Group: GroupName, Version: Version} // SchemeBuilder is used to add go types to the GroupVersionKind scheme SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} diff --git a/api/v1beta1/evaljob_types.go b/api/v1beta1/lmevaljob_types.go similarity index 88% rename from api/v1beta1/evaljob_types.go rename to api/v1beta1/lmevaljob_types.go index d0bbb74..710b44f 100644 --- a/api/v1beta1/evaljob_types.go +++ b/api/v1beta1/lmevaljob_types.go @@ -59,8 +59,8 @@ type Arg struct { Value string `json:"value,omitempty"` } -// EvalJobSpec defines the desired state of EvalJob -type EvalJobSpec struct { +// LMEvalJobSpec defines the desired state of LMEvalJob +type LMEvalJobSpec struct { // INSERT ADDITIONAL SPEC FIELDS - desired state of cluster // Important: Run "make" to regenerate code after modifying this file @@ -88,8 +88,8 @@ type EvalJobSpec struct { LogSamples *bool `json:"logSamples,omitempty"` } -// EvalJobStatus defines the observed state of EvalJob -type EvalJobStatus struct { +// LMEvalJobStatus defines the observed state of LMEvalJob +type LMEvalJobStatus struct { // Important: Run "make" to regenerate code after modifying this file // The name of the Pod that runs the evaluation job @@ -118,24 +118,24 @@ type EvalJobStatus struct { // +kubebuilder:object:root=true // +kubebuilder:subresource:status -// EvalJob is the Schema for the evaljobs API -type EvalJob struct { +// LMEvalJob is the Schema for the lmevaljobs API +type LMEvalJob struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` - Spec EvalJobSpec `json:"spec,omitempty"` - Status EvalJobStatus `json:"status,omitempty"` + Spec LMEvalJobSpec `json:"spec,omitempty"` + Status LMEvalJobStatus `json:"status,omitempty"` } // +kubebuilder:object:root=true -// EvalJobList contains a list of EvalJob -type EvalJobList struct { +// LMEvalJobList contains a list of LMEvalJob +type LMEvalJobList struct { metav1.TypeMeta `json:",inline"` metav1.ListMeta `json:"metadata,omitempty"` - Items []EvalJob `json:"items"` + Items []LMEvalJob `json:"items"` } func init() { - SchemeBuilder.Register(&EvalJob{}, &EvalJobList{}) + SchemeBuilder.Register(&LMEvalJob{}, &LMEvalJobList{}) } diff --git a/api/v1beta1/evaljob_webhook.go b/api/v1beta1/lmevaljob_webhook.go similarity index 63% rename from api/v1beta1/evaljob_webhook.go rename to api/v1beta1/lmevaljob_webhook.go index 9e2662a..c5cb038 100644 --- a/api/v1beta1/evaljob_webhook.go +++ b/api/v1beta1/lmevaljob_webhook.go @@ -33,66 +33,62 @@ import ( var evaljoblog = logf.Log.WithName("evaljob-resource") // SetupWebhookWithManager will setup the manager to manage the webhooks -func (r *EvalJob) SetupWebhookWithManager(mgr ctrl.Manager) error { +func (r *LMEvalJob) SetupWebhookWithManager(mgr ctrl.Manager) error { return ctrl.NewWebhookManagedBy(mgr). For(r). Complete() } -// TODO(user): EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN! +// +kubebuilder:webhook:path=/mutate-foundation-model-stack-github-com-github-com-v1beta1-lmevaljob,mutating=true,failurePolicy=fail,sideEffects=None,groups=foundation-model-stack.github.com.github.com,resources=lmevaljobs,verbs=create;update,versions=v1beta1,name=mlmevaljob.kb.io,admissionReviewVersions=v1 -// +kubebuilder:webhook:path=/mutate-lm-eval-service-github-com-v1beta1-evaljob,mutating=true,failurePolicy=fail,sideEffects=None,groups=lm-eval-service.github.com,resources=evaljobs,verbs=create;update,versions=v1beta1,name=mevaljob.kb.io,admissionReviewVersions=v1 - -var _ webhook.Defaulter = &EvalJob{} +var _ webhook.Defaulter = &LMEvalJob{} // Default implements webhook.Defaulter so a webhook will be registered for the type -func (r *EvalJob) Default() { +func (r *LMEvalJob) Default() { evaljoblog.Info("default", "name", r.Name) - - // TODO(user): fill in your defaulting logic. } -// TODO(user): change verbs to "verbs=create;update;delete" if you want to enable deletion validation. // NOTE: The 'path' attribute must follow a specific pattern and should not be modified directly here. // Modifying the path for an invalid path can cause API server errors; failing to locate the webhook. -// +kubebuilder:webhook:path=/validate-lm-eval-service-github-com-v1beta1-evaljob,mutating=false,failurePolicy=fail,sideEffects=None,groups=lm-eval-service.github.com,resources=evaljobs,verbs=create;update,versions=v1beta1,name=vevaljob.kb.io,admissionReviewVersions=v1 +// +kubebuilder:webhook:path=/validate-foundation-model-stack-github-com-github-com-v1beta1-lmevaljob,mutating=false,failurePolicy=fail,sideEffects=None,groups=foundation-model-stack.github.com.github.com,resources=lmevaljobs,verbs=create;update,versions=v1beta1,name=vlmevaljob.kb.io,admissionReviewVersions=v1 -var _ webhook.Validator = &EvalJob{} +var _ webhook.Validator = &LMEvalJob{} // ValidateCreate implements webhook.Validator so a webhook will be registered for the type -func (r *EvalJob) ValidateCreate() (admission.Warnings, error) { +func (r *LMEvalJob) ValidateCreate() (admission.Warnings, error) { evaljoblog.Info("validate create", "name", r.Name) - // TODO(user): fill in your validation logic upon object creation. - return nil, nil + return nil, r.ValidateJob() } // ValidateUpdate implements webhook.Validator so a webhook will be registered for the type -func (r *EvalJob) ValidateUpdate(old runtime.Object) (admission.Warnings, error) { +func (r *LMEvalJob) ValidateUpdate(old runtime.Object) (admission.Warnings, error) { evaljoblog.Info("validate update", "name", r.Name) - // TODO(user): fill in your validation logic upon object update. - return nil, nil + return nil, r.ValidateJob() } // ValidateDelete implements webhook.Validator so a webhook will be registered for the type -func (r *EvalJob) ValidateDelete() (admission.Warnings, error) { +func (r *LMEvalJob) ValidateDelete() (admission.Warnings, error) { evaljoblog.Info("validate delete", "name", r.Name) - // TODO(user): fill in your validation logic upon object deletion. return nil, nil } -func (r *EvalJob) ValidateJob() error { +func (r *LMEvalJob) ValidateJob() error { var allErrs field.ErrorList if err := r.ValidateLimit(); err != nil { allErrs = append(allErrs, err) } + + if len(allErrs) == 0 { + return nil + } return apierrors.NewInvalid( schema.GroupKind{Group: GroupName, Kind: KindName}, r.Name, allErrs) } -func (r *EvalJob) ValidateLimit() *field.Error { +func (r *LMEvalJob) ValidateLimit() *field.Error { if r.Spec.Limit == "" { return nil } diff --git a/api/v1beta1/evaljob_webhook_test.go b/api/v1beta1/lmevaljob_webhook_test.go similarity index 100% rename from api/v1beta1/evaljob_webhook_test.go rename to api/v1beta1/lmevaljob_webhook_test.go diff --git a/api/v1beta1/webhook_suite_test.go b/api/v1beta1/webhook_suite_test.go index ba3f3da..c0d20dd 100644 --- a/api/v1beta1/webhook_suite_test.go +++ b/api/v1beta1/webhook_suite_test.go @@ -113,7 +113,7 @@ var _ = BeforeSuite(func() { }) Expect(err).NotTo(HaveOccurred()) - err = (&EvalJob{}).SetupWebhookWithManager(mgr) + err = (&LMEvalJob{}).SetupWebhookWithManager(mgr) Expect(err).NotTo(HaveOccurred()) // +kubebuilder:scaffold:webhook diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index 67b3d1d..2c6ef5d 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -40,7 +40,7 @@ func (in *Arg) DeepCopy() *Arg { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *EvalJob) DeepCopyInto(out *EvalJob) { +func (in *LMEvalJob) DeepCopyInto(out *LMEvalJob) { *out = *in out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) @@ -48,18 +48,18 @@ func (in *EvalJob) DeepCopyInto(out *EvalJob) { in.Status.DeepCopyInto(&out.Status) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EvalJob. -func (in *EvalJob) DeepCopy() *EvalJob { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LMEvalJob. +func (in *LMEvalJob) DeepCopy() *LMEvalJob { if in == nil { return nil } - out := new(EvalJob) + out := new(LMEvalJob) in.DeepCopyInto(out) return out } // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *EvalJob) DeepCopyObject() runtime.Object { +func (in *LMEvalJob) DeepCopyObject() runtime.Object { if c := in.DeepCopy(); c != nil { return c } @@ -67,31 +67,31 @@ func (in *EvalJob) DeepCopyObject() runtime.Object { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *EvalJobList) DeepCopyInto(out *EvalJobList) { +func (in *LMEvalJobList) DeepCopyInto(out *LMEvalJobList) { *out = *in out.TypeMeta = in.TypeMeta in.ListMeta.DeepCopyInto(&out.ListMeta) if in.Items != nil { in, out := &in.Items, &out.Items - *out = make([]EvalJob, len(*in)) + *out = make([]LMEvalJob, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EvalJobList. -func (in *EvalJobList) DeepCopy() *EvalJobList { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LMEvalJobList. +func (in *LMEvalJobList) DeepCopy() *LMEvalJobList { if in == nil { return nil } - out := new(EvalJobList) + out := new(LMEvalJobList) in.DeepCopyInto(out) return out } // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *EvalJobList) DeepCopyObject() runtime.Object { +func (in *LMEvalJobList) DeepCopyObject() runtime.Object { if c := in.DeepCopy(); c != nil { return c } @@ -99,7 +99,7 @@ func (in *EvalJobList) DeepCopyObject() runtime.Object { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *EvalJobSpec) DeepCopyInto(out *EvalJobSpec) { +func (in *LMEvalJobSpec) DeepCopyInto(out *LMEvalJobSpec) { *out = *in if in.ModelArgs != nil { in, out := &in.ModelArgs, &out.ModelArgs @@ -128,18 +128,18 @@ func (in *EvalJobSpec) DeepCopyInto(out *EvalJobSpec) { } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EvalJobSpec. -func (in *EvalJobSpec) DeepCopy() *EvalJobSpec { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LMEvalJobSpec. +func (in *LMEvalJobSpec) DeepCopy() *LMEvalJobSpec { if in == nil { return nil } - out := new(EvalJobSpec) + out := new(LMEvalJobSpec) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *EvalJobStatus) DeepCopyInto(out *EvalJobStatus) { +func (in *LMEvalJobStatus) DeepCopyInto(out *LMEvalJobStatus) { *out = *in if in.LastScheduleTime != nil { in, out := &in.LastScheduleTime, &out.LastScheduleTime @@ -151,12 +151,12 @@ func (in *EvalJobStatus) DeepCopyInto(out *EvalJobStatus) { } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EvalJobStatus. -func (in *EvalJobStatus) DeepCopy() *EvalJobStatus { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LMEvalJobStatus. +func (in *LMEvalJobStatus) DeepCopy() *LMEvalJobStatus { if in == nil { return nil } - out := new(EvalJobStatus) + out := new(LMEvalJobStatus) in.DeepCopyInto(out) return out } diff --git a/backend/controller/evaljob_controller.go b/backend/controller/lmevaljob_controller.go similarity index 69% rename from backend/controller/evaljob_controller.go rename to backend/controller/lmevaljob_controller.go index 9e9ed09..bebd53f 100644 --- a/backend/controller/evaljob_controller.go +++ b/backend/controller/lmevaljob_controller.go @@ -19,6 +19,7 @@ package controller import ( "context" "fmt" + "slices" "strings" "time" @@ -48,13 +49,25 @@ const ( PodImageKey = "pod-image" DriverImageKey = "driver-image" DriverServiceAccountKey = "driver-serviceaccount" + PodCheckingIntervalKey = "pod-checking-interval" + ImagePullPolicyKey = "image-pull-policy" DefaultPodImage = "quay.io/yhwang/lm-eval-aas-flask:test" DefaultDriverImage = "quay.io/yhwang/lm-eval-aas-driver:test" DefaultDriverServiceAccount = "driver" + DefaultPodCheckingInterval = time.Second * 10 + DefaultImagePullPolicy = corev1.PullAlways ) -// EvalJobReconciler reconciles a EvalJob object -type EvalJobReconciler struct { +var ( + pullPolicyMap = map[string]corev1.PullPolicy{ + "Always": corev1.PullAlways, + "Never": corev1.PullNever, + "IfNotPresent": corev1.PullIfNotPresent, + } +) + +// LMEvalJobReconciler reconciles a LMEvalJob object +type LMEvalJobReconciler struct { client.Client Scheme *runtime.Scheme Recorder record.EventRecorder @@ -67,63 +80,62 @@ type ServiceOptions struct { PodImage string DriverImage string DriverServiceAccount string + PodCheckingInterval time.Duration + ImagePullPolicy corev1.PullPolicy } -// +kubebuilder:rbac:groups=lm-eval-service.github.com,resources=evaljobs,verbs=get;list;watch;create;update;patch;delete -// +kubebuilder:rbac:groups=lm-eval-service.github.com,resources=evaljobs/status,verbs=get;update;patch -// +kubebuilder:rbac:groups=lm-eval-service.github.com,resources=evaljobs/finalizers,verbs=update +// +kubebuilder:rbac:groups=foundation-model-stack.github.com.github.com,resources=lmevaljobs,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=foundation-model-stack.github.com.github.com,resources=lmevaljobs/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=foundation-model-stack.github.com.github.com,resources=lmevaljobs/finalizers,verbs=update // +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;delete // +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;watch;list -func (r *EvalJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +func (r *LMEvalJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := log.FromContext(ctx) - evalJob := &lmevalservicev1beta1.EvalJob{} - if err := r.Get(ctx, req.NamespacedName, evalJob); err != nil { - log.Error(err, "unable to fetch EvalJob. could be from an deletion request") - // we'll ignore not-found errors, since they can't be fixed by an immediate - // requeue (we'll need to wait for a new notification), and we can get them - // on deleted requests. + job := &lmevalservicev1beta1.LMEvalJob{} + if err := r.Get(ctx, req.NamespacedName, job); err != nil { + log.Info("unable to fetch LMEvalJob. could be from an deletion request") return ctrl.Result{}, client.IgnoreNotFound(err) } - if !evalJob.ObjectMeta.DeletionTimestamp.IsZero() { + if !job.ObjectMeta.DeletionTimestamp.IsZero() { // Handle deletion here - return r.handleDeletion(ctx, evalJob, log) + return r.handleDeletion(ctx, job, log) } // Treat this as NewJobState - if evalJob.Status.LastScheduleTime == nil { - evalJob.Status.State = lmevalservicev1beta1.NewJobState + if job.Status.LastScheduleTime == nil { + job.Status.State = lmevalservicev1beta1.NewJobState } // Handle the job based on its state - switch evalJob.Status.State { + switch job.Status.State { case lmevalservicev1beta1.NewJobState: // Handle newly created job - return r.handleNewCR(ctx, log, evalJob) + return r.handleNewCR(ctx, log, job) case lmevalservicev1beta1.ScheduledJobState: // the job's pod has been created and the driver hasn't updated the state yet // let's check the pod status and detect pod failure if there is // TODO: need a timeout/retry mechanism here to transite to other states - return r.checkScheduledPod(ctx, log, evalJob) + return r.checkScheduledPod(ctx, log, job) case lmevalservicev1beta1.RunningJobState: // TODO: need a timeout/retry mechanism here to transite to other states - return r.checkScheduledPod(ctx, log, evalJob) + return r.checkScheduledPod(ctx, log, job) case lmevalservicev1beta1.CompleteJobState: - return r.handleComplete(ctx, log, evalJob) + return r.handleComplete(ctx, log, job) case lmevalservicev1beta1.CancelledJobState: - return r.handleCancel(ctx, log, evalJob) + return r.handleCancel(ctx, log, job) } return ctrl.Result{}, nil } // SetupWithManager sets up the controller with the Manager. -func (r *EvalJobReconciler) SetupWithManager(mgr ctrl.Manager) error { +func (r *LMEvalJobReconciler) SetupWithManager(mgr ctrl.Manager) error { // Add a runnable to retrieve the settings from the specified configmap - mgr.Add(manager.RunnableFunc(func(context.Context) error { + if err := mgr.Add(manager.RunnableFunc(func(context.Context) error { var cm corev1.ConfigMap if err := r.Get( context.Background(), @@ -142,12 +154,14 @@ func (r *EvalJobReconciler) SetupWithManager(mgr ctrl.Manager) error { return err } return nil - })) + })); err != nil { + return err + } // watch the pods created by the controller but only for the deletion event return ctrl.NewControllerManagedBy(mgr). // since we register the finalizer, no need to monitor deletion events - For(&lmevalservicev1beta1.EvalJob{}, builder.WithPredicates(predicate.Funcs{ + For(&lmevalservicev1beta1.LMEvalJob{}, builder.WithPredicates(predicate.Funcs{ // drop deletion events DeleteFunc: func(event.DeleteEvent) bool { return false @@ -155,7 +169,7 @@ func (r *EvalJobReconciler) SetupWithManager(mgr ctrl.Manager) error { })). Watches( &corev1.Pod{}, - handler.EnqueueRequestForOwner(mgr.GetScheme(), mgr.GetRESTMapper(), &lmevalservicev1beta1.EvalJob{}), + handler.EnqueueRequestForOwner(mgr.GetScheme(), mgr.GetRESTMapper(), &lmevalservicev1beta1.LMEvalJob{}), builder.WithPredicates(predicate.Funcs{ // drop all events except deletion CreateFunc: func(event.CreateEvent) bool { @@ -172,10 +186,12 @@ func (r *EvalJobReconciler) SetupWithManager(mgr ctrl.Manager) error { Complete(r) } -func (r *EvalJobReconciler) constructOptionsFromConfigMap(configmap *corev1.ConfigMap) error { +func (r *LMEvalJobReconciler) constructOptionsFromConfigMap(configmap *corev1.ConfigMap) error { r.options.DriverImage = DefaultDriverImage r.options.PodImage = DefaultPodImage r.options.DriverServiceAccount = DefaultDriverServiceAccount + r.options.PodCheckingInterval = DefaultPodCheckingInterval + r.options.ImagePullPolicy = DefaultImagePullPolicy if v, found := configmap.Data[DriverImageKey]; found { r.options.DriverImage = v } @@ -185,17 +201,27 @@ func (r *EvalJobReconciler) constructOptionsFromConfigMap(configmap *corev1.Conf if v, found := configmap.Data[DriverServiceAccountKey]; found { r.options.DriverServiceAccount = v } + if v, found := configmap.Data[PodCheckingIntervalKey]; found { + if d, err := time.ParseDuration(v); err == nil { + r.options.PodCheckingInterval = d + } + } + if v, found := configmap.Data[ImagePullPolicyKey]; found { + if p, found := pullPolicyMap[v]; found { + r.options.ImagePullPolicy = p + } + } return nil } -func (r *EvalJobReconciler) handleDeletion(ctx context.Context, job *lmevalservicev1beta1.EvalJob, log logr.Logger) (reconcile.Result, error) { +func (r *LMEvalJobReconciler) handleDeletion(ctx context.Context, job *lmevalservicev1beta1.LMEvalJob, log logr.Logger) (reconcile.Result, error) { if controllerutil.ContainsFinalizer(job, lmevalservicev1beta1.FinalizerName) { // delete the correspondling pod if needed // remove our finalizer from the list and update it. if job.Status.State != lmevalservicev1beta1.CompleteJobState || job.Status.Reason != lmevalservicev1beta1.CancelledReason { - if err := r.deleteJobPod(ctx, job); err != nil { + if err := r.deleteJobPod(ctx, job); err != nil && client.IgnoreNotFound(err) != nil { log.Error(err, "failed to delete pod of the job") } } @@ -205,7 +231,7 @@ func (r *EvalJobReconciler) handleDeletion(ctx context.Context, job *lmevalservi return ctrl.Result{}, err } r.Recorder.Event(job, "Normal", "DetachFinalizer", - fmt.Sprintf("removed finalizer from EvalJob %s in namespace %s", + fmt.Sprintf("removed finalizer from LMEvalJob %s in namespace %s", job.Name, job.Namespace)) log.Info("Successfully remove the finalizer", "name", job.Name) @@ -214,7 +240,7 @@ func (r *EvalJobReconciler) handleDeletion(ctx context.Context, job *lmevalservi return ctrl.Result{}, nil } -func (r *EvalJobReconciler) handleNewCR(ctx context.Context, log logr.Logger, job *lmevalservicev1beta1.EvalJob) (reconcile.Result, error) { +func (r *LMEvalJobReconciler) handleNewCR(ctx context.Context, log logr.Logger, job *lmevalservicev1beta1.LMEvalJob) (reconcile.Result, error) { // If it doesn't contain our finalizer, add it if !controllerutil.ContainsFinalizer(job, lmevalservicev1beta1.FinalizerName) { controllerutil.AddFinalizer(job, lmevalservicev1beta1.FinalizerName) @@ -223,10 +249,10 @@ func (r *EvalJobReconciler) handleNewCR(ctx context.Context, log logr.Logger, jo return ctrl.Result{}, err } r.Recorder.Event(job, "Normal", "AttachFinalizer", - fmt.Sprintf("added the finalizer to the EvalJob %s in namespace %s", + fmt.Sprintf("added the finalizer to the LMEvalJob %s in namespace %s", job.Name, job.Namespace)) - // Since finalizers were updated. Need to fetch the new EvalJob + // Since finalizers were updated. Need to fetch the new LMEvalJob // End the current reconsile and get revisioned job in next reconsile return ctrl.Result{}, nil } @@ -240,9 +266,9 @@ func (r *EvalJobReconciler) handleNewCR(ctx context.Context, log logr.Logger, jo job.Status.Reason = lmevalservicev1beta1.FailedReason job.Status.Message = err.Error() if err := r.Status().Update(ctx, job); err != nil { - log.Error(err, "unable to update EvalJob status for pod creation failure") + log.Error(err, "unable to update LMEvalJob status for pod creation failure") } - log.Error(err, "Failed to create pod for the EvalJob", "name", job.Name) + log.Error(err, "Failed to create pod for the LMEvalJob", "name", job.Name) return ctrl.Result{}, err } @@ -251,19 +277,19 @@ func (r *EvalJobReconciler) handleNewCR(ctx context.Context, log logr.Logger, jo job.Status.PodName = pod.Name job.Status.LastScheduleTime = ¤tTime if err := r.Status().Update(ctx, job); err != nil { - log.Error(err, "unable to update EvalJob status (pod creation done)") + log.Error(err, "unable to update LMEvalJob status (pod creation done)") return ctrl.Result{}, err } r.Recorder.Event(job, "Normal", "PodCreation", - fmt.Sprintf("the EvalJob %s in namespace %s created a pod", + fmt.Sprintf("the LMEvalJob %s in namespace %s created a pod", job.Name, job.Namespace)) log.Info("Successfully create a Pod for the Job") - // Check the pod after 10 seconds - return ctrl.Result{Requeue: true, RequeueAfter: time.Second * 10}, nil + // Check the pod after the config interval + return ctrl.Result{Requeue: true, RequeueAfter: r.options.PodCheckingInterval}, nil } -func (r *EvalJobReconciler) checkScheduledPod(ctx context.Context, log logr.Logger, job *lmevalservicev1beta1.EvalJob) (ctrl.Result, error) { +func (r *LMEvalJobReconciler) checkScheduledPod(ctx context.Context, log logr.Logger, job *lmevalservicev1beta1.LMEvalJob) (ctrl.Result, error) { pod, err := r.getPod(ctx, job) if err != nil { // a weird state, someone delete the corresponding pod? mark this as CompleteJobState @@ -272,11 +298,11 @@ func (r *EvalJobReconciler) checkScheduledPod(ctx context.Context, log logr.Logg job.Status.Reason = lmevalservicev1beta1.FailedReason job.Status.Message = err.Error() if err := r.Status().Update(ctx, job); err != nil { - log.Error(err, "unable to update EvalJob status", "state", job.Status.State) + log.Error(err, "unable to update LMEvalJob status", "state", job.Status.State) return ctrl.Result{}, err } r.Recorder.Event(job, "Warning", "PodMising", - fmt.Sprintf("the pod for the EvalJob %s in namespace %s is gone", + fmt.Sprintf("the pod for the LMEvalJob %s in namespace %s is gone", job.Name, job.Namespace)) log.Error(err, "since the job's pod is gone, mark the job as complete with error result.") @@ -285,39 +311,38 @@ func (r *EvalJobReconciler) checkScheduledPod(ctx context.Context, log logr.Logg if pod.Status.ContainerStatuses == nil { // wait for the pod to initialize and run the containers - return ctrl.Result{Requeue: true, RequeueAfter: time.Second * 10}, nil - } - - for _, cstatus := range pod.Status.ContainerStatuses { - if cstatus.Name == "main" { - if cstatus.LastTerminationState.Terminated == nil { - return ctrl.Result{Requeue: true, RequeueAfter: time.Second * 10}, nil - } else { - if cstatus.LastTerminationState.Terminated.ExitCode == 0 { - job.Status.State = lmevalservicev1beta1.CompleteJobState - job.Status.Reason = lmevalservicev1beta1.SucceedReason - } else { - job.Status.State = lmevalservicev1beta1.CompleteJobState - job.Status.Reason = lmevalservicev1beta1.FailedReason - job.Status.Message = cstatus.LastTerminationState.Terminated.Reason - - } - if err := r.Status().Update(ctx, job); err != nil { - log.Error(err, "unable to update EvalJob status", "state", job.Status.State) - return ctrl.Result{}, err - } - r.Recorder.Event(job, "Normal", "PodCompleted", - fmt.Sprintf("The pod for the EvalJob %s in namespace %s has completed", - job.Name, - job.Namespace)) - return ctrl.Result{}, nil - } - } + return ctrl.Result{Requeue: true, RequeueAfter: r.options.PodCheckingInterval}, nil + } + + mainIndex := slices.IndexFunc(pod.Status.ContainerStatuses, func(s corev1.ContainerStatus) bool { + return s.Name == "main" + }) + if mainIndex == -1 || pod.Status.ContainerStatuses[mainIndex].LastTerminationState.Terminated == nil { + // wait for the main container to finish + return ctrl.Result{Requeue: true, RequeueAfter: r.options.PodCheckingInterval}, nil + } + + // main container finished. update status + job.Status.State = lmevalservicev1beta1.CompleteJobState + if pod.Status.ContainerStatuses[mainIndex].LastTerminationState.Terminated.ExitCode == 0 { + job.Status.Reason = lmevalservicev1beta1.SucceedReason + } else { + job.Status.Reason = lmevalservicev1beta1.FailedReason + job.Status.Message = pod.Status.ContainerStatuses[mainIndex].LastTerminationState.Terminated.Reason } - return ctrl.Result{Requeue: true, RequeueAfter: time.Second * 10}, nil + + err = r.Status().Update(ctx, job) + if err != nil { + log.Error(err, "unable to update LMEvalJob status", "state", job.Status.State) + } + r.Recorder.Event(job, "Normal", "PodCompleted", + fmt.Sprintf("The pod for the LMEvalJob %s in namespace %s has completed", + job.Name, + job.Namespace)) + return ctrl.Result{}, err } -func (r *EvalJobReconciler) getPod(ctx context.Context, job *lmevalservicev1beta1.EvalJob) (*corev1.Pod, error) { +func (r *LMEvalJobReconciler) getPod(ctx context.Context, job *lmevalservicev1beta1.LMEvalJob) (*corev1.Pod, error) { var pod = corev1.Pod{} if err := r.Get(ctx, types.NamespacedName{Namespace: job.Namespace, Name: job.Name}, &pod); err != nil { return nil, err @@ -333,7 +358,7 @@ func (r *EvalJobReconciler) getPod(ctx context.Context, job *lmevalservicev1beta return nil, fmt.Errorf("pod doesn't have proper entry in the OwnerReferences") } -func (r *EvalJobReconciler) deleteJobPod(ctx context.Context, job *lmevalservicev1beta1.EvalJob) error { +func (r *LMEvalJobReconciler) deleteJobPod(ctx context.Context, job *lmevalservicev1beta1.LMEvalJob) error { pod := corev1.Pod{ TypeMeta: v1.TypeMeta{ Kind: "Pod", @@ -354,10 +379,10 @@ func (r *EvalJobReconciler) deleteJobPod(ctx context.Context, job *lmevalservice return r.Delete(ctx, &pod, &client.DeleteOptions{}) } -func (r *EvalJobReconciler) handleComplete(ctx context.Context, log logr.Logger, job *lmevalservicev1beta1.EvalJob) (ctrl.Result, error) { +func (r *LMEvalJobReconciler) handleComplete(ctx context.Context, log logr.Logger, job *lmevalservicev1beta1.LMEvalJob) (ctrl.Result, error) { if job.Status.CompleteTime == nil { r.Recorder.Event(job, "Normal", "JobCompleted", - fmt.Sprintf("Tthe EvalJob %s in namespace %s has completed", + fmt.Sprintf("The LMEvalJob %s in namespace %s has completed", job.Name, job.Namespace)) // TODO: final wrap up/clean up @@ -370,7 +395,7 @@ func (r *EvalJobReconciler) handleComplete(ctx context.Context, log logr.Logger, return ctrl.Result{}, nil } -func (r *EvalJobReconciler) handleCancel(ctx context.Context, log logr.Logger, job *lmevalservicev1beta1.EvalJob) (ctrl.Result, error) { +func (r *LMEvalJobReconciler) handleCancel(ctx context.Context, log logr.Logger, job *lmevalservicev1beta1.LMEvalJob) (ctrl.Result, error) { // delete the pod and update the state to complete if _, err := r.getPod(ctx, job); err != nil { // pod is gone. update status @@ -382,8 +407,8 @@ func (r *EvalJobReconciler) handleCancel(ctx context.Context, log logr.Logger, j job.Status.Reason = lmevalservicev1beta1.CancelledReason if err := r.deleteJobPod(ctx, job); err != nil { // leave the state as is and retry again - log.Error(err, "failed to delete pod. scheduled a retry after 10 seconds") - return ctrl.Result{Requeue: true, RequeueAfter: time.Second * 10}, err + log.Error(err, "failed to delete pod. scheduled a retry", "interval", r.options.PodCheckingInterval.String()) + return ctrl.Result{Requeue: true, RequeueAfter: r.options.PodCheckingInterval}, err } } @@ -392,13 +417,13 @@ func (r *EvalJobReconciler) handleCancel(ctx context.Context, log logr.Logger, j log.Error(err, "failed to update status for cancellation") } r.Recorder.Event(job, "Normal", "Cancelled", - fmt.Sprintf("Tthe EvalJob %s in namespace %s has cancelled and changed its state to Complete", + fmt.Sprintf("The LMEvalJob %s in namespace %s has cancelled and changed its state to Complete", job.Name, job.Namespace)) return ctrl.Result{}, err } -func (r *EvalJobReconciler) createPod(job *lmevalservicev1beta1.EvalJob) *corev1.Pod { +func (r *LMEvalJobReconciler) createPod(job *lmevalservicev1beta1.LMEvalJob) *corev1.Pod { var allowPrivilegeEscalation = false var runAsNonRootUser = true var ownerRefController = true @@ -431,7 +456,7 @@ func (r *EvalJobReconciler) createPod(job *lmevalservicev1beta1.EvalJob) *corev1 { Name: "driver", Image: r.options.DriverImage, - ImagePullPolicy: corev1.PullAlways, + ImagePullPolicy: r.options.ImagePullPolicy, Command: []string{DriverPath, "--copy", DestDriverPath}, SecurityContext: &corev1.SecurityContext{ AllowPrivilegeEscalation: &allowPrivilegeEscalation, @@ -454,7 +479,7 @@ func (r *EvalJobReconciler) createPod(job *lmevalservicev1beta1.EvalJob) *corev1 { Name: "main", Image: r.options.PodImage, - ImagePullPolicy: corev1.PullAlways, + ImagePullPolicy: r.options.ImagePullPolicy, Env: []corev1.EnvVar{ { Name: "GENAI_KEY", @@ -507,7 +532,7 @@ func (r *EvalJobReconciler) createPod(job *lmevalservicev1beta1.EvalJob) *corev1 return &pod } -func generateArgs(job *lmevalservicev1beta1.EvalJob) []string { +func generateArgs(job *lmevalservicev1beta1.LMEvalJob) []string { if job == nil { return nil } @@ -542,7 +567,7 @@ func generateArgs(job *lmevalservicev1beta1.EvalJob) []string { return []string{"sh", "-ec", strings.Join(cmds, " ")} } -func generateCmd(job *lmevalservicev1beta1.EvalJob) []string { +func generateCmd(job *lmevalservicev1beta1.LMEvalJob) []string { if job == nil { return nil } diff --git a/backend/controller/evaljob_controller_test.go b/backend/controller/lmevaljob_controller_test.go similarity index 74% rename from backend/controller/evaljob_controller_test.go rename to backend/controller/lmevaljob_controller_test.go index e7c1b0a..60ca8c3 100644 --- a/backend/controller/evaljob_controller_test.go +++ b/backend/controller/lmevaljob_controller_test.go @@ -30,7 +30,7 @@ import ( lmevalservicev1beta1 "github.com/foundation-model-stack/fms-lm-eval-service/api/v1beta1" ) -var _ = Describe("EvalJob Controller", func() { +var _ = Describe("LMEvalJob Controller", func() { Context("When reconciling a resource", func() { const resourceName = "test-resource" @@ -38,20 +38,20 @@ var _ = Describe("EvalJob Controller", func() { typeNamespacedName := types.NamespacedName{ Name: resourceName, - Namespace: "default", // TODO(user):Modify as needed + Namespace: "default", } - evaljob := &lmevalservicev1beta1.EvalJob{} + evaljob := &lmevalservicev1beta1.LMEvalJob{} BeforeEach(func() { - By("creating the custom resource for the Kind EvalJob") + By("creating the custom resource for the Kind LMEvalJob") err := k8sClient.Get(ctx, typeNamespacedName, evaljob) if err != nil && errors.IsNotFound(err) { - resource := &lmevalservicev1beta1.EvalJob{ + resource := &lmevalservicev1beta1.LMEvalJob{ ObjectMeta: metav1.ObjectMeta{ Name: resourceName, Namespace: "default", }, - Spec: lmevalservicev1beta1.EvalJobSpec{ + Spec: lmevalservicev1beta1.LMEvalJobSpec{ Model: "test", ModelArgs: []lmevalservicev1beta1.Arg{ {Name: "arg1", Value: "value1"}, @@ -64,17 +64,16 @@ var _ = Describe("EvalJob Controller", func() { }) AfterEach(func() { - // TODO(user): Cleanup logic after each test, like removing the resource instance. - resource := &lmevalservicev1beta1.EvalJob{} + resource := &lmevalservicev1beta1.LMEvalJob{} err := k8sClient.Get(ctx, typeNamespacedName, resource) Expect(err).NotTo(HaveOccurred()) - By("Cleanup the specific resource instance EvalJob") + By("Cleanup the specific resource instance LMEvalJob") Expect(k8sClient.Delete(ctx, resource)).To(Succeed()) }) It("should successfully reconcile the resource", func() { By("Reconciling the created resource") - controllerReconciler := &EvalJobReconciler{ + controllerReconciler := &LMEvalJobReconciler{ Client: k8sClient, Scheme: k8sClient.Scheme(), } @@ -83,8 +82,6 @@ var _ = Describe("EvalJob Controller", func() { NamespacedName: typeNamespacedName, }) Expect(err).NotTo(HaveOccurred()) - // TODO(user): Add more specific assertions depending on your controller's reconciliation logic. - // Example: If you expect a certain status condition after reconciliation, verify it here. }) }) }) diff --git a/backend/driver/driver.go b/backend/driver/driver.go index c0edefc..9dfcc99 100644 --- a/backend/driver/driver.go +++ b/backend/driver/driver.go @@ -60,7 +60,7 @@ type Driver interface { type driverImpl struct { client client.Client - job lmevalservicev1beta1.EvalJob + job lmevalservicev1beta1.LMEvalJob Option *DriverOption } @@ -183,7 +183,7 @@ func (d *driverImpl) updateCompleteStatus(err error) error { } else { // read the content of result*.json pattern := filepath.Join(d.Option.OutputPath, "result*.json") - filepath.WalkDir(d.Option.OutputPath, func(path string, dir fs.DirEntry, err error) error { + if err := filepath.WalkDir(d.Option.OutputPath, func(path string, dir fs.DirEntry, err error) error { if err != nil { return err } @@ -201,7 +201,9 @@ func (d *driverImpl) updateCompleteStatus(err error) error { } } return nil - }) + }); err != nil { + return err + } } if err := d.client.Status().Update(d.Option.Context, &d.job); err != nil { d.Option.Logger.Error(err, "unable to update EvalJob.Status.State to Complete") diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 9ba9d9b..42b4ce2 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -127,7 +127,7 @@ func main() { os.Exit(1) } - if err = (&controller.EvalJobReconciler{ + if err = (&controller.LMEvalJobReconciler{ ConfigMap: configMap, Namespace: namespace, Client: mgr.GetClient(), @@ -138,7 +138,7 @@ func main() { os.Exit(1) } if os.Getenv("ENABLE_WEBHOOKS") != "false" { - if err = (&lmevalservicev1beta1.EvalJob{}).SetupWebhookWithManager(mgr); err != nil { + if err = (&lmevalservicev1beta1.LMEvalJob{}).SetupWebhookWithManager(mgr); err != nil { setupLog.Error(err, "unable to create webhook", "webhook", "EvalJob") os.Exit(1) } diff --git a/config/crd/bases/lm-eval-service.github.com_evaljobs.yaml b/config/crd/bases/foundation-model-stack.github.com.github.com_lmevaljobs.yaml similarity index 90% rename from config/crd/bases/lm-eval-service.github.com_evaljobs.yaml rename to config/crd/bases/foundation-model-stack.github.com.github.com_lmevaljobs.yaml index f589330..c57d235 100644 --- a/config/crd/bases/lm-eval-service.github.com_evaljobs.yaml +++ b/config/crd/bases/foundation-model-stack.github.com.github.com_lmevaljobs.yaml @@ -4,20 +4,20 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.15.0 - name: evaljobs.lm-eval-service.github.com + name: lmevaljobs.foundation-model-stack.github.com.github.com spec: - group: lm-eval-service.github.com + group: foundation-model-stack.github.com.github.com names: - kind: EvalJob - listKind: EvalJobList - plural: evaljobs - singular: evaljob + kind: LMEvalJob + listKind: LMEvalJobList + plural: lmevaljobs + singular: lmevaljob scope: Namespaced versions: - name: v1beta1 schema: openAPIV3Schema: - description: EvalJob is the Schema for the evaljobs API + description: LMEvalJob is the Schema for the lmevaljobs API properties: apiVersion: description: |- @@ -37,7 +37,7 @@ spec: metadata: type: object spec: - description: EvalJobSpec defines the desired state of EvalJob + description: LMEvalJobSpec defines the desired state of LMEvalJob properties: genArgs: description: Map to `--gen_kwargs` parameter for the underlying library. @@ -90,7 +90,7 @@ spec: - tasks type: object status: - description: EvalJobStatus defines the observed state of EvalJob + description: LMEvalJobStatus defines the observed state of LMEvalJob properties: completeTime: description: Information when the job's state changes to Complete. diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml index d45e11a..4fcc06d 100644 --- a/config/crd/kustomization.yaml +++ b/config/crd/kustomization.yaml @@ -2,18 +2,18 @@ # since it depends on service name and namespace that are out of this kustomize package. # It should be run by config/default resources: -- bases/lm-eval-service.github.com_evaljobs.yaml +- bases/foundation-model-stack.github.com.github.com_lmevaljobs.yaml # +kubebuilder:scaffold:crdkustomizeresource patches: # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. # patches here are for enabling the conversion webhook for each CRD -- path: patches/webhook_in_evaljobs.yaml +- path: patches/webhook_in_lmevaljobs.yaml # +kubebuilder:scaffold:crdkustomizewebhookpatch # [CERTMANAGER] To enable cert-manager, uncomment all the sections with [CERTMANAGER] prefix. # patches here are for enabling the CA injection for each CRD -- path: patches/cainjection_in_evaljobs.yaml +- path: patches/cainjection_in_lmevaljobs.yaml # +kubebuilder:scaffold:crdkustomizecainjectionpatch # [WEBHOOK] To enable webhook, uncomment the following section diff --git a/config/crd/patches/cainjection_in_evaljobs.yaml b/config/crd/patches/cainjection_in_lmevaljobs.yaml similarity index 79% rename from config/crd/patches/cainjection_in_evaljobs.yaml rename to config/crd/patches/cainjection_in_lmevaljobs.yaml index 220fa67..31ec811 100644 --- a/config/crd/patches/cainjection_in_evaljobs.yaml +++ b/config/crd/patches/cainjection_in_lmevaljobs.yaml @@ -4,4 +4,4 @@ kind: CustomResourceDefinition metadata: annotations: cert-manager.io/inject-ca-from: CERTIFICATE_NAMESPACE/CERTIFICATE_NAME - name: evaljobs.lm-eval-service.github.com + name: lmevaljobs.foundation-model-stack.github.com.github.com diff --git a/config/crd/patches/webhook_in_evaljobs.yaml b/config/crd/patches/webhook_in_lmevaljobs.yaml similarity index 84% rename from config/crd/patches/webhook_in_evaljobs.yaml rename to config/crd/patches/webhook_in_lmevaljobs.yaml index f02f4a6..442eee7 100644 --- a/config/crd/patches/webhook_in_evaljobs.yaml +++ b/config/crd/patches/webhook_in_lmevaljobs.yaml @@ -2,7 +2,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: - name: evaljobs.lm-eval-service.github.com + name: lmevaljobs.foundation-model-stack.github.com.github.com spec: conversion: strategy: Webhook diff --git a/config/manager/configmap.yaml b/config/manager/configmap.yaml index 1c0e5f8..bd4cedb 100644 --- a/config/manager/configmap.yaml +++ b/config/manager/configmap.yaml @@ -3,6 +3,8 @@ data: driver-image: quay.io/yhwang/lm-eval-aas-driver:latest pod-image: quay.io/yhwang/lm-eval-aas-flask:test driver-serviceaccount: driver + pod-checking-interval: "10s" + image-pull-policy: Always kind: ConfigMap metadata: name: configmap diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index c22927b..81bcf07 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -28,37 +28,10 @@ spec: labels: control-plane: controller-manager spec: - # TODO(user): Uncomment the following code to configure the nodeAffinity expression - # according to the platforms which are supported by your solution. - # It is considered best practice to support multiple architectures. You can - # build your manager image using the makefile target docker-buildx. - # affinity: - # nodeAffinity: - # requiredDuringSchedulingIgnoredDuringExecution: - # nodeSelectorTerms: - # - matchExpressions: - # - key: kubernetes.io/arch - # operator: In - # values: - # - amd64 - # - arm64 - # - ppc64le - # - s390x - # - key: kubernetes.io/os - # operator: In - # values: - # - linux securityContext: runAsNonRoot: true seccompProfile: type: RuntimeDefault - # TODO(user): For common cases that do not require escalating privileges - # it is recommended to ensure that all your Pods/Containers are restrictive. - # More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted - # Please uncomment the following code if your project does NOT have to work on old Kubernetes - # versions < 1.19 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ). - # seccompProfile: - # type: RuntimeDefault containers: - command: - /bin/manager @@ -93,8 +66,6 @@ spec: port: 8081 initialDelaySeconds: 5 periodSeconds: 10 - # TODO(user): Configure the resources accordingly based on the project requirements. - # More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ resources: limits: cpu: 500m diff --git a/config/rbac/driver_role.yaml b/config/rbac/driver_role.yaml index 9bb50ef..6ea0f9b 100644 --- a/config/rbac/driver_role.yaml +++ b/config/rbac/driver_role.yaml @@ -19,9 +19,9 @@ rules: - list - watch - apiGroups: - - lm-eval-service.github.com + - foundation-model-stack.github.com.github.com resources: - - evaljobs + - lmevaljobs verbs: - get - list @@ -29,9 +29,9 @@ rules: - update - watch - apiGroups: - - lm-eval-service.github.com + - foundation-model-stack.github.com.github.com resources: - - evaljobs/status + - lmevaljobs/status verbs: - get - patch diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml index be4ca2a..73df365 100644 --- a/config/rbac/kustomization.yaml +++ b/config/rbac/kustomization.yaml @@ -16,6 +16,6 @@ resources: # default, aiding admins in cluster management. Those roles are # not used by the Project itself. You can comment the following lines # if you do not want those helpers be installed with your Project. -- evaljob_editor_role.yaml -- evaljob_viewer_role.yaml +- lmevaljob_editor_role.yaml +- lmevaljob_viewer_role.yaml diff --git a/config/rbac/evaljob_editor_role.yaml b/config/rbac/lmevaljob_editor_role.yaml similarity index 60% rename from config/rbac/evaljob_editor_role.yaml rename to config/rbac/lmevaljob_editor_role.yaml index 0cef639..815def1 100644 --- a/config/rbac/evaljob_editor_role.yaml +++ b/config/rbac/lmevaljob_editor_role.yaml @@ -1,16 +1,16 @@ -# permissions for end users to edit evaljobs. +# permissions for end users to edit lmevaljobs. apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: app.kubernetes.io/name: fms-lm-eval-service app.kubernetes.io/managed-by: kustomize - name: evaljob-editor-role + name: lmevaljob-editor-role rules: - apiGroups: - - lm-eval-service.github.com + - foundation-model-stack.github.com.github.com resources: - - evaljobs + - lmevaljobs verbs: - create - delete @@ -20,8 +20,8 @@ rules: - update - watch - apiGroups: - - lm-eval-service.github.com + - foundation-model-stack.github.com.github.com resources: - - evaljobs/status + - lmevaljobs/status verbs: - get diff --git a/config/rbac/evaljob_viewer_role.yaml b/config/rbac/lmevaljob_viewer_role.yaml similarity index 57% rename from config/rbac/evaljob_viewer_role.yaml rename to config/rbac/lmevaljob_viewer_role.yaml index 5633edb..6396be9 100644 --- a/config/rbac/evaljob_viewer_role.yaml +++ b/config/rbac/lmevaljob_viewer_role.yaml @@ -1,23 +1,23 @@ -# permissions for end users to view evaljobs. +# permissions for end users to view lmevaljobs. apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: app.kubernetes.io/name: fms-lm-eval-service app.kubernetes.io/managed-by: kustomize - name: evaljob-viewer-role + name: lmevaljob-viewer-role rules: - apiGroups: - - lm-eval-service.github.com + - foundation-model-stack.github.com.github.com resources: - - evaljobs + - lmevaljobs verbs: - get - list - watch - apiGroups: - - lm-eval-service.github.com + - foundation-model-stack.github.com.github.com resources: - - evaljobs/status + - lmevaljobs/status verbs: - get diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index e706aa3..e083a48 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -23,9 +23,9 @@ rules: - list - watch - apiGroups: - - lm-eval-service.github.com + - foundation-model-stack.github.com.github.com resources: - - evaljobs + - lmevaljobs verbs: - create - delete @@ -35,15 +35,15 @@ rules: - update - watch - apiGroups: - - lm-eval-service.github.com + - foundation-model-stack.github.com.github.com resources: - - evaljobs/finalizers + - lmevaljobs/finalizers verbs: - update - apiGroups: - - lm-eval-service.github.com + - foundation-model-stack.github.com.github.com resources: - - evaljobs/status + - lmevaljobs/status verbs: - get - patch diff --git a/config/samples/lm-eval-service_v1beta1_evaljob.yaml b/config/samples/lm-eval-service_v1beta1_evaljob.yaml index 659af2c..5818bc1 100644 --- a/config/samples/lm-eval-service_v1beta1_evaljob.yaml +++ b/config/samples/lm-eval-service_v1beta1_evaljob.yaml @@ -1,5 +1,5 @@ -apiVersion: lm-eval-service.github.com/v1beta1 -kind: EvalJob +apiVersion: foundation-model-stack.github.com.github.com/v1beta1 +kind: LMEvalJob metadata: labels: app.kubernetes.io/name: fms-lm-eval-service diff --git a/config/webhook/manifests.yaml b/config/webhook/manifests.yaml index 58f2dff..897fefc 100644 --- a/config/webhook/manifests.yaml +++ b/config/webhook/manifests.yaml @@ -10,19 +10,19 @@ webhooks: service: name: webhook-service namespace: system - path: /mutate-lm-eval-service-github-com-v1beta1-evaljob + path: /mutate-foundation-model-stack-github-com-github-com-v1beta1-lmevaljob failurePolicy: Fail - name: mevaljob.kb.io + name: mlmevaljob.kb.io rules: - apiGroups: - - lm-eval-service.github.com + - foundation-model-stack.github.com.github.com apiVersions: - v1beta1 operations: - CREATE - UPDATE resources: - - evaljobs + - lmevaljobs sideEffects: None --- apiVersion: admissionregistration.k8s.io/v1 @@ -36,17 +36,17 @@ webhooks: service: name: webhook-service namespace: system - path: /validate-lm-eval-service-github-com-v1beta1-evaljob + path: /validate-foundation-model-stack-github-com-github-com-v1beta1-lmevaljob failurePolicy: Fail - name: vevaljob.kb.io + name: vlmevaljob.kb.io rules: - apiGroups: - - lm-eval-service.github.com + - foundation-model-stack.github.com.github.com apiVersions: - v1beta1 operations: - CREATE - UPDATE resources: - - evaljobs + - lmevaljobs sideEffects: None