diff --git a/.github/workflows/image-push-release.yml b/.github/workflows/image-push-release.yml index 4b061f5..042505f 100644 --- a/.github/workflows/image-push-release.yml +++ b/.github/workflows/image-push-release.yml @@ -3,14 +3,17 @@ on: push: tags: - v* +env: + REGISTRY: "ghcr.io" + OPERATOR_IMAGE_NAME: "maintenance-operator" jobs: image-build-push: name: Image build and push runs-on: ubuntu-latest steps: - - name: Set repository as lower-case output variable - id: repo_name - run: echo ::set-output name=repository::$(echo ${{ github.repository }} | tr '[:upper:]' '[:lower:]') + - name: Set repository owner as lower-case output variable + id: repo_owner + run: echo REPO_OWNER=$(echo ${{ github.repository_owner }} | tr '[:upper:]' '[:lower:]') >> $GITHUB_ENV - name: Check out code into the Go module directory uses: actions/checkout@v4 with: @@ -29,7 +32,7 @@ jobs: id: docker_meta uses: docker/metadata-action@v5 with: - images: ghcr.io/${{ steps.repo_name.outputs.repository }} + images: ${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.OPERATOR_IMAGE_NAME }} tags: | type=ref,event=tag flavor: | @@ -44,3 +47,23 @@ jobs: ${{ steps.docker_meta.outputs.tags }} labels: ${{ steps.docker_meta.outputs.labels }} file: ./Dockerfile + - name: Determine version, tag, and base branch + run: | + git_tag="$GITHUB_REF_NAME" # runner-provided env var; avoids expression-interpolating the tag name into the script + echo VERSION_WITH_PREFIX=$git_tag >> $GITHUB_ENV + echo VERSION_WITHOUT_PREFIX=${git_tag:1} >> $GITHUB_ENV # without the 'v' prefix + - name: Lookup image digest + run: | + operator_digest=$(skopeo inspect docker://$REGISTRY/$REPO_OWNER/$OPERATOR_IMAGE_NAME:$VERSION_WITH_PREFIX | jq -r .Digest) + [[ -n "$operator_digest" && "$operator_digest" != "null" ]] # verifies value is neither empty nor jq's literal null + echo OPERATOR_DIGEST=$operator_digest >> $GITHUB_ENV + - name: Make bundle + env: + IMG: ${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.OPERATOR_IMAGE_NAME }}@${{ env.OPERATOR_DIGEST }} + BUNDLE_IMG: ${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.OPERATOR_IMAGE_NAME }}-bundle:${{ 
env.VERSION_WITH_PREFIX }} + VERSION: ${{ env.VERSION_WITHOUT_PREFIX }} + run: | + version_major_minor=$(echo $VERSION_WITH_PREFIX | grep -Eo 'v[0-9]+\.[0-9]+') + export CHANNELS=stable,$version_major_minor + export DEFAULT_CHANNEL=$version_major_minor + make bundle bundle-build bundle-push diff --git a/Makefile b/Makefile index 3506323..89c4ac7 100644 --- a/Makefile +++ b/Makefile @@ -49,6 +49,8 @@ ifeq ($(USE_IMAGE_DIGESTS), true) BUNDLE_GEN_FLAGS += --use-image-digests endif +BUNDLE_OCP_VERSIONS=v4.14-v4.17 + # Set the Operator SDK version to use. By default, what is installed on the system is used. # This is useful for CI or a project to utilize a specific version of the operator-sdk toolkit. OPERATOR_SDK_VERSION ?= v1.35.0 @@ -410,10 +412,11 @@ undeploy-operator-e2e: helm ## Undeploy operator from test cluster ##@ Build Dependencies .PHONY: bundle -bundle: manifests kustomize operator-sdk ## Generate bundle manifests and metadata, then validate generated files. +bundle: manifests kustomize operator-sdk $(YQ) ## Generate bundle manifests and metadata, then validate generated files. 
$(OPERATOR_SDK) generate kustomize manifests -q cd config/manager && $(KUSTOMIZE) edit set image controller=$(IMG) $(KUSTOMIZE) build config/manifests | $(OPERATOR_SDK) generate bundle $(BUNDLE_GEN_FLAGS) + BUNDLE_OCP_VERSIONS=$(BUNDLE_OCP_VERSIONS) TAG=$(IMG) hack/scripts/ocp-bundle-postprocess.sh $(OPERATOR_SDK) bundle validate ./bundle .PHONY: bundle-build diff --git a/PROJECT b/PROJECT index ef9cd94..5cf4ef9 100644 --- a/PROJECT +++ b/PROJECT @@ -8,7 +8,7 @@ layout: plugins: manifests.sdk.operatorframework.io/v2: {} scorecard.sdk.operatorframework.io/v2: {} -projectName: maintenance-operator +projectName: nvidia-maintenance-operator repo: github.com/Mellanox/maintenance-operator resources: - api: diff --git a/api/v1alpha1/maintenanceoperatorconfig_types.go b/api/v1alpha1/maintenanceoperatorconfig_types.go index 68a46ac..aa1e235 100644 --- a/api/v1alpha1/maintenanceoperatorconfig_types.go +++ b/api/v1alpha1/maintenanceoperatorconfig_types.go @@ -58,14 +58,19 @@ type MaintenanceOperatorConfigSpec struct { MaxNodeMaintenanceTimeSeconds int32 `json:"maxNodeMaintenanceTimeSeconds,omitempty"` } +type MaintenanceOperatorConfigStatus struct { +} + //+kubebuilder:object:root=true +//+kubebuilder:subresource:status // MaintenanceOperatorConfig is the Schema for the maintenanceoperatorconfigs API type MaintenanceOperatorConfig struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` - Spec MaintenanceOperatorConfigSpec `json:"spec,omitempty"` + Spec MaintenanceOperatorConfigSpec `json:"spec,omitempty"` + Status MaintenanceOperatorConfigStatus `json:"status,omitempty"` } //+kubebuilder:object:root=true diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 65b4c55..0e7310e 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -74,6 +74,7 @@ func (in *MaintenanceOperatorConfig) DeepCopyInto(out *MaintenanceOperatorConfig out.TypeMeta = in.TypeMeta 
in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) in.Spec.DeepCopyInto(&out.Spec) + out.Status = in.Status } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MaintenanceOperatorConfig. @@ -151,6 +152,21 @@ func (in *MaintenanceOperatorConfigSpec) DeepCopy() *MaintenanceOperatorConfigSp return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MaintenanceOperatorConfigStatus) DeepCopyInto(out *MaintenanceOperatorConfigStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MaintenanceOperatorConfigStatus. +func (in *MaintenanceOperatorConfigStatus) DeepCopy() *MaintenanceOperatorConfigStatus { + if in == nil { + return nil + } + out := new(MaintenanceOperatorConfigStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *NodeMaintenance) DeepCopyInto(out *NodeMaintenance) { *out = *in diff --git a/config/rbac/auth_proxy_service.yaml b/bundle/manifests/maintenance-operator-webhook-service_v1_service.yaml similarity index 54% rename from config/rbac/auth_proxy_service.yaml rename to bundle/manifests/maintenance-operator-webhook-service_v1_service.yaml index 5a62b50..fdf560d 100644 --- a/config/rbac/auth_proxy_service.yaml +++ b/bundle/manifests/maintenance-operator-webhook-service_v1_service.yaml @@ -1,21 +1,21 @@ apiVersion: v1 kind: Service metadata: + creationTimestamp: null labels: - control-plane: controller-manager - app.kubernetes.io/name: service - app.kubernetes.io/instance: controller-manager-metrics-service - app.kubernetes.io/component: kube-rbac-proxy + app.kubernetes.io/component: webhook app.kubernetes.io/created-by: maintenance-operator - app.kubernetes.io/part-of: maintenance-operator + app.kubernetes.io/instance: webhook-service app.kubernetes.io/managed-by: kustomize - name: controller-manager-metrics-service - namespace: system + app.kubernetes.io/name: service + app.kubernetes.io/part-of: maintenance-operator + name: maintenance-operator-webhook-service spec: ports: - - name: https - port: 8443 + - port: 443 protocol: TCP - targetPort: https + targetPort: 9443 selector: control-plane: controller-manager +status: + loadBalancer: {} diff --git a/bundle/manifests/maintenance.nvidia.com_maintenanceoperatorconfigs.yaml b/bundle/manifests/maintenance.nvidia.com_maintenanceoperatorconfigs.yaml new file mode 100644 index 0000000..c5c96be --- /dev/null +++ b/bundle/manifests/maintenance.nvidia.com_maintenanceoperatorconfigs.yaml @@ -0,0 +1,100 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + cert-manager.io/inject-ca-from: maintenance-operator-system/maintenance-operator-serving-cert + controller-gen.kubebuilder.io/version: v0.15.0 + creationTimestamp: null + name: 
maintenanceoperatorconfigs.maintenance.nvidia.com +spec: + group: maintenance.nvidia.com + names: + kind: MaintenanceOperatorConfig + listKind: MaintenanceOperatorConfigList + plural: maintenanceoperatorconfigs + singular: maintenanceoperatorconfig + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: MaintenanceOperatorConfig is the Schema for the maintenanceoperatorconfigs + API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: MaintenanceOperatorConfigSpec defines the desired state of + MaintenanceOperatorConfig + properties: + logLevel: + default: info + description: LogLevel is the operator logging level + enum: + - debug + - info + - error + type: string + maxNodeMaintenanceTimeSeconds: + default: 1600 + description: |- + MaxNodeMaintenanceTimeSeconds is the time from when a NodeMaintenance is marked as ready (phase: Ready) + until the NodeMaintenance is considered stale and removed by the operator. + should be less than idle time for any autoscaler that is running. 
+ default to 30m (1600 seconds) + format: int32 + minimum: 0 + type: integer + maxParallelOperations: + anyOf: + - type: integer + - type: string + default: 1 + description: |- + MaxParallelOperations indicates the maximal number nodes that can undergo maintenance + at a given time. 0 means no limit + value can be an absolute number (ex: 5) or a percentage of total nodes in the cluster (ex: 10%). + absolute number is calculated from percentage by rounding up. + defaults to 1. The actual number of nodes that can undergo maintenance may be lower depending + on the value of MaintenanceOperatorConfigSpec.MaxUnavailable. + x-kubernetes-int-or-string: true + maxUnavailable: + anyOf: + - type: integer + - type: string + description: |- + MaxUnavailable is the maximum number of nodes that can become unavailable in the cluster. + value can be an absolute number (ex: 5) or a percentage of total nodes in the cluster (ex: 10%). + absolute number is calculated from percentage by rounding up. + by default, unset. 
+ new nodes will not be processed if the number of unavailable node will exceed this value + x-kubernetes-int-or-string: true + type: object + status: + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: null + storedVersions: null diff --git a/bundle/manifests/maintenance.nvidia.com_nodemaintenances.yaml b/bundle/manifests/maintenance.nvidia.com_nodemaintenances.yaml new file mode 100644 index 0000000..0723f3d --- /dev/null +++ b/bundle/manifests/maintenance.nvidia.com_nodemaintenances.yaml @@ -0,0 +1,292 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + cert-manager.io/inject-ca-from: maintenance-operator-system/maintenance-operator-serving-cert + controller-gen.kubebuilder.io/version: v0.15.0 + creationTimestamp: null + name: nodemaintenances.maintenance.nvidia.com +spec: + group: maintenance.nvidia.com + names: + kind: NodeMaintenance + listKind: NodeMaintenanceList + plural: nodemaintenances + singular: nodemaintenance + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.nodeName + name: Node + type: string + - jsonPath: .spec.requestorID + name: Requestor + type: string + - jsonPath: .status.conditions[?(@.type=='Ready')].status + name: Ready + type: string + - jsonPath: .status.conditions[?(@.type=='Ready')].reason + name: Phase + type: string + - jsonPath: .status.conditions[?(@.type=='Failed')].reason + name: Failed + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + description: NodeMaintenance is the Schema for the nodemaintenances API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: NodeMaintenanceSpec defines the desired state of NodeMaintenance + properties: + additionalRequestors: + description: |- + AdditionalRequestors is a set of additional requestor IDs which are using the same NodeMaintenance + request. addition or removal of requiestor IDs to this list MUST be made with update operation (and retry on failure) + which will replace the entire list. + items: + type: string + type: array + x-kubernetes-list-type: set + cordon: + default: true + description: Cordon if set, marks node as unschedulable during maintenance + operation + type: boolean + drainSpec: + description: DrainSpec specifies how a node will be drained. if not + provided, no draining will be performed. + properties: + deleteEmptyDir: + default: false + description: |- + DeleteEmptyDir indicates if should continue even if there are pods using emptyDir + (local data that will be deleted when the node is drained) + type: boolean + force: + default: false + description: Force draining even if there are pods that do not + declare a controller + type: boolean + podEvictionFilters: + description: |- + PodEvictionFilters specifies filters for pods that need to undergo eviction during drain. + if specified. only pods that match PodEvictionFilters will be evicted during drain operation. + if unspecified. all non-daemonset pods will be evicted. + logical OR is performed between filter entires. logical AND is performed within different filters + in a filter entry. 
+ items: + description: PodEvictionFiterEntry defines filters for Pod evictions + during drain operation + properties: + byResourceNameRegex: + description: ByResourceNameRegex filters pods by the name + of the resources they consume using regex. + type: string + type: object + type: array + podSelector: + description: |- + PodSelector specifies a label selector to filter pods on the node that need to be drained + For more details on label selectors, see: + https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + type: string + timeoutSeconds: + default: 300 + description: TimeoutSecond specifies the length of time in seconds + to wait before giving up drain, zero means infinite + format: int32 + minimum: 0 + type: integer + type: object + nodeName: + description: |- + NodeName is The name of the node that maintenance operation will be performed on + creation fails if node obj does not exist (webhook) + type: string + x-kubernetes-validations: + - message: Value is immutable + rule: self == oldSelf + requestorID: + description: |- + RequestorID MUST follow domain name notation format (https://tools.ietf.org/html/rfc1035#section-2.3.1) + It MUST be 63 characters or less, beginning and ending with an alphanumeric + character ([a-z0-9A-Z]) with dashes (-), dots (.), and alphanumerics between. + caller SHOULD NOT create multiple objects with same requestorID and nodeName. + This field identifies the requestor of the operation. 
+ maxLength: 63 + minLength: 2 + pattern: ^([a-z0-9A-Z]([-a-z0-9A-Z]*[a-z0-9A-Z])?(\.[a-z0-9A-Z]([-a-z0-9A-Z]*[a-z0-9A-Z])?)*)$ + type: string + x-kubernetes-validations: + - message: Value is immutable + rule: self == oldSelf + waitForPodCompletion: + description: |- + WaitForPodCompletion specifies pods via selector to wait for completion before performing drain operation + if not provided, will not wait for pods to complete + properties: + podSelector: + description: |- + PodSelector specifies a label selector for the pods to wait for completion + For more details on label selectors, see: + https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + example: app=my-workloads + type: string + timeoutSeconds: + default: 0 + description: |- + TimeoutSecond specifies the length of time in seconds + to wait before giving up on pod termination, zero means infinite + format: int32 + minimum: 0 + type: integer + type: object + required: + - nodeName + - requestorID + type: object + status: + description: NodeMaintenanceStatus defines the observed state of NodeMaintenance + properties: + conditions: + description: Conditions represents observations of NodeMaintenance + current state + items: + description: "Condition contains details for one aspect of the current + state of this API Resource.\n---\nThis struct is intended for + direct use as an array at the field path .status.conditions. 
For + example,\n\n\n\ttype FooStatus struct{\n\t // Represents the + observations of a foo's current state.\n\t // Known .status.conditions.type + are: \"Available\", \"Progressing\", and \"Degraded\"\n\t // + +patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t + \ // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\" + patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t + \ // other fields\n\t}" + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. 
+ enum: + - "True" + - "False" + - Unknown + type: string + type: + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. + The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + drain: + description: Drain represents the drain status of the node + properties: + drainProgress: + description: DrainProgress represents the draining progress as + percentage + format: int32 + minimum: 0 + type: integer + evictionPods: + description: EvictionPods is the total number of pods that need + to be evicted at the time NodeMaintenance started draining + format: int32 + minimum: 0 + type: integer + totalPods: + description: TotalPods is the number of pods on the node at the + time NodeMaintenance started draining + format: int32 + minimum: 0 + type: integer + waitForEviction: + description: WaitForEviction is the list of namespaced named pods + that need to be evicted + items: + type: string + type: array + required: + - drainProgress + - evictionPods + - totalPods + type: object + waitForCompletion: + description: WaitForCompletion is the list of namespaced named pods + that we wait to complete + items: + type: string + type: array + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: null + storedVersions: null diff --git a/bundle/manifests/nvidia-maintenance-operator.clusterserviceversion.yaml 
b/bundle/manifests/nvidia-maintenance-operator.clusterserviceversion.yaml new file mode 100644 index 0000000..f1bae75 --- /dev/null +++ b/bundle/manifests/nvidia-maintenance-operator.clusterserviceversion.yaml @@ -0,0 +1,376 @@ +apiVersion: operators.coreos.com/v1alpha1 +kind: ClusterServiceVersion +metadata: + annotations: + alm-examples: |- + [ + { + "apiVersion": "maintenance.nvidia.com/v1alpha1", + "kind": "MaintenanceOperatorConfig", + "metadata": { + "labels": { + "app.kubernetes.io/created-by": "maintenance-operator", + "app.kubernetes.io/instance": "maintenanceoperatorconfig-sample", + "app.kubernetes.io/managed-by": "kustomize", + "app.kubernetes.io/name": "maintenanceoperatorconfig", + "app.kubernetes.io/part-of": "maintenance-operator" + }, + "name": "maintenanceoperatorconfig-sample" + }, + "spec": { + "logLevel": "info", + "maxNodeMaintenanceTimeSeconds": 1600, + "maxParallelOperations": 1, + "maxUnavailable": "30%" + } + }, + { + "apiVersion": "maintenance.nvidia.com/v1alpha1", + "kind": "NodeMaintenance", + "metadata": { + "labels": { + "app.kubernetes.io/created-by": "maintenance-operator", + "app.kubernetes.io/instance": "nodemaintenance-sample", + "app.kubernetes.io/managed-by": "kustomize", + "app.kubernetes.io/name": "nodemaintenance", + "app.kubernetes.io/part-of": "maintenance-operator" + }, + "name": "nodemaintenance-sample" + }, + "spec": { + "cordon": true, + "drainSpec": { + "deleteEmptyDir": true, + "force": true, + "podEvictionFilters": [ + { + "byResourceNameRegex": "nvidia.com/gpu-*" + }, + { + "byResourceNameRegex": "nvidia.com/rdma*" + } + ], + "podSelector": "app=rdma-workload", + "timeoutSeconds": 0 + }, + "nodeName": "worker-node-01", + "requestorID": "some-cluster-component.vendor.com", + "waitForPodCompletion": { + "podSelector": "app=rdma-workload", + "timeoutSeconds": 0 + } + } + } + ] + capabilities: Basic Install + createdAt: "2025-01-07T08:19:53Z" + description: Node maintenance in K8s cluster in a coordinated manner + 
features.operators.openshift.io/cnf: "false" + features.operators.openshift.io/cni: "false" + features.operators.openshift.io/csi: "false" + features.operators.openshift.io/disconnected: "true" + features.operators.openshift.io/fips-compliant: "false" + features.operators.openshift.io/proxy-aware: "true" + features.operators.openshift.io/tls-profiles: "false" + features.operators.openshift.io/token-auth-aws: "false" + features.operators.openshift.io/token-auth-azure: "false" + features.operators.openshift.io/token-auth-gcp: "false" + operatorframework.io/suggested-namespace: nvidia-maintenance-operator + operators.operatorframework.io/builder: operator-sdk-v1.37.0 + operators.operatorframework.io/project_layout: go.kubebuilder.io/v4 + provider: NVIDIA + repository: https://github.com/Mellanox/maintenance-operator/ + support: NVIDIA + containerImage: ghcr.io/mellanox/maintenance-operator@sha256:7a9bb354429ab982a056088c1bd1d221063502970a4d5590602b7f133f5f531c + labels: + operatorframework.io/arch.amd64: supported + operatorframework.io/arch.arm64: supported + name: nvidia-maintenance-operator.v0.1.1 + namespace: placeholder +spec: + apiservicedefinitions: {} + customresourcedefinitions: + owned: + - description: MaintenanceOperatorConfig is the Schema for the maintenanceoperatorconfigs API + displayName: Maintenance Operator Config + kind: MaintenanceOperatorConfig + name: maintenanceoperatorconfigs.maintenance.nvidia.com + version: v1alpha1 + - description: NodeMaintenance is the Schema for the nodemaintenances API + displayName: Node Maintenance + kind: NodeMaintenance + name: nodemaintenances.maintenance.nvidia.com + version: v1alpha1 + description: Coordinates And Performs Common Node Preparations For Maintenance + displayName: NVIDIA Maintenance Operator + icon: + - base64data: "" + mediatype: "" + install: + spec: + clusterPermissions: + - rules: + - apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - update + - apiGroups: + - "" + 
resources: + - nodes + verbs: + - get + - list + - patch + - update + - watch + - apiGroups: + - "" + resources: + - pods + verbs: + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - "" + resources: + - pods/eviction + verbs: + - create + - delete + - get + - list + - patch + - update + - apiGroups: + - apps + resources: + - daemonsets + verbs: + - get + - list + - watch + - apiGroups: + - maintenance.nvidia.com + resources: + - maintenanceoperatorconfigs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - maintenance.nvidia.com + resources: + - maintenanceoperatorconfigs/finalizers + verbs: + - update + - apiGroups: + - maintenance.nvidia.com + resources: + - maintenanceoperatorconfigs/status + verbs: + - get + - patch + - update + - apiGroups: + - maintenance.nvidia.com + resources: + - nodemaintenances + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - maintenance.nvidia.com + resources: + - nodemaintenances/finalizers + verbs: + - update + - apiGroups: + - maintenance.nvidia.com + resources: + - nodemaintenances/status + verbs: + - get + - patch + - update + serviceAccountName: maintenance-operator-controller-manager + deployments: + - label: + app.kubernetes.io/component: manager + app.kubernetes.io/created-by: maintenance-operator + app.kubernetes.io/instance: controller-manager + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: deployment + app.kubernetes.io/part-of: maintenance-operator + control-plane: controller-manager + name: maintenance-operator-controller-manager + spec: + replicas: 1 + selector: + matchLabels: + control-plane: controller-manager + strategy: {} + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + control-plane: controller-manager + spec: + containers: + - args: + - --leader-elect + command: + - /manager + env: + - name: OPERATOR_NAMESPACE + valueFrom: + 
fieldRef: + fieldPath: metadata.namespace + - name: ENABLE_WEBHOOKS + value: "true" + image: ghcr.io/mellanox/maintenance-operator@sha256:7a9bb354429ab982a056088c1bd1d221063502970a4d5590602b7f133f5f531c + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 10m + memory: 64Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumeMounts: + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + serviceAccountName: maintenance-operator-controller-manager + terminationGracePeriodSeconds: 10 + volumes: + - name: cert + secret: + defaultMode: 420 + secretName: webhook-server-cert + permissions: + - rules: + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + serviceAccountName: maintenance-operator-controller-manager + strategy: deployment + installModes: + - supported: true + type: OwnNamespace + - supported: false + type: SingleNamespace + - supported: false + type: MultiNamespace + - supported: true + type: AllNamespaces + keywords: + - node + - maintenance + - drain + links: + - name: NVIDIA Maintenance Operator + url: https://github.com/Mellanox/maintenance-operator/ + maintainers: + - email: nvidia-network-operator-support@nvidia.com + name: NVIDIA + maturity: alpha + minKubeVersion: 1.30.0 + provider: + name: NVIDIA + url: 
https://github.com/Mellanox/maintenance-operator/ + version: 0.1.1 + webhookdefinitions: + - admissionReviewVersions: + - v1 + containerPort: 443 + deploymentName: maintenance-operator-controller-manager + failurePolicy: Fail + generateName: vnodemaintenance.kb.io + rules: + - apiGroups: + - maintenance.nvidia.com + apiVersions: + - v1alpha1 + operations: + - CREATE + resources: + - nodemaintenances + sideEffects: None + targetPort: 9443 + type: ValidatingAdmissionWebhook + webhookPath: /validate-maintenance-nvidia-com-v1alpha1-nodemaintenance + relatedImages: + - name: nvidia-maintenance-operator + image: ghcr.io/mellanox/maintenance-operator@sha256:7a9bb354429ab982a056088c1bd1d221063502970a4d5590602b7f133f5f531c diff --git a/bundle/metadata/annotations.yaml b/bundle/metadata/annotations.yaml new file mode 100644 index 0000000..32a71b9 --- /dev/null +++ b/bundle/metadata/annotations.yaml @@ -0,0 +1,16 @@ +annotations: + # Core bundle annotations. + operators.operatorframework.io.bundle.mediatype.v1: registry+v1 + operators.operatorframework.io.bundle.manifests.v1: manifests/ + operators.operatorframework.io.bundle.metadata.v1: metadata/ + operators.operatorframework.io.bundle.package.v1: nvidia-maintenance-operator + operators.operatorframework.io.bundle.channels.v1: v0.1.1,stable + operators.operatorframework.io.bundle.channel.default.v1: v0.1.1 + operators.operatorframework.io.metrics.builder: operator-sdk-v1.37.0 + operators.operatorframework.io.metrics.mediatype.v1: metrics+v1 + operators.operatorframework.io.metrics.project_layout: go.kubebuilder.io/v4 + + # Annotations for testing. 
+ operators.operatorframework.io.test.mediatype.v1: scorecard+v1 + operators.operatorframework.io.test.config.v1: tests/scorecard/ + com.redhat.openshift.versions: v4.14-v4.17 diff --git a/bundle/tests/scorecard/config.yaml b/bundle/tests/scorecard/config.yaml new file mode 100644 index 0000000..4e32de1 --- /dev/null +++ b/bundle/tests/scorecard/config.yaml @@ -0,0 +1,70 @@ +apiVersion: scorecard.operatorframework.io/v1alpha3 +kind: Configuration +metadata: + name: config +stages: +- parallel: true + tests: + - entrypoint: + - scorecard-test + - basic-check-spec + image: quay.io/operator-framework/scorecard-test:v1.35.0 + labels: + suite: basic + test: basic-check-spec-test + storage: + spec: + mountPath: {} + - entrypoint: + - scorecard-test + - olm-bundle-validation + image: quay.io/operator-framework/scorecard-test:v1.35.0 + labels: + suite: olm + test: olm-bundle-validation-test + storage: + spec: + mountPath: {} + - entrypoint: + - scorecard-test + - olm-crds-have-validation + image: quay.io/operator-framework/scorecard-test:v1.35.0 + labels: + suite: olm + test: olm-crds-have-validation-test + storage: + spec: + mountPath: {} + - entrypoint: + - scorecard-test + - olm-crds-have-resources + image: quay.io/operator-framework/scorecard-test:v1.35.0 + labels: + suite: olm + test: olm-crds-have-resources-test + storage: + spec: + mountPath: {} + - entrypoint: + - scorecard-test + - olm-spec-descriptors + image: quay.io/operator-framework/scorecard-test:v1.35.0 + labels: + suite: olm + test: olm-spec-descriptors-test + storage: + spec: + mountPath: {} + - entrypoint: + - scorecard-test + - olm-status-descriptors + image: quay.io/operator-framework/scorecard-test:v1.35.0 + labels: + suite: olm + test: olm-status-descriptors-test + storage: + spec: + mountPath: {} +storage: + spec: + mountPath: {} diff --git a/config/crd/bases/maintenance.nvidia.com_maintenanceoperatorconfigs.yaml b/config/crd/bases/maintenance.nvidia.com_maintenanceoperatorconfigs.yaml index 
24a29d6..4f27568 100644 --- a/config/crd/bases/maintenance.nvidia.com_maintenanceoperatorconfigs.yaml +++ b/config/crd/bases/maintenance.nvidia.com_maintenanceoperatorconfigs.yaml @@ -84,6 +84,10 @@ spec: new nodes will not be processed if the number of unavailable node will exceed this value x-kubernetes-int-or-string: true type: object + status: + type: object type: object served: true storage: true + subresources: + status: {} diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml index 61f1c48..ee1f56b 100644 --- a/config/default/kustomization.yaml +++ b/config/default/kustomization.yaml @@ -27,11 +27,6 @@ resources: #- ../prometheus patches: -# Protect the /metrics endpoint by putting it behind auth. -# If you want your controller-manager to expose the /metrics -# endpoint w/o any authn/z, please comment the following line. -#- path: manager_auth_proxy_patch.yaml - # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in # crd/kustomization.yaml - path: manager_webhook_patch.yaml diff --git a/config/default/manager_auth_proxy_patch.yaml b/config/default/manager_auth_proxy_patch.yaml deleted file mode 100644 index 70c3437..0000000 --- a/config/default/manager_auth_proxy_patch.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# This patch inject a sidecar container which is a HTTP proxy for the -# controller manager, it performs RBAC authorization against the Kubernetes API using SubjectAccessReviews. 
-apiVersion: apps/v1 -kind: Deployment -metadata: - name: controller-manager - namespace: system -spec: - template: - spec: - containers: - - name: kube-rbac-proxy - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - "ALL" - image: gcr.io/kubebuilder/kube-rbac-proxy:v0.15.0 - args: - - "--secure-listen-address=0.0.0.0:8443" - - "--upstream=http://127.0.0.1:8080/" - - "--logtostderr=true" - - "--v=0" - ports: - - containerPort: 8443 - protocol: TCP - name: https - resources: - limits: - cpu: 500m - memory: 128Mi - requests: - cpu: 5m - memory: 64Mi - - name: manager - args: - - "--health-probe-bind-address=:8081" - - "--metrics-bind-address=127.0.0.1:8080" - - "--leader-elect" diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 8f15c40..a5bba9b 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -3,6 +3,6 @@ resources: apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization images: -- name: controller - newName: nvidia.com/maintenance-operator - newTag: latest +- digest: sha256:7a9bb354429ab982a056088c1bd1d221063502970a4d5590602b7f133f5f531c + name: controller + newName: ghcr.io/mellanox/maintenance-operator diff --git a/config/manifests/bases/maintenance-operator.clusterserviceversion.yaml b/config/manifests/bases/maintenance-operator.clusterserviceversion.yaml deleted file mode 100644 index b03eba4..0000000 --- a/config/manifests/bases/maintenance-operator.clusterserviceversion.yaml +++ /dev/null @@ -1,50 +0,0 @@ -apiVersion: operators.coreos.com/v1alpha1 -kind: ClusterServiceVersion -metadata: - annotations: - alm-examples: '[]' - capabilities: Basic Install - name: maintenance-operator.v0.0.0 - namespace: placeholder -spec: - apiservicedefinitions: {} - customresourcedefinitions: - owned: - - description: NodeMaintenance is the Schema for the nodemaintenances API - displayName: Node Maintenance - kind: NodeMaintenance - name: 
nodemaintenances.maintenance.nvidia.com - version: v1alpha1 - description: Coordinates And Performs Common Node Preparations For Maintenance - displayName: NVIDIA Maintenance Operator - icon: - - base64data: "" - mediatype: "" - install: - spec: - deployments: null - strategy: "" - installModes: - - supported: false - type: OwnNamespace - - supported: false - type: SingleNamespace - - supported: false - type: MultiNamespace - - supported: true - type: AllNamespaces - keywords: - - node - - maintenance - - drain - links: - - name: Maintenance Operator - url: https://maintenance-operator.domain - maintainers: - - email: noemail@nodomain.com - name: na - maturity: alpha - minKubeVersion: 1.30.0 - provider: - name: Nvidia - version: 0.0.0 diff --git a/config/manifests/bases/nvidia-maintenance-operator.clusterserviceversion.yaml b/config/manifests/bases/nvidia-maintenance-operator.clusterserviceversion.yaml new file mode 100644 index 0000000..897ffdb --- /dev/null +++ b/config/manifests/bases/nvidia-maintenance-operator.clusterserviceversion.yaml @@ -0,0 +1,75 @@ +apiVersion: operators.coreos.com/v1alpha1 +kind: ClusterServiceVersion +metadata: + annotations: + alm-examples: '[]' + capabilities: Basic Install + description: Node maintenance in K8s cluster in a coordinated manner + features.operators.openshift.io/cnf: "false" + features.operators.openshift.io/cni: "false" + features.operators.openshift.io/csi: "false" + features.operators.openshift.io/disconnected: "true" + features.operators.openshift.io/fips-compliant: "false" + features.operators.openshift.io/proxy-aware: "true" + features.operators.openshift.io/tls-profiles: "false" + features.operators.openshift.io/token-auth-aws: "false" + features.operators.openshift.io/token-auth-azure: "false" + features.operators.openshift.io/token-auth-gcp: "false" + operatorframework.io/suggested-namespace: nvidia-maintenance-operator + provider: NVIDIA + repository: https://github.com/Mellanox/maintenance-operator/ + 
support: NVIDIA + labels: + operatorframework.io/arch.amd64: supported + operatorframework.io/arch.arm64: supported + name: nvidia-maintenance-operator.v0.0.0 + namespace: placeholder +spec: + apiservicedefinitions: {} + customresourcedefinitions: + owned: + - description: MaintenanceOperatorConfig is the Schema for the maintenanceoperatorconfigs + API + displayName: Maintenance Operator Config + kind: MaintenanceOperatorConfig + name: maintenanceoperatorconfigs.maintenance.nvidia.com + version: v1alpha1 + - description: NodeMaintenance is the Schema for the nodemaintenances API + displayName: Node Maintenance + kind: NodeMaintenance + name: nodemaintenances.maintenance.nvidia.com + version: v1alpha1 + description: Coordinates And Performs Common Node Preparations For Maintenance + displayName: NVIDIA Maintenance Operator + icon: + - base64data: "" + mediatype: "" + install: + spec: + deployments: null + strategy: "" + installModes: + - supported: true + type: OwnNamespace + - supported: false + type: SingleNamespace + - supported: false + type: MultiNamespace + - supported: true + type: AllNamespaces + keywords: + - node + - maintenance + - drain + links: + - name: NVIDIA Maintenance Operator + url: https://github.com/Mellanox/maintenance-operator/ + maintainers: + - email: nvidia-network-operator-support@nvidia.com + name: NVIDIA + maturity: alpha + minKubeVersion: 1.30.0 + provider: + name: NVIDIA + url: https://github.com/Mellanox/maintenance-operator/ + version: 0.0.0 diff --git a/config/manifests/kustomization.yaml b/config/manifests/kustomization.yaml index 5f4f81a..dbb98c1 100644 --- a/config/manifests/kustomization.yaml +++ b/config/manifests/kustomization.yaml @@ -1,7 +1,7 @@ # These resources constitute the fully configured set of manifests # used to generate the 'manifests/' directory in a bundle. 
resources: -- bases/maintenance-operator.clusterserviceversion.yaml +- bases/nvidia-maintenance-operator.clusterserviceversion.yaml - ../default - ../samples - ../scorecard diff --git a/config/rbac/auth_proxy_client_clusterrole.yaml b/config/rbac/auth_proxy_client_clusterrole.yaml deleted file mode 100644 index d3e8880..0000000 --- a/config/rbac/auth_proxy_client_clusterrole.yaml +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: clusterrole - app.kubernetes.io/instance: metrics-reader - app.kubernetes.io/component: kube-rbac-proxy - app.kubernetes.io/created-by: maintenance-operator - app.kubernetes.io/part-of: maintenance-operator - app.kubernetes.io/managed-by: kustomize - name: metrics-reader -rules: -- nonResourceURLs: - - "/metrics" - verbs: - - get diff --git a/config/rbac/auth_proxy_role.yaml b/config/rbac/auth_proxy_role.yaml deleted file mode 100644 index 7930a3b..0000000 --- a/config/rbac/auth_proxy_role.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: clusterrole - app.kubernetes.io/instance: proxy-role - app.kubernetes.io/component: kube-rbac-proxy - app.kubernetes.io/created-by: maintenance-operator - app.kubernetes.io/part-of: maintenance-operator - app.kubernetes.io/managed-by: kustomize - name: proxy-role -rules: -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create diff --git a/config/rbac/auth_proxy_role_binding.yaml b/config/rbac/auth_proxy_role_binding.yaml deleted file mode 100644 index 0d01dbb..0000000 --- a/config/rbac/auth_proxy_role_binding.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - app.kubernetes.io/name: clusterrolebinding - 
app.kubernetes.io/instance: proxy-rolebinding - app.kubernetes.io/component: kube-rbac-proxy - app.kubernetes.io/created-by: maintenance-operator - app.kubernetes.io/part-of: maintenance-operator - app.kubernetes.io/managed-by: kustomize - name: proxy-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: proxy-role -subjects: -- kind: ServiceAccount - name: controller-manager - namespace: system diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml index 731832a..166fe79 100644 --- a/config/rbac/kustomization.yaml +++ b/config/rbac/kustomization.yaml @@ -9,10 +9,3 @@ resources: - role_binding.yaml - leader_election_role.yaml - leader_election_role_binding.yaml -# Comment the following 4 lines if you want to disable -# the auth proxy (https://github.com/brancz/kube-rbac-proxy) -# which protects your /metrics endpoint. -- auth_proxy_service.yaml -- auth_proxy_role.yaml -- auth_proxy_role_binding.yaml -- auth_proxy_client_clusterrole.yaml diff --git a/config/samples/maintenance_v1alpha1_maintenanceoperatorconfig.yaml b/config/samples/maintenance_v1alpha1_maintenanceoperatorconfig.yaml index ad87df1..7e50068 100644 --- a/config/samples/maintenance_v1alpha1_maintenanceoperatorconfig.yaml +++ b/config/samples/maintenance_v1alpha1_maintenanceoperatorconfig.yaml @@ -8,7 +8,6 @@ metadata: app.kubernetes.io/managed-by: kustomize app.kubernetes.io/created-by: maintenance-operator name: maintenanceoperatorconfig-sample -spec: spec: # maxParallelOperations indicates how many nodes can be processed in parallel, 0 means no limit # value can be an absolute number (ex: 5) or a percentage of total nodes in the cluster (ex: 10%). 
diff --git a/deployment/maintenance-operator-chart/crds/maintenance.nvidia.com_maintenanceoperatorconfigs.yaml b/deployment/maintenance-operator-chart/crds/maintenance.nvidia.com_maintenanceoperatorconfigs.yaml index 24a29d6..4f27568 100644 --- a/deployment/maintenance-operator-chart/crds/maintenance.nvidia.com_maintenanceoperatorconfigs.yaml +++ b/deployment/maintenance-operator-chart/crds/maintenance.nvidia.com_maintenanceoperatorconfigs.yaml @@ -84,6 +84,10 @@ spec: new nodes will not be processed if the number of unavailable node will exceed this value x-kubernetes-int-or-string: true type: object + status: + type: object type: object served: true storage: true + subresources: + status: {} diff --git a/hack/scripts/ocp-bundle-postprocess.sh b/hack/scripts/ocp-bundle-postprocess.sh new file mode 100755 index 0000000..6dfe10c --- /dev/null +++ b/hack/scripts/ocp-bundle-postprocess.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# 2024 NVIDIA CORPORATION & AFFILIATES +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -o nounset +set -o pipefail +set -o errexit + +if [[ "${TRACE-0}" == "1" ]]; then + set -o xtrace +fi + +BASE=${PWD} +YQ_CMD="${BASE}/bin/yq" + +printf " relatedImages:\n - name: nvidia-maintenance-operator\n image: %s" "$TAG" >> bundle/manifests/nvidia-maintenance-operator.clusterserviceversion.yaml + +# Add containerImage annotation +$YQ_CMD eval -i '.metadata.annotations.containerImage = strenv(TAG)' bundle/manifests/nvidia-maintenance-operator.clusterserviceversion.yaml + +# Add OpenShift versions in metadata/annotations.yaml +echo " com.redhat.openshift.versions: $BUNDLE_OCP_VERSIONS" >> bundle/metadata/annotations.yaml