From 0b723b5b523fdbb9530e84c1ac149327ba95c2c5 Mon Sep 17 00:00:00 2001 From: Alexander Maslennikov Date: Mon, 16 Dec 2024 09:50:13 +0100 Subject: [PATCH] feat: add OCP bundle - build OCP bundle - configure operator configmap via kustomize - add required RBAC permissions to operate in openshift - add a github action to build and push the bundle to ghcr on a new tag Signed-off-by: Alexander Maslennikov --- .github/workflows/image-push-release.yml | 48 ++- Makefile | 6 +- PROJECT | 2 +- ....nvidia.com_nicconfigurationtemplates.yaml | 180 +++++++++ ...nfiguration.net.nvidia.com_nicdevices.yaml | 270 +++++++++++++ ...guration-operator-config_v1_configmap.yaml | 10 + ...r-supported-nic-firmware_v1_configmap.yaml | 26 ++ ...ration-operator.clusterserviceversion.yaml | 373 ++++++++++++++++++ bundle/metadata/annotations.yaml | 16 + bundle/tests/scorecard/config.yaml | 70 ++++ config/daemon/kustomization.yaml | 15 +- config/default/kustomization.yaml | 6 - config/default/manager_auth_proxy_patch.yaml | 39 -- config/default/manager_config_patch.yaml | 10 - config/manager/kustomization.yaml | 6 +- config/manager/manager.yaml | 31 +- ...ration-operator.clusterserviceversion.yaml | 78 ++++ config/manifests/kustomization.yaml | 23 +- .../rbac/auth_proxy_client_clusterrole.yaml | 12 - config/rbac/auth_proxy_role.yaml | 20 - config/rbac/auth_proxy_role_binding.yaml | 15 - config/rbac/auth_proxy_service.yaml | 17 - config/rbac/kustomization.yaml | 22 -- config/rbac/leader_election_role.yaml | 40 -- config/rbac/leader_election_role_binding.yaml | 15 - .../nicconfigurationtemplate_editor_role.yaml | 27 -- .../nicconfigurationtemplate_viewer_role.yaml | 23 -- config/rbac/nicdevice_editor_role.yaml | 27 -- config/rbac/nicdevice_viewer_role.yaml | 23 -- config/rbac/role.yaml | 31 ++ ...net_v1alpha1_nicconfigurationtemplate.yaml | 33 +- .../configuration.net_v1alpha1_nicdevice.yaml | 31 +- .../nic-configuration-operator-config.yaml | 19 + .../supported-nic-firmware-configmap.yaml | 2 +- 
hack/scripts/ocp-bundle-postprocess.sh | 33 ++ .../nicconfigurationtemplate_controller.go | 5 +- pkg/consts/consts.go | 2 +- 37 files changed, 1232 insertions(+), 374 deletions(-) create mode 100644 bundle/manifests/configuration.net.nvidia.com_nicconfigurationtemplates.yaml create mode 100644 bundle/manifests/configuration.net.nvidia.com_nicdevices.yaml create mode 100644 bundle/manifests/nic-configuration-operator-config_v1_configmap.yaml create mode 100644 bundle/manifests/nic-configuration-operator-supported-nic-firmware_v1_configmap.yaml create mode 100644 bundle/manifests/nvidia-nic-configuration-operator.clusterserviceversion.yaml create mode 100644 bundle/metadata/annotations.yaml create mode 100644 bundle/tests/scorecard/config.yaml delete mode 100644 config/default/manager_auth_proxy_patch.yaml delete mode 100644 config/default/manager_config_patch.yaml create mode 100644 config/manifests/bases/nvidia-nic-configuration-operator.clusterserviceversion.yaml delete mode 100644 config/rbac/auth_proxy_client_clusterrole.yaml delete mode 100644 config/rbac/auth_proxy_role.yaml delete mode 100644 config/rbac/auth_proxy_role_binding.yaml delete mode 100644 config/rbac/auth_proxy_service.yaml delete mode 100644 config/rbac/leader_election_role.yaml delete mode 100644 config/rbac/leader_election_role_binding.yaml delete mode 100644 config/rbac/nicconfigurationtemplate_editor_role.yaml delete mode 100644 config/rbac/nicconfigurationtemplate_viewer_role.yaml delete mode 100644 config/rbac/nicdevice_editor_role.yaml delete mode 100644 config/rbac/nicdevice_viewer_role.yaml create mode 100644 deployment/nic-configuration-operator-chart/templates/nic-configuration-operator-config.yaml create mode 100755 hack/scripts/ocp-bundle-postprocess.sh diff --git a/.github/workflows/image-push-release.yml b/.github/workflows/image-push-release.yml index 69e293e..512cf22 100644 --- a/.github/workflows/image-push-release.yml +++ b/.github/workflows/image-push-release.yml @@ -3,14 
+3,18 @@ on: push: tags: - v* +env: + REGISTRY: "ghcr.io" + OPERATOR_IMAGE_NAME: "nic-configuration-operator" + DAEMON_IMAGE_NAME: "nic-configuration-operator-daemon" jobs: image-build-push: name: Image build and push runs-on: ubuntu-latest steps: - - name: Set repository as lower-case output variable - id: repo_name - run: echo ::set-output name=repository::$(echo ${{ github.repository }} | tr '[:upper:]' '[:lower:]') + - name: Set repository owner as lower-case output variable + id: repo_owner + run: echo REPO_OWNER=$(echo ${{ github.repository_owner }} | tr '[:upper:]' '[:lower:]') >> $GITHUB_ENV - name: Check out code into the Go module directory uses: actions/checkout@v4 with: @@ -22,14 +26,14 @@ jobs: - name: Login to GitHub Container Registry uses: docker/login-action@v3 with: - registry: ghcr.io + registry: ${{ env.REGISTRY }} username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: Docker operator image meta id: docker_meta_operator uses: docker/metadata-action@v5 with: - images: ghcr.io/${{ steps.repo_name.outputs.repository }} + images: ${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.OPERATOR_IMAGE_NAME }} tags: | type=ref,event=tag flavor: | @@ -46,7 +50,7 @@ jobs: id: docker_meta_daemon uses: docker/metadata-action@v5 with: - images: ghcr.io/${{ steps.repo_name.outputs.repository }}-daemon + images: ${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.DAEMON_IMAGE_NAME }} tags: | type=ref,event=tag flavor: | @@ -60,4 +64,34 @@ jobs: ${{ steps.docker_meta_daemon.outputs.tags }} labels: ${{ steps.docker_meta_daemon.outputs.labels }} file: ./Dockerfile.nic-configuration-daemon - + - name: Determine version, tag, and base branch + run: | + git_tag=${{ github.ref_name }} + echo VERSION_WITH_PREFIX=$git_tag >> $GITHUB_ENV + echo VERSION_WITHOUT_PREFIX=${git_tag:1} >> $GITHUB_ENV # without the 'v' prefix + if echo $git_tag | grep beta; then + base_branch=$DEFAULT_BRANCH + else + v_major_minor=$(echo $git_tag | grep -Eo 
'^v[0-9]+\.[0-9]+') + base_branch=$v_major_minor.x + fi + echo BASE_BRANCH=$base_branch >> $GITHUB_ENV + - name: Lookup image digest + run: | + operator_digest=$(skopeo inspect docker://$REGISTRY/$REPO_OWNER/$OPERATOR_IMAGE_NAME:$VERSION_WITH_PREFIX | jq -r .Digest) + echo $operator_digest | wc -w | grep 1 # verifies value not empty + echo OPERATOR_DIGEST=$operator_digest >> $GITHUB_ENV + daemon_digest=$(skopeo inspect docker://$REGISTRY/$REPO_OWNER/$DAEMON_IMAGE_NAME:$VERSION_WITH_PREFIX | jq -r .Digest) + echo $daemon_digest | wc -w | grep 1 # verifies value not empty + echo DAEMON_DIGEST=$daemon_digest >> $GITHUB_ENV + - name: Make bundle + env: + OPERATOR_IMAGE_TAG: ${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.OPERATOR_IMAGE_NAME }}@${{ env.OPERATOR_DIGEST }} + CONFIG_DAEMON_IMAGE_TAG: ${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.DAEMON_IMAGE_NAME }}@${{ env.DAEMON_DIGEST }} + BUNDLE_IMG: ${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.OPERATOR_IMAGE_NAME }}-bundle:${{ github.ref_name }} + VERSION: ${{ env.VERSION_WITHOUT_PREFIX }} + run: | + version_major_minor=$(echo $VERSION_WITH_PREFIX | grep -Eo 'v[0-9]+\.[0-9]+') + export CHANNELS=stable,$version_major_minor + export DEFAULT_CHANNEL=$version_major_minor + make bundle bundle-build bundle-push diff --git a/Makefile b/Makefile index 638d86a..44e3512 100644 --- a/Makefile +++ b/Makefile @@ -42,6 +42,8 @@ ifeq ($(USE_IMAGE_DIGESTS), true) BUNDLE_GEN_FLAGS += --use-image-digests endif +BUNDLE_OCP_VERSIONS=v4.14-v4.17 + # Set the Operator SDK version to use. By default, what is installed on the system is used. # This is useful for CI or a project to utilize a specific version of the operator-sdk toolkit. OPERATOR_SDK_VERSION ?= v1.36.0 @@ -335,7 +337,9 @@ endif bundle: manifests kustomize operator-sdk ## Generate bundle manifests and metadata, then validate generated files. 
$(OPERATOR_SDK) generate kustomize manifests -q cd config/manager && $(KUSTOMIZE) edit set image controller=$(OPERATOR_IMAGE_TAG) + cd config/daemon && $(KUSTOMIZE) edit set configmap config --from-literal=configDaemonImage=$(CONFIG_DAEMON_IMAGE_TAG) --from-literal=releaseVersion=${VERSION} $(KUSTOMIZE) build config/manifests | $(OPERATOR_SDK) generate bundle $(BUNDLE_GEN_FLAGS) + BUNDLE_OCP_VERSIONS=$(BUNDLE_OCP_VERSIONS) OPERATOR_IMAGE_TAG=$(OPERATOR_IMAGE_TAG) CONFIG_DAEMON_IMAGE_TAG=$(CONFIG_DAEMON_IMAGE_TAG) hack/scripts/ocp-bundle-postprocess.sh $(OPERATOR_SDK) bundle validate ./bundle .PHONY: bundle-build @@ -344,7 +348,7 @@ bundle-build: ## Build the bundle image. .PHONY: bundle-push bundle-push: ## Push the bundle image. - $(MAKE) docker-push IMG=$(BUNDLE_IMG) + $(CONTAINER_TOOL) push $(BUNDLE_IMG) .PHONY: opm OPM = $(LOCALBIN)/opm diff --git a/PROJECT b/PROJECT index e83e548..8c064b1 100644 --- a/PROJECT +++ b/PROJECT @@ -8,7 +8,7 @@ layout: plugins: manifests.sdk.operatorframework.io/v2: {} scorecard.sdk.operatorframework.io/v2: {} -projectName: nic-configuration-operator +projectName: nvidia-nic-configuration-operator repo: github.com/Mellanox/nic-configuration-operator resources: - api: diff --git a/bundle/manifests/configuration.net.nvidia.com_nicconfigurationtemplates.yaml b/bundle/manifests/configuration.net.nvidia.com_nicconfigurationtemplates.yaml new file mode 100644 index 0000000..b437bc3 --- /dev/null +++ b/bundle/manifests/configuration.net.nvidia.com_nicconfigurationtemplates.yaml @@ -0,0 +1,180 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + creationTimestamp: null + name: nicconfigurationtemplates.configuration.net.nvidia.com +spec: + group: configuration.net.nvidia.com + names: + kind: NicConfigurationTemplate + listKind: NicConfigurationTemplateList + plural: nicconfigurationtemplates + singular: nicconfigurationtemplate + scope: 
Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: NicConfigurationTemplate is the Schema for the nicconfigurationtemplates + API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Defines the desired state of NICs + properties: + nicSelector: + description: NIC selector configuration + properties: + nicType: + description: Type of the NIC to be selected, e.g. 101d,1015,a2d6 + etc. + type: string + pciAddresses: + description: Array of PCI addresses to be selected, e.g. "0000:03:00.0" + items: + type: string + type: array + serialNumbers: + description: Serial numbers of the NICs to be selected, e.g. 
MT2116X09299 + items: + type: string + type: array + required: + - nicType + type: object + nodeSelector: + additionalProperties: + type: string + description: NodeSelector contains labels required on the node + type: object + resetToDefault: + default: false + description: |- + ResetToDefault specifies whether node agent needs to perform a reset flow + The following operations will be performed: + * Nvconfig reset of all non-volatile configurations + - Mstconfig -d reset for each PF + - Mstconfig -d set ADVANCED_PCI_SETTINGS=1 + * Node reboot + - Applies new NIC NV config + - Will undo any runtime configuration previously performed for the device/driver + type: boolean + template: + description: Configuration template to be applied to matching devices + properties: + gpuDirectOptimized: + description: GPU Direct optimization settings + properties: + enabled: + description: Optimize GPU Direct + type: boolean + env: + description: GPU direct environment, e.g. Baremetal + type: string + required: + - enabled + - env + type: object + linkType: + description: LinkType to be configured, Ethernet|Infiniband + enum: + - Ethernet + - Infiniband + type: string + numVfs: + description: Number of VFs to be configured + type: integer + pciPerformanceOptimized: + description: PCI performance optimization settings + properties: + enabled: + description: Specifies whether to enable PCI performance optimization + type: boolean + maxAccOutRead: + description: Specifies the PCIe Max Accumulative Outstanding + read bytes + type: integer + maxReadRequest: + description: Specifies the size of a single PCI read request + in bytes + enum: + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + type: integer + required: + - enabled + type: object + roceOptimized: + description: RoCE optimization settings + properties: + enabled: + description: Optimize RoCE + type: boolean + qos: + description: Quality of Service settings + properties: + pfc: + description: Priority-based Flow Control 
configuration, + e.g. "0,0,0,1,0,0,0,0" + pattern: ^([01],){7}[01]$ + type: string + trust: + description: Trust mode for QoS settings, e.g. trust-dscp + type: string + required: + - pfc + - trust + type: object + required: + - enabled + type: object + required: + - linkType + - numVfs + type: object + required: + - nicSelector + - template + type: object + status: + description: Defines the observed state of NicConfigurationTemplate + properties: + nicDevices: + description: NicDevice CRs matching this configuration template + items: + type: string + type: array + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: null + storedVersions: null diff --git a/bundle/manifests/configuration.net.nvidia.com_nicdevices.yaml b/bundle/manifests/configuration.net.nvidia.com_nicdevices.yaml new file mode 100644 index 0000000..2cd0ec9 --- /dev/null +++ b/bundle/manifests/configuration.net.nvidia.com_nicdevices.yaml @@ -0,0 +1,270 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + creationTimestamp: null + name: nicdevices.configuration.net.nvidia.com +spec: + group: configuration.net.nvidia.com + names: + kind: NicDevice + listKind: NicDeviceList + plural: nicdevices + singular: nicdevice + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: NicDevice is the Schema for the nicdevices API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: NicDeviceSpec defines the desired state of NicDevice + properties: + configuration: + description: Configuration specifies the configuration requested by + NicConfigurationTemplate + properties: + resetToDefault: + description: |- + ResetToDefault specifies whether node agent needs to perform a reset flow. + In NIC Configuration Operator template v0.1.14 BF2/BF3 DPUs (not SuperNics) FW reset flow isn't supported. + The following operations will be performed: + * Nvconfig reset of all non-volatile configurations + - Mstconfig -d reset for each PF + - Mstconfig -d set ADVANCED_PCI_SETTINGS=1 + * Node reboot + - Applies new NIC NV config + - Will undo any runtime configuration previously performed for the device/driver + type: boolean + template: + description: Configuration template applied from the NicConfigurationTemplate + CR + properties: + gpuDirectOptimized: + description: GPU Direct optimization settings + properties: + enabled: + description: Optimize GPU Direct + type: boolean + env: + description: GPU direct environment, e.g. 
Baremetal + type: string + required: + - enabled + - env + type: object + linkType: + description: LinkType to be configured, Ethernet|Infiniband + enum: + - Ethernet + - Infiniband + type: string + numVfs: + description: Number of VFs to be configured + type: integer + pciPerformanceOptimized: + description: PCI performance optimization settings + properties: + enabled: + description: Specifies whether to enable PCI performance + optimization + type: boolean + maxAccOutRead: + description: Specifies the PCIe Max Accumulative Outstanding + read bytes + type: integer + maxReadRequest: + description: Specifies the size of a single PCI read request + in bytes + enum: + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + type: integer + required: + - enabled + type: object + roceOptimized: + description: RoCE optimization settings + properties: + enabled: + description: Optimize RoCE + type: boolean + qos: + description: Quality of Service settings + properties: + pfc: + description: Priority-based Flow Control configuration, + e.g. "0,0,0,1,0,0,0,0" + pattern: ^([01],){7}[01]$ + type: string + trust: + description: Trust mode for QoS settings, e.g. trust-dscp + type: string + required: + - pfc + - trust + type: object + required: + - enabled + type: object + required: + - linkType + - numVfs + type: object + type: object + type: object + status: + description: NicDeviceStatus defines the observed state of NicDevice + properties: + conditions: + description: List of conditions observed for the device + items: + description: "Condition contains details for one aspect of the current + state of this API Resource.\n---\nThis struct is intended for + direct use as an array at the field path .status.conditions. 
For + example,\n\n\n\ttype FooStatus struct{\n\t // Represents the + observations of a foo's current state.\n\t // Known .status.conditions.type + are: \"Available\", \"Progressing\", and \"Degraded\"\n\t // + +patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t + \ // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\" + patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t + \ // other fields\n\t}" + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. 
+ enum: + - "True" + - "False" + - Unknown + type: string + type: + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. + The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + firmwareVersion: + description: Firmware version currently installed on the device, e.g. + 22.31.1014 + type: string + node: + description: Node where the device is located + type: string + partNumber: + description: Part number of the device, e.g. MCX713106AEHEA_QP1 + type: string + ports: + description: List of ports for the device + items: + description: NicDevicePortSpec describes the ports of the NIC + properties: + networkInterface: + description: NetworkInterface is the name of the network interface + for this port, e.g. eth1 + type: string + pci: + description: PCI is a PCI address of the port, e.g. 0000:3b:00.0 + type: string + rdmaInterface: + description: RdmaInterface is the name of the rdma interface + for this port, e.g. mlx5_1 + type: string + required: + - pci + type: object + type: array + psid: + description: Product Serial ID of the device, e.g. MT_0000000221 + type: string + serialNumber: + description: Serial number of the device, e.g. MT2116X09299 + type: string + type: + description: Type of device, e.g. 
ConnectX7 + type: string + required: + - firmwareVersion + - node + - partNumber + - ports + - psid + - serialNumber + - type + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: null + storedVersions: null diff --git a/bundle/manifests/nic-configuration-operator-config_v1_configmap.yaml b/bundle/manifests/nic-configuration-operator-config_v1_configmap.yaml new file mode 100644 index 0000000..6a9e136 --- /dev/null +++ b/bundle/manifests/nic-configuration-operator-config_v1_configmap.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +data: + clusterType: openshift + configDaemonImage: ghcr.io/mellanox/nic-configuration-operator@sha256:fd62c275c6765b728fe4f18f092a5c3f7372729763a4622ff4883b94d0e5d03f + logLevel: info + releaseVersion: 0.1.14 + serviceAccountName: nic-configuration-operator-controller-manager +kind: ConfigMap +metadata: + name: nic-configuration-operator-config diff --git a/bundle/manifests/nic-configuration-operator-supported-nic-firmware_v1_configmap.yaml b/bundle/manifests/nic-configuration-operator-supported-nic-firmware_v1_configmap.yaml new file mode 100644 index 0000000..6805d36 --- /dev/null +++ b/bundle/manifests/nic-configuration-operator-supported-nic-firmware_v1_configmap.yaml @@ -0,0 +1,26 @@ +apiVersion: v1 +data: + Nvidia_mlx5_ConnectX-4-24.07: 1013 24.07-0.6.1 12.28.2006 + Nvidia_mlx5_ConnectX-4-24.10: 1013 24.10-0.7.0 12.28.2006 + Nvidia_mlx5_ConnectX-4_Lx-24.10: 1013 24.10-0.7.0 14.32.1010 + Nvidia_mlx5_ConnectX-5-24.07: 1017 24.07-0.6.1 16.35.4030 + Nvidia_mlx5_ConnectX-5-24.10: 1017 24.10-0.7.0 16.35.4030 + Nvidia_mlx5_ConnectX-5_Ex-24.07: 1019 24.07-0.6.1 16.35.4030 + Nvidia_mlx5_ConnectX-5_Ex-24.10: 1019 24.10-0.7.0 16.35.4030 + Nvidia_mlx5_ConnectX-6-24.07: 101b 24.07-0.6.1 20.42.1000 + Nvidia_mlx5_ConnectX-6-24.10: 101b 24.10-0.7.0 20.43.1014 + Nvidia_mlx5_ConnectX-6_Dx-24.07: 101d 24.07-0.6.1 22.42.1000 + Nvidia_mlx5_ConnectX-6_Dx-24.10: 
101d 24.10-0.7.0 22.43.1014 + Nvidia_mlx5_ConnectX-6_Lx-24.07: 101f 24.07-0.6.1 26.42.1000 + Nvidia_mlx5_ConnectX-6_Lx-24.10: 101f 24.10-0.7.0 26.43.1014 + Nvidia_mlx5_ConnectX-7-24.07: 1021 24.07-0.6.1 28.42.1000 + Nvidia_mlx5_ConnectX-7-24.10: 1021 24.10-0.7.0 28.43.1014 + Nvidia_mlx5_MT42822_BlueField-2_integrated_ConnectX-6_Dx-24.07: a2d6 24.07-0.6.1 + 24.42.1000 + Nvidia_mlx5_MT42822_BlueField-2_integrated_ConnectX-6_Dx-24.10: a2d6 24.10-0.7.0 + 24.43.2026 + Nvidia_mlx5_MT43244_BlueField-3_integrated_ConnectX-7_Dx-24.10: a2dc 24.10-0.7.0 + 32.43.2026 +kind: ConfigMap +metadata: + name: nic-configuration-operator-supported-nic-firmware diff --git a/bundle/manifests/nvidia-nic-configuration-operator.clusterserviceversion.yaml b/bundle/manifests/nvidia-nic-configuration-operator.clusterserviceversion.yaml new file mode 100644 index 0000000..a57a991 --- /dev/null +++ b/bundle/manifests/nvidia-nic-configuration-operator.clusterserviceversion.yaml @@ -0,0 +1,373 @@ +apiVersion: operators.coreos.com/v1alpha1 +kind: ClusterServiceVersion +metadata: + annotations: + containerImage: ghcr.io/mellanox/nic-configuration-operator@sha256:bfd4b296eecd0a68293458eec1f11015560ecb91e73f50e7b7b7da6048d554c9 + alm-examples: |- + [ + { + "apiVersion": "configuration.net.nvidia.com/v1alpha1", + "kind": "NicConfigurationTemplate", + "metadata": { + "name": "connectx6-config", + "namespace": "nic-configuration-operator" + }, + "spec": { + "nicSelector": { + "nicType": "101b", + "pciAddresses": [ + "0000:03:00.0", + "“0000:04:00.0”" + ], + "serialNumbers": [ + "MT2116X09299" + ] + }, + "nodeSelector": { + "feature.node.kubernetes.io/pci-15b3.present": "true" + }, + "resetToDefault": false, + "template": { + "gpuDirectOptimized": { + "enabled": true, + "env": "Baremetal" + }, + "linkType": "Ethernet", + "numVfs": 2, + "pciPerformanceOptimized": { + "enabled": true, + "maxAccOutRead": 44, + "maxReadRequest": 4096 + }, + "roceOptimized": { + "enabled": true, + "qos": { + "pfc": 
"0,0,0,1,0,0,0,0", + "trust": "dscp" + } + } + } + } + }, + { + "apiVersion": "configuration.net.nvidia.com/v1alpha1", + "kind": "NicDevice", + "metadata": { + "name": "co-node-25-101b-mt2232t13210", + "namespace": "nic-configuration-operator" + }, + "spec": { + "configuration": { + "template": { + "linkType": "Ethernet", + "numVfs": 8, + "pciPerformanceOptimized": { + "enabled": true + } + } + } + }, + "status": { + "conditions": [ + { + "reason": "UpdateSuccessful", + "status": "False", + "type": "ConfigUpdateInProgress" + } + ], + "firmwareVersion": "20.42.1000", + "node": "co-node-25", + "partNumber": "mcx632312a-hdat", + "ports": [ + { + "networkInterface": "enp4s0f0np0", + "pci": "0000:04:00.0", + "rdmaInterface": "mlx5_0" + }, + { + "networkInterface": "enp4s0f1np1", + "pci": "0000:04:00.1", + "rdmaInterface": "mlx5_1" + } + ], + "psid": "mt_0000000225", + "serialNumber": "mt2232t13210", + "type": "101b" + } + } + ] + capabilities: Basic Install + createdAt: "2024-12-24T10:01:08Z" + description: FW configuration on Nvidia NICs in a coordinated manner + features.operators.openshift.io/cnf: "false" + features.operators.openshift.io/cni: "false" + features.operators.openshift.io/csi: "false" + features.operators.openshift.io/disconnected: "true" + features.operators.openshift.io/fips-compliant: "false" + features.operators.openshift.io/proxy-aware: "true" + features.operators.openshift.io/tls-profiles: "false" + features.operators.openshift.io/token-auth-aws: "false" + features.operators.openshift.io/token-auth-azure: "false" + features.operators.openshift.io/token-auth-gcp: "false" + operatorframework.io/suggested-namespace: nvidia-nic-configuration-operator + operators.operatorframework.io/builder: operator-sdk-v1.37.0 + operators.operatorframework.io/project_layout: go.kubebuilder.io/v4 + provider: NVIDIA + repository: https://github.com/Mellanox/nic-configuration-operator/ + support: NVIDIA + labels: + operatorframework.io/arch.amd64: supported + 
operatorframework.io/arch.arm64: supported + name: nvidia-nic-configuration-operator.v0.1.14 + namespace: placeholder +spec: + apiservicedefinitions: {} + customresourcedefinitions: + owned: + - description: NicConfigurationTemplate is the Schema for the nicconfigurationtemplates + API + displayName: Nic Configuration Template + kind: NicConfigurationTemplate + name: nicconfigurationtemplates.configuration.net.nvidia.com + version: v1alpha1 + - description: NicDevice is the Schema for the nicdevices API + displayName: Nic Device + kind: NicDevice + name: nicdevices.configuration.net.nvidia.com + version: v1alpha1 + description: NVIDIA NIC Configuration Operator provides Kubernetes API (Custom Resource + Definition) to allow FW configuration on NVIDIA NICs in a coordinated manner. + It deploys a configuration daemon on each of the desired nodes to configure NVIDIA + NICs there. + displayName: NVIDIA NIC Configuration Operator + icon: + - base64data: "" + mediatype: "" + install: + spec: + clusterPermissions: + - rules: + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: + - events + verbs: + - create + - apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - patch + - update + - watch + - apiGroups: + - "" + resources: + - pods + verbs: + - list + - apiGroups: + - "" + resources: + - pods/eviction + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - apps + resources: + - daemonsets + verbs: + - create + - delete + - get + - update + - apiGroups: + - configuration.net.nvidia.com + resources: + - nicconfigurationtemplates + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - configuration.net.nvidia.com + resources: + - nicconfigurationtemplates/finalizers + verbs: + - update + - apiGroups: + - configuration.net.nvidia.com + resources: + - nicconfigurationtemplates/status + verbs: + - get + - 
patch + - update + - apiGroups: + - configuration.net.nvidia.com + resources: + - nicdevices + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - configuration.net.nvidia.com + resources: + - nicdevices/finalizers + verbs: + - update + - apiGroups: + - configuration.net.nvidia.com + resources: + - nicdevices/status + verbs: + - get + - patch + - update + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - create + - get + - update + - apiGroups: + - maintenance.nvidia.com + resources: + - nodemaintenances + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - security.openshift.io + resourceNames: + - privileged + resources: + - securitycontextconstraints + verbs: + - use + serviceAccountName: nic-configuration-operator-controller-manager + deployments: + - label: + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: nic-configuration-operator + control-plane: controller-manager + name: nic-configuration-operator-controller-manager + spec: + replicas: 1 + selector: + matchLabels: + control-plane: controller-manager + strategy: {} + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + control-plane: controller-manager + spec: + containers: + - args: + - --leader-elect + command: + - /manager + env: + - name: LOG_LEVEL + value: info + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + image: ghcr.io/mellanox/nic-configuration-operator@sha256:bfd4b296eecd0a68293458eec1f11015560ecb91e73f50e7b7b7da6048d554c9 + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 10m + memory: 64Mi + securityContext: + allowPrivilegeEscalation: false + 
capabilities: + drop: + - ALL + securityContext: + runAsNonRoot: true + serviceAccountName: nic-configuration-operator-controller-manager + terminationGracePeriodSeconds: 10 + strategy: deployment + installModes: + - supported: true + type: OwnNamespace + - supported: true + type: SingleNamespace + - supported: false + type: MultiNamespace + - supported: false + type: AllNamespaces + keywords: + - node + - fw + - configuration + - nic + links: + - name: NVIDIA NIC Configuration Operator + url: https://github.com/Mellanox/nic-configuration-operator/ + maintainers: + - email: nvidia-network-operator-support@nvidia.com + name: NVIDIA + maturity: alpha + provider: + name: NVIDIA + url: https://github.com/Mellanox/nic-configuration-operator/ + version: 0.1.14 + relatedImages: + - name: nvidia-nic-configuration-operator + image: ghcr.io/mellanox/nic-configuration-operator@sha256:bfd4b296eecd0a68293458eec1f11015560ecb91e73f50e7b7b7da6048d554c9 + - name: nvidia-nic-configuration-daemon + image: ghcr.io/mellanox/nic-configuration-operator@sha256:fd62c275c6765b728fe4f18f092a5c3f7372729763a4622ff4883b94d0e5d03f \ No newline at end of file diff --git a/bundle/metadata/annotations.yaml b/bundle/metadata/annotations.yaml new file mode 100644 index 0000000..70011db --- /dev/null +++ b/bundle/metadata/annotations.yaml @@ -0,0 +1,16 @@ +annotations: + # Core bundle annotations. 
+ operators.operatorframework.io.bundle.mediatype.v1: registry+v1 + operators.operatorframework.io.bundle.manifests.v1: manifests/ + operators.operatorframework.io.bundle.metadata.v1: metadata/ + operators.operatorframework.io.bundle.package.v1: nvidia-nic-configuration-operator + operators.operatorframework.io.bundle.channels.v1: v0.1.14,stable + operators.operatorframework.io.bundle.channel.default.v1: v0.1.14 + operators.operatorframework.io.metrics.builder: operator-sdk-v1.37.0 + operators.operatorframework.io.metrics.mediatype.v1: metrics+v1 + operators.operatorframework.io.metrics.project_layout: go.kubebuilder.io/v4 + + # Annotations for testing. + operators.operatorframework.io.test.mediatype.v1: scorecard+v1 + operators.operatorframework.io.test.config.v1: tests/scorecard/ + com.redhat.openshift.versions: v4.14-v4.17 diff --git a/bundle/tests/scorecard/config.yaml b/bundle/tests/scorecard/config.yaml new file mode 100644 index 0000000..da55ad9 --- /dev/null +++ b/bundle/tests/scorecard/config.yaml @@ -0,0 +1,70 @@ +apiVersion: scorecard.operatorframework.io/v1alpha3 +kind: Configuration +metadata: + name: config +stages: +- parallel: true + tests: + - entrypoint: + - scorecard-test + - basic-check-spec + image: quay.io/operator-framework/scorecard-test:v1.36.0 + labels: + suite: basic + test: basic-check-spec-test + storage: + spec: + mountPath: {} + - entrypoint: + - scorecard-test + - olm-bundle-validation + image: quay.io/operator-framework/scorecard-test:v1.36.0 + labels: + suite: olm + test: olm-bundle-validation-test + storage: + spec: + mountPath: {} + - entrypoint: + - scorecard-test + - olm-crds-have-validation + image: quay.io/operator-framework/scorecard-test:v1.36.0 + labels: + suite: olm + test: olm-crds-have-validation-test + storage: + spec: + mountPath: {} + - entrypoint: + - scorecard-test + - olm-crds-have-resources + image: quay.io/operator-framework/scorecard-test:v1.36.0 + labels: + suite: olm + test: olm-crds-have-resources-test + 
storage: + spec: + mountPath: {} + - entrypoint: + - scorecard-test + - olm-spec-descriptors + image: quay.io/operator-framework/scorecard-test:v1.36.0 + labels: + suite: olm + test: olm-spec-descriptors-test + storage: + spec: + mountPath: {} + - entrypoint: + - scorecard-test + - olm-status-descriptors + image: quay.io/operator-framework/scorecard-test:v1.36.0 + labels: + suite: olm + test: olm-status-descriptors-test + storage: + spec: + mountPath: {} +storage: + spec: + mountPath: {} diff --git a/config/daemon/kustomization.yaml b/config/daemon/kustomization.yaml index 26990af..fc5b550 100644 --- a/config/daemon/kustomization.yaml +++ b/config/daemon/kustomization.yaml @@ -1,2 +1,15 @@ resources: - - daemon.yaml +- daemon.yaml + +configMapGenerator: +- literals: + - serviceAccountName=nic-configuration-operator-controller-manager + - configDaemonImage=ghcr.io/mellanox/nic-configuration-operator@sha256:fd62c275c6765b728fe4f18f092a5c3f7372729763a4622ff4883b94d0e5d03f + - clusterType=openshift + - logLevel=info + - releaseVersion=0.1.14 + name: config + options: + disableNameSuffixHash: true +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml index 9b8d314..cc4a287 100644 --- a/config/default/kustomization.yaml +++ b/config/default/kustomization.yaml @@ -28,12 +28,6 @@ resources: # [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. #- ../prometheus -patches: -# Protect the /metrics endpoint by putting it behind auth. -# If you want your controller-manager to expose the /metrics -# endpoint w/o any authn/z, please comment the following line. 
-- path: manager_auth_proxy_patch.yaml - # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in # crd/kustomization.yaml #- path: manager_webhook_patch.yaml diff --git a/config/default/manager_auth_proxy_patch.yaml b/config/default/manager_auth_proxy_patch.yaml deleted file mode 100644 index 4c3c276..0000000 --- a/config/default/manager_auth_proxy_patch.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# This patch inject a sidecar container which is a HTTP proxy for the -# controller manager, it performs RBAC authorization against the Kubernetes API using SubjectAccessReviews. -apiVersion: apps/v1 -kind: Deployment -metadata: - name: controller-manager - namespace: system -spec: - template: - spec: - containers: - - name: kube-rbac-proxy - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - "ALL" - image: gcr.io/kubebuilder/kube-rbac-proxy:v0.16.0 - args: - - "--secure-listen-address=0.0.0.0:8443" - - "--upstream=http://127.0.0.1:8080/" - - "--logtostderr=true" - - "--v=0" - ports: - - containerPort: 8443 - protocol: TCP - name: https - resources: - limits: - cpu: 500m - memory: 128Mi - requests: - cpu: 5m - memory: 64Mi - - name: manager - args: - - "--health-probe-bind-address=:8081" - - "--metrics-bind-address=127.0.0.1:8080" - - "--leader-elect" diff --git a/config/default/manager_config_patch.yaml b/config/default/manager_config_patch.yaml deleted file mode 100644 index f6f5891..0000000 --- a/config/default/manager_config_patch.yaml +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: controller-manager - namespace: system -spec: - template: - spec: - containers: - - name: manager diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index bf97c27..f389f8f 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -3,6 +3,6 @@ resources: apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization 
images: -- name: controller - newName: nic-configuration-operator - newTag: latest +- digest: sha256:bfd4b296eecd0a68293458eec1f11015560ecb91e73f50e7b7b7da6048d554c9 + name: controller + newName: ghcr.io/mellanox/nic-configuration-operator diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index a8c7b84..eb8b0ad 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -28,35 +28,8 @@ spec: labels: control-plane: controller-manager spec: - # TODO(user): Uncomment the following code to configure the nodeAffinity expression - # according to the platforms which are supported by your solution. - # It is considered best practice to support multiple architectures. You can - # build your manager image using the makefile target docker-buildx. - # affinity: - # nodeAffinity: - # requiredDuringSchedulingIgnoredDuringExecution: - # nodeSelectorTerms: - # - matchExpressions: - # - key: kubernetes.io/arch - # operator: In - # values: - # - amd64 - # - arm64 - # - ppc64le - # - s390x - # - key: kubernetes.io/os - # operator: In - # values: - # - linux securityContext: runAsNonRoot: true - # TODO(user): For common cases that do not require escalating privileges - # it is recommended to ensure that all your Pods/Containers are restrictive. - # More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted - # Please uncomment the following code if your project does NOT have to work on old Kubernetes - # versions < 1.19 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ). - # seccompProfile: - # type: RuntimeDefault containers: - command: - /manager @@ -71,7 +44,7 @@ spec: - "ALL" env: - name: LOG_LEVEL - value: debug + value: info - name: NAMESPACE valueFrom: fieldRef: @@ -88,8 +61,6 @@ spec: port: 8081 initialDelaySeconds: 5 periodSeconds: 10 - # TODO(user): Configure the resources accordingly based on the project requirements. 
- # More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ resources: limits: cpu: 500m diff --git a/config/manifests/bases/nvidia-nic-configuration-operator.clusterserviceversion.yaml b/config/manifests/bases/nvidia-nic-configuration-operator.clusterserviceversion.yaml new file mode 100644 index 0000000..781c8c6 --- /dev/null +++ b/config/manifests/bases/nvidia-nic-configuration-operator.clusterserviceversion.yaml @@ -0,0 +1,78 @@ +apiVersion: operators.coreos.com/v1alpha1 +kind: ClusterServiceVersion +metadata: + annotations: + alm-examples: '[]' + capabilities: Basic Install + description: FW configuration on Nvidia NICs in a coordinated manner + features.operators.openshift.io/cnf: "false" + features.operators.openshift.io/cni: "false" + features.operators.openshift.io/csi: "false" + features.operators.openshift.io/disconnected: "true" + features.operators.openshift.io/fips-compliant: "false" + features.operators.openshift.io/proxy-aware: "true" + features.operators.openshift.io/tls-profiles: "false" + features.operators.openshift.io/token-auth-aws: "false" + features.operators.openshift.io/token-auth-azure: "false" + features.operators.openshift.io/token-auth-gcp: "false" + operatorframework.io/suggested-namespace: nvidia-nic-configuration-operator + provider: NVIDIA + repository: https://github.com/Mellanox/nic-configuration-operator/ + support: NVIDIA + labels: + operatorframework.io/arch.amd64: supported + operatorframework.io/arch.arm64: supported + name: nvidia-nic-configuration-operator.v0.0.0 + namespace: placeholder +spec: + apiservicedefinitions: {} + customresourcedefinitions: + owned: + - description: NicConfigurationTemplate is the Schema for the nicconfigurationtemplates + API + displayName: Nic Configuration Template + kind: NicConfigurationTemplate + name: nicconfigurationtemplates.configuration.net.nvidia.com + version: v1alpha1 + - description: NicDevice is the Schema for the nicdevices API + displayName: Nic 
Device + kind: NicDevice + name: nicdevices.configuration.net.nvidia.com + version: v1alpha1 + description: NVIDIA NIC Configuration Operator provides Kubernetes API (Custom Resource + Definition) to allow FW configuration on NVIDIA NICs in a coordinated manner. + It deploys a configuration daemon on each of the desired nodes to configure NVIDIA + NICs there. + displayName: NVIDIA NIC Configuration Operator + icon: + - base64data: "" + mediatype: "" + install: + spec: + deployments: null + strategy: "" + installModes: + - supported: true + type: OwnNamespace + - supported: true + type: SingleNamespace + - supported: false + type: MultiNamespace + - supported: false + type: AllNamespaces + keywords: + - node + - fw + - configuration + - nic + links: + - name: NVIDIA NIC Configuration Operator + url: https://github.com/Mellanox/nic-configuration-operator/ + maintainers: + - email: nvidia-network-operator-support@nvidia.com + name: NVIDIA + maturity: alpha + provider: + name: NVIDIA + url: https://github.com/Mellanox/nic-configuration-operator/ + version: 0.0.0 diff --git a/config/manifests/kustomization.yaml b/config/manifests/kustomization.yaml index b7d0256..06a147c 100644 --- a/config/manifests/kustomization.yaml +++ b/config/manifests/kustomization.yaml @@ -1,28 +1,7 @@ # These resources constitute the fully configured set of manifests # used to generate the 'manifests/' directory in a bundle. resources: -- bases/nic-configuration-operator.clusterserviceversion.yaml +- bases/nvidia-nic-configuration-operator.clusterserviceversion.yaml - ../default - ../samples - ../scorecard - -# [WEBHOOK] To enable webhooks, uncomment all the sections with [WEBHOOK] prefix. -# Do NOT uncomment sections with prefix [CERTMANAGER], as OLM does not support cert-manager. -# These patches remove the unnecessary "cert" volume and its manager container volumeMount. 
-#patchesJson6902: -#- target: -# group: apps -# version: v1 -# kind: Deployment -# name: controller-manager -# namespace: system -# patch: |- -# # Remove the manager container's "cert" volumeMount, since OLM will create and mount a set of certs. -# # Update the indices in this path if adding or removing containers/volumeMounts in the manager's Deployment. -# - op: remove - -# path: /spec/template/spec/containers/0/volumeMounts/0 -# # Remove the "cert" volume, since OLM will create and mount a set of certs. -# # Update the indices in this path if adding or removing volumes in the manager's Deployment. -# - op: remove -# path: /spec/template/spec/volumes/0 diff --git a/config/rbac/auth_proxy_client_clusterrole.yaml b/config/rbac/auth_proxy_client_clusterrole.yaml deleted file mode 100644 index c7a703a..0000000 --- a/config/rbac/auth_proxy_client_clusterrole.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: nic-configuration-operator - app.kubernetes.io/managed-by: kustomize - name: metrics-reader -rules: -- nonResourceURLs: - - "/metrics" - verbs: - - get diff --git a/config/rbac/auth_proxy_role.yaml b/config/rbac/auth_proxy_role.yaml deleted file mode 100644 index 2ea5b9a..0000000 --- a/config/rbac/auth_proxy_role.yaml +++ /dev/null @@ -1,20 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: nic-configuration-operator - app.kubernetes.io/managed-by: kustomize - name: proxy-role -rules: -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create diff --git a/config/rbac/auth_proxy_role_binding.yaml b/config/rbac/auth_proxy_role_binding.yaml deleted file mode 100644 index 9098711..0000000 --- a/config/rbac/auth_proxy_role_binding.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: 
rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - app.kubernetes.io/name: nic-configuration-operator - app.kubernetes.io/managed-by: kustomize - name: proxy-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: proxy-role -subjects: -- kind: ServiceAccount - name: controller-manager - namespace: system diff --git a/config/rbac/auth_proxy_service.yaml b/config/rbac/auth_proxy_service.yaml deleted file mode 100644 index 11441a5..0000000 --- a/config/rbac/auth_proxy_service.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - labels: - control-plane: controller-manager - app.kubernetes.io/name: nic-configuration-operator - app.kubernetes.io/managed-by: kustomize - name: controller-manager-metrics-service - namespace: system -spec: - ports: - - name: https - port: 8443 - protocol: TCP - targetPort: https - selector: - control-plane: controller-manager diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml index 2302caa..664fcac 100644 --- a/config/rbac/kustomization.yaml +++ b/config/rbac/kustomization.yaml @@ -1,26 +1,4 @@ resources: -# All RBAC will be applied under this service account in -# the deployment namespace. You may comment out this resource -# if your manager will use a service account that exists at -# runtime. Be sure to update RoleBinding and ClusterRoleBinding -# subjects if changing service account names. - service_account.yaml - role.yaml - role_binding.yaml -- leader_election_role.yaml -- leader_election_role_binding.yaml -# Comment the following 4 lines if you want to disable -# the auth proxy (https://github.com/brancz/kube-rbac-proxy) -# which protects your /metrics endpoint. -- auth_proxy_service.yaml -- auth_proxy_role.yaml -- auth_proxy_role_binding.yaml -- auth_proxy_client_clusterrole.yaml -# For each CRD, "Editor" and "Viewer" roles are scaffolded by -# default, aiding admins in cluster management. 
Those roles are -# not used by the Project itself. You can comment the following lines -# if you do not want those helpers be installed with your Project. -- nicdevice_editor_role.yaml -- nicdevice_viewer_role.yaml -- nicconfigurationtemplate_editor_role.yaml -- nicconfigurationtemplate_viewer_role.yaml diff --git a/config/rbac/leader_election_role.yaml b/config/rbac/leader_election_role.yaml deleted file mode 100644 index 4391f82..0000000 --- a/config/rbac/leader_election_role.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# permissions to do leader election. -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - labels: - app.kubernetes.io/name: nic-configuration-operator - app.kubernetes.io/managed-by: kustomize - name: leader-election-role -rules: -- apiGroups: - - "" - resources: - - configmaps - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - coordination.k8s.io - resources: - - leases - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - "" - resources: - - events - verbs: - - create - - patch diff --git a/config/rbac/leader_election_role_binding.yaml b/config/rbac/leader_election_role_binding.yaml deleted file mode 100644 index 6cca6dd..0000000 --- a/config/rbac/leader_election_role_binding.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - labels: - app.kubernetes.io/name: nic-configuration-operator - app.kubernetes.io/managed-by: kustomize - name: leader-election-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: leader-election-role -subjects: -- kind: ServiceAccount - name: controller-manager - namespace: system diff --git a/config/rbac/nicconfigurationtemplate_editor_role.yaml b/config/rbac/nicconfigurationtemplate_editor_role.yaml deleted file mode 100644 index f17da4f..0000000 --- a/config/rbac/nicconfigurationtemplate_editor_role.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# 
permissions for end users to edit nicconfigurationtemplates. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: nic-configuration-operator - app.kubernetes.io/managed-by: kustomize - name: nicconfigurationtemplate-editor-role -rules: -- apiGroups: - - configuration.net.nvidia.com - resources: - - nicconfigurationtemplates - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - configuration.net.nvidia.com - resources: - - nicconfigurationtemplates/status - verbs: - - get diff --git a/config/rbac/nicconfigurationtemplate_viewer_role.yaml b/config/rbac/nicconfigurationtemplate_viewer_role.yaml deleted file mode 100644 index cbae9bd..0000000 --- a/config/rbac/nicconfigurationtemplate_viewer_role.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# permissions for end users to view nicconfigurationtemplates. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: nic-configuration-operator - app.kubernetes.io/managed-by: kustomize - name: nicconfigurationtemplate-viewer-role -rules: -- apiGroups: - - configuration.net.nvidia.com - resources: - - nicconfigurationtemplates - verbs: - - get - - list - - watch -- apiGroups: - - configuration.net.nvidia.com - resources: - - nicconfigurationtemplates/status - verbs: - - get diff --git a/config/rbac/nicdevice_editor_role.yaml b/config/rbac/nicdevice_editor_role.yaml deleted file mode 100644 index 1be1449..0000000 --- a/config/rbac/nicdevice_editor_role.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# permissions for end users to edit nicdevices. 
-apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: nic-configuration-operator - app.kubernetes.io/managed-by: kustomize - name: nicdevice-editor-role -rules: -- apiGroups: - - configuration.net.nvidia.com - resources: - - nicdevices - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - configuration.net.nvidia.com - resources: - - nicdevices/status - verbs: - - get diff --git a/config/rbac/nicdevice_viewer_role.yaml b/config/rbac/nicdevice_viewer_role.yaml deleted file mode 100644 index 761ff16..0000000 --- a/config/rbac/nicdevice_viewer_role.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# permissions for end users to view nicdevices. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: nic-configuration-operator - app.kubernetes.io/managed-by: kustomize - name: nicdevice-viewer-role -rules: -- apiGroups: - - configuration.net.nvidia.com - resources: - - nicdevices - verbs: - - get - - list - - watch -- apiGroups: - - configuration.net.nvidia.com - resources: - - nicdevices/status - verbs: - - get diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 51b7f17..c0fddc9 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -12,6 +12,12 @@ rules: - get - list - watch +- apiGroups: + - "" + resources: + - events + verbs: + - create - apiGroups: - "" resources: @@ -40,6 +46,15 @@ rules: - patch - update - watch +- apiGroups: + - apps + resources: + - daemonsets + verbs: + - create + - delete + - get + - update - apiGroups: - configuration.net.nvidia.com resources: @@ -92,6 +107,14 @@ rules: - get - patch - update +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - create + - get + - update - apiGroups: - maintenance.nvidia.com resources: @@ -104,3 +127,11 @@ rules: - patch - update - watch +- apiGroups: + - security.openshift.io + resourceNames: + - privileged + resources: + - 
securitycontextconstraints + verbs: + - use diff --git a/config/samples/configuration.net_v1alpha1_nicconfigurationtemplate.yaml b/config/samples/configuration.net_v1alpha1_nicconfigurationtemplate.yaml index 4aba5fe..5d731f1 100644 --- a/config/samples/configuration.net_v1alpha1_nicconfigurationtemplate.yaml +++ b/config/samples/configuration.net_v1alpha1_nicconfigurationtemplate.yaml @@ -1,9 +1,32 @@ apiVersion: configuration.net.nvidia.com/v1alpha1 kind: NicConfigurationTemplate metadata: - labels: - app.kubernetes.io/name: nic-configuration-operator - app.kubernetes.io/managed-by: kustomize - name: nicconfigurationtemplate-sample + name: connectx6-config + namespace: nic-configuration-operator spec: - # TODO(user): Add fields here + nodeSelector: + feature.node.kubernetes.io/pci-15b3.present: "true" + nicSelector: + # nicType selector is mandatory the rest are optional. Only a single type can be specified. + nicType: 101b + pciAddresses: + - "0000:03:00.0" + - "0000:04:00.0" + serialNumbers: + - "MT2116X09299" + resetToDefault: false # if set, template is ignored, device configuration should reset + template: + numVfs: 2 + linkType: Ethernet + pciPerformanceOptimized: + enabled: true + maxAccOutRead: 44 + maxReadRequest: 4096 + roceOptimized: + enabled: true + qos: + trust: dscp + pfc: "0,0,0,1,0,0,0,0" + gpuDirectOptimized: + enabled: true + env: Baremetal \ No newline at end of file diff --git a/config/samples/configuration.net_v1alpha1_nicdevice.yaml b/config/samples/configuration.net_v1alpha1_nicdevice.yaml index d5c5249..2787c82 100644 --- a/config/samples/configuration.net_v1alpha1_nicdevice.yaml +++ b/config/samples/configuration.net_v1alpha1_nicdevice.yaml @@ -1,9 +1,30 @@ apiVersion: configuration.net.nvidia.com/v1alpha1 kind: NicDevice metadata: - labels: - app.kubernetes.io/name: nic-configuration-operator - app.kubernetes.io/managed-by: kustomize - name: nicdevice-sample + name: co-node-25-101b-mt2232t13210 + namespace: nic-configuration-operator
spec: - # TODO(user): Add fields here + configuration: + template: + linkType: Ethernet + numVfs: 8 + pciPerformanceOptimized: + enabled: true +status: + conditions: + - reason: UpdateSuccessful + status: "False" + type: ConfigUpdateInProgress + firmwareVersion: 20.42.1000 + node: co-node-25 + partNumber: mcx632312a-hdat + ports: + - networkInterface: enp4s0f0np0 + pci: "0000:04:00.0" + rdmaInterface: mlx5_0 + - networkInterface: enp4s0f1np1 + pci: "0000:04:00.1" + rdmaInterface: mlx5_1 + psid: mt_0000000225 + serialNumber: mt2232t13210 + type: 101b \ No newline at end of file diff --git a/deployment/nic-configuration-operator-chart/templates/nic-configuration-operator-config.yaml b/deployment/nic-configuration-operator-chart/templates/nic-configuration-operator-config.yaml new file mode 100644 index 0000000..d4103e3 --- /dev/null +++ b/deployment/nic-configuration-operator-chart/templates/nic-configuration-operator-config.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: nic-configuration-operator-config +data: + serviceAccountName: "{{ include "nic-configuration-operator.serviceAccountName" . 
}}" + configDaemonImage: "{{ .Values.configDaemon.image.repository }}/{{ .Values.configDaemon.image.name }}:{{ .Values.configDaemon.image.tag | default .Chart.AppVersion }}" + {{- if .Values.imagePullSecrets}} + imagePullSecrets: {{ join "," .Values.imagePullSecrets }} + {{- end}} + {{- if .Values.configDaemon.nodeSelector}} + nodeSelector: {{ .Values.configDaemon.nodeSelector | toJson | quote }} + {{- end}} + {{- if .Values.configDaemon.resources}} + resources: {{ .Values.configDaemon.resources | toJson | quote }} + {{- end}} + {{- if .Values.logLevel}} + logLevel: {{ .Values.logLevel }} + {{- end}} \ No newline at end of file diff --git a/deployment/nic-configuration-operator-chart/templates/supported-nic-firmware-configmap.yaml b/deployment/nic-configuration-operator-chart/templates/supported-nic-firmware-configmap.yaml index 02634f5..b46cd90 100644 --- a/deployment/nic-configuration-operator-chart/templates/supported-nic-firmware-configmap.yaml +++ b/deployment/nic-configuration-operator-chart/templates/supported-nic-firmware-configmap.yaml @@ -1,7 +1,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: supported-nic-firmware + name: nic-configuration-operator-supported-nic-firmware data: Nvidia_mlx5_ConnectX-4-24.07: "1013 24.07-0.6.1 12.28.2006" Nvidia_mlx5_ConnectX-5-24.07: "1017 24.07-0.6.1 16.35.4030" diff --git a/hack/scripts/ocp-bundle-postprocess.sh b/hack/scripts/ocp-bundle-postprocess.sh new file mode 100755 index 0000000..40a8362 --- /dev/null +++ b/hack/scripts/ocp-bundle-postprocess.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# 2024 NVIDIA CORPORATION & AFFILIATES +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o nounset +set -o pipefail +set -o errexit + +if [[ "${TRACE-0}" == "1" ]]; then + set -o xtrace +fi + +printf " relatedImages:\n - name: nvidia-nic-configuration-operator\n image: %s\n - name: nvidia-nic-configuration-daemon\n image: %s" "$OPERATOR_IMAGE_TAG" "$CONFIG_DAEMON_IMAGE_TAG" >> bundle/manifests/nvidia-nic-configuration-operator.clusterserviceversion.yaml + +# Add containerImage annotation +# Escape the tag annotation value for sed +ESCAPED_TAG=$(printf '%s\n' "$OPERATOR_IMAGE_TAG" | sed -e 's/[]\/$*.^[]/\\&/g') +sed -i "0,/annotations:/s/annotations:/annotations:\n containerImage: $ESCAPED_TAG/" bundle/manifests/nvidia-nic-configuration-operator.clusterserviceversion.yaml + +# Add OpenShift versions in metadata/annotations.yaml +echo " com.redhat.openshift.versions: $BUNDLE_OCP_VERSIONS" >> bundle/metadata/annotations.yaml \ No newline at end of file diff --git a/internal/controller/nicconfigurationtemplate_controller.go b/internal/controller/nicconfigurationtemplate_controller.go index 3dc0f4b..0d197aa 100644 --- a/internal/controller/nicconfigurationtemplate_controller.go +++ b/internal/controller/nicconfigurationtemplate_controller.go @@ -57,10 +57,13 @@ type NicConfigurationTemplateReconciler struct { //+kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicdevices/finalizers,verbs=update //+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;update;patch //+kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch -//+kubebuilder:rbac:groups="",resources=configmaps,verbs=get 
+//+kubebuilder:rbac:groups="",resources=events,verbs=create //+kubebuilder:rbac:groups="",resources=pods,verbs=list //+kubebuilder:rbac:groups="",resources=pods/eviction,verbs=create;delete;get;list;patch;update;watch //+kubebuilder:rbac:groups=maintenance.nvidia.com,resources=nodemaintenances,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;update;create +//+kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=get;create;update;delete +//+kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=use,resourceNames=privileged // Reconcile reconciles the NicConfigurationTemplate object func (r *NicConfigurationTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { diff --git a/pkg/consts/consts.go b/pkg/consts/consts.go index 89a0eb4..6599bb9 100644 --- a/pkg/consts/consts.go +++ b/pkg/consts/consts.go @@ -78,7 +78,7 @@ const ( HostPath = "/host" - SupportedNicFirmwareConfigmap = "supported-nic-firmware" + SupportedNicFirmwareConfigmap = "nic-configuration-operator-supported-nic-firmware" Mlx5ModuleVersionPath = "/sys/bus/pci/drivers/mlx5_core/module/version" FwConfigNotAppliedAfterRebootErrorMsg = "firmware configuration failed to apply after reboot"