Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add OCP bundle
Browse files Browse the repository at this point in the history
- build OCP bundle
- configure operator configmap via kustomize
- add required RBAC permissions to operate in openshift
- add a github action to build and push the bundle to ghcr on a new tag

Signed-off-by: Alexander Maslennikov <[email protected]>
almaslennikov committed Dec 20, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent cedb2e3 commit 2400250
Showing 31 changed files with 1,168 additions and 269 deletions.
45 changes: 38 additions & 7 deletions .github/workflows/image-push-release.yml
Original file line number Diff line number Diff line change
@@ -3,14 +3,18 @@ on:
push:
tags:
- v*
env:
REGISTRY: "ghcr.io"
OPERATOR_IMAGE_NAME: "nic-configuration-operator"
DAEMON_IMAGE_NAME: "nic-configuration-operator-daemon"
jobs:
image-build-push:
name: Image build and push
runs-on: ubuntu-latest
steps:
- name: Set repository as lower-case output variable
id: repo_name
run: echo ::set-output name=repository::$(echo ${{ github.repository }} | tr '[:upper:]' '[:lower:]')
- name: Set repository owner as lower-case output variable
id: repo_owner
run: echo REPO_OWNER=$(echo ${{ github.repository_owner }} | tr '[:upper:]' '[:lower:]') >> $GITHUB_ENV
- name: Check out code into the Go module directory
uses: actions/checkout@v4
with:
@@ -22,14 +26,14 @@ jobs:
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
registry: ${{ env.REGISTRY }}
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Docker operator image meta
id: docker_meta_operator
uses: docker/metadata-action@v5
with:
images: ghcr.io/${{ steps.repo_name.outputs.repository }}
images: ${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.OPERATOR_IMAGE_NAME }}
tags: |
type=ref,event=tag
flavor: |
@@ -46,7 +50,7 @@ jobs:
id: docker_meta_daemon
uses: docker/metadata-action@v5
with:
images: ghcr.io/${{ steps.repo_name.outputs.repository }}-daemon
images: ${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.DAEMON_IMAGE_NAME }}
tags: |
type=ref,event=tag
flavor: |
@@ -60,4 +64,31 @@ jobs:
${{ steps.docker_meta_daemon.outputs.tags }}
labels: ${{ steps.docker_meta_daemon.outputs.labels }}
file: ./Dockerfile.nic-configuration-daemon

- name: Determine version, tag, and base branch
run: |
git_tag=${{ github.ref_name }}
echo VERSION_WITH_PREFIX=$git_tag >> $GITHUB_ENV
echo VERSION_WITHOUT_PREFIX=${git_tag:1} >> $GITHUB_ENV # without the 'v' prefix
if echo $git_tag | grep beta; then
base_branch=$DEFAULT_BRANCH
else
v_major_minor=$(echo $git_tag | grep -Eo '^v[0-9]+\.[0-9]+')
base_branch=$v_major_minor.x
fi
echo BASE_BRANCH=$base_branch >> $GITHUB_ENV
- name: Lookup image digest
run: |
operator_digest=$(skopeo inspect docker://$REGISTRY/$REPO_OWNER/$OPERATOR_IMAGE_NAME:$VERSION_WITH_PREFIX | jq -r .Digest)
echo $operator_digest | wc -w | grep 1 # fails the step if the digest lookup returned an empty value
echo OPERATOR_DIGEST=$operator_digest >> $GITHUB_ENV
- name: Make bundle
env:
OPERATOR_IMAGE_TAG: ${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.OPERATOR_IMAGE_NAME }}@${{ env.OPERATOR_DIGEST }}
CONFIG_DAEMON_IMAGE_TAG: ${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.DAEMON_IMAGE_NAME }}:${{ github.ref_name }}
BUNDLE_IMG: ${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.OPERATOR_IMAGE_NAME }}-bundle:${{ github.ref_name }}
VERSION: ${{ env.VERSION_WITHOUT_PREFIX }}
run: |
version_major_minor=$(echo $VERSION_WITH_PREFIX | grep -Eo 'v[0-9]+\.[0-9]+')
export CHANNELS=stable,$version_major_minor
export DEFAULT_CHANNEL=$version_major_minor
make bundle bundle-build bundle-push
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -335,6 +335,7 @@ endif
bundle: manifests kustomize operator-sdk ## Generate bundle manifests and metadata, then validate generated files.
$(OPERATOR_SDK) generate kustomize manifests -q
cd config/manager && $(KUSTOMIZE) edit set image controller=$(OPERATOR_IMAGE_TAG)
cd config/daemon && $(KUSTOMIZE) edit set configmap config --from-literal=configDaemonImage=$(CONFIG_DAEMON_IMAGE_TAG) --from-literal=releaseVersion=${VERSION}
$(KUSTOMIZE) build config/manifests | $(OPERATOR_SDK) generate bundle $(BUNDLE_GEN_FLAGS)
$(OPERATOR_SDK) bundle validate ./bundle

@@ -344,7 +345,7 @@ bundle-build: ## Build the bundle image.

.PHONY: bundle-push
bundle-push: ## Push the bundle image.
$(MAKE) docker-push IMG=$(BUNDLE_IMG)
$(CONTAINER_TOOL) push $(BUNDLE_IMG)

.PHONY: opm
OPM = $(LOCALBIN)/opm
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.14.0
creationTimestamp: null
name: nicconfigurationtemplates.configuration.net.nvidia.com
spec:
group: configuration.net.nvidia.com
names:
kind: NicConfigurationTemplate
listKind: NicConfigurationTemplateList
plural: nicconfigurationtemplates
singular: nicconfigurationtemplate
scope: Namespaced
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description: NicConfigurationTemplate is the Schema for the nicconfigurationtemplates
API
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Defines the desired state of NICs
properties:
nicSelector:
description: NIC selector configuration
properties:
nicType:
description: Type of the NIC to be selected, e.g. 101d,1015,a2d6
etc.
type: string
pciAddresses:
description: Array of PCI addresses to be selected, e.g. "0000:03:00.0"
items:
type: string
type: array
serialNumbers:
description: Serial numbers of the NICs to be selected, e.g. MT2116X09299
items:
type: string
type: array
required:
- nicType
type: object
nodeSelector:
additionalProperties:
type: string
description: NodeSelector contains labels required on the node
type: object
resetToDefault:
default: false
description: |-
ResetToDefault specifies whether node agent needs to perform a reset flow
The following operations will be performed:
* Nvconfig reset of all non-volatile configurations
- Mstconfig -d <device> reset for each PF
- Mstconfig -d <device> set ADVANCED_PCI_SETTINGS=1
* Node reboot
- Applies new NIC NV config
- Will undo any runtime configuration previously performed for the device/driver
type: boolean
template:
description: Configuration template to be applied to matching devices
properties:
gpuDirectOptimized:
description: GPU Direct optimization settings
properties:
enabled:
description: Optimize GPU Direct
type: boolean
env:
description: GPU direct environment, e.g. Baremetal
type: string
required:
- enabled
- env
type: object
linkType:
description: LinkType to be configured, Ethernet|Infiniband
enum:
- Ethernet
- Infiniband
type: string
numVfs:
description: Number of VFs to be configured
type: integer
pciPerformanceOptimized:
description: PCI performance optimization settings
properties:
enabled:
description: Specifies whether to enable PCI performance optimization
type: boolean
maxAccOutRead:
description: Specifies the PCIe Max Accumulative Outstanding
read bytes
type: integer
maxReadRequest:
description: Specifies the size of a single PCI read request
in bytes
enum:
- 128
- 256
- 512
- 1024
- 2048
- 4096
type: integer
required:
- enabled
type: object
roceOptimized:
description: RoCE optimization settings
properties:
enabled:
description: Optimize RoCE
type: boolean
qos:
description: Quality of Service settings
properties:
pfc:
description: Priority-based Flow Control configuration,
e.g. "0,0,0,1,0,0,0,0"
pattern: ^([01],){7}[01]$
type: string
trust:
description: Trust mode for QoS settings, e.g. trust-dscp
type: string
required:
- pfc
- trust
type: object
required:
- enabled
type: object
required:
- linkType
- numVfs
type: object
required:
- nicSelector
- template
type: object
status:
description: Defines the observed state of NicConfigurationTemplate
properties:
nicDevices:
description: NicDevice CRs matching this configuration template
items:
type: string
type: array
type: object
type: object
served: true
storage: true
subresources:
status: {}
status:
acceptedNames:
kind: ""
plural: ""
conditions: null
storedVersions: null
270 changes: 270 additions & 0 deletions bundle/manifests/configuration.net.nvidia.com_nicdevices.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,270 @@
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.14.0
creationTimestamp: null
name: nicdevices.configuration.net.nvidia.com
spec:
group: configuration.net.nvidia.com
names:
kind: NicDevice
listKind: NicDeviceList
plural: nicdevices
singular: nicdevice
scope: Namespaced
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description: NicDevice is the Schema for the nicdevices API
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: NicDeviceSpec defines the desired state of NicDevice
properties:
configuration:
description: Configuration specifies the configuration requested by
NicConfigurationTemplate
properties:
resetToDefault:
description: |-
ResetToDefault specifies whether node agent needs to perform a reset flow.
In NIC Configuration Operator template v0.1.14 BF2/BF3 DPUs (not SuperNics) FW reset flow isn't supported.
The following operations will be performed:
* Nvconfig reset of all non-volatile configurations
- Mstconfig -d <device> reset for each PF
- Mstconfig -d <device> set ADVANCED_PCI_SETTINGS=1
* Node reboot
- Applies new NIC NV config
- Will undo any runtime configuration previously performed for the device/driver
type: boolean
template:
description: Configuration template applied from the NicConfigurationTemplate
CR
properties:
gpuDirectOptimized:
description: GPU Direct optimization settings
properties:
enabled:
description: Optimize GPU Direct
type: boolean
env:
description: GPU direct environment, e.g. Baremetal
type: string
required:
- enabled
- env
type: object
linkType:
description: LinkType to be configured, Ethernet|Infiniband
enum:
- Ethernet
- Infiniband
type: string
numVfs:
description: Number of VFs to be configured
type: integer
pciPerformanceOptimized:
description: PCI performance optimization settings
properties:
enabled:
description: Specifies whether to enable PCI performance
optimization
type: boolean
maxAccOutRead:
description: Specifies the PCIe Max Accumulative Outstanding
read bytes
type: integer
maxReadRequest:
description: Specifies the size of a single PCI read request
in bytes
enum:
- 128
- 256
- 512
- 1024
- 2048
- 4096
type: integer
required:
- enabled
type: object
roceOptimized:
description: RoCE optimization settings
properties:
enabled:
description: Optimize RoCE
type: boolean
qos:
description: Quality of Service settings
properties:
pfc:
description: Priority-based Flow Control configuration,
e.g. "0,0,0,1,0,0,0,0"
pattern: ^([01],){7}[01]$
type: string
trust:
description: Trust mode for QoS settings, e.g. trust-dscp
type: string
required:
- pfc
- trust
type: object
required:
- enabled
type: object
required:
- linkType
- numVfs
type: object
type: object
type: object
status:
description: NicDeviceStatus defines the observed state of NicDevice
properties:
conditions:
description: List of conditions observed for the device
items:
description: "Condition contains details for one aspect of the current
state of this API Resource.\n---\nThis struct is intended for
direct use as an array at the field path .status.conditions. For
example,\n\n\n\ttype FooStatus struct{\n\t // Represents the
observations of a foo's current state.\n\t // Known .status.conditions.type
are: \"Available\", \"Progressing\", and \"Degraded\"\n\t //
+patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t
\ // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\"
patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t
\ // other fields\n\t}"
properties:
lastTransitionTime:
description: |-
lastTransitionTime is the last time the condition transitioned from one status to another.
This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable.
format: date-time
type: string
message:
description: |-
message is a human readable message indicating details about the transition.
This may be an empty string.
maxLength: 32768
type: string
observedGeneration:
description: |-
observedGeneration represents the .metadata.generation that the condition was set based upon.
For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
with respect to the current state of the instance.
format: int64
minimum: 0
type: integer
reason:
description: |-
reason contains a programmatic identifier indicating the reason for the condition's last transition.
Producers of specific condition types may define expected values and meanings for this field,
and whether the values are considered a guaranteed API.
The value should be a CamelCase string.
This field may not be empty.
maxLength: 1024
minLength: 1
pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
type: string
status:
description: status of the condition, one of True, False, Unknown.
enum:
- "True"
- "False"
- Unknown
type: string
type:
description: |-
type of condition in CamelCase or in foo.example.com/CamelCase.
---
Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be
useful (see .node.status.conditions), the ability to deconflict is important.
The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt)
maxLength: 316
pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
type: string
required:
- lastTransitionTime
- message
- reason
- status
- type
type: object
type: array
firmwareVersion:
description: Firmware version currently installed on the device, e.g.
22.31.1014
type: string
node:
description: Node where the device is located
type: string
partNumber:
description: Part number of the device, e.g. MCX713106AEHEA_QP1
type: string
ports:
description: List of ports for the device
items:
description: NicDevicePortSpec describes the ports of the NIC
properties:
networkInterface:
description: NetworkInterface is the name of the network interface
for this port, e.g. eth1
type: string
pci:
description: PCI is a PCI address of the port, e.g. 0000:3b:00.0
type: string
rdmaInterface:
description: RdmaInterface is the name of the rdma interface
for this port, e.g. mlx5_1
type: string
required:
- pci
type: object
type: array
psid:
description: Product Serial ID of the device, e.g. MT_0000000221
type: string
serialNumber:
description: Serial number of the device, e.g. MT2116X09299
type: string
type:
description: Type of device, e.g. ConnectX7
type: string
required:
- firmwareVersion
- node
- partNumber
- ports
- psid
- serialNumber
- type
type: object
type: object
served: true
storage: true
subresources:
status: {}
status:
acceptedNames:
kind: ""
plural: ""
conditions: null
storedVersions: null
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
apiVersion: v1
data:
configDaemonImage: harbor.mellanox.com/cloud-orchestration-dev/nic-configuration-operator-daemon:latest
logLevel: info
releaseVersion: 1.1.0
serviceAccountName: nic-configuration-operator-controller-manager
kind: ConfigMap
metadata:
name: nic-configuration-operator-config
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
apiVersion: v1
data:
Nvidia_mlx5_ConnectX-4-24.07: 1013 24.07-0.6.1 12.28.2006
Nvidia_mlx5_ConnectX-4-24.10: 1013 24.10-0.7.0 12.28.2006
Nvidia_mlx5_ConnectX-4_Lx-24.10: 1013 24.10-0.7.0 14.32.1010
Nvidia_mlx5_ConnectX-5-24.07: 1017 24.07-0.6.1 16.35.4030
Nvidia_mlx5_ConnectX-5-24.10: 1017 24.10-0.7.0 16.35.4030
Nvidia_mlx5_ConnectX-5_Ex-24.07: 1019 24.07-0.6.1 16.35.4030
Nvidia_mlx5_ConnectX-5_Ex-24.10: 1019 24.10-0.7.0 16.35.4030
Nvidia_mlx5_ConnectX-6-24.07: 101b 24.07-0.6.1 20.42.1000
Nvidia_mlx5_ConnectX-6-24.10: 101b 24.10-0.7.0 20.43.1014
Nvidia_mlx5_ConnectX-6_Dx-24.07: 101d 24.07-0.6.1 22.42.1000
Nvidia_mlx5_ConnectX-6_Dx-24.10: 101d 24.10-0.7.0 22.43.1014
Nvidia_mlx5_ConnectX-6_Lx-24.07: 101f 24.07-0.6.1 26.42.1000
Nvidia_mlx5_ConnectX-6_Lx-24.10: 101f 24.10-0.7.0 26.43.1014
Nvidia_mlx5_ConnectX-7-24.07: 1021 24.07-0.6.1 28.42.1000
Nvidia_mlx5_ConnectX-7-24.10: 1021 24.10-0.7.0 28.43.1014
Nvidia_mlx5_MT42822_BlueField-2_integrated_ConnectX-6_Dx-24.07: a2d6 24.07-0.6.1
24.42.1000
Nvidia_mlx5_MT42822_BlueField-2_integrated_ConnectX-6_Dx-24.10: a2d6 24.10-0.7.0
22.43.1014
kind: ConfigMap
metadata:
name: nic-configuration-operator-supported-nic-firmware
373 changes: 373 additions & 0 deletions bundle/manifests/nic-configuration-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,373 @@
apiVersion: operators.coreos.com/v1alpha1
kind: ClusterServiceVersion
metadata:
annotations:
alm-examples: |-
[
{
"apiVersion": "configuration.net.nvidia.com/v1alpha1",
"kind": "NicConfigurationTemplate",
"metadata": {
"name": "connectx6-config",
"namespace": "nic-configuration-operator"
},
"spec": {
"nicSelector": {
"nicType": "101b",
"pciAddresses": [
"0000:03:00.0",
"0000:04:00.0"
],
"serialNumbers": [
"MT2116X09299"
]
},
"nodeSelector": {
"feature.node.kubernetes.io/network-sriov.capable": "true"
},
"resetToDefault": false,
"template": {
"gpuDirectOptimized": {
"enabled": true,
"env": "Baremetal"
},
"linkType": "Ethernet",
"numVfs": 2,
"pciPerformanceOptimized": {
"enabled": true,
"maxAccOutRead": 44,
"maxReadRequest": 4096
},
"roceOptimized": {
"enabled": true,
"qos": {
"pfc": "0,0,0,1,0,0,0,0",
"trust": "dscp"
}
}
}
}
},
{
"apiVersion": "configuration.net.nvidia.com/v1alpha1",
"kind": "NicDevice",
"metadata": {
"name": "co-node-25-101b-mt2232t13210",
"namespace": "nic-configuration-operator"
},
"spec": {
"configuration": {
"template": {
"linkType": "Ethernet",
"numVfs": 8,
"pciPerformanceOptimized": {
"enabled": true
}
}
}
},
"status": {
"conditions": [
{
"reason": "UpdateSuccessful",
"status": "False",
"type": "ConfigUpdateInProgress"
}
],
"firmwareVersion": "20.42.1000",
"node": "co-node-25",
"partNumber": "mcx632312a-hdat",
"ports": [
{
"networkInterface": "enp4s0f0np0",
"pci": "0000:04:00.0",
"rdmaInterface": "mlx5_0"
},
{
"networkInterface": "enp4s0f1np1",
"pci": "0000:04:00.1",
"rdmaInterface": "mlx5_1"
}
],
"psid": "mt_0000000225",
"serialNumber": "mt2232t13210",
"type": "101b"
}
}
]
capabilities: Basic Install
createdAt: "2024-12-19T11:17:35Z"
operators.operatorframework.io/builder: operator-sdk-v1.37.0
operators.operatorframework.io/project_layout: go.kubebuilder.io/v4
name: nic-configuration-operator.v1.1.0
namespace: placeholder
spec:
apiservicedefinitions: {}
customresourcedefinitions:
owned:
- description: NicConfigurationTemplate is the Schema for the nicconfigurationtemplates
API
displayName: Nic Configuration Template
kind: NicConfigurationTemplate
name: nicconfigurationtemplates.configuration.net.nvidia.com
version: v1alpha1
- description: NicDevice is the Schema for the nicdevices API
displayName: Nic Device
kind: NicDevice
name: nicdevices.configuration.net.nvidia.com
version: v1alpha1
description: NVIDIA NIC Configuration Operator provides Kubernetes API (Custom Resource
Definition) to allow FW configuration on NVIDIA NICs in a coordinated manner.
It deploys a configuration daemon on each of the desired nodes to configure NVIDIA
NICs there.
displayName: NVIDIA NIC Configuration Operator
icon:
- base64data: ""
mediatype: ""
install:
spec:
clusterPermissions:
- rules:
- apiGroups:
- ""
resources:
- configmaps
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- events
verbs:
- create
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- list
- patch
- update
- watch
- apiGroups:
- ""
resources:
- pods
verbs:
- list
- apiGroups:
- ""
resources:
- pods/eviction
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- apps
resources:
- daemonsets
verbs:
- create
- delete
- get
- update
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicconfigurationtemplates
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicconfigurationtemplates/finalizers
verbs:
- update
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicconfigurationtemplates/status
verbs:
- get
- patch
- update
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicdevices
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicdevices/finalizers
verbs:
- update
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicdevices/status
verbs:
- get
- patch
- update
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- create
- get
- update
- apiGroups:
- maintenance.nvidia.com
resources:
- nodemaintenances
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- security.openshift.io
resourceNames:
- privileged
resources:
- securitycontextconstraints
verbs:
- use
serviceAccountName: nic-configuration-operator-controller-manager
deployments:
- label:
app.kubernetes.io/managed-by: kustomize
app.kubernetes.io/name: nic-configuration-operator
control-plane: controller-manager
name: nic-configuration-operator-controller-manager
spec:
replicas: 1
selector:
matchLabels:
control-plane: controller-manager
strategy: {}
template:
metadata:
annotations:
kubectl.kubernetes.io/default-container: manager
labels:
control-plane: controller-manager
spec:
containers:
- args:
- --secure-listen-address=0.0.0.0:8443
- --upstream=http://127.0.0.1:8080/
- --logtostderr=true
- --v=0
image: gcr.io/kubebuilder/kube-rbac-proxy:v0.16.0
name: kube-rbac-proxy
ports:
- containerPort: 8443
name: https
protocol: TCP
resources:
limits:
cpu: 500m
memory: 128Mi
requests:
cpu: 5m
memory: 64Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
- args:
- --health-probe-bind-address=:8081
- --metrics-bind-address=127.0.0.1:8080
- --leader-elect
command:
- /manager
env:
- name: LOG_LEVEL
value: info
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
image: harbor.mellanox.com/cloud-orchestration-dev/amaslennikov/nic-configuration-operator@sha256:806cff1a608bec79e47cbd410968284d64f6f01b814ae668f3f7e7234001b80b
livenessProbe:
httpGet:
path: /healthz
port: 8081
initialDelaySeconds: 15
periodSeconds: 20
name: manager
readinessProbe:
httpGet:
path: /readyz
port: 8081
initialDelaySeconds: 5
periodSeconds: 10
resources:
limits:
cpu: 500m
memory: 128Mi
requests:
cpu: 10m
memory: 64Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
securityContext:
runAsNonRoot: true
serviceAccountName: nic-configuration-operator-controller-manager
terminationGracePeriodSeconds: 10
strategy: deployment
installModes:
- supported: true
type: OwnNamespace
- supported: true
type: SingleNamespace
- supported: false
type: MultiNamespace
- supported: false
type: AllNamespaces
keywords:
- node
- fw
- configuration
- nic
links:
- name: Nic Configuration Operator
url: https://github.com/Mellanox/nic-configuration-operator
maintainers:
- email: nvidia-network-operator-support@nvidia.com
name: NVIDIA
maturity: alpha
provider:
name: NVIDIA
version: 1.1.0
15 changes: 15 additions & 0 deletions bundle/metadata/annotations.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
annotations:
# Core bundle annotations.
operators.operatorframework.io.bundle.mediatype.v1: registry+v1
operators.operatorframework.io.bundle.manifests.v1: manifests/
operators.operatorframework.io.bundle.metadata.v1: metadata/
operators.operatorframework.io.bundle.package.v1: nic-configuration-operator
operators.operatorframework.io.bundle.channels.v1: v1.1,stable
operators.operatorframework.io.bundle.channel.default.v1: v1.1
operators.operatorframework.io.metrics.builder: operator-sdk-v1.37.0
operators.operatorframework.io.metrics.mediatype.v1: metrics+v1
operators.operatorframework.io.metrics.project_layout: go.kubebuilder.io/v4

# Annotations for testing.
operators.operatorframework.io.test.mediatype.v1: scorecard+v1
operators.operatorframework.io.test.config.v1: tests/scorecard/
70 changes: 70 additions & 0 deletions bundle/tests/scorecard/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
apiVersion: scorecard.operatorframework.io/v1alpha3
kind: Configuration
metadata:
name: config
stages:
- parallel: true
tests:
- entrypoint:
- scorecard-test
- basic-check-spec
image: quay.io/operator-framework/scorecard-test:v1.36.0
labels:
suite: basic
test: basic-check-spec-test
storage:
spec:
mountPath: {}
- entrypoint:
- scorecard-test
- olm-bundle-validation
image: quay.io/operator-framework/scorecard-test:v1.36.0
labels:
suite: olm
test: olm-bundle-validation-test
storage:
spec:
mountPath: {}
- entrypoint:
- scorecard-test
- olm-crds-have-validation
image: quay.io/operator-framework/scorecard-test:v1.36.0
labels:
suite: olm
test: olm-crds-have-validation-test
storage:
spec:
mountPath: {}
- entrypoint:
- scorecard-test
- olm-crds-have-resources
image: quay.io/operator-framework/scorecard-test:v1.36.0
labels:
suite: olm
test: olm-crds-have-resources-test
storage:
spec:
mountPath: {}
- entrypoint:
- scorecard-test
- olm-spec-descriptors
image: quay.io/operator-framework/scorecard-test:v1.36.0
labels:
suite: olm
test: olm-spec-descriptors-test
storage:
spec:
mountPath: {}
- entrypoint:
- scorecard-test
- olm-status-descriptors
image: quay.io/operator-framework/scorecard-test:v1.36.0
labels:
suite: olm
test: olm-status-descriptors-test
storage:
spec:
mountPath: {}
storage:
spec:
mountPath: {}
15 changes: 14 additions & 1 deletion config/daemon/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,15 @@
resources:
- daemon.yaml
- daemon.yaml

configMapGenerator:
- literals:
- logLevel=info
- releaseVersion=1.1.0
- serviceAccountName=nic-configuration-operator-controller-manager
- configDaemonImage=harbor.mellanox.com/cloud-orchestration-dev/nic-configuration-operator-daemon:latest
- clusterType=openshift
name: config
options:
disableNameSuffixHash: true
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
6 changes: 3 additions & 3 deletions config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -3,6 +3,6 @@ resources:
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
images:
- name: controller
newName: nic-configuration-operator
newTag: latest
- digest: sha256:806cff1a608bec79e47cbd410968284d64f6f01b814ae668f3f7e7234001b80b
name: controller
newName: harbor.mellanox.com/cloud-orchestration-dev/amaslennikov/nic-configuration-operator
4 changes: 1 addition & 3 deletions config/manager/manager.yaml
Original file line number Diff line number Diff line change
@@ -71,7 +71,7 @@ spec:
- "ALL"
env:
- name: LOG_LEVEL
value: debug
value: info
- name: NAMESPACE
valueFrom:
fieldRef:
@@ -88,8 +88,6 @@ spec:
port: 8081
initialDelaySeconds: 5
periodSeconds: 10
# TODO(user): Configure the resources accordingly based on the project requirements.
# More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
resources:
limits:
cpu: 500m
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
apiVersion: operators.coreos.com/v1alpha1
kind: ClusterServiceVersion
metadata:
annotations:
alm-examples: '[]'
capabilities: Basic Install
name: nic-configuration-operator.v0.0.0
namespace: placeholder
spec:
apiservicedefinitions: {}
customresourcedefinitions:
owned:
- description: NicConfigurationTemplate is the Schema for the nicconfigurationtemplates
API
displayName: Nic Configuration Template
kind: NicConfigurationTemplate
name: nicconfigurationtemplates.configuration.net.nvidia.com
version: v1alpha1
- description: NicDevice is the Schema for the nicdevices API
displayName: Nic Device
kind: NicDevice
name: nicdevices.configuration.net.nvidia.com
version: v1alpha1
description: NVIDIA NIC Configuration Operator provides Kubernetes API (Custom Resource
Definition) to allow FW configuration on NVIDIA NICs in a coordinated manner.
It deploys a configuration daemon on each of the desired nodes to configure NVIDIA
NICs there.
displayName: NVIDIA NIC Configuration Operator
icon:
- base64data: ""
mediatype: ""
install:
spec:
deployments: null
strategy: ""
installModes:
- supported: true
type: OwnNamespace
- supported: true
type: SingleNamespace
- supported: false
type: MultiNamespace
- supported: false
type: AllNamespaces
keywords:
- node
- fw
- configuration
- nic
links:
- name: Nic Configuration Operator
url: https://github.com/Mellanox/nic-configuration-operator
maintainers:
- email: nvidia-network-operator-support@nvidia.com
name: NVIDIA
maturity: alpha
provider:
name: NVIDIA
version: 0.0.0
12 changes: 0 additions & 12 deletions config/rbac/auth_proxy_client_clusterrole.yaml

This file was deleted.

20 changes: 0 additions & 20 deletions config/rbac/auth_proxy_role.yaml

This file was deleted.

15 changes: 0 additions & 15 deletions config/rbac/auth_proxy_role_binding.yaml

This file was deleted.

17 changes: 0 additions & 17 deletions config/rbac/auth_proxy_service.yaml

This file was deleted.

22 changes: 0 additions & 22 deletions config/rbac/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,26 +1,4 @@
resources:
# All RBAC will be applied under this service account in
# the deployment namespace. You may comment out this resource
# if your manager will use a service account that exists at
# runtime. Be sure to update RoleBinding and ClusterRoleBinding
# subjects if changing service account names.
- service_account.yaml
- role.yaml
- role_binding.yaml
- leader_election_role.yaml
- leader_election_role_binding.yaml
# Comment the following 4 lines if you want to disable
# the auth proxy (https://github.com/brancz/kube-rbac-proxy)
# which protects your /metrics endpoint.
- auth_proxy_service.yaml
- auth_proxy_role.yaml
- auth_proxy_role_binding.yaml
- auth_proxy_client_clusterrole.yaml
# For each CRD, "Editor" and "Viewer" roles are scaffolded by
# default, aiding admins in cluster management. Those roles are
# not used by the Project itself. You can comment the following lines
# if you do not want those helpers be installed with your Project.
- nicdevice_editor_role.yaml
- nicdevice_viewer_role.yaml
- nicconfigurationtemplate_editor_role.yaml
- nicconfigurationtemplate_viewer_role.yaml
40 changes: 0 additions & 40 deletions config/rbac/leader_election_role.yaml

This file was deleted.

15 changes: 0 additions & 15 deletions config/rbac/leader_election_role_binding.yaml

This file was deleted.

27 changes: 0 additions & 27 deletions config/rbac/nicconfigurationtemplate_editor_role.yaml

This file was deleted.

23 changes: 0 additions & 23 deletions config/rbac/nicconfigurationtemplate_viewer_role.yaml

This file was deleted.

27 changes: 0 additions & 27 deletions config/rbac/nicdevice_editor_role.yaml

This file was deleted.

23 changes: 0 additions & 23 deletions config/rbac/nicdevice_viewer_role.yaml

This file was deleted.

31 changes: 31 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
@@ -12,6 +12,12 @@ rules:
- get
- list
- watch
- apiGroups:
- ""
resources:
- events
verbs:
- create
- apiGroups:
- ""
resources:
@@ -40,6 +46,15 @@ rules:
- patch
- update
- watch
- apiGroups:
- apps
resources:
- daemonsets
verbs:
- create
- delete
- get
- update
- apiGroups:
- configuration.net.nvidia.com
resources:
@@ -92,6 +107,14 @@ rules:
- get
- patch
- update
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- create
- get
- update
- apiGroups:
- maintenance.nvidia.com
resources:
@@ -104,3 +127,11 @@ rules:
- patch
- update
- watch
- apiGroups:
- security.openshift.io
resourceNames:
- privileged
resources:
- securitycontextconstraints
verbs:
- use
Original file line number Diff line number Diff line change
@@ -1,9 +1,32 @@
apiVersion: configuration.net.nvidia.com/v1alpha1
kind: NicConfigurationTemplate
metadata:
labels:
app.kubernetes.io/name: nic-configuration-operator
app.kubernetes.io/managed-by: kustomize
name: nicconfigurationtemplate-sample
name: connectx6-config
namespace: nic-configuration-operator
spec:
# TODO(user): Add fields here
nodeSelector:
feature.node.kubernetes.io/network-sriov.capable: "true"
nicSelector:
# nicType selector is mandatory; the rest are optional. Only a single type can be specified.
nicType: 101b
pciAddresses:
- "0000:03:00.0"
- "0000:04:00.0"
serialNumbers:
- "MT2116X09299"
resetToDefault: false # if set, the template is ignored and the device configuration should be reset
template:
numVfs: 2
linkType: Ethernet
pciPerformanceOptimized:
enabled: true
maxAccOutRead: 44
maxReadRequest: 4096
roceOptimized:
enabled: true
qos:
trust: dscp
pfc: "0,0,0,1,0,0,0,0"
gpuDirectOptimized:
enabled: true
env: Baremetal
31 changes: 26 additions & 5 deletions config/samples/configuration.net_v1alpha1_nicdevice.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,30 @@
apiVersion: configuration.net.nvidia.com/v1alpha1
kind: NicDevice
metadata:
labels:
app.kubernetes.io/name: nic-configuration-operator
app.kubernetes.io/managed-by: kustomize
name: nicdevice-sample
name: co-node-25-101b-mt2232t13210
namespace: nic-configuration-operator
spec:
# TODO(user): Add fields here
configuration:
template:
linkType: Ethernet
numVfs: 8
pciPerformanceOptimized:
enabled: true
status:
conditions:
- reason: UpdateSuccessful
status: "False"
type: ConfigUpdateInProgress
firmwareVersion: 20.42.1000
node: co-node-25
partNumber: mcx632312a-hdat
ports:
- networkInterface: enp4s0f0np0
pci: "0000:04:00.0"
rdmaInterface: mlx5_0
- networkInterface: enp4s0f1np1
pci: "0000:04:00.1"
rdmaInterface: mlx5_1
psid: mt_0000000225
serialNumber: mt2232t13210
type: 101b
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: nic-configuration-operator-config
data:
serviceAccountName: "{{ include "nic-configuration-operator.serviceAccountName" . }}"
configDaemonImage: "{{ .Values.configDaemon.image.repository }}/{{ .Values.configDaemon.image.name }}:{{ .Values.configDaemon.image.tag | default .Chart.AppVersion }}"
{{- if .Values.imagePullSecrets}}
imagePullSecrets: {{ join "," .Values.imagePullSecrets }}
{{- end}}
{{- if .Values.configDaemon.nodeSelector}}
nodeSelector: {{ .Values.configDaemon.nodeSelector | toJson | quote }}
{{- end}}
{{- if .Values.configDaemon.resources}}
resources: {{ .Values.configDaemon.resources | toJson | quote }}
{{- end}}
{{- if .Values.logLevel}}
logLevel: {{ .Values.logLevel }}
{{- end}}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: supported-nic-firmware
name: nic-configuration-operator-supported-nic-firmware
data:
Nvidia_mlx5_ConnectX-4-24.07: "1013 24.07-0.6.1 12.28.2006"
Nvidia_mlx5_ConnectX-5-24.07: "1017 24.07-0.6.1 16.35.4030"
5 changes: 4 additions & 1 deletion internal/controller/nicconfigurationtemplate_controller.go
Original file line number Diff line number Diff line change
@@ -57,10 +57,13 @@ type NicConfigurationTemplateReconciler struct {
//+kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicdevices/finalizers,verbs=update
//+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;update;patch
//+kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch
//+kubebuilder:rbac:groups="",resources=configmaps,verbs=get
//+kubebuilder:rbac:groups="",resources=events,verbs=create
//+kubebuilder:rbac:groups="",resources=pods,verbs=list
//+kubebuilder:rbac:groups="",resources=pods/eviction,verbs=create;delete;get;list;patch;update;watch
//+kubebuilder:rbac:groups=maintenance.nvidia.com,resources=nodemaintenances,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;update;create
//+kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=get;create;update;delete
//+kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=use,resourceNames=privileged

// Reconcile reconciles the NicConfigurationTemplate object
func (r *NicConfigurationTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
2 changes: 1 addition & 1 deletion pkg/consts/consts.go
Original file line number Diff line number Diff line change
@@ -78,7 +78,7 @@ const (

HostPath = "/host"

SupportedNicFirmwareConfigmap = "supported-nic-firmware"
SupportedNicFirmwareConfigmap = "nic-configuration-operator-supported-nic-firmware"
Mlx5ModuleVersionPath = "/sys/bus/pci/drivers/mlx5_core/module/version"

FwConfigNotAppliedAfterRebootErrorMsg = "firmware configuration failed to apply after reboot"

0 comments on commit 2400250

Please sign in to comment.