diff --git a/charts/intel-gaudi-resource-driver/Chart.yaml b/charts/intel-gaudi-resource-driver/Chart.yaml index a354e08..0e78390 100644 --- a/charts/intel-gaudi-resource-driver/Chart.yaml +++ b/charts/intel-gaudi-resource-driver/Chart.yaml @@ -3,5 +3,5 @@ name: intel-gaudi-resource-driver description: A Helm chart for a Dynamic Resource Allocation (DRA) Intel Gaudi Resource Driver type: application -version: 0.1.1 -appVersion: "v0.1.1" +version: 0.2.0 +appVersion: "v0.2.0" diff --git a/charts/intel-gaudi-resource-driver/README.md b/charts/intel-gaudi-resource-driver/README.md index 5882975..b54c538 100644 --- a/charts/intel-gaudi-resource-driver/README.md +++ b/charts/intel-gaudi-resource-driver/README.md @@ -43,9 +43,7 @@ You may also run `helm show values` on this chart's dependencies for additional | image.repository | string | `intel` | | image.name | string | `"intel-gaudi-resource-driver"` | | image.pullPolicy | string | `"IfNotPresent"` | -| image.tag | string | `"v0.1.1"` | - -If you change the image tag to be used in Helm chart deployment, ensure that the version of the container image is consistent with CRDs and deployment YAMLs - they might change between releases. +| image.tag | string | `"v0.2.0"` | > [!Note] -> Helm does not support _upgrading_ (or deleting) CRDs to prevent data loss. Only installing CRDs is supported. 
Details: https://github.com/helm/community/blob/main/hips/hip-0011.md +> When upgrading, CRDs from the previous version need to be removed manually because Helm supports neither upgrading nor deleting CRDs; see: https://github.com/helm/community/blob/main/hips/hip-0011.md diff --git a/charts/intel-gaudi-resource-driver/crds/gaudi.resource.intel.com_gaudiallocationstates.yaml b/charts/intel-gaudi-resource-driver/crds/gaudi.resource.intel.com_gaudiallocationstates.yaml deleted file mode 100644 index 04cd6fc..0000000 --- a/charts/intel-gaudi-resource-driver/crds/gaudi.resource.intel.com_gaudiallocationstates.yaml +++ /dev/null @@ -1,110 +0,0 @@ ---- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - controller-gen.kubebuilder.io/version: v0.15.0 - name: gaudiallocationstates.gaudi.resource.intel.com -spec: - group: gaudi.resource.intel.com - names: - kind: GaudiAllocationState - listKind: GaudiAllocationStateList - plural: gaudiallocationstates - singular: gas - scope: Namespaced - versions: - - name: v1alpha1 - schema: - openAPIV3Schema: - description: - GaudiAllocationState holds the state required for allocation - on a node. - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. 
- More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: - GaudiAllocationStateSpec is the spec for the GaudiAllocationState - CRD. - properties: - allocatableDevices: - additionalProperties: - description: - AllocatableDevice represents an allocatable Gaudi on - a node. - properties: - model: - description: PCI ID of the Gaudi device. - type: string - uid: - description: - "Unique identifier of device: PCI address and PCI - Device ID." - type: string - required: - - model - - uid - type: object - type: object - allocatedClaims: - additionalProperties: - description: Resources that were allocated for the claim by controller. - properties: - devices: - description: - AllocatedDevices represents a list of allocated - devices on a node. - items: - description: - AllocatedDevice represents an allocated Gaudi - on a node. - properties: - uid: - description: - "Unique identifier of device: PCI address - and PCI Device ID." - type: string - required: - - uid - type: object - maxItems: 640 - type: array - required: - - devices - type: object - type: object - taintedDevices: - additionalProperties: - description: TaintedDevice represents a tainted Gaudi on a node. - properties: - reasons: - additionalProperties: - type: boolean - description: |- - Reasons why device is tainted, which _all_ need to be - resolved, before device can be dropped from taints map. 
- type: object - type: object - type: object - type: object - status: - type: string - type: object - served: true - storage: true diff --git a/charts/intel-gaudi-resource-driver/crds/gaudi.resource.intel.com_gaudiclaimparameters.yaml b/charts/intel-gaudi-resource-driver/crds/gaudi.resource.intel.com_gaudiclaimparameters.yaml deleted file mode 100644 index 128349b..0000000 --- a/charts/intel-gaudi-resource-driver/crds/gaudi.resource.intel.com_gaudiclaimparameters.yaml +++ /dev/null @@ -1,57 +0,0 @@ ---- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - controller-gen.kubebuilder.io/version: v0.15.0 - name: gaudiclaimparameters.gaudi.resource.intel.com -spec: - group: gaudi.resource.intel.com - names: - kind: GaudiClaimParameters - listKind: GaudiClaimParametersList - plural: gaudiclaimparameters - singular: gaudiclaimparameters - scope: Namespaced - versions: - - name: v1alpha1 - schema: - openAPIV3Schema: - description: - GaudiClaimParameters holds the set of parameters provided when - creating a resource claim for a device. - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: - GaudiClaimParametersSpec is the spec for the GaudiClaimParameters - CRD. - properties: - count: - description: How many devices is requested. 
- format: int64 - maximum: 8 - minimum: 1 - type: integer - required: - - count - type: object - type: object - served: true - storage: true diff --git a/charts/intel-gaudi-resource-driver/crds/gaudi.resource.intel.com_gaudiclassparameters.yaml b/charts/intel-gaudi-resource-driver/crds/gaudi.resource.intel.com_gaudiclassparameters.yaml deleted file mode 100644 index 8d6c6e8..0000000 --- a/charts/intel-gaudi-resource-driver/crds/gaudi.resource.intel.com_gaudiclassparameters.yaml +++ /dev/null @@ -1,63 +0,0 @@ ---- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - controller-gen.kubebuilder.io/version: v0.15.0 - name: gaudiclassparameters.gaudi.resource.intel.com -spec: - group: gaudi.resource.intel.com - names: - kind: GaudiClassParameters - listKind: GaudiClassParametersList - plural: gaudiclassparameters - singular: gaudiclassparameters - scope: Cluster - versions: - - name: v1alpha1 - schema: - openAPIV3Schema: - description: - GaudiClassParameters holds the set of parameters provided when - creating a resource class for this driver. - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: - GaudiClassParametersSpec is the spec for the GaudiClassParametersSpec - CRD. 
- properties: - deviceSelector: - items: - description: - DeviceSelector allows one to match on a specific type - of Device as part of the class. - properties: - name: - type: string - required: - - name - type: object - type: array - monitor: - type: boolean - type: object - type: object - served: true - storage: true diff --git a/charts/intel-gaudi-resource-driver/templates/clusterrole.yaml b/charts/intel-gaudi-resource-driver/templates/clusterrole.yaml index 61533fa..3dea9b1 100644 --- a/charts/intel-gaudi-resource-driver/templates/clusterrole.yaml +++ b/charts/intel-gaudi-resource-driver/templates/clusterrole.yaml @@ -5,11 +5,11 @@ metadata: namespace: {{ include "intel-gaudi-resource-driver.namespace" . }} rules: - apiGroups: [""] - resources: ["pods", "nodes", "events"] - verbs: ["get", "list", "create", "watch", "patch"] + resources: ["nodes"] + verbs: ["get"] - apiGroups: ["resource.k8s.io"] - resources: ["resourceclaims", "resourceclasses", "podschedulings","resourceclaims/status", "podschedulings/status", "podschedulingcontexts", "podschedulingcontexts/status"] - verbs: ["get", "update", "list", "watch", "patch"] -- apiGroups: ["gaudi.resource.intel.com"] - resources: ["*"] - verbs: ["*"] \ No newline at end of file + resources: ["resourceslices"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["resource.k8s.io"] + resources: ["resourceclaims"] + verbs: ["get"] diff --git a/charts/intel-gaudi-resource-driver/templates/device-class.yaml b/charts/intel-gaudi-resource-driver/templates/device-class.yaml new file mode 100644 index 0000000..6be276b --- /dev/null +++ b/charts/intel-gaudi-resource-driver/templates/device-class.yaml @@ -0,0 +1,9 @@ +apiVersion: resource.k8s.io/v1alpha3 +kind: DeviceClass +metadata: + name: gaudi.intel.com + +spec: + selectors: + - cel: + expression: device.driver == "gaudi.intel.com" diff --git a/charts/intel-gaudi-resource-driver/templates/resource-class.yaml 
b/charts/intel-gaudi-resource-driver/templates/resource-class.yaml deleted file mode 100644 index 6ea3dcf..0000000 --- a/charts/intel-gaudi-resource-driver/templates/resource-class.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: resource.k8s.io/v1alpha2 -kind: ResourceClass -metadata: - name: intel-gaudi -driverName: gaudi.resource.intel.com -parametersRef: - apiGroup: gaudi.resource.intel.com/v1alpha1 - kind: GaudiClassParameters - name: intel-gaudi-params ---- -apiVersion: gaudi.resource.intel.com/v1alpha1 -kind: GaudiClassParameters -metadata: - name: intel-gaudi-monitor-params -spec: - monitor: true ---- -apiVersion: resource.k8s.io/v1alpha2 -kind: ResourceClass -metadata: - name: intel-gaudi-monitor -driverName: gaudi.resource.intel.com -parametersRef: - apiGroup: gaudi.resource.intel.com/v1alpha1 - kind: GaudiClassParameters - name: intel-gaudi-monitor-params diff --git a/charts/intel-gaudi-resource-driver/templates/resource-driver.yaml b/charts/intel-gaudi-resource-driver/templates/resource-driver.yaml index e8e8382..b95679f 100644 --- a/charts/intel-gaudi-resource-driver/templates/resource-driver.yaml +++ b/charts/intel-gaudi-resource-driver/templates/resource-driver.yaml @@ -16,30 +16,6 @@ spec: spec: serviceAccount: intel-gaudi-resource-driver-service-account serviceAccountName: {{ include "intel-gaudi-resource-driver.serviceAccountName" . }} - initContainers: - - name: init - image: {{ include "intel-gaudi-resource-driver.fullimage" . 
}} - imagePullPolicy: {{ .Values.image.pullPolicy }} - command: ["/kubelet-gaudi-plugin", "--status", "NotReady"] - env: - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - securityContext: - privileged: false - allowPrivilegeEscalation: false - capabilities: - drop: ["ALL"] - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: 10002 - seccompProfile: - type: RuntimeDefault containers: - name: kubelet-plugin image: {{ include "intel-gaudi-resource-driver.fullimage" . }} @@ -56,11 +32,6 @@ spec: fieldPath: metadata.namespace - name: SYSFS_ROOT value: "/sysfs" - # Use this to tell kubelet-plugin where the DRI devices nodes should be. - # This will be prefix for CDI devices, runtime will try to mount devices - # with this prefix into workloads. - #- name: DEV_DRI_PATH - # value: "/fake/dri" volumeMounts: - name: plugins-registry mountPath: /var/lib/kubelet/plugins_registry @@ -70,6 +41,7 @@ spec: mountPath: /etc/cdi - name: varruncdi mountPath: /var/run/cdi + # when using fake sysfs - mount at the same place as on host - name: sysfs mountPath: "/sysfs" securityContext: @@ -109,55 +81,3 @@ spec: affinity: {{- toYaml . | nindent 8 }} {{- end }} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: intel-gaudi-resource-driver-controller - namespace: {{ include "intel-gaudi-resource-driver.namespace" . }} - labels: - {{- include "intel-gaudi-resource-driver.labels" . | nindent 4 }} -spec: - replicas: 1 - selector: - matchLabels: - app: intel-gaudi-resource-driver-controller - template: - metadata: - labels: - app: intel-gaudi-resource-driver-controller - spec: - serviceAccount: intel-gaudi-resource-driver-service-account - serviceAccountName: {{ include "intel-gaudi-resource-driver.serviceAccountName" . }} - containers: - - name: controller - image: {{ include "intel-gaudi-resource-driver.fullimage" . 
}} - imagePullPolicy: {{ .Values.image.pullPolicy }} - command: ["/gaudi-controller"] - env: - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - securityContext: - privileged: false - allowPrivilegeEscalation: false - capabilities: - drop: ["ALL"] - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: 10002 - seccompProfile: - type: RuntimeDefault - {{- with .Values.controller.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.controller.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.controller.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} diff --git a/charts/intel-gaudi-resource-driver/templates/validating-admission-policy-binding.yaml b/charts/intel-gaudi-resource-driver/templates/validating-admission-policy-binding.yaml new file mode 100644 index 0000000..387619f --- /dev/null +++ b/charts/intel-gaudi-resource-driver/templates/validating-admission-policy-binding.yaml @@ -0,0 +1,7 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingAdmissionPolicyBinding +metadata: + name: resourceslices-policy-dra-kubelet-plugin-gaudi +spec: + policyName: resourceslices-policy-dra-kubelet-plugin-gaudi + validationActions: [Deny] diff --git a/charts/intel-gaudi-resource-driver/templates/validating-admission-policy.yaml b/charts/intel-gaudi-resource-driver/templates/validating-admission-policy.yaml new file mode 100644 index 0000000..a432733 --- /dev/null +++ b/charts/intel-gaudi-resource-driver/templates/validating-admission-policy.yaml @@ -0,0 +1,31 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingAdmissionPolicy +metadata: + name: resourceslices-policy-dra-kubelet-plugin-gaudi +spec: + failurePolicy: Fail + matchConstraints: + resourceRules: + - apiGroups: ["resource.k8s.io"] + apiVersions: ["v1alpha3"] + operations: ["CREATE", "UPDATE", "DELETE"] + resources: ["resourceslices"] + matchConditions: + - 
name: isRestrictedUser + expression: >- + request.userInfo.username == "system:serviceaccount:{{ include "intel-gaudi-resource-driver.namespace" . }}:{{ include "intel-gaudi-resource-driver.serviceAccountName" . }}" + variables: + - name: userNodeName + expression: >- + request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('') + - name: objectNodeName + expression: >- + (request.operation == "DELETE" ? oldObject : object).spec.?nodeName.orValue("") + validations: + - expression: variables.userNodeName != "" + message: >- + no node association found for user, this user must run in a pod on a node and ServiceAccountTokenPodNodeInfo must be enabled + - expression: variables.userNodeName == variables.objectNodeName + messageExpression: >- + "this user running on node '"+variables.userNodeName+"' may not modify " + + (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'") diff --git a/charts/intel-gaudi-resource-driver/values.yaml b/charts/intel-gaudi-resource-driver/values.yaml index 6cc4e7f..d87b4ba 100644 --- a/charts/intel-gaudi-resource-driver/values.yaml +++ b/charts/intel-gaudi-resource-driver/values.yaml @@ -9,15 +9,15 @@ image: repository: intel name: intel-gaudi-resource-driver pullPolicy: IfNotPresent - tag: "v0.1.1" + tag: "v0.2.0" serviceAccount: + create: true annotations: {} - name: "intel-gaudi-resource-driver-service-account" + name: intel-gaudi-resource-driver-service-account automount: true -# Define Controller Part -controller: +kubeletPlugin: podAnnotations: {} tolerations: - key: node-role.kubernetes.io/master @@ -26,14 +26,6 @@ controller: - key: node-role.kubernetes.io/control-plane operator: Exists effect: NoSchedule - nodeSelector: - {} - #node-role.kubernetes.io/control-plane: "" - affinity: {} - -# Define Kubelet-Plugin Part -kubeletPlugin: - podAnnotations: {} - tolerations: [] nodeSelector: {} + #node-role.kubernetes.io/control-plane: "" affinity: {}