-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: added chart for AWS Inferentia operator (#133)
* feat: added chart for AWS Inferentia operator
- Loading branch information
1 parent
668cbac
commit 3bbf17b
Showing
4 changed files
with
176 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Helm chart metadata for the TrueFoundry-packaged AWS Neuron device plugin chart.
apiVersion: v2
name: tfy-inferentia-operator
description: Truefoundry Inferentia Operator
version: 0.0.1
maintainers:
  - name: truefoundry
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Tfy-inferentia-operator Helm chart packaged by TrueFoundry | ||
Tfy-inferentia-operator is a Helm chart that facilitates the deployment and management of AWS Inferentia resources in Kubernetes clusters. | ||
|
||
Refer to https://github.com/aws-neuron/aws-neuron-sdk/blob/e0ef8a1a780ee798e7f01fe94f1235d571e211c6/src/k8/k8s-neuron-device-plugin.yml#L1 | ||
|
||
## Parameters | ||
|
||
### Configuration for the device plugin responsible for node feature discovery | ||
|
||
| Name | Description | Value | | ||
| ---------------------------------------- | ------------------------------------------- | ------------------------------------------------------ | | ||
| `devicePlugin.enabled` | Enable device plugin Daemonset. | `true` | | ||
| `devicePlugin.resources.requests.cpu` | CPU request for device plugin Daemonset. | `100m` | | ||
| `devicePlugin.resources.requests.memory` | Memory request for device plugin Daemonset. | `128Mi` | | ||
| `devicePlugin.image` | Image to use for device plugin Daemonset. | `public.ecr.aws/neuron/neuron-device-plugin:2.16.18.0` | |
114 changes: 114 additions & 0 deletions
114
charts/tfy-inferentia-operator/templates/neuron-device-plugin.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
{{- /*
Neuron device plugin manifests: ClusterRole, ServiceAccount, ClusterRoleBinding
and the DaemonSet that runs the plugin container.

Everything in this file is rendered only when `devicePlugin.enabled` is true.
All object names are prefixed with the release name so that the RBAC objects,
the ServiceAccount and the DaemonSet always refer to each other consistently.
*/ -}}
{{- if .Values.devicePlugin.enabled }}
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: {{ .Release.Name }}-neuron-device-plugin
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - create
      - patch
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - update
      - patch
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - nodes/status
    verbs:
      - patch
      - update
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ .Release.Name }}-neuron-device-plugin
  namespace: {{ .Release.Namespace }}
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  # ClusterRoleBinding is cluster-scoped, so it carries no namespace.
  name: {{ .Release.Name }}-neuron-device-plugin
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ .Release.Name }}-neuron-device-plugin
subjects:
  - kind: ServiceAccount
    name: {{ .Release.Name }}-neuron-device-plugin
    namespace: {{ .Release.Namespace }}
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: {{ .Release.Name }}-neuron-device-plugin
  namespace: {{ .Release.Namespace }}
spec:
  selector:
    matchLabels:
      name: {{ .Release.Name }}-neuron-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: {{ .Release.Name }}-neuron-device-plugin-ds
    spec:
      # Must match the ServiceAccount defined above; it is the subject of the
      # ClusterRoleBinding that grants the plugin its node/pod permissions.
      serviceAccountName: {{ .Release.Name }}-neuron-device-plugin
      # Renders the configured list, or an explicit empty list when unset.
      tolerations:
        {{- toYaml (default (list) .Values.devicePlugin.tolerations) | nindent 8 }}
      priorityClassName: "system-node-critical"
      {{- with .Values.devicePlugin.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        - image: {{ .Values.devicePlugin.image | quote }}
          imagePullPolicy: Always
          name: neuron-device-plugin
          {{- with .Values.devicePlugin.resources }}
          resources:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          env:
            # Exposes the name of the node the pod is scheduled on.
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: infa-map
              mountPath: /run
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: infa-map
          hostPath:
            path: /run
{{- end }}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
## @section Configuration for the device plugin responsible for node feature discovery
## @param devicePlugin.enabled Enable device plugin Daemonset.
## @param devicePlugin.resources.requests.cpu CPU request for device plugin Daemonset.
## @param devicePlugin.resources.requests.memory Memory request for device plugin Daemonset.
## @param devicePlugin.image Image to use for device plugin Daemonset.
devicePlugin:
  enabled: true
  resources:
    requests:
      cpu: 100m
      # Kubernetes quantities use the binary suffix "Mi" (mebibytes); "MiB" is not a valid quantity.
      memory: 128Mi
  image: public.ecr.aws/neuron/neuron-device-plugin:2.16.18.0
  tolerations:
    ## @skip devicePlugin.tolerations[0]
    ## @skip devicePlugin.tolerations[1]
    - key: CriticalAddonsOnly
      operator: Exists
    - key: aws.amazon.com/neuron
      operator: Exists
      effect: NoSchedule
  affinity:
    ## @skip devicePlugin.affinity.nodeAffinity
    # Restricts scheduling to nodes whose instance type is one of the listed
    # inf1 / inf2 / trn1 / trn1n sizes.
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: "node.kubernetes.io/instance-type"
                operator: In
                values:
                  - inf1.xlarge
                  - inf1.2xlarge
                  - inf1.6xlarge
                  - inf1.24xlarge
                  - inf2.xlarge
                  - inf2.4xlarge
                  - inf2.8xlarge
                  - inf2.24xlarge
                  - inf2.48xlarge
                  - trn1.2xlarge
                  - trn1.32xlarge
                  - trn1n.32xlarge