Skip to content

Commit

Permalink
feat: added chart for AWS Inferentia operator (#133)
Browse files Browse the repository at this point in the history
* feat: added chart for AWS Inferentia operator
  • Loading branch information
debajyoti-truefoundry authored Nov 17, 2023
1 parent 668cbac commit 3bbf17b
Show file tree
Hide file tree
Showing 4 changed files with 176 additions and 0 deletions.
6 changes: 6 additions & 0 deletions charts/tfy-inferentia-operator/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Chart metadata for the TrueFoundry Inferentia operator Helm chart.
apiVersion: v2
name: tfy-inferentia-operator
# Chart version (not the version of the packaged device plugin image).
version: 0.0.1
description: 'Truefoundry Inferentia Operator'
maintainers:
  - name: truefoundry
15 changes: 15 additions & 0 deletions charts/tfy-inferentia-operator/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Tfy-inferentia-operator helm chart packaged by TrueFoundry
Tfy-inferentia-operator is a Helm chart that facilitates the deployment and management of AWS Inferentia resources in Kubernetes clusters.

Refer to https://github.com/aws-neuron/aws-neuron-sdk/blob/e0ef8a1a780ee798e7f01fe94f1235d571e211c6/src/k8/k8s-neuron-device-plugin.yml#L1

## Parameters

### Configuration for the device plugin responsible for node feature discovery

| Name | Description | Value |
| ---------------------------------------- | ------------------------------------------- | ------------------------------------------------------ |
| `devicePlugin.enabled` | Enable device plugin Daemonset. | `true` |
| `devicePlugin.resources.requests.cpu` | CPU request for device plugin Daemonset. | `100m` |
| `devicePlugin.resources.requests.memory` | Memory request for device plugin Daemonset. | `128Mi` |
| `devicePlugin.image` | Image to use for device plugin Daemonset. | `public.ecr.aws/neuron/neuron-device-plugin:2.16.18.0` |
114 changes: 114 additions & 0 deletions charts/tfy-inferentia-operator/templates/neuron-device-plugin.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
{{- /*
Renders the AWS Neuron device plugin: a ClusterRole, a ServiceAccount, a
ClusterRoleBinding tying the two together, and the DaemonSet that runs the
plugin. Everything is skipped when devicePlugin.enabled is false.
*/ -}}
{{- if .Values.devicePlugin.enabled }}
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: {{ .Release.Name }}-neuron-device-plugin
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - create
      - patch
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - update
      - patch
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - nodes/status
    verbs:
      - patch
      - update
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ .Release.Name }}-neuron-device-plugin
  namespace: {{ .Release.Namespace }}
---
# ClusterRoleBinding is cluster-scoped, so it carries no metadata.namespace.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: {{ .Release.Name }}-neuron-device-plugin
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ .Release.Name }}-neuron-device-plugin
subjects:
  - kind: ServiceAccount
    name: {{ .Release.Name }}-neuron-device-plugin
    namespace: {{ .Release.Namespace }}
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: {{ .Release.Name }}-neuron-device-plugin
  namespace: {{ .Release.Namespace }}
spec:
  selector:
    matchLabels:
      name: {{ .Release.Name }}-neuron-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: {{ .Release.Name }}-neuron-device-plugin-ds
    spec:
      # Must match the ServiceAccount rendered above (release-name prefixed).
      serviceAccountName: {{ .Release.Name }}-neuron-device-plugin
      {{- with .Values.devicePlugin.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      priorityClassName: "system-node-critical"
      {{- with .Values.devicePlugin.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        - name: neuron-device-plugin
          image: {{ .Values.devicePlugin.image | quote }}
          imagePullPolicy: Always
          {{- with .Values.devicePlugin.resources }}
          resources:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          env:
            # Exposes the name of the node the pod is scheduled on.
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: infa-map
              mountPath: /run
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: infa-map
          hostPath:
            path: /run
{{- end }}
41 changes: 41 additions & 0 deletions charts/tfy-inferentia-operator/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
## @section Configuration for the device plugin responsible for node feature discovery
## @param devicePlugin.enabled Enable device plugin Daemonset.
## @param devicePlugin.resources.requests.cpu CPU request for device plugin Daemonset.
## @param devicePlugin.resources.requests.memory Memory request for device plugin Daemonset.
## @param devicePlugin.image Image to use for device plugin Daemonset.
devicePlugin:
  enabled: true
  resources:
    requests:
      cpu: 100m
      # Kubernetes quantity: the binary suffix is "Mi" ("MiB" fails to parse).
      memory: 128Mi
  image: public.ecr.aws/neuron/neuron-device-plugin:2.16.18.0
  tolerations:
    ## @skip devicePlugin.tolerations[0]
    ## @skip devicePlugin.tolerations[1]
    - key: CriticalAddonsOnly
      operator: Exists
    - key: aws.amazon.com/neuron
      operator: Exists
      effect: NoSchedule
  affinity:
    ## @skip devicePlugin.affinity.nodeAffinity
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              # Restrict scheduling to the listed inf1 / inf2 / trn1 instance types.
              - key: "node.kubernetes.io/instance-type"
                operator: In
                values:
                  - inf1.xlarge
                  - inf1.2xlarge
                  - inf1.6xlarge
                  - inf1.24xlarge
                  - inf2.xlarge
                  - inf2.4xlarge
                  - inf2.8xlarge
                  - inf2.24xlarge
                  - inf2.48xlarge
                  - trn1.2xlarge
                  - trn1.32xlarge
                  - trn1n.32xlarge

0 comments on commit 3bbf17b

Please sign in to comment.