Added GPU enabled sandbox image. (v2?) #4340
base: master
Commits:
395bfb9
9fd335f
92afb8c
772e160
5027f4a
c2eed3e
a4ec221
b9510e5
f8c8c85
6ae93f3
91347b1
c9d639a
d8b0717
ccf0505
68fd7a2
59fd04d
New file: the GPU-enabled sandbox Dockerfile.

@@ -0,0 +1,72 @@
# syntax=docker/dockerfile:1.4-labs

###### BUILD FLYTE
FROM --platform=${BUILDPLATFORM} mgoltzsche/podman:minimal AS builder

ARG TARGETARCH
ENV TARGETARCH "${TARGETARCH}"

WORKDIR /build

COPY images/manifest.txt images/preload ./
RUN --security=insecure ./preload manifest.txt

FROM --platform=${BUILDPLATFORM} golang:1.19-bullseye AS bootstrap

ARG TARGETARCH
ENV CGO_ENABLED 0
ENV GOARCH "${TARGETARCH}"
ENV GOOS linux

WORKDIR /flyteorg/build
COPY bootstrap/go.mod bootstrap/go.sum ./
RUN go mod download
COPY bootstrap/ ./
RUN --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/root/go/pkg/mod \
    go build -o dist/flyte-sandbox-bootstrap cmd/bootstrap/main.go

###### GET K3S
# ARG K3S_TAG=v1.26.4-k3s1
FROM rancher/k3s:v1.26.4-k3s1 as k3s

FROM nvidia/cuda:11.8.0-base-ubuntu22.04

ENV CRICTL_VERSION="v1.26.0"
ENV FLYTE_GPU "ENABLED"
ARG TARGETARCH

RUN apt-get update \
    && apt-get -y install gnupg2 curl nvidia-container-toolkit \
    && chmod 1777 /tmp \
    && mkdir -p /var/lib/rancher/k3s/agent/etc/containerd \
    && mkdir -p /var/lib/rancher/k3s/server/manifests \
    && curl -L https://github.com/kubernetes-sigs/cri-tools/releases/download/$CRICTL_VERSION/crictl-${CRICTL_VERSION}-linux-amd64.tar.gz --output crictl-${CRICTL_VERSION}-linux-amd64.tar.gz \
    && tar zxvf crictl-$CRICTL_VERSION-linux-amd64.tar.gz -C /usr/local/bin \
    && rm -f crictl-$CRICTL_VERSION-linux-amd64.tar.gz \
    && echo "alias kubectl='k3s kubectl'" >> /root/.bashrc

COPY --from=k3s /bin /bin
COPY --from=k3s /etc /etc

# Provide custom containerd configuration to configure the nvidia-container-runtime
COPY config.toml.tmpl /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl

# Deploy the nvidia driver plugin on startup
COPY device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml

COPY --from=builder /build/images/ /var/lib/rancher/k3s/agent/images/
COPY --from=bootstrap /flyteorg/build/dist/flyte-sandbox-bootstrap /bin/
COPY images/tar/${TARGETARCH}/ /var/lib/rancher/k3s/agent/images/
COPY manifests/ /var/lib/rancher/k3s/server/manifests-staging/
COPY bin/ /bin/

VOLUME /var/lib/kubelet
VOLUME /var/lib/rancher/k3s
VOLUME /var/lib/cni
VOLUME /var/log

ENV PATH="$PATH:/bin/aux"
ENV CRI_CONFIG_FILE=/var/lib/rancher/k3s/agent/etc/crictl.yaml

ENTRYPOINT [ "/bin/k3d-entrypoint.sh" ]
CMD [ "server", "--disable=traefik", "--disable=servicelb" ]
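As a rough sketch of how this multi-stage Dockerfile might be built (the file name Dockerfile.gpu, the builder name, the tag flyte-sandbox:gpu, and the single platform below are assumptions for illustration, not part of this PR), note that the preload stage uses RUN --security=insecure, so BuildKit has to be granted that entitlement:

# Hypothetical build invocation; builder name, tag, and platform are placeholders.
docker buildx create --name insecure-builder --use \
  --buildkitd-flags '--allow-insecure-entitlement security.insecure'
docker buildx build \
  --allow security.insecure \
  --platform linux/amd64 \
  -f Dockerfile.gpu \
  -t flyte-sandbox:gpu \
  --load .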
Changes to the cgroup v2 fix in the sandbox startup script:

@@ -14,8 +14,12 @@ if [ -f /sys/fs/cgroup/cgroup.controllers ]; then
    # move the processes from the root group to the /init group,
    # otherwise writing subtree_control fails with EBUSY.
    mkdir -p /sys/fs/cgroup/init
-   busybox xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || :
+   if command -v busybox >/dev/null 2>&1; then
+       busybox xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || :
+   else
+       xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || :
+   fi
    # enable controllers
-   sed -e 's/ / +/g' -e 's/^/+/' <"/sys/fs/cgroup/cgroup.controllers" >"/sys/fs/cgroup/cgroup.subtree_control"
+   sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers > /sys/fs/cgroup/cgroup.subtree_control
    echo "[$(date -Iseconds)] [CgroupV2 Fix] Done"
fi

Review comments on this change:
- I guess that the GPU sandbox will use this command.
- I know the reason; it is because of the change from here.
- busybox isn't installed on the base image (nvidia/cuda:11.8.0-base-ubuntu22.04).
- Thanks, I am not sure whether it is necessary or not.
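For reference, the sed expression rewrites the space-separated controller list from cgroup.controllers into the "+controller +controller ..." form that cgroup.subtree_control expects. A harmless way to see the transformation (sample controller names only, no real cgroup files touched):

# Prints: +cpuset +cpu +io +memory
echo "cpuset cpu io memory" | sed -e 's/ / +/g' -e 's/^/+/'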
New file: a small shell script that checks whether a GPU is actually available when FLYTE_GPU is set.

@@ -0,0 +1,15 @@
#!/bin/sh

if [ -n "${FLYTE_GPU}" ]; then
    echo "GPU Enabled - checking if it's available"
    nvidia-smi
    if [ $? -eq 0 ]; then
        echo "nvidia-smi working"
    else
        >&2 echo "NVIDIA not available, enable it in docker like so: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/user-guide.html"
        exit 255
    fi

else
    echo "GPU not enabled"
fi
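Since this check only succeeds when the NVIDIA runtime is actually exposed to the container, running the image presumably needs the GPU flag; a minimal sketch (the image tag and flags are placeholders, not the documented sandbox invocation):

# Placeholder tag; real sandbox runs add more ports and mounts.
docker run --rm --privileged --gpus all -p 6443:6443 flyte-sandbox:gpu
# With the toolkit wired through, nvidia-smi inside the container succeeds
# and the script prints "nvidia-smi working".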
New file: config.toml.tmpl (custom containerd configuration template referenced by the Dockerfile above).

@@ -0,0 +1,118 @@
version = 2

[plugins."io.containerd.internal.v1.opt"]
  path = "{{ .NodeConfig.Containerd.Opt }}"
[plugins."io.containerd.grpc.v1.cri"]
  stream_server_address = "127.0.0.1"
  stream_server_port = "10010"
  enable_selinux = {{ .NodeConfig.SELinux }}
  enable_unprivileged_ports = {{ .EnableUnprivileged }}
  enable_unprivileged_icmp = {{ .EnableUnprivileged }}

{{- if .DisableCgroup}}
  disable_cgroup = true
{{end}}
{{- if .IsRunningInUserNS }}
  disable_apparmor = true
  restrict_oom_score_adj = true
{{end}}

{{- if .NodeConfig.AgentConfig.PauseImage }}
  sandbox_image = "{{ .NodeConfig.AgentConfig.PauseImage }}"
{{end}}

{{- if .NodeConfig.AgentConfig.Snapshotter }}
[plugins."io.containerd.grpc.v1.cri".containerd]
  default_runtime_name = "nvidia"
  snapshotter = "{{ .NodeConfig.AgentConfig.Snapshotter }}"
  disable_snapshot_annotations = {{ if eq .NodeConfig.AgentConfig.Snapshotter "stargz" }}false{{else}}true{{end}}
{{ if eq .NodeConfig.AgentConfig.Snapshotter "stargz" }}
{{ if .NodeConfig.AgentConfig.ImageServiceSocket }}
[plugins."io.containerd.snapshotter.v1.stargz"]
  cri_keychain_image_service_path = "{{ .NodeConfig.AgentConfig.ImageServiceSocket }}"
[plugins."io.containerd.snapshotter.v1.stargz".cri_keychain]
  enable_keychain = true
{{end}}
{{ if .PrivateRegistryConfig }}
{{ if .PrivateRegistryConfig.Mirrors }}
[plugins."io.containerd.snapshotter.v1.stargz".registry.mirrors]{{end}}
{{range $k, $v := .PrivateRegistryConfig.Mirrors }}
[plugins."io.containerd.snapshotter.v1.stargz".registry.mirrors."{{$k}}"]
  endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}]
{{if $v.Rewrites}}
[plugins."io.containerd.snapshotter.v1.stargz".registry.mirrors."{{$k}}".rewrite]
{{range $pattern, $replace := $v.Rewrites}}
  "{{$pattern}}" = "{{$replace}}"
{{end}}
{{end}}
{{end}}
{{range $k, $v := .PrivateRegistryConfig.Configs }}
{{ if $v.Auth }}
[plugins."io.containerd.snapshotter.v1.stargz".registry.configs."{{$k}}".auth]
  {{ if $v.Auth.Username }}username = {{ printf "%q" $v.Auth.Username }}{{end}}
  {{ if $v.Auth.Password }}password = {{ printf "%q" $v.Auth.Password }}{{end}}
  {{ if $v.Auth.Auth }}auth = {{ printf "%q" $v.Auth.Auth }}{{end}}
  {{ if $v.Auth.IdentityToken }}identitytoken = {{ printf "%q" $v.Auth.IdentityToken }}{{end}}
{{end}}
{{ if $v.TLS }}
[plugins."io.containerd.snapshotter.v1.stargz".registry.configs."{{$k}}".tls]
  {{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}}
  {{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}}
  {{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}}
  {{ if $v.TLS.InsecureSkipVerify }}insecure_skip_verify = true{{end}}
{{end}}
{{end}}
{{end}}
{{end}}
{{end}}

{{- if not .NodeConfig.NoFlannel }}
[plugins."io.containerd.grpc.v1.cri".cni]
  bin_dir = "{{ .NodeConfig.AgentConfig.CNIBinDir }}"
  conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}"
{{end}}

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
  runtime_type = "io.containerd.runc.v2"

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
  SystemdCgroup = {{ .SystemdCgroup }}

{{ if .PrivateRegistryConfig }}
{{ if .PrivateRegistryConfig.Mirrors }}
[plugins."io.containerd.grpc.v1.cri".registry.mirrors]{{end}}
{{range $k, $v := .PrivateRegistryConfig.Mirrors }}
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."{{$k}}"]
  endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}]
{{if $v.Rewrites}}
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."{{$k}}".rewrite]
{{range $pattern, $replace := $v.Rewrites}}
  "{{$pattern}}" = "{{$replace}}"
{{end}}
{{end}}
{{end}}

{{range $k, $v := .PrivateRegistryConfig.Configs }}
{{ if $v.Auth }}
[plugins."io.containerd.grpc.v1.cri".registry.configs."{{$k}}".auth]
  {{ if $v.Auth.Username }}username = {{ printf "%q" $v.Auth.Username }}{{end}}
  {{ if $v.Auth.Password }}password = {{ printf "%q" $v.Auth.Password }}{{end}}
  {{ if $v.Auth.Auth }}auth = {{ printf "%q" $v.Auth.Auth }}{{end}}
  {{ if $v.Auth.IdentityToken }}identitytoken = {{ printf "%q" $v.Auth.IdentityToken }}{{end}}
{{end}}
{{ if $v.TLS }}
[plugins."io.containerd.grpc.v1.cri".registry.configs."{{$k}}".tls]
  {{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}}
  {{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}}
  {{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}}
  {{ if $v.TLS.InsecureSkipVerify }}insecure_skip_verify = true{{end}}
{{end}}
{{end}}
{{end}}

{{range $k, $v := .ExtraRuntimes}}
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes."{{$k}}"]
  runtime_type = "{{$v.RuntimeType}}"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes."{{$k}}".options]
  BinaryName = "{{$v.BinaryName}}"
{{end}}
Review comment: Would you like to provide the source URL?
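The GPU-relevant pieces of the template are default_runtime_name = "nvidia" and the ExtraRuntimes block at the end. A hedged way to confirm that k3s rendered the template and containerd picked up an nvidia runtime on a running node (paths assume the k3s layout used in the Dockerfile above):

# Inspect the config that k3s renders from config.toml.tmpl
grep -A 2 'runtimes."nvidia"' /var/lib/rancher/k3s/agent/etc/containerd/config.toml
# Or query CRI directly; crictl is installed from the cri-tools release in the Dockerfile
crictl --config /var/lib/rancher/k3s/agent/etc/crictl.yaml info | grep -i nvidia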
New file: device-plugin-daemonset.yaml (the NVIDIA device plugin DaemonSet deployed on startup).

@@ -0,0 +1,41 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  template:
    metadata:
      # Mark this pod as a critical add-on; when enabled, the critical add-on scheduler
      # reserves resources for critical add-on pods so that they can be rescheduled after
      # a failure. This annotation works in tandem with the toleration below.
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        name: nvidia-device-plugin-ds
    spec:
      tolerations:
        # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode.
        # This, along with the annotation above marks this pod as a critical add-on.
        - key: CriticalAddonsOnly
          operator: Exists
      containers:
        - env:
            - name: DP_DISABLE_HEALTHCHECKS
              value: xids
          image: nvidia/k8s-device-plugin:1.11
          name: nvidia-device-plugin-ctr
          securityContext:
            allowPrivilegeEscalation: true
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
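A quick way to check that this DaemonSet is running and that the plugin has registered GPUs with the kubelet (node names and GPU counts will vary; the jsonpath shown is just one option):

k3s kubectl -n kube-system get daemonset nvidia-device-plugin-daemonset
k3s kubectl -n kube-system get pods -l name=nvidia-device-plugin-ds
# Allocatable should include nvidia.com/gpu once the plugin has registered
k3s kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}'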
New file: a HelmChart manifest that installs the NVIDIA device plugin chart.

@@ -0,0 +1,9 @@
---
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
  name: nvidia-device-plugin
  namespace: kube-system
spec:
  chart: nvidia-device-plugin
  repo: https://nvidia.github.io/k8s-device-plugin
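This relies on the helm controller that ships with k3s turning the HelmChart object into an install job. Assuming that controller is enabled in the sandbox, the result could be checked with (the job name pattern is the usual k3s convention, not verified against this PR):

k3s kubectl -n kube-system get helmchart nvidia-device-plugin
k3s kubectl -n kube-system get jobs | grep helm-install-nvidia-device-plugin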
Changes to the generated sandbox deployment manifest:

@@ -805,7 +805,7 @@ type: Opaque
---
apiVersion: v1
data:
-  haSharedSecret: UUREcXo3a1VGNnlyc1RCWg==
+  haSharedSecret: OVRlcXBXS2NkUTc0czEwQg==
  proxyPassword: ""
  proxyUsername: ""
kind: Secret

@@ -1366,7 +1366,7 @@ spec:
    metadata:
      annotations:
        checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81
-        checksum/secret: cfe2089fc583f69d068c3b3d56e875082a5d926c70b00b32f094d587df7396a5
+        checksum/secret: 17ec194cf72de1676eef76a26280b2056a6b549bb06b25cf333e4b5f62562ab3
      labels:
        app: docker-registry
        release: flyte-sandbox

@@ -1814,3 +1814,12 @@ spec:
  updateStrategy:
    rollingUpdate: {}
    type: RollingUpdate
+---
+apiVersion: helm.cattle.io/v1
+kind: HelmChart
+metadata:
+  name: nvidia-device-plugin
+  namespace: kube-system
+spec:
+  chart: nvidia-device-plugin
+  repo: https://nvidia.github.io/k8s-device-plugin
Review comment: Would you like to explain the logic between Dockerfile and Dockerfile.gpu under the same directory?
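Separately from that question, a minimal end-to-end smoke test for the GPU sandbox (pod name and CUDA image tag below are placeholders) would be a pod that requests nvidia.com/gpu and runs nvidia-smi:

cat <<'EOF' | k3s kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
spec:
  restartPolicy: Never
  containers:
    - name: cuda
      image: nvidia/cuda:11.8.0-base-ubuntu22.04
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1
EOF
# Once the pod has completed:
k3s kubectl logs pod/gpu-smoke-test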