-
Notifications
You must be signed in to change notification settings - Fork 670
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added GPU enabled sandbox image. (v2?) #4340
base: master
Are you sure you want to change the base?
Changes from all commits
395bfb9
9fd335f
92afb8c
772e160
5027f4a
c2eed3e
a4ec221
b9510e5
f8c8c85
6ae93f3
91347b1
c9d639a
d8b0717
ccf0505
68fd7a2
59fd04d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
# syntax=docker/dockerfile:1.4-labs | ||
# Stage "builder": runs on the build host's platform (BUILDPLATFORM) and | ||
# executes the preload script against manifest.txt. | ||
FROM --platform=${BUILDPLATFORM} mgoltzsche/podman:minimal AS builder | ||
|
||
# Re-export the target architecture as an environment variable so it is | ||
# visible to the preload script at build time. | ||
# NOTE(review): presumably preload uses it to select the image architecture -- confirm against images/preload. | ||
ARG TARGETARCH | ||
ENV TARGETARCH="${TARGETARCH}" | ||
|
||
WORKDIR /build | ||
|
||
COPY images/manifest.txt images/preload ./ | ||
# --security=insecure runs this step outside the default build sandbox; it is | ||
# only available with the "labs" syntax frontend declared at the top of the file. | ||
RUN --security=insecure ./preload manifest.txt | ||
|
||
|
||
# Stage "bootstrap": builds the flyte-sandbox-bootstrap binary on the build | ||
# platform, cross-compiling for the requested target architecture. | ||
FROM --platform=${BUILDPLATFORM} golang:1.19-bullseye AS bootstrap | ||
|
||
ARG TARGETARCH | ||
# Build without cgo for linux/${TARGETARCH}. | ||
ENV CGO_ENABLED=0 | ||
ENV GOARCH="${TARGETARCH}" | ||
ENV GOOS=linux | ||
|
||
WORKDIR /flyteorg/build | ||
# Copy only the module files first so the dependency download step is reused | ||
# from cache until go.mod or go.sum change. | ||
COPY bootstrap/go.mod bootstrap/go.sum ./ | ||
# The golang image sets GOPATH=/go, so the module cache lives in /go/pkg/mod. | ||
# The same cache mount is used for the download and the build so the build | ||
# step sees the modules fetched here. | ||
RUN --mount=type=cache,target=/go/pkg/mod \ | ||
    go mod download | ||
COPY bootstrap/ ./ | ||
RUN --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg/mod \ | ||
    go build -o dist/flyte-sandbox-bootstrap cmd/bootstrap/main.go | ||
|
||
# Stage "k3s": used only as a source for the COPY --from=k3s instructions in the final stage. | ||
FROM rancher/k3s:v1.26.4-k3s1 AS k3s | ||
|
||
# We may want to have another version with devel in the future (has more features but is huge) | ||
FROM nvidia/cuda:11.8.0-base-ubuntu22.04 | ||
|
||
ENV CRICTL_VERSION="v1.26.0" | ||
# A non-empty FLYTE_GPU makes the GPU check entrypoint script run nvidia-smi at startup. | ||
ENV FLYTE_GPU="ENABLED" | ||
ARG TARGETARCH | ||
|
||
ARG FLYTE_SANDBOX_VERSION | ||
ENV FLYTE_SANDBOX_VERSION="${FLYTE_SANDBOX_VERSION}" | ||
# Install the NVIDIA container toolkit, create the k3s directories used by the | ||
# COPY instructions below, and install crictl for the target architecture | ||
# (falling back to amd64 when TARGETARCH is not provided by the builder). | ||
# curl -f makes an HTTP error fail the build instead of passing an error page to tar. | ||
RUN apt-get update \ | ||
    && apt-get -y install gnupg2 curl nvidia-container-toolkit \ | ||
    && rm -rf /var/lib/apt/lists/* \ | ||
    && chmod 1777 /tmp \ | ||
    && mkdir -p /var/lib/rancher/k3s/agent/etc/containerd \ | ||
    && mkdir -p /var/lib/rancher/k3s/server/manifests \ | ||
    && crictl_tgz="crictl-${CRICTL_VERSION}-linux-${TARGETARCH:-amd64}.tar.gz" \ | ||
    && curl -fsSL "https://github.com/kubernetes-sigs/cri-tools/releases/download/${CRICTL_VERSION}/${crictl_tgz}" --output "${crictl_tgz}" \ | ||
    && tar zxvf "${crictl_tgz}" -C /usr/local/bin \ | ||
    && rm -f "${crictl_tgz}" \ | ||
    && echo "alias kubectl='k3s kubectl'" >> /root/.bashrc | ||
|
||
# Bring /bin and /etc over from the k3s stage. | ||
COPY --from=k3s /bin /bin | ||
COPY --from=k3s /etc /etc | ||
|
||
# Provide custom containerd configuration to configure the nvidia-container-runtime | ||
COPY config.toml.tmpl /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl | ||
|
||
# Image archives: whatever the builder stage left in /build/images, plus the | ||
# per-architecture tarballs from the build context. | ||
COPY --from=builder /build/images/ /var/lib/rancher/k3s/agent/images/ | ||
COPY images/tar/${TARGETARCH}/ /var/lib/rancher/k3s/agent/images/ | ||
# NOTE(review): manifests land in manifests-staging rather than manifests; presumably they are moved or rendered at startup -- confirm. | ||
COPY manifests/ /var/lib/rancher/k3s/server/manifests-staging/ | ||
COPY bin/ /bin/ | ||
|
||
# Install bootstrap | ||
COPY --from=bootstrap /flyteorg/build/dist/flyte-sandbox-bootstrap /bin/ | ||
|
||
VOLUME /var/lib/flyte/storage | ||
|
||
# Set environment variable for picking up additional CA certificates | ||
ENV SSL_CERT_DIR=/var/lib/flyte/config/ca-certificates | ||
|
||
## START https://github.com/k3s-io/k3s/blob/master/package/Dockerfile#L15 | ||
VOLUME /var/lib/kubelet | ||
VOLUME /var/lib/rancher/k3s | ||
VOLUME /var/lib/cni | ||
VOLUME /var/log | ||
|
||
ENV PATH="$PATH:/bin/aux" | ||
ENV CRI_CONFIG_FILE=/var/lib/rancher/k3s/agent/etc/crictl.yaml | ||
## END https://github.com/k3s-io/k3s/blob/master/package/Dockerfile#L15 | ||
|
||
# The entrypoint script receives the k3s arguments below; traefik and servicelb are disabled by default. | ||
ENTRYPOINT [ "/bin/k3d-entrypoint.sh" ] | ||
CMD [ "server", "--disable=traefik", "--disable=servicelb" ] | ||
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,8 +14,12 @@ if [ -f /sys/fs/cgroup/cgroup.controllers ]; then | |
# move the processes from the root group to the /init group, | ||
# otherwise writing subtree_control fails with EBUSY. | ||
mkdir -p /sys/fs/cgroup/init | ||
busybox xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || : | ||
if command -v busybox >/dev/null 2>&1; then | ||
busybox xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || : | ||
else | ||
xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || : | ||
fi | ||
# enable controllers | ||
sed -e 's/ / +/g' -e 's/^/+/' <"/sys/fs/cgroup/cgroup.controllers" >"/sys/fs/cgroup/cgroup.subtree_control" | ||
sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers > /sys/fs/cgroup/cgroup.subtree_control | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess the GPU sandbox will use this command. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I know the reason: it is because of the change from here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. busybox isn't installed on the base image (nvidia/cuda). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks, I am not sure whether it is necessary or not.
danpf marked this conversation as resolved.
Show resolved
Hide resolved
|
||
echo "[$(date -Iseconds)] [CgroupV2 Fix] Done" | ||
fi |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
#!/bin/sh | ||
|
||
# Startup check for GPU access. | ||
# When FLYTE_GPU is empty or unset the check is skipped. Otherwise nvidia-smi | ||
# is run; if it fails, a hint is printed to stderr and the script exits with 255. | ||
if [ -z "${FLYTE_GPU}" ]; then | ||
    echo "GPU not enabled" | ||
else | ||
    echo "GPU Enabled - checking if it's available" | ||
    # Test the command directly instead of inspecting $? afterwards. | ||
    if ! nvidia-smi; then | ||
        >&2 echo "NVIDIA not available, enable it in docker like so: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/user-guide.html" | ||
        exit 255 | ||
    fi | ||
    echo "nvidia-smi working" | ||
fi |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
# containerd configuration template for the GPU sandbox image. | ||
# Exactly the same as: https://github.com/k3s-io/k3s/blob/master/pkg/agent/templates/templates_linux.go#L10 | ||
# EXCEPT under the heading: [plugins."io.containerd.grpc.v1.cri".containerd] we add: default_runtime_name = "nvidia" | ||
# NOTE(review): that heading (and so the nvidia default) is only rendered when .NodeConfig.AgentConfig.Snapshotter is set. | ||
# NOTE(review): no "nvidia" runtime is declared literally in this template; presumably it is supplied through .ExtraRuntimes -- confirm. | ||
version = 2 | ||
|
||
[plugins."io.containerd.internal.v1.opt"] | ||
path = "{{ .NodeConfig.Containerd.Opt }}" | ||
[plugins."io.containerd.grpc.v1.cri"] | ||
stream_server_address = "127.0.0.1" | ||
stream_server_port = "10010" | ||
enable_selinux = {{ .NodeConfig.SELinux }} | ||
enable_unprivileged_ports = {{ .EnableUnprivileged }} | ||
enable_unprivileged_icmp = {{ .EnableUnprivileged }} | ||
|
||
# Optional CRI settings: each group below is emitted only when its template condition is true. | ||
{{- if .DisableCgroup}} | ||
disable_cgroup = true | ||
{{end}} | ||
{{- if .IsRunningInUserNS }} | ||
disable_apparmor = true | ||
restrict_oom_score_adj = true | ||
{{end}} | ||
|
||
{{- if .NodeConfig.AgentConfig.PauseImage }} | ||
sandbox_image = "{{ .NodeConfig.AgentConfig.PauseImage }}" | ||
{{end}} | ||
|
||
# containerd section: selects "nvidia" as the default runtime and configures the snapshotter. | ||
{{- if .NodeConfig.AgentConfig.Snapshotter }} | ||
[plugins."io.containerd.grpc.v1.cri".containerd] | ||
default_runtime_name = "nvidia" | ||
snapshotter = "{{ .NodeConfig.AgentConfig.Snapshotter }}" | ||
disable_snapshot_annotations = {{ if eq .NodeConfig.AgentConfig.Snapshotter "stargz" }}false{{else}}true{{end}} | ||
{{ if eq .NodeConfig.AgentConfig.Snapshotter "stargz" }} | ||
# stargz only: image service keychain plus registry mirrors, auth and TLS for the stargz snapshotter. | ||
{{ if .NodeConfig.AgentConfig.ImageServiceSocket }} | ||
[plugins."io.containerd.snapshotter.v1.stargz"] | ||
cri_keychain_image_service_path = "{{ .NodeConfig.AgentConfig.ImageServiceSocket }}" | ||
[plugins."io.containerd.snapshotter.v1.stargz".cri_keychain] | ||
enable_keychain = true | ||
{{end}} | ||
{{ if .PrivateRegistryConfig }} | ||
{{ if .PrivateRegistryConfig.Mirrors }} | ||
[plugins."io.containerd.snapshotter.v1.stargz".registry.mirrors]{{end}} | ||
{{range $k, $v := .PrivateRegistryConfig.Mirrors }} | ||
[plugins."io.containerd.snapshotter.v1.stargz".registry.mirrors."{{$k}}"] | ||
endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}] | ||
{{if $v.Rewrites}} | ||
[plugins."io.containerd.snapshotter.v1.stargz".registry.mirrors."{{$k}}".rewrite] | ||
{{range $pattern, $replace := $v.Rewrites}} | ||
"{{$pattern}}" = "{{$replace}}" | ||
{{end}} | ||
{{end}} | ||
{{end}} | ||
{{range $k, $v := .PrivateRegistryConfig.Configs }} | ||
{{ if $v.Auth }} | ||
[plugins."io.containerd.snapshotter.v1.stargz".registry.configs."{{$k}}".auth] | ||
{{ if $v.Auth.Username }}username = {{ printf "%q" $v.Auth.Username }}{{end}} | ||
{{ if $v.Auth.Password }}password = {{ printf "%q" $v.Auth.Password }}{{end}} | ||
{{ if $v.Auth.Auth }}auth = {{ printf "%q" $v.Auth.Auth }}{{end}} | ||
{{ if $v.Auth.IdentityToken }}identitytoken = {{ printf "%q" $v.Auth.IdentityToken }}{{end}} | ||
{{end}} | ||
{{ if $v.TLS }} | ||
[plugins."io.containerd.snapshotter.v1.stargz".registry.configs."{{$k}}".tls] | ||
{{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}} | ||
{{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}} | ||
{{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}} | ||
{{ if $v.TLS.InsecureSkipVerify }}insecure_skip_verify = true{{end}} | ||
{{end}} | ||
{{end}} | ||
{{end}} | ||
{{end}} | ||
{{end}} | ||
|
||
# CNI paths are emitted unless .NodeConfig.NoFlannel is set. | ||
{{- if not .NodeConfig.NoFlannel }} | ||
[plugins."io.containerd.grpc.v1.cri".cni] | ||
bin_dir = "{{ .NodeConfig.AgentConfig.CNIBinDir }}" | ||
conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}" | ||
{{end}} | ||
|
||
# The runc runtime is always declared. | ||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] | ||
runtime_type = "io.containerd.runc.v2" | ||
|
||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] | ||
SystemdCgroup = {{ .SystemdCgroup }} | ||
|
||
# Private registry configuration for the CRI plugin: mirrors (with optional rewrites), then per-registry auth and TLS. | ||
{{ if .PrivateRegistryConfig }} | ||
{{ if .PrivateRegistryConfig.Mirrors }} | ||
[plugins."io.containerd.grpc.v1.cri".registry.mirrors]{{end}} | ||
{{range $k, $v := .PrivateRegistryConfig.Mirrors }} | ||
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."{{$k}}"] | ||
endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}] | ||
{{if $v.Rewrites}} | ||
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."{{$k}}".rewrite] | ||
{{range $pattern, $replace := $v.Rewrites}} | ||
"{{$pattern}}" = "{{$replace}}" | ||
{{end}} | ||
{{end}} | ||
{{end}} | ||
|
||
{{range $k, $v := .PrivateRegistryConfig.Configs }} | ||
{{ if $v.Auth }} | ||
[plugins."io.containerd.grpc.v1.cri".registry.configs."{{$k}}".auth] | ||
{{ if $v.Auth.Username }}username = {{ printf "%q" $v.Auth.Username }}{{end}} | ||
{{ if $v.Auth.Password }}password = {{ printf "%q" $v.Auth.Password }}{{end}} | ||
{{ if $v.Auth.Auth }}auth = {{ printf "%q" $v.Auth.Auth }}{{end}} | ||
{{ if $v.Auth.IdentityToken }}identitytoken = {{ printf "%q" $v.Auth.IdentityToken }}{{end}} | ||
{{end}} | ||
{{ if $v.TLS }} | ||
[plugins."io.containerd.grpc.v1.cri".registry.configs."{{$k}}".tls] | ||
{{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}} | ||
{{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}} | ||
{{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}} | ||
{{ if $v.TLS.InsecureSkipVerify }}insecure_skip_verify = true{{end}} | ||
{{end}} | ||
{{end}} | ||
{{end}} | ||
|
||
# One runtime table per entry in .ExtraRuntimes, each with its own binary and the shared SystemdCgroup setting. | ||
# NOTE(review): the "nvidia" default runtime named earlier in this template presumably has to appear here -- confirm. | ||
{{range $k, $v := .ExtraRuntimes}} | ||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes."{{$k}}"] | ||
runtime_type = "{{$v.RuntimeType}}" | ||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes."{{$k}}".options] | ||
BinaryName = "{{$v.BinaryName}}" | ||
SystemdCgroup = {{ $.SystemdCgroup }} | ||
{{end}} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# Sourced from: https://k3d.io/v5.6.0/usage/advanced/cuda/?h=gpu#the-nvidia-device-plugin | ||
# Thank you to the k3d team for their work on this. | ||
# | ||
# DaemonSet in kube-system that runs the NVIDIA device plugin container with the | ||
# host's kubelet device-plugins directory mounted into it. | ||
|
||
apiVersion: apps/v1 | ||
kind: DaemonSet | ||
metadata: | ||
name: nvidia-device-plugin-daemonset | ||
namespace: kube-system | ||
spec: | ||
selector: | ||
matchLabels: | ||
name: nvidia-device-plugin-ds | ||
template: | ||
metadata: | ||
# Mark this pod as a critical add-on; when enabled, the critical add-on scheduler | ||
# reserves resources for critical add-on pods so that they can be rescheduled after | ||
# a failure. This annotation works in tandem with the toleration below. | ||
# NOTE(review): this is an alpha annotation; newer clusters may expect priorityClassName instead -- confirm it still has an effect. | ||
annotations: | ||
scheduler.alpha.kubernetes.io/critical-pod: "" | ||
labels: | ||
name: nvidia-device-plugin-ds | ||
spec: | ||
tolerations: | ||
# Allow this pod to be rescheduled while the node is in "critical add-ons only" mode. | ||
# This, along with the annotation above marks this pod as a critical add-on. | ||
- key: CriticalAddonsOnly | ||
operator: Exists | ||
containers: | ||
# NOTE(review): presumably this turns off the plugin's Xid-based health checks -- confirm against the device plugin documentation. | ||
- env: | ||
- name: DP_DISABLE_HEALTHCHECKS | ||
value: xids | ||
image: nvidia/k8s-device-plugin:1.11 | ||
name: nvidia-device-plugin-ctr | ||
# All capabilities are dropped, but privilege escalation is still allowed. | ||
securityContext: | ||
allowPrivilegeEscalation: true | ||
capabilities: | ||
drop: ["ALL"] | ||
volumeMounts: | ||
- name: device-plugin | ||
mountPath: /var/lib/kubelet/device-plugins | ||
# The plugin directory is shared with the host through a hostPath volume. | ||
volumes: | ||
- name: device-plugin | ||
hostPath: | ||
path: /var/lib/kubelet/device-plugins | ||
danpf marked this conversation as resolved.
Show resolved
Hide resolved
danpf marked this conversation as resolved.
Show resolved
Hide resolved
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you explain the relationship between Dockerfile and Dockerfile.gpu in the same directory?