Add a pod mutating webhook to auto-inject pod network resources
Signed-off-by: cyclinder <[email protected]>
cyclinder committed Oct 30, 2024
1 parent f059e10 commit 394c80c
Showing 30 changed files with 2,021 additions and 321 deletions.
139 changes: 71 additions & 68 deletions charts/spiderpool/README.md

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions charts/spiderpool/templates/configmap.yaml
@@ -27,6 +27,10 @@ data:
clusterSubnetDefaultFlexibleIPNumber: 0
{{- end }}
tuneSysctlConfig: {{ .Values.spiderpoolAgent.tuneSysctlConfig }}
podResourceInject:
enabled: {{ .Values.spiderpoolController.podResourceInject.enabled }}
namespacesExclude: {{ toJson .Values.spiderpoolController.podResourceInject.namespacesExclude }}
namespacesInclude: {{ toJson .Values.spiderpoolController.podResourceInject.namespacesInclude }}
{{- if .Values.multus.multusCNI.install }}
---
kind: ConfigMap
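For reference, the effect of this template on the rendered ConfigMap can be checked before upgrading; the snippet below is a sketch, and the chart path, release name, and flag value are illustrative.

```shell
# Render the chart locally and inspect the generated podResourceInject entry.
helm template spiderpool ./charts/spiderpool -n spiderpool \
  --set spiderpoolController.podResourceInject.enabled=true \
  | grep -A 3 'podResourceInject:'
# Expected shape of the output, given the template above and default values:
#   podResourceInject:
#     enabled: true
#     namespacesExclude: ["kube-system","spiderpool","metallb-system","istio-system"]
#     namespacesInclude: []
```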
2 changes: 2 additions & 0 deletions charts/spiderpool/templates/deployment.yaml
@@ -187,6 +187,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: SPIDERPOOL_CONTROLLER_DEPLOYMENT_NAME
value: {{ .Values.spiderpoolController.name | quote }}
{{- with .Values.spiderpoolController.extraEnv }}
{{- toYaml . | nindent 8 }}
{{- end }}
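As a quick sanity check, the new variable can be inspected on a running controller; the namespace and deployment name below assume a default installation.

```shell
# Confirm the deployment name env var is present in the running controller pod.
kubectl -n spiderpool exec deploy/spiderpool-controller -- \
  env | grep SPIDERPOOL_CONTROLLER_DEPLOYMENT_NAME
```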
1 change: 1 addition & 0 deletions charts/spiderpool/templates/role.yaml
@@ -49,6 +49,7 @@ rules:
- delete
- get
- list
- update
- watch
- apiGroups:
- apiextensions.k8s.io
14 changes: 14 additions & 0 deletions charts/spiderpool/values.yaml
@@ -659,6 +659,20 @@ spiderpoolController:
## @param spiderpoolController.webhookPort the http port for spiderpoolController webhook
webhookPort: 5722

podResourceInject:
## @param spiderpoolController.podResourceInject.enabled enable pod resource inject
enabled: false

## @param spiderpoolController.podResourceInject.namespacesExclude namespaces excluded from pod resource injection
namespacesExclude:
- kube-system
- spiderpool
- metallb-system
- istio-system

## @param spiderpoolController.podResourceInject.namespacesInclude namespaces included in pod resource injection; if empty, all namespaces except those listed in namespacesExclude are included; otherwise, only the listed namespaces are injected
namespacesInclude: []

prometheus:
## @param spiderpoolController.prometheus.enabled enable spiderpool Controller to collect metrics
enabled: false
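As a usage sketch, these values can also be set directly at install or upgrade time; the namespace names in the example are hypothetical.

```shell
# Enable injection and restrict it to two (hypothetical) namespaces.
# Helm list entries can be set by index with --set.
helm upgrade --install spiderpool spiderpool/spiderpool -n spiderpool \
  --set spiderpoolController.podResourceInject.enabled=true \
  --set spiderpoolController.podResourceInject.namespacesInclude[0]=ai-training \
  --set spiderpoolController.podResourceInject.namespacesInclude[1]=ai-inference
```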
19 changes: 11 additions & 8 deletions cmd/spiderpool-controller/cmd/config.go
@@ -99,6 +99,7 @@ var envInfo = []envConf{
{"SPIDERPOOL_MULTUS_CONFIG_INFORMER_RESYNC_PERIOD", "60", false, nil, nil, &controllerContext.Cfg.MultusConfigInformerResyncPeriod},
{"SPIDERPOOL_CILIUM_CONFIGMAP_NAMESPACE_NAME", "kube-system/cilium-config", false, &controllerContext.Cfg.CiliumConfigName, nil, nil},

{"SPIDERPOOL_CONTROLLER_DEPLOYMENT_NAME", "spiderpool-controller", true, &controllerContext.Cfg.ControllerDeploymentName, nil, nil},
{"SPIDERPOOL_IPPOOL_INFORMER_RESYNC_PERIOD", "300", false, nil, nil, &controllerContext.Cfg.IPPoolInformerResyncPeriod},
{"SPIDERPOOL_IPPOOL_INFORMER_WORKERS", "3", true, nil, nil, &controllerContext.Cfg.IPPoolInformerWorkers},
{"SPIDERPOOL_AUTO_IPPOOL_HANDLER_MAX_WORKQUEUE_LENGTH", "10000", true, nil, nil, &controllerContext.Cfg.IPPoolInformerMaxWorkQueueLength},
@@ -128,16 +129,18 @@ type Config struct {
GopsListenPort string
PyroscopeAddress string
DefaultCniConfDir string
// CiliumConfigName is formatted by namespace and name,default is kube-system/cilium-config
// CiliumConfigName is formatted by namespace and name
// default is kube-system/cilium-config
CiliumConfigName string

ControllerPodNamespace string
ControllerPodName string
DefaultCoordinatorName string
LeaseDuration int
LeaseRenewDeadline int
LeaseRetryPeriod int
LeaseRetryGap int
ControllerDeploymentName string
ControllerPodNamespace string
ControllerPodName string
DefaultCoordinatorName string
LeaseDuration int
LeaseRenewDeadline int
LeaseRetryPeriod int
LeaseRetryGap int

IPPoolMaxAllocatedIPs int

2 changes: 2 additions & 0 deletions cmd/spiderpool-controller/cmd/crd_manager.go
@@ -11,6 +11,7 @@ import (
"github.com/go-logr/logr"
multusv1 "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/apis/k8s.cni.cncf.io/v1"
calicov1 "github.com/tigera/operator/pkg/apis/crd.projectcalico.org/v1"
admissionregistrationv1 "k8s.io/api/admissionregistration/v1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
@@ -35,6 +36,7 @@ func init() {
utilruntime.Must(multusv1.AddToScheme(scheme))
utilruntime.Must(apiextensionsv1.AddToScheme(scheme))
utilruntime.Must(kubevirtv1.AddToScheme(scheme))
utilruntime.Must(admissionregistrationv1.AddToScheme(scheme))
}

func newCRDManager() (ctrl.Manager, error) {
16 changes: 16 additions & 0 deletions cmd/spiderpool-controller/cmd/daemon.go
@@ -265,6 +265,22 @@ func initControllerServiceManagers(ctx context.Context) {
}
controllerContext.PodManager = podManager

if controllerContext.Cfg.PodResourceInjectConfig.Enabled {
logger.Debug("Begin to init Pod MutatingWebhook")
if err := podmanager.InitPodWebhook(controllerContext.ClientSet.AdmissionregistrationV1(),
controllerContext.CRDManager, controllerContext.Cfg.ControllerDeploymentName,
controllerContext.Cfg.PodResourceInjectConfig.NamespacesExclude,
controllerContext.Cfg.PodResourceInjectConfig.NamespacesInclude); err != nil {
logger.Fatal(err.Error())
}
} else {
logger.Debug("InjectPodNetworkResource is disabled, try to remove the pod part in the MutatingWebhook")
if err := podmanager.RemovePodMutatingWebhook(controllerContext.ClientSet.AdmissionregistrationV1(),
controllerContext.Cfg.ControllerDeploymentName); err != nil {
logger.Error(err.Error())
}
}

logger.Info("Begin to initialize StatefulSet manager")
statefulSetManager, err := statefulsetmanager.NewStatefulSetManager(
controllerContext.CRDManager.GetClient(),
1 change: 1 addition & 0 deletions docs/reference/spiderpool-controller.md
@@ -32,6 +32,7 @@ Run the spiderpool controller daemon.
| SPIDERPOOL_CNI_CONFIG_DIR | /etc/cni/net.d | The host path of the cni config directory. |
| SPIDERPOOL_CILIUM_CONFIGMAP_NAMESPACE_NAME | kube-system/cilium-config | The Cilium configMap; default is kube-system/cilium-config. |
| SPIDERPOOL_COORDINATOR_DEFAULT_NAME | default | the name of default spidercoordinator CR |
| SPIDERPOOL_CONTROLLER_DEPLOYMENT_NAME | spiderpool-controller | The deployment name of spiderpool-controller. |

## spiderpool-controller shutdown

187 changes: 146 additions & 41 deletions docs/usage/install/ai/get-started-macvlan-zh_CN.md
@@ -53,8 +53,8 @@
For Mellanox NICs, you can download the [official NVIDIA OFED driver](https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/) and install it on the host with the following commands:

```shell
$ mount /root/MLNX_OFED_LINUX-24.01-0.3.3.1-ubuntu22.04-x86_64.iso /mnt
$ /mnt/mlnxofedinstall --all
mount /root/MLNX_OFED_LINUX-24.01-0.3.3.1-ubuntu22.04-x86_64.iso /mnt
/mnt/mlnxofedinstall --all
```

For Mellanox NICs, the driver can also be installed in a containerized way, which installs the driver on all Mellanox NICs across the cluster hosts in one batch. Run the following command; note that Internet access is required during this process to download some installation packages. When all the OFED pods reach the ready state, the OFED driver installation on the hosts is complete.
@@ -131,7 +131,7 @@
gdrdrv 24576 0
```

4. Confirm that the RDMA subsystem on the host works in shared mode, which is required for providing RDMA devices to containers in the macvlan scenario.

```
# Check the current operating mode (the Linux RDMA subsystem operates in shared mode by default):
@@ -144,10 +144,10 @@
1. Install Spiderpool with helm and enable the rdmaSharedDevicePlugin component

```shell
$ helm repo add spiderpool https://spidernet-io.github.io/spiderpool
$ helm repo update spiderpool
$ kubectl create namespace spiderpool
$ helm install spiderpool spiderpool/spiderpool -n spiderpool --set rdma.rdmaSharedDevicePlugin.install=true
helm repo add spiderpool https://spidernet-io.github.io/spiderpool
helm repo update spiderpool
kubectl create namespace spiderpool
helm install spiderpool spiderpool/spiderpool -n spiderpool --set rdma.rdmaSharedDevicePlugin.install=true
```

> For users in China, the parameter `--set global.imageRegistryOverride=ghcr.m.daocloud.io` can be specified to use a domestic image registry.
@@ -223,22 +223,22 @@
metadata:
name: gpu1-net11
spec:
gateway: 172.16.11.254
subnet: 172.16.11.0/16
ips:
- 172.16.11.1-172.16.11.200
gateway: 172.16.11.254
subnet: 172.16.11.0/16
ips:
- 172.16.11.1-172.16.11.200
---
apiVersion: spiderpool.spidernet.io/v2beta1
kind: SpiderMultusConfig
metadata:
name: gpu1-macvlan
namespace: spiderpool
spec:
cniType: macvlan
macvlan:
master: ["enp11s0f0np0"]
ippools:
ipv4: ["gpu1-net11"]
cniType: macvlan
macvlan:
master: ["enp11s0f0np0"]
ippools:
ipv4: ["gpu1-net11"]
EOF
```
@@ -247,6 +247,8 @@
1. Create a group of DaemonSet applications on the specified nodes
In the following example, the annotation `v1.multus-cni.io/default-network` selects calico's default interface for control-plane communication, the annotation `k8s.v1.cni.cncf.io/networks` attaches 8 GPU-affinity NICs for RDMA communication, and 8 kinds of RDMA resources are configured
> Note: RDMA network resources can also be injected into applications automatically; see [Auto-inject RDMA resources based on Webhook](#auto-inject-network-resources-based-on-webhook)
```shell
$ helm repo add spiderchart https://spidernet-io.github.io/charts
$ helm repo update
@@ -261,39 +263,39 @@
# just run daemonset in nodes 'worker1' and 'worker2'
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- worker1
- worker2
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- worker1
- worker2
# macvlan interfaces
extraAnnotations:
k8s.v1.cni.cncf.io/networks: |-
[{"name":"gpu1-macvlan","namespace":"spiderpool"},
{"name":"gpu2-macvlan","namespace":"spiderpool"},
{"name":"gpu3-macvlan","namespace":"spiderpool"},
{"name":"gpu4-macvlan","namespace":"spiderpool"},
{"name":"gpu5-macvlan","namespace":"spiderpool"},
{"name":"gpu6-macvlan","namespace":"spiderpool"},
{"name":"gpu7-macvlan","namespace":"spiderpool"},
{"name":"gpu8-macvlan","namespace":"spiderpool"}]
[{"name":"gpu1-macvlan","namespace":"spiderpool"},
{"name":"gpu2-macvlan","namespace":"spiderpool"},
{"name":"gpu3-macvlan","namespace":"spiderpool"},
{"name":"gpu4-macvlan","namespace":"spiderpool"},
{"name":"gpu5-macvlan","namespace":"spiderpool"},
{"name":"gpu6-macvlan","namespace":"spiderpool"},
{"name":"gpu7-macvlan","namespace":"spiderpool"},
{"name":"gpu8-macvlan","namespace":"spiderpool"}]
# macvlan resource
resources:
limits:
spidernet.io/shared_cx5_gpu1: 1
spidernet.io/shared_cx5_gpu2: 1
spidernet.io/shared_cx5_gpu3: 1
spidernet.io/shared_cx5_gpu4: 1
spidernet.io/shared_cx5_gpu5: 1
spidernet.io/shared_cx5_gpu6: 1
spidernet.io/shared_cx5_gpu7: 1
spidernet.io/shared_cx5_gpu8: 1
#nvidia.com/gpu: 1
spidernet.io/shared_cx5_gpu1: 1
spidernet.io/shared_cx5_gpu2: 1
spidernet.io/shared_cx5_gpu3: 1
spidernet.io/shared_cx5_gpu4: 1
spidernet.io/shared_cx5_gpu5: 1
spidernet.io/shared_cx5_gpu6: 1
spidernet.io/shared_cx5_gpu7: 1
spidernet.io/shared_cx5_gpu8: 1
#nvidia.com/gpu: 1
EOF
$ helm install rdma-tools spiderchart/rdma-tools -f ./values.yaml
@@ -410,3 +412,106 @@
# Successfully access the RDMA service of the other Pod
$ ib_read_lat 172.91.0.115
```
## Auto-inject network resources based on Webhook
To reduce the complexity of configuring multiple NICs for AI applications, Spiderpool supports classifying a group of NIC configurations with the label `cni.spidernet.io/rdma-resource-inject`. Users only need to add an annotation with the same value to the Pod, and Spiderpool will automatically inject all the NICs and network resources whose configurations carry the same label into the Pod through a webhook.
> This feature only supports NIC configurations whose cniType is one of [macvlan, ipvlan, sriov, ib-sriov, ipoib].
1. When installing Spiderpool, enable the webhook-based auto-injection of network resources:
```shell
helm install spiderpool spiderpool/spiderpool --set spiderpoolController.podResourceInject.enabled=true
```
> - The webhook-based auto-injection of network resources is disabled by default and must be enabled manually.
> - You can specify namespaces to exclude from injection with `spiderpoolController.podResourceInject.namespacesExclude`, and namespaces to include with `spiderpoolController.podResourceInject.namespacesInclude`.
> - After installing Spiderpool, you can update the configuration by editing the podResourceInject field in the spiderpool-config configMap, as sketched below.
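A minimal sketch of such an update, assuming the ConfigMap and Deployment names used by a default chart installation:
```shell
# Edit the podResourceInject section of the ConfigMap referenced above,
# for example:
kubectl -n spiderpool edit configmap spiderpool-config
#   podResourceInject:
#     enabled: true
#     namespacesExclude: ["kube-system", "spiderpool"]
#     namespacesInclude: []
# Restarting the controller may be needed for the change to take effect
# (an assumption; check the controller logs after editing).
kubectl -n spiderpool rollout restart deployment spiderpool-controller
```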
2. When creating the SpiderMultusConfig, specify the label and add the RDMA-related configuration:
```shell
$ cat <<EOF | kubectl apply -f -
apiVersion: spiderpool.spidernet.io/v2beta1
kind: SpiderMultusConfig
metadata:
name: gpu1-macvlan
namespace: spiderpool
labels:
cni.spidernet.io/rdma-resource-inject: gpu-macvlan
spec:
cniType: macvlan
macvlan:
master: ["enp11s0f0np0"]
enableRdma: true
rdmaResourceName: spidernet.io/shared_cx5_gpu1
ippools:
ipv4: ["gpu1-net11"]
EOF
```
> - In `cni.spidernet.io/rdma-resource-inject: gpu-macvlan`, the key is fixed and the value is user-defined. A group of NIC configurations sharing the same label key and value must use the same `cniType`; see the selector sketch below.
> - `enableRdma`, `rdmaResourceName`, and `ippools` must be configured; otherwise the network resources cannot be injected into the Pod.
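To see which NIC configurations share a given label value (and would therefore be injected together), a label selector query can be used; the namespace and value follow the example above.
```shell
# List all SpiderMultusConfigs carrying this label value.
kubectl get spidermultusconfig -n spiderpool \
  -l cni.spidernet.io/rdma-resource-inject=gpu-macvlan
```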
3. When creating the application, add the annotation `cni.spidernet.io/rdma-resource-inject: gpu-macvlan`, so that Spiderpool automatically attaches the 8 GPU-affinity NICs for RDMA communication and configures the 8 kinds of RDMA resources:
> Note: when using the webhook-based auto-injection of network resources, do not add other network configuration annotations (such as `k8s.v1.cni.cncf.io/networks` and `ipam.spidernet.io/ippools`) to the application; otherwise automatic resource injection will not work correctly.
```shell
$ helm repo add spiderchart https://spidernet-io.github.io/charts
$ helm repo update
$ helm search repo rdma-tools
# run daemonset on worker1 and worker2
$ cat <<EOF > values.yaml
# for china user , it could add these to use a domestic registry
#image:
# registry: ghcr.m.daocloud.io
# just run daemonset in nodes 'worker1' and 'worker2'
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- worker1
- worker2
# macvlan interfaces
extraAnnotations:
cni.spidernet.io/rdma-resource-inject: gpu-macvlan
EOF
$ helm install rdma-tools spiderchart/rdma-tools -f ./values.yaml
```
Once the Pod is Running, check whether the annotations for the 8 RDMA NICs and the 8 kinds of RDMA resources have been successfully injected into the Pod; a command-line sketch follows the example below.
```shell
# Pod multus annotations
k8s.v1.cni.cncf.io/networks: |-
[{"name":"gpu1-macvlan","namespace":"spiderpool"},
{"name":"gpu2-macvlan","namespace":"spiderpool"},
{"name":"gpu3-macvlan","namespace":"spiderpool"},
{"name":"gpu4-macvlan","namespace":"spiderpool"},
{"name":"gpu5-macvlan","namespace":"spiderpool"},
{"name":"gpu6-macvlan","namespace":"spiderpool"},
{"name":"gpu7-macvlan","namespace":"spiderpool"},
{"name":"gpu8-macvlan","namespace":"spiderpool"}]
# macvlan resource
resources:
requests:
spidernet.io/shared_cx5_gpu1: 1
spidernet.io/shared_cx5_gpu2: 1
spidernet.io/shared_cx5_gpu3: 1
spidernet.io/shared_cx5_gpu4: 1
spidernet.io/shared_cx5_gpu5: 1
spidernet.io/shared_cx5_gpu6: 1
spidernet.io/shared_cx5_gpu7: 1
spidernet.io/shared_cx5_gpu8: 1
#nvidia.com/gpu: 1
```
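A quick command-line check for the injected annotation and resources; the Pod name below is a placeholder.
```shell
# Inspect the injected multus annotation and the RDMA resource requests.
kubectl get pod <rdma-tools-pod> -o yaml | grep -A 9 'k8s.v1.cni.cncf.io/networks'
kubectl get pod <rdma-tools-pod> -o yaml | grep 'spidernet.io/shared_cx5'
```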