04_add-gpu.yml
---
- name: test1
  gather_facts: false
  hosts: k8s
  tasks:
  - name: test
    debug:
      msg:
      - "{{ gpu_label }}"
      - "{{ groups.k8s }}"
- name: test2
  gather_facts: false
  hosts: master
  tasks:
  - name: test
    debug:
      msg:
      - "{{ hostvars[item].gpu_label }}"
      - "{{ hostvars[item].node_name }}"
    when: "hostvars[item].gpu_label == 'sp_t4'"
    with_items: ["{{ groups.master }}", "{{ groups.node }}"]
- name: 00 Install gpu dependencies
  gather_facts: false
  hosts: k8s
  roles:
  - gpu
  tags: gpu
- name: 01 Update gpu yaml
  gather_facts: false
  hosts:
  - master
  tasks:
  # The static yaml deployment is a simple DaemonSet meant only to demonstrate the plugin's
  # basic functionality. When deploying the plugin in production, use helm instead.
  #- name: "Generate nvidia-device-plugin yaml"
  #  template:
  #    src: "roles/gpu/templates/{{item.src}}"
  #    dest: "{{ tmp_dir }}/nvidia/{{item.dest}}"
  #  with_items:
  #  - {src: "nvidia-device-plugin-0.16.1.yaml.j2", dest: "nvidia-device-plugin.yaml"}
  #  tags: nvidia-device-plugin
  #- name: "Deploy nvidia-device-plugin"
  #  shell: "kubectl apply -f {{ tmp_dir }}/nvidia/nvidia-device-plugin.yaml"
  #  delegate_to: "{{ groups['master'][0] }}"
  #  run_once: true
  #  tags: nvidia-device-plugin
  - name: Create temporary directory
    file: dest={{ tmp_dir }}/nvidia state=directory
  - name: Distribute gpu containerization config files
    template: src=roles/gpu/templates/{{ item }} dest={{ tmp_dir }}/nvidia/{{ item.split('.')[:-1]|join('.') }}
    with_items:
    - dp-mps-config.yaml.j2
    - dp-time-slicing-config.yaml.j2
    - dp-only-one-config.yaml.j2
    - dcgm-exporter-helm-values.yaml.j2
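  # Illustrative sketch of the sharing configs rendered above (an assumption, not this repo's
  # actual templates): per the NVIDIA k8s-device-plugin docs, dp-time-slicing-config.yaml.j2
  # and dp-mps-config.yaml.j2 would contain roughly the following; the replica count is illustrative.
  #   version: v1
  #   sharing:
  #     timeSlicing:          # "mps:" in the MPS variant
  #       resources:
  #       - name: nvidia.com/gpu
  #         replicas: 4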
  - name: Copy nvidia-device-plugin and dcgm-exporter charts
    copy: src=roles/gpu/files/{{ item }} dest={{ tmp_dir }}/nvidia
    with_items:
    - nvidia-device-plugin-{{ nvidia_gpu_plugin_ver }}.tgz
    - dcgm-exporter-3.5.0.tgz
  - name: Publish charts to the helm repo and refresh the index
    shell: >
      cd {{ tmp_dir }}/nvidia &&
      cp nvidia-device-plugin-{{ nvidia_gpu_plugin_ver }}.tgz {{ HELM_PATH }}/charts &&
      cp dcgm-exporter-3.5.0.tgz {{ HELM_PATH }}/charts &&
      helm repo index {{ HELM_PATH }}/charts --url http://{{ groups['helm'][0] }}:{{ HELM_PORT }}/charts &&
      helm repo update
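  # Optional manual check (illustrative; the "local" repo name is an assumption):
  #   helm repo add local http://{{ groups['helm'][0] }}:{{ HELM_PORT }}/charts
  #   helm repo update && helm search repo nvidia-device-plugin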
  - name: Delete helm release dcgm-exporter
    shell: "helm delete dcgm-exporter -n nvidia-device-plugin || echo true; sleep 3"
    tags: addons_delete
    ignore_errors: true
  - name: Delete helm release nvdp
    shell: "helm delete nvdp -n nvidia-device-plugin || echo true; sleep 3"
    tags: addons_delete
    ignore_errors: true
  # Troubleshooting a DaemonSet that schedules no pods:
  #   nvidia-device-plugin   nvdp-nvidia-device-plugin   0 0 0 0 0   <none>   144m
  # https://github.com/NVIDIA/k8s-device-plugin/issues/708
  # dcgm-exporter matches any node where nvidia.com/gpu.present exists; no label value needs to be specified
  - name: Label nsp (non-GPU) nodes
    shell: |
      kubectl label nodes {{ hostvars[item].node_name }} feature.node.kubernetes.io/pci-10de.present-
      kubectl label nodes {{ hostvars[item].node_name }} nvidia.com/gpu.present-
    when: "hostvars[item].gpu_label == 'nsp'"
    with_items: ["{{ groups.master }}", "{{ groups.node }}"]
  # 0x10de is the PCI vendor ID assigned to NVIDIA
  - name: Label nvidia GPU nodes
    shell: |
      kubectl label nodes {{ hostvars[item].node_name }} feature.node.kubernetes.io/pci-10de.present=true --overwrite
      kubectl label nodes {{ hostvars[item].node_name }} nvidia.com/gpu.present=true --overwrite
      #kubectl label nodes {{ hostvars[item].node_name }} feature.node.kubernetes.io/cpu-model.vendor_id=NVIDIA --overwrite
    when: "hostvars[item].gpu_label == 'sp_t4'"
    with_items: ["{{ groups.master }}", "{{ groups.node }}"]
  - name: Label GPU exclusive (one-container-per-GPU) nodes
    shell: |
      kubectl label nodes {{ hostvars[item].node_name }} nvidia.com/mps.capable=false --overwrite
      kubectl label nodes {{ hostvars[item].node_name }} nvidia.com/device-plugin.config=config0 --overwrite
    when:
    - "hostvars[item].gpu_label == 'sp_t4'"
    - "hostvars[item].gpu_share_label == 'sp_one'"
    with_items: ["{{ groups.master }}", "{{ groups.node }}"]
  - name: Label mps nodes
    shell: |
      kubectl label nodes {{ hostvars[item].node_name }} nvidia.com/mps.capable=true --overwrite
      kubectl label nodes {{ hostvars[item].node_name }} nvidia.com/device-plugin.config=config1 --overwrite
    when:
    - "hostvars[item].gpu_label == 'sp_t4'"
    - "hostvars[item].gpu_share_label == 'sp_mps'"
    with_items: ["{{ groups.master }}", "{{ groups.node }}"]
  - name: Label time-slicing nodes
    shell: |
      kubectl label nodes {{ hostvars[item].node_name }} nvidia.com/mps.capable=false --overwrite
      kubectl label nodes {{ hostvars[item].node_name }} nvidia.com/device-plugin.config=config2 --overwrite
    when:
    - "hostvars[item].gpu_label == 'sp_t4'"
    - "hostvars[item].gpu_share_label == 'sp_time'"
    with_items: ["{{ groups.master }}", "{{ groups.node }}"]
  # Time-slicing and MPS are mutually exclusive on a given node; different nodes in the cluster
  # can mix the two modes. The default here is config0, set via --set config.default below.
  - name: Deploy nvidia-device-plugin (type1, gfd enabled)
    shell: >
      helm upgrade -i nvdp
      {{ tmp_dir }}/nvidia/nvidia-device-plugin-{{ nvidia_gpu_plugin_ver }}.tgz
      --namespace nvidia-device-plugin
      --create-namespace
      --set gfd.enabled=true
      --set config.default=config0
      --set-file config.map.config0={{ tmp_dir }}/nvidia/dp-only-one-config.yaml
      --set-file config.map.config1={{ tmp_dir }}/nvidia/dp-mps-config.yaml
      --set-file config.map.config2={{ tmp_dir }}/nvidia/dp-time-slicing-config.yaml
    when:
    - "cluster_network != 'kube-ovn'"
  # When using ovn, gfd fails with: grpc: addrConn.createTransport failed to connect to
  # {Addr: "nvdp-node-feature-discovery-master:8080", ServerName: "nvdp-node-feature-discovery-master:8080", }
  # gfd workaround: it conflicts with the ovn network, so simply drop --set gfd.enabled=true
  - name: Deploy nvidia-device-plugin (type2, gfd disabled for kube-ovn)
    shell: >
      helm upgrade -i nvdp
      {{ tmp_dir }}/nvidia/nvidia-device-plugin-{{ nvidia_gpu_plugin_ver }}.tgz
      --namespace nvidia-device-plugin
      --create-namespace
      --set config.default=config0
      --set-file config.map.config0={{ tmp_dir }}/nvidia/dp-only-one-config.yaml
      --set-file config.map.config1={{ tmp_dir }}/nvidia/dp-mps-config.yaml
      --set-file config.map.config2={{ tmp_dir }}/nvidia/dp-time-slicing-config.yaml
    when:
    - "cluster_network == 'kube-ovn'"
  # Optional flags:
  #   --set compatWithCPUManager=true  # enable CPUManager compatibility; runs with a 100m CPU request and a 512Mi memory limit
  #   --set resources.requests.cpu=100m
  #   --set resources.limits.memory=512Mi
  # Standalone deployment of gpu-feature-discovery:
  #   helm upgrade --install nvdp {{ tmp_dir }}/nvidia/nvidia-device-plugin-{{ nvidia_gpu_plugin_ver }}.tgz
  #     --namespace gpu-feature-discovery
  #     --create-namespace
  #     --set devicePlugin.enabled=false
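  # Quick verification (illustrative): once the nvdp pods are Running, GPU nodes should
  # advertise the extended resource:
  #   kubectl -n nvidia-device-plugin get pods -o wide
  #   kubectl describe node <gpu-node> | grep -A3 'nvidia.com/gpu'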
  - name: Deploy dcgm-exporter
    shell: >
      helm upgrade -i dcgm-exporter --namespace nvidia-device-plugin
      {{ tmp_dir }}/nvidia/dcgm-exporter-3.5.0.tgz
      -f {{ tmp_dir }}/nvidia/dcgm-exporter-helm-values.yaml
    tags:
    - nvidia-device-plugin
    - gpu
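  # Optional check (illustrative; the service name depends on the chart's defaults):
  # dcgm-exporter exposes Prometheus metrics on port 9400.
  #   kubectl -n nvidia-device-plugin port-forward svc/dcgm-exporter 9400:9400
  #   curl -s localhost:9400/metrics | grep DCGM_FI_DEV_GPU_UTIL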
  # Method 2: create the plugin config as a ConfigMap and reference it by name:
  #   kubectl create ns nvidia-device-plugin
  #   kubectl create cm -n nvidia-device-plugin nvidia-plugin-configs \
  #     --from-file=config0=/tmp/dp-example-config0.yaml \
  #     --from-file=config1=/tmp/dp-example-config1.yaml
  #   helm upgrade -i nvdp nvdp/nvidia-device-plugin \
  #     --version=0.16.1 \
  #     --namespace nvidia-device-plugin \
  #     --create-namespace \
  #     --set config.default=config0 \
  #     --set config.name=nvidia-plugin-configs
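  # Smoke test (illustrative, not part of this playbook): a pod requesting one nvidia.com/gpu;
  # the CUDA sample image and tag are assumptions, substitute any GPU-enabled image.
  #   apiVersion: v1
  #   kind: Pod
  #   metadata:
  #     name: gpu-smoke-test
  #   spec:
  #     restartPolicy: Never
  #     containers:
  #     - name: cuda-vectoradd
  #       image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04
  #       resources:
  #         limits:
  #           nvidia.com/gpu: 1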