-
Notifications
You must be signed in to change notification settings - Fork 44
228 lines (194 loc) · 10.8 KB
/
guided_notebook_tests.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
name: Guided notebooks tests
on:
pull_request:
branches:
- main
- 'release-*'
paths-ignore:
- 'docs/**'
- '**.adoc'
- '**.md'
- 'LICENSE'
push:
branches:
- main
- 'release-*'
paths-ignore:
- 'docs/**'
- '**.adoc'
- '**.md'
- 'LICENSE'
concurrency:
group: ${{ github.head_ref }}-${{ github.workflow }}
cancel-in-progress: true
env:
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
permissions:
id-token: write # This is required for requesting the JWT
contents: read
jobs:
kubernetes:
runs-on: ubuntu-20.04-4core-gpu
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
submodules: recursive
- name: Checkout common repo code
uses: actions/checkout@v4
with:
repository: 'project-codeflare/codeflare-common'
ref: 'main'
path: 'common'
- name: Checkout CodeFlare operator repository
uses: actions/checkout@v4
with:
repository: project-codeflare/codeflare-operator
path: codeflare-operator
- name: Set Go
uses: actions/setup-go@v5
with:
go-version-file: './codeflare-operator/go.mod'
cache-dependency-path: "./codeflare-operator/go.sum"
- name: Set up gotestfmt
uses: gotesttools/gotestfmt-action@v2
with:
token: ${{ secrets.GITHUB_TOKEN }}
- name: Set up specific Python version
uses: actions/setup-python@v5
with:
python-version: '3.9'
cache: 'pip' # caching pip dependencies
- name: Setup NVidia GPU environment for KinD
uses: ./common/github-actions/nvidia-gpu-setup
- name: Setup and start KinD cluster
uses: ./common/github-actions/kind
- name: Install NVidia GPU operator for KinD
uses: ./common/github-actions/nvidia-gpu-operator
with:
enable-time-slicing: 'true'
- name: Deploy CodeFlare stack
id: deploy
run: |
cd codeflare-operator
echo Setting up CodeFlare stack
make setup-e2e
echo Deploying CodeFlare operator
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
cd ..
- name: Install MINIO
run: |
kubectl apply -f ./tests/e2e/minio_deployment.yaml
kubectl wait --timeout=120s --for=condition=Available=true deployment -n default minio
- name: Add user to KinD
uses: ./common/github-actions/kind-add-user
with:
user-name: sdk-user
- name: Configure RBAC for sdk user with limited permissions
run: |
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
kubectl config use-context sdk-user
- name: Setup Guided notebooks execution
run: |
echo "Installing papermill and dependencies..."
pip install poetry papermill ipython ipykernel
# Disable virtualenv due to problems using packaged in virtualenv in papermill
poetry config virtualenvs.create false
echo "Installing SDK..."
poetry install --with test,docs
- name: Run 0_basic_ray.ipynb
run: |
set -euo pipefail
# Remove login/logout cells, as KinD doesn't support authentication using token
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb
# Run notebook
# poetry run papermill 0_basic_ray.ipynb 0_basic_ray_out.ipynb --log-output --execution-timeout 600
working-directory: demo-notebooks/guided-demos
- name: Run 1_cluster_job_client.ipynb
run: |
set -euo pipefail
# Remove login/logout cells, as KinD doesn't support authentication using token
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb
# Replace async logs with waiting for job to finish, async logs don't work properly in papermill
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb
# Run notebook
# poetry run papermill 1_cluster_job_client.ipynb 1_cluster_job_client_out.ipynb --log-output --execution-timeout 1200
working-directory: demo-notebooks/guided-demos
- name: Run 2_basic_interactive.ipynb
run: |
set -euo pipefail
# Remove login/logout cells, as KinD doesn't support authentication using token
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
# Rewrite cluster_uri() to local_client_url() to retrieve client URL available out of cluster, as the test is executed outside of cluster
sed -i "s/cluster_uri()/local_client_url()/" 2_basic_interactive.ipynb
# Set explicit namespace as SDK need it (currently) to resolve local queues
sed -i "s/head_cpus=1,/head_cpus=1, namespace='default',/" 2_basic_interactive.ipynb
# Add MINIO related modules to runtime environment
sed -i "s/\\\\\"transformers/\\\\\"s3fs\\\\\", \\\\\"pyarrow\\\\\", \\\\\"transformers/" 2_basic_interactive.ipynb
# Replace markdown cell with remote configuration for MINIO
MINIO_CONFIG=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/minio_remote_config_cell.json)
jq --argjson minio_config "$MINIO_CONFIG" -r '(.cells[] | select(.source[] | contains("Now that we are connected"))) |= $minio_config' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
# Configure persistent storage for Ray trainer
sed -i -E "s/# run_config.*\)/, run_config=ray.get(get_minio_run_config.remote())/" 2_basic_interactive.ipynb
# Run notebook
poetry run papermill 2_basic_interactive.ipynb 2_basic_interactive_out.ipynb --log-output --execution-timeout 1200
env:
GRPC_DNS_RESOLVER: "native"
working-directory: demo-notebooks/guided-demos
- name: Switch to kind-cluster context to print logs
if: always() && steps.deploy.outcome == 'success'
run: kubectl config use-context kind-cluster
- name: Print debug info
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing debug info"
kubectl describe pods -n default
- name: Print CodeFlare operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing CodeFlare operator logs"
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${TEMP_DIR}/codeflare-operator.log
- name: Print Kueue operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing Kueue operator logs"
KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}')
kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee ${TEMP_DIR}/kueue.log
- name: Print KubeRay operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing KubeRay operator logs"
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${TEMP_DIR}/kuberay.log
- name: Export all KinD pod logs
uses: ./common/github-actions/kind-export-logs
if: always() && steps.deploy.outcome == 'success'
with:
output-directory: ${TEMP_DIR}
- name: Upload logs
uses: actions/upload-artifact@v4
if: always() && steps.deploy.outcome == 'success'
with:
name: logs
retention-days: 10
path: |
${{ env.TEMP_DIR }}/**/*.log