Run PR check for guided notebooks #34
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Guided notebooks tests | |
on: | |
pull_request: | |
branches: | |
- main | |
- 'release-*' | |
paths-ignore: | |
- 'docs/**' | |
- '**.adoc' | |
- '**.md' | |
- 'LICENSE' | |
push: | |
branches: | |
- main | |
- 'release-*' | |
paths-ignore: | |
- 'docs/**' | |
- '**.adoc' | |
- '**.md' | |
- 'LICENSE' | |
concurrency: | |
group: ${{ github.head_ref }}-${{ github.workflow }} | |
cancel-in-progress: true | |
env: | |
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev" | |
permissions: | |
id-token: write # This is required for requesting the JWT | |
contents: read | |
jobs: | |
kubernetes: | |
runs-on: ubuntu-20.04-4core-gpu | |
steps: | |
- name: Checkout code | |
uses: actions/checkout@v4 | |
with: | |
submodules: recursive | |
- name: Checkout common repo code | |
uses: actions/checkout@v4 | |
with: | |
repository: 'project-codeflare/codeflare-common' | |
ref: 'main' | |
path: 'common' | |
- name: Checkout CodeFlare operator repository | |
uses: actions/checkout@v4 | |
with: | |
repository: project-codeflare/codeflare-operator | |
path: codeflare-operator | |
- name: Set Go | |
uses: actions/setup-go@v5 | |
with: | |
go-version-file: './codeflare-operator/go.mod' | |
cache-dependency-path: "./codeflare-operator/go.sum" | |
- name: Set up gotestfmt | |
uses: gotesttools/gotestfmt-action@v2 | |
with: | |
token: ${{ secrets.GITHUB_TOKEN }} | |
- name: Set up specific Python version | |
uses: actions/setup-python@v5 | |
with: | |
python-version: '3.9' | |
cache: 'pip' # caching pip dependencies | |
- name: Setup NVidia GPU environment for KinD | |
uses: ./common/github-actions/nvidia-gpu-setup | |
- name: Setup and start KinD cluster | |
uses: ./common/github-actions/kind | |
- name: Install NVidia GPU operator for KinD | |
uses: ./common/github-actions/nvidia-gpu-operator | |
with: | |
enable-time-slicing: 'true' | |
- name: Deploy CodeFlare stack | |
id: deploy | |
run: | | |
cd codeflare-operator | |
echo Setting up CodeFlare stack | |
make setup-e2e | |
echo Deploying CodeFlare operator | |
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e" | |
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager | |
cd .. | |
- name: Install MINIO | |
run: | | |
kubectl apply -f ./tests/e2e/minio_deployment.yaml | |
kubectl wait --timeout=120s --for=condition=Available=true deployment -n default minio | |
- name: Add user to KinD | |
uses: ./common/github-actions/kind-add-user | |
with: | |
user-name: sdk-user | |
- name: Configure RBAC for sdk user with limited permissions | |
run: | | |
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses | |
kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user | |
kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces | |
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user | |
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters | |
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user | |
kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers | |
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user | |
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors | |
kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user | |
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues | |
kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user | |
kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues | |
kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user | |
kubectl create clusterrole list-secrets --verb=get,list --resource=secrets | |
kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user | |
kubectl config use-context sdk-user | |
- name: Setup Guided notebooks execution | |
run: | | |
echo "Installing papermill and dependencies..." | |
pip install poetry papermill ipython ipykernel | |
# Disable virtualenv due to problems using packaged in virtualenv in papermill | |
poetry config virtualenvs.create false | |
echo "Installing SDK..." | |
poetry install --with test,docs | |
- name: Run 0_basic_ray.ipynb | |
run: | | |
set -euo pipefail | |
# Remove login/logout cells, as KinD doesn't support authentication using token | |
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb | |
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb | |
# Run notebook | |
poetry run papermill 0_basic_ray.ipynb 0_basic_ray_out.ipynb --log-output --execution-timeout 600 | |
working-directory: demo-notebooks/guided-demos | |
- name: Run 1_cluster_job_client.ipynb | |
run: | | |
set -euo pipefail | |
# Remove login/logout cells, as KinD doesn't support authentication using token | |
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb | |
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb | |
# Replace async logs with waiting for job to finish, async logs don't work properly in papermill | |
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json) | |
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb | |
# Run notebook | |
poetry run papermill 1_cluster_job_client.ipynb 1_cluster_job_client_out.ipynb --log-output --execution-timeout 1200 | |
working-directory: demo-notebooks/guided-demos | |
- name: Run 2_basic_interactive.ipynb | |
run: | | |
set -euo pipefail | |
# Remove login/logout cells, as KinD doesn't support authentication using token | |
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb | |
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb | |
# Rewrite cluster_uri() to local_client_url() to retrieve client URL available out of cluster, as the test is executed outside of cluster | |
sed -i "s/cluster_uri()/local_client_url()/" 2_basic_interactive.ipynb | |
# Set explicit namespace as SDK need it (currently) to resolve local queues | |
sed -i "s/head_cpus=1,/head_cpus=1, namespace='default',/" 2_basic_interactive.ipynb | |
# Add MINIO related modules to runtime environment | |
sed -i "s/\\\\\"transformers/\\\\\"s3fs\\\\\", \\\\\"pyarrow\\\\\", \\\\\"transformers/" 2_basic_interactive.ipynb | |
# Replace markdown cell with remote configuration for MINIO | |
MINIO_CONFIG=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/minio_remote_config_cell.json) | |
jq --argjson minio_config "$MINIO_CONFIG" -r '(.cells[] | select(.source[] | contains("Now that we are connected"))) |= $minio_config' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb | |
# Configure persistent storage for Ray trainer | |
sed -i -E "s/# run_config.*\)/, run_config=ray.get(get_minio_run_config.remote())/" 2_basic_interactive.ipynb | |
# Run notebook | |
poetry run papermill 2_basic_interactive.ipynb 2_basic_interactive_out.ipynb --log-output --execution-timeout 1200 | |
env: | |
GRPC_DNS_RESOLVER: "native" | |
working-directory: demo-notebooks/guided-demos | |
- name: Switch to kind-cluster context to print logs | |
if: always() && steps.deploy.outcome == 'success' | |
run: kubectl config use-context kind-cluster | |
- name: Print debug info | |
if: always() && steps.deploy.outcome == 'success' | |
run: | | |
echo "Printing debug info" | |
kubectl describe pods -n default | |
- name: Print CodeFlare operator logs | |
if: always() && steps.deploy.outcome == 'success' | |
run: | | |
echo "Printing CodeFlare operator logs" | |
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${TEMP_DIR}/codeflare-operator.log | |
- name: Print Kueue operator logs | |
if: always() && steps.deploy.outcome == 'success' | |
run: | | |
echo "Printing Kueue operator logs" | |
KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}') | |
kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee ${TEMP_DIR}/kueue.log | |
- name: Print KubeRay operator logs | |
if: always() && steps.deploy.outcome == 'success' | |
run: | | |
echo "Printing KubeRay operator logs" | |
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${TEMP_DIR}/kuberay.log | |
- name: Export all KinD pod logs | |
uses: ./common/github-actions/kind-export-logs | |
if: always() && steps.deploy.outcome == 'success' | |
with: | |
output-directory: ${TEMP_DIR} | |
- name: Upload logs | |
uses: actions/upload-artifact@v4 | |
if: always() && steps.deploy.outcome == 'success' | |
with: | |
name: logs | |
retention-days: 10 | |
path: | | |
${{ env.TEMP_DIR }}/**/*.log |