Skip to content

Commit

Permalink
Run PR check for guided notebooks
Browse files Browse the repository at this point in the history
  • Loading branch information
sutaakar committed Jul 25, 2024
1 parent f100ba1 commit cd7fc9b
Show file tree
Hide file tree
Showing 8 changed files with 153 additions and 60 deletions.
20 changes: 20 additions & 0 deletions .github/resources/minio_remote_config_cell.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@ray.remote\n",
"def get_minio_run_config():\n",
" import s3fs\n",
" import pyarrow\n",
" s3_fs = s3fs.S3FileSystem(\n",
" key = \"minio\",\n",
" secret = \"minio123\",\n",
" endpoint_url = \"http://minio-service.default.svc.cluster.local:9000\"\n",
" )\n",
" custom_fs = pyarrow.fs.PyFileSystem(pyarrow.fs.FSSpecHandler(s3_fs))\n",
" run_config = ray.train.RunConfig(storage_path='training', storage_filesystem=custom_fs)\n",
" return run_config"
]
}
20 changes: 20 additions & 0 deletions .github/resources/wait_for_job_cell.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from time import sleep\n",
"\n",
"finished = False\n",
"while not finished:\n",
" sleep(5)\n",
" status = client.get_job_status(submission_id)\n",
" finished = (status == \"SUCCEEDED\" or status == \"FAILED\" or status == \"STOPPED\")\n",
" print(status)\n",
"print(\"Job status \" + status)\n",
"print(\"Logs: \")\n",
"print(client.get_job_logs(submission_id))\n",
"assert status == \"SUCCEEDED\", \"Job failed or was stopped!\""
]
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: e2e
name: Guided notebooks tests

on:
pull_request:
Expand Down Expand Up @@ -27,6 +27,10 @@ concurrency:
env:
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"

permissions:
id-token: write # This is required for requesting the JWT
contents: read

jobs:
kubernetes:

Expand Down Expand Up @@ -76,6 +80,8 @@ jobs:

- name: Install NVidia GPU operator for KinD
uses: ./common/github-actions/nvidia-gpu-operator
with:
enable-time-slicing: 'true'

- name: Deploy CodeFlare stack
id: deploy
Expand All @@ -88,6 +94,11 @@ jobs:
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
cd ..
- name: Install MINIO
run: |
kubectl apply -f ./tests/e2e/minio_deployment.yaml
kubectl wait --timeout=120s --for=condition=Available=true deployment -n default minio
- name: Add user to KinD
uses: ./common/github-actions/kind-add-user
with:
Expand All @@ -113,46 +124,99 @@ jobs:
kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
kubectl config use-context sdk-user
- name: Run e2e tests
- name: Setup Guided notebooks execution
run: |
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
echo "Installing papermill and dependencies..."
pip install poetry papermill ipython ipykernel
          # Disable virtualenv creation, as papermill has problems using packages installed in a virtualenv
poetry config virtualenvs.create false
set -euo pipefail
pip install poetry
echo "Installing SDK..."
poetry install --with test,docs
echo "Running e2e tests..."
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
- name: Run 0_basic_ray.ipynb
run: |
set -euo pipefail
# Remove login/logout cells, as KinD doesn't support authentication using token
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb
# Run notebook
# poetry run papermill 0_basic_ray.ipynb 0_basic_ray_out.ipynb --log-output --execution-timeout 600
working-directory: demo-notebooks/guided-demos

- name: Run 1_cluster_job_client.ipynb
run: |
set -euo pipefail
# Remove login/logout cells, as KinD doesn't support authentication using token
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb
          # Replace async log streaming with waiting for the job to finish, as async logs don't work properly in papermill
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb
# Run notebook
# poetry run papermill 1_cluster_job_client.ipynb 1_cluster_job_client_out.ipynb --log-output --execution-timeout 1200
working-directory: demo-notebooks/guided-demos

- name: Run 2_basic_interactive.ipynb
run: |
set -euo pipefail
# Remove login/logout cells, as KinD doesn't support authentication using token
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
          # Rewrite cluster_uri() to local_client_url() to retrieve a client URL reachable from outside the cluster, as the test is executed outside of the cluster
sed -i "s/cluster_uri()/local_client_url()/" 2_basic_interactive.ipynb
          # Set an explicit namespace, as the SDK (currently) needs it to resolve local queues
sed -i "s/head_cpus=1,/head_cpus=1, namespace='default',/" 2_basic_interactive.ipynb
# Add MINIO related modules to runtime environment
sed -i "s/\\\\\"transformers/\\\\\"s3fs\\\\\", \\\\\"pyarrow\\\\\", \\\\\"transformers/" 2_basic_interactive.ipynb
# Replace markdown cell with remote configuration for MINIO
MINIO_CONFIG=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/minio_remote_config_cell.json)
jq --argjson minio_config "$MINIO_CONFIG" -r '(.cells[] | select(.source[] | contains("Now that we are connected"))) |= $minio_config' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
# Configure persistent storage for Ray trainer
sed -i -E "s/# run_config.*\)/, run_config=ray.get(get_minio_run_config.remote())/" 2_basic_interactive.ipynb
# Run notebook
poetry run papermill 2_basic_interactive.ipynb 2_basic_interactive_out.ipynb --log-output --execution-timeout 1200
env:
GRPC_DNS_RESOLVER: "native"
working-directory: demo-notebooks/guided-demos

- name: Switch to kind-cluster context to print logs
if: always() && steps.deploy.outcome == 'success'
run: kubectl config use-context kind-cluster

- name: Print Pytest output log
- name: Print debug info
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing Pytest output logs"
cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
echo "Printing debug info"
kubectl describe pods -n default
- name: Print CodeFlare operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing CodeFlare operator logs"
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${TEMP_DIR}/codeflare-operator.log
- name: Print Kueue operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing Kueue operator logs"
KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}')
kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee ${TEMP_DIR}/kueue.log
- name: Print KubeRay operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing KubeRay operator logs"
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${TEMP_DIR}/kuberay.log
- name: Export all KinD pod logs
uses: ./common/github-actions/kind-export-logs
if: always() && steps.deploy.outcome == 'success'
with:
output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}
output-directory: ${TEMP_DIR}

- name: Upload logs
uses: actions/upload-artifact@v4
Expand All @@ -161,4 +225,4 @@ jobs:
name: logs
retention-days: 10
path: |
${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
${{ env.TEMP_DIR }}/**/*.log
4 changes: 3 additions & 1 deletion demo-notebooks/guided-demos/0_basic_ray.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,12 @@
"# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
"cluster = Cluster(ClusterConfiguration(\n",
" name='raytest', \n",
" head_cpus='500m',\n",
" head_memory=2,\n",
" head_gpus=0, # For GPU enabled workloads set the head_gpus and num_gpus\n",
" num_gpus=0,\n",
" num_workers=2,\n",
" min_cpus=1,\n",
" min_cpus='250m',\n",
" max_cpus=1,\n",
" min_memory=4,\n",
" max_memory=4,\n",
Expand Down
4 changes: 3 additions & 1 deletion demo-notebooks/guided-demos/1_cluster_job_client.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,12 @@
"# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
"cluster = Cluster(ClusterConfiguration(\n",
" name='jobtest',\n",
" head_cpus=1,\n",
" head_memory=4,\n",
" head_gpus=1, # For GPU enabled workloads set the head_gpus and num_gpus\n",
" num_gpus=1,\n",
" num_workers=2,\n",
" min_cpus=1,\n",
" min_cpus='250m',\n",
" max_cpus=1,\n",
" min_memory=4,\n",
" max_memory=4,\n",
Expand Down
22 changes: 17 additions & 5 deletions demo-notebooks/guided-demos/2_basic_interactive.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,15 @@
"cluster_name = \"interactivetest\"\n",
"cluster = Cluster(ClusterConfiguration(\n",
" name=cluster_name,\n",
" head_cpus=1,\n",
" head_memory=4,\n",
" head_gpus=1, # For GPU enabled workloads set the head_gpus and num_gpus\n",
" num_gpus=1,\n",
" num_workers=2,\n",
" min_cpus=2,\n",
" max_cpus=2,\n",
" min_memory=8,\n",
" max_memory=8,\n",
" min_cpus='250m',\n",
" max_cpus=1,\n",
" min_memory=4,\n",
" max_memory=4,\n",
" image=\"quay.io/rhoai/ray:2.23.0-py39-cu121\",\n",
" write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
" # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
Expand Down Expand Up @@ -251,7 +253,17 @@
"\n",
" ray_trainer = TorchTrainer(\n",
" train_func,\n",
" scaling_config=ScalingConfig(num_workers=3, use_gpu=True),\n",
" scaling_config=ScalingConfig(\n",
" # num_workers = number of worker nodes with the ray head node included\n",
" num_workers=3,\n",
" use_gpu=True,\n",
" resources_per_worker={\n",
" \"CPU\": 1,\n",
" },\n",
" trainer_resources={\n",
" \"CPU\": 0,\n",
" }\n",
" )\n",
" # Configure persistent storage that is accessible across \n",
" # all worker nodes.\n",
" # Uncomment and update the RunConfig below to include your storage details.\n",
Expand Down
12 changes: 10 additions & 2 deletions demo-notebooks/guided-demos/mnist_fashion.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,16 @@ def train_func_distributed():
trainer = TorchTrainer(
train_func_distributed,
scaling_config=ScalingConfig(
num_workers=3, use_gpu=use_gpu
), # num_workers = number of worker nodes with the ray head node included
# num_workers = number of worker nodes with the ray head node included
num_workers=3,
use_gpu=use_gpu,
resources_per_worker={
"CPU": 1,
},
trainer_resources={
"CPU": 0,
},
),
)

results = trainer.fit()
37 changes: 1 addition & 36 deletions tests/e2e/minio_deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,7 @@ spec:
mountPath: /data
subPath: minio
terminationMessagePolicy: File
image: >-
quay.io/minio/minio:RELEASE.2024-06-22T05-26-45Z
# In case of disconnected environment, use image digest instead of tag
# For example : <mirror_registry_endpoint>/minio/minio@sha256:6b3abf2f59286b985bfde2b23e37230b466081eda5dccbf971524d54c8e406b5
image: quay.io/minio/minio:RELEASE.2024-06-22T05-26-45Z
args:
- server
- /data
Expand Down Expand Up @@ -129,35 +126,3 @@ spec:
sessionAffinity: None
selector:
app: minio
---
kind: Route
apiVersion: route.openshift.io/v1
metadata:
name: minio-api
spec:
to:
kind: Service
name: minio-service
weight: 100
port:
targetPort: api
wildcardPolicy: None
tls:
termination: edge
insecureEdgeTerminationPolicy: Redirect
---
kind: Route
apiVersion: route.openshift.io/v1
metadata:
name: minio-ui
spec:
to:
kind: Service
name: minio-service
weight: 100
port:
targetPort: ui
wildcardPolicy: None
tls:
termination: edge
insecureEdgeTerminationPolicy: Redirect

0 comments on commit cd7fc9b

Please sign in to comment.