diff --git a/.github/resources/minio_remote_config_cell.json b/.github/resources/minio_remote_config_cell.json new file mode 100644 index 000000000..e36c4b188 --- /dev/null +++ b/.github/resources/minio_remote_config_cell.json @@ -0,0 +1,20 @@ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@ray.remote\n", + "def get_minio_run_config():\n", + " import s3fs\n", + " import pyarrow.fs\n", + " s3_fs = s3fs.S3FileSystem(\n", + " key = \"minio\",\n", + " secret = \"minio123\",\n", + " endpoint_url = \"http://minio-service.default.svc.cluster.local:9000\"\n", + " )\n", + " custom_fs = pyarrow.fs.PyFileSystem(pyarrow.fs.FSSpecHandler(s3_fs))\n", + " run_config = ray.train.RunConfig(storage_path='training', storage_filesystem=custom_fs)\n", + " return run_config" + ] + } diff --git a/.github/resources/wait_for_job_cell.json b/.github/resources/wait_for_job_cell.json new file mode 100644 index 000000000..eb8805bd4 --- /dev/null +++ b/.github/resources/wait_for_job_cell.json @@ -0,0 +1,20 @@ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from time import sleep\n", + "\n", + "finished = False\n", + "while not finished:\n", + " sleep(5)\n", + " status = client.get_job_status(submission_id)\n", + " finished = (status == \"SUCCEEDED\" or status == \"FAILED\" or status == \"STOPPED\")\n", + " print(status)\n", + "print(\"Job status \" + status)\n", + "print(\"Logs: \")\n", + "print(client.get_job_logs(submission_id))\n", + "assert status == \"SUCCEEDED\", \"Job failed or was stopped!\"" + ] + } diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/guided_notebook_tests.yaml similarity index 50% rename from .github/workflows/e2e_tests.yaml rename to .github/workflows/guided_notebook_tests.yaml index d216df9d7..0c367b0d7 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/guided_notebook_tests.yaml @@ -1,4 +1,4 @@ -name: e2e +name: Guided
notebooks tests on: pull_request: @@ -27,6 +27,10 @@ concurrency: env: CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev" +permissions: + id-token: write # This is required for requesting the JWT + contents: read + jobs: kubernetes: @@ -76,6 +80,8 @@ jobs: - name: Install NVidia GPU operator for KinD uses: ./common/github-actions/nvidia-gpu-operator + with: + enable-time-slicing: 'true' - name: Deploy CodeFlare stack id: deploy @@ -88,6 +94,11 @@ jobs: kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager cd .. + - name: Install MINIO + run: | + kubectl apply -f ./tests/e2e/minio_deployment.yaml + kubectl wait --timeout=120s --for=condition=Available=true deployment -n default minio + - name: Add user to KinD uses: ./common/github-actions/kind-add-user with: @@ -113,46 +124,99 @@ jobs: kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user kubectl config use-context sdk-user - - name: Run e2e tests + - name: Setup Guided notebooks execution run: | - export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }} - echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV + echo "Installing papermill and dependencies..." + pip install poetry papermill ipython ipykernel + # Disable virtualenv creation, as papermill has problems using packages installed in a virtualenv + poetry config virtualenvs.create false - set -euo pipefail - pip install poetry + echo "Installing SDK..." poetry install --with test,docs - echo "Running e2e tests..."
- poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1 + + - name: Run 0_basic_ray.ipynb + run: | + set -euo pipefail + + # Remove login/logout cells, as KinD doesn't support authentication using token + jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb + jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb + # Run notebook + poetry run papermill 0_basic_ray.ipynb 0_basic_ray_out.ipynb --log-output --execution-timeout 600 + working-directory: demo-notebooks/guided-demos + + - name: Run 1_cluster_job_client.ipynb + run: | + set -euo pipefail + + # Remove login/logout cells, as KinD doesn't support authentication using token + jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb + jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb + # Replace async logs with waiting for job to finish, async logs don't work properly in papermill + JOB_WAIT=$(jq -r '.'
${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json) + jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb + # Run notebook + poetry run papermill 1_cluster_job_client.ipynb 1_cluster_job_client_out.ipynb --log-output --execution-timeout 1200 + working-directory: demo-notebooks/guided-demos + + - name: Run 2_basic_interactive.ipynb + run: | + set -euo pipefail + + # Remove login/logout cells, as KinD doesn't support authentication using token + jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb + jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb + # Rewrite cluster_uri() to local_client_url() to retrieve client URL available out of cluster, as the test is executed outside of cluster + sed -i "s/cluster_uri()/local_client_url()/" 2_basic_interactive.ipynb + # Set explicit namespace as the SDK needs it (currently) to resolve local queues + sed -i "s/head_cpus=1,/head_cpus=1, namespace='default',/" 2_basic_interactive.ipynb + # Add MINIO related modules to runtime environment + sed -i "s/\\\\\"transformers/\\\\\"s3fs\\\\\", \\\\\"pyarrow\\\\\", \\\\\"transformers/" 2_basic_interactive.ipynb + # Replace markdown cell with remote configuration for MINIO + MINIO_CONFIG=$(jq -r '.'
${GITHUB_WORKSPACE}/.github/resources/minio_remote_config_cell.json) + jq --argjson minio_config "$MINIO_CONFIG" -r '(.cells[] | select(.source[] | contains("Now that we are connected"))) |= $minio_config' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb + # Configure persistent storage for Ray trainer + sed -i -E "s/# run_config.*\)/, run_config=ray.get(get_minio_run_config.remote())/" 2_basic_interactive.ipynb + # Run notebook + poetry run papermill 2_basic_interactive.ipynb 2_basic_interactive_out.ipynb --log-output --execution-timeout 1200 env: GRPC_DNS_RESOLVER: "native" + working-directory: demo-notebooks/guided-demos - name: Switch to kind-cluster context to print logs if: always() && steps.deploy.outcome == 'success' run: kubectl config use-context kind-cluster - - name: Print Pytest output log + - name: Print debug info if: always() && steps.deploy.outcome == 'success' run: | - echo "Printing Pytest output logs" - cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log + echo "Printing debug info" + kubectl describe pods -n default - name: Print CodeFlare operator logs if: always() && steps.deploy.outcome == 'success' run: | echo "Printing CodeFlare operator logs" - kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log + kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${TEMP_DIR}/codeflare-operator.log + + - name: Print Kueue operator logs + if: always() && steps.deploy.outcome == 'success' + run: | + echo "Printing Kueue operator logs" + KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}') + kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee ${TEMP_DIR}/kueue.log - name: Print KubeRay operator logs if: always() && steps.deploy.outcome == 'success' run: | echo "Printing KubeRay operator 
logs" - kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log + kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${TEMP_DIR}/kuberay.log - name: Export all KinD pod logs uses: ./common/github-actions/kind-export-logs if: always() && steps.deploy.outcome == 'success' with: - output-directory: ${CODEFLARE_TEST_OUTPUT_DIR} + output-directory: ${TEMP_DIR} - name: Upload logs uses: actions/upload-artifact@v4 @@ -161,4 +225,4 @@ jobs: name: logs retention-days: 10 path: | - ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log + ${{ env.TEMP_DIR }}/**/*.log diff --git a/demo-notebooks/guided-demos/0_basic_ray.ipynb b/demo-notebooks/guided-demos/0_basic_ray.ipynb index 3f0f62e47..11f3a3b2e 100644 --- a/demo-notebooks/guided-demos/0_basic_ray.ipynb +++ b/demo-notebooks/guided-demos/0_basic_ray.ipynb @@ -62,10 +62,12 @@ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", "cluster = Cluster(ClusterConfiguration(\n", " name='raytest', \n", + " head_cpus='500m',\n", + " head_memory=2,\n", " head_gpus=0, # For GPU enabled workloads set the head_gpus and num_gpus\n", " num_gpus=0,\n", " num_workers=2,\n", - " min_cpus=1,\n", + " min_cpus='250m',\n", " max_cpus=1,\n", " min_memory=4,\n", " max_memory=4,\n", diff --git a/demo-notebooks/guided-demos/1_cluster_job_client.ipynb b/demo-notebooks/guided-demos/1_cluster_job_client.ipynb index 00576024a..bd5d69657 100644 --- a/demo-notebooks/guided-demos/1_cluster_job_client.ipynb +++ b/demo-notebooks/guided-demos/1_cluster_job_client.ipynb @@ -44,10 +44,12 @@ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", "cluster = Cluster(ClusterConfiguration(\n", " name='jobtest',\n", + " 
head_cpus=1,\n", + " head_memory=4,\n", " head_gpus=1, # For GPU enabled workloads set the head_gpus and num_gpus\n", " num_gpus=1,\n", " num_workers=2,\n", - " min_cpus=1,\n", + " min_cpus='250m',\n", " max_cpus=1,\n", " min_memory=4,\n", " max_memory=4,\n", diff --git a/demo-notebooks/guided-demos/2_basic_interactive.ipynb b/demo-notebooks/guided-demos/2_basic_interactive.ipynb index 0692caa4c..935fb031e 100644 --- a/demo-notebooks/guided-demos/2_basic_interactive.ipynb +++ b/demo-notebooks/guided-demos/2_basic_interactive.ipynb @@ -60,13 +60,15 @@ "cluster_name = \"interactivetest\"\n", "cluster = Cluster(ClusterConfiguration(\n", " name=cluster_name,\n", + " head_cpus=1,\n", + " head_memory=4,\n", " head_gpus=1, # For GPU enabled workloads set the head_gpus and num_gpus\n", " num_gpus=1,\n", " num_workers=2,\n", - " min_cpus=2,\n", - " max_cpus=2,\n", - " min_memory=8,\n", - " max_memory=8,\n", + " min_cpus='250m',\n", + " max_cpus=1,\n", + " min_memory=4,\n", + " max_memory=4,\n", " image=\"quay.io/rhoai/ray:2.23.0-py39-cu121\",\n", " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", @@ -251,7 +253,17 @@ "\n", " ray_trainer = TorchTrainer(\n", " train_func,\n", - " scaling_config=ScalingConfig(num_workers=3, use_gpu=True),\n", + " scaling_config=ScalingConfig(\n", + " # num_workers = number of worker nodes with the ray head node included\n", + " num_workers=3,\n", + " use_gpu=True,\n", + " resources_per_worker={\n", + " \"CPU\": 1,\n", + " },\n", + " trainer_resources={\n", + " \"CPU\": 0,\n", + " }\n", + " )\n", " # Configure persistent storage that is accessible across \n", " # all worker nodes.\n", " # Uncomment and update the RunConfig below to include your storage details.\n", diff --git a/demo-notebooks/guided-demos/mnist_fashion.py b/demo-notebooks/guided-demos/mnist_fashion.py index 85cd6e64c..ba5b2636c 100644 --- 
a/demo-notebooks/guided-demos/mnist_fashion.py +++ b/demo-notebooks/guided-demos/mnist_fashion.py @@ -78,8 +78,16 @@ def train_func_distributed(): trainer = TorchTrainer( train_func_distributed, scaling_config=ScalingConfig( - num_workers=3, use_gpu=use_gpu - ), # num_workers = number of worker nodes with the ray head node included + # num_workers = number of worker nodes with the ray head node included + num_workers=3, + use_gpu=use_gpu, + resources_per_worker={ + "CPU": 1, + }, + trainer_resources={ + "CPU": 0, + }, + ), ) results = trainer.fit() diff --git a/tests/e2e/minio_deployment.yaml b/tests/e2e/minio_deployment.yaml index 86d4ef01f..b2cdc54a9 100644 --- a/tests/e2e/minio_deployment.yaml +++ b/tests/e2e/minio_deployment.yaml @@ -88,10 +88,7 @@ spec: mountPath: /data subPath: minio terminationMessagePolicy: File - image: >- - quay.io/minio/minio:RELEASE.2024-06-22T05-26-45Z - # In case of disconnected environment, use image digest instead of tag - # For example : /minio/minio@sha256:6b3abf2f59286b985bfde2b23e37230b466081eda5dccbf971524d54c8e406b5 + image: quay.io/minio/minio:RELEASE.2024-06-22T05-26-45Z args: - server - /data @@ -129,35 +126,3 @@ spec: sessionAffinity: None selector: app: minio ---- -kind: Route -apiVersion: route.openshift.io/v1 -metadata: - name: minio-api -spec: - to: - kind: Service - name: minio-service - weight: 100 - port: - targetPort: api - wildcardPolicy: None - tls: - termination: edge - insecureEdgeTerminationPolicy: Redirect ---- -kind: Route -apiVersion: route.openshift.io/v1 -metadata: - name: minio-ui -spec: - to: - kind: Service - name: minio-service - weight: 100 - port: - targetPort: ui - wildcardPolicy: None - tls: - termination: edge - insecureEdgeTerminationPolicy: Redirect