Skip to content

e2e: Use Ray dashboard API to retrieve job logs #68

e2e: Use Ray dashboard API to retrieve job logs

e2e: Use Ray dashboard API to retrieve job logs #68

Workflow file for this run

# This workflow will build the CodeFlare Operator image and catalog containing bundle with this image, execute OLM upgrade tests using this catalog
name: OLM Install and Upgrade
on:
pull_request:
branches:
- main
- 'release-*'
paths-ignore:
- 'docs/**'
- '**.adoc'
- '**.md'
- 'LICENSE'
concurrency:
group: ${{ github.head_ref }}-${{ github.workflow }}
cancel-in-progress: true
jobs:
kubernetes:
runs-on: ubuntu-20.04
timeout-minutes: 60
env:
OLM_VERSION: v0.24.0
VERSION: "v0.0.0-ghaction" # Need to supply some semver version for bundle to be properly generated
CATALOG_BASE_IMG: "registry.access.redhat.com/redhat/community-operator-index:v4.13"
CODEFLARE_TEST_TIMEOUT_SHORT: "1m"
CODEFLARE_TEST_TIMEOUT_MEDIUM: "5m"
CODEFLARE_TEST_TIMEOUT_LONG: "10m"
steps:
- name: Cleanup
run: |
ls -lart
echo "Initial status:"
df -h
echo "Cleaning up resources:"
sudo swapoff -a
sudo rm -f /swapfile
sudo apt clean
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
docker rmi $(docker image ls -aq)
echo "Final status:"
df -h
- uses: actions/checkout@v3
with:
fetch-depth: 0 # fetching also previous commits to get tags
- name: Set Go
uses: actions/setup-go@v3
with:
go-version: v1.18
- name: Set up gotestfmt
uses: gotesttools/gotestfmt-action@v2
with:
token: ${{ secrets.GITHUB_TOKEN }}
- name: Setup and start KinD cluster
uses: ./.github/actions/kind
- name: Install OLM
run: |
kubectl create -f https://github.com/operator-framework/operator-lifecycle-manager/releases/download/${OLM_VERSION}/crds.yaml
# wait for a while to be sure CRDs are installed
sleep 1
kubectl create -f https://github.com/operator-framework/operator-lifecycle-manager/releases/download/${OLM_VERSION}/olm.yaml
echo Wait for default CatalogSource to start
kubectl wait -n ${{ env.CATALOG_SOURCE_NAMESPACE }} catalogsource/${{ env.CATALOG_SOURCE_NAME }} --for=jsonpath='{.status.connectionState.lastObservedState}'=READY --timeout=180s
env:
CATALOG_SOURCE_NAME: "operatorhubio-catalog"
CATALOG_SOURCE_NAMESPACE: "olm"
- name: Create openshift-operator namespace and OperatorGroup
run: |
# Need to use openshift-operator namespace due to https://github.com/project-codeflare/codeflare-operator/issues/161
kubectl create namespace openshift-operators
kubectl create -f .github/resources-olm-upgrade/operatorgroup.yaml
- name: Deploy latest released CodeFlare operator from OLM
id: deploy
run: |
echo Deploying CodeFlare operator using Subscription
envsubst < .github/resources-olm-upgrade/catalogsource.yaml > ${{ env.TEMP_DIR }}/catalogsource.yaml
envsubst < .github/resources-olm-upgrade/subscription.yaml > ${{ env.TEMP_DIR }}/subscription.yaml
kubectl create -f ${{ env.TEMP_DIR }}/catalogsource.yaml
echo Wait for CatalogSource ${{ env.CATALOG_SOURCE_NAME }} to start
kubectl wait -n ${{ env.CATALOG_SOURCE_NAMESPACE }} catalogsource/${{ env.CATALOG_SOURCE_NAME }} --for=jsonpath='{.status.connectionState.lastObservedState}'=READY --timeout=180s
kubectl create -f ${{ env.TEMP_DIR }}/subscription.yaml
echo Waiting for Subscription to be ready
kubectl wait -n ${{ env.SUBSCRIPTION_NAMESPACE }} subscription/${{ env.SUBSCRIPTION_NAME }} --for=jsonpath='{.status.state}'=AtLatestKnown --timeout=180s
echo Waiting for Deployment to be ready
timeout 60 bash -c 'until [[ $(kubectl get deployment/codeflare-operator-manager -n '${{ env.SUBSCRIPTION_NAMESPACE }}') ]]; do sleep 5 && echo "$(kubectl get deployment/codeflare-operator-manager -n '${{ env.SUBSCRIPTION_NAMESPACE }}')"; done'
kubectl wait -n ${{ env.SUBSCRIPTION_NAMESPACE }} deployment/codeflare-operator-manager --for=condition=Available=true --timeout=60s
env:
CATALOG_SOURCE_NAME: "codeflare-olm-test"
CATALOG_SOURCE_NAMESPACE: "olm"
SUBSCRIPTION_NAME: "codeflare-operator"
SUBSCRIPTION_NAMESPACE: "openshift-operators"
- name: Store latest CSV version as PREVIOUS_VERSION env variable (used for bundle build)
run: |
CSV_VERSION=$(kubectl get ClusterServiceVersion -l operators.coreos.com/codeflare-operator.openshift-operators='' -n openshift-operators -o json | jq -r .items[].spec.version)
echo "PREVIOUS_VERSION=v$CSV_VERSION" >> $GITHUB_ENV
- name: Deploy CodeFlare stack (MCAD, KubeRay)
run: |
make setup-e2e
- name: Build operator and catalog image
run: |
make image-push
make bundle-build
make bundle-push
make catalog-build-from-index
make catalog-push
env:
IMG: "${{ env.REGISTRY_ADDRESS }}/codeflare-operator:v0.0.1"
BUNDLE_IMG: "${{ env.REGISTRY_ADDRESS }}/codeflare-operator-bundle:v0.0.1"
CATALOG_IMG: "${{ env.REGISTRY_ADDRESS }}/codeflare-operator-catalog:v0.0.1"
OPM_BUNDLE_OPT: "--use-http"
BUNDLE_PUSH_OPT: "--tls-verify=false"
CATALOG_PUSH_OPT: "--tls-verify=false"
- name: Update Operator to the built version
run: |
ORIGINAL_POD_NAME=$(kubectl get pod -l app.kubernetes.io/name=codeflare-operator -n openshift-operators -o json | jq -r .items[].metadata.name)
echo "Running old operator pod name is ${ORIGINAL_POD_NAME}"
echo Updating custom CatalogSource image to the built CatalogSource with latest operator
kubectl patch CatalogSource codeflare-olm-test -n olm --type merge --patch "{\"spec\":{\"image\":\"${CATALOG_IMG}\"}}"
echo Waiting for previous operator pod to get deleted
kubectl wait --timeout=120s --for=delete pod/${ORIGINAL_POD_NAME} -n openshift-operators
echo Waiting for Subscription to be ready
kubectl wait -n ${{ env.SUBSCRIPTION_NAMESPACE }} subscription/${{ env.SUBSCRIPTION_NAME }} --for=jsonpath='{.status.state}'=AtLatestKnown --timeout=180s
echo Waiting for Deployment to be ready
timeout 60 bash -c 'until [[ $(kubectl get deployment/codeflare-operator-manager -n '${{ env.SUBSCRIPTION_NAMESPACE }}') ]]; do sleep 5 && echo "$(kubectl get deployment/codeflare-operator-manager -n '${{ env.SUBSCRIPTION_NAMESPACE }}')"; done'
kubectl wait -n ${{ env.SUBSCRIPTION_NAMESPACE }} deployment/codeflare-operator-manager --for=condition=Available=true --timeout=60s
echo Checking that correct CSV is available
CSV_VERSION=$(kubectl get ClusterServiceVersion/codeflare-operator.${VERSION} -n openshift-operators -o json | jq -r .spec.version)
if [ "v${CSV_VERSION}" != "${VERSION}" ]; then
echo "CSV version v${CSV_VERSION} doesn't match expected version ${VERSION}"
exit 1
fi
env:
CATALOG_IMG: "${{ env.REGISTRY_ADDRESS }}/codeflare-operator-catalog:v0.0.1"
SUBSCRIPTION_NAME: "codeflare-operator"
SUBSCRIPTION_NAMESPACE: "openshift-operators"
- name: Run e2e tests against built operator
run: |
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
set -euo pipefail
go test -timeout 30m -v ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
- name: Print CodeFlare operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing CodeFlare operator logs"
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log
- name: Print MCAD controller logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing MCAD controller logs"
kubectl logs -n codeflare-system --tail -1 -l component=multi-cluster-application-dispatcher | tee ${CODEFLARE_TEST_OUTPUT_DIR}/mcad.log
- name: Print KubeRay operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing KubeRay operator logs"
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
- name: Upload logs
uses: actions/upload-artifact@v3
if: always() && steps.deploy.outcome == 'success'
with:
name: logs
retention-days: 10
path: |
${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log