Skip to content

Commit

Permalink
ci: add basic e2e test
Browse files Browse the repository at this point in the history
Signed-off-by: vsoch <[email protected]>
  • Loading branch information
vsoch committed Aug 2, 2024
1 parent 4cb267f commit 6adb6b6
Show file tree
Hide file tree
Showing 4 changed files with 257 additions and 52 deletions.
73 changes: 26 additions & 47 deletions .github/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,55 +6,47 @@

set -eEu -o pipefail

# ensure upstream exists
# This test script assumes fluence image and sidecar are already built
make prepare

# Keep track of root directory to return to
here=$(pwd)

# The "Never" pull policy ensures we use our loaded (just built) images
cd upstream/manifests/install/charts
REGISTRY=ghcr.io/converged-computing

# And then install using the charts. The pull policy ensures we use the loaded ones
helm install \
--set scheduler.image=ghcr.io/flux-framework/fluence:latest \
--set scheduler.sidecarimage=ghcr.io/flux-framework/fluence-sidecar:latest \
--set controller.image=ghcr.io/flux-framework/fluence-controller:latest \
--set controller.pullPolicy=Never \
--set postgres.image=${REGISTRY}/fluxnetes-postgres:latest \
--set scheduler.image=${REGISTRY}/fluxnetes:latest \
--set sidecar.image=${REGISTRY}/fluxnetes-sidecar:latest \
--set postgres.pullPolicy=Never \
--set scheduler.pullPolicy=Never \
--set scheduler.sidecarPullPolicy=Never \
schedscheduler-plugins as-a-second-scheduler/
--set sidecar.pullPolicy=Never \
fluxnetes chart/

# These containers should already be loaded into minikube
echo "Sleeping 10 seconds waiting for scheduler deploy"
sleep 10
kubectl get pods

# This will get the fluence image (which has scheduler and sidecar), which should be first
fluence_pod=$(kubectl get pods -o json | jq -r .items[0].metadata.name)
echo "Found fluence pod ${fluence_pod}"
fluxnetes_pod=$(kubectl get pods -o json | jq -r .items[0].metadata.name)
echo "Found fluxnetes pod ${fluxnetes_pod}"

# Show logs for debugging, if needed
echo
echo "⭐️ kubectl logs ${fluence_pod} -c sidecar"
kubectl logs ${fluence_pod} -c sidecar
echo "⭐️ kubectl logs ${fluxnetes_pod} -c sidecar"
kubectl logs ${fluxnetes_pod} -c sidecar
echo
echo "⭐️ kubectl logs ${fluence_pod} -c scheduler-plugins-scheduler"
kubectl logs ${fluence_pod} -c scheduler-plugins-scheduler
echo "⭐️ kubectl logs ${fluxnetes_pod} -c scheduler"
kubectl logs ${fluxnetes_pod} -c scheduler

# We now want to apply the examples
cd ${here}/examples/test_example

# Apply both example jobs
kubectl apply -f fluence-job.yaml
kubectl apply -f default-job.yaml
kubectl apply -f ./examples/job.yaml

# Get them based on associated job
fluence_job_pod=$(kubectl get pods --selector=job-name=fluence-job -o json | jq -r .items[0].metadata.name)
default_job_pod=$(kubectl get pods --selector=job-name=default-job -o json | jq -r .items[0].metadata.name)
fluxnetes_job_pod=$(kubectl get pods --selector=job-name=job -o json | jq -r .items[0].metadata.name)
fluxnetes_scheduler=$(kubectl get pods --selector=job-name=job -o json | jq -r .items[0].spec.schedulerName)

echo
echo "Fluence job pod is ${fluence_job_pod}"
echo "Default job pod is ${default_job_pod}"
echo "Fluxnetes job pod is ${fluxnetes_job_pod}"
sleep 10

# Shared function to check output
Expand All @@ -70,30 +62,17 @@ function check_output {
}

# Get output (and show)
default_output=$(kubectl logs ${default_job_pod})
default_scheduled_by=$(kubectl get pod ${default_job_pod} -o json | jq -r .spec.schedulerName)
echo
echo "Default scheduler pod output: ${default_output}"
echo " Scheduled by: ${default_scheduled_by}"
fluxnetes_output=$(kubectl logs ${fluxnetes_job_pod})

fluence_output=$(kubectl logs ${fluence_job_pod})
fluence_scheduled_by=$(kubectl get pod ${fluence_job_pod} -o json | jq -r .spec.schedulerName)
echo
echo "Fluence scheduler pod output: ${fluence_output}"
echo " Scheduled by: ${fluence_scheduled_by}"
echo "Job pod output: ${fluxnetes_output}"
echo "    Scheduled by: ${fluxnetes_scheduler}"

# Check output explicitly
check_output 'check-fluence-output' "${fluence_output}" "potato"
check_output 'check-default-output' "${default_output}" "not potato"
check_output 'check-default-scheduled-by' "${default_scheduled_by}" "default-scheduler"
check_output 'check-fluence-scheduled-by' "${fluence_scheduled_by}" "fluence"
check_output 'check-fluxnetes-output' "${fluxnetes_output}" "potato"
check_output 'check-scheduled-by' "${fluxnetes_scheduler}" "fluxnetes"

# But events tell us actually what happened, let's parse through them and find our pods
# This tells us the Event -> reason "Scheduled" and who it was reported by.
reported_by=$(kubectl events --for pod/${fluence_job_pod} -o json | jq -c '[ .items[] | select( .reason | contains("Scheduled")) ]' | jq -r .[0].reportingComponent)
check_output 'reported-by-fluence' "${reported_by}" "fluence"

# And the second should be the default scheduler, but reportingComponent is empty and we see the
# result in the source -> component
reported_by=$(kubectl events --for pod/${default_job_pod} -o json | jq -c '[ .items[] | select( .reason | contains("Scheduled")) ]' | jq -r .[0].source.component)
check_output 'reported-by-default' "${reported_by}" "default-scheduler"
reported_by=$(kubectl events --for pod/${fluxnetes_job_pod} -o json | jq -c '[ .items[] | select( .reason | contains("Scheduled")) ]' | jq -r .[0].reportingComponent)
check_output 'reported-by-fluxnetes' "${reported_by}" "fluxnetes"
228 changes: 228 additions & 0 deletions .github/workflows/e2e-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
name: fluxnetes test

on:
pull_request: []
# Test on demand (dispatch) or once a week, sunday
# We combine the builds into one job to simplify not needing to share
# containers between jobs. We also don't want to push unless the tests pass.
workflow_dispatch:
schedule:
- cron: '0 0 * * 0'

jobs:
build-fluxnetes:
permissions:
packages: write
env:
container: ghcr.io/converged-computing/fluxnetes
runs-on: ubuntu-latest
name: build fluxnetes
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: ^1.21.9

- name: Build Containers
run: |
make prepare
make build REGISTRY=ghcr.io/converged-computing SCHEDULER_IMAGE=fluxnetes
- name: Save Containers
run: docker save ${{ env.container }} | gzip > fluxnetes_latest.tar.gz

- name: Upload container artifact
uses: actions/upload-artifact@v4
with:
name: fluxnetes
path: fluxnetes_latest.tar.gz

build-sidecar:
permissions:
packages: write
env:
container: ghcr.io/converged-computing/fluxnetes-sidecar
runs-on: ubuntu-latest
name: build sidecar
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: ^1.21.9

- name: Build Containers
run: |
make prepare
make build-sidecar REGISTRY=ghcr.io/converged-computing SIDECAR_IMAGE=fluxnetes-sidecar
- name: Save Containers
run: docker save ${{ env.container }} | gzip > fluxnetes_sidecar_latest.tar.gz

- name: Upload container artifact
uses: actions/upload-artifact@v4
with:
name: fluxnetes-sidecar
path: fluxnetes_sidecar_latest.tar.gz

build-postgres:
permissions:
packages: write
env:
container: ghcr.io/converged-computing/fluxnetes-postgres
runs-on: ubuntu-latest
name: build postgres
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: ^1.21.9

- name: Build Container
run: |
make prepare
make build-postgres REGISTRY=ghcr.io/converged-computing
- name: Save Containers
run: docker save ${{ env.container }} | gzip > fluxnetes_postgres_latest.tar.gz

- name: Upload container artifact
uses: actions/upload-artifact@v4
with:
name: fluxnetes-postgres
path: fluxnetes_postgres_latest.tar.gz

test-fluxnetes:
needs: [build-fluxnetes, build-sidecar, build-postgres]
permissions:
packages: write
env:
fluxnetes_container: ghcr.io/converged-computing/fluxnetes
sidecar_container: ghcr.io/converged-computing/fluxnetes-sidecar
postgres_container: ghcr.io/converged-computing/fluxnetes-postgres

runs-on: ubuntu-latest
name: test fluxnetes
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: ^1.21

- name: Download fluxnetes artifact
uses: actions/download-artifact@v4
with:
name: fluxnetes
path: /tmp

- name: Download fluxnetes_sidecar artifact
uses: actions/download-artifact@v4
with:
name: fluxnetes-sidecar
path: /tmp

- name: Download fluxnetes_postgres artifact
uses: actions/download-artifact@v4
with:
name: fluxnetes-postgres
path: /tmp

- name: Make Space For Build
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /usr/local/lib/android
sudo rm -rf /opt/ghc
sudo apt-get remove -y firefox || true
sudo apt-get remove -y google-chrome-stable || true
sudo apt purge openjdk-* || echo "OpenJDK is not installed"
sudo apt remove --autoremove openjdk-* || echo "OpenJDK is not installed"
sudo apt purge oracle-java* || echo "Oracle Java is not installed"
sudo apt remove --autoremove adoptopenjdk-* || echo "Adopt open JDK is not installed"
sudo apt-get remove -y ant || echo "ant is not installed"
sudo rm -rf /opt/hostedtoolcache/Java_Adopt_jdk || true
sudo apt-get remove -y podman || echo "Podman is not installed"
sudo apt-get remove -y buildah || echo "Buildah is not installed"
sudo apt-get remove -y esl-erlang || echo "erlang is not installed"
sudo rm -rf /opt/google
sudo rm -rf /usr/share/az* /opt/az || true
sudo rm -rf /opt/microsoft
sudo rm -rf /opt/hostedtoolcache/Ruby
sudo apt-get remove -y swift || echo "swift is not installed"
sudo apt-get remove -y swig || echo "swig is not installed"
sudo apt-get remove -y texinfo || echo "texinfo is not installed"
sudo apt-get remove -y texlive || echo "texlive is not installed"
sudo apt-get remove -y r-base-core r-base || echo "R is not installed"
sudo rm -rf /opt/R
sudo rm -rf /usr/share/R
sudo rm -rf /opt/*.zip
sudo rm -rf /opt/*.tar.gz
sudo rm -rf /usr/share/*.zip
sudo rm -rf /usr/share/*.tar.gz
sudo rm -rf /opt/hhvm
sudo rm -rf /opt/hostedtoolcache/CodeQL
sudo rm -rf /opt/hostedtoolcache/node
sudo apt-get autoremove
- name: Load Docker images
run: |
ls /tmp/*.tar.gz
docker load --input /tmp/fluxnetes_sidecar_latest.tar.gz
rm /tmp/fluxnetes_sidecar_latest.tar.gz
docker load --input /tmp/fluxnetes_latest.tar.gz
rm /tmp/fluxnetes_latest.tar.gz
docker load --input /tmp/fluxnetes_postgres_latest.tar.gz
rm /tmp/fluxnetes_postgres_latest.tar.gz
docker image ls -a | grep fluxnetes
- name: Create Kind Cluster
uses: helm/[email protected]
with:
cluster_name: kind
kubectl_version: v1.28.2
version: v0.20.0
config: ./.github/test-kind-config.yaml

- name: Load Docker Containers into Kind
env:
fluxnetes: ${{ env.fluxnetes_container }}
sidecar: ${{ env.sidecar_container }}
postgres: ${{ env.postgres_container }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
kind load docker-image ${fluxnetes}
kind load docker-image ${sidecar}
kind load docker-image ${postgres}
- name: Install Cert Manager
run: |
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml
sleep 10
- name: Test Fluxnetes
run: /bin/bash ./.github/test.sh

- name: Tag Weekly Images
run: |
# YEAR-MONTH-DAY or #YYYY-MM-DD
tag=$(echo $(date +%Y-%m-%d))
echo "Tagging and releasing ${{ env.fluxnetes_container}}:${tag}"
docker tag ${{ env.fluxnetes_container }}:latest ${{ env.fluxnetes_container }}:${tag}
echo "Tagging and releasing ${{ env.sidecar_container}}:${tag}"
docker tag ${{ env.sidecar_container }}:latest ${{ env.sidecar_container }}:${tag}
echo "Tagging and releasing ${{ env.postgres_container}}:${tag}"
docker tag ${{ env.postgres_container }}:latest ${{ env.postgres_container }}:${tag}
# If we get here, tests pass, and we can deploy
- name: GHCR Login
if: (github.event_name != 'pull_request')
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Deploy Containers
if: (github.event_name != 'pull_request')
run: |
docker push ${{ env.fluxnetes_container }} --all-tags
docker push ${{ env.sidecar_container }} --all-tags
docker push ${{ env.postgres_container }} --all-tags
5 changes: 2 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,8 @@ RUN go get github.com/patrickmn/go-cache && \
make WHAT=cmd/kube-scheduler && \
cp /go/src/k8s.io/kubernetes/_output/local/go/bin/kube-scheduler /bin/kube-scheduler

# Commented out - was caching. We can uncomment when there is a more solid build
# https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/
# FROM busybox
# COPY --from=base /go/src/k8s.io/kubernetes/_output/local/go/bin/kube-scheduler /bin/kube-scheduler
FROM busybox
COPY --from=base /go/src/k8s.io/kubernetes/_output/local/go/bin/kube-scheduler /bin/kube-scheduler
WORKDIR /bin
CMD ["kube-scheduler"]
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,7 @@ SELECT group_name, group_size from pods_provisional;
- [ ] Restarting with postgres shouldn't have crashloopbackoff when the database isn't ready yet
- [ ] In-tree registry plugins (that are related to resources) should be run first to inform fluxion what nodes not to bind, where there are volumes, etc.
- [ ] The queue should inherit (and return) the start time (when the pod was first seen) "start" in scheduler.go
- [ ] when in basic working state, add back build and test workflows
- need to test duration / completion time works (run job with short duration, should be cancelled/cleaned up)
- [ ] need to test duration / completion time works (run job with short duration, should be cancelled/cleaned up)
- spam submission and test reservations (and cancel)
- [ ] implement other queue strategies (fcfs and backfill with > 1 reservation depth)
- fcfs can work by only adding one job (first in provisional) to the worker queue at once, only when it's empty! lol.
Expand Down

0 comments on commit 6adb6b6

Please sign in to comment.