diff --git a/examples/failure-policy/failjobset-action.yaml b/examples/failure-policy/failjobset-action.yaml
new file mode 100644
index 000000000..b25ed0f6c
--- /dev/null
+++ b/examples/failure-policy/failjobset-action.yaml
@@ -0,0 +1,61 @@
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: failjobset-action-example
+spec:
+  failurePolicy:
+    maxRestarts: 3
+    rules:
+      # The JobSet will fail immediately when the leader job fails.
+      - action: FailJobSet
+        targetReplicatedJobs:
+          - leader
+  replicatedJobs:
+    - name: leader
+      replicas: 1
+      template:
+        spec:
+          # Set backoff limit to 0 so job will immediately fail if any pod fails.
+          backoffLimit: 0
+          completions: 2
+          parallelism: 2
+          template:
+            spec:
+              containers:
+                - name: leader
+                  image: bash:latest
+                  command:
+                    - bash
+                    - -xc
+                    - |
+                      echo "JOB_COMPLETION_INDEX=$JOB_COMPLETION_INDEX"
+                      if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
+                        for i in $(seq 10 -1 1)
+                        do
+                          echo "Sleeping in $i"
+                          sleep 1
+                        done
+                        exit 1
+                      fi
+                      for i in $(seq 1 1000)
+                      do
+                        echo "$i"
+                        sleep 1
+                      done
+    - name: workers
+      replicas: 1
+      template:
+        spec:
+          backoffLimit: 0
+          completions: 2
+          parallelism: 2
+          template:
+            spec:
+              containers:
+                - name: worker
+                  image: bash:latest
+                  command:
+                    - bash
+                    - -xc
+                    - |
+                      sleep 1000
diff --git a/examples/failure-policy/host-maintenance-event-model.yaml b/examples/failure-policy/host-maintenance-event-model.yaml
new file mode 100644
index 000000000..aaf60eedd
--- /dev/null
+++ b/examples/failure-policy/host-maintenance-event-model.yaml
@@ -0,0 +1,74 @@
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: host-maintenance-event-model
+spec:
+  failurePolicy:
+    maxRestarts: 0
+    rules:
+      # The JobSet will restart an unlimited number of times when failure matches the pod failure policy.
+      - action: RestartJobSetAndIgnoreMaxRestarts
+        onJobFailureReasons:
+          - PodFailurePolicy
+      # The JobSet is restarted as normal when the leader job fails and the above rule is not matched.
+      - action: RestartJobSet
+        targetReplicatedJobs:
+          - leader
+  replicatedJobs:
+    - name: leader
+      replicas: 1
+      template:
+        spec:
+          # Set backoff limit to 0 so job will immediately fail if any pod fails.
+          backoffLimit: 0
+          completions: 2
+          parallelism: 2
+          template:
+            spec:
+              restartPolicy: Never
+              containers:
+                - name: leader
+                  image: bash:latest
+                  command:
+                    - bash
+                    - -xc
+                    - |
+                      echo "JOB_COMPLETION_INDEX=$JOB_COMPLETION_INDEX"
+                      if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
+                        for i in $(seq 120 -1 1)
+                        do
+                          echo "Sleeping in $i"
+                          sleep 1
+                        done
+                        exit 1
+                      fi
+                      for i in $(seq 1 1000)
+                      do
+                        echo "$i"
+                        sleep 1
+                      done
+          # This failure policy is triggered when a node undergoes host maintenance.
+          # In such a case, the pods are evicted and the job will fail with a condition
+          # of type DisruptionTarget.
+          podFailurePolicy:
+            rules:
+              - action: FailJob
+                onPodConditions:
+                  - type: DisruptionTarget
+    - name: workers
+      replicas: 1
+      template:
+        spec:
+          backoffLimit: 0
+          completions: 2
+          parallelism: 2
+          template:
+            spec:
+              containers:
+                - name: worker
+                  image: bash:latest
+                  command:
+                    - bash
+                    - -xc
+                    - |
+                      sleep 1000
diff --git a/examples/failure-policy/onjobfailurereasons-present-podfailurepolicy.yaml b/examples/failure-policy/onjobfailurereasons-present-podfailurepolicy.yaml
new file mode 100644
index 000000000..41622890c
--- /dev/null
+++ b/examples/failure-policy/onjobfailurereasons-present-podfailurepolicy.yaml
@@ -0,0 +1,74 @@
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: onjobfailurereasons-podfailurepolicy-example
+spec:
+  failurePolicy:
+    maxRestarts: 3
+    rules:
+      # The JobSet will restart an unlimited number of times
+      # when the leader job fails with a failure reason matching
+      # the pod failure policy.
+      - action: RestartJobSetAndIgnoreMaxRestarts
+        targetReplicatedJobs:
+          - leader
+        onJobFailureReasons:
+          - PodFailurePolicy
+  replicatedJobs:
+    - name: leader
+      replicas: 1
+      template:
+        spec:
+          # Set backoff limit to 0 so job will immediately fail if any pod fails.
+          backoffLimit: 0
+          completions: 2
+          parallelism: 2
+          template:
+            spec:
+              restartPolicy: Never
+              containers:
+                - name: leader
+                  image: bash:latest
+                  command:
+                    - bash
+                    - -xc
+                    - |
+                      echo "JOB_COMPLETION_INDEX=$JOB_COMPLETION_INDEX"
+                      if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
+                        for i in $(seq 10 -1 1)
+                        do
+                          echo "Sleeping in $i"
+                          sleep 1
+                        done
+                        exit 1
+                      fi
+                      for i in $(seq 1 1000)
+                      do
+                        echo "$i"
+                        sleep 1
+                      done
+          podFailurePolicy:
+            rules:
+              - action: FailJob
+                onPodConditions: []
+                onExitCodes:
+                  containerName: leader
+                  operator: In
+                  values: [1]
+    - name: workers
+      replicas: 1
+      template:
+        spec:
+          backoffLimit: 0
+          completions: 2
+          parallelism: 2
+          template:
+            spec:
+              containers:
+                - name: worker
+                  image: bash:latest
+                  command:
+                    - bash
+                    - -xc
+                    - |
+                      sleep 1000
diff --git a/examples/failure-policy/onjobfailurereasons-present.yaml b/examples/failure-policy/onjobfailurereasons-present.yaml
new file mode 100644
index 000000000..2b7176eb4
--- /dev/null
+++ b/examples/failure-policy/onjobfailurereasons-present.yaml
@@ -0,0 +1,64 @@
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: onjobfailurereasons-present-example
+spec:
+  failurePolicy:
+    maxRestarts: 3
+    rules:
+      # The JobSet will restart an unlimited number of times when the
+      # leader job fails with the failure reason BackoffLimitExceeded.
+      - action: RestartJobSetAndIgnoreMaxRestarts
+        targetReplicatedJobs:
+          - leader
+        onJobFailureReasons:
+          - BackoffLimitExceeded
+  replicatedJobs:
+    - name: leader
+      replicas: 1
+      template:
+        spec:
+          # Set backoff limit to 0 so job will immediately fail if any pod fails.
+          backoffLimit: 0
+          completions: 2
+          parallelism: 2
+          template:
+            spec:
+              containers:
+                - name: leader
+                  image: bash:latest
+                  command:
+                    - bash
+                    - -xc
+                    - |
+                      echo "JOB_COMPLETION_INDEX=$JOB_COMPLETION_INDEX"
+                      if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
+                        for i in $(seq 10 -1 1)
+                        do
+                          echo "Sleeping in $i"
+                          sleep 1
+                        done
+                        exit 1
+                      fi
+                      for i in $(seq 1 1000)
+                      do
+                        echo "$i"
+                        sleep 1
+                      done
+    - name: workers
+      replicas: 1
+      template:
+        spec:
+          backoffLimit: 0
+          completions: 2
+          parallelism: 2
+          template:
+            spec:
+              containers:
+                - name: worker
+                  image: bash:latest
+                  command:
+                    - bash
+                    - -xc
+                    - |
+                      sleep 1000
diff --git a/examples/failure-policy/restartjobset-action.yaml b/examples/failure-policy/restartjobset-action.yaml
new file mode 100644
index 000000000..d4b9712f8
--- /dev/null
+++ b/examples/failure-policy/restartjobset-action.yaml
@@ -0,0 +1,61 @@
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: restartjobset-action-example
+spec:
+  failurePolicy:
+    maxRestarts: 3
+    rules:
+      # The JobSet will restart when the leader job fails.
+      - action: RestartJobSet
+        targetReplicatedJobs:
+          - leader
+  replicatedJobs:
+    - name: leader
+      replicas: 1
+      template:
+        spec:
+          # Set backoff limit to 0 so job will immediately fail if any pod fails.
+          backoffLimit: 0
+          completions: 2
+          parallelism: 2
+          template:
+            spec:
+              containers:
+                - name: leader
+                  image: bash:latest
+                  command:
+                    - bash
+                    - -xc
+                    - |
+                      echo "JOB_COMPLETION_INDEX=$JOB_COMPLETION_INDEX"
+                      if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
+                        for i in $(seq 10 -1 1)
+                        do
+                          echo "Sleeping in $i"
+                          sleep 1
+                        done
+                        exit 1
+                      fi
+                      for i in $(seq 1 1000)
+                      do
+                        echo "$i"
+                        sleep 1
+                      done
+    - name: workers
+      replicas: 1
+      template:
+        spec:
+          backoffLimit: 0
+          completions: 2
+          parallelism: 2
+          template:
+            spec:
+              containers:
+                - name: worker
+                  image: bash:latest
+                  command:
+                    - bash
+                    - -xc
+                    - |
+                      sleep 1000
diff --git a/examples/failure-policy/restartjobsetandignoremaxrestarts-action.yaml b/examples/failure-policy/restartjobsetandignoremaxrestarts-action.yaml
new file mode 100644
index 000000000..67553d6a0
--- /dev/null
+++ b/examples/failure-policy/restartjobsetandignoremaxrestarts-action.yaml
@@ -0,0 +1,63 @@
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  # rjimr stands for "restartjobsetandignoremaxrestarts"
+  name: rjimr-action-example
+spec:
+  failurePolicy:
+    maxRestarts: 3
+    rules:
+      # The JobSet will restart an unlimited number of times
+      # when the leader job fails.
+      - action: RestartJobSetAndIgnoreMaxRestarts
+        targetReplicatedJobs:
+          - leader
+  replicatedJobs:
+    - name: leader
+      replicas: 1
+      template:
+        spec:
+          # Set backoff limit to 0 so job will immediately fail if any pod fails.
+          backoffLimit: 0
+          completions: 2
+          parallelism: 2
+          template:
+            spec:
+              containers:
+                - name: leader
+                  image: bash:latest
+                  command:
+                    - bash
+                    - -xc
+                    - |
+                      echo "JOB_COMPLETION_INDEX=$JOB_COMPLETION_INDEX"
+                      if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
+                        for i in $(seq 10 -1 1)
+                        do
+                          echo "Sleeping in $i"
+                          sleep 1
+                        done
+                        exit 1
+                      fi
+                      for i in $(seq 1 1000)
+                      do
+                        echo "$i"
+                        sleep 1
+                      done
+    - name: workers
+      replicas: 1
+      template:
+        spec:
+          backoffLimit: 0
+          completions: 2
+          parallelism: 2
+          template:
+            spec:
+              containers:
+                - name: worker
+                  image: bash:latest
+                  command:
+                    - bash
+                    - -xc
+                    - |
+                      sleep 1000