Skip to content

Commit

Permalink
support successPolicy and failurePolicy
Browse files Browse the repository at this point in the history
Signed-off-by: qiankunli <[email protected]>

run codegen

Signed-off-by: qiankunli <[email protected]>

support watch pg and preemptable label

Signed-off-by: qiankunli <[email protected]>

fix test case

Signed-off-by: qiankunli <[email protected]>

fix test case

Signed-off-by: qiankunli <[email protected]>

fix test case

Signed-off-by: qiankunli <[email protected]>

refactor

Signed-off-by: qiankunli <[email protected]>

fix make

Signed-off-by: qiankunli <[email protected]>

fix test

Signed-off-by: qiankunli <[email protected]>

add corev1 schema

Signed-off-by: qiankunli <[email protected]>

add podgroups crd

Signed-off-by: qiankunli <[email protected]>
  • Loading branch information
qiankunli committed Apr 20, 2022
1 parent 8c43231 commit 552e42b
Show file tree
Hide file tree
Showing 11 changed files with 492 additions and 9 deletions.
2 changes: 2 additions & 0 deletions cmd/training-operator.v1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
"github.com/kubeflow/training-operator/pkg/config"
controllerv1 "github.com/kubeflow/training-operator/pkg/controller.v1"
//+kubebuilder:scaffold:imports
volcanov1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
)

var (
Expand All @@ -54,6 +55,7 @@ func init() {
utilruntime.Must(mxnetv1.AddToScheme(scheme))
utilruntime.Must(mpiv1.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme
utilruntime.Must(volcanov1beta1.AddToScheme(scheme))
}

func main() {
Expand Down
4 changes: 4 additions & 0 deletions manifests/base/crds/kubeflow.org_pytorchjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ spec:
properties:
elasticPolicy:
properties:
failurePolicy:
type: string
maxReplicas:
description: upper limit for the number of pods that can be set
by the autoscaler; cannot be smaller than MinReplicas, defaults
Expand Down Expand Up @@ -522,6 +524,8 @@ spec:
--rdzv_endpoint, --rdzv_id are auto-assigned; any explicitly
set values are ignored.
type: boolean
successPolicy:
type: string
type: object
pytorchReplicaSpecs:
additionalProperties:
Expand Down
301 changes: 301 additions & 0 deletions manifests/base/crds/scheduling.volcano.sh_podgroups.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,301 @@
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.6.0
creationTimestamp: null
name: podgroups.scheduling.volcano.sh
spec:
group: scheduling.volcano.sh
names:
kind: PodGroup
listKind: PodGroupList
plural: podgroups
shortNames:
- pg
- podgroup-v1beta1
singular: podgroup
scope: Namespaced
versions:
- name: v1beta1
additionalPrinterColumns:
- name: STATUS
type: string
jsonPath: .status.phase
- name: MINMEMBER
type: integer
jsonPath: .spec.minMember
- name: RUNNINGS
type: integer
jsonPath: .status.running
- name: AGE
type: date
jsonPath: .metadata.creationTimestamp
schema:
openAPIV3Schema:
description: PodGroup is a collection of Pod; used for batch workload.
properties:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
type: string
kind:
description: 'Kind is a string value representing the REST resource this
object represents. Servers may infer this from the endpoint the client
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
type: string
metadata:
type: object
spec:
description: 'Specification of the desired behavior of the pod group.
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status'
properties:
minMember:
description: MinMember defines the minimal number of members/tasks
to run the pod group; if there's not enough resources to start all
tasks, the scheduler will not start anyone.
format: int32
type: integer
minResources:
additionalProperties:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
description: MinResources defines the minimal resource of members/tasks
to run the pod group; if there's not enough resources to start all
tasks, the scheduler will not start anyone.
type: object
minTaskMember:
additionalProperties:
format: int32
type: integer
description: MinTaskMember defines the minimal number of pods to run
each task in the pod group; if there's not enough resources to start
each task, the scheduler will not start anyone.
type: object
priorityClassName:
description: If specified, indicates the PodGroup's priority. "system-node-critical"
and "system-cluster-critical" are two special keywords which indicate
the highest priorities with the former being the highest priority.
Any other name must be defined by creating a PriorityClass object
with that name. If not specified, the PodGroup priority will be
default or zero if there is no default.
type: string
queue:
description: Queue defines the queue to allocate resource for PodGroup;
if queue does not exist, the PodGroup will not be scheduled. Defaults
to `default` Queue with the lowest weight.
type: string
type: object
status:
description: Status represents the current information about a pod group.
This data may not be up to date.
properties:
conditions:
description: The conditions of PodGroup.
items:
description: PodGroupCondition contains details for the current
state of this pod group.
properties:
lastTransitionTime:
description: Last time the phase transitioned from another to
current phase.
format: date-time
type: string
message:
description: Human-readable message indicating details about
last transition.
type: string
reason:
description: Unique, one-word, CamelCase reason for the phase's
last transition.
type: string
status:
description: Status is the status of the condition.
type: string
transitionID:
description: The ID of condition transition.
type: string
type:
description: Type is the type of the condition
type: string
type: object
type: array
failed:
description: The number of pods which reached phase Failed.
format: int32
type: integer
phase:
description: Current phase of PodGroup.
type: string
running:
description: The number of actively running pods.
format: int32
type: integer
succeeded:
description: The number of pods which reached phase Succeeded.
format: int32
type: integer
type: object
type: object
served: true
storage: true
status:
acceptedNames:
kind: ""
plural: ""
conditions: []
storedVersions: []
---
# Source: volcano/templates/scheduling_v1beta1_queue.yaml
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.6.0
creationTimestamp: null
name: queues.scheduling.volcano.sh
spec:
group: scheduling.volcano.sh
names:
kind: Queue
listKind: QueueList
plural: queues
shortNames:
- q
- queue-v1beta1
singular: queue
scope: Cluster
versions:
- name: v1beta1
schema:
openAPIV3Schema:
description: Queue is a queue of PodGroup.
properties:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
type: string
kind:
description: 'Kind is a string value representing the REST resource this
object represents. Servers may infer this from the endpoint the client
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
type: string
metadata:
type: object
spec:
description: 'Specification of the desired behavior of the queue. More
info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status'
properties:
capability:
additionalProperties:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
description: ResourceList is a set of (resource name, quantity) pairs.
type: object
extendClusters:
description: extendClusters indicates that the jobs in this Queue will be
dispatched to these clusters.
items:
description: ClusterSpec represents the template of Cluster
properties:
capacity:
additionalProperties:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
description: ResourceList is a set of (resource name, quantity)
pairs.
type: object
name:
type: string
weight:
format: int32
type: integer
type: object
type: array
guarantee:
description: Guarantee indicates the configuration for resource reservation
properties:
resource:
additionalProperties:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
description: The amount of cluster resource reserved for queue.
Just set either `percentage` or `resource`
type: object
type: object
reclaimable:
description: Reclaimable indicates whether the queue can be reclaimed
by other queues
type: boolean
weight:
format: int32
type: integer
type: object
status:
description: The status of queue.
properties:
inqueue:
description: The number of `Inqueue` PodGroup in this queue.
format: int32
type: integer
pending:
description: The number of 'Pending' PodGroup in this queue.
format: int32
type: integer
reservation:
description: Reservation is the profile of resource reservation for
queue
properties:
nodes:
description: Nodes are Locked nodes for queue
items:
type: string
type: array
resource:
additionalProperties:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
description: Resource is a list of total idle resource in locked
nodes.
type: object
type: object
running:
description: The number of 'Running' PodGroup in this queue.
format: int32
type: integer
state:
description: State is state of queue
type: string
unknown:
description: The number of 'Unknown' PodGroup in this queue.
format: int32
type: integer
type: object
type: object
served: true
storage: true
subresources:
status: {}
status:
acceptedNames:
kind: ""
plural: ""
conditions: []
storedVersions: []
8 changes: 8 additions & 0 deletions pkg/apis/pytorch/v1/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,14 @@ func setElasticPolicy(job *PyTorchJob) {
job.Spec.ElasticPolicy.MaxReplicas = workerReplicas
job.Spec.ElasticPolicy.MinReplicas = workerReplicas
}
if job.Spec.ElasticPolicy.SuccessPolicy == nil {
policy := SuccessPolicyDefault
job.Spec.ElasticPolicy.SuccessPolicy = &policy
}
if job.Spec.ElasticPolicy.FailurePolicy == nil {
policy := FailurePolicyDefault
job.Spec.ElasticPolicy.FailurePolicy = &policy
}
}
}

Expand Down
14 changes: 13 additions & 1 deletion pkg/apis/pytorch/v1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 552e42b

Please sign in to comment.