Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
dchourasia committed Oct 1, 2024
2 parents 6d6206e + f500555 commit 3551ea9
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 64 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ require (
github.com/openshift/api v0.0.0-20230823114715-5fdd7511b790
github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c
github.com/project-codeflare/appwrapper v0.25.0
github.com/project-codeflare/codeflare-common v0.0.0-20240628111341-56c962a09b7e
github.com/project-codeflare/codeflare-common v0.0.0-20240927111823-758dad4e90d0
github.com/ray-project/kuberay/ray-operator v1.1.1
go.uber.org/zap v1.27.0
golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -249,8 +249,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/project-codeflare/appwrapper v0.25.1-0.20240926155059-30a8af17b8f4 h1:XYjq50WpGxagELHurCXyiirvdM9OzxTnCMcQC9gebnQ=
github.com/project-codeflare/appwrapper v0.25.1-0.20240926155059-30a8af17b8f4/go.mod h1:zDALq3/gn+eiczpD7TBZWWbAVuwcCGDFuN/77oh+CDw=
github.com/project-codeflare/codeflare-common v0.0.0-20240628111341-56c962a09b7e h1:juFd1dQyioeMxbVE6F0YD25ozm/jiqJE+MpDhu8p22k=
github.com/project-codeflare/codeflare-common v0.0.0-20240628111341-56c962a09b7e/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
github.com/project-codeflare/codeflare-common v0.0.0-20240927111823-758dad4e90d0 h1:5gfJUhF2GRZIXCUK/aUYTo79Ipo4Ngg9HO8Jgj8zThM=
github.com/project-codeflare/codeflare-common v0.0.0-20240927111823-758dad4e90d0/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
github.com/prometheus/client_golang v1.20.2 h1:5ctymQzZlyOON1666svgwn3s6IKWgfbjsejTMiXIyjg=
github.com/prometheus/client_golang v1.20.2/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
Expand Down
17 changes: 12 additions & 5 deletions test/e2e/mnist_pytorch_appwrapper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,23 +28,30 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/kueue/apis/kueue/v1beta1"
)

func TestMnistPyTorchAppWrapperCpu(t *testing.T) {
runMnistPyTorchAppWrapper(t, "cpu")
runMnistPyTorchAppWrapper(t, "cpu", 0)
}

func TestMnistPyTorchAppWrapperGpu(t *testing.T) {
runMnistPyTorchAppWrapper(t, "gpu")
runMnistPyTorchAppWrapper(t, "gpu", 1)
}

// Trains the MNIST dataset as a batch Job in an AppWrapper, and asserts successful completion of the training job.
func runMnistPyTorchAppWrapper(t *testing.T, accelerator string) {
func runMnistPyTorchAppWrapper(t *testing.T, accelerator string, numberOfGpus int) {
test := With(t)

// Create a namespace and localqueue in that namespace
// Create a namespace
namespace := test.NewTestNamespace()
localQueue := CreateKueueLocalQueue(test, namespace.Name, "e2e-cluster-queue")

// Create Kueue resources
resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
clusterQueue := createClusterQueue(test, resourceFlavor, numberOfGpus)
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)

// Test configuration
config := &corev1.ConfigMap{
Expand Down
79 changes: 62 additions & 17 deletions test/e2e/mnist_rayjob_raycluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/kueue/apis/kueue/v1beta1"
)

// Trains the MNIST dataset as a RayJob, executed by a Ray cluster
Expand All @@ -49,9 +50,15 @@ func TestMnistRayJobRayClusterGpu(t *testing.T) {
func runMnistRayJobRayCluster(t *testing.T, accelerator string, numberOfGpus int) {
test := With(t)

// Create a namespace and localqueue in that namespace
// Create a namespace
namespace := test.NewTestNamespace()
localQueue := CreateKueueLocalQueue(test, namespace.Name, "e2e-cluster-queue")

// Create Kueue resources
resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
clusterQueue := createClusterQueue(test, resourceFlavor, numberOfGpus)
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)

// Create MNIST training script
mnist := constructMNISTConfigMap(test, namespace)
Expand All @@ -61,7 +68,6 @@ func runMnistRayJobRayCluster(t *testing.T, accelerator string, numberOfGpus int

// Create RayCluster and assign it to the localqueue
rayCluster := constructRayCluster(test, namespace, mnist, numberOfGpus)
AssignToLocalQueue(rayCluster, localQueue)
rayCluster, err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Create(test.Ctx(), rayCluster, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())
test.T().Logf("Created RayCluster %s/%s successfully", rayCluster.Namespace, rayCluster.Name)
Expand All @@ -78,8 +84,8 @@ func runMnistRayJobRayCluster(t *testing.T, accelerator string, numberOfGpus int

rayDashboardURL := getRayDashboardURL(test, rayCluster.Namespace, rayCluster.Name)

test.T().Logf("Connecting to Ray cluster at: %s", rayDashboardURL.String())
rayClient := NewRayClusterClient(rayDashboardURL)
test.T().Logf("Connecting to Ray cluster at: %s", rayDashboardURL)
rayClient := GetRayClusterClient(test, rayDashboardURL, test.Config().BearerToken)

test.T().Logf("Waiting for RayJob %s/%s to complete", rayJob.Namespace, rayJob.Name)
test.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutLong).
Expand Down Expand Up @@ -111,9 +117,15 @@ func TestMnistRayJobRayClusterAppWrapperGpu(t *testing.T) {
func runMnistRayJobRayClusterAppWrapper(t *testing.T, accelerator string, numberOfGpus int) {
test := With(t)

// Create a namespace and localqueue in that namespace
// Create a namespace
namespace := test.NewTestNamespace()
localQueue := CreateKueueLocalQueue(test, namespace.Name, "e2e-cluster-queue")

// Create Kueue resources
resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
clusterQueue := createClusterQueue(test, resourceFlavor, numberOfGpus)
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)

// Create MNIST training script
mnist := constructMNISTConfigMap(test, namespace)
Expand Down Expand Up @@ -167,8 +179,8 @@ func runMnistRayJobRayClusterAppWrapper(t *testing.T, accelerator string, number

rayDashboardURL := getRayDashboardURL(test, rayCluster.Namespace, rayCluster.Name)

test.T().Logf("Connecting to Ray cluster at: %s", rayDashboardURL.String())
rayClient := NewRayClusterClient(rayDashboardURL)
test.T().Logf("Connecting to Ray cluster at: %s", rayDashboardURL)
rayClient := GetRayClusterClient(test, rayDashboardURL, test.Config().BearerToken)

test.T().Logf("Waiting for RayJob %s/%s to complete", rayJob.Namespace, rayJob.Name)
test.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutLong).
Expand Down Expand Up @@ -374,7 +386,7 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC
}
}

func getRayDashboardURL(test Test, namespace, rayClusterName string) url.URL {
func getRayDashboardURL(test Test, namespace, rayClusterName string) string {
dashboardName := "ray-dashboard-" + rayClusterName

if IsOpenShift(test) {
Expand All @@ -396,10 +408,10 @@ func getRayDashboardURL(test Test, namespace, rayClusterName string) url.URL {
return resp.StatusCode, nil
}, TestTimeoutShort).Should(Not(Equal(503)))

return url.URL{
Scheme: "https",
Host: hostname,
}
dashboardUrl, _ := url.Parse("https://" + hostname)
test.T().Logf("Ray-dashboard route : %s\n", dashboardUrl.String())

return dashboardUrl.String()
}

ingress := GetIngress(test, namespace, dashboardName)
Expand All @@ -408,8 +420,41 @@ func getRayDashboardURL(test Test, namespace, rayClusterName string) url.URL {
test.Eventually(Ingress(test, ingress.Namespace, ingress.Name), TestTimeoutShort).
Should(WithTransform(LoadBalancerIngresses, HaveLen(1)))

return url.URL{
Scheme: "http",
Host: ingress.Spec.Rules[0].Host,
hostname := ingress.Spec.Rules[0].Host
dashboardUrl, _ := url.Parse("http://" + hostname)
test.T().Logf("Ray-dashboard route : %s\n", dashboardUrl.String())

return dashboardUrl.String()
}

// Create ClusterQueue
func createClusterQueue(test Test, resourceFlavor *v1beta1.ResourceFlavor, numberOfGpus int) *v1beta1.ClusterQueue {
cqSpec := v1beta1.ClusterQueueSpec{
NamespaceSelector: &metav1.LabelSelector{},
ResourceGroups: []v1beta1.ResourceGroup{
{
CoveredResources: []corev1.ResourceName{corev1.ResourceName("cpu"), corev1.ResourceName("memory"), corev1.ResourceName("nvidia.com/gpu")},
Flavors: []v1beta1.FlavorQuotas{
{
Name: v1beta1.ResourceFlavorReference(resourceFlavor.Name),
Resources: []v1beta1.ResourceQuota{
{
Name: corev1.ResourceCPU,
NominalQuota: resource.MustParse("8"),
},
{
Name: corev1.ResourceMemory,
NominalQuota: resource.MustParse("12Gi"),
},
{
Name: corev1.ResourceName("nvidia.com/gpu"),
NominalQuota: resource.MustParse(fmt.Sprint(numberOfGpus)),
},
},
},
},
},
},
}
return CreateKueueClusterQueue(test, cqSpec)
}
27 changes: 0 additions & 27 deletions test/e2e/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -69,30 +69,3 @@ done
echo ""

sleep 5
echo Creating Kueue ResourceFlavor and ClusterQueue
cat <<EOF | kubectl apply -f -
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
name: "default-flavor"
EOF

cat <<EOF | kubectl apply -f -
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
name: "e2e-cluster-queue"
spec:
namespaceSelector: {} # match all.
resourceGroups:
- coveredResources: ["cpu","memory", "nvidia.com/gpu"]
flavors:
- name: "default-flavor"
resources:
- name: "cpu"
nominalQuota: 4
- name: "memory"
nominalQuota: "20G"
- name: "nvidia.com/gpu"
nominalQuota: "1"
EOF
12 changes: 0 additions & 12 deletions test/e2e/support.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,6 @@ import (

"github.com/onsi/gomega"
"github.com/project-codeflare/codeflare-common/support"

"sigs.k8s.io/controller-runtime/pkg/client"
kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"
)

//go:embed *.py *.txt *.sh
Expand All @@ -35,12 +32,3 @@ func ReadFile(t support.Test, fileName string) []byte {
t.Expect(err).NotTo(gomega.HaveOccurred())
return file
}

func AssignToLocalQueue(object client.Object, localqueue *kueuev1beta1.LocalQueue) {
labels := object.GetLabels()
if labels == nil {
labels = make(map[string]string)
}
labels["kueue.x-k8s.io/queue-name"] = localqueue.Name
object.SetLabels(labels)
}

0 comments on commit 3551ea9

Please sign in to comment.