Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Support leader election mechanism for Spark operator HA #511

Closed
wants to merge 2 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 67 additions & 27 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,27 @@ limitations under the License.
package main

import (
"context"
"flag"
"fmt"
"os"
"os/signal"
"syscall"
"time"

"github.com/golang/glog"

apiv1 "k8s.io/api/core/v1"
apiextensionsclient "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/clock"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/informers"
clientset "k8s.io/client-go/kubernetes"
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/client-go/tools/leaderelection"
"k8s.io/client-go/tools/leaderelection/resourcelock"
"k8s.io/client-go/tools/record"

crclientset "github.com/GoogleCloudPlatform/spark-on-k8s-operator/pkg/client/clientset/versioned"
crinformers "github.com/GoogleCloudPlatform/spark-on-k8s-operator/pkg/client/informers/externalversions"
Expand Down Expand Up @@ -67,6 +71,13 @@ var (
metricsEndpoint = flag.String("metrics-endpoint", "/metrics", "Metrics endpoint.")
metricsPrefix = flag.String("metrics-prefix", "", "Prefix for the metrics.")
ingressUrlFormat = flag.String("ingress-url-format", "", "Ingress URL format.")
lockNamespace = flag.String("lock-namespace", apiv1.NamespaceDefault, "spark operator configMap lock namespace.")
configMapLockName = flag.String("configmap-lock-name", "spark-operator-configmap-lock", "name of spark operator configMap lock.")

leaseDuration = 15 * time.Second
renewDuration = 5 * time.Second
retryPeriod = 3 * time.Second
waitDuration = 5 * time.Second
)

func main() {
Expand Down Expand Up @@ -99,8 +110,6 @@ func main() {

glog.Info("Starting the Spark Operator")

stopCh := make(chan struct{})

crClient, err := crclientset.NewForConfig(config)
if err != nil {
glog.Fatal(err)
Expand Down Expand Up @@ -129,15 +138,38 @@ func main() {
scheduledApplicationController := scheduledsparkapplication.NewController(
crClient, kubeClient, apiExtensionsClient, crInformerFactory, clock.RealClock{})

controllerCtx, cancel := context.WithCancel(context.Background())
defer cancel()
// Start the informer factory that in turn starts the informer.
go crInformerFactory.Start(stopCh)
go podInformerFactory.Start(stopCh)
go crInformerFactory.Start(controllerCtx.Done())
go podInformerFactory.Start(controllerCtx.Done())

if err = applicationController.Start(*controllerThreads, stopCh); err != nil {
glog.Fatal(err)
onStarted := func(ctx context.Context) {
if err = applicationController.Start(*controllerThreads, ctx.Done()); err != nil {
glog.Fatal(err)
}
if err = scheduledApplicationController.Start(*controllerThreads, ctx.Done()); err != nil {
glog.Fatal(err)
}
}
if err = scheduledApplicationController.Start(*controllerThreads, stopCh); err != nil {
glog.Fatal(err)
onStopped := func() {
applicationController.Stop()
scheduledApplicationController.Stop()
}
hostName, err := os.Hostname()
if err != nil {
glog.Fatalf("failed to get hostname: %v", err)
}
rl := resourcelock.ConfigMapLock{
ConfigMapMeta: metav1.ObjectMeta{
Namespace: *lockNamespace,
Name: *configMapLockName,
},
Client: kubeClient.CoreV1(),
LockConfig: resourcelock.ResourceLockConfig{
Identity: hostName,
EventRecorder: &record.FakeRecorder{},
},
}

var hook *webhook.WebHook
Expand All @@ -147,25 +179,33 @@ func main() {
if err != nil {
glog.Fatal(err)
}

if err = hook.Start(*webhookConfigName); err != nil {
glog.Fatal(err)
}
go runWebHook(hook, controllerCtx)
}
// leader election for multiple operators
wait.Forever(func() {
leaderelection.RunOrDie(controllerCtx, leaderelection.LeaderElectionConfig{
Lock: &rl,
LeaseDuration: leaseDuration,
RenewDeadline: renewDuration,
RetryPeriod: retryPeriod,
Callbacks: leaderelection.LeaderCallbacks{
OnStartedLeading: onStarted,
OnStoppedLeading: onStopped,
},
})
}, waitDuration)
}

signalCh := make(chan os.Signal, 1)
signal.Notify(signalCh, syscall.SIGINT, syscall.SIGTERM)
<-signalCh

close(stopCh)

glog.Info("Shutting down the Spark Operator")
applicationController.Stop()
scheduledApplicationController.Stop()
if *enableWebhook {
if err := hook.Stop(*webhookConfigName); err != nil {
glog.Fatal(err)
}
// runWebHook starts the admission webhook server and blocks until ctx is
// cancelled, then shuts the webhook down. It is intended to be launched in
// its own goroutine (see the call in main). Start/Stop failures are fatal:
// the operator must not keep running with a half-registered webhook
// configuration.
//
// NOTE(review): by Go convention ctx would be the first parameter; the
// signature is kept as-is so the existing call site remains valid.
func runWebHook(hook *webhook.WebHook, ctx context.Context) {
	if err := hook.Start(*webhookConfigName); err != nil {
		glog.Fatal(err)
	}
	// Wait for operator shutdown. The original code wrapped ctx in an extra
	// context.WithCancel whose cancel was deferred immediately; that derived
	// context's Done closes exactly when the parent's does, so waiting on
	// the parent directly is equivalent and removes the redundant wrapper.
	<-ctx.Done()
	glog.Info("Shutting down webhook.")
	if err := hook.Stop(*webhookConfigName); err != nil {
		glog.Fatal(err)
	}
}

Expand Down