diff --git a/cmd/bootstrap/group_replication.go b/cmd/bootstrap/group_replication.go index 5ad900e8b..a4b30ebd3 100644 --- a/cmd/bootstrap/group_replication.go +++ b/cmd/bootstrap/group_replication.go @@ -373,6 +373,19 @@ func bootstrapGroupReplication(ctx context.Context) error { log.Printf("Cluster status:\n%s", status) + for _, member := range status.DefaultReplicaSet.Topology { + if member.MemberRole == innodbcluster.MemberRolePrimary && member.MemberState != innodbcluster.MemberStateOnline { + log.Printf("Primary (%s) is not ONLINE. Starting full cluster crash recovery...", member.Address) + + if err := handleFullClusterCrash(ctx); err != nil { + return errors.Wrap(err, "handle full cluster crash") + } + + // force restart container + os.Exit(1) + } + } + member, ok := status.DefaultReplicaSet.Topology[fmt.Sprintf("%s:%d", localShell.host, 3306)] if !ok { log.Printf("Adding instance (%s) to InnoDB cluster", localShell.host) diff --git a/cmd/bootstrap/main.go b/cmd/bootstrap/main.go index b7d452d98..71f239670 100644 --- a/cmd/bootstrap/main.go +++ b/cmd/bootstrap/main.go @@ -2,6 +2,7 @@ package main import ( "context" + "io" "log" "os" "path/filepath" @@ -20,7 +21,8 @@ func main() { log.Fatalf("error opening file: %v", err) } defer f.Close() - log.SetOutput(f) + + log.SetOutput(io.MultiWriter(os.Stderr, f)) fullClusterCrash, err := fileExists(fullClusterCrashFile) if err == nil && fullClusterCrash { diff --git a/config/rbac/cluster/role.yaml b/config/rbac/cluster/role.yaml index 4cae4bb5a..21014a95e 100644 --- a/config/rbac/cluster/role.yaml +++ b/config/rbac/cluster/role.yaml @@ -17,9 +17,21 @@ rules: - apiGroups: - "" resources: - - configmaps - pods - pods/exec + verbs: + - create + - delete + - deletecollection + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - configmaps - secrets - services verbs: diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 9ffc3139d..552be60d3 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -9,8 +9,6 @@ rules: resources: - configmaps - persistentvolumeclaims - - pods - - pods/exec - secrets - services verbs: @@ -28,6 +26,20 @@ rules: verbs: - create - patch +- apiGroups: + - "" + resources: + - pods + - pods/exec + verbs: + - create + - delete + - deletecollection + - get + - list + - patch + - update + - watch - apiGroups: - "" resources: diff --git a/deploy/bundle.yaml b/deploy/bundle.yaml index 57107161c..972e4e920 100644 --- a/deploy/bundle.yaml +++ b/deploy/bundle.yaml @@ -11437,8 +11437,6 @@ rules: resources: - configmaps - persistentvolumeclaims - - pods - - pods/exec - secrets - services verbs: @@ -11456,6 +11454,20 @@ rules: verbs: - create - patch +- apiGroups: + - "" + resources: + - pods + - pods/exec + verbs: + - create + - delete + - deletecollection + - get + - list + - patch + - update + - watch - apiGroups: - "" resources: diff --git a/deploy/cw-bundle.yaml b/deploy/cw-bundle.yaml index dbf0e78bf..fcb936eea 100644 --- a/deploy/cw-bundle.yaml +++ b/deploy/cw-bundle.yaml @@ -11445,9 +11445,21 @@ rules: - apiGroups: - "" resources: - - configmaps - pods - pods/exec + verbs: + - create + - delete + - deletecollection + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - configmaps - secrets - services verbs: diff --git a/deploy/cw-rbac.yaml b/deploy/cw-rbac.yaml index 79f5b37e4..6f63d29c4 100644 --- a/deploy/cw-rbac.yaml +++ b/deploy/cw-rbac.yaml @@ -58,9 +58,21 @@ rules: - apiGroups: - "" resources: - - configmaps - pods - pods/exec + verbs: + 
- create + - delete + - deletecollection + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - configmaps - secrets - services verbs: diff --git a/deploy/rbac.yaml b/deploy/rbac.yaml index b8e9be440..7961de781 100644 --- a/deploy/rbac.yaml +++ b/deploy/rbac.yaml @@ -50,8 +50,6 @@ rules: resources: - configmaps - persistentvolumeclaims - - pods - - pods/exec - secrets - services verbs: @@ -69,6 +67,20 @@ rules: verbs: - create - patch +- apiGroups: + - "" + resources: + - pods + - pods/exec + verbs: + - create + - delete + - deletecollection + - get + - list + - patch + - update + - watch - apiGroups: - "" resources: diff --git a/e2e-tests/functions b/e2e-tests/functions index a8316683b..17c5ee59b 100755 --- a/e2e-tests/functions +++ b/e2e-tests/functions @@ -575,7 +575,10 @@ get_primary_from_haproxy() { } get_primary_from_group_replication() { - run_mysql "SELECT MEMBER_HOST FROM performance_schema.replication_group_members where MEMBER_ROLE='PRIMARY';" "-h $(get_mysql_router_service $(get_cluster_name)) -P 6446 -uroot -proot_password" | cut -d'.' -f1 + run_mysql \ + "SELECT MEMBER_HOST FROM performance_schema.replication_group_members where MEMBER_ROLE='PRIMARY';" \ + "-h $(get_mysql_router_service $(get_cluster_name)) -P 6446 -uroot -proot_password" \ + | cut -d'.' -f1 } verify_certificate_sans() { diff --git a/e2e-tests/tests/gr-self-healing/01-deploy-chaos-mesh.yaml b/e2e-tests/tests/gr-self-healing/01-deploy-chaos-mesh.yaml index 2fcde5027..e5638625e 100644 --- a/e2e-tests/tests/gr-self-healing/01-deploy-chaos-mesh.yaml +++ b/e2e-tests/tests/gr-self-healing/01-deploy-chaos-mesh.yaml @@ -1,6 +1,5 @@ apiVersion: kuttl.dev/v1beta1 kind: TestStep -timeout: 10 commands: - script: |- set -o errexit @@ -9,3 +8,4 @@ commands: source ../../functions deploy_chaos_mesh + timeout: 120 diff --git a/e2e-tests/tests/gr-self-healing/14-cluster-crash.yaml b/e2e-tests/tests/gr-self-healing/14-cluster-crash.yaml index 23a786193..ed4e78f5f 100644 --- a/e2e-tests/tests/gr-self-healing/14-cluster-crash.yaml +++ b/e2e-tests/tests/gr-self-healing/14-cluster-crash.yaml @@ -1,6 +1,5 @@ apiVersion: kuttl.dev/v1beta1 kind: TestStep -timeout: 30 commands: - script: |- set -o errexit @@ -10,3 +9,4 @@ commands: kill_pods "${NAMESPACE}" "label" "app.kubernetes.io/instance" "gr-self-healing" "cluster-crash" sleep 30 # wait for crash + timeout: 40 diff --git a/e2e-tests/tests/gr-self-healing/17-assert.yaml b/e2e-tests/tests/gr-self-healing/17-assert.yaml new file mode 100644 index 000000000..ab1e5930f --- /dev/null +++ b/e2e-tests/tests/gr-self-healing/17-assert.yaml @@ -0,0 +1,202 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 480 +--- +kind: StatefulSet +apiVersion: apps/v1 +metadata: + name: gr-self-healing-mysql +status: + observedGeneration: 1 + replicas: 3 + readyReplicas: 3 + currentReplicas: 3 + updatedReplicas: 3 + collisionCount: 0 +--- +kind: Deployment +apiVersion: apps/v1 +metadata: + name: gr-self-healing-router +status: + observedGeneration: 1 + replicas: 3 + readyReplicas: 3 + updatedReplicas: 3 +--- +apiVersion: ps.percona.com/v1alpha1 +kind: PerconaServerMySQL +metadata: + name: gr-self-healing + finalizers: + - percona.com/delete-mysql-pods-in-order +status: + mysql: + ready: 3 + size: 3 + state: ready + router: + ready: 3 + size: 3 + state: ready +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: mysql + app.kubernetes.io/instance: gr-self-healing + app.kubernetes.io/managed-by: percona-server-operator + 
app.kubernetes.io/name: percona-server + app.kubernetes.io/part-of: percona-server + name: gr-self-healing-mysql + ownerReferences: + - apiVersion: ps.percona.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: PerconaServerMySQL + name: gr-self-healing +spec: + clusterIP: None + ports: + - name: mysql + port: 3306 + protocol: TCP + targetPort: 3306 + - name: mysql-admin + port: 33062 + protocol: TCP + targetPort: 33062 + - name: mysqlx + port: 33060 + protocol: TCP + targetPort: 33060 + - name: http + port: 6450 + protocol: TCP + targetPort: 6450 + - name: mysql-gr + port: 33061 + protocol: TCP + targetPort: 33061 + selector: + app.kubernetes.io/component: mysql + app.kubernetes.io/instance: gr-self-healing + app.kubernetes.io/managed-by: percona-server-operator + app.kubernetes.io/name: percona-server + app.kubernetes.io/part-of: percona-server + sessionAffinity: None + type: ClusterIP +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: router + app.kubernetes.io/instance: gr-self-healing + app.kubernetes.io/managed-by: percona-server-operator + app.kubernetes.io/name: percona-server + app.kubernetes.io/part-of: percona-server + name: gr-self-healing-router + ownerReferences: + - apiVersion: ps.percona.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: PerconaServerMySQL + name: gr-self-healing +spec: + ports: + - name: http + port: 8443 + protocol: TCP + targetPort: 8443 + - name: rw-default + port: 3306 + protocol: TCP + targetPort: 6446 + - name: read-write + port: 6446 + protocol: TCP + targetPort: 6446 + - name: read-only + port: 6447 + protocol: TCP + targetPort: 6447 + - name: x-read-write + port: 6448 + protocol: TCP + targetPort: 6448 + - name: x-read-only + port: 6449 + protocol: TCP + targetPort: 6449 + - name: x-default + port: 33060 + protocol: TCP + targetPort: 33060 + - name: rw-admin + port: 33062 + protocol: TCP + targetPort: 33062 + selector: + app.kubernetes.io/component: router + app.kubernetes.io/instance: gr-self-healing + app.kubernetes.io/managed-by: percona-server-operator + app.kubernetes.io/name: percona-server + app.kubernetes.io/part-of: percona-server + sessionAffinity: None + type: ClusterIP +--- +apiVersion: chaos-mesh.org/v1alpha1 +kind: PodChaos +metadata: + name: chaos-kill-label-cluster-crash +spec: + action: pod-kill + mode: all +status: + experiment: + containerRecords: + - events: + - operation: Apply + type: Succeeded + injectedCount: 1 + phase: Injected + recoveredCount: 0 + selectorKey: . + - events: + - operation: Apply + type: Succeeded + injectedCount: 1 + phase: Injected + recoveredCount: 0 + selectorKey: . + - events: + - operation: Apply + type: Succeeded + injectedCount: 1 + phase: Injected + recoveredCount: 0 + selectorKey: . + - events: + - operation: Apply + type: Succeeded + injectedCount: 1 + phase: Injected + recoveredCount: 0 + selectorKey: . + - events: + - operation: Apply + type: Succeeded + injectedCount: 1 + phase: Injected + recoveredCount: 0 + selectorKey: . + - events: + - operation: Apply + type: Succeeded + injectedCount: 1 + phase: Injected + recoveredCount: 0 + selectorKey: . 
+ desiredPhase: Run diff --git a/e2e-tests/tests/gr-self-healing/17-quorum-loss.yaml b/e2e-tests/tests/gr-self-healing/17-quorum-loss.yaml new file mode 100644 index 000000000..3fa1d257e --- /dev/null +++ b/e2e-tests/tests/gr-self-healing/17-quorum-loss.yaml @@ -0,0 +1,16 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +timeout: 30 +commands: + - script: |- + set -o errexit + set -o xtrace + + source ../../functions + + primary=$(get_primary_from_group_replication) + a_replica=$(run_mysql \ + "SELECT MEMBER_HOST FROM performance_schema.replication_group_members WHERE MEMBER_ROLE='SECONDARY' LIMIT 1;" \ + "-h $(get_mysql_router_service $(get_cluster_name)) -P 6446 -uroot -proot_password" | cut -d'.' -f1) + + kubectl -n ${NAMESPACE} delete pod ${primary} ${a_replica} --force --grace-period=0 diff --git a/e2e-tests/tests/gr-self-healing/17-destroy-chaos-mesh.yaml b/e2e-tests/tests/gr-self-healing/97-destroy-chaos-mesh.yaml similarity index 100% rename from e2e-tests/tests/gr-self-healing/17-destroy-chaos-mesh.yaml rename to e2e-tests/tests/gr-self-healing/97-destroy-chaos-mesh.yaml diff --git a/pkg/controller/ps/controller.go b/pkg/controller/ps/controller.go index b6fcdf1a8..2fc65673e 100644 --- a/pkg/controller/ps/controller.go +++ b/pkg/controller/ps/controller.go @@ -72,7 +72,8 @@ type PerconaServerMySQLReconciler struct { } //+kubebuilder:rbac:groups=ps.percona.com,resources=perconaservermysqls;perconaservermysqls/status;perconaservermysqls/finalizers,verbs=get;list;watch;create;update;patch;delete -//+kubebuilder:rbac:groups="",resources=pods;pods/exec;configmaps;services;secrets,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups="",resources=configmaps;services;secrets,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups="",resources=pods;pods/exec,verbs=get;list;watch;create;update;patch;delete;deletecollection //+kubebuilder:rbac:groups="",resources=events,verbs=create;patch //+kubebuilder:rbac:groups=apps,resources=statefulsets;deployments,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=certmanager.k8s.io;cert-manager.io,resources=issuers;certificates,verbs=get;list;watch;create;update;patch;delete;deletecollection diff --git a/pkg/controller/ps/crash_recovery.go b/pkg/controller/ps/crash_recovery.go index 5bec42082..90cb07643 100644 --- a/pkg/controller/ps/crash_recovery.go +++ b/pkg/controller/ps/crash_recovery.go @@ -7,6 +7,9 @@ import ( "strings" "github.com/pkg/errors" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/labels" + "sigs.k8s.io/controller-runtime/pkg/client" logf "sigs.k8s.io/controller-runtime/pkg/log" apiv1alpha1 "github.com/percona/percona-server-mysql-operator/api/v1alpha1" @@ -83,6 +86,7 @@ func (r *PerconaServerMySQLReconciler) reconcileFullClusterCrash(ctx context.Con continue } + log.Info("Attempting to reboot cluster from complete outage") err = mysh.RebootClusterFromCompleteOutageWithExec(ctx, cr.InnoDBClusterName()) if err == nil { log.Info("Cluster was successfully rebooted") @@ -93,6 +97,23 @@ func (r *PerconaServerMySQLReconciler) reconcileFullClusterCrash(ctx context.Con } break } + + if strings.Contains(err.Error(), "The Cluster is ONLINE") { + log.Info("Tried to reboot the cluster but MySQL says the cluster is already online") + log.Info("Deleting all MySQL pods") + err := r.Client.DeleteAllOf(ctx, &corev1.Pod{}, &client.DeleteAllOfOptions{ + ListOptions: client.ListOptions{ + LabelSelector: labels.SelectorFromSet(mysql.MatchLabels(cr)), + Namespace: cr.Namespace, + 
}, + }) + if err != nil { + return errors.Wrap(err, "failed to delete MySQL pods") + } + break + } + + log.Error(err, "failed to reboot cluster from complete outage") } return nil diff --git a/pkg/innodbcluster/innodbcluster.go b/pkg/innodbcluster/innodbcluster.go index 07b8bb917..1586b0768 100644 --- a/pkg/innodbcluster/innodbcluster.go +++ b/pkg/innodbcluster/innodbcluster.go @@ -34,8 +34,16 @@ const ( MemberStateMissing MemberState = "(MISSING)" ) +type MemberRole string + +const ( + MemberRolePrimary MemberRole = "PRIMARY" + MemberRoleSecondary MemberRole = "SECONDARY" +) + type Member struct { Address string `json:"address"` + MemberRole MemberRole `json:"memberRole"` MemberState MemberState `json:"status"` InstanceErrors []string `json:"instanceErrors"` }
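
A minimal standalone sketch of the primary-health check this patch adds to bootstrapGroupReplication, using the new innodbcluster.MemberRole type. The Member struct, the role/state constants, and the PRIMARY-not-ONLINE condition mirror the diff above; the Status/ReplicaSet wrapper shapes, the package layout, and the host names in main() are assumptions made only for illustration.

package main

import "fmt"

// Mirrors the types from pkg/innodbcluster/innodbcluster.go in the diff above.
type (
	MemberState string
	MemberRole  string
)

const (
	MemberStateOnline   MemberState = "ONLINE"
	MemberRolePrimary   MemberRole  = "PRIMARY"
	MemberRoleSecondary MemberRole  = "SECONDARY"
)

// Member is the struct the patch extends with MemberRole; it is decoded from
// the MySQL Shell cluster status JSON ("memberRole" / "status" fields).
type Member struct {
	Address     string      `json:"address"`
	MemberRole  MemberRole  `json:"memberRole"`
	MemberState MemberState `json:"status"`
}

// ReplicaSet and Status are assumed wrapper shapes: the diff only shows that
// the bootstrapper ranges over status.DefaultReplicaSet.Topology.
type ReplicaSet struct {
	Topology map[string]Member `json:"topology"`
}

type Status struct {
	DefaultReplicaSet ReplicaSet `json:"defaultReplicaSet"`
}

// primaryNeedsRecovery reproduces the condition added to
// bootstrapGroupReplication: a member reported as PRIMARY that is not ONLINE
// means the cluster has to go through full crash recovery.
func primaryNeedsRecovery(status Status) (string, bool) {
	for _, m := range status.DefaultReplicaSet.Topology {
		if m.MemberRole == MemberRolePrimary && m.MemberState != MemberStateOnline {
			return m.Address, true
		}
	}
	return "", false
}

func main() {
	// Hypothetical topology; the host names are made up for illustration.
	status := Status{DefaultReplicaSet: ReplicaSet{Topology: map[string]Member{
		"cluster1-mysql-0.cluster1-mysql:3306": {Address: "cluster1-mysql-0.cluster1-mysql:3306", MemberRole: MemberRolePrimary, MemberState: "OFFLINE"},
		"cluster1-mysql-1.cluster1-mysql:3306": {Address: "cluster1-mysql-1.cluster1-mysql:3306", MemberRole: MemberRoleSecondary, MemberState: MemberStateOnline},
	}}}

	if addr, crashed := primaryNeedsRecovery(status); crashed {
		fmt.Printf("Primary (%s) is not ONLINE. Starting full cluster crash recovery...\n", addr)
	}
}

In the patch itself the positive case calls handleFullClusterCrash and then os.Exit(1), so the container restarts and the operator's reconcileFullClusterCrash can reboot the cluster from complete outage.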