From 511c7189beb2b895cc9bc6a4c773136cf8751b56 Mon Sep 17 00:00:00 2001
From: Max Falk
Date: Wed, 5 Jul 2023 17:58:48 +0200
Subject: [PATCH] PAASTA-17941: add topology spread constraints option (#3641)

Ticket: PAASTA-17941

Problem
-----
On EKS, we cannot configure the default Kubernetes scheduler with pod
topology constraints. But to support the Karpenter migration and to be
able to tune our spread of Pods across zones and nodes, we want to be
able to configure cluster-wide topology constraints via PaaSTA.

Solution
-----
Add a `topology_spread_constraints` option to the PaaSTA system config
that allows defining rules to spread Pods across topologies per
cluster. For example, to try spreading Pods evenly across both nodes
and availability zones, we'd set:

```
topology_spread_constraints:
- max_skew: 1
  topology_key: "topology.kubernetes.io/zone"
  when_unsatisfiable: "ScheduleAnyway"
- max_skew: 1
  topology_key: "kubernetes.io/hostname"
  when_unsatisfiable: "ScheduleAnyway"
```

This can be configured once per cluster (or globally) and will be added
to every Pod Spec template (i.e. both Deployments and StatefulSets),
using `paasta.yelp.com/service` and `paasta.yelp.com/instance` as label
selectors.

Future Work
------
There is a potentially conflicting interaction between this new
configuration option and `deploy_whitelist`/`deploy_blacklist`: those
options use node affinities and could constrain a deployment to one
specific habitat/AZ, while the topology spread constraint might be
configured to spread Pods across multiple AZs. If
`when_unsatisfiable: "DoNotSchedule"` is set, this would leave Pods
unable to be scheduled. For now, we handle this by not defining any
default spread constraints and by only using `ScheduleAnyway`, but
we'll probably propose a follow-up PR to change the implementation of
the whitelist/blacklist options from node affinities (which are
somewhat expensive anyway) to topology spread constraints, giving
priority to the whitelist/blacklist options if defined.

Signed-off-by: Max Falk
---
 docs/source/yelpsoa_configs.rst  | 18 +++++++
 paasta_tools/kubernetes_tools.py | 49 +++++++++++++++++++++++++
 paasta_tools/utils.py            |  5 +++
 tests/test_kubernetes_tools.py   | 63 ++++++++++++++++++++++++++++++--
 4 files changed, 132 insertions(+), 3 deletions(-)

diff --git a/docs/source/yelpsoa_configs.rst b/docs/source/yelpsoa_configs.rst
index 74eee3151d..ed4b65af32 100644
--- a/docs/source/yelpsoa_configs.rst
+++ b/docs/source/yelpsoa_configs.rst
@@ -149,6 +149,24 @@ Kubernetes
 
 These options are only applicable to tasks scheduled on Kubernetes.
 
+ * ``topology_spread_constraints``: A set of rules to spread Pods across
+   topologies, for example to try spreading Pods evenly across both nodes and
+   availability zones::
+
+     topology_spread_constraints:
+     - max_skew: 1
+       topology_key: "topology.kubernetes.io/zone"
+       when_unsatisfiable: "ScheduleAnyway"
+     - max_skew: 1
+       topology_key: "kubernetes.io/hostname"
+       when_unsatisfiable: "ScheduleAnyway"
+
+   These can be configured per cluster (or globally) and will be added to every
+   Pod Spec template, using ``paasta.yelp.com/service`` and
+   ``paasta.yelp.com/instance`` as selectors.
+
+   To avoid conflicts with ``deploy_whitelist`` and ``deploy_blacklist``, please
+   only use ``when_unsatisfiable: "ScheduleAnyway"`` (at least until
+   PAASTA-17951 is resolved).
+
+   For more information, see the official Kubernetes documentation on
+   `topology spread constraints <https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/>`_.
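+
+   For illustration (using a hypothetical service ``example_service`` with
+   instance ``main``), the configuration above would render into each Pod spec
+   roughly as::
+
+     topologySpreadConstraints:
+     - maxSkew: 1
+       topologyKey: "topology.kubernetes.io/zone"
+       whenUnsatisfiable: "ScheduleAnyway"
+       labelSelector:
+         matchLabels:
+           paasta.yelp.com/service: "example_service"
+           paasta.yelp.com/instance: "main"
+     - maxSkew: 1
+       topologyKey: "kubernetes.io/hostname"
+       whenUnsatisfiable: "ScheduleAnyway"
+       labelSelector:
+         matchLabels:
+           paasta.yelp.com/service: "example_service"
+           paasta.yelp.com/instance: "main"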
+
  * ``node_selectors``: A map of labels a node is required to have for a task
    to be launched on said node. There are several ways to define a selector.
    The simplest is a key-value pair. For example, this selector restricts a
diff --git a/paasta_tools/kubernetes_tools.py b/paasta_tools/kubernetes_tools.py
index da80ad7b99..97607ba022 100644
--- a/paasta_tools/kubernetes_tools.py
+++ b/paasta_tools/kubernetes_tools.py
@@ -102,6 +102,7 @@
 from kubernetes.client import V1StatefulSetSpec
 from kubernetes.client import V1Subject
 from kubernetes.client import V1TCPSocketAction
+from kubernetes.client import V1TopologySpreadConstraint
 from kubernetes.client import V1Volume
 from kubernetes.client import V1VolumeMount
 from kubernetes.client import V1WeightedPodAffinityTerm
@@ -1970,6 +1971,17 @@ def get_pod_template_spec(
             affinity.pod_anti_affinity = pod_anti_affinity
             pod_spec_kwargs["affinity"] = affinity
 
+        # PAASTA-17941: Allow configuring topology spread constraints per cluster
+        pod_topology_spread_constraints = create_pod_topology_spread_constraints(
+            service=self.get_service(),
+            instance=self.get_instance(),
+            topology_spread_constraints=system_paasta_config.get_topology_spread_constraints(),
+        )
+        if pod_topology_spread_constraints:
+            constraints = pod_spec_kwargs.get("topologySpreadConstraints", [])
+            constraints += pod_topology_spread_constraints
+            pod_spec_kwargs["topologySpreadConstraints"] = constraints
+
         termination_grace_period = self.get_termination_grace_period()
         if termination_grace_period is not None:
             pod_spec_kwargs[
@@ -3560,6 +3572,43 @@ def load_custom_resource_definitions(
     return custom_resources
 
 
+def create_pod_topology_spread_constraints(
+    service: str,
+    instance: str,
+    topology_spread_constraints: List[Dict[str, Any]],
+) -> List[V1TopologySpreadConstraint]:
+    """
+    Applies cluster-level topology spread constraints to every Pod template.
+    This allows us to configure default topology spread constraints on EKS where we cannot configure the scheduler.
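+
+    A minimal sketch of input and output, with hypothetical service/instance
+    values::
+
+        create_pod_topology_spread_constraints(
+            service="example_service",
+            instance="main",
+            topology_spread_constraints=[
+                {
+                    "topology_key": "topology.kubernetes.io/zone",
+                    "max_skew": 1,
+                    "when_unsatisfiable": "ScheduleAnyway",
+                },
+            ],
+        )
+
+    returns one V1TopologySpreadConstraint whose label_selector matches
+    ``paasta.yelp.com/service=example_service`` and
+    ``paasta.yelp.com/instance=main``.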
+ """ + if not topology_spread_constraints: + return [] + + selector = V1LabelSelector( + match_labels={ + "paasta.yelp.com/service": service, + "paasta.yelp.com/instance": instance, + } + ) + + pod_topology_spread_constraints = [] + for constraint in topology_spread_constraints: + pod_topology_spread_constraints.append( + V1TopologySpreadConstraint( + label_selector=selector, + topology_key=constraint.get( + "topology_key", None + ), # ValueError will be raised if unset + max_skew=constraint.get("max_skew", 1), + when_unsatisfiable=constraint.get( + "when_unsatisfiable", "ScheduleAnyway" + ), + ) + ) + + return pod_topology_spread_constraints + + def sanitised_cr_name(service: str, instance: str) -> str: sanitised_service = sanitise_kubernetes_name(service) sanitised_instance = sanitise_kubernetes_name(instance) diff --git a/paasta_tools/utils.py b/paasta_tools/utils.py index 1cb4095eca..2e5a316fa4 100644 --- a/paasta_tools/utils.py +++ b/paasta_tools/utils.py @@ -1970,6 +1970,7 @@ class SystemPaastaConfigDict(TypedDict, total=False): pdb_max_unavailable: Union[str, int] pki_backend: str pod_defaults: Dict[str, Any] + topology_spread_constraints: List[Dict[str, Any]] previous_marathon_servers: List[MarathonConfigDict] readiness_check_prefix_template: List[str] register_k8s_pods: bool @@ -2556,6 +2557,10 @@ def get_taskproc(self) -> Dict: def get_disabled_watchers(self) -> List: return self.config_dict.get("disabled_watchers", []) + def get_topology_spread_constraints(self) -> List[Dict[str, Any]]: + """List of TopologySpreadConstraints that will be applied to all Pods in the cluster""" + return self.config_dict.get("topology_spread_constraints", []) + def get_vault_environment(self) -> Optional[str]: """Get the environment name for the vault cluster This must match the environment keys in the secret json files diff --git a/tests/test_kubernetes_tools.py b/tests/test_kubernetes_tools.py index fdcbf9c550..92fa903e12 100644 --- a/tests/test_kubernetes_tools.py +++ b/tests/test_kubernetes_tools.py @@ -56,6 +56,7 @@ from kubernetes.client import V1StatefulSetSpec from kubernetes.client import V1Subject from kubernetes.client import V1TCPSocketAction +from kubernetes.client import V1TopologySpreadConstraint from kubernetes.client import V1Volume from kubernetes.client import V1VolumeMount from kubernetes.client import V2beta2CrossVersionObjectReference @@ -1622,11 +1623,15 @@ def test_format_kubernetes_app_dict(self, _): "paasta_tools.kubernetes_tools.KubernetesDeploymentConfig.get_pod_anti_affinity", autospec=True, ) + @mock.patch( + "paasta_tools.kubernetes_tools.create_pod_topology_spread_constraints", + autospec=True, + ) @pytest.mark.parametrize( - "in_smtstk,routable_ip,node_affinity,anti_affinity,spec_affinity,termination_grace_period", + "in_smtstk,routable_ip,node_affinity,anti_affinity,spec_affinity,termination_grace_period,pod_topology", [ - (True, "true", None, None, {}, None), - (False, "false", None, None, {}, 10), + (True, "true", None, None, {}, None, []), + (False, "false", None, None, {}, 10, []), # an node affinity absent but pod anti affinity present ( False, @@ -1635,6 +1640,7 @@ def test_format_kubernetes_app_dict(self, _): "pod_anti_affinity", {"affinity": V1Affinity(pod_anti_affinity="pod_anti_affinity")}, None, + [], ), # an affinity obj is only added if there is a node affinity ( @@ -1649,11 +1655,13 @@ def test_format_kubernetes_app_dict(self, _): ) }, None, + [], ), ], ) def test_get_pod_template_spec( self, + mock_create_pod_topology_spread_constraints, 
         mock_get_pod_anti_affinity,
         mock_get_termination_grace_period,
         mock_load_service_namespace_config,
@@ -1665,6 +1673,7 @@ def test_get_pod_template_spec(
         mock_get_volumes,
         in_smtstk,
         routable_ip,
+        pod_topology,
         node_affinity,
         anti_affinity,
         spec_affinity,
         termination_grace_period,
     ):
@@ -1675,10 +1684,12 @@
         mock_service_namespace_config = mock.Mock()
         mock_service_namespace_config.is_in_smartstack.return_value = in_smtstk
         mock_get_node_affinity.return_value = node_affinity
         mock_get_pod_anti_affinity.return_value = anti_affinity
+        mock_create_pod_topology_spread_constraints.return_value = pod_topology
         mock_system_paasta_config = mock.Mock()
         mock_system_paasta_config.get_kubernetes_add_registration_labels.return_value = (
             True
         )
+        mock_system_paasta_config.get_topology_spread_constraints.return_value = []
         mock_system_paasta_config.get_pod_defaults.return_value = dict(dns_policy="foo")
         mock_get_termination_grace_period.return_value = termination_grace_period
@@ -1775,6 +1786,52 @@ def test_routable_ip(
 
         assert ret == expected
 
+    def test_create_pod_topology_spread_constraints(self):
+        configured_constraints = [
+            {
+                "topology_key": "kubernetes.io/hostname",
+                "max_skew": 1,
+                "when_unsatisfiable": "ScheduleAnyway",
+            },
+            {
+                "topology_key": "topology.kubernetes.io/zone",
+                "max_skew": 3,
+                "when_unsatisfiable": "DoNotSchedule",
+            },
+        ]
+
+        expected_constraints = [
+            V1TopologySpreadConstraint(
+                label_selector=V1LabelSelector(
+                    match_labels={
+                        "paasta.yelp.com/service": "schematizer",
+                        "paasta.yelp.com/instance": "main",
+                    }
+                ),
+                max_skew=1,
+                topology_key="kubernetes.io/hostname",
+                when_unsatisfiable="ScheduleAnyway",
+            ),
+            V1TopologySpreadConstraint(
+                label_selector=V1LabelSelector(
+                    match_labels={
+                        "paasta.yelp.com/service": "schematizer",
+                        "paasta.yelp.com/instance": "main",
+                    }
+                ),
+                max_skew=3,
+                topology_key="topology.kubernetes.io/zone",
+                when_unsatisfiable="DoNotSchedule",
+            ),
+        ]
+
+        assert (
+            kubernetes_tools.create_pod_topology_spread_constraints(
+                "schematizer", "main", configured_constraints
+            )
+            == expected_constraints
+        )
+
     @pytest.mark.parametrize(
         "raw_selectors,expected",
         [