From fe35ab995ccb6a618a3d9a9ed0e03cf61d90e02e Mon Sep 17 00:00:00 2001
From: Antonio Ojea
Date: Fri, 15 Nov 2024 04:59:13 +0000
Subject: [PATCH] KEP-4963: Kube-proxy Services Acceleration

Use the kernel flowtables infrastructure to allow kube-proxy users to
accelerate service traffic.

Change-Id: Iee638c8e86a4d17ddbdb30901b4fb4fd20e7dbda
---
 keps/prod-readiness/sig-network/4963.yaml |   3 +
 .../README.md                             | 463 ++++++++++++++++++
 .../kep.yaml                              |  39 ++
 3 files changed, 505 insertions(+)
 create mode 100644 keps/prod-readiness/sig-network/4963.yaml
 create mode 100644 keps/sig-network/4963-kube-proxy-flowtables-fastpath/README.md
 create mode 100644 keps/sig-network/4963-kube-proxy-flowtables-fastpath/kep.yaml

diff --git a/keps/prod-readiness/sig-network/4963.yaml b/keps/prod-readiness/sig-network/4963.yaml
new file mode 100644
index 00000000000..84ba5244a19
--- /dev/null
+++ b/keps/prod-readiness/sig-network/4963.yaml
@@ -0,0 +1,3 @@
+kep-number: 4963
+alpha:
+  approver: "@johnbelamaric"
diff --git a/keps/sig-network/4963-kube-proxy-flowtables-fastpath/README.md b/keps/sig-network/4963-kube-proxy-flowtables-fastpath/README.md
new file mode 100644
index 00000000000..033f5be5d43
--- /dev/null
+++ b/keps/sig-network/4963-kube-proxy-flowtables-fastpath/README.md
@@ -0,0 +1,463 @@
# KEP-4963: Kube-proxy Services Acceleration

- [Release Signoff Checklist](#release-signoff-checklist)
- [Summary](#summary)
- [Motivation](#motivation)
  - [Goals](#goals)
  - [Non-Goals](#non-goals)
- [Proposal](#proposal)
  - [User Stories (Optional)](#user-stories-optional)
    - [Story 1](#story-1)
    - [Story 2](#story-2)
  - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional)
  - [Risks and Mitigations](#risks-and-mitigations)
- [Design Details](#design-details)
  - [Test Plan](#test-plan)
    - [Prerequisite testing updates](#prerequisite-testing-updates)
    - [Unit tests](#unit-tests)
    - [Integration tests](#integration-tests)
    - [e2e tests](#e2e-tests)
  - [Graduation Criteria](#graduation-criteria)
  - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy)
  - [Version Skew Strategy](#version-skew-strategy)
- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire)
  - [Feature Enablement and Rollback](#feature-enablement-and-rollback)
  - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning)
  - [Monitoring Requirements](#monitoring-requirements)
  - [Dependencies](#dependencies)
  - [Scalability](#scalability)
  - [Troubleshooting](#troubleshooting)
- [Implementation History](#implementation-history)
- [Drawbacks](#drawbacks)
- [Alternatives](#alternatives)
- [Infrastructure Needed (Optional)](#infrastructure-needed-optional)

## Release Signoff Checklist

Items marked with (R) are required *prior to targeting to a milestone / release*.

- [x] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR)
- [x] (R) KEP approvers have approved the KEP status as `implementable`
- [x] (R) Design details are appropriately documented
- [x] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors)
  - [ ] e2e Tests for all Beta API Operations (endpoints)
  - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
  - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free
- [x] (R) Graduation criteria is in place
  - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
- [ ] (R) Production readiness review completed
- [ ] (R) Production readiness review approved
- [ ] "Implementation History" section is up-to-date for milestone
- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io]
- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes

[kubernetes.io]: https://kubernetes.io/
[kubernetes/enhancements]: https://git.k8s.io/enhancements
[kubernetes/kubernetes]: https://git.k8s.io/kubernetes
[kubernetes/website]: https://git.k8s.io/website

## Summary

This KEP proposes utilizing the flowtable infrastructure within the Linux kernel's netfilter subsystem to create an accelerated path for Kubernetes Service traffic handled by kube-proxy.

## Motivation

Service traffic acceleration can significantly reduce latency and increase throughput, improving application response times and user experience.
It will also reduce the CPU and memory overhead associated with Service traffic processing, improving efficiency and reducing cost.

### Goals

- Provide an option for kube-proxy users to enable Service traffic acceleration.

### Non-Goals

- Provide acceleration for traffic that is not handled by kube-proxy, for example, Pod-to-Pod traffic.

## Proposal

The kernel's [Netfilter flowtable infrastructure](https://docs.kernel.org/networking/nf_flowtable.html) allows defining a fastpath through the flowtable datapath. This infrastructure also provides hardware offload support.

```
                                         userspace process
                                          ^            |
                                          |            |
                                     _____|____    ____\/___
                                    /          \  /         \
                                    |   input  |  |  output |
                                    \__________/  \_________/
                                         ^             |
                                         |             |
    _________      __________      ---------     ______\/_____
   /         \    /          \     |Routing |   /             \
-->  ingress  ---> prerouting ---> |decision|   | postrouting |--> neigh_xmit
   \_________/    \__________/     ----------   \_____________/            ^
       |                                  ^                  |             |
   flowtable                              |              ____\/___         |
       |                                  |             /         \        |
    __\/___                               |             | forward |--------|
    |-----|                               |             \_________/        |
    |-----|                               |         'flow offload' rule    |
    |-----|                               |            adds entry to       |
    |_____|                               |              flowtable         |
       |                                  |                                |
      / \                                 |                                |
     /hit\_no_|                           |                                |
     \ ? /                                |                                |
      \ /                                 |                                |
       |__yes_________________fastpath bypass _____________________________|

       Fig.1 Netfilter hooks and flowtable interactions
```

Enabling the flowtable fastpath requires nftables: it only takes creating a flowtable on the corresponding network interfaces and adding one rule in the forward chain.

Example configuration:

```
table inet x {
        flowtable f {
                hook ingress priority 0; devices = { eth0, eth1 };
        }
        chain y {
                type filter hook forward priority 0; policy accept;
                ip protocol tcp flow add @f
                counter packets 0 bytes 0
        }
}
```

Kube-proxy will allow users to opt in to Service acceleration, since it is not possible to guarantee that a feature that completely bypasses the kernel network stack will not have unintended consequences in some environments or incompatibilities with other networking components.

The opt-in behavior will be based on network interface names and attributes and will use CEL expressions, similar to how we use `--pod-interface-name-prefix` in [KEP-2450](https://github.com/kubernetes/enhancements/tree/master/keps/sig-network/2450-Remove-knowledge-of-pod-cluster-CIDR-from-iptables-rules). The expressive matching language allows users to deal with more complex environments: nodes with multiple interfaces where only some of them support hardware offload, matching on an interface alias to select only a specific kind of Pod, and so on.

Some example expressions are:

- `interface.name.startsWith("eth")` selects all interfaces with the prefix `eth`
- `interface.type == "veth"` selects all interfaces of type `veth`
- `interface.alias == "offload"` selects all interfaces with the alias set to `offload`

These expressions can also be combined; see https://github.com/google/cel-go for more information.

The kernel bypass will only be applied once the connection is ESTABLISHED. The rationale is that, once a connection is accelerated, it bypasses the network stack; by waiting for the connection to be established we preserve the integration with other network applications on the node (network policies, service meshes, ...) and we avoid possible performance problems or DoS attacks caused by trying to accelerate every single connection on a node.

### User Stories (Optional)

#### Story 1

I want to use Service traffic acceleration with kube-proxy to improve the performance of my applications.

### Risks and Mitigations

Once the network traffic moves to the fastpath datapath it completely bypasses the kernel network stack, so any other network application that depends on packets traversing the network stack (monitoring, for example) will not be able to see the connection data. The feature will only apply the fastpath to established connections; since most network applications are stateful, it is usually safe to assume that once a connection is established no additional operations are required on it.

## Design Details

This feature will only work with the kube-proxy nftables mode.

Users will be able to opt in to Service traffic acceleration by passing a CEL expression, using the flag `--accelerated-interface-expression` or the configuration option `AcceleratedInterfaceExpression`, that matches the network interfaces on the node that are subject to Service traffic acceleration. The absence of a CEL expression disables the feature.

Kube-proxy will create a `flowtable` named `kube-proxy-flowtable` in the kube-proxy table and will monitor the network interfaces on the node, populating the flowtable with the interfaces that match the configured CEL expression.
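
For illustration only, on a node where the configured expression happens to match `eth0` and `eth1`, the resulting object could look roughly like the sketch below (assuming the IPv4 `ip kube-proxy` table; the flowtable name is the one proposed above, while the priority and the device names are illustrative assumptions, not the final implementation):

```
# Sketch of the flowtable kube-proxy could maintain; device names are examples.
table ip kube-proxy {
        flowtable kube-proxy-flowtable {
                hook ingress priority 0; devices = { eth0, eth1 };
        }
}
```

As interfaces matching the expression appear or disappear on the node, kube-proxy would update the `devices` list accordingly; the rule shown in the next section then steers established Service traffic into this flowtable.
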

Kube-proxy will insert a rule in the `filter-forward` chain to offload all established Service traffic:

```
	tx.Add(&knftables.Rule{
		Chain: filterForwardChain,
		Rule: knftables.Concat(
			"ct original", ipX, "daddr", "@", clusterIPsSet,
			"ct state established",
			"flow offload", "@", serviceFlowTable,
		),
	})
```

### Test Plan

[x] I/we understand the owners of the involved components may require updates to
existing tests to make this code solid enough prior to committing the changes necessary
to implement this enhancement.

##### Prerequisite testing updates

##### Unit tests

To be added

- ``: `` - ``

##### Integration tests

Not needed

##### e2e tests

- Create one Service with one backend Pod running an iperf server.
- Run an iperf client against the Service without acceleration.
- Run an iperf client against the Service with acceleration.
- The Service with acceleration has to show a statistically significant difference in the throughput results.

### Graduation Criteria

#### Alpha

- Feature implemented behind a feature flag
- Initial e2e tests completed and enabled

#### Beta

- Gather feedback from developers

#### GA

- No bugs reported
- Feedback from developers and users

**Note:** Generally we also wait at least two releases between beta and
GA/stable, because there's no opportunity for user feedback, or even bug reports,
in back-to-back releases.

**For non-optional features moving to GA, the graduation criteria must include
[conformance tests].**

[conformance tests]: https://git.k8s.io/community/contributors/devel/sig-architecture/conformance-tests.md

### Upgrade / Downgrade Strategy

Kube-proxy reconciles the nftables rules at startup, so the acceleration rules will be added or removed depending on how kube-proxy is configured.

### Version Skew Strategy

N/A

## Production Readiness Review Questionnaire

### Feature Enablement and Rollback

###### How can this feature be enabled / disabled in a live cluster?

- [x] Feature gate (also fill in values in `kep.yaml`)
  - Feature gate name: KubeProxyAcceleration
  - Components depending on the feature gate: kube-proxy
- [x] Other
  - Describe the mechanism: kube-proxy configuration option
  - Will enabling / disabling the feature require downtime of the control
    plane? No
  - Will enabling / disabling the feature require downtime or reprovisioning
    of a node? No

###### Does enabling the feature change any default behavior?

No

###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)?

Yes, disable the feature gate or the configuration option in kube-proxy and restart it.

###### What happens if we reenable the feature if it was previously rolled back?

Kube-proxy reconciles its rules at startup, so no problems are expected during rollbacks.

###### Are there any tests for feature enablement/disablement?

This is an opt-in feature in kube-proxy behind a feature gate; manual tests will be performed for validation.

### Rollout, Upgrade and Rollback Planning

###### How can a rollout or rollback fail? Can it impact already running workloads?

###### What specific metrics should inform a rollback?

###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested?

###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.?

### Monitoring Requirements

###### How can an operator determine if the feature is in use by workloads?

###### How can someone using this feature know that it is working for their instance?

- [ ] Events
  - Event Reason:
- [ ] API .status
  - Condition name:
  - Other field:
- [ ] Other (treat as last resort)
  - Details:

###### What are the reasonable SLOs (Service Level Objectives) for the enhancement?

###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service?

- [ ] Metrics
  - Metric name:
  - [Optional] Aggregation method:
  - Components exposing the metric:
- [ ] Other (treat as last resort)
  - Details:

###### Are there any missing metrics that would be useful to have to improve observability of this feature?

### Dependencies

###### Does this feature depend on any specific services running in the cluster?

No

### Scalability

###### Will enabling / using this feature result in any new API calls?

No

###### Will enabling / using this feature result in introducing new API types?

No

###### Will enabling / using this feature result in any new calls to the cloud provider?

No

###### Will enabling / using this feature result in increasing size or count of the existing API objects?

No

###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs?

No

###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components?

No; on the contrary, CPU consumption is expected to decrease.

###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)?

This feature consumes entries in the flowtable, which is a per-device resource.

### Troubleshooting

###### How does this feature react if the API server and/or etcd is unavailable?

It is not impacted: kube-proxy will not be able to add new Services to the datapath, but traffic for existing Services will be unaffected.

###### What are other known failure modes?

###### What steps should be taken if SLOs are not being met to determine the problem?

## Implementation History

## Drawbacks

## Alternatives

- eBPF was considered, but it was discarded because of its complexity, the increase in node resource consumption, and the lack of support for old kernels.
diff --git a/keps/sig-network/4963-kube-proxy-flowtables-fastpath/kep.yaml b/keps/sig-network/4963-kube-proxy-flowtables-fastpath/kep.yaml
new file mode 100644
index 00000000000..4d69e63a3f8
--- /dev/null
+++ b/keps/sig-network/4963-kube-proxy-flowtables-fastpath/kep.yaml
@@ -0,0 +1,39 @@
title: Kube-proxy Services Acceleration
kep-number: 4963
authors:
  - "@aojea"
owning-sig: sig-network
status: implementable
creation-date: 2024-11-14
reviewers:
  - "@danwinship"
  - "@thockin"
approvers:
  - "@danwinship"
  - "@thockin"

# The target maturity stage in the current dev cycle for this KEP.
stage: alpha

# The most recent milestone for which work toward delivery of this KEP has been
# done. This can be the current (upcoming) milestone, if it is being actively
# worked on.
latest-milestone: "v1.33"

# The milestone at which this feature was, or is targeted to be, at each stage.
+milestone: + alpha: "v1.33" + beta: "v1.34" + stable: "v1.35" + +# The following PRR answers are required at alpha release +# List the feature gate name and the components for which it must be enabled +feature-gates: + - name: FastPathProxy + components: + - kube-proxy +disable-supported: true + +# The following PRR answers are required at beta release +metrics: + - count_accelerated_interfaces_total