From eb2a9b993ee74c3c936b8dade36b0a654e362478 Mon Sep 17 00:00:00 2001 From: Tomas Hruby Date: Wed, 31 May 2023 15:08:51 -0700 Subject: [PATCH 1/4] [BPF] tc_state_fill_from_iphdr fills ctx->ipheader_len Then the length is stored in state and carried over to subsequent programs without the need of reading it from the packet itself again. --- felix/bpf-gpl/parsing.h | 8 ++------ felix/bpf-gpl/skb.h | 1 - felix/bpf-gpl/tc.c | 9 ++------- felix/bpf-gpl/types.h | 10 ++++++++-- felix/bpf/state/map.go | 3 ++- 5 files changed, 14 insertions(+), 17 deletions(-) diff --git a/felix/bpf-gpl/parsing.h b/felix/bpf-gpl/parsing.h index 61c64ef3d8c..e5f9ce173fd 100644 --- a/felix/bpf-gpl/parsing.h +++ b/felix/bpf-gpl/parsing.h @@ -107,13 +107,7 @@ static CALI_BPF_INLINE int parse_packet_ip(struct cali_tc_ctx *ctx) { CALI_DEBUG("Drop malformed IP packets\n"); deny_reason(ctx, CALI_REASON_IP_MALFORMED); goto deny; - } else if (ip_hdr(ctx)->ihl > 5) { - /* Drop packets with IP options from/to WEP. - * Also drop packets with IP options if the dest IP is not host IP - */ - ctx->ipheader_len = 4 * ip_hdr(ctx)->ihl; } - CALI_DEBUG("IP ihl=%d bytes\n", ctx->ipheader_len); return PARSING_OK; @@ -135,6 +129,8 @@ static CALI_BPF_INLINE void tc_state_fill_from_iphdr(struct cali_tc_ctx *ctx) ctx->state->pre_nat_ip_dst = ip_hdr(ctx)->daddr; ctx->state->ip_proto = ip_hdr(ctx)->protocol; ctx->state->ip_size = ip_hdr(ctx)->tot_len; + ctx->ipheader_len = ctx->state->ihl = ip_hdr(ctx)->ihl * 4; + CALI_DEBUG("IP ihl=%d bytes\n", ctx->ipheader_len); } static CALI_BPF_INLINE void tc_state_fill_from_ipv6hdr(struct cali_tc_ctx *ctx) diff --git a/felix/bpf-gpl/skb.h b/felix/bpf-gpl/skb.h index 6b0260e6219..758483eb078 100644 --- a/felix/bpf-gpl/skb.h +++ b/felix/bpf-gpl/skb.h @@ -120,7 +120,6 @@ static CALI_BPF_INLINE int skb_refresh_validate_ptrs(struct cali_tc_ctx *ctx, lo } // Success, refresh the ip_header/nh fields in the context. 
ctx->ip_header = ctx->data_start + skb_iphdr_offset(ctx); - ctx->ipheader_len = 4 * ip_hdr(ctx)->ihl; return 0; } diff --git a/felix/bpf-gpl/tc.c b/felix/bpf-gpl/tc.c index d5f5c377441..1d616bbb994 100644 --- a/felix/bpf-gpl/tc.c +++ b/felix/bpf-gpl/tc.c @@ -1081,7 +1081,6 @@ int calico_tc_skb_accepted_entrypoint(struct __sk_buff *skb) .reason = CALI_REASON_UNKNOWN, .mark = CALI_SKB_MARK_SEEN, }, - .ipheader_len = IP_SIZE, ); struct cali_tc_ctx *ctx = &_ctx; @@ -1442,14 +1441,13 @@ int calico_tc_skb_icmp_inner_nat(struct __sk_buff *skb) .res = TC_ACT_UNSPEC, .reason = CALI_REASON_UNKNOWN, }, - .ipheader_len = IP_SIZE, ); struct cali_tc_ctx *ctx = &_ctx; struct cali_tc_state *state = ctx->state; bool ct_related = ct_result_is_related(state->ct_result.rc); int ct_rc = ct_result_rc(state->ct_result.rc); - + CALI_DEBUG("Entering calico_tc_skb_icmp_inner_nat\n"); if (!ct_related) { @@ -1480,7 +1478,7 @@ int calico_tc_skb_icmp_inner_nat(struct __sk_buff *skb) } ctx->ip_header = (struct iphdr*)pkt; - ctx->ipheader_len = ip_hdr(ctx)->ihl * 4; + ctx->ipheader_len = ctx->state->ihl = ip_hdr(ctx)->ihl * 4; if (ctx->ipheader_len > 60) { CALI_DEBUG("this cannot be!\n"); goto deny; @@ -1553,7 +1551,6 @@ int calico_tc_skb_send_icmp_replies(struct __sk_buff *skb) .res = TC_ACT_UNSPEC, .reason = CALI_REASON_UNKNOWN, }, - .ipheader_len = IP_SIZE, ); struct cali_tc_ctx *ctx = &_ctx; @@ -1602,7 +1599,6 @@ int calico_tc_host_ct_conflict(struct __sk_buff *skb) .res = TC_ACT_UNSPEC, .reason = CALI_REASON_UNKNOWN, }, - .ipheader_len = IP_SIZE, ); struct cali_tc_ctx *ctx = &_ctx; @@ -1662,7 +1658,6 @@ int calico_tc_skb_drop(struct __sk_buff *skb) { DECLARE_TC_CTX(_ctx, .skb = skb, - .ipheader_len = IP_SIZE, ); struct cali_tc_ctx *ctx = &_ctx; diff --git a/felix/bpf-gpl/types.h b/felix/bpf-gpl/types.h index 794d284bfac..be0bdaecdc3 100644 --- a/felix/bpf-gpl/types.h +++ b/felix/bpf-gpl/types.h @@ -68,7 +68,8 @@ struct cali_tc_state { __be32 tun_ip1; __be32 tun_ip2; __be32 tun_ip3; - __u32 
unused; + __u16 ihl; + __u16 unused; /* Return code from the policy program CALI_POL_DENY/ALLOW etc. */ __s32 pol_rc; /* Source port of the packet; updated on the CALI_CT_ESTABLISHED_SNAT path or when doing encap. @@ -186,13 +187,18 @@ struct cali_tc_ctx { bpf_exit(TC_ACT_SHOT); \ } \ struct pkt_scratch *scratch = (void *)(gl->__scratch); \ - (struct cali_tc_ctx) { \ + struct cali_tc_ctx x = { \ .state = state, \ .counters = counters, \ .globals = gl, \ .scratch = scratch, \ __VA_ARGS__ \ }; \ + if (x.ipheader_len == 0) { \ + x.ipheader_len = state->ihl; \ + } \ + \ + x; \ }) \ static CALI_BPF_INLINE struct iphdr* ip_hdr(struct cali_tc_ctx *ctx) diff --git a/felix/bpf/state/map.go b/felix/bpf/state/map.go index 86741948345..c42ddc6f6b5 100644 --- a/felix/bpf/state/map.go +++ b/felix/bpf/state/map.go @@ -94,7 +94,8 @@ type State struct { TunIP1 uint32 TunIP2 uint32 TunIP3 uint32 - _ uint32 + ihl uint16 + _ uint16 PolicyRC PolicyResult SrcPort uint16 DstPort uint16 From c5851661edd0e80b0a37c25c5f6e96cf99eb2bc1 Mon Sep 17 00:00:00 2001 From: Tomas Hruby Date: Tue, 30 May 2023 15:14:40 -0700 Subject: [PATCH 2/4] [BPF] remove legacy ipv6 --- felix/bpf-gpl/Makefile | 2 +- felix/bpf-gpl/ipv6.h | 24 ------- felix/bpf-gpl/parsing.h | 22 ------- felix/bpf-gpl/tc.c | 2 - felix/bpf-gpl/tc6.c | 134 ---------------------------------------- felix/bpf-gpl/types.h | 10 --- 6 files changed, 1 insertion(+), 193 deletions(-) delete mode 100644 felix/bpf-gpl/ipv6.h delete mode 100644 felix/bpf-gpl/tc6.c diff --git a/felix/bpf-gpl/Makefile b/felix/bpf-gpl/Makefile index ac89c741fa4..feb0a7e3f33 100644 --- a/felix/bpf-gpl/Makefile +++ b/felix/bpf-gpl/Makefile @@ -53,7 +53,7 @@ OBJS:=$(shell ./list-objs) OBJS+=bin/tc_preamble.o OBJS+=bin/xdp_preamble.o OBJS+=bin/policy_default.o -C_FILES:=tc_preamble.c tc.c tc6.c connect_balancer.c connect_balancer_v6.c xdp_preamble.c xdp.c policy_default.c +C_FILES:=tc_preamble.c tc.c connect_balancer.c connect_balancer_v6.c xdp_preamble.c xdp.c 
policy_default.c all: $(OBJS) ut-objs: $(UT_OBJS) diff --git a/felix/bpf-gpl/ipv6.h b/felix/bpf-gpl/ipv6.h deleted file mode 100644 index 98b157e9092..00000000000 --- a/felix/bpf-gpl/ipv6.h +++ /dev/null @@ -1,24 +0,0 @@ -// Project Calico BPF dataplane programs. -// Copyright (c) 2022 Tigera, Inc. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later - -#ifndef __CALI_BPF_IPV6_H__ -#define __CALI_BPF_IPV6_H__ - -// We can only pass in 3 parameters to a helper function because of bpf -// architecture, so we need to split printing ipv6 address into 2 parts. -#define CALI_LOG_IPV6(ipv6) \ - CALI_DEBUG("src: %x%x", \ - bpf_ntohl((ipv6)->saddr.in6_u.u6_addr32[0]), \ - bpf_ntohl((ipv6)->saddr.in6_u.u6_addr32[1])); \ - CALI_DEBUG("%x%x\n", \ - bpf_ntohl((ipv6)->saddr.in6_u.u6_addr32[2]), \ - bpf_ntohl((ipv6)->saddr.in6_u.u6_addr32[3])); \ - CALI_DEBUG("dst: %x%x", \ - bpf_ntohl((ipv6)->daddr.in6_u.u6_addr32[0]), \ - bpf_ntohl((ipv6)->daddr.in6_u.u6_addr32[1])); \ - CALI_DEBUG("%x%x\n", \ - bpf_ntohl((ipv6)->daddr.in6_u.u6_addr32[2]), \ - bpf_ntohl((ipv6)->daddr.in6_u.u6_addr32[3])) \ - -#endif /* __CALI_BPF_IPV6_H__ */ diff --git a/felix/bpf-gpl/parsing.h b/felix/bpf-gpl/parsing.h index e5f9ce173fd..948fa0251d5 100644 --- a/felix/bpf-gpl/parsing.h +++ b/felix/bpf-gpl/parsing.h @@ -133,28 +133,6 @@ static CALI_BPF_INLINE void tc_state_fill_from_iphdr(struct cali_tc_ctx *ctx) CALI_DEBUG("IP ihl=%d bytes\n", ctx->ipheader_len); } -static CALI_BPF_INLINE void tc_state_fill_from_ipv6hdr(struct cali_tc_ctx *ctx) -{ - // Fill in source ip - ctx->state->ip_src = ipv6_hdr(ctx)->saddr.in6_u.u6_addr32[0]; - ctx->state->ip_src1 = ipv6_hdr(ctx)->saddr.in6_u.u6_addr32[1]; - ctx->state->ip_src2 = ipv6_hdr(ctx)->saddr.in6_u.u6_addr32[2]; - ctx->state->ip_src3 = ipv6_hdr(ctx)->saddr.in6_u.u6_addr32[3]; - // Fill in dst ip - ctx->state->ip_dst = ipv6_hdr(ctx)->daddr.in6_u.u6_addr32[0]; - ctx->state->ip_dst1 = ipv6_hdr(ctx)->daddr.in6_u.u6_addr32[1]; - 
ctx->state->ip_dst2 = ipv6_hdr(ctx)->daddr.in6_u.u6_addr32[2]; - ctx->state->ip_dst3 = ipv6_hdr(ctx)->daddr.in6_u.u6_addr32[3]; - // Fill in pre nat ip - ctx->state->pre_nat_ip_dst = ipv6_hdr(ctx)->daddr.in6_u.u6_addr32[0]; - ctx->state->pre_nat_ip_dst1 = ipv6_hdr(ctx)->daddr.in6_u.u6_addr32[1]; - ctx->state->pre_nat_ip_dst2 = ipv6_hdr(ctx)->daddr.in6_u.u6_addr32[2]; - ctx->state->pre_nat_ip_dst3 = ipv6_hdr(ctx)->daddr.in6_u.u6_addr32[3]; - // Fill in other information - ctx->state->ip_proto = ipv6_hdr(ctx)->nexthdr; - ctx->state->ip_size = ipv6_hdr(ctx)->payload_len; -} - /* Continue parsing packet based on the IP protocol and fill in relevant fields * in the state (struct cali_tc_state). */ static CALI_BPF_INLINE int tc_state_fill_from_nexthdr(struct cali_tc_ctx *ctx, bool decap) diff --git a/felix/bpf-gpl/tc.c b/felix/bpf-gpl/tc.c index 1d616bbb994..25eb8919171 100644 --- a/felix/bpf-gpl/tc.c +++ b/felix/bpf-gpl/tc.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -37,7 +36,6 @@ #include "fib.h" #include "rpf.h" #include "parsing.h" -#include "ipv6.h" #include "tc.h" #include "failsafe.h" #include "metadata.h" diff --git a/felix/bpf-gpl/tc6.c b/felix/bpf-gpl/tc6.c deleted file mode 100644 index 5bb0d3cacb0..00000000000 --- a/felix/bpf-gpl/tc6.c +++ /dev/null @@ -1,134 +0,0 @@ -// Project Calico BPF dataplane programs. -// Copyright (c) 2022 Tigera, Inc. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later - -#include -#include -#include - -// stdbool.h has no deps so it's OK to include; stdint.h pulls in parts -// of the std lib that aren't compatible with BPF. 
-#include - -#include "bpf.h" -#include "types.h" -#include "counters.h" -#include "log.h" -#include "skb.h" -#include "routes.h" -#include "parsing.h" -#include "ipv6.h" -#include "jump.h" -#include "policy.h" - -const volatile struct cali_tc_globals __globals; - - -SEC("tc") -int calico_tc6(struct __sk_buff *skb) -{ - struct cali_tc_ctx _ctx = { - .state = state_get(), - .globals = state_get_globals_tc(), - .skb = skb, - .fwd = { - .res = TC_ACT_UNSPEC, - .reason = CALI_REASON_UNKNOWN, - }, - .ipheader_len = IPv6_SIZE, - }; - struct cali_tc_ctx *ctx = &_ctx; - - if (!ctx->globals) { - CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "State map globals lookup failed: DROP\n"); - return TC_ACT_SHOT; - } - - if (!ctx->state) { - CALI_DEBUG("State map lookup failed: DROP\n"); - return TC_ACT_SHOT; - } - - CALI_DEBUG("Entering IPv6 prologue program\n"); - - // TODO: Add IPv6 counters - - if (CALI_LOG_LEVEL >= CALI_LOG_LEVEL_INFO) { - ctx->state->prog_start_time = bpf_ktime_get_ns(); - } - - if (skb_refresh_validate_ptrs(ctx, UDP_SIZE)) { - deny_reason(ctx, CALI_REASON_SHORT); - CALI_DEBUG("Too short\n"); - goto deny; - } - - tc_state_fill_from_ipv6hdr(ctx); - - /* Parse out the source/dest ports (or type/code for ICMP). 
*/ - switch (tc_state_fill_from_nexthdr(ctx)) { - case PARSING_ERROR: - goto deny; - case PARSING_ALLOW_WITHOUT_ENFORCING_POLICY: - goto allow; - } - - CALI_LOG_IPV6(ipv6_hdr(ctx)); - CALI_DEBUG("IP src=%x\n", ctx->state->ip_src); - CALI_DEBUG("IP src1=%x\n", ctx->state->ip_src1); - CALI_DEBUG("IP src2=%x\n", ctx->state->ip_src2); - CALI_DEBUG("IP src3=%x\n", ctx->state->ip_src3); - CALI_DEBUG("proto=%d\n", ctx->state->ip_proto); - CALI_DEBUG("sport=%d\n", ctx->state->sport); - CALI_DEBUG("dport=%d\n", ctx->state->dport); - - if (CALI_F_WEP) { - CALI_DEBUG("IPv6 from workload: drop\n"); - goto deny; - } - CALI_DEBUG("IPv6 on host interface: allow\n"); - CALI_DEBUG("About to jump to normal policy program\n"); - CALI_JUMP_TO(ctx, PROG_INDEX_V6_POLICY); - if (CALI_F_HEP) { - CALI_DEBUG("HEP with no policy, allow.\n"); - goto allow; - } - CALI_DEBUG("Tail call to normal policy program failed: DROP\n"); - -deny: - skb->mark = CALI_SKB_MARK_SEEN; - return TC_ACT_SHOT; - -allow: - skb->mark = CALI_SKB_MARK_SEEN; - return TC_ACT_UNSPEC; -} - -SEC("tc") -int calico_tc_skb_accepted_entrypoint(struct __sk_buff *skb) -{ - CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "Entering IPv6 accepted program\n"); - // TODO: Implement the logic for accepted packets by the policy program - // We should not reach here since no tail call happens to this program - skb->mark = CALI_SKB_MARK_SEEN; - return TC_ACT_UNSPEC; -} - -SEC("tc") -int calico_tc_skb_send_icmp_replies(struct __sk_buff *skb) -{ - CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "Entering IPv6 icmp program\n"); - // TODO: Implement the logic for accepted icmp packets by the policy program - // We should not reach here since no tail call happens to this program - return TC_ACT_SHOT; -} - -SEC("tc") -int calico_tc_skb_drop(struct __sk_buff *skb) -{ - CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "Entering IPv6 drop program\n"); - // TODO: Implement the logic for dropped packets by the policy program - // We should not reach here since no tail call happens to this 
program - return TC_ACT_SHOT; -} - diff --git a/felix/bpf-gpl/types.h b/felix/bpf-gpl/types.h index be0bdaecdc3..e17ebd17fbc 100644 --- a/felix/bpf-gpl/types.h +++ b/felix/bpf-gpl/types.h @@ -206,11 +206,6 @@ static CALI_BPF_INLINE struct iphdr* ip_hdr(struct cali_tc_ctx *ctx) return (struct iphdr *)ctx->ip_header; } -static CALI_BPF_INLINE struct ipv6hdr* ipv6_hdr(struct cali_tc_ctx *ctx) -{ - return (struct ipv6hdr *)ctx->ip_header; -} - static CALI_BPF_INLINE struct ethhdr* eth_hdr(struct cali_tc_ctx *ctx) { return (struct ethhdr *)ctx->data_start; @@ -231,11 +226,6 @@ static CALI_BPF_INLINE struct icmphdr* icmp_hdr(struct cali_tc_ctx *ctx) return (struct icmphdr *)ctx->scratch->l4; } -static CALI_BPF_INLINE struct ipv6_opt_hdr* ipv6ext_hdr(struct cali_tc_ctx *ctx) -{ - return (struct ipv6_opt_hdr *)ctx->scratch->l4; -} - static CALI_BPF_INLINE __u32 ctx_ifindex(struct cali_tc_ctx *ctx) { #if CALI_F_XDP From 6bf83e2f5f282e126c15fcdee755b8a6dec53a5b Mon Sep 17 00:00:00 2001 From: Tomas Hruby Date: Wed, 31 May 2023 13:02:03 -0700 Subject: [PATCH 3/4] [BPF] IPv6 data plane code passes basic unittests The BPF dataplane code is now compiled either for 4 byte (ipv4) or 16 byte (ipv6) addresses. Most of the code base is the same for both versions except IP header parsing. Some code reshuffling and simplification was needed due to limited stack space. 
--- felix/bpf-gpl/Makefile | 25 +- felix/bpf-gpl/arp.h | 10 +- felix/bpf-gpl/bpf.h | 48 +- felix/bpf-gpl/calculate-flags | 2 +- felix/bpf-gpl/connect.h | 15 +- felix/bpf-gpl/connect_balancer.c | 4 +- felix/bpf-gpl/connect_balancer_v6.c | 6 +- felix/bpf-gpl/conntrack.h | 217 ++--- felix/bpf-gpl/conntrack_types.h | 47 +- felix/bpf-gpl/failsafe.h | 12 +- felix/bpf-gpl/fib.h | 13 +- felix/bpf-gpl/globals.h | 56 +- felix/bpf-gpl/icmp.h | 162 +--- felix/bpf-gpl/icmp4.h | 168 ++++ felix/bpf-gpl/icmp6.h | 13 + felix/bpf-gpl/ip_addr.h | 98 +++ felix/bpf-gpl/jump.h | 64 +- felix/bpf-gpl/list-ut-objs | 2 +- felix/bpf-gpl/metadata.h | 30 +- felix/bpf-gpl/nat.h | 258 ++---- felix/bpf-gpl/nat4.h | 125 +++ felix/bpf-gpl/nat6.h | 113 +++ felix/bpf-gpl/nat_lookup.h | 88 +- felix/bpf-gpl/nat_types.h | 61 +- felix/bpf-gpl/parsing.h | 154 +--- felix/bpf-gpl/parsing4.h | 105 +++ felix/bpf-gpl/parsing6.h | 176 ++++ felix/bpf-gpl/routes.h | 30 +- felix/bpf-gpl/rpf.h | 13 +- felix/bpf-gpl/sendrecv.h | 26 +- felix/bpf-gpl/skb.h | 30 +- felix/bpf-gpl/tc.c | 472 ++++++----- felix/bpf-gpl/tc_preamble.c | 43 +- felix/bpf-gpl/types.h | 82 +- felix/bpf-gpl/ut/ip_parse_test.c | 59 ++ felix/bpf-gpl/ut/ipv4_opts_test.c | 5 +- felix/bpf-gpl/ut/nat_decap_test.c | 2 +- felix/bpf-gpl/ut/nat_encap_test.c | 6 +- felix/bpf-gpl/xdp.c | 1 + felix/bpf/arp/map.go | 7 +- felix/bpf/arp/map6.go | 111 +++ felix/bpf/conntrack/map.go | 103 ++- felix/bpf/conntrack/v3/map.go | 6 - felix/bpf/conntrack/v3/map6.go | 444 ++++++++++ felix/bpf/libbpf/libbpf.go | 39 + felix/bpf/libbpf/libbpf_api.h | 47 ++ felix/bpf/libbpf/libbpf_common.go | 18 + felix/bpf/libbpf/libbpf_stub.go | 4 + felix/bpf/maps/maps.go | 6 +- felix/bpf/nat/maps.go | 10 + felix/bpf/nat/maps6.go | 630 ++++++++++++++ felix/bpf/routes/map.go | 5 +- felix/bpf/routes/map6.go | 223 +++++ felix/bpf/state/map.go | 5 +- felix/bpf/tc/attach.go | 8 + felix/bpf/tc/defs/defs.go | 16 +- felix/bpf/ut/bpf_prog_test.go | 470 +++++++++-- felix/bpf/ut/failsafes_test.go | 2 
+- felix/bpf/ut/filter_test.go | 2 +- felix/bpf/ut/icmp_port_unreachable_test.go | 4 +- felix/bpf/ut/icmp_too_big_test.go | 2 +- felix/bpf/ut/icmp_ttl_exceeded_test.go | 2 +- felix/bpf/ut/ip_dec_ttl_test.go | 4 +- felix/bpf/ut/ip_options_test.go | 2 +- felix/bpf/ut/ip_parse_test.go | 78 ++ felix/bpf/ut/ipv4_opts_test.go | 2 +- felix/bpf/ut/nat_encap_test.go | 71 +- felix/bpf/ut/nat_test.go | 922 ++++++++++++++++++++- felix/bpf/ut/snat_test.go | 10 +- felix/bpf/ut/tcp_test.go | 4 +- felix/bpf/ut/to_host_allowed_test.go | 6 +- felix/bpf/ut/whitelist_test.go | 80 +- felix/bpf/ut/xdp_test.go | 2 +- 73 files changed, 5028 insertions(+), 1158 deletions(-) create mode 100644 felix/bpf-gpl/icmp4.h create mode 100644 felix/bpf-gpl/icmp6.h create mode 100644 felix/bpf-gpl/ip_addr.h create mode 100644 felix/bpf-gpl/nat4.h create mode 100644 felix/bpf-gpl/nat6.h create mode 100644 felix/bpf-gpl/parsing4.h create mode 100644 felix/bpf-gpl/parsing6.h create mode 100644 felix/bpf-gpl/ut/ip_parse_test.c create mode 100644 felix/bpf/arp/map6.go create mode 100644 felix/bpf/conntrack/v3/map6.go create mode 100644 felix/bpf/nat/maps6.go create mode 100644 felix/bpf/routes/map6.go create mode 100644 felix/bpf/ut/ip_parse_test.go diff --git a/felix/bpf-gpl/Makefile b/felix/bpf-gpl/Makefile index feb0a7e3f33..1a0d0064d7b 100644 --- a/felix/bpf-gpl/Makefile +++ b/felix/bpf-gpl/Makefile @@ -48,9 +48,11 @@ LD := llc-12 UT_C_FILES:=$(shell find ut -name '*.c') UT_OBJS:=$(UT_C_FILES:.c=.o) $(shell ./list-ut-objs) +UT_OBJS+=ut/ip_parse_test_v6.o OBJS:=$(shell ./list-objs) OBJS+=bin/tc_preamble.o +OBJS+=bin/tc_preamble_v6.o OBJS+=bin/xdp_preamble.o OBJS+=bin/policy_default.o C_FILES:=tc_preamble.c tc.c connect_balancer.c connect_balancer_v6.c xdp_preamble.c xdp.c policy_default.c @@ -72,12 +74,15 @@ UT_CFLAGS=\ -I . # Mini-UT programs that test one or two functions. These are each in their own files. 
-ut/%.ll: ut/%.c ut/ut.h tc.c tc.d +ut/%.ll: ut/%.c ut/ut.h $(CC) $(UT_CFLAGS) $(CFLAGS) -c $< -o $@ tc_preamble.ll: tc_preamble.c tc_preamble.d $(CC) $(CFLAGS) -c $< -o $@ +tc_preamble_v6.ll: tc_preamble.c tc_preamble.d + $(CC) $(CFLAGS) -DIPVER6 -c $< -o $@ + xdp_preamble.ll: xdp_preamble.c xdp_preamble.d $(CC) $(CFLAGS) -DCALI_COMPILE_FLAGS=64 -c $< -o $@ @@ -90,14 +95,14 @@ to%.ll: tc.c tc.d calculate-flags $(COMPILE) from%.ll: tc.c tc.d calculate-flags $(COMPILE) -#to%_v6.ll: tc6.c tc.d calculate-flags -# $(COMPILE) -#from%_v6.ll: tc6.c tc.d calculate-flags -# $(COMPILE) +to%_v6.ll: tc.c tc.d calculate-flags + $(COMPILE) +from%_v6.ll: tc.c tc.d calculate-flags + $(COMPILE) test%.ll: tc.c tc.d calculate-flags $(COMPILE) -#test%_v6.ll: tc6.c tc.d calculate-flags -# $(COMPILE) +test%_v6.ll: tc.c tc.d calculate-flags + $(COMPILE) xdp%.ll: xdp.c xdp.d calculate-flags $(COMPILE) test_xdp%.ll: xdp.c xdp.d calculate-flags @@ -106,6 +111,8 @@ test_xdp%.ll: xdp.c xdp.d calculate-flags LINK=$(LD) -march=bpf -filetype=obj -o $@ $< bin/tc_preamble.o: tc_preamble.ll | bin $(LINK) +bin/tc_preamble_v6.o: tc_preamble_v6.ll | bin + $(LINK) bin/xdp_preamble.o: xdp_preamble.ll | bin $(LINK) bin/policy_default.o: policy_default.ll | bin @@ -128,6 +135,10 @@ bin/connect_time_%v6_co-re.o: connect_time_%v6.ll | bin $(LINK) ut/%.o: ut/%.ll $(LINK) +ut/ip_parse_test_v6.ll: ut/ip_parse_test.c + $(CC) $(UT_CFLAGS) $(CFLAGS) -DIPVER6 -c $< -o $@ +ut/ip_parse_test_v6.o: ut/ip_parse_test_v6.ll + $(LINK) bin: mkdir -p bin diff --git a/felix/bpf-gpl/arp.h b/felix/bpf-gpl/arp.h index 3febcb43055..9bd86113c18 100644 --- a/felix/bpf-gpl/arp.h +++ b/felix/bpf-gpl/arp.h @@ -5,8 +5,10 @@ #ifndef __CALI_ARP_H__ #define __CALI_ARP_H__ +#include "ip_addr.h" + struct arp_key { - __u32 ip; + ipv46_addr_t ip; __u32 ifindex; }; @@ -15,6 +17,10 @@ struct arp_value { char mac_dst[6]; }; -CALI_MAP(cali_v4_arp, 2, BPF_MAP_TYPE_LRU_HASH, struct arp_key, struct arp_value, 10000, 0) +#ifdef IPVER6 
+CALI_MAP_NAMED(cali_v6_arp, cali_arp, 2, BPF_MAP_TYPE_LRU_HASH, struct arp_key, struct arp_value, 10000, 0) +#else +CALI_MAP_NAMED(cali_v4_arp, cali_arp, 2, BPF_MAP_TYPE_LRU_HASH, struct arp_key, struct arp_value, 10000, 0) +#endif #endif /* __CALI_ARP_H__ */ diff --git a/felix/bpf-gpl/bpf.h b/felix/bpf-gpl/bpf.h index e2016733e89..bca62b6a1b2 100644 --- a/felix/bpf-gpl/bpf.h +++ b/felix/bpf-gpl/bpf.h @@ -12,10 +12,14 @@ #include #include #include -#include "globals.h" +/* CALI_BPF_INLINE must be defined before we include any of our headers. They + * assume it exists! + */ #define CALI_BPF_INLINE inline __attribute__((always_inline)) +#include "globals.h" + #define BPF_REDIR_EGRESS 0 #define BPF_REDIR_INGRESS 1 @@ -98,7 +102,12 @@ #define CALI_FIB_LOOKUP_ENABLED true #endif +#ifdef IPVER6 +#undef CALI_FIB_LOOKUP_ENABLED +#define CALI_FIB_LOOKUP_ENABLED false +#else #define CALI_FIB_ENABLED (!CALI_F_L3 && CALI_FIB_LOOKUP_ENABLED && (CALI_F_TO_HOST || CALI_F_TO_HEP)) +#endif #define COMPILE_TIME_ASSERT(expr) {typedef char array[(expr) ? 
1 : -1];} static CALI_BPF_INLINE void __compile_asserts(void) { @@ -215,8 +224,18 @@ static CALI_BPF_INLINE __attribute__((noreturn)) void bpf_exit(int rc) { } #pragma clang diagnostic pop +#ifdef IPVER6 + +#define debug_ip(ip) (bpf_htonl((ip).d)) +#define ip_is_dnf(ip) (true) + +#else + +#define debug_ip(ip) bpf_htonl(ip) + #define ip_is_dnf(ip) ((ip)->frag_off & bpf_htons(0x4000)) #define ip_frag_no(ip) ((ip)->frag_off & bpf_htons(0x1fff)) +#endif static CALI_BPF_INLINE void ip_dec_ttl(struct iphdr *ip) { @@ -229,7 +248,11 @@ static CALI_BPF_INLINE void ip_dec_ttl(struct iphdr *ip) ip->check = (__be16) (sum + (sum >> 16)); } +#ifdef IPVER6 +#define ip_ttl_exceeded(ip) (CALI_F_TO_HOST && !CALI_F_TUNNEL && (ip)->hop_limit <= 1) +#else #define ip_ttl_exceeded(ip) (CALI_F_TO_HOST && !CALI_F_TUNNEL && (ip)->ttl <= 1) +#endif #if CALI_F_XDP @@ -275,25 +298,25 @@ CALI_PATCH_DEFINE(__skb_mark, 0x4d424b53) /* be 0x4d424b53 = ASCII(SKBM) */ #define map_symbol(name, ver) name##ver -#define MAP_LOOKUP_FN(name, ver) \ -static CALI_BPF_INLINE void * name##_lookup_elem(const void* key) \ +#define MAP_LOOKUP_FN(fname, name, ver) \ +static CALI_BPF_INLINE void * fname##_lookup_elem(const void* key) \ { \ return bpf_map_lookup_elem(&map_symbol(name, ver), key); \ } -#define MAP_UPDATE_FN(name, ver) \ -static CALI_BPF_INLINE int name##_update_elem(const void* key, const void* value, __u64 flags)\ +#define MAP_UPDATE_FN(fname, name, ver) \ +static CALI_BPF_INLINE int fname##_update_elem(const void* key, const void* value, __u64 flags)\ { \ return bpf_map_update_elem(&map_symbol(name, ver), key, value, flags); \ } -#define MAP_DELETE_FN(name, ver) \ -static CALI_BPF_INLINE int name##_delete_elem(const void* key) \ +#define MAP_DELETE_FN(fname, name, ver) \ +static CALI_BPF_INLINE int fname##_delete_elem(const void* key) \ { \ return bpf_map_delete_elem(&map_symbol(name, ver), key); \ } -#define CALI_MAP(name, ver, map_type, key_type, val_type, size, flags) \ +#define 
CALI_MAP_NAMED(name, fname, ver, map_type, key_type, val_type, size, flags) \ struct { \ __uint(type, map_type); \ __type(key, key_type); \ @@ -301,9 +324,12 @@ struct { \ __uint(max_entries, size); \ __uint(map_flags, flags); \ }map_symbol(name, ver) SEC(".maps"); \ - MAP_LOOKUP_FN(name, ver) \ - MAP_UPDATE_FN(name, ver) \ - MAP_DELETE_FN(name, ver) + MAP_LOOKUP_FN(fname, name, ver) \ + MAP_UPDATE_FN(fname, name, ver) \ + MAP_DELETE_FN(fname, name, ver) + +#define CALI_MAP(name, ver, map_type, key_type, val_type, size, flags) \ + CALI_MAP_NAMED(name, name, ver, map_type, key_type, val_type, size, flags) #define CALI_MAP_V1(name, map_type, key_type, val_type, size, flags) \ CALI_MAP(name,, map_type, key_type, val_type, size, flags) diff --git a/felix/bpf-gpl/calculate-flags b/felix/bpf-gpl/calculate-flags index 7574ace37f5..9ad456d156a 100755 --- a/felix/bpf-gpl/calculate-flags +++ b/felix/bpf-gpl/calculate-flags @@ -37,7 +37,7 @@ if [[ "${filename}" =~ test_.* ]]; then args+=("-DUNITTEST") fi -if [[ "${filename}" =~ .*_v6.o ]]; then +if [[ "${filename}" =~ .*_v6.ll ]]; then args+=("-DIPVER6") fi diff --git a/felix/bpf-gpl/connect.h b/felix/bpf-gpl/connect.h index 854820aed2c..daed9825738 100644 --- a/felix/bpf-gpl/connect.h +++ b/felix/bpf-gpl/connect.h @@ -10,7 +10,7 @@ #include "bpf.h" #include "nat_lookup.h" -static CALI_BPF_INLINE int do_nat_common(struct bpf_sock_addr *ctx, __u8 proto, __be32 *dst, bool connect) +static CALI_BPF_INLINE int do_nat_common(struct bpf_sock_addr *ctx, __u8 proto, ipv46_addr_t *dst, bool connect) { int err = 0; /* We do not know what the source address is yet, we only know that it @@ -24,7 +24,8 @@ static CALI_BPF_INLINE int do_nat_common(struct bpf_sock_addr *ctx, __u8 proto, nat_lookup_result res = NAT_LOOKUP_ALLOW; __u16 dport_he = (__u16)(bpf_ntohl(ctx->user_port)>>16); struct calico_nat_dest *nat_dest; - nat_dest = calico_v4_nat_lookup(0, *dst, proto, dport_he, false, &res, + ipv46_addr_t voidip = VOID_IP; + nat_dest = 
calico_nat_lookup(&voidip, dst, proto, dport_he, false, &res, proto == IPPROTO_UDP && !connect ? CTLB_UDP_NOT_SEEN_TIMEO : 0, /* enforce affinity UDP */ proto == IPPROTO_UDP && !connect /* update affinity timer */); if (!nat_dest) { @@ -49,11 +50,11 @@ static CALI_BPF_INLINE int do_nat_common(struct bpf_sock_addr *ctx, __u8 proto, .port = dport_be, .proto = proto, }; - struct sendrecv4_val val = { + struct sendrec_val val = { .ip = *dst, .port = ctx->user_port, }; - int rc = cali_v4_ct_nats_update_elem(&natk, &val, 0); + int rc = cali_ct_nats_update_elem(&natk, &val, 0); if (rc) { /* if this happens things are really bad! report */ CALI_INFO("Failed to update ct_nats map rc=%d\n", rc); @@ -65,13 +66,13 @@ static CALI_BPF_INLINE int do_nat_common(struct bpf_sock_addr *ctx, __u8 proto, __u64 cookie = bpf_get_socket_cookie(ctx); CALI_DEBUG("Store: ip=%x port=%d cookie=%x\n", bpf_ntohl(nat_dest->addr), bpf_ntohs((__u16)dport_be), cookie); - struct sendrecv4_key key = { + struct sendrec_key key = { .ip = nat_dest->addr, .port = dport_be, .cookie = cookie, }; - if (cali_v4_srmsg_update_elem(&key, &val, 0)) { + if (cali_srmsg_update_elem(&key, &val, 0)) { /* if this happens things are really bad! 
report */ CALI_INFO("Failed to update map\n"); goto out; @@ -85,7 +86,7 @@ static CALI_BPF_INLINE int do_nat_common(struct bpf_sock_addr *ctx, __u8 proto, return err; } -static CALI_BPF_INLINE int connect_v4(struct bpf_sock_addr *ctx, __be32 *dst) +static CALI_BPF_INLINE int connect_v4(struct bpf_sock_addr *ctx, ipv46_addr_t *dst) { int ret = 1; /* OK value */ diff --git a/felix/bpf-gpl/connect_balancer.c b/felix/bpf-gpl/connect_balancer.c index 849a41d6f32..93e26c3d46b 100644 --- a/felix/bpf-gpl/connect_balancer.c +++ b/felix/bpf-gpl/connect_balancer.c @@ -65,13 +65,13 @@ int calico_recvmsg_v4(struct bpf_sock_addr *ctx) __u64 cookie = bpf_get_socket_cookie(ctx); CALI_DEBUG("Lookup: ip=%x port=%d(BE) cookie=%x\n",ctx->user_ip4, ctx->user_port, cookie); - struct sendrecv4_key key = { + struct sendrec_key key = { .ip = ctx->user_ip4, .port = ctx->user_port, .cookie = cookie, }; - struct sendrecv4_val *revnat = cali_v4_srmsg_lookup_elem(&key); + struct sendrec_val *revnat = cali_srmsg_lookup_elem(&key); if (revnat == NULL) { CALI_DEBUG("revnat miss for %x:%d\n", diff --git a/felix/bpf-gpl/connect_balancer_v6.c b/felix/bpf-gpl/connect_balancer_v6.c index f197173e7a3..e3d2e51f1d5 100644 --- a/felix/bpf-gpl/connect_balancer_v6.c +++ b/felix/bpf-gpl/connect_balancer_v6.c @@ -2,6 +2,8 @@ // Copyright (c) 2020-2022 Tigera, Inc. All rights reserved. 
// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +#undef IPVER6 /* XXX */ + #include // socket_type.h contains the definition of SOCK_XXX constants that we need @@ -98,13 +100,13 @@ int calico_recvmsg_v6(struct bpf_sock_addr *ctx) goto out; } - struct sendrecv4_key key = { + struct sendrec_key key = { .ip = ipv4, .port = ctx->user_port, .cookie = bpf_get_socket_cookie(ctx), }; - struct sendrecv4_val *revnat = cali_v4_srmsg_lookup_elem(&key); + struct sendrec_val *revnat = cali_srmsg_lookup_elem(&key); if (revnat == NULL) { CALI_DEBUG("revnat miss for %x:%d\n", diff --git a/felix/bpf-gpl/conntrack.h b/felix/bpf-gpl/conntrack.h index cb71127538d..542134f6baf 100644 --- a/felix/bpf-gpl/conntrack.h +++ b/felix/bpf-gpl/conntrack.h @@ -21,40 +21,58 @@ static CALI_BPF_INLINE int psnat_get_port(struct cali_tc_ctx *ctx) return PSNAT_START + (bpf_get_prandom_u32() % PSNAT_LEN); } +#ifdef IPVER6 + +static CALI_BPF_INLINE bool src_lt_dest(ipv6_addr_t ip_src, ipv6_addr_t ip_dst, __u16 sport, __u16 dport) +{ + int ret = ipv6_addr_t_cmp(ip_src, ip_dst); + + if (ret != 0) { + return ret < 0; + } + + return sport < dport; +} + +#else + #define src_lt_dest(ip_src, ip_dst, sport, dport) \ ((ip_src) < (ip_dst)) || (((ip_src) == (ip_dst)) && (sport) < (dport)) -#define __ct_make_key(proto, ipa, ipb, porta, portb) \ - (struct calico_ct_key) { \ - .protocol = proto, \ - .addr_a = ipa, .port_a = porta, \ - .addr_b = ipb, .port_b = portb, \ - } +#endif /* IPVER6 */ -#define ct_make_key(sltd, p, ipa, ipb, pta, ptb) ({ \ - struct calico_ct_key k; \ - k = sltd ? 
__ct_make_key(p, ipa, ipb, pta, ptb) : __ct_make_key(p, ipb, ipa, ptb, pta); \ - dump_ct_key(ctx, &k); \ - k; \ -}) +static CALI_BPF_INLINE void fill_ct_key(struct calico_ct_key *k, bool sltd, __u8 proto, + ipv46_addr_t *ipa, ipv46_addr_t *ipb, __u16 pta, __u16 ptb) +{ + k->protocol = proto; + + if (sltd) { + k->addr_a = *ipa; + k->addr_b = *ipb; + k->port_a = pta; + k->port_b = ptb; + } else { + k->addr_a = *ipb; + k->addr_b = *ipa; + k->port_a = ptb; + k->port_b = pta; + } +} #define ct_result_np_node(res) ((res).flags & CALI_CT_FLAG_NP_FWD) static CALI_BPF_INLINE void dump_ct_key(struct cali_tc_ctx *ctx, struct calico_ct_key *k) { - CALI_VERB("CT-ALL key A=%x:%d proto=%d\n", bpf_ntohl(k->addr_a), k->port_a, (int)k->protocol); - CALI_VERB("CT-ALL key B=%x:%d size=%d\n", bpf_ntohl(k->addr_b), k->port_b, (int)sizeof(struct calico_ct_key)); + CALI_VERB("CT-ALL key A=%x:%d proto=%d\n", debug_ip(k->addr_a), k->port_a, (int)k->protocol); + CALI_VERB("CT-ALL key B=%x:%d size=%d\n", debug_ip(k->addr_b), k->port_b, (int)sizeof(struct calico_ct_key)); } static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, struct ct_create_ctx *ct_ctx, struct calico_ct_key *k) { - __be32 ip_src = ct_ctx->src; - __be32 ip_dst = ct_ctx->dst; __u16 sport = ct_ctx->sport; __u16 dport = ct_ctx->dport; - __be32 orig_dst = ct_ctx->orig_dst; __u16 orig_dport = ct_ctx->orig_dport; int err = 0; @@ -63,22 +81,22 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, bool syn = false; __u64 now; - if (ct_ctx->tcp) { - seq = ct_ctx->tcp->seq; - syn = ct_ctx->tcp->syn; + if (ct_ctx->proto == IPPROTO_TCP) { + seq = tcp_hdr(ctx)->seq; + syn = tcp_hdr(ctx)->syn; } - CALI_DEBUG("CT-ALL packet mark is: 0x%x\n", ct_ctx->skb->mark); - if (skb_seen(ct_ctx->skb)) { + CALI_DEBUG("CT-ALL packet mark is: 0x%x\n", ctx->skb->mark); + if (skb_seen(ctx->skb)) { /* Packet already marked as being from another workload, which will * have created a conntrack entry. 
Look that one up instead of * creating one. */ - CALI_DEBUG("CT-ALL Asked to create entry but packet is marked as " + CALI_VERB("CT-ALL Asked to create entry but packet is marked as " "from another endpoint, doing lookup\n"); - bool srcLTDest = src_lt_dest(ip_src, ip_dst, sport, dport); - *k = ct_make_key(srcLTDest, ct_ctx->proto, ip_src, ip_dst, sport, dport); - struct calico_ct_value *ct_value = cali_v4_ct_lookup_elem(k); + bool srcLTDest = src_lt_dest(ct_ctx->src, ct_ctx->dst, sport, dport); + fill_ct_key(k, srcLTDest, ct_ctx->proto, &ct_ctx->src, &ct_ctx->dst, sport, dport); + struct calico_ct_value *ct_value = cali_ct_lookup_elem(k); if (!ct_value) { CALI_VERB("CT Packet marked as from workload but got a conntrack miss!\n"); goto create; @@ -115,7 +133,7 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, .created=now, .last_seen=now, .type = ct_ctx->type, - .orig_ip = orig_dst, + .orig_ip = ct_ctx->orig_dst, .orig_port = orig_dport, }; @@ -124,42 +142,33 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, ct_value.orig_sip = ct_ctx->orig_src; ct_value.orig_sport = ct_ctx->orig_sport; - CALI_DEBUG("CT-ALL SNAT orig %x:%d\n", bpf_htonl(ct_ctx->orig_src), ct_ctx->orig_sport); + CALI_DEBUG("CT-ALL SNAT orig %x:%d\n", debug_ip(ct_ctx->orig_src), ct_ctx->orig_sport); - if (ct_ctx->type == CALI_CT_TYPE_NAT_REV && ct_ctx->tun_ip) { + if (ct_ctx->type == CALI_CT_TYPE_NAT_REV && !ip_void(ct_ctx->tun_ip)) { if (ct_ctx->flags & CALI_CT_FLAG_NP_FWD) { - CALI_DEBUG("CT-ALL nat tunneled to %x\n", bpf_ntohl(ct_ctx->tun_ip)); + CALI_DEBUG("CT-ALL nat tunneled to %x\n", debug_ip(ct_ctx->tun_ip)); } else { - struct cali_rt *rt = cali_rt_lookup(ct_ctx->tun_ip); + struct cali_rt *rt = cali_rt_lookup(&ct_ctx->tun_ip); if (!rt || !cali_rt_is_host(rt)) { - CALI_DEBUG("CT-ALL nat tunnel IP not a host %x\n", bpf_ntohl(ct_ctx->tun_ip)); + CALI_DEBUG("CT-ALL nat tunnel IP not a host %x\n", debug_ip(ct_ctx->tun_ip)); err = 
-1; goto out; } - CALI_DEBUG("CT-ALL nat tunneled from %x\n", bpf_ntohl(ct_ctx->tun_ip)); + CALI_DEBUG("CT-ALL nat tunneled from %x\n", debug_ip(ct_ctx->tun_ip)); } ct_value.tun_ip = ct_ctx->tun_ip; } struct calico_ct_leg *src_to_dst, *dst_to_src; - bool srcLTDest = (ip_src < ip_dst) || ((ip_src == ip_dst) && sport < dport); + bool srcLTDest = src_lt_dest(ct_ctx->src, ct_ctx->dst, sport, dport); + fill_ct_key(k, srcLTDest, ct_ctx->proto, &ct_ctx->src, &ct_ctx->dst, sport, dport); if (srcLTDest) { - *k = (struct calico_ct_key) { - .protocol = ct_ctx->proto, - .addr_a = ip_src, .port_a = sport, - .addr_b = ip_dst, .port_b = dport, - }; CALI_VERB("CT-ALL src_to_dst A->B\n"); src_to_dst = &ct_value.a_to_b; dst_to_src = &ct_value.b_to_a; } else { - *k = (struct calico_ct_key) { - .protocol = ct_ctx->proto, - .addr_a = ip_dst, .port_a = dport, - .addr_b = ip_src, .port_b = sport, - }; CALI_VERB("CT-ALL src_to_dst B->A\n"); src_to_dst = &ct_value.b_to_a; dst_to_src = &ct_value.a_to_b; @@ -172,7 +181,7 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, src_to_dst->syn_seen = syn; src_to_dst->opener = 1; if (CALI_F_TO_HOST) { - src_to_dst->ifindex = skb_ingress_ifindex(ct_ctx->skb); + src_to_dst->ifindex = skb_ingress_ifindex(ctx->skb); } else { src_to_dst->ifindex = CT_INVALID_IFINDEX; } @@ -182,7 +191,6 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, if (CALI_F_FROM_WEP) { /* src is the from the WEP, policy approved this side */ src_to_dst->approved = 1; - CALI_DEBUG("CT-ALL approved source side - from WEP\n"); } else if (CALI_F_FROM_HEP) { /* src is the from the HEP, policy approved this side */ src_to_dst->approved = 1; @@ -196,7 +204,7 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, } CALI_DEBUG("CT-ALL approved source side - from HEP tun allow_return=%d\n", ct_ctx->allow_return); - } else if (CALI_F_TO_HEP && !skb_seen(ct_ctx->skb) && (ct_ctx->type == 
CALI_CT_TYPE_NAT_REV)) { + } else if (CALI_F_TO_HEP && !skb_seen(ctx->skb) && (ct_ctx->type == CALI_CT_TYPE_NAT_REV)) { src_to_dst->approved = 1; dst_to_src->approved = 1; CALI_DEBUG("CT-ALL approved both due to host source port conflict resolution.\n"); @@ -213,28 +221,28 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, } } - err = cali_v4_ct_update_elem(k, &ct_value, BPF_NOEXIST); + err = cali_ct_update_elem(k, &ct_value, BPF_NOEXIST); if (CALI_F_HEP && err == -17 /* EEXIST */) { int i; - CALI_DEBUG("Source collision for 0x%x:%d\n", bpf_htonl(ip_src), sport); + CALI_DEBUG("Source collision for 0x%x:%d\n", debug_ip(ct_ctx->src), sport); ct_value.orig_sport = sport; - bool src_lt_dst = ip_src < ip_dst; + bool src_lt_dst = ip_lt(ct_ctx->src, ct_ctx->dst); for (i = 0; i < PSNAT_RETRIES; i++) { sport = psnat_get_port(ctx); CALI_DEBUG("New sport %d\n", sport); - if (ip_src == ip_dst) { + if (ip_equal(ct_ctx->src, ct_ctx->dst)) { src_lt_dst = sport < dport; } - *k = ct_make_key(src_lt_dst, ct_ctx->proto, ip_src, ip_dst, sport, dport); + fill_ct_key(k, src_lt_dst, ct_ctx->proto, &ct_ctx->src, &ct_ctx->dst, sport, dport); - if (!(err = cali_v4_ct_update_elem(k, &ct_value, BPF_NOEXIST))) { + if (!(err = cali_ct_update_elem(k, &ct_value, BPF_NOEXIST))) { ct_ctx->sport = sport; break; } @@ -242,7 +250,7 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, if (i == PSNAT_RETRIES) { CALI_INFO("Source collision unresolved 0x%x:%d\n", - bpf_htonl(ip_src), ct_value.orig_sport); + debug_ip(ct_ctx->src), ct_value.orig_sport); err = -17; /* EEXIST */ } } @@ -252,18 +260,17 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, return err; } -static CALI_BPF_INLINE int calico_ct_v4_create_nat_fwd(struct cali_tc_ctx *ctx, - struct ct_create_ctx *ct_ctx, - struct calico_ct_key *rk) +static CALI_BPF_INLINE int calico_ct_create_nat_fwd(struct cali_tc_ctx *ctx, + struct ct_create_ctx *ct_ctx, + 
struct calico_ct_key *rk) { - __u8 ip_proto = ct_ctx->proto; - __be32 ip_src = ct_ctx->orig_src; - __be32 ip_dst = ct_ctx->orig_dst; + ipv46_addr_t ip_src = ct_ctx->orig_src; + ipv46_addr_t ip_dst = ct_ctx->orig_dst; __u16 sport = ct_ctx->orig_sport; __u16 dport = ct_ctx->orig_dport; if (CALI_F_TO_HEP && !CALI_F_NAT_IF && sport != ct_ctx->sport && - !(ct_ctx->skb->mark & (CALI_SKB_MARK_FROM_NAT_IFACE_OUT | CALI_SKB_MARK_SEEN))) { + !(ctx->skb->mark & (CALI_SKB_MARK_FROM_NAT_IFACE_OUT | CALI_SKB_MARK_SEEN))) { /* This entry is being created because we have a source port * conflict on a connection from host. We did psnat so we mark * such an entry with a 0 sport. @@ -274,24 +281,30 @@ static CALI_BPF_INLINE int calico_ct_v4_create_nat_fwd(struct cali_tc_ctx *ctx, __u64 now = bpf_ktime_get_ns(); - CALI_DEBUG("CT-%d Creating FWD entry at %llu.\n", ip_proto, now); - CALI_DEBUG("FWD %x -> %x\n", bpf_ntohl(ip_src), bpf_ntohl(ip_dst)); + CALI_DEBUG("CT-%d Creating FWD entry at %llu.\n", ct_ctx->proto, now); + CALI_DEBUG("FWD %x -> %x\n", debug_ip(ip_src), debug_ip(ip_dst)); struct calico_ct_value ct_value = { .type = CALI_CT_TYPE_NAT_FWD, .last_seen = now, .created = now, }; - struct calico_ct_key k; + ct_value.nat_rev_key = *rk; + + /* We do not need rk anymore, we can reuse it for the new key. + * + * N.B. calico_ct_create_nat_fwd() is called _after_ calico_ct_v4_create_tracking() + * which also uses the rk! 
+ */ + struct calico_ct_key *k = rk; bool srcLTDest = src_lt_dest(ip_src, ip_dst, sport, dport); - k = ct_make_key(srcLTDest, ct_ctx->proto, ip_src, ip_dst, sport, dport); + fill_ct_key(k, srcLTDest, ct_ctx->proto, &ip_src, &ip_dst, sport, dport); - ct_value.nat_rev_key = *rk; if (ct_ctx->orig_sport != ct_ctx->sport) { ct_value.nat_sport = ct_ctx->sport; } - int err = cali_v4_ct_update_elem(&k, &ct_value, 0); - CALI_VERB("CT-%d Create result: %d.\n", ip_proto, err); + int err = cali_ct_update_elem(k, &ct_value, 0); + CALI_VERB("CT-%d Create result: %d.\n", ctx->state->ip_proto, err); return err; } @@ -302,6 +315,9 @@ static CALI_BPF_INLINE int calico_ct_v4_create_nat_fwd(struct cali_tc_ctx *ctx, */ static CALI_BPF_INLINE bool skb_icmp_err_unpack(struct cali_tc_ctx *ctx, struct ct_lookup_ctx *ct_ctx) { +#ifdef IPVER6 + return false; +#else /* ICMP packet is an error, its payload should contain the full IP header and * at least the first 8 bytes of the next header. */ @@ -377,6 +393,7 @@ static CALI_BPF_INLINE bool skb_icmp_err_unpack(struct cali_tc_ctx *ctx, struct }; return true; +#endif /* IPVER6 */ } #define CALI_CT_LOG(level, fmt, ...) \ @@ -455,12 +472,14 @@ static CALI_BPF_INLINE bool tcp_recycled(bool syn, struct calico_ct_value *v) static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_tc_ctx *ctx) { + __u8 proto = ctx->state->ip_proto; + // TODO: refactor the conntrack code to simply use the ctx instead of its own. This // code is a direct translation of the pre-ctx code so it has some duplication (but it // needs a bit more analysis to sort out because the ct_ctx gets modified in place in // ways that might not make sense to expose through the ctx. 
struct ct_lookup_ctx ct_lookup_ctx = { - .proto = ctx->state->ip_proto, + .proto = proto, .src = ctx->state->ip_src, .sport = ctx->state->sport, .dst = ctx->state->ip_dst, @@ -468,7 +487,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t }; struct ct_lookup_ctx *ct_ctx = &ct_lookup_ctx; - switch (ctx->state->ip_proto) { + switch (proto) { case IPPROTO_TCP: if (skb_refresh_validate_ptrs(ctx, TCP_SIZE)) { deny_reason(ctx, CALI_REASON_SHORT); @@ -485,15 +504,15 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t } __u8 proto_orig = ct_ctx->proto; - __be32 ip_src = ct_ctx->src; - __be32 ip_dst = ct_ctx->dst; + ipv46_addr_t ip_src = ct_ctx->src; + ipv46_addr_t ip_dst = ct_ctx->dst; __u16 sport = ct_ctx->sport; __u16 dport = ct_ctx->dport; - struct tcphdr *tcp_header = ct_ctx->tcp; + struct tcphdr *tcp_header = proto == IPPROTO_TCP ? tcp_hdr(ctx) : NULL; bool related = false; - CALI_CT_DEBUG("lookup from %x:%d\n", bpf_ntohl(ip_src), sport); - CALI_CT_DEBUG("lookup to %x:%d\n", bpf_ntohl(ip_dst), dport); + CALI_CT_DEBUG("lookup from %x:%d\n", debug_ip(ip_src), sport); + CALI_CT_DEBUG("lookup to %x:%d\n", debug_ip(ip_dst), dport); if (tcp_header) { CALI_CT_VERB("packet seq = %u\n", bpf_ntohl(tcp_header->seq)); CALI_CT_VERB("packet ack_seq = %u\n", bpf_ntohl(tcp_header->ack_seq)); @@ -509,10 +528,12 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t }; bool srcLTDest = src_lt_dest(ip_src, ip_dst, sport, dport); - struct calico_ct_key k = ct_make_key(srcLTDest, ct_ctx->proto, ip_src, ip_dst, sport, dport); + struct calico_ct_key k; bool syn = tcp_header && tcp_header->syn && !tcp_header->ack; - struct calico_ct_value *v = cali_v4_ct_lookup_elem(&k); + fill_ct_key(&k, srcLTDest, ct_ctx->proto, &ip_src, &ip_dst, sport, dport); + + struct calico_ct_value *v = cali_ct_lookup_elem(&k); if (!v) { if (syn) { // SYN packet (new flow); send it to policy. 
@@ -563,12 +584,12 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t // skb_icmp_err_unpack updates the ct_ctx with the details of the inner packet; // look for a conntrack entry for the inner packet... - CALI_CT_DEBUG("related lookup from %x:%d\n", bpf_ntohl(ct_ctx->src), ct_ctx->sport); - CALI_CT_DEBUG("related lookup to %x:%d\n", bpf_ntohl(ct_ctx->dst), ct_ctx->dport); + CALI_CT_DEBUG("related lookup from %x:%d\n", debug_ip(ct_ctx->src), ct_ctx->sport); + CALI_CT_DEBUG("related lookup to %x:%d\n", debug_ip(ct_ctx->dst), ct_ctx->dport); srcLTDest = src_lt_dest(ct_ctx->src, ct_ctx->dst, ct_ctx->sport, ct_ctx->dport); - k = ct_make_key(srcLTDest, ct_ctx->proto, ct_ctx->src, ct_ctx->dst, ct_ctx->sport, ct_ctx->dport); - v = cali_v4_ct_lookup_elem(&k); + fill_ct_key(&k, srcLTDest, ct_ctx->proto, &ct_ctx->src, &ct_ctx->dst, ct_ctx->sport, ct_ctx->dport); + v = cali_ct_lookup_elem(&k); if (!v) { if (CALI_F_FROM_HOST && ct_ctx->proto == IPPROTO_TCP && @@ -597,7 +618,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t ip_dst = ct_ctx->dst; sport = ct_ctx->sport; dport = ct_ctx->dport; - tcp_header = ct_ctx->tcp; + tcp_header = proto == IPPROTO_TCP ? tcp_hdr(ctx) : NULL; related = true; @@ -626,15 +647,15 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t // This is a forward NAT entry; since we do the bookkeeping on the // reverse entry, we need to do a second lookup. CALI_CT_DEBUG("Hit! 
NAT FWD entry, doing secondary lookup.\n"); - tracking_v = cali_v4_ct_lookup_elem(&v->nat_rev_key); + tracking_v = cali_ct_lookup_elem(&v->nat_rev_key); if (!tracking_v) { CALI_CT_DEBUG("Miss when looking for secondary entry.\n"); goto out_lookup_fail; } if (tcp_recycled(syn, tracking_v)) { CALI_CT_DEBUG("TCP SYN recycles entry, NEW flow.\n"); - cali_v4_ct_delete_elem(&k); - cali_v4_ct_delete_elem(&v->nat_rev_key); + cali_ct_delete_elem(&k); + cali_ct_delete_elem(&v->nat_rev_key); goto out_lookup_fail; } @@ -668,7 +689,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t } result.tun_ip = tracking_v->tun_ip; - CALI_CT_DEBUG("fwd tun_ip:%x\n", bpf_ntohl(tracking_v->tun_ip)); + CALI_CT_DEBUG("fwd tun_ip:%x\n", debug_ip(tracking_v->tun_ip)); // flags are in the tracking entry result.flags = ct_value_get_flags(tracking_v); CALI_CT_DEBUG("result.flags 0x%x\n", result.flags); @@ -690,9 +711,10 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t /* If we are on a HEP - where encap/decap can happen - and if the packet * arrived through a tunnel, check if the src IP of the packet is expected. 
*/ - if (CALI_F_FROM_HEP && ctx->state->tun_ip && result.tun_ip && result.tun_ip != ctx->state->tun_ip) { + if (CALI_F_FROM_HEP && !ip_void(ctx->state->tun_ip) && !ip_void(result.tun_ip) && + !ip_equal(result.tun_ip, ctx->state->tun_ip)) { CALI_CT_DEBUG("tunnel src changed from %x to %x\n", - bpf_ntohl(result.tun_ip), bpf_ntohl(ctx->state->tun_ip)); + debug_ip(result.tun_ip), debug_ip(ctx->state->tun_ip)); ct_result_set_flag(result.rc, CT_RES_TUN_SRC_CHANGED); } @@ -715,7 +737,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t } result.tun_ip = v->tun_ip; - CALI_CT_DEBUG("tun_ip:%x\n", bpf_ntohl(v->tun_ip)); + CALI_CT_DEBUG("tun_ip:%x\n", debug_ip(v->tun_ip)); result.flags = ct_value_get_flags(v); @@ -742,7 +764,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t */ snat = CALI_F_FROM_HOST; /* if returning packet into a tunnel */ - snat |= (dnat_return_should_encap() && v->tun_ip); + snat |= (dnat_return_should_encap() && !ip_void(v->tun_ip)); snat |= result.flags & CALI_CT_FLAG_VIA_NAT_IF; snat |= result.flags & CALI_CT_FLAG_HOST_PSNAT; snat |= result.flags & CALI_CT_FLAG_NP_LOOP; @@ -771,7 +793,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t CALI_CT_DEBUG("Hit! 
NORMAL entry.\n"); if (tcp_recycled(syn, v)) { CALI_CT_DEBUG("TCP SYN recycles entry, NEW flow.\n"); - cali_v4_ct_delete_elem(&k); + cali_ct_delete_elem(&k); goto out_lookup_fail; } CALI_CT_VERB("Created: %llu.\n", v->created); @@ -815,7 +837,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t } int ret_from_tun = CALI_F_FROM_HEP && - ctx->state->tun_ip && + !ip_void(ctx->state->tun_ip) && ct_result_rc(result.rc) == CALI_CT_ESTABLISHED_DNAT && src_to_dst->approved && result.flags & CALI_CT_FLAG_NP_FWD; @@ -834,7 +856,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t } if (ret_from_tun) { - CALI_DEBUG("Packet returned from tunnel %x\n", bpf_ntohl(ctx->state->tun_ip)); + CALI_DEBUG("Packet returned from tunnel %x\n", debug_ip(ctx->state->tun_ip)); } else if (CALI_F_TO_HOST || (skb_from_host(ctx->skb) && result.flags & CALI_CT_FLAG_HOST_PSNAT)) { /* Source of the packet is the endpoint, so check the src approval flag. */ if (src_to_dst->approved) { @@ -954,7 +976,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t /* creates connection tracking for tracked protocols */ static CALI_BPF_INLINE int conntrack_create(struct cali_tc_ctx *ctx, struct ct_create_ctx *ct_ctx) { - struct calico_ct_key k; + struct calico_ct_key *k = &ctx->scratch->ct_key; int err; if (ctx->state->flags & CALI_ST_SUPPRESS_CT_STATE) { @@ -962,17 +984,14 @@ static CALI_BPF_INLINE int conntrack_create(struct cali_tc_ctx *ctx, struct ct_c return 0; } - // Workaround for verifier; make sure verifier sees the skb on all code paths. 
- ct_ctx->skb = ctx->skb; - - err = calico_ct_v4_create_tracking(ctx, ct_ctx, &k); + err = calico_ct_v4_create_tracking(ctx, ct_ctx, k); if (err) { CALI_DEBUG("calico_ct_v4_create_tracking err %d\n", err); return err; } if (ct_ctx->type == CALI_CT_TYPE_NAT_REV) { - err = calico_ct_v4_create_nat_fwd(ctx, ct_ctx, &k); + err = calico_ct_create_nat_fwd(ctx, ct_ctx, k); if (err) { /* XXX we should clean up the tracking entry */ } diff --git a/felix/bpf-gpl/conntrack_types.h b/felix/bpf-gpl/conntrack_types.h index 5b81818ce25..ad054a7bfd7 100644 --- a/felix/bpf-gpl/conntrack_types.h +++ b/felix/bpf-gpl/conntrack_types.h @@ -9,7 +9,8 @@ struct calico_ct_key { __u32 protocol; - __be32 addr_a, addr_b; // NBO + ipv46_addr_t addr_a; // NBO + ipv46_addr_t addr_b; // NBO __u16 port_a, port_b; // HBO }; @@ -79,18 +80,22 @@ struct calico_ct_value { struct calico_ct_leg b_to_a; // 48 // CALI_CT_TYPE_NAT_REV - __u32 tun_ip; // 72 - __u32 orig_ip; // 76 + ipv46_addr_t tun_ip; // 72 + ipv46_addr_t orig_ip; // 76 __u16 orig_port; // 80 __u16 orig_sport; // 82 - __u32 orig_sip; // 84 + ipv46_addr_t orig_sip; // 84 }; // CALI_CT_TYPE_NAT_FWD; key for the CALI_CT_TYPE_NAT_REV entry. 
struct { struct calico_ct_key nat_rev_key; // 24 __u16 nat_sport; +#ifdef IPVER6 + __u8 pad2[60]; +#else __u8 pad2[46]; +#endif }; }; @@ -100,7 +105,11 @@ struct calico_ct_value { static CALI_BPF_INLINE void __xxx_compile_asserts(void) { #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wunused-local-typedef" +#ifdef IPVER6 + COMPILE_TIME_ASSERT((sizeof(struct calico_ct_value) == 128)) +#else COMPILE_TIME_ASSERT((sizeof(struct calico_ct_value) == 88)) +#endif #pragma clang diagnostic pop } @@ -117,34 +126,38 @@ static CALI_BPF_INLINE void __xxx_compile_asserts(void) { struct ct_lookup_ctx { __u8 proto; - __be32 src; - __be32 dst; + DECLARE_IP_ADDR(src); + DECLARE_IP_ADDR(dst); __u16 sport; __u16 dport; struct tcphdr *tcp; }; struct ct_create_ctx { - struct __sk_buff *skb; - __u8 proto; - __be32 orig_src; - __be32 src; - __be32 orig_dst; - __be32 dst; + ipv46_addr_t orig_src; + ipv46_addr_t src; + ipv46_addr_t orig_dst; + ipv46_addr_t dst; __u16 sport; __u16 dport; __u16 orig_dport; __u16 orig_sport; struct tcphdr *tcp; - __be32 tun_ip; /* is set when the packet arrive through the NP tunnel. + ipv46_addr_t tun_ip; /* is set when the packet arrives through the NP tunnel. * It is also set on the first node when we create the * initial CT entry for the tunneled traffic. 
*/ __u16 flags; + __u8 proto; + __u8 __pad; enum cali_ct_type type; bool allow_return; }; -CALI_MAP(cali_v4_ct, 3, +#ifdef IPVER6 +CALI_MAP_NAMED(cali_v6_ct, cali_ct, 3, +#else +CALI_MAP_NAMED(cali_v4_ct, cali_ct, 3, +#endif BPF_MAP_TYPE_HASH, struct calico_ct_key, struct calico_ct_value, 512000, BPF_F_NO_PREALLOC) @@ -205,11 +218,11 @@ enum calico_ct_result_type { struct calico_ct_result { __s16 rc; __u16 flags; - __be32 nat_ip; - __be32 nat_sip; + ipv46_addr_t nat_ip; + ipv46_addr_t nat_sip; __u16 nat_port; __u16 nat_sport; - __be32 tun_ip; + ipv46_addr_t tun_ip; __u32 ifindex_fwd; /* if set, the ifindex where the packet should be forwarded */ __u32 ifindex_created; /* For a CT state that was created by a packet ingressing * through an interface towards the host, this is the diff --git a/felix/bpf-gpl/failsafe.h b/felix/bpf-gpl/failsafe.h index 00cdcc88bfb..3bd0733c5b7 100644 --- a/felix/bpf-gpl/failsafe.h +++ b/felix/bpf-gpl/failsafe.h @@ -35,7 +35,8 @@ CALI_MAP(cali_v4_fsafes, 2, #define FSAFE_PREFIX_LEN_IN_BITS (FSAFE_PREFIX_LEN * 8) -static CALI_BPF_INLINE bool is_failsafe_in(__u8 ip_proto, __u16 dport, __be32 ip) { +static CALI_BPF_INLINE bool is_failsafe_in(__u8 ip_proto, __u16 dport, ipv46_addr_t ip) { +#ifndef IPVER6 struct failsafe_key key = { .prefixlen = FSAFE_PREFIX_LEN_IN_BITS, .ip_proto = ip_proto, @@ -46,10 +47,14 @@ static CALI_BPF_INLINE bool is_failsafe_in(__u8 ip_proto, __u16 dport, __be32 ip if (cali_v4_fsafes_lookup_elem(&key)) { return true; } +#else + /* XXX not implemented yet*/ +#endif return false; } -static CALI_BPF_INLINE bool is_failsafe_out(__u8 ip_proto, __u16 dport, __be32 ip) { +static CALI_BPF_INLINE bool is_failsafe_out(__u8 ip_proto, __u16 dport, ipv46_addr_t ip) { +#ifndef IPVER6 struct failsafe_key key = { .prefixlen = FSAFE_PREFIX_LEN_IN_BITS, .ip_proto = ip_proto, @@ -60,6 +65,9 @@ static CALI_BPF_INLINE bool is_failsafe_out(__u8 ip_proto, __u16 dport, __be32 i if (cali_v4_fsafes_lookup_elem(&key)) { return true; } +#else + 
/* XXX not implemented yet*/ +#endif return false; } diff --git a/felix/bpf-gpl/fib.h b/felix/bpf-gpl/fib.h index eef109d080c..861e946587b 100644 --- a/felix/bpf-gpl/fib.h +++ b/felix/bpf-gpl/fib.h @@ -104,14 +104,14 @@ static CALI_BPF_INLINE int forward_or_drop(struct cali_tc_ctx *ctx) struct arp_value *arpv; struct arp_key arpk = { - .ip = iface != NATIN_IFACE ? state->ip_dst : 0 /* 0.0.0.0 */, + .ip = iface != NATIN_IFACE ? state->ip_dst : VOID_IP, .ifindex = iface, }; - arpv = cali_v4_arp_lookup_elem(&arpk); + arpv = cali_arp_lookup_elem(&arpk); if (!arpv) { CALI_DEBUG("ARP lookup failed for %x dev %d\n", - bpf_ntohl(state->ip_dst), iface); + debug_ip(state->ip_dst), iface); goto skip_redir_ifindex; } @@ -261,15 +261,16 @@ static CALI_BPF_INLINE int forward_or_drop(struct cali_tc_ctx *ctx) __u32 iface = NATIN_IFACE; struct arp_key arpk = { - .ip = 0 /* 0.0.0.0 */, .ifindex = iface, }; - struct arp_value *arpv = cali_v4_arp_lookup_elem(&arpk); + ip_set_void(arpk.ip); + + struct arp_value *arpv = cali_arp_lookup_elem(&arpk); if (!arpv) { ctx->fwd.reason = CALI_REASON_NATIFACE; CALI_DEBUG("ARP lookup failed for %x dev %d\n", - bpf_ntohl(state->ip_dst), iface); + debug_ip(state->ip_dst), iface); goto deny; } diff --git a/felix/bpf-gpl/globals.h b/felix/bpf-gpl/globals.h index b6208873037..44582937ef3 100644 --- a/felix/bpf-gpl/globals.h +++ b/felix/bpf-gpl/globals.h @@ -5,29 +5,39 @@ #ifndef __CALI_GLOBALS_H__ #define __CALI_GLOBALS_H__ -struct cali_tc_globals { - __be32 host_ip; - __be16 tunnel_mtu; - __be16 vxlan_port; - __be32 intf_ip; - __be32 ext_to_svc_mark; - __be16 psnat_start; - __be16 psnat_len; - __be32 host_tunnel_ip; - __be32 flags; - __be16 wg_port; - __be16 __pad; - __u32 natin_idx; - __u32 natout_idx; - __u8 iface_name[16]; - __u32 log_filter_jmp; - __u32 jumps[32]; - /* Needs to be 32bit aligned as it is followed by scratch area for - * building headers. We reuse the same slot in state map to save - * ourselves a lookup. 
- */ - __u32 __scratch[]; /* N.B. this provides pointer to the location but does not add to the size */ -}; +#include "ip_addr.h" + +#define DECLARE_TC_GLOBALS(name, ip_t) \ +struct name { \ + ip_t host_ip; \ + __be16 tunnel_mtu; \ + __be16 vxlan_port; \ + ip_t intf_ip; \ + __be32 ext_to_svc_mark; \ + __be16 psnat_start; \ + __be16 psnat_len; \ + ip_t host_tunnel_ip; \ + __be32 flags; \ + __be16 wg_port; \ + __be16 __pad; \ + __u32 natin_idx; \ + __u32 natout_idx; \ + __u8 iface_name[16]; \ + __u32 log_filter_jmp; \ + __u32 jumps[32]; \ + /* Needs to be 32bit aligned as it is followed by scratch area for \ + * building headers. We reuse the same slot in state map to save \ + * ourselves a lookup. \ + */ \ + __u32 __scratch[]; /* N.B. this provides pointer to the location but does not add to the size */ \ +} + +DECLARE_TC_GLOBALS(cali_tc_globals, ipv46_addr_t); +/* cali_tc_globals_v6 is for userspace as cali_tc_globals are used for ipv4 in + * userspace, but it has the exact same layout as cali_tc_globals in eBPF when + * compiled for ipv6. + */ +DECLARE_TC_GLOBALS(cali_tc_globals_v6, ipv6_addr_t); enum cali_globals_flags { /* CALI_GLOBALS_IPV6_ENABLED is set when IPv6 is enabled by Felix */ diff --git a/felix/bpf-gpl/icmp.h b/felix/bpf-gpl/icmp.h index e0c6eff6d02..f40602de96c 100644 --- a/felix/bpf-gpl/icmp.h +++ b/felix/bpf-gpl/icmp.h @@ -5,164 +5,10 @@ #ifndef __CALI_ICMP_H__ #define __CALI_ICMP_H__ -#include -#include -#include - -#include "bpf.h" -#include "log.h" -#include "skb.h" - -static CALI_BPF_INLINE int icmp_v4_reply(struct cali_tc_ctx *ctx, - __u8 type, __u8 code, __be32 un) -{ - int ret; - - /* ICMP is on the slow path so we may as well revalidate here to keep calling code - * simple. We only need to look at the IP header before we resize the packet. 
*/ - if (skb_refresh_validate_ptrs(ctx, 0)) { - deny_reason(ctx, CALI_REASON_SHORT); - CALI_DEBUG("ICMP v4 reply: too short\n"); - return -1; - } - - struct iphdr ip_orig = *ip_hdr(ctx); - - /* Trim the packet to the desired length. ICMP requires min 8 bytes of - * payload but the SKB implementation gets upset if we try to trim - * part-way through the UDP/TCP header. - */ - __u32 len = skb_iphdr_offset(ctx) + 60 /* max IP len */; - switch (ip_hdr(ctx)->protocol) { - case IPPROTO_TCP: - len += sizeof(struct tcphdr); - break; - case IPPROTO_UDP: - len += sizeof(struct udphdr); - break; - default: - len += 8; - break; - } - - CALI_DEBUG("Trimming to %d\n", len); - int err = bpf_skb_change_tail(ctx->skb, len, 0); - if (err) { - CALI_DEBUG("ICMP v4 reply: early bpf_skb_change_tail (len=%d) failed (err=%d)\n", len, err); - return -1; - } - - /* make room for the new IP + ICMP header */ - int new_hdrs_len = sizeof(struct iphdr) + sizeof(struct icmphdr); - CALI_DEBUG("Inserting %d\n", new_hdrs_len); - ret = bpf_skb_adjust_room(ctx->skb, new_hdrs_len, BPF_ADJ_ROOM_MAC, 0); - if (ret) { - CALI_DEBUG("ICMP v4 reply: failed to make room\n"); - return -1; - } - - len += new_hdrs_len; - CALI_DEBUG("Len after insert %d\n", len); - - /* ICMP reply carries the IP header + at least 8 bytes of data. */ - if (skb_refresh_validate_ptrs(ctx, len - IP_SIZE - (CALI_F_L3 ? 0 : ETH_SIZE))) { - deny_reason(ctx, CALI_REASON_SHORT); - CALI_DEBUG("ICMP v4 reply: too short after making room\n"); - return -1; - } - - /* we do not touch ethhdr, we rely on linux to rewrite it after routing - * XXX we might want to swap MACs and bounce it back from the same device - */ - ip_hdr(ctx)->version = 4; - ip_hdr(ctx)->ihl = 5; - ip_hdr(ctx)->tos = 0; - ip_hdr(ctx)->ttl = 64; /* good default */ - ip_hdr(ctx)->protocol = IPPROTO_ICMP; - ip_hdr(ctx)->check = 0; - ip_hdr(ctx)->tot_len = bpf_htons(len - (CALI_F_L3_DEV ? 
0 : ETH_SIZE)); - - ctx->ipheader_len = 20; - -#ifdef CALI_PARANOID - /* XXX verify that ip_orig.daddr is always the node's IP - * - * we only call this function because of NodePort encap - */ - if (ip_orig.daddr != HOST_IP) { - CALI_DEBUG("ICMP v4 reply: ip_orig.daddr != HOST_IP 0x%x\n", ip_orig.daddr); - } +#ifdef IPVER6 +#include "icmp6.h" +#else +#include "icmp4.h" #endif - /* use the host IP of the program that handles the packet */ - ip_hdr(ctx)->saddr = INTF_IP; - ip_hdr(ctx)->daddr = ip_orig.saddr; - - struct icmphdr *icmp = ((void *)ip_hdr(ctx)) + IP_SIZE; - - icmp->type = type; - icmp->code = code; - *((__be32 *)&icmp->un) = un; - icmp->checksum = 0; - - __wsum ip_csum = bpf_csum_diff(0, 0, ctx->ip_header, sizeof(struct iphdr), 0); - __wsum icmp_csum = bpf_csum_diff(0, 0, (__u32 *)icmp, - len - sizeof(struct iphdr) - skb_iphdr_offset(ctx), 0); - CALI_DEBUG("ICMP: checksum 0x%x len %d\n", icmp_csum, len - sizeof(struct iphdr) - skb_iphdr_offset(ctx)); - - ret = bpf_l3_csum_replace(ctx->skb, - skb_iphdr_offset(ctx) + offsetof(struct iphdr, check), 0, ip_csum, 0); - if (ret) { - CALI_DEBUG("ICMP v4 reply: set ip csum failed\n"); - return -1; - } - ret = bpf_l4_csum_replace(ctx->skb, sizeof(struct ethhdr) + sizeof(struct iphdr) + - offsetof(struct icmphdr, checksum), 0, icmp_csum, 0); - if (ret) { - CALI_DEBUG("ICMP v4 reply: set icmp csum failed\n"); - return -1; - } - - CALI_DEBUG("ICMP v4 reply creation succeeded\n"); - - return 0; -} - -static CALI_BPF_INLINE int icmp_v4_too_big(struct cali_tc_ctx *ctx) -{ - struct { - __be16 unused; - __be16 mtu; - } frag = { - .mtu = bpf_htons(TUNNEL_MTU), - }; - - CALI_DEBUG("Sending ICMP too big mtu=%d\n", bpf_ntohs(frag.mtu)); - return icmp_v4_reply(ctx, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, *(__be32 *)&frag); -} - -static CALI_BPF_INLINE int icmp_v4_ttl_exceeded(struct cali_tc_ctx *ctx) -{ - return icmp_v4_reply(ctx, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); -} - -static CALI_BPF_INLINE int 
icmp_v4_port_unreachable(struct cali_tc_ctx *ctx) -{ - return icmp_v4_reply(ctx, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); -} - -static CALI_BPF_INLINE bool icmp_type_is_err(__u8 type) -{ - switch (type) { - case ICMP_DEST_UNREACH: - case ICMP_SOURCE_QUENCH: - case ICMP_REDIRECT: - case ICMP_TIME_EXCEEDED: - case ICMP_PARAMETERPROB: - return true; - } - - return false; -} - #endif /* __CALI_ICMP_H__ */ diff --git a/felix/bpf-gpl/icmp4.h b/felix/bpf-gpl/icmp4.h new file mode 100644 index 00000000000..e2d62f97e47 --- /dev/null +++ b/felix/bpf-gpl/icmp4.h @@ -0,0 +1,168 @@ +// Project Calico BPF dataplane programs. +// Copyright (c) 2020-2023 Tigera, Inc. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later + +#ifndef __CALI_ICMP4_H__ +#define __CALI_ICMP4_H__ + +#include +#include +#include + +#include "bpf.h" +#include "log.h" +#include "skb.h" + +static CALI_BPF_INLINE int icmp_v4_reply(struct cali_tc_ctx *ctx, + __u8 type, __u8 code, __be32 un) +{ + int ret; + + /* ICMP is on the slow path so we may as well revalidate here to keep calling code + * simple. We only need to look at the IP header before we resize the packet. */ + if (skb_refresh_validate_ptrs(ctx, 0)) { + deny_reason(ctx, CALI_REASON_SHORT); + CALI_DEBUG("ICMP v4 reply: too short\n"); + return -1; + } + + struct iphdr ip_orig = *ip_hdr(ctx); + + /* Trim the packet to the desired length. ICMP requires min 8 bytes of + * payload but the SKB implementation gets upset if we try to trim + * part-way through the UDP/TCP header. 
+ */ + __u32 len = skb_iphdr_offset(ctx) + 60 /* max IP len */; + switch (ip_hdr(ctx)->protocol) { + case IPPROTO_TCP: + len += sizeof(struct tcphdr); + break; + case IPPROTO_UDP: + len += sizeof(struct udphdr); + break; + default: + len += 8; + break; + } + + CALI_DEBUG("Trimming to %d\n", len); + int err = bpf_skb_change_tail(ctx->skb, len, 0); + if (err) { + CALI_DEBUG("ICMP v4 reply: early bpf_skb_change_tail (len=%d) failed (err=%d)\n", len, err); + return -1; + } + + /* make room for the new IP + ICMP header */ + int new_hdrs_len = sizeof(struct iphdr) + sizeof(struct icmphdr); + CALI_DEBUG("Inserting %d\n", new_hdrs_len); + ret = bpf_skb_adjust_room(ctx->skb, new_hdrs_len, BPF_ADJ_ROOM_MAC, 0); + if (ret) { + CALI_DEBUG("ICMP v4 reply: failed to make room\n"); + return -1; + } + + len += new_hdrs_len; + CALI_DEBUG("Len after insert %d\n", len); + + /* ICMP reply carries the IP header + at least 8 bytes of data. */ + if (skb_refresh_validate_ptrs(ctx, len - IP_SIZE - (CALI_F_L3 ? 0 : ETH_SIZE))) { + deny_reason(ctx, CALI_REASON_SHORT); + CALI_DEBUG("ICMP v4 reply: too short after making room\n"); + return -1; + } + + /* we do not touch ethhdr, we rely on linux to rewrite it after routing + * XXX we might want to swap MACs and bounce it back from the same device + */ + ip_hdr(ctx)->version = 4; + ip_hdr(ctx)->ihl = 5; + ip_hdr(ctx)->tos = 0; + ip_hdr(ctx)->ttl = 64; /* good default */ + ip_hdr(ctx)->protocol = IPPROTO_ICMP; + ip_hdr(ctx)->check = 0; + ip_hdr(ctx)->tot_len = bpf_htons(len - (CALI_F_L3_DEV ? 
0 : ETH_SIZE)); + + ctx->ipheader_len = 20; + +#ifdef CALI_PARANOID + /* XXX verify that ip_orig.daddr is always the node's IP + * + * we only call this function because of NodePort encap + */ + if (ip_orig.daddr != HOST_IP) { + CALI_DEBUG("ICMP v4 reply: ip_orig.daddr != HOST_IP 0x%x\n", ip_orig.daddr); + } +#endif + + /* use the host IP of the program that handles the packet */ + ip_hdr(ctx)->saddr = INTF_IP; + ip_hdr(ctx)->daddr = ip_orig.saddr; + + struct icmphdr *icmp = ((void *)ip_hdr(ctx)) + IP_SIZE; + + icmp->type = type; + icmp->code = code; + *((__be32 *)&icmp->un) = un; + icmp->checksum = 0; + + __wsum ip_csum = bpf_csum_diff(0, 0, ctx->ip_header, sizeof(struct iphdr), 0); + __wsum icmp_csum = bpf_csum_diff(0, 0, (__u32 *)icmp, + len - sizeof(struct iphdr) - skb_iphdr_offset(ctx), 0); + CALI_DEBUG("ICMP: checksum 0x%x len %d\n", icmp_csum, len - sizeof(struct iphdr) - skb_iphdr_offset(ctx)); + + ret = bpf_l3_csum_replace(ctx->skb, + skb_iphdr_offset(ctx) + offsetof(struct iphdr, check), 0, ip_csum, 0); + if (ret) { + CALI_DEBUG("ICMP v4 reply: set ip csum failed\n"); + return -1; + } + ret = bpf_l4_csum_replace(ctx->skb, sizeof(struct ethhdr) + sizeof(struct iphdr) + + offsetof(struct icmphdr, checksum), 0, icmp_csum, 0); + if (ret) { + CALI_DEBUG("ICMP v4 reply: set icmp csum failed\n"); + return -1; + } + + CALI_DEBUG("ICMP v4 reply creation succeeded\n"); + + return 0; +} + +static CALI_BPF_INLINE int icmp_v4_too_big(struct cali_tc_ctx *ctx) +{ + struct { + __be16 unused; + __be16 mtu; + } frag = { + .mtu = bpf_htons(TUNNEL_MTU), + }; + + CALI_DEBUG("Sending ICMP too big mtu=%d\n", bpf_ntohs(frag.mtu)); + return icmp_v4_reply(ctx, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, *(__be32 *)&frag); +} + +static CALI_BPF_INLINE int icmp_v4_ttl_exceeded(struct cali_tc_ctx *ctx) +{ + return icmp_v4_reply(ctx, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); +} + +static CALI_BPF_INLINE int icmp_v4_port_unreachable(struct cali_tc_ctx *ctx) +{ + return icmp_v4_reply(ctx, 
ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); +} + +static CALI_BPF_INLINE bool icmp_type_is_err(__u8 type) +{ + switch (type) { + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_REDIRECT: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + return true; + } + + return false; +} + +#endif /* __CALI_ICMP4_H__ */ diff --git a/felix/bpf-gpl/icmp6.h b/felix/bpf-gpl/icmp6.h new file mode 100644 index 00000000000..5d87bcd4795 --- /dev/null +++ b/felix/bpf-gpl/icmp6.h @@ -0,0 +1,13 @@ +// Project Calico BPF dataplane programs. +// Copyright (c) 2023 Tigera, Inc. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later + +#ifndef __CALI_ICMP6_H__ +#define __CALI_ICMP6_H__ + +static CALI_BPF_INLINE bool icmp_type_is_err(__u8 type) { + /* XXX not implemented yet */ + return false; +} + +#endif /* __CALI_ICMP6_H__ */ diff --git a/felix/bpf-gpl/ip_addr.h b/felix/bpf-gpl/ip_addr.h new file mode 100644 index 00000000000..eb405e88497 --- /dev/null +++ b/felix/bpf-gpl/ip_addr.h @@ -0,0 +1,98 @@ +// Project Calico BPF dataplane programs. +// Copyright (c) 2023 Tigera, Inc. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later + +#ifndef __CALI_IP_ADDR_H__ +#define __CALI_IP_ADDR_H__ + +typedef struct { + __be32 a; + __be32 b; + __be32 c; + __be32 d; +} ipv6_addr_t; + +typedef __be32 ipv4_addr_t; + +#ifdef IPVER6 + +#include + +static CALI_BPF_INLINE bool ipv6_addr_t_eq(ipv6_addr_t x, ipv6_addr_t y) +{ + return x.a == y.a && x.b == y.b && x.c == y.c && x.d == y.d; +} + +static CALI_BPF_INLINE int ipv6_addr_t_cmp(ipv6_addr_t x, ipv6_addr_t y) +{ + if (x.a < y.a) { + return -1; + } else if (x.a == y.a) { + if (x.b < y.b) { + return -1; + } else if (x.b == y.b) { + if (x.c < y.c) { + return -1; + } else if (x.c == y.c) { + if (x.d < y.d) { + return -1; + } else if (x.d == y.d) { + return 0; + } + } + } + } + + return 1; +} + +#define ip_void(ip) ((ip).a == 0 && (ip).b == 0 && (ip).c == 0 && (ip).d == 0) +#define VOID_IP ({ipv6_addr_t x = {}; x;}) +#define ip_set_void(ip) do { \ + (ip).a = 0; \ + (ip).b = 0; \ + (ip).c = 0; \ + (ip).d = 0; \ +} while(0) +#define NP_SPECIAL_IP ({ipv6_addr_t x = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; x;}) +#define ip_equal(a, b) ipv6_addr_t_eq(a, b) +#define ip_lt(a, b) (ipv6_addr_t_cmp(a, b) < 0) + +static CALI_BPF_INLINE void ipv6hdr_ip_to_ipv6_addr_t(ipv6_addr_t *us, struct in6_addr *lnx) +{ + us->a = lnx->in6_u.u6_addr32[0]; + us->b = lnx->in6_u.u6_addr32[1]; + us->c = lnx->in6_u.u6_addr32[2]; + us->d = lnx->in6_u.u6_addr32[3]; +} + +static CALI_BPF_INLINE void ipv6_addr_t_to_ipv6hdr_ip(struct in6_addr *lnx, ipv6_addr_t *us) +{ + lnx->in6_u.u6_addr32[0] = us->a; + lnx->in6_u.u6_addr32[1] = us->b; + lnx->in6_u.u6_addr32[2] = us->c; + lnx->in6_u.u6_addr32[3] = us->d; +} + +typedef ipv6_addr_t ipv46_addr_t; + +#define DECLARE_IP_ADDR(name) ipv6_addr_t name + +#else /* ipv4 */ + +#define ip_void(ip) ((ip) == 0) +#define VOID_IP 0 +#define ip_set_void(ip) ((ip) = 0) +#define NP_SPECIAL_IP 0xffffffff +#define ip_equal(a, b) ((a) == (b)) +#define ip_lt(a, b) ((a) < (b)) + +typedef 
ipv4_addr_t ipv46_addr_t; + +#define DECLARE_IP_ADDR(name) union { \ + ipv4_addr_t name; \ + ipv6_addr_t __pad ## name; \ + } +#endif + +#endif /* __CALI_IP_ADDR_H__ */ diff --git a/felix/bpf-gpl/jump.h b/felix/bpf-gpl/jump.h index 1dacbf4780f..902ed171715 100644 --- a/felix/bpf-gpl/jump.h +++ b/felix/bpf-gpl/jump.h @@ -5,7 +5,7 @@ #ifndef __CALI_BPF_JUMP_H__ #define __CALI_BPF_JUMP_H__ -CALI_MAP(cali_state, 3, +CALI_MAP(cali_state, 4, BPF_MAP_TYPE_PERCPU_ARRAY, __u32, struct cali_tc_state, 2, 0) @@ -42,16 +42,24 @@ static CALI_BPF_INLINE struct cali_xdp_globals *state_get_globals_xdp(void) CALI_MAP_V1(cali_jump_map, BPF_MAP_TYPE_PROG_ARRAY, __u32, __u32, 200, 0) #define CALI_JUMP_TO(ctx, index) bpf_tail_call((ctx)->xdp, &cali_jump_map, (ctx)->xdp_globals->jumps[PROG_PATH(index)]) -#else + +#else /* CALI_F_XDP */ #define cali_jump_map map_symbol(cali_progs, 2) CALI_MAP_V1(cali_jump_map, BPF_MAP_TYPE_PROG_ARRAY, __u32, __u32, 200, 0) -#define CALI_JUMP_TO(ctx, index) do { \ +#define __CALI_JUMP_TO(ctx, index) do { \ CALI_DEBUG("jump to idx %d prog at %d\n", index, (ctx)->globals->jumps[PROG_PATH(index)]); \ bpf_tail_call((ctx)->skb, &cali_jump_map, (ctx)->globals->jumps[PROG_PATH(index)]); \ } while (0) + +#ifdef IPVER6 +#define CALI_JUMP_TO(ctx, index) __CALI_JUMP_TO(ctx, index ## _V6) +#else +#define CALI_JUMP_TO(ctx, index) __CALI_JUMP_TO(ctx, index) +#endif + #endif /* Add new values to the end as these are program indices */ @@ -72,17 +80,21 @@ enum cali_jump_index { PROG_INDEX_HOST_CT_CONFLICT_DEBUG, PROG_INDEX_ICMP_INNER_NAT_DEBUG, - PROG_INDEX_V6_PROLOGUE, - PROG_INDEX_V6_POLICY, - PROG_INDEX_V6_ALLOWED, - PROG_INDEX_V6_ICMP, - PROG_INDEX_V6_DROP, - - PROG_INDEX_V6_PROLOGUE_DEBUG, - PROG_INDEX_V6_POLICY_DEBUG, - PROG_INDEX_V6_ALLOWED_DEBUG, - PROG_INDEX_V6_ICMP_DEBUG, - PROG_INDEX_V6_DROP_DEBUG, + PROG_INDEX_MAIN_V6, + PROG_INDEX_POLICY_V6, + PROG_INDEX_ALLOWED_V6, + PROG_INDEX_ICMP_V6, + PROG_INDEX_DROP_V6, + PROG_INDEX_HOST_CT_CONFLICT_V6, + 
PROG_INDEX_ICMP_INNER_NAT_V6, + + PROG_INDEX_MAIN_V6_DEBUG, + PROG_INDEX_POLICY_V6_DEBUG, + PROG_INDEX_ALLOWED_V6_DEBUG, + PROG_INDEX_ICMP_V6_DEBUG, + PROG_INDEX_DROP_V6_DEBUG, + PROG_INDEX_HOST_CT_CONFLICT_V6_DEBUG, + PROG_INDEX_ICMP_INNER_NAT_V6_DEBUG, }; #if CALI_F_XDP @@ -96,21 +108,29 @@ CALI_MAP_V1(cali_jump_prog_map, BPF_MAP_TYPE_PROG_ARRAY, __u32, __u32, 100, 0) */ #define CALI_JUMP_TO_POLICY(ctx) \ bpf_tail_call((ctx)->xdp, &cali_jump_prog_map, (ctx)->xdp_globals->jumps[PROG_INDEX_POLICY]) -#else +#else /* CALI_F_XDP */ #define cali_jump_prog_map map_symbol(cali_jump, 2) CALI_MAP_V1(cali_jump_prog_map, BPF_MAP_TYPE_PROG_ARRAY, __u32, __u32, 10000, 0) -#define CALI_JUMP_TO_POLICY(ctx) do { \ - (ctx)->skb->cb[0] = (ctx)->globals->jumps[PROG_PATH(PROG_INDEX_ALLOWED)]; \ - (ctx)->skb->cb[1] = (ctx)->globals->jumps[PROG_PATH(PROG_INDEX_DROP)]; \ - CALI_DEBUG("policy allow prog at %d\n", (ctx)->globals->jumps[PROG_PATH(PROG_INDEX_ALLOWED)]); \ - CALI_DEBUG("policy deny prog at %d\n", (ctx)->globals->jumps[PROG_PATH(PROG_INDEX_DROP)]); \ - CALI_DEBUG("jump to policy prog at %d\n", (ctx)->globals->jumps[PROG_INDEX_POLICY]); \ - bpf_tail_call((ctx)->skb, &cali_jump_prog_map, (ctx)->globals->jumps[PROG_INDEX_POLICY]); \ +#define __CALI_JUMP_TO_POLICY(ctx, allow, deny, pol) do { \ + (ctx)->skb->cb[0] = (ctx)->globals->jumps[PROG_PATH(allow)]; \ + (ctx)->skb->cb[1] = (ctx)->globals->jumps[PROG_PATH(deny)]; \ + CALI_DEBUG("policy allow prog at %d\n", (ctx)->globals->jumps[PROG_PATH(allow)]); \ + CALI_DEBUG("policy deny prog at %d\n", (ctx)->globals->jumps[PROG_PATH(deny)]); \ + CALI_DEBUG("jump to policy prog at %d\n", (ctx)->globals->jumps[pol]); \ + bpf_tail_call((ctx)->skb, &cali_jump_prog_map, (ctx)->globals->jumps[pol]); \ } while (0) +#ifdef IPVER6 +#define CALI_JUMP_TO_POLICY(ctx) \ + __CALI_JUMP_TO_POLICY(ctx, PROG_INDEX_ALLOWED_V6, PROG_INDEX_DROP_V6, PROG_INDEX_POLICY_V6) +#else +#define CALI_JUMP_TO_POLICY(ctx) \ + __CALI_JUMP_TO_POLICY(ctx, 
PROG_INDEX_ALLOWED, PROG_INDEX_DROP, PROG_INDEX_POLICY) +#endif + #endif #endif /* __CALI_BPF_JUMP_H__ */ diff --git a/felix/bpf-gpl/list-ut-objs b/felix/bpf-gpl/list-ut-objs index 561ec109ebf..c5517d97143 100755 --- a/felix/bpf-gpl/list-ut-objs +++ b/felix/bpf-gpl/list-ut-objs @@ -11,7 +11,7 @@ emit_filename() { echo "bin/test_${from_or_to}_${ep_type}_fib_${log_level}${dsr}.o" -# echo "bin/test_${from_or_to}_${ep_type}_fib_${log_level}${dsr}_v6.o" + echo "bin/test_${from_or_to}_${ep_type}_fib_${log_level}${dsr}_v6.o" } log_level=debug diff --git a/felix/bpf-gpl/metadata.h b/felix/bpf-gpl/metadata.h index cd6416677bc..ab0db17dad6 100644 --- a/felix/bpf-gpl/metadata.h +++ b/felix/bpf-gpl/metadata.h @@ -22,6 +22,8 @@ enum cali_metadata_flags { // Set metadata to be received by TC programs static CALI_BPF_INLINE int xdp2tc_set_metadata(struct cali_tc_ctx *ctx, __u32 flags) { +#ifndef IPVER6 + /* XXX */ #ifndef UNITTEST struct cali_metadata *metadata; // Reserve space in-front of xdp_md.meta for metadata. @@ -55,6 +57,7 @@ static CALI_BPF_INLINE int xdp2tc_set_metadata(struct cali_tc_ctx *ctx, __u32 fl CALI_DEBUG("Set IP TOS: %d\n", ip_hdr(ctx)->tos); goto metadata_ok; #endif +#endif error: return -1; @@ -62,22 +65,23 @@ static CALI_BPF_INLINE int xdp2tc_set_metadata(struct cali_tc_ctx *ctx, __u32 fl metadata_ok: return 0; } -#endif /* CALI_F_XDP */ +#else /* CALI_F_XDP */ // Fetch metadata set by XDP program. If not set or on error return 0. 
static CALI_BPF_INLINE __u32 xdp2tc_get_metadata(struct __sk_buff *skb) { +#ifndef IPVER6 + /* XXX */ struct cali_metadata *metadata; - if (CALI_F_FROM_HEP && !CALI_F_XDP) { #ifndef UNITTEST - metadata = (void *)(unsigned long)skb->data_meta; + metadata = (void *)(unsigned long)skb->data_meta; - if (skb->data_meta + sizeof(struct cali_metadata) > skb->data) { - CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "No metadata is shared by XDP\n"); - goto no_metadata; - } + if (skb->data_meta + sizeof(struct cali_metadata) > skb->data) { + CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "No metadata is shared by XDP\n"); + goto no_metadata; + } - CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "Received metadata from XDP: %d\n", metadata->flags); - goto metadata_ok; + CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "Received metadata from XDP: %d\n", metadata->flags); + goto metadata_ok; #else struct cali_tc_ctx ctx = { .skb = skb, @@ -97,15 +101,17 @@ static CALI_BPF_INLINE __u32 xdp2tc_get_metadata(struct __sk_buff *skb) { CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "Set IP TOS: %d\n", ip_hdr(&ctx)->tos); goto metadata_ok; #endif /* UNITTEST */ - } else { - CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "Fetching metadata from XDP not supported in this hook\n"); - } no_metadata: return 0; metadata_ok: return metadata->flags; +#else + return 0; +#endif } +#endif /* CALI_F_XDP */ + #endif /* __CALI_METADATA_H__ */ diff --git a/felix/bpf-gpl/nat.h b/felix/bpf-gpl/nat.h index 411365cf634..75187d736ce 100644 --- a/felix/bpf-gpl/nat.h +++ b/felix/bpf-gpl/nat.h @@ -5,36 +5,52 @@ #ifndef __CALI_NAT_H__ #define __CALI_NAT_H__ -#include - -#include -#include - -#include "bpf.h" -#include "skb.h" -#include "routes.h" -#include "nat_types.h" - #ifndef CALI_VXLAN_VNI #define CALI_VXLAN_VNI 0xca11c0 #endif +#define vxlan_udp_csum_ok(udp) ((udp)->check == 0) + +#ifdef IPVER6 +#include "nat6.h" +#else +#include "nat4.h" +#endif + #define dnat_should_encap() (CALI_F_FROM_HEP && !CALI_F_TUNNEL && !CALI_F_L3_DEV && !CALI_F_NAT_IF) #define dnat_return_should_encap() 
(CALI_F_FROM_WEP && !CALI_F_TUNNEL && !CALI_F_L3_DEV && !CALI_F_NAT_IF) #define dnat_should_decap() (CALI_F_FROM_HEP && !CALI_F_TUNNEL && !CALI_F_L3_DEV && !CALI_F_NAT_IF) -/* Number of bytes we add to a packet when we do encap. */ -#define VXLAN_ENCAP_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ - sizeof(struct udphdr) + sizeof(struct vxlanhdr)) +static CALI_BPF_INLINE int is_vxlan_tunnel(struct cali_tc_ctx *ctx, __u16 vxlanport) +{ + return ctx->state->ip_proto == IPPROTO_UDP && + ctx->state->dport == vxlanport; +} + +static CALI_BPF_INLINE bool vxlan_encap_too_big(struct cali_tc_ctx *ctx) +{ + __u32 mtu = TUNNEL_MTU; + + /* RFC-1191: MTU is the size in octets of the largest datagram that + * could be forwarded, along the path of the original datagram, without + * being fragmented at this router. The size includes the IP header and + * IP data, and does not include any lower-level headers. + */ + if (ctx->skb->len > sizeof(struct ethhdr) + mtu) { + CALI_DEBUG("SKB too long (len=%d) vs limit=%d\n", ctx->skb->len, mtu); + return true; + } + return false; +} #define EFAULT 14 -static CALI_BPF_INLINE int skb_nat_l4_csum_ipv4(struct cali_tc_ctx *ctx, size_t off, - __be32 ip_src_from, __be32 ip_src_to, - __be32 ip_dst_from, __be32 ip_dst_to, - __u16 dport_from, __u16 dport_to, - __u16 sport_from, __u16 sport_to, - __u64 flags) +static CALI_BPF_INLINE int skb_nat_l4_csum(struct cali_tc_ctx *ctx, size_t off, + ipv46_addr_t ip_src_from, ipv46_addr_t ip_src_to, + ipv46_addr_t ip_dst_from, ipv46_addr_t ip_dst_to, + __u16 dport_from, __u16 dport_to, + __u16 sport_from, __u16 sport_to, + __u64 flags) { int ret = 0; struct __sk_buff *skb = ctx->skb; @@ -70,19 +86,42 @@ static CALI_BPF_INLINE int skb_nat_l4_csum_ipv4(struct cali_tc_ctx *ctx, size_t } } + /* We start with csum == 0 (seed for the first diff) as we are calculating just + * the diff between 2 IPs. We then feed the result as a seed to the next diff if + * we need to as a carry-over. 
+ * + * We must use diff because the replace functions cannot calculate a diff for 16 + * byte ipv6 addresses in one go. And this keeps the code the same for v4/6 with + * minimal impact on v4. + */ + __wsum csum = 0; + + bool csum_update = false; - if (ip_src_from != ip_src_to) { + if (!ip_equal(ip_src_from, ip_src_to)) { CALI_DEBUG("L4 checksum update src IP from %x to %x\n", - bpf_ntohl(ip_src_from), bpf_ntohl(ip_src_to)); - ret = bpf_l4_csum_replace(skb, off, ip_src_from, ip_src_to, flags | BPF_F_PSEUDO_HDR | 4); - CALI_DEBUG("bpf_l4_csum_replace(IP): %d\n", ret); + debug_ip(ip_src_from), debug_ip(ip_src_to)); + + csum = bpf_csum_diff((__u32*)&ip_src_from, sizeof(ip_src_from), (__u32*)&ip_src_to, sizeof(ip_src_to), csum); + CALI_DEBUG("bpf_l4_csum_diff(IP): 0x%x\n", csum); + csum_update = true; } - if (ip_dst_from != ip_dst_to) { + if (!ip_equal(ip_dst_from, ip_dst_to)) { CALI_DEBUG("L4 checksum update dst IP from %x to %x\n", - bpf_ntohl(ip_dst_from), bpf_ntohl(ip_dst_to)); - ret = bpf_l4_csum_replace(skb, off, ip_dst_from, ip_dst_to, flags | BPF_F_PSEUDO_HDR | 4); - CALI_DEBUG("bpf_l4_csum_replace(IP): %d\n", ret); + debug_ip(ip_dst_from), debug_ip(ip_dst_to)); + csum = bpf_csum_diff((__u32*)&ip_dst_from, sizeof(ip_dst_from), (__u32*)&ip_dst_to, sizeof(ip_dst_to), csum); + CALI_DEBUG("bpf_l4_csum_diff(IP): 0x%x\n", csum); + csum_update = true; + } + + /* If the IPs have changed we must replace it as part of the pseudo header that is + * used to calculate L4 csum. + */ + if (csum_update) { + ret = bpf_l4_csum_replace(skb, off, 0, csum, flags | BPF_F_PSEUDO_HDR); } + + /* We can use replace for ports in both v4/6 as they are the same size of 2 bytes. 
*/ if (sport_from != sport_to) { CALI_DEBUG("L4 checksum update sport from %d to %d\n", bpf_ntohs(sport_from), bpf_ntohs(sport_to)); @@ -101,135 +140,6 @@ static CALI_BPF_INLINE int skb_nat_l4_csum_ipv4(struct cali_tc_ctx *ctx, size_t return ret; } -static CALI_BPF_INLINE int vxlan_v4_encap(struct cali_tc_ctx *ctx, __be32 ip_src, __be32 ip_dst) -{ - int ret; - __wsum csum; - - __u32 new_hdrsz = sizeof(struct ethhdr) + sizeof(struct iphdr) + - sizeof(struct udphdr) + sizeof(struct vxlanhdr); - - ret = bpf_skb_adjust_room(ctx->skb, new_hdrsz, BPF_ADJ_ROOM_MAC, - BPF_F_ADJ_ROOM_ENCAP_L4_UDP | - BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | - BPF_F_ADJ_ROOM_ENCAP_L2(sizeof(struct ethhdr))); - - if (ret) { - goto out; - } - - ret = -1; - - if (skb_refresh_validate_ptrs(ctx, new_hdrsz)) { - deny_reason(ctx, CALI_REASON_SHORT); - CALI_DEBUG("Too short VXLAN encap\n"); - goto out; - } - - // Note: assuming L2 packet here so this code can't be used on an L3 device. - struct udphdr *udp = (struct udphdr*) ((void *)ip_hdr(ctx) + IP_SIZE); - struct vxlanhdr *vxlan = (void *)(udp + 1); - struct ethhdr *eth_inner = (void *)(vxlan+1); - struct iphdr *ip_inner = (void*)(eth_inner+1); - - /* Copy the original IP header. Since it is already DNATed, the dest IP is - * already set. All we need to do is to change the source IP - */ - *ip_hdr(ctx) = *ip_inner; - - /* decrement TTL for the inner IP header. 
TTL must be > 1 to get here */ - ip_dec_ttl(ip_inner); - - ip_hdr(ctx)->saddr = ip_src; - ip_hdr(ctx)->daddr = ip_dst; - ip_hdr(ctx)->tot_len = bpf_htons(bpf_ntohs(ip_hdr(ctx)->tot_len) + new_hdrsz); - ip_hdr(ctx)->ihl = 5; /* in case there were options in ip_inner */ - ip_hdr(ctx)->check = 0; - ip_hdr(ctx)->protocol = IPPROTO_UDP; - - udp->source = udp->dest = bpf_htons(VXLAN_PORT); - udp->len = bpf_htons(bpf_ntohs(ip_hdr(ctx)->tot_len) - sizeof(struct iphdr)); - - *((__u8*)&vxlan->flags) = 1 << 3; /* set the I flag to make the VNI valid */ - vxlan->vni = bpf_htonl(CALI_VXLAN_VNI) >> 8; /* it is actually 24-bit, last 8 reserved */ - - /* keep eth_inner MACs zeroed, it is useless after decap */ - eth_inner->h_proto = eth_hdr(ctx)->h_proto; - - CALI_DEBUG("vxlan encap %x : %x\n", bpf_ntohl(ip_hdr(ctx)->saddr), bpf_ntohl(ip_hdr(ctx)->daddr)); - - /* change the checksums last to avoid pointer access revalidation */ - - csum = bpf_csum_diff(0, 0, ctx->ip_header, sizeof(struct iphdr), 0); - ret = bpf_l3_csum_replace(ctx->skb, ((long) ctx->ip_header) - ((long) skb_start_ptr(ctx->skb)) + - offsetof(struct iphdr, check), 0, csum, 0); - -out: - return ret; -} - -static CALI_BPF_INLINE int vxlan_v4_decap(struct __sk_buff *skb) -{ - __u32 extra_hdrsz; - int ret = -1; - - extra_hdrsz = sizeof(struct ethhdr) + sizeof(struct iphdr) + - sizeof(struct udphdr) + sizeof(struct vxlanhdr); - - ret = bpf_skb_adjust_room(skb, -extra_hdrsz, BPF_ADJ_ROOM_MAC | BPF_F_ADJ_ROOM_FIXED_GSO, 0); - - return ret; -} - -static CALI_BPF_INLINE int is_vxlan_tunnel(struct iphdr *ip, __u16 vxlanport) -{ - struct udphdr *udp = (struct udphdr *)(ip +1); - - return ip->protocol == IPPROTO_UDP && - udp->dest == bpf_htons(vxlanport); -} - -static CALI_BPF_INLINE bool vxlan_size_ok(struct cali_tc_ctx *ctx) -{ - return !skb_refresh_validate_ptrs(ctx, UDP_SIZE + sizeof(struct vxlanhdr)); -} - -static CALI_BPF_INLINE __u32 vxlan_vni(struct cali_tc_ctx *ctx) -{ - struct vxlanhdr *vxlan; - - vxlan = 
skb_ptr_after(skb, udp_hdr(ctx)); - - return bpf_ntohl(vxlan->vni << 8); /* 24-bit field, last 8 reserved */ -} - -static CALI_BPF_INLINE bool vxlan_vni_is_valid(struct cali_tc_ctx *ctx) -{ - struct vxlanhdr *vxlan; - - vxlan = skb_ptr_after(ctx->skb, udp_hdr(ctx)); - - return *((__u8*)&vxlan->flags) & (1 << 3); -} - -#define vxlan_udp_csum_ok(udp) ((udp)->check == 0) - -static CALI_BPF_INLINE bool vxlan_v4_encap_too_big(struct cali_tc_ctx *ctx) -{ - __u32 mtu = TUNNEL_MTU; - - /* RFC-1191: MTU is the size in octets of the largest datagram that - * could be forwarded, along the path of the original datagram, without - * being fragmented at this router. The size includes the IP header and - * IP data, and does not include any lower-level headers. - */ - if (ctx->skb->len > sizeof(struct ethhdr) + mtu) { - CALI_DEBUG("SKB too long (len=%d) vs limit=%d\n", ctx->skb->len, mtu); - return true; - } - return false; -} - /* vxlan_attempt_decap tries to decode the packet as VXLAN and, if it is a BPF-to-BPF * program VXLAN packet, does the decap. Returns: * @@ -241,10 +151,14 @@ static CALI_BPF_INLINE int vxlan_attempt_decap(struct cali_tc_ctx *ctx) { /* decap on host ep only if directly for the node */ CALI_DEBUG("VXLAN tunnel packet to %x (host IP=%x)\n", +#ifdef IPVER6 + bpf_ntohl(ip_hdr(ctx)->daddr.in6_u.u6_addr32[3]), +#else bpf_ntohl(ip_hdr(ctx)->daddr), - bpf_ntohl(HOST_IP)); +#endif + debug_ip(HOST_IP)); - if (!rt_addr_is_local_host(ip_hdr(ctx)->daddr)) { + if (!rt_addr_is_local_host((ipv46_addr_t *)&ip_hdr(ctx)->daddr)) { goto fall_through; } if (!vxlan_size_ok(ctx)) { @@ -258,7 +172,7 @@ static CALI_BPF_INLINE int vxlan_attempt_decap(struct cali_tc_ctx *ctx) goto fall_through; } if (vxlan_vni(ctx) != CALI_VXLAN_VNI) { - if (rt_addr_is_remote_host(ip_hdr(ctx)->saddr)) { + if (rt_addr_is_remote_host((ipv46_addr_t *)&ip_hdr(ctx)->saddr)) { /* Not BPF-generated VXLAN packet but it was from a Calico host to this node. 
*/ CALI_DEBUG("VXLAN: non-tunnel calico\n"); goto auto_allow; @@ -267,7 +181,7 @@ static CALI_BPF_INLINE int vxlan_attempt_decap(struct cali_tc_ctx *ctx) CALI_DEBUG("VXLAN: Not our VNI\n"); goto fall_through; } - if (!rt_addr_is_remote_host(ip_hdr(ctx)->saddr)) { + if (!rt_addr_is_remote_host((ipv46_addr_t *)&ip_hdr(ctx)->saddr)) { CALI_DEBUG("VXLAN with our VNI from unexpected source.\n"); deny_reason(ctx, CALI_REASON_UNAUTH_SOURCE); goto deny; @@ -279,19 +193,28 @@ static CALI_BPF_INLINE int vxlan_attempt_decap(struct cali_tc_ctx *ctx) goto deny; } - ctx->arpk.ip = ip_hdr(ctx)->saddr; - ctx->arpk.ifindex = ctx->skb->ifindex; - /* We update the map straight with the packet data, eth header is * dst:src but the value is src:dst so it flips it automatically * when we use it on xmit. */ - cali_v4_arp_update_elem(&ctx->arpk, eth_hdr(ctx), 0); - CALI_DEBUG("ARP update for ifindex %d ip %x\n", ctx->arpk.ifindex, bpf_ntohl(ctx->arpk.ip)); + struct arp_key arpk = { + .ifindex = ctx->skb->ifindex, + }; +#ifdef IPVER6 + ipv6hdr_ip_to_ipv6_addr_t(&arpk.ip, &ip_hdr(ctx)->saddr); +#else + arpk.ip = ip_hdr(ctx)->saddr; +#endif + cali_arp_update_elem(&arpk, eth_hdr(ctx), 0); + CALI_DEBUG("ARP update for ifindex %d ip %x\n", arpk.ifindex, debug_ip(arpk.ip)); +#ifdef IPVER6 + ipv6hdr_ip_to_ipv6_addr_t(&ctx->state->tun_ip, &ip_hdr(ctx)->saddr); +#else ctx->state->tun_ip = ip_hdr(ctx)->saddr; +#endif CALI_DEBUG("vxlan decap\n"); - if (vxlan_v4_decap(ctx->skb)) { + if (vxlan_decap(ctx->skb)) { deny_reason(ctx, CALI_REASON_DECAP_FAIL); goto deny; } @@ -303,7 +226,7 @@ static CALI_BPF_INLINE int vxlan_attempt_decap(struct cali_tc_ctx *ctx) goto deny; } - CALI_DEBUG("vxlan decap origin %x\n", bpf_ntohl(ctx->state->tun_ip)); + CALI_DEBUG("vxlan decap origin %x\n", debug_ip(ctx->state->tun_ip)); fall_through: return 0; @@ -316,4 +239,5 @@ static CALI_BPF_INLINE int vxlan_attempt_decap(struct cali_tc_ctx *ctx) return -1; } + #endif /* __CALI_NAT_H__ */ diff --git a/felix/bpf-gpl/nat4.h 
b/felix/bpf-gpl/nat4.h new file mode 100644 index 00000000000..f79d015024b --- /dev/null +++ b/felix/bpf-gpl/nat4.h @@ -0,0 +1,125 @@ +// Project Calico BPF dataplane programs. +// Copyright (c) 2020-2023 Tigera, Inc. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later + +#ifndef __CALI_NAT4_H__ +#define __CALI_NAT4_H__ + +#include + +#include +#include + +#include "bpf.h" +#include "skb.h" +#include "routes.h" +#include "nat_types.h" + +/* Number of bytes we add to a packet when we do encap. */ +#define VXLAN_ENCAP_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ + sizeof(struct udphdr) + sizeof(struct vxlanhdr)) + +static CALI_BPF_INLINE int vxlan_encap(struct cali_tc_ctx *ctx, __be32 *ip_src, __be32 *ip_dst) +{ + int ret; + __wsum csum; + + __u32 new_hdrsz = sizeof(struct ethhdr) + sizeof(struct iphdr) + + sizeof(struct udphdr) + sizeof(struct vxlanhdr); + + ret = bpf_skb_adjust_room(ctx->skb, new_hdrsz, BPF_ADJ_ROOM_MAC, + BPF_F_ADJ_ROOM_ENCAP_L4_UDP | + BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | + BPF_F_ADJ_ROOM_ENCAP_L2(sizeof(struct ethhdr))); + + if (ret) { + goto out; + } + + ret = -1; + + if (skb_refresh_validate_ptrs(ctx, new_hdrsz)) { + deny_reason(ctx, CALI_REASON_SHORT); + CALI_DEBUG("Too short VXLAN encap\n"); + goto out; + } + + // Note: assuming L2 packet here so this code can't be used on an L3 device. + struct udphdr *udp = (struct udphdr*) ((void *)ip_hdr(ctx) + IP_SIZE); + struct vxlanhdr *vxlan = (void *)(udp + 1); + struct ethhdr *eth_inner = (void *)(vxlan+1); + struct iphdr *ip_inner = (void*)(eth_inner+1); + + /* Copy the original IP header. Since it is already DNATed, the dest IP is + * already set. All we need to do is to change the source IP + */ + *ip_hdr(ctx) = *ip_inner; + + /* decrement TTL for the inner IP header. 
TTL must be > 1 to get here */ + ip_dec_ttl(ip_inner); + + ip_hdr(ctx)->saddr = *ip_src; + ip_hdr(ctx)->daddr = *ip_dst; + ip_hdr(ctx)->tot_len = bpf_htons(bpf_ntohs(ip_hdr(ctx)->tot_len) + new_hdrsz); + ip_hdr(ctx)->ihl = 5; /* in case there were options in ip_inner */ + ip_hdr(ctx)->check = 0; + ip_hdr(ctx)->protocol = IPPROTO_UDP; + + udp->source = udp->dest = bpf_htons(VXLAN_PORT); + udp->len = bpf_htons(bpf_ntohs(ip_hdr(ctx)->tot_len) - sizeof(struct iphdr)); + + *((__u8*)&vxlan->flags) = 1 << 3; /* set the I flag to make the VNI valid */ + vxlan->vni = bpf_htonl(CALI_VXLAN_VNI) >> 8; /* it is actually 24-bit, last 8 reserved */ + + /* keep eth_inner MACs zeroed, it is useless after decap */ + eth_inner->h_proto = eth_hdr(ctx)->h_proto; + + CALI_DEBUG("vxlan encap %x : %x\n", bpf_ntohl(ip_hdr(ctx)->saddr), bpf_ntohl(ip_hdr(ctx)->daddr)); + + /* change the checksums last to avoid pointer access revalidation */ + + csum = bpf_csum_diff(0, 0, ctx->ip_header, sizeof(struct iphdr), 0); + ret = bpf_l3_csum_replace(ctx->skb, ((long) ctx->ip_header) - ((long) skb_start_ptr(ctx->skb)) + + offsetof(struct iphdr, check), 0, csum, 0); + +out: + return ret; +} + +static CALI_BPF_INLINE int vxlan_decap(struct __sk_buff *skb) +{ + __u32 extra_hdrsz; + int ret = -1; + + extra_hdrsz = sizeof(struct ethhdr) + sizeof(struct iphdr) + + sizeof(struct udphdr) + sizeof(struct vxlanhdr); + + ret = bpf_skb_adjust_room(skb, -extra_hdrsz, BPF_ADJ_ROOM_MAC | BPF_F_ADJ_ROOM_FIXED_GSO, 0); + + return ret; +} + +static CALI_BPF_INLINE bool vxlan_size_ok(struct cali_tc_ctx *ctx) +{ + return !skb_refresh_validate_ptrs(ctx, UDP_SIZE + sizeof(struct vxlanhdr)); +} + +static CALI_BPF_INLINE __u32 vxlan_vni(struct cali_tc_ctx *ctx) +{ + struct vxlanhdr *vxlan; + + vxlan = skb_ptr_after(skb, udp_hdr(ctx)); + + return bpf_ntohl(vxlan->vni << 8); /* 24-bit field, last 8 reserved */ +} + +static CALI_BPF_INLINE bool vxlan_vni_is_valid(struct cali_tc_ctx *ctx) +{ + struct vxlanhdr *vxlan; + + vxlan = 
skb_ptr_after(ctx->skb, udp_hdr(ctx)); + + return *((__u8*)&vxlan->flags) & (1 << 3); +} + +#endif /* __CALI_NAT4_H__ */ diff --git a/felix/bpf-gpl/nat6.h b/felix/bpf-gpl/nat6.h new file mode 100644 index 00000000000..d6b0ede2187 --- /dev/null +++ b/felix/bpf-gpl/nat6.h @@ -0,0 +1,113 @@ +// Project Calico BPF dataplane programs. +// Copyright (c) 2020-2023 Tigera, Inc. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later + +#ifndef __CALI_NAT4_H__ +#define __CALI_NAT4_H__ + +#include + +#include +#include + +#include "bpf.h" +#include "skb.h" +#include "routes.h" +#include "nat_types.h" + +/* Number of bytes we add to a packet when we do encap. */ +#define VXLAN_ENCAP_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ + sizeof(struct udphdr) + sizeof(struct vxlanhdr)) + +static CALI_BPF_INLINE int vxlan_encap(struct cali_tc_ctx *ctx, ipv6_addr_t *ip_src, ipv6_addr_t *ip_dst) +{ + __u32 new_hdrsz = sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + + sizeof(struct udphdr) + sizeof(struct vxlanhdr); + + if (bpf_skb_adjust_room(ctx->skb, new_hdrsz, BPF_ADJ_ROOM_MAC, + BPF_F_ADJ_ROOM_ENCAP_L4_UDP | + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 | + BPF_F_ADJ_ROOM_ENCAP_L2(sizeof(struct ethhdr)))) { + return -1; + } + + if (skb_refresh_validate_ptrs(ctx, new_hdrsz)) { + deny_reason(ctx, CALI_REASON_SHORT); + CALI_DEBUG("Too short VXLAN encap\n"); + return -1; + } + + // Note: assuming L2 packet here so this code can't be used on an L3 device. + struct udphdr *udp = (struct udphdr*) ((void *)ip_hdr(ctx) + IP_SIZE); + struct vxlanhdr *vxlan = (void *)(udp + 1); + struct ethhdr *eth_inner = (void *)(vxlan+1); + struct ipv6hdr *ip_inner = (void*)(eth_inner+1); + + /* Copy the original IP header. Since it is already DNATed, the dest IP is + * already set. All we need to do is to change the source IP + */ + *ip_hdr(ctx) = *ip_inner; + + /* decrement TTL for the inner IP header. 
TTL must be > 1 to get here */ + ip_inner->hop_limit--; + + ipv6_addr_t_to_ipv6hdr_ip(&ip_hdr(ctx)->saddr, ip_src); + ipv6_addr_t_to_ipv6hdr_ip(&ip_hdr(ctx)->daddr, ip_dst); + ip_hdr(ctx)->payload_len = bpf_htons(bpf_ntohs(ip_hdr(ctx)->payload_len) + new_hdrsz); + ip_hdr(ctx)->nexthdr = IPPROTO_UDP; + + udp->source = udp->dest = bpf_htons(VXLAN_PORT); + udp->len = bpf_htons(bpf_ntohs(ip_hdr(ctx)->payload_len) - sizeof(struct iphdr)); + /* XXX we leave udp->check == 0 which is not legal in IPv6, but we are + * the only ones parsing that packet! + */ + + *((__u8*)&vxlan->flags) = 1 << 3; /* set the I flag to make the VNI valid */ + vxlan->vni = bpf_htonl(CALI_VXLAN_VNI) >> 8; /* it is actually 24-bit, last 8 reserved */ + + /* keep eth_inner MACs zeroed, it is useless after decap */ + eth_inner->h_proto = eth_hdr(ctx)->h_proto; + + CALI_DEBUG("vxlan encap %x : %x\n", + bpf_ntohl(ip_hdr(ctx)->saddr.in6_u.u6_addr32[3]), bpf_ntohl(ip_hdr(ctx)->daddr.in6_u.u6_addr32[3])); + + return 0; +} + +static CALI_BPF_INLINE int vxlan_decap(struct __sk_buff *skb) +{ + __u32 extra_hdrsz; + int ret = -1; + + extra_hdrsz = sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + + sizeof(struct udphdr) + sizeof(struct vxlanhdr); + + ret = bpf_skb_adjust_room(skb, -extra_hdrsz, BPF_ADJ_ROOM_MAC | BPF_F_ADJ_ROOM_FIXED_GSO, 0); + + return ret; +} + +static CALI_BPF_INLINE bool vxlan_size_ok(struct cali_tc_ctx *ctx) +{ + return !skb_refresh_validate_ptrs(ctx, UDP_SIZE + sizeof(struct vxlanhdr)); +} + +static CALI_BPF_INLINE __u32 vxlan_vni(struct cali_tc_ctx *ctx) +{ + struct vxlanhdr *vxlan; + + vxlan = skb_ptr_after(skb, udp_hdr(ctx)); + + return bpf_ntohl(vxlan->vni << 8); /* 24-bit field, last 8 reserved */ +} + +static CALI_BPF_INLINE bool vxlan_vni_is_valid(struct cali_tc_ctx *ctx) +{ + struct vxlanhdr *vxlan; + + vxlan = skb_ptr_after(ctx->skb, udp_hdr(ctx)); + + return *((__u8*)&vxlan->flags) & (1 << 3); +} + +#endif /* __CALI_NAT4_H__ */ diff --git a/felix/bpf-gpl/nat_lookup.h 
b/felix/bpf-gpl/nat_lookup.h index 0c4284d8667..a506ddab9e2 100644 --- a/felix/bpf-gpl/nat_lookup.h +++ b/felix/bpf-gpl/nat_lookup.h @@ -14,46 +14,46 @@ #include "routes.h" #include "nat_types.h" -static CALI_BPF_INLINE struct calico_nat_dest* calico_v4_nat_lookup(__be32 ip_src, - __be32 ip_dst, - __u8 ip_proto, - __u16 dport, - bool from_tun, - nat_lookup_result *res, - int affinity_always_timeo, - bool affinity_tmr_update +static CALI_BPF_INLINE struct calico_nat_dest* calico_nat_lookup(ipv46_addr_t *ip_src, + ipv46_addr_t *ip_dst, + __u8 ip_proto, + __u16 dport, + bool from_tun, + nat_lookup_result *res, + int affinity_always_timeo, + bool affinity_tmr_update #if !(CALI_F_XDP) && !(CALI_F_CGROUP) - , struct cali_tc_ctx *ctx + , struct cali_tc_ctx *ctx #endif - ) + ) { - struct calico_nat_v4_key nat_key = { + struct calico_nat_key nat_key = { .prefixlen = NAT_PREFIX_LEN_WITH_SRC_MATCH_IN_BITS, - .addr = ip_dst, + .addr = *ip_dst, .port = dport, .protocol = ip_proto, - .saddr = ip_src, + .saddr = *ip_src, }; - struct calico_nat_v4_value *nat_lv1_val; - struct calico_nat_secondary_v4_key nat_lv2_key; + struct calico_nat_value *nat_lv1_val; + struct calico_nat_secondary_key nat_lv2_key; struct calico_nat_dest *nat_lv2_val; - struct calico_nat_v4_affinity_key affkey = {}; + struct calico_nat_affinity_key affkey = {}; __u64 now = 0; - nat_lv1_val = cali_v4_nat_fe_lookup_elem(&nat_key); + nat_lv1_val = cali_nat_fe_lookup_elem(&nat_key); switch (nat_key.protocol) { case IPPROTO_UDP: - CALI_DEBUG("NAT: 1st level lookup addr=%x port=%d udp\n", (int)bpf_ntohl(nat_key.addr), (int)dport); + CALI_DEBUG("NAT: 1st level lookup addr=%x port=%d udp\n", (int)debug_ip(nat_key.addr), (int)dport); break; case IPPROTO_TCP: - CALI_DEBUG("NAT: 1st level lookup addr=%x port=%d tcp\n", (int)bpf_ntohl(nat_key.addr), (int)dport); + CALI_DEBUG("NAT: 1st level lookup addr=%x port=%d tcp\n", (int)debug_ip(nat_key.addr), (int)dport); break; case IPPROTO_ICMP: - CALI_DEBUG("NAT: 1st level lookup 
addr=%x port=%d icmp\n", (int)bpf_ntohl(nat_key.addr), (int)dport); + CALI_DEBUG("NAT: 1st level lookup addr=%x port=%d icmp\n", (int)debug_ip(nat_key.addr), (int)dport); break; default: - CALI_DEBUG("NAT: 1st level lookup addr=%x port=%d other\n", (int)bpf_ntohl(nat_key.addr), (int)dport); + CALI_DEBUG("NAT: 1st level lookup addr=%x port=%d other\n", (int)debug_ip(nat_key.addr), (int)dport); break; } @@ -66,7 +66,7 @@ static CALI_BPF_INLINE struct calico_nat_dest* calico_v4_nat_lookup(__be32 ip_sr * straight NAT and avoid a possible extra hop. */ if (!(CALI_F_FROM_WEP || CALI_F_TO_HEP || CALI_F_CGROUP || - (CALI_F_FROM_HEP && from_tun)) || ip_dst == 0xffffffff) { + (CALI_F_FROM_HEP && from_tun)) || ip_equal(*ip_dst, NP_SPECIAL_IP)) { return NULL; } @@ -96,8 +96,8 @@ static CALI_BPF_INLINE struct calico_nat_dest* calico_v4_nat_lookup(__be32 ip_sr return NULL; } - nat_key.addr = 0xffffffff; - nat_lv1_val = cali_v4_nat_fe_lookup_elem(&nat_key); + nat_key.addr = NP_SPECIAL_IP; + nat_lv1_val = cali_nat_fe_lookup_elem(&nat_key); if (!nat_lv1_val) { CALI_DEBUG("NAT: nodeport miss\n"); return NULL; @@ -105,7 +105,7 @@ static CALI_BPF_INLINE struct calico_nat_dest* calico_v4_nat_lookup(__be32 ip_sr CALI_DEBUG("NAT: nodeport hit\n"); } /* With LB source range, we install a drop entry in the NAT FE map - * with count equal to 0xffffffff. If we hit this entry, + * with count equal to all-ones for both ip4/6. If we hit this entry, * packet is dropped. */ if (nat_lv1_val->count == NAT_FE_DROP_COUNT) { @@ -147,34 +147,34 @@ static CALI_BPF_INLINE struct calico_nat_dest* calico_v4_nat_lookup(__be32 ip_sr goto skip_affinity; } - struct calico_nat_v4 nat_data = { - .addr = ip_dst, + struct calico_nat nat_data = { + .addr = *ip_dst, .port = dport, .protocol = ip_proto, }; affkey.nat_key = nat_data; - affkey.client_ip = ip_src; + affkey.client_ip = *ip_src; CALI_DEBUG("NAT: backend affinity %d seconds\n", nat_lv1_val->affinity_timeo ? 
: affinity_always_timeo); - struct calico_nat_v4_affinity_val *affval; + struct calico_nat_affinity_val *affval; now = bpf_ktime_get_ns(); - affval = cali_v4_nat_aff_lookup_elem(&affkey); + affval = cali_nat_aff_lookup_elem(&affkey); if (affval) { int timeo = (affinity_always_timeo ? : nat_lv1_val->affinity_timeo); if (now - affval->ts <= timeo * 1000000000ULL) { CALI_DEBUG("NAT: using affinity backend %x:%d\n", - bpf_ntohl(affval->nat_dest.addr), affval->nat_dest.port); + debug_ip(affval->nat_dest.addr), affval->nat_dest.port); if (affinity_tmr_update) { affval->ts = now; } return &affval->nat_dest; } - CALI_DEBUG("NAT: affinity expired for %x:%d\n", bpf_ntohl(ip_dst), dport); + CALI_DEBUG("NAT: affinity expired for %x:%d\n", debug_ip(*ip_dst), dport); } else { - CALI_DEBUG("no previous affinity for %x:%d", bpf_ntohl(ip_dst), dport); + CALI_DEBUG("no previous affinity for %x:%d", debug_ip(*ip_dst), dport); } /* To be k8s conformant, fall through to pick a random backend. */ @@ -185,23 +185,23 @@ static CALI_BPF_INLINE struct calico_nat_dest* calico_v4_nat_lookup(__be32 ip_sr CALI_DEBUG("NAT: 1st level hit; id=%d ordinal=%d\n", nat_lv2_key.id, nat_lv2_key.ordinal); - if (!(nat_lv2_val = cali_v4_nat_be_lookup_elem(&nat_lv2_key))) { + if (!(nat_lv2_val = cali_nat_be_lookup_elem(&nat_lv2_key))) { CALI_DEBUG("NAT: backend miss\n"); *res = NAT_NO_BACKEND; return NULL; } - CALI_DEBUG("NAT: backend selected %x:%d\n", bpf_ntohl(nat_lv2_val->addr), nat_lv2_val->port); + CALI_DEBUG("NAT: backend selected %x:%d\n", debug_ip(nat_lv2_val->addr), nat_lv2_val->port); if (nat_lv1_val->affinity_timeo != 0 || affinity_always_timeo) { int err; - struct calico_nat_v4_affinity_val val = { + struct calico_nat_affinity_val val = { .ts = now, .nat_dest = *nat_lv2_val, }; - CALI_DEBUG("NAT: updating affinity for client %x\n", bpf_ntohl(ip_src)); - if ((err = cali_v4_nat_aff_update_elem(&affkey, &val, BPF_ANY))) { + CALI_DEBUG("NAT: updating affinity for client %x\n", debug_ip(*ip_src)); + 
if ((err = cali_nat_aff_update_elem(&affkey, &val, BPF_ANY))) { CALI_INFO("NAT: failed to update affinity table: %d\n", err); /* we do carry on, we have a good nat_lv2_val */ } @@ -211,13 +211,13 @@ static CALI_BPF_INLINE struct calico_nat_dest* calico_v4_nat_lookup(__be32 ip_sr } #if !(CALI_F_XDP) && !(CALI_F_CGROUP) -static CALI_BPF_INLINE struct calico_nat_dest* calico_v4_nat_lookup_tc(struct cali_tc_ctx *ctx, - __be32 ip_src, __be32 ip_dst, - __u8 ip_proto, __u16 dport, - bool from_tun, - nat_lookup_result *res) +static CALI_BPF_INLINE struct calico_nat_dest* calico_nat_lookup_tc(struct cali_tc_ctx *ctx, + ipv46_addr_t *ip_src, ipv46_addr_t *ip_dst, + __u8 ip_proto, __u16 dport, + bool from_tun, + nat_lookup_result *res) { - return calico_v4_nat_lookup(ip_src, ip_dst, ip_proto, dport, from_tun, res, 0, false, ctx); + return calico_nat_lookup(ip_src, ip_dst, ip_proto, dport, from_tun, res, 0, false, ctx); } #endif diff --git a/felix/bpf-gpl/nat_types.h b/felix/bpf-gpl/nat_types.h index 3c47a6d72ec..22996b9fbe1 100644 --- a/felix/bpf-gpl/nat_types.h +++ b/felix/bpf-gpl/nat_types.h @@ -14,8 +14,8 @@ typedef enum calico_nat_lookup_result { } nat_lookup_result; -struct calico_nat_v4 { - __u32 addr; // NBO +struct calico_nat { + ipv46_addr_t addr; // NBO __u16 port; // HBO __u8 protocol; }; @@ -24,31 +24,31 @@ struct calico_nat_v4 { * Modified the map from HASH to LPM_TRIE. This is to drop packets outside * src IP range specified for Load Balancer */ -struct __attribute__((__packed__)) calico_nat_v4_key { +struct __attribute__((__packed__)) calico_nat_key { __u32 prefixlen; - __u32 addr; // NBO + ipv46_addr_t addr; // NBO __u16 port; // HBO __u8 protocol; - __u32 saddr; + ipv46_addr_t saddr; __u8 pad; }; /* Prefix len = (dst_addr + port + protocol + src_addr) in bits. 
*/ -#define NAT_PREFIX_LEN_WITH_SRC_MATCH (sizeof(struct calico_nat_v4_key) - \ - sizeof(((struct calico_nat_v4_key*)0)->prefixlen) - \ - sizeof(((struct calico_nat_v4_key*)0)->pad)) +#define NAT_PREFIX_LEN_WITH_SRC_MATCH (sizeof(struct calico_nat_key) - \ + sizeof(((struct calico_nat_key*)0)->prefixlen) - \ + sizeof(((struct calico_nat_key*)0)->pad)) #define NAT_PREFIX_LEN_WITH_SRC_MATCH_IN_BITS (NAT_PREFIX_LEN_WITH_SRC_MATCH * 8) // This is used as a special ID along with count=0 to drop a packet at nat level1 lookup #define NAT_FE_DROP_COUNT 0xffffffff -union calico_nat_v4_lpm_key { +union calico_nat_lpm_key { struct bpf_lpm_trie_key lpm; - struct calico_nat_v4_key key; + struct calico_nat_key key; }; -struct calico_nat_v4_value { +struct calico_nat_value { __u32 id; __u32 count; __u32 local; @@ -59,45 +59,60 @@ struct calico_nat_v4_value { #define NAT_FLG_EXTERNAL_LOCAL 0x1 #define NAT_FLG_INTERNAL_LOCAL 0x2 -CALI_MAP(cali_v4_nat_fe, 3, +#ifdef IPVER6 +CALI_MAP_NAMED(cali_v6_nat_fe, cali_nat_fe, 3, +#else +CALI_MAP_NAMED(cali_v4_nat_fe, cali_nat_fe, 3, +#endif BPF_MAP_TYPE_LPM_TRIE, - union calico_nat_v4_lpm_key, struct calico_nat_v4_value, + union calico_nat_lpm_key, struct calico_nat_value, 64*1024, BPF_F_NO_PREALLOC) // Map: NAT level two. ID and ordinal -> new dest and port. 
-struct calico_nat_secondary_v4_key { +struct calico_nat_secondary_key { __u32 id; __u32 ordinal; }; struct calico_nat_dest { - __u32 addr; + ipv46_addr_t addr; __u16 port; __u8 pad[2]; }; -CALI_MAP_V1(cali_v4_nat_be, +#ifdef IPVER6 +CALI_MAP_NAMED(cali_v6_nat_be, cali_nat_be,, +#else +CALI_MAP_NAMED(cali_v4_nat_be, cali_nat_be,, +#endif BPF_MAP_TYPE_HASH, - struct calico_nat_secondary_v4_key, struct calico_nat_dest, + struct calico_nat_secondary_key, struct calico_nat_dest, 256*1024, BPF_F_NO_PREALLOC) -struct calico_nat_v4_affinity_key { - struct calico_nat_v4 nat_key; - __u32 client_ip; +struct calico_nat_affinity_key { + struct calico_nat nat_key; + ipv46_addr_t client_ip; __u32 padding; }; -struct calico_nat_v4_affinity_val { +struct calico_nat_affinity_val { struct calico_nat_dest nat_dest; +#ifdef IPVER6 + __u32 __pad; +#endif __u64 ts; }; -CALI_MAP_V1(cali_v4_nat_aff, +#ifdef IPVER6 +CALI_MAP_NAMED(cali_v6_nat_aff, cali_nat_aff,, +#else +CALI_MAP_NAMED(cali_v4_nat_aff, cali_nat_aff,, +#endif BPF_MAP_TYPE_LRU_HASH, - struct calico_nat_v4_affinity_key, struct calico_nat_v4_affinity_val, + struct calico_nat_affinity_key, struct calico_nat_affinity_val, 64*1024, 0) struct vxlanhdr { diff --git a/felix/bpf-gpl/parsing.h b/felix/bpf-gpl/parsing.h index 948fa0251d5..41103faf926 100644 --- a/felix/bpf-gpl/parsing.h +++ b/felix/bpf-gpl/parsing.h @@ -9,6 +9,7 @@ #include #include +#include "types.h" #include "skb.h" #include "routes.h" @@ -17,127 +18,61 @@ #define PARSING_ALLOW_WITHOUT_ENFORCING_POLICY 2 #define PARSING_ERROR -1 -static CALI_BPF_INLINE int bpf_load_bytes(struct cali_tc_ctx *ctx, __u32 offset, void *buf, __u32 len) -{ - int ret; +static CALI_BPF_INLINE int bpf_load_bytes(struct cali_tc_ctx *ctx, __u32 offset, void *buf, __u32 len); - if (CALI_F_XDP) { -#ifdef BPF_CORE_SUPPORTED - if (bpf_core_enum_value_exists(enum bpf_func_id, BPF_FUNC_xdp_load_bytes)) { - ret = bpf_xdp_load_bytes(ctx->xdp, offset, buf, len); - } else +#ifdef IPVER6 +#include 
"parsing6.h" +#else +#include "parsing4.h" #endif - { - return -22 /* EINVAL */; - } - } else { - ret = bpf_skb_load_bytes(ctx->skb, offset, buf, len); - } - return ret; +#ifdef IPVER6 +static CALI_BPF_INLINE int parse_packet_ip(struct cali_tc_ctx *ctx) +{ + return parse_packet_ip_v6(ctx); } -static CALI_BPF_INLINE int parse_packet_ip(struct cali_tc_ctx *ctx) { - __u16 protocol = 0; - - /* We need to make a decision based on Ethernet protocol, however, - * the protocol number is not available to XDP programs like TC ones. - * In TC programs protocol number is available via skb->protocol. - * For that, in XDP programs we need to parse at least up to Ethernet - * first, before making any decision. But in TC programs we can make - * an initial decision based on Ethernet protocol before parsing packet - * for more headers. - */ - if (CALI_F_XDP) { - if (skb_refresh_validate_ptrs(ctx, UDP_SIZE)) { - deny_reason(ctx, CALI_REASON_SHORT); - CALI_DEBUG("Too short\n"); - goto deny; - } - protocol = bpf_ntohs(eth_hdr(ctx)->h_proto); - } else { - protocol = bpf_ntohs(ctx->skb->protocol); - } +static CALI_BPF_INLINE void tc_state_fill_from_iphdr(struct cali_tc_ctx *ctx) +{ + return tc_state_fill_from_iphdr_v6(ctx); +} +#else +static CALI_BPF_INLINE int parse_packet_ip(struct cali_tc_ctx *ctx) +{ + return parse_packet_ip_v4(ctx); +} - switch (protocol) { - case ETH_P_IP: - break; - case ETH_P_ARP: - CALI_DEBUG("ARP: allowing packet\n"); - goto allow_no_fib; - case ETH_P_IPV6: - // If IPv6 is supported and enabled, handle the packet - if (GLOBAL_FLAGS & CALI_GLOBALS_IPV6_ENABLED) { - CALI_DEBUG("IPv6 packet, continue with parsing it.\n"); - goto ipv6_packet; - } - // otherwise, drop if the packet is from workload - if (CALI_F_WEP) { - CALI_DEBUG("IPv6 from workload: drop\n"); - goto deny; - } else { // or allow, it the packet is on host interface - CALI_DEBUG("IPv6 on host interface: allow\n"); - goto allow_no_fib; - } - default: - if (CALI_F_WEP) { - CALI_DEBUG("Unknown 
ethertype (%x), drop\n", protocol); - goto deny; - } else { - CALI_DEBUG("Unknown ethertype on host interface (%x), allow\n", - protocol); - goto allow_no_fib; - } - } +static CALI_BPF_INLINE void tc_state_fill_from_iphdr(struct cali_tc_ctx *ctx) +{ + return tc_state_fill_from_iphdr_v4(ctx); +} +#endif - // In TC programs, parse packet and validate its size. This is - // already done for XDP programs at the beginning of the function. - if (!CALI_F_XDP) { - if (skb_refresh_validate_ptrs(ctx, UDP_SIZE)) { - deny_reason(ctx, CALI_REASON_SHORT); - CALI_DEBUG("Too short\n"); - goto deny; - } - } +static CALI_BPF_INLINE int bpf_load_bytes(struct cali_tc_ctx *ctx, __u32 offset, void *buf, __u32 len) +{ + int ret; - CALI_DEBUG("IP id=%d\n",bpf_ntohs(ip_hdr(ctx)->id)); - CALI_DEBUG("IP s=%x d=%x\n", bpf_ntohl(ip_hdr(ctx)->saddr), bpf_ntohl(ip_hdr(ctx)->daddr)); - // Drop malformed IP packets - if (ip_hdr(ctx)->ihl < 5) { - CALI_DEBUG("Drop malformed IP packets\n"); - deny_reason(ctx, CALI_REASON_IP_MALFORMED); - goto deny; +#if CALI_F_XDP +#ifdef BPF_CORE_SUPPORTED + if (bpf_core_enum_value_exists(enum bpf_func_id, BPF_FUNC_xdp_load_bytes)) { + ret = bpf_xdp_load_bytes(ctx->xdp, offset, buf, len); + } else +#endif + { + return -22 /* EINVAL */; } +#else /* CALI_F_XDP */ + ret = bpf_skb_load_bytes(ctx->skb, offset, buf, len); +#endif /* CALI_F_XDP */ - return PARSING_OK; - -ipv6_packet: - // Parse IPv6 header, and perform necessary checks here - return PARSING_OK_V6; - -allow_no_fib: - return PARSING_ALLOW_WITHOUT_ENFORCING_POLICY; - -deny: - return PARSING_ERROR; -} - -static CALI_BPF_INLINE void tc_state_fill_from_iphdr(struct cali_tc_ctx *ctx) -{ - ctx->state->ip_src = ip_hdr(ctx)->saddr; - ctx->state->ip_dst = ip_hdr(ctx)->daddr; - ctx->state->pre_nat_ip_dst = ip_hdr(ctx)->daddr; - ctx->state->ip_proto = ip_hdr(ctx)->protocol; - ctx->state->ip_size = ip_hdr(ctx)->tot_len; - ctx->ipheader_len = ctx->state->ihl = ip_hdr(ctx)->ihl * 4; - CALI_DEBUG("IP ihl=%d bytes\n", 
ctx->ipheader_len); + return ret; } /* Continue parsing packet based on the IP protocol and fill in relevant fields * in the state (struct cali_tc_state). */ static CALI_BPF_INLINE int tc_state_fill_from_nexthdr(struct cali_tc_ctx *ctx, bool decap) { - if (ip_hdr(ctx)->ihl == 5) { + if (ctx->ipheader_len == 20) { switch (ctx->state->ip_proto) { case IPPROTO_TCP: if (skb_refresh_validate_ptrs(ctx, TCP_SIZE)) { @@ -226,8 +161,8 @@ static CALI_BPF_INLINE int tc_state_fill_from_nexthdr(struct cali_tc_ctx *ctx, b /* CALI_F_FROM_HEP case is handled in vxlan_attempt_decap above since it already decoded * the header. */ if (CALI_F_TO_HEP) { - if (rt_addr_is_remote_host(ctx->state->ip_dst) && - rt_addr_is_local_host(ctx->state->ip_src)) { + if (rt_addr_is_remote_host(&ctx->state->ip_dst) && + rt_addr_is_local_host(&ctx->state->ip_src)) { CALI_DEBUG("VXLAN packet to known Calico host, allow.\n"); goto allow; } else { @@ -253,7 +188,7 @@ static CALI_BPF_INLINE int tc_state_fill_from_nexthdr(struct cali_tc_ctx *ctx, b goto deny; } if (CALI_F_FROM_HEP) { - if (rt_addr_is_remote_host(ctx->state->ip_src)) { + if (rt_addr_is_remote_host(&ctx->state->ip_src)) { CALI_DEBUG("IPIP packet from known Calico host, allow.\n"); goto allow; } else { @@ -262,7 +197,7 @@ static CALI_BPF_INLINE int tc_state_fill_from_nexthdr(struct cali_tc_ctx *ctx, b goto deny; } } else if (CALI_F_TO_HEP && !CALI_F_TUNNEL && !CALI_F_L3_DEV) { - if (rt_addr_is_remote_host(ctx->state->ip_dst)) { + if (rt_addr_is_remote_host(&ctx->state->ip_dst)) { CALI_DEBUG("IPIP packet to known Calico host, allow.\n"); goto allow; } else { @@ -290,4 +225,5 @@ static CALI_BPF_INLINE int tc_state_fill_from_nexthdr(struct cali_tc_ctx *ctx, b return PARSING_ERROR; } + #endif /* __CALI_PARSING_H__ */ diff --git a/felix/bpf-gpl/parsing4.h b/felix/bpf-gpl/parsing4.h new file mode 100644 index 00000000000..1662ee6d4c7 --- /dev/null +++ b/felix/bpf-gpl/parsing4.h @@ -0,0 +1,105 @@ +// Project Calico BPF dataplane programs. 
+// Copyright (c) 2020-2022 Tigera, Inc. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later + +#ifndef __CALI_PARSING4_H__ +#define __CALI_PARSING4_H__ + +static CALI_BPF_INLINE int parse_packet_ip_v4(struct cali_tc_ctx *ctx) +{ + __u16 protocol = 0; + + /* We need to make a decision based on Ethernet protocol, however, + * the protocol number is not available to XDP programs like TC ones. + * In TC programs protocol number is available via skb->protocol. + * For that, in XDP programs we need to parse at least up to Ethernet + * first, before making any decision. But in TC programs we can make + * an initial decision based on Ethernet protocol before parsing packet + * for more headers. + */ +#if CALI_F_XDP + if (skb_refresh_validate_ptrs(ctx, UDP_SIZE)) { + deny_reason(ctx, CALI_REASON_SHORT); + CALI_DEBUG("Too short\n"); + goto deny; + } + protocol = bpf_ntohs(eth_hdr(ctx)->h_proto); +#else + protocol = bpf_ntohs(ctx->skb->protocol); +#endif + + switch (protocol) { + case ETH_P_IP: + break; + case ETH_P_ARP: + CALI_DEBUG("ARP: allowing packet\n"); + goto allow_no_fib; + case ETH_P_IPV6: + // If IPv6 is supported and enabled, handle the packet + if (GLOBAL_FLAGS & CALI_GLOBALS_IPV6_ENABLED) { + CALI_DEBUG("IPv6 packet, continue with parsing it.\n"); + goto ipv6_packet; + } + // otherwise, drop if the packet is from workload + if (CALI_F_WEP) { + CALI_DEBUG("IPv6 from workload: drop\n"); + goto deny; + } else { // or allow, if the packet is on host interface + CALI_DEBUG("IPv6 on host interface: allow\n"); + goto allow_no_fib; + } + default: + if (CALI_F_WEP) { + CALI_DEBUG("Unknown ethertype (%x), drop\n", protocol); + goto deny; + } else { + CALI_DEBUG("Unknown ethertype on host interface (%x), allow\n", + protocol); + goto allow_no_fib; + } + } + + // In TC programs, parse packet and validate its size. This is + // already done for XDP programs at the beginning of the function. 
+#if !CALI_F_XDP + if (skb_refresh_validate_ptrs(ctx, UDP_SIZE)) { + deny_reason(ctx, CALI_REASON_SHORT); + CALI_DEBUG("Too short\n"); + goto deny; + } +#endif + + CALI_DEBUG("IP id=%d\n",bpf_ntohs(ip_hdr(ctx)->id)); + CALI_DEBUG("IP s=%x d=%x\n", bpf_ntohl(ip_hdr(ctx)->saddr), bpf_ntohl(ip_hdr(ctx)->daddr)); + // Drop malformed IP packets + if (ip_hdr(ctx)->ihl < 5) { + CALI_DEBUG("Drop malformed IP packets\n"); + deny_reason(ctx, CALI_REASON_IP_MALFORMED); + goto deny; + } + + return PARSING_OK; + +ipv6_packet: + // Parse IPv6 header, and perform necessary checks here + return PARSING_OK_V6; + +allow_no_fib: + return PARSING_ALLOW_WITHOUT_ENFORCING_POLICY; + +deny: + return PARSING_ERROR; +} + +static CALI_BPF_INLINE void tc_state_fill_from_iphdr_v4(struct cali_tc_ctx *ctx) +{ + ctx->state->ip_src = ip_hdr(ctx)->saddr; + ctx->state->ip_dst = ip_hdr(ctx)->daddr; + ctx->state->pre_nat_ip_dst = ip_hdr(ctx)->daddr; + ctx->state->ip_proto = ip_hdr(ctx)->protocol; + ctx->state->ip_size = ip_hdr(ctx)->tot_len; + ctx->ipheader_len = ctx->state->ihl = ip_hdr(ctx)->ihl * 4; + CALI_DEBUG("IP ihl=%d bytes\n", ctx->ipheader_len); +} + +#endif /* __CALI_PARSING4_H__ */ diff --git a/felix/bpf-gpl/parsing6.h b/felix/bpf-gpl/parsing6.h new file mode 100644 index 00000000000..85fd3f3a3fc --- /dev/null +++ b/felix/bpf-gpl/parsing6.h @@ -0,0 +1,176 @@ +// Project Calico BPF dataplane programs. +// Copyright (c) 2023 Tigera, Inc. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later + +#ifndef __CALI_PARSING6_H__ +#define __CALI_PARSING6_H__ + +#define NEXTHDR_HOP 0 +#define NEXTHDR_ROUTING 43 +#define NEXTHDR_FRAGMENT 44 +#define NEXTHDR_GRE 47 +#define NEXTHDR_ESP 50 +#define NEXTHDR_AUTH 51 +#define NEXTHDR_NONE 59 +#define NEXTHDR_DEST 60 +#define NEXTHDR_MOBILITY 135 + + +static CALI_BPF_INLINE int parse_packet_ip_v6(struct cali_tc_ctx *ctx) { + __u16 protocol = 0; + + /* We need to make a decision based on Ethernet protocol, however, + * the protocol number is not available to XDP programs like TC ones. + * In TC programs protocol number is available via skb->protocol. + * For that, in XDP programs we need to parse at least up to Ethernet + * first, before making any decision. But in TC programs we can make + * an initial decision based on Ethernet protocol before parsing packet + * for more headers. + */ + if (CALI_F_XDP) { + if (skb_refresh_validate_ptrs(ctx, UDP_SIZE)) { + deny_reason(ctx, CALI_REASON_SHORT); + CALI_DEBUG("Too short\n"); + goto deny; + } + protocol = bpf_ntohs(eth_hdr(ctx)->h_proto); + } else { + protocol = bpf_ntohs(ctx->skb->protocol); + } + + switch (protocol) { + case ETH_P_IPV6: + break; + default: + if (CALI_F_WEP) { + CALI_DEBUG("Unknown ethertype (%x), drop\n", protocol); + goto deny; + } else { + CALI_DEBUG("Unknown ethertype on host interface (%x), allow\n", + protocol); + goto allow_no_fib; + } + } + + // In TC programs, parse packet and validate its size. This is + // already done for XDP programs at the beginning of the function. 
+ if (!CALI_F_XDP) { + if (skb_refresh_validate_ptrs(ctx, UDP_SIZE)) { + deny_reason(ctx, CALI_REASON_SHORT); + CALI_DEBUG("Too short\n"); + goto deny; + } + } + + return PARSING_OK_V6; + +allow_no_fib: + return PARSING_ALLOW_WITHOUT_ENFORCING_POLICY; + +deny: + return PARSING_ERROR; +} + +static CALI_BPF_INLINE bool ipv6_hexthdr_is_opt(int nexthdr) +{ + switch(nexthdr) { + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_FRAGMENT: + case NEXTHDR_GRE: + case NEXTHDR_ESP: + case NEXTHDR_AUTH: + case NEXTHDR_NONE: + case NEXTHDR_DEST: + case NEXTHDR_MOBILITY: + return true; + } + + return false; +} +/* Copy src/dst/payload length from the IPv6 header into ctx->state, then walk up to 8 extension headers to set ip_proto and ipheader_len/ihl; exits the program with a drop verdict on NEXTHDR_NONE or a failed load. */ +static CALI_BPF_INLINE void tc_state_fill_from_iphdr_v6(struct cali_tc_ctx *ctx) +{ + // Fill in source ip + ipv6hdr_ip_to_ipv6_addr_t(&ctx->state->ip_src, &ip_hdr(ctx)->saddr); + // Fill in dst ip + ipv6hdr_ip_to_ipv6_addr_t(&ctx->state->ip_dst, &ip_hdr(ctx)->daddr); + // Fill in pre nat ip + ctx->state->pre_nat_ip_dst = ctx->state->ip_dst; + // Fill in other information + ctx->state->ip_size = ip_hdr(ctx)->payload_len; + + int hdr; + + switch (ip_hdr(ctx)->nexthdr) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_ICMPV6: + ctx->ipheader_len = ctx->state->ihl = IP_SIZE; + ctx->state->ip_proto = ip_hdr(ctx)->nexthdr; + goto out; + case NEXTHDR_NONE: + goto deny; + default: + hdr = ip_hdr(ctx)->nexthdr; + } + + CALI_DEBUG("ip->nexthdr %d IPv6 options!\n", ip_hdr(ctx)->nexthdr); + + int i; + int ipoff = skb_iphdr_offset(ctx); + int len = IP_SIZE; + + for (i = 0; i < 8; i++) { + struct ipv6_opt_hdr opt; + + CALI_DEBUG("loading extension at offset %d\n", ipoff + len); + if (bpf_load_bytes(ctx, ipoff + len, &opt, sizeof(opt))) { + CALI_DEBUG("Too short\n"); + goto deny; + } + + CALI_DEBUG("ext nexthdr %d hdrlen %d\n", opt.nexthdr, opt.hdrlen); + + switch(hdr) { + case NEXTHDR_FRAGMENT: + len += 8; /* the Fragment header is a fixed 8 octets and has no length field */ + break; + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_DEST: + case NEXTHDR_GRE: + case NEXTHDR_ESP: + case NEXTHDR_AUTH: + case NEXTHDR_MOBILITY: + 
len += (opt.hdrlen + 1) * 8; + break; + } + + switch(opt.nexthdr) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_ICMPV6: + ctx->ipheader_len = ctx->state->ihl = len; + ctx->state->ip_proto = opt.nexthdr; + goto out; + case NEXTHDR_NONE: + goto deny; + } + /* The header just loaded names the type of the one the next iteration loads at ipoff + len. */ + hdr = opt.nexthdr; + } + +out: + CALI_DEBUG("IP ihl=%d bytes\n", ctx->ipheader_len); + return; + +deny: + if (CALI_F_XDP) { + bpf_exit(XDP_DROP); + } else { + bpf_exit(TC_ACT_SHOT); + } +} + +#endif /* __CALI_PARSING6_H__ */ diff --git a/felix/bpf-gpl/routes.h b/felix/bpf-gpl/routes.h index 8e51d764053..00804899f1b 100644 --- a/felix/bpf-gpl/routes.h +++ b/felix/bpf-gpl/routes.h @@ -12,7 +12,7 @@ struct cali_rt_key { __u32 prefixlen; - __be32 addr; // NBO + ipv46_addr_t addr; // NBO }; union cali_rt_lpm_key { @@ -36,26 +36,34 @@ struct cali_rt { __u32 flags; /* enum cali_rt_flags */ union { // IP encap next hop for remote workload routes. - __u32 next_hop; + ipv46_addr_t next_hop; // Interface index for local workload routes. __u32 if_index; }; }; -CALI_MAP_V1(cali_v4_routes, +#ifdef IPVER6 +CALI_MAP_NAMED(cali_v6_routes, cali_routes,, +#else +CALI_MAP_NAMED(cali_v4_routes, cali_routes,, +#endif BPF_MAP_TYPE_LPM_TRIE, union cali_rt_lpm_key, struct cali_rt, 256*1024, BPF_F_NO_PREALLOC) -static CALI_BPF_INLINE struct cali_rt *cali_rt_lookup(__be32 addr) +static CALI_BPF_INLINE struct cali_rt *cali_rt_lookup(ipv46_addr_t *addr) { union cali_rt_lpm_key k; +#ifdef IPVER6 + k.key.prefixlen = 128; +#else k.key.prefixlen = 32; - k.key.addr = addr; - return cali_v4_routes_lookup_elem(&k); +#endif + k.key.addr = *addr; + return cali_routes_lookup_elem(&k); } -static CALI_BPF_INLINE enum cali_rt_flags cali_rt_lookup_flags(__be32 addr) +static CALI_BPF_INLINE enum cali_rt_flags cali_rt_lookup_flags(ipv46_addr_t *addr) { struct cali_rt *rt = cali_rt_lookup(addr); if (!rt) { @@ -77,22 +85,22 @@ static CALI_BPF_INLINE enum cali_rt_flags cali_rt_lookup_flags(__be32 addr) #define cali_rt_flags_remote_tunneled_host(t) (((t) & 
(CALI_RT_LOCAL | CALI_RT_HOST | CALI_RT_TUNNELED)) == (CALI_RT_HOST | CALI_RT_TUNNELED)) #define cali_rt_flags_local_tunneled_host(t) (((t) & (CALI_RT_LOCAL | CALI_RT_HOST | CALI_RT_TUNNELED)) == (CALI_RT_LOCAL | CALI_RT_HOST | CALI_RT_TUNNELED)) -static CALI_BPF_INLINE bool rt_addr_is_local_host(__be32 addr) +static CALI_BPF_INLINE bool rt_addr_is_local_host(ipv46_addr_t *addr) { return cali_rt_flags_local_host(cali_rt_lookup_flags(addr)); } -static CALI_BPF_INLINE bool rt_addr_is_remote_host(__be32 addr) +static CALI_BPF_INLINE bool rt_addr_is_remote_host(ipv46_addr_t *addr) { return cali_rt_flags_remote_host(cali_rt_lookup_flags(addr)); } -static CALI_BPF_INLINE bool rt_addr_is_remote_tunneled_host(__be32 addr) +static CALI_BPF_INLINE bool rt_addr_is_remote_tunneled_host(ipv46_addr_t *addr) { return cali_rt_flags_remote_tunneled_host(cali_rt_lookup_flags(addr)); } -static CALI_BPF_INLINE bool rt_addr_is_local_tunneled_host(__be32 addr) +static CALI_BPF_INLINE bool rt_addr_is_local_tunneled_host(ipv46_addr_t *addr) { return cali_rt_flags_local_tunneled_host(cali_rt_lookup_flags(addr)); } diff --git a/felix/bpf-gpl/rpf.h b/felix/bpf-gpl/rpf.h index 5d5933d406c..5fc95de123f 100644 --- a/felix/bpf-gpl/rpf.h +++ b/felix/bpf-gpl/rpf.h @@ -7,11 +7,12 @@ #include "types.h" #include "skb.h" +#include "routes.h" static CALI_BPF_INLINE bool wep_rpf_check(struct cali_tc_ctx *ctx, struct cali_rt *r) { CALI_DEBUG("Workload RPF check src=%x skb iface=%d.\n", - bpf_ntohl(ctx->state->ip_src), ctx->skb->ifindex); + debug_ip(ctx->state->ip_src), ctx->skb->ifindex); if (!r) { CALI_INFO("Workload RPF fail: missing route.\n"); return false; @@ -31,6 +32,9 @@ static CALI_BPF_INLINE bool wep_rpf_check(struct cali_tc_ctx *ctx, struct cali_r static CALI_BPF_INLINE bool hep_rpf_check(struct cali_tc_ctx *ctx) { +#ifdef IPVER6 + return true; +#else bool ret = false; bool strict; @@ -62,18 +66,19 @@ static CALI_BPF_INLINE bool hep_rpf_check(struct cali_tc_ctx *ctx) if (strict) { ret = 
ctx->skb->ingress_ifindex == fib_params.ifindex; CALI_DEBUG("Host RPF check src=%x skb strict if %d\n", - bpf_ntohl(ctx->state->ip_src), fib_params.ifindex); + debug_ip(ctx->state->ip_src), fib_params.ifindex); } else { ret = fib_params.ifindex != CT_INVALID_IFINDEX; CALI_DEBUG("Host RPF check src=%x skb loose if %d\n", - bpf_ntohl(ctx->state->ip_src), fib_params.ifindex); + debug_ip(ctx->state->ip_src), fib_params.ifindex); } } CALI_DEBUG("Host RPF check src=%x skb iface=%d\n", - bpf_ntohl(ctx->state->ip_src), ctx->skb->ifindex); + debug_ip(ctx->state->ip_src), ctx->skb->ifindex); CALI_DEBUG("Host RPF check rc %d result %d\n", rc, ret); return ret; +#endif } #endif /* __CALI_FIB_H__ */ diff --git a/felix/bpf-gpl/sendrecv.h b/felix/bpf-gpl/sendrecv.h index da2939e2083..1a69cb59ef2 100644 --- a/felix/bpf-gpl/sendrecv.h +++ b/felix/bpf-gpl/sendrecv.h @@ -5,33 +5,41 @@ #ifndef __SENDRECV_H__ #define __SENDRECV_H__ -struct sendrecv4_key { +struct sendrec_key { __u64 cookie; - __u32 ip; + ipv46_addr_t ip; __u32 port; /* because bpf_sock_addr uses 32bit and we would need padding */ }; -struct sendrecv4_val { - __u32 ip; +struct sendrec_val { + ipv46_addr_t ip; __u32 port; /* because bpf_sock_addr uses 32bit and we would need padding */ }; -CALI_MAP_V1(cali_v4_srmsg, +#ifdef IPVER6 +CALI_MAP_NAMED(cali_v6_srmsg, cali_srmsg,, +#else +CALI_MAP_NAMED(cali_v4_srmsg, cali_srmsg,, +#endif BPF_MAP_TYPE_LRU_HASH, - struct sendrecv4_key, struct sendrecv4_val, + struct sendrec_key, struct sendrec_val, 510000, 0) struct ct_nats_key { __u64 cookie; - __u32 ip; + ipv46_addr_t ip; __u32 port; /* because bpf_sock_addr uses 32bit */ __u8 proto; __u8 pad[7]; }; -CALI_MAP_V1(cali_v4_ct_nats, +#ifdef IPVER6 +CALI_MAP_NAMED(cali_v6_ct_nats, cali_ct_nats ,, +#else +CALI_MAP_NAMED(cali_v4_ct_nats, cali_ct_nats ,, +#endif BPF_MAP_TYPE_LRU_HASH, - struct ct_nats_key, struct sendrecv4_val, + struct ct_nats_key, struct sendrec_val, 10000, 0) static CALI_BPF_INLINE __u16 ctx_port_to_host(__u32 
port) diff --git a/felix/bpf-gpl/skb.h b/felix/bpf-gpl/skb.h index 758483eb078..cb66066bbb8 100644 --- a/felix/bpf-gpl/skb.h +++ b/felix/bpf-gpl/skb.h @@ -52,13 +52,13 @@ static CALI_BPF_INLINE void *skb_end_ptr(struct __sk_buff *skb) { * Fresh values are loaded using skb_start/end_ptr. */ static CALI_BPF_INLINE void skb_refresh_start_end(struct cali_tc_ctx *ctx) { - if (CALI_F_XDP) { - ctx->data_start = (void *)(long)ctx->xdp->data; - ctx->data_end = (void *)(long)ctx->xdp->data_end; - } else { - ctx->data_start = skb_start_ptr(ctx->skb); - ctx->data_end = skb_end_ptr(ctx->skb); - } +#if CALI_F_XDP + ctx->data_start = (void *)(long)ctx->xdp->data; + ctx->data_end = (void *)(long)ctx->xdp->data_end; +#else + ctx->data_start = skb_start_ptr(ctx->skb); + ctx->data_end = skb_end_ptr(ctx->skb); +#endif } /* skb_iphdr_offset returns the expected offset of the IP header for this type of program. @@ -92,17 +92,17 @@ static CALI_BPF_INLINE long skb_iphdr_offset(struct cali_tc_ctx *ctx) * - ctx->ip_header * - ctx->nh/tcp_header/udp_header/icmp_header. */ -static CALI_BPF_INLINE int skb_refresh_validate_ptrs(struct cali_tc_ctx *ctx, long nh_len) { +static CALI_BPF_INLINE int skb_refresh_validate_ptrs(struct cali_tc_ctx *ctx, long nh_len) +{ int min_size = skb_iphdr_offset(ctx) + IP_SIZE; skb_refresh_start_end(ctx); if (ctx->data_start + (min_size + nh_len) > ctx->data_end) { // This is an XDP program and there is not enough data for next header. - if (CALI_F_XDP) { - CALI_DEBUG("Too short to have %d bytes for next header\n", - min_size + nh_len); - return -2; - } - +#if CALI_F_XDP + CALI_DEBUG("Too short to have %d bytes for next header\n", + min_size + nh_len); + return -2; +#else // Try to pull in more data. Ideally enough for TCP, or, failing that, the // minimum we've been asked for. 
if (nh_len > TCP_SIZE || bpf_skb_pull_data(ctx->skb, min_size + TCP_SIZE)) { @@ -117,9 +117,11 @@ static CALI_BPF_INLINE int skb_refresh_validate_ptrs(struct cali_tc_ctx *ctx, lo if (ctx->data_start + (min_size + nh_len) > ctx->data_end) { return -2; } +#endif } // Success, refresh the ip_header/nh fields in the context. ctx->ip_header = ctx->data_start + skb_iphdr_offset(ctx); + return 0; } diff --git a/felix/bpf-gpl/tc.c b/felix/bpf-gpl/tc.c index 25eb8919171..82b557d8816 100644 --- a/felix/bpf-gpl/tc.c +++ b/felix/bpf-gpl/tc.c @@ -44,6 +44,8 @@ #define HAS_HOST_CONFLICT_PROG CALI_F_TO_HEP +#define STATE (ctx->state) + /* calico_tc_main is the main function used in all of the tc programs. It is specialised * for particular hook at build time based on the CALI_F build flags. */ @@ -72,7 +74,11 @@ int calico_tc_main(struct __sk_buff *skb) struct cali_tc_ctx *ctx = &_ctx; CALI_DEBUG("New packet at ifindex=%d; mark=%x\n", skb->ifindex, skb->mark); +#ifdef IPVER6 + parse_packet_ip_v6(ctx); +#else parse_packet_ip(ctx); +#endif CALI_DEBUG("Final result=ALLOW (%d). Bypass mark set.\n", CALI_REASON_BYPASS); } return TC_ACT_UNSPEC; @@ -156,15 +162,15 @@ int calico_tc_main(struct __sk_buff *skb) /* Parse the packet as far as the IP header; as a side-effect this validates the packet size * is large enough for UDP. */ switch (parse_packet_ip(ctx)) { +#ifdef IPVER6 + case PARSING_OK_V6: + // IPv6 Packet. + break; +#else case PARSING_OK: // IPv4 Packet. 
break; - case PARSING_OK_V6: - // An IPv6 packet, so we should jump to the relevant IPv6 programs - CALI_DEBUG("About to jump to IPv6 prologue program\n"); - CALI_JUMP_TO(ctx, PROG_INDEX_V6_PROLOGUE); - CALI_DEBUG("Jump to IPv6 prologue failed.\n"); - goto deny; +#endif case PARSING_ALLOW_WITHOUT_ENFORCING_POLICY: // A packet that we automatically let through fwd_fib_set(&ctx->fwd, false); @@ -182,10 +188,6 @@ int calico_tc_main(struct __sk_buff *skb) allow: finalize: return forward_or_drop(ctx); - -deny: - ctx->fwd.res = TC_ACT_SHOT; - goto finalize; } static CALI_BPF_INLINE int pre_policy_processing(struct cali_tc_ctx *ctx) @@ -204,7 +206,7 @@ static CALI_BPF_INLINE int pre_policy_processing(struct cali_tc_ctx *ctx) /* Now we've got as far as the UDP header, check if this is one of our VXLAN packets, which we * use to forward traffic for node ports. */ if (dnat_should_decap() /* Compile time: is this a BPF program that should decap packets? */ && - is_vxlan_tunnel(ip_hdr(ctx), VXLAN_PORT) /* Is this a VXLAN packet? */ ) { + is_vxlan_tunnel(ctx, VXLAN_PORT) /* Is this a VXLAN packet? */ ) { /* Decap it; vxlan_attempt_decap will revalidate the packet if needed. */ switch (vxlan_attempt_decap(ctx)) { case -1: @@ -297,7 +299,7 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) * IP stack do the RPF check on the source, dest is not important. 
*/ goto deny; - } else if (!wep_rpf_check(ctx, cali_rt_lookup(ctx->state->ip_src))) { + } else if (!wep_rpf_check(ctx, cali_rt_lookup(&ctx->state->ip_src))) { goto deny; } } @@ -379,10 +381,10 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) nat_lookup_result nat_res = NAT_LOOKUP_ALLOW; if (CALI_F_TO_HOST || (CALI_F_FROM_HOST && !skb_seen(ctx->skb) && !ctx->nat_dest /* no sport conflcit */)) { - ctx->nat_dest = calico_v4_nat_lookup_tc(ctx, - ctx->state->ip_src, ctx->state->ip_dst, - ctx->state->ip_proto, ctx->state->dport, - ctx->state->tun_ip != 0, &nat_res); + ctx->nat_dest = calico_nat_lookup_tc(ctx, + &ctx->state->ip_src, &ctx->state->ip_dst, + ctx->state->ip_proto, ctx->state->dport, + !ip_void(ctx->state->tun_ip), &nat_res); } if (nat_res == NAT_FE_LOOKUP_DROP) { @@ -397,7 +399,7 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) /* send icmp port unreachable if there is no backend for a service */ ctx->state->icmp_type = ICMP_DEST_UNREACH; ctx->state->icmp_code = ICMP_PORT_UNREACH; - ctx->state->tun_ip = 0; + ip_set_void(ctx->state->tun_ip); goto icmp_send_reply; } else { ctx->state->post_nat_ip_dst = ctx->state->ip_dst; @@ -407,14 +409,14 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) syn_force_policy: /* DNAT in state is set correctly now */ - if ((!(ctx->state->tun_ip) && CALI_F_FROM_HEP) && !CALI_F_NAT_IF && !CALI_F_LO) { + if ((ip_void(ctx->state->tun_ip) && CALI_F_FROM_HEP) && !CALI_F_NAT_IF && !CALI_F_LO) { if (!hep_rpf_check(ctx)) { goto deny; } } if (CALI_F_TO_WEP && !skb_seen(ctx->skb) && - cali_rt_flags_local_host(cali_rt_lookup_flags(ctx->state->ip_src))) { + cali_rt_flags_local_host(cali_rt_lookup_flags(&ctx->state->ip_src))) { /* Host to workload traffic always allowed. We discount traffic that was * seen by another program since it must have come in via another interface. 
*/ @@ -423,7 +425,7 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) } if (CALI_F_FROM_WEP) { - struct cali_rt *r = cali_rt_lookup(ctx->state->ip_src); + struct cali_rt *r = cali_rt_lookup(&ctx->state->ip_src); /* Do RPF check since it's our responsibility to police that. */ if (!wep_rpf_check(ctx, r)) { goto deny; @@ -431,7 +433,7 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) // Check whether the workload needs outgoing NAT to this address. if (r->flags & CALI_RT_NAT_OUT) { - if (!(cali_rt_lookup_flags(ctx->state->post_nat_ip_dst) & CALI_RT_IN_POOL)) { + if (!(cali_rt_lookup_flags(&ctx->state->post_nat_ip_dst) & CALI_RT_IN_POOL)) { CALI_DEBUG("Source is in NAT-outgoing pool " "but dest is not, need to SNAT.\n"); ctx->state->flags |= CALI_ST_NAT_OUTGOING; @@ -439,10 +441,10 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) } /* If 3rd party CNI is used and dest is outside cluster. See commit fc711b192f for details. 
*/ if (!(r->flags & CALI_RT_IN_POOL)) { - CALI_DEBUG("Source %x not in IP pool\n", bpf_ntohl(ctx->state->ip_src)); - r = cali_rt_lookup(ctx->state->post_nat_ip_dst); + CALI_DEBUG("Source %x not in IP pool\n", debug_ip(ctx->state->ip_src)); + r = cali_rt_lookup(&ctx->state->post_nat_ip_dst); if (!r || !(r->flags & (CALI_RT_WORKLOAD | CALI_RT_HOST))) { - CALI_DEBUG("Outside cluster dest %x\n", bpf_ntohl(ctx->state->post_nat_ip_dst)); + CALI_DEBUG("Outside cluster dest %x\n", debug_ip(ctx->state->post_nat_ip_dst)); ctx->state->flags |= CALI_ST_SKIP_FIB; } } @@ -462,7 +464,7 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) ctx->state->nat_dest.addr = ctx->nat_dest->addr; ctx->state->nat_dest.port = ctx->nat_dest->port; } else { - ctx->state->nat_dest.addr = 0; + ip_set_void(ctx->state->nat_dest.addr); ctx->state->nat_dest.port = 0; } @@ -482,15 +484,15 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) // If we didn't find a CTLB NAT entry then use the packet's own IP/port for the // pre-DNAT values that's set by tc_state_fill_from_iphdr() and // tc_state_fill_from_nextheader(). - struct sendrecv4_val *revnat = cali_v4_ct_nats_lookup_elem(&ct_nkey); + struct sendrec_val *revnat = cali_ct_nats_lookup_elem(&ct_nkey); if (revnat) { - CALI_DEBUG("Got cali_v4_ct_nats entry; flow was NATted by CTLB.\n"); + CALI_DEBUG("Got cali_ct_nats entry; flow was NATted by CTLB.\n"); ctx->state->pre_nat_ip_dst = revnat->ip; ctx->state->pre_nat_dport = ctx_port_to_host(revnat->port); } } - if (!forwarding && rt_addr_is_local_host(ctx->state->ip_src)) { + if (!forwarding && rt_addr_is_local_host(&ctx->state->ip_src)) { CALI_DEBUG("Source IP is local host.\n"); if (CALI_F_TO_HEP && is_failsafe_out(ctx->state->ip_proto, ctx->state->post_nat_dport, ctx->state->post_nat_ip_dst)) { CALI_DEBUG("Outbound failsafe port: %d. 
Skip policy.\n", ctx->state->post_nat_dport); @@ -500,10 +502,10 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) ctx->state->flags |= CALI_ST_SRC_IS_HOST; } - struct cali_rt *dest_rt = cali_rt_lookup(ctx->state->post_nat_ip_dst); + struct cali_rt *dest_rt = cali_rt_lookup(&ctx->state->post_nat_ip_dst); if (!dest_rt) { - CALI_DEBUG("No route for post DNAT dest %x\n", bpf_ntohl(ctx->state->post_nat_ip_dst)); + CALI_DEBUG("No route for post DNAT dest %x\n", debug_ip(ctx->state->post_nat_ip_dst)); if (CALI_F_FROM_HEP) { /* Disable FIB, let the packet go through the host after it is * policed. It is ingress into the system and we do not know what @@ -529,7 +531,7 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) if (CALI_F_TO_HEP && ctx->nat_dest && !skb_seen(ctx->skb) && !(ctx->state->flags & CALI_ST_HOST_PSNAT)) { CALI_DEBUG("Host accesses nodeport backend %x:%d\n", - bpf_htonl(ctx->state->post_nat_ip_dst), ctx->state->post_nat_dport); + debug_ip(ctx->state->post_nat_ip_dst), ctx->state->post_nat_dport); CALI_DEBUG("Host accesses nodeport state->flags 0x%x\n", ctx->state->flags); if (cali_rt_flags_local_workload(dest_rt->flags)) { CALI_DEBUG("NP redir on HEP - skip policy\n"); @@ -592,18 +594,16 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, size_t l4_csum_off, bool ct_related, int ct_rc, - struct ct_create_ctx ct_ctx_nat, + struct ct_create_ctx *ct_ctx_nat, bool *is_dnat, __u32 *seen_mark, bool in_place) { - int res = 0; bool encap_needed = false; - struct cali_tc_state *state = ctx->state; switch (ct_rc){ case CALI_CT_ESTABLISHED_DNAT: - if (CALI_F_FROM_HEP && state->tun_ip && ct_result_np_node(state->ct_result)) { + if (CALI_F_FROM_HEP && !ip_void(STATE->tun_ip) && ct_result_np_node(STATE->ct_result)) { /* Packet is returning from a NAT tunnel, * already SNATed, just forward it. 
*/ @@ -611,8 +611,8 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, CALI_DEBUG("returned from NAT tunnel\n"); goto allow; } - state->post_nat_ip_dst = state->ct_result.nat_ip; - state->post_nat_dport = state->ct_result.nat_port; + STATE->post_nat_ip_dst = STATE->ct_result.nat_ip; + STATE->post_nat_dport = STATE->ct_result.nat_port; /* fall through */ @@ -620,10 +620,10 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, /* We may not do a true DNAT here if we are resolving service source port * conflict with host->pod w/o service. See calico_tc_host_ct_conflict(). */ - *is_dnat = state->ip_dst != state->post_nat_ip_dst || state->dport != state->post_nat_dport; + *is_dnat = !ip_equal(STATE->ip_dst, STATE->post_nat_ip_dst) || STATE->dport != STATE->post_nat_dport; CALI_DEBUG("CT: DNAT to %x:%d\n", - bpf_ntohl(state->post_nat_ip_dst), state->post_nat_dport); + debug_ip(STATE->post_nat_ip_dst), STATE->post_nat_dport); encap_needed = dnat_should_encap(); @@ -638,29 +638,29 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, /* When we need to encap, we need to find out if the backend is * local or not. If local, we actually do not need the encap. 
*/ - rt = cali_rt_lookup(state->post_nat_ip_dst); + rt = cali_rt_lookup(&STATE->post_nat_ip_dst); if (!rt) { deny_reason(ctx, CALI_REASON_RT_UNKNOWN); goto deny; } CALI_DEBUG("rt found for 0x%x local %d\n", - bpf_ntohl(state->post_nat_ip_dst), !!cali_rt_is_local(rt)); + debug_ip(STATE->post_nat_ip_dst), !!cali_rt_is_local(rt)); encap_needed = !cali_rt_is_local(rt); if (encap_needed) { - if (CALI_F_FROM_HEP && state->tun_ip == 0) { + if (CALI_F_FROM_HEP && ip_void(STATE->tun_ip)) { if (CALI_F_DSR) { - ct_ctx_nat.flags |= CALI_CT_FLAG_DSR_FWD | - (state->ct_result.flags & CALI_CT_FLAG_NP_NO_DSR); + ct_ctx_nat->flags |= CALI_CT_FLAG_DSR_FWD | + (STATE->ct_result.flags & CALI_CT_FLAG_NP_NO_DSR); } - ct_ctx_nat.flags |= CALI_CT_FLAG_NP_FWD; + ct_ctx_nat->flags |= CALI_CT_FLAG_NP_FWD; } - ct_ctx_nat.allow_return = true; - ct_ctx_nat.tun_ip = rt->next_hop; - state->ip_dst = rt->next_hop; + ct_ctx_nat->allow_return = true; + ct_ctx_nat->tun_ip = rt->next_hop; + STATE->ip_dst = rt->next_hop; } else if (cali_rt_is_workload(rt) && - state->ip_dst != state->post_nat_ip_dst && + !ip_equal(STATE->ip_dst, STATE->post_nat_ip_dst) && !CALI_F_NAT_IF) { /* Packet arrived from a HEP for a workload and we're * about to NAT it. We can't rely on the kernel's RPF check @@ -673,82 +673,84 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, * rule is used. 
*/ - ct_ctx_nat.flags |= CALI_CT_FLAG_EXT_LOCAL; - ctx->state->ct_result.flags |= CALI_CT_FLAG_EXT_LOCAL; + ct_ctx_nat->flags |= CALI_CT_FLAG_EXT_LOCAL; + STATE->ct_result.flags |= CALI_CT_FLAG_EXT_LOCAL; CALI_DEBUG("CT_NEW marked with FLAG_EXT_LOCAL\n"); } } - if (CALI_F_FROM_WEP && state->ip_src == state->post_nat_ip_dst) { + if (CALI_F_FROM_WEP && ip_equal(STATE->ip_src, STATE->post_nat_ip_dst)) { CALI_DEBUG("New loopback SNAT\n"); - ct_ctx_nat.flags |= CALI_CT_FLAG_SVC_SELF; - ctx->state->ct_result.flags |= CALI_CT_FLAG_SVC_SELF; + ct_ctx_nat->flags |= CALI_CT_FLAG_SVC_SELF; + STATE->ct_result.flags |= CALI_CT_FLAG_SVC_SELF; } - ct_ctx_nat.type = CALI_CT_TYPE_NAT_REV; + ct_ctx_nat->type = CALI_CT_TYPE_NAT_REV; int err; - if ((err = conntrack_create(ctx, &ct_ctx_nat))) { + if ((err = conntrack_create(ctx, ct_ctx_nat))) { CALI_DEBUG("Creating NAT conntrack failed with %d\n", err); goto deny; } - state->ct_result.nat_sip = ct_ctx_nat.src; - state->ct_result.nat_sport = ct_ctx_nat.sport; + STATE->ct_result.nat_sip = ct_ctx_nat->src; + STATE->ct_result.nat_sport = ct_ctx_nat->sport; } else { - if (encap_needed && ct_result_np_node(state->ct_result)) { - CALI_DEBUG("CT says encap to node %x\n", bpf_ntohl(state->ct_result.tun_ip)); - state->ip_dst = state->ct_result.tun_ip; + if (encap_needed && ct_result_np_node(STATE->ct_result)) { + CALI_DEBUG("CT says encap to node %x\n", debug_ip(STATE->ct_result.tun_ip)); + STATE->ip_dst = STATE->ct_result.tun_ip; } else { encap_needed = false; } } if (encap_needed) { - if (!(state->ip_proto == IPPROTO_TCP && skb_is_gso(ctx->skb)) && - ip_is_dnf(ip_hdr(ctx)) && vxlan_v4_encap_too_big(ctx)) { + if (!(STATE->ip_proto == IPPROTO_TCP && skb_is_gso(ctx->skb)) && + ip_is_dnf(ip_hdr(ctx)) && vxlan_encap_too_big(ctx)) { CALI_DEBUG("Request packet with DNF set is too big\n"); goto icmp_too_big; } - state->ip_src = HOST_IP; + STATE->ip_src = HOST_IP; *seen_mark = CALI_SKB_MARK_BYPASS_FWD; /* Do FIB if possible */ CALI_DEBUG("marking 
CALI_SKB_MARK_BYPASS_FWD\n"); goto nat_encap; } - ip_hdr(ctx)->saddr = state->ct_result.nat_sip; - ip_hdr(ctx)->daddr = state->post_nat_ip_dst; + ip_hdr_set_ip(ctx, saddr, STATE->ct_result.nat_sip); + ip_hdr_set_ip(ctx, daddr, STATE->post_nat_ip_dst); - switch (ip_hdr(ctx)->protocol) { + switch (STATE->ip_proto) { case IPPROTO_TCP: - if (state->ct_result.nat_sport) { + if (STATE->ct_result.nat_sport) { CALI_DEBUG("Fixing TCP source port from %d to %d\n", - bpf_ntohs(tcp_hdr(ctx)->source), state->ct_result.nat_sport); - tcp_hdr(ctx)->source = bpf_htons(state->ct_result.nat_sport); + bpf_ntohs(tcp_hdr(ctx)->source), STATE->ct_result.nat_sport); + tcp_hdr(ctx)->source = bpf_htons(STATE->ct_result.nat_sport); } - tcp_hdr(ctx)->dest = bpf_htons(state->post_nat_dport); + tcp_hdr(ctx)->dest = bpf_htons(STATE->post_nat_dport); break; case IPPROTO_UDP: - if (state->ct_result.nat_sport) { + if (STATE->ct_result.nat_sport) { CALI_DEBUG("Fixing UDP source port from %d to %d\n", - bpf_ntohs(udp_hdr(ctx)->source), state->ct_result.nat_sport); - udp_hdr(ctx)->source = bpf_htons(state->ct_result.nat_sport); + bpf_ntohs(udp_hdr(ctx)->source), STATE->ct_result.nat_sport); + udp_hdr(ctx)->source = bpf_htons(STATE->ct_result.nat_sport); } - udp_hdr(ctx)->dest = bpf_htons(state->post_nat_dport); + udp_hdr(ctx)->dest = bpf_htons(STATE->post_nat_dport); break; } CALI_DEBUG("L3 csum at %d L4 csum at %d\n", l3_csum_off, l4_csum_off); if (l4_csum_off) { - res = skb_nat_l4_csum_ipv4(ctx, l4_csum_off, - state->ip_src, - state->ct_result.nat_sip, - state->ip_dst, - state->post_nat_ip_dst, - bpf_htons(state->dport), - bpf_htons(state->post_nat_dport), - bpf_htons(state->sport), - bpf_htons(state->ct_result.nat_sport ? : state->sport), - ip_hdr(ctx)->protocol == IPPROTO_UDP ? 
BPF_F_MARK_MANGLED_0 : 0); + if (skb_nat_l4_csum(ctx, l4_csum_off, + STATE->ip_src, + STATE->ct_result.nat_sip, + STATE->ip_dst, + STATE->post_nat_ip_dst, + bpf_htons(STATE->dport), + bpf_htons(STATE->post_nat_dport), + bpf_htons(STATE->sport), + bpf_htons(STATE->ct_result.nat_sport ? : STATE->sport), + STATE->ip_proto == IPPROTO_UDP ? BPF_F_MARK_MANGLED_0 : 0)) { + goto deny; + } } if (!in_place) { @@ -766,24 +768,24 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, offset += ctx->ipheader_len; - if (bpf_skb_store_bytes(ctx->skb, offset, ctx->scratch->l4, 8, 0)) { + if (bpf_skb_store_bytes(ctx->skb, offset, ctx->nh, 8, 0)) { CALI_DEBUG("Too short\n"); deny_reason(ctx, CALI_REASON_SHORT); goto deny; } } - res |= bpf_l3_csum_replace(ctx->skb, l3_csum_off, state->ip_src, state->ct_result.nat_sip, 4); - res |= bpf_l3_csum_replace(ctx->skb, l3_csum_off, state->ip_dst, state->post_nat_ip_dst, 4); - /* From now on, the packet has a new source IP */ - if (state->ct_result.nat_sip) { - state->ip_src = state->ct_result.nat_sip; - } - - if (res) { +#ifndef IPVER6 + if (bpf_l3_csum_replace(ctx->skb, l3_csum_off, STATE->ip_src, STATE->ct_result.nat_sip, 4) || + bpf_l3_csum_replace(ctx->skb, l3_csum_off, STATE->ip_dst, STATE->post_nat_ip_dst, 4)) { deny_reason(ctx, CALI_REASON_CSUM_FAIL); goto deny; } +#endif + /* From now on, the packet has a new source IP */ + if (!ip_void(STATE->ct_result.nat_sip)) { + STATE->ip_src = STATE->ct_result.nat_sip; + } /* Handle returning ICMP related to tunnel * @@ -792,9 +794,9 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, * unlikely that we are anywhere to close the MTU limit. If we * are, we need to fail anyway. 
*/ - if (ct_related && state->ip_proto == IPPROTO_ICMP - && state->ct_result.tun_ip - && (!CALI_F_DSR || (state->ct_result.flags & CALI_CT_FLAG_NP_NO_DSR))) { + if (ct_related && STATE->ip_proto == IPPROTO_ICMP + && !ip_void(STATE->ct_result.tun_ip) + && (!CALI_F_DSR || (STATE->ct_result.flags & CALI_CT_FLAG_NP_NO_DSR))) { if (dnat_return_should_encap()) { CALI_DEBUG("Returning related ICMP from workload to tunnel\n"); } else if (CALI_F_TO_HEP) { @@ -810,66 +812,69 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, CALI_DEBUG("Returning related ICMP from host to tunnel\n"); } - state->ip_src = HOST_IP; - state->ip_dst = state->ct_result.tun_ip; + STATE->ip_src = HOST_IP; + STATE->ip_dst = STATE->ct_result.tun_ip; goto nat_encap; } - state->dport = state->post_nat_dport; - state->ip_dst = state->post_nat_ip_dst; + STATE->dport = STATE->post_nat_dport; + STATE->ip_dst = STATE->post_nat_ip_dst; goto allow; case CALI_CT_ESTABLISHED_SNAT: CALI_DEBUG("CT: SNAT from %x:%d\n", - bpf_ntohl(state->ct_result.nat_ip), state->ct_result.nat_port); + debug_ip(STATE->ct_result.nat_ip), STATE->ct_result.nat_port); - if (dnat_return_should_encap() && state->ct_result.tun_ip) { - if (CALI_F_DSR && !(state->ct_result.flags & CALI_CT_FLAG_NP_NO_DSR)) { + if (dnat_return_should_encap() && !ip_void(STATE->ct_result.tun_ip)) { + if (CALI_F_DSR && !(STATE->ct_result.flags & CALI_CT_FLAG_NP_NO_DSR)) { /* SNAT will be done after routing, when leaving HEP */ CALI_DEBUG("DSR enabled, skipping SNAT + encap\n"); goto allow; } - if (!(state->ip_proto == IPPROTO_TCP && skb_is_gso(ctx->skb)) && - ip_is_dnf(ip_hdr(ctx)) && vxlan_v4_encap_too_big(ctx)) { + if (!(STATE->ip_proto == IPPROTO_TCP && skb_is_gso(ctx->skb)) && + ip_is_dnf(ip_hdr(ctx)) && vxlan_encap_too_big(ctx)) { CALI_DEBUG("Return ICMP mtu is too big\n"); goto icmp_too_big; } } // Actually do the NAT. 
- ip_hdr(ctx)->saddr = state->ct_result.nat_ip; - ip_hdr(ctx)->daddr = state->ct_result.nat_sip; + ip_hdr_set_ip(ctx, saddr, STATE->ct_result.nat_ip); + ip_hdr_set_ip(ctx, daddr, STATE->ct_result.nat_sip); - switch (ip_hdr(ctx)->protocol) { + switch (ctx->state->ip_proto) { case IPPROTO_TCP: - tcp_hdr(ctx)->source = bpf_htons(state->ct_result.nat_port); - if (state->ct_result.nat_sport) { + tcp_hdr(ctx)->source = bpf_htons(STATE->ct_result.nat_port); + if (STATE->ct_result.nat_sport) { CALI_DEBUG("Fixing TCP dest port from %d to %d\n", - bpf_ntohs(tcp_hdr(ctx)->dest), state->ct_result.nat_sport); - tcp_hdr(ctx)->dest = bpf_htons(state->ct_result.nat_sport); + bpf_ntohs(tcp_hdr(ctx)->dest), STATE->ct_result.nat_sport); + tcp_hdr(ctx)->dest = bpf_htons(STATE->ct_result.nat_sport); } break; case IPPROTO_UDP: - udp_hdr(ctx)->source = bpf_htons(state->ct_result.nat_port); - if (state->ct_result.nat_sport) { + udp_hdr(ctx)->source = bpf_htons(STATE->ct_result.nat_port); + if (STATE->ct_result.nat_sport) { CALI_DEBUG("Fixing UDP dest port from %d to %d\n", - bpf_ntohs(tcp_hdr(ctx)->dest), state->ct_result.nat_sport); - udp_hdr(ctx)->dest = bpf_htons(state->ct_result.nat_sport); + bpf_ntohs(tcp_hdr(ctx)->dest), STATE->ct_result.nat_sport); + udp_hdr(ctx)->dest = bpf_htons(STATE->ct_result.nat_sport); } break; } + /* XXX */ CALI_DEBUG("L3 csum at %d L4 csum at %d\n", l3_csum_off, l4_csum_off); - if (l4_csum_off) { - res = skb_nat_l4_csum_ipv4(ctx, l4_csum_off, - state->ip_src, state->ct_result.nat_ip, - state->ip_dst, state->ct_result.nat_sip, - bpf_htons(state->dport), bpf_htons(state->ct_result.nat_sport ? : state->dport), - bpf_htons(state->sport), bpf_htons(state->ct_result.nat_port), - ip_hdr(ctx)->protocol == IPPROTO_UDP ? BPF_F_MARK_MANGLED_0 : 0); + if (l4_csum_off && skb_nat_l4_csum(ctx, l4_csum_off, + STATE->ip_src, STATE->ct_result.nat_ip, + STATE->ip_dst, STATE->ct_result.nat_sip, + bpf_htons(STATE->dport), + bpf_htons(STATE->ct_result.nat_sport ? 
: STATE->dport), + bpf_htons(STATE->sport), bpf_htons(STATE->ct_result.nat_port), + STATE->ip_proto == IPPROTO_UDP ? BPF_F_MARK_MANGLED_0 : 0)) { + deny_reason(ctx, CALI_REASON_CSUM_FAIL); + goto deny; } if (!in_place) { @@ -894,20 +899,18 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, } } +#ifndef IPVER6 CALI_VERB("L3 checksum update (csum is at %d) port from %x to %x\n", - l3_csum_off, state->ip_src, state->ct_result.nat_ip); + l3_csum_off, STATE->ip_src, STATE->ct_result.nat_ip); - int csum_rc = bpf_l3_csum_replace(ctx->skb, l3_csum_off, - state->ip_src, state->ct_result.nat_ip, 4); - csum_rc |= bpf_l3_csum_replace(ctx->skb, l3_csum_off, - state->ip_dst, state->ct_result.nat_sip, 4); - CALI_VERB("bpf_l3_csum_replace(IP): %d\n", csum_rc); - res |= csum_rc; - - if (res) { + if (bpf_l3_csum_replace(ctx->skb, l3_csum_off, + STATE->ip_src, STATE->ct_result.nat_ip, 4) || + bpf_l3_csum_replace(ctx->skb, l3_csum_off, + STATE->ip_dst, STATE->ct_result.nat_sip, 4)) { deny_reason(ctx, CALI_REASON_CSUM_FAIL); goto deny; } +#endif /* In addition to dnat_return_should_encap() we also need to encap on the * host endpoint for egress traffic, when we hit an SNAT rule. This is the @@ -916,14 +919,14 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, * able to match as SNAT. 
*/ if ((dnat_return_should_encap() || (CALI_F_TO_HEP && !CALI_F_DSR)) && - state->ct_result.tun_ip) { - state->ip_src = HOST_IP; - state->ip_dst = state->ct_result.tun_ip; + !ip_void(STATE->ct_result.tun_ip)) { + STATE->ip_src = HOST_IP; + STATE->ip_dst = STATE->ct_result.tun_ip; goto nat_encap; } - state->sport = state->ct_result.nat_port; - state->ip_src = state->ct_result.nat_ip; + STATE->sport = STATE->ct_result.nat_port; + STATE->ip_src = STATE->ct_result.nat_ip; goto allow; } @@ -935,8 +938,9 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, return NAT_ALLOW; icmp_too_big: - state->icmp_type = ICMP_DEST_UNREACH; - state->icmp_code = ICMP_FRAG_NEEDED; +#ifndef IPVER6 + STATE->icmp_type = ICMP_DEST_UNREACH; + STATE->icmp_code = ICMP_FRAG_NEEDED; struct { __be16 unused; @@ -944,11 +948,16 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, } frag = { .mtu = bpf_htons(TUNNEL_MTU), }; - state->tun_ip = *(__be32 *)&frag; + STATE->tun_ip = *(__be32 *)&frag; return NAT_ICMP_TOO_BIG; +#else + /* XXX not implemented yet. */ + return NAT_DENY; +#endif nat_encap: + /* XXX */ /* We are about to encap return traffic that originated on the local host * namespace - a host networked pod. 
Routing was based on the dst IP, * which was the original client's IP at that time, not the node's that @@ -957,14 +966,14 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, if (CALI_F_TO_HEP) { struct arp_value *arpv; struct arp_key arpk = { - .ip = state->ip_dst, + .ip = STATE->ip_dst, .ifindex = ctx->skb->ifindex, }; - arpv = cali_v4_arp_lookup_elem(&arpk); + arpv = cali_arp_lookup_elem(&arpk); if (!arpv) { CALI_DEBUG("ARP lookup failed for %x dev %d at HEP\n", - bpf_ntohl(state->ip_dst), arpk.ifindex); + debug_ip(STATE->ip_dst), arpk.ifindex); /* Don't drop it yet, we might get lucky and the MAC is correct */ } else { if (skb_refresh_validate_ptrs(ctx, 0)) { @@ -973,7 +982,7 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, goto deny; } __builtin_memcpy(&eth_hdr(ctx)->h_dest, arpv->mac_dst, ETH_ALEN); - if (state->ct_result.ifindex_fwd == ctx->skb->ifindex) { + if (STATE->ct_result.ifindex_fwd == ctx->skb->ifindex) { /* No need to change src MAC, if we are at the right device */ } else { /* FIXME we need to redirect to the right device */ @@ -981,16 +990,16 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, } } - if (vxlan_v4_encap(ctx, state->ip_src, state->ip_dst)) { + if (vxlan_encap(ctx, &STATE->ip_src, &STATE->ip_dst)) { deny_reason(ctx, CALI_REASON_ENCAP_FAIL); goto deny; } - state->sport = state->dport = VXLAN_PORT; - state->ip_proto = IPPROTO_UDP; + STATE->sport = STATE->dport = VXLAN_PORT; + STATE->ip_proto = IPPROTO_UDP; CALI_DEBUG("vxlan return %d ifindex_fwd %d\n", - dnat_return_should_encap(), state->ct_result.ifindex_fwd); + dnat_return_should_encap(), STATE->ct_result.ifindex_fwd); return NAT_ENCAP_ALLOW; } @@ -1026,12 +1035,12 @@ static CALI_BPF_INLINE struct fwd post_nat(struct cali_tc_ctx *ctx, } if (CALI_F_TO_HEP && !skb_seen(ctx->skb) && is_dnat) { - struct cali_rt *r = cali_rt_lookup(state->post_nat_ip_dst); + struct cali_rt *r = cali_rt_lookup(&state->post_nat_ip_dst); if 
(r && cali_rt_flags_local_workload(r->flags)) { state->ct_result.ifindex_fwd = r->if_index; CALI_DEBUG("NP local WL %x:%d on HEP\n", - bpf_htonl(state->post_nat_ip_dst), state->post_nat_dport); + debug_ip(state->post_nat_ip_dst), state->post_nat_dport); ctx->state->flags |= CALI_ST_CT_NP_LOOP; fib = true; /* Enforce FIB since we want to redirect */ } else if (!r || cali_rt_flags_remote_workload(r->flags)) { @@ -1039,7 +1048,7 @@ static CALI_BPF_INLINE struct fwd post_nat(struct cali_tc_ctx *ctx, if (CALI_F_LO || CALI_F_MAIN) { state->ct_result.ifindex_fwd = NATIN_IFACE ; CALI_DEBUG("NP remote WL %x:%d on LO or main HEP\n", - bpf_htonl(state->post_nat_ip_dst), state->post_nat_dport); + debug_ip(state->post_nat_ip_dst), state->post_nat_dport); ctx->state->flags |= CALI_ST_CT_NP_LOOP; } ctx->state->flags |= CALI_ST_CT_NP_REMOTE; @@ -1108,7 +1117,7 @@ int calico_tc_skb_accepted_entrypoint(struct __sk_buff *skb) .addr = ctx->state->nat_dest.addr, .port = ctx->state->nat_dest.port, }; - if (ctx->state->nat_dest.addr != 0) { + if (!ip_void(ctx->state->nat_dest.addr)) { nat_dest = &nat_dest_2; } @@ -1125,17 +1134,18 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx CALI_DEBUG("Entering calico_tc_skb_accepted\n"); struct cali_tc_state *state = ctx->state; bool fib = true; - struct ct_create_ctx ct_ctx_nat = {}; + struct ct_create_ctx *ct_ctx_nat = &ctx->scratch->ct_ctx_nat; int ct_rc = ct_result_rc(state->ct_result.rc); bool ct_related = ct_result_is_related(state->ct_result.rc); __u32 seen_mark = ctx->fwd.mark; - size_t l4_csum_off = 0, l3_csum_off; + size_t l4_csum_off = 0; + size_t l3_csum_off = 0;; bool is_dnat = false; enum do_nat_res nat_res = NAT_ALLOW; - CALI_DEBUG("src=%x dst=%x\n", bpf_ntohl(state->ip_src), bpf_ntohl(state->ip_dst)); - CALI_DEBUG("post_nat=%x:%d\n", bpf_ntohl(state->post_nat_ip_dst), state->post_nat_dport); - CALI_DEBUG("tun_ip=%x\n", state->tun_ip); + CALI_DEBUG("src=%x dst=%x\n", debug_ip(state->ip_src), 
debug_ip(state->ip_dst)); + CALI_DEBUG("post_nat=%x:%d\n", debug_ip(state->post_nat_ip_dst), state->post_nat_dport); + CALI_DEBUG("tun_ip=%x\n", debug_ip(state->tun_ip)); CALI_DEBUG("pol_rc=%d\n", state->pol_rc); CALI_DEBUG("sport=%d\n", state->sport); CALI_DEBUG("flags=%x\n", state->flags); @@ -1168,7 +1178,11 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx /* We check the ttl here to avoid needing complicated handling of * related traffic back from the host if we let the host to handle it. */ +#ifdef IPVER6 + CALI_DEBUG("ip->hop_limit %d\n", ip_hdr(ctx)->hop_limit); +#else CALI_DEBUG("ip->ttl %d\n", ip_hdr(ctx)->ttl); +#endif if (ip_ttl_exceeded(ip_hdr(ctx))) { switch (ct_rc){ case CALI_CT_NEW: @@ -1182,16 +1196,18 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx } } +#ifndef IPVER6 l3_csum_off = skb_iphdr_offset(ctx) + offsetof(struct iphdr, check); +#endif if (ct_related) { - if (ip_hdr(ctx)->protocol == IPPROTO_ICMP) { + if (ctx->state->ip_proto == IPPROTO_ICMP) { bool outer_ip_snat; /* if we do SNAT ... */ outer_ip_snat = ct_rc == CALI_CT_ESTABLISHED_SNAT; /* ... there is a return path to the tunnel ... */ - outer_ip_snat = outer_ip_snat && state->ct_result.tun_ip; + outer_ip_snat = outer_ip_snat && !ip_void(state->ct_result.tun_ip); /* ... and should do encap and it is not DSR or it is leaving host * and either DSR from WEP or originated at host ... */ outer_ip_snat = outer_ip_snat && @@ -1201,15 +1217,17 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx /* ... 
then fix the outer header IP first */ if (outer_ip_snat) { - ip_hdr(ctx)->saddr = state->ct_result.nat_ip; + ip_hdr_set_ip(ctx, saddr, state->ct_result.nat_ip); +#ifndef IPVER6 int res = bpf_l3_csum_replace(ctx->skb, l3_csum_off, state->ip_src, state->ct_result.nat_ip, 4); if (res) { deny_reason(ctx, CALI_REASON_CSUM_FAIL); goto deny; } +#endif CALI_DEBUG("ICMP related: outer IP SNAT to %x\n", - bpf_ntohl(state->ct_result.nat_ip)); + debug_ip(state->ct_result.nat_ip)); } /* Related ICMP traffic must be an error response so it should include inner IP @@ -1231,7 +1249,7 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx } } - switch (ip_hdr(ctx)->protocol) { + switch (ctx->state->ip_proto) { case IPPROTO_TCP: l4_csum_off = skb_l4hdr_offset(ctx) + offsetof(struct tcphdr, check); break; @@ -1256,45 +1274,46 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx if (CALI_F_FROM_WEP && CALI_DROP_WORKLOAD_TO_HOST && cali_rt_flags_local_host( - cali_rt_lookup_flags(state->post_nat_ip_dst))) { + cali_rt_lookup_flags(&state->post_nat_ip_dst))) { CALI_DEBUG("Workload to host traffic blocked by " "DefaultEndpointToHostAction: DROP\n"); goto deny; } - ct_ctx_nat.skb = ctx->skb; - ct_ctx_nat.proto = state->ip_proto; - ct_ctx_nat.src = state->ip_src; - ct_ctx_nat.sport = state->sport; - ct_ctx_nat.dst = state->post_nat_ip_dst; - ct_ctx_nat.dport = state->post_nat_dport; - ct_ctx_nat.tun_ip = state->tun_ip; - ct_ctx_nat.type = CALI_CT_TYPE_NORMAL; - ct_ctx_nat.allow_return = false; + __builtin_memset(ct_ctx_nat, 0, sizeof(*ct_ctx_nat)); + + ct_ctx_nat->proto = state->ip_proto; + ct_ctx_nat->src = state->ip_src; + ct_ctx_nat->sport = state->sport; + ct_ctx_nat->dst = state->post_nat_ip_dst; + ct_ctx_nat->dport = state->post_nat_dport; + ct_ctx_nat->tun_ip = state->tun_ip; + ct_ctx_nat->type = CALI_CT_TYPE_NORMAL; + ct_ctx_nat->allow_return = false; if (state->flags & CALI_ST_NAT_OUTGOING) { - ct_ctx_nat.flags |= 
CALI_CT_FLAG_NAT_OUT; + ct_ctx_nat->flags |= CALI_CT_FLAG_NAT_OUT; } if (CALI_F_FROM_WEP && state->flags & CALI_ST_SKIP_FIB) { - ct_ctx_nat.flags |= CALI_CT_FLAG_SKIP_FIB; + ct_ctx_nat->flags |= CALI_CT_FLAG_SKIP_FIB; } /* Packets received at WEP with CALI_CT_FLAG_SKIP_FIB mark signal * that all traffic on this connection must flow via host namespace as it was * originally meant for host, but got redirected to a WEP by a 3rd party DNAT rule. */ if (CALI_F_TO_WEP && ((ctx->skb->mark & CALI_SKB_MARK_SKIP_FIB) == CALI_SKB_MARK_SKIP_FIB)) { - ct_ctx_nat.flags |= CALI_CT_FLAG_SKIP_FIB; + ct_ctx_nat->flags |= CALI_CT_FLAG_SKIP_FIB; } if (CALI_F_TO_HOST && CALI_F_NAT_IF) { - ct_ctx_nat.flags |= CALI_CT_FLAG_VIA_NAT_IF; + ct_ctx_nat->flags |= CALI_CT_FLAG_VIA_NAT_IF; } if (CALI_F_TO_HEP && !CALI_F_NAT_IF && state->flags & CALI_ST_CT_NP_LOOP) { - ct_ctx_nat.flags |= CALI_CT_FLAG_NP_LOOP; + ct_ctx_nat->flags |= CALI_CT_FLAG_NP_LOOP; } if (CALI_F_TO_HEP && !CALI_F_NAT_IF && state->flags & CALI_ST_CT_NP_REMOTE) { - ct_ctx_nat.flags |= CALI_CT_FLAG_NP_REMOTE; + ct_ctx_nat->flags |= CALI_CT_FLAG_NP_REMOTE; } if (state->flags & CALI_ST_HOST_PSNAT) { - ct_ctx_nat.flags |= CALI_CT_FLAG_HOST_PSNAT; + ct_ctx_nat->flags |= CALI_CT_FLAG_HOST_PSNAT; } /* Mark connections that were routed via bpfnatout, but had CT miss at * HEP. That is because of SNAT happened between bpfnatout and here. @@ -1303,7 +1322,7 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx */ if (CALI_F_TO_HEP && ((ctx->skb->mark & CALI_SKB_MARK_FROM_NAT_IFACE_OUT) == CALI_SKB_MARK_FROM_NAT_IFACE_OUT)) { - ct_ctx_nat.flags |= CALI_CT_FLAG_VIA_NAT_IF; + ct_ctx_nat->flags |= CALI_CT_FLAG_VIA_NAT_IF; } /* If we just received the first packet for a NP forwarded from a @@ -1311,12 +1330,10 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx * CIDRs from DSR, we need to make a check if this client also opted out * and save the information in conntrack. 
*/ - CALI_DEBUG("CALI_F_DSR: %d\n", CALI_F_DSR); - CALI_DEBUG("GLOBAL_FLAGS: 0x%x\n", GLOBAL_FLAGS); if (CALI_F_FROM_HEP && CALI_F_DSR && (GLOBAL_FLAGS & CALI_GLOBALS_NO_DSR_CIDRS)) { - CALI_DEBUG("state->tun_ip = 0x%x\n", state->tun_ip); - if (state->tun_ip && cali_rt_lookup_flags(state->ip_src) & CALI_RT_NO_DSR) { - ct_ctx_nat.flags |= CALI_CT_FLAG_NP_NO_DSR; + CALI_DEBUG("state->tun_ip = 0x%x\n", debug_ip(state->tun_ip)); + if (!ip_void(state->tun_ip) && cali_rt_lookup_flags(&state->ip_src) & CALI_RT_NO_DSR) { + ct_ctx_nat->flags |= CALI_CT_FLAG_NP_NO_DSR; CALI_DEBUG("CALI_CT_FLAG_NP_NO_DSR\n"); } } @@ -1327,17 +1344,17 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx CALI_DEBUG("Too short for TCP: DROP\n"); goto deny; } - ct_ctx_nat.tcp = tcp_hdr(ctx); + ct_ctx_nat->tcp = tcp_hdr(ctx); } // If we get here, we've passed policy. if (nat_dest == NULL) { - if (conntrack_create(ctx, &ct_ctx_nat)) { + if (conntrack_create(ctx, ct_ctx_nat)) { CALI_DEBUG("Creating normal conntrack failed\n"); - if ((CALI_F_FROM_HEP && rt_addr_is_local_host(ct_ctx_nat.dst)) || - (CALI_F_TO_HEP && rt_addr_is_local_host(ct_ctx_nat.src))) { + if ((CALI_F_FROM_HEP && rt_addr_is_local_host(&ct_ctx_nat->dst)) || + (CALI_F_TO_HEP && rt_addr_is_local_host(&ct_ctx_nat->src))) { CALI_DEBUG("Allowing local host traffic without CT\n"); goto allow; } @@ -1347,26 +1364,26 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx goto allow; } - ct_ctx_nat.orig_src = state->ip_src; - ct_ctx_nat.orig_dst = state->ip_dst; - ct_ctx_nat.orig_dport = state->dport; - ct_ctx_nat.orig_sport = state->sport; - state->ct_result.nat_sport = ct_ctx_nat.sport; + ct_ctx_nat->orig_src = state->ip_src; + ct_ctx_nat->orig_dst = state->ip_dst; + ct_ctx_nat->orig_dport = state->dport; + ct_ctx_nat->orig_sport = state->sport; + state->ct_result.nat_sport = ct_ctx_nat->sport; /* fall through as DNAT is now established */ if ((CALI_F_TO_HOST && CALI_F_NAT_IF) || 
(CALI_F_TO_HEP && (CALI_F_LO || CALI_F_MAIN))) { - struct cali_rt *r = cali_rt_lookup(state->post_nat_ip_dst); + struct cali_rt *r = cali_rt_lookup(&state->post_nat_ip_dst); if (r && cali_rt_flags_remote_workload(r->flags) && cali_rt_is_tunneled(r)) { CALI_DEBUG("remote wl %x tunneled via %x\n", - bpf_htonl(state->post_nat_ip_dst), bpf_htonl(HOST_TUNNEL_IP)); - ct_ctx_nat.src = HOST_TUNNEL_IP; + debug_ip(state->post_nat_ip_dst), debug_ip(HOST_TUNNEL_IP)); + ct_ctx_nat->src = HOST_TUNNEL_IP; /* This would be the place to set a new source port if we * had a way how to allocate it. Instead we rely on source * port collision resolution. - * ct_ctx_nat.sport = 10101; + * ct_ctx_nat->sport = 10101; */ - state->ct_result.nat_sip = ct_ctx_nat.src; - state->ct_result.nat_sport = ct_ctx_nat.sport; + state->ct_result.nat_sip = ct_ctx_nat->src; + state->ct_result.nat_sport = ct_ctx_nat->sport; } } @@ -1407,12 +1424,14 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx goto deny; icmp_ttl_exceeded: +#ifndef IPVER6 if (ip_frag_no(ip_hdr(ctx))) { goto deny; } +#endif state->icmp_type = ICMP_TIME_EXCEEDED; state->icmp_code = ICMP_EXC_TTL; - state->tun_ip = 0; + ip_set_void(state->tun_ip); goto icmp_send_reply; icmp_send_reply: @@ -1431,6 +1450,11 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx SEC("tc") int calico_tc_skb_icmp_inner_nat(struct __sk_buff *skb) { +#ifdef IPVER6 + /* XXX not implemented yet */ + return TC_ACT_SHOT; +#else + /* Initialise the context, which is stored on the stack, and the state, which * we use to pass data from one program to the next via tail calls. 
*/ DECLARE_TC_CTX(_ctx, @@ -1476,7 +1500,7 @@ int calico_tc_skb_icmp_inner_nat(struct __sk_buff *skb) } ctx->ip_header = (struct iphdr*)pkt; - ctx->ipheader_len = ctx->state->ihl = ip_hdr(ctx)->ihl * 4; + tc_state_fill_from_iphdr(ctx); if (ctx->ipheader_len > 60) { CALI_DEBUG("this cannot be!\n"); goto deny; @@ -1491,7 +1515,7 @@ int calico_tc_skb_icmp_inner_nat(struct __sk_buff *skb) goto deny; } - ctx->scratch = (void *)(pkt + ctx->ipheader_len); + ctx->nh = (void *)(pkt + ctx->ipheader_len); /* Flip the direction, we need to reverse the original packet. */ switch (ct_rc) { @@ -1523,22 +1547,39 @@ int calico_tc_skb_icmp_inner_nat(struct __sk_buff *skb) bool fib = true; struct ct_create_ctx ct_ctx_nat = {}; /* CT_NEW is not the option so pass an empty one. */ - nat_res = do_nat(ctx, l3_csum_off, 0, false, ct_rc, ct_ctx_nat, &is_dnat, &seen_mark, false); + nat_res = do_nat(ctx, l3_csum_off, 0, false, ct_rc, &ct_ctx_nat, &is_dnat, &seen_mark, false); ctx->fwd = post_nat(ctx, nat_res, fib, seen_mark, is_dnat); allow: + /* We are going to forward the packet now. But all the state is about + * the inner IP so we need to refresh our state back to the outer IP + * that is used for forwarding! + * + * N.B. we could just remember and update the state, however, forwarding + * also updates ttl/hops in the header so we need the right header + * available anyway. + */ + if (parse_packet_ip(ctx) != PARSING_OK) { + CALI_DEBUG("Non ipv4 packet on icmp path!
DROP!\n"); + goto deny; + } + tc_state_fill_from_iphdr(ctx); fwd_fib_set(&ctx->fwd, true); return forward_or_drop(ctx); deny: return TC_ACT_SHOT; +#endif /* IPVER6 */ } SEC("tc") int calico_tc_skb_send_icmp_replies(struct __sk_buff *skb) { +#ifdef IPVER6 + return TC_ACT_SHOT; +#else __u32 fib_flags = 0; /* Initialise the context, which is stored on the stack, and the state, which @@ -1583,6 +1624,7 @@ int calico_tc_skb_send_icmp_replies(struct __sk_buff *skb) return forward_or_drop(ctx); deny: return TC_ACT_SHOT; +#endif /* IPVER6 */ } #if HAS_HOST_CONFLICT_PROG @@ -1665,12 +1707,12 @@ int calico_tc_skb_drop(struct __sk_buff *skb) counter_inc(ctx, CALI_REASON_DROPPED_BY_POLICY); CALI_DEBUG("proto=%d\n", ctx->state->ip_proto); - CALI_DEBUG("src=%x dst=%x\n", bpf_ntohl(ctx->state->ip_src), - bpf_ntohl(ctx->state->ip_dst)); - CALI_DEBUG("pre_nat=%x:%d\n", bpf_ntohl(ctx->state->pre_nat_ip_dst), + CALI_DEBUG("src=%x dst=%x\n", debug_ip(ctx->state->ip_src), + debug_ip(ctx->state->ip_dst)); + CALI_DEBUG("pre_nat=%x:%d\n", debug_ip(ctx->state->pre_nat_ip_dst), ctx->state->pre_nat_dport); - CALI_DEBUG("post_nat=%x:%d\n", bpf_ntohl(ctx->state->post_nat_ip_dst), ctx->state->post_nat_dport); - CALI_DEBUG("tun_ip=%x\n", ctx->state->tun_ip); + CALI_DEBUG("post_nat=%x:%d\n", debug_ip(ctx->state->post_nat_ip_dst), ctx->state->post_nat_dport); + CALI_DEBUG("tun_ip=%x\n", debug_ip(ctx->state->tun_ip)); CALI_DEBUG("pol_rc=%d\n", ctx->state->pol_rc); CALI_DEBUG("sport=%d\n", ctx->state->sport); CALI_DEBUG("flags=0x%x\n", ctx->state->flags); @@ -1691,17 +1733,17 @@ int calico_tc_skb_drop(struct __sk_buff *skb) ctx->state->pre_nat_dport == WG_PORT && ctx->state->sport == WG_PORT) { if ((CALI_F_FROM_HEP && - rt_addr_is_local_host(ctx->state->ip_dst) && - rt_addr_is_remote_host(ctx->state->ip_src)) || + rt_addr_is_local_host(&ctx->state->ip_dst) && + rt_addr_is_remote_host(&ctx->state->ip_src)) || (CALI_F_TO_HEP && - rt_addr_is_remote_host(ctx->state->ip_dst) && - 
rt_addr_is_local_host(ctx->state->ip_src))) { + rt_addr_is_remote_host(&ctx->state->ip_dst) && + rt_addr_is_local_host(&ctx->state->ip_src))) { /* This is info as it is supposed to be low intensity (only when a * new flow detected - should happen exactly once in a blue moon ;-) ) * but would be good to know about for issue debugging. */ CALI_INFO("Allowing WG %x <-> %x despite blocked by policy - known hosts.\n", - bpf_ntohl(ctx->state->ip_src), bpf_ntohl(ctx->state->ip_dst)); + debug_ip(ctx->state->ip_src), debug_ip(ctx->state->ip_dst)); goto allow; } } diff --git a/felix/bpf-gpl/tc_preamble.c b/felix/bpf-gpl/tc_preamble.c index cc9cfce00af..5ec06c081e2 100644 --- a/felix/bpf-gpl/tc_preamble.c +++ b/felix/bpf-gpl/tc_preamble.c @@ -14,6 +14,19 @@ const volatile struct cali_tc_globals __globals; +#ifdef IPVER6 +#define IPV " v6" +#define JUMP_IDX(idx) (idx ## _V6) +#define JUMP_IDX_DEBUG(idx) (idx ## _V6_DEBUG) +#else +#define IPV " v4" +#define JUMP_IDX(idx) (idx) +#define JUMP_IDX_DEBUG(idx) (idx ## _DEBUG) +#endif + +#define JUMP(idx) globals->jumps[JUMP_IDX(idx)] +#define JUMP_DEBUG(idx) globals->jumps[JUMP_IDX_DEBUG(idx)] + SEC("tc") int cali_tc_preamble(struct __sk_buff *skb) { @@ -27,39 +40,39 @@ int cali_tc_preamble(struct __sk_buff *skb) *globals = __globals; #if EMIT_LOGS - CALI_LOG("tc_preamble iface %s\n", globals->iface_name); + CALI_LOG("tc_preamble" IPV " iface %s\n", globals->iface_name); #endif /* If we have log filter installed, tell the filter where to jump next * and jump to the filter. 
*/ if (globals->log_filter_jmp != (__u32)-1) { - skb->cb[0] = globals->jumps[PROG_INDEX_MAIN]; - skb->cb[1] = globals->jumps[PROG_INDEX_MAIN_DEBUG]; + skb->cb[0] = JUMP(PROG_INDEX_MAIN); + skb->cb[1] = JUMP_DEBUG(PROG_INDEX_MAIN); bpf_tail_call(skb, &cali_jump_prog_map, globals->log_filter_jmp); - CALI_LOG("tc_preamble iface %s failed to call log filter %d\n", + CALI_LOG("tc_preamble" IPV " iface %s failed to call log filter %d\n", globals->iface_name, globals->log_filter_jmp); /* try to jump to the regular path */ } /* Jump to the start of the prog chain. */ #if EMIT_LOGS - CALI_LOG("tc_preamble iface %s jump to %d\n", - globals->iface_name, globals->jumps[PROG_INDEX_MAIN]); + CALI_LOG("tc_preamble" IPV " iface %s jump to %d\n", + globals->iface_name, JUMP(PROG_INDEX_MAIN)); #endif - bpf_tail_call(skb, &cali_jump_map, globals->jumps[PROG_INDEX_MAIN]); - CALI_LOG("tc_preamble iface %s failed to call main %d\n", - globals->iface_name, globals->jumps[PROG_INDEX_MAIN]); + bpf_tail_call(skb, &cali_jump_map, JUMP(PROG_INDEX_MAIN)); + CALI_LOG("tc_preamble" IPV " iface %s failed to call main %d\n", + globals->iface_name, JUMP(PROG_INDEX_MAIN)); /* Try debug path in the unexpected case of not being able to make the jump. */ - CALI_LOG("tc_preamble iface %s jump to %d\n", - globals->iface_name, globals->jumps[PROG_INDEX_MAIN_DEBUG]); - bpf_tail_call(skb, &cali_jump_map, globals->jumps[PROG_INDEX_MAIN_DEBUG]); - CALI_LOG("tc_preamble iface %s failed to call debug main %d\n", - globals->iface_name, globals->jumps[PROG_INDEX_MAIN_DEBUG]); + CALI_LOG("tc_preamble" IPV " iface %s jump to %d\n", + globals->iface_name, JUMP_DEBUG(PROG_INDEX_MAIN)); + bpf_tail_call(skb, &cali_jump_map, JUMP_DEBUG(PROG_INDEX_MAIN)); + CALI_LOG("tc_preamble" IPV " iface %s failed to call debug main %d\n", + globals->iface_name, JUMP_DEBUG(PROG_INDEX_MAIN)); /* Drop the packet in the unexpected case of not being able to make the jump. 
*/ - CALI_LOG("tc_preamble iface %s failed to call main %d\n", globals->iface_name, globals->jumps[PROG_INDEX_MAIN]); + CALI_LOG("tc_preamble" IPV " iface %s failed to call main %d\n", globals->iface_name, JUMP(PROG_INDEX_MAIN)); return TC_ACT_SHOT; } diff --git a/felix/bpf-gpl/types.h b/felix/bpf-gpl/types.h index e17ebd17fbc..1b3d2fee362 100644 --- a/felix/bpf-gpl/types.h +++ b/felix/bpf-gpl/types.h @@ -8,7 +8,11 @@ #include #include #include +#ifdef IPVER6 +#include +#else #include +#endif #include #include #include @@ -23,8 +27,11 @@ #define ETH_IPV4_UDP_SIZE (sizeof(struct ethhdr) + IPV4_UDP_SIZE) #define ETH_SIZE (sizeof(struct ethhdr)) +#ifdef IPVER6 +#define IP_SIZE (sizeof(struct ipv6hdr)) +#else #define IP_SIZE (sizeof(struct iphdr)) -#define IPv6_SIZE (sizeof(struct ipv6hdr)) +#endif #define UDP_SIZE (sizeof(struct udphdr)) #define TCP_SIZE (sizeof(struct tcphdr)) #define ICMP_SIZE (sizeof(struct icmphdr)) @@ -38,36 +45,21 @@ struct cali_tc_state { /* Initial IP read from the packet, updated to host's IP when doing NAT encap/ICMP error. * updated when doing CALI_CT_ESTABLISHED_SNAT handling. Used for FIB lookup. */ - __be32 ip_src; - __be32 ip_src1; - __be32 ip_src2; - __be32 ip_src3; + DECLARE_IP_ADDR(ip_src); /* Initial IP read from packet. Updated when doing encap and ICMP errors or CALI_CT_ESTABLISHED_DNAT. * If connect-time load balancing is enabled, this will be the post-NAT IP because the connect-time * load balancer gets in before TC. */ - __be32 ip_dst; - __be32 ip_dst1; - __be32 ip_dst2; - __be32 ip_dst3; + DECLARE_IP_ADDR(ip_dst); /* Set when invoking the policy program; if no NAT, ip_dst; otherwise, the pre-DNAT IP. If the connect * time load balancer is enabled, this may be different from ip_dst. */ - __be32 pre_nat_ip_dst; - __be32 pre_nat_ip_dst1; - __be32 pre_nat_ip_dst2; - __be32 pre_nat_ip_dst3; + DECLARE_IP_ADDR(pre_nat_ip_dst); /* If no NAT, ip_dst. 
Otherwise the NAT dest that we look up from the NAT maps or the conntrack entry * for CALI_CT_ESTABLISHED_DNAT. */ - __be32 post_nat_ip_dst; - __be32 post_nat_ip_dst1; - __be32 post_nat_ip_dst2; - __be32 post_nat_ip_dst3; + DECLARE_IP_ADDR(post_nat_ip_dst); /* For packets that arrived over our VXLAN tunnel, the source IP of the tunnel packet. * Zeroed out when we decide to respond with an ICMP error. * Also used to stash the ICMP MTU when calling the ICMP response program. */ - __be32 tun_ip; - __be32 tun_ip1; - __be32 tun_ip2; - __be32 tun_ip3; + DECLARE_IP_ADDR(tun_ip); __u16 ihl; __u16 unused; /* Return code from the policy program CALI_POL_DENY/ALLOW etc. */ @@ -107,10 +99,15 @@ struct cali_tc_state { struct calico_nat_dest nat_dest; /* 8 bytes */ __u64 prog_start_time; __u64 flags; +#ifndef IPVER6 + __u8 __pad_ipv4[48]; +#endif }; struct pkt_scratch { - __u8 l4[20]; /* 20 bytes to fit udp, icmp, tcp w/o options */ + __u8 l4[24]; /* 20 bytes to fit udp, icmp, tcp w/o options and 24 to make 8-aligned */ + struct ct_create_ctx ct_ctx_nat; + struct calico_ct_key ct_key; }; enum cali_state_flags { @@ -150,20 +147,26 @@ struct fwd { }; struct cali_tc_ctx { +#if !CALI_F_XDP struct __sk_buff *skb; +#else struct xdp_md *xdp; +#endif /* Our single copies of the data start/end pointers loaded from the skb. 
*/ void *data_start; void *data_end; void *ip_header; long ipheader_len; + void *nh; struct cali_tc_state *state; +#if !CALI_F_XDP const volatile struct cali_tc_globals *globals; +#else const volatile struct cali_xdp_globals *xdp_globals; /* XXX we must split the state between tc/xdp */ +#endif struct calico_nat_dest *nat_dest; - struct arp_key arpk; struct fwd fwd; void *counters; struct pkt_scratch *scratch; @@ -192,6 +195,7 @@ struct cali_tc_ctx { .counters = counters, \ .globals = gl, \ .scratch = scratch, \ + .nh = &scratch->l4, \ __VA_ARGS__ \ }; \ if (x.ipheader_len == 0) { \ @@ -201,11 +205,33 @@ struct cali_tc_ctx { x; \ }) \ +#ifdef IPVER6 +static CALI_BPF_INLINE struct ipv6hdr* ip_hdr(struct cali_tc_ctx *ctx) +{ + return (struct ipv6hdr *)ctx->ip_header; +} + +#define ip_hdr_set_ip(ctx, field, ip) do { \ + struct in6_addr *addr = &(ip_hdr(ctx)->field); \ + addr->in6_u.u6_addr32[0] = ip.a; \ + addr->in6_u.u6_addr32[1] = ip.b; \ + addr->in6_u.u6_addr32[2] = ip.c; \ + addr->in6_u.u6_addr32[3] = ip.d; \ +} while(0) + +#else + static CALI_BPF_INLINE struct iphdr* ip_hdr(struct cali_tc_ctx *ctx) { return (struct iphdr *)ctx->ip_header; } +#define ip_hdr_set_ip(ctx, field, ip) do { \ + ip_hdr(ctx)->field = ip; \ +} while (0) + +#endif + static CALI_BPF_INLINE struct ethhdr* eth_hdr(struct cali_tc_ctx *ctx) { return (struct ethhdr *)ctx->data_start; @@ -213,17 +239,17 @@ static CALI_BPF_INLINE struct ethhdr* eth_hdr(struct cali_tc_ctx *ctx) static CALI_BPF_INLINE struct tcphdr* tcp_hdr(struct cali_tc_ctx *ctx) { - return (struct tcphdr *)ctx->scratch->l4; + return (struct tcphdr *)ctx->nh; } static CALI_BPF_INLINE struct udphdr* udp_hdr(struct cali_tc_ctx *ctx) { - return (struct udphdr *)ctx->scratch->l4; + return (struct udphdr *)ctx->nh; } static CALI_BPF_INLINE struct icmphdr* icmp_hdr(struct cali_tc_ctx *ctx) { - return (struct icmphdr *)ctx->scratch->l4; + return (struct icmphdr *)ctx->nh; } static CALI_BPF_INLINE __u32 ctx_ifindex(struct cali_tc_ctx *ctx) 
@@ -249,5 +275,9 @@ static CALI_BPF_INLINE int l4_hdr_len(struct cali_tc_ctx *ctx) return 0; } +#define IP_VOID 0 +#define IP_EQ(ip1, ip2) ((ip1) == (ip2)) +#define IP_SET(var, val) ((var) = (val)) + #endif /* __CALI_BPF_TYPES_H__ */ diff --git a/felix/bpf-gpl/ut/ip_parse_test.c b/felix/bpf-gpl/ut/ip_parse_test.c new file mode 100644 index 00000000000..39110d03b96 --- /dev/null +++ b/felix/bpf-gpl/ut/ip_parse_test.c @@ -0,0 +1,59 @@ +// Project Calico BPF dataplane programs. +// Copyright (c) 2023 Tigera, Inc. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later + +#include "ut.h" +#include "parsing.h" +#include "jump.h" +#include "nat.h" + +const volatile struct cali_tc_globals __globals; + +static CALI_BPF_INLINE int calico_unittest_entry (struct __sk_buff *skb) +{ + volatile struct cali_tc_globals *globals = state_get_globals_tc(); + + if (!globals) { + return TC_ACT_SHOT; + } + + /* Set the globals for the rest of the prog chain. */ + *globals = __globals; + DECLARE_TC_CTX(_ctx, + .skb = skb, + .ipheader_len = IP_SIZE, + ); + struct cali_tc_ctx *ctx = &_ctx; + + if (!ctx->counters) { + CALI_DEBUG("Counters map lookup failed: DROP\n"); + return TC_ACT_SHOT; + } + + int ver; + + switch (parse_packet_ip(ctx)) { +#ifdef IPVER6 + case PARSING_OK_V6: + ver = 6; + break; +#else + case PARSING_OK: + ver = 4; + break; +#endif + default: + return TC_ACT_UNSPEC; + } + + tc_state_fill_from_iphdr(ctx); + + switch (tc_state_fill_from_nexthdr(ctx, true)) { + case PARSING_ERROR: + return -1; + case PARSING_ALLOW_WITHOUT_ENFORCING_POLICY: + return -2; + } + + return ver; +} diff --git a/felix/bpf-gpl/ut/ipv4_opts_test.c b/felix/bpf-gpl/ut/ipv4_opts_test.c index c547a601a54..6f3febd1485 100644 --- a/felix/bpf-gpl/ut/ipv4_opts_test.c +++ b/felix/bpf-gpl/ut/ipv4_opts_test.c @@ -44,7 +44,10 @@ static CALI_BPF_INLINE int calico_unittest_entry (struct __sk_buff *skb) goto allow; } - if (vxlan_v4_encap(ctx, 0x06060606, 0x10101010)) { + __u32 a = 0x06060606; 
+ __u32 b = 0x10101010; + + if (vxlan_encap(ctx, &a, &b)) { CALI_DEBUG("vxlan: encap failed!\n"); deny_reason(ctx, CALI_REASON_ENCAP_FAIL); goto deny; diff --git a/felix/bpf-gpl/ut/nat_decap_test.c b/felix/bpf-gpl/ut/nat_decap_test.c index 6f67ba642ef..aab9216782a 100644 --- a/felix/bpf-gpl/ut/nat_decap_test.c +++ b/felix/bpf-gpl/ut/nat_decap_test.c @@ -8,5 +8,5 @@ static CALI_BPF_INLINE int calico_unittest_entry (struct __sk_buff *skb) { - return vxlan_v4_decap(skb); + return vxlan_decap(skb); } diff --git a/felix/bpf-gpl/ut/nat_encap_test.c b/felix/bpf-gpl/ut/nat_encap_test.c index 932bd462277..32a3e9b2bc6 100644 --- a/felix/bpf-gpl/ut/nat_encap_test.c +++ b/felix/bpf-gpl/ut/nat_encap_test.c @@ -31,5 +31,9 @@ static CALI_BPF_INLINE int calico_unittest_entry (struct __sk_buff *skb) CALI_DEBUG("Counters map lookup failed: DROP\n"); return TC_ACT_SHOT; } - return vxlan_v4_encap(ctx, HOST_IP, 0x02020202); + + __u32 a = HOST_IP; + __u32 b = 0x02020202; + + return vxlan_encap(ctx, &a, &b); } diff --git a/felix/bpf-gpl/xdp.c b/felix/bpf-gpl/xdp.c index dea9dd6682c..5743c921c2e 100644 --- a/felix/bpf-gpl/xdp.c +++ b/felix/bpf-gpl/xdp.c @@ -62,6 +62,7 @@ int calico_xdp_main(struct xdp_md *xdp) } __builtin_memset(ctx->state, 0, sizeof(*ctx->state)); ctx->scratch = (void *)(ctx->xdp_globals + 1); /* needs to be set to something, not used, there is space */ + ctx->nh = &ctx->scratch->l4; counter_inc(ctx, COUNTER_TOTAL_PACKETS); diff --git a/felix/bpf/arp/map.go b/felix/bpf/arp/map.go index 7319410650e..c4e92a19649 100644 --- a/felix/bpf/arp/map.go +++ b/felix/bpf/arp/map.go @@ -26,6 +26,7 @@ import ( func init() { maps.SetSize(MapParams.VersionedName(), MapParams.MaxEntries) + maps.SetSize(MapV6Params.VersionedName(), MapParams.MaxEntries) } var MapParams = maps.MapParameters{ @@ -110,10 +111,10 @@ type MapMem map[Key]Value func LoadMapMem(m maps.Map) (MapMem, error) { ret := make(MapMem) - err := m.Iter(func(k, v []byte) maps.IteratorAction { - ks := len(Key{}) - vs := 
len(Value{}) + ks := len(Key{}) + vs := len(Value{}) + err := m.Iter(func(k, v []byte) maps.IteratorAction { var key Key copy(key[:ks], k[:ks]) diff --git a/felix/bpf/arp/map6.go b/felix/bpf/arp/map6.go new file mode 100644 index 00000000000..d0feab07f39 --- /dev/null +++ b/felix/bpf/arp/map6.go @@ -0,0 +1,111 @@ +// Copyright (c) 2020 Tigera, Inc. All rights reserved. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arp + +import ( + "encoding/binary" + "fmt" + "net" + + "github.com/projectcalico/calico/felix/bpf/maps" +) + +var MapV6Params = maps.MapParameters{ + Type: "lru_hash", + KeySize: KeyV6Size, + ValueSize: ValueV6Size, + MaxEntries: 10000, // max number of nodes that can forward nodeports to a single node + Name: "cali_v6_arp", + Version: 2, +} + +func MapV6() maps.Map { + return maps.NewPinnedMap(MapV6Params) +} + +const KeyV6Size = 20 + +type KeyV6 [KeyV6Size]byte + +func NewKeyV6(ip net.IP, ifIndex uint32) KeyV6 { + var k KeyV6 + + ip = ip.To16() + + copy(k[:16], ip) + binary.LittleEndian.PutUint32(k[16:20], ifIndex) + + return k +} + +func (k KeyV6) IP() net.IP { + return net.IP(k[:16]) +} + +func (k KeyV6) IfIndex() uint32 { + return binary.LittleEndian.Uint32(k[16:20]) +} + +func (k KeyV6) String() string { + return fmt.Sprintf("ip %s ifindex %d", k.IP(), k.IfIndex()) +} + +func (k KeyV6) AsBytes() []byte { + return k[:] +} + +const ValueV6Size = ValueSize + +type ValueV6 = Value + +type MapMemV6 map[KeyV6]ValueV6 + +// LoadMapMem 
loads ConntrackMap into memory +func LoadMapMemV6(m maps.Map) (MapMemV6, error) { + ret := make(MapMemV6) + + ks := len(KeyV6{}) + vs := len(ValueV6{}) + + err := m.Iter(func(k, v []byte) maps.IteratorAction { + var key KeyV6 + copy(key[:ks], k[:ks]) + + var val ValueV6 + copy(val[:vs], v[:vs]) + + ret[key] = val + return maps.IterNone + }) + + return ret, err +} + +// MapMemIterV6 returns maps.MapIter that loads the provided MapMem +func MapMemIterV6(m MapMemV6) maps.IterCallback { + ks := len(KeyV6{}) + vs := len(ValueV6{}) + + return func(k, v []byte) maps.IteratorAction { + var key KeyV6 + copy(key[:ks], k[:ks]) + + var val ValueV6 + copy(val[:vs], v[:vs]) + + m[key] = val + return maps.IterNone + } +} diff --git a/felix/bpf/conntrack/map.go b/felix/bpf/conntrack/map.go index 4a92473f3df..e47415d9189 100644 --- a/felix/bpf/conntrack/map.go +++ b/felix/bpf/conntrack/map.go @@ -28,24 +28,33 @@ import ( ) func init() { - SetMapSize(MapParams.MaxEntries) + SetMapSize(MaxEntries) } func SetMapSize(size int) { - maps.SetSize(MapParams.VersionedName(), size) + maps.SetSize(curVer.MapParams.VersionedName(), size) + maps.SetSize(curVer.MapParamsV6.VersionedName(), size) } const KeySize = curVer.KeySize +const KeyV6Size = curVer.KeyV6Size const ValueSize = curVer.ValueSize +const ValueV6Size = curVer.ValueV6Size const MaxEntries = curVer.MaxEntries type Key = curVer.Key +type KeyV6 = curVer.KeyV6 func NewKey(proto uint8, ipA net.IP, portA uint16, ipB net.IP, portB uint16) Key { return curVer.NewKey(proto, ipA, portA, ipB, portB) } +func NewKeyV6(proto uint8, ipA net.IP, portA uint16, ipB net.IP, portB uint16) KeyV6 { + return curVer.NewKeyV6(proto, ipA, portA, ipB, portB) +} + type Value = curVer.Value +type ValueV6 = curVer.ValueV6 const ( TypeNormal uint8 = iota @@ -77,9 +86,34 @@ func NewValueNATReverseSNAT(created, lastSeen time.Duration, flags uint16, legA, return curVer.NewValueNATReverseSNAT(created, lastSeen, flags, legA, legB, tunnelIP, origIP, origSrcIP, 
origPort) } +// NewValueV6Normal creates a new ValueV6 of type TypeNormal based on the given parameters +func NewValueV6Normal(created, lastSeen time.Duration, flags uint16, legA, legB Leg) ValueV6 { + return curVer.NewValueV6Normal(created, lastSeen, flags, legA, legB) +} + +// NewValueV6NATForward creates a new ValueV6 of type TypeNATForward for the given +// arguments and the reverse key +func NewValueV6NATForward(created, lastSeen time.Duration, flags uint16, revKey KeyV6) ValueV6 { + return curVer.NewValueV6NATForward(created, lastSeen, flags, revKey) +} + +// NewValueV6NATReverse creates a new ValueV6 of type TypeNATReverse for the given +// arguments and reverse parameters +func NewValueV6NATReverse(created, lastSeen time.Duration, flags uint16, legA, legB Leg, + tunnelIP, origIP net.IP, origPort uint16) ValueV6 { + return curVer.NewValueV6NATReverse(created, lastSeen, flags, legA, legB, tunnelIP, origIP, origPort) +} + +// NewValueV6NATReverseSNAT in addition to NewValueV6NATReverse sets the orig source IP +func NewValueV6NATReverseSNAT(created, lastSeen time.Duration, flags uint16, legA, legB Leg, + tunnelIP, origIP, origSrcIP net.IP, origPort uint16) ValueV6 { + return curVer.NewValueV6NATReverseSNAT(created, lastSeen, flags, legA, legB, tunnelIP, origIP, origSrcIP, origPort) +} + type Leg = curVer.Leg var MapParams = curVer.MapParams +var MapParamsV6 = curVer.MapParamsV6 func Map() maps.Map { b := maps.NewPinnedMap(MapParams) @@ -89,6 +123,12 @@ func Map() maps.Map { return b } +func MapV6() maps.Map { + b := maps.NewPinnedMap(MapParamsV6) + b.GetMapParams = GetMapParams + return b +} + func MapV2() maps.Map { return maps.NewPinnedMap(v2.MapParams) } @@ -158,6 +198,65 @@ func StringToValue(str string) Value { return BytesToValue([]byte(str)) } +func KeyV6FromBytes(k []byte) KeyV6 { + var ctKeyV6 KeyV6 + if len(k) != len(ctKeyV6) { + log.Panic("KeyV6 has unexpected length") + } + copy(ctKeyV6[:], k[:]) + return ctKeyV6 +} + +func ValueV6FromBytes(v 
[]byte) ValueV6 { + var ctVal ValueV6 + if len(v) != len(ctVal) { + log.Panic("ValueV6 has unexpected length") + } + copy(ctVal[:], v[:]) + return ctVal +} + +type MapMemV6 = curVer.MapMemV6 + +// LoadMapMem loads ConntrackMap into memory +func LoadMapMemV6(m maps.Map) (MapMemV6, error) { + ret, err := curVer.LoadMapMemV6(m) + return ret, err +} + +// MapMemIter returns maps.MapIter that loads the provided MapMem +func MapMemIterV6(m MapMemV6) func(k, v []byte) { + return curVer.MapMemIterV6(m) +} + +// BytesToKeyV6 turns a slice of bytes into a KeyV6 +func BytesToKeyV6(bytes []byte) KeyV6 { + var k KeyV6 + + copy(k[:], bytes[:]) + + return k +} + +// StringToKeyV6 turns a string into a KeyV6 +func StringToKeyV6(str string) KeyV6 { + return BytesToKeyV6([]byte(str)) +} + +// BytesToValueV6 turns a slice of bytes into a value +func BytesToValueV6(bytes []byte) ValueV6 { + var v ValueV6 + + copy(v[:], bytes) + + return v +} + +// StringToValueV6 turns a string into a ValueV6 +func StringToValueV6(str string) ValueV6 { + return BytesToValueV6([]byte(str)) +} + func GetMapParams(version int) maps.MapParameters { switch version { case 2: diff --git a/felix/bpf/conntrack/v3/map.go b/felix/bpf/conntrack/v3/map.go index 129592443b1..3fa50d4b0ce 100644 --- a/felix/bpf/conntrack/v3/map.go +++ b/felix/bpf/conntrack/v3/map.go @@ -506,12 +506,6 @@ var MapParams = maps.MapParameters{ UpdatedByBPF: true, } -const ( - ProtoICMP = 1 - ProtoTCP = 6 - ProtoUDP = 17 -) - func KeyFromBytes(k []byte) Key { var ctKey Key if len(k) != len(ctKey) { diff --git a/felix/bpf/conntrack/v3/map6.go b/felix/bpf/conntrack/v3/map6.go new file mode 100644 index 00000000000..5799e6cf92f --- /dev/null +++ b/felix/bpf/conntrack/v3/map6.go @@ -0,0 +1,444 @@ +// Copyright (c) 2022 Tigera, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package v3 + +import ( + "encoding/binary" + "fmt" + "net" + "time" + + log "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + + "github.com/projectcalico/calico/felix/bpf/maps" +) + +// struct calico_ct_key { +// uint32_t protocol; +// __be32 addr_a, addr_b; // NBO +// uint16_t port_a, port_b; // HBO +// }; +const KeyV6Size = 40 +const ValueV6Size = 128 + +type KeyV6 [KeyV6Size]byte + +func (k KeyV6) AsBytes() []byte { + return k[:] +} + +func (k KeyV6) Proto() uint8 { + return uint8(binary.LittleEndian.Uint32(k[:4])) +} + +func (k KeyV6) AddrA() net.IP { + return k[4:20] +} + +func (k KeyV6) PortA() uint16 { + return binary.LittleEndian.Uint16(k[36:38]) +} + +func (k KeyV6) AddrB() net.IP { + return k[20:36] +} + +func (k KeyV6) PortB() uint16 { + return binary.LittleEndian.Uint16(k[38:40]) +} + +func (k KeyV6) String() string { + return fmt.Sprintf("ConntrackKey{proto=%v %v:%v <-> %v:%v}", + k.Proto(), k.AddrA(), k.PortA(), k.AddrB(), k.PortB()) +} + +func (k KeyV6) Upgrade() maps.Upgradable { + panic("conntrack map key already at its latest version") +} + +func NewKeyV6(proto uint8, ipA net.IP, portA uint16, ipB net.IP, portB uint16) KeyV6 { + var k KeyV6 + binary.LittleEndian.PutUint32(k[:4], uint32(proto)) + copy(k[4:20], ipA.To16()) + copy(k[20:36], ipB.To16()) + binary.LittleEndian.PutUint16(k[36:38], portA) + binary.LittleEndian.PutUint16(k[38:40], portB) + return k +} + +// struct calico_ct_value { +// __u64 created; +// __u64 last_seen; // 8 +// __u8 type; // 16 +// __u8 flags; // 17 +// +// // Important to use explicit padding, 
otherwise the compiler can decide +// // not to zero the padding bytes, which upsets the verifier. Worse than +// // that, debug logging often prevents such optimisation resulting in +// // failures when debug logging is compiled out only :-). +// __u8 pad0[5]; +// __u8 flags2; +// union { +// // CALI_CT_TYPE_NORMAL and CALI_CT_TYPE_NAT_REV. +// struct { +// struct calico_ct_leg a_to_b; // 24 +// struct calico_ct_leg b_to_a; // 36 +// +// // CALI_CT_TYPE_NAT_REV only. +// __u32 orig_dst; // 48 +// __u16 orig_port; // 52 +// __u8 pad1[2]; // 54 +// __u32 tun_ip; // 56 +// __u32 pad3; // 60 +// }; +// +// // CALI_CT_TYPE_NAT_FWD; key for the CALI_CT_TYPE_NAT_REV entry. +// struct { +// struct calico_ct_key nat_rev_key; // 24 +// __u8 pad2[8]; +// }; +// }; +// }; + +const ( + VoCreatedV6 int = 0 + VoLastSeenV6 int = 8 + VoTypeV6 int = 16 + VoFlagsV6 int = 17 + VoFlags2V6 int = 23 + VoRevKeyV6 int = 24 + VoLegABV6 int = 24 + VoLegBAV6 int = 48 + VoTunIPV6 int = 72 + VoOrigIPV6 int = VoTunIPV6 + 16 + VoOrigPortV6 int = VoOrigIPV6 + 16 + VoOrigSPortV6 int = VoOrigPortV6 + 2 + VoOrigSIPV6 int = VoOrigSPortV6 + 2 + VoNATSPortV6 int = VoRevKeyV6 + KeyV6Size +) + +type ValueV6 [ValueV6Size]byte + +func (e ValueV6) Created() int64 { + return int64(binary.LittleEndian.Uint64(e[VoCreatedV6 : VoCreatedV6+8])) +} + +func (e ValueV6) LastSeen() int64 { + return int64(binary.LittleEndian.Uint64(e[VoLastSeenV6 : VoLastSeenV6+8])) +} + +func (e ValueV6) Type() uint8 { + return e[VoTypeV6] +} + +func (e ValueV6) Flags() uint16 { + return uint16(e[VoFlagsV6]) | (uint16(e[VoFlags2V6]) << 8) // low byte | high byte, both at the V6 offsets +} + +// OrigIP returns the original destination IP, valid only if Type() is TypeNormal or TypeNATReverse +func (e ValueV6) OrigIP() net.IP { + return e[VoOrigIPV6 : VoOrigIPV6+16] +} + +// OrigPort returns the original destination port, valid only if Type() is TypeNormal or TypeNATReverse +func (e ValueV6) OrigPort() uint16 { + return binary.LittleEndian.Uint16(e[VoOrigPortV6 : VoOrigPortV6+2]) +} +
+// OrigSPort returns the original source port, valid only if Type() is +// TypeNATReverse and if the value returned is non-zero. +func (e ValueV6) OrigSPort() uint16 { + return binary.LittleEndian.Uint16(e[VoOrigSPortV6 : VoOrigSPortV6+2]) +} + +// NATSPort returns the port to SNAT to, valid only if Type() is TypeNATForward. +func (e ValueV6) NATSPort() uint16 { + return binary.LittleEndian.Uint16(e[VoNATSPortV6 : VoNATSPortV6+2]) +} + +// OrigSrcIP returns the original source IP. +func (e ValueV6) OrigSrcIP() net.IP { + return e[VoOrigSIPV6 : VoOrigSIPV6+16] +} + +func (e ValueV6) ReverseNATKey() KeyV6 { + var ret KeyV6 + + l := len(KeyV6{}) + copy(ret[:l], e[VoRevKeyV6:VoRevKeyV6+l]) + + return ret +} + +// AsBytes returns the value as slice of bytes +func (e ValueV6) AsBytes() []byte { + return e[:] +} + +func (e *ValueV6) SetLegA2B(leg Leg) { + copy(e[VoLegABV6:VoLegABV6+legSize], leg.AsBytes()) +} + +func (e *ValueV6) SetLegB2A(leg Leg) { + copy(e[VoLegBAV6:VoLegBAV6+legSize], leg.AsBytes()) +} + +func (e *ValueV6) SetOrigSport(sport uint16) { + binary.LittleEndian.PutUint16(e[VoOrigSPortV6:VoOrigSPortV6+2], sport) +} + +func (e *ValueV6) SetNATSport(sport uint16) { + binary.LittleEndian.PutUint16(e[VoNATSPortV6:VoNATSPortV6+2], sport) +} + +func initValueV6(v *ValueV6, created, lastSeen time.Duration, typ uint8, flags uint16) { + binary.LittleEndian.PutUint64(v[VoCreatedV6:VoCreatedV6+8], uint64(created)) + binary.LittleEndian.PutUint64(v[VoLastSeenV6:VoLastSeenV6+8], uint64(lastSeen)) + v[VoTypeV6] = typ + v[VoFlagsV6] = byte(flags & 0xff) // low byte of flags + v[VoFlags2V6] = byte((flags >> 8) & 0xff) // high byte of flags, at the V6 offset +} + +// NewValueV6Normal creates a new ValueV6 of type TypeNormal based on the given parameters +func NewValueV6Normal(created, lastSeen time.Duration, flags uint16, legA, legB Leg) ValueV6 { + v := ValueV6{} + + initValueV6(&v, created, lastSeen, TypeNormal, flags) + + v.SetLegA2B(legA) + v.SetLegB2A(legB) + + return v +} + +// NewValueV6NATForward creates a new ValueV6 of type
TypeNATForward for the given +// arguments and the reverse key. All KeyV6Size bytes of revKey are stored. +func NewValueV6NATForward(created, lastSeen time.Duration, flags uint16, revKey KeyV6) ValueV6 { + v := ValueV6{} + + initValueV6(&v, created, lastSeen, TypeNATForward, flags) + + copy(v[VoRevKeyV6:VoRevKeyV6+KeyV6Size], revKey.AsBytes()) + + return v +} + +// NewValueV6NATReverse creates a new ValueV6 of type TypeNATReverse for the given +// arguments and reverse parameters. origIP and tunnelIP are stored in 16-byte form. +func NewValueV6NATReverse(created, lastSeen time.Duration, flags uint16, legA, legB Leg, + tunnelIP, origIP net.IP, origPort uint16) ValueV6 { + v := ValueV6{} + + initValueV6(&v, created, lastSeen, TypeNATReverse, flags) + + v.SetLegA2B(legA) + v.SetLegB2A(legB) + + copy(v[VoOrigIPV6:VoOrigIPV6+16], origIP.To16()) + binary.LittleEndian.PutUint16(v[VoOrigPortV6:VoOrigPortV6+2], origPort) + + copy(v[VoTunIPV6:VoTunIPV6+16], tunnelIP.To16()) + + return v +} + +// NewValueV6NATReverseSNAT in addition to NewValueV6NATReverse sets the orig source IP from origSrcIP +func NewValueV6NATReverseSNAT(created, lastSeen time.Duration, flags uint16, legA, legB Leg, + tunnelIP, origIP, origSrcIP net.IP, origPort uint16) ValueV6 { + v := NewValueV6NATReverse(created, lastSeen, flags, legA, legB, tunnelIP, origIP, origPort) + copy(v[VoOrigSIPV6:VoOrigSIPV6+16], origSrcIP.To16()) + + return v +} + +func readConntrackLegV6(b []byte) Leg { + bits := binary.LittleEndian.Uint32(b[legExtra+4 : legExtra+8]) + return Leg{ + Bytes: binary.LittleEndian.Uint64(b[0:8]), + Packets: binary.LittleEndian.Uint32(b[8:12]), + Seqno: binary.BigEndian.Uint32(b[legExtra+0 : legExtra+4]), + SynSeen: bitSet(bits, 0), + AckSeen: bitSet(bits, 1), + FinSeen: bitSet(bits, 2), + RstSeen: bitSet(bits, 3), + Approved: bitSet(bits, 4), + Opener: bitSet(bits, 5), + Ifindex: binary.LittleEndian.Uint32(b[legExtra+8 : legExtra+12]), + } +} + +func (e ValueV6) Data() EntryData { + ip := e[VoOrigIPV6 : VoOrigIPV6+16] + tip := e[VoTunIPV6 : VoTunIPV6+16] + sip := e[VoOrigSIPV6 : VoOrigSIPV6+16] + return
EntryData{ + A2B: readConntrackLegV6(e[VoLegABV6 : VoLegABV6+legSize]), + B2A: readConntrackLegV6(e[VoLegBAV6 : VoLegBAV6+legSize]), + OrigDst: ip, + OrigSrc: sip, + OrigPort: binary.LittleEndian.Uint16(e[VoOrigPortV6 : VoOrigPortV6+2]), + OrigSPort: binary.LittleEndian.Uint16(e[VoOrigPortV6+2 : VoOrigPortV6+4]), + TunIP: tip, + } +} + +func (e ValueV6) String() string { + flagsStr := "" + flags := e.Flags() + + if flags == 0 { + flagsStr = " " + } else { + flagsStr = fmt.Sprintf(" 0x%x", flags) + if flags&FlagNATOut != 0 { + flagsStr += " nat-out" + } + + if flags&FlagNATFwdDsr != 0 { + flagsStr += " fwd-dsr" + } + + if flags&FlagNATNPFwd != 0 { + flagsStr += " np-fwd" + } + + if flags&FlagSkipFIB != 0 { + flagsStr += " skip-fib" + } + + if flags&FlagExtLocal != 0 { + flagsStr += " ext-local" + } + + if flags&FlagViaNATIf != 0 { + flagsStr += " via-nat-iface" + } + + if flags&FlagSrcDstBA != 0 { + flagsStr += " B-A" + } + + if flags&FlagHostPSNAT != 0 { + flagsStr += " host-psnat" + } + + if flags&FlagSvcSelf != 0 { + flagsStr += " svc-self" + } + + if flags&FlagNPLoop != 0 { + flagsStr += " np-loop" + } + + if flags&FlagNPRemote != 0 { + flagsStr += " np-remote" + } + + if flags&FlagNoDSR != 0 { + flagsStr += " no-dsr" + } + } + + ret := fmt.Sprintf("Entry{Type:%d, Created:%d, LastSeen:%d, Flags:%s ", + e.Type(), e.Created(), e.LastSeen(), flagsStr) + + switch e.Type() { + case TypeNATForward: + ret += fmt.Sprintf("REVKey: %s NATSPort: %d", e.ReverseNATKey().String(), e.NATSPort()) + case TypeNormal, TypeNATReverse: + ret += fmt.Sprintf("Data: %+v", e.Data()) + default: + ret += "TYPE INVALID" + } + + return ret + "}" +} + +func (e ValueV6) IsForwardDSR() bool { + return e.Flags()&FlagNATFwdDsr != 0 +} + +func (e ValueV6) Upgrade() maps.Upgradable { + panic("conntrack map value already at its latest version") +} + +var MapParamsV6 = maps.MapParameters{ + Type: "hash", + KeySize: KeyV6Size, + ValueSize: ValueV6Size, + MaxEntries: MaxEntries, + Name: 
"cali_v6_ct", + Flags: unix.BPF_F_NO_PREALLOC, + Version: 3, + UpdatedByBPF: true, +} + +func KeyV6FromBytes(k []byte) KeyV6 { + var ctKey KeyV6 + if len(k) != len(ctKey) { + log.Panic("KeyV6 has unexpected length") + } + copy(ctKey[:], k[:]) + return ctKey +} + +func ValueV6FromBytes(v []byte) ValueV6 { + var ctVal ValueV6 + if len(v) != len(ctVal) { + log.Panic("ValueV6 has unexpected length") + } + copy(ctVal[:], v[:]) + return ctVal +} + +type MapMemV6 map[KeyV6]ValueV6 + +// LoadMapMem loads ConntrackMap into memory +func LoadMapMemV6(m maps.Map) (MapMemV6, error) { + ret := make(MapMemV6) + + err := m.Iter(func(k, v []byte) maps.IteratorAction { + ks := len(KeyV6{}) + vs := len(ValueV6{}) + + var key KeyV6 + copy(key[:ks], k[:ks]) + + var val ValueV6 + copy(val[:vs], v[:vs]) + + ret[key] = val + return maps.IterNone + }) + + return ret, err +} + +// MapMemIterV6 returns maps.MapIter that loads the provided MapMemV6 +func MapMemIterV6(m MapMemV6) func(k, v []byte) { + ks := len(KeyV6{}) + vs := len(ValueV6{}) + + return func(k, v []byte) { + var key KeyV6 + copy(key[:ks], k[:ks]) + + var val ValueV6 + copy(val[:vs], v[:vs]) + + m[key] = val + } +} diff --git a/felix/bpf/libbpf/libbpf.go b/felix/bpf/libbpf/libbpf.go index 83145af3d44..d3a1a40eeaa 100644 --- a/felix/bpf/libbpf/libbpf.go +++ b/felix/bpf/libbpf/libbpf.go @@ -66,6 +66,10 @@ func (m *Map) ValueSize() int { return int(C.bpf_map__value_size(m.bpfMap)) } +func (m *Map) KeySize() int { + return int(C.bpf_map__key_size(m.bpfMap)) +} + func (m *Map) SetPinPath(path string) error { cPath := C.CString(path) defer C.free(unsafe.Pointer(cPath)) @@ -415,6 +419,41 @@ func TcSetGlobals( return err } +func TcSetGlobals6( + m *Map, + globalData *TcGlobalData6, +) error { + + cName := C.CString(globalData.IfaceName) + defer C.free(unsafe.Pointer(cName)) + + cJumps := make([]C.uint, len(globalData.Jumps)) + + for i, v := range globalData.Jumps { + cJumps[i] = C.uint(v) + } + + _, err := 
C.bpf_tc_set_globals_v6(m.bpfMap, + cName, + (*C.char)(unsafe.Pointer(&globalData.HostIP[0])), + (*C.char)(unsafe.Pointer(&globalData.IntfIP[0])), + C.uint(globalData.ExtToSvcMark), + C.ushort(globalData.Tmtu), + C.ushort(globalData.VxlanPort), + C.ushort(globalData.PSNatStart), + C.ushort(globalData.PSNatLen), + (*C.char)(unsafe.Pointer(&globalData.HostTunnelIP[0])), + C.uint(globalData.Flags), + C.ushort(globalData.WgPort), + C.uint(globalData.NatIn), + C.uint(globalData.NatOut), + C.uint(globalData.LogFilterJmp), + &cJumps[0], // it is safe because we hold the reference here until we return. + ) + + return err +} + func CTLBSetGlobals(m *Map, udpNotSeen time.Duration, excludeUDP bool) error { udpNotSeen /= time.Second // Convert to seconds _, err := C.bpf_ctlb_set_globals(m.bpfMap, C.uint(udpNotSeen), C.bool(excludeUDP)) diff --git a/felix/bpf/libbpf/libbpf_api.h b/felix/bpf/libbpf/libbpf_api.h index 03df79842a1..33c36dc51a4 100644 --- a/felix/bpf/libbpf/libbpf_api.h +++ b/felix/bpf/libbpf/libbpf_api.h @@ -19,6 +19,7 @@ #include #include #include "globals.h" +#include "ip_addr.h" static void set_errno(int ret) { errno = ret >= 0 ? 
ret : -ret; @@ -181,6 +182,52 @@ void bpf_tc_set_globals(struct bpf_map *map, set_errno(bpf_map__set_initial_value(map, (void*)(&data), sizeof(data))); } +void bpf_tc_set_globals_v6(struct bpf_map *map, + char *iface_name, + char* host_ip, + char* intf_ip, + uint ext_to_svc_mark, + ushort tmtu, + ushort vxlanPort, + ushort psnat_start, + ushort psnat_len, + char* host_tunnel_ip, + uint flags, + ushort wg_port, + uint natin, + uint natout, + uint log_filter_jmp, + uint *jumps) +{ + struct cali_tc_globals_v6 data = { + .tunnel_mtu = tmtu, + .vxlan_port = vxlanPort, + .ext_to_svc_mark = ext_to_svc_mark, + .psnat_start = psnat_start, + .psnat_len = psnat_len, + .flags = flags, + .wg_port = wg_port, + .natin_idx = natin, + .natout_idx = natout, + .log_filter_jmp = log_filter_jmp, + }; + + memcpy(&data.host_ip, host_ip, 16); + memcpy(&data.intf_ip, intf_ip, 16); + memcpy(&data.host_tunnel_ip, host_tunnel_ip, 16); + + strncpy(data.iface_name, iface_name, sizeof(data.iface_name)); + data.iface_name[sizeof(data.iface_name)-1] = '\0'; + + int i; + + for (i = 0; i < sizeof(data.jumps)/sizeof(uint); i++) { + data.jumps[i] = jumps[i]; + } + + set_errno(bpf_map__set_initial_value(map, (void*)(&data), sizeof(data))); +} + int bpf_xdp_program_id(int ifIndex) { __u32 prog_id = 0, flags = 0; int err; diff --git a/felix/bpf/libbpf/libbpf_common.go b/felix/bpf/libbpf/libbpf_common.go index 5082e6c82a5..888ef084286 100644 --- a/felix/bpf/libbpf/libbpf_common.go +++ b/felix/bpf/libbpf/libbpf_common.go @@ -32,6 +32,24 @@ type TcGlobalData struct { Jumps [32]uint32 } +type TcGlobalData6 struct { + IfaceName string + HostIP [16]byte + IntfIP [16]byte + ExtToSvcMark uint32 + Tmtu uint16 + VxlanPort uint16 + PSNatStart uint16 + PSNatLen uint16 + HostTunnelIP [16]byte + Flags uint32 + WgPort uint16 + NatIn uint32 + NatOut uint32 + LogFilterJmp uint32 + Jumps [32]uint32 +} + type XDPGlobalData struct { IfaceName string Jumps [32]uint32 diff --git a/felix/bpf/libbpf/libbpf_stub.go 
b/felix/bpf/libbpf/libbpf_stub.go index ae089f84f93..fc57f75ad9a 100644 --- a/felix/bpf/libbpf/libbpf_stub.go +++ b/felix/bpf/libbpf/libbpf_stub.go @@ -133,6 +133,10 @@ func TcSetGlobals(_ *Map, globalData *TcGlobalData) error { panic("LIBBPF syscall stub") } +func TcSetGlobals6(_ *Map, globalData *TcGlobalData6) error { + panic("LIBBPF syscall stub") +} + func CTLBSetGlobals(_ *Map, _ time.Duration, _ bool) error { panic("LIBBPF syscall stub") } diff --git a/felix/bpf/maps/maps.go b/felix/bpf/maps/maps.go index 000d428ede7..cdafae9982f 100644 --- a/felix/bpf/maps/maps.go +++ b/felix/bpf/maps/maps.go @@ -645,7 +645,11 @@ func (b *PinnedMap) EnsureExists() error { } } - log.WithField("name", b.Name).Debug("Map didn't exist, creating it") + log.WithFields(log.Fields{ + "name": b.Name, + "keySize": b.KeySize, + "valuesize": b.ValueSize, + }).Debug("Map didn't exist, creating it") cmd := exec.Command("bpftool", "map", "create", b.VersionedFilename(), "type", b.Type, "key", fmt.Sprint(b.KeySize), diff --git a/felix/bpf/nat/maps.go b/felix/bpf/nat/maps.go index 7314c5368d5..ecc760506bd 100644 --- a/felix/bpf/nat/maps.go +++ b/felix/bpf/nat/maps.go @@ -34,12 +34,22 @@ func init() { maps.SetSize(AffinityMapParameters.VersionedName(), AffinityMapParameters.MaxEntries) maps.SetSize(SendRecvMsgMapParameters.VersionedName(), SendRecvMsgMapParameters.MaxEntries) maps.SetSize(CTNATsMapParameters.VersionedName(), CTNATsMapParameters.MaxEntries) + + maps.SetSize(FrontendMapV6Parameters.VersionedName(), FrontendMapV6Parameters.MaxEntries) + maps.SetSize(BackendMapV6Parameters.VersionedName(), BackendMapV6Parameters.MaxEntries) + maps.SetSize(AffinityMapV6Parameters.VersionedName(), AffinityMapV6Parameters.MaxEntries) + maps.SetSize(SendRecvMsgMapV6Parameters.VersionedName(), SendRecvMsgMapV6Parameters.MaxEntries) + maps.SetSize(CTNATsMapV6Parameters.VersionedName(), CTNATsMapV6Parameters.MaxEntries) } func SetMapSizes(fsize, bsize, asize int) { 
maps.SetSize(FrontendMapParameters.VersionedName(), fsize) maps.SetSize(BackendMapParameters.VersionedName(), bsize) maps.SetSize(AffinityMapParameters.VersionedName(), asize) + + maps.SetSize(FrontendMapV6Parameters.VersionedName(), fsize) + maps.SetSize(BackendMapV6Parameters.VersionedName(), bsize) + maps.SetSize(AffinityMapV6Parameters.VersionedName(), asize) } // struct calico_nat_v4_key { diff --git a/felix/bpf/nat/maps6.go b/felix/bpf/nat/maps6.go new file mode 100644 index 00000000000..591aaa8cf53 --- /dev/null +++ b/felix/bpf/nat/maps6.go @@ -0,0 +1,630 @@ +// Copyright (c) 2020-2021 Tigera, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package nat + +import ( + "encoding/binary" + "fmt" + "net" + "time" + + "golang.org/x/sys/unix" + + "github.com/projectcalico/calico/felix/bpf/maps" + "github.com/projectcalico/calico/felix/ip" +) + +// struct calico_nat_v4_key { +// uint32_t prefixLen; +// uint32_t addr; // NBO +// uint16_t port; // HBO +// uint8_t protocol; +// uint32_t saddr; +// uint8_t pad; +// }; +const frontendKeyV6Size = 40 + +// struct calico_nat { +// uint32_t addr; +// uint16_t port; +// uint8_t protocol; +// uint8_t pad; +// }; +const frontendAffKeyV6Size = 20 + +// struct calico_nat_v4_value { +// uint32_t id; +// uint32_t count; +// uint32_t local; +// uint32_t affinity_timeo; +// uint32_t flags; +// }; +const frontendValueV6Size = 20 + +// struct calico_nat_secondary_v4_key { +// uint32_t id; +// uint32_t ordinal; +// }; +const backendKeyV6Size = 8 + +// struct calico_nat_dest { +// uint32_t addr; +// uint16_t port; +// uint8_t pad[2]; +// }; +const backendValueV6Size = 20 + +// (sizeof(addr) + sizeof(port) + sizeof(proto)) in bits +const ZeroCIDRV6PrefixLen = (16 + 2 + 1) * 8 + +var ZeroCIDRV6 = ip.MustParseCIDROrIP("::/0").(ip.V6CIDR) + +type FrontendKeyV6 [frontendKeyV6Size]byte + +func NewNATKeyV6(addr net.IP, port uint16, protocol uint8) FrontendKeyV6 { + return NewNATKeyV6Src(addr, port, protocol, ZeroCIDRV6) +} + +func NewNATKeyV6Src(addr net.IP, port uint16, protocol uint8, cidr ip.V6CIDR) FrontendKeyV6 { + var k FrontendKeyV6 + prefixlen := ZeroCIDRV6PrefixLen + addr = addr.To16() + binary.LittleEndian.PutUint32(k[:4], uint32(prefixlen)+uint32(cidr.Prefix())) + copy(k[4:20], addr) + binary.LittleEndian.PutUint16(k[20:22], port) + k[22] = protocol + copy(k[23:39], cidr.Addr().AsNetIP().To16()) + return k +} + +func (k FrontendKeyV6) Proto() uint8 { + return k[22] +} + +func (k FrontendKeyV6) Addr() net.IP { + return k[4:20] +} + +func (k FrontendKeyV6) srcAddr() ip.Addr { + var addr ip.V6Addr + copy(addr[:], k[23:39]) + return addr +} + +// This function returns the 
Prefix length of the source CIDR +func (k FrontendKeyV6) SrcPrefixLen() uint32 { + return k.PrefixLen() - ZeroCIDRV6PrefixLen +} + +func (k FrontendKeyV6) SrcCIDR() ip.CIDR { + return ip.CIDRFromAddrAndPrefix(k.srcAddr(), int(k.SrcPrefixLen())) +} + +func (k FrontendKeyV6) PrefixLen() uint32 { + return binary.LittleEndian.Uint32(k[0:4]) +} + +func (k FrontendKeyV6) Port() uint16 { + return binary.LittleEndian.Uint16(k[20:22]) +} + +func (k FrontendKeyV6) AsBytes() []byte { + return k[:] +} + +func (k FrontendKeyV6) Affinitykey() []byte { + return k[4 : 4+frontendAffKeyV6Size] +} + +func (k FrontendKeyV6) String() string { + return fmt.Sprintf("NATKeyV6{Proto:%v Addr:%v Port:%v SrcAddr:%v}", k.Proto(), k.Addr(), k.Port(), k.SrcCIDR()) +} + +func FrontendKeyV6FromBytes(b []byte) FrontendKeyV6 { + var k FrontendKeyV6 + copy(k[:], b) + return k +} + +type FrontendValueV6 = FrontendValue + +func NewNATValueV6(id uint32, count, local, affinityTimeo uint32) FrontendValueV6 { + return NewNATValue(id, count, local, affinityTimeo) +} + +func NewNATValueV6WithFlags(id uint32, count, local, affinityTimeo, flags uint32) FrontendValueV6 { + v := NewNATValue(id, count, local, affinityTimeo) + binary.LittleEndian.PutUint32(v[16:20], flags) + return v +} + +func FrontendValueV6FromBytes(b []byte) FrontendValueV6 { + var v FrontendValueV6 + copy(v[:], b) + return v +} + +type BackendKeyV6 = BackendKey + +func NewNATBackendKeyV6(id, ordinal uint32) BackendKeyV6 { + return NewNATBackendKey(id, ordinal) +} + +func BackendKeyV6FromBytes(b []byte) BackendKeyV6 { + var k BackendKeyV6 + copy(k[:], b) + return k +} + +type BackendValueV6 [backendValueV6Size]byte + +func NewNATBackendValueV6(addr net.IP, port uint16) BackendValueV6 { + var k BackendValueV6 + addr = addr.To16() + copy(k[:16], addr) + binary.LittleEndian.PutUint16(k[16:18], port) + return k +} + +func (k BackendValueV6) Addr() net.IP { + return k[:16] +} + +func (k BackendValueV6) Port() uint16 { + return binary.LittleEndian.Uint16(k[16:18]) +} + 
+func (k BackendValueV6) String() string { + return fmt.Sprintf("NATBackendValueV6{Addr:%v Port:%v}", k.Addr(), k.Port()) +} + +func (k BackendValueV6) AsBytes() []byte { + return k[:] +} + +func BackendValueV6FromBytes(b []byte) BackendValueV6 { + var v BackendValueV6 + copy(v[:], b) + return v +} + +var FrontendMapV6Parameters = maps.MapParameters{ + Type: "lpm_trie", + KeySize: frontendKeyV6Size, + ValueSize: frontendValueV6Size, + MaxEntries: 64 * 1024, + Name: "cali_v6_nat_fe", + Flags: unix.BPF_F_NO_PREALLOC, + Version: 3, +} + +func FrontendMapV6() maps.MapWithExistsCheck { + return maps.NewPinnedMap(FrontendMapV6Parameters) +} + +var BackendMapV6Parameters = maps.MapParameters{ + Type: "hash", + KeySize: backendKeyV6Size, + ValueSize: backendValueV6Size, + MaxEntries: 256 * 1024, + Name: "cali_v6_nat_be", + Flags: unix.BPF_F_NO_PREALLOC, +} + +func BackendMapV6() maps.MapWithExistsCheck { + return maps.NewPinnedMap(BackendMapV6Parameters) +} + +// NATMapMem represents FrontendMap loaded into memory +type MapMemV6 map[FrontendKeyV6]FrontendValueV6 + +// Equal compares keys and values of the NATMapMem +func (m MapMemV6) Equal(cmp MapMemV6) bool { + if len(m) != len(cmp) { + return false + } + + for k, v := range m { + v2, ok := cmp[k] + if !ok || v != v2 { + return false + } + } + + return true +} + +// LoadFrontendMap loads the NAT map into a go map or returns an error +func LoadFrontendMapV6(m maps.Map) (MapMemV6, error) { + ret := make(MapMemV6) + + if err := m.Open(); err != nil { + return nil, err + } + + iterFn := MapMemV6Iter(ret) + + err := m.Iter(func(k, v []byte) maps.IteratorAction { + iterFn(k, v) + return maps.IterNone + }) + if err != nil { + ret = nil + } + + return ret, err +} + +// MapMemIter returns maps.MapIter that loads the provided NATMapMem +func MapMemV6Iter(m MapMemV6) func(k, v []byte) { + ks := len(FrontendKeyV6{}) + vs := len(FrontendValueV6{}) + + return func(k, v []byte) { + var key FrontendKeyV6 + copy(key[:ks], k[:ks]) + + var 
val FrontendValueV6 + copy(val[:vs], v[:vs]) + + m[key] = val + } +} + +// BackendMapMemV6 represents a NATBackend loaded into memory +type BackendMapMemV6 map[BackendKeyV6]BackendValueV6 + +// Equal compares keys and values of the BackendMapMemV6 +func (m BackendMapMemV6) Equal(cmp BackendMapMemV6) bool { + if len(m) != len(cmp) { + return false + } + + for k, v := range m { + v2, ok := cmp[k] + if !ok || v != v2 { + return false + } + } + + return true +} + +// LoadBackendMapV6 loads the NATBackend map into a go map or returns an error +func LoadBackendMapV6(m maps.Map) (BackendMapMemV6, error) { + ret := make(BackendMapMemV6) + + if err := m.Open(); err != nil { + return nil, err + } + + iterFn := BackendMapMemV6Iter(ret) + + err := m.Iter(func(k, v []byte) maps.IteratorAction { + iterFn(k, v) + return maps.IterNone + }) + if err != nil { + ret = nil + } + + return ret, err +} + +// BackendMapMemV6Iter returns maps.MapIter that loads the provided BackendMapMemV6 +func BackendMapMemV6Iter(m BackendMapMemV6) func(k, v []byte) { + ks := len(BackendKeyV6{}) + vs := len(BackendValueV6{}) + + return func(k, v []byte) { + var key BackendKeyV6 + copy(key[:ks], k[:ks]) + + var val BackendValueV6 + copy(val[:vs], v[:vs]) + + m[key] = val + } +} + +// struct calico_nat_v4_affinity_key { +// struct calico_nat_v4 nat_key; +// uint32_t client_ip; +// uint32_t padding; +// }; + +const affinityKeyV6Size = frontendAffKeyV6Size + 16 + 4 + +// AffinityKeyV6 is a key into the affinity table that consist of FrontendKeyV6 and +// the client's IP +type AffinityKeyV6 [affinityKeyV6Size]byte + +type FrontEndAffinityKeyV6 [frontendAffKeyV6Size]byte + +func (k FrontEndAffinityKeyV6) AsBytes() []byte { + return k[:] +} + +func (k FrontEndAffinityKeyV6) String() string { + return fmt.Sprintf("FrontEndAffinityKeyV6{Proto:%v Addr:%v Port:%v}", k.Proto(), k.Addr(), k.Port()) +} + +func (k FrontEndAffinityKeyV6) Proto() uint8 { + return k[18] +} + +func (k FrontEndAffinityKeyV6) Addr() net.IP { + 
return k[0:16] +} + +func (k FrontEndAffinityKeyV6) Port() uint16 { + return binary.LittleEndian.Uint16(k[16:18]) +} + +// NewAffinityKeyV6 creates a new AffinityKeyV6 from a clientIP and FrontendKeyV6 +func NewAffinityKeyV6(clientIP net.IP, fEndKey FrontendKeyV6) AffinityKeyV6 { + var k AffinityKeyV6 + + copy(k[:], fEndKey[4:4+frontendAffKeyV6Size]) + + addr := clientIP.To16() + copy(k[frontendAffKeyV6Size:frontendAffKeyV6Size+16], addr) + return k +} + +// ClientIP returns the ClientIP part of the key +func (k AffinityKeyV6) ClientIP() net.IP { + return k[frontendAffKeyV6Size : frontendAffKeyV6Size+16] +} + +// FrontendAffinityKey returns the FrontEndAffinityKeyV6 part of the key +func (k AffinityKeyV6) FrontendAffinityKey() FrontEndAffinityKeyV6 { + var f FrontEndAffinityKeyV6 + copy(f[:], k[:frontendAffKeyV6Size]) + + return f +} + +func (k AffinityKeyV6) String() string { + return fmt.Sprintf("AffinityKeyV6{ClientIP:%v %s}", k.ClientIP(), k.FrontendAffinityKey()) +} + +// AsBytes returns the key as []byte +func (k AffinityKeyV6) AsBytes() []byte { + return k[:] +} + +// struct calico_nat_v4_affinity_val { +// struct calico_nat_dest; +// uint64_t ts; +// }; + +const affinityValueV6Size = backendValueV6Size + 4 + 8 + +// AffinityValueV6 represents a backend picked by the affinity and the timestamp +// of its creation +type AffinityValueV6 [affinityValueV6Size]byte + +// NewAffinityValueV6 creates a value from a timestamp and a backend +func NewAffinityValueV6(ts uint64, backend BackendValueV6) AffinityValueV6 { + var v AffinityValueV6 + + copy(v[:], backend[:]) + binary.LittleEndian.PutUint64(v[backendValueV6Size:backendValueV6Size+8], ts) + + return v +} + +// Timestamp returns the timestamp of the entry. It is generated by +// bpf_ktime_get_ns which returns the time since the system boot in nanoseconds +// - it is the monotonic clock reading, which is compatible with time operations +// in time package. 
+func (v AffinityValueV6) Timestamp() time.Duration { + nano := binary.LittleEndian.Uint64(v[backendValueV6Size : backendValueV6Size+8]) + return time.Duration(nano) * time.Nanosecond +} + +// Backend returns the backend the affinity ties the frontend + client to. +func (v AffinityValueV6) Backend() BackendValueV6 { + var b BackendValueV6 + + copy(b[:], v[:backendValueV6Size]) + + return b +} + +func (v AffinityValueV6) String() string { + return fmt.Sprintf("AffinityValueV6{Timestamp:%d,Backend:%v}", v.Timestamp(), v.Backend()) +} + +// AsBytes returns the value as []byte +func (v AffinityValueV6) AsBytes() []byte { + return v[:] +} + +// AffinityMapV6Parameters describes the IPv6 affinity map +var AffinityMapV6Parameters = maps.MapParameters{ + Type: "lru_hash", + KeySize: affinityKeyV6Size, + ValueSize: affinityValueV6Size, + MaxEntries: 64 * 1024, + Name: "cali_v6_nat_aff", +} + +// AffinityMapV6 returns an instance of the IPv6 affinity map +func AffinityMapV6() maps.Map { + return maps.NewPinnedMap(AffinityMapV6Parameters) +} + +// AffinityMapMemV6 represents the IPv6 affinity map in memory +type AffinityMapMemV6 map[AffinityKeyV6]AffinityValueV6 + +// LoadAffinityMapV6 loads the IPv6 affinity map into memory +func LoadAffinityMapV6(m maps.Map) (AffinityMapMemV6, error) { + ret := make(AffinityMapMemV6) + + if err := m.Open(); err != nil { + return nil, err + } + + iterFn := AffinityMapMemV6Iter(ret) + + err := m.Iter(func(k, v []byte) maps.IteratorAction { + iterFn(k, v) + return maps.IterNone + }) + if err != nil { + ret = nil + } + + return ret, err +} + +// AffinityMapMemV6Iter returns maps.MapIter that loads the provided AffinityMapMemV6 +func AffinityMapMemV6Iter(m AffinityMapMemV6) func(k, v []byte) { + ks := len(AffinityKeyV6{}) + vs := len(AffinityValueV6{}) + + return func(k, v []byte) { + var key AffinityKeyV6 + copy(key[:ks], k[:ks]) + + var val AffinityValueV6 + copy(val[:vs], v[:vs]) + + m[key] = val + } +} + +// struct sendrecv4_key { +// uint64_t cookie; +// uint32_t ip; +// uint32_t port; 
+// }; +// +// struct sendrecv4_val { +// uint32_t ip; +// uint32_t port; +// }; + +const sendRecvMsgKeyV6Size = 28 +const ctNATsMsgKeyV6Size = 38 + +// SendRecvMsgKeyV6 is the key for SendRecvMsgMap +type SendRecvMsgKeyV6 [sendRecvMsgKeyV6Size]byte + +// Cookie returns the socket cookie part of the key that can be used to match +// the socket. +func (k SendRecvMsgKeyV6) Cookie() uint64 { + return binary.LittleEndian.Uint64(k[0:8]) +} + +// IP returns the IP address part of the key +func (k SendRecvMsgKeyV6) IP() net.IP { + return k[8:24] +} + +// Port returns port converted to 16-bit host endianness +func (k SendRecvMsgKeyV6) Port() uint16 { + port := binary.BigEndian.Uint32(k[24:28]) + return uint16(port >> 16) +} + +func (k SendRecvMsgKeyV6) String() string { + return fmt.Sprintf("SendRecvMsgKeyV6{Cookie: 0x%016x, IP: %+v, Port: %+v}", k.Cookie(), k.IP(), k.Port()) +} + +const sendRecvMsgValueV6Size = 20 + +// SendRecvMsgValueV6 is the value of SendRecvMsgMap +type SendRecvMsgValueV6 [sendRecvMsgValueV6Size]byte + +// IP returns the IP address part of the key +func (v SendRecvMsgValueV6) IP() net.IP { + return v[0:16] +} + +// Port returns port converted to 16-bit host endianness +func (v SendRecvMsgValueV6) Port() uint16 { + port := binary.BigEndian.Uint32(v[16:20]) + return uint16(port >> 16) +} + +func (v SendRecvMsgValueV6) String() string { + return fmt.Sprintf("SendRecvMsgValueV6{IP: %+v, Port: %+v}", v.IP(), v.Port()) +} + +// SendRecvMsgMapParameters define SendRecvMsgMap +var SendRecvMsgMapV6Parameters = maps.MapParameters{ + Type: "lru_hash", + KeySize: sendRecvMsgKeyV6Size, + ValueSize: sendRecvMsgValueV6Size, + MaxEntries: 510000, + Name: "cali_v6_srmsg", +} + +var CTNATsMapV6Parameters = maps.MapParameters{ + Type: "lru_hash", + KeySize: ctNATsMsgKeyV6Size, + ValueSize: sendRecvMsgValueV6Size, + MaxEntries: 10000, + Name: "cali_v6_ct_nats", +} + +// SendRecvMsgMap tracks reverse translations for sendmsg/recvmsg of +// unconnected UDP +func 
SendRecvMsgMapV6() maps.Map { + return maps.NewPinnedMap(SendRecvMsgMapV6Parameters) +} + +func AllNATsMsgMapV6() maps.Map { + return maps.NewPinnedMap(CTNATsMapV6Parameters) +} + +// SendRecvMsgMapMem represents affinity map in memory +type SendRecvMsgMapMemV6 map[SendRecvMsgKeyV6]SendRecvMsgValueV6 + +// LoadSendRecvMsgMap loads affinity map into memory +func LoadSendRecvMsgMapV6(m maps.Map) (SendRecvMsgMapMemV6, error) { + ret := make(SendRecvMsgMapMemV6) + + iterFn := SendRecvMsgMapMemV6Iter(ret) + + err := m.Iter(func(k, v []byte) maps.IteratorAction { + iterFn(k, v) + return maps.IterNone + }) + if err != nil { + ret = nil + } + + return ret, err +} + +// SendRecvMsgMapMemIter returns maps.MapIter that loads the provided SendRecvMsgMapMem +func SendRecvMsgMapMemV6Iter(m SendRecvMsgMapMemV6) func(k, v []byte) { + ks := len(SendRecvMsgKeyV6{}) + vs := len(SendRecvMsgValueV6{}) + + return func(k, v []byte) { + var key SendRecvMsgKeyV6 + copy(key[:ks], k[:ks]) + + var val SendRecvMsgValueV6 + copy(val[:vs], v[:vs]) + + m[key] = val + } +} diff --git a/felix/bpf/routes/map.go b/felix/bpf/routes/map.go index 834fe897bcf..effd17cec22 100644 --- a/felix/bpf/routes/map.go +++ b/felix/bpf/routes/map.go @@ -33,6 +33,7 @@ func init() { func SetMapSize(size int) { maps.SetSize(MapParameters.VersionedName(), size) + maps.SetSize(MapV6Parameters.VersionedName(), size) } // struct cali_rt_key { @@ -44,7 +45,7 @@ const KeySize = 8 type Key [KeySize]byte func (k Key) Addr() ip.Addr { - var addr ip.V4Addr // FIXME IPv6 + var addr ip.V4Addr copy(addr[:], k[4:8]) return addr } @@ -101,7 +102,7 @@ func (v Value) Flags() Flags { } func (v Value) NextHop() ip.Addr { - var addr ip.V4Addr // FIXME IPv6 + var addr ip.V4Addr copy(addr[:], v[4:8]) return addr } diff --git a/felix/bpf/routes/map6.go b/felix/bpf/routes/map6.go new file mode 100644 index 00000000000..7cc8b213e32 --- /dev/null +++ b/felix/bpf/routes/map6.go @@ -0,0 +1,223 @@ +// Copyright (c) 2023 Tigera, Inc. 
All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package routes + +import ( + "encoding/binary" + "fmt" + "strings" + "sync" + + "github.com/pkg/errors" + "golang.org/x/sys/unix" + + "github.com/projectcalico/calico/felix/bpf/maps" + "github.com/projectcalico/calico/felix/ip" +) + +const KeyV6Size = 20 + +type KeyV6 [KeyV6Size]byte + +func (k KeyV6) Addr() ip.Addr { + var addr ip.V6Addr + copy(addr[:], k[4:20]) + return addr +} + +func (k KeyV6) Dest() ip.CIDR { + addr := k.Addr() + return ip.CIDRFromAddrAndPrefix(addr, k.PrefixLen()) +} + +func (k KeyV6) PrefixLen() int { + return int(binary.LittleEndian.Uint32(k[:4])) +} + +func (k KeyV6) AsBytes() []byte { + return k[:] +} + +const ValueV6Size = 20 + +type ValueV6 [ValueV6Size]byte + +func (v ValueV6) Flags() Flags { + return Flags(binary.LittleEndian.Uint32(v[:4])) +} + +func (v ValueV6) NextHop() ip.Addr { + var addr ip.V6Addr + copy(addr[:], v[4:20]) + return addr +} + +func (v ValueV6) IfaceIndex() uint32 { + return binary.LittleEndian.Uint32(v[4:8]) +} + +func (v ValueV6) AsBytes() []byte { + return v[:] +} + +func (v ValueV6) String() string { + var parts []string + + typeFlags := v.Flags() + + if typeFlags&FlagLocal != 0 { + parts = append(parts, "local") + } else { + parts = append(parts, "remote") + } + + if typeFlags&FlagHost != 0 { + parts = append(parts, "host") + } else if typeFlags&FlagWorkload != 0 { + parts = append(parts, "workload") + } + + if 
typeFlags&FlagInIPAMPool != 0 { + parts = append(parts, "in-pool") + } + + if typeFlags&FlagNATOutgoing != 0 { + parts = append(parts, "nat-out") + } + + if typeFlags&FlagSameSubnet != 0 { + parts = append(parts, "same-subnet") + } + + if typeFlags&FlagNoDSR != 0 { + parts = append(parts, "no-dsr") + } + + if typeFlags&FlagTunneled != 0 { + parts = append(parts, "tunneled") + } + + if typeFlags&FlagLocal != 0 && typeFlags&FlagWorkload != 0 { + parts = append(parts, "idx", fmt.Sprint(v.IfaceIndex())) + } + + if typeFlags&FlagLocal == 0 && typeFlags&FlagWorkload != 0 { + parts = append(parts, "nh", fmt.Sprint(v.NextHop())) + } + + if len(parts) == 0 { + return fmt.Sprintf("unknown type (%d)", typeFlags) + } + + return strings.Join(parts, " ") +} + +func NewKeyV6(cidr ip.V6CIDR) KeyV6 { + var k KeyV6 + + binary.LittleEndian.PutUint32(k[:4], uint32(cidr.Prefix())) + copy(k[4:20], cidr.Addr().AsNetIP().To16()) + + return k +} + +func NewValueV6(flags Flags) ValueV6 { + var v ValueV6 + binary.LittleEndian.PutUint32(v[:4], uint32(flags)) + return v +} + +func NewValueV6WithNextHop(flags Flags, nextHop ip.V6Addr) ValueV6 { + var v ValueV6 + binary.LittleEndian.PutUint32(v[:4], uint32(flags)) + copy(v[4:20], nextHop.AsNetIP().To16()) + return v +} + +func NewValueV6WithIfIndex(flags Flags, ifIndex int) ValueV6 { + var v ValueV6 + binary.LittleEndian.PutUint32(v[:4], uint32(flags)) + binary.LittleEndian.PutUint32(v[4:8], uint32(ifIndex)) + return v +} + +var MapV6Parameters = maps.MapParameters{ + Type: "lpm_trie", + KeySize: KeyV6Size, + ValueSize: ValueV6Size, + MaxEntries: 256 * 1024, + Name: "cali_v6_routes", + Flags: unix.BPF_F_NO_PREALLOC, +} + +func MapV6() maps.Map { + return maps.NewPinnedMap(MapV6Parameters) +} + +type MapMemV6 map[KeyV6]ValueV6 + +// LoadMap loads a routes.Map into memory +func LoadMapV6(rtm maps.Map) (MapMemV6, error) { + m := make(MapMemV6) + + err := rtm.Iter(func(k, v []byte) maps.IteratorAction { + var key KeyV6 + var value ValueV6 + 
copy(key[:], k) + copy(value[:], v) + + m[key] = value + return maps.IterNone + }) + + return m, err +} + +type LPMv6 struct { + sync.RWMutex + t *ip.CIDRTrie +} + +func NewLPMv6() *LPMv6 { + return &LPMv6{ + t: ip.NewCIDRTrie(), + } +} + +func (lpm *LPMv6) Update(k KeyV6, v ValueV6) error { + if cidrv6, ok := k.Dest().(ip.V6CIDR); ok { + lpm.t.Update(cidrv6, v) + return nil + } + + return errors.Errorf("k.Dest() %+v type %T is not ip.V6CIDR", k.Dest(), k.Dest()) +} + +func (lpm *LPMv6) Delete(k KeyV6) error { + if cidrv6, ok := k.Dest().(ip.V6CIDR); ok { + lpm.t.Delete(cidrv6) + return nil + } + + return errors.Errorf("k.Dest() %+v type %T is not ip.V6CIDR", k.Dest(), k.Dest()) +} + +func (lpm *LPMv6) Lookup(addr ip.V6Addr) (ValueV6, bool) { + _, v := lpm.t.LPM(addr.AsCIDR().(ip.V6CIDR)) + if v == nil { + return ValueV6{}, false + } + return v.(ValueV6), true +} diff --git a/felix/bpf/state/map.go b/felix/bpf/state/map.go index c42ddc6f6b5..b2262f37e07 100644 --- a/felix/bpf/state/map.go +++ b/felix/bpf/state/map.go @@ -116,9 +116,10 @@ type State struct { NATData uint64 ProgStartTime uint64 Flags uint64 + _ [48]byte // ipv6 padding } -const expectedSize = 416 +const expectedSize = 464 func (s *State) AsBytes() []byte { size := unsafe.Sizeof(State{}) @@ -144,7 +145,7 @@ var MapParameters = maps.MapParameters{ ValueSize: expectedSize, MaxEntries: 2, Name: "cali_state", - Version: 3, + Version: 4, } func Map() maps.Map { diff --git a/felix/bpf/tc/attach.go b/felix/bpf/tc/attach.go index fade8fde626..2a38d4041ed 100644 --- a/felix/bpf/tc/attach.go +++ b/felix/bpf/tc/attach.go @@ -459,6 +459,14 @@ func ConfigureProgram(m *libbpf.Map, iface string, globalData *libbpf.TcGlobalDa return libbpf.TcSetGlobals(m, globalData) } +func ConfigureProgramV6(m *libbpf.Map, iface string, globalData *libbpf.TcGlobalData6) error { + in := []byte("---------------") + copy(in, iface) + globalData.IfaceName = string(in) + + return libbpf.TcSetGlobals6(m, globalData) +} + func 
convertIPToUint32(ip net.IP) (uint32, error) { ipv4 := ip.To4() if ipv4 == nil { diff --git a/felix/bpf/tc/defs/defs.go b/felix/bpf/tc/defs/defs.go index 6a3048b5310..947ed97e1e7 100644 --- a/felix/bpf/tc/defs/defs.go +++ b/felix/bpf/tc/defs/defs.go @@ -60,16 +60,20 @@ const ( ProgIndexDropDebug ProgIndexHostCtConflictDebug ProgIndexIcmpInnerNatDebug - ProgIndexV6Prologue + ProgIndexV6Main ProgIndexV6Policy ProgIndexV6Allowed ProgIndexV6Icmp ProgIndexV6Drop - ProgIndexV6PrologueDebug + ProgIndexV6HostCtConflict + ProgIndexV6IcmpInnerNat + ProgIndexV6MainDebug ProgIndexV6PolicyDebug ProgIndexV6AllowedDebug ProgIndexV6IcmpDebug ProgIndexV6DropDebug + ProgIndexV6HostCtConflictDebug + ProgIndexV6IcmpInnerNatDebug ProgIndexEndDebug ProgIndexEnd @@ -101,17 +105,21 @@ var ProgramNames = []string{ "calico_tc_host_ct_conflict", "calico_tc_skb_icmp_inner_nat", /* ipv6 */ - "calico_tc6", + "calico_tc_main", "calico_tc_norm_pol_tail", "calico_tc_skb_accepted_entrypoint", "calico_tc_skb_send_icmp_replies", "calico_tc_skb_drop", + "calico_tc_host_ct_conflict", + "calico_tc_skb_icmp_inner_nat", /* ipv6 - debug */ - "calico_tc6", + "calico_tc_main", "calico_tc_norm_pol_tail", "calico_tc_skb_accepted_entrypoint", "calico_tc_skb_send_icmp_replies", "calico_tc_skb_drop", + "calico_tc_host_ct_conflict", + "calico_tc_skb_icmp_inner_nat", } type ToOrFromEp string diff --git a/felix/bpf/ut/bpf_prog_test.go b/felix/bpf/ut/bpf_prog_test.go index 9f7ce14e48b..3ee8d5c1b05 100644 --- a/felix/bpf/ut/bpf_prog_test.go +++ b/felix/bpf/ut/bpf_prog_test.go @@ -104,6 +104,20 @@ var ( IP: node2ip, Mask: net.IPv4Mask(255, 255, 255, 255), } + + node1ipV6 = net.ParseIP("abcd::ffff:0a0a:0001").To16() + node1ip2V6 = net.ParseIP("abcd::ffff:0a0a:0201").To16() + node1tunIPV6 = net.ParseIP("abcd::ffff:0b0b:0001").To16() + node2ipV6 = net.ParseIP("abcd::ffff:0a0a:0002").To16() + intfIPV6 = net.ParseIP("abcd::ffff:0a0a:0003").To16() + node1CIDRV6 = net.IPNet{ + IP: node1ipV6, + Mask: net.CIDRMask(128, 128), + 
} + node2CIDRV6 = net.IPNet{ + IP: node2ipV6, + Mask: net.CIDRMask(128, 128), + } ) // Globals that we use to configure the next test run. @@ -195,11 +209,22 @@ var tcJumpMapIndexes = map[string][]int{ tcdefs.ProgIndexIcmpInnerNatDebug, }, "IPv6": []int{ - tcdefs.ProgIndexV6PrologueDebug, + tcdefs.ProgIndexV6Main, + tcdefs.ProgIndexV6Policy, + tcdefs.ProgIndexV6Allowed, + tcdefs.ProgIndexV6Icmp, + tcdefs.ProgIndexV6Drop, + tcdefs.ProgIndexV6HostCtConflict, + tcdefs.ProgIndexV6IcmpInnerNat, + }, + "IPv6 debug": []int{ + tcdefs.ProgIndexV6MainDebug, tcdefs.ProgIndexV6PolicyDebug, tcdefs.ProgIndexV6AllowedDebug, tcdefs.ProgIndexV6IcmpDebug, tcdefs.ProgIndexV6DropDebug, + tcdefs.ProgIndexV6HostCtConflictDebug, + tcdefs.ProgIndexV6IcmpInnerNatDebug, }, } @@ -309,8 +334,13 @@ func setupAndRun(logger testLogger, loglevel, section string, rules *polprog.Rul topts.progLog = "WEP" } - log.WithField("hostIP", hostIP).Info("Host IP") - log.WithField("intfIP", intfIP).Info("Intf IP") + if topts.ipv6 { + log.WithField("hostIP", hostIP).Info("Host IP") + log.WithField("intfIP", intfIPV6).Info("Intf IP") + } else { + log.WithField("hostIP", hostIP).Info("Host IP") + log.WithField("intfIP", intfIP).Info("Intf IP") + } obj += fmt.Sprintf("fib_%s", loglevel) if strings.Contains(section, "_dsr") { @@ -318,12 +348,24 @@ func setupAndRun(logger testLogger, loglevel, section string, rules *polprog.Rul } } - if !topts.xdp { - o, err := objLoad("../../bpf-gpl/bin/tc_preamble.o", bpfFsDir, "preamble", topts, false, false) + ipFamily := "IPv4" + policyIdx := tcdefs.ProgIndexPolicy + if topts.ipv6 { + ipFamily = "IPv6" + obj += "_v6" + policyIdx = tcdefs.ProgIndexV6Policy + } + + if topts.xdp { + o, err := objLoad("../../bpf-gpl/bin/xdp_preamble.o", bpfFsDir, "preamble", topts, false, false) + Expect(err).NotTo(HaveOccurred()) + defer o.Close() + } else if topts.ipv6 { + o, err := objLoad("../../bpf-gpl/bin/tc_preamble_v6.o", bpfFsDir, "preamble", topts, false, false) 
Expect(err).NotTo(HaveOccurred()) defer o.Close() } else { - o, err := objLoad("../../bpf-gpl/bin/xdp_preamble.o", bpfFsDir, "preamble", topts, false, false) + o, err := objLoad("../../bpf-gpl/bin/tc_preamble.o", bpfFsDir, "preamble", topts, false, false) Expect(err).NotTo(HaveOccurred()) defer o.Close() } @@ -340,25 +382,15 @@ func setupAndRun(logger testLogger, loglevel, section string, rules *polprog.Rul err = bin.WriteToFile(tempObj) Expect(err).NotTo(HaveOccurred()) - ipFamily := "IPv4" if loglevel == "debug" { ipFamily += " debug" } - o, err := objLoad(tempObj, bpfFsDir, ipFamily, topts, rules != nil, true) - Expect(err).NotTo(HaveOccurred()) - defer o.Close() - - if topts.ipv6 { - o, err := objLoad(obj+"_v6.o", bpfFsDir, "IPv6", topts, rules != nil, false) - Expect(err).NotTo(HaveOccurred()) - defer o.Close() - } + var o *libbpf.Obj - if err != nil { - logger.Log("Error:", string(err.(*exec.ExitError).Stderr)) - } + o, err = objLoad(tempObj, bpfFsDir, ipFamily, topts, rules != nil, true) Expect(err).NotTo(HaveOccurred()) + defer o.Close() if rules != nil { jmpMap := progMap @@ -385,7 +417,7 @@ func setupAndRun(logger testLogger, loglevel, section string, rules *polprog.Rul } Expect(err).NotTo(HaveOccurred(), "Failed to load rules program.") defer func() { _ = polProgFD.Close() }() - err = jumpMapUpdate(polMap, tcdefs.ProgIndexPolicy, int(polProgFD)) + err = jumpMapUpdate(polMap, policyIdx, int(polProgFD)) Expect(err).NotTo(HaveOccurred()) log.WithField("rules", rules).Debug("set policy") } @@ -491,27 +523,35 @@ func bpftool(args ...string) ([]byte, error) { var ( mapInitOnce sync.Once - natMap, natBEMap, ctMap, rtMap, ipsMap, stateMap, testStateMap, progMap, progMapXDP, affinityMap, arpMap, fsafeMap, countersMap, ifstateMap, jumpMap, jumpMapXDP maps.Map - allMaps []maps.Map + natMap, natBEMap, ctMap, rtMap, ipsMap, testStateMap, affinityMap, arpMap, fsafeMap maps.Map + natMapV6, natBEMapV6, ctMapV6, rtMapV6, affinityMapV6, arpMapV6 maps.Map + stateMap, 
countersMap, ifstateMap, progMap, progMapXDP, jumpMap, jumpMapXDP maps.Map + allMaps []maps.Map ) func initMapsOnce() { mapInitOnce.Do(func() { natMap = nat.FrontendMap() natBEMap = nat.BackendMap() + natMapV6 = nat.FrontendMapV6() + natBEMapV6 = nat.BackendMapV6() ctMap = conntrack.Map() + ctMapV6 = conntrack.MapV6() rtMap = routes.Map() + rtMapV6 = routes.MapV6() ipsMap = ipsets.Map() stateMap = state.Map() testStateMap = state.MapForTest() affinityMap = nat.AffinityMap() + affinityMapV6 = nat.AffinityMapV6() arpMap = arp.Map() + arpMapV6 = arp.MapV6() fsafeMap = failsafes.Map() countersMap = counters.Map() ifstateMap = ifstate.Map() - allMaps = []maps.Map{natMap, natBEMap, ctMap, rtMap, ipsMap, stateMap, testStateMap, - affinityMap, arpMap, fsafeMap, countersMap, ifstateMap} + allMaps = []maps.Map{natMap, natBEMap, natMapV6, natBEMapV6, ctMap, ctMapV6, rtMap, rtMapV6, ipsMap, + stateMap, testStateMap, affinityMap, affinityMapV6, arpMap, arpMapV6, fsafeMap, countersMap, ifstateMap} for _, m := range allMaps { err := m.EnsureExists() if err != nil { @@ -588,9 +628,18 @@ func tcUpdateJumpMap(obj *libbpf.Obj, progs []int, hasPolicyProg, hasHostConflic if !hasPolicyProg { continue } - } - if (idx == tcdefs.ProgIndexHostCtConflict || idx == tcdefs.ProgIndexHostCtConflictDebug) && !hasHostConflictProg { - continue + case + tcdefs.ProgIndexV6Icmp, + tcdefs.ProgIndexV6IcmpDebug: + continue // XXX not implemented + case + tcdefs.ProgIndexHostCtConflict, + tcdefs.ProgIndexHostCtConflictDebug, + tcdefs.ProgIndexV6HostCtConflict, + tcdefs.ProgIndexV6HostCtConflictDebug: + if !hasHostConflictProg { + continue + } } log.WithField("prog", tcdefs.ProgramNames[idx]).WithField("idx", idx).Debug("UpdateJumpMap") err := obj.UpdateJumpMap(progMap.GetName(), tcdefs.ProgramNames[idx], idx) @@ -646,6 +695,31 @@ func objLoad(fname, bpfFsDir, ipFamily string, topts testOpts, polProg, hasHostC if err := xdp.ConfigureProgram(m, bpfIfaceName, &globals); err != nil { return nil, err } + } 
else if topts.ipv6 { + ifaceLog := topts.progLog + "-" + bpfIfaceName + globals := libbpf.TcGlobalData6{ + Tmtu: natTunnelMTU, + VxlanPort: testVxlanPort, + PSNatStart: uint16(topts.psnaStart), + PSNatLen: uint16(topts.psnatEnd-topts.psnaStart) + 1, + Flags: libbpf.GlobalsIPv6Enabled | libbpf.GlobalsNoDSRCidrs, + LogFilterJmp: 0xffffffff, + } + + copy(globals.HostTunnelIP[:], node1tunIPV6.To16()) + copy(globals.HostIP[:], hostIP.To16()) + copy(globals.IntfIP[:], intfIPV6.To16()) + + for i := 0; i < tcdefs.ProgIndexEnd; i++ { + globals.Jumps[i] = uint32(i) + } + + log.WithField("globals", globals).Debugf("configure program") + + if err := tc.ConfigureProgramV6(m, ifaceLog, &globals); err != nil { + return nil, fmt.Errorf("failed to configure tc program: %w", err) + } + log.WithField("program", fname).Debugf("Configured BPF program iface \"%s\"", ifaceLog) } else { ifaceLog := topts.progLog + "-" + bpfIfaceName globals := libbpf.TcGlobalData{ @@ -674,7 +748,11 @@ func objLoad(fname, bpfFsDir, ipFamily string, topts testOpts, polProg, hasHostC continue } pin := "/sys/fs/bpf/tc/globals/" + m.Name() - log.WithField("pin", pin).Debug("Pinning map") + log.WithFields(log.Fields{ + "pin": pin, + "key size": m.KeySize(), + "value size": m.ValueSize(), + }).Debug("Pinning map") cmd := exec.Command("bpftool", "map", "show", "pinned", pin) log.WithField("cmd", cmd.String()).Debugf("executing") out, _ := cmd.Output() @@ -694,7 +772,6 @@ func objLoad(fname, bpfFsDir, ipFamily string, topts testOpts, polProg, hasHostC policyIdx := tcdefs.ProgIndexPolicy if strings.HasPrefix(ipFamily, "IPv6") { - progDir += "_v6" policyIdx = tcdefs.ProgIndexV6Policy } @@ -726,7 +803,7 @@ func objLoad(fname, bpfFsDir, ipFamily string, topts testOpts, polProg, hasHostC } if !forXDP { - log.WithField("ipFamily", ipFamily).Debug("Udating jump map") + log.WithField("ipFamily", ipFamily).Debug("Updating jump map") err = tcUpdateJumpMap(obj, tcJumpMapIndexes[ipFamily], false, hasHostConflictProg) if err 
!= nil && !strings.Contains(err.Error(), "error updating calico_tc_host_ct_conflict program") { goto out @@ -760,18 +837,36 @@ func objUTLoad(fname, bpfFsDir, ipFamily string, topts testOpts, polProg, hasHos for m, err := obj.FirstMap(); m != nil && err == nil; m, err = m.NextMap() { if m.IsMapInternal() { ifaceLog := topts.progLog + "-" + bpfIfaceName - globals := libbpf.TcGlobalData{ - HostIP: ipToU32(hostIP), - IntfIP: ipToU32(intfIP), - Tmtu: natTunnelMTU, - VxlanPort: testVxlanPort, - PSNatStart: uint16(topts.psnaStart), - PSNatLen: uint16(topts.psnatEnd-topts.psnaStart) + 1, - Flags: libbpf.GlobalsIPv6Enabled | libbpf.GlobalsNoDSRCidrs, - HostTunnelIP: ipToU32(node1tunIP), - } - if err := tc.ConfigureProgram(m, ifaceLog, &globals); err != nil { - return nil, fmt.Errorf("failed to configure tc program: %w", err) + if topts.ipv6 { + globals := libbpf.TcGlobalData6{ + Tmtu: natTunnelMTU, + VxlanPort: testVxlanPort, + PSNatStart: uint16(topts.psnaStart), + PSNatLen: uint16(topts.psnatEnd-topts.psnaStart) + 1, + Flags: libbpf.GlobalsIPv6Enabled | libbpf.GlobalsNoDSRCidrs, + } + + copy(globals.HostTunnelIP[:], node1tunIPV6.To16()) + copy(globals.HostIP[:], hostIP.To16()) + copy(globals.IntfIP[:], intfIPV6.To16()) + + if err := tc.ConfigureProgramV6(m, ifaceLog, &globals); err != nil { + return nil, fmt.Errorf("failed to configure v6 tc program: %w", err) + } + } else { + globals := libbpf.TcGlobalData{ + HostIP: ipToU32(hostIP), + IntfIP: ipToU32(intfIP), + Tmtu: natTunnelMTU, + VxlanPort: testVxlanPort, + PSNatStart: uint16(topts.psnaStart), + PSNatLen: uint16(topts.psnatEnd-topts.psnaStart) + 1, + Flags: libbpf.GlobalsIPv6Enabled | libbpf.GlobalsNoDSRCidrs, + HostTunnelIP: ipToU32(node1tunIP), + } + if err := tc.ConfigureProgram(m, ifaceLog, &globals); err != nil { + return nil, fmt.Errorf("failed to configure tc program: %w", err) + } } break } @@ -783,10 +878,6 @@ func objUTLoad(fname, bpfFsDir, ipFamily string, topts testOpts, polProg, hasHos progDir := 
bpfFsDir - if ipFamily == "IPv6" { - progDir += "_v6" - } - err = obj.PinPrograms(progDir) if err != nil { obj.Close() @@ -925,7 +1016,12 @@ func runBpfUnitTest(t *testing.T, source string, testFn func(bpfProgRunFn), opts Expect(err).NotTo(HaveOccurred()) defer os.RemoveAll(bpfFsDir) - objFname := "../../bpf-gpl/ut/" + strings.TrimSuffix(source, path.Ext(source)) + ".o" + vExt := "" + if topts.ipv6 { + vExt = "_v6" + } + + objFname := "../../bpf-gpl/ut/" + strings.TrimSuffix(source, path.Ext(source)) + vExt + ".o" obj, err := objUTLoad(objFname, bpfFsDir, "IPv4", topts, true, false) Expect(err).NotTo(HaveOccurred()) @@ -1066,6 +1162,38 @@ func udpResponseRaw(in []byte) []byte { return out.Bytes() } +func udpResponseRawV6(in []byte) []byte { + pkt := gopacket.NewPacket(in, layers.LayerTypeEthernet, gopacket.Default) + ethL := pkt.Layer(layers.LayerTypeEthernet) + ethR := ethL.(*layers.Ethernet) + ethR.SrcMAC, ethR.DstMAC = ethR.DstMAC, ethR.SrcMAC + + ipv6L := pkt.Layer(layers.LayerTypeIPv6) + ipv6R := ipv6L.(*layers.IPv6) + ipv6R.SrcIP, ipv6R.DstIP = ipv6R.DstIP, ipv6R.SrcIP + + lrs := []gopacket.SerializableLayer{ethR, ipv6R} + + if ipv6R.NextHeader == layers.IPProtocolIPv6HopByHop { + l := pkt.Layer(layers.LayerTypeIPv6HopByHop) + lrs = append(lrs, l.(*layers.IPv6HopByHop)) + } + + udpL := pkt.Layer(layers.LayerTypeUDP) + udpR := udpL.(*layers.UDP) + udpR.SrcPort, udpR.DstPort = udpR.DstPort, udpR.SrcPort + + _ = udpR.SetNetworkLayerForChecksum(ipv6R) + + lrs = append(lrs, udpR, gopacket.Payload(pkt.ApplicationLayer().Payload())) + + out := gopacket.NewSerializeBuffer() + err := gopacket.SerializeLayers(out, gopacket.SerializeOptions{ComputeChecksums: true}, lrs...) 
+ Expect(err).NotTo(HaveOccurred()) + + return out.Bytes() +} + func tcpResponseRaw(in []byte) []byte { pkt := gopacket.NewPacket(in, layers.LayerTypeEthernet, gopacket.Default) ethL := pkt.Layer(layers.LayerTypeEthernet) @@ -1102,6 +1230,14 @@ func dumpNATMap(natMap maps.Map) { } } +func dumpNATMapV6(natMap maps.Map) { + nt, err := nat.LoadFrontendMapV6(natMap) + Expect(err).NotTo(HaveOccurred()) + for k, v := range nt { + fmt.Printf("%s : %s\n", k, v) + } +} + func resetMap(m maps.Map) { err := m.Iter(func(_, _ []byte) maps.IteratorAction { return maps.IterDelete @@ -1119,16 +1255,36 @@ func dumpCTMap(ctMap maps.Map) { fmt.Printf("\n") } +func dumpCTMapV6(ctMap maps.Map) { + ct, err := conntrack.LoadMapMemV6(ctMap) + Expect(err).NotTo(HaveOccurred()) + fmt.Printf("Conntrack dump:\n") + for k, v := range ct { + fmt.Printf("- %s : %s\n", k, v) + } + fmt.Printf("\n") +} + func resetCTMap(ctMap maps.Map) { resetMap(ctMap) } +func resetCTMapV6(ctMap maps.Map) { + resetMap(ctMap) +} + func saveCTMap(ctMap maps.Map) conntrack.MapMem { ct, err := conntrack.LoadMapMem(ctMap) Expect(err).NotTo(HaveOccurred()) return ct } +func saveCTMapV6(ctMap maps.Map) conntrack.MapMemV6 { + ct, err := conntrack.LoadMapMemV6(ctMap) + Expect(err).NotTo(HaveOccurred()) + return ct +} + func restoreCTMap(ctMap maps.Map, m conntrack.MapMem) { for k, v := range m { err := ctMap.Update(k[:], v[:]) @@ -1136,6 +1292,13 @@ func restoreCTMap(ctMap maps.Map, m conntrack.MapMem) { } } +func restoreCTMapV6(ctMap maps.Map, m conntrack.MapMemV6) { + for k, v := range m { + err := ctMap.Update(k[:], v[:]) + Expect(err).NotTo(HaveOccurred()) + } +} + func dumpRTMap(rtMap maps.Map) { rt, err := routes.LoadMap(rtMap) Expect(err).NotTo(HaveOccurred()) @@ -1144,16 +1307,34 @@ func dumpRTMap(rtMap maps.Map) { } } +func dumpRTMapV6(rtMap maps.Map) { + rt, err := routes.LoadMapV6(rtMap) + Expect(err).NotTo(HaveOccurred()) + for k, v := range rt { + fmt.Printf("%15s: %s\n", k.Dest(), v) + } +} + func 
resetRTMap(rtMap maps.Map) { resetMap(rtMap) } +func resetRTMapV6(rtMap maps.Map) { + resetMap(rtMap) +} + func saveRTMap(rtMap maps.Map) routes.MapMem { rt, err := routes.LoadMap(rtMap) Expect(err).NotTo(HaveOccurred()) return rt } +func saveRTMapV6(rtMap maps.Map) routes.MapMemV6 { + rt, err := routes.LoadMapV6(rtMap) + Expect(err).NotTo(HaveOccurred()) + return rt +} + func restoreRTMap(rtMap maps.Map, m routes.MapMem) { for k, v := range m { err := rtMap.Update(k[:], v[:]) @@ -1161,6 +1342,13 @@ func restoreRTMap(rtMap maps.Map, m routes.MapMem) { } } +func restoreRTMapV6(rtMap maps.Map, m routes.MapMemV6) { + for k, v := range m { + err := rtMap.Update(k[:], v[:]) + Expect(err).NotTo(HaveOccurred()) + } +} + func dumpARPMap(arpMap maps.Map) { ct, err := arp.LoadMapMem(arpMap) Expect(err).NotTo(HaveOccurred()) @@ -1171,8 +1359,24 @@ func dumpARPMap(arpMap maps.Map) { fmt.Printf("\n") } -func saveARPMap(ctMap maps.Map) arp.MapMem { - m, err := arp.LoadMapMem(arpMap) +func dumpARPMapV6(arpMap maps.Map) { + ct, err := arp.LoadMapMemV6(arpMap) + Expect(err).NotTo(HaveOccurred()) + fmt.Printf("ARP dump:\n") + for k, v := range ct { + fmt.Printf("- %s : %s\n", k, v) + } + fmt.Printf("\n") +} + +func saveARPMap(am maps.Map) arp.MapMem { + m, err := arp.LoadMapMem(am) + Expect(err).NotTo(HaveOccurred()) + return m +} + +func saveARPMapV6(am maps.Map) arp.MapMemV6 { + m, err := arp.LoadMapMemV6(am) Expect(err).NotTo(HaveOccurred()) return m } @@ -1202,6 +1406,8 @@ var ipv4Default = &layers.IPv4{ var srcIPv6 = net.IP([]byte{0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}) var dstIPv6 = net.IP([]byte{0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2}) +var srcV6CIDR = ip.CIDRFromNetIP(srcIPv6).(ip.V6CIDR) +var dstV6CIDR = ip.CIDRFromNetIP(dstIPv6).(ip.V6CIDR) var ipv6Default = &layers.IPv6{ Version: 6, @@ -1216,24 +1422,53 @@ var udpDefault = &layers.UDP{ DstPort: 5678, } -func testPacket(eth *layers.Ethernet, l3 gopacket.Layer, l4 gopacket.Layer, payload []byte) ( - 
*layers.Ethernet, *layers.IPv4, gopacket.Layer, []byte, []byte, error) { +func testPacket(family int, eth *layers.Ethernet, l3 gopacket.Layer, l4 gopacket.Layer, + payload []byte, ipv6ext ...gopacket.SerializableLayer) ( + *layers.Ethernet, gopacket.Layer, gopacket.Layer, []byte, []byte, error) { pkt := Packet{ + family: family, eth: eth, l3: l3, l4: l4, payload: payload, + ipv6ext: ipv6ext, } err := pkt.Generate() + if err != nil { + return nil, nil, nil, nil, nil, err + } p := gopacket.NewPacket(pkt.bytes, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("p = %+v\n", p) e := p.Layer(layers.LayerTypeEthernet).(*layers.Ethernet) - ip := p.Layer(layers.LayerTypeIPv4).(*layers.IPv4) + + var ( + ipl gopacket.Layer + proto layers.IPProtocol + ) + + ipv4L := p.Layer(layers.LayerTypeIPv4) + if ipv4L != nil { + ipv4 := ipv4L.(*layers.IPv4) + proto = ipv4.Protocol + ipl = ipv4L + } else { + ipv6L := p.Layer(layers.LayerTypeIPv6) + if ipv6L != nil { + ipv6 := ipv6L.(*layers.IPv6) + proto = ipv6.NextHeader + } + if proto == layers.IPProtocolIPv6HopByHop { + l := p.Layer(layers.LayerTypeIPv6HopByHop) + proto = l.(*layers.IPv6HopByHop).NextHeader + } + ipl = ipv6L + } var l gopacket.Layer - switch ip.Protocol { + switch proto { case layers.IPProtocolUDP: l = p.Layer(layers.LayerTypeUDP) case layers.IPProtocolTCP: @@ -1242,10 +1477,23 @@ func testPacket(eth *layers.Ethernet, l3 gopacket.Layer, l4 gopacket.Layer, payl l = p.Layer(layers.LayerTypeICMPv4) } - return e, ip, l, pkt.payload, pkt.bytes, err + return e, ipl, l, pkt.payload, pkt.bytes, err +} + +func testPacketV4(eth *layers.Ethernet, ipv4 *layers.IPv4, l4 gopacket.Layer, payload []byte) ( + *layers.Ethernet, *layers.IPv4, gopacket.Layer, []byte, []byte, error) { + e, ip4, l4, p, b, err := testPacket(4, eth, ipv4, l4, payload) + return e, ip4.(*layers.IPv4), l4, p, b, err +} + +func testPacketV6(eth *layers.Ethernet, ipv6 *layers.IPv6, l4 gopacket.Layer, payload []byte, ipv6ext ...gopacket.SerializableLayer) ( + 
*layers.Ethernet, *layers.IPv6, gopacket.Layer, []byte, []byte, error) { + e, ip6, l4, p, b, err := testPacket(6, eth, ipv6, l4, payload, ipv6ext...) + return e, ip6.(*layers.IPv6), l4, p, b, err } type Packet struct { + family int eth *layers.Ethernet l3 gopacket.Layer ipv4 *layers.IPv4 @@ -1261,6 +1509,7 @@ type Packet struct { length int l4Protocol layers.IPProtocol l3Protocol layers.EthernetType + ipv6ext []gopacket.SerializableLayer } func (pkt *Packet) handlePayload() { @@ -1304,9 +1553,64 @@ func (pkt *Packet) handleL4() error { return nil } +func (pkt *Packet) handleIPv6Ext() error { + exts := gopacket.NewSerializeBuffer() + err := gopacket.SerializeLayers(exts, gopacket.SerializeOptions{FixLengths: true}, pkt.ipv6ext...) + if err != nil { + return err + } + + pkt.length += len(exts.Bytes()) + + return nil +} + +func nextHdrIPProto(nh gopacket.Layer) layers.IPProtocol { + switch nh.(type) { + case *layers.IPv6HopByHop: + return layers.IPProtocolIPv6HopByHop + case *layers.ICMPv4: + return layers.IPProtocolICMPv4 + case *layers.IGMP: + return layers.IPProtocolIGMP + case *layers.IPv4: + return layers.IPProtocolIPv4 + case *layers.TCP: + return layers.IPProtocolTCP + case *layers.UDP: + return layers.IPProtocolUDP + case *layers.RUDP: + return layers.IPProtocolRUDP + case *layers.IPv6: + return layers.IPProtocolIPv6 + case *layers.IPv6Routing: + return layers.IPProtocolIPv6Routing + case *layers.IPv6Fragment: + return layers.IPProtocolIPv6Fragment + case *layers.GRE: + return layers.IPProtocolGRE + case *layers.ICMPv6: + return layers.IPProtocolICMPv6 + case *layers.IPv6Destination: + return layers.IPProtocolIPv6Destination + case *layers.EtherIP: + return layers.IPProtocolEtherIP + case *layers.SCTP: + return layers.IPProtocolSCTP + case *layers.UDPLite: + return layers.IPProtocolUDPLite + } + + panic(fmt.Sprintf("unknown next layer %T", nh)) +} + func (pkt *Packet) handleL3() error { - if pkt.l3 == nil { - pkt.l3 = ipv4Default + if pkt.l3 == nil || reflect.ValueOf(pkt.l3).IsNil() { // nil interface or typed-nil pointer + if
pkt.family == 4 { + pkt.l3 = ipv4Default + } else { + pkt.l3 = ipv6Default + } } switch v := pkt.l3.(type) { @@ -1320,7 +1624,18 @@ func (pkt *Packet) handleL3() error { case *layers.IPv6: pkt.ipv6 = v pkt.l3Protocol = layers.EthernetTypeIPv6 - pkt.ipv6.NextHeader = pkt.l4Protocol + if len(pkt.ipv6ext) > 0 { + if err := pkt.handleIPv6Ext(); err != nil { + return fmt.Errorf("handling ipv6 extensions: %w", err) + } + pkt.ipv6.NextHeader = nextHdrIPProto(pkt.ipv6ext[0].(gopacket.Layer)) + for i := len(pkt.ipv6ext); i > 0; i-- { + pkt.layers = append(pkt.layers, pkt.ipv6ext[i-1]) + } + } else { + pkt.ipv6.NextHeader = pkt.l4Protocol + } + pkt.length += 40 pkt.ipv6.Length = uint16(pkt.length) pkt.layers = append(pkt.layers, pkt.ipv6) default: @@ -1401,7 +1716,9 @@ func testPacketUDPDefault() (*layers.Ethernet, *layers.IPv4, gopacket.Layer, []b OptionData: []byte{0xde, 0xad, 0xbe, 0xef}, }} ip.IHL += 2 - return testPacket(nil, &ip, nil, nil) + + e, ip4, l4, p, b, err := testPacket(4, nil, &ip, nil, nil) + return e, ip4.(*layers.IPv4), l4, p, b, err } func testPacketUDPDefaultNP(destIP net.IP) (*layers.Ethernet, *layers.IPv4, gopacket.Layer, []byte, []byte, error) { @@ -1418,7 +1735,42 @@ func testPacketUDPDefaultNP(destIP net.IP) (*layers.Ethernet, *layers.IPv4, gopa }} ip.IHL += 2 - return testPacket(nil, &ip, nil, nil) + e, ip4, l4, p, b, err := testPacket(4, nil, &ip, nil, nil) + return e, ip4.(*layers.IPv4), l4, p, b, err +} + +func ipv6HopByHopExt() gopacket.SerializableLayer { + hop := &layers.IPv6HopByHop{} + hop.NextHeader = layers.IPProtocolUDP + + /* from gopacket ip6_test.go */ + tlv := &layers.IPv6HopByHopOption{} + tlv.OptionType = 0x01 //PadN + tlv.OptionData = []byte{0x00, 0x00, 0x00, 0x00} + hop.Options = append(hop.Options, tlv) + + return hop +} + +func testPacketUDPDefaultNPV6(destIP net.IP) (*layers.Ethernet, *layers.IPv6, gopacket.Layer, []byte, []byte, error) { + if destIP == nil { + return testPacketV6(nil, nil, nil, nil) + } + + ip := *ipv6Default 
+ ip.DstIP = destIP + + hop := &layers.IPv6HopByHop{} + hop.NextHeader = layers.IPProtocolUDP + + /* from gopacket ip6_test.go */ + tlv := &layers.IPv6HopByHopOption{} + tlv.OptionType = 0x01 //PadN + tlv.OptionData = []byte{0x00, 0x00, 0x00, 0x00} + hop.Options = append(hop.Options, tlv) + + e, ip6, l4, p, b, err := testPacketV6(nil, &ip, nil, nil, hop) + return e, ip6, l4, p, b, err } func resetBPFMaps() { diff --git a/felix/bpf/ut/failsafes_test.go b/felix/bpf/ut/failsafes_test.go index d18563b53d9..aff43b8b641 100644 --- a/felix/bpf/ut/failsafes_test.go +++ b/felix/bpf/ut/failsafes_test.go @@ -217,7 +217,7 @@ func TestFailsafes(t *testing.T) { for _, test := range failsafeTests { t.Run(test.Description, func(t *testing.T) { - _, _, _, _, pktBytes, err := testPacket(nil, test.IPHeaderIPv4, test.IPHeaderUDP, nil) + _, _, _, _, pktBytes, err := testPacketV4(nil, test.IPHeaderIPv4, test.IPHeaderUDP, nil) Expect(err).NotTo(HaveOccurred()) prog := "calico_from_host_ep" diff --git a/felix/bpf/ut/filter_test.go b/felix/bpf/ut/filter_test.go index b4988639069..fc502fbb713 100644 --- a/felix/bpf/ut/filter_test.go +++ b/felix/bpf/ut/filter_test.go @@ -29,7 +29,7 @@ import ( func TestFilter(t *testing.T) { RegisterTestingT(t) - _, _, _, _, bytes, _ := testPacket( + _, _, _, _, bytes, _ := testPacketV4( &layers.Ethernet{ SrcMAC: []byte{0, 0, 0, 0, 0, 1}, DstMAC: []byte{0, 0, 0, 0, 0, 2}, diff --git a/felix/bpf/ut/icmp_port_unreachable_test.go b/felix/bpf/ut/icmp_port_unreachable_test.go index 71f198cafec..71725666220 100644 --- a/felix/bpf/ut/icmp_port_unreachable_test.go +++ b/felix/bpf/ut/icmp_port_unreachable_test.go @@ -37,7 +37,7 @@ func TestICMPPortUnreachable(t *testing.T) { }} ipHdr.IHL += 2 - _, ipv4, _, _, pktBytes, err := testPacket(nil, &ipHdr, nil, nil) + _, ipv4, _, _, pktBytes, err := testPacketV4(nil, &ipHdr, nil, nil) Expect(err).NotTo(HaveOccurred()) runBpfUnitTest(t, "icmp_port_unreachable.c", func(bpfrun bpfProgRunFn) { @@ -60,7 +60,7 @@ func 
TestNATNoBackendFromHEP(t *testing.T) { iphdr := *ipv4Default - _, ipv4, l4, _, pktBytes, err := testPacket(nil, &iphdr, nil, nil) + _, ipv4, l4, _, pktBytes, err := testPacketV4(nil, &iphdr, nil, nil) Expect(err).NotTo(HaveOccurred()) udp := l4.(*layers.UDP) diff --git a/felix/bpf/ut/icmp_too_big_test.go b/felix/bpf/ut/icmp_too_big_test.go index 9838ece2d68..bde1fccb9cf 100644 --- a/felix/bpf/ut/icmp_too_big_test.go +++ b/felix/bpf/ut/icmp_too_big_test.go @@ -62,7 +62,7 @@ func TestICMPTooBigIPOptions(t *testing.T) { }}, } - _, ipv4, l4, _, pktBytes, err := testPacket(nil, ipv4, nil, nil) + _, ipv4, l4, _, pktBytes, err := testPacketV4(nil, ipv4, nil, nil) Expect(err).NotTo(HaveOccurred()) udp := l4.(*layers.UDP) diff --git a/felix/bpf/ut/icmp_ttl_exceeded_test.go b/felix/bpf/ut/icmp_ttl_exceeded_test.go index 47b97043691..e10524295a6 100644 --- a/felix/bpf/ut/icmp_ttl_exceeded_test.go +++ b/felix/bpf/ut/icmp_ttl_exceeded_test.go @@ -55,7 +55,7 @@ func TestICMPttlExceededFromHEP(t *testing.T) { iphdr := *ipv4Default iphdr.TTL = 1 - _, ipv4, l4, _, pktBytes, err := testPacket(nil, &iphdr, nil, nil) + _, ipv4, l4, _, pktBytes, err := testPacketV4(nil, &iphdr, nil, nil) Expect(err).NotTo(HaveOccurred()) udp := l4.(*layers.UDP) diff --git a/felix/bpf/ut/ip_dec_ttl_test.go b/felix/bpf/ut/ip_dec_ttl_test.go index 9e88fbc22e1..fe96ff89102 100644 --- a/felix/bpf/ut/ip_dec_ttl_test.go +++ b/felix/bpf/ut/ip_dec_ttl_test.go @@ -29,7 +29,7 @@ func TestIpDecTTL(t *testing.T) { runBpfUnitTest(t, "ip_dec_ttl.c", func(bpfrun bpfProgRunFn) { ip36 := *ipv4Default ip36.TTL = 36 - _, _, _, _, pktBytes, err := testPacket(nil, &ip36, nil, nil) + _, _, _, _, pktBytes, err := testPacketV4(nil, &ip36, nil, nil) Expect(err).NotTo(HaveOccurred()) res, err := bpfrun(pktBytes) @@ -43,7 +43,7 @@ func TestIpDecTTL(t *testing.T) { ip35 := *ipv4Default ip35.TTL = 35 - _, _, _, _, pktBytes, err = testPacket(nil, &ip35, nil, nil) + _, _, _, _, pktBytes, err = testPacketV4(nil, &ip35, nil, nil) 
Expect(err).NotTo(HaveOccurred()) Expect(res.dataOut).To(Equal(pktBytes)) diff --git a/felix/bpf/ut/ip_options_test.go b/felix/bpf/ut/ip_options_test.go index 30a710d4a3f..6ed82b4637b 100644 --- a/felix/bpf/ut/ip_options_test.go +++ b/felix/bpf/ut/ip_options_test.go @@ -28,7 +28,7 @@ func TestMalformedIP(t *testing.T) { iphdr := *ipv4Default iphdr.IHL = 4 - _, _, _, _, pktBytes, err := testPacket(nil, &iphdr, nil, nil) + _, _, _, _, pktBytes, err := testPacketV4(nil, &iphdr, nil, nil) Expect(err).NotTo(HaveOccurred()) skbMark = 0 diff --git a/felix/bpf/ut/ip_parse_test.go b/felix/bpf/ut/ip_parse_test.go new file mode 100644 index 00000000000..2d12ee4be3f --- /dev/null +++ b/felix/bpf/ut/ip_parse_test.go @@ -0,0 +1,78 @@ +// Copyright (c) 2023 Tigera, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ut_test + +import ( + "testing" + + "github.com/google/gopacket/layers" + . 
"github.com/onsi/gomega" +) + +func TestIPv4Parse(t *testing.T) { + RegisterTestingT(t) + + ipHdr := *ipv4Default + ipHdr.Options = []layers.IPv4Option{{ + OptionType: 123, + OptionLength: 6, + OptionData: []byte{0xde, 0xad, 0xbe, 0xef}, + }} + ipHdr.IHL += 2 + + _, _, _, _, pktBytes, err := testPacketV4(nil, &ipHdr, nil, nil) + Expect(err).NotTo(HaveOccurred()) + + runBpfUnitTest(t, "ip_parse_test.c", func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(4)) + }) +} + +func TestIPv6Parse(t *testing.T) { + RegisterTestingT(t) + + _, _, _, _, pktBytes, err := testPacketV6(nil, ipv6Default, nil, nil) + Expect(err).NotTo(HaveOccurred()) + + runBpfUnitTest(t, "ip_parse_test.c", func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(6)) + }, withIPv6()) +} + +func TestIPv6ParseOptsOne(t *testing.T) { + RegisterTestingT(t) + + hop := &layers.IPv6HopByHop{} + hop.NextHeader = layers.IPProtocolUDP + + /* from gopacket ip6_test.go */ + tlv := &layers.IPv6HopByHopOption{} + tlv.OptionType = 0x01 //PadN + tlv.OptionData = []byte{0x00, 0x00, 0x00, 0x00} + hop.Options = append(hop.Options, tlv) + + _, _, _, _, pktBytes, err := testPacketV6(nil, ipv6Default, nil, nil, hop) + Expect(err).NotTo(HaveOccurred()) + + runBpfUnitTest(t, "ip_parse_test.c", func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(6)) + }, withIPv6()) +} diff --git a/felix/bpf/ut/ipv4_opts_test.go b/felix/bpf/ut/ipv4_opts_test.go index a77e2f81bf5..5a7121e2424 100644 --- a/felix/bpf/ut/ipv4_opts_test.go +++ b/felix/bpf/ut/ipv4_opts_test.go @@ -39,7 +39,7 @@ func TestIPv4Opts(t *testing.T) { }} ipHdr.IHL += 2 - _, ipv4, l4, payload, pktBytes, err := testPacket(nil, &ipHdr, nil, nil) + _, ipv4, l4, payload, pktBytes, err := testPacketV4(nil, &ipHdr, nil, nil) Expect(err).NotTo(HaveOccurred()) udp := 
l4.(*layers.UDP) diff --git a/felix/bpf/ut/nat_encap_test.go b/felix/bpf/ut/nat_encap_test.go index e84e994444d..9da83abd767 100644 --- a/felix/bpf/ut/nat_encap_test.go +++ b/felix/bpf/ut/nat_encap_test.go @@ -34,7 +34,7 @@ func TestNatEncap(t *testing.T) { }} ipHdr.IHL += 2 - _, ipv4, l4, payload, pktBytes, err := testPacket(nil, &ipHdr, nil, nil) + _, ipv4, l4, payload, pktBytes, err := testPacketV4(nil, &ipHdr, nil, nil) Expect(err).NotTo(HaveOccurred()) udp := l4.(*layers.UDP) @@ -76,23 +76,32 @@ func TestNatEncap(t *testing.T) { }) } -func checkVxlanEncap(pktR gopacket.Packet, NATed bool, ipv4 *layers.IPv4, +func checkVxlanEncap(pktR gopacket.Packet, NATed bool, iphdr gopacket.Layer, transport gopacket.Layer, payload []byte) { inner := checkVxlan(pktR) - checkInnerIP(inner, NATed, ipv4, transport, payload) + checkInnerIP(inner, NATed, iphdr, transport, payload) } func checkVxlan(pktR gopacket.Packet) gopacket.Packet { + ipType := layers.LayerTypeIPv4 + ethType := layers.EthernetTypeIPv4 + ipv4L := pktR.Layer(layers.LayerTypeIPv4) - Expect(ipv4L).NotTo(BeNil()) - ipv4R := ipv4L.(*layers.IPv4) + if ipv4L != nil { + ipv4R := ipv4L.(*layers.IPv4) - ipv4CSum := ipv4R.Checksum - iptmp := gopacket.NewSerializeBuffer() - err := ipv4R.SerializeTo(iptmp, gopacket.SerializeOptions{ComputeChecksums: true}) // recompute csum - Expect(err).NotTo(HaveOccurred()) - Expect(ipv4CSum).To(Equal(ipv4R.Checksum)) + ipv4CSum := ipv4R.Checksum + iptmp := gopacket.NewSerializeBuffer() + err := ipv4R.SerializeTo(iptmp, gopacket.SerializeOptions{ComputeChecksums: true}) // recompute csum + Expect(err).NotTo(HaveOccurred()) + Expect(ipv4CSum).To(Equal(ipv4R.Checksum)) + } else { + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + ipType = layers.LayerTypeIPv6 + ethType = layers.EthernetTypeIPv6 + } udpL := pktR.Layer(layers.LayerTypeUDP) Expect(udpL).NotTo(BeNil()) @@ -115,10 +124,10 @@ func checkVxlan(pktR gopacket.Packet) gopacket.Packet { &layers.Ethernet{ 
SrcMAC: []byte{0, 0, 0, 0, 0, 0}, DstMAC: []byte{0, 0, 0, 0, 0, 0}, - EthernetType: layers.EthernetTypeIPv4, + EthernetType: ethType, })) - return gopacket.NewPacket(ethL.LayerPayload(), layers.LayerTypeIPv4, gopacket.Default) + return gopacket.NewPacket(ethL.LayerPayload(), ipType, gopacket.Default) } func encapedResponse(pktR gopacket.Packet) []byte { @@ -158,7 +167,10 @@ func encapedResponse(pktR gopacket.Packet) []byte { func getVxlanVNI(pktR gopacket.Packet) uint32 { ipv4L := pktR.Layer(layers.LayerTypeIPv4) - Expect(ipv4L).NotTo(BeNil()) + if ipv4L == nil { + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + } udpL := pktR.Layer(layers.LayerTypeUDP) Expect(udpL).NotTo(BeNil()) @@ -178,17 +190,32 @@ func getVxlanVNI(pktR gopacket.Packet) uint32 { return vxlanL.(*layers.VXLAN).VNI } -func checkInnerIP(ip gopacket.Packet, NATed bool, ipv4 *layers.IPv4, +func checkInnerIP(ip gopacket.Packet, NATed bool, iphdr gopacket.Layer, transport gopacket.Layer, payload []byte) { - ipv4L := ip.Layer(layers.LayerTypeIPv4) - Expect(ipv4L).NotTo(BeNil()) - if NATed { - Expect(ipv4L).To(layersMatchFields(ipv4, "Checksum", "TTL", "Options", "Padding")) - } else { - Expect(ipv4L).To(layersMatchFields(ipv4, "DstIP", "Checksum", "TTL", "Options", "Padding")) - } - Expect(ipv4L.(*layers.IPv4).TTL).To(Equal(ipv4.TTL - 1)) + switch t := iphdr.(type) { + case *layers.IPv4: + ipv4L := ip.Layer(layers.LayerTypeIPv4) + Expect(ipv4L).NotTo(BeNil()) + if NATed { + Expect(ipv4L).To(layersMatchFields(iphdr, "Checksum", "TTL", "Options", "Padding")) + } else { + Expect(ipv4L).To(layersMatchFields(iphdr, "DstIP", "Checksum", "TTL", "Options", "Padding")) + } + + Expect(ipv4L.(*layers.IPv4).TTL).To(Equal(t.TTL - 1)) + case *layers.IPv6: + ipv6L := ip.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + if NATed { + Expect(ipv6L).To(layersMatchFields(iphdr, "HopLimit", "HopByHop")) + } else { + Expect(ipv6L).To(layersMatchFields(iphdr, "DstIP", "HopLimit", 
"HopByHop")) + } + Expect(ipv6L.(*layers.IPv6).HopLimit).To(Equal(t.HopLimit - 1)) + default: + panic("xxx") + } transportL := ip.Layer(transport.LayerType()) Expect(transportL).NotTo(BeNil()) diff --git a/felix/bpf/ut/nat_test.go b/felix/bpf/ut/nat_test.go index 5d04ab70726..55658a6b88e 100644 --- a/felix/bpf/ut/nat_test.go +++ b/felix/bpf/ut/nat_test.go @@ -67,6 +67,7 @@ func TestNATPodPodXNode(t *testing.T) { nat.NewNATBackendValue(natIP, natPort).AsBytes(), ) Expect(err).NotTo(HaveOccurred()) + dumpNATMap(natMap) ctMap := conntrack.Map() err = ctMap.EnsureExists() @@ -101,7 +102,7 @@ func TestNATPodPodXNode(t *testing.T) { udpNat.DstPort = layers.UDPPort(natPort) // created the expected packet after NAT, with recalculated csums - _, _, _, _, resPktBytes, err := testPacket(eth, &ipv4Nat, &udpNat, payload) + _, _, _, _, resPktBytes, err := testPacketV4(eth, &ipv4Nat, &udpNat, payload) Expect(err).NotTo(HaveOccurred()) // expect them to be the same @@ -135,7 +136,7 @@ func TestNATPodPodXNode(t *testing.T) { udpNat.DstPort = layers.UDPPort(natPort) // created the expected packet after NAT, with recalculated csums - _, _, _, _, resPktBytes, err := testPacket(eth, &ipv4Nat, &udpNat, payload) + _, _, _, _, resPktBytes, err := testPacketV4(eth, &ipv4Nat, &udpNat, payload) Expect(err).NotTo(HaveOccurred()) // expect them to be the same @@ -790,7 +791,7 @@ func TestNATNodePort(t *testing.T) { /* * TEST that unknown VNI is passed through */ - testUnrelatedVXLAN(t, node2ip, vni) + testUnrelatedVXLAN(4, t, node2ip, vni) // TEST host-networked backend { @@ -1294,17 +1295,32 @@ func TestNATNodePortMultiNIC(t *testing.T) { dumpCTMap(ctMap) } -func testUnrelatedVXLAN(t *testing.T, nodeIP net.IP, vni uint32) { +func testUnrelatedVXLAN(ipver int, t *testing.T, nodeIP net.IP, vni uint32) { vxlanTest := func(fillUDPCsum bool, validVNI bool) { + var opts []testOption + var iphdr gopacket.SerializableLayer + eth := ethDefault - ipv4 := &layers.IPv4{ - Version: 4, - IHL: 5, - TTL: 64, 
- Flags: layers.IPv4DontFragment, - SrcIP: net.IPv4(1, 2, 3, 4), - DstIP: nodeIP, - Protocol: layers.IPProtocolUDP, + + if ipver == 4 { + iphdr = &layers.IPv4{ + Version: 4, + IHL: 5, + TTL: 64, + Flags: layers.IPv4DontFragment, + SrcIP: net.IPv4(1, 2, 3, 4), + DstIP: nodeIP, + Protocol: layers.IPProtocolUDP, + } + } else { + iphdr = &layers.IPv6{ + Version: 6, + HopLimit: 64, + SrcIP: net.ParseIP("abcd:ef12::ffff:0102:0304"), + DstIP: nodeIP, + NextHeader: layers.IPProtocolUDP, + } + opts = append(opts, withIPv6()) } udp := &layers.UDP{ @@ -1320,11 +1336,11 @@ func testUnrelatedVXLAN(t *testing.T, nodeIP net.IP, vni uint32) { payload := make([]byte, 64) udp.Length = uint16(8 + 8 + len(payload)) - _ = udp.SetNetworkLayerForChecksum(ipv4) + _ = udp.SetNetworkLayerForChecksum(iphdr.(gopacket.NetworkLayer)) pkt := gopacket.NewSerializeBuffer() err := gopacket.SerializeLayers(pkt, gopacket.SerializeOptions{ComputeChecksums: true}, - eth, ipv4, udp, vxlan, gopacket.Payload(payload)) + eth, iphdr, udp, vxlan, gopacket.Payload(payload)) Expect(err).NotTo(HaveOccurred()) pktBytes := pkt.Bytes() @@ -1338,7 +1354,7 @@ func testUnrelatedVXLAN(t *testing.T, nodeIP net.IP, vni uint32) { fmt.Printf("pktR = %+v\n", pktR) Expect(res.dataOut).To(Equal(pktBytes)) - }) + }, opts...) 
} hostIP = nodeIP @@ -1350,7 +1366,7 @@ func testUnrelatedVXLAN(t *testing.T, nodeIP net.IP, vni uint32) { func TestNATNodePortICMPTooBig(t *testing.T) { RegisterTestingT(t) - _, ipv4, l4, _, pktBytes, err := testPacket(nil, nil, nil, make([]byte, natTunnelMTU)) + _, ipv4, l4, _, pktBytes, err := testPacketV4(nil, nil, nil, make([]byte, natTunnelMTU)) Expect(err).NotTo(HaveOccurred()) udp := l4.(*layers.UDP) @@ -1435,7 +1451,7 @@ func TestNormalSYNRetryForcePolicy(t *testing.T) { DataOffset: 5, } - _, ipv4, _, _, synPkt, err := testPacket(nil, nil, tcpSyn, nil) + _, ipv4, _, _, synPkt, err := testPacketV4(nil, nil, tcpSyn, nil) Expect(err).NotTo(HaveOccurred()) // Insert a reverse route for the source workload. @@ -1539,7 +1555,7 @@ func TestNATSYNRetryGoesToSameBackend(t *testing.T) { DataOffset: 5, } - _, ipv4, _, _, synPkt, err := testPacket(nil, nil, tcpSyn, nil) + _, ipv4, _, _, synPkt, err := testPacketV4(nil, nil, tcpSyn, nil) Expect(err).NotTo(HaveOccurred()) err = natMap.Update( @@ -1589,7 +1605,7 @@ func TestNATSYNRetryGoesToSameBackend(t *testing.T) { seenOtherIP := false for attempt := 0; attempt < 100; attempt++ { tcpSyn.SrcPort++ - _, _, _, _, synPkt, err := testPacket(nil, nil, tcpSyn, nil) + _, _, _, _, synPkt, err := testPacketV4(nil, nil, tcpSyn, nil) Expect(err).NotTo(HaveOccurred()) res, err := bpfrun(synPkt) Expect(err).NotTo(HaveOccurred()) @@ -1610,7 +1626,7 @@ func TestNATSYNRetryGoesToSameBackend(t *testing.T) { // Change back to the original SYN packet so that we can test the new policy // with an existing CT entry. 
tcpSyn.SrcPort = origTCPSrcPort - _, _, _, _, synPkt, err = testPacket(nil, nil, tcpSyn, nil) + _, _, _, _, synPkt, err = testPacketV4(nil, nil, tcpSyn, nil) Expect(err).NotTo(HaveOccurred()) bpfIfaceName = "SYNP" @@ -2255,7 +2271,7 @@ func TestNATSourceCollision(t *testing.T) { var recvPkt []byte - _, _, _, _, pktBytes, _ := testPacket(nil, pktIPHdr, pktTCPHdr, + _, _, _, _, pktBytes, _ := testPacketV4(nil, pktIPHdr, pktTCPHdr, []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11, 22, 33, 44, 55, 66, 77, 88, 99, 0}) skbMark = 0 @@ -2356,7 +2372,7 @@ func TestNATSourceCollision(t *testing.T) { pktTCPHdr.ACK = true pktTCPHdr.Seq = 1 - _, _, _, _, pktBytes, _ = testPacket(nil, pktIPHdr, pktTCPHdr, + _, _, _, _, pktBytes, _ = testPacketV4(nil, pktIPHdr, pktTCPHdr, []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11, 22, 33, 44, 55, 66, 77, 88, 99, 0}) dumpCTMap(ctMap) @@ -2414,7 +2430,7 @@ func TestNATSourceCollision(t *testing.T) { DataOffset: 5, } - _, _, _, _, pktBytes, _ = testPacket(nil, pktIPHdr, pktTCPHdr, + _, _, _, _, pktBytes, _ = testPacketV4(nil, pktIPHdr, pktTCPHdr, []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11, 22, 33, 44, 55, 66, 77, 88, 99, 0}) skbMark = 0 @@ -2605,3 +2621,863 @@ func TestNATHostRemoteNPLocalPod(t *testing.T) { dumpCTMap(ctMap) } + +func TestNATPodPodXNodeV6(t *testing.T) { + RegisterTestingT(t) + + bpfIfaceName = "NAT1" + defer func() { bpfIfaceName = "" }() + + eth, ipv6, l4, payload, pktBytes, err := testPacketUDPDefaultNPV6(node1ipV6) + Expect(err).NotTo(HaveOccurred()) + udp := l4.(*layers.UDP) + + err = natMapV6.Update( + nat.NewNATKeyV6(ipv6.DstIP, uint16(udp.DstPort), uint8(17)).AsBytes(), + nat.NewNATValueV6(0, 1, 0, 0).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + natIP := net.ParseIP("abcd::ffff:0808:0808") + natPort := uint16(666) + + err = natBEMapV6.Update( + nat.NewNATBackendKeyV6(0, 0).AsBytes(), + nat.NewNATBackendValueV6(natIP, natPort).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + resetCTMapV6(ctMapV6) // ensure it is clean + + 
var natedPkt []byte + + hostIP = node1ipV6 + + // Insert a reverse route for the source workload that is not in a calico + // pool, for example when a 3rd party CNI is used. + rtKey := routes.NewKeyV6(srcV6CIDR).AsBytes() + rtVal := routes.NewValueV6WithIfIndex(routes.FlagsLocalWorkload, 1).AsBytes() + err = rtMapV6.Update(rtKey, rtVal) + Expect(err).NotTo(HaveOccurred()) + dumpRTMapV6(rtMapV6) + dumpNATMapV6(natMapV6) + + skbMark = 0 + // Leaving workload + runBpfTest(t, "calico_from_workload_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ipv6Nat := *ipv6Default + ipv6Nat.DstIP = natIP + + udpNat := *udp + udpNat.DstPort = layers.UDPPort(natPort) + + // created the expected packet after NAT, with recalculated csums + _, _, _, _, resPktBytes, err := testPacketV6(eth, &ipv6Nat, &udpNat, payload, ipv6HopByHopExt()) + Expect(err).NotTo(HaveOccurred()) + + // expect them to be the same + Expect(res.dataOut).To(Equal(resPktBytes)) + + natedPkt = res.dataOut + }, withIPv6()) + expectMark(tcdefs.MarkSeenSkipFIB) + + resetCTMapV6(ctMapV6) + + // Insert a reverse route for the source workload that is in pool. 
+ rtVal = routes.NewValueV6WithIfIndex(routes.FlagsLocalWorkload|routes.FlagInIPAMPool, 1).AsBytes() + err = rtMapV6.Update(rtKey, rtVal) + Expect(err).NotTo(HaveOccurred()) + + skbMark = 0 + // Leaving workload + runBpfTest(t, "calico_from_workload_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ipv6Nat := *ipv6Default + ipv6Nat.DstIP = natIP + + udpNat := *udp + udpNat.DstPort = layers.UDPPort(natPort) + + // created the expected packet after NAT, with recalculated csums + _, _, _, _, resPktBytes, err := testPacketV6(eth, &ipv6Nat, &udpNat, payload, ipv6HopByHopExt()) + Expect(err).NotTo(HaveOccurred()) + + // expect them to be the same + Expect(res.dataOut).To(Equal(resPktBytes)) + + natedPkt = res.dataOut + }, withIPv6()) + + // Leaving node 1 + expectMark(tcdefs.MarkSeen) + + runBpfTest(t, "calico_to_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(natedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + Expect(res.dataOut).To(Equal(natedPkt)) + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + fromHostCT := saveCTMapV6(ctMapV6) + resetCTMapV6(ctMapV6) + + var recvPkt []byte + + hostIP = node2ipV6 + + skbMark = 0 + + // Insert the reverse route for backend for RPF check. 
+ resetRTMapV6(rtMapV6) + beV6CIDR := ip.CIDRFromNetIP(natIP).(ip.V6CIDR) + bertKey := routes.NewKeyV6(beV6CIDR).AsBytes() + bertVal := routes.NewValueV6WithIfIndex(routes.FlagsLocalWorkload|routes.FlagInIPAMPool, 1).AsBytes() + err = rtMapV6.Update(bertKey, bertVal) + Expect(err).NotTo(HaveOccurred()) + + bpfIfaceName = "NAT2" + // Arriving at node 2 + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(natedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + Expect(res.dataOut).To(Equal(natedPkt)) + }, withIPv6()) + + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + v, ok := ct[conntrack.NewKeyV6(uint8(17), ipv6.SrcIP, uint16(udp.SrcPort), natIP, natPort)] + Expect(ok).To(BeTrue()) + // No NATing, service already resolved + Expect(v.Type()).To(Equal(conntrack.TypeNormal)) + Expect(v.Flags()).To(Equal(uint16(0))) + + // Arriving at workload at node 2 + expectMark(tcdefs.MarkSeen) + runBpfTest(t, "calico_to_workload_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(natedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + Expect(res.dataOut).To(Equal(natedPkt)) + + recvPkt = res.dataOut + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + + var respPkt []byte + + // Response leaving workload at node 2 + skbMark = 0 + runBpfTest(t, "calico_from_workload_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) { + respPkt = udpResponseRawV6(recvPkt) + res, err := bpfrun(respPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = 
%+v\n", pktR) + + Expect(res.dataOut).To(Equal(respPkt)) + }, withIPv6()) + + // Response leaving node 2 + expectMark(tcdefs.MarkSeenBypass) + runBpfTest(t, "calico_to_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(respPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + Expect(res.dataOut).To(Equal(respPkt)) + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + resetCTMapV6(ctMapV6) + restoreCTMapV6(ctMapV6, fromHostCT) + dumpCTMapV6(ctMapV6) + + hostIP = node1ipV6 + + // Response arriving at node 1 + bpfIfaceName = "NAT1" + skbMark = 0 + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(respPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + Expect(res.dataOut).To(Equal(respPkt)) + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + + // Response arriving at workload at node 1 + expectMark(tcdefs.MarkSeen) + runBpfTest(t, "calico_to_workload_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) { + pktExp := gopacket.NewPacket(respPkt, layers.LayerTypeEthernet, gopacket.Default) + ipv6L := pktExp.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + ipv6R := ipv6L.(*layers.IPv6) + udpL := pktExp.Layer(layers.LayerTypeUDP) + Expect(udpL).NotTo(BeNil()) + udpR := udpL.(*layers.UDP) + + ipv6R.SrcIP = ipv6.DstIP + udpR.SrcPort = udp.DstPort + _ = udpR.SetNetworkLayerForChecksum(ipv6R) + + pktExpSer := gopacket.NewSerializeBuffer() + err := gopacket.SerializePacket(pktExpSer, gopacket.SerializeOptions{ComputeChecksums: true}, pktExp) + Expect(err).NotTo(HaveOccurred()) + + res, err := bpfrun(respPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := 
gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + Expect(res.dataOut).To(Equal(pktExpSer.Bytes())) + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + + // Response leaving to original source + + // clean up + resetCTMapV6(ctMapV6) +} + +func TestNATNodePortV6(t *testing.T) { + RegisterTestingT(t) + + bpfIfaceName = "NP-1" + defer func() { bpfIfaceName = "" }() + + _, ipv6, l4, payload, pktBytes, err := testPacketUDPDefaultNPV6(node1ipV6) + Expect(err).NotTo(HaveOccurred()) + udp := l4.(*layers.UDP) + + err = natMapV6.Update( + nat.NewNATKeyV6(ipv6.DstIP, uint16(udp.DstPort), uint8(17 /* UDP */)).AsBytes(), + nat.NewNATValueV6(0, 1, 0, 0).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + natIP := net.ParseIP("abcd::ffff:0808:0808") + natPort := uint16(666) + + err = natBEMapV6.Update( + nat.NewNATBackendKeyV6(0, 0).AsBytes(), + nat.NewNATBackendValueV6(natIP, natPort).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + node2wCIDR := net.IPNet{ + IP: natIP, + Mask: net.CIDRMask(128, 128), + } + + resetCTMapV6(ctMapV6) // ensure it is clean + + var encapedPkt []byte + + resetRTMap(rtMapV6) + + hostIP = node1ipV6 + skbMark = 0 + + // Arriving at node 1 - non-routable -> denied + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_SHOT)) + }, withIPv6()) + + defer resetRTMapV6(rtMapV6) + Expect(err).NotTo(HaveOccurred()) + err = rtMapV6.Update( + routes.NewKeyV6(ip.CIDRFromIPNet(&node2wCIDR).(ip.V6CIDR)).AsBytes(), + routes.NewValueV6WithNextHop(routes.FlagsRemoteWorkload|routes.FlagInIPAMPool, + ip.FromNetIP(node2ipV6).(ip.V6Addr)).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + err = rtMapV6.Update( + routes.NewKeyV6(ip.CIDRFromIPNet(&node1CIDRV6).(ip.V6CIDR)).AsBytes(), + routes.NewValueV6(routes.FlagsLocalHost).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + err = 
rtMapV6.Update( + routes.NewKeyV6(ip.CIDRFromIPNet(&node2CIDRV6).(ip.V6CIDR)).AsBytes(), + routes.NewValueV6(routes.FlagsRemoteHost).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + dumpRTMapV6(rtMapV6) + rtNode1 := saveRTMapV6(rtMapV6) + + vni := uint32(0) + + // Arriving at node 1 + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + ipv6R := ipv6L.(*layers.IPv6) + Expect(ipv6R.SrcIP.String()).To(Equal(hostIP.String())) + Expect(ipv6R.DstIP.String()).To(Equal(node2ipV6.String())) + + checkVxlanEncap(pktR, false, ipv6, udp, payload) + vni = getVxlanVNI(pktR) + + encapedPkt = res.dataOut + + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + + ctKey := conntrack.NewKeyV6(uint8(17 /* UDP */), + ipv6.SrcIP, uint16(udp.SrcPort), ipv6.DstIP, uint16(udp.DstPort)) + + Expect(ct).Should(HaveKey(ctKey)) + ctr := ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATForward)) + + ctKey = ctr.ReverseNATKey() + Expect(ct).Should(HaveKey(ctKey)) + ctr = ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATReverse)) + + // Approved for both sides due to forwarding through the tunnel + Expect(ctr.Data().A2B.Approved).To(BeTrue()) + Expect(ctr.Data().B2A.Approved).To(BeTrue()) + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + v, ok := ct[conntrack.NewKeyV6(uint8(17 /* UDP */), ipv6.SrcIP, uint16(udp.SrcPort), natIP, natPort)] + Expect(ok).To(BeTrue()) + Expect(v.Type()).To(Equal(conntrack.TypeNATReverse)) + Expect(v.Flags()).To(Equal(conntrack3.FlagNATNPFwd)) + + expectMark(tcdefs.MarkSeenBypassForward) + // Leaving node 1 + 
runBpfTest(t, "calico_to_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(encapedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + Expect(res.dataOut).To(Equal(encapedPkt)) + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + fromHostCT := saveCTMapV6(ctMapV6) + + encapedPktArrivesAtNode2 := make([]byte, len(encapedPkt)) + copy(encapedPktArrivesAtNode2, encapedPkt) + + resetCTMapV6(ctMapV6) + + var recvPkt []byte + + hostIP = node2ipV6 + + // change the routing - it is a local workload now! + err = rtMapV6.Update( + routes.NewKeyV6(ip.CIDRFromIPNet(&node2wCIDR).(ip.V6CIDR)).AsBytes(), + routes.NewValueV6(routes.FlagsLocalWorkload|routes.FlagInIPAMPool).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + // we must know that the encaped packet src ip is from a known host + err = rtMapV6.Update( + routes.NewKeyV6(ip.CIDRFromIPNet(&node1CIDRV6).(ip.V6CIDR)).AsBytes(), + routes.NewValueV6(routes.FlagsRemoteHost).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + err = rtMapV6.Update( + routes.NewKeyV6(ip.CIDRFromIPNet(&node2CIDRV6).(ip.V6CIDR)).AsBytes(), + routes.NewValueV6(routes.FlagsLocalHost).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + dumpRTMapV6(rtMapV6) + + // now we are at the node with local workload + err = natMapV6.Update( + nat.NewNATKeyV6(ipv6.DstIP, uint16(udp.DstPort), uint8(17 /* UDP */)).AsBytes(), + nat.NewNATValueV6(0 /* id */, 1 /* count */, 1 /* local */, 0).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + // Arriving at node 2 + bpfIfaceName = "NP-2" + + arpMapN2 := saveARPMapV6(arpMapV6) + Expect(arpMapN2).To(HaveLen(0)) + + skbMark = 0 + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(encapedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := 
gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + payloadL := pktR.ApplicationLayer() + Expect(payloadL).NotTo(BeNil()) + vxlanL := gopacket.NewPacket(payloadL.Payload(), layers.LayerTypeVXLAN, gopacket.Default) + Expect(vxlanL).NotTo(BeNil()) + fmt.Printf("vxlanL = %+v\n", vxlanL) + + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + ipv6R := ipv6L.(*layers.IPv6) + Expect(ipv6R.SrcIP.String()).To(Equal(ipv6.SrcIP.String())) + Expect(ipv6R.DstIP.String()).To(Equal(natIP.String())) + + udpL := pktR.Layer(layers.LayerTypeUDP) + Expect(udpL).NotTo(BeNil()) + udpR := udpL.(*layers.UDP) + Expect(udpR.SrcPort).To(Equal(layers.UDPPort(udp.SrcPort))) + Expect(udpR.DstPort).To(Equal(layers.UDPPort(natPort))) + + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + + ctKey := conntrack.NewKeyV6(uint8(17 /* UDP */), + ipv6.SrcIP, uint16(udp.SrcPort), ipv6.DstIP, uint16(udp.DstPort)) + + Expect(ct).Should(HaveKey(ctKey)) + ctr := ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATForward)) + Expect(ctr.NATSPort()).To(Equal(uint16(0))) + + ctKey = ctr.ReverseNATKey() + Expect(ct).Should(HaveKey(ctKey)) + ctr = ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATReverse)) + + // Approved source side + Expect(ctr.Data().A2B.Approved).To(BeTrue()) + // Dest not approved yet + Expect(ctr.Data().B2A.Approved).NotTo(BeTrue()) + + recvPkt = res.dataOut + }, withIPv6()) + + expectMark(tcdefs.MarkSeen) + + dumpCTMapV6(ctMapV6) + ct, err = conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + v, ok = ct[conntrack.NewKeyV6(uint8(17 /* UDP */), ipv6.SrcIP, uint16(udp.SrcPort), natIP, natPort)] + Expect(ok).To(BeTrue()) + Expect(v.Type()).To(Equal(conntrack.TypeNATReverse)) + Expect(v.Flags()).To(Equal(conntrack3.FlagExtLocal)) + + dumpARPMapV6(arpMapV6) + + arpMapN2 = saveARPMapV6(arpMapV6) + Expect(arpMapN2).To(HaveLen(1)) + arpKey := arp.NewKeyV6(node1ipV6, 1) // ifindex is 
always 1 in UT + Expect(arpMapN2).To(HaveKey(arpKey)) + macDst := encapedPkt[0:6] + macSrc := encapedPkt[6:12] + Expect(arpMapN2[arpKey]).To(Equal(arp.NewValue(macDst, macSrc))) + + // try a spoofed tunnel packet, should be dropped and have no effect + skbMark = 0 + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + // modify the only known good src IP, we do not care about csums at this point + encapedPkt[26] = 234 + res, err := bpfrun(encapedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_SHOT)) + }, withIPv6()) + + skbMark = tcdefs.MarkSeen + + // Insert the reverse route for backend for RPF check. + resetRTMap(rtMapV6) + beV4CIDR := ip.CIDRFromNetIP(natIP).(ip.V6CIDR) + bertKey := routes.NewKeyV6(beV4CIDR).AsBytes() + bertVal := routes.NewValueV6WithIfIndex(routes.FlagsLocalWorkload|routes.FlagInIPAMPool, 1).AsBytes() + err = rtMapV6.Update(bertKey, bertVal) + Expect(err).NotTo(HaveOccurred()) + + // Arriving at workload at node 2 + runBpfTest(t, "calico_to_workload_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(recvPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + Expect(res.dataOut).To(Equal(recvPkt)) + + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + + ctKey := conntrack.NewKeyV6(uint8(17 /* UDP */), + ipv6.SrcIP, uint16(udp.SrcPort), ipv6.DstIP, uint16(udp.DstPort)) + + Expect(ct).Should(HaveKey(ctKey)) + ctr := ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATForward)) + + ctKey = ctr.ReverseNATKey() + Expect(ct).Should(HaveKey(ctKey)) + ctr = ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATReverse), + fmt.Sprintf("Expected reverse conntrack entry but got %v", ctr)) + + // Approved source side + Expect(ctr.Data().A2B.Approved).To(BeTrue()) + // Approved 
destination side as well + Expect(ctr.Data().B2A.Approved).To(BeTrue()) + }, withIPv6()) + + skbMark = 0 + + // Response leaving workload at node 2 + runBpfTest(t, "calico_from_workload_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) { + respPkt := udpResponseRawV6(recvPkt) + // Change the MAC addresses so that we can observe that the right + // addresses were patched in. + copy(respPkt[:6], []byte{1, 2, 3, 4, 5, 6}) + copy(respPkt[6:12], []byte{6, 5, 4, 3, 2, 1}) + res, err := bpfrun(respPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_REDIRECT)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ethL := pktR.Layer(layers.LayerTypeEthernet) + Expect(ethL).NotTo(BeNil()) + ethR := ethL.(*layers.Ethernet) + Expect(ethR).To(layersMatchFields(&layers.Ethernet{ + SrcMAC: macDst, + DstMAC: macSrc, + EthernetType: layers.EthernetTypeIPv6, + })) + + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + ipv6R := ipv6L.(*layers.IPv6) + Expect(ipv6R.SrcIP.String()).To(Equal(hostIP.String())) + Expect(ipv6R.DstIP.String()).To(Equal(node1ipV6.String())) + + checkVxlan(pktR) + + encapedPkt = res.dataOut + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + + expectMark(tcdefs.MarkSeen) + + hostIP = node2ipV6 + + // Response leaving node 2 + runBpfTest(t, "calico_to_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(encapedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + ipv6R := ipv6L.(*layers.IPv6) + // check that the IP is fixed up + Expect(ipv6R.SrcIP.String()).To(Equal(node2ipV6.String())) + Expect(ipv6R.DstIP.String()).To(Equal(node1ipV6.String())) + + checkVxlan(pktR) + + encapedPkt = res.dataOut + }, 
withIPv6()) + + dumpCTMapV6(ctMapV6) + resetCTMapV6(ctMapV6) + restoreCTMapV6(ctMapV6, fromHostCT) + dumpCTMapV6(ctMapV6) + + hostIP = node1ipV6 + + // change the routing again to a remote workload + resetRTMap(rtMapV6) + restoreRTMapV6(rtMapV6, rtNode1) + dumpRTMapV6(rtMapV6) + + // Response arriving at node 1 + bpfIfaceName = "NP-1" + skbMark = 0 + + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(encapedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + ipv6R := ipv6L.(*layers.IPv6) + Expect(ipv6R.DstIP.String()).To(Equal(ipv6.SrcIP.String())) + Expect(ipv6R.SrcIP.String()).To(Equal(ipv6.DstIP.String())) + + udpL := pktR.Layer(layers.LayerTypeUDP) + Expect(udpL).NotTo(BeNil()) + udpR := udpL.(*layers.UDP) + Expect(udpR.SrcPort).To(Equal(udp.DstPort)) + Expect(udpR.DstPort).To(Equal(udp.SrcPort)) + + payloadL := pktR.ApplicationLayer() + Expect(payloadL).NotTo(BeNil()) + Expect(payload).To(Equal(payloadL.Payload())) + + recvPkt = res.dataOut + }, withIPv6()) + + expectMark(tcdefs.MarkSeenBypassForward) + saveMark := skbMark + + dumpCTMapV6(ctMapV6) + + skbMark = 0 + // try a spoofed tunnel packet returning back, should be dropped and have no effect + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + // modify the only known good src IP, we do not care about csums at this point + encapedPkt[26] = 235 + res, err := bpfrun(encapedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_SHOT)) + }, withIPv6()) + + skbMark = saveMark + // Response leaving to original source + runBpfTest(t, "calico_to_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(recvPkt) + Expect(err).NotTo(HaveOccurred()) + 
Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + + ctKey := conntrack.NewKeyV6(uint8(17 /* UDP */), + ipv6.SrcIP, uint16(udp.SrcPort), ipv6.DstIP, uint16(udp.DstPort)) + + Expect(ct).Should(HaveKey(ctKey)) + ctr := ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATForward)) + + ctKey = ctr.ReverseNATKey() + Expect(ct).Should(HaveKey(ctKey)) + ctr = ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATReverse)) + + // Approved for both sides due to forwarding through the tunnel + Expect(ctr.Data().A2B.Approved).To(BeTrue()) + Expect(ctr.Data().B2A.Approved).To(BeTrue()) + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + + skbMark = 0 + // Another pkt arriving at node 1 - uses existing CT entries + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + ipv6R := ipv6L.(*layers.IPv6) + Expect(ipv6R.SrcIP.String()).To(Equal(hostIP.String())) + Expect(ipv6R.DstIP.String()).To(Equal(node2ipV6.String())) + + checkVxlanEncap(pktR, false, ipv6, udp, payload) + }, withIPv6()) + + expectMark(tcdefs.MarkSeenBypassForward) + + /* + * TEST that unknown VNI is passed through + */ + testUnrelatedVXLAN(6, t, node2ipV6, vni) + + // TEST host-networked backend + { + resetCTMapV6(ctMapV6) + + var recvPkt []byte + + hostIP = node2ipV6 + skbMark = 0 + + // we must know that the encaped packet src ip is from a known host + err = rtMapV6.Update( + routes.NewKeyV6(ip.CIDRFromIPNet(&node1CIDRV6).(ip.V6CIDR)).AsBytes(), + 
routes.NewValueV6(routes.FlagsRemoteHost).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + err = rtMapV6.Update( + routes.NewKeyV6(ip.CIDRFromIPNet(&node2CIDRV6).(ip.V6CIDR)).AsBytes(), + routes.NewValueV6(routes.FlagsLocalHost).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + dumpRTMapV6(rtMapV6) + + // now we are at the node with local workload + err = natMapV6.Update( + nat.NewNATKeyV6(net.ParseIP("ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"), + uint16(udp.DstPort), uint8(17 /* UDP */)).AsBytes(), + nat.NewNATValueV6(0 /* id */, 1 /* count */, 1 /* local */, 0).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + // make it point to the local host - host networked backend + err = natBEMapV6.Update( + nat.NewNATBackendKeyV6(0, 0).AsBytes(), + nat.NewNATBackendValueV6(node2ipV6, natPort).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + // Arriving at node 2 + bpfIfaceName = "NP-2" + + skbMark = 0 + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(encapedPktArrivesAtNode2) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + ipv6R := ipv6L.(*layers.IPv6) + Expect(ipv6R.SrcIP.String()).To(Equal(ipv6.SrcIP.String())) + Expect(ipv6R.DstIP.String()).To(Equal(node2ipV6.String())) + + udpL := pktR.Layer(layers.LayerTypeUDP) + Expect(udpL).NotTo(BeNil()) + udpR := udpL.(*layers.UDP) + Expect(udpR.SrcPort).To(Equal(layers.UDPPort(udp.SrcPort))) + Expect(udpR.DstPort).To(Equal(layers.UDPPort(natPort))) + + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + + ctKey := conntrack.NewKeyV6(uint8(17 /* UDP */), + ipv6.SrcIP, uint16(udp.SrcPort), ipv6.DstIP, uint16(udp.DstPort)) + + Expect(ct).Should(HaveKey(ctKey)) + ctr := ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATForward)) + + ctKey = 
ctr.ReverseNATKey() + Expect(ct).Should(HaveKey(ctKey)) + ctr = ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATReverse)) + + // Approved source side + Expect(ctr.Data().A2B.Approved).To(BeTrue()) + // Dest not approved yet + Expect(ctr.Data().B2A.Approved).NotTo(BeTrue()) + + recvPkt = res.dataOut + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + + skbMark = 0 + + // Response leaving workload at node 2 + runBpfTest(t, "calico_to_host_ep", nil, func(bpfrun bpfProgRunFn) { + respPkt := udpResponseRawV6(recvPkt) + + // Change the MAC addresses so that we can observe that the right + // addresses were patched in. + macUntouched := []byte{6, 5, 4, 3, 2, 1} + copy(respPkt[:6], []byte{1, 2, 3, 4, 5, 6}) + copy(respPkt[6:12], macUntouched) + + res, err := bpfrun(respPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ethL := pktR.Layer(layers.LayerTypeEthernet) + Expect(ethL).NotTo(BeNil()) + ethR := ethL.(*layers.Ethernet) + Expect(ethR).To(layersMatchFields(&layers.Ethernet{ + SrcMAC: macUntouched, // Source is set by net stack and should not be touched. 
+ DstMAC: macSrc, + EthernetType: layers.EthernetTypeIPv6, + })) + + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + ipv6R := ipv6L.(*layers.IPv6) + Expect(ipv6R.SrcIP.String()).To(Equal(node2ipV6.String())) + Expect(ipv6R.DstIP.String()).To(Equal(node1ipV6.String())) + + checkVxlan(pktR) + }, withHostNetworked(), withIPv6()) + } +} diff --git a/felix/bpf/ut/snat_test.go b/felix/bpf/ut/snat_test.go index a8b91f25372..900d86a7d8b 100644 --- a/felix/bpf/ut/snat_test.go +++ b/felix/bpf/ut/snat_test.go @@ -38,7 +38,7 @@ func TestSNATHostServiceRemotePod(t *testing.T) { ipHdr := ipv4Default ipHdr.Id = 1 - eth, ipv4, l4, payload, pktBytes, err := testPacket(nil, ipHdr, nil, nil) + eth, ipv4, l4, payload, pktBytes, err := testPacketV4(nil, ipHdr, nil, nil) Expect(err).NotTo(HaveOccurred()) udp := l4.(*layers.UDP) @@ -113,7 +113,7 @@ func TestSNATHostServiceRemotePod(t *testing.T) { udpNat.DstPort = layers.UDPPort(natPort) // created the expected packet after NAT, with recalculated csums - _, _, _, _, resPktBytes, err := testPacket(eth, &ipv4Nat, &udpNat, payload) + _, _, _, _, resPktBytes, err := testPacketV4(eth, &ipv4Nat, &udpNat, payload) Expect(err).NotTo(HaveOccurred()) // expect them to be the same @@ -141,7 +141,7 @@ func TestSNATHostServiceRemotePod(t *testing.T) { // Second packet - conntrack hit ipHdr.Id = 2 - eth, ipv4, _, payload, pktBytes, err = testPacket(nil, ipHdr, nil, nil) + eth, ipv4, _, payload, pktBytes, err = testPacketV4(nil, ipHdr, nil, nil) Expect(err).NotTo(HaveOccurred()) skbMark = 0 @@ -162,7 +162,7 @@ func TestSNATHostServiceRemotePod(t *testing.T) { udpNat.DstPort = layers.UDPPort(natPort) // created the expected packet after NAT, with recalculated csums - _, _, _, _, resPktBytes, err := testPacket(eth, &ipv4Nat, &udpNat, payload) + _, _, _, _, resPktBytes, err := testPacketV4(eth, &ipv4Nat, &udpNat, payload) Expect(err).NotTo(HaveOccurred()) // expect them to be the same @@ -209,7 +209,7 @@ func 
TestSNATHostServiceRemotePod(t *testing.T) { ethResp.SrcMAC, ethResp.DstMAC = ethResp.DstMAC, ethResp.SrcMAC // created the expected packet after NAT, with recalculated csums - _, _, _, _, resPktBytes, err := testPacket(&ethResp, &ipResp, &udpResp, payload) + _, _, _, _, resPktBytes, err := testPacketV4(&ethResp, &ipResp, &udpResp, payload) Expect(err).NotTo(HaveOccurred()) // expect them to be the same diff --git a/felix/bpf/ut/tcp_test.go b/felix/bpf/ut/tcp_test.go index af17d137291..82142f46683 100644 --- a/felix/bpf/ut/tcp_test.go +++ b/felix/bpf/ut/tcp_test.go @@ -44,7 +44,7 @@ func TestTCPRecycleClosedConn(t *testing.T) { DataOffset: 5, } - _, _, _, _, synPkt, err := testPacket(nil, nil, tcpSyn, nil) + _, _, _, _, synPkt, err := testPacketV4(nil, nil, tcpSyn, nil) Expect(err).NotTo(HaveOccurred()) // Insert a reverse route for the source workload. @@ -130,7 +130,7 @@ func TestTCPRecycleClosedConnNAT(t *testing.T) { DataOffset: 5, } - _, ipv4, l4, _, synPkt, err := testPacket(nil, nil, tcpSyn, nil) + _, ipv4, l4, _, synPkt, err := testPacketV4(nil, nil, tcpSyn, nil) Expect(err).NotTo(HaveOccurred()) tcp := l4.(*layers.TCP) diff --git a/felix/bpf/ut/to_host_allowed_test.go b/felix/bpf/ut/to_host_allowed_test.go index 38faf29518a..b7725c212b1 100644 --- a/felix/bpf/ut/to_host_allowed_test.go +++ b/felix/bpf/ut/to_host_allowed_test.go @@ -83,7 +83,7 @@ func TestToHostAllowedCTFull(t *testing.T) { DataOffset: 5, } - _, ipv4, _, _, synPkt, err := testPacket(nil, nil, tcpSyn, nil) + _, ipv4, _, _, synPkt, err := testPacketV4(nil, nil, tcpSyn, nil) Expect(err).NotTo(HaveOccurred()) destCIDR := net.IPNet{ @@ -178,7 +178,7 @@ func TestToHostAllowedCTFull(t *testing.T) { ipv4Ret := *ipv4 ipv4Ret.SrcIP, ipv4Ret.DstIP = ipv4Ret.DstIP, ipv4Ret.SrcIP - _, _, _, _, synAckPkt, err := testPacket(nil, &ipv4Ret, tcpSynAck, nil) + _, _, _, _, synAckPkt, err := testPacketV4(nil, &ipv4Ret, tcpSynAck, nil) Expect(err).NotTo(HaveOccurred()) skbMark = tcdefs.MarkSeen @@ -196,7 +196,7 @@
func TestToHostAllowedCTFull(t *testing.T) { DataOffset: 5, } - _, _, _, _, ackPkt, err := testPacket(nil, nil, tcpAck, nil) + _, _, _, _, ackPkt, err := testPacketV4(nil, nil, tcpAck, nil) Expect(err).NotTo(HaveOccurred()) skbMark = 0 diff --git a/felix/bpf/ut/whitelist_test.go b/felix/bpf/ut/whitelist_test.go index 8f380e6aa2b..e003980b4e6 100644 --- a/felix/bpf/ut/whitelist_test.go +++ b/felix/bpf/ut/whitelist_test.go @@ -240,7 +240,7 @@ func TestAllowFromHostExitHost(t *testing.T) { ipHdr.SrcIP = node1ip ipHdr.DstIP = node2ip - _, ipv4, l4, _, pktBytes, err := testPacket(nil, ipHdr, nil, nil) + _, ipv4, l4, _, pktBytes, err := testPacketV4(nil, ipHdr, nil, nil) Expect(err).NotTo(HaveOccurred()) udp := l4.(*layers.UDP) @@ -308,3 +308,81 @@ func TestAllowFromHostExitHost(t *testing.T) { Expect(ctr.Data().B2A.Approved).To(BeTrue()) }) } + +func TestAllowEnterHostToWorkloadV6(t *testing.T) { + RegisterTestingT(t) + + bpfIfaceName = "HWwl" + defer func() { bpfIfaceName = "" }() + + hop := &layers.IPv6HopByHop{} + hop.NextHeader = layers.IPProtocolUDP + + /* from gopacket ip6_test.go */ + tlv := &layers.IPv6HopByHopOption{} + tlv.OptionType = 0x01 //PadN + tlv.OptionData = []byte{0x00, 0x00, 0x00, 0x00} + hop.Options = append(hop.Options, tlv) + + _, _, l4, _, pktBytes, err := testPacketV6(nil, ipv6Default, nil, nil, hop) + Expect(err).NotTo(HaveOccurred()) + udp := l4.(*layers.UDP) + + resetMap(ctMapV6) // ensure it is clean + + hostIP = node1ip + + // Insert a reverse route for the source workload. 
+ rtKey := routes.NewKeyV6(srcV6CIDR).AsBytes() + rtVal := routes.NewValueV6(routes.FlagsRemoteWorkload | routes.FlagInIPAMPool).AsBytes() + err = rtMapV6.Update(rtKey, rtVal) + Expect(err).NotTo(HaveOccurred()) + rtKey = routes.NewKeyV6(dstV6CIDR).AsBytes() + rtVal = routes.NewValueV6WithIfIndex(routes.FlagsRemoteWorkload|routes.FlagInIPAMPool, 1).AsBytes() + err = rtMapV6.Update(rtKey, rtVal) + Expect(err).NotTo(HaveOccurred()) + defer resetRTMap(rtMapV6) + + dumpRTMapV6(rtMapV6) + + ctKey := conntrack.NewKeyV6(17, /* UDP */ + ipv6Default.SrcIP, uint16(udp.SrcPort), ipv6Default.DstIP, uint16(udp.DstPort)) + + skbMark = 0 + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + Expect(ct).Should(HaveKey(ctKey)) + + ctr := ct[ctKey] + + // Approved by HEP + Expect(ctr.Data().A2B.Approved).To(BeTrue()) + // Not approved by WEP yet + Expect(ctr.Data().B2A.Approved).NotTo(BeTrue()) + }, withIPv6()) + + expectMark(tcdefs.MarkSeen) + + dumpCTMapV6(ctMapV6) + + runBpfTest(t, "calico_to_workload_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + Expect(ct).Should(HaveKey(ctKey)) + + ctr := ct[ctKey] + + // Still approved both by HEP and WEP + Expect(ctr.Data().B2A.Approved).To(BeTrue()) + Expect(ctr.Data().A2B.Approved).To(BeTrue()) + }, withIPv6()) +} diff --git a/felix/bpf/ut/xdp_test.go b/felix/bpf/ut/xdp_test.go index 29a5b34a20c..293dadb82e0 100644 --- a/felix/bpf/ut/xdp_test.go +++ b/felix/bpf/ut/xdp_test.go @@ -241,7 +241,7 @@ func TestXDPPrograms(t *testing.T) { for i, tc := range xdpTestCases { bpfIfaceName = fmt.Sprintf("XDP-%d", i) runBpfTest(t,
"xdp_calico_entrypoint", tc.Rules, func(bpfrun bpfProgRunFn) { - _, _, _, _, pktBytes, err := testPacket(nil, tc.IPv4Header, tc.NextHeader, nil) + _, _, _, _, pktBytes, err := testPacketV4(nil, tc.IPv4Header, tc.NextHeader, nil) Expect(err).NotTo(HaveOccurred()) res, err := bpfrun(pktBytes) Expect(err).NotTo(HaveOccurred()) From 55b82f3eae8c02b139ce87c8419c916613ab1b85 Mon Sep 17 00:00:00 2001 From: Tomas Hruby Date: Tue, 13 Jun 2023 08:27:25 -0700 Subject: [PATCH 4/4] [BPF] IPv6 policy tests --- felix/bpf/state/map.go | 6 ------ felix/bpf/ut/pol_prog_test.go | 37 +++++++++++++++-------------------- 2 files changed, 16 insertions(+), 27 deletions(-) diff --git a/felix/bpf/state/map.go b/felix/bpf/state/map.go index b2262f37e07..e47ccc4117c 100644 --- a/felix/bpf/state/map.go +++ b/felix/bpf/state/map.go @@ -17,8 +17,6 @@ package state import ( "unsafe" - log "github.com/sirupsen/logrus" - "github.com/projectcalico/calico/felix/bpf/maps" ) @@ -122,10 +120,6 @@ type State struct { const expectedSize = 464 func (s *State) AsBytes() []byte { - size := unsafe.Sizeof(State{}) - if size != expectedSize { - log.WithField("size", size).Panic("Incorrect struct size") - } bPtr := (*[expectedSize]byte)(unsafe.Pointer(s)) bytes := make([]byte, expectedSize) copy(bytes, bPtr[:]) diff --git a/felix/bpf/ut/pol_prog_test.go b/felix/bpf/ut/pol_prog_test.go index d782ae8b83e..afc1ec1143f 100644 --- a/felix/bpf/ut/pol_prog_test.go +++ b/felix/bpf/ut/pol_prog_test.go @@ -39,7 +39,7 @@ import ( "github.com/projectcalico/calico/felix/proto" ) -func TestLoadAllowAllProgram(t *testing.T) { +func TestPolicyLoadAllowAllProgram(t *testing.T) { RegisterTestingT(t) b := asm.NewBlock(false) @@ -60,7 +60,7 @@ func TestLoadAllowAllProgram(t *testing.T) { Expect(rc.RC).To(BeNumerically("==", -1)) } -func TestLoadProgramWithMapAccess(t *testing.T) { +func TestPolicyLoadProgramWithMapAccess(t *testing.T) { RegisterTestingT(t) ipsMap := ipsets.Map() @@ -113,7 +113,7 @@ func 
makeRulesSingleTier(protoRules []*proto.Rule) polprog.Rules { } } -func TestLoadKitchenSinkPolicy(t *testing.T) { +func TestPolicyLoadKitchenSinkPolicy(t *testing.T) { RegisterTestingT(t) alloc := idalloc.New() allocID := func(id string) string { @@ -164,7 +164,7 @@ func TestLoadKitchenSinkPolicy(t *testing.T) { Expect(fd.Close()).NotTo(HaveOccurred()) } -func TestLoadGarbageProgram(t *testing.T) { +func TestPolicyLoadGarbageProgram(t *testing.T) { RegisterTestingT(t) var insns asm.Insns @@ -2206,32 +2206,20 @@ func wrap(p polProgramTest) polProgramTestWrapper { return polProgramTestWrapper{p} } -func TestPolicyPrograms(t *testing.T) { +func TestPolicyPolicyPrograms(t *testing.T) { for i, p := range polProgramTests { - if p.ForIPv6 { - // XXX skip for now - continue - } t.Run(fmt.Sprintf("%d:Policy=%s", i, p.PolicyName), func(t *testing.T) { runTest(t, wrap(p)) }) } } -func TestHostPolicyPrograms(t *testing.T) { +func TestPolicyHostPolicyPrograms(t *testing.T) { for i, p := range hostPolProgramTests { - if p.ForIPv6 { - // XXX skip for now - continue - } t.Run(fmt.Sprintf("%d:Policy=%s", i, p.PolicyName), func(t *testing.T) { runTest(t, wrap(p)) }) } } -func TestXDPPolicyPrograms(t *testing.T) { +func TestPolicyXDPPolicyPrograms(t *testing.T) { for i, p := range xdpPolProgramTests { - if p.ForIPv6 { - // XXX skip for now - continue - } t.Run(fmt.Sprintf("%d:Policy=%s", i, p.PolicyName), func(t *testing.T) { runTest(t, wrap(p)) }) } } @@ -2375,7 +2363,7 @@ func ipUintFromString(addrStr string, section int) uint32 { return binary.LittleEndian.Uint32(addrBytes[section*4 : (section+1)*4]) } -func TestIPUintFromString(t *testing.T) { +func TestPolicyIPUintFromString(t *testing.T) { RegisterTestingT(t) Expect(ipUintFromString("10.0.0.1", 0)).To(Equal(uint32(0x0100000a))) Expect(ipUintFromString("10.0.0.1", 1)).To(Equal(uint32(0))) @@ -2423,8 +2411,15 @@ func runTest(t *testing.T, tp testPolicy) { Expect(err).NotTo(HaveOccurred()) // Build the program. 
+ allowIdx := tcdefs.ProgIndexAllowed + denyIdx := tcdefs.ProgIndexDrop + if tp.ForIPv6() { + allowIdx = tcdefs.ProgIndexV6Allowed + denyIdx = tcdefs.ProgIndexV6Drop + } + pg := polprog.NewBuilder(forceAlloc, ipsMap.MapFD(), testStateMap.MapFD(), jumpMap.MapFD(), - polprog.WithAllowDenyJumps(tcdefs.ProgIndexAllowed, tcdefs.ProgIndexDrop)) + polprog.WithAllowDenyJumps(allowIdx, denyIdx)) if tp.ForIPv6() { pg.EnableIPv6Mode() }