diff --git a/felix/bpf-gpl/Makefile b/felix/bpf-gpl/Makefile index ac89c741fa4..1a0d0064d7b 100644 --- a/felix/bpf-gpl/Makefile +++ b/felix/bpf-gpl/Makefile @@ -48,12 +48,14 @@ LD := llc-12 UT_C_FILES:=$(shell find ut -name '*.c') UT_OBJS:=$(UT_C_FILES:.c=.o) $(shell ./list-ut-objs) +UT_OBJS+=ut/ip_parse_test_v6.o OBJS:=$(shell ./list-objs) OBJS+=bin/tc_preamble.o +OBJS+=bin/tc_preamble_v6.o OBJS+=bin/xdp_preamble.o OBJS+=bin/policy_default.o -C_FILES:=tc_preamble.c tc.c tc6.c connect_balancer.c connect_balancer_v6.c xdp_preamble.c xdp.c policy_default.c +C_FILES:=tc_preamble.c tc.c connect_balancer.c connect_balancer_v6.c xdp_preamble.c xdp.c policy_default.c all: $(OBJS) ut-objs: $(UT_OBJS) @@ -72,12 +74,15 @@ UT_CFLAGS=\ -I . # Mini-UT programs that test one or two functions. These are each in their own files. -ut/%.ll: ut/%.c ut/ut.h tc.c tc.d +ut/%.ll: ut/%.c ut/ut.h $(CC) $(UT_CFLAGS) $(CFLAGS) -c $< -o $@ tc_preamble.ll: tc_preamble.c tc_preamble.d $(CC) $(CFLAGS) -c $< -o $@ +tc_preamble_v6.ll: tc_preamble.c tc_preamble.d + $(CC) $(CFLAGS) -DIPVER6 -c $< -o $@ + xdp_preamble.ll: xdp_preamble.c xdp_preamble.d $(CC) $(CFLAGS) -DCALI_COMPILE_FLAGS=64 -c $< -o $@ @@ -90,14 +95,14 @@ to%.ll: tc.c tc.d calculate-flags $(COMPILE) from%.ll: tc.c tc.d calculate-flags $(COMPILE) -#to%_v6.ll: tc6.c tc.d calculate-flags -# $(COMPILE) -#from%_v6.ll: tc6.c tc.d calculate-flags -# $(COMPILE) +to%_v6.ll: tc.c tc.d calculate-flags + $(COMPILE) +from%_v6.ll: tc.c tc.d calculate-flags + $(COMPILE) test%.ll: tc.c tc.d calculate-flags $(COMPILE) -#test%_v6.ll: tc6.c tc.d calculate-flags -# $(COMPILE) +test%_v6.ll: tc.c tc.d calculate-flags + $(COMPILE) xdp%.ll: xdp.c xdp.d calculate-flags $(COMPILE) test_xdp%.ll: xdp.c xdp.d calculate-flags @@ -106,6 +111,8 @@ test_xdp%.ll: xdp.c xdp.d calculate-flags LINK=$(LD) -march=bpf -filetype=obj -o $@ $< bin/tc_preamble.o: tc_preamble.ll | bin $(LINK) +bin/tc_preamble_v6.o: tc_preamble_v6.ll | bin + $(LINK) bin/xdp_preamble.o: 
xdp_preamble.ll | bin $(LINK) bin/policy_default.o: policy_default.ll | bin @@ -128,6 +135,10 @@ bin/connect_time_%v6_co-re.o: connect_time_%v6.ll | bin $(LINK) ut/%.o: ut/%.ll $(LINK) +ut/ip_parse_test_v6.ll: ut/ip_parse_test.c + $(CC) $(UT_CFLAGS) $(CFLAGS) -DIPVER6 -c $< -o $@ +ut/ip_parse_test_v6.o: ut/ip_parse_test_v6.ll + $(LINK) bin: mkdir -p bin diff --git a/felix/bpf-gpl/arp.h b/felix/bpf-gpl/arp.h index 3febcb43055..9bd86113c18 100644 --- a/felix/bpf-gpl/arp.h +++ b/felix/bpf-gpl/arp.h @@ -5,8 +5,10 @@ #ifndef __CALI_ARP_H__ #define __CALI_ARP_H__ +#include "ip_addr.h" + struct arp_key { - __u32 ip; + ipv46_addr_t ip; __u32 ifindex; }; @@ -15,6 +17,10 @@ struct arp_value { char mac_dst[6]; }; -CALI_MAP(cali_v4_arp, 2, BPF_MAP_TYPE_LRU_HASH, struct arp_key, struct arp_value, 10000, 0) +#ifdef IPVER6 +CALI_MAP_NAMED(cali_v6_arp, cali_arp, 2, BPF_MAP_TYPE_LRU_HASH, struct arp_key, struct arp_value, 10000, 0) +#else +CALI_MAP_NAMED(cali_v4_arp, cali_arp, 2, BPF_MAP_TYPE_LRU_HASH, struct arp_key, struct arp_value, 10000, 0) +#endif #endif /* __CALI_ARP_H__ */ diff --git a/felix/bpf-gpl/bpf.h b/felix/bpf-gpl/bpf.h index e2016733e89..bca62b6a1b2 100644 --- a/felix/bpf-gpl/bpf.h +++ b/felix/bpf-gpl/bpf.h @@ -12,10 +12,14 @@ #include #include #include -#include "globals.h" +/* CALI_BPF_INLINE must be defined before we include any of our headers. They + * assume it exists! + */ #define CALI_BPF_INLINE inline __attribute__((always_inline)) +#include "globals.h" + #define BPF_REDIR_EGRESS 0 #define BPF_REDIR_INGRESS 1 @@ -98,7 +102,12 @@ #define CALI_FIB_LOOKUP_ENABLED true #endif +#ifdef IPVER6 +#undef CALI_FIB_LOOKUP_ENABLED +#define CALI_FIB_LOOKUP_ENABLED false +#else #define CALI_FIB_ENABLED (!CALI_F_L3 && CALI_FIB_LOOKUP_ENABLED && (CALI_F_TO_HOST || CALI_F_TO_HEP)) +#endif #define COMPILE_TIME_ASSERT(expr) {typedef char array[(expr) ? 
1 : -1];} static CALI_BPF_INLINE void __compile_asserts(void) { @@ -215,8 +224,18 @@ static CALI_BPF_INLINE __attribute__((noreturn)) void bpf_exit(int rc) { } #pragma clang diagnostic pop +#ifdef IPVER6 + +#define debug_ip(ip) (bpf_htonl((ip).d)) +#define ip_is_dnf(ip) (true) + +#else + +#define debug_ip(ip) bpf_htonl(ip) + #define ip_is_dnf(ip) ((ip)->frag_off & bpf_htons(0x4000)) #define ip_frag_no(ip) ((ip)->frag_off & bpf_htons(0x1fff)) +#endif static CALI_BPF_INLINE void ip_dec_ttl(struct iphdr *ip) { @@ -229,7 +248,11 @@ static CALI_BPF_INLINE void ip_dec_ttl(struct iphdr *ip) ip->check = (__be16) (sum + (sum >> 16)); } +#ifdef IPVER6 +#define ip_ttl_exceeded(ip) (CALI_F_TO_HOST && !CALI_F_TUNNEL && (ip)->hop_limit <= 1) +#else #define ip_ttl_exceeded(ip) (CALI_F_TO_HOST && !CALI_F_TUNNEL && (ip)->ttl <= 1) +#endif #if CALI_F_XDP @@ -275,25 +298,25 @@ CALI_PATCH_DEFINE(__skb_mark, 0x4d424b53) /* be 0x4d424b53 = ASCII(SKBM) */ #define map_symbol(name, ver) name##ver -#define MAP_LOOKUP_FN(name, ver) \ -static CALI_BPF_INLINE void * name##_lookup_elem(const void* key) \ +#define MAP_LOOKUP_FN(fname, name, ver) \ +static CALI_BPF_INLINE void * fname##_lookup_elem(const void* key) \ { \ return bpf_map_lookup_elem(&map_symbol(name, ver), key); \ } -#define MAP_UPDATE_FN(name, ver) \ -static CALI_BPF_INLINE int name##_update_elem(const void* key, const void* value, __u64 flags)\ +#define MAP_UPDATE_FN(fname, name, ver) \ +static CALI_BPF_INLINE int fname##_update_elem(const void* key, const void* value, __u64 flags)\ { \ return bpf_map_update_elem(&map_symbol(name, ver), key, value, flags); \ } -#define MAP_DELETE_FN(name, ver) \ -static CALI_BPF_INLINE int name##_delete_elem(const void* key) \ +#define MAP_DELETE_FN(fname, name, ver) \ +static CALI_BPF_INLINE int fname##_delete_elem(const void* key) \ { \ return bpf_map_delete_elem(&map_symbol(name, ver), key); \ } -#define CALI_MAP(name, ver, map_type, key_type, val_type, size, flags) \ +#define 
CALI_MAP_NAMED(name, fname, ver, map_type, key_type, val_type, size, flags) \ struct { \ __uint(type, map_type); \ __type(key, key_type); \ @@ -301,9 +324,12 @@ struct { \ __uint(max_entries, size); \ __uint(map_flags, flags); \ }map_symbol(name, ver) SEC(".maps"); \ - MAP_LOOKUP_FN(name, ver) \ - MAP_UPDATE_FN(name, ver) \ - MAP_DELETE_FN(name, ver) + MAP_LOOKUP_FN(fname, name, ver) \ + MAP_UPDATE_FN(fname, name, ver) \ + MAP_DELETE_FN(fname, name, ver) + +#define CALI_MAP(name, ver, map_type, key_type, val_type, size, flags) \ + CALI_MAP_NAMED(name, name, ver, map_type, key_type, val_type, size, flags) #define CALI_MAP_V1(name, map_type, key_type, val_type, size, flags) \ CALI_MAP(name,, map_type, key_type, val_type, size, flags) diff --git a/felix/bpf-gpl/calculate-flags b/felix/bpf-gpl/calculate-flags index 7574ace37f5..9ad456d156a 100755 --- a/felix/bpf-gpl/calculate-flags +++ b/felix/bpf-gpl/calculate-flags @@ -37,7 +37,7 @@ if [[ "${filename}" =~ test_.* ]]; then args+=("-DUNITTEST") fi -if [[ "${filename}" =~ .*_v6.o ]]; then +if [[ "${filename}" =~ .*_v6.ll ]]; then args+=("-DIPVER6") fi diff --git a/felix/bpf-gpl/connect.h b/felix/bpf-gpl/connect.h index 854820aed2c..daed9825738 100644 --- a/felix/bpf-gpl/connect.h +++ b/felix/bpf-gpl/connect.h @@ -10,7 +10,7 @@ #include "bpf.h" #include "nat_lookup.h" -static CALI_BPF_INLINE int do_nat_common(struct bpf_sock_addr *ctx, __u8 proto, __be32 *dst, bool connect) +static CALI_BPF_INLINE int do_nat_common(struct bpf_sock_addr *ctx, __u8 proto, ipv46_addr_t *dst, bool connect) { int err = 0; /* We do not know what the source address is yet, we only know that it @@ -24,7 +24,8 @@ static CALI_BPF_INLINE int do_nat_common(struct bpf_sock_addr *ctx, __u8 proto, nat_lookup_result res = NAT_LOOKUP_ALLOW; __u16 dport_he = (__u16)(bpf_ntohl(ctx->user_port)>>16); struct calico_nat_dest *nat_dest; - nat_dest = calico_v4_nat_lookup(0, *dst, proto, dport_he, false, &res, + ipv46_addr_t voidip = VOID_IP; + nat_dest = 
calico_nat_lookup(&voidip, dst, proto, dport_he, false, &res, proto == IPPROTO_UDP && !connect ? CTLB_UDP_NOT_SEEN_TIMEO : 0, /* enforce affinity UDP */ proto == IPPROTO_UDP && !connect /* update affinity timer */); if (!nat_dest) { @@ -49,11 +50,11 @@ static CALI_BPF_INLINE int do_nat_common(struct bpf_sock_addr *ctx, __u8 proto, .port = dport_be, .proto = proto, }; - struct sendrecv4_val val = { + struct sendrec_val val = { .ip = *dst, .port = ctx->user_port, }; - int rc = cali_v4_ct_nats_update_elem(&natk, &val, 0); + int rc = cali_ct_nats_update_elem(&natk, &val, 0); if (rc) { /* if this happens things are really bad! report */ CALI_INFO("Failed to update ct_nats map rc=%d\n", rc); @@ -65,13 +66,13 @@ static CALI_BPF_INLINE int do_nat_common(struct bpf_sock_addr *ctx, __u8 proto, __u64 cookie = bpf_get_socket_cookie(ctx); CALI_DEBUG("Store: ip=%x port=%d cookie=%x\n", bpf_ntohl(nat_dest->addr), bpf_ntohs((__u16)dport_be), cookie); - struct sendrecv4_key key = { + struct sendrec_key key = { .ip = nat_dest->addr, .port = dport_be, .cookie = cookie, }; - if (cali_v4_srmsg_update_elem(&key, &val, 0)) { + if (cali_srmsg_update_elem(&key, &val, 0)) { /* if this happens things are really bad! 
report */ CALI_INFO("Failed to update map\n"); goto out; @@ -85,7 +86,7 @@ static CALI_BPF_INLINE int do_nat_common(struct bpf_sock_addr *ctx, __u8 proto, return err; } -static CALI_BPF_INLINE int connect_v4(struct bpf_sock_addr *ctx, __be32 *dst) +static CALI_BPF_INLINE int connect_v4(struct bpf_sock_addr *ctx, ipv46_addr_t *dst) { int ret = 1; /* OK value */ diff --git a/felix/bpf-gpl/connect_balancer.c b/felix/bpf-gpl/connect_balancer.c index 849a41d6f32..93e26c3d46b 100644 --- a/felix/bpf-gpl/connect_balancer.c +++ b/felix/bpf-gpl/connect_balancer.c @@ -65,13 +65,13 @@ int calico_recvmsg_v4(struct bpf_sock_addr *ctx) __u64 cookie = bpf_get_socket_cookie(ctx); CALI_DEBUG("Lookup: ip=%x port=%d(BE) cookie=%x\n",ctx->user_ip4, ctx->user_port, cookie); - struct sendrecv4_key key = { + struct sendrec_key key = { .ip = ctx->user_ip4, .port = ctx->user_port, .cookie = cookie, }; - struct sendrecv4_val *revnat = cali_v4_srmsg_lookup_elem(&key); + struct sendrec_val *revnat = cali_srmsg_lookup_elem(&key); if (revnat == NULL) { CALI_DEBUG("revnat miss for %x:%d\n", diff --git a/felix/bpf-gpl/connect_balancer_v6.c b/felix/bpf-gpl/connect_balancer_v6.c index f197173e7a3..e3d2e51f1d5 100644 --- a/felix/bpf-gpl/connect_balancer_v6.c +++ b/felix/bpf-gpl/connect_balancer_v6.c @@ -2,6 +2,8 @@ // Copyright (c) 2020-2022 Tigera, Inc. All rights reserved. 
// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later +#undef IPVER6 /* XXX */ + #include // socket_type.h contains the definition of SOCK_XXX constants that we need @@ -98,13 +100,13 @@ int calico_recvmsg_v6(struct bpf_sock_addr *ctx) goto out; } - struct sendrecv4_key key = { + struct sendrec_key key = { .ip = ipv4, .port = ctx->user_port, .cookie = bpf_get_socket_cookie(ctx), }; - struct sendrecv4_val *revnat = cali_v4_srmsg_lookup_elem(&key); + struct sendrec_val *revnat = cali_srmsg_lookup_elem(&key); if (revnat == NULL) { CALI_DEBUG("revnat miss for %x:%d\n", diff --git a/felix/bpf-gpl/conntrack.h b/felix/bpf-gpl/conntrack.h index cb71127538d..542134f6baf 100644 --- a/felix/bpf-gpl/conntrack.h +++ b/felix/bpf-gpl/conntrack.h @@ -21,40 +21,58 @@ static CALI_BPF_INLINE int psnat_get_port(struct cali_tc_ctx *ctx) return PSNAT_START + (bpf_get_prandom_u32() % PSNAT_LEN); } +#ifdef IPVER6 + +static CALI_BPF_INLINE bool src_lt_dest(ipv6_addr_t ip_src, ipv6_addr_t ip_dst, __u16 sport, __u16 dport) +{ + int ret = ipv6_addr_t_cmp(ip_src, ip_dst); + + if (ret != 0) { + return ret < 0; + } + + return sport < dport; +} + +#else + #define src_lt_dest(ip_src, ip_dst, sport, dport) \ ((ip_src) < (ip_dst)) || (((ip_src) == (ip_dst)) && (sport) < (dport)) -#define __ct_make_key(proto, ipa, ipb, porta, portb) \ - (struct calico_ct_key) { \ - .protocol = proto, \ - .addr_a = ipa, .port_a = porta, \ - .addr_b = ipb, .port_b = portb, \ - } +#endif /* IPVER6 */ -#define ct_make_key(sltd, p, ipa, ipb, pta, ptb) ({ \ - struct calico_ct_key k; \ - k = sltd ? 
__ct_make_key(p, ipa, ipb, pta, ptb) : __ct_make_key(p, ipb, ipa, ptb, pta); \ - dump_ct_key(ctx, &k); \ - k; \ -}) +static CALI_BPF_INLINE void fill_ct_key(struct calico_ct_key *k, bool sltd, __u8 proto, + ipv46_addr_t *ipa, ipv46_addr_t *ipb, __u16 pta, __u16 ptb) +{ + k->protocol = proto; + + if (sltd) { + k->addr_a = *ipa; + k->addr_b = *ipb; + k->port_a = pta; + k->port_b = ptb; + } else { + k->addr_a = *ipb; + k->addr_b = *ipa; + k->port_a = ptb; + k->port_b = pta; + } +} #define ct_result_np_node(res) ((res).flags & CALI_CT_FLAG_NP_FWD) static CALI_BPF_INLINE void dump_ct_key(struct cali_tc_ctx *ctx, struct calico_ct_key *k) { - CALI_VERB("CT-ALL key A=%x:%d proto=%d\n", bpf_ntohl(k->addr_a), k->port_a, (int)k->protocol); - CALI_VERB("CT-ALL key B=%x:%d size=%d\n", bpf_ntohl(k->addr_b), k->port_b, (int)sizeof(struct calico_ct_key)); + CALI_VERB("CT-ALL key A=%x:%d proto=%d\n", debug_ip(k->addr_a), k->port_a, (int)k->protocol); + CALI_VERB("CT-ALL key B=%x:%d size=%d\n", debug_ip(k->addr_b), k->port_b, (int)sizeof(struct calico_ct_key)); } static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, struct ct_create_ctx *ct_ctx, struct calico_ct_key *k) { - __be32 ip_src = ct_ctx->src; - __be32 ip_dst = ct_ctx->dst; __u16 sport = ct_ctx->sport; __u16 dport = ct_ctx->dport; - __be32 orig_dst = ct_ctx->orig_dst; __u16 orig_dport = ct_ctx->orig_dport; int err = 0; @@ -63,22 +81,22 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, bool syn = false; __u64 now; - if (ct_ctx->tcp) { - seq = ct_ctx->tcp->seq; - syn = ct_ctx->tcp->syn; + if (ct_ctx->proto == IPPROTO_TCP) { + seq = tcp_hdr(ctx)->seq; + syn = tcp_hdr(ctx)->syn; } - CALI_DEBUG("CT-ALL packet mark is: 0x%x\n", ct_ctx->skb->mark); - if (skb_seen(ct_ctx->skb)) { + CALI_DEBUG("CT-ALL packet mark is: 0x%x\n", ctx->skb->mark); + if (skb_seen(ctx->skb)) { /* Packet already marked as being from another workload, which will * have created a conntrack entry. 
Look that one up instead of * creating one. */ - CALI_DEBUG("CT-ALL Asked to create entry but packet is marked as " + CALI_VERB("CT-ALL Asked to create entry but packet is marked as " "from another endpoint, doing lookup\n"); - bool srcLTDest = src_lt_dest(ip_src, ip_dst, sport, dport); - *k = ct_make_key(srcLTDest, ct_ctx->proto, ip_src, ip_dst, sport, dport); - struct calico_ct_value *ct_value = cali_v4_ct_lookup_elem(k); + bool srcLTDest = src_lt_dest(ct_ctx->src, ct_ctx->dst, sport, dport); + fill_ct_key(k, srcLTDest, ct_ctx->proto, &ct_ctx->src, &ct_ctx->dst, sport, dport); + struct calico_ct_value *ct_value = cali_ct_lookup_elem(k); if (!ct_value) { CALI_VERB("CT Packet marked as from workload but got a conntrack miss!\n"); goto create; @@ -115,7 +133,7 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, .created=now, .last_seen=now, .type = ct_ctx->type, - .orig_ip = orig_dst, + .orig_ip = ct_ctx->orig_dst, .orig_port = orig_dport, }; @@ -124,42 +142,33 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, ct_value.orig_sip = ct_ctx->orig_src; ct_value.orig_sport = ct_ctx->orig_sport; - CALI_DEBUG("CT-ALL SNAT orig %x:%d\n", bpf_htonl(ct_ctx->orig_src), ct_ctx->orig_sport); + CALI_DEBUG("CT-ALL SNAT orig %x:%d\n", debug_ip(ct_ctx->orig_src), ct_ctx->orig_sport); - if (ct_ctx->type == CALI_CT_TYPE_NAT_REV && ct_ctx->tun_ip) { + if (ct_ctx->type == CALI_CT_TYPE_NAT_REV && !ip_void(ct_ctx->tun_ip)) { if (ct_ctx->flags & CALI_CT_FLAG_NP_FWD) { - CALI_DEBUG("CT-ALL nat tunneled to %x\n", bpf_ntohl(ct_ctx->tun_ip)); + CALI_DEBUG("CT-ALL nat tunneled to %x\n", debug_ip(ct_ctx->tun_ip)); } else { - struct cali_rt *rt = cali_rt_lookup(ct_ctx->tun_ip); + struct cali_rt *rt = cali_rt_lookup(&ct_ctx->tun_ip); if (!rt || !cali_rt_is_host(rt)) { - CALI_DEBUG("CT-ALL nat tunnel IP not a host %x\n", bpf_ntohl(ct_ctx->tun_ip)); + CALI_DEBUG("CT-ALL nat tunnel IP not a host %x\n", debug_ip(ct_ctx->tun_ip)); err = 
-1; goto out; } - CALI_DEBUG("CT-ALL nat tunneled from %x\n", bpf_ntohl(ct_ctx->tun_ip)); + CALI_DEBUG("CT-ALL nat tunneled from %x\n", debug_ip(ct_ctx->tun_ip)); } ct_value.tun_ip = ct_ctx->tun_ip; } struct calico_ct_leg *src_to_dst, *dst_to_src; - bool srcLTDest = (ip_src < ip_dst) || ((ip_src == ip_dst) && sport < dport); + bool srcLTDest = src_lt_dest(ct_ctx->src, ct_ctx->dst, sport, dport); + fill_ct_key(k, srcLTDest, ct_ctx->proto, &ct_ctx->src, &ct_ctx->dst, sport, dport); if (srcLTDest) { - *k = (struct calico_ct_key) { - .protocol = ct_ctx->proto, - .addr_a = ip_src, .port_a = sport, - .addr_b = ip_dst, .port_b = dport, - }; CALI_VERB("CT-ALL src_to_dst A->B\n"); src_to_dst = &ct_value.a_to_b; dst_to_src = &ct_value.b_to_a; } else { - *k = (struct calico_ct_key) { - .protocol = ct_ctx->proto, - .addr_a = ip_dst, .port_a = dport, - .addr_b = ip_src, .port_b = sport, - }; CALI_VERB("CT-ALL src_to_dst B->A\n"); src_to_dst = &ct_value.b_to_a; dst_to_src = &ct_value.a_to_b; @@ -172,7 +181,7 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, src_to_dst->syn_seen = syn; src_to_dst->opener = 1; if (CALI_F_TO_HOST) { - src_to_dst->ifindex = skb_ingress_ifindex(ct_ctx->skb); + src_to_dst->ifindex = skb_ingress_ifindex(ctx->skb); } else { src_to_dst->ifindex = CT_INVALID_IFINDEX; } @@ -182,7 +191,6 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, if (CALI_F_FROM_WEP) { /* src is the from the WEP, policy approved this side */ src_to_dst->approved = 1; - CALI_DEBUG("CT-ALL approved source side - from WEP\n"); } else if (CALI_F_FROM_HEP) { /* src is the from the HEP, policy approved this side */ src_to_dst->approved = 1; @@ -196,7 +204,7 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, } CALI_DEBUG("CT-ALL approved source side - from HEP tun allow_return=%d\n", ct_ctx->allow_return); - } else if (CALI_F_TO_HEP && !skb_seen(ct_ctx->skb) && (ct_ctx->type == 
CALI_CT_TYPE_NAT_REV)) { + } else if (CALI_F_TO_HEP && !skb_seen(ctx->skb) && (ct_ctx->type == CALI_CT_TYPE_NAT_REV)) { src_to_dst->approved = 1; dst_to_src->approved = 1; CALI_DEBUG("CT-ALL approved both due to host source port conflict resolution.\n"); @@ -213,28 +221,28 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, } } - err = cali_v4_ct_update_elem(k, &ct_value, BPF_NOEXIST); + err = cali_ct_update_elem(k, &ct_value, BPF_NOEXIST); if (CALI_F_HEP && err == -17 /* EEXIST */) { int i; - CALI_DEBUG("Source collision for 0x%x:%d\n", bpf_htonl(ip_src), sport); + CALI_DEBUG("Source collision for 0x%x:%d\n", debug_ip(ct_ctx->src), sport); ct_value.orig_sport = sport; - bool src_lt_dst = ip_src < ip_dst; + bool src_lt_dst = ip_lt(ct_ctx->src, ct_ctx->dst); for (i = 0; i < PSNAT_RETRIES; i++) { sport = psnat_get_port(ctx); CALI_DEBUG("New sport %d\n", sport); - if (ip_src == ip_dst) { + if (ip_equal(ct_ctx->src, ct_ctx->dst)) { src_lt_dst = sport < dport; } - *k = ct_make_key(src_lt_dst, ct_ctx->proto, ip_src, ip_dst, sport, dport); + fill_ct_key(k, src_lt_dst, ct_ctx->proto, &ct_ctx->src, &ct_ctx->dst, sport, dport); - if (!(err = cali_v4_ct_update_elem(k, &ct_value, BPF_NOEXIST))) { + if (!(err = cali_ct_update_elem(k, &ct_value, BPF_NOEXIST))) { ct_ctx->sport = sport; break; } @@ -242,7 +250,7 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, if (i == PSNAT_RETRIES) { CALI_INFO("Source collision unresolved 0x%x:%d\n", - bpf_htonl(ip_src), ct_value.orig_sport); + debug_ip(ct_ctx->src), ct_value.orig_sport); err = -17; /* EEXIST */ } } @@ -252,18 +260,17 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct cali_tc_ctx *ctx, return err; } -static CALI_BPF_INLINE int calico_ct_v4_create_nat_fwd(struct cali_tc_ctx *ctx, - struct ct_create_ctx *ct_ctx, - struct calico_ct_key *rk) +static CALI_BPF_INLINE int calico_ct_create_nat_fwd(struct cali_tc_ctx *ctx, + struct ct_create_ctx *ct_ctx, + 
struct calico_ct_key *rk) { - __u8 ip_proto = ct_ctx->proto; - __be32 ip_src = ct_ctx->orig_src; - __be32 ip_dst = ct_ctx->orig_dst; + ipv46_addr_t ip_src = ct_ctx->orig_src; + ipv46_addr_t ip_dst = ct_ctx->orig_dst; __u16 sport = ct_ctx->orig_sport; __u16 dport = ct_ctx->orig_dport; if (CALI_F_TO_HEP && !CALI_F_NAT_IF && sport != ct_ctx->sport && - !(ct_ctx->skb->mark & (CALI_SKB_MARK_FROM_NAT_IFACE_OUT | CALI_SKB_MARK_SEEN))) { + !(ctx->skb->mark & (CALI_SKB_MARK_FROM_NAT_IFACE_OUT | CALI_SKB_MARK_SEEN))) { /* This entry is being created because we have a source port * conflict on a connection from host. We did psnat so we mark * such an entry with a 0 sport. @@ -274,24 +281,30 @@ static CALI_BPF_INLINE int calico_ct_v4_create_nat_fwd(struct cali_tc_ctx *ctx, __u64 now = bpf_ktime_get_ns(); - CALI_DEBUG("CT-%d Creating FWD entry at %llu.\n", ip_proto, now); - CALI_DEBUG("FWD %x -> %x\n", bpf_ntohl(ip_src), bpf_ntohl(ip_dst)); + CALI_DEBUG("CT-%d Creating FWD entry at %llu.\n", ct_ctx->proto, now); + CALI_DEBUG("FWD %x -> %x\n", debug_ip(ip_src), debug_ip(ip_dst)); struct calico_ct_value ct_value = { .type = CALI_CT_TYPE_NAT_FWD, .last_seen = now, .created = now, }; - struct calico_ct_key k; + ct_value.nat_rev_key = *rk; + + /* We do not need rk anymore, we can reuse it for the new key. + * + * N.B. calico_ct_create_nat_fwd() is called _after_ calico_ct_v4_create_tracking() + * which also uses the rk!
+ */ + struct calico_ct_key *k = rk; bool srcLTDest = src_lt_dest(ip_src, ip_dst, sport, dport); - k = ct_make_key(srcLTDest, ct_ctx->proto, ip_src, ip_dst, sport, dport); + fill_ct_key(k, srcLTDest, ct_ctx->proto, &ip_src, &ip_dst, sport, dport); - ct_value.nat_rev_key = *rk; if (ct_ctx->orig_sport != ct_ctx->sport) { ct_value.nat_sport = ct_ctx->sport; } - int err = cali_v4_ct_update_elem(&k, &ct_value, 0); - CALI_VERB("CT-%d Create result: %d.\n", ip_proto, err); + int err = cali_ct_update_elem(k, &ct_value, 0); + CALI_VERB("CT-%d Create result: %d.\n", ctx->state->ip_proto, err); return err; } @@ -302,6 +315,9 @@ static CALI_BPF_INLINE int calico_ct_v4_create_nat_fwd(struct cali_tc_ctx *ctx, */ static CALI_BPF_INLINE bool skb_icmp_err_unpack(struct cali_tc_ctx *ctx, struct ct_lookup_ctx *ct_ctx) { +#ifdef IPVER6 + return false; +#else /* ICMP packet is an error, its payload should contain the full IP header and * at least the first 8 bytes of the next header. */ @@ -377,6 +393,7 @@ static CALI_BPF_INLINE bool skb_icmp_err_unpack(struct cali_tc_ctx *ctx, struct }; return true; +#endif /* IPVER6 */ } #define CALI_CT_LOG(level, fmt, ...) \ @@ -455,12 +472,14 @@ static CALI_BPF_INLINE bool tcp_recycled(bool syn, struct calico_ct_value *v) static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_tc_ctx *ctx) { + __u8 proto = ctx->state->ip_proto; + // TODO: refactor the conntrack code to simply use the ctx instead of its own. This // code is a direct translation of the pre-ctx code so it has some duplication (but it // needs a bit more analysis to sort out because the ct_ctx gets modified in place in // ways that might not make sense to expose through the ctx. 
struct ct_lookup_ctx ct_lookup_ctx = { - .proto = ctx->state->ip_proto, + .proto = proto, .src = ctx->state->ip_src, .sport = ctx->state->sport, .dst = ctx->state->ip_dst, @@ -468,7 +487,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t }; struct ct_lookup_ctx *ct_ctx = &ct_lookup_ctx; - switch (ctx->state->ip_proto) { + switch (proto) { case IPPROTO_TCP: if (skb_refresh_validate_ptrs(ctx, TCP_SIZE)) { deny_reason(ctx, CALI_REASON_SHORT); @@ -485,15 +504,15 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t } __u8 proto_orig = ct_ctx->proto; - __be32 ip_src = ct_ctx->src; - __be32 ip_dst = ct_ctx->dst; + ipv46_addr_t ip_src = ct_ctx->src; + ipv46_addr_t ip_dst = ct_ctx->dst; __u16 sport = ct_ctx->sport; __u16 dport = ct_ctx->dport; - struct tcphdr *tcp_header = ct_ctx->tcp; + struct tcphdr *tcp_header = proto == IPPROTO_TCP ? tcp_hdr(ctx) : NULL; bool related = false; - CALI_CT_DEBUG("lookup from %x:%d\n", bpf_ntohl(ip_src), sport); - CALI_CT_DEBUG("lookup to %x:%d\n", bpf_ntohl(ip_dst), dport); + CALI_CT_DEBUG("lookup from %x:%d\n", debug_ip(ip_src), sport); + CALI_CT_DEBUG("lookup to %x:%d\n", debug_ip(ip_dst), dport); if (tcp_header) { CALI_CT_VERB("packet seq = %u\n", bpf_ntohl(tcp_header->seq)); CALI_CT_VERB("packet ack_seq = %u\n", bpf_ntohl(tcp_header->ack_seq)); @@ -509,10 +528,12 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t }; bool srcLTDest = src_lt_dest(ip_src, ip_dst, sport, dport); - struct calico_ct_key k = ct_make_key(srcLTDest, ct_ctx->proto, ip_src, ip_dst, sport, dport); + struct calico_ct_key k; bool syn = tcp_header && tcp_header->syn && !tcp_header->ack; - struct calico_ct_value *v = cali_v4_ct_lookup_elem(&k); + fill_ct_key(&k, srcLTDest, ct_ctx->proto, &ip_src, &ip_dst, sport, dport); + + struct calico_ct_value *v = cali_ct_lookup_elem(&k); if (!v) { if (syn) { // SYN packet (new flow); send it to policy. 
@@ -563,12 +584,12 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t // skb_icmp_err_unpack updates the ct_ctx with the details of the inner packet; // look for a conntrack entry for the inner packet... - CALI_CT_DEBUG("related lookup from %x:%d\n", bpf_ntohl(ct_ctx->src), ct_ctx->sport); - CALI_CT_DEBUG("related lookup to %x:%d\n", bpf_ntohl(ct_ctx->dst), ct_ctx->dport); + CALI_CT_DEBUG("related lookup from %x:%d\n", debug_ip(ct_ctx->src), ct_ctx->sport); + CALI_CT_DEBUG("related lookup to %x:%d\n", debug_ip(ct_ctx->dst), ct_ctx->dport); srcLTDest = src_lt_dest(ct_ctx->src, ct_ctx->dst, ct_ctx->sport, ct_ctx->dport); - k = ct_make_key(srcLTDest, ct_ctx->proto, ct_ctx->src, ct_ctx->dst, ct_ctx->sport, ct_ctx->dport); - v = cali_v4_ct_lookup_elem(&k); + fill_ct_key(&k, srcLTDest, ct_ctx->proto, &ct_ctx->src, &ct_ctx->dst, ct_ctx->sport, ct_ctx->dport); + v = cali_ct_lookup_elem(&k); if (!v) { if (CALI_F_FROM_HOST && ct_ctx->proto == IPPROTO_TCP && @@ -597,7 +618,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t ip_dst = ct_ctx->dst; sport = ct_ctx->sport; dport = ct_ctx->dport; - tcp_header = ct_ctx->tcp; + tcp_header = proto == IPPROTO_TCP ? tcp_hdr(ctx) : NULL; related = true; @@ -626,15 +647,15 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t // This is a forward NAT entry; since we do the bookkeeping on the // reverse entry, we need to do a second lookup. CALI_CT_DEBUG("Hit! 
NAT FWD entry, doing secondary lookup.\n"); - tracking_v = cali_v4_ct_lookup_elem(&v->nat_rev_key); + tracking_v = cali_ct_lookup_elem(&v->nat_rev_key); if (!tracking_v) { CALI_CT_DEBUG("Miss when looking for secondary entry.\n"); goto out_lookup_fail; } if (tcp_recycled(syn, tracking_v)) { CALI_CT_DEBUG("TCP SYN recycles entry, NEW flow.\n"); - cali_v4_ct_delete_elem(&k); - cali_v4_ct_delete_elem(&v->nat_rev_key); + cali_ct_delete_elem(&k); + cali_ct_delete_elem(&v->nat_rev_key); goto out_lookup_fail; } @@ -668,7 +689,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t } result.tun_ip = tracking_v->tun_ip; - CALI_CT_DEBUG("fwd tun_ip:%x\n", bpf_ntohl(tracking_v->tun_ip)); + CALI_CT_DEBUG("fwd tun_ip:%x\n", debug_ip(tracking_v->tun_ip)); // flags are in the tracking entry result.flags = ct_value_get_flags(tracking_v); CALI_CT_DEBUG("result.flags 0x%x\n", result.flags); @@ -690,9 +711,10 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t /* If we are on a HEP - where encap/decap can happen - and if the packet * arrived through a tunnel, check if the src IP of the packet is expected. 
*/ - if (CALI_F_FROM_HEP && ctx->state->tun_ip && result.tun_ip && result.tun_ip != ctx->state->tun_ip) { + if (CALI_F_FROM_HEP && !ip_void(ctx->state->tun_ip) && !ip_void(result.tun_ip) && + !ip_equal(result.tun_ip, ctx->state->tun_ip)) { CALI_CT_DEBUG("tunnel src changed from %x to %x\n", - bpf_ntohl(result.tun_ip), bpf_ntohl(ctx->state->tun_ip)); + debug_ip(result.tun_ip), debug_ip(ctx->state->tun_ip)); ct_result_set_flag(result.rc, CT_RES_TUN_SRC_CHANGED); } @@ -715,7 +737,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t } result.tun_ip = v->tun_ip; - CALI_CT_DEBUG("tun_ip:%x\n", bpf_ntohl(v->tun_ip)); + CALI_CT_DEBUG("tun_ip:%x\n", debug_ip(v->tun_ip)); result.flags = ct_value_get_flags(v); @@ -742,7 +764,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t */ snat = CALI_F_FROM_HOST; /* if returning packet into a tunnel */ - snat |= (dnat_return_should_encap() && v->tun_ip); + snat |= (dnat_return_should_encap() && !ip_void(v->tun_ip)); snat |= result.flags & CALI_CT_FLAG_VIA_NAT_IF; snat |= result.flags & CALI_CT_FLAG_HOST_PSNAT; snat |= result.flags & CALI_CT_FLAG_NP_LOOP; @@ -771,7 +793,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t CALI_CT_DEBUG("Hit! 
NORMAL entry.\n"); if (tcp_recycled(syn, v)) { CALI_CT_DEBUG("TCP SYN recycles entry, NEW flow.\n"); - cali_v4_ct_delete_elem(&k); + cali_ct_delete_elem(&k); goto out_lookup_fail; } CALI_CT_VERB("Created: %llu.\n", v->created); @@ -815,7 +837,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t } int ret_from_tun = CALI_F_FROM_HEP && - ctx->state->tun_ip && + !ip_void(ctx->state->tun_ip) && ct_result_rc(result.rc) == CALI_CT_ESTABLISHED_DNAT && src_to_dst->approved && result.flags & CALI_CT_FLAG_NP_FWD; @@ -834,7 +856,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t } if (ret_from_tun) { - CALI_DEBUG("Packet returned from tunnel %x\n", bpf_ntohl(ctx->state->tun_ip)); + CALI_DEBUG("Packet returned from tunnel %x\n", debug_ip(ctx->state->tun_ip)); } else if (CALI_F_TO_HOST || (skb_from_host(ctx->skb) && result.flags & CALI_CT_FLAG_HOST_PSNAT)) { /* Source of the packet is the endpoint, so check the src approval flag. */ if (src_to_dst->approved) { @@ -954,7 +976,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct cali_t /* creates connection tracking for tracked protocols */ static CALI_BPF_INLINE int conntrack_create(struct cali_tc_ctx *ctx, struct ct_create_ctx *ct_ctx) { - struct calico_ct_key k; + struct calico_ct_key *k = &ctx->scratch->ct_key; int err; if (ctx->state->flags & CALI_ST_SUPPRESS_CT_STATE) { @@ -962,17 +984,14 @@ static CALI_BPF_INLINE int conntrack_create(struct cali_tc_ctx *ctx, struct ct_c return 0; } - // Workaround for verifier; make sure verifier sees the skb on all code paths. 
- ct_ctx->skb = ctx->skb; - - err = calico_ct_v4_create_tracking(ctx, ct_ctx, &k); + err = calico_ct_v4_create_tracking(ctx, ct_ctx, k); if (err) { CALI_DEBUG("calico_ct_v4_create_tracking err %d\n", err); return err; } if (ct_ctx->type == CALI_CT_TYPE_NAT_REV) { - err = calico_ct_v4_create_nat_fwd(ctx, ct_ctx, &k); + err = calico_ct_create_nat_fwd(ctx, ct_ctx, k); if (err) { /* XXX we should clean up the tracking entry */ } diff --git a/felix/bpf-gpl/conntrack_types.h b/felix/bpf-gpl/conntrack_types.h index 5b81818ce25..ad054a7bfd7 100644 --- a/felix/bpf-gpl/conntrack_types.h +++ b/felix/bpf-gpl/conntrack_types.h @@ -9,7 +9,8 @@ struct calico_ct_key { __u32 protocol; - __be32 addr_a, addr_b; // NBO + ipv46_addr_t addr_a; // NBO + ipv46_addr_t addr_b; // NBO __u16 port_a, port_b; // HBO }; @@ -79,18 +80,22 @@ struct calico_ct_value { struct calico_ct_leg b_to_a; // 48 // CALI_CT_TYPE_NAT_REV - __u32 tun_ip; // 72 - __u32 orig_ip; // 76 + ipv46_addr_t tun_ip; // 72 + ipv46_addr_t orig_ip; // 76 __u16 orig_port; // 80 __u16 orig_sport; // 82 - __u32 orig_sip; // 84 + ipv46_addr_t orig_sip; // 84 }; // CALI_CT_TYPE_NAT_FWD; key for the CALI_CT_TYPE_NAT_REV entry. 
struct { struct calico_ct_key nat_rev_key; // 24 __u16 nat_sport; +#ifdef IPVER6 + __u8 pad2[60]; +#else __u8 pad2[46]; +#endif }; }; @@ -100,7 +105,11 @@ struct calico_ct_value { static CALI_BPF_INLINE void __xxx_compile_asserts(void) { #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wunused-local-typedef" +#ifdef IPVER6 + COMPILE_TIME_ASSERT((sizeof(struct calico_ct_value) == 128)) +#else COMPILE_TIME_ASSERT((sizeof(struct calico_ct_value) == 88)) +#endif #pragma clang diagnostic pop } @@ -117,34 +126,38 @@ static CALI_BPF_INLINE void __xxx_compile_asserts(void) { struct ct_lookup_ctx { __u8 proto; - __be32 src; - __be32 dst; + DECLARE_IP_ADDR(src); + DECLARE_IP_ADDR(dst); __u16 sport; __u16 dport; struct tcphdr *tcp; }; struct ct_create_ctx { - struct __sk_buff *skb; - __u8 proto; - __be32 orig_src; - __be32 src; - __be32 orig_dst; - __be32 dst; + ipv46_addr_t orig_src; + ipv46_addr_t src; + ipv46_addr_t orig_dst; + ipv46_addr_t dst; __u16 sport; __u16 dport; __u16 orig_dport; __u16 orig_sport; struct tcphdr *tcp; - __be32 tun_ip; /* is set when the packet arrive through the NP tunnel. + ipv46_addr_t tun_ip; /* is set when the packet arrives through the NP tunnel. * It is also set on the first node when we create the * initial CT entry for the tunneled traffic. 
*/ __u16 flags; + __u8 proto; + __u8 __pad; enum cali_ct_type type; bool allow_return; }; -CALI_MAP(cali_v4_ct, 3, +#ifdef IPVER6 +CALI_MAP_NAMED(cali_v6_ct, cali_ct, 3, +#else +CALI_MAP_NAMED(cali_v4_ct, cali_ct, 3, +#endif BPF_MAP_TYPE_HASH, struct calico_ct_key, struct calico_ct_value, 512000, BPF_F_NO_PREALLOC) @@ -205,11 +218,11 @@ enum calico_ct_result_type { struct calico_ct_result { __s16 rc; __u16 flags; - __be32 nat_ip; - __be32 nat_sip; + ipv46_addr_t nat_ip; + ipv46_addr_t nat_sip; __u16 nat_port; __u16 nat_sport; - __be32 tun_ip; + ipv46_addr_t tun_ip; __u32 ifindex_fwd; /* if set, the ifindex where the packet should be forwarded */ __u32 ifindex_created; /* For a CT state that was created by a packet ingressing * through an interface towards the host, this is the diff --git a/felix/bpf-gpl/failsafe.h b/felix/bpf-gpl/failsafe.h index 00cdcc88bfb..3bd0733c5b7 100644 --- a/felix/bpf-gpl/failsafe.h +++ b/felix/bpf-gpl/failsafe.h @@ -35,7 +35,8 @@ CALI_MAP(cali_v4_fsafes, 2, #define FSAFE_PREFIX_LEN_IN_BITS (FSAFE_PREFIX_LEN * 8) -static CALI_BPF_INLINE bool is_failsafe_in(__u8 ip_proto, __u16 dport, __be32 ip) { +static CALI_BPF_INLINE bool is_failsafe_in(__u8 ip_proto, __u16 dport, ipv46_addr_t ip) { +#ifndef IPVER6 struct failsafe_key key = { .prefixlen = FSAFE_PREFIX_LEN_IN_BITS, .ip_proto = ip_proto, @@ -46,10 +47,14 @@ static CALI_BPF_INLINE bool is_failsafe_in(__u8 ip_proto, __u16 dport, __be32 ip if (cali_v4_fsafes_lookup_elem(&key)) { return true; } +#else + /* XXX not implemented yet*/ +#endif return false; } -static CALI_BPF_INLINE bool is_failsafe_out(__u8 ip_proto, __u16 dport, __be32 ip) { +static CALI_BPF_INLINE bool is_failsafe_out(__u8 ip_proto, __u16 dport, ipv46_addr_t ip) { +#ifndef IPVER6 struct failsafe_key key = { .prefixlen = FSAFE_PREFIX_LEN_IN_BITS, .ip_proto = ip_proto, @@ -60,6 +65,9 @@ static CALI_BPF_INLINE bool is_failsafe_out(__u8 ip_proto, __u16 dport, __be32 i if (cali_v4_fsafes_lookup_elem(&key)) { return true; } +#else + 
/* XXX not implemented yet*/ +#endif return false; } diff --git a/felix/bpf-gpl/fib.h b/felix/bpf-gpl/fib.h index eef109d080c..861e946587b 100644 --- a/felix/bpf-gpl/fib.h +++ b/felix/bpf-gpl/fib.h @@ -104,14 +104,14 @@ static CALI_BPF_INLINE int forward_or_drop(struct cali_tc_ctx *ctx) struct arp_value *arpv; struct arp_key arpk = { - .ip = iface != NATIN_IFACE ? state->ip_dst : 0 /* 0.0.0.0 */, + .ip = iface != NATIN_IFACE ? state->ip_dst : VOID_IP, .ifindex = iface, }; - arpv = cali_v4_arp_lookup_elem(&arpk); + arpv = cali_arp_lookup_elem(&arpk); if (!arpv) { CALI_DEBUG("ARP lookup failed for %x dev %d\n", - bpf_ntohl(state->ip_dst), iface); + debug_ip(state->ip_dst), iface); goto skip_redir_ifindex; } @@ -261,15 +261,16 @@ static CALI_BPF_INLINE int forward_or_drop(struct cali_tc_ctx *ctx) __u32 iface = NATIN_IFACE; struct arp_key arpk = { - .ip = 0 /* 0.0.0.0 */, .ifindex = iface, }; - struct arp_value *arpv = cali_v4_arp_lookup_elem(&arpk); + ip_set_void(arpk.ip); + + struct arp_value *arpv = cali_arp_lookup_elem(&arpk); if (!arpv) { ctx->fwd.reason = CALI_REASON_NATIFACE; CALI_DEBUG("ARP lookup failed for %x dev %d\n", - bpf_ntohl(state->ip_dst), iface); + debug_ip(state->ip_dst), iface); goto deny; } diff --git a/felix/bpf-gpl/globals.h b/felix/bpf-gpl/globals.h index b6208873037..44582937ef3 100644 --- a/felix/bpf-gpl/globals.h +++ b/felix/bpf-gpl/globals.h @@ -5,29 +5,39 @@ #ifndef __CALI_GLOBALS_H__ #define __CALI_GLOBALS_H__ -struct cali_tc_globals { - __be32 host_ip; - __be16 tunnel_mtu; - __be16 vxlan_port; - __be32 intf_ip; - __be32 ext_to_svc_mark; - __be16 psnat_start; - __be16 psnat_len; - __be32 host_tunnel_ip; - __be32 flags; - __be16 wg_port; - __be16 __pad; - __u32 natin_idx; - __u32 natout_idx; - __u8 iface_name[16]; - __u32 log_filter_jmp; - __u32 jumps[32]; - /* Needs to be 32bit aligned as it is followed by scratch area for - * building headers. We reuse the same slot in state map to save - * ourselves a lookup. 
- */ - __u32 __scratch[]; /* N.B. this provides pointer to the location but does not add to the size */ -}; +#include "ip_addr.h" + +#define DECLARE_TC_GLOBALS(name, ip_t) \ +struct name { \ + ip_t host_ip; \ + __be16 tunnel_mtu; \ + __be16 vxlan_port; \ + ip_t intf_ip; \ + __be32 ext_to_svc_mark; \ + __be16 psnat_start; \ + __be16 psnat_len; \ + ip_t host_tunnel_ip; \ + __be32 flags; \ + __be16 wg_port; \ + __be16 __pad; \ + __u32 natin_idx; \ + __u32 natout_idx; \ + __u8 iface_name[16]; \ + __u32 log_filter_jmp; \ + __u32 jumps[32]; \ + /* Needs to be 32bit aligned as it is followed by scratch area for \ + * building headers. We reuse the same slot in state map to save \ + * ourselves a lookup. \ + */ \ + __u32 __scratch[]; /* N.B. this provides pointer to the location but does not add to the size */ \ +} + +DECLARE_TC_GLOBALS(cali_tc_globals, ipv46_addr_t); +/* cali_tc_globals_v6 is for userspace as cali_tc_globals are used for ipv4 in + * userspace, but it has the exact same layout as cali_tc_globals in eBPF when + * compiled for ipv6. + */ +DECLARE_TC_GLOBALS(cali_tc_globals_v6, ipv6_addr_t); enum cali_globals_flags { /* CALI_GLOBALS_IPV6_ENABLED is set when IPv6 is enabled by Felix */ diff --git a/felix/bpf-gpl/icmp.h b/felix/bpf-gpl/icmp.h index e0c6eff6d02..f40602de96c 100644 --- a/felix/bpf-gpl/icmp.h +++ b/felix/bpf-gpl/icmp.h @@ -5,164 +5,10 @@ #ifndef __CALI_ICMP_H__ #define __CALI_ICMP_H__ -#include -#include -#include - -#include "bpf.h" -#include "log.h" -#include "skb.h" - -static CALI_BPF_INLINE int icmp_v4_reply(struct cali_tc_ctx *ctx, - __u8 type, __u8 code, __be32 un) -{ - int ret; - - /* ICMP is on the slow path so we may as well revalidate here to keep calling code - * simple. We only need to look at the IP header before we resize the packet. 
*/ - if (skb_refresh_validate_ptrs(ctx, 0)) { - deny_reason(ctx, CALI_REASON_SHORT); - CALI_DEBUG("ICMP v4 reply: too short\n"); - return -1; - } - - struct iphdr ip_orig = *ip_hdr(ctx); - - /* Trim the packet to the desired length. ICMP requires min 8 bytes of - * payload but the SKB implementation gets upset if we try to trim - * part-way through the UDP/TCP header. - */ - __u32 len = skb_iphdr_offset(ctx) + 60 /* max IP len */; - switch (ip_hdr(ctx)->protocol) { - case IPPROTO_TCP: - len += sizeof(struct tcphdr); - break; - case IPPROTO_UDP: - len += sizeof(struct udphdr); - break; - default: - len += 8; - break; - } - - CALI_DEBUG("Trimming to %d\n", len); - int err = bpf_skb_change_tail(ctx->skb, len, 0); - if (err) { - CALI_DEBUG("ICMP v4 reply: early bpf_skb_change_tail (len=%d) failed (err=%d)\n", len, err); - return -1; - } - - /* make room for the new IP + ICMP header */ - int new_hdrs_len = sizeof(struct iphdr) + sizeof(struct icmphdr); - CALI_DEBUG("Inserting %d\n", new_hdrs_len); - ret = bpf_skb_adjust_room(ctx->skb, new_hdrs_len, BPF_ADJ_ROOM_MAC, 0); - if (ret) { - CALI_DEBUG("ICMP v4 reply: failed to make room\n"); - return -1; - } - - len += new_hdrs_len; - CALI_DEBUG("Len after insert %d\n", len); - - /* ICMP reply carries the IP header + at least 8 bytes of data. */ - if (skb_refresh_validate_ptrs(ctx, len - IP_SIZE - (CALI_F_L3 ? 0 : ETH_SIZE))) { - deny_reason(ctx, CALI_REASON_SHORT); - CALI_DEBUG("ICMP v4 reply: too short after making room\n"); - return -1; - } - - /* we do not touch ethhdr, we rely on linux to rewrite it after routing - * XXX we might want to swap MACs and bounce it back from the same device - */ - ip_hdr(ctx)->version = 4; - ip_hdr(ctx)->ihl = 5; - ip_hdr(ctx)->tos = 0; - ip_hdr(ctx)->ttl = 64; /* good default */ - ip_hdr(ctx)->protocol = IPPROTO_ICMP; - ip_hdr(ctx)->check = 0; - ip_hdr(ctx)->tot_len = bpf_htons(len - (CALI_F_L3_DEV ? 
0 : ETH_SIZE)); - - ctx->ipheader_len = 20; - -#ifdef CALI_PARANOID - /* XXX verify that ip_orig.daddr is always the node's IP - * - * we only call this function because of NodePort encap - */ - if (ip_orig.daddr != HOST_IP) { - CALI_DEBUG("ICMP v4 reply: ip_orig.daddr != HOST_IP 0x%x\n", ip_orig.daddr); - } +#ifdef IPVER6 +#include "icmp6.h" +#else +#include "icmp4.h" #endif - /* use the host IP of the program that handles the packet */ - ip_hdr(ctx)->saddr = INTF_IP; - ip_hdr(ctx)->daddr = ip_orig.saddr; - - struct icmphdr *icmp = ((void *)ip_hdr(ctx)) + IP_SIZE; - - icmp->type = type; - icmp->code = code; - *((__be32 *)&icmp->un) = un; - icmp->checksum = 0; - - __wsum ip_csum = bpf_csum_diff(0, 0, ctx->ip_header, sizeof(struct iphdr), 0); - __wsum icmp_csum = bpf_csum_diff(0, 0, (__u32 *)icmp, - len - sizeof(struct iphdr) - skb_iphdr_offset(ctx), 0); - CALI_DEBUG("ICMP: checksum 0x%x len %d\n", icmp_csum, len - sizeof(struct iphdr) - skb_iphdr_offset(ctx)); - - ret = bpf_l3_csum_replace(ctx->skb, - skb_iphdr_offset(ctx) + offsetof(struct iphdr, check), 0, ip_csum, 0); - if (ret) { - CALI_DEBUG("ICMP v4 reply: set ip csum failed\n"); - return -1; - } - ret = bpf_l4_csum_replace(ctx->skb, sizeof(struct ethhdr) + sizeof(struct iphdr) + - offsetof(struct icmphdr, checksum), 0, icmp_csum, 0); - if (ret) { - CALI_DEBUG("ICMP v4 reply: set icmp csum failed\n"); - return -1; - } - - CALI_DEBUG("ICMP v4 reply creation succeeded\n"); - - return 0; -} - -static CALI_BPF_INLINE int icmp_v4_too_big(struct cali_tc_ctx *ctx) -{ - struct { - __be16 unused; - __be16 mtu; - } frag = { - .mtu = bpf_htons(TUNNEL_MTU), - }; - - CALI_DEBUG("Sending ICMP too big mtu=%d\n", bpf_ntohs(frag.mtu)); - return icmp_v4_reply(ctx, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, *(__be32 *)&frag); -} - -static CALI_BPF_INLINE int icmp_v4_ttl_exceeded(struct cali_tc_ctx *ctx) -{ - return icmp_v4_reply(ctx, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); -} - -static CALI_BPF_INLINE int 
icmp_v4_port_unreachable(struct cali_tc_ctx *ctx) -{ - return icmp_v4_reply(ctx, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); -} - -static CALI_BPF_INLINE bool icmp_type_is_err(__u8 type) -{ - switch (type) { - case ICMP_DEST_UNREACH: - case ICMP_SOURCE_QUENCH: - case ICMP_REDIRECT: - case ICMP_TIME_EXCEEDED: - case ICMP_PARAMETERPROB: - return true; - } - - return false; -} - #endif /* __CALI_ICMP_H__ */ diff --git a/felix/bpf-gpl/icmp4.h b/felix/bpf-gpl/icmp4.h new file mode 100644 index 00000000000..e2d62f97e47 --- /dev/null +++ b/felix/bpf-gpl/icmp4.h @@ -0,0 +1,168 @@ +// Project Calico BPF dataplane programs. +// Copyright (c) 2020-2023 Tigera, Inc. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later + +#ifndef __CALI_ICMP4_H__ +#define __CALI_ICMP4_H__ + +#include +#include +#include + +#include "bpf.h" +#include "log.h" +#include "skb.h" + +static CALI_BPF_INLINE int icmp_v4_reply(struct cali_tc_ctx *ctx, + __u8 type, __u8 code, __be32 un) +{ + int ret; + + /* ICMP is on the slow path so we may as well revalidate here to keep calling code + * simple. We only need to look at the IP header before we resize the packet. */ + if (skb_refresh_validate_ptrs(ctx, 0)) { + deny_reason(ctx, CALI_REASON_SHORT); + CALI_DEBUG("ICMP v4 reply: too short\n"); + return -1; + } + + struct iphdr ip_orig = *ip_hdr(ctx); + + /* Trim the packet to the desired length. ICMP requires min 8 bytes of + * payload but the SKB implementation gets upset if we try to trim + * part-way through the UDP/TCP header. 
+ */ + __u32 len = skb_iphdr_offset(ctx) + 60 /* max IP len */; + switch (ip_hdr(ctx)->protocol) { + case IPPROTO_TCP: + len += sizeof(struct tcphdr); + break; + case IPPROTO_UDP: + len += sizeof(struct udphdr); + break; + default: + len += 8; + break; + } + + CALI_DEBUG("Trimming to %d\n", len); + int err = bpf_skb_change_tail(ctx->skb, len, 0); + if (err) { + CALI_DEBUG("ICMP v4 reply: early bpf_skb_change_tail (len=%d) failed (err=%d)\n", len, err); + return -1; + } + + /* make room for the new IP + ICMP header */ + int new_hdrs_len = sizeof(struct iphdr) + sizeof(struct icmphdr); + CALI_DEBUG("Inserting %d\n", new_hdrs_len); + ret = bpf_skb_adjust_room(ctx->skb, new_hdrs_len, BPF_ADJ_ROOM_MAC, 0); + if (ret) { + CALI_DEBUG("ICMP v4 reply: failed to make room\n"); + return -1; + } + + len += new_hdrs_len; + CALI_DEBUG("Len after insert %d\n", len); + + /* ICMP reply carries the IP header + at least 8 bytes of data. */ + if (skb_refresh_validate_ptrs(ctx, len - IP_SIZE - (CALI_F_L3 ? 0 : ETH_SIZE))) { + deny_reason(ctx, CALI_REASON_SHORT); + CALI_DEBUG("ICMP v4 reply: too short after making room\n"); + return -1; + } + + /* we do not touch ethhdr, we rely on linux to rewrite it after routing + * XXX we might want to swap MACs and bounce it back from the same device + */ + ip_hdr(ctx)->version = 4; + ip_hdr(ctx)->ihl = 5; + ip_hdr(ctx)->tos = 0; + ip_hdr(ctx)->ttl = 64; /* good default */ + ip_hdr(ctx)->protocol = IPPROTO_ICMP; + ip_hdr(ctx)->check = 0; + ip_hdr(ctx)->tot_len = bpf_htons(len - (CALI_F_L3_DEV ? 
0 : ETH_SIZE)); + + ctx->ipheader_len = 20; + +#ifdef CALI_PARANOID + /* XXX verify that ip_orig.daddr is always the node's IP + * + * we only call this function because of NodePort encap + */ + if (ip_orig.daddr != HOST_IP) { + CALI_DEBUG("ICMP v4 reply: ip_orig.daddr != HOST_IP 0x%x\n", ip_orig.daddr); + } +#endif + + /* use the host IP of the program that handles the packet */ + ip_hdr(ctx)->saddr = INTF_IP; + ip_hdr(ctx)->daddr = ip_orig.saddr; + + struct icmphdr *icmp = ((void *)ip_hdr(ctx)) + IP_SIZE; + + icmp->type = type; + icmp->code = code; + *((__be32 *)&icmp->un) = un; + icmp->checksum = 0; + + __wsum ip_csum = bpf_csum_diff(0, 0, ctx->ip_header, sizeof(struct iphdr), 0); + __wsum icmp_csum = bpf_csum_diff(0, 0, (__u32 *)icmp, + len - sizeof(struct iphdr) - skb_iphdr_offset(ctx), 0); + CALI_DEBUG("ICMP: checksum 0x%x len %d\n", icmp_csum, len - sizeof(struct iphdr) - skb_iphdr_offset(ctx)); + + ret = bpf_l3_csum_replace(ctx->skb, + skb_iphdr_offset(ctx) + offsetof(struct iphdr, check), 0, ip_csum, 0); + if (ret) { + CALI_DEBUG("ICMP v4 reply: set ip csum failed\n"); + return -1; + } + ret = bpf_l4_csum_replace(ctx->skb, sizeof(struct ethhdr) + sizeof(struct iphdr) + + offsetof(struct icmphdr, checksum), 0, icmp_csum, 0); + if (ret) { + CALI_DEBUG("ICMP v4 reply: set icmp csum failed\n"); + return -1; + } + + CALI_DEBUG("ICMP v4 reply creation succeeded\n"); + + return 0; +} + +static CALI_BPF_INLINE int icmp_v4_too_big(struct cali_tc_ctx *ctx) +{ + struct { + __be16 unused; + __be16 mtu; + } frag = { + .mtu = bpf_htons(TUNNEL_MTU), + }; + + CALI_DEBUG("Sending ICMP too big mtu=%d\n", bpf_ntohs(frag.mtu)); + return icmp_v4_reply(ctx, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, *(__be32 *)&frag); +} + +static CALI_BPF_INLINE int icmp_v4_ttl_exceeded(struct cali_tc_ctx *ctx) +{ + return icmp_v4_reply(ctx, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); +} + +static CALI_BPF_INLINE int icmp_v4_port_unreachable(struct cali_tc_ctx *ctx) +{ + return icmp_v4_reply(ctx, 
ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); +} + +static CALI_BPF_INLINE bool icmp_type_is_err(__u8 type) +{ + switch (type) { + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_REDIRECT: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + return true; + } + + return false; +} + +#endif /* __CALI_ICMP4_H__ */ diff --git a/felix/bpf-gpl/icmp6.h b/felix/bpf-gpl/icmp6.h new file mode 100644 index 00000000000..5d87bcd4795 --- /dev/null +++ b/felix/bpf-gpl/icmp6.h @@ -0,0 +1,13 @@ +// Project Calico BPF dataplane programs. +// Copyright (c) 2023 Tigera, Inc. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later + +#ifndef __CALI_ICMP6_H__ +#define __CALI_ICMP6_H__ + +static CALI_BPF_INLINE bool icmp_type_is_err(__u8 type) { + /* XXX not implemented yet */ + return false; +} + +#endif /* __CALI_ICMP6_H__ */ diff --git a/felix/bpf-gpl/ip_addr.h b/felix/bpf-gpl/ip_addr.h new file mode 100644 index 00000000000..eb405e88497 --- /dev/null +++ b/felix/bpf-gpl/ip_addr.h @@ -0,0 +1,98 @@ +// Project Calico BPF dataplane programs. +// Copyright (c) 2023 Tigera, Inc. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later + +#ifndef __CALI_IP_ADDR_H__ +#define __CALI_IP_ADDR_H__ + +typedef struct { + __be32 a; + __be32 b; + __be32 c; + __be32 d; +} ipv6_addr_t; + +typedef __be32 ipv4_addr_t; + +#ifdef IPVER6 + +#include + +static CALI_BPF_INLINE bool ipv6_addr_t_eq(ipv6_addr_t x, ipv6_addr_t y) +{ + return x.a == y.a && x.b == y.b && x.c == y.c && x.d == y.d; +} + +static CALI_BPF_INLINE int ipv6_addr_t_cmp(ipv6_addr_t x, ipv6_addr_t y) +{ + if (x.a < y.a) { + return -1; + } else if (x.a == y.a) { + if (x.b < y.b) { + return -1; + } else if (x.b == y.b) { + if (x.c < y.c) { + return -1; + } else if (x.c == y.c) { + if (x.d < y.d) { + return -1; + } else if (x.d == y.d) { + return 0; + } + } + } + } + + return 1; +} + +#define ip_void(ip) ((ip).a == 0 && (ip).b == 0 && (ip).c == 0 && (ip).d == 0) +#define VOID_IP ({ipv6_addr_t x = {}; x;}) +#define ip_set_void(ip) do { \ + (ip).a = 0; \ + (ip).b = 0; \ + (ip).c = 0; \ + (ip).d = 0; \ +} while(0) +#define NP_SPECIAL_IP ({ipv6_addr_t x = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; x;}) +#define ip_equal(a, b) ipv6_addr_t_eq(a, b) +#define ip_lt(a, b) (ipv6_addr_t_cmp(a, b) < 0) + +static CALI_BPF_INLINE void ipv6hdr_ip_to_ipv6_addr_t(ipv6_addr_t *us, struct in6_addr *lnx) +{ + us->a = lnx->in6_u.u6_addr32[0]; + us->b = lnx->in6_u.u6_addr32[1]; + us->c = lnx->in6_u.u6_addr32[2]; + us->d = lnx->in6_u.u6_addr32[3]; +} + +static CALI_BPF_INLINE void ipv6_addr_t_to_ipv6hdr_ip(struct in6_addr *lnx, ipv6_addr_t *us) +{ + lnx->in6_u.u6_addr32[0] = us->a; + lnx->in6_u.u6_addr32[1] = us->b; + lnx->in6_u.u6_addr32[2] = us->c; + lnx->in6_u.u6_addr32[3] = us->d; +} + +typedef ipv6_addr_t ipv46_addr_t; + +#define DECLARE_IP_ADDR(name) ipv6_addr_t name + +#else /* ipv4 */ + +#define ip_void(ip) ((ip) == 0) +#define VOID_IP 0 +#define ip_set_void(ip) ((ip) = 0) +#define NP_SPECIAL_IP 0xffffffff +#define ip_equal(a, b) ((a) == (b)) +#define ip_lt(a, b) ((a) < (b)) + +typedef 
ipv4_addr_t ipv46_addr_t; + +#define DECLARE_IP_ADDR(name) union { \ + ipv4_addr_t name; \ + ipv6_addr_t __pad ## name; \ + } +#endif + +#endif /* __CALI_IP_ADDR_H__ */ diff --git a/felix/bpf-gpl/ipv6.h b/felix/bpf-gpl/ipv6.h deleted file mode 100644 index 98b157e9092..00000000000 --- a/felix/bpf-gpl/ipv6.h +++ /dev/null @@ -1,24 +0,0 @@ -// Project Calico BPF dataplane programs. -// Copyright (c) 2022 Tigera, Inc. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later - -#ifndef __CALI_BPF_IPV6_H__ -#define __CALI_BPF_IPV6_H__ - -// We can only pass in 3 parameters to a helper function because of bpf -// architecture, so we need to split printing ipv6 address into 2 parts. -#define CALI_LOG_IPV6(ipv6) \ - CALI_DEBUG("src: %x%x", \ - bpf_ntohl((ipv6)->saddr.in6_u.u6_addr32[0]), \ - bpf_ntohl((ipv6)->saddr.in6_u.u6_addr32[1])); \ - CALI_DEBUG("%x%x\n", \ - bpf_ntohl((ipv6)->saddr.in6_u.u6_addr32[2]), \ - bpf_ntohl((ipv6)->saddr.in6_u.u6_addr32[3])); \ - CALI_DEBUG("dst: %x%x", \ - bpf_ntohl((ipv6)->daddr.in6_u.u6_addr32[0]), \ - bpf_ntohl((ipv6)->daddr.in6_u.u6_addr32[1])); \ - CALI_DEBUG("%x%x\n", \ - bpf_ntohl((ipv6)->daddr.in6_u.u6_addr32[2]), \ - bpf_ntohl((ipv6)->daddr.in6_u.u6_addr32[3])) \ - -#endif /* __CALI_BPF_IPV6_H__ */ diff --git a/felix/bpf-gpl/jump.h b/felix/bpf-gpl/jump.h index 1dacbf4780f..902ed171715 100644 --- a/felix/bpf-gpl/jump.h +++ b/felix/bpf-gpl/jump.h @@ -5,7 +5,7 @@ #ifndef __CALI_BPF_JUMP_H__ #define __CALI_BPF_JUMP_H__ -CALI_MAP(cali_state, 3, +CALI_MAP(cali_state, 4, BPF_MAP_TYPE_PERCPU_ARRAY, __u32, struct cali_tc_state, 2, 0) @@ -42,16 +42,24 @@ static CALI_BPF_INLINE struct cali_xdp_globals *state_get_globals_xdp(void) CALI_MAP_V1(cali_jump_map, BPF_MAP_TYPE_PROG_ARRAY, __u32, __u32, 200, 0) #define CALI_JUMP_TO(ctx, index) bpf_tail_call((ctx)->xdp, &cali_jump_map, (ctx)->xdp_globals->jumps[PROG_PATH(index)]) -#else + +#else /* CALI_F_XDP */ #define cali_jump_map map_symbol(cali_progs, 2) 
CALI_MAP_V1(cali_jump_map, BPF_MAP_TYPE_PROG_ARRAY, __u32, __u32, 200, 0) -#define CALI_JUMP_TO(ctx, index) do { \ +#define __CALI_JUMP_TO(ctx, index) do { \ CALI_DEBUG("jump to idx %d prog at %d\n", index, (ctx)->globals->jumps[PROG_PATH(index)]); \ bpf_tail_call((ctx)->skb, &cali_jump_map, (ctx)->globals->jumps[PROG_PATH(index)]); \ } while (0) + +#ifdef IPVER6 +#define CALI_JUMP_TO(ctx, index) __CALI_JUMP_TO(ctx, index ## _V6) +#else +#define CALI_JUMP_TO(ctx, index) __CALI_JUMP_TO(ctx, index) +#endif + #endif /* Add new values to the end as these are program indices */ @@ -72,17 +80,21 @@ enum cali_jump_index { PROG_INDEX_HOST_CT_CONFLICT_DEBUG, PROG_INDEX_ICMP_INNER_NAT_DEBUG, - PROG_INDEX_V6_PROLOGUE, - PROG_INDEX_V6_POLICY, - PROG_INDEX_V6_ALLOWED, - PROG_INDEX_V6_ICMP, - PROG_INDEX_V6_DROP, - - PROG_INDEX_V6_PROLOGUE_DEBUG, - PROG_INDEX_V6_POLICY_DEBUG, - PROG_INDEX_V6_ALLOWED_DEBUG, - PROG_INDEX_V6_ICMP_DEBUG, - PROG_INDEX_V6_DROP_DEBUG, + PROG_INDEX_MAIN_V6, + PROG_INDEX_POLICY_V6, + PROG_INDEX_ALLOWED_V6, + PROG_INDEX_ICMP_V6, + PROG_INDEX_DROP_V6, + PROG_INDEX_HOST_CT_CONFLICT_V6, + PROG_INDEX_ICMP_INNER_NAT_V6, + + PROG_INDEX_MAIN_V6_DEBUG, + PROG_INDEX_POLICY_V6_DEBUG, + PROG_INDEX_ALLOWED_V6_DEBUG, + PROG_INDEX_ICMP_V6_DEBUG, + PROG_INDEX_DROP_V6_DEBUG, + PROG_INDEX_HOST_CT_CONFLICT_V6_DEBUG, + PROG_INDEX_ICMP_INNER_NAT_V6_DEBUG, }; #if CALI_F_XDP @@ -96,21 +108,29 @@ CALI_MAP_V1(cali_jump_prog_map, BPF_MAP_TYPE_PROG_ARRAY, __u32, __u32, 100, 0) */ #define CALI_JUMP_TO_POLICY(ctx) \ bpf_tail_call((ctx)->xdp, &cali_jump_prog_map, (ctx)->xdp_globals->jumps[PROG_INDEX_POLICY]) -#else +#else /* CALI_F_XDP */ #define cali_jump_prog_map map_symbol(cali_jump, 2) CALI_MAP_V1(cali_jump_prog_map, BPF_MAP_TYPE_PROG_ARRAY, __u32, __u32, 10000, 0) -#define CALI_JUMP_TO_POLICY(ctx) do { \ - (ctx)->skb->cb[0] = (ctx)->globals->jumps[PROG_PATH(PROG_INDEX_ALLOWED)]; \ - (ctx)->skb->cb[1] = (ctx)->globals->jumps[PROG_PATH(PROG_INDEX_DROP)]; \ - CALI_DEBUG("policy 
allow prog at %d\n", (ctx)->globals->jumps[PROG_PATH(PROG_INDEX_ALLOWED)]); \ - CALI_DEBUG("policy deny prog at %d\n", (ctx)->globals->jumps[PROG_PATH(PROG_INDEX_DROP)]); \ - CALI_DEBUG("jump to policy prog at %d\n", (ctx)->globals->jumps[PROG_INDEX_POLICY]); \ - bpf_tail_call((ctx)->skb, &cali_jump_prog_map, (ctx)->globals->jumps[PROG_INDEX_POLICY]); \ +#define __CALI_JUMP_TO_POLICY(ctx, allow, deny, pol) do { \ + (ctx)->skb->cb[0] = (ctx)->globals->jumps[PROG_PATH(allow)]; \ + (ctx)->skb->cb[1] = (ctx)->globals->jumps[PROG_PATH(deny)]; \ + CALI_DEBUG("policy allow prog at %d\n", (ctx)->globals->jumps[PROG_PATH(allow)]); \ + CALI_DEBUG("policy deny prog at %d\n", (ctx)->globals->jumps[PROG_PATH(deny)]); \ + CALI_DEBUG("jump to policy prog at %d\n", (ctx)->globals->jumps[pol]); \ + bpf_tail_call((ctx)->skb, &cali_jump_prog_map, (ctx)->globals->jumps[pol]); \ } while (0) +#ifdef IPVER6 +#define CALI_JUMP_TO_POLICY(ctx) \ + __CALI_JUMP_TO_POLICY(ctx, PROG_INDEX_ALLOWED_V6, PROG_INDEX_DROP_V6, PROG_INDEX_POLICY_V6) +#else +#define CALI_JUMP_TO_POLICY(ctx) \ + __CALI_JUMP_TO_POLICY(ctx, PROG_INDEX_ALLOWED, PROG_INDEX_DROP, PROG_INDEX_POLICY) +#endif + #endif #endif /* __CALI_BPF_JUMP_H__ */ diff --git a/felix/bpf-gpl/list-ut-objs b/felix/bpf-gpl/list-ut-objs index 561ec109ebf..c5517d97143 100755 --- a/felix/bpf-gpl/list-ut-objs +++ b/felix/bpf-gpl/list-ut-objs @@ -11,7 +11,7 @@ emit_filename() { echo "bin/test_${from_or_to}_${ep_type}_fib_${log_level}${dsr}.o" -# echo "bin/test_${from_or_to}_${ep_type}_fib_${log_level}${dsr}_v6.o" + echo "bin/test_${from_or_to}_${ep_type}_fib_${log_level}${dsr}_v6.o" } log_level=debug diff --git a/felix/bpf-gpl/metadata.h b/felix/bpf-gpl/metadata.h index cd6416677bc..ab0db17dad6 100644 --- a/felix/bpf-gpl/metadata.h +++ b/felix/bpf-gpl/metadata.h @@ -22,6 +22,8 @@ enum cali_metadata_flags { // Set metadata to be received by TC programs static CALI_BPF_INLINE int xdp2tc_set_metadata(struct cali_tc_ctx *ctx, __u32 flags) { +#ifndef 
IPVER6 + /* XXX */ #ifndef UNITTEST struct cali_metadata *metadata; // Reserve space in-front of xdp_md.meta for metadata. @@ -55,6 +57,7 @@ static CALI_BPF_INLINE int xdp2tc_set_metadata(struct cali_tc_ctx *ctx, __u32 fl CALI_DEBUG("Set IP TOS: %d\n", ip_hdr(ctx)->tos); goto metadata_ok; #endif +#endif error: return -1; @@ -62,22 +65,23 @@ static CALI_BPF_INLINE int xdp2tc_set_metadata(struct cali_tc_ctx *ctx, __u32 fl metadata_ok: return 0; } -#endif /* CALI_F_XDP */ +#else /* CALI_F_XDP */ // Fetch metadata set by XDP program. If not set or on error return 0. static CALI_BPF_INLINE __u32 xdp2tc_get_metadata(struct __sk_buff *skb) { +#ifndef IPVER6 + /* XXX */ struct cali_metadata *metadata; - if (CALI_F_FROM_HEP && !CALI_F_XDP) { #ifndef UNITTEST - metadata = (void *)(unsigned long)skb->data_meta; + metadata = (void *)(unsigned long)skb->data_meta; - if (skb->data_meta + sizeof(struct cali_metadata) > skb->data) { - CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "No metadata is shared by XDP\n"); - goto no_metadata; - } + if (skb->data_meta + sizeof(struct cali_metadata) > skb->data) { + CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "No metadata is shared by XDP\n"); + goto no_metadata; + } - CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "Received metadata from XDP: %d\n", metadata->flags); - goto metadata_ok; + CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "Received metadata from XDP: %d\n", metadata->flags); + goto metadata_ok; #else struct cali_tc_ctx ctx = { .skb = skb, @@ -97,15 +101,17 @@ static CALI_BPF_INLINE __u32 xdp2tc_get_metadata(struct __sk_buff *skb) { CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "Set IP TOS: %d\n", ip_hdr(&ctx)->tos); goto metadata_ok; #endif /* UNITTEST */ - } else { - CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "Fetching metadata from XDP not supported in this hook\n"); - } no_metadata: return 0; metadata_ok: return metadata->flags; +#else + return 0; +#endif } +#endif /* CALI_F_XDP */ + #endif /* __CALI_METADATA_H__ */ diff --git a/felix/bpf-gpl/nat.h b/felix/bpf-gpl/nat.h index 
411365cf634..75187d736ce 100644 --- a/felix/bpf-gpl/nat.h +++ b/felix/bpf-gpl/nat.h @@ -5,36 +5,52 @@ #ifndef __CALI_NAT_H__ #define __CALI_NAT_H__ -#include - -#include -#include - -#include "bpf.h" -#include "skb.h" -#include "routes.h" -#include "nat_types.h" - #ifndef CALI_VXLAN_VNI #define CALI_VXLAN_VNI 0xca11c0 #endif +#define vxlan_udp_csum_ok(udp) ((udp)->check == 0) + +#ifdef IPVER6 +#include "nat6.h" +#else +#include "nat4.h" +#endif + #define dnat_should_encap() (CALI_F_FROM_HEP && !CALI_F_TUNNEL && !CALI_F_L3_DEV && !CALI_F_NAT_IF) #define dnat_return_should_encap() (CALI_F_FROM_WEP && !CALI_F_TUNNEL && !CALI_F_L3_DEV && !CALI_F_NAT_IF) #define dnat_should_decap() (CALI_F_FROM_HEP && !CALI_F_TUNNEL && !CALI_F_L3_DEV && !CALI_F_NAT_IF) -/* Number of bytes we add to a packet when we do encap. */ -#define VXLAN_ENCAP_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ - sizeof(struct udphdr) + sizeof(struct vxlanhdr)) +static CALI_BPF_INLINE int is_vxlan_tunnel(struct cali_tc_ctx *ctx, __u16 vxlanport) +{ + return ctx->state->ip_proto == IPPROTO_UDP && + ctx->state->dport == vxlanport; +} + +static CALI_BPF_INLINE bool vxlan_encap_too_big(struct cali_tc_ctx *ctx) +{ + __u32 mtu = TUNNEL_MTU; + + /* RFC-1191: MTU is the size in octets of the largest datagram that + * could be forwarded, along the path of the original datagram, without + * being fragmented at this router. The size includes the IP header and + * IP data, and does not include any lower-level headers. 
+ */ + if (ctx->skb->len > sizeof(struct ethhdr) + mtu) { + CALI_DEBUG("SKB too long (len=%d) vs limit=%d\n", ctx->skb->len, mtu); + return true; + } + return false; +} #define EFAULT 14 -static CALI_BPF_INLINE int skb_nat_l4_csum_ipv4(struct cali_tc_ctx *ctx, size_t off, - __be32 ip_src_from, __be32 ip_src_to, - __be32 ip_dst_from, __be32 ip_dst_to, - __u16 dport_from, __u16 dport_to, - __u16 sport_from, __u16 sport_to, - __u64 flags) +static CALI_BPF_INLINE int skb_nat_l4_csum(struct cali_tc_ctx *ctx, size_t off, + ipv46_addr_t ip_src_from, ipv46_addr_t ip_src_to, + ipv46_addr_t ip_dst_from, ipv46_addr_t ip_dst_to, + __u16 dport_from, __u16 dport_to, + __u16 sport_from, __u16 sport_to, + __u64 flags) { int ret = 0; struct __sk_buff *skb = ctx->skb; @@ -70,19 +86,42 @@ static CALI_BPF_INLINE int skb_nat_l4_csum_ipv4(struct cali_tc_ctx *ctx, size_t } } + /* We start with csum == 0 (seed for the first diff) as we are calculating just + * the diff between 2 IPs. We then feed the result as a seed to the next diff if + * we need to as a carry-over. + * + * We must use diff because the replace functions cannot calculate a diff for 16 + * byte ipv6 addresses in one go. And this keeps the code the same for v4/6 with + * minimal impact on v4. 
+ */ + __wsum csum = 0; + + bool csum_update = false; - if (ip_src_from != ip_src_to) { + if (!ip_equal(ip_src_from, ip_src_to)) { CALI_DEBUG("L4 checksum update src IP from %x to %x\n", - bpf_ntohl(ip_src_from), bpf_ntohl(ip_src_to)); - ret = bpf_l4_csum_replace(skb, off, ip_src_from, ip_src_to, flags | BPF_F_PSEUDO_HDR | 4); - CALI_DEBUG("bpf_l4_csum_replace(IP): %d\n", ret); + debug_ip(ip_src_from), debug_ip(ip_src_to)); + + csum = bpf_csum_diff((__u32*)&ip_src_from, sizeof(ip_src_from), (__u32*)&ip_src_to, sizeof(ip_src_to), csum); + CALI_DEBUG("bpf_l4_csum_diff(IP): 0x%x\n", csum); + csum_update = true; } - if (ip_dst_from != ip_dst_to) { + if (!ip_equal(ip_dst_from, ip_dst_to)) { CALI_DEBUG("L4 checksum update dst IP from %x to %x\n", - bpf_ntohl(ip_dst_from), bpf_ntohl(ip_dst_to)); - ret = bpf_l4_csum_replace(skb, off, ip_dst_from, ip_dst_to, flags | BPF_F_PSEUDO_HDR | 4); - CALI_DEBUG("bpf_l4_csum_replace(IP): %d\n", ret); + debug_ip(ip_dst_from), debug_ip(ip_dst_to)); + csum = bpf_csum_diff((__u32*)&ip_dst_from, sizeof(ip_dst_from), (__u32*)&ip_dst_to, sizeof(ip_dst_to), csum); + CALI_DEBUG("bpf_l4_csum_diff(IP): 0x%x\n", csum); + csum_update = true; + } + + /* If the IPs have changed we must replace it as part of the pseudo header that is + * used to calculate L4 csum. + */ + if (csum_update) { + ret = bpf_l4_csum_replace(skb, off, 0, csum, flags | BPF_F_PSEUDO_HDR); } + + /* We can use replace for ports in both v4/6 as they are the same size of 2 bytes. 
*/ if (sport_from != sport_to) { CALI_DEBUG("L4 checksum update sport from %d to %d\n", bpf_ntohs(sport_from), bpf_ntohs(sport_to)); @@ -101,135 +140,6 @@ static CALI_BPF_INLINE int skb_nat_l4_csum_ipv4(struct cali_tc_ctx *ctx, size_t return ret; } -static CALI_BPF_INLINE int vxlan_v4_encap(struct cali_tc_ctx *ctx, __be32 ip_src, __be32 ip_dst) -{ - int ret; - __wsum csum; - - __u32 new_hdrsz = sizeof(struct ethhdr) + sizeof(struct iphdr) + - sizeof(struct udphdr) + sizeof(struct vxlanhdr); - - ret = bpf_skb_adjust_room(ctx->skb, new_hdrsz, BPF_ADJ_ROOM_MAC, - BPF_F_ADJ_ROOM_ENCAP_L4_UDP | - BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | - BPF_F_ADJ_ROOM_ENCAP_L2(sizeof(struct ethhdr))); - - if (ret) { - goto out; - } - - ret = -1; - - if (skb_refresh_validate_ptrs(ctx, new_hdrsz)) { - deny_reason(ctx, CALI_REASON_SHORT); - CALI_DEBUG("Too short VXLAN encap\n"); - goto out; - } - - // Note: assuming L2 packet here so this code can't be used on an L3 device. - struct udphdr *udp = (struct udphdr*) ((void *)ip_hdr(ctx) + IP_SIZE); - struct vxlanhdr *vxlan = (void *)(udp + 1); - struct ethhdr *eth_inner = (void *)(vxlan+1); - struct iphdr *ip_inner = (void*)(eth_inner+1); - - /* Copy the original IP header. Since it is already DNATed, the dest IP is - * already set. All we need to do is to change the source IP - */ - *ip_hdr(ctx) = *ip_inner; - - /* decrement TTL for the inner IP header. 
TTL must be > 1 to get here */ - ip_dec_ttl(ip_inner); - - ip_hdr(ctx)->saddr = ip_src; - ip_hdr(ctx)->daddr = ip_dst; - ip_hdr(ctx)->tot_len = bpf_htons(bpf_ntohs(ip_hdr(ctx)->tot_len) + new_hdrsz); - ip_hdr(ctx)->ihl = 5; /* in case there were options in ip_inner */ - ip_hdr(ctx)->check = 0; - ip_hdr(ctx)->protocol = IPPROTO_UDP; - - udp->source = udp->dest = bpf_htons(VXLAN_PORT); - udp->len = bpf_htons(bpf_ntohs(ip_hdr(ctx)->tot_len) - sizeof(struct iphdr)); - - *((__u8*)&vxlan->flags) = 1 << 3; /* set the I flag to make the VNI valid */ - vxlan->vni = bpf_htonl(CALI_VXLAN_VNI) >> 8; /* it is actually 24-bit, last 8 reserved */ - - /* keep eth_inner MACs zeroed, it is useless after decap */ - eth_inner->h_proto = eth_hdr(ctx)->h_proto; - - CALI_DEBUG("vxlan encap %x : %x\n", bpf_ntohl(ip_hdr(ctx)->saddr), bpf_ntohl(ip_hdr(ctx)->daddr)); - - /* change the checksums last to avoid pointer access revalidation */ - - csum = bpf_csum_diff(0, 0, ctx->ip_header, sizeof(struct iphdr), 0); - ret = bpf_l3_csum_replace(ctx->skb, ((long) ctx->ip_header) - ((long) skb_start_ptr(ctx->skb)) + - offsetof(struct iphdr, check), 0, csum, 0); - -out: - return ret; -} - -static CALI_BPF_INLINE int vxlan_v4_decap(struct __sk_buff *skb) -{ - __u32 extra_hdrsz; - int ret = -1; - - extra_hdrsz = sizeof(struct ethhdr) + sizeof(struct iphdr) + - sizeof(struct udphdr) + sizeof(struct vxlanhdr); - - ret = bpf_skb_adjust_room(skb, -extra_hdrsz, BPF_ADJ_ROOM_MAC | BPF_F_ADJ_ROOM_FIXED_GSO, 0); - - return ret; -} - -static CALI_BPF_INLINE int is_vxlan_tunnel(struct iphdr *ip, __u16 vxlanport) -{ - struct udphdr *udp = (struct udphdr *)(ip +1); - - return ip->protocol == IPPROTO_UDP && - udp->dest == bpf_htons(vxlanport); -} - -static CALI_BPF_INLINE bool vxlan_size_ok(struct cali_tc_ctx *ctx) -{ - return !skb_refresh_validate_ptrs(ctx, UDP_SIZE + sizeof(struct vxlanhdr)); -} - -static CALI_BPF_INLINE __u32 vxlan_vni(struct cali_tc_ctx *ctx) -{ - struct vxlanhdr *vxlan; - - vxlan = 
skb_ptr_after(skb, udp_hdr(ctx)); - - return bpf_ntohl(vxlan->vni << 8); /* 24-bit field, last 8 reserved */ -} - -static CALI_BPF_INLINE bool vxlan_vni_is_valid(struct cali_tc_ctx *ctx) -{ - struct vxlanhdr *vxlan; - - vxlan = skb_ptr_after(ctx->skb, udp_hdr(ctx)); - - return *((__u8*)&vxlan->flags) & (1 << 3); -} - -#define vxlan_udp_csum_ok(udp) ((udp)->check == 0) - -static CALI_BPF_INLINE bool vxlan_v4_encap_too_big(struct cali_tc_ctx *ctx) -{ - __u32 mtu = TUNNEL_MTU; - - /* RFC-1191: MTU is the size in octets of the largest datagram that - * could be forwarded, along the path of the original datagram, without - * being fragmented at this router. The size includes the IP header and - * IP data, and does not include any lower-level headers. - */ - if (ctx->skb->len > sizeof(struct ethhdr) + mtu) { - CALI_DEBUG("SKB too long (len=%d) vs limit=%d\n", ctx->skb->len, mtu); - return true; - } - return false; -} - /* vxlan_attempt_decap tries to decode the packet as VXLAN and, if it is a BPF-to-BPF * program VXLAN packet, does the decap. Returns: * @@ -241,10 +151,14 @@ static CALI_BPF_INLINE int vxlan_attempt_decap(struct cali_tc_ctx *ctx) { /* decap on host ep only if directly for the node */ CALI_DEBUG("VXLAN tunnel packet to %x (host IP=%x)\n", +#ifdef IPVER6 + bpf_ntohl(ip_hdr(ctx)->daddr.in6_u.u6_addr32[3]), +#else bpf_ntohl(ip_hdr(ctx)->daddr), - bpf_ntohl(HOST_IP)); +#endif + debug_ip(HOST_IP)); - if (!rt_addr_is_local_host(ip_hdr(ctx)->daddr)) { + if (!rt_addr_is_local_host((ipv46_addr_t *)&ip_hdr(ctx)->daddr)) { goto fall_through; } if (!vxlan_size_ok(ctx)) { @@ -258,7 +172,7 @@ static CALI_BPF_INLINE int vxlan_attempt_decap(struct cali_tc_ctx *ctx) goto fall_through; } if (vxlan_vni(ctx) != CALI_VXLAN_VNI) { - if (rt_addr_is_remote_host(ip_hdr(ctx)->saddr)) { + if (rt_addr_is_remote_host((ipv46_addr_t *)&ip_hdr(ctx)->saddr)) { /* Not BPF-generated VXLAN packet but it was from a Calico host to this node. 
*/ CALI_DEBUG("VXLAN: non-tunnel calico\n"); goto auto_allow; @@ -267,7 +181,7 @@ static CALI_BPF_INLINE int vxlan_attempt_decap(struct cali_tc_ctx *ctx) CALI_DEBUG("VXLAN: Not our VNI\n"); goto fall_through; } - if (!rt_addr_is_remote_host(ip_hdr(ctx)->saddr)) { + if (!rt_addr_is_remote_host((ipv46_addr_t *)&ip_hdr(ctx)->saddr)) { CALI_DEBUG("VXLAN with our VNI from unexpected source.\n"); deny_reason(ctx, CALI_REASON_UNAUTH_SOURCE); goto deny; @@ -279,19 +193,28 @@ static CALI_BPF_INLINE int vxlan_attempt_decap(struct cali_tc_ctx *ctx) goto deny; } - ctx->arpk.ip = ip_hdr(ctx)->saddr; - ctx->arpk.ifindex = ctx->skb->ifindex; - /* We update the map straight with the packet data, eth header is * dst:src but the value is src:dst so it flips it automatically * when we use it on xmit. */ - cali_v4_arp_update_elem(&ctx->arpk, eth_hdr(ctx), 0); - CALI_DEBUG("ARP update for ifindex %d ip %x\n", ctx->arpk.ifindex, bpf_ntohl(ctx->arpk.ip)); + struct arp_key arpk = { + .ifindex = ctx->skb->ifindex, + }; +#ifdef IPVER6 + ipv6hdr_ip_to_ipv6_addr_t(&arpk.ip, &ip_hdr(ctx)->saddr); +#else + arpk.ip = ip_hdr(ctx)->saddr; +#endif + cali_arp_update_elem(&arpk, eth_hdr(ctx), 0); + CALI_DEBUG("ARP update for ifindex %d ip %x\n", arpk.ifindex, debug_ip(arpk.ip)); +#ifdef IPVER6 + ipv6hdr_ip_to_ipv6_addr_t(&ctx->state->tun_ip, &ip_hdr(ctx)->saddr); +#else ctx->state->tun_ip = ip_hdr(ctx)->saddr; +#endif CALI_DEBUG("vxlan decap\n"); - if (vxlan_v4_decap(ctx->skb)) { + if (vxlan_decap(ctx->skb)) { deny_reason(ctx, CALI_REASON_DECAP_FAIL); goto deny; } @@ -303,7 +226,7 @@ static CALI_BPF_INLINE int vxlan_attempt_decap(struct cali_tc_ctx *ctx) goto deny; } - CALI_DEBUG("vxlan decap origin %x\n", bpf_ntohl(ctx->state->tun_ip)); + CALI_DEBUG("vxlan decap origin %x\n", debug_ip(ctx->state->tun_ip)); fall_through: return 0; @@ -316,4 +239,5 @@ static CALI_BPF_INLINE int vxlan_attempt_decap(struct cali_tc_ctx *ctx) return -1; } + #endif /* __CALI_NAT_H__ */ diff --git a/felix/bpf-gpl/nat4.h 
b/felix/bpf-gpl/nat4.h new file mode 100644 index 00000000000..f79d015024b --- /dev/null +++ b/felix/bpf-gpl/nat4.h @@ -0,0 +1,125 @@ +// Project Calico BPF dataplane programs. +// Copyright (c) 2020-2023 Tigera, Inc. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later + +#ifndef __CALI_NAT4_H__ +#define __CALI_NAT4_H__ + +#include + +#include +#include + +#include "bpf.h" +#include "skb.h" +#include "routes.h" +#include "nat_types.h" + +/* Number of bytes we add to a packet when we do encap. */ +#define VXLAN_ENCAP_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ + sizeof(struct udphdr) + sizeof(struct vxlanhdr)) + +static CALI_BPF_INLINE int vxlan_encap(struct cali_tc_ctx *ctx, __be32 *ip_src, __be32 *ip_dst) +{ + int ret; + __wsum csum; + + __u32 new_hdrsz = sizeof(struct ethhdr) + sizeof(struct iphdr) + + sizeof(struct udphdr) + sizeof(struct vxlanhdr); + + ret = bpf_skb_adjust_room(ctx->skb, new_hdrsz, BPF_ADJ_ROOM_MAC, + BPF_F_ADJ_ROOM_ENCAP_L4_UDP | + BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | + BPF_F_ADJ_ROOM_ENCAP_L2(sizeof(struct ethhdr))); + + if (ret) { + goto out; + } + + ret = -1; + + if (skb_refresh_validate_ptrs(ctx, new_hdrsz)) { + deny_reason(ctx, CALI_REASON_SHORT); + CALI_DEBUG("Too short VXLAN encap\n"); + goto out; + } + + // Note: assuming L2 packet here so this code can't be used on an L3 device. + struct udphdr *udp = (struct udphdr*) ((void *)ip_hdr(ctx) + IP_SIZE); + struct vxlanhdr *vxlan = (void *)(udp + 1); + struct ethhdr *eth_inner = (void *)(vxlan+1); + struct iphdr *ip_inner = (void*)(eth_inner+1); + + /* Copy the original IP header. Since it is already DNATed, the dest IP is + * already set. All we need to do is to change the source IP + */ + *ip_hdr(ctx) = *ip_inner; + + /* decrement TTL for the inner IP header. 
TTL must be > 1 to get here */ + ip_dec_ttl(ip_inner); + + ip_hdr(ctx)->saddr = *ip_src; + ip_hdr(ctx)->daddr = *ip_dst; + ip_hdr(ctx)->tot_len = bpf_htons(bpf_ntohs(ip_hdr(ctx)->tot_len) + new_hdrsz); + ip_hdr(ctx)->ihl = 5; /* in case there were options in ip_inner */ + ip_hdr(ctx)->check = 0; + ip_hdr(ctx)->protocol = IPPROTO_UDP; + + udp->source = udp->dest = bpf_htons(VXLAN_PORT); + udp->len = bpf_htons(bpf_ntohs(ip_hdr(ctx)->tot_len) - sizeof(struct iphdr)); + + *((__u8*)&vxlan->flags) = 1 << 3; /* set the I flag to make the VNI valid */ + vxlan->vni = bpf_htonl(CALI_VXLAN_VNI) >> 8; /* it is actually 24-bit, last 8 reserved */ + + /* keep eth_inner MACs zeroed, it is useless after decap */ + eth_inner->h_proto = eth_hdr(ctx)->h_proto; + + CALI_DEBUG("vxlan encap %x : %x\n", bpf_ntohl(ip_hdr(ctx)->saddr), bpf_ntohl(ip_hdr(ctx)->daddr)); + + /* change the checksums last to avoid pointer access revalidation */ + + csum = bpf_csum_diff(0, 0, ctx->ip_header, sizeof(struct iphdr), 0); + ret = bpf_l3_csum_replace(ctx->skb, ((long) ctx->ip_header) - ((long) skb_start_ptr(ctx->skb)) + + offsetof(struct iphdr, check), 0, csum, 0); + +out: + return ret; +} + +static CALI_BPF_INLINE int vxlan_decap(struct __sk_buff *skb) +{ + __u32 extra_hdrsz; + int ret = -1; + + extra_hdrsz = sizeof(struct ethhdr) + sizeof(struct iphdr) + + sizeof(struct udphdr) + sizeof(struct vxlanhdr); + + ret = bpf_skb_adjust_room(skb, -extra_hdrsz, BPF_ADJ_ROOM_MAC | BPF_F_ADJ_ROOM_FIXED_GSO, 0); + + return ret; +} + +static CALI_BPF_INLINE bool vxlan_size_ok(struct cali_tc_ctx *ctx) +{ + return !skb_refresh_validate_ptrs(ctx, UDP_SIZE + sizeof(struct vxlanhdr)); +} + +static CALI_BPF_INLINE __u32 vxlan_vni(struct cali_tc_ctx *ctx) +{ + struct vxlanhdr *vxlan; + + vxlan = skb_ptr_after(ctx->skb, udp_hdr(ctx)); /* the VXLAN header directly follows the UDP header */ + + return bpf_ntohl(vxlan->vni << 8); /* 24-bit field, last 8 reserved */ +} + +static CALI_BPF_INLINE bool vxlan_vni_is_valid(struct cali_tc_ctx *ctx) +{ + struct vxlanhdr *vxlan; + + vxlan = 
skb_ptr_after(ctx->skb, udp_hdr(ctx)); + + return *((__u8*)&vxlan->flags) & (1 << 3); +} + +#endif /* __CALI_NAT4_H__ */ diff --git a/felix/bpf-gpl/nat6.h b/felix/bpf-gpl/nat6.h new file mode 100644 index 00000000000..d6b0ede2187 --- /dev/null +++ b/felix/bpf-gpl/nat6.h @@ -0,0 +1,113 @@ +// Project Calico BPF dataplane programs. +// Copyright (c) 2020-2023 Tigera, Inc. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later + +#ifndef __CALI_NAT6_H__ +#define __CALI_NAT6_H__ + +#include + +#include +#include + +#include "bpf.h" +#include "skb.h" +#include "routes.h" +#include "nat_types.h" + +/* Number of bytes we add to a packet when we do encap. */ +#define VXLAN_ENCAP_SIZE (sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + \ + sizeof(struct udphdr) + sizeof(struct vxlanhdr)) + +static CALI_BPF_INLINE int vxlan_encap(struct cali_tc_ctx *ctx, ipv6_addr_t *ip_src, ipv6_addr_t *ip_dst) +{ + __u32 new_hdrsz = sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + + sizeof(struct udphdr) + sizeof(struct vxlanhdr); + + if (bpf_skb_adjust_room(ctx->skb, new_hdrsz, BPF_ADJ_ROOM_MAC, + BPF_F_ADJ_ROOM_ENCAP_L4_UDP | + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 | + BPF_F_ADJ_ROOM_ENCAP_L2(sizeof(struct ethhdr)))) { + return -1; + } + + if (skb_refresh_validate_ptrs(ctx, new_hdrsz)) { + deny_reason(ctx, CALI_REASON_SHORT); + CALI_DEBUG("Too short VXLAN encap\n"); + return -1; + } + + // Note: assuming L2 packet here so this code can't be used on an L3 device. + struct udphdr *udp = (struct udphdr*) ((void *)ip_hdr(ctx) + IP_SIZE); + struct vxlanhdr *vxlan = (void *)(udp + 1); + struct ethhdr *eth_inner = (void *)(vxlan+1); + struct ipv6hdr *ip_inner = (void*)(eth_inner+1); + + /* Copy the original IP header. Since it is already DNATed, the dest IP is + * already set. All we need to do is to change the source IP + */ + *ip_hdr(ctx) = *ip_inner; + + /* decrement hop limit for the inner IP header. 
Hop limit must be > 1 to get here */ + ip_inner->hop_limit--; + + ipv6_addr_t_to_ipv6hdr_ip(&ip_hdr(ctx)->saddr, ip_src); + ipv6_addr_t_to_ipv6hdr_ip(&ip_hdr(ctx)->daddr, ip_dst); + ip_hdr(ctx)->payload_len = bpf_htons(bpf_ntohs(ip_hdr(ctx)->payload_len) + new_hdrsz); + ip_hdr(ctx)->nexthdr = IPPROTO_UDP; + + udp->source = udp->dest = bpf_htons(VXLAN_PORT); + udp->len = ip_hdr(ctx)->payload_len; /* IPv6 payload_len excludes the IPv6 header, so it is exactly the UDP length */ + /* XXX we leave udp->check == 0 which is not legal in IPv6, but we are + * the only ones parsing that packet! + */ + + *((__u8*)&vxlan->flags) = 1 << 3; /* set the I flag to make the VNI valid */ + vxlan->vni = bpf_htonl(CALI_VXLAN_VNI) >> 8; /* it is actually 24-bit, last 8 reserved */ + + /* keep eth_inner MACs zeroed, it is useless after decap */ + eth_inner->h_proto = eth_hdr(ctx)->h_proto; + + CALI_DEBUG("vxlan encap %x : %x\n", + bpf_ntohl(ip_hdr(ctx)->saddr.in6_u.u6_addr32[3]), bpf_ntohl(ip_hdr(ctx)->daddr.in6_u.u6_addr32[3])); + + return 0; +} + +static CALI_BPF_INLINE int vxlan_decap(struct __sk_buff *skb) +{ + __u32 extra_hdrsz; + int ret = -1; + + extra_hdrsz = sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + + sizeof(struct udphdr) + sizeof(struct vxlanhdr); + + ret = bpf_skb_adjust_room(skb, -extra_hdrsz, BPF_ADJ_ROOM_MAC | BPF_F_ADJ_ROOM_FIXED_GSO, 0); + + return ret; +} + +static CALI_BPF_INLINE bool vxlan_size_ok(struct cali_tc_ctx *ctx) +{ + return !skb_refresh_validate_ptrs(ctx, UDP_SIZE + sizeof(struct vxlanhdr)); +} + +static CALI_BPF_INLINE __u32 vxlan_vni(struct cali_tc_ctx *ctx) +{ + struct vxlanhdr *vxlan; + + vxlan = skb_ptr_after(ctx->skb, udp_hdr(ctx)); /* the VXLAN header directly follows the UDP header */ + + return bpf_ntohl(vxlan->vni << 8); /* 24-bit field, last 8 reserved */ +} + +static CALI_BPF_INLINE bool vxlan_vni_is_valid(struct cali_tc_ctx *ctx) +{ + struct vxlanhdr *vxlan; + + vxlan = skb_ptr_after(ctx->skb, udp_hdr(ctx)); + + return *((__u8*)&vxlan->flags) & (1 << 
3); +} + +#endif /* __CALI_NAT6_H__ */ diff --git a/felix/bpf-gpl/nat_lookup.h 
b/felix/bpf-gpl/nat_lookup.h index 0c4284d8667..a506ddab9e2 100644 --- a/felix/bpf-gpl/nat_lookup.h +++ b/felix/bpf-gpl/nat_lookup.h @@ -14,46 +14,46 @@ #include "routes.h" #include "nat_types.h" -static CALI_BPF_INLINE struct calico_nat_dest* calico_v4_nat_lookup(__be32 ip_src, - __be32 ip_dst, - __u8 ip_proto, - __u16 dport, - bool from_tun, - nat_lookup_result *res, - int affinity_always_timeo, - bool affinity_tmr_update +static CALI_BPF_INLINE struct calico_nat_dest* calico_nat_lookup(ipv46_addr_t *ip_src, + ipv46_addr_t *ip_dst, + __u8 ip_proto, + __u16 dport, + bool from_tun, + nat_lookup_result *res, + int affinity_always_timeo, + bool affinity_tmr_update #if !(CALI_F_XDP) && !(CALI_F_CGROUP) - , struct cali_tc_ctx *ctx + , struct cali_tc_ctx *ctx #endif - ) + ) { - struct calico_nat_v4_key nat_key = { + struct calico_nat_key nat_key = { .prefixlen = NAT_PREFIX_LEN_WITH_SRC_MATCH_IN_BITS, - .addr = ip_dst, + .addr = *ip_dst, .port = dport, .protocol = ip_proto, - .saddr = ip_src, + .saddr = *ip_src, }; - struct calico_nat_v4_value *nat_lv1_val; - struct calico_nat_secondary_v4_key nat_lv2_key; + struct calico_nat_value *nat_lv1_val; + struct calico_nat_secondary_key nat_lv2_key; struct calico_nat_dest *nat_lv2_val; - struct calico_nat_v4_affinity_key affkey = {}; + struct calico_nat_affinity_key affkey = {}; __u64 now = 0; - nat_lv1_val = cali_v4_nat_fe_lookup_elem(&nat_key); + nat_lv1_val = cali_nat_fe_lookup_elem(&nat_key); switch (nat_key.protocol) { case IPPROTO_UDP: - CALI_DEBUG("NAT: 1st level lookup addr=%x port=%d udp\n", (int)bpf_ntohl(nat_key.addr), (int)dport); + CALI_DEBUG("NAT: 1st level lookup addr=%x port=%d udp\n", (int)debug_ip(nat_key.addr), (int)dport); break; case IPPROTO_TCP: - CALI_DEBUG("NAT: 1st level lookup addr=%x port=%d tcp\n", (int)bpf_ntohl(nat_key.addr), (int)dport); + CALI_DEBUG("NAT: 1st level lookup addr=%x port=%d tcp\n", (int)debug_ip(nat_key.addr), (int)dport); break; case IPPROTO_ICMP: - CALI_DEBUG("NAT: 1st level lookup 
addr=%x port=%d icmp\n", (int)bpf_ntohl(nat_key.addr), (int)dport); + CALI_DEBUG("NAT: 1st level lookup addr=%x port=%d icmp\n", (int)debug_ip(nat_key.addr), (int)dport); break; default: - CALI_DEBUG("NAT: 1st level lookup addr=%x port=%d other\n", (int)bpf_ntohl(nat_key.addr), (int)dport); + CALI_DEBUG("NAT: 1st level lookup addr=%x port=%d other\n", (int)debug_ip(nat_key.addr), (int)dport); break; } @@ -66,7 +66,7 @@ static CALI_BPF_INLINE struct calico_nat_dest* calico_v4_nat_lookup(__be32 ip_sr * straight NAT and avoid a possible extra hop. */ if (!(CALI_F_FROM_WEP || CALI_F_TO_HEP || CALI_F_CGROUP || - (CALI_F_FROM_HEP && from_tun)) || ip_dst == 0xffffffff) { + (CALI_F_FROM_HEP && from_tun)) || ip_equal(*ip_dst, NP_SPECIAL_IP)) { return NULL; } @@ -96,8 +96,8 @@ static CALI_BPF_INLINE struct calico_nat_dest* calico_v4_nat_lookup(__be32 ip_sr return NULL; } - nat_key.addr = 0xffffffff; - nat_lv1_val = cali_v4_nat_fe_lookup_elem(&nat_key); + nat_key.addr = NP_SPECIAL_IP; + nat_lv1_val = cali_nat_fe_lookup_elem(&nat_key); if (!nat_lv1_val) { CALI_DEBUG("NAT: nodeport miss\n"); return NULL; @@ -105,7 +105,7 @@ static CALI_BPF_INLINE struct calico_nat_dest* calico_v4_nat_lookup(__be32 ip_sr CALI_DEBUG("NAT: nodeport hit\n"); } /* With LB source range, we install a drop entry in the NAT FE map - * with count equal to 0xffffffff. If we hit this entry, + * with count equal to all-ones for both ip4/6. If we hit this entry, * packet is dropped. */ if (nat_lv1_val->count == NAT_FE_DROP_COUNT) { @@ -147,34 +147,34 @@ static CALI_BPF_INLINE struct calico_nat_dest* calico_v4_nat_lookup(__be32 ip_sr goto skip_affinity; } - struct calico_nat_v4 nat_data = { - .addr = ip_dst, + struct calico_nat nat_data = { + .addr = *ip_dst, .port = dport, .protocol = ip_proto, }; affkey.nat_key = nat_data; - affkey.client_ip = ip_src; + affkey.client_ip = *ip_src; CALI_DEBUG("NAT: backend affinity %d seconds\n", nat_lv1_val->affinity_timeo ? 
: affinity_always_timeo); - struct calico_nat_v4_affinity_val *affval; + struct calico_nat_affinity_val *affval; now = bpf_ktime_get_ns(); - affval = cali_v4_nat_aff_lookup_elem(&affkey); + affval = cali_nat_aff_lookup_elem(&affkey); if (affval) { int timeo = (affinity_always_timeo ? : nat_lv1_val->affinity_timeo); if (now - affval->ts <= timeo * 1000000000ULL) { CALI_DEBUG("NAT: using affinity backend %x:%d\n", - bpf_ntohl(affval->nat_dest.addr), affval->nat_dest.port); + debug_ip(affval->nat_dest.addr), affval->nat_dest.port); if (affinity_tmr_update) { affval->ts = now; } return &affval->nat_dest; } - CALI_DEBUG("NAT: affinity expired for %x:%d\n", bpf_ntohl(ip_dst), dport); + CALI_DEBUG("NAT: affinity expired for %x:%d\n", debug_ip(*ip_dst), dport); } else { - CALI_DEBUG("no previous affinity for %x:%d", bpf_ntohl(ip_dst), dport); + CALI_DEBUG("no previous affinity for %x:%d", debug_ip(*ip_dst), dport); } /* To be k8s conformant, fall through to pick a random backend. */ @@ -185,23 +185,23 @@ static CALI_BPF_INLINE struct calico_nat_dest* calico_v4_nat_lookup(__be32 ip_sr CALI_DEBUG("NAT: 1st level hit; id=%d ordinal=%d\n", nat_lv2_key.id, nat_lv2_key.ordinal); - if (!(nat_lv2_val = cali_v4_nat_be_lookup_elem(&nat_lv2_key))) { + if (!(nat_lv2_val = cali_nat_be_lookup_elem(&nat_lv2_key))) { CALI_DEBUG("NAT: backend miss\n"); *res = NAT_NO_BACKEND; return NULL; } - CALI_DEBUG("NAT: backend selected %x:%d\n", bpf_ntohl(nat_lv2_val->addr), nat_lv2_val->port); + CALI_DEBUG("NAT: backend selected %x:%d\n", debug_ip(nat_lv2_val->addr), nat_lv2_val->port); if (nat_lv1_val->affinity_timeo != 0 || affinity_always_timeo) { int err; - struct calico_nat_v4_affinity_val val = { + struct calico_nat_affinity_val val = { .ts = now, .nat_dest = *nat_lv2_val, }; - CALI_DEBUG("NAT: updating affinity for client %x\n", bpf_ntohl(ip_src)); - if ((err = cali_v4_nat_aff_update_elem(&affkey, &val, BPF_ANY))) { + CALI_DEBUG("NAT: updating affinity for client %x\n", debug_ip(*ip_src)); + 
if ((err = cali_nat_aff_update_elem(&affkey, &val, BPF_ANY))) { CALI_INFO("NAT: failed to update affinity table: %d\n", err); /* we do carry on, we have a good nat_lv2_val */ } @@ -211,13 +211,13 @@ static CALI_BPF_INLINE struct calico_nat_dest* calico_v4_nat_lookup(__be32 ip_sr } #if !(CALI_F_XDP) && !(CALI_F_CGROUP) -static CALI_BPF_INLINE struct calico_nat_dest* calico_v4_nat_lookup_tc(struct cali_tc_ctx *ctx, - __be32 ip_src, __be32 ip_dst, - __u8 ip_proto, __u16 dport, - bool from_tun, - nat_lookup_result *res) +static CALI_BPF_INLINE struct calico_nat_dest* calico_nat_lookup_tc(struct cali_tc_ctx *ctx, + ipv46_addr_t *ip_src, ipv46_addr_t *ip_dst, + __u8 ip_proto, __u16 dport, + bool from_tun, + nat_lookup_result *res) { - return calico_v4_nat_lookup(ip_src, ip_dst, ip_proto, dport, from_tun, res, 0, false, ctx); + return calico_nat_lookup(ip_src, ip_dst, ip_proto, dport, from_tun, res, 0, false, ctx); } #endif diff --git a/felix/bpf-gpl/nat_types.h b/felix/bpf-gpl/nat_types.h index 3c47a6d72ec..22996b9fbe1 100644 --- a/felix/bpf-gpl/nat_types.h +++ b/felix/bpf-gpl/nat_types.h @@ -14,8 +14,8 @@ typedef enum calico_nat_lookup_result { } nat_lookup_result; -struct calico_nat_v4 { - __u32 addr; // NBO +struct calico_nat { + ipv46_addr_t addr; // NBO __u16 port; // HBO __u8 protocol; }; @@ -24,31 +24,31 @@ struct calico_nat_v4 { * Modified the map from HASH to LPM_TRIE. This is to drop packets outside * src IP range specified for Load Balancer */ -struct __attribute__((__packed__)) calico_nat_v4_key { +struct __attribute__((__packed__)) calico_nat_key { __u32 prefixlen; - __u32 addr; // NBO + ipv46_addr_t addr; // NBO __u16 port; // HBO __u8 protocol; - __u32 saddr; + ipv46_addr_t saddr; __u8 pad; }; /* Prefix len = (dst_addr + port + protocol + src_addr) in bits. 
*/ -#define NAT_PREFIX_LEN_WITH_SRC_MATCH (sizeof(struct calico_nat_v4_key) - \ - sizeof(((struct calico_nat_v4_key*)0)->prefixlen) - \ - sizeof(((struct calico_nat_v4_key*)0)->pad)) +#define NAT_PREFIX_LEN_WITH_SRC_MATCH (sizeof(struct calico_nat_key) - \ + sizeof(((struct calico_nat_key*)0)->prefixlen) - \ + sizeof(((struct calico_nat_key*)0)->pad)) #define NAT_PREFIX_LEN_WITH_SRC_MATCH_IN_BITS (NAT_PREFIX_LEN_WITH_SRC_MATCH * 8) // This is used as a special ID along with count=0 to drop a packet at nat level1 lookup #define NAT_FE_DROP_COUNT 0xffffffff -union calico_nat_v4_lpm_key { +union calico_nat_lpm_key { struct bpf_lpm_trie_key lpm; - struct calico_nat_v4_key key; + struct calico_nat_key key; }; -struct calico_nat_v4_value { +struct calico_nat_value { __u32 id; __u32 count; __u32 local; @@ -59,45 +59,60 @@ struct calico_nat_v4_value { #define NAT_FLG_EXTERNAL_LOCAL 0x1 #define NAT_FLG_INTERNAL_LOCAL 0x2 -CALI_MAP(cali_v4_nat_fe, 3, +#ifdef IPVER6 +CALI_MAP_NAMED(cali_v6_nat_fe, cali_nat_fe, 3, +#else +CALI_MAP_NAMED(cali_v4_nat_fe, cali_nat_fe, 3, +#endif BPF_MAP_TYPE_LPM_TRIE, - union calico_nat_v4_lpm_key, struct calico_nat_v4_value, + union calico_nat_lpm_key, struct calico_nat_value, 64*1024, BPF_F_NO_PREALLOC) // Map: NAT level two. ID and ordinal -> new dest and port. 
-struct calico_nat_secondary_v4_key { +struct calico_nat_secondary_key { __u32 id; __u32 ordinal; }; struct calico_nat_dest { - __u32 addr; + ipv46_addr_t addr; __u16 port; __u8 pad[2]; }; -CALI_MAP_V1(cali_v4_nat_be, +#ifdef IPVER6 +CALI_MAP_NAMED(cali_v6_nat_be, cali_nat_be,, +#else +CALI_MAP_NAMED(cali_v4_nat_be, cali_nat_be,, +#endif BPF_MAP_TYPE_HASH, - struct calico_nat_secondary_v4_key, struct calico_nat_dest, + struct calico_nat_secondary_key, struct calico_nat_dest, 256*1024, BPF_F_NO_PREALLOC) -struct calico_nat_v4_affinity_key { - struct calico_nat_v4 nat_key; - __u32 client_ip; +struct calico_nat_affinity_key { + struct calico_nat nat_key; + ipv46_addr_t client_ip; __u32 padding; }; -struct calico_nat_v4_affinity_val { +struct calico_nat_affinity_val { struct calico_nat_dest nat_dest; +#ifdef IPVER6 + __u32 __pad; +#endif __u64 ts; }; -CALI_MAP_V1(cali_v4_nat_aff, +#ifdef IPVER6 +CALI_MAP_NAMED(cali_v6_nat_aff, cali_nat_aff,, +#else +CALI_MAP_NAMED(cali_v4_nat_aff, cali_nat_aff,, +#endif BPF_MAP_TYPE_LRU_HASH, - struct calico_nat_v4_affinity_key, struct calico_nat_v4_affinity_val, + struct calico_nat_affinity_key, struct calico_nat_affinity_val, 64*1024, 0) struct vxlanhdr { diff --git a/felix/bpf-gpl/parsing.h b/felix/bpf-gpl/parsing.h index 61c64ef3d8c..41103faf926 100644 --- a/felix/bpf-gpl/parsing.h +++ b/felix/bpf-gpl/parsing.h @@ -9,6 +9,7 @@ #include #include +#include "types.h" #include "skb.h" #include "routes.h" @@ -17,153 +18,61 @@ #define PARSING_ALLOW_WITHOUT_ENFORCING_POLICY 2 #define PARSING_ERROR -1 -static CALI_BPF_INLINE int bpf_load_bytes(struct cali_tc_ctx *ctx, __u32 offset, void *buf, __u32 len) -{ - int ret; +static CALI_BPF_INLINE int bpf_load_bytes(struct cali_tc_ctx *ctx, __u32 offset, void *buf, __u32 len); - if (CALI_F_XDP) { -#ifdef BPF_CORE_SUPPORTED - if (bpf_core_enum_value_exists(enum bpf_func_id, BPF_FUNC_xdp_load_bytes)) { - ret = bpf_xdp_load_bytes(ctx->xdp, offset, buf, len); - } else +#ifdef IPVER6 +#include 
"parsing6.h" +#else +#include "parsing4.h" #endif - { - return -22 /* EINVAL */; - } - } else { - ret = bpf_skb_load_bytes(ctx->skb, offset, buf, len); - } - return ret; +#ifdef IPVER6 +static CALI_BPF_INLINE int parse_packet_ip(struct cali_tc_ctx *ctx) +{ + return parse_packet_ip_v6(ctx); } -static CALI_BPF_INLINE int parse_packet_ip(struct cali_tc_ctx *ctx) { - __u16 protocol = 0; - - /* We need to make a decision based on Ethernet protocol, however, - * the protocol number is not available to XDP programs like TC ones. - * In TC programs protocol number is available via skb->protocol. - * For that, in XDP programs we need to parse at least up to Ethernet - * first, before making any decision. But in TC programs we can make - * an initial decision based on Ethernet protocol before parsing packet - * for more headers. - */ - if (CALI_F_XDP) { - if (skb_refresh_validate_ptrs(ctx, UDP_SIZE)) { - deny_reason(ctx, CALI_REASON_SHORT); - CALI_DEBUG("Too short\n"); - goto deny; - } - protocol = bpf_ntohs(eth_hdr(ctx)->h_proto); - } else { - protocol = bpf_ntohs(ctx->skb->protocol); - } - - switch (protocol) { - case ETH_P_IP: - break; - case ETH_P_ARP: - CALI_DEBUG("ARP: allowing packet\n"); - goto allow_no_fib; - case ETH_P_IPV6: - // If IPv6 is supported and enabled, handle the packet - if (GLOBAL_FLAGS & CALI_GLOBALS_IPV6_ENABLED) { - CALI_DEBUG("IPv6 packet, continue with parsing it.\n"); - goto ipv6_packet; - } - // otherwise, drop if the packet is from workload - if (CALI_F_WEP) { - CALI_DEBUG("IPv6 from workload: drop\n"); - goto deny; - } else { // or allow, it the packet is on host interface - CALI_DEBUG("IPv6 on host interface: allow\n"); - goto allow_no_fib; - } - default: - if (CALI_F_WEP) { - CALI_DEBUG("Unknown ethertype (%x), drop\n", protocol); - goto deny; - } else { - CALI_DEBUG("Unknown ethertype on host interface (%x), allow\n", - protocol); - goto allow_no_fib; - } - } - - // In TC programs, parse packet and validate its size. 
This is - // already done for XDP programs at the beginning of the function. - if (!CALI_F_XDP) { - if (skb_refresh_validate_ptrs(ctx, UDP_SIZE)) { - deny_reason(ctx, CALI_REASON_SHORT); - CALI_DEBUG("Too short\n"); - goto deny; - } - } - - CALI_DEBUG("IP id=%d\n",bpf_ntohs(ip_hdr(ctx)->id)); - CALI_DEBUG("IP s=%x d=%x\n", bpf_ntohl(ip_hdr(ctx)->saddr), bpf_ntohl(ip_hdr(ctx)->daddr)); - // Drop malformed IP packets - if (ip_hdr(ctx)->ihl < 5) { - CALI_DEBUG("Drop malformed IP packets\n"); - deny_reason(ctx, CALI_REASON_IP_MALFORMED); - goto deny; - } else if (ip_hdr(ctx)->ihl > 5) { - /* Drop packets with IP options from/to WEP. - * Also drop packets with IP options if the dest IP is not host IP - */ - ctx->ipheader_len = 4 * ip_hdr(ctx)->ihl; - } - CALI_DEBUG("IP ihl=%d bytes\n", ctx->ipheader_len); - - return PARSING_OK; - -ipv6_packet: - // Parse IPv6 header, and perform necessary checks here - return PARSING_OK_V6; - -allow_no_fib: - return PARSING_ALLOW_WITHOUT_ENFORCING_POLICY; - -deny: - return PARSING_ERROR; +static CALI_BPF_INLINE void tc_state_fill_from_iphdr(struct cali_tc_ctx *ctx) +{ + return tc_state_fill_from_iphdr_v6(ctx); +} +#else +static CALI_BPF_INLINE int parse_packet_ip(struct cali_tc_ctx *ctx) +{ + return parse_packet_ip_v4(ctx); } static CALI_BPF_INLINE void tc_state_fill_from_iphdr(struct cali_tc_ctx *ctx) { - ctx->state->ip_src = ip_hdr(ctx)->saddr; - ctx->state->ip_dst = ip_hdr(ctx)->daddr; - ctx->state->pre_nat_ip_dst = ip_hdr(ctx)->daddr; - ctx->state->ip_proto = ip_hdr(ctx)->protocol; - ctx->state->ip_size = ip_hdr(ctx)->tot_len; + return tc_state_fill_from_iphdr_v4(ctx); } +#endif -static CALI_BPF_INLINE void tc_state_fill_from_ipv6hdr(struct cali_tc_ctx *ctx) +static CALI_BPF_INLINE int bpf_load_bytes(struct cali_tc_ctx *ctx, __u32 offset, void *buf, __u32 len) { - // Fill in source ip - ctx->state->ip_src = ipv6_hdr(ctx)->saddr.in6_u.u6_addr32[0]; - ctx->state->ip_src1 = ipv6_hdr(ctx)->saddr.in6_u.u6_addr32[1]; - ctx->state->ip_src2 
= ipv6_hdr(ctx)->saddr.in6_u.u6_addr32[2]; - ctx->state->ip_src3 = ipv6_hdr(ctx)->saddr.in6_u.u6_addr32[3]; - // Fill in dst ip - ctx->state->ip_dst = ipv6_hdr(ctx)->daddr.in6_u.u6_addr32[0]; - ctx->state->ip_dst1 = ipv6_hdr(ctx)->daddr.in6_u.u6_addr32[1]; - ctx->state->ip_dst2 = ipv6_hdr(ctx)->daddr.in6_u.u6_addr32[2]; - ctx->state->ip_dst3 = ipv6_hdr(ctx)->daddr.in6_u.u6_addr32[3]; - // Fill in pre nat ip - ctx->state->pre_nat_ip_dst = ipv6_hdr(ctx)->daddr.in6_u.u6_addr32[0]; - ctx->state->pre_nat_ip_dst1 = ipv6_hdr(ctx)->daddr.in6_u.u6_addr32[1]; - ctx->state->pre_nat_ip_dst2 = ipv6_hdr(ctx)->daddr.in6_u.u6_addr32[2]; - ctx->state->pre_nat_ip_dst3 = ipv6_hdr(ctx)->daddr.in6_u.u6_addr32[3]; - // Fill in other information - ctx->state->ip_proto = ipv6_hdr(ctx)->nexthdr; - ctx->state->ip_size = ipv6_hdr(ctx)->payload_len; + int ret; + +#if CALI_F_XDP +#ifdef BPF_CORE_SUPPORTED + if (bpf_core_enum_value_exists(enum bpf_func_id, BPF_FUNC_xdp_load_bytes)) { + ret = bpf_xdp_load_bytes(ctx->xdp, offset, buf, len); + } else +#endif + { + return -22 /* EINVAL */; + } +#else /* CALI_F_XDP */ + ret = bpf_skb_load_bytes(ctx->skb, offset, buf, len); +#endif /* CALI_F_XDP */ + + return ret; } /* Continue parsing packet based on the IP protocol and fill in relevant fields * in the state (struct cali_tc_state). */ static CALI_BPF_INLINE int tc_state_fill_from_nexthdr(struct cali_tc_ctx *ctx, bool decap) { - if (ip_hdr(ctx)->ihl == 5) { + if (ctx->ipheader_len == 20) { switch (ctx->state->ip_proto) { case IPPROTO_TCP: if (skb_refresh_validate_ptrs(ctx, TCP_SIZE)) { @@ -252,8 +161,8 @@ static CALI_BPF_INLINE int tc_state_fill_from_nexthdr(struct cali_tc_ctx *ctx, b /* CALI_F_FROM_HEP case is handled in vxlan_attempt_decap above since it already decoded * the header. 
*/ if (CALI_F_TO_HEP) { - if (rt_addr_is_remote_host(ctx->state->ip_dst) && - rt_addr_is_local_host(ctx->state->ip_src)) { + if (rt_addr_is_remote_host(&ctx->state->ip_dst) && + rt_addr_is_local_host(&ctx->state->ip_src)) { CALI_DEBUG("VXLAN packet to known Calico host, allow.\n"); goto allow; } else { @@ -279,7 +188,7 @@ static CALI_BPF_INLINE int tc_state_fill_from_nexthdr(struct cali_tc_ctx *ctx, b goto deny; } if (CALI_F_FROM_HEP) { - if (rt_addr_is_remote_host(ctx->state->ip_src)) { + if (rt_addr_is_remote_host(&ctx->state->ip_src)) { CALI_DEBUG("IPIP packet from known Calico host, allow.\n"); goto allow; } else { @@ -288,7 +197,7 @@ static CALI_BPF_INLINE int tc_state_fill_from_nexthdr(struct cali_tc_ctx *ctx, b goto deny; } } else if (CALI_F_TO_HEP && !CALI_F_TUNNEL && !CALI_F_L3_DEV) { - if (rt_addr_is_remote_host(ctx->state->ip_dst)) { + if (rt_addr_is_remote_host(&ctx->state->ip_dst)) { CALI_DEBUG("IPIP packet to known Calico host, allow.\n"); goto allow; } else { @@ -316,4 +225,5 @@ static CALI_BPF_INLINE int tc_state_fill_from_nexthdr(struct cali_tc_ctx *ctx, b return PARSING_ERROR; } + #endif /* __CALI_PARSING_H__ */ diff --git a/felix/bpf-gpl/parsing4.h b/felix/bpf-gpl/parsing4.h new file mode 100644 index 00000000000..1662ee6d4c7 --- /dev/null +++ b/felix/bpf-gpl/parsing4.h @@ -0,0 +1,105 @@ +// Project Calico BPF dataplane programs. +// Copyright (c) 2020-2022 Tigera, Inc. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later + +#ifndef __CALI_PARSING4_H__ +#define __CALI_PARSING4_H__ + +static CALI_BPF_INLINE int parse_packet_ip_v4(struct cali_tc_ctx *ctx) +{ + __u16 protocol = 0; + + /* We need to make a decision based on Ethernet protocol, however, + * the protocol number is not available to XDP programs like TC ones. + * In TC programs protocol number is available via skb->protocol. + * For that, in XDP programs we need to parse at least up to Ethernet + * first, before making any decision. 
But in TC programs we can make + * an initial decision based on Ethernet protocol before parsing packet + * for more headers. + */ +#if CALI_F_XDP + if (skb_refresh_validate_ptrs(ctx, UDP_SIZE)) { + deny_reason(ctx, CALI_REASON_SHORT); + CALI_DEBUG("Too short\n"); + goto deny; + } + protocol = bpf_ntohs(eth_hdr(ctx)->h_proto); +#else + protocol = bpf_ntohs(ctx->skb->protocol); +#endif + + switch (protocol) { + case ETH_P_IP: + break; + case ETH_P_ARP: + CALI_DEBUG("ARP: allowing packet\n"); + goto allow_no_fib; + case ETH_P_IPV6: + // If IPv6 is supported and enabled, handle the packet + if (GLOBAL_FLAGS & CALI_GLOBALS_IPV6_ENABLED) { + CALI_DEBUG("IPv6 packet, continue with parsing it.\n"); + goto ipv6_packet; + } + // otherwise, drop if the packet is from workload + if (CALI_F_WEP) { + CALI_DEBUG("IPv6 from workload: drop\n"); + goto deny; + } else { // or allow, it the packet is on host interface + CALI_DEBUG("IPv6 on host interface: allow\n"); + goto allow_no_fib; + } + default: + if (CALI_F_WEP) { + CALI_DEBUG("Unknown ethertype (%x), drop\n", protocol); + goto deny; + } else { + CALI_DEBUG("Unknown ethertype on host interface (%x), allow\n", + protocol); + goto allow_no_fib; + } + } + + // In TC programs, parse packet and validate its size. This is + // already done for XDP programs at the beginning of the function. 
+#if !CALI_F_XDP + if (skb_refresh_validate_ptrs(ctx, UDP_SIZE)) { + deny_reason(ctx, CALI_REASON_SHORT); + CALI_DEBUG("Too short\n"); + goto deny; + } +#endif + + CALI_DEBUG("IP id=%d\n",bpf_ntohs(ip_hdr(ctx)->id)); + CALI_DEBUG("IP s=%x d=%x\n", bpf_ntohl(ip_hdr(ctx)->saddr), bpf_ntohl(ip_hdr(ctx)->daddr)); + // Drop malformed IP packets + if (ip_hdr(ctx)->ihl < 5) { + CALI_DEBUG("Drop malformed IP packets\n"); + deny_reason(ctx, CALI_REASON_IP_MALFORMED); + goto deny; + } + + return PARSING_OK; + +ipv6_packet: + // Parse IPv6 header, and perform necessary checks here + return PARSING_OK_V6; + +allow_no_fib: + return PARSING_ALLOW_WITHOUT_ENFORCING_POLICY; + +deny: + return PARSING_ERROR; +} + +static CALI_BPF_INLINE void tc_state_fill_from_iphdr_v4(struct cali_tc_ctx *ctx) +{ + ctx->state->ip_src = ip_hdr(ctx)->saddr; + ctx->state->ip_dst = ip_hdr(ctx)->daddr; + ctx->state->pre_nat_ip_dst = ip_hdr(ctx)->daddr; + ctx->state->ip_proto = ip_hdr(ctx)->protocol; + ctx->state->ip_size = ip_hdr(ctx)->tot_len; + ctx->ipheader_len = ctx->state->ihl = ip_hdr(ctx)->ihl * 4; + CALI_DEBUG("IP ihl=%d bytes\n", ctx->ipheader_len); +} + +#endif /* __CALI_PARSING4_H__ */ diff --git a/felix/bpf-gpl/parsing6.h b/felix/bpf-gpl/parsing6.h new file mode 100644 index 00000000000..85fd3f3a3fc --- /dev/null +++ b/felix/bpf-gpl/parsing6.h @@ -0,0 +1,176 @@ +// Project Calico BPF dataplane programs. +// Copyright (c) 2023 Tigera, Inc. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later + +#ifndef __CALI_PARSING6_H__ +#define __CALI_PARSING6_H__ + +#define NEXTHDR_HOP 0 +#define NEXTHDR_ROUTING 43 +#define NEXTHDR_FRAGMENT 44 +#define NEXTHDR_GRE 47 +#define NEXTHDR_ESP 50 +#define NEXTHDR_AUTH 51 +#define NEXTHDR_NONE 59 +#define NEXTHDR_DEST 60 +#define NEXTHDR_MOBILITY 135 + + +static CALI_BPF_INLINE int parse_packet_ip_v6(struct cali_tc_ctx *ctx) { + __u16 protocol = 0; + + /* We need to make a decision based on Ethernet protocol, however, + * the protocol number is not available to XDP programs like TC ones. + * In TC programs protocol number is available via skb->protocol. + * For that, in XDP programs we need to parse at least up to Ethernet + * first, before making any decision. But in TC programs we can make + * an initial decision based on Ethernet protocol before parsing packet + * for more headers. + */ + if (CALI_F_XDP) { + if (skb_refresh_validate_ptrs(ctx, UDP_SIZE)) { + deny_reason(ctx, CALI_REASON_SHORT); + CALI_DEBUG("Too short\n"); + goto deny; + } + protocol = bpf_ntohs(eth_hdr(ctx)->h_proto); + } else { + protocol = bpf_ntohs(ctx->skb->protocol); + } + + switch (protocol) { + case ETH_P_IPV6: + break; + default: + if (CALI_F_WEP) { + CALI_DEBUG("Unknown ethertype (%x), drop\n", protocol); + goto deny; + } else { + CALI_DEBUG("Unknown ethertype on host interface (%x), allow\n", + protocol); + goto allow_no_fib; + } + } + + // In TC programs, parse packet and validate its size. This is + // already done for XDP programs at the beginning of the function. 
+ if (!CALI_F_XDP) { + if (skb_refresh_validate_ptrs(ctx, UDP_SIZE)) { + deny_reason(ctx, CALI_REASON_SHORT); + CALI_DEBUG("Too short\n"); + goto deny; + } + } + + return PARSING_OK_V6; + +allow_no_fib: + return PARSING_ALLOW_WITHOUT_ENFORCING_POLICY; + +deny: + return PARSING_ERROR; +} + +static CALI_BPF_INLINE bool ipv6_hexthdr_is_opt(int nexthdr) +{ + switch(nexthdr) { + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_FRAGMENT: + case NEXTHDR_GRE: + case NEXTHDR_ESP: + case NEXTHDR_AUTH: + case NEXTHDR_NONE: + case NEXTHDR_DEST: + case NEXTHDR_MOBILITY: + return true; + } + + return false; +} + +static CALI_BPF_INLINE void tc_state_fill_from_iphdr_v6(struct cali_tc_ctx *ctx) +{ + // Fill in source ip + ipv6hdr_ip_to_ipv6_addr_t(&ctx->state->ip_src, &ip_hdr(ctx)->saddr); + // Fill in dst ip + ipv6hdr_ip_to_ipv6_addr_t(&ctx->state->ip_dst, &ip_hdr(ctx)->daddr); + // Fill in pre nat ip + ctx->state->pre_nat_ip_dst = ctx->state->ip_dst; + // Fill in other information + ctx->state->ip_size = ip_hdr(ctx)->payload_len; + + int hdr; + + switch (ip_hdr(ctx)->nexthdr) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_ICMPV6: + ctx->ipheader_len = ctx->state->ihl = IP_SIZE; + ctx->state->ip_proto = ip_hdr(ctx)->nexthdr; + goto out; + case NEXTHDR_NONE: + goto deny; + default: + hdr = ip_hdr(ctx)->nexthdr; + } + + CALI_DEBUG("ip->nexthdr %d IPv6 options!\n", ip_hdr(ctx)->nexthdr); + + int i; + int ipoff = skb_iphdr_offset(ctx); + int len = IP_SIZE; + + for (i = 0; i < 8; i++) { + struct ipv6_opt_hdr opt; + + CALI_DEBUG("loading extension at offset %d\n", ipoff + len); + if (bpf_load_bytes(ctx, ipoff + len, &opt, sizeof(opt))) { + CALI_DEBUG("Too short\n"); + goto deny; + } + + CALI_DEBUG("ext nexthdr %d hdrlen %d\n", opt.nexthdr, opt.hdrlen); + + switch(hdr) { + case NEXTHDR_FRAGMENT: + len += 16; + break; + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_DEST: + case NEXTHDR_GRE: + case NEXTHDR_ESP: + case NEXTHDR_AUTH: + case NEXTHDR_MOBILITY: + 
len += (opt.hdrlen + 1) * 8; + break; + } + + switch(opt.nexthdr) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_ICMPV6: + ctx->ipheader_len = ctx->state->ihl = len; + ctx->state->ip_proto = opt.nexthdr; + goto out; + case NEXTHDR_NONE: + goto deny; + } + + + } + +out: + CALI_DEBUG("IP ihl=%d bytes\n", ctx->ipheader_len); + return; + +deny: + if (CALI_F_XDP) { + bpf_exit(XDP_DROP); + } else { + bpf_exit(TC_ACT_SHOT); + } +} + +#endif /* __CALI_PARSING6_H__ */ diff --git a/felix/bpf-gpl/routes.h b/felix/bpf-gpl/routes.h index 8e51d764053..00804899f1b 100644 --- a/felix/bpf-gpl/routes.h +++ b/felix/bpf-gpl/routes.h @@ -12,7 +12,7 @@ struct cali_rt_key { __u32 prefixlen; - __be32 addr; // NBO + ipv46_addr_t addr; // NBO }; union cali_rt_lpm_key { @@ -36,26 +36,34 @@ struct cali_rt { __u32 flags; /* enum cali_rt_flags */ union { // IP encap next hop for remote workload routes. - __u32 next_hop; + ipv46_addr_t next_hop; // Interface index for local workload routes. __u32 if_index; }; }; -CALI_MAP_V1(cali_v4_routes, +#ifdef IPVER6 +CALI_MAP_NAMED(cali_v6_routes, cali_routes,, +#else +CALI_MAP_NAMED(cali_v4_routes, cali_routes,, +#endif BPF_MAP_TYPE_LPM_TRIE, union cali_rt_lpm_key, struct cali_rt, 256*1024, BPF_F_NO_PREALLOC) -static CALI_BPF_INLINE struct cali_rt *cali_rt_lookup(__be32 addr) +static CALI_BPF_INLINE struct cali_rt *cali_rt_lookup(ipv46_addr_t *addr) { union cali_rt_lpm_key k; +#ifdef IPVER6 + k.key.prefixlen = 128; +#else k.key.prefixlen = 32; - k.key.addr = addr; - return cali_v4_routes_lookup_elem(&k); +#endif + k.key.addr = *addr; + return cali_routes_lookup_elem(&k); } -static CALI_BPF_INLINE enum cali_rt_flags cali_rt_lookup_flags(__be32 addr) +static CALI_BPF_INLINE enum cali_rt_flags cali_rt_lookup_flags(ipv46_addr_t *addr) { struct cali_rt *rt = cali_rt_lookup(addr); if (!rt) { @@ -77,22 +85,22 @@ static CALI_BPF_INLINE enum cali_rt_flags cali_rt_lookup_flags(__be32 addr) #define cali_rt_flags_remote_tunneled_host(t) (((t) & 
(CALI_RT_LOCAL | CALI_RT_HOST | CALI_RT_TUNNELED)) == (CALI_RT_HOST | CALI_RT_TUNNELED)) #define cali_rt_flags_local_tunneled_host(t) (((t) & (CALI_RT_LOCAL | CALI_RT_HOST | CALI_RT_TUNNELED)) == (CALI_RT_LOCAL | CALI_RT_HOST | CALI_RT_TUNNELED)) -static CALI_BPF_INLINE bool rt_addr_is_local_host(__be32 addr) +static CALI_BPF_INLINE bool rt_addr_is_local_host(ipv46_addr_t *addr) { return cali_rt_flags_local_host(cali_rt_lookup_flags(addr)); } -static CALI_BPF_INLINE bool rt_addr_is_remote_host(__be32 addr) +static CALI_BPF_INLINE bool rt_addr_is_remote_host(ipv46_addr_t *addr) { return cali_rt_flags_remote_host(cali_rt_lookup_flags(addr)); } -static CALI_BPF_INLINE bool rt_addr_is_remote_tunneled_host(__be32 addr) +static CALI_BPF_INLINE bool rt_addr_is_remote_tunneled_host(ipv46_addr_t *addr) { return cali_rt_flags_remote_tunneled_host(cali_rt_lookup_flags(addr)); } -static CALI_BPF_INLINE bool rt_addr_is_local_tunneled_host(__be32 addr) +static CALI_BPF_INLINE bool rt_addr_is_local_tunneled_host(ipv46_addr_t *addr) { return cali_rt_flags_local_tunneled_host(cali_rt_lookup_flags(addr)); } diff --git a/felix/bpf-gpl/rpf.h b/felix/bpf-gpl/rpf.h index 5d5933d406c..5fc95de123f 100644 --- a/felix/bpf-gpl/rpf.h +++ b/felix/bpf-gpl/rpf.h @@ -7,11 +7,12 @@ #include "types.h" #include "skb.h" +#include "routes.h" static CALI_BPF_INLINE bool wep_rpf_check(struct cali_tc_ctx *ctx, struct cali_rt *r) { CALI_DEBUG("Workload RPF check src=%x skb iface=%d.\n", - bpf_ntohl(ctx->state->ip_src), ctx->skb->ifindex); + debug_ip(ctx->state->ip_src), ctx->skb->ifindex); if (!r) { CALI_INFO("Workload RPF fail: missing route.\n"); return false; @@ -31,6 +32,9 @@ static CALI_BPF_INLINE bool wep_rpf_check(struct cali_tc_ctx *ctx, struct cali_r static CALI_BPF_INLINE bool hep_rpf_check(struct cali_tc_ctx *ctx) { +#ifdef IPVER6 + return true; +#else bool ret = false; bool strict; @@ -62,18 +66,19 @@ static CALI_BPF_INLINE bool hep_rpf_check(struct cali_tc_ctx *ctx) if (strict) { ret = 
ctx->skb->ingress_ifindex == fib_params.ifindex; CALI_DEBUG("Host RPF check src=%x skb strict if %d\n", - bpf_ntohl(ctx->state->ip_src), fib_params.ifindex); + debug_ip(ctx->state->ip_src), fib_params.ifindex); } else { ret = fib_params.ifindex != CT_INVALID_IFINDEX; CALI_DEBUG("Host RPF check src=%x skb loose if %d\n", - bpf_ntohl(ctx->state->ip_src), fib_params.ifindex); + debug_ip(ctx->state->ip_src), fib_params.ifindex); } } CALI_DEBUG("Host RPF check src=%x skb iface=%d\n", - bpf_ntohl(ctx->state->ip_src), ctx->skb->ifindex); + debug_ip(ctx->state->ip_src), ctx->skb->ifindex); CALI_DEBUG("Host RPF check rc %d result %d\n", rc, ret); return ret; +#endif } #endif /* __CALI_FIB_H__ */ diff --git a/felix/bpf-gpl/sendrecv.h b/felix/bpf-gpl/sendrecv.h index da2939e2083..1a69cb59ef2 100644 --- a/felix/bpf-gpl/sendrecv.h +++ b/felix/bpf-gpl/sendrecv.h @@ -5,33 +5,41 @@ #ifndef __SENDRECV_H__ #define __SENDRECV_H__ -struct sendrecv4_key { +struct sendrec_key { __u64 cookie; - __u32 ip; + ipv46_addr_t ip; __u32 port; /* because bpf_sock_addr uses 32bit and we would need padding */ }; -struct sendrecv4_val { - __u32 ip; +struct sendrec_val { + ipv46_addr_t ip; __u32 port; /* because bpf_sock_addr uses 32bit and we would need padding */ }; -CALI_MAP_V1(cali_v4_srmsg, +#ifdef IPVER6 +CALI_MAP_NAMED(cali_v6_srmsg, cali_srmsg,, +#else +CALI_MAP_NAMED(cali_v4_srmsg, cali_srmsg,, +#endif BPF_MAP_TYPE_LRU_HASH, - struct sendrecv4_key, struct sendrecv4_val, + struct sendrec_key, struct sendrec_val, 510000, 0) struct ct_nats_key { __u64 cookie; - __u32 ip; + ipv46_addr_t ip; __u32 port; /* because bpf_sock_addr uses 32bit */ __u8 proto; __u8 pad[7]; }; -CALI_MAP_V1(cali_v4_ct_nats, +#ifdef IPVER6 +CALI_MAP_NAMED(cali_v6_ct_nats, cali_ct_nats ,, +#else +CALI_MAP_NAMED(cali_v4_ct_nats, cali_ct_nats ,, +#endif BPF_MAP_TYPE_LRU_HASH, - struct ct_nats_key, struct sendrecv4_val, + struct ct_nats_key, struct sendrec_val, 10000, 0) static CALI_BPF_INLINE __u16 ctx_port_to_host(__u32 
port) diff --git a/felix/bpf-gpl/skb.h b/felix/bpf-gpl/skb.h index 6b0260e6219..cb66066bbb8 100644 --- a/felix/bpf-gpl/skb.h +++ b/felix/bpf-gpl/skb.h @@ -52,13 +52,13 @@ static CALI_BPF_INLINE void *skb_end_ptr(struct __sk_buff *skb) { * Fresh values are loaded using skb_start/end_ptr. */ static CALI_BPF_INLINE void skb_refresh_start_end(struct cali_tc_ctx *ctx) { - if (CALI_F_XDP) { - ctx->data_start = (void *)(long)ctx->xdp->data; - ctx->data_end = (void *)(long)ctx->xdp->data_end; - } else { - ctx->data_start = skb_start_ptr(ctx->skb); - ctx->data_end = skb_end_ptr(ctx->skb); - } +#if CALI_F_XDP + ctx->data_start = (void *)(long)ctx->xdp->data; + ctx->data_end = (void *)(long)ctx->xdp->data_end; +#else + ctx->data_start = skb_start_ptr(ctx->skb); + ctx->data_end = skb_end_ptr(ctx->skb); +#endif } /* skb_iphdr_offset returns the expected offset of the IP header for this type of program. @@ -92,17 +92,17 @@ static CALI_BPF_INLINE long skb_iphdr_offset(struct cali_tc_ctx *ctx) * - ctx->ip_header * - ctx->nh/tcp_header/udp_header/icmp_header. */ -static CALI_BPF_INLINE int skb_refresh_validate_ptrs(struct cali_tc_ctx *ctx, long nh_len) { +static CALI_BPF_INLINE int skb_refresh_validate_ptrs(struct cali_tc_ctx *ctx, long nh_len) +{ int min_size = skb_iphdr_offset(ctx) + IP_SIZE; skb_refresh_start_end(ctx); if (ctx->data_start + (min_size + nh_len) > ctx->data_end) { // This is an XDP program and there is not enough data for next header. - if (CALI_F_XDP) { - CALI_DEBUG("Too short to have %d bytes for next header\n", - min_size + nh_len); - return -2; - } - +#if CALI_F_XDP + CALI_DEBUG("Too short to have %d bytes for next header\n", + min_size + nh_len); + return -2; +#else // Try to pull in more data. Ideally enough for TCP, or, failing that, the // minimum we've been asked for. 
if (nh_len > TCP_SIZE || bpf_skb_pull_data(ctx->skb, min_size + TCP_SIZE)) { @@ -117,10 +117,11 @@ static CALI_BPF_INLINE int skb_refresh_validate_ptrs(struct cali_tc_ctx *ctx, lo if (ctx->data_start + (min_size + nh_len) > ctx->data_end) { return -2; } +#endif } // Success, refresh the ip_header/nh fields in the context. ctx->ip_header = ctx->data_start + skb_iphdr_offset(ctx); - ctx->ipheader_len = 4 * ip_hdr(ctx)->ihl; + return 0; } diff --git a/felix/bpf-gpl/tc.c b/felix/bpf-gpl/tc.c index d5f5c377441..82b557d8816 100644 --- a/felix/bpf-gpl/tc.c +++ b/felix/bpf-gpl/tc.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -37,7 +36,6 @@ #include "fib.h" #include "rpf.h" #include "parsing.h" -#include "ipv6.h" #include "tc.h" #include "failsafe.h" #include "metadata.h" @@ -46,6 +44,8 @@ #define HAS_HOST_CONFLICT_PROG CALI_F_TO_HEP +#define STATE (ctx->state) + /* calico_tc_main is the main function used in all of the tc programs. It is specialised * for particular hook at build time based on the CALI_F build flags. */ @@ -74,7 +74,11 @@ int calico_tc_main(struct __sk_buff *skb) struct cali_tc_ctx *ctx = &_ctx; CALI_DEBUG("New packet at ifindex=%d; mark=%x\n", skb->ifindex, skb->mark); +#ifdef IPVER6 + parse_packet_ip_v6(ctx); +#else parse_packet_ip(ctx); +#endif CALI_DEBUG("Final result=ALLOW (%d). Bypass mark set.\n", CALI_REASON_BYPASS); } return TC_ACT_UNSPEC; @@ -158,15 +162,15 @@ int calico_tc_main(struct __sk_buff *skb) /* Parse the packet as far as the IP header; as a side-effect this validates the packet size * is large enough for UDP. */ switch (parse_packet_ip(ctx)) { +#ifdef IPVER6 + case PARSING_OK_V6: + // IPv6 Packet. + break; +#else case PARSING_OK: // IPv4 Packet. 
break; - case PARSING_OK_V6: - // An IPv6 packet, so we should jump to the relevant IPv6 programs - CALI_DEBUG("About to jump to IPv6 prologue program\n"); - CALI_JUMP_TO(ctx, PROG_INDEX_V6_PROLOGUE); - CALI_DEBUG("Jump to IPv6 prologue failed.\n"); - goto deny; +#endif case PARSING_ALLOW_WITHOUT_ENFORCING_POLICY: // A packet that we automatically let through fwd_fib_set(&ctx->fwd, false); @@ -184,10 +188,6 @@ int calico_tc_main(struct __sk_buff *skb) allow: finalize: return forward_or_drop(ctx); - -deny: - ctx->fwd.res = TC_ACT_SHOT; - goto finalize; } static CALI_BPF_INLINE int pre_policy_processing(struct cali_tc_ctx *ctx) @@ -206,7 +206,7 @@ static CALI_BPF_INLINE int pre_policy_processing(struct cali_tc_ctx *ctx) /* Now we've got as far as the UDP header, check if this is one of our VXLAN packets, which we * use to forward traffic for node ports. */ if (dnat_should_decap() /* Compile time: is this a BPF program that should decap packets? */ && - is_vxlan_tunnel(ip_hdr(ctx), VXLAN_PORT) /* Is this a VXLAN packet? */ ) { + is_vxlan_tunnel(ctx, VXLAN_PORT) /* Is this a VXLAN packet? */ ) { /* Decap it; vxlan_attempt_decap will revalidate the packet if needed. */ switch (vxlan_attempt_decap(ctx)) { case -1: @@ -299,7 +299,7 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) * IP stack do the RPF check on the source, dest is not important. 
*/ goto deny; - } else if (!wep_rpf_check(ctx, cali_rt_lookup(ctx->state->ip_src))) { + } else if (!wep_rpf_check(ctx, cali_rt_lookup(&ctx->state->ip_src))) { goto deny; } } @@ -381,10 +381,10 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) nat_lookup_result nat_res = NAT_LOOKUP_ALLOW; if (CALI_F_TO_HOST || (CALI_F_FROM_HOST && !skb_seen(ctx->skb) && !ctx->nat_dest /* no sport conflcit */)) { - ctx->nat_dest = calico_v4_nat_lookup_tc(ctx, - ctx->state->ip_src, ctx->state->ip_dst, - ctx->state->ip_proto, ctx->state->dport, - ctx->state->tun_ip != 0, &nat_res); + ctx->nat_dest = calico_nat_lookup_tc(ctx, + &ctx->state->ip_src, &ctx->state->ip_dst, + ctx->state->ip_proto, ctx->state->dport, + !ip_void(ctx->state->tun_ip), &nat_res); } if (nat_res == NAT_FE_LOOKUP_DROP) { @@ -399,7 +399,7 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) /* send icmp port unreachable if there is no backend for a service */ ctx->state->icmp_type = ICMP_DEST_UNREACH; ctx->state->icmp_code = ICMP_PORT_UNREACH; - ctx->state->tun_ip = 0; + ip_set_void(ctx->state->tun_ip); goto icmp_send_reply; } else { ctx->state->post_nat_ip_dst = ctx->state->ip_dst; @@ -409,14 +409,14 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) syn_force_policy: /* DNAT in state is set correctly now */ - if ((!(ctx->state->tun_ip) && CALI_F_FROM_HEP) && !CALI_F_NAT_IF && !CALI_F_LO) { + if ((ip_void(ctx->state->tun_ip) && CALI_F_FROM_HEP) && !CALI_F_NAT_IF && !CALI_F_LO) { if (!hep_rpf_check(ctx)) { goto deny; } } if (CALI_F_TO_WEP && !skb_seen(ctx->skb) && - cali_rt_flags_local_host(cali_rt_lookup_flags(ctx->state->ip_src))) { + cali_rt_flags_local_host(cali_rt_lookup_flags(&ctx->state->ip_src))) { /* Host to workload traffic always allowed. We discount traffic that was * seen by another program since it must have come in via another interface. 
*/ @@ -425,7 +425,7 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) } if (CALI_F_FROM_WEP) { - struct cali_rt *r = cali_rt_lookup(ctx->state->ip_src); + struct cali_rt *r = cali_rt_lookup(&ctx->state->ip_src); /* Do RPF check since it's our responsibility to police that. */ if (!wep_rpf_check(ctx, r)) { goto deny; @@ -433,7 +433,7 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) // Check whether the workload needs outgoing NAT to this address. if (r->flags & CALI_RT_NAT_OUT) { - if (!(cali_rt_lookup_flags(ctx->state->post_nat_ip_dst) & CALI_RT_IN_POOL)) { + if (!(cali_rt_lookup_flags(&ctx->state->post_nat_ip_dst) & CALI_RT_IN_POOL)) { CALI_DEBUG("Source is in NAT-outgoing pool " "but dest is not, need to SNAT.\n"); ctx->state->flags |= CALI_ST_NAT_OUTGOING; @@ -441,10 +441,10 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) } /* If 3rd party CNI is used and dest is outside cluster. See commit fc711b192f for details. 
*/ if (!(r->flags & CALI_RT_IN_POOL)) { - CALI_DEBUG("Source %x not in IP pool\n", bpf_ntohl(ctx->state->ip_src)); - r = cali_rt_lookup(ctx->state->post_nat_ip_dst); + CALI_DEBUG("Source %x not in IP pool\n", debug_ip(ctx->state->ip_src)); + r = cali_rt_lookup(&ctx->state->post_nat_ip_dst); if (!r || !(r->flags & (CALI_RT_WORKLOAD | CALI_RT_HOST))) { - CALI_DEBUG("Outside cluster dest %x\n", bpf_ntohl(ctx->state->post_nat_ip_dst)); + CALI_DEBUG("Outside cluster dest %x\n", debug_ip(ctx->state->post_nat_ip_dst)); ctx->state->flags |= CALI_ST_SKIP_FIB; } } @@ -464,7 +464,7 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) ctx->state->nat_dest.addr = ctx->nat_dest->addr; ctx->state->nat_dest.port = ctx->nat_dest->port; } else { - ctx->state->nat_dest.addr = 0; + ip_set_void(ctx->state->nat_dest.addr); ctx->state->nat_dest.port = 0; } @@ -484,15 +484,15 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) // If we didn't find a CTLB NAT entry then use the packet's own IP/port for the // pre-DNAT values that's set by tc_state_fill_from_iphdr() and // tc_state_fill_from_nextheader(). - struct sendrecv4_val *revnat = cali_v4_ct_nats_lookup_elem(&ct_nkey); + struct sendrec_val *revnat = cali_ct_nats_lookup_elem(&ct_nkey); if (revnat) { - CALI_DEBUG("Got cali_v4_ct_nats entry; flow was NATted by CTLB.\n"); + CALI_DEBUG("Got cali_ct_nats entry; flow was NATted by CTLB.\n"); ctx->state->pre_nat_ip_dst = revnat->ip; ctx->state->pre_nat_dport = ctx_port_to_host(revnat->port); } } - if (!forwarding && rt_addr_is_local_host(ctx->state->ip_src)) { + if (!forwarding && rt_addr_is_local_host(&ctx->state->ip_src)) { CALI_DEBUG("Source IP is local host.\n"); if (CALI_F_TO_HEP && is_failsafe_out(ctx->state->ip_proto, ctx->state->post_nat_dport, ctx->state->post_nat_ip_dst)) { CALI_DEBUG("Outbound failsafe port: %d. 
Skip policy.\n", ctx->state->post_nat_dport); @@ -502,10 +502,10 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) ctx->state->flags |= CALI_ST_SRC_IS_HOST; } - struct cali_rt *dest_rt = cali_rt_lookup(ctx->state->post_nat_ip_dst); + struct cali_rt *dest_rt = cali_rt_lookup(&ctx->state->post_nat_ip_dst); if (!dest_rt) { - CALI_DEBUG("No route for post DNAT dest %x\n", bpf_ntohl(ctx->state->post_nat_ip_dst)); + CALI_DEBUG("No route for post DNAT dest %x\n", debug_ip(ctx->state->post_nat_ip_dst)); if (CALI_F_FROM_HEP) { /* Disable FIB, let the packet go through the host after it is * policed. It is ingress into the system and we do not know what @@ -531,7 +531,7 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) if (CALI_F_TO_HEP && ctx->nat_dest && !skb_seen(ctx->skb) && !(ctx->state->flags & CALI_ST_HOST_PSNAT)) { CALI_DEBUG("Host accesses nodeport backend %x:%d\n", - bpf_htonl(ctx->state->post_nat_ip_dst), ctx->state->post_nat_dport); + debug_ip(ctx->state->post_nat_ip_dst), ctx->state->post_nat_dport); CALI_DEBUG("Host accesses nodeport state->flags 0x%x\n", ctx->state->flags); if (cali_rt_flags_local_workload(dest_rt->flags)) { CALI_DEBUG("NP redir on HEP - skip policy\n"); @@ -594,18 +594,16 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, size_t l4_csum_off, bool ct_related, int ct_rc, - struct ct_create_ctx ct_ctx_nat, + struct ct_create_ctx *ct_ctx_nat, bool *is_dnat, __u32 *seen_mark, bool in_place) { - int res = 0; bool encap_needed = false; - struct cali_tc_state *state = ctx->state; switch (ct_rc){ case CALI_CT_ESTABLISHED_DNAT: - if (CALI_F_FROM_HEP && state->tun_ip && ct_result_np_node(state->ct_result)) { + if (CALI_F_FROM_HEP && !ip_void(STATE->tun_ip) && ct_result_np_node(STATE->ct_result)) { /* Packet is returning from a NAT tunnel, * already SNATed, just forward it. 
*/ @@ -613,8 +611,8 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, CALI_DEBUG("returned from NAT tunnel\n"); goto allow; } - state->post_nat_ip_dst = state->ct_result.nat_ip; - state->post_nat_dport = state->ct_result.nat_port; + STATE->post_nat_ip_dst = STATE->ct_result.nat_ip; + STATE->post_nat_dport = STATE->ct_result.nat_port; /* fall through */ @@ -622,10 +620,10 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, /* We may not do a true DNAT here if we are resolving service source port * conflict with host->pod w/o service. See calico_tc_host_ct_conflict(). */ - *is_dnat = state->ip_dst != state->post_nat_ip_dst || state->dport != state->post_nat_dport; + *is_dnat = !ip_equal(STATE->ip_dst, STATE->post_nat_ip_dst) || STATE->dport != STATE->post_nat_dport; CALI_DEBUG("CT: DNAT to %x:%d\n", - bpf_ntohl(state->post_nat_ip_dst), state->post_nat_dport); + debug_ip(STATE->post_nat_ip_dst), STATE->post_nat_dport); encap_needed = dnat_should_encap(); @@ -640,29 +638,29 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, /* When we need to encap, we need to find out if the backend is * local or not. If local, we actually do not need the encap. 
*/ - rt = cali_rt_lookup(state->post_nat_ip_dst); + rt = cali_rt_lookup(&STATE->post_nat_ip_dst); if (!rt) { deny_reason(ctx, CALI_REASON_RT_UNKNOWN); goto deny; } CALI_DEBUG("rt found for 0x%x local %d\n", - bpf_ntohl(state->post_nat_ip_dst), !!cali_rt_is_local(rt)); + debug_ip(STATE->post_nat_ip_dst), !!cali_rt_is_local(rt)); encap_needed = !cali_rt_is_local(rt); if (encap_needed) { - if (CALI_F_FROM_HEP && state->tun_ip == 0) { + if (CALI_F_FROM_HEP && ip_void(STATE->tun_ip)) { if (CALI_F_DSR) { - ct_ctx_nat.flags |= CALI_CT_FLAG_DSR_FWD | - (state->ct_result.flags & CALI_CT_FLAG_NP_NO_DSR); + ct_ctx_nat->flags |= CALI_CT_FLAG_DSR_FWD | + (STATE->ct_result.flags & CALI_CT_FLAG_NP_NO_DSR); } - ct_ctx_nat.flags |= CALI_CT_FLAG_NP_FWD; + ct_ctx_nat->flags |= CALI_CT_FLAG_NP_FWD; } - ct_ctx_nat.allow_return = true; - ct_ctx_nat.tun_ip = rt->next_hop; - state->ip_dst = rt->next_hop; + ct_ctx_nat->allow_return = true; + ct_ctx_nat->tun_ip = rt->next_hop; + STATE->ip_dst = rt->next_hop; } else if (cali_rt_is_workload(rt) && - state->ip_dst != state->post_nat_ip_dst && + !ip_equal(STATE->ip_dst, STATE->post_nat_ip_dst) && !CALI_F_NAT_IF) { /* Packet arrived from a HEP for a workload and we're * about to NAT it. We can't rely on the kernel's RPF check @@ -675,82 +673,84 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, * rule is used. 
*/ - ct_ctx_nat.flags |= CALI_CT_FLAG_EXT_LOCAL; - ctx->state->ct_result.flags |= CALI_CT_FLAG_EXT_LOCAL; + ct_ctx_nat->flags |= CALI_CT_FLAG_EXT_LOCAL; + STATE->ct_result.flags |= CALI_CT_FLAG_EXT_LOCAL; CALI_DEBUG("CT_NEW marked with FLAG_EXT_LOCAL\n"); } } - if (CALI_F_FROM_WEP && state->ip_src == state->post_nat_ip_dst) { + if (CALI_F_FROM_WEP && ip_equal(STATE->ip_src, STATE->post_nat_ip_dst)) { CALI_DEBUG("New loopback SNAT\n"); - ct_ctx_nat.flags |= CALI_CT_FLAG_SVC_SELF; - ctx->state->ct_result.flags |= CALI_CT_FLAG_SVC_SELF; + ct_ctx_nat->flags |= CALI_CT_FLAG_SVC_SELF; + STATE->ct_result.flags |= CALI_CT_FLAG_SVC_SELF; } - ct_ctx_nat.type = CALI_CT_TYPE_NAT_REV; + ct_ctx_nat->type = CALI_CT_TYPE_NAT_REV; int err; - if ((err = conntrack_create(ctx, &ct_ctx_nat))) { + if ((err = conntrack_create(ctx, ct_ctx_nat))) { CALI_DEBUG("Creating NAT conntrack failed with %d\n", err); goto deny; } - state->ct_result.nat_sip = ct_ctx_nat.src; - state->ct_result.nat_sport = ct_ctx_nat.sport; + STATE->ct_result.nat_sip = ct_ctx_nat->src; + STATE->ct_result.nat_sport = ct_ctx_nat->sport; } else { - if (encap_needed && ct_result_np_node(state->ct_result)) { - CALI_DEBUG("CT says encap to node %x\n", bpf_ntohl(state->ct_result.tun_ip)); - state->ip_dst = state->ct_result.tun_ip; + if (encap_needed && ct_result_np_node(STATE->ct_result)) { + CALI_DEBUG("CT says encap to node %x\n", debug_ip(STATE->ct_result.tun_ip)); + STATE->ip_dst = STATE->ct_result.tun_ip; } else { encap_needed = false; } } if (encap_needed) { - if (!(state->ip_proto == IPPROTO_TCP && skb_is_gso(ctx->skb)) && - ip_is_dnf(ip_hdr(ctx)) && vxlan_v4_encap_too_big(ctx)) { + if (!(STATE->ip_proto == IPPROTO_TCP && skb_is_gso(ctx->skb)) && + ip_is_dnf(ip_hdr(ctx)) && vxlan_encap_too_big(ctx)) { CALI_DEBUG("Request packet with DNF set is too big\n"); goto icmp_too_big; } - state->ip_src = HOST_IP; + STATE->ip_src = HOST_IP; *seen_mark = CALI_SKB_MARK_BYPASS_FWD; /* Do FIB if possible */ CALI_DEBUG("marking 
CALI_SKB_MARK_BYPASS_FWD\n"); goto nat_encap; } - ip_hdr(ctx)->saddr = state->ct_result.nat_sip; - ip_hdr(ctx)->daddr = state->post_nat_ip_dst; + ip_hdr_set_ip(ctx, saddr, STATE->ct_result.nat_sip); + ip_hdr_set_ip(ctx, daddr, STATE->post_nat_ip_dst); - switch (ip_hdr(ctx)->protocol) { + switch (STATE->ip_proto) { case IPPROTO_TCP: - if (state->ct_result.nat_sport) { + if (STATE->ct_result.nat_sport) { CALI_DEBUG("Fixing TCP source port from %d to %d\n", - bpf_ntohs(tcp_hdr(ctx)->source), state->ct_result.nat_sport); - tcp_hdr(ctx)->source = bpf_htons(state->ct_result.nat_sport); + bpf_ntohs(tcp_hdr(ctx)->source), STATE->ct_result.nat_sport); + tcp_hdr(ctx)->source = bpf_htons(STATE->ct_result.nat_sport); } - tcp_hdr(ctx)->dest = bpf_htons(state->post_nat_dport); + tcp_hdr(ctx)->dest = bpf_htons(STATE->post_nat_dport); break; case IPPROTO_UDP: - if (state->ct_result.nat_sport) { + if (STATE->ct_result.nat_sport) { CALI_DEBUG("Fixing UDP source port from %d to %d\n", - bpf_ntohs(udp_hdr(ctx)->source), state->ct_result.nat_sport); - udp_hdr(ctx)->source = bpf_htons(state->ct_result.nat_sport); + bpf_ntohs(udp_hdr(ctx)->source), STATE->ct_result.nat_sport); + udp_hdr(ctx)->source = bpf_htons(STATE->ct_result.nat_sport); } - udp_hdr(ctx)->dest = bpf_htons(state->post_nat_dport); + udp_hdr(ctx)->dest = bpf_htons(STATE->post_nat_dport); break; } CALI_DEBUG("L3 csum at %d L4 csum at %d\n", l3_csum_off, l4_csum_off); if (l4_csum_off) { - res = skb_nat_l4_csum_ipv4(ctx, l4_csum_off, - state->ip_src, - state->ct_result.nat_sip, - state->ip_dst, - state->post_nat_ip_dst, - bpf_htons(state->dport), - bpf_htons(state->post_nat_dport), - bpf_htons(state->sport), - bpf_htons(state->ct_result.nat_sport ? : state->sport), - ip_hdr(ctx)->protocol == IPPROTO_UDP ? 
BPF_F_MARK_MANGLED_0 : 0); + if (skb_nat_l4_csum(ctx, l4_csum_off, + STATE->ip_src, + STATE->ct_result.nat_sip, + STATE->ip_dst, + STATE->post_nat_ip_dst, + bpf_htons(STATE->dport), + bpf_htons(STATE->post_nat_dport), + bpf_htons(STATE->sport), + bpf_htons(STATE->ct_result.nat_sport ? : STATE->sport), + STATE->ip_proto == IPPROTO_UDP ? BPF_F_MARK_MANGLED_0 : 0)) { + goto deny; + } } if (!in_place) { @@ -768,24 +768,24 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, offset += ctx->ipheader_len; - if (bpf_skb_store_bytes(ctx->skb, offset, ctx->scratch->l4, 8, 0)) { + if (bpf_skb_store_bytes(ctx->skb, offset, ctx->nh, 8, 0)) { CALI_DEBUG("Too short\n"); deny_reason(ctx, CALI_REASON_SHORT); goto deny; } } - res |= bpf_l3_csum_replace(ctx->skb, l3_csum_off, state->ip_src, state->ct_result.nat_sip, 4); - res |= bpf_l3_csum_replace(ctx->skb, l3_csum_off, state->ip_dst, state->post_nat_ip_dst, 4); - /* From now on, the packet has a new source IP */ - if (state->ct_result.nat_sip) { - state->ip_src = state->ct_result.nat_sip; - } - - if (res) { +#ifndef IPVER6 + if (bpf_l3_csum_replace(ctx->skb, l3_csum_off, STATE->ip_src, STATE->ct_result.nat_sip, 4) || + bpf_l3_csum_replace(ctx->skb, l3_csum_off, STATE->ip_dst, STATE->post_nat_ip_dst, 4)) { deny_reason(ctx, CALI_REASON_CSUM_FAIL); goto deny; } +#endif + /* From now on, the packet has a new source IP */ + if (!ip_void(STATE->ct_result.nat_sip)) { + STATE->ip_src = STATE->ct_result.nat_sip; + } /* Handle returning ICMP related to tunnel * @@ -794,9 +794,9 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, * unlikely that we are anywhere to close the MTU limit. If we * are, we need to fail anyway. 
*/ - if (ct_related && state->ip_proto == IPPROTO_ICMP - && state->ct_result.tun_ip - && (!CALI_F_DSR || (state->ct_result.flags & CALI_CT_FLAG_NP_NO_DSR))) { + if (ct_related && STATE->ip_proto == IPPROTO_ICMP + && !ip_void(STATE->ct_result.tun_ip) + && (!CALI_F_DSR || (STATE->ct_result.flags & CALI_CT_FLAG_NP_NO_DSR))) { if (dnat_return_should_encap()) { CALI_DEBUG("Returning related ICMP from workload to tunnel\n"); } else if (CALI_F_TO_HEP) { @@ -812,66 +812,69 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, CALI_DEBUG("Returning related ICMP from host to tunnel\n"); } - state->ip_src = HOST_IP; - state->ip_dst = state->ct_result.tun_ip; + STATE->ip_src = HOST_IP; + STATE->ip_dst = STATE->ct_result.tun_ip; goto nat_encap; } - state->dport = state->post_nat_dport; - state->ip_dst = state->post_nat_ip_dst; + STATE->dport = STATE->post_nat_dport; + STATE->ip_dst = STATE->post_nat_ip_dst; goto allow; case CALI_CT_ESTABLISHED_SNAT: CALI_DEBUG("CT: SNAT from %x:%d\n", - bpf_ntohl(state->ct_result.nat_ip), state->ct_result.nat_port); + debug_ip(STATE->ct_result.nat_ip), STATE->ct_result.nat_port); - if (dnat_return_should_encap() && state->ct_result.tun_ip) { - if (CALI_F_DSR && !(state->ct_result.flags & CALI_CT_FLAG_NP_NO_DSR)) { + if (dnat_return_should_encap() && !ip_void(STATE->ct_result.tun_ip)) { + if (CALI_F_DSR && !(STATE->ct_result.flags & CALI_CT_FLAG_NP_NO_DSR)) { /* SNAT will be done after routing, when leaving HEP */ CALI_DEBUG("DSR enabled, skipping SNAT + encap\n"); goto allow; } - if (!(state->ip_proto == IPPROTO_TCP && skb_is_gso(ctx->skb)) && - ip_is_dnf(ip_hdr(ctx)) && vxlan_v4_encap_too_big(ctx)) { + if (!(STATE->ip_proto == IPPROTO_TCP && skb_is_gso(ctx->skb)) && + ip_is_dnf(ip_hdr(ctx)) && vxlan_encap_too_big(ctx)) { CALI_DEBUG("Return ICMP mtu is too big\n"); goto icmp_too_big; } } // Actually do the NAT. 
- ip_hdr(ctx)->saddr = state->ct_result.nat_ip; - ip_hdr(ctx)->daddr = state->ct_result.nat_sip; + ip_hdr_set_ip(ctx, saddr, STATE->ct_result.nat_ip); + ip_hdr_set_ip(ctx, daddr, STATE->ct_result.nat_sip); - switch (ip_hdr(ctx)->protocol) { + switch (ctx->state->ip_proto) { case IPPROTO_TCP: - tcp_hdr(ctx)->source = bpf_htons(state->ct_result.nat_port); - if (state->ct_result.nat_sport) { + tcp_hdr(ctx)->source = bpf_htons(STATE->ct_result.nat_port); + if (STATE->ct_result.nat_sport) { CALI_DEBUG("Fixing TCP dest port from %d to %d\n", - bpf_ntohs(tcp_hdr(ctx)->dest), state->ct_result.nat_sport); - tcp_hdr(ctx)->dest = bpf_htons(state->ct_result.nat_sport); + bpf_ntohs(tcp_hdr(ctx)->dest), STATE->ct_result.nat_sport); + tcp_hdr(ctx)->dest = bpf_htons(STATE->ct_result.nat_sport); } break; case IPPROTO_UDP: - udp_hdr(ctx)->source = bpf_htons(state->ct_result.nat_port); - if (state->ct_result.nat_sport) { + udp_hdr(ctx)->source = bpf_htons(STATE->ct_result.nat_port); + if (STATE->ct_result.nat_sport) { CALI_DEBUG("Fixing UDP dest port from %d to %d\n", - bpf_ntohs(tcp_hdr(ctx)->dest), state->ct_result.nat_sport); - udp_hdr(ctx)->dest = bpf_htons(state->ct_result.nat_sport); + bpf_ntohs(tcp_hdr(ctx)->dest), STATE->ct_result.nat_sport); + udp_hdr(ctx)->dest = bpf_htons(STATE->ct_result.nat_sport); } break; } + /* XXX */ CALI_DEBUG("L3 csum at %d L4 csum at %d\n", l3_csum_off, l4_csum_off); - if (l4_csum_off) { - res = skb_nat_l4_csum_ipv4(ctx, l4_csum_off, - state->ip_src, state->ct_result.nat_ip, - state->ip_dst, state->ct_result.nat_sip, - bpf_htons(state->dport), bpf_htons(state->ct_result.nat_sport ? : state->dport), - bpf_htons(state->sport), bpf_htons(state->ct_result.nat_port), - ip_hdr(ctx)->protocol == IPPROTO_UDP ? BPF_F_MARK_MANGLED_0 : 0); + if (l4_csum_off && skb_nat_l4_csum(ctx, l4_csum_off, + STATE->ip_src, STATE->ct_result.nat_ip, + STATE->ip_dst, STATE->ct_result.nat_sip, + bpf_htons(STATE->dport), + bpf_htons(STATE->ct_result.nat_sport ? 
: STATE->dport), + bpf_htons(STATE->sport), bpf_htons(STATE->ct_result.nat_port), + STATE->ip_proto == IPPROTO_UDP ? BPF_F_MARK_MANGLED_0 : 0)) { + deny_reason(ctx, CALI_REASON_CSUM_FAIL); + goto deny; } if (!in_place) { @@ -896,20 +899,18 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, } } +#ifndef IPVER6 CALI_VERB("L3 checksum update (csum is at %d) port from %x to %x\n", - l3_csum_off, state->ip_src, state->ct_result.nat_ip); + l3_csum_off, STATE->ip_src, STATE->ct_result.nat_ip); - int csum_rc = bpf_l3_csum_replace(ctx->skb, l3_csum_off, - state->ip_src, state->ct_result.nat_ip, 4); - csum_rc |= bpf_l3_csum_replace(ctx->skb, l3_csum_off, - state->ip_dst, state->ct_result.nat_sip, 4); - CALI_VERB("bpf_l3_csum_replace(IP): %d\n", csum_rc); - res |= csum_rc; - - if (res) { + if (bpf_l3_csum_replace(ctx->skb, l3_csum_off, + STATE->ip_src, STATE->ct_result.nat_ip, 4) || + bpf_l3_csum_replace(ctx->skb, l3_csum_off, + STATE->ip_dst, STATE->ct_result.nat_sip, 4)) { deny_reason(ctx, CALI_REASON_CSUM_FAIL); goto deny; } +#endif /* In addition to dnat_return_should_encap() we also need to encap on the * host endpoint for egress traffic, when we hit an SNAT rule. This is the @@ -918,14 +919,14 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, * able to match as SNAT. 
*/ if ((dnat_return_should_encap() || (CALI_F_TO_HEP && !CALI_F_DSR)) && - state->ct_result.tun_ip) { - state->ip_src = HOST_IP; - state->ip_dst = state->ct_result.tun_ip; + !ip_void(STATE->ct_result.tun_ip)) { + STATE->ip_src = HOST_IP; + STATE->ip_dst = STATE->ct_result.tun_ip; goto nat_encap; } - state->sport = state->ct_result.nat_port; - state->ip_src = state->ct_result.nat_ip; + STATE->sport = STATE->ct_result.nat_port; + STATE->ip_src = STATE->ct_result.nat_ip; goto allow; } @@ -937,8 +938,9 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, return NAT_ALLOW; icmp_too_big: - state->icmp_type = ICMP_DEST_UNREACH; - state->icmp_code = ICMP_FRAG_NEEDED; +#ifndef IPVER6 + STATE->icmp_type = ICMP_DEST_UNREACH; + STATE->icmp_code = ICMP_FRAG_NEEDED; struct { __be16 unused; @@ -946,11 +948,16 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, } frag = { .mtu = bpf_htons(TUNNEL_MTU), }; - state->tun_ip = *(__be32 *)&frag; + STATE->tun_ip = *(__be32 *)&frag; return NAT_ICMP_TOO_BIG; +#else + /* XXX not implemented yet. */ + return NAT_DENY; +#endif nat_encap: + /* XXX */ /* We are about to encap return traffic that originated on the local host * namespace - a host networked pod. 
Routing was based on the dst IP, * which was the original client's IP at that time, not the node's that @@ -959,14 +966,14 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, if (CALI_F_TO_HEP) { struct arp_value *arpv; struct arp_key arpk = { - .ip = state->ip_dst, + .ip = STATE->ip_dst, .ifindex = ctx->skb->ifindex, }; - arpv = cali_v4_arp_lookup_elem(&arpk); + arpv = cali_arp_lookup_elem(&arpk); if (!arpv) { CALI_DEBUG("ARP lookup failed for %x dev %d at HEP\n", - bpf_ntohl(state->ip_dst), arpk.ifindex); + debug_ip(STATE->ip_dst), arpk.ifindex); /* Don't drop it yet, we might get lucky and the MAC is correct */ } else { if (skb_refresh_validate_ptrs(ctx, 0)) { @@ -975,7 +982,7 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, goto deny; } __builtin_memcpy(&eth_hdr(ctx)->h_dest, arpv->mac_dst, ETH_ALEN); - if (state->ct_result.ifindex_fwd == ctx->skb->ifindex) { + if (STATE->ct_result.ifindex_fwd == ctx->skb->ifindex) { /* No need to change src MAC, if we are at the right device */ } else { /* FIXME we need to redirect to the right device */ @@ -983,16 +990,16 @@ static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx, } } - if (vxlan_v4_encap(ctx, state->ip_src, state->ip_dst)) { + if (vxlan_encap(ctx, &STATE->ip_src, &STATE->ip_dst)) { deny_reason(ctx, CALI_REASON_ENCAP_FAIL); goto deny; } - state->sport = state->dport = VXLAN_PORT; - state->ip_proto = IPPROTO_UDP; + STATE->sport = STATE->dport = VXLAN_PORT; + STATE->ip_proto = IPPROTO_UDP; CALI_DEBUG("vxlan return %d ifindex_fwd %d\n", - dnat_return_should_encap(), state->ct_result.ifindex_fwd); + dnat_return_should_encap(), STATE->ct_result.ifindex_fwd); return NAT_ENCAP_ALLOW; } @@ -1028,12 +1035,12 @@ static CALI_BPF_INLINE struct fwd post_nat(struct cali_tc_ctx *ctx, } if (CALI_F_TO_HEP && !skb_seen(ctx->skb) && is_dnat) { - struct cali_rt *r = cali_rt_lookup(state->post_nat_ip_dst); + struct cali_rt *r = cali_rt_lookup(&state->post_nat_ip_dst); if
(r && cali_rt_flags_local_workload(r->flags)) { state->ct_result.ifindex_fwd = r->if_index; CALI_DEBUG("NP local WL %x:%d on HEP\n", - bpf_htonl(state->post_nat_ip_dst), state->post_nat_dport); + debug_ip(state->post_nat_ip_dst), state->post_nat_dport); ctx->state->flags |= CALI_ST_CT_NP_LOOP; fib = true; /* Enforce FIB since we want to redirect */ } else if (!r || cali_rt_flags_remote_workload(r->flags)) { @@ -1041,7 +1048,7 @@ static CALI_BPF_INLINE struct fwd post_nat(struct cali_tc_ctx *ctx, if (CALI_F_LO || CALI_F_MAIN) { state->ct_result.ifindex_fwd = NATIN_IFACE ; CALI_DEBUG("NP remote WL %x:%d on LO or main HEP\n", - bpf_htonl(state->post_nat_ip_dst), state->post_nat_dport); + debug_ip(state->post_nat_ip_dst), state->post_nat_dport); ctx->state->flags |= CALI_ST_CT_NP_LOOP; } ctx->state->flags |= CALI_ST_CT_NP_REMOTE; @@ -1081,7 +1088,6 @@ int calico_tc_skb_accepted_entrypoint(struct __sk_buff *skb) .reason = CALI_REASON_UNKNOWN, .mark = CALI_SKB_MARK_SEEN, }, - .ipheader_len = IP_SIZE, ); struct cali_tc_ctx *ctx = &_ctx; @@ -1111,7 +1117,7 @@ int calico_tc_skb_accepted_entrypoint(struct __sk_buff *skb) .addr = ctx->state->nat_dest.addr, .port = ctx->state->nat_dest.port, }; - if (ctx->state->nat_dest.addr != 0) { + if (!ip_void(ctx->state->nat_dest.addr)) { nat_dest = &nat_dest_2; } @@ -1128,17 +1134,18 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx CALI_DEBUG("Entering calico_tc_skb_accepted\n"); struct cali_tc_state *state = ctx->state; bool fib = true; - struct ct_create_ctx ct_ctx_nat = {}; + struct ct_create_ctx *ct_ctx_nat = &ctx->scratch->ct_ctx_nat; int ct_rc = ct_result_rc(state->ct_result.rc); bool ct_related = ct_result_is_related(state->ct_result.rc); __u32 seen_mark = ctx->fwd.mark; - size_t l4_csum_off = 0, l3_csum_off; + size_t l4_csum_off = 0; + size_t l3_csum_off = 0;; bool is_dnat = false; enum do_nat_res nat_res = NAT_ALLOW; - CALI_DEBUG("src=%x dst=%x\n", bpf_ntohl(state->ip_src), 
bpf_ntohl(state->ip_dst)); - CALI_DEBUG("post_nat=%x:%d\n", bpf_ntohl(state->post_nat_ip_dst), state->post_nat_dport); - CALI_DEBUG("tun_ip=%x\n", state->tun_ip); + CALI_DEBUG("src=%x dst=%x\n", debug_ip(state->ip_src), debug_ip(state->ip_dst)); + CALI_DEBUG("post_nat=%x:%d\n", debug_ip(state->post_nat_ip_dst), state->post_nat_dport); + CALI_DEBUG("tun_ip=%x\n", debug_ip(state->tun_ip)); CALI_DEBUG("pol_rc=%d\n", state->pol_rc); CALI_DEBUG("sport=%d\n", state->sport); CALI_DEBUG("flags=%x\n", state->flags); @@ -1171,7 +1178,11 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx /* We check the ttl here to avoid needing complicated handling of * related traffic back from the host if we let the host to handle it. */ +#ifdef IPVER6 + CALI_DEBUG("ip->hop_limit %d\n", ip_hdr(ctx)->hop_limit); +#else CALI_DEBUG("ip->ttl %d\n", ip_hdr(ctx)->ttl); +#endif if (ip_ttl_exceeded(ip_hdr(ctx))) { switch (ct_rc){ case CALI_CT_NEW: @@ -1185,16 +1196,18 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx } } +#ifndef IPVER6 l3_csum_off = skb_iphdr_offset(ctx) + offsetof(struct iphdr, check); +#endif if (ct_related) { - if (ip_hdr(ctx)->protocol == IPPROTO_ICMP) { + if (ctx->state->ip_proto == IPPROTO_ICMP) { bool outer_ip_snat; /* if we do SNAT ... */ outer_ip_snat = ct_rc == CALI_CT_ESTABLISHED_SNAT; /* ... there is a return path to the tunnel ... */ - outer_ip_snat = outer_ip_snat && state->ct_result.tun_ip; + outer_ip_snat = outer_ip_snat && !ip_void(state->ct_result.tun_ip); /* ... and should do encap and it is not DSR or it is leaving host * and either DSR from WEP or originated at host ... */ outer_ip_snat = outer_ip_snat && @@ -1204,15 +1217,17 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx /* ... 
then fix the outer header IP first */ if (outer_ip_snat) { - ip_hdr(ctx)->saddr = state->ct_result.nat_ip; + ip_hdr_set_ip(ctx, saddr, state->ct_result.nat_ip); +#ifndef IPVER6 int res = bpf_l3_csum_replace(ctx->skb, l3_csum_off, state->ip_src, state->ct_result.nat_ip, 4); if (res) { deny_reason(ctx, CALI_REASON_CSUM_FAIL); goto deny; } +#endif CALI_DEBUG("ICMP related: outer IP SNAT to %x\n", - bpf_ntohl(state->ct_result.nat_ip)); + debug_ip(state->ct_result.nat_ip)); } /* Related ICMP traffic must be an error response so it should include inner IP @@ -1234,7 +1249,7 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx } } - switch (ip_hdr(ctx)->protocol) { + switch (ctx->state->ip_proto) { case IPPROTO_TCP: l4_csum_off = skb_l4hdr_offset(ctx) + offsetof(struct tcphdr, check); break; @@ -1259,45 +1274,46 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx if (CALI_F_FROM_WEP && CALI_DROP_WORKLOAD_TO_HOST && cali_rt_flags_local_host( - cali_rt_lookup_flags(state->post_nat_ip_dst))) { + cali_rt_lookup_flags(&state->post_nat_ip_dst))) { CALI_DEBUG("Workload to host traffic blocked by " "DefaultEndpointToHostAction: DROP\n"); goto deny; } - ct_ctx_nat.skb = ctx->skb; - ct_ctx_nat.proto = state->ip_proto; - ct_ctx_nat.src = state->ip_src; - ct_ctx_nat.sport = state->sport; - ct_ctx_nat.dst = state->post_nat_ip_dst; - ct_ctx_nat.dport = state->post_nat_dport; - ct_ctx_nat.tun_ip = state->tun_ip; - ct_ctx_nat.type = CALI_CT_TYPE_NORMAL; - ct_ctx_nat.allow_return = false; + __builtin_memset(ct_ctx_nat, 0, sizeof(*ct_ctx_nat)); + + ct_ctx_nat->proto = state->ip_proto; + ct_ctx_nat->src = state->ip_src; + ct_ctx_nat->sport = state->sport; + ct_ctx_nat->dst = state->post_nat_ip_dst; + ct_ctx_nat->dport = state->post_nat_dport; + ct_ctx_nat->tun_ip = state->tun_ip; + ct_ctx_nat->type = CALI_CT_TYPE_NORMAL; + ct_ctx_nat->allow_return = false; if (state->flags & CALI_ST_NAT_OUTGOING) { - ct_ctx_nat.flags |= 
CALI_CT_FLAG_NAT_OUT; + ct_ctx_nat->flags |= CALI_CT_FLAG_NAT_OUT; } if (CALI_F_FROM_WEP && state->flags & CALI_ST_SKIP_FIB) { - ct_ctx_nat.flags |= CALI_CT_FLAG_SKIP_FIB; + ct_ctx_nat->flags |= CALI_CT_FLAG_SKIP_FIB; } /* Packets received at WEP with CALI_CT_FLAG_SKIP_FIB mark signal * that all traffic on this connection must flow via host namespace as it was * originally meant for host, but got redirected to a WEP by a 3rd party DNAT rule. */ if (CALI_F_TO_WEP && ((ctx->skb->mark & CALI_SKB_MARK_SKIP_FIB) == CALI_SKB_MARK_SKIP_FIB)) { - ct_ctx_nat.flags |= CALI_CT_FLAG_SKIP_FIB; + ct_ctx_nat->flags |= CALI_CT_FLAG_SKIP_FIB; } if (CALI_F_TO_HOST && CALI_F_NAT_IF) { - ct_ctx_nat.flags |= CALI_CT_FLAG_VIA_NAT_IF; + ct_ctx_nat->flags |= CALI_CT_FLAG_VIA_NAT_IF; } if (CALI_F_TO_HEP && !CALI_F_NAT_IF && state->flags & CALI_ST_CT_NP_LOOP) { - ct_ctx_nat.flags |= CALI_CT_FLAG_NP_LOOP; + ct_ctx_nat->flags |= CALI_CT_FLAG_NP_LOOP; } if (CALI_F_TO_HEP && !CALI_F_NAT_IF && state->flags & CALI_ST_CT_NP_REMOTE) { - ct_ctx_nat.flags |= CALI_CT_FLAG_NP_REMOTE; + ct_ctx_nat->flags |= CALI_CT_FLAG_NP_REMOTE; } if (state->flags & CALI_ST_HOST_PSNAT) { - ct_ctx_nat.flags |= CALI_CT_FLAG_HOST_PSNAT; + ct_ctx_nat->flags |= CALI_CT_FLAG_HOST_PSNAT; } /* Mark connections that were routed via bpfnatout, but had CT miss at * HEP. That is because of SNAT happened between bpfnatout and here. @@ -1306,7 +1322,7 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx */ if (CALI_F_TO_HEP && ((ctx->skb->mark & CALI_SKB_MARK_FROM_NAT_IFACE_OUT) == CALI_SKB_MARK_FROM_NAT_IFACE_OUT)) { - ct_ctx_nat.flags |= CALI_CT_FLAG_VIA_NAT_IF; + ct_ctx_nat->flags |= CALI_CT_FLAG_VIA_NAT_IF; } /* If we just received the first packet for a NP forwarded from a @@ -1314,12 +1330,10 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx * CIDRs from DSR, we need to make a check if this client also opted out * and save the information in conntrack. 
*/ - CALI_DEBUG("CALI_F_DSR: %d\n", CALI_F_DSR); - CALI_DEBUG("GLOBAL_FLAGS: 0x%x\n", GLOBAL_FLAGS); if (CALI_F_FROM_HEP && CALI_F_DSR && (GLOBAL_FLAGS & CALI_GLOBALS_NO_DSR_CIDRS)) { - CALI_DEBUG("state->tun_ip = 0x%x\n", state->tun_ip); - if (state->tun_ip && cali_rt_lookup_flags(state->ip_src) & CALI_RT_NO_DSR) { - ct_ctx_nat.flags |= CALI_CT_FLAG_NP_NO_DSR; + CALI_DEBUG("state->tun_ip = 0x%x\n", debug_ip(state->tun_ip)); + if (!ip_void(state->tun_ip) && cali_rt_lookup_flags(&state->ip_src) & CALI_RT_NO_DSR) { + ct_ctx_nat->flags |= CALI_CT_FLAG_NP_NO_DSR; CALI_DEBUG("CALI_CT_FLAG_NP_NO_DSR\n"); } } @@ -1330,17 +1344,17 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx CALI_DEBUG("Too short for TCP: DROP\n"); goto deny; } - ct_ctx_nat.tcp = tcp_hdr(ctx); + ct_ctx_nat->tcp = tcp_hdr(ctx); } // If we get here, we've passed policy. if (nat_dest == NULL) { - if (conntrack_create(ctx, &ct_ctx_nat)) { + if (conntrack_create(ctx, ct_ctx_nat)) { CALI_DEBUG("Creating normal conntrack failed\n"); - if ((CALI_F_FROM_HEP && rt_addr_is_local_host(ct_ctx_nat.dst)) || - (CALI_F_TO_HEP && rt_addr_is_local_host(ct_ctx_nat.src))) { + if ((CALI_F_FROM_HEP && rt_addr_is_local_host(&ct_ctx_nat->dst)) || + (CALI_F_TO_HEP && rt_addr_is_local_host(&ct_ctx_nat->src))) { CALI_DEBUG("Allowing local host traffic without CT\n"); goto allow; } @@ -1350,26 +1364,26 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx goto allow; } - ct_ctx_nat.orig_src = state->ip_src; - ct_ctx_nat.orig_dst = state->ip_dst; - ct_ctx_nat.orig_dport = state->dport; - ct_ctx_nat.orig_sport = state->sport; - state->ct_result.nat_sport = ct_ctx_nat.sport; + ct_ctx_nat->orig_src = state->ip_src; + ct_ctx_nat->orig_dst = state->ip_dst; + ct_ctx_nat->orig_dport = state->dport; + ct_ctx_nat->orig_sport = state->sport; + state->ct_result.nat_sport = ct_ctx_nat->sport; /* fall through as DNAT is now established */ if ((CALI_F_TO_HOST && CALI_F_NAT_IF) || 
(CALI_F_TO_HEP && (CALI_F_LO || CALI_F_MAIN))) { - struct cali_rt *r = cali_rt_lookup(state->post_nat_ip_dst); + struct cali_rt *r = cali_rt_lookup(&state->post_nat_ip_dst); if (r && cali_rt_flags_remote_workload(r->flags) && cali_rt_is_tunneled(r)) { CALI_DEBUG("remote wl %x tunneled via %x\n", - bpf_htonl(state->post_nat_ip_dst), bpf_htonl(HOST_TUNNEL_IP)); - ct_ctx_nat.src = HOST_TUNNEL_IP; + debug_ip(state->post_nat_ip_dst), debug_ip(HOST_TUNNEL_IP)); + ct_ctx_nat->src = HOST_TUNNEL_IP; /* This would be the place to set a new source port if we * had a way how to allocate it. Instead we rely on source * port collision resolution. - * ct_ctx_nat.sport = 10101; + * ct_ctx_nat->sport = 10101; */ - state->ct_result.nat_sip = ct_ctx_nat.src; - state->ct_result.nat_sport = ct_ctx_nat.sport; + state->ct_result.nat_sip = ct_ctx_nat->src; + state->ct_result.nat_sport = ct_ctx_nat->sport; } } @@ -1410,12 +1424,14 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx goto deny; icmp_ttl_exceeded: +#ifndef IPVER6 if (ip_frag_no(ip_hdr(ctx))) { goto deny; } +#endif state->icmp_type = ICMP_TIME_EXCEEDED; state->icmp_code = ICMP_EXC_TTL; - state->tun_ip = 0; + ip_set_void(state->tun_ip); goto icmp_send_reply; icmp_send_reply: @@ -1434,6 +1450,11 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx SEC("tc") int calico_tc_skb_icmp_inner_nat(struct __sk_buff *skb) { +#ifdef IPVER6 + /* XXX not implemented yet */ + return TC_ACT_SHOT; +#else + /* Initialise the context, which is stored on the stack, and the state, which * we use to pass data from one program to the next via tail calls. 
*/ DECLARE_TC_CTX(_ctx, @@ -1442,14 +1463,13 @@ int calico_tc_skb_icmp_inner_nat(struct __sk_buff *skb) .res = TC_ACT_UNSPEC, .reason = CALI_REASON_UNKNOWN, }, - .ipheader_len = IP_SIZE, ); struct cali_tc_ctx *ctx = &_ctx; struct cali_tc_state *state = ctx->state; bool ct_related = ct_result_is_related(state->ct_result.rc); int ct_rc = ct_result_rc(state->ct_result.rc); - + CALI_DEBUG("Entering calico_tc_skb_icmp_inner_nat\n"); if (!ct_related) { @@ -1480,7 +1500,7 @@ int calico_tc_skb_icmp_inner_nat(struct __sk_buff *skb) } ctx->ip_header = (struct iphdr*)pkt; - ctx->ipheader_len = ip_hdr(ctx)->ihl * 4; + tc_state_fill_from_iphdr(ctx); if (ctx->ipheader_len > 60) { CALI_DEBUG("this cannot be!\n"); goto deny; @@ -1495,7 +1515,7 @@ int calico_tc_skb_icmp_inner_nat(struct __sk_buff *skb) goto deny; } - ctx->scratch = (void *)(pkt + ctx->ipheader_len); + ctx->nh = (void *)(pkt + ctx->ipheader_len); /* Flip the direction, we need to reverse the original packet. */ switch (ct_rc) { @@ -1527,22 +1547,39 @@ int calico_tc_skb_icmp_inner_nat(struct __sk_buff *skb) bool fib = true; struct ct_create_ctx ct_ctx_nat = {}; /* CT_NEW is not the option so pass an empty one. */ - nat_res = do_nat(ctx, l3_csum_off, 0, false, ct_rc, ct_ctx_nat, &is_dnat, &seen_mark, false); + nat_res = do_nat(ctx, l3_csum_off, 0, false, ct_rc, &ct_ctx_nat, &is_dnat, &seen_mark, false); ctx->fwd = post_nat(ctx, nat_res, fib, seen_mark, is_dnat); allow: + /* We are going to forward the packet now. But all the state is about + * the inner IP so we need to refresh our state back to the outer IP + * that is used for forwarding! + * + * N.B. we could just remember and update the state, however, forwarding + * also updates ttl/hops in the header so we need the right header + * available anyway. + */ + if (parse_packet_ip(ctx) != PARSING_OK) { + CALI_DEBUG("Non ipv4 packet on icmp path!
DROP!\n"); + goto deny; + } + tc_state_fill_from_iphdr(ctx); fwd_fib_set(&ctx->fwd, true); return forward_or_drop(ctx); deny: return TC_ACT_SHOT; +#endif /* IPVER6 */ } SEC("tc") int calico_tc_skb_send_icmp_replies(struct __sk_buff *skb) { +#ifdef IPVER6 + return TC_ACT_SHOT; +#else __u32 fib_flags = 0; /* Initialise the context, which is stored on the stack, and the state, which @@ -1553,7 +1590,6 @@ int calico_tc_skb_send_icmp_replies(struct __sk_buff *skb) .res = TC_ACT_UNSPEC, .reason = CALI_REASON_UNKNOWN, }, - .ipheader_len = IP_SIZE, ); struct cali_tc_ctx *ctx = &_ctx; @@ -1588,6 +1624,7 @@ int calico_tc_skb_send_icmp_replies(struct __sk_buff *skb) return forward_or_drop(ctx); deny: return TC_ACT_SHOT; +#endif /* IPVER6 */ } #if HAS_HOST_CONFLICT_PROG @@ -1602,7 +1639,6 @@ int calico_tc_host_ct_conflict(struct __sk_buff *skb) .res = TC_ACT_UNSPEC, .reason = CALI_REASON_UNKNOWN, }, - .ipheader_len = IP_SIZE, ); struct cali_tc_ctx *ctx = &_ctx; @@ -1662,7 +1698,6 @@ int calico_tc_skb_drop(struct __sk_buff *skb) { DECLARE_TC_CTX(_ctx, .skb = skb, - .ipheader_len = IP_SIZE, ); struct cali_tc_ctx *ctx = &_ctx; @@ -1672,12 +1707,12 @@ int calico_tc_skb_drop(struct __sk_buff *skb) counter_inc(ctx, CALI_REASON_DROPPED_BY_POLICY); CALI_DEBUG("proto=%d\n", ctx->state->ip_proto); - CALI_DEBUG("src=%x dst=%x\n", bpf_ntohl(ctx->state->ip_src), - bpf_ntohl(ctx->state->ip_dst)); - CALI_DEBUG("pre_nat=%x:%d\n", bpf_ntohl(ctx->state->pre_nat_ip_dst), + CALI_DEBUG("src=%x dst=%x\n", debug_ip(ctx->state->ip_src), + debug_ip(ctx->state->ip_dst)); + CALI_DEBUG("pre_nat=%x:%d\n", debug_ip(ctx->state->pre_nat_ip_dst), ctx->state->pre_nat_dport); - CALI_DEBUG("post_nat=%x:%d\n", bpf_ntohl(ctx->state->post_nat_ip_dst), ctx->state->post_nat_dport); - CALI_DEBUG("tun_ip=%x\n", ctx->state->tun_ip); + CALI_DEBUG("post_nat=%x:%d\n", debug_ip(ctx->state->post_nat_ip_dst), ctx->state->post_nat_dport); + CALI_DEBUG("tun_ip=%x\n", debug_ip(ctx->state->tun_ip)); CALI_DEBUG("pol_rc=%d\n", 
ctx->state->pol_rc); CALI_DEBUG("sport=%d\n", ctx->state->sport); CALI_DEBUG("flags=0x%x\n", ctx->state->flags); @@ -1698,17 +1733,17 @@ int calico_tc_skb_drop(struct __sk_buff *skb) ctx->state->pre_nat_dport == WG_PORT && ctx->state->sport == WG_PORT) { if ((CALI_F_FROM_HEP && - rt_addr_is_local_host(ctx->state->ip_dst) && - rt_addr_is_remote_host(ctx->state->ip_src)) || + rt_addr_is_local_host(&ctx->state->ip_dst) && + rt_addr_is_remote_host(&ctx->state->ip_src)) || (CALI_F_TO_HEP && - rt_addr_is_remote_host(ctx->state->ip_dst) && - rt_addr_is_local_host(ctx->state->ip_src))) { + rt_addr_is_remote_host(&ctx->state->ip_dst) && + rt_addr_is_local_host(&ctx->state->ip_src))) { /* This is info as it is supposed to be low intensity (only when a * new flow detected - should happen exactly once in a blue moon ;-) ) * but would be good to know about for issue debugging. */ CALI_INFO("Allowing WG %x <-> %x despite blocked by policy - known hosts.\n", - bpf_ntohl(ctx->state->ip_src), bpf_ntohl(ctx->state->ip_dst)); + debug_ip(ctx->state->ip_src), debug_ip(ctx->state->ip_dst)); goto allow; } } diff --git a/felix/bpf-gpl/tc6.c b/felix/bpf-gpl/tc6.c deleted file mode 100644 index 5bb0d3cacb0..00000000000 --- a/felix/bpf-gpl/tc6.c +++ /dev/null @@ -1,134 +0,0 @@ -// Project Calico BPF dataplane programs. -// Copyright (c) 2022 Tigera, Inc. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later - -#include -#include -#include - -// stdbool.h has no deps so it's OK to include; stdint.h pulls in parts -// of the std lib that aren't compatible with BPF. 
-#include - -#include "bpf.h" -#include "types.h" -#include "counters.h" -#include "log.h" -#include "skb.h" -#include "routes.h" -#include "parsing.h" -#include "ipv6.h" -#include "jump.h" -#include "policy.h" - -const volatile struct cali_tc_globals __globals; - - -SEC("tc") -int calico_tc6(struct __sk_buff *skb) -{ - struct cali_tc_ctx _ctx = { - .state = state_get(), - .globals = state_get_globals_tc(), - .skb = skb, - .fwd = { - .res = TC_ACT_UNSPEC, - .reason = CALI_REASON_UNKNOWN, - }, - .ipheader_len = IPv6_SIZE, - }; - struct cali_tc_ctx *ctx = &_ctx; - - if (!ctx->globals) { - CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "State map globals lookup failed: DROP\n"); - return TC_ACT_SHOT; - } - - if (!ctx->state) { - CALI_DEBUG("State map lookup failed: DROP\n"); - return TC_ACT_SHOT; - } - - CALI_DEBUG("Entering IPv6 prologue program\n"); - - // TODO: Add IPv6 counters - - if (CALI_LOG_LEVEL >= CALI_LOG_LEVEL_INFO) { - ctx->state->prog_start_time = bpf_ktime_get_ns(); - } - - if (skb_refresh_validate_ptrs(ctx, UDP_SIZE)) { - deny_reason(ctx, CALI_REASON_SHORT); - CALI_DEBUG("Too short\n"); - goto deny; - } - - tc_state_fill_from_ipv6hdr(ctx); - - /* Parse out the source/dest ports (or type/code for ICMP). 
*/ - switch (tc_state_fill_from_nexthdr(ctx)) { - case PARSING_ERROR: - goto deny; - case PARSING_ALLOW_WITHOUT_ENFORCING_POLICY: - goto allow; - } - - CALI_LOG_IPV6(ipv6_hdr(ctx)); - CALI_DEBUG("IP src=%x\n", ctx->state->ip_src); - CALI_DEBUG("IP src1=%x\n", ctx->state->ip_src1); - CALI_DEBUG("IP src2=%x\n", ctx->state->ip_src2); - CALI_DEBUG("IP src3=%x\n", ctx->state->ip_src3); - CALI_DEBUG("proto=%d\n", ctx->state->ip_proto); - CALI_DEBUG("sport=%d\n", ctx->state->sport); - CALI_DEBUG("dport=%d\n", ctx->state->dport); - - if (CALI_F_WEP) { - CALI_DEBUG("IPv6 from workload: drop\n"); - goto deny; - } - CALI_DEBUG("IPv6 on host interface: allow\n"); - CALI_DEBUG("About to jump to normal policy program\n"); - CALI_JUMP_TO(ctx, PROG_INDEX_V6_POLICY); - if (CALI_F_HEP) { - CALI_DEBUG("HEP with no policy, allow.\n"); - goto allow; - } - CALI_DEBUG("Tail call to normal policy program failed: DROP\n"); - -deny: - skb->mark = CALI_SKB_MARK_SEEN; - return TC_ACT_SHOT; - -allow: - skb->mark = CALI_SKB_MARK_SEEN; - return TC_ACT_UNSPEC; -} - -SEC("tc") -int calico_tc_skb_accepted_entrypoint(struct __sk_buff *skb) -{ - CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "Entering IPv6 accepted program\n"); - // TODO: Implement the logic for accepted packets by the policy program - // We should not reach here since no tail call happens to this program - skb->mark = CALI_SKB_MARK_SEEN; - return TC_ACT_UNSPEC; -} - -SEC("tc") -int calico_tc_skb_send_icmp_replies(struct __sk_buff *skb) -{ - CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "Entering IPv6 icmp program\n"); - // TODO: Implement the logic for accepted icmp packets by the policy program - // We should not reach here since no tail call happens to this program - return TC_ACT_SHOT; -} - -SEC("tc") -int calico_tc_skb_drop(struct __sk_buff *skb) -{ - CALI_LOG_IF(CALI_LOG_LEVEL_DEBUG, "Entering IPv6 drop program\n"); - // TODO: Implement the logic for dropped packets by the policy program - // We should not reach here since no tail call happens to this 
program - return TC_ACT_SHOT; -} - diff --git a/felix/bpf-gpl/tc_preamble.c b/felix/bpf-gpl/tc_preamble.c index cc9cfce00af..5ec06c081e2 100644 --- a/felix/bpf-gpl/tc_preamble.c +++ b/felix/bpf-gpl/tc_preamble.c @@ -14,6 +14,19 @@ const volatile struct cali_tc_globals __globals; +#ifdef IPVER6 +#define IPV " v6" +#define JUMP_IDX(idx) (idx ## _V6) +#define JUMP_IDX_DEBUG(idx) (idx ## _V6_DEBUG) +#else +#define IPV " v4" +#define JUMP_IDX(idx) (idx) +#define JUMP_IDX_DEBUG(idx) (idx ## _DEBUG) +#endif + +#define JUMP(idx) globals->jumps[JUMP_IDX(idx)] +#define JUMP_DEBUG(idx) globals->jumps[JUMP_IDX_DEBUG(idx)] + SEC("tc") int cali_tc_preamble(struct __sk_buff *skb) { @@ -27,39 +40,39 @@ int cali_tc_preamble(struct __sk_buff *skb) *globals = __globals; #if EMIT_LOGS - CALI_LOG("tc_preamble iface %s\n", globals->iface_name); + CALI_LOG("tc_preamble" IPV " iface %s\n", globals->iface_name); #endif /* If we have log filter installed, tell the filter where to jump next * and jump to the filter. */ if (globals->log_filter_jmp != (__u32)-1) { - skb->cb[0] = globals->jumps[PROG_INDEX_MAIN]; - skb->cb[1] = globals->jumps[PROG_INDEX_MAIN_DEBUG]; + skb->cb[0] = JUMP(PROG_INDEX_MAIN); + skb->cb[1] = JUMP_DEBUG(PROG_INDEX_MAIN); bpf_tail_call(skb, &cali_jump_prog_map, globals->log_filter_jmp); - CALI_LOG("tc_preamble iface %s failed to call log filter %d\n", + CALI_LOG("tc_preamble" IPV " iface %s failed to call log filter %d\n", globals->iface_name, globals->log_filter_jmp); /* try to jump to the regular path */ } /* Jump to the start of the prog chain. 
*/ #if EMIT_LOGS - CALI_LOG("tc_preamble iface %s jump to %d\n", - globals->iface_name, globals->jumps[PROG_INDEX_MAIN]); + CALI_LOG("tc_preamble" IPV " iface %s jump to %d\n", + globals->iface_name, JUMP(PROG_INDEX_MAIN)); #endif - bpf_tail_call(skb, &cali_jump_map, globals->jumps[PROG_INDEX_MAIN]); - CALI_LOG("tc_preamble iface %s failed to call main %d\n", - globals->iface_name, globals->jumps[PROG_INDEX_MAIN]); + bpf_tail_call(skb, &cali_jump_map, JUMP(PROG_INDEX_MAIN)); + CALI_LOG("tc_preamble" IPV " iface %s failed to call main %d\n", + globals->iface_name, JUMP(PROG_INDEX_MAIN)); /* Try debug path in the unexpected case of not being able to make the jump. */ - CALI_LOG("tc_preamble iface %s jump to %d\n", - globals->iface_name, globals->jumps[PROG_INDEX_MAIN_DEBUG]); - bpf_tail_call(skb, &cali_jump_map, globals->jumps[PROG_INDEX_MAIN_DEBUG]); - CALI_LOG("tc_preamble iface %s failed to call debug main %d\n", - globals->iface_name, globals->jumps[PROG_INDEX_MAIN_DEBUG]); + CALI_LOG("tc_preamble" IPV " iface %s jump to %d\n", + globals->iface_name, JUMP_DEBUG(PROG_INDEX_MAIN)); + bpf_tail_call(skb, &cali_jump_map, JUMP_DEBUG(PROG_INDEX_MAIN)); + CALI_LOG("tc_preamble" IPV " iface %s failed to call debug main %d\n", + globals->iface_name, JUMP_DEBUG(PROG_INDEX_MAIN)); /* Drop the packet in the unexpected case of not being able to make the jump. 
*/ - CALI_LOG("tc_preamble iface %s failed to call main %d\n", globals->iface_name, globals->jumps[PROG_INDEX_MAIN]); + CALI_LOG("tc_preamble" IPV " iface %s failed to call main %d\n", globals->iface_name, JUMP(PROG_INDEX_MAIN)); return TC_ACT_SHOT; } diff --git a/felix/bpf-gpl/types.h b/felix/bpf-gpl/types.h index 794d284bfac..1b3d2fee362 100644 --- a/felix/bpf-gpl/types.h +++ b/felix/bpf-gpl/types.h @@ -8,7 +8,11 @@ #include #include #include +#ifdef IPVER6 +#include +#else #include +#endif #include #include #include @@ -23,8 +27,11 @@ #define ETH_IPV4_UDP_SIZE (sizeof(struct ethhdr) + IPV4_UDP_SIZE) #define ETH_SIZE (sizeof(struct ethhdr)) +#ifdef IPVER6 +#define IP_SIZE (sizeof(struct ipv6hdr)) +#else #define IP_SIZE (sizeof(struct iphdr)) -#define IPv6_SIZE (sizeof(struct ipv6hdr)) +#endif #define UDP_SIZE (sizeof(struct udphdr)) #define TCP_SIZE (sizeof(struct tcphdr)) #define ICMP_SIZE (sizeof(struct icmphdr)) @@ -38,37 +45,23 @@ struct cali_tc_state { /* Initial IP read from the packet, updated to host's IP when doing NAT encap/ICMP error. * updated when doing CALI_CT_ESTABLISHED_SNAT handling. Used for FIB lookup. */ - __be32 ip_src; - __be32 ip_src1; - __be32 ip_src2; - __be32 ip_src3; + DECLARE_IP_ADDR(ip_src); /* Initial IP read from packet. Updated when doing encap and ICMP errors or CALI_CT_ESTABLISHED_DNAT. * If connect-time load balancing is enabled, this will be the post-NAT IP because the connect-time * load balancer gets in before TC. */ - __be32 ip_dst; - __be32 ip_dst1; - __be32 ip_dst2; - __be32 ip_dst3; + DECLARE_IP_ADDR(ip_dst); /* Set when invoking the policy program; if no NAT, ip_dst; otherwise, the pre-DNAT IP. If the connect * time load balancer is enabled, this may be different from ip_dst. */ - __be32 pre_nat_ip_dst; - __be32 pre_nat_ip_dst1; - __be32 pre_nat_ip_dst2; - __be32 pre_nat_ip_dst3; + DECLARE_IP_ADDR(pre_nat_ip_dst); /* If no NAT, ip_dst. 
Otherwise the NAT dest that we look up from the NAT maps or the conntrack entry * for CALI_CT_ESTABLISHED_DNAT. */ - __be32 post_nat_ip_dst; - __be32 post_nat_ip_dst1; - __be32 post_nat_ip_dst2; - __be32 post_nat_ip_dst3; + DECLARE_IP_ADDR(post_nat_ip_dst); /* For packets that arrived over our VXLAN tunnel, the source IP of the tunnel packet. * Zeroed out when we decide to respond with an ICMP error. * Also used to stash the ICMP MTU when calling the ICMP response program. */ - __be32 tun_ip; - __be32 tun_ip1; - __be32 tun_ip2; - __be32 tun_ip3; - __u32 unused; + DECLARE_IP_ADDR(tun_ip); + __u16 ihl; + __u16 unused; /* Return code from the policy program CALI_POL_DENY/ALLOW etc. */ __s32 pol_rc; /* Source port of the packet; updated on the CALI_CT_ESTABLISHED_SNAT path or when doing encap. @@ -106,10 +99,15 @@ struct cali_tc_state { struct calico_nat_dest nat_dest; /* 8 bytes */ __u64 prog_start_time; __u64 flags; +#ifndef IPVER6 + __u8 __pad_ipv4[48]; +#endif }; struct pkt_scratch { - __u8 l4[20]; /* 20 bytes to fit udp, icmp, tcp w/o options */ + __u8 l4[24]; /* 20 bytes to fit udp, icmp, tcp w/o options and 24 to make 8-aligned */ + struct ct_create_ctx ct_ctx_nat; + struct calico_ct_key ct_key; }; enum cali_state_flags { @@ -149,20 +147,26 @@ struct fwd { }; struct cali_tc_ctx { +#if !CALI_F_XDP struct __sk_buff *skb; +#else struct xdp_md *xdp; +#endif /* Our single copies of the data start/end pointers loaded from the skb. 
*/ void *data_start; void *data_end; void *ip_header; long ipheader_len; + void *nh; struct cali_tc_state *state; +#if !CALI_F_XDP const volatile struct cali_tc_globals *globals; +#else const volatile struct cali_xdp_globals *xdp_globals; /* XXX we must split the state between tc/xdp */ +#endif struct calico_nat_dest *nat_dest; - struct arp_key arpk; struct fwd fwd; void *counters; struct pkt_scratch *scratch; @@ -186,25 +190,48 @@ struct cali_tc_ctx { bpf_exit(TC_ACT_SHOT); \ } \ struct pkt_scratch *scratch = (void *)(gl->__scratch); \ - (struct cali_tc_ctx) { \ + struct cali_tc_ctx x = { \ .state = state, \ .counters = counters, \ .globals = gl, \ .scratch = scratch, \ + .nh = &scratch->l4, \ __VA_ARGS__ \ }; \ + if (x.ipheader_len == 0) { \ + x.ipheader_len = state->ihl; \ + } \ + \ + x; \ }) \ -static CALI_BPF_INLINE struct iphdr* ip_hdr(struct cali_tc_ctx *ctx) +#ifdef IPVER6 +static CALI_BPF_INLINE struct ipv6hdr* ip_hdr(struct cali_tc_ctx *ctx) { - return (struct iphdr *)ctx->ip_header; + return (struct ipv6hdr *)ctx->ip_header; } -static CALI_BPF_INLINE struct ipv6hdr* ipv6_hdr(struct cali_tc_ctx *ctx) +#define ip_hdr_set_ip(ctx, field, ip) do { \ + struct in6_addr *addr = &(ip_hdr(ctx)->field); \ + addr->in6_u.u6_addr32[0] = ip.a; \ + addr->in6_u.u6_addr32[1] = ip.b; \ + addr->in6_u.u6_addr32[2] = ip.c; \ + addr->in6_u.u6_addr32[3] = ip.d; \ +} while(0) + +#else + +static CALI_BPF_INLINE struct iphdr* ip_hdr(struct cali_tc_ctx *ctx) { - return (struct ipv6hdr *)ctx->ip_header; + return (struct iphdr *)ctx->ip_header; } +#define ip_hdr_set_ip(ctx, field, ip) do { \ + ip_hdr(ctx)->field = ip; \ +} while (0) + +#endif + static CALI_BPF_INLINE struct ethhdr* eth_hdr(struct cali_tc_ctx *ctx) { return (struct ethhdr *)ctx->data_start; @@ -212,22 +239,17 @@ static CALI_BPF_INLINE struct ethhdr* eth_hdr(struct cali_tc_ctx *ctx) static CALI_BPF_INLINE struct tcphdr* tcp_hdr(struct cali_tc_ctx *ctx) { - return (struct tcphdr *)ctx->scratch->l4; + return (struct 
tcphdr *)ctx->nh; } static CALI_BPF_INLINE struct udphdr* udp_hdr(struct cali_tc_ctx *ctx) { - return (struct udphdr *)ctx->scratch->l4; + return (struct udphdr *)ctx->nh; } static CALI_BPF_INLINE struct icmphdr* icmp_hdr(struct cali_tc_ctx *ctx) { - return (struct icmphdr *)ctx->scratch->l4; -} - -static CALI_BPF_INLINE struct ipv6_opt_hdr* ipv6ext_hdr(struct cali_tc_ctx *ctx) -{ - return (struct ipv6_opt_hdr *)ctx->scratch->l4; + return (struct icmphdr *)ctx->nh; } static CALI_BPF_INLINE __u32 ctx_ifindex(struct cali_tc_ctx *ctx) @@ -253,5 +275,9 @@ static CALI_BPF_INLINE int l4_hdr_len(struct cali_tc_ctx *ctx) return 0; } +#define IP_VOID 0 +#define IP_EQ(ip1, ip2) ((ip1) == (ip2)) +#define IP_SET(var, val) ((var) = (val)) + #endif /* __CALI_BPF_TYPES_H__ */ diff --git a/felix/bpf-gpl/ut/ip_parse_test.c b/felix/bpf-gpl/ut/ip_parse_test.c new file mode 100644 index 00000000000..39110d03b96 --- /dev/null +++ b/felix/bpf-gpl/ut/ip_parse_test.c @@ -0,0 +1,59 @@ +// Project Calico BPF dataplane programs. +// Copyright (c) 2023 Tigera, Inc. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later + +#include "ut.h" +#include "parsing.h" +#include "jump.h" +#include "nat.h" + +const volatile struct cali_tc_globals __globals; + +static CALI_BPF_INLINE int calico_unittest_entry (struct __sk_buff *skb) +{ + volatile struct cali_tc_globals *globals = state_get_globals_tc(); + + if (!globals) { + return TC_ACT_SHOT; + } + + /* Set the globals for the rest of the prog chain. 
*/ + *globals = __globals; + DECLARE_TC_CTX(_ctx, + .skb = skb, + .ipheader_len = IP_SIZE, + ); + struct cali_tc_ctx *ctx = &_ctx; + + if (!ctx->counters) { + CALI_DEBUG("Counters map lookup failed: DROP\n"); + return TC_ACT_SHOT; + } + + int ver; + + switch (parse_packet_ip(ctx)) { +#ifdef IPVER6 + case PARSING_OK_V6: + ver = 6; + break; +#else + case PARSING_OK: + ver = 4; + break; +#endif + default: + return TC_ACT_UNSPEC; + } + + tc_state_fill_from_iphdr(ctx); + + switch (tc_state_fill_from_nexthdr(ctx, true)) { + case PARSING_ERROR: + return -1; + case PARSING_ALLOW_WITHOUT_ENFORCING_POLICY: + return -2; + } + + return ver; +} diff --git a/felix/bpf-gpl/ut/ipv4_opts_test.c b/felix/bpf-gpl/ut/ipv4_opts_test.c index c547a601a54..6f3febd1485 100644 --- a/felix/bpf-gpl/ut/ipv4_opts_test.c +++ b/felix/bpf-gpl/ut/ipv4_opts_test.c @@ -44,7 +44,10 @@ static CALI_BPF_INLINE int calico_unittest_entry (struct __sk_buff *skb) goto allow; } - if (vxlan_v4_encap(ctx, 0x06060606, 0x10101010)) { + __u32 a = 0x06060606; + __u32 b = 0x10101010; + + if (vxlan_encap(ctx, &a, &b)) { CALI_DEBUG("vxlan: encap failed!\n"); deny_reason(ctx, CALI_REASON_ENCAP_FAIL); goto deny; diff --git a/felix/bpf-gpl/ut/nat_decap_test.c b/felix/bpf-gpl/ut/nat_decap_test.c index 6f67ba642ef..aab9216782a 100644 --- a/felix/bpf-gpl/ut/nat_decap_test.c +++ b/felix/bpf-gpl/ut/nat_decap_test.c @@ -8,5 +8,5 @@ static CALI_BPF_INLINE int calico_unittest_entry (struct __sk_buff *skb) { - return vxlan_v4_decap(skb); + return vxlan_decap(skb); } diff --git a/felix/bpf-gpl/ut/nat_encap_test.c b/felix/bpf-gpl/ut/nat_encap_test.c index 932bd462277..32a3e9b2bc6 100644 --- a/felix/bpf-gpl/ut/nat_encap_test.c +++ b/felix/bpf-gpl/ut/nat_encap_test.c @@ -31,5 +31,9 @@ static CALI_BPF_INLINE int calico_unittest_entry (struct __sk_buff *skb) CALI_DEBUG("Counters map lookup failed: DROP\n"); return TC_ACT_SHOT; } - return vxlan_v4_encap(ctx, HOST_IP, 0x02020202); + + __u32 a = HOST_IP; + __u32 b = 0x02020202; + + return 
vxlan_encap(ctx, &a, &b); } diff --git a/felix/bpf-gpl/xdp.c b/felix/bpf-gpl/xdp.c index dea9dd6682c..5743c921c2e 100644 --- a/felix/bpf-gpl/xdp.c +++ b/felix/bpf-gpl/xdp.c @@ -62,6 +62,7 @@ int calico_xdp_main(struct xdp_md *xdp) } __builtin_memset(ctx->state, 0, sizeof(*ctx->state)); ctx->scratch = (void *)(ctx->xdp_globals + 1); /* needs to be set to something, not used, there is space */ + ctx->nh = &ctx->scratch->l4; counter_inc(ctx, COUNTER_TOTAL_PACKETS); diff --git a/felix/bpf/arp/map.go b/felix/bpf/arp/map.go index 7319410650e..c4e92a19649 100644 --- a/felix/bpf/arp/map.go +++ b/felix/bpf/arp/map.go @@ -26,6 +26,7 @@ import ( func init() { maps.SetSize(MapParams.VersionedName(), MapParams.MaxEntries) + maps.SetSize(MapV6Params.VersionedName(), MapParams.MaxEntries) } var MapParams = maps.MapParameters{ @@ -110,10 +111,10 @@ type MapMem map[Key]Value func LoadMapMem(m maps.Map) (MapMem, error) { ret := make(MapMem) - err := m.Iter(func(k, v []byte) maps.IteratorAction { - ks := len(Key{}) - vs := len(Value{}) + ks := len(Key{}) + vs := len(Value{}) + err := m.Iter(func(k, v []byte) maps.IteratorAction { var key Key copy(key[:ks], k[:ks]) diff --git a/felix/bpf/arp/map6.go b/felix/bpf/arp/map6.go new file mode 100644 index 00000000000..d0feab07f39 --- /dev/null +++ b/felix/bpf/arp/map6.go @@ -0,0 +1,111 @@ +// Copyright (c) 2020 Tigera, Inc. All rights reserved. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package arp + +import ( + "encoding/binary" + "fmt" + "net" + + "github.com/projectcalico/calico/felix/bpf/maps" +) + +var MapV6Params = maps.MapParameters{ + Type: "lru_hash", + KeySize: KeyV6Size, + ValueSize: ValueV6Size, + MaxEntries: 10000, // max number of nodes that can forward nodeports to a single node + Name: "cali_v6_arp", + Version: 2, +} + +func MapV6() maps.Map { + return maps.NewPinnedMap(MapV6Params) +} + +const KeyV6Size = 20 + +type KeyV6 [KeyV6Size]byte + +func NewKeyV6(ip net.IP, ifIndex uint32) KeyV6 { + var k KeyV6 + + ip = ip.To16() + + copy(k[:16], ip) + binary.LittleEndian.PutUint32(k[16:20], ifIndex) + + return k +} + +func (k KeyV6) IP() net.IP { + return net.IP(k[:16]) +} + +func (k KeyV6) IfIndex() uint32 { + return binary.LittleEndian.Uint32(k[16:20]) +} + +func (k KeyV6) String() string { + return fmt.Sprintf("ip %s ifindex %d", k.IP(), k.IfIndex()) +} + +func (k KeyV6) AsBytes() []byte { + return k[:] +} + +const ValueV6Size = ValueSize + +type ValueV6 = Value + +type MapMemV6 map[KeyV6]ValueV6 + +// LoadMapMem loads ConntrackMap into memory +func LoadMapMemV6(m maps.Map) (MapMemV6, error) { + ret := make(MapMemV6) + + ks := len(KeyV6{}) + vs := len(ValueV6{}) + + err := m.Iter(func(k, v []byte) maps.IteratorAction { + var key KeyV6 + copy(key[:ks], k[:ks]) + + var val ValueV6 + copy(val[:vs], v[:vs]) + + ret[key] = val + return maps.IterNone + }) + + return ret, err +} + +// MapMemIterV6 returns maps.MapIter that loads the provided MapMem +func MapMemIterV6(m MapMemV6) maps.IterCallback { + ks := len(KeyV6{}) + vs := len(ValueV6{}) + + return func(k, v []byte) maps.IteratorAction { + var key KeyV6 + copy(key[:ks], k[:ks]) + + var val ValueV6 + copy(val[:vs], v[:vs]) + + m[key] = val + return maps.IterNone + } +} diff --git a/felix/bpf/conntrack/map.go b/felix/bpf/conntrack/map.go index 4a92473f3df..e47415d9189 100644 --- a/felix/bpf/conntrack/map.go +++ b/felix/bpf/conntrack/map.go @@ -28,24 +28,33 @@ import ( ) func init() { - 
SetMapSize(MapParams.MaxEntries) + SetMapSize(MaxEntries) } func SetMapSize(size int) { - maps.SetSize(MapParams.VersionedName(), size) + maps.SetSize(curVer.MapParams.VersionedName(), size) + maps.SetSize(curVer.MapParamsV6.VersionedName(), size) } const KeySize = curVer.KeySize +const KeyV6Size = curVer.KeyV6Size const ValueSize = curVer.ValueSize +const ValueV6Size = curVer.ValueV6Size const MaxEntries = curVer.MaxEntries type Key = curVer.Key +type KeyV6 = curVer.KeyV6 func NewKey(proto uint8, ipA net.IP, portA uint16, ipB net.IP, portB uint16) Key { return curVer.NewKey(proto, ipA, portA, ipB, portB) } +func NewKeyV6(proto uint8, ipA net.IP, portA uint16, ipB net.IP, portB uint16) KeyV6 { + return curVer.NewKeyV6(proto, ipA, portA, ipB, portB) +} + type Value = curVer.Value +type ValueV6 = curVer.ValueV6 const ( TypeNormal uint8 = iota @@ -77,9 +86,34 @@ func NewValueNATReverseSNAT(created, lastSeen time.Duration, flags uint16, legA, return curVer.NewValueNATReverseSNAT(created, lastSeen, flags, legA, legB, tunnelIP, origIP, origSrcIP, origPort) } +// NewValueV6Normal creates a new ValueV6 of type TypeNormal based on the given parameters +func NewValueV6Normal(created, lastSeen time.Duration, flags uint16, legA, legB Leg) ValueV6 { + return curVer.NewValueV6Normal(created, lastSeen, flags, legA, legB) +} + +// NewValueV6NATForward creates a new ValueV6 of type TypeNATForward for the given +// arguments and the reverse key +func NewValueV6NATForward(created, lastSeen time.Duration, flags uint16, revKey KeyV6) ValueV6 { + return curVer.NewValueV6NATForward(created, lastSeen, flags, revKey) +} + +// NewValueV6NATReverse creates a new ValueV6 of type TypeNATReverse for the given +// arguments and reverse parameters +func NewValueV6NATReverse(created, lastSeen time.Duration, flags uint16, legA, legB Leg, + tunnelIP, origIP net.IP, origPort uint16) ValueV6 { + return curVer.NewValueV6NATReverse(created, lastSeen, flags, legA, legB, tunnelIP, origIP, origPort) +} + 
+// NewValueV6NATReverseSNAT in addition to NewValueV6NATReverse sets the orig source IP +func NewValueV6NATReverseSNAT(created, lastSeen time.Duration, flags uint16, legA, legB Leg, + tunnelIP, origIP, origSrcIP net.IP, origPort uint16) ValueV6 { + return curVer.NewValueV6NATReverseSNAT(created, lastSeen, flags, legA, legB, tunnelIP, origIP, origSrcIP, origPort) +} + type Leg = curVer.Leg var MapParams = curVer.MapParams +var MapParamsV6 = curVer.MapParamsV6 func Map() maps.Map { b := maps.NewPinnedMap(MapParams) @@ -89,6 +123,12 @@ func Map() maps.Map { return b } +func MapV6() maps.Map { + b := maps.NewPinnedMap(MapParamsV6) + b.GetMapParams = GetMapParams + return b +} + func MapV2() maps.Map { return maps.NewPinnedMap(v2.MapParams) } @@ -158,6 +198,65 @@ func StringToValue(str string) Value { return BytesToValue([]byte(str)) } +func KeyV6FromBytes(k []byte) KeyV6 { + var ctKeyV6 KeyV6 + if len(k) != len(ctKeyV6) { + log.Panic("KeyV6 has unexpected length") + } + copy(ctKeyV6[:], k[:]) + return ctKeyV6 +} + +func ValueV6FromBytes(v []byte) ValueV6 { + var ctVal ValueV6 + if len(v) != len(ctVal) { + log.Panic("ValueV6 has unexpected length") + } + copy(ctVal[:], v[:]) + return ctVal +} + +type MapMemV6 = curVer.MapMemV6 + +// LoadMapMem loads ConntrackMap into memory +func LoadMapMemV6(m maps.Map) (MapMemV6, error) { + ret, err := curVer.LoadMapMemV6(m) + return ret, err +} + +// MapMemIter returns maps.MapIter that loads the provided MapMem +func MapMemIterV6(m MapMemV6) func(k, v []byte) { + return curVer.MapMemIterV6(m) +} + +// BytesToKeyV6 turns a slice of bytes into a KeyV6 +func BytesToKeyV6(bytes []byte) KeyV6 { + var k KeyV6 + + copy(k[:], bytes[:]) + + return k +} + +// StringToKeyV6 turns a string into a KeyV6 +func StringToKeyV6(str string) KeyV6 { + return BytesToKeyV6([]byte(str)) +} + +// BytesToValueV6 turns a slice of bytes into a value +func BytesToValueV6(bytes []byte) ValueV6 { + var v ValueV6 + + copy(v[:], bytes) + + return v +} + +// 
StringToValueV6 turns a string into a ValueV6 +func StringToValueV6(str string) ValueV6 { + return BytesToValueV6([]byte(str)) +} + func GetMapParams(version int) maps.MapParameters { switch version { case 2: diff --git a/felix/bpf/conntrack/v3/map.go b/felix/bpf/conntrack/v3/map.go index 129592443b1..3fa50d4b0ce 100644 --- a/felix/bpf/conntrack/v3/map.go +++ b/felix/bpf/conntrack/v3/map.go @@ -506,12 +506,6 @@ var MapParams = maps.MapParameters{ UpdatedByBPF: true, } -const ( - ProtoICMP = 1 - ProtoTCP = 6 - ProtoUDP = 17 -) - func KeyFromBytes(k []byte) Key { var ctKey Key if len(k) != len(ctKey) { diff --git a/felix/bpf/conntrack/v3/map6.go b/felix/bpf/conntrack/v3/map6.go new file mode 100644 index 00000000000..5799e6cf92f --- /dev/null +++ b/felix/bpf/conntrack/v3/map6.go @@ -0,0 +1,444 @@ +// Copyright (c) 2022 Tigera, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package v3 + +import ( + "encoding/binary" + "fmt" + "net" + "time" + + log "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + + "github.com/projectcalico/calico/felix/bpf/maps" +) + +// struct calico_ct_key { +// uint32_t protocol; +// __be32 addr_a, addr_b; // NBO +// uint16_t port_a, port_b; // HBO +// }; +const KeyV6Size = 40 +const ValueV6Size = 128 + +type KeyV6 [KeyV6Size]byte + +func (k KeyV6) AsBytes() []byte { + return k[:] +} + +func (k KeyV6) Proto() uint8 { + return uint8(binary.LittleEndian.Uint32(k[:4])) +} + +func (k KeyV6) AddrA() net.IP { + return k[4:20] +} + +func (k KeyV6) PortA() uint16 { + return binary.LittleEndian.Uint16(k[36:38]) +} + +func (k KeyV6) AddrB() net.IP { + return k[20:36] +} + +func (k KeyV6) PortB() uint16 { + return binary.LittleEndian.Uint16(k[38:40]) +} + +func (k KeyV6) String() string { + return fmt.Sprintf("ConntrackKey{proto=%v %v:%v <-> %v:%v}", + k.Proto(), k.AddrA(), k.PortA(), k.AddrB(), k.PortB()) +} + +func (k KeyV6) Upgrade() maps.Upgradable { + panic("conntrack map key already at its latest version") +} + +func NewKeyV6(proto uint8, ipA net.IP, portA uint16, ipB net.IP, portB uint16) KeyV6 { + var k KeyV6 + binary.LittleEndian.PutUint32(k[:4], uint32(proto)) + copy(k[4:20], ipA.To16()) + copy(k[20:36], ipB.To16()) + binary.LittleEndian.PutUint16(k[36:38], portA) + binary.LittleEndian.PutUint16(k[38:40], portB) + return k +} + +// struct calico_ct_value { +// __u64 created; +// __u64 last_seen; // 8 +// __u8 type; // 16 +// __u8 flags; // 17 +// +// // Important to use explicit padding, otherwise the compiler can decide +// // not to zero the padding bytes, which upsets the verifier. Worse than +// // that, debug logging often prevents such optimisation resulting in +// // failures when debug logging is compiled out only :-). +// __u8 pad0[5]; +// __u8 flags2; +// union { +// // CALI_CT_TYPE_NORMAL and CALI_CT_TYPE_NAT_REV. 
+// struct { +// struct calico_ct_leg a_to_b; // 24 +// struct calico_ct_leg b_to_a; // 36 +// +// // CALI_CT_TYPE_NAT_REV only. +// __u32 orig_dst; // 48 +// __u16 orig_port; // 52 +// __u8 pad1[2]; // 54 +// __u32 tun_ip; // 56 +// __u32 pad3; // 60 +// }; +// +// // CALI_CT_TYPE_NAT_FWD; key for the CALI_CT_TYPE_NAT_REV entry. +// struct { +// struct calico_ct_key nat_rev_key; // 24 +// __u8 pad2[8]; +// }; +// }; +// }; + +const ( + VoCreatedV6 int = 0 + VoLastSeenV6 int = 8 + VoTypeV6 int = 16 + VoFlagsV6 int = 17 + VoFlags2V6 int = 23 + VoRevKeyV6 int = 24 + VoLegABV6 int = 24 + VoLegBAV6 int = 48 + VoTunIPV6 int = 72 + VoOrigIPV6 int = VoTunIPV6 + 16 + VoOrigPortV6 int = VoOrigIPV6 + 16 + VoOrigSPortV6 int = VoOrigPortV6 + 2 + VoOrigSIPV6 int = VoOrigSPortV6 + 2 + VoNATSPortV6 int = VoRevKeyV6 + KeyV6Size +) + +type ValueV6 [ValueV6Size]byte + +func (e ValueV6) Created() int64 { + return int64(binary.LittleEndian.Uint64(e[VoCreatedV6 : VoCreatedV6+8])) +} + +func (e ValueV6) LastSeen() int64 { + return int64(binary.LittleEndian.Uint64(e[VoLastSeenV6 : VoLastSeenV6+8])) +} + +func (e ValueV6) Type() uint8 { + return e[VoTypeV6] +} + +func (e ValueV6) Flags() uint16 { + return uint16(e[VoFlagsV6]) | (uint16(e[VoFlags2]) << 8) +} + +// OrigIP returns the original destination IP, valid only if Type() is TypeNormal or TypeNATReverse +func (e ValueV6) OrigIP() net.IP { + return e[VoOrigIPV6 : VoOrigIPV6+16] +} + +// OrigPort returns the original destination port, valid only if Type() is TypeNormal or TypeNATReverse +func (e ValueV6) OrigPort() uint16 { + return binary.LittleEndian.Uint16(e[VoOrigPortV6 : VoOrigPortV6+2]) +} + +// OrigSPort returns the original source port, valid only if Type() is +// TypeNATReverse and if the value returned is non-zero. +func (e ValueV6) OrigSPort() uint16 { + return binary.LittleEndian.Uint16(e[VoOrigSPortV6 : VoOrigSPortV6+2]) +} + +// NATSPort returns the port to SNAT to, valid only if Type() is TypeNATForward. 
+func (e ValueV6) NATSPort() uint16 { + return binary.LittleEndian.Uint16(e[VoNATSPortV6 : VoNATSPortV6+2]) +} + +// OrigSrcIP returns the original source IP. +func (e ValueV6) OrigSrcIP() net.IP { + return e[VoOrigSIPV6 : VoOrigSIPV6+16] +} + +func (e ValueV6) ReverseNATKey() KeyV6 { + var ret KeyV6 + + l := len(KeyV6{}) + copy(ret[:l], e[VoRevKeyV6:VoRevKeyV6+l]) + + return ret +} + +// AsBytes returns the value as slice of bytes +func (e ValueV6) AsBytes() []byte { + return e[:] +} + +func (e *ValueV6) SetLegA2B(leg Leg) { + copy(e[VoLegABV6:VoLegABV6+legSize], leg.AsBytes()) +} + +func (e *ValueV6) SetLegB2A(leg Leg) { + copy(e[VoLegBAV6:VoLegBAV6+legSize], leg.AsBytes()) +} + +func (e *ValueV6) SetOrigSport(sport uint16) { + binary.LittleEndian.PutUint16(e[VoOrigSPortV6:VoOrigSPortV6+2], sport) +} + +func (e *ValueV6) SetNATSport(sport uint16) { + binary.LittleEndian.PutUint16(e[VoNATSPortV6:VoNATSPortV6+2], sport) +} + +func initValueV6(v *ValueV6, created, lastSeen time.Duration, typ uint8, flags uint16) { + binary.LittleEndian.PutUint64(v[VoCreatedV6:VoCreatedV6+8], uint64(created)) + binary.LittleEndian.PutUint64(v[VoLastSeenV6:VoLastSeenV6+8], uint64(lastSeen)) + v[VoTypeV6] = typ + v[VoFlagsV6] = byte(flags & 0xff) + v[VoFlags2] = byte((flags >> 8) & 0xff) +} + +// NewValueV6Normal creates a new ValueV6 of type TypeNormal based on the given parameters +func NewValueV6Normal(created, lastSeen time.Duration, flags uint16, legA, legB Leg) ValueV6 { + v := ValueV6{} + + initValueV6(&v, created, lastSeen, TypeNormal, flags) + + v.SetLegA2B(legA) + v.SetLegB2A(legB) + + return v +} + +// NewValueV6NATForward creates a new ValueV6 of type TypeNATForward for the given +// arguments and the reverse key +func NewValueV6NATForward(created, lastSeen time.Duration, flags uint16, revKey KeyV6) ValueV6 { + v := ValueV6{} + + initValueV6(&v, created, lastSeen, TypeNATForward, flags) + + copy(v[VoRevKeyV6:VoRevKeyV6+KeyV6Size], revKey.AsBytes()) // whole KeyV6, as read back by ReverseNATKey + + return v +} + +// 
NewValueV6NATReverse creates a new ValueV6 of type TypeNATReverse for the given +// arguments and reverse parameters +func NewValueV6NATReverse(created, lastSeen time.Duration, flags uint16, legA, legB Leg, + tunnelIP, origIP net.IP, origPort uint16) ValueV6 { + v := ValueV6{} + + initValueV6(&v, created, lastSeen, TypeNATReverse, flags) + + v.SetLegA2B(legA) + v.SetLegB2A(legB) + + copy(v[VoOrigIPV6:VoOrigIPV6+16], origIP.To16()) // To16: To4 is nil for IPv6 addresses + binary.LittleEndian.PutUint16(v[VoOrigPortV6:VoOrigPortV6+2], origPort) + + copy(v[VoTunIPV6:VoTunIPV6+16], tunnelIP.To16()) + + return v +} + +// NewValueV6NATReverseSNAT in addition to NewValueV6NATReverse sets the orig source IP +func NewValueV6NATReverseSNAT(created, lastSeen time.Duration, flags uint16, legA, legB Leg, + tunnelIP, origIP, origSrcIP net.IP, origPort uint16) ValueV6 { + v := NewValueV6NATReverse(created, lastSeen, flags, legA, legB, tunnelIP, origIP, origPort) + copy(v[VoOrigSIPV6:VoOrigSIPV6+16], origSrcIP.To16()) // the source IP, not origIP + + return v +} + +func readConntrackLegV6(b []byte) Leg { + bits := binary.LittleEndian.Uint32(b[legExtra+4 : legExtra+8]) + return Leg{ + Bytes: binary.LittleEndian.Uint64(b[0:8]), + Packets: binary.LittleEndian.Uint32(b[8:12]), + Seqno: binary.BigEndian.Uint32(b[legExtra+0 : legExtra+4]), + SynSeen: bitSet(bits, 0), + AckSeen: bitSet(bits, 1), + FinSeen: bitSet(bits, 2), + RstSeen: bitSet(bits, 3), + Approved: bitSet(bits, 4), + Opener: bitSet(bits, 5), + Ifindex: binary.LittleEndian.Uint32(b[legExtra+8 : legExtra+12]), + } +} + +func (e ValueV6) Data() EntryData { + ip := e[VoOrigIPV6 : VoOrigIPV6+16] + tip := e[VoTunIPV6 : VoTunIPV6+16] + sip := e[VoOrigSIPV6 : VoOrigSIPV6+16] + return EntryData{ + A2B: readConntrackLegV6(e[VoLegABV6 : VoLegABV6+legSize]), + B2A: readConntrackLegV6(e[VoLegBAV6 : VoLegBAV6+legSize]), + OrigDst: ip, + OrigSrc: sip, + OrigPort: binary.LittleEndian.Uint16(e[VoOrigPortV6 : VoOrigPortV6+2]), + OrigSPort: binary.LittleEndian.Uint16(e[VoOrigPortV6+2 : VoOrigPortV6+4]), + TunIP: 
tip, + } +} + +func (e ValueV6) String() string { + flagsStr := "" + flags := e.Flags() + + if flags == 0 { + flagsStr = " " + } else { + flagsStr = fmt.Sprintf(" 0x%x", flags) + if flags&FlagNATOut != 0 { + flagsStr += " nat-out" + } + + if flags&FlagNATFwdDsr != 0 { + flagsStr += " fwd-dsr" + } + + if flags&FlagNATNPFwd != 0 { + flagsStr += " np-fwd" + } + + if flags&FlagSkipFIB != 0 { + flagsStr += " skip-fib" + } + + if flags&FlagExtLocal != 0 { + flagsStr += " ext-local" + } + + if flags&FlagViaNATIf != 0 { + flagsStr += " via-nat-iface" + } + + if flags&FlagSrcDstBA != 0 { + flagsStr += " B-A" + } + + if flags&FlagHostPSNAT != 0 { + flagsStr += " host-psnat" + } + + if flags&FlagSvcSelf != 0 { + flagsStr += " svc-self" + } + + if flags&FlagNPLoop != 0 { + flagsStr += " np-loop" + } + + if flags&FlagNPRemote != 0 { + flagsStr += " np-remote" + } + + if flags&FlagNPRemote != 0 { + flagsStr += " no-dsr" + } + } + + ret := fmt.Sprintf("Entry{Type:%d, Created:%d, LastSeen:%d, Flags:%s ", + e.Type(), e.Created(), e.LastSeen(), flagsStr) + + switch e.Type() { + case TypeNATForward: + ret += fmt.Sprintf("REVKey: %s NATSPort: %d", e.ReverseNATKey().String(), e.NATSPort()) + case TypeNormal, TypeNATReverse: + ret += fmt.Sprintf("Data: %+v", e.Data()) + default: + ret += "TYPE INVALID" + } + + return ret + "}" +} + +func (e ValueV6) IsForwardDSR() bool { + return e.Flags()&FlagNATFwdDsr != 0 +} + +func (e ValueV6) Upgrade() maps.Upgradable { + panic("conntrack map value already at its latest version") +} + +var MapParamsV6 = maps.MapParameters{ + Type: "hash", + KeySize: KeyV6Size, + ValueSize: ValueV6Size, + MaxEntries: MaxEntries, + Name: "cali_v6_ct", + Flags: unix.BPF_F_NO_PREALLOC, + Version: 3, + UpdatedByBPF: true, +} + +func KeyV6FromBytes(k []byte) KeyV6 { + var ctKey KeyV6 + if len(k) != len(ctKey) { + log.Panic("KeyV6 has unexpected length") + } + copy(ctKey[:], k[:]) + return ctKey +} + +func ValueV6FromBytes(v []byte) ValueV6 { + var ctVal ValueV6 + if 
len(v) != len(ctVal) { + log.Panic("ValueV6 has unexpected length") + } + copy(ctVal[:], v[:]) + return ctVal +} + +type MapMemV6 map[KeyV6]ValueV6 + +// LoadMapMem loads ConntrackMap into memory +func LoadMapMemV6(m maps.Map) (MapMemV6, error) { + ret := make(MapMemV6) + + err := m.Iter(func(k, v []byte) maps.IteratorAction { + ks := len(KeyV6{}) + vs := len(ValueV6{}) + + var key KeyV6 + copy(key[:ks], k[:ks]) + + var val ValueV6 + copy(val[:vs], v[:vs]) + + ret[key] = val + return maps.IterNone + }) + + return ret, err +} + +// MapMemIterV6 returns maps.MapIter that loads the provided MapMemV6 +func MapMemIterV6(m MapMemV6) func(k, v []byte) { + ks := len(KeyV6{}) + vs := len(ValueV6{}) + + return func(k, v []byte) { + var key KeyV6 + copy(key[:ks], k[:ks]) + + var val ValueV6 + copy(val[:vs], v[:vs]) + + m[key] = val + } +} diff --git a/felix/bpf/libbpf/libbpf.go b/felix/bpf/libbpf/libbpf.go index 83145af3d44..d3a1a40eeaa 100644 --- a/felix/bpf/libbpf/libbpf.go +++ b/felix/bpf/libbpf/libbpf.go @@ -66,6 +66,10 @@ func (m *Map) ValueSize() int { return int(C.bpf_map__value_size(m.bpfMap)) } +func (m *Map) KeySize() int { + return int(C.bpf_map__key_size(m.bpfMap)) +} + func (m *Map) SetPinPath(path string) error { cPath := C.CString(path) defer C.free(unsafe.Pointer(cPath)) @@ -415,6 +419,41 @@ func TcSetGlobals( return err } +func TcSetGlobals6( + m *Map, + globalData *TcGlobalData6, +) error { + + cName := C.CString(globalData.IfaceName) + defer C.free(unsafe.Pointer(cName)) + + cJumps := make([]C.uint, len(globalData.Jumps)) + + for i, v := range globalData.Jumps { + cJumps[i] = C.uint(v) + } + + _, err := C.bpf_tc_set_globals_v6(m.bpfMap, + cName, + (*C.char)(unsafe.Pointer(&globalData.HostIP[0])), + (*C.char)(unsafe.Pointer(&globalData.IntfIP[0])), + C.uint(globalData.ExtToSvcMark), + C.ushort(globalData.Tmtu), + C.ushort(globalData.VxlanPort), + C.ushort(globalData.PSNatStart), + C.ushort(globalData.PSNatLen), + 
(*C.char)(unsafe.Pointer(&globalData.HostTunnelIP[0])), + C.uint(globalData.Flags), + C.ushort(globalData.WgPort), + C.uint(globalData.NatIn), + C.uint(globalData.NatOut), + C.uint(globalData.LogFilterJmp), + &cJumps[0], // it is safe because we hold the reference here until we return. + ) + + return err +} + func CTLBSetGlobals(m *Map, udpNotSeen time.Duration, excludeUDP bool) error { udpNotSeen /= time.Second // Convert to seconds _, err := C.bpf_ctlb_set_globals(m.bpfMap, C.uint(udpNotSeen), C.bool(excludeUDP)) diff --git a/felix/bpf/libbpf/libbpf_api.h b/felix/bpf/libbpf/libbpf_api.h index 03df79842a1..33c36dc51a4 100644 --- a/felix/bpf/libbpf/libbpf_api.h +++ b/felix/bpf/libbpf/libbpf_api.h @@ -19,6 +19,7 @@ #include #include #include "globals.h" +#include "ip_addr.h" static void set_errno(int ret) { errno = ret >= 0 ? ret : -ret; @@ -181,6 +182,52 @@ void bpf_tc_set_globals(struct bpf_map *map, set_errno(bpf_map__set_initial_value(map, (void*)(&data), sizeof(data))); } +void bpf_tc_set_globals_v6(struct bpf_map *map, + char *iface_name, + char* host_ip, + char* intf_ip, + uint ext_to_svc_mark, + ushort tmtu, + ushort vxlanPort, + ushort psnat_start, + ushort psnat_len, + char* host_tunnel_ip, + uint flags, + ushort wg_port, + uint natin, + uint natout, + uint log_filter_jmp, + uint *jumps) +{ + struct cali_tc_globals_v6 data = { + .tunnel_mtu = tmtu, + .vxlan_port = vxlanPort, + .ext_to_svc_mark = ext_to_svc_mark, + .psnat_start = psnat_start, + .psnat_len = psnat_len, + .flags = flags, + .wg_port = wg_port, + .natin_idx = natin, + .natout_idx = natout, + .log_filter_jmp = log_filter_jmp, + }; + + memcpy(&data.host_ip, host_ip, 16); + memcpy(&data.intf_ip, intf_ip, 16); + memcpy(&data.host_tunnel_ip, host_tunnel_ip, 16); + + strncpy(data.iface_name, iface_name, sizeof(data.iface_name)); + data.iface_name[sizeof(data.iface_name)-1] = '\0'; + + int i; + + for (i = 0; i < sizeof(data.jumps)/sizeof(uint); i++) { + data.jumps[i] = jumps[i]; + } + + 
set_errno(bpf_map__set_initial_value(map, (void*)(&data), sizeof(data))); +} + int bpf_xdp_program_id(int ifIndex) { __u32 prog_id = 0, flags = 0; int err; diff --git a/felix/bpf/libbpf/libbpf_common.go b/felix/bpf/libbpf/libbpf_common.go index 5082e6c82a5..888ef084286 100644 --- a/felix/bpf/libbpf/libbpf_common.go +++ b/felix/bpf/libbpf/libbpf_common.go @@ -32,6 +32,24 @@ type TcGlobalData struct { Jumps [32]uint32 } +type TcGlobalData6 struct { + IfaceName string + HostIP [16]byte + IntfIP [16]byte + ExtToSvcMark uint32 + Tmtu uint16 + VxlanPort uint16 + PSNatStart uint16 + PSNatLen uint16 + HostTunnelIP [16]byte + Flags uint32 + WgPort uint16 + NatIn uint32 + NatOut uint32 + LogFilterJmp uint32 + Jumps [32]uint32 +} + type XDPGlobalData struct { IfaceName string Jumps [32]uint32 diff --git a/felix/bpf/libbpf/libbpf_stub.go b/felix/bpf/libbpf/libbpf_stub.go index ae089f84f93..fc57f75ad9a 100644 --- a/felix/bpf/libbpf/libbpf_stub.go +++ b/felix/bpf/libbpf/libbpf_stub.go @@ -133,6 +133,10 @@ func TcSetGlobals(_ *Map, globalData *TcGlobalData) error { panic("LIBBPF syscall stub") } +func TcSetGlobals6(_ *Map, globalData *TcGlobalData6) error { + panic("LIBBPF syscall stub") +} + func CTLBSetGlobals(_ *Map, _ time.Duration, _ bool) error { panic("LIBBPF syscall stub") } diff --git a/felix/bpf/maps/maps.go b/felix/bpf/maps/maps.go index 000d428ede7..cdafae9982f 100644 --- a/felix/bpf/maps/maps.go +++ b/felix/bpf/maps/maps.go @@ -645,7 +645,11 @@ func (b *PinnedMap) EnsureExists() error { } } - log.WithField("name", b.Name).Debug("Map didn't exist, creating it") + log.WithFields(log.Fields{ + "name": b.Name, + "keySize": b.KeySize, + "valuesize": b.ValueSize, + }).Debug("Map didn't exist, creating it") cmd := exec.Command("bpftool", "map", "create", b.VersionedFilename(), "type", b.Type, "key", fmt.Sprint(b.KeySize), diff --git a/felix/bpf/nat/maps.go b/felix/bpf/nat/maps.go index 7314c5368d5..ecc760506bd 100644 --- a/felix/bpf/nat/maps.go +++ b/felix/bpf/nat/maps.go 
@@ -34,12 +34,22 @@ func init() { maps.SetSize(AffinityMapParameters.VersionedName(), AffinityMapParameters.MaxEntries) maps.SetSize(SendRecvMsgMapParameters.VersionedName(), SendRecvMsgMapParameters.MaxEntries) maps.SetSize(CTNATsMapParameters.VersionedName(), CTNATsMapParameters.MaxEntries) + + maps.SetSize(FrontendMapV6Parameters.VersionedName(), FrontendMapV6Parameters.MaxEntries) + maps.SetSize(BackendMapV6Parameters.VersionedName(), BackendMapV6Parameters.MaxEntries) + maps.SetSize(AffinityMapV6Parameters.VersionedName(), AffinityMapV6Parameters.MaxEntries) + maps.SetSize(SendRecvMsgMapV6Parameters.VersionedName(), SendRecvMsgMapV6Parameters.MaxEntries) + maps.SetSize(CTNATsMapV6Parameters.VersionedName(), CTNATsMapV6Parameters.MaxEntries) } func SetMapSizes(fsize, bsize, asize int) { maps.SetSize(FrontendMapParameters.VersionedName(), fsize) maps.SetSize(BackendMapParameters.VersionedName(), bsize) maps.SetSize(AffinityMapParameters.VersionedName(), asize) + + maps.SetSize(FrontendMapV6Parameters.VersionedName(), fsize) + maps.SetSize(BackendMapV6Parameters.VersionedName(), bsize) + maps.SetSize(AffinityMapV6Parameters.VersionedName(), asize) } // struct calico_nat_v4_key { diff --git a/felix/bpf/nat/maps6.go b/felix/bpf/nat/maps6.go new file mode 100644 index 00000000000..591aaa8cf53 --- /dev/null +++ b/felix/bpf/nat/maps6.go @@ -0,0 +1,630 @@ +// Copyright (c) 2020-2021 Tigera, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package nat + +import ( + "encoding/binary" + "fmt" + "net" + "time" + + "golang.org/x/sys/unix" + + "github.com/projectcalico/calico/felix/bpf/maps" + "github.com/projectcalico/calico/felix/ip" +) + +// struct calico_nat_v4_key { +// uint32_t prefixLen; +// uint32_t addr; // NBO +// uint16_t port; // HBO +// uint8_t protocol; +// uint32_t saddr; +// uint8_t pad; +// }; +const frontendKeyV6Size = 40 + +// struct calico_nat { +// uint32_t addr; +// uint16_t port; +// uint8_t protocol; +// uint8_t pad; +// }; +const frontendAffKeyV6Size = 20 + +// struct calico_nat_v4_value { +// uint32_t id; +// uint32_t count; +// uint32_t local; +// uint32_t affinity_timeo; +// uint32_t flags; +// }; +const frontendValueV6Size = 20 + +// struct calico_nat_secondary_v4_key { +// uint32_t id; +// uint32_t ordinal; +// }; +const backendKeyV6Size = 8 + +// struct calico_nat_dest { +// uint32_t addr; +// uint16_t port; +// uint8_t pad[2]; +// }; +const backendValueV6Size = 20 + +// (sizeof(addr) + sizeof(port) + sizeof(proto)) in bits +const ZeroCIDRV6PrefixLen = (16 + 2 + 1) * 8 + +var ZeroCIDRV6 = ip.MustParseCIDROrIP("::/0").(ip.V6CIDR) + +type FrontendKeyV6 [frontendKeyV6Size]byte + +func NewNATKeyV6(addr net.IP, port uint16, protocol uint8) FrontendKeyV6 { + return NewNATKeyV6Src(addr, port, protocol, ZeroCIDRV6) +} + +func NewNATKeyV6Src(addr net.IP, port uint16, protocol uint8, cidr ip.V6CIDR) FrontendKeyV6 { + var k FrontendKeyV6 + prefixlen := ZeroCIDRV6PrefixLen + addr = addr.To16() + binary.LittleEndian.PutUint32(k[:4], uint32(prefixlen)+uint32(cidr.Prefix())) + copy(k[4:20], addr) + binary.LittleEndian.PutUint16(k[20:22], port) + k[22] = protocol + copy(k[23:39], cidr.Addr().AsNetIP().To16()) + return k +} + +func (k FrontendKeyV6) Proto() uint8 { + return k[22] +} + +func (k FrontendKeyV6) Addr() net.IP { + return k[4:20] +} + +func (k FrontendKeyV6) srcAddr() 
ip.Addr { + var addr ip.V6Addr + copy(addr[:], k[23:39]) + return addr +} + +// This function returns the Prefix length of the source CIDR +func (k FrontendKeyV6) SrcPrefixLen() uint32 { + return k.PrefixLen() - ZeroCIDRV6PrefixLen +} + +func (k FrontendKeyV6) SrcCIDR() ip.CIDR { + return ip.CIDRFromAddrAndPrefix(k.srcAddr(), int(k.SrcPrefixLen())) +} + +func (k FrontendKeyV6) PrefixLen() uint32 { + return binary.LittleEndian.Uint32(k[0:4]) +} + +func (k FrontendKeyV6) Port() uint16 { + return binary.LittleEndian.Uint16(k[20:22]) +} + +func (k FrontendKeyV6) AsBytes() []byte { + return k[:] +} + +func (k FrontendKeyV6) Affinitykey() []byte { + return k[4 : 4+frontendAffKeyV6Size] +} + +func (k FrontendKeyV6) String() string { + return fmt.Sprintf("NATKeyV6{Proto:%v Addr:%v Port:%v SrcAddr:%v}", k.Proto(), k.Addr(), k.Port(), k.SrcCIDR()) +} + +func FrontendKeyV6FromBytes(b []byte) FrontendKeyV6 { + var k FrontendKeyV6 + copy(k[:], b) + return k +} + +type FrontendValueV6 = FrontendValue + +func NewNATValueV6(id uint32, count, local, affinityTimeo uint32) FrontendValueV6 { + return NewNATValue(id, count, local, affinityTimeo) +} + +func NewNATValueV6WithFlags(id uint32, count, local, affinityTimeo, flags uint32) FrontendValueV6 { + v := NewNATValue(id, count, local, affinityTimeo) + binary.LittleEndian.PutUint32(v[16:20], flags) + return v +} + +func FrontendValueV6FromBytes(b []byte) FrontendValueV6 { + var v FrontendValueV6 + copy(v[:], b) + return v +} + +type BackendKeyV6 = BackendKey + +func NewNATBackendKeyV6(id, ordinal uint32) BackendKeyV6 { + return NewNATBackendKey(id, ordinal) +} + +func BackendKeyV6FromBytes(b []byte) BackendKeyV6 { + var k BackendKeyV6 + copy(k[:], b) + return k +} + +type BackendValueV6 [backendValueV6Size]byte + +func NewNATBackendValueV6(addr net.IP, port uint16) BackendValueV6 { + var k BackendValueV6 + addr = addr.To16() + copy(k[:16], addr) + binary.LittleEndian.PutUint16(k[16:18], port) + return k +} + +func (k BackendValueV6) Addr() net.IP { + return
k[:16] +} + +func (k BackendValueV6) Port() uint16 { + return binary.LittleEndian.Uint16(k[16:18]) +} + +func (k BackendValueV6) String() string { + return fmt.Sprintf("NATBackendValueV6{Addr:%v Port:%v}", k.Addr(), k.Port()) +} + +func (k BackendValueV6) AsBytes() []byte { + return k[:] +} + +func BackendValueV6FromBytes(b []byte) BackendValueV6 { + var v BackendValueV6 + copy(v[:], b) + return v +} + +var FrontendMapV6Parameters = maps.MapParameters{ + Type: "lpm_trie", + KeySize: frontendKeyV6Size, + ValueSize: frontendValueV6Size, + MaxEntries: 64 * 1024, + Name: "cali_v6_nat_fe", + Flags: unix.BPF_F_NO_PREALLOC, + Version: 3, +} + +func FrontendMapV6() maps.MapWithExistsCheck { + return maps.NewPinnedMap(FrontendMapV6Parameters) +} + +var BackendMapV6Parameters = maps.MapParameters{ + Type: "hash", + KeySize: backendKeyV6Size, + ValueSize: backendValueV6Size, + MaxEntries: 256 * 1024, + Name: "cali_v6_nat_be", + Flags: unix.BPF_F_NO_PREALLOC, +} + +func BackendMapV6() maps.MapWithExistsCheck { + return maps.NewPinnedMap(BackendMapV6Parameters) +} + +// NATMapMem represents FrontendMap loaded into memory +type MapMemV6 map[FrontendKeyV6]FrontendValueV6 + +// Equal compares keys and values of the NATMapMem +func (m MapMemV6) Equal(cmp MapMemV6) bool { + if len(m) != len(cmp) { + return false + } + + for k, v := range m { + v2, ok := cmp[k] + if !ok || v != v2 { + return false + } + } + + return true +} + +// LoadFrontendMap loads the NAT map into a go map or returns an error +func LoadFrontendMapV6(m maps.Map) (MapMemV6, error) { + ret := make(MapMemV6) + + if err := m.Open(); err != nil { + return nil, err + } + + iterFn := MapMemV6Iter(ret) + + err := m.Iter(func(k, v []byte) maps.IteratorAction { + iterFn(k, v) + return maps.IterNone + }) + if err != nil { + ret = nil + } + + return ret, err +} + +// MapMemIter returns maps.MapIter that loads the provided NATMapMem +func MapMemV6Iter(m MapMemV6) func(k, v []byte) { + ks := len(FrontendKeyV6{}) + vs :=
len(FrontendValueV6{}) + + return func(k, v []byte) { + var key FrontendKeyV6 + copy(key[:ks], k[:ks]) + + var val FrontendValueV6 + copy(val[:vs], v[:vs]) + + m[key] = val + } +} + +// BackendMapMemV6 represents a NATBackend loaded into memory +type BackendMapMemV6 map[BackendKeyV6]BackendValueV6 + +// Equal compares keys and values of the NATBackendMapMem +func (m BackendMapMemV6) Equal(cmp BackendMapMemV6) bool { + if len(m) != len(cmp) { + return false + } + + for k, v := range m { + v2, ok := cmp[k] + if !ok || v != v2 { + return false + } + } + + return true +} + +// LoadBackendMap loads the NATBackend map into a go map or returns an error +func LoadBackendMapV6(m maps.Map) (BackendMapMemV6, error) { + ret := make(BackendMapMemV6) + + if err := m.Open(); err != nil { + return nil, err + } + + iterFn := BackendMapMemV6Iter(ret) + + err := m.Iter(func(k, v []byte) maps.IteratorAction { + iterFn(k, v) + return maps.IterNone + }) + if err != nil { + ret = nil + } + + return ret, err +} + +// BackendMapMemIter returns maps.MapIter that loads the provided NATBackendMapMem +func BackendMapMemV6Iter(m BackendMapMemV6) func(k, v []byte) { + ks := len(BackendKeyV6{}) + vs := len(BackendValueV6{}) + + return func(k, v []byte) { + var key BackendKeyV6 + copy(key[:ks], k[:ks]) + + var val BackendValueV6 + copy(val[:vs], v[:vs]) + + m[key] = val + } +} + +// struct calico_nat_v4_affinity_key { +// struct calico_nat_v4 nat_key; +// uint32_t client_ip; +// uint32_t padding; +// }; + +const affinityKeyV6Size = frontendAffKeyV6Size + 16 + 4 + +// AffinityKeyV6 is a key into the affinity table that consist of FrontendKeyV6 and +// the client's IP +type AffinityKeyV6 [affinityKeyV6Size]byte + +type FrontEndAffinityKeyV6 [frontendAffKeyV6Size]byte + +func (k FrontEndAffinityKeyV6) AsBytes() []byte { + return k[:] +} + +func (k FrontEndAffinityKeyV6) String() string { + return fmt.Sprintf("FrontEndAffinityKeyV6{Proto:%v Addr:%v Port:%v}", k.Proto(), k.Addr(), k.Port()) +} + +func 
(k FrontEndAffinityKeyV6) Proto() uint8 { + return k[18] +} + +func (k FrontEndAffinityKeyV6) Addr() net.IP { + return k[0:16] +} + +func (k FrontEndAffinityKeyV6) Port() uint16 { + return binary.LittleEndian.Uint16(k[16:18]) +} + +// NewAffinityKeyV6 creates a new AffinityKeyV6 from a clientIP and FrontendKeyV6 +func NewAffinityKeyV6(clientIP net.IP, fEndKey FrontendKeyV6) AffinityKeyV6 { + var k AffinityKeyV6 + + copy(k[:], fEndKey[4:4+frontendAffKeyV6Size]) + + addr := clientIP.To16() + copy(k[frontendAffKeyV6Size:frontendAffKeyV6Size+16], addr) + return k +} + +// ClientIP returns the ClientIP part of the key +func (k AffinityKeyV6) ClientIP() net.IP { + return k[frontendAffKeyV6Size : frontendAffKeyV6Size+16] +} + +// FrontendAffinityKey returns the FrontEndAffinityKeyV6 part of the key +func (k AffinityKeyV6) FrontendAffinityKey() FrontEndAffinityKeyV6 { + var f FrontEndAffinityKeyV6 + copy(f[:], k[:frontendAffKeyV6Size]) + + return f +} + +func (k AffinityKeyV6) String() string { + return fmt.Sprintf("AffinityKeyV6{ClientIP:%v %s}", k.ClientIP(), k.FrontendAffinityKey()) +} + +// AsBytes returns the key as []byte +func (k AffinityKeyV6) AsBytes() []byte { + return k[:] +} + +// struct calico_nat_v4_affinity_val { +// struct calico_nat_dest; +// uint64_t ts; +// }; + +const affinityValueV6Size = backendValueV6Size + 4 + 8 + +// AffinityValueV6 represents a backend picked by the affinity and the timestamp +// of its creating +type AffinityValueV6 [affinityValueV6Size]byte + +// NewAffinityValue creates a value from a timestamp and a backend +func NewAffinityValueV6(ts uint64, backend BackendValueV6) AffinityValueV6 { + var v AffinityValueV6 + + copy(v[:], backend[:]) + binary.LittleEndian.PutUint64(v[backendValueV6Size:backendValueV6Size+8], ts) + + return v +} + +// Timestamp returns the timestamp of the entry.
It is generated by +// bpf_ktime_get_ns which returns the time since the system boot in nanoseconds +// - it is the monotonic clock reading, which is compatible with time operations +// in time package. +func (v AffinityValueV6) Timestamp() time.Duration { + nano := binary.LittleEndian.Uint64(v[backendValueV6Size : backendValueV6Size+8]) + return time.Duration(nano) * time.Nanosecond +} + +// Backend returns the backend the affinity ties the frontend + client to. +func (v AffinityValueV6) Backend() BackendValueV6 { + var b BackendValueV6 + + copy(b[:], v[:backendValueV6Size]) + + return b +} + +func (v AffinityValueV6) String() string { + return fmt.Sprintf("AffinityValueV6{Timestamp:%d,Backend:%v}", v.Timestamp(), v.Backend()) +} + +// AsBytes returns the value as []byte +func (v AffinityValueV6) AsBytes() []byte { + return v[:] +} + +// AffinityMapParameters describe the AffinityMap +var AffinityMapV6Parameters = maps.MapParameters{ + Type: "lru_hash", + KeySize: affinityKeyV6Size, + ValueSize: affinityValueV6Size, + MaxEntries: 64 * 1024, + Name: "cali_v6_nat_aff", +} + +// AffinityMap returns an instance of an affinity map +func AffinityMapV6() maps.Map { + return maps.NewPinnedMap(AffinityMapV6Parameters) +} + +// AffinityMapMem represents affinity map in memory +type AffinityMapMemV6 map[AffinityKeyV6]AffinityValueV6 + +// LoadAffinityMap loads affinity map into memory +func LoadAffinityMapV6(m maps.Map) (AffinityMapMemV6, error) { + ret := make(AffinityMapMemV6) + + if err := m.Open(); err != nil { + return nil, err + } + + iterFn := AffinityMapMemV6Iter(ret) + + err := m.Iter(func(k, v []byte) maps.IteratorAction { + iterFn(k, v) + return maps.IterNone + }) + if err != nil { + ret = nil + } + + return ret, err +} + +// AffinityMapMemIter returns maps.MapIter that loads the provided AffinityMapMem +func AffinityMapMemV6Iter(m AffinityMapMemV6) func(k, v []byte) { + ks := len(AffinityKeyV6{}) + vs := len(AffinityValueV6{}) + + return func(k, v []byte) { + var key
AffinityKeyV6 + copy(key[:ks], k[:ks]) + + var val AffinityValueV6 + copy(val[:vs], v[:vs]) + + m[key] = val + } +} + +// struct sendrecv4_key { +// uint64_t cookie; +// uint32_t ip; +// uint32_t port; +// }; +// +// struct sendrecv4_val { +// uint32_t ip; +// uint32_t port; +// }; + +const sendRecvMsgKeyV6Size = 28 +const ctNATsMsgKeyV6Size = 38 + +// SendRecvMsgKeyV6 is the key for SendRecvMsgMap +type SendRecvMsgKeyV6 [sendRecvMsgKeyV6Size]byte + +// Cookie returns the socket cookie part of the key that can be used to match +// the socket. +func (k SendRecvMsgKeyV6) Cookie() uint64 { + return binary.LittleEndian.Uint64(k[0:8]) +} + +// IP returns the IP address part of the key +func (k SendRecvMsgKeyV6) IP() net.IP { + return k[8:24] +} + +// Port returns port converted to 16-bit host endianness +func (k SendRecvMsgKeyV6) Port() uint16 { + port := binary.BigEndian.Uint32(k[24:28]) + return uint16(port >> 16) +} + +func (k SendRecvMsgKeyV6) String() string { + return fmt.Sprintf("SendRecvMsgKeyV6{Cookie: 0x%016x, IP: %+v, Port: %+v}", k.Cookie(), k.IP(), k.Port()) +} + +const sendRecvMsgValueV6Size = 20 + +// SendRecvMsgValueV6 is the value of SendRecvMsgMap +type SendRecvMsgValueV6 [sendRecvMsgValueV6Size]byte + +// IP returns the IP address part of the key +func (v SendRecvMsgValueV6) IP() net.IP { + return v[0:16] +} + +// Port returns port converted to 16-bit host endianness +func (v SendRecvMsgValueV6) Port() uint16 { + port := binary.BigEndian.Uint32(v[16:20]) + return uint16(port >> 16) +} + +func (v SendRecvMsgValueV6) String() string { + return fmt.Sprintf("SendRecvMsgValueV6{IP: %+v, Port: %+v}", v.IP(), v.Port()) +} + +// SendRecvMsgMapParameters define SendRecvMsgMap +var SendRecvMsgMapV6Parameters = maps.MapParameters{ + Type: "lru_hash", + KeySize: sendRecvMsgKeyV6Size, + ValueSize: sendRecvMsgValueV6Size, + MaxEntries: 510000, + Name: "cali_v6_srmsg", +} + +var CTNATsMapV6Parameters = maps.MapParameters{ + Type: "lru_hash", + KeySize: 
ctNATsMsgKeyV6Size, + ValueSize: sendRecvMsgValueV6Size, + MaxEntries: 10000, + Name: "cali_v6_ct_nats", +} + +// SendRecvMsgMap tracks reverse translations for sendmsg/recvmsg of +// unconnected UDP +func SendRecvMsgMapV6() maps.Map { + return maps.NewPinnedMap(SendRecvMsgMapV6Parameters) +} + +func AllNATsMsgMapV6() maps.Map { + return maps.NewPinnedMap(CTNATsMapV6Parameters) +} + +// SendRecvMsgMapMem represents affinity map in memory +type SendRecvMsgMapMemV6 map[SendRecvMsgKeyV6]SendRecvMsgValueV6 + +// LoadSendRecvMsgMap loads affinity map into memory +func LoadSendRecvMsgMapV6(m maps.Map) (SendRecvMsgMapMemV6, error) { + ret := make(SendRecvMsgMapMemV6) + + iterFn := SendRecvMsgMapMemV6Iter(ret) + + err := m.Iter(func(k, v []byte) maps.IteratorAction { + iterFn(k, v) + return maps.IterNone + }) + if err != nil { + ret = nil + } + + return ret, err +} + +// SendRecvMsgMapMemIter returns maps.MapIter that loads the provided SendRecvMsgMapMem +func SendRecvMsgMapMemV6Iter(m SendRecvMsgMapMemV6) func(k, v []byte) { + ks := len(SendRecvMsgKeyV6{}) + vs := len(SendRecvMsgValueV6{}) + + return func(k, v []byte) { + var key SendRecvMsgKeyV6 + copy(key[:ks], k[:ks]) + + var val SendRecvMsgValueV6 + copy(val[:vs], v[:vs]) + + m[key] = val + } +} diff --git a/felix/bpf/routes/map.go b/felix/bpf/routes/map.go index 834fe897bcf..effd17cec22 100644 --- a/felix/bpf/routes/map.go +++ b/felix/bpf/routes/map.go @@ -33,6 +33,7 @@ func init() { func SetMapSize(size int) { maps.SetSize(MapParameters.VersionedName(), size) + maps.SetSize(MapV6Parameters.VersionedName(), size) } // struct cali_rt_key { @@ -44,7 +45,7 @@ const KeySize = 8 type Key [KeySize]byte func (k Key) Addr() ip.Addr { - var addr ip.V4Addr // FIXME IPv6 + var addr ip.V4Addr copy(addr[:], k[4:8]) return addr } @@ -101,7 +102,7 @@ func (v Value) Flags() Flags { } func (v Value) NextHop() ip.Addr { - var addr ip.V4Addr // FIXME IPv6 + var addr ip.V4Addr copy(addr[:], v[4:8]) return addr } diff --git 
a/felix/bpf/routes/map6.go b/felix/bpf/routes/map6.go new file mode 100644 index 00000000000..7cc8b213e32 --- /dev/null +++ b/felix/bpf/routes/map6.go @@ -0,0 +1,223 @@ +// Copyright (c) 2023 Tigera, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package routes + +import ( + "encoding/binary" + "fmt" + "strings" + "sync" + + "github.com/pkg/errors" + "golang.org/x/sys/unix" + + "github.com/projectcalico/calico/felix/bpf/maps" + "github.com/projectcalico/calico/felix/ip" +) + +const KeyV6Size = 20 + +type KeyV6 [KeyV6Size]byte + +func (k KeyV6) Addr() ip.Addr { + var addr ip.V6Addr + copy(addr[:], k[4:20]) + return addr +} + +func (k KeyV6) Dest() ip.CIDR { + addr := k.Addr() + return ip.CIDRFromAddrAndPrefix(addr, k.PrefixLen()) +} + +func (k KeyV6) PrefixLen() int { + return int(binary.LittleEndian.Uint32(k[:4])) +} + +func (k KeyV6) AsBytes() []byte { + return k[:] +} + +const ValueV6Size = 20 + +type ValueV6 [ValueV6Size]byte + +func (v ValueV6) Flags() Flags { + return Flags(binary.LittleEndian.Uint32(v[:4])) +} + +func (v ValueV6) NextHop() ip.Addr { + var addr ip.V6Addr + copy(addr[:], v[4:20]) + return addr +} + +func (v ValueV6) IfaceIndex() uint32 { + return binary.LittleEndian.Uint32(v[4:8]) +} + +func (v ValueV6) AsBytes() []byte { + return v[:] +} + +func (v ValueV6) String() string { + var parts []string + + typeFlags := v.Flags() + + if typeFlags&FlagLocal != 0 { + parts = append(parts, "local") + } else { + parts 
= append(parts, "remote") + } + + if typeFlags&FlagHost != 0 { + parts = append(parts, "host") + } else if typeFlags&FlagWorkload != 0 { + parts = append(parts, "workload") + } + + if typeFlags&FlagInIPAMPool != 0 { + parts = append(parts, "in-pool") + } + + if typeFlags&FlagNATOutgoing != 0 { + parts = append(parts, "nat-out") + } + + if typeFlags&FlagSameSubnet != 0 { + parts = append(parts, "same-subnet") + } + + if typeFlags&FlagNoDSR != 0 { + parts = append(parts, "no-dsr") + } + + if typeFlags&FlagTunneled != 0 { + parts = append(parts, "tunneled") + } + + if typeFlags&FlagLocal != 0 && typeFlags&FlagWorkload != 0 { + parts = append(parts, "idx", fmt.Sprint(v.IfaceIndex())) + } + + if typeFlags&FlagLocal == 0 && typeFlags&FlagWorkload != 0 { + parts = append(parts, "nh", fmt.Sprint(v.NextHop())) + } + + if len(parts) == 0 { + return fmt.Sprintf("unknown type (%d)", typeFlags) + } + + return strings.Join(parts, " ") +} + +func NewKeyV6(cidr ip.V6CIDR) KeyV6 { + var k KeyV6 + + binary.LittleEndian.PutUint32(k[:4], uint32(cidr.Prefix())) + copy(k[4:20], cidr.Addr().AsNetIP().To16()) + + return k +} + +func NewValueV6(flags Flags) ValueV6 { + var v ValueV6 + binary.LittleEndian.PutUint32(v[:4], uint32(flags)) + return v +} + +func NewValueV6WithNextHop(flags Flags, nextHop ip.V6Addr) ValueV6 { + var v ValueV6 + binary.LittleEndian.PutUint32(v[:4], uint32(flags)) + copy(v[4:20], nextHop.AsNetIP().To16()) + return v +} + +func NewValueV6WithIfIndex(flags Flags, ifIndex int) ValueV6 { + var v ValueV6 + binary.LittleEndian.PutUint32(v[:4], uint32(flags)) + binary.LittleEndian.PutUint32(v[4:8], uint32(ifIndex)) + return v +} + +var MapV6Parameters = maps.MapParameters{ + Type: "lpm_trie", + KeySize: KeyV6Size, + ValueSize: ValueV6Size, + MaxEntries: 256 * 1024, + Name: "cali_v6_routes", + Flags: unix.BPF_F_NO_PREALLOC, +} + +func MapV6() maps.Map { + return maps.NewPinnedMap(MapV6Parameters) +} + +type MapMemV6 map[KeyV6]ValueV6 + +// LoadMap loads a routes.Map into 
memory +func LoadMapV6(rtm maps.Map) (MapMemV6, error) { + m := make(MapMemV6) + + err := rtm.Iter(func(k, v []byte) maps.IteratorAction { + var key KeyV6 + var value ValueV6 + copy(key[:], k) + copy(value[:], v) + + m[key] = value + return maps.IterNone + }) + + return m, err +} + +type LPMv6 struct { + sync.RWMutex + t *ip.CIDRTrie +} + +func NewLPMv6() *LPMv6 { + return &LPMv6{ + t: ip.NewCIDRTrie(), + } +} + +func (lpm *LPMv6) Update(k KeyV6, v ValueV6) error { + if cidrv6, ok := k.Dest().(ip.V6CIDR); ok { + lpm.t.Update(cidrv6, v) + return nil + } + + return errors.Errorf("k.Dest() %+v type %T is not ip.V6CIDR", k.Dest(), k.Dest()) +} + +func (lpm *LPMv6) Delete(k KeyV6) error { + if cidrv6, ok := k.Dest().(ip.V6CIDR); ok { + lpm.t.Delete(cidrv6) + return nil + } + + return errors.Errorf("k.Dest() %+v type %T is not ip.V6CIDR", k.Dest(), k.Dest()) +} + +func (lpm *LPMv6) Lookup(addr ip.V6Addr) (ValueV6, bool) { + _, v := lpm.t.LPM(addr.AsCIDR().(ip.V6CIDR)) + if v == nil { + return ValueV6{}, false + } + return v.(ValueV6), true +} diff --git a/felix/bpf/state/map.go b/felix/bpf/state/map.go index 86741948345..e47ccc4117c 100644 --- a/felix/bpf/state/map.go +++ b/felix/bpf/state/map.go @@ -17,8 +17,6 @@ package state import ( "unsafe" - log "github.com/sirupsen/logrus" - "github.com/projectcalico/calico/felix/bpf/maps" ) @@ -94,7 +92,8 @@ type State struct { TunIP1 uint32 TunIP2 uint32 TunIP3 uint32 - _ uint32 + ihl uint16 + _ uint16 PolicyRC PolicyResult SrcPort uint16 DstPort uint16 @@ -115,15 +114,12 @@ type State struct { NATData uint64 ProgStartTime uint64 Flags uint64 + _ [48]byte // ipv6 padding } -const expectedSize = 416 +const expectedSize = 464 func (s *State) AsBytes() []byte { - size := unsafe.Sizeof(State{}) - if size != expectedSize { - log.WithField("size", size).Panic("Incorrect struct size") - } bPtr := (*[expectedSize]byte)(unsafe.Pointer(s)) bytes := make([]byte, expectedSize) copy(bytes, bPtr[:]) @@ -143,7 +139,7 @@ var MapParameters =
maps.MapParameters{ ValueSize: expectedSize, MaxEntries: 2, Name: "cali_state", - Version: 3, + Version: 4, } func Map() maps.Map { diff --git a/felix/bpf/tc/attach.go b/felix/bpf/tc/attach.go index fade8fde626..2a38d4041ed 100644 --- a/felix/bpf/tc/attach.go +++ b/felix/bpf/tc/attach.go @@ -459,6 +459,14 @@ func ConfigureProgram(m *libbpf.Map, iface string, globalData *libbpf.TcGlobalDa return libbpf.TcSetGlobals(m, globalData) } +func ConfigureProgramV6(m *libbpf.Map, iface string, globalData *libbpf.TcGlobalData6) error { + in := []byte("---------------") + copy(in, iface) + globalData.IfaceName = string(in) + + return libbpf.TcSetGlobals6(m, globalData) +} + func convertIPToUint32(ip net.IP) (uint32, error) { ipv4 := ip.To4() if ipv4 == nil { diff --git a/felix/bpf/tc/defs/defs.go b/felix/bpf/tc/defs/defs.go index 6a3048b5310..947ed97e1e7 100644 --- a/felix/bpf/tc/defs/defs.go +++ b/felix/bpf/tc/defs/defs.go @@ -60,16 +60,20 @@ const ( ProgIndexDropDebug ProgIndexHostCtConflictDebug ProgIndexIcmpInnerNatDebug - ProgIndexV6Prologue + ProgIndexV6Main ProgIndexV6Policy ProgIndexV6Allowed ProgIndexV6Icmp ProgIndexV6Drop - ProgIndexV6PrologueDebug + ProgIndexV6HostCtConflict + ProgIndexV6IcmpInnerNat + ProgIndexV6MainDebug ProgIndexV6PolicyDebug ProgIndexV6AllowedDebug ProgIndexV6IcmpDebug ProgIndexV6DropDebug + ProgIndexV6HostCtConflictDebug + ProgIndexV6IcmpInnerNatDebug ProgIndexEndDebug ProgIndexEnd @@ -101,17 +105,21 @@ var ProgramNames = []string{ "calico_tc_host_ct_conflict", "calico_tc_skb_icmp_inner_nat", /* ipv6 */ - "calico_tc6", + "calico_tc_main", "calico_tc_norm_pol_tail", "calico_tc_skb_accepted_entrypoint", "calico_tc_skb_send_icmp_replies", "calico_tc_skb_drop", + "calico_tc_host_ct_conflict", + "calico_tc_skb_icmp_inner_nat", /* ipv6 - debug */ - "calico_tc6", + "calico_tc_main", "calico_tc_norm_pol_tail", "calico_tc_skb_accepted_entrypoint", "calico_tc_skb_send_icmp_replies", "calico_tc_skb_drop", + "calico_tc_host_ct_conflict", + 
"calico_tc_skb_icmp_inner_nat", } type ToOrFromEp string diff --git a/felix/bpf/ut/bpf_prog_test.go b/felix/bpf/ut/bpf_prog_test.go index 9f7ce14e48b..3ee8d5c1b05 100644 --- a/felix/bpf/ut/bpf_prog_test.go +++ b/felix/bpf/ut/bpf_prog_test.go @@ -104,6 +104,20 @@ var ( IP: node2ip, Mask: net.IPv4Mask(255, 255, 255, 255), } + + node1ipV6 = net.ParseIP("abcd::ffff:0a0a:0001").To16() + node1ip2V6 = net.ParseIP("abcd::ffff:0a0a:0201").To16() + node1tunIPV6 = net.ParseIP("abcd::ffff:0b0b:0001").To16() + node2ipV6 = net.ParseIP("abcd::ffff:0a0a:0002").To16() + intfIPV6 = net.ParseIP("abcd::ffff:0a0a:0003").To16() + node1CIDRV6 = net.IPNet{ + IP: node1ipV6, + Mask: net.CIDRMask(128, 128), + } + node2CIDRV6 = net.IPNet{ + IP: node2ipV6, + Mask: net.CIDRMask(128, 128), + } ) // Globals that we use to configure the next test run. @@ -195,11 +209,22 @@ var tcJumpMapIndexes = map[string][]int{ tcdefs.ProgIndexIcmpInnerNatDebug, }, "IPv6": []int{ - tcdefs.ProgIndexV6PrologueDebug, + tcdefs.ProgIndexV6Main, + tcdefs.ProgIndexV6Policy, + tcdefs.ProgIndexV6Allowed, + tcdefs.ProgIndexV6Icmp, + tcdefs.ProgIndexV6Drop, + tcdefs.ProgIndexV6HostCtConflict, + tcdefs.ProgIndexV6IcmpInnerNat, + }, + "IPv6 debug": []int{ + tcdefs.ProgIndexV6MainDebug, tcdefs.ProgIndexV6PolicyDebug, tcdefs.ProgIndexV6AllowedDebug, tcdefs.ProgIndexV6IcmpDebug, tcdefs.ProgIndexV6DropDebug, + tcdefs.ProgIndexV6HostCtConflictDebug, + tcdefs.ProgIndexV6IcmpInnerNatDebug, }, } @@ -309,8 +334,13 @@ func setupAndRun(logger testLogger, loglevel, section string, rules *polprog.Rul topts.progLog = "WEP" } - log.WithField("hostIP", hostIP).Info("Host IP") - log.WithField("intfIP", intfIP).Info("Intf IP") + if topts.ipv6 { + log.WithField("hostIP", hostIP).Info("Host IP") + log.WithField("intfIP", intfIPV6).Info("Intf IP") + } else { + log.WithField("hostIP", hostIP).Info("Host IP") + log.WithField("intfIP", intfIP).Info("Intf IP") + } obj += fmt.Sprintf("fib_%s", loglevel) if strings.Contains(section, "_dsr") { @@ 
-318,12 +348,24 @@ func setupAndRun(logger testLogger, loglevel, section string, rules *polprog.Rul } } - if !topts.xdp { - o, err := objLoad("../../bpf-gpl/bin/tc_preamble.o", bpfFsDir, "preamble", topts, false, false) + ipFamily := "IPv4" + policyIdx := tcdefs.ProgIndexPolicy + if topts.ipv6 { + ipFamily = "IPv6" + obj += "_v6" + policyIdx = tcdefs.ProgIndexV6Policy + } + + if topts.xdp { + o, err := objLoad("../../bpf-gpl/bin/xdp_preamble.o", bpfFsDir, "preamble", topts, false, false) + Expect(err).NotTo(HaveOccurred()) + defer o.Close() + } else if topts.ipv6 { + o, err := objLoad("../../bpf-gpl/bin/tc_preamble_v6.o", bpfFsDir, "preamble", topts, false, false) Expect(err).NotTo(HaveOccurred()) defer o.Close() } else { - o, err := objLoad("../../bpf-gpl/bin/xdp_preamble.o", bpfFsDir, "preamble", topts, false, false) + o, err := objLoad("../../bpf-gpl/bin/tc_preamble.o", bpfFsDir, "preamble", topts, false, false) Expect(err).NotTo(HaveOccurred()) defer o.Close() } @@ -340,25 +382,15 @@ func setupAndRun(logger testLogger, loglevel, section string, rules *polprog.Rul err = bin.WriteToFile(tempObj) Expect(err).NotTo(HaveOccurred()) - ipFamily := "IPv4" if loglevel == "debug" { ipFamily += " debug" } - o, err := objLoad(tempObj, bpfFsDir, ipFamily, topts, rules != nil, true) - Expect(err).NotTo(HaveOccurred()) - defer o.Close() - - if topts.ipv6 { - o, err := objLoad(obj+"_v6.o", bpfFsDir, "IPv6", topts, rules != nil, false) - Expect(err).NotTo(HaveOccurred()) - defer o.Close() - } + var o *libbpf.Obj - if err != nil { - logger.Log("Error:", string(err.(*exec.ExitError).Stderr)) - } + o, err = objLoad(tempObj, bpfFsDir, ipFamily, topts, rules != nil, true) Expect(err).NotTo(HaveOccurred()) + defer o.Close() if rules != nil { jmpMap := progMap @@ -385,7 +417,7 @@ func setupAndRun(logger testLogger, loglevel, section string, rules *polprog.Rul } Expect(err).NotTo(HaveOccurred(), "Failed to load rules program.") defer func() { _ = polProgFD.Close() }() - err = 
jumpMapUpdate(polMap, tcdefs.ProgIndexPolicy, int(polProgFD)) + err = jumpMapUpdate(polMap, policyIdx, int(polProgFD)) Expect(err).NotTo(HaveOccurred()) log.WithField("rules", rules).Debug("set policy") } @@ -491,27 +523,35 @@ func bpftool(args ...string) ([]byte, error) { var ( mapInitOnce sync.Once - natMap, natBEMap, ctMap, rtMap, ipsMap, stateMap, testStateMap, progMap, progMapXDP, affinityMap, arpMap, fsafeMap, countersMap, ifstateMap, jumpMap, jumpMapXDP maps.Map - allMaps []maps.Map + natMap, natBEMap, ctMap, rtMap, ipsMap, testStateMap, affinityMap, arpMap, fsafeMap maps.Map + natMapV6, natBEMapV6, ctMapV6, rtMapV6, affinityMapV6, arpMapV6 maps.Map + stateMap, countersMap, ifstateMap, progMap, progMapXDP, jumpMap, jumpMapXDP maps.Map + allMaps []maps.Map ) func initMapsOnce() { mapInitOnce.Do(func() { natMap = nat.FrontendMap() natBEMap = nat.BackendMap() + natMapV6 = nat.FrontendMapV6() + natBEMapV6 = nat.BackendMapV6() ctMap = conntrack.Map() + ctMapV6 = conntrack.MapV6() rtMap = routes.Map() + rtMapV6 = routes.MapV6() ipsMap = ipsets.Map() stateMap = state.Map() testStateMap = state.MapForTest() affinityMap = nat.AffinityMap() + affinityMapV6 = nat.AffinityMapV6() arpMap = arp.Map() + arpMapV6 = arp.MapV6() fsafeMap = failsafes.Map() countersMap = counters.Map() ifstateMap = ifstate.Map() - allMaps = []maps.Map{natMap, natBEMap, ctMap, rtMap, ipsMap, stateMap, testStateMap, - affinityMap, arpMap, fsafeMap, countersMap, ifstateMap} + allMaps = []maps.Map{natMap, natBEMap, natMapV6, natBEMapV6, ctMap, ctMapV6, rtMap, rtMapV6, ipsMap, + stateMap, testStateMap, affinityMap, affinityMapV6, arpMap, arpMapV6, fsafeMap, countersMap, ifstateMap} for _, m := range allMaps { err := m.EnsureExists() if err != nil { @@ -588,9 +628,18 @@ func tcUpdateJumpMap(obj *libbpf.Obj, progs []int, hasPolicyProg, hasHostConflic if !hasPolicyProg { continue } - } - if (idx == tcdefs.ProgIndexHostCtConflict || idx == tcdefs.ProgIndexHostCtConflictDebug) && !hasHostConflictProg { - 
continue + case + tcdefs.ProgIndexV6Icmp, + tcdefs.ProgIndexV6IcmpDebug: + continue // XXX not implemented + case + tcdefs.ProgIndexHostCtConflict, + tcdefs.ProgIndexHostCtConflictDebug, + tcdefs.ProgIndexV6HostCtConflict, + tcdefs.ProgIndexV6HostCtConflictDebug: + if !hasHostConflictProg { + continue + } } log.WithField("prog", tcdefs.ProgramNames[idx]).WithField("idx", idx).Debug("UpdateJumpMap") err := obj.UpdateJumpMap(progMap.GetName(), tcdefs.ProgramNames[idx], idx) @@ -646,6 +695,31 @@ func objLoad(fname, bpfFsDir, ipFamily string, topts testOpts, polProg, hasHostC if err := xdp.ConfigureProgram(m, bpfIfaceName, &globals); err != nil { return nil, err } + } else if topts.ipv6 { + ifaceLog := topts.progLog + "-" + bpfIfaceName + globals := libbpf.TcGlobalData6{ + Tmtu: natTunnelMTU, + VxlanPort: testVxlanPort, + PSNatStart: uint16(topts.psnaStart), + PSNatLen: uint16(topts.psnatEnd-topts.psnaStart) + 1, + Flags: libbpf.GlobalsIPv6Enabled | libbpf.GlobalsNoDSRCidrs, + LogFilterJmp: 0xffffffff, + } + + copy(globals.HostTunnelIP[:], node1tunIPV6.To16()) + copy(globals.HostIP[:], hostIP.To16()) + copy(globals.IntfIP[:], intfIPV6.To16()) + + for i := 0; i < tcdefs.ProgIndexEnd; i++ { + globals.Jumps[i] = uint32(i) + } + + log.WithField("globals", globals).Debugf("configure program") + + if err := tc.ConfigureProgramV6(m, ifaceLog, &globals); err != nil { + return nil, fmt.Errorf("failed to configure tc program: %w", err) + } + log.WithField("program", fname).Debugf("Configured BPF program iface \"%s\"", ifaceLog) } else { ifaceLog := topts.progLog + "-" + bpfIfaceName globals := libbpf.TcGlobalData{ @@ -674,7 +748,11 @@ func objLoad(fname, bpfFsDir, ipFamily string, topts testOpts, polProg, hasHostC continue } pin := "/sys/fs/bpf/tc/globals/" + m.Name() - log.WithField("pin", pin).Debug("Pinning map") + log.WithFields(log.Fields{ + "pin": pin, + "key size": m.KeySize(), + "value size": m.ValueSize(), + }).Debug("Pinning map") cmd := exec.Command("bpftool", "map", 
"show", "pinned", pin) log.WithField("cmd", cmd.String()).Debugf("executing") out, _ := cmd.Output() @@ -694,7 +772,6 @@ func objLoad(fname, bpfFsDir, ipFamily string, topts testOpts, polProg, hasHostC policyIdx := tcdefs.ProgIndexPolicy if strings.HasPrefix(ipFamily, "IPv6") { - progDir += "_v6" policyIdx = tcdefs.ProgIndexV6Policy } @@ -726,7 +803,7 @@ func objLoad(fname, bpfFsDir, ipFamily string, topts testOpts, polProg, hasHostC } if !forXDP { - log.WithField("ipFamily", ipFamily).Debug("Udating jump map") + log.WithField("ipFamily", ipFamily).Debug("Updating jump map") err = tcUpdateJumpMap(obj, tcJumpMapIndexes[ipFamily], false, hasHostConflictProg) if err != nil && !strings.Contains(err.Error(), "error updating calico_tc_host_ct_conflict program") { goto out @@ -760,18 +837,36 @@ func objUTLoad(fname, bpfFsDir, ipFamily string, topts testOpts, polProg, hasHos for m, err := obj.FirstMap(); m != nil && err == nil; m, err = m.NextMap() { if m.IsMapInternal() { ifaceLog := topts.progLog + "-" + bpfIfaceName - globals := libbpf.TcGlobalData{ - HostIP: ipToU32(hostIP), - IntfIP: ipToU32(intfIP), - Tmtu: natTunnelMTU, - VxlanPort: testVxlanPort, - PSNatStart: uint16(topts.psnaStart), - PSNatLen: uint16(topts.psnatEnd-topts.psnaStart) + 1, - Flags: libbpf.GlobalsIPv6Enabled | libbpf.GlobalsNoDSRCidrs, - HostTunnelIP: ipToU32(node1tunIP), - } - if err := tc.ConfigureProgram(m, ifaceLog, &globals); err != nil { - return nil, fmt.Errorf("failed to configure tc program: %w", err) + if topts.ipv6 { + globals := libbpf.TcGlobalData6{ + Tmtu: natTunnelMTU, + VxlanPort: testVxlanPort, + PSNatStart: uint16(topts.psnaStart), + PSNatLen: uint16(topts.psnatEnd-topts.psnaStart) + 1, + Flags: libbpf.GlobalsIPv6Enabled | libbpf.GlobalsNoDSRCidrs, + } + + copy(globals.HostTunnelIP[:], node1tunIPV6.To16()) + copy(globals.HostIP[:], hostIP.To16()) + copy(globals.IntfIP[:], intfIPV6.To16()) + + if err := tc.ConfigureProgramV6(m, ifaceLog, &globals); err != nil { + return nil, 
fmt.Errorf("failed to configure v6 tc program: %w", err) + } + } else { + globals := libbpf.TcGlobalData{ + HostIP: ipToU32(hostIP), + IntfIP: ipToU32(intfIP), + Tmtu: natTunnelMTU, + VxlanPort: testVxlanPort, + PSNatStart: uint16(topts.psnaStart), + PSNatLen: uint16(topts.psnatEnd-topts.psnaStart) + 1, + Flags: libbpf.GlobalsIPv6Enabled | libbpf.GlobalsNoDSRCidrs, + HostTunnelIP: ipToU32(node1tunIP), + } + if err := tc.ConfigureProgram(m, ifaceLog, &globals); err != nil { + return nil, fmt.Errorf("failed to configure tc program: %w", err) + } } break } @@ -783,10 +878,6 @@ func objUTLoad(fname, bpfFsDir, ipFamily string, topts testOpts, polProg, hasHos progDir := bpfFsDir - if ipFamily == "IPv6" { - progDir += "_v6" - } - err = obj.PinPrograms(progDir) if err != nil { obj.Close() @@ -925,7 +1016,12 @@ func runBpfUnitTest(t *testing.T, source string, testFn func(bpfProgRunFn), opts Expect(err).NotTo(HaveOccurred()) defer os.RemoveAll(bpfFsDir) - objFname := "../../bpf-gpl/ut/" + strings.TrimSuffix(source, path.Ext(source)) + ".o" + vExt := "" + if topts.ipv6 { + vExt = "_v6" + } + + objFname := "../../bpf-gpl/ut/" + strings.TrimSuffix(source, path.Ext(source)) + vExt + ".o" obj, err := objUTLoad(objFname, bpfFsDir, "IPv4", topts, true, false) Expect(err).NotTo(HaveOccurred()) @@ -1066,6 +1162,38 @@ func udpResponseRaw(in []byte) []byte { return out.Bytes() } +func udpResponseRawV6(in []byte) []byte { + pkt := gopacket.NewPacket(in, layers.LayerTypeEthernet, gopacket.Default) + ethL := pkt.Layer(layers.LayerTypeEthernet) + ethR := ethL.(*layers.Ethernet) + ethR.SrcMAC, ethR.DstMAC = ethR.DstMAC, ethR.SrcMAC + + ipv6L := pkt.Layer(layers.LayerTypeIPv6) + ipv6R := ipv6L.(*layers.IPv6) + ipv6R.SrcIP, ipv6R.DstIP = ipv6R.DstIP, ipv6R.SrcIP + + lrs := []gopacket.SerializableLayer{ethR, ipv6R} + + if ipv6R.NextHeader == layers.IPProtocolIPv6HopByHop { + l := pkt.Layer(layers.LayerTypeIPv6HopByHop) + lrs = append(lrs, l.(*layers.IPv6HopByHop)) + } + + udpL := 
pkt.Layer(layers.LayerTypeUDP) + udpR := udpL.(*layers.UDP) + udpR.SrcPort, udpR.DstPort = udpR.DstPort, udpR.SrcPort + + _ = udpR.SetNetworkLayerForChecksum(ipv6R) + + lrs = append(lrs, udpR, gopacket.Payload(pkt.ApplicationLayer().Payload())) + + out := gopacket.NewSerializeBuffer() + err := gopacket.SerializeLayers(out, gopacket.SerializeOptions{ComputeChecksums: true}, lrs...) + Expect(err).NotTo(HaveOccurred()) + + return out.Bytes() +} + func tcpResponseRaw(in []byte) []byte { pkt := gopacket.NewPacket(in, layers.LayerTypeEthernet, gopacket.Default) ethL := pkt.Layer(layers.LayerTypeEthernet) @@ -1102,6 +1230,14 @@ func dumpNATMap(natMap maps.Map) { } } +func dumpNATMapV6(natMap maps.Map) { + nt, err := nat.LoadFrontendMapV6(natMap) + Expect(err).NotTo(HaveOccurred()) + for k, v := range nt { + fmt.Printf("%s : %s\n", k, v) + } +} + func resetMap(m maps.Map) { err := m.Iter(func(_, _ []byte) maps.IteratorAction { return maps.IterDelete @@ -1119,16 +1255,36 @@ func dumpCTMap(ctMap maps.Map) { fmt.Printf("\n") } +func dumpCTMapV6(ctMap maps.Map) { + ct, err := conntrack.LoadMapMemV6(ctMap) + Expect(err).NotTo(HaveOccurred()) + fmt.Printf("Conntrack dump:\n") + for k, v := range ct { + fmt.Printf("- %s : %s\n", k, v) + } + fmt.Printf("\n") +} + func resetCTMap(ctMap maps.Map) { resetMap(ctMap) } +func resetCTMapV6(ctMap maps.Map) { + resetMap(ctMap) +} + func saveCTMap(ctMap maps.Map) conntrack.MapMem { ct, err := conntrack.LoadMapMem(ctMap) Expect(err).NotTo(HaveOccurred()) return ct } +func saveCTMapV6(ctMap maps.Map) conntrack.MapMemV6 { + ct, err := conntrack.LoadMapMemV6(ctMap) + Expect(err).NotTo(HaveOccurred()) + return ct +} + func restoreCTMap(ctMap maps.Map, m conntrack.MapMem) { for k, v := range m { err := ctMap.Update(k[:], v[:]) @@ -1136,6 +1292,13 @@ func restoreCTMap(ctMap maps.Map, m conntrack.MapMem) { } } +func restoreCTMapV6(ctMap maps.Map, m conntrack.MapMemV6) { + for k, v := range m { + err := ctMap.Update(k[:], v[:]) + 
Expect(err).NotTo(HaveOccurred()) + } +} + func dumpRTMap(rtMap maps.Map) { rt, err := routes.LoadMap(rtMap) Expect(err).NotTo(HaveOccurred()) @@ -1144,16 +1307,34 @@ func dumpRTMap(rtMap maps.Map) { } } +func dumpRTMapV6(rtMap maps.Map) { + rt, err := routes.LoadMapV6(rtMap) + Expect(err).NotTo(HaveOccurred()) + for k, v := range rt { + fmt.Printf("%15s: %s\n", k.Dest(), v) + } +} + func resetRTMap(rtMap maps.Map) { resetMap(rtMap) } +func resetRTMapV6(rtMap maps.Map) { + resetMap(rtMap) +} + func saveRTMap(rtMap maps.Map) routes.MapMem { rt, err := routes.LoadMap(rtMap) Expect(err).NotTo(HaveOccurred()) return rt } +func saveRTMapV6(rtMap maps.Map) routes.MapMemV6 { + rt, err := routes.LoadMapV6(rtMap) + Expect(err).NotTo(HaveOccurred()) + return rt +} + func restoreRTMap(rtMap maps.Map, m routes.MapMem) { for k, v := range m { err := rtMap.Update(k[:], v[:]) @@ -1161,6 +1342,13 @@ func restoreRTMap(rtMap maps.Map, m routes.MapMem) { } } +func restoreRTMapV6(rtMap maps.Map, m routes.MapMemV6) { + for k, v := range m { + err := rtMap.Update(k[:], v[:]) + Expect(err).NotTo(HaveOccurred()) + } +} + func dumpARPMap(arpMap maps.Map) { ct, err := arp.LoadMapMem(arpMap) Expect(err).NotTo(HaveOccurred()) @@ -1171,8 +1359,24 @@ func dumpARPMap(arpMap maps.Map) { fmt.Printf("\n") } -func saveARPMap(ctMap maps.Map) arp.MapMem { - m, err := arp.LoadMapMem(arpMap) +func dumpARPMapV6(arpMap maps.Map) { + ct, err := arp.LoadMapMemV6(arpMap) + Expect(err).NotTo(HaveOccurred()) + fmt.Printf("ARP dump:\n") + for k, v := range ct { + fmt.Printf("- %s : %s\n", k, v) + } + fmt.Printf("\n") +} + +func saveARPMap(am maps.Map) arp.MapMem { + m, err := arp.LoadMapMem(am) + Expect(err).NotTo(HaveOccurred()) + return m +} + +func saveARPMapV6(am maps.Map) arp.MapMemV6 { + m, err := arp.LoadMapMemV6(am) Expect(err).NotTo(HaveOccurred()) return m } @@ -1202,6 +1406,8 @@ var ipv4Default = &layers.IPv4{ var srcIPv6 = net.IP([]byte{0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}) var dstIPv6 
= net.IP([]byte{0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2}) +var srcV6CIDR = ip.CIDRFromNetIP(srcIPv6).(ip.V6CIDR) +var dstV6CIDR = ip.CIDRFromNetIP(dstIPv6).(ip.V6CIDR) var ipv6Default = &layers.IPv6{ Version: 6, @@ -1216,24 +1422,53 @@ var udpDefault = &layers.UDP{ DstPort: 5678, } -func testPacket(eth *layers.Ethernet, l3 gopacket.Layer, l4 gopacket.Layer, payload []byte) ( - *layers.Ethernet, *layers.IPv4, gopacket.Layer, []byte, []byte, error) { +func testPacket(family int, eth *layers.Ethernet, l3 gopacket.Layer, l4 gopacket.Layer, + payload []byte, ipv6ext ...gopacket.SerializableLayer) ( + *layers.Ethernet, gopacket.Layer, gopacket.Layer, []byte, []byte, error) { pkt := Packet{ + family: family, eth: eth, l3: l3, l4: l4, payload: payload, + ipv6ext: ipv6ext, } err := pkt.Generate() + if err != nil { + return nil, nil, nil, nil, nil, err + } p := gopacket.NewPacket(pkt.bytes, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("p = %+v\n", p) e := p.Layer(layers.LayerTypeEthernet).(*layers.Ethernet) - ip := p.Layer(layers.LayerTypeIPv4).(*layers.IPv4) + + var ( + ipl gopacket.Layer + proto layers.IPProtocol + ) + + ipv4L := p.Layer(layers.LayerTypeIPv4) + if ipv4L != nil { + ipv4 := ipv4L.(*layers.IPv4) + proto = ipv4.Protocol + ipl = ipv4L + } else { + ipv6L := p.Layer(layers.LayerTypeIPv6) + if ipv6L != nil { + ipv6 := ipv6L.(*layers.IPv6) + proto = ipv6.NextHeader + } + if proto == layers.IPProtocolIPv6HopByHop { + l := p.Layer(layers.LayerTypeIPv6HopByHop) + proto = l.(*layers.IPv6HopByHop).NextHeader + } + ipl = ipv6L + } var l gopacket.Layer - switch ip.Protocol { + switch proto { case layers.IPProtocolUDP: l = p.Layer(layers.LayerTypeUDP) case layers.IPProtocolTCP: @@ -1242,10 +1477,23 @@ func testPacket(eth *layers.Ethernet, l3 gopacket.Layer, l4 gopacket.Layer, payl l = p.Layer(layers.LayerTypeICMPv4) } - return e, ip, l, pkt.payload, pkt.bytes, err + return e, ipl, l, pkt.payload, pkt.bytes, err +} + +func testPacketV4(eth 
*layers.Ethernet, ipv4 *layers.IPv4, l4 gopacket.Layer, payload []byte) ( + *layers.Ethernet, *layers.IPv4, gopacket.Layer, []byte, []byte, error) { + e, ip4, l4, p, b, err := testPacket(4, eth, ipv4, l4, payload) + return e, ip4.(*layers.IPv4), l4, p, b, err +} + +func testPacketV6(eth *layers.Ethernet, ipv6 *layers.IPv6, l4 gopacket.Layer, payload []byte, ipv6ext ...gopacket.SerializableLayer) ( + *layers.Ethernet, *layers.IPv6, gopacket.Layer, []byte, []byte, error) { + e, ip6, l4, p, b, err := testPacket(6, eth, ipv6, l4, payload, ipv6ext...) + return e, ip6.(*layers.IPv6), l4, p, b, err } type Packet struct { + family int eth *layers.Ethernet l3 gopacket.Layer ipv4 *layers.IPv4 @@ -1261,6 +1509,7 @@ type Packet struct { length int l4Protocol layers.IPProtocol l3Protocol layers.EthernetType + ipv6ext []gopacket.SerializableLayer } func (pkt *Packet) handlePayload() { @@ -1304,9 +1553,64 @@ func (pkt *Packet) handleL4() error { return nil } +func (pkt *Packet) handleIPv6Ext() error { + exts := gopacket.NewSerializeBuffer() + err := gopacket.SerializeLayers(exts, gopacket.SerializeOptions{FixLengths: true}, pkt.ipv6ext...) 
+ if err != nil { + return err + } + + pkt.length += len(exts.Bytes()) + + return nil +} + +func nextHdrIPProto(nh gopacket.Layer) layers.IPProtocol { + switch nh.(type) { + case *layers.IPv6HopByHop: + return layers.IPProtocolIPv6HopByHop + case *layers.ICMPv4: + return layers.IPProtocolICMPv4 + case *layers.IGMP: + return layers.IPProtocolIGMP + case *layers.IPv4: + return layers.IPProtocolIPv4 + case *layers.TCP: + return layers.IPProtocolTCP + case *layers.UDP: + return layers.IPProtocolUDP + case *layers.RUDP: + return layers.IPProtocolRUDP + case *layers.IPv6: + return layers.IPProtocolIPv6 + case *layers.IPv6Routing: + return layers.IPProtocolIPv6Routing + case *layers.IPv6Fragment: + return layers.IPProtocolIPv6Fragment + case *layers.GRE: + return layers.IPProtocolGRE + case *layers.ICMPv6: + return layers.IPProtocolICMPv6 + case *layers.IPv6Destination: + return layers.IPProtocolIPv6Destination + case *layers.EtherIP: + return layers.IPProtocolEtherIP + case *layers.SCTP: + return layers.IPProtocolSCTP + case *layers.UDPLite: + return layers.IPProtocolUDPLite + } + + panic("unknown next layer") +} + func (pkt *Packet) handleL3() error { - if pkt.l3 == nil { - pkt.l3 = ipv4Default + if reflect.ValueOf(pkt.l3).IsNil() { + if pkt.family == 4 { + pkt.l3 = ipv4Default + } else { + pkt.l3 = ipv6Default + } } switch v := pkt.l3.(type) { @@ -1320,7 +1624,18 @@ func (pkt *Packet) handleL3() error { case *layers.IPv6: pkt.ipv6 = v pkt.l3Protocol = layers.EthernetTypeIPv6 - pkt.ipv6.NextHeader = pkt.l4Protocol + if len(pkt.ipv6ext) > 0 { + if err := pkt.handleIPv6Ext(); err != nil { + return fmt.Errorf("handling ipv6 extensions: %w", err) + } + pkt.ipv6.NextHeader = nextHdrIPProto(pkt.ipv6ext[0].(gopacket.Layer)) + for i := len(pkt.ipv6ext); i > 0; i-- { + pkt.layers = append(pkt.layers, pkt.ipv6ext[i-1]) + } + } else { + pkt.ipv6.NextHeader = pkt.l4Protocol + } + pkt.length += 40 pkt.ipv6.Length = uint16(pkt.length) pkt.layers = append(pkt.layers, pkt.ipv6) 
default: @@ -1401,7 +1716,9 @@ func testPacketUDPDefault() (*layers.Ethernet, *layers.IPv4, gopacket.Layer, []b OptionData: []byte{0xde, 0xad, 0xbe, 0xef}, }} ip.IHL += 2 - return testPacket(nil, &ip, nil, nil) + + e, ip4, l4, p, b, err := testPacket(4, nil, &ip, nil, nil) + return e, ip4.(*layers.IPv4), l4, p, b, err } func testPacketUDPDefaultNP(destIP net.IP) (*layers.Ethernet, *layers.IPv4, gopacket.Layer, []byte, []byte, error) { @@ -1418,7 +1735,42 @@ func testPacketUDPDefaultNP(destIP net.IP) (*layers.Ethernet, *layers.IPv4, gopa }} ip.IHL += 2 - return testPacket(nil, &ip, nil, nil) + e, ip4, l4, p, b, err := testPacket(4, nil, &ip, nil, nil) + return e, ip4.(*layers.IPv4), l4, p, b, err +} + +func ipv6HopByHopExt() gopacket.SerializableLayer { + hop := &layers.IPv6HopByHop{} + hop.NextHeader = layers.IPProtocolUDP + + /* from gopacket ip6_test.go */ + tlv := &layers.IPv6HopByHopOption{} + tlv.OptionType = 0x01 //PadN + tlv.OptionData = []byte{0x00, 0x00, 0x00, 0x00} + hop.Options = append(hop.Options, tlv) + + return hop +} + +func testPacketUDPDefaultNPV6(destIP net.IP) (*layers.Ethernet, *layers.IPv6, gopacket.Layer, []byte, []byte, error) { + if destIP == nil { + return testPacketV6(nil, nil, nil, nil) + } + + ip := *ipv6Default + ip.DstIP = destIP + + hop := &layers.IPv6HopByHop{} + hop.NextHeader = layers.IPProtocolUDP + + /* from gopacket ip6_test.go */ + tlv := &layers.IPv6HopByHopOption{} + tlv.OptionType = 0x01 //PadN + tlv.OptionData = []byte{0x00, 0x00, 0x00, 0x00} + hop.Options = append(hop.Options, tlv) + + e, ip6, l4, p, b, err := testPacketV6(nil, &ip, nil, nil, hop) + return e, ip6, l4, p, b, err } func resetBPFMaps() { diff --git a/felix/bpf/ut/failsafes_test.go b/felix/bpf/ut/failsafes_test.go index d18563b53d9..aff43b8b641 100644 --- a/felix/bpf/ut/failsafes_test.go +++ b/felix/bpf/ut/failsafes_test.go @@ -217,7 +217,7 @@ func TestFailsafes(t *testing.T) { for _, test := range failsafeTests { t.Run(test.Description, func(t *testing.T) { 
- _, _, _, _, pktBytes, err := testPacket(nil, test.IPHeaderIPv4, test.IPHeaderUDP, nil) + _, _, _, _, pktBytes, err := testPacketV4(nil, test.IPHeaderIPv4, test.IPHeaderUDP, nil) Expect(err).NotTo(HaveOccurred()) prog := "calico_from_host_ep" diff --git a/felix/bpf/ut/filter_test.go b/felix/bpf/ut/filter_test.go index b4988639069..fc502fbb713 100644 --- a/felix/bpf/ut/filter_test.go +++ b/felix/bpf/ut/filter_test.go @@ -29,7 +29,7 @@ import ( func TestFilter(t *testing.T) { RegisterTestingT(t) - _, _, _, _, bytes, _ := testPacket( + _, _, _, _, bytes, _ := testPacketV4( &layers.Ethernet{ SrcMAC: []byte{0, 0, 0, 0, 0, 1}, DstMAC: []byte{0, 0, 0, 0, 0, 2}, diff --git a/felix/bpf/ut/icmp_port_unreachable_test.go b/felix/bpf/ut/icmp_port_unreachable_test.go index 71f198cafec..71725666220 100644 --- a/felix/bpf/ut/icmp_port_unreachable_test.go +++ b/felix/bpf/ut/icmp_port_unreachable_test.go @@ -37,7 +37,7 @@ func TestICMPPortUnreachable(t *testing.T) { }} ipHdr.IHL += 2 - _, ipv4, _, _, pktBytes, err := testPacket(nil, &ipHdr, nil, nil) + _, ipv4, _, _, pktBytes, err := testPacketV4(nil, &ipHdr, nil, nil) Expect(err).NotTo(HaveOccurred()) runBpfUnitTest(t, "icmp_port_unreachable.c", func(bpfrun bpfProgRunFn) { @@ -60,7 +60,7 @@ func TestNATNoBackendFromHEP(t *testing.T) { iphdr := *ipv4Default - _, ipv4, l4, _, pktBytes, err := testPacket(nil, &iphdr, nil, nil) + _, ipv4, l4, _, pktBytes, err := testPacketV4(nil, &iphdr, nil, nil) Expect(err).NotTo(HaveOccurred()) udp := l4.(*layers.UDP) diff --git a/felix/bpf/ut/icmp_too_big_test.go b/felix/bpf/ut/icmp_too_big_test.go index 9838ece2d68..bde1fccb9cf 100644 --- a/felix/bpf/ut/icmp_too_big_test.go +++ b/felix/bpf/ut/icmp_too_big_test.go @@ -62,7 +62,7 @@ func TestICMPTooBigIPOptions(t *testing.T) { }}, } - _, ipv4, l4, _, pktBytes, err := testPacket(nil, ipv4, nil, nil) + _, ipv4, l4, _, pktBytes, err := testPacketV4(nil, ipv4, nil, nil) Expect(err).NotTo(HaveOccurred()) udp := l4.(*layers.UDP) diff --git 
a/felix/bpf/ut/icmp_ttl_exceeded_test.go b/felix/bpf/ut/icmp_ttl_exceeded_test.go index 47b97043691..e10524295a6 100644 --- a/felix/bpf/ut/icmp_ttl_exceeded_test.go +++ b/felix/bpf/ut/icmp_ttl_exceeded_test.go @@ -55,7 +55,7 @@ func TestICMPttlExceededFromHEP(t *testing.T) { iphdr := *ipv4Default iphdr.TTL = 1 - _, ipv4, l4, _, pktBytes, err := testPacket(nil, &iphdr, nil, nil) + _, ipv4, l4, _, pktBytes, err := testPacketV4(nil, &iphdr, nil, nil) Expect(err).NotTo(HaveOccurred()) udp := l4.(*layers.UDP) diff --git a/felix/bpf/ut/ip_dec_ttl_test.go b/felix/bpf/ut/ip_dec_ttl_test.go index 9e88fbc22e1..fe96ff89102 100644 --- a/felix/bpf/ut/ip_dec_ttl_test.go +++ b/felix/bpf/ut/ip_dec_ttl_test.go @@ -29,7 +29,7 @@ func TestIpDecTTL(t *testing.T) { runBpfUnitTest(t, "ip_dec_ttl.c", func(bpfrun bpfProgRunFn) { ip36 := *ipv4Default ip36.TTL = 36 - _, _, _, _, pktBytes, err := testPacket(nil, &ip36, nil, nil) + _, _, _, _, pktBytes, err := testPacketV4(nil, &ip36, nil, nil) Expect(err).NotTo(HaveOccurred()) res, err := bpfrun(pktBytes) @@ -43,7 +43,7 @@ func TestIpDecTTL(t *testing.T) { ip35 := *ipv4Default ip35.TTL = 35 - _, _, _, _, pktBytes, err = testPacket(nil, &ip35, nil, nil) + _, _, _, _, pktBytes, err = testPacketV4(nil, &ip35, nil, nil) Expect(err).NotTo(HaveOccurred()) Expect(res.dataOut).To(Equal(pktBytes)) diff --git a/felix/bpf/ut/ip_options_test.go b/felix/bpf/ut/ip_options_test.go index 30a710d4a3f..6ed82b4637b 100644 --- a/felix/bpf/ut/ip_options_test.go +++ b/felix/bpf/ut/ip_options_test.go @@ -28,7 +28,7 @@ func TestMalformedIP(t *testing.T) { iphdr := *ipv4Default iphdr.IHL = 4 - _, _, _, _, pktBytes, err := testPacket(nil, &iphdr, nil, nil) + _, _, _, _, pktBytes, err := testPacketV4(nil, &iphdr, nil, nil) Expect(err).NotTo(HaveOccurred()) skbMark = 0 diff --git a/felix/bpf/ut/ip_parse_test.go b/felix/bpf/ut/ip_parse_test.go new file mode 100644 index 00000000000..2d12ee4be3f --- /dev/null +++ b/felix/bpf/ut/ip_parse_test.go @@ -0,0 +1,78 @@ +// 
Copyright (c) 2023 Tigera, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ut_test + +import ( + "testing" + + "github.com/google/gopacket/layers" + . "github.com/onsi/gomega" +) + +func TestIPv4Parse(t *testing.T) { + RegisterTestingT(t) + + ipHdr := *ipv4Default + ipHdr.Options = []layers.IPv4Option{{ + OptionType: 123, + OptionLength: 6, + OptionData: []byte{0xde, 0xad, 0xbe, 0xef}, + }} + ipHdr.IHL += 2 + + _, _, _, _, pktBytes, err := testPacketV4(nil, &ipHdr, nil, nil) + Expect(err).NotTo(HaveOccurred()) + + runBpfUnitTest(t, "ip_parse_test.c", func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(4)) + }) +} + +func TestIPv6Parse(t *testing.T) { + RegisterTestingT(t) + + _, _, _, _, pktBytes, err := testPacketV6(nil, ipv6Default, nil, nil) + Expect(err).NotTo(HaveOccurred()) + + runBpfUnitTest(t, "ip_parse_test.c", func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(6)) + }, withIPv6()) +} + +func TestIPv6ParseOptsOne(t *testing.T) { + RegisterTestingT(t) + + hop := &layers.IPv6HopByHop{} + hop.NextHeader = layers.IPProtocolUDP + + /* from gopacket ip6_test.go */ + tlv := &layers.IPv6HopByHopOption{} + tlv.OptionType = 0x01 //PadN + tlv.OptionData = []byte{0x00, 0x00, 0x00, 0x00} + hop.Options = append(hop.Options, tlv) + + _, _, _, _, pktBytes, err := testPacketV6(nil, 
ipv6Default, nil, nil, hop) + Expect(err).NotTo(HaveOccurred()) + + runBpfUnitTest(t, "ip_parse_test.c", func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(6)) + }, withIPv6()) +} diff --git a/felix/bpf/ut/ipv4_opts_test.go b/felix/bpf/ut/ipv4_opts_test.go index a77e2f81bf5..5a7121e2424 100644 --- a/felix/bpf/ut/ipv4_opts_test.go +++ b/felix/bpf/ut/ipv4_opts_test.go @@ -39,7 +39,7 @@ func TestIPv4Opts(t *testing.T) { }} ipHdr.IHL += 2 - _, ipv4, l4, payload, pktBytes, err := testPacket(nil, &ipHdr, nil, nil) + _, ipv4, l4, payload, pktBytes, err := testPacketV4(nil, &ipHdr, nil, nil) Expect(err).NotTo(HaveOccurred()) udp := l4.(*layers.UDP) diff --git a/felix/bpf/ut/nat_encap_test.go b/felix/bpf/ut/nat_encap_test.go index e84e994444d..9da83abd767 100644 --- a/felix/bpf/ut/nat_encap_test.go +++ b/felix/bpf/ut/nat_encap_test.go @@ -34,7 +34,7 @@ func TestNatEncap(t *testing.T) { }} ipHdr.IHL += 2 - _, ipv4, l4, payload, pktBytes, err := testPacket(nil, &ipHdr, nil, nil) + _, ipv4, l4, payload, pktBytes, err := testPacketV4(nil, &ipHdr, nil, nil) Expect(err).NotTo(HaveOccurred()) udp := l4.(*layers.UDP) @@ -76,23 +76,32 @@ func TestNatEncap(t *testing.T) { }) } -func checkVxlanEncap(pktR gopacket.Packet, NATed bool, ipv4 *layers.IPv4, +func checkVxlanEncap(pktR gopacket.Packet, NATed bool, iphdr gopacket.Layer, transport gopacket.Layer, payload []byte) { inner := checkVxlan(pktR) - checkInnerIP(inner, NATed, ipv4, transport, payload) + checkInnerIP(inner, NATed, iphdr, transport, payload) } func checkVxlan(pktR gopacket.Packet) gopacket.Packet { + ipType := layers.LayerTypeIPv4 + ethType := layers.EthernetTypeIPv4 + ipv4L := pktR.Layer(layers.LayerTypeIPv4) - Expect(ipv4L).NotTo(BeNil()) - ipv4R := ipv4L.(*layers.IPv4) + if ipv4L != nil { + ipv4R := ipv4L.(*layers.IPv4) - ipv4CSum := ipv4R.Checksum - iptmp := gopacket.NewSerializeBuffer() - err := ipv4R.SerializeTo(iptmp, 
gopacket.SerializeOptions{ComputeChecksums: true}) // recompute csum - Expect(err).NotTo(HaveOccurred()) - Expect(ipv4CSum).To(Equal(ipv4R.Checksum)) + ipv4CSum := ipv4R.Checksum + iptmp := gopacket.NewSerializeBuffer() + err := ipv4R.SerializeTo(iptmp, gopacket.SerializeOptions{ComputeChecksums: true}) // recompute csum + Expect(err).NotTo(HaveOccurred()) + Expect(ipv4CSum).To(Equal(ipv4R.Checksum)) + } else { + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + ipType = layers.LayerTypeIPv6 + ethType = layers.EthernetTypeIPv6 + } udpL := pktR.Layer(layers.LayerTypeUDP) Expect(udpL).NotTo(BeNil()) @@ -115,10 +124,10 @@ func checkVxlan(pktR gopacket.Packet) gopacket.Packet { &layers.Ethernet{ SrcMAC: []byte{0, 0, 0, 0, 0, 0}, DstMAC: []byte{0, 0, 0, 0, 0, 0}, - EthernetType: layers.EthernetTypeIPv4, + EthernetType: ethType, })) - return gopacket.NewPacket(ethL.LayerPayload(), layers.LayerTypeIPv4, gopacket.Default) + return gopacket.NewPacket(ethL.LayerPayload(), ipType, gopacket.Default) } func encapedResponse(pktR gopacket.Packet) []byte { @@ -158,7 +167,10 @@ func encapedResponse(pktR gopacket.Packet) []byte { func getVxlanVNI(pktR gopacket.Packet) uint32 { ipv4L := pktR.Layer(layers.LayerTypeIPv4) - Expect(ipv4L).NotTo(BeNil()) + if ipv4L == nil { + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + } udpL := pktR.Layer(layers.LayerTypeUDP) Expect(udpL).NotTo(BeNil()) @@ -178,17 +190,32 @@ func getVxlanVNI(pktR gopacket.Packet) uint32 { return vxlanL.(*layers.VXLAN).VNI } -func checkInnerIP(ip gopacket.Packet, NATed bool, ipv4 *layers.IPv4, +func checkInnerIP(ip gopacket.Packet, NATed bool, iphdr gopacket.Layer, transport gopacket.Layer, payload []byte) { - ipv4L := ip.Layer(layers.LayerTypeIPv4) - Expect(ipv4L).NotTo(BeNil()) - if NATed { - Expect(ipv4L).To(layersMatchFields(ipv4, "Checksum", "TTL", "Options", "Padding")) - } else { - Expect(ipv4L).To(layersMatchFields(ipv4, "DstIP", "Checksum", "TTL", "Options", 
"Padding")) - } - Expect(ipv4L.(*layers.IPv4).TTL).To(Equal(ipv4.TTL - 1)) + switch t := iphdr.(type) { + case *layers.IPv4: + ipv4L := ip.Layer(layers.LayerTypeIPv4) + Expect(ipv4L).NotTo(BeNil()) + if NATed { + Expect(ipv4L).To(layersMatchFields(iphdr, "Checksum", "TTL", "Options", "Padding")) + } else { + Expect(ipv4L).To(layersMatchFields(iphdr, "DstIP", "Checksum", "TTL", "Options", "Padding")) + } + + Expect(ipv4L.(*layers.IPv4).TTL).To(Equal(t.TTL - 1)) + case *layers.IPv6: + ipv6L := ip.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + if NATed { + Expect(ipv6L).To(layersMatchFields(iphdr, "HopLimit", "HopByHop")) + } else { + Expect(ipv6L).To(layersMatchFields(iphdr, "DstIP", "HopLimit", "HopByHop")) + } + Expect(ipv6L.(*layers.IPv6).HopLimit).To(Equal(t.HopLimit - 1)) + default: + panic("xxx") + } transportL := ip.Layer(transport.LayerType()) Expect(transportL).NotTo(BeNil()) diff --git a/felix/bpf/ut/nat_test.go b/felix/bpf/ut/nat_test.go index 5d04ab70726..55658a6b88e 100644 --- a/felix/bpf/ut/nat_test.go +++ b/felix/bpf/ut/nat_test.go @@ -67,6 +67,7 @@ func TestNATPodPodXNode(t *testing.T) { nat.NewNATBackendValue(natIP, natPort).AsBytes(), ) Expect(err).NotTo(HaveOccurred()) + dumpNATMap(natMap) ctMap := conntrack.Map() err = ctMap.EnsureExists() @@ -101,7 +102,7 @@ func TestNATPodPodXNode(t *testing.T) { udpNat.DstPort = layers.UDPPort(natPort) // created the expected packet after NAT, with recalculated csums - _, _, _, _, resPktBytes, err := testPacket(eth, &ipv4Nat, &udpNat, payload) + _, _, _, _, resPktBytes, err := testPacketV4(eth, &ipv4Nat, &udpNat, payload) Expect(err).NotTo(HaveOccurred()) // expect them to be the same @@ -135,7 +136,7 @@ func TestNATPodPodXNode(t *testing.T) { udpNat.DstPort = layers.UDPPort(natPort) // created the expected packet after NAT, with recalculated csums - _, _, _, _, resPktBytes, err := testPacket(eth, &ipv4Nat, &udpNat, payload) + _, _, _, _, resPktBytes, err := testPacketV4(eth, &ipv4Nat, &udpNat, 
payload) Expect(err).NotTo(HaveOccurred()) // expect them to be the same @@ -790,7 +791,7 @@ func TestNATNodePort(t *testing.T) { /* * TEST that unknown VNI is passed through */ - testUnrelatedVXLAN(t, node2ip, vni) + testUnrelatedVXLAN(4, t, node2ip, vni) // TEST host-networked backend { @@ -1294,17 +1295,32 @@ func TestNATNodePortMultiNIC(t *testing.T) { dumpCTMap(ctMap) } -func testUnrelatedVXLAN(t *testing.T, nodeIP net.IP, vni uint32) { +func testUnrelatedVXLAN(ipver int, t *testing.T, nodeIP net.IP, vni uint32) { vxlanTest := func(fillUDPCsum bool, validVNI bool) { + var opts []testOption + var iphdr gopacket.SerializableLayer + eth := ethDefault - ipv4 := &layers.IPv4{ - Version: 4, - IHL: 5, - TTL: 64, - Flags: layers.IPv4DontFragment, - SrcIP: net.IPv4(1, 2, 3, 4), - DstIP: nodeIP, - Protocol: layers.IPProtocolUDP, + + if ipver == 4 { + iphdr = &layers.IPv4{ + Version: 4, + IHL: 5, + TTL: 64, + Flags: layers.IPv4DontFragment, + SrcIP: net.IPv4(1, 2, 3, 4), + DstIP: nodeIP, + Protocol: layers.IPProtocolUDP, + } + } else { + iphdr = &layers.IPv6{ + Version: 6, + HopLimit: 64, + SrcIP: net.ParseIP("abcd:ef12::ffff:0102:0304"), + DstIP: nodeIP, + NextHeader: layers.IPProtocolUDP, + } + opts = append(opts, withIPv6()) } udp := &layers.UDP{ @@ -1320,11 +1336,11 @@ func testUnrelatedVXLAN(t *testing.T, nodeIP net.IP, vni uint32) { payload := make([]byte, 64) udp.Length = uint16(8 + 8 + len(payload)) - _ = udp.SetNetworkLayerForChecksum(ipv4) + _ = udp.SetNetworkLayerForChecksum(iphdr.(gopacket.NetworkLayer)) pkt := gopacket.NewSerializeBuffer() err := gopacket.SerializeLayers(pkt, gopacket.SerializeOptions{ComputeChecksums: true}, - eth, ipv4, udp, vxlan, gopacket.Payload(payload)) + eth, iphdr, udp, vxlan, gopacket.Payload(payload)) Expect(err).NotTo(HaveOccurred()) pktBytes := pkt.Bytes() @@ -1338,7 +1354,7 @@ func testUnrelatedVXLAN(t *testing.T, nodeIP net.IP, vni uint32) { fmt.Printf("pktR = %+v\n", pktR) Expect(res.dataOut).To(Equal(pktBytes)) - }) + }, 
opts...) } hostIP = nodeIP @@ -1350,7 +1366,7 @@ func testUnrelatedVXLAN(t *testing.T, nodeIP net.IP, vni uint32) { func TestNATNodePortICMPTooBig(t *testing.T) { RegisterTestingT(t) - _, ipv4, l4, _, pktBytes, err := testPacket(nil, nil, nil, make([]byte, natTunnelMTU)) + _, ipv4, l4, _, pktBytes, err := testPacketV4(nil, nil, nil, make([]byte, natTunnelMTU)) Expect(err).NotTo(HaveOccurred()) udp := l4.(*layers.UDP) @@ -1435,7 +1451,7 @@ func TestNormalSYNRetryForcePolicy(t *testing.T) { DataOffset: 5, } - _, ipv4, _, _, synPkt, err := testPacket(nil, nil, tcpSyn, nil) + _, ipv4, _, _, synPkt, err := testPacketV4(nil, nil, tcpSyn, nil) Expect(err).NotTo(HaveOccurred()) // Insert a reverse route for the source workload. @@ -1539,7 +1555,7 @@ func TestNATSYNRetryGoesToSameBackend(t *testing.T) { DataOffset: 5, } - _, ipv4, _, _, synPkt, err := testPacket(nil, nil, tcpSyn, nil) + _, ipv4, _, _, synPkt, err := testPacketV4(nil, nil, tcpSyn, nil) Expect(err).NotTo(HaveOccurred()) err = natMap.Update( @@ -1589,7 +1605,7 @@ func TestNATSYNRetryGoesToSameBackend(t *testing.T) { seenOtherIP := false for attempt := 0; attempt < 100; attempt++ { tcpSyn.SrcPort++ - _, _, _, _, synPkt, err := testPacket(nil, nil, tcpSyn, nil) + _, _, _, _, synPkt, err := testPacketV4(nil, nil, tcpSyn, nil) Expect(err).NotTo(HaveOccurred()) res, err := bpfrun(synPkt) Expect(err).NotTo(HaveOccurred()) @@ -1610,7 +1626,7 @@ func TestNATSYNRetryGoesToSameBackend(t *testing.T) { // Change back to the original SYN packet so that we can test the new policy // with an existing CT entry. 
tcpSyn.SrcPort = origTCPSrcPort - _, _, _, _, synPkt, err = testPacket(nil, nil, tcpSyn, nil) + _, _, _, _, synPkt, err = testPacketV4(nil, nil, tcpSyn, nil) Expect(err).NotTo(HaveOccurred()) bpfIfaceName = "SYNP" @@ -2255,7 +2271,7 @@ func TestNATSourceCollision(t *testing.T) { var recvPkt []byte - _, _, _, _, pktBytes, _ := testPacket(nil, pktIPHdr, pktTCPHdr, + _, _, _, _, pktBytes, _ := testPacketV4(nil, pktIPHdr, pktTCPHdr, []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11, 22, 33, 44, 55, 66, 77, 88, 99, 0}) skbMark = 0 @@ -2356,7 +2372,7 @@ func TestNATSourceCollision(t *testing.T) { pktTCPHdr.ACK = true pktTCPHdr.Seq = 1 - _, _, _, _, pktBytes, _ = testPacket(nil, pktIPHdr, pktTCPHdr, + _, _, _, _, pktBytes, _ = testPacketV4(nil, pktIPHdr, pktTCPHdr, []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11, 22, 33, 44, 55, 66, 77, 88, 99, 0}) dumpCTMap(ctMap) @@ -2414,7 +2430,7 @@ func TestNATSourceCollision(t *testing.T) { DataOffset: 5, } - _, _, _, _, pktBytes, _ = testPacket(nil, pktIPHdr, pktTCPHdr, + _, _, _, _, pktBytes, _ = testPacketV4(nil, pktIPHdr, pktTCPHdr, []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11, 22, 33, 44, 55, 66, 77, 88, 99, 0}) skbMark = 0 @@ -2605,3 +2621,863 @@ func TestNATHostRemoteNPLocalPod(t *testing.T) { dumpCTMap(ctMap) } + +func TestNATPodPodXNodeV6(t *testing.T) { + RegisterTestingT(t) + + bpfIfaceName = "NAT1" + defer func() { bpfIfaceName = "" }() + + eth, ipv6, l4, payload, pktBytes, err := testPacketUDPDefaultNPV6(node1ipV6) + Expect(err).NotTo(HaveOccurred()) + udp := l4.(*layers.UDP) + + err = natMapV6.Update( + nat.NewNATKeyV6(ipv6.DstIP, uint16(udp.DstPort), uint8(17)).AsBytes(), + nat.NewNATValueV6(0, 1, 0, 0).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + natIP := net.ParseIP("abcd::ffff:0808:0808") + natPort := uint16(666) + + err = natBEMapV6.Update( + nat.NewNATBackendKeyV6(0, 0).AsBytes(), + nat.NewNATBackendValueV6(natIP, natPort).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + resetCTMapV6(ctMapV6) // ensure it is clean + + 
var natedPkt []byte + + hostIP = node1ipV6 + + // Insert a reverse route for the source workload that is not in a calico + // pool, for example 3rd party CNI is used. + rtKey := routes.NewKeyV6(srcV6CIDR).AsBytes() + rtVal := routes.NewValueV6WithIfIndex(routes.FlagsLocalWorkload, 1).AsBytes() + err = rtMapV6.Update(rtKey, rtVal) + Expect(err).NotTo(HaveOccurred()) + dumpRTMapV6(rtMapV6) + dumpNATMapV6(natMapV6) + + skbMark = 0 + // Leaving workload + runBpfTest(t, "calico_from_workload_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ipv6Nat := *ipv6Default + ipv6Nat.DstIP = natIP + + udpNat := *udp + udpNat.DstPort = layers.UDPPort(natPort) + + // created the expected packet after NAT, with recalculated csums + _, _, _, _, resPktBytes, err := testPacketV6(eth, &ipv6Nat, &udpNat, payload, ipv6HopByHopExt()) + Expect(err).NotTo(HaveOccurred()) + + // expect them to be the same + Expect(res.dataOut).To(Equal(resPktBytes)) + + natedPkt = res.dataOut + }, withIPv6()) + expectMark(tcdefs.MarkSeenSkipFIB) + + resetCTMapV6(ctMapV6) + + // Insert a reverse route for the source workload that is in pool. 
+ rtVal = routes.NewValueV6WithIfIndex(routes.FlagsLocalWorkload|routes.FlagInIPAMPool, 1).AsBytes() + err = rtMapV6.Update(rtKey, rtVal) + Expect(err).NotTo(HaveOccurred()) + + skbMark = 0 + // Leaving workload + runBpfTest(t, "calico_from_workload_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ipv6Nat := *ipv6Default + ipv6Nat.DstIP = natIP + + udpNat := *udp + udpNat.DstPort = layers.UDPPort(natPort) + + // created the expected packet after NAT, with recalculated csums + _, _, _, _, resPktBytes, err := testPacketV6(eth, &ipv6Nat, &udpNat, payload, ipv6HopByHopExt()) + Expect(err).NotTo(HaveOccurred()) + + // expect them to be the same + Expect(res.dataOut).To(Equal(resPktBytes)) + + natedPkt = res.dataOut + }, withIPv6()) + + // Leaving node 1 + expectMark(tcdefs.MarkSeen) + + runBpfTest(t, "calico_to_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(natedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + Expect(res.dataOut).To(Equal(natedPkt)) + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + fromHostCT := saveCTMapV6(ctMapV6) + resetCTMapV6(ctMapV6) + + var recvPkt []byte + + hostIP = node2ipV6 + + skbMark = 0 + + // Insert the reverse route for backend for RPF check. 
+ resetRTMapV6(rtMapV6) + beV6CIDR := ip.CIDRFromNetIP(natIP).(ip.V6CIDR) + bertKey := routes.NewKeyV6(beV6CIDR).AsBytes() + bertVal := routes.NewValueV6WithIfIndex(routes.FlagsLocalWorkload|routes.FlagInIPAMPool, 1).AsBytes() + err = rtMapV6.Update(bertKey, bertVal) + Expect(err).NotTo(HaveOccurred()) + + bpfIfaceName = "NAT2" + // Arriving at node 2 + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(natedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + Expect(res.dataOut).To(Equal(natedPkt)) + }, withIPv6()) + + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + v, ok := ct[conntrack.NewKeyV6(uint8(17), ipv6.SrcIP, uint16(udp.SrcPort), natIP, natPort)] + Expect(ok).To(BeTrue()) + // No NATing, service already resolved + Expect(v.Type()).To(Equal(conntrack.TypeNormal)) + Expect(v.Flags()).To(Equal(uint16(0))) + + // Arriving at workload at node 2 + expectMark(tcdefs.MarkSeen) + runBpfTest(t, "calico_to_workload_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(natedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + Expect(res.dataOut).To(Equal(natedPkt)) + + recvPkt = res.dataOut + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + + var respPkt []byte + + // Response leaving workload at node 2 + skbMark = 0 + runBpfTest(t, "calico_from_workload_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) { + respPkt = udpResponseRawV6(recvPkt) + res, err := bpfrun(respPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = 
%+v\n", pktR) + + Expect(res.dataOut).To(Equal(respPkt)) + }, withIPv6()) + + // Response leaving node 2 + expectMark(tcdefs.MarkSeenBypass) + runBpfTest(t, "calico_to_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(respPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + Expect(res.dataOut).To(Equal(respPkt)) + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + resetCTMapV6(ctMapV6) + restoreCTMapV6(ctMapV6, fromHostCT) + dumpCTMapV6(ctMapV6) + + hostIP = node1ipV6 + + // Response arriving at node 1 + bpfIfaceName = "NAT1" + skbMark = 0 + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(respPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + Expect(res.dataOut).To(Equal(respPkt)) + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + + // Response arriving at workload at node 1 + expectMark(tcdefs.MarkSeen) + runBpfTest(t, "calico_to_workload_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) { + pktExp := gopacket.NewPacket(respPkt, layers.LayerTypeEthernet, gopacket.Default) + ipv6L := pktExp.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + ipv6R := ipv6L.(*layers.IPv6) + udpL := pktExp.Layer(layers.LayerTypeUDP) + Expect(udpL).NotTo(BeNil()) + udpR := udpL.(*layers.UDP) + + ipv6R.SrcIP = ipv6.DstIP + udpR.SrcPort = udp.DstPort + _ = udpR.SetNetworkLayerForChecksum(ipv6R) + + pktExpSer := gopacket.NewSerializeBuffer() + err := gopacket.SerializePacket(pktExpSer, gopacket.SerializeOptions{ComputeChecksums: true}, pktExp) + Expect(err).NotTo(HaveOccurred()) + + res, err := bpfrun(respPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := 
gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + Expect(res.dataOut).To(Equal(pktExpSer.Bytes())) + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + + // Response leaving to original source + + // clean up + resetCTMapV6(ctMapV6) +} + +func TestNATNodePortV6(t *testing.T) { + RegisterTestingT(t) + + bpfIfaceName = "NP-1" + defer func() { bpfIfaceName = "" }() + + _, ipv6, l4, payload, pktBytes, err := testPacketUDPDefaultNPV6(node1ipV6) + Expect(err).NotTo(HaveOccurred()) + udp := l4.(*layers.UDP) + + err = natMapV6.Update( + nat.NewNATKeyV6(ipv6.DstIP, uint16(udp.DstPort), uint8(17 /* UDP */)).AsBytes(), + nat.NewNATValueV6(0, 1, 0, 0).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + natIP := net.ParseIP("abcd::ffff:0808:0808") + natPort := uint16(666) + + err = natBEMapV6.Update( + nat.NewNATBackendKeyV6(0, 0).AsBytes(), + nat.NewNATBackendValueV6(natIP, natPort).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + node2wCIDR := net.IPNet{ + IP: natIP, + Mask: net.CIDRMask(128, 128), + } + + resetCTMapV6(ctMapV6) // ensure it is clean + + var encapedPkt []byte + + resetRTMap(rtMapV6) + + hostIP = node1ipV6 + skbMark = 0 + + // Arriving at node 1 - non-routable -> denied + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_SHOT)) + }, withIPv6()) + + defer resetRTMapV6(rtMapV6) + Expect(err).NotTo(HaveOccurred()) + err = rtMapV6.Update( + routes.NewKeyV6(ip.CIDRFromIPNet(&node2wCIDR).(ip.V6CIDR)).AsBytes(), + routes.NewValueV6WithNextHop(routes.FlagsRemoteWorkload|routes.FlagInIPAMPool, + ip.FromNetIP(node2ipV6).(ip.V6Addr)).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + err = rtMapV6.Update( + routes.NewKeyV6(ip.CIDRFromIPNet(&node1CIDRV6).(ip.V6CIDR)).AsBytes(), + routes.NewValueV6(routes.FlagsLocalHost).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + err = 
rtMapV6.Update( + routes.NewKeyV6(ip.CIDRFromIPNet(&node2CIDRV6).(ip.V6CIDR)).AsBytes(), + routes.NewValueV6(routes.FlagsRemoteHost).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + dumpRTMapV6(rtMapV6) + rtNode1 := saveRTMapV6(rtMapV6) + + vni := uint32(0) + + // Arriving at node 1 + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + ipv6R := ipv6L.(*layers.IPv6) + Expect(ipv6R.SrcIP.String()).To(Equal(hostIP.String())) + Expect(ipv6R.DstIP.String()).To(Equal(node2ipV6.String())) + + checkVxlanEncap(pktR, false, ipv6, udp, payload) + vni = getVxlanVNI(pktR) + + encapedPkt = res.dataOut + + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + + ctKey := conntrack.NewKeyV6(uint8(17 /* UDP */), + ipv6.SrcIP, uint16(udp.SrcPort), ipv6.DstIP, uint16(udp.DstPort)) + + Expect(ct).Should(HaveKey(ctKey)) + ctr := ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATForward)) + + ctKey = ctr.ReverseNATKey() + Expect(ct).Should(HaveKey(ctKey)) + ctr = ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATReverse)) + + // Approved for both sides due to forwarding through the tunnel + Expect(ctr.Data().A2B.Approved).To(BeTrue()) + Expect(ctr.Data().B2A.Approved).To(BeTrue()) + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + v, ok := ct[conntrack.NewKeyV6(uint8(17 /* UDP */), ipv6.SrcIP, uint16(udp.SrcPort), natIP, natPort)] + Expect(ok).To(BeTrue()) + Expect(v.Type()).To(Equal(conntrack.TypeNATReverse)) + Expect(v.Flags()).To(Equal(conntrack3.FlagNATNPFwd)) + + expectMark(tcdefs.MarkSeenBypassForward) + // Leaving node 1 + 
runBpfTest(t, "calico_to_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(encapedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + Expect(res.dataOut).To(Equal(encapedPkt)) + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + fromHostCT := saveCTMapV6(ctMapV6) + + encapedPktArrivesAtNode2 := make([]byte, len(encapedPkt)) + copy(encapedPktArrivesAtNode2, encapedPkt) + + resetCTMapV6(ctMapV6) + + var recvPkt []byte + + hostIP = node2ipV6 + + // change the routing - it is a local workload now! + err = rtMapV6.Update( + routes.NewKeyV6(ip.CIDRFromIPNet(&node2wCIDR).(ip.V6CIDR)).AsBytes(), + routes.NewValueV6(routes.FlagsLocalWorkload|routes.FlagInIPAMPool).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + // we must know that the encaped packet src ip is from a known host + err = rtMapV6.Update( + routes.NewKeyV6(ip.CIDRFromIPNet(&node1CIDRV6).(ip.V6CIDR)).AsBytes(), + routes.NewValueV6(routes.FlagsRemoteHost).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + err = rtMapV6.Update( + routes.NewKeyV6(ip.CIDRFromIPNet(&node2CIDRV6).(ip.V6CIDR)).AsBytes(), + routes.NewValueV6(routes.FlagsLocalHost).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + dumpRTMapV6(rtMapV6) + + // now we are at the node with local workload + err = natMapV6.Update( + nat.NewNATKeyV6(ipv6.DstIP, uint16(udp.DstPort), uint8(17 /* UDP */)).AsBytes(), + nat.NewNATValueV6(0 /* id */, 1 /* count */, 1 /* local */, 0).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + // Arriving at node 2 + bpfIfaceName = "NP-2" + + arpMapN2 := saveARPMapV6(arpMapV6) + Expect(arpMapN2).To(HaveLen(0)) + + skbMark = 0 + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(encapedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := 
gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + payloadL := pktR.ApplicationLayer() + Expect(payloadL).NotTo(BeNil()) + vxlanL := gopacket.NewPacket(payloadL.Payload(), layers.LayerTypeVXLAN, gopacket.Default) + Expect(vxlanL).NotTo(BeNil()) + fmt.Printf("vxlanL = %+v\n", vxlanL) + + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + ipv6R := ipv6L.(*layers.IPv6) + Expect(ipv6R.SrcIP.String()).To(Equal(ipv6.SrcIP.String())) + Expect(ipv6R.DstIP.String()).To(Equal(natIP.String())) + + udpL := pktR.Layer(layers.LayerTypeUDP) + Expect(udpL).NotTo(BeNil()) + udpR := udpL.(*layers.UDP) + Expect(udpR.SrcPort).To(Equal(layers.UDPPort(udp.SrcPort))) + Expect(udpR.DstPort).To(Equal(layers.UDPPort(natPort))) + + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + + ctKey := conntrack.NewKeyV6(uint8(17 /* UDP */), + ipv6.SrcIP, uint16(udp.SrcPort), ipv6.DstIP, uint16(udp.DstPort)) + + Expect(ct).Should(HaveKey(ctKey)) + ctr := ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATForward)) + Expect(ctr.NATSPort()).To(Equal(uint16(0))) + + ctKey = ctr.ReverseNATKey() + Expect(ct).Should(HaveKey(ctKey)) + ctr = ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATReverse)) + + // Approved source side + Expect(ctr.Data().A2B.Approved).To(BeTrue()) + // Dest not approved yet + Expect(ctr.Data().B2A.Approved).NotTo(BeTrue()) + + recvPkt = res.dataOut + }, withIPv6()) + + expectMark(tcdefs.MarkSeen) + + dumpCTMapV6(ctMapV6) + ct, err = conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + v, ok = ct[conntrack.NewKeyV6(uint8(17 /* UDP */), ipv6.SrcIP, uint16(udp.SrcPort), natIP, natPort)] + Expect(ok).To(BeTrue()) + Expect(v.Type()).To(Equal(conntrack.TypeNATReverse)) + Expect(v.Flags()).To(Equal(conntrack3.FlagExtLocal)) + + dumpARPMapV6(arpMapV6) + + arpMapN2 = saveARPMapV6(arpMapV6) + Expect(arpMapN2).To(HaveLen(1)) + arpKey := arp.NewKeyV6(node1ipV6, 1) // ifindex is 
always 1 in UT + Expect(arpMapN2).To(HaveKey(arpKey)) + macDst := encapedPkt[0:6] + macSrc := encapedPkt[6:12] + Expect(arpMapN2[arpKey]).To(Equal(arp.NewValue(macDst, macSrc))) + + // try a spoofed tunnel packet, should be dropped and have no effect + skbMark = 0 + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + // modify the only known good src IP, we do not care about csums at this point + encapedPkt[26] = 234 + res, err := bpfrun(encapedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_SHOT)) + }, withIPv6()) + + skbMark = tcdefs.MarkSeen + + // Insert the reverse route for backend for RPF check. + resetRTMap(rtMapV6) + beV4CIDR := ip.CIDRFromNetIP(natIP).(ip.V6CIDR) + bertKey := routes.NewKeyV6(beV4CIDR).AsBytes() + bertVal := routes.NewValueV6WithIfIndex(routes.FlagsLocalWorkload|routes.FlagInIPAMPool, 1).AsBytes() + err = rtMapV6.Update(bertKey, bertVal) + Expect(err).NotTo(HaveOccurred()) + + // Arriving at workload at node 2 + runBpfTest(t, "calico_to_workload_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(recvPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + Expect(res.dataOut).To(Equal(recvPkt)) + + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + + ctKey := conntrack.NewKeyV6(uint8(17 /* UDP */), + ipv6.SrcIP, uint16(udp.SrcPort), ipv6.DstIP, uint16(udp.DstPort)) + + Expect(ct).Should(HaveKey(ctKey)) + ctr := ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATForward)) + + ctKey = ctr.ReverseNATKey() + Expect(ct).Should(HaveKey(ctKey)) + ctr = ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATReverse), + fmt.Sprintf("Expected reverse conntrack entry but got %v", ctr)) + + // Approved source side + Expect(ctr.Data().A2B.Approved).To(BeTrue()) + // Approved 
destination side as well + Expect(ctr.Data().B2A.Approved).To(BeTrue()) + }, withIPv6()) + + skbMark = 0 + + // Response leaving workload at node 2 + runBpfTest(t, "calico_from_workload_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) { + respPkt := udpResponseRawV6(recvPkt) + // Change the MAC addresses so that we can observe that the right + // addresses were patched in. + copy(respPkt[:6], []byte{1, 2, 3, 4, 5, 6}) + copy(respPkt[6:12], []byte{6, 5, 4, 3, 2, 1}) + res, err := bpfrun(respPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_REDIRECT)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ethL := pktR.Layer(layers.LayerTypeEthernet) + Expect(ethL).NotTo(BeNil()) + ethR := ethL.(*layers.Ethernet) + Expect(ethR).To(layersMatchFields(&layers.Ethernet{ + SrcMAC: macDst, + DstMAC: macSrc, + EthernetType: layers.EthernetTypeIPv6, + })) + + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + ipv6R := ipv6L.(*layers.IPv6) + Expect(ipv6R.SrcIP.String()).To(Equal(hostIP.String())) + Expect(ipv6R.DstIP.String()).To(Equal(node1ipV6.String())) + + checkVxlan(pktR) + + encapedPkt = res.dataOut + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + + expectMark(tcdefs.MarkSeen) + + hostIP = node2ipV6 + + // Response leaving node 2 + runBpfTest(t, "calico_to_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(encapedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + ipv6R := ipv6L.(*layers.IPv6) + // check that the IP is fixed up + Expect(ipv6R.SrcIP.String()).To(Equal(node2ipV6.String())) + Expect(ipv6R.DstIP.String()).To(Equal(node1ipV6.String())) + + checkVxlan(pktR) + + encapedPkt = res.dataOut + }, 
withIPv6()) + + dumpCTMapV6(ctMapV6) + resetCTMapV6(ctMapV6) + restoreCTMapV6(ctMapV6, fromHostCT) + dumpCTMapV6(ctMapV6) + + hostIP = node1ipV6 + + // change to routing again to a remote workload + resetRTMap(rtMapV6) + restoreRTMapV6(rtMapV6, rtNode1) + dumpRTMapV6(rtMapV6) + + // Response arriving at node 1 + bpfIfaceName = "NP-1" + skbMark = 0 + + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(encapedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + ipv6R := ipv6L.(*layers.IPv6) + Expect(ipv6R.DstIP.String()).To(Equal(ipv6.SrcIP.String())) + Expect(ipv6R.SrcIP.String()).To(Equal(ipv6.DstIP.String())) + + udpL := pktR.Layer(layers.LayerTypeUDP) + Expect(udpL).NotTo(BeNil()) + udpR := udpL.(*layers.UDP) + Expect(udpR.SrcPort).To(Equal(udp.DstPort)) + Expect(udpR.DstPort).To(Equal(udp.SrcPort)) + + payloadL := pktR.ApplicationLayer() + Expect(payloadL).NotTo(BeNil()) + Expect(payload).To(Equal(payloadL.Payload())) + + recvPkt = res.dataOut + }, withIPv6()) + + expectMark(tcdefs.MarkSeenBypassForward) + saveMark := skbMark + + dumpCTMapV6(ctMapV6) + + skbMark = 0 + // try a spoofed tunnel packet returning back, should be dropped and have no effect + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + // modify the only known good src IP, we do not care about csums at this point + encapedPkt[26] = 235 + res, err := bpfrun(encapedPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_SHOT)) + }, withIPv6()) + + skbMark = saveMark + // Response leaving to original source + runBpfTest(t, "calico_to_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(recvPkt) + Expect(err).NotTo(HaveOccurred()) + 
Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + + ctKey := conntrack.NewKeyV6(uint8(17 /* UDP */), + ipv6.SrcIP, uint16(udp.SrcPort), ipv6.DstIP, uint16(udp.DstPort)) + + Expect(ct).Should(HaveKey(ctKey)) + ctr := ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATForward)) + + ctKey = ctr.ReverseNATKey() + Expect(ct).Should(HaveKey(ctKey)) + ctr = ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATReverse)) + + // Approved for both sides due to forwarding through the tunnel + Expect(ctr.Data().A2B.Approved).To(BeTrue()) + Expect(ctr.Data().B2A.Approved).To(BeTrue()) + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + + skbMark = 0 + // Another pkt arriving at node 1 - uses existing CT entries + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + ipv6R := ipv6L.(*layers.IPv6) + Expect(ipv6R.SrcIP.String()).To(Equal(hostIP.String())) + Expect(ipv6R.DstIP.String()).To(Equal(node2ipV6.String())) + + checkVxlanEncap(pktR, false, ipv6, udp, payload) + }, withIPv6()) + + expectMark(tcdefs.MarkSeenBypassForward) + + /* + * TEST that unknown VNI is passed through + */ + testUnrelatedVXLAN(6, t, node2ipV6, vni) + + // TEST host-networked backend + { + resetCTMapV6(ctMapV6) + + var recvPkt []byte + + hostIP = node2ipV6 + skbMark = 0 + + // we must know that the encaped packet src ip is from a known host + err = rtMapV6.Update( + routes.NewKeyV6(ip.CIDRFromIPNet(&node1CIDRV6).(ip.V6CIDR)).AsBytes(), + 
routes.NewValueV6(routes.FlagsRemoteHost).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + err = rtMapV6.Update( + routes.NewKeyV6(ip.CIDRFromIPNet(&node2CIDRV6).(ip.V6CIDR)).AsBytes(), + routes.NewValueV6(routes.FlagsLocalHost).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + dumpRTMapV6(rtMapV6) + + // now we are at the node with local workload + err = natMapV6.Update( + nat.NewNATKeyV6(net.ParseIP("ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"), + uint16(udp.DstPort), uint8(17 /* UDP */)).AsBytes(), + nat.NewNATValueV6(0 /* id */, 1 /* count */, 1 /* local */, 0).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + // make it point to the local host - host networked backend + err = natBEMapV6.Update( + nat.NewNATBackendKeyV6(0, 0).AsBytes(), + nat.NewNATBackendValueV6(node2ipV6, natPort).AsBytes(), + ) + Expect(err).NotTo(HaveOccurred()) + + // Arriving at node 2 + bpfIfaceName = "NP-2" + + skbMark = 0 + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(encapedPktArrivesAtNode2) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + ipv6R := ipv6L.(*layers.IPv6) + Expect(ipv6R.SrcIP.String()).To(Equal(ipv6.SrcIP.String())) + Expect(ipv6R.DstIP.String()).To(Equal(node2ipV6.String())) + + udpL := pktR.Layer(layers.LayerTypeUDP) + Expect(udpL).NotTo(BeNil()) + udpR := udpL.(*layers.UDP) + Expect(udpR.SrcPort).To(Equal(layers.UDPPort(udp.SrcPort))) + Expect(udpR.DstPort).To(Equal(layers.UDPPort(natPort))) + + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + + ctKey := conntrack.NewKeyV6(uint8(17 /* UDP */), + ipv6.SrcIP, uint16(udp.SrcPort), ipv6.DstIP, uint16(udp.DstPort)) + + Expect(ct).Should(HaveKey(ctKey)) + ctr := ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATForward)) + + ctKey = 
ctr.ReverseNATKey() + Expect(ct).Should(HaveKey(ctKey)) + ctr = ct[ctKey] + Expect(ctr.Type()).To(Equal(conntrack.TypeNATReverse)) + + // Approved source side + Expect(ctr.Data().A2B.Approved).To(BeTrue()) + // Dest not approved yet + Expect(ctr.Data().B2A.Approved).NotTo(BeTrue()) + + recvPkt = res.dataOut + }, withIPv6()) + + dumpCTMapV6(ctMapV6) + + skbMark = 0 + + // Response leaving workload at node 2 + runBpfTest(t, "calico_to_host_ep", nil, func(bpfrun bpfProgRunFn) { + respPkt := udpResponseRawV6(recvPkt) + + // Change the MAC addresses so that we can observe that the right + // addresses were patched in. + macUntouched := []byte{6, 5, 4, 3, 2, 1} + copy(respPkt[:6], []byte{1, 2, 3, 4, 5, 6}) + copy(respPkt[6:12], macUntouched) + + res, err := bpfrun(respPkt) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default) + fmt.Printf("pktR = %+v\n", pktR) + + ethL := pktR.Layer(layers.LayerTypeEthernet) + Expect(ethL).NotTo(BeNil()) + ethR := ethL.(*layers.Ethernet) + Expect(ethR).To(layersMatchFields(&layers.Ethernet{ + SrcMAC: macUntouched, // Source is set by net stack and should not be touched. 
+ DstMAC: macSrc, + EthernetType: layers.EthernetTypeIPv6, + })) + + ipv6L := pktR.Layer(layers.LayerTypeIPv6) + Expect(ipv6L).NotTo(BeNil()) + ipv6R := ipv6L.(*layers.IPv6) + Expect(ipv6R.SrcIP.String()).To(Equal(node2ipV6.String())) + Expect(ipv6R.DstIP.String()).To(Equal(node1ipV6.String())) + + checkVxlan(pktR) + }, withHostNetworked(), withIPv6()) + } +} diff --git a/felix/bpf/ut/pol_prog_test.go b/felix/bpf/ut/pol_prog_test.go index d782ae8b83e..afc1ec1143f 100644 --- a/felix/bpf/ut/pol_prog_test.go +++ b/felix/bpf/ut/pol_prog_test.go @@ -39,7 +39,7 @@ import ( "github.com/projectcalico/calico/felix/proto" ) -func TestLoadAllowAllProgram(t *testing.T) { +func TestPolicyLoadAllowAllProgram(t *testing.T) { RegisterTestingT(t) b := asm.NewBlock(false) @@ -60,7 +60,7 @@ func TestLoadAllowAllProgram(t *testing.T) { Expect(rc.RC).To(BeNumerically("==", -1)) } -func TestLoadProgramWithMapAccess(t *testing.T) { +func TestPolicyLoadProgramWithMapAccess(t *testing.T) { RegisterTestingT(t) ipsMap := ipsets.Map() @@ -113,7 +113,7 @@ func makeRulesSingleTier(protoRules []*proto.Rule) polprog.Rules { } } -func TestLoadKitchenSinkPolicy(t *testing.T) { +func TestPolicyLoadKitchenSinkPolicy(t *testing.T) { RegisterTestingT(t) alloc := idalloc.New() allocID := func(id string) string { @@ -164,7 +164,7 @@ func TestLoadKitchenSinkPolicy(t *testing.T) { Expect(fd.Close()).NotTo(HaveOccurred()) } -func TestLoadGarbageProgram(t *testing.T) { +func TestPolicyLoadGarbageProgram(t *testing.T) { RegisterTestingT(t) var insns asm.Insns @@ -2206,32 +2206,20 @@ func wrap(p polProgramTest) polProgramTestWrapper { return polProgramTestWrapper{p} } -func TestPolicyPrograms(t *testing.T) { +func TestPolicyPolicyPrograms(t *testing.T) { for i, p := range polProgramTests { - if p.ForIPv6 { - // XXX skip for now - continue - } t.Run(fmt.Sprintf("%d:Policy=%s", i, p.PolicyName), func(t *testing.T) { runTest(t, wrap(p)) }) } } -func TestHostPolicyPrograms(t *testing.T) { +func 
TestPolicyHostPolicyPrograms(t *testing.T) { for i, p := range hostPolProgramTests { - if p.ForIPv6 { - // XXX skip for now - continue - } t.Run(fmt.Sprintf("%d:Policy=%s", i, p.PolicyName), func(t *testing.T) { runTest(t, wrap(p)) }) } } -func TestXDPPolicyPrograms(t *testing.T) { +func TestPolicyXDPPolicyPrograms(t *testing.T) { for i, p := range xdpPolProgramTests { - if p.ForIPv6 { - // XXX skip for now - continue - } t.Run(fmt.Sprintf("%d:Policy=%s", i, p.PolicyName), func(t *testing.T) { runTest(t, wrap(p)) }) } } @@ -2375,7 +2363,7 @@ func ipUintFromString(addrStr string, section int) uint32 { return binary.LittleEndian.Uint32(addrBytes[section*4 : (section+1)*4]) } -func TestIPUintFromString(t *testing.T) { +func TestPolicyIPUintFromString(t *testing.T) { RegisterTestingT(t) Expect(ipUintFromString("10.0.0.1", 0)).To(Equal(uint32(0x0100000a))) Expect(ipUintFromString("10.0.0.1", 1)).To(Equal(uint32(0))) @@ -2423,8 +2411,15 @@ func runTest(t *testing.T, tp testPolicy) { Expect(err).NotTo(HaveOccurred()) // Build the program. 
+ allowIdx := tcdefs.ProgIndexAllowed + denyIdx := tcdefs.ProgIndexDrop + if tp.ForIPv6() { + allowIdx = tcdefs.ProgIndexV6Allowed + denyIdx = tcdefs.ProgIndexV6Drop + } + pg := polprog.NewBuilder(forceAlloc, ipsMap.MapFD(), testStateMap.MapFD(), jumpMap.MapFD(), - polprog.WithAllowDenyJumps(tcdefs.ProgIndexAllowed, tcdefs.ProgIndexDrop)) + polprog.WithAllowDenyJumps(allowIdx, denyIdx)) if tp.ForIPv6() { pg.EnableIPv6Mode() } diff --git a/felix/bpf/ut/snat_test.go b/felix/bpf/ut/snat_test.go index a8b91f25372..900d86a7d8b 100644 --- a/felix/bpf/ut/snat_test.go +++ b/felix/bpf/ut/snat_test.go @@ -38,7 +38,7 @@ func TestSNATHostServiceRemotePod(t *testing.T) { ipHdr := ipv4Default ipHdr.Id = 1 - eth, ipv4, l4, payload, pktBytes, err := testPacket(nil, ipHdr, nil, nil) + eth, ipv4, l4, payload, pktBytes, err := testPacketV4(nil, ipHdr, nil, nil) Expect(err).NotTo(HaveOccurred()) udp := l4.(*layers.UDP) @@ -113,7 +113,7 @@ func TestSNATHostServiceRemotePod(t *testing.T) { udpNat.DstPort = layers.UDPPort(natPort) // created the expected packet after NAT, with recalculated csums - _, _, _, _, resPktBytes, err := testPacket(eth, &ipv4Nat, &udpNat, payload) + _, _, _, _, resPktBytes, err := testPacketV4(eth, &ipv4Nat, &udpNat, payload) Expect(err).NotTo(HaveOccurred()) // expect them to be the same @@ -141,7 +141,7 @@ func TestSNATHostServiceRemotePod(t *testing.T) { // Second packet - conntrack hit ipHdr.Id = 2 - eth, ipv4, _, payload, pktBytes, err = testPacket(nil, ipHdr, nil, nil) + eth, ipv4, _, payload, pktBytes, err = testPacketV4(nil, ipHdr, nil, nil) Expect(err).NotTo(HaveOccurred()) skbMark = 0 @@ -162,7 +162,7 @@ func TestSNATHostServiceRemotePod(t *testing.T) { udpNat.DstPort = layers.UDPPort(natPort) // created the expected packet after NAT, with recalculated csums - _, _, _, _, resPktBytes, err := testPacket(eth, &ipv4Nat, &udpNat, payload) + _, _, _, _, resPktBytes, err := testPacketV4(eth, &ipv4Nat, &udpNat, payload) Expect(err).NotTo(HaveOccurred()) // 
expect them to be the same @@ -209,7 +209,7 @@ func TestSNATHostServiceRemotePod(t *testing.T) { ethResp.SrcMAC, ethResp.DstMAC = ethResp.DstMAC, ethResp.SrcMAC // created the expected packet after NAT, with recalculated csums - _, _, _, _, resPktBytes, err := testPacket(&ethResp, &ipResp, &udpResp, payload) + _, _, _, _, resPktBytes, err := testPacketV4(&ethResp, &ipResp, &udpResp, payload) Expect(err).NotTo(HaveOccurred()) // expect them to be the same diff --git a/felix/bpf/ut/tcp_test.go b/felix/bpf/ut/tcp_test.go index af17d137291..82142f46683 100644 --- a/felix/bpf/ut/tcp_test.go +++ b/felix/bpf/ut/tcp_test.go @@ -44,7 +44,7 @@ func TestTCPRecycleClosedConn(t *testing.T) { DataOffset: 5, } - _, _, _, _, synPkt, err := testPacket(nil, nil, tcpSyn, nil) + _, _, _, _, synPkt, err := testPacketV4(nil, nil, tcpSyn, nil) Expect(err).NotTo(HaveOccurred()) // Insert a reverse route for the source workload. @@ -130,7 +130,7 @@ func TestTCPRecycleClosedConnNAT(t *testing.T) { DataOffset: 5, } - _, ipv4, l4, _, synPkt, err := testPacket(nil, nil, tcpSyn, nil) + _, ipv4, l4, _, synPkt, err := testPacketV4(nil, nil, tcpSyn, nil) Expect(err).NotTo(HaveOccurred()) tcp := l4.(*layers.TCP) diff --git a/felix/bpf/ut/to_host_allowed_test.go b/felix/bpf/ut/to_host_allowed_test.go index 38faf29518a..b7725c212b1 100644 --- a/felix/bpf/ut/to_host_allowed_test.go +++ b/felix/bpf/ut/to_host_allowed_test.go @@ -83,7 +83,7 @@ func TestToHostAllowedCTFull(t *testing.T) { DataOffset: 5, } - _, ipv4, _, _, synPkt, err := testPacket(nil, nil, tcpSyn, nil) + _, ipv4, _, _, synPkt, err := testPacketV4(nil, nil, tcpSyn, nil) Expect(err).NotTo(HaveOccurred()) destCIDR := net.IPNet{ @@ -178,7 +178,7 @@ func TestToHostAllowedCTFull(t *testing.T) { ipv4Ret := *ipv4 ipv4Ret.SrcIP, ipv4Ret.DstIP = ipv4Ret.DstIP, ipv4Ret.SrcIP - _, _, _, _, synAckPkt, err := testPacket(nil, &ipv4Ret, tcpSynAck, nil) + _, _, _, _, synAckPkt, err := testPacketV4(nil, &ipv4Ret, tcpSynAck, nil) 
Expect(err).NotTo(HaveOccurred()) skbMark = tcdefs.MarkSeen @@ -196,7 +196,7 @@ func TestToHostAllowedCTFull(t *testing.T) { DataOffset: 5, } - _, _, _, _, ackPkt, err := testPacket(nil, nil, tcpAck, nil) + _, _, _, _, ackPkt, err := testPacketV4(nil, nil, tcpAck, nil) Expect(err).NotTo(HaveOccurred()) skbMark = 0 diff --git a/felix/bpf/ut/whitelist_test.go b/felix/bpf/ut/whitelist_test.go index 8f380e6aa2b..e003980b4e6 100644 --- a/felix/bpf/ut/whitelist_test.go +++ b/felix/bpf/ut/whitelist_test.go @@ -240,7 +240,7 @@ func TestAllowFromHostExitHost(t *testing.T) { ipHdr.SrcIP = node1ip ipHdr.DstIP = node2ip - _, ipv4, l4, _, pktBytes, err := testPacket(nil, ipHdr, nil, nil) + _, ipv4, l4, _, pktBytes, err := testPacketV4(nil, ipHdr, nil, nil) Expect(err).NotTo(HaveOccurred()) udp := l4.(*layers.UDP) @@ -308,3 +308,81 @@ func TestAllowFromHostExitHost(t *testing.T) { Expect(ctr.Data().B2A.Approved).To(BeTrue()) }) } + +func TestAllowEnterHostToWorkloadV6(t *testing.T) { + RegisterTestingT(t) + + bpfIfaceName = "HWwl" + defer func() { bpfIfaceName = "" }() + + hop := &layers.IPv6HopByHop{} + hop.NextHeader = layers.IPProtocolUDP + + /* from gopacket ip6_test.go */ + tlv := &layers.IPv6HopByHopOption{} + tlv.OptionType = 0x01 //PadN + tlv.OptionData = []byte{0x00, 0x00, 0x00, 0x00} + hop.Options = append(hop.Options, tlv) + + _, _, l4, _, pktBytes, err := testPacketV6(nil, ipv6Default, nil, nil, hop) + Expect(err).NotTo(HaveOccurred()) + udp := l4.(*layers.UDP) + + resetMap(ctMapV6) // ensure it is clean + + hostIP = node1ip + + // Insert a reverse route for the source workload. 
+ rtKey := routes.NewKeyV6(srcV6CIDR).AsBytes() + rtVal := routes.NewValueV6(routes.FlagsRemoteWorkload | routes.FlagInIPAMPool).AsBytes() + err = rtMapV6.Update(rtKey, rtVal) + Expect(err).NotTo(HaveOccurred()) + rtKey = routes.NewKeyV6(dstV6CIDR).AsBytes() + rtVal = routes.NewValueV6WithIfIndex(routes.FlagsRemoteWorkload|routes.FlagInIPAMPool, 1).AsBytes() + err = rtMapV6.Update(rtKey, rtVal) + Expect(err).NotTo(HaveOccurred()) + defer resetRTMap(rtMapV6) + + dumpRTMapV6(rtMapV6) + + ctKey := conntrack.NewKeyV6(17, /* UDP */ + ipv6Default.SrcIP, uint16(udp.SrcPort), ipv6Default.DstIP, uint16(udp.DstPort)) + + skbMark = 0 + runBpfTest(t, "calico_from_host_ep", nil, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + Expect(ct).Should(HaveKey(ctKey)) + + ctr := ct[ctKey] + + // Approved by HEP + Expect(ctr.Data().A2B.Approved).To(BeTrue()) + // Not approved by WEP yet + Expect(ctr.Data().B2A.Approved).NotTo(BeTrue()) + }, withIPv6()) + + expectMark(tcdefs.MarkSeen) + + dumpCTMapV6(ctMapV6) + + runBpfTest(t, "calico_to_workload_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) { + res, err := bpfrun(pktBytes) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Retval).To(Equal(resTC_ACT_UNSPEC)) + + ct, err := conntrack.LoadMapMemV6(ctMapV6) + Expect(err).NotTo(HaveOccurred()) + Expect(ct).Should(HaveKey(ctKey)) + + ctr := ct[ctKey] + + // Still approved both by HEP and WEP + Expect(ctr.Data().B2A.Approved).To(BeTrue()) + Expect(ctr.Data().A2B.Approved).To(BeTrue()) + }, withIPv6()) +} diff --git a/felix/bpf/ut/xdp_test.go b/felix/bpf/ut/xdp_test.go index 29a5b34a20c..293dadb82e0 100644 --- a/felix/bpf/ut/xdp_test.go +++ b/felix/bpf/ut/xdp_test.go @@ -241,7 +241,7 @@ func TestXDPPrograms(t *testing.T) { for i, tc := range xdpTestCases { bpfIfaceName = fmt.Sprintf("XDP-%d", i) runBpfTest(t, 
"xdp_calico_entrypoint", tc.Rules, func(bpfrun bpfProgRunFn) { - _, _, _, _, pktBytes, err := testPacket(nil, tc.IPv4Header, tc.NextHeader, nil) + _, _, _, _, pktBytes, err := testPacketV4(nil, tc.IPv4Header, tc.NextHeader, nil) Expect(err).NotTo(HaveOccurred()) res, err := bpfrun(pktBytes) Expect(err).NotTo(HaveOccurred())