NETOBSERV-1995: Use global map, spinlock, split maps #469

Merged: 8 commits, Dec 12, 2024
4 changes: 2 additions & 2 deletions .golangci.yml
@@ -16,9 +16,9 @@ linters:
- stylecheck
- typecheck
- unused
run:
  go: "1.22"
linters-settings:
  stylecheck:
    go: "1.22"
  gocritic:
    enabled-checks:
      - hugeParam
1 change: 1 addition & 0 deletions Makefile
@@ -106,6 +106,7 @@ prereqs: ## Check if prerequisites are met, and install missing dependencies
fmt: ## Run go fmt against code.
@echo "### Formatting code"
go fmt ./...
find ./bpf -type f -not -path "./bpf/headers/*" -name "*.[ch]" | xargs clang-format -i --Werror
Contributor:
I have this rule as part of the lint target. We could make it part of fmt, but in that case we'd need to remove it from lint so we don't run it twice during the build?

Member Author:
I think those are two different things: the fmt rule auto-formats as part of the local build process on your machine, whereas the lint rule is still useful on the CI to make sure we didn't commit & push unformatted code.


.PHONY: lint
lint: prereqs ## Lint the code
92 changes: 66 additions & 26 deletions bpf/flows.c
@@ -51,22 +51,25 @@
*/
#include "network_events_monitoring.h"

static inline void update_existing_flow(flow_metrics *aggregate_flow, pkt_info *pkt, int dns_errno,
u64 len) {
static inline void update_existing_flow(flow_metrics *aggregate_flow, pkt_info *pkt, u64 len) {
bpf_spin_lock(&aggregate_flow->lock);
aggregate_flow->packets += 1;
aggregate_flow->bytes += len;
aggregate_flow->end_mono_time_ts = pkt->current_ts;
// it might happen that start_mono_time hasn't been set due to
// the way percpu hashmap deal with concurrent map entries
if (aggregate_flow->start_mono_time_ts == 0) {
aggregate_flow->start_mono_time_ts = pkt->current_ts;
}
aggregate_flow->flags |= pkt->flags;
aggregate_flow->dscp = pkt->dscp;
aggregate_flow->dns_record.id = pkt->dns_id;
aggregate_flow->dns_record.flags = pkt->dns_flags;
aggregate_flow->dns_record.latency = pkt->dns_latency;
aggregate_flow->dns_record.errno = dns_errno;
bpf_spin_unlock(&aggregate_flow->lock);
Contributor:
We no longer need

// it might happen that start_mono_time hasn't been set due to

as that was specific to the per-CPU map.

Member Author:
correct, good catch

Member Author:
fixed

}

static inline void update_dns(additional_metrics *extra_metrics, pkt_info *pkt, int dns_errno) {
if (pkt->dns_id != 0) {
Contributor:
You can check dns_errno here: if it's 0, add the DNS info; otherwise add the DNS error.

Member Author:
There can be an error and still DNS data: when the DNS request wasn't found, we can't compute latency, but we still provide the other data and return ENOENT.
So it shouldn't be an if/else here, if we want to still provide what we have.

extra_metrics->dns_record.id = pkt->dns_id;
extra_metrics->dns_record.flags = pkt->dns_flags;
extra_metrics->dns_record.latency = pkt->dns_latency;
}
if (dns_errno != 0) {
extra_metrics->dns_record.errno = dns_errno;
}
}

static inline int flow_monitor(struct __sk_buff *skb, u8 direction) {
@@ -76,6 +79,9 @@ static inline int flow_monitor(struct __sk_buff *skb, u8 direction) {
return TC_ACT_OK;
}
do_sampling = 1;

u16 eth_protocol = 0;

pkt_info pkt;
__builtin_memset(&pkt, 0, sizeof(pkt));

@@ -90,7 +96,7 @@ static inline int flow_monitor(struct __sk_buff *skb, u8 direction) {
struct ethhdr *eth = (struct ethhdr *)data;
u64 len = skb->len;

if (fill_ethhdr(eth, data_end, &pkt) == DISCARD) {
if (fill_ethhdr(eth, data_end, &pkt, &eth_protocol) == DISCARD) {
return TC_ACT_OK;
}

@@ -99,7 +105,7 @@ static inline int flow_monitor(struct __sk_buff *skb, u8 direction) {
id.direction = direction;

// check if this packet need to be filtered if filtering feature is enabled
bool skip = check_and_do_flow_filtering(&id, pkt.flags, 0);
bool skip = check_and_do_flow_filtering(&id, pkt.flags, 0, eth_protocol);
if (skip) {
return TC_ACT_OK;
}
@@ -108,30 +114,22 @@ static inline int flow_monitor(struct __sk_buff *skb, u8 direction) {
if (enable_dns_tracking) {
dns_errno = track_dns_packet(skb, &pkt);
}
// TODO: we need to add spinlock here when we deprecate versions prior to 5.1, or provide
// a spinlocked alternative version and use it selectively https://lwn.net/Articles/779120/
flow_metrics *aggregate_flow = (flow_metrics *)bpf_map_lookup_elem(&aggregated_flows, &id);
if (aggregate_flow != NULL) {
update_existing_flow(aggregate_flow, &pkt, dns_errno, len);
update_existing_flow(aggregate_flow, &pkt, len);
} else {
// Key does not exist in the map, and will need to create a new entry.
u64 rtt = 0;
Member Author:
@msherif1234 I don't understand the rationale for this code. We're setting default RTT for TCP flows, how could that be accurate? I don't see how this wouldn't produce fake data.

Contributor:
This is the min value for when the hook doesn't fire. IIRC it was a define in the kernel, and I believe we documented that as the default min RTT; when I recall more I will update here.

if (enable_rtt && id.transport_protocol == IPPROTO_TCP) {
rtt = MIN_RTT;
}
flow_metrics new_flow = {
.packets = 1,
.bytes = len,
.eth_protocol = eth_protocol,
.start_mono_time_ts = pkt.current_ts,
.end_mono_time_ts = pkt.current_ts,
.flags = pkt.flags,
.dscp = pkt.dscp,
.dns_record.id = pkt.dns_id,
.dns_record.flags = pkt.dns_flags,
.dns_record.latency = pkt.dns_latency,
.dns_record.errno = dns_errno,
.flow_rtt = rtt,
};
__builtin_memcpy(new_flow.dst_mac, eth->h_dest, ETH_ALEN);
__builtin_memcpy(new_flow.src_mac, eth->h_source, ETH_ALEN);

long ret = bpf_map_update_elem(&aggregated_flows, &id, &new_flow, BPF_NOEXIST);
Contributor:
we will probably need to spin lock/unlock here as well

Member Author:
the lock is on the value object, not the key, and it's a new object in that case, so I don't think it makes sense to lock

Contributor:
If you have concurrent new flows on different CPUs, one should create the entry and the other will update it?

Comment:
That's what the BPF_NOEXIST flag does: If two CPUs call this concurrently, one of them will fail with EEXIST, and then fall back to the lookup/update path (which does take the lock)
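
A minimal sketch of that pattern, for reference (`value_t`, `locked_update` and `upsert` are illustrative names, not the PR's actual identifiers; the real code uses `flow_metrics` and `update_existing_flow`):

```c
#include <vmlinux.h> // or the repo's own BPF headers
#include <bpf/bpf_helpers.h>

// The spin lock must be embedded in the map value itself.
typedef struct {
    struct bpf_spin_lock lock;
    __u64 packets;
    __u64 bytes;
} value_t;

static __always_inline void locked_update(value_t *val, __u64 len) {
    bpf_spin_lock(&val->lock);
    val->packets += 1;
    val->bytes += len;
    bpf_spin_unlock(&val->lock);
}

// Create-or-update without losing counts under concurrency.
static __always_inline void upsert(void *map, void *key, __u64 len) {
    value_t *val = bpf_map_lookup_elem(map, key);
    if (val) {
        locked_update(val, len);
        return;
    }
    value_t fresh = { .packets = 1, .bytes = len };
    // BPF_NOEXIST: exactly one CPU wins the creation race; the losers
    // get -EEXIST and fall back to the locked in-place update.
    if (bpf_map_update_elem(map, key, &fresh, BPF_NOEXIST) == -EEXIST) {
        val = bpf_map_lookup_elem(map, key);
        if (val) {
            locked_update(val, len);
        }
    }
}
```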

Contributor:
With EEXIST, the lookup and update need to take the lock as well. What about BPF_F_LOCK? Can I set that flag to take care of the lock/unlock for me?

Comment:
BPF_F_LOCK is for when you want to replace the whole contents of the map value; but that's not what you're doing, you are increasing a couple of counters. So you just re-use exactly the same update code as for when the map lookup succeeds in the first place :)
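
To illustrate the distinction, a hedged user-space sketch (assuming libbpf; `map_fd` and `flow_snapshot` are illustrative): BPF_F_LOCK makes the kernel hold the value's embedded spin lock while copying or replacing the whole value, which suits a collector reading snapshots, not a BPF program bumping counters in place.

```c
#include <bpf/bpf.h> /* libbpf user-space API */

/* Copy out one whole map value while the kernel holds its spin lock,
 * so the reader never observes a torn packets/bytes pair. */
static int flow_snapshot(int map_fd, const void *key, void *value_out) {
    return bpf_map_lookup_elem_flags(map_fd, key, value_out, BPF_F_LOCK);
}
```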

if (ret != 0) {
@@ -142,7 +140,7 @@ static inline int flow_monitor(struct __sk_buff *skb, u8 direction) {
flow_metrics *aggregate_flow =
(flow_metrics *)bpf_map_lookup_elem(&aggregated_flows, &id);
if (aggregate_flow != NULL) {
update_existing_flow(aggregate_flow, &pkt, dns_errno, len);
update_existing_flow(aggregate_flow, &pkt, len);
} else {
if (trace_messages) {
bpf_printk("failed to update an exising flow\n");
@@ -171,6 +169,48 @@
}
}
}

// Update additional metrics (per-CPU map)
if (pkt.dns_id != 0 || dns_errno != 0) {
// hack on id will be removed with dedup-in-kernel work
id.direction = 0;
id.if_index = 0;
Contributor:
Will the feature work with this hack?

Member Author:
Sure, it's precisely here to make it work :-)
On the user-space side we assume all additional map entries come without dir/interface. This is temporary; my other PR removes that, since eventually we want to entirely get rid of interface/direction in the flow key.

additional_metrics *extra_metrics =
(additional_metrics *)bpf_map_lookup_elem(&additional_flow_metrics, &id);
if (extra_metrics != NULL) {
update_dns(extra_metrics, &pkt, dns_errno);
} else {
additional_metrics new_metrics = {
.dns_record.id = pkt.dns_id,
.dns_record.flags = pkt.dns_flags,
.dns_record.latency = pkt.dns_latency,
.dns_record.errno = dns_errno,
};
long ret =
bpf_map_update_elem(&additional_flow_metrics, &id, &new_metrics, BPF_NOEXIST);
if (ret != 0) {
if (trace_messages && ret != -EEXIST) {
bpf_printk("error adding DNS %d\n", ret);
}
if (ret == -EEXIST) {
// Concurrent write from another CPU; retry
additional_metrics *extra_metrics =
(additional_metrics *)bpf_map_lookup_elem(&additional_flow_metrics, &id);
if (extra_metrics != NULL) {
update_dns(extra_metrics, &pkt, dns_errno);
} else {
if (trace_messages) {
bpf_printk("failed to update DNS\n");
}
increase_counter(HASHMAP_FAIL_UPDATE_DNS);
}
} else {
increase_counter(HASHMAP_FAIL_UPDATE_DNS);
}
}
}
}

return TC_ACT_OK;
}

13 changes: 7 additions & 6 deletions bpf/flows_filter.h
@@ -202,9 +202,10 @@ static __always_inline int do_flow_filter_lookup(flow_id *id, struct filter_key_
}

static __always_inline int flow_filter_setup_lookup_key(flow_id *id, struct filter_key_t *key,
u8 *len, u8 *offset, bool use_src_ip) {
u8 *len, u8 *offset, bool use_src_ip,
u16 eth_protocol) {

if (id->eth_protocol == ETH_P_IP) {
if (eth_protocol == ETH_P_IP) {
*len = sizeof(u32);
*offset = sizeof(ip4in6);
if (use_src_ip) {
@@ -213,7 +214,7 @@ static __always_inline int flow_filter_setup_lookup_key(flow_id *id, struct filt
__builtin_memcpy(key->ip_data, id->dst_ip + *offset, *len);
}
key->prefix_len = 32;
} else if (id->eth_protocol == ETH_P_IPV6) {
} else if (eth_protocol == ETH_P_IPV6) {
*len = IP_MAX_LEN;
*offset = 0;
if (use_src_ip) {
@@ -232,7 +233,7 @@ static __always_inline int flow_filter_setup_lookup_key(flow_id *id, struct filt
* check if the flow match filter rule and return >= 1 if the flow is to be dropped
*/
static __always_inline int is_flow_filtered(flow_id *id, filter_action *action, u16 flags,
u32 drop_reason) {
u32 drop_reason, u16 eth_protocol) {
struct filter_key_t key;
u8 len, offset;
int result = 0;
@@ -241,7 +242,7 @@ static __always_inline int is_flow_filtered(flow_id *id, filter_action *action,
*action = MAX_FILTER_ACTIONS;

// Lets do first CIDR match using srcIP.
result = flow_filter_setup_lookup_key(id, &key, &len, &offset, true);
result = flow_filter_setup_lookup_key(id, &key, &len, &offset, true, eth_protocol);
if (result < 0) {
return result;
}
@@ -253,7 +254,7 @@ static __always_inline int is_flow_filtered(flow_id *id, filter_action *action,
}

// if we can't find a match then Lets do second CIDR match using dstIP.
result = flow_filter_setup_lookup_key(id, &key, &len, &offset, false);
result = flow_filter_setup_lookup_key(id, &key, &len, &offset, false, eth_protocol);
if (result < 0) {
return result;
}
11 changes: 10 additions & 1 deletion bpf/maps_definition.h
@@ -11,13 +11,22 @@ struct {

// Key: the flow identifier. Value: the flow metrics for that identifier.
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, flow_id);
__type(value, flow_metrics);
__uint(max_entries, 1 << 24);
__uint(map_flags, BPF_F_NO_PREALLOC);
} aggregated_flows SEC(".maps");

// Key: the flow identifier. Value: extra metrics for that identifier.
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
__type(key, flow_id);
__type(value, additional_metrics);
__uint(max_entries, 1 << 24);
__uint(map_flags, BPF_F_NO_PREALLOC);
} additional_flow_metrics SEC(".maps");
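
Note the split: `aggregated_flows` becomes a plain HASH because `struct bpf_spin_lock` is only allowed in values of regular HASH/ARRAY maps, not per-CPU ones, while `additional_flow_metrics` can stay per-CPU since it is updated without a lock. A sketch of the shape the lock-bearing value has to take (illustrative fields; the real `flow_metrics` is defined in the repo's headers):

```c
typedef struct {
    struct bpf_spin_lock lock; /* at most one lock, embedded in the value */
    __u64 packets;
    __u64 bytes;
    __u64 start_mono_time_ts;
    __u64 end_mono_time_ts;
} flow_metrics_sketch;
```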

//PerfEvent Array for Packet Payloads
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
35 changes: 11 additions & 24 deletions bpf/network_events_monitoring.h
@@ -35,15 +35,14 @@ static inline int lookup_and_update_existing_flow_network_events(flow_id *id, u8

bpf_probe_read(cookie, md_len, user_cookie);

flow_metrics *aggregate_flow = bpf_map_lookup_elem(&aggregated_flows, id);
if (aggregate_flow != NULL) {
u8 idx = aggregate_flow->network_events_idx;
aggregate_flow->end_mono_time_ts = bpf_ktime_get_ns();
additional_metrics *extra_metrics = bpf_map_lookup_elem(&additional_flow_metrics, id);
if (extra_metrics != NULL) {
u8 idx = extra_metrics->network_events_idx;
// Needed to check length here again to keep JIT verifier happy
if (idx < MAX_NETWORK_EVENTS && md_len <= MAX_EVENT_MD) {
if (!md_already_exists(aggregate_flow->network_events, (u8 *)cookie)) {
__builtin_memcpy(aggregate_flow->network_events[idx], cookie, MAX_EVENT_MD);
aggregate_flow->network_events_idx = (idx + 1) % MAX_NETWORK_EVENTS;
if (!md_already_exists(extra_metrics->network_events, (u8 *)cookie)) {
__builtin_memcpy(extra_metrics->network_events[idx], cookie, MAX_EVENT_MD);
extra_metrics->network_events_idx = (idx + 1) % MAX_NETWORK_EVENTS;
}
return 0;
}
@@ -53,10 +52,9 @@

static inline int trace_network_events(struct sk_buff *skb, struct rh_psample_metadata *md) {
u8 dscp = 0, protocol = 0, md_len = 0;
u16 family = 0, flags = 0;
u16 family = 0, flags = 0, eth_protocol = 0;
u8 *user_cookie = NULL;
long ret = 0;
u64 len = 0;
flow_id id;

__builtin_memset(&id, 0, sizeof(id));
@@ -67,12 +65,8 @@
return -1;
}

id.if_index = BPF_CORE_READ(md, in_ifindex);

len = BPF_CORE_READ(skb, len);

// read L2 info
core_fill_in_l2(skb, &id, &family);
core_fill_in_l2(skb, &eth_protocol, &family);

// read L3 info
core_fill_in_l3(skb, &id, family, &protocol, &dscp);
@@ -99,7 +93,7 @@
}

// check if this packet need to be filtered if filtering feature is enabled
bool skip = check_and_do_flow_filtering(&id, flags, 0);
bool skip = check_and_do_flow_filtering(&id, flags, 0, eth_protocol);
if (skip) {
return 0;
}
@@ -113,19 +107,12 @@
}

// there is no matching flows so lets create new one and add the network event metadata
u64 current_time = bpf_ktime_get_ns();
id.direction = INGRESS;
flow_metrics new_flow = {
.packets = 1,
.bytes = len,
.start_mono_time_ts = current_time,
.end_mono_time_ts = current_time,
.flags = flags,
additional_metrics new_flow = {
.network_events_idx = 0,
};
Contributor:
There are instances where a packet hits this hook but not the TCx one; in that case you will see flows with 0 bytes and 0 packets. Did you check for that?

Member Author:
What sort of instances? My problem with that is that it would duplicate counts if this hook is called before the TC one, or if the TC flow was just flushed, leading to over-estimates. I would rather keep strict responsibility boundaries: TC for counters, other hooks for additional features, and not mix up roles.

Contributor:
Then you need to make sure there are no rows in the UI without counters, as that is misleading.

bpf_probe_read(new_flow.network_events[0], md_len, user_cookie);
new_flow.network_events_idx++;
ret = bpf_map_update_elem(&aggregated_flows, &id, &new_flow, BPF_NOEXIST);
ret = bpf_map_update_elem(&additional_flow_metrics, &id, &new_flow, BPF_NOEXIST);
if (ret != 0) {
if (trace_messages && ret != -EEXIST) {
bpf_printk("error network events creating new flow %d\n", ret);
5 changes: 3 additions & 2 deletions bpf/pca.h
@@ -41,14 +41,15 @@ static inline bool validate_pca_filter(struct __sk_buff *skb, direction dir) {
__builtin_memset(&pkt, 0, sizeof(pkt));
flow_id id;
__builtin_memset(&id, 0, sizeof(id));
u16 eth_protocol = 0;

pkt.id = &id;

void *data_end = (void *)(long)skb->data_end;
void *data = (void *)(long)skb->data;
struct ethhdr *eth = (struct ethhdr *)data;

if (fill_ethhdr(eth, data_end, &pkt) == DISCARD) {
if (fill_ethhdr(eth, data_end, &pkt, &eth_protocol) == DISCARD) {
return false;
}

@@ -57,7 +58,7 @@ static inline bool validate_pca_filter(struct __sk_buff *skb, direction dir) {
id.direction = dir;

// check if this packet need to be filtered if filtering feature is enabled
bool skip = check_and_do_flow_filtering(&id, pkt.flags, 0);
bool skip = check_and_do_flow_filtering(&id, pkt.flags, 0, eth_protocol);
if (skip) {
return false;
}