-
Notifications
You must be signed in to change notification settings - Fork 35
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
NETOBSERV-1995: Use global map, spinlock, split maps #469
Changes from all commits
e3fcbdc
a4085be
3f850d7
119061c
e083fbb
70c3b54
1723055
f8c24aa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -51,22 +51,25 @@ | |||
*/ | ||||
#include "network_events_monitoring.h" | ||||
|
||||
static inline void update_existing_flow(flow_metrics *aggregate_flow, pkt_info *pkt, int dns_errno, | ||||
u64 len) { | ||||
static inline void update_existing_flow(flow_metrics *aggregate_flow, pkt_info *pkt, u64 len) { | ||||
bpf_spin_lock(&aggregate_flow->lock); | ||||
aggregate_flow->packets += 1; | ||||
aggregate_flow->bytes += len; | ||||
aggregate_flow->end_mono_time_ts = pkt->current_ts; | ||||
// it might happen that start_mono_time hasn't been set due to | ||||
// the way percpu hashmap deal with concurrent map entries | ||||
if (aggregate_flow->start_mono_time_ts == 0) { | ||||
aggregate_flow->start_mono_time_ts = pkt->current_ts; | ||||
} | ||||
aggregate_flow->flags |= pkt->flags; | ||||
aggregate_flow->dscp = pkt->dscp; | ||||
aggregate_flow->dns_record.id = pkt->dns_id; | ||||
aggregate_flow->dns_record.flags = pkt->dns_flags; | ||||
aggregate_flow->dns_record.latency = pkt->dns_latency; | ||||
aggregate_flow->dns_record.errno = dns_errno; | ||||
bpf_spin_unlock(&aggregate_flow->lock); | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we no longer needs netobserv-ebpf-agent/bpf/flows.c Line 59 in 119061c
as that was specific to perCPU map There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. correct, good catch There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed |
||||
} | ||||
|
||||
static inline void update_dns(additional_metrics *extra_metrics, pkt_info *pkt, int dns_errno) { | ||||
if (pkt->dns_id != 0) { | ||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You can check dns_errno here and, if it's set, There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. there can be an error and still have DNS data: when the dns request wasn't found, we can't generate latency, but we still provide the other data, and return ENOENT. |
||||
extra_metrics->dns_record.id = pkt->dns_id; | ||||
extra_metrics->dns_record.flags = pkt->dns_flags; | ||||
extra_metrics->dns_record.latency = pkt->dns_latency; | ||||
} | ||||
if (dns_errno != 0) { | ||||
extra_metrics->dns_record.errno = dns_errno; | ||||
} | ||||
} | ||||
|
||||
static inline int flow_monitor(struct __sk_buff *skb, u8 direction) { | ||||
|
@@ -76,6 +79,9 @@ static inline int flow_monitor(struct __sk_buff *skb, u8 direction) { | |||
return TC_ACT_OK; | ||||
} | ||||
do_sampling = 1; | ||||
|
||||
u16 eth_protocol = 0; | ||||
|
||||
pkt_info pkt; | ||||
__builtin_memset(&pkt, 0, sizeof(pkt)); | ||||
|
||||
|
@@ -90,7 +96,7 @@ static inline int flow_monitor(struct __sk_buff *skb, u8 direction) { | |||
struct ethhdr *eth = (struct ethhdr *)data; | ||||
u64 len = skb->len; | ||||
|
||||
if (fill_ethhdr(eth, data_end, &pkt) == DISCARD) { | ||||
if (fill_ethhdr(eth, data_end, &pkt, ð_protocol) == DISCARD) { | ||||
return TC_ACT_OK; | ||||
} | ||||
|
||||
|
@@ -99,7 +105,7 @@ static inline int flow_monitor(struct __sk_buff *skb, u8 direction) { | |||
id.direction = direction; | ||||
|
||||
// check if this packet need to be filtered if filtering feature is enabled | ||||
bool skip = check_and_do_flow_filtering(&id, pkt.flags, 0); | ||||
bool skip = check_and_do_flow_filtering(&id, pkt.flags, 0, eth_protocol); | ||||
if (skip) { | ||||
return TC_ACT_OK; | ||||
} | ||||
|
@@ -108,30 +114,22 @@ static inline int flow_monitor(struct __sk_buff *skb, u8 direction) { | |||
if (enable_dns_tracking) { | ||||
dns_errno = track_dns_packet(skb, &pkt); | ||||
} | ||||
// TODO: we need to add spinlock here when we deprecate versions prior to 5.1, or provide | ||||
// a spinlocked alternative version and use it selectively https://lwn.net/Articles/779120/ | ||||
flow_metrics *aggregate_flow = (flow_metrics *)bpf_map_lookup_elem(&aggregated_flows, &id); | ||||
if (aggregate_flow != NULL) { | ||||
update_existing_flow(aggregate_flow, &pkt, dns_errno, len); | ||||
update_existing_flow(aggregate_flow, &pkt, len); | ||||
} else { | ||||
// Key does not exist in the map, and will need to create a new entry. | ||||
u64 rtt = 0; | ||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @msherif1234 I don't understand the rationale for this code. We're setting default RTT for TCP flows, how could that be accurate? I don't see how this wouldn't produce fake data. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is the min value used when the hook isn't fired. IIRC it was a define in the kernel, and I believe we document that as the default min RTT; when I recall more I will update here |
||||
if (enable_rtt && id.transport_protocol == IPPROTO_TCP) { | ||||
rtt = MIN_RTT; | ||||
} | ||||
flow_metrics new_flow = { | ||||
.packets = 1, | ||||
.bytes = len, | ||||
.eth_protocol = eth_protocol, | ||||
.start_mono_time_ts = pkt.current_ts, | ||||
.end_mono_time_ts = pkt.current_ts, | ||||
.flags = pkt.flags, | ||||
.dscp = pkt.dscp, | ||||
.dns_record.id = pkt.dns_id, | ||||
.dns_record.flags = pkt.dns_flags, | ||||
.dns_record.latency = pkt.dns_latency, | ||||
.dns_record.errno = dns_errno, | ||||
.flow_rtt = rtt, | ||||
}; | ||||
__builtin_memcpy(new_flow.dst_mac, eth->h_dest, ETH_ALEN); | ||||
__builtin_memcpy(new_flow.src_mac, eth->h_source, ETH_ALEN); | ||||
|
||||
long ret = bpf_map_update_elem(&aggregated_flows, &id, &new_flow, BPF_NOEXIST); | ||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We will probably need to spin lock/unlock here as well There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The lock is on the value object, not the key, and it's a new object in that case, so I don't think it makes sense to lock There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you have concurrent new flows on different CPUs, one should create and the other will update? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That's what the There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. With EEXIST, lookup and update we need to take the lock as well; what about There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||||
if (ret != 0) { | ||||
|
@@ -142,7 +140,7 @@ static inline int flow_monitor(struct __sk_buff *skb, u8 direction) { | |||
flow_metrics *aggregate_flow = | ||||
(flow_metrics *)bpf_map_lookup_elem(&aggregated_flows, &id); | ||||
if (aggregate_flow != NULL) { | ||||
update_existing_flow(aggregate_flow, &pkt, dns_errno, len); | ||||
update_existing_flow(aggregate_flow, &pkt, len); | ||||
} else { | ||||
if (trace_messages) { | ||||
bpf_printk("failed to update an exising flow\n"); | ||||
|
@@ -171,6 +169,48 @@ static inline int flow_monitor(struct __sk_buff *skb, u8 direction) { | |||
} | ||||
} | ||||
} | ||||
|
||||
// Update additional metrics (per-CPU map) | ||||
if (pkt.dns_id != 0 || dns_errno != 0) { | ||||
// hack on id will be removed with dedup-in-kernel work | ||||
id.direction = 0; | ||||
id.if_index = 0; | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. will the feature work with this hack ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sure, it's precisely here to make it work :-) |
||||
additional_metrics *extra_metrics = | ||||
(additional_metrics *)bpf_map_lookup_elem(&additional_flow_metrics, &id); | ||||
if (extra_metrics != NULL) { | ||||
update_dns(extra_metrics, &pkt, dns_errno); | ||||
} else { | ||||
additional_metrics new_metrics = { | ||||
.dns_record.id = pkt.dns_id, | ||||
.dns_record.flags = pkt.dns_flags, | ||||
.dns_record.latency = pkt.dns_latency, | ||||
.dns_record.errno = dns_errno, | ||||
}; | ||||
long ret = | ||||
bpf_map_update_elem(&additional_flow_metrics, &id, &new_metrics, BPF_NOEXIST); | ||||
if (ret != 0) { | ||||
if (trace_messages && ret != -EEXIST) { | ||||
bpf_printk("error adding DNS %d\n", ret); | ||||
} | ||||
if (ret == -EEXIST) { | ||||
// Concurrent write from another CPU; retry | ||||
additional_metrics *extra_metrics = | ||||
(additional_metrics *)bpf_map_lookup_elem(&additional_flow_metrics, &id); | ||||
if (extra_metrics != NULL) { | ||||
update_dns(extra_metrics, &pkt, dns_errno); | ||||
} else { | ||||
if (trace_messages) { | ||||
bpf_printk("failed to update DNS\n"); | ||||
} | ||||
increase_counter(HASHMAP_FAIL_UPDATE_DNS); | ||||
} | ||||
} else { | ||||
increase_counter(HASHMAP_FAIL_UPDATE_DNS); | ||||
} | ||||
} | ||||
} | ||||
} | ||||
|
||||
return TC_ACT_OK; | ||||
} | ||||
|
||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,15 +35,14 @@ static inline int lookup_and_update_existing_flow_network_events(flow_id *id, u8 | |
|
||
bpf_probe_read(cookie, md_len, user_cookie); | ||
|
||
flow_metrics *aggregate_flow = bpf_map_lookup_elem(&aggregated_flows, id); | ||
if (aggregate_flow != NULL) { | ||
u8 idx = aggregate_flow->network_events_idx; | ||
aggregate_flow->end_mono_time_ts = bpf_ktime_get_ns(); | ||
additional_metrics *extra_metrics = bpf_map_lookup_elem(&additional_flow_metrics, id); | ||
if (extra_metrics != NULL) { | ||
u8 idx = extra_metrics->network_events_idx; | ||
// Needed to check length here again to keep JIT verifier happy | ||
if (idx < MAX_NETWORK_EVENTS && md_len <= MAX_EVENT_MD) { | ||
if (!md_already_exists(aggregate_flow->network_events, (u8 *)cookie)) { | ||
__builtin_memcpy(aggregate_flow->network_events[idx], cookie, MAX_EVENT_MD); | ||
aggregate_flow->network_events_idx = (idx + 1) % MAX_NETWORK_EVENTS; | ||
if (!md_already_exists(extra_metrics->network_events, (u8 *)cookie)) { | ||
__builtin_memcpy(extra_metrics->network_events[idx], cookie, MAX_EVENT_MD); | ||
extra_metrics->network_events_idx = (idx + 1) % MAX_NETWORK_EVENTS; | ||
} | ||
return 0; | ||
} | ||
|
@@ -53,10 +52,9 @@ static inline int lookup_and_update_existing_flow_network_events(flow_id *id, u8 | |
|
||
static inline int trace_network_events(struct sk_buff *skb, struct rh_psample_metadata *md) { | ||
u8 dscp = 0, protocol = 0, md_len = 0; | ||
u16 family = 0, flags = 0; | ||
u16 family = 0, flags = 0, eth_protocol = 0; | ||
u8 *user_cookie = NULL; | ||
long ret = 0; | ||
u64 len = 0; | ||
flow_id id; | ||
|
||
__builtin_memset(&id, 0, sizeof(id)); | ||
|
@@ -67,12 +65,8 @@ static inline int trace_network_events(struct sk_buff *skb, struct rh_psample_me | |
return -1; | ||
} | ||
|
||
id.if_index = BPF_CORE_READ(md, in_ifindex); | ||
|
||
len = BPF_CORE_READ(skb, len); | ||
|
||
// read L2 info | ||
core_fill_in_l2(skb, &id, &family); | ||
core_fill_in_l2(skb, ð_protocol, &family); | ||
|
||
// read L3 info | ||
core_fill_in_l3(skb, &id, family, &protocol, &dscp); | ||
|
@@ -99,7 +93,7 @@ static inline int trace_network_events(struct sk_buff *skb, struct rh_psample_me | |
} | ||
|
||
// check if this packet need to be filtered if filtering feature is enabled | ||
bool skip = check_and_do_flow_filtering(&id, flags, 0); | ||
bool skip = check_and_do_flow_filtering(&id, flags, 0, eth_protocol); | ||
if (skip) { | ||
return 0; | ||
} | ||
|
@@ -113,19 +107,12 @@ static inline int trace_network_events(struct sk_buff *skb, struct rh_psample_me | |
} | ||
|
||
// there is no matching flows so lets create new one and add the network event metadata | ||
u64 current_time = bpf_ktime_get_ns(); | ||
id.direction = INGRESS; | ||
flow_metrics new_flow = { | ||
.packets = 1, | ||
.bytes = len, | ||
.start_mono_time_ts = current_time, | ||
.end_mono_time_ts = current_time, | ||
.flags = flags, | ||
additional_metrics new_flow = { | ||
.network_events_idx = 0, | ||
}; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There are instances where a packet hits this hook but not the TCX hook; in that case you will see flows with 0 bytes and packets — did you check for that? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What sort of instances? My problem with that is it would duplicate counts, if this hook is called before the tc - or if the TC flow was just flushed - hence leading to over-estimate. I would rather keep strict responsibility boundaries: TC for counters, other hooks for additional features, and not mix up roles. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Then you need to make sure you have no rows in the UI with no counters, as that is misleading |
||
bpf_probe_read(new_flow.network_events[0], md_len, user_cookie); | ||
new_flow.network_events_idx++; | ||
ret = bpf_map_update_elem(&aggregated_flows, &id, &new_flow, BPF_NOEXIST); | ||
ret = bpf_map_update_elem(&additional_flow_metrics, &id, &new_flow, BPF_NOEXIST); | ||
if (ret != 0) { | ||
if (trace_messages && ret != -EEXIST) { | ||
bpf_printk("error network events creating new flow %d\n", ret); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have this rule as part of the lint target; we can make it part of the fmt target, but in that case we need to remove it from the lint so we won't run it twice when building?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think that's for two different things: the format rule allows to auto-format as part of the local build process on your machine, whereas the lint rule is still useful on the CI to make sure we didn't commit & push unformatted code.