Skip to content

Commit

Permalink
zebra: batch stream notification messages
Browse files Browse the repository at this point in the history
There is a CPU issue in ZEBRA when BGP installs and removes
a lot of routes at the same time: vtysh and the shell become
unreachable. This happens in BGP failover scenarios with
two peers, when one of the peers becomes unreachable.

The following message may appear:

> 2024/05/22 16:12:42.628688 ZEBRA: [QH9AB-Y4XMZ][EC 100663314] STARVATION:
> task zebra_route_process_notify_thread_loop (561f0118472f) ran for 5091ms (cpu time 40ms)

This means that the task in charge of notifying the clients about a route
(un)install success/failure takes too much time.

Batch the stream messages as much as we can, to decrease the CPU time.

Signed-off-by: Philippe Guibert <[email protected]>
  • Loading branch information
pguibert6WIND committed May 30, 2024
1 parent 9fc5084 commit 452a014
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 10 deletions.
1 change: 1 addition & 0 deletions zebra/rib.h
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,7 @@ extern int rib_add_gr_run(afi_t afi, vrf_id_t vrf_id, uint8_t proto,
extern void zebra_vty_init(void);
extern void zebra_rnh_job_list_display(struct vty *vty);
extern void zebra_route_notify_job_owner_list_display(struct vty *vty);
extern void zebra_process_notify_client_list_display(struct vty *vty);
extern void
zebra_route_notify_job_owner_list_enqueue(struct route_node *rn,
const struct zebra_dplane_ctx *ctx,
Expand Down
9 changes: 6 additions & 3 deletions zebra/zapi_msg.c
Original file line number Diff line number Diff line change
Expand Up @@ -727,7 +727,7 @@ int route_notify_internal_prefix(const struct prefix *p, int type,
uint16_t instance, vrf_id_t vrf_id,
uint32_t table_id,
enum zapi_route_notify_owner note, afi_t afi,
safi_t safi)
safi_t safi, struct stream_fifo *out_fifo)
{
struct zserv *client;
struct stream *s;
Expand Down Expand Up @@ -771,7 +771,10 @@ int route_notify_internal_prefix(const struct prefix *p, int type,

stream_putw_at(s, 0, stream_get_endp(s));

return zserv_send_message(client, s);
if (out_fifo == NULL)
return zserv_send_message(client, s);
stream_fifo_push(out_fifo, s);
return 1;
}

/*
 * Notify the owner of a route about the prefix held by @rn.
 *
 * Thin wrapper around route_notify_internal_prefix() that forwards the
 * prefix of the route node together with the caller's arguments and returns
 * that function's result.
 */
static int route_notify_internal(const struct route_node *rn, int type,
Expand All @@ -781,7 +784,7 @@ static int route_notify_internal(const struct route_node *rn, int type,
safi_t safi)
{
/*
 * No batching FIFO on this path: out_fifo is a pointer parameter, so pass
 * NULL (not the boolean constant false). With a NULL out_fifo the message
 * is handed straight to zserv_send_message().
 */
return route_notify_internal_prefix(&rn->p, type, instance, vrf_id,
table_id, note, afi, safi, NULL);
}

int zsend_route_notify_owner(const struct route_node *rn,
Expand Down
3 changes: 2 additions & 1 deletion zebra/zapi_msg.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,8 @@ extern int route_notify_internal_prefix(const struct prefix *p, int type,
uint16_t instance, vrf_id_t vrf_id,
uint32_t table_id,
enum zapi_route_notify_owner note,
afi_t afi, safi_t safi);
afi_t afi, safi_t safi,
struct stream_fifo *out_fifo);
#ifdef __cplusplus
}
#endif
84 changes: 78 additions & 6 deletions zebra/zebra_rib.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ DEFINE_MTYPE_STATIC(ZEBRA, RIB_UPDATE_CTX, "Rib update context object");
DEFINE_MTYPE_STATIC(ZEBRA, WQ_WRAPPER, "WQ wrapper");
DEFINE_MTYPE_STATIC(ZEBRA, RNH_JOB_CTX, "Rnh Job context");
DEFINE_MTYPE_STATIC(ZEBRA, ROUTE_NOTIFY_JOB_OWNER_CTX, "Route Notify Job Owner");
DEFINE_MTYPE_STATIC(ZEBRA, PROCESS_NOTIFY_CLIENT_LIST,
"Process Notify Client List");

/*
* Event, list, and mutex for delivery of dataplane results
Expand Down Expand Up @@ -5053,11 +5055,39 @@ void zebra_route_notify_job_owner_list_display(struct vty *vty)
zebra_route_notify_job_owner_list_max_batch);
}

/*
 * Pre-declare the list types; the _item type is embedded in the context
 * structure below.
 */
PREDECL_DLIST(zebra_process_notify_client_list);

/*
 * Per-client batching state used while route notifications are being sent.
 *
 * zebra_route_process_notify_thread_loop() keeps one context per
 * (type, instance) pair for the duration of a single run, queues the
 * notification streams for that client in out_fifo and hands them to
 * zserv_send_batch() in groups.
 */
struct zebra_process_notify_client_ctx {
/* Notifications accounted to out_fifo since the last zserv_send_batch() */
uint16_t num_msgs;
/* Instance of the notified client; copied from the job, used for lookup */
int instance;
/* Type of the notified client; copied from the job, used for lookup */
int type;
/* Client returned by zserv_find_client() when the context was created */
struct zserv *zclient;
/* Streams waiting to be sent to zclient as one batch */
struct stream_fifo out_fifo;
/* Embedded list linkage */
struct zebra_process_notify_client_list_item pnc_entries;
};
DECLARE_DLIST(zebra_process_notify_client_list,
struct zebra_process_notify_client_ctx, pnc_entries);
/* Incremented once per notification queued on a client FIFO */
static uint32_t zebra_process_notify_client_list_num;
/* Incremented once per zserv_send_batch() call (batches, not messages) */
static uint32_t zebra_process_notify_client_list_processed;

/*
 * Print the notify-client batching counters on @vty: the number of
 * notifications queued so far and the number of batches handed to
 * zserv_send_batch().
 */
void zebra_process_notify_client_list_display(struct vty *vty)
{
	const uint32_t queued = zebra_process_notify_client_list_num;
	const uint32_t batches = zebra_process_notify_client_list_processed;

	vty_out(vty, "Process Notify Client list count %u, processed %u\n",
		queued, batches);
}

/*
 * Drain the queued route-notify jobs and send the resulting notifications
 * to their owning clients in batches.
 *
 * For every job whose client is found and has notify_owner set, the
 * notification stream is built by route_notify_internal_prefix() and pushed
 * on a FIFO that belongs to a per-(type, instance) context kept in a list
 * local to this call. A client's FIFO is handed to zserv_send_batch() each
 * time 20 notifications have accumulated; whatever is left is sent, and
 * every context is freed, before the function returns.
 *
 * NOTE(review): part of the body is hidden behind a diff hunk boundary in
 * this view, so how ctxlist gets filled and the loop exit condition are not
 * documented here.
 */
static void zebra_route_process_notify_thread_loop(struct event *event)
{
struct zebra_route_notify_job_owner_list_head ctxlist;
struct zebra_route_notify_job_owner_ctx *ctx;
uint32_t count = 0;
struct zebra_process_notify_client_list_head client_list;
struct zebra_process_notify_client_ctx *client;
struct zserv *zclient;

/* Per-run list of client batching contexts; emptied at the end */
zebra_process_notify_client_list_init(&client_list);

do {
zebra_route_notify_job_owner_list_init(&ctxlist);
Expand All @@ -5075,18 +5105,60 @@ static void zebra_route_process_notify_thread_loop(struct event *event)
break;
while (ctx) {
zebra_route_notify_job_owner_list_processed++;
zclient = zserv_find_client(ctx->type, ctx->instance);
/* Jobs for unknown clients, or clients without notify_owner, are dropped */
if (zclient && zclient->notify_owner) {
/*
 * Look for a context already created for this (type, instance).
 * NOTE(review): the check below presumes the iterator leaves
 * client == NULL when the walk ends without a match - confirm
 * against the typesafe list documentation.
 */
frr_each_safe (zebra_process_notify_client_list,
&client_list, client) {
if (client->type == ctx->type &&
client->instance == ctx->instance)
break;
}
/* First notification for this client in this run: create its context */
if (!client) {
client = XCALLOC(
MTYPE_PROCESS_NOTIFY_CLIENT_LIST,
sizeof(struct zebra_process_notify_client_ctx));
stream_fifo_init(&client->out_fifo);
client->instance = ctx->instance;
client->type = ctx->type;
client->zclient = zclient;
zebra_process_notify_client_list_add_tail(
&client_list, client);
}
/*
 * Queue the notification on the client's FIFO instead of sending it.
 * NOTE(review): the return value is ignored, so num_msgs is bumped
 * even if no stream ended up on the FIFO - verify whether the hidden
 * part of route_notify_internal_prefix() can return early.
 */
route_notify_internal_prefix(&ctx->prefix,
ctx->type,
ctx->instance,
ctx->vrf_id,
ctx->table,
ctx->note,
ctx->afi, ctx->safi,
&client->out_fifo);
client->num_msgs++;
count++;
zebra_process_notify_client_list_num++;
/* Flush as soon as 20 notifications are pending for this client */
if (client->num_msgs == 20) {
zserv_send_batch(zclient,
&client->out_fifo);
zebra_process_notify_client_list_processed++;
client->num_msgs = 0;
}
}
/* The job is released whether or not a notification was queued */
XFREE(MTYPE_ROUTE_NOTIFY_JOB_OWNER_CTX, ctx);
ctx = zebra_route_notify_job_owner_list_pop(&ctxlist);
}
} while (1);

/*
 * NOTE(review): the next line (with the opening brace) looks like the
 * pre-change line of the diff, immediately followed by its brace-less
 * replacement - confirm against the actual file.
 */
if (count > zebra_route_notify_job_owner_list_max_batch) {
/* Remember the largest number of notifications queued in a single run */
if (count > zebra_route_notify_job_owner_list_max_batch)
zebra_route_notify_job_owner_list_max_batch = count;

/* Send what is still pending for each client, then free its context */
while ((client = zebra_process_notify_client_list_pop(&client_list)) !=
NULL) {
if (client->num_msgs) {
zserv_send_batch(client->zclient, &client->out_fifo);
zebra_process_notify_client_list_processed++;
client->num_msgs = 0;
}
stream_fifo_deinit(&client->out_fifo);
XFREE(MTYPE_PROCESS_NOTIFY_CLIENT_LIST, client);
}
}

Expand Down
1 change: 1 addition & 0 deletions zebra/zebra_vty.c
Original file line number Diff line number Diff line change
Expand Up @@ -4093,6 +4093,7 @@ DEFUN(show_rib_info, show_rib_info_cmd, "show rib info",
{
zebra_rnh_job_list_display(vty);
zebra_route_notify_job_owner_list_display(vty);
zebra_process_notify_client_list_display(vty);
return CMD_SUCCESS;
}

Expand Down

0 comments on commit 452a014

Please sign in to comment.