From bffb1259d400ac93a83a2ddb17458c86faa5e6e6 Mon Sep 17 00:00:00 2001
From: Ilya Maximets
Date: Thu, 8 Feb 2024 19:11:30 +0100
Subject: [PATCH] northd: lflow-mgr: Allocate DP reference counters on a second use.

Currently, whenever a new logical flow is created, northd allocates a
reference counter for every datapath in the datapath group for that
logical flow.  Those reference counters are necessary in order to not
remove a datapath from a logical flow during incremental processing if
the flow was created from two different objects for the same datapath
and one of them is now removed.

However, that creates a serious scalability problem.  In large scale
setups with tens of thousands of logical flows applied to large
datapath groups, we may have hundreds of millions of these reference
counters allocated, which is many GB of RSS just for that purpose.

For example, in ovn-heater's cluster-density 500-node test, ovn-northd
started to consume up to 8.5 GB of RAM.  In the same test before the
reference counting was introduced, ovn-northd consumed only 2.5 GB.
All those memory allocations also increased CPU usage: re-compute time
went up from 1.5 seconds to 6 seconds in the same test.  In the end,
we have about a 4x increase in both CPU and memory usage.

Running under valgrind --tool=massif shows:

  -------------------------------------------------------------
       total(B)   useful-heap(B)  extra-heap(B)   stacks(B)
  -------------------------------------------------------------
  9,416,438,200   7,401,971,556   2,014,466,644           0

  78.61% (7,401 MB) (heap allocation functions) malloc
  ->45.78% (4,311 MB) xcalloc__ (util.c:124)
  | ->45.68% (4,301 MB) xzalloc__ (util.c:134)
  | | ->45.68% (4,301 MB) xzalloc (util.c:168)
  | | ->40.97% (3,857 MB) dp_refcnt_use (lflow-mgr.c:1348)
  | | | ->40.89% (3,850 MB) lflow_table_add_lflow (lflow-mgr.c:696)
  | | | | ->12.27% (1,155 MB) build_lb_rules_pre_stateful (northd.c:7180)
  | | | | ->12.27% (1,155 MB) build_lb_rules (northd.c:7658)
  | | | | ->12.27% (1,155 MB) build_gw_lrouter_nat_flows_for_lb (northd.c:11054)
  ->28.62% (2,694 MB) xmalloc__ (util.c:140)
  | ->28.62% (2,694 MB) xmalloc (util.c:175)
  | ->06.71% (631 MB) resize (hmap.c:100)
  | | ->06.01% (565 MB) hmap_expand_at (hmap.c:175)
  | | | ->05.24% (492 MB) hmap_insert_at (hmap.h:309)
  | | | | ->05.24% (492 MB) dp_refcnt_use (lflow-mgr.c:1351)

45% of all the memory is allocated for the reference counters
themselves and another 7% is taken by the hash maps that hold them.
More than 20% of the total is plain allocation overhead (extra-heap),
since each of these objects is very small (32 B).  This test allocates
120 M reference counters in total.

However, the vast majority of these reference counters always have a
value of 1, i.e. the datapaths are not used more than once.  Defer
allocation of a reference counter until the datapath is used by the
same logical flow for a second time; the second use can be detected by
checking the current datapath group bitmap.  With this change, the
number of allocated reference counters goes down from 120 M to just
12 M.  Memory consumption is reduced from 8.5 GB to 2.67 GB and the
northd re-compute time drops from 6 to 2.1 seconds.  This is still a
little higher than the resource usage before the introduction of
incremental processing for logical flows, but it is fairly manageable.

The resource usage and memory consumption may be further improved by
reducing the number of cases where northd attempts to create logical
flows for the same datapaths multiple times.
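To make the scheme concrete before the diff, here is a minimal,
self-contained sketch of "allocate the counter on the second use".
The names dp_ref, dp_ref_find and dp_use are hypothetical, invented
for this example only; bitmap_is_set(), bitmap_set1(), xmalloc(),
hmap_insert() and HMAP_FOR_EACH_WITH_HASH are the real helpers from
OVS' lib/ (bitmap.h, hmap.h, util.h):

    /* A counter exists in the map only for a datapath used more than
     * once; a set bitmap bit with no counter means "used exactly
     * once". */
    struct dp_ref {
        struct hmap_node key_node;  /* In 'refcnts', hashed by dp_index. */
        size_t dp_index;
        size_t refcnt;              /* Always >= 2 while in the map. */
    };

    static struct dp_ref *
    dp_ref_find(struct hmap *refcnts, size_t idx)
    {
        struct dp_ref *r;

        HMAP_FOR_EACH_WITH_HASH (r, key_node, idx, refcnts) {
            if (r->dp_index == idx) {
                return r;
            }
        }
        return NULL;
    }

    static void
    dp_use(unsigned long *dpg_bitmap, struct hmap *refcnts, size_t idx)
    {
        if (!bitmap_is_set(dpg_bitmap, idx)) {
            /* First use: the bit is the whole record, no allocation. */
            bitmap_set1(dpg_bitmap, idx);
            return;
        }

        struct dp_ref *r = dp_ref_find(refcnts, idx);
        if (!r) {
            /* Second use: materialize a counter for the use that is
             * already encoded in the bitmap... */
            r = xmalloc(sizeof *r);
            r->dp_index = idx;
            r->refcnt = 1;
            hmap_insert(refcnts, &r->key_node, idx);
        }
        r->refcnt++;  /* ...and count the current one. */
    }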
Note: the cluster-density test in ovn-heater creates new port groups
on every iteration, and ovn-northd doesn't handle this incrementally,
so it always re-computes.  That's why there is no benefit from northd
I-P for CPU usage in this scenario.

Fixes: a623606052ea ("northd: Refactor lflow management into a separate module.")
Signed-off-by: Ilya Maximets
---
 northd/lflow-mgr.c | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/northd/lflow-mgr.c b/northd/lflow-mgr.c
index df62cd6ab4..f70896cce7 100644
--- a/northd/lflow-mgr.c
+++ b/northd/lflow-mgr.c
@@ -47,8 +47,7 @@ static void ovn_lflow_destroy(struct lflow_table *lflow_table,
 static char *ovn_lflow_hint(const struct ovsdb_idl_row *row);
 
 static struct ovn_lflow *do_ovn_lflow_add(
-    struct lflow_table *, const struct ovn_datapath *,
-    const unsigned long *dp_bitmap, size_t dp_bitmap_len, uint32_t hash,
+    struct lflow_table *, size_t dp_bitmap_len, uint32_t hash,
     enum ovn_stage stage, uint16_t priority,
     const char *match, const char *actions,
     const char *io_port, const char *ctrl_meter,
@@ -674,9 +673,9 @@ lflow_table_add_lflow(struct lflow_table *lflow_table,
     hash_lock = lflow_hash_lock(&lflow_table->entries, hash);
 
     struct ovn_lflow *lflow =
-        do_ovn_lflow_add(lflow_table, od, dp_bitmap,
-                         dp_bitmap_len, hash, stage,
-                         priority, match, actions,
+        do_ovn_lflow_add(lflow_table,
+                         od ? ods_size(od->datapaths) : dp_bitmap_len,
+                         hash, stage, priority, match, actions,
                          io_port, ctrl_meter, stage_hint, where);
 
     if (lflow_ref) {
@@ -702,17 +701,24 @@ lflow_table_add_lflow(struct lflow_table *lflow_table,
                 ovs_assert(lrn->dpgrp_bitmap_len == dp_bitmap_len);
                 size_t index;
                 BITMAP_FOR_EACH_1 (index, dp_bitmap_len, dp_bitmap) {
-                    dp_refcnt_use(&lflow->dp_refcnts_map, index);
+                    /* Allocate a reference counter only if already used. */
+                    if (bitmap_is_set(lflow->dpg_bitmap, index)) {
+                        dp_refcnt_use(&lflow->dp_refcnts_map, index);
+                    }
                 }
             } else {
-                dp_refcnt_use(&lflow->dp_refcnts_map, lrn->dp_index);
+                /* Allocate a reference counter only if already used. */
+                if (bitmap_is_set(lflow->dpg_bitmap, lrn->dp_index)) {
+                    dp_refcnt_use(&lflow->dp_refcnts_map, lrn->dp_index);
+                }
             }
         }
         lrn->linked = true;
     }
 
-    lflow_hash_unlock(hash_lock);
+    ovn_dp_group_add_with_reference(lflow, od, dp_bitmap, dp_bitmap_len);
+    lflow_hash_unlock(hash_lock);
 }
 
 void
@@ -946,9 +952,7 @@ ovn_lflow_destroy(struct lflow_table *lflow_table, struct ovn_lflow *lflow)
 }
 
 static struct ovn_lflow *
-do_ovn_lflow_add(struct lflow_table *lflow_table,
-                 const struct ovn_datapath *od,
-                 const unsigned long *dp_bitmap, size_t dp_bitmap_len,
+do_ovn_lflow_add(struct lflow_table *lflow_table, size_t dp_bitmap_len,
                  uint32_t hash, enum ovn_stage stage, uint16_t priority,
                  const char *match, const char *actions,
                  const char *io_port, const char *ctrl_meter,
@@ -959,14 +963,11 @@ do_ovn_lflow_add(struct lflow_table *lflow_table,
     struct ovn_lflow *old_lflow;
     struct ovn_lflow *lflow;
 
-    size_t bitmap_len = od ? ods_size(od->datapaths) : dp_bitmap_len;
-    ovs_assert(bitmap_len);
+    ovs_assert(dp_bitmap_len);
 
     old_lflow = ovn_lflow_find(&lflow_table->entries, stage, priority,
                                match, actions, ctrl_meter, hash);
     if (old_lflow) {
-        ovn_dp_group_add_with_reference(old_lflow, od, dp_bitmap,
-                                        bitmap_len);
         return old_lflow;
     }
 
@@ -974,14 +975,12 @@ do_ovn_lflow_add(struct lflow_table *lflow_table,
     /* While adding new logical flows we're not setting single datapath, but
      * collecting a group.
      * 'od' will be updated later for all flows with only one datapath
      * in a group, so it could be hashed correctly. */
-    ovn_lflow_init(lflow, NULL, bitmap_len, stage, priority,
+    ovn_lflow_init(lflow, NULL, dp_bitmap_len, stage, priority,
                    xstrdup(match), xstrdup(actions),
                    io_port ? xstrdup(io_port) : NULL,
                    nullable_xstrdup(ctrl_meter),
                    ovn_lflow_hint(stage_hint), where);
 
-    ovn_dp_group_add_with_reference(lflow, od, dp_bitmap, bitmap_len);
-
     if (parallelization_state != STATE_USE_PARALLELIZATION) {
         hmap_insert(&lflow_table->entries, &lflow->hmap_node, hash);
     } else {
@@ -1350,8 +1349,10 @@ dp_refcnt_use(struct hmap *dp_refcnts_map, size_t dp_index)
     struct dp_refcnt *dp_refcnt = dp_refcnt_find(dp_refcnts_map, dp_index);
 
     if (!dp_refcnt) {
-        dp_refcnt = xzalloc(sizeof *dp_refcnt);
+        dp_refcnt = xmalloc(sizeof *dp_refcnt);
         dp_refcnt->dp_index = dp_index;
+        /* Allocation is happening on the second (!) use. */
+        dp_refcnt->refcnt = 1;
 
         hmap_insert(dp_refcnts_map, &dp_refcnt->key_node, dp_index);
     }
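A note on the last hunk: the freshly allocated counter starts at 1,
not 0, because the first use was never recorded in the map, only in
the bitmap; the increment that presumably follows outside the shown
context then brings it to 2 for the current, second use.  For
completeness, a release path matching the sketch from the commit
message could look like the following.  This is a hypothetical
counterpart for the example only, not the actual dp_refcnt_release()
from lflow-mgr.c:

    /* Returns true if 'idx' lost its last reference and was dropped
     * from the group bitmap. */
    static bool
    dp_release(unsigned long *dpg_bitmap, struct hmap *refcnts, size_t idx)
    {
        struct dp_ref *r = dp_ref_find(refcnts, idx);

        if (r) {
            if (--r->refcnt == 1) {
                /* Down to a single use again: the bitmap bit alone
                 * is enough, so free the counter. */
                hmap_remove(refcnts, &r->key_node);
                free(r);
            }
            return false;  /* Datapath is still referenced. */
        }

        /* No counter: the implicit count was 1 and this was the last
         * reference. */
        bitmap_set0(dpg_bitmap, idx);
        return true;
    }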