Skip to content

Commit

Permalink
northd: Use datapath groups for southbound load balancers.
Browse files Browse the repository at this point in the history
Some CMSes, like ovn-kubernetes, tend to create load balancers attached
to a big number of logical switches or routers.  For each of these
load balancers northd creates a record in Load_Balancer table of the
Southbound database with all the logical datapaths (switches, routers)
listed in the 'datapaths' column.  All these logical datapaths are
references to datapath bindings.  With a large number of load balancers
like these applied to the same set of logical datapaths, the size of
the Southbound database can grow significantly and these references
can take up to 90% of all the traffic between Sb DB and ovn-controllers.

For example, while creating 3 load balancers (1 for tcp, udp and sctp)
in a 250-node cluster in ovn-heater cluster-density test, the database
update sent to every ovn-controller is about 40 KB in size, out of
which 36 KB are just UUIDs of 250 logical datapaths repeated 3 times.

Introducing a new column 'datapath_group' in the Load_Balancer table,
so we can create a group once and just refer to it from each load
balancer.  This saves a lot of CPU time, memory and network bandwidth.
Re-using already existing Logical_DP_Group table to store these groups.

In 250 node cluster-density test with ovn-heater that creates 30K load
balancers applied to all 250 logical switches the following improvement
was observed:

 - Southbound database size decreased from 310 MB to 118 MB.
 - CPU time on Southbound ovsdb-server process decreased by a
   factor of 7, from ~35 minutes per server to ~5.
 - Memory consumption on Southbound ovsdb-server process decreased
   from 12 GB (peaked at ~14) RSS down to ~1 GB (peaked at ~2).
 - Memory consumption on ovn-controller processes decreased
   by 400 MB on each node, from 1300 MB to 900 MB.  Similar decrease
   observed for ovn-northd as well.

We're adding some extra work to ovn-northd process with this change
to find/create datapath groups.  CPU time in the test above increased
by ~10%, but overall round trip time for changes in OVN to be
propagated to OVN controllers is still noticeably lower due to
improvements in other components like Sb DB.

Implementation details:
 - Groups are not shared between logical flows and load balancers,
   so there could be duplicated groups this way, but that should
   not be a big problem.  Sharing groups will require some code
   re-structuring with unclear benefits.
 - ovn-controller and ovn-sbctl are parsing both the 'datapaths' column
   and the 'datapath_group' to keep them backward compatible.
 - Load_Balancer table is not conditionally monitored by ovn-controller
   right now, so not changing that behavior.  If conditional monitoring
   will be introduced later, same considerations as for logical flows
   should be applied.

Signed-off-by: Ilya Maximets <[email protected]>
  • Loading branch information
igsilya committed Aug 3, 2022
1 parent 3affdfe commit 80ba0b7
Show file tree
Hide file tree
Showing 7 changed files with 271 additions and 106 deletions.
44 changes: 37 additions & 7 deletions controller/lflow.c
Original file line number Diff line number Diff line change
Expand Up @@ -2063,6 +2063,20 @@ add_lb_vip_hairpin_flows(struct ovn_controller_lb *lb,
ofpbuf_uninit(&ofpacts);
}

static void
add_lb_ct_snat_hairpin_for_dp(const struct ovn_controller_lb *lb,
const struct sbrec_datapath_binding *datapath,
struct match *dp_match,
struct ofpbuf *dp_acts,
struct ovn_desired_flow_table *flow_table)
{
match_set_metadata(dp_match, htonll(datapath->tunnel_key));
ofctrl_add_or_append_flow(flow_table, OFTABLE_CT_SNAT_HAIRPIN, 200,
lb->slb->header_.uuid.parts[0],
dp_match, dp_acts, &lb->slb->header_.uuid,
NX_CTLR_NO_METER, NULL);
}

static void
add_lb_ct_snat_hairpin_dp_flows(struct ovn_controller_lb *lb,
uint32_t id,
Expand All @@ -2088,12 +2102,15 @@ add_lb_ct_snat_hairpin_dp_flows(struct ovn_controller_lb *lb,
struct match dp_match = MATCH_CATCHALL_INITIALIZER;

for (size_t i = 0; i < lb->slb->n_datapaths; i++) {
match_set_metadata(&dp_match,
htonll(lb->slb->datapaths[i]->tunnel_key));
ofctrl_add_or_append_flow(flow_table, OFTABLE_CT_SNAT_HAIRPIN, 200,
lb->slb->header_.uuid.parts[0],
&dp_match, &dp_acts, &lb->slb->header_.uuid,
NX_CTLR_NO_METER, NULL);
add_lb_ct_snat_hairpin_for_dp(lb, lb->slb->datapaths[i],
&dp_match, &dp_acts, flow_table);
}
if (lb->slb->datapath_group) {
for (size_t i = 0; i < lb->slb->datapath_group->n_datapaths; i++) {
add_lb_ct_snat_hairpin_for_dp(
lb, lb->slb->datapath_group->datapaths[i],
&dp_match, &dp_acts, flow_table);
}
}

ofpbuf_uninit(&dp_acts);
Expand Down Expand Up @@ -2351,7 +2368,20 @@ consider_lb_hairpin_flows(const struct sbrec_load_balancer *sbrec_lb,
}
}

if (i == sbrec_lb->n_datapaths) {
if (sbrec_lb->n_datapaths && i == sbrec_lb->n_datapaths) {
return;
}

struct sbrec_logical_dp_group *dp_group = sbrec_lb->datapath_group;

for (i = 0; dp_group && i < dp_group->n_datapaths; i++) {
if (get_local_datapath(local_datapaths,
dp_group->datapaths[i]->tunnel_key)) {
break;
}
}

if (dp_group && i == dp_group->n_datapaths) {
return;
}

Expand Down
54 changes: 35 additions & 19 deletions controller/ovn-controller.c
Original file line number Diff line number Diff line change
Expand Up @@ -2210,6 +2210,33 @@ load_balancers_by_dp_find(struct hmap *lbs,
return NULL;
}

/* Records 'lb' in the per-datapath load balancer map 'lbs' under the
 * local datapath matching 'datapath'.  Does nothing when 'datapath' is
 * not local to this chassis.  Creates the load_balancers_by_dp entry
 * on first use and grows its array of load balancer pointers as
 * needed. */
static void
load_balancers_by_dp_add_one(const struct hmap *local_datapaths,
                             const struct sbrec_datapath_binding *datapath,
                             const struct sbrec_load_balancer *lb,
                             struct hmap *lbs)
{
    struct local_datapath *ld = get_local_datapath(local_datapaths,
                                                   datapath->tunnel_key);
    if (!ld) {
        /* Datapath is not local; nothing to record on this chassis. */
        return;
    }

    struct load_balancers_by_dp *by_dp =
        load_balancers_by_dp_find(lbs, ld->datapath);
    if (!by_dp) {
        by_dp = load_balancers_by_dp_create(lbs, ld->datapath);
    }

    /* Grow the array geometrically when full. */
    if (by_dp->n_dp_lbs == by_dp->n_allocated_dp_lbs) {
        by_dp->dp_lbs = x2nrealloc(by_dp->dp_lbs,
                                   &by_dp->n_allocated_dp_lbs,
                                   sizeof *by_dp->dp_lbs);
    }
    by_dp->dp_lbs[by_dp->n_dp_lbs++] = lb;
}

/* Builds and returns a hmap of 'load_balancers_by_dp', one record for each
* local datapath.
*/
Expand All @@ -2223,25 +2250,14 @@ load_balancers_by_dp_init(const struct hmap *local_datapaths,
const struct sbrec_load_balancer *lb;
SBREC_LOAD_BALANCER_TABLE_FOR_EACH (lb, lb_table) {
for (size_t i = 0; i < lb->n_datapaths; i++) {
struct local_datapath *ldp =
get_local_datapath(local_datapaths,
lb->datapaths[i]->tunnel_key);
if (!ldp) {
continue;
}

struct load_balancers_by_dp *lbs_by_dp =
load_balancers_by_dp_find(lbs, ldp->datapath);
if (!lbs_by_dp) {
lbs_by_dp = load_balancers_by_dp_create(lbs, ldp->datapath);
}

if (lbs_by_dp->n_dp_lbs == lbs_by_dp->n_allocated_dp_lbs) {
lbs_by_dp->dp_lbs = x2nrealloc(lbs_by_dp->dp_lbs,
&lbs_by_dp->n_allocated_dp_lbs,
sizeof *lbs_by_dp->dp_lbs);
}
lbs_by_dp->dp_lbs[lbs_by_dp->n_dp_lbs++] = lb;
load_balancers_by_dp_add_one(local_datapaths,
lb->datapaths[i], lb, lbs);
}
for (size_t i = 0; lb->datapath_group
&& i < lb->datapath_group->n_datapaths; i++) {
load_balancers_by_dp_add_one(local_datapaths,
lb->datapath_group->datapaths[i],
lb, lbs);
}
}
return lbs;
Expand Down
168 changes: 115 additions & 53 deletions northd/northd.c
Original file line number Diff line number Diff line change
Expand Up @@ -4159,16 +4159,62 @@ build_lb_port_related_data(struct hmap *datapaths, struct hmap *ports,
build_lb_svcs(input_data, ovnsb_txn, ports, lbs);
}


/* A group of logical datapaths backed by a single Southbound
 * Logical_DP_Group record, so multiple consumers can reference one
 * shared set of datapaths instead of duplicating the list. */
struct ovn_dp_group {
struct hmapx map;                         /* Set of 'struct ovn_datapath *'. */
struct sbrec_logical_dp_group *dp_group;  /* Corresponding Southbound row. */
struct hmap_node node;                    /* Node in a hmap of groups. */
};

/* Searches 'dp_groups' for an existing group whose member set equals
 * 'od'.  'hash' must be the hash value the group was inserted with.
 * Returns the matching group, or NULL when none exists. */
static struct ovn_dp_group *
ovn_dp_group_find(const struct hmap *dp_groups,
                  const struct hmapx *od, uint32_t hash)
{
    struct ovn_dp_group *found = NULL;
    struct ovn_dp_group *candidate;

    HMAP_FOR_EACH_WITH_HASH (candidate, node, hash, dp_groups) {
        if (hmapx_equals(&candidate->map, od)) {
            found = candidate;
            break;
        }
    }
    return found;
}

static struct sbrec_logical_dp_group *
ovn_sb_insert_logical_dp_group(struct ovsdb_idl_txn *ovnsb_txn,
const struct hmapx *od)
{
struct sbrec_logical_dp_group *dp_group;
const struct sbrec_datapath_binding **sb;
const struct hmapx_node *node;
int n = 0;

sb = xmalloc(hmapx_count(od) * sizeof *sb);
HMAPX_FOR_EACH (node, od) {
sb[n++] = ((struct ovn_datapath *) node->data)->sb;
}
dp_group = sbrec_logical_dp_group_insert(ovnsb_txn);
sbrec_logical_dp_group_set_datapaths(
dp_group, (struct sbrec_datapath_binding **) sb, n);
free(sb);

return dp_group;
}

/* Syncs relevant load balancers (applied to logical switches) to the
* Southbound database.
*/
static void
sync_lbs(struct northd_input *input_data, struct ovsdb_idl_txn *ovnsb_txn,
struct hmap *datapaths, struct hmap *lbs)
{
struct hmap dp_groups = HMAP_INITIALIZER(&dp_groups);
struct ovn_northd_lb *lb;

/* Delete any stale SB load balancer rows. */
/* Delete any stale SB load balancer rows and collect existing valid
* datapath groups. */
struct hmapx existing_sb_dp_groups =
HMAPX_INITIALIZER(&existing_sb_dp_groups);
struct hmapx existing_lbs = HMAPX_INITIALIZER(&existing_lbs);
const struct sbrec_load_balancer *sbrec_lb;
SBREC_LOAD_BALANCER_TABLE_FOR_EACH_SAFE (sbrec_lb,
Expand All @@ -4191,11 +4237,46 @@ sync_lbs(struct northd_input *input_data, struct ovsdb_idl_txn *ovnsb_txn,
lb = ovn_northd_lb_find(lbs, &lb_uuid);
if (!lb || !lb->n_nb_ls || !hmapx_add(&existing_lbs, lb)) {
sbrec_load_balancer_delete(sbrec_lb);
} else {
lb->slb = sbrec_lb;
continue;
}

lb->slb = sbrec_lb;

/* Collect the datapath group. */
struct sbrec_logical_dp_group *dp_group = sbrec_lb->datapath_group;

if (!dp_group || !hmapx_add(&existing_sb_dp_groups, dp_group)) {
continue;
}

struct ovn_dp_group *dpg = xzalloc(sizeof *dpg);
size_t i;

hmapx_init(&dpg->map);
for (i = 0; i < dp_group->n_datapaths; i++) {
struct ovn_datapath *datapath_od;

datapath_od = ovn_datapath_from_sbrec(datapaths,
dp_group->datapaths[i]);
if (!datapath_od || ovn_datapath_is_stale(datapath_od)) {
break;
}
hmapx_add(&dpg->map, datapath_od);
}
if (i == dp_group->n_datapaths) {
uint32_t hash = hash_int(hmapx_count(&dpg->map), 0);

if (!ovn_dp_group_find(&dp_groups, &dpg->map, hash)) {
dpg->dp_group = dp_group;
hmap_insert(&dp_groups, &dpg->node, hash);
continue;
}
}
hmapx_destroy(&dpg->map);
free(dpg);
}
hmapx_destroy(&existing_lbs);
hmapx_destroy(&existing_sb_dp_groups);

/* Create SB Load balancer records if not present and sync
* the SB load balancer columns. */
Expand All @@ -4212,13 +4293,6 @@ sync_lbs(struct northd_input *input_data, struct ovsdb_idl_txn *ovnsb_txn,
smap_clone(&options, &lb->nlb->options);
smap_replace(&options, "hairpin_orig_tuple", "true");

struct sbrec_datapath_binding **lb_dps =
xmalloc(lb->n_nb_ls * sizeof *lb_dps);
for (size_t i = 0; i < lb->n_nb_ls; i++) {
lb_dps[i] = CONST_CAST(struct sbrec_datapath_binding *,
lb->nb_ls[i]->sb);
}

if (!lb->slb) {
sbrec_lb = sbrec_load_balancer_insert(ovnsb_txn);
lb->slb = sbrec_lb;
Expand All @@ -4229,15 +4303,44 @@ sync_lbs(struct northd_input *input_data, struct ovsdb_idl_txn *ovnsb_txn,
sbrec_load_balancer_set_external_ids(sbrec_lb, &external_ids);
free(lb_id);
}

/* Find datapath group for this load balancer. */
struct hmapx lb_dps = HMAPX_INITIALIZER(&lb_dps);
struct ovn_dp_group *dpg;
uint32_t hash;

for (size_t i = 0; i < lb->n_nb_ls; i++) {
hmapx_add(&lb_dps, lb->nb_ls[i]);
}

hash = hash_int(hmapx_count(&lb_dps), 0);
dpg = ovn_dp_group_find(&dp_groups, &lb_dps, hash);
if (!dpg) {
dpg = xzalloc(sizeof *dpg);
dpg->dp_group = ovn_sb_insert_logical_dp_group(ovnsb_txn, &lb_dps);
hmapx_clone(&dpg->map, &lb_dps);
hmap_insert(&dp_groups, &dpg->node, hash);
}
hmapx_destroy(&lb_dps);

/* Update columns. */
sbrec_load_balancer_set_name(lb->slb, lb->nlb->name);
sbrec_load_balancer_set_vips(lb->slb, &lb->nlb->vips);
sbrec_load_balancer_set_protocol(lb->slb, lb->nlb->protocol);
sbrec_load_balancer_set_datapaths(lb->slb, lb_dps, lb->n_nb_ls);
sbrec_load_balancer_set_datapath_group(lb->slb, dpg->dp_group);
sbrec_load_balancer_set_options(lb->slb, &options);
/* Clearing 'datapaths' column, since 'dp_group' is in use. */
sbrec_load_balancer_set_datapaths(lb->slb, NULL, 0);
smap_destroy(&options);
free(lb_dps);
}

struct ovn_dp_group *dpg;
HMAP_FOR_EACH_POP (dpg, node, &dp_groups) {
hmapx_destroy(&dpg->map);
free(dpg);
}
hmap_destroy(&dp_groups);

/* Datapath_Binding.load_balancers is not used anymore, it's still in the
* schema for compatibility reasons. Reset it to empty, just in case.
*/
Expand Down Expand Up @@ -14023,47 +14126,6 @@ build_lswitch_and_lrouter_flows(const struct hmap *datapaths,
build_lswitch_flows(datapaths, lflows);
}

/* A group of logical datapaths backed by a single Southbound
 * Logical_DP_Group record, so multiple consumers can reference one
 * shared set of datapaths instead of duplicating the list. */
struct ovn_dp_group {
struct hmapx map;                         /* Set of 'struct ovn_datapath *'. */
struct sbrec_logical_dp_group *dp_group;  /* Corresponding Southbound row. */
struct hmap_node node;                    /* Node in a hmap of groups. */
};

/* Searches 'dp_groups' for an existing group whose member set equals
 * 'od'.  'hash' must be the hash value the group was inserted with.
 * Returns the matching group, or NULL when none exists. */
static struct ovn_dp_group *
ovn_dp_group_find(const struct hmap *dp_groups,
                  const struct hmapx *od, uint32_t hash)
{
    struct ovn_dp_group *found = NULL;
    struct ovn_dp_group *candidate;

    HMAP_FOR_EACH_WITH_HASH (candidate, node, hash, dp_groups) {
        if (hmapx_equals(&candidate->map, od)) {
            found = candidate;
            break;
        }
    }
    return found;
}

static struct sbrec_logical_dp_group *
ovn_sb_insert_logical_dp_group(struct ovsdb_idl_txn *ovnsb_txn,
const struct hmapx *od)
{
struct sbrec_logical_dp_group *dp_group;
const struct sbrec_datapath_binding **sb;
const struct hmapx_node *node;
int n = 0;

sb = xmalloc(hmapx_count(od) * sizeof *sb);
HMAPX_FOR_EACH (node, od) {
sb[n++] = ((struct ovn_datapath *) node->data)->sb;
}
dp_group = sbrec_logical_dp_group_insert(ovnsb_txn);
sbrec_logical_dp_group_set_datapaths(
dp_group, (struct sbrec_datapath_binding **) sb, n);
free(sb);

return dp_group;
}

static void
ovn_sb_set_lflow_logical_dp_group(
struct ovsdb_idl_txn *ovnsb_txn,
Expand Down
8 changes: 6 additions & 2 deletions ovn-sb.ovsschema
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "OVN_Southbound",
"version": "20.23.0",
"cksum": "4045988377 28575",
"version": "20.24.0",
"cksum": "3074645903 28786",
"tables": {
"SB_Global": {
"columns": {
Expand Down Expand Up @@ -505,6 +505,10 @@
"type": {"key": {"type": "uuid",
"refTable": "Datapath_Binding"},
"min": 0, "max": "unlimited"}},
"datapath_group":
{"type": {"key": {"type": "uuid",
"refTable": "Logical_DP_Group"},
"min": 0, "max": 1}},
"options": {
"type": {"key": "string",
"value": "string",
Expand Down
5 changes: 5 additions & 0 deletions ovn-sb.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4589,6 +4589,11 @@ tcp.flags = RST;
Datapaths to which this load balancer applies.
</column>

<column name="datapath_group">
The group of datapaths to which this load balancer applies.  The
load balancer applies to all datapaths in the group.
</column>

<group title="Load_Balancer options">
<column name="options" key="hairpin_snat_ip">
IP to be used as source IP for packets that have been hair-pinned after
Expand Down
Loading

0 comments on commit 80ba0b7

Please sign in to comment.