Skip to content

Commit

Permalink
Merge branch 'fixes-for-lpm-trie'
Browse files Browse the repository at this point in the history
Hou Tao says:

====================
This patch set fixes several issues for LPM trie. These issues were
found during adding new test cases or were reported by syzbot.

The patch set is structured as follows:

Patch #1~#2 are clean-ups for lpm_trie_update_elem().
Patch #3 handles BPF_EXIST and BPF_NOEXIST correctly for LPM trie.
Patch #4 fixes the accounting of n_entries when doing in-place update.
Patch #5 fixes the exact match condition in trie_get_next_key() and it
may skip keys when the passed key is not found in the map.
Patch torvalds#6~torvalds#7 switch from kmalloc() to bpf memory allocator for LPM trie
to fix several lock order warnings reported by syzbot. It also enables
raw_spinlock_t for LPM trie again. After these changes, the LPM trie will
be closer to being usable in any context (though the reentrance check of
trie->lock is still missing, but it is on my todo list).
Patch torvalds#8: move test_lpm_map to map_tests to make it run regularly.
Patch torvalds#9: add test cases for the issues fixed by patch #3~#5.

Please see individual patches for more details. Comments are always
welcome.

Change Log:
v3:
  * patch #2: remove the unnecessary NULL-init for im_node
  * patch torvalds#6: alloc the leaf node before disabling IRQ to low
    the possibility of -ENOMEM when leaf_size is large; Free
    these nodes outside the trie lock (Suggested by Alexei)
  * collect review and ack tags (Thanks for Toke & Daniel)

v2: https://lore.kernel.org/bpf/[email protected]/
  * collect review tags (Thanks for Toke)
  * drop "Add bpf_mem_cache_is_mergeable() helper" patch
  * patch #3~#4: add fix tag
  * patch #4: rename the helper to trie_check_add_elem() and increase
    n_entries in it.
  * patch torvalds#6: use one bpf mem allocator and update commit message to
    clarify that using bpf mem allocator is more appropriate.
  * patch torvalds#7: update commit message to add the possible max running time
    for update operation.
  * patch torvalds#9: update commit message to specify the purpose of these test
    cases.

v1: https://lore.kernel.org/bpf/[email protected]/
====================

Link: https://lore.kernel.org/all/[email protected]/
Signed-off-by: Alexei Starovoitov <[email protected]>
  • Loading branch information
Alexei Starovoitov committed Dec 6, 2024
2 parents e2cf913 + 04d4ce9 commit 509df67
Show file tree
Hide file tree
Showing 4 changed files with 484 additions and 57 deletions.
133 changes: 85 additions & 48 deletions kernel/bpf/lpm_trie.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@
#include <net/ipv6.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/bpf_mem_alloc.h>

/* Intermediate node */
#define LPM_TREE_NODE_FLAG_IM BIT(0)

struct lpm_trie_node;

struct lpm_trie_node {
struct rcu_head rcu;
struct lpm_trie_node __rcu *child[2];
u32 prefixlen;
u32 flags;
Expand All @@ -32,10 +32,11 @@ struct lpm_trie_node {
struct lpm_trie {
struct bpf_map map;
struct lpm_trie_node __rcu *root;
struct bpf_mem_alloc ma;
size_t n_entries;
size_t max_prefixlen;
size_t data_size;
spinlock_t lock;
raw_spinlock_t lock;
};

/* This trie implements a longest prefix match algorithm that can be used to
Expand Down Expand Up @@ -287,17 +288,18 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
return found->data + trie->data_size;
}

static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
const void *value)
static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
const void *value,
bool disable_migration)
{
struct lpm_trie_node *node;
size_t size = sizeof(struct lpm_trie_node) + trie->data_size;

if (value)
size += trie->map.value_size;
if (disable_migration)
migrate_disable();
node = bpf_mem_cache_alloc(&trie->ma);
if (disable_migration)
migrate_enable();

node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
trie->map.numa_node);
if (!node)
return NULL;

Expand All @@ -310,12 +312,22 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
return node;
}

static int trie_check_add_elem(struct lpm_trie *trie, u64 flags)
{
if (flags == BPF_EXIST)
return -ENOENT;
if (trie->n_entries == trie->map.max_entries)
return -ENOSPC;
trie->n_entries++;
return 0;
}

/* Called from syscall or from eBPF program */
static long trie_update_elem(struct bpf_map *map,
void *_key, void *value, u64 flags)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
struct lpm_trie_node *node, *im_node, *new_node;
struct lpm_trie_node *free_node = NULL;
struct lpm_trie_node __rcu **slot;
struct bpf_lpm_trie_key_u8 *key = _key;
Expand All @@ -330,22 +342,14 @@ static long trie_update_elem(struct bpf_map *map,
if (key->prefixlen > trie->max_prefixlen)
return -EINVAL;

spin_lock_irqsave(&trie->lock, irq_flags);

/* Allocate and fill a new node */

if (trie->n_entries == trie->map.max_entries) {
ret = -ENOSPC;
goto out;
}

new_node = lpm_trie_node_alloc(trie, value);
if (!new_node) {
ret = -ENOMEM;
goto out;
}
/* Allocate and fill a new node. Need to disable migration before
* invoking bpf_mem_cache_alloc().
*/
new_node = lpm_trie_node_alloc(trie, value, true);
if (!new_node)
return -ENOMEM;

trie->n_entries++;
raw_spin_lock_irqsave(&trie->lock, irq_flags);

new_node->prefixlen = key->prefixlen;
RCU_INIT_POINTER(new_node->child[0], NULL);
Expand All @@ -364,8 +368,7 @@ static long trie_update_elem(struct bpf_map *map,
matchlen = longest_prefix_match(trie, node, key);

if (node->prefixlen != matchlen ||
node->prefixlen == key->prefixlen ||
node->prefixlen == trie->max_prefixlen)
node->prefixlen == key->prefixlen)
break;

next_bit = extract_bit(key->data, node->prefixlen);
Expand All @@ -376,6 +379,10 @@ static long trie_update_elem(struct bpf_map *map,
* simply assign the @new_node to that slot and be done.
*/
if (!node) {
ret = trie_check_add_elem(trie, flags);
if (ret)
goto out;

rcu_assign_pointer(*slot, new_node);
goto out;
}
Expand All @@ -384,18 +391,30 @@ static long trie_update_elem(struct bpf_map *map,
* which already has the correct data array set.
*/
if (node->prefixlen == matchlen) {
if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) {
if (flags == BPF_NOEXIST) {
ret = -EEXIST;
goto out;
}
} else {
ret = trie_check_add_elem(trie, flags);
if (ret)
goto out;
}

new_node->child[0] = node->child[0];
new_node->child[1] = node->child[1];

if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
trie->n_entries--;

rcu_assign_pointer(*slot, new_node);
free_node = node;

goto out;
}

ret = trie_check_add_elem(trie, flags);
if (ret)
goto out;

/* If the new node matches the prefix completely, it must be inserted
* as an ancestor. Simply insert it between @node and *@slot.
*/
Expand All @@ -406,8 +425,10 @@ static long trie_update_elem(struct bpf_map *map,
goto out;
}

im_node = lpm_trie_node_alloc(trie, NULL);
/* migration is disabled within the locked scope */
im_node = lpm_trie_node_alloc(trie, NULL, false);
if (!im_node) {
trie->n_entries--;
ret = -ENOMEM;
goto out;
}
Expand All @@ -429,16 +450,13 @@ static long trie_update_elem(struct bpf_map *map,
rcu_assign_pointer(*slot, im_node);

out:
if (ret) {
if (new_node)
trie->n_entries--;
raw_spin_unlock_irqrestore(&trie->lock, irq_flags);

kfree(new_node);
kfree(im_node);
}

spin_unlock_irqrestore(&trie->lock, irq_flags);
kfree_rcu(free_node, rcu);
migrate_disable();
if (ret)
bpf_mem_cache_free(&trie->ma, new_node);
bpf_mem_cache_free_rcu(&trie->ma, free_node);
migrate_enable();

return ret;
}
Expand All @@ -459,7 +477,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
if (key->prefixlen > trie->max_prefixlen)
return -EINVAL;

spin_lock_irqsave(&trie->lock, irq_flags);
raw_spin_lock_irqsave(&trie->lock, irq_flags);

/* Walk the tree looking for an exact key/length match and keeping
* track of the path we traverse. We will need to know the node
Expand Down Expand Up @@ -535,9 +553,12 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
free_node = node;

out:
spin_unlock_irqrestore(&trie->lock, irq_flags);
kfree_rcu(free_parent, rcu);
kfree_rcu(free_node, rcu);
raw_spin_unlock_irqrestore(&trie->lock, irq_flags);

migrate_disable();
bpf_mem_cache_free_rcu(&trie->ma, free_parent);
bpf_mem_cache_free_rcu(&trie->ma, free_node);
migrate_enable();

return ret;
}
Expand All @@ -559,6 +580,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
struct lpm_trie *trie;
size_t leaf_size;
int err;

/* check sanity of attributes */
if (attr->max_entries == 0 ||
Expand All @@ -581,9 +604,19 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
offsetof(struct bpf_lpm_trie_key_u8, data);
trie->max_prefixlen = trie->data_size * 8;

spin_lock_init(&trie->lock);
raw_spin_lock_init(&trie->lock);

/* Allocate intermediate and leaf nodes from the same allocator */
leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
trie->map.value_size;
err = bpf_mem_alloc_init(&trie->ma, leaf_size, false);
if (err)
goto free_out;
return &trie->map;

free_out:
bpf_map_area_free(trie);
return ERR_PTR(err);
}

static void trie_free(struct bpf_map *map)
Expand Down Expand Up @@ -615,13 +648,17 @@ static void trie_free(struct bpf_map *map)
continue;
}

kfree(node);
/* No bpf program may access the map, so freeing the
* node without waiting for the extra RCU GP.
*/
bpf_mem_cache_raw_free(node);
RCU_INIT_POINTER(*slot, NULL);
break;
}
}

out:
bpf_mem_alloc_destroy(&trie->ma);
bpf_map_area_free(trie);
}

Expand All @@ -633,7 +670,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
struct lpm_trie_node **node_stack = NULL;
int err = 0, stack_ptr = -1;
unsigned int next_bit;
size_t matchlen;
size_t matchlen = 0;

/* The get_next_key follows postorder. For the 4 node example in
* the top of this file, the trie_get_next_key() returns the following
Expand Down Expand Up @@ -672,7 +709,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
next_bit = extract_bit(key->data, node->prefixlen);
node = rcu_dereference(node->child[next_bit]);
}
if (!node || node->prefixlen != key->prefixlen ||
if (!node || node->prefixlen != matchlen ||
(node->flags & LPM_TREE_NODE_FLAG_IM))
goto find_leftmost;

Expand Down
1 change: 0 additions & 1 deletion tools/testing/selftests/bpf/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ bpf-syscall*
test_verifier
test_maps
test_lru_map
test_lpm_map
test_tag
FEATURE-DUMP.libbpf
FEATURE-DUMP.selftests
Expand Down
2 changes: 1 addition & 1 deletion tools/testing/selftests/bpf/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ CLANG_CPUV4 := 1
endif

# Order correspond to 'make run_tests' order
TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_progs \
test_sockmap \
test_tcpnotify_user test_sysctl \
test_progs-no_alu32
Expand Down
Loading

0 comments on commit 509df67

Please sign in to comment.