[Memtrie] No longer use mmap for memtrie arena. (#10132)
This solves an issue with the current memtrie implementation, namely that mmap
requires either NO_RESERVE or overcommit to be enabled, by avoiding mmap
altogether.

Contiguous memory isn't actually necessary. Instead, we use chunks of
contiguous memory; each individual allocation fits entirely within a single
chunk anyway.
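
Roughly, the backing store becomes a growable list of equally sized heap chunks instead of one mmap'd region. A minimal sketch (only the `chunks: Vec<Vec<u8>>` shape and `CHUNK_SIZE` are visible in the diff below; the rest is illustrative):

```rust
/// Sketch of the chunked arena backing store. Not the exact definition from
/// this PR; only the `chunks` field and CHUNK_SIZE appear in the diff below.
pub struct ArenaMemory {
    /// Each chunk is an ordinary heap allocation of CHUNK_SIZE bytes.
    /// An individual allocation never straddles a chunk boundary.
    chunks: Vec<Vec<u8>>,
}

const CHUNK_SIZE: usize = 4 * 1024 * 1024; // 4MB, matching the diff below

impl ArenaMemory {
    /// Grows the arena by one chunk (in the diff this happens inside
    /// Allocator::new_chunk rather than on ArenaMemory itself).
    fn add_chunk(&mut self) {
        self.chunks.push(vec![0; CHUNK_SIZE]);
    }
}
```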

Instead of using `usize` to represent a position in the arena, we now use
`ArenaPos`, which is simply the chunk index plus the position within that
chunk. Only some simple tweaks were needed to use `ArenaPos` everywhere;
there are no big changes.
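
For reference, `ArenaPos` behaves like a (chunk index, offset within chunk) pair. Here is a rough sketch reverse-engineered from how the diff below uses it; the field widths, derives, and method bodies are assumptions, and the real definition lives elsewhere in this PR:

```rust
/// Sketch of the new arena position type: a chunk index plus a byte offset
/// within that chunk. Details are assumptions, not the PR's exact code.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
pub struct ArenaPos {
    chunk: u32, // index of the chunk in the arena's chunk list
    pos: u32,   // byte offset within that chunk
}

impl ArenaPos {
    pub fn chunk(&self) -> usize {
        self.chunk as usize
    }

    pub fn pos(&self) -> usize {
        self.pos as usize
    }

    /// Moves the position forward within the same chunk; allocations never
    /// cross a chunk boundary, so this never needs to carry into `chunk`.
    pub fn offset_by(self, offset: usize) -> ArenaPos {
        ArenaPos { chunk: self.chunk, pos: self.pos + u32::try_from(offset).unwrap() }
    }

    /// Sentinel used for empty freelists and an uninitialized `next_alloc_pos`.
    pub const fn invalid() -> ArenaPos {
        ArenaPos { chunk: u32::MAX, pos: u32::MAX }
    }

    pub fn is_invalid(&self) -> bool {
        self.chunk == u32::MAX && self.pos == u32::MAX
    }
}
```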

I've picked 4MB as the chunk size, so that the memory overhead from not being
able to fit a large value at the end of a chunk is at most 8KB per chunk (in
practice I think it's at most key size (2k?) + value size (4k?)). With 16GB of
total size, the worst-case overhead is therefore 32MB; on the other hand, the
overhead from the partially used last chunk is at most 4MB. So that seems like
a reasonable chunk size to pick.
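
To make the arithmetic explicit, here is a standalone back-of-the-envelope snippet; the 16GB and 8KB figures are the ones quoted above, not constants taken from the code (the code only defines `CHUNK_SIZE`):

```rust
// Worst-case space overhead of the chunked arena, using the numbers from the
// paragraph above (hypothetical standalone snippet, not part of the PR).
const CHUNK_SIZE: u64 = 4 * 1024 * 1024; // 4MB chunks
const TOTAL_SIZE: u64 = 16 * 1024 * 1024 * 1024; // 16GB arena
const MAX_WASTE_PER_CHUNK: u64 = 8 * 1024; // up to 8KB unusable at a chunk's end

fn main() {
    let num_chunks = TOTAL_SIZE / CHUNK_SIZE; // 4096 chunks
    let worst_case_waste_mb = num_chunks * MAX_WASTE_PER_CHUNK / (1024 * 1024); // 32MB
    println!("{num_chunks} chunks, {worst_case_waste_mb}MB worst-case end-of-chunk waste");
    // Plus at most one partially used trailing chunk: <= 4MB.
}
```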

@Ekleog I haven't forgotten about your comments on the original PR (#9541);
I'm just tackling the more blocking items first. I'll come back to address
those comments before the whole thing is considered complete.
robin-near authored Nov 14, 2023
1 parent 451d4ef commit ca852b7
Showing 12 changed files with 231 additions and 144 deletions.
1 change: 0 additions & 1 deletion Cargo.lock

1 change: 0 additions & 1 deletion core/store/Cargo.toml
@@ -23,7 +23,6 @@ hex.workspace = true
itoa.workspace = true
itertools.workspace = true
lru.workspace = true
memmap2.workspace = true
num_cpus.workspace = true
once_cell.workspace = true
rand.workspace = true
110 changes: 80 additions & 30 deletions core/store/src/trie/mem/arena/alloc.rs
@@ -1,17 +1,37 @@
use near_o11y::metrics::IntGauge;

use super::metrics::MEM_TRIE_ARENA_ACTIVE_ALLOCS_COUNT;
use super::{ArenaMemory, ArenaSliceMut};
use super::{ArenaMemory, ArenaPos, ArenaSliceMut};
use crate::trie::mem::arena::metrics::{
MEM_TRIE_ARENA_ACTIVE_ALLOCS_BYTES, MEM_TRIE_ARENA_MEMORY_USAGE_BYTES,
};
use crate::trie::mem::flexible_data::encoding::BorshFixedSize;

/// Simple bump allocator with freelists. Allocations are rounded up to its
/// allocation class, so that deallocated memory can be reused by a similarly
/// sized allocation.
/// Simple bump allocator with freelists.
///
/// Allocations are rounded up to their allocation class, so that deallocated
/// memory can be reused by a similarly sized allocation. Each allocation
/// class maintains a separate freelist.
///
/// Allocations are first done by popping from a freelist, if available. If not,
/// we allocate a new region by bumping `next_alloc_pos` forward. Deallocated
/// regions are added to their corresponding freelist.
///
/// As a result, the memory usage of this allocator never decreases. In
/// practice, for in-memory tries, there is very little memory usage creep even
/// when tested over weeks of uptime.
pub struct Allocator {
freelists: [usize; NUM_ALLOCATION_CLASSES],
next_ptr: usize,
/// The head of a linked list of freed allocations; one for each allocation
/// class.
/// The "next" pointer of a freed allocation is stored in the first
/// 8 bytes of the allocated memory itself. An empty freelist is represented
/// using `ArenaPos::invalid()`.
freelists: [ArenaPos; NUM_ALLOCATION_CLASSES],
/// The next position in the arena that a new allocation (which cannot be
/// satisfied by popping one from a freelist) will be allocated at.
/// This position only ever moves forward. Deallocating an allocation
/// does not affect this position; it only adds an entry to a freelist.
next_alloc_pos: ArenaPos,

// Stats. Note that we keep the bytes and count locally too because the
// gauges are process-wide, so stats-keeping directly with those may not be
@@ -26,6 +46,7 @@ pub struct Allocator {
const MAX_ALLOC_SIZE: usize = 16 * 1024;
const ROUND_UP_TO_8_BYTES_UNDER: usize = 256;
const ROUND_UP_TO_64_BYTES_UNDER: usize = 1024;
const CHUNK_SIZE: usize = 4 * 1024 * 1024;

/// Calculates the allocation class (an index from 0 to NUM_ALLOCATION_CLASSES)
/// for the given size that we wish to allocate.
@@ -56,8 +77,8 @@ const NUM_ALLOCATION_CLASSES: usize = allocation_class(MAX_ALLOC_SIZE) + 1;
impl Allocator {
pub fn new(name: String) -> Self {
Self {
freelists: [usize::MAX; NUM_ALLOCATION_CLASSES],
next_ptr: 0,
freelists: [ArenaPos::invalid(); NUM_ALLOCATION_CLASSES],
next_alloc_pos: ArenaPos::invalid(),
active_allocs_bytes: 0,
active_allocs_count: 0,
active_allocs_bytes_gauge: MEM_TRIE_ARENA_ACTIVE_ALLOCS_BYTES
@@ -68,6 +89,15 @@ impl Allocator {
}
}

/// Adds a new chunk to the arena, and updates the next_alloc_pos to the beginning of
/// the new chunk.
fn new_chunk(&mut self, arena: &mut ArenaMemory) {
arena.chunks.push(vec![0; CHUNK_SIZE]);
self.next_alloc_pos =
ArenaPos { chunk: u32::try_from(arena.chunks.len() - 1).unwrap(), pos: 0 };
self.memory_usage_gauge.set(arena.chunks.len() as i64 * CHUNK_SIZE as i64);
}

/// Allocates a slice of the given size in the arena.
pub fn allocate<'a>(&mut self, arena: &'a mut ArenaMemory, size: usize) -> ArenaSliceMut<'a> {
assert!(size <= MAX_ALLOC_SIZE, "Cannot allocate {} bytes", size);
@@ -77,38 +107,32 @@ impl Allocator {
self.active_allocs_count_gauge.set(self.active_allocs_count as i64);
let size_class = allocation_class(size);
let allocation_size = allocation_size(size_class);
if self.freelists[size_class] == usize::MAX {
if arena.mmap.len() < self.next_ptr + allocation_size {
panic!(
"In-memory trie Arena out of memory; configured as {} bytes maximum,
tried to allocate {} when {} bytes already used",
arena.mmap.len(),
allocation_size,
self.next_ptr
);
if self.freelists[size_class].is_invalid() {
if self.next_alloc_pos.is_invalid()
|| arena.chunks[self.next_alloc_pos.chunk()].len()
<= self.next_alloc_pos.pos() + allocation_size
{
self.new_chunk(arena);
}
let ptr = self.next_ptr;
self.next_ptr += allocation_size;
self.memory_usage_gauge.set(self.next_ptr as i64);
let ptr = self.next_alloc_pos;
self.next_alloc_pos = self.next_alloc_pos.offset_by(allocation_size);
arena.slice_mut(ptr, size)
} else {
let pos = self.freelists[size_class];
self.freelists[size_class] = arena.ptr(pos).read_usize();
self.freelists[size_class] = arena.ptr(pos).read_pos();
arena.slice_mut(pos, size)
}
}

/// Deallocates the given slice from the arena; the slice's `pos` and `len`
/// must be the same as an allocation that was returned earlier.
pub fn deallocate(&mut self, arena: &mut ArenaMemory, pos: usize, len: usize) {
pub fn deallocate(&mut self, arena: &mut ArenaMemory, pos: ArenaPos, len: usize) {
self.active_allocs_bytes -= len;
self.active_allocs_count -= 1;
self.active_allocs_bytes_gauge.set(self.active_allocs_bytes as i64);
self.active_allocs_count_gauge.set(self.active_allocs_count as i64);
let size_class = allocation_class(len);
arena
.slice_mut(pos, allocation_size(size_class))
.write_usize_at(0, self.freelists[size_class]);
arena.slice_mut(pos, ArenaPos::SERIALIZED_SIZE).write_pos_at(0, self.freelists[size_class]);
self.freelists[size_class] = pos;
}

@@ -121,16 +145,17 @@ impl Allocator {
#[cfg(test)]
mod test {
use super::MAX_ALLOC_SIZE;
use crate::trie::mem::arena::alloc::CHUNK_SIZE;
use crate::trie::mem::arena::Arena;
use std::mem::size_of;

#[test]
fn test_allocate_deallocate() {
let mut arena = Arena::new(10000, "".to_owned());
// Repeatedly allocate and deallocate; we should not run out of memory.
for i in 0..1000 {
let mut arena = Arena::new("".to_owned());
// Repeatedly allocate and deallocate.
for i in 0..10 {
let mut slices = Vec::new();
for size in 1..=100 {
for size in (1..=16384).step_by(3) {
let mut alloc = arena.alloc(size);
// Check that the allocated length is large enough.
assert!(alloc.len >= size);
@@ -144,12 +169,37 @@ mod test {
slices.sort_by_key(|(pos, _)| *pos);
// Check that the allocated intervals don't overlap.
for i in 1..slices.len() {
assert!(slices[i - 1].0 + slices[i - 1].1 <= slices[i].0);
assert!(slices[i - 1].0.offset_by(slices[i - 1].1) <= slices[i].0);
}
// Check that each allocated interval is valid.
for (pos, len) in &slices {
assert!((pos.chunk()) < arena.memory.chunks.len());
assert!(pos.offset_by(*len).pos() <= CHUNK_SIZE);
}
for (pos, len) in slices {
arena.dealloc(pos, len);
}
}
}

#[test]
fn test_allocation_reuse() {
let mut arena = Arena::new("".to_owned());
// Repeatedly allocate and deallocate. New allocations should reuse
// old deallocated memory.
for _ in 0..10 {
let mut slices = Vec::new();
for _ in 0..2000 {
let alloc = arena.alloc(8192);
slices.push((alloc.pos, alloc.len));
}
for (pos, len) in slices {
arena.dealloc(pos, len);
}
}
assert_eq!(arena.num_active_allocs(), 0);
// 8192 * 2000 <= 16MB, so we should have allocated only 4 chunks.
assert_eq!(arena.memory.chunks.len(), 4);
}

#[test]