Skip to content

Commit

Permalink
resource: fallback to sysconf when failed to detect memory size from …
Browse files Browse the repository at this point in the history
…hwloc

On the Fedora 41 AMI, on some aarch64 instances such as m7gd.16xlarge, a Seastar
program such as Scylla fails to start up with the following error message:
```
$ /opt/scylladb/bin/scylla --log-to-stdout 1
WARNING: debug mode. Not for benchmarking or production
hwloc/linux: failed to find sysfs cpu topology directory, aborting linux discovery.
scylla: seastar/src/core/resource.cc:683: resources seastar::resource::allocate(configuration &): Assertion `!remain' failed.
```

It seems that hwloc fails to initialize because
/sys/devices/system/cpu/cpu0/topology/ is not available on the instance.

I debugged src/core/resource.cc to find out why the assertion failure occurred,
and found that alloc_from_node() fails because node->total_memory is 0.
This is likely a consequence of the hwloc initialization failure described above.

I also found that calculate_memory() goes wrong because
machine->total_memory is also 0.

To avoid the error in such environments, we need to fix up the memory size in
both machine->total_memory and node->total_memory.
We can use sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES) for this,
just like we do in the non-hwloc version of allocate().

Fixes scylladb/scylladb#22382
Related scylladb/scylla-pkg#4797
  • Loading branch information
syuu1228 committed Jan 30, 2025
1 parent 4e6ce2d commit 662bc23
Showing 1 changed file with 57 additions and 10 deletions.
67 changes: 57 additions & 10 deletions src/core/resource.cc
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,10 @@ optional<T> read_setting_V1V2_as(std::string cg1_path, std::string cg2_fname) {

namespace resource {

// Returns the product of the values sysconf() reports for _SC_PAGESIZE and
// _SC_PHYS_PAGES. The sysconf() results are used as-is; no error checking is
// performed on them.
static unsigned long get_total_memory_from_sysconf() {
    const auto page_size = ::sysconf(_SC_PAGESIZE);
    // Only the page count is converted to size_t before multiplying.
    const auto phys_pages = size_t(::sysconf(_SC_PHYS_PAGES));
    return page_size * phys_pages;
}

static
size_t
kernel_memory_reservation() {
Expand Down Expand Up @@ -305,13 +309,43 @@ size_t div_roundup(size_t num, size_t denom) {
return (num + denom - 1) / denom;
}

static size_t alloc_from_node(cpu& this_cpu, hwloc_obj_t node, std::unordered_map<hwloc_obj_t, size_t>& used_mem, size_t alloc) {
// Returns the total memory recorded on the given hwloc object.
// The field holding the value depends on HWLOC_API_VERSION: `total_memory`
// directly on the object when the version is >= 0x00020000, otherwise
// `memory.total_memory`.
static hwloc_uint64_t get_machine_memory(hwloc_obj_t machine) {
#if HWLOC_API_VERSION >= 0x00020000
    return machine->total_memory;
#else
    return machine->memory.total_memory;
#endif
}

// Overwrites the total memory recorded on the given hwloc object with
// `available_memory`. Writes `total_memory` when HWLOC_API_VERSION is
// >= 0x00020000, otherwise `memory.total_memory`.
static void set_machine_memory(hwloc_obj_t machine, hwloc_uint64_t available_memory) {
    // Pick the version-dependent field once, then store through it.
#if HWLOC_API_VERSION >= 0x00020000
    auto& total = machine->total_memory;
#else
    auto& total = machine->memory.total_memory;
#endif
    total = available_memory;
}

// Returns the memory size recorded on the given hwloc object.
// Reads `total_memory` when HWLOC_API_VERSION is >= 0x00020000, otherwise
// `memory.local_memory`.
static hwloc_uint64_t get_local_memory(hwloc_obj_t node) {
    // Each branch declares `local_memory` exactly once; a second declaration
    // of the same name in the same scope would be a redefinition error.
#if HWLOC_API_VERSION >= 0x00020000
    // FIXME: support nodes with multiple NUMA nodes, whatever that means
    auto local_memory = node->total_memory;
#else
    auto local_memory = node->memory.local_memory;
#endif
    return local_memory;
}

// Overwrites the memory size recorded on the given hwloc object with
// `local_memory`. Writes `total_memory` when HWLOC_API_VERSION is
// >= 0x00020000, otherwise `memory.local_memory`.
static void set_local_memory(hwloc_obj_t node, hwloc_uint64_t local_memory) {
    // Pick the version-dependent field once, then store through it.
#if HWLOC_API_VERSION >= 0x00020000
    auto& stored = node->total_memory;
#else
    auto& stored = node->memory.local_memory;
#endif
    stored = local_memory;
}

static size_t alloc_from_node(cpu& this_cpu, hwloc_obj_t node, std::unordered_map<hwloc_obj_t, size_t>& used_mem, size_t alloc) {
auto local_memory = get_local_memory(node);
auto taken = std::min(local_memory - used_mem[node], alloc);
if (taken) {
used_mem[node] += taken;
Expand Down Expand Up @@ -574,11 +608,13 @@ resources allocate(configuration& c) {
auto machine_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_MACHINE);
assert(hwloc_get_nbobjs_by_depth(topology, machine_depth) == 1);
auto machine = hwloc_get_obj_by_depth(topology, machine_depth, 0);
#if HWLOC_API_VERSION >= 0x00020000
auto available_memory = machine->total_memory;
#else
auto available_memory = machine->memory.total_memory;
#endif
auto available_memory = get_machine_memory(machine);
if (!available_memory) {
available_memory = get_total_memory_from_sysconf();
set_machine_memory(machine, available_memory);
seastar_logger.warn("hwloc failed to detect machine-wide memory size, using memory size fetched from sysconf");
}

size_t mem = calculate_memory(c, std::min(available_memory,
cgroup::memory_limit()));
// limit memory address to fit in 36-bit, see core/memory.cc:Memory map
Expand All @@ -592,6 +628,7 @@ resources allocate(configuration& c) {
std::vector<std::pair<cpu, size_t>> remains;

auto cpu_sets = distribute_objects(topology, procs);
auto num_nodes = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NUMANODE);

for (auto&& cs : cpu_sets()) {
auto cpu_id = hwloc_bitmap_first(cs);
Expand All @@ -601,6 +638,16 @@ resources allocate(configuration& c) {
if (node == nullptr) {
orphan_pus.push_back(cpu_id);
} else {
if (!get_local_memory(node)) {
// This code does not assume that there are multiple nodes,
// but when this 'if' condition is met, hwloc fails to detect
// the hardware configuration and is expected to operate as
// a single node configuration, so it should work correctly.
assert(num_nodes == 1);
auto local_memory = get_total_memory_from_sysconf();
set_local_memory(node, local_memory);
seastar_logger.warn("hwloc failed to detect NUMA node memory size, using memory size fetched from sysfs");
}
cpu_to_node[cpu_id] = node;
seastar_logger.debug("Assign CPU{} to NUMA{}", cpu_id, node->os_index);
}
Expand Down Expand Up @@ -730,7 +777,7 @@ allocate_io_queues(configuration c, std::vector<cpu> cpus) {
resources allocate(configuration& c) {
resources ret;

auto available_memory = ::sysconf(_SC_PAGESIZE) * size_t(::sysconf(_SC_PHYS_PAGES));
auto available_memory = get_total_memory_from_sysconf();
auto mem = calculate_memory(c, available_memory);
auto procs = c.cpus;
ret.cpus.reserve(procs);
Expand Down

0 comments on commit 662bc23

Please sign in to comment.