eth-cscs · msimberg · Sep 4, 2024
diff --git a/include/dlaf/memory/memory_chunk.h b/include/dlaf/memory/memory_chunk.h
@@ -26,6 +26,8 @@ namespace dlaf {
 namespace memory {
 
 namespace internal {
+void print_cuda_stats(std::string_view label);
+
 umpire::Allocator& getUmpireHostAllocator();
 void initializeUmpireHostAllocator(std::size_t initial_bytes);
 void finalizeUmpireHostAllocator();

diff --git a/src/memory/memory_chunk.cpp b/src/memory/memory_chunk.cpp
@@ -10,15 +10,60 @@
 
 #include <cstddef>
+#include <exception>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <string_view>
 
+#include <mpi.h>
+
 #include <umpire/ResourceManager.hpp>
 #include <umpire/strategy/QuickPool.hpp>
 #include <umpire/strategy/ThreadSafeAllocator.hpp>
 
+#include <pika/init.hpp>
+
 #include <dlaf/memory/memory_chunk.h>
 
 namespace dlaf {
 namespace memory {
 namespace internal {
+/// Appends a one-line usage summary of the Umpire allocator named @p label to @p os.
+///
+/// The line lists, in order: name, id, strategy name, high watermark, current size,
+/// actual size and allocation count, each taken from the corresponding
+/// umpire::Allocator getter, and is terminated by a newline.
+///
+/// If the lookup of @p label fails with an exception, a single line naming the label and
+/// the error message is written instead, so that this diagnostic helper never aborts
+/// the caller.
+///
+/// @param label name under which the allocator is looked up in the Umpire ResourceManager.
+/// @param os    stream the summary line is written to.
+static void print_alloc_stats(const std::string& label, std::ostream& os) {
+  try {
+    auto alloc = umpire::ResourceManager::getInstance().getAllocator(label);
+    os << "name: " << alloc.getName() << ", ";
+    os << "id: " << alloc.getId() << ", ";
+    os << "strategy: " << alloc.getStrategyName() << ", ";
+    os << "high water: " << alloc.getHighWatermark() << ", ";
+    os << "current size: " << alloc.getCurrentSize() << ", ";
+    os << "actual size: " << alloc.getActualSize() << ", ";
+    // Last field: no trailing separator before the newline.
+    os << "alloc count: " << alloc.getAllocationCount() << '\n';
+  }
+  catch (const std::exception& e) {
+    // NOTE(review): assumes Umpire reports an unknown allocator name through an
+    // exception derived from std::exception - confirm against the Umpire version in use.
+    os << "name: " << label << ", unavailable: " << e.what() << '\n';
+  }
+}
+
+/// Writes a snapshot of GPU and Umpire memory usage to std::cerr.
+///
+/// The report consists of a "### <label>" header line, a line with the rank in
+/// MPI_COMM_WORLD, the current CUDA device and the free/total device memory returned by
+/// cudaMemGetInfo, followed by one line per DEVICE/PINNED Umpire allocator (plain,
+/// "_pool" and "_thread_safe_pool" variants). The whole report is assembled in a local
+/// buffer and written to std::cerr with a single insertion.
+///
+/// In builds without DLAF_WITH_GPU only the header and the rank are reported.
+///
+/// @param label free-form tag printed in the header line to identify the call site.
+void print_cuda_stats(std::string_view label) {
+  // No synchronisation is performed before sampling; the calls below are kept disabled
+  // for reference.
+  // if (pika::is_runtime_initialized()) { pika::wait(); }
+  // cudaDeviceSynchronize();
+
+  // MPI_Comm_rank must not be called before MPI_Init or after MPI_Finalize; fall back
+  // to rank 0 in those cases.
+  int rank = 0;
+  int mpi_initialized = 0;
+  int mpi_finalized = 0;
+  MPI_Initialized(&mpi_initialized);
+  MPI_Finalized(&mpi_finalized);
+  if (mpi_initialized && !mpi_finalized) {
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  }
+
+  std::ostringstream os;
+  os << "### " << label << '\n';
+  os << "rank: " << rank << ", ";
+
+#ifdef DLAF_WITH_GPU
+  // NOTE(review): the CUDA runtime is called directly here; confirm whether
+  // DLAF_WITH_GPU can also be defined for non-CUDA backends, in which case a narrower
+  // guard is needed.
+
+  // Zero-initialised so that a failed query never prints indeterminate values.
+  std::size_t cuda_free = 0;
+  std::size_t cuda_total = 0;
+  int id = 0;
+  const cudaError_t device_status = cudaGetDevice(&id);
+  const cudaError_t meminfo_status = cudaMemGetInfo(&cuda_free, &cuda_total);
+
+  os << "device: " << id << ", ";
+  os << "cuda_free: " << cuda_free << ", ";
+  os << "cuda_total: " << cuda_total;
+  if (device_status != cudaSuccess) {
+    os << ", cudaGetDevice failed: " << cudaGetErrorString(device_status);
+  }
+  if (meminfo_status != cudaSuccess) {
+    os << ", cudaMemGetInfo failed: " << cudaGetErrorString(meminfo_status);
+  }
+  os << '\n';
+
+  print_alloc_stats("DEVICE", os);
+  print_alloc_stats("DEVICE_pool", os);
+  print_alloc_stats("DEVICE_thread_safe_pool", os);
+  print_alloc_stats("PINNED", os);
+  print_alloc_stats("PINNED_pool", os);
+  print_alloc_stats("PINNED_thread_safe_pool", os);
+#else
+  os << "built without GPU support\n";
+#endif
+
+  std::cerr << os.str();
+}
+
 #ifdef DLAF_WITH_GPU
 umpire::Allocator& getUmpireHostAllocator() {
   static auto host_allocator = umpire::ResourceManager::getInstance().getAllocator("PINNED");