diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt-c/Runtime/Runtime.h b/mlir-tensorrt/compiler/include/mlir-tensorrt-c/Runtime/Runtime.h
index 00dc363f1..b6e0d406b 100644
--- a/mlir-tensorrt/compiler/include/mlir-tensorrt-c/Runtime/Runtime.h
+++ b/mlir-tensorrt/compiler/include/mlir-tensorrt-c/Runtime/Runtime.h
@@ -221,10 +221,15 @@ mtrtRuntimeClientGetNumDevices(MTRT_RuntimeClient client, int32_t *numDevices);
 MLIR_CAPI_EXPORTED MTRT_Status mtrtRuntimeClientGetDevice(
     MTRT_RuntimeClient client, int32_t index, MTRT_Device *device);
 
-/// Retrieve the runtiem client that manages the specified memref.
+/// Retrieve the runtime client that manages the specified memref.
 MLIR_CAPI_EXPORTED MTRT_RuntimeClient
 mtrtMemRefGetClient(MTRT_MemRefValue memref);
 
+/// Report the total CPU and GPU memory (in bytes) allocated by the client.
+MLIR_CAPI_EXPORTED MTRT_Status mtrtReportAllocatedMemory(
+    MTRT_RuntimeClient client, int64_t *totalCpuMemory,
+    int64_t *totalGpuMemory);
+
 //===----------------------------------------------------------------------===//
 // Data Transfer
 //===----------------------------------------------------------------------===//
diff --git a/mlir-tensorrt/compiler/lib/CAPI/Runtime/Runtime.cpp b/mlir-tensorrt/compiler/lib/CAPI/Runtime/Runtime.cpp
index 127c865f5..448fc26ff 100644
--- a/mlir-tensorrt/compiler/lib/CAPI/Runtime/Runtime.cpp
+++ b/mlir-tensorrt/compiler/lib/CAPI/Runtime/Runtime.cpp
@@ -640,6 +640,16 @@ MTRT_Status mtrtRuntimeClientGetDevice(MTRT_RuntimeClient client, int32_t index,
   return mtrtStatusGetOk();
 }
 
+MTRT_Status mtrtReportAllocatedMemory(MTRT_RuntimeClient client,
+                                      int64_t *totalCpuMemory,
+                                      int64_t *totalGpuMemory) {
+  RuntimeClient *cppClient = unwrap(client);
+  const auto &allocated = cppClient->getAllocTracker().reportAllocatedMemory();
+  *totalCpuMemory = allocated.first;
+  *totalGpuMemory = allocated.second;
+  return mtrtStatusGetOk();
+}
+
 //===----------------------------------------------------------------------===//
 // MTRT_ScalarValue
 //===----------------------------------------------------------------------===//
diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h
index f3b41520c..60d1388ac 100644
--- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h
+++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h
@@ -773,6 +773,9 @@ class AllocTracker {
   /// Return true if the tracker's map contains `ptr`.
   bool contains(uintptr_t ptr) const;
 
+  /// Report total CPU and GPU memory allocated by the runtime client.
+  std::pair<int64_t, int64_t> reportAllocatedMemory() const;
+
 private:
   llvm::DenseMap<uintptr_t, PointerInfo> map;
 };
diff --git a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp
index 1d8c36ae3..54f0ca84e 100644
--- a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp
+++ b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp
@@ -429,6 +429,24 @@ PointerInfo AllocTracker::lookupOrDefault(uintptr_t ptr) const {
   return map.at(ptr);
 }
 
+std::pair<int64_t, int64_t> AllocTracker::reportAllocatedMemory() const {
+  int64_t totalCpuMemory = 0;
+  int64_t totalGpuMemory = 0;
+
+  for (const auto &entry : map) {
+    const PointerInfo &info = entry.second;
+    if (info.isExternallyManaged())
+      continue;
+    if (info.type == PointerType::host || info.type == PointerType::pinned_host) {
+      totalCpuMemory += info.size;
+    } else if (info.type == PointerType::device || info.type == PointerType::unified) {
+      totalGpuMemory += info.size;
+    }
+  }
+
+  return {totalCpuMemory, totalGpuMemory};
+}
+
 StatusOr<PointerInfo> runtime::allocate(AllocTracker &tracker, PointerType type,
                                         uint64_t size,
                                         std::optional<uint32_t> alignment,
diff --git a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp
index 3a48061b7..202b08727 100644
--- a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp
+++ b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp
@@ -775,7 +775,18 @@ PYBIND11_MODULE(_api, m) {
             THROW_IF_MTRT_ERROR(s);
           },
           py::arg("device_memref"), py::arg("existing_host_memref"),
-          py::arg("stream") = py::none());
+          py::arg("stream") = py::none())
+      .def(
+          "report_allocated_memory",
+          [](PyRuntimeClient &self) {
+            int64_t totalCpuMemory;
+            int64_t totalGpuMemory;
+            MTRT_Status s = mtrtReportAllocatedMemory(self, &totalCpuMemory, &totalGpuMemory);
+            THROW_IF_MTRT_ERROR(s);
+            py::object namedtuple = py::module::import("collections").attr("namedtuple");
+            py::object MemoryUsage = namedtuple("MemoryUsage", "cpu_memory gpu_memory");
+            return MemoryUsage(totalCpuMemory, totalGpuMemory);
+          });
 
   py::class_<PyRuntimeValue>(m, "RuntimeValue", py::module_local())
       .def_property_readonly(MTRT_PYTHON_CAPI_PTR_ATTR,
diff --git a/mlir-tensorrt/test/python/mlir_tensorrt_runtime/test_runtime_api.py b/mlir-tensorrt/test/python/mlir_tensorrt_runtime/test_runtime_api.py
index 699c8a4e7..ed3f35418 100644
--- a/mlir-tensorrt/test/python/mlir_tensorrt_runtime/test_runtime_api.py
+++ b/mlir-tensorrt/test/python/mlir_tensorrt_runtime/test_runtime_api.py
@@ -3,6 +3,7 @@
 import mlir_tensorrt.runtime.api as runtime
 import numpy as np
+import cupy as cp
 
 TESTS = []
 
 
@@ -190,6 +191,57 @@ def test_host_memref():
 # CHECK-NEXT: PointerType.host
 # CHECK-NEXT: mlir_tensorrt.compiler.api.MemRefValue._CAPIPtr
 
+
+@make_test
+def test_report_allocated_memory():
+    client = runtime.RuntimeClient()
+    devices = client.get_devices()
+
+    np_arr = np.ones((1000,), dtype=np.int32)
+    cp_arr = cp.ones((1000,), dtype=np.int32)
+
+    # Allocate GPU memory.
+    memref = client.create_memref(np_arr, device=devices[0])
+    memory_usage = client.report_allocated_memory()
+    print("CPU Memory: ", memory_usage.cpu_memory)
+    print("GPU Memory: ", memory_usage.gpu_memory)
+
+    # Allocate CPU memory.
+    memref = client.create_memref(np_arr)
+    memory_usage = client.report_allocated_memory()
+    print("CPU Memory: ", memory_usage.cpu_memory)
+    print("GPU Memory: ", memory_usage.gpu_memory)
+
+    # No CPU memory is allocated since this creates a view of host memory.
+    memref = client.create_host_memref_view(
+        np_arr.ctypes.data, shape=list(np_arr.shape), dtype=runtime.ScalarTypeCode.i32
+    )
+    memory_usage = client.report_allocated_memory()
+    print("CPU Memory: ", memory_usage.cpu_memory)
+    print("GPU Memory: ", memory_usage.gpu_memory)
+
+    # No GPU memory is allocated since this creates a view of device memory.
+    memref = client.create_device_memref_view(
+        cp_arr.data.ptr,
+        shape=list(cp_arr.shape),
+        dtype=runtime.ScalarTypeCode.i32,
+        device=devices[0],
+    )
+    memory_usage = client.report_allocated_memory()
+    print("CPU Memory: ", memory_usage.cpu_memory)
+    print("GPU Memory: ", memory_usage.gpu_memory)
+
+
+# CHECK-LABEL: Test: test_report_allocated_memory
+# CHECK: CPU Memory: 0
+# CHECK: GPU Memory: 4000
+# CHECK: CPU Memory: 4000
+# CHECK: GPU Memory: 0
+# CHECK: CPU Memory: 0
+# CHECK: GPU Memory: 0
+# CHECK: CPU Memory: 0
+# CHECK: GPU Memory: 0
+
 if __name__ == "__main__":
     for t in TESTS:
         t()
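
For quick reference, a minimal usage sketch of the new `report_allocated_memory` binding introduced by this change (mirroring the test above; it assumes the Python package is importable as `mlir_tensorrt.runtime.api`):

```python
import mlir_tensorrt.runtime.api as runtime
import numpy as np

client = runtime.RuntimeClient()

# Allocate a host memref of 1000 int32 values (4000 bytes).
memref = client.create_memref(np.ones((1000,), dtype=np.int32))

# The binding returns a collections.namedtuple("MemoryUsage", "cpu_memory gpu_memory");
# both totals are reported in bytes and exclude externally managed (view) pointers.
usage = client.report_allocated_memory()
print(usage.cpu_memory, usage.gpu_memory)  # expected: 4000 0
```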