Reduce GPU memory usage when using reset(q) in user programs (#942)

NVIDIA · Nov 17, 2023 · f5d107c · f5d107c
1 parent b6a7ce2
commit f5d107c
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 9 deletions.
diff --git a/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu b/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu
@@ -113,9 +113,6 @@ protected:
   /// @brief The size of the extra workspace
   size_t extraWorkspaceSizeInBytes = 0;
 
-  /// @brief Count the number of resets.
-  int nResets = 0;
-
   custatevecComputeType_t cuStateVecComputeType = CUSTATEVEC_COMPUTE_64F;
   cudaDataType_t cuStateVecCudaDataType = CUDA_C_64F;
   std::random_device randomDevice;
@@ -163,10 +160,8 @@ protected:
     if (extraWorkspaceSizeInBytes > 0)
       HANDLE_CUDA_ERROR(cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes));
 
-    // When we perform a deallocation we apply a
-    // qubit reset, and the state does not shrink (trying to minimize device
-    // memory manipulations), but nQubitsAllocated decrements.
-    auto localNQubitsAllocated = nQubitsAllocated + nResets;
+    auto localNQubitsAllocated =
+        stateDimension > 0 ? std::log2(stateDimension) : 0;
 
     // apply gate
     HANDLE_ERROR(custatevecApplyMatrix(
@@ -266,7 +261,6 @@ protected:
       HANDLE_CUDA_ERROR(cudaFree(extraWorkspace));
     deviceStateVector = nullptr;
     extraWorkspaceSizeInBytes = 0;
-    nResets = 0;
   }
 
   /// @brief Apply the given GateApplicationTask
@@ -350,7 +344,6 @@ public:
   /// @param qubitIdx
   void resetQubit(const std::size_t qubitIdx) override {
     flushGateQueue();
-    nResets++;
     const int basisBits[] = {(int)qubitIdx};
     int parity;
     double rand = randomValues(1, 1.0)[0];

diff --git a/unittests/qir/NVQIRTester.cpp b/unittests/qir/NVQIRTester.cpp
@@ -155,6 +155,26 @@ CUDAQ_TEST(NVQIRTester, checkQuantumIntrinsics) {
 }
 #endif
 
+CUDAQ_TEST(NVQIRTester, checkReset) {
+  __quantum__rt__initialize(0, nullptr);
+  auto qubits = __quantum__rt__qubit_allocate_array(2);
+  Qubit *q0 = *reinterpret_cast<Qubit **>(
+      __quantum__rt__array_get_element_ptr_1d(qubits, 0));
+  Qubit *q1 = *reinterpret_cast<Qubit **>(
+      __quantum__rt__array_get_element_ptr_1d(qubits, 1));
+  // Regression check that repeated resets do not grow the state vector: each
+  // round resets both qubits, flips q1 and swaps it into q0, so q0 reads 1.
+  for (int i = 0; i < 100; i++) {
+    __quantum__qis__reset(q0);
+    __quantum__qis__reset(q1);
+    __quantum__qis__x(q1);
+    __quantum__qis__swap(q0, q1);
+    EXPECT_EQ(*__quantum__qis__mz(q0), 1); // assert() vanishes under NDEBUG
+  }
+  __quantum__rt__qubit_release_array(qubits);
+  __quantum__rt__finalize();
+}
+
 #ifndef CUDAQ_BACKEND_TENSORNET_MPS
 // MPS doesn't support gates on more than 2 qubits (controlled swap)
 // SWAP with a single ctrl qubit in 0 state.