diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 15ecaccd0..1a7744493 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -7,7 +7,8 @@ on: - develop pull_request: branches: - - '**' + - master + - develop jobs: diff --git a/.github/workflows/llvm-asan.yml b/.github/workflows/llvm-asan.yml index ad3c49efe..60cfd3a06 100644 --- a/.github/workflows/llvm-asan.yml +++ b/.github/workflows/llvm-asan.yml @@ -7,7 +7,8 @@ on: - develop pull_request: branches: - - '**' + - master + - develop jobs: diff --git a/.github/workflows/macos-unit.yml b/.github/workflows/macos-unit.yml index a439b1c52..bb751cd3a 100644 --- a/.github/workflows/macos-unit.yml +++ b/.github/workflows/macos-unit.yml @@ -7,7 +7,8 @@ on: - develop pull_request: branches: - - '**' + - master + - develop jobs: diff --git a/.github/workflows/ubuntu-unit.yml b/.github/workflows/ubuntu-unit.yml index 68e9dc8a1..650d533d6 100644 --- a/.github/workflows/ubuntu-unit.yml +++ b/.github/workflows/ubuntu-unit.yml @@ -7,7 +7,8 @@ on: - develop pull_request: branches: - - '**' + - master + - develop jobs: diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index 74817bf0c..958f32398 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -7,7 +7,8 @@ on: - develop pull_request: branches: - - '**' + - master + - develop jobs: diff --git a/.github/workflows/windows-unit.yml b/.github/workflows/windows-unit.yml index 72d79be9a..2d5076d21 100644 --- a/.github/workflows/windows-unit.yml +++ b/.github/workflows/windows-unit.yml @@ -7,7 +7,8 @@ on: - develop pull_request: branches: - - '**' + - master + - develop jobs: diff --git a/AUTHORS.txt b/AUTHORS.txt index 6c6500b33..e20b7ea7d 100644 --- a/AUTHORS.txt +++ b/AUTHORS.txt @@ -33,6 +33,8 @@ Dr Mihai Duta [developer] original prototyping External contributors: +Jakub Adamski + optimised distributed communication by sending max-size messages asynchronously Bruno Villasenor Alvarez on behalf of AMD ported the GPU backend to HIP, for AMD GPU compatibility Dr Nicolas Vogt on behalf of HQS Quantum Simulations diff --git a/QuEST/CMakeLists.txt b/QuEST/CMakeLists.txt index 899f3fb98..0ed65f974 100644 --- a/QuEST/CMakeLists.txt +++ b/QuEST/CMakeLists.txt @@ -37,6 +37,8 @@ option(USE_HIP "Whether to use HIP for GPU code compilation for AMD GPUs. Set to set(GPU_ARCH gfx90 CACHE STRING "GPU hardware dependent, used for AMD GPUs when USE_HIP=1. Lookup at https://llvm.org/docs/AMDGPUUsage.html#amdgpu-processor-table. Write without fullstop") +option(USE_CUQUANTUM "Whether to use NVIDIA's cuQuantum library (requires prior installation) in lieu of QuEST's bespoke GPU kernel. Set to 1 to enable." 0) + # ***************************************************************************** # ***** NO CHANGES SHOULD BE REQUIRED FROM THE USER BEYOND THIS POINT ********* @@ -49,6 +51,7 @@ message(STATUS "OMP acceleration is ${MULTITHREADED}") message(STATUS "MPI distribution is ${DISTRIBUTED}") if (${GPUACCELERATED}) message(STATUS "HIP compilation is ${USE_HIP}") + message(STATUS "cuQuantum compilation is ${USE_CUQUANTUM}") endif() @@ -119,25 +122,28 @@ endif() if (GPUACCELERATED) if (USE_HIP) - if(NOT DEFINED HIP_PATH) - if(NOT DEFINED ENV{HIP_PATH}) - message(WARNING "WARNING: HIP_PATH is not defiend. 
Using default HIP_PATH=/opt/rocm/hip " ${HIP_VERSION}) - set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") - else() - set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + message(WARNING "WARNING: HIP_PATH is not defiend. Using default HIP_PATH=/opt/rocm/hip " ${HIP_VERSION}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() endif() - endif() - if(EXISTS "${HIP_PATH}") - set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) - find_package(HIP REQUIRED) - message(STATUS "Found HIP: " ${HIP_VERSION}) - message(STATUS "HIP PATH: " ${HIP_PATH}) - endif() - - ADD_DEFINITIONS( -DUSE_HIP ) - ADD_DEFINITIONS( -D__HIP_PLATFORM_AMD__ ) + if(EXISTS "${HIP_PATH}") + set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + find_package(HIP REQUIRED) + message(STATUS "Found HIP: " ${HIP_VERSION}) + message(STATUS "HIP PATH: " ${HIP_PATH}) + endif() + + ADD_DEFINITIONS( -DUSE_HIP ) + ADD_DEFINITIONS( -D__HIP_PLATFORM_AMD__ ) + elseif (USE_CUQUANTUM) + find_package(CUDA REQUIRED) + ADD_DEFINITIONS( -DUSE_CUQUANTUM ) else() find_package(CUDA REQUIRED) endif() @@ -280,7 +286,12 @@ endif() # ----- C++ COMPILER FLAGS -------------------------------------------------- # set C++ flags that are common between compilers and build types -set (CMAKE_CXX_STANDARD 98) +if (USE_CUQUANTUM) + set(CMAKE_CXX_STANDARD 14) + set(CMAKE_CXX_STANDARD_REQUIRED ON) +else () + set (CMAKE_CXX_STANDARD 98) +endif () # Use -O2 for all but debug mode by default if (NOT("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")) @@ -412,6 +423,14 @@ target_link_libraries(QuEST PUBLIC ${MPI_C_LIBRARIES}) # ----- GPU ------------------------------------------------------------------- if (USE_HIP) target_link_libraries(QuEST PUBLIC ${HIP_PATH}/lib/libamdhip64.so ) +elseif (USE_CUQUANTUM) + find_library(CUQUANTUM_LIBRARIES custatevec) + if (NOT CUQUANTUM_LIBRARIES) + message(FATAL_ERROR "cuQuantum library (specifically custatevec) not found") + endif () + + target_link_libraries(QuEST ${CUDA_LIBRARIES} ${CUQUANTUM_LIBRARIES}) + target_include_directories(QuEST PUBLIC "/usr/local/cuda/include") else() target_link_libraries(QuEST ${CUDA_LIBRARIES}) endif() diff --git a/QuEST/include/QuEST.h b/QuEST/include/QuEST.h index dd7709780..ba89ad38d 100644 --- a/QuEST/include/QuEST.h +++ b/QuEST/include/QuEST.h @@ -34,6 +34,23 @@ # include "QuEST_precision.h" + + +// ensure custatevecHandle_t is defined, even if no GPU +# ifdef USE_CUQUANTUM +# include +typedef struct CuQuantumConfig { + cudaMemPool_t cuMemPool; + cudaStream_t cuStream; + custatevecHandle_t cuQuantumHandle; + custatevecDeviceMemHandler_t cuMemHandler; +} CuQuantumConfig; +# else +# define CuQuantumConfig void* +# endif + + + // prevent C++ name mangling #ifdef __cplusplus extern "C" { @@ -368,6 +385,11 @@ typedef struct Qureg //! Storage for reduction of probabilities on GPU qreal *firstLevelReduction, *secondLevelReduction; + //! Storage for wavefunction amplitues and config (copy of QuESTEnv's handle) in cuQuantum deployment + cuAmp* cuStateVec; + cuAmp* deviceCuStateVec; + CuQuantumConfig* cuConfig; + //! 
Storage for generated QASM output QASMLogger* qasmLog; @@ -386,6 +408,10 @@ typedef struct QuESTEnv int numRanks; unsigned long int* seeds; int numSeeds; + + // a copy of the QuESTEnv's config, used only in cuQuantum deployment + CuQuantumConfig* cuConfig; + } QuESTEnv; @@ -4236,6 +4262,10 @@ qreal calcPurity(Qureg qureg); * linear algebra calculation. * * The number of qubits represented in \p qureg and \p pureState must match. + * + * > In the GPU-accelerated cuQuantum backend, this function further assumes that + * > the density matrix \p qureg is correctly normalised, and otherwise returns the + * > fidelity of the conjugate-transpose of \p qureg. * * @see * - calcHilbertSchmidtDistance() diff --git a/QuEST/include/QuEST_precision.h b/QuEST/include/QuEST_precision.h index 48c9a1ccf..244460b78 100644 --- a/QuEST/include/QuEST_precision.h +++ b/QuEST/include/QuEST_precision.h @@ -10,15 +10,27 @@ * @author Tyson Jones (doc) */ -# include - # ifndef QUEST_PRECISION_H # define QUEST_PRECISION_H +# include + + +// define CUDA complex types as void if not using cuQuantum. +// note we used cuComplex.h for complex numbers, in lieu of +// Thrust's complex, so that the QuEST.h header can +// always be compiled with C99, rather than C++14. +# ifdef USE_CUQUANTUM + # include +# else + # define cuFloatComplex void + # define cuDoubleComplex void +# endif + // set default double precision if not set during compilation # ifndef QuEST_PREC -# define QuEST_PREC 2 + # define QuEST_PREC 2 # endif @@ -28,6 +40,7 @@ # if QuEST_PREC==1 # define qreal float // \cond HIDDEN_SYMBOLS + # define cuAmp cuFloatComplex # define MPI_QuEST_REAL MPI_FLOAT # define MPI_MAX_AMPS_IN_MSG (1LL<<29) // must be 2^int # define REAL_STRING_FORMAT "%.8f" @@ -41,7 +54,8 @@ */ # elif QuEST_PREC==2 # define qreal double - // \cond HIDDEN_SYMBOLS + // \cond HIDDEN_SYMBOLS + # define cuAmp cuDoubleComplex # define MPI_QuEST_REAL MPI_DOUBLE # define MPI_MAX_AMPS_IN_MSG (1LL<<28) // must be 2^int # define REAL_STRING_FORMAT "%.14f" @@ -57,6 +71,7 @@ # elif QuEST_PREC==4 # define qreal long double // \cond HIDDEN_SYMBOLS + # define cuAmp void // invalid # define MPI_QuEST_REAL MPI_LONG_DOUBLE # define MPI_MAX_AMPS_IN_MSG (1LL<<27) // must be 2^int # define REAL_STRING_FORMAT "%.17Lf" diff --git a/QuEST/src/CPU/QuEST_cpu.c b/QuEST/src/CPU/QuEST_cpu.c index 56e82d54f..1736de3cd 100644 --- a/QuEST/src/CPU/QuEST_cpu.c +++ b/QuEST/src/CPU/QuEST_cpu.c @@ -1641,53 +1641,6 @@ void statevec_cloneQureg(Qureg targetQureg, Qureg copyQureg) { } } -/** - * Initialise the state vector of probability amplitudes such that one qubit is set to 'outcome' and all other qubits are in an equal superposition of zero and one. 
- * @param[in,out] qureg object representing the set of qubits to be initialised - * @param[in] qubitId id of qubit to set to state 'outcome' - * @param[in] outcome of qubit 'qubitId' - */ -void statevec_initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome) -{ - long long int chunkSize, stateVecSize; - long long int index; - int bit; - long long int chunkId=qureg->chunkId; - - // dimension of the state vector - chunkSize = qureg->numAmpsPerChunk; - stateVecSize = chunkSize*qureg->numChunks; - qreal normFactor = 1.0/sqrt((qreal)stateVecSize/2.0); - - // Can't use qureg->stateVec as a private OMP var - qreal *stateVecReal = qureg->stateVec.real; - qreal *stateVecImag = qureg->stateVec.imag; - - // initialise the state to |0000..0000> -# ifdef _OPENMP -# pragma omp parallel \ - default (none) \ - shared (chunkSize, stateVecReal, stateVecImag, normFactor, qubitId, outcome, chunkId) \ - private (index, bit) -# endif - { -# ifdef _OPENMP -# pragma omp for schedule (static) -# endif - for (index=0; indexnumAmpsPerChunk; - stateVecSize = chunkSize*qureg->numChunks; - - qreal *stateVecReal = qureg->stateVec.real; - qreal *stateVecImag = qureg->stateVec.imag; - - FILE *fp; - char line[200]; - - for (int rank=0; rank<(qureg->numChunks); rank++){ - if (rank==qureg->chunkId){ - fp = fopen(filename, "r"); - - // indicate file open failure - if (fp == NULL) - return 0; - - indexInChunk = 0; totalIndex = 0; - while (fgets(line, sizeof(char)*200, fp) != NULL && totalIndexchunkId){ - sscanf(line, REAL_SPECIFIER ", " REAL_SPECIFIER, &(stateVecReal[indexInChunk]), - &(stateVecImag[indexInChunk])); - indexInChunk += 1; - } - totalIndex += 1; - } - } - fclose(fp); - } - syncQuESTEnv(env); - } - - // indicate success - return 1; -} - -int statevec_compareStates(Qureg mq1, Qureg mq2, qreal precision){ - qreal diff; - long long int chunkSize = mq1.numAmpsPerChunk; - - for (long long int i=0; iprecision) return 0; - diff = absReal(mq1.stateVec.imag[i] - mq2.stateVec.imag[i]); - if (diff>precision) return 0; - } - return 1; -} - void statevec_compactUnitaryLocal (Qureg qureg, int targetQubit, Complex alpha, Complex beta) { long long int sizeBlock, sizeHalfBlock; diff --git a/QuEST/src/CPU/QuEST_cpu_distributed.c b/QuEST/src/CPU/QuEST_cpu_distributed.c index 4d07ef2b3..4c9ce0af8 100644 --- a/QuEST/src/CPU/QuEST_cpu_distributed.c +++ b/QuEST/src/CPU/QuEST_cpu_distributed.c @@ -494,8 +494,12 @@ void densmatr_initPureState(Qureg targetQureg, Qureg copyQureg) { void exchangeStateVectors(Qureg qureg, int pairRank){ // MPI send/receive vars - int TAG=100; - MPI_Status status; + int sendTag; + int recvTag; + int rank; + MPI_Request * requests; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Multiple messages are required as MPI uses int rather than long long int for count // For openmpi, messages are further restricted to 2GB in size -- do this for all cases @@ -506,20 +510,26 @@ void exchangeStateVectors(Qureg qureg, int pairRank){ // safely assume MPI_MAX... 
= 2^n, so division always exact int numMessages = qureg.numAmpsPerChunk/maxMessageCount; + requests = (MPI_Request*) malloc(4 * numMessages * sizeof(MPI_Request)); int i; long long int offset; + // send my state vector to pairRank's qureg.pairStateVec // receive pairRank's state vector into qureg.pairStateVec for (i=0; i +# include +# include +# include +# include +# include +# include +# include +# include +# include + + + +/* + * TYPES AND ADAPTERS + */ + +// precision-agnostic conversions between cuAmp, qreal and Complex +# if QuEST_PREC==1 + # define TO_CU_AMP(re, im) make_cuFloatComplex(re, im) + # define cuAmpReal(amp) cuCrealf(amp) + # define cuAmpImag(amp) cuCimagf(amp) + # define cuAmpConj(amp) cuConjf(amp) + # define CU_AMP_IN_STATE_PREC CUDA_C_F32 + # define CU_AMP_IN_MATRIX_PREC CUDA_C_64F +# elif QuEST_PREC==2 + # define TO_CU_AMP(re, im) make_cuDoubleComplex(re, im) + # define cuAmpReal(amp) cuCreal(amp) + # define cuAmpImag(amp) cuCimag(amp) + # define cuAmpConj(amp) cuConj(amp) + # define CU_AMP_IN_STATE_PREC CUDA_C_64F + # define CU_AMP_IN_MATRIX_PREC CUDA_C_64F +# elif QuEST_PREC==4 + # define TO_CU_AMP(re, im) -1 // invalid precision config + # define cuAmpReal(amp) -1 + # define cuAmpImag(amp) -1 + # define CU_AMP_IN_STATE_PREC void // invalid + # define CU_AMP_IN_MATRIX_PREC void // invalid +#endif + +// convenient operator overloads for cuAmp, for doing complex artihmetic. +// some of these are defined to be used by Thrust's backend, because we +// avoided Thrust's complex (see QuEST_precision.h for explanation). +// notice we are manually performing the arithmetic using qreals and +// re-packing the result into TO_CU_AMP, rather than using cuComplex.h's +// functions like cuCadd(). This is because such functions are precision +// specific (grr) and do exactly the same thing themselves! 
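+//
+// for example (illustrative variables a, b only): with the overloads below one
+// can combine amplitudes directly, e.g.
+//     cuAmp w = TO_CU_AMP(.5,0)*a + TO_CU_AMP(.5,0)*b;   // = (a+b)/2
+// which is the same pattern functor_setWeightedQureg (further down) relies on.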
+__host__ __device__ inline cuAmp operator - (const cuAmp& a) { + return TO_CU_AMP(-cuAmpReal(a), -cuAmpImag(a)); +} +__host__ __device__ inline cuAmp operator * (const cuAmp& a, const std::size_t n) { + return TO_CU_AMP(n*cuAmpReal(a), n*cuAmpImag(a)); +} +__host__ __device__ inline cuAmp operator + (const cuAmp& a, const cuAmp& b) { + return TO_CU_AMP(cuAmpReal(a) + cuAmpReal(b), cuAmpImag(a) + cuAmpImag(b)); +} +__host__ __device__ inline cuAmp operator * (const cuAmp& a, const cuAmp& b) { + qreal aRe = cuAmpReal(a); + qreal aIm = cuAmpImag(a); + qreal bRe = cuAmpReal(b); + qreal bIm = cuAmpImag(b); + return TO_CU_AMP(aRe*bRe - aIm*bIm, aRe*bIm + aIm*bRe); +} + +// convert user-facing Complex to cuQuantum-facing cuAmp +cuAmp toCuAmp(Complex c) { + return TO_CU_AMP(c.real, c.imag); +} + +// concise alias for row-wise flattened complex matrix +typedef std::vector cuMatr; + +// flatten ComplexMatrixN mIn to a cuMatr mOut +#define GET_cuMatr_FROM_ComplexMatrix( mOut, mIn, nQubits ) \ + long long int dim = (1LL << nQubits); \ + cuMatr mOut(dim*dim); \ + long long int i=0; \ + for (long long int r=0; r<(dim); r++) \ + for (long long int c=0; c<(dim); c++) \ + mOut[i++] = TO_CU_AMP(mIn.real[r][c], mIn.imag[r][c]); + +// convert user-facing ComplexMatrixN to cuQuantum-facing cuMatr +cuMatr toCuMatr(ComplexMatrix2 mIn) { + GET_cuMatr_FROM_ComplexMatrix(mOut, mIn, 1); + return mOut; +} +cuMatr toCuMatr(ComplexMatrix4 mIn) { + GET_cuMatr_FROM_ComplexMatrix(mOut, mIn, 2); + return mOut; +} +cuMatr toCuMatr(ComplexMatrixN mIn) { + GET_cuMatr_FROM_ComplexMatrix(mOut, mIn, mIn.numQubits); + return mOut; +} + +// convert QuEST backend masks back into user-input qubit lists (needed by cuQuantum) +std::vector getIndsFromMask(long long int mask, int numBits) { + std::vector inds; + for (int i=0; i(ctx); + return cudaMallocFromPoolAsync(ptr, size, pool, stream); +} +int memPoolFree(void* ctx, void* ptr, size_t size, cudaStream_t stream) { + return cudaFreeAsync(ptr, stream); +} + +CuQuantumConfig* createCuConfig() { + + // create cuQuantumConfig in heap memory + CuQuantumConfig* config = (CuQuantumConfig*) malloc(sizeof(CuQuantumConfig)); + + // bind existing memory pool (does not need later manual freeing) + int deviceId; + cudaGetDevice(&deviceId); + cudaDeviceGetMemPool(&(config->cuMemPool), deviceId); + + // create new custatevecHandle_t + custatevecCreate(&(config->cuQuantumHandle)); + + // create new cudaStream_t + cudaStreamCreate(&(config->cuStream)); + + // custatevecDeviceMemHandler_t needs no explicit creation + + // return config's heap pointer + return config; +} + +void initCuConfig(CuQuantumConfig* config) { + + // get existing memPool threshold, above which memory gets freed at every stream synch + size_t currMaxMem; + cudaMemPoolGetAttribute(config->cuMemPool, cudaMemPoolAttrReleaseThreshold, &currMaxMem); + + // if memPool threshold smaller than 1 MiB = 16 qubits, extend it + size_t desiredMaxMem = 16*(1<<15); + if (currMaxMem < desiredMaxMem) + cudaMemPoolSetAttribute(config->cuMemPool, cudaMemPoolAttrReleaseThreshold, &desiredMaxMem); + + // bind mempool to deviceMemHandler + config->cuMemHandler.ctx = &(config->cuMemPool); + config->cuMemHandler.device_alloc = memPoolAlloc; + config->cuMemHandler.device_free = memPoolFree; + strcpy(config->cuMemHandler.name, "mempool"); + + // bind deviceMemHandler to cuQuantum + custatevecSetDeviceMemHandler(config->cuQuantumHandle, &(config->cuMemHandler)); + + // bind stream to cuQuantum + custatevecSetStream(config->cuQuantumHandle, 
config->cuStream); +} + +void destroyCuConfig(CuQuantumConfig* config) { + + // free config's heap attributes + cudaStreamDestroy(config->cuStream); + custatevecDestroy(config->cuQuantumHandle); + + // don't need to free cuMemPool; it already existed + // don't need to free cuMemHandler; it's a struct included in config's heap memory + + // free config's heap memory + free(config); +} + + + +/* + * CUQUANTUM WRAPPERS (to reduce boilerplate) + */ + +void custatevec_applyMatrix(Qureg qureg, std::vector ctrls, std::vector targs, cuMatr matr) { + + // do not adjoint matrix + int adj = 0; + + // condition all ctrls on =1 state + int* ctrlVals = nullptr; + + // use automatic workspace management + void* work = nullptr; + size_t workSize = 0; + + custatevecApplyMatrix( + qureg.cuConfig->cuQuantumHandle, + qureg.deviceCuStateVec, CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + matr.data(), CU_AMP_IN_MATRIX_PREC, CUSTATEVEC_MATRIX_LAYOUT_ROW, adj, + targs.data(), targs.size(), + ctrls.data(), ctrlVals, ctrls.size(), + CUSTATEVEC_COMPUTE_DEFAULT, + work, workSize); +} + +void custatevec_applyDiagonal(Qureg qureg, std::vector ctrls, std::vector targs, cuAmp* elems) { + + // apply no permutation matrix + custatevecIndex_t *perm = nullptr; + + // do not adjoint elems + int adj = 0; + + // condition all ctrls on =1 state + int* ctrlVals = nullptr; + + // use automatic workspace management + void* work = nullptr; + size_t workSize = 0; + + custatevecApplyGeneralizedPermutationMatrix( + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + perm, elems, CU_AMP_IN_MATRIX_PREC, adj, + targs.data(), targs.size(), + ctrls.data(), ctrlVals, ctrls.size(), + work, workSize); +} + + + +/* + * THRUST WRAPPERS AND FUNCTORS (to reduce boilerplate, and for custom modification of statevectors) + */ + +struct functor_prodOfConj : public thrust::binary_function +{ + __host__ __device__ cuAmp operator()(cuAmp braAmp, cuAmp ketAmp) { + return ketAmp * cuAmpConj(braAmp); + } +}; + +struct functor_absOfDif : public thrust::binary_function +{ + __host__ __device__ cuAmp operator()(cuAmp a, cuAmp b) { + qreal difRe = cuAmpReal(a) - cuAmpReal(b); + qreal difIm = cuAmpImag(a) - cuAmpImag(b); + qreal difAbs = difRe*difRe + difIm*difIm; + return TO_CU_AMP(difAbs, 0); + } +}; + +struct functor_absSquared : public thrust::unary_function +{ + __host__ __device__ qreal operator()(cuAmp amp) { + qreal re = cuAmpReal(amp); + qreal im = cuAmpImag(amp); + qreal absSq = re*re + im*im; + return absSq; + } +}; + +struct functor_prodOfAbsSquared : public thrust::binary_function +{ + __host__ __device__ cuAmp operator()(cuAmp probAmp, cuAmp rawAmp) { + qreal re = cuAmpReal(probAmp); + qreal im = cuAmpImag(probAmp); + qreal absSq = re*re + im*im; + cuAmp prod = rawAmp * TO_CU_AMP(absSq, 0); + return prod; + } +}; + +struct functor_scaleInd : public thrust::unary_function +{ + const long long int factor; + functor_scaleInd(long long int _factor) : factor(_factor) {} + + __host__ __device__ long long int operator()(long long int ind) { + return factor * ind; + } +}; + +struct functor_mapIndToAlternatingBlockScaledInd : public thrust::unary_function +{ + long long int blockSize; + long long int factor; + int useOffsetBlocks; + + functor_mapIndToAlternatingBlockScaledInd(long long int _blockSize, long long int _factor, int _useOffsetBlocks) : + blockSize(_blockSize), factor(_factor), useOffsetBlocks(_useOffsetBlocks) {} + + __host__ __device__ long long int operator()(long long int rawInd) 
{ + + long long int blockNum = rawInd / blockSize; + long long int subBlockInd = rawInd % blockSize; + + long long int blockStartInd = (blockNum * 2 * blockSize) + (useOffsetBlocks * blockSize); + long long int blockifiedInd = blockStartInd + subBlockInd; + + long long int finalInd = factor * blockifiedInd; + + return finalInd; + } +}; + +struct functor_setWeightedQureg +{ + // functor requires 3 complex scalars + const cuAmp fac1; + const cuAmp fac2; + const cuAmp facOut; + functor_setWeightedQureg(cuAmp _fac1, cuAmp _fac2, cuAmp _facOut) : + fac1(_fac1), fac2(_fac2), facOut(_facOut) {} + + // and modifies out to (facOut out + fac1 qureg1 + fac2 qureg2) + template __host__ __device__ void operator()(Tuple t) { + thrust::get<2>(t) = facOut*thrust::get<2>(t) + fac1*thrust::get<0>(t) + fac2*thrust::get<1>(t); + } +}; + +struct functor_matrixColumnDotVector : public thrust::unary_function +{ + cuAmp* matrix; // flattened column-wise + cuAmp* vector; + long long int dim; + functor_matrixColumnDotVector(cuAmp* _matrix, cuAmp* _vector, long long int _dim) : + matrix(_matrix), vector(_vector), dim(_dim) {} + + __host__ __device__ cuAmp operator()(long long int columnInd) { + + // safe to iterate serially, since #columnInd is exponentially growing + cuAmp sum = TO_CU_AMP(0, 0); + for (long long int rowInd=0; rowInd +{ + DiagonalOp op; + functor_getDiagonalOpAmp(DiagonalOp _op) : op(_op) {} + + __host__ __device__ cuAmp operator()(long long int columnInd) { + + return TO_CU_AMP( + op.deviceOperator.real[columnInd], + op.deviceOperator.imag[columnInd]); + } +}; + +struct functor_multDiagOntoDensMatr +{ + cuAmp* matrix; + DiagonalOp op; + functor_multDiagOntoDensMatr(cuAmp* _matrix, DiagonalOp _op) : matrix(_matrix), op(_op) {} + + __host__ __device__ void operator()(long long int matrixInd) { + + long long int opDim = 1LL << op.numQubits; + long long int opInd = matrixInd % opDim; + cuAmp opAmp = TO_CU_AMP( + op.deviceOperator.real[opInd], + op.deviceOperator.imag[opInd]); + + matrix[matrixInd] = opAmp * matrix[matrixInd]; + } +}; + +struct functor_collapseDensMatrToOutcome +{ + cuAmp* matrix; + int numQubits; + int outcome; + qreal outcomeProb; + int target; + functor_collapseDensMatrToOutcome(cuAmp* _matrix, int _numQubits, int _outcome, qreal _outcomeProb, int _target) : + matrix(_matrix), numQubits(_numQubits), outcome(_outcome), outcomeProb(_outcomeProb), target(_target) {} + + __host__ __device__ void operator()(long long int ind) { + + // obtain bits |...b2...><...b1...| + int b1 = (ind >> target) & 1; + int b2 = (ind >> target >> numQubits) & 1; + + int f1 = !(b1 ^ b2); // true if b1 == b2 + int f2 = !(b1 ^ outcome); // true if b1 == outcome + qreal fac = f1 * f2 / outcomeProb; // 0 or 1/prob + + matrix[ind] = TO_CU_AMP(fac, 0) * matrix[ind]; + } +}; + +thrust::device_ptr getStartPtr(Qureg qureg) { + + return thrust::device_pointer_cast(qureg.deviceCuStateVec); +} + +thrust::device_ptr getEndPtr(Qureg qureg) { + + return getStartPtr(qureg) + qureg.numAmpsTotal; +} + +auto iter_strided(Qureg qureg, long long int stride, long long int strideIndInit) { + + // iterates qureg[i * stride] over i, from i = strideIndInit, until exceeding qureg + auto rawIndIter = thrust::make_counting_iterator(strideIndInit); + auto stridedIndIter = thrust::make_transform_iterator(rawIndIter, functor_scaleInd(stride)); + auto stridedAmpIter = thrust::make_permutation_iterator(getStartPtr(qureg), stridedIndIter); + return stridedAmpIter; +} + +auto getStartStridedAmpIter(Qureg qureg, long long int stride) { + + 
return iter_strided(qureg, stride, 0); +} + +auto getEndStridedAmpIter(Qureg qureg, long long int stride) { + + long long int numIters = ceil(qureg.numAmpsTotal / (qreal) stride); + return iter_strided(qureg, stride, numIters); +} + +auto iter_blockStrided(Qureg qureg, long long int blockSize, int useOffsetBlocks, long long int stride, long long int rawIndInit) { + + // iterates qureg[(i mapped to alternating blocks) * stride] over i, from i = rawIndInit, until exceeding qureg + auto functor = functor_mapIndToAlternatingBlockScaledInd(blockSize, stride, useOffsetBlocks); + auto rawIndIter = thrust::make_counting_iterator(rawIndInit); + auto stridedBlockIndIter = thrust::make_transform_iterator(rawIndIter, functor); + auto stridedBlockAmpIter = thrust::make_permutation_iterator(getStartPtr(qureg), stridedBlockIndIter); + return stridedBlockAmpIter; +} + +auto getStartBlockStridedAmpIter(Qureg qureg, long long int blockSize, int useOffsetBlocks, long long int stride) { + + return iter_blockStrided(qureg, blockSize, useOffsetBlocks, stride, 0); +} + +auto getEndBlockStridedAmpIter(Qureg qureg, long long int blockSize, int useOffsetBlocks, long long int stride) { + + long long int numAltBlockAmps = qureg.numAmpsTotal / 2; + long long int numIters = ceil(numAltBlockAmps / (qreal) stride); + return iter_blockStrided(qureg, blockSize, useOffsetBlocks, stride, numIters); +} + +auto iter_diagonalOp(DiagonalOp op, long long int initInd) { + + auto rawIndIter = thrust::make_counting_iterator(initInd); + auto diagAmpIter = thrust::make_transform_iterator(rawIndIter, functor_getDiagonalOpAmp(op)); + return diagAmpIter; +} + +auto getStartDiagonalOpAmpIter(DiagonalOp op) { + + return iter_diagonalOp(op, 0); +} + +auto getEndDiagonalOpAmpIter(DiagonalOp op) { + + return iter_diagonalOp(op, op.numElemsPerChunk); +} + + + +/* + * Start C-linkage, so that below functions may only use C signatures (not C++) + */ +#ifdef __cplusplus +extern "C" { +#endif + + + +/* + * ENVIRONMENT MANAGEMENT + */ + +QuESTEnv createQuESTEnv(void) { + validateGPUExists(GPUExists(), __func__); + validateGPUIsCuQuantumCompatible(GPUSupportsMemPools(),__func__); + + QuESTEnv env; + env.rank=0; + env.numRanks=1; + + env.seeds = NULL; + env.numSeeds = 0; + seedQuESTDefault(&env); + + // prepare cuQuantum with automatic workspaces + env.cuConfig = createCuConfig(); + initCuConfig(env.cuConfig); + + return env; +} + +void destroyQuESTEnv(QuESTEnv env){ + free(env.seeds); + + // finalise cuQuantum + destroyCuConfig(env.cuConfig); +} + + + +/* + * QUREG CREATION AND AMP SET/GET + */ + +void statevec_createQureg(Qureg *qureg, int numQubits, QuESTEnv env) +{ + // set standard fields + long long int numAmps = 1LL << numQubits; + qureg->numQubitsInStateVec = numQubits; + qureg->numAmpsPerChunk = numAmps; + qureg->numAmpsTotal = numAmps; + qureg->chunkId = 0; + qureg->numChunks = 1; + qureg->isDensityMatrix = 0; + + // copy env's cuQuantum config handle + qureg->cuConfig = env.cuConfig; + + // allocate user-facing CPU memory + qureg->stateVec.real = (qreal*) malloc(numAmps * sizeof(qureg->stateVec.real)); + qureg->stateVec.imag = (qreal*) malloc(numAmps * sizeof(qureg->stateVec.imag)); + validateQuregAllocation(qureg, env, __func__); + + // allocate cuQuantum GPU memory (unvalidated) + cudaMalloc( &(qureg->deviceCuStateVec), numAmps * sizeof(*(qureg->deviceCuStateVec)) ); + + // allocate private cuQuantum CPU memory (for exchanging with GPU memory) + qureg->cuStateVec = (cuAmp*) malloc(numAmps * sizeof(*(qureg->cuStateVec))); +} + +void 
statevec_destroyQureg(Qureg qureg, QuESTEnv env) +{ + // free user-facing CPU memory + free(qureg.stateVec.real); + free(qureg.stateVec.imag); + + // free private cuQuantum CPU memory + free(qureg.cuStateVec); + + // free cuQuantum GPU memory + cudaFree(qureg.deviceCuStateVec); +} + +void statevec_setAmps(Qureg qureg, long long int startInd, qreal* reals, qreal* imags, long long int numAmps) +{ + // slowly manually overwrite subset of private cuQuantum CPU memory + for (long long int i=0; icuQuantumHandle, + qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, + qureg.numQubitsInStateVec, + CUSTATEVEC_STATE_VECTOR_TYPE_ZERO); +} + +void statevec_initBlankState(Qureg qureg) +{ + // init to |0> = {1, 0, 0, ...} + statevec_initZeroState(qureg); + + // overwrite amps[0] to 0 + qreal re[] = {0}; + qreal im[] = {0}; + statevec_setAmps(qureg, 0, re, im, 1); +} + +void statevec_initPlusState(Qureg qureg) +{ + custatevecInitializeStateVector( + qureg.cuConfig->cuQuantumHandle, + qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, + qureg.numQubitsInStateVec, + CUSTATEVEC_STATE_VECTOR_TYPE_UNIFORM); +} + +void statevec_initClassicalState(Qureg qureg, long long int stateInd) +{ + // init to |null> = {0, 0, 0, ...} + statevec_initBlankState(qureg); + + // overwrite amps[stateInd] to 1 + qreal re[] = {1}; + qreal im[] = {0}; + statevec_setAmps(qureg, stateInd, re, im, 1); +} + +void densmatr_initClassicalState(Qureg qureg, long long int stateInd) +{ + // init to |null> = {0, 0, 0, ...} + statevec_initBlankState(qureg); + + // index of the desired state in the flat density matrix + long long int densityDim = 1LL << qureg.numQubitsRepresented; + long long int densityInd = (densityDim + 1)*stateInd; + + // overwrite amps[densityInd] to 1 + qreal re[] = {1}; + qreal im[] = {0}; + statevec_setAmps(qureg, densityInd, re, im, 1); +} + +void statevec_initDebugState(Qureg qureg) +{ + // |n> -> (.2n + (.2n+.1)i) |n> + cuAmp init = TO_CU_AMP(0, .1); + cuAmp step = TO_CU_AMP(.2, .2); + thrust::sequence(getStartPtr(qureg), getEndPtr(qureg), init, step); +} + +void statevec_setWeightedQureg(Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out) +{ + // out -> facOut + fac1 qureg1 + fac2 qureg2 + thrust::for_each( + thrust::make_zip_iterator(thrust::make_tuple(getStartPtr(qureg1), getStartPtr(qureg2), getStartPtr(out))), + thrust::make_zip_iterator(thrust::make_tuple(getEndPtr(qureg1), getEndPtr(qureg2), getEndPtr(out))), + functor_setWeightedQureg(toCuAmp(fac1), toCuAmp(fac2), toCuAmp(facOut))); +} + + + +/* + * OPERATORS + */ + +void statevec_compactUnitary(Qureg qureg, int targetQubit, Complex alpha, Complex beta) +{ + cuAmp a = toCuAmp(alpha); + cuAmp b = toCuAmp(beta); + cuMatr matrix{ + a, -cuAmpConj(b), + b, cuAmpConj(a) + }; + custatevec_applyMatrix(qureg, {}, {targetQubit}, matrix); +} + +void statevec_controlledCompactUnitary(Qureg qureg, int controlQubit, int targetQubit, Complex alpha, Complex beta) +{ + cuAmp a = toCuAmp(alpha); + cuAmp b = toCuAmp(beta); + cuMatr matrix{ + a, -cuAmpConj(b), + b, cuAmpConj(a) + }; + custatevec_applyMatrix(qureg, {controlQubit}, {targetQubit}, matrix); +} + +void statevec_unitary(Qureg qureg, int targetQubit, ComplexMatrix2 u) +{ + custatevec_applyMatrix(qureg, {}, {targetQubit}, toCuMatr(u)); +} + +void statevec_multiControlledMultiQubitUnitary(Qureg qureg, long long int ctrlMask, int* targs, int numTargs, ComplexMatrixN u) +{ + std::vector c = getIndsFromMask(ctrlMask,qureg.numQubitsInStateVec); + std::vector t(targs,targs+numTargs); + 
custatevec_applyMatrix(qureg, c, t, toCuMatr(u)); +} + +void statevec_multiControlledTwoQubitUnitary(Qureg qureg, long long int ctrlMask, int q1, int q2, ComplexMatrix4 u) +{ + std::vector c = getIndsFromMask(ctrlMask,qureg.numQubitsInStateVec); + custatevec_applyMatrix(qureg, c, {q1,q2}, toCuMatr(u)); +} + +void statevec_controlledUnitary(Qureg qureg, int controlQubit, int targetQubit, ComplexMatrix2 u) +{ + custatevec_applyMatrix(qureg, {controlQubit}, {targetQubit}, toCuMatr(u)); +} + +void statevec_multiControlledUnitary(Qureg qureg, long long int ctrlQubitsMask, long long int ctrlFlipMask, int targetQubit, ComplexMatrix2 u) +{ + int targs[] = {targetQubit}; + std::vector ctrlInds = getIndsFromMask(ctrlQubitsMask,qureg.numQubitsInStateVec); + std::vector ctrlVals(ctrlInds.size()); + for (size_t i=0; icuQuantumHandle, + qureg.deviceCuStateVec, CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + toCuMatr(u).data(), CU_AMP_IN_MATRIX_PREC, CUSTATEVEC_MATRIX_LAYOUT_ROW, 0, + targs, 1, ctrlInds.data(), ctrlVals.data(), ctrlInds.size(), + CUSTATEVEC_COMPUTE_DEFAULT, nullptr, 0); +} + +void statevec_pauliX(Qureg qureg, int targetQubit) +{ + cuAmp a0 = TO_CU_AMP(0, 0); + cuAmp a1 = TO_CU_AMP(1, 0); + cuMatr matrix{ + a0, a1, + a1, a0 + }; + custatevec_applyMatrix(qureg, {}, {targetQubit}, matrix); +} + +void statevec_pauliY(Qureg qureg, int targetQubit) +{ + cuAmp a0 = TO_CU_AMP(0, 0); + cuAmp aI = TO_CU_AMP(0, 1); + cuMatr matrix{ + a0, -aI, + aI, a0 + }; + custatevec_applyMatrix(qureg, {}, {targetQubit}, matrix); +} + +void statevec_pauliYConj(Qureg qureg, int targetQubit) +{ + cuAmp a0 = TO_CU_AMP(0, 0); + cuAmp aI = TO_CU_AMP(0, 1); + cuMatr matrix{ + a0, aI, + -aI, a0 + }; + custatevec_applyMatrix(qureg, {}, {targetQubit}, matrix); +} + +void statevec_controlledPauliY(Qureg qureg, int controlQubit, int targetQubit) +{ + cuAmp a0 = TO_CU_AMP(0, 0); + cuAmp aI = TO_CU_AMP(0, 1); + cuMatr matrix{ + a0, -aI, + aI, a0 + }; + custatevec_applyMatrix(qureg, {controlQubit}, {targetQubit}, matrix); +} + +void statevec_controlledPauliYConj(Qureg qureg, int controlQubit, int targetQubit) +{ + cuAmp a0 = TO_CU_AMP(0, 0); + cuAmp aI = TO_CU_AMP(0, 1); + cuMatr matrix{ + a0, aI, + -aI, a0 + }; + custatevec_applyMatrix(qureg, {controlQubit}, {targetQubit}, matrix); +} + +void statevec_phaseShiftByTerm(Qureg qureg, int targetQubit, Complex term) +{ + cuAmp a1 = TO_CU_AMP(1, 0); + cuAmp a2 = toCuAmp(term); + cuAmp elems[] = {a1, a2}; + + custatevec_applyDiagonal(qureg, {}, {targetQubit}, elems); +} + +void statevec_controlledPhaseShift(Qureg qureg, int idQubit1, int idQubit2, qreal angle) +{ + cuAmp a1 = TO_CU_AMP(1, 0); + cuAmp a2 = TO_CU_AMP(cos(angle), sin(angle)); + cuAmp elems[] = {a1, a2}; + + custatevec_applyDiagonal(qureg, {idQubit1}, {idQubit2}, elems); +} + +void statevec_multiControlledPhaseShift(Qureg qureg, int *controlQubits, int numControlQubits, qreal angle) +{ + cuAmp a1 = TO_CU_AMP(1, 0); + cuAmp a2 = TO_CU_AMP(cos(angle), sin(angle)); + cuAmp elems[] = {a1, a2}; + + std::vector targs{controlQubits[0]}; + std::vector ctrls(controlQubits + 1, controlQubits + numControlQubits); + custatevec_applyDiagonal(qureg, ctrls, targs, elems); +} + +void statevec_multiRotateZ(Qureg qureg, long long int mask, qreal angle) +{ + qreal theta = - angle/2.; + std::vector targs = getIndsFromMask(mask, qureg.numQubitsInStateVec); + std::vector paulis(targs.size(), CUSTATEVEC_PAULI_Z); + + custatevecApplyPauliRotation( + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, 
qureg.numQubitsInStateVec, + theta, paulis.data(), targs.data(), targs.size(), + nullptr, nullptr, 0); +} + +void statevec_multiControlledMultiRotateZ(Qureg qureg, long long int ctrlMask, long long int targMask, qreal angle) +{ + qreal theta = - angle/2.; + std::vector ctrls = getIndsFromMask(ctrlMask, qureg.numQubitsInStateVec); + std::vector targs = getIndsFromMask(targMask, qureg.numQubitsInStateVec); + std::vector paulis(targs.size(), CUSTATEVEC_PAULI_Z); + + custatevecApplyPauliRotation( + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + theta, paulis.data(), targs.data(), targs.size(), + ctrls.data(), nullptr, ctrls.size()); +} + +void statevec_controlledPhaseFlip(Qureg qureg, int idQubit1, int idQubit2) +{ + cuAmp a1 = TO_CU_AMP( 1, 0); + cuAmp a2 = TO_CU_AMP(-1, 0); + cuAmp elems[] = {a1, a2}; + + custatevec_applyDiagonal(qureg, {idQubit1}, {idQubit2}, elems); +} + +void statevec_multiControlledPhaseFlip(Qureg qureg, int *controlQubits, int numControlQubits) +{ + cuAmp a1 = TO_CU_AMP( 1, 0); + cuAmp a2 = TO_CU_AMP(-1, 0); + cuAmp elems[] = {a1, a2}; + + std::vector targs{controlQubits[0]}; + std::vector ctrls(controlQubits + 1, controlQubits + numControlQubits); + custatevec_applyDiagonal(qureg, ctrls, targs, elems); +} + +void statevec_swapQubitAmps(Qureg qureg, int qb1, int qb2) +{ + int2 targPairs[] = {{qb1, qb2}}; + int numPairs = 1; + + custatevecSwapIndexBits( + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + targPairs, numPairs, + nullptr, nullptr, 0); +} + +void statevec_hadamard(Qureg qureg, int targetQubit) +{ + cuAmp a = TO_CU_AMP(1/sqrt(2.), 0); + cuMatr matrix{ + a, a, + a, -a + }; + custatevec_applyMatrix(qureg, {}, {targetQubit}, matrix); +} + +void statevec_controlledNot(Qureg qureg, int controlQubit, int targetQubit) +{ + cuAmp a0 = TO_CU_AMP(0, 0); + cuAmp a1 = TO_CU_AMP(1, 0); + cuMatr matrix{ + a0, a1, + a1, a0 + }; + custatevec_applyMatrix(qureg, {controlQubit}, {targetQubit}, matrix); +} + +void statevec_multiControlledMultiQubitNot(Qureg qureg, int ctrlMask, int targMask) +{ + // this operator can be effected in one-shot using a custom kernel, but we here + // instead resort to slowly (by at most a factor #targs) effect it as a sequence + // of one-target multi-ctrl NOT gates. 
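+    // for example (illustration only), a NOT on targets {0,2} controlled on
+    // qubit 3 is effected below as ctrl(3)-X(0) followed by ctrl(3)-X(2), which
+    // is equivalent since both gates share the same controls and the X's act
+    // on distinct qubits.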
+ + cuAmp a0 = TO_CU_AMP(0, 0); + cuAmp a1 = TO_CU_AMP(1, 0); + cuMatr matrix{ + a0, a1, + a1, a0 + }; + std::vector ctrls = getIndsFromMask(ctrlMask, qureg.numQubitsInStateVec); + std::vector targs = getIndsFromMask(targMask, qureg.numQubitsInStateVec); + for (int targ : targs) + custatevec_applyMatrix(qureg, ctrls, {targ}, matrix); +} + +void statevec_applySubDiagonalOp(Qureg qureg, int* targs, SubDiagonalOp op, int conj) +{ + // sneakily leverage the CPU cuQuantum memory in order to convert op + // (as separate arrays op.real and op.imag) into cuAmp* + cuAmp* elems = qureg.cuStateVec; + for (long long int i=0; i t(targs, targs + op.numQubits); + custatevec_applyDiagonal(qureg, {}, t, elems); +} + +void statevec_applyDiagonalOp(Qureg qureg, DiagonalOp op) +{ + thrust::transform( + getStartDiagonalOpAmpIter(op), getEndDiagonalOpAmpIter(op), + getStartPtr(qureg), getStartPtr(qureg), // both are begin iters + thrust::multiplies()); +} + +void densmatr_applyDiagonalOp(Qureg qureg, DiagonalOp op) +{ + auto startIndIter = thrust::make_counting_iterator(0); + auto endIndIter = startIndIter + qureg.numAmpsTotal; + auto functor = functor_multDiagOntoDensMatr(qureg.deviceCuStateVec, op); + thrust::for_each(startIndIter, endIndIter, functor); +} + + + +/* + * DECOHERENCE + */ + +void densmatr_mixDensityMatrix(Qureg combineQureg, qreal otherProb, Qureg otherQureg) +{ + Complex facOut = {.real=(1-otherProb), .imag=0}; + Complex facOther = {.real=otherProb, .imag=0}; + Complex zero = {.real=0, .imag=0}; + + statevec_setWeightedQureg(facOther, otherQureg, zero, otherQureg, facOut, combineQureg); +} + +void densmatr_mixDephasing(Qureg qureg, int targetQubit, qreal dephase) +{ + cuAmp a = TO_CU_AMP(1, 0); + cuAmp b = TO_CU_AMP(1-dephase, 0); // dephase = 2*prob + cuAmp elems[] = {a, b, b, a}; + + std::vector targs{targetQubit, targetQubit + qureg.numQubitsRepresented}; + custatevec_applyDiagonal(qureg, {}, targs, elems); +} + +void densmatr_mixTwoQubitDephasing(Qureg qureg, int qb1, int qb2, qreal dephase) +{ + // this function effects the two-qubit dephasing on a density matrix, via the + // four-qubit diagonal superoperator on a Choi vector. The 16 elements of the + // diagonal have only two unique entries; 1 and 1-2*dephase/3. It's conceivable + // that a bespoke kernel could be faster, but likely by little. + + cuAmp a = TO_CU_AMP(1, 0); + cuAmp b = TO_CU_AMP(1 - dephase, 0); // dephase = 4*prob/3 + cuAmp elems[] = {a, b, b, b, b, a, b, b, b, b, a, b, b, b, b, a}; + + int shift = qureg.numQubitsRepresented; + std::vector targs{qb1, qb2, qb1 + shift, qb2 + shift}; + custatevec_applyDiagonal(qureg, {}, targs, elems); +} + +void densmatr_mixDepolarising(Qureg qureg, int targetQubit, qreal depol) +{ + // this function effects depolarising as a dense two-qubit superoperator + // on a Choi vector, where only 6 of the 16 elements are non-zero. This is + // potentially wasteful, and a bespoke kernel could be faster, leveraging + // QuEST's existing GPU code (or the optimised algorithm in the "distributed" + // manuscript). 
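+    //
+    // for reference, the 4x4 matrix below is the superoperator (acting on the
+    // vectorised density matrix) of the single-qubit depolarising channel
+    // rho -> (1-p) rho + (p/3)(X rho X + Y rho Y + Z rho Z), with p = 3*depol/4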
+ + // depol = (4*prob)/3.0 + cuAmp a = TO_CU_AMP(depol/2., 0); // 2*prob/3 + cuAmp b = TO_CU_AMP(1 - depol/2., 0); // 1-2*prob/3 + cuAmp c = TO_CU_AMP(1 - depol, 0); // 1-4*prob/3 + cuAmp z = TO_CU_AMP(0, 0); // 0 + + cuMatr matr{ + b, z, z, a, + z, c, z, z, + z, z, c, z, + a, z, z, b + }; + std::vector targs{ targetQubit, targetQubit + qureg.numQubitsRepresented }; + custatevec_applyMatrix(qureg, {}, targs, matr); +} + +void densmatr_mixDamping(Qureg qureg, int qb, qreal prob) +{ + // this function effects damping as a dense two-qubit superoperator + // on a Choi vector, where only 5 of the 16 elements are non-zero. This is + // potentially wasteful, and a bespoke kernel could be faster, leveraging + // QuEST's existing GPU code (or the optimised algorithm in the "distributed" + // manuscript). + + cuAmp w = TO_CU_AMP(1, 0); + cuAmp z = TO_CU_AMP(0, 0); + cuAmp p = TO_CU_AMP(prob, 0); + cuAmp a = TO_CU_AMP(sqrt(1 - prob), 0); + cuAmp b = TO_CU_AMP(1-prob, 0); + + cuMatr matr{ + w, z, z, p, + z, a, z, z, + z, z, a, z, + z, z, z, b + }; + std::vector targs{qb, qb + qureg.numQubitsRepresented}; + custatevec_applyMatrix(qureg, {}, targs, matr); +} + + + +/* + * CALCULATIONS + */ + +qreal densmatr_calcTotalProb(Qureg qureg) +{ + long long int diagStride = 1 + (1LL << qureg.numQubitsRepresented); + + cuAmp sum = thrust::reduce( + getStartStridedAmpIter(qureg, diagStride), + getEndStridedAmpIter(qureg, diagStride), + TO_CU_AMP(0, 0)); + + return cuAmpReal(sum); +} + +qreal statevec_calcTotalProb(Qureg qureg) +{ + qreal abs2sum0; + qreal abs2sum1; + int basisBits[] = {0}; + int numBasisBits = 1; + + custatevecAbs2SumOnZBasis( + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + &abs2sum0, &abs2sum1, basisBits, numBasisBits); + + return abs2sum0 + abs2sum1; +} + +qreal statevec_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome) +{ + // we only ever compute prob(outcome=0) as per the QuEST API's limitation + qreal prob0; + qreal* prob1 = nullptr; + + int basisBits[] = {measureQubit}; + int numBasisBits = 1; + + custatevecAbs2SumOnZBasis( + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + &prob0, prob1, basisBits, numBasisBits); + + return (outcome == 0)? prob0 : (1-prob0); +} + +qreal densmatr_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome) +{ + long long int blockSize = (1LL << measureQubit); + long long int diagStride = (1LL << qureg.numQubitsRepresented) + 1; + + // we could set this to outcome to sum the outcome=1 amps directly, but the + // QuEST API specifies that the outcome=0 amps are always summed instead. + int useOffsetBlocks = 0; + + cuAmp sum = thrust::reduce( + getStartBlockStridedAmpIter(qureg, blockSize, useOffsetBlocks, diagStride), + getEndBlockStridedAmpIter(qureg, blockSize, useOffsetBlocks, diagStride), + TO_CU_AMP(0, 0)); + + qreal prob0 = cuAmpReal(sum); + return (outcome == 0)? 
prob0 : (1-prob0); +} + +qreal densmatr_calcInnerProduct(Qureg a, Qureg b) +{ + cuAmp prod = thrust::inner_product( + getStartPtr(a), getEndPtr(a), getStartPtr(b), + TO_CU_AMP(0,0), thrust::plus(), functor_prodOfConj()); + + return cuAmpReal(prod); +} + +Complex statevec_calcInnerProduct(Qureg bra, Qureg ket) +{ + cuAmp prod = thrust::inner_product( + getStartPtr(bra), getEndPtr(bra), getStartPtr(ket), + TO_CU_AMP(0,0), thrust::plus(), functor_prodOfConj()); + + return (Complex) {.real=cuAmpReal(prod), .imag=cuAmpImag(prod)}; +} + +qreal densmatr_calcFidelity(Qureg qureg, Qureg pureState) +{ + // create iterator f(i) = sum_j qureg_ij pureState_j + auto functor = functor_matrixColumnDotVector( + qureg.deviceCuStateVec, pureState.deviceCuStateVec, // both init iters + pureState.numAmpsTotal); + auto rawIndIter = thrust::make_counting_iterator(0); + auto prodAmpIter = thrust::make_transform_iterator(rawIndIter, functor); + + // compute sum_i conj(pureState_i) * f(i) + cuAmp prod = thrust::inner_product( + getStartPtr(pureState), getEndPtr(pureState), prodAmpIter, + TO_CU_AMP(0,0), thrust::plus(), functor_prodOfConj()); + + qreal prodRe = cuAmpReal(prod); + return prodRe; +} + +qreal densmatr_calcHilbertSchmidtDistance(Qureg a, Qureg b) +{ + cuAmp trace = thrust::inner_product( + getStartPtr(a), getEndPtr(a), getStartPtr(b), + TO_CU_AMP(0,0), thrust::plus(), functor_absOfDif()); + + qreal dist = sqrt(cuAmpReal(trace)); + return dist; +} + +qreal densmatr_calcPurity(Qureg qureg) +{ + qreal sumOfAbsSquared = thrust::transform_reduce( + getStartPtr(qureg), getEndPtr(qureg), functor_absSquared(), + 0., thrust::plus()); + + return sumOfAbsSquared; +} + +Complex statevec_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) +{ + cuAmp prod = thrust::inner_product( + getStartPtr(qureg), getEndPtr(qureg), getStartDiagonalOpAmpIter(op), + TO_CU_AMP(0,0), thrust::plus(), functor_prodOfAbsSquared()); + + return (Complex) {.real=cuAmpReal(prod), .imag=cuAmpImag(prod)}; +} + +Complex densmatr_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) +{ + long long int diagStride = 1 + (1LL << qureg.numQubitsRepresented); + + cuAmp prod = thrust::inner_product( + getStartStridedAmpIter(qureg, diagStride), + getEndStridedAmpIter(qureg, diagStride), + getStartDiagonalOpAmpIter(op), + TO_CU_AMP(0,0), thrust::plus(), thrust::multiplies()); + + return (Complex) {.real=cuAmpReal(prod), .imag=cuAmpImag(prod)}; +} + + + +/* + * REDUCTIONS + */ + +void statevec_collapseToKnownProbOutcome(Qureg qureg, int measureQubit, int outcome, qreal outcomeProb) +{ + int basisBits[] = {measureQubit}; + + custatevecCollapseOnZBasis( + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + outcome, basisBits, 1, outcomeProb); +} + +void densmatr_collapseToKnownProbOutcome(Qureg qureg, int measureQubit, int outcome, qreal outcomeProb) +{ + auto functor = functor_collapseDensMatrToOutcome( + qureg.deviceCuStateVec, qureg.numQubitsRepresented, + outcome, outcomeProb, measureQubit); + + auto startIndIter = thrust::make_counting_iterator(0); + auto endIndIter = startIndIter + qureg.numAmpsTotal; + thrust::for_each(startIndIter, endIndIter, functor); +} + + + +#ifdef __cplusplus +} +#endif diff --git a/QuEST/src/GPU/QuEST_gpu.cu b/QuEST/src/GPU/QuEST_gpu.cu index 3b2138a4c..aa2bc085d 100755 --- a/QuEST/src/GPU/QuEST_gpu.cu +++ b/QuEST/src/GPU/QuEST_gpu.cu @@ -1,17 +1,17 @@ // Distributed under MIT licence. 
See https://github.com/QuEST-Kit/QuEST/blob/master/LICENCE.txt for details /** @file - * An implementation of the backend in ../QuEST_internal.h for a GPU environment. + * A custom-kernel implementation of the backend in ../QuEST_internal.h for a GPU environment. * * @author Ania Brown * @author Tyson Jones */ # include "QuEST.h" +# include "QuEST_gpu_common.h" # include "QuEST_precision.h" # include "QuEST_validation.h" -# include "QuEST_internal.h" // purely to resolve getQuESTDefaultSeedKey -# include "mt19937ar.h" +# include "QuEST_internal.h" # include # include @@ -156,6 +156,25 @@ extern "C" { #endif + +QuESTEnv createQuESTEnv(void) { + validateGPUExists(GPUExists(), __func__); + + QuESTEnv env; + env.rank=0; + env.numRanks=1; + + env.seeds = NULL; + env.numSeeds = 0; + seedQuESTDefault(&env); + + return env; +} + +void destroyQuESTEnv(QuESTEnv env){ + free(env.seeds); +} + void statevec_setAmps(Qureg qureg, long long int startInd, qreal* reals, qreal* imags, long long int numAmps) { cudaDeviceSynchronize(); @@ -171,7 +190,6 @@ void statevec_setAmps(Qureg qureg, long long int startInd, qreal* reals, qreal* cudaMemcpyHostToDevice); } - /** works for both statevectors and density matrices */ void statevec_cloneQureg(Qureg targetQureg, Qureg copyQureg) { @@ -189,36 +207,6 @@ void statevec_cloneQureg(Qureg targetQureg, Qureg copyQureg) { cudaMemcpyDeviceToDevice); } -__global__ void densmatr_initPureStateKernel( - long long int numPureAmps, - qreal *targetVecReal, qreal *targetVecImag, - qreal *copyVecReal, qreal *copyVecImag) -{ - // this is a particular index of the pure copyQureg - long long int index = blockIdx.x*blockDim.x + threadIdx.x; - if (index>=numPureAmps) return; - - qreal realRow = copyVecReal[index]; - qreal imagRow = copyVecImag[index]; - for (long long int col=0; col < numPureAmps; col++) { - qreal realCol = copyVecReal[col]; - qreal imagCol = - copyVecImag[col]; // minus for conjugation - targetVecReal[col*numPureAmps + index] = realRow*realCol - imagRow*imagCol; - targetVecImag[col*numPureAmps + index] = realRow*imagCol + imagRow*realCol; - } -} - -void densmatr_initPureState(Qureg targetQureg, Qureg copyQureg) -{ - int threadsPerCUDABlock, CUDABlocks; - threadsPerCUDABlock = 128; - CUDABlocks = ceil((qreal)(copyQureg.numAmpsPerChunk)/threadsPerCUDABlock); - densmatr_initPureStateKernel<<>>( - copyQureg.numAmpsPerChunk, - targetQureg.deviceStateVec.real, targetQureg.deviceStateVec.imag, - copyQureg.deviceStateVec.real, copyQureg.deviceStateVec.imag); -} - __global__ void densmatr_initPlusStateKernel(long long int stateVecSize, qreal probFactor, qreal *stateVecReal, qreal *stateVecImag){ long long int index; @@ -328,178 +316,6 @@ void statevec_destroyQureg(Qureg qureg, QuESTEnv env) cudaFree(qureg.secondLevelReduction); } -DiagonalOp agnostic_createDiagonalOp(int numQubits, QuESTEnv env) { - - DiagonalOp op; - op.numQubits = numQubits; - op.numElemsPerChunk = (1LL << numQubits) / env.numRanks; - op.chunkId = env.rank; - op.numChunks = env.numRanks; - - // allocate CPU memory (initialised to zero) - op.real = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal)); - op.imag = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal)); - // @TODO no handling of rank>1 allocation (no distributed GPU) - - // check cpu memory allocation was successful - validateDiagonalOpAllocation(&op, env, __func__); - - // allocate GPU memory - size_t arrSize = op.numElemsPerChunk * sizeof(qreal); - cudaMalloc(&(op.deviceOperator.real), arrSize); - cudaMalloc(&(op.deviceOperator.imag), arrSize); - 
- // check gpu memory allocation was successful - validateDiagonalOpGPUAllocation(&op, env, __func__); - - // initialise GPU memory to zero - cudaMemset(op.deviceOperator.real, 0, arrSize); - cudaMemset(op.deviceOperator.imag, 0, arrSize); - - return op; -} - -void agnostic_destroyDiagonalOp(DiagonalOp op) { - free(op.real); - free(op.imag); - cudaFree(op.deviceOperator.real); - cudaFree(op.deviceOperator.imag); -} - -void agnostic_syncDiagonalOp(DiagonalOp op) { - cudaDeviceSynchronize(); - size_t mem_elems = op.numElemsPerChunk * sizeof *op.real; - cudaMemcpy(op.deviceOperator.real, op.real, mem_elems, cudaMemcpyHostToDevice); - cudaMemcpy(op.deviceOperator.imag, op.imag, mem_elems, cudaMemcpyHostToDevice); -} - -__global__ void agnostic_initDiagonalOpFromPauliHamilKernel( - DiagonalOp op, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms -) { - // each thread processes one diagonal element - long long int elemInd = blockIdx.x*blockDim.x + threadIdx.x; - if (elemInd >= op.numElemsPerChunk) - return; - - qreal elem = 0; - - // elem is (+-) every coefficient, with sign determined by parity - for (int t=0; t>>( - op, d_pauliCodes, d_termCoeffs, hamil.numSumTerms); - - // copy populated operator into to RAM - cudaDeviceSynchronize(); - size_t mem_elems = op.numElemsPerChunk * sizeof *op.real; - cudaMemcpy(op.real, op.deviceOperator.real, mem_elems, cudaMemcpyDeviceToHost); - cudaMemcpy(op.imag, op.deviceOperator.imag, mem_elems, cudaMemcpyDeviceToHost); - - cudaFree(d_pauliCodes); - cudaFree(d_termCoeffs); -} - -int GPUExists(void){ - int deviceCount, device; - int gpuDeviceCount = 0; - struct cudaDeviceProp properties; - cudaError_t cudaResultCode = cudaGetDeviceCount(&deviceCount); - if (cudaResultCode != cudaSuccess) deviceCount = 0; - /* machines with no GPUs can still report one emulation device */ - for (device = 0; device < deviceCount; ++device) { - cudaGetDeviceProperties(&properties, device); - if (properties.major != 9999) { /* 9999 means emulation only */ - ++gpuDeviceCount; - } - } - if (gpuDeviceCount) return 1; - else return 0; -} - -QuESTEnv createQuESTEnv(void) { - - validateGPUExists(GPUExists(), __func__); - - QuESTEnv env; - env.rank=0; - env.numRanks=1; - - env.seeds = NULL; - env.numSeeds = 0; - seedQuESTDefault(&env); - - return env; -} - -void syncQuESTEnv(QuESTEnv env){ - cudaDeviceSynchronize(); -} - -int syncQuESTSuccess(int successCode){ - return successCode; -} - -void destroyQuESTEnv(QuESTEnv env){ - free(env.seeds); -} - -void reportQuESTEnv(QuESTEnv env){ - printf("EXECUTION ENVIRONMENT:\n"); - printf("Running locally on one node with GPU\n"); - printf("Number of ranks is %d\n", env.numRanks); -# ifdef _OPENMP - printf("OpenMP enabled\n"); - printf("Number of threads available is %d\n", omp_get_max_threads()); -# else - printf("OpenMP disabled\n"); -# endif -} - -void getEnvironmentString(QuESTEnv env, char str[200]){ - - // OpenMP can be hybridised with GPU in future, so this check is safe and worthwhile - int ompStatus=0; - int numThreads=1; -# ifdef _OPENMP - ompStatus=1; - numThreads=omp_get_max_threads(); -# endif - - // there is no reporting of CUDA cores/threads/blocks currently (since non-trivial) - sprintf(str, "CUDA=1 OpenMP=%d MPI=0 threads=%d ranks=1", ompStatus, numThreads); -} - void copyStateToGPU(Qureg qureg) { if (DEBUG) printf("Copying data to GPU\n"); @@ -542,35 +358,6 @@ void statevec_copySubstateFromGPU(Qureg qureg, long long int startInd, long long if (DEBUG) printf("Finished copying data from GPU\n"); } -/** Print the 
current state vector of probability amplitudes for a set of qubits to standard out. - For debugging purposes. Each rank should print output serially. Only print output for systems <= 5 qubits - */ -void statevec_reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank){ - long long int index; - int rank; - copyStateFromGPU(qureg); - if (qureg.numQubitsInStateVec<=5){ - for (rank=0; rank=stateVecSize) return; - - qreal normFactor = 1.0/sqrt((qreal)stateVecSize/2); - bit = extractBit(qubitId, index); - if (bit==outcome) { - stateVecReal[index] = normFactor; - stateVecImag[index] = 0.0; - } else { - stateVecReal[index] = 0.0; - stateVecImag[index] = 0.0; - } -} - -void statevec_initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome) -{ - int threadsPerCUDABlock, CUDABlocks; - threadsPerCUDABlock = 128; - CUDABlocks = ceil((qreal)(qureg->numAmpsPerChunk)/threadsPerCUDABlock); - statevec_initStateOfSingleQubitKernel<<>>(qureg->numAmpsPerChunk, qureg->deviceStateVec.real, qureg->deviceStateVec.imag, qubitId, outcome); -} - -// returns 1 if successful, else 0 -int statevec_initStateFromSingleFile(Qureg *qureg, char filename[200], QuESTEnv env){ - long long int chunkSize, stateVecSize; - long long int indexInChunk, totalIndex; - - chunkSize = qureg->numAmpsPerChunk; - stateVecSize = chunkSize*qureg->numChunks; - - qreal *stateVecReal = qureg->stateVec.real; - qreal *stateVecImag = qureg->stateVec.imag; - - FILE *fp; - char line[200]; - - fp = fopen(filename, "r"); - if (fp == NULL) - return 0; - - indexInChunk = 0; totalIndex = 0; - while (fgets(line, sizeof(char)*200, fp) != NULL && totalIndexchunkId){ - sscanf(line, REAL_SPECIFIER ", " REAL_SPECIFIER, &(stateVecReal[indexInChunk]), - &(stateVecImag[indexInChunk])); - indexInChunk += 1; - } - totalIndex += 1; - } - } - fclose(fp); - copyStateToGPU(*qureg); - - // indicate success - return 1; -} - -int statevec_compareStates(Qureg mq1, Qureg mq2, qreal precision){ - qreal diff; - int chunkSize = mq1.numAmpsPerChunk; - - copyStateFromGPU(mq1); - copyStateFromGPU(mq2); - - for (int i=0; iprecision) return 0; - diff = mq1.stateVec.imag[i] - mq2.stateVec.imag[i]; - if (diff<0) diff *= -1; - if (diff>precision) return 0; - } - return 1; -} - __global__ void statevec_compactUnitaryKernel (Qureg qureg, int rotQubit, Complex alpha, Complex beta){ // ----- sizes long long int sizeBlock, // size of blocks @@ -2171,131 +1877,6 @@ qreal densmatr_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome) return outcomeProb; } -// atomicAdd on floats/doubles isn't available on <6 CC devices, so we add it ourselves -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 -#else -static __inline__ __device__ double atomicAdd(double* address, double val) -{ - unsigned long long int* address_as_ull = (unsigned long long int*) address; - unsigned long long int old = *address_as_ull, assumed; - - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); - - // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) - } while (assumed != old); - - return __longlong_as_double(old); -} -#endif - -__global__ void statevec_calcProbOfAllOutcomesKernel( - qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits -) { - // each thread handles one amplitude (all amplitudes are involved) - long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x; - if (ampInd >= qureg.numAmpsTotal) return; - - qreal prob = ( - 
qureg.deviceStateVec.real[ampInd]*qureg.deviceStateVec.real[ampInd] + - qureg.deviceStateVec.imag[ampInd]*qureg.deviceStateVec.imag[ampInd]); - - // each amplitude contributes to one outcome - long long int outcomeInd = 0; - for (int q=0; q>>( - d_outcomeProbs, qureg, d_qubits, numQubits); - - // copy outcomeProbs from GPU memory - cudaMemcpy(outcomeProbs, d_outcomeProbs, mem_outcomeProbs, cudaMemcpyDeviceToHost); - - // free GPU memory - cudaFree(d_qubits); - cudaFree(d_outcomeProbs); -} - -__global__ void densmatr_calcProbOfAllOutcomesKernel( - qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits -) { - // each thread handles one diagonal amplitude - long long int diagInd = blockIdx.x*blockDim.x + threadIdx.x; - long long int numDiags = (1LL << qureg.numQubitsRepresented); - if (diagInd >= numDiags) return; - - long long int flatInd = (1 + numDiags)*diagInd; - qreal prob = qureg.deviceStateVec.real[flatInd]; // im[flatInd] assumed ~ 0 - - // each diagonal amplitude contributes to one outcome - long long int outcomeInd = 0; - for (int q=0; q>>( - d_outcomeProbs, qureg, d_qubits, numQubits); - - // copy outcomeProbs from GPU memory - cudaMemcpy(outcomeProbs, d_outcomeProbs, mem_outcomeProbs, cudaMemcpyDeviceToHost); - - // free GPU memory - cudaFree(d_qubits); - cudaFree(d_outcomeProbs); -} - /** computes Tr(conjTrans(a) b) = sum of (a_ij^* b_ij), which is a real number */ __global__ void densmatr_calcInnerProductKernel( Qureg a, Qureg b, long long int numTermsToSum, qreal* reducedArray @@ -3080,74 +2661,6 @@ void densmatr_mixDamping(Qureg qureg, int targetQubit, qreal damping) { part1, part2, part3, bothBits); } -/** Called once for every 16 amplitudes */ -__global__ void densmatr_mixTwoQubitDepolarisingKernel( - qreal depolLevel, qreal* vecReal, qreal *vecImag, long long int numAmpsToVisit, - long long int part1, long long int part2, long long int part3, - long long int part4, long long int part5, - long long int rowCol1, long long int rowCol2) -{ - long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x; - if (scanInd >= numAmpsToVisit) return; - - // index of |..0..0..><..0..0| - long long int ind00 = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2) + ((scanInd&part4)<<3) + ((scanInd&part5)<<4); - long long int ind01 = ind00 + rowCol1; - long long int ind10 = ind00 + rowCol2; - long long int ind11 = ind00 + rowCol1 + rowCol2; - - qreal realAvDepol = depolLevel * 0.25 * ( - vecReal[ind00] + vecReal[ind01] + vecReal[ind10] + vecReal[ind11]); - qreal imagAvDepol = depolLevel * 0.25 * ( - vecImag[ind00] + vecImag[ind01] + vecImag[ind10] + vecImag[ind11]); - - qreal retain = 1 - depolLevel; - vecReal[ind00] *= retain; vecImag[ind00] *= retain; - vecReal[ind01] *= retain; vecImag[ind01] *= retain; - vecReal[ind10] *= retain; vecImag[ind10] *= retain; - vecReal[ind11] *= retain; vecImag[ind11] *= retain; - - vecReal[ind00] += realAvDepol; vecImag[ind00] += imagAvDepol; - vecReal[ind01] += realAvDepol; vecImag[ind01] += imagAvDepol; - vecReal[ind10] += realAvDepol; vecImag[ind10] += imagAvDepol; - vecReal[ind11] += realAvDepol; vecImag[ind11] += imagAvDepol; -} - -void densmatr_mixTwoQubitDepolarising(Qureg qureg, int qubit1, int qubit2, qreal depolLevel) { - - if (depolLevel == 0) - return; - - // assumes qubit2 > qubit1 - - densmatr_mixTwoQubitDephasing(qureg, qubit1, qubit2, depolLevel); - - int rowQubit1 = qubit1 + qureg.numQubitsRepresented; - int rowQubit2 = qubit2 + qureg.numQubitsRepresented; - - long long int colBit1 = 1LL << qubit1; - long long int rowBit1 = 
1LL << rowQubit1; - long long int colBit2 = 1LL << qubit2; - long long int rowBit2 = 1LL << rowQubit2; - - long long int rowCol1 = colBit1 | rowBit1; - long long int rowCol2 = colBit2 | rowBit2; - - long long int numAmpsToVisit = qureg.numAmpsPerChunk/16; - long long int part1 = colBit1 - 1; - long long int part2 = (colBit2 >> 1) - colBit1; - long long int part3 = (rowBit1 >> 2) - (colBit2 >> 1); - long long int part4 = (rowBit2 >> 3) - (rowBit1 >> 2); - long long int part5 = numAmpsToVisit - (rowBit2 >> 3); - - int threadsPerCUDABlock, CUDABlocks; - threadsPerCUDABlock = 128; - CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock); - densmatr_mixTwoQubitDepolarisingKernel<<>>( - depolLevel, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit, - part1, part2, part3, part4, part5, rowCol1, rowCol2); -} - __global__ void statevec_setWeightedQuregKernel(Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out) { long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x; @@ -3566,569 +3079,6 @@ Complex densmatr_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) { return expecVal; } -void agnostic_setDiagonalOpElems(DiagonalOp op, long long int startInd, qreal* real, qreal* imag, long long int numElems) { - - // update both RAM and VRAM, for consistency - memcpy(&op.real[startInd], real, numElems * sizeof(qreal)); - memcpy(&op.imag[startInd], imag, numElems * sizeof(qreal)); - - cudaDeviceSynchronize(); - cudaMemcpy( - op.deviceOperator.real + startInd, - real, - numElems * sizeof(*(op.deviceOperator.real)), - cudaMemcpyHostToDevice); - cudaMemcpy( - op.deviceOperator.imag + startInd, - imag, - numElems * sizeof(*(op.deviceOperator.imag)), - cudaMemcpyHostToDevice); -} - -__global__ void statevec_applyPhaseFuncOverridesKernel( - Qureg qureg, int* qubits, int numQubits, enum bitEncoding encoding, - qreal* coeffs, qreal* exponents, int numTerms, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - int conj -) { - long long int index = blockIdx.x*blockDim.x + threadIdx.x; - if (index>=qureg.numAmpsPerChunk) return; - - // determine global amplitude index (non-distributed, so it's just local index) - long long int globalAmpInd = index; - - // determine phase index of {qubits} - long long int phaseInd = 0LL; - if (encoding == UNSIGNED) { - for (int q=0; q>>( - qureg, d_qubits, numQubits, encoding, - d_coeffs, d_exponents, numTerms, - d_overrideInds, d_overridePhases, numOverrides, - conj); - - // cleanup device memory - cudaFree(d_qubits); - cudaFree(d_coeffs); - cudaFree(d_exponents); - cudaFree(d_overrideInds); - cudaFree(d_overridePhases); -} - -__global__ void statevec_applyMultiVarPhaseFuncOverridesKernel( - Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, - qreal* coeffs, qreal* exponents, int* numTermsPerReg, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - long long int *phaseInds, - int conj -) { - long long int index = blockIdx.x*blockDim.x + threadIdx.x; - if (index>=qureg.numAmpsPerChunk) return; - - // determine global amplitude index (non-distributed, so it's just local index) - long long int globalAmpInd = index; - - /* - * each thread needs to write to a local: - * long long int phaseInds[numRegs]; - * but instead has access to shared array phaseInds, with below stride and offset - */ - size_t stride = gridDim.x*blockDim.x; - size_t offset = blockIdx.x*blockDim.x + threadIdx.x; - - // determine phase indices - int flatInd = 0; - if (encoding == 
UNSIGNED) { - for (int r=0; r>>( - qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding, - d_coeffs, d_exponents, d_numTermsPerReg, - d_overrideInds, d_overridePhases, numOverrides, - d_phaseInds, - conj); - - // free device memory - cudaFree(d_qubits); - cudaFree(d_coeffs); - cudaFree(d_exponents); - cudaFree(d_numQubitsPerReg); - cudaFree(d_numTermsPerReg); - cudaFree(d_overrideInds); - cudaFree(d_overridePhases); - cudaFree(d_phaseInds); -} - -__global__ void statevec_applyParamNamedPhaseFuncOverridesKernel( - Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, - enum phaseFunc phaseFuncName, qreal* params, int numParams, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - long long int* phaseInds, - int conj -) { - long long int index = blockIdx.x*blockDim.x + threadIdx.x; - if (index>=qureg.numAmpsPerChunk) return; - - // determine global amplitude index (non-distributed, so it's just local index) - long long int globalAmpInd = index; - - /* - * each thread needs to write to a local: - * long long int phaseInds[numRegs]; - * but instead has access to shared array phaseInds, with below stride and offset - */ - size_t stride = gridDim.x*blockDim.x; - size_t offset = blockIdx.x*blockDim.x + threadIdx.x; - - // determine phase indices - if (encoding == UNSIGNED) { - int flatInd = 0; - for (int r=0; r 0) cudaMalloc(&d_params, mem_params); - - // copy function args into GPU memory - cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice); - cudaMemcpy(d_numQubitsPerReg, numQubitsPerReg, mem_numQubitsPerReg, cudaMemcpyHostToDevice); - cudaMemcpy(d_overrideInds, overrideInds, mem_overrideInds, cudaMemcpyHostToDevice); - cudaMemcpy(d_overridePhases, overridePhases, mem_overridePhases, cudaMemcpyHostToDevice); - if (numParams > 0) - cudaMemcpy(d_params, params, mem_params, cudaMemcpyHostToDevice); - - int threadsPerCUDABlock = 128; - int CUDABlocks = ceil((qreal) qureg.numAmpsPerChunk / threadsPerCUDABlock); - - // allocate thread-local working space {phaseInds} - long long int *d_phaseInds; - size_t gridSize = (size_t) threadsPerCUDABlock * CUDABlocks; - cudaMalloc(&d_phaseInds, numRegs*gridSize * sizeof *d_phaseInds); - - // call kernel - statevec_applyParamNamedPhaseFuncOverridesKernel<<>>( - qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding, - phaseFuncName, d_params, numParams, - d_overrideInds, d_overridePhases, numOverrides, - d_phaseInds, - conj); - - // free device memory - cudaFree(d_qubits); - cudaFree(d_numQubitsPerReg); - cudaFree(d_overrideInds); - cudaFree(d_overridePhases); - cudaFree(d_phaseInds); - if (numParams > 0) - cudaFree(d_params); -} - -__global__ void densmatr_setQuregToPauliHamilKernel( - Qureg qureg, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms -) { - long long int n = blockIdx.x*blockDim.x + threadIdx.x; - if (n>=qureg.numAmpsPerChunk) return; - - // flattened {I,X,Y,Z} matrix elements, where [k] = [p][i][j] - const int pauliRealElems[] = { 1,0, 0,1, 0,1, 1,0, 0,0, 0,0, 1,0, 0,-1 }; - const int pauliImagElems[] = { 0,0, 0,0, 0,0, 0,0, 0,-1,1,0, 0,0, 0,0 }; - - // |n> = |c>|r> - const int numQubits = qureg.numQubitsRepresented; - const long long int r = n & ((1LL << numQubits) - 1); - const long long int c = n >> numQubits; - - // new amplitude of |n> - qreal elemRe = 0; - qreal elemIm = 0; - - for (long long int t=0; t> q) & 1; - int j = (c >> q) & 1; - int p = (int) pauliCodes[pInd++]; - int k = (p<<2) + (i<<1) + j; - int pauliRe = pauliRealElems[k]; - int pauliIm = 
pauliImagElems[k]; - - // kron *= pauli - int tmp = (pauliRe*kronRe) - (pauliIm*kronIm); - kronIm = (pauliRe*kronIm) + (pauliIm*kronRe); - kronRe = tmp; - } - - // elem = sum_t coeffs[t] pauliKronecker[r][c] - elemRe += termCoeffs[t] * kronRe; - elemIm += termCoeffs[t] * kronIm; - } - - // overwrite the density matrix entry - qureg.deviceStateVec.real[n] = elemRe; - qureg.deviceStateVec.imag[n] = elemIm; -} - -void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil) { - - // copy hamil into GPU memory - enum pauliOpType* d_pauliCodes; - size_t mem_pauliCodes = hamil.numSumTerms * hamil.numQubits * sizeof *d_pauliCodes; - cudaMalloc(&d_pauliCodes, mem_pauliCodes); - cudaMemcpy(d_pauliCodes, hamil.pauliCodes, mem_pauliCodes, cudaMemcpyHostToDevice); - - qreal* d_termCoeffs; - size_t mem_termCoeffs = hamil.numSumTerms * sizeof *d_termCoeffs; - cudaMalloc(&d_termCoeffs, mem_termCoeffs); - cudaMemcpy(d_termCoeffs, hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice); - - int numThreadsPerBlock = 128; - int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) numThreadsPerBlock); - densmatr_setQuregToPauliHamilKernel<<>>( - qureg, d_pauliCodes, d_termCoeffs, hamil.numSumTerms); - - // free tmp GPU memory - cudaFree(d_pauliCodes); - cudaFree(d_termCoeffs); -} - -void seedQuEST(QuESTEnv *env, unsigned long int *seedArray, int numSeeds) { - - // free existing seed array, if exists - if (env->seeds != NULL) - free(env->seeds); - - // record keys in permanent heap - env->seeds = (unsigned long int*) malloc(numSeeds * sizeof *(env->seeds)); - for (int i=0; iseeds)[i] = seedArray[i]; - env->numSeeds = numSeeds; - - // pass keys to Mersenne Twister seeder - init_by_array(seedArray, numSeeds); -} - #ifdef __cplusplus diff --git a/QuEST/src/GPU/QuEST_gpu_common.cu b/QuEST/src/GPU/QuEST_gpu_common.cu new file mode 100644 index 000000000..1e0a2ea1c --- /dev/null +++ b/QuEST/src/GPU/QuEST_gpu_common.cu @@ -0,0 +1,1158 @@ +// Distributed under MIT licence. 
See https://github.com/QuEST-Kit/QuEST/blob/master/LICENCE.txt for details + +/** @file + * GPU routines which are agnostic to the cuQuantum backend or the custom kernels + * + * @author Tyson Jones + * @author Ania Brown (defined some original functions moved here) + */ + + +# include "QuEST.h" +# include "QuEST_precision.h" +# include "QuEST_validation.h" +# include "QuEST_internal.h" +# include "mt19937ar.h" + +# include +# include +# include + +#ifdef USE_HIP +// Translate CUDA calls into HIP calls +#include "cuda_to_hip.h" +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + + + +/* + * GPU CONFIG + */ + +const int NUM_THREADS_PER_BLOCK = 128; + +__forceinline__ __device__ long long int getThreadInd() { + return blockIdx.x*blockDim.x + threadIdx.x; +} + + + +/* + * BACKEND AGNOSTIC COMPLEX ARITHMETIC + */ + +#define PROD_REAL(aRe, aIm, bRe, bIm) \ + ((aRe)*(bRe) - (aIm)*(bIm)) +#define PROD_IMAG(aRe, aIm, bRe, bIm) \ + ((aRe)*(bIm) + (aIm)*(bRe)) + +#ifdef USE_CUQUANTUM + #define GET_AMP(aRe, aIm, qureg, i) \ + aRe = qureg.deviceCuStateVec[i].x; \ + aIm = qureg.deviceCuStateVec[i].y; +#else + #define GET_AMP(aRe, aIm, qureg, i) \ + aRe = qureg.deviceStateVec.real[i]; \ + aIm = qureg.deviceStateVec.imag[i]; +#endif + +#ifdef USE_CUQUANTUM + #define SET_AMP(qureg, i, aRe, aIm) \ + qureg.deviceCuStateVec[i].x = aRe; \ + qureg.deviceCuStateVec[i].y = aIm; +#else + #define SET_AMP(qureg, i, aRe, aIm) \ + qureg.deviceStateVec.real[i] = aRe; \ + qureg.deviceStateVec.imag[i] = aIm; +#endif + + + +/* + * ENVIRONMENT MANAGEMENT + */ + +int GPUExists(void) { + int deviceCount, device; + int gpuDeviceCount = 0; + struct cudaDeviceProp properties; + cudaError_t cudaResultCode = cudaGetDeviceCount(&deviceCount); + if (cudaResultCode != cudaSuccess) deviceCount = 0; + /* machines with no GPUs can still report one emulation device */ + for (device = 0; device < deviceCount; ++device) { + cudaGetDeviceProperties(&properties, device); + if (properties.major != 9999) { /* 9999 means emulation only */ + ++gpuDeviceCount; + } + } + if (gpuDeviceCount) return 1; + else return 0; +} + +void syncQuESTEnv(QuESTEnv env){ + cudaDeviceSynchronize(); +} + +int syncQuESTSuccess(int successCode){ + return successCode; +} + +void reportQuESTEnv(QuESTEnv env){ + printf("EXECUTION ENVIRONMENT:\n"); + printf("Running locally on one node with GPU "); +# ifdef USE_CUQUANTUM + printf("via cuQuantum\n"); +# else + printf("via custom kernels\n"); +# endif + printf("Number of ranks is %d\n", env.numRanks); +# ifdef _OPENMP + printf("OpenMP enabled\n"); + printf("Number of threads available is %d\n", omp_get_max_threads()); +# else + printf("OpenMP disabled\n"); +# endif +} + +void getEnvironmentString(QuESTEnv env, char str[200]){ + + // OpenMP can be hybridised with GPU in future, so this check is safe and worthwhile + int ompStatus=0; + int numThreads=1; +# ifdef _OPENMP + ompStatus=1; + numThreads=omp_get_max_threads(); +# endif + + // ascertain whether we're using cuQuantum or bespoke kernels + int cuQuantumStatus=0; +# ifdef USE_CUQUANTUM + cuQuantumStatus=1; +# endif + + // there is no reporting of CUDA cores/threads/blocks currently (since non-trivial) + sprintf(str, "CUDA=1 cuQuantum=%d OpenMP=%d MPI=0 threads=%d ranks=1", cuQuantumStatus, ompStatus, numThreads); +} + +void seedQuEST(QuESTEnv *env, unsigned long int *seedArray, int numSeeds) { + + // free existing seed array, if exists + if (env->seeds != NULL) + free(env->seeds); + + // record keys in permanent heap + env->seeds = (unsigned long int*) 
malloc(numSeeds * sizeof *(env->seeds)); + for (int i=0; iseeds)[i] = seedArray[i]; + env->numSeeds = numSeeds; + + // pass keys to Mersenne Twister seeder + init_by_array(seedArray, numSeeds); +} + + + +/* + * DEBUG + */ + +void statevec_reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank){ + + if (qureg.numQubitsInStateVec > 5) { + printf("State reporting disabled for >5 qubits"); + return; + } + + copyStateFromGPU(qureg); + + // distributed GPU not yet supported + if (reportRank != 0) + return; + + printf("Reporting state from rank %d [\n", qureg.chunkId); + printf("real, imag\n"); + + for(long long int i=0; i= pure.numAmpsPerChunk) return; + + qreal aRe, aIm; + GET_AMP( aRe, aIm, pure, pureInd ); + + for (long long int col=0; col>>(dens, pure); +} + +__global__ void densmatr_setQuregToPauliHamilKernel( + Qureg qureg, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms +) { + long long int n = getThreadInd(); + if (n>=qureg.numAmpsPerChunk) return; + + // flattened {I,X,Y,Z} matrix elements, where [k] = [p][i][j] + const int pauliRealElems[] = { 1,0, 0,1, 0,1, 1,0, 0,0, 0,0, 1,0, 0,-1 }; + const int pauliImagElems[] = { 0,0, 0,0, 0,0, 0,0, 0,-1,1,0, 0,0, 0,0 }; + + // |n> = |c>|r> + const int numQubits = qureg.numQubitsRepresented; + const long long int r = n & ((1LL << numQubits) - 1); + const long long int c = n >> numQubits; + + // new amplitude of |n> + qreal elemRe = 0; + qreal elemIm = 0; + + for (long long int t=0; t> q) & 1; + int j = (c >> q) & 1; + int p = (int) pauliCodes[pInd++]; + int k = (p<<2) + (i<<1) + j; + int pauliRe = pauliRealElems[k]; + int pauliIm = pauliImagElems[k]; + + // kron *= pauli + int tmp = (pauliRe*kronRe) - (pauliIm*kronIm); + kronIm = (pauliRe*kronIm) + (pauliIm*kronRe); + kronRe = tmp; + } + + // elem = sum_t coeffs[t] pauliKronecker[r][c] + elemRe += termCoeffs[t] * kronRe; + elemIm += termCoeffs[t] * kronIm; + } + + // overwrite the density matrix entry + SET_AMP( qureg, n, elemRe, elemIm ); +} + +void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil) { + + // copy hamil into GPU memory + enum pauliOpType* d_pauliCodes; + size_t mem_pauliCodes = hamil.numSumTerms * hamil.numQubits * sizeof *d_pauliCodes; + cudaMalloc(&d_pauliCodes, mem_pauliCodes); + cudaMemcpy(d_pauliCodes, hamil.pauliCodes, mem_pauliCodes, cudaMemcpyHostToDevice); + + qreal* d_termCoeffs; + size_t mem_termCoeffs = hamil.numSumTerms * sizeof *d_termCoeffs; + cudaMalloc(&d_termCoeffs, mem_termCoeffs); + cudaMemcpy(d_termCoeffs, hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice); + + int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) NUM_THREADS_PER_BLOCK); + densmatr_setQuregToPauliHamilKernel<<>>( + qureg, d_pauliCodes, d_termCoeffs, hamil.numSumTerms); + + // free tmp GPU memory + cudaFree(d_pauliCodes); + cudaFree(d_termCoeffs); +} + + + +/* + * CALCULATIONS + */ + +// atomicAdd on floats/doubles isn't available on <6 CC devices, so we add it ourselves +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 +#else +static __inline__ __device__ double atomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*) address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __longlong_as_double(old); +} +#endif + +__global__ void 
statevec_calcProbOfAllOutcomesKernel( + qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits +) { + // each thread handles one amplitude (all amplitudes are involved) + long long int ampInd = getThreadInd(); + if (ampInd >= qureg.numAmpsTotal) return; + + qreal ampRe, ampIm; + GET_AMP( ampRe, ampIm, qureg, ampInd ); + qreal prob = ampRe*ampRe + ampIm*ampIm; + + // each amplitude contributes to one outcome + long long int outcomeInd = 0; + for (int q=0; q> qubits[q]) & 1) << q; + + // each thread atomically writes directly to the global output. + // this beat block-heirarchal atomic reductions in both global and shared memory! + atomicAdd(&outcomeProbs[outcomeInd], prob); +} + +void statevec_calcProbOfAllOutcomes(qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits) { + + // copy qubits to GPU memory + int* d_qubits; + size_t mem_qubits = numQubits * sizeof *d_qubits; + cudaMalloc(&d_qubits, mem_qubits); + cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice); + + // create one thread for every amplitude + int numThreadsPerBlock = 128; + int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) numThreadsPerBlock); + + // create global GPU array for outcomeProbs + qreal* d_outcomeProbs; + long long int numOutcomes = (1LL << numQubits); + size_t mem_outcomeProbs = numOutcomes * sizeof *d_outcomeProbs; + cudaMalloc(&d_outcomeProbs, mem_outcomeProbs); + cudaMemset(d_outcomeProbs, 0, mem_outcomeProbs); + + // populate per-block subarrays + statevec_calcProbOfAllOutcomesKernel<<>>( + d_outcomeProbs, qureg, d_qubits, numQubits); + + // copy outcomeProbs from GPU memory + cudaMemcpy(outcomeProbs, d_outcomeProbs, mem_outcomeProbs, cudaMemcpyDeviceToHost); + + // free GPU memory + cudaFree(d_qubits); + cudaFree(d_outcomeProbs); +} + +__global__ void densmatr_calcProbOfAllOutcomesKernel( + qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits +) { + // each thread handles one diagonal amplitude + long long int diagInd = getThreadInd(); + long long int numDiags = (1LL << qureg.numQubitsRepresented); + if (diagInd >= numDiags) return; + + long long int flatInd = (1 + numDiags)*diagInd; + + qreal ampRe, ampIm; + GET_AMP( ampRe, ampIm, qureg, flatInd ); + qreal prob = ampRe + 0*ampIm; // ampIm assumed ~ 0, silence unused warning + + // each diagonal amplitude contributes to one outcome + long long int outcomeInd = 0; + for (int q=0; q> qubits[q]) & 1) << q; + + // each thread atomically writes directly to the global output. + // this beat block-heirarchal atomic reductions in both global and shared memory! 
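+    // (when qreal is double on devices of compute capability < 6.0, this atomicAdd
+    //  resolves to the CAS-based fallback defined earlier in this file)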
+ atomicAdd(&outcomeProbs[outcomeInd], prob); +} + +void densmatr_calcProbOfAllOutcomes(qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits) { + + // copy qubits to GPU memory + int* d_qubits; + size_t mem_qubits = numQubits * sizeof *d_qubits; + cudaMalloc(&d_qubits, mem_qubits); + cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice); + + // create global array, with per-block subarrays + int numThreadsPerBlock = 128; + int numDiags = (1LL << qureg.numQubitsRepresented); + int numBlocks = ceil(numDiags / (qreal) numThreadsPerBlock); + + // create global GPU array for outcomeProbs + qreal* d_outcomeProbs; + long long int numOutcomes = (1LL << numQubits); + size_t mem_outcomeProbs = numOutcomes * sizeof *d_outcomeProbs; + cudaMalloc(&d_outcomeProbs, mem_outcomeProbs); + cudaMemset(d_outcomeProbs, 0, mem_outcomeProbs); + + // populate per-block subarrays + densmatr_calcProbOfAllOutcomesKernel<<>>( + d_outcomeProbs, qureg, d_qubits, numQubits); + + // copy outcomeProbs from GPU memory + cudaMemcpy(outcomeProbs, d_outcomeProbs, mem_outcomeProbs, cudaMemcpyDeviceToHost); + + // free GPU memory + cudaFree(d_qubits); + cudaFree(d_outcomeProbs); +} + + + +/* + * DECOHERENCE + */ + +__global__ void densmatr_mixTwoQubitDepolarisingKernel( + Qureg qureg, qreal depolLevel, long long int part1, long long int part2, + long long int part3, long long int part4, long long int part5, + long long int rowCol1, long long int rowCol2) +{ + long long int scanInd = getThreadInd(); + if (scanInd >= qureg.numAmpsPerChunk) return; + + long long int ind00 = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2) + ((scanInd&part4)<<3) + ((scanInd&part5)<<4); + long long int ind01 = ind00 + rowCol1; + long long int ind10 = ind00 + rowCol2; + long long int ind11 = ind00 + rowCol1 + rowCol2; + + qreal re00, re01, re10, re11; + qreal im00, im01, im10, im11; + GET_AMP( re00, im00, qureg, ind00 ); + GET_AMP( re01, im01, qureg, ind01 ); + GET_AMP( re10, im10, qureg, ind10 ); + GET_AMP( re11, im11, qureg, ind11 ); + + qreal realAvDepol = depolLevel * 0.25 * (re00 + re01 + re10 + re11); + qreal imagAvDepol = depolLevel * 0.25 * (im00 + im01 + im10 + im11); + qreal retain = 1 - depolLevel; + + SET_AMP( qureg, ind00, retain*re00 + realAvDepol, retain*im00 + imagAvDepol ); + SET_AMP( qureg, ind01, retain*re01 + realAvDepol, retain*im01 + imagAvDepol ); + SET_AMP( qureg, ind10, retain*re10 + realAvDepol, retain*im10 + imagAvDepol ); + SET_AMP( qureg, ind11, retain*re11 + realAvDepol, retain*im11 + imagAvDepol ); +} + +void densmatr_mixTwoQubitDepolarising(Qureg qureg, int qubit1, int qubit2, qreal depolLevel) { + + if (depolLevel == 0) + return; + + densmatr_mixTwoQubitDephasing(qureg, qubit1, qubit2, depolLevel); + + // this code is painfully and unnecessarily verbose; it computes index offsets + // using bitwise logic, only to (within the kernel) effect those offsets with + // integer arithmetic. Instead, one might as well obtain the ultimate indices + // directly using bitwise logic within the kernel, passing no offsets at all. + // We defer this to when the insertBits (and other bit twiddles) have been moved + // into QuEST_gpu_common.cu. 
+ + int rowQubit1 = qubit1 + qureg.numQubitsRepresented; + int rowQubit2 = qubit2 + qureg.numQubitsRepresented; + + long long int colBit1 = 1LL << qubit1; + long long int rowBit1 = 1LL << rowQubit1; + long long int colBit2 = 1LL << qubit2; + long long int rowBit2 = 1LL << rowQubit2; + + long long int rowCol1 = colBit1 | rowBit1; + long long int rowCol2 = colBit2 | rowBit2; + + long long int numAmpsToVisit = qureg.numAmpsPerChunk/16; + long long int part1 = colBit1 - 1; + long long int part2 = (colBit2 >> 1) - colBit1; + long long int part3 = (rowBit1 >> 2) - (colBit2 >> 1); + long long int part4 = (rowBit2 >> 3) - (rowBit1 >> 2); + long long int part5 = numAmpsToVisit - (rowBit2 >> 3); + + int CUDABlocks = ceil(numAmpsToVisit / (qreal) NUM_THREADS_PER_BLOCK); + densmatr_mixTwoQubitDepolarisingKernel<<>>( + qureg, depolLevel, part1, part2, part3, part4, part5, rowCol1, rowCol2); +} + + + +/* + * MANAGING NON-QUREG TYPES + */ + +DiagonalOp agnostic_createDiagonalOp(int numQubits, QuESTEnv env) { + + DiagonalOp op; + op.numQubits = numQubits; + op.numElemsPerChunk = (1LL << numQubits) / env.numRanks; + op.chunkId = env.rank; + op.numChunks = env.numRanks; + + // allocate CPU memory (initialised to zero) + op.real = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal)); + op.imag = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal)); + + // check cpu memory allocation was successful + validateDiagonalOpAllocation(&op, env, __func__); + + // allocate GPU memory + size_t arrSize = op.numElemsPerChunk * sizeof(qreal); + cudaMalloc(&(op.deviceOperator.real), arrSize); + cudaMalloc(&(op.deviceOperator.imag), arrSize); + + // check gpu memory allocation was successful + validateDiagonalOpGPUAllocation(&op, env, __func__); + + // initialise GPU memory to zero + cudaMemset(op.deviceOperator.real, 0, arrSize); + cudaMemset(op.deviceOperator.imag, 0, arrSize); + + return op; +} + +void agnostic_destroyDiagonalOp(DiagonalOp op) { + free(op.real); + free(op.imag); + cudaFree(op.deviceOperator.real); + cudaFree(op.deviceOperator.imag); +} + +void agnostic_syncDiagonalOp(DiagonalOp op) { + cudaDeviceSynchronize(); + size_t mem_elems = op.numElemsPerChunk * sizeof *op.real; + cudaMemcpy(op.deviceOperator.real, op.real, mem_elems, cudaMemcpyHostToDevice); + cudaMemcpy(op.deviceOperator.imag, op.imag, mem_elems, cudaMemcpyHostToDevice); +} + +__global__ void agnostic_initDiagonalOpFromPauliHamilKernel( + DiagonalOp op, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms +) { + // each thread processes one diagonal element + long long int elemInd = getThreadInd(); + if (elemInd >= op.numElemsPerChunk) + return; + + qreal elem = 0; + + // elem is (+-) every coefficient, with sign determined by parity + for (int t=0; t> q) & 1) + isOddNumOnes = !isOddNumOnes; + + // avoid warp divergence + int sign = 1 - 2*isOddNumOnes; // (-1 if isOddNumOnes, else +1) + elem += termCoeffs[t] * sign; + } + + op.deviceOperator.real[elemInd] = elem; + op.deviceOperator.imag[elemInd] = 0; +} + +void agnostic_initDiagonalOpFromPauliHamil(DiagonalOp op, PauliHamil hamil) { + + // copy args intop GPU memory + enum pauliOpType* d_pauliCodes; + size_t mem_pauliCodes = hamil.numSumTerms * op.numQubits * sizeof *d_pauliCodes; + cudaMalloc(&d_pauliCodes, mem_pauliCodes); + cudaMemcpy(d_pauliCodes, hamil.pauliCodes, mem_pauliCodes, cudaMemcpyHostToDevice); + + qreal* d_termCoeffs; + size_t mem_termCoeffs = hamil.numSumTerms * sizeof *d_termCoeffs; + cudaMalloc(&d_termCoeffs, mem_termCoeffs); + cudaMemcpy(d_termCoeffs, 
hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice); + + int numBlocks = ceil(op.numElemsPerChunk / (qreal) NUM_THREADS_PER_BLOCK); + agnostic_initDiagonalOpFromPauliHamilKernel<<>>( + op, d_pauliCodes, d_termCoeffs, hamil.numSumTerms); + + // copy populated operator into to RAM + cudaDeviceSynchronize(); + size_t mem_elems = op.numElemsPerChunk * sizeof *op.real; + cudaMemcpy(op.real, op.deviceOperator.real, mem_elems, cudaMemcpyDeviceToHost); + cudaMemcpy(op.imag, op.deviceOperator.imag, mem_elems, cudaMemcpyDeviceToHost); + + cudaFree(d_pauliCodes); + cudaFree(d_termCoeffs); +} + +void agnostic_setDiagonalOpElems(DiagonalOp op, long long int startInd, qreal* real, qreal* imag, long long int numElems) { + + // update both RAM and VRAM, for consistency + memcpy(&op.real[startInd], real, numElems * sizeof(qreal)); + memcpy(&op.imag[startInd], imag, numElems * sizeof(qreal)); + + cudaDeviceSynchronize(); + cudaMemcpy( + op.deviceOperator.real + startInd, + real, + numElems * sizeof(*(op.deviceOperator.real)), + cudaMemcpyHostToDevice); + cudaMemcpy( + op.deviceOperator.imag + startInd, + imag, + numElems * sizeof(*(op.deviceOperator.imag)), + cudaMemcpyHostToDevice); +} + + + +/* + * PHASE FUNCTIONS + */ + +__forceinline__ __device__ int getBit(long long int num, int ind) { + return (num >> ind) & 1; +} + +__forceinline__ __device__ void applyPhaseToAmp(Qureg qureg, long long int index, qreal phase, int conj) +{ + // negate phase to conjugate operator + phase *= (1 - 2*conj); + qreal c = cos(phase); + qreal s = sin(phase); + + // modify amp to amp * exp(i phase) + qreal re, im; + GET_AMP( re, im, qureg, index ); + SET_AMP( qureg, index, re*c - im*s, re*s + im*c ); +} + +__forceinline__ __device__ void getMultiRegPhaseIndOffsets(size_t* stride, size_t* offset) +{ + // determine the phaseInds tuple relevant for this thread + *stride = gridDim.x*blockDim.x; + *offset = blockIdx.x*blockDim.x + threadIdx.x; +} + +__forceinline__ __device__ long long int getPhaseInd(long long int globalAmpInd, int* qubits, int numQubits, enum bitEncoding encoding) +{ + long long int phaseInd = 0LL; + + if (encoding == UNSIGNED) { + for (int q=0; q=qureg.numAmpsPerChunk) return; + + // determine global amplitude index (non-distributed, so it's just local index) + long long int globalAmpInd = index; + + // determine phaseInd from the qubit encoding + long long int phaseInd = getPhaseInd(globalAmpInd, qubits, numQubits, encoding); + + // determine if this phase index has an overriden value (i < numOverrides) + int overInd = getIndOfPhaseOverride(phaseInd, overrideInds, numOverrides); + + // determine phase from {coeffs}, {exponents} (unless overriden) + qreal phase = 0; + if (overInd < numOverrides) + phase = overridePhases[overInd]; + else + for (int t=0; t>>( + qureg, d_qubits, numQubits, encoding, + d_coeffs, d_exponents, numTerms, + d_overrideInds, d_overridePhases, numOverrides, + conj); + + // cleanup device memory + cudaFree(d_qubits); + cudaFree(d_coeffs); + cudaFree(d_exponents); + cudaFree(d_overrideInds); + cudaFree(d_overridePhases); +} + +__global__ void statevec_applyMultiVarPhaseFuncOverridesKernel( + Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, + qreal* coeffs, qreal* exponents, int* numTermsPerReg, + long long int* overrideInds, qreal* overridePhases, int numOverrides, + long long int *phaseInds, + int conj +) { + long long int index = getThreadInd(); + if (index>=qureg.numAmpsPerChunk) return; + + // determine global amplitude index (non-distributed, 
so it's just local index) + long long int globalAmpInd = index; + + // determine phase indices (each thread has phaseInds[numRegs] sub-array) + setMultiRegPhaseInds(phaseInds, globalAmpInd, qubits, numQubitsPerReg, numRegs, encoding); + + // determine if this phase index has an overriden value + long long int overInd = getIndOfMultiRegPhaseOverride(phaseInds, numRegs, overrideInds, numOverrides); + + // either use the overriden phase... + qreal phase = 0; + if (overInd < numOverrides) + phase = overridePhases[overInd]; + + else { + // else determine the phaseInds tuple relevant for this thread + size_t stride, offset; + getMultiRegPhaseIndOffsets(&stride, &offset); + + // and compute the phase from coeffs and exponents + long long int flatInd = 0; + for (int r=0; r>>( + qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding, + d_coeffs, d_exponents, d_numTermsPerReg, + d_overrideInds, d_overridePhases, numOverrides, + d_phaseInds, + conj); + + // free device memory + cudaFree(d_qubits); + cudaFree(d_coeffs); + cudaFree(d_exponents); + cudaFree(d_numQubitsPerReg); + cudaFree(d_numTermsPerReg); + cudaFree(d_overrideInds); + cudaFree(d_overridePhases); + cudaFree(d_phaseInds); +} + +__global__ void statevec_applyParamNamedPhaseFuncOverridesKernel( + Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, + enum phaseFunc phaseFuncName, qreal* params, int numParams, + long long int* overrideInds, qreal* overridePhases, int numOverrides, + long long int* phaseInds, + int conj +) { + long long int index = getThreadInd(); + if (index>=qureg.numAmpsPerChunk) return; + + // determine global amplitude index (non-distributed, so it's just local index) + long long int globalAmpInd = index; + + // determine phase indices (each thread has phaseInds[numRegs] sub-array) + setMultiRegPhaseInds(phaseInds, globalAmpInd, qubits, numQubitsPerReg, numRegs, encoding); + + // determine if this phase index has an overriden value + long long int overInd = getIndOfMultiRegPhaseOverride(phaseInds, numRegs, overrideInds, numOverrides); + + // determine the phase, or the overriden one + qreal phase = 0; + if (overInd < numOverrides) + phase = overridePhases[overInd]; + else + phase = getPhaseFromParamNamedFunc(phaseInds, numRegs, phaseFuncName, params, numParams); + + // modify amp to amp * exp(i phase) + applyPhaseToAmp(qureg, index, phase, conj); +} + +void statevec_applyParamNamedPhaseFuncOverrides( + Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, + enum phaseFunc phaseFuncName, qreal* params, int numParams, + long long int* overrideInds, qreal* overridePhases, int numOverrides, + int conj +) { + // determine size of arrays, for cloning into GPU memory + size_t mem_numQubitsPerReg = numRegs * sizeof *numQubitsPerReg; + size_t mem_overridePhases = numOverrides * sizeof *overridePhases; + size_t mem_overrideInds = numOverrides * numRegs * sizeof *overrideInds; + size_t mem_params = numParams * sizeof *params; + size_t mem_qubits = 0; + for (int r=0; r 0) cudaMalloc(&d_params, mem_params); + + // copy function args into GPU memory + cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice); + cudaMemcpy(d_numQubitsPerReg, numQubitsPerReg, mem_numQubitsPerReg, cudaMemcpyHostToDevice); + cudaMemcpy(d_overrideInds, overrideInds, mem_overrideInds, cudaMemcpyHostToDevice); + cudaMemcpy(d_overridePhases, overridePhases, mem_overridePhases, cudaMemcpyHostToDevice); + if (numParams > 0) + cudaMemcpy(d_params, params, mem_params, 
cudaMemcpyHostToDevice); + + int threadsPerCUDABlock = 128; + int CUDABlocks = ceil((qreal) qureg.numAmpsPerChunk / threadsPerCUDABlock); + + // allocate thread-local working space {phaseInds} + long long int *d_phaseInds; + size_t gridSize = (size_t) threadsPerCUDABlock * CUDABlocks; + cudaMalloc(&d_phaseInds, numRegs*gridSize * sizeof *d_phaseInds); + + // call kernel + statevec_applyParamNamedPhaseFuncOverridesKernel<<>>( + qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding, + phaseFuncName, d_params, numParams, + d_overrideInds, d_overridePhases, numOverrides, + d_phaseInds, + conj); + + // free device memory + cudaFree(d_qubits); + cudaFree(d_numQubitsPerReg); + cudaFree(d_overrideInds); + cudaFree(d_overridePhases); + cudaFree(d_phaseInds); + if (numParams > 0) + cudaFree(d_params); +} + + + +#ifdef __cplusplus +} +#endif diff --git a/QuEST/src/GPU/QuEST_gpu_common.h b/QuEST/src/GPU/QuEST_gpu_common.h new file mode 100644 index 000000000..cf6998cb4 --- /dev/null +++ b/QuEST/src/GPU/QuEST_gpu_common.h @@ -0,0 +1,27 @@ +// Distributed under MIT licence. See https://github.com/QuEST-Kit/QuEST/blob/master/LICENCE.txt for details + +/** @file + * GPU backend routines which are invoked by both the cuQuantum and bespoke-kernel backends. + * This excludes backend-agnostic API implementations which do not need a header declaration. + * + * @author Tyson Jones + */ + +# ifndef QUEST_GPU_COMMON_H +# define QUEST_GPU_COMMON_H + +# ifdef __cplusplus +extern "C" { +# endif + + + +int GPUExists(void); + + + +#ifdef __cplusplus +} +#endif + +# endif // QUEST_GPU_COMMON_H \ No newline at end of file diff --git a/QuEST/src/QuEST.c b/QuEST/src/QuEST.c index aae1b433c..ff18e5efb 100644 --- a/QuEST/src/QuEST.c +++ b/QuEST/src/QuEST.c @@ -1715,27 +1715,10 @@ void destroySubDiagonalOp(SubDiagonalOp op) { * debug */ -int compareStates(Qureg qureg1, Qureg qureg2, qreal precision) { - validateMatchingQuregDims(qureg1, qureg2, __func__); - return statevec_compareStates(qureg1, qureg2, precision); -} - void initDebugState(Qureg qureg) { statevec_initDebugState(qureg); } -void initStateFromSingleFile(Qureg *qureg, char filename[200], QuESTEnv env) { - int success = statevec_initStateFromSingleFile(qureg, filename, env); - validateFileOpened(success, filename, __func__); -} - -void initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome) { - validateStateVecQureg(*qureg, __func__); - validateTarget(*qureg, qubitId, __func__); - validateOutcome(outcome, __func__); - statevec_initStateOfSingleQubit(qureg, qubitId, outcome); -} - void reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank) { statevec_reportStateToScreen(qureg, env, reportRank); } diff --git a/QuEST/src/QuEST_debug.h b/QuEST/src/QuEST_debug.h deleted file mode 100755 index e3861f55e..000000000 --- a/QuEST/src/QuEST_debug.h +++ /dev/null @@ -1,61 +0,0 @@ -// Distributed under MIT licence. See https://github.com/QuEST-Kit/QuEST/blob/master/LICENCE.txt for details - -/** @file - * Developer functions used for unit testing and debugging, which are not part of the public API. - * May contain functions that are incomplete or untested. - * - * @author Ania Brown - */ - -# ifndef QUEST_DEBUG_H -# define QUEST_DEBUG_H - -# include "QuEST_precision.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * Initialise the state vector of probability amplitudes such that one qubit is set to 'outcome' and all other qubits are in an equal superposition of zero and one. 
- * @param[in,out] qureg object representing the set of qubits to be initialised - * @param[in] qubitId id of qubit to set to state 'outcome' - * @param[in] outcome value of qubit 'qubitId' to set - */ -void initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome); - -/** - * Initialise the state vector of probability amplitudes to an (unphysical) state with - * each component of each probability amplitude a unique floating point value. For debugging processes - * @param[in,out] qureg object representing the set of qubits to be initialised - */ -void initStateDebug(Qureg qureg); - -/** Initialises the wavefunction amplitudes according to those specified in a file. - * For debugging purpsoses - */ -void initStateFromSingleFile(Qureg *qureg, char filename[200], QuESTEnv env); - -/** Return whether two given wavefunctions are equivalent within a given precision - * Global phase included in equivalence check. For debugging purposes. - */ -int compareStates(Qureg mq1, Qureg mq2, qreal precision); - -/** Set elements in the underlying state vector represenation of a density matrix. Not exposed in the public - * API as this requires an understanding of how the state vector is used to represent a density matrix. - * Currently can only be used to set all amps. - */ -void setDensityAmps(Qureg qureg, qreal* reals, qreal* imags); - - - -/** Return the precision of qreal for use in testing - * */ - -int QuESTPrecision(void); - -#ifdef __cplusplus -} -#endif - -# endif // QUEST_DEBUG_H diff --git a/QuEST/src/QuEST_internal.h b/QuEST/src/QuEST_internal.h index 48c2dc742..4fbf99762 100644 --- a/QuEST/src/QuEST_internal.h +++ b/QuEST/src/QuEST_internal.h @@ -119,12 +119,6 @@ void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil); void statevec_reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank); -int statevec_compareStates(Qureg mq1, Qureg mq2, qreal precision); - -int statevec_initStateFromSingleFile(Qureg *qureg, char filename[200], QuESTEnv env); - -void statevec_initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome); - void statevec_createQureg(Qureg *qureg, int numQubits, QuESTEnv env); void statevec_destroyQureg(Qureg qureg, QuESTEnv env); diff --git a/QuEST/src/QuEST_validation.c b/QuEST/src/QuEST_validation.c index 166b8ba6b..b62732e18 100644 --- a/QuEST/src/QuEST_validation.c +++ b/QuEST/src/QuEST_validation.c @@ -120,6 +120,7 @@ typedef enum { E_DIAGONAL_OP_NOT_ALLOCATED, E_DIAGONAL_OP_NOT_ALLOCATED_ON_GPU, E_NO_GPU, + E_GPU_DOES_NOT_SUPPORT_MEM_POOLS, E_QASM_BUFFER_OVERFLOW } ErrorCode; @@ -213,6 +214,7 @@ static const char* errorMessages[] = { [E_DIAGONAL_OP_NOT_ALLOCATED] = "Could not allocate memory for DiagonalOp. Possibly insufficient memory.", [E_DIAGONAL_OP_NOT_ALLOCATED_ON_GPU] = "Could not allocate memory for DiagonalOp on GPU. Possibly insufficient memory.", [E_NO_GPU] = "Trying to run GPU code with no GPU available.", + [E_GPU_DOES_NOT_SUPPORT_MEM_POOLS] = "The GPU does not support stream-ordered memory pools, required by the cuQuantum backend.", [E_QASM_BUFFER_OVERFLOW] = "QASM line buffer filled." 
}; @@ -255,10 +257,10 @@ int isComplexPairUnitary(Complex alpha, Complex beta) { + beta.imag*beta.imag) < REAL_EPS ); } -#define macro_isMatrixUnitary(m, dim, retVal) { \ - /* elemRe_ and elemIm_ must not exist in caller scope */ \ +#define macro_isMatrixUnitary(m, dim) { \ + /* REAL_EPS_SQUARED_, elemRe_, elemIm_, reDif_ and distSq_ must not exist in caller scope */ \ + qreal REAL_EPS_SQUARED_ = REAL_EPS * REAL_EPS; \ qreal elemRe_, elemIm_; \ - retVal = 1; \ /* check m * ConjugateTranspose(m) == Identity */ \ for (int r=0; r < (dim); r++) { \ for (int c=0; c < (dim); c++) { \ @@ -270,40 +272,33 @@ int isComplexPairUnitary(Complex alpha, Complex beta) { elemRe_ += m.real[r][i]*m.real[c][i] + m.imag[r][i]*m.imag[c][i]; \ elemIm_ += m.imag[r][i]*m.real[c][i] - m.real[r][i]*m.imag[c][i]; \ } \ - /* check distance from identity */ \ - if ((absReal(elemIm_) > REAL_EPS) || \ - (r == c && absReal(elemRe_ - 1) > REAL_EPS) || \ - (r != c && absReal(elemRe_ ) > REAL_EPS)) { \ - retVal = 0; \ - break; \ - } \ + /* assert elem has Euclidean distance < REAL_EPS from Identity elem */ \ + qreal reDif_ = elemRe_ - (r==c); \ + qreal distSq_ = (elemIm_*elemIm_) + (reDif_*reDif_); \ + if (distSq_ > REAL_EPS_SQUARED_) \ + return 0; \ } \ - if (retVal == 0) \ - break; \ } \ + return 1; \ } int isMatrix2Unitary(ComplexMatrix2 u) { - int dim = 2; - int retVal; - macro_isMatrixUnitary(u, dim, retVal); - return retVal; + macro_isMatrixUnitary(u, 2); } int isMatrix4Unitary(ComplexMatrix4 u) { - int dim = 4; - int retVal; - macro_isMatrixUnitary(u, dim, retVal); - return retVal; + macro_isMatrixUnitary(u, 4); } int isMatrixNUnitary(ComplexMatrixN u) { int dim = 1 << u.numQubits; - int retVal; - macro_isMatrixUnitary(u, dim, retVal); - return retVal; + macro_isMatrixUnitary(u, dim); } #define macro_isCompletelyPositiveMap(ops, numOps, opDim) { \ + /* REAL_EPS_SQUARED_, elemRe_, elemIm_, reDif_ and distSq_ must not exist in caller scope */ \ + qreal REAL_EPS_SQUARED_ = REAL_EPS * REAL_EPS; \ + /* check sum_op (ConjugateTranspose(op) * op) == Identity */ \ for (int r=0; r<(opDim); r++) { \ for (int c=0; c<(opDim); c++) { \ + /* compute (r,c)-th elem of sum_op (...) */ \ qreal elemRe_ = 0; \ qreal elemIm_ = 0; \ for (int n=0; n<(numOps); n++) { \ @@ -312,8 +307,10 @@ int isMatrixNUnitary(ComplexMatrixN u) { elemIm_ += ops[n].real[k][r]*ops[n].imag[k][c] - ops[n].imag[k][r]*ops[n].real[k][c]; \ } \ } \ - qreal dist_ = absReal(elemIm_) + absReal(elemRe_ - ((r==c)? 1:0)); \ - if (dist_ > REAL_EPS) \ + /* assert elem has Euclidean distance < REAL_EPS from Identity elem */ \ + qreal reDif_ = elemRe_ - (r==c); \ + qreal distSq_ = (elemIm_*elemIm_) + (reDif_*reDif_); \ + if (distSq_ > REAL_EPS_SQUARED_) \ return 0; \ } \ } \ @@ -1109,14 +1106,20 @@ void validateDiagonalOpGPUAllocation(DiagonalOp* op, QuESTEnv env, const char* c QuESTAssert(allocationSuccessful, E_DIAGONAL_OP_NOT_ALLOCATED_ON_GPU, caller); } -// This is really just a dummy shim, because the scope of GPUExists() -// is limited to the QuEST_gpu.cu file. +void raiseQASMBufferOverflow(const char* caller) { + invalidQuESTInputError(errorMessages[E_QASM_BUFFER_OVERFLOW], caller); +} + + +/* The below functions are dummy shims, whereby the GPU has already determined the outcome of + * validation, though sends the boolean results here for error handling. 
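+ * (e.g. validateGPUExists() receives the result of GPUExists(), declared in QuEST_gpu_common.h)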
+ */ void validateGPUExists(int GPUPresent, const char* caller) { QuESTAssert(GPUPresent, E_NO_GPU, caller); } -void raiseQASMBufferOverflow(const char* caller) { - invalidQuESTInputError(errorMessages[E_QASM_BUFFER_OVERFLOW], caller); +void validateGPUIsCuQuantumCompatible(int supportsMemPools, const char* caller) { + QuESTAssert(supportsMemPools, E_GPU_DOES_NOT_SUPPORT_MEM_POOLS, caller); } diff --git a/QuEST/src/QuEST_validation.h b/QuEST/src/QuEST_validation.h index eb26a48eb..6df09f93b 100644 --- a/QuEST/src/QuEST_validation.h +++ b/QuEST/src/QuEST_validation.h @@ -174,10 +174,10 @@ void validateDiagonalOpAllocation(DiagonalOp* op, QuESTEnv env, const char* call void validateDiagonalOpGPUAllocation(DiagonalOp* op, QuESTEnv env, const char* caller); -// This is really just a dummy shim, because the scope of GPUExists() -// is limited to the QuEST_gpu.cu file. void validateGPUExists(int GPUPresent, const char* caller); +void validateGPUIsCuQuantumCompatible(int supportsMemPools, const char* caller); + void raiseQASMBufferOverflow(const char* caller); # ifdef __cplusplus diff --git a/README.md b/README.md index 9b8da9068..c3ae29a29 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,10 @@ [![MIT license](https://img.shields.io/badge/license-MIT-lightgrey.svg)](LICENCE.txt) -The **Quantum Exact Simulation Toolkit** is a high performance simulator of quantum circuits, state-vectors and density matrices. QuEST uses **multithreading**, **GPU acceleration** and **distribution** to run lightning first on laptops, desktops and networked supercomputers. QuEST *just works*; it is stand-alone, requires no installation, and trivial to compile and getting running. +The **Quantum Exact Simulation Toolkit** is a high performance simulator of quantum circuits, state-vectors and density matrices. QuEST uses **multithreading**, **GPU acceleration** and **distribution** to run lightning first on laptops, desktops and networked supercomputers. +QuEST *just works*; it is stand-alone, requires no installation, and is trivial to compile and run. +QuEST hybridises [OpenMP](https://www.openmp.org/) and [MPI](https://www.mpi-forum.org/) with _huge_ compiler support to run on all sorts of multicore, multi-CPU and distributed hardware, uses [HIP](https://docs.amd.com/bundle/HIP-Programming-Guide-v5.3) to run on AMD GPUs, integrates [cuQuantum](https://developer.nvidia.com/cuquantum-sdk) and [Thrust](https://developer.nvidia.com/thrust) for cutting-edge performance on modern NVIDIA GPUs, and has a custom kernel backend to run on older CUDA-compatible GPUs. +And it hides these deployment modes behind a single, seamless interface. 
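+For a sketch of that seamless interface (illustrative only; see the [tutorial](https://github.com/QuEST-Kit/QuEST/blob/master/examples/README.md) for a full walkthrough), the same few lines of C run unchanged whichever backend was chosen at compile time:
+```c
+#include <stdio.h>
+#include "QuEST.h"
+
+int main() {
+    QuESTEnv env = createQuESTEnv();      // CPU, GPU or distributed - fixed when QuEST was compiled
+    Qureg qubits = createQureg(2, env);   // 2-qubit statevector
+    initZeroState(qubits);
+    hadamard(qubits, 0);
+    controlledNot(qubits, 0, 1);          // prepare the Bell state (|00> + |11>)/sqrt(2)
+    printf("prob(qubit 1 = 1) = %g\n", (double) calcProbOfOutcome(qubits, 1, 1));   // 0.5
+    destroyQureg(qubits, env);
+    destroyQuESTEnv(env);
+    return 0;
+}
+```
+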
[![Languages](https://img.shields.io/badge/C-99-ff69b4.svg)](http://www.open-std.org/jtc1/sc22/wg14/www/standards.html#9899) [![Languages](https://img.shields.io/badge/C++-11-ff69b4.svg)](https://isocpp.org/wiki/faq/cpp11) @@ -36,16 +39,17 @@ The **Quantum Exact Simulation Toolkit** is a high performance simulator of quan ![OS](https://img.shields.io/badge/os-Linux-9cbd3c.svg) ![OS](https://img.shields.io/badge/os-Windows-9cbd3c.svg) [![Platforms](https://img.shields.io/badge/multithreaded-OpenMP-6699ff.svg)](https://www.openmp.org/) +[![Platforms](https://img.shields.io/badge/distributed-MPI-6699ff.svg)](https://www.mpi-forum.org/) [![Platforms](https://img.shields.io/badge/GPU-CUDA-6699ff.svg)](https://developer.nvidia.com/cuda-zone) [![Platforms](https://img.shields.io/badge/GPU-AMD-6699ff.svg)](https://docs.amd.com/bundle/HIP-Programming-Guide-v5.3) -[![Platforms](https://img.shields.io/badge/distributed-MPI-6699ff.svg)](https://www.mpi-forum.org/) +[![Platforms](https://img.shields.io/badge/GPU-cuQuantum-6699ff.svg)](https://developer.nvidia.com/cuquantum-sdk) QuEST is developed by the [QTechTheory](http://qtechtheory.org/) group at the University of Oxford, and [these authors](https://github.com/QuEST-Kit/QuEST/blob/master/AUTHORS.txt). To learn more: - see the [tutorial](https://github.com/QuEST-Kit/QuEST/blob/master/examples/README.md) - view the [documentation](https://quest-kit.github.io/QuEST/modules.html) -- visit our [website](https://quest.qtechtheory.org/) +- visit the [website](https://quest.qtechtheory.org/) - see some [examples](https://github.com/QuEST-Kit/QuEST/blob/master/examples/) -- read our [whitepaper](https://www.nature.com/articles/s41598-019-47174-9), which featured in Scientific Report's [Top 100 in Physics](https://www.nature.com/collections/ecehgdfcba/) :trophy: +- read the [whitepaper](https://www.nature.com/articles/s41598-019-47174-9), which featured in Scientific Report's [Top 100 in Physics](https://www.nature.com/collections/ecehgdfcba/) :trophy: [![DOI](https://img.shields.io/badge/DOI-10.1038%2Fs41598--019--47174--9-yellow.svg)](https://doi.org/10.1038/s41598-019-47174-9) [![Email](https://img.shields.io/badge/email-quest@materials.ox.ac.uk-red.svg)](mailto:quest@materials.ox.ac.uk) @@ -131,7 +135,7 @@ QuEST supports: - [testing utilities](https://quest-kit.github.io/QuEST/group__testutilities.html) -> **For developers**: To regenerate the doc after making changes to the code, run `doxygen doxyconfig/config` in the root directory. This will generate documentation in `Doxygen_doc/html`, the contents of which should be copied into [`docs/`](/docs/) +> **For developers**: QuEST's doc is automatically regenerated when the `master` branch is updated via [Github Actions](https://github.com/features/actions). To locally regenerate the doc, run `doxygen doxyconfig/config` in the root directory, which generates html documentation in `Doxygen_doc/html`. --------------------------------- @@ -176,6 +180,7 @@ then run it with We sincerely thank the following external contributors to QuEST. +- [Jakub Adamski](https://github.com/jjacobx) for optimising distributed communication of max-size messages. - [Bruno Villasenor Alvarez](https://github.com/bvillasen) of [AMD](https://www.amd.com/en.html) for porting the GPU backend to [HIP](https://github.com/ROCm-Developer-Tools/HIP), for compatibility with AMD GPUs. - [HQS Quantum simulations](https://quantumsimulations.de/) for contributing `mixDamping` on CPU. 
- [Kshitij Chhabra](https://github.com/kshitijc) for patching some validation bugs. @@ -183,8 +188,8 @@ We sincerely thank the following external contributors to QuEST. - [Zach van Rijn](https://github.com/zv-io) for patching the multithreading code for GCC-9 OpenMP-5 compatibility. - [SchineCompton](https://github.com/SchineCompton) for patching the GPU CMake release build. - [Christopher J. Anders](https://github.com/chr5tphr) for patching the multithreading (when default off) and GPU builds (revising min cmake). -- [Gleb Struchalin](https://github.com/glebx-f) for patching the cmake standalone build -- [Milos Prokop](https://github.com/Milos9304) for serial prototyping of `initDiagonalOpFromPauliHamil` +- [Gleb Struchalin](https://github.com/glebx-f) for patching the cmake standalone build. +- [Milos Prokop](https://github.com/Milos9304) for serial prototyping of `initDiagonalOpFromPauliHamil`. QuEST uses the [mt19937ar](http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/emt19937ar.html) Mersenne Twister algorithm for random number generation, under the BSD licence. QuEST optionally (by additionally importing `QuEST_complex.h`) integrates the [language agnostic complex type](http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/2003/0303/cuj0303meyers/index.htm) by Randy Meyers and Dr. Thomas Plum diff --git a/examples/README.md b/examples/README.md index 019353e17..1252666dd 100644 --- a/examples/README.md +++ b/examples/README.md @@ -7,7 +7,6 @@ Tutorial - [Running](#running) - [Testing](#testing) -> Before getting started, it is best to [test](#testing) QuEST on your hardware. # Coding @@ -174,6 +173,8 @@ QuEST uses the [Mersenne Twister](http://www.math.sci.hiroshima-u.ac.jp/~m-mat/M # Compiling +See [this page](https://quest.qtechtheory.org/download/) to obtain the necessary compilers. + QuEST uses [CMake](https://cmake.org/) (version `3.7` or higher) as its build system. Configure the build by supplying the below `-D[VAR=VALUE]` options after the `cmake ..` command. You can alternatively compile via [GNU Make](https://www.gnu.org/software/make/) directly with the provided [makefile](makefile). > **Windows** users should install [CMake](https://cmake.org/download/) and [Build Tools](https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2019), and run the below commands in the *Developer Command Prompt for VS* @@ -242,9 +243,9 @@ If your project contains multiple source files, separate them with semi-colons. - To compile for GPU, use ```console - -DGPUACCELERATED=1 -DGPU_COMPUTE_CAPABILITY=[CC] .. + -DGPUACCELERATED=1 -DGPU_COMPUTE_CAPABILITY=[CC] ``` - where `[CC]` is the compute cabability of your GPU, written without a decimal point. This can can be looked up at the [NVIDIA website](https://developer.nvidia.com/cuda-gpus). + where `[CC]` is the compute cabability of your GPU, written without a decimal point. This can can be looked up at the [NVIDIA website](https://developer.nvidia.com/cuda-gpus), and to check you have selected the right one, you should run the [unit tests](#testing). > Note that CUDA is not compatible with all compilers. To force `cmake` to use a > compatible compiler, override `CMAKE_C_COMPILER` and `CMAKE_CXX_COMPILER`. > For example, to compile for the [Quadro P6000](https://www.pny.com/nvidia-quadro-p6000) @@ -254,11 +255,16 @@ If your project contains multiple source files, separate them with semi-colons. 
> -DCMAKE_C_COMPILER=gcc-6 -DCMAKE_CXX_COMPILER=g++-6 > ``` + QuEST can also leverage NVIDIA's [cuQuantum](https://developer.nvidia.com/cuquantum-sdk) and [Thrust](https://developer.nvidia.com/thrust) libraries for optimised GPU simulation on modern GPUs. You must first install cuQuantum (which includes sub-library `cuStateVec` used by QuEST) [here](https://developer.nvidia.com/cuQuantum-downloads). When compiling QuEST, in addition to the above compiler options, simply specify + ```console + -DUSE_CUQUANTUM=1 + ``` + QuEST can also run on AMD GPUs using HIP. For the HIP documentation see [HIP programming guide](https://docs.amd.com/bundle/HIP-Programming-Guide-v5.3/page/Introduction_to_HIP_Programming_Guide.html). To compile for AMD GPUs, use ```console - -DGPUACCELERATED=1 -DUSE_HIP=1 -DGPU_ARCH=[ARCH] .. + -DGPUACCELERATED=1 -DUSE_HIP=1 -DGPU_ARCH=[ARCH] ``` - where `[ARCH]` is the architecture of your GPU, for example `gfx90a`. A table for AMD GPU architectures can be looked up [here](https://llvm.org/docs/AMDGPUUsage.html#amdgpu-processor-table). + where `[ARCH]` is the architecture of your GPU, for example `gfx90a`. A table for AMD GPU architectures can be looked up [here](https://llvm.org/docs/AMDGPUUsage.html#amdgpu-processor-table). To check you have used the correct `GPU_ARCH`, you should run the [unit tests](#testing). - You can additionally customise the floating point precision used by QuEST's `qreal` type, via ```console @@ -310,6 +316,7 @@ Once compiled as above, the compiled executable can be locally run from within t export OMP_NUM_THREADS=16 mpirun -np 8 ./myExecutable ``` + In some circumstances, like when large-memory multi-core nodes have multiple CPU sockets, it is worthwhile to deploy _multiple_ MPI processes to each node. - In GPU mode, the executable is launched directly via ```console diff --git a/tests/test_decoherence.cpp b/tests/test_decoherence.cpp index c5955b54f..cf572f0a3 100644 --- a/tests/test_decoherence.cpp +++ b/tests/test_decoherence.cpp @@ -9,7 +9,8 @@ #define PREPARE_TEST(qureg, ref) \ Qureg qureg = createDensityQureg(NUM_QUBITS, QUEST_ENV); \ initDebugState(qureg); \ - QMatrix ref = toQMatrix(qureg); + QMatrix ref = toQMatrix(qureg); \ + assertQuregAndRefInDebugState(qureg, ref); /* allows concise use of Contains in catch's REQUIRE_THROWS_WITH */ using Catch::Matchers::Contains; @@ -388,7 +389,7 @@ TEST_CASE( "mixMultiQubitKrausMap", "[decoherence]" ) { } // make only one invalid - ops[GENERATE_REF( range(0,numOps) )].real[0][0] = 0; + ops[GENERATE_REF( range(0,numOps) )].real[0][0] = -123456789; // make valid targets to avoid triggering target validation VLA(int, targs, numTargs); diff --git a/tests/test_operators.cpp b/tests/test_operators.cpp index ac960d962..d4c32b424 100644 --- a/tests/test_operators.cpp +++ b/tests/test_operators.cpp @@ -13,7 +13,9 @@ initDebugState(quregVec); \ initDebugState(quregMatr); \ QVector refVec = toQVector(quregVec); \ - QMatrix refMatr = toQMatrix(quregMatr); + QMatrix refMatr = toQMatrix(quregMatr); \ + assertQuregAndRefInDebugState(quregVec, refVec); \ + assertQuregAndRefInDebugState(quregMatr, refMatr); /** Destroys the data structures made by PREPARE_TEST */ #define CLEANUP_TEST(quregVec, quregMatr) \ diff --git a/tests/test_unitaries.cpp b/tests/test_unitaries.cpp index 76cbd86d9..811641f36 100644 --- a/tests/test_unitaries.cpp +++ b/tests/test_unitaries.cpp @@ -27,7 +27,9 @@ initDebugState(quregVec); \ initDebugState(quregMatr); \ QVector refVec = toQVector(quregVec); \ - QMatrix refMatr = 
toQMatrix(quregMatr); + QMatrix refMatr = toQMatrix(quregMatr); \ + assertQuregAndRefInDebugState(quregVec, refVec); \ + assertQuregAndRefInDebugState(quregMatr, refMatr); /** Destroys the data structures made by PREPARE_TEST */ #define CLEANUP_TEST(quregVec, quregMatr) \ diff --git a/tests/utilities.cpp b/tests/utilities.cpp index 72ae94680..df4ad90b0 100644 --- a/tests/utilities.cpp +++ b/tests/utilities.cpp @@ -140,6 +140,38 @@ QVector operator * (const QMatrix& m, const QVector& v) { return prod; } +void assertQuregAndRefInDebugState(Qureg qureg, QVector ref) { + DEMAND( qureg.isDensityMatrix == 0 ); + DEMAND( qureg.numAmpsTotal == (long long int) ref.size() ); + + // assert ref is in the debug state (else initDebugState failed) + for (size_t i=0; i= 1 ); - - QMatrix matr = getRandomQMatrix(1 << numQb); +QMatrix getOrthonormalisedRows(QMatrix matr) { + // perform the Gram-Schmidt process, processing each row of matr in-turn for (size_t i=0; i=0; k--) { - // compute row . prev = sum_n row_n conj(prev_n) - qcomp prod = 0; - for (size_t n=0; n= 3 && !areEqual(conjprod, iden) ) { - - matr = getIdentityMatrix(1 << numQb); - conjprod = matr; - } - DEMAND( areEqual(conjprod, iden) ); - // return the new orthonormal matrix return matr; } +QMatrix getRandomDiagonalUnitary(int numQb) { + DEMAND( numQb >= 1 ); + + QMatrix matr = getZeroMatrix(1 << numQb); + for (size_t i=0; i= 1 ); + + // create Z ~ random complex matrix (distribution not too important) + size_t dim = 1 << numQb; + QMatrix matrZ = getRandomQMatrix(dim); + QMatrix matrZT = getTranspose(matrZ); + + // create Z = Q R (via QR decomposition) ... + QMatrix matrQT = getOrthonormalisedRows(matrZ); + QMatrix matrQ = getTranspose(matrQT); + QMatrix matrR = getZeroMatrix(dim); + + // ... where R_rc = (columm c of Z) . (column r of Q) = (row c of ZT) . (row r of QT) + for (size_t r=0; r getRandomKrausMap(int numQb, int numOps) { DEMAND( numOps >= 1 ); DEMAND( numOps <= 4*numQb*numQb ); @@ -606,8 +670,13 @@ std::vector getRandomKrausMap(int numQb, int numOps) { QMatrix prodSum = getZeroMatrix(1 << numQb); for (int i=0; i> QMatrix; */ typedef std::vector QVector; +/** Asserts the given statevector qureg and reference agree, and are properly initialised in the debug state. + * Assertion uses the DEMAND() macro, calling Catch2's FAIL() if unsatisfied, so does not contribute + * toward unit test statistics. This should be called within every PREPARE_TEST macro, to ensure that + * the test states themselves are initially correct, and do not accidentally agree by (e.g.) being all-zero. + * + * @ingroup testutilities + * @author Tyson Jones + */ +void assertQuregAndRefInDebugState(Qureg qureg, QVector ref); + +/** Asserts the given density qureg and reference agree, and are properly initialised in the debug state. + * Assertion uses the DEMAND() macro, calling Catch2's FAIL() if unsatisfied, so does not contribute + * toward unit test statistics. This should be called within every PREPARE_TEST macro, to ensure that + * the test states themselves are initially correct, and do not accidentally agree by (e.g.) being all-zero. + * + * @ingroup testutilities + * @author Tyson Jones + */ +void assertQuregAndRefInDebugState(Qureg qureg, QMatrix ref); + /* (Excluded from Doxygen doc) * * Define QVector and QMatrix operator overloads.