diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 15ecaccd0..1a7744493 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -7,7 +7,8 @@ on: - develop pull_request: branches: - - '**' + - master + - develop jobs: diff --git a/.github/workflows/llvm-asan.yml b/.github/workflows/llvm-asan.yml index ad3c49efe..60cfd3a06 100644 --- a/.github/workflows/llvm-asan.yml +++ b/.github/workflows/llvm-asan.yml @@ -7,7 +7,8 @@ on: - develop pull_request: branches: - - '**' + - master + - develop jobs: diff --git a/.github/workflows/macos-unit.yml b/.github/workflows/macos-unit.yml index a439b1c52..bb751cd3a 100644 --- a/.github/workflows/macos-unit.yml +++ b/.github/workflows/macos-unit.yml @@ -7,7 +7,8 @@ on: - develop pull_request: branches: - - '**' + - master + - develop jobs: diff --git a/.github/workflows/ubuntu-unit.yml b/.github/workflows/ubuntu-unit.yml index 68e9dc8a1..650d533d6 100644 --- a/.github/workflows/ubuntu-unit.yml +++ b/.github/workflows/ubuntu-unit.yml @@ -7,7 +7,8 @@ on: - develop pull_request: branches: - - '**' + - master + - develop jobs: diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index 74817bf0c..958f32398 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -7,7 +7,8 @@ on: - develop pull_request: branches: - - '**' + - master + - develop jobs: diff --git a/.github/workflows/windows-unit.yml b/.github/workflows/windows-unit.yml index 72d79be9a..2d5076d21 100644 --- a/.github/workflows/windows-unit.yml +++ b/.github/workflows/windows-unit.yml @@ -7,7 +7,8 @@ on: - develop pull_request: branches: - - '**' + - master + - develop jobs: diff --git a/AUTHORS.txt b/AUTHORS.txt index 6c6500b33..e20b7ea7d 100644 --- a/AUTHORS.txt +++ b/AUTHORS.txt @@ -33,6 +33,8 @@ Dr Mihai Duta [developer] original prototyping External contributors: +Jakub Adamski + optimised distributed communication by sending max-size messages asynchronously Bruno Villasenor Alvarez on behalf of AMD ported the GPU backend to HIP, for AMD GPU compatibility Dr Nicolas Vogt on behalf of HQS Quantum Simulations diff --git a/QuEST/CMakeLists.txt b/QuEST/CMakeLists.txt index 899f3fb98..0ed65f974 100644 --- a/QuEST/CMakeLists.txt +++ b/QuEST/CMakeLists.txt @@ -37,6 +37,8 @@ option(USE_HIP "Whether to use HIP for GPU code compilation for AMD GPUs. Set to set(GPU_ARCH gfx90 CACHE STRING "GPU hardware dependent, used for AMD GPUs when USE_HIP=1. Lookup at https://llvm.org/docs/AMDGPUUsage.html#amdgpu-processor-table. Write without fullstop") +option(USE_CUQUANTUM "Whether to use NVIDIA's cuQuantum library (requires prior installation) in lieu of QuEST's bespoke GPU kernel. Set to 1 to enable." 0) + # ***************************************************************************** # ***** NO CHANGES SHOULD BE REQUIRED FROM THE USER BEYOND THIS POINT ********* @@ -49,6 +51,7 @@ message(STATUS "OMP acceleration is ${MULTITHREADED}") message(STATUS "MPI distribution is ${DISTRIBUTED}") if (${GPUACCELERATED}) message(STATUS "HIP compilation is ${USE_HIP}") + message(STATUS "cuQuantum compilation is ${USE_CUQUANTUM}") endif() @@ -119,25 +122,28 @@ endif() if (GPUACCELERATED) if (USE_HIP) - if(NOT DEFINED HIP_PATH) - if(NOT DEFINED ENV{HIP_PATH}) - message(WARNING "WARNING: HIP_PATH is not defiend. 
Using default HIP_PATH=/opt/rocm/hip " ${HIP_VERSION}) - set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") - else() - set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + message(WARNING "WARNING: HIP_PATH is not defiend. Using default HIP_PATH=/opt/rocm/hip " ${HIP_VERSION}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() endif() - endif() - if(EXISTS "${HIP_PATH}") - set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) - find_package(HIP REQUIRED) - message(STATUS "Found HIP: " ${HIP_VERSION}) - message(STATUS "HIP PATH: " ${HIP_PATH}) - endif() - - ADD_DEFINITIONS( -DUSE_HIP ) - ADD_DEFINITIONS( -D__HIP_PLATFORM_AMD__ ) + if(EXISTS "${HIP_PATH}") + set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + find_package(HIP REQUIRED) + message(STATUS "Found HIP: " ${HIP_VERSION}) + message(STATUS "HIP PATH: " ${HIP_PATH}) + endif() + + ADD_DEFINITIONS( -DUSE_HIP ) + ADD_DEFINITIONS( -D__HIP_PLATFORM_AMD__ ) + elseif (USE_CUQUANTUM) + find_package(CUDA REQUIRED) + ADD_DEFINITIONS( -DUSE_CUQUANTUM ) else() find_package(CUDA REQUIRED) endif() @@ -280,7 +286,12 @@ endif() # ----- C++ COMPILER FLAGS -------------------------------------------------- # set C++ flags that are common between compilers and build types -set (CMAKE_CXX_STANDARD 98) +if (USE_CUQUANTUM) + set(CMAKE_CXX_STANDARD 14) + set(CMAKE_CXX_STANDARD_REQUIRED ON) +else () + set (CMAKE_CXX_STANDARD 98) +endif () # Use -O2 for all but debug mode by default if (NOT("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")) @@ -412,6 +423,14 @@ target_link_libraries(QuEST PUBLIC ${MPI_C_LIBRARIES}) # ----- GPU ------------------------------------------------------------------- if (USE_HIP) target_link_libraries(QuEST PUBLIC ${HIP_PATH}/lib/libamdhip64.so ) +elseif (USE_CUQUANTUM) + find_library(CUQUANTUM_LIBRARIES custatevec) + if (NOT CUQUANTUM_LIBRARIES) + message(FATAL_ERROR "cuQuantum library (specifically custatevec) not found") + endif () + + target_link_libraries(QuEST ${CUDA_LIBRARIES} ${CUQUANTUM_LIBRARIES}) + target_include_directories(QuEST PUBLIC "/usr/local/cuda/include") else() target_link_libraries(QuEST ${CUDA_LIBRARIES}) endif() diff --git a/QuEST/include/QuEST.h b/QuEST/include/QuEST.h index dd7709780..ba89ad38d 100644 --- a/QuEST/include/QuEST.h +++ b/QuEST/include/QuEST.h @@ -34,6 +34,23 @@ # include "QuEST_precision.h" + + +// ensure custatevecHandle_t is defined, even if no GPU +# ifdef USE_CUQUANTUM +# include +typedef struct CuQuantumConfig { + cudaMemPool_t cuMemPool; + cudaStream_t cuStream; + custatevecHandle_t cuQuantumHandle; + custatevecDeviceMemHandler_t cuMemHandler; +} CuQuantumConfig; +# else +# define CuQuantumConfig void* +# endif + + + // prevent C++ name mangling #ifdef __cplusplus extern "C" { @@ -368,6 +385,11 @@ typedef struct Qureg //! Storage for reduction of probabilities on GPU qreal *firstLevelReduction, *secondLevelReduction; + //! Storage for wavefunction amplitues and config (copy of QuESTEnv's handle) in cuQuantum deployment + cuAmp* cuStateVec; + cuAmp* deviceCuStateVec; + CuQuantumConfig* cuConfig; + //! 
Storage for generated QASM output QASMLogger* qasmLog; @@ -386,6 +408,10 @@ typedef struct QuESTEnv int numRanks; unsigned long int* seeds; int numSeeds; + + // a copy of the QuESTEnv's config, used only in cuQuantum deployment + CuQuantumConfig* cuConfig; + } QuESTEnv; @@ -4236,6 +4262,10 @@ qreal calcPurity(Qureg qureg); * linear algebra calculation. * * The number of qubits represented in \p qureg and \p pureState must match. + * + * > In the GPU-accelerated cuQuantum backend, this function further assumes that + * > the density matrix \p qureg is correctly normalised, and otherwise returns the + * > fidelity of the conjugate-transpose of \p qureg. * * @see * - calcHilbertSchmidtDistance() diff --git a/QuEST/include/QuEST_precision.h b/QuEST/include/QuEST_precision.h index 48c9a1ccf..244460b78 100644 --- a/QuEST/include/QuEST_precision.h +++ b/QuEST/include/QuEST_precision.h @@ -10,15 +10,27 @@ * @author Tyson Jones (doc) */ -# include - # ifndef QUEST_PRECISION_H # define QUEST_PRECISION_H +# include + + +// define CUDA complex types as void if not using cuQuantum. +// note we used cuComplex.h for complex numbers, in lieu of +// Thrust's complex, so that the QuEST.h header can +// always be compiled with C99, rather than C++14. +# ifdef USE_CUQUANTUM + # include +# else + # define cuFloatComplex void + # define cuDoubleComplex void +# endif + // set default double precision if not set during compilation # ifndef QuEST_PREC -# define QuEST_PREC 2 + # define QuEST_PREC 2 # endif @@ -28,6 +40,7 @@ # if QuEST_PREC==1 # define qreal float // \cond HIDDEN_SYMBOLS + # define cuAmp cuFloatComplex # define MPI_QuEST_REAL MPI_FLOAT # define MPI_MAX_AMPS_IN_MSG (1LL<<29) // must be 2^int # define REAL_STRING_FORMAT "%.8f" @@ -41,7 +54,8 @@ */ # elif QuEST_PREC==2 # define qreal double - // \cond HIDDEN_SYMBOLS + // \cond HIDDEN_SYMBOLS + # define cuAmp cuDoubleComplex # define MPI_QuEST_REAL MPI_DOUBLE # define MPI_MAX_AMPS_IN_MSG (1LL<<28) // must be 2^int # define REAL_STRING_FORMAT "%.14f" @@ -57,6 +71,7 @@ # elif QuEST_PREC==4 # define qreal long double // \cond HIDDEN_SYMBOLS + # define cuAmp void // invalid # define MPI_QuEST_REAL MPI_LONG_DOUBLE # define MPI_MAX_AMPS_IN_MSG (1LL<<27) // must be 2^int # define REAL_STRING_FORMAT "%.17Lf" diff --git a/QuEST/src/CPU/QuEST_cpu.c b/QuEST/src/CPU/QuEST_cpu.c index 56e82d54f..1736de3cd 100644 --- a/QuEST/src/CPU/QuEST_cpu.c +++ b/QuEST/src/CPU/QuEST_cpu.c @@ -1641,53 +1641,6 @@ void statevec_cloneQureg(Qureg targetQureg, Qureg copyQureg) { } } -/** - * Initialise the state vector of probability amplitudes such that one qubit is set to 'outcome' and all other qubits are in an equal superposition of zero and one. 
- * @param[in,out] qureg object representing the set of qubits to be initialised - * @param[in] qubitId id of qubit to set to state 'outcome' - * @param[in] outcome of qubit 'qubitId' - */ -void statevec_initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome) -{ - long long int chunkSize, stateVecSize; - long long int index; - int bit; - long long int chunkId=qureg->chunkId; - - // dimension of the state vector - chunkSize = qureg->numAmpsPerChunk; - stateVecSize = chunkSize*qureg->numChunks; - qreal normFactor = 1.0/sqrt((qreal)stateVecSize/2.0); - - // Can't use qureg->stateVec as a private OMP var - qreal *stateVecReal = qureg->stateVec.real; - qreal *stateVecImag = qureg->stateVec.imag; - - // initialise the state to |0000..0000> -# ifdef _OPENMP -# pragma omp parallel \ - default (none) \ - shared (chunkSize, stateVecReal, stateVecImag, normFactor, qubitId, outcome, chunkId) \ - private (index, bit) -# endif - { -# ifdef _OPENMP -# pragma omp for schedule (static) -# endif - for (index=0; indexnumAmpsPerChunk; - stateVecSize = chunkSize*qureg->numChunks; - - qreal *stateVecReal = qureg->stateVec.real; - qreal *stateVecImag = qureg->stateVec.imag; - - FILE *fp; - char line[200]; - - for (int rank=0; rank<(qureg->numChunks); rank++){ - if (rank==qureg->chunkId){ - fp = fopen(filename, "r"); - - // indicate file open failure - if (fp == NULL) - return 0; - - indexInChunk = 0; totalIndex = 0; - while (fgets(line, sizeof(char)*200, fp) != NULL && totalIndexchunkId){ - sscanf(line, REAL_SPECIFIER ", " REAL_SPECIFIER, &(stateVecReal[indexInChunk]), - &(stateVecImag[indexInChunk])); - indexInChunk += 1; - } - totalIndex += 1; - } - } - fclose(fp); - } - syncQuESTEnv(env); - } - - // indicate success - return 1; -} - -int statevec_compareStates(Qureg mq1, Qureg mq2, qreal precision){ - qreal diff; - long long int chunkSize = mq1.numAmpsPerChunk; - - for (long long int i=0; iprecision) return 0; - diff = absReal(mq1.stateVec.imag[i] - mq2.stateVec.imag[i]); - if (diff>precision) return 0; - } - return 1; -} - void statevec_compactUnitaryLocal (Qureg qureg, int targetQubit, Complex alpha, Complex beta) { long long int sizeBlock, sizeHalfBlock; diff --git a/QuEST/src/CPU/QuEST_cpu_distributed.c b/QuEST/src/CPU/QuEST_cpu_distributed.c index 4d07ef2b3..4c9ce0af8 100644 --- a/QuEST/src/CPU/QuEST_cpu_distributed.c +++ b/QuEST/src/CPU/QuEST_cpu_distributed.c @@ -494,8 +494,12 @@ void densmatr_initPureState(Qureg targetQureg, Qureg copyQureg) { void exchangeStateVectors(Qureg qureg, int pairRank){ // MPI send/receive vars - int TAG=100; - MPI_Status status; + int sendTag; + int recvTag; + int rank; + MPI_Request * requests; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Multiple messages are required as MPI uses int rather than long long int for count // For openmpi, messages are further restricted to 2GB in size -- do this for all cases @@ -506,20 +510,26 @@ void exchangeStateVectors(Qureg qureg, int pairRank){ // safely assume MPI_MAX... 
= 2^n, so division always exact int numMessages = qureg.numAmpsPerChunk/maxMessageCount; + requests = (MPI_Request*) malloc(4 * numMessages * sizeof(MPI_Request)); int i; long long int offset; + // send my state vector to pairRank's qureg.pairStateVec // receive pairRank's state vector into qureg.pairStateVec for (i=0; i +# include +# include +# include +# include +# include +# include +# include +# include +# include + + + +/* + * TYPES AND ADAPTERS + */ + +// precision-agnostic conversions between cuAmp, qreal and Complex +# if QuEST_PREC==1 + # define TO_CU_AMP(re, im) make_cuFloatComplex(re, im) + # define cuAmpReal(amp) cuCrealf(amp) + # define cuAmpImag(amp) cuCimagf(amp) + # define cuAmpConj(amp) cuConjf(amp) + # define CU_AMP_IN_STATE_PREC CUDA_C_F32 + # define CU_AMP_IN_MATRIX_PREC CUDA_C_64F +# elif QuEST_PREC==2 + # define TO_CU_AMP(re, im) make_cuDoubleComplex(re, im) + # define cuAmpReal(amp) cuCreal(amp) + # define cuAmpImag(amp) cuCimag(amp) + # define cuAmpConj(amp) cuConj(amp) + # define CU_AMP_IN_STATE_PREC CUDA_C_64F + # define CU_AMP_IN_MATRIX_PREC CUDA_C_64F +# elif QuEST_PREC==4 + # define TO_CU_AMP(re, im) -1 // invalid precision config + # define cuAmpReal(amp) -1 + # define cuAmpImag(amp) -1 + # define CU_AMP_IN_STATE_PREC void // invalid + # define CU_AMP_IN_MATRIX_PREC void // invalid +#endif + +// convenient operator overloads for cuAmp, for doing complex artihmetic. +// some of these are defined to be used by Thrust's backend, because we +// avoided Thrust's complex (see QuEST_precision.h for explanation). +// notice we are manually performing the arithmetic using qreals and +// re-packing the result into TO_CU_AMP, rather than using cuComplex.h's +// functions like cuCadd(). This is because such functions are precision +// specific (grr) and do exactly the same thing themselves! 
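+//
+// for example (illustrative variables a, b only): with the overloads below one
+// can combine amplitudes directly, e.g.
+//     cuAmp w = TO_CU_AMP(.5,0)*a + TO_CU_AMP(.5,0)*b;   // = (a+b)/2
+// which is the same pattern functor_setWeightedQureg (further down) relies on.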
+__host__ __device__ inline cuAmp operator - (const cuAmp& a) { + return TO_CU_AMP(-cuAmpReal(a), -cuAmpImag(a)); +} +__host__ __device__ inline cuAmp operator * (const cuAmp& a, const std::size_t n) { + return TO_CU_AMP(n*cuAmpReal(a), n*cuAmpImag(a)); +} +__host__ __device__ inline cuAmp operator + (const cuAmp& a, const cuAmp& b) { + return TO_CU_AMP(cuAmpReal(a) + cuAmpReal(b), cuAmpImag(a) + cuAmpImag(b)); +} +__host__ __device__ inline cuAmp operator * (const cuAmp& a, const cuAmp& b) { + qreal aRe = cuAmpReal(a); + qreal aIm = cuAmpImag(a); + qreal bRe = cuAmpReal(b); + qreal bIm = cuAmpImag(b); + return TO_CU_AMP(aRe*bRe - aIm*bIm, aRe*bIm + aIm*bRe); +} + +// convert user-facing Complex to cuQuantum-facing cuAmp +cuAmp toCuAmp(Complex c) { + return TO_CU_AMP(c.real, c.imag); +} + +// concise alias for row-wise flattened complex matrix +typedef std::vector cuMatr; + +// flatten ComplexMatrixN mIn to a cuMatr mOut +#define GET_cuMatr_FROM_ComplexMatrix( mOut, mIn, nQubits ) \ + long long int dim = (1LL << nQubits); \ + cuMatr mOut(dim*dim); \ + long long int i=0; \ + for (long long int r=0; r<(dim); r++) \ + for (long long int c=0; c<(dim); c++) \ + mOut[i++] = TO_CU_AMP(mIn.real[r][c], mIn.imag[r][c]); + +// convert user-facing ComplexMatrixN to cuQuantum-facing cuMatr +cuMatr toCuMatr(ComplexMatrix2 mIn) { + GET_cuMatr_FROM_ComplexMatrix(mOut, mIn, 1); + return mOut; +} +cuMatr toCuMatr(ComplexMatrix4 mIn) { + GET_cuMatr_FROM_ComplexMatrix(mOut, mIn, 2); + return mOut; +} +cuMatr toCuMatr(ComplexMatrixN mIn) { + GET_cuMatr_FROM_ComplexMatrix(mOut, mIn, mIn.numQubits); + return mOut; +} + +// convert QuEST backend masks back into user-input qubit lists (needed by cuQuantum) +std::vector getIndsFromMask(long long int mask, int numBits) { + std::vector inds; + for (int i=0; i(ctx); + return cudaMallocFromPoolAsync(ptr, size, pool, stream); +} +int memPoolFree(void* ctx, void* ptr, size_t size, cudaStream_t stream) { + return cudaFreeAsync(ptr, stream); +} + +CuQuantumConfig* createCuConfig() { + + // create cuQuantumConfig in heap memory + CuQuantumConfig* config = (CuQuantumConfig*) malloc(sizeof(CuQuantumConfig)); + + // bind existing memory pool (does not need later manual freeing) + int deviceId; + cudaGetDevice(&deviceId); + cudaDeviceGetMemPool(&(config->cuMemPool), deviceId); + + // create new custatevecHandle_t + custatevecCreate(&(config->cuQuantumHandle)); + + // create new cudaStream_t + cudaStreamCreate(&(config->cuStream)); + + // custatevecDeviceMemHandler_t needs no explicit creation + + // return config's heap pointer + return config; +} + +void initCuConfig(CuQuantumConfig* config) { + + // get existing memPool threshold, above which memory gets freed at every stream synch + size_t currMaxMem; + cudaMemPoolGetAttribute(config->cuMemPool, cudaMemPoolAttrReleaseThreshold, &currMaxMem); + + // if memPool threshold smaller than 1 MiB = 16 qubits, extend it + size_t desiredMaxMem = 16*(1<<15); + if (currMaxMem < desiredMaxMem) + cudaMemPoolSetAttribute(config->cuMemPool, cudaMemPoolAttrReleaseThreshold, &desiredMaxMem); + + // bind mempool to deviceMemHandler + config->cuMemHandler.ctx = &(config->cuMemPool); + config->cuMemHandler.device_alloc = memPoolAlloc; + config->cuMemHandler.device_free = memPoolFree; + strcpy(config->cuMemHandler.name, "mempool"); + + // bind deviceMemHandler to cuQuantum + custatevecSetDeviceMemHandler(config->cuQuantumHandle, &(config->cuMemHandler)); + + // bind stream to cuQuantum + custatevecSetStream(config->cuQuantumHandle, 
config->cuStream); +} + +void destroyCuConfig(CuQuantumConfig* config) { + + // free config's heap attributes + cudaStreamDestroy(config->cuStream); + custatevecDestroy(config->cuQuantumHandle); + + // don't need to free cuMemPool; it already existed + // don't need to free cuMemHandler; it's a struct included in config's heap memory + + // free config's heap memory + free(config); +} + + + +/* + * CUQUANTUM WRAPPERS (to reduce boilerplate) + */ + +void custatevec_applyMatrix(Qureg qureg, std::vector ctrls, std::vector targs, cuMatr matr) { + + // do not adjoint matrix + int adj = 0; + + // condition all ctrls on =1 state + int* ctrlVals = nullptr; + + // use automatic workspace management + void* work = nullptr; + size_t workSize = 0; + + custatevecApplyMatrix( + qureg.cuConfig->cuQuantumHandle, + qureg.deviceCuStateVec, CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + matr.data(), CU_AMP_IN_MATRIX_PREC, CUSTATEVEC_MATRIX_LAYOUT_ROW, adj, + targs.data(), targs.size(), + ctrls.data(), ctrlVals, ctrls.size(), + CUSTATEVEC_COMPUTE_DEFAULT, + work, workSize); +} + +void custatevec_applyDiagonal(Qureg qureg, std::vector ctrls, std::vector targs, cuAmp* elems) { + + // apply no permutation matrix + custatevecIndex_t *perm = nullptr; + + // do not adjoint elems + int adj = 0; + + // condition all ctrls on =1 state + int* ctrlVals = nullptr; + + // use automatic workspace management + void* work = nullptr; + size_t workSize = 0; + + custatevecApplyGeneralizedPermutationMatrix( + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + perm, elems, CU_AMP_IN_MATRIX_PREC, adj, + targs.data(), targs.size(), + ctrls.data(), ctrlVals, ctrls.size(), + work, workSize); +} + + + +/* + * THRUST WRAPPERS AND FUNCTORS (to reduce boilerplate, and for custom modification of statevectors) + */ + +struct functor_prodOfConj : public thrust::binary_function +{ + __host__ __device__ cuAmp operator()(cuAmp braAmp, cuAmp ketAmp) { + return ketAmp * cuAmpConj(braAmp); + } +}; + +struct functor_absOfDif : public thrust::binary_function +{ + __host__ __device__ cuAmp operator()(cuAmp a, cuAmp b) { + qreal difRe = cuAmpReal(a) - cuAmpReal(b); + qreal difIm = cuAmpImag(a) - cuAmpImag(b); + qreal difAbs = difRe*difRe + difIm*difIm; + return TO_CU_AMP(difAbs, 0); + } +}; + +struct functor_absSquared : public thrust::unary_function +{ + __host__ __device__ qreal operator()(cuAmp amp) { + qreal re = cuAmpReal(amp); + qreal im = cuAmpImag(amp); + qreal absSq = re*re + im*im; + return absSq; + } +}; + +struct functor_prodOfAbsSquared : public thrust::binary_function +{ + __host__ __device__ cuAmp operator()(cuAmp probAmp, cuAmp rawAmp) { + qreal re = cuAmpReal(probAmp); + qreal im = cuAmpImag(probAmp); + qreal absSq = re*re + im*im; + cuAmp prod = rawAmp * TO_CU_AMP(absSq, 0); + return prod; + } +}; + +struct functor_scaleInd : public thrust::unary_function +{ + const long long int factor; + functor_scaleInd(long long int _factor) : factor(_factor) {} + + __host__ __device__ long long int operator()(long long int ind) { + return factor * ind; + } +}; + +struct functor_mapIndToAlternatingBlockScaledInd : public thrust::unary_function +{ + long long int blockSize; + long long int factor; + int useOffsetBlocks; + + functor_mapIndToAlternatingBlockScaledInd(long long int _blockSize, long long int _factor, int _useOffsetBlocks) : + blockSize(_blockSize), factor(_factor), useOffsetBlocks(_useOffsetBlocks) {} + + __host__ __device__ long long int operator()(long long int rawInd) 
{ + + long long int blockNum = rawInd / blockSize; + long long int subBlockInd = rawInd % blockSize; + + long long int blockStartInd = (blockNum * 2 * blockSize) + (useOffsetBlocks * blockSize); + long long int blockifiedInd = blockStartInd + subBlockInd; + + long long int finalInd = factor * blockifiedInd; + + return finalInd; + } +}; + +struct functor_setWeightedQureg +{ + // functor requires 3 complex scalars + const cuAmp fac1; + const cuAmp fac2; + const cuAmp facOut; + functor_setWeightedQureg(cuAmp _fac1, cuAmp _fac2, cuAmp _facOut) : + fac1(_fac1), fac2(_fac2), facOut(_facOut) {} + + // and modifies out to (facOut out + fac1 qureg1 + fac2 qureg2) + template __host__ __device__ void operator()(Tuple t) { + thrust::get<2>(t) = facOut*thrust::get<2>(t) + fac1*thrust::get<0>(t) + fac2*thrust::get<1>(t); + } +}; + +struct functor_matrixColumnDotVector : public thrust::unary_function +{ + cuAmp* matrix; // flattened column-wise + cuAmp* vector; + long long int dim; + functor_matrixColumnDotVector(cuAmp* _matrix, cuAmp* _vector, long long int _dim) : + matrix(_matrix), vector(_vector), dim(_dim) {} + + __host__ __device__ cuAmp operator()(long long int columnInd) { + + // safe to iterate serially, since #columnInd is exponentially growing + cuAmp sum = TO_CU_AMP(0, 0); + for (long long int rowInd=0; rowInd +{ + DiagonalOp op; + functor_getDiagonalOpAmp(DiagonalOp _op) : op(_op) {} + + __host__ __device__ cuAmp operator()(long long int columnInd) { + + return TO_CU_AMP( + op.deviceOperator.real[columnInd], + op.deviceOperator.imag[columnInd]); + } +}; + +struct functor_multDiagOntoDensMatr +{ + cuAmp* matrix; + DiagonalOp op; + functor_multDiagOntoDensMatr(cuAmp* _matrix, DiagonalOp _op) : matrix(_matrix), op(_op) {} + + __host__ __device__ void operator()(long long int matrixInd) { + + long long int opDim = 1LL << op.numQubits; + long long int opInd = matrixInd % opDim; + cuAmp opAmp = TO_CU_AMP( + op.deviceOperator.real[opInd], + op.deviceOperator.imag[opInd]); + + matrix[matrixInd] = opAmp * matrix[matrixInd]; + } +}; + +struct functor_collapseDensMatrToOutcome +{ + cuAmp* matrix; + int numQubits; + int outcome; + qreal outcomeProb; + int target; + functor_collapseDensMatrToOutcome(cuAmp* _matrix, int _numQubits, int _outcome, qreal _outcomeProb, int _target) : + matrix(_matrix), numQubits(_numQubits), outcome(_outcome), outcomeProb(_outcomeProb), target(_target) {} + + __host__ __device__ void operator()(long long int ind) { + + // obtain bits |...b2...><...b1...| + int b1 = (ind >> target) & 1; + int b2 = (ind >> target >> numQubits) & 1; + + int f1 = !(b1 ^ b2); // true if b1 == b2 + int f2 = !(b1 ^ outcome); // true if b1 == outcome + qreal fac = f1 * f2 / outcomeProb; // 0 or 1/prob + + matrix[ind] = TO_CU_AMP(fac, 0) * matrix[ind]; + } +}; + +thrust::device_ptr getStartPtr(Qureg qureg) { + + return thrust::device_pointer_cast(qureg.deviceCuStateVec); +} + +thrust::device_ptr getEndPtr(Qureg qureg) { + + return getStartPtr(qureg) + qureg.numAmpsTotal; +} + +auto iter_strided(Qureg qureg, long long int stride, long long int strideIndInit) { + + // iterates qureg[i * stride] over i, from i = strideIndInit, until exceeding qureg + auto rawIndIter = thrust::make_counting_iterator(strideIndInit); + auto stridedIndIter = thrust::make_transform_iterator(rawIndIter, functor_scaleInd(stride)); + auto stridedAmpIter = thrust::make_permutation_iterator(getStartPtr(qureg), stridedIndIter); + return stridedAmpIter; +} + +auto getStartStridedAmpIter(Qureg qureg, long long int stride) { + + 
return iter_strided(qureg, stride, 0); +} + +auto getEndStridedAmpIter(Qureg qureg, long long int stride) { + + long long int numIters = ceil(qureg.numAmpsTotal / (qreal) stride); + return iter_strided(qureg, stride, numIters); +} + +auto iter_blockStrided(Qureg qureg, long long int blockSize, int useOffsetBlocks, long long int stride, long long int rawIndInit) { + + // iterates qureg[(i mapped to alternating blocks) * stride] over i, from i = rawIndInit, until exceeding qureg + auto functor = functor_mapIndToAlternatingBlockScaledInd(blockSize, stride, useOffsetBlocks); + auto rawIndIter = thrust::make_counting_iterator(rawIndInit); + auto stridedBlockIndIter = thrust::make_transform_iterator(rawIndIter, functor); + auto stridedBlockAmpIter = thrust::make_permutation_iterator(getStartPtr(qureg), stridedBlockIndIter); + return stridedBlockAmpIter; +} + +auto getStartBlockStridedAmpIter(Qureg qureg, long long int blockSize, int useOffsetBlocks, long long int stride) { + + return iter_blockStrided(qureg, blockSize, useOffsetBlocks, stride, 0); +} + +auto getEndBlockStridedAmpIter(Qureg qureg, long long int blockSize, int useOffsetBlocks, long long int stride) { + + long long int numAltBlockAmps = qureg.numAmpsTotal / 2; + long long int numIters = ceil(numAltBlockAmps / (qreal) stride); + return iter_blockStrided(qureg, blockSize, useOffsetBlocks, stride, numIters); +} + +auto iter_diagonalOp(DiagonalOp op, long long int initInd) { + + auto rawIndIter = thrust::make_counting_iterator(initInd); + auto diagAmpIter = thrust::make_transform_iterator(rawIndIter, functor_getDiagonalOpAmp(op)); + return diagAmpIter; +} + +auto getStartDiagonalOpAmpIter(DiagonalOp op) { + + return iter_diagonalOp(op, 0); +} + +auto getEndDiagonalOpAmpIter(DiagonalOp op) { + + return iter_diagonalOp(op, op.numElemsPerChunk); +} + + + +/* + * Start C-linkage, so that below functions may only use C signatures (not C++) + */ +#ifdef __cplusplus +extern "C" { +#endif + + + +/* + * ENVIRONMENT MANAGEMENT + */ + +QuESTEnv createQuESTEnv(void) { + validateGPUExists(GPUExists(), __func__); + validateGPUIsCuQuantumCompatible(GPUSupportsMemPools(),__func__); + + QuESTEnv env; + env.rank=0; + env.numRanks=1; + + env.seeds = NULL; + env.numSeeds = 0; + seedQuESTDefault(&env); + + // prepare cuQuantum with automatic workspaces + env.cuConfig = createCuConfig(); + initCuConfig(env.cuConfig); + + return env; +} + +void destroyQuESTEnv(QuESTEnv env){ + free(env.seeds); + + // finalise cuQuantum + destroyCuConfig(env.cuConfig); +} + + + +/* + * QUREG CREATION AND AMP SET/GET + */ + +void statevec_createQureg(Qureg *qureg, int numQubits, QuESTEnv env) +{ + // set standard fields + long long int numAmps = 1LL << numQubits; + qureg->numQubitsInStateVec = numQubits; + qureg->numAmpsPerChunk = numAmps; + qureg->numAmpsTotal = numAmps; + qureg->chunkId = 0; + qureg->numChunks = 1; + qureg->isDensityMatrix = 0; + + // copy env's cuQuantum config handle + qureg->cuConfig = env.cuConfig; + + // allocate user-facing CPU memory + qureg->stateVec.real = (qreal*) malloc(numAmps * sizeof(qureg->stateVec.real)); + qureg->stateVec.imag = (qreal*) malloc(numAmps * sizeof(qureg->stateVec.imag)); + validateQuregAllocation(qureg, env, __func__); + + // allocate cuQuantum GPU memory (unvalidated) + cudaMalloc( &(qureg->deviceCuStateVec), numAmps * sizeof(*(qureg->deviceCuStateVec)) ); + + // allocate private cuQuantum CPU memory (for exchanging with GPU memory) + qureg->cuStateVec = (cuAmp*) malloc(numAmps * sizeof(*(qureg->cuStateVec))); +} + +void 
statevec_destroyQureg(Qureg qureg, QuESTEnv env) +{ + // free user-facing CPU memory + free(qureg.stateVec.real); + free(qureg.stateVec.imag); + + // free private cuQuantum CPU memory + free(qureg.cuStateVec); + + // free cuQuantum GPU memory + cudaFree(qureg.deviceCuStateVec); +} + +void statevec_setAmps(Qureg qureg, long long int startInd, qreal* reals, qreal* imags, long long int numAmps) +{ + // slowly manually overwrite subset of private cuQuantum CPU memory + for (long long int i=0; icuQuantumHandle, + qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, + qureg.numQubitsInStateVec, + CUSTATEVEC_STATE_VECTOR_TYPE_ZERO); +} + +void statevec_initBlankState(Qureg qureg) +{ + // init to |0> = {1, 0, 0, ...} + statevec_initZeroState(qureg); + + // overwrite amps[0] to 0 + qreal re[] = {0}; + qreal im[] = {0}; + statevec_setAmps(qureg, 0, re, im, 1); +} + +void statevec_initPlusState(Qureg qureg) +{ + custatevecInitializeStateVector( + qureg.cuConfig->cuQuantumHandle, + qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, + qureg.numQubitsInStateVec, + CUSTATEVEC_STATE_VECTOR_TYPE_UNIFORM); +} + +void statevec_initClassicalState(Qureg qureg, long long int stateInd) +{ + // init to |null> = {0, 0, 0, ...} + statevec_initBlankState(qureg); + + // overwrite amps[stateInd] to 1 + qreal re[] = {1}; + qreal im[] = {0}; + statevec_setAmps(qureg, stateInd, re, im, 1); +} + +void densmatr_initClassicalState(Qureg qureg, long long int stateInd) +{ + // init to |null> = {0, 0, 0, ...} + statevec_initBlankState(qureg); + + // index of the desired state in the flat density matrix + long long int densityDim = 1LL << qureg.numQubitsRepresented; + long long int densityInd = (densityDim + 1)*stateInd; + + // overwrite amps[densityInd] to 1 + qreal re[] = {1}; + qreal im[] = {0}; + statevec_setAmps(qureg, densityInd, re, im, 1); +} + +void statevec_initDebugState(Qureg qureg) +{ + // |n> -> (.2n + (.2n+.1)i) |n> + cuAmp init = TO_CU_AMP(0, .1); + cuAmp step = TO_CU_AMP(.2, .2); + thrust::sequence(getStartPtr(qureg), getEndPtr(qureg), init, step); +} + +void statevec_setWeightedQureg(Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out) +{ + // out -> facOut + fac1 qureg1 + fac2 qureg2 + thrust::for_each( + thrust::make_zip_iterator(thrust::make_tuple(getStartPtr(qureg1), getStartPtr(qureg2), getStartPtr(out))), + thrust::make_zip_iterator(thrust::make_tuple(getEndPtr(qureg1), getEndPtr(qureg2), getEndPtr(out))), + functor_setWeightedQureg(toCuAmp(fac1), toCuAmp(fac2), toCuAmp(facOut))); +} + + + +/* + * OPERATORS + */ + +void statevec_compactUnitary(Qureg qureg, int targetQubit, Complex alpha, Complex beta) +{ + cuAmp a = toCuAmp(alpha); + cuAmp b = toCuAmp(beta); + cuMatr matrix{ + a, -cuAmpConj(b), + b, cuAmpConj(a) + }; + custatevec_applyMatrix(qureg, {}, {targetQubit}, matrix); +} + +void statevec_controlledCompactUnitary(Qureg qureg, int controlQubit, int targetQubit, Complex alpha, Complex beta) +{ + cuAmp a = toCuAmp(alpha); + cuAmp b = toCuAmp(beta); + cuMatr matrix{ + a, -cuAmpConj(b), + b, cuAmpConj(a) + }; + custatevec_applyMatrix(qureg, {controlQubit}, {targetQubit}, matrix); +} + +void statevec_unitary(Qureg qureg, int targetQubit, ComplexMatrix2 u) +{ + custatevec_applyMatrix(qureg, {}, {targetQubit}, toCuMatr(u)); +} + +void statevec_multiControlledMultiQubitUnitary(Qureg qureg, long long int ctrlMask, int* targs, int numTargs, ComplexMatrixN u) +{ + std::vector c = getIndsFromMask(ctrlMask,qureg.numQubitsInStateVec); + std::vector t(targs,targs+numTargs); + 
custatevec_applyMatrix(qureg, c, t, toCuMatr(u)); +} + +void statevec_multiControlledTwoQubitUnitary(Qureg qureg, long long int ctrlMask, int q1, int q2, ComplexMatrix4 u) +{ + std::vector c = getIndsFromMask(ctrlMask,qureg.numQubitsInStateVec); + custatevec_applyMatrix(qureg, c, {q1,q2}, toCuMatr(u)); +} + +void statevec_controlledUnitary(Qureg qureg, int controlQubit, int targetQubit, ComplexMatrix2 u) +{ + custatevec_applyMatrix(qureg, {controlQubit}, {targetQubit}, toCuMatr(u)); +} + +void statevec_multiControlledUnitary(Qureg qureg, long long int ctrlQubitsMask, long long int ctrlFlipMask, int targetQubit, ComplexMatrix2 u) +{ + int targs[] = {targetQubit}; + std::vector ctrlInds = getIndsFromMask(ctrlQubitsMask,qureg.numQubitsInStateVec); + std::vector ctrlVals(ctrlInds.size()); + for (size_t i=0; icuQuantumHandle, + qureg.deviceCuStateVec, CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + toCuMatr(u).data(), CU_AMP_IN_MATRIX_PREC, CUSTATEVEC_MATRIX_LAYOUT_ROW, 0, + targs, 1, ctrlInds.data(), ctrlVals.data(), ctrlInds.size(), + CUSTATEVEC_COMPUTE_DEFAULT, nullptr, 0); +} + +void statevec_pauliX(Qureg qureg, int targetQubit) +{ + cuAmp a0 = TO_CU_AMP(0, 0); + cuAmp a1 = TO_CU_AMP(1, 0); + cuMatr matrix{ + a0, a1, + a1, a0 + }; + custatevec_applyMatrix(qureg, {}, {targetQubit}, matrix); +} + +void statevec_pauliY(Qureg qureg, int targetQubit) +{ + cuAmp a0 = TO_CU_AMP(0, 0); + cuAmp aI = TO_CU_AMP(0, 1); + cuMatr matrix{ + a0, -aI, + aI, a0 + }; + custatevec_applyMatrix(qureg, {}, {targetQubit}, matrix); +} + +void statevec_pauliYConj(Qureg qureg, int targetQubit) +{ + cuAmp a0 = TO_CU_AMP(0, 0); + cuAmp aI = TO_CU_AMP(0, 1); + cuMatr matrix{ + a0, aI, + -aI, a0 + }; + custatevec_applyMatrix(qureg, {}, {targetQubit}, matrix); +} + +void statevec_controlledPauliY(Qureg qureg, int controlQubit, int targetQubit) +{ + cuAmp a0 = TO_CU_AMP(0, 0); + cuAmp aI = TO_CU_AMP(0, 1); + cuMatr matrix{ + a0, -aI, + aI, a0 + }; + custatevec_applyMatrix(qureg, {controlQubit}, {targetQubit}, matrix); +} + +void statevec_controlledPauliYConj(Qureg qureg, int controlQubit, int targetQubit) +{ + cuAmp a0 = TO_CU_AMP(0, 0); + cuAmp aI = TO_CU_AMP(0, 1); + cuMatr matrix{ + a0, aI, + -aI, a0 + }; + custatevec_applyMatrix(qureg, {controlQubit}, {targetQubit}, matrix); +} + +void statevec_phaseShiftByTerm(Qureg qureg, int targetQubit, Complex term) +{ + cuAmp a1 = TO_CU_AMP(1, 0); + cuAmp a2 = toCuAmp(term); + cuAmp elems[] = {a1, a2}; + + custatevec_applyDiagonal(qureg, {}, {targetQubit}, elems); +} + +void statevec_controlledPhaseShift(Qureg qureg, int idQubit1, int idQubit2, qreal angle) +{ + cuAmp a1 = TO_CU_AMP(1, 0); + cuAmp a2 = TO_CU_AMP(cos(angle), sin(angle)); + cuAmp elems[] = {a1, a2}; + + custatevec_applyDiagonal(qureg, {idQubit1}, {idQubit2}, elems); +} + +void statevec_multiControlledPhaseShift(Qureg qureg, int *controlQubits, int numControlQubits, qreal angle) +{ + cuAmp a1 = TO_CU_AMP(1, 0); + cuAmp a2 = TO_CU_AMP(cos(angle), sin(angle)); + cuAmp elems[] = {a1, a2}; + + std::vector targs{controlQubits[0]}; + std::vector ctrls(controlQubits + 1, controlQubits + numControlQubits); + custatevec_applyDiagonal(qureg, ctrls, targs, elems); +} + +void statevec_multiRotateZ(Qureg qureg, long long int mask, qreal angle) +{ + qreal theta = - angle/2.; + std::vector targs = getIndsFromMask(mask, qureg.numQubitsInStateVec); + std::vector paulis(targs.size(), CUSTATEVEC_PAULI_Z); + + custatevecApplyPauliRotation( + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, 
qureg.numQubitsInStateVec, + theta, paulis.data(), targs.data(), targs.size(), + nullptr, nullptr, 0); +} + +void statevec_multiControlledMultiRotateZ(Qureg qureg, long long int ctrlMask, long long int targMask, qreal angle) +{ + qreal theta = - angle/2.; + std::vector ctrls = getIndsFromMask(ctrlMask, qureg.numQubitsInStateVec); + std::vector targs = getIndsFromMask(targMask, qureg.numQubitsInStateVec); + std::vector paulis(targs.size(), CUSTATEVEC_PAULI_Z); + + custatevecApplyPauliRotation( + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + theta, paulis.data(), targs.data(), targs.size(), + ctrls.data(), nullptr, ctrls.size()); +} + +void statevec_controlledPhaseFlip(Qureg qureg, int idQubit1, int idQubit2) +{ + cuAmp a1 = TO_CU_AMP( 1, 0); + cuAmp a2 = TO_CU_AMP(-1, 0); + cuAmp elems[] = {a1, a2}; + + custatevec_applyDiagonal(qureg, {idQubit1}, {idQubit2}, elems); +} + +void statevec_multiControlledPhaseFlip(Qureg qureg, int *controlQubits, int numControlQubits) +{ + cuAmp a1 = TO_CU_AMP( 1, 0); + cuAmp a2 = TO_CU_AMP(-1, 0); + cuAmp elems[] = {a1, a2}; + + std::vector targs{controlQubits[0]}; + std::vector ctrls(controlQubits + 1, controlQubits + numControlQubits); + custatevec_applyDiagonal(qureg, ctrls, targs, elems); +} + +void statevec_swapQubitAmps(Qureg qureg, int qb1, int qb2) +{ + int2 targPairs[] = {{qb1, qb2}}; + int numPairs = 1; + + custatevecSwapIndexBits( + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + targPairs, numPairs, + nullptr, nullptr, 0); +} + +void statevec_hadamard(Qureg qureg, int targetQubit) +{ + cuAmp a = TO_CU_AMP(1/sqrt(2.), 0); + cuMatr matrix{ + a, a, + a, -a + }; + custatevec_applyMatrix(qureg, {}, {targetQubit}, matrix); +} + +void statevec_controlledNot(Qureg qureg, int controlQubit, int targetQubit) +{ + cuAmp a0 = TO_CU_AMP(0, 0); + cuAmp a1 = TO_CU_AMP(1, 0); + cuMatr matrix{ + a0, a1, + a1, a0 + }; + custatevec_applyMatrix(qureg, {controlQubit}, {targetQubit}, matrix); +} + +void statevec_multiControlledMultiQubitNot(Qureg qureg, int ctrlMask, int targMask) +{ + // this operator can be effected in one-shot using a custom kernel, but we here + // instead resort to slowly (by at most a factor #targs) effect it as a sequence + // of one-target multi-ctrl NOT gates. 
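+    // for example (illustration only), a NOT on targets {0,2} controlled on
+    // qubit 3 is effected below as ctrl(3)-X(0) followed by ctrl(3)-X(2), which
+    // is equivalent since both gates share the same controls and the X's act
+    // on distinct qubits.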
+ + cuAmp a0 = TO_CU_AMP(0, 0); + cuAmp a1 = TO_CU_AMP(1, 0); + cuMatr matrix{ + a0, a1, + a1, a0 + }; + std::vector ctrls = getIndsFromMask(ctrlMask, qureg.numQubitsInStateVec); + std::vector targs = getIndsFromMask(targMask, qureg.numQubitsInStateVec); + for (int targ : targs) + custatevec_applyMatrix(qureg, ctrls, {targ}, matrix); +} + +void statevec_applySubDiagonalOp(Qureg qureg, int* targs, SubDiagonalOp op, int conj) +{ + // sneakily leverage the CPU cuQuantum memory in order to convert op + // (as separate arrays op.real and op.imag) into cuAmp* + cuAmp* elems = qureg.cuStateVec; + for (long long int i=0; i t(targs, targs + op.numQubits); + custatevec_applyDiagonal(qureg, {}, t, elems); +} + +void statevec_applyDiagonalOp(Qureg qureg, DiagonalOp op) +{ + thrust::transform( + getStartDiagonalOpAmpIter(op), getEndDiagonalOpAmpIter(op), + getStartPtr(qureg), getStartPtr(qureg), // both are begin iters + thrust::multiplies()); +} + +void densmatr_applyDiagonalOp(Qureg qureg, DiagonalOp op) +{ + auto startIndIter = thrust::make_counting_iterator(0); + auto endIndIter = startIndIter + qureg.numAmpsTotal; + auto functor = functor_multDiagOntoDensMatr(qureg.deviceCuStateVec, op); + thrust::for_each(startIndIter, endIndIter, functor); +} + + + +/* + * DECOHERENCE + */ + +void densmatr_mixDensityMatrix(Qureg combineQureg, qreal otherProb, Qureg otherQureg) +{ + Complex facOut = {.real=(1-otherProb), .imag=0}; + Complex facOther = {.real=otherProb, .imag=0}; + Complex zero = {.real=0, .imag=0}; + + statevec_setWeightedQureg(facOther, otherQureg, zero, otherQureg, facOut, combineQureg); +} + +void densmatr_mixDephasing(Qureg qureg, int targetQubit, qreal dephase) +{ + cuAmp a = TO_CU_AMP(1, 0); + cuAmp b = TO_CU_AMP(1-dephase, 0); // dephase = 2*prob + cuAmp elems[] = {a, b, b, a}; + + std::vector targs{targetQubit, targetQubit + qureg.numQubitsRepresented}; + custatevec_applyDiagonal(qureg, {}, targs, elems); +} + +void densmatr_mixTwoQubitDephasing(Qureg qureg, int qb1, int qb2, qreal dephase) +{ + // this function effects the two-qubit dephasing on a density matrix, via the + // four-qubit diagonal superoperator on a Choi vector. The 16 elements of the + // diagonal have only two unique entries; 1 and 1-2*dephase/3. It's conceivable + // that a bespoke kernel could be faster, but likely by little. + + cuAmp a = TO_CU_AMP(1, 0); + cuAmp b = TO_CU_AMP(1 - dephase, 0); // dephase = 4*prob/3 + cuAmp elems[] = {a, b, b, b, b, a, b, b, b, b, a, b, b, b, b, a}; + + int shift = qureg.numQubitsRepresented; + std::vector targs{qb1, qb2, qb1 + shift, qb2 + shift}; + custatevec_applyDiagonal(qureg, {}, targs, elems); +} + +void densmatr_mixDepolarising(Qureg qureg, int targetQubit, qreal depol) +{ + // this function effects depolarising as a dense two-qubit superoperator + // on a Choi vector, where only 6 of the 16 elements are non-zero. This is + // potentially wasteful, and a bespoke kernel could be faster, leveraging + // QuEST's existing GPU code (or the optimised algorithm in the "distributed" + // manuscript). 
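+    //
+    // for reference, the 4x4 matrix below is the superoperator (acting on the
+    // vectorised density matrix) of the single-qubit depolarising channel
+    // rho -> (1-p) rho + (p/3)(X rho X + Y rho Y + Z rho Z), with p = 3*depol/4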
+ + // depol = (4*prob)/3.0 + cuAmp a = TO_CU_AMP(depol/2., 0); // 2*prob/3 + cuAmp b = TO_CU_AMP(1 - depol/2., 0); // 1-2*prob/3 + cuAmp c = TO_CU_AMP(1 - depol, 0); // 1-4*prob/3 + cuAmp z = TO_CU_AMP(0, 0); // 0 + + cuMatr matr{ + b, z, z, a, + z, c, z, z, + z, z, c, z, + a, z, z, b + }; + std::vector targs{ targetQubit, targetQubit + qureg.numQubitsRepresented }; + custatevec_applyMatrix(qureg, {}, targs, matr); +} + +void densmatr_mixDamping(Qureg qureg, int qb, qreal prob) +{ + // this function effects damping as a dense two-qubit superoperator + // on a Choi vector, where only 5 of the 16 elements are non-zero. This is + // potentially wasteful, and a bespoke kernel could be faster, leveraging + // QuEST's existing GPU code (or the optimised algorithm in the "distributed" + // manuscript). + + cuAmp w = TO_CU_AMP(1, 0); + cuAmp z = TO_CU_AMP(0, 0); + cuAmp p = TO_CU_AMP(prob, 0); + cuAmp a = TO_CU_AMP(sqrt(1 - prob), 0); + cuAmp b = TO_CU_AMP(1-prob, 0); + + cuMatr matr{ + w, z, z, p, + z, a, z, z, + z, z, a, z, + z, z, z, b + }; + std::vector targs{qb, qb + qureg.numQubitsRepresented}; + custatevec_applyMatrix(qureg, {}, targs, matr); +} + + + +/* + * CALCULATIONS + */ + +qreal densmatr_calcTotalProb(Qureg qureg) +{ + long long int diagStride = 1 + (1LL << qureg.numQubitsRepresented); + + cuAmp sum = thrust::reduce( + getStartStridedAmpIter(qureg, diagStride), + getEndStridedAmpIter(qureg, diagStride), + TO_CU_AMP(0, 0)); + + return cuAmpReal(sum); +} + +qreal statevec_calcTotalProb(Qureg qureg) +{ + qreal abs2sum0; + qreal abs2sum1; + int basisBits[] = {0}; + int numBasisBits = 1; + + custatevecAbs2SumOnZBasis( + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + &abs2sum0, &abs2sum1, basisBits, numBasisBits); + + return abs2sum0 + abs2sum1; +} + +qreal statevec_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome) +{ + // we only ever compute prob(outcome=0) as per the QuEST API's limitation + qreal prob0; + qreal* prob1 = nullptr; + + int basisBits[] = {measureQubit}; + int numBasisBits = 1; + + custatevecAbs2SumOnZBasis( + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + &prob0, prob1, basisBits, numBasisBits); + + return (outcome == 0)? prob0 : (1-prob0); +} + +qreal densmatr_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome) +{ + long long int blockSize = (1LL << measureQubit); + long long int diagStride = (1LL << qureg.numQubitsRepresented) + 1; + + // we could set this to outcome to sum the outcome=1 amps directly, but the + // QuEST API specifies that the outcome=0 amps are always summed instead. + int useOffsetBlocks = 0; + + cuAmp sum = thrust::reduce( + getStartBlockStridedAmpIter(qureg, blockSize, useOffsetBlocks, diagStride), + getEndBlockStridedAmpIter(qureg, blockSize, useOffsetBlocks, diagStride), + TO_CU_AMP(0, 0)); + + qreal prob0 = cuAmpReal(sum); + return (outcome == 0)? 
prob0 : (1-prob0); +} + +qreal densmatr_calcInnerProduct(Qureg a, Qureg b) +{ + cuAmp prod = thrust::inner_product( + getStartPtr(a), getEndPtr(a), getStartPtr(b), + TO_CU_AMP(0,0), thrust::plus(), functor_prodOfConj()); + + return cuAmpReal(prod); +} + +Complex statevec_calcInnerProduct(Qureg bra, Qureg ket) +{ + cuAmp prod = thrust::inner_product( + getStartPtr(bra), getEndPtr(bra), getStartPtr(ket), + TO_CU_AMP(0,0), thrust::plus(), functor_prodOfConj()); + + return (Complex) {.real=cuAmpReal(prod), .imag=cuAmpImag(prod)}; +} + +qreal densmatr_calcFidelity(Qureg qureg, Qureg pureState) +{ + // create iterator f(i) = sum_j qureg_ij pureState_j + auto functor = functor_matrixColumnDotVector( + qureg.deviceCuStateVec, pureState.deviceCuStateVec, // both init iters + pureState.numAmpsTotal); + auto rawIndIter = thrust::make_counting_iterator(0); + auto prodAmpIter = thrust::make_transform_iterator(rawIndIter, functor); + + // compute sum_i conj(pureState_i) * f(i) + cuAmp prod = thrust::inner_product( + getStartPtr(pureState), getEndPtr(pureState), prodAmpIter, + TO_CU_AMP(0,0), thrust::plus(), functor_prodOfConj()); + + qreal prodRe = cuAmpReal(prod); + return prodRe; +} + +qreal densmatr_calcHilbertSchmidtDistance(Qureg a, Qureg b) +{ + cuAmp trace = thrust::inner_product( + getStartPtr(a), getEndPtr(a), getStartPtr(b), + TO_CU_AMP(0,0), thrust::plus(), functor_absOfDif()); + + qreal dist = sqrt(cuAmpReal(trace)); + return dist; +} + +qreal densmatr_calcPurity(Qureg qureg) +{ + qreal sumOfAbsSquared = thrust::transform_reduce( + getStartPtr(qureg), getEndPtr(qureg), functor_absSquared(), + 0., thrust::plus()); + + return sumOfAbsSquared; +} + +Complex statevec_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) +{ + cuAmp prod = thrust::inner_product( + getStartPtr(qureg), getEndPtr(qureg), getStartDiagonalOpAmpIter(op), + TO_CU_AMP(0,0), thrust::plus(), functor_prodOfAbsSquared()); + + return (Complex) {.real=cuAmpReal(prod), .imag=cuAmpImag(prod)}; +} + +Complex densmatr_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) +{ + long long int diagStride = 1 + (1LL << qureg.numQubitsRepresented); + + cuAmp prod = thrust::inner_product( + getStartStridedAmpIter(qureg, diagStride), + getEndStridedAmpIter(qureg, diagStride), + getStartDiagonalOpAmpIter(op), + TO_CU_AMP(0,0), thrust::plus(), thrust::multiplies()); + + return (Complex) {.real=cuAmpReal(prod), .imag=cuAmpImag(prod)}; +} + + + +/* + * REDUCTIONS + */ + +void statevec_collapseToKnownProbOutcome(Qureg qureg, int measureQubit, int outcome, qreal outcomeProb) +{ + int basisBits[] = {measureQubit}; + + custatevecCollapseOnZBasis( + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + outcome, basisBits, 1, outcomeProb); +} + +void densmatr_collapseToKnownProbOutcome(Qureg qureg, int measureQubit, int outcome, qreal outcomeProb) +{ + auto functor = functor_collapseDensMatrToOutcome( + qureg.deviceCuStateVec, qureg.numQubitsRepresented, + outcome, outcomeProb, measureQubit); + + auto startIndIter = thrust::make_counting_iterator(0); + auto endIndIter = startIndIter + qureg.numAmpsTotal; + thrust::for_each(startIndIter, endIndIter, functor); +} + + + +#ifdef __cplusplus +} +#endif diff --git a/QuEST/src/GPU/QuEST_gpu.cu b/QuEST/src/GPU/QuEST_gpu.cu index 3b2138a4c..aa2bc085d 100755 --- a/QuEST/src/GPU/QuEST_gpu.cu +++ b/QuEST/src/GPU/QuEST_gpu.cu @@ -1,17 +1,17 @@ // Distributed under MIT licence. 
See https://github.com/QuEST-Kit/QuEST/blob/master/LICENCE.txt for details /** @file - * An implementation of the backend in ../QuEST_internal.h for a GPU environment. + * A custom-kernel implementation of the backend in ../QuEST_internal.h for a GPU environment. * * @author Ania Brown * @author Tyson Jones */ # include "QuEST.h" +# include "QuEST_gpu_common.h" # include "QuEST_precision.h" # include "QuEST_validation.h" -# include "QuEST_internal.h" // purely to resolve getQuESTDefaultSeedKey -# include "mt19937ar.h" +# include "QuEST_internal.h" # include # include @@ -156,6 +156,25 @@ extern "C" { #endif + +QuESTEnv createQuESTEnv(void) { + validateGPUExists(GPUExists(), __func__); + + QuESTEnv env; + env.rank=0; + env.numRanks=1; + + env.seeds = NULL; + env.numSeeds = 0; + seedQuESTDefault(&env); + + return env; +} + +void destroyQuESTEnv(QuESTEnv env){ + free(env.seeds); +} + void statevec_setAmps(Qureg qureg, long long int startInd, qreal* reals, qreal* imags, long long int numAmps) { cudaDeviceSynchronize(); @@ -171,7 +190,6 @@ void statevec_setAmps(Qureg qureg, long long int startInd, qreal* reals, qreal* cudaMemcpyHostToDevice); } - /** works for both statevectors and density matrices */ void statevec_cloneQureg(Qureg targetQureg, Qureg copyQureg) { @@ -189,36 +207,6 @@ void statevec_cloneQureg(Qureg targetQureg, Qureg copyQureg) { cudaMemcpyDeviceToDevice); } -__global__ void densmatr_initPureStateKernel( - long long int numPureAmps, - qreal *targetVecReal, qreal *targetVecImag, - qreal *copyVecReal, qreal *copyVecImag) -{ - // this is a particular index of the pure copyQureg - long long int index = blockIdx.x*blockDim.x + threadIdx.x; - if (index>=numPureAmps) return; - - qreal realRow = copyVecReal[index]; - qreal imagRow = copyVecImag[index]; - for (long long int col=0; col < numPureAmps; col++) { - qreal realCol = copyVecReal[col]; - qreal imagCol = - copyVecImag[col]; // minus for conjugation - targetVecReal[col*numPureAmps + index] = realRow*realCol - imagRow*imagCol; - targetVecImag[col*numPureAmps + index] = realRow*imagCol + imagRow*realCol; - } -} - -void densmatr_initPureState(Qureg targetQureg, Qureg copyQureg) -{ - int threadsPerCUDABlock, CUDABlocks; - threadsPerCUDABlock = 128; - CUDABlocks = ceil((qreal)(copyQureg.numAmpsPerChunk)/threadsPerCUDABlock); - densmatr_initPureStateKernel<<>>( - copyQureg.numAmpsPerChunk, - targetQureg.deviceStateVec.real, targetQureg.deviceStateVec.imag, - copyQureg.deviceStateVec.real, copyQureg.deviceStateVec.imag); -} - __global__ void densmatr_initPlusStateKernel(long long int stateVecSize, qreal probFactor, qreal *stateVecReal, qreal *stateVecImag){ long long int index; @@ -328,178 +316,6 @@ void statevec_destroyQureg(Qureg qureg, QuESTEnv env) cudaFree(qureg.secondLevelReduction); } -DiagonalOp agnostic_createDiagonalOp(int numQubits, QuESTEnv env) { - - DiagonalOp op; - op.numQubits = numQubits; - op.numElemsPerChunk = (1LL << numQubits) / env.numRanks; - op.chunkId = env.rank; - op.numChunks = env.numRanks; - - // allocate CPU memory (initialised to zero) - op.real = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal)); - op.imag = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal)); - // @TODO no handling of rank>1 allocation (no distributed GPU) - - // check cpu memory allocation was successful - validateDiagonalOpAllocation(&op, env, __func__); - - // allocate GPU memory - size_t arrSize = op.numElemsPerChunk * sizeof(qreal); - cudaMalloc(&(op.deviceOperator.real), arrSize); - cudaMalloc(&(op.deviceOperator.imag), arrSize); - 
- // check gpu memory allocation was successful - validateDiagonalOpGPUAllocation(&op, env, __func__); - - // initialise GPU memory to zero - cudaMemset(op.deviceOperator.real, 0, arrSize); - cudaMemset(op.deviceOperator.imag, 0, arrSize); - - return op; -} - -void agnostic_destroyDiagonalOp(DiagonalOp op) { - free(op.real); - free(op.imag); - cudaFree(op.deviceOperator.real); - cudaFree(op.deviceOperator.imag); -} - -void agnostic_syncDiagonalOp(DiagonalOp op) { - cudaDeviceSynchronize(); - size_t mem_elems = op.numElemsPerChunk * sizeof *op.real; - cudaMemcpy(op.deviceOperator.real, op.real, mem_elems, cudaMemcpyHostToDevice); - cudaMemcpy(op.deviceOperator.imag, op.imag, mem_elems, cudaMemcpyHostToDevice); -} - -__global__ void agnostic_initDiagonalOpFromPauliHamilKernel( - DiagonalOp op, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms -) { - // each thread processes one diagonal element - long long int elemInd = blockIdx.x*blockDim.x + threadIdx.x; - if (elemInd >= op.numElemsPerChunk) - return; - - qreal elem = 0; - - // elem is (+-) every coefficient, with sign determined by parity - for (int t=0; t>>( - op, d_pauliCodes, d_termCoeffs, hamil.numSumTerms); - - // copy populated operator into to RAM - cudaDeviceSynchronize(); - size_t mem_elems = op.numElemsPerChunk * sizeof *op.real; - cudaMemcpy(op.real, op.deviceOperator.real, mem_elems, cudaMemcpyDeviceToHost); - cudaMemcpy(op.imag, op.deviceOperator.imag, mem_elems, cudaMemcpyDeviceToHost); - - cudaFree(d_pauliCodes); - cudaFree(d_termCoeffs); -} - -int GPUExists(void){ - int deviceCount, device; - int gpuDeviceCount = 0; - struct cudaDeviceProp properties; - cudaError_t cudaResultCode = cudaGetDeviceCount(&deviceCount); - if (cudaResultCode != cudaSuccess) deviceCount = 0; - /* machines with no GPUs can still report one emulation device */ - for (device = 0; device < deviceCount; ++device) { - cudaGetDeviceProperties(&properties, device); - if (properties.major != 9999) { /* 9999 means emulation only */ - ++gpuDeviceCount; - } - } - if (gpuDeviceCount) return 1; - else return 0; -} - -QuESTEnv createQuESTEnv(void) { - - validateGPUExists(GPUExists(), __func__); - - QuESTEnv env; - env.rank=0; - env.numRanks=1; - - env.seeds = NULL; - env.numSeeds = 0; - seedQuESTDefault(&env); - - return env; -} - -void syncQuESTEnv(QuESTEnv env){ - cudaDeviceSynchronize(); -} - -int syncQuESTSuccess(int successCode){ - return successCode; -} - -void destroyQuESTEnv(QuESTEnv env){ - free(env.seeds); -} - -void reportQuESTEnv(QuESTEnv env){ - printf("EXECUTION ENVIRONMENT:\n"); - printf("Running locally on one node with GPU\n"); - printf("Number of ranks is %d\n", env.numRanks); -# ifdef _OPENMP - printf("OpenMP enabled\n"); - printf("Number of threads available is %d\n", omp_get_max_threads()); -# else - printf("OpenMP disabled\n"); -# endif -} - -void getEnvironmentString(QuESTEnv env, char str[200]){ - - // OpenMP can be hybridised with GPU in future, so this check is safe and worthwhile - int ompStatus=0; - int numThreads=1; -# ifdef _OPENMP - ompStatus=1; - numThreads=omp_get_max_threads(); -# endif - - // there is no reporting of CUDA cores/threads/blocks currently (since non-trivial) - sprintf(str, "CUDA=1 OpenMP=%d MPI=0 threads=%d ranks=1", ompStatus, numThreads); -} - void copyStateToGPU(Qureg qureg) { if (DEBUG) printf("Copying data to GPU\n"); @@ -542,35 +358,6 @@ void statevec_copySubstateFromGPU(Qureg qureg, long long int startInd, long long if (DEBUG) printf("Finished copying data from GPU\n"); } -/** Print the 
current state vector of probability amplitudes for a set of qubits to standard out. - For debugging purposes. Each rank should print output serially. Only print output for systems <= 5 qubits - */ -void statevec_reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank){ - long long int index; - int rank; - copyStateFromGPU(qureg); - if (qureg.numQubitsInStateVec<=5){ - for (rank=0; rank=stateVecSize) return; - - qreal normFactor = 1.0/sqrt((qreal)stateVecSize/2); - bit = extractBit(qubitId, index); - if (bit==outcome) { - stateVecReal[index] = normFactor; - stateVecImag[index] = 0.0; - } else { - stateVecReal[index] = 0.0; - stateVecImag[index] = 0.0; - } -} - -void statevec_initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome) -{ - int threadsPerCUDABlock, CUDABlocks; - threadsPerCUDABlock = 128; - CUDABlocks = ceil((qreal)(qureg->numAmpsPerChunk)/threadsPerCUDABlock); - statevec_initStateOfSingleQubitKernel<<>>(qureg->numAmpsPerChunk, qureg->deviceStateVec.real, qureg->deviceStateVec.imag, qubitId, outcome); -} - -// returns 1 if successful, else 0 -int statevec_initStateFromSingleFile(Qureg *qureg, char filename[200], QuESTEnv env){ - long long int chunkSize, stateVecSize; - long long int indexInChunk, totalIndex; - - chunkSize = qureg->numAmpsPerChunk; - stateVecSize = chunkSize*qureg->numChunks; - - qreal *stateVecReal = qureg->stateVec.real; - qreal *stateVecImag = qureg->stateVec.imag; - - FILE *fp; - char line[200]; - - fp = fopen(filename, "r"); - if (fp == NULL) - return 0; - - indexInChunk = 0; totalIndex = 0; - while (fgets(line, sizeof(char)*200, fp) != NULL && totalIndexchunkId){ - sscanf(line, REAL_SPECIFIER ", " REAL_SPECIFIER, &(stateVecReal[indexInChunk]), - &(stateVecImag[indexInChunk])); - indexInChunk += 1; - } - totalIndex += 1; - } - } - fclose(fp); - copyStateToGPU(*qureg); - - // indicate success - return 1; -} - -int statevec_compareStates(Qureg mq1, Qureg mq2, qreal precision){ - qreal diff; - int chunkSize = mq1.numAmpsPerChunk; - - copyStateFromGPU(mq1); - copyStateFromGPU(mq2); - - for (int i=0; iprecision) return 0; - diff = mq1.stateVec.imag[i] - mq2.stateVec.imag[i]; - if (diff<0) diff *= -1; - if (diff>precision) return 0; - } - return 1; -} - __global__ void statevec_compactUnitaryKernel (Qureg qureg, int rotQubit, Complex alpha, Complex beta){ // ----- sizes long long int sizeBlock, // size of blocks @@ -2171,131 +1877,6 @@ qreal densmatr_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome) return outcomeProb; } -// atomicAdd on floats/doubles isn't available on <6 CC devices, so we add it ourselves -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 -#else -static __inline__ __device__ double atomicAdd(double* address, double val) -{ - unsigned long long int* address_as_ull = (unsigned long long int*) address; - unsigned long long int old = *address_as_ull, assumed; - - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); - - // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) - } while (assumed != old); - - return __longlong_as_double(old); -} -#endif - -__global__ void statevec_calcProbOfAllOutcomesKernel( - qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits -) { - // each thread handles one amplitude (all amplitudes are involved) - long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x; - if (ampInd >= qureg.numAmpsTotal) return; - - qreal prob = ( - 
qureg.deviceStateVec.real[ampInd]*qureg.deviceStateVec.real[ampInd] + - qureg.deviceStateVec.imag[ampInd]*qureg.deviceStateVec.imag[ampInd]); - - // each amplitude contributes to one outcome - long long int outcomeInd = 0; - for (int q=0; q>>( - d_outcomeProbs, qureg, d_qubits, numQubits); - - // copy outcomeProbs from GPU memory - cudaMemcpy(outcomeProbs, d_outcomeProbs, mem_outcomeProbs, cudaMemcpyDeviceToHost); - - // free GPU memory - cudaFree(d_qubits); - cudaFree(d_outcomeProbs); -} - -__global__ void densmatr_calcProbOfAllOutcomesKernel( - qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits -) { - // each thread handles one diagonal amplitude - long long int diagInd = blockIdx.x*blockDim.x + threadIdx.x; - long long int numDiags = (1LL << qureg.numQubitsRepresented); - if (diagInd >= numDiags) return; - - long long int flatInd = (1 + numDiags)*diagInd; - qreal prob = qureg.deviceStateVec.real[flatInd]; // im[flatInd] assumed ~ 0 - - // each diagonal amplitude contributes to one outcome - long long int outcomeInd = 0; - for (int q=0; q>>( - d_outcomeProbs, qureg, d_qubits, numQubits); - - // copy outcomeProbs from GPU memory - cudaMemcpy(outcomeProbs, d_outcomeProbs, mem_outcomeProbs, cudaMemcpyDeviceToHost); - - // free GPU memory - cudaFree(d_qubits); - cudaFree(d_outcomeProbs); -} - /** computes Tr(conjTrans(a) b) = sum of (a_ij^* b_ij), which is a real number */ __global__ void densmatr_calcInnerProductKernel( Qureg a, Qureg b, long long int numTermsToSum, qreal* reducedArray @@ -3080,74 +2661,6 @@ void densmatr_mixDamping(Qureg qureg, int targetQubit, qreal damping) { part1, part2, part3, bothBits); } -/** Called once for every 16 amplitudes */ -__global__ void densmatr_mixTwoQubitDepolarisingKernel( - qreal depolLevel, qreal* vecReal, qreal *vecImag, long long int numAmpsToVisit, - long long int part1, long long int part2, long long int part3, - long long int part4, long long int part5, - long long int rowCol1, long long int rowCol2) -{ - long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x; - if (scanInd >= numAmpsToVisit) return; - - // index of |..0..0..><..0..0| - long long int ind00 = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2) + ((scanInd&part4)<<3) + ((scanInd&part5)<<4); - long long int ind01 = ind00 + rowCol1; - long long int ind10 = ind00 + rowCol2; - long long int ind11 = ind00 + rowCol1 + rowCol2; - - qreal realAvDepol = depolLevel * 0.25 * ( - vecReal[ind00] + vecReal[ind01] + vecReal[ind10] + vecReal[ind11]); - qreal imagAvDepol = depolLevel * 0.25 * ( - vecImag[ind00] + vecImag[ind01] + vecImag[ind10] + vecImag[ind11]); - - qreal retain = 1 - depolLevel; - vecReal[ind00] *= retain; vecImag[ind00] *= retain; - vecReal[ind01] *= retain; vecImag[ind01] *= retain; - vecReal[ind10] *= retain; vecImag[ind10] *= retain; - vecReal[ind11] *= retain; vecImag[ind11] *= retain; - - vecReal[ind00] += realAvDepol; vecImag[ind00] += imagAvDepol; - vecReal[ind01] += realAvDepol; vecImag[ind01] += imagAvDepol; - vecReal[ind10] += realAvDepol; vecImag[ind10] += imagAvDepol; - vecReal[ind11] += realAvDepol; vecImag[ind11] += imagAvDepol; -} - -void densmatr_mixTwoQubitDepolarising(Qureg qureg, int qubit1, int qubit2, qreal depolLevel) { - - if (depolLevel == 0) - return; - - // assumes qubit2 > qubit1 - - densmatr_mixTwoQubitDephasing(qureg, qubit1, qubit2, depolLevel); - - int rowQubit1 = qubit1 + qureg.numQubitsRepresented; - int rowQubit2 = qubit2 + qureg.numQubitsRepresented; - - long long int colBit1 = 1LL << qubit1; - long long int rowBit1 = 
1LL << rowQubit1; - long long int colBit2 = 1LL << qubit2; - long long int rowBit2 = 1LL << rowQubit2; - - long long int rowCol1 = colBit1 | rowBit1; - long long int rowCol2 = colBit2 | rowBit2; - - long long int numAmpsToVisit = qureg.numAmpsPerChunk/16; - long long int part1 = colBit1 - 1; - long long int part2 = (colBit2 >> 1) - colBit1; - long long int part3 = (rowBit1 >> 2) - (colBit2 >> 1); - long long int part4 = (rowBit2 >> 3) - (rowBit1 >> 2); - long long int part5 = numAmpsToVisit - (rowBit2 >> 3); - - int threadsPerCUDABlock, CUDABlocks; - threadsPerCUDABlock = 128; - CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock); - densmatr_mixTwoQubitDepolarisingKernel<<>>( - depolLevel, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit, - part1, part2, part3, part4, part5, rowCol1, rowCol2); -} - __global__ void statevec_setWeightedQuregKernel(Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out) { long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x; @@ -3566,569 +3079,6 @@ Complex densmatr_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) { return expecVal; } -void agnostic_setDiagonalOpElems(DiagonalOp op, long long int startInd, qreal* real, qreal* imag, long long int numElems) { - - // update both RAM and VRAM, for consistency - memcpy(&op.real[startInd], real, numElems * sizeof(qreal)); - memcpy(&op.imag[startInd], imag, numElems * sizeof(qreal)); - - cudaDeviceSynchronize(); - cudaMemcpy( - op.deviceOperator.real + startInd, - real, - numElems * sizeof(*(op.deviceOperator.real)), - cudaMemcpyHostToDevice); - cudaMemcpy( - op.deviceOperator.imag + startInd, - imag, - numElems * sizeof(*(op.deviceOperator.imag)), - cudaMemcpyHostToDevice); -} - -__global__ void statevec_applyPhaseFuncOverridesKernel( - Qureg qureg, int* qubits, int numQubits, enum bitEncoding encoding, - qreal* coeffs, qreal* exponents, int numTerms, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - int conj -) { - long long int index = blockIdx.x*blockDim.x + threadIdx.x; - if (index>=qureg.numAmpsPerChunk) return; - - // determine global amplitude index (non-distributed, so it's just local index) - long long int globalAmpInd = index; - - // determine phase index of {qubits} - long long int phaseInd = 0LL; - if (encoding == UNSIGNED) { - for (int q=0; q>>( - qureg, d_qubits, numQubits, encoding, - d_coeffs, d_exponents, numTerms, - d_overrideInds, d_overridePhases, numOverrides, - conj); - - // cleanup device memory - cudaFree(d_qubits); - cudaFree(d_coeffs); - cudaFree(d_exponents); - cudaFree(d_overrideInds); - cudaFree(d_overridePhases); -} - -__global__ void statevec_applyMultiVarPhaseFuncOverridesKernel( - Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, - qreal* coeffs, qreal* exponents, int* numTermsPerReg, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - long long int *phaseInds, - int conj -) { - long long int index = blockIdx.x*blockDim.x + threadIdx.x; - if (index>=qureg.numAmpsPerChunk) return; - - // determine global amplitude index (non-distributed, so it's just local index) - long long int globalAmpInd = index; - - /* - * each thread needs to write to a local: - * long long int phaseInds[numRegs]; - * but instead has access to shared array phaseInds, with below stride and offset - */ - size_t stride = gridDim.x*blockDim.x; - size_t offset = blockIdx.x*blockDim.x + threadIdx.x; - - // determine phase indices - int flatInd = 0; - if (encoding == 
UNSIGNED) { - for (int r=0; r>>( - qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding, - d_coeffs, d_exponents, d_numTermsPerReg, - d_overrideInds, d_overridePhases, numOverrides, - d_phaseInds, - conj); - - // free device memory - cudaFree(d_qubits); - cudaFree(d_coeffs); - cudaFree(d_exponents); - cudaFree(d_numQubitsPerReg); - cudaFree(d_numTermsPerReg); - cudaFree(d_overrideInds); - cudaFree(d_overridePhases); - cudaFree(d_phaseInds); -} - -__global__ void statevec_applyParamNamedPhaseFuncOverridesKernel( - Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, - enum phaseFunc phaseFuncName, qreal* params, int numParams, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - long long int* phaseInds, - int conj -) { - long long int index = blockIdx.x*blockDim.x + threadIdx.x; - if (index>=qureg.numAmpsPerChunk) return; - - // determine global amplitude index (non-distributed, so it's just local index) - long long int globalAmpInd = index; - - /* - * each thread needs to write to a local: - * long long int phaseInds[numRegs]; - * but instead has access to shared array phaseInds, with below stride and offset - */ - size_t stride = gridDim.x*blockDim.x; - size_t offset = blockIdx.x*blockDim.x + threadIdx.x; - - // determine phase indices - if (encoding == UNSIGNED) { - int flatInd = 0; - for (int r=0; r 0) cudaMalloc(&d_params, mem_params); - - // copy function args into GPU memory - cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice); - cudaMemcpy(d_numQubitsPerReg, numQubitsPerReg, mem_numQubitsPerReg, cudaMemcpyHostToDevice); - cudaMemcpy(d_overrideInds, overrideInds, mem_overrideInds, cudaMemcpyHostToDevice); - cudaMemcpy(d_overridePhases, overridePhases, mem_overridePhases, cudaMemcpyHostToDevice); - if (numParams > 0) - cudaMemcpy(d_params, params, mem_params, cudaMemcpyHostToDevice); - - int threadsPerCUDABlock = 128; - int CUDABlocks = ceil((qreal) qureg.numAmpsPerChunk / threadsPerCUDABlock); - - // allocate thread-local working space {phaseInds} - long long int *d_phaseInds; - size_t gridSize = (size_t) threadsPerCUDABlock * CUDABlocks; - cudaMalloc(&d_phaseInds, numRegs*gridSize * sizeof *d_phaseInds); - - // call kernel - statevec_applyParamNamedPhaseFuncOverridesKernel<<>>( - qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding, - phaseFuncName, d_params, numParams, - d_overrideInds, d_overridePhases, numOverrides, - d_phaseInds, - conj); - - // free device memory - cudaFree(d_qubits); - cudaFree(d_numQubitsPerReg); - cudaFree(d_overrideInds); - cudaFree(d_overridePhases); - cudaFree(d_phaseInds); - if (numParams > 0) - cudaFree(d_params); -} - -__global__ void densmatr_setQuregToPauliHamilKernel( - Qureg qureg, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms -) { - long long int n = blockIdx.x*blockDim.x + threadIdx.x; - if (n>=qureg.numAmpsPerChunk) return; - - // flattened {I,X,Y,Z} matrix elements, where [k] = [p][i][j] - const int pauliRealElems[] = { 1,0, 0,1, 0,1, 1,0, 0,0, 0,0, 1,0, 0,-1 }; - const int pauliImagElems[] = { 0,0, 0,0, 0,0, 0,0, 0,-1,1,0, 0,0, 0,0 }; - - // |n> = |c>|r> - const int numQubits = qureg.numQubitsRepresented; - const long long int r = n & ((1LL << numQubits) - 1); - const long long int c = n >> numQubits; - - // new amplitude of |n> - qreal elemRe = 0; - qreal elemIm = 0; - - for (long long int t=0; t> q) & 1; - int j = (c >> q) & 1; - int p = (int) pauliCodes[pInd++]; - int k = (p<<2) + (i<<1) + j; - int pauliRe = pauliRealElems[k]; - int pauliIm = 
pauliImagElems[k]; - - // kron *= pauli - int tmp = (pauliRe*kronRe) - (pauliIm*kronIm); - kronIm = (pauliRe*kronIm) + (pauliIm*kronRe); - kronRe = tmp; - } - - // elem = sum_t coeffs[t] pauliKronecker[r][c] - elemRe += termCoeffs[t] * kronRe; - elemIm += termCoeffs[t] * kronIm; - } - - // overwrite the density matrix entry - qureg.deviceStateVec.real[n] = elemRe; - qureg.deviceStateVec.imag[n] = elemIm; -} - -void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil) { - - // copy hamil into GPU memory - enum pauliOpType* d_pauliCodes; - size_t mem_pauliCodes = hamil.numSumTerms * hamil.numQubits * sizeof *d_pauliCodes; - cudaMalloc(&d_pauliCodes, mem_pauliCodes); - cudaMemcpy(d_pauliCodes, hamil.pauliCodes, mem_pauliCodes, cudaMemcpyHostToDevice); - - qreal* d_termCoeffs; - size_t mem_termCoeffs = hamil.numSumTerms * sizeof *d_termCoeffs; - cudaMalloc(&d_termCoeffs, mem_termCoeffs); - cudaMemcpy(d_termCoeffs, hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice); - - int numThreadsPerBlock = 128; - int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) numThreadsPerBlock); - densmatr_setQuregToPauliHamilKernel<<>>( - qureg, d_pauliCodes, d_termCoeffs, hamil.numSumTerms); - - // free tmp GPU memory - cudaFree(d_pauliCodes); - cudaFree(d_termCoeffs); -} - -void seedQuEST(QuESTEnv *env, unsigned long int *seedArray, int numSeeds) { - - // free existing seed array, if exists - if (env->seeds != NULL) - free(env->seeds); - - // record keys in permanent heap - env->seeds = (unsigned long int*) malloc(numSeeds * sizeof *(env->seeds)); - for (int i=0; iseeds)[i] = seedArray[i]; - env->numSeeds = numSeeds; - - // pass keys to Mersenne Twister seeder - init_by_array(seedArray, numSeeds); -} - #ifdef __cplusplus diff --git a/QuEST/src/GPU/QuEST_gpu_common.cu b/QuEST/src/GPU/QuEST_gpu_common.cu new file mode 100644 index 000000000..1e0a2ea1c --- /dev/null +++ b/QuEST/src/GPU/QuEST_gpu_common.cu @@ -0,0 +1,1158 @@ +// Distributed under MIT licence. 
See https://github.com/QuEST-Kit/QuEST/blob/master/LICENCE.txt for details + +/** @file + * GPU routines which are agnostic to the cuQuantum backend or the custom kernels + * + * @author Tyson Jones + * @author Ania Brown (defined some original functions moved here) + */ + + +# include "QuEST.h" +# include "QuEST_precision.h" +# include "QuEST_validation.h" +# include "QuEST_internal.h" +# include "mt19937ar.h" + +# include +# include +# include + +#ifdef USE_HIP +// Translate CUDA calls into HIP calls +#include "cuda_to_hip.h" +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + + + +/* + * GPU CONFIG + */ + +const int NUM_THREADS_PER_BLOCK = 128; + +__forceinline__ __device__ long long int getThreadInd() { + return blockIdx.x*blockDim.x + threadIdx.x; +} + + + +/* + * BACKEND AGNOSTIC COMPLEX ARITHMETIC + */ + +#define PROD_REAL(aRe, aIm, bRe, bIm) \ + ((aRe)*(bRe) - (aIm)*(bIm)) +#define PROD_IMAG(aRe, aIm, bRe, bIm) \ + ((aRe)*(bIm) + (aIm)*(bRe)) + +#ifdef USE_CUQUANTUM + #define GET_AMP(aRe, aIm, qureg, i) \ + aRe = qureg.deviceCuStateVec[i].x; \ + aIm = qureg.deviceCuStateVec[i].y; +#else + #define GET_AMP(aRe, aIm, qureg, i) \ + aRe = qureg.deviceStateVec.real[i]; \ + aIm = qureg.deviceStateVec.imag[i]; +#endif + +#ifdef USE_CUQUANTUM + #define SET_AMP(qureg, i, aRe, aIm) \ + qureg.deviceCuStateVec[i].x = aRe; \ + qureg.deviceCuStateVec[i].y = aIm; +#else + #define SET_AMP(qureg, i, aRe, aIm) \ + qureg.deviceStateVec.real[i] = aRe; \ + qureg.deviceStateVec.imag[i] = aIm; +#endif + + + +/* + * ENVIRONMENT MANAGEMENT + */ + +int GPUExists(void) { + int deviceCount, device; + int gpuDeviceCount = 0; + struct cudaDeviceProp properties; + cudaError_t cudaResultCode = cudaGetDeviceCount(&deviceCount); + if (cudaResultCode != cudaSuccess) deviceCount = 0; + /* machines with no GPUs can still report one emulation device */ + for (device = 0; device < deviceCount; ++device) { + cudaGetDeviceProperties(&properties, device); + if (properties.major != 9999) { /* 9999 means emulation only */ + ++gpuDeviceCount; + } + } + if (gpuDeviceCount) return 1; + else return 0; +} + +void syncQuESTEnv(QuESTEnv env){ + cudaDeviceSynchronize(); +} + +int syncQuESTSuccess(int successCode){ + return successCode; +} + +void reportQuESTEnv(QuESTEnv env){ + printf("EXECUTION ENVIRONMENT:\n"); + printf("Running locally on one node with GPU "); +# ifdef USE_CUQUANTUM + printf("via cuQuantum\n"); +# else + printf("via custom kernels\n"); +# endif + printf("Number of ranks is %d\n", env.numRanks); +# ifdef _OPENMP + printf("OpenMP enabled\n"); + printf("Number of threads available is %d\n", omp_get_max_threads()); +# else + printf("OpenMP disabled\n"); +# endif +} + +void getEnvironmentString(QuESTEnv env, char str[200]){ + + // OpenMP can be hybridised with GPU in future, so this check is safe and worthwhile + int ompStatus=0; + int numThreads=1; +# ifdef _OPENMP + ompStatus=1; + numThreads=omp_get_max_threads(); +# endif + + // ascertain whether we're using cuQuantum or bespoke kernels + int cuQuantumStatus=0; +# ifdef USE_CUQUANTUM + cuQuantumStatus=1; +# endif + + // there is no reporting of CUDA cores/threads/blocks currently (since non-trivial) + sprintf(str, "CUDA=1 cuQuantum=%d OpenMP=%d MPI=0 threads=%d ranks=1", cuQuantumStatus, ompStatus, numThreads); +} + +void seedQuEST(QuESTEnv *env, unsigned long int *seedArray, int numSeeds) { + + // free existing seed array, if exists + if (env->seeds != NULL) + free(env->seeds); + + // record keys in permanent heap + env->seeds = (unsigned long int*) 
malloc(numSeeds * sizeof *(env->seeds)); + for (int i=0; iseeds)[i] = seedArray[i]; + env->numSeeds = numSeeds; + + // pass keys to Mersenne Twister seeder + init_by_array(seedArray, numSeeds); +} + + + +/* + * DEBUG + */ + +void statevec_reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank){ + + if (qureg.numQubitsInStateVec > 5) { + printf("State reporting disabled for >5 qubits"); + return; + } + + copyStateFromGPU(qureg); + + // distributed GPU not yet supported + if (reportRank != 0) + return; + + printf("Reporting state from rank %d [\n", qureg.chunkId); + printf("real, imag\n"); + + for(long long int i=0; i= pure.numAmpsPerChunk) return; + + qreal aRe, aIm; + GET_AMP( aRe, aIm, pure, pureInd ); + + for (long long int col=0; col>>(dens, pure); +} + +__global__ void densmatr_setQuregToPauliHamilKernel( + Qureg qureg, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms +) { + long long int n = getThreadInd(); + if (n>=qureg.numAmpsPerChunk) return; + + // flattened {I,X,Y,Z} matrix elements, where [k] = [p][i][j] + const int pauliRealElems[] = { 1,0, 0,1, 0,1, 1,0, 0,0, 0,0, 1,0, 0,-1 }; + const int pauliImagElems[] = { 0,0, 0,0, 0,0, 0,0, 0,-1,1,0, 0,0, 0,0 }; + + // |n> = |c>|r> + const int numQubits = qureg.numQubitsRepresented; + const long long int r = n & ((1LL << numQubits) - 1); + const long long int c = n >> numQubits; + + // new amplitude of |n> + qreal elemRe = 0; + qreal elemIm = 0; + + for (long long int t=0; t> q) & 1; + int j = (c >> q) & 1; + int p = (int) pauliCodes[pInd++]; + int k = (p<<2) + (i<<1) + j; + int pauliRe = pauliRealElems[k]; + int pauliIm = pauliImagElems[k]; + + // kron *= pauli + int tmp = (pauliRe*kronRe) - (pauliIm*kronIm); + kronIm = (pauliRe*kronIm) + (pauliIm*kronRe); + kronRe = tmp; + } + + // elem = sum_t coeffs[t] pauliKronecker[r][c] + elemRe += termCoeffs[t] * kronRe; + elemIm += termCoeffs[t] * kronIm; + } + + // overwrite the density matrix entry + SET_AMP( qureg, n, elemRe, elemIm ); +} + +void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil) { + + // copy hamil into GPU memory + enum pauliOpType* d_pauliCodes; + size_t mem_pauliCodes = hamil.numSumTerms * hamil.numQubits * sizeof *d_pauliCodes; + cudaMalloc(&d_pauliCodes, mem_pauliCodes); + cudaMemcpy(d_pauliCodes, hamil.pauliCodes, mem_pauliCodes, cudaMemcpyHostToDevice); + + qreal* d_termCoeffs; + size_t mem_termCoeffs = hamil.numSumTerms * sizeof *d_termCoeffs; + cudaMalloc(&d_termCoeffs, mem_termCoeffs); + cudaMemcpy(d_termCoeffs, hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice); + + int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) NUM_THREADS_PER_BLOCK); + densmatr_setQuregToPauliHamilKernel<<>>( + qureg, d_pauliCodes, d_termCoeffs, hamil.numSumTerms); + + // free tmp GPU memory + cudaFree(d_pauliCodes); + cudaFree(d_termCoeffs); +} + + + +/* + * CALCULATIONS + */ + +// atomicAdd on floats/doubles isn't available on <6 CC devices, so we add it ourselves +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 +#else +static __inline__ __device__ double atomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*) address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __longlong_as_double(old); +} +#endif + +__global__ void 
statevec_calcProbOfAllOutcomesKernel( + qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits +) { + // each thread handles one amplitude (all amplitudes are involved) + long long int ampInd = getThreadInd(); + if (ampInd >= qureg.numAmpsTotal) return; + + qreal ampRe, ampIm; + GET_AMP( ampRe, ampIm, qureg, ampInd ); + qreal prob = ampRe*ampRe + ampIm*ampIm; + + // each amplitude contributes to one outcome + long long int outcomeInd = 0; + for (int q=0; q> qubits[q]) & 1) << q; + + // each thread atomically writes directly to the global output. + // this beat block-heirarchal atomic reductions in both global and shared memory! + atomicAdd(&outcomeProbs[outcomeInd], prob); +} + +void statevec_calcProbOfAllOutcomes(qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits) { + + // copy qubits to GPU memory + int* d_qubits; + size_t mem_qubits = numQubits * sizeof *d_qubits; + cudaMalloc(&d_qubits, mem_qubits); + cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice); + + // create one thread for every amplitude + int numThreadsPerBlock = 128; + int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) numThreadsPerBlock); + + // create global GPU array for outcomeProbs + qreal* d_outcomeProbs; + long long int numOutcomes = (1LL << numQubits); + size_t mem_outcomeProbs = numOutcomes * sizeof *d_outcomeProbs; + cudaMalloc(&d_outcomeProbs, mem_outcomeProbs); + cudaMemset(d_outcomeProbs, 0, mem_outcomeProbs); + + // populate per-block subarrays + statevec_calcProbOfAllOutcomesKernel<<>>( + d_outcomeProbs, qureg, d_qubits, numQubits); + + // copy outcomeProbs from GPU memory + cudaMemcpy(outcomeProbs, d_outcomeProbs, mem_outcomeProbs, cudaMemcpyDeviceToHost); + + // free GPU memory + cudaFree(d_qubits); + cudaFree(d_outcomeProbs); +} + +__global__ void densmatr_calcProbOfAllOutcomesKernel( + qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits +) { + // each thread handles one diagonal amplitude + long long int diagInd = getThreadInd(); + long long int numDiags = (1LL << qureg.numQubitsRepresented); + if (diagInd >= numDiags) return; + + long long int flatInd = (1 + numDiags)*diagInd; + + qreal ampRe, ampIm; + GET_AMP( ampRe, ampIm, qureg, flatInd ); + qreal prob = ampRe + 0*ampIm; // ampIm assumed ~ 0, silence unused warning + + // each diagonal amplitude contributes to one outcome + long long int outcomeInd = 0; + for (int q=0; q> qubits[q]) & 1) << q; + + // each thread atomically writes directly to the global output. + // this beat block-heirarchal atomic reductions in both global and shared memory! 
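+    // (when qreal is double on devices of compute capability < 6.0, this atomicAdd
+    //  resolves to the CAS-based fallback defined earlier in this file)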
+ atomicAdd(&outcomeProbs[outcomeInd], prob); +} + +void densmatr_calcProbOfAllOutcomes(qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits) { + + // copy qubits to GPU memory + int* d_qubits; + size_t mem_qubits = numQubits * sizeof *d_qubits; + cudaMalloc(&d_qubits, mem_qubits); + cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice); + + // create global array, with per-block subarrays + int numThreadsPerBlock = 128; + int numDiags = (1LL << qureg.numQubitsRepresented); + int numBlocks = ceil(numDiags / (qreal) numThreadsPerBlock); + + // create global GPU array for outcomeProbs + qreal* d_outcomeProbs; + long long int numOutcomes = (1LL << numQubits); + size_t mem_outcomeProbs = numOutcomes * sizeof *d_outcomeProbs; + cudaMalloc(&d_outcomeProbs, mem_outcomeProbs); + cudaMemset(d_outcomeProbs, 0, mem_outcomeProbs); + + // populate per-block subarrays + densmatr_calcProbOfAllOutcomesKernel<<>>( + d_outcomeProbs, qureg, d_qubits, numQubits); + + // copy outcomeProbs from GPU memory + cudaMemcpy(outcomeProbs, d_outcomeProbs, mem_outcomeProbs, cudaMemcpyDeviceToHost); + + // free GPU memory + cudaFree(d_qubits); + cudaFree(d_outcomeProbs); +} + + + +/* + * DECOHERENCE + */ + +__global__ void densmatr_mixTwoQubitDepolarisingKernel( + Qureg qureg, qreal depolLevel, long long int part1, long long int part2, + long long int part3, long long int part4, long long int part5, + long long int rowCol1, long long int rowCol2) +{ + long long int scanInd = getThreadInd(); + if (scanInd >= qureg.numAmpsPerChunk) return; + + long long int ind00 = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2) + ((scanInd&part4)<<3) + ((scanInd&part5)<<4); + long long int ind01 = ind00 + rowCol1; + long long int ind10 = ind00 + rowCol2; + long long int ind11 = ind00 + rowCol1 + rowCol2; + + qreal re00, re01, re10, re11; + qreal im00, im01, im10, im11; + GET_AMP( re00, im00, qureg, ind00 ); + GET_AMP( re01, im01, qureg, ind01 ); + GET_AMP( re10, im10, qureg, ind10 ); + GET_AMP( re11, im11, qureg, ind11 ); + + qreal realAvDepol = depolLevel * 0.25 * (re00 + re01 + re10 + re11); + qreal imagAvDepol = depolLevel * 0.25 * (im00 + im01 + im10 + im11); + qreal retain = 1 - depolLevel; + + SET_AMP( qureg, ind00, retain*re00 + realAvDepol, retain*im00 + imagAvDepol ); + SET_AMP( qureg, ind01, retain*re01 + realAvDepol, retain*im01 + imagAvDepol ); + SET_AMP( qureg, ind10, retain*re10 + realAvDepol, retain*im10 + imagAvDepol ); + SET_AMP( qureg, ind11, retain*re11 + realAvDepol, retain*im11 + imagAvDepol ); +} + +void densmatr_mixTwoQubitDepolarising(Qureg qureg, int qubit1, int qubit2, qreal depolLevel) { + + if (depolLevel == 0) + return; + + densmatr_mixTwoQubitDephasing(qureg, qubit1, qubit2, depolLevel); + + // this code is painfully and unnecessarily verbose; it computes index offsets + // using bitwise logic, only to (within the kernel) effect those offsets with + // integer arithmetic. Instead, one might as well obtain the ultimate indices + // directly using bitwise logic within the kernel, passing no offsets at all. + // We defer this to when the insertBits (and other bit twiddles) have been moved + // into QuEST_gpu_common.cu. 
+ + int rowQubit1 = qubit1 + qureg.numQubitsRepresented; + int rowQubit2 = qubit2 + qureg.numQubitsRepresented; + + long long int colBit1 = 1LL << qubit1; + long long int rowBit1 = 1LL << rowQubit1; + long long int colBit2 = 1LL << qubit2; + long long int rowBit2 = 1LL << rowQubit2; + + long long int rowCol1 = colBit1 | rowBit1; + long long int rowCol2 = colBit2 | rowBit2; + + long long int numAmpsToVisit = qureg.numAmpsPerChunk/16; + long long int part1 = colBit1 - 1; + long long int part2 = (colBit2 >> 1) - colBit1; + long long int part3 = (rowBit1 >> 2) - (colBit2 >> 1); + long long int part4 = (rowBit2 >> 3) - (rowBit1 >> 2); + long long int part5 = numAmpsToVisit - (rowBit2 >> 3); + + int CUDABlocks = ceil(numAmpsToVisit / (qreal) NUM_THREADS_PER_BLOCK); + densmatr_mixTwoQubitDepolarisingKernel<<>>( + qureg, depolLevel, part1, part2, part3, part4, part5, rowCol1, rowCol2); +} + + + +/* + * MANAGING NON-QUREG TYPES + */ + +DiagonalOp agnostic_createDiagonalOp(int numQubits, QuESTEnv env) { + + DiagonalOp op; + op.numQubits = numQubits; + op.numElemsPerChunk = (1LL << numQubits) / env.numRanks; + op.chunkId = env.rank; + op.numChunks = env.numRanks; + + // allocate CPU memory (initialised to zero) + op.real = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal)); + op.imag = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal)); + + // check cpu memory allocation was successful + validateDiagonalOpAllocation(&op, env, __func__); + + // allocate GPU memory + size_t arrSize = op.numElemsPerChunk * sizeof(qreal); + cudaMalloc(&(op.deviceOperator.real), arrSize); + cudaMalloc(&(op.deviceOperator.imag), arrSize); + + // check gpu memory allocation was successful + validateDiagonalOpGPUAllocation(&op, env, __func__); + + // initialise GPU memory to zero + cudaMemset(op.deviceOperator.real, 0, arrSize); + cudaMemset(op.deviceOperator.imag, 0, arrSize); + + return op; +} + +void agnostic_destroyDiagonalOp(DiagonalOp op) { + free(op.real); + free(op.imag); + cudaFree(op.deviceOperator.real); + cudaFree(op.deviceOperator.imag); +} + +void agnostic_syncDiagonalOp(DiagonalOp op) { + cudaDeviceSynchronize(); + size_t mem_elems = op.numElemsPerChunk * sizeof *op.real; + cudaMemcpy(op.deviceOperator.real, op.real, mem_elems, cudaMemcpyHostToDevice); + cudaMemcpy(op.deviceOperator.imag, op.imag, mem_elems, cudaMemcpyHostToDevice); +} + +__global__ void agnostic_initDiagonalOpFromPauliHamilKernel( + DiagonalOp op, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms +) { + // each thread processes one diagonal element + long long int elemInd = getThreadInd(); + if (elemInd >= op.numElemsPerChunk) + return; + + qreal elem = 0; + + // elem is (+-) every coefficient, with sign determined by parity + for (int t=0; t> q) & 1) + isOddNumOnes = !isOddNumOnes; + + // avoid warp divergence + int sign = 1 - 2*isOddNumOnes; // (-1 if isOddNumOnes, else +1) + elem += termCoeffs[t] * sign; + } + + op.deviceOperator.real[elemInd] = elem; + op.deviceOperator.imag[elemInd] = 0; +} + +void agnostic_initDiagonalOpFromPauliHamil(DiagonalOp op, PauliHamil hamil) { + + // copy args intop GPU memory + enum pauliOpType* d_pauliCodes; + size_t mem_pauliCodes = hamil.numSumTerms * op.numQubits * sizeof *d_pauliCodes; + cudaMalloc(&d_pauliCodes, mem_pauliCodes); + cudaMemcpy(d_pauliCodes, hamil.pauliCodes, mem_pauliCodes, cudaMemcpyHostToDevice); + + qreal* d_termCoeffs; + size_t mem_termCoeffs = hamil.numSumTerms * sizeof *d_termCoeffs; + cudaMalloc(&d_termCoeffs, mem_termCoeffs); + cudaMemcpy(d_termCoeffs, 
hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice); + + int numBlocks = ceil(op.numElemsPerChunk / (qreal) NUM_THREADS_PER_BLOCK); + agnostic_initDiagonalOpFromPauliHamilKernel<<>>( + op, d_pauliCodes, d_termCoeffs, hamil.numSumTerms); + + // copy populated operator into to RAM + cudaDeviceSynchronize(); + size_t mem_elems = op.numElemsPerChunk * sizeof *op.real; + cudaMemcpy(op.real, op.deviceOperator.real, mem_elems, cudaMemcpyDeviceToHost); + cudaMemcpy(op.imag, op.deviceOperator.imag, mem_elems, cudaMemcpyDeviceToHost); + + cudaFree(d_pauliCodes); + cudaFree(d_termCoeffs); +} + +void agnostic_setDiagonalOpElems(DiagonalOp op, long long int startInd, qreal* real, qreal* imag, long long int numElems) { + + // update both RAM and VRAM, for consistency + memcpy(&op.real[startInd], real, numElems * sizeof(qreal)); + memcpy(&op.imag[startInd], imag, numElems * sizeof(qreal)); + + cudaDeviceSynchronize(); + cudaMemcpy( + op.deviceOperator.real + startInd, + real, + numElems * sizeof(*(op.deviceOperator.real)), + cudaMemcpyHostToDevice); + cudaMemcpy( + op.deviceOperator.imag + startInd, + imag, + numElems * sizeof(*(op.deviceOperator.imag)), + cudaMemcpyHostToDevice); +} + + + +/* + * PHASE FUNCTIONS + */ + +__forceinline__ __device__ int getBit(long long int num, int ind) { + return (num >> ind) & 1; +} + +__forceinline__ __device__ void applyPhaseToAmp(Qureg qureg, long long int index, qreal phase, int conj) +{ + // negate phase to conjugate operator + phase *= (1 - 2*conj); + qreal c = cos(phase); + qreal s = sin(phase); + + // modify amp to amp * exp(i phase) + qreal re, im; + GET_AMP( re, im, qureg, index ); + SET_AMP( qureg, index, re*c - im*s, re*s + im*c ); +} + +__forceinline__ __device__ void getMultiRegPhaseIndOffsets(size_t* stride, size_t* offset) +{ + // determine the phaseInds tuple relevant for this thread + *stride = gridDim.x*blockDim.x; + *offset = blockIdx.x*blockDim.x + threadIdx.x; +} + +__forceinline__ __device__ long long int getPhaseInd(long long int globalAmpInd, int* qubits, int numQubits, enum bitEncoding encoding) +{ + long long int phaseInd = 0LL; + + if (encoding == UNSIGNED) { + for (int q=0; q=qureg.numAmpsPerChunk) return; + + // determine global amplitude index (non-distributed, so it's just local index) + long long int globalAmpInd = index; + + // determine phaseInd from the qubit encoding + long long int phaseInd = getPhaseInd(globalAmpInd, qubits, numQubits, encoding); + + // determine if this phase index has an overriden value (i < numOverrides) + int overInd = getIndOfPhaseOverride(phaseInd, overrideInds, numOverrides); + + // determine phase from {coeffs}, {exponents} (unless overriden) + qreal phase = 0; + if (overInd < numOverrides) + phase = overridePhases[overInd]; + else + for (int t=0; t>>( + qureg, d_qubits, numQubits, encoding, + d_coeffs, d_exponents, numTerms, + d_overrideInds, d_overridePhases, numOverrides, + conj); + + // cleanup device memory + cudaFree(d_qubits); + cudaFree(d_coeffs); + cudaFree(d_exponents); + cudaFree(d_overrideInds); + cudaFree(d_overridePhases); +} + +__global__ void statevec_applyMultiVarPhaseFuncOverridesKernel( + Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, + qreal* coeffs, qreal* exponents, int* numTermsPerReg, + long long int* overrideInds, qreal* overridePhases, int numOverrides, + long long int *phaseInds, + int conj +) { + long long int index = getThreadInd(); + if (index>=qureg.numAmpsPerChunk) return; + + // determine global amplitude index (non-distributed, 
so it's just local index) + long long int globalAmpInd = index; + + // determine phase indices (each thread has phaseInds[numRegs] sub-array) + setMultiRegPhaseInds(phaseInds, globalAmpInd, qubits, numQubitsPerReg, numRegs, encoding); + + // determine if this phase index has an overriden value + long long int overInd = getIndOfMultiRegPhaseOverride(phaseInds, numRegs, overrideInds, numOverrides); + + // either use the overriden phase... + qreal phase = 0; + if (overInd < numOverrides) + phase = overridePhases[overInd]; + + else { + // else determine the phaseInds tuple relevant for this thread + size_t stride, offset; + getMultiRegPhaseIndOffsets(&stride, &offset); + + // and compute the phase from coeffs and exponents + long long int flatInd = 0; + for (int r=0; r>>( + qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding, + d_coeffs, d_exponents, d_numTermsPerReg, + d_overrideInds, d_overridePhases, numOverrides, + d_phaseInds, + conj); + + // free device memory + cudaFree(d_qubits); + cudaFree(d_coeffs); + cudaFree(d_exponents); + cudaFree(d_numQubitsPerReg); + cudaFree(d_numTermsPerReg); + cudaFree(d_overrideInds); + cudaFree(d_overridePhases); + cudaFree(d_phaseInds); +} + +__global__ void statevec_applyParamNamedPhaseFuncOverridesKernel( + Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, + enum phaseFunc phaseFuncName, qreal* params, int numParams, + long long int* overrideInds, qreal* overridePhases, int numOverrides, + long long int* phaseInds, + int conj +) { + long long int index = getThreadInd(); + if (index>=qureg.numAmpsPerChunk) return; + + // determine global amplitude index (non-distributed, so it's just local index) + long long int globalAmpInd = index; + + // determine phase indices (each thread has phaseInds[numRegs] sub-array) + setMultiRegPhaseInds(phaseInds, globalAmpInd, qubits, numQubitsPerReg, numRegs, encoding); + + // determine if this phase index has an overriden value + long long int overInd = getIndOfMultiRegPhaseOverride(phaseInds, numRegs, overrideInds, numOverrides); + + // determine the phase, or the overriden one + qreal phase = 0; + if (overInd < numOverrides) + phase = overridePhases[overInd]; + else + phase = getPhaseFromParamNamedFunc(phaseInds, numRegs, phaseFuncName, params, numParams); + + // modify amp to amp * exp(i phase) + applyPhaseToAmp(qureg, index, phase, conj); +} + +void statevec_applyParamNamedPhaseFuncOverrides( + Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, + enum phaseFunc phaseFuncName, qreal* params, int numParams, + long long int* overrideInds, qreal* overridePhases, int numOverrides, + int conj +) { + // determine size of arrays, for cloning into GPU memory + size_t mem_numQubitsPerReg = numRegs * sizeof *numQubitsPerReg; + size_t mem_overridePhases = numOverrides * sizeof *overridePhases; + size_t mem_overrideInds = numOverrides * numRegs * sizeof *overrideInds; + size_t mem_params = numParams * sizeof *params; + size_t mem_qubits = 0; + for (int r=0; r 0) cudaMalloc(&d_params, mem_params); + + // copy function args into GPU memory + cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice); + cudaMemcpy(d_numQubitsPerReg, numQubitsPerReg, mem_numQubitsPerReg, cudaMemcpyHostToDevice); + cudaMemcpy(d_overrideInds, overrideInds, mem_overrideInds, cudaMemcpyHostToDevice); + cudaMemcpy(d_overridePhases, overridePhases, mem_overridePhases, cudaMemcpyHostToDevice); + if (numParams > 0) + cudaMemcpy(d_params, params, mem_params, 
cudaMemcpyHostToDevice); + + int threadsPerCUDABlock = 128; + int CUDABlocks = ceil((qreal) qureg.numAmpsPerChunk / threadsPerCUDABlock); + + // allocate thread-local working space {phaseInds} + long long int *d_phaseInds; + size_t gridSize = (size_t) threadsPerCUDABlock * CUDABlocks; + cudaMalloc(&d_phaseInds, numRegs*gridSize * sizeof *d_phaseInds); + + // call kernel + statevec_applyParamNamedPhaseFuncOverridesKernel<<>>( + qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding, + phaseFuncName, d_params, numParams, + d_overrideInds, d_overridePhases, numOverrides, + d_phaseInds, + conj); + + // free device memory + cudaFree(d_qubits); + cudaFree(d_numQubitsPerReg); + cudaFree(d_overrideInds); + cudaFree(d_overridePhases); + cudaFree(d_phaseInds); + if (numParams > 0) + cudaFree(d_params); +} + + + +#ifdef __cplusplus +} +#endif diff --git a/QuEST/src/GPU/QuEST_gpu_common.h b/QuEST/src/GPU/QuEST_gpu_common.h new file mode 100644 index 000000000..cf6998cb4 --- /dev/null +++ b/QuEST/src/GPU/QuEST_gpu_common.h @@ -0,0 +1,27 @@ +// Distributed under MIT licence. See https://github.com/QuEST-Kit/QuEST/blob/master/LICENCE.txt for details + +/** @file + * GPU backend routines which are invoked by both the cuQuantum and bespoke-kernel backends. + * This excludes backend-agnostic API implementations which do not need a header declaration. + * + * @author Tyson Jones + */ + +# ifndef QUEST_GPU_COMMON_H +# define QUEST_GPU_COMMON_H + +# ifdef __cplusplus +extern "C" { +# endif + + + +int GPUExists(void); + + + +#ifdef __cplusplus +} +#endif + +# endif // QUEST_GPU_COMMON_H \ No newline at end of file diff --git a/QuEST/src/QuEST.c b/QuEST/src/QuEST.c index aae1b433c..ff18e5efb 100644 --- a/QuEST/src/QuEST.c +++ b/QuEST/src/QuEST.c @@ -1715,27 +1715,10 @@ void destroySubDiagonalOp(SubDiagonalOp op) { * debug */ -int compareStates(Qureg qureg1, Qureg qureg2, qreal precision) { - validateMatchingQuregDims(qureg1, qureg2, __func__); - return statevec_compareStates(qureg1, qureg2, precision); -} - void initDebugState(Qureg qureg) { statevec_initDebugState(qureg); } -void initStateFromSingleFile(Qureg *qureg, char filename[200], QuESTEnv env) { - int success = statevec_initStateFromSingleFile(qureg, filename, env); - validateFileOpened(success, filename, __func__); -} - -void initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome) { - validateStateVecQureg(*qureg, __func__); - validateTarget(*qureg, qubitId, __func__); - validateOutcome(outcome, __func__); - statevec_initStateOfSingleQubit(qureg, qubitId, outcome); -} - void reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank) { statevec_reportStateToScreen(qureg, env, reportRank); } diff --git a/QuEST/src/QuEST_debug.h b/QuEST/src/QuEST_debug.h deleted file mode 100755 index e3861f55e..000000000 --- a/QuEST/src/QuEST_debug.h +++ /dev/null @@ -1,61 +0,0 @@ -// Distributed under MIT licence. See https://github.com/QuEST-Kit/QuEST/blob/master/LICENCE.txt for details - -/** @file - * Developer functions used for unit testing and debugging, which are not part of the public API. - * May contain functions that are incomplete or untested. - * - * @author Ania Brown - */ - -# ifndef QUEST_DEBUG_H -# define QUEST_DEBUG_H - -# include "QuEST_precision.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * Initialise the state vector of probability amplitudes such that one qubit is set to 'outcome' and all other qubits are in an equal superposition of zero and one. 
- * @param[in,out] qureg object representing the set of qubits to be initialised - * @param[in] qubitId id of qubit to set to state 'outcome' - * @param[in] outcome value of qubit 'qubitId' to set - */ -void initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome); - -/** - * Initialise the state vector of probability amplitudes to an (unphysical) state with - * each component of each probability amplitude a unique floating point value. For debugging processes - * @param[in,out] qureg object representing the set of qubits to be initialised - */ -void initStateDebug(Qureg qureg); - -/** Initialises the wavefunction amplitudes according to those specified in a file. - * For debugging purpsoses - */ -void initStateFromSingleFile(Qureg *qureg, char filename[200], QuESTEnv env); - -/** Return whether two given wavefunctions are equivalent within a given precision - * Global phase included in equivalence check. For debugging purposes. - */ -int compareStates(Qureg mq1, Qureg mq2, qreal precision); - -/** Set elements in the underlying state vector represenation of a density matrix. Not exposed in the public - * API as this requires an understanding of how the state vector is used to represent a density matrix. - * Currently can only be used to set all amps. - */ -void setDensityAmps(Qureg qureg, qreal* reals, qreal* imags); - - - -/** Return the precision of qreal for use in testing - * */ - -int QuESTPrecision(void); - -#ifdef __cplusplus -} -#endif - -# endif // QUEST_DEBUG_H diff --git a/QuEST/src/QuEST_internal.h b/QuEST/src/QuEST_internal.h index 48c2dc742..4fbf99762 100644 --- a/QuEST/src/QuEST_internal.h +++ b/QuEST/src/QuEST_internal.h @@ -119,12 +119,6 @@ void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil); void statevec_reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank); -int statevec_compareStates(Qureg mq1, Qureg mq2, qreal precision); - -int statevec_initStateFromSingleFile(Qureg *qureg, char filename[200], QuESTEnv env); - -void statevec_initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome); - void statevec_createQureg(Qureg *qureg, int numQubits, QuESTEnv env); void statevec_destroyQureg(Qureg qureg, QuESTEnv env); diff --git a/QuEST/src/QuEST_validation.c b/QuEST/src/QuEST_validation.c index 166b8ba6b..b62732e18 100644 --- a/QuEST/src/QuEST_validation.c +++ b/QuEST/src/QuEST_validation.c @@ -120,6 +120,7 @@ typedef enum { E_DIAGONAL_OP_NOT_ALLOCATED, E_DIAGONAL_OP_NOT_ALLOCATED_ON_GPU, E_NO_GPU, + E_GPU_DOES_NOT_SUPPORT_MEM_POOLS, E_QASM_BUFFER_OVERFLOW } ErrorCode; @@ -213,6 +214,7 @@ static const char* errorMessages[] = { [E_DIAGONAL_OP_NOT_ALLOCATED] = "Could not allocate memory for DiagonalOp. Possibly insufficient memory.", [E_DIAGONAL_OP_NOT_ALLOCATED_ON_GPU] = "Could not allocate memory for DiagonalOp on GPU. Possibly insufficient memory.", [E_NO_GPU] = "Trying to run GPU code with no GPU available.", + [E_GPU_DOES_NOT_SUPPORT_MEM_POOLS] = "The GPU does not support stream-ordered memory pools, required by the cuQuantum backend.", [E_QASM_BUFFER_OVERFLOW] = "QASM line buffer filled." 
}; @@ -255,10 +257,10 @@ int isComplexPairUnitary(Complex alpha, Complex beta) { + beta.imag*beta.imag) < REAL_EPS ); } -#define macro_isMatrixUnitary(m, dim, retVal) { \ - /* elemRe_ and elemIm_ must not exist in caller scope */ \ +#define macro_isMatrixUnitary(m, dim) { \ + /* REAL_EPS_SQUARED_, elemRe_, elemIm_, reDif_ and distSq_ must not exist in caller scope */ \ + qreal REAL_EPS_SQUARED_ = REAL_EPS * REAL_EPS; \ qreal elemRe_, elemIm_; \ - retVal = 1; \ /* check m * ConjugateTranspose(m) == Identity */ \ for (int r=0; r < (dim); r++) { \ for (int c=0; c < (dim); c++) { \ @@ -270,40 +272,33 @@ int isComplexPairUnitary(Complex alpha, Complex beta) { elemRe_ += m.real[r][i]*m.real[c][i] + m.imag[r][i]*m.imag[c][i]; \ elemIm_ += m.imag[r][i]*m.real[c][i] - m.real[r][i]*m.imag[c][i]; \ } \ - /* check distance from identity */ \ - if ((absReal(elemIm_) > REAL_EPS) || \ - (r == c && absReal(elemRe_ - 1) > REAL_EPS) || \ - (r != c && absReal(elemRe_ ) > REAL_EPS)) { \ - retVal = 0; \ - break; \ - } \ + /* assert elem has Euclidean distance < REAL_EPS from Identity elem */ \ + qreal reDif_ = elemRe_ - (r==c); \ + qreal distSq_ = (elemIm_*elemIm_) + (reDif_*reDif_); \ + if (distSq_ > REAL_EPS_SQUARED_) \ + return 0; \ } \ - if (retVal == 0) \ - break; \ } \ + return 1; \ } int isMatrix2Unitary(ComplexMatrix2 u) { - int dim = 2; - int retVal; - macro_isMatrixUnitary(u, dim, retVal); - return retVal; + macro_isMatrixUnitary(u, 2); } int isMatrix4Unitary(ComplexMatrix4 u) { - int dim = 4; - int retVal; - macro_isMatrixUnitary(u, dim, retVal); - return retVal; + macro_isMatrixUnitary(u, 4); } int isMatrixNUnitary(ComplexMatrixN u) { int dim = 1 << u.numQubits; - int retVal; - macro_isMatrixUnitary(u, dim, retVal); - return retVal; + macro_isMatrixUnitary(u, dim); } #define macro_isCompletelyPositiveMap(ops, numOps, opDim) { \ + /* REAL_EPS_SQUARED_, elemRe_, elemIm_, reDif_ and distSq_ must not exist in caller scope */ \ + qreal REAL_EPS_SQUARED_ = REAL_EPS * REAL_EPS; \ + /* check sum_op (ConjugateTranspose(op) * op) == Identity */ \ for (int r=0; r<(opDim); r++) { \ for (int c=0; c<(opDim); c++) { \ + /* compute (r,c)-th elem of sum_op (...) */ \ qreal elemRe_ = 0; \ qreal elemIm_ = 0; \ for (int n=0; n<(numOps); n++) { \ @@ -312,8 +307,10 @@ int isMatrixNUnitary(ComplexMatrixN u) { elemIm_ += ops[n].real[k][r]*ops[n].imag[k][c] - ops[n].imag[k][r]*ops[n].real[k][c]; \ } \ } \ - qreal dist_ = absReal(elemIm_) + absReal(elemRe_ - ((r==c)? 1:0)); \ - if (dist_ > REAL_EPS) \ + /* assert elem has Euclidean distance < REAL_EPS from Identity elem */ \ + qreal reDif_ = elemRe_ - (r==c); \ + qreal distSq_ = (elemIm_*elemIm_) + (reDif_*reDif_); \ + if (distSq_ > REAL_EPS_SQUARED_) \ return 0; \ } \ } \ @@ -1109,14 +1106,20 @@ void validateDiagonalOpGPUAllocation(DiagonalOp* op, QuESTEnv env, const char* c QuESTAssert(allocationSuccessful, E_DIAGONAL_OP_NOT_ALLOCATED_ON_GPU, caller); } -// This is really just a dummy shim, because the scope of GPUExists() -// is limited to the QuEST_gpu.cu file. +void raiseQASMBufferOverflow(const char* caller) { + invalidQuESTInputError(errorMessages[E_QASM_BUFFER_OVERFLOW], caller); +} + + +/* The below functions are dummy shims, whereby the GPU has already determined the outcome of + * validation, though sends the boolean results here for error handling. 
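+ * (e.g. validateGPUExists() receives the result of GPUExists(), declared in QuEST_gpu_common.h)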
+ */ void validateGPUExists(int GPUPresent, const char* caller) { QuESTAssert(GPUPresent, E_NO_GPU, caller); } -void raiseQASMBufferOverflow(const char* caller) { - invalidQuESTInputError(errorMessages[E_QASM_BUFFER_OVERFLOW], caller); +void validateGPUIsCuQuantumCompatible(int supportsMemPools, const char* caller) { + QuESTAssert(supportsMemPools, E_GPU_DOES_NOT_SUPPORT_MEM_POOLS, caller); } diff --git a/QuEST/src/QuEST_validation.h b/QuEST/src/QuEST_validation.h index eb26a48eb..6df09f93b 100644 --- a/QuEST/src/QuEST_validation.h +++ b/QuEST/src/QuEST_validation.h @@ -174,10 +174,10 @@ void validateDiagonalOpAllocation(DiagonalOp* op, QuESTEnv env, const char* call void validateDiagonalOpGPUAllocation(DiagonalOp* op, QuESTEnv env, const char* caller); -// This is really just a dummy shim, because the scope of GPUExists() -// is limited to the QuEST_gpu.cu file. void validateGPUExists(int GPUPresent, const char* caller); +void validateGPUIsCuQuantumCompatible(int supportsMemPools, const char* caller); + void raiseQASMBufferOverflow(const char* caller); # ifdef __cplusplus diff --git a/README.md b/README.md index 9b8da9068..c3ae29a29 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,10 @@ [![MIT license](https://img.shields.io/badge/license-MIT-lightgrey.svg)](LICENCE.txt) -The **Quantum Exact Simulation Toolkit** is a high performance simulator of quantum circuits, state-vectors and density matrices. QuEST uses **multithreading**, **GPU acceleration** and **distribution** to run lightning first on laptops, desktops and networked supercomputers. QuEST *just works*; it is stand-alone, requires no installation, and trivial to compile and getting running. +The **Quantum Exact Simulation Toolkit** is a high performance simulator of quantum circuits, state-vectors and density matrices. QuEST uses **multithreading**, **GPU acceleration** and **distribution** to run lightning first on laptops, desktops and networked supercomputers. +QuEST *just works*; it is stand-alone, requires no installation, and is trivial to compile and run. +QuEST hybridises [OpenMP](https://www.openmp.org/) and [MPI](https://www.mpi-forum.org/) with _huge_ compiler support to run on all sorts of multicore, multi-CPU and distributed hardware, uses [HIP](https://docs.amd.com/bundle/HIP-Programming-Guide-v5.3) to run on AMD GPUs, integrates [cuQuantum](https://developer.nvidia.com/cuquantum-sdk) and [Thrust](https://developer.nvidia.com/thrust) for cutting-edge performance on modern NVIDIA GPUs, and has a custom kernel backend to run on older CUDA-compatible GPUs. +And it hides these deployment modes behind a single, seamless interface. 
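+For a sketch of that seamless interface (illustrative only; see the [tutorial](https://github.com/QuEST-Kit/QuEST/blob/master/examples/README.md) for a full walkthrough), the same few lines of C run unchanged whichever backend was chosen at compile time:
+```c
+#include <stdio.h>
+#include "QuEST.h"
+
+int main() {
+    QuESTEnv env = createQuESTEnv();      // CPU, GPU or distributed - fixed when QuEST was compiled
+    Qureg qubits = createQureg(2, env);   // 2-qubit statevector
+    initZeroState(qubits);
+    hadamard(qubits, 0);
+    controlledNot(qubits, 0, 1);          // prepare the Bell state (|00> + |11>)/sqrt(2)
+    printf("prob(qubit 1 = 1) = %g\n", (double) calcProbOfOutcome(qubits, 1, 1));   // 0.5
+    destroyQureg(qubits, env);
+    destroyQuESTEnv(env);
+    return 0;
+}
+```
+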
[![Languages](https://img.shields.io/badge/C-99-ff69b4.svg)](http://www.open-std.org/jtc1/sc22/wg14/www/standards.html#9899) [![Languages](https://img.shields.io/badge/C++-11-ff69b4.svg)](https://isocpp.org/wiki/faq/cpp11) @@ -36,16 +39,17 @@ The **Quantum Exact Simulation Toolkit** is a high performance simulator of quan ![OS](https://img.shields.io/badge/os-Linux-9cbd3c.svg) ![OS](https://img.shields.io/badge/os-Windows-9cbd3c.svg) [![Platforms](https://img.shields.io/badge/multithreaded-OpenMP-6699ff.svg)](https://www.openmp.org/) +[![Platforms](https://img.shields.io/badge/distributed-MPI-6699ff.svg)](https://www.mpi-forum.org/) [![Platforms](https://img.shields.io/badge/GPU-CUDA-6699ff.svg)](https://developer.nvidia.com/cuda-zone) [![Platforms](https://img.shields.io/badge/GPU-AMD-6699ff.svg)](https://docs.amd.com/bundle/HIP-Programming-Guide-v5.3) -[![Platforms](https://img.shields.io/badge/distributed-MPI-6699ff.svg)](https://www.mpi-forum.org/) +[![Platforms](https://img.shields.io/badge/GPU-cuQuantum-6699ff.svg)](https://developer.nvidia.com/cuquantum-sdk) QuEST is developed by the [QTechTheory](http://qtechtheory.org/) group at the University of Oxford, and [these authors](https://github.com/QuEST-Kit/QuEST/blob/master/AUTHORS.txt). To learn more: - see the [tutorial](https://github.com/QuEST-Kit/QuEST/blob/master/examples/README.md) - view the [documentation](https://quest-kit.github.io/QuEST/modules.html) -- visit our [website](https://quest.qtechtheory.org/) +- visit the [website](https://quest.qtechtheory.org/) - see some [examples](https://github.com/QuEST-Kit/QuEST/blob/master/examples/) -- read our [whitepaper](https://www.nature.com/articles/s41598-019-47174-9), which featured in Scientific Report's [Top 100 in Physics](https://www.nature.com/collections/ecehgdfcba/) :trophy: +- read the [whitepaper](https://www.nature.com/articles/s41598-019-47174-9), which featured in Scientific Report's [Top 100 in Physics](https://www.nature.com/collections/ecehgdfcba/) :trophy: [![DOI](https://img.shields.io/badge/DOI-10.1038%2Fs41598--019--47174--9-yellow.svg)](https://doi.org/10.1038/s41598-019-47174-9) [![Email](https://img.shields.io/badge/email-quest@materials.ox.ac.uk-red.svg)](mailto:quest@materials.ox.ac.uk) @@ -131,7 +135,7 @@ QuEST supports: - [testing utilities](https://quest-kit.github.io/QuEST/group__testutilities.html) -> **For developers**: To regenerate the doc after making changes to the code, run `doxygen doxyconfig/config` in the root directory. This will generate documentation in `Doxygen_doc/html`, the contents of which should be copied into [`docs/`](/docs/) +> **For developers**: QuEST's doc is automatically regenerated when the `master` branch is updated via [Github Actions](https://github.com/features/actions). To locally regenerate the doc, run `doxygen doxyconfig/config` in the root directory, which generates html documentation in `Doxygen_doc/html`. --------------------------------- @@ -176,6 +180,7 @@ then run it with We sincerely thank the following external contributors to QuEST. +- [Jakub Adamski](https://github.com/jjacobx) for optimising distributed communication of max-size messages. - [Bruno Villasenor Alvarez](https://github.com/bvillasen) of [AMD](https://www.amd.com/en.html) for porting the GPU backend to [HIP](https://github.com/ROCm-Developer-Tools/HIP), for compatibility with AMD GPUs. - [HQS Quantum simulations](https://quantumsimulations.de/) for contributing `mixDamping` on CPU. 
- [Kshitij Chhabra](https://github.com/kshitijc) for patching some validation bugs. @@ -183,8 +188,8 @@ We sincerely thank the following external contributors to QuEST. - [Zach van Rijn](https://github.com/zv-io) for patching the multithreading code for GCC-9 OpenMP-5 compatibility. - [SchineCompton](https://github.com/SchineCompton) for patching the GPU CMake release build. - [Christopher J. Anders](https://github.com/chr5tphr) for patching the multithreading (when default off) and GPU builds (revising min cmake). -- [Gleb Struchalin](https://github.com/glebx-f) for patching the cmake standalone build -- [Milos Prokop](https://github.com/Milos9304) for serial prototyping of `initDiagonalOpFromPauliHamil` +- [Gleb Struchalin](https://github.com/glebx-f) for patching the cmake standalone build. +- [Milos Prokop](https://github.com/Milos9304) for serial prototyping of `initDiagonalOpFromPauliHamil`. QuEST uses the [mt19937ar](http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/emt19937ar.html) Mersenne Twister algorithm for random number generation, under the BSD licence. QuEST optionally (by additionally importing `QuEST_complex.h`) integrates the [language agnostic complex type](http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/2003/0303/cuj0303meyers/index.htm) by Randy Meyers and Dr. Thomas Plum diff --git a/examples/README.md b/examples/README.md index 019353e17..1252666dd 100644 --- a/examples/README.md +++ b/examples/README.md @@ -7,7 +7,6 @@ Tutorial - [Running](#running) - [Testing](#testing) -> Before getting started, it is best to [test](#testing) QuEST on your hardware. # Coding @@ -174,6 +173,8 @@ QuEST uses the [Mersenne Twister](http://www.math.sci.hiroshima-u.ac.jp/~m-mat/M # Compiling +See [this page](https://quest.qtechtheory.org/download/) to obtain the necessary compilers. + QuEST uses [CMake](https://cmake.org/) (version `3.7` or higher) as its build system. Configure the build by supplying the below `-D[VAR=VALUE]` options after the `cmake ..` command. You can alternatively compile via [GNU Make](https://www.gnu.org/software/make/) directly with the provided [makefile](makefile). > **Windows** users should install [CMake](https://cmake.org/download/) and [Build Tools](https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2019), and run the below commands in the *Developer Command Prompt for VS* @@ -242,9 +243,9 @@ If your project contains multiple source files, separate them with semi-colons. - To compile for GPU, use ```console - -DGPUACCELERATED=1 -DGPU_COMPUTE_CAPABILITY=[CC] .. + -DGPUACCELERATED=1 -DGPU_COMPUTE_CAPABILITY=[CC] ``` - where `[CC]` is the compute cabability of your GPU, written without a decimal point. This can can be looked up at the [NVIDIA website](https://developer.nvidia.com/cuda-gpus). + where `[CC]` is the compute cabability of your GPU, written without a decimal point. This can can be looked up at the [NVIDIA website](https://developer.nvidia.com/cuda-gpus), and to check you have selected the right one, you should run the [unit tests](#testing). > Note that CUDA is not compatible with all compilers. To force `cmake` to use a > compatible compiler, override `CMAKE_C_COMPILER` and `CMAKE_CXX_COMPILER`. > For example, to compile for the [Quadro P6000](https://www.pny.com/nvidia-quadro-p6000) @@ -254,11 +255,16 @@ If your project contains multiple source files, separate them with semi-colons. 
> -DCMAKE_C_COMPILER=gcc-6 -DCMAKE_CXX_COMPILER=g++-6 > ``` + QuEST can also leverage NVIDIA's [cuQuantum](https://developer.nvidia.com/cuquantum-sdk) and [Thrust](https://developer.nvidia.com/thrust) libraries for optimised GPU simulation on modern GPUs. You must first install cuQuantum (which includes sub-library `cuStateVec` used by QuEST) [here](https://developer.nvidia.com/cuQuantum-downloads). When compiling QuEST, in addition to the above compiler options, simply specify + ```console + -DUSE_CUQUANTUM=1 + ``` + QuEST can also run on AMD GPUs using HIP. For the HIP documentation see [HIP programming guide](https://docs.amd.com/bundle/HIP-Programming-Guide-v5.3/page/Introduction_to_HIP_Programming_Guide.html). To compile for AMD GPUs, use ```console - -DGPUACCELERATED=1 -DUSE_HIP=1 -DGPU_ARCH=[ARCH] .. + -DGPUACCELERATED=1 -DUSE_HIP=1 -DGPU_ARCH=[ARCH] ``` - where `[ARCH]` is the architecture of your GPU, for example `gfx90a`. A table for AMD GPU architectures can be looked up [here](https://llvm.org/docs/AMDGPUUsage.html#amdgpu-processor-table). + where `[ARCH]` is the architecture of your GPU, for example `gfx90a`. A table for AMD GPU architectures can be looked up [here](https://llvm.org/docs/AMDGPUUsage.html#amdgpu-processor-table). To check you have used the correct `GPU_ARCH`, you should run the [unit tests](#testing). - You can additionally customise the floating point precision used by QuEST's `qreal` type, via ```console @@ -310,6 +316,7 @@ Once compiled as above, the compiled executable can be locally run from within t export OMP_NUM_THREADS=16 mpirun -np 8 ./myExecutable ``` + In some circumstances, like when large-memory multi-core nodes have multiple CPU sockets, it is worthwhile to deploy _multiple_ MPI processes to each node. - In GPU mode, the executable is launched directly via ```console diff --git a/tests/test_decoherence.cpp b/tests/test_decoherence.cpp index c5955b54f..cf572f0a3 100644 --- a/tests/test_decoherence.cpp +++ b/tests/test_decoherence.cpp @@ -9,7 +9,8 @@ #define PREPARE_TEST(qureg, ref) \ Qureg qureg = createDensityQureg(NUM_QUBITS, QUEST_ENV); \ initDebugState(qureg); \ - QMatrix ref = toQMatrix(qureg); + QMatrix ref = toQMatrix(qureg); \ + assertQuregAndRefInDebugState(qureg, ref); /* allows concise use of Contains in catch's REQUIRE_THROWS_WITH */ using Catch::Matchers::Contains; @@ -388,7 +389,7 @@ TEST_CASE( "mixMultiQubitKrausMap", "[decoherence]" ) { } // make only one invalid - ops[GENERATE_REF( range(0,numOps) )].real[0][0] = 0; + ops[GENERATE_REF( range(0,numOps) )].real[0][0] = -123456789; // make valid targets to avoid triggering target validation VLA(int, targs, numTargs); diff --git a/tests/test_operators.cpp b/tests/test_operators.cpp index ac960d962..d4c32b424 100644 --- a/tests/test_operators.cpp +++ b/tests/test_operators.cpp @@ -13,7 +13,9 @@ initDebugState(quregVec); \ initDebugState(quregMatr); \ QVector refVec = toQVector(quregVec); \ - QMatrix refMatr = toQMatrix(quregMatr); + QMatrix refMatr = toQMatrix(quregMatr); \ + assertQuregAndRefInDebugState(quregVec, refVec); \ + assertQuregAndRefInDebugState(quregMatr, refMatr); /** Destroys the data structures made by PREPARE_TEST */ #define CLEANUP_TEST(quregVec, quregMatr) \ diff --git a/tests/test_unitaries.cpp b/tests/test_unitaries.cpp index 76cbd86d9..811641f36 100644 --- a/tests/test_unitaries.cpp +++ b/tests/test_unitaries.cpp @@ -27,7 +27,9 @@ initDebugState(quregVec); \ initDebugState(quregMatr); \ QVector refVec = toQVector(quregVec); \ - QMatrix refMatr = 
toQMatrix(quregMatr); + QMatrix refMatr = toQMatrix(quregMatr); \ + assertQuregAndRefInDebugState(quregVec, refVec); \ + assertQuregAndRefInDebugState(quregMatr, refMatr); /** Destroys the data structures made by PREPARE_TEST */ #define CLEANUP_TEST(quregVec, quregMatr) \ diff --git a/tests/utilities.cpp b/tests/utilities.cpp index 72ae94680..df4ad90b0 100644 --- a/tests/utilities.cpp +++ b/tests/utilities.cpp @@ -140,6 +140,38 @@ QVector operator * (const QMatrix& m, const QVector& v) { return prod; } +void assertQuregAndRefInDebugState(Qureg qureg, QVector ref) { + DEMAND( qureg.isDensityMatrix == 0 ); + DEMAND( qureg.numAmpsTotal == (long long int) ref.size() ); + + // assert ref is in the debug state (else initDebugState failed) + for (size_t i=0; i= 1 ); - - QMatrix matr = getRandomQMatrix(1 << numQb); +QMatrix getOrthonormalisedRows(QMatrix matr) { + // perform the Gram-Schmidt process, processing each row of matr in-turn for (size_t i=0; i=0; k--) { - // compute row . prev = sum_n row_n conj(prev_n) - qcomp prod = 0; - for (size_t n=0; n= 3 && !areEqual(conjprod, iden) ) { - - matr = getIdentityMatrix(1 << numQb); - conjprod = matr; - } - DEMAND( areEqual(conjprod, iden) ); - // return the new orthonormal matrix return matr; } +QMatrix getRandomDiagonalUnitary(int numQb) { + DEMAND( numQb >= 1 ); + + QMatrix matr = getZeroMatrix(1 << numQb); + for (size_t i=0; i= 1 ); + + // create Z ~ random complex matrix (distribution not too important) + size_t dim = 1 << numQb; + QMatrix matrZ = getRandomQMatrix(dim); + QMatrix matrZT = getTranspose(matrZ); + + // create Z = Q R (via QR decomposition) ... + QMatrix matrQT = getOrthonormalisedRows(matrZ); + QMatrix matrQ = getTranspose(matrQT); + QMatrix matrR = getZeroMatrix(dim); + + // ... where R_rc = (columm c of Z) . (column r of Q) = (row c of ZT) . (row r of QT) + for (size_t r=0; r getRandomKrausMap(int numQb, int numOps) { DEMAND( numOps >= 1 ); DEMAND( numOps <= 4*numQb*numQb ); @@ -606,8 +670,13 @@ std::vector getRandomKrausMap(int numQb, int numOps) { QMatrix prodSum = getZeroMatrix(1 << numQb); for (int i=0; i> QMatrix; */ typedef std::vector QVector; +/** Asserts the given statevector qureg and reference agree, and are properly initialised in the debug state. + * Assertion uses the DEMAND() macro, calling Catch2's FAIL() if unsatisfied, so does not contribute + * toward unit test statistics. This should be called within every PREPARE_TEST macro, to ensure that + * the test states themselves are initially correct, and do not accidentally agree by (e.g.) being all-zero. + * + * @ingroup testutilities + * @author Tyson Jones + */ +void assertQuregAndRefInDebugState(Qureg qureg, QVector ref); + +/** Asserts the given density qureg and reference agree, and are properly initialised in the debug state. + * Assertion uses the DEMAND() macro, calling Catch2's FAIL() if unsatisfied, so does not contribute + * toward unit test statistics. This should be called within every PREPARE_TEST macro, to ensure that + * the test states themselves are initially correct, and do not accidentally agree by (e.g.) being all-zero. + * + * @ingroup testutilities + * @author Tyson Jones + */ +void assertQuregAndRefInDebugState(Qureg qureg, QMatrix ref); + /* (Excluded from Doxygen doc) * * Define QVector and QMatrix operator overloads.